author     Andrea Bastoni <bastoni@cs.unc.edu>   2010-05-30 19:16:45 -0400
committer  Andrea Bastoni <bastoni@cs.unc.edu>   2010-05-30 19:16:45 -0400
commit     ada47b5fe13d89735805b566185f4885f5a3f750 (patch)
tree       644b88f8a71896307d71438e9b3af49126ffb22b /drivers/md
parent     43e98717ad40a4ae64545b5ba047c7b86aa44f4f (diff)
parent     3280f21d43ee541f97f8cda5792150d2dbec20d5 (diff)
Merge branch 'wip-2.6.34' into old-private-master (archived-private-master)
Diffstat (limited to 'drivers/md')
40 files changed, 3608 insertions, 1422 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 2158377a1359..acb3a4e404ff 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -185,11 +185,10 @@ config MD_MULTIPATH | |||
185 | tristate "Multipath I/O support" | 185 | tristate "Multipath I/O support" |
186 | depends on BLK_DEV_MD | 186 | depends on BLK_DEV_MD |
187 | help | 187 | help |
188 | Multipath-IO is the ability of certain devices to address the same | 188 | MD_MULTIPATH provides a simple multi-path personality for use |
189 | physical disk over multiple 'IO paths'. The code ensures that such | 189 | the MD framework. It is not under active development. New |
190 | paths can be defined and handled at runtime, and ensures that a | 190 | projects should consider using DM_MULTIPATH which has more |
191 | transparent failover to the backup path(s) happens if a IO errors | 191 | features and more testing. |
192 | arrives on the primary path. | ||
193 | 192 | ||
194 | If unsure, say N. | 193 | If unsure, say N. |
195 | 194 | ||
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 60e2b322db11..26ac8aad0b19 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -212,7 +212,7 @@ static void bitmap_checkfree(struct bitmap *bitmap, unsigned long page) | |||
212 | */ | 212 | */ |
213 | 213 | ||
214 | /* IO operations when bitmap is stored near all superblocks */ | 214 | /* IO operations when bitmap is stored near all superblocks */ |
215 | static struct page *read_sb_page(mddev_t *mddev, long offset, | 215 | static struct page *read_sb_page(mddev_t *mddev, loff_t offset, |
216 | struct page *page, | 216 | struct page *page, |
217 | unsigned long index, int size) | 217 | unsigned long index, int size) |
218 | { | 218 | { |
@@ -287,27 +287,36 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait) | |||
287 | 287 | ||
288 | while ((rdev = next_active_rdev(rdev, mddev)) != NULL) { | 288 | while ((rdev = next_active_rdev(rdev, mddev)) != NULL) { |
289 | int size = PAGE_SIZE; | 289 | int size = PAGE_SIZE; |
290 | loff_t offset = mddev->bitmap_info.offset; | ||
290 | if (page->index == bitmap->file_pages-1) | 291 | if (page->index == bitmap->file_pages-1) |
291 | size = roundup(bitmap->last_page_size, | 292 | size = roundup(bitmap->last_page_size, |
292 | bdev_logical_block_size(rdev->bdev)); | 293 | bdev_logical_block_size(rdev->bdev)); |
293 | /* Just make sure we aren't corrupting data or | 294 | /* Just make sure we aren't corrupting data or |
294 | * metadata | 295 | * metadata |
295 | */ | 296 | */ |
296 | if (bitmap->offset < 0) { | 297 | if (mddev->external) { |
298 | /* Bitmap could be anywhere. */ | ||
299 | if (rdev->sb_start + offset + (page->index *(PAGE_SIZE/512)) > | ||
300 | rdev->data_offset && | ||
301 | rdev->sb_start + offset < | ||
302 | rdev->data_offset + mddev->dev_sectors + | ||
303 | (PAGE_SIZE/512)) | ||
304 | goto bad_alignment; | ||
305 | } else if (offset < 0) { | ||
297 | /* DATA BITMAP METADATA */ | 306 | /* DATA BITMAP METADATA */ |
298 | if (bitmap->offset | 307 | if (offset |
299 | + (long)(page->index * (PAGE_SIZE/512)) | 308 | + (long)(page->index * (PAGE_SIZE/512)) |
300 | + size/512 > 0) | 309 | + size/512 > 0) |
301 | /* bitmap runs in to metadata */ | 310 | /* bitmap runs in to metadata */ |
302 | goto bad_alignment; | 311 | goto bad_alignment; |
303 | if (rdev->data_offset + mddev->dev_sectors | 312 | if (rdev->data_offset + mddev->dev_sectors |
304 | > rdev->sb_start + bitmap->offset) | 313 | > rdev->sb_start + offset) |
305 | /* data runs in to bitmap */ | 314 | /* data runs in to bitmap */ |
306 | goto bad_alignment; | 315 | goto bad_alignment; |
307 | } else if (rdev->sb_start < rdev->data_offset) { | 316 | } else if (rdev->sb_start < rdev->data_offset) { |
308 | /* METADATA BITMAP DATA */ | 317 | /* METADATA BITMAP DATA */ |
309 | if (rdev->sb_start | 318 | if (rdev->sb_start |
310 | + bitmap->offset | 319 | + offset |
311 | + page->index*(PAGE_SIZE/512) + size/512 | 320 | + page->index*(PAGE_SIZE/512) + size/512 |
312 | > rdev->data_offset) | 321 | > rdev->data_offset) |
313 | /* bitmap runs in to data */ | 322 | /* bitmap runs in to data */ |
@@ -316,7 +325,7 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait) | |||
316 | /* DATA METADATA BITMAP - no problems */ | 325 | /* DATA METADATA BITMAP - no problems */ |
317 | } | 326 | } |
318 | md_super_write(mddev, rdev, | 327 | md_super_write(mddev, rdev, |
319 | rdev->sb_start + bitmap->offset | 328 | rdev->sb_start + offset |
320 | + page->index * (PAGE_SIZE/512), | 329 | + page->index * (PAGE_SIZE/512), |
321 | size, | 330 | size, |
322 | page); | 331 | page); |
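
For readers following the placement math in the hunks above: a minimal standalone sketch of where an internal bitmap page lands on a member device, and of the new external-bitmap branch's check that the page stays clear of the data area. The names, types and 4 KiB page size below are simplifying assumptions for illustration, not kernel code.

#include <stdbool.h>
#include <stdint.h>

#define SECTOR_SHIFT 9
#define PAGE_SECTORS (4096 >> SECTOR_SHIFT)   /* assume 4 KiB pages: 8 sectors */

/* First sector of bitmap page 'index': superblock start plus the bitmap
 * offset (possibly negative, for bitmaps stored before the superblock)
 * plus one page worth of sectors per preceding page. */
static uint64_t bitmap_page_sector(uint64_t sb_start, int64_t bitmap_offset,
                                   uint64_t index)
{
        return sb_start + bitmap_offset + index * PAGE_SECTORS;
}

/* Rough equivalent of the "bitmap could be anywhere" test for external
 * bitmaps: refuse the write if the page would intersect the data region
 * [data_offset, data_offset + dev_sectors). */
static bool page_overlaps_data(uint64_t page_sector, uint64_t data_offset,
                               uint64_t dev_sectors)
{
        return page_sector + PAGE_SECTORS > data_offset &&
               page_sector < data_offset + dev_sectors;
}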
@@ -488,6 +497,8 @@ void bitmap_update_sb(struct bitmap *bitmap) | |||
488 | 497 | ||
489 | if (!bitmap || !bitmap->mddev) /* no bitmap for this array */ | 498 | if (!bitmap || !bitmap->mddev) /* no bitmap for this array */ |
490 | return; | 499 | return; |
500 | if (bitmap->mddev->bitmap_info.external) | ||
501 | return; | ||
491 | spin_lock_irqsave(&bitmap->lock, flags); | 502 | spin_lock_irqsave(&bitmap->lock, flags); |
492 | if (!bitmap->sb_page) { /* no superblock */ | 503 | if (!bitmap->sb_page) { /* no superblock */ |
493 | spin_unlock_irqrestore(&bitmap->lock, flags); | 504 | spin_unlock_irqrestore(&bitmap->lock, flags); |
@@ -501,6 +512,9 @@ void bitmap_update_sb(struct bitmap *bitmap) | |||
501 | bitmap->events_cleared = bitmap->mddev->events; | 512 | bitmap->events_cleared = bitmap->mddev->events; |
502 | sb->events_cleared = cpu_to_le64(bitmap->events_cleared); | 513 | sb->events_cleared = cpu_to_le64(bitmap->events_cleared); |
503 | } | 514 | } |
515 | /* Just in case these have been changed via sysfs: */ | ||
516 | sb->daemon_sleep = cpu_to_le32(bitmap->mddev->bitmap_info.daemon_sleep/HZ); | ||
517 | sb->write_behind = cpu_to_le32(bitmap->mddev->bitmap_info.max_write_behind); | ||
504 | kunmap_atomic(sb, KM_USER0); | 518 | kunmap_atomic(sb, KM_USER0); |
505 | write_page(bitmap, bitmap->sb_page, 1); | 519 | write_page(bitmap, bitmap->sb_page, 1); |
506 | } | 520 | } |
@@ -550,7 +564,8 @@ static int bitmap_read_sb(struct bitmap *bitmap) | |||
550 | 564 | ||
551 | bitmap->sb_page = read_page(bitmap->file, 0, bitmap, bytes); | 565 | bitmap->sb_page = read_page(bitmap->file, 0, bitmap, bytes); |
552 | } else { | 566 | } else { |
553 | bitmap->sb_page = read_sb_page(bitmap->mddev, bitmap->offset, | 567 | bitmap->sb_page = read_sb_page(bitmap->mddev, |
568 | bitmap->mddev->bitmap_info.offset, | ||
554 | NULL, | 569 | NULL, |
555 | 0, sizeof(bitmap_super_t)); | 570 | 0, sizeof(bitmap_super_t)); |
556 | } | 571 | } |
@@ -563,7 +578,7 @@ static int bitmap_read_sb(struct bitmap *bitmap) | |||
563 | sb = (bitmap_super_t *)kmap_atomic(bitmap->sb_page, KM_USER0); | 578 | sb = (bitmap_super_t *)kmap_atomic(bitmap->sb_page, KM_USER0); |
564 | 579 | ||
565 | chunksize = le32_to_cpu(sb->chunksize); | 580 | chunksize = le32_to_cpu(sb->chunksize); |
566 | daemon_sleep = le32_to_cpu(sb->daemon_sleep); | 581 | daemon_sleep = le32_to_cpu(sb->daemon_sleep) * HZ; |
567 | write_behind = le32_to_cpu(sb->write_behind); | 582 | write_behind = le32_to_cpu(sb->write_behind); |
568 | 583 | ||
569 | /* verify that the bitmap-specific fields are valid */ | 584 | /* verify that the bitmap-specific fields are valid */ |
@@ -576,7 +591,7 @@ static int bitmap_read_sb(struct bitmap *bitmap) | |||
576 | reason = "bitmap chunksize too small"; | 591 | reason = "bitmap chunksize too small"; |
577 | else if ((1 << ffz(~chunksize)) != chunksize) | 592 | else if ((1 << ffz(~chunksize)) != chunksize) |
578 | reason = "bitmap chunksize not a power of 2"; | 593 | reason = "bitmap chunksize not a power of 2"; |
579 | else if (daemon_sleep < 1 || daemon_sleep > MAX_SCHEDULE_TIMEOUT / HZ) | 594 | else if (daemon_sleep < 1 || daemon_sleep > MAX_SCHEDULE_TIMEOUT) |
580 | reason = "daemon sleep period out of range"; | 595 | reason = "daemon sleep period out of range"; |
581 | else if (write_behind > COUNTER_MAX) | 596 | else if (write_behind > COUNTER_MAX) |
582 | reason = "write-behind limit out of range (0 - 16383)"; | 597 | reason = "write-behind limit out of range (0 - 16383)"; |
@@ -610,10 +625,9 @@ static int bitmap_read_sb(struct bitmap *bitmap) | |||
610 | } | 625 | } |
611 | success: | 626 | success: |
612 | /* assign fields using values from superblock */ | 627 | /* assign fields using values from superblock */ |
613 | bitmap->chunksize = chunksize; | 628 | bitmap->mddev->bitmap_info.chunksize = chunksize; |
614 | bitmap->daemon_sleep = daemon_sleep; | 629 | bitmap->mddev->bitmap_info.daemon_sleep = daemon_sleep; |
615 | bitmap->daemon_lastrun = jiffies; | 630 | bitmap->mddev->bitmap_info.max_write_behind = write_behind; |
616 | bitmap->max_write_behind = write_behind; | ||
617 | bitmap->flags |= le32_to_cpu(sb->state); | 631 | bitmap->flags |= le32_to_cpu(sb->state); |
618 | if (le32_to_cpu(sb->version) == BITMAP_MAJOR_HOSTENDIAN) | 632 | if (le32_to_cpu(sb->version) == BITMAP_MAJOR_HOSTENDIAN) |
619 | bitmap->flags |= BITMAP_HOSTENDIAN; | 633 | bitmap->flags |= BITMAP_HOSTENDIAN; |
@@ -664,16 +678,26 @@ static int bitmap_mask_state(struct bitmap *bitmap, enum bitmap_state bits, | |||
664 | * general bitmap file operations | 678 | * general bitmap file operations |
665 | */ | 679 | */ |
666 | 680 | ||
681 | /* | ||
682 | * on-disk bitmap: | ||
683 | * | ||
684 | * Use one bit per "chunk" (block set). We do the disk I/O on the bitmap | ||
685 | * file a page at a time. There's a superblock at the start of the file. | ||
686 | */ | ||
667 | /* calculate the index of the page that contains this bit */ | 687 | /* calculate the index of the page that contains this bit */ |
668 | static inline unsigned long file_page_index(unsigned long chunk) | 688 | static inline unsigned long file_page_index(struct bitmap *bitmap, unsigned long chunk) |
669 | { | 689 | { |
670 | return CHUNK_BIT_OFFSET(chunk) >> PAGE_BIT_SHIFT; | 690 | if (!bitmap->mddev->bitmap_info.external) |
691 | chunk += sizeof(bitmap_super_t) << 3; | ||
692 | return chunk >> PAGE_BIT_SHIFT; | ||
671 | } | 693 | } |
672 | 694 | ||
673 | /* calculate the (bit) offset of this bit within a page */ | 695 | /* calculate the (bit) offset of this bit within a page */ |
674 | static inline unsigned long file_page_offset(unsigned long chunk) | 696 | static inline unsigned long file_page_offset(struct bitmap *bitmap, unsigned long chunk) |
675 | { | 697 | { |
676 | return CHUNK_BIT_OFFSET(chunk) & (PAGE_BITS - 1); | 698 | if (!bitmap->mddev->bitmap_info.external) |
699 | chunk += sizeof(bitmap_super_t) << 3; | ||
700 | return chunk & (PAGE_BITS - 1); | ||
677 | } | 701 | } |
678 | 702 | ||
679 | /* | 703 | /* |
@@ -686,8 +710,9 @@ static inline unsigned long file_page_offset(unsigned long chunk) | |||
686 | static inline struct page *filemap_get_page(struct bitmap *bitmap, | 710 | static inline struct page *filemap_get_page(struct bitmap *bitmap, |
687 | unsigned long chunk) | 711 | unsigned long chunk) |
688 | { | 712 | { |
689 | if (file_page_index(chunk) >= bitmap->file_pages) return NULL; | 713 | if (file_page_index(bitmap, chunk) >= bitmap->file_pages) return NULL; |
690 | return bitmap->filemap[file_page_index(chunk) - file_page_index(0)]; | 714 | return bitmap->filemap[file_page_index(bitmap, chunk) |
715 | - file_page_index(bitmap, 0)]; | ||
691 | } | 716 | } |
692 | 717 | ||
693 | 718 | ||
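
As a side note on the file_page_index()/file_page_offset() change above, a minimal sketch of the mapping it implements: an internal bitmap reserves the first sizeof(bitmap_super_t) << 3 = 2048 bits of the file for the on-disk superblock, while an external bitmap starts its chunk bits at bit 0. The 4 KiB page size and helper name are assumptions for illustration.

#include <stdint.h>

#define PAGE_BITS (4096 * 8)     /* bits per page, assuming 4 KiB pages */
#define SB_BITS   (256 * 8)      /* sizeof(bitmap_super_t) << 3 */

/* Map a chunk number to (page of the bitmap file, bit within that page). */
static void chunk_to_file_bit(uint64_t chunk, int internal_bitmap,
                              uint64_t *page_index, uint64_t *bit_in_page)
{
        if (internal_bitmap)
                chunk += SB_BITS;          /* skip the on-disk superblock */
        *page_index  = chunk / PAGE_BITS;  /* file_page_index()  */
        *bit_in_page = chunk % PAGE_BITS;  /* file_page_offset() */
}
/* Example: chunk 0 of an internal bitmap lands in page 0 at bit 2048. */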
@@ -710,7 +735,7 @@ static void bitmap_file_unmap(struct bitmap *bitmap) | |||
710 | spin_unlock_irqrestore(&bitmap->lock, flags); | 735 | spin_unlock_irqrestore(&bitmap->lock, flags); |
711 | 736 | ||
712 | while (pages--) | 737 | while (pages--) |
713 | if (map[pages]->index != 0) /* 0 is sb_page, release it below */ | 738 | if (map[pages] != sb_page) /* 0 is sb_page, release it below */ |
714 | free_buffers(map[pages]); | 739 | free_buffers(map[pages]); |
715 | kfree(map); | 740 | kfree(map); |
716 | kfree(attr); | 741 | kfree(attr); |
@@ -821,7 +846,7 @@ static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block) | |||
821 | 846 | ||
822 | page = filemap_get_page(bitmap, chunk); | 847 | page = filemap_get_page(bitmap, chunk); |
823 | if (!page) return; | 848 | if (!page) return; |
824 | bit = file_page_offset(chunk); | 849 | bit = file_page_offset(bitmap, chunk); |
825 | 850 | ||
826 | /* set the bit */ | 851 | /* set the bit */ |
827 | kaddr = kmap_atomic(page, KM_USER0); | 852 | kaddr = kmap_atomic(page, KM_USER0); |
@@ -907,7 +932,7 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) | |||
907 | chunks = bitmap->chunks; | 932 | chunks = bitmap->chunks; |
908 | file = bitmap->file; | 933 | file = bitmap->file; |
909 | 934 | ||
910 | BUG_ON(!file && !bitmap->offset); | 935 | BUG_ON(!file && !bitmap->mddev->bitmap_info.offset); |
911 | 936 | ||
912 | #ifdef INJECT_FAULTS_3 | 937 | #ifdef INJECT_FAULTS_3 |
913 | outofdate = 1; | 938 | outofdate = 1; |
@@ -919,14 +944,17 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) | |||
919 | "recovery\n", bmname(bitmap)); | 944 | "recovery\n", bmname(bitmap)); |
920 | 945 | ||
921 | bytes = (chunks + 7) / 8; | 946 | bytes = (chunks + 7) / 8; |
947 | if (!bitmap->mddev->bitmap_info.external) | ||
948 | bytes += sizeof(bitmap_super_t); | ||
922 | 949 | ||
923 | num_pages = (bytes + sizeof(bitmap_super_t) + PAGE_SIZE - 1) / PAGE_SIZE; | 950 | |
951 | num_pages = (bytes + PAGE_SIZE - 1) / PAGE_SIZE; | ||
924 | 952 | ||
925 | if (file && i_size_read(file->f_mapping->host) < bytes + sizeof(bitmap_super_t)) { | 953 | if (file && i_size_read(file->f_mapping->host) < bytes) { |
926 | printk(KERN_INFO "%s: bitmap file too short %lu < %lu\n", | 954 | printk(KERN_INFO "%s: bitmap file too short %lu < %lu\n", |
927 | bmname(bitmap), | 955 | bmname(bitmap), |
928 | (unsigned long) i_size_read(file->f_mapping->host), | 956 | (unsigned long) i_size_read(file->f_mapping->host), |
929 | bytes + sizeof(bitmap_super_t)); | 957 | bytes); |
930 | goto err; | 958 | goto err; |
931 | } | 959 | } |
932 | 960 | ||
@@ -947,17 +975,16 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) | |||
947 | 975 | ||
948 | for (i = 0; i < chunks; i++) { | 976 | for (i = 0; i < chunks; i++) { |
949 | int b; | 977 | int b; |
950 | index = file_page_index(i); | 978 | index = file_page_index(bitmap, i); |
951 | bit = file_page_offset(i); | 979 | bit = file_page_offset(bitmap, i); |
952 | if (index != oldindex) { /* this is a new page, read it in */ | 980 | if (index != oldindex) { /* this is a new page, read it in */ |
953 | int count; | 981 | int count; |
954 | /* unmap the old page, we're done with it */ | 982 | /* unmap the old page, we're done with it */ |
955 | if (index == num_pages-1) | 983 | if (index == num_pages-1) |
956 | count = bytes + sizeof(bitmap_super_t) | 984 | count = bytes - index * PAGE_SIZE; |
957 | - index * PAGE_SIZE; | ||
958 | else | 985 | else |
959 | count = PAGE_SIZE; | 986 | count = PAGE_SIZE; |
960 | if (index == 0) { | 987 | if (index == 0 && bitmap->sb_page) { |
961 | /* | 988 | /* |
962 | * if we're here then the superblock page | 989 | * if we're here then the superblock page |
963 | * contains some bits (PAGE_SIZE != sizeof sb) | 990 | * contains some bits (PAGE_SIZE != sizeof sb) |
@@ -967,14 +994,15 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) | |||
967 | offset = sizeof(bitmap_super_t); | 994 | offset = sizeof(bitmap_super_t); |
968 | if (!file) | 995 | if (!file) |
969 | read_sb_page(bitmap->mddev, | 996 | read_sb_page(bitmap->mddev, |
970 | bitmap->offset, | 997 | bitmap->mddev->bitmap_info.offset, |
971 | page, | 998 | page, |
972 | index, count); | 999 | index, count); |
973 | } else if (file) { | 1000 | } else if (file) { |
974 | page = read_page(file, index, bitmap, count); | 1001 | page = read_page(file, index, bitmap, count); |
975 | offset = 0; | 1002 | offset = 0; |
976 | } else { | 1003 | } else { |
977 | page = read_sb_page(bitmap->mddev, bitmap->offset, | 1004 | page = read_sb_page(bitmap->mddev, |
1005 | bitmap->mddev->bitmap_info.offset, | ||
978 | NULL, | 1006 | NULL, |
979 | index, count); | 1007 | index, count); |
980 | offset = 0; | 1008 | offset = 0; |
@@ -1078,23 +1106,32 @@ static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap, | |||
1078 | * out to disk | 1106 | * out to disk |
1079 | */ | 1107 | */ |
1080 | 1108 | ||
1081 | void bitmap_daemon_work(struct bitmap *bitmap) | 1109 | void bitmap_daemon_work(mddev_t *mddev) |
1082 | { | 1110 | { |
1111 | struct bitmap *bitmap; | ||
1083 | unsigned long j; | 1112 | unsigned long j; |
1084 | unsigned long flags; | 1113 | unsigned long flags; |
1085 | struct page *page = NULL, *lastpage = NULL; | 1114 | struct page *page = NULL, *lastpage = NULL; |
1086 | int blocks; | 1115 | int blocks; |
1087 | void *paddr; | 1116 | void *paddr; |
1088 | 1117 | ||
1089 | if (bitmap == NULL) | 1118 | /* Use a mutex to guard daemon_work against |
1119 | * bitmap_destroy. | ||
1120 | */ | ||
1121 | mutex_lock(&mddev->bitmap_info.mutex); | ||
1122 | bitmap = mddev->bitmap; | ||
1123 | if (bitmap == NULL) { | ||
1124 | mutex_unlock(&mddev->bitmap_info.mutex); | ||
1090 | return; | 1125 | return; |
1091 | if (time_before(jiffies, bitmap->daemon_lastrun + bitmap->daemon_sleep*HZ)) | 1126 | } |
1127 | if (time_before(jiffies, bitmap->daemon_lastrun | ||
1128 | + bitmap->mddev->bitmap_info.daemon_sleep)) | ||
1092 | goto done; | 1129 | goto done; |
1093 | 1130 | ||
1094 | bitmap->daemon_lastrun = jiffies; | 1131 | bitmap->daemon_lastrun = jiffies; |
1095 | if (bitmap->allclean) { | 1132 | if (bitmap->allclean) { |
1096 | bitmap->mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT; | 1133 | bitmap->mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT; |
1097 | return; | 1134 | goto done; |
1098 | } | 1135 | } |
1099 | bitmap->allclean = 1; | 1136 | bitmap->allclean = 1; |
1100 | 1137 | ||
@@ -1142,7 +1179,8 @@ void bitmap_daemon_work(struct bitmap *bitmap) | |||
1142 | /* We are possibly going to clear some bits, so make | 1179 | /* We are possibly going to clear some bits, so make |
1143 | * sure that events_cleared is up-to-date. | 1180 | * sure that events_cleared is up-to-date. |
1144 | */ | 1181 | */ |
1145 | if (bitmap->need_sync) { | 1182 | if (bitmap->need_sync && |
1183 | bitmap->mddev->bitmap_info.external == 0) { | ||
1146 | bitmap_super_t *sb; | 1184 | bitmap_super_t *sb; |
1147 | bitmap->need_sync = 0; | 1185 | bitmap->need_sync = 0; |
1148 | sb = kmap_atomic(bitmap->sb_page, KM_USER0); | 1186 | sb = kmap_atomic(bitmap->sb_page, KM_USER0); |
@@ -1152,7 +1190,8 @@ void bitmap_daemon_work(struct bitmap *bitmap) | |||
1152 | write_page(bitmap, bitmap->sb_page, 1); | 1190 | write_page(bitmap, bitmap->sb_page, 1); |
1153 | } | 1191 | } |
1154 | spin_lock_irqsave(&bitmap->lock, flags); | 1192 | spin_lock_irqsave(&bitmap->lock, flags); |
1155 | clear_page_attr(bitmap, page, BITMAP_PAGE_CLEAN); | 1193 | if (!bitmap->need_sync) |
1194 | clear_page_attr(bitmap, page, BITMAP_PAGE_CLEAN); | ||
1156 | } | 1195 | } |
1157 | bmc = bitmap_get_counter(bitmap, | 1196 | bmc = bitmap_get_counter(bitmap, |
1158 | (sector_t)j << CHUNK_BLOCK_SHIFT(bitmap), | 1197 | (sector_t)j << CHUNK_BLOCK_SHIFT(bitmap), |
@@ -1167,7 +1206,7 @@ void bitmap_daemon_work(struct bitmap *bitmap) | |||
1167 | if (*bmc == 2) { | 1206 | if (*bmc == 2) { |
1168 | *bmc=1; /* maybe clear the bit next time */ | 1207 | *bmc=1; /* maybe clear the bit next time */ |
1169 | set_page_attr(bitmap, page, BITMAP_PAGE_CLEAN); | 1208 | set_page_attr(bitmap, page, BITMAP_PAGE_CLEAN); |
1170 | } else if (*bmc == 1) { | 1209 | } else if (*bmc == 1 && !bitmap->need_sync) { |
1171 | /* we can clear the bit */ | 1210 | /* we can clear the bit */ |
1172 | *bmc = 0; | 1211 | *bmc = 0; |
1173 | bitmap_count_page(bitmap, | 1212 | bitmap_count_page(bitmap, |
@@ -1177,9 +1216,11 @@ void bitmap_daemon_work(struct bitmap *bitmap) | |||
1177 | /* clear the bit */ | 1216 | /* clear the bit */ |
1178 | paddr = kmap_atomic(page, KM_USER0); | 1217 | paddr = kmap_atomic(page, KM_USER0); |
1179 | if (bitmap->flags & BITMAP_HOSTENDIAN) | 1218 | if (bitmap->flags & BITMAP_HOSTENDIAN) |
1180 | clear_bit(file_page_offset(j), paddr); | 1219 | clear_bit(file_page_offset(bitmap, j), |
1220 | paddr); | ||
1181 | else | 1221 | else |
1182 | ext2_clear_bit(file_page_offset(j), paddr); | 1222 | ext2_clear_bit(file_page_offset(bitmap, j), |
1223 | paddr); | ||
1183 | kunmap_atomic(paddr, KM_USER0); | 1224 | kunmap_atomic(paddr, KM_USER0); |
1184 | } | 1225 | } |
1185 | } else | 1226 | } else |
@@ -1202,7 +1243,9 @@ void bitmap_daemon_work(struct bitmap *bitmap) | |||
1202 | 1243 | ||
1203 | done: | 1244 | done: |
1204 | if (bitmap->allclean == 0) | 1245 | if (bitmap->allclean == 0) |
1205 | bitmap->mddev->thread->timeout = bitmap->daemon_sleep * HZ; | 1246 | bitmap->mddev->thread->timeout = |
1247 | bitmap->mddev->bitmap_info.daemon_sleep; | ||
1248 | mutex_unlock(&mddev->bitmap_info.mutex); | ||
1206 | } | 1249 | } |
1207 | 1250 | ||
1208 | static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap, | 1251 | static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap, |
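
The mutex added around bitmap_daemon_work() and bitmap_destroy() in the hunks above follows a common teardown pattern: the worker re-checks the shared pointer under the lock, and the destroyer clears that pointer under the same lock before freeing. A small user-space sketch of the pattern, with pthread primitives standing in for the kernel mutex (illustrative only):

#include <pthread.h>
#include <stdlib.h>

struct owner {
        pthread_mutex_t lock;
        struct resource *res;          /* stands in for mddev->bitmap */
};

static void worker_tick(struct owner *o)
{
        pthread_mutex_lock(&o->lock);
        if (o->res != NULL) {
                /* periodic work on o->res goes here; it cannot be freed
                 * underneath us while we hold the lock */
        }
        pthread_mutex_unlock(&o->lock);
}

static void destroy(struct owner *o)
{
        struct resource *r;

        pthread_mutex_lock(&o->lock);
        r = o->res;
        o->res = NULL;                 /* disconnect under the lock ...  */
        pthread_mutex_unlock(&o->lock);
        free(r);                       /* ... then free once no worker can see it */
}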
@@ -1332,6 +1375,7 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto | |||
1332 | bitmap->events_cleared < bitmap->mddev->events) { | 1375 | bitmap->events_cleared < bitmap->mddev->events) { |
1333 | bitmap->events_cleared = bitmap->mddev->events; | 1376 | bitmap->events_cleared = bitmap->mddev->events; |
1334 | bitmap->need_sync = 1; | 1377 | bitmap->need_sync = 1; |
1378 | sysfs_notify_dirent(bitmap->sysfs_can_clear); | ||
1335 | } | 1379 | } |
1336 | 1380 | ||
1337 | if (!success && ! (*bmc & NEEDED_MASK)) | 1381 | if (!success && ! (*bmc & NEEDED_MASK)) |
@@ -1470,7 +1514,7 @@ void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector) | |||
1470 | return; | 1514 | return; |
1471 | } | 1515 | } |
1472 | if (time_before(jiffies, (bitmap->last_end_sync | 1516 | if (time_before(jiffies, (bitmap->last_end_sync |
1473 | + bitmap->daemon_sleep * HZ))) | 1517 | + bitmap->mddev->bitmap_info.daemon_sleep))) |
1474 | return; | 1518 | return; |
1475 | wait_event(bitmap->mddev->recovery_wait, | 1519 | wait_event(bitmap->mddev->recovery_wait, |
1476 | atomic_read(&bitmap->mddev->recovery_active) == 0); | 1520 | atomic_read(&bitmap->mddev->recovery_active) == 0); |
@@ -1522,6 +1566,12 @@ void bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long e) | |||
1522 | sector_t sec = (sector_t)chunk << CHUNK_BLOCK_SHIFT(bitmap); | 1566 | sector_t sec = (sector_t)chunk << CHUNK_BLOCK_SHIFT(bitmap); |
1523 | bitmap_set_memory_bits(bitmap, sec, 1); | 1567 | bitmap_set_memory_bits(bitmap, sec, 1); |
1524 | bitmap_file_set_bit(bitmap, sec); | 1568 | bitmap_file_set_bit(bitmap, sec); |
1569 | if (sec < bitmap->mddev->recovery_cp) | ||
1570 | /* We are asserting that the array is dirty, | ||
1571 | * so move the recovery_cp address back so | ||
1572 | * that it is obvious that it is dirty | ||
1573 | */ | ||
1574 | bitmap->mddev->recovery_cp = sec; | ||
1525 | } | 1575 | } |
1526 | } | 1576 | } |
1527 | 1577 | ||
@@ -1531,7 +1581,7 @@ void bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long e) | |||
1531 | void bitmap_flush(mddev_t *mddev) | 1581 | void bitmap_flush(mddev_t *mddev) |
1532 | { | 1582 | { |
1533 | struct bitmap *bitmap = mddev->bitmap; | 1583 | struct bitmap *bitmap = mddev->bitmap; |
1534 | int sleep; | 1584 | long sleep; |
1535 | 1585 | ||
1536 | if (!bitmap) /* there was no bitmap */ | 1586 | if (!bitmap) /* there was no bitmap */ |
1537 | return; | 1587 | return; |
@@ -1539,12 +1589,13 @@ void bitmap_flush(mddev_t *mddev) | |||
1539 | /* run the daemon_work three time to ensure everything is flushed | 1589 | /* run the daemon_work three time to ensure everything is flushed |
1540 | * that can be | 1590 | * that can be |
1541 | */ | 1591 | */ |
1542 | sleep = bitmap->daemon_sleep; | 1592 | sleep = mddev->bitmap_info.daemon_sleep * 2; |
1543 | bitmap->daemon_sleep = 0; | 1593 | bitmap->daemon_lastrun -= sleep; |
1544 | bitmap_daemon_work(bitmap); | 1594 | bitmap_daemon_work(mddev); |
1545 | bitmap_daemon_work(bitmap); | 1595 | bitmap->daemon_lastrun -= sleep; |
1546 | bitmap_daemon_work(bitmap); | 1596 | bitmap_daemon_work(mddev); |
1547 | bitmap->daemon_sleep = sleep; | 1597 | bitmap->daemon_lastrun -= sleep; |
1598 | bitmap_daemon_work(mddev); | ||
1548 | bitmap_update_sb(bitmap); | 1599 | bitmap_update_sb(bitmap); |
1549 | } | 1600 | } |
1550 | 1601 | ||
@@ -1574,6 +1625,7 @@ static void bitmap_free(struct bitmap *bitmap) | |||
1574 | kfree(bp); | 1625 | kfree(bp); |
1575 | kfree(bitmap); | 1626 | kfree(bitmap); |
1576 | } | 1627 | } |
1628 | |||
1577 | void bitmap_destroy(mddev_t *mddev) | 1629 | void bitmap_destroy(mddev_t *mddev) |
1578 | { | 1630 | { |
1579 | struct bitmap *bitmap = mddev->bitmap; | 1631 | struct bitmap *bitmap = mddev->bitmap; |
@@ -1581,10 +1633,15 @@ void bitmap_destroy(mddev_t *mddev) | |||
1581 | if (!bitmap) /* there was no bitmap */ | 1633 | if (!bitmap) /* there was no bitmap */ |
1582 | return; | 1634 | return; |
1583 | 1635 | ||
1636 | mutex_lock(&mddev->bitmap_info.mutex); | ||
1584 | mddev->bitmap = NULL; /* disconnect from the md device */ | 1637 | mddev->bitmap = NULL; /* disconnect from the md device */ |
1638 | mutex_unlock(&mddev->bitmap_info.mutex); | ||
1585 | if (mddev->thread) | 1639 | if (mddev->thread) |
1586 | mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT; | 1640 | mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT; |
1587 | 1641 | ||
1642 | if (bitmap->sysfs_can_clear) | ||
1643 | sysfs_put(bitmap->sysfs_can_clear); | ||
1644 | |||
1588 | bitmap_free(bitmap); | 1645 | bitmap_free(bitmap); |
1589 | } | 1646 | } |
1590 | 1647 | ||
@@ -1598,16 +1655,17 @@ int bitmap_create(mddev_t *mddev) | |||
1598 | sector_t blocks = mddev->resync_max_sectors; | 1655 | sector_t blocks = mddev->resync_max_sectors; |
1599 | unsigned long chunks; | 1656 | unsigned long chunks; |
1600 | unsigned long pages; | 1657 | unsigned long pages; |
1601 | struct file *file = mddev->bitmap_file; | 1658 | struct file *file = mddev->bitmap_info.file; |
1602 | int err; | 1659 | int err; |
1603 | sector_t start; | 1660 | sector_t start; |
1661 | struct sysfs_dirent *bm; | ||
1604 | 1662 | ||
1605 | BUILD_BUG_ON(sizeof(bitmap_super_t) != 256); | 1663 | BUILD_BUG_ON(sizeof(bitmap_super_t) != 256); |
1606 | 1664 | ||
1607 | if (!file && !mddev->bitmap_offset) /* bitmap disabled, nothing to do */ | 1665 | if (!file && !mddev->bitmap_info.offset) /* bitmap disabled, nothing to do */ |
1608 | return 0; | 1666 | return 0; |
1609 | 1667 | ||
1610 | BUG_ON(file && mddev->bitmap_offset); | 1668 | BUG_ON(file && mddev->bitmap_info.offset); |
1611 | 1669 | ||
1612 | bitmap = kzalloc(sizeof(*bitmap), GFP_KERNEL); | 1670 | bitmap = kzalloc(sizeof(*bitmap), GFP_KERNEL); |
1613 | if (!bitmap) | 1671 | if (!bitmap) |
@@ -1620,8 +1678,14 @@ int bitmap_create(mddev_t *mddev) | |||
1620 | 1678 | ||
1621 | bitmap->mddev = mddev; | 1679 | bitmap->mddev = mddev; |
1622 | 1680 | ||
1681 | bm = sysfs_get_dirent(mddev->kobj.sd, "bitmap"); | ||
1682 | if (bm) { | ||
1683 | bitmap->sysfs_can_clear = sysfs_get_dirent(bm, "can_clear"); | ||
1684 | sysfs_put(bm); | ||
1685 | } else | ||
1686 | bitmap->sysfs_can_clear = NULL; | ||
1687 | |||
1623 | bitmap->file = file; | 1688 | bitmap->file = file; |
1624 | bitmap->offset = mddev->bitmap_offset; | ||
1625 | if (file) { | 1689 | if (file) { |
1626 | get_file(file); | 1690 | get_file(file); |
1627 | /* As future accesses to this file will use bmap, | 1691 | /* As future accesses to this file will use bmap, |
@@ -1630,12 +1694,22 @@ int bitmap_create(mddev_t *mddev) | |||
1630 | */ | 1694 | */ |
1631 | vfs_fsync(file, file->f_dentry, 1); | 1695 | vfs_fsync(file, file->f_dentry, 1); |
1632 | } | 1696 | } |
1633 | /* read superblock from bitmap file (this sets bitmap->chunksize) */ | 1697 | /* read superblock from bitmap file (this sets mddev->bitmap_info.chunksize) */ |
1634 | err = bitmap_read_sb(bitmap); | 1698 | if (!mddev->bitmap_info.external) |
1699 | err = bitmap_read_sb(bitmap); | ||
1700 | else { | ||
1701 | err = 0; | ||
1702 | if (mddev->bitmap_info.chunksize == 0 || | ||
1703 | mddev->bitmap_info.daemon_sleep == 0) | ||
1704 | /* chunksize and time_base need to be | ||
1705 | * set first. */ | ||
1706 | err = -EINVAL; | ||
1707 | } | ||
1635 | if (err) | 1708 | if (err) |
1636 | goto error; | 1709 | goto error; |
1637 | 1710 | ||
1638 | bitmap->chunkshift = ffz(~bitmap->chunksize); | 1711 | bitmap->daemon_lastrun = jiffies; |
1712 | bitmap->chunkshift = ffz(~mddev->bitmap_info.chunksize); | ||
1639 | 1713 | ||
1640 | /* now that chunksize and chunkshift are set, we can use these macros */ | 1714 | /* now that chunksize and chunkshift are set, we can use these macros */ |
1641 | chunks = (blocks + CHUNK_BLOCK_RATIO(bitmap) - 1) >> | 1715 | chunks = (blocks + CHUNK_BLOCK_RATIO(bitmap) - 1) >> |
@@ -1677,7 +1751,8 @@ int bitmap_create(mddev_t *mddev) | |||
1677 | 1751 | ||
1678 | mddev->bitmap = bitmap; | 1752 | mddev->bitmap = bitmap; |
1679 | 1753 | ||
1680 | mddev->thread->timeout = bitmap->daemon_sleep * HZ; | 1754 | mddev->thread->timeout = mddev->bitmap_info.daemon_sleep; |
1755 | md_wakeup_thread(mddev->thread); | ||
1681 | 1756 | ||
1682 | bitmap_update_sb(bitmap); | 1757 | bitmap_update_sb(bitmap); |
1683 | 1758 | ||
@@ -1688,6 +1763,264 @@ int bitmap_create(mddev_t *mddev) | |||
1688 | return err; | 1763 | return err; |
1689 | } | 1764 | } |
1690 | 1765 | ||
1766 | static ssize_t | ||
1767 | location_show(mddev_t *mddev, char *page) | ||
1768 | { | ||
1769 | ssize_t len; | ||
1770 | if (mddev->bitmap_info.file) { | ||
1771 | len = sprintf(page, "file"); | ||
1772 | } else if (mddev->bitmap_info.offset) { | ||
1773 | len = sprintf(page, "%+lld", (long long)mddev->bitmap_info.offset); | ||
1774 | } else | ||
1775 | len = sprintf(page, "none"); | ||
1776 | len += sprintf(page+len, "\n"); | ||
1777 | return len; | ||
1778 | } | ||
1779 | |||
1780 | static ssize_t | ||
1781 | location_store(mddev_t *mddev, const char *buf, size_t len) | ||
1782 | { | ||
1783 | |||
1784 | if (mddev->pers) { | ||
1785 | if (!mddev->pers->quiesce) | ||
1786 | return -EBUSY; | ||
1787 | if (mddev->recovery || mddev->sync_thread) | ||
1788 | return -EBUSY; | ||
1789 | } | ||
1790 | |||
1791 | if (mddev->bitmap || mddev->bitmap_info.file || | ||
1792 | mddev->bitmap_info.offset) { | ||
1793 | /* bitmap already configured. Only option is to clear it */ | ||
1794 | if (strncmp(buf, "none", 4) != 0) | ||
1795 | return -EBUSY; | ||
1796 | if (mddev->pers) { | ||
1797 | mddev->pers->quiesce(mddev, 1); | ||
1798 | bitmap_destroy(mddev); | ||
1799 | mddev->pers->quiesce(mddev, 0); | ||
1800 | } | ||
1801 | mddev->bitmap_info.offset = 0; | ||
1802 | if (mddev->bitmap_info.file) { | ||
1803 | struct file *f = mddev->bitmap_info.file; | ||
1804 | mddev->bitmap_info.file = NULL; | ||
1805 | restore_bitmap_write_access(f); | ||
1806 | fput(f); | ||
1807 | } | ||
1808 | } else { | ||
1809 | /* No bitmap, OK to set a location */ | ||
1810 | long long offset; | ||
1811 | if (strncmp(buf, "none", 4) == 0) | ||
1812 | /* nothing to be done */; | ||
1813 | else if (strncmp(buf, "file:", 5) == 0) { | ||
1814 | /* Not supported yet */ | ||
1815 | return -EINVAL; | ||
1816 | } else { | ||
1817 | int rv; | ||
1818 | if (buf[0] == '+') | ||
1819 | rv = strict_strtoll(buf+1, 10, &offset); | ||
1820 | else | ||
1821 | rv = strict_strtoll(buf, 10, &offset); | ||
1822 | if (rv) | ||
1823 | return rv; | ||
1824 | if (offset == 0) | ||
1825 | return -EINVAL; | ||
1826 | if (mddev->bitmap_info.external == 0 && | ||
1827 | mddev->major_version == 0 && | ||
1828 | offset != mddev->bitmap_info.default_offset) | ||
1829 | return -EINVAL; | ||
1830 | mddev->bitmap_info.offset = offset; | ||
1831 | if (mddev->pers) { | ||
1832 | mddev->pers->quiesce(mddev, 1); | ||
1833 | rv = bitmap_create(mddev); | ||
1834 | if (rv) { | ||
1835 | bitmap_destroy(mddev); | ||
1836 | mddev->bitmap_info.offset = 0; | ||
1837 | } | ||
1838 | mddev->pers->quiesce(mddev, 0); | ||
1839 | if (rv) | ||
1840 | return rv; | ||
1841 | } | ||
1842 | } | ||
1843 | } | ||
1844 | if (!mddev->external) { | ||
1845 | /* Ensure new bitmap info is stored in | ||
1846 | * metadata promptly. | ||
1847 | */ | ||
1848 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | ||
1849 | md_wakeup_thread(mddev->thread); | ||
1850 | } | ||
1851 | return len; | ||
1852 | } | ||
1853 | |||
1854 | static struct md_sysfs_entry bitmap_location = | ||
1855 | __ATTR(location, S_IRUGO|S_IWUSR, location_show, location_store); | ||
1856 | |||
1857 | static ssize_t | ||
1858 | timeout_show(mddev_t *mddev, char *page) | ||
1859 | { | ||
1860 | ssize_t len; | ||
1861 | unsigned long secs = mddev->bitmap_info.daemon_sleep / HZ; | ||
1862 | unsigned long jifs = mddev->bitmap_info.daemon_sleep % HZ; | ||
1863 | |||
1864 | len = sprintf(page, "%lu", secs); | ||
1865 | if (jifs) | ||
1866 | len += sprintf(page+len, ".%03u", jiffies_to_msecs(jifs)); | ||
1867 | len += sprintf(page+len, "\n"); | ||
1868 | return len; | ||
1869 | } | ||
1870 | |||
1871 | static ssize_t | ||
1872 | timeout_store(mddev_t *mddev, const char *buf, size_t len) | ||
1873 | { | ||
1874 | /* timeout can be set at any time */ | ||
1875 | unsigned long timeout; | ||
1876 | int rv = strict_strtoul_scaled(buf, &timeout, 4); | ||
1877 | if (rv) | ||
1878 | return rv; | ||
1879 | |||
1880 | /* just to make sure we don't overflow... */ | ||
1881 | if (timeout >= LONG_MAX / HZ) | ||
1882 | return -EINVAL; | ||
1883 | |||
1884 | timeout = timeout * HZ / 10000; | ||
1885 | |||
1886 | if (timeout >= MAX_SCHEDULE_TIMEOUT) | ||
1887 | timeout = MAX_SCHEDULE_TIMEOUT-1; | ||
1888 | if (timeout < 1) | ||
1889 | timeout = 1; | ||
1890 | mddev->bitmap_info.daemon_sleep = timeout; | ||
1891 | if (mddev->thread) { | ||
1892 | /* if thread->timeout is MAX_SCHEDULE_TIMEOUT, then | ||
1893 | * the bitmap is all clean and we don't need to | ||
1894 | * adjust the timeout right now | ||
1895 | */ | ||
1896 | if (mddev->thread->timeout < MAX_SCHEDULE_TIMEOUT) { | ||
1897 | mddev->thread->timeout = timeout; | ||
1898 | md_wakeup_thread(mddev->thread); | ||
1899 | } | ||
1900 | } | ||
1901 | return len; | ||
1902 | } | ||
1903 | |||
1904 | static struct md_sysfs_entry bitmap_timeout = | ||
1905 | __ATTR(time_base, S_IRUGO|S_IWUSR, timeout_show, timeout_store); | ||
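
A small sketch of the conversion timeout_store() performs, assuming strict_strtoul_scaled(buf, &v, 4) (an md helper) returns the decimal input scaled by 10^4, so "5.5" parses to 55000; the helper below is hypothetical and only mirrors the clamping above.

/* Convert a timeout parsed in 1/10000ths of a second into jiffies,
 * clamped the same way timeout_store() clamps it. */
static unsigned long timeout_to_jiffies(unsigned long scaled,
                                        unsigned long hz,
                                        unsigned long max_timeout)
{
        unsigned long t = scaled * hz / 10000;

        if (t >= max_timeout)          /* stay below MAX_SCHEDULE_TIMEOUT */
                t = max_timeout - 1;
        if (t < 1)                     /* the daemon must sleep at least one jiffy */
                t = 1;
        return t;
}
/* With HZ = 250: "5.5" -> 55000 * 250 / 10000 = 1375 jiffies (5.5 s). */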
1906 | |||
1907 | static ssize_t | ||
1908 | backlog_show(mddev_t *mddev, char *page) | ||
1909 | { | ||
1910 | return sprintf(page, "%lu\n", mddev->bitmap_info.max_write_behind); | ||
1911 | } | ||
1912 | |||
1913 | static ssize_t | ||
1914 | backlog_store(mddev_t *mddev, const char *buf, size_t len) | ||
1915 | { | ||
1916 | unsigned long backlog; | ||
1917 | int rv = strict_strtoul(buf, 10, &backlog); | ||
1918 | if (rv) | ||
1919 | return rv; | ||
1920 | if (backlog > COUNTER_MAX) | ||
1921 | return -EINVAL; | ||
1922 | mddev->bitmap_info.max_write_behind = backlog; | ||
1923 | return len; | ||
1924 | } | ||
1925 | |||
1926 | static struct md_sysfs_entry bitmap_backlog = | ||
1927 | __ATTR(backlog, S_IRUGO|S_IWUSR, backlog_show, backlog_store); | ||
1928 | |||
1929 | static ssize_t | ||
1930 | chunksize_show(mddev_t *mddev, char *page) | ||
1931 | { | ||
1932 | return sprintf(page, "%lu\n", mddev->bitmap_info.chunksize); | ||
1933 | } | ||
1934 | |||
1935 | static ssize_t | ||
1936 | chunksize_store(mddev_t *mddev, const char *buf, size_t len) | ||
1937 | { | ||
1938 | /* Can only be changed when no bitmap is active */ | ||
1939 | int rv; | ||
1940 | unsigned long csize; | ||
1941 | if (mddev->bitmap) | ||
1942 | return -EBUSY; | ||
1943 | rv = strict_strtoul(buf, 10, &csize); | ||
1944 | if (rv) | ||
1945 | return rv; | ||
1946 | if (csize < 512 || | ||
1947 | !is_power_of_2(csize)) | ||
1948 | return -EINVAL; | ||
1949 | mddev->bitmap_info.chunksize = csize; | ||
1950 | return len; | ||
1951 | } | ||
1952 | |||
1953 | static struct md_sysfs_entry bitmap_chunksize = | ||
1954 | __ATTR(chunksize, S_IRUGO|S_IWUSR, chunksize_show, chunksize_store); | ||
1955 | |||
1956 | static ssize_t metadata_show(mddev_t *mddev, char *page) | ||
1957 | { | ||
1958 | return sprintf(page, "%s\n", (mddev->bitmap_info.external | ||
1959 | ? "external" : "internal")); | ||
1960 | } | ||
1961 | |||
1962 | static ssize_t metadata_store(mddev_t *mddev, const char *buf, size_t len) | ||
1963 | { | ||
1964 | if (mddev->bitmap || | ||
1965 | mddev->bitmap_info.file || | ||
1966 | mddev->bitmap_info.offset) | ||
1967 | return -EBUSY; | ||
1968 | if (strncmp(buf, "external", 8) == 0) | ||
1969 | mddev->bitmap_info.external = 1; | ||
1970 | else if (strncmp(buf, "internal", 8) == 0) | ||
1971 | mddev->bitmap_info.external = 0; | ||
1972 | else | ||
1973 | return -EINVAL; | ||
1974 | return len; | ||
1975 | } | ||
1976 | |||
1977 | static struct md_sysfs_entry bitmap_metadata = | ||
1978 | __ATTR(metadata, S_IRUGO|S_IWUSR, metadata_show, metadata_store); | ||
1979 | |||
1980 | static ssize_t can_clear_show(mddev_t *mddev, char *page) | ||
1981 | { | ||
1982 | int len; | ||
1983 | if (mddev->bitmap) | ||
1984 | len = sprintf(page, "%s\n", (mddev->bitmap->need_sync ? | ||
1985 | "false" : "true")); | ||
1986 | else | ||
1987 | len = sprintf(page, "\n"); | ||
1988 | return len; | ||
1989 | } | ||
1990 | |||
1991 | static ssize_t can_clear_store(mddev_t *mddev, const char *buf, size_t len) | ||
1992 | { | ||
1993 | if (mddev->bitmap == NULL) | ||
1994 | return -ENOENT; | ||
1995 | if (strncmp(buf, "false", 5) == 0) | ||
1996 | mddev->bitmap->need_sync = 1; | ||
1997 | else if (strncmp(buf, "true", 4) == 0) { | ||
1998 | if (mddev->degraded) | ||
1999 | return -EBUSY; | ||
2000 | mddev->bitmap->need_sync = 0; | ||
2001 | } else | ||
2002 | return -EINVAL; | ||
2003 | return len; | ||
2004 | } | ||
2005 | |||
2006 | static struct md_sysfs_entry bitmap_can_clear = | ||
2007 | __ATTR(can_clear, S_IRUGO|S_IWUSR, can_clear_show, can_clear_store); | ||
2008 | |||
2009 | static struct attribute *md_bitmap_attrs[] = { | ||
2010 | &bitmap_location.attr, | ||
2011 | &bitmap_timeout.attr, | ||
2012 | &bitmap_backlog.attr, | ||
2013 | &bitmap_chunksize.attr, | ||
2014 | &bitmap_metadata.attr, | ||
2015 | &bitmap_can_clear.attr, | ||
2016 | NULL | ||
2017 | }; | ||
2018 | struct attribute_group md_bitmap_group = { | ||
2019 | .name = "bitmap", | ||
2020 | .attrs = md_bitmap_attrs, | ||
2021 | }; | ||
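
For completeness, a hedged usage sketch: once md_bitmap_group is registered, these attributes presumably surface as files under /sys/block/<md>/md/bitmap/ (path assumed from the usual md sysfs layout), so a tool can tune the bitmap without going through mdadm. The helper below is purely illustrative.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

/* Write a value such as "5" (time_base) or "none" (location) to one of the
 * bitmap attributes of array 'md', e.g. "md0". Returns 0 on success. */
static int write_md_bitmap_attr(const char *md, const char *attr,
                                const char *val)
{
        char path[128];
        int fd;
        ssize_t n;

        snprintf(path, sizeof(path), "/sys/block/%s/md/bitmap/%s", md, attr);
        fd = open(path, O_WRONLY);
        if (fd < 0)
                return -1;
        n = write(fd, val, strlen(val));
        close(fd);
        return n < 0 ? -1 : 0;
}
/* e.g. write_md_bitmap_attr("md0", "time_base", "5");   set a 5 s delay
 *      write_md_bitmap_attr("md0", "location", "none"); drop the bitmap  */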
2022 | |||
2023 | |||
1691 | /* the bitmap API -- for raid personalities */ | 2024 | /* the bitmap API -- for raid personalities */ |
1692 | EXPORT_SYMBOL(bitmap_startwrite); | 2025 | EXPORT_SYMBOL(bitmap_startwrite); |
1693 | EXPORT_SYMBOL(bitmap_endwrite); | 2026 | EXPORT_SYMBOL(bitmap_endwrite); |
diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h
index e98900671ca9..cb821d76d1b4 100644
--- a/drivers/md/bitmap.h
+++ b/drivers/md/bitmap.h
@@ -106,7 +106,7 @@ typedef __u16 bitmap_counter_t; | |||
106 | #define BITMAP_BLOCK_SHIFT 9 | 106 | #define BITMAP_BLOCK_SHIFT 9 |
107 | 107 | ||
108 | /* how many blocks per chunk? (this is variable) */ | 108 | /* how many blocks per chunk? (this is variable) */ |
109 | #define CHUNK_BLOCK_RATIO(bitmap) ((bitmap)->chunksize >> BITMAP_BLOCK_SHIFT) | 109 | #define CHUNK_BLOCK_RATIO(bitmap) ((bitmap)->mddev->bitmap_info.chunksize >> BITMAP_BLOCK_SHIFT) |
110 | #define CHUNK_BLOCK_SHIFT(bitmap) ((bitmap)->chunkshift - BITMAP_BLOCK_SHIFT) | 110 | #define CHUNK_BLOCK_SHIFT(bitmap) ((bitmap)->chunkshift - BITMAP_BLOCK_SHIFT) |
111 | #define CHUNK_BLOCK_MASK(bitmap) (CHUNK_BLOCK_RATIO(bitmap) - 1) | 111 | #define CHUNK_BLOCK_MASK(bitmap) (CHUNK_BLOCK_RATIO(bitmap) - 1) |
112 | 112 | ||
@@ -118,16 +118,6 @@ typedef __u16 bitmap_counter_t; | |||
118 | (CHUNK_BLOCK_SHIFT(bitmap) + PAGE_COUNTER_SHIFT - 1) | 118 | (CHUNK_BLOCK_SHIFT(bitmap) + PAGE_COUNTER_SHIFT - 1) |
119 | #define PAGEPTR_BLOCK_MASK(bitmap) (PAGEPTR_BLOCK_RATIO(bitmap) - 1) | 119 | #define PAGEPTR_BLOCK_MASK(bitmap) (PAGEPTR_BLOCK_RATIO(bitmap) - 1) |
120 | 120 | ||
121 | /* | ||
122 | * on-disk bitmap: | ||
123 | * | ||
124 | * Use one bit per "chunk" (block set). We do the disk I/O on the bitmap | ||
125 | * file a page at a time. There's a superblock at the start of the file. | ||
126 | */ | ||
127 | |||
128 | /* map chunks (bits) to file pages - offset by the size of the superblock */ | ||
129 | #define CHUNK_BIT_OFFSET(chunk) ((chunk) + (sizeof(bitmap_super_t) << 3)) | ||
130 | |||
131 | #endif | 121 | #endif |
132 | 122 | ||
133 | /* | 123 | /* |
@@ -209,7 +199,6 @@ struct bitmap { | |||
209 | int counter_bits; /* how many bits per block counter */ | 199 | int counter_bits; /* how many bits per block counter */ |
210 | 200 | ||
211 | /* bitmap chunksize -- how much data does each bit represent? */ | 201 | /* bitmap chunksize -- how much data does each bit represent? */ |
212 | unsigned long chunksize; | ||
213 | unsigned long chunkshift; /* chunksize = 2^chunkshift (for bitops) */ | 202 | unsigned long chunkshift; /* chunksize = 2^chunkshift (for bitops) */ |
214 | unsigned long chunks; /* total number of data chunks for the array */ | 203 | unsigned long chunks; /* total number of data chunks for the array */ |
215 | 204 | ||
@@ -226,7 +215,6 @@ struct bitmap { | |||
226 | /* bitmap spinlock */ | 215 | /* bitmap spinlock */ |
227 | spinlock_t lock; | 216 | spinlock_t lock; |
228 | 217 | ||
229 | long offset; /* offset from superblock if file is NULL */ | ||
230 | struct file *file; /* backing disk file */ | 218 | struct file *file; /* backing disk file */ |
231 | struct page *sb_page; /* cached copy of the bitmap file superblock */ | 219 | struct page *sb_page; /* cached copy of the bitmap file superblock */ |
232 | struct page **filemap; /* list of cache pages for the file */ | 220 | struct page **filemap; /* list of cache pages for the file */ |
@@ -238,7 +226,6 @@ struct bitmap { | |||
238 | 226 | ||
239 | int allclean; | 227 | int allclean; |
240 | 228 | ||
241 | unsigned long max_write_behind; /* write-behind mode */ | ||
242 | atomic_t behind_writes; | 229 | atomic_t behind_writes; |
243 | 230 | ||
244 | /* | 231 | /* |
@@ -246,7 +233,6 @@ struct bitmap { | |||
246 | * file, cleaning up bits and flushing out pages to disk as necessary | 233 | * file, cleaning up bits and flushing out pages to disk as necessary |
247 | */ | 234 | */ |
248 | unsigned long daemon_lastrun; /* jiffies of last run */ | 235 | unsigned long daemon_lastrun; /* jiffies of last run */ |
249 | unsigned long daemon_sleep; /* how many seconds between updates? */ | ||
250 | unsigned long last_end_sync; /* when we lasted called end_sync to | 236 | unsigned long last_end_sync; /* when we lasted called end_sync to |
251 | * update bitmap with resync progress */ | 237 | * update bitmap with resync progress */ |
252 | 238 | ||
@@ -254,6 +240,7 @@ struct bitmap { | |||
254 | wait_queue_head_t write_wait; | 240 | wait_queue_head_t write_wait; |
255 | wait_queue_head_t overflow_wait; | 241 | wait_queue_head_t overflow_wait; |
256 | 242 | ||
243 | struct sysfs_dirent *sysfs_can_clear; | ||
257 | }; | 244 | }; |
258 | 245 | ||
259 | /* the bitmap API */ | 246 | /* the bitmap API */ |
@@ -282,7 +269,7 @@ void bitmap_close_sync(struct bitmap *bitmap); | |||
282 | void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector); | 269 | void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector); |
283 | 270 | ||
284 | void bitmap_unplug(struct bitmap *bitmap); | 271 | void bitmap_unplug(struct bitmap *bitmap); |
285 | void bitmap_daemon_work(struct bitmap *bitmap); | 272 | void bitmap_daemon_work(mddev_t *mddev); |
286 | #endif | 273 | #endif |
287 | 274 | ||
288 | #endif | 275 | #endif |
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index ed1038164019..3bdbb6115702 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -1,7 +1,7 @@ | |||
1 | /* | 1 | /* |
2 | * Copyright (C) 2003 Christophe Saout <christophe@saout.de> | 2 | * Copyright (C) 2003 Christophe Saout <christophe@saout.de> |
3 | * Copyright (C) 2004 Clemens Fruhwirth <clemens@endorphin.org> | 3 | * Copyright (C) 2004 Clemens Fruhwirth <clemens@endorphin.org> |
4 | * Copyright (C) 2006-2008 Red Hat, Inc. All rights reserved. | 4 | * Copyright (C) 2006-2009 Red Hat, Inc. All rights reserved. |
5 | * | 5 | * |
6 | * This file is released under the GPL. | 6 | * This file is released under the GPL. |
7 | */ | 7 | */ |
@@ -71,10 +71,21 @@ struct crypt_iv_operations { | |||
71 | int (*ctr)(struct crypt_config *cc, struct dm_target *ti, | 71 | int (*ctr)(struct crypt_config *cc, struct dm_target *ti, |
72 | const char *opts); | 72 | const char *opts); |
73 | void (*dtr)(struct crypt_config *cc); | 73 | void (*dtr)(struct crypt_config *cc); |
74 | const char *(*status)(struct crypt_config *cc); | 74 | int (*init)(struct crypt_config *cc); |
75 | int (*wipe)(struct crypt_config *cc); | ||
75 | int (*generator)(struct crypt_config *cc, u8 *iv, sector_t sector); | 76 | int (*generator)(struct crypt_config *cc, u8 *iv, sector_t sector); |
76 | }; | 77 | }; |
77 | 78 | ||
79 | struct iv_essiv_private { | ||
80 | struct crypto_cipher *tfm; | ||
81 | struct crypto_hash *hash_tfm; | ||
82 | u8 *salt; | ||
83 | }; | ||
84 | |||
85 | struct iv_benbi_private { | ||
86 | int shift; | ||
87 | }; | ||
88 | |||
78 | /* | 89 | /* |
79 | * Crypt: maps a linear range of a block device | 90 | * Crypt: maps a linear range of a block device |
80 | * and encrypts / decrypts at the same time. | 91 | * and encrypts / decrypts at the same time. |
@@ -102,8 +113,8 @@ struct crypt_config { | |||
102 | struct crypt_iv_operations *iv_gen_ops; | 113 | struct crypt_iv_operations *iv_gen_ops; |
103 | char *iv_mode; | 114 | char *iv_mode; |
104 | union { | 115 | union { |
105 | struct crypto_cipher *essiv_tfm; | 116 | struct iv_essiv_private essiv; |
106 | int benbi_shift; | 117 | struct iv_benbi_private benbi; |
107 | } iv_gen_private; | 118 | } iv_gen_private; |
108 | sector_t iv_offset; | 119 | sector_t iv_offset; |
109 | unsigned int iv_size; | 120 | unsigned int iv_size; |
@@ -147,6 +158,9 @@ static void kcryptd_queue_crypt(struct dm_crypt_io *io); | |||
147 | * plain: the initial vector is the 32-bit little-endian version of the sector | 158 | * plain: the initial vector is the 32-bit little-endian version of the sector |
148 | * number, padded with zeros if necessary. | 159 | * number, padded with zeros if necessary. |
149 | * | 160 | * |
161 | * plain64: the initial vector is the 64-bit little-endian version of the sector | ||
162 | * number, padded with zeros if necessary. | ||
163 | * | ||
150 | * essiv: "encrypted sector|salt initial vector", the sector number is | 164 | * essiv: "encrypted sector|salt initial vector", the sector number is |
151 | * encrypted with the bulk cipher using a salt as key. The salt | 165 | * encrypted with the bulk cipher using a salt as key. The salt |
152 | * should be derived from the bulk cipher's key via hashing. | 166 | * should be derived from the bulk cipher's key via hashing. |
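
To make the plain vs. plain64 distinction in the comment above concrete, an illustrative (non-dm-crypt) sketch follows; the little-endian conversion is glossed over by assuming an LE host, where the real code uses cpu_to_le32/cpu_to_le64.

#include <stdint.h>
#include <string.h>

/* "plain": only the low 32 bits of the sector number reach the IV, so IVs
 * repeat beyond 2^32 sectors (2 TiB with 512-byte sectors). */
static void iv_plain(uint8_t *iv, unsigned iv_size, uint64_t sector)
{
        uint32_t s32 = (uint32_t)sector;       /* truncation point */

        memset(iv, 0, iv_size);
        memcpy(iv, &s32, sizeof(s32));
}

/* "plain64": the full 64-bit sector number, so IVs stay unique. */
static void iv_plain64(uint8_t *iv, unsigned iv_size, uint64_t sector)
{
        memset(iv, 0, iv_size);
        memcpy(iv, &sector, sizeof(sector));
}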
@@ -169,88 +183,123 @@ static int crypt_iv_plain_gen(struct crypt_config *cc, u8 *iv, sector_t sector) | |||
169 | return 0; | 183 | return 0; |
170 | } | 184 | } |
171 | 185 | ||
172 | static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti, | 186 | static int crypt_iv_plain64_gen(struct crypt_config *cc, u8 *iv, |
173 | const char *opts) | 187 | sector_t sector) |
174 | { | 188 | { |
175 | struct crypto_cipher *essiv_tfm; | 189 | memset(iv, 0, cc->iv_size); |
176 | struct crypto_hash *hash_tfm; | 190 | *(u64 *)iv = cpu_to_le64(sector); |
191 | |||
192 | return 0; | ||
193 | } | ||
194 | |||
195 | /* Initialise ESSIV - compute salt but no local memory allocations */ | ||
196 | static int crypt_iv_essiv_init(struct crypt_config *cc) | ||
197 | { | ||
198 | struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv; | ||
177 | struct hash_desc desc; | 199 | struct hash_desc desc; |
178 | struct scatterlist sg; | 200 | struct scatterlist sg; |
179 | unsigned int saltsize; | ||
180 | u8 *salt; | ||
181 | int err; | 201 | int err; |
182 | 202 | ||
183 | if (opts == NULL) { | 203 | sg_init_one(&sg, cc->key, cc->key_size); |
204 | desc.tfm = essiv->hash_tfm; | ||
205 | desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; | ||
206 | |||
207 | err = crypto_hash_digest(&desc, &sg, cc->key_size, essiv->salt); | ||
208 | if (err) | ||
209 | return err; | ||
210 | |||
211 | return crypto_cipher_setkey(essiv->tfm, essiv->salt, | ||
212 | crypto_hash_digestsize(essiv->hash_tfm)); | ||
213 | } | ||
214 | |||
215 | /* Wipe salt and reset key derived from volume key */ | ||
216 | static int crypt_iv_essiv_wipe(struct crypt_config *cc) | ||
217 | { | ||
218 | struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv; | ||
219 | unsigned salt_size = crypto_hash_digestsize(essiv->hash_tfm); | ||
220 | |||
221 | memset(essiv->salt, 0, salt_size); | ||
222 | |||
223 | return crypto_cipher_setkey(essiv->tfm, essiv->salt, salt_size); | ||
224 | } | ||
225 | |||
226 | static void crypt_iv_essiv_dtr(struct crypt_config *cc) | ||
227 | { | ||
228 | struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv; | ||
229 | |||
230 | crypto_free_cipher(essiv->tfm); | ||
231 | essiv->tfm = NULL; | ||
232 | |||
233 | crypto_free_hash(essiv->hash_tfm); | ||
234 | essiv->hash_tfm = NULL; | ||
235 | |||
236 | kzfree(essiv->salt); | ||
237 | essiv->salt = NULL; | ||
238 | } | ||
239 | |||
240 | static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti, | ||
241 | const char *opts) | ||
242 | { | ||
243 | struct crypto_cipher *essiv_tfm = NULL; | ||
244 | struct crypto_hash *hash_tfm = NULL; | ||
245 | u8 *salt = NULL; | ||
246 | int err; | ||
247 | |||
248 | if (!opts) { | ||
184 | ti->error = "Digest algorithm missing for ESSIV mode"; | 249 | ti->error = "Digest algorithm missing for ESSIV mode"; |
185 | return -EINVAL; | 250 | return -EINVAL; |
186 | } | 251 | } |
187 | 252 | ||
188 | /* Hash the cipher key with the given hash algorithm */ | 253 | /* Allocate hash algorithm */ |
189 | hash_tfm = crypto_alloc_hash(opts, 0, CRYPTO_ALG_ASYNC); | 254 | hash_tfm = crypto_alloc_hash(opts, 0, CRYPTO_ALG_ASYNC); |
190 | if (IS_ERR(hash_tfm)) { | 255 | if (IS_ERR(hash_tfm)) { |
191 | ti->error = "Error initializing ESSIV hash"; | 256 | ti->error = "Error initializing ESSIV hash"; |
192 | return PTR_ERR(hash_tfm); | 257 | err = PTR_ERR(hash_tfm); |
258 | goto bad; | ||
193 | } | 259 | } |
194 | 260 | ||
195 | saltsize = crypto_hash_digestsize(hash_tfm); | 261 | salt = kzalloc(crypto_hash_digestsize(hash_tfm), GFP_KERNEL); |
196 | salt = kmalloc(saltsize, GFP_KERNEL); | 262 | if (!salt) { |
197 | if (salt == NULL) { | ||
198 | ti->error = "Error kmallocing salt storage in ESSIV"; | 263 | ti->error = "Error kmallocing salt storage in ESSIV"; |
199 | crypto_free_hash(hash_tfm); | 264 | err = -ENOMEM; |
200 | return -ENOMEM; | 265 | goto bad; |
201 | } | 266 | } |
202 | 267 | ||
203 | sg_init_one(&sg, cc->key, cc->key_size); | 268 | /* Allocate essiv_tfm */ |
204 | desc.tfm = hash_tfm; | ||
205 | desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; | ||
206 | err = crypto_hash_digest(&desc, &sg, cc->key_size, salt); | ||
207 | crypto_free_hash(hash_tfm); | ||
208 | |||
209 | if (err) { | ||
210 | ti->error = "Error calculating hash in ESSIV"; | ||
211 | kfree(salt); | ||
212 | return err; | ||
213 | } | ||
214 | |||
215 | /* Setup the essiv_tfm with the given salt */ | ||
216 | essiv_tfm = crypto_alloc_cipher(cc->cipher, 0, CRYPTO_ALG_ASYNC); | 269 | essiv_tfm = crypto_alloc_cipher(cc->cipher, 0, CRYPTO_ALG_ASYNC); |
217 | if (IS_ERR(essiv_tfm)) { | 270 | if (IS_ERR(essiv_tfm)) { |
218 | ti->error = "Error allocating crypto tfm for ESSIV"; | 271 | ti->error = "Error allocating crypto tfm for ESSIV"; |
219 | kfree(salt); | 272 | err = PTR_ERR(essiv_tfm); |
220 | return PTR_ERR(essiv_tfm); | 273 | goto bad; |
221 | } | 274 | } |
222 | if (crypto_cipher_blocksize(essiv_tfm) != | 275 | if (crypto_cipher_blocksize(essiv_tfm) != |
223 | crypto_ablkcipher_ivsize(cc->tfm)) { | 276 | crypto_ablkcipher_ivsize(cc->tfm)) { |
224 | ti->error = "Block size of ESSIV cipher does " | 277 | ti->error = "Block size of ESSIV cipher does " |
225 | "not match IV size of block cipher"; | 278 | "not match IV size of block cipher"; |
226 | crypto_free_cipher(essiv_tfm); | 279 | err = -EINVAL; |
227 | kfree(salt); | 280 | goto bad; |
228 | return -EINVAL; | ||
229 | } | 281 | } |
230 | err = crypto_cipher_setkey(essiv_tfm, salt, saltsize); | ||
231 | if (err) { | ||
232 | ti->error = "Failed to set key for ESSIV cipher"; | ||
233 | crypto_free_cipher(essiv_tfm); | ||
234 | kfree(salt); | ||
235 | return err; | ||
236 | } | ||
237 | kfree(salt); | ||
238 | 282 | ||
239 | cc->iv_gen_private.essiv_tfm = essiv_tfm; | 283 | cc->iv_gen_private.essiv.salt = salt; |
284 | cc->iv_gen_private.essiv.tfm = essiv_tfm; | ||
285 | cc->iv_gen_private.essiv.hash_tfm = hash_tfm; | ||
286 | |||
240 | return 0; | 287 | return 0; |
241 | } | ||
242 | 288 | ||
243 | static void crypt_iv_essiv_dtr(struct crypt_config *cc) | 289 | bad: |
244 | { | 290 | if (essiv_tfm && !IS_ERR(essiv_tfm)) |
245 | crypto_free_cipher(cc->iv_gen_private.essiv_tfm); | 291 | crypto_free_cipher(essiv_tfm); |
246 | cc->iv_gen_private.essiv_tfm = NULL; | 292 | if (hash_tfm && !IS_ERR(hash_tfm)) |
293 | crypto_free_hash(hash_tfm); | ||
294 | kfree(salt); | ||
295 | return err; | ||
247 | } | 296 | } |
248 | 297 | ||
249 | static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv, sector_t sector) | 298 | static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv, sector_t sector) |
250 | { | 299 | { |
251 | memset(iv, 0, cc->iv_size); | 300 | memset(iv, 0, cc->iv_size); |
252 | *(u64 *)iv = cpu_to_le64(sector); | 301 | *(u64 *)iv = cpu_to_le64(sector); |
253 | crypto_cipher_encrypt_one(cc->iv_gen_private.essiv_tfm, iv, iv); | 302 | crypto_cipher_encrypt_one(cc->iv_gen_private.essiv.tfm, iv, iv); |
254 | return 0; | 303 | return 0; |
255 | } | 304 | } |
256 | 305 | ||
@@ -273,7 +322,7 @@ static int crypt_iv_benbi_ctr(struct crypt_config *cc, struct dm_target *ti, | |||
273 | return -EINVAL; | 322 | return -EINVAL; |
274 | } | 323 | } |
275 | 324 | ||
276 | cc->iv_gen_private.benbi_shift = 9 - log; | 325 | cc->iv_gen_private.benbi.shift = 9 - log; |
277 | 326 | ||
278 | return 0; | 327 | return 0; |
279 | } | 328 | } |
@@ -288,7 +337,7 @@ static int crypt_iv_benbi_gen(struct crypt_config *cc, u8 *iv, sector_t sector) | |||
288 | 337 | ||
289 | memset(iv, 0, cc->iv_size - sizeof(u64)); /* rest is cleared below */ | 338 | memset(iv, 0, cc->iv_size - sizeof(u64)); /* rest is cleared below */ |
290 | 339 | ||
291 | val = cpu_to_be64(((u64)sector << cc->iv_gen_private.benbi_shift) + 1); | 340 | val = cpu_to_be64(((u64)sector << cc->iv_gen_private.benbi.shift) + 1); |
292 | put_unaligned(val, (__be64 *)(iv + cc->iv_size - sizeof(u64))); | 341 | put_unaligned(val, (__be64 *)(iv + cc->iv_size - sizeof(u64))); |
293 | 342 | ||
294 | return 0; | 343 | return 0; |
@@ -305,9 +354,15 @@ static struct crypt_iv_operations crypt_iv_plain_ops = { | |||
305 | .generator = crypt_iv_plain_gen | 354 | .generator = crypt_iv_plain_gen |
306 | }; | 355 | }; |
307 | 356 | ||
357 | static struct crypt_iv_operations crypt_iv_plain64_ops = { | ||
358 | .generator = crypt_iv_plain64_gen | ||
359 | }; | ||
360 | |||
308 | static struct crypt_iv_operations crypt_iv_essiv_ops = { | 361 | static struct crypt_iv_operations crypt_iv_essiv_ops = { |
309 | .ctr = crypt_iv_essiv_ctr, | 362 | .ctr = crypt_iv_essiv_ctr, |
310 | .dtr = crypt_iv_essiv_dtr, | 363 | .dtr = crypt_iv_essiv_dtr, |
364 | .init = crypt_iv_essiv_init, | ||
365 | .wipe = crypt_iv_essiv_wipe, | ||
311 | .generator = crypt_iv_essiv_gen | 366 | .generator = crypt_iv_essiv_gen |
312 | }; | 367 | }; |
313 | 368 | ||
@@ -934,14 +989,14 @@ static int crypt_set_key(struct crypt_config *cc, char *key) | |||
934 | 989 | ||
935 | set_bit(DM_CRYPT_KEY_VALID, &cc->flags); | 990 | set_bit(DM_CRYPT_KEY_VALID, &cc->flags); |
936 | 991 | ||
937 | return 0; | 992 | return crypto_ablkcipher_setkey(cc->tfm, cc->key, cc->key_size); |
938 | } | 993 | } |
939 | 994 | ||
940 | static int crypt_wipe_key(struct crypt_config *cc) | 995 | static int crypt_wipe_key(struct crypt_config *cc) |
941 | { | 996 | { |
942 | clear_bit(DM_CRYPT_KEY_VALID, &cc->flags); | 997 | clear_bit(DM_CRYPT_KEY_VALID, &cc->flags); |
943 | memset(&cc->key, 0, cc->key_size * sizeof(u8)); | 998 | memset(&cc->key, 0, cc->key_size * sizeof(u8)); |
944 | return 0; | 999 | return crypto_ablkcipher_setkey(cc->tfm, cc->key, cc->key_size); |
945 | } | 1000 | } |
946 | 1001 | ||
947 | /* | 1002 | /* |
@@ -983,12 +1038,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
983 | return -ENOMEM; | 1038 | return -ENOMEM; |
984 | } | 1039 | } |
985 | 1040 | ||
986 | if (crypt_set_key(cc, argv[1])) { | 1041 | /* Compatibility mode for old dm-crypt cipher strings */ |
987 | ti->error = "Error decoding key"; | ||
988 | goto bad_cipher; | ||
989 | } | ||
990 | |||
991 | /* Compatiblity mode for old dm-crypt cipher strings */ | ||
992 | if (!chainmode || (strcmp(chainmode, "plain") == 0 && !ivmode)) { | 1042 | if (!chainmode || (strcmp(chainmode, "plain") == 0 && !ivmode)) { |
993 | chainmode = "cbc"; | 1043 | chainmode = "cbc"; |
994 | ivmode = "plain"; | 1044 | ivmode = "plain"; |
@@ -1015,6 +1065,11 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
1015 | strcpy(cc->chainmode, chainmode); | 1065 | strcpy(cc->chainmode, chainmode); |
1016 | cc->tfm = tfm; | 1066 | cc->tfm = tfm; |
1017 | 1067 | ||
1068 | if (crypt_set_key(cc, argv[1]) < 0) { | ||
1069 | ti->error = "Error decoding and setting key"; | ||
1070 | goto bad_ivmode; | ||
1071 | } | ||
1072 | |||
1018 | /* | 1073 | /* |
1019 | * Choose ivmode. Valid modes: "plain", "essiv:<esshash>", "benbi". | 1074 | * Choose ivmode. Valid modes: "plain", "essiv:<esshash>", "benbi". |
1020 | * See comments at iv code | 1075 | * See comments at iv code |
@@ -1024,6 +1079,8 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
1024 | cc->iv_gen_ops = NULL; | 1079 | cc->iv_gen_ops = NULL; |
1025 | else if (strcmp(ivmode, "plain") == 0) | 1080 | else if (strcmp(ivmode, "plain") == 0) |
1026 | cc->iv_gen_ops = &crypt_iv_plain_ops; | 1081 | cc->iv_gen_ops = &crypt_iv_plain_ops; |
1082 | else if (strcmp(ivmode, "plain64") == 0) | ||
1083 | cc->iv_gen_ops = &crypt_iv_plain64_ops; | ||
1027 | else if (strcmp(ivmode, "essiv") == 0) | 1084 | else if (strcmp(ivmode, "essiv") == 0) |
1028 | cc->iv_gen_ops = &crypt_iv_essiv_ops; | 1085 | cc->iv_gen_ops = &crypt_iv_essiv_ops; |
1029 | else if (strcmp(ivmode, "benbi") == 0) | 1086 | else if (strcmp(ivmode, "benbi") == 0) |
@@ -1039,6 +1096,12 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
1039 | cc->iv_gen_ops->ctr(cc, ti, ivopts) < 0) | 1096 | cc->iv_gen_ops->ctr(cc, ti, ivopts) < 0) |
1040 | goto bad_ivmode; | 1097 | goto bad_ivmode; |
1041 | 1098 | ||
1099 | if (cc->iv_gen_ops && cc->iv_gen_ops->init && | ||
1100 | cc->iv_gen_ops->init(cc) < 0) { | ||
1101 | ti->error = "Error initialising IV"; | ||
1102 | goto bad_slab_pool; | ||
1103 | } | ||
1104 | |||
1042 | cc->iv_size = crypto_ablkcipher_ivsize(tfm); | 1105 | cc->iv_size = crypto_ablkcipher_ivsize(tfm); |
1043 | if (cc->iv_size) | 1106 | if (cc->iv_size) |
1044 | /* at least a 64 bit sector number should fit in our buffer */ | 1107 | /* at least a 64 bit sector number should fit in our buffer */ |
@@ -1085,11 +1148,6 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
1085 | goto bad_bs; | 1148 | goto bad_bs; |
1086 | } | 1149 | } |
1087 | 1150 | ||
1088 | if (crypto_ablkcipher_setkey(tfm, cc->key, key_size) < 0) { | ||
1089 | ti->error = "Error setting key"; | ||
1090 | goto bad_device; | ||
1091 | } | ||
1092 | |||
1093 | if (sscanf(argv[2], "%llu", &tmpll) != 1) { | 1151 | if (sscanf(argv[2], "%llu", &tmpll) != 1) { |
1094 | ti->error = "Invalid iv_offset sector"; | 1152 | ti->error = "Invalid iv_offset sector"; |
1095 | goto bad_device; | 1153 | goto bad_device; |
@@ -1102,8 +1160,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
1102 | } | 1160 | } |
1103 | cc->start = tmpll; | 1161 | cc->start = tmpll; |
1104 | 1162 | ||
1105 | if (dm_get_device(ti, argv[3], cc->start, ti->len, | 1163 | if (dm_get_device(ti, argv[3], dm_table_get_mode(ti->table), &cc->dev)) { |
1106 | dm_table_get_mode(ti->table), &cc->dev)) { | ||
1107 | ti->error = "Device lookup failed"; | 1164 | ti->error = "Device lookup failed"; |
1108 | goto bad_device; | 1165 | goto bad_device; |
1109 | } | 1166 | } |
@@ -1278,6 +1335,7 @@ static void crypt_resume(struct dm_target *ti) | |||
1278 | static int crypt_message(struct dm_target *ti, unsigned argc, char **argv) | 1335 | static int crypt_message(struct dm_target *ti, unsigned argc, char **argv) |
1279 | { | 1336 | { |
1280 | struct crypt_config *cc = ti->private; | 1337 | struct crypt_config *cc = ti->private; |
1338 | int ret = -EINVAL; | ||
1281 | 1339 | ||
1282 | if (argc < 2) | 1340 | if (argc < 2) |
1283 | goto error; | 1341 | goto error; |
@@ -1287,10 +1345,22 @@ static int crypt_message(struct dm_target *ti, unsigned argc, char **argv) | |||
1287 | DMWARN("not suspended during key manipulation."); | 1345 | DMWARN("not suspended during key manipulation."); |
1288 | return -EINVAL; | 1346 | return -EINVAL; |
1289 | } | 1347 | } |
1290 | if (argc == 3 && !strnicmp(argv[1], MESG_STR("set"))) | 1348 | if (argc == 3 && !strnicmp(argv[1], MESG_STR("set"))) { |
1291 | return crypt_set_key(cc, argv[2]); | 1349 | ret = crypt_set_key(cc, argv[2]); |
1292 | if (argc == 2 && !strnicmp(argv[1], MESG_STR("wipe"))) | 1350 | if (ret) |
1351 | return ret; | ||
1352 | if (cc->iv_gen_ops && cc->iv_gen_ops->init) | ||
1353 | ret = cc->iv_gen_ops->init(cc); | ||
1354 | return ret; | ||
1355 | } | ||
1356 | if (argc == 2 && !strnicmp(argv[1], MESG_STR("wipe"))) { | ||
1357 | if (cc->iv_gen_ops && cc->iv_gen_ops->wipe) { | ||
1358 | ret = cc->iv_gen_ops->wipe(cc); | ||
1359 | if (ret) | ||
1360 | return ret; | ||
1361 | } | ||
1293 | return crypt_wipe_key(cc); | 1362 | return crypt_wipe_key(cc); |
1363 | } | ||
1294 | } | 1364 | } |
1295 | 1365 | ||
1296 | error: | 1366 | error: |
diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c index ebe7381f47c8..852052880d7a 100644 --- a/drivers/md/dm-delay.c +++ b/drivers/md/dm-delay.c | |||
@@ -156,8 +156,8 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
156 | goto bad; | 156 | goto bad; |
157 | } | 157 | } |
158 | 158 | ||
159 | if (dm_get_device(ti, argv[0], dc->start_read, ti->len, | 159 | if (dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), |
160 | dm_table_get_mode(ti->table), &dc->dev_read)) { | 160 | &dc->dev_read)) { |
161 | ti->error = "Device lookup failed"; | 161 | ti->error = "Device lookup failed"; |
162 | goto bad; | 162 | goto bad; |
163 | } | 163 | } |
@@ -177,8 +177,8 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
177 | goto bad_dev_read; | 177 | goto bad_dev_read; |
178 | } | 178 | } |
179 | 179 | ||
180 | if (dm_get_device(ti, argv[3], dc->start_write, ti->len, | 180 | if (dm_get_device(ti, argv[3], dm_table_get_mode(ti->table), |
181 | dm_table_get_mode(ti->table), &dc->dev_write)) { | 181 | &dc->dev_write)) { |
182 | ti->error = "Write device lookup failed"; | 182 | ti->error = "Write device lookup failed"; |
183 | goto bad_dev_read; | 183 | goto bad_dev_read; |
184 | } | 184 | } |
diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c index 7dbe652efb5a..2b7907b6dd09 100644 --- a/drivers/md/dm-exception-store.c +++ b/drivers/md/dm-exception-store.c | |||
@@ -172,7 +172,8 @@ int dm_exception_store_set_chunk_size(struct dm_exception_store *store, | |||
172 | } | 172 | } |
173 | 173 | ||
174 | /* Validate the chunk size against the device block size */ | 174 | /* Validate the chunk size against the device block size */ |
175 | if (chunk_size % (bdev_logical_block_size(store->cow->bdev) >> 9)) { | 175 | if (chunk_size % |
176 | (bdev_logical_block_size(dm_snap_cow(store->snap)->bdev) >> 9)) { | ||
176 | *error = "Chunk size is not a multiple of device blocksize"; | 177 | *error = "Chunk size is not a multiple of device blocksize"; |
177 | return -EINVAL; | 178 | return -EINVAL; |
178 | } | 179 | } |
@@ -190,6 +191,7 @@ int dm_exception_store_set_chunk_size(struct dm_exception_store *store, | |||
190 | } | 191 | } |
191 | 192 | ||
192 | int dm_exception_store_create(struct dm_target *ti, int argc, char **argv, | 193 | int dm_exception_store_create(struct dm_target *ti, int argc, char **argv, |
194 | struct dm_snapshot *snap, | ||
193 | unsigned *args_used, | 195 | unsigned *args_used, |
194 | struct dm_exception_store **store) | 196 | struct dm_exception_store **store) |
195 | { | 197 | { |
@@ -198,7 +200,7 @@ int dm_exception_store_create(struct dm_target *ti, int argc, char **argv, | |||
198 | struct dm_exception_store *tmp_store; | 200 | struct dm_exception_store *tmp_store; |
199 | char persistent; | 201 | char persistent; |
200 | 202 | ||
201 | if (argc < 3) { | 203 | if (argc < 2) { |
202 | ti->error = "Insufficient exception store arguments"; | 204 | ti->error = "Insufficient exception store arguments"; |
203 | return -EINVAL; | 205 | return -EINVAL; |
204 | } | 206 | } |
@@ -209,14 +211,15 @@ int dm_exception_store_create(struct dm_target *ti, int argc, char **argv, | |||
209 | return -ENOMEM; | 211 | return -ENOMEM; |
210 | } | 212 | } |
211 | 213 | ||
212 | persistent = toupper(*argv[1]); | 214 | persistent = toupper(*argv[0]); |
213 | if (persistent == 'P') | 215 | if (persistent == 'P') |
214 | type = get_type("P"); | 216 | type = get_type("P"); |
215 | else if (persistent == 'N') | 217 | else if (persistent == 'N') |
216 | type = get_type("N"); | 218 | type = get_type("N"); |
217 | else { | 219 | else { |
218 | ti->error = "Persistent flag is not P or N"; | 220 | ti->error = "Persistent flag is not P or N"; |
219 | return -EINVAL; | 221 | r = -EINVAL; |
222 | goto bad_type; | ||
220 | } | 223 | } |
221 | 224 | ||
222 | if (!type) { | 225 | if (!type) { |
@@ -226,32 +229,23 @@ int dm_exception_store_create(struct dm_target *ti, int argc, char **argv, | |||
226 | } | 229 | } |
227 | 230 | ||
228 | tmp_store->type = type; | 231 | tmp_store->type = type; |
229 | tmp_store->ti = ti; | 232 | tmp_store->snap = snap; |
230 | |||
231 | r = dm_get_device(ti, argv[0], 0, 0, | ||
232 | FMODE_READ | FMODE_WRITE, &tmp_store->cow); | ||
233 | if (r) { | ||
234 | ti->error = "Cannot get COW device"; | ||
235 | goto bad_cow; | ||
236 | } | ||
237 | 233 | ||
238 | r = set_chunk_size(tmp_store, argv[2], &ti->error); | 234 | r = set_chunk_size(tmp_store, argv[1], &ti->error); |
239 | if (r) | 235 | if (r) |
240 | goto bad_ctr; | 236 | goto bad; |
241 | 237 | ||
242 | r = type->ctr(tmp_store, 0, NULL); | 238 | r = type->ctr(tmp_store, 0, NULL); |
243 | if (r) { | 239 | if (r) { |
244 | ti->error = "Exception store type constructor failed"; | 240 | ti->error = "Exception store type constructor failed"; |
245 | goto bad_ctr; | 241 | goto bad; |
246 | } | 242 | } |
247 | 243 | ||
248 | *args_used = 3; | 244 | *args_used = 2; |
249 | *store = tmp_store; | 245 | *store = tmp_store; |
250 | return 0; | 246 | return 0; |
251 | 247 | ||
252 | bad_ctr: | 248 | bad: |
253 | dm_put_device(ti, tmp_store->cow); | ||
254 | bad_cow: | ||
255 | put_type(type); | 249 | put_type(type); |
256 | bad_type: | 250 | bad_type: |
257 | kfree(tmp_store); | 251 | kfree(tmp_store); |
@@ -262,7 +256,6 @@ EXPORT_SYMBOL(dm_exception_store_create); | |||
262 | void dm_exception_store_destroy(struct dm_exception_store *store) | 256 | void dm_exception_store_destroy(struct dm_exception_store *store) |
263 | { | 257 | { |
264 | store->type->dtr(store); | 258 | store->type->dtr(store); |
265 | dm_put_device(store->ti, store->cow); | ||
266 | put_type(store->type); | 259 | put_type(store->type); |
267 | kfree(store); | 260 | kfree(store); |
268 | } | 261 | } |
diff --git a/drivers/md/dm-exception-store.h b/drivers/md/dm-exception-store.h index 8a223a48802c..e8dfa06af3ba 100644 --- a/drivers/md/dm-exception-store.h +++ b/drivers/md/dm-exception-store.h | |||
@@ -26,7 +26,7 @@ typedef sector_t chunk_t; | |||
26 | * of chunks that follow contiguously. Remaining bits hold the number of the | 26 | * of chunks that follow contiguously. Remaining bits hold the number of the |
27 | * chunk within the device. | 27 | * chunk within the device. |
28 | */ | 28 | */ |
29 | struct dm_snap_exception { | 29 | struct dm_exception { |
30 | struct list_head hash_list; | 30 | struct list_head hash_list; |
31 | 31 | ||
32 | chunk_t old_chunk; | 32 | chunk_t old_chunk; |
@@ -64,17 +64,34 @@ struct dm_exception_store_type { | |||
64 | * Find somewhere to store the next exception. | 64 | * Find somewhere to store the next exception. |
65 | */ | 65 | */ |
66 | int (*prepare_exception) (struct dm_exception_store *store, | 66 | int (*prepare_exception) (struct dm_exception_store *store, |
67 | struct dm_snap_exception *e); | 67 | struct dm_exception *e); |
68 | 68 | ||
69 | /* | 69 | /* |
70 | * Update the metadata with this exception. | 70 | * Update the metadata with this exception. |
71 | */ | 71 | */ |
72 | void (*commit_exception) (struct dm_exception_store *store, | 72 | void (*commit_exception) (struct dm_exception_store *store, |
73 | struct dm_snap_exception *e, | 73 | struct dm_exception *e, |
74 | void (*callback) (void *, int success), | 74 | void (*callback) (void *, int success), |
75 | void *callback_context); | 75 | void *callback_context); |
76 | 76 | ||
77 | /* | 77 | /* |
78 | * Returns 0 if the exception store is empty. | ||
79 | * | ||
80 | * If there are exceptions still to be merged, sets | ||
81 | * *last_old_chunk and *last_new_chunk to the most recent | ||
82 | * still-to-be-merged chunk and returns the number of | ||
83 | * consecutive previous ones. | ||
84 | */ | ||
85 | int (*prepare_merge) (struct dm_exception_store *store, | ||
86 | chunk_t *last_old_chunk, chunk_t *last_new_chunk); | ||
87 | |||
88 | /* | ||
89 | * Clear the last n exceptions. | ||
90 | * nr_merged must be <= the value returned by prepare_merge. | ||
91 | */ | ||
92 | int (*commit_merge) (struct dm_exception_store *store, int nr_merged); | ||
93 | |||
94 | /* | ||
78 | * The snapshot is invalid, note this in the metadata. | 95 | * The snapshot is invalid, note this in the metadata. |
79 | */ | 96 | */ |
80 | void (*drop_snapshot) (struct dm_exception_store *store); | 97 | void (*drop_snapshot) (struct dm_exception_store *store); |
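The new prepare_merge/commit_merge callbacks split merging into a read-only query and a destructive commit: the first reports the most recent still-to-be-merged pair plus the length of the consecutive run ending there, and the second then drops exactly that many. A toy userspace mock of that protocol over an in-memory array; the names, data and run-detection rule are illustrative, not the persistent store's on-disk logic:

```c
/* Mock of the prepare_merge/commit_merge protocol over an array of exceptions. */
#include <stdint.h>
#include <stdio.h>

struct exc { uint64_t old_chunk, new_chunk; };

static struct exc store[] = {
    { 10, 100 }, { 11, 101 }, { 12, 102 }, { 40, 200 },
};
static int nr_exceptions = 4;

/* Report the newest unmerged pair and the length of the consecutive run ending there. */
static int prepare_merge(uint64_t *last_old, uint64_t *last_new)
{
    int run = 0;

    if (!nr_exceptions)
        return 0;

    *last_old = store[nr_exceptions - 1].old_chunk;
    *last_new = store[nr_exceptions - 1].new_chunk;

    while (run < nr_exceptions &&
           store[nr_exceptions - 1 - run].old_chunk == *last_old - run &&
           store[nr_exceptions - 1 - run].new_chunk == *last_new - run)
        run++;
    return run;
}

/* Clear the last nr_merged exceptions; must not exceed prepare_merge()'s result. */
static int commit_merge(int nr_merged)
{
    if (nr_merged > nr_exceptions)
        return -1;
    nr_exceptions -= nr_merged;
    return 0;
}

int main(void)
{
    uint64_t o, n;
    int nr;

    while ((nr = prepare_merge(&o, &n)) > 0) {
        printf("merge %d chunk(s) ending at old=%llu new=%llu\n",
               nr, (unsigned long long)o, (unsigned long long)n);
        commit_merge(nr);
    }
    return 0;
}
```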
@@ -86,19 +103,19 @@ struct dm_exception_store_type { | |||
86 | /* | 103 | /* |
87 | * Return how full the snapshot is. | 104 | * Return how full the snapshot is. |
88 | */ | 105 | */ |
89 | void (*fraction_full) (struct dm_exception_store *store, | 106 | void (*usage) (struct dm_exception_store *store, |
90 | sector_t *numerator, | 107 | sector_t *total_sectors, sector_t *sectors_allocated, |
91 | sector_t *denominator); | 108 | sector_t *metadata_sectors); |
92 | 109 | ||
93 | /* For internal device-mapper use only. */ | 110 | /* For internal device-mapper use only. */ |
94 | struct list_head list; | 111 | struct list_head list; |
95 | }; | 112 | }; |
96 | 113 | ||
114 | struct dm_snapshot; | ||
115 | |||
97 | struct dm_exception_store { | 116 | struct dm_exception_store { |
98 | struct dm_exception_store_type *type; | 117 | struct dm_exception_store_type *type; |
99 | struct dm_target *ti; | 118 | struct dm_snapshot *snap; |
100 | |||
101 | struct dm_dev *cow; | ||
102 | 119 | ||
103 | /* Size of data blocks saved - must be a power of 2 */ | 120 | /* Size of data blocks saved - must be a power of 2 */ |
104 | unsigned chunk_size; | 121 | unsigned chunk_size; |
@@ -109,6 +126,11 @@ struct dm_exception_store { | |||
109 | }; | 126 | }; |
110 | 127 | ||
111 | /* | 128 | /* |
129 | * Obtain the cow device used by a given snapshot. | ||
130 | */ | ||
131 | struct dm_dev *dm_snap_cow(struct dm_snapshot *snap); | ||
132 | |||
133 | /* | ||
112 | * Functions to manipulate consecutive chunks | 134 | * Functions to manipulate consecutive chunks
113 | */ | 135 | */ |
114 | # if defined(CONFIG_LBDAF) || (BITS_PER_LONG == 64) | 136 | # if defined(CONFIG_LBDAF) || (BITS_PER_LONG == 64) |
@@ -120,18 +142,25 @@ static inline chunk_t dm_chunk_number(chunk_t chunk) | |||
120 | return chunk & (chunk_t)((1ULL << DM_CHUNK_NUMBER_BITS) - 1ULL); | 142 | return chunk & (chunk_t)((1ULL << DM_CHUNK_NUMBER_BITS) - 1ULL); |
121 | } | 143 | } |
122 | 144 | ||
123 | static inline unsigned dm_consecutive_chunk_count(struct dm_snap_exception *e) | 145 | static inline unsigned dm_consecutive_chunk_count(struct dm_exception *e) |
124 | { | 146 | { |
125 | return e->new_chunk >> DM_CHUNK_NUMBER_BITS; | 147 | return e->new_chunk >> DM_CHUNK_NUMBER_BITS; |
126 | } | 148 | } |
127 | 149 | ||
128 | static inline void dm_consecutive_chunk_count_inc(struct dm_snap_exception *e) | 150 | static inline void dm_consecutive_chunk_count_inc(struct dm_exception *e) |
129 | { | 151 | { |
130 | e->new_chunk += (1ULL << DM_CHUNK_NUMBER_BITS); | 152 | e->new_chunk += (1ULL << DM_CHUNK_NUMBER_BITS); |
131 | 153 | ||
132 | BUG_ON(!dm_consecutive_chunk_count(e)); | 154 | BUG_ON(!dm_consecutive_chunk_count(e)); |
133 | } | 155 | } |
134 | 156 | ||
157 | static inline void dm_consecutive_chunk_count_dec(struct dm_exception *e) | ||
158 | { | ||
159 | BUG_ON(!dm_consecutive_chunk_count(e)); | ||
160 | |||
161 | e->new_chunk -= (1ULL << DM_CHUNK_NUMBER_BITS); | ||
162 | } | ||
163 | |||
135 | # else | 164 | # else |
136 | # define DM_CHUNK_CONSECUTIVE_BITS 0 | 165 | # define DM_CHUNK_CONSECUTIVE_BITS 0 |
137 | 166 | ||
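The renamed struct dm_exception still packs two values into one chunk_t: the low DM_CHUNK_NUMBER_BITS hold the chunk number and the bits above them count how many chunks follow contiguously, which is what the new dm_consecutive_chunk_count_dec() unwinds. A standalone sketch of that packing; the 56-bit figure mirrors the 64-bit branch and is an assumption here:

```c
/* Pack a chunk number and a consecutive-run count into one 64-bit value. */
#include <stdint.h>
#include <stdio.h>
#include <assert.h>

#define CHUNK_NUMBER_BITS 56   /* stand-in for DM_CHUNK_NUMBER_BITS in the 64-bit case */

typedef uint64_t chunk_t;

static chunk_t chunk_number(chunk_t chunk)
{
    return chunk & ((1ULL << CHUNK_NUMBER_BITS) - 1ULL);
}

static unsigned consecutive_count(chunk_t chunk)
{
    return (unsigned)(chunk >> CHUNK_NUMBER_BITS);
}

static void consecutive_count_inc(chunk_t *chunk)
{
    *chunk += 1ULL << CHUNK_NUMBER_BITS;
    assert(consecutive_count(*chunk));      /* overflow check, like the BUG_ON */
}

static void consecutive_count_dec(chunk_t *chunk)
{
    assert(consecutive_count(*chunk));
    *chunk -= 1ULL << CHUNK_NUMBER_BITS;
}

int main(void)
{
    chunk_t c = 12345;                      /* chunk number, count starts at 0 */

    consecutive_count_inc(&c);
    consecutive_count_inc(&c);
    printf("number=%llu consecutive=%u\n",
           (unsigned long long)chunk_number(c), consecutive_count(c));
    consecutive_count_dec(&c);
    printf("after dec: consecutive=%u\n", consecutive_count(c));
    return 0;
}
```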
@@ -140,12 +169,16 @@ static inline chunk_t dm_chunk_number(chunk_t chunk) | |||
140 | return chunk; | 169 | return chunk; |
141 | } | 170 | } |
142 | 171 | ||
143 | static inline unsigned dm_consecutive_chunk_count(struct dm_snap_exception *e) | 172 | static inline unsigned dm_consecutive_chunk_count(struct dm_exception *e) |
144 | { | 173 | { |
145 | return 0; | 174 | return 0; |
146 | } | 175 | } |
147 | 176 | ||
148 | static inline void dm_consecutive_chunk_count_inc(struct dm_snap_exception *e) | 177 | static inline void dm_consecutive_chunk_count_inc(struct dm_exception *e) |
178 | { | ||
179 | } | ||
180 | |||
181 | static inline void dm_consecutive_chunk_count_dec(struct dm_exception *e) | ||
149 | { | 182 | { |
150 | } | 183 | } |
151 | 184 | ||
@@ -162,7 +195,7 @@ static inline sector_t get_dev_size(struct block_device *bdev) | |||
162 | static inline chunk_t sector_to_chunk(struct dm_exception_store *store, | 195 | static inline chunk_t sector_to_chunk(struct dm_exception_store *store, |
163 | sector_t sector) | 196 | sector_t sector) |
164 | { | 197 | { |
165 | return (sector & ~store->chunk_mask) >> store->chunk_shift; | 198 | return sector >> store->chunk_shift; |
166 | } | 199 | } |
167 | 200 | ||
168 | int dm_exception_store_type_register(struct dm_exception_store_type *type); | 201 | int dm_exception_store_type_register(struct dm_exception_store_type *type); |
@@ -173,6 +206,7 @@ int dm_exception_store_set_chunk_size(struct dm_exception_store *store, | |||
173 | char **error); | 206 | char **error); |
174 | 207 | ||
175 | int dm_exception_store_create(struct dm_target *ti, int argc, char **argv, | 208 | int dm_exception_store_create(struct dm_target *ti, int argc, char **argv, |
209 | struct dm_snapshot *snap, | ||
176 | unsigned *args_used, | 210 | unsigned *args_used, |
177 | struct dm_exception_store **store); | 211 | struct dm_exception_store **store); |
178 | void dm_exception_store_destroy(struct dm_exception_store *store); | 212 | void dm_exception_store_destroy(struct dm_exception_store *store); |
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c index 3a2e6a2f8bdd..10f457ca6af2 100644 --- a/drivers/md/dm-io.c +++ b/drivers/md/dm-io.c | |||
@@ -5,6 +5,8 @@ | |||
5 | * This file is released under the GPL. | 5 | * This file is released under the GPL. |
6 | */ | 6 | */ |
7 | 7 | ||
8 | #include "dm.h" | ||
9 | |||
8 | #include <linux/device-mapper.h> | 10 | #include <linux/device-mapper.h> |
9 | 11 | ||
10 | #include <linux/bio.h> | 12 | #include <linux/bio.h> |
@@ -14,12 +16,19 @@ | |||
14 | #include <linux/slab.h> | 16 | #include <linux/slab.h> |
15 | #include <linux/dm-io.h> | 17 | #include <linux/dm-io.h> |
16 | 18 | ||
19 | #define DM_MSG_PREFIX "io" | ||
20 | |||
21 | #define DM_IO_MAX_REGIONS BITS_PER_LONG | ||
22 | |||
17 | struct dm_io_client { | 23 | struct dm_io_client { |
18 | mempool_t *pool; | 24 | mempool_t *pool; |
19 | struct bio_set *bios; | 25 | struct bio_set *bios; |
20 | }; | 26 | }; |
21 | 27 | ||
22 | /* FIXME: can we shrink this ? */ | 28 | /* |
29 | * Aligning 'struct io' reduces the number of bits required to store | ||
30 | * its address. Refer to store_io_and_region_in_bio() below. | ||
31 | */ | ||
23 | struct io { | 32 | struct io { |
24 | unsigned long error_bits; | 33 | unsigned long error_bits; |
25 | unsigned long eopnotsupp_bits; | 34 | unsigned long eopnotsupp_bits; |
@@ -28,7 +37,9 @@ struct io { | |||
28 | struct dm_io_client *client; | 37 | struct dm_io_client *client; |
29 | io_notify_fn callback; | 38 | io_notify_fn callback; |
30 | void *context; | 39 | void *context; |
31 | }; | 40 | } __attribute__((aligned(DM_IO_MAX_REGIONS))); |
41 | |||
42 | static struct kmem_cache *_dm_io_cache; | ||
32 | 43 | ||
33 | /* | 44 | /* |
34 | * io contexts are only dynamically allocated for asynchronous | 45 | * io contexts are only dynamically allocated for asynchronous |
@@ -53,7 +64,7 @@ struct dm_io_client *dm_io_client_create(unsigned num_pages) | |||
53 | if (!client) | 64 | if (!client) |
54 | return ERR_PTR(-ENOMEM); | 65 | return ERR_PTR(-ENOMEM); |
55 | 66 | ||
56 | client->pool = mempool_create_kmalloc_pool(ios, sizeof(struct io)); | 67 | client->pool = mempool_create_slab_pool(ios, _dm_io_cache); |
57 | if (!client->pool) | 68 | if (!client->pool) |
58 | goto bad; | 69 | goto bad; |
59 | 70 | ||
@@ -88,18 +99,29 @@ EXPORT_SYMBOL(dm_io_client_destroy); | |||
88 | 99 | ||
89 | /*----------------------------------------------------------------- | 100 | /*----------------------------------------------------------------- |
90 | * We need to keep track of which region a bio is doing io for. | 101 | * We need to keep track of which region a bio is doing io for. |
91 | * In order to save a memory allocation we store this the last | 102 | * To avoid a memory allocation to store just 5 or 6 bits, we |
92 | * bvec which we know is unused (blech). | 103 | * ensure the 'struct io' pointer is aligned so enough low bits are |
93 | * XXX This is ugly and can OOPS with some configs... find another way. | 104 | * always zero and then combine it with the region number directly in |
105 | * bi_private. | ||
94 | *---------------------------------------------------------------*/ | 106 | *---------------------------------------------------------------*/ |
95 | static inline void bio_set_region(struct bio *bio, unsigned region) | 107 | static void store_io_and_region_in_bio(struct bio *bio, struct io *io, |
108 | unsigned region) | ||
96 | { | 109 | { |
97 | bio->bi_io_vec[bio->bi_max_vecs].bv_len = region; | 110 | if (unlikely(!IS_ALIGNED((unsigned long)io, DM_IO_MAX_REGIONS))) { |
111 | DMCRIT("Unaligned struct io pointer %p", io); | ||
112 | BUG(); | ||
113 | } | ||
114 | |||
115 | bio->bi_private = (void *)((unsigned long)io | region); | ||
98 | } | 116 | } |
99 | 117 | ||
100 | static inline unsigned bio_get_region(struct bio *bio) | 118 | static void retrieve_io_and_region_from_bio(struct bio *bio, struct io **io, |
119 | unsigned *region) | ||
101 | { | 120 | { |
102 | return bio->bi_io_vec[bio->bi_max_vecs].bv_len; | 121 | unsigned long val = (unsigned long)bio->bi_private; |
122 | |||
123 | *io = (void *)(val & -(unsigned long)DM_IO_MAX_REGIONS); | ||
124 | *region = val & (DM_IO_MAX_REGIONS - 1); | ||
103 | } | 125 | } |
104 | 126 | ||
105 | /*----------------------------------------------------------------- | 127 | /*----------------------------------------------------------------- |
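Forcing struct io onto a DM_IO_MAX_REGIONS-byte boundary guarantees the low bits of its address are zero, so the region index can ride along in bi_private with no extra allocation. A minimal userspace sketch of the same pointer-tagging trick; the 64-byte alignment is an assumption matching a 64-bit BITS_PER_LONG:

```c
/* Stash a small index in the low bits of an aligned pointer, then split them apart. */
#include <stdint.h>
#include <stdio.h>

#define MAX_REGIONS 64   /* pointer must be aligned to this, so the low 6 bits are free */

struct io {
    unsigned long error_bits;
} __attribute__((aligned(MAX_REGIONS)));

static void *pack(struct io *io, unsigned region)
{
    /* The alignment guarantees these bits are zero in the pointer itself. */
    return (void *)((uintptr_t)io | region);
}

static void unpack(void *priv, struct io **io, unsigned *region)
{
    uintptr_t val = (uintptr_t)priv;

    *io = (struct io *)(val & ~(uintptr_t)(MAX_REGIONS - 1));
    *region = (unsigned)(val & (MAX_REGIONS - 1));
}

int main(void)
{
    static struct io my_io;
    struct io *io;
    unsigned region;
    void *priv = pack(&my_io, 5);

    unpack(priv, &io, &region);
    printf("io recovered: %d, region: %u\n", io == &my_io, region);
    return 0;
}
```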
@@ -140,10 +162,8 @@ static void endio(struct bio *bio, int error) | |||
140 | /* | 162 | /* |
141 | * The bio destructor in bio_put() may use the io object. | 163 | * The bio destructor in bio_put() may use the io object. |
142 | */ | 164 | */ |
143 | io = bio->bi_private; | 165 | retrieve_io_and_region_from_bio(bio, &io, ®ion); |
144 | region = bio_get_region(bio); | ||
145 | 166 | ||
146 | bio->bi_max_vecs++; | ||
147 | bio_put(bio); | 167 | bio_put(bio); |
148 | 168 | ||
149 | dec_count(io, region, error); | 169 | dec_count(io, region, error); |
@@ -243,7 +263,10 @@ static void vm_dp_init(struct dpages *dp, void *data) | |||
243 | 263 | ||
244 | static void dm_bio_destructor(struct bio *bio) | 264 | static void dm_bio_destructor(struct bio *bio) |
245 | { | 265 | { |
246 | struct io *io = bio->bi_private; | 266 | unsigned region; |
267 | struct io *io; | ||
268 | |||
269 | retrieve_io_and_region_from_bio(bio, &io, ®ion); | ||
247 | 270 | ||
248 | bio_free(bio, io->client->bios); | 271 | bio_free(bio, io->client->bios); |
249 | } | 272 | } |
@@ -286,26 +309,23 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where, | |||
286 | unsigned num_bvecs; | 309 | unsigned num_bvecs; |
287 | sector_t remaining = where->count; | 310 | sector_t remaining = where->count; |
288 | 311 | ||
289 | while (remaining) { | 312 | /* |
313 | * where->count may be zero if rw holds a write barrier and we | ||
314 | * need to send a zero-sized barrier. | ||
315 | */ | ||
316 | do { | ||
290 | /* | 317 | /* |
291 | * Allocate a suitably sized-bio: we add an extra | 318 | * Allocate a suitably sized-bio. |
292 | * bvec for bio_get/set_region() and decrement bi_max_vecs | ||
293 | * to hide it from bio_add_page(). | ||
294 | */ | 319 | */ |
295 | num_bvecs = dm_sector_div_up(remaining, | 320 | num_bvecs = dm_sector_div_up(remaining, |
296 | (PAGE_SIZE >> SECTOR_SHIFT)); | 321 | (PAGE_SIZE >> SECTOR_SHIFT)); |
297 | num_bvecs = 1 + min_t(int, bio_get_nr_vecs(where->bdev), | 322 | num_bvecs = min_t(int, bio_get_nr_vecs(where->bdev), num_bvecs); |
298 | num_bvecs); | ||
299 | if (unlikely(num_bvecs > BIO_MAX_PAGES)) | ||
300 | num_bvecs = BIO_MAX_PAGES; | ||
301 | bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, io->client->bios); | 323 | bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, io->client->bios); |
302 | bio->bi_sector = where->sector + (where->count - remaining); | 324 | bio->bi_sector = where->sector + (where->count - remaining); |
303 | bio->bi_bdev = where->bdev; | 325 | bio->bi_bdev = where->bdev; |
304 | bio->bi_end_io = endio; | 326 | bio->bi_end_io = endio; |
305 | bio->bi_private = io; | ||
306 | bio->bi_destructor = dm_bio_destructor; | 327 | bio->bi_destructor = dm_bio_destructor; |
307 | bio->bi_max_vecs--; | 328 | store_io_and_region_in_bio(bio, io, region); |
308 | bio_set_region(bio, region); | ||
309 | 329 | ||
310 | /* | 330 | /* |
311 | * Try and add as many pages as possible. | 331 | * Try and add as many pages as possible. |
@@ -323,7 +343,7 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where, | |||
323 | 343 | ||
324 | atomic_inc(&io->count); | 344 | atomic_inc(&io->count); |
325 | submit_bio(rw, bio); | 345 | submit_bio(rw, bio); |
326 | } | 346 | } while (remaining); |
327 | } | 347 | } |
328 | 348 | ||
329 | static void dispatch_io(int rw, unsigned int num_regions, | 349 | static void dispatch_io(int rw, unsigned int num_regions, |
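Switching the region walk from while to do/while lets a zero-length region still produce exactly one bio, which is how an empty barrier gets issued, and the bvec count per bio is now just the page-rounded sector count capped by the queue limit. A small calculation showing both points; the page size, sector size and queue cap are assumptions of the example:

```c
/* How many page-sized bvecs a region needs, and why do/while still runs when count == 0. */
#include <stdint.h>
#include <stdio.h>

#define SECTOR_SHIFT 9
#define PAGE_BYTES 4096u
#define QUEUE_MAX_VECS 128u     /* stand-in for bio_get_nr_vecs() */

static unsigned bios_needed(uint64_t count_sectors)
{
    uint64_t remaining = count_sectors;
    unsigned sectors_per_page = PAGE_BYTES >> SECTOR_SHIFT;     /* 8 */
    unsigned bios = 0;

    do {
        /* Round the remaining sectors up to whole pages, cap at the queue limit. */
        uint64_t num_bvecs = (remaining + sectors_per_page - 1) / sectors_per_page;
        uint64_t consumed;

        if (num_bvecs > QUEUE_MAX_VECS)
            num_bvecs = QUEUE_MAX_VECS;

        consumed = num_bvecs * sectors_per_page;
        if (consumed > remaining)
            consumed = remaining;
        remaining -= consumed;
        bios++;
    } while (remaining);

    return bios;
}

int main(void)
{
    printf("0 sectors    -> %u bio(s), so an empty barrier is still sent\n", bios_needed(0));
    printf("8192 sectors -> %u bio(s)\n", bios_needed(8192));
    return 0;
}
```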
@@ -333,6 +353,8 @@ static void dispatch_io(int rw, unsigned int num_regions, | |||
333 | int i; | 353 | int i; |
334 | struct dpages old_pages = *dp; | 354 | struct dpages old_pages = *dp; |
335 | 355 | ||
356 | BUG_ON(num_regions > DM_IO_MAX_REGIONS); | ||
357 | |||
336 | if (sync) | 358 | if (sync) |
337 | rw |= (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG); | 359 | rw |= (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG); |
338 | 360 | ||
@@ -342,7 +364,7 @@ static void dispatch_io(int rw, unsigned int num_regions, | |||
342 | */ | 364 | */ |
343 | for (i = 0; i < num_regions; i++) { | 365 | for (i = 0; i < num_regions; i++) { |
344 | *dp = old_pages; | 366 | *dp = old_pages; |
345 | if (where[i].count) | 367 | if (where[i].count || (rw & (1 << BIO_RW_BARRIER))) |
346 | do_region(rw, i, where + i, dp, io); | 368 | do_region(rw, i, where + i, dp, io); |
347 | } | 369 | } |
348 | 370 | ||
@@ -357,7 +379,14 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions, | |||
357 | struct dm_io_region *where, int rw, struct dpages *dp, | 379 | struct dm_io_region *where, int rw, struct dpages *dp, |
358 | unsigned long *error_bits) | 380 | unsigned long *error_bits) |
359 | { | 381 | { |
360 | struct io io; | 382 | /* |
383 | * gcc <= 4.3 can't do the alignment for stack variables, so we must | ||
384 | * align it on our own. | ||
385 | * volatile prevents the optimizer from removing or reusing | ||
386 | * "io_" field from the stack frame (allowed in ANSI C). | ||
387 | */ | ||
388 | volatile char io_[sizeof(struct io) + __alignof__(struct io) - 1]; | ||
389 | struct io *io = (struct io *)PTR_ALIGN(&io_, __alignof__(struct io)); | ||
361 | 390 | ||
362 | if (num_regions > 1 && (rw & RW_MASK) != WRITE) { | 391 | if (num_regions > 1 && (rw & RW_MASK) != WRITE) { |
363 | WARN_ON(1); | 392 | WARN_ON(1); |
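Because older compilers cannot over-align an automatic variable, sync_io now carves its struct io out of a slightly oversized char array and rounds the pointer up by hand. The same PTR_ALIGN-style arithmetic in a self-contained form; the 64-byte alignment is again only the example's assumption:

```c
/* Manually align an object inside an oversized stack buffer, PTR_ALIGN style. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define IO_ALIGN 64

struct io {
    unsigned long error_bits;
    unsigned long eopnotsupp_bits;
};

static void *ptr_align(void *p, uintptr_t align)
{
    /* Round up to the next multiple of a power-of-two alignment. */
    return (void *)(((uintptr_t)p + align - 1) & ~(align - 1));
}

int main(void)
{
    /* Room for one struct io plus worst-case misalignment of the buffer start. */
    char io_[sizeof(struct io) + IO_ALIGN - 1];
    struct io *io = ptr_align(io_, IO_ALIGN);

    memset(io, 0, sizeof(*io));
    printf("aligned: %d\n", ((uintptr_t)io % IO_ALIGN) == 0);
    return 0;
}
```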
@@ -365,33 +394,33 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions, | |||
365 | } | 394 | } |
366 | 395 | ||
367 | retry: | 396 | retry: |
368 | io.error_bits = 0; | 397 | io->error_bits = 0; |
369 | io.eopnotsupp_bits = 0; | 398 | io->eopnotsupp_bits = 0; |
370 | atomic_set(&io.count, 1); /* see dispatch_io() */ | 399 | atomic_set(&io->count, 1); /* see dispatch_io() */ |
371 | io.sleeper = current; | 400 | io->sleeper = current; |
372 | io.client = client; | 401 | io->client = client; |
373 | 402 | ||
374 | dispatch_io(rw, num_regions, where, dp, &io, 1); | 403 | dispatch_io(rw, num_regions, where, dp, io, 1); |
375 | 404 | ||
376 | while (1) { | 405 | while (1) { |
377 | set_current_state(TASK_UNINTERRUPTIBLE); | 406 | set_current_state(TASK_UNINTERRUPTIBLE); |
378 | 407 | ||
379 | if (!atomic_read(&io.count)) | 408 | if (!atomic_read(&io->count)) |
380 | break; | 409 | break; |
381 | 410 | ||
382 | io_schedule(); | 411 | io_schedule(); |
383 | } | 412 | } |
384 | set_current_state(TASK_RUNNING); | 413 | set_current_state(TASK_RUNNING); |
385 | 414 | ||
386 | if (io.eopnotsupp_bits && (rw & (1 << BIO_RW_BARRIER))) { | 415 | if (io->eopnotsupp_bits && (rw & (1 << BIO_RW_BARRIER))) { |
387 | rw &= ~(1 << BIO_RW_BARRIER); | 416 | rw &= ~(1 << BIO_RW_BARRIER); |
388 | goto retry; | 417 | goto retry; |
389 | } | 418 | } |
390 | 419 | ||
391 | if (error_bits) | 420 | if (error_bits) |
392 | *error_bits = io.error_bits; | 421 | *error_bits = io->error_bits; |
393 | 422 | ||
394 | return io.error_bits ? -EIO : 0; | 423 | return io->error_bits ? -EIO : 0; |
395 | } | 424 | } |
396 | 425 | ||
397 | static int async_io(struct dm_io_client *client, unsigned int num_regions, | 426 | static int async_io(struct dm_io_client *client, unsigned int num_regions, |
@@ -472,3 +501,18 @@ int dm_io(struct dm_io_request *io_req, unsigned num_regions, | |||
472 | &dp, io_req->notify.fn, io_req->notify.context); | 501 | &dp, io_req->notify.fn, io_req->notify.context); |
473 | } | 502 | } |
474 | EXPORT_SYMBOL(dm_io); | 503 | EXPORT_SYMBOL(dm_io); |
504 | |||
505 | int __init dm_io_init(void) | ||
506 | { | ||
507 | _dm_io_cache = KMEM_CACHE(io, 0); | ||
508 | if (!_dm_io_cache) | ||
509 | return -ENOMEM; | ||
510 | |||
511 | return 0; | ||
512 | } | ||
513 | |||
514 | void dm_io_exit(void) | ||
515 | { | ||
516 | kmem_cache_destroy(_dm_io_cache); | ||
517 | _dm_io_cache = NULL; | ||
518 | } | ||
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index a67942931582..d7500e1c26f2 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c | |||
@@ -56,6 +56,11 @@ static void dm_hash_remove_all(int keep_open_devices); | |||
56 | */ | 56 | */ |
57 | static DECLARE_RWSEM(_hash_lock); | 57 | static DECLARE_RWSEM(_hash_lock); |
58 | 58 | ||
59 | /* | ||
60 | * Protects use of mdptr to obtain hash cell name and uuid from mapped device. | ||
61 | */ | ||
62 | static DEFINE_MUTEX(dm_hash_cells_mutex); | ||
63 | |||
59 | static void init_buckets(struct list_head *buckets) | 64 | static void init_buckets(struct list_head *buckets) |
60 | { | 65 | { |
61 | unsigned int i; | 66 | unsigned int i; |
@@ -206,7 +211,9 @@ static int dm_hash_insert(const char *name, const char *uuid, struct mapped_devi | |||
206 | list_add(&cell->uuid_list, _uuid_buckets + hash_str(uuid)); | 211 | list_add(&cell->uuid_list, _uuid_buckets + hash_str(uuid)); |
207 | } | 212 | } |
208 | dm_get(md); | 213 | dm_get(md); |
214 | mutex_lock(&dm_hash_cells_mutex); | ||
209 | dm_set_mdptr(md, cell); | 215 | dm_set_mdptr(md, cell); |
216 | mutex_unlock(&dm_hash_cells_mutex); | ||
210 | up_write(&_hash_lock); | 217 | up_write(&_hash_lock); |
211 | 218 | ||
212 | return 0; | 219 | return 0; |
@@ -224,9 +231,11 @@ static void __hash_remove(struct hash_cell *hc) | |||
224 | /* remove from the dev hash */ | 231 | /* remove from the dev hash */ |
225 | list_del(&hc->uuid_list); | 232 | list_del(&hc->uuid_list); |
226 | list_del(&hc->name_list); | 233 | list_del(&hc->name_list); |
234 | mutex_lock(&dm_hash_cells_mutex); | ||
227 | dm_set_mdptr(hc->md, NULL); | 235 | dm_set_mdptr(hc->md, NULL); |
236 | mutex_unlock(&dm_hash_cells_mutex); | ||
228 | 237 | ||
229 | table = dm_get_table(hc->md); | 238 | table = dm_get_live_table(hc->md); |
230 | if (table) { | 239 | if (table) { |
231 | dm_table_event(table); | 240 | dm_table_event(table); |
232 | dm_table_put(table); | 241 | dm_table_put(table); |
@@ -276,7 +285,8 @@ retry: | |||
276 | up_write(&_hash_lock); | 285 | up_write(&_hash_lock); |
277 | } | 286 | } |
278 | 287 | ||
279 | static int dm_hash_rename(uint32_t cookie, const char *old, const char *new) | 288 | static int dm_hash_rename(uint32_t cookie, uint32_t *flags, const char *old, |
289 | const char *new) | ||
280 | { | 290 | { |
281 | char *new_name, *old_name; | 291 | char *new_name, *old_name; |
282 | struct hash_cell *hc; | 292 | struct hash_cell *hc; |
@@ -321,19 +331,22 @@ static int dm_hash_rename(uint32_t cookie, const char *old, const char *new) | |||
321 | */ | 331 | */ |
322 | list_del(&hc->name_list); | 332 | list_del(&hc->name_list); |
323 | old_name = hc->name; | 333 | old_name = hc->name; |
334 | mutex_lock(&dm_hash_cells_mutex); | ||
324 | hc->name = new_name; | 335 | hc->name = new_name; |
336 | mutex_unlock(&dm_hash_cells_mutex); | ||
325 | list_add(&hc->name_list, _name_buckets + hash_str(new_name)); | 337 | list_add(&hc->name_list, _name_buckets + hash_str(new_name)); |
326 | 338 | ||
327 | /* | 339 | /* |
328 | * Wake up any dm event waiters. | 340 | * Wake up any dm event waiters. |
329 | */ | 341 | */ |
330 | table = dm_get_table(hc->md); | 342 | table = dm_get_live_table(hc->md); |
331 | if (table) { | 343 | if (table) { |
332 | dm_table_event(table); | 344 | dm_table_event(table); |
333 | dm_table_put(table); | 345 | dm_table_put(table); |
334 | } | 346 | } |
335 | 347 | ||
336 | dm_kobject_uevent(hc->md, KOBJ_CHANGE, cookie); | 348 | if (!dm_kobject_uevent(hc->md, KOBJ_CHANGE, cookie)) |
349 | *flags |= DM_UEVENT_GENERATED_FLAG; | ||
337 | 350 | ||
338 | dm_put(hc->md); | 351 | dm_put(hc->md); |
339 | up_write(&_hash_lock); | 352 | up_write(&_hash_lock); |
@@ -512,8 +525,6 @@ static int list_versions(struct dm_ioctl *param, size_t param_size) | |||
512 | return 0; | 525 | return 0; |
513 | } | 526 | } |
514 | 527 | ||
515 | |||
516 | |||
517 | static int check_name(const char *name) | 528 | static int check_name(const char *name) |
518 | { | 529 | { |
519 | if (strchr(name, '/')) { | 530 | if (strchr(name, '/')) { |
@@ -525,6 +536,40 @@ static int check_name(const char *name) | |||
525 | } | 536 | } |
526 | 537 | ||
527 | /* | 538 | /* |
539 | * On successful return, the caller must not attempt to acquire | ||
540 | * _hash_lock without first calling dm_table_put, because dm_table_destroy | ||
541 | * waits for this dm_table_put and could be called under this lock. | ||
542 | */ | ||
543 | static struct dm_table *dm_get_inactive_table(struct mapped_device *md) | ||
544 | { | ||
545 | struct hash_cell *hc; | ||
546 | struct dm_table *table = NULL; | ||
547 | |||
548 | down_read(&_hash_lock); | ||
549 | hc = dm_get_mdptr(md); | ||
550 | if (!hc || hc->md != md) { | ||
551 | DMWARN("device has been removed from the dev hash table."); | ||
552 | goto out; | ||
553 | } | ||
554 | |||
555 | table = hc->new_map; | ||
556 | if (table) | ||
557 | dm_table_get(table); | ||
558 | |||
559 | out: | ||
560 | up_read(&_hash_lock); | ||
561 | |||
562 | return table; | ||
563 | } | ||
564 | |||
565 | static struct dm_table *dm_get_live_or_inactive_table(struct mapped_device *md, | ||
566 | struct dm_ioctl *param) | ||
567 | { | ||
568 | return (param->flags & DM_QUERY_INACTIVE_TABLE_FLAG) ? | ||
569 | dm_get_inactive_table(md) : dm_get_live_table(md); | ||
570 | } | ||
571 | |||
572 | /* | ||
528 | * Fills in a dm_ioctl structure, ready for sending back to | 573 | * Fills in a dm_ioctl structure, ready for sending back to |
529 | * userland. | 574 | * userland. |
530 | */ | 575 | */ |
@@ -536,7 +581,7 @@ static int __dev_status(struct mapped_device *md, struct dm_ioctl *param) | |||
536 | param->flags &= ~(DM_SUSPEND_FLAG | DM_READONLY_FLAG | | 581 | param->flags &= ~(DM_SUSPEND_FLAG | DM_READONLY_FLAG | |
537 | DM_ACTIVE_PRESENT_FLAG); | 582 | DM_ACTIVE_PRESENT_FLAG); |
538 | 583 | ||
539 | if (dm_suspended(md)) | 584 | if (dm_suspended_md(md)) |
540 | param->flags |= DM_SUSPEND_FLAG; | 585 | param->flags |= DM_SUSPEND_FLAG; |
541 | 586 | ||
542 | param->dev = huge_encode_dev(disk_devt(disk)); | 587 | param->dev = huge_encode_dev(disk_devt(disk)); |
@@ -548,18 +593,30 @@ static int __dev_status(struct mapped_device *md, struct dm_ioctl *param) | |||
548 | */ | 593 | */ |
549 | param->open_count = dm_open_count(md); | 594 | param->open_count = dm_open_count(md); |
550 | 595 | ||
551 | if (get_disk_ro(disk)) | ||
552 | param->flags |= DM_READONLY_FLAG; | ||
553 | |||
554 | param->event_nr = dm_get_event_nr(md); | 596 | param->event_nr = dm_get_event_nr(md); |
597 | param->target_count = 0; | ||
555 | 598 | ||
556 | table = dm_get_table(md); | 599 | table = dm_get_live_table(md); |
557 | if (table) { | 600 | if (table) { |
558 | param->flags |= DM_ACTIVE_PRESENT_FLAG; | 601 | if (!(param->flags & DM_QUERY_INACTIVE_TABLE_FLAG)) { |
559 | param->target_count = dm_table_get_num_targets(table); | 602 | if (get_disk_ro(disk)) |
603 | param->flags |= DM_READONLY_FLAG; | ||
604 | param->target_count = dm_table_get_num_targets(table); | ||
605 | } | ||
560 | dm_table_put(table); | 606 | dm_table_put(table); |
561 | } else | 607 | |
562 | param->target_count = 0; | 608 | param->flags |= DM_ACTIVE_PRESENT_FLAG; |
609 | } | ||
610 | |||
611 | if (param->flags & DM_QUERY_INACTIVE_TABLE_FLAG) { | ||
612 | table = dm_get_inactive_table(md); | ||
613 | if (table) { | ||
614 | if (!(dm_table_get_mode(table) & FMODE_WRITE)) | ||
615 | param->flags |= DM_READONLY_FLAG; | ||
616 | param->target_count = dm_table_get_num_targets(table); | ||
617 | dm_table_put(table); | ||
618 | } | ||
619 | } | ||
563 | 620 | ||
564 | return 0; | 621 | return 0; |
565 | } | 622 | } |
@@ -634,9 +691,9 @@ static struct mapped_device *find_device(struct dm_ioctl *param) | |||
634 | * Sneakily write in both the name and the uuid | 691 | * Sneakily write in both the name and the uuid |
635 | * while we have the cell. | 692 | * while we have the cell. |
636 | */ | 693 | */ |
637 | strncpy(param->name, hc->name, sizeof(param->name)); | 694 | strlcpy(param->name, hc->name, sizeof(param->name)); |
638 | if (hc->uuid) | 695 | if (hc->uuid) |
639 | strncpy(param->uuid, hc->uuid, sizeof(param->uuid)-1); | 696 | strlcpy(param->uuid, hc->uuid, sizeof(param->uuid)); |
640 | else | 697 | else |
641 | param->uuid[0] = '\0'; | 698 | param->uuid[0] = '\0'; |
642 | 699 | ||
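Swapping strncpy for strlcpy matters because strncpy leaves the destination unterminated when the source fills the buffer, so a later read of param->name could run past its end. A userspace demonstration; since strlcpy is a BSD/kernel helper rather than standard C, a minimal stand-in is written out instead of assumed from libc:

```c
/* strncpy can leave the destination unterminated; an strlcpy-style copy cannot. */
#include <stdio.h>
#include <string.h>

/* Minimal strlcpy stand-in: always NUL-terminates, returns the source length. */
static size_t my_strlcpy(char *dst, const char *src, size_t size)
{
    size_t len = strlen(src);

    if (size) {
        size_t n = len < size - 1 ? len : size - 1;
        memcpy(dst, src, n);
        dst[n] = '\0';
    }
    return len;
}

int main(void)
{
    char a[8], b[8];
    const char *name = "a-rather-long-device-name";

    strncpy(a, name, sizeof(a));        /* fills all 8 bytes, no terminator */
    my_strlcpy(b, name, sizeof(b));     /* truncates to 7 chars + '\0' */

    printf("strncpy terminated: %d\n", memchr(a, '\0', sizeof(a)) != NULL);
    printf("strlcpy copy: \"%s\"\n", b);
    return 0;
}
```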
@@ -681,10 +738,10 @@ static int dev_remove(struct dm_ioctl *param, size_t param_size) | |||
681 | __hash_remove(hc); | 738 | __hash_remove(hc); |
682 | up_write(&_hash_lock); | 739 | up_write(&_hash_lock); |
683 | 740 | ||
684 | dm_kobject_uevent(md, KOBJ_REMOVE, param->event_nr); | 741 | if (!dm_kobject_uevent(md, KOBJ_REMOVE, param->event_nr)) |
742 | param->flags |= DM_UEVENT_GENERATED_FLAG; | ||
685 | 743 | ||
686 | dm_put(md); | 744 | dm_put(md); |
687 | param->data_size = 0; | ||
688 | return 0; | 745 | return 0; |
689 | } | 746 | } |
690 | 747 | ||
@@ -718,7 +775,9 @@ static int dev_rename(struct dm_ioctl *param, size_t param_size) | |||
718 | return r; | 775 | return r; |
719 | 776 | ||
720 | param->data_size = 0; | 777 | param->data_size = 0; |
721 | return dm_hash_rename(param->event_nr, param->name, new_name); | 778 | |
779 | return dm_hash_rename(param->event_nr, ¶m->flags, param->name, | ||
780 | new_name); | ||
722 | } | 781 | } |
723 | 782 | ||
724 | static int dev_set_geometry(struct dm_ioctl *param, size_t param_size) | 783 | static int dev_set_geometry(struct dm_ioctl *param, size_t param_size) |
@@ -784,7 +843,7 @@ static int do_suspend(struct dm_ioctl *param) | |||
784 | if (param->flags & DM_NOFLUSH_FLAG) | 843 | if (param->flags & DM_NOFLUSH_FLAG) |
785 | suspend_flags |= DM_SUSPEND_NOFLUSH_FLAG; | 844 | suspend_flags |= DM_SUSPEND_NOFLUSH_FLAG; |
786 | 845 | ||
787 | if (!dm_suspended(md)) | 846 | if (!dm_suspended_md(md)) |
788 | r = dm_suspend(md, suspend_flags); | 847 | r = dm_suspend(md, suspend_flags); |
789 | 848 | ||
790 | if (!r) | 849 | if (!r) |
@@ -800,7 +859,7 @@ static int do_resume(struct dm_ioctl *param) | |||
800 | unsigned suspend_flags = DM_SUSPEND_LOCKFS_FLAG; | 859 | unsigned suspend_flags = DM_SUSPEND_LOCKFS_FLAG; |
801 | struct hash_cell *hc; | 860 | struct hash_cell *hc; |
802 | struct mapped_device *md; | 861 | struct mapped_device *md; |
803 | struct dm_table *new_map; | 862 | struct dm_table *new_map, *old_map = NULL; |
804 | 863 | ||
805 | down_write(&_hash_lock); | 864 | down_write(&_hash_lock); |
806 | 865 | ||
@@ -826,14 +885,14 @@ static int do_resume(struct dm_ioctl *param) | |||
826 | suspend_flags &= ~DM_SUSPEND_LOCKFS_FLAG; | 885 | suspend_flags &= ~DM_SUSPEND_LOCKFS_FLAG; |
827 | if (param->flags & DM_NOFLUSH_FLAG) | 886 | if (param->flags & DM_NOFLUSH_FLAG) |
828 | suspend_flags |= DM_SUSPEND_NOFLUSH_FLAG; | 887 | suspend_flags |= DM_SUSPEND_NOFLUSH_FLAG; |
829 | if (!dm_suspended(md)) | 888 | if (!dm_suspended_md(md)) |
830 | dm_suspend(md, suspend_flags); | 889 | dm_suspend(md, suspend_flags); |
831 | 890 | ||
832 | r = dm_swap_table(md, new_map); | 891 | old_map = dm_swap_table(md, new_map); |
833 | if (r) { | 892 | if (IS_ERR(old_map)) { |
834 | dm_table_destroy(new_map); | 893 | dm_table_destroy(new_map); |
835 | dm_put(md); | 894 | dm_put(md); |
836 | return r; | 895 | return PTR_ERR(old_map); |
837 | } | 896 | } |
838 | 897 | ||
839 | if (dm_table_get_mode(new_map) & FMODE_WRITE) | 898 | if (dm_table_get_mode(new_map) & FMODE_WRITE) |
@@ -842,14 +901,17 @@ static int do_resume(struct dm_ioctl *param) | |||
842 | set_disk_ro(dm_disk(md), 1); | 901 | set_disk_ro(dm_disk(md), 1); |
843 | } | 902 | } |
844 | 903 | ||
845 | if (dm_suspended(md)) | 904 | if (dm_suspended_md(md)) { |
846 | r = dm_resume(md); | 905 | r = dm_resume(md); |
906 | if (!r && !dm_kobject_uevent(md, KOBJ_CHANGE, param->event_nr)) | ||
907 | param->flags |= DM_UEVENT_GENERATED_FLAG; | ||
908 | } | ||
847 | 909 | ||
910 | if (old_map) | ||
911 | dm_table_destroy(old_map); | ||
848 | 912 | ||
849 | if (!r) { | 913 | if (!r) |
850 | dm_kobject_uevent(md, KOBJ_CHANGE, param->event_nr); | ||
851 | r = __dev_status(md, param); | 914 | r = __dev_status(md, param); |
852 | } | ||
853 | 915 | ||
854 | dm_put(md); | 916 | dm_put(md); |
855 | return r; | 917 | return r; |
@@ -982,7 +1044,7 @@ static int dev_wait(struct dm_ioctl *param, size_t param_size) | |||
982 | if (r) | 1044 | if (r) |
983 | goto out; | 1045 | goto out; |
984 | 1046 | ||
985 | table = dm_get_table(md); | 1047 | table = dm_get_live_or_inactive_table(md, param); |
986 | if (table) { | 1048 | if (table) { |
987 | retrieve_status(table, param, param_size); | 1049 | retrieve_status(table, param, param_size); |
988 | dm_table_put(table); | 1050 | dm_table_put(table); |
@@ -1215,7 +1277,7 @@ static int table_deps(struct dm_ioctl *param, size_t param_size) | |||
1215 | if (r) | 1277 | if (r) |
1216 | goto out; | 1278 | goto out; |
1217 | 1279 | ||
1218 | table = dm_get_table(md); | 1280 | table = dm_get_live_or_inactive_table(md, param); |
1219 | if (table) { | 1281 | if (table) { |
1220 | retrieve_deps(table, param, param_size); | 1282 | retrieve_deps(table, param, param_size); |
1221 | dm_table_put(table); | 1283 | dm_table_put(table); |
@@ -1244,13 +1306,13 @@ static int table_status(struct dm_ioctl *param, size_t param_size) | |||
1244 | if (r) | 1306 | if (r) |
1245 | goto out; | 1307 | goto out; |
1246 | 1308 | ||
1247 | table = dm_get_table(md); | 1309 | table = dm_get_live_or_inactive_table(md, param); |
1248 | if (table) { | 1310 | if (table) { |
1249 | retrieve_status(table, param, param_size); | 1311 | retrieve_status(table, param, param_size); |
1250 | dm_table_put(table); | 1312 | dm_table_put(table); |
1251 | } | 1313 | } |
1252 | 1314 | ||
1253 | out: | 1315 | out: |
1254 | dm_put(md); | 1316 | dm_put(md); |
1255 | return r; | 1317 | return r; |
1256 | } | 1318 | } |
@@ -1288,10 +1350,15 @@ static int target_message(struct dm_ioctl *param, size_t param_size) | |||
1288 | goto out; | 1350 | goto out; |
1289 | } | 1351 | } |
1290 | 1352 | ||
1291 | table = dm_get_table(md); | 1353 | table = dm_get_live_table(md); |
1292 | if (!table) | 1354 | if (!table) |
1293 | goto out_argv; | 1355 | goto out_argv; |
1294 | 1356 | ||
1357 | if (dm_deleting_md(md)) { | ||
1358 | r = -ENXIO; | ||
1359 | goto out_table; | ||
1360 | } | ||
1361 | |||
1295 | ti = dm_table_find_target(table, tmsg->sector); | 1362 | ti = dm_table_find_target(table, tmsg->sector); |
1296 | if (!dm_target_is_valid(ti)) { | 1363 | if (!dm_target_is_valid(ti)) { |
1297 | DMWARN("Target message sector outside device."); | 1364 | DMWARN("Target message sector outside device."); |
@@ -1303,6 +1370,7 @@ static int target_message(struct dm_ioctl *param, size_t param_size) | |||
1303 | r = -EINVAL; | 1370 | r = -EINVAL; |
1304 | } | 1371 | } |
1305 | 1372 | ||
1373 | out_table: | ||
1306 | dm_table_put(table); | 1374 | dm_table_put(table); |
1307 | out_argv: | 1375 | out_argv: |
1308 | kfree(argv); | 1376 | kfree(argv); |
@@ -1413,6 +1481,7 @@ static int validate_params(uint cmd, struct dm_ioctl *param) | |||
1413 | { | 1481 | { |
1414 | /* Always clear this flag */ | 1482 | /* Always clear this flag */ |
1415 | param->flags &= ~DM_BUFFER_FULL_FLAG; | 1483 | param->flags &= ~DM_BUFFER_FULL_FLAG; |
1484 | param->flags &= ~DM_UEVENT_GENERATED_FLAG; | ||
1416 | 1485 | ||
1417 | /* Ignores parameters */ | 1486 | /* Ignores parameters */ |
1418 | if (cmd == DM_REMOVE_ALL_CMD || | 1487 | if (cmd == DM_REMOVE_ALL_CMD || |
@@ -1582,8 +1651,7 @@ int dm_copy_name_and_uuid(struct mapped_device *md, char *name, char *uuid) | |||
1582 | if (!md) | 1651 | if (!md) |
1583 | return -ENXIO; | 1652 | return -ENXIO; |
1584 | 1653 | ||
1585 | dm_get(md); | 1654 | mutex_lock(&dm_hash_cells_mutex); |
1586 | down_read(&_hash_lock); | ||
1587 | hc = dm_get_mdptr(md); | 1655 | hc = dm_get_mdptr(md); |
1588 | if (!hc || hc->md != md) { | 1656 | if (!hc || hc->md != md) { |
1589 | r = -ENXIO; | 1657 | r = -ENXIO; |
@@ -1596,8 +1664,7 @@ int dm_copy_name_and_uuid(struct mapped_device *md, char *name, char *uuid) | |||
1596 | strcpy(uuid, hc->uuid ? : ""); | 1664 | strcpy(uuid, hc->uuid ? : ""); |
1597 | 1665 | ||
1598 | out: | 1666 | out: |
1599 | up_read(&_hash_lock); | 1667 | mutex_unlock(&dm_hash_cells_mutex); |
1600 | dm_put(md); | ||
1601 | 1668 | ||
1602 | return r; | 1669 | return r; |
1603 | } | 1670 | } |
diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c index 3e3fc06cb861..addf83475040 100644 --- a/drivers/md/dm-kcopyd.c +++ b/drivers/md/dm-kcopyd.c | |||
@@ -450,7 +450,10 @@ static void dispatch_job(struct kcopyd_job *job) | |||
450 | { | 450 | { |
451 | struct dm_kcopyd_client *kc = job->kc; | 451 | struct dm_kcopyd_client *kc = job->kc; |
452 | atomic_inc(&kc->nr_jobs); | 452 | atomic_inc(&kc->nr_jobs); |
453 | push(&kc->pages_jobs, job); | 453 | if (unlikely(!job->source.count)) |
454 | push(&kc->complete_jobs, job); | ||
455 | else | ||
456 | push(&kc->pages_jobs, job); | ||
454 | wake(kc); | 457 | wake(kc); |
455 | } | 458 | } |
456 | 459 | ||
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index 82f7d6e6b1ea..9200dbf2391a 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c | |||
@@ -47,8 +47,7 @@ static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
47 | } | 47 | } |
48 | lc->start = tmp; | 48 | lc->start = tmp; |
49 | 49 | ||
50 | if (dm_get_device(ti, argv[0], lc->start, ti->len, | 50 | if (dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &lc->dev)) { |
51 | dm_table_get_mode(ti->table), &lc->dev)) { | ||
52 | ti->error = "dm-linear: Device lookup failed"; | 51 | ti->error = "dm-linear: Device lookup failed"; |
53 | goto bad; | 52 | goto bad; |
54 | } | 53 | } |
diff --git a/drivers/md/dm-log-userspace-base.c b/drivers/md/dm-log-userspace-base.c index 7ac2c1450d10..1ed0094f064b 100644 --- a/drivers/md/dm-log-userspace-base.c +++ b/drivers/md/dm-log-userspace-base.c | |||
@@ -5,6 +5,7 @@ | |||
5 | */ | 5 | */ |
6 | 6 | ||
7 | #include <linux/bio.h> | 7 | #include <linux/bio.h> |
8 | #include <linux/slab.h> | ||
8 | #include <linux/dm-dirty-log.h> | 9 | #include <linux/dm-dirty-log.h> |
9 | #include <linux/device-mapper.h> | 10 | #include <linux/device-mapper.h> |
10 | #include <linux/dm-log-userspace.h> | 11 | #include <linux/dm-log-userspace.h> |
diff --git a/drivers/md/dm-log-userspace-transfer.c b/drivers/md/dm-log-userspace-transfer.c index 54abf9e303b7..075cbcf8a9f5 100644 --- a/drivers/md/dm-log-userspace-transfer.c +++ b/drivers/md/dm-log-userspace-transfer.c | |||
@@ -6,6 +6,7 @@ | |||
6 | 6 | ||
7 | #include <linux/kernel.h> | 7 | #include <linux/kernel.h> |
8 | #include <linux/module.h> | 8 | #include <linux/module.h> |
9 | #include <linux/slab.h> | ||
9 | #include <net/sock.h> | 10 | #include <net/sock.h> |
10 | #include <linux/workqueue.h> | 11 | #include <linux/workqueue.h> |
11 | #include <linux/connector.h> | 12 | #include <linux/connector.h> |
@@ -172,11 +173,15 @@ int dm_consult_userspace(const char *uuid, uint64_t luid, int request_type, | |||
172 | { | 173 | { |
173 | int r = 0; | 174 | int r = 0; |
174 | size_t dummy = 0; | 175 | size_t dummy = 0; |
175 | int overhead_size = | 176 | int overhead_size = sizeof(struct dm_ulog_request) + sizeof(struct cn_msg); |
176 | sizeof(struct dm_ulog_request *) + sizeof(struct cn_msg); | ||
177 | struct dm_ulog_request *tfr = prealloced_ulog_tfr; | 177 | struct dm_ulog_request *tfr = prealloced_ulog_tfr; |
178 | struct receiving_pkg pkg; | 178 | struct receiving_pkg pkg; |
179 | 179 | ||
180 | /* | ||
181 | * Given the space needed to hold the 'struct cn_msg' and | ||
182 | * 'struct dm_ulog_request' - do we have enough payload | ||
183 | * space remaining? | ||
184 | */ | ||
180 | if (data_size > (DM_ULOG_PREALLOCED_SIZE - overhead_size)) { | 185 | if (data_size > (DM_ULOG_PREALLOCED_SIZE - overhead_size)) { |
181 | DMINFO("Size of tfr exceeds preallocated size"); | 186 | DMINFO("Size of tfr exceeds preallocated size"); |
182 | return -EINVAL; | 187 | return -EINVAL; |
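The one-character fix above is the classic sizeof-of-a-pointer bug: sizeof(struct dm_ulog_request *) measures a pointer, so the old overhead figure under-counted the header and the size check admitted payloads that could not actually fit. A tiny reproduction of the mistake in isolation; the struct layout is invented for the demonstration:

```c
/* sizeof(T *) measures the pointer, not the object it points to. */
#include <stdio.h>

struct request_header {
    char uuid[129];
    unsigned long long luid, seq;
    unsigned request_type, data_size;
    char data[];
};

int main(void)
{
    printf("sizeof(struct request_header *) = %zu\n", sizeof(struct request_header *));
    printf("sizeof(struct request_header)   = %zu\n", sizeof(struct request_header));
    printf("under-count per message: %zu bytes\n",
           sizeof(struct request_header) - sizeof(struct request_header *));
    return 0;
}
```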
@@ -191,7 +196,7 @@ resend: | |||
191 | */ | 196 | */ |
192 | mutex_lock(&dm_ulog_lock); | 197 | mutex_lock(&dm_ulog_lock); |
193 | 198 | ||
194 | memset(tfr, 0, DM_ULOG_PREALLOCED_SIZE - overhead_size); | 199 | memset(tfr, 0, DM_ULOG_PREALLOCED_SIZE - sizeof(struct cn_msg)); |
195 | memcpy(tfr->uuid, uuid, DM_UUID_LEN); | 200 | memcpy(tfr->uuid, uuid, DM_UUID_LEN); |
196 | tfr->luid = luid; | 201 | tfr->luid = luid; |
197 | tfr->seq = dm_ulog_seq++; | 202 | tfr->seq = dm_ulog_seq++; |
diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c index 9443896ede07..5a08be0222db 100644 --- a/drivers/md/dm-log.c +++ b/drivers/md/dm-log.c | |||
@@ -145,8 +145,9 @@ int dm_dirty_log_type_unregister(struct dm_dirty_log_type *type) | |||
145 | EXPORT_SYMBOL(dm_dirty_log_type_unregister); | 145 | EXPORT_SYMBOL(dm_dirty_log_type_unregister); |
146 | 146 | ||
147 | struct dm_dirty_log *dm_dirty_log_create(const char *type_name, | 147 | struct dm_dirty_log *dm_dirty_log_create(const char *type_name, |
148 | struct dm_target *ti, | 148 | struct dm_target *ti, |
149 | unsigned int argc, char **argv) | 149 | int (*flush_callback_fn)(struct dm_target *ti), |
150 | unsigned int argc, char **argv) | ||
150 | { | 151 | { |
151 | struct dm_dirty_log_type *type; | 152 | struct dm_dirty_log_type *type; |
152 | struct dm_dirty_log *log; | 153 | struct dm_dirty_log *log; |
@@ -161,6 +162,7 @@ struct dm_dirty_log *dm_dirty_log_create(const char *type_name, | |||
161 | return NULL; | 162 | return NULL; |
162 | } | 163 | } |
163 | 164 | ||
165 | log->flush_callback_fn = flush_callback_fn; | ||
164 | log->type = type; | 166 | log->type = type; |
165 | if (type->ctr(log, ti, argc, argv)) { | 167 | if (type->ctr(log, ti, argc, argv)) { |
166 | kfree(log); | 168 | kfree(log); |
@@ -208,7 +210,9 @@ struct log_header { | |||
208 | 210 | ||
209 | struct log_c { | 211 | struct log_c { |
210 | struct dm_target *ti; | 212 | struct dm_target *ti; |
211 | int touched; | 213 | int touched_dirtied; |
214 | int touched_cleaned; | ||
215 | int flush_failed; | ||
212 | uint32_t region_size; | 216 | uint32_t region_size; |
213 | unsigned int region_count; | 217 | unsigned int region_count; |
214 | region_t sync_count; | 218 | region_t sync_count; |
@@ -233,6 +237,7 @@ struct log_c { | |||
233 | * Disk log fields | 237 | * Disk log fields |
234 | */ | 238 | */ |
235 | int log_dev_failed; | 239 | int log_dev_failed; |
240 | int log_dev_flush_failed; | ||
236 | struct dm_dev *log_dev; | 241 | struct dm_dev *log_dev; |
237 | struct log_header header; | 242 | struct log_header header; |
238 | 243 | ||
@@ -253,14 +258,14 @@ static inline void log_set_bit(struct log_c *l, | |||
253 | uint32_t *bs, unsigned bit) | 258 | uint32_t *bs, unsigned bit) |
254 | { | 259 | { |
255 | ext2_set_bit(bit, (unsigned long *) bs); | 260 | ext2_set_bit(bit, (unsigned long *) bs); |
256 | l->touched = 1; | 261 | l->touched_cleaned = 1; |
257 | } | 262 | } |
258 | 263 | ||
259 | static inline void log_clear_bit(struct log_c *l, | 264 | static inline void log_clear_bit(struct log_c *l, |
260 | uint32_t *bs, unsigned bit) | 265 | uint32_t *bs, unsigned bit) |
261 | { | 266 | { |
262 | ext2_clear_bit(bit, (unsigned long *) bs); | 267 | ext2_clear_bit(bit, (unsigned long *) bs); |
263 | l->touched = 1; | 268 | l->touched_dirtied = 1; |
264 | } | 269 | } |
265 | 270 | ||
266 | /*---------------------------------------------------------------- | 271 | /*---------------------------------------------------------------- |
@@ -287,6 +292,19 @@ static int rw_header(struct log_c *lc, int rw) | |||
287 | return dm_io(&lc->io_req, 1, &lc->header_location, NULL); | 292 | return dm_io(&lc->io_req, 1, &lc->header_location, NULL); |
288 | } | 293 | } |
289 | 294 | ||
295 | static int flush_header(struct log_c *lc) | ||
296 | { | ||
297 | struct dm_io_region null_location = { | ||
298 | .bdev = lc->header_location.bdev, | ||
299 | .sector = 0, | ||
300 | .count = 0, | ||
301 | }; | ||
302 | |||
303 | lc->io_req.bi_rw = WRITE_BARRIER; | ||
304 | |||
305 | return dm_io(&lc->io_req, 1, &null_location, NULL); | ||
306 | } | ||
307 | |||
290 | static int read_header(struct log_c *log) | 308 | static int read_header(struct log_c *log) |
291 | { | 309 | { |
292 | int r; | 310 | int r; |
@@ -378,7 +396,9 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti, | |||
378 | } | 396 | } |
379 | 397 | ||
380 | lc->ti = ti; | 398 | lc->ti = ti; |
381 | lc->touched = 0; | 399 | lc->touched_dirtied = 0; |
400 | lc->touched_cleaned = 0; | ||
401 | lc->flush_failed = 0; | ||
382 | lc->region_size = region_size; | 402 | lc->region_size = region_size; |
383 | lc->region_count = region_count; | 403 | lc->region_count = region_count; |
384 | lc->sync = sync; | 404 | lc->sync = sync; |
@@ -406,6 +426,7 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti, | |||
406 | } else { | 426 | } else { |
407 | lc->log_dev = dev; | 427 | lc->log_dev = dev; |
408 | lc->log_dev_failed = 0; | 428 | lc->log_dev_failed = 0; |
429 | lc->log_dev_flush_failed = 0; | ||
409 | lc->header_location.bdev = lc->log_dev->bdev; | 430 | lc->header_location.bdev = lc->log_dev->bdev; |
410 | lc->header_location.sector = 0; | 431 | lc->header_location.sector = 0; |
411 | 432 | ||
@@ -522,8 +543,7 @@ static int disk_ctr(struct dm_dirty_log *log, struct dm_target *ti, | |||
522 | return -EINVAL; | 543 | return -EINVAL; |
523 | } | 544 | } |
524 | 545 | ||
525 | r = dm_get_device(ti, argv[0], 0, 0 /* FIXME */, | 546 | r = dm_get_device(ti, argv[0], FMODE_READ | FMODE_WRITE, &dev); |
526 | FMODE_READ | FMODE_WRITE, &dev); | ||
527 | if (r) | 547 | if (r) |
528 | return r; | 548 | return r; |
529 | 549 | ||
@@ -614,6 +634,11 @@ static int disk_resume(struct dm_dirty_log *log) | |||
614 | 634 | ||
615 | /* write the new header */ | 635 | /* write the new header */ |
616 | r = rw_header(lc, WRITE); | 636 | r = rw_header(lc, WRITE); |
637 | if (!r) { | ||
638 | r = flush_header(lc); | ||
639 | if (r) | ||
640 | lc->log_dev_flush_failed = 1; | ||
641 | } | ||
617 | if (r) { | 642 | if (r) { |
618 | DMWARN("%s: Failed to write header on dirty region log device", | 643 | DMWARN("%s: Failed to write header on dirty region log device", |
619 | lc->log_dev->name); | 644 | lc->log_dev->name); |
@@ -656,18 +681,40 @@ static int core_flush(struct dm_dirty_log *log) | |||
656 | 681 | ||
657 | static int disk_flush(struct dm_dirty_log *log) | 682 | static int disk_flush(struct dm_dirty_log *log) |
658 | { | 683 | { |
659 | int r; | 684 | int r, i; |
660 | struct log_c *lc = (struct log_c *) log->context; | 685 | struct log_c *lc = log->context; |
661 | 686 | ||
662 | /* only write if the log has changed */ | 687 | /* only write if the log has changed */ |
663 | if (!lc->touched) | 688 | if (!lc->touched_cleaned && !lc->touched_dirtied) |
664 | return 0; | 689 | return 0; |
665 | 690 | ||
691 | if (lc->touched_cleaned && log->flush_callback_fn && | ||
692 | log->flush_callback_fn(lc->ti)) { | ||
693 | /* | ||
694 | * At this point it is impossible to determine which | ||
695 | * regions are clean and which are dirty (without | ||
696 | * re-reading the log off disk). So mark all of them | ||
697 | * dirty. | ||
698 | */ | ||
699 | lc->flush_failed = 1; | ||
700 | for (i = 0; i < lc->region_count; i++) | ||
701 | log_clear_bit(lc, lc->clean_bits, i); | ||
702 | } | ||
703 | |||
666 | r = rw_header(lc, WRITE); | 704 | r = rw_header(lc, WRITE); |
667 | if (r) | 705 | if (r) |
668 | fail_log_device(lc); | 706 | fail_log_device(lc); |
669 | else | 707 | else { |
670 | lc->touched = 0; | 708 | if (lc->touched_dirtied) { |
709 | r = flush_header(lc); | ||
710 | if (r) { | ||
711 | lc->log_dev_flush_failed = 1; | ||
712 | fail_log_device(lc); | ||
713 | } else | ||
714 | lc->touched_dirtied = 0; | ||
715 | } | ||
716 | lc->touched_cleaned = 0; | ||
717 | } | ||
671 | 718 | ||
672 | return r; | 719 | return r; |
673 | } | 720 | } |
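
The reworked disk_flush() distinguishes the two ways the bitmap can change: touched_cleaned (regions newly marked clean) may only be persisted after the flush callback has pushed the data legs to disk, while touched_dirtied (regions newly marked dirty) must itself reach stable storage before the writes it covers are issued, hence the extra flush_header() call. A compact model of that decision flow, with hypothetical names standing in for struct log_c and its helpers:

#include <stdbool.h>

struct log_state {
    bool touched_dirtied;   /* a region was newly marked dirty */
    bool touched_cleaned;   /* a region was newly marked clean */
    bool flush_failed;
};

/* Returns 0 on success, -1 on error, loosely mirroring disk_flush(). */
static int model_disk_flush(struct log_state *l,
                            int (*flush_cb)(void),     /* e.g. mirror_flush() */
                            int (*write_header)(void), /* rw_header(WRITE)    */
                            int (*flush_header)(void))
{
    if (!l->touched_cleaned && !l->touched_dirtied)
        return 0;                 /* nothing changed: skip the header write */

    /* Cleaning regions is only safe once the data legs are flushed;
     * on failure the real code re-marks every region dirty. */
    if (l->touched_cleaned && flush_cb && flush_cb())
        l->flush_failed = true;

    if (write_header())
        return -1;

    /* Newly dirtied regions must hit stable storage before the writes
     * they cover are released, hence the extra barrier. */
    if (l->touched_dirtied && flush_header())
        return -1;

    l->touched_dirtied = false;
    l->touched_cleaned = false;
    return 0;
}
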
@@ -681,7 +728,8 @@ static void core_mark_region(struct dm_dirty_log *log, region_t region) | |||
681 | static void core_clear_region(struct dm_dirty_log *log, region_t region) | 728 | static void core_clear_region(struct dm_dirty_log *log, region_t region) |
682 | { | 729 | { |
683 | struct log_c *lc = (struct log_c *) log->context; | 730 | struct log_c *lc = (struct log_c *) log->context; |
684 | log_set_bit(lc, lc->clean_bits, region); | 731 | if (likely(!lc->flush_failed)) |
732 | log_set_bit(lc, lc->clean_bits, region); | ||
685 | } | 733 | } |
686 | 734 | ||
687 | static int core_get_resync_work(struct dm_dirty_log *log, region_t *region) | 735 | static int core_get_resync_work(struct dm_dirty_log *log, region_t *region) |
@@ -762,7 +810,9 @@ static int disk_status(struct dm_dirty_log *log, status_type_t status, | |||
762 | switch(status) { | 810 | switch(status) { |
763 | case STATUSTYPE_INFO: | 811 | case STATUSTYPE_INFO: |
764 | DMEMIT("3 %s %s %c", log->type->name, lc->log_dev->name, | 812 | DMEMIT("3 %s %s %c", log->type->name, lc->log_dev->name, |
765 | lc->log_dev_failed ? 'D' : 'A'); | 813 | lc->log_dev_flush_failed ? 'F' : |
814 | lc->log_dev_failed ? 'D' : | ||
815 | 'A'); | ||
766 | break; | 816 | break; |
767 | 817 | ||
768 | case STATUSTYPE_TABLE: | 818 | case STATUSTYPE_TABLE: |
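
In the status hunk above, a flush failure now takes precedence over a plain write failure when reporting the log device state. The ternary chain is equivalent to this small helper (hypothetical name):

/* 'F' = flush failed, 'D' = log device failed, 'A' = alive. */
static char log_dev_status_char(int flush_failed, int dev_failed)
{
    return flush_failed ? 'F' : dev_failed ? 'D' : 'A';
}
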
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 32d0b878eccc..826bce7343b3 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c | |||
@@ -69,6 +69,7 @@ struct multipath { | |||
69 | struct list_head priority_groups; | 69 | struct list_head priority_groups; |
70 | unsigned pg_init_required; /* pg_init needs calling? */ | 70 | unsigned pg_init_required; /* pg_init needs calling? */ |
71 | unsigned pg_init_in_progress; /* Only one pg_init allowed at once */ | 71 | unsigned pg_init_in_progress; /* Only one pg_init allowed at once */ |
72 | wait_queue_head_t pg_init_wait; /* Wait for pg_init completion */ | ||
72 | 73 | ||
73 | unsigned nr_valid_paths; /* Total number of usable paths */ | 74 | unsigned nr_valid_paths; /* Total number of usable paths */ |
74 | struct pgpath *current_pgpath; | 75 | struct pgpath *current_pgpath; |
@@ -93,6 +94,8 @@ struct multipath { | |||
93 | * can resubmit bios on error. | 94 | * can resubmit bios on error. |
94 | */ | 95 | */ |
95 | mempool_t *mpio_pool; | 96 | mempool_t *mpio_pool; |
97 | |||
98 | struct mutex work_mutex; | ||
96 | }; | 99 | }; |
97 | 100 | ||
98 | /* | 101 | /* |
@@ -198,6 +201,8 @@ static struct multipath *alloc_multipath(struct dm_target *ti) | |||
198 | m->queue_io = 1; | 201 | m->queue_io = 1; |
199 | INIT_WORK(&m->process_queued_ios, process_queued_ios); | 202 | INIT_WORK(&m->process_queued_ios, process_queued_ios); |
200 | INIT_WORK(&m->trigger_event, trigger_event); | 203 | INIT_WORK(&m->trigger_event, trigger_event); |
204 | init_waitqueue_head(&m->pg_init_wait); | ||
205 | mutex_init(&m->work_mutex); | ||
201 | m->mpio_pool = mempool_create_slab_pool(MIN_IOS, _mpio_cache); | 206 | m->mpio_pool = mempool_create_slab_pool(MIN_IOS, _mpio_cache); |
202 | if (!m->mpio_pool) { | 207 | if (!m->mpio_pool) { |
203 | kfree(m); | 208 | kfree(m); |
@@ -230,6 +235,21 @@ static void free_multipath(struct multipath *m) | |||
230 | * Path selection | 235 | * Path selection |
231 | *-----------------------------------------------*/ | 236 | *-----------------------------------------------*/ |
232 | 237 | ||
238 | static void __pg_init_all_paths(struct multipath *m) | ||
239 | { | ||
240 | struct pgpath *pgpath; | ||
241 | |||
242 | m->pg_init_count++; | ||
243 | m->pg_init_required = 0; | ||
244 | list_for_each_entry(pgpath, &m->current_pg->pgpaths, list) { | ||
245 | /* Skip failed paths */ | ||
246 | if (!pgpath->is_active) | ||
247 | continue; | ||
248 | if (queue_work(kmpath_handlerd, &pgpath->activate_path)) | ||
249 | m->pg_init_in_progress++; | ||
250 | } | ||
251 | } | ||
252 | |||
233 | static void __switch_pg(struct multipath *m, struct pgpath *pgpath) | 253 | static void __switch_pg(struct multipath *m, struct pgpath *pgpath) |
234 | { | 254 | { |
235 | m->current_pg = pgpath->pg; | 255 | m->current_pg = pgpath->pg; |
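
__pg_init_all_paths() queues one activation work item per usable path and only bumps pg_init_in_progress for items that were actually queued, so the counter matches the number of pg_init_done() completions still expected. A stand-alone model of that bookkeeping; the types and the queue_activate callback are hypothetical, not the kernel workqueue API:

#include <stdbool.h>
#include <stddef.h>

struct model_path  { bool is_active; };
struct model_mpath {
    unsigned pg_init_count;        /* rounds of initialisation started */
    unsigned pg_init_required;
    unsigned pg_init_in_progress;  /* completions still outstanding    */
};

static void model_pg_init_all_paths(struct model_mpath *m,
                                    struct model_path *paths, size_t n,
                                    bool (*queue_activate)(struct model_path *))
{
    m->pg_init_count++;
    m->pg_init_required = 0;

    for (size_t i = 0; i < n; i++) {
        if (!paths[i].is_active)
            continue;                      /* skip failed paths       */
        if (queue_activate(&paths[i]))     /* false if already queued */
            m->pg_init_in_progress++;
    }
}
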
@@ -434,7 +454,7 @@ static void process_queued_ios(struct work_struct *work) | |||
434 | { | 454 | { |
435 | struct multipath *m = | 455 | struct multipath *m = |
436 | container_of(work, struct multipath, process_queued_ios); | 456 | container_of(work, struct multipath, process_queued_ios); |
437 | struct pgpath *pgpath = NULL, *tmp; | 457 | struct pgpath *pgpath = NULL; |
438 | unsigned must_queue = 1; | 458 | unsigned must_queue = 1; |
439 | unsigned long flags; | 459 | unsigned long flags; |
440 | 460 | ||
@@ -452,14 +472,9 @@ static void process_queued_ios(struct work_struct *work) | |||
452 | (!pgpath && !m->queue_if_no_path)) | 472 | (!pgpath && !m->queue_if_no_path)) |
453 | must_queue = 0; | 473 | must_queue = 0; |
454 | 474 | ||
455 | if (m->pg_init_required && !m->pg_init_in_progress && pgpath) { | 475 | if (m->pg_init_required && !m->pg_init_in_progress && pgpath) |
456 | m->pg_init_count++; | 476 | __pg_init_all_paths(m); |
457 | m->pg_init_required = 0; | 477 | |
458 | list_for_each_entry(tmp, &pgpath->pg->pgpaths, list) { | ||
459 | if (queue_work(kmpath_handlerd, &tmp->activate_path)) | ||
460 | m->pg_init_in_progress++; | ||
461 | } | ||
462 | } | ||
463 | out: | 478 | out: |
464 | spin_unlock_irqrestore(&m->lock, flags); | 479 | spin_unlock_irqrestore(&m->lock, flags); |
465 | if (!must_queue) | 480 | if (!must_queue) |
@@ -592,8 +607,8 @@ static struct pgpath *parse_path(struct arg_set *as, struct path_selector *ps, | |||
592 | if (!p) | 607 | if (!p) |
593 | return ERR_PTR(-ENOMEM); | 608 | return ERR_PTR(-ENOMEM); |
594 | 609 | ||
595 | r = dm_get_device(ti, shift(as), ti->begin, ti->len, | 610 | r = dm_get_device(ti, shift(as), dm_table_get_mode(ti->table), |
596 | dm_table_get_mode(ti->table), &p->path.dev); | 611 | &p->path.dev); |
597 | if (r) { | 612 | if (r) { |
598 | ti->error = "error getting device"; | 613 | ti->error = "error getting device"; |
599 | goto bad; | 614 | goto bad; |
@@ -885,13 +900,43 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc, | |||
885 | return r; | 900 | return r; |
886 | } | 901 | } |
887 | 902 | ||
888 | static void multipath_dtr(struct dm_target *ti) | 903 | static void multipath_wait_for_pg_init_completion(struct multipath *m) |
889 | { | 904 | { |
890 | struct multipath *m = (struct multipath *) ti->private; | 905 | DECLARE_WAITQUEUE(wait, current); |
906 | unsigned long flags; | ||
907 | |||
908 | add_wait_queue(&m->pg_init_wait, &wait); | ||
891 | 909 | ||
910 | while (1) { | ||
911 | set_current_state(TASK_UNINTERRUPTIBLE); | ||
912 | |||
913 | spin_lock_irqsave(&m->lock, flags); | ||
914 | if (!m->pg_init_in_progress) { | ||
915 | spin_unlock_irqrestore(&m->lock, flags); | ||
916 | break; | ||
917 | } | ||
918 | spin_unlock_irqrestore(&m->lock, flags); | ||
919 | |||
920 | io_schedule(); | ||
921 | } | ||
922 | set_current_state(TASK_RUNNING); | ||
923 | |||
924 | remove_wait_queue(&m->pg_init_wait, &wait); | ||
925 | } | ||
926 | |||
927 | static void flush_multipath_work(struct multipath *m) | ||
928 | { | ||
892 | flush_workqueue(kmpath_handlerd); | 929 | flush_workqueue(kmpath_handlerd); |
930 | multipath_wait_for_pg_init_completion(m); | ||
893 | flush_workqueue(kmultipathd); | 931 | flush_workqueue(kmultipathd); |
894 | flush_scheduled_work(); | 932 | flush_scheduled_work(); |
933 | } | ||
934 | |||
935 | static void multipath_dtr(struct dm_target *ti) | ||
936 | { | ||
937 | struct multipath *m = ti->private; | ||
938 | |||
939 | flush_multipath_work(m); | ||
895 | free_multipath(m); | 940 | free_multipath(m); |
896 | } | 941 | } |
897 | 942 | ||
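
multipath_wait_for_pg_init_completion() uses the standard kernel wait-queue idiom: register on the queue, re-check pg_init_in_progress under the lock, sleep, and repeat until the counter hits zero; the last pg_init_done() completion wakes the waiters. The same handshake expressed with POSIX threads (hypothetical names, purely illustrative):

#include <pthread.h>

struct mp_wait {
    pthread_mutex_t lock;
    pthread_cond_t  pg_init_wait;
    unsigned        pg_init_in_progress;
};

/* Caller side: block until every queued path activation has completed. */
static void wait_for_pg_init(struct mp_wait *m)
{
    pthread_mutex_lock(&m->lock);
    while (m->pg_init_in_progress)
        pthread_cond_wait(&m->pg_init_wait, &m->lock);
    pthread_mutex_unlock(&m->lock);
}

/* Completion side: the last finishing activation wakes the waiters,
 * mirroring the wake_up(&m->pg_init_wait) call in pg_init_done(). */
static void pg_init_finished(struct mp_wait *m)
{
    pthread_mutex_lock(&m->lock);
    if (--m->pg_init_in_progress == 0)
        pthread_cond_broadcast(&m->pg_init_wait);
    pthread_mutex_unlock(&m->lock);
}
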
@@ -1116,9 +1161,9 @@ static int pg_init_limit_reached(struct multipath *m, struct pgpath *pgpath) | |||
1116 | return limit_reached; | 1161 | return limit_reached; |
1117 | } | 1162 | } |
1118 | 1163 | ||
1119 | static void pg_init_done(struct dm_path *path, int errors) | 1164 | static void pg_init_done(void *data, int errors) |
1120 | { | 1165 | { |
1121 | struct pgpath *pgpath = path_to_pgpath(path); | 1166 | struct pgpath *pgpath = data; |
1122 | struct priority_group *pg = pgpath->pg; | 1167 | struct priority_group *pg = pgpath->pg; |
1123 | struct multipath *m = pg->m; | 1168 | struct multipath *m = pg->m; |
1124 | unsigned long flags; | 1169 | unsigned long flags; |
@@ -1132,8 +1177,8 @@ static void pg_init_done(struct dm_path *path, int errors) | |||
1132 | errors = 0; | 1177 | errors = 0; |
1133 | break; | 1178 | break; |
1134 | } | 1179 | } |
1135 | DMERR("Cannot failover device because scsi_dh_%s was not " | 1180 | DMERR("Could not failover the device: Handler scsi_dh_%s " |
1136 | "loaded.", m->hw_handler_name); | 1181 | "Error %d.", m->hw_handler_name, errors); |
1137 | /* | 1182 | /* |
1138 | * Fail path for now, so we do not ping pong | 1183 | * Fail path for now, so we do not ping pong |
1139 | */ | 1184 | */ |
@@ -1170,25 +1215,34 @@ static void pg_init_done(struct dm_path *path, int errors) | |||
1170 | m->current_pgpath = NULL; | 1215 | m->current_pgpath = NULL; |
1171 | m->current_pg = NULL; | 1216 | m->current_pg = NULL; |
1172 | } | 1217 | } |
1173 | } else if (!m->pg_init_required) { | 1218 | } else if (!m->pg_init_required) |
1174 | m->queue_io = 0; | ||
1175 | pg->bypassed = 0; | 1219 | pg->bypassed = 0; |
1176 | } | ||
1177 | 1220 | ||
1178 | m->pg_init_in_progress--; | 1221 | if (--m->pg_init_in_progress) |
1179 | if (!m->pg_init_in_progress) | 1222 | /* Activations of other paths are still ongoing */ |
1180 | queue_work(kmultipathd, &m->process_queued_ios); | 1223 | goto out; |
1224 | |||
1225 | if (!m->pg_init_required) | ||
1226 | m->queue_io = 0; | ||
1227 | |||
1228 | queue_work(kmultipathd, &m->process_queued_ios); | ||
1229 | |||
1230 | /* | ||
1231 | * Wake up any thread waiting to suspend. | ||
1232 | */ | ||
1233 | wake_up(&m->pg_init_wait); | ||
1234 | |||
1235 | out: | ||
1181 | spin_unlock_irqrestore(&m->lock, flags); | 1236 | spin_unlock_irqrestore(&m->lock, flags); |
1182 | } | 1237 | } |
1183 | 1238 | ||
1184 | static void activate_path(struct work_struct *work) | 1239 | static void activate_path(struct work_struct *work) |
1185 | { | 1240 | { |
1186 | int ret; | ||
1187 | struct pgpath *pgpath = | 1241 | struct pgpath *pgpath = |
1188 | container_of(work, struct pgpath, activate_path); | 1242 | container_of(work, struct pgpath, activate_path); |
1189 | 1243 | ||
1190 | ret = scsi_dh_activate(bdev_get_queue(pgpath->path.dev->bdev)); | 1244 | scsi_dh_activate(bdev_get_queue(pgpath->path.dev->bdev), |
1191 | pg_init_done(&pgpath->path, ret); | 1245 | pg_init_done, pgpath); |
1192 | } | 1246 | } |
1193 | 1247 | ||
1194 | /* | 1248 | /* |
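
activate_path() no longer waits for a return value: the hunk above hands pg_init_done() and the pgpath to the device handler as a completion callback, which is why pg_init_done()'s first argument became a void pointer. A minimal sketch of that synchronous-to-callback conversion, using invented helper names rather than the real scsi_dh interface:

typedef void (*activate_done_fn)(void *data, int errors);

/* Old shape: the caller blocks until activation finishes. */
static int activate_sync(int (*do_activate)(void))
{
    return do_activate();
}

/* New shape: the caller returns immediately and the result is delivered
 * through the callback exactly once, possibly from another context.
 * Here the submission is collapsed into a direct call for brevity. */
static void activate_async(int (*do_activate)(void),
                           activate_done_fn done, void *data)
{
    done(data, do_activate());
}
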
@@ -1261,6 +1315,15 @@ static void multipath_presuspend(struct dm_target *ti) | |||
1261 | queue_if_no_path(m, 0, 1); | 1315 | queue_if_no_path(m, 0, 1); |
1262 | } | 1316 | } |
1263 | 1317 | ||
1318 | static void multipath_postsuspend(struct dm_target *ti) | ||
1319 | { | ||
1320 | struct multipath *m = ti->private; | ||
1321 | |||
1322 | mutex_lock(&m->work_mutex); | ||
1323 | flush_multipath_work(m); | ||
1324 | mutex_unlock(&m->work_mutex); | ||
1325 | } | ||
1326 | |||
1264 | /* | 1327 | /* |
1265 | * Restore the queue_if_no_path setting. | 1328 | * Restore the queue_if_no_path setting. |
1266 | */ | 1329 | */ |
@@ -1397,51 +1460,65 @@ static int multipath_status(struct dm_target *ti, status_type_t type, | |||
1397 | 1460 | ||
1398 | static int multipath_message(struct dm_target *ti, unsigned argc, char **argv) | 1461 | static int multipath_message(struct dm_target *ti, unsigned argc, char **argv) |
1399 | { | 1462 | { |
1400 | int r; | 1463 | int r = -EINVAL; |
1401 | struct dm_dev *dev; | 1464 | struct dm_dev *dev; |
1402 | struct multipath *m = (struct multipath *) ti->private; | 1465 | struct multipath *m = (struct multipath *) ti->private; |
1403 | action_fn action; | 1466 | action_fn action; |
1404 | 1467 | ||
1468 | mutex_lock(&m->work_mutex); | ||
1469 | |||
1470 | if (dm_suspended(ti)) { | ||
1471 | r = -EBUSY; | ||
1472 | goto out; | ||
1473 | } | ||
1474 | |||
1405 | if (argc == 1) { | 1475 | if (argc == 1) { |
1406 | if (!strnicmp(argv[0], MESG_STR("queue_if_no_path"))) | 1476 | if (!strnicmp(argv[0], MESG_STR("queue_if_no_path"))) { |
1407 | return queue_if_no_path(m, 1, 0); | 1477 | r = queue_if_no_path(m, 1, 0); |
1408 | else if (!strnicmp(argv[0], MESG_STR("fail_if_no_path"))) | 1478 | goto out; |
1409 | return queue_if_no_path(m, 0, 0); | 1479 | } else if (!strnicmp(argv[0], MESG_STR("fail_if_no_path"))) { |
1480 | r = queue_if_no_path(m, 0, 0); | ||
1481 | goto out; | ||
1482 | } | ||
1410 | } | 1483 | } |
1411 | 1484 | ||
1412 | if (argc != 2) | 1485 | if (argc != 2) { |
1413 | goto error; | 1486 | DMWARN("Unrecognised multipath message received."); |
1487 | goto out; | ||
1488 | } | ||
1414 | 1489 | ||
1415 | if (!strnicmp(argv[0], MESG_STR("disable_group"))) | 1490 | if (!strnicmp(argv[0], MESG_STR("disable_group"))) { |
1416 | return bypass_pg_num(m, argv[1], 1); | 1491 | r = bypass_pg_num(m, argv[1], 1); |
1417 | else if (!strnicmp(argv[0], MESG_STR("enable_group"))) | 1492 | goto out; |
1418 | return bypass_pg_num(m, argv[1], 0); | 1493 | } else if (!strnicmp(argv[0], MESG_STR("enable_group"))) { |
1419 | else if (!strnicmp(argv[0], MESG_STR("switch_group"))) | 1494 | r = bypass_pg_num(m, argv[1], 0); |
1420 | return switch_pg_num(m, argv[1]); | 1495 | goto out; |
1421 | else if (!strnicmp(argv[0], MESG_STR("reinstate_path"))) | 1496 | } else if (!strnicmp(argv[0], MESG_STR("switch_group"))) { |
1497 | r = switch_pg_num(m, argv[1]); | ||
1498 | goto out; | ||
1499 | } else if (!strnicmp(argv[0], MESG_STR("reinstate_path"))) | ||
1422 | action = reinstate_path; | 1500 | action = reinstate_path; |
1423 | else if (!strnicmp(argv[0], MESG_STR("fail_path"))) | 1501 | else if (!strnicmp(argv[0], MESG_STR("fail_path"))) |
1424 | action = fail_path; | 1502 | action = fail_path; |
1425 | else | 1503 | else { |
1426 | goto error; | 1504 | DMWARN("Unrecognised multipath message received."); |
1505 | goto out; | ||
1506 | } | ||
1427 | 1507 | ||
1428 | r = dm_get_device(ti, argv[1], ti->begin, ti->len, | 1508 | r = dm_get_device(ti, argv[1], dm_table_get_mode(ti->table), &dev); |
1429 | dm_table_get_mode(ti->table), &dev); | ||
1430 | if (r) { | 1509 | if (r) { |
1431 | DMWARN("message: error getting device %s", | 1510 | DMWARN("message: error getting device %s", |
1432 | argv[1]); | 1511 | argv[1]); |
1433 | return -EINVAL; | 1512 | goto out; |
1434 | } | 1513 | } |
1435 | 1514 | ||
1436 | r = action_dev(m, dev, action); | 1515 | r = action_dev(m, dev, action); |
1437 | 1516 | ||
1438 | dm_put_device(ti, dev); | 1517 | dm_put_device(ti, dev); |
1439 | 1518 | ||
1519 | out: | ||
1520 | mutex_unlock(&m->work_mutex); | ||
1440 | return r; | 1521 | return r; |
1441 | |||
1442 | error: | ||
1443 | DMWARN("Unrecognised multipath message received."); | ||
1444 | return -EINVAL; | ||
1445 | } | 1522 | } |
1446 | 1523 | ||
1447 | static int multipath_ioctl(struct dm_target *ti, unsigned int cmd, | 1524 | static int multipath_ioctl(struct dm_target *ti, unsigned int cmd, |
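
The rewritten message handler funnels every exit through a single out: label so work_mutex is always released, and it refuses messages with -EBUSY while the target is suspended. A condensed pthread model of that arrangement; the 'suspended' flag is a stand-in for dm_suspended(ti), which the driver queries instead:

#include <errno.h>
#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t work_mutex = PTHREAD_MUTEX_INITIALIZER;
static bool suspended;             /* stand-in for dm_suspended(ti) */

static int model_multipath_message(int (*handle)(void))
{
    int r;

    pthread_mutex_lock(&work_mutex);
    if (suspended)
        r = -EBUSY;                /* reject messages while suspended */
    else
        r = handle();              /* parse and act on the message    */
    pthread_mutex_unlock(&work_mutex);

    return r;                      /* single unlock point, like out:  */
}
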
@@ -1567,13 +1644,14 @@ out: | |||
1567 | *---------------------------------------------------------------*/ | 1644 | *---------------------------------------------------------------*/ |
1568 | static struct target_type multipath_target = { | 1645 | static struct target_type multipath_target = { |
1569 | .name = "multipath", | 1646 | .name = "multipath", |
1570 | .version = {1, 1, 0}, | 1647 | .version = {1, 1, 1}, |
1571 | .module = THIS_MODULE, | 1648 | .module = THIS_MODULE, |
1572 | .ctr = multipath_ctr, | 1649 | .ctr = multipath_ctr, |
1573 | .dtr = multipath_dtr, | 1650 | .dtr = multipath_dtr, |
1574 | .map_rq = multipath_map, | 1651 | .map_rq = multipath_map, |
1575 | .rq_end_io = multipath_end_io, | 1652 | .rq_end_io = multipath_end_io, |
1576 | .presuspend = multipath_presuspend, | 1653 | .presuspend = multipath_presuspend, |
1654 | .postsuspend = multipath_postsuspend, | ||
1577 | .resume = multipath_resume, | 1655 | .resume = multipath_resume, |
1578 | .status = multipath_status, | 1656 | .status = multipath_status, |
1579 | .message = multipath_message, | 1657 | .message = multipath_message, |
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index cc9dc79b0784..ddda531723dc 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c | |||
@@ -35,6 +35,7 @@ static DECLARE_WAIT_QUEUE_HEAD(_kmirrord_recovery_stopped); | |||
35 | *---------------------------------------------------------------*/ | 35 | *---------------------------------------------------------------*/ |
36 | enum dm_raid1_error { | 36 | enum dm_raid1_error { |
37 | DM_RAID1_WRITE_ERROR, | 37 | DM_RAID1_WRITE_ERROR, |
38 | DM_RAID1_FLUSH_ERROR, | ||
38 | DM_RAID1_SYNC_ERROR, | 39 | DM_RAID1_SYNC_ERROR, |
39 | DM_RAID1_READ_ERROR | 40 | DM_RAID1_READ_ERROR |
40 | }; | 41 | }; |
@@ -57,6 +58,7 @@ struct mirror_set { | |||
57 | struct bio_list reads; | 58 | struct bio_list reads; |
58 | struct bio_list writes; | 59 | struct bio_list writes; |
59 | struct bio_list failures; | 60 | struct bio_list failures; |
61 | struct bio_list holds; /* bios are waiting until suspend */ | ||
60 | 62 | ||
61 | struct dm_region_hash *rh; | 63 | struct dm_region_hash *rh; |
62 | struct dm_kcopyd_client *kcopyd_client; | 64 | struct dm_kcopyd_client *kcopyd_client; |
@@ -67,6 +69,7 @@ struct mirror_set { | |||
67 | region_t nr_regions; | 69 | region_t nr_regions; |
68 | int in_sync; | 70 | int in_sync; |
69 | int log_failure; | 71 | int log_failure; |
72 | int leg_failure; | ||
70 | atomic_t suspend; | 73 | atomic_t suspend; |
71 | 74 | ||
72 | atomic_t default_mirror; /* Default mirror */ | 75 | atomic_t default_mirror; /* Default mirror */ |
@@ -179,6 +182,17 @@ static void set_default_mirror(struct mirror *m) | |||
179 | atomic_set(&ms->default_mirror, m - m0); | 182 | atomic_set(&ms->default_mirror, m - m0); |
180 | } | 183 | } |
181 | 184 | ||
185 | static struct mirror *get_valid_mirror(struct mirror_set *ms) | ||
186 | { | ||
187 | struct mirror *m; | ||
188 | |||
189 | for (m = ms->mirror; m < ms->mirror + ms->nr_mirrors; m++) | ||
190 | if (!atomic_read(&m->error_count)) | ||
191 | return m; | ||
192 | |||
193 | return NULL; | ||
194 | } | ||
195 | |||
182 | /* fail_mirror | 196 | /* fail_mirror |
183 | * @m: mirror device to fail | 197 | * @m: mirror device to fail |
184 | * @error_type: one of the enum's, DM_RAID1_*_ERROR | 198 | * @error_type: one of the enum's, DM_RAID1_*_ERROR |
@@ -198,6 +212,8 @@ static void fail_mirror(struct mirror *m, enum dm_raid1_error error_type) | |||
198 | struct mirror_set *ms = m->ms; | 212 | struct mirror_set *ms = m->ms; |
199 | struct mirror *new; | 213 | struct mirror *new; |
200 | 214 | ||
215 | ms->leg_failure = 1; | ||
216 | |||
201 | /* | 217 | /* |
202 | * error_count is used for nothing more than a | 218 | * error_count is used for nothing more than a |
203 | * simple way to tell if a device has encountered | 219 | * simple way to tell if a device has encountered |
@@ -224,19 +240,50 @@ static void fail_mirror(struct mirror *m, enum dm_raid1_error error_type) | |||
224 | goto out; | 240 | goto out; |
225 | } | 241 | } |
226 | 242 | ||
227 | for (new = ms->mirror; new < ms->mirror + ms->nr_mirrors; new++) | 243 | new = get_valid_mirror(ms); |
228 | if (!atomic_read(&new->error_count)) { | 244 | if (new) |
229 | set_default_mirror(new); | 245 | set_default_mirror(new); |
230 | break; | 246 | else |
231 | } | ||
232 | |||
233 | if (unlikely(new == ms->mirror + ms->nr_mirrors)) | ||
234 | DMWARN("All sides of mirror have failed."); | 247 | DMWARN("All sides of mirror have failed."); |
235 | 248 | ||
236 | out: | 249 | out: |
237 | schedule_work(&ms->trigger_event); | 250 | schedule_work(&ms->trigger_event); |
238 | } | 251 | } |
239 | 252 | ||
253 | static int mirror_flush(struct dm_target *ti) | ||
254 | { | ||
255 | struct mirror_set *ms = ti->private; | ||
256 | unsigned long error_bits; | ||
257 | |||
258 | unsigned int i; | ||
259 | struct dm_io_region io[ms->nr_mirrors]; | ||
260 | struct mirror *m; | ||
261 | struct dm_io_request io_req = { | ||
262 | .bi_rw = WRITE_BARRIER, | ||
263 | .mem.type = DM_IO_KMEM, | ||
264 | .mem.ptr.bvec = NULL, | ||
265 | .client = ms->io_client, | ||
266 | }; | ||
267 | |||
268 | for (i = 0, m = ms->mirror; i < ms->nr_mirrors; i++, m++) { | ||
269 | io[i].bdev = m->dev->bdev; | ||
270 | io[i].sector = 0; | ||
271 | io[i].count = 0; | ||
272 | } | ||
273 | |||
274 | error_bits = -1; | ||
275 | dm_io(&io_req, ms->nr_mirrors, io, &error_bits); | ||
276 | if (unlikely(error_bits != 0)) { | ||
277 | for (i = 0; i < ms->nr_mirrors; i++) | ||
278 | if (test_bit(i, &error_bits)) | ||
279 | fail_mirror(ms->mirror + i, | ||
280 | DM_RAID1_FLUSH_ERROR); | ||
281 | return -EIO; | ||
282 | } | ||
283 | |||
284 | return 0; | ||
285 | } | ||
286 | |||
240 | /*----------------------------------------------------------------- | 287 | /*----------------------------------------------------------------- |
241 | * Recovery. | 288 | * Recovery. |
242 | * | 289 | * |
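
mirror_flush() sends one empty-barrier request to every leg in a single dm_io() call and then inspects the returned error bitmap: each set bit identifies a leg whose flush failed and gets marked with DM_RAID1_FLUSH_ERROR. The bitmap walk reduces to the following sketch (fail_leg is a hypothetical callback standing in for fail_mirror()):

#include <errno.h>

static int model_flush_result(unsigned long error_bits, unsigned nr_mirrors,
                              void (*fail_leg)(unsigned))
{
    if (error_bits == 0)
        return 0;                        /* every leg flushed cleanly  */

    for (unsigned i = 0; i < nr_mirrors; i++)
        if (error_bits & (1UL << i))     /* test_bit(i, &error_bits)   */
            fail_leg(i);                 /* DM_RAID1_FLUSH_ERROR path  */

    return -EIO;                         /* as returned by mirror_flush() */
}
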
@@ -396,6 +443,8 @@ static int mirror_available(struct mirror_set *ms, struct bio *bio) | |||
396 | */ | 443 | */ |
397 | static sector_t map_sector(struct mirror *m, struct bio *bio) | 444 | static sector_t map_sector(struct mirror *m, struct bio *bio) |
398 | { | 445 | { |
446 | if (unlikely(!bio->bi_size)) | ||
447 | return 0; | ||
399 | return m->offset + (bio->bi_sector - m->ms->ti->begin); | 448 | return m->offset + (bio->bi_sector - m->ms->ti->begin); |
400 | } | 449 | } |
401 | 450 | ||
@@ -413,6 +462,34 @@ static void map_region(struct dm_io_region *io, struct mirror *m, | |||
413 | io->count = bio->bi_size >> 9; | 462 | io->count = bio->bi_size >> 9; |
414 | } | 463 | } |
415 | 464 | ||
465 | static void hold_bio(struct mirror_set *ms, struct bio *bio) | ||
466 | { | ||
467 | /* | ||
468 | * Lock is required to avoid race condition during suspend | ||
469 | * process. | ||
470 | */ | ||
471 | spin_lock_irq(&ms->lock); | ||
472 | |||
473 | if (atomic_read(&ms->suspend)) { | ||
474 | spin_unlock_irq(&ms->lock); | ||
475 | |||
476 | /* | ||
477 | * If device is suspended, complete the bio. | ||
478 | */ | ||
479 | if (dm_noflush_suspending(ms->ti)) | ||
480 | bio_endio(bio, DM_ENDIO_REQUEUE); | ||
481 | else | ||
482 | bio_endio(bio, -EIO); | ||
483 | return; | ||
484 | } | ||
485 | |||
486 | /* | ||
487 | * Hold bio until the suspend is complete. | ||
488 | */ | ||
489 | bio_list_add(&ms->holds, bio); | ||
490 | spin_unlock_irq(&ms->lock); | ||
491 | } | ||
492 | |||
416 | /*----------------------------------------------------------------- | 493 | /*----------------------------------------------------------------- |
417 | * Reads | 494 | * Reads |
418 | *---------------------------------------------------------------*/ | 495 | *---------------------------------------------------------------*/ |
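
hold_bio() has three possible outcomes: while the set is not suspended the bio is parked on the holds list; during a noflush suspend it is completed with DM_ENDIO_REQUEUE so device-mapper retries it later; during an ordinary suspend it is failed with -EIO. The choice can be summarised with this hypothetical classifier:

enum hold_outcome { HOLD_PARK, HOLD_REQUEUE, HOLD_FAIL };

static enum hold_outcome classify_hold(int suspended, int noflush_suspending)
{
    if (!suspended)
        return HOLD_PARK;                     /* bio_list_add(&ms->holds, bio) */
    return noflush_suspending ? HOLD_REQUEUE  /* DM_ENDIO_REQUEUE */
                              : HOLD_FAIL;    /* -EIO             */
}
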
@@ -511,7 +588,6 @@ static void write_callback(unsigned long error, void *context) | |||
511 | unsigned i, ret = 0; | 588 | unsigned i, ret = 0; |
512 | struct bio *bio = (struct bio *) context; | 589 | struct bio *bio = (struct bio *) context; |
513 | struct mirror_set *ms; | 590 | struct mirror_set *ms; |
514 | int uptodate = 0; | ||
515 | int should_wake = 0; | 591 | int should_wake = 0; |
516 | unsigned long flags; | 592 | unsigned long flags; |
517 | 593 | ||
@@ -524,36 +600,27 @@ static void write_callback(unsigned long error, void *context) | |||
524 | * This way we handle both writes to SYNC and NOSYNC | 600 | * This way we handle both writes to SYNC and NOSYNC |
525 | * regions with the same code. | 601 | * regions with the same code. |
526 | */ | 602 | */ |
527 | if (likely(!error)) | 603 | if (likely(!error)) { |
528 | goto out; | 604 | bio_endio(bio, ret); |
605 | return; | ||
606 | } | ||
529 | 607 | ||
530 | for (i = 0; i < ms->nr_mirrors; i++) | 608 | for (i = 0; i < ms->nr_mirrors; i++) |
531 | if (test_bit(i, &error)) | 609 | if (test_bit(i, &error)) |
532 | fail_mirror(ms->mirror + i, DM_RAID1_WRITE_ERROR); | 610 | fail_mirror(ms->mirror + i, DM_RAID1_WRITE_ERROR); |
533 | else | ||
534 | uptodate = 1; | ||
535 | 611 | ||
536 | if (unlikely(!uptodate)) { | 612 | /* |
537 | DMERR("All replicated volumes dead, failing I/O"); | 613 | * Need to raise event. Since raising |
538 | /* None of the writes succeeded, fail the I/O. */ | 614 | * events can block, we need to do it in |
539 | ret = -EIO; | 615 | * the main thread. |
540 | } else if (errors_handled(ms)) { | 616 | */ |
541 | /* | 617 | spin_lock_irqsave(&ms->lock, flags); |
542 | * Need to raise event. Since raising | 618 | if (!ms->failures.head) |
543 | * events can block, we need to do it in | 619 | should_wake = 1; |
544 | * the main thread. | 620 | bio_list_add(&ms->failures, bio); |
545 | */ | 621 | spin_unlock_irqrestore(&ms->lock, flags); |
546 | spin_lock_irqsave(&ms->lock, flags); | 622 | if (should_wake) |
547 | if (!ms->failures.head) | 623 | wakeup_mirrord(ms); |
548 | should_wake = 1; | ||
549 | bio_list_add(&ms->failures, bio); | ||
550 | spin_unlock_irqrestore(&ms->lock, flags); | ||
551 | if (should_wake) | ||
552 | wakeup_mirrord(ms); | ||
553 | return; | ||
554 | } | ||
555 | out: | ||
556 | bio_endio(bio, ret); | ||
557 | } | 624 | } |
558 | 625 | ||
559 | static void do_write(struct mirror_set *ms, struct bio *bio) | 626 | static void do_write(struct mirror_set *ms, struct bio *bio) |
@@ -562,7 +629,7 @@ static void do_write(struct mirror_set *ms, struct bio *bio) | |||
562 | struct dm_io_region io[ms->nr_mirrors], *dest = io; | 629 | struct dm_io_region io[ms->nr_mirrors], *dest = io; |
563 | struct mirror *m; | 630 | struct mirror *m; |
564 | struct dm_io_request io_req = { | 631 | struct dm_io_request io_req = { |
565 | .bi_rw = WRITE, | 632 | .bi_rw = WRITE | (bio->bi_rw & WRITE_BARRIER), |
566 | .mem.type = DM_IO_BVEC, | 633 | .mem.type = DM_IO_BVEC, |
567 | .mem.ptr.bvec = bio->bi_io_vec + bio->bi_idx, | 634 | .mem.ptr.bvec = bio->bi_io_vec + bio->bi_idx, |
568 | .notify.fn = write_callback, | 635 | .notify.fn = write_callback, |
@@ -603,6 +670,11 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes) | |||
603 | bio_list_init(&requeue); | 670 | bio_list_init(&requeue); |
604 | 671 | ||
605 | while ((bio = bio_list_pop(writes))) { | 672 | while ((bio = bio_list_pop(writes))) { |
673 | if (unlikely(bio_empty_barrier(bio))) { | ||
674 | bio_list_add(&sync, bio); | ||
675 | continue; | ||
676 | } | ||
677 | |||
606 | region = dm_rh_bio_to_region(ms->rh, bio); | 678 | region = dm_rh_bio_to_region(ms->rh, bio); |
607 | 679 | ||
608 | if (log->type->is_remote_recovering && | 680 | if (log->type->is_remote_recovering && |
@@ -659,7 +731,7 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes) | |||
659 | /* | 731 | /* |
660 | * Dispatch io. | 732 | * Dispatch io. |
661 | */ | 733 | */ |
662 | if (unlikely(ms->log_failure)) { | 734 | if (unlikely(ms->log_failure) && errors_handled(ms)) { |
663 | spin_lock_irq(&ms->lock); | 735 | spin_lock_irq(&ms->lock); |
664 | bio_list_merge(&ms->failures, &sync); | 736 | bio_list_merge(&ms->failures, &sync); |
665 | spin_unlock_irq(&ms->lock); | 737 | spin_unlock_irq(&ms->lock); |
@@ -672,8 +744,15 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes) | |||
672 | dm_rh_delay(ms->rh, bio); | 744 | dm_rh_delay(ms->rh, bio); |
673 | 745 | ||
674 | while ((bio = bio_list_pop(&nosync))) { | 746 | while ((bio = bio_list_pop(&nosync))) { |
675 | map_bio(get_default_mirror(ms), bio); | 747 | if (unlikely(ms->leg_failure) && errors_handled(ms)) { |
676 | generic_make_request(bio); | 748 | spin_lock_irq(&ms->lock); |
749 | bio_list_add(&ms->failures, bio); | ||
750 | spin_unlock_irq(&ms->lock); | ||
751 | wakeup_mirrord(ms); | ||
752 | } else { | ||
753 | map_bio(get_default_mirror(ms), bio); | ||
754 | generic_make_request(bio); | ||
755 | } | ||
677 | } | 756 | } |
678 | } | 757 | } |
679 | 758 | ||
@@ -681,20 +760,12 @@ static void do_failures(struct mirror_set *ms, struct bio_list *failures) | |||
681 | { | 760 | { |
682 | struct bio *bio; | 761 | struct bio *bio; |
683 | 762 | ||
684 | if (!failures->head) | 763 | if (likely(!failures->head)) |
685 | return; | ||
686 | |||
687 | if (!ms->log_failure) { | ||
688 | while ((bio = bio_list_pop(failures))) { | ||
689 | ms->in_sync = 0; | ||
690 | dm_rh_mark_nosync(ms->rh, bio, bio->bi_size, 0); | ||
691 | } | ||
692 | return; | 764 | return; |
693 | } | ||
694 | 765 | ||
695 | /* | 766 | /* |
696 | * If the log has failed, unattempted writes are being | 767 | * If the log has failed, unattempted writes are being |
697 | * put on the failures list. We can't issue those writes | 768 | * put on the holds list. We can't issue those writes |
698 | * until a log has been marked, so we must store them. | 769 | * until a log has been marked, so we must store them. |
699 | * | 770 | * |
700 | * If a 'noflush' suspend is in progress, we can requeue | 771 | * If a 'noflush' suspend is in progress, we can requeue |
@@ -709,23 +780,27 @@ static void do_failures(struct mirror_set *ms, struct bio_list *failures) | |||
709 | * for us to treat them the same and requeue them | 780 | * for us to treat them the same and requeue them |
710 | * as well. | 781 | * as well. |
711 | */ | 782 | */ |
712 | if (dm_noflush_suspending(ms->ti)) { | 783 | while ((bio = bio_list_pop(failures))) { |
713 | while ((bio = bio_list_pop(failures))) | 784 | if (!ms->log_failure) { |
714 | bio_endio(bio, DM_ENDIO_REQUEUE); | 785 | ms->in_sync = 0; |
715 | return; | 786 | dm_rh_mark_nosync(ms->rh, bio); |
716 | } | 787 | } |
717 | 788 | ||
718 | if (atomic_read(&ms->suspend)) { | 789 | /* |
719 | while ((bio = bio_list_pop(failures))) | 790 | * If all the legs are dead, fail the I/O. |
791 | * If we have been told to handle errors, hold the bio | ||
792 | * and wait for userspace to deal with the problem. | ||
793 | * Otherwise pretend that the I/O succeeded. (This would | ||
794 | * be wrong if the failed leg returned after reboot and | ||
795 | * got replicated back to the good legs.) | ||
796 | */ | ||
797 | if (!get_valid_mirror(ms)) | ||
720 | bio_endio(bio, -EIO); | 798 | bio_endio(bio, -EIO); |
721 | return; | 799 | else if (errors_handled(ms)) |
800 | hold_bio(ms, bio); | ||
801 | else | ||
802 | bio_endio(bio, 0); | ||
722 | } | 803 | } |
723 | |||
724 | spin_lock_irq(&ms->lock); | ||
725 | bio_list_merge(&ms->failures, failures); | ||
726 | spin_unlock_irq(&ms->lock); | ||
727 | |||
728 | delayed_wake(ms); | ||
729 | } | 804 | } |
730 | 805 | ||
731 | static void trigger_event(struct work_struct *work) | 806 | static void trigger_event(struct work_struct *work) |
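
The rewritten do_failures() applies a per-bio policy: mark the region out of sync while the log is healthy, then fail the bio if no leg is left, hold it for userspace if error handling was requested, and otherwise complete it as if it had succeeded (with the caveat spelled out in the comment above). A hypothetical stand-alone classifier for that final decision:

enum failure_action { ACTION_FAIL, ACTION_HOLD, ACTION_COMPLETE_OK };

static enum failure_action classify_failed_write(int have_valid_mirror,
                                                 int errors_handled)
{
    if (!have_valid_mirror)
        return ACTION_FAIL;        /* bio_endio(bio, -EIO)           */
    if (errors_handled)
        return ACTION_HOLD;        /* hold_bio(): wait for userspace */
    return ACTION_COMPLETE_OK;     /* bio_endio(bio, 0)              */
}
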
@@ -784,12 +859,17 @@ static struct mirror_set *alloc_context(unsigned int nr_mirrors, | |||
784 | } | 859 | } |
785 | 860 | ||
786 | spin_lock_init(&ms->lock); | 861 | spin_lock_init(&ms->lock); |
862 | bio_list_init(&ms->reads); | ||
863 | bio_list_init(&ms->writes); | ||
864 | bio_list_init(&ms->failures); | ||
865 | bio_list_init(&ms->holds); | ||
787 | 866 | ||
788 | ms->ti = ti; | 867 | ms->ti = ti; |
789 | ms->nr_mirrors = nr_mirrors; | 868 | ms->nr_mirrors = nr_mirrors; |
790 | ms->nr_regions = dm_sector_div_up(ti->len, region_size); | 869 | ms->nr_regions = dm_sector_div_up(ti->len, region_size); |
791 | ms->in_sync = 0; | 870 | ms->in_sync = 0; |
792 | ms->log_failure = 0; | 871 | ms->log_failure = 0; |
872 | ms->leg_failure = 0; | ||
793 | atomic_set(&ms->suspend, 0); | 873 | atomic_set(&ms->suspend, 0); |
794 | atomic_set(&ms->default_mirror, DEFAULT_MIRROR); | 874 | atomic_set(&ms->default_mirror, DEFAULT_MIRROR); |
795 | 875 | ||
@@ -847,8 +927,7 @@ static int get_mirror(struct mirror_set *ms, struct dm_target *ti, | |||
847 | return -EINVAL; | 927 | return -EINVAL; |
848 | } | 928 | } |
849 | 929 | ||
850 | if (dm_get_device(ti, argv[0], offset, ti->len, | 930 | if (dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), |
851 | dm_table_get_mode(ti->table), | ||
852 | &ms->mirror[mirror].dev)) { | 931 | &ms->mirror[mirror].dev)) { |
853 | ti->error = "Device lookup failure"; | 932 | ti->error = "Device lookup failure"; |
854 | return -ENXIO; | 933 | return -ENXIO; |
@@ -889,7 +968,8 @@ static struct dm_dirty_log *create_dirty_log(struct dm_target *ti, | |||
889 | return NULL; | 968 | return NULL; |
890 | } | 969 | } |
891 | 970 | ||
892 | dl = dm_dirty_log_create(argv[0], ti, param_count, argv + 2); | 971 | dl = dm_dirty_log_create(argv[0], ti, mirror_flush, param_count, |
972 | argv + 2); | ||
893 | if (!dl) { | 973 | if (!dl) { |
894 | ti->error = "Error creating mirror dirty log"; | 974 | ti->error = "Error creating mirror dirty log"; |
895 | return NULL; | 975 | return NULL; |
@@ -995,6 +1075,7 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
995 | 1075 | ||
996 | ti->private = ms; | 1076 | ti->private = ms; |
997 | ti->split_io = dm_rh_get_region_size(ms->rh); | 1077 | ti->split_io = dm_rh_get_region_size(ms->rh); |
1078 | ti->num_flush_requests = 1; | ||
998 | 1079 | ||
999 | ms->kmirrord_wq = create_singlethread_workqueue("kmirrord"); | 1080 | ms->kmirrord_wq = create_singlethread_workqueue("kmirrord"); |
1000 | if (!ms->kmirrord_wq) { | 1081 | if (!ms->kmirrord_wq) { |
@@ -1122,7 +1203,8 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio, | |||
1122 | * We need to dec pending if this was a write. | 1203 | * We need to dec pending if this was a write. |
1123 | */ | 1204 | */ |
1124 | if (rw == WRITE) { | 1205 | if (rw == WRITE) { |
1125 | dm_rh_dec(ms->rh, map_context->ll); | 1206 | if (likely(!bio_empty_barrier(bio))) |
1207 | dm_rh_dec(ms->rh, map_context->ll); | ||
1126 | return error; | 1208 | return error; |
1127 | } | 1209 | } |
1128 | 1210 | ||
@@ -1180,9 +1262,26 @@ static void mirror_presuspend(struct dm_target *ti) | |||
1180 | struct mirror_set *ms = (struct mirror_set *) ti->private; | 1262 | struct mirror_set *ms = (struct mirror_set *) ti->private; |
1181 | struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh); | 1263 | struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh); |
1182 | 1264 | ||
1265 | struct bio_list holds; | ||
1266 | struct bio *bio; | ||
1267 | |||
1183 | atomic_set(&ms->suspend, 1); | 1268 | atomic_set(&ms->suspend, 1); |
1184 | 1269 | ||
1185 | /* | 1270 | /* |
1271 | * Process the bios in the hold list so that recovery, which | ||
1272 | * waits on them, can start. After this, no bio can be | ||
1273 | * added to the hold list because ms->suspend is already | ||
1274 | * set. | ||
1275 | */ | ||
1276 | spin_lock_irq(&ms->lock); | ||
1277 | holds = ms->holds; | ||
1278 | bio_list_init(&ms->holds); | ||
1279 | spin_unlock_irq(&ms->lock); | ||
1280 | |||
1281 | while ((bio = bio_list_pop(&holds))) | ||
1282 | hold_bio(ms, bio); | ||
1283 | |||
1284 | /* | ||
1186 | * We must finish up all the work that we've | 1285 | * We must finish up all the work that we've |
1187 | * generated (i.e. recovery work). | 1286 | * generated (i.e. recovery work). |
1188 | */ | 1287 | */ |
@@ -1244,7 +1343,8 @@ static char device_status_char(struct mirror *m) | |||
1244 | if (!atomic_read(&(m->error_count))) | 1343 | if (!atomic_read(&(m->error_count))) |
1245 | return 'A'; | 1344 | return 'A'; |
1246 | 1345 | ||
1247 | return (test_bit(DM_RAID1_WRITE_ERROR, &(m->error_type))) ? 'D' : | 1346 | return (test_bit(DM_RAID1_FLUSH_ERROR, &(m->error_type))) ? 'F' : |
1347 | (test_bit(DM_RAID1_WRITE_ERROR, &(m->error_type))) ? 'D' : | ||
1248 | (test_bit(DM_RAID1_SYNC_ERROR, &(m->error_type))) ? 'S' : | 1348 | (test_bit(DM_RAID1_SYNC_ERROR, &(m->error_type))) ? 'S' : |
1249 | (test_bit(DM_RAID1_READ_ERROR, &(m->error_type))) ? 'R' : 'U'; | 1349 | (test_bit(DM_RAID1_READ_ERROR, &(m->error_type))) ? 'R' : 'U'; |
1250 | } | 1350 | } |
diff --git a/drivers/md/dm-region-hash.c b/drivers/md/dm-region-hash.c index 36dbe29f2fd6..bd5c58b28868 100644 --- a/drivers/md/dm-region-hash.c +++ b/drivers/md/dm-region-hash.c | |||
@@ -11,6 +11,7 @@ | |||
11 | #include <linux/ctype.h> | 11 | #include <linux/ctype.h> |
12 | #include <linux/init.h> | 12 | #include <linux/init.h> |
13 | #include <linux/module.h> | 13 | #include <linux/module.h> |
14 | #include <linux/slab.h> | ||
14 | #include <linux/vmalloc.h> | 15 | #include <linux/vmalloc.h> |
15 | 16 | ||
16 | #include "dm.h" | 17 | #include "dm.h" |
@@ -79,6 +80,11 @@ struct dm_region_hash { | |||
79 | struct list_head recovered_regions; | 80 | struct list_head recovered_regions; |
80 | struct list_head failed_recovered_regions; | 81 | struct list_head failed_recovered_regions; |
81 | 82 | ||
83 | /* | ||
84 | * If there was a barrier failure no regions can be marked clean. | ||
85 | */ | ||
86 | int barrier_failure; | ||
87 | |||
82 | void *context; | 88 | void *context; |
83 | sector_t target_begin; | 89 | sector_t target_begin; |
84 | 90 | ||
@@ -211,6 +217,7 @@ struct dm_region_hash *dm_region_hash_create( | |||
211 | INIT_LIST_HEAD(&rh->quiesced_regions); | 217 | INIT_LIST_HEAD(&rh->quiesced_regions); |
212 | INIT_LIST_HEAD(&rh->recovered_regions); | 218 | INIT_LIST_HEAD(&rh->recovered_regions); |
213 | INIT_LIST_HEAD(&rh->failed_recovered_regions); | 219 | INIT_LIST_HEAD(&rh->failed_recovered_regions); |
220 | rh->barrier_failure = 0; | ||
214 | 221 | ||
215 | rh->region_pool = mempool_create_kmalloc_pool(MIN_REGIONS, | 222 | rh->region_pool = mempool_create_kmalloc_pool(MIN_REGIONS, |
216 | sizeof(struct dm_region)); | 223 | sizeof(struct dm_region)); |
@@ -377,8 +384,6 @@ static void complete_resync_work(struct dm_region *reg, int success) | |||
377 | /* dm_rh_mark_nosync | 384 | /* dm_rh_mark_nosync |
378 | * @ms | 385 | * @ms |
379 | * @bio | 386 | * @bio |
380 | * @done | ||
381 | * @error | ||
382 | * | 387 | * |
383 | * The bio was written on some mirror(s) but failed on other mirror(s). | 388 | * The bio was written on some mirror(s) but failed on other mirror(s). |
384 | * We can successfully endio the bio but should avoid the region being | 389 | * We can successfully endio the bio but should avoid the region being |
@@ -386,8 +391,7 @@ static void complete_resync_work(struct dm_region *reg, int success) | |||
386 | * | 391 | * |
387 | * This function is _not_ safe in interrupt context! | 392 | * This function is _not_ safe in interrupt context! |
388 | */ | 393 | */ |
389 | void dm_rh_mark_nosync(struct dm_region_hash *rh, | 394 | void dm_rh_mark_nosync(struct dm_region_hash *rh, struct bio *bio) |
390 | struct bio *bio, unsigned done, int error) | ||
391 | { | 395 | { |
392 | unsigned long flags; | 396 | unsigned long flags; |
393 | struct dm_dirty_log *log = rh->log; | 397 | struct dm_dirty_log *log = rh->log; |
@@ -395,6 +399,11 @@ void dm_rh_mark_nosync(struct dm_region_hash *rh, | |||
395 | region_t region = dm_rh_bio_to_region(rh, bio); | 399 | region_t region = dm_rh_bio_to_region(rh, bio); |
396 | int recovering = 0; | 400 | int recovering = 0; |
397 | 401 | ||
402 | if (bio_empty_barrier(bio)) { | ||
403 | rh->barrier_failure = 1; | ||
404 | return; | ||
405 | } | ||
406 | |||
398 | /* We must inform the log that the sync count has changed. */ | 407 | /* We must inform the log that the sync count has changed. */ |
399 | log->type->set_region_sync(log, region, 0); | 408 | log->type->set_region_sync(log, region, 0); |
400 | 409 | ||
@@ -419,7 +428,6 @@ void dm_rh_mark_nosync(struct dm_region_hash *rh, | |||
419 | BUG_ON(!list_empty(®->list)); | 428 | BUG_ON(!list_empty(®->list)); |
420 | spin_unlock_irqrestore(&rh->region_lock, flags); | 429 | spin_unlock_irqrestore(&rh->region_lock, flags); |
421 | 430 | ||
422 | bio_endio(bio, error); | ||
423 | if (recovering) | 431 | if (recovering) |
424 | complete_resync_work(reg, 0); | 432 | complete_resync_work(reg, 0); |
425 | } | 433 | } |
@@ -515,8 +523,11 @@ void dm_rh_inc_pending(struct dm_region_hash *rh, struct bio_list *bios) | |||
515 | { | 523 | { |
516 | struct bio *bio; | 524 | struct bio *bio; |
517 | 525 | ||
518 | for (bio = bios->head; bio; bio = bio->bi_next) | 526 | for (bio = bios->head; bio; bio = bio->bi_next) { |
527 | if (bio_empty_barrier(bio)) | ||
528 | continue; | ||
519 | rh_inc(rh, dm_rh_bio_to_region(rh, bio)); | 529 | rh_inc(rh, dm_rh_bio_to_region(rh, bio)); |
530 | } | ||
520 | } | 531 | } |
521 | EXPORT_SYMBOL_GPL(dm_rh_inc_pending); | 532 | EXPORT_SYMBOL_GPL(dm_rh_inc_pending); |
522 | 533 | ||
@@ -544,7 +555,14 @@ void dm_rh_dec(struct dm_region_hash *rh, region_t region) | |||
544 | */ | 555 | */ |
545 | 556 | ||
546 | /* do nothing for DM_RH_NOSYNC */ | 557 | /* do nothing for DM_RH_NOSYNC */ |
547 | if (reg->state == DM_RH_RECOVERING) { | 558 | if (unlikely(rh->barrier_failure)) { |
559 | /* | ||
560 | * If a write barrier failed some time ago, we | ||
561 | * don't know whether or not this write made it | ||
562 | * to the disk, so we must resync the device. | ||
563 | */ | ||
564 | reg->state = DM_RH_NOSYNC; | ||
565 | } else if (reg->state == DM_RH_RECOVERING) { | ||
548 | list_add_tail(®->list, &rh->quiesced_regions); | 566 | list_add_tail(®->list, &rh->quiesced_regions); |
549 | } else if (reg->state == DM_RH_DIRTY) { | 567 | } else if (reg->state == DM_RH_DIRTY) { |
550 | reg->state = DM_RH_CLEAN; | 568 | reg->state = DM_RH_CLEAN; |
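
dm_rh_dec() now checks the region hash's barrier_failure flag before letting a region go clean: once any empty barrier has failed, a completed write may still be sitting in a volatile cache, so the region is forced to DM_RH_NOSYNC and will be resynchronised. The state decision, reduced to a hypothetical helper:

enum rh_state { RH_CLEAN, RH_DIRTY, RH_NOSYNC, RH_RECOVERING };

static enum rh_state state_after_last_pending_write(enum rh_state cur,
                                                    int barrier_failed)
{
    if (barrier_failed)
        return RH_NOSYNC;     /* force a resync of this region          */
    if (cur == RH_RECOVERING)
        return cur;           /* the real code queues it for quiescing  */
    return RH_CLEAN;          /* normal case: the region can be cleaned */
}
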
@@ -643,10 +661,9 @@ void dm_rh_recovery_end(struct dm_region *reg, int success) | |||
643 | spin_lock_irq(&rh->region_lock); | 661 | spin_lock_irq(&rh->region_lock); |
644 | if (success) | 662 | if (success) |
645 | list_add(®->list, ®->rh->recovered_regions); | 663 | list_add(®->list, ®->rh->recovered_regions); |
646 | else { | 664 | else |
647 | reg->state = DM_RH_NOSYNC; | ||
648 | list_add(®->list, ®->rh->failed_recovered_regions); | 665 | list_add(®->list, ®->rh->failed_recovered_regions); |
649 | } | 666 | |
650 | spin_unlock_irq(&rh->region_lock); | 667 | spin_unlock_irq(&rh->region_lock); |
651 | 668 | ||
652 | rh->wakeup_workers(rh->context); | 669 | rh->wakeup_workers(rh->context); |
diff --git a/drivers/md/dm-service-time.c b/drivers/md/dm-service-time.c index cfa668f46c40..9c6c2e47ad62 100644 --- a/drivers/md/dm-service-time.c +++ b/drivers/md/dm-service-time.c | |||
@@ -11,6 +11,8 @@ | |||
11 | #include "dm.h" | 11 | #include "dm.h" |
12 | #include "dm-path-selector.h" | 12 | #include "dm-path-selector.h" |
13 | 13 | ||
14 | #include <linux/slab.h> | ||
15 | |||
14 | #define DM_MSG_PREFIX "multipath service-time" | 16 | #define DM_MSG_PREFIX "multipath service-time" |
15 | #define ST_MIN_IO 1 | 17 | #define ST_MIN_IO 1 |
16 | #define ST_MAX_RELATIVE_THROUGHPUT 100 | 18 | #define ST_MAX_RELATIVE_THROUGHPUT 100 |
diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c index 0c746420c008..c097d8a4823d 100644 --- a/drivers/md/dm-snap-persistent.c +++ b/drivers/md/dm-snap-persistent.c | |||
@@ -55,6 +55,8 @@ | |||
55 | */ | 55 | */ |
56 | #define SNAPSHOT_DISK_VERSION 1 | 56 | #define SNAPSHOT_DISK_VERSION 1 |
57 | 57 | ||
58 | #define NUM_SNAPSHOT_HDR_CHUNKS 1 | ||
59 | |||
58 | struct disk_header { | 60 | struct disk_header { |
59 | uint32_t magic; | 61 | uint32_t magic; |
60 | 62 | ||
@@ -120,7 +122,22 @@ struct pstore { | |||
120 | 122 | ||
121 | /* | 123 | /* |
122 | * The next free chunk for an exception. | 124 | * The next free chunk for an exception. |
125 | * | ||
126 | * When creating exceptions, all the chunks here and above are | ||
127 | * free. It holds the next chunk to be allocated. On rare | ||
128 | * occasions (e.g. after a system crash) holes can be left in | ||
129 | * the exception store because chunks can be committed out of | ||
130 | * order. | ||
131 | * | ||
132 | * When merging exceptions, it does not necessarily mean all the | ||
133 | * chunks here and above are free. It holds the value it would | ||
134 | * have held if all chunks had been committed in order of | ||
135 | * allocation. Consequently the value may occasionally be | ||
136 | * slightly too low, but since it's only used for 'status' and | ||
137 | * it can never reach its minimum value too early this doesn't | ||
138 | * matter. | ||
123 | */ | 139 | */ |
140 | |||
124 | chunk_t next_free; | 141 | chunk_t next_free; |
125 | 142 | ||
126 | /* | 143 | /* |
@@ -214,7 +231,7 @@ static int chunk_io(struct pstore *ps, void *area, chunk_t chunk, int rw, | |||
214 | int metadata) | 231 | int metadata) |
215 | { | 232 | { |
216 | struct dm_io_region where = { | 233 | struct dm_io_region where = { |
217 | .bdev = ps->store->cow->bdev, | 234 | .bdev = dm_snap_cow(ps->store->snap)->bdev, |
218 | .sector = ps->store->chunk_size * chunk, | 235 | .sector = ps->store->chunk_size * chunk, |
219 | .count = ps->store->chunk_size, | 236 | .count = ps->store->chunk_size, |
220 | }; | 237 | }; |
@@ -237,7 +254,7 @@ static int chunk_io(struct pstore *ps, void *area, chunk_t chunk, int rw, | |||
237 | * Issue the synchronous I/O from a different thread | 254 | * Issue the synchronous I/O from a different thread |
238 | * to avoid generic_make_request recursion. | 255 | * to avoid generic_make_request recursion. |
239 | */ | 256 | */ |
240 | INIT_WORK(&req.work, do_metadata); | 257 | INIT_WORK_ON_STACK(&req.work, do_metadata); |
241 | queue_work(ps->metadata_wq, &req.work); | 258 | queue_work(ps->metadata_wq, &req.work); |
242 | flush_workqueue(ps->metadata_wq); | 259 | flush_workqueue(ps->metadata_wq); |
243 | 260 | ||
@@ -294,7 +311,8 @@ static int read_header(struct pstore *ps, int *new_snapshot) | |||
294 | */ | 311 | */ |
295 | if (!ps->store->chunk_size) { | 312 | if (!ps->store->chunk_size) { |
296 | ps->store->chunk_size = max(DM_CHUNK_SIZE_DEFAULT_SECTORS, | 313 | ps->store->chunk_size = max(DM_CHUNK_SIZE_DEFAULT_SECTORS, |
297 | bdev_logical_block_size(ps->store->cow->bdev) >> 9); | 314 | bdev_logical_block_size(dm_snap_cow(ps->store->snap)-> |
315 | bdev) >> 9); | ||
298 | ps->store->chunk_mask = ps->store->chunk_size - 1; | 316 | ps->store->chunk_mask = ps->store->chunk_size - 1; |
299 | ps->store->chunk_shift = ffs(ps->store->chunk_size) - 1; | 317 | ps->store->chunk_shift = ffs(ps->store->chunk_size) - 1; |
300 | chunk_size_supplied = 0; | 318 | chunk_size_supplied = 0; |
@@ -408,6 +426,15 @@ static void write_exception(struct pstore *ps, | |||
408 | e->new_chunk = cpu_to_le64(de->new_chunk); | 426 | e->new_chunk = cpu_to_le64(de->new_chunk); |
409 | } | 427 | } |
410 | 428 | ||
429 | static void clear_exception(struct pstore *ps, uint32_t index) | ||
430 | { | ||
431 | struct disk_exception *e = get_exception(ps, index); | ||
432 | |||
433 | /* clear it */ | ||
434 | e->old_chunk = 0; | ||
435 | e->new_chunk = 0; | ||
436 | } | ||
437 | |||
411 | /* | 438 | /* |
412 | * Registers the exceptions that are present in the current area. | 439 | * Registers the exceptions that are present in the current area. |
413 | * 'full' is filled in to indicate if the area has been | 440 | * 'full' is filled in to indicate if the area has been |
@@ -489,11 +516,23 @@ static struct pstore *get_info(struct dm_exception_store *store) | |||
489 | return (struct pstore *) store->context; | 516 | return (struct pstore *) store->context; |
490 | } | 517 | } |
491 | 518 | ||
492 | static void persistent_fraction_full(struct dm_exception_store *store, | 519 | static void persistent_usage(struct dm_exception_store *store, |
493 | sector_t *numerator, sector_t *denominator) | 520 | sector_t *total_sectors, |
521 | sector_t *sectors_allocated, | ||
522 | sector_t *metadata_sectors) | ||
494 | { | 523 | { |
495 | *numerator = get_info(store)->next_free * store->chunk_size; | 524 | struct pstore *ps = get_info(store); |
496 | *denominator = get_dev_size(store->cow->bdev); | 525 | |
526 | *sectors_allocated = ps->next_free * store->chunk_size; | ||
527 | *total_sectors = get_dev_size(dm_snap_cow(store->snap)->bdev); | ||
528 | |||
529 | /* | ||
530 | * First chunk is the fixed header. | ||
531 | * Then there are (ps->current_area + 1) metadata chunks, each one | ||
532 | * separated from the next by ps->exceptions_per_area data chunks. | ||
533 | */ | ||
534 | *metadata_sectors = (ps->current_area + 1 + NUM_SNAPSHOT_HDR_CHUNKS) * | ||
535 | store->chunk_size; | ||
497 | } | 536 | } |
498 | 537 | ||
499 | static void persistent_dtr(struct dm_exception_store *store) | 538 | static void persistent_dtr(struct dm_exception_store *store) |
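
persistent_usage() computes metadata_sectors straight from the on-disk layout described in the comment: one fixed header chunk plus (current_area + 1) metadata chunks, each chunk_size sectors long. A quick stand-alone check of that arithmetic with made-up example values:

#include <stdio.h>

int main(void)
{
    unsigned long long chunk_size   = 16; /* sectors per chunk (8 KiB), example */
    unsigned long long hdr_chunks   = 1;  /* NUM_SNAPSHOT_HDR_CHUNKS            */
    unsigned long long current_area = 3;  /* fourth metadata area in use        */

    /* (ps->current_area + 1 + NUM_SNAPSHOT_HDR_CHUNKS) * store->chunk_size */
    unsigned long long metadata_sectors =
        (current_area + 1 + hdr_chunks) * chunk_size;

    printf("metadata sectors: %llu\n", metadata_sectors); /* prints 80 */
    return 0;
}
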
@@ -552,44 +591,40 @@ static int persistent_read_metadata(struct dm_exception_store *store, | |||
552 | ps->current_area = 0; | 591 | ps->current_area = 0; |
553 | zero_memory_area(ps); | 592 | zero_memory_area(ps); |
554 | r = zero_disk_area(ps, 0); | 593 | r = zero_disk_area(ps, 0); |
555 | if (r) { | 594 | if (r) |
556 | DMWARN("zero_disk_area(0) failed"); | 595 | DMWARN("zero_disk_area(0) failed"); |
557 | return r; | 596 | return r; |
558 | } | 597 | } |
559 | } else { | 598 | /* |
560 | /* | 599 | * Sanity checks. |
561 | * Sanity checks. | 600 | */ |
562 | */ | 601 | if (ps->version != SNAPSHOT_DISK_VERSION) { |
563 | if (ps->version != SNAPSHOT_DISK_VERSION) { | 602 | DMWARN("unable to handle snapshot disk version %d", |
564 | DMWARN("unable to handle snapshot disk version %d", | 603 | ps->version); |
565 | ps->version); | 604 | return -EINVAL; |
566 | return -EINVAL; | 605 | } |
567 | } | ||
568 | 606 | ||
569 | /* | 607 | /* |
570 | * Metadata are valid, but snapshot is invalidated | 608 | * Metadata are valid, but snapshot is invalidated |
571 | */ | 609 | */ |
572 | if (!ps->valid) | 610 | if (!ps->valid) |
573 | return 1; | 611 | return 1; |
574 | 612 | ||
575 | /* | 613 | /* |
576 | * Read the metadata. | 614 | * Read the metadata. |
577 | */ | 615 | */ |
578 | r = read_exceptions(ps, callback, callback_context); | 616 | r = read_exceptions(ps, callback, callback_context); |
579 | if (r) | ||
580 | return r; | ||
581 | } | ||
582 | 617 | ||
583 | return 0; | 618 | return r; |
584 | } | 619 | } |
585 | 620 | ||
586 | static int persistent_prepare_exception(struct dm_exception_store *store, | 621 | static int persistent_prepare_exception(struct dm_exception_store *store, |
587 | struct dm_snap_exception *e) | 622 | struct dm_exception *e) |
588 | { | 623 | { |
589 | struct pstore *ps = get_info(store); | 624 | struct pstore *ps = get_info(store); |
590 | uint32_t stride; | 625 | uint32_t stride; |
591 | chunk_t next_free; | 626 | chunk_t next_free; |
592 | sector_t size = get_dev_size(store->cow->bdev); | 627 | sector_t size = get_dev_size(dm_snap_cow(store->snap)->bdev); |
593 | 628 | ||
594 | /* Is there enough room ? */ | 629 | /* Is there enough room ? */ |
595 | if (size < ((ps->next_free + 1) * store->chunk_size)) | 630 | if (size < ((ps->next_free + 1) * store->chunk_size)) |
@@ -611,7 +646,7 @@ static int persistent_prepare_exception(struct dm_exception_store *store, | |||
611 | } | 646 | } |
612 | 647 | ||
613 | static void persistent_commit_exception(struct dm_exception_store *store, | 648 | static void persistent_commit_exception(struct dm_exception_store *store, |
614 | struct dm_snap_exception *e, | 649 | struct dm_exception *e, |
615 | void (*callback) (void *, int success), | 650 | void (*callback) (void *, int success), |
616 | void *callback_context) | 651 | void *callback_context) |
617 | { | 652 | { |
@@ -672,6 +707,85 @@ static void persistent_commit_exception(struct dm_exception_store *store, | |||
672 | ps->callback_count = 0; | 707 | ps->callback_count = 0; |
673 | } | 708 | } |
674 | 709 | ||
710 | static int persistent_prepare_merge(struct dm_exception_store *store, | ||
711 | chunk_t *last_old_chunk, | ||
712 | chunk_t *last_new_chunk) | ||
713 | { | ||
714 | struct pstore *ps = get_info(store); | ||
715 | struct disk_exception de; | ||
716 | int nr_consecutive; | ||
717 | int r; | ||
718 | |||
719 | /* | ||
720 | * When current area is empty, move back to preceding area. | ||
721 | */ | ||
722 | if (!ps->current_committed) { | ||
723 | /* | ||
724 | * Have we finished? | ||
725 | */ | ||
726 | if (!ps->current_area) | ||
727 | return 0; | ||
728 | |||
729 | ps->current_area--; | ||
730 | r = area_io(ps, READ); | ||
731 | if (r < 0) | ||
732 | return r; | ||
733 | ps->current_committed = ps->exceptions_per_area; | ||
734 | } | ||
735 | |||
736 | read_exception(ps, ps->current_committed - 1, &de); | ||
737 | *last_old_chunk = de.old_chunk; | ||
738 | *last_new_chunk = de.new_chunk; | ||
739 | |||
740 | /* | ||
741 | * Find number of consecutive chunks within the current area, | ||
742 | * working backwards. | ||
743 | */ | ||
744 | for (nr_consecutive = 1; nr_consecutive < ps->current_committed; | ||
745 | nr_consecutive++) { | ||
746 | read_exception(ps, ps->current_committed - 1 - nr_consecutive, | ||
747 | &de); | ||
748 | if (de.old_chunk != *last_old_chunk - nr_consecutive || | ||
749 | de.new_chunk != *last_new_chunk - nr_consecutive) | ||
750 | break; | ||
751 | } | ||
752 | |||
753 | return nr_consecutive; | ||
754 | } | ||
755 | |||
756 | static int persistent_commit_merge(struct dm_exception_store *store, | ||
757 | int nr_merged) | ||
758 | { | ||
759 | int r, i; | ||
760 | struct pstore *ps = get_info(store); | ||
761 | |||
762 | BUG_ON(nr_merged > ps->current_committed); | ||
763 | |||
764 | for (i = 0; i < nr_merged; i++) | ||
765 | clear_exception(ps, ps->current_committed - 1 - i); | ||
766 | |||
767 | r = area_io(ps, WRITE); | ||
768 | if (r < 0) | ||
769 | return r; | ||
770 | |||
771 | ps->current_committed -= nr_merged; | ||
772 | |||
773 | /* | ||
774 | * At this stage, only persistent_usage() uses ps->next_free, so | ||
775 | * we make no attempt to keep ps->next_free strictly accurate | ||
776 | * as exceptions may have been committed out-of-order originally. | ||
777 | * Once a snapshot has become merging, we set it to the value it | ||
778 | * would have held had all the exceptions been committed in order. | ||
779 | * | ||
780 | * ps->current_area does not get reduced by prepare_merge() until | ||
781 | * after commit_merge() has removed the nr_merged previous exceptions. | ||
782 | */ | ||
783 | ps->next_free = (area_location(ps, ps->current_area) - 1) + | ||
784 | (ps->current_committed + 1) + NUM_SNAPSHOT_HDR_CHUNKS; | ||
785 | |||
786 | return 0; | ||
787 | } | ||
788 | |||
675 | static void persistent_drop_snapshot(struct dm_exception_store *store) | 789 | static void persistent_drop_snapshot(struct dm_exception_store *store) |
676 | { | 790 | { |
677 | struct pstore *ps = get_info(store); | 791 | struct pstore *ps = get_info(store); |
@@ -697,7 +811,7 @@ static int persistent_ctr(struct dm_exception_store *store, | |||
697 | ps->area = NULL; | 811 | ps->area = NULL; |
698 | ps->zero_area = NULL; | 812 | ps->zero_area = NULL; |
699 | ps->header_area = NULL; | 813 | ps->header_area = NULL; |
700 | ps->next_free = 2; /* skipping the header and first area */ | 814 | ps->next_free = NUM_SNAPSHOT_HDR_CHUNKS + 1; /* header and 1st area */ |
701 | ps->current_committed = 0; | 815 | ps->current_committed = 0; |
702 | 816 | ||
703 | ps->callback_count = 0; | 817 | ps->callback_count = 0; |
@@ -726,8 +840,7 @@ static unsigned persistent_status(struct dm_exception_store *store, | |||
726 | case STATUSTYPE_INFO: | 840 | case STATUSTYPE_INFO: |
727 | break; | 841 | break; |
728 | case STATUSTYPE_TABLE: | 842 | case STATUSTYPE_TABLE: |
729 | DMEMIT(" %s P %llu", store->cow->name, | 843 | DMEMIT(" P %llu", (unsigned long long)store->chunk_size); |
730 | (unsigned long long)store->chunk_size); | ||
731 | } | 844 | } |
732 | 845 | ||
733 | return sz; | 846 | return sz; |
@@ -741,8 +854,10 @@ static struct dm_exception_store_type _persistent_type = { | |||
741 | .read_metadata = persistent_read_metadata, | 854 | .read_metadata = persistent_read_metadata, |
742 | .prepare_exception = persistent_prepare_exception, | 855 | .prepare_exception = persistent_prepare_exception, |
743 | .commit_exception = persistent_commit_exception, | 856 | .commit_exception = persistent_commit_exception, |
857 | .prepare_merge = persistent_prepare_merge, | ||
858 | .commit_merge = persistent_commit_merge, | ||
744 | .drop_snapshot = persistent_drop_snapshot, | 859 | .drop_snapshot = persistent_drop_snapshot, |
745 | .fraction_full = persistent_fraction_full, | 860 | .usage = persistent_usage, |
746 | .status = persistent_status, | 861 | .status = persistent_status, |
747 | }; | 862 | }; |
748 | 863 | ||
@@ -754,8 +869,10 @@ static struct dm_exception_store_type _persistent_compat_type = { | |||
754 | .read_metadata = persistent_read_metadata, | 869 | .read_metadata = persistent_read_metadata, |
755 | .prepare_exception = persistent_prepare_exception, | 870 | .prepare_exception = persistent_prepare_exception, |
756 | .commit_exception = persistent_commit_exception, | 871 | .commit_exception = persistent_commit_exception, |
872 | .prepare_merge = persistent_prepare_merge, | ||
873 | .commit_merge = persistent_commit_merge, | ||
757 | .drop_snapshot = persistent_drop_snapshot, | 874 | .drop_snapshot = persistent_drop_snapshot, |
758 | .fraction_full = persistent_fraction_full, | 875 | .usage = persistent_usage, |
759 | .status = persistent_status, | 876 | .status = persistent_status, |
760 | }; | 877 | }; |
761 | 878 | ||
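Note on the two new callbacks above: persistent_prepare_merge() reads committed exceptions backwards and batches a run in which both the old and the new chunk numbers fall by exactly one per step; persistent_commit_merge() then clears that many entries and recomputes ps->next_free from the area location. The following is a minimal, self-contained userspace sketch of just the backward run detection (hypothetical types, illustration only, not the kernel code):

    #include <stdio.h>
    #include <stdint.h>

    /* Simplified stand-in for the on-disk exception record. */
    struct disk_exception {
        uint64_t old_chunk;
        uint64_t new_chunk;
    };

    /*
     * Count how many exceptions ending at index 'last' form a run in which
     * both chunk numbers decrease by exactly one per step, scanning backwards
     * (mirrors the loop in persistent_prepare_merge()).
     */
    static int consecutive_run(const struct disk_exception *e, int last)
    {
        int n;

        for (n = 1; n <= last; n++) {
            if (e[last - n].old_chunk != e[last].old_chunk - n ||
                e[last - n].new_chunk != e[last].new_chunk - n)
                break;
        }
        return n;
    }

    int main(void)
    {
        struct disk_exception area[] = {
            { 7, 20 }, { 10, 31 }, { 11, 32 }, { 12, 33 },
        };

        /* Last committed exception is index 3; the run is {10,11,12}. */
        printf("mergeable run length: %d\n", consecutive_run(area, 3));
        return 0;
    }

The larger the detected run, the more chunks the merge path can copy back to the origin in one I/O before committing.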
diff --git a/drivers/md/dm-snap-transient.c b/drivers/md/dm-snap-transient.c index cde5aa558e6d..a0898a66a2f8 100644 --- a/drivers/md/dm-snap-transient.c +++ b/drivers/md/dm-snap-transient.c | |||
@@ -36,10 +36,10 @@ static int transient_read_metadata(struct dm_exception_store *store, | |||
36 | } | 36 | } |
37 | 37 | ||
38 | static int transient_prepare_exception(struct dm_exception_store *store, | 38 | static int transient_prepare_exception(struct dm_exception_store *store, |
39 | struct dm_snap_exception *e) | 39 | struct dm_exception *e) |
40 | { | 40 | { |
41 | struct transient_c *tc = store->context; | 41 | struct transient_c *tc = store->context; |
42 | sector_t size = get_dev_size(store->cow->bdev); | 42 | sector_t size = get_dev_size(dm_snap_cow(store->snap)->bdev); |
43 | 43 | ||
44 | if (size < (tc->next_free + store->chunk_size)) | 44 | if (size < (tc->next_free + store->chunk_size)) |
45 | return -1; | 45 | return -1; |
@@ -51,7 +51,7 @@ static int transient_prepare_exception(struct dm_exception_store *store, | |||
51 | } | 51 | } |
52 | 52 | ||
53 | static void transient_commit_exception(struct dm_exception_store *store, | 53 | static void transient_commit_exception(struct dm_exception_store *store, |
54 | struct dm_snap_exception *e, | 54 | struct dm_exception *e, |
55 | void (*callback) (void *, int success), | 55 | void (*callback) (void *, int success), |
56 | void *callback_context) | 56 | void *callback_context) |
57 | { | 57 | { |
@@ -59,11 +59,14 @@ static void transient_commit_exception(struct dm_exception_store *store, | |||
59 | callback(callback_context, 1); | 59 | callback(callback_context, 1); |
60 | } | 60 | } |
61 | 61 | ||
62 | static void transient_fraction_full(struct dm_exception_store *store, | 62 | static void transient_usage(struct dm_exception_store *store, |
63 | sector_t *numerator, sector_t *denominator) | 63 | sector_t *total_sectors, |
64 | sector_t *sectors_allocated, | ||
65 | sector_t *metadata_sectors) | ||
64 | { | 66 | { |
65 | *numerator = ((struct transient_c *) store->context)->next_free; | 67 | *sectors_allocated = ((struct transient_c *) store->context)->next_free; |
66 | *denominator = get_dev_size(store->cow->bdev); | 68 | *total_sectors = get_dev_size(dm_snap_cow(store->snap)->bdev); |
69 | *metadata_sectors = 0; | ||
67 | } | 70 | } |
68 | 71 | ||
69 | static int transient_ctr(struct dm_exception_store *store, | 72 | static int transient_ctr(struct dm_exception_store *store, |
@@ -91,8 +94,7 @@ static unsigned transient_status(struct dm_exception_store *store, | |||
91 | case STATUSTYPE_INFO: | 94 | case STATUSTYPE_INFO: |
92 | break; | 95 | break; |
93 | case STATUSTYPE_TABLE: | 96 | case STATUSTYPE_TABLE: |
94 | DMEMIT(" %s N %llu", store->cow->name, | 97 | DMEMIT(" N %llu", (unsigned long long)store->chunk_size); |
95 | (unsigned long long)store->chunk_size); | ||
96 | } | 98 | } |
97 | 99 | ||
98 | return sz; | 100 | return sz; |
@@ -106,7 +108,7 @@ static struct dm_exception_store_type _transient_type = { | |||
106 | .read_metadata = transient_read_metadata, | 108 | .read_metadata = transient_read_metadata, |
107 | .prepare_exception = transient_prepare_exception, | 109 | .prepare_exception = transient_prepare_exception, |
108 | .commit_exception = transient_commit_exception, | 110 | .commit_exception = transient_commit_exception, |
109 | .fraction_full = transient_fraction_full, | 111 | .usage = transient_usage, |
110 | .status = transient_status, | 112 | .status = transient_status, |
111 | }; | 113 | }; |
112 | 114 | ||
@@ -118,7 +120,7 @@ static struct dm_exception_store_type _transient_compat_type = { | |||
118 | .read_metadata = transient_read_metadata, | 120 | .read_metadata = transient_read_metadata, |
119 | .prepare_exception = transient_prepare_exception, | 121 | .prepare_exception = transient_prepare_exception, |
120 | .commit_exception = transient_commit_exception, | 122 | .commit_exception = transient_commit_exception, |
121 | .fraction_full = transient_fraction_full, | 123 | .usage = transient_usage, |
122 | .status = transient_status, | 124 | .status = transient_status, |
123 | }; | 125 | }; |
124 | 126 | ||
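Both exception store types now expose usage() in place of fraction_full(), reporting three sector counts (total, allocated, metadata) rather than a bare numerator/denominator. A small hedged sketch of how a caller might fold those three values into an allocated/total status figure (plain C, invented names, not the kernel's status code):

    #include <stdio.h>
    #include <stdint.h>

    typedef uint64_t sector_t;

    /* Shape of the data returned by a ->usage()-style callback. */
    struct store_usage {
        sector_t total_sectors;
        sector_t sectors_allocated;
        sector_t metadata_sectors;
    };

    static void print_status(const struct store_usage *u)
    {
        /* allocated/total metadata -- one way to summarise the three values */
        printf("%llu/%llu %llu\n",
               (unsigned long long)u->sectors_allocated,
               (unsigned long long)u->total_sectors,
               (unsigned long long)u->metadata_sectors);
    }

    int main(void)
    {
        struct store_usage u = {
            .total_sectors = 204800,      /* e.g. a 100 MiB COW device */
            .sectors_allocated = 40960,   /* data plus metadata in use */
            .metadata_sectors = 24,
        };

        print_status(&u);
        return 0;
    }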
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index 3a3ba46e6d4b..54853773510c 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c | |||
@@ -25,6 +25,11 @@ | |||
25 | 25 | ||
26 | #define DM_MSG_PREFIX "snapshots" | 26 | #define DM_MSG_PREFIX "snapshots" |
27 | 27 | ||
28 | static const char dm_snapshot_merge_target_name[] = "snapshot-merge"; | ||
29 | |||
30 | #define dm_target_is_snapshot_merge(ti) \ | ||
31 | ((ti)->type->name == dm_snapshot_merge_target_name) | ||
32 | |||
28 | /* | 33 | /* |
29 | * The percentage increment we will wake up users at | 34 | * The percentage increment we will wake up users at |
30 | */ | 35 | */ |
@@ -49,7 +54,7 @@ | |||
49 | #define DM_TRACKED_CHUNK_HASH(x) ((unsigned long)(x) & \ | 54 | #define DM_TRACKED_CHUNK_HASH(x) ((unsigned long)(x) & \ |
50 | (DM_TRACKED_CHUNK_HASH_SIZE - 1)) | 55 | (DM_TRACKED_CHUNK_HASH_SIZE - 1)) |
51 | 56 | ||
52 | struct exception_table { | 57 | struct dm_exception_table { |
53 | uint32_t hash_mask; | 58 | uint32_t hash_mask; |
54 | unsigned hash_shift; | 59 | unsigned hash_shift; |
55 | struct list_head *table; | 60 | struct list_head *table; |
@@ -59,22 +64,31 @@ struct dm_snapshot { | |||
59 | struct rw_semaphore lock; | 64 | struct rw_semaphore lock; |
60 | 65 | ||
61 | struct dm_dev *origin; | 66 | struct dm_dev *origin; |
67 | struct dm_dev *cow; | ||
68 | |||
69 | struct dm_target *ti; | ||
62 | 70 | ||
63 | /* List of snapshots per Origin */ | 71 | /* List of snapshots per Origin */ |
64 | struct list_head list; | 72 | struct list_head list; |
65 | 73 | ||
66 | /* You can't use a snapshot if this is 0 (e.g. if full) */ | 74 | /* |
75 | * You can't use a snapshot if this is 0 (e.g. if full). | ||
76 | * A snapshot-merge target never clears this. | ||
77 | */ | ||
67 | int valid; | 78 | int valid; |
68 | 79 | ||
69 | /* Origin writes don't trigger exceptions until this is set */ | 80 | /* Origin writes don't trigger exceptions until this is set */ |
70 | int active; | 81 | int active; |
71 | 82 | ||
72 | mempool_t *pending_pool; | 83 | /* Whether or not owning mapped_device is suspended */ |
84 | int suspended; | ||
73 | 85 | ||
74 | atomic_t pending_exceptions_count; | 86 | atomic_t pending_exceptions_count; |
75 | 87 | ||
76 | struct exception_table pending; | 88 | mempool_t *pending_pool; |
77 | struct exception_table complete; | 89 | |
90 | struct dm_exception_table pending; | ||
91 | struct dm_exception_table complete; | ||
78 | 92 | ||
79 | /* | 93 | /* |
80 | * pe_lock protects all pending_exception operations and access | 94 | * pe_lock protects all pending_exception operations and access |
@@ -82,6 +96,11 @@ struct dm_snapshot { | |||
82 | */ | 96 | */ |
83 | spinlock_t pe_lock; | 97 | spinlock_t pe_lock; |
84 | 98 | ||
99 | /* Chunks with outstanding reads */ | ||
100 | spinlock_t tracked_chunk_lock; | ||
101 | mempool_t *tracked_chunk_pool; | ||
102 | struct hlist_head tracked_chunk_hash[DM_TRACKED_CHUNK_HASH_SIZE]; | ||
103 | |||
85 | /* The on disk metadata handler */ | 104 | /* The on disk metadata handler */ |
86 | struct dm_exception_store *store; | 105 | struct dm_exception_store *store; |
87 | 106 | ||
@@ -91,12 +110,50 @@ struct dm_snapshot { | |||
91 | struct bio_list queued_bios; | 110 | struct bio_list queued_bios; |
92 | struct work_struct queued_bios_work; | 111 | struct work_struct queued_bios_work; |
93 | 112 | ||
94 | /* Chunks with outstanding reads */ | 113 | /* Wait for events based on state_bits */ |
95 | mempool_t *tracked_chunk_pool; | 114 | unsigned long state_bits; |
96 | spinlock_t tracked_chunk_lock; | 115 | |
97 | struct hlist_head tracked_chunk_hash[DM_TRACKED_CHUNK_HASH_SIZE]; | 116 | /* Range of chunks currently being merged. */ |
117 | chunk_t first_merging_chunk; | ||
118 | int num_merging_chunks; | ||
119 | |||
120 | /* | ||
121 | * The merge operation failed if this flag is set. | ||
122 | * Failure modes are handled as follows: | ||
123 | * - I/O error reading the header | ||
124 | * => don't load the target; abort. | ||
125 | * - Header does not have "valid" flag set | ||
126 | * => use the origin; forget about the snapshot. | ||
127 | * - I/O error when reading exceptions | ||
128 | * => don't load the target; abort. | ||
129 | * (We can't use the intermediate origin state.) | ||
130 | * - I/O error while merging | ||
131 | * => stop merging; set merge_failed; process I/O normally. | ||
132 | */ | ||
133 | int merge_failed; | ||
134 | |||
135 | /* | ||
136 | * Incoming bios that overlap with chunks being merged must wait | ||
137 | * for them to be committed. | ||
138 | */ | ||
139 | struct bio_list bios_queued_during_merge; | ||
98 | }; | 140 | }; |
99 | 141 | ||
142 | /* | ||
143 | * state_bits: | ||
144 | * RUNNING_MERGE - Merge operation is in progress. | ||
145 | * SHUTDOWN_MERGE - Set to signal that merge needs to be stopped; | ||
146 | * cleared afterwards. | ||
147 | */ | ||
148 | #define RUNNING_MERGE 0 | ||
149 | #define SHUTDOWN_MERGE 1 | ||
150 | |||
151 | struct dm_dev *dm_snap_cow(struct dm_snapshot *s) | ||
152 | { | ||
153 | return s->cow; | ||
154 | } | ||
155 | EXPORT_SYMBOL(dm_snap_cow); | ||
156 | |||
100 | static struct workqueue_struct *ksnapd; | 157 | static struct workqueue_struct *ksnapd; |
101 | static void flush_queued_bios(struct work_struct *work); | 158 | static void flush_queued_bios(struct work_struct *work); |
102 | 159 | ||
@@ -116,7 +173,7 @@ static int bdev_equal(struct block_device *lhs, struct block_device *rhs) | |||
116 | } | 173 | } |
117 | 174 | ||
118 | struct dm_snap_pending_exception { | 175 | struct dm_snap_pending_exception { |
119 | struct dm_snap_exception e; | 176 | struct dm_exception e; |
120 | 177 | ||
121 | /* | 178 | /* |
122 | * Origin buffers waiting for this to complete are held | 179 | * Origin buffers waiting for this to complete are held |
@@ -125,28 +182,6 @@ struct dm_snap_pending_exception { | |||
125 | struct bio_list origin_bios; | 182 | struct bio_list origin_bios; |
126 | struct bio_list snapshot_bios; | 183 | struct bio_list snapshot_bios; |
127 | 184 | ||
128 | /* | ||
129 | * Short-term queue of pending exceptions prior to submission. | ||
130 | */ | ||
131 | struct list_head list; | ||
132 | |||
133 | /* | ||
134 | * The primary pending_exception is the one that holds | ||
135 | * the ref_count and the list of origin_bios for a | ||
136 | * group of pending_exceptions. It is always last to get freed. | ||
137 | * These fields get set up when writing to the origin. | ||
138 | */ | ||
139 | struct dm_snap_pending_exception *primary_pe; | ||
140 | |||
141 | /* | ||
142 | * Number of pending_exceptions processing this chunk. | ||
143 | * When this drops to zero we must complete the origin bios. | ||
144 | * If incrementing or decrementing this, hold pe->snap->lock for | ||
145 | * the sibling concerned and not pe->primary_pe->snap->lock unless | ||
146 | * they are the same. | ||
147 | */ | ||
148 | atomic_t ref_count; | ||
149 | |||
150 | /* Pointer back to snapshot context */ | 185 | /* Pointer back to snapshot context */ |
151 | struct dm_snapshot *snap; | 186 | struct dm_snapshot *snap; |
152 | 187 | ||
@@ -222,6 +257,16 @@ static int __chunk_is_tracked(struct dm_snapshot *s, chunk_t chunk) | |||
222 | } | 257 | } |
223 | 258 | ||
224 | /* | 259 | /* |
260 | * This conflicting I/O is extremely improbable in the caller, | ||
261 | * so msleep(1) is sufficient and there is no need for a wait queue. | ||
262 | */ | ||
263 | static void __check_for_conflicting_io(struct dm_snapshot *s, chunk_t chunk) | ||
264 | { | ||
265 | while (__chunk_is_tracked(s, chunk)) | ||
266 | msleep(1); | ||
267 | } | ||
268 | |||
269 | /* | ||
225 | * One of these per registered origin, held in the snapshot_origins hash | 270 | * One of these per registered origin, held in the snapshot_origins hash |
226 | */ | 271 | */ |
227 | struct origin { | 272 | struct origin { |
@@ -243,6 +288,10 @@ struct origin { | |||
243 | static struct list_head *_origins; | 288 | static struct list_head *_origins; |
244 | static struct rw_semaphore _origins_lock; | 289 | static struct rw_semaphore _origins_lock; |
245 | 290 | ||
291 | static DECLARE_WAIT_QUEUE_HEAD(_pending_exceptions_done); | ||
292 | static DEFINE_SPINLOCK(_pending_exceptions_done_spinlock); | ||
293 | static uint64_t _pending_exceptions_done_count; | ||
294 | |||
246 | static int init_origin_hash(void) | 295 | static int init_origin_hash(void) |
247 | { | 296 | { |
248 | int i; | 297 | int i; |
@@ -291,22 +340,144 @@ static void __insert_origin(struct origin *o) | |||
291 | } | 340 | } |
292 | 341 | ||
293 | /* | 342 | /* |
343 | * _origins_lock must be held when calling this function. | ||
344 | * Returns number of snapshots registered using the supplied cow device, plus: | ||
345 | * snap_src - a snapshot suitable for use as a source of exception handover | ||
346 | * snap_dest - a snapshot capable of receiving exception handover. | ||
347 | * snap_merge - an existing snapshot-merge target linked to the same origin. | ||
348 | * There can be at most one snapshot-merge target. The parameter is optional. | ||
349 | * | ||
350 | * Possible return values and states of snap_src and snap_dest. | ||
351 | * 0: NULL, NULL - first new snapshot | ||
352 | * 1: snap_src, NULL - normal snapshot | ||
353 | * 2: snap_src, snap_dest - waiting for handover | ||
354 | * 2: snap_src, NULL - handed over, waiting for old to be deleted | ||
355 | * 1: NULL, snap_dest - source got destroyed without handover | ||
356 | */ | ||
357 | static int __find_snapshots_sharing_cow(struct dm_snapshot *snap, | ||
358 | struct dm_snapshot **snap_src, | ||
359 | struct dm_snapshot **snap_dest, | ||
360 | struct dm_snapshot **snap_merge) | ||
361 | { | ||
362 | struct dm_snapshot *s; | ||
363 | struct origin *o; | ||
364 | int count = 0; | ||
365 | int active; | ||
366 | |||
367 | o = __lookup_origin(snap->origin->bdev); | ||
368 | if (!o) | ||
369 | goto out; | ||
370 | |||
371 | list_for_each_entry(s, &o->snapshots, list) { | ||
372 | if (dm_target_is_snapshot_merge(s->ti) && snap_merge) | ||
373 | *snap_merge = s; | ||
374 | if (!bdev_equal(s->cow->bdev, snap->cow->bdev)) | ||
375 | continue; | ||
376 | |||
377 | down_read(&s->lock); | ||
378 | active = s->active; | ||
379 | up_read(&s->lock); | ||
380 | |||
381 | if (active) { | ||
382 | if (snap_src) | ||
383 | *snap_src = s; | ||
384 | } else if (snap_dest) | ||
385 | *snap_dest = s; | ||
386 | |||
387 | count++; | ||
388 | } | ||
389 | |||
390 | out: | ||
391 | return count; | ||
392 | } | ||
393 | |||
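The block comment above lists the snap_src/snap_dest combinations that __find_snapshots_sharing_cow() can report. A simplified userspace model of that classification, with snapshots reduced to an active flag and a COW identifier (illustrative only):

    #include <stdio.h>

    struct snap {
        int active;     /* exceptions already loaded? */
        int cow_id;     /* stand-in for the COW block device */
    };

    /*
     * Return the number of snapshots sharing 'cow_id'; report an active one
     * as *src and an inactive one as *dest, mirroring the 0/1/2 states
     * documented for __find_snapshots_sharing_cow().
     */
    static int find_sharing_cow(struct snap *all, int n, int cow_id,
                                struct snap **src, struct snap **dest)
    {
        int i, count = 0;

        *src = *dest = NULL;
        for (i = 0; i < n; i++) {
            if (all[i].cow_id != cow_id)
                continue;
            if (all[i].active)
                *src = &all[i];
            else
                *dest = &all[i];
            count++;
        }
        return count;
    }

    int main(void)
    {
        struct snap snaps[] = {
            { .active = 1, .cow_id = 5 },   /* running snapshot */
            { .active = 0, .cow_id = 5 },   /* freshly loaded table, waiting */
        };
        struct snap *src, *dest;
        int count = find_sharing_cow(snaps, 2, 5, &src, &dest);

        printf("count=%d src=%s dest=%s\n", count,
               src ? "set" : "NULL", dest ? "set" : "NULL");
        /* count=2 with both set => an exception handover is pending */
        return 0;
    }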
394 | /* | ||
395 | * On success, returns 1 if this snapshot is a handover destination, | ||
396 | * otherwise returns 0. | ||
397 | */ | ||
398 | static int __validate_exception_handover(struct dm_snapshot *snap) | ||
399 | { | ||
400 | struct dm_snapshot *snap_src = NULL, *snap_dest = NULL; | ||
401 | struct dm_snapshot *snap_merge = NULL; | ||
402 | |||
403 | /* Does snapshot need exceptions handed over to it? */ | ||
404 | if ((__find_snapshots_sharing_cow(snap, &snap_src, &snap_dest, | ||
405 | &snap_merge) == 2) || | ||
406 | snap_dest) { | ||
407 | snap->ti->error = "Snapshot cow pairing for exception " | ||
408 | "table handover failed"; | ||
409 | return -EINVAL; | ||
410 | } | ||
411 | |||
412 | /* | ||
413 | * If no snap_src was found, snap cannot become a handover | ||
414 | * destination. | ||
415 | */ | ||
416 | if (!snap_src) | ||
417 | return 0; | ||
418 | |||
419 | /* | ||
420 | * Non-snapshot-merge handover? | ||
421 | */ | ||
422 | if (!dm_target_is_snapshot_merge(snap->ti)) | ||
423 | return 1; | ||
424 | |||
425 | /* | ||
426 | * Do not allow more than one merging snapshot. | ||
427 | */ | ||
428 | if (snap_merge) { | ||
429 | snap->ti->error = "A snapshot is already merging."; | ||
430 | return -EINVAL; | ||
431 | } | ||
432 | |||
433 | if (!snap_src->store->type->prepare_merge || | ||
434 | !snap_src->store->type->commit_merge) { | ||
435 | snap->ti->error = "Snapshot exception store does not " | ||
436 | "support snapshot-merge."; | ||
437 | return -EINVAL; | ||
438 | } | ||
439 | |||
440 | return 1; | ||
441 | } | ||
442 | |||
443 | static void __insert_snapshot(struct origin *o, struct dm_snapshot *s) | ||
444 | { | ||
445 | struct dm_snapshot *l; | ||
446 | |||
447 | /* Sort the list according to chunk size, largest-first smallest-last */ | ||
448 | list_for_each_entry(l, &o->snapshots, list) | ||
449 | if (l->store->chunk_size < s->store->chunk_size) | ||
450 | break; | ||
451 | list_add_tail(&s->list, &l->list); | ||
452 | } | ||
453 | |||
454 | /* | ||
294 | * Make a note of the snapshot and its origin so we can look it | 455 | * Make a note of the snapshot and its origin so we can look it |
295 | * up when the origin has a write on it. | 456 | * up when the origin has a write on it. |
457 | * | ||
458 | * Also validate snapshot exception store handovers. | ||
459 | * On success, returns 1 if this registration is a handover destination, | ||
460 | * otherwise returns 0. | ||
296 | */ | 461 | */ |
297 | static int register_snapshot(struct dm_snapshot *snap) | 462 | static int register_snapshot(struct dm_snapshot *snap) |
298 | { | 463 | { |
299 | struct dm_snapshot *l; | 464 | struct origin *o, *new_o = NULL; |
300 | struct origin *o, *new_o; | ||
301 | struct block_device *bdev = snap->origin->bdev; | 465 | struct block_device *bdev = snap->origin->bdev; |
466 | int r = 0; | ||
302 | 467 | ||
303 | new_o = kmalloc(sizeof(*new_o), GFP_KERNEL); | 468 | new_o = kmalloc(sizeof(*new_o), GFP_KERNEL); |
304 | if (!new_o) | 469 | if (!new_o) |
305 | return -ENOMEM; | 470 | return -ENOMEM; |
306 | 471 | ||
307 | down_write(&_origins_lock); | 472 | down_write(&_origins_lock); |
308 | o = __lookup_origin(bdev); | ||
309 | 473 | ||
474 | r = __validate_exception_handover(snap); | ||
475 | if (r < 0) { | ||
476 | kfree(new_o); | ||
477 | goto out; | ||
478 | } | ||
479 | |||
480 | o = __lookup_origin(bdev); | ||
310 | if (o) | 481 | if (o) |
311 | kfree(new_o); | 482 | kfree(new_o); |
312 | else { | 483 | else { |
@@ -320,14 +491,27 @@ static int register_snapshot(struct dm_snapshot *snap) | |||
320 | __insert_origin(o); | 491 | __insert_origin(o); |
321 | } | 492 | } |
322 | 493 | ||
323 | /* Sort the list according to chunk size, largest-first smallest-last */ | 494 | __insert_snapshot(o, snap); |
324 | list_for_each_entry(l, &o->snapshots, list) | 495 | |
325 | if (l->store->chunk_size < snap->store->chunk_size) | 496 | out: |
326 | break; | 497 | up_write(&_origins_lock); |
327 | list_add_tail(&snap->list, &l->list); | 498 | |
499 | return r; | ||
500 | } | ||
501 | |||
502 | /* | ||
503 | * Move snapshot to correct place in list according to chunk size. | ||
504 | */ | ||
505 | static void reregister_snapshot(struct dm_snapshot *s) | ||
506 | { | ||
507 | struct block_device *bdev = s->origin->bdev; | ||
508 | |||
509 | down_write(&_origins_lock); | ||
510 | |||
511 | list_del(&s->list); | ||
512 | __insert_snapshot(__lookup_origin(bdev), s); | ||
328 | 513 | ||
329 | up_write(&_origins_lock); | 514 | up_write(&_origins_lock); |
330 | return 0; | ||
331 | } | 515 | } |
332 | 516 | ||
333 | static void unregister_snapshot(struct dm_snapshot *s) | 517 | static void unregister_snapshot(struct dm_snapshot *s) |
@@ -338,7 +522,7 @@ static void unregister_snapshot(struct dm_snapshot *s) | |||
338 | o = __lookup_origin(s->origin->bdev); | 522 | o = __lookup_origin(s->origin->bdev); |
339 | 523 | ||
340 | list_del(&s->list); | 524 | list_del(&s->list); |
341 | if (list_empty(&o->snapshots)) { | 525 | if (o && list_empty(&o->snapshots)) { |
342 | list_del(&o->hash_list); | 526 | list_del(&o->hash_list); |
343 | kfree(o); | 527 | kfree(o); |
344 | } | 528 | } |
@@ -351,8 +535,8 @@ static void unregister_snapshot(struct dm_snapshot *s) | |||
351 | * The lowest hash_shift bits of the chunk number are ignored, allowing | 535 | * The lowest hash_shift bits of the chunk number are ignored, allowing |
352 | * some consecutive chunks to be grouped together. | 536 | * some consecutive chunks to be grouped together. |
353 | */ | 537 | */ |
354 | static int init_exception_table(struct exception_table *et, uint32_t size, | 538 | static int dm_exception_table_init(struct dm_exception_table *et, |
355 | unsigned hash_shift) | 539 | uint32_t size, unsigned hash_shift) |
356 | { | 540 | { |
357 | unsigned int i; | 541 | unsigned int i; |
358 | 542 | ||
@@ -368,10 +552,11 @@ static int init_exception_table(struct exception_table *et, uint32_t size, | |||
368 | return 0; | 552 | return 0; |
369 | } | 553 | } |
370 | 554 | ||
371 | static void exit_exception_table(struct exception_table *et, struct kmem_cache *mem) | 555 | static void dm_exception_table_exit(struct dm_exception_table *et, |
556 | struct kmem_cache *mem) | ||
372 | { | 557 | { |
373 | struct list_head *slot; | 558 | struct list_head *slot; |
374 | struct dm_snap_exception *ex, *next; | 559 | struct dm_exception *ex, *next; |
375 | int i, size; | 560 | int i, size; |
376 | 561 | ||
377 | size = et->hash_mask + 1; | 562 | size = et->hash_mask + 1; |
@@ -385,19 +570,12 @@ static void exit_exception_table(struct exception_table *et, struct kmem_cache * | |||
385 | vfree(et->table); | 570 | vfree(et->table); |
386 | } | 571 | } |
387 | 572 | ||
388 | static uint32_t exception_hash(struct exception_table *et, chunk_t chunk) | 573 | static uint32_t exception_hash(struct dm_exception_table *et, chunk_t chunk) |
389 | { | 574 | { |
390 | return (chunk >> et->hash_shift) & et->hash_mask; | 575 | return (chunk >> et->hash_shift) & et->hash_mask; |
391 | } | 576 | } |
392 | 577 | ||
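Because exception_hash() discards the lowest hash_shift bits before masking, every group of 2^hash_shift consecutive chunks hashes to the same bucket, so a consecutive-chunk exception is reachable from any chunk it covers. A quick standalone demonstration with made-up parameters:

    #include <stdio.h>
    #include <stdint.h>

    typedef uint64_t chunk_t;

    static uint32_t exception_hash(uint32_t hash_mask, unsigned hash_shift,
                                   chunk_t chunk)
    {
        return (chunk >> hash_shift) & hash_mask;
    }

    int main(void)
    {
        /* 256 buckets, grouping runs of 2^4 = 16 consecutive chunks. */
        uint32_t hash_mask = 255;
        unsigned hash_shift = 4;
        chunk_t c;

        for (c = 4096; c < 4100; c++)
            printf("chunk %llu -> bucket %u\n",
                   (unsigned long long)c,
                   exception_hash(hash_mask, hash_shift, c));
        /* chunks 4096..4111 all map to bucket 0 (4096 >> 4 = 256, & 255 = 0) */
        return 0;
    }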
393 | static void insert_exception(struct exception_table *eh, | 578 | static void dm_remove_exception(struct dm_exception *e) |
394 | struct dm_snap_exception *e) | ||
395 | { | ||
396 | struct list_head *l = &eh->table[exception_hash(eh, e->old_chunk)]; | ||
397 | list_add(&e->hash_list, l); | ||
398 | } | ||
399 | |||
400 | static void remove_exception(struct dm_snap_exception *e) | ||
401 | { | 579 | { |
402 | list_del(&e->hash_list); | 580 | list_del(&e->hash_list); |
403 | } | 581 | } |
@@ -406,11 +584,11 @@ static void remove_exception(struct dm_snap_exception *e) | |||
406 | * Return the exception data for a sector, or NULL if not | 584 | * Return the exception data for a sector, or NULL if not |
407 | * remapped. | 585 | * remapped. |
408 | */ | 586 | */ |
409 | static struct dm_snap_exception *lookup_exception(struct exception_table *et, | 587 | static struct dm_exception *dm_lookup_exception(struct dm_exception_table *et, |
410 | chunk_t chunk) | 588 | chunk_t chunk) |
411 | { | 589 | { |
412 | struct list_head *slot; | 590 | struct list_head *slot; |
413 | struct dm_snap_exception *e; | 591 | struct dm_exception *e; |
414 | 592 | ||
415 | slot = &et->table[exception_hash(et, chunk)]; | 593 | slot = &et->table[exception_hash(et, chunk)]; |
416 | list_for_each_entry (e, slot, hash_list) | 594 | list_for_each_entry (e, slot, hash_list) |
@@ -421,9 +599,9 @@ static struct dm_snap_exception *lookup_exception(struct exception_table *et, | |||
421 | return NULL; | 599 | return NULL; |
422 | } | 600 | } |
423 | 601 | ||
424 | static struct dm_snap_exception *alloc_exception(void) | 602 | static struct dm_exception *alloc_completed_exception(void) |
425 | { | 603 | { |
426 | struct dm_snap_exception *e; | 604 | struct dm_exception *e; |
427 | 605 | ||
428 | e = kmem_cache_alloc(exception_cache, GFP_NOIO); | 606 | e = kmem_cache_alloc(exception_cache, GFP_NOIO); |
429 | if (!e) | 607 | if (!e) |
@@ -432,7 +610,7 @@ static struct dm_snap_exception *alloc_exception(void) | |||
432 | return e; | 610 | return e; |
433 | } | 611 | } |
434 | 612 | ||
435 | static void free_exception(struct dm_snap_exception *e) | 613 | static void free_completed_exception(struct dm_exception *e) |
436 | { | 614 | { |
437 | kmem_cache_free(exception_cache, e); | 615 | kmem_cache_free(exception_cache, e); |
438 | } | 616 | } |
@@ -457,12 +635,11 @@ static void free_pending_exception(struct dm_snap_pending_exception *pe) | |||
457 | atomic_dec(&s->pending_exceptions_count); | 635 | atomic_dec(&s->pending_exceptions_count); |
458 | } | 636 | } |
459 | 637 | ||
460 | static void insert_completed_exception(struct dm_snapshot *s, | 638 | static void dm_insert_exception(struct dm_exception_table *eh, |
461 | struct dm_snap_exception *new_e) | 639 | struct dm_exception *new_e) |
462 | { | 640 | { |
463 | struct exception_table *eh = &s->complete; | ||
464 | struct list_head *l; | 641 | struct list_head *l; |
465 | struct dm_snap_exception *e = NULL; | 642 | struct dm_exception *e = NULL; |
466 | 643 | ||
467 | l = &eh->table[exception_hash(eh, new_e->old_chunk)]; | 644 | l = &eh->table[exception_hash(eh, new_e->old_chunk)]; |
468 | 645 | ||
@@ -478,7 +655,7 @@ static void insert_completed_exception(struct dm_snapshot *s, | |||
478 | new_e->new_chunk == (dm_chunk_number(e->new_chunk) + | 655 | new_e->new_chunk == (dm_chunk_number(e->new_chunk) + |
479 | dm_consecutive_chunk_count(e) + 1)) { | 656 | dm_consecutive_chunk_count(e) + 1)) { |
480 | dm_consecutive_chunk_count_inc(e); | 657 | dm_consecutive_chunk_count_inc(e); |
481 | free_exception(new_e); | 658 | free_completed_exception(new_e); |
482 | return; | 659 | return; |
483 | } | 660 | } |
484 | 661 | ||
@@ -488,7 +665,7 @@ static void insert_completed_exception(struct dm_snapshot *s, | |||
488 | dm_consecutive_chunk_count_inc(e); | 665 | dm_consecutive_chunk_count_inc(e); |
489 | e->old_chunk--; | 666 | e->old_chunk--; |
490 | e->new_chunk--; | 667 | e->new_chunk--; |
491 | free_exception(new_e); | 668 | free_completed_exception(new_e); |
492 | return; | 669 | return; |
493 | } | 670 | } |
494 | 671 | ||
@@ -507,9 +684,9 @@ out: | |||
507 | static int dm_add_exception(void *context, chunk_t old, chunk_t new) | 684 | static int dm_add_exception(void *context, chunk_t old, chunk_t new) |
508 | { | 685 | { |
509 | struct dm_snapshot *s = context; | 686 | struct dm_snapshot *s = context; |
510 | struct dm_snap_exception *e; | 687 | struct dm_exception *e; |
511 | 688 | ||
512 | e = alloc_exception(); | 689 | e = alloc_completed_exception(); |
513 | if (!e) | 690 | if (!e) |
514 | return -ENOMEM; | 691 | return -ENOMEM; |
515 | 692 | ||
@@ -518,11 +695,30 @@ static int dm_add_exception(void *context, chunk_t old, chunk_t new) | |||
518 | /* Consecutive_count is implicitly initialised to zero */ | 695 | /* Consecutive_count is implicitly initialised to zero */ |
519 | e->new_chunk = new; | 696 | e->new_chunk = new; |
520 | 697 | ||
521 | insert_completed_exception(s, e); | 698 | dm_insert_exception(&s->complete, e); |
522 | 699 | ||
523 | return 0; | 700 | return 0; |
524 | } | 701 | } |
525 | 702 | ||
703 | #define min_not_zero(l, r) (((l) == 0) ? (r) : (((r) == 0) ? (l) : min(l, r))) | ||
704 | |||
705 | /* | ||
706 | * Return a minimum chunk size of all snapshots that have the specified origin. | ||
707 | * Return zero if the origin has no snapshots. | ||
708 | */ | ||
709 | static sector_t __minimum_chunk_size(struct origin *o) | ||
710 | { | ||
711 | struct dm_snapshot *snap; | ||
712 | unsigned chunk_size = 0; | ||
713 | |||
714 | if (o) | ||
715 | list_for_each_entry(snap, &o->snapshots, list) | ||
716 | chunk_size = min_not_zero(chunk_size, | ||
717 | snap->store->chunk_size); | ||
718 | |||
719 | return chunk_size; | ||
720 | } | ||
721 | |||
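min_not_zero() treats zero as "unset" rather than as the smallest value; in __minimum_chunk_size() a zero chunk_size means a snapshot whose chunk size is not yet known, and it must not win the minimum. A tiny self-contained example of the macro's behaviour (min() spelled out locally, since this is not kernel code):

    #include <stdio.h>

    #define min(a, b) ((a) < (b) ? (a) : (b))
    #define min_not_zero(l, r) (((l) == 0) ? (r) : (((r) == 0) ? (l) : min(l, r)))

    int main(void)
    {
        unsigned chunk_size = 0;                /* "no snapshots seen yet" */
        unsigned sizes[] = { 16, 0, 8 };        /* 0 = chunk size not known yet */
        int i;

        for (i = 0; i < 3; i++)
            chunk_size = min_not_zero(chunk_size, sizes[i]);

        printf("minimum chunk size: %u\n", chunk_size);   /* prints 8, not 0 */
        return 0;
    }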
526 | /* | 722 | /* |
527 | * Hard coded magic. | 723 | * Hard coded magic. |
528 | */ | 724 | */ |
@@ -546,16 +742,18 @@ static int init_hash_tables(struct dm_snapshot *s) | |||
546 | * Calculate based on the size of the original volume or | 742 | * Calculate based on the size of the original volume or |
547 | * the COW volume... | 743 | * the COW volume... |
548 | */ | 744 | */ |
549 | cow_dev_size = get_dev_size(s->store->cow->bdev); | 745 | cow_dev_size = get_dev_size(s->cow->bdev); |
550 | origin_dev_size = get_dev_size(s->origin->bdev); | 746 | origin_dev_size = get_dev_size(s->origin->bdev); |
551 | max_buckets = calc_max_buckets(); | 747 | max_buckets = calc_max_buckets(); |
552 | 748 | ||
553 | hash_size = min(origin_dev_size, cow_dev_size) >> s->store->chunk_shift; | 749 | hash_size = min(origin_dev_size, cow_dev_size) >> s->store->chunk_shift; |
554 | hash_size = min(hash_size, max_buckets); | 750 | hash_size = min(hash_size, max_buckets); |
555 | 751 | ||
752 | if (hash_size < 64) | ||
753 | hash_size = 64; | ||
556 | hash_size = rounddown_pow_of_two(hash_size); | 754 | hash_size = rounddown_pow_of_two(hash_size); |
557 | if (init_exception_table(&s->complete, hash_size, | 755 | if (dm_exception_table_init(&s->complete, hash_size, |
558 | DM_CHUNK_CONSECUTIVE_BITS)) | 756 | DM_CHUNK_CONSECUTIVE_BITS)) |
559 | return -ENOMEM; | 757 | return -ENOMEM; |
560 | 758 | ||
561 | /* | 759 | /* |
@@ -566,14 +764,284 @@ static int init_hash_tables(struct dm_snapshot *s) | |||
566 | if (hash_size < 64) | 764 | if (hash_size < 64) |
567 | hash_size = 64; | 765 | hash_size = 64; |
568 | 766 | ||
569 | if (init_exception_table(&s->pending, hash_size, 0)) { | 767 | if (dm_exception_table_init(&s->pending, hash_size, 0)) { |
570 | exit_exception_table(&s->complete, exception_cache); | 768 | dm_exception_table_exit(&s->complete, exception_cache); |
571 | return -ENOMEM; | 769 | return -ENOMEM; |
572 | } | 770 | } |
573 | 771 | ||
574 | return 0; | 772 | return 0; |
575 | } | 773 | } |
576 | 774 | ||
775 | static void merge_shutdown(struct dm_snapshot *s) | ||
776 | { | ||
777 | clear_bit_unlock(RUNNING_MERGE, &s->state_bits); | ||
778 | smp_mb__after_clear_bit(); | ||
779 | wake_up_bit(&s->state_bits, RUNNING_MERGE); | ||
780 | } | ||
781 | |||
782 | static struct bio *__release_queued_bios_after_merge(struct dm_snapshot *s) | ||
783 | { | ||
784 | s->first_merging_chunk = 0; | ||
785 | s->num_merging_chunks = 0; | ||
786 | |||
787 | return bio_list_get(&s->bios_queued_during_merge); | ||
788 | } | ||
789 | |||
790 | /* | ||
791 | * Remove one chunk from the index of completed exceptions. | ||
792 | */ | ||
793 | static int __remove_single_exception_chunk(struct dm_snapshot *s, | ||
794 | chunk_t old_chunk) | ||
795 | { | ||
796 | struct dm_exception *e; | ||
797 | |||
798 | e = dm_lookup_exception(&s->complete, old_chunk); | ||
799 | if (!e) { | ||
800 | DMERR("Corruption detected: exception for block %llu is " | ||
801 | "on disk but not in memory", | ||
802 | (unsigned long long)old_chunk); | ||
803 | return -EINVAL; | ||
804 | } | ||
805 | |||
806 | /* | ||
807 | * If this is the only chunk using this exception, remove exception. | ||
808 | */ | ||
809 | if (!dm_consecutive_chunk_count(e)) { | ||
810 | dm_remove_exception(e); | ||
811 | free_completed_exception(e); | ||
812 | return 0; | ||
813 | } | ||
814 | |||
815 | /* | ||
816 | * The chunk may be either at the beginning or the end of a | ||
817 | * group of consecutive chunks - never in the middle. We are | ||
818 | * removing chunks in the opposite order to that in which they | ||
819 | * were added, so this should always be true. | ||
820 | * Decrement the consecutive chunk counter and adjust the | ||
821 | * starting point if necessary. | ||
822 | */ | ||
823 | if (old_chunk == e->old_chunk) { | ||
824 | e->old_chunk++; | ||
825 | e->new_chunk++; | ||
826 | } else if (old_chunk != e->old_chunk + | ||
827 | dm_consecutive_chunk_count(e)) { | ||
828 | DMERR("Attempt to merge block %llu from the " | ||
829 | "middle of a chunk range [%llu - %llu]", | ||
830 | (unsigned long long)old_chunk, | ||
831 | (unsigned long long)e->old_chunk, | ||
832 | (unsigned long long) | ||
833 | e->old_chunk + dm_consecutive_chunk_count(e)); | ||
834 | return -EINVAL; | ||
835 | } | ||
836 | |||
837 | dm_consecutive_chunk_count_dec(e); | ||
838 | |||
839 | return 0; | ||
840 | } | ||
841 | |||
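Completed exceptions are kept as runs: a base (old_chunk, new_chunk) pair plus a count of extra consecutive chunks (the kernel packs that count into spare bits of the chunk number), and __remove_single_exception_chunk() only ever trims such a run at its first or last chunk. A compact userspace model of that trimming rule (hypothetical struct, illustration only):

    #include <stdio.h>
    #include <stdint.h>

    typedef uint64_t chunk_t;

    /* A run of (count + 1) consecutive remapped chunks. */
    struct run {
        chunk_t old_chunk;   /* first origin chunk of the run */
        chunk_t new_chunk;   /* first COW chunk of the run */
        unsigned count;      /* number of extra consecutive chunks */
    };

    /* Trim one chunk off a run; only the first or last chunk may go. */
    static int remove_chunk(struct run *r, chunk_t old_chunk)
    {
        if (r->count == 0)
            return 1;                        /* single chunk left: caller frees the entry */

        if (old_chunk == r->old_chunk) {     /* trimming the front */
            r->old_chunk++;
            r->new_chunk++;
        } else if (old_chunk != r->old_chunk + r->count) {
            return -1;                       /* never split a run in the middle */
        }
        r->count--;                          /* one fewer extra chunk either way */
        return 0;
    }

    int main(void)
    {
        struct run r = { .old_chunk = 100, .new_chunk = 40, .count = 3 };  /* chunks 100..103 */

        if (remove_chunk(&r, 101) < 0)
            printf("middle removal rejected, as expected\n");
        remove_chunk(&r, 103);               /* back trim: run is now 100..102 */
        remove_chunk(&r, 100);               /* front trim: run is now 101..102 */
        printf("run: %llu..%llu\n",
               (unsigned long long)r.old_chunk,
               (unsigned long long)(r.old_chunk + r.count));
        return 0;
    }

Merging removes chunks in the reverse of the order they were added, which is why trimming at one end is always sufficient.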
842 | static void flush_bios(struct bio *bio); | ||
843 | |||
844 | static int remove_single_exception_chunk(struct dm_snapshot *s) | ||
845 | { | ||
846 | struct bio *b = NULL; | ||
847 | int r; | ||
848 | chunk_t old_chunk = s->first_merging_chunk + s->num_merging_chunks - 1; | ||
849 | |||
850 | down_write(&s->lock); | ||
851 | |||
852 | /* | ||
853 | * Process chunks (and associated exceptions) in reverse order | ||
854 | * so that dm_consecutive_chunk_count_dec() accounting works. | ||
855 | */ | ||
856 | do { | ||
857 | r = __remove_single_exception_chunk(s, old_chunk); | ||
858 | if (r) | ||
859 | goto out; | ||
860 | } while (old_chunk-- > s->first_merging_chunk); | ||
861 | |||
862 | b = __release_queued_bios_after_merge(s); | ||
863 | |||
864 | out: | ||
865 | up_write(&s->lock); | ||
866 | if (b) | ||
867 | flush_bios(b); | ||
868 | |||
869 | return r; | ||
870 | } | ||
871 | |||
872 | static int origin_write_extent(struct dm_snapshot *merging_snap, | ||
873 | sector_t sector, unsigned chunk_size); | ||
874 | |||
875 | static void merge_callback(int read_err, unsigned long write_err, | ||
876 | void *context); | ||
877 | |||
878 | static uint64_t read_pending_exceptions_done_count(void) | ||
879 | { | ||
880 | uint64_t pending_exceptions_done; | ||
881 | |||
882 | spin_lock(&_pending_exceptions_done_spinlock); | ||
883 | pending_exceptions_done = _pending_exceptions_done_count; | ||
884 | spin_unlock(&_pending_exceptions_done_spinlock); | ||
885 | |||
886 | return pending_exceptions_done; | ||
887 | } | ||
888 | |||
889 | static void increment_pending_exceptions_done_count(void) | ||
890 | { | ||
891 | spin_lock(&_pending_exceptions_done_spinlock); | ||
892 | _pending_exceptions_done_count++; | ||
893 | spin_unlock(&_pending_exceptions_done_spinlock); | ||
894 | |||
895 | wake_up_all(&_pending_exceptions_done); | ||
896 | } | ||
897 | |||
898 | static void snapshot_merge_next_chunks(struct dm_snapshot *s) | ||
899 | { | ||
900 | int i, linear_chunks; | ||
901 | chunk_t old_chunk, new_chunk; | ||
902 | struct dm_io_region src, dest; | ||
903 | sector_t io_size; | ||
904 | uint64_t previous_count; | ||
905 | |||
906 | BUG_ON(!test_bit(RUNNING_MERGE, &s->state_bits)); | ||
907 | if (unlikely(test_bit(SHUTDOWN_MERGE, &s->state_bits))) | ||
908 | goto shut; | ||
909 | |||
910 | /* | ||
911 | * valid flag never changes during merge, so no lock required. | ||
912 | */ | ||
913 | if (!s->valid) { | ||
914 | DMERR("Snapshot is invalid: can't merge"); | ||
915 | goto shut; | ||
916 | } | ||
917 | |||
918 | linear_chunks = s->store->type->prepare_merge(s->store, &old_chunk, | ||
919 | &new_chunk); | ||
920 | if (linear_chunks <= 0) { | ||
921 | if (linear_chunks < 0) { | ||
922 | DMERR("Read error in exception store: " | ||
923 | "shutting down merge"); | ||
924 | down_write(&s->lock); | ||
925 | s->merge_failed = 1; | ||
926 | up_write(&s->lock); | ||
927 | } | ||
928 | goto shut; | ||
929 | } | ||
930 | |||
931 | /* Adjust old_chunk and new_chunk to reflect start of linear region */ | ||
932 | old_chunk = old_chunk + 1 - linear_chunks; | ||
933 | new_chunk = new_chunk + 1 - linear_chunks; | ||
934 | |||
935 | /* | ||
936 | * Use one (potentially large) I/O to copy all 'linear_chunks' | ||
937 | * from the exception store to the origin | ||
938 | */ | ||
939 | io_size = linear_chunks * s->store->chunk_size; | ||
940 | |||
941 | dest.bdev = s->origin->bdev; | ||
942 | dest.sector = chunk_to_sector(s->store, old_chunk); | ||
943 | dest.count = min(io_size, get_dev_size(dest.bdev) - dest.sector); | ||
944 | |||
945 | src.bdev = s->cow->bdev; | ||
946 | src.sector = chunk_to_sector(s->store, new_chunk); | ||
947 | src.count = dest.count; | ||
948 | |||
949 | /* | ||
950 | * Reallocate any exceptions needed in other snapshots then | ||
951 | * wait for the pending exceptions to complete. | ||
952 | * Each time any pending exception (globally on the system) | ||
953 | * completes we are woken and repeat the process to find out | ||
954 | * if we can proceed. While this may not seem a particularly | ||
955 | * efficient algorithm, it is not expected to have any | ||
956 | * significant impact on performance. | ||
957 | */ | ||
958 | previous_count = read_pending_exceptions_done_count(); | ||
959 | while (origin_write_extent(s, dest.sector, io_size)) { | ||
960 | wait_event(_pending_exceptions_done, | ||
961 | (read_pending_exceptions_done_count() != | ||
962 | previous_count)); | ||
963 | /* Retry after the wait, until all exceptions are done. */ | ||
964 | previous_count = read_pending_exceptions_done_count(); | ||
965 | } | ||
966 | |||
967 | down_write(&s->lock); | ||
968 | s->first_merging_chunk = old_chunk; | ||
969 | s->num_merging_chunks = linear_chunks; | ||
970 | up_write(&s->lock); | ||
971 | |||
972 | /* Wait until writes to all 'linear_chunks' drain */ | ||
973 | for (i = 0; i < linear_chunks; i++) | ||
974 | __check_for_conflicting_io(s, old_chunk + i); | ||
975 | |||
976 | dm_kcopyd_copy(s->kcopyd_client, &src, 1, &dest, 0, merge_callback, s); | ||
977 | return; | ||
978 | |||
979 | shut: | ||
980 | merge_shutdown(s); | ||
981 | } | ||
982 | |||
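snapshot_merge_next_chunks() turns the run reported by prepare_merge() into a single kcopyd copy: the returned chunk numbers name the last chunk of the run, so both are wound back by linear_chunks - 1, converted to sectors, and the destination count is clamped at the end of the origin device. A short worked example of that arithmetic (plain C, made-up numbers, multiplication standing in for chunk_to_sector()):

    #include <stdio.h>
    #include <stdint.h>

    typedef uint64_t sector_t;
    typedef uint64_t chunk_t;

    int main(void)
    {
        unsigned chunk_size = 16;            /* sectors per chunk (8 KiB) */
        sector_t origin_size = 204800;       /* origin device, in sectors */

        /* prepare_merge() reported the LAST chunk of a 4-chunk run. */
        chunk_t old_chunk = 1003, new_chunk = 57;
        int linear_chunks = 4;

        /* Wind back to the start of the run, as the kernel loop does. */
        old_chunk = old_chunk + 1 - linear_chunks;       /* 1000 */
        new_chunk = new_chunk + 1 - linear_chunks;       /* 54 */

        sector_t io_size = (sector_t)linear_chunks * chunk_size;  /* 64 sectors */
        sector_t dest_sector = old_chunk * chunk_size;   /* origin offset */
        sector_t src_sector = new_chunk * chunk_size;    /* COW offset */
        sector_t count = io_size;
        if (count > origin_size - dest_sector)           /* clamp at device end */
            count = origin_size - dest_sector;

        printf("copy %llu sectors: cow[%llu] -> origin[%llu]\n",
               (unsigned long long)count,
               (unsigned long long)src_sector,
               (unsigned long long)dest_sector);
        return 0;
    }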
983 | static void error_bios(struct bio *bio); | ||
984 | |||
985 | static void merge_callback(int read_err, unsigned long write_err, void *context) | ||
986 | { | ||
987 | struct dm_snapshot *s = context; | ||
988 | struct bio *b = NULL; | ||
989 | |||
990 | if (read_err || write_err) { | ||
991 | if (read_err) | ||
992 | DMERR("Read error: shutting down merge."); | ||
993 | else | ||
994 | DMERR("Write error: shutting down merge."); | ||
995 | goto shut; | ||
996 | } | ||
997 | |||
998 | if (s->store->type->commit_merge(s->store, | ||
999 | s->num_merging_chunks) < 0) { | ||
1000 | DMERR("Write error in exception store: shutting down merge"); | ||
1001 | goto shut; | ||
1002 | } | ||
1003 | |||
1004 | if (remove_single_exception_chunk(s) < 0) | ||
1005 | goto shut; | ||
1006 | |||
1007 | snapshot_merge_next_chunks(s); | ||
1008 | |||
1009 | return; | ||
1010 | |||
1011 | shut: | ||
1012 | down_write(&s->lock); | ||
1013 | s->merge_failed = 1; | ||
1014 | b = __release_queued_bios_after_merge(s); | ||
1015 | up_write(&s->lock); | ||
1016 | error_bios(b); | ||
1017 | |||
1018 | merge_shutdown(s); | ||
1019 | } | ||
1020 | |||
1021 | static void start_merge(struct dm_snapshot *s) | ||
1022 | { | ||
1023 | if (!test_and_set_bit(RUNNING_MERGE, &s->state_bits)) | ||
1024 | snapshot_merge_next_chunks(s); | ||
1025 | } | ||
1026 | |||
1027 | static int wait_schedule(void *ptr) | ||
1028 | { | ||
1029 | schedule(); | ||
1030 | |||
1031 | return 0; | ||
1032 | } | ||
1033 | |||
1034 | /* | ||
1035 | * Stop the merging process and wait until it finishes. | ||
1036 | */ | ||
1037 | static void stop_merge(struct dm_snapshot *s) | ||
1038 | { | ||
1039 | set_bit(SHUTDOWN_MERGE, &s->state_bits); | ||
1040 | wait_on_bit(&s->state_bits, RUNNING_MERGE, wait_schedule, | ||
1041 | TASK_UNINTERRUPTIBLE); | ||
1042 | clear_bit(SHUTDOWN_MERGE, &s->state_bits); | ||
1043 | } | ||
1044 | |||
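start_merge(), merge_shutdown() and stop_merge() coordinate through two state_bits: the merge path holds RUNNING_MERGE while active, and stop_merge() raises SHUTDOWN_MERGE and then sleeps until RUNNING_MERGE clears. The sketch below models the same handshake single-threaded in userspace with C11 atomics and a polling wait standing in for wait_on_bit(); it is an analogy, not the kernel mechanism:

    #include <stdio.h>
    #include <stdatomic.h>

    #define RUNNING_MERGE   (1u << 0)
    #define SHUTDOWN_MERGE  (1u << 1)

    static atomic_uint state_bits;

    /* Worker side: merge one "chunk" unless shutdown was requested. */
    static void merge_step(int *chunks_left)
    {
        if ((atomic_load(&state_bits) & SHUTDOWN_MERGE) || *chunks_left == 0) {
            atomic_fetch_and(&state_bits, ~RUNNING_MERGE);   /* merge_shutdown() */
            return;
        }
        (*chunks_left)--;
    }

    static void start_merge(int *chunks_left)
    {
        unsigned old = atomic_fetch_or(&state_bits, RUNNING_MERGE);

        if (!(old & RUNNING_MERGE))          /* only if not already running */
            while (atomic_load(&state_bits) & RUNNING_MERGE)
                merge_step(chunks_left);
    }

    static void stop_merge(void)
    {
        atomic_fetch_or(&state_bits, SHUTDOWN_MERGE);
        while (atomic_load(&state_bits) & RUNNING_MERGE)
            ;                                /* stands in for wait_on_bit() */
        atomic_fetch_and(&state_bits, ~SHUTDOWN_MERGE);
    }

    int main(void)
    {
        int chunks_left = 5;

        start_merge(&chunks_left);   /* single-threaded here: runs to completion */
        stop_merge();                /* returns at once, RUNNING_MERGE already clear */
        printf("chunks left unmerged: %d\n", chunks_left);
        return 0;
    }

In the kernel the worker is driven asynchronously by merge_callback(), so stop_merge() genuinely has to wait for the in-flight step to observe SHUTDOWN_MERGE.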
577 | /* | 1045 | /* |
578 | * Construct a snapshot mapping: <origin_dev> <COW-dev> <p/n> <chunk-size> | 1046 | * Construct a snapshot mapping: <origin_dev> <COW-dev> <p/n> <chunk-size> |
579 | */ | 1047 | */ |
@@ -582,50 +1050,72 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
582 | struct dm_snapshot *s; | 1050 | struct dm_snapshot *s; |
583 | int i; | 1051 | int i; |
584 | int r = -EINVAL; | 1052 | int r = -EINVAL; |
585 | char *origin_path; | 1053 | char *origin_path, *cow_path; |
586 | struct dm_exception_store *store; | 1054 | unsigned args_used, num_flush_requests = 1; |
587 | unsigned args_used; | 1055 | fmode_t origin_mode = FMODE_READ; |
588 | 1056 | ||
589 | if (argc != 4) { | 1057 | if (argc != 4) { |
590 | ti->error = "requires exactly 4 arguments"; | 1058 | ti->error = "requires exactly 4 arguments"; |
591 | r = -EINVAL; | 1059 | r = -EINVAL; |
592 | goto bad_args; | 1060 | goto bad; |
1061 | } | ||
1062 | |||
1063 | if (dm_target_is_snapshot_merge(ti)) { | ||
1064 | num_flush_requests = 2; | ||
1065 | origin_mode = FMODE_WRITE; | ||
593 | } | 1066 | } |
594 | 1067 | ||
595 | origin_path = argv[0]; | 1068 | origin_path = argv[0]; |
596 | argv++; | 1069 | argv++; |
597 | argc--; | 1070 | argc--; |
598 | 1071 | ||
599 | r = dm_exception_store_create(ti, argc, argv, &args_used, &store); | 1072 | s = kmalloc(sizeof(*s), GFP_KERNEL); |
1073 | if (!s) { | ||
1074 | ti->error = "Cannot allocate snapshot context private " | ||
1075 | "structure"; | ||
1076 | r = -ENOMEM; | ||
1077 | goto bad; | ||
1078 | } | ||
1079 | |||
1080 | cow_path = argv[0]; | ||
1081 | argv++; | ||
1082 | argc--; | ||
1083 | |||
1084 | r = dm_get_device(ti, cow_path, FMODE_READ | FMODE_WRITE, &s->cow); | ||
1085 | if (r) { | ||
1086 | ti->error = "Cannot get COW device"; | ||
1087 | goto bad_cow; | ||
1088 | } | ||
1089 | |||
1090 | r = dm_exception_store_create(ti, argc, argv, s, &args_used, &s->store); | ||
600 | if (r) { | 1091 | if (r) { |
601 | ti->error = "Couldn't create exception store"; | 1092 | ti->error = "Couldn't create exception store"; |
602 | r = -EINVAL; | 1093 | r = -EINVAL; |
603 | goto bad_args; | 1094 | goto bad_store; |
604 | } | 1095 | } |
605 | 1096 | ||
606 | argv += args_used; | 1097 | argv += args_used; |
607 | argc -= args_used; | 1098 | argc -= args_used; |
608 | 1099 | ||
609 | s = kmalloc(sizeof(*s), GFP_KERNEL); | 1100 | r = dm_get_device(ti, origin_path, origin_mode, &s->origin); |
610 | if (!s) { | ||
611 | ti->error = "Cannot allocate snapshot context private " | ||
612 | "structure"; | ||
613 | r = -ENOMEM; | ||
614 | goto bad_snap; | ||
615 | } | ||
616 | |||
617 | r = dm_get_device(ti, origin_path, 0, ti->len, FMODE_READ, &s->origin); | ||
618 | if (r) { | 1101 | if (r) { |
619 | ti->error = "Cannot get origin device"; | 1102 | ti->error = "Cannot get origin device"; |
620 | goto bad_origin; | 1103 | goto bad_origin; |
621 | } | 1104 | } |
622 | 1105 | ||
623 | s->store = store; | 1106 | s->ti = ti; |
624 | s->valid = 1; | 1107 | s->valid = 1; |
625 | s->active = 0; | 1108 | s->active = 0; |
1109 | s->suspended = 0; | ||
626 | atomic_set(&s->pending_exceptions_count, 0); | 1110 | atomic_set(&s->pending_exceptions_count, 0); |
627 | init_rwsem(&s->lock); | 1111 | init_rwsem(&s->lock); |
1112 | INIT_LIST_HEAD(&s->list); | ||
628 | spin_lock_init(&s->pe_lock); | 1113 | spin_lock_init(&s->pe_lock); |
1114 | s->state_bits = 0; | ||
1115 | s->merge_failed = 0; | ||
1116 | s->first_merging_chunk = 0; | ||
1117 | s->num_merging_chunks = 0; | ||
1118 | bio_list_init(&s->bios_queued_during_merge); | ||
629 | 1119 | ||
630 | /* Allocate hash table for COW data */ | 1120 | /* Allocate hash table for COW data */ |
631 | if (init_hash_tables(s)) { | 1121 | if (init_hash_tables(s)) { |
@@ -659,39 +1149,55 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
659 | 1149 | ||
660 | spin_lock_init(&s->tracked_chunk_lock); | 1150 | spin_lock_init(&s->tracked_chunk_lock); |
661 | 1151 | ||
662 | /* Metadata must only be loaded into one table at once */ | 1152 | bio_list_init(&s->queued_bios); |
1153 | INIT_WORK(&s->queued_bios_work, flush_queued_bios); | ||
1154 | |||
1155 | ti->private = s; | ||
1156 | ti->num_flush_requests = num_flush_requests; | ||
1157 | |||
1158 | /* Add snapshot to the list of snapshots for this origin */ | ||
1159 | /* Exceptions aren't triggered till snapshot_resume() is called */ | ||
1160 | r = register_snapshot(s); | ||
1161 | if (r == -ENOMEM) { | ||
1162 | ti->error = "Snapshot origin struct allocation failed"; | ||
1163 | goto bad_load_and_register; | ||
1164 | } else if (r < 0) { | ||
1165 | /* invalid handover, register_snapshot has set ti->error */ | ||
1166 | goto bad_load_and_register; | ||
1167 | } | ||
1168 | |||
1169 | /* | ||
1170 | * Metadata must only be loaded into one table at once, so skip this | ||
1171 | * if metadata will be handed over during resume. | ||
1172 | * Chunk size will be set during the handover - set it to zero to | ||
1173 | * ensure it's ignored. | ||
1174 | */ | ||
1175 | if (r > 0) { | ||
1176 | s->store->chunk_size = 0; | ||
1177 | return 0; | ||
1178 | } | ||
1179 | |||
663 | r = s->store->type->read_metadata(s->store, dm_add_exception, | 1180 | r = s->store->type->read_metadata(s->store, dm_add_exception, |
664 | (void *)s); | 1181 | (void *)s); |
665 | if (r < 0) { | 1182 | if (r < 0) { |
666 | ti->error = "Failed to read snapshot metadata"; | 1183 | ti->error = "Failed to read snapshot metadata"; |
667 | goto bad_load_and_register; | 1184 | goto bad_read_metadata; |
668 | } else if (r > 0) { | 1185 | } else if (r > 0) { |
669 | s->valid = 0; | 1186 | s->valid = 0; |
670 | DMWARN("Snapshot is marked invalid."); | 1187 | DMWARN("Snapshot is marked invalid."); |
671 | } | 1188 | } |
672 | 1189 | ||
673 | bio_list_init(&s->queued_bios); | ||
674 | INIT_WORK(&s->queued_bios_work, flush_queued_bios); | ||
675 | |||
676 | if (!s->store->chunk_size) { | 1190 | if (!s->store->chunk_size) { |
677 | ti->error = "Chunk size not set"; | 1191 | ti->error = "Chunk size not set"; |
678 | goto bad_load_and_register; | 1192 | goto bad_read_metadata; |
679 | } | 1193 | } |
680 | |||
681 | /* Add snapshot to the list of snapshots for this origin */ | ||
682 | /* Exceptions aren't triggered till snapshot_resume() is called */ | ||
683 | if (register_snapshot(s)) { | ||
684 | r = -EINVAL; | ||
685 | ti->error = "Cannot register snapshot origin"; | ||
686 | goto bad_load_and_register; | ||
687 | } | ||
688 | |||
689 | ti->private = s; | ||
690 | ti->split_io = s->store->chunk_size; | 1194 | ti->split_io = s->store->chunk_size; |
691 | ti->num_flush_requests = 1; | ||
692 | 1195 | ||
693 | return 0; | 1196 | return 0; |
694 | 1197 | ||
1198 | bad_read_metadata: | ||
1199 | unregister_snapshot(s); | ||
1200 | |||
695 | bad_load_and_register: | 1201 | bad_load_and_register: |
696 | mempool_destroy(s->tracked_chunk_pool); | 1202 | mempool_destroy(s->tracked_chunk_pool); |
697 | 1203 | ||
@@ -702,19 +1208,22 @@ bad_pending_pool: | |||
702 | dm_kcopyd_client_destroy(s->kcopyd_client); | 1208 | dm_kcopyd_client_destroy(s->kcopyd_client); |
703 | 1209 | ||
704 | bad_kcopyd: | 1210 | bad_kcopyd: |
705 | exit_exception_table(&s->pending, pending_cache); | 1211 | dm_exception_table_exit(&s->pending, pending_cache); |
706 | exit_exception_table(&s->complete, exception_cache); | 1212 | dm_exception_table_exit(&s->complete, exception_cache); |
707 | 1213 | ||
708 | bad_hash_tables: | 1214 | bad_hash_tables: |
709 | dm_put_device(ti, s->origin); | 1215 | dm_put_device(ti, s->origin); |
710 | 1216 | ||
711 | bad_origin: | 1217 | bad_origin: |
712 | kfree(s); | 1218 | dm_exception_store_destroy(s->store); |
1219 | |||
1220 | bad_store: | ||
1221 | dm_put_device(ti, s->cow); | ||
713 | 1222 | ||
714 | bad_snap: | 1223 | bad_cow: |
715 | dm_exception_store_destroy(store); | 1224 | kfree(s); |
716 | 1225 | ||
717 | bad_args: | 1226 | bad: |
718 | return r; | 1227 | return r; |
719 | } | 1228 | } |
720 | 1229 | ||
@@ -723,8 +1232,39 @@ static void __free_exceptions(struct dm_snapshot *s) | |||
723 | dm_kcopyd_client_destroy(s->kcopyd_client); | 1232 | dm_kcopyd_client_destroy(s->kcopyd_client); |
724 | s->kcopyd_client = NULL; | 1233 | s->kcopyd_client = NULL; |
725 | 1234 | ||
726 | exit_exception_table(&s->pending, pending_cache); | 1235 | dm_exception_table_exit(&s->pending, pending_cache); |
727 | exit_exception_table(&s->complete, exception_cache); | 1236 | dm_exception_table_exit(&s->complete, exception_cache); |
1237 | } | ||
1238 | |||
1239 | static void __handover_exceptions(struct dm_snapshot *snap_src, | ||
1240 | struct dm_snapshot *snap_dest) | ||
1241 | { | ||
1242 | union { | ||
1243 | struct dm_exception_table table_swap; | ||
1244 | struct dm_exception_store *store_swap; | ||
1245 | } u; | ||
1246 | |||
1247 | /* | ||
1248 | * Swap all snapshot context information between the two instances. | ||
1249 | */ | ||
1250 | u.table_swap = snap_dest->complete; | ||
1251 | snap_dest->complete = snap_src->complete; | ||
1252 | snap_src->complete = u.table_swap; | ||
1253 | |||
1254 | u.store_swap = snap_dest->store; | ||
1255 | snap_dest->store = snap_src->store; | ||
1256 | snap_src->store = u.store_swap; | ||
1257 | |||
1258 | snap_dest->store->snap = snap_dest; | ||
1259 | snap_src->store->snap = snap_src; | ||
1260 | |||
1261 | snap_dest->ti->split_io = snap_dest->store->chunk_size; | ||
1262 | snap_dest->valid = snap_src->valid; | ||
1263 | |||
1264 | /* | ||
1265 | * Set source invalid to ensure it receives no further I/O. | ||
1266 | */ | ||
1267 | snap_src->valid = 0; | ||
728 | } | 1268 | } |
729 | 1269 | ||
730 | static void snapshot_dtr(struct dm_target *ti) | 1270 | static void snapshot_dtr(struct dm_target *ti) |
@@ -733,9 +1273,24 @@ static void snapshot_dtr(struct dm_target *ti) | |||
733 | int i; | 1273 | int i; |
734 | #endif | 1274 | #endif |
735 | struct dm_snapshot *s = ti->private; | 1275 | struct dm_snapshot *s = ti->private; |
1276 | struct dm_snapshot *snap_src = NULL, *snap_dest = NULL; | ||
736 | 1277 | ||
737 | flush_workqueue(ksnapd); | 1278 | flush_workqueue(ksnapd); |
738 | 1279 | ||
1280 | down_read(&_origins_lock); | ||
1281 | /* Check whether exception handover must be cancelled */ | ||
1282 | (void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL); | ||
1283 | if (snap_src && snap_dest && (s == snap_src)) { | ||
1284 | down_write(&snap_dest->lock); | ||
1285 | snap_dest->valid = 0; | ||
1286 | up_write(&snap_dest->lock); | ||
1287 | DMERR("Cancelling snapshot handover."); | ||
1288 | } | ||
1289 | up_read(&_origins_lock); | ||
1290 | |||
1291 | if (dm_target_is_snapshot_merge(ti)) | ||
1292 | stop_merge(s); | ||
1293 | |||
739 | /* Prevent further origin writes from using this snapshot. */ | 1294 | /* Prevent further origin writes from using this snapshot. */ |
740 | /* After this returns there can be no new kcopyd jobs. */ | 1295 | /* After this returns there can be no new kcopyd jobs. */ |
741 | unregister_snapshot(s); | 1296 | unregister_snapshot(s); |
@@ -763,6 +1318,8 @@ static void snapshot_dtr(struct dm_target *ti) | |||
763 | 1318 | ||
764 | dm_exception_store_destroy(s->store); | 1319 | dm_exception_store_destroy(s->store); |
765 | 1320 | ||
1321 | dm_put_device(ti, s->cow); | ||
1322 | |||
766 | kfree(s); | 1323 | kfree(s); |
767 | } | 1324 | } |
768 | 1325 | ||
@@ -795,6 +1352,26 @@ static void flush_queued_bios(struct work_struct *work) | |||
795 | flush_bios(queued_bios); | 1352 | flush_bios(queued_bios); |
796 | } | 1353 | } |
797 | 1354 | ||
1355 | static int do_origin(struct dm_dev *origin, struct bio *bio); | ||
1356 | |||
1357 | /* | ||
1358 | * Flush a list of buffers. | ||
1359 | */ | ||
1360 | static void retry_origin_bios(struct dm_snapshot *s, struct bio *bio) | ||
1361 | { | ||
1362 | struct bio *n; | ||
1363 | int r; | ||
1364 | |||
1365 | while (bio) { | ||
1366 | n = bio->bi_next; | ||
1367 | bio->bi_next = NULL; | ||
1368 | r = do_origin(s->origin, bio); | ||
1369 | if (r == DM_MAPIO_REMAPPED) | ||
1370 | generic_make_request(bio); | ||
1371 | bio = n; | ||
1372 | } | ||
1373 | } | ||
1374 | |||
798 | /* | 1375 | /* |
799 | * Error a list of buffers. | 1376 | * Error a list of buffers. |
800 | */ | 1377 | */ |
@@ -825,45 +1402,12 @@ static void __invalidate_snapshot(struct dm_snapshot *s, int err) | |||
825 | 1402 | ||
826 | s->valid = 0; | 1403 | s->valid = 0; |
827 | 1404 | ||
828 | dm_table_event(s->store->ti->table); | 1405 | dm_table_event(s->ti->table); |
829 | } | ||
830 | |||
831 | static void get_pending_exception(struct dm_snap_pending_exception *pe) | ||
832 | { | ||
833 | atomic_inc(&pe->ref_count); | ||
834 | } | ||
835 | |||
836 | static struct bio *put_pending_exception(struct dm_snap_pending_exception *pe) | ||
837 | { | ||
838 | struct dm_snap_pending_exception *primary_pe; | ||
839 | struct bio *origin_bios = NULL; | ||
840 | |||
841 | primary_pe = pe->primary_pe; | ||
842 | |||
843 | /* | ||
844 | * If this pe is involved in a write to the origin and | ||
845 | * it is the last sibling to complete then release | ||
846 | * the bios for the original write to the origin. | ||
847 | */ | ||
848 | if (primary_pe && | ||
849 | atomic_dec_and_test(&primary_pe->ref_count)) { | ||
850 | origin_bios = bio_list_get(&primary_pe->origin_bios); | ||
851 | free_pending_exception(primary_pe); | ||
852 | } | ||
853 | |||
854 | /* | ||
855 | * Free the pe if it's not linked to an origin write or if | ||
856 | * it's not itself a primary pe. | ||
857 | */ | ||
858 | if (!primary_pe || primary_pe != pe) | ||
859 | free_pending_exception(pe); | ||
860 | |||
861 | return origin_bios; | ||
862 | } | 1406 | } |
863 | 1407 | ||
864 | static void pending_complete(struct dm_snap_pending_exception *pe, int success) | 1408 | static void pending_complete(struct dm_snap_pending_exception *pe, int success) |
865 | { | 1409 | { |
866 | struct dm_snap_exception *e; | 1410 | struct dm_exception *e; |
867 | struct dm_snapshot *s = pe->snap; | 1411 | struct dm_snapshot *s = pe->snap; |
868 | struct bio *origin_bios = NULL; | 1412 | struct bio *origin_bios = NULL; |
869 | struct bio *snapshot_bios = NULL; | 1413 | struct bio *snapshot_bios = NULL; |
@@ -877,7 +1421,7 @@ static void pending_complete(struct dm_snap_pending_exception *pe, int success) | |||
877 | goto out; | 1421 | goto out; |
878 | } | 1422 | } |
879 | 1423 | ||
880 | e = alloc_exception(); | 1424 | e = alloc_completed_exception(); |
881 | if (!e) { | 1425 | if (!e) { |
882 | down_write(&s->lock); | 1426 | down_write(&s->lock); |
883 | __invalidate_snapshot(s, -ENOMEM); | 1427 | __invalidate_snapshot(s, -ENOMEM); |
@@ -888,28 +1432,27 @@ static void pending_complete(struct dm_snap_pending_exception *pe, int success) | |||
888 | 1432 | ||
889 | down_write(&s->lock); | 1433 | down_write(&s->lock); |
890 | if (!s->valid) { | 1434 | if (!s->valid) { |
891 | free_exception(e); | 1435 | free_completed_exception(e); |
892 | error = 1; | 1436 | error = 1; |
893 | goto out; | 1437 | goto out; |
894 | } | 1438 | } |
895 | 1439 | ||
896 | /* | 1440 | /* Check for conflicting reads */ |
897 | * Check for conflicting reads. This is extremely improbable, | 1441 | __check_for_conflicting_io(s, pe->e.old_chunk); |
898 | * so msleep(1) is sufficient and there is no need for a wait queue. | ||
899 | */ | ||
900 | while (__chunk_is_tracked(s, pe->e.old_chunk)) | ||
901 | msleep(1); | ||
902 | 1442 | ||
903 | /* | 1443 | /* |
904 | * Add a proper exception, and remove the | 1444 | * Add a proper exception, and remove the |
905 | * in-flight exception from the list. | 1445 | * in-flight exception from the list. |
906 | */ | 1446 | */ |
907 | insert_completed_exception(s, e); | 1447 | dm_insert_exception(&s->complete, e); |
908 | 1448 | ||
909 | out: | 1449 | out: |
910 | remove_exception(&pe->e); | 1450 | dm_remove_exception(&pe->e); |
911 | snapshot_bios = bio_list_get(&pe->snapshot_bios); | 1451 | snapshot_bios = bio_list_get(&pe->snapshot_bios); |
912 | origin_bios = put_pending_exception(pe); | 1452 | origin_bios = bio_list_get(&pe->origin_bios); |
1453 | free_pending_exception(pe); | ||
1454 | |||
1455 | increment_pending_exceptions_done_count(); | ||
913 | 1456 | ||
914 | up_write(&s->lock); | 1457 | up_write(&s->lock); |
915 | 1458 | ||
@@ -919,7 +1462,7 @@ static void pending_complete(struct dm_snap_pending_exception *pe, int success) | |||
919 | else | 1462 | else |
920 | flush_bios(snapshot_bios); | 1463 | flush_bios(snapshot_bios); |
921 | 1464 | ||
922 | flush_bios(origin_bios); | 1465 | retry_origin_bios(s, origin_bios); |
923 | } | 1466 | } |
924 | 1467 | ||
925 | static void commit_callback(void *context, int success) | 1468 | static void commit_callback(void *context, int success) |
@@ -963,7 +1506,7 @@ static void start_copy(struct dm_snap_pending_exception *pe) | |||
963 | src.sector = chunk_to_sector(s->store, pe->e.old_chunk); | 1506 | src.sector = chunk_to_sector(s->store, pe->e.old_chunk); |
964 | src.count = min((sector_t)s->store->chunk_size, dev_size - src.sector); | 1507 | src.count = min((sector_t)s->store->chunk_size, dev_size - src.sector); |
965 | 1508 | ||
966 | dest.bdev = s->store->cow->bdev; | 1509 | dest.bdev = s->cow->bdev; |
967 | dest.sector = chunk_to_sector(s->store, pe->e.new_chunk); | 1510 | dest.sector = chunk_to_sector(s->store, pe->e.new_chunk); |
968 | dest.count = src.count; | 1511 | dest.count = src.count; |
969 | 1512 | ||
@@ -975,7 +1518,7 @@ static void start_copy(struct dm_snap_pending_exception *pe) | |||
975 | static struct dm_snap_pending_exception * | 1518 | static struct dm_snap_pending_exception * |
976 | __lookup_pending_exception(struct dm_snapshot *s, chunk_t chunk) | 1519 | __lookup_pending_exception(struct dm_snapshot *s, chunk_t chunk) |
977 | { | 1520 | { |
978 | struct dm_snap_exception *e = lookup_exception(&s->pending, chunk); | 1521 | struct dm_exception *e = dm_lookup_exception(&s->pending, chunk); |
979 | 1522 | ||
980 | if (!e) | 1523 | if (!e) |
981 | return NULL; | 1524 | return NULL; |
@@ -1006,8 +1549,6 @@ __find_pending_exception(struct dm_snapshot *s, | |||
1006 | pe->e.old_chunk = chunk; | 1549 | pe->e.old_chunk = chunk; |
1007 | bio_list_init(&pe->origin_bios); | 1550 | bio_list_init(&pe->origin_bios); |
1008 | bio_list_init(&pe->snapshot_bios); | 1551 | bio_list_init(&pe->snapshot_bios); |
1009 | pe->primary_pe = NULL; | ||
1010 | atomic_set(&pe->ref_count, 0); | ||
1011 | pe->started = 0; | 1552 | pe->started = 0; |
1012 | 1553 | ||
1013 | if (s->store->type->prepare_exception(s->store, &pe->e)) { | 1554 | if (s->store->type->prepare_exception(s->store, &pe->e)) { |
@@ -1015,16 +1556,15 @@ __find_pending_exception(struct dm_snapshot *s, | |||
1015 | return NULL; | 1556 | return NULL; |
1016 | } | 1557 | } |
1017 | 1558 | ||
1018 | get_pending_exception(pe); | 1559 | dm_insert_exception(&s->pending, &pe->e); |
1019 | insert_exception(&s->pending, &pe->e); | ||
1020 | 1560 | ||
1021 | return pe; | 1561 | return pe; |
1022 | } | 1562 | } |
1023 | 1563 | ||
1024 | static void remap_exception(struct dm_snapshot *s, struct dm_snap_exception *e, | 1564 | static void remap_exception(struct dm_snapshot *s, struct dm_exception *e, |
1025 | struct bio *bio, chunk_t chunk) | 1565 | struct bio *bio, chunk_t chunk) |
1026 | { | 1566 | { |
1027 | bio->bi_bdev = s->store->cow->bdev; | 1567 | bio->bi_bdev = s->cow->bdev; |
1028 | bio->bi_sector = chunk_to_sector(s->store, | 1568 | bio->bi_sector = chunk_to_sector(s->store, |
1029 | dm_chunk_number(e->new_chunk) + | 1569 | dm_chunk_number(e->new_chunk) + |
1030 | (chunk - e->old_chunk)) + | 1570 | (chunk - e->old_chunk)) + |
@@ -1035,14 +1575,14 @@ static void remap_exception(struct dm_snapshot *s, struct dm_snap_exception *e, | |||
1035 | static int snapshot_map(struct dm_target *ti, struct bio *bio, | 1575 | static int snapshot_map(struct dm_target *ti, struct bio *bio, |
1036 | union map_info *map_context) | 1576 | union map_info *map_context) |
1037 | { | 1577 | { |
1038 | struct dm_snap_exception *e; | 1578 | struct dm_exception *e; |
1039 | struct dm_snapshot *s = ti->private; | 1579 | struct dm_snapshot *s = ti->private; |
1040 | int r = DM_MAPIO_REMAPPED; | 1580 | int r = DM_MAPIO_REMAPPED; |
1041 | chunk_t chunk; | 1581 | chunk_t chunk; |
1042 | struct dm_snap_pending_exception *pe = NULL; | 1582 | struct dm_snap_pending_exception *pe = NULL; |
1043 | 1583 | ||
1044 | if (unlikely(bio_empty_barrier(bio))) { | 1584 | if (unlikely(bio_empty_barrier(bio))) { |
1045 | bio->bi_bdev = s->store->cow->bdev; | 1585 | bio->bi_bdev = s->cow->bdev; |
1046 | return DM_MAPIO_REMAPPED; | 1586 | return DM_MAPIO_REMAPPED; |
1047 | } | 1587 | } |
1048 | 1588 | ||
@@ -1063,7 +1603,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio, | |||
1063 | } | 1603 | } |
1064 | 1604 | ||
1065 | /* If the block is already remapped - use that, else remap it */ | 1605 | /* If the block is already remapped - use that, else remap it */ |
1066 | e = lookup_exception(&s->complete, chunk); | 1606 | e = dm_lookup_exception(&s->complete, chunk); |
1067 | if (e) { | 1607 | if (e) { |
1068 | remap_exception(s, e, bio, chunk); | 1608 | remap_exception(s, e, bio, chunk); |
1069 | goto out_unlock; | 1609 | goto out_unlock; |
@@ -1087,7 +1627,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio, | |||
1087 | goto out_unlock; | 1627 | goto out_unlock; |
1088 | } | 1628 | } |
1089 | 1629 | ||
1090 | e = lookup_exception(&s->complete, chunk); | 1630 | e = dm_lookup_exception(&s->complete, chunk); |
1091 | if (e) { | 1631 | if (e) { |
1092 | free_pending_exception(pe); | 1632 | free_pending_exception(pe); |
1093 | remap_exception(s, e, bio, chunk); | 1633 | remap_exception(s, e, bio, chunk); |
@@ -1125,6 +1665,78 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio, | |||
1125 | return r; | 1665 | return r; |
1126 | } | 1666 | } |
1127 | 1667 | ||
1668 | /* | ||
1669 | * A snapshot-merge target behaves like a combination of a snapshot | ||
1670 | * target and a snapshot-origin target. It only generates new | ||
1671 | * exceptions in other snapshots and not in the one that is being | ||
1672 | * merged. | ||
1673 | * | ||
1674 | * For each chunk, if there is an existing exception, it is used to | ||
1675 | * redirect I/O to the cow device. Otherwise I/O is sent to the origin, | ||
1676 | * which in turn might generate exceptions in other snapshots. | ||
1677 | * If merging is currently taking place on the chunk in question, the | ||
1678 | * I/O is deferred by adding it to s->bios_queued_during_merge. | ||
1679 | */ | ||
1680 | static int snapshot_merge_map(struct dm_target *ti, struct bio *bio, | ||
1681 | union map_info *map_context) | ||
1682 | { | ||
1683 | struct dm_exception *e; | ||
1684 | struct dm_snapshot *s = ti->private; | ||
1685 | int r = DM_MAPIO_REMAPPED; | ||
1686 | chunk_t chunk; | ||
1687 | |||
1688 | if (unlikely(bio_empty_barrier(bio))) { | ||
1689 | if (!map_context->flush_request) | ||
1690 | bio->bi_bdev = s->origin->bdev; | ||
1691 | else | ||
1692 | bio->bi_bdev = s->cow->bdev; | ||
1693 | map_context->ptr = NULL; | ||
1694 | return DM_MAPIO_REMAPPED; | ||
1695 | } | ||
1696 | |||
1697 | chunk = sector_to_chunk(s->store, bio->bi_sector); | ||
1698 | |||
1699 | down_write(&s->lock); | ||
1700 | |||
1701 | /* Full merging snapshots are redirected to the origin */ | ||
1702 | if (!s->valid) | ||
1703 | goto redirect_to_origin; | ||
1704 | |||
1705 | /* If the block is already remapped - use that */ | ||
1706 | e = dm_lookup_exception(&s->complete, chunk); | ||
1707 | if (e) { | ||
1708 | /* Queue writes overlapping with chunks being merged */ | ||
1709 | if (bio_rw(bio) == WRITE && | ||
1710 | chunk >= s->first_merging_chunk && | ||
1711 | chunk < (s->first_merging_chunk + | ||
1712 | s->num_merging_chunks)) { | ||
1713 | bio->bi_bdev = s->origin->bdev; | ||
1714 | bio_list_add(&s->bios_queued_during_merge, bio); | ||
1715 | r = DM_MAPIO_SUBMITTED; | ||
1716 | goto out_unlock; | ||
1717 | } | ||
1718 | |||
1719 | remap_exception(s, e, bio, chunk); | ||
1720 | |||
1721 | if (bio_rw(bio) == WRITE) | ||
1722 | map_context->ptr = track_chunk(s, chunk); | ||
1723 | goto out_unlock; | ||
1724 | } | ||
1725 | |||
1726 | redirect_to_origin: | ||
1727 | bio->bi_bdev = s->origin->bdev; | ||
1728 | |||
1729 | if (bio_rw(bio) == WRITE) { | ||
1730 | up_write(&s->lock); | ||
1731 | return do_origin(s->origin, bio); | ||
1732 | } | ||
1733 | |||
1734 | out_unlock: | ||
1735 | up_write(&s->lock); | ||
1736 | |||
1737 | return r; | ||
1738 | } | ||
1739 | |||
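Illustration only, not part of the patch: the routing rules described in the comment above snapshot_merge_map() condensed into one hypothetical helper. merge_map_destination() and the defer flag are invented names; the real target performs the same checks inline and queues deferred writes on s->bios_queued_during_merge.

	/* Hypothetical condensation of snapshot_merge_map()'s routing decision. */
	static struct block_device *merge_map_destination(struct dm_snapshot *s,
							  struct bio *bio,
							  chunk_t chunk, int *defer)
	{
		struct dm_exception *e = dm_lookup_exception(&s->complete, chunk);

		*defer = 0;

		if (!e)
			return s->origin->bdev;	/* not remapped: send to the origin */

		if (bio_rw(bio) == WRITE &&
		    chunk >= s->first_merging_chunk &&
		    chunk < s->first_merging_chunk + s->num_merging_chunks) {
			*defer = 1;		/* overlaps chunks being merged: queue it */
			return s->origin->bdev;
		}

		return s->cow->bdev;		/* already remapped: redirect to the COW */
	}
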
1128 | static int snapshot_end_io(struct dm_target *ti, struct bio *bio, | 1740 | static int snapshot_end_io(struct dm_target *ti, struct bio *bio, |
1129 | int error, union map_info *map_context) | 1741 | int error, union map_info *map_context) |
1130 | { | 1742 | { |
@@ -1137,40 +1749,135 @@ static int snapshot_end_io(struct dm_target *ti, struct bio *bio, | |||
1137 | return 0; | 1749 | return 0; |
1138 | } | 1750 | } |
1139 | 1751 | ||
1752 | static void snapshot_merge_presuspend(struct dm_target *ti) | ||
1753 | { | ||
1754 | struct dm_snapshot *s = ti->private; | ||
1755 | |||
1756 | stop_merge(s); | ||
1757 | } | ||
1758 | |||
1759 | static void snapshot_postsuspend(struct dm_target *ti) | ||
1760 | { | ||
1761 | struct dm_snapshot *s = ti->private; | ||
1762 | |||
1763 | down_write(&s->lock); | ||
1764 | s->suspended = 1; | ||
1765 | up_write(&s->lock); | ||
1766 | } | ||
1767 | |||
1768 | static int snapshot_preresume(struct dm_target *ti) | ||
1769 | { | ||
1770 | int r = 0; | ||
1771 | struct dm_snapshot *s = ti->private; | ||
1772 | struct dm_snapshot *snap_src = NULL, *snap_dest = NULL; | ||
1773 | |||
1774 | down_read(&_origins_lock); | ||
1775 | (void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL); | ||
1776 | if (snap_src && snap_dest) { | ||
1777 | down_read(&snap_src->lock); | ||
1778 | if (s == snap_src) { | ||
1779 | DMERR("Unable to resume snapshot source until " | ||
1780 | "handover completes."); | ||
1781 | r = -EINVAL; | ||
1782 | } else if (!snap_src->suspended) { | ||
1783 | DMERR("Unable to perform snapshot handover until " | ||
1784 | "source is suspended."); | ||
1785 | r = -EINVAL; | ||
1786 | } | ||
1787 | up_read(&snap_src->lock); | ||
1788 | } | ||
1789 | up_read(&_origins_lock); | ||
1790 | |||
1791 | return r; | ||
1792 | } | ||
1793 | |||
1140 | static void snapshot_resume(struct dm_target *ti) | 1794 | static void snapshot_resume(struct dm_target *ti) |
1141 | { | 1795 | { |
1142 | struct dm_snapshot *s = ti->private; | 1796 | struct dm_snapshot *s = ti->private; |
1797 | struct dm_snapshot *snap_src = NULL, *snap_dest = NULL; | ||
1798 | |||
1799 | down_read(&_origins_lock); | ||
1800 | (void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL); | ||
1801 | if (snap_src && snap_dest) { | ||
1802 | down_write(&snap_src->lock); | ||
1803 | down_write_nested(&snap_dest->lock, SINGLE_DEPTH_NESTING); | ||
1804 | __handover_exceptions(snap_src, snap_dest); | ||
1805 | up_write(&snap_dest->lock); | ||
1806 | up_write(&snap_src->lock); | ||
1807 | } | ||
1808 | up_read(&_origins_lock); | ||
1809 | |||
1810 | /* Now we have correct chunk size, reregister */ | ||
1811 | reregister_snapshot(s); | ||
1143 | 1812 | ||
1144 | down_write(&s->lock); | 1813 | down_write(&s->lock); |
1145 | s->active = 1; | 1814 | s->active = 1; |
1815 | s->suspended = 0; | ||
1146 | up_write(&s->lock); | 1816 | up_write(&s->lock); |
1147 | } | 1817 | } |
1148 | 1818 | ||
1819 | static sector_t get_origin_minimum_chunksize(struct block_device *bdev) | ||
1820 | { | ||
1821 | sector_t min_chunksize; | ||
1822 | |||
1823 | down_read(&_origins_lock); | ||
1824 | min_chunksize = __minimum_chunk_size(__lookup_origin(bdev)); | ||
1825 | up_read(&_origins_lock); | ||
1826 | |||
1827 | return min_chunksize; | ||
1828 | } | ||
1829 | |||
1830 | static void snapshot_merge_resume(struct dm_target *ti) | ||
1831 | { | ||
1832 | struct dm_snapshot *s = ti->private; | ||
1833 | |||
1834 | /* | ||
1835 | * Handover exceptions from existing snapshot. | ||
1836 | */ | ||
1837 | snapshot_resume(ti); | ||
1838 | |||
1839 | /* | ||
1840 | * snapshot-merge acts as an origin, so set ti->split_io | ||
1841 | */ | ||
1842 | ti->split_io = get_origin_minimum_chunksize(s->origin->bdev); | ||
1843 | |||
1844 | start_merge(s); | ||
1845 | } | ||
1846 | |||
1149 | static int snapshot_status(struct dm_target *ti, status_type_t type, | 1847 | static int snapshot_status(struct dm_target *ti, status_type_t type, |
1150 | char *result, unsigned int maxlen) | 1848 | char *result, unsigned int maxlen) |
1151 | { | 1849 | { |
1152 | unsigned sz = 0; | 1850 | unsigned sz = 0; |
1153 | struct dm_snapshot *snap = ti->private; | 1851 | struct dm_snapshot *snap = ti->private; |
1154 | 1852 | ||
1155 | down_write(&snap->lock); | ||
1156 | |||
1157 | switch (type) { | 1853 | switch (type) { |
1158 | case STATUSTYPE_INFO: | 1854 | case STATUSTYPE_INFO: |
1855 | |||
1856 | down_write(&snap->lock); | ||
1857 | |||
1159 | if (!snap->valid) | 1858 | if (!snap->valid) |
1160 | DMEMIT("Invalid"); | 1859 | DMEMIT("Invalid"); |
1860 | else if (snap->merge_failed) | ||
1861 | DMEMIT("Merge failed"); | ||
1161 | else { | 1862 | else { |
1162 | if (snap->store->type->fraction_full) { | 1863 | if (snap->store->type->usage) { |
1163 | sector_t numerator, denominator; | 1864 | sector_t total_sectors, sectors_allocated, |
1164 | snap->store->type->fraction_full(snap->store, | 1865 | metadata_sectors; |
1165 | &numerator, | 1866 | snap->store->type->usage(snap->store, |
1166 | &denominator); | 1867 | &total_sectors, |
1167 | DMEMIT("%llu/%llu", | 1868 | &sectors_allocated, |
1168 | (unsigned long long)numerator, | 1869 | &metadata_sectors); |
1169 | (unsigned long long)denominator); | 1870 | DMEMIT("%llu/%llu %llu", |
1871 | (unsigned long long)sectors_allocated, | ||
1872 | (unsigned long long)total_sectors, | ||
1873 | (unsigned long long)metadata_sectors); | ||
1170 | } | 1874 | } |
1171 | else | 1875 | else |
1172 | DMEMIT("Unknown"); | 1876 | DMEMIT("Unknown"); |
1173 | } | 1877 | } |
1878 | |||
1879 | up_write(&snap->lock); | ||
1880 | |||
1174 | break; | 1881 | break; |
1175 | 1882 | ||
1176 | case STATUSTYPE_TABLE: | 1883 | case STATUSTYPE_TABLE: |
@@ -1179,14 +1886,12 @@ static int snapshot_status(struct dm_target *ti, status_type_t type, | |||
1179 | * to make private copies if the output is to | 1886 | * to make private copies if the output is to |
1180 | * make sense. | 1887 | * make sense. |
1181 | */ | 1888 | */ |
1182 | DMEMIT("%s", snap->origin->name); | 1889 | DMEMIT("%s %s", snap->origin->name, snap->cow->name); |
1183 | snap->store->type->status(snap->store, type, result + sz, | 1890 | snap->store->type->status(snap->store, type, result + sz, |
1184 | maxlen - sz); | 1891 | maxlen - sz); |
1185 | break; | 1892 | break; |
1186 | } | 1893 | } |
1187 | 1894 | ||
1188 | up_write(&snap->lock); | ||
1189 | |||
1190 | return 0; | 1895 | return 0; |
1191 | } | 1896 | } |
1192 | 1897 | ||
@@ -1202,17 +1907,36 @@ static int snapshot_iterate_devices(struct dm_target *ti, | |||
1202 | /*----------------------------------------------------------------- | 1907 | /*----------------------------------------------------------------- |
1203 | * Origin methods | 1908 | * Origin methods |
1204 | *---------------------------------------------------------------*/ | 1909 | *---------------------------------------------------------------*/ |
1205 | static int __origin_write(struct list_head *snapshots, struct bio *bio) | 1910 | |
1911 | /* | ||
1912 | * If no exceptions need creating, DM_MAPIO_REMAPPED is returned and any | ||
1913 | * supplied bio was ignored. The caller may submit it immediately. | ||
1914 | * (No remapping actually occurs as the origin is always a direct linear | ||
1915 | * map.) | ||
1916 | * | ||
1917 | * If further exceptions are required, DM_MAPIO_SUBMITTED is returned | ||
1918 | * and any supplied bio is added to a list to be submitted once all | ||
1919 | * the necessary exceptions exist. | ||
1920 | */ | ||
1921 | static int __origin_write(struct list_head *snapshots, sector_t sector, | ||
1922 | struct bio *bio) | ||
1206 | { | 1923 | { |
1207 | int r = DM_MAPIO_REMAPPED, first = 0; | 1924 | int r = DM_MAPIO_REMAPPED; |
1208 | struct dm_snapshot *snap; | 1925 | struct dm_snapshot *snap; |
1209 | struct dm_snap_exception *e; | 1926 | struct dm_exception *e; |
1210 | struct dm_snap_pending_exception *pe, *next_pe, *primary_pe = NULL; | 1927 | struct dm_snap_pending_exception *pe; |
1928 | struct dm_snap_pending_exception *pe_to_start_now = NULL; | ||
1929 | struct dm_snap_pending_exception *pe_to_start_last = NULL; | ||
1211 | chunk_t chunk; | 1930 | chunk_t chunk; |
1212 | LIST_HEAD(pe_queue); | ||
1213 | 1931 | ||
1214 | /* Do all the snapshots on this origin */ | 1932 | /* Do all the snapshots on this origin */ |
1215 | list_for_each_entry (snap, snapshots, list) { | 1933 | list_for_each_entry (snap, snapshots, list) { |
1934 | /* | ||
1935 | * Don't make new exceptions in a merging snapshot | ||
1936 | * because it has effectively been deleted | ||
1937 | */ | ||
1938 | if (dm_target_is_snapshot_merge(snap->ti)) | ||
1939 | continue; | ||
1216 | 1940 | ||
1217 | down_write(&snap->lock); | 1941 | down_write(&snap->lock); |
1218 | 1942 | ||
@@ -1221,24 +1945,21 @@ static int __origin_write(struct list_head *snapshots, struct bio *bio) | |||
1221 | goto next_snapshot; | 1945 | goto next_snapshot; |
1222 | 1946 | ||
1223 | /* Nothing to do if writing beyond end of snapshot */ | 1947 | /* Nothing to do if writing beyond end of snapshot */ |
1224 | if (bio->bi_sector >= dm_table_get_size(snap->store->ti->table)) | 1948 | if (sector >= dm_table_get_size(snap->ti->table)) |
1225 | goto next_snapshot; | 1949 | goto next_snapshot; |
1226 | 1950 | ||
1227 | /* | 1951 | /* |
1228 | * Remember, different snapshots can have | 1952 | * Remember, different snapshots can have |
1229 | * different chunk sizes. | 1953 | * different chunk sizes. |
1230 | */ | 1954 | */ |
1231 | chunk = sector_to_chunk(snap->store, bio->bi_sector); | 1955 | chunk = sector_to_chunk(snap->store, sector); |
1232 | 1956 | ||
1233 | /* | 1957 | /* |
1234 | * Check exception table to see if block | 1958 | * Check exception table to see if block |
1235 | * is already remapped in this snapshot | 1959 | * is already remapped in this snapshot |
1236 | * and trigger an exception if not. | 1960 | * and trigger an exception if not. |
1237 | * | ||
1238 | * ref_count is initialised to 1 so pending_complete() | ||
1239 | * won't destroy the primary_pe while we're inside this loop. | ||
1240 | */ | 1961 | */ |
1241 | e = lookup_exception(&snap->complete, chunk); | 1962 | e = dm_lookup_exception(&snap->complete, chunk); |
1242 | if (e) | 1963 | if (e) |
1243 | goto next_snapshot; | 1964 | goto next_snapshot; |
1244 | 1965 | ||
@@ -1253,7 +1974,7 @@ static int __origin_write(struct list_head *snapshots, struct bio *bio) | |||
1253 | goto next_snapshot; | 1974 | goto next_snapshot; |
1254 | } | 1975 | } |
1255 | 1976 | ||
1256 | e = lookup_exception(&snap->complete, chunk); | 1977 | e = dm_lookup_exception(&snap->complete, chunk); |
1257 | if (e) { | 1978 | if (e) { |
1258 | free_pending_exception(pe); | 1979 | free_pending_exception(pe); |
1259 | goto next_snapshot; | 1980 | goto next_snapshot; |
@@ -1266,59 +1987,43 @@ static int __origin_write(struct list_head *snapshots, struct bio *bio) | |||
1266 | } | 1987 | } |
1267 | } | 1988 | } |
1268 | 1989 | ||
1269 | if (!primary_pe) { | 1990 | r = DM_MAPIO_SUBMITTED; |
1270 | /* | ||
1271 | * Either every pe here has same | ||
1272 | * primary_pe or none has one yet. | ||
1273 | */ | ||
1274 | if (pe->primary_pe) | ||
1275 | primary_pe = pe->primary_pe; | ||
1276 | else { | ||
1277 | primary_pe = pe; | ||
1278 | first = 1; | ||
1279 | } | ||
1280 | |||
1281 | bio_list_add(&primary_pe->origin_bios, bio); | ||
1282 | 1991 | ||
1283 | r = DM_MAPIO_SUBMITTED; | 1992 | /* |
1284 | } | 1993 | * If an origin bio was supplied, queue it to wait for the |
1994 | * completion of this exception, and start this one last, | ||
1995 | * at the end of the function. | ||
1996 | */ | ||
1997 | if (bio) { | ||
1998 | bio_list_add(&pe->origin_bios, bio); | ||
1999 | bio = NULL; | ||
1285 | 2000 | ||
1286 | if (!pe->primary_pe) { | 2001 | if (!pe->started) { |
1287 | pe->primary_pe = primary_pe; | 2002 | pe->started = 1; |
1288 | get_pending_exception(primary_pe); | 2003 | pe_to_start_last = pe; |
2004 | } | ||
1289 | } | 2005 | } |
1290 | 2006 | ||
1291 | if (!pe->started) { | 2007 | if (!pe->started) { |
1292 | pe->started = 1; | 2008 | pe->started = 1; |
1293 | list_add_tail(&pe->list, &pe_queue); | 2009 | pe_to_start_now = pe; |
1294 | } | 2010 | } |
1295 | 2011 | ||
1296 | next_snapshot: | 2012 | next_snapshot: |
1297 | up_write(&snap->lock); | 2013 | up_write(&snap->lock); |
1298 | } | ||
1299 | 2014 | ||
1300 | if (!primary_pe) | 2015 | if (pe_to_start_now) { |
1301 | return r; | 2016 | start_copy(pe_to_start_now); |
1302 | 2017 | pe_to_start_now = NULL; | |
1303 | /* | 2018 | } |
1304 | * If this is the first time we're processing this chunk and | ||
1305 | * ref_count is now 1 it means all the pending exceptions | ||
1306 | * got completed while we were in the loop above, so it falls to | ||
1307 | * us here to remove the primary_pe and submit any origin_bios. | ||
1308 | */ | ||
1309 | |||
1310 | if (first && atomic_dec_and_test(&primary_pe->ref_count)) { | ||
1311 | flush_bios(bio_list_get(&primary_pe->origin_bios)); | ||
1312 | free_pending_exception(primary_pe); | ||
1313 | /* If we got here, pe_queue is necessarily empty. */ | ||
1314 | return r; | ||
1315 | } | 2019 | } |
1316 | 2020 | ||
1317 | /* | 2021 | /* |
1318 | * Now that we have a complete pe list we can start the copying. | 2022 | * Submit the exception against which the bio is queued last, |
2023 | * to give the other exceptions a head start. | ||
1319 | */ | 2024 | */ |
1320 | list_for_each_entry_safe(pe, next_pe, &pe_queue, list) | 2025 | if (pe_to_start_last) |
1321 | start_copy(pe); | 2026 | start_copy(pe_to_start_last); |
1322 | 2027 | ||
1323 | return r; | 2028 | return r; |
1324 | } | 2029 | } |
@@ -1334,13 +2039,48 @@ static int do_origin(struct dm_dev *origin, struct bio *bio) | |||
1334 | down_read(&_origins_lock); | 2039 | down_read(&_origins_lock); |
1335 | o = __lookup_origin(origin->bdev); | 2040 | o = __lookup_origin(origin->bdev); |
1336 | if (o) | 2041 | if (o) |
1337 | r = __origin_write(&o->snapshots, bio); | 2042 | r = __origin_write(&o->snapshots, bio->bi_sector, bio); |
1338 | up_read(&_origins_lock); | 2043 | up_read(&_origins_lock); |
1339 | 2044 | ||
1340 | return r; | 2045 | return r; |
1341 | } | 2046 | } |
1342 | 2047 | ||
1343 | /* | 2048 | /* |
2049 | * Trigger exceptions in all non-merging snapshots. | ||
2050 | * | ||
2051 | * The chunk size of the merging snapshot may be larger than the chunk | ||
2052 | * size of some other snapshot so we may need to reallocate multiple | ||
2053 | * chunks in other snapshots. | ||
2054 | * | ||
2055 | * We scan all the overlapping exceptions in the other snapshots. | ||
2056 | * Returns 1 if anything was reallocated and must be waited for, | ||
2057 | * otherwise returns 0. | ||
2058 | * | ||
2059 | * size must be a multiple of merging_snap's chunk_size. | ||
2060 | */ | ||
2061 | static int origin_write_extent(struct dm_snapshot *merging_snap, | ||
2062 | sector_t sector, unsigned size) | ||
2063 | { | ||
2064 | int must_wait = 0; | ||
2065 | sector_t n; | ||
2066 | struct origin *o; | ||
2067 | |||
2068 | /* | ||
2069 | * The origin's __minimum_chunk_size() got stored in split_io | ||
2070 | * by snapshot_merge_resume(). | ||
2071 | */ | ||
2072 | down_read(&_origins_lock); | ||
2073 | o = __lookup_origin(merging_snap->origin->bdev); | ||
2074 | for (n = 0; n < size; n += merging_snap->ti->split_io) | ||
2075 | if (__origin_write(&o->snapshots, sector + n, NULL) == | ||
2076 | DM_MAPIO_SUBMITTED) | ||
2077 | must_wait = 1; | ||
2078 | up_read(&_origins_lock); | ||
2079 | |||
2080 | return must_wait; | ||
2081 | } | ||
2082 | |||
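The caller of origin_write_extent() is not part of this hunk; the sketch below is invented and only illustrates the contract stated in the comment above: a return value of 1 means chunks were reallocated in other snapshots and the merge must wait for those copies before touching the origin extent. wait_for_triggered_exceptions() is a made-up placeholder, not a function from this patch.

	/* Invented caller sketch -- shows only the return-value contract. */
	static void example_merge_back_extent(struct dm_snapshot *merging_snap,
					      sector_t sector, unsigned size)
	{
		if (origin_write_extent(merging_snap, sector, size))
			/* Other snapshots are copying chunks out: wait for them. */
			wait_for_triggered_exceptions(merging_snap);	/* placeholder */

		/* Only now is it safe to copy the extent back from the COW device. */
	}
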
2083 | /* | ||
1344 | * Origin: maps a linear range of a device, with hooks for snapshotting. | 2084 | * Origin: maps a linear range of a device, with hooks for snapshotting. |
1345 | */ | 2085 | */ |
1346 | 2086 | ||
@@ -1359,8 +2099,7 @@ static int origin_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
1359 | return -EINVAL; | 2099 | return -EINVAL; |
1360 | } | 2100 | } |
1361 | 2101 | ||
1362 | r = dm_get_device(ti, argv[0], 0, ti->len, | 2102 | r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &dev); |
1363 | dm_table_get_mode(ti->table), &dev); | ||
1364 | if (r) { | 2103 | if (r) { |
1365 | ti->error = "Cannot get target device"; | 2104 | ti->error = "Cannot get target device"; |
1366 | return r; | 2105 | return r; |
@@ -1391,8 +2130,6 @@ static int origin_map(struct dm_target *ti, struct bio *bio, | |||
1391 | return (bio_rw(bio) == WRITE) ? do_origin(dev, bio) : DM_MAPIO_REMAPPED; | 2130 | return (bio_rw(bio) == WRITE) ? do_origin(dev, bio) : DM_MAPIO_REMAPPED; |
1392 | } | 2131 | } |
1393 | 2132 | ||
1394 | #define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r)) | ||
1395 | |||
1396 | /* | 2133 | /* |
1397 | * Set the target "split_io" field to the minimum of all the snapshots' | 2134 | * Set the target "split_io" field to the minimum of all the snapshots' |
1398 | * chunk sizes. | 2135 | * chunk sizes. |
@@ -1400,19 +2137,8 @@ static int origin_map(struct dm_target *ti, struct bio *bio, | |||
1400 | static void origin_resume(struct dm_target *ti) | 2137 | static void origin_resume(struct dm_target *ti) |
1401 | { | 2138 | { |
1402 | struct dm_dev *dev = ti->private; | 2139 | struct dm_dev *dev = ti->private; |
1403 | struct dm_snapshot *snap; | ||
1404 | struct origin *o; | ||
1405 | unsigned chunk_size = 0; | ||
1406 | |||
1407 | down_read(&_origins_lock); | ||
1408 | o = __lookup_origin(dev->bdev); | ||
1409 | if (o) | ||
1410 | list_for_each_entry (snap, &o->snapshots, list) | ||
1411 | chunk_size = min_not_zero(chunk_size, | ||
1412 | snap->store->chunk_size); | ||
1413 | up_read(&_origins_lock); | ||
1414 | 2140 | ||
1415 | ti->split_io = chunk_size; | 2141 | ti->split_io = get_origin_minimum_chunksize(dev->bdev); |
1416 | } | 2142 | } |
1417 | 2143 | ||
1418 | static int origin_status(struct dm_target *ti, status_type_t type, char *result, | 2144 | static int origin_status(struct dm_target *ti, status_type_t type, char *result, |
@@ -1455,17 +2181,35 @@ static struct target_type origin_target = { | |||
1455 | 2181 | ||
1456 | static struct target_type snapshot_target = { | 2182 | static struct target_type snapshot_target = { |
1457 | .name = "snapshot", | 2183 | .name = "snapshot", |
1458 | .version = {1, 7, 0}, | 2184 | .version = {1, 9, 0}, |
1459 | .module = THIS_MODULE, | 2185 | .module = THIS_MODULE, |
1460 | .ctr = snapshot_ctr, | 2186 | .ctr = snapshot_ctr, |
1461 | .dtr = snapshot_dtr, | 2187 | .dtr = snapshot_dtr, |
1462 | .map = snapshot_map, | 2188 | .map = snapshot_map, |
1463 | .end_io = snapshot_end_io, | 2189 | .end_io = snapshot_end_io, |
2190 | .postsuspend = snapshot_postsuspend, | ||
2191 | .preresume = snapshot_preresume, | ||
1464 | .resume = snapshot_resume, | 2192 | .resume = snapshot_resume, |
1465 | .status = snapshot_status, | 2193 | .status = snapshot_status, |
1466 | .iterate_devices = snapshot_iterate_devices, | 2194 | .iterate_devices = snapshot_iterate_devices, |
1467 | }; | 2195 | }; |
1468 | 2196 | ||
2197 | static struct target_type merge_target = { | ||
2198 | .name = dm_snapshot_merge_target_name, | ||
2199 | .version = {1, 0, 0}, | ||
2200 | .module = THIS_MODULE, | ||
2201 | .ctr = snapshot_ctr, | ||
2202 | .dtr = snapshot_dtr, | ||
2203 | .map = snapshot_merge_map, | ||
2204 | .end_io = snapshot_end_io, | ||
2205 | .presuspend = snapshot_merge_presuspend, | ||
2206 | .postsuspend = snapshot_postsuspend, | ||
2207 | .preresume = snapshot_preresume, | ||
2208 | .resume = snapshot_merge_resume, | ||
2209 | .status = snapshot_status, | ||
2210 | .iterate_devices = snapshot_iterate_devices, | ||
2211 | }; | ||
2212 | |||
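Both target_types point at the same snapshot_ctr(), so a merge would be started from userspace by loading a merge table with the same arguments as the snapshot it replaces. The line below is a hypothetical example, assuming dm_snapshot_merge_target_name expands to "snapshot-merge" and the usual <origin> <COW device> <persistent?> <chunksize> argument order; device names and sizes are made up.

	0 2097152 snapshot-merge /dev/vg/base /dev/vg/base-cow P 16
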
1469 | static int __init dm_snapshot_init(void) | 2213 | static int __init dm_snapshot_init(void) |
1470 | { | 2214 | { |
1471 | int r; | 2215 | int r; |
@@ -1477,7 +2221,7 @@ static int __init dm_snapshot_init(void) | |||
1477 | } | 2221 | } |
1478 | 2222 | ||
1479 | r = dm_register_target(&snapshot_target); | 2223 | r = dm_register_target(&snapshot_target); |
1480 | if (r) { | 2224 | if (r < 0) { |
1481 | DMERR("snapshot target register failed %d", r); | 2225 | DMERR("snapshot target register failed %d", r); |
1482 | goto bad_register_snapshot_target; | 2226 | goto bad_register_snapshot_target; |
1483 | } | 2227 | } |
@@ -1485,34 +2229,40 @@ static int __init dm_snapshot_init(void) | |||
1485 | r = dm_register_target(&origin_target); | 2229 | r = dm_register_target(&origin_target); |
1486 | if (r < 0) { | 2230 | if (r < 0) { |
1487 | DMERR("Origin target register failed %d", r); | 2231 | DMERR("Origin target register failed %d", r); |
1488 | goto bad1; | 2232 | goto bad_register_origin_target; |
2233 | } | ||
2234 | |||
2235 | r = dm_register_target(&merge_target); | ||
2236 | if (r < 0) { | ||
2237 | DMERR("Merge target register failed %d", r); | ||
2238 | goto bad_register_merge_target; | ||
1489 | } | 2239 | } |
1490 | 2240 | ||
1491 | r = init_origin_hash(); | 2241 | r = init_origin_hash(); |
1492 | if (r) { | 2242 | if (r) { |
1493 | DMERR("init_origin_hash failed."); | 2243 | DMERR("init_origin_hash failed."); |
1494 | goto bad2; | 2244 | goto bad_origin_hash; |
1495 | } | 2245 | } |
1496 | 2246 | ||
1497 | exception_cache = KMEM_CACHE(dm_snap_exception, 0); | 2247 | exception_cache = KMEM_CACHE(dm_exception, 0); |
1498 | if (!exception_cache) { | 2248 | if (!exception_cache) { |
1499 | DMERR("Couldn't create exception cache."); | 2249 | DMERR("Couldn't create exception cache."); |
1500 | r = -ENOMEM; | 2250 | r = -ENOMEM; |
1501 | goto bad3; | 2251 | goto bad_exception_cache; |
1502 | } | 2252 | } |
1503 | 2253 | ||
1504 | pending_cache = KMEM_CACHE(dm_snap_pending_exception, 0); | 2254 | pending_cache = KMEM_CACHE(dm_snap_pending_exception, 0); |
1505 | if (!pending_cache) { | 2255 | if (!pending_cache) { |
1506 | DMERR("Couldn't create pending cache."); | 2256 | DMERR("Couldn't create pending cache."); |
1507 | r = -ENOMEM; | 2257 | r = -ENOMEM; |
1508 | goto bad4; | 2258 | goto bad_pending_cache; |
1509 | } | 2259 | } |
1510 | 2260 | ||
1511 | tracked_chunk_cache = KMEM_CACHE(dm_snap_tracked_chunk, 0); | 2261 | tracked_chunk_cache = KMEM_CACHE(dm_snap_tracked_chunk, 0); |
1512 | if (!tracked_chunk_cache) { | 2262 | if (!tracked_chunk_cache) { |
1513 | DMERR("Couldn't create cache to track chunks in use."); | 2263 | DMERR("Couldn't create cache to track chunks in use."); |
1514 | r = -ENOMEM; | 2264 | r = -ENOMEM; |
1515 | goto bad5; | 2265 | goto bad_tracked_chunk_cache; |
1516 | } | 2266 | } |
1517 | 2267 | ||
1518 | ksnapd = create_singlethread_workqueue("ksnapd"); | 2268 | ksnapd = create_singlethread_workqueue("ksnapd"); |
@@ -1526,19 +2276,21 @@ static int __init dm_snapshot_init(void) | |||
1526 | 2276 | ||
1527 | bad_pending_pool: | 2277 | bad_pending_pool: |
1528 | kmem_cache_destroy(tracked_chunk_cache); | 2278 | kmem_cache_destroy(tracked_chunk_cache); |
1529 | bad5: | 2279 | bad_tracked_chunk_cache: |
1530 | kmem_cache_destroy(pending_cache); | 2280 | kmem_cache_destroy(pending_cache); |
1531 | bad4: | 2281 | bad_pending_cache: |
1532 | kmem_cache_destroy(exception_cache); | 2282 | kmem_cache_destroy(exception_cache); |
1533 | bad3: | 2283 | bad_exception_cache: |
1534 | exit_origin_hash(); | 2284 | exit_origin_hash(); |
1535 | bad2: | 2285 | bad_origin_hash: |
2286 | dm_unregister_target(&merge_target); | ||
2287 | bad_register_merge_target: | ||
1536 | dm_unregister_target(&origin_target); | 2288 | dm_unregister_target(&origin_target); |
1537 | bad1: | 2289 | bad_register_origin_target: |
1538 | dm_unregister_target(&snapshot_target); | 2290 | dm_unregister_target(&snapshot_target); |
1539 | |||
1540 | bad_register_snapshot_target: | 2291 | bad_register_snapshot_target: |
1541 | dm_exception_store_exit(); | 2292 | dm_exception_store_exit(); |
2293 | |||
1542 | return r; | 2294 | return r; |
1543 | } | 2295 | } |
1544 | 2296 | ||
@@ -1548,6 +2300,7 @@ static void __exit dm_snapshot_exit(void) | |||
1548 | 2300 | ||
1549 | dm_unregister_target(&snapshot_target); | 2301 | dm_unregister_target(&snapshot_target); |
1550 | dm_unregister_target(&origin_target); | 2302 | dm_unregister_target(&origin_target); |
2303 | dm_unregister_target(&merge_target); | ||
1551 | 2304 | ||
1552 | exit_origin_hash(); | 2305 | exit_origin_hash(); |
1553 | kmem_cache_destroy(pending_cache); | 2306 | kmem_cache_destroy(pending_cache); |
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index e0efc1adcaff..e610725db766 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c | |||
@@ -80,8 +80,7 @@ static int get_stripe(struct dm_target *ti, struct stripe_c *sc, | |||
80 | if (sscanf(argv[1], "%llu", &start) != 1) | 80 | if (sscanf(argv[1], "%llu", &start) != 1) |
81 | return -EINVAL; | 81 | return -EINVAL; |
82 | 82 | ||
83 | if (dm_get_device(ti, argv[0], start, sc->stripe_width, | 83 | if (dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), |
84 | dm_table_get_mode(ti->table), | ||
85 | &sc->stripe[stripe].dev)) | 84 | &sc->stripe[stripe].dev)) |
86 | return -ENXIO; | 85 | return -ENXIO; |
87 | 86 | ||
@@ -110,7 +109,7 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
110 | } | 109 | } |
111 | 110 | ||
112 | stripes = simple_strtoul(argv[0], &end, 10); | 111 | stripes = simple_strtoul(argv[0], &end, 10); |
113 | if (*end) { | 112 | if (!stripes || *end) { |
114 | ti->error = "Invalid stripe count"; | 113 | ti->error = "Invalid stripe count"; |
115 | return -EINVAL; | 114 | return -EINVAL; |
116 | } | 115 | } |
diff --git a/drivers/md/dm-sysfs.c b/drivers/md/dm-sysfs.c index 4b045903a4e2..84d2b91e4efb 100644 --- a/drivers/md/dm-sysfs.c +++ b/drivers/md/dm-sysfs.c | |||
@@ -59,7 +59,7 @@ static ssize_t dm_attr_uuid_show(struct mapped_device *md, char *buf) | |||
59 | 59 | ||
60 | static ssize_t dm_attr_suspended_show(struct mapped_device *md, char *buf) | 60 | static ssize_t dm_attr_suspended_show(struct mapped_device *md, char *buf) |
61 | { | 61 | { |
62 | sprintf(buf, "%d\n", dm_suspended(md)); | 62 | sprintf(buf, "%d\n", dm_suspended_md(md)); |
63 | 63 | ||
64 | return strlen(buf); | 64 | return strlen(buf); |
65 | } | 65 | } |
@@ -75,7 +75,7 @@ static struct attribute *dm_attrs[] = { | |||
75 | NULL, | 75 | NULL, |
76 | }; | 76 | }; |
77 | 77 | ||
78 | static struct sysfs_ops dm_sysfs_ops = { | 78 | static const struct sysfs_ops dm_sysfs_ops = { |
79 | .show = dm_attr_show, | 79 | .show = dm_attr_show, |
80 | }; | 80 | }; |
81 | 81 | ||
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 1a6cb3c7822e..9924ea23032d 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c | |||
@@ -12,6 +12,7 @@ | |||
12 | #include <linux/blkdev.h> | 12 | #include <linux/blkdev.h> |
13 | #include <linux/namei.h> | 13 | #include <linux/namei.h> |
14 | #include <linux/ctype.h> | 14 | #include <linux/ctype.h> |
15 | #include <linux/string.h> | ||
15 | #include <linux/slab.h> | 16 | #include <linux/slab.h> |
16 | #include <linux/interrupt.h> | 17 | #include <linux/interrupt.h> |
17 | #include <linux/mutex.h> | 18 | #include <linux/mutex.h> |
@@ -237,6 +238,9 @@ void dm_table_destroy(struct dm_table *t) | |||
237 | { | 238 | { |
238 | unsigned int i; | 239 | unsigned int i; |
239 | 240 | ||
241 | if (!t) | ||
242 | return; | ||
243 | |||
240 | while (atomic_read(&t->holders)) | 244 | while (atomic_read(&t->holders)) |
241 | msleep(1); | 245 | msleep(1); |
242 | smp_mb(); | 246 | smp_mb(); |
@@ -425,8 +429,7 @@ static int upgrade_mode(struct dm_dev_internal *dd, fmode_t new_mode, | |||
425 | * it's already present. | 429 | * it's already present. |
426 | */ | 430 | */ |
427 | static int __table_get_device(struct dm_table *t, struct dm_target *ti, | 431 | static int __table_get_device(struct dm_table *t, struct dm_target *ti, |
428 | const char *path, sector_t start, sector_t len, | 432 | const char *path, fmode_t mode, struct dm_dev **result) |
429 | fmode_t mode, struct dm_dev **result) | ||
430 | { | 433 | { |
431 | int r; | 434 | int r; |
432 | dev_t uninitialized_var(dev); | 435 | dev_t uninitialized_var(dev); |
@@ -499,16 +502,15 @@ int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev, | |||
499 | return 0; | 502 | return 0; |
500 | } | 503 | } |
501 | 504 | ||
502 | if (blk_stack_limits(limits, &q->limits, start << 9) < 0) | 505 | if (bdev_stack_limits(limits, bdev, start) < 0) |
503 | DMWARN("%s: target device %s is misaligned: " | 506 | DMWARN("%s: adding target device %s caused an alignment inconsistency: " |
504 | "physical_block_size=%u, logical_block_size=%u, " | 507 | "physical_block_size=%u, logical_block_size=%u, " |
505 | "alignment_offset=%u, start=%llu", | 508 | "alignment_offset=%u, start=%llu", |
506 | dm_device_name(ti->table->md), bdevname(bdev, b), | 509 | dm_device_name(ti->table->md), bdevname(bdev, b), |
507 | q->limits.physical_block_size, | 510 | q->limits.physical_block_size, |
508 | q->limits.logical_block_size, | 511 | q->limits.logical_block_size, |
509 | q->limits.alignment_offset, | 512 | q->limits.alignment_offset, |
510 | (unsigned long long) start << 9); | 513 | (unsigned long long) start << SECTOR_SHIFT); |
511 | |||
512 | 514 | ||
513 | /* | 515 | /* |
514 | * Check if merge fn is supported. | 516 | * Check if merge fn is supported. |
@@ -524,11 +526,10 @@ int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev, | |||
524 | } | 526 | } |
525 | EXPORT_SYMBOL_GPL(dm_set_device_limits); | 527 | EXPORT_SYMBOL_GPL(dm_set_device_limits); |
526 | 528 | ||
527 | int dm_get_device(struct dm_target *ti, const char *path, sector_t start, | 529 | int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode, |
528 | sector_t len, fmode_t mode, struct dm_dev **result) | 530 | struct dm_dev **result) |
529 | { | 531 | { |
530 | return __table_get_device(ti->table, ti, path, | 532 | return __table_get_device(ti->table, ti, path, mode, result); |
531 | start, len, mode, result); | ||
532 | } | 533 | } |
533 | 534 | ||
534 | 535 | ||
@@ -600,11 +601,8 @@ int dm_split_args(int *argc, char ***argvp, char *input) | |||
600 | return -ENOMEM; | 601 | return -ENOMEM; |
601 | 602 | ||
602 | while (1) { | 603 | while (1) { |
603 | start = end; | ||
604 | |||
605 | /* Skip whitespace */ | 604 | /* Skip whitespace */ |
606 | while (*start && isspace(*start)) | 605 | start = skip_spaces(end); |
607 | start++; | ||
608 | 606 | ||
609 | if (!*start) | 607 | if (!*start) |
610 | break; /* success, we hit the end */ | 608 | break; /* success, we hit the end */ |
@@ -1025,9 +1023,9 @@ combine_limits: | |||
1025 | * for the table. | 1023 | * for the table. |
1026 | */ | 1024 | */ |
1027 | if (blk_stack_limits(limits, &ti_limits, 0) < 0) | 1025 | if (blk_stack_limits(limits, &ti_limits, 0) < 0) |
1028 | DMWARN("%s: target device " | 1026 | DMWARN("%s: adding target device " |
1029 | "(start sect %llu len %llu) " | 1027 | "(start sect %llu len %llu) " |
1030 | "is misaligned", | 1028 | "caused an alignment inconsistency", |
1031 | dm_device_name(table->md), | 1029 | dm_device_name(table->md), |
1032 | (unsigned long long) ti->begin, | 1030 | (unsigned long long) ti->begin, |
1033 | (unsigned long long) ti->len); | 1031 | (unsigned long long) ti->len); |
@@ -1079,15 +1077,6 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, | |||
1079 | struct queue_limits *limits) | 1077 | struct queue_limits *limits) |
1080 | { | 1078 | { |
1081 | /* | 1079 | /* |
1082 | * Each target device in the table has a data area that should normally | ||
1083 | * be aligned such that the DM device's alignment_offset is 0. | ||
1084 | * FIXME: Propagate alignment_offsets up the stack and warn of | ||
1085 | * sub-optimal or inconsistent settings. | ||
1086 | */ | ||
1087 | limits->alignment_offset = 0; | ||
1088 | limits->misaligned = 0; | ||
1089 | |||
1090 | /* | ||
1091 | * Copy table's limits to the DM device's request_queue | 1080 | * Copy table's limits to the DM device's request_queue |
1092 | */ | 1081 | */ |
1093 | q->limits = *limits; | 1082 | q->limits = *limits; |
@@ -1240,8 +1229,6 @@ void dm_table_unplug_all(struct dm_table *t) | |||
1240 | 1229 | ||
1241 | struct mapped_device *dm_table_get_md(struct dm_table *t) | 1230 | struct mapped_device *dm_table_get_md(struct dm_table *t) |
1242 | { | 1231 | { |
1243 | dm_get(t->md); | ||
1244 | |||
1245 | return t->md; | 1232 | return t->md; |
1246 | } | 1233 | } |
1247 | 1234 | ||
diff --git a/drivers/md/dm-target.c b/drivers/md/dm-target.c index 04feccf2a997..11dea11dc0b6 100644 --- a/drivers/md/dm-target.c +++ b/drivers/md/dm-target.c | |||
@@ -10,7 +10,6 @@ | |||
10 | #include <linux/init.h> | 10 | #include <linux/init.h> |
11 | #include <linux/kmod.h> | 11 | #include <linux/kmod.h> |
12 | #include <linux/bio.h> | 12 | #include <linux/bio.h> |
13 | #include <linux/slab.h> | ||
14 | 13 | ||
15 | #define DM_MSG_PREFIX "target" | 14 | #define DM_MSG_PREFIX "target" |
16 | 15 | ||
diff --git a/drivers/md/dm-uevent.c b/drivers/md/dm-uevent.c index 6f65883aef12..6b1e3b61b25e 100644 --- a/drivers/md/dm-uevent.c +++ b/drivers/md/dm-uevent.c | |||
@@ -139,14 +139,13 @@ void dm_send_uevents(struct list_head *events, struct kobject *kobj) | |||
139 | list_del_init(&event->elist); | 139 | list_del_init(&event->elist); |
140 | 140 | ||
141 | /* | 141 | /* |
142 | * Need to call dm_copy_name_and_uuid from here for now. | 142 | * When a device is being removed this copy fails and we |
143 | * Context of previous var adds and locking used for | 143 | * discard these unsent events. |
144 | * hash_cell not compatable. | ||
145 | */ | 144 | */ |
146 | if (dm_copy_name_and_uuid(event->md, event->name, | 145 | if (dm_copy_name_and_uuid(event->md, event->name, |
147 | event->uuid)) { | 146 | event->uuid)) { |
148 | DMERR("%s: dm_copy_name_and_uuid() failed", | 147 | DMINFO("%s: skipping sending uevent for lost device", |
149 | __func__); | 148 | __func__); |
150 | goto uevent_free; | 149 | goto uevent_free; |
151 | } | 150 | } |
152 | 151 | ||
@@ -188,7 +187,7 @@ void dm_path_uevent(enum dm_uevent_type event_type, struct dm_target *ti, | |||
188 | 187 | ||
189 | if (event_type >= ARRAY_SIZE(_dm_uevent_type_names)) { | 188 | if (event_type >= ARRAY_SIZE(_dm_uevent_type_names)) { |
190 | DMERR("%s: Invalid event_type %d", __func__, event_type); | 189 | DMERR("%s: Invalid event_type %d", __func__, event_type); |
191 | goto out; | 190 | return; |
192 | } | 191 | } |
193 | 192 | ||
194 | event = dm_build_path_uevent(md, ti, | 193 | event = dm_build_path_uevent(md, ti, |
@@ -196,12 +195,9 @@ void dm_path_uevent(enum dm_uevent_type event_type, struct dm_target *ti, | |||
196 | _dm_uevent_type_names[event_type].name, | 195 | _dm_uevent_type_names[event_type].name, |
197 | path, nr_valid_paths); | 196 | path, nr_valid_paths); |
198 | if (IS_ERR(event)) | 197 | if (IS_ERR(event)) |
199 | goto out; | 198 | return; |
200 | 199 | ||
201 | dm_uevent_add(md, &event->elist); | 200 | dm_uevent_add(md, &event->elist); |
202 | |||
203 | out: | ||
204 | dm_put(md); | ||
205 | } | 201 | } |
206 | EXPORT_SYMBOL_GPL(dm_path_uevent); | 202 | EXPORT_SYMBOL_GPL(dm_path_uevent); |
207 | 203 | ||
diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 724efc63904d..d21e1284604f 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c | |||
@@ -143,9 +143,19 @@ struct mapped_device { | |||
143 | int barrier_error; | 143 | int barrier_error; |
144 | 144 | ||
145 | /* | 145 | /* |
146 | * Protect barrier_error from concurrent endio processing | ||
147 | * in request-based dm. | ||
148 | */ | ||
149 | spinlock_t barrier_error_lock; | ||
150 | |||
151 | /* | ||
146 | * Processing queue (flush/barriers) | 152 | * Processing queue (flush/barriers) |
147 | */ | 153 | */ |
148 | struct workqueue_struct *wq; | 154 | struct workqueue_struct *wq; |
155 | struct work_struct barrier_work; | ||
156 | |||
157 | /* A pointer to the currently processing pre/post flush request */ | ||
158 | struct request *flush_request; | ||
149 | 159 | ||
150 | /* | 160 | /* |
151 | * The current mapping. | 161 | * The current mapping. |
@@ -178,9 +188,6 @@ struct mapped_device { | |||
178 | /* forced geometry settings */ | 188 | /* forced geometry settings */ |
179 | struct hd_geometry geometry; | 189 | struct hd_geometry geometry; |
180 | 190 | ||
181 | /* marker of flush suspend for request-based dm */ | ||
182 | struct request suspend_rq; | ||
183 | |||
184 | /* For saving the address of __make_request for request based dm */ | 191 | /* For saving the address of __make_request for request based dm */ |
185 | make_request_fn *saved_make_request_fn; | 192 | make_request_fn *saved_make_request_fn; |
186 | 193 | ||
@@ -275,6 +282,7 @@ static int (*_inits[])(void) __initdata = { | |||
275 | dm_target_init, | 282 | dm_target_init, |
276 | dm_linear_init, | 283 | dm_linear_init, |
277 | dm_stripe_init, | 284 | dm_stripe_init, |
285 | dm_io_init, | ||
278 | dm_kcopyd_init, | 286 | dm_kcopyd_init, |
279 | dm_interface_init, | 287 | dm_interface_init, |
280 | }; | 288 | }; |
@@ -284,6 +292,7 @@ static void (*_exits[])(void) = { | |||
284 | dm_target_exit, | 292 | dm_target_exit, |
285 | dm_linear_exit, | 293 | dm_linear_exit, |
286 | dm_stripe_exit, | 294 | dm_stripe_exit, |
295 | dm_io_exit, | ||
287 | dm_kcopyd_exit, | 296 | dm_kcopyd_exit, |
288 | dm_interface_exit, | 297 | dm_interface_exit, |
289 | }; | 298 | }; |
@@ -320,6 +329,11 @@ static void __exit dm_exit(void) | |||
320 | /* | 329 | /* |
321 | * Block device functions | 330 | * Block device functions |
322 | */ | 331 | */ |
332 | int dm_deleting_md(struct mapped_device *md) | ||
333 | { | ||
334 | return test_bit(DMF_DELETING, &md->flags); | ||
335 | } | ||
336 | |||
323 | static int dm_blk_open(struct block_device *bdev, fmode_t mode) | 337 | static int dm_blk_open(struct block_device *bdev, fmode_t mode) |
324 | { | 338 | { |
325 | struct mapped_device *md; | 339 | struct mapped_device *md; |
@@ -331,7 +345,7 @@ static int dm_blk_open(struct block_device *bdev, fmode_t mode) | |||
331 | goto out; | 345 | goto out; |
332 | 346 | ||
333 | if (test_bit(DMF_FREEING, &md->flags) || | 347 | if (test_bit(DMF_FREEING, &md->flags) || |
334 | test_bit(DMF_DELETING, &md->flags)) { | 348 | dm_deleting_md(md)) { |
335 | md = NULL; | 349 | md = NULL; |
336 | goto out; | 350 | goto out; |
337 | } | 351 | } |
@@ -388,7 +402,7 @@ static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode, | |||
388 | unsigned int cmd, unsigned long arg) | 402 | unsigned int cmd, unsigned long arg) |
389 | { | 403 | { |
390 | struct mapped_device *md = bdev->bd_disk->private_data; | 404 | struct mapped_device *md = bdev->bd_disk->private_data; |
391 | struct dm_table *map = dm_get_table(md); | 405 | struct dm_table *map = dm_get_live_table(md); |
392 | struct dm_target *tgt; | 406 | struct dm_target *tgt; |
393 | int r = -ENOTTY; | 407 | int r = -ENOTTY; |
394 | 408 | ||
@@ -401,7 +415,7 @@ static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode, | |||
401 | 415 | ||
402 | tgt = dm_table_get_target(map, 0); | 416 | tgt = dm_table_get_target(map, 0); |
403 | 417 | ||
404 | if (dm_suspended(md)) { | 418 | if (dm_suspended_md(md)) { |
405 | r = -EAGAIN; | 419 | r = -EAGAIN; |
406 | goto out; | 420 | goto out; |
407 | } | 421 | } |
@@ -430,9 +444,10 @@ static void free_tio(struct mapped_device *md, struct dm_target_io *tio) | |||
430 | mempool_free(tio, md->tio_pool); | 444 | mempool_free(tio, md->tio_pool); |
431 | } | 445 | } |
432 | 446 | ||
433 | static struct dm_rq_target_io *alloc_rq_tio(struct mapped_device *md) | 447 | static struct dm_rq_target_io *alloc_rq_tio(struct mapped_device *md, |
448 | gfp_t gfp_mask) | ||
434 | { | 449 | { |
435 | return mempool_alloc(md->tio_pool, GFP_ATOMIC); | 450 | return mempool_alloc(md->tio_pool, gfp_mask); |
436 | } | 451 | } |
437 | 452 | ||
438 | static void free_rq_tio(struct dm_rq_target_io *tio) | 453 | static void free_rq_tio(struct dm_rq_target_io *tio) |
@@ -450,6 +465,12 @@ static void free_bio_info(struct dm_rq_clone_bio_info *info) | |||
450 | mempool_free(info, info->tio->md->io_pool); | 465 | mempool_free(info, info->tio->md->io_pool); |
451 | } | 466 | } |
452 | 467 | ||
468 | static int md_in_flight(struct mapped_device *md) | ||
469 | { | ||
470 | return atomic_read(&md->pending[READ]) + | ||
471 | atomic_read(&md->pending[WRITE]); | ||
472 | } | ||
473 | |||
453 | static void start_io_acct(struct dm_io *io) | 474 | static void start_io_acct(struct dm_io *io) |
454 | { | 475 | { |
455 | struct mapped_device *md = io->md; | 476 | struct mapped_device *md = io->md; |
@@ -512,7 +533,7 @@ static void queue_io(struct mapped_device *md, struct bio *bio) | |||
512 | * function to access the md->map field, and make sure they call | 533 | * function to access the md->map field, and make sure they call |
513 | * dm_table_put() when finished. | 534 | * dm_table_put() when finished. |
514 | */ | 535 | */ |
515 | struct dm_table *dm_get_table(struct mapped_device *md) | 536 | struct dm_table *dm_get_live_table(struct mapped_device *md) |
516 | { | 537 | { |
517 | struct dm_table *t; | 538 | struct dm_table *t; |
518 | unsigned long flags; | 539 | unsigned long flags; |
@@ -614,8 +635,10 @@ static void dec_pending(struct dm_io *io, int error) | |||
614 | if (!md->barrier_error && io_error != -EOPNOTSUPP) | 635 | if (!md->barrier_error && io_error != -EOPNOTSUPP) |
615 | md->barrier_error = io_error; | 636 | md->barrier_error = io_error; |
616 | end_io_acct(io); | 637 | end_io_acct(io); |
638 | free_io(md, io); | ||
617 | } else { | 639 | } else { |
618 | end_io_acct(io); | 640 | end_io_acct(io); |
641 | free_io(md, io); | ||
619 | 642 | ||
620 | if (io_error != DM_ENDIO_REQUEUE) { | 643 | if (io_error != DM_ENDIO_REQUEUE) { |
621 | trace_block_bio_complete(md->queue, bio); | 644 | trace_block_bio_complete(md->queue, bio); |
@@ -623,8 +646,6 @@ static void dec_pending(struct dm_io *io, int error) | |||
623 | bio_endio(bio, io_error); | 646 | bio_endio(bio, io_error); |
624 | } | 647 | } |
625 | } | 648 | } |
626 | |||
627 | free_io(md, io); | ||
628 | } | 649 | } |
629 | } | 650 | } |
630 | 651 | ||
@@ -716,28 +737,38 @@ static void end_clone_bio(struct bio *clone, int error) | |||
716 | blk_update_request(tio->orig, 0, nr_bytes); | 737 | blk_update_request(tio->orig, 0, nr_bytes); |
717 | } | 738 | } |
718 | 739 | ||
740 | static void store_barrier_error(struct mapped_device *md, int error) | ||
741 | { | ||
742 | unsigned long flags; | ||
743 | |||
744 | spin_lock_irqsave(&md->barrier_error_lock, flags); | ||
745 | /* | ||
746 | * Basically, the first error is taken, but: | ||
747 | * -EOPNOTSUPP supersedes any I/O error. | ||
748 | * Requeue request supersedes any I/O error but -EOPNOTSUPP. | ||
749 | */ | ||
750 | if (!md->barrier_error || error == -EOPNOTSUPP || | ||
751 | (md->barrier_error != -EOPNOTSUPP && | ||
752 | error == DM_ENDIO_REQUEUE)) | ||
753 | md->barrier_error = error; | ||
754 | spin_unlock_irqrestore(&md->barrier_error_lock, flags); | ||
755 | } | ||
756 | |||
719 | /* | 757 | /* |
720 | * Don't touch any member of the md after calling this function because | 758 | * Don't touch any member of the md after calling this function because |
721 | * the md may be freed in dm_put() at the end of this function. | 759 | * the md may be freed in dm_put() at the end of this function. |
722 | * Or do dm_get() before calling this function and dm_put() later. | 760 | * Or do dm_get() before calling this function and dm_put() later. |
723 | */ | 761 | */ |
724 | static void rq_completed(struct mapped_device *md, int run_queue) | 762 | static void rq_completed(struct mapped_device *md, int rw, int run_queue) |
725 | { | 763 | { |
726 | int wakeup_waiters = 0; | 764 | atomic_dec(&md->pending[rw]); |
727 | struct request_queue *q = md->queue; | ||
728 | unsigned long flags; | ||
729 | |||
730 | spin_lock_irqsave(q->queue_lock, flags); | ||
731 | if (!queue_in_flight(q)) | ||
732 | wakeup_waiters = 1; | ||
733 | spin_unlock_irqrestore(q->queue_lock, flags); | ||
734 | 765 | ||
735 | /* nudge anyone waiting on suspend queue */ | 766 | /* nudge anyone waiting on suspend queue */ |
736 | if (wakeup_waiters) | 767 | if (!md_in_flight(md)) |
737 | wake_up(&md->wait); | 768 | wake_up(&md->wait); |
738 | 769 | ||
739 | if (run_queue) | 770 | if (run_queue) |
740 | blk_run_queue(q); | 771 | blk_run_queue(md->queue); |
741 | 772 | ||
742 | /* | 773 | /* |
743 | * dm_put() must be at the end of this function. See the comment above | 774 | * dm_put() must be at the end of this function. See the comment above |
@@ -753,6 +784,44 @@ static void free_rq_clone(struct request *clone) | |||
753 | free_rq_tio(tio); | 784 | free_rq_tio(tio); |
754 | } | 785 | } |
755 | 786 | ||
787 | /* | ||
788 | * Complete the clone and the original request. | ||
789 | * Must be called without queue lock. | ||
790 | */ | ||
791 | static void dm_end_request(struct request *clone, int error) | ||
792 | { | ||
793 | int rw = rq_data_dir(clone); | ||
794 | int run_queue = 1; | ||
795 | bool is_barrier = blk_barrier_rq(clone); | ||
796 | struct dm_rq_target_io *tio = clone->end_io_data; | ||
797 | struct mapped_device *md = tio->md; | ||
798 | struct request *rq = tio->orig; | ||
799 | |||
800 | if (blk_pc_request(rq) && !is_barrier) { | ||
801 | rq->errors = clone->errors; | ||
802 | rq->resid_len = clone->resid_len; | ||
803 | |||
804 | if (rq->sense) | ||
805 | /* | ||
806 | * We are using the sense buffer of the original | ||
807 | * request. | ||
808 | * So setting the length of the sense data is enough. | ||
809 | */ | ||
810 | rq->sense_len = clone->sense_len; | ||
811 | } | ||
812 | |||
813 | free_rq_clone(clone); | ||
814 | |||
815 | if (unlikely(is_barrier)) { | ||
816 | if (unlikely(error)) | ||
817 | store_barrier_error(md, error); | ||
818 | run_queue = 0; | ||
819 | } else | ||
820 | blk_end_request_all(rq, error); | ||
821 | |||
822 | rq_completed(md, rw, run_queue); | ||
823 | } | ||
824 | |||
756 | static void dm_unprep_request(struct request *rq) | 825 | static void dm_unprep_request(struct request *rq) |
757 | { | 826 | { |
758 | struct request *clone = rq->special; | 827 | struct request *clone = rq->special; |
@@ -768,12 +837,23 @@ static void dm_unprep_request(struct request *rq) | |||
768 | */ | 837 | */ |
769 | void dm_requeue_unmapped_request(struct request *clone) | 838 | void dm_requeue_unmapped_request(struct request *clone) |
770 | { | 839 | { |
840 | int rw = rq_data_dir(clone); | ||
771 | struct dm_rq_target_io *tio = clone->end_io_data; | 841 | struct dm_rq_target_io *tio = clone->end_io_data; |
772 | struct mapped_device *md = tio->md; | 842 | struct mapped_device *md = tio->md; |
773 | struct request *rq = tio->orig; | 843 | struct request *rq = tio->orig; |
774 | struct request_queue *q = rq->q; | 844 | struct request_queue *q = rq->q; |
775 | unsigned long flags; | 845 | unsigned long flags; |
776 | 846 | ||
847 | if (unlikely(blk_barrier_rq(clone))) { | ||
848 | /* | ||
849 | * Barrier clones share an original request. | ||
850 | * Leave it to dm_end_request(), which handles this special | ||
851 | * case. | ||
852 | */ | ||
853 | dm_end_request(clone, DM_ENDIO_REQUEUE); | ||
854 | return; | ||
855 | } | ||
856 | |||
777 | dm_unprep_request(rq); | 857 | dm_unprep_request(rq); |
778 | 858 | ||
779 | spin_lock_irqsave(q->queue_lock, flags); | 859 | spin_lock_irqsave(q->queue_lock, flags); |
@@ -782,7 +862,7 @@ void dm_requeue_unmapped_request(struct request *clone) | |||
782 | blk_requeue_request(q, rq); | 862 | blk_requeue_request(q, rq); |
783 | spin_unlock_irqrestore(q->queue_lock, flags); | 863 | spin_unlock_irqrestore(q->queue_lock, flags); |
784 | 864 | ||
785 | rq_completed(md, 0); | 865 | rq_completed(md, rw, 0); |
786 | } | 866 | } |
787 | EXPORT_SYMBOL_GPL(dm_requeue_unmapped_request); | 867 | EXPORT_SYMBOL_GPL(dm_requeue_unmapped_request); |
788 | 868 | ||
@@ -815,34 +895,28 @@ static void start_queue(struct request_queue *q) | |||
815 | spin_unlock_irqrestore(q->queue_lock, flags); | 895 | spin_unlock_irqrestore(q->queue_lock, flags); |
816 | } | 896 | } |
817 | 897 | ||
818 | /* | 898 | static void dm_done(struct request *clone, int error, bool mapped) |
819 | * Complete the clone and the original request. | ||
820 | * Must be called without queue lock. | ||
821 | */ | ||
822 | static void dm_end_request(struct request *clone, int error) | ||
823 | { | 899 | { |
900 | int r = error; | ||
824 | struct dm_rq_target_io *tio = clone->end_io_data; | 901 | struct dm_rq_target_io *tio = clone->end_io_data; |
825 | struct mapped_device *md = tio->md; | 902 | dm_request_endio_fn rq_end_io = tio->ti->type->rq_end_io; |
826 | struct request *rq = tio->orig; | ||
827 | 903 | ||
828 | if (blk_pc_request(rq)) { | 904 | if (mapped && rq_end_io) |
829 | rq->errors = clone->errors; | 905 | r = rq_end_io(tio->ti, clone, error, &tio->info); |
830 | rq->resid_len = clone->resid_len; | ||
831 | 906 | ||
832 | if (rq->sense) | 907 | if (r <= 0) |
833 | /* | 908 | /* The target wants to complete the I/O */ |
834 | * We are using the sense buffer of the original | 909 | dm_end_request(clone, r); |
835 | * request. | 910 | else if (r == DM_ENDIO_INCOMPLETE) |
836 | * So setting the length of the sense data is enough. | 911 | /* The target will handle the I/O */ |
837 | */ | 912 | return; |
838 | rq->sense_len = clone->sense_len; | 913 | else if (r == DM_ENDIO_REQUEUE) |
914 | /* The target wants to requeue the I/O */ | ||
915 | dm_requeue_unmapped_request(clone); | ||
916 | else { | ||
917 | DMWARN("unimplemented target endio return value: %d", r); | ||
918 | BUG(); | ||
839 | } | 919 | } |
840 | |||
841 | free_rq_clone(clone); | ||
842 | |||
843 | blk_end_request_all(rq, error); | ||
844 | |||
845 | rq_completed(md, 1); | ||
846 | } | 920 | } |
847 | 921 | ||
848 | /* | 922 | /* |
@@ -850,27 +924,14 @@ static void dm_end_request(struct request *clone, int error) | |||
850 | */ | 924 | */ |
851 | static void dm_softirq_done(struct request *rq) | 925 | static void dm_softirq_done(struct request *rq) |
852 | { | 926 | { |
927 | bool mapped = true; | ||
853 | struct request *clone = rq->completion_data; | 928 | struct request *clone = rq->completion_data; |
854 | struct dm_rq_target_io *tio = clone->end_io_data; | 929 | struct dm_rq_target_io *tio = clone->end_io_data; |
855 | dm_request_endio_fn rq_end_io = tio->ti->type->rq_end_io; | ||
856 | int error = tio->error; | ||
857 | 930 | ||
858 | if (!(rq->cmd_flags & REQ_FAILED) && rq_end_io) | 931 | if (rq->cmd_flags & REQ_FAILED) |
859 | error = rq_end_io(tio->ti, clone, error, &tio->info); | 932 | mapped = false; |
860 | 933 | ||
861 | if (error <= 0) | 934 | dm_done(clone, tio->error, mapped); |
862 | /* The target wants to complete the I/O */ | ||
863 | dm_end_request(clone, error); | ||
864 | else if (error == DM_ENDIO_INCOMPLETE) | ||
865 | /* The target will handle the I/O */ | ||
866 | return; | ||
867 | else if (error == DM_ENDIO_REQUEUE) | ||
868 | /* The target wants to requeue the I/O */ | ||
869 | dm_requeue_unmapped_request(clone); | ||
870 | else { | ||
871 | DMWARN("unimplemented target endio return value: %d", error); | ||
872 | BUG(); | ||
873 | } | ||
874 | } | 935 | } |
875 | 936 | ||
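dm_done() dispatches on whatever a request-based target returns from its rq_end_io hook. A hypothetical target-side hook (not part of this patch; the signature follows the dm_request_endio_fn typedef as assumed here) shows what those return values mean in practice:

static int example_rq_end_io(struct dm_target *ti, struct request *clone,
			     int error, union map_info *map_context)
{
	if (error == -EIO)
		/* e.g. a retryable path failure: dm requeues the original */
		return DM_ENDIO_REQUEUE;

	/*
	 * DM_ENDIO_INCOMPLETE would tell dm that the target completes the
	 * I/O itself; 0 or a negative errno lets dm_end_request() finish
	 * the original request.
	 */
	return error;
}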
876 | /* | 937 | /* |
@@ -882,6 +943,19 @@ static void dm_complete_request(struct request *clone, int error) | |||
882 | struct dm_rq_target_io *tio = clone->end_io_data; | 943 | struct dm_rq_target_io *tio = clone->end_io_data; |
883 | struct request *rq = tio->orig; | 944 | struct request *rq = tio->orig; |
884 | 945 | ||
946 | if (unlikely(blk_barrier_rq(clone))) { | ||
947 | /* | ||
948 | * Barrier clones share an original request. So can't use | ||
949 | * softirq_done with the original. | ||
950 | * Pass the clone to dm_done() directly in this special case. | ||
951 | * It is safe (even if clone->q->queue_lock is held here) | ||
952 | * because there is no I/O dispatching during the completion | ||
953 | * of barrier clone. | ||
954 | */ | ||
955 | dm_done(clone, error, true); | ||
956 | return; | ||
957 | } | ||
958 | |||
885 | tio->error = error; | 959 | tio->error = error; |
886 | rq->completion_data = clone; | 960 | rq->completion_data = clone; |
887 | blk_complete_request(rq); | 961 | blk_complete_request(rq); |
@@ -898,6 +972,17 @@ void dm_kill_unmapped_request(struct request *clone, int error) | |||
898 | struct dm_rq_target_io *tio = clone->end_io_data; | 972 | struct dm_rq_target_io *tio = clone->end_io_data; |
899 | struct request *rq = tio->orig; | 973 | struct request *rq = tio->orig; |
900 | 974 | ||
975 | if (unlikely(blk_barrier_rq(clone))) { | ||
976 | /* | ||
977 | * Barrier clones share an original request. | ||
978 | * Leave it to dm_end_request(), which handles this special | ||
979 | * case. | ||
980 | */ | ||
981 | BUG_ON(error > 0); | ||
982 | dm_end_request(clone, error); | ||
983 | return; | ||
984 | } | ||
985 | |||
901 | rq->cmd_flags |= REQ_FAILED; | 986 | rq->cmd_flags |= REQ_FAILED; |
902 | dm_complete_request(clone, error); | 987 | dm_complete_request(clone, error); |
903 | } | 988 | } |
@@ -1214,7 +1299,7 @@ static void __split_and_process_bio(struct mapped_device *md, struct bio *bio) | |||
1214 | struct clone_info ci; | 1299 | struct clone_info ci; |
1215 | int error = 0; | 1300 | int error = 0; |
1216 | 1301 | ||
1217 | ci.map = dm_get_table(md); | 1302 | ci.map = dm_get_live_table(md); |
1218 | if (unlikely(!ci.map)) { | 1303 | if (unlikely(!ci.map)) { |
1219 | if (!bio_rw_flagged(bio, BIO_RW_BARRIER)) | 1304 | if (!bio_rw_flagged(bio, BIO_RW_BARRIER)) |
1220 | bio_io_error(bio); | 1305 | bio_io_error(bio); |
@@ -1255,7 +1340,7 @@ static int dm_merge_bvec(struct request_queue *q, | |||
1255 | struct bio_vec *biovec) | 1340 | struct bio_vec *biovec) |
1256 | { | 1341 | { |
1257 | struct mapped_device *md = q->queuedata; | 1342 | struct mapped_device *md = q->queuedata; |
1258 | struct dm_table *map = dm_get_table(md); | 1343 | struct dm_table *map = dm_get_live_table(md); |
1259 | struct dm_target *ti; | 1344 | struct dm_target *ti; |
1260 | sector_t max_sectors; | 1345 | sector_t max_sectors; |
1261 | int max_size = 0; | 1346 | int max_size = 0; |
@@ -1352,11 +1437,6 @@ static int dm_make_request(struct request_queue *q, struct bio *bio) | |||
1352 | { | 1437 | { |
1353 | struct mapped_device *md = q->queuedata; | 1438 | struct mapped_device *md = q->queuedata; |
1354 | 1439 | ||
1355 | if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) { | ||
1356 | bio_endio(bio, -EOPNOTSUPP); | ||
1357 | return 0; | ||
1358 | } | ||
1359 | |||
1360 | return md->saved_make_request_fn(q, bio); /* call __make_request() */ | 1440 | return md->saved_make_request_fn(q, bio); /* call __make_request() */ |
1361 | } | 1441 | } |
1362 | 1442 | ||
@@ -1375,6 +1455,25 @@ static int dm_request(struct request_queue *q, struct bio *bio) | |||
1375 | return _dm_request(q, bio); | 1455 | return _dm_request(q, bio); |
1376 | } | 1456 | } |
1377 | 1457 | ||
1458 | /* | ||
1459 | * Mark this request as flush request, so that dm_request_fn() can | ||
1460 | * recognize. | ||
1461 | */ | ||
1462 | static void dm_rq_prepare_flush(struct request_queue *q, struct request *rq) | ||
1463 | { | ||
1464 | rq->cmd_type = REQ_TYPE_LINUX_BLOCK; | ||
1465 | rq->cmd[0] = REQ_LB_OP_FLUSH; | ||
1466 | } | ||
1467 | |||
1468 | static bool dm_rq_is_flush_request(struct request *rq) | ||
1469 | { | ||
1470 | if (rq->cmd_type == REQ_TYPE_LINUX_BLOCK && | ||
1471 | rq->cmd[0] == REQ_LB_OP_FLUSH) | ||
1472 | return true; | ||
1473 | else | ||
1474 | return false; | ||
1475 | } | ||
1476 | |||
1378 | void dm_dispatch_request(struct request *rq) | 1477 | void dm_dispatch_request(struct request *rq) |
1379 | { | 1478 | { |
1380 | int r; | 1479 | int r; |
@@ -1420,25 +1519,54 @@ static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig, | |||
1420 | static int setup_clone(struct request *clone, struct request *rq, | 1519 | static int setup_clone(struct request *clone, struct request *rq, |
1421 | struct dm_rq_target_io *tio) | 1520 | struct dm_rq_target_io *tio) |
1422 | { | 1521 | { |
1423 | int r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC, | 1522 | int r; |
1424 | dm_rq_bio_constructor, tio); | ||
1425 | 1523 | ||
1426 | if (r) | 1524 | if (dm_rq_is_flush_request(rq)) { |
1427 | return r; | 1525 | blk_rq_init(NULL, clone); |
1526 | clone->cmd_type = REQ_TYPE_FS; | ||
1527 | clone->cmd_flags |= (REQ_HARDBARRIER | WRITE); | ||
1528 | } else { | ||
1529 | r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC, | ||
1530 | dm_rq_bio_constructor, tio); | ||
1531 | if (r) | ||
1532 | return r; | ||
1533 | |||
1534 | clone->cmd = rq->cmd; | ||
1535 | clone->cmd_len = rq->cmd_len; | ||
1536 | clone->sense = rq->sense; | ||
1537 | clone->buffer = rq->buffer; | ||
1538 | } | ||
1428 | 1539 | ||
1429 | clone->cmd = rq->cmd; | ||
1430 | clone->cmd_len = rq->cmd_len; | ||
1431 | clone->sense = rq->sense; | ||
1432 | clone->buffer = rq->buffer; | ||
1433 | clone->end_io = end_clone_request; | 1540 | clone->end_io = end_clone_request; |
1434 | clone->end_io_data = tio; | 1541 | clone->end_io_data = tio; |
1435 | 1542 | ||
1436 | return 0; | 1543 | return 0; |
1437 | } | 1544 | } |
1438 | 1545 | ||
1439 | static int dm_rq_flush_suspending(struct mapped_device *md) | 1546 | static struct request *clone_rq(struct request *rq, struct mapped_device *md, |
1547 | gfp_t gfp_mask) | ||
1440 | { | 1548 | { |
1441 | return !md->suspend_rq.special; | 1549 | struct request *clone; |
1550 | struct dm_rq_target_io *tio; | ||
1551 | |||
1552 | tio = alloc_rq_tio(md, gfp_mask); | ||
1553 | if (!tio) | ||
1554 | return NULL; | ||
1555 | |||
1556 | tio->md = md; | ||
1557 | tio->ti = NULL; | ||
1558 | tio->orig = rq; | ||
1559 | tio->error = 0; | ||
1560 | memset(&tio->info, 0, sizeof(tio->info)); | ||
1561 | |||
1562 | clone = &tio->clone; | ||
1563 | if (setup_clone(clone, rq, tio)) { | ||
1564 | /* -ENOMEM */ | ||
1565 | free_rq_tio(tio); | ||
1566 | return NULL; | ||
1567 | } | ||
1568 | |||
1569 | return clone; | ||
1442 | } | 1570 | } |
1443 | 1571 | ||
1444 | /* | 1572 | /* |
@@ -1447,51 +1575,35 @@ static int dm_rq_flush_suspending(struct mapped_device *md) | |||
1447 | static int dm_prep_fn(struct request_queue *q, struct request *rq) | 1575 | static int dm_prep_fn(struct request_queue *q, struct request *rq) |
1448 | { | 1576 | { |
1449 | struct mapped_device *md = q->queuedata; | 1577 | struct mapped_device *md = q->queuedata; |
1450 | struct dm_rq_target_io *tio; | ||
1451 | struct request *clone; | 1578 | struct request *clone; |
1452 | 1579 | ||
1453 | if (unlikely(rq == &md->suspend_rq)) { | 1580 | if (unlikely(dm_rq_is_flush_request(rq))) |
1454 | if (dm_rq_flush_suspending(md)) | 1581 | return BLKPREP_OK; |
1455 | return BLKPREP_OK; | ||
1456 | else | ||
1457 | /* The flush suspend was interrupted */ | ||
1458 | return BLKPREP_KILL; | ||
1459 | } | ||
1460 | 1582 | ||
1461 | if (unlikely(rq->special)) { | 1583 | if (unlikely(rq->special)) { |
1462 | DMWARN("Already has something in rq->special."); | 1584 | DMWARN("Already has something in rq->special."); |
1463 | return BLKPREP_KILL; | 1585 | return BLKPREP_KILL; |
1464 | } | 1586 | } |
1465 | 1587 | ||
1466 | tio = alloc_rq_tio(md); /* Only one for each original request */ | 1588 | clone = clone_rq(rq, md, GFP_ATOMIC); |
1467 | if (!tio) | 1589 | if (!clone) |
1468 | /* -ENOMEM */ | ||
1469 | return BLKPREP_DEFER; | 1590 | return BLKPREP_DEFER; |
1470 | 1591 | ||
1471 | tio->md = md; | ||
1472 | tio->ti = NULL; | ||
1473 | tio->orig = rq; | ||
1474 | tio->error = 0; | ||
1475 | memset(&tio->info, 0, sizeof(tio->info)); | ||
1476 | |||
1477 | clone = &tio->clone; | ||
1478 | if (setup_clone(clone, rq, tio)) { | ||
1479 | /* -ENOMEM */ | ||
1480 | free_rq_tio(tio); | ||
1481 | return BLKPREP_DEFER; | ||
1482 | } | ||
1483 | |||
1484 | rq->special = clone; | 1592 | rq->special = clone; |
1485 | rq->cmd_flags |= REQ_DONTPREP; | 1593 | rq->cmd_flags |= REQ_DONTPREP; |
1486 | 1594 | ||
1487 | return BLKPREP_OK; | 1595 | return BLKPREP_OK; |
1488 | } | 1596 | } |
1489 | 1597 | ||
1490 | static void map_request(struct dm_target *ti, struct request *rq, | 1598 | /* |
1491 | struct mapped_device *md) | 1599 | * Returns: |
1600 | * 0 : the request has been processed (not requeued) | ||
1601 | * !0 : the request has been requeued | ||
1602 | */ | ||
1603 | static int map_request(struct dm_target *ti, struct request *clone, | ||
1604 | struct mapped_device *md) | ||
1492 | { | 1605 | { |
1493 | int r; | 1606 | int r, requeued = 0; |
1494 | struct request *clone = rq->special; | ||
1495 | struct dm_rq_target_io *tio = clone->end_io_data; | 1607 | struct dm_rq_target_io *tio = clone->end_io_data; |
1496 | 1608 | ||
1497 | /* | 1609 | /* |
@@ -1511,11 +1623,14 @@ static void map_request(struct dm_target *ti, struct request *rq, | |||
1511 | break; | 1623 | break; |
1512 | case DM_MAPIO_REMAPPED: | 1624 | case DM_MAPIO_REMAPPED: |
1513 | /* The target has remapped the I/O so dispatch it */ | 1625 | /* The target has remapped the I/O so dispatch it */ |
1626 | trace_block_rq_remap(clone->q, clone, disk_devt(dm_disk(md)), | ||
1627 | blk_rq_pos(tio->orig)); | ||
1514 | dm_dispatch_request(clone); | 1628 | dm_dispatch_request(clone); |
1515 | break; | 1629 | break; |
1516 | case DM_MAPIO_REQUEUE: | 1630 | case DM_MAPIO_REQUEUE: |
1517 | /* The target wants to requeue the I/O */ | 1631 | /* The target wants to requeue the I/O */ |
1518 | dm_requeue_unmapped_request(clone); | 1632 | dm_requeue_unmapped_request(clone); |
1633 | requeued = 1; | ||
1519 | break; | 1634 | break; |
1520 | default: | 1635 | default: |
1521 | if (r > 0) { | 1636 | if (r > 0) { |
@@ -1527,6 +1642,8 @@ static void map_request(struct dm_target *ti, struct request *rq, | |||
1527 | dm_kill_unmapped_request(clone, r); | 1642 | dm_kill_unmapped_request(clone, r); |
1528 | break; | 1643 | break; |
1529 | } | 1644 | } |
1645 | |||
1646 | return requeued; | ||
1530 | } | 1647 | } |
1531 | 1648 | ||
1532 | /* | 1649 | /* |
@@ -1536,29 +1653,26 @@ static void map_request(struct dm_target *ti, struct request *rq, | |||
1536 | static void dm_request_fn(struct request_queue *q) | 1653 | static void dm_request_fn(struct request_queue *q) |
1537 | { | 1654 | { |
1538 | struct mapped_device *md = q->queuedata; | 1655 | struct mapped_device *md = q->queuedata; |
1539 | struct dm_table *map = dm_get_table(md); | 1656 | struct dm_table *map = dm_get_live_table(md); |
1540 | struct dm_target *ti; | 1657 | struct dm_target *ti; |
1541 | struct request *rq; | 1658 | struct request *rq, *clone; |
1542 | 1659 | ||
1543 | /* | 1660 | /* |
1544 | * For noflush suspend, check blk_queue_stopped() to immediately | 1661 | * For suspend, check blk_queue_stopped() and increment |
1545 | * quit I/O dispatching. | 1662 | * ->pending within a single queue_lock not to increment the |
1663 | * number of in-flight I/Os after the queue is stopped in | ||
1664 | * dm_suspend(). | ||
1546 | */ | 1665 | */ |
1547 | while (!blk_queue_plugged(q) && !blk_queue_stopped(q)) { | 1666 | while (!blk_queue_plugged(q) && !blk_queue_stopped(q)) { |
1548 | rq = blk_peek_request(q); | 1667 | rq = blk_peek_request(q); |
1549 | if (!rq) | 1668 | if (!rq) |
1550 | goto plug_and_out; | 1669 | goto plug_and_out; |
1551 | 1670 | ||
1552 | if (unlikely(rq == &md->suspend_rq)) { /* Flush suspend marker */ | 1671 | if (unlikely(dm_rq_is_flush_request(rq))) { |

1553 | if (queue_in_flight(q)) | 1672 | BUG_ON(md->flush_request); |
1554 | /* Not quiet yet. Wait more */ | 1673 | md->flush_request = rq; |
1555 | goto plug_and_out; | ||
1556 | |||
1557 | /* This device should be quiet now */ | ||
1558 | __stop_queue(q); | ||
1559 | blk_start_request(rq); | 1674 | blk_start_request(rq); |
1560 | __blk_end_request_all(rq, 0); | 1675 | queue_work(md->wq, &md->barrier_work); |
1561 | wake_up(&md->wait); | ||
1562 | goto out; | 1676 | goto out; |
1563 | } | 1677 | } |
1564 | 1678 | ||
@@ -1567,13 +1681,21 @@ static void dm_request_fn(struct request_queue *q) | |||
1567 | goto plug_and_out; | 1681 | goto plug_and_out; |
1568 | 1682 | ||
1569 | blk_start_request(rq); | 1683 | blk_start_request(rq); |
1684 | clone = rq->special; | ||
1685 | atomic_inc(&md->pending[rq_data_dir(clone)]); | ||
1686 | |||
1570 | spin_unlock(q->queue_lock); | 1687 | spin_unlock(q->queue_lock); |
1571 | map_request(ti, rq, md); | 1688 | if (map_request(ti, clone, md)) |
1689 | goto requeued; | ||
1690 | |||
1572 | spin_lock_irq(q->queue_lock); | 1691 | spin_lock_irq(q->queue_lock); |
1573 | } | 1692 | } |
1574 | 1693 | ||
1575 | goto out; | 1694 | goto out; |
1576 | 1695 | ||
1696 | requeued: | ||
1697 | spin_lock_irq(q->queue_lock); | ||
1698 | |||
1577 | plug_and_out: | 1699 | plug_and_out: |
1578 | if (!elv_queue_empty(q)) | 1700 | if (!elv_queue_empty(q)) |
1579 | /* Some requests still remain, retry later */ | 1701 | /* Some requests still remain, retry later */ |
@@ -1595,7 +1717,7 @@ static int dm_lld_busy(struct request_queue *q) | |||
1595 | { | 1717 | { |
1596 | int r; | 1718 | int r; |
1597 | struct mapped_device *md = q->queuedata; | 1719 | struct mapped_device *md = q->queuedata; |
1598 | struct dm_table *map = dm_get_table(md); | 1720 | struct dm_table *map = dm_get_live_table(md); |
1599 | 1721 | ||
1600 | if (!map || test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) | 1722 | if (!map || test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) |
1601 | r = 1; | 1723 | r = 1; |
@@ -1610,7 +1732,7 @@ static int dm_lld_busy(struct request_queue *q) | |||
1610 | static void dm_unplug_all(struct request_queue *q) | 1732 | static void dm_unplug_all(struct request_queue *q) |
1611 | { | 1733 | { |
1612 | struct mapped_device *md = q->queuedata; | 1734 | struct mapped_device *md = q->queuedata; |
1613 | struct dm_table *map = dm_get_table(md); | 1735 | struct dm_table *map = dm_get_live_table(md); |
1614 | 1736 | ||
1615 | if (map) { | 1737 | if (map) { |
1616 | if (dm_request_based(md)) | 1738 | if (dm_request_based(md)) |
@@ -1628,7 +1750,7 @@ static int dm_any_congested(void *congested_data, int bdi_bits) | |||
1628 | struct dm_table *map; | 1750 | struct dm_table *map; |
1629 | 1751 | ||
1630 | if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) { | 1752 | if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) { |
1631 | map = dm_get_table(md); | 1753 | map = dm_get_live_table(md); |
1632 | if (map) { | 1754 | if (map) { |
1633 | /* | 1755 | /* |
1634 | * Request-based dm cares about only own queue for | 1756 | * Request-based dm cares about only own queue for |
@@ -1725,6 +1847,7 @@ out: | |||
1725 | static const struct block_device_operations dm_blk_dops; | 1847 | static const struct block_device_operations dm_blk_dops; |
1726 | 1848 | ||
1727 | static void dm_wq_work(struct work_struct *work); | 1849 | static void dm_wq_work(struct work_struct *work); |
1850 | static void dm_rq_barrier_work(struct work_struct *work); | ||
1728 | 1851 | ||
1729 | /* | 1852 | /* |
1730 | * Allocate and initialise a blank device with a given minor. | 1853 | * Allocate and initialise a blank device with a given minor. |
@@ -1754,6 +1877,7 @@ static struct mapped_device *alloc_dev(int minor) | |||
1754 | init_rwsem(&md->io_lock); | 1877 | init_rwsem(&md->io_lock); |
1755 | mutex_init(&md->suspend_lock); | 1878 | mutex_init(&md->suspend_lock); |
1756 | spin_lock_init(&md->deferred_lock); | 1879 | spin_lock_init(&md->deferred_lock); |
1880 | spin_lock_init(&md->barrier_error_lock); | ||
1757 | rwlock_init(&md->map_lock); | 1881 | rwlock_init(&md->map_lock); |
1758 | atomic_set(&md->holders, 1); | 1882 | atomic_set(&md->holders, 1); |
1759 | atomic_set(&md->open_count, 0); | 1883 | atomic_set(&md->open_count, 0); |
@@ -1788,6 +1912,8 @@ static struct mapped_device *alloc_dev(int minor) | |||
1788 | blk_queue_softirq_done(md->queue, dm_softirq_done); | 1912 | blk_queue_softirq_done(md->queue, dm_softirq_done); |
1789 | blk_queue_prep_rq(md->queue, dm_prep_fn); | 1913 | blk_queue_prep_rq(md->queue, dm_prep_fn); |
1790 | blk_queue_lld_busy(md->queue, dm_lld_busy); | 1914 | blk_queue_lld_busy(md->queue, dm_lld_busy); |
1915 | blk_queue_ordered(md->queue, QUEUE_ORDERED_DRAIN_FLUSH, | ||
1916 | dm_rq_prepare_flush); | ||
1791 | 1917 | ||
1792 | md->disk = alloc_disk(1); | 1918 | md->disk = alloc_disk(1); |
1793 | if (!md->disk) | 1919 | if (!md->disk) |
@@ -1797,6 +1923,7 @@ static struct mapped_device *alloc_dev(int minor) | |||
1797 | atomic_set(&md->pending[1], 0); | 1923 | atomic_set(&md->pending[1], 0); |
1798 | init_waitqueue_head(&md->wait); | 1924 | init_waitqueue_head(&md->wait); |
1799 | INIT_WORK(&md->work, dm_wq_work); | 1925 | INIT_WORK(&md->work, dm_wq_work); |
1926 | INIT_WORK(&md->barrier_work, dm_rq_barrier_work); | ||
1800 | init_waitqueue_head(&md->eventq); | 1927 | init_waitqueue_head(&md->eventq); |
1801 | 1928 | ||
1802 | md->disk->major = _major; | 1929 | md->disk->major = _major; |
@@ -1921,9 +2048,13 @@ static void __set_size(struct mapped_device *md, sector_t size) | |||
1921 | mutex_unlock(&md->bdev->bd_inode->i_mutex); | 2048 | mutex_unlock(&md->bdev->bd_inode->i_mutex); |
1922 | } | 2049 | } |
1923 | 2050 | ||
1924 | static int __bind(struct mapped_device *md, struct dm_table *t, | 2051 | /* |
1925 | struct queue_limits *limits) | 2052 | * Returns old map, which caller must destroy. |
2053 | */ | ||
2054 | static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t, | ||
2055 | struct queue_limits *limits) | ||
1926 | { | 2056 | { |
2057 | struct dm_table *old_map; | ||
1927 | struct request_queue *q = md->queue; | 2058 | struct request_queue *q = md->queue; |
1928 | sector_t size; | 2059 | sector_t size; |
1929 | unsigned long flags; | 2060 | unsigned long flags; |
@@ -1938,11 +2069,6 @@ static int __bind(struct mapped_device *md, struct dm_table *t, | |||
1938 | 2069 | ||
1939 | __set_size(md, size); | 2070 | __set_size(md, size); |
1940 | 2071 | ||
1941 | if (!size) { | ||
1942 | dm_table_destroy(t); | ||
1943 | return 0; | ||
1944 | } | ||
1945 | |||
1946 | dm_table_event_callback(t, event_callback, md); | 2072 | dm_table_event_callback(t, event_callback, md); |
1947 | 2073 | ||
1948 | /* | 2074 | /* |
@@ -1958,26 +2084,31 @@ static int __bind(struct mapped_device *md, struct dm_table *t, | |||
1958 | __bind_mempools(md, t); | 2084 | __bind_mempools(md, t); |
1959 | 2085 | ||
1960 | write_lock_irqsave(&md->map_lock, flags); | 2086 | write_lock_irqsave(&md->map_lock, flags); |
2087 | old_map = md->map; | ||
1961 | md->map = t; | 2088 | md->map = t; |
1962 | dm_table_set_restrictions(t, q, limits); | 2089 | dm_table_set_restrictions(t, q, limits); |
1963 | write_unlock_irqrestore(&md->map_lock, flags); | 2090 | write_unlock_irqrestore(&md->map_lock, flags); |
1964 | 2091 | ||
1965 | return 0; | 2092 | return old_map; |
1966 | } | 2093 | } |
1967 | 2094 | ||
1968 | static void __unbind(struct mapped_device *md) | 2095 | /* |
2096 | * Returns unbound table for the caller to free. | ||
2097 | */ | ||
2098 | static struct dm_table *__unbind(struct mapped_device *md) | ||
1969 | { | 2099 | { |
1970 | struct dm_table *map = md->map; | 2100 | struct dm_table *map = md->map; |
1971 | unsigned long flags; | 2101 | unsigned long flags; |
1972 | 2102 | ||
1973 | if (!map) | 2103 | if (!map) |
1974 | return; | 2104 | return NULL; |
1975 | 2105 | ||
1976 | dm_table_event_callback(map, NULL, NULL); | 2106 | dm_table_event_callback(map, NULL, NULL); |
1977 | write_lock_irqsave(&md->map_lock, flags); | 2107 | write_lock_irqsave(&md->map_lock, flags); |
1978 | md->map = NULL; | 2108 | md->map = NULL; |
1979 | write_unlock_irqrestore(&md->map_lock, flags); | 2109 | write_unlock_irqrestore(&md->map_lock, flags); |
1980 | dm_table_destroy(map); | 2110 | |
2111 | return map; | ||
1981 | } | 2112 | } |
1982 | 2113 | ||
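Both helpers now hand the displaced table back instead of destroying it, so the caller can drop it outside md->map_lock. A minimal caller sketch of that contract (example_replace_table is a hypothetical name, not a function in this patch):

static void example_replace_table(struct mapped_device *md, struct dm_table *t,
				  struct queue_limits *limits)
{
	struct dm_table *old_map;

	old_map = __bind(md, t, limits);	/* swap happens under map_lock */
	if (old_map)
		dm_table_destroy(old_map);	/* destroy with no spinlock held */
}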
1983 | /* | 2114 | /* |
@@ -2059,18 +2190,18 @@ void dm_put(struct mapped_device *md) | |||
2059 | BUG_ON(test_bit(DMF_FREEING, &md->flags)); | 2190 | BUG_ON(test_bit(DMF_FREEING, &md->flags)); |
2060 | 2191 | ||
2061 | if (atomic_dec_and_lock(&md->holders, &_minor_lock)) { | 2192 | if (atomic_dec_and_lock(&md->holders, &_minor_lock)) { |
2062 | map = dm_get_table(md); | 2193 | map = dm_get_live_table(md); |
2063 | idr_replace(&_minor_idr, MINOR_ALLOCED, | 2194 | idr_replace(&_minor_idr, MINOR_ALLOCED, |
2064 | MINOR(disk_devt(dm_disk(md)))); | 2195 | MINOR(disk_devt(dm_disk(md)))); |
2065 | set_bit(DMF_FREEING, &md->flags); | 2196 | set_bit(DMF_FREEING, &md->flags); |
2066 | spin_unlock(&_minor_lock); | 2197 | spin_unlock(&_minor_lock); |
2067 | if (!dm_suspended(md)) { | 2198 | if (!dm_suspended_md(md)) { |
2068 | dm_table_presuspend_targets(map); | 2199 | dm_table_presuspend_targets(map); |
2069 | dm_table_postsuspend_targets(map); | 2200 | dm_table_postsuspend_targets(map); |
2070 | } | 2201 | } |
2071 | dm_sysfs_exit(md); | 2202 | dm_sysfs_exit(md); |
2072 | dm_table_put(map); | 2203 | dm_table_put(map); |
2073 | __unbind(md); | 2204 | dm_table_destroy(__unbind(md)); |
2074 | free_dev(md); | 2205 | free_dev(md); |
2075 | } | 2206 | } |
2076 | } | 2207 | } |
@@ -2080,8 +2211,6 @@ static int dm_wait_for_completion(struct mapped_device *md, int interruptible) | |||
2080 | { | 2211 | { |
2081 | int r = 0; | 2212 | int r = 0; |
2082 | DECLARE_WAITQUEUE(wait, current); | 2213 | DECLARE_WAITQUEUE(wait, current); |
2083 | struct request_queue *q = md->queue; | ||
2084 | unsigned long flags; | ||
2085 | 2214 | ||
2086 | dm_unplug_all(md->queue); | 2215 | dm_unplug_all(md->queue); |
2087 | 2216 | ||
@@ -2091,15 +2220,7 @@ static int dm_wait_for_completion(struct mapped_device *md, int interruptible) | |||
2091 | set_current_state(interruptible); | 2220 | set_current_state(interruptible); |
2092 | 2221 | ||
2093 | smp_mb(); | 2222 | smp_mb(); |
2094 | if (dm_request_based(md)) { | 2223 | if (!md_in_flight(md)) |
2095 | spin_lock_irqsave(q->queue_lock, flags); | ||
2096 | if (!queue_in_flight(q) && blk_queue_stopped(q)) { | ||
2097 | spin_unlock_irqrestore(q->queue_lock, flags); | ||
2098 | break; | ||
2099 | } | ||
2100 | spin_unlock_irqrestore(q->queue_lock, flags); | ||
2101 | } else if (!atomic_read(&md->pending[0]) && | ||
2102 | !atomic_read(&md->pending[1])) | ||
2103 | break; | 2224 | break; |
2104 | 2225 | ||
2105 | if (interruptible == TASK_INTERRUPTIBLE && | 2226 | if (interruptible == TASK_INTERRUPTIBLE && |
@@ -2194,98 +2315,106 @@ static void dm_queue_flush(struct mapped_device *md) | |||
2194 | queue_work(md->wq, &md->work); | 2315 | queue_work(md->wq, &md->work); |
2195 | } | 2316 | } |
2196 | 2317 | ||
2197 | /* | 2318 | static void dm_rq_set_flush_nr(struct request *clone, unsigned flush_nr) |
2198 | * Swap in a new table (destroying old one). | ||
2199 | */ | ||
2200 | int dm_swap_table(struct mapped_device *md, struct dm_table *table) | ||
2201 | { | 2319 | { |
2202 | struct queue_limits limits; | 2320 | struct dm_rq_target_io *tio = clone->end_io_data; |
2203 | int r = -EINVAL; | ||
2204 | 2321 | ||
2205 | mutex_lock(&md->suspend_lock); | 2322 | tio->info.flush_request = flush_nr; |
2323 | } | ||
2206 | 2324 | ||
2207 | /* device must be suspended */ | 2325 | /* Issue barrier requests to targets and wait for their completion. */ |
2208 | if (!dm_suspended(md)) | 2326 | static int dm_rq_barrier(struct mapped_device *md) |
2209 | goto out; | 2327 | { |
2328 | int i, j; | ||
2329 | struct dm_table *map = dm_get_live_table(md); | ||
2330 | unsigned num_targets = dm_table_get_num_targets(map); | ||
2331 | struct dm_target *ti; | ||
2332 | struct request *clone; | ||
2210 | 2333 | ||
2211 | r = dm_calculate_queue_limits(table, &limits); | 2334 | md->barrier_error = 0; |
2212 | if (r) | ||
2213 | goto out; | ||
2214 | 2335 | ||
2215 | /* cannot change the device type, once a table is bound */ | 2336 | for (i = 0; i < num_targets; i++) { |
2216 | if (md->map && | 2337 | ti = dm_table_get_target(map, i); |
2217 | (dm_table_get_type(md->map) != dm_table_get_type(table))) { | 2338 | for (j = 0; j < ti->num_flush_requests; j++) { |
2218 | DMWARN("can't change the device type after a table is bound"); | 2339 | clone = clone_rq(md->flush_request, md, GFP_NOIO); |
2219 | goto out; | 2340 | dm_rq_set_flush_nr(clone, j); |
2341 | atomic_inc(&md->pending[rq_data_dir(clone)]); | ||
2342 | map_request(ti, clone, md); | ||
2343 | } | ||
2220 | } | 2344 | } |
2221 | 2345 | ||
2222 | __unbind(md); | 2346 | dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE); |
2223 | r = __bind(md, table, &limits); | 2347 | dm_table_put(map); |
2224 | |||
2225 | out: | ||
2226 | mutex_unlock(&md->suspend_lock); | ||
2227 | return r; | ||
2228 | } | ||
2229 | 2348 | ||
2230 | static void dm_rq_invalidate_suspend_marker(struct mapped_device *md) | 2349 | return md->barrier_error; |
2231 | { | ||
2232 | md->suspend_rq.special = (void *)0x1; | ||
2233 | } | 2350 | } |
2234 | 2351 | ||
2235 | static void dm_rq_abort_suspend(struct mapped_device *md, int noflush) | 2352 | static void dm_rq_barrier_work(struct work_struct *work) |
2236 | { | 2353 | { |
2354 | int error; | ||
2355 | struct mapped_device *md = container_of(work, struct mapped_device, | ||
2356 | barrier_work); | ||
2237 | struct request_queue *q = md->queue; | 2357 | struct request_queue *q = md->queue; |
2358 | struct request *rq; | ||
2238 | unsigned long flags; | 2359 | unsigned long flags; |
2239 | 2360 | ||
2240 | spin_lock_irqsave(q->queue_lock, flags); | 2361 | /* |
2241 | if (!noflush) | 2362 | * Hold the md reference here and leave it at the last part so that |
2242 | dm_rq_invalidate_suspend_marker(md); | 2363 | * the md can't be deleted by device opener when the barrier request |
2243 | __start_queue(q); | 2364 | * completes. |
2244 | spin_unlock_irqrestore(q->queue_lock, flags); | 2365 | */ |
2245 | } | 2366 | dm_get(md); |
2246 | 2367 | ||
2247 | static void dm_rq_start_suspend(struct mapped_device *md, int noflush) | 2368 | error = dm_rq_barrier(md); |
2248 | { | ||
2249 | struct request *rq = &md->suspend_rq; | ||
2250 | struct request_queue *q = md->queue; | ||
2251 | 2369 | ||
2252 | if (noflush) | 2370 | rq = md->flush_request; |
2253 | stop_queue(q); | 2371 | md->flush_request = NULL; |
2254 | else { | 2372 | |
2255 | blk_rq_init(q, rq); | 2373 | if (error == DM_ENDIO_REQUEUE) { |
2256 | blk_insert_request(q, rq, 0, NULL); | 2374 | spin_lock_irqsave(q->queue_lock, flags); |
2257 | } | 2375 | blk_requeue_request(q, rq); |
2376 | spin_unlock_irqrestore(q->queue_lock, flags); | ||
2377 | } else | ||
2378 | blk_end_request_all(rq, error); | ||
2379 | |||
2380 | blk_run_queue(q); | ||
2381 | |||
2382 | dm_put(md); | ||
2258 | } | 2383 | } |
2259 | 2384 | ||
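dm_rq_barrier() issues one flush clone per ti->num_flush_requests and tags each with its index via dm_rq_set_flush_nr(). A hypothetical request-based target map function -- the dm_map_request_fn signature is assumed, and example_map_rq is not from this patch -- could tell the flush clones apart like this:

static int example_map_rq(struct dm_target *ti, struct request *clone,
			  union map_info *map_context)
{
	if (blk_barrier_rq(clone)) {
		/* which of this target's flush requests the clone is */
		unsigned flush_nr = map_context->flush_request;

		/* route the flush to whichever device flush_nr denotes ... */
		(void)flush_nr;
		return DM_MAPIO_REMAPPED;
	}

	/* normal I/O: remap the clone and let dm dispatch it */
	return DM_MAPIO_REMAPPED;
}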
2260 | static int dm_rq_suspend_available(struct mapped_device *md, int noflush) | 2385 | /* |
2386 | * Swap in a new table, returning the old one for the caller to destroy. | ||
2387 | */ | ||
2388 | struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table) | ||
2261 | { | 2389 | { |
2262 | int r = 1; | 2390 | struct dm_table *map = ERR_PTR(-EINVAL); |
2263 | struct request *rq = &md->suspend_rq; | 2391 | struct queue_limits limits; |
2264 | struct request_queue *q = md->queue; | 2392 | int r; |
2265 | unsigned long flags; | ||
2266 | 2393 | ||
2267 | if (noflush) | 2394 | mutex_lock(&md->suspend_lock); |
2268 | return r; | ||
2269 | 2395 | ||
2270 | /* The marker must be protected by queue lock if it is in use */ | 2396 | /* device must be suspended */ |
2271 | spin_lock_irqsave(q->queue_lock, flags); | 2397 | if (!dm_suspended_md(md)) |
2272 | if (unlikely(rq->ref_count)) { | 2398 | goto out; |
2273 | /* | 2399 | |
2274 | * This can happen, when the previous flush suspend was | 2400 | r = dm_calculate_queue_limits(table, &limits); |
2275 | * interrupted, the marker is still in the queue and | 2401 | if (r) { |
2276 | * this flush suspend has been invoked, because we don't | 2402 | map = ERR_PTR(r); |
2277 | * remove the marker at the time of suspend interruption. | 2403 | goto out; |
2278 | * We have only one marker per mapped_device, so we can't | ||
2279 | * start another flush suspend while it is in use. | ||
2280 | */ | ||
2281 | BUG_ON(!rq->special); /* The marker should be invalidated */ | ||
2282 | DMWARN("Invalidating the previous flush suspend is still in" | ||
2283 | " progress. Please retry later."); | ||
2284 | r = 0; | ||
2285 | } | 2404 | } |
2286 | spin_unlock_irqrestore(q->queue_lock, flags); | ||
2287 | 2405 | ||
2288 | return r; | 2406 | /* cannot change the device type, once a table is bound */ |
2407 | if (md->map && | ||
2408 | (dm_table_get_type(md->map) != dm_table_get_type(table))) { | ||
2409 | DMWARN("can't change the device type after a table is bound"); | ||
2410 | goto out; | ||
2411 | } | ||
2412 | |||
2413 | map = __bind(md, table, &limits); | ||
2414 | |||
2415 | out: | ||
2416 | mutex_unlock(&md->suspend_lock); | ||
2417 | return map; | ||
2289 | } | 2418 | } |
2290 | 2419 | ||
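With dm_swap_table() now returning the old table (or an ERR_PTR()), the caller is expected to destroy whatever comes back. A sketch of that usage under a hypothetical name (the real caller is the ioctl resume path elsewhere in the tree):

static int example_swap(struct mapped_device *md, struct dm_table *new_map)
{
	struct dm_table *old_map;

	old_map = dm_swap_table(md, new_map);
	if (IS_ERR(old_map))
		return PTR_ERR(old_map);	/* e.g. -EINVAL: device not suspended */

	if (old_map)				/* the first bind hands back NULL */
		dm_table_destroy(old_map);

	return 0;
}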
2291 | /* | 2420 | /* |
@@ -2330,49 +2459,11 @@ static void unlock_fs(struct mapped_device *md) | |||
2330 | /* | 2459 | /* |
2331 | * Suspend mechanism in request-based dm. | 2460 | * Suspend mechanism in request-based dm. |
2332 | * | 2461 | * |
2333 | * After the suspend starts, further incoming requests are kept in | 2462 | * 1. Flush all I/Os by lock_fs() if needed. |
2334 | * the request_queue and deferred. | 2463 | * 2. Stop dispatching any I/O by stopping the request_queue. |
2335 | * Remaining requests in the request_queue at the start of suspend are flushed | 2464 | * 3. Wait for all in-flight I/Os to be completed or requeued. |
2336 | * if it is flush suspend. | ||
2337 | * The suspend completes when the following conditions have been satisfied, | ||
2338 | * so wait for it: | ||
2339 | * 1. q->in_flight is 0 (which means no in_flight request) | ||
2340 | * 2. queue has been stopped (which means no request dispatching) | ||
2341 | * | 2465 | * |
2342 | * | 2466 | * To abort suspend, start the request_queue. |
2343 | * Noflush suspend | ||
2344 | * --------------- | ||
2345 | * Noflush suspend doesn't need to dispatch remaining requests. | ||
2346 | * So stop the queue immediately. Then, wait for all in_flight requests | ||
2347 | * to be completed or requeued. | ||
2348 | * | ||
2349 | * To abort noflush suspend, start the queue. | ||
2350 | * | ||
2351 | * | ||
2352 | * Flush suspend | ||
2353 | * ------------- | ||
2354 | * Flush suspend needs to dispatch remaining requests. So stop the queue | ||
2355 | * after the remaining requests are completed. (Requeued request must be also | ||
2356 | * re-dispatched and completed. Until then, we can't stop the queue.) | ||
2357 | * | ||
2358 | * During flushing the remaining requests, further incoming requests are also | ||
2359 | * inserted to the same queue. To distinguish which requests are to be | ||
2360 | * flushed, we insert a marker request to the queue at the time of starting | ||
2361 | * flush suspend, like a barrier. | ||
2362 | * The dispatching is blocked when the marker is found on the top of the queue. | ||
2363 | * And the queue is stopped when all in_flight requests are completed, since | ||
2364 | * that means the remaining requests are completely flushed. | ||
2365 | * Then, the marker is removed from the queue. | ||
2366 | * | ||
2367 | * To abort flush suspend, we also need to take care of the marker, not only | ||
2368 | * starting the queue. | ||
2369 | * We don't remove the marker forcibly from the queue since it's against | ||
2370 | * the block-layer manner. Instead, we put an invalidated mark on the marker. | ||
2371 | * When the invalidated marker is found on the top of the queue, it is | ||
2372 | * immediately removed from the queue, so it doesn't block dispatching. | ||
2373 | * Because we have only one marker per mapped_device, we can't start another | ||
2374 | * flush suspend until the invalidated marker is removed from the queue. | ||
2375 | * So fail and return with -EBUSY in such a case. | ||
2376 | */ | 2467 | */ |
2377 | int dm_suspend(struct mapped_device *md, unsigned suspend_flags) | 2468 | int dm_suspend(struct mapped_device *md, unsigned suspend_flags) |
2378 | { | 2469 | { |
@@ -2383,17 +2474,12 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) | |||
2383 | 2474 | ||
2384 | mutex_lock(&md->suspend_lock); | 2475 | mutex_lock(&md->suspend_lock); |
2385 | 2476 | ||
2386 | if (dm_suspended(md)) { | 2477 | if (dm_suspended_md(md)) { |
2387 | r = -EINVAL; | 2478 | r = -EINVAL; |
2388 | goto out_unlock; | 2479 | goto out_unlock; |
2389 | } | 2480 | } |
2390 | 2481 | ||
2391 | if (dm_request_based(md) && !dm_rq_suspend_available(md, noflush)) { | 2482 | map = dm_get_live_table(md); |
2392 | r = -EBUSY; | ||
2393 | goto out_unlock; | ||
2394 | } | ||
2395 | |||
2396 | map = dm_get_table(md); | ||
2397 | 2483 | ||
2398 | /* | 2484 | /* |
2399 | * DMF_NOFLUSH_SUSPENDING must be set before presuspend. | 2485 | * DMF_NOFLUSH_SUSPENDING must be set before presuspend. |
@@ -2406,8 +2492,10 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) | |||
2406 | dm_table_presuspend_targets(map); | 2492 | dm_table_presuspend_targets(map); |
2407 | 2493 | ||
2408 | /* | 2494 | /* |
2409 | * Flush I/O to the device. noflush supersedes do_lockfs, | 2495 | * Flush I/O to the device. |
2410 | * because lock_fs() needs to flush I/Os. | 2496 | * Any I/O submitted after lock_fs() may not be flushed. |
2497 | * noflush takes precedence over do_lockfs. | ||
2498 | * (lock_fs() flushes I/Os and waits for them to complete.) | ||
2411 | */ | 2499 | */ |
2412 | if (!noflush && do_lockfs) { | 2500 | if (!noflush && do_lockfs) { |
2413 | r = lock_fs(md); | 2501 | r = lock_fs(md); |
@@ -2436,10 +2524,15 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) | |||
2436 | set_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags); | 2524 | set_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags); |
2437 | up_write(&md->io_lock); | 2525 | up_write(&md->io_lock); |
2438 | 2526 | ||
2439 | flush_workqueue(md->wq); | 2527 | /* |
2440 | 2528 | * Request-based dm uses md->wq for barrier (dm_rq_barrier_work) which | |
2529 | * can be kicked until md->queue is stopped. So stop md->queue before | ||
2530 | * flushing md->wq. | ||
2531 | */ | ||
2441 | if (dm_request_based(md)) | 2532 | if (dm_request_based(md)) |
2442 | dm_rq_start_suspend(md, noflush); | 2533 | stop_queue(md->queue); |
2534 | |||
2535 | flush_workqueue(md->wq); | ||
2443 | 2536 | ||
2444 | /* | 2537 | /* |
2445 | * At this point no more requests are entering target request routines. | 2538 | * At this point no more requests are entering target request routines. |
@@ -2458,7 +2551,7 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) | |||
2458 | dm_queue_flush(md); | 2551 | dm_queue_flush(md); |
2459 | 2552 | ||
2460 | if (dm_request_based(md)) | 2553 | if (dm_request_based(md)) |
2461 | dm_rq_abort_suspend(md, noflush); | 2554 | start_queue(md->queue); |
2462 | 2555 | ||
2463 | unlock_fs(md); | 2556 | unlock_fs(md); |
2464 | goto out; /* pushback list is already flushed, so skip flush */ | 2557 | goto out; /* pushback list is already flushed, so skip flush */ |
@@ -2470,10 +2563,10 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) | |||
2470 | * requests are being added to md->deferred list. | 2563 | * requests are being added to md->deferred list. |
2471 | */ | 2564 | */ |
2472 | 2565 | ||
2473 | dm_table_postsuspend_targets(map); | ||
2474 | |||
2475 | set_bit(DMF_SUSPENDED, &md->flags); | 2566 | set_bit(DMF_SUSPENDED, &md->flags); |
2476 | 2567 | ||
2568 | dm_table_postsuspend_targets(map); | ||
2569 | |||
2477 | out: | 2570 | out: |
2478 | dm_table_put(map); | 2571 | dm_table_put(map); |
2479 | 2572 | ||
@@ -2488,10 +2581,10 @@ int dm_resume(struct mapped_device *md) | |||
2488 | struct dm_table *map = NULL; | 2581 | struct dm_table *map = NULL; |
2489 | 2582 | ||
2490 | mutex_lock(&md->suspend_lock); | 2583 | mutex_lock(&md->suspend_lock); |
2491 | if (!dm_suspended(md)) | 2584 | if (!dm_suspended_md(md)) |
2492 | goto out; | 2585 | goto out; |
2493 | 2586 | ||
2494 | map = dm_get_table(md); | 2587 | map = dm_get_live_table(md); |
2495 | if (!map || !dm_table_get_size(map)) | 2588 | if (!map || !dm_table_get_size(map)) |
2496 | goto out; | 2589 | goto out; |
2497 | 2590 | ||
@@ -2525,18 +2618,19 @@ out: | |||
2525 | /*----------------------------------------------------------------- | 2618 | /*----------------------------------------------------------------- |
2526 | * Event notification. | 2619 | * Event notification. |
2527 | *---------------------------------------------------------------*/ | 2620 | *---------------------------------------------------------------*/ |
2528 | void dm_kobject_uevent(struct mapped_device *md, enum kobject_action action, | 2621 | int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action, |
2529 | unsigned cookie) | 2622 | unsigned cookie) |
2530 | { | 2623 | { |
2531 | char udev_cookie[DM_COOKIE_LENGTH]; | 2624 | char udev_cookie[DM_COOKIE_LENGTH]; |
2532 | char *envp[] = { udev_cookie, NULL }; | 2625 | char *envp[] = { udev_cookie, NULL }; |
2533 | 2626 | ||
2534 | if (!cookie) | 2627 | if (!cookie) |
2535 | kobject_uevent(&disk_to_dev(md->disk)->kobj, action); | 2628 | return kobject_uevent(&disk_to_dev(md->disk)->kobj, action); |
2536 | else { | 2629 | else { |
2537 | snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u", | 2630 | snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u", |
2538 | DM_COOKIE_ENV_VAR_NAME, cookie); | 2631 | DM_COOKIE_ENV_VAR_NAME, cookie); |
2539 | kobject_uevent_env(&disk_to_dev(md->disk)->kobj, action, envp); | 2632 | return kobject_uevent_env(&disk_to_dev(md->disk)->kobj, |
2633 | action, envp); | ||
2540 | } | 2634 | } |
2541 | } | 2635 | } |
2542 | 2636 | ||
@@ -2592,26 +2686,27 @@ struct mapped_device *dm_get_from_kobject(struct kobject *kobj) | |||
2592 | return NULL; | 2686 | return NULL; |
2593 | 2687 | ||
2594 | if (test_bit(DMF_FREEING, &md->flags) || | 2688 | if (test_bit(DMF_FREEING, &md->flags) || |
2595 | test_bit(DMF_DELETING, &md->flags)) | 2689 | dm_deleting_md(md)) |
2596 | return NULL; | 2690 | return NULL; |
2597 | 2691 | ||
2598 | dm_get(md); | 2692 | dm_get(md); |
2599 | return md; | 2693 | return md; |
2600 | } | 2694 | } |
2601 | 2695 | ||
2602 | int dm_suspended(struct mapped_device *md) | 2696 | int dm_suspended_md(struct mapped_device *md) |
2603 | { | 2697 | { |
2604 | return test_bit(DMF_SUSPENDED, &md->flags); | 2698 | return test_bit(DMF_SUSPENDED, &md->flags); |
2605 | } | 2699 | } |
2606 | 2700 | ||
2607 | int dm_noflush_suspending(struct dm_target *ti) | 2701 | int dm_suspended(struct dm_target *ti) |
2608 | { | 2702 | { |
2609 | struct mapped_device *md = dm_table_get_md(ti->table); | 2703 | return dm_suspended_md(dm_table_get_md(ti->table)); |
2610 | int r = __noflush_suspending(md); | 2704 | } |
2611 | 2705 | EXPORT_SYMBOL_GPL(dm_suspended); | |
2612 | dm_put(md); | ||
2613 | 2706 | ||
2614 | return r; | 2707 | int dm_noflush_suspending(struct dm_target *ti) |
2708 | { | ||
2709 | return __noflush_suspending(dm_table_get_md(ti->table)); | ||
2615 | } | 2710 | } |
2616 | EXPORT_SYMBOL_GPL(dm_noflush_suspending); | 2711 | EXPORT_SYMBOL_GPL(dm_noflush_suspending); |
2617 | 2712 | ||
diff --git a/drivers/md/dm.h b/drivers/md/dm.h index a7663eba17e2..bad1724d4869 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h | |||
@@ -89,6 +89,16 @@ int dm_target_iterate(void (*iter_func)(struct target_type *tt, | |||
89 | int dm_split_args(int *argc, char ***argvp, char *input); | 89 | int dm_split_args(int *argc, char ***argvp, char *input); |
90 | 90 | ||
91 | /* | 91 | /* |
92 | * Is this mapped_device being deleted? | ||
93 | */ | ||
94 | int dm_deleting_md(struct mapped_device *md); | ||
95 | |||
96 | /* | ||
97 | * Is this mapped_device suspended? | ||
98 | */ | ||
99 | int dm_suspended_md(struct mapped_device *md); | ||
100 | |||
101 | /* | ||
92 | * The device-mapper can be driven through one of two interfaces; | 102 | * The device-mapper can be driven through one of two interfaces; |
93 | * ioctl or filesystem, depending which patch you have applied. | 103 | * ioctl or filesystem, depending which patch you have applied. |
94 | */ | 104 | */ |
@@ -115,8 +125,11 @@ void dm_stripe_exit(void); | |||
115 | int dm_open_count(struct mapped_device *md); | 125 | int dm_open_count(struct mapped_device *md); |
116 | int dm_lock_for_deletion(struct mapped_device *md); | 126 | int dm_lock_for_deletion(struct mapped_device *md); |
117 | 127 | ||
118 | void dm_kobject_uevent(struct mapped_device *md, enum kobject_action action, | 128 | int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action, |
119 | unsigned cookie); | 129 | unsigned cookie); |
130 | |||
131 | int dm_io_init(void); | ||
132 | void dm_io_exit(void); | ||
120 | 133 | ||
121 | int dm_kcopyd_init(void); | 134 | int dm_kcopyd_init(void); |
122 | void dm_kcopyd_exit(void); | 135 | void dm_kcopyd_exit(void); |
diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c index 87d88dbb667f..8e3850b98cca 100644 --- a/drivers/md/faulty.c +++ b/drivers/md/faulty.c | |||
@@ -64,6 +64,7 @@ | |||
64 | #define MaxFault 50 | 64 | #define MaxFault 50 |
65 | #include <linux/blkdev.h> | 65 | #include <linux/blkdev.h> |
66 | #include <linux/raid/md_u.h> | 66 | #include <linux/raid/md_u.h> |
67 | #include <linux/slab.h> | ||
67 | #include "md.h" | 68 | #include "md.h" |
68 | #include <linux/seq_file.h> | 69 | #include <linux/seq_file.h> |
69 | 70 | ||
@@ -360,6 +361,7 @@ static void raid_exit(void) | |||
360 | module_init(raid_init); | 361 | module_init(raid_init); |
361 | module_exit(raid_exit); | 362 | module_exit(raid_exit); |
362 | MODULE_LICENSE("GPL"); | 363 | MODULE_LICENSE("GPL"); |
364 | MODULE_DESCRIPTION("Fault injection personality for MD"); | ||
363 | MODULE_ALIAS("md-personality-10"); /* faulty */ | 365 | MODULE_ALIAS("md-personality-10"); /* faulty */ |
364 | MODULE_ALIAS("md-faulty"); | 366 | MODULE_ALIAS("md-faulty"); |
365 | MODULE_ALIAS("md-level--5"); | 367 | MODULE_ALIAS("md-level--5"); |
diff --git a/drivers/md/linear.c b/drivers/md/linear.c index 1ceceb334d5e..09437e958235 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c | |||
@@ -19,6 +19,7 @@ | |||
19 | #include <linux/blkdev.h> | 19 | #include <linux/blkdev.h> |
20 | #include <linux/raid/md_u.h> | 20 | #include <linux/raid/md_u.h> |
21 | #include <linux/seq_file.h> | 21 | #include <linux/seq_file.h> |
22 | #include <linux/slab.h> | ||
22 | #include "md.h" | 23 | #include "md.h" |
23 | #include "linear.h" | 24 | #include "linear.h" |
24 | 25 | ||
@@ -172,12 +173,14 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks) | |||
172 | disk_stack_limits(mddev->gendisk, rdev->bdev, | 173 | disk_stack_limits(mddev->gendisk, rdev->bdev, |
173 | rdev->data_offset << 9); | 174 | rdev->data_offset << 9); |
174 | /* as we don't honour merge_bvec_fn, we must never risk | 175 | /* as we don't honour merge_bvec_fn, we must never risk |
175 | * violating it, so limit ->max_sector to one PAGE, as | 176 | * violating it, so limit max_segments to 1 lying within |
176 | * a one page request is never in violation. | 177 | * a single page. |
177 | */ | 178 | */ |
178 | if (rdev->bdev->bd_disk->queue->merge_bvec_fn && | 179 | if (rdev->bdev->bd_disk->queue->merge_bvec_fn) { |
179 | queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9)) | 180 | blk_queue_max_segments(mddev->queue, 1); |
180 | blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); | 181 | blk_queue_segment_boundary(mddev->queue, |
182 | PAGE_CACHE_SIZE - 1); | ||
183 | } | ||
181 | 184 | ||
182 | conf->array_sectors += rdev->sectors; | 185 | conf->array_sectors += rdev->sectors; |
183 | cnt++; | 186 | cnt++; |
@@ -292,7 +295,7 @@ static int linear_make_request (struct request_queue *q, struct bio *bio) | |||
292 | int cpu; | 295 | int cpu; |
293 | 296 | ||
294 | if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) { | 297 | if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) { |
295 | bio_endio(bio, -EOPNOTSUPP); | 298 | md_barrier_request(mddev, bio); |
296 | return 0; | 299 | return 0; |
297 | } | 300 | } |
298 | 301 | ||
@@ -383,6 +386,7 @@ static void linear_exit (void) | |||
383 | module_init(linear_init); | 386 | module_init(linear_init); |
384 | module_exit(linear_exit); | 387 | module_exit(linear_exit); |
385 | MODULE_LICENSE("GPL"); | 388 | MODULE_LICENSE("GPL"); |
389 | MODULE_DESCRIPTION("Linear device concatenation personality for MD"); | ||
386 | MODULE_ALIAS("md-personality-1"); /* LINEAR - deprecated*/ | 390 | MODULE_ALIAS("md-personality-1"); /* LINEAR - deprecated*/ |
387 | MODULE_ALIAS("md-linear"); | 391 | MODULE_ALIAS("md-linear"); |
388 | MODULE_ALIAS("md-level--1"); | 392 | MODULE_ALIAS("md-level--1"); |
diff --git a/drivers/md/md.c b/drivers/md/md.c index b182f86a19dd..cefd63daff31 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c | |||
@@ -39,14 +39,17 @@ | |||
39 | #include <linux/buffer_head.h> /* for invalidate_bdev */ | 39 | #include <linux/buffer_head.h> /* for invalidate_bdev */ |
40 | #include <linux/poll.h> | 40 | #include <linux/poll.h> |
41 | #include <linux/ctype.h> | 41 | #include <linux/ctype.h> |
42 | #include <linux/string.h> | ||
42 | #include <linux/hdreg.h> | 43 | #include <linux/hdreg.h> |
43 | #include <linux/proc_fs.h> | 44 | #include <linux/proc_fs.h> |
44 | #include <linux/random.h> | 45 | #include <linux/random.h> |
45 | #include <linux/reboot.h> | 46 | #include <linux/reboot.h> |
46 | #include <linux/file.h> | 47 | #include <linux/file.h> |
48 | #include <linux/compat.h> | ||
47 | #include <linux/delay.h> | 49 | #include <linux/delay.h> |
48 | #include <linux/raid/md_p.h> | 50 | #include <linux/raid/md_p.h> |
49 | #include <linux/raid/md_u.h> | 51 | #include <linux/raid/md_u.h> |
52 | #include <linux/slab.h> | ||
50 | #include "md.h" | 53 | #include "md.h" |
51 | #include "bitmap.h" | 54 | #include "bitmap.h" |
52 | 55 | ||
@@ -68,6 +71,12 @@ static DECLARE_WAIT_QUEUE_HEAD(resync_wait); | |||
68 | #define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); } | 71 | #define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); } |
69 | 72 | ||
70 | /* | 73 | /* |
74 | * Default number of read corrections we'll attempt on an rdev | ||
75 | * before ejecting it from the array. We divide the read error | ||
76 | * count by 2 for every hour elapsed between read errors. | ||
77 | */ | ||
78 | #define MD_DEFAULT_MAX_CORRECTED_READ_ERRORS 20 | ||
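A worked example of the halving rule described above (a sketch of the arithmetic only, not the raid code that consumes this default): a device whose stored count is 16 and whose last read error was 3 hours ago is treated as having 16 >> 3 = 2 corrected errors before the new one is counted against the limit of 20.

static unsigned int decayed_read_errors(unsigned int stored, long hours_elapsed)
{
	/* halve the stored count once per elapsed hour */
	while (hours_elapsed-- > 0 && stored)
		stored >>= 1;
	return stored;
}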
79 | /* | ||
71 | * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit' | 80 | * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit' |
72 | * is 1000 KB/sec, so the extra system load does not show up that much. | 81 | * is 1000 KB/sec, so the extra system load does not show up that much. |
73 | * Increase it if you want to have more _guaranteed_ speed. Note that | 82 | * Increase it if you want to have more _guaranteed_ speed. Note that |
@@ -98,44 +107,40 @@ static struct ctl_table_header *raid_table_header; | |||
98 | 107 | ||
99 | static ctl_table raid_table[] = { | 108 | static ctl_table raid_table[] = { |
100 | { | 109 | { |
101 | .ctl_name = DEV_RAID_SPEED_LIMIT_MIN, | ||
102 | .procname = "speed_limit_min", | 110 | .procname = "speed_limit_min", |
103 | .data = &sysctl_speed_limit_min, | 111 | .data = &sysctl_speed_limit_min, |
104 | .maxlen = sizeof(int), | 112 | .maxlen = sizeof(int), |
105 | .mode = S_IRUGO|S_IWUSR, | 113 | .mode = S_IRUGO|S_IWUSR, |
106 | .proc_handler = &proc_dointvec, | 114 | .proc_handler = proc_dointvec, |
107 | }, | 115 | }, |
108 | { | 116 | { |
109 | .ctl_name = DEV_RAID_SPEED_LIMIT_MAX, | ||
110 | .procname = "speed_limit_max", | 117 | .procname = "speed_limit_max", |
111 | .data = &sysctl_speed_limit_max, | 118 | .data = &sysctl_speed_limit_max, |
112 | .maxlen = sizeof(int), | 119 | .maxlen = sizeof(int), |
113 | .mode = S_IRUGO|S_IWUSR, | 120 | .mode = S_IRUGO|S_IWUSR, |
114 | .proc_handler = &proc_dointvec, | 121 | .proc_handler = proc_dointvec, |
115 | }, | 122 | }, |
116 | { .ctl_name = 0 } | 123 | { } |
117 | }; | 124 | }; |
118 | 125 | ||
119 | static ctl_table raid_dir_table[] = { | 126 | static ctl_table raid_dir_table[] = { |
120 | { | 127 | { |
121 | .ctl_name = DEV_RAID, | ||
122 | .procname = "raid", | 128 | .procname = "raid", |
123 | .maxlen = 0, | 129 | .maxlen = 0, |
124 | .mode = S_IRUGO|S_IXUGO, | 130 | .mode = S_IRUGO|S_IXUGO, |
125 | .child = raid_table, | 131 | .child = raid_table, |
126 | }, | 132 | }, |
127 | { .ctl_name = 0 } | 133 | { } |
128 | }; | 134 | }; |
129 | 135 | ||
130 | static ctl_table raid_root_table[] = { | 136 | static ctl_table raid_root_table[] = { |
131 | { | 137 | { |
132 | .ctl_name = CTL_DEV, | ||
133 | .procname = "dev", | 138 | .procname = "dev", |
134 | .maxlen = 0, | 139 | .maxlen = 0, |
135 | .mode = 0555, | 140 | .mode = 0555, |
136 | .child = raid_dir_table, | 141 | .child = raid_dir_table, |
137 | }, | 142 | }, |
138 | { .ctl_name = 0 } | 143 | { } |
139 | }; | 144 | }; |
140 | 145 | ||
141 | static const struct block_device_operations md_fops; | 146 | static const struct block_device_operations md_fops; |
@@ -217,12 +222,12 @@ static int md_make_request(struct request_queue *q, struct bio *bio) | |||
217 | return 0; | 222 | return 0; |
218 | } | 223 | } |
219 | rcu_read_lock(); | 224 | rcu_read_lock(); |
220 | if (mddev->suspended) { | 225 | if (mddev->suspended || mddev->barrier) { |
221 | DEFINE_WAIT(__wait); | 226 | DEFINE_WAIT(__wait); |
222 | for (;;) { | 227 | for (;;) { |
223 | prepare_to_wait(&mddev->sb_wait, &__wait, | 228 | prepare_to_wait(&mddev->sb_wait, &__wait, |
224 | TASK_UNINTERRUPTIBLE); | 229 | TASK_UNINTERRUPTIBLE); |
225 | if (!mddev->suspended) | 230 | if (!mddev->suspended && !mddev->barrier) |
226 | break; | 231 | break; |
227 | rcu_read_unlock(); | 232 | rcu_read_unlock(); |
228 | schedule(); | 233 | schedule(); |
@@ -264,10 +269,110 @@ static void mddev_resume(mddev_t *mddev) | |||
264 | 269 | ||
265 | int mddev_congested(mddev_t *mddev, int bits) | 270 | int mddev_congested(mddev_t *mddev, int bits) |
266 | { | 271 | { |
272 | if (mddev->barrier) | ||
273 | return 1; | ||
267 | return mddev->suspended; | 274 | return mddev->suspended; |
268 | } | 275 | } |
269 | EXPORT_SYMBOL(mddev_congested); | 276 | EXPORT_SYMBOL(mddev_congested); |
270 | 277 | ||
278 | /* | ||
279 | * Generic barrier handling for md | ||
280 | */ | ||
281 | |||
282 | #define POST_REQUEST_BARRIER ((void*)1) | ||
283 | |||
284 | static void md_end_barrier(struct bio *bio, int err) | ||
285 | { | ||
286 | mdk_rdev_t *rdev = bio->bi_private; | ||
287 | mddev_t *mddev = rdev->mddev; | ||
288 | if (err == -EOPNOTSUPP && mddev->barrier != POST_REQUEST_BARRIER) | ||
289 | set_bit(BIO_EOPNOTSUPP, &mddev->barrier->bi_flags); | ||
290 | |||
291 | rdev_dec_pending(rdev, mddev); | ||
292 | |||
293 | if (atomic_dec_and_test(&mddev->flush_pending)) { | ||
294 | if (mddev->barrier == POST_REQUEST_BARRIER) { | ||
295 | /* This was a post-request barrier */ | ||
296 | mddev->barrier = NULL; | ||
297 | wake_up(&mddev->sb_wait); | ||
298 | } else | ||
299 | /* The pre-request barrier has finished */ | ||
300 | schedule_work(&mddev->barrier_work); | ||
301 | } | ||
302 | bio_put(bio); | ||
303 | } | ||
304 | |||
305 | static void submit_barriers(mddev_t *mddev) | ||
306 | { | ||
307 | mdk_rdev_t *rdev; | ||
308 | |||
309 | rcu_read_lock(); | ||
310 | list_for_each_entry_rcu(rdev, &mddev->disks, same_set) | ||
311 | if (rdev->raid_disk >= 0 && | ||
312 | !test_bit(Faulty, &rdev->flags)) { | ||
313 | /* Take two references, one is dropped | ||
314 | * when request finishes, one after | ||
315 | * we reclaim rcu_read_lock | ||
316 | */ | ||
317 | struct bio *bi; | ||
318 | atomic_inc(&rdev->nr_pending); | ||
319 | atomic_inc(&rdev->nr_pending); | ||
320 | rcu_read_unlock(); | ||
321 | bi = bio_alloc(GFP_KERNEL, 0); | ||
322 | bi->bi_end_io = md_end_barrier; | ||
323 | bi->bi_private = rdev; | ||
324 | bi->bi_bdev = rdev->bdev; | ||
325 | atomic_inc(&mddev->flush_pending); | ||
326 | submit_bio(WRITE_BARRIER, bi); | ||
327 | rcu_read_lock(); | ||
328 | rdev_dec_pending(rdev, mddev); | ||
329 | } | ||
330 | rcu_read_unlock(); | ||
331 | } | ||
332 | |||
333 | static void md_submit_barrier(struct work_struct *ws) | ||
334 | { | ||
335 | mddev_t *mddev = container_of(ws, mddev_t, barrier_work); | ||
336 | struct bio *bio = mddev->barrier; | ||
337 | |||
338 | atomic_set(&mddev->flush_pending, 1); | ||
339 | |||
340 | if (test_bit(BIO_EOPNOTSUPP, &bio->bi_flags)) | ||
341 | bio_endio(bio, -EOPNOTSUPP); | ||
342 | else if (bio->bi_size == 0) | ||
343 | /* an empty barrier - all done */ | ||
344 | bio_endio(bio, 0); | ||
345 | else { | ||
346 | bio->bi_rw &= ~(1<<BIO_RW_BARRIER); | ||
347 | if (mddev->pers->make_request(mddev->queue, bio)) | ||
348 | generic_make_request(bio); | ||
349 | mddev->barrier = POST_REQUEST_BARRIER; | ||
350 | submit_barriers(mddev); | ||
351 | } | ||
352 | if (atomic_dec_and_test(&mddev->flush_pending)) { | ||
353 | mddev->barrier = NULL; | ||
354 | wake_up(&mddev->sb_wait); | ||
355 | } | ||
356 | } | ||
357 | |||
358 | void md_barrier_request(mddev_t *mddev, struct bio *bio) | ||
359 | { | ||
360 | spin_lock_irq(&mddev->write_lock); | ||
361 | wait_event_lock_irq(mddev->sb_wait, | ||
362 | !mddev->barrier, | ||
363 | mddev->write_lock, /*nothing*/); | ||
364 | mddev->barrier = bio; | ||
365 | spin_unlock_irq(&mddev->write_lock); | ||
366 | |||
367 | atomic_set(&mddev->flush_pending, 1); | ||
368 | INIT_WORK(&mddev->barrier_work, md_submit_barrier); | ||
369 | |||
370 | submit_barriers(mddev); | ||
371 | |||
372 | if (atomic_dec_and_test(&mddev->flush_pending)) | ||
373 | schedule_work(&mddev->barrier_work); | ||
374 | } | ||
375 | EXPORT_SYMBOL(md_barrier_request); | ||
271 | 376 | ||
272 | static inline mddev_t *mddev_get(mddev_t *mddev) | 377 | static inline mddev_t *mddev_get(mddev_t *mddev) |
273 | { | 378 | { |
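
The barrier machinery added above is driven from each personality's make_request function; the multipath.c and raid0.c hunks later in this diff switch those personalities over to it. A minimal sketch of the calling convention, with a hypothetical personality name (the q->queuedata lookup mirrors what raid0_make_request already does):

    static int example_make_request(struct request_queue *q, struct bio *bio)
    {
            mddev_t *mddev = q->queuedata;

            if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) {
                    /* Hand the barrier to the generic md code: it flushes
                     * every member device, resubmits the payload with the
                     * barrier flag cleared, then issues a second round of
                     * flushes.
                     */
                    md_barrier_request(mddev, bio);
                    return 0;
            }
            /* ... normal mapping of the bio to member devices ... */
            return 0;
    }
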
@@ -282,7 +387,9 @@ static void mddev_put(mddev_t *mddev) | |||
282 | if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock)) | 387 | if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock)) |
283 | return; | 388 | return; |
284 | if (!mddev->raid_disks && list_empty(&mddev->disks) && | 389 | if (!mddev->raid_disks && list_empty(&mddev->disks) && |
285 | !mddev->hold_active) { | 390 | mddev->ctime == 0 && !mddev->hold_active) { |
391 | /* Array is not configured at all, and not held active, | ||
392 | * so destroy it */ | ||
286 | list_del(&mddev->all_mddevs); | 393 | list_del(&mddev->all_mddevs); |
287 | if (mddev->gendisk) { | 394 | if (mddev->gendisk) { |
288 | /* we did a probe so need to clean up. | 395 | /* we did a probe so need to clean up. |
@@ -367,6 +474,7 @@ static mddev_t * mddev_find(dev_t unit) | |||
367 | 474 | ||
368 | mutex_init(&new->open_mutex); | 475 | mutex_init(&new->open_mutex); |
369 | mutex_init(&new->reconfig_mutex); | 476 | mutex_init(&new->reconfig_mutex); |
477 | mutex_init(&new->bitmap_info.mutex); | ||
370 | INIT_LIST_HEAD(&new->disks); | 478 | INIT_LIST_HEAD(&new->disks); |
371 | INIT_LIST_HEAD(&new->all_mddevs); | 479 | INIT_LIST_HEAD(&new->all_mddevs); |
372 | init_timer(&new->safemode_timer); | 480 | init_timer(&new->safemode_timer); |
@@ -374,6 +482,7 @@ static mddev_t * mddev_find(dev_t unit) | |||
374 | atomic_set(&new->openers, 0); | 482 | atomic_set(&new->openers, 0); |
375 | atomic_set(&new->active_io, 0); | 483 | atomic_set(&new->active_io, 0); |
376 | spin_lock_init(&new->write_lock); | 484 | spin_lock_init(&new->write_lock); |
485 | atomic_set(&new->flush_pending, 0); | ||
377 | init_waitqueue_head(&new->sb_wait); | 486 | init_waitqueue_head(&new->sb_wait); |
378 | init_waitqueue_head(&new->recovery_wait); | 487 | init_waitqueue_head(&new->recovery_wait); |
379 | new->reshape_position = MaxSector; | 488 | new->reshape_position = MaxSector; |
@@ -752,7 +861,7 @@ struct super_type { | |||
752 | */ | 861 | */ |
753 | int md_check_no_bitmap(mddev_t *mddev) | 862 | int md_check_no_bitmap(mddev_t *mddev) |
754 | { | 863 | { |
755 | if (!mddev->bitmap_file && !mddev->bitmap_offset) | 864 | if (!mddev->bitmap_info.file && !mddev->bitmap_info.offset) |
756 | return 0; | 865 | return 0; |
757 | printk(KERN_ERR "%s: bitmaps are not supported for %s\n", | 866 | printk(KERN_ERR "%s: bitmaps are not supported for %s\n", |
758 | mdname(mddev), mddev->pers->name); | 867 | mdname(mddev), mddev->pers->name); |
@@ -880,8 +989,8 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) | |||
880 | mddev->raid_disks = sb->raid_disks; | 989 | mddev->raid_disks = sb->raid_disks; |
881 | mddev->dev_sectors = sb->size * 2; | 990 | mddev->dev_sectors = sb->size * 2; |
882 | mddev->events = ev1; | 991 | mddev->events = ev1; |
883 | mddev->bitmap_offset = 0; | 992 | mddev->bitmap_info.offset = 0; |
884 | mddev->default_bitmap_offset = MD_SB_BYTES >> 9; | 993 | mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9; |
885 | 994 | ||
886 | if (mddev->minor_version >= 91) { | 995 | if (mddev->minor_version >= 91) { |
887 | mddev->reshape_position = sb->reshape_position; | 996 | mddev->reshape_position = sb->reshape_position; |
@@ -915,8 +1024,9 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) | |||
915 | mddev->max_disks = MD_SB_DISKS; | 1024 | mddev->max_disks = MD_SB_DISKS; |
916 | 1025 | ||
917 | if (sb->state & (1<<MD_SB_BITMAP_PRESENT) && | 1026 | if (sb->state & (1<<MD_SB_BITMAP_PRESENT) && |
918 | mddev->bitmap_file == NULL) | 1027 | mddev->bitmap_info.file == NULL) |
919 | mddev->bitmap_offset = mddev->default_bitmap_offset; | 1028 | mddev->bitmap_info.offset = |
1029 | mddev->bitmap_info.default_offset; | ||
920 | 1030 | ||
921 | } else if (mddev->pers == NULL) { | 1031 | } else if (mddev->pers == NULL) { |
922 | /* Insist on good event counter while assembling */ | 1032 | /* Insist on good event counter while assembling */ |
@@ -1033,7 +1143,7 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) | |||
1033 | sb->layout = mddev->layout; | 1143 | sb->layout = mddev->layout; |
1034 | sb->chunk_size = mddev->chunk_sectors << 9; | 1144 | sb->chunk_size = mddev->chunk_sectors << 9; |
1035 | 1145 | ||
1036 | if (mddev->bitmap && mddev->bitmap_file == NULL) | 1146 | if (mddev->bitmap && mddev->bitmap_info.file == NULL) |
1037 | sb->state |= (1<<MD_SB_BITMAP_PRESENT); | 1147 | sb->state |= (1<<MD_SB_BITMAP_PRESENT); |
1038 | 1148 | ||
1039 | sb->disks[0].state = (1<<MD_DISK_REMOVED); | 1149 | sb->disks[0].state = (1<<MD_DISK_REMOVED); |
@@ -1111,7 +1221,7 @@ super_90_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors) | |||
1111 | { | 1221 | { |
1112 | if (num_sectors && num_sectors < rdev->mddev->dev_sectors) | 1222 | if (num_sectors && num_sectors < rdev->mddev->dev_sectors) |
1113 | return 0; /* component must fit device */ | 1223 | return 0; /* component must fit device */ |
1114 | if (rdev->mddev->bitmap_offset) | 1224 | if (rdev->mddev->bitmap_info.offset) |
1115 | return 0; /* can't move bitmap */ | 1225 | return 0; /* can't move bitmap */ |
1116 | rdev->sb_start = calc_dev_sboffset(rdev->bdev); | 1226 | rdev->sb_start = calc_dev_sboffset(rdev->bdev); |
1117 | if (!num_sectors || num_sectors > rdev->sb_start) | 1227 | if (!num_sectors || num_sectors > rdev->sb_start) |
@@ -1290,8 +1400,8 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) | |||
1290 | mddev->raid_disks = le32_to_cpu(sb->raid_disks); | 1400 | mddev->raid_disks = le32_to_cpu(sb->raid_disks); |
1291 | mddev->dev_sectors = le64_to_cpu(sb->size); | 1401 | mddev->dev_sectors = le64_to_cpu(sb->size); |
1292 | mddev->events = ev1; | 1402 | mddev->events = ev1; |
1293 | mddev->bitmap_offset = 0; | 1403 | mddev->bitmap_info.offset = 0; |
1294 | mddev->default_bitmap_offset = 1024 >> 9; | 1404 | mddev->bitmap_info.default_offset = 1024 >> 9; |
1295 | 1405 | ||
1296 | mddev->recovery_cp = le64_to_cpu(sb->resync_offset); | 1406 | mddev->recovery_cp = le64_to_cpu(sb->resync_offset); |
1297 | memcpy(mddev->uuid, sb->set_uuid, 16); | 1407 | memcpy(mddev->uuid, sb->set_uuid, 16); |
@@ -1299,8 +1409,9 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) | |||
1299 | mddev->max_disks = (4096-256)/2; | 1409 | mddev->max_disks = (4096-256)/2; |
1300 | 1410 | ||
1301 | if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) && | 1411 | if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) && |
1302 | mddev->bitmap_file == NULL ) | 1412 | mddev->bitmap_info.file == NULL ) |
1303 | mddev->bitmap_offset = (__s32)le32_to_cpu(sb->bitmap_offset); | 1413 | mddev->bitmap_info.offset = |
1414 | (__s32)le32_to_cpu(sb->bitmap_offset); | ||
1304 | 1415 | ||
1305 | if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) { | 1416 | if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) { |
1306 | mddev->reshape_position = le64_to_cpu(sb->reshape_position); | 1417 | mddev->reshape_position = le64_to_cpu(sb->reshape_position); |
@@ -1394,19 +1505,17 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) | |||
1394 | sb->level = cpu_to_le32(mddev->level); | 1505 | sb->level = cpu_to_le32(mddev->level); |
1395 | sb->layout = cpu_to_le32(mddev->layout); | 1506 | sb->layout = cpu_to_le32(mddev->layout); |
1396 | 1507 | ||
1397 | if (mddev->bitmap && mddev->bitmap_file == NULL) { | 1508 | if (mddev->bitmap && mddev->bitmap_info.file == NULL) { |
1398 | sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset); | 1509 | sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset); |
1399 | sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET); | 1510 | sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET); |
1400 | } | 1511 | } |
1401 | 1512 | ||
1402 | if (rdev->raid_disk >= 0 && | 1513 | if (rdev->raid_disk >= 0 && |
1403 | !test_bit(In_sync, &rdev->flags)) { | 1514 | !test_bit(In_sync, &rdev->flags)) { |
1404 | if (rdev->recovery_offset > 0) { | 1515 | sb->feature_map |= |
1405 | sb->feature_map |= | 1516 | cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET); |
1406 | cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET); | 1517 | sb->recovery_offset = |
1407 | sb->recovery_offset = | 1518 | cpu_to_le64(rdev->recovery_offset); |
1408 | cpu_to_le64(rdev->recovery_offset); | ||
1409 | } | ||
1410 | } | 1519 | } |
1411 | 1520 | ||
1412 | if (mddev->reshape_position != MaxSector) { | 1521 | if (mddev->reshape_position != MaxSector) { |
@@ -1440,7 +1549,7 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) | |||
1440 | sb->dev_roles[i] = cpu_to_le16(0xfffe); | 1549 | sb->dev_roles[i] = cpu_to_le16(0xfffe); |
1441 | else if (test_bit(In_sync, &rdev2->flags)) | 1550 | else if (test_bit(In_sync, &rdev2->flags)) |
1442 | sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); | 1551 | sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); |
1443 | else if (rdev2->raid_disk >= 0 && rdev2->recovery_offset > 0) | 1552 | else if (rdev2->raid_disk >= 0) |
1444 | sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); | 1553 | sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); |
1445 | else | 1554 | else |
1446 | sb->dev_roles[i] = cpu_to_le16(0xffff); | 1555 | sb->dev_roles[i] = cpu_to_le16(0xffff); |
@@ -1462,7 +1571,7 @@ super_1_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors) | |||
1462 | max_sectors -= rdev->data_offset; | 1571 | max_sectors -= rdev->data_offset; |
1463 | if (!num_sectors || num_sectors > max_sectors) | 1572 | if (!num_sectors || num_sectors > max_sectors) |
1464 | num_sectors = max_sectors; | 1573 | num_sectors = max_sectors; |
1465 | } else if (rdev->mddev->bitmap_offset) { | 1574 | } else if (rdev->mddev->bitmap_info.offset) { |
1466 | /* minor version 0 with bitmap we can't move */ | 1575 | /* minor version 0 with bitmap we can't move */ |
1467 | return 0; | 1576 | return 0; |
1468 | } else { | 1577 | } else { |
@@ -1830,15 +1939,11 @@ static void print_sb_1(struct mdp_superblock_1 *sb) | |||
1830 | 1939 | ||
1831 | uuid = sb->set_uuid; | 1940 | uuid = sb->set_uuid; |
1832 | printk(KERN_INFO | 1941 | printk(KERN_INFO |
1833 | "md: SB: (V:%u) (F:0x%08x) Array-ID:<%02x%02x%02x%02x" | 1942 | "md: SB: (V:%u) (F:0x%08x) Array-ID:<%pU>\n" |
1834 | ":%02x%02x:%02x%02x:%02x%02x:%02x%02x%02x%02x%02x%02x>\n" | ||
1835 | "md: Name: \"%s\" CT:%llu\n", | 1943 | "md: Name: \"%s\" CT:%llu\n", |
1836 | le32_to_cpu(sb->major_version), | 1944 | le32_to_cpu(sb->major_version), |
1837 | le32_to_cpu(sb->feature_map), | 1945 | le32_to_cpu(sb->feature_map), |
1838 | uuid[0], uuid[1], uuid[2], uuid[3], | 1946 | uuid, |
1839 | uuid[4], uuid[5], uuid[6], uuid[7], | ||
1840 | uuid[8], uuid[9], uuid[10], uuid[11], | ||
1841 | uuid[12], uuid[13], uuid[14], uuid[15], | ||
1842 | sb->set_name, | 1947 | sb->set_name, |
1843 | (unsigned long long)le64_to_cpu(sb->ctime) | 1948 | (unsigned long long)le64_to_cpu(sb->ctime) |
1844 | & MD_SUPERBLOCK_1_TIME_SEC_MASK); | 1949 | & MD_SUPERBLOCK_1_TIME_SEC_MASK); |
@@ -1847,8 +1952,7 @@ static void print_sb_1(struct mdp_superblock_1 *sb) | |||
1847 | printk(KERN_INFO | 1952 | printk(KERN_INFO |
1848 | "md: L%u SZ%llu RD:%u LO:%u CS:%u DO:%llu DS:%llu SO:%llu" | 1953 | "md: L%u SZ%llu RD:%u LO:%u CS:%u DO:%llu DS:%llu SO:%llu" |
1849 | " RO:%llu\n" | 1954 | " RO:%llu\n" |
1850 | "md: Dev:%08x UUID: %02x%02x%02x%02x:%02x%02x:%02x%02x:%02x%02x" | 1955 | "md: Dev:%08x UUID: %pU\n" |
1851 | ":%02x%02x%02x%02x%02x%02x\n" | ||
1852 | "md: (F:0x%08x) UT:%llu Events:%llu ResyncOffset:%llu CSUM:0x%08x\n" | 1956 | "md: (F:0x%08x) UT:%llu Events:%llu ResyncOffset:%llu CSUM:0x%08x\n" |
1853 | "md: (MaxDev:%u) \n", | 1957 | "md: (MaxDev:%u) \n", |
1854 | le32_to_cpu(sb->level), | 1958 | le32_to_cpu(sb->level), |
@@ -1861,10 +1965,7 @@ static void print_sb_1(struct mdp_superblock_1 *sb) | |||
1861 | (unsigned long long)le64_to_cpu(sb->super_offset), | 1965 | (unsigned long long)le64_to_cpu(sb->super_offset), |
1862 | (unsigned long long)le64_to_cpu(sb->recovery_offset), | 1966 | (unsigned long long)le64_to_cpu(sb->recovery_offset), |
1863 | le32_to_cpu(sb->dev_number), | 1967 | le32_to_cpu(sb->dev_number), |
1864 | uuid[0], uuid[1], uuid[2], uuid[3], | 1968 | uuid, |
1865 | uuid[4], uuid[5], uuid[6], uuid[7], | ||
1866 | uuid[8], uuid[9], uuid[10], uuid[11], | ||
1867 | uuid[12], uuid[13], uuid[14], uuid[15], | ||
1868 | sb->devflags, | 1969 | sb->devflags, |
1869 | (unsigned long long)le64_to_cpu(sb->utime) & MD_SUPERBLOCK_1_TIME_SEC_MASK, | 1970 | (unsigned long long)le64_to_cpu(sb->utime) & MD_SUPERBLOCK_1_TIME_SEC_MASK, |
1870 | (unsigned long long)le64_to_cpu(sb->events), | 1971 | (unsigned long long)le64_to_cpu(sb->events), |
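
The print_sb_1 hunks above rely on the %pU printk extension, which formats a pointer to a 16-byte UUID as 8-4-4-4-12 hex groups and replaces the sixteen separate %02x arguments (the separator changes from ':' to '-' as a side effect). A small illustration with a made-up value:

    u8 uuid[16] = {
            0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77,
            0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff
    };

    /* prints: md: Array-ID:<00112233-4455-6677-8899-aabbccddeeff> */
    printk(KERN_INFO "md: Array-ID:<%pU>\n", uuid);
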
@@ -2008,12 +2109,18 @@ repeat: | |||
2008 | if (!mddev->in_sync || mddev->recovery_cp != MaxSector) { /* not clean */ | 2109 | if (!mddev->in_sync || mddev->recovery_cp != MaxSector) { /* not clean */ |
2009 | /* .. if the array isn't clean, an 'even' event must also go | 2110 | /* .. if the array isn't clean, an 'even' event must also go |
2010 | * to spares. */ | 2111 | * to spares. */ |
2011 | if ((mddev->events&1)==0) | 2112 | if ((mddev->events&1)==0) { |
2012 | nospares = 0; | 2113 | nospares = 0; |
2114 | sync_req = 2; /* force a second update to get the | ||
2115 | * even/odd in sync */ | ||
2116 | } | ||
2013 | } else { | 2117 | } else { |
2014 | /* otherwise an 'odd' event must go to spares */ | 2118 | /* otherwise an 'odd' event must go to spares */ |
2015 | if ((mddev->events&1)) | 2119 | if ((mddev->events&1)) { |
2016 | nospares = 0; | 2120 | nospares = 0; |
2121 | sync_req = 2; /* force a second update to get the | ||
2122 | * even/odd in sync */ | ||
2123 | } | ||
2017 | } | 2124 | } |
2018 | } | 2125 | } |
2019 | 2126 | ||
@@ -2446,12 +2553,49 @@ rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len) | |||
2446 | static struct rdev_sysfs_entry rdev_size = | 2553 | static struct rdev_sysfs_entry rdev_size = |
2447 | __ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store); | 2554 | __ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store); |
2448 | 2555 | ||
2556 | |||
2557 | static ssize_t recovery_start_show(mdk_rdev_t *rdev, char *page) | ||
2558 | { | ||
2559 | unsigned long long recovery_start = rdev->recovery_offset; | ||
2560 | |||
2561 | if (test_bit(In_sync, &rdev->flags) || | ||
2562 | recovery_start == MaxSector) | ||
2563 | return sprintf(page, "none\n"); | ||
2564 | |||
2565 | return sprintf(page, "%llu\n", recovery_start); | ||
2566 | } | ||
2567 | |||
2568 | static ssize_t recovery_start_store(mdk_rdev_t *rdev, const char *buf, size_t len) | ||
2569 | { | ||
2570 | unsigned long long recovery_start; | ||
2571 | |||
2572 | if (cmd_match(buf, "none")) | ||
2573 | recovery_start = MaxSector; | ||
2574 | else if (strict_strtoull(buf, 10, &recovery_start)) | ||
2575 | return -EINVAL; | ||
2576 | |||
2577 | if (rdev->mddev->pers && | ||
2578 | rdev->raid_disk >= 0) | ||
2579 | return -EBUSY; | ||
2580 | |||
2581 | rdev->recovery_offset = recovery_start; | ||
2582 | if (recovery_start == MaxSector) | ||
2583 | set_bit(In_sync, &rdev->flags); | ||
2584 | else | ||
2585 | clear_bit(In_sync, &rdev->flags); | ||
2586 | return len; | ||
2587 | } | ||
2588 | |||
2589 | static struct rdev_sysfs_entry rdev_recovery_start = | ||
2590 | __ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store); | ||
2591 | |||
2449 | static struct attribute *rdev_default_attrs[] = { | 2592 | static struct attribute *rdev_default_attrs[] = { |
2450 | &rdev_state.attr, | 2593 | &rdev_state.attr, |
2451 | &rdev_errors.attr, | 2594 | &rdev_errors.attr, |
2452 | &rdev_slot.attr, | 2595 | &rdev_slot.attr, |
2453 | &rdev_offset.attr, | 2596 | &rdev_offset.attr, |
2454 | &rdev_size.attr, | 2597 | &rdev_size.attr, |
2598 | &rdev_recovery_start.attr, | ||
2455 | NULL, | 2599 | NULL, |
2456 | }; | 2600 | }; |
2457 | static ssize_t | 2601 | static ssize_t |
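
recovery_start joins the other per-device attributes exported under the array's sysfs tree, so a partially-recovered member can be inspected (and, while inactive, adjusted) from userspace. A short userspace sketch for reading it; the device names in the path are illustrative, not taken from this patch:

    #include <stdio.h>

    int main(void)
    {
            /* layout: /sys/block/<array>/md/dev-<member>/recovery_start */
            FILE *f = fopen("/sys/block/md0/md/dev-sda1/recovery_start", "r");
            char buf[32];

            if (f == NULL)
                    return 1;
            if (fgets(buf, sizeof(buf), f))
                    /* either "none" (fully in-sync) or a sector count */
                    printf("recovery_start: %s", buf);
            fclose(f);
            return 0;
    }

Writes are only accepted while the device is not active in a running array (recovery_start_store above returns -EBUSY otherwise), and writing "none" marks the device In_sync.
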
@@ -2505,7 +2649,7 @@ static void rdev_free(struct kobject *ko) | |||
2505 | mdk_rdev_t *rdev = container_of(ko, mdk_rdev_t, kobj); | 2649 | mdk_rdev_t *rdev = container_of(ko, mdk_rdev_t, kobj); |
2506 | kfree(rdev); | 2650 | kfree(rdev); |
2507 | } | 2651 | } |
2508 | static struct sysfs_ops rdev_sysfs_ops = { | 2652 | static const struct sysfs_ops rdev_sysfs_ops = { |
2509 | .show = rdev_attr_show, | 2653 | .show = rdev_attr_show, |
2510 | .store = rdev_attr_store, | 2654 | .store = rdev_attr_store, |
2511 | }; | 2655 | }; |
@@ -2553,6 +2697,8 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi | |||
2553 | rdev->flags = 0; | 2697 | rdev->flags = 0; |
2554 | rdev->data_offset = 0; | 2698 | rdev->data_offset = 0; |
2555 | rdev->sb_events = 0; | 2699 | rdev->sb_events = 0; |
2700 | rdev->last_read_error.tv_sec = 0; | ||
2701 | rdev->last_read_error.tv_nsec = 0; | ||
2556 | atomic_set(&rdev->nr_pending, 0); | 2702 | atomic_set(&rdev->nr_pending, 0); |
2557 | atomic_set(&rdev->read_errors, 0); | 2703 | atomic_set(&rdev->read_errors, 0); |
2558 | atomic_set(&rdev->corrected_errors, 0); | 2704 | atomic_set(&rdev->corrected_errors, 0); |
@@ -2663,6 +2809,47 @@ static void analyze_sbs(mddev_t * mddev) | |||
2663 | } | 2809 | } |
2664 | } | 2810 | } |
2665 | 2811 | ||
2812 | /* Read a fixed-point number. | ||
2813 | * Numbers in sysfs attributes should be in "standard" units where | ||
2814 | * possible, so time should be in seconds. | ||
2815 | * However we internally use a much smaller unit such as | ||
2816 | * milliseconds or jiffies. | ||
2817 | * This function takes a decimal number with a possible fractional | ||
2818 | * component, and produces an integer which is the result of | ||
2819 | * multiplying that number by 10^'scale', | ||
2820 | * all without any floating-point arithmetic. | ||
2821 | */ | ||
2822 | int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale) | ||
2823 | { | ||
2824 | unsigned long result = 0; | ||
2825 | long decimals = -1; | ||
2826 | while (isdigit(*cp) || (*cp == '.' && decimals < 0)) { | ||
2827 | if (*cp == '.') | ||
2828 | decimals = 0; | ||
2829 | else if (decimals < scale) { | ||
2830 | unsigned int value; | ||
2831 | value = *cp - '0'; | ||
2832 | result = result * 10 + value; | ||
2833 | if (decimals >= 0) | ||
2834 | decimals++; | ||
2835 | } | ||
2836 | cp++; | ||
2837 | } | ||
2838 | if (*cp == '\n') | ||
2839 | cp++; | ||
2840 | if (*cp) | ||
2841 | return -EINVAL; | ||
2842 | if (decimals < 0) | ||
2843 | decimals = 0; | ||
2844 | while (decimals < scale) { | ||
2845 | result *= 10; | ||
2846 | decimals ++; | ||
2847 | } | ||
2848 | *res = result; | ||
2849 | return 0; | ||
2850 | } | ||
2851 | |||
2852 | |||
2666 | static void md_safemode_timeout(unsigned long data); | 2853 | static void md_safemode_timeout(unsigned long data); |
2667 | 2854 | ||
2668 | static ssize_t | 2855 | static ssize_t |
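
A worked example of the fixed-point parser's behaviour, with input values chosen purely for illustration (each call below returns 0):

    unsigned long val;

    strict_strtoul_scaled("12.345\n", &val, 3);  /* val == 12345 */
    strict_strtoul_scaled("5", &val, 3);         /* val == 5000  */
    strict_strtoul_scaled("0.2", &val, 3);       /* val == 200   */
    strict_strtoul_scaled("1.23456", &val, 3);   /* val == 1234 (extra digits dropped) */

This is what lets safe_delay_store in the next hunk collapse to a single call: writing "0.2" to safemode_delay now parses directly to 200 milliseconds.
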
@@ -2674,31 +2861,10 @@ safe_delay_show(mddev_t *mddev, char *page) | |||
2674 | static ssize_t | 2861 | static ssize_t |
2675 | safe_delay_store(mddev_t *mddev, const char *cbuf, size_t len) | 2862 | safe_delay_store(mddev_t *mddev, const char *cbuf, size_t len) |
2676 | { | 2863 | { |
2677 | int scale=1; | ||
2678 | int dot=0; | ||
2679 | int i; | ||
2680 | unsigned long msec; | 2864 | unsigned long msec; |
2681 | char buf[30]; | ||
2682 | 2865 | ||
2683 | /* remove a period, and count digits after it */ | 2866 | if (strict_strtoul_scaled(cbuf, &msec, 3) < 0) |
2684 | if (len >= sizeof(buf)) | ||
2685 | return -EINVAL; | ||
2686 | strlcpy(buf, cbuf, sizeof(buf)); | ||
2687 | for (i=0; i<len; i++) { | ||
2688 | if (dot) { | ||
2689 | if (isdigit(buf[i])) { | ||
2690 | buf[i-1] = buf[i]; | ||
2691 | scale *= 10; | ||
2692 | } | ||
2693 | buf[i] = 0; | ||
2694 | } else if (buf[i] == '.') { | ||
2695 | dot=1; | ||
2696 | buf[i] = 0; | ||
2697 | } | ||
2698 | } | ||
2699 | if (strict_strtoul(buf, 10, &msec) < 0) | ||
2700 | return -EINVAL; | 2867 | return -EINVAL; |
2701 | msec = (msec * 1000) / scale; | ||
2702 | if (msec == 0) | 2868 | if (msec == 0) |
2703 | mddev->safemode_delay = 0; | 2869 | mddev->safemode_delay = 0; |
2704 | else { | 2870 | else { |
@@ -2974,7 +3140,9 @@ resync_start_store(mddev_t *mddev, const char *buf, size_t len) | |||
2974 | 3140 | ||
2975 | if (mddev->pers) | 3141 | if (mddev->pers) |
2976 | return -EBUSY; | 3142 | return -EBUSY; |
2977 | if (!*buf || (*e && *e != '\n')) | 3143 | if (cmd_match(buf, "none")) |
3144 | n = MaxSector; | ||
3145 | else if (!*buf || (*e && *e != '\n')) | ||
2978 | return -EINVAL; | 3146 | return -EINVAL; |
2979 | 3147 | ||
2980 | mddev->recovery_cp = n; | 3148 | mddev->recovery_cp = n; |
@@ -3170,6 +3338,29 @@ static struct md_sysfs_entry md_array_state = | |||
3170 | __ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store); | 3338 | __ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store); |
3171 | 3339 | ||
3172 | static ssize_t | 3340 | static ssize_t |
3341 | max_corrected_read_errors_show(mddev_t *mddev, char *page) { | ||
3342 | return sprintf(page, "%d\n", | ||
3343 | atomic_read(&mddev->max_corr_read_errors)); | ||
3344 | } | ||
3345 | |||
3346 | static ssize_t | ||
3347 | max_corrected_read_errors_store(mddev_t *mddev, const char *buf, size_t len) | ||
3348 | { | ||
3349 | char *e; | ||
3350 | unsigned long n = simple_strtoul(buf, &e, 10); | ||
3351 | |||
3352 | if (*buf && (*e == 0 || *e == '\n')) { | ||
3353 | atomic_set(&mddev->max_corr_read_errors, n); | ||
3354 | return len; | ||
3355 | } | ||
3356 | return -EINVAL; | ||
3357 | } | ||
3358 | |||
3359 | static struct md_sysfs_entry max_corr_read_errors = | ||
3360 | __ATTR(max_read_errors, S_IRUGO|S_IWUSR, max_corrected_read_errors_show, | ||
3361 | max_corrected_read_errors_store); | ||
3362 | |||
3363 | static ssize_t | ||
3173 | null_show(mddev_t *mddev, char *page) | 3364 | null_show(mddev_t *mddev, char *page) |
3174 | { | 3365 | { |
3175 | return -EINVAL; | 3366 | return -EINVAL; |
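
max_read_errors is seeded in do_md_run (later in this file) with MD_DEFAULT_MAX_CORRECTED_READ_ERRORS. A personality can compare a device's consecutive read-error count against this ceiling and fail the device once it is exceeded; a minimal sketch of that check, not code from this hunk:

    if (atomic_read(&rdev->read_errors) >
        atomic_read(&mddev->max_corr_read_errors))
            md_error(mddev, rdev);  /* kick the persistently-failing device */
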
@@ -3250,8 +3441,7 @@ bitmap_store(mddev_t *mddev, const char *buf, size_t len) | |||
3250 | } | 3441 | } |
3251 | if (*end && !isspace(*end)) break; | 3442 | if (*end && !isspace(*end)) break; |
3252 | bitmap_dirty_bits(mddev->bitmap, chunk, end_chunk); | 3443 | bitmap_dirty_bits(mddev->bitmap, chunk, end_chunk); |
3253 | buf = end; | 3444 | buf = skip_spaces(end); |
3254 | while (isspace(*buf)) buf++; | ||
3255 | } | 3445 | } |
3256 | bitmap_unplug(mddev->bitmap); /* flush the bits to disk */ | 3446 | bitmap_unplug(mddev->bitmap); /* flush the bits to disk */ |
3257 | out: | 3447 | out: |
@@ -3794,6 +3984,7 @@ static struct attribute *md_default_attrs[] = { | |||
3794 | &md_array_state.attr, | 3984 | &md_array_state.attr, |
3795 | &md_reshape_position.attr, | 3985 | &md_reshape_position.attr, |
3796 | &md_array_size.attr, | 3986 | &md_array_size.attr, |
3987 | &max_corr_read_errors.attr, | ||
3797 | NULL, | 3988 | NULL, |
3798 | }; | 3989 | }; |
3799 | 3990 | ||
@@ -3875,7 +4066,7 @@ static void md_free(struct kobject *ko) | |||
3875 | kfree(mddev); | 4066 | kfree(mddev); |
3876 | } | 4067 | } |
3877 | 4068 | ||
3878 | static struct sysfs_ops md_sysfs_ops = { | 4069 | static const struct sysfs_ops md_sysfs_ops = { |
3879 | .show = md_attr_show, | 4070 | .show = md_attr_show, |
3880 | .store = md_attr_store, | 4071 | .store = md_attr_store, |
3881 | }; | 4072 | }; |
@@ -3891,13 +4082,16 @@ static void mddev_delayed_delete(struct work_struct *ws) | |||
3891 | { | 4082 | { |
3892 | mddev_t *mddev = container_of(ws, mddev_t, del_work); | 4083 | mddev_t *mddev = container_of(ws, mddev_t, del_work); |
3893 | 4084 | ||
3894 | if (mddev->private == &md_redundancy_group) { | 4085 | if (mddev->private) { |
3895 | sysfs_remove_group(&mddev->kobj, &md_redundancy_group); | 4086 | sysfs_remove_group(&mddev->kobj, &md_redundancy_group); |
4087 | if (mddev->private != (void*)1) | ||
4088 | sysfs_remove_group(&mddev->kobj, mddev->private); | ||
3896 | if (mddev->sysfs_action) | 4089 | if (mddev->sysfs_action) |
3897 | sysfs_put(mddev->sysfs_action); | 4090 | sysfs_put(mddev->sysfs_action); |
3898 | mddev->sysfs_action = NULL; | 4091 | mddev->sysfs_action = NULL; |
3899 | mddev->private = NULL; | 4092 | mddev->private = NULL; |
3900 | } | 4093 | } |
4094 | sysfs_remove_group(&mddev->kobj, &md_bitmap_group); | ||
3901 | kobject_del(&mddev->kobj); | 4095 | kobject_del(&mddev->kobj); |
3902 | kobject_put(&mddev->kobj); | 4096 | kobject_put(&mddev->kobj); |
3903 | } | 4097 | } |
@@ -3989,6 +4183,8 @@ static int md_alloc(dev_t dev, char *name) | |||
3989 | disk->disk_name); | 4183 | disk->disk_name); |
3990 | error = 0; | 4184 | error = 0; |
3991 | } | 4185 | } |
4186 | if (sysfs_create_group(&mddev->kobj, &md_bitmap_group)) | ||
4187 | printk(KERN_DEBUG "pointless warning\n"); | ||
3992 | abort: | 4188 | abort: |
3993 | mutex_unlock(&disks_mutex); | 4189 | mutex_unlock(&disks_mutex); |
3994 | if (!error) { | 4190 | if (!error) { |
@@ -4100,10 +4296,7 @@ static int do_md_run(mddev_t * mddev) | |||
4100 | sysfs_notify_dirent(rdev->sysfs_state); | 4296 | sysfs_notify_dirent(rdev->sysfs_state); |
4101 | } | 4297 | } |
4102 | 4298 | ||
4103 | md_probe(mddev->unit, NULL, NULL); | ||
4104 | disk = mddev->gendisk; | 4299 | disk = mddev->gendisk; |
4105 | if (!disk) | ||
4106 | return -ENOMEM; | ||
4107 | 4300 | ||
4108 | spin_lock(&pers_lock); | 4301 | spin_lock(&pers_lock); |
4109 | pers = find_pers(mddev->level, mddev->clevel); | 4302 | pers = find_pers(mddev->level, mddev->clevel); |
@@ -4170,7 +4363,7 @@ static int do_md_run(mddev_t * mddev) | |||
4170 | mddev->barriers_work = 1; | 4363 | mddev->barriers_work = 1; |
4171 | mddev->ok_start_degraded = start_dirty_degraded; | 4364 | mddev->ok_start_degraded = start_dirty_degraded; |
4172 | 4365 | ||
4173 | if (start_readonly) | 4366 | if (start_readonly && mddev->ro == 0) |
4174 | mddev->ro = 2; /* read-only, but switch on first write */ | 4367 | mddev->ro = 2; /* read-only, but switch on first write */ |
4175 | 4368 | ||
4176 | err = mddev->pers->run(mddev); | 4369 | err = mddev->pers->run(mddev); |
@@ -4210,6 +4403,8 @@ static int do_md_run(mddev_t * mddev) | |||
4210 | mddev->ro = 0; | 4403 | mddev->ro = 0; |
4211 | 4404 | ||
4212 | atomic_set(&mddev->writes_pending,0); | 4405 | atomic_set(&mddev->writes_pending,0); |
4406 | atomic_set(&mddev->max_corr_read_errors, | ||
4407 | MD_DEFAULT_MAX_CORRECTED_READ_ERRORS); | ||
4213 | mddev->safemode = 0; | 4408 | mddev->safemode = 0; |
4214 | mddev->safemode_timer.function = md_safemode_timeout; | 4409 | mddev->safemode_timer.function = md_safemode_timeout; |
4215 | mddev->safemode_timer.data = (unsigned long) mddev; | 4410 | mddev->safemode_timer.data = (unsigned long) mddev; |
@@ -4232,33 +4427,6 @@ static int do_md_run(mddev_t * mddev) | |||
4232 | 4427 | ||
4233 | set_capacity(disk, mddev->array_sectors); | 4428 | set_capacity(disk, mddev->array_sectors); |
4234 | 4429 | ||
4235 | /* If there is a partially-recovered drive we need to | ||
4236 | * start recovery here. If we leave it to md_check_recovery, | ||
4237 | * it will remove the drives and not do the right thing | ||
4238 | */ | ||
4239 | if (mddev->degraded && !mddev->sync_thread) { | ||
4240 | int spares = 0; | ||
4241 | list_for_each_entry(rdev, &mddev->disks, same_set) | ||
4242 | if (rdev->raid_disk >= 0 && | ||
4243 | !test_bit(In_sync, &rdev->flags) && | ||
4244 | !test_bit(Faulty, &rdev->flags)) | ||
4245 | /* complete an interrupted recovery */ | ||
4246 | spares++; | ||
4247 | if (spares && mddev->pers->sync_request) { | ||
4248 | mddev->recovery = 0; | ||
4249 | set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); | ||
4250 | mddev->sync_thread = md_register_thread(md_do_sync, | ||
4251 | mddev, | ||
4252 | "resync"); | ||
4253 | if (!mddev->sync_thread) { | ||
4254 | printk(KERN_ERR "%s: could not start resync" | ||
4255 | " thread...\n", | ||
4256 | mdname(mddev)); | ||
4257 | /* leave the spares where they are, it shouldn't hurt */ | ||
4258 | mddev->recovery = 0; | ||
4259 | } | ||
4260 | } | ||
4261 | } | ||
4262 | md_wakeup_thread(mddev->thread); | 4430 | md_wakeup_thread(mddev->thread); |
4263 | md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ | 4431 | md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ |
4264 | 4432 | ||
@@ -4314,7 +4482,7 @@ static int deny_bitmap_write_access(struct file * file) | |||
4314 | return 0; | 4482 | return 0; |
4315 | } | 4483 | } |
4316 | 4484 | ||
4317 | static void restore_bitmap_write_access(struct file *file) | 4485 | void restore_bitmap_write_access(struct file *file) |
4318 | { | 4486 | { |
4319 | struct inode *inode = file->f_mapping->host; | 4487 | struct inode *inode = file->f_mapping->host; |
4320 | 4488 | ||
@@ -4368,8 +4536,8 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) | |||
4368 | mddev->queue->unplug_fn = NULL; | 4536 | mddev->queue->unplug_fn = NULL; |
4369 | mddev->queue->backing_dev_info.congested_fn = NULL; | 4537 | mddev->queue->backing_dev_info.congested_fn = NULL; |
4370 | module_put(mddev->pers->owner); | 4538 | module_put(mddev->pers->owner); |
4371 | if (mddev->pers->sync_request) | 4539 | if (mddev->pers->sync_request && mddev->private == NULL) |
4372 | mddev->private = &md_redundancy_group; | 4540 | mddev->private = (void*)1; |
4373 | mddev->pers = NULL; | 4541 | mddev->pers = NULL; |
4374 | /* tell userspace to handle 'inactive' */ | 4542 | /* tell userspace to handle 'inactive' */ |
4375 | sysfs_notify_dirent(mddev->sysfs_state); | 4543 | sysfs_notify_dirent(mddev->sysfs_state); |
@@ -4409,15 +4577,12 @@ out: | |||
4409 | printk(KERN_INFO "md: %s stopped.\n", mdname(mddev)); | 4577 | printk(KERN_INFO "md: %s stopped.\n", mdname(mddev)); |
4410 | 4578 | ||
4411 | bitmap_destroy(mddev); | 4579 | bitmap_destroy(mddev); |
4412 | if (mddev->bitmap_file) { | 4580 | if (mddev->bitmap_info.file) { |
4413 | restore_bitmap_write_access(mddev->bitmap_file); | 4581 | restore_bitmap_write_access(mddev->bitmap_info.file); |
4414 | fput(mddev->bitmap_file); | 4582 | fput(mddev->bitmap_info.file); |
4415 | mddev->bitmap_file = NULL; | 4583 | mddev->bitmap_info.file = NULL; |
4416 | } | 4584 | } |
4417 | mddev->bitmap_offset = 0; | 4585 | mddev->bitmap_info.offset = 0; |
4418 | |||
4419 | /* make sure all md_delayed_delete calls have finished */ | ||
4420 | flush_scheduled_work(); | ||
4421 | 4586 | ||
4422 | export_array(mddev); | 4587 | export_array(mddev); |
4423 | 4588 | ||
@@ -4455,6 +4620,11 @@ out: | |||
4455 | mddev->degraded = 0; | 4620 | mddev->degraded = 0; |
4456 | mddev->barriers_work = 0; | 4621 | mddev->barriers_work = 0; |
4457 | mddev->safemode = 0; | 4622 | mddev->safemode = 0; |
4623 | mddev->bitmap_info.offset = 0; | ||
4624 | mddev->bitmap_info.default_offset = 0; | ||
4625 | mddev->bitmap_info.chunksize = 0; | ||
4626 | mddev->bitmap_info.daemon_sleep = 0; | ||
4627 | mddev->bitmap_info.max_write_behind = 0; | ||
4458 | kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); | 4628 | kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); |
4459 | if (mddev->hold_active == UNTIL_STOP) | 4629 | if (mddev->hold_active == UNTIL_STOP) |
4460 | mddev->hold_active = 0; | 4630 | mddev->hold_active = 0; |
@@ -4640,7 +4810,7 @@ static int get_array_info(mddev_t * mddev, void __user * arg) | |||
4640 | info.state = 0; | 4810 | info.state = 0; |
4641 | if (mddev->in_sync) | 4811 | if (mddev->in_sync) |
4642 | info.state = (1<<MD_SB_CLEAN); | 4812 | info.state = (1<<MD_SB_CLEAN); |
4643 | if (mddev->bitmap && mddev->bitmap_offset) | 4813 | if (mddev->bitmap && mddev->bitmap_info.offset) |
4644 | info.state = (1<<MD_SB_BITMAP_PRESENT); | 4814 | info.state = (1<<MD_SB_BITMAP_PRESENT); |
4645 | info.active_disks = insync; | 4815 | info.active_disks = insync; |
4646 | info.working_disks = working; | 4816 | info.working_disks = working; |
@@ -4998,23 +5168,23 @@ static int set_bitmap_file(mddev_t *mddev, int fd) | |||
4998 | if (fd >= 0) { | 5168 | if (fd >= 0) { |
4999 | if (mddev->bitmap) | 5169 | if (mddev->bitmap) |
5000 | return -EEXIST; /* cannot add when bitmap is present */ | 5170 | return -EEXIST; /* cannot add when bitmap is present */ |
5001 | mddev->bitmap_file = fget(fd); | 5171 | mddev->bitmap_info.file = fget(fd); |
5002 | 5172 | ||
5003 | if (mddev->bitmap_file == NULL) { | 5173 | if (mddev->bitmap_info.file == NULL) { |
5004 | printk(KERN_ERR "%s: error: failed to get bitmap file\n", | 5174 | printk(KERN_ERR "%s: error: failed to get bitmap file\n", |
5005 | mdname(mddev)); | 5175 | mdname(mddev)); |
5006 | return -EBADF; | 5176 | return -EBADF; |
5007 | } | 5177 | } |
5008 | 5178 | ||
5009 | err = deny_bitmap_write_access(mddev->bitmap_file); | 5179 | err = deny_bitmap_write_access(mddev->bitmap_info.file); |
5010 | if (err) { | 5180 | if (err) { |
5011 | printk(KERN_ERR "%s: error: bitmap file is already in use\n", | 5181 | printk(KERN_ERR "%s: error: bitmap file is already in use\n", |
5012 | mdname(mddev)); | 5182 | mdname(mddev)); |
5013 | fput(mddev->bitmap_file); | 5183 | fput(mddev->bitmap_info.file); |
5014 | mddev->bitmap_file = NULL; | 5184 | mddev->bitmap_info.file = NULL; |
5015 | return err; | 5185 | return err; |
5016 | } | 5186 | } |
5017 | mddev->bitmap_offset = 0; /* file overrides offset */ | 5187 | mddev->bitmap_info.offset = 0; /* file overrides offset */ |
5018 | } else if (mddev->bitmap == NULL) | 5188 | } else if (mddev->bitmap == NULL) |
5019 | return -ENOENT; /* cannot remove what isn't there */ | 5189 | return -ENOENT; /* cannot remove what isn't there */ |
5020 | err = 0; | 5190 | err = 0; |
@@ -5029,11 +5199,11 @@ static int set_bitmap_file(mddev_t *mddev, int fd) | |||
5029 | mddev->pers->quiesce(mddev, 0); | 5199 | mddev->pers->quiesce(mddev, 0); |
5030 | } | 5200 | } |
5031 | if (fd < 0) { | 5201 | if (fd < 0) { |
5032 | if (mddev->bitmap_file) { | 5202 | if (mddev->bitmap_info.file) { |
5033 | restore_bitmap_write_access(mddev->bitmap_file); | 5203 | restore_bitmap_write_access(mddev->bitmap_info.file); |
5034 | fput(mddev->bitmap_file); | 5204 | fput(mddev->bitmap_info.file); |
5035 | } | 5205 | } |
5036 | mddev->bitmap_file = NULL; | 5206 | mddev->bitmap_info.file = NULL; |
5037 | } | 5207 | } |
5038 | 5208 | ||
5039 | return err; | 5209 | return err; |
@@ -5070,6 +5240,10 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info) | |||
5070 | mddev->minor_version = info->minor_version; | 5240 | mddev->minor_version = info->minor_version; |
5071 | mddev->patch_version = info->patch_version; | 5241 | mddev->patch_version = info->patch_version; |
5072 | mddev->persistent = !info->not_persistent; | 5242 | mddev->persistent = !info->not_persistent; |
5243 | /* ensure mddev_put doesn't delete this now that there | ||
5244 | * is some minimal configuration. | ||
5245 | */ | ||
5246 | mddev->ctime = get_seconds(); | ||
5073 | return 0; | 5247 | return 0; |
5074 | } | 5248 | } |
5075 | mddev->major_version = MD_MAJOR_VERSION; | 5249 | mddev->major_version = MD_MAJOR_VERSION; |
@@ -5100,8 +5274,8 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info) | |||
5100 | mddev->flags = 0; | 5274 | mddev->flags = 0; |
5101 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | 5275 | set_bit(MD_CHANGE_DEVS, &mddev->flags); |
5102 | 5276 | ||
5103 | mddev->default_bitmap_offset = MD_SB_BYTES >> 9; | 5277 | mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9; |
5104 | mddev->bitmap_offset = 0; | 5278 | mddev->bitmap_info.offset = 0; |
5105 | 5279 | ||
5106 | mddev->reshape_position = MaxSector; | 5280 | mddev->reshape_position = MaxSector; |
5107 | 5281 | ||
@@ -5201,7 +5375,7 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info) | |||
5201 | int state = 0; | 5375 | int state = 0; |
5202 | 5376 | ||
5203 | /* calculate expected state,ignoring low bits */ | 5377 | /* calculate expected state,ignoring low bits */ |
5204 | if (mddev->bitmap && mddev->bitmap_offset) | 5378 | if (mddev->bitmap && mddev->bitmap_info.offset) |
5205 | state |= (1 << MD_SB_BITMAP_PRESENT); | 5379 | state |= (1 << MD_SB_BITMAP_PRESENT); |
5206 | 5380 | ||
5207 | if (mddev->major_version != info->major_version || | 5381 | if (mddev->major_version != info->major_version || |
@@ -5260,9 +5434,10 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info) | |||
5260 | /* add the bitmap */ | 5434 | /* add the bitmap */ |
5261 | if (mddev->bitmap) | 5435 | if (mddev->bitmap) |
5262 | return -EEXIST; | 5436 | return -EEXIST; |
5263 | if (mddev->default_bitmap_offset == 0) | 5437 | if (mddev->bitmap_info.default_offset == 0) |
5264 | return -EINVAL; | 5438 | return -EINVAL; |
5265 | mddev->bitmap_offset = mddev->default_bitmap_offset; | 5439 | mddev->bitmap_info.offset = |
5440 | mddev->bitmap_info.default_offset; | ||
5266 | mddev->pers->quiesce(mddev, 1); | 5441 | mddev->pers->quiesce(mddev, 1); |
5267 | rv = bitmap_create(mddev); | 5442 | rv = bitmap_create(mddev); |
5268 | if (rv) | 5443 | if (rv) |
@@ -5277,7 +5452,7 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info) | |||
5277 | mddev->pers->quiesce(mddev, 1); | 5452 | mddev->pers->quiesce(mddev, 1); |
5278 | bitmap_destroy(mddev); | 5453 | bitmap_destroy(mddev); |
5279 | mddev->pers->quiesce(mddev, 0); | 5454 | mddev->pers->quiesce(mddev, 0); |
5280 | mddev->bitmap_offset = 0; | 5455 | mddev->bitmap_info.offset = 0; |
5281 | } | 5456 | } |
5282 | } | 5457 | } |
5283 | md_update_sb(mddev, 1); | 5458 | md_update_sb(mddev, 1); |
@@ -5528,6 +5703,25 @@ done: | |||
5528 | abort: | 5703 | abort: |
5529 | return err; | 5704 | return err; |
5530 | } | 5705 | } |
5706 | #ifdef CONFIG_COMPAT | ||
5707 | static int md_compat_ioctl(struct block_device *bdev, fmode_t mode, | ||
5708 | unsigned int cmd, unsigned long arg) | ||
5709 | { | ||
5710 | switch (cmd) { | ||
5711 | case HOT_REMOVE_DISK: | ||
5712 | case HOT_ADD_DISK: | ||
5713 | case SET_DISK_FAULTY: | ||
5714 | case SET_BITMAP_FILE: | ||
5715 | /* These take an integer arg, do not convert */ | ||
5716 | break; | ||
5717 | default: | ||
5718 | arg = (unsigned long)compat_ptr(arg); | ||
5719 | break; | ||
5720 | } | ||
5721 | |||
5722 | return md_ioctl(bdev, mode, cmd, arg); | ||
5723 | } | ||
5724 | #endif /* CONFIG_COMPAT */ | ||
5531 | 5725 | ||
5532 | static int md_open(struct block_device *bdev, fmode_t mode) | 5726 | static int md_open(struct block_device *bdev, fmode_t mode) |
5533 | { | 5727 | { |
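
The compat path only rewrites arguments that are user-space pointers; the four commands listed above carry a plain integer (a kernel dev_t or a file descriptor) in arg and must be passed through untouched. From a 32-bit process the distinction is invisible; an illustrative caller (device path and error handling are for illustration only):

    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <linux/raid/md_u.h>

    int main(void)
    {
            mdu_array_info_t info;
            int fd = open("/dev/md0", O_RDONLY);

            if (fd < 0)
                    return 1;
            /* GET_ARRAY_INFO passes a pointer, so on a 64-bit kernel a
             * 32-bit caller goes through the compat_ptr() default case. */
            if (ioctl(fd, GET_ARRAY_INFO, &info) == 0)
                    printf("level %d, %d raid disks\n",
                           info.level, info.raid_disks);
            return 0;
    }
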
@@ -5593,6 +5787,9 @@ static const struct block_device_operations md_fops = | |||
5593 | .open = md_open, | 5787 | .open = md_open, |
5594 | .release = md_release, | 5788 | .release = md_release, |
5595 | .ioctl = md_ioctl, | 5789 | .ioctl = md_ioctl, |
5790 | #ifdef CONFIG_COMPAT | ||
5791 | .compat_ioctl = md_compat_ioctl, | ||
5792 | #endif | ||
5596 | .getgeo = md_getgeo, | 5793 | .getgeo = md_getgeo, |
5597 | .media_changed = md_media_changed, | 5794 | .media_changed = md_media_changed, |
5598 | .revalidate_disk= md_revalidate, | 5795 | .revalidate_disk= md_revalidate, |
@@ -5986,14 +6183,14 @@ static int md_seq_show(struct seq_file *seq, void *v) | |||
5986 | unsigned long chunk_kb; | 6183 | unsigned long chunk_kb; |
5987 | unsigned long flags; | 6184 | unsigned long flags; |
5988 | spin_lock_irqsave(&bitmap->lock, flags); | 6185 | spin_lock_irqsave(&bitmap->lock, flags); |
5989 | chunk_kb = bitmap->chunksize >> 10; | 6186 | chunk_kb = mddev->bitmap_info.chunksize >> 10; |
5990 | seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], " | 6187 | seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], " |
5991 | "%lu%s chunk", | 6188 | "%lu%s chunk", |
5992 | bitmap->pages - bitmap->missing_pages, | 6189 | bitmap->pages - bitmap->missing_pages, |
5993 | bitmap->pages, | 6190 | bitmap->pages, |
5994 | (bitmap->pages - bitmap->missing_pages) | 6191 | (bitmap->pages - bitmap->missing_pages) |
5995 | << (PAGE_SHIFT - 10), | 6192 | << (PAGE_SHIFT - 10), |
5996 | chunk_kb ? chunk_kb : bitmap->chunksize, | 6193 | chunk_kb ? chunk_kb : mddev->bitmap_info.chunksize, |
5997 | chunk_kb ? "KB" : "B"); | 6194 | chunk_kb ? "KB" : "B"); |
5998 | if (bitmap->file) { | 6195 | if (bitmap->file) { |
5999 | seq_printf(seq, ", file: "); | 6196 | seq_printf(seq, ", file: "); |
@@ -6279,10 +6476,11 @@ void md_do_sync(mddev_t *mddev) | |||
6279 | mddev->curr_resync = 2; | 6476 | mddev->curr_resync = 2; |
6280 | 6477 | ||
6281 | try_again: | 6478 | try_again: |
6282 | if (kthread_should_stop()) { | 6479 | if (kthread_should_stop()) |
6283 | set_bit(MD_RECOVERY_INTR, &mddev->recovery); | 6480 | set_bit(MD_RECOVERY_INTR, &mddev->recovery); |
6481 | |||
6482 | if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) | ||
6284 | goto skip; | 6483 | goto skip; |
6285 | } | ||
6286 | for_each_mddev(mddev2, tmp) { | 6484 | for_each_mddev(mddev2, tmp) { |
6287 | if (mddev2 == mddev) | 6485 | if (mddev2 == mddev) |
6288 | continue; | 6486 | continue; |
@@ -6342,12 +6540,14 @@ void md_do_sync(mddev_t *mddev) | |||
6342 | /* recovery follows the physical size of devices */ | 6540 | /* recovery follows the physical size of devices */ |
6343 | max_sectors = mddev->dev_sectors; | 6541 | max_sectors = mddev->dev_sectors; |
6344 | j = MaxSector; | 6542 | j = MaxSector; |
6345 | list_for_each_entry(rdev, &mddev->disks, same_set) | 6543 | rcu_read_lock(); |
6544 | list_for_each_entry_rcu(rdev, &mddev->disks, same_set) | ||
6346 | if (rdev->raid_disk >= 0 && | 6545 | if (rdev->raid_disk >= 0 && |
6347 | !test_bit(Faulty, &rdev->flags) && | 6546 | !test_bit(Faulty, &rdev->flags) && |
6348 | !test_bit(In_sync, &rdev->flags) && | 6547 | !test_bit(In_sync, &rdev->flags) && |
6349 | rdev->recovery_offset < j) | 6548 | rdev->recovery_offset < j) |
6350 | j = rdev->recovery_offset; | 6549 | j = rdev->recovery_offset; |
6550 | rcu_read_unlock(); | ||
6351 | } | 6551 | } |
6352 | 6552 | ||
6353 | printk(KERN_INFO "md: %s of RAID array %s\n", desc, mdname(mddev)); | 6553 | printk(KERN_INFO "md: %s of RAID array %s\n", desc, mdname(mddev)); |
@@ -6384,6 +6584,7 @@ void md_do_sync(mddev_t *mddev) | |||
6384 | desc, mdname(mddev)); | 6584 | desc, mdname(mddev)); |
6385 | mddev->curr_resync = j; | 6585 | mddev->curr_resync = j; |
6386 | } | 6586 | } |
6587 | mddev->curr_resync_completed = mddev->curr_resync; | ||
6387 | 6588 | ||
6388 | while (j < max_sectors) { | 6589 | while (j < max_sectors) { |
6389 | sector_t sectors; | 6590 | sector_t sectors; |
@@ -6516,22 +6717,29 @@ void md_do_sync(mddev_t *mddev) | |||
6516 | } else { | 6717 | } else { |
6517 | if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) | 6718 | if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) |
6518 | mddev->curr_resync = MaxSector; | 6719 | mddev->curr_resync = MaxSector; |
6519 | list_for_each_entry(rdev, &mddev->disks, same_set) | 6720 | rcu_read_lock(); |
6721 | list_for_each_entry_rcu(rdev, &mddev->disks, same_set) | ||
6520 | if (rdev->raid_disk >= 0 && | 6722 | if (rdev->raid_disk >= 0 && |
6521 | !test_bit(Faulty, &rdev->flags) && | 6723 | !test_bit(Faulty, &rdev->flags) && |
6522 | !test_bit(In_sync, &rdev->flags) && | 6724 | !test_bit(In_sync, &rdev->flags) && |
6523 | rdev->recovery_offset < mddev->curr_resync) | 6725 | rdev->recovery_offset < mddev->curr_resync) |
6524 | rdev->recovery_offset = mddev->curr_resync; | 6726 | rdev->recovery_offset = mddev->curr_resync; |
6727 | rcu_read_unlock(); | ||
6525 | } | 6728 | } |
6526 | } | 6729 | } |
6527 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | 6730 | set_bit(MD_CHANGE_DEVS, &mddev->flags); |
6528 | 6731 | ||
6529 | skip: | 6732 | skip: |
6733 | if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { | ||
6734 | /* We completed so min/max setting can be forgotten if used. */ | ||
6735 | if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) | ||
6736 | mddev->resync_min = 0; | ||
6737 | mddev->resync_max = MaxSector; | ||
6738 | } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) | ||
6739 | mddev->resync_min = mddev->curr_resync_completed; | ||
6530 | mddev->curr_resync = 0; | 6740 | mddev->curr_resync = 0; |
6531 | mddev->curr_resync_completed = 0; | ||
6532 | if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) | 6741 | if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) |
6533 | /* We completed so max setting can be forgotten. */ | 6742 | mddev->curr_resync_completed = 0; |
6534 | mddev->resync_max = MaxSector; | ||
6535 | sysfs_notify(&mddev->kobj, NULL, "sync_completed"); | 6743 | sysfs_notify(&mddev->kobj, NULL, "sync_completed"); |
6536 | wake_up(&resync_wait); | 6744 | wake_up(&resync_wait); |
6537 | set_bit(MD_RECOVERY_DONE, &mddev->recovery); | 6745 | set_bit(MD_RECOVERY_DONE, &mddev->recovery); |
@@ -6594,6 +6802,7 @@ static int remove_and_add_spares(mddev_t *mddev) | |||
6594 | nm, mdname(mddev)); | 6802 | nm, mdname(mddev)); |
6595 | spares++; | 6803 | spares++; |
6596 | md_new_event(mddev); | 6804 | md_new_event(mddev); |
6805 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | ||
6597 | } else | 6806 | } else |
6598 | break; | 6807 | break; |
6599 | } | 6808 | } |
@@ -6629,7 +6838,7 @@ void md_check_recovery(mddev_t *mddev) | |||
6629 | 6838 | ||
6630 | 6839 | ||
6631 | if (mddev->bitmap) | 6840 | if (mddev->bitmap) |
6632 | bitmap_daemon_work(mddev->bitmap); | 6841 | bitmap_daemon_work(mddev); |
6633 | 6842 | ||
6634 | if (mddev->ro) | 6843 | if (mddev->ro) |
6635 | return; | 6844 | return; |
@@ -6999,5 +7208,6 @@ EXPORT_SYMBOL(md_unregister_thread); | |||
6999 | EXPORT_SYMBOL(md_wakeup_thread); | 7208 | EXPORT_SYMBOL(md_wakeup_thread); |
7000 | EXPORT_SYMBOL(md_check_recovery); | 7209 | EXPORT_SYMBOL(md_check_recovery); |
7001 | MODULE_LICENSE("GPL"); | 7210 | MODULE_LICENSE("GPL"); |
7211 | MODULE_DESCRIPTION("MD RAID framework"); | ||
7002 | MODULE_ALIAS("md"); | 7212 | MODULE_ALIAS("md"); |
7003 | MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR); | 7213 | MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR); |
diff --git a/drivers/md/md.h b/drivers/md/md.h index f184b69ef337..8e4c75c00d46 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h | |||
@@ -97,6 +97,9 @@ struct mdk_rdev_s | |||
97 | atomic_t read_errors; /* number of consecutive read errors that | 97 | atomic_t read_errors; /* number of consecutive read errors that |
98 | * we have tried to ignore. | 98 | * we have tried to ignore. |
99 | */ | 99 | */ |
100 | struct timespec last_read_error; /* monotonic time since our | ||
101 | * last read error | ||
102 | */ | ||
100 | atomic_t corrected_errors; /* number of corrected read errors, | 103 | atomic_t corrected_errors; /* number of corrected read errors, |
101 | * for reporting to userspace and storing | 104 | * for reporting to userspace and storing |
102 | * in superblock. | 105 | * in superblock. |
@@ -280,17 +283,38 @@ struct mddev_s | |||
280 | unsigned int max_write_behind; /* 0 = sync */ | 283 | unsigned int max_write_behind; /* 0 = sync */ |
281 | 284 | ||
282 | struct bitmap *bitmap; /* the bitmap for the device */ | 285 | struct bitmap *bitmap; /* the bitmap for the device */ |
283 | struct file *bitmap_file; /* the bitmap file */ | 286 | struct { |
284 | long bitmap_offset; /* offset from superblock of | 287 | struct file *file; /* the bitmap file */ |
285 | * start of bitmap. May be | 288 | loff_t offset; /* offset from superblock of |
286 | * negative, but not '0' | 289 | * start of bitmap. May be |
287 | */ | 290 | * negative, but not '0' |
288 | long default_bitmap_offset; /* this is the offset to use when | 291 | * For external metadata, offset |
289 | * hot-adding a bitmap. It should | 292 | * from start of device. |
290 | * eventually be settable by sysfs. | 293 | */ |
291 | */ | 294 | loff_t default_offset; /* this is the offset to use when |
292 | 295 | * hot-adding a bitmap. It should | |
296 | * eventually be settable by sysfs. | ||
297 | */ | ||
298 | struct mutex mutex; | ||
299 | unsigned long chunksize; | ||
300 | unsigned long daemon_sleep; /* how many seconds between updates? */ | ||
301 | unsigned long max_write_behind; /* write-behind mode */ | ||
302 | int external; | ||
303 | } bitmap_info; | ||
304 | |||
305 | atomic_t max_corr_read_errors; /* max read retries */ | ||
293 | struct list_head all_mddevs; | 306 | struct list_head all_mddevs; |
307 | |||
308 | /* Generic barrier handling. | ||
309 | * If there is a pending barrier request, all other | ||
310 | * writes are blocked while the devices are flushed. | ||
311 | * The last to finish a flush schedules a worker to | ||
312 | * submit the barrier request (without the barrier flag), | ||
313 | * then submit more flush requests. | ||
314 | */ | ||
315 | struct bio *barrier; | ||
316 | atomic_t flush_pending; | ||
317 | struct work_struct barrier_work; | ||
294 | }; | 318 | }; |
295 | 319 | ||
296 | 320 | ||
@@ -353,7 +377,7 @@ struct md_sysfs_entry { | |||
353 | ssize_t (*show)(mddev_t *, char *); | 377 | ssize_t (*show)(mddev_t *, char *); |
354 | ssize_t (*store)(mddev_t *, const char *, size_t); | 378 | ssize_t (*store)(mddev_t *, const char *, size_t); |
355 | }; | 379 | }; |
356 | 380 | extern struct attribute_group md_bitmap_group; | |
357 | 381 | ||
358 | static inline char * mdname (mddev_t * mddev) | 382 | static inline char * mdname (mddev_t * mddev) |
359 | { | 383 | { |
@@ -431,6 +455,7 @@ extern void md_done_sync(mddev_t *mddev, int blocks, int ok); | |||
431 | extern void md_error(mddev_t *mddev, mdk_rdev_t *rdev); | 455 | extern void md_error(mddev_t *mddev, mdk_rdev_t *rdev); |
432 | 456 | ||
433 | extern int mddev_congested(mddev_t *mddev, int bits); | 457 | extern int mddev_congested(mddev_t *mddev, int bits); |
458 | extern void md_barrier_request(mddev_t *mddev, struct bio *bio); | ||
434 | extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, | 459 | extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, |
435 | sector_t sector, int size, struct page *page); | 460 | sector_t sector, int size, struct page *page); |
436 | extern void md_super_wait(mddev_t *mddev); | 461 | extern void md_super_wait(mddev_t *mddev); |
@@ -443,6 +468,8 @@ extern void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev); | |||
443 | extern void md_set_array_sectors(mddev_t *mddev, sector_t array_sectors); | 468 | extern void md_set_array_sectors(mddev_t *mddev, sector_t array_sectors); |
444 | extern int md_check_no_bitmap(mddev_t *mddev); | 469 | extern int md_check_no_bitmap(mddev_t *mddev); |
445 | extern int md_integrity_register(mddev_t *mddev); | 470 | extern int md_integrity_register(mddev_t *mddev); |
446 | void md_integrity_add_rdev(mdk_rdev_t *rdev, mddev_t *mddev); | 471 | extern void md_integrity_add_rdev(mdk_rdev_t *rdev, mddev_t *mddev); |
472 | extern int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale); | ||
473 | extern void restore_bitmap_write_access(struct file *file); | ||
447 | 474 | ||
448 | #endif /* _MD_MD_H */ | 475 | #endif /* _MD_MD_H */ |
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index ee7646f974a0..789bf535d29c 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c | |||
@@ -22,6 +22,7 @@ | |||
22 | #include <linux/blkdev.h> | 22 | #include <linux/blkdev.h> |
23 | #include <linux/raid/md_u.h> | 23 | #include <linux/raid/md_u.h> |
24 | #include <linux/seq_file.h> | 24 | #include <linux/seq_file.h> |
25 | #include <linux/slab.h> | ||
25 | #include "md.h" | 26 | #include "md.h" |
26 | #include "multipath.h" | 27 | #include "multipath.h" |
27 | 28 | ||
@@ -145,7 +146,7 @@ static int multipath_make_request (struct request_queue *q, struct bio * bio) | |||
145 | int cpu; | 146 | int cpu; |
146 | 147 | ||
147 | if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) { | 148 | if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) { |
148 | bio_endio(bio, -EOPNOTSUPP); | 149 | md_barrier_request(mddev, bio); |
149 | return 0; | 150 | return 0; |
150 | } | 151 | } |
151 | 152 | ||
@@ -301,14 +302,16 @@ static int multipath_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) | |||
301 | rdev->data_offset << 9); | 302 | rdev->data_offset << 9); |
302 | 303 | ||
303 | /* as we don't honour merge_bvec_fn, we must never risk | 304 | /* as we don't honour merge_bvec_fn, we must never risk |
304 | * violating it, so limit ->max_sector to one PAGE, as | 305 | * violating it, so limit ->max_segments to one, lying |
305 | * a one page request is never in violation. | 306 | * within a single page. |
306 | * (Note: it is very unlikely that a device with | 307 | * (Note: it is very unlikely that a device with |
307 | * merge_bvec_fn will be involved in multipath.) | 308 | * merge_bvec_fn will be involved in multipath.) |
308 | */ | 309 | */ |
309 | if (q->merge_bvec_fn && | 310 | if (q->merge_bvec_fn) { |
310 | queue_max_sectors(q) > (PAGE_SIZE>>9)) | 311 | blk_queue_max_segments(mddev->queue, 1); |
311 | blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); | 312 | blk_queue_segment_boundary(mddev->queue, |
313 | PAGE_CACHE_SIZE - 1); | ||
314 | } | ||
312 | 315 | ||
313 | conf->working_disks++; | 316 | conf->working_disks++; |
314 | mddev->degraded--; | 317 | mddev->degraded--; |
@@ -476,9 +479,11 @@ static int multipath_run (mddev_t *mddev) | |||
476 | /* as we don't honour merge_bvec_fn, we must never risk | 479 | /* as we don't honour merge_bvec_fn, we must never risk |
477 | * violating it, not that we ever expect a device with | 480 | * violating it, not that we ever expect a device with |
478 | * a merge_bvec_fn to be involved in multipath */ | 481 | * a merge_bvec_fn to be involved in multipath */ |
479 | if (rdev->bdev->bd_disk->queue->merge_bvec_fn && | 482 | if (rdev->bdev->bd_disk->queue->merge_bvec_fn) { |
480 | queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9)) | 483 | blk_queue_max_segments(mddev->queue, 1); |
481 | blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); | 484 | blk_queue_segment_boundary(mddev->queue, |
485 | PAGE_CACHE_SIZE - 1); | ||
486 | } | ||
482 | 487 | ||
483 | if (!test_bit(Faulty, &rdev->flags)) | 488 | if (!test_bit(Faulty, &rdev->flags)) |
484 | conf->working_disks++; | 489 | conf->working_disks++; |
@@ -581,6 +586,7 @@ static void __exit multipath_exit (void) | |||
581 | module_init(multipath_init); | 586 | module_init(multipath_init); |
582 | module_exit(multipath_exit); | 587 | module_exit(multipath_exit); |
583 | MODULE_LICENSE("GPL"); | 588 | MODULE_LICENSE("GPL"); |
589 | MODULE_DESCRIPTION("simple multi-path personality for MD"); | ||
584 | MODULE_ALIAS("md-personality-7"); /* MULTIPATH */ | 590 | MODULE_ALIAS("md-personality-7"); /* MULTIPATH */ |
585 | MODULE_ALIAS("md-multipath"); | 591 | MODULE_ALIAS("md-multipath"); |
586 | MODULE_ALIAS("md-level--4"); | 592 | MODULE_ALIAS("md-level--4"); |
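The same substitution recurs below in raid0, raid1 and raid10: instead of clamping max_sectors to one page, the queue is capped at a single segment that may not cross a page boundary whenever a member device exposes a merge_bvec_fn that md does not honour. A minimal standalone sketch of that shared idea, using a hypothetical helper name that the patch itself does not add:

#include <linux/blkdev.h>
#include <linux/pagemap.h>

/* Hypothetical helper, for illustration only.  A queue limited to one
 * segment that cannot cross a page boundary can only ever carry
 * single-page requests, and a single-page request can never violate a
 * member device's merge_bvec_fn. */
static void limit_queue_to_one_page_segment(struct request_queue *q)
{
	blk_queue_max_segments(q, 1);
	blk_queue_segment_boundary(q, PAGE_CACHE_SIZE - 1);
}

Each personality below makes the equivalent two calls directly on mddev->queue.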
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index d3a4ce06015a..c3bec024612e 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c | |||
@@ -20,6 +20,7 @@ | |||
20 | 20 | ||
21 | #include <linux/blkdev.h> | 21 | #include <linux/blkdev.h> |
22 | #include <linux/seq_file.h> | 22 | #include <linux/seq_file.h> |
23 | #include <linux/slab.h> | ||
23 | #include "md.h" | 24 | #include "md.h" |
24 | #include "raid0.h" | 25 | #include "raid0.h" |
25 | 26 | ||
@@ -176,14 +177,15 @@ static int create_strip_zones(mddev_t *mddev) | |||
176 | disk_stack_limits(mddev->gendisk, rdev1->bdev, | 177 | disk_stack_limits(mddev->gendisk, rdev1->bdev, |
177 | rdev1->data_offset << 9); | 178 | rdev1->data_offset << 9); |
178 | /* as we don't honour merge_bvec_fn, we must never risk | 179 | /* as we don't honour merge_bvec_fn, we must never risk |
179 | * violating it, so limit ->max_sector to one PAGE, as | 180 | * violating it, so limit ->max_segments to 1, lying within |
180 | * a one page request is never in violation. | 181 | * a single page. |
181 | */ | 182 | */ |
182 | 183 | ||
183 | if (rdev1->bdev->bd_disk->queue->merge_bvec_fn && | 184 | if (rdev1->bdev->bd_disk->queue->merge_bvec_fn) { |
184 | queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9)) | 185 | blk_queue_max_segments(mddev->queue, 1); |
185 | blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); | 186 | blk_queue_segment_boundary(mddev->queue, |
186 | 187 | PAGE_CACHE_SIZE - 1); | |
188 | } | ||
187 | if (!smallest || (rdev1->sectors < smallest->sectors)) | 189 | if (!smallest || (rdev1->sectors < smallest->sectors)) |
188 | smallest = rdev1; | 190 | smallest = rdev1; |
189 | cnt++; | 191 | cnt++; |
@@ -325,7 +327,7 @@ static int raid0_run(mddev_t *mddev) | |||
325 | } | 327 | } |
326 | if (md_check_no_bitmap(mddev)) | 328 | if (md_check_no_bitmap(mddev)) |
327 | return -EINVAL; | 329 | return -EINVAL; |
328 | blk_queue_max_sectors(mddev->queue, mddev->chunk_sectors); | 330 | blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors); |
329 | mddev->queue->queue_lock = &mddev->queue->__queue_lock; | 331 | mddev->queue->queue_lock = &mddev->queue->__queue_lock; |
330 | 332 | ||
331 | ret = create_strip_zones(mddev); | 333 | ret = create_strip_zones(mddev); |
@@ -453,7 +455,7 @@ static int raid0_make_request(struct request_queue *q, struct bio *bio) | |||
453 | int cpu; | 455 | int cpu; |
454 | 456 | ||
455 | if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) { | 457 | if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) { |
456 | bio_endio(bio, -EOPNOTSUPP); | 458 | md_barrier_request(mddev, bio); |
457 | return 0; | 459 | return 0; |
458 | } | 460 | } |
459 | 461 | ||
@@ -567,6 +569,7 @@ static void raid0_exit (void) | |||
567 | module_init(raid0_init); | 569 | module_init(raid0_init); |
568 | module_exit(raid0_exit); | 570 | module_exit(raid0_exit); |
569 | MODULE_LICENSE("GPL"); | 571 | MODULE_LICENSE("GPL"); |
572 | MODULE_DESCRIPTION("RAID0 (striping) personality for MD"); | ||
570 | MODULE_ALIAS("md-personality-2"); /* RAID0 */ | 573 | MODULE_ALIAS("md-personality-2"); /* RAID0 */ |
571 | MODULE_ALIAS("md-raid0"); | 574 | MODULE_ALIAS("md-raid0"); |
572 | MODULE_ALIAS("md-level-0"); | 575 | MODULE_ALIAS("md-level-0"); |
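The barrier change above follows the same pattern as multipath (and raid10 further down): a BIO_RW_BARRIER bio is no longer rejected with -EOPNOTSUPP but handed to the md core. Reduced to its shape, assuming the 2.6.33-era make_request_fn signature and md's internal md.h helpers; demo_make_request is a stand-in name, not code from the patch:

#include <linux/blkdev.h>
#include "md.h"	/* mddev_t, md_barrier_request() */

static int demo_make_request(struct request_queue *q, struct bio *bio)
{
	mddev_t *mddev = q->queuedata;

	if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) {
		/* let the md core order the barrier against other array I/O */
		md_barrier_request(mddev, bio);
		return 0;
	}
	/* ... normal striping / mirroring path would continue here ... */
	return 0;
}

raid5 needs more than this prologue, as its hunk near the end of this patch shows: it also quiesces the array and waits for in-flight stripes before passing the barrier on.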
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index e07ce2e033a9..e59b10e66edb 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c | |||
@@ -31,6 +31,7 @@ | |||
31 | * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | 31 | * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. |
32 | */ | 32 | */ |
33 | 33 | ||
34 | #include <linux/slab.h> | ||
34 | #include <linux/delay.h> | 35 | #include <linux/delay.h> |
35 | #include <linux/blkdev.h> | 36 | #include <linux/blkdev.h> |
36 | #include <linux/seq_file.h> | 37 | #include <linux/seq_file.h> |
@@ -677,6 +678,7 @@ static void raise_barrier(conf_t *conf) | |||
677 | static void lower_barrier(conf_t *conf) | 678 | static void lower_barrier(conf_t *conf) |
678 | { | 679 | { |
679 | unsigned long flags; | 680 | unsigned long flags; |
681 | BUG_ON(conf->barrier <= 0); | ||
680 | spin_lock_irqsave(&conf->resync_lock, flags); | 682 | spin_lock_irqsave(&conf->resync_lock, flags); |
681 | conf->barrier--; | 683 | conf->barrier--; |
682 | spin_unlock_irqrestore(&conf->resync_lock, flags); | 684 | spin_unlock_irqrestore(&conf->resync_lock, flags); |
@@ -801,6 +803,25 @@ static int make_request(struct request_queue *q, struct bio * bio) | |||
801 | 803 | ||
802 | md_write_start(mddev, bio); /* wait on superblock update early */ | 804 | md_write_start(mddev, bio); /* wait on superblock update early */ |
803 | 805 | ||
806 | if (bio_data_dir(bio) == WRITE && | ||
807 | bio->bi_sector + bio->bi_size/512 > mddev->suspend_lo && | ||
808 | bio->bi_sector < mddev->suspend_hi) { | ||
809 | /* As the suspend_* range is controlled by | ||
810 | * userspace, we want an interruptible | ||
811 | * wait. | ||
812 | */ | ||
813 | DEFINE_WAIT(w); | ||
814 | for (;;) { | ||
815 | flush_signals(current); | ||
816 | prepare_to_wait(&conf->wait_barrier, | ||
817 | &w, TASK_INTERRUPTIBLE); | ||
818 | if (bio->bi_sector + bio->bi_size/512 <= mddev->suspend_lo || | ||
819 | bio->bi_sector >= mddev->suspend_hi) | ||
820 | break; | ||
821 | schedule(); | ||
822 | } | ||
823 | finish_wait(&conf->wait_barrier, &w); | ||
824 | } | ||
804 | if (unlikely(!mddev->barriers_work && | 825 | if (unlikely(!mddev->barriers_work && |
805 | bio_rw_flagged(bio, BIO_RW_BARRIER))) { | 826 | bio_rw_flagged(bio, BIO_RW_BARRIER))) { |
806 | if (rw == WRITE) | 827 | if (rw == WRITE) |
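Because the suspend_lo/suspend_hi range is driven from userspace, the new wait sleeps in TASK_INTERRUPTIBLE and flushes signals on every pass, so a pending signal can neither abort the wait nor turn it into a busy loop. The idiom in isolation, with hypothetical stand-ins for conf->wait_barrier and the suspend-range test:

#include <linux/sched.h>
#include <linux/wait.h>

/* Illustrative only: loop with prepare_to_wait()/schedule() until the
 * blocking condition clears, then tear the waiter down. */
static void demo_wait_until_unblocked(wait_queue_head_t *wq,
				      int (*still_blocked)(void))
{
	DEFINE_WAIT(w);

	for (;;) {
		flush_signals(current);	/* don't let a signal spin the loop */
		prepare_to_wait(wq, &w, TASK_INTERRUPTIBLE);
		if (!still_blocked())
			break;
		schedule();
	}
	finish_wait(wq, &w);
}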
@@ -923,7 +944,8 @@ static int make_request(struct request_queue *q, struct bio * bio) | |||
923 | 944 | ||
924 | /* do behind I/O ? */ | 945 | /* do behind I/O ? */ |
925 | if (bitmap && | 946 | if (bitmap && |
926 | atomic_read(&bitmap->behind_writes) < bitmap->max_write_behind && | 947 | (atomic_read(&bitmap->behind_writes) |
948 | < mddev->bitmap_info.max_write_behind) && | ||
927 | (behind_pages = alloc_behind_pages(bio)) != NULL) | 949 | (behind_pages = alloc_behind_pages(bio)) != NULL) |
928 | set_bit(R1BIO_BehindIO, &r1_bio->state); | 950 | set_bit(R1BIO_BehindIO, &r1_bio->state); |
929 | 951 | ||
@@ -1131,13 +1153,17 @@ static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) | |||
1131 | 1153 | ||
1132 | disk_stack_limits(mddev->gendisk, rdev->bdev, | 1154 | disk_stack_limits(mddev->gendisk, rdev->bdev, |
1133 | rdev->data_offset << 9); | 1155 | rdev->data_offset << 9); |
1134 | /* as we don't honour merge_bvec_fn, we must never risk | 1156 | /* as we don't honour merge_bvec_fn, we must |
1135 | * violating it, so limit ->max_sector to one PAGE, as | 1157 | * never risk violating it, so limit |
1136 | * a one page request is never in | 1158 | * ->max_segments to one lying within a single |
1159 | * page, as a one page request is never in | ||
1160 | * violation. | ||
1137 | */ | 1161 | */ |
1138 | if (rdev->bdev->bd_disk->queue->merge_bvec_fn && | 1162 | if (rdev->bdev->bd_disk->queue->merge_bvec_fn) { |
1139 | queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9)) | 1163 | blk_queue_max_segments(mddev->queue, 1); |
1140 | blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); | 1164 | blk_queue_segment_boundary(mddev->queue, |
1165 | PAGE_CACHE_SIZE - 1); | ||
1166 | } | ||
1141 | 1167 | ||
1142 | p->head_position = 0; | 1168 | p->head_position = 0; |
1143 | rdev->raid_disk = mirror; | 1169 | rdev->raid_disk = mirror; |
@@ -1941,74 +1967,48 @@ static sector_t raid1_size(mddev_t *mddev, sector_t sectors, int raid_disks) | |||
1941 | return mddev->dev_sectors; | 1967 | return mddev->dev_sectors; |
1942 | } | 1968 | } |
1943 | 1969 | ||
1944 | static int run(mddev_t *mddev) | 1970 | static conf_t *setup_conf(mddev_t *mddev) |
1945 | { | 1971 | { |
1946 | conf_t *conf; | 1972 | conf_t *conf; |
1947 | int i, j, disk_idx; | 1973 | int i; |
1948 | mirror_info_t *disk; | 1974 | mirror_info_t *disk; |
1949 | mdk_rdev_t *rdev; | 1975 | mdk_rdev_t *rdev; |
1976 | int err = -ENOMEM; | ||
1950 | 1977 | ||
1951 | if (mddev->level != 1) { | ||
1952 | printk("raid1: %s: raid level not set to mirroring (%d)\n", | ||
1953 | mdname(mddev), mddev->level); | ||
1954 | goto out; | ||
1955 | } | ||
1956 | if (mddev->reshape_position != MaxSector) { | ||
1957 | printk("raid1: %s: reshape_position set but not supported\n", | ||
1958 | mdname(mddev)); | ||
1959 | goto out; | ||
1960 | } | ||
1961 | /* | ||
1962 | * copy the already verified devices into our private RAID1 | ||
1963 | * bookkeeping area. [whatever we allocate in run(), | ||
1964 | * should be freed in stop()] | ||
1965 | */ | ||
1966 | conf = kzalloc(sizeof(conf_t), GFP_KERNEL); | 1978 | conf = kzalloc(sizeof(conf_t), GFP_KERNEL); |
1967 | mddev->private = conf; | ||
1968 | if (!conf) | 1979 | if (!conf) |
1969 | goto out_no_mem; | 1980 | goto abort; |
1970 | 1981 | ||
1971 | conf->mirrors = kzalloc(sizeof(struct mirror_info)*mddev->raid_disks, | 1982 | conf->mirrors = kzalloc(sizeof(struct mirror_info)*mddev->raid_disks, |
1972 | GFP_KERNEL); | 1983 | GFP_KERNEL); |
1973 | if (!conf->mirrors) | 1984 | if (!conf->mirrors) |
1974 | goto out_no_mem; | 1985 | goto abort; |
1975 | 1986 | ||
1976 | conf->tmppage = alloc_page(GFP_KERNEL); | 1987 | conf->tmppage = alloc_page(GFP_KERNEL); |
1977 | if (!conf->tmppage) | 1988 | if (!conf->tmppage) |
1978 | goto out_no_mem; | 1989 | goto abort; |
1979 | 1990 | ||
1980 | conf->poolinfo = kmalloc(sizeof(*conf->poolinfo), GFP_KERNEL); | 1991 | conf->poolinfo = kzalloc(sizeof(*conf->poolinfo), GFP_KERNEL); |
1981 | if (!conf->poolinfo) | 1992 | if (!conf->poolinfo) |
1982 | goto out_no_mem; | 1993 | goto abort; |
1983 | conf->poolinfo->mddev = NULL; | ||
1984 | conf->poolinfo->raid_disks = mddev->raid_disks; | 1994 | conf->poolinfo->raid_disks = mddev->raid_disks; |
1985 | conf->r1bio_pool = mempool_create(NR_RAID1_BIOS, r1bio_pool_alloc, | 1995 | conf->r1bio_pool = mempool_create(NR_RAID1_BIOS, r1bio_pool_alloc, |
1986 | r1bio_pool_free, | 1996 | r1bio_pool_free, |
1987 | conf->poolinfo); | 1997 | conf->poolinfo); |
1988 | if (!conf->r1bio_pool) | 1998 | if (!conf->r1bio_pool) |
1989 | goto out_no_mem; | 1999 | goto abort; |
2000 | |||
1990 | conf->poolinfo->mddev = mddev; | 2001 | conf->poolinfo->mddev = mddev; |
1991 | 2002 | ||
1992 | spin_lock_init(&conf->device_lock); | 2003 | spin_lock_init(&conf->device_lock); |
1993 | mddev->queue->queue_lock = &conf->device_lock; | ||
1994 | |||
1995 | list_for_each_entry(rdev, &mddev->disks, same_set) { | 2004 | list_for_each_entry(rdev, &mddev->disks, same_set) { |
1996 | disk_idx = rdev->raid_disk; | 2005 | int disk_idx = rdev->raid_disk; |
1997 | if (disk_idx >= mddev->raid_disks | 2006 | if (disk_idx >= mddev->raid_disks |
1998 | || disk_idx < 0) | 2007 | || disk_idx < 0) |
1999 | continue; | 2008 | continue; |
2000 | disk = conf->mirrors + disk_idx; | 2009 | disk = conf->mirrors + disk_idx; |
2001 | 2010 | ||
2002 | disk->rdev = rdev; | 2011 | disk->rdev = rdev; |
2003 | disk_stack_limits(mddev->gendisk, rdev->bdev, | ||
2004 | rdev->data_offset << 9); | ||
2005 | /* as we don't honour merge_bvec_fn, we must never risk | ||
2006 | * violating it, so limit ->max_sector to one PAGE, as | ||
2007 | * a one page request is never in violation. | ||
2008 | */ | ||
2009 | if (rdev->bdev->bd_disk->queue->merge_bvec_fn && | ||
2010 | queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9)) | ||
2011 | blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); | ||
2012 | 2012 | ||
2013 | disk->head_position = 0; | 2013 | disk->head_position = 0; |
2014 | } | 2014 | } |
@@ -2022,8 +2022,7 @@ static int run(mddev_t *mddev) | |||
2022 | bio_list_init(&conf->pending_bio_list); | 2022 | bio_list_init(&conf->pending_bio_list); |
2023 | bio_list_init(&conf->flushing_bio_list); | 2023 | bio_list_init(&conf->flushing_bio_list); |
2024 | 2024 | ||
2025 | 2025 | conf->last_used = -1; | |
2026 | mddev->degraded = 0; | ||
2027 | for (i = 0; i < conf->raid_disks; i++) { | 2026 | for (i = 0; i < conf->raid_disks; i++) { |
2028 | 2027 | ||
2029 | disk = conf->mirrors + i; | 2028 | disk = conf->mirrors + i; |
@@ -2031,38 +2030,99 @@ static int run(mddev_t *mddev) | |||
2031 | if (!disk->rdev || | 2030 | if (!disk->rdev || |
2032 | !test_bit(In_sync, &disk->rdev->flags)) { | 2031 | !test_bit(In_sync, &disk->rdev->flags)) { |
2033 | disk->head_position = 0; | 2032 | disk->head_position = 0; |
2034 | mddev->degraded++; | ||
2035 | if (disk->rdev) | 2033 | if (disk->rdev) |
2036 | conf->fullsync = 1; | 2034 | conf->fullsync = 1; |
2037 | } | 2035 | } else if (conf->last_used < 0) |
2036 | /* | ||
2037 | * The first working device is used as a | ||
2038 | * starting point to read balancing. | ||
2039 | */ | ||
2040 | conf->last_used = i; | ||
2038 | } | 2041 | } |
2039 | if (mddev->degraded == conf->raid_disks) { | 2042 | |
2043 | err = -EIO; | ||
2044 | if (conf->last_used < 0) { | ||
2040 | printk(KERN_ERR "raid1: no operational mirrors for %s\n", | 2045 | printk(KERN_ERR "raid1: no operational mirrors for %s\n", |
2041 | mdname(mddev)); | 2046 | mdname(mddev)); |
2042 | goto out_free_conf; | 2047 | goto abort; |
2043 | } | 2048 | } |
2044 | if (conf->raid_disks - mddev->degraded == 1) | 2049 | err = -ENOMEM; |
2045 | mddev->recovery_cp = MaxSector; | 2050 | conf->thread = md_register_thread(raid1d, mddev, NULL); |
2051 | if (!conf->thread) { | ||
2052 | printk(KERN_ERR | ||
2053 | "raid1: couldn't allocate thread for %s\n", | ||
2054 | mdname(mddev)); | ||
2055 | goto abort; | ||
2056 | } | ||
2057 | |||
2058 | return conf; | ||
2059 | |||
2060 | abort: | ||
2061 | if (conf) { | ||
2062 | if (conf->r1bio_pool) | ||
2063 | mempool_destroy(conf->r1bio_pool); | ||
2064 | kfree(conf->mirrors); | ||
2065 | safe_put_page(conf->tmppage); | ||
2066 | kfree(conf->poolinfo); | ||
2067 | kfree(conf); | ||
2068 | } | ||
2069 | return ERR_PTR(err); | ||
2070 | } | ||
2046 | 2071 | ||
2072 | static int run(mddev_t *mddev) | ||
2073 | { | ||
2074 | conf_t *conf; | ||
2075 | int i; | ||
2076 | mdk_rdev_t *rdev; | ||
2077 | |||
2078 | if (mddev->level != 1) { | ||
2079 | printk("raid1: %s: raid level not set to mirroring (%d)\n", | ||
2080 | mdname(mddev), mddev->level); | ||
2081 | return -EIO; | ||
2082 | } | ||
2083 | if (mddev->reshape_position != MaxSector) { | ||
2084 | printk("raid1: %s: reshape_position set but not supported\n", | ||
2085 | mdname(mddev)); | ||
2086 | return -EIO; | ||
2087 | } | ||
2047 | /* | 2088 | /* |
2048 | * find the first working one and use it as a starting point | 2089 | * copy the already verified devices into our private RAID1 |
2049 | * to read balancing. | 2090 | * bookkeeping area. [whatever we allocate in run(), |
2091 | * should be freed in stop()] | ||
2050 | */ | 2092 | */ |
2051 | for (j = 0; j < conf->raid_disks && | 2093 | if (mddev->private == NULL) |
2052 | (!conf->mirrors[j].rdev || | 2094 | conf = setup_conf(mddev); |
2053 | !test_bit(In_sync, &conf->mirrors[j].rdev->flags)) ; j++) | 2095 | else |
2054 | /* nothing */; | 2096 | conf = mddev->private; |
2055 | conf->last_used = j; | ||
2056 | 2097 | ||
2098 | if (IS_ERR(conf)) | ||
2099 | return PTR_ERR(conf); | ||
2057 | 2100 | ||
2058 | mddev->thread = md_register_thread(raid1d, mddev, NULL); | 2101 | mddev->queue->queue_lock = &conf->device_lock; |
2059 | if (!mddev->thread) { | 2102 | list_for_each_entry(rdev, &mddev->disks, same_set) { |
2060 | printk(KERN_ERR | 2103 | disk_stack_limits(mddev->gendisk, rdev->bdev, |
2061 | "raid1: couldn't allocate thread for %s\n", | 2104 | rdev->data_offset << 9); |
2062 | mdname(mddev)); | 2105 | /* as we don't honour merge_bvec_fn, we must never risk |
2063 | goto out_free_conf; | 2106 | * violating it, so limit ->max_segments to 1 lying within |
2107 | * a single page, as a one page request is never in violation. | ||
2108 | */ | ||
2109 | if (rdev->bdev->bd_disk->queue->merge_bvec_fn) { | ||
2110 | blk_queue_max_segments(mddev->queue, 1); | ||
2111 | blk_queue_segment_boundary(mddev->queue, | ||
2112 | PAGE_CACHE_SIZE - 1); | ||
2113 | } | ||
2064 | } | 2114 | } |
2065 | 2115 | ||
2116 | mddev->degraded = 0; | ||
2117 | for (i=0; i < conf->raid_disks; i++) | ||
2118 | if (conf->mirrors[i].rdev == NULL || | ||
2119 | !test_bit(In_sync, &conf->mirrors[i].rdev->flags) || | ||
2120 | test_bit(Faulty, &conf->mirrors[i].rdev->flags)) | ||
2121 | mddev->degraded++; | ||
2122 | |||
2123 | if (conf->raid_disks - mddev->degraded == 1) | ||
2124 | mddev->recovery_cp = MaxSector; | ||
2125 | |||
2066 | if (mddev->recovery_cp != MaxSector) | 2126 | if (mddev->recovery_cp != MaxSector) |
2067 | printk(KERN_NOTICE "raid1: %s is not clean" | 2127 | printk(KERN_NOTICE "raid1: %s is not clean" |
2068 | " -- starting background reconstruction\n", | 2128 | " -- starting background reconstruction\n", |
@@ -2071,9 +2131,14 @@ static int run(mddev_t *mddev) | |||
2071 | "raid1: raid set %s active with %d out of %d mirrors\n", | 2131 | "raid1: raid set %s active with %d out of %d mirrors\n", |
2072 | mdname(mddev), mddev->raid_disks - mddev->degraded, | 2132 | mdname(mddev), mddev->raid_disks - mddev->degraded, |
2073 | mddev->raid_disks); | 2133 | mddev->raid_disks); |
2134 | |||
2074 | /* | 2135 | /* |
2075 | * Ok, everything is just fine now | 2136 | * Ok, everything is just fine now |
2076 | */ | 2137 | */ |
2138 | mddev->thread = conf->thread; | ||
2139 | conf->thread = NULL; | ||
2140 | mddev->private = conf; | ||
2141 | |||
2077 | md_set_array_sectors(mddev, raid1_size(mddev, 0, 0)); | 2142 | md_set_array_sectors(mddev, raid1_size(mddev, 0, 0)); |
2078 | 2143 | ||
2079 | mddev->queue->unplug_fn = raid1_unplug; | 2144 | mddev->queue->unplug_fn = raid1_unplug; |
@@ -2081,23 +2146,6 @@ static int run(mddev_t *mddev) | |||
2081 | mddev->queue->backing_dev_info.congested_data = mddev; | 2146 | mddev->queue->backing_dev_info.congested_data = mddev; |
2082 | md_integrity_register(mddev); | 2147 | md_integrity_register(mddev); |
2083 | return 0; | 2148 | return 0; |
2084 | |||
2085 | out_no_mem: | ||
2086 | printk(KERN_ERR "raid1: couldn't allocate memory for %s\n", | ||
2087 | mdname(mddev)); | ||
2088 | |||
2089 | out_free_conf: | ||
2090 | if (conf) { | ||
2091 | if (conf->r1bio_pool) | ||
2092 | mempool_destroy(conf->r1bio_pool); | ||
2093 | kfree(conf->mirrors); | ||
2094 | safe_put_page(conf->tmppage); | ||
2095 | kfree(conf->poolinfo); | ||
2096 | kfree(conf); | ||
2097 | mddev->private = NULL; | ||
2098 | } | ||
2099 | out: | ||
2100 | return -EIO; | ||
2101 | } | 2149 | } |
2102 | 2150 | ||
2103 | static int stop(mddev_t *mddev) | 2151 | static int stop(mddev_t *mddev) |
@@ -2271,6 +2319,9 @@ static void raid1_quiesce(mddev_t *mddev, int state) | |||
2271 | conf_t *conf = mddev->private; | 2319 | conf_t *conf = mddev->private; |
2272 | 2320 | ||
2273 | switch(state) { | 2321 | switch(state) { |
2322 | case 2: /* wake for suspend */ | ||
2323 | wake_up(&conf->wait_barrier); | ||
2324 | break; | ||
2274 | case 1: | 2325 | case 1: |
2275 | raise_barrier(conf); | 2326 | raise_barrier(conf); |
2276 | break; | 2327 | break; |
@@ -2280,6 +2331,23 @@ static void raid1_quiesce(mddev_t *mddev, int state) | |||
2280 | } | 2331 | } |
2281 | } | 2332 | } |
2282 | 2333 | ||
2334 | static void *raid1_takeover(mddev_t *mddev) | ||
2335 | { | ||
2336 | /* raid1 can take over: | ||
2337 | * raid5 with 2 devices, any layout or chunk size | ||
2338 | */ | ||
2339 | if (mddev->level == 5 && mddev->raid_disks == 2) { | ||
2340 | conf_t *conf; | ||
2341 | mddev->new_level = 1; | ||
2342 | mddev->new_layout = 0; | ||
2343 | mddev->new_chunk_sectors = 0; | ||
2344 | conf = setup_conf(mddev); | ||
2345 | if (!IS_ERR(conf)) | ||
2346 | conf->barrier = 1; | ||
2347 | return conf; | ||
2348 | } | ||
2349 | return ERR_PTR(-EINVAL); | ||
2350 | } | ||
2283 | 2351 | ||
2284 | static struct mdk_personality raid1_personality = | 2352 | static struct mdk_personality raid1_personality = |
2285 | { | 2353 | { |
@@ -2299,6 +2367,7 @@ static struct mdk_personality raid1_personality = | |||
2299 | .size = raid1_size, | 2367 | .size = raid1_size, |
2300 | .check_reshape = raid1_reshape, | 2368 | .check_reshape = raid1_reshape, |
2301 | .quiesce = raid1_quiesce, | 2369 | .quiesce = raid1_quiesce, |
2370 | .takeover = raid1_takeover, | ||
2302 | }; | 2371 | }; |
2303 | 2372 | ||
2304 | static int __init raid_init(void) | 2373 | static int __init raid_init(void) |
@@ -2314,6 +2383,7 @@ static void raid_exit(void) | |||
2314 | module_init(raid_init); | 2383 | module_init(raid_init); |
2315 | module_exit(raid_exit); | 2384 | module_exit(raid_exit); |
2316 | MODULE_LICENSE("GPL"); | 2385 | MODULE_LICENSE("GPL"); |
2386 | MODULE_DESCRIPTION("RAID1 (mirroring) personality for MD"); | ||
2317 | MODULE_ALIAS("md-personality-3"); /* RAID1 */ | 2387 | MODULE_ALIAS("md-personality-3"); /* RAID1 */ |
2318 | MODULE_ALIAS("md-raid1"); | 2388 | MODULE_ALIAS("md-raid1"); |
2319 | MODULE_ALIAS("md-level-1"); | 2389 | MODULE_ALIAS("md-level-1"); |
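Splitting run() into setup_conf() plus a thin run() is what makes raid1_takeover() possible: the constructor returns either a usable conf_t or an errno encoded with ERR_PTR(), and both callers decode it with IS_ERR()/PTR_ERR(). A minimal sketch of that convention with hypothetical names (demo_conf, demo_setup, demo_run):

#include <linux/err.h>
#include <linux/slab.h>

struct demo_conf {
	int nr_disks;
};

/* Constructor: a valid pointer on success, ERR_PTR(-errno) on failure. */
static struct demo_conf *demo_setup(int nr_disks)
{
	struct demo_conf *conf = kzalloc(sizeof(*conf), GFP_KERNEL);

	if (!conf)
		return ERR_PTR(-ENOMEM);
	conf->nr_disks = nr_disks;
	return conf;
}

/* Consumer: decode the error without a separate status out-parameter. */
static int demo_run(int nr_disks)
{
	struct demo_conf *conf = demo_setup(nr_disks);

	if (IS_ERR(conf))
		return PTR_ERR(conf);
	/* ... activate the array using conf ... */
	kfree(conf);
	return 0;
}

In the patch, raid1_takeover() reuses the same constructor and marks the new conf with a raised barrier so I/O stays held while the level change completes.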
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h index e87b84deff68..5f2d443ae28a 100644 --- a/drivers/md/raid1.h +++ b/drivers/md/raid1.h | |||
@@ -59,6 +59,11 @@ struct r1_private_data_s { | |||
59 | 59 | ||
60 | mempool_t *r1bio_pool; | 60 | mempool_t *r1bio_pool; |
61 | mempool_t *r1buf_pool; | 61 | mempool_t *r1buf_pool; |
62 | |||
63 | /* When taking over an array from a different personality, we store | ||
64 | * the new thread here until we fully activate the array. | ||
65 | */ | ||
66 | struct mdk_thread_s *thread; | ||
62 | }; | 67 | }; |
63 | 68 | ||
64 | typedef struct r1_private_data_s conf_t; | 69 | typedef struct r1_private_data_s conf_t; |
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index c2cb7b87b440..e2766d8251a1 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c | |||
@@ -18,6 +18,7 @@ | |||
18 | * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | 18 | * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. |
19 | */ | 19 | */ |
20 | 20 | ||
21 | #include <linux/slab.h> | ||
21 | #include <linux/delay.h> | 22 | #include <linux/delay.h> |
22 | #include <linux/blkdev.h> | 23 | #include <linux/blkdev.h> |
23 | #include <linux/seq_file.h> | 24 | #include <linux/seq_file.h> |
@@ -804,7 +805,7 @@ static int make_request(struct request_queue *q, struct bio * bio) | |||
804 | mdk_rdev_t *blocked_rdev; | 805 | mdk_rdev_t *blocked_rdev; |
805 | 806 | ||
806 | if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) { | 807 | if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) { |
807 | bio_endio(bio, -EOPNOTSUPP); | 808 | md_barrier_request(mddev, bio); |
808 | return 0; | 809 | return 0; |
809 | } | 810 | } |
810 | 811 | ||
@@ -1155,13 +1156,17 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) | |||
1155 | 1156 | ||
1156 | disk_stack_limits(mddev->gendisk, rdev->bdev, | 1157 | disk_stack_limits(mddev->gendisk, rdev->bdev, |
1157 | rdev->data_offset << 9); | 1158 | rdev->data_offset << 9); |
1158 | /* as we don't honour merge_bvec_fn, we must never risk | 1159 | /* as we don't honour merge_bvec_fn, we must |
1159 | * violating it, so limit ->max_sector to one PAGE, as | 1160 | * never risk violating it, so limit |
1160 | * a one page request is never in violation. | 1161 | * ->max_segments to one lying with a single |
1162 | * page, as a one page request is never in | ||
1163 | * violation. | ||
1161 | */ | 1164 | */ |
1162 | if (rdev->bdev->bd_disk->queue->merge_bvec_fn && | 1165 | if (rdev->bdev->bd_disk->queue->merge_bvec_fn) { |
1163 | queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9)) | 1166 | blk_queue_max_segments(mddev->queue, 1); |
1164 | blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); | 1167 | blk_queue_segment_boundary(mddev->queue, |
1168 | PAGE_CACHE_SIZE - 1); | ||
1169 | } | ||
1165 | 1170 | ||
1166 | p->head_position = 0; | 1171 | p->head_position = 0; |
1167 | rdev->raid_disk = mirror; | 1172 | rdev->raid_disk = mirror; |
@@ -1432,6 +1437,43 @@ static void recovery_request_write(mddev_t *mddev, r10bio_t *r10_bio) | |||
1432 | 1437 | ||
1433 | 1438 | ||
1434 | /* | 1439 | /* |
1440 | * Used by fix_read_error() to decay the per rdev read_errors. | ||
1441 | * We halve the read error count for every hour that has elapsed | ||
1442 | * since the last recorded read error. | ||
1443 | * | ||
1444 | */ | ||
1445 | static void check_decay_read_errors(mddev_t *mddev, mdk_rdev_t *rdev) | ||
1446 | { | ||
1447 | struct timespec cur_time_mon; | ||
1448 | unsigned long hours_since_last; | ||
1449 | unsigned int read_errors = atomic_read(&rdev->read_errors); | ||
1450 | |||
1451 | ktime_get_ts(&cur_time_mon); | ||
1452 | |||
1453 | if (rdev->last_read_error.tv_sec == 0 && | ||
1454 | rdev->last_read_error.tv_nsec == 0) { | ||
1455 | /* first time we've seen a read error */ | ||
1456 | rdev->last_read_error = cur_time_mon; | ||
1457 | return; | ||
1458 | } | ||
1459 | |||
1460 | hours_since_last = (cur_time_mon.tv_sec - | ||
1461 | rdev->last_read_error.tv_sec) / 3600; | ||
1462 | |||
1463 | rdev->last_read_error = cur_time_mon; | ||
1464 | |||
1465 | /* | ||
1466 | * if hours_since_last is > the number of bits in read_errors | ||
1467 | * just set read errors to 0. We do this to avoid | ||
1468 | * overflowing the shift of read_errors by hours_since_last. | ||
1469 | */ | ||
1470 | if (hours_since_last >= 8 * sizeof(read_errors)) | ||
1471 | atomic_set(&rdev->read_errors, 0); | ||
1472 | else | ||
1473 | atomic_set(&rdev->read_errors, read_errors >> hours_since_last); | ||
1474 | } | ||
1475 | |||
1476 | /* | ||
1435 | * This is a kernel thread which: | 1477 | * This is a kernel thread which: |
1436 | * | 1478 | * |
1437 | * 1. Retries failed read operations on working mirrors. | 1479 | * 1. Retries failed read operations on working mirrors. |
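check_decay_read_errors() implements the decay as a right shift: one halving per hour elapsed since the previous error, with a guard so a very old timestamp cannot shift further than the counter is wide. The arithmetic in isolation (illustrative helper, not part of the patch):

/* Halve 'errors' once per elapsed hour; clamp to zero when the shift
 * would be as wide as the counter itself. */
static unsigned int demo_decay_read_errors(unsigned int errors,
					   unsigned long hours_since_last)
{
	if (hours_since_last >= 8 * sizeof(errors))
		return 0;
	return errors >> hours_since_last;
}

For example, 40 accumulated errors whose last occurrence was three hours ago decay to 40 >> 3 = 5 before the new error is counted.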
@@ -1444,6 +1486,43 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) | |||
1444 | int sect = 0; /* Offset from r10_bio->sector */ | 1486 | int sect = 0; /* Offset from r10_bio->sector */ |
1445 | int sectors = r10_bio->sectors; | 1487 | int sectors = r10_bio->sectors; |
1446 | mdk_rdev_t*rdev; | 1488 | mdk_rdev_t*rdev; |
1489 | int max_read_errors = atomic_read(&mddev->max_corr_read_errors); | ||
1490 | |||
1491 | rcu_read_lock(); | ||
1492 | { | ||
1493 | int d = r10_bio->devs[r10_bio->read_slot].devnum; | ||
1494 | char b[BDEVNAME_SIZE]; | ||
1495 | int cur_read_error_count = 0; | ||
1496 | |||
1497 | rdev = rcu_dereference(conf->mirrors[d].rdev); | ||
1498 | bdevname(rdev->bdev, b); | ||
1499 | |||
1500 | if (test_bit(Faulty, &rdev->flags)) { | ||
1501 | rcu_read_unlock(); | ||
1502 | /* drive has already been failed, just ignore any | ||
1503 | more fix_read_error() attempts */ | ||
1504 | return; | ||
1505 | } | ||
1506 | |||
1507 | check_decay_read_errors(mddev, rdev); | ||
1508 | atomic_inc(&rdev->read_errors); | ||
1509 | cur_read_error_count = atomic_read(&rdev->read_errors); | ||
1510 | if (cur_read_error_count > max_read_errors) { | ||
1511 | rcu_read_unlock(); | ||
1512 | printk(KERN_NOTICE | ||
1513 | "raid10: %s: Raid device exceeded " | ||
1514 | "read_error threshold " | ||
1515 | "[cur %d:max %d]\n", | ||
1516 | b, cur_read_error_count, max_read_errors); | ||
1517 | printk(KERN_NOTICE | ||
1518 | "raid10: %s: Failing raid " | ||
1519 | "device\n", b); | ||
1520 | md_error(mddev, conf->mirrors[d].rdev); | ||
1521 | return; | ||
1522 | } | ||
1523 | } | ||
1524 | rcu_read_unlock(); | ||
1525 | |||
1447 | while(sectors) { | 1526 | while(sectors) { |
1448 | int s = sectors; | 1527 | int s = sectors; |
1449 | int sl = r10_bio->read_slot; | 1528 | int sl = r10_bio->read_slot; |
@@ -1488,6 +1567,7 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) | |||
1488 | /* write it back and re-read */ | 1567 | /* write it back and re-read */ |
1489 | rcu_read_lock(); | 1568 | rcu_read_lock(); |
1490 | while (sl != r10_bio->read_slot) { | 1569 | while (sl != r10_bio->read_slot) { |
1570 | char b[BDEVNAME_SIZE]; | ||
1491 | int d; | 1571 | int d; |
1492 | if (sl==0) | 1572 | if (sl==0) |
1493 | sl = conf->copies; | 1573 | sl = conf->copies; |
@@ -1503,9 +1583,21 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) | |||
1503 | r10_bio->devs[sl].addr + | 1583 | r10_bio->devs[sl].addr + |
1504 | sect + rdev->data_offset, | 1584 | sect + rdev->data_offset, |
1505 | s<<9, conf->tmppage, WRITE) | 1585 | s<<9, conf->tmppage, WRITE) |
1506 | == 0) | 1586 | == 0) { |
1507 | /* Well, this device is dead */ | 1587 | /* Well, this device is dead */ |
1588 | printk(KERN_NOTICE | ||
1589 | "raid10:%s: read correction " | ||
1590 | "write failed" | ||
1591 | " (%d sectors at %llu on %s)\n", | ||
1592 | mdname(mddev), s, | ||
1593 | (unsigned long long)(sect+ | ||
1594 | rdev->data_offset), | ||
1595 | bdevname(rdev->bdev, b)); | ||
1596 | printk(KERN_NOTICE "raid10:%s: failing " | ||
1597 | "drive\n", | ||
1598 | bdevname(rdev->bdev, b)); | ||
1508 | md_error(mddev, rdev); | 1599 | md_error(mddev, rdev); |
1600 | } | ||
1509 | rdev_dec_pending(rdev, mddev); | 1601 | rdev_dec_pending(rdev, mddev); |
1510 | rcu_read_lock(); | 1602 | rcu_read_lock(); |
1511 | } | 1603 | } |
@@ -1526,10 +1618,22 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) | |||
1526 | if (sync_page_io(rdev->bdev, | 1618 | if (sync_page_io(rdev->bdev, |
1527 | r10_bio->devs[sl].addr + | 1619 | r10_bio->devs[sl].addr + |
1528 | sect + rdev->data_offset, | 1620 | sect + rdev->data_offset, |
1529 | s<<9, conf->tmppage, READ) == 0) | 1621 | s<<9, conf->tmppage, |
1622 | READ) == 0) { | ||
1530 | /* Well, this device is dead */ | 1623 | /* Well, this device is dead */ |
1624 | printk(KERN_NOTICE | ||
1625 | "raid10:%s: unable to read back " | ||
1626 | "corrected sectors" | ||
1627 | " (%d sectors at %llu on %s)\n", | ||
1628 | mdname(mddev), s, | ||
1629 | (unsigned long long)(sect+ | ||
1630 | rdev->data_offset), | ||
1631 | bdevname(rdev->bdev, b)); | ||
1632 | printk(KERN_NOTICE "raid10:%s: failing drive\n", | ||
1633 | bdevname(rdev->bdev, b)); | ||
1634 | |||
1531 | md_error(mddev, rdev); | 1635 | md_error(mddev, rdev); |
1532 | else | 1636 | } else { |
1533 | printk(KERN_INFO | 1637 | printk(KERN_INFO |
1534 | "raid10:%s: read error corrected" | 1638 | "raid10:%s: read error corrected" |
1535 | " (%d sectors at %llu on %s)\n", | 1639 | " (%d sectors at %llu on %s)\n", |
@@ -1537,6 +1641,7 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) | |||
1537 | (unsigned long long)(sect+ | 1641 | (unsigned long long)(sect+ |
1538 | rdev->data_offset), | 1642 | rdev->data_offset), |
1539 | bdevname(rdev->bdev, b)); | 1643 | bdevname(rdev->bdev, b)); |
1644 | } | ||
1540 | 1645 | ||
1541 | rdev_dec_pending(rdev, mddev); | 1646 | rdev_dec_pending(rdev, mddev); |
1542 | rcu_read_lock(); | 1647 | rcu_read_lock(); |
@@ -2155,12 +2260,14 @@ static int run(mddev_t *mddev) | |||
2155 | disk_stack_limits(mddev->gendisk, rdev->bdev, | 2260 | disk_stack_limits(mddev->gendisk, rdev->bdev, |
2156 | rdev->data_offset << 9); | 2261 | rdev->data_offset << 9); |
2157 | /* as we don't honour merge_bvec_fn, we must never risk | 2262 | /* as we don't honour merge_bvec_fn, we must never risk |
2158 | * violating it, so limit ->max_sector to one PAGE, as | 2263 | * violating it, so limit max_segments to 1 lying |
2159 | * a one page request is never in violation. | 2264 | * within a single page. |
2160 | */ | 2265 | */ |
2161 | if (rdev->bdev->bd_disk->queue->merge_bvec_fn && | 2266 | if (rdev->bdev->bd_disk->queue->merge_bvec_fn) { |
2162 | queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9)) | 2267 | blk_queue_max_segments(mddev->queue, 1); |
2163 | blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); | 2268 | blk_queue_segment_boundary(mddev->queue, |
2269 | PAGE_CACHE_SIZE - 1); | ||
2270 | } | ||
2164 | 2271 | ||
2165 | disk->head_position = 0; | 2272 | disk->head_position = 0; |
2166 | } | 2273 | } |
@@ -2275,13 +2382,6 @@ static void raid10_quiesce(mddev_t *mddev, int state) | |||
2275 | lower_barrier(conf); | 2382 | lower_barrier(conf); |
2276 | break; | 2383 | break; |
2277 | } | 2384 | } |
2278 | if (mddev->thread) { | ||
2279 | if (mddev->bitmap) | ||
2280 | mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ; | ||
2281 | else | ||
2282 | mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT; | ||
2283 | md_wakeup_thread(mddev->thread); | ||
2284 | } | ||
2285 | } | 2385 | } |
2286 | 2386 | ||
2287 | static struct mdk_personality raid10_personality = | 2387 | static struct mdk_personality raid10_personality = |
@@ -2315,6 +2415,7 @@ static void raid_exit(void) | |||
2315 | module_init(raid_init); | 2415 | module_init(raid_init); |
2316 | module_exit(raid_exit); | 2416 | module_exit(raid_exit); |
2317 | MODULE_LICENSE("GPL"); | 2417 | MODULE_LICENSE("GPL"); |
2418 | MODULE_DESCRIPTION("RAID10 (striped mirror) personality for MD"); | ||
2318 | MODULE_ALIAS("md-personality-9"); /* RAID10 */ | 2419 | MODULE_ALIAS("md-personality-9"); /* RAID10 */ |
2319 | MODULE_ALIAS("md-raid10"); | 2420 | MODULE_ALIAS("md-raid10"); |
2320 | MODULE_ALIAS("md-level-10"); | 2421 | MODULE_ALIAS("md-level-10"); |
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index d29215d966da..15348c393b5d 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c | |||
@@ -50,6 +50,7 @@ | |||
50 | #include <linux/async.h> | 50 | #include <linux/async.h> |
51 | #include <linux/seq_file.h> | 51 | #include <linux/seq_file.h> |
52 | #include <linux/cpu.h> | 52 | #include <linux/cpu.h> |
53 | #include <linux/slab.h> | ||
53 | #include "md.h" | 54 | #include "md.h" |
54 | #include "raid5.h" | 55 | #include "raid5.h" |
55 | #include "bitmap.h" | 56 | #include "bitmap.h" |
@@ -1526,7 +1527,7 @@ static void raid5_end_read_request(struct bio * bi, int error) | |||
1526 | 1527 | ||
1527 | clear_bit(R5_UPTODATE, &sh->dev[i].flags); | 1528 | clear_bit(R5_UPTODATE, &sh->dev[i].flags); |
1528 | atomic_inc(&rdev->read_errors); | 1529 | atomic_inc(&rdev->read_errors); |
1529 | if (conf->mddev->degraded) | 1530 | if (conf->mddev->degraded >= conf->max_degraded) |
1530 | printk_rl(KERN_WARNING | 1531 | printk_rl(KERN_WARNING |
1531 | "raid5:%s: read error not correctable " | 1532 | "raid5:%s: read error not correctable " |
1532 | "(sector %llu on %s).\n", | 1533 | "(sector %llu on %s).\n", |
@@ -1649,8 +1650,8 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector, | |||
1649 | int previous, int *dd_idx, | 1650 | int previous, int *dd_idx, |
1650 | struct stripe_head *sh) | 1651 | struct stripe_head *sh) |
1651 | { | 1652 | { |
1652 | long stripe; | 1653 | sector_t stripe, stripe2; |
1653 | unsigned long chunk_number; | 1654 | sector_t chunk_number; |
1654 | unsigned int chunk_offset; | 1655 | unsigned int chunk_offset; |
1655 | int pd_idx, qd_idx; | 1656 | int pd_idx, qd_idx; |
1656 | int ddf_layout = 0; | 1657 | int ddf_layout = 0; |
@@ -1670,18 +1671,13 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector, | |||
1670 | */ | 1671 | */ |
1671 | chunk_offset = sector_div(r_sector, sectors_per_chunk); | 1672 | chunk_offset = sector_div(r_sector, sectors_per_chunk); |
1672 | chunk_number = r_sector; | 1673 | chunk_number = r_sector; |
1673 | BUG_ON(r_sector != chunk_number); | ||
1674 | 1674 | ||
1675 | /* | 1675 | /* |
1676 | * Compute the stripe number | 1676 | * Compute the stripe number |
1677 | */ | 1677 | */ |
1678 | stripe = chunk_number / data_disks; | 1678 | stripe = chunk_number; |
1679 | 1679 | *dd_idx = sector_div(stripe, data_disks); | |
1680 | /* | 1680 | stripe2 = stripe; |
1681 | * Compute the data disk and parity disk indexes inside the stripe | ||
1682 | */ | ||
1683 | *dd_idx = chunk_number % data_disks; | ||
1684 | |||
1685 | /* | 1681 | /* |
1686 | * Select the parity disk based on the user selected algorithm. | 1682 | * Select the parity disk based on the user selected algorithm. |
1687 | */ | 1683 | */ |
@@ -1693,21 +1689,21 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector, | |||
1693 | case 5: | 1689 | case 5: |
1694 | switch (algorithm) { | 1690 | switch (algorithm) { |
1695 | case ALGORITHM_LEFT_ASYMMETRIC: | 1691 | case ALGORITHM_LEFT_ASYMMETRIC: |
1696 | pd_idx = data_disks - stripe % raid_disks; | 1692 | pd_idx = data_disks - sector_div(stripe2, raid_disks); |
1697 | if (*dd_idx >= pd_idx) | 1693 | if (*dd_idx >= pd_idx) |
1698 | (*dd_idx)++; | 1694 | (*dd_idx)++; |
1699 | break; | 1695 | break; |
1700 | case ALGORITHM_RIGHT_ASYMMETRIC: | 1696 | case ALGORITHM_RIGHT_ASYMMETRIC: |
1701 | pd_idx = stripe % raid_disks; | 1697 | pd_idx = sector_div(stripe2, raid_disks); |
1702 | if (*dd_idx >= pd_idx) | 1698 | if (*dd_idx >= pd_idx) |
1703 | (*dd_idx)++; | 1699 | (*dd_idx)++; |
1704 | break; | 1700 | break; |
1705 | case ALGORITHM_LEFT_SYMMETRIC: | 1701 | case ALGORITHM_LEFT_SYMMETRIC: |
1706 | pd_idx = data_disks - stripe % raid_disks; | 1702 | pd_idx = data_disks - sector_div(stripe2, raid_disks); |
1707 | *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; | 1703 | *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; |
1708 | break; | 1704 | break; |
1709 | case ALGORITHM_RIGHT_SYMMETRIC: | 1705 | case ALGORITHM_RIGHT_SYMMETRIC: |
1710 | pd_idx = stripe % raid_disks; | 1706 | pd_idx = sector_div(stripe2, raid_disks); |
1711 | *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; | 1707 | *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; |
1712 | break; | 1708 | break; |
1713 | case ALGORITHM_PARITY_0: | 1709 | case ALGORITHM_PARITY_0: |
@@ -1727,7 +1723,7 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector, | |||
1727 | 1723 | ||
1728 | switch (algorithm) { | 1724 | switch (algorithm) { |
1729 | case ALGORITHM_LEFT_ASYMMETRIC: | 1725 | case ALGORITHM_LEFT_ASYMMETRIC: |
1730 | pd_idx = raid_disks - 1 - (stripe % raid_disks); | 1726 | pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); |
1731 | qd_idx = pd_idx + 1; | 1727 | qd_idx = pd_idx + 1; |
1732 | if (pd_idx == raid_disks-1) { | 1728 | if (pd_idx == raid_disks-1) { |
1733 | (*dd_idx)++; /* Q D D D P */ | 1729 | (*dd_idx)++; /* Q D D D P */ |
@@ -1736,7 +1732,7 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector, | |||
1736 | (*dd_idx) += 2; /* D D P Q D */ | 1732 | (*dd_idx) += 2; /* D D P Q D */ |
1737 | break; | 1733 | break; |
1738 | case ALGORITHM_RIGHT_ASYMMETRIC: | 1734 | case ALGORITHM_RIGHT_ASYMMETRIC: |
1739 | pd_idx = stripe % raid_disks; | 1735 | pd_idx = sector_div(stripe2, raid_disks); |
1740 | qd_idx = pd_idx + 1; | 1736 | qd_idx = pd_idx + 1; |
1741 | if (pd_idx == raid_disks-1) { | 1737 | if (pd_idx == raid_disks-1) { |
1742 | (*dd_idx)++; /* Q D D D P */ | 1738 | (*dd_idx)++; /* Q D D D P */ |
@@ -1745,12 +1741,12 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector, | |||
1745 | (*dd_idx) += 2; /* D D P Q D */ | 1741 | (*dd_idx) += 2; /* D D P Q D */ |
1746 | break; | 1742 | break; |
1747 | case ALGORITHM_LEFT_SYMMETRIC: | 1743 | case ALGORITHM_LEFT_SYMMETRIC: |
1748 | pd_idx = raid_disks - 1 - (stripe % raid_disks); | 1744 | pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); |
1749 | qd_idx = (pd_idx + 1) % raid_disks; | 1745 | qd_idx = (pd_idx + 1) % raid_disks; |
1750 | *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks; | 1746 | *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks; |
1751 | break; | 1747 | break; |
1752 | case ALGORITHM_RIGHT_SYMMETRIC: | 1748 | case ALGORITHM_RIGHT_SYMMETRIC: |
1753 | pd_idx = stripe % raid_disks; | 1749 | pd_idx = sector_div(stripe2, raid_disks); |
1754 | qd_idx = (pd_idx + 1) % raid_disks; | 1750 | qd_idx = (pd_idx + 1) % raid_disks; |
1755 | *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks; | 1751 | *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks; |
1756 | break; | 1752 | break; |
@@ -1769,7 +1765,7 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector, | |||
1769 | /* Exactly the same as RIGHT_ASYMMETRIC, but or | 1765 | /* Exactly the same as RIGHT_ASYMMETRIC, but or |
1770 | * of blocks for computing Q is different. | 1766 | * of blocks for computing Q is different. |
1771 | */ | 1767 | */ |
1772 | pd_idx = stripe % raid_disks; | 1768 | pd_idx = sector_div(stripe2, raid_disks); |
1773 | qd_idx = pd_idx + 1; | 1769 | qd_idx = pd_idx + 1; |
1774 | if (pd_idx == raid_disks-1) { | 1770 | if (pd_idx == raid_disks-1) { |
1775 | (*dd_idx)++; /* Q D D D P */ | 1771 | (*dd_idx)++; /* Q D D D P */ |
@@ -1784,7 +1780,8 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector, | |||
1784 | * D D D P Q rather than | 1780 | * D D D P Q rather than |
1785 | * Q D D D P | 1781 | * Q D D D P |
1786 | */ | 1782 | */ |
1787 | pd_idx = raid_disks - 1 - ((stripe + 1) % raid_disks); | 1783 | stripe2 += 1; |
1784 | pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); | ||
1788 | qd_idx = pd_idx + 1; | 1785 | qd_idx = pd_idx + 1; |
1789 | if (pd_idx == raid_disks-1) { | 1786 | if (pd_idx == raid_disks-1) { |
1790 | (*dd_idx)++; /* Q D D D P */ | 1787 | (*dd_idx)++; /* Q D D D P */ |
@@ -1796,7 +1793,7 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector, | |||
1796 | 1793 | ||
1797 | case ALGORITHM_ROTATING_N_CONTINUE: | 1794 | case ALGORITHM_ROTATING_N_CONTINUE: |
1798 | /* Same as left_symmetric but Q is before P */ | 1795 | /* Same as left_symmetric but Q is before P */ |
1799 | pd_idx = raid_disks - 1 - (stripe % raid_disks); | 1796 | pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); |
1800 | qd_idx = (pd_idx + raid_disks - 1) % raid_disks; | 1797 | qd_idx = (pd_idx + raid_disks - 1) % raid_disks; |
1801 | *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; | 1798 | *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; |
1802 | ddf_layout = 1; | 1799 | ddf_layout = 1; |
@@ -1804,27 +1801,27 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector, | |||
1804 | 1801 | ||
1805 | case ALGORITHM_LEFT_ASYMMETRIC_6: | 1802 | case ALGORITHM_LEFT_ASYMMETRIC_6: |
1806 | /* RAID5 left_asymmetric, with Q on last device */ | 1803 | /* RAID5 left_asymmetric, with Q on last device */ |
1807 | pd_idx = data_disks - stripe % (raid_disks-1); | 1804 | pd_idx = data_disks - sector_div(stripe2, raid_disks-1); |
1808 | if (*dd_idx >= pd_idx) | 1805 | if (*dd_idx >= pd_idx) |
1809 | (*dd_idx)++; | 1806 | (*dd_idx)++; |
1810 | qd_idx = raid_disks - 1; | 1807 | qd_idx = raid_disks - 1; |
1811 | break; | 1808 | break; |
1812 | 1809 | ||
1813 | case ALGORITHM_RIGHT_ASYMMETRIC_6: | 1810 | case ALGORITHM_RIGHT_ASYMMETRIC_6: |
1814 | pd_idx = stripe % (raid_disks-1); | 1811 | pd_idx = sector_div(stripe2, raid_disks-1); |
1815 | if (*dd_idx >= pd_idx) | 1812 | if (*dd_idx >= pd_idx) |
1816 | (*dd_idx)++; | 1813 | (*dd_idx)++; |
1817 | qd_idx = raid_disks - 1; | 1814 | qd_idx = raid_disks - 1; |
1818 | break; | 1815 | break; |
1819 | 1816 | ||
1820 | case ALGORITHM_LEFT_SYMMETRIC_6: | 1817 | case ALGORITHM_LEFT_SYMMETRIC_6: |
1821 | pd_idx = data_disks - stripe % (raid_disks-1); | 1818 | pd_idx = data_disks - sector_div(stripe2, raid_disks-1); |
1822 | *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1); | 1819 | *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1); |
1823 | qd_idx = raid_disks - 1; | 1820 | qd_idx = raid_disks - 1; |
1824 | break; | 1821 | break; |
1825 | 1822 | ||
1826 | case ALGORITHM_RIGHT_SYMMETRIC_6: | 1823 | case ALGORITHM_RIGHT_SYMMETRIC_6: |
1827 | pd_idx = stripe % (raid_disks-1); | 1824 | pd_idx = sector_div(stripe2, raid_disks-1); |
1828 | *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1); | 1825 | *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1); |
1829 | qd_idx = raid_disks - 1; | 1826 | qd_idx = raid_disks - 1; |
1830 | break; | 1827 | break; |
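The layout math above replaces '%' and '/' on long with sector_div() on sector_t: the wider type removes the truncation that the old BUG_ON() guarded against, and sector_div() gives 64-bit division without the libgcc helpers that a plain '%' on a 64-bit value would need on 32-bit hosts. A sketch of the contract the new code relies on, with illustrative names:

#include <linux/kernel.h>	/* sector_div() */
#include <linux/types.h>	/* sector_t, u32 */

/* sector_div(x, base) divides x in place and returns the remainder. */
static void demo_split_chunk_number(sector_t chunk_number,
				    unsigned int data_disks,
				    sector_t *stripe_out, u32 *dd_idx_out)
{
	sector_t stripe = chunk_number;

	*dd_idx_out = sector_div(stripe, data_disks);	/* remainder */
	*stripe_out = stripe;				/* quotient  */
}

This mirrors the new '*dd_idx = sector_div(stripe, data_disks)' line; the per-algorithm cases apply the same call to stripe2 in place of the old 'stripe % raid_disks'.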
@@ -1869,14 +1866,14 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous) | |||
1869 | : conf->algorithm; | 1866 | : conf->algorithm; |
1870 | sector_t stripe; | 1867 | sector_t stripe; |
1871 | int chunk_offset; | 1868 | int chunk_offset; |
1872 | int chunk_number, dummy1, dd_idx = i; | 1869 | sector_t chunk_number; |
1870 | int dummy1, dd_idx = i; | ||
1873 | sector_t r_sector; | 1871 | sector_t r_sector; |
1874 | struct stripe_head sh2; | 1872 | struct stripe_head sh2; |
1875 | 1873 | ||
1876 | 1874 | ||
1877 | chunk_offset = sector_div(new_sector, sectors_per_chunk); | 1875 | chunk_offset = sector_div(new_sector, sectors_per_chunk); |
1878 | stripe = new_sector; | 1876 | stripe = new_sector; |
1879 | BUG_ON(new_sector != stripe); | ||
1880 | 1877 | ||
1881 | if (i == sh->pd_idx) | 1878 | if (i == sh->pd_idx) |
1882 | return 0; | 1879 | return 0; |
@@ -1969,7 +1966,7 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous) | |||
1969 | } | 1966 | } |
1970 | 1967 | ||
1971 | chunk_number = stripe * data_disks + i; | 1968 | chunk_number = stripe * data_disks + i; |
1972 | r_sector = (sector_t)chunk_number * sectors_per_chunk + chunk_offset; | 1969 | r_sector = chunk_number * sectors_per_chunk + chunk_offset; |
1973 | 1970 | ||
1974 | check = raid5_compute_sector(conf, r_sector, | 1971 | check = raid5_compute_sector(conf, r_sector, |
1975 | previous, &dummy1, &sh2); | 1972 | previous, &dummy1, &sh2); |
@@ -2947,6 +2944,7 @@ static void handle_stripe5(struct stripe_head *sh) | |||
2947 | struct r5dev *dev; | 2944 | struct r5dev *dev; |
2948 | mdk_rdev_t *blocked_rdev = NULL; | 2945 | mdk_rdev_t *blocked_rdev = NULL; |
2949 | int prexor; | 2946 | int prexor; |
2947 | int dec_preread_active = 0; | ||
2950 | 2948 | ||
2951 | memset(&s, 0, sizeof(s)); | 2949 | memset(&s, 0, sizeof(s)); |
2952 | pr_debug("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d check:%d " | 2950 | pr_debug("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d check:%d " |
@@ -3096,12 +3094,8 @@ static void handle_stripe5(struct stripe_head *sh) | |||
3096 | set_bit(STRIPE_INSYNC, &sh->state); | 3094 | set_bit(STRIPE_INSYNC, &sh->state); |
3097 | } | 3095 | } |
3098 | } | 3096 | } |
3099 | if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { | 3097 | if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) |
3100 | atomic_dec(&conf->preread_active_stripes); | 3098 | dec_preread_active = 1; |
3101 | if (atomic_read(&conf->preread_active_stripes) < | ||
3102 | IO_THRESHOLD) | ||
3103 | md_wakeup_thread(conf->mddev->thread); | ||
3104 | } | ||
3105 | } | 3099 | } |
3106 | 3100 | ||
3107 | /* Now to consider new write requests and what else, if anything | 3101 | /* Now to consider new write requests and what else, if anything |
@@ -3208,6 +3202,16 @@ static void handle_stripe5(struct stripe_head *sh) | |||
3208 | 3202 | ||
3209 | ops_run_io(sh, &s); | 3203 | ops_run_io(sh, &s); |
3210 | 3204 | ||
3205 | if (dec_preread_active) { | ||
3206 | /* We delay this until after ops_run_io so that if make_request | ||
3207 | * is waiting on a barrier, it won't continue until the writes | ||
3208 | * have actually been submitted. | ||
3209 | */ | ||
3210 | atomic_dec(&conf->preread_active_stripes); | ||
3211 | if (atomic_read(&conf->preread_active_stripes) < | ||
3212 | IO_THRESHOLD) | ||
3213 | md_wakeup_thread(conf->mddev->thread); | ||
3214 | } | ||
3211 | return_io(return_bi); | 3215 | return_io(return_bi); |
3212 | } | 3216 | } |
3213 | 3217 | ||
@@ -3221,6 +3225,7 @@ static void handle_stripe6(struct stripe_head *sh) | |||
3221 | struct r6_state r6s; | 3225 | struct r6_state r6s; |
3222 | struct r5dev *dev, *pdev, *qdev; | 3226 | struct r5dev *dev, *pdev, *qdev; |
3223 | mdk_rdev_t *blocked_rdev = NULL; | 3227 | mdk_rdev_t *blocked_rdev = NULL; |
3228 | int dec_preread_active = 0; | ||
3224 | 3229 | ||
3225 | pr_debug("handling stripe %llu, state=%#lx cnt=%d, " | 3230 | pr_debug("handling stripe %llu, state=%#lx cnt=%d, " |
3226 | "pd_idx=%d, qd_idx=%d\n, check:%d, reconstruct:%d\n", | 3231 | "pd_idx=%d, qd_idx=%d\n, check:%d, reconstruct:%d\n", |
@@ -3358,7 +3363,6 @@ static void handle_stripe6(struct stripe_head *sh) | |||
3358 | * completed | 3363 | * completed |
3359 | */ | 3364 | */ |
3360 | if (sh->reconstruct_state == reconstruct_state_drain_result) { | 3365 | if (sh->reconstruct_state == reconstruct_state_drain_result) { |
3361 | int qd_idx = sh->qd_idx; | ||
3362 | 3366 | ||
3363 | sh->reconstruct_state = reconstruct_state_idle; | 3367 | sh->reconstruct_state = reconstruct_state_idle; |
3364 | /* All the 'written' buffers and the parity blocks are ready to | 3368 | /* All the 'written' buffers and the parity blocks are ready to |
@@ -3380,12 +3384,8 @@ static void handle_stripe6(struct stripe_head *sh) | |||
3380 | set_bit(STRIPE_INSYNC, &sh->state); | 3384 | set_bit(STRIPE_INSYNC, &sh->state); |
3381 | } | 3385 | } |
3382 | } | 3386 | } |
3383 | if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { | 3387 | if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) |
3384 | atomic_dec(&conf->preread_active_stripes); | 3388 | dec_preread_active = 1; |
3385 | if (atomic_read(&conf->preread_active_stripes) < | ||
3386 | IO_THRESHOLD) | ||
3387 | md_wakeup_thread(conf->mddev->thread); | ||
3388 | } | ||
3389 | } | 3389 | } |
3390 | 3390 | ||
3391 | /* Now to consider new write requests and what else, if anything | 3391 | /* Now to consider new write requests and what else, if anything |
@@ -3494,6 +3494,18 @@ static void handle_stripe6(struct stripe_head *sh) | |||
3494 | 3494 | ||
3495 | ops_run_io(sh, &s); | 3495 | ops_run_io(sh, &s); |
3496 | 3496 | ||
3497 | |||
3498 | if (dec_preread_active) { | ||
3499 | /* We delay this until after ops_run_io so that if make_request | ||
3500 | * is waiting on a barrier, it won't continue until the writes | ||
3501 | * have actually been submitted. | ||
3502 | */ | ||
3503 | atomic_dec(&conf->preread_active_stripes); | ||
3504 | if (atomic_read(&conf->preread_active_stripes) < | ||
3505 | IO_THRESHOLD) | ||
3506 | md_wakeup_thread(conf->mddev->thread); | ||
3507 | } | ||
3508 | |||
3497 | return_io(return_bi); | 3509 | return_io(return_bi); |
3498 | } | 3510 | } |
3499 | 3511 | ||
@@ -3724,7 +3736,7 @@ static int bio_fits_rdev(struct bio *bi) | |||
3724 | if ((bi->bi_size>>9) > queue_max_sectors(q)) | 3736 | if ((bi->bi_size>>9) > queue_max_sectors(q)) |
3725 | return 0; | 3737 | return 0; |
3726 | blk_recount_segments(q, bi); | 3738 | blk_recount_segments(q, bi); |
3727 | if (bi->bi_phys_segments > queue_max_phys_segments(q)) | 3739 | if (bi->bi_phys_segments > queue_max_segments(q)) |
3728 | return 0; | 3740 | return 0; |
3729 | 3741 | ||
3730 | if (q->merge_bvec_fn) | 3742 | if (q->merge_bvec_fn) |
@@ -3741,7 +3753,7 @@ static int chunk_aligned_read(struct request_queue *q, struct bio * raid_bio) | |||
3741 | { | 3753 | { |
3742 | mddev_t *mddev = q->queuedata; | 3754 | mddev_t *mddev = q->queuedata; |
3743 | raid5_conf_t *conf = mddev->private; | 3755 | raid5_conf_t *conf = mddev->private; |
3744 | unsigned int dd_idx; | 3756 | int dd_idx; |
3745 | struct bio* align_bi; | 3757 | struct bio* align_bi; |
3746 | mdk_rdev_t *rdev; | 3758 | mdk_rdev_t *rdev; |
3747 | 3759 | ||
@@ -3866,7 +3878,13 @@ static int make_request(struct request_queue *q, struct bio * bi) | |||
3866 | int cpu, remaining; | 3878 | int cpu, remaining; |
3867 | 3879 | ||
3868 | if (unlikely(bio_rw_flagged(bi, BIO_RW_BARRIER))) { | 3880 | if (unlikely(bio_rw_flagged(bi, BIO_RW_BARRIER))) { |
3869 | bio_endio(bi, -EOPNOTSUPP); | 3881 | /* Drain all pending writes. We only really need |
3882 | * to ensure they have been submitted, but this is | ||
3883 | * easier. | ||
3884 | */ | ||
3885 | mddev->pers->quiesce(mddev, 1); | ||
3886 | mddev->pers->quiesce(mddev, 0); | ||
3887 | md_barrier_request(mddev, bi); | ||
3870 | return 0; | 3888 | return 0; |
3871 | } | 3889 | } |
3872 | 3890 | ||
@@ -3990,6 +4008,9 @@ static int make_request(struct request_queue *q, struct bio * bi) | |||
3990 | finish_wait(&conf->wait_for_overlap, &w); | 4008 | finish_wait(&conf->wait_for_overlap, &w); |
3991 | set_bit(STRIPE_HANDLE, &sh->state); | 4009 | set_bit(STRIPE_HANDLE, &sh->state); |
3992 | clear_bit(STRIPE_DELAYED, &sh->state); | 4010 | clear_bit(STRIPE_DELAYED, &sh->state); |
4011 | if (mddev->barrier && | ||
4012 | !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) | ||
4013 | atomic_inc(&conf->preread_active_stripes); | ||
3993 | release_stripe(sh); | 4014 | release_stripe(sh); |
3994 | } else { | 4015 | } else { |
3995 | /* cannot get stripe for read-ahead, just give-up */ | 4016 | /* cannot get stripe for read-ahead, just give-up */ |
@@ -4009,6 +4030,14 @@ static int make_request(struct request_queue *q, struct bio * bi) | |||
4009 | 4030 | ||
4010 | bio_endio(bi, 0); | 4031 | bio_endio(bi, 0); |
4011 | } | 4032 | } |
4033 | |||
4034 | if (mddev->barrier) { | ||
4035 | /* We need to wait for the stripes to all be handled. | ||
4036 | * So: wait for preread_active_stripes to drop to 0. | ||
4037 | */ | ||
4038 | wait_event(mddev->thread->wqueue, | ||
4039 | atomic_read(&conf->preread_active_stripes) == 0); | ||
4040 | } | ||
4012 | return 0; | 4041 | return 0; |
4013 | } | 4042 | } |
4014 | 4043 | ||
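Deferring the preread_active_stripes decrement until after ops_run_io(), together with the new wait at the end of make_request(), is what makes the barrier ordering hold: a barrier submitter cannot proceed until every stripe it scheduled has actually issued its writes. Reduced to its shape, with hypothetical names and a private wait queue standing in for the md thread's wqueue and md_wakeup_thread():

#include <linux/wait.h>
#include <asm/atomic.h>

static atomic_t demo_active_stripes = ATOMIC_INIT(0);
static DECLARE_WAIT_QUEUE_HEAD(demo_drain_wq);

/* stripe-handler side: called only after the stripe's I/O is issued */
static void demo_stripe_io_issued(void)
{
	if (atomic_dec_and_test(&demo_active_stripes))
		wake_up(&demo_drain_wq);
}

/* submitter side: block until every counted stripe has issued its I/O */
static void demo_wait_for_barrier_drain(void)
{
	wait_event(demo_drain_wq,
		   atomic_read(&demo_active_stripes) == 0);
}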
@@ -4648,7 +4677,7 @@ static int raid5_alloc_percpu(raid5_conf_t *conf) | |||
4648 | { | 4677 | { |
4649 | unsigned long cpu; | 4678 | unsigned long cpu; |
4650 | struct page *spare_page; | 4679 | struct page *spare_page; |
4651 | struct raid5_percpu *allcpus; | 4680 | struct raid5_percpu __percpu *allcpus; |
4652 | void *scribble; | 4681 | void *scribble; |
4653 | int err; | 4682 | int err; |
4654 | 4683 | ||
@@ -5104,9 +5133,8 @@ static int stop(mddev_t *mddev) | |||
5104 | mddev->thread = NULL; | 5133 | mddev->thread = NULL; |
5105 | mddev->queue->backing_dev_info.congested_fn = NULL; | 5134 | mddev->queue->backing_dev_info.congested_fn = NULL; |
5106 | blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ | 5135 | blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ |
5107 | sysfs_remove_group(&mddev->kobj, &raid5_attrs_group); | ||
5108 | free_conf(conf); | 5136 | free_conf(conf); |
5109 | mddev->private = NULL; | 5137 | mddev->private = &raid5_attrs_group; |
5110 | return 0; | 5138 | return 0; |
5111 | } | 5139 | } |
5112 | 5140 | ||
@@ -5432,11 +5460,11 @@ static int raid5_start_reshape(mddev_t *mddev) | |||
5432 | !test_bit(Faulty, &rdev->flags)) { | 5460 | !test_bit(Faulty, &rdev->flags)) { |
5433 | if (raid5_add_disk(mddev, rdev) == 0) { | 5461 | if (raid5_add_disk(mddev, rdev) == 0) { |
5434 | char nm[20]; | 5462 | char nm[20]; |
5435 | if (rdev->raid_disk >= conf->previous_raid_disks) | 5463 | if (rdev->raid_disk >= conf->previous_raid_disks) { |
5436 | set_bit(In_sync, &rdev->flags); | 5464 | set_bit(In_sync, &rdev->flags); |
5437 | else | 5465 | added_devices++; |
5466 | } else | ||
5438 | rdev->recovery_offset = 0; | 5467 | rdev->recovery_offset = 0; |
5439 | added_devices++; | ||
5440 | sprintf(nm, "rd%d", rdev->raid_disk); | 5468 | sprintf(nm, "rd%d", rdev->raid_disk); |
5441 | if (sysfs_create_link(&mddev->kobj, | 5469 | if (sysfs_create_link(&mddev->kobj, |
5442 | &rdev->kobj, nm)) | 5470 | &rdev->kobj, nm)) |
@@ -5448,9 +5476,12 @@ static int raid5_start_reshape(mddev_t *mddev) | |||
5448 | break; | 5476 | break; |
5449 | } | 5477 | } |
5450 | 5478 | ||
5479 | /* When a reshape changes the number of devices, ->degraded | ||
5480 | * is measured against the larger of the pre and post number of | ||
5481 | * devices.*/ | ||
5451 | if (mddev->delta_disks > 0) { | 5482 | if (mddev->delta_disks > 0) { |
5452 | spin_lock_irqsave(&conf->device_lock, flags); | 5483 | spin_lock_irqsave(&conf->device_lock, flags); |
5453 | mddev->degraded = (conf->raid_disks - conf->previous_raid_disks) | 5484 | mddev->degraded += (conf->raid_disks - conf->previous_raid_disks) |
5454 | - added_devices; | 5485 | - added_devices; |
5455 | spin_unlock_irqrestore(&conf->device_lock, flags); | 5486 | spin_unlock_irqrestore(&conf->device_lock, flags); |
5456 | } | 5487 | } |
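A worked instance of the new accounting, with illustrative numbers: growing from previous_raid_disks = 4 to raid_disks = 6 while only one spare could be added gives (6 - 4) - 1 = 1, so ->degraded grows by one on top of any already-failed devices instead of being overwritten, matching the comment that it is measured against the larger device count:

/* Illustration only: mirrors 'mddev->degraded +=
 * (raid_disks - previous_raid_disks) - added_devices'. */
static int demo_degraded_after_grow(int degraded_before, int prev_disks,
				    int new_disks, int added_devices)
{
	return degraded_before + (new_disks - prev_disks) - added_devices;
}

demo_degraded_after_grow(0, 4, 6, 1) == 1: one of the two new slots is still missing a device.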
@@ -5860,6 +5891,7 @@ static void raid5_exit(void) | |||
5860 | module_init(raid5_init); | 5891 | module_init(raid5_init); |
5861 | module_exit(raid5_exit); | 5892 | module_exit(raid5_exit); |
5862 | MODULE_LICENSE("GPL"); | 5893 | MODULE_LICENSE("GPL"); |
5894 | MODULE_DESCRIPTION("RAID4/5/6 (striping with parity) personality for MD"); | ||
5863 | MODULE_ALIAS("md-personality-4"); /* RAID5 */ | 5895 | MODULE_ALIAS("md-personality-4"); /* RAID5 */ |
5864 | MODULE_ALIAS("md-raid5"); | 5896 | MODULE_ALIAS("md-raid5"); |
5865 | MODULE_ALIAS("md-raid4"); | 5897 | MODULE_ALIAS("md-raid4"); |
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index dd708359b451..0f86f5e36724 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h | |||
@@ -405,7 +405,7 @@ struct raid5_private_data { | |||
405 | * lists and performing address | 405 | * lists and performing address |
406 | * conversions | 406 | * conversions |
407 | */ | 407 | */ |
408 | } *percpu; | 408 | } __percpu *percpu; |
409 | size_t scribble_len; /* size of scribble region must be | 409 | size_t scribble_len; /* size of scribble region must be |
410 | * associated with conf to handle | 410 | * associated with conf to handle |
411 | * cpu hotplug while reshaping | 411 | * cpu hotplug while reshaping |
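The __percpu marker added here, and to allcpus in raid5_alloc_percpu() above, is a sparse annotation: pointers returned by alloc_percpu() live in a separate address space and must only be dereferenced through accessors such as per_cpu_ptr(). A minimal, self-contained sketch of the idiom with hypothetical names:

#include <linux/percpu.h>
#include <linux/errno.h>

struct demo_percpu_state {
	unsigned long counter;
};

static struct demo_percpu_state __percpu *demo_state;

static int demo_percpu_init(void)
{
	int cpu;

	demo_state = alloc_percpu(struct demo_percpu_state);
	if (!demo_state)
		return -ENOMEM;
	/* touch each CPU's copy through the accessor, never directly */
	for_each_possible_cpu(cpu)
		per_cpu_ptr(demo_state, cpu)->counter = 0;
	return 0;
}

static void demo_percpu_exit(void)
{
	free_percpu(demo_state);
}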
diff --git a/drivers/md/raid6algos.c b/drivers/md/raid6algos.c index 866215ac7f25..1f8784bfd44d 100644 --- a/drivers/md/raid6algos.c +++ b/drivers/md/raid6algos.c | |||
@@ -17,6 +17,7 @@ | |||
17 | */ | 17 | */ |
18 | 18 | ||
19 | #include <linux/raid/pq.h> | 19 | #include <linux/raid/pq.h> |
20 | #include <linux/gfp.h> | ||
20 | #ifndef __KERNEL__ | 21 | #ifndef __KERNEL__ |
21 | #include <sys/mman.h> | 22 | #include <sys/mman.h> |
22 | #include <stdio.h> | 23 | #include <stdio.h> |
@@ -31,25 +32,6 @@ EXPORT_SYMBOL(raid6_empty_zero_page); | |||
31 | struct raid6_calls raid6_call; | 32 | struct raid6_calls raid6_call; |
32 | EXPORT_SYMBOL_GPL(raid6_call); | 33 | EXPORT_SYMBOL_GPL(raid6_call); |
33 | 34 | ||
34 | /* Various routine sets */ | ||
35 | extern const struct raid6_calls raid6_intx1; | ||
36 | extern const struct raid6_calls raid6_intx2; | ||
37 | extern const struct raid6_calls raid6_intx4; | ||
38 | extern const struct raid6_calls raid6_intx8; | ||
39 | extern const struct raid6_calls raid6_intx16; | ||
40 | extern const struct raid6_calls raid6_intx32; | ||
41 | extern const struct raid6_calls raid6_mmxx1; | ||
42 | extern const struct raid6_calls raid6_mmxx2; | ||
43 | extern const struct raid6_calls raid6_sse1x1; | ||
44 | extern const struct raid6_calls raid6_sse1x2; | ||
45 | extern const struct raid6_calls raid6_sse2x1; | ||
46 | extern const struct raid6_calls raid6_sse2x2; | ||
47 | extern const struct raid6_calls raid6_sse2x4; | ||
48 | extern const struct raid6_calls raid6_altivec1; | ||
49 | extern const struct raid6_calls raid6_altivec2; | ||
50 | extern const struct raid6_calls raid6_altivec4; | ||
51 | extern const struct raid6_calls raid6_altivec8; | ||
52 | |||
53 | const struct raid6_calls * const raid6_algos[] = { | 35 | const struct raid6_calls * const raid6_algos[] = { |
54 | &raid6_intx1, | 36 | &raid6_intx1, |
55 | &raid6_intx2, | 37 | &raid6_intx2, |
@@ -169,3 +151,4 @@ static void raid6_exit(void) | |||
169 | subsys_initcall(raid6_select_algo); | 151 | subsys_initcall(raid6_select_algo); |
170 | module_exit(raid6_exit); | 152 | module_exit(raid6_exit); |
171 | MODULE_LICENSE("GPL"); | 153 | MODULE_LICENSE("GPL"); |
154 | MODULE_DESCRIPTION("RAID6 Q-syndrome calculations"); | ||