path: root/drivers/md
author    Andrea Bastoni <bastoni@cs.unc.edu>  2010-05-30 19:16:45 -0400
committer Andrea Bastoni <bastoni@cs.unc.edu>  2010-05-30 19:16:45 -0400
commit    ada47b5fe13d89735805b566185f4885f5a3f750 (patch)
tree      644b88f8a71896307d71438e9b3af49126ffb22b /drivers/md
parent    43e98717ad40a4ae64545b5ba047c7b86aa44f4f (diff)
parent    3280f21d43ee541f97f8cda5792150d2dbec20d5 (diff)
Merge branch 'wip-2.6.34' into old-private-master (archived-private-master)
Diffstat (limited to 'drivers/md')
-rw-r--r--  drivers/md/Kconfig | 9
-rw-r--r--  drivers/md/bitmap.c | 449
-rw-r--r--  drivers/md/bitmap.h | 19
-rw-r--r--  drivers/md/dm-crypt.c | 212
-rw-r--r--  drivers/md/dm-delay.c | 8
-rw-r--r--  drivers/md/dm-exception-store.c | 33
-rw-r--r--  drivers/md/dm-exception-store.h | 62
-rw-r--r--  drivers/md/dm-io.c | 120
-rw-r--r--  drivers/md/dm-ioctl.c | 145
-rw-r--r--  drivers/md/dm-kcopyd.c | 5
-rw-r--r--  drivers/md/dm-linear.c | 3
-rw-r--r--  drivers/md/dm-log-userspace-base.c | 1
-rw-r--r--  drivers/md/dm-log-userspace-transfer.c | 11
-rw-r--r--  drivers/md/dm-log.c | 80
-rw-r--r--  drivers/md/dm-mpath.c | 178
-rw-r--r--  drivers/md/dm-raid1.c | 232
-rw-r--r--  drivers/md/dm-region-hash.c | 37
-rw-r--r--  drivers/md/dm-service-time.c | 2
-rw-r--r--  drivers/md/dm-snap-persistent.c | 197
-rw-r--r--  drivers/md/dm-snap-transient.c | 24
-rw-r--r--  drivers/md/dm-snap.c | 1291
-rw-r--r--  drivers/md/dm-stripe.c | 5
-rw-r--r--  drivers/md/dm-sysfs.c | 4
-rw-r--r--  drivers/md/dm-table.c | 41
-rw-r--r--  drivers/md/dm-target.c | 1
-rw-r--r--  drivers/md/dm-uevent.c | 16
-rw-r--r--  drivers/md/dm.c | 673
-rw-r--r--  drivers/md/dm.h | 17
-rw-r--r--  drivers/md/faulty.c | 2
-rw-r--r--  drivers/md/linear.c | 16
-rw-r--r--  drivers/md/md.c | 498
-rw-r--r--  drivers/md/md.h | 51
-rw-r--r--  drivers/md/multipath.c | 24
-rw-r--r--  drivers/md/raid0.c | 19
-rw-r--r--  drivers/md/raid1.c | 236
-rw-r--r--  drivers/md/raid1.h | 5
-rw-r--r--  drivers/md/raid10.c | 145
-rw-r--r--  drivers/md/raid5.c | 136
-rw-r--r--  drivers/md/raid5.h | 2
-rw-r--r--  drivers/md/raid6algos.c | 21
40 files changed, 3608 insertions, 1422 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 2158377a1359..acb3a4e404ff 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -185,11 +185,10 @@ config MD_MULTIPATH
185 tristate "Multipath I/O support" 185 tristate "Multipath I/O support"
186 depends on BLK_DEV_MD 186 depends on BLK_DEV_MD
187 help 187 help
188 Multipath-IO is the ability of certain devices to address the same 188 MD_MULTIPATH provides a simple multi-path personality for use
189 physical disk over multiple 'IO paths'. The code ensures that such 189 the MD framework. It is not under active development. New
190 paths can be defined and handled at runtime, and ensures that a 190 projects should consider using DM_MULTIPATH which has more
191 transparent failover to the backup path(s) happens if a IO errors 191 features and more testing.
192 arrives on the primary path.
193 192
194 If unsure, say N. 193 If unsure, say N.
195 194
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 60e2b322db11..26ac8aad0b19 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -212,7 +212,7 @@ static void bitmap_checkfree(struct bitmap *bitmap, unsigned long page)
212 */ 212 */
213 213
214/* IO operations when bitmap is stored near all superblocks */ 214/* IO operations when bitmap is stored near all superblocks */
215static struct page *read_sb_page(mddev_t *mddev, long offset, 215static struct page *read_sb_page(mddev_t *mddev, loff_t offset,
216 struct page *page, 216 struct page *page,
217 unsigned long index, int size) 217 unsigned long index, int size)
218{ 218{
@@ -287,27 +287,36 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
287 287
288 while ((rdev = next_active_rdev(rdev, mddev)) != NULL) { 288 while ((rdev = next_active_rdev(rdev, mddev)) != NULL) {
289 int size = PAGE_SIZE; 289 int size = PAGE_SIZE;
290 loff_t offset = mddev->bitmap_info.offset;
290 if (page->index == bitmap->file_pages-1) 291 if (page->index == bitmap->file_pages-1)
291 size = roundup(bitmap->last_page_size, 292 size = roundup(bitmap->last_page_size,
292 bdev_logical_block_size(rdev->bdev)); 293 bdev_logical_block_size(rdev->bdev));
293 /* Just make sure we aren't corrupting data or 294 /* Just make sure we aren't corrupting data or
294 * metadata 295 * metadata
295 */ 296 */
296 if (bitmap->offset < 0) { 297 if (mddev->external) {
298 /* Bitmap could be anywhere. */
299 if (rdev->sb_start + offset + (page->index *(PAGE_SIZE/512)) >
300 rdev->data_offset &&
301 rdev->sb_start + offset <
302 rdev->data_offset + mddev->dev_sectors +
303 (PAGE_SIZE/512))
304 goto bad_alignment;
305 } else if (offset < 0) {
297 /* DATA BITMAP METADATA */ 306 /* DATA BITMAP METADATA */
298 if (bitmap->offset 307 if (offset
299 + (long)(page->index * (PAGE_SIZE/512)) 308 + (long)(page->index * (PAGE_SIZE/512))
300 + size/512 > 0) 309 + size/512 > 0)
301 /* bitmap runs in to metadata */ 310 /* bitmap runs in to metadata */
302 goto bad_alignment; 311 goto bad_alignment;
303 if (rdev->data_offset + mddev->dev_sectors 312 if (rdev->data_offset + mddev->dev_sectors
304 > rdev->sb_start + bitmap->offset) 313 > rdev->sb_start + offset)
305 /* data runs in to bitmap */ 314 /* data runs in to bitmap */
306 goto bad_alignment; 315 goto bad_alignment;
307 } else if (rdev->sb_start < rdev->data_offset) { 316 } else if (rdev->sb_start < rdev->data_offset) {
308 /* METADATA BITMAP DATA */ 317 /* METADATA BITMAP DATA */
309 if (rdev->sb_start 318 if (rdev->sb_start
310 + bitmap->offset 319 + offset
311 + page->index*(PAGE_SIZE/512) + size/512 320 + page->index*(PAGE_SIZE/512) + size/512
312 > rdev->data_offset) 321 > rdev->data_offset)
313 /* bitmap runs in to data */ 322 /* bitmap runs in to data */
@@ -316,7 +325,7 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
316 /* DATA METADATA BITMAP - no problems */ 325 /* DATA METADATA BITMAP - no problems */
317 } 326 }
318 md_super_write(mddev, rdev, 327 md_super_write(mddev, rdev,
319 rdev->sb_start + bitmap->offset 328 rdev->sb_start + offset
320 + page->index * (PAGE_SIZE/512), 329 + page->index * (PAGE_SIZE/512),
321 size, 330 size,
322 page); 331 page);
@@ -488,6 +497,8 @@ void bitmap_update_sb(struct bitmap *bitmap)
488 497
489 if (!bitmap || !bitmap->mddev) /* no bitmap for this array */ 498 if (!bitmap || !bitmap->mddev) /* no bitmap for this array */
490 return; 499 return;
500 if (bitmap->mddev->bitmap_info.external)
501 return;
491 spin_lock_irqsave(&bitmap->lock, flags); 502 spin_lock_irqsave(&bitmap->lock, flags);
492 if (!bitmap->sb_page) { /* no superblock */ 503 if (!bitmap->sb_page) { /* no superblock */
493 spin_unlock_irqrestore(&bitmap->lock, flags); 504 spin_unlock_irqrestore(&bitmap->lock, flags);
@@ -501,6 +512,9 @@ void bitmap_update_sb(struct bitmap *bitmap)
501 bitmap->events_cleared = bitmap->mddev->events; 512 bitmap->events_cleared = bitmap->mddev->events;
502 sb->events_cleared = cpu_to_le64(bitmap->events_cleared); 513 sb->events_cleared = cpu_to_le64(bitmap->events_cleared);
503 } 514 }
515 /* Just in case these have been changed via sysfs: */
516 sb->daemon_sleep = cpu_to_le32(bitmap->mddev->bitmap_info.daemon_sleep/HZ);
517 sb->write_behind = cpu_to_le32(bitmap->mddev->bitmap_info.max_write_behind);
504 kunmap_atomic(sb, KM_USER0); 518 kunmap_atomic(sb, KM_USER0);
505 write_page(bitmap, bitmap->sb_page, 1); 519 write_page(bitmap, bitmap->sb_page, 1);
506} 520}
@@ -550,7 +564,8 @@ static int bitmap_read_sb(struct bitmap *bitmap)
550 564
551 bitmap->sb_page = read_page(bitmap->file, 0, bitmap, bytes); 565 bitmap->sb_page = read_page(bitmap->file, 0, bitmap, bytes);
552 } else { 566 } else {
553 bitmap->sb_page = read_sb_page(bitmap->mddev, bitmap->offset, 567 bitmap->sb_page = read_sb_page(bitmap->mddev,
568 bitmap->mddev->bitmap_info.offset,
554 NULL, 569 NULL,
555 0, sizeof(bitmap_super_t)); 570 0, sizeof(bitmap_super_t));
556 } 571 }
@@ -563,7 +578,7 @@ static int bitmap_read_sb(struct bitmap *bitmap)
563 sb = (bitmap_super_t *)kmap_atomic(bitmap->sb_page, KM_USER0); 578 sb = (bitmap_super_t *)kmap_atomic(bitmap->sb_page, KM_USER0);
564 579
565 chunksize = le32_to_cpu(sb->chunksize); 580 chunksize = le32_to_cpu(sb->chunksize);
566 daemon_sleep = le32_to_cpu(sb->daemon_sleep); 581 daemon_sleep = le32_to_cpu(sb->daemon_sleep) * HZ;
567 write_behind = le32_to_cpu(sb->write_behind); 582 write_behind = le32_to_cpu(sb->write_behind);
568 583
569 /* verify that the bitmap-specific fields are valid */ 584 /* verify that the bitmap-specific fields are valid */
@@ -576,7 +591,7 @@ static int bitmap_read_sb(struct bitmap *bitmap)
576 reason = "bitmap chunksize too small"; 591 reason = "bitmap chunksize too small";
577 else if ((1 << ffz(~chunksize)) != chunksize) 592 else if ((1 << ffz(~chunksize)) != chunksize)
578 reason = "bitmap chunksize not a power of 2"; 593 reason = "bitmap chunksize not a power of 2";
579 else if (daemon_sleep < 1 || daemon_sleep > MAX_SCHEDULE_TIMEOUT / HZ) 594 else if (daemon_sleep < 1 || daemon_sleep > MAX_SCHEDULE_TIMEOUT)
580 reason = "daemon sleep period out of range"; 595 reason = "daemon sleep period out of range";
581 else if (write_behind > COUNTER_MAX) 596 else if (write_behind > COUNTER_MAX)
582 reason = "write-behind limit out of range (0 - 16383)"; 597 reason = "write-behind limit out of range (0 - 16383)";
@@ -610,10 +625,9 @@ static int bitmap_read_sb(struct bitmap *bitmap)
610 } 625 }
611success: 626success:
612 /* assign fields using values from superblock */ 627 /* assign fields using values from superblock */
613 bitmap->chunksize = chunksize; 628 bitmap->mddev->bitmap_info.chunksize = chunksize;
614 bitmap->daemon_sleep = daemon_sleep; 629 bitmap->mddev->bitmap_info.daemon_sleep = daemon_sleep;
615 bitmap->daemon_lastrun = jiffies; 630 bitmap->mddev->bitmap_info.max_write_behind = write_behind;
616 bitmap->max_write_behind = write_behind;
617 bitmap->flags |= le32_to_cpu(sb->state); 631 bitmap->flags |= le32_to_cpu(sb->state);
618 if (le32_to_cpu(sb->version) == BITMAP_MAJOR_HOSTENDIAN) 632 if (le32_to_cpu(sb->version) == BITMAP_MAJOR_HOSTENDIAN)
619 bitmap->flags |= BITMAP_HOSTENDIAN; 633 bitmap->flags |= BITMAP_HOSTENDIAN;
@@ -664,16 +678,26 @@ static int bitmap_mask_state(struct bitmap *bitmap, enum bitmap_state bits,
664 * general bitmap file operations 678 * general bitmap file operations
665 */ 679 */
666 680
681/*
682 * on-disk bitmap:
683 *
684 * Use one bit per "chunk" (block set). We do the disk I/O on the bitmap
685 * file a page at a time. There's a superblock at the start of the file.
686 */
667/* calculate the index of the page that contains this bit */ 687/* calculate the index of the page that contains this bit */
668static inline unsigned long file_page_index(unsigned long chunk) 688static inline unsigned long file_page_index(struct bitmap *bitmap, unsigned long chunk)
669{ 689{
670 return CHUNK_BIT_OFFSET(chunk) >> PAGE_BIT_SHIFT; 690 if (!bitmap->mddev->bitmap_info.external)
691 chunk += sizeof(bitmap_super_t) << 3;
692 return chunk >> PAGE_BIT_SHIFT;
671} 693}
672 694
673/* calculate the (bit) offset of this bit within a page */ 695/* calculate the (bit) offset of this bit within a page */
674static inline unsigned long file_page_offset(unsigned long chunk) 696static inline unsigned long file_page_offset(struct bitmap *bitmap, unsigned long chunk)
675{ 697{
676 return CHUNK_BIT_OFFSET(chunk) & (PAGE_BITS - 1); 698 if (!bitmap->mddev->bitmap_info.external)
699 chunk += sizeof(bitmap_super_t) << 3;
700 return chunk & (PAGE_BITS - 1);
677} 701}
678 702
679/* 703/*
@@ -686,8 +710,9 @@ static inline unsigned long file_page_offset(unsigned long chunk)
686static inline struct page *filemap_get_page(struct bitmap *bitmap, 710static inline struct page *filemap_get_page(struct bitmap *bitmap,
687 unsigned long chunk) 711 unsigned long chunk)
688{ 712{
689 if (file_page_index(chunk) >= bitmap->file_pages) return NULL; 713 if (file_page_index(bitmap, chunk) >= bitmap->file_pages) return NULL;
690 return bitmap->filemap[file_page_index(chunk) - file_page_index(0)]; 714 return bitmap->filemap[file_page_index(bitmap, chunk)
715 - file_page_index(bitmap, 0)];
691} 716}
692 717
693 718
@@ -710,7 +735,7 @@ static void bitmap_file_unmap(struct bitmap *bitmap)
710 spin_unlock_irqrestore(&bitmap->lock, flags); 735 spin_unlock_irqrestore(&bitmap->lock, flags);
711 736
712 while (pages--) 737 while (pages--)
713 if (map[pages]->index != 0) /* 0 is sb_page, release it below */ 738 if (map[pages] != sb_page) /* 0 is sb_page, release it below */
714 free_buffers(map[pages]); 739 free_buffers(map[pages]);
715 kfree(map); 740 kfree(map);
716 kfree(attr); 741 kfree(attr);
@@ -821,7 +846,7 @@ static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block)
821 846
822 page = filemap_get_page(bitmap, chunk); 847 page = filemap_get_page(bitmap, chunk);
823 if (!page) return; 848 if (!page) return;
824 bit = file_page_offset(chunk); 849 bit = file_page_offset(bitmap, chunk);
825 850
826 /* set the bit */ 851 /* set the bit */
827 kaddr = kmap_atomic(page, KM_USER0); 852 kaddr = kmap_atomic(page, KM_USER0);
@@ -907,7 +932,7 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
907 chunks = bitmap->chunks; 932 chunks = bitmap->chunks;
908 file = bitmap->file; 933 file = bitmap->file;
909 934
910 BUG_ON(!file && !bitmap->offset); 935 BUG_ON(!file && !bitmap->mddev->bitmap_info.offset);
911 936
912#ifdef INJECT_FAULTS_3 937#ifdef INJECT_FAULTS_3
913 outofdate = 1; 938 outofdate = 1;
@@ -919,14 +944,17 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
919 "recovery\n", bmname(bitmap)); 944 "recovery\n", bmname(bitmap));
920 945
921 bytes = (chunks + 7) / 8; 946 bytes = (chunks + 7) / 8;
947 if (!bitmap->mddev->bitmap_info.external)
948 bytes += sizeof(bitmap_super_t);
922 949
923 num_pages = (bytes + sizeof(bitmap_super_t) + PAGE_SIZE - 1) / PAGE_SIZE; 950
951 num_pages = (bytes + PAGE_SIZE - 1) / PAGE_SIZE;
924 952
925 if (file && i_size_read(file->f_mapping->host) < bytes + sizeof(bitmap_super_t)) { 953 if (file && i_size_read(file->f_mapping->host) < bytes) {
926 printk(KERN_INFO "%s: bitmap file too short %lu < %lu\n", 954 printk(KERN_INFO "%s: bitmap file too short %lu < %lu\n",
927 bmname(bitmap), 955 bmname(bitmap),
928 (unsigned long) i_size_read(file->f_mapping->host), 956 (unsigned long) i_size_read(file->f_mapping->host),
929 bytes + sizeof(bitmap_super_t)); 957 bytes);
930 goto err; 958 goto err;
931 } 959 }
932 960
@@ -947,17 +975,16 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
947 975
948 for (i = 0; i < chunks; i++) { 976 for (i = 0; i < chunks; i++) {
949 int b; 977 int b;
950 index = file_page_index(i); 978 index = file_page_index(bitmap, i);
951 bit = file_page_offset(i); 979 bit = file_page_offset(bitmap, i);
952 if (index != oldindex) { /* this is a new page, read it in */ 980 if (index != oldindex) { /* this is a new page, read it in */
953 int count; 981 int count;
954 /* unmap the old page, we're done with it */ 982 /* unmap the old page, we're done with it */
955 if (index == num_pages-1) 983 if (index == num_pages-1)
956 count = bytes + sizeof(bitmap_super_t) 984 count = bytes - index * PAGE_SIZE;
957 - index * PAGE_SIZE;
958 else 985 else
959 count = PAGE_SIZE; 986 count = PAGE_SIZE;
960 if (index == 0) { 987 if (index == 0 && bitmap->sb_page) {
961 /* 988 /*
962 * if we're here then the superblock page 989 * if we're here then the superblock page
963 * contains some bits (PAGE_SIZE != sizeof sb) 990 * contains some bits (PAGE_SIZE != sizeof sb)
@@ -967,14 +994,15 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
967 offset = sizeof(bitmap_super_t); 994 offset = sizeof(bitmap_super_t);
968 if (!file) 995 if (!file)
969 read_sb_page(bitmap->mddev, 996 read_sb_page(bitmap->mddev,
970 bitmap->offset, 997 bitmap->mddev->bitmap_info.offset,
971 page, 998 page,
972 index, count); 999 index, count);
973 } else if (file) { 1000 } else if (file) {
974 page = read_page(file, index, bitmap, count); 1001 page = read_page(file, index, bitmap, count);
975 offset = 0; 1002 offset = 0;
976 } else { 1003 } else {
977 page = read_sb_page(bitmap->mddev, bitmap->offset, 1004 page = read_sb_page(bitmap->mddev,
1005 bitmap->mddev->bitmap_info.offset,
978 NULL, 1006 NULL,
979 index, count); 1007 index, count);
980 offset = 0; 1008 offset = 0;
@@ -1078,23 +1106,32 @@ static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap,
1078 * out to disk 1106 * out to disk
1079 */ 1107 */
1080 1108
1081void bitmap_daemon_work(struct bitmap *bitmap) 1109void bitmap_daemon_work(mddev_t *mddev)
1082{ 1110{
1111 struct bitmap *bitmap;
1083 unsigned long j; 1112 unsigned long j;
1084 unsigned long flags; 1113 unsigned long flags;
1085 struct page *page = NULL, *lastpage = NULL; 1114 struct page *page = NULL, *lastpage = NULL;
1086 int blocks; 1115 int blocks;
1087 void *paddr; 1116 void *paddr;
1088 1117
1089 if (bitmap == NULL) 1118 /* Use a mutex to guard daemon_work against
1119 * bitmap_destroy.
1120 */
1121 mutex_lock(&mddev->bitmap_info.mutex);
1122 bitmap = mddev->bitmap;
1123 if (bitmap == NULL) {
1124 mutex_unlock(&mddev->bitmap_info.mutex);
1090 return; 1125 return;
1091 if (time_before(jiffies, bitmap->daemon_lastrun + bitmap->daemon_sleep*HZ)) 1126 }
1127 if (time_before(jiffies, bitmap->daemon_lastrun
1128 + bitmap->mddev->bitmap_info.daemon_sleep))
1092 goto done; 1129 goto done;
1093 1130
1094 bitmap->daemon_lastrun = jiffies; 1131 bitmap->daemon_lastrun = jiffies;
1095 if (bitmap->allclean) { 1132 if (bitmap->allclean) {
1096 bitmap->mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT; 1133 bitmap->mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT;
1097 return; 1134 goto done;
1098 } 1135 }
1099 bitmap->allclean = 1; 1136 bitmap->allclean = 1;
1100 1137
@@ -1142,7 +1179,8 @@ void bitmap_daemon_work(struct bitmap *bitmap)
1142 /* We are possibly going to clear some bits, so make 1179 /* We are possibly going to clear some bits, so make
1143 * sure that events_cleared is up-to-date. 1180 * sure that events_cleared is up-to-date.
1144 */ 1181 */
1145 if (bitmap->need_sync) { 1182 if (bitmap->need_sync &&
1183 bitmap->mddev->bitmap_info.external == 0) {
1146 bitmap_super_t *sb; 1184 bitmap_super_t *sb;
1147 bitmap->need_sync = 0; 1185 bitmap->need_sync = 0;
1148 sb = kmap_atomic(bitmap->sb_page, KM_USER0); 1186 sb = kmap_atomic(bitmap->sb_page, KM_USER0);
@@ -1152,7 +1190,8 @@ void bitmap_daemon_work(struct bitmap *bitmap)
1152 write_page(bitmap, bitmap->sb_page, 1); 1190 write_page(bitmap, bitmap->sb_page, 1);
1153 } 1191 }
1154 spin_lock_irqsave(&bitmap->lock, flags); 1192 spin_lock_irqsave(&bitmap->lock, flags);
1155 clear_page_attr(bitmap, page, BITMAP_PAGE_CLEAN); 1193 if (!bitmap->need_sync)
1194 clear_page_attr(bitmap, page, BITMAP_PAGE_CLEAN);
1156 } 1195 }
1157 bmc = bitmap_get_counter(bitmap, 1196 bmc = bitmap_get_counter(bitmap,
1158 (sector_t)j << CHUNK_BLOCK_SHIFT(bitmap), 1197 (sector_t)j << CHUNK_BLOCK_SHIFT(bitmap),
@@ -1167,7 +1206,7 @@ void bitmap_daemon_work(struct bitmap *bitmap)
1167 if (*bmc == 2) { 1206 if (*bmc == 2) {
1168 *bmc=1; /* maybe clear the bit next time */ 1207 *bmc=1; /* maybe clear the bit next time */
1169 set_page_attr(bitmap, page, BITMAP_PAGE_CLEAN); 1208 set_page_attr(bitmap, page, BITMAP_PAGE_CLEAN);
1170 } else if (*bmc == 1) { 1209 } else if (*bmc == 1 && !bitmap->need_sync) {
1171 /* we can clear the bit */ 1210 /* we can clear the bit */
1172 *bmc = 0; 1211 *bmc = 0;
1173 bitmap_count_page(bitmap, 1212 bitmap_count_page(bitmap,
@@ -1177,9 +1216,11 @@ void bitmap_daemon_work(struct bitmap *bitmap)
1177 /* clear the bit */ 1216 /* clear the bit */
1178 paddr = kmap_atomic(page, KM_USER0); 1217 paddr = kmap_atomic(page, KM_USER0);
1179 if (bitmap->flags & BITMAP_HOSTENDIAN) 1218 if (bitmap->flags & BITMAP_HOSTENDIAN)
1180 clear_bit(file_page_offset(j), paddr); 1219 clear_bit(file_page_offset(bitmap, j),
1220 paddr);
1181 else 1221 else
1182 ext2_clear_bit(file_page_offset(j), paddr); 1222 ext2_clear_bit(file_page_offset(bitmap, j),
1223 paddr);
1183 kunmap_atomic(paddr, KM_USER0); 1224 kunmap_atomic(paddr, KM_USER0);
1184 } 1225 }
1185 } else 1226 } else
@@ -1202,7 +1243,9 @@ void bitmap_daemon_work(struct bitmap *bitmap)
1202 1243
1203 done: 1244 done:
1204 if (bitmap->allclean == 0) 1245 if (bitmap->allclean == 0)
1205 bitmap->mddev->thread->timeout = bitmap->daemon_sleep * HZ; 1246 bitmap->mddev->thread->timeout =
1247 bitmap->mddev->bitmap_info.daemon_sleep;
1248 mutex_unlock(&mddev->bitmap_info.mutex);
1206} 1249}
1207 1250
1208static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap, 1251static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap,
@@ -1332,6 +1375,7 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto
1332 bitmap->events_cleared < bitmap->mddev->events) { 1375 bitmap->events_cleared < bitmap->mddev->events) {
1333 bitmap->events_cleared = bitmap->mddev->events; 1376 bitmap->events_cleared = bitmap->mddev->events;
1334 bitmap->need_sync = 1; 1377 bitmap->need_sync = 1;
1378 sysfs_notify_dirent(bitmap->sysfs_can_clear);
1335 } 1379 }
1336 1380
1337 if (!success && ! (*bmc & NEEDED_MASK)) 1381 if (!success && ! (*bmc & NEEDED_MASK))
@@ -1470,7 +1514,7 @@ void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector)
1470 return; 1514 return;
1471 } 1515 }
1472 if (time_before(jiffies, (bitmap->last_end_sync 1516 if (time_before(jiffies, (bitmap->last_end_sync
1473 + bitmap->daemon_sleep * HZ))) 1517 + bitmap->mddev->bitmap_info.daemon_sleep)))
1474 return; 1518 return;
1475 wait_event(bitmap->mddev->recovery_wait, 1519 wait_event(bitmap->mddev->recovery_wait,
1476 atomic_read(&bitmap->mddev->recovery_active) == 0); 1520 atomic_read(&bitmap->mddev->recovery_active) == 0);
@@ -1522,6 +1566,12 @@ void bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long e)
1522 sector_t sec = (sector_t)chunk << CHUNK_BLOCK_SHIFT(bitmap); 1566 sector_t sec = (sector_t)chunk << CHUNK_BLOCK_SHIFT(bitmap);
1523 bitmap_set_memory_bits(bitmap, sec, 1); 1567 bitmap_set_memory_bits(bitmap, sec, 1);
1524 bitmap_file_set_bit(bitmap, sec); 1568 bitmap_file_set_bit(bitmap, sec);
1569 if (sec < bitmap->mddev->recovery_cp)
1570 /* We are asserting that the array is dirty,
1571 * so move the recovery_cp address back so
1572 * that it is obvious that it is dirty
1573 */
1574 bitmap->mddev->recovery_cp = sec;
1525 } 1575 }
1526} 1576}
1527 1577
@@ -1531,7 +1581,7 @@ void bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long e)
1531void bitmap_flush(mddev_t *mddev) 1581void bitmap_flush(mddev_t *mddev)
1532{ 1582{
1533 struct bitmap *bitmap = mddev->bitmap; 1583 struct bitmap *bitmap = mddev->bitmap;
1534 int sleep; 1584 long sleep;
1535 1585
1536 if (!bitmap) /* there was no bitmap */ 1586 if (!bitmap) /* there was no bitmap */
1537 return; 1587 return;
@@ -1539,12 +1589,13 @@ void bitmap_flush(mddev_t *mddev)
1539 /* run the daemon_work three time to ensure everything is flushed 1589 /* run the daemon_work three time to ensure everything is flushed
1540 * that can be 1590 * that can be
1541 */ 1591 */
1542 sleep = bitmap->daemon_sleep; 1592 sleep = mddev->bitmap_info.daemon_sleep * 2;
1543 bitmap->daemon_sleep = 0; 1593 bitmap->daemon_lastrun -= sleep;
1544 bitmap_daemon_work(bitmap); 1594 bitmap_daemon_work(mddev);
1545 bitmap_daemon_work(bitmap); 1595 bitmap->daemon_lastrun -= sleep;
1546 bitmap_daemon_work(bitmap); 1596 bitmap_daemon_work(mddev);
1547 bitmap->daemon_sleep = sleep; 1597 bitmap->daemon_lastrun -= sleep;
1598 bitmap_daemon_work(mddev);
1548 bitmap_update_sb(bitmap); 1599 bitmap_update_sb(bitmap);
1549} 1600}
1550 1601
@@ -1574,6 +1625,7 @@ static void bitmap_free(struct bitmap *bitmap)
1574 kfree(bp); 1625 kfree(bp);
1575 kfree(bitmap); 1626 kfree(bitmap);
1576} 1627}
1628
1577void bitmap_destroy(mddev_t *mddev) 1629void bitmap_destroy(mddev_t *mddev)
1578{ 1630{
1579 struct bitmap *bitmap = mddev->bitmap; 1631 struct bitmap *bitmap = mddev->bitmap;
@@ -1581,10 +1633,15 @@ void bitmap_destroy(mddev_t *mddev)
1581 if (!bitmap) /* there was no bitmap */ 1633 if (!bitmap) /* there was no bitmap */
1582 return; 1634 return;
1583 1635
1636 mutex_lock(&mddev->bitmap_info.mutex);
1584 mddev->bitmap = NULL; /* disconnect from the md device */ 1637 mddev->bitmap = NULL; /* disconnect from the md device */
1638 mutex_unlock(&mddev->bitmap_info.mutex);
1585 if (mddev->thread) 1639 if (mddev->thread)
1586 mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT; 1640 mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT;
1587 1641
1642 if (bitmap->sysfs_can_clear)
1643 sysfs_put(bitmap->sysfs_can_clear);
1644
1588 bitmap_free(bitmap); 1645 bitmap_free(bitmap);
1589} 1646}
1590 1647
@@ -1598,16 +1655,17 @@ int bitmap_create(mddev_t *mddev)
1598 sector_t blocks = mddev->resync_max_sectors; 1655 sector_t blocks = mddev->resync_max_sectors;
1599 unsigned long chunks; 1656 unsigned long chunks;
1600 unsigned long pages; 1657 unsigned long pages;
1601 struct file *file = mddev->bitmap_file; 1658 struct file *file = mddev->bitmap_info.file;
1602 int err; 1659 int err;
1603 sector_t start; 1660 sector_t start;
1661 struct sysfs_dirent *bm;
1604 1662
1605 BUILD_BUG_ON(sizeof(bitmap_super_t) != 256); 1663 BUILD_BUG_ON(sizeof(bitmap_super_t) != 256);
1606 1664
1607 if (!file && !mddev->bitmap_offset) /* bitmap disabled, nothing to do */ 1665 if (!file && !mddev->bitmap_info.offset) /* bitmap disabled, nothing to do */
1608 return 0; 1666 return 0;
1609 1667
1610 BUG_ON(file && mddev->bitmap_offset); 1668 BUG_ON(file && mddev->bitmap_info.offset);
1611 1669
1612 bitmap = kzalloc(sizeof(*bitmap), GFP_KERNEL); 1670 bitmap = kzalloc(sizeof(*bitmap), GFP_KERNEL);
1613 if (!bitmap) 1671 if (!bitmap)
@@ -1620,8 +1678,14 @@ int bitmap_create(mddev_t *mddev)
1620 1678
1621 bitmap->mddev = mddev; 1679 bitmap->mddev = mddev;
1622 1680
1681 bm = sysfs_get_dirent(mddev->kobj.sd, "bitmap");
1682 if (bm) {
1683 bitmap->sysfs_can_clear = sysfs_get_dirent(bm, "can_clear");
1684 sysfs_put(bm);
1685 } else
1686 bitmap->sysfs_can_clear = NULL;
1687
1623 bitmap->file = file; 1688 bitmap->file = file;
1624 bitmap->offset = mddev->bitmap_offset;
1625 if (file) { 1689 if (file) {
1626 get_file(file); 1690 get_file(file);
1627 /* As future accesses to this file will use bmap, 1691 /* As future accesses to this file will use bmap,
@@ -1630,12 +1694,22 @@ int bitmap_create(mddev_t *mddev)
1630 */ 1694 */
1631 vfs_fsync(file, file->f_dentry, 1); 1695 vfs_fsync(file, file->f_dentry, 1);
1632 } 1696 }
1633 /* read superblock from bitmap file (this sets bitmap->chunksize) */ 1697 /* read superblock from bitmap file (this sets mddev->bitmap_info.chunksize) */
1634 err = bitmap_read_sb(bitmap); 1698 if (!mddev->bitmap_info.external)
1699 err = bitmap_read_sb(bitmap);
1700 else {
1701 err = 0;
1702 if (mddev->bitmap_info.chunksize == 0 ||
1703 mddev->bitmap_info.daemon_sleep == 0)
1704 /* chunksize and time_base need to be
1705 * set first. */
1706 err = -EINVAL;
1707 }
1635 if (err) 1708 if (err)
1636 goto error; 1709 goto error;
1637 1710
1638 bitmap->chunkshift = ffz(~bitmap->chunksize); 1711 bitmap->daemon_lastrun = jiffies;
1712 bitmap->chunkshift = ffz(~mddev->bitmap_info.chunksize);
1639 1713
1640 /* now that chunksize and chunkshift are set, we can use these macros */ 1714 /* now that chunksize and chunkshift are set, we can use these macros */
1641 chunks = (blocks + CHUNK_BLOCK_RATIO(bitmap) - 1) >> 1715 chunks = (blocks + CHUNK_BLOCK_RATIO(bitmap) - 1) >>
@@ -1677,7 +1751,8 @@ int bitmap_create(mddev_t *mddev)
1677 1751
1678 mddev->bitmap = bitmap; 1752 mddev->bitmap = bitmap;
1679 1753
1680 mddev->thread->timeout = bitmap->daemon_sleep * HZ; 1754 mddev->thread->timeout = mddev->bitmap_info.daemon_sleep;
1755 md_wakeup_thread(mddev->thread);
1681 1756
1682 bitmap_update_sb(bitmap); 1757 bitmap_update_sb(bitmap);
1683 1758
@@ -1688,6 +1763,264 @@ int bitmap_create(mddev_t *mddev)
1688 return err; 1763 return err;
1689} 1764}
1690 1765
1766static ssize_t
1767location_show(mddev_t *mddev, char *page)
1768{
1769 ssize_t len;
1770 if (mddev->bitmap_info.file) {
1771 len = sprintf(page, "file");
1772 } else if (mddev->bitmap_info.offset) {
1773 len = sprintf(page, "%+lld", (long long)mddev->bitmap_info.offset);
1774 } else
1775 len = sprintf(page, "none");
1776 len += sprintf(page+len, "\n");
1777 return len;
1778}
1779
1780static ssize_t
1781location_store(mddev_t *mddev, const char *buf, size_t len)
1782{
1783
1784 if (mddev->pers) {
1785 if (!mddev->pers->quiesce)
1786 return -EBUSY;
1787 if (mddev->recovery || mddev->sync_thread)
1788 return -EBUSY;
1789 }
1790
1791 if (mddev->bitmap || mddev->bitmap_info.file ||
1792 mddev->bitmap_info.offset) {
1793 /* bitmap already configured. Only option is to clear it */
1794 if (strncmp(buf, "none", 4) != 0)
1795 return -EBUSY;
1796 if (mddev->pers) {
1797 mddev->pers->quiesce(mddev, 1);
1798 bitmap_destroy(mddev);
1799 mddev->pers->quiesce(mddev, 0);
1800 }
1801 mddev->bitmap_info.offset = 0;
1802 if (mddev->bitmap_info.file) {
1803 struct file *f = mddev->bitmap_info.file;
1804 mddev->bitmap_info.file = NULL;
1805 restore_bitmap_write_access(f);
1806 fput(f);
1807 }
1808 } else {
1809 /* No bitmap, OK to set a location */
1810 long long offset;
1811 if (strncmp(buf, "none", 4) == 0)
1812 /* nothing to be done */;
1813 else if (strncmp(buf, "file:", 5) == 0) {
1814 /* Not supported yet */
1815 return -EINVAL;
1816 } else {
1817 int rv;
1818 if (buf[0] == '+')
1819 rv = strict_strtoll(buf+1, 10, &offset);
1820 else
1821 rv = strict_strtoll(buf, 10, &offset);
1822 if (rv)
1823 return rv;
1824 if (offset == 0)
1825 return -EINVAL;
1826 if (mddev->bitmap_info.external == 0 &&
1827 mddev->major_version == 0 &&
1828 offset != mddev->bitmap_info.default_offset)
1829 return -EINVAL;
1830 mddev->bitmap_info.offset = offset;
1831 if (mddev->pers) {
1832 mddev->pers->quiesce(mddev, 1);
1833 rv = bitmap_create(mddev);
1834 if (rv) {
1835 bitmap_destroy(mddev);
1836 mddev->bitmap_info.offset = 0;
1837 }
1838 mddev->pers->quiesce(mddev, 0);
1839 if (rv)
1840 return rv;
1841 }
1842 }
1843 }
1844 if (!mddev->external) {
1845 /* Ensure new bitmap info is stored in
1846 * metadata promptly.
1847 */
1848 set_bit(MD_CHANGE_DEVS, &mddev->flags);
1849 md_wakeup_thread(mddev->thread);
1850 }
1851 return len;
1852}
1853
1854static struct md_sysfs_entry bitmap_location =
1855__ATTR(location, S_IRUGO|S_IWUSR, location_show, location_store);
1856
1857static ssize_t
1858timeout_show(mddev_t *mddev, char *page)
1859{
1860 ssize_t len;
1861 unsigned long secs = mddev->bitmap_info.daemon_sleep / HZ;
1862 unsigned long jifs = mddev->bitmap_info.daemon_sleep % HZ;
1863
1864 len = sprintf(page, "%lu", secs);
1865 if (jifs)
1866 len += sprintf(page+len, ".%03u", jiffies_to_msecs(jifs));
1867 len += sprintf(page+len, "\n");
1868 return len;
1869}
1870
1871static ssize_t
1872timeout_store(mddev_t *mddev, const char *buf, size_t len)
1873{
1874 /* timeout can be set at any time */
1875 unsigned long timeout;
1876 int rv = strict_strtoul_scaled(buf, &timeout, 4);
1877 if (rv)
1878 return rv;
1879
1880 /* just to make sure we don't overflow... */
1881 if (timeout >= LONG_MAX / HZ)
1882 return -EINVAL;
1883
1884 timeout = timeout * HZ / 10000;
1885
1886 if (timeout >= MAX_SCHEDULE_TIMEOUT)
1887 timeout = MAX_SCHEDULE_TIMEOUT-1;
1888 if (timeout < 1)
1889 timeout = 1;
1890 mddev->bitmap_info.daemon_sleep = timeout;
1891 if (mddev->thread) {
1892 /* if thread->timeout is MAX_SCHEDULE_TIMEOUT, then
1893 * the bitmap is all clean and we don't need to
1894 * adjust the timeout right now
1895 */
1896 if (mddev->thread->timeout < MAX_SCHEDULE_TIMEOUT) {
1897 mddev->thread->timeout = timeout;
1898 md_wakeup_thread(mddev->thread);
1899 }
1900 }
1901 return len;
1902}
1903
1904static struct md_sysfs_entry bitmap_timeout =
1905__ATTR(time_base, S_IRUGO|S_IWUSR, timeout_show, timeout_store);
1906
1907static ssize_t
1908backlog_show(mddev_t *mddev, char *page)
1909{
1910 return sprintf(page, "%lu\n", mddev->bitmap_info.max_write_behind);
1911}
1912
1913static ssize_t
1914backlog_store(mddev_t *mddev, const char *buf, size_t len)
1915{
1916 unsigned long backlog;
1917 int rv = strict_strtoul(buf, 10, &backlog);
1918 if (rv)
1919 return rv;
1920 if (backlog > COUNTER_MAX)
1921 return -EINVAL;
1922 mddev->bitmap_info.max_write_behind = backlog;
1923 return len;
1924}
1925
1926static struct md_sysfs_entry bitmap_backlog =
1927__ATTR(backlog, S_IRUGO|S_IWUSR, backlog_show, backlog_store);
1928
1929static ssize_t
1930chunksize_show(mddev_t *mddev, char *page)
1931{
1932 return sprintf(page, "%lu\n", mddev->bitmap_info.chunksize);
1933}
1934
1935static ssize_t
1936chunksize_store(mddev_t *mddev, const char *buf, size_t len)
1937{
1938 /* Can only be changed when no bitmap is active */
1939 int rv;
1940 unsigned long csize;
1941 if (mddev->bitmap)
1942 return -EBUSY;
1943 rv = strict_strtoul(buf, 10, &csize);
1944 if (rv)
1945 return rv;
1946 if (csize < 512 ||
1947 !is_power_of_2(csize))
1948 return -EINVAL;
1949 mddev->bitmap_info.chunksize = csize;
1950 return len;
1951}
1952
1953static struct md_sysfs_entry bitmap_chunksize =
1954__ATTR(chunksize, S_IRUGO|S_IWUSR, chunksize_show, chunksize_store);
1955
1956static ssize_t metadata_show(mddev_t *mddev, char *page)
1957{
1958 return sprintf(page, "%s\n", (mddev->bitmap_info.external
1959 ? "external" : "internal"));
1960}
1961
1962static ssize_t metadata_store(mddev_t *mddev, const char *buf, size_t len)
1963{
1964 if (mddev->bitmap ||
1965 mddev->bitmap_info.file ||
1966 mddev->bitmap_info.offset)
1967 return -EBUSY;
1968 if (strncmp(buf, "external", 8) == 0)
1969 mddev->bitmap_info.external = 1;
1970 else if (strncmp(buf, "internal", 8) == 0)
1971 mddev->bitmap_info.external = 0;
1972 else
1973 return -EINVAL;
1974 return len;
1975}
1976
1977static struct md_sysfs_entry bitmap_metadata =
1978__ATTR(metadata, S_IRUGO|S_IWUSR, metadata_show, metadata_store);
1979
1980static ssize_t can_clear_show(mddev_t *mddev, char *page)
1981{
1982 int len;
1983 if (mddev->bitmap)
1984 len = sprintf(page, "%s\n", (mddev->bitmap->need_sync ?
1985 "false" : "true"));
1986 else
1987 len = sprintf(page, "\n");
1988 return len;
1989}
1990
1991static ssize_t can_clear_store(mddev_t *mddev, const char *buf, size_t len)
1992{
1993 if (mddev->bitmap == NULL)
1994 return -ENOENT;
1995 if (strncmp(buf, "false", 5) == 0)
1996 mddev->bitmap->need_sync = 1;
1997 else if (strncmp(buf, "true", 4) == 0) {
1998 if (mddev->degraded)
1999 return -EBUSY;
2000 mddev->bitmap->need_sync = 0;
2001 } else
2002 return -EINVAL;
2003 return len;
2004}
2005
2006static struct md_sysfs_entry bitmap_can_clear =
2007__ATTR(can_clear, S_IRUGO|S_IWUSR, can_clear_show, can_clear_store);
2008
2009static struct attribute *md_bitmap_attrs[] = {
2010 &bitmap_location.attr,
2011 &bitmap_timeout.attr,
2012 &bitmap_backlog.attr,
2013 &bitmap_chunksize.attr,
2014 &bitmap_metadata.attr,
2015 &bitmap_can_clear.attr,
2016 NULL
2017};
2018struct attribute_group md_bitmap_group = {
2019 .name = "bitmap",
2020 .attrs = md_bitmap_attrs,
2021};
2022
2023
1691/* the bitmap API -- for raid personalities */ 2024/* the bitmap API -- for raid personalities */
1692EXPORT_SYMBOL(bitmap_startwrite); 2025EXPORT_SYMBOL(bitmap_startwrite);
1693EXPORT_SYMBOL(bitmap_endwrite); 2026EXPORT_SYMBOL(bitmap_endwrite);
diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h
index e98900671ca9..cb821d76d1b4 100644
--- a/drivers/md/bitmap.h
+++ b/drivers/md/bitmap.h
@@ -106,7 +106,7 @@ typedef __u16 bitmap_counter_t;
106#define BITMAP_BLOCK_SHIFT 9 106#define BITMAP_BLOCK_SHIFT 9
107 107
108/* how many blocks per chunk? (this is variable) */ 108/* how many blocks per chunk? (this is variable) */
109#define CHUNK_BLOCK_RATIO(bitmap) ((bitmap)->chunksize >> BITMAP_BLOCK_SHIFT) 109#define CHUNK_BLOCK_RATIO(bitmap) ((bitmap)->mddev->bitmap_info.chunksize >> BITMAP_BLOCK_SHIFT)
110#define CHUNK_BLOCK_SHIFT(bitmap) ((bitmap)->chunkshift - BITMAP_BLOCK_SHIFT) 110#define CHUNK_BLOCK_SHIFT(bitmap) ((bitmap)->chunkshift - BITMAP_BLOCK_SHIFT)
111#define CHUNK_BLOCK_MASK(bitmap) (CHUNK_BLOCK_RATIO(bitmap) - 1) 111#define CHUNK_BLOCK_MASK(bitmap) (CHUNK_BLOCK_RATIO(bitmap) - 1)
112 112
@@ -118,16 +118,6 @@ typedef __u16 bitmap_counter_t;
118 (CHUNK_BLOCK_SHIFT(bitmap) + PAGE_COUNTER_SHIFT - 1) 118 (CHUNK_BLOCK_SHIFT(bitmap) + PAGE_COUNTER_SHIFT - 1)
119#define PAGEPTR_BLOCK_MASK(bitmap) (PAGEPTR_BLOCK_RATIO(bitmap) - 1) 119#define PAGEPTR_BLOCK_MASK(bitmap) (PAGEPTR_BLOCK_RATIO(bitmap) - 1)
120 120
121/*
122 * on-disk bitmap:
123 *
124 * Use one bit per "chunk" (block set). We do the disk I/O on the bitmap
125 * file a page at a time. There's a superblock at the start of the file.
126 */
127
128/* map chunks (bits) to file pages - offset by the size of the superblock */
129#define CHUNK_BIT_OFFSET(chunk) ((chunk) + (sizeof(bitmap_super_t) << 3))
130
131#endif 121#endif
132 122
133/* 123/*
@@ -209,7 +199,6 @@ struct bitmap {
209 int counter_bits; /* how many bits per block counter */ 199 int counter_bits; /* how many bits per block counter */
210 200
211 /* bitmap chunksize -- how much data does each bit represent? */ 201 /* bitmap chunksize -- how much data does each bit represent? */
212 unsigned long chunksize;
213 unsigned long chunkshift; /* chunksize = 2^chunkshift (for bitops) */ 202 unsigned long chunkshift; /* chunksize = 2^chunkshift (for bitops) */
214 unsigned long chunks; /* total number of data chunks for the array */ 203 unsigned long chunks; /* total number of data chunks for the array */
215 204
@@ -226,7 +215,6 @@ struct bitmap {
226 /* bitmap spinlock */ 215 /* bitmap spinlock */
227 spinlock_t lock; 216 spinlock_t lock;
228 217
229 long offset; /* offset from superblock if file is NULL */
230 struct file *file; /* backing disk file */ 218 struct file *file; /* backing disk file */
231 struct page *sb_page; /* cached copy of the bitmap file superblock */ 219 struct page *sb_page; /* cached copy of the bitmap file superblock */
232 struct page **filemap; /* list of cache pages for the file */ 220 struct page **filemap; /* list of cache pages for the file */
@@ -238,7 +226,6 @@ struct bitmap {
238 226
239 int allclean; 227 int allclean;
240 228
241 unsigned long max_write_behind; /* write-behind mode */
242 atomic_t behind_writes; 229 atomic_t behind_writes;
243 230
244 /* 231 /*
@@ -246,7 +233,6 @@ struct bitmap {
246 * file, cleaning up bits and flushing out pages to disk as necessary 233 * file, cleaning up bits and flushing out pages to disk as necessary
247 */ 234 */
248 unsigned long daemon_lastrun; /* jiffies of last run */ 235 unsigned long daemon_lastrun; /* jiffies of last run */
249 unsigned long daemon_sleep; /* how many seconds between updates? */
250 unsigned long last_end_sync; /* when we lasted called end_sync to 236 unsigned long last_end_sync; /* when we lasted called end_sync to
251 * update bitmap with resync progress */ 237 * update bitmap with resync progress */
252 238
@@ -254,6 +240,7 @@ struct bitmap {
254 wait_queue_head_t write_wait; 240 wait_queue_head_t write_wait;
255 wait_queue_head_t overflow_wait; 241 wait_queue_head_t overflow_wait;
256 242
243 struct sysfs_dirent *sysfs_can_clear;
257}; 244};
258 245
259/* the bitmap API */ 246/* the bitmap API */
@@ -282,7 +269,7 @@ void bitmap_close_sync(struct bitmap *bitmap);
282void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector); 269void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector);
283 270
284void bitmap_unplug(struct bitmap *bitmap); 271void bitmap_unplug(struct bitmap *bitmap);
285void bitmap_daemon_work(struct bitmap *bitmap); 272void bitmap_daemon_work(mddev_t *mddev);
286#endif 273#endif
287 274
288#endif 275#endif
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index ed1038164019..3bdbb6115702 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * Copyright (C) 2003 Christophe Saout <christophe@saout.de> 2 * Copyright (C) 2003 Christophe Saout <christophe@saout.de>
3 * Copyright (C) 2004 Clemens Fruhwirth <clemens@endorphin.org> 3 * Copyright (C) 2004 Clemens Fruhwirth <clemens@endorphin.org>
4 * Copyright (C) 2006-2008 Red Hat, Inc. All rights reserved. 4 * Copyright (C) 2006-2009 Red Hat, Inc. All rights reserved.
5 * 5 *
6 * This file is released under the GPL. 6 * This file is released under the GPL.
7 */ 7 */
@@ -71,10 +71,21 @@ struct crypt_iv_operations {
71 int (*ctr)(struct crypt_config *cc, struct dm_target *ti, 71 int (*ctr)(struct crypt_config *cc, struct dm_target *ti,
72 const char *opts); 72 const char *opts);
73 void (*dtr)(struct crypt_config *cc); 73 void (*dtr)(struct crypt_config *cc);
74 const char *(*status)(struct crypt_config *cc); 74 int (*init)(struct crypt_config *cc);
75 int (*wipe)(struct crypt_config *cc);
75 int (*generator)(struct crypt_config *cc, u8 *iv, sector_t sector); 76 int (*generator)(struct crypt_config *cc, u8 *iv, sector_t sector);
76}; 77};
77 78
79struct iv_essiv_private {
80 struct crypto_cipher *tfm;
81 struct crypto_hash *hash_tfm;
82 u8 *salt;
83};
84
85struct iv_benbi_private {
86 int shift;
87};
88
78/* 89/*
79 * Crypt: maps a linear range of a block device 90 * Crypt: maps a linear range of a block device
80 * and encrypts / decrypts at the same time. 91 * and encrypts / decrypts at the same time.
@@ -102,8 +113,8 @@ struct crypt_config {
102 struct crypt_iv_operations *iv_gen_ops; 113 struct crypt_iv_operations *iv_gen_ops;
103 char *iv_mode; 114 char *iv_mode;
104 union { 115 union {
105 struct crypto_cipher *essiv_tfm; 116 struct iv_essiv_private essiv;
106 int benbi_shift; 117 struct iv_benbi_private benbi;
107 } iv_gen_private; 118 } iv_gen_private;
108 sector_t iv_offset; 119 sector_t iv_offset;
109 unsigned int iv_size; 120 unsigned int iv_size;
@@ -147,6 +158,9 @@ static void kcryptd_queue_crypt(struct dm_crypt_io *io);
147 * plain: the initial vector is the 32-bit little-endian version of the sector 158 * plain: the initial vector is the 32-bit little-endian version of the sector
148 * number, padded with zeros if necessary. 159 * number, padded with zeros if necessary.
149 * 160 *
161 * plain64: the initial vector is the 64-bit little-endian version of the sector
162 * number, padded with zeros if necessary.
163 *
150 * essiv: "encrypted sector|salt initial vector", the sector number is 164 * essiv: "encrypted sector|salt initial vector", the sector number is
151 * encrypted with the bulk cipher using a salt as key. The salt 165 * encrypted with the bulk cipher using a salt as key. The salt
152 * should be derived from the bulk cipher's key via hashing. 166 * should be derived from the bulk cipher's key via hashing.
@@ -169,88 +183,123 @@ static int crypt_iv_plain_gen(struct crypt_config *cc, u8 *iv, sector_t sector)
169 return 0; 183 return 0;
170} 184}
171 185
172static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti, 186static int crypt_iv_plain64_gen(struct crypt_config *cc, u8 *iv,
173 const char *opts) 187 sector_t sector)
174{ 188{
175 struct crypto_cipher *essiv_tfm; 189 memset(iv, 0, cc->iv_size);
176 struct crypto_hash *hash_tfm; 190 *(u64 *)iv = cpu_to_le64(sector);
191
192 return 0;
193}
194
195/* Initialise ESSIV - compute salt but no local memory allocations */
196static int crypt_iv_essiv_init(struct crypt_config *cc)
197{
198 struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv;
177 struct hash_desc desc; 199 struct hash_desc desc;
178 struct scatterlist sg; 200 struct scatterlist sg;
179 unsigned int saltsize;
180 u8 *salt;
181 int err; 201 int err;
182 202
183 if (opts == NULL) { 203 sg_init_one(&sg, cc->key, cc->key_size);
204 desc.tfm = essiv->hash_tfm;
205 desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
206
207 err = crypto_hash_digest(&desc, &sg, cc->key_size, essiv->salt);
208 if (err)
209 return err;
210
211 return crypto_cipher_setkey(essiv->tfm, essiv->salt,
212 crypto_hash_digestsize(essiv->hash_tfm));
213}
214
215/* Wipe salt and reset key derived from volume key */
216static int crypt_iv_essiv_wipe(struct crypt_config *cc)
217{
218 struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv;
219 unsigned salt_size = crypto_hash_digestsize(essiv->hash_tfm);
220
221 memset(essiv->salt, 0, salt_size);
222
223 return crypto_cipher_setkey(essiv->tfm, essiv->salt, salt_size);
224}
225
226static void crypt_iv_essiv_dtr(struct crypt_config *cc)
227{
228 struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv;
229
230 crypto_free_cipher(essiv->tfm);
231 essiv->tfm = NULL;
232
233 crypto_free_hash(essiv->hash_tfm);
234 essiv->hash_tfm = NULL;
235
236 kzfree(essiv->salt);
237 essiv->salt = NULL;
238}
239
240static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
241 const char *opts)
242{
243 struct crypto_cipher *essiv_tfm = NULL;
244 struct crypto_hash *hash_tfm = NULL;
245 u8 *salt = NULL;
246 int err;
247
248 if (!opts) {
184 ti->error = "Digest algorithm missing for ESSIV mode"; 249 ti->error = "Digest algorithm missing for ESSIV mode";
185 return -EINVAL; 250 return -EINVAL;
186 } 251 }
187 252
188 /* Hash the cipher key with the given hash algorithm */ 253 /* Allocate hash algorithm */
189 hash_tfm = crypto_alloc_hash(opts, 0, CRYPTO_ALG_ASYNC); 254 hash_tfm = crypto_alloc_hash(opts, 0, CRYPTO_ALG_ASYNC);
190 if (IS_ERR(hash_tfm)) { 255 if (IS_ERR(hash_tfm)) {
191 ti->error = "Error initializing ESSIV hash"; 256 ti->error = "Error initializing ESSIV hash";
192 return PTR_ERR(hash_tfm); 257 err = PTR_ERR(hash_tfm);
258 goto bad;
193 } 259 }
194 260
195 saltsize = crypto_hash_digestsize(hash_tfm); 261 salt = kzalloc(crypto_hash_digestsize(hash_tfm), GFP_KERNEL);
196 salt = kmalloc(saltsize, GFP_KERNEL); 262 if (!salt) {
197 if (salt == NULL) {
198 ti->error = "Error kmallocing salt storage in ESSIV"; 263 ti->error = "Error kmallocing salt storage in ESSIV";
199 crypto_free_hash(hash_tfm); 264 err = -ENOMEM;
200 return -ENOMEM; 265 goto bad;
201 } 266 }
202 267
203 sg_init_one(&sg, cc->key, cc->key_size); 268 /* Allocate essiv_tfm */
204 desc.tfm = hash_tfm;
205 desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
206 err = crypto_hash_digest(&desc, &sg, cc->key_size, salt);
207 crypto_free_hash(hash_tfm);
208
209 if (err) {
210 ti->error = "Error calculating hash in ESSIV";
211 kfree(salt);
212 return err;
213 }
214
215 /* Setup the essiv_tfm with the given salt */
216 essiv_tfm = crypto_alloc_cipher(cc->cipher, 0, CRYPTO_ALG_ASYNC); 269 essiv_tfm = crypto_alloc_cipher(cc->cipher, 0, CRYPTO_ALG_ASYNC);
217 if (IS_ERR(essiv_tfm)) { 270 if (IS_ERR(essiv_tfm)) {
218 ti->error = "Error allocating crypto tfm for ESSIV"; 271 ti->error = "Error allocating crypto tfm for ESSIV";
219 kfree(salt); 272 err = PTR_ERR(essiv_tfm);
220 return PTR_ERR(essiv_tfm); 273 goto bad;
221 } 274 }
222 if (crypto_cipher_blocksize(essiv_tfm) != 275 if (crypto_cipher_blocksize(essiv_tfm) !=
223 crypto_ablkcipher_ivsize(cc->tfm)) { 276 crypto_ablkcipher_ivsize(cc->tfm)) {
224 ti->error = "Block size of ESSIV cipher does " 277 ti->error = "Block size of ESSIV cipher does "
225 "not match IV size of block cipher"; 278 "not match IV size of block cipher";
226 crypto_free_cipher(essiv_tfm); 279 err = -EINVAL;
227 kfree(salt); 280 goto bad;
228 return -EINVAL;
229 } 281 }
230 err = crypto_cipher_setkey(essiv_tfm, salt, saltsize);
231 if (err) {
232 ti->error = "Failed to set key for ESSIV cipher";
233 crypto_free_cipher(essiv_tfm);
234 kfree(salt);
235 return err;
236 }
237 kfree(salt);
238 282
239 cc->iv_gen_private.essiv_tfm = essiv_tfm; 283 cc->iv_gen_private.essiv.salt = salt;
284 cc->iv_gen_private.essiv.tfm = essiv_tfm;
285 cc->iv_gen_private.essiv.hash_tfm = hash_tfm;
286
240 return 0; 287 return 0;
241}
242 288
243static void crypt_iv_essiv_dtr(struct crypt_config *cc) 289bad:
244{ 290 if (essiv_tfm && !IS_ERR(essiv_tfm))
245 crypto_free_cipher(cc->iv_gen_private.essiv_tfm); 291 crypto_free_cipher(essiv_tfm);
246 cc->iv_gen_private.essiv_tfm = NULL; 292 if (hash_tfm && !IS_ERR(hash_tfm))
293 crypto_free_hash(hash_tfm);
294 kfree(salt);
295 return err;
247} 296}
248 297
249static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv, sector_t sector) 298static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv, sector_t sector)
250{ 299{
251 memset(iv, 0, cc->iv_size); 300 memset(iv, 0, cc->iv_size);
252 *(u64 *)iv = cpu_to_le64(sector); 301 *(u64 *)iv = cpu_to_le64(sector);
253 crypto_cipher_encrypt_one(cc->iv_gen_private.essiv_tfm, iv, iv); 302 crypto_cipher_encrypt_one(cc->iv_gen_private.essiv.tfm, iv, iv);
254 return 0; 303 return 0;
255} 304}
256 305
@@ -273,7 +322,7 @@ static int crypt_iv_benbi_ctr(struct crypt_config *cc, struct dm_target *ti,
273 return -EINVAL; 322 return -EINVAL;
274 } 323 }
275 324
276 cc->iv_gen_private.benbi_shift = 9 - log; 325 cc->iv_gen_private.benbi.shift = 9 - log;
277 326
278 return 0; 327 return 0;
279} 328}
@@ -288,7 +337,7 @@ static int crypt_iv_benbi_gen(struct crypt_config *cc, u8 *iv, sector_t sector)
288 337
289 memset(iv, 0, cc->iv_size - sizeof(u64)); /* rest is cleared below */ 338 memset(iv, 0, cc->iv_size - sizeof(u64)); /* rest is cleared below */
290 339
291 val = cpu_to_be64(((u64)sector << cc->iv_gen_private.benbi_shift) + 1); 340 val = cpu_to_be64(((u64)sector << cc->iv_gen_private.benbi.shift) + 1);
292 put_unaligned(val, (__be64 *)(iv + cc->iv_size - sizeof(u64))); 341 put_unaligned(val, (__be64 *)(iv + cc->iv_size - sizeof(u64)));
293 342
294 return 0; 343 return 0;
@@ -305,9 +354,15 @@ static struct crypt_iv_operations crypt_iv_plain_ops = {
305 .generator = crypt_iv_plain_gen 354 .generator = crypt_iv_plain_gen
306}; 355};
307 356
357static struct crypt_iv_operations crypt_iv_plain64_ops = {
358 .generator = crypt_iv_plain64_gen
359};
360
308static struct crypt_iv_operations crypt_iv_essiv_ops = { 361static struct crypt_iv_operations crypt_iv_essiv_ops = {
309 .ctr = crypt_iv_essiv_ctr, 362 .ctr = crypt_iv_essiv_ctr,
310 .dtr = crypt_iv_essiv_dtr, 363 .dtr = crypt_iv_essiv_dtr,
364 .init = crypt_iv_essiv_init,
365 .wipe = crypt_iv_essiv_wipe,
311 .generator = crypt_iv_essiv_gen 366 .generator = crypt_iv_essiv_gen
312}; 367};
313 368
@@ -934,14 +989,14 @@ static int crypt_set_key(struct crypt_config *cc, char *key)
934 989
935 set_bit(DM_CRYPT_KEY_VALID, &cc->flags); 990 set_bit(DM_CRYPT_KEY_VALID, &cc->flags);
936 991
937 return 0; 992 return crypto_ablkcipher_setkey(cc->tfm, cc->key, cc->key_size);
938} 993}
939 994
940static int crypt_wipe_key(struct crypt_config *cc) 995static int crypt_wipe_key(struct crypt_config *cc)
941{ 996{
942 clear_bit(DM_CRYPT_KEY_VALID, &cc->flags); 997 clear_bit(DM_CRYPT_KEY_VALID, &cc->flags);
943 memset(&cc->key, 0, cc->key_size * sizeof(u8)); 998 memset(&cc->key, 0, cc->key_size * sizeof(u8));
944 return 0; 999 return crypto_ablkcipher_setkey(cc->tfm, cc->key, cc->key_size);
945} 1000}
946 1001
947/* 1002/*
@@ -983,12 +1038,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
983 return -ENOMEM; 1038 return -ENOMEM;
984 } 1039 }
985 1040
986 if (crypt_set_key(cc, argv[1])) { 1041 /* Compatibility mode for old dm-crypt cipher strings */
987 ti->error = "Error decoding key";
988 goto bad_cipher;
989 }
990
991 /* Compatiblity mode for old dm-crypt cipher strings */
992 if (!chainmode || (strcmp(chainmode, "plain") == 0 && !ivmode)) { 1042 if (!chainmode || (strcmp(chainmode, "plain") == 0 && !ivmode)) {
993 chainmode = "cbc"; 1043 chainmode = "cbc";
994 ivmode = "plain"; 1044 ivmode = "plain";
@@ -1015,6 +1065,11 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1015 strcpy(cc->chainmode, chainmode); 1065 strcpy(cc->chainmode, chainmode);
1016 cc->tfm = tfm; 1066 cc->tfm = tfm;
1017 1067
1068 if (crypt_set_key(cc, argv[1]) < 0) {
1069 ti->error = "Error decoding and setting key";
1070 goto bad_ivmode;
1071 }
1072
1018 /* 1073 /*
1019 * Choose ivmode. Valid modes: "plain", "essiv:<esshash>", "benbi". 1074 * Choose ivmode. Valid modes: "plain", "essiv:<esshash>", "benbi".
1020 * See comments at iv code 1075 * See comments at iv code
@@ -1024,6 +1079,8 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1024 cc->iv_gen_ops = NULL; 1079 cc->iv_gen_ops = NULL;
1025 else if (strcmp(ivmode, "plain") == 0) 1080 else if (strcmp(ivmode, "plain") == 0)
1026 cc->iv_gen_ops = &crypt_iv_plain_ops; 1081 cc->iv_gen_ops = &crypt_iv_plain_ops;
1082 else if (strcmp(ivmode, "plain64") == 0)
1083 cc->iv_gen_ops = &crypt_iv_plain64_ops;
1027 else if (strcmp(ivmode, "essiv") == 0) 1084 else if (strcmp(ivmode, "essiv") == 0)
1028 cc->iv_gen_ops = &crypt_iv_essiv_ops; 1085 cc->iv_gen_ops = &crypt_iv_essiv_ops;
1029 else if (strcmp(ivmode, "benbi") == 0) 1086 else if (strcmp(ivmode, "benbi") == 0)
@@ -1039,6 +1096,12 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1039 cc->iv_gen_ops->ctr(cc, ti, ivopts) < 0) 1096 cc->iv_gen_ops->ctr(cc, ti, ivopts) < 0)
1040 goto bad_ivmode; 1097 goto bad_ivmode;
1041 1098
1099 if (cc->iv_gen_ops && cc->iv_gen_ops->init &&
1100 cc->iv_gen_ops->init(cc) < 0) {
1101 ti->error = "Error initialising IV";
1102 goto bad_slab_pool;
1103 }
1104
1042 cc->iv_size = crypto_ablkcipher_ivsize(tfm); 1105 cc->iv_size = crypto_ablkcipher_ivsize(tfm);
1043 if (cc->iv_size) 1106 if (cc->iv_size)
1044 /* at least a 64 bit sector number should fit in our buffer */ 1107 /* at least a 64 bit sector number should fit in our buffer */
@@ -1085,11 +1148,6 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1085 goto bad_bs; 1148 goto bad_bs;
1086 } 1149 }
1087 1150
1088 if (crypto_ablkcipher_setkey(tfm, cc->key, key_size) < 0) {
1089 ti->error = "Error setting key";
1090 goto bad_device;
1091 }
1092
1093 if (sscanf(argv[2], "%llu", &tmpll) != 1) { 1151 if (sscanf(argv[2], "%llu", &tmpll) != 1) {
1094 ti->error = "Invalid iv_offset sector"; 1152 ti->error = "Invalid iv_offset sector";
1095 goto bad_device; 1153 goto bad_device;
@@ -1102,8 +1160,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1102 } 1160 }
1103 cc->start = tmpll; 1161 cc->start = tmpll;
1104 1162
1105 if (dm_get_device(ti, argv[3], cc->start, ti->len, 1163 if (dm_get_device(ti, argv[3], dm_table_get_mode(ti->table), &cc->dev)) {
1106 dm_table_get_mode(ti->table), &cc->dev)) {
1107 ti->error = "Device lookup failed"; 1164 ti->error = "Device lookup failed";
1108 goto bad_device; 1165 goto bad_device;
1109 } 1166 }
@@ -1278,6 +1335,7 @@ static void crypt_resume(struct dm_target *ti)
1278static int crypt_message(struct dm_target *ti, unsigned argc, char **argv) 1335static int crypt_message(struct dm_target *ti, unsigned argc, char **argv)
1279{ 1336{
1280 struct crypt_config *cc = ti->private; 1337 struct crypt_config *cc = ti->private;
1338 int ret = -EINVAL;
1281 1339
1282 if (argc < 2) 1340 if (argc < 2)
1283 goto error; 1341 goto error;
@@ -1287,10 +1345,22 @@ static int crypt_message(struct dm_target *ti, unsigned argc, char **argv)
1287 DMWARN("not suspended during key manipulation."); 1345 DMWARN("not suspended during key manipulation.");
1288 return -EINVAL; 1346 return -EINVAL;
1289 } 1347 }
1290 if (argc == 3 && !strnicmp(argv[1], MESG_STR("set"))) 1348 if (argc == 3 && !strnicmp(argv[1], MESG_STR("set"))) {
1291 return crypt_set_key(cc, argv[2]); 1349 ret = crypt_set_key(cc, argv[2]);
1292 if (argc == 2 && !strnicmp(argv[1], MESG_STR("wipe"))) 1350 if (ret)
1351 return ret;
1352 if (cc->iv_gen_ops && cc->iv_gen_ops->init)
1353 ret = cc->iv_gen_ops->init(cc);
1354 return ret;
1355 }
1356 if (argc == 2 && !strnicmp(argv[1], MESG_STR("wipe"))) {
1357 if (cc->iv_gen_ops && cc->iv_gen_ops->wipe) {
1358 ret = cc->iv_gen_ops->wipe(cc);
1359 if (ret)
1360 return ret;
1361 }
1293 return crypt_wipe_key(cc); 1362 return crypt_wipe_key(cc);
1363 }
1294 } 1364 }
1295 1365
1296error: 1366error:
diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c
index ebe7381f47c8..852052880d7a 100644
--- a/drivers/md/dm-delay.c
+++ b/drivers/md/dm-delay.c
@@ -156,8 +156,8 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv)
156 goto bad; 156 goto bad;
157 } 157 }
158 158
159 if (dm_get_device(ti, argv[0], dc->start_read, ti->len, 159 if (dm_get_device(ti, argv[0], dm_table_get_mode(ti->table),
160 dm_table_get_mode(ti->table), &dc->dev_read)) { 160 &dc->dev_read)) {
161 ti->error = "Device lookup failed"; 161 ti->error = "Device lookup failed";
162 goto bad; 162 goto bad;
163 } 163 }
@@ -177,8 +177,8 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv)
177 goto bad_dev_read; 177 goto bad_dev_read;
178 } 178 }
179 179
180 if (dm_get_device(ti, argv[3], dc->start_write, ti->len, 180 if (dm_get_device(ti, argv[3], dm_table_get_mode(ti->table),
181 dm_table_get_mode(ti->table), &dc->dev_write)) { 181 &dc->dev_write)) {
182 ti->error = "Write device lookup failed"; 182 ti->error = "Write device lookup failed";
183 goto bad_dev_read; 183 goto bad_dev_read;
184 } 184 }
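
Both lookups above move to the shorter dm_get_device() call that takes only the path, the table's mode and the result pointer; the old start/len pair is gone. A sketch of a constructor written against the new form (the target itself is hypothetical):

static int example_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
        struct dm_dev *dev;

        if (argc < 1) {
                ti->error = "Missing device argument";
                return -EINVAL;
        }

        /* No start/len arguments any more: just path, mode and result. */
        if (dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &dev)) {
                ti->error = "Device lookup failed";
                return -EINVAL;
        }

        ti->private = dev;
        return 0;
}
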
diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c
index 7dbe652efb5a..2b7907b6dd09 100644
--- a/drivers/md/dm-exception-store.c
+++ b/drivers/md/dm-exception-store.c
@@ -172,7 +172,8 @@ int dm_exception_store_set_chunk_size(struct dm_exception_store *store,
172 } 172 }
173 173
174 /* Validate the chunk size against the device block size */ 174 /* Validate the chunk size against the device block size */
175 if (chunk_size % (bdev_logical_block_size(store->cow->bdev) >> 9)) { 175 if (chunk_size %
176 (bdev_logical_block_size(dm_snap_cow(store->snap)->bdev) >> 9)) {
176 *error = "Chunk size is not a multiple of device blocksize"; 177 *error = "Chunk size is not a multiple of device blocksize";
177 return -EINVAL; 178 return -EINVAL;
178 } 179 }
@@ -190,6 +191,7 @@ int dm_exception_store_set_chunk_size(struct dm_exception_store *store,
190} 191}
191 192
192int dm_exception_store_create(struct dm_target *ti, int argc, char **argv, 193int dm_exception_store_create(struct dm_target *ti, int argc, char **argv,
194 struct dm_snapshot *snap,
193 unsigned *args_used, 195 unsigned *args_used,
194 struct dm_exception_store **store) 196 struct dm_exception_store **store)
195{ 197{
@@ -198,7 +200,7 @@ int dm_exception_store_create(struct dm_target *ti, int argc, char **argv,
198 struct dm_exception_store *tmp_store; 200 struct dm_exception_store *tmp_store;
199 char persistent; 201 char persistent;
200 202
201 if (argc < 3) { 203 if (argc < 2) {
202 ti->error = "Insufficient exception store arguments"; 204 ti->error = "Insufficient exception store arguments";
203 return -EINVAL; 205 return -EINVAL;
204 } 206 }
@@ -209,14 +211,15 @@ int dm_exception_store_create(struct dm_target *ti, int argc, char **argv,
209 return -ENOMEM; 211 return -ENOMEM;
210 } 212 }
211 213
212 persistent = toupper(*argv[1]); 214 persistent = toupper(*argv[0]);
213 if (persistent == 'P') 215 if (persistent == 'P')
214 type = get_type("P"); 216 type = get_type("P");
215 else if (persistent == 'N') 217 else if (persistent == 'N')
216 type = get_type("N"); 218 type = get_type("N");
217 else { 219 else {
218 ti->error = "Persistent flag is not P or N"; 220 ti->error = "Persistent flag is not P or N";
219 return -EINVAL; 221 r = -EINVAL;
222 goto bad_type;
220 } 223 }
221 224
222 if (!type) { 225 if (!type) {
@@ -226,32 +229,23 @@ int dm_exception_store_create(struct dm_target *ti, int argc, char **argv,
226 } 229 }
227 230
228 tmp_store->type = type; 231 tmp_store->type = type;
229 tmp_store->ti = ti; 232 tmp_store->snap = snap;
230
231 r = dm_get_device(ti, argv[0], 0, 0,
232 FMODE_READ | FMODE_WRITE, &tmp_store->cow);
233 if (r) {
234 ti->error = "Cannot get COW device";
235 goto bad_cow;
236 }
237 233
238 r = set_chunk_size(tmp_store, argv[2], &ti->error); 234 r = set_chunk_size(tmp_store, argv[1], &ti->error);
239 if (r) 235 if (r)
240 goto bad_ctr; 236 goto bad;
241 237
242 r = type->ctr(tmp_store, 0, NULL); 238 r = type->ctr(tmp_store, 0, NULL);
243 if (r) { 239 if (r) {
244 ti->error = "Exception store type constructor failed"; 240 ti->error = "Exception store type constructor failed";
245 goto bad_ctr; 241 goto bad;
246 } 242 }
247 243
248 *args_used = 3; 244 *args_used = 2;
249 *store = tmp_store; 245 *store = tmp_store;
250 return 0; 246 return 0;
251 247
252bad_ctr: 248bad:
253 dm_put_device(ti, tmp_store->cow);
254bad_cow:
255 put_type(type); 249 put_type(type);
256bad_type: 250bad_type:
257 kfree(tmp_store); 251 kfree(tmp_store);
@@ -262,7 +256,6 @@ EXPORT_SYMBOL(dm_exception_store_create);
262void dm_exception_store_destroy(struct dm_exception_store *store) 256void dm_exception_store_destroy(struct dm_exception_store *store)
263{ 257{
264 store->type->dtr(store); 258 store->type->dtr(store);
265 dm_put_device(store->ti, store->cow);
266 put_type(store->type); 259 put_type(store->type);
267 kfree(store); 260 kfree(store);
268} 261}
diff --git a/drivers/md/dm-exception-store.h b/drivers/md/dm-exception-store.h
index 8a223a48802c..e8dfa06af3ba 100644
--- a/drivers/md/dm-exception-store.h
+++ b/drivers/md/dm-exception-store.h
@@ -26,7 +26,7 @@ typedef sector_t chunk_t;
26 * of chunks that follow contiguously. Remaining bits hold the number of the 26 * of chunks that follow contiguously. Remaining bits hold the number of the
27 * chunk within the device. 27 * chunk within the device.
28 */ 28 */
29struct dm_snap_exception { 29struct dm_exception {
30 struct list_head hash_list; 30 struct list_head hash_list;
31 31
32 chunk_t old_chunk; 32 chunk_t old_chunk;
@@ -64,17 +64,34 @@ struct dm_exception_store_type {
64 * Find somewhere to store the next exception. 64 * Find somewhere to store the next exception.
65 */ 65 */
66 int (*prepare_exception) (struct dm_exception_store *store, 66 int (*prepare_exception) (struct dm_exception_store *store,
67 struct dm_snap_exception *e); 67 struct dm_exception *e);
68 68
69 /* 69 /*
70 * Update the metadata with this exception. 70 * Update the metadata with this exception.
71 */ 71 */
72 void (*commit_exception) (struct dm_exception_store *store, 72 void (*commit_exception) (struct dm_exception_store *store,
73 struct dm_snap_exception *e, 73 struct dm_exception *e,
74 void (*callback) (void *, int success), 74 void (*callback) (void *, int success),
75 void *callback_context); 75 void *callback_context);
76 76
77 /* 77 /*
78 * Returns 0 if the exception store is empty.
79 *
80 * If there are exceptions still to be merged, sets
81 * *last_old_chunk and *last_new_chunk to the most recent
82 * still-to-be-merged chunk and returns the number of
83 * consecutive previous ones.
84 */
85 int (*prepare_merge) (struct dm_exception_store *store,
86 chunk_t *last_old_chunk, chunk_t *last_new_chunk);
87
88 /*
89 * Clear the last n exceptions.
90 * nr_merged must be <= the value returned by prepare_merge.
91 */
92 int (*commit_merge) (struct dm_exception_store *store, int nr_merged);
93
94 /*
78 * The snapshot is invalid, note this in the metadata. 95 * The snapshot is invalid, note this in the metadata.
79 */ 96 */
80 void (*drop_snapshot) (struct dm_exception_store *store); 97 void (*drop_snapshot) (struct dm_exception_store *store);
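
Together the two new callbacks describe a pull-style merge: prepare_merge reports the most recent chunk still to be merged (and how many consecutive ones precede it), and commit_merge retires them once they have been copied back. A rough sketch of a loop driving them, with the copy step deliberately left as a stub:

/* Sketch only: assumes the caller owns 'store' and does the actual copy. */
static int example_merge_loop(struct dm_exception_store *store)
{
        chunk_t last_old_chunk, last_new_chunk;
        int nr, r;

        while ((nr = store->type->prepare_merge(store, &last_old_chunk,
                                                &last_new_chunk)) > 0) {
                /*
                 * ... copy the reported chunks from the snapshot back to
                 * the origin here ...
                 */
                r = store->type->commit_merge(store, nr);
                if (r)
                        return r;
        }

        return nr;      /* 0 once the store is empty */
}
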
@@ -86,19 +103,19 @@ struct dm_exception_store_type {
86 /* 103 /*
87 * Return how full the snapshot is. 104 * Return how full the snapshot is.
88 */ 105 */
89 void (*fraction_full) (struct dm_exception_store *store, 106 void (*usage) (struct dm_exception_store *store,
90 sector_t *numerator, 107 sector_t *total_sectors, sector_t *sectors_allocated,
91 sector_t *denominator); 108 sector_t *metadata_sectors);
92 109
93 /* For internal device-mapper use only. */ 110 /* For internal device-mapper use only. */
94 struct list_head list; 111 struct list_head list;
95}; 112};
96 113
114struct dm_snapshot;
115
97struct dm_exception_store { 116struct dm_exception_store {
98 struct dm_exception_store_type *type; 117 struct dm_exception_store_type *type;
99 struct dm_target *ti; 118 struct dm_snapshot *snap;
100
101 struct dm_dev *cow;
102 119
103 /* Size of data blocks saved - must be a power of 2 */ 120 /* Size of data blocks saved - must be a power of 2 */
104 unsigned chunk_size; 121 unsigned chunk_size;
@@ -109,6 +126,11 @@ struct dm_exception_store {
109}; 126};
110 127
111/* 128/*
129 * Obtain the cow device used by a given snapshot.
130 */
131struct dm_dev *dm_snap_cow(struct dm_snapshot *snap);
132
133/*
112 * Functions to manipulate consecutive chunks 134 * Functions to manipulate consecutive chunks
113 */ 135 */
114# if defined(CONFIG_LBDAF) || (BITS_PER_LONG == 64) 136# if defined(CONFIG_LBDAF) || (BITS_PER_LONG == 64)
@@ -120,18 +142,25 @@ static inline chunk_t dm_chunk_number(chunk_t chunk)
120 return chunk & (chunk_t)((1ULL << DM_CHUNK_NUMBER_BITS) - 1ULL); 142 return chunk & (chunk_t)((1ULL << DM_CHUNK_NUMBER_BITS) - 1ULL);
121} 143}
122 144
123static inline unsigned dm_consecutive_chunk_count(struct dm_snap_exception *e) 145static inline unsigned dm_consecutive_chunk_count(struct dm_exception *e)
124{ 146{
125 return e->new_chunk >> DM_CHUNK_NUMBER_BITS; 147 return e->new_chunk >> DM_CHUNK_NUMBER_BITS;
126} 148}
127 149
128static inline void dm_consecutive_chunk_count_inc(struct dm_snap_exception *e) 150static inline void dm_consecutive_chunk_count_inc(struct dm_exception *e)
129{ 151{
130 e->new_chunk += (1ULL << DM_CHUNK_NUMBER_BITS); 152 e->new_chunk += (1ULL << DM_CHUNK_NUMBER_BITS);
131 153
132 BUG_ON(!dm_consecutive_chunk_count(e)); 154 BUG_ON(!dm_consecutive_chunk_count(e));
133} 155}
134 156
157static inline void dm_consecutive_chunk_count_dec(struct dm_exception *e)
158{
159 BUG_ON(!dm_consecutive_chunk_count(e));
160
161 e->new_chunk -= (1ULL << DM_CHUNK_NUMBER_BITS);
162}
163
135# else 164# else
136# define DM_CHUNK_CONSECUTIVE_BITS 0 165# define DM_CHUNK_CONSECUTIVE_BITS 0
137 166
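
The inc/dec pair above manipulates the packed representation in which the low bits of new_chunk hold the chunk number and the remaining high bits count how many consecutive chunks follow. A userspace demonstration of that encoding (the 56-bit split is only an example value; the real split comes from DM_CHUNK_NUMBER_BITS):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define CHUNK_NUMBER_BITS 56ULL         /* illustrative split only */

typedef uint64_t chunk_t;

static chunk_t chunk_number(chunk_t chunk)
{
        return chunk & ((1ULL << CHUNK_NUMBER_BITS) - 1ULL);
}

static unsigned int consecutive_count(chunk_t chunk)
{
        return chunk >> CHUNK_NUMBER_BITS;
}

int main(void)
{
        chunk_t e = 1000;                       /* exception at chunk 1000 */

        e += 1ULL << CHUNK_NUMBER_BITS;         /* _inc: covers 1000..1001 */
        e += 1ULL << CHUNK_NUMBER_BITS;         /* _inc: covers 1000..1002 */
        e -= 1ULL << CHUNK_NUMBER_BITS;         /* _dec: back to 1000..1001 */

        assert(chunk_number(e) == 1000);
        assert(consecutive_count(e) == 1);
        printf("chunk %llu with %u consecutive chunk(s) following\n",
               (unsigned long long)chunk_number(e), consecutive_count(e));
        return 0;
}
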
@@ -140,12 +169,16 @@ static inline chunk_t dm_chunk_number(chunk_t chunk)
140 return chunk; 169 return chunk;
141} 170}
142 171
143static inline unsigned dm_consecutive_chunk_count(struct dm_snap_exception *e) 172static inline unsigned dm_consecutive_chunk_count(struct dm_exception *e)
144{ 173{
145 return 0; 174 return 0;
146} 175}
147 176
148static inline void dm_consecutive_chunk_count_inc(struct dm_snap_exception *e) 177static inline void dm_consecutive_chunk_count_inc(struct dm_exception *e)
178{
179}
180
181static inline void dm_consecutive_chunk_count_dec(struct dm_exception *e)
149{ 182{
150} 183}
151 184
@@ -162,7 +195,7 @@ static inline sector_t get_dev_size(struct block_device *bdev)
162static inline chunk_t sector_to_chunk(struct dm_exception_store *store, 195static inline chunk_t sector_to_chunk(struct dm_exception_store *store,
163 sector_t sector) 196 sector_t sector)
164{ 197{
165 return (sector & ~store->chunk_mask) >> store->chunk_shift; 198 return sector >> store->chunk_shift;
166} 199}
167 200
168int dm_exception_store_type_register(struct dm_exception_store_type *type); 201int dm_exception_store_type_register(struct dm_exception_store_type *type);
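
The simplified sector_to_chunk() works because the right shift already discards the within-chunk offset, so masking those bits out first changed nothing. A quick check of the equivalence (a chunk of 8 sectors is assumed just for the demo):

#include <assert.h>
#include <stdint.h>

int main(void)
{
        const unsigned int chunk_shift = 3;             /* 8 sectors per chunk */
        const uint64_t chunk_mask = (1ULL << chunk_shift) - 1;
        uint64_t sector;

        for (sector = 0; sector < 1 << 20; sector++)
                assert(((sector & ~chunk_mask) >> chunk_shift) ==
                       (sector >> chunk_shift));
        return 0;
}
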
@@ -173,6 +206,7 @@ int dm_exception_store_set_chunk_size(struct dm_exception_store *store,
173 char **error); 206 char **error);
174 207
175int dm_exception_store_create(struct dm_target *ti, int argc, char **argv, 208int dm_exception_store_create(struct dm_target *ti, int argc, char **argv,
209 struct dm_snapshot *snap,
176 unsigned *args_used, 210 unsigned *args_used,
177 struct dm_exception_store **store); 211 struct dm_exception_store **store);
178void dm_exception_store_destroy(struct dm_exception_store *store); 212void dm_exception_store_destroy(struct dm_exception_store *store);
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index 3a2e6a2f8bdd..10f457ca6af2 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -5,6 +5,8 @@
5 * This file is released under the GPL. 5 * This file is released under the GPL.
6 */ 6 */
7 7
8#include "dm.h"
9
8#include <linux/device-mapper.h> 10#include <linux/device-mapper.h>
9 11
10#include <linux/bio.h> 12#include <linux/bio.h>
@@ -14,12 +16,19 @@
14#include <linux/slab.h> 16#include <linux/slab.h>
15#include <linux/dm-io.h> 17#include <linux/dm-io.h>
16 18
19#define DM_MSG_PREFIX "io"
20
21#define DM_IO_MAX_REGIONS BITS_PER_LONG
22
17struct dm_io_client { 23struct dm_io_client {
18 mempool_t *pool; 24 mempool_t *pool;
19 struct bio_set *bios; 25 struct bio_set *bios;
20}; 26};
21 27
22/* FIXME: can we shrink this ? */ 28/*
29 * Aligning 'struct io' reduces the number of bits required to store
30 * its address. Refer to store_io_and_region_in_bio() below.
31 */
23struct io { 32struct io {
24 unsigned long error_bits; 33 unsigned long error_bits;
25 unsigned long eopnotsupp_bits; 34 unsigned long eopnotsupp_bits;
@@ -28,7 +37,9 @@ struct io {
28 struct dm_io_client *client; 37 struct dm_io_client *client;
29 io_notify_fn callback; 38 io_notify_fn callback;
30 void *context; 39 void *context;
31}; 40} __attribute__((aligned(DM_IO_MAX_REGIONS)));
41
42static struct kmem_cache *_dm_io_cache;
32 43
33/* 44/*
34 * io contexts are only dynamically allocated for asynchronous 45 * io contexts are only dynamically allocated for asynchronous
@@ -53,7 +64,7 @@ struct dm_io_client *dm_io_client_create(unsigned num_pages)
53 if (!client) 64 if (!client)
54 return ERR_PTR(-ENOMEM); 65 return ERR_PTR(-ENOMEM);
55 66
56 client->pool = mempool_create_kmalloc_pool(ios, sizeof(struct io)); 67 client->pool = mempool_create_slab_pool(ios, _dm_io_cache);
57 if (!client->pool) 68 if (!client->pool)
58 goto bad; 69 goto bad;
59 70
@@ -88,18 +99,29 @@ EXPORT_SYMBOL(dm_io_client_destroy);
88 99
89/*----------------------------------------------------------------- 100/*-----------------------------------------------------------------
90 * We need to keep track of which region a bio is doing io for. 101 * We need to keep track of which region a bio is doing io for.
91 * In order to save a memory allocation we store this the last 102 * To avoid a memory allocation to store just 5 or 6 bits, we
92 * bvec which we know is unused (blech). 103 * ensure the 'struct io' pointer is aligned so enough low bits are
93 * XXX This is ugly and can OOPS with some configs... find another way. 104 * always zero and then combine it with the region number directly in
105 * bi_private.
94 *---------------------------------------------------------------*/ 106 *---------------------------------------------------------------*/
95static inline void bio_set_region(struct bio *bio, unsigned region) 107static void store_io_and_region_in_bio(struct bio *bio, struct io *io,
108 unsigned region)
96{ 109{
97 bio->bi_io_vec[bio->bi_max_vecs].bv_len = region; 110 if (unlikely(!IS_ALIGNED((unsigned long)io, DM_IO_MAX_REGIONS))) {
111 DMCRIT("Unaligned struct io pointer %p", io);
112 BUG();
113 }
114
115 bio->bi_private = (void *)((unsigned long)io | region);
98} 116}
99 117
100static inline unsigned bio_get_region(struct bio *bio) 118static void retrieve_io_and_region_from_bio(struct bio *bio, struct io **io,
119 unsigned *region)
101{ 120{
102 return bio->bi_io_vec[bio->bi_max_vecs].bv_len; 121 unsigned long val = (unsigned long)bio->bi_private;
122
123 *io = (void *)(val & -(unsigned long)DM_IO_MAX_REGIONS);
124 *region = val & (DM_IO_MAX_REGIONS - 1);
103} 125}
104 126
105/*----------------------------------------------------------------- 127/*-----------------------------------------------------------------
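
The pair of helpers above is ordinary pointer tagging: because 'struct io' is aligned to DM_IO_MAX_REGIONS bytes, the low bits of its address are guaranteed zero and can carry the region number instead of a separate allocation. The same trick in isolation (userspace C, names invented here):

#include <assert.h>
#include <stdint.h>
#include <stdlib.h>

#define MAX_REGIONS 64  /* power of two, so 6 low address bits are free */

struct io {
        unsigned long error_bits;
} __attribute__((aligned(MAX_REGIONS)));

static void *pack(struct io *io, unsigned int region)
{
        assert((uintptr_t)io % MAX_REGIONS == 0);       /* low bits are zero */
        assert(region < MAX_REGIONS);
        return (void *)((uintptr_t)io | region);
}

static void unpack(void *tagged, struct io **io, unsigned int *region)
{
        uintptr_t val = (uintptr_t)tagged;

        *io = (struct io *)(val & ~(uintptr_t)(MAX_REGIONS - 1));
        *region = val & (MAX_REGIONS - 1);
}

int main(void)
{
        struct io *io = aligned_alloc(MAX_REGIONS, sizeof(struct io));
        struct io *got;
        unsigned int region;

        if (!io)
                return 1;
        unpack(pack(io, 37), &got, &region);
        assert(got == io && region == 37);
        free(io);
        return 0;
}
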
@@ -140,10 +162,8 @@ static void endio(struct bio *bio, int error)
140 /* 162 /*
141 * The bio destructor in bio_put() may use the io object. 163 * The bio destructor in bio_put() may use the io object.
142 */ 164 */
143 io = bio->bi_private; 165 retrieve_io_and_region_from_bio(bio, &io, &region);
144 region = bio_get_region(bio);
145 166
146 bio->bi_max_vecs++;
147 bio_put(bio); 167 bio_put(bio);
148 168
149 dec_count(io, region, error); 169 dec_count(io, region, error);
@@ -243,7 +263,10 @@ static void vm_dp_init(struct dpages *dp, void *data)
243 263
244static void dm_bio_destructor(struct bio *bio) 264static void dm_bio_destructor(struct bio *bio)
245{ 265{
246 struct io *io = bio->bi_private; 266 unsigned region;
267 struct io *io;
268
269 retrieve_io_and_region_from_bio(bio, &io, &region);
247 270
248 bio_free(bio, io->client->bios); 271 bio_free(bio, io->client->bios);
249} 272}
@@ -286,26 +309,23 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where,
286 unsigned num_bvecs; 309 unsigned num_bvecs;
287 sector_t remaining = where->count; 310 sector_t remaining = where->count;
288 311
289 while (remaining) { 312 /*
313 * where->count may be zero if rw holds a write barrier and we
314 * need to send a zero-sized barrier.
315 */
316 do {
290 /* 317 /*
291 * Allocate a suitably sized-bio: we add an extra 318 * Allocate a suitably sized-bio.
292 * bvec for bio_get/set_region() and decrement bi_max_vecs
293 * to hide it from bio_add_page().
294 */ 319 */
295 num_bvecs = dm_sector_div_up(remaining, 320 num_bvecs = dm_sector_div_up(remaining,
296 (PAGE_SIZE >> SECTOR_SHIFT)); 321 (PAGE_SIZE >> SECTOR_SHIFT));
297 num_bvecs = 1 + min_t(int, bio_get_nr_vecs(where->bdev), 322 num_bvecs = min_t(int, bio_get_nr_vecs(where->bdev), num_bvecs);
298 num_bvecs);
299 if (unlikely(num_bvecs > BIO_MAX_PAGES))
300 num_bvecs = BIO_MAX_PAGES;
301 bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, io->client->bios); 323 bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, io->client->bios);
302 bio->bi_sector = where->sector + (where->count - remaining); 324 bio->bi_sector = where->sector + (where->count - remaining);
303 bio->bi_bdev = where->bdev; 325 bio->bi_bdev = where->bdev;
304 bio->bi_end_io = endio; 326 bio->bi_end_io = endio;
305 bio->bi_private = io;
306 bio->bi_destructor = dm_bio_destructor; 327 bio->bi_destructor = dm_bio_destructor;
307 bio->bi_max_vecs--; 328 store_io_and_region_in_bio(bio, io, region);
308 bio_set_region(bio, region);
309 329
310 /* 330 /*
311 * Try and add as many pages as possible. 331 * Try and add as many pages as possible.
@@ -323,7 +343,7 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where,
323 343
324 atomic_inc(&io->count); 344 atomic_inc(&io->count);
325 submit_bio(rw, bio); 345 submit_bio(rw, bio);
326 } 346 } while (remaining);
327} 347}
328 348
329static void dispatch_io(int rw, unsigned int num_regions, 349static void dispatch_io(int rw, unsigned int num_regions,
@@ -333,6 +353,8 @@ static void dispatch_io(int rw, unsigned int num_regions,
333 int i; 353 int i;
334 struct dpages old_pages = *dp; 354 struct dpages old_pages = *dp;
335 355
356 BUG_ON(num_regions > DM_IO_MAX_REGIONS);
357
336 if (sync) 358 if (sync)
337 rw |= (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG); 359 rw |= (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG);
338 360
@@ -342,7 +364,7 @@ static void dispatch_io(int rw, unsigned int num_regions,
342 */ 364 */
343 for (i = 0; i < num_regions; i++) { 365 for (i = 0; i < num_regions; i++) {
344 *dp = old_pages; 366 *dp = old_pages;
345 if (where[i].count) 367 if (where[i].count || (rw & (1 << BIO_RW_BARRIER)))
346 do_region(rw, i, where + i, dp, io); 368 do_region(rw, i, where + i, dp, io);
347 } 369 }
348 370
@@ -357,7 +379,14 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions,
357 struct dm_io_region *where, int rw, struct dpages *dp, 379 struct dm_io_region *where, int rw, struct dpages *dp,
358 unsigned long *error_bits) 380 unsigned long *error_bits)
359{ 381{
360 struct io io; 382 /*
383 * gcc <= 4.3 can't do the alignment for stack variables, so we must
384 * align it on our own.
385 * volatile prevents the optimizer from removing or reusing
386 * "io_" field from the stack frame (allowed in ANSI C).
387 */
388 volatile char io_[sizeof(struct io) + __alignof__(struct io) - 1];
389 struct io *io = (struct io *)PTR_ALIGN(&io_, __alignof__(struct io));
361 390
362 if (num_regions > 1 && (rw & RW_MASK) != WRITE) { 391 if (num_regions > 1 && (rw & RW_MASK) != WRITE) {
363 WARN_ON(1); 392 WARN_ON(1);
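
The aligned-on-the-stack trick above over-allocates a char buffer by (alignment - 1) bytes and then rounds the pointer up inside it, since older compilers could not honour a large alignment for stack variables directly. The same idea outside the kernel (ALIGN_PTR here plays the role of PTR_ALIGN):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define ALIGN_PTR(p, a) \
        ((void *)(((uintptr_t)(p) + ((a) - 1)) & ~((uintptr_t)(a) - 1)))

struct io {
        unsigned long error_bits;
} __attribute__((aligned(64)));

int main(void)
{
        /* Room for one struct io plus the worst-case alignment slack. */
        char io_[sizeof(struct io) + __alignof__(struct io) - 1];
        struct io *io = ALIGN_PTR(io_, __alignof__(struct io));

        assert((uintptr_t)io % __alignof__(struct io) == 0);
        assert((char *)io + sizeof(*io) <= io_ + sizeof(io_));

        io->error_bits = 0;
        printf("struct io aligned at %p inside buffer starting at %p\n",
               (void *)io, (void *)io_);
        return 0;
}
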
@@ -365,33 +394,33 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions,
365 } 394 }
366 395
367retry: 396retry:
368 io.error_bits = 0; 397 io->error_bits = 0;
369 io.eopnotsupp_bits = 0; 398 io->eopnotsupp_bits = 0;
370 atomic_set(&io.count, 1); /* see dispatch_io() */ 399 atomic_set(&io->count, 1); /* see dispatch_io() */
371 io.sleeper = current; 400 io->sleeper = current;
372 io.client = client; 401 io->client = client;
373 402
374 dispatch_io(rw, num_regions, where, dp, &io, 1); 403 dispatch_io(rw, num_regions, where, dp, io, 1);
375 404
376 while (1) { 405 while (1) {
377 set_current_state(TASK_UNINTERRUPTIBLE); 406 set_current_state(TASK_UNINTERRUPTIBLE);
378 407
379 if (!atomic_read(&io.count)) 408 if (!atomic_read(&io->count))
380 break; 409 break;
381 410
382 io_schedule(); 411 io_schedule();
383 } 412 }
384 set_current_state(TASK_RUNNING); 413 set_current_state(TASK_RUNNING);
385 414
386 if (io.eopnotsupp_bits && (rw & (1 << BIO_RW_BARRIER))) { 415 if (io->eopnotsupp_bits && (rw & (1 << BIO_RW_BARRIER))) {
387 rw &= ~(1 << BIO_RW_BARRIER); 416 rw &= ~(1 << BIO_RW_BARRIER);
388 goto retry; 417 goto retry;
389 } 418 }
390 419
391 if (error_bits) 420 if (error_bits)
392 *error_bits = io.error_bits; 421 *error_bits = io->error_bits;
393 422
394 return io.error_bits ? -EIO : 0; 423 return io->error_bits ? -EIO : 0;
395} 424}
396 425
397static int async_io(struct dm_io_client *client, unsigned int num_regions, 426static int async_io(struct dm_io_client *client, unsigned int num_regions,
@@ -472,3 +501,18 @@ int dm_io(struct dm_io_request *io_req, unsigned num_regions,
472 &dp, io_req->notify.fn, io_req->notify.context); 501 &dp, io_req->notify.fn, io_req->notify.context);
473} 502}
474EXPORT_SYMBOL(dm_io); 503EXPORT_SYMBOL(dm_io);
504
505int __init dm_io_init(void)
506{
507 _dm_io_cache = KMEM_CACHE(io, 0);
508 if (!_dm_io_cache)
509 return -ENOMEM;
510
511 return 0;
512}
513
514void dm_io_exit(void)
515{
516 kmem_cache_destroy(_dm_io_cache);
517 _dm_io_cache = NULL;
518}
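
The new init/exit hooks above replace the kmalloc-backed mempool with one drawing from a dedicated slab cache, which is what allows the aligned 'struct io' objects in the first place. A condensed sketch of that pattern with placeholder names (module glue omitted):

#include <linux/init.h>
#include <linux/mempool.h>
#include <linux/slab.h>

struct example_io {
        unsigned long error_bits;
};

static struct kmem_cache *example_io_cache;

static int __init example_io_init(void)
{
        /* One cache shared by every client's mempool. */
        example_io_cache = KMEM_CACHE(example_io, 0);
        if (!example_io_cache)
                return -ENOMEM;
        return 0;
}

static void example_io_exit(void)
{
        kmem_cache_destroy(example_io_cache);
        example_io_cache = NULL;
}

static mempool_t *example_pool_create(unsigned int min_ios)
{
        /* Objects are allocated from (and freed back to) the shared cache. */
        return mempool_create_slab_pool(min_ios, example_io_cache);
}
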
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index a67942931582..d7500e1c26f2 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -56,6 +56,11 @@ static void dm_hash_remove_all(int keep_open_devices);
56 */ 56 */
57static DECLARE_RWSEM(_hash_lock); 57static DECLARE_RWSEM(_hash_lock);
58 58
59/*
60 * Protects use of mdptr to obtain hash cell name and uuid from mapped device.
61 */
62static DEFINE_MUTEX(dm_hash_cells_mutex);
63
59static void init_buckets(struct list_head *buckets) 64static void init_buckets(struct list_head *buckets)
60{ 65{
61 unsigned int i; 66 unsigned int i;
@@ -206,7 +211,9 @@ static int dm_hash_insert(const char *name, const char *uuid, struct mapped_devi
206 list_add(&cell->uuid_list, _uuid_buckets + hash_str(uuid)); 211 list_add(&cell->uuid_list, _uuid_buckets + hash_str(uuid));
207 } 212 }
208 dm_get(md); 213 dm_get(md);
214 mutex_lock(&dm_hash_cells_mutex);
209 dm_set_mdptr(md, cell); 215 dm_set_mdptr(md, cell);
216 mutex_unlock(&dm_hash_cells_mutex);
210 up_write(&_hash_lock); 217 up_write(&_hash_lock);
211 218
212 return 0; 219 return 0;
@@ -224,9 +231,11 @@ static void __hash_remove(struct hash_cell *hc)
224 /* remove from the dev hash */ 231 /* remove from the dev hash */
225 list_del(&hc->uuid_list); 232 list_del(&hc->uuid_list);
226 list_del(&hc->name_list); 233 list_del(&hc->name_list);
234 mutex_lock(&dm_hash_cells_mutex);
227 dm_set_mdptr(hc->md, NULL); 235 dm_set_mdptr(hc->md, NULL);
236 mutex_unlock(&dm_hash_cells_mutex);
228 237
229 table = dm_get_table(hc->md); 238 table = dm_get_live_table(hc->md);
230 if (table) { 239 if (table) {
231 dm_table_event(table); 240 dm_table_event(table);
232 dm_table_put(table); 241 dm_table_put(table);
@@ -276,7 +285,8 @@ retry:
276 up_write(&_hash_lock); 285 up_write(&_hash_lock);
277} 286}
278 287
279static int dm_hash_rename(uint32_t cookie, const char *old, const char *new) 288static int dm_hash_rename(uint32_t cookie, uint32_t *flags, const char *old,
289 const char *new)
280{ 290{
281 char *new_name, *old_name; 291 char *new_name, *old_name;
282 struct hash_cell *hc; 292 struct hash_cell *hc;
@@ -321,19 +331,22 @@ static int dm_hash_rename(uint32_t cookie, const char *old, const char *new)
321 */ 331 */
322 list_del(&hc->name_list); 332 list_del(&hc->name_list);
323 old_name = hc->name; 333 old_name = hc->name;
334 mutex_lock(&dm_hash_cells_mutex);
324 hc->name = new_name; 335 hc->name = new_name;
336 mutex_unlock(&dm_hash_cells_mutex);
325 list_add(&hc->name_list, _name_buckets + hash_str(new_name)); 337 list_add(&hc->name_list, _name_buckets + hash_str(new_name));
326 338
327 /* 339 /*
328 * Wake up any dm event waiters. 340 * Wake up any dm event waiters.
329 */ 341 */
330 table = dm_get_table(hc->md); 342 table = dm_get_live_table(hc->md);
331 if (table) { 343 if (table) {
332 dm_table_event(table); 344 dm_table_event(table);
333 dm_table_put(table); 345 dm_table_put(table);
334 } 346 }
335 347
336 dm_kobject_uevent(hc->md, KOBJ_CHANGE, cookie); 348 if (!dm_kobject_uevent(hc->md, KOBJ_CHANGE, cookie))
349 *flags |= DM_UEVENT_GENERATED_FLAG;
337 350
338 dm_put(hc->md); 351 dm_put(hc->md);
339 up_write(&_hash_lock); 352 up_write(&_hash_lock);
@@ -512,8 +525,6 @@ static int list_versions(struct dm_ioctl *param, size_t param_size)
512 return 0; 525 return 0;
513} 526}
514 527
515
516
517static int check_name(const char *name) 528static int check_name(const char *name)
518{ 529{
519 if (strchr(name, '/')) { 530 if (strchr(name, '/')) {
@@ -525,6 +536,40 @@ static int check_name(const char *name)
525} 536}
526 537
527/* 538/*
539 * On successful return, the caller must not attempt to acquire
540 * _hash_lock without first calling dm_table_put, because dm_table_destroy
541 * waits for this dm_table_put and could be called under this lock.
542 */
543static struct dm_table *dm_get_inactive_table(struct mapped_device *md)
544{
545 struct hash_cell *hc;
546 struct dm_table *table = NULL;
547
548 down_read(&_hash_lock);
549 hc = dm_get_mdptr(md);
550 if (!hc || hc->md != md) {
551 DMWARN("device has been removed from the dev hash table.");
552 goto out;
553 }
554
555 table = hc->new_map;
556 if (table)
557 dm_table_get(table);
558
559out:
560 up_read(&_hash_lock);
561
562 return table;
563}
564
565static struct dm_table *dm_get_live_or_inactive_table(struct mapped_device *md,
566 struct dm_ioctl *param)
567{
568 return (param->flags & DM_QUERY_INACTIVE_TABLE_FLAG) ?
569 dm_get_inactive_table(md) : dm_get_live_table(md);
570}
571
572/*
528 * Fills in a dm_ioctl structure, ready for sending back to 573 * Fills in a dm_ioctl structure, ready for sending back to
529 * userland. 574 * userland.
530 */ 575 */
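
With the two helpers above, a status-style ioctl can report on either the live table or the not-yet-activated one, chosen by DM_QUERY_INACTIVE_TABLE_FLAG, while keeping the usual take/put reference discipline. A sketch of such a caller (the surrounding ioctl is hypothetical):

static int example_table_query(struct mapped_device *md, struct dm_ioctl *param)
{
        struct dm_table *table;

        table = dm_get_live_or_inactive_table(md, param);
        if (table) {
                /* ... walk the targets and fill in 'param' ... */
                dm_table_put(table);    /* drop the reference taken above */
        }

        return 0;
}
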
@@ -536,7 +581,7 @@ static int __dev_status(struct mapped_device *md, struct dm_ioctl *param)
536 param->flags &= ~(DM_SUSPEND_FLAG | DM_READONLY_FLAG | 581 param->flags &= ~(DM_SUSPEND_FLAG | DM_READONLY_FLAG |
537 DM_ACTIVE_PRESENT_FLAG); 582 DM_ACTIVE_PRESENT_FLAG);
538 583
539 if (dm_suspended(md)) 584 if (dm_suspended_md(md))
540 param->flags |= DM_SUSPEND_FLAG; 585 param->flags |= DM_SUSPEND_FLAG;
541 586
542 param->dev = huge_encode_dev(disk_devt(disk)); 587 param->dev = huge_encode_dev(disk_devt(disk));
@@ -548,18 +593,30 @@ static int __dev_status(struct mapped_device *md, struct dm_ioctl *param)
548 */ 593 */
549 param->open_count = dm_open_count(md); 594 param->open_count = dm_open_count(md);
550 595
551 if (get_disk_ro(disk))
552 param->flags |= DM_READONLY_FLAG;
553
554 param->event_nr = dm_get_event_nr(md); 596 param->event_nr = dm_get_event_nr(md);
597 param->target_count = 0;
555 598
556 table = dm_get_table(md); 599 table = dm_get_live_table(md);
557 if (table) { 600 if (table) {
558 param->flags |= DM_ACTIVE_PRESENT_FLAG; 601 if (!(param->flags & DM_QUERY_INACTIVE_TABLE_FLAG)) {
559 param->target_count = dm_table_get_num_targets(table); 602 if (get_disk_ro(disk))
603 param->flags |= DM_READONLY_FLAG;
604 param->target_count = dm_table_get_num_targets(table);
605 }
560 dm_table_put(table); 606 dm_table_put(table);
561 } else 607
562 param->target_count = 0; 608 param->flags |= DM_ACTIVE_PRESENT_FLAG;
609 }
610
611 if (param->flags & DM_QUERY_INACTIVE_TABLE_FLAG) {
612 table = dm_get_inactive_table(md);
613 if (table) {
614 if (!(dm_table_get_mode(table) & FMODE_WRITE))
615 param->flags |= DM_READONLY_FLAG;
616 param->target_count = dm_table_get_num_targets(table);
617 dm_table_put(table);
618 }
619 }
563 620
564 return 0; 621 return 0;
565} 622}
@@ -634,9 +691,9 @@ static struct mapped_device *find_device(struct dm_ioctl *param)
634 * Sneakily write in both the name and the uuid 691 * Sneakily write in both the name and the uuid
635 * while we have the cell. 692 * while we have the cell.
636 */ 693 */
637 strncpy(param->name, hc->name, sizeof(param->name)); 694 strlcpy(param->name, hc->name, sizeof(param->name));
638 if (hc->uuid) 695 if (hc->uuid)
639 strncpy(param->uuid, hc->uuid, sizeof(param->uuid)-1); 696 strlcpy(param->uuid, hc->uuid, sizeof(param->uuid));
640 else 697 else
641 param->uuid[0] = '\0'; 698 param->uuid[0] = '\0';
642 699
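
The switch above from strncpy() to strlcpy() guarantees the copied name and uuid are NUL-terminated even when they are truncated. The pitfall in miniature (strlcpy is reimplemented locally because not every libc ships it):

#include <stdio.h>
#include <string.h>

/* strlcpy-style copy: always NUL-terminates, returns strlen(src). */
static size_t copy_string(char *dst, const char *src, size_t size)
{
        size_t len = strlen(src);

        if (size) {
                size_t n = len < size - 1 ? len : size - 1;

                memcpy(dst, src, n);
                dst[n] = '\0';
        }
        return len;
}

int main(void)
{
        char a[8], b[8];
        const char *name = "a-rather-long-device-name";

        strncpy(a, name, sizeof(a));     /* fills all 8 bytes, no terminator */
        copy_string(b, name, sizeof(b)); /* truncated but terminated */

        printf("safe copy: \"%s\"\n", b);
        /* printing 'a' with %s here would run past the end of the array */
        return 0;
}
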
@@ -681,10 +738,10 @@ static int dev_remove(struct dm_ioctl *param, size_t param_size)
681 __hash_remove(hc); 738 __hash_remove(hc);
682 up_write(&_hash_lock); 739 up_write(&_hash_lock);
683 740
684 dm_kobject_uevent(md, KOBJ_REMOVE, param->event_nr); 741 if (!dm_kobject_uevent(md, KOBJ_REMOVE, param->event_nr))
742 param->flags |= DM_UEVENT_GENERATED_FLAG;
685 743
686 dm_put(md); 744 dm_put(md);
687 param->data_size = 0;
688 return 0; 745 return 0;
689} 746}
690 747
@@ -718,7 +775,9 @@ static int dev_rename(struct dm_ioctl *param, size_t param_size)
718 return r; 775 return r;
719 776
720 param->data_size = 0; 777 param->data_size = 0;
721 return dm_hash_rename(param->event_nr, param->name, new_name); 778
779 return dm_hash_rename(param->event_nr, &param->flags, param->name,
780 new_name);
722} 781}
723 782
724static int dev_set_geometry(struct dm_ioctl *param, size_t param_size) 783static int dev_set_geometry(struct dm_ioctl *param, size_t param_size)
@@ -784,7 +843,7 @@ static int do_suspend(struct dm_ioctl *param)
784 if (param->flags & DM_NOFLUSH_FLAG) 843 if (param->flags & DM_NOFLUSH_FLAG)
785 suspend_flags |= DM_SUSPEND_NOFLUSH_FLAG; 844 suspend_flags |= DM_SUSPEND_NOFLUSH_FLAG;
786 845
787 if (!dm_suspended(md)) 846 if (!dm_suspended_md(md))
788 r = dm_suspend(md, suspend_flags); 847 r = dm_suspend(md, suspend_flags);
789 848
790 if (!r) 849 if (!r)
@@ -800,7 +859,7 @@ static int do_resume(struct dm_ioctl *param)
800 unsigned suspend_flags = DM_SUSPEND_LOCKFS_FLAG; 859 unsigned suspend_flags = DM_SUSPEND_LOCKFS_FLAG;
801 struct hash_cell *hc; 860 struct hash_cell *hc;
802 struct mapped_device *md; 861 struct mapped_device *md;
803 struct dm_table *new_map; 862 struct dm_table *new_map, *old_map = NULL;
804 863
805 down_write(&_hash_lock); 864 down_write(&_hash_lock);
806 865
@@ -826,14 +885,14 @@ static int do_resume(struct dm_ioctl *param)
826 suspend_flags &= ~DM_SUSPEND_LOCKFS_FLAG; 885 suspend_flags &= ~DM_SUSPEND_LOCKFS_FLAG;
827 if (param->flags & DM_NOFLUSH_FLAG) 886 if (param->flags & DM_NOFLUSH_FLAG)
828 suspend_flags |= DM_SUSPEND_NOFLUSH_FLAG; 887 suspend_flags |= DM_SUSPEND_NOFLUSH_FLAG;
829 if (!dm_suspended(md)) 888 if (!dm_suspended_md(md))
830 dm_suspend(md, suspend_flags); 889 dm_suspend(md, suspend_flags);
831 890
832 r = dm_swap_table(md, new_map); 891 old_map = dm_swap_table(md, new_map);
833 if (r) { 892 if (IS_ERR(old_map)) {
834 dm_table_destroy(new_map); 893 dm_table_destroy(new_map);
835 dm_put(md); 894 dm_put(md);
836 return r; 895 return PTR_ERR(old_map);
837 } 896 }
838 897
839 if (dm_table_get_mode(new_map) & FMODE_WRITE) 898 if (dm_table_get_mode(new_map) & FMODE_WRITE)
@@ -842,14 +901,17 @@ static int do_resume(struct dm_ioctl *param)
842 set_disk_ro(dm_disk(md), 1); 901 set_disk_ro(dm_disk(md), 1);
843 } 902 }
844 903
845 if (dm_suspended(md)) 904 if (dm_suspended_md(md)) {
846 r = dm_resume(md); 905 r = dm_resume(md);
906 if (!r && !dm_kobject_uevent(md, KOBJ_CHANGE, param->event_nr))
907 param->flags |= DM_UEVENT_GENERATED_FLAG;
908 }
847 909
910 if (old_map)
911 dm_table_destroy(old_map);
848 912
849 if (!r) { 913 if (!r)
850 dm_kobject_uevent(md, KOBJ_CHANGE, param->event_nr);
851 r = __dev_status(md, param); 914 r = __dev_status(md, param);
852 }
853 915
854 dm_put(md); 916 dm_put(md);
855 return r; 917 return r;
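
dm_swap_table() now returns the table being replaced, or an errno encoded in the pointer, so do_resume() separates the two cases with IS_ERR()/PTR_ERR() and only destroys a real old map. A trimmed sketch of that return convention (the swap body is a stub):

#include <linux/err.h>

/*
 * On success return the object that was replaced (possibly NULL);
 * on failure return ERR_PTR(-errno).
 */
static struct dm_table *example_swap(struct mapped_device *md,
                                     struct dm_table *new_map)
{
        struct dm_table *old_map = NULL;

        if (!new_map)
                return ERR_PTR(-EINVAL);

        /* ... detach the current map into old_map, attach new_map ... */

        return old_map;
}

static int example_resume(struct mapped_device *md, struct dm_table *new_map)
{
        struct dm_table *old_map = example_swap(md, new_map);

        if (IS_ERR(old_map))
                return PTR_ERR(old_map);        /* errno rides in the pointer */

        if (old_map)
                dm_table_destroy(old_map);      /* retire the replaced table */

        return 0;
}
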
@@ -982,7 +1044,7 @@ static int dev_wait(struct dm_ioctl *param, size_t param_size)
982 if (r) 1044 if (r)
983 goto out; 1045 goto out;
984 1046
985 table = dm_get_table(md); 1047 table = dm_get_live_or_inactive_table(md, param);
986 if (table) { 1048 if (table) {
987 retrieve_status(table, param, param_size); 1049 retrieve_status(table, param, param_size);
988 dm_table_put(table); 1050 dm_table_put(table);
@@ -1215,7 +1277,7 @@ static int table_deps(struct dm_ioctl *param, size_t param_size)
1215 if (r) 1277 if (r)
1216 goto out; 1278 goto out;
1217 1279
1218 table = dm_get_table(md); 1280 table = dm_get_live_or_inactive_table(md, param);
1219 if (table) { 1281 if (table) {
1220 retrieve_deps(table, param, param_size); 1282 retrieve_deps(table, param, param_size);
1221 dm_table_put(table); 1283 dm_table_put(table);
@@ -1244,13 +1306,13 @@ static int table_status(struct dm_ioctl *param, size_t param_size)
1244 if (r) 1306 if (r)
1245 goto out; 1307 goto out;
1246 1308
1247 table = dm_get_table(md); 1309 table = dm_get_live_or_inactive_table(md, param);
1248 if (table) { 1310 if (table) {
1249 retrieve_status(table, param, param_size); 1311 retrieve_status(table, param, param_size);
1250 dm_table_put(table); 1312 dm_table_put(table);
1251 } 1313 }
1252 1314
1253 out: 1315out:
1254 dm_put(md); 1316 dm_put(md);
1255 return r; 1317 return r;
1256} 1318}
@@ -1288,10 +1350,15 @@ static int target_message(struct dm_ioctl *param, size_t param_size)
1288 goto out; 1350 goto out;
1289 } 1351 }
1290 1352
1291 table = dm_get_table(md); 1353 table = dm_get_live_table(md);
1292 if (!table) 1354 if (!table)
1293 goto out_argv; 1355 goto out_argv;
1294 1356
1357 if (dm_deleting_md(md)) {
1358 r = -ENXIO;
1359 goto out_table;
1360 }
1361
1295 ti = dm_table_find_target(table, tmsg->sector); 1362 ti = dm_table_find_target(table, tmsg->sector);
1296 if (!dm_target_is_valid(ti)) { 1363 if (!dm_target_is_valid(ti)) {
1297 DMWARN("Target message sector outside device."); 1364 DMWARN("Target message sector outside device.");
@@ -1303,6 +1370,7 @@ static int target_message(struct dm_ioctl *param, size_t param_size)
1303 r = -EINVAL; 1370 r = -EINVAL;
1304 } 1371 }
1305 1372
1373 out_table:
1306 dm_table_put(table); 1374 dm_table_put(table);
1307 out_argv: 1375 out_argv:
1308 kfree(argv); 1376 kfree(argv);
@@ -1413,6 +1481,7 @@ static int validate_params(uint cmd, struct dm_ioctl *param)
1413{ 1481{
1414 /* Always clear this flag */ 1482 /* Always clear this flag */
1415 param->flags &= ~DM_BUFFER_FULL_FLAG; 1483 param->flags &= ~DM_BUFFER_FULL_FLAG;
1484 param->flags &= ~DM_UEVENT_GENERATED_FLAG;
1416 1485
1417 /* Ignores parameters */ 1486 /* Ignores parameters */
1418 if (cmd == DM_REMOVE_ALL_CMD || 1487 if (cmd == DM_REMOVE_ALL_CMD ||
@@ -1582,8 +1651,7 @@ int dm_copy_name_and_uuid(struct mapped_device *md, char *name, char *uuid)
1582 if (!md) 1651 if (!md)
1583 return -ENXIO; 1652 return -ENXIO;
1584 1653
1585 dm_get(md); 1654 mutex_lock(&dm_hash_cells_mutex);
1586 down_read(&_hash_lock);
1587 hc = dm_get_mdptr(md); 1655 hc = dm_get_mdptr(md);
1588 if (!hc || hc->md != md) { 1656 if (!hc || hc->md != md) {
1589 r = -ENXIO; 1657 r = -ENXIO;
@@ -1596,8 +1664,7 @@ int dm_copy_name_and_uuid(struct mapped_device *md, char *name, char *uuid)
1596 strcpy(uuid, hc->uuid ? : ""); 1664 strcpy(uuid, hc->uuid ? : "");
1597 1665
1598out: 1666out:
1599 up_read(&_hash_lock); 1667 mutex_unlock(&dm_hash_cells_mutex);
1600 dm_put(md);
1601 1668
1602 return r; 1669 return r;
1603} 1670}
diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c
index 3e3fc06cb861..addf83475040 100644
--- a/drivers/md/dm-kcopyd.c
+++ b/drivers/md/dm-kcopyd.c
@@ -450,7 +450,10 @@ static void dispatch_job(struct kcopyd_job *job)
450{ 450{
451 struct dm_kcopyd_client *kc = job->kc; 451 struct dm_kcopyd_client *kc = job->kc;
452 atomic_inc(&kc->nr_jobs); 452 atomic_inc(&kc->nr_jobs);
453 push(&kc->pages_jobs, job); 453 if (unlikely(!job->source.count))
454 push(&kc->complete_jobs, job);
455 else
456 push(&kc->pages_jobs, job);
454 wake(kc); 457 wake(kc);
455} 458}
456 459
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index 82f7d6e6b1ea..9200dbf2391a 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -47,8 +47,7 @@ static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv)
47 } 47 }
48 lc->start = tmp; 48 lc->start = tmp;
49 49
50 if (dm_get_device(ti, argv[0], lc->start, ti->len, 50 if (dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &lc->dev)) {
51 dm_table_get_mode(ti->table), &lc->dev)) {
52 ti->error = "dm-linear: Device lookup failed"; 51 ti->error = "dm-linear: Device lookup failed";
53 goto bad; 52 goto bad;
54 } 53 }
diff --git a/drivers/md/dm-log-userspace-base.c b/drivers/md/dm-log-userspace-base.c
index 7ac2c1450d10..1ed0094f064b 100644
--- a/drivers/md/dm-log-userspace-base.c
+++ b/drivers/md/dm-log-userspace-base.c
@@ -5,6 +5,7 @@
5 */ 5 */
6 6
7#include <linux/bio.h> 7#include <linux/bio.h>
8#include <linux/slab.h>
8#include <linux/dm-dirty-log.h> 9#include <linux/dm-dirty-log.h>
9#include <linux/device-mapper.h> 10#include <linux/device-mapper.h>
10#include <linux/dm-log-userspace.h> 11#include <linux/dm-log-userspace.h>
diff --git a/drivers/md/dm-log-userspace-transfer.c b/drivers/md/dm-log-userspace-transfer.c
index 54abf9e303b7..075cbcf8a9f5 100644
--- a/drivers/md/dm-log-userspace-transfer.c
+++ b/drivers/md/dm-log-userspace-transfer.c
@@ -6,6 +6,7 @@
6 6
7#include <linux/kernel.h> 7#include <linux/kernel.h>
8#include <linux/module.h> 8#include <linux/module.h>
9#include <linux/slab.h>
9#include <net/sock.h> 10#include <net/sock.h>
10#include <linux/workqueue.h> 11#include <linux/workqueue.h>
11#include <linux/connector.h> 12#include <linux/connector.h>
@@ -172,11 +173,15 @@ int dm_consult_userspace(const char *uuid, uint64_t luid, int request_type,
172{ 173{
173 int r = 0; 174 int r = 0;
174 size_t dummy = 0; 175 size_t dummy = 0;
175 int overhead_size = 176 int overhead_size = sizeof(struct dm_ulog_request) + sizeof(struct cn_msg);
176 sizeof(struct dm_ulog_request *) + sizeof(struct cn_msg);
177 struct dm_ulog_request *tfr = prealloced_ulog_tfr; 177 struct dm_ulog_request *tfr = prealloced_ulog_tfr;
178 struct receiving_pkg pkg; 178 struct receiving_pkg pkg;
179 179
180 /*
181 * Given the space needed to hold the 'struct cn_msg' and
182 * 'struct dm_ulog_request' - do we have enough payload
183 * space remaining?
184 */
180 if (data_size > (DM_ULOG_PREALLOCED_SIZE - overhead_size)) { 185 if (data_size > (DM_ULOG_PREALLOCED_SIZE - overhead_size)) {
181 DMINFO("Size of tfr exceeds preallocated size"); 186 DMINFO("Size of tfr exceeds preallocated size");
182 return -EINVAL; 187 return -EINVAL;
@@ -191,7 +196,7 @@ resend:
191 */ 196 */
192 mutex_lock(&dm_ulog_lock); 197 mutex_lock(&dm_ulog_lock);
193 198
194 memset(tfr, 0, DM_ULOG_PREALLOCED_SIZE - overhead_size); 199 memset(tfr, 0, DM_ULOG_PREALLOCED_SIZE - sizeof(struct cn_msg));
195 memcpy(tfr->uuid, uuid, DM_UUID_LEN); 200 memcpy(tfr->uuid, uuid, DM_UUID_LEN);
196 tfr->luid = luid; 201 tfr->luid = luid;
197 tfr->seq = dm_ulog_seq++; 202 tfr->seq = dm_ulog_seq++;
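
The fix above matters more than it looks: sizeof(struct dm_ulog_request *) is the size of a pointer, not of the request header, so the old overhead figure badly understated how much of the preallocated buffer the headers consume. The difference, shown on a stand-in struct:

#include <stdio.h>

struct request_header {
        char uuid[129];
        unsigned long long luid;
        unsigned long long seq;
        unsigned int request_type;
        unsigned int data_size;
};

int main(void)
{
        printf("sizeof(struct request_header *) = %zu\n",
               sizeof(struct request_header *));        /* 4 or 8 */
        printf("sizeof(struct request_header)   = %zu\n",
               sizeof(struct request_header));          /* ~150 bytes */
        return 0;
}
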
diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c
index 9443896ede07..5a08be0222db 100644
--- a/drivers/md/dm-log.c
+++ b/drivers/md/dm-log.c
@@ -145,8 +145,9 @@ int dm_dirty_log_type_unregister(struct dm_dirty_log_type *type)
145EXPORT_SYMBOL(dm_dirty_log_type_unregister); 145EXPORT_SYMBOL(dm_dirty_log_type_unregister);
146 146
147struct dm_dirty_log *dm_dirty_log_create(const char *type_name, 147struct dm_dirty_log *dm_dirty_log_create(const char *type_name,
148 struct dm_target *ti, 148 struct dm_target *ti,
149 unsigned int argc, char **argv) 149 int (*flush_callback_fn)(struct dm_target *ti),
150 unsigned int argc, char **argv)
150{ 151{
151 struct dm_dirty_log_type *type; 152 struct dm_dirty_log_type *type;
152 struct dm_dirty_log *log; 153 struct dm_dirty_log *log;
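
The extra parameter lets the owner of the log hand in a flush callback at creation time; the log can then ask the target to flush outstanding data before it commits newly cleaned regions. A sketch of a caller wiring that up (callback body left empty, "disk" being the on-disk log type):

/* Invoked by the dirty log before clean state is written out. */
static int example_flush_callback(struct dm_target *ti)
{
        /* ... flush this target's outstanding writes, 0 on success ... */
        return 0;
}

static struct dm_dirty_log *example_create_log(struct dm_target *ti,
                                               unsigned int argc, char **argv)
{
        return dm_dirty_log_create("disk", ti, example_flush_callback,
                                   argc, argv);
}
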
@@ -161,6 +162,7 @@ struct dm_dirty_log *dm_dirty_log_create(const char *type_name,
161 return NULL; 162 return NULL;
162 } 163 }
163 164
165 log->flush_callback_fn = flush_callback_fn;
164 log->type = type; 166 log->type = type;
165 if (type->ctr(log, ti, argc, argv)) { 167 if (type->ctr(log, ti, argc, argv)) {
166 kfree(log); 168 kfree(log);
@@ -208,7 +210,9 @@ struct log_header {
208 210
209struct log_c { 211struct log_c {
210 struct dm_target *ti; 212 struct dm_target *ti;
211 int touched; 213 int touched_dirtied;
214 int touched_cleaned;
215 int flush_failed;
212 uint32_t region_size; 216 uint32_t region_size;
213 unsigned int region_count; 217 unsigned int region_count;
214 region_t sync_count; 218 region_t sync_count;
@@ -233,6 +237,7 @@ struct log_c {
233 * Disk log fields 237 * Disk log fields
234 */ 238 */
235 int log_dev_failed; 239 int log_dev_failed;
240 int log_dev_flush_failed;
236 struct dm_dev *log_dev; 241 struct dm_dev *log_dev;
237 struct log_header header; 242 struct log_header header;
238 243
@@ -253,14 +258,14 @@ static inline void log_set_bit(struct log_c *l,
253 uint32_t *bs, unsigned bit) 258 uint32_t *bs, unsigned bit)
254{ 259{
255 ext2_set_bit(bit, (unsigned long *) bs); 260 ext2_set_bit(bit, (unsigned long *) bs);
256 l->touched = 1; 261 l->touched_cleaned = 1;
257} 262}
258 263
259static inline void log_clear_bit(struct log_c *l, 264static inline void log_clear_bit(struct log_c *l,
260 uint32_t *bs, unsigned bit) 265 uint32_t *bs, unsigned bit)
261{ 266{
262 ext2_clear_bit(bit, (unsigned long *) bs); 267 ext2_clear_bit(bit, (unsigned long *) bs);
263 l->touched = 1; 268 l->touched_dirtied = 1;
264} 269}
265 270
266/*---------------------------------------------------------------- 271/*----------------------------------------------------------------
@@ -287,6 +292,19 @@ static int rw_header(struct log_c *lc, int rw)
287 return dm_io(&lc->io_req, 1, &lc->header_location, NULL); 292 return dm_io(&lc->io_req, 1, &lc->header_location, NULL);
288} 293}
289 294
295static int flush_header(struct log_c *lc)
296{
297 struct dm_io_region null_location = {
298 .bdev = lc->header_location.bdev,
299 .sector = 0,
300 .count = 0,
301 };
302
303 lc->io_req.bi_rw = WRITE_BARRIER;
304
305 return dm_io(&lc->io_req, 1, &null_location, NULL);
306}
307
290static int read_header(struct log_c *log) 308static int read_header(struct log_c *log)
291{ 309{
292 int r; 310 int r;
@@ -378,7 +396,9 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti,
378 } 396 }
379 397
380 lc->ti = ti; 398 lc->ti = ti;
381 lc->touched = 0; 399 lc->touched_dirtied = 0;
400 lc->touched_cleaned = 0;
401 lc->flush_failed = 0;
382 lc->region_size = region_size; 402 lc->region_size = region_size;
383 lc->region_count = region_count; 403 lc->region_count = region_count;
384 lc->sync = sync; 404 lc->sync = sync;
@@ -406,6 +426,7 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti,
406 } else { 426 } else {
407 lc->log_dev = dev; 427 lc->log_dev = dev;
408 lc->log_dev_failed = 0; 428 lc->log_dev_failed = 0;
429 lc->log_dev_flush_failed = 0;
409 lc->header_location.bdev = lc->log_dev->bdev; 430 lc->header_location.bdev = lc->log_dev->bdev;
410 lc->header_location.sector = 0; 431 lc->header_location.sector = 0;
411 432
@@ -522,8 +543,7 @@ static int disk_ctr(struct dm_dirty_log *log, struct dm_target *ti,
522 return -EINVAL; 543 return -EINVAL;
523 } 544 }
524 545
525 r = dm_get_device(ti, argv[0], 0, 0 /* FIXME */, 546 r = dm_get_device(ti, argv[0], FMODE_READ | FMODE_WRITE, &dev);
526 FMODE_READ | FMODE_WRITE, &dev);
527 if (r) 547 if (r)
528 return r; 548 return r;
529 549
@@ -614,6 +634,11 @@ static int disk_resume(struct dm_dirty_log *log)
614 634
615 /* write the new header */ 635 /* write the new header */
616 r = rw_header(lc, WRITE); 636 r = rw_header(lc, WRITE);
637 if (!r) {
638 r = flush_header(lc);
639 if (r)
640 lc->log_dev_flush_failed = 1;
641 }
617 if (r) { 642 if (r) {
618 DMWARN("%s: Failed to write header on dirty region log device", 643 DMWARN("%s: Failed to write header on dirty region log device",
619 lc->log_dev->name); 644 lc->log_dev->name);
@@ -656,18 +681,40 @@ static int core_flush(struct dm_dirty_log *log)
656 681
657static int disk_flush(struct dm_dirty_log *log) 682static int disk_flush(struct dm_dirty_log *log)
658{ 683{
659 int r; 684 int r, i;
660 struct log_c *lc = (struct log_c *) log->context; 685 struct log_c *lc = log->context;
661 686
662 /* only write if the log has changed */ 687 /* only write if the log has changed */
663 if (!lc->touched) 688 if (!lc->touched_cleaned && !lc->touched_dirtied)
664 return 0; 689 return 0;
665 690
691 if (lc->touched_cleaned && log->flush_callback_fn &&
692 log->flush_callback_fn(lc->ti)) {
693 /*
694 * At this point it is impossible to determine which
695 * regions are clean and which are dirty (without
696 * re-reading the log off disk). So mark all of them
697 * dirty.
698 */
699 lc->flush_failed = 1;
700 for (i = 0; i < lc->region_count; i++)
701 log_clear_bit(lc, lc->clean_bits, i);
702 }
703
666 r = rw_header(lc, WRITE); 704 r = rw_header(lc, WRITE);
667 if (r) 705 if (r)
668 fail_log_device(lc); 706 fail_log_device(lc);
669 else 707 else {
670 lc->touched = 0; 708 if (lc->touched_dirtied) {
709 r = flush_header(lc);
710 if (r) {
711 lc->log_dev_flush_failed = 1;
712 fail_log_device(lc);
713 } else
714 lc->touched_dirtied = 0;
715 }
716 lc->touched_cleaned = 0;
717 }
671 718
672 return r; 719 return r;
673} 720}
@@ -681,7 +728,8 @@ static void core_mark_region(struct dm_dirty_log *log, region_t region)
681static void core_clear_region(struct dm_dirty_log *log, region_t region) 728static void core_clear_region(struct dm_dirty_log *log, region_t region)
682{ 729{
683 struct log_c *lc = (struct log_c *) log->context; 730 struct log_c *lc = (struct log_c *) log->context;
684 log_set_bit(lc, lc->clean_bits, region); 731 if (likely(!lc->flush_failed))
732 log_set_bit(lc, lc->clean_bits, region);
685} 733}
686 734
687static int core_get_resync_work(struct dm_dirty_log *log, region_t *region) 735static int core_get_resync_work(struct dm_dirty_log *log, region_t *region)
@@ -762,7 +810,9 @@ static int disk_status(struct dm_dirty_log *log, status_type_t status,
762 switch(status) { 810 switch(status) {
763 case STATUSTYPE_INFO: 811 case STATUSTYPE_INFO:
764 DMEMIT("3 %s %s %c", log->type->name, lc->log_dev->name, 812 DMEMIT("3 %s %s %c", log->type->name, lc->log_dev->name,
765 lc->log_dev_failed ? 'D' : 'A'); 813 lc->log_dev_flush_failed ? 'F' :
814 lc->log_dev_failed ? 'D' :
815 'A');
766 break; 816 break;
767 817
768 case STATUSTYPE_TABLE: 818 case STATUSTYPE_TABLE:
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 32d0b878eccc..826bce7343b3 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -69,6 +69,7 @@ struct multipath {
69 struct list_head priority_groups; 69 struct list_head priority_groups;
70 unsigned pg_init_required; /* pg_init needs calling? */ 70 unsigned pg_init_required; /* pg_init needs calling? */
71 unsigned pg_init_in_progress; /* Only one pg_init allowed at once */ 71 unsigned pg_init_in_progress; /* Only one pg_init allowed at once */
72 wait_queue_head_t pg_init_wait; /* Wait for pg_init completion */
72 73
73 unsigned nr_valid_paths; /* Total number of usable paths */ 74 unsigned nr_valid_paths; /* Total number of usable paths */
74 struct pgpath *current_pgpath; 75 struct pgpath *current_pgpath;
@@ -93,6 +94,8 @@ struct multipath {
93 * can resubmit bios on error. 94 * can resubmit bios on error.
94 */ 95 */
95 mempool_t *mpio_pool; 96 mempool_t *mpio_pool;
97
98 struct mutex work_mutex;
96}; 99};
97 100
98/* 101/*
@@ -198,6 +201,8 @@ static struct multipath *alloc_multipath(struct dm_target *ti)
198 m->queue_io = 1; 201 m->queue_io = 1;
199 INIT_WORK(&m->process_queued_ios, process_queued_ios); 202 INIT_WORK(&m->process_queued_ios, process_queued_ios);
200 INIT_WORK(&m->trigger_event, trigger_event); 203 INIT_WORK(&m->trigger_event, trigger_event);
204 init_waitqueue_head(&m->pg_init_wait);
205 mutex_init(&m->work_mutex);
201 m->mpio_pool = mempool_create_slab_pool(MIN_IOS, _mpio_cache); 206 m->mpio_pool = mempool_create_slab_pool(MIN_IOS, _mpio_cache);
202 if (!m->mpio_pool) { 207 if (!m->mpio_pool) {
203 kfree(m); 208 kfree(m);
@@ -230,6 +235,21 @@ static void free_multipath(struct multipath *m)
230 * Path selection 235 * Path selection
231 *-----------------------------------------------*/ 236 *-----------------------------------------------*/
232 237
238static void __pg_init_all_paths(struct multipath *m)
239{
240 struct pgpath *pgpath;
241
242 m->pg_init_count++;
243 m->pg_init_required = 0;
244 list_for_each_entry(pgpath, &m->current_pg->pgpaths, list) {
245 /* Skip failed paths */
246 if (!pgpath->is_active)
247 continue;
248 if (queue_work(kmpath_handlerd, &pgpath->activate_path))
249 m->pg_init_in_progress++;
250 }
251}
252
233static void __switch_pg(struct multipath *m, struct pgpath *pgpath) 253static void __switch_pg(struct multipath *m, struct pgpath *pgpath)
234{ 254{
235 m->current_pg = pgpath->pg; 255 m->current_pg = pgpath->pg;
@@ -434,7 +454,7 @@ static void process_queued_ios(struct work_struct *work)
434{ 454{
435 struct multipath *m = 455 struct multipath *m =
436 container_of(work, struct multipath, process_queued_ios); 456 container_of(work, struct multipath, process_queued_ios);
437 struct pgpath *pgpath = NULL, *tmp; 457 struct pgpath *pgpath = NULL;
438 unsigned must_queue = 1; 458 unsigned must_queue = 1;
439 unsigned long flags; 459 unsigned long flags;
440 460
@@ -452,14 +472,9 @@ static void process_queued_ios(struct work_struct *work)
452 (!pgpath && !m->queue_if_no_path)) 472 (!pgpath && !m->queue_if_no_path))
453 must_queue = 0; 473 must_queue = 0;
454 474
455 if (m->pg_init_required && !m->pg_init_in_progress && pgpath) { 475 if (m->pg_init_required && !m->pg_init_in_progress && pgpath)
456 m->pg_init_count++; 476 __pg_init_all_paths(m);
457 m->pg_init_required = 0; 477
458 list_for_each_entry(tmp, &pgpath->pg->pgpaths, list) {
459 if (queue_work(kmpath_handlerd, &tmp->activate_path))
460 m->pg_init_in_progress++;
461 }
462 }
463out: 478out:
464 spin_unlock_irqrestore(&m->lock, flags); 479 spin_unlock_irqrestore(&m->lock, flags);
465 if (!must_queue) 480 if (!must_queue)
@@ -592,8 +607,8 @@ static struct pgpath *parse_path(struct arg_set *as, struct path_selector *ps,
592 if (!p) 607 if (!p)
593 return ERR_PTR(-ENOMEM); 608 return ERR_PTR(-ENOMEM);
594 609
595 r = dm_get_device(ti, shift(as), ti->begin, ti->len, 610 r = dm_get_device(ti, shift(as), dm_table_get_mode(ti->table),
596 dm_table_get_mode(ti->table), &p->path.dev); 611 &p->path.dev);
597 if (r) { 612 if (r) {
598 ti->error = "error getting device"; 613 ti->error = "error getting device";
599 goto bad; 614 goto bad;
@@ -885,13 +900,43 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc,
885 return r; 900 return r;
886} 901}
887 902
888static void multipath_dtr(struct dm_target *ti) 903static void multipath_wait_for_pg_init_completion(struct multipath *m)
889{ 904{
890 struct multipath *m = (struct multipath *) ti->private; 905 DECLARE_WAITQUEUE(wait, current);
906 unsigned long flags;
907
908 add_wait_queue(&m->pg_init_wait, &wait);
891 909
910 while (1) {
911 set_current_state(TASK_UNINTERRUPTIBLE);
912
913 spin_lock_irqsave(&m->lock, flags);
914 if (!m->pg_init_in_progress) {
915 spin_unlock_irqrestore(&m->lock, flags);
916 break;
917 }
918 spin_unlock_irqrestore(&m->lock, flags);
919
920 io_schedule();
921 }
922 set_current_state(TASK_RUNNING);
923
924 remove_wait_queue(&m->pg_init_wait, &wait);
925}
926
927static void flush_multipath_work(struct multipath *m)
928{
892 flush_workqueue(kmpath_handlerd); 929 flush_workqueue(kmpath_handlerd);
930 multipath_wait_for_pg_init_completion(m);
893 flush_workqueue(kmultipathd); 931 flush_workqueue(kmultipathd);
894 flush_scheduled_work(); 932 flush_scheduled_work();
933}
934
935static void multipath_dtr(struct dm_target *ti)
936{
937 struct multipath *m = ti->private;
938
939 flush_multipath_work(m);
895 free_multipath(m); 940 free_multipath(m);
896} 941}
897 942
@@ -1116,9 +1161,9 @@ static int pg_init_limit_reached(struct multipath *m, struct pgpath *pgpath)
1116 return limit_reached; 1161 return limit_reached;
1117} 1162}
1118 1163
1119static void pg_init_done(struct dm_path *path, int errors) 1164static void pg_init_done(void *data, int errors)
1120{ 1165{
1121 struct pgpath *pgpath = path_to_pgpath(path); 1166 struct pgpath *pgpath = data;
1122 struct priority_group *pg = pgpath->pg; 1167 struct priority_group *pg = pgpath->pg;
1123 struct multipath *m = pg->m; 1168 struct multipath *m = pg->m;
1124 unsigned long flags; 1169 unsigned long flags;
@@ -1132,8 +1177,8 @@ static void pg_init_done(struct dm_path *path, int errors)
1132 errors = 0; 1177 errors = 0;
1133 break; 1178 break;
1134 } 1179 }
1135 DMERR("Cannot failover device because scsi_dh_%s was not " 1180 DMERR("Could not failover the device: Handler scsi_dh_%s "
1136 "loaded.", m->hw_handler_name); 1181 "Error %d.", m->hw_handler_name, errors);
1137 /* 1182 /*
1138 * Fail path for now, so we do not ping pong 1183 * Fail path for now, so we do not ping pong
1139 */ 1184 */
@@ -1170,25 +1215,34 @@ static void pg_init_done(struct dm_path *path, int errors)
1170 m->current_pgpath = NULL; 1215 m->current_pgpath = NULL;
1171 m->current_pg = NULL; 1216 m->current_pg = NULL;
1172 } 1217 }
1173 } else if (!m->pg_init_required) { 1218 } else if (!m->pg_init_required)
1174 m->queue_io = 0;
1175 pg->bypassed = 0; 1219 pg->bypassed = 0;
1176 }
1177 1220
1178 m->pg_init_in_progress--; 1221 if (--m->pg_init_in_progress)
1179 if (!m->pg_init_in_progress) 1222 /* Activations of other paths are still ongoing */
1180 queue_work(kmultipathd, &m->process_queued_ios); 1223 goto out;
1224
1225 if (!m->pg_init_required)
1226 m->queue_io = 0;
1227
1228 queue_work(kmultipathd, &m->process_queued_ios);
1229
1230 /*
1231 * Wake up any thread waiting to suspend.
1232 */
1233 wake_up(&m->pg_init_wait);
1234
1235out:
1181 spin_unlock_irqrestore(&m->lock, flags); 1236 spin_unlock_irqrestore(&m->lock, flags);
1182} 1237}
1183 1238
1184static void activate_path(struct work_struct *work) 1239static void activate_path(struct work_struct *work)
1185{ 1240{
1186 int ret;
1187 struct pgpath *pgpath = 1241 struct pgpath *pgpath =
1188 container_of(work, struct pgpath, activate_path); 1242 container_of(work, struct pgpath, activate_path);
1189 1243
1190 ret = scsi_dh_activate(bdev_get_queue(pgpath->path.dev->bdev)); 1244 scsi_dh_activate(bdev_get_queue(pgpath->path.dev->bdev),
1191 pg_init_done(&pgpath->path, ret); 1245 pg_init_done, pgpath);
1192} 1246}
1193 1247
1194/* 1248/*
@@ -1261,6 +1315,15 @@ static void multipath_presuspend(struct dm_target *ti)
1261 queue_if_no_path(m, 0, 1); 1315 queue_if_no_path(m, 0, 1);
1262} 1316}
1263 1317
1318static void multipath_postsuspend(struct dm_target *ti)
1319{
1320 struct multipath *m = ti->private;
1321
1322 mutex_lock(&m->work_mutex);
1323 flush_multipath_work(m);
1324 mutex_unlock(&m->work_mutex);
1325}
1326
1264/* 1327/*
1265 * Restore the queue_if_no_path setting. 1328 * Restore the queue_if_no_path setting.
1266 */ 1329 */
@@ -1397,51 +1460,65 @@ static int multipath_status(struct dm_target *ti, status_type_t type,
1397 1460
1398static int multipath_message(struct dm_target *ti, unsigned argc, char **argv) 1461static int multipath_message(struct dm_target *ti, unsigned argc, char **argv)
1399{ 1462{
1400 int r; 1463 int r = -EINVAL;
1401 struct dm_dev *dev; 1464 struct dm_dev *dev;
1402 struct multipath *m = (struct multipath *) ti->private; 1465 struct multipath *m = (struct multipath *) ti->private;
1403 action_fn action; 1466 action_fn action;
1404 1467
1468 mutex_lock(&m->work_mutex);
1469
1470 if (dm_suspended(ti)) {
1471 r = -EBUSY;
1472 goto out;
1473 }
1474
1405 if (argc == 1) { 1475 if (argc == 1) {
1406 if (!strnicmp(argv[0], MESG_STR("queue_if_no_path"))) 1476 if (!strnicmp(argv[0], MESG_STR("queue_if_no_path"))) {
1407 return queue_if_no_path(m, 1, 0); 1477 r = queue_if_no_path(m, 1, 0);
1408 else if (!strnicmp(argv[0], MESG_STR("fail_if_no_path"))) 1478 goto out;
1409 return queue_if_no_path(m, 0, 0); 1479 } else if (!strnicmp(argv[0], MESG_STR("fail_if_no_path"))) {
1480 r = queue_if_no_path(m, 0, 0);
1481 goto out;
1482 }
1410 } 1483 }
1411 1484
1412 if (argc != 2) 1485 if (argc != 2) {
1413 goto error; 1486 DMWARN("Unrecognised multipath message received.");
1487 goto out;
1488 }
1414 1489
1415 if (!strnicmp(argv[0], MESG_STR("disable_group"))) 1490 if (!strnicmp(argv[0], MESG_STR("disable_group"))) {
1416 return bypass_pg_num(m, argv[1], 1); 1491 r = bypass_pg_num(m, argv[1], 1);
1417 else if (!strnicmp(argv[0], MESG_STR("enable_group"))) 1492 goto out;
1418 return bypass_pg_num(m, argv[1], 0); 1493 } else if (!strnicmp(argv[0], MESG_STR("enable_group"))) {
1419 else if (!strnicmp(argv[0], MESG_STR("switch_group"))) 1494 r = bypass_pg_num(m, argv[1], 0);
1420 return switch_pg_num(m, argv[1]); 1495 goto out;
1421 else if (!strnicmp(argv[0], MESG_STR("reinstate_path"))) 1496 } else if (!strnicmp(argv[0], MESG_STR("switch_group"))) {
1497 r = switch_pg_num(m, argv[1]);
1498 goto out;
1499 } else if (!strnicmp(argv[0], MESG_STR("reinstate_path")))
1422 action = reinstate_path; 1500 action = reinstate_path;
1423 else if (!strnicmp(argv[0], MESG_STR("fail_path"))) 1501 else if (!strnicmp(argv[0], MESG_STR("fail_path")))
1424 action = fail_path; 1502 action = fail_path;
1425 else 1503 else {
1426 goto error; 1504 DMWARN("Unrecognised multipath message received.");
1505 goto out;
1506 }
1427 1507
1428 r = dm_get_device(ti, argv[1], ti->begin, ti->len, 1508 r = dm_get_device(ti, argv[1], dm_table_get_mode(ti->table), &dev);
1429 dm_table_get_mode(ti->table), &dev);
1430 if (r) { 1509 if (r) {
1431 DMWARN("message: error getting device %s", 1510 DMWARN("message: error getting device %s",
1432 argv[1]); 1511 argv[1]);
1433 return -EINVAL; 1512 goto out;
1434 } 1513 }
1435 1514
1436 r = action_dev(m, dev, action); 1515 r = action_dev(m, dev, action);
1437 1516
1438 dm_put_device(ti, dev); 1517 dm_put_device(ti, dev);
1439 1518
1519out:
1520 mutex_unlock(&m->work_mutex);
1440 return r; 1521 return r;
1441
1442error:
1443 DMWARN("Unrecognised multipath message received.");
1444 return -EINVAL;
1445} 1522}
1446 1523
1447static int multipath_ioctl(struct dm_target *ti, unsigned int cmd, 1524static int multipath_ioctl(struct dm_target *ti, unsigned int cmd,
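
The reworked multipath_message() above trades the old early returns and shared "error" label for a single exit path: m->work_mutex is taken once, a suspended device is rejected with -EBUSY before anything else, and every branch leaves through "out" so the mutex is always released. A minimal userspace sketch of the same lock-then-goto-out shape, using a pthread mutex in place of the kernel mutex; handle() and the message strings here are illustrative stand-ins, not dm-mpath symbols:

#include <errno.h>
#include <pthread.h>
#include <stdio.h>
#include <string.h>

static pthread_mutex_t work_mutex = PTHREAD_MUTEX_INITIALIZER;
static int suspended;

/* Toy stand-in for queue_if_no_path()/bypass_pg_num()/... */
static int handle(const char *msg)
{
	printf("handled: %s\n", msg);
	return 0;
}

/*
 * Every branch leaves through "out", so the mutex is unlocked exactly
 * once and the suspended check guards all message handling.
 */
static int message(const char *msg)
{
	int r = -EINVAL;

	pthread_mutex_lock(&work_mutex);

	if (suspended) {
		r = -EBUSY;
		goto out;
	}

	if (!strcmp(msg, "queue_if_no_path") ||
	    !strcmp(msg, "fail_if_no_path")) {
		r = handle(msg);
		goto out;
	}

	fprintf(stderr, "Unrecognised message: %s\n", msg);
out:
	pthread_mutex_unlock(&work_mutex);
	return r;
}

int main(void)
{
	message("queue_if_no_path");
	suspended = 1;
	printf("while suspended: %d (-EBUSY)\n", message("fail_if_no_path"));
	return 0;
}

In the kernel function the dm_get_device() failure path now funnels through the same label as well, which is what lets the old "error:" block disappear.
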
@@ -1567,13 +1644,14 @@ out:
1567 *---------------------------------------------------------------*/ 1644 *---------------------------------------------------------------*/
1568static struct target_type multipath_target = { 1645static struct target_type multipath_target = {
1569 .name = "multipath", 1646 .name = "multipath",
1570 .version = {1, 1, 0}, 1647 .version = {1, 1, 1},
1571 .module = THIS_MODULE, 1648 .module = THIS_MODULE,
1572 .ctr = multipath_ctr, 1649 .ctr = multipath_ctr,
1573 .dtr = multipath_dtr, 1650 .dtr = multipath_dtr,
1574 .map_rq = multipath_map, 1651 .map_rq = multipath_map,
1575 .rq_end_io = multipath_end_io, 1652 .rq_end_io = multipath_end_io,
1576 .presuspend = multipath_presuspend, 1653 .presuspend = multipath_presuspend,
1654 .postsuspend = multipath_postsuspend,
1577 .resume = multipath_resume, 1655 .resume = multipath_resume,
1578 .status = multipath_status, 1656 .status = multipath_status,
1579 .message = multipath_message, 1657 .message = multipath_message,
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index cc9dc79b0784..ddda531723dc 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -35,6 +35,7 @@ static DECLARE_WAIT_QUEUE_HEAD(_kmirrord_recovery_stopped);
35 *---------------------------------------------------------------*/ 35 *---------------------------------------------------------------*/
36enum dm_raid1_error { 36enum dm_raid1_error {
37 DM_RAID1_WRITE_ERROR, 37 DM_RAID1_WRITE_ERROR,
38 DM_RAID1_FLUSH_ERROR,
38 DM_RAID1_SYNC_ERROR, 39 DM_RAID1_SYNC_ERROR,
39 DM_RAID1_READ_ERROR 40 DM_RAID1_READ_ERROR
40}; 41};
@@ -57,6 +58,7 @@ struct mirror_set {
57 struct bio_list reads; 58 struct bio_list reads;
58 struct bio_list writes; 59 struct bio_list writes;
59 struct bio_list failures; 60 struct bio_list failures;
61 struct bio_list holds; /* bios are waiting until suspend */
60 62
61 struct dm_region_hash *rh; 63 struct dm_region_hash *rh;
62 struct dm_kcopyd_client *kcopyd_client; 64 struct dm_kcopyd_client *kcopyd_client;
@@ -67,6 +69,7 @@ struct mirror_set {
67 region_t nr_regions; 69 region_t nr_regions;
68 int in_sync; 70 int in_sync;
69 int log_failure; 71 int log_failure;
72 int leg_failure;
70 atomic_t suspend; 73 atomic_t suspend;
71 74
72 atomic_t default_mirror; /* Default mirror */ 75 atomic_t default_mirror; /* Default mirror */
@@ -179,6 +182,17 @@ static void set_default_mirror(struct mirror *m)
179 atomic_set(&ms->default_mirror, m - m0); 182 atomic_set(&ms->default_mirror, m - m0);
180} 183}
181 184
185static struct mirror *get_valid_mirror(struct mirror_set *ms)
186{
187 struct mirror *m;
188
189 for (m = ms->mirror; m < ms->mirror + ms->nr_mirrors; m++)
190 if (!atomic_read(&m->error_count))
191 return m;
192
193 return NULL;
194}
195
182/* fail_mirror 196/* fail_mirror
183 * @m: mirror device to fail 197 * @m: mirror device to fail
184 * @error_type: one of the enums, DM_RAID1_*_ERROR 198 * @error_type: one of the enums, DM_RAID1_*_ERROR
@@ -198,6 +212,8 @@ static void fail_mirror(struct mirror *m, enum dm_raid1_error error_type)
198 struct mirror_set *ms = m->ms; 212 struct mirror_set *ms = m->ms;
199 struct mirror *new; 213 struct mirror *new;
200 214
215 ms->leg_failure = 1;
216
201 /* 217 /*
202 * error_count is used for nothing more than a 218 * error_count is used for nothing more than a
203 * simple way to tell if a device has encountered 219 * simple way to tell if a device has encountered
@@ -224,19 +240,50 @@ static void fail_mirror(struct mirror *m, enum dm_raid1_error error_type)
224 goto out; 240 goto out;
225 } 241 }
226 242
227 for (new = ms->mirror; new < ms->mirror + ms->nr_mirrors; new++) 243 new = get_valid_mirror(ms);
228 if (!atomic_read(&new->error_count)) { 244 if (new)
229 set_default_mirror(new); 245 set_default_mirror(new);
230 break; 246 else
231 }
232
233 if (unlikely(new == ms->mirror + ms->nr_mirrors))
234 DMWARN("All sides of mirror have failed."); 247 DMWARN("All sides of mirror have failed.");
235 248
236out: 249out:
237 schedule_work(&ms->trigger_event); 250 schedule_work(&ms->trigger_event);
238} 251}
239 252
253static int mirror_flush(struct dm_target *ti)
254{
255 struct mirror_set *ms = ti->private;
256 unsigned long error_bits;
257
258 unsigned int i;
259 struct dm_io_region io[ms->nr_mirrors];
260 struct mirror *m;
261 struct dm_io_request io_req = {
262 .bi_rw = WRITE_BARRIER,
263 .mem.type = DM_IO_KMEM,
264 .mem.ptr.bvec = NULL,
265 .client = ms->io_client,
266 };
267
268 for (i = 0, m = ms->mirror; i < ms->nr_mirrors; i++, m++) {
269 io[i].bdev = m->dev->bdev;
270 io[i].sector = 0;
271 io[i].count = 0;
272 }
273
274 error_bits = -1;
275 dm_io(&io_req, ms->nr_mirrors, io, &error_bits);
276 if (unlikely(error_bits != 0)) {
277 for (i = 0; i < ms->nr_mirrors; i++)
278 if (test_bit(i, &error_bits))
279 fail_mirror(ms->mirror + i,
280 DM_RAID1_FLUSH_ERROR);
281 return -EIO;
282 }
283
284 return 0;
285}
286
240/*----------------------------------------------------------------- 287/*-----------------------------------------------------------------
241 * Recovery. 288 * Recovery.
242 * 289 *
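
mirror_flush() above issues an empty WRITE_BARRIER to every leg in a single dm_io() call and gets back a bitmask, one bit per leg, describing which flushes failed; each failed leg is handed to fail_mirror() with the new DM_RAID1_FLUSH_ERROR reason and the flush as a whole reports -EIO. A small userspace model of that bitmask handling, assuming three legs; mirror_flush_result() and the 0x5 mask are invented for the example:

#include <errno.h>
#include <stdio.h>

#define NR_MIRRORS 3

/* Stand-in for fail_mirror(ms->mirror + i, DM_RAID1_FLUSH_ERROR). */
static void fail_mirror(int leg)
{
	printf("leg %d: flush failed, recording DM_RAID1_FLUSH_ERROR\n", leg);
}

/* Model of the error_bits handling at the end of mirror_flush(). */
static int mirror_flush_result(unsigned long error_bits)
{
	int i;

	if (!error_bits)
		return 0;			/* every leg flushed fine */

	for (i = 0; i < NR_MIRRORS; i++)
		if (error_bits & (1UL << i))	/* test_bit() in the kernel */
			fail_mirror(i);

	return -EIO;				/* at least one leg failed */
}

int main(void)
{
	/* Pretend legs 0 and 2 failed the barrier and leg 1 succeeded. */
	return mirror_flush_result(0x5UL) == -EIO ? 0 : 1;
}
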
@@ -396,6 +443,8 @@ static int mirror_available(struct mirror_set *ms, struct bio *bio)
396 */ 443 */
397static sector_t map_sector(struct mirror *m, struct bio *bio) 444static sector_t map_sector(struct mirror *m, struct bio *bio)
398{ 445{
446 if (unlikely(!bio->bi_size))
447 return 0;
399 return m->offset + (bio->bi_sector - m->ms->ti->begin); 448 return m->offset + (bio->bi_sector - m->ms->ti->begin);
400} 449}
401 450
@@ -413,6 +462,34 @@ static void map_region(struct dm_io_region *io, struct mirror *m,
413 io->count = bio->bi_size >> 9; 462 io->count = bio->bi_size >> 9;
414} 463}
415 464
465static void hold_bio(struct mirror_set *ms, struct bio *bio)
466{
467 /*
468 * Lock is required to avoid race condition during suspend
469 * process.
470 */
471 spin_lock_irq(&ms->lock);
472
473 if (atomic_read(&ms->suspend)) {
474 spin_unlock_irq(&ms->lock);
475
476 /*
477 * If device is suspended, complete the bio.
478 */
479 if (dm_noflush_suspending(ms->ti))
480 bio_endio(bio, DM_ENDIO_REQUEUE);
481 else
482 bio_endio(bio, -EIO);
483 return;
484 }
485
486 /*
487 * Hold bio until the suspend is complete.
488 */
489 bio_list_add(&ms->holds, bio);
490 spin_unlock_irq(&ms->lock);
491}
492
416/*----------------------------------------------------------------- 493/*-----------------------------------------------------------------
417 * Reads 494 * Reads
418 *---------------------------------------------------------------*/ 495 *---------------------------------------------------------------*/
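
hold_bio() above makes its decision under ms->lock: once the set is suspending, a write can no longer be parked and must be finished immediately, requeued if this is a noflush suspend and failed with -EIO otherwise; only before that point may it be appended to ms->holds. A userspace sketch of the same decision with a plain singly linked list standing in for struct bio_list; locking is elided and the status constants are placeholders, not the real DM_ENDIO_* values:

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

/* Placeholder completion codes; the real DM_ENDIO_* values differ. */
enum { STATUS_OK = 0, STATUS_REQUEUE = 1 };

struct bio { int id; struct bio *next; };

static struct bio *holds;		/* models ms->holds */
static int suspended;			/* models atomic_read(&ms->suspend) */
static int noflush_suspending;		/* models dm_noflush_suspending(ti) */

static void bio_endio(struct bio *b, int status)
{
	printf("bio %d completed with status %d\n", b->id, status);
	free(b);
}

/* Model of hold_bio(); single-threaded demo, so ms->lock is elided. */
static void hold_bio(struct bio *b)
{
	if (suspended) {
		/* Too late to park it: finish the bio right away. */
		bio_endio(b, noflush_suspending ? STATUS_REQUEUE : -EIO);
		return;
	}
	b->next = holds;		/* park until the suspend completes */
	holds = b;
}

int main(void)
{
	struct bio *a = calloc(1, sizeof(*a)), *b = calloc(1, sizeof(*b));

	a->id = 1;
	b->id = 2;
	hold_bio(a);				/* parked on the hold list */

	suspended = 1;
	noflush_suspending = 1;
	hold_bio(b);				/* requeued immediately */

	while (holds) {				/* presuspend drains the list */
		struct bio *next = holds->next;
		bio_endio(holds, STATUS_REQUEUE);
		holds = next;
	}
	return 0;
}
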
@@ -511,7 +588,6 @@ static void write_callback(unsigned long error, void *context)
511 unsigned i, ret = 0; 588 unsigned i, ret = 0;
512 struct bio *bio = (struct bio *) context; 589 struct bio *bio = (struct bio *) context;
513 struct mirror_set *ms; 590 struct mirror_set *ms;
514 int uptodate = 0;
515 int should_wake = 0; 591 int should_wake = 0;
516 unsigned long flags; 592 unsigned long flags;
517 593
@@ -524,36 +600,27 @@ static void write_callback(unsigned long error, void *context)
524 * This way we handle both writes to SYNC and NOSYNC 600 * This way we handle both writes to SYNC and NOSYNC
525 * regions with the same code. 601 * regions with the same code.
526 */ 602 */
527 if (likely(!error)) 603 if (likely(!error)) {
528 goto out; 604 bio_endio(bio, ret);
605 return;
606 }
529 607
530 for (i = 0; i < ms->nr_mirrors; i++) 608 for (i = 0; i < ms->nr_mirrors; i++)
531 if (test_bit(i, &error)) 609 if (test_bit(i, &error))
532 fail_mirror(ms->mirror + i, DM_RAID1_WRITE_ERROR); 610 fail_mirror(ms->mirror + i, DM_RAID1_WRITE_ERROR);
533 else
534 uptodate = 1;
535 611
536 if (unlikely(!uptodate)) { 612 /*
537 DMERR("All replicated volumes dead, failing I/O"); 613 * Need to raise event. Since raising
538 /* None of the writes succeeded, fail the I/O. */ 614 * events can block, we need to do it in
539 ret = -EIO; 615 * the main thread.
540 } else if (errors_handled(ms)) { 616 */
541 /* 617 spin_lock_irqsave(&ms->lock, flags);
542 * Need to raise event. Since raising 618 if (!ms->failures.head)
543 * events can block, we need to do it in 619 should_wake = 1;
544 * the main thread. 620 bio_list_add(&ms->failures, bio);
545 */ 621 spin_unlock_irqrestore(&ms->lock, flags);
546 spin_lock_irqsave(&ms->lock, flags); 622 if (should_wake)
547 if (!ms->failures.head) 623 wakeup_mirrord(ms);
548 should_wake = 1;
549 bio_list_add(&ms->failures, bio);
550 spin_unlock_irqrestore(&ms->lock, flags);
551 if (should_wake)
552 wakeup_mirrord(ms);
553 return;
554 }
555out:
556 bio_endio(bio, ret);
557} 624}
558 625
559static void do_write(struct mirror_set *ms, struct bio *bio) 626static void do_write(struct mirror_set *ms, struct bio *bio)
@@ -562,7 +629,7 @@ static void do_write(struct mirror_set *ms, struct bio *bio)
562 struct dm_io_region io[ms->nr_mirrors], *dest = io; 629 struct dm_io_region io[ms->nr_mirrors], *dest = io;
563 struct mirror *m; 630 struct mirror *m;
564 struct dm_io_request io_req = { 631 struct dm_io_request io_req = {
565 .bi_rw = WRITE, 632 .bi_rw = WRITE | (bio->bi_rw & WRITE_BARRIER),
566 .mem.type = DM_IO_BVEC, 633 .mem.type = DM_IO_BVEC,
567 .mem.ptr.bvec = bio->bi_io_vec + bio->bi_idx, 634 .mem.ptr.bvec = bio->bi_io_vec + bio->bi_idx,
568 .notify.fn = write_callback, 635 .notify.fn = write_callback,
@@ -603,6 +670,11 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes)
603 bio_list_init(&requeue); 670 bio_list_init(&requeue);
604 671
605 while ((bio = bio_list_pop(writes))) { 672 while ((bio = bio_list_pop(writes))) {
673 if (unlikely(bio_empty_barrier(bio))) {
674 bio_list_add(&sync, bio);
675 continue;
676 }
677
606 region = dm_rh_bio_to_region(ms->rh, bio); 678 region = dm_rh_bio_to_region(ms->rh, bio);
607 679
608 if (log->type->is_remote_recovering && 680 if (log->type->is_remote_recovering &&
@@ -659,7 +731,7 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes)
659 /* 731 /*
660 * Dispatch io. 732 * Dispatch io.
661 */ 733 */
662 if (unlikely(ms->log_failure)) { 734 if (unlikely(ms->log_failure) && errors_handled(ms)) {
663 spin_lock_irq(&ms->lock); 735 spin_lock_irq(&ms->lock);
664 bio_list_merge(&ms->failures, &sync); 736 bio_list_merge(&ms->failures, &sync);
665 spin_unlock_irq(&ms->lock); 737 spin_unlock_irq(&ms->lock);
@@ -672,8 +744,15 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes)
672 dm_rh_delay(ms->rh, bio); 744 dm_rh_delay(ms->rh, bio);
673 745
674 while ((bio = bio_list_pop(&nosync))) { 746 while ((bio = bio_list_pop(&nosync))) {
675 map_bio(get_default_mirror(ms), bio); 747 if (unlikely(ms->leg_failure) && errors_handled(ms)) {
676 generic_make_request(bio); 748 spin_lock_irq(&ms->lock);
749 bio_list_add(&ms->failures, bio);
750 spin_unlock_irq(&ms->lock);
751 wakeup_mirrord(ms);
752 } else {
753 map_bio(get_default_mirror(ms), bio);
754 generic_make_request(bio);
755 }
677 } 756 }
678} 757}
679 758
@@ -681,20 +760,12 @@ static void do_failures(struct mirror_set *ms, struct bio_list *failures)
681{ 760{
682 struct bio *bio; 761 struct bio *bio;
683 762
684 if (!failures->head) 763 if (likely(!failures->head))
685 return;
686
687 if (!ms->log_failure) {
688 while ((bio = bio_list_pop(failures))) {
689 ms->in_sync = 0;
690 dm_rh_mark_nosync(ms->rh, bio, bio->bi_size, 0);
691 }
692 return; 764 return;
693 }
694 765
695 /* 766 /*
696 * If the log has failed, unattempted writes are being 767 * If the log has failed, unattempted writes are being
697 * put on the failures list. We can't issue those writes 768 * put on the holds list. We can't issue those writes
698 * until a log has been marked, so we must store them. 769 * until a log has been marked, so we must store them.
699 * 770 *
700 * If a 'noflush' suspend is in progress, we can requeue 771 * If a 'noflush' suspend is in progress, we can requeue
@@ -709,23 +780,27 @@ static void do_failures(struct mirror_set *ms, struct bio_list *failures)
709 * for us to treat them the same and requeue them 780 * for us to treat them the same and requeue them
710 * as well. 781 * as well.
711 */ 782 */
712 if (dm_noflush_suspending(ms->ti)) { 783 while ((bio = bio_list_pop(failures))) {
713 while ((bio = bio_list_pop(failures))) 784 if (!ms->log_failure) {
714 bio_endio(bio, DM_ENDIO_REQUEUE); 785 ms->in_sync = 0;
715 return; 786 dm_rh_mark_nosync(ms->rh, bio);
716 } 787 }
717 788
718 if (atomic_read(&ms->suspend)) { 789 /*
719 while ((bio = bio_list_pop(failures))) 790 * If all the legs are dead, fail the I/O.
791 * If we have been told to handle errors, hold the bio
792 * and wait for userspace to deal with the problem.
793 * Otherwise pretend that the I/O succeeded. (This would
794 * be wrong if the failed leg returned after reboot and
795 * got replicated back to the good legs.)
796 */
797 if (!get_valid_mirror(ms))
720 bio_endio(bio, -EIO); 798 bio_endio(bio, -EIO);
721 return; 799 else if (errors_handled(ms))
800 hold_bio(ms, bio);
801 else
802 bio_endio(bio, 0);
722 } 803 }
723
724 spin_lock_irq(&ms->lock);
725 bio_list_merge(&ms->failures, failures);
726 spin_unlock_irq(&ms->lock);
727
728 delayed_wake(ms);
729} 804}
730 805
731static void trigger_event(struct work_struct *work) 806static void trigger_event(struct work_struct *work)
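
The rewritten do_failures() above applies one policy per failed write: with no live leg left the bio fails with -EIO, with error handling configured it is held for userspace via hold_bio(), and otherwise it is completed as a success, accepting the risk spelled out in the comment. A compact userspace rendering of that decision table; failed_write_action() and its two flags are stand-ins for get_valid_mirror() and errors_handled():

#include <stdio.h>

enum failure_action { FAIL_WITH_EIO, HOLD_FOR_USERSPACE, PRETEND_SUCCESS };

/*
 * Per-bio decision at the end of do_failures():
 * have_valid_leg models get_valid_mirror(ms) != NULL,
 * handle_errors models errors_handled(ms).
 */
static enum failure_action failed_write_action(int have_valid_leg,
					       int handle_errors)
{
	if (!have_valid_leg)
		return FAIL_WITH_EIO;		/* all legs are dead */
	if (handle_errors)
		return HOLD_FOR_USERSPACE;	/* hold_bio() until repaired */
	return PRETEND_SUCCESS;			/* bio_endio(bio, 0) */
}

int main(void)
{
	static const char *name[] = { "-EIO", "hold", "success" };
	int leg, handled;

	for (leg = 0; leg <= 1; leg++)
		for (handled = 0; handled <= 1; handled++)
			printf("valid_leg=%d handle_errors=%d -> %s\n",
			       leg, handled,
			       name[failed_write_action(leg, handled)]);
	return 0;
}
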
@@ -784,12 +859,17 @@ static struct mirror_set *alloc_context(unsigned int nr_mirrors,
784 } 859 }
785 860
786 spin_lock_init(&ms->lock); 861 spin_lock_init(&ms->lock);
862 bio_list_init(&ms->reads);
863 bio_list_init(&ms->writes);
864 bio_list_init(&ms->failures);
865 bio_list_init(&ms->holds);
787 866
788 ms->ti = ti; 867 ms->ti = ti;
789 ms->nr_mirrors = nr_mirrors; 868 ms->nr_mirrors = nr_mirrors;
790 ms->nr_regions = dm_sector_div_up(ti->len, region_size); 869 ms->nr_regions = dm_sector_div_up(ti->len, region_size);
791 ms->in_sync = 0; 870 ms->in_sync = 0;
792 ms->log_failure = 0; 871 ms->log_failure = 0;
872 ms->leg_failure = 0;
793 atomic_set(&ms->suspend, 0); 873 atomic_set(&ms->suspend, 0);
794 atomic_set(&ms->default_mirror, DEFAULT_MIRROR); 874 atomic_set(&ms->default_mirror, DEFAULT_MIRROR);
795 875
@@ -847,8 +927,7 @@ static int get_mirror(struct mirror_set *ms, struct dm_target *ti,
847 return -EINVAL; 927 return -EINVAL;
848 } 928 }
849 929
850 if (dm_get_device(ti, argv[0], offset, ti->len, 930 if (dm_get_device(ti, argv[0], dm_table_get_mode(ti->table),
851 dm_table_get_mode(ti->table),
852 &ms->mirror[mirror].dev)) { 931 &ms->mirror[mirror].dev)) {
853 ti->error = "Device lookup failure"; 932 ti->error = "Device lookup failure";
854 return -ENXIO; 933 return -ENXIO;
@@ -889,7 +968,8 @@ static struct dm_dirty_log *create_dirty_log(struct dm_target *ti,
889 return NULL; 968 return NULL;
890 } 969 }
891 970
892 dl = dm_dirty_log_create(argv[0], ti, param_count, argv + 2); 971 dl = dm_dirty_log_create(argv[0], ti, mirror_flush, param_count,
972 argv + 2);
893 if (!dl) { 973 if (!dl) {
894 ti->error = "Error creating mirror dirty log"; 974 ti->error = "Error creating mirror dirty log";
895 return NULL; 975 return NULL;
@@ -995,6 +1075,7 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
995 1075
996 ti->private = ms; 1076 ti->private = ms;
997 ti->split_io = dm_rh_get_region_size(ms->rh); 1077 ti->split_io = dm_rh_get_region_size(ms->rh);
1078 ti->num_flush_requests = 1;
998 1079
999 ms->kmirrord_wq = create_singlethread_workqueue("kmirrord"); 1080 ms->kmirrord_wq = create_singlethread_workqueue("kmirrord");
1000 if (!ms->kmirrord_wq) { 1081 if (!ms->kmirrord_wq) {
@@ -1122,7 +1203,8 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio,
1122 * We need to dec pending if this was a write. 1203 * We need to dec pending if this was a write.
1123 */ 1204 */
1124 if (rw == WRITE) { 1205 if (rw == WRITE) {
1125 dm_rh_dec(ms->rh, map_context->ll); 1206 if (likely(!bio_empty_barrier(bio)))
1207 dm_rh_dec(ms->rh, map_context->ll);
1126 return error; 1208 return error;
1127 } 1209 }
1128 1210
@@ -1180,9 +1262,26 @@ static void mirror_presuspend(struct dm_target *ti)
1180 struct mirror_set *ms = (struct mirror_set *) ti->private; 1262 struct mirror_set *ms = (struct mirror_set *) ti->private;
1181 struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh); 1263 struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh);
1182 1264
1265 struct bio_list holds;
1266 struct bio *bio;
1267
1183 atomic_set(&ms->suspend, 1); 1268 atomic_set(&ms->suspend, 1);
1184 1269
1185 /* 1270 /*
1271 * Process the bios in the hold list so that recovery is not
1272 * left waiting on them. After this point, no bio can be
1273 * added to the hold list because ms->suspend has already
1274 * been set.
1275 */
1276 spin_lock_irq(&ms->lock);
1277 holds = ms->holds;
1278 bio_list_init(&ms->holds);
1279 spin_unlock_irq(&ms->lock);
1280
1281 while ((bio = bio_list_pop(&holds)))
1282 hold_bio(ms, bio);
1283
1284 /*
1186 * We must finish up all the work that we've 1285 * We must finish up all the work that we've
1187 * generated (i.e. recovery work). 1286 * generated (i.e. recovery work).
1188 */ 1287 */
@@ -1244,7 +1343,8 @@ static char device_status_char(struct mirror *m)
1244 if (!atomic_read(&(m->error_count))) 1343 if (!atomic_read(&(m->error_count)))
1245 return 'A'; 1344 return 'A';
1246 1345
1247 return (test_bit(DM_RAID1_WRITE_ERROR, &(m->error_type))) ? 'D' : 1346 return (test_bit(DM_RAID1_FLUSH_ERROR, &(m->error_type))) ? 'F' :
1347 (test_bit(DM_RAID1_WRITE_ERROR, &(m->error_type))) ? 'D' :
1248 (test_bit(DM_RAID1_SYNC_ERROR, &(m->error_type))) ? 'S' : 1348 (test_bit(DM_RAID1_SYNC_ERROR, &(m->error_type))) ? 'S' :
1249 (test_bit(DM_RAID1_READ_ERROR, &(m->error_type))) ? 'R' : 'U'; 1349 (test_bit(DM_RAID1_READ_ERROR, &(m->error_type))) ? 'R' : 'U';
1250} 1350}
diff --git a/drivers/md/dm-region-hash.c b/drivers/md/dm-region-hash.c
index 36dbe29f2fd6..bd5c58b28868 100644
--- a/drivers/md/dm-region-hash.c
+++ b/drivers/md/dm-region-hash.c
@@ -11,6 +11,7 @@
11#include <linux/ctype.h> 11#include <linux/ctype.h>
12#include <linux/init.h> 12#include <linux/init.h>
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/slab.h>
14#include <linux/vmalloc.h> 15#include <linux/vmalloc.h>
15 16
16#include "dm.h" 17#include "dm.h"
@@ -79,6 +80,11 @@ struct dm_region_hash {
79 struct list_head recovered_regions; 80 struct list_head recovered_regions;
80 struct list_head failed_recovered_regions; 81 struct list_head failed_recovered_regions;
81 82
83 /*
84 * If there was a barrier failure no regions can be marked clean.
85 */
86 int barrier_failure;
87
82 void *context; 88 void *context;
83 sector_t target_begin; 89 sector_t target_begin;
84 90
@@ -211,6 +217,7 @@ struct dm_region_hash *dm_region_hash_create(
211 INIT_LIST_HEAD(&rh->quiesced_regions); 217 INIT_LIST_HEAD(&rh->quiesced_regions);
212 INIT_LIST_HEAD(&rh->recovered_regions); 218 INIT_LIST_HEAD(&rh->recovered_regions);
213 INIT_LIST_HEAD(&rh->failed_recovered_regions); 219 INIT_LIST_HEAD(&rh->failed_recovered_regions);
220 rh->barrier_failure = 0;
214 221
215 rh->region_pool = mempool_create_kmalloc_pool(MIN_REGIONS, 222 rh->region_pool = mempool_create_kmalloc_pool(MIN_REGIONS,
216 sizeof(struct dm_region)); 223 sizeof(struct dm_region));
@@ -377,8 +384,6 @@ static void complete_resync_work(struct dm_region *reg, int success)
377/* dm_rh_mark_nosync 384/* dm_rh_mark_nosync
378 * @ms 385 * @ms
379 * @bio 386 * @bio
380 * @done
381 * @error
382 * 387 *
383 * The bio was written on some mirror(s) but failed on other mirror(s). 388 * The bio was written on some mirror(s) but failed on other mirror(s).
384 * We can successfully endio the bio but should avoid the region being 389 * We can successfully endio the bio but should avoid the region being
@@ -386,8 +391,7 @@ static void complete_resync_work(struct dm_region *reg, int success)
386 * 391 *
387 * This function is _not_ safe in interrupt context! 392 * This function is _not_ safe in interrupt context!
388 */ 393 */
389void dm_rh_mark_nosync(struct dm_region_hash *rh, 394void dm_rh_mark_nosync(struct dm_region_hash *rh, struct bio *bio)
390 struct bio *bio, unsigned done, int error)
391{ 395{
392 unsigned long flags; 396 unsigned long flags;
393 struct dm_dirty_log *log = rh->log; 397 struct dm_dirty_log *log = rh->log;
@@ -395,6 +399,11 @@ void dm_rh_mark_nosync(struct dm_region_hash *rh,
395 region_t region = dm_rh_bio_to_region(rh, bio); 399 region_t region = dm_rh_bio_to_region(rh, bio);
396 int recovering = 0; 400 int recovering = 0;
397 401
402 if (bio_empty_barrier(bio)) {
403 rh->barrier_failure = 1;
404 return;
405 }
406
398 /* We must inform the log that the sync count has changed. */ 407 /* We must inform the log that the sync count has changed. */
399 log->type->set_region_sync(log, region, 0); 408 log->type->set_region_sync(log, region, 0);
400 409
@@ -419,7 +428,6 @@ void dm_rh_mark_nosync(struct dm_region_hash *rh,
419 BUG_ON(!list_empty(&reg->list)); 428 BUG_ON(!list_empty(&reg->list));
420 spin_unlock_irqrestore(&rh->region_lock, flags); 429 spin_unlock_irqrestore(&rh->region_lock, flags);
421 430
422 bio_endio(bio, error);
423 if (recovering) 431 if (recovering)
424 complete_resync_work(reg, 0); 432 complete_resync_work(reg, 0);
425} 433}
@@ -515,8 +523,11 @@ void dm_rh_inc_pending(struct dm_region_hash *rh, struct bio_list *bios)
515{ 523{
516 struct bio *bio; 524 struct bio *bio;
517 525
518 for (bio = bios->head; bio; bio = bio->bi_next) 526 for (bio = bios->head; bio; bio = bio->bi_next) {
527 if (bio_empty_barrier(bio))
528 continue;
519 rh_inc(rh, dm_rh_bio_to_region(rh, bio)); 529 rh_inc(rh, dm_rh_bio_to_region(rh, bio));
530 }
520} 531}
521EXPORT_SYMBOL_GPL(dm_rh_inc_pending); 532EXPORT_SYMBOL_GPL(dm_rh_inc_pending);
522 533
@@ -544,7 +555,14 @@ void dm_rh_dec(struct dm_region_hash *rh, region_t region)
544 */ 555 */
545 556
546 /* do nothing for DM_RH_NOSYNC */ 557 /* do nothing for DM_RH_NOSYNC */
547 if (reg->state == DM_RH_RECOVERING) { 558 if (unlikely(rh->barrier_failure)) {
559 /*
560 * If a write barrier failed some time ago, we
561 * don't know whether or not this write made it
562 * to the disk, so we must resync the device.
563 */
564 reg->state = DM_RH_NOSYNC;
565 } else if (reg->state == DM_RH_RECOVERING) {
548 list_add_tail(&reg->list, &rh->quiesced_regions); 566 list_add_tail(&reg->list, &rh->quiesced_regions);
549 } else if (reg->state == DM_RH_DIRTY) { 567 } else if (reg->state == DM_RH_DIRTY) {
550 reg->state = DM_RH_CLEAN; 568 reg->state = DM_RH_CLEAN;
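
dm_rh_dec() above adds a sticky override: once rh->barrier_failure is set, a region whose last pending write completes is forced to DM_RH_NOSYNC instead of being marked clean, because nobody knows whether that write reached stable storage, so the region will be resynchronised. A toy userspace model of just that state decision; the enum values are local to the sketch and the list moves and locking of the real function are omitted:

#include <stdio.h>

/* Local stand-ins for the DM_RH_* region states. */
enum rh_state { RH_CLEAN, RH_DIRTY, RH_NOSYNC, RH_RECOVERING, RH_QUIESCED };

/*
 * State chosen by dm_rh_dec() once the last pending write on a region
 * completes; barrier_failed models rh->barrier_failure.
 */
static enum rh_state state_after_last_write(enum rh_state cur,
					    int barrier_failed)
{
	if (barrier_failed)
		return RH_NOSYNC;	/* data may not have hit disk: resync */
	if (cur == RH_RECOVERING)
		return RH_QUIESCED;	/* recovery can take the region now */
	if (cur == RH_DIRTY)
		return RH_CLEAN;	/* normal completion */
	return cur;			/* DM_RH_NOSYNC stays as it is */
}

int main(void)
{
	printf("dirty, barriers ok    -> %s\n",
	       state_after_last_write(RH_DIRTY, 0) == RH_CLEAN ? "clean" : "?");
	printf("dirty, barrier failed -> %s\n",
	       state_after_last_write(RH_DIRTY, 1) == RH_NOSYNC ? "nosync" : "?");
	return 0;
}
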
@@ -643,10 +661,9 @@ void dm_rh_recovery_end(struct dm_region *reg, int success)
643 spin_lock_irq(&rh->region_lock); 661 spin_lock_irq(&rh->region_lock);
644 if (success) 662 if (success)
645 list_add(&reg->list, &reg->rh->recovered_regions); 663 list_add(&reg->list, &reg->rh->recovered_regions);
646 else { 664 else
647 reg->state = DM_RH_NOSYNC;
648 list_add(&reg->list, &reg->rh->failed_recovered_regions); 665 list_add(&reg->list, &reg->rh->failed_recovered_regions);
649 } 666
650 spin_unlock_irq(&rh->region_lock); 667 spin_unlock_irq(&rh->region_lock);
651 668
652 rh->wakeup_workers(rh->context); 669 rh->wakeup_workers(rh->context);
diff --git a/drivers/md/dm-service-time.c b/drivers/md/dm-service-time.c
index cfa668f46c40..9c6c2e47ad62 100644
--- a/drivers/md/dm-service-time.c
+++ b/drivers/md/dm-service-time.c
@@ -11,6 +11,8 @@
11#include "dm.h" 11#include "dm.h"
12#include "dm-path-selector.h" 12#include "dm-path-selector.h"
13 13
14#include <linux/slab.h>
15
14#define DM_MSG_PREFIX "multipath service-time" 16#define DM_MSG_PREFIX "multipath service-time"
15#define ST_MIN_IO 1 17#define ST_MIN_IO 1
16#define ST_MAX_RELATIVE_THROUGHPUT 100 18#define ST_MAX_RELATIVE_THROUGHPUT 100
diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c
index 0c746420c008..c097d8a4823d 100644
--- a/drivers/md/dm-snap-persistent.c
+++ b/drivers/md/dm-snap-persistent.c
@@ -55,6 +55,8 @@
55 */ 55 */
56#define SNAPSHOT_DISK_VERSION 1 56#define SNAPSHOT_DISK_VERSION 1
57 57
58#define NUM_SNAPSHOT_HDR_CHUNKS 1
59
58struct disk_header { 60struct disk_header {
59 uint32_t magic; 61 uint32_t magic;
60 62
@@ -120,7 +122,22 @@ struct pstore {
120 122
121 /* 123 /*
122 * The next free chunk for an exception. 124 * The next free chunk for an exception.
125 *
126 * When creating exceptions, all the chunks here and above are
127 * free. It holds the next chunk to be allocated. On rare
128 * occasions (e.g. after a system crash) holes can be left in
129 * the exception store because chunks can be committed out of
130 * order.
131 *
132 * When merging exceptions, it does not necessarily mean all the
133 * chunks here and above are free. It holds the value it would
134 * have held if all chunks had been committed in order of
135 * allocation. Consequently the value may occasionally be
136 * slightly too low, but since it's only used for 'status' and
137 * it can never reach its minimum value too early, this doesn't
138 * matter.
123 */ 139 */
140
124 chunk_t next_free; 141 chunk_t next_free;
125 142
126 /* 143 /*
@@ -214,7 +231,7 @@ static int chunk_io(struct pstore *ps, void *area, chunk_t chunk, int rw,
214 int metadata) 231 int metadata)
215{ 232{
216 struct dm_io_region where = { 233 struct dm_io_region where = {
217 .bdev = ps->store->cow->bdev, 234 .bdev = dm_snap_cow(ps->store->snap)->bdev,
218 .sector = ps->store->chunk_size * chunk, 235 .sector = ps->store->chunk_size * chunk,
219 .count = ps->store->chunk_size, 236 .count = ps->store->chunk_size,
220 }; 237 };
@@ -237,7 +254,7 @@ static int chunk_io(struct pstore *ps, void *area, chunk_t chunk, int rw,
237 * Issue the synchronous I/O from a different thread 254 * Issue the synchronous I/O from a different thread
238 * to avoid generic_make_request recursion. 255 * to avoid generic_make_request recursion.
239 */ 256 */
240 INIT_WORK(&req.work, do_metadata); 257 INIT_WORK_ON_STACK(&req.work, do_metadata);
241 queue_work(ps->metadata_wq, &req.work); 258 queue_work(ps->metadata_wq, &req.work);
242 flush_workqueue(ps->metadata_wq); 259 flush_workqueue(ps->metadata_wq);
243 260
@@ -294,7 +311,8 @@ static int read_header(struct pstore *ps, int *new_snapshot)
294 */ 311 */
295 if (!ps->store->chunk_size) { 312 if (!ps->store->chunk_size) {
296 ps->store->chunk_size = max(DM_CHUNK_SIZE_DEFAULT_SECTORS, 313 ps->store->chunk_size = max(DM_CHUNK_SIZE_DEFAULT_SECTORS,
297 bdev_logical_block_size(ps->store->cow->bdev) >> 9); 314 bdev_logical_block_size(dm_snap_cow(ps->store->snap)->
315 bdev) >> 9);
298 ps->store->chunk_mask = ps->store->chunk_size - 1; 316 ps->store->chunk_mask = ps->store->chunk_size - 1;
299 ps->store->chunk_shift = ffs(ps->store->chunk_size) - 1; 317 ps->store->chunk_shift = ffs(ps->store->chunk_size) - 1;
300 chunk_size_supplied = 0; 318 chunk_size_supplied = 0;
@@ -408,6 +426,15 @@ static void write_exception(struct pstore *ps,
408 e->new_chunk = cpu_to_le64(de->new_chunk); 426 e->new_chunk = cpu_to_le64(de->new_chunk);
409} 427}
410 428
429static void clear_exception(struct pstore *ps, uint32_t index)
430{
431 struct disk_exception *e = get_exception(ps, index);
432
433 /* clear it */
434 e->old_chunk = 0;
435 e->new_chunk = 0;
436}
437
411/* 438/*
412 * Registers the exceptions that are present in the current area. 439 * Registers the exceptions that are present in the current area.
413 * 'full' is filled in to indicate if the area has been 440 * 'full' is filled in to indicate if the area has been
@@ -489,11 +516,23 @@ static struct pstore *get_info(struct dm_exception_store *store)
489 return (struct pstore *) store->context; 516 return (struct pstore *) store->context;
490} 517}
491 518
492static void persistent_fraction_full(struct dm_exception_store *store, 519static void persistent_usage(struct dm_exception_store *store,
493 sector_t *numerator, sector_t *denominator) 520 sector_t *total_sectors,
521 sector_t *sectors_allocated,
522 sector_t *metadata_sectors)
494{ 523{
495 *numerator = get_info(store)->next_free * store->chunk_size; 524 struct pstore *ps = get_info(store);
496 *denominator = get_dev_size(store->cow->bdev); 525
526 *sectors_allocated = ps->next_free * store->chunk_size;
527 *total_sectors = get_dev_size(dm_snap_cow(store->snap)->bdev);
528
529 /*
530 * First chunk is the fixed header.
531 * Then there are (ps->current_area + 1) metadata chunks, each one
532 * separated from the next by ps->exceptions_per_area data chunks.
533 */
534 *metadata_sectors = (ps->current_area + 1 + NUM_SNAPSHOT_HDR_CHUNKS) *
535 store->chunk_size;
497} 536}
498 537
499static void persistent_dtr(struct dm_exception_store *store) 538static void persistent_dtr(struct dm_exception_store *store)
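
persistent_usage() above replaces the old numerator/denominator pair with three figures, and its comment pins down the layout behind the third one: a fixed header chunk followed by (current_area + 1) metadata chunks, each separated from the next by exceptions_per_area data chunks. A short worked calculation of that formula in userspace C; the chunk size and area count are example values, not read from a real exception store:

#include <stdio.h>

#define NUM_SNAPSHOT_HDR_CHUNKS 1

/* Metadata sectors as computed in persistent_usage(). */
static unsigned long long metadata_sectors(unsigned long long current_area,
					   unsigned long long chunk_size)
{
	return (current_area + 1 + NUM_SNAPSHOT_HDR_CHUNKS) * chunk_size;
}

int main(void)
{
	/* Example: 8-sector (4KiB) chunks, three metadata areas written. */
	unsigned long long chunk_size = 8, current_area = 2;

	printf("metadata sectors = %llu\n",	/* (2 + 1 + 1) * 8 = 32 */
	       metadata_sectors(current_area, chunk_size));
	return 0;
}

The other two figures come straight from the code shown: sectors_allocated is next_free * chunk_size, and total_sectors is the size of the cow device returned by dm_snap_cow().
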
@@ -552,44 +591,40 @@ static int persistent_read_metadata(struct dm_exception_store *store,
552 ps->current_area = 0; 591 ps->current_area = 0;
553 zero_memory_area(ps); 592 zero_memory_area(ps);
554 r = zero_disk_area(ps, 0); 593 r = zero_disk_area(ps, 0);
555 if (r) { 594 if (r)
556 DMWARN("zero_disk_area(0) failed"); 595 DMWARN("zero_disk_area(0) failed");
557 return r; 596 return r;
558 } 597 }
559 } else { 598 /*
560 /* 599 * Sanity checks.
561 * Sanity checks. 600 */
562 */ 601 if (ps->version != SNAPSHOT_DISK_VERSION) {
563 if (ps->version != SNAPSHOT_DISK_VERSION) { 602 DMWARN("unable to handle snapshot disk version %d",
564 DMWARN("unable to handle snapshot disk version %d", 603 ps->version);
565 ps->version); 604 return -EINVAL;
566 return -EINVAL; 605 }
567 }
568 606
569 /* 607 /*
570 * Metadata are valid, but snapshot is invalidated 608 * Metadata are valid, but snapshot is invalidated
571 */ 609 */
572 if (!ps->valid) 610 if (!ps->valid)
573 return 1; 611 return 1;
574 612
575 /* 613 /*
576 * Read the metadata. 614 * Read the metadata.
577 */ 615 */
578 r = read_exceptions(ps, callback, callback_context); 616 r = read_exceptions(ps, callback, callback_context);
579 if (r)
580 return r;
581 }
582 617
583 return 0; 618 return r;
584} 619}
585 620
586static int persistent_prepare_exception(struct dm_exception_store *store, 621static int persistent_prepare_exception(struct dm_exception_store *store,
587 struct dm_snap_exception *e) 622 struct dm_exception *e)
588{ 623{
589 struct pstore *ps = get_info(store); 624 struct pstore *ps = get_info(store);
590 uint32_t stride; 625 uint32_t stride;
591 chunk_t next_free; 626 chunk_t next_free;
592 sector_t size = get_dev_size(store->cow->bdev); 627 sector_t size = get_dev_size(dm_snap_cow(store->snap)->bdev);
593 628
594 /* Is there enough room ? */ 629 /* Is there enough room ? */
595 if (size < ((ps->next_free + 1) * store->chunk_size)) 630 if (size < ((ps->next_free + 1) * store->chunk_size))
@@ -611,7 +646,7 @@ static int persistent_prepare_exception(struct dm_exception_store *store,
611} 646}
612 647
613static void persistent_commit_exception(struct dm_exception_store *store, 648static void persistent_commit_exception(struct dm_exception_store *store,
614 struct dm_snap_exception *e, 649 struct dm_exception *e,
615 void (*callback) (void *, int success), 650 void (*callback) (void *, int success),
616 void *callback_context) 651 void *callback_context)
617{ 652{
@@ -672,6 +707,85 @@ static void persistent_commit_exception(struct dm_exception_store *store,
672 ps->callback_count = 0; 707 ps->callback_count = 0;
673} 708}
674 709
710static int persistent_prepare_merge(struct dm_exception_store *store,
711 chunk_t *last_old_chunk,
712 chunk_t *last_new_chunk)
713{
714 struct pstore *ps = get_info(store);
715 struct disk_exception de;
716 int nr_consecutive;
717 int r;
718
719 /*
720 * When current area is empty, move back to preceding area.
721 */
722 if (!ps->current_committed) {
723 /*
724 * Have we finished?
725 */
726 if (!ps->current_area)
727 return 0;
728
729 ps->current_area--;
730 r = area_io(ps, READ);
731 if (r < 0)
732 return r;
733 ps->current_committed = ps->exceptions_per_area;
734 }
735
736 read_exception(ps, ps->current_committed - 1, &de);
737 *last_old_chunk = de.old_chunk;
738 *last_new_chunk = de.new_chunk;
739
740 /*
741 * Find number of consecutive chunks within the current area,
742 * working backwards.
743 */
744 for (nr_consecutive = 1; nr_consecutive < ps->current_committed;
745 nr_consecutive++) {
746 read_exception(ps, ps->current_committed - 1 - nr_consecutive,
747 &de);
748 if (de.old_chunk != *last_old_chunk - nr_consecutive ||
749 de.new_chunk != *last_new_chunk - nr_consecutive)
750 break;
751 }
752
753 return nr_consecutive;
754}
755
756static int persistent_commit_merge(struct dm_exception_store *store,
757 int nr_merged)
758{
759 int r, i;
760 struct pstore *ps = get_info(store);
761
762 BUG_ON(nr_merged > ps->current_committed);
763
764 for (i = 0; i < nr_merged; i++)
765 clear_exception(ps, ps->current_committed - 1 - i);
766
767 r = area_io(ps, WRITE);
768 if (r < 0)
769 return r;
770
771 ps->current_committed -= nr_merged;
772
773 /*
774 * At this stage, only persistent_usage() uses ps->next_free, so
775 * we make no attempt to keep ps->next_free strictly accurate
776 * as exceptions may have been committed out-of-order originally.
777 * Once a snapshot has become merging, we set it to the value it
778 * would have held had all the exceptions been committed in order.
779 *
780 * ps->current_area does not get reduced by prepare_merge() until
781 * after commit_merge() has removed the nr_merged previous exceptions.
782 */
783 ps->next_free = (area_location(ps, ps->current_area) - 1) +
784 (ps->current_committed + 1) + NUM_SNAPSHOT_HDR_CHUNKS;
785
786 return 0;
787}
788
675static void persistent_drop_snapshot(struct dm_exception_store *store) 789static void persistent_drop_snapshot(struct dm_exception_store *store)
676{ 790{
677 struct pstore *ps = get_info(store); 791 struct pstore *ps = get_info(store);
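
persistent_prepare_merge() above walks the committed exceptions of the current area backwards and counts how long a run of strictly consecutive (old_chunk, new_chunk) pairs it finds, so that commit_merge() can later clear the whole run in one write of the area. A userspace model of that backward scan over an in-memory array; the sample exception values are invented:

#include <stdio.h>

struct disk_exception { unsigned long long old_chunk, new_chunk; };

/*
 * Model of the scan in persistent_prepare_merge(): starting from the most
 * recently committed exception, count how many predecessors have old_chunk
 * and new_chunk exactly n lower than that last entry.
 */
static int count_consecutive(const struct disk_exception *e, int committed)
{
	int n;

	if (!committed)
		return 0;

	for (n = 1; n < committed; n++) {
		const struct disk_exception *last = &e[committed - 1];
		const struct disk_exception *prev = &e[committed - 1 - n];

		if (prev->old_chunk != last->old_chunk - n ||
		    prev->new_chunk != last->new_chunk - n)
			break;
	}
	return n;
}

int main(void)
{
	/* Example area: the last three entries form one consecutive run. */
	struct disk_exception area[] = {
		{ 10, 100 }, { 40, 200 }, { 41, 201 }, { 42, 202 },
	};

	printf("mergeable run length = %d\n",
	       count_consecutive(area, 4));	/* prints 3 */
	return 0;
}
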
@@ -697,7 +811,7 @@ static int persistent_ctr(struct dm_exception_store *store,
697 ps->area = NULL; 811 ps->area = NULL;
698 ps->zero_area = NULL; 812 ps->zero_area = NULL;
699 ps->header_area = NULL; 813 ps->header_area = NULL;
700 ps->next_free = 2; /* skipping the header and first area */ 814 ps->next_free = NUM_SNAPSHOT_HDR_CHUNKS + 1; /* header and 1st area */
701 ps->current_committed = 0; 815 ps->current_committed = 0;
702 816
703 ps->callback_count = 0; 817 ps->callback_count = 0;
@@ -726,8 +840,7 @@ static unsigned persistent_status(struct dm_exception_store *store,
726 case STATUSTYPE_INFO: 840 case STATUSTYPE_INFO:
727 break; 841 break;
728 case STATUSTYPE_TABLE: 842 case STATUSTYPE_TABLE:
729 DMEMIT(" %s P %llu", store->cow->name, 843 DMEMIT(" P %llu", (unsigned long long)store->chunk_size);
730 (unsigned long long)store->chunk_size);
731 } 844 }
732 845
733 return sz; 846 return sz;
@@ -741,8 +854,10 @@ static struct dm_exception_store_type _persistent_type = {
741 .read_metadata = persistent_read_metadata, 854 .read_metadata = persistent_read_metadata,
742 .prepare_exception = persistent_prepare_exception, 855 .prepare_exception = persistent_prepare_exception,
743 .commit_exception = persistent_commit_exception, 856 .commit_exception = persistent_commit_exception,
857 .prepare_merge = persistent_prepare_merge,
858 .commit_merge = persistent_commit_merge,
744 .drop_snapshot = persistent_drop_snapshot, 859 .drop_snapshot = persistent_drop_snapshot,
745 .fraction_full = persistent_fraction_full, 860 .usage = persistent_usage,
746 .status = persistent_status, 861 .status = persistent_status,
747}; 862};
748 863
@@ -754,8 +869,10 @@ static struct dm_exception_store_type _persistent_compat_type = {
754 .read_metadata = persistent_read_metadata, 869 .read_metadata = persistent_read_metadata,
755 .prepare_exception = persistent_prepare_exception, 870 .prepare_exception = persistent_prepare_exception,
756 .commit_exception = persistent_commit_exception, 871 .commit_exception = persistent_commit_exception,
872 .prepare_merge = persistent_prepare_merge,
873 .commit_merge = persistent_commit_merge,
757 .drop_snapshot = persistent_drop_snapshot, 874 .drop_snapshot = persistent_drop_snapshot,
758 .fraction_full = persistent_fraction_full, 875 .usage = persistent_usage,
759 .status = persistent_status, 876 .status = persistent_status,
760}; 877};
761 878
diff --git a/drivers/md/dm-snap-transient.c b/drivers/md/dm-snap-transient.c
index cde5aa558e6d..a0898a66a2f8 100644
--- a/drivers/md/dm-snap-transient.c
+++ b/drivers/md/dm-snap-transient.c
@@ -36,10 +36,10 @@ static int transient_read_metadata(struct dm_exception_store *store,
36} 36}
37 37
38static int transient_prepare_exception(struct dm_exception_store *store, 38static int transient_prepare_exception(struct dm_exception_store *store,
39 struct dm_snap_exception *e) 39 struct dm_exception *e)
40{ 40{
41 struct transient_c *tc = store->context; 41 struct transient_c *tc = store->context;
42 sector_t size = get_dev_size(store->cow->bdev); 42 sector_t size = get_dev_size(dm_snap_cow(store->snap)->bdev);
43 43
44 if (size < (tc->next_free + store->chunk_size)) 44 if (size < (tc->next_free + store->chunk_size))
45 return -1; 45 return -1;
@@ -51,7 +51,7 @@ static int transient_prepare_exception(struct dm_exception_store *store,
51} 51}
52 52
53static void transient_commit_exception(struct dm_exception_store *store, 53static void transient_commit_exception(struct dm_exception_store *store,
54 struct dm_snap_exception *e, 54 struct dm_exception *e,
55 void (*callback) (void *, int success), 55 void (*callback) (void *, int success),
56 void *callback_context) 56 void *callback_context)
57{ 57{
@@ -59,11 +59,14 @@ static void transient_commit_exception(struct dm_exception_store *store,
59 callback(callback_context, 1); 59 callback(callback_context, 1);
60} 60}
61 61
62static void transient_fraction_full(struct dm_exception_store *store, 62static void transient_usage(struct dm_exception_store *store,
63 sector_t *numerator, sector_t *denominator) 63 sector_t *total_sectors,
64 sector_t *sectors_allocated,
65 sector_t *metadata_sectors)
64{ 66{
65 *numerator = ((struct transient_c *) store->context)->next_free; 67 *sectors_allocated = ((struct transient_c *) store->context)->next_free;
66 *denominator = get_dev_size(store->cow->bdev); 68 *total_sectors = get_dev_size(dm_snap_cow(store->snap)->bdev);
69 *metadata_sectors = 0;
67} 70}
68 71
69static int transient_ctr(struct dm_exception_store *store, 72static int transient_ctr(struct dm_exception_store *store,
@@ -91,8 +94,7 @@ static unsigned transient_status(struct dm_exception_store *store,
91 case STATUSTYPE_INFO: 94 case STATUSTYPE_INFO:
92 break; 95 break;
93 case STATUSTYPE_TABLE: 96 case STATUSTYPE_TABLE:
94 DMEMIT(" %s N %llu", store->cow->name, 97 DMEMIT(" N %llu", (unsigned long long)store->chunk_size);
95 (unsigned long long)store->chunk_size);
96 } 98 }
97 99
98 return sz; 100 return sz;
@@ -106,7 +108,7 @@ static struct dm_exception_store_type _transient_type = {
106 .read_metadata = transient_read_metadata, 108 .read_metadata = transient_read_metadata,
107 .prepare_exception = transient_prepare_exception, 109 .prepare_exception = transient_prepare_exception,
108 .commit_exception = transient_commit_exception, 110 .commit_exception = transient_commit_exception,
109 .fraction_full = transient_fraction_full, 111 .usage = transient_usage,
110 .status = transient_status, 112 .status = transient_status,
111}; 113};
112 114
@@ -118,7 +120,7 @@ static struct dm_exception_store_type _transient_compat_type = {
118 .read_metadata = transient_read_metadata, 120 .read_metadata = transient_read_metadata,
119 .prepare_exception = transient_prepare_exception, 121 .prepare_exception = transient_prepare_exception,
120 .commit_exception = transient_commit_exception, 122 .commit_exception = transient_commit_exception,
121 .fraction_full = transient_fraction_full, 123 .usage = transient_usage,
122 .status = transient_status, 124 .status = transient_status,
123}; 125};
124 126
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 3a3ba46e6d4b..54853773510c 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -25,6 +25,11 @@
25 25
26#define DM_MSG_PREFIX "snapshots" 26#define DM_MSG_PREFIX "snapshots"
27 27
28static const char dm_snapshot_merge_target_name[] = "snapshot-merge";
29
30#define dm_target_is_snapshot_merge(ti) \
31 ((ti)->type->name == dm_snapshot_merge_target_name)
32
28/* 33/*
29 * The percentage increment we will wake up users at 34 * The percentage increment we will wake up users at
30 */ 35 */
@@ -49,7 +54,7 @@
49#define DM_TRACKED_CHUNK_HASH(x) ((unsigned long)(x) & \ 54#define DM_TRACKED_CHUNK_HASH(x) ((unsigned long)(x) & \
50 (DM_TRACKED_CHUNK_HASH_SIZE - 1)) 55 (DM_TRACKED_CHUNK_HASH_SIZE - 1))
51 56
52struct exception_table { 57struct dm_exception_table {
53 uint32_t hash_mask; 58 uint32_t hash_mask;
54 unsigned hash_shift; 59 unsigned hash_shift;
55 struct list_head *table; 60 struct list_head *table;
@@ -59,22 +64,31 @@ struct dm_snapshot {
59 struct rw_semaphore lock; 64 struct rw_semaphore lock;
60 65
61 struct dm_dev *origin; 66 struct dm_dev *origin;
67 struct dm_dev *cow;
68
69 struct dm_target *ti;
62 70
63 /* List of snapshots per Origin */ 71 /* List of snapshots per Origin */
64 struct list_head list; 72 struct list_head list;
65 73
66 /* You can't use a snapshot if this is 0 (e.g. if full) */ 74 /*
75 * You can't use a snapshot if this is 0 (e.g. if full).
76 * A snapshot-merge target never clears this.
77 */
67 int valid; 78 int valid;
68 79
69 /* Origin writes don't trigger exceptions until this is set */ 80 /* Origin writes don't trigger exceptions until this is set */
70 int active; 81 int active;
71 82
72 mempool_t *pending_pool; 83 /* Whether or not owning mapped_device is suspended */
84 int suspended;
73 85
74 atomic_t pending_exceptions_count; 86 atomic_t pending_exceptions_count;
75 87
76 struct exception_table pending; 88 mempool_t *pending_pool;
77 struct exception_table complete; 89
90 struct dm_exception_table pending;
91 struct dm_exception_table complete;
78 92
79 /* 93 /*
80 * pe_lock protects all pending_exception operations and access 94 * pe_lock protects all pending_exception operations and access
@@ -82,6 +96,11 @@ struct dm_snapshot {
82 */ 96 */
83 spinlock_t pe_lock; 97 spinlock_t pe_lock;
84 98
99 /* Chunks with outstanding reads */
100 spinlock_t tracked_chunk_lock;
101 mempool_t *tracked_chunk_pool;
102 struct hlist_head tracked_chunk_hash[DM_TRACKED_CHUNK_HASH_SIZE];
103
85 /* The on disk metadata handler */ 104 /* The on disk metadata handler */
86 struct dm_exception_store *store; 105 struct dm_exception_store *store;
87 106
@@ -91,12 +110,50 @@ struct dm_snapshot {
91 struct bio_list queued_bios; 110 struct bio_list queued_bios;
92 struct work_struct queued_bios_work; 111 struct work_struct queued_bios_work;
93 112
94 /* Chunks with outstanding reads */ 113 /* Wait for events based on state_bits */
95 mempool_t *tracked_chunk_pool; 114 unsigned long state_bits;
96 spinlock_t tracked_chunk_lock; 115
97 struct hlist_head tracked_chunk_hash[DM_TRACKED_CHUNK_HASH_SIZE]; 116 /* Range of chunks currently being merged. */
117 chunk_t first_merging_chunk;
118 int num_merging_chunks;
119
120 /*
121 * The merge operation failed if this flag is set.
122 * Failure modes are handled as follows:
123 * - I/O error reading the header
124 * => don't load the target; abort.
125 * - Header does not have "valid" flag set
126 * => use the origin; forget about the snapshot.
127 * - I/O error when reading exceptions
128 * => don't load the target; abort.
129 * (We can't use the intermediate origin state.)
130 * - I/O error while merging
131 * => stop merging; set merge_failed; process I/O normally.
132 */
133 int merge_failed;
134
135 /*
136 * Incoming bios that overlap with chunks being merged must wait
137 * for them to be committed.
138 */
139 struct bio_list bios_queued_during_merge;
98}; 140};
99 141
142/*
143 * state_bits:
144 * RUNNING_MERGE - Merge operation is in progress.
145 * SHUTDOWN_MERGE - Set to signal that merge needs to be stopped;
146 * cleared afterwards.
147 */
148#define RUNNING_MERGE 0
149#define SHUTDOWN_MERGE 1
150
151struct dm_dev *dm_snap_cow(struct dm_snapshot *s)
152{
153 return s->cow;
154}
155EXPORT_SYMBOL(dm_snap_cow);
156
100static struct workqueue_struct *ksnapd; 157static struct workqueue_struct *ksnapd;
101static void flush_queued_bios(struct work_struct *work); 158static void flush_queued_bios(struct work_struct *work);
102 159
@@ -116,7 +173,7 @@ static int bdev_equal(struct block_device *lhs, struct block_device *rhs)
116} 173}
117 174
118struct dm_snap_pending_exception { 175struct dm_snap_pending_exception {
119 struct dm_snap_exception e; 176 struct dm_exception e;
120 177
121 /* 178 /*
122 * Origin buffers waiting for this to complete are held 179 * Origin buffers waiting for this to complete are held
@@ -125,28 +182,6 @@ struct dm_snap_pending_exception {
125 struct bio_list origin_bios; 182 struct bio_list origin_bios;
126 struct bio_list snapshot_bios; 183 struct bio_list snapshot_bios;
127 184
128 /*
129 * Short-term queue of pending exceptions prior to submission.
130 */
131 struct list_head list;
132
133 /*
134 * The primary pending_exception is the one that holds
135 * the ref_count and the list of origin_bios for a
136 * group of pending_exceptions. It is always last to get freed.
137 * These fields get set up when writing to the origin.
138 */
139 struct dm_snap_pending_exception *primary_pe;
140
141 /*
142 * Number of pending_exceptions processing this chunk.
143 * When this drops to zero we must complete the origin bios.
144 * If incrementing or decrementing this, hold pe->snap->lock for
145 * the sibling concerned and not pe->primary_pe->snap->lock unless
146 * they are the same.
147 */
148 atomic_t ref_count;
149
150 /* Pointer back to snapshot context */ 185 /* Pointer back to snapshot context */
151 struct dm_snapshot *snap; 186 struct dm_snapshot *snap;
152 187
@@ -222,6 +257,16 @@ static int __chunk_is_tracked(struct dm_snapshot *s, chunk_t chunk)
222} 257}
223 258
224/* 259/*
260 * This conflicting I/O is extremely improbable in the caller,
261 * so msleep(1) is sufficient and there is no need for a wait queue.
262 */
263static void __check_for_conflicting_io(struct dm_snapshot *s, chunk_t chunk)
264{
265 while (__chunk_is_tracked(s, chunk))
266 msleep(1);
267}
268
269/*
225 * One of these per registered origin, held in the snapshot_origins hash 270 * One of these per registered origin, held in the snapshot_origins hash
226 */ 271 */
227struct origin { 272struct origin {
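
__check_for_conflicting_io() above simply polls __chunk_is_tracked() with msleep(1); as its comment says, a conflicting read is improbable enough that a wait queue is not worth having. A trivial userspace equivalent of that poll-and-sleep loop, with a stub predicate that drains after a few calls:

#include <stdio.h>
#include <unistd.h>

static int reads_in_flight = 3;		/* stands in for the tracked-chunk hash */

/* Stub for __chunk_is_tracked(s, chunk): pretends the reads drain away. */
static int chunk_is_tracked(void)
{
	return reads_in_flight-- > 0;
}

/* Model of __check_for_conflicting_io(): poll with a short sleep. */
static void check_for_conflicting_io(void)
{
	while (chunk_is_tracked())
		usleep(1000);		/* msleep(1) in the kernel */
}

int main(void)
{
	check_for_conflicting_io();
	puts("no conflicting reads remain");
	return 0;
}
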
@@ -243,6 +288,10 @@ struct origin {
243static struct list_head *_origins; 288static struct list_head *_origins;
244static struct rw_semaphore _origins_lock; 289static struct rw_semaphore _origins_lock;
245 290
291static DECLARE_WAIT_QUEUE_HEAD(_pending_exceptions_done);
292static DEFINE_SPINLOCK(_pending_exceptions_done_spinlock);
293static uint64_t _pending_exceptions_done_count;
294
246static int init_origin_hash(void) 295static int init_origin_hash(void)
247{ 296{
248 int i; 297 int i;
@@ -291,22 +340,144 @@ static void __insert_origin(struct origin *o)
291} 340}
292 341
293/* 342/*
343 * _origins_lock must be held when calling this function.
344 * Returns number of snapshots registered using the supplied cow device, plus:
345 * snap_src - a snapshot suitable for use as a source of exception handover
346 * snap_dest - a snapshot capable of receiving exception handover.
347 * snap_merge - an existing snapshot-merge target linked to the same origin.
348 * There can be at most one snapshot-merge target. The parameter is optional.
349 *
350 * Possible return values and states of snap_src and snap_dest.
351 * 0: NULL, NULL - first new snapshot
352 * 1: snap_src, NULL - normal snapshot
353 * 2: snap_src, snap_dest - waiting for handover
354 * 2: snap_src, NULL - handed over, waiting for old to be deleted
355 * 1: NULL, snap_dest - source got destroyed without handover
356 */
357static int __find_snapshots_sharing_cow(struct dm_snapshot *snap,
358 struct dm_snapshot **snap_src,
359 struct dm_snapshot **snap_dest,
360 struct dm_snapshot **snap_merge)
361{
362 struct dm_snapshot *s;
363 struct origin *o;
364 int count = 0;
365 int active;
366
367 o = __lookup_origin(snap->origin->bdev);
368 if (!o)
369 goto out;
370
371 list_for_each_entry(s, &o->snapshots, list) {
372 if (dm_target_is_snapshot_merge(s->ti) && snap_merge)
373 *snap_merge = s;
374 if (!bdev_equal(s->cow->bdev, snap->cow->bdev))
375 continue;
376
377 down_read(&s->lock);
378 active = s->active;
379 up_read(&s->lock);
380
381 if (active) {
382 if (snap_src)
383 *snap_src = s;
384 } else if (snap_dest)
385 *snap_dest = s;
386
387 count++;
388 }
389
390out:
391 return count;
392}
393
394/*
395 * On success, returns 1 if this snapshot is a handover destination,
396 * otherwise returns 0.
397 */
398static int __validate_exception_handover(struct dm_snapshot *snap)
399{
400 struct dm_snapshot *snap_src = NULL, *snap_dest = NULL;
401 struct dm_snapshot *snap_merge = NULL;
402
403 /* Does snapshot need exceptions handed over to it? */
404 if ((__find_snapshots_sharing_cow(snap, &snap_src, &snap_dest,
405 &snap_merge) == 2) ||
406 snap_dest) {
407 snap->ti->error = "Snapshot cow pairing for exception "
408 "table handover failed";
409 return -EINVAL;
410 }
411
412 /*
413 * If no snap_src was found, snap cannot become a handover
414 * destination.
415 */
416 if (!snap_src)
417 return 0;
418
419 /*
420 * Non-snapshot-merge handover?
421 */
422 if (!dm_target_is_snapshot_merge(snap->ti))
423 return 1;
424
425 /*
426 * Do not allow more than one merging snapshot.
427 */
428 if (snap_merge) {
429 snap->ti->error = "A snapshot is already merging.";
430 return -EINVAL;
431 }
432
433 if (!snap_src->store->type->prepare_merge ||
434 !snap_src->store->type->commit_merge) {
435 snap->ti->error = "Snapshot exception store does not "
436 "support snapshot-merge.";
437 return -EINVAL;
438 }
439
440 return 1;
441}
442
443static void __insert_snapshot(struct origin *o, struct dm_snapshot *s)
444{
445 struct dm_snapshot *l;
446
447 /* Sort the list according to chunk size, largest-first smallest-last */
448 list_for_each_entry(l, &o->snapshots, list)
449 if (l->store->chunk_size < s->store->chunk_size)
450 break;
451 list_add_tail(&s->list, &l->list);
452}
453
454/*
294 * Make a note of the snapshot and its origin so we can look it 455 * Make a note of the snapshot and its origin so we can look it
295 * up when the origin has a write on it. 456 * up when the origin has a write on it.
457 *
458 * Also validate snapshot exception store handovers.
459 * On success, returns 1 if this registration is a handover destination,
460 * otherwise returns 0.
296 */ 461 */
297static int register_snapshot(struct dm_snapshot *snap) 462static int register_snapshot(struct dm_snapshot *snap)
298{ 463{
299 struct dm_snapshot *l; 464 struct origin *o, *new_o = NULL;
300 struct origin *o, *new_o;
301 struct block_device *bdev = snap->origin->bdev; 465 struct block_device *bdev = snap->origin->bdev;
466 int r = 0;
302 467
303 new_o = kmalloc(sizeof(*new_o), GFP_KERNEL); 468 new_o = kmalloc(sizeof(*new_o), GFP_KERNEL);
304 if (!new_o) 469 if (!new_o)
305 return -ENOMEM; 470 return -ENOMEM;
306 471
307 down_write(&_origins_lock); 472 down_write(&_origins_lock);
308 o = __lookup_origin(bdev);
309 473
474 r = __validate_exception_handover(snap);
475 if (r < 0) {
476 kfree(new_o);
477 goto out;
478 }
479
480 o = __lookup_origin(bdev);
310 if (o) 481 if (o)
311 kfree(new_o); 482 kfree(new_o);
312 else { 483 else {
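
The comment above __find_snapshots_sharing_cow() enumerates the (count, snap_src, snap_dest) states a caller can observe, and __validate_exception_handover() turns them into a three-way decision for a snapshot that is being registered. A small userspace sketch of that decision table; the struct and validate_handover() are stand-ins, and the snapshot-merge specific checks of the real function are left out:

#include <stdio.h>

/* One observable state from the table above __find_snapshots_sharing_cow(). */
struct cow_sharing { int count; int have_src; int have_dest; };

/*
 * Simplified model of __validate_exception_handover():
 *  -1 -> reject the table load ("cow pairing ... failed"),
 *   0 -> plain snapshot, nothing to hand over,
 *   1 -> the registering snapshot becomes a handover destination.
 */
static int validate_handover(struct cow_sharing s)
{
	if (s.count == 2 || s.have_dest)
		return -1;	/* two sharers, or a destination already exists */
	if (!s.have_src)
		return 0;	/* first snapshot using this cow device */
	return 1;		/* an active source exists: take over from it */
}

int main(void)
{
	struct cow_sharing first_snapshot   = { 0, 0, 0 };
	struct cow_sharing active_src_found = { 1, 1, 0 };
	struct cow_sharing handover_pending = { 2, 1, 1 };

	printf("first=%d src-found=%d pending=%d\n",	/* 0 1 -1 */
	       validate_handover(first_snapshot),
	       validate_handover(active_src_found),
	       validate_handover(handover_pending));
	return 0;
}
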
@@ -320,14 +491,27 @@ static int register_snapshot(struct dm_snapshot *snap)
320 __insert_origin(o); 491 __insert_origin(o);
321 } 492 }
322 493
323 /* Sort the list according to chunk size, largest-first smallest-last */ 494 __insert_snapshot(o, snap);
324 list_for_each_entry(l, &o->snapshots, list) 495
325 if (l->store->chunk_size < snap->store->chunk_size) 496out:
326 break; 497 up_write(&_origins_lock);
327 list_add_tail(&snap->list, &l->list); 498
499 return r;
500}
501
502/*
503 * Move snapshot to correct place in list according to chunk size.
504 */
505static void reregister_snapshot(struct dm_snapshot *s)
506{
507 struct block_device *bdev = s->origin->bdev;
508
509 down_write(&_origins_lock);
510
511 list_del(&s->list);
512 __insert_snapshot(__lookup_origin(bdev), s);
328 513
329 up_write(&_origins_lock); 514 up_write(&_origins_lock);
330 return 0;
331} 515}
332 516
333static void unregister_snapshot(struct dm_snapshot *s) 517static void unregister_snapshot(struct dm_snapshot *s)
@@ -338,7 +522,7 @@ static void unregister_snapshot(struct dm_snapshot *s)
338 o = __lookup_origin(s->origin->bdev); 522 o = __lookup_origin(s->origin->bdev);
339 523
340 list_del(&s->list); 524 list_del(&s->list);
341 if (list_empty(&o->snapshots)) { 525 if (o && list_empty(&o->snapshots)) {
342 list_del(&o->hash_list); 526 list_del(&o->hash_list);
343 kfree(o); 527 kfree(o);
344 } 528 }
@@ -351,8 +535,8 @@ static void unregister_snapshot(struct dm_snapshot *s)
351 * The lowest hash_shift bits of the chunk number are ignored, allowing 535 * The lowest hash_shift bits of the chunk number are ignored, allowing
352 * some consecutive chunks to be grouped together. 536 * some consecutive chunks to be grouped together.
353 */ 537 */
354static int init_exception_table(struct exception_table *et, uint32_t size, 538static int dm_exception_table_init(struct dm_exception_table *et,
355 unsigned hash_shift) 539 uint32_t size, unsigned hash_shift)
356{ 540{
357 unsigned int i; 541 unsigned int i;
358 542
@@ -368,10 +552,11 @@ static int init_exception_table(struct exception_table *et, uint32_t size,
368 return 0; 552 return 0;
369} 553}
370 554
371static void exit_exception_table(struct exception_table *et, struct kmem_cache *mem) 555static void dm_exception_table_exit(struct dm_exception_table *et,
556 struct kmem_cache *mem)
372{ 557{
373 struct list_head *slot; 558 struct list_head *slot;
374 struct dm_snap_exception *ex, *next; 559 struct dm_exception *ex, *next;
375 int i, size; 560 int i, size;
376 561
377 size = et->hash_mask + 1; 562 size = et->hash_mask + 1;
@@ -385,19 +570,12 @@ static void exit_exception_table(struct exception_table *et, struct kmem_cache *
385 vfree(et->table); 570 vfree(et->table);
386} 571}
387 572
388static uint32_t exception_hash(struct exception_table *et, chunk_t chunk) 573static uint32_t exception_hash(struct dm_exception_table *et, chunk_t chunk)
389{ 574{
390 return (chunk >> et->hash_shift) & et->hash_mask; 575 return (chunk >> et->hash_shift) & et->hash_mask;
391} 576}
392 577
393static void insert_exception(struct exception_table *eh, 578static void dm_remove_exception(struct dm_exception *e)
394 struct dm_snap_exception *e)
395{
396 struct list_head *l = &eh->table[exception_hash(eh, e->old_chunk)];
397 list_add(&e->hash_list, l);
398}
399
400static void remove_exception(struct dm_snap_exception *e)
401{ 579{
402 list_del(&e->hash_list); 580 list_del(&e->hash_list);
403} 581}
@@ -406,11 +584,11 @@ static void remove_exception(struct dm_snap_exception *e)
406 * Return the exception data for a sector, or NULL if not 584 * Return the exception data for a sector, or NULL if not
407 * remapped. 585 * remapped.
408 */ 586 */
409static struct dm_snap_exception *lookup_exception(struct exception_table *et, 587static struct dm_exception *dm_lookup_exception(struct dm_exception_table *et,
410 chunk_t chunk) 588 chunk_t chunk)
411{ 589{
412 struct list_head *slot; 590 struct list_head *slot;
413 struct dm_snap_exception *e; 591 struct dm_exception *e;
414 592
415 slot = &et->table[exception_hash(et, chunk)]; 593 slot = &et->table[exception_hash(et, chunk)];
416 list_for_each_entry (e, slot, hash_list) 594 list_for_each_entry (e, slot, hash_list)
@@ -421,9 +599,9 @@ static struct dm_snap_exception *lookup_exception(struct exception_table *et,
421 return NULL; 599 return NULL;
422} 600}
423 601
424static struct dm_snap_exception *alloc_exception(void) 602static struct dm_exception *alloc_completed_exception(void)
425{ 603{
426 struct dm_snap_exception *e; 604 struct dm_exception *e;
427 605
428 e = kmem_cache_alloc(exception_cache, GFP_NOIO); 606 e = kmem_cache_alloc(exception_cache, GFP_NOIO);
429 if (!e) 607 if (!e)
@@ -432,7 +610,7 @@ static struct dm_snap_exception *alloc_exception(void)
432 return e; 610 return e;
433} 611}
434 612
435static void free_exception(struct dm_snap_exception *e) 613static void free_completed_exception(struct dm_exception *e)
436{ 614{
437 kmem_cache_free(exception_cache, e); 615 kmem_cache_free(exception_cache, e);
438} 616}
@@ -457,12 +635,11 @@ static void free_pending_exception(struct dm_snap_pending_exception *pe)
457 atomic_dec(&s->pending_exceptions_count); 635 atomic_dec(&s->pending_exceptions_count);
458} 636}
459 637
460static void insert_completed_exception(struct dm_snapshot *s, 638static void dm_insert_exception(struct dm_exception_table *eh,
461 struct dm_snap_exception *new_e) 639 struct dm_exception *new_e)
462{ 640{
463 struct exception_table *eh = &s->complete;
464 struct list_head *l; 641 struct list_head *l;
465 struct dm_snap_exception *e = NULL; 642 struct dm_exception *e = NULL;
466 643
467 l = &eh->table[exception_hash(eh, new_e->old_chunk)]; 644 l = &eh->table[exception_hash(eh, new_e->old_chunk)];
468 645
@@ -478,7 +655,7 @@ static void insert_completed_exception(struct dm_snapshot *s,
478 new_e->new_chunk == (dm_chunk_number(e->new_chunk) + 655 new_e->new_chunk == (dm_chunk_number(e->new_chunk) +
479 dm_consecutive_chunk_count(e) + 1)) { 656 dm_consecutive_chunk_count(e) + 1)) {
480 dm_consecutive_chunk_count_inc(e); 657 dm_consecutive_chunk_count_inc(e);
481 free_exception(new_e); 658 free_completed_exception(new_e);
482 return; 659 return;
483 } 660 }
484 661
@@ -488,7 +665,7 @@ static void insert_completed_exception(struct dm_snapshot *s,
488 dm_consecutive_chunk_count_inc(e); 665 dm_consecutive_chunk_count_inc(e);
489 e->old_chunk--; 666 e->old_chunk--;
490 e->new_chunk--; 667 e->new_chunk--;
491 free_exception(new_e); 668 free_completed_exception(new_e);
492 return; 669 return;
493 } 670 }
494 671
@@ -507,9 +684,9 @@ out:
507static int dm_add_exception(void *context, chunk_t old, chunk_t new) 684static int dm_add_exception(void *context, chunk_t old, chunk_t new)
508{ 685{
509 struct dm_snapshot *s = context; 686 struct dm_snapshot *s = context;
510 struct dm_snap_exception *e; 687 struct dm_exception *e;
511 688
512 e = alloc_exception(); 689 e = alloc_completed_exception();
513 if (!e) 690 if (!e)
514 return -ENOMEM; 691 return -ENOMEM;
515 692
@@ -518,11 +695,30 @@ static int dm_add_exception(void *context, chunk_t old, chunk_t new)
518 /* Consecutive_count is implicitly initialised to zero */ 695 /* Consecutive_count is implicitly initialised to zero */
519 e->new_chunk = new; 696 e->new_chunk = new;
520 697
521 insert_completed_exception(s, e); 698 dm_insert_exception(&s->complete, e);
522 699
523 return 0; 700 return 0;
524} 701}
525 702
703#define min_not_zero(l, r) (((l) == 0) ? (r) : (((r) == 0) ? (l) : min(l, r)))
704
705/*
706 * Return a minimum chunk size of all snapshots that have the specified origin.
707 * Return zero if the origin has no snapshots.
708 */
709static sector_t __minimum_chunk_size(struct origin *o)
710{
711 struct dm_snapshot *snap;
712 unsigned chunk_size = 0;
713
714 if (o)
715 list_for_each_entry(snap, &o->snapshots, list)
716 chunk_size = min_not_zero(chunk_size,
717 snap->store->chunk_size);
718
719 return chunk_size;
720}
721
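A note on the helper above: min_not_zero() treats zero as "no value yet" rather than as a candidate minimum, which is what lets __minimum_chunk_size() start its scan from zero and still return zero when the origin has no snapshots. (The relocated copy of the macro also parenthesises its arguments, unlike the version this patch removes further down near origin_map().) A minimal user-space sketch of the same selection logic, where the chunk_sizes[] array is a made-up stand-in for the origin's snapshot list:

#include <stdio.h>

#define min(a, b) ((a) < (b) ? (a) : (b))
/* Zero means "no value yet"; otherwise take the smaller of the two. */
#define min_not_zero(l, r) (((l) == 0) ? (r) : (((r) == 0) ? (l) : min(l, r)))

int main(void)
{
    /* Hypothetical per-snapshot chunk sizes, in sectors. */
    unsigned chunk_sizes[] = { 16, 8, 32 };
    unsigned i, chunk_size = 0;

    for (i = 0; i < 3; i++)
        chunk_size = min_not_zero(chunk_size, chunk_sizes[i]);

    printf("minimum chunk size: %u\n", chunk_size);  /* prints 8 */
    return 0;
}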
526/* 722/*
527 * Hard coded magic. 723 * Hard coded magic.
528 */ 724 */
@@ -546,16 +742,18 @@ static int init_hash_tables(struct dm_snapshot *s)
546 * Calculate based on the size of the original volume or 742 * Calculate based on the size of the original volume or
547 * the COW volume... 743 * the COW volume...
548 */ 744 */
549 cow_dev_size = get_dev_size(s->store->cow->bdev); 745 cow_dev_size = get_dev_size(s->cow->bdev);
550 origin_dev_size = get_dev_size(s->origin->bdev); 746 origin_dev_size = get_dev_size(s->origin->bdev);
551 max_buckets = calc_max_buckets(); 747 max_buckets = calc_max_buckets();
552 748
553 hash_size = min(origin_dev_size, cow_dev_size) >> s->store->chunk_shift; 749 hash_size = min(origin_dev_size, cow_dev_size) >> s->store->chunk_shift;
554 hash_size = min(hash_size, max_buckets); 750 hash_size = min(hash_size, max_buckets);
555 751
752 if (hash_size < 64)
753 hash_size = 64;
556 hash_size = rounddown_pow_of_two(hash_size); 754 hash_size = rounddown_pow_of_two(hash_size);
557 if (init_exception_table(&s->complete, hash_size, 755 if (dm_exception_table_init(&s->complete, hash_size,
558 DM_CHUNK_CONSECUTIVE_BITS)) 756 DM_CHUNK_CONSECUTIVE_BITS))
559 return -ENOMEM; 757 return -ENOMEM;
560 758
561 /* 759 /*
@@ -566,14 +764,284 @@ static int init_hash_tables(struct dm_snapshot *s)
566 if (hash_size < 64) 764 if (hash_size < 64)
567 hash_size = 64; 765 hash_size = 64;
568 766
569 if (init_exception_table(&s->pending, hash_size, 0)) { 767 if (dm_exception_table_init(&s->pending, hash_size, 0)) {
570 exit_exception_table(&s->complete, exception_cache); 768 dm_exception_table_exit(&s->complete, exception_cache);
571 return -ENOMEM; 769 return -ENOMEM;
572 } 770 }
573 771
574 return 0; 772 return 0;
575} 773}
576 774
775static void merge_shutdown(struct dm_snapshot *s)
776{
777 clear_bit_unlock(RUNNING_MERGE, &s->state_bits);
778 smp_mb__after_clear_bit();
779 wake_up_bit(&s->state_bits, RUNNING_MERGE);
780}
781
782static struct bio *__release_queued_bios_after_merge(struct dm_snapshot *s)
783{
784 s->first_merging_chunk = 0;
785 s->num_merging_chunks = 0;
786
787 return bio_list_get(&s->bios_queued_during_merge);
788}
789
790/*
791 * Remove one chunk from the index of completed exceptions.
792 */
793static int __remove_single_exception_chunk(struct dm_snapshot *s,
794 chunk_t old_chunk)
795{
796 struct dm_exception *e;
797
798 e = dm_lookup_exception(&s->complete, old_chunk);
799 if (!e) {
800 DMERR("Corruption detected: exception for block %llu is "
801 "on disk but not in memory",
802 (unsigned long long)old_chunk);
803 return -EINVAL;
804 }
805
806 /*
807 * If this is the only chunk using this exception, remove exception.
808 */
809 if (!dm_consecutive_chunk_count(e)) {
810 dm_remove_exception(e);
811 free_completed_exception(e);
812 return 0;
813 }
814
815 /*
816 * The chunk may be either at the beginning or the end of a
817 * group of consecutive chunks - never in the middle. We are
818 * removing chunks in the opposite order to that in which they
819 * were added, so this should always be true.
820 * Decrement the consecutive chunk counter and adjust the
821 * starting point if necessary.
822 */
823 if (old_chunk == e->old_chunk) {
824 e->old_chunk++;
825 e->new_chunk++;
826 } else if (old_chunk != e->old_chunk +
827 dm_consecutive_chunk_count(e)) {
828 DMERR("Attempt to merge block %llu from the "
829 "middle of a chunk range [%llu - %llu]",
830 (unsigned long long)old_chunk,
831 (unsigned long long)e->old_chunk,
832 (unsigned long long)
833 e->old_chunk + dm_consecutive_chunk_count(e));
834 return -EINVAL;
835 }
836
837 dm_consecutive_chunk_count_dec(e);
838
839 return 0;
840}
841
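The rule spelled out in the comment above depends on how completed exceptions encode ranges: old_chunk/new_chunk name the first chunk of the range and the consecutive count says how many further chunks follow, so a chunk can only be peeled off either end, never the middle. A rough user-space model of that accounting (the struct and function names here are invented for illustration and are not the driver's types):

#include <stdio.h>

/* Simplified stand-in for a completed exception covering a chunk range. */
struct range {
    unsigned long long old_chunk;   /* first origin chunk in the range */
    unsigned long long new_chunk;   /* first COW chunk in the range    */
    unsigned count;                 /* extra consecutive chunks        */
};

/* Returns 0 on success, -1 if the chunk lies in the middle of the range. */
static int remove_chunk(struct range *e, unsigned long long chunk)
{
    if (!e->count)
        return 0;                   /* caller would free the whole exception */
    if (chunk == e->old_chunk) {
        e->old_chunk++;             /* shrink from the front */
        e->new_chunk++;
    } else if (chunk != e->old_chunk + e->count) {
        return -1;                  /* middle of the range: refused */
    }
    e->count--;                     /* shrink from either end */
    return 0;
}

int main(void)
{
    struct range e = { .old_chunk = 10, .new_chunk = 100, .count = 3 };

    remove_chunk(&e, 13);           /* range becomes [10..12] */
    remove_chunk(&e, 10);           /* range becomes [11..12] */
    printf("now [%llu..%llu]\n", e.old_chunk, e.old_chunk + e.count);
    return 0;
}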
842static void flush_bios(struct bio *bio);
843
844static int remove_single_exception_chunk(struct dm_snapshot *s)
845{
846 struct bio *b = NULL;
847 int r;
848 chunk_t old_chunk = s->first_merging_chunk + s->num_merging_chunks - 1;
849
850 down_write(&s->lock);
851
852 /*
853 * Process chunks (and associated exceptions) in reverse order
854 * so that dm_consecutive_chunk_count_dec() accounting works.
855 */
856 do {
857 r = __remove_single_exception_chunk(s, old_chunk);
858 if (r)
859 goto out;
860 } while (old_chunk-- > s->first_merging_chunk);
861
862 b = __release_queued_bios_after_merge(s);
863
864out:
865 up_write(&s->lock);
866 if (b)
867 flush_bios(b);
868
869 return r;
870}
871
872static int origin_write_extent(struct dm_snapshot *merging_snap,
873 sector_t sector, unsigned chunk_size);
874
875static void merge_callback(int read_err, unsigned long write_err,
876 void *context);
877
878static uint64_t read_pending_exceptions_done_count(void)
879{
880 uint64_t pending_exceptions_done;
881
882 spin_lock(&_pending_exceptions_done_spinlock);
883 pending_exceptions_done = _pending_exceptions_done_count;
884 spin_unlock(&_pending_exceptions_done_spinlock);
885
886 return pending_exceptions_done;
887}
888
889static void increment_pending_exceptions_done_count(void)
890{
891 spin_lock(&_pending_exceptions_done_spinlock);
892 _pending_exceptions_done_count++;
893 spin_unlock(&_pending_exceptions_done_spinlock);
894
895 wake_up_all(&_pending_exceptions_done);
896}
897
898static void snapshot_merge_next_chunks(struct dm_snapshot *s)
899{
900 int i, linear_chunks;
901 chunk_t old_chunk, new_chunk;
902 struct dm_io_region src, dest;
903 sector_t io_size;
904 uint64_t previous_count;
905
906 BUG_ON(!test_bit(RUNNING_MERGE, &s->state_bits));
907 if (unlikely(test_bit(SHUTDOWN_MERGE, &s->state_bits)))
908 goto shut;
909
910 /*
911 * valid flag never changes during merge, so no lock required.
912 */
913 if (!s->valid) {
914 DMERR("Snapshot is invalid: can't merge");
915 goto shut;
916 }
917
918 linear_chunks = s->store->type->prepare_merge(s->store, &old_chunk,
919 &new_chunk);
920 if (linear_chunks <= 0) {
921 if (linear_chunks < 0) {
922 DMERR("Read error in exception store: "
923 "shutting down merge");
924 down_write(&s->lock);
925 s->merge_failed = 1;
926 up_write(&s->lock);
927 }
928 goto shut;
929 }
930
931 /* Adjust old_chunk and new_chunk to reflect start of linear region */
932 old_chunk = old_chunk + 1 - linear_chunks;
933 new_chunk = new_chunk + 1 - linear_chunks;
934
935 /*
936 * Use one (potentially large) I/O to copy all 'linear_chunks'
937 * from the exception store to the origin
938 */
939 io_size = linear_chunks * s->store->chunk_size;
940
941 dest.bdev = s->origin->bdev;
942 dest.sector = chunk_to_sector(s->store, old_chunk);
943 dest.count = min(io_size, get_dev_size(dest.bdev) - dest.sector);
944
945 src.bdev = s->cow->bdev;
946 src.sector = chunk_to_sector(s->store, new_chunk);
947 src.count = dest.count;
948
949 /*
950 * Reallocate any exceptions needed in other snapshots then
951 * wait for the pending exceptions to complete.
952 * Each time any pending exception (globally on the system)
953 * completes we are woken and repeat the process to find out
954 * if we can proceed. While this may not seem a particularly
955 * efficient algorithm, it is not expected to have any
956 * significant impact on performance.
957 */
958 previous_count = read_pending_exceptions_done_count();
959 while (origin_write_extent(s, dest.sector, io_size)) {
960 wait_event(_pending_exceptions_done,
961 (read_pending_exceptions_done_count() !=
962 previous_count));
963 /* Retry after the wait, until all exceptions are done. */
964 previous_count = read_pending_exceptions_done_count();
965 }
966
967 down_write(&s->lock);
968 s->first_merging_chunk = old_chunk;
969 s->num_merging_chunks = linear_chunks;
970 up_write(&s->lock);
971
972 /* Wait until writes to all 'linear_chunks' drain */
973 for (i = 0; i < linear_chunks; i++)
974 __check_for_conflicting_io(s, old_chunk + i);
975
976 dm_kcopyd_copy(s->kcopyd_client, &src, 1, &dest, 0, merge_callback, s);
977 return;
978
979shut:
980 merge_shutdown(s);
981}
982
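The retry loop in snapshot_merge_next_chunks() above follows a simple pattern: take a snapshot of the global done-counter, attempt the operation, and if anything is still outstanding sleep until the counter moves, then re-check. A reduced pthread sketch of that pattern under invented names; the driver itself uses a kernel wait queue and a spinlock, not a mutex/condvar:

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t done_cond = PTHREAD_COND_INITIALIZER;
static unsigned long long done_count;
static int outstanding = 3;         /* pretend three exceptions are in flight */

static void *worker(void *arg)
{
    (void)arg;
    while (1) {
        sleep(1);
        pthread_mutex_lock(&lock);
        if (!outstanding) {
            pthread_mutex_unlock(&lock);
            break;
        }
        outstanding--;
        done_count++;               /* like increment_pending_exceptions_done_count() */
        pthread_cond_broadcast(&done_cond);
        pthread_mutex_unlock(&lock);
    }
    return NULL;
}

int main(void)
{
    pthread_t t;
    unsigned long long previous;

    pthread_create(&t, NULL, worker, NULL);

    pthread_mutex_lock(&lock);
    previous = done_count;
    while (outstanding) {                       /* may we proceed yet? */
        while (done_count == previous)          /* wait for any completion */
            pthread_cond_wait(&done_cond, &lock);
        previous = done_count;                  /* then re-check the condition */
    }
    pthread_mutex_unlock(&lock);
    pthread_join(t, NULL);
    printf("all simulated exceptions done, merge could proceed\n");
    return 0;
}

Built with -lpthread, the sketch prints its final message once the third simulated completion bumps the counter.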
983static void error_bios(struct bio *bio);
984
985static void merge_callback(int read_err, unsigned long write_err, void *context)
986{
987 struct dm_snapshot *s = context;
988 struct bio *b = NULL;
989
990 if (read_err || write_err) {
991 if (read_err)
992 DMERR("Read error: shutting down merge.");
993 else
994 DMERR("Write error: shutting down merge.");
995 goto shut;
996 }
997
998 if (s->store->type->commit_merge(s->store,
999 s->num_merging_chunks) < 0) {
1000 DMERR("Write error in exception store: shutting down merge");
1001 goto shut;
1002 }
1003
1004 if (remove_single_exception_chunk(s) < 0)
1005 goto shut;
1006
1007 snapshot_merge_next_chunks(s);
1008
1009 return;
1010
1011shut:
1012 down_write(&s->lock);
1013 s->merge_failed = 1;
1014 b = __release_queued_bios_after_merge(s);
1015 up_write(&s->lock);
1016 error_bios(b);
1017
1018 merge_shutdown(s);
1019}
1020
1021static void start_merge(struct dm_snapshot *s)
1022{
1023 if (!test_and_set_bit(RUNNING_MERGE, &s->state_bits))
1024 snapshot_merge_next_chunks(s);
1025}
1026
1027static int wait_schedule(void *ptr)
1028{
1029 schedule();
1030
1031 return 0;
1032}
1033
1034/*
1035 * Stop the merging process and wait until it finishes.
1036 */
1037static void stop_merge(struct dm_snapshot *s)
1038{
1039 set_bit(SHUTDOWN_MERGE, &s->state_bits);
1040 wait_on_bit(&s->state_bits, RUNNING_MERGE, wait_schedule,
1041 TASK_UNINTERRUPTIBLE);
1042 clear_bit(SHUTDOWN_MERGE, &s->state_bits);
1043}
1044
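start_merge() and stop_merge() above coordinate through two state bits: RUNNING_MERGE ensures only one merge pass runs at a time, and SHUTDOWN_MERGE asks a running pass to stop at the next opportunity before being cleared again. A toy single-threaded model of that handshake using C11 atomics; names and structure are illustrative only, and the driver sleeps on wait_on_bit() rather than spinning:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_bool running;             /* RUNNING_MERGE stand-in  */
static atomic_bool shutdown_requested;  /* SHUTDOWN_MERGE stand-in */

static void merge_one_pass(void)
{
    int chunk;

    for (chunk = 0; chunk < 8; chunk++) {
        if (atomic_load(&shutdown_requested)) {
            printf("merge stopping early at chunk %d\n", chunk);
            break;
        }
        /* ... copy one group of chunks back to the origin ... */
    }
    atomic_store(&running, false);      /* like merge_shutdown() */
}

static void start_merge(void)
{
    /* Only start if no merge is already running. */
    if (!atomic_exchange(&running, true))
        merge_one_pass();
}

static void stop_merge(void)
{
    atomic_store(&shutdown_requested, true);
    while (atomic_load(&running))
        ;                               /* the driver waits on a wait queue instead */
    atomic_store(&shutdown_requested, false);
}

int main(void)
{
    start_merge();
    stop_merge();
    printf("merge stopped\n");
    return 0;
}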
577/* 1045/*
578 * Construct a snapshot mapping: <origin_dev> <COW-dev> <p/n> <chunk-size> 1046 * Construct a snapshot mapping: <origin_dev> <COW-dev> <p/n> <chunk-size>
579 */ 1047 */
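The constructor below, which this patch shares between the snapshot target and the new merge target, expects exactly the four arguments named in the comment: origin device, COW device, a persistence flag and the chunk size in sectors (the logical start and length are supplied separately in the table line and arrive as ti->len). A small sketch that only formats such a table line, with made-up device paths and sizes; in practice the line is fed to dmsetup:

#include <stdio.h>

int main(void)
{
    /* Illustrative values only. */
    const char *origin = "/dev/vg0/base";
    const char *cow = "/dev/vg0/base-cow";
    unsigned long long length_sectors = 2097152;    /* 1 GiB */
    unsigned chunk_sectors = 16;                    /* 8 KiB chunks */

    /* <start> <length> snapshot <origin> <COW> <P|N> <chunk-size> */
    printf("0 %llu snapshot %s %s P %u\n",
           length_sectors, origin, cow, chunk_sectors);
    return 0;
}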
@@ -582,50 +1050,72 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
582 struct dm_snapshot *s; 1050 struct dm_snapshot *s;
583 int i; 1051 int i;
584 int r = -EINVAL; 1052 int r = -EINVAL;
585 char *origin_path; 1053 char *origin_path, *cow_path;
586 struct dm_exception_store *store; 1054 unsigned args_used, num_flush_requests = 1;
587 unsigned args_used; 1055 fmode_t origin_mode = FMODE_READ;
588 1056
589 if (argc != 4) { 1057 if (argc != 4) {
590 ti->error = "requires exactly 4 arguments"; 1058 ti->error = "requires exactly 4 arguments";
591 r = -EINVAL; 1059 r = -EINVAL;
592 goto bad_args; 1060 goto bad;
1061 }
1062
1063 if (dm_target_is_snapshot_merge(ti)) {
1064 num_flush_requests = 2;
1065 origin_mode = FMODE_WRITE;
593 } 1066 }
594 1067
595 origin_path = argv[0]; 1068 origin_path = argv[0];
596 argv++; 1069 argv++;
597 argc--; 1070 argc--;
598 1071
599 r = dm_exception_store_create(ti, argc, argv, &args_used, &store); 1072 s = kmalloc(sizeof(*s), GFP_KERNEL);
1073 if (!s) {
1074 ti->error = "Cannot allocate snapshot context private "
1075 "structure";
1076 r = -ENOMEM;
1077 goto bad;
1078 }
1079
1080 cow_path = argv[0];
1081 argv++;
1082 argc--;
1083
1084 r = dm_get_device(ti, cow_path, FMODE_READ | FMODE_WRITE, &s->cow);
1085 if (r) {
1086 ti->error = "Cannot get COW device";
1087 goto bad_cow;
1088 }
1089
1090 r = dm_exception_store_create(ti, argc, argv, s, &args_used, &s->store);
600 if (r) { 1091 if (r) {
601 ti->error = "Couldn't create exception store"; 1092 ti->error = "Couldn't create exception store";
602 r = -EINVAL; 1093 r = -EINVAL;
603 goto bad_args; 1094 goto bad_store;
604 } 1095 }
605 1096
606 argv += args_used; 1097 argv += args_used;
607 argc -= args_used; 1098 argc -= args_used;
608 1099
609 s = kmalloc(sizeof(*s), GFP_KERNEL); 1100 r = dm_get_device(ti, origin_path, origin_mode, &s->origin);
610 if (!s) {
611 ti->error = "Cannot allocate snapshot context private "
612 "structure";
613 r = -ENOMEM;
614 goto bad_snap;
615 }
616
617 r = dm_get_device(ti, origin_path, 0, ti->len, FMODE_READ, &s->origin);
618 if (r) { 1101 if (r) {
619 ti->error = "Cannot get origin device"; 1102 ti->error = "Cannot get origin device";
620 goto bad_origin; 1103 goto bad_origin;
621 } 1104 }
622 1105
623 s->store = store; 1106 s->ti = ti;
624 s->valid = 1; 1107 s->valid = 1;
625 s->active = 0; 1108 s->active = 0;
1109 s->suspended = 0;
626 atomic_set(&s->pending_exceptions_count, 0); 1110 atomic_set(&s->pending_exceptions_count, 0);
627 init_rwsem(&s->lock); 1111 init_rwsem(&s->lock);
1112 INIT_LIST_HEAD(&s->list);
628 spin_lock_init(&s->pe_lock); 1113 spin_lock_init(&s->pe_lock);
1114 s->state_bits = 0;
1115 s->merge_failed = 0;
1116 s->first_merging_chunk = 0;
1117 s->num_merging_chunks = 0;
1118 bio_list_init(&s->bios_queued_during_merge);
629 1119
630 /* Allocate hash table for COW data */ 1120 /* Allocate hash table for COW data */
631 if (init_hash_tables(s)) { 1121 if (init_hash_tables(s)) {
@@ -659,39 +1149,55 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
659 1149
660 spin_lock_init(&s->tracked_chunk_lock); 1150 spin_lock_init(&s->tracked_chunk_lock);
661 1151
662 /* Metadata must only be loaded into one table at once */ 1152 bio_list_init(&s->queued_bios);
1153 INIT_WORK(&s->queued_bios_work, flush_queued_bios);
1154
1155 ti->private = s;
1156 ti->num_flush_requests = num_flush_requests;
1157
1158 /* Add snapshot to the list of snapshots for this origin */
1159 /* Exceptions aren't triggered till snapshot_resume() is called */
1160 r = register_snapshot(s);
1161 if (r == -ENOMEM) {
1162 ti->error = "Snapshot origin struct allocation failed";
1163 goto bad_load_and_register;
1164 } else if (r < 0) {
1165 /* invalid handover, register_snapshot has set ti->error */
1166 goto bad_load_and_register;
1167 }
1168
1169 /*
1170 * Metadata must only be loaded into one table at once, so skip this
1171 * if metadata will be handed over during resume.
1172 * Chunk size will be set during the handover - set it to zero to
1173 * ensure it's ignored.
1174 */
1175 if (r > 0) {
1176 s->store->chunk_size = 0;
1177 return 0;
1178 }
1179
663 r = s->store->type->read_metadata(s->store, dm_add_exception, 1180 r = s->store->type->read_metadata(s->store, dm_add_exception,
664 (void *)s); 1181 (void *)s);
665 if (r < 0) { 1182 if (r < 0) {
666 ti->error = "Failed to read snapshot metadata"; 1183 ti->error = "Failed to read snapshot metadata";
667 goto bad_load_and_register; 1184 goto bad_read_metadata;
668 } else if (r > 0) { 1185 } else if (r > 0) {
669 s->valid = 0; 1186 s->valid = 0;
670 DMWARN("Snapshot is marked invalid."); 1187 DMWARN("Snapshot is marked invalid.");
671 } 1188 }
672 1189
673 bio_list_init(&s->queued_bios);
674 INIT_WORK(&s->queued_bios_work, flush_queued_bios);
675
676 if (!s->store->chunk_size) { 1190 if (!s->store->chunk_size) {
677 ti->error = "Chunk size not set"; 1191 ti->error = "Chunk size not set";
678 goto bad_load_and_register; 1192 goto bad_read_metadata;
679 } 1193 }
680
681 /* Add snapshot to the list of snapshots for this origin */
682 /* Exceptions aren't triggered till snapshot_resume() is called */
683 if (register_snapshot(s)) {
684 r = -EINVAL;
685 ti->error = "Cannot register snapshot origin";
686 goto bad_load_and_register;
687 }
688
689 ti->private = s;
690 ti->split_io = s->store->chunk_size; 1194 ti->split_io = s->store->chunk_size;
691 ti->num_flush_requests = 1;
692 1195
693 return 0; 1196 return 0;
694 1197
1198bad_read_metadata:
1199 unregister_snapshot(s);
1200
695bad_load_and_register: 1201bad_load_and_register:
696 mempool_destroy(s->tracked_chunk_pool); 1202 mempool_destroy(s->tracked_chunk_pool);
697 1203
@@ -702,19 +1208,22 @@ bad_pending_pool:
702 dm_kcopyd_client_destroy(s->kcopyd_client); 1208 dm_kcopyd_client_destroy(s->kcopyd_client);
703 1209
704bad_kcopyd: 1210bad_kcopyd:
705 exit_exception_table(&s->pending, pending_cache); 1211 dm_exception_table_exit(&s->pending, pending_cache);
706 exit_exception_table(&s->complete, exception_cache); 1212 dm_exception_table_exit(&s->complete, exception_cache);
707 1213
708bad_hash_tables: 1214bad_hash_tables:
709 dm_put_device(ti, s->origin); 1215 dm_put_device(ti, s->origin);
710 1216
711bad_origin: 1217bad_origin:
712 kfree(s); 1218 dm_exception_store_destroy(s->store);
1219
1220bad_store:
1221 dm_put_device(ti, s->cow);
713 1222
714bad_snap: 1223bad_cow:
715 dm_exception_store_destroy(store); 1224 kfree(s);
716 1225
717bad_args: 1226bad:
718 return r; 1227 return r;
719} 1228}
720 1229
@@ -723,8 +1232,39 @@ static void __free_exceptions(struct dm_snapshot *s)
723 dm_kcopyd_client_destroy(s->kcopyd_client); 1232 dm_kcopyd_client_destroy(s->kcopyd_client);
724 s->kcopyd_client = NULL; 1233 s->kcopyd_client = NULL;
725 1234
726 exit_exception_table(&s->pending, pending_cache); 1235 dm_exception_table_exit(&s->pending, pending_cache);
727 exit_exception_table(&s->complete, exception_cache); 1236 dm_exception_table_exit(&s->complete, exception_cache);
1237}
1238
1239static void __handover_exceptions(struct dm_snapshot *snap_src,
1240 struct dm_snapshot *snap_dest)
1241{
1242 union {
1243 struct dm_exception_table table_swap;
1244 struct dm_exception_store *store_swap;
1245 } u;
1246
1247 /*
1248 * Swap all snapshot context information between the two instances.
1249 */
1250 u.table_swap = snap_dest->complete;
1251 snap_dest->complete = snap_src->complete;
1252 snap_src->complete = u.table_swap;
1253
1254 u.store_swap = snap_dest->store;
1255 snap_dest->store = snap_src->store;
1256 snap_src->store = u.store_swap;
1257
1258 snap_dest->store->snap = snap_dest;
1259 snap_src->store->snap = snap_src;
1260
1261 snap_dest->ti->split_io = snap_dest->store->chunk_size;
1262 snap_dest->valid = snap_src->valid;
1263
1264 /*
1265 * Set source invalid to ensure it receives no further I/O.
1266 */
1267 snap_src->valid = 0;
728} 1268}
729 1269
730static void snapshot_dtr(struct dm_target *ti) 1270static void snapshot_dtr(struct dm_target *ti)
@@ -733,9 +1273,24 @@ static void snapshot_dtr(struct dm_target *ti)
733 int i; 1273 int i;
734#endif 1274#endif
735 struct dm_snapshot *s = ti->private; 1275 struct dm_snapshot *s = ti->private;
1276 struct dm_snapshot *snap_src = NULL, *snap_dest = NULL;
736 1277
737 flush_workqueue(ksnapd); 1278 flush_workqueue(ksnapd);
738 1279
1280 down_read(&_origins_lock);
1281 /* Check whether exception handover must be cancelled */
1282 (void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL);
1283 if (snap_src && snap_dest && (s == snap_src)) {
1284 down_write(&snap_dest->lock);
1285 snap_dest->valid = 0;
1286 up_write(&snap_dest->lock);
1287 DMERR("Cancelling snapshot handover.");
1288 }
1289 up_read(&_origins_lock);
1290
1291 if (dm_target_is_snapshot_merge(ti))
1292 stop_merge(s);
1293
739 /* Prevent further origin writes from using this snapshot. */ 1294 /* Prevent further origin writes from using this snapshot. */
740 /* After this returns there can be no new kcopyd jobs. */ 1295 /* After this returns there can be no new kcopyd jobs. */
741 unregister_snapshot(s); 1296 unregister_snapshot(s);
@@ -763,6 +1318,8 @@ static void snapshot_dtr(struct dm_target *ti)
763 1318
764 dm_exception_store_destroy(s->store); 1319 dm_exception_store_destroy(s->store);
765 1320
1321 dm_put_device(ti, s->cow);
1322
766 kfree(s); 1323 kfree(s);
767} 1324}
768 1325
@@ -795,6 +1352,26 @@ static void flush_queued_bios(struct work_struct *work)
795 flush_bios(queued_bios); 1352 flush_bios(queued_bios);
796} 1353}
797 1354
1355static int do_origin(struct dm_dev *origin, struct bio *bio);
1356
1357/*
1358 * Flush a list of buffers.
1359 */
1360static void retry_origin_bios(struct dm_snapshot *s, struct bio *bio)
1361{
1362 struct bio *n;
1363 int r;
1364
1365 while (bio) {
1366 n = bio->bi_next;
1367 bio->bi_next = NULL;
1368 r = do_origin(s->origin, bio);
1369 if (r == DM_MAPIO_REMAPPED)
1370 generic_make_request(bio);
1371 bio = n;
1372 }
1373}
1374
798/* 1375/*
799 * Error a list of buffers. 1376 * Error a list of buffers.
800 */ 1377 */
@@ -825,45 +1402,12 @@ static void __invalidate_snapshot(struct dm_snapshot *s, int err)
825 1402
826 s->valid = 0; 1403 s->valid = 0;
827 1404
828 dm_table_event(s->store->ti->table); 1405 dm_table_event(s->ti->table);
829}
830
831static void get_pending_exception(struct dm_snap_pending_exception *pe)
832{
833 atomic_inc(&pe->ref_count);
834}
835
836static struct bio *put_pending_exception(struct dm_snap_pending_exception *pe)
837{
838 struct dm_snap_pending_exception *primary_pe;
839 struct bio *origin_bios = NULL;
840
841 primary_pe = pe->primary_pe;
842
843 /*
844 * If this pe is involved in a write to the origin and
845 * it is the last sibling to complete then release
846 * the bios for the original write to the origin.
847 */
848 if (primary_pe &&
849 atomic_dec_and_test(&primary_pe->ref_count)) {
850 origin_bios = bio_list_get(&primary_pe->origin_bios);
851 free_pending_exception(primary_pe);
852 }
853
854 /*
855 * Free the pe if it's not linked to an origin write or if
856 * it's not itself a primary pe.
857 */
858 if (!primary_pe || primary_pe != pe)
859 free_pending_exception(pe);
860
861 return origin_bios;
862} 1406}
863 1407
864static void pending_complete(struct dm_snap_pending_exception *pe, int success) 1408static void pending_complete(struct dm_snap_pending_exception *pe, int success)
865{ 1409{
866 struct dm_snap_exception *e; 1410 struct dm_exception *e;
867 struct dm_snapshot *s = pe->snap; 1411 struct dm_snapshot *s = pe->snap;
868 struct bio *origin_bios = NULL; 1412 struct bio *origin_bios = NULL;
869 struct bio *snapshot_bios = NULL; 1413 struct bio *snapshot_bios = NULL;
@@ -877,7 +1421,7 @@ static void pending_complete(struct dm_snap_pending_exception *pe, int success)
877 goto out; 1421 goto out;
878 } 1422 }
879 1423
880 e = alloc_exception(); 1424 e = alloc_completed_exception();
881 if (!e) { 1425 if (!e) {
882 down_write(&s->lock); 1426 down_write(&s->lock);
883 __invalidate_snapshot(s, -ENOMEM); 1427 __invalidate_snapshot(s, -ENOMEM);
@@ -888,28 +1432,27 @@ static void pending_complete(struct dm_snap_pending_exception *pe, int success)
888 1432
889 down_write(&s->lock); 1433 down_write(&s->lock);
890 if (!s->valid) { 1434 if (!s->valid) {
891 free_exception(e); 1435 free_completed_exception(e);
892 error = 1; 1436 error = 1;
893 goto out; 1437 goto out;
894 } 1438 }
895 1439
896 /* 1440 /* Check for conflicting reads */
897 * Check for conflicting reads. This is extremely improbable, 1441 __check_for_conflicting_io(s, pe->e.old_chunk);
898 * so msleep(1) is sufficient and there is no need for a wait queue.
899 */
900 while (__chunk_is_tracked(s, pe->e.old_chunk))
901 msleep(1);
902 1442
903 /* 1443 /*
904 * Add a proper exception, and remove the 1444 * Add a proper exception, and remove the
905 * in-flight exception from the list. 1445 * in-flight exception from the list.
906 */ 1446 */
907 insert_completed_exception(s, e); 1447 dm_insert_exception(&s->complete, e);
908 1448
909 out: 1449 out:
910 remove_exception(&pe->e); 1450 dm_remove_exception(&pe->e);
911 snapshot_bios = bio_list_get(&pe->snapshot_bios); 1451 snapshot_bios = bio_list_get(&pe->snapshot_bios);
912 origin_bios = put_pending_exception(pe); 1452 origin_bios = bio_list_get(&pe->origin_bios);
1453 free_pending_exception(pe);
1454
1455 increment_pending_exceptions_done_count();
913 1456
914 up_write(&s->lock); 1457 up_write(&s->lock);
915 1458
@@ -919,7 +1462,7 @@ static void pending_complete(struct dm_snap_pending_exception *pe, int success)
919 else 1462 else
920 flush_bios(snapshot_bios); 1463 flush_bios(snapshot_bios);
921 1464
922 flush_bios(origin_bios); 1465 retry_origin_bios(s, origin_bios);
923} 1466}
924 1467
925static void commit_callback(void *context, int success) 1468static void commit_callback(void *context, int success)
@@ -963,7 +1506,7 @@ static void start_copy(struct dm_snap_pending_exception *pe)
963 src.sector = chunk_to_sector(s->store, pe->e.old_chunk); 1506 src.sector = chunk_to_sector(s->store, pe->e.old_chunk);
964 src.count = min((sector_t)s->store->chunk_size, dev_size - src.sector); 1507 src.count = min((sector_t)s->store->chunk_size, dev_size - src.sector);
965 1508
966 dest.bdev = s->store->cow->bdev; 1509 dest.bdev = s->cow->bdev;
967 dest.sector = chunk_to_sector(s->store, pe->e.new_chunk); 1510 dest.sector = chunk_to_sector(s->store, pe->e.new_chunk);
968 dest.count = src.count; 1511 dest.count = src.count;
969 1512
@@ -975,7 +1518,7 @@ static void start_copy(struct dm_snap_pending_exception *pe)
975static struct dm_snap_pending_exception * 1518static struct dm_snap_pending_exception *
976__lookup_pending_exception(struct dm_snapshot *s, chunk_t chunk) 1519__lookup_pending_exception(struct dm_snapshot *s, chunk_t chunk)
977{ 1520{
978 struct dm_snap_exception *e = lookup_exception(&s->pending, chunk); 1521 struct dm_exception *e = dm_lookup_exception(&s->pending, chunk);
979 1522
980 if (!e) 1523 if (!e)
981 return NULL; 1524 return NULL;
@@ -1006,8 +1549,6 @@ __find_pending_exception(struct dm_snapshot *s,
1006 pe->e.old_chunk = chunk; 1549 pe->e.old_chunk = chunk;
1007 bio_list_init(&pe->origin_bios); 1550 bio_list_init(&pe->origin_bios);
1008 bio_list_init(&pe->snapshot_bios); 1551 bio_list_init(&pe->snapshot_bios);
1009 pe->primary_pe = NULL;
1010 atomic_set(&pe->ref_count, 0);
1011 pe->started = 0; 1552 pe->started = 0;
1012 1553
1013 if (s->store->type->prepare_exception(s->store, &pe->e)) { 1554 if (s->store->type->prepare_exception(s->store, &pe->e)) {
@@ -1015,16 +1556,15 @@ __find_pending_exception(struct dm_snapshot *s,
1015 return NULL; 1556 return NULL;
1016 } 1557 }
1017 1558
1018 get_pending_exception(pe); 1559 dm_insert_exception(&s->pending, &pe->e);
1019 insert_exception(&s->pending, &pe->e);
1020 1560
1021 return pe; 1561 return pe;
1022} 1562}
1023 1563
1024static void remap_exception(struct dm_snapshot *s, struct dm_snap_exception *e, 1564static void remap_exception(struct dm_snapshot *s, struct dm_exception *e,
1025 struct bio *bio, chunk_t chunk) 1565 struct bio *bio, chunk_t chunk)
1026{ 1566{
1027 bio->bi_bdev = s->store->cow->bdev; 1567 bio->bi_bdev = s->cow->bdev;
1028 bio->bi_sector = chunk_to_sector(s->store, 1568 bio->bi_sector = chunk_to_sector(s->store,
1029 dm_chunk_number(e->new_chunk) + 1569 dm_chunk_number(e->new_chunk) +
1030 (chunk - e->old_chunk)) + 1570 (chunk - e->old_chunk)) +
@@ -1035,14 +1575,14 @@ static void remap_exception(struct dm_snapshot *s, struct dm_snap_exception *e,
1035static int snapshot_map(struct dm_target *ti, struct bio *bio, 1575static int snapshot_map(struct dm_target *ti, struct bio *bio,
1036 union map_info *map_context) 1576 union map_info *map_context)
1037{ 1577{
1038 struct dm_snap_exception *e; 1578 struct dm_exception *e;
1039 struct dm_snapshot *s = ti->private; 1579 struct dm_snapshot *s = ti->private;
1040 int r = DM_MAPIO_REMAPPED; 1580 int r = DM_MAPIO_REMAPPED;
1041 chunk_t chunk; 1581 chunk_t chunk;
1042 struct dm_snap_pending_exception *pe = NULL; 1582 struct dm_snap_pending_exception *pe = NULL;
1043 1583
1044 if (unlikely(bio_empty_barrier(bio))) { 1584 if (unlikely(bio_empty_barrier(bio))) {
1045 bio->bi_bdev = s->store->cow->bdev; 1585 bio->bi_bdev = s->cow->bdev;
1046 return DM_MAPIO_REMAPPED; 1586 return DM_MAPIO_REMAPPED;
1047 } 1587 }
1048 1588
@@ -1063,7 +1603,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio,
1063 } 1603 }
1064 1604
1065 /* If the block is already remapped - use that, else remap it */ 1605 /* If the block is already remapped - use that, else remap it */
1066 e = lookup_exception(&s->complete, chunk); 1606 e = dm_lookup_exception(&s->complete, chunk);
1067 if (e) { 1607 if (e) {
1068 remap_exception(s, e, bio, chunk); 1608 remap_exception(s, e, bio, chunk);
1069 goto out_unlock; 1609 goto out_unlock;
@@ -1087,7 +1627,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio,
1087 goto out_unlock; 1627 goto out_unlock;
1088 } 1628 }
1089 1629
1090 e = lookup_exception(&s->complete, chunk); 1630 e = dm_lookup_exception(&s->complete, chunk);
1091 if (e) { 1631 if (e) {
1092 free_pending_exception(pe); 1632 free_pending_exception(pe);
1093 remap_exception(s, e, bio, chunk); 1633 remap_exception(s, e, bio, chunk);
@@ -1125,6 +1665,78 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio,
1125 return r; 1665 return r;
1126} 1666}
1127 1667
1668/*
1669 * A snapshot-merge target behaves like a combination of a snapshot
1670 * target and a snapshot-origin target. It only generates new
1671 * exceptions in other snapshots and not in the one that is being
1672 * merged.
1673 *
1674 * For each chunk, if there is an existing exception, it is used to
1675 * redirect I/O to the cow device. Otherwise I/O is sent to the origin,
1676 * which in turn might generate exceptions in other snapshots.
1677 * If merging is currently taking place on the chunk in question, the
1678 * I/O is deferred by adding it to s->bios_queued_during_merge.
1679 */
1680static int snapshot_merge_map(struct dm_target *ti, struct bio *bio,
1681 union map_info *map_context)
1682{
1683 struct dm_exception *e;
1684 struct dm_snapshot *s = ti->private;
1685 int r = DM_MAPIO_REMAPPED;
1686 chunk_t chunk;
1687
1688 if (unlikely(bio_empty_barrier(bio))) {
1689 if (!map_context->flush_request)
1690 bio->bi_bdev = s->origin->bdev;
1691 else
1692 bio->bi_bdev = s->cow->bdev;
1693 map_context->ptr = NULL;
1694 return DM_MAPIO_REMAPPED;
1695 }
1696
1697 chunk = sector_to_chunk(s->store, bio->bi_sector);
1698
1699 down_write(&s->lock);
1700
1701 /* Full merging snapshots are redirected to the origin */
1702 if (!s->valid)
1703 goto redirect_to_origin;
1704
1705 /* If the block is already remapped - use that */
1706 e = dm_lookup_exception(&s->complete, chunk);
1707 if (e) {
1708 /* Queue writes overlapping with chunks being merged */
1709 if (bio_rw(bio) == WRITE &&
1710 chunk >= s->first_merging_chunk &&
1711 chunk < (s->first_merging_chunk +
1712 s->num_merging_chunks)) {
1713 bio->bi_bdev = s->origin->bdev;
1714 bio_list_add(&s->bios_queued_during_merge, bio);
1715 r = DM_MAPIO_SUBMITTED;
1716 goto out_unlock;
1717 }
1718
1719 remap_exception(s, e, bio, chunk);
1720
1721 if (bio_rw(bio) == WRITE)
1722 map_context->ptr = track_chunk(s, chunk);
1723 goto out_unlock;
1724 }
1725
1726redirect_to_origin:
1727 bio->bi_bdev = s->origin->bdev;
1728
1729 if (bio_rw(bio) == WRITE) {
1730 up_write(&s->lock);
1731 return do_origin(s->origin, bio);
1732 }
1733
1734out_unlock:
1735 up_write(&s->lock);
1736
1737 return r;
1738}
1739
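The map routine above reduces to a small decision table: chunks with no exception (and fully merged or invalid snapshots) go straight to the origin, remapped chunks go to the COW device, except that writes landing in the window currently being merged are parked on bios_queued_during_merge until the copy completes. A toy restatement of that choice in user-space C, with booleans standing in for bios and chunk ranges; names are illustrative:

#include <stdio.h>
#include <stdbool.h>

enum action { MAP_TO_COW, MAP_TO_ORIGIN, QUEUE_UNTIL_MERGED };

static enum action merge_map_decision(bool remapped, bool is_write,
                                      bool overlaps_merging_area)
{
    if (!remapped)
        return MAP_TO_ORIGIN;           /* may trigger exceptions elsewhere */
    if (is_write && overlaps_merging_area)
        return QUEUE_UNTIL_MERGED;      /* bios_queued_during_merge */
    return MAP_TO_COW;
}

int main(void)
{
    printf("%d %d %d\n",
           merge_map_decision(false, true, false),  /* 1: origin */
           merge_map_decision(true, false, true),   /* 0: cow    */
           merge_map_decision(true, true, true));   /* 2: queued */
    return 0;
}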
1128static int snapshot_end_io(struct dm_target *ti, struct bio *bio, 1740static int snapshot_end_io(struct dm_target *ti, struct bio *bio,
1129 int error, union map_info *map_context) 1741 int error, union map_info *map_context)
1130{ 1742{
@@ -1137,40 +1749,135 @@ static int snapshot_end_io(struct dm_target *ti, struct bio *bio,
1137 return 0; 1749 return 0;
1138} 1750}
1139 1751
1752static void snapshot_merge_presuspend(struct dm_target *ti)
1753{
1754 struct dm_snapshot *s = ti->private;
1755
1756 stop_merge(s);
1757}
1758
1759static void snapshot_postsuspend(struct dm_target *ti)
1760{
1761 struct dm_snapshot *s = ti->private;
1762
1763 down_write(&s->lock);
1764 s->suspended = 1;
1765 up_write(&s->lock);
1766}
1767
1768static int snapshot_preresume(struct dm_target *ti)
1769{
1770 int r = 0;
1771 struct dm_snapshot *s = ti->private;
1772 struct dm_snapshot *snap_src = NULL, *snap_dest = NULL;
1773
1774 down_read(&_origins_lock);
1775 (void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL);
1776 if (snap_src && snap_dest) {
1777 down_read(&snap_src->lock);
1778 if (s == snap_src) {
1779 DMERR("Unable to resume snapshot source until "
1780 "handover completes.");
1781 r = -EINVAL;
1782 } else if (!snap_src->suspended) {
1783 DMERR("Unable to perform snapshot handover until "
1784 "source is suspended.");
1785 r = -EINVAL;
1786 }
1787 up_read(&snap_src->lock);
1788 }
1789 up_read(&_origins_lock);
1790
1791 return r;
1792}
1793
1140static void snapshot_resume(struct dm_target *ti) 1794static void snapshot_resume(struct dm_target *ti)
1141{ 1795{
1142 struct dm_snapshot *s = ti->private; 1796 struct dm_snapshot *s = ti->private;
1797 struct dm_snapshot *snap_src = NULL, *snap_dest = NULL;
1798
1799 down_read(&_origins_lock);
1800 (void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL);
1801 if (snap_src && snap_dest) {
1802 down_write(&snap_src->lock);
1803 down_write_nested(&snap_dest->lock, SINGLE_DEPTH_NESTING);
1804 __handover_exceptions(snap_src, snap_dest);
1805 up_write(&snap_dest->lock);
1806 up_write(&snap_src->lock);
1807 }
1808 up_read(&_origins_lock);
1809
1810 /* Now we have correct chunk size, reregister */
1811 reregister_snapshot(s);
1143 1812
1144 down_write(&s->lock); 1813 down_write(&s->lock);
1145 s->active = 1; 1814 s->active = 1;
1815 s->suspended = 0;
1146 up_write(&s->lock); 1816 up_write(&s->lock);
1147} 1817}
1148 1818
1819static sector_t get_origin_minimum_chunksize(struct block_device *bdev)
1820{
1821 sector_t min_chunksize;
1822
1823 down_read(&_origins_lock);
1824 min_chunksize = __minimum_chunk_size(__lookup_origin(bdev));
1825 up_read(&_origins_lock);
1826
1827 return min_chunksize;
1828}
1829
1830static void snapshot_merge_resume(struct dm_target *ti)
1831{
1832 struct dm_snapshot *s = ti->private;
1833
1834 /*
1835 * Handover exceptions from existing snapshot.
1836 */
1837 snapshot_resume(ti);
1838
1839 /*
1840 * snapshot-merge acts as an origin, so set ti->split_io
1841 */
1842 ti->split_io = get_origin_minimum_chunksize(s->origin->bdev);
1843
1844 start_merge(s);
1845}
1846
1149static int snapshot_status(struct dm_target *ti, status_type_t type, 1847static int snapshot_status(struct dm_target *ti, status_type_t type,
1150 char *result, unsigned int maxlen) 1848 char *result, unsigned int maxlen)
1151{ 1849{
1152 unsigned sz = 0; 1850 unsigned sz = 0;
1153 struct dm_snapshot *snap = ti->private; 1851 struct dm_snapshot *snap = ti->private;
1154 1852
1155 down_write(&snap->lock);
1156
1157 switch (type) { 1853 switch (type) {
1158 case STATUSTYPE_INFO: 1854 case STATUSTYPE_INFO:
1855
1856 down_write(&snap->lock);
1857
1159 if (!snap->valid) 1858 if (!snap->valid)
1160 DMEMIT("Invalid"); 1859 DMEMIT("Invalid");
1860 else if (snap->merge_failed)
1861 DMEMIT("Merge failed");
1161 else { 1862 else {
1162 if (snap->store->type->fraction_full) { 1863 if (snap->store->type->usage) {
1163 sector_t numerator, denominator; 1864 sector_t total_sectors, sectors_allocated,
1164 snap->store->type->fraction_full(snap->store, 1865 metadata_sectors;
1165 &numerator, 1866 snap->store->type->usage(snap->store,
1166 &denominator); 1867 &total_sectors,
1167 DMEMIT("%llu/%llu", 1868 &sectors_allocated,
1168 (unsigned long long)numerator, 1869 &metadata_sectors);
1169 (unsigned long long)denominator); 1870 DMEMIT("%llu/%llu %llu",
1871 (unsigned long long)sectors_allocated,
1872 (unsigned long long)total_sectors,
1873 (unsigned long long)metadata_sectors);
1170 } 1874 }
1171 else 1875 else
1172 DMEMIT("Unknown"); 1876 DMEMIT("Unknown");
1173 } 1877 }
1878
1879 up_write(&snap->lock);
1880
1174 break; 1881 break;
1175 1882
1176 case STATUSTYPE_TABLE: 1883 case STATUSTYPE_TABLE:
@@ -1179,14 +1886,12 @@ static int snapshot_status(struct dm_target *ti, status_type_t type,
1179 * to make private copies if the output is to 1886 * to make private copies if the output is to
1180 * make sense. 1887 * make sense.
1181 */ 1888 */
1182 DMEMIT("%s", snap->origin->name); 1889 DMEMIT("%s %s", snap->origin->name, snap->cow->name);
1183 snap->store->type->status(snap->store, type, result + sz, 1890 snap->store->type->status(snap->store, type, result + sz,
1184 maxlen - sz); 1891 maxlen - sz);
1185 break; 1892 break;
1186 } 1893 }
1187 1894
1188 up_write(&snap->lock);
1189
1190 return 0; 1895 return 0;
1191} 1896}
1192 1897
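With this change the STATUSTYPE_INFO line moves from the old numerator/denominator pair to three figures in the form <sectors_allocated>/<total_sectors> <metadata_sectors>, so userspace can see metadata overhead separately (a failed merge now reports "Merge failed" instead). A tiny sketch of what the new line looks like, with invented numbers:

#include <stdio.h>

int main(void)
{
    /* Illustrative: a 1 GiB COW device with 12 MiB of data and
     * 40 KiB of metadata allocated (all values in 512-byte sectors).
     */
    unsigned long long total_sectors = 2097152;
    unsigned long long sectors_allocated = 24576;
    unsigned long long metadata_sectors = 80;

    /* New STATUSTYPE_INFO layout: <allocated>/<total> <metadata> */
    printf("%llu/%llu %llu\n",
           sectors_allocated, total_sectors, metadata_sectors);
    return 0;
}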
@@ -1202,17 +1907,36 @@ static int snapshot_iterate_devices(struct dm_target *ti,
1202/*----------------------------------------------------------------- 1907/*-----------------------------------------------------------------
1203 * Origin methods 1908 * Origin methods
1204 *---------------------------------------------------------------*/ 1909 *---------------------------------------------------------------*/
1205static int __origin_write(struct list_head *snapshots, struct bio *bio) 1910
1911/*
1912 * If no exceptions need creating, DM_MAPIO_REMAPPED is returned and any
1913 * supplied bio was ignored. The caller may submit it immediately.
1914 * (No remapping actually occurs as the origin is always a direct linear
1915 * map.)
1916 *
1917 * If further exceptions are required, DM_MAPIO_SUBMITTED is returned
1918 * and any supplied bio is added to a list to be submitted once all
1919 * the necessary exceptions exist.
1920 */
1921static int __origin_write(struct list_head *snapshots, sector_t sector,
1922 struct bio *bio)
1206{ 1923{
1207 int r = DM_MAPIO_REMAPPED, first = 0; 1924 int r = DM_MAPIO_REMAPPED;
1208 struct dm_snapshot *snap; 1925 struct dm_snapshot *snap;
1209 struct dm_snap_exception *e; 1926 struct dm_exception *e;
1210 struct dm_snap_pending_exception *pe, *next_pe, *primary_pe = NULL; 1927 struct dm_snap_pending_exception *pe;
1928 struct dm_snap_pending_exception *pe_to_start_now = NULL;
1929 struct dm_snap_pending_exception *pe_to_start_last = NULL;
1211 chunk_t chunk; 1930 chunk_t chunk;
1212 LIST_HEAD(pe_queue);
1213 1931
1214 /* Do all the snapshots on this origin */ 1932 /* Do all the snapshots on this origin */
1215 list_for_each_entry (snap, snapshots, list) { 1933 list_for_each_entry (snap, snapshots, list) {
1934 /*
1935 * Don't make new exceptions in a merging snapshot
1936 * because it has effectively been deleted
1937 */
1938 if (dm_target_is_snapshot_merge(snap->ti))
1939 continue;
1216 1940
1217 down_write(&snap->lock); 1941 down_write(&snap->lock);
1218 1942
@@ -1221,24 +1945,21 @@ static int __origin_write(struct list_head *snapshots, struct bio *bio)
1221 goto next_snapshot; 1945 goto next_snapshot;
1222 1946
1223 /* Nothing to do if writing beyond end of snapshot */ 1947 /* Nothing to do if writing beyond end of snapshot */
1224 if (bio->bi_sector >= dm_table_get_size(snap->store->ti->table)) 1948 if (sector >= dm_table_get_size(snap->ti->table))
1225 goto next_snapshot; 1949 goto next_snapshot;
1226 1950
1227 /* 1951 /*
1228 * Remember, different snapshots can have 1952 * Remember, different snapshots can have
1229 * different chunk sizes. 1953 * different chunk sizes.
1230 */ 1954 */
1231 chunk = sector_to_chunk(snap->store, bio->bi_sector); 1955 chunk = sector_to_chunk(snap->store, sector);
1232 1956
1233 /* 1957 /*
1234 * Check exception table to see if block 1958 * Check exception table to see if block
1235 * is already remapped in this snapshot 1959 * is already remapped in this snapshot
1236 * and trigger an exception if not. 1960 * and trigger an exception if not.
1237 *
1238 * ref_count is initialised to 1 so pending_complete()
1239 * won't destroy the primary_pe while we're inside this loop.
1240 */ 1961 */
1241 e = lookup_exception(&snap->complete, chunk); 1962 e = dm_lookup_exception(&snap->complete, chunk);
1242 if (e) 1963 if (e)
1243 goto next_snapshot; 1964 goto next_snapshot;
1244 1965
@@ -1253,7 +1974,7 @@ static int __origin_write(struct list_head *snapshots, struct bio *bio)
1253 goto next_snapshot; 1974 goto next_snapshot;
1254 } 1975 }
1255 1976
1256 e = lookup_exception(&snap->complete, chunk); 1977 e = dm_lookup_exception(&snap->complete, chunk);
1257 if (e) { 1978 if (e) {
1258 free_pending_exception(pe); 1979 free_pending_exception(pe);
1259 goto next_snapshot; 1980 goto next_snapshot;
@@ -1266,59 +1987,43 @@ static int __origin_write(struct list_head *snapshots, struct bio *bio)
1266 } 1987 }
1267 } 1988 }
1268 1989
1269 if (!primary_pe) { 1990 r = DM_MAPIO_SUBMITTED;
1270 /*
1271 * Either every pe here has same
1272 * primary_pe or none has one yet.
1273 */
1274 if (pe->primary_pe)
1275 primary_pe = pe->primary_pe;
1276 else {
1277 primary_pe = pe;
1278 first = 1;
1279 }
1280
1281 bio_list_add(&primary_pe->origin_bios, bio);
1282 1991
1283 r = DM_MAPIO_SUBMITTED; 1992 /*
1284 } 1993 * If an origin bio was supplied, queue it to wait for the
1994 * completion of this exception, and start this one last,
1995 * at the end of the function.
1996 */
1997 if (bio) {
1998 bio_list_add(&pe->origin_bios, bio);
1999 bio = NULL;
1285 2000
1286 if (!pe->primary_pe) { 2001 if (!pe->started) {
1287 pe->primary_pe = primary_pe; 2002 pe->started = 1;
1288 get_pending_exception(primary_pe); 2003 pe_to_start_last = pe;
2004 }
1289 } 2005 }
1290 2006
1291 if (!pe->started) { 2007 if (!pe->started) {
1292 pe->started = 1; 2008 pe->started = 1;
1293 list_add_tail(&pe->list, &pe_queue); 2009 pe_to_start_now = pe;
1294 } 2010 }
1295 2011
1296 next_snapshot: 2012 next_snapshot:
1297 up_write(&snap->lock); 2013 up_write(&snap->lock);
1298 }
1299 2014
1300 if (!primary_pe) 2015 if (pe_to_start_now) {
1301 return r; 2016 start_copy(pe_to_start_now);
1302 2017 pe_to_start_now = NULL;
1303 /* 2018 }
1304 * If this is the first time we're processing this chunk and
1305 * ref_count is now 1 it means all the pending exceptions
1306 * got completed while we were in the loop above, so it falls to
1307 * us here to remove the primary_pe and submit any origin_bios.
1308 */
1309
1310 if (first && atomic_dec_and_test(&primary_pe->ref_count)) {
1311 flush_bios(bio_list_get(&primary_pe->origin_bios));
1312 free_pending_exception(primary_pe);
1313 /* If we got here, pe_queue is necessarily empty. */
1314 return r;
1315 } 2019 }
1316 2020
1317 /* 2021 /*
1318 * Now that we have a complete pe list we can start the copying. 2022 * Submit the exception against which the bio is queued last,
2023 * to give the other exceptions a head start.
1319 */ 2024 */
1320 list_for_each_entry_safe(pe, next_pe, &pe_queue, list) 2025 if (pe_to_start_last)
1321 start_copy(pe); 2026 start_copy(pe_to_start_last);
1322 2027
1323 return r; 2028 return r;
1324} 2029}
@@ -1334,13 +2039,48 @@ static int do_origin(struct dm_dev *origin, struct bio *bio)
1334 down_read(&_origins_lock); 2039 down_read(&_origins_lock);
1335 o = __lookup_origin(origin->bdev); 2040 o = __lookup_origin(origin->bdev);
1336 if (o) 2041 if (o)
1337 r = __origin_write(&o->snapshots, bio); 2042 r = __origin_write(&o->snapshots, bio->bi_sector, bio);
1338 up_read(&_origins_lock); 2043 up_read(&_origins_lock);
1339 2044
1340 return r; 2045 return r;
1341} 2046}
1342 2047
1343/* 2048/*
2049 * Trigger exceptions in all non-merging snapshots.
2050 *
2051 * The chunk size of the merging snapshot may be larger than the chunk
2052 * size of some other snapshot so we may need to reallocate multiple
2053 * chunks in other snapshots.
2054 *
2055 * We scan all the overlapping exceptions in the other snapshots.
2056 * Returns 1 if anything was reallocated and must be waited for,
2057 * otherwise returns 0.
2058 *
2059 * size must be a multiple of merging_snap's chunk_size.
2060 */
2061static int origin_write_extent(struct dm_snapshot *merging_snap,
2062 sector_t sector, unsigned size)
2063{
2064 int must_wait = 0;
2065 sector_t n;
2066 struct origin *o;
2067
2068 /*
2069 * The origin's __minimum_chunk_size() got stored in split_io
2070 * by snapshot_merge_resume().
2071 */
2072 down_read(&_origins_lock);
2073 o = __lookup_origin(merging_snap->origin->bdev);
2074 for (n = 0; n < size; n += merging_snap->ti->split_io)
2075 if (__origin_write(&o->snapshots, sector + n, NULL) ==
2076 DM_MAPIO_SUBMITTED)
2077 must_wait = 1;
2078 up_read(&_origins_lock);
2079
2080 return must_wait;
2081}
2082
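origin_write_extent() above walks the merged extent in steps of the origin's smallest snapshot chunk size, which snapshot_merge_resume() cached in ti->split_io, because a snapshot with smaller chunks may need a separate exception for each of its chunks inside a single merging chunk. A back-of-the-envelope illustration with made-up sizes:

#include <stdio.h>

int main(void)
{
    /* Hypothetical sizes, in sectors. */
    unsigned long long extent_start = 1024;     /* start of the merged area     */
    unsigned extent_size = 64;                  /* merging snapshot's chunk     */
    unsigned step = 16;                         /* smallest chunk among the
                                                 * origin's other snapshots     */
    unsigned long long n;

    /* Each 'step'-sized piece may need its own exception in a snapshot
     * that uses the smaller chunk size.
     */
    for (n = 0; n < extent_size; n += step)
        printf("would reallocate chunk covering sector %llu\n",
               extent_start + n);
    return 0;
}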
2083/*
1344 * Origin: maps a linear range of a device, with hooks for snapshotting. 2084 * Origin: maps a linear range of a device, with hooks for snapshotting.
1345 */ 2085 */
1346 2086
@@ -1359,8 +2099,7 @@ static int origin_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1359 return -EINVAL; 2099 return -EINVAL;
1360 } 2100 }
1361 2101
1362 r = dm_get_device(ti, argv[0], 0, ti->len, 2102 r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &dev);
1363 dm_table_get_mode(ti->table), &dev);
1364 if (r) { 2103 if (r) {
1365 ti->error = "Cannot get target device"; 2104 ti->error = "Cannot get target device";
1366 return r; 2105 return r;
@@ -1391,8 +2130,6 @@ static int origin_map(struct dm_target *ti, struct bio *bio,
1391 return (bio_rw(bio) == WRITE) ? do_origin(dev, bio) : DM_MAPIO_REMAPPED; 2130 return (bio_rw(bio) == WRITE) ? do_origin(dev, bio) : DM_MAPIO_REMAPPED;
1392} 2131}
1393 2132
1394#define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r))
1395
1396/* 2133/*
1397 * Set the target "split_io" field to the minimum of all the snapshots' 2134 * Set the target "split_io" field to the minimum of all the snapshots'
1398 * chunk sizes. 2135 * chunk sizes.
@@ -1400,19 +2137,8 @@ static int origin_map(struct dm_target *ti, struct bio *bio,
1400static void origin_resume(struct dm_target *ti) 2137static void origin_resume(struct dm_target *ti)
1401{ 2138{
1402 struct dm_dev *dev = ti->private; 2139 struct dm_dev *dev = ti->private;
1403 struct dm_snapshot *snap;
1404 struct origin *o;
1405 unsigned chunk_size = 0;
1406
1407 down_read(&_origins_lock);
1408 o = __lookup_origin(dev->bdev);
1409 if (o)
1410 list_for_each_entry (snap, &o->snapshots, list)
1411 chunk_size = min_not_zero(chunk_size,
1412 snap->store->chunk_size);
1413 up_read(&_origins_lock);
1414 2140
1415 ti->split_io = chunk_size; 2141 ti->split_io = get_origin_minimum_chunksize(dev->bdev);
1416} 2142}
1417 2143
1418static int origin_status(struct dm_target *ti, status_type_t type, char *result, 2144static int origin_status(struct dm_target *ti, status_type_t type, char *result,
@@ -1455,17 +2181,35 @@ static struct target_type origin_target = {
1455 2181
1456static struct target_type snapshot_target = { 2182static struct target_type snapshot_target = {
1457 .name = "snapshot", 2183 .name = "snapshot",
1458 .version = {1, 7, 0}, 2184 .version = {1, 9, 0},
1459 .module = THIS_MODULE, 2185 .module = THIS_MODULE,
1460 .ctr = snapshot_ctr, 2186 .ctr = snapshot_ctr,
1461 .dtr = snapshot_dtr, 2187 .dtr = snapshot_dtr,
1462 .map = snapshot_map, 2188 .map = snapshot_map,
1463 .end_io = snapshot_end_io, 2189 .end_io = snapshot_end_io,
2190 .postsuspend = snapshot_postsuspend,
2191 .preresume = snapshot_preresume,
1464 .resume = snapshot_resume, 2192 .resume = snapshot_resume,
1465 .status = snapshot_status, 2193 .status = snapshot_status,
1466 .iterate_devices = snapshot_iterate_devices, 2194 .iterate_devices = snapshot_iterate_devices,
1467}; 2195};
1468 2196
2197static struct target_type merge_target = {
2198 .name = dm_snapshot_merge_target_name,
2199 .version = {1, 0, 0},
2200 .module = THIS_MODULE,
2201 .ctr = snapshot_ctr,
2202 .dtr = snapshot_dtr,
2203 .map = snapshot_merge_map,
2204 .end_io = snapshot_end_io,
2205 .presuspend = snapshot_merge_presuspend,
2206 .postsuspend = snapshot_postsuspend,
2207 .preresume = snapshot_preresume,
2208 .resume = snapshot_merge_resume,
2209 .status = snapshot_status,
2210 .iterate_devices = snapshot_iterate_devices,
2211};
2212
1469static int __init dm_snapshot_init(void) 2213static int __init dm_snapshot_init(void)
1470{ 2214{
1471 int r; 2215 int r;
@@ -1477,7 +2221,7 @@ static int __init dm_snapshot_init(void)
1477 } 2221 }
1478 2222
1479 r = dm_register_target(&snapshot_target); 2223 r = dm_register_target(&snapshot_target);
1480 if (r) { 2224 if (r < 0) {
1481 DMERR("snapshot target register failed %d", r); 2225 DMERR("snapshot target register failed %d", r);
1482 goto bad_register_snapshot_target; 2226 goto bad_register_snapshot_target;
1483 } 2227 }
@@ -1485,34 +2229,40 @@ static int __init dm_snapshot_init(void)
1485 r = dm_register_target(&origin_target); 2229 r = dm_register_target(&origin_target);
1486 if (r < 0) { 2230 if (r < 0) {
1487 DMERR("Origin target register failed %d", r); 2231 DMERR("Origin target register failed %d", r);
1488 goto bad1; 2232 goto bad_register_origin_target;
2233 }
2234
2235 r = dm_register_target(&merge_target);
2236 if (r < 0) {
2237 DMERR("Merge target register failed %d", r);
2238 goto bad_register_merge_target;
1489 } 2239 }
1490 2240
1491 r = init_origin_hash(); 2241 r = init_origin_hash();
1492 if (r) { 2242 if (r) {
1493 DMERR("init_origin_hash failed."); 2243 DMERR("init_origin_hash failed.");
1494 goto bad2; 2244 goto bad_origin_hash;
1495 } 2245 }
1496 2246
1497 exception_cache = KMEM_CACHE(dm_snap_exception, 0); 2247 exception_cache = KMEM_CACHE(dm_exception, 0);
1498 if (!exception_cache) { 2248 if (!exception_cache) {
1499 DMERR("Couldn't create exception cache."); 2249 DMERR("Couldn't create exception cache.");
1500 r = -ENOMEM; 2250 r = -ENOMEM;
1501 goto bad3; 2251 goto bad_exception_cache;
1502 } 2252 }
1503 2253
1504 pending_cache = KMEM_CACHE(dm_snap_pending_exception, 0); 2254 pending_cache = KMEM_CACHE(dm_snap_pending_exception, 0);
1505 if (!pending_cache) { 2255 if (!pending_cache) {
1506 DMERR("Couldn't create pending cache."); 2256 DMERR("Couldn't create pending cache.");
1507 r = -ENOMEM; 2257 r = -ENOMEM;
1508 goto bad4; 2258 goto bad_pending_cache;
1509 } 2259 }
1510 2260
1511 tracked_chunk_cache = KMEM_CACHE(dm_snap_tracked_chunk, 0); 2261 tracked_chunk_cache = KMEM_CACHE(dm_snap_tracked_chunk, 0);
1512 if (!tracked_chunk_cache) { 2262 if (!tracked_chunk_cache) {
1513 DMERR("Couldn't create cache to track chunks in use."); 2263 DMERR("Couldn't create cache to track chunks in use.");
1514 r = -ENOMEM; 2264 r = -ENOMEM;
1515 goto bad5; 2265 goto bad_tracked_chunk_cache;
1516 } 2266 }
1517 2267
1518 ksnapd = create_singlethread_workqueue("ksnapd"); 2268 ksnapd = create_singlethread_workqueue("ksnapd");
@@ -1526,19 +2276,21 @@ static int __init dm_snapshot_init(void)
1526 2276
1527bad_pending_pool: 2277bad_pending_pool:
1528 kmem_cache_destroy(tracked_chunk_cache); 2278 kmem_cache_destroy(tracked_chunk_cache);
1529bad5: 2279bad_tracked_chunk_cache:
1530 kmem_cache_destroy(pending_cache); 2280 kmem_cache_destroy(pending_cache);
1531bad4: 2281bad_pending_cache:
1532 kmem_cache_destroy(exception_cache); 2282 kmem_cache_destroy(exception_cache);
1533bad3: 2283bad_exception_cache:
1534 exit_origin_hash(); 2284 exit_origin_hash();
1535bad2: 2285bad_origin_hash:
2286 dm_unregister_target(&merge_target);
2287bad_register_merge_target:
1536 dm_unregister_target(&origin_target); 2288 dm_unregister_target(&origin_target);
1537bad1: 2289bad_register_origin_target:
1538 dm_unregister_target(&snapshot_target); 2290 dm_unregister_target(&snapshot_target);
1539
1540bad_register_snapshot_target: 2291bad_register_snapshot_target:
1541 dm_exception_store_exit(); 2292 dm_exception_store_exit();
2293
1542 return r; 2294 return r;
1543} 2295}
1544 2296
@@ -1548,6 +2300,7 @@ static void __exit dm_snapshot_exit(void)
1548 2300
1549 dm_unregister_target(&snapshot_target); 2301 dm_unregister_target(&snapshot_target);
1550 dm_unregister_target(&origin_target); 2302 dm_unregister_target(&origin_target);
2303 dm_unregister_target(&merge_target);
1551 2304
1552 exit_origin_hash(); 2305 exit_origin_hash();
1553 kmem_cache_destroy(pending_cache); 2306 kmem_cache_destroy(pending_cache);
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index e0efc1adcaff..e610725db766 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -80,8 +80,7 @@ static int get_stripe(struct dm_target *ti, struct stripe_c *sc,
80 if (sscanf(argv[1], "%llu", &start) != 1) 80 if (sscanf(argv[1], "%llu", &start) != 1)
81 return -EINVAL; 81 return -EINVAL;
82 82
83 if (dm_get_device(ti, argv[0], start, sc->stripe_width, 83 if (dm_get_device(ti, argv[0], dm_table_get_mode(ti->table),
84 dm_table_get_mode(ti->table),
85 &sc->stripe[stripe].dev)) 84 &sc->stripe[stripe].dev))
86 return -ENXIO; 85 return -ENXIO;
87 86
@@ -110,7 +109,7 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv)
110 } 109 }
111 110
112 stripes = simple_strtoul(argv[0], &end, 10); 111 stripes = simple_strtoul(argv[0], &end, 10);
113 if (*end) { 112 if (!stripes || *end) {
114 ti->error = "Invalid stripe count"; 113 ti->error = "Invalid stripe count";
115 return -EINVAL; 114 return -EINVAL;
116 } 115 }
diff --git a/drivers/md/dm-sysfs.c b/drivers/md/dm-sysfs.c
index 4b045903a4e2..84d2b91e4efb 100644
--- a/drivers/md/dm-sysfs.c
+++ b/drivers/md/dm-sysfs.c
@@ -59,7 +59,7 @@ static ssize_t dm_attr_uuid_show(struct mapped_device *md, char *buf)
59 59
60static ssize_t dm_attr_suspended_show(struct mapped_device *md, char *buf) 60static ssize_t dm_attr_suspended_show(struct mapped_device *md, char *buf)
61{ 61{
62 sprintf(buf, "%d\n", dm_suspended(md)); 62 sprintf(buf, "%d\n", dm_suspended_md(md));
63 63
64 return strlen(buf); 64 return strlen(buf);
65} 65}
@@ -75,7 +75,7 @@ static struct attribute *dm_attrs[] = {
75 NULL, 75 NULL,
76}; 76};
77 77
78static struct sysfs_ops dm_sysfs_ops = { 78static const struct sysfs_ops dm_sysfs_ops = {
79 .show = dm_attr_show, 79 .show = dm_attr_show,
80}; 80};
81 81
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 1a6cb3c7822e..9924ea23032d 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -12,6 +12,7 @@
12#include <linux/blkdev.h> 12#include <linux/blkdev.h>
13#include <linux/namei.h> 13#include <linux/namei.h>
14#include <linux/ctype.h> 14#include <linux/ctype.h>
15#include <linux/string.h>
15#include <linux/slab.h> 16#include <linux/slab.h>
16#include <linux/interrupt.h> 17#include <linux/interrupt.h>
17#include <linux/mutex.h> 18#include <linux/mutex.h>
@@ -237,6 +238,9 @@ void dm_table_destroy(struct dm_table *t)
237{ 238{
238 unsigned int i; 239 unsigned int i;
239 240
241 if (!t)
242 return;
243
240 while (atomic_read(&t->holders)) 244 while (atomic_read(&t->holders))
241 msleep(1); 245 msleep(1);
242 smp_mb(); 246 smp_mb();
@@ -425,8 +429,7 @@ static int upgrade_mode(struct dm_dev_internal *dd, fmode_t new_mode,
425 * it's already present. 429 * it's already present.
426 */ 430 */
427static int __table_get_device(struct dm_table *t, struct dm_target *ti, 431static int __table_get_device(struct dm_table *t, struct dm_target *ti,
428 const char *path, sector_t start, sector_t len, 432 const char *path, fmode_t mode, struct dm_dev **result)
429 fmode_t mode, struct dm_dev **result)
430{ 433{
431 int r; 434 int r;
432 dev_t uninitialized_var(dev); 435 dev_t uninitialized_var(dev);
@@ -499,16 +502,15 @@ int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev,
499 return 0; 502 return 0;
500 } 503 }
501 504
502 if (blk_stack_limits(limits, &q->limits, start << 9) < 0) 505 if (bdev_stack_limits(limits, bdev, start) < 0)
503 DMWARN("%s: target device %s is misaligned: " 506 DMWARN("%s: adding target device %s caused an alignment inconsistency: "
504 "physical_block_size=%u, logical_block_size=%u, " 507 "physical_block_size=%u, logical_block_size=%u, "
505 "alignment_offset=%u, start=%llu", 508 "alignment_offset=%u, start=%llu",
506 dm_device_name(ti->table->md), bdevname(bdev, b), 509 dm_device_name(ti->table->md), bdevname(bdev, b),
507 q->limits.physical_block_size, 510 q->limits.physical_block_size,
508 q->limits.logical_block_size, 511 q->limits.logical_block_size,
509 q->limits.alignment_offset, 512 q->limits.alignment_offset,
510 (unsigned long long) start << 9); 513 (unsigned long long) start << SECTOR_SHIFT);
511
512 514
513 /* 515 /*
514 * Check if merge fn is supported. 516 * Check if merge fn is supported.
@@ -524,11 +526,10 @@ int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev,
524} 526}
525EXPORT_SYMBOL_GPL(dm_set_device_limits); 527EXPORT_SYMBOL_GPL(dm_set_device_limits);
526 528
527int dm_get_device(struct dm_target *ti, const char *path, sector_t start, 529int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode,
528 sector_t len, fmode_t mode, struct dm_dev **result) 530 struct dm_dev **result)
529{ 531{
530 return __table_get_device(ti->table, ti, path, 532 return __table_get_device(ti->table, ti, path, mode, result);
531 start, len, mode, result);
532} 533}
533 534
534 535
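The two unused sector arguments are gone from dm_get_device(), so a caller now passes only the path, the table's mode and the result pointer. A minimal sketch of a target constructor fragment in the new style (the helper name is illustrative, not from the tree):

static int example_get_dev(struct dm_target *ti, const char *path,
			   struct dm_dev **dev)
{
	/* start/len parameters dropped; the mode comes from the live table */
	if (dm_get_device(ti, path, dm_table_get_mode(ti->table), dev))
		return -ENXIO;

	return 0;
}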
@@ -600,11 +601,8 @@ int dm_split_args(int *argc, char ***argvp, char *input)
600 return -ENOMEM; 601 return -ENOMEM;
601 602
602 while (1) { 603 while (1) {
603 start = end;
604
605 /* Skip whitespace */ 604 /* Skip whitespace */
606 while (*start && isspace(*start)) 605 start = skip_spaces(end);
607 start++;
608 606
609 if (!*start) 607 if (!*start)
610 break; /* success, we hit the end */ 608 break; /* success, we hit the end */
@@ -1025,9 +1023,9 @@ combine_limits:
1025 * for the table. 1023 * for the table.
1026 */ 1024 */
1027 if (blk_stack_limits(limits, &ti_limits, 0) < 0) 1025 if (blk_stack_limits(limits, &ti_limits, 0) < 0)
1028 DMWARN("%s: target device " 1026 DMWARN("%s: adding target device "
1029 "(start sect %llu len %llu) " 1027 "(start sect %llu len %llu) "
1030 "is misaligned", 1028 "caused an alignment inconsistency",
1031 dm_device_name(table->md), 1029 dm_device_name(table->md),
1032 (unsigned long long) ti->begin, 1030 (unsigned long long) ti->begin,
1033 (unsigned long long) ti->len); 1031 (unsigned long long) ti->len);
@@ -1079,15 +1077,6 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
1079 struct queue_limits *limits) 1077 struct queue_limits *limits)
1080{ 1078{
1081 /* 1079 /*
1082 * Each target device in the table has a data area that should normally
1083 * be aligned such that the DM device's alignment_offset is 0.
1084 * FIXME: Propagate alignment_offsets up the stack and warn of
1085 * sub-optimal or inconsistent settings.
1086 */
1087 limits->alignment_offset = 0;
1088 limits->misaligned = 0;
1089
1090 /*
1091 * Copy table's limits to the DM device's request_queue 1080 * Copy table's limits to the DM device's request_queue
1092 */ 1081 */
1093 q->limits = *limits; 1082 q->limits = *limits;
@@ -1240,8 +1229,6 @@ void dm_table_unplug_all(struct dm_table *t)
1240 1229
1241struct mapped_device *dm_table_get_md(struct dm_table *t) 1230struct mapped_device *dm_table_get_md(struct dm_table *t)
1242{ 1231{
1243 dm_get(t->md);
1244
1245 return t->md; 1232 return t->md;
1246} 1233}
1247 1234
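With the dm_get() call removed, dm_table_get_md() no longer hands out a reference, so callers must not dm_put() what they never took. A minimal sketch of the new convention (the helper is illustrative; the dm_suspended()/dm_noflush_suspending() rewrite later in this patch follows the same pattern):

static int example_target_suspended(struct dm_target *ti)
{
	struct mapped_device *md = dm_table_get_md(ti->table);

	return dm_suspended_md(md);	/* no dm_put(md) needed any more */
}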
diff --git a/drivers/md/dm-target.c b/drivers/md/dm-target.c
index 04feccf2a997..11dea11dc0b6 100644
--- a/drivers/md/dm-target.c
+++ b/drivers/md/dm-target.c
@@ -10,7 +10,6 @@
10#include <linux/init.h> 10#include <linux/init.h>
11#include <linux/kmod.h> 11#include <linux/kmod.h>
12#include <linux/bio.h> 12#include <linux/bio.h>
13#include <linux/slab.h>
14 13
15#define DM_MSG_PREFIX "target" 14#define DM_MSG_PREFIX "target"
16 15
diff --git a/drivers/md/dm-uevent.c b/drivers/md/dm-uevent.c
index 6f65883aef12..6b1e3b61b25e 100644
--- a/drivers/md/dm-uevent.c
+++ b/drivers/md/dm-uevent.c
@@ -139,14 +139,13 @@ void dm_send_uevents(struct list_head *events, struct kobject *kobj)
139 list_del_init(&event->elist); 139 list_del_init(&event->elist);
140 140
141 /* 141 /*
142 * Need to call dm_copy_name_and_uuid from here for now. 142 * When a device is being removed this copy fails and we
143 * Context of previous var adds and locking used for 143 * discard these unsent events.
144 * hash_cell not compatable.
145 */ 144 */
146 if (dm_copy_name_and_uuid(event->md, event->name, 145 if (dm_copy_name_and_uuid(event->md, event->name,
147 event->uuid)) { 146 event->uuid)) {
148 DMERR("%s: dm_copy_name_and_uuid() failed", 147 DMINFO("%s: skipping sending uevent for lost device",
149 __func__); 148 __func__);
150 goto uevent_free; 149 goto uevent_free;
151 } 150 }
152 151
@@ -188,7 +187,7 @@ void dm_path_uevent(enum dm_uevent_type event_type, struct dm_target *ti,
188 187
189 if (event_type >= ARRAY_SIZE(_dm_uevent_type_names)) { 188 if (event_type >= ARRAY_SIZE(_dm_uevent_type_names)) {
190 DMERR("%s: Invalid event_type %d", __func__, event_type); 189 DMERR("%s: Invalid event_type %d", __func__, event_type);
191 goto out; 190 return;
192 } 191 }
193 192
194 event = dm_build_path_uevent(md, ti, 193 event = dm_build_path_uevent(md, ti,
@@ -196,12 +195,9 @@ void dm_path_uevent(enum dm_uevent_type event_type, struct dm_target *ti,
196 _dm_uevent_type_names[event_type].name, 195 _dm_uevent_type_names[event_type].name,
197 path, nr_valid_paths); 196 path, nr_valid_paths);
198 if (IS_ERR(event)) 197 if (IS_ERR(event))
199 goto out; 198 return;
200 199
201 dm_uevent_add(md, &event->elist); 200 dm_uevent_add(md, &event->elist);
202
203out:
204 dm_put(md);
205} 201}
206EXPORT_SYMBOL_GPL(dm_path_uevent); 202EXPORT_SYMBOL_GPL(dm_path_uevent);
207 203
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 724efc63904d..d21e1284604f 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -143,9 +143,19 @@ struct mapped_device {
143 int barrier_error; 143 int barrier_error;
144 144
145 /* 145 /*
146 * Protect barrier_error from concurrent endio processing
147 * in request-based dm.
148 */
149 spinlock_t barrier_error_lock;
150
151 /*
146 * Processing queue (flush/barriers) 152 * Processing queue (flush/barriers)
147 */ 153 */
148 struct workqueue_struct *wq; 154 struct workqueue_struct *wq;
155 struct work_struct barrier_work;
156
157 /* A pointer to the currently processing pre/post flush request */
158 struct request *flush_request;
149 159
150 /* 160 /*
151 * The current mapping. 161 * The current mapping.
@@ -178,9 +188,6 @@ struct mapped_device {
178 /* forced geometry settings */ 188 /* forced geometry settings */
179 struct hd_geometry geometry; 189 struct hd_geometry geometry;
180 190
181 /* marker of flush suspend for request-based dm */
182 struct request suspend_rq;
183
184 /* For saving the address of __make_request for request based dm */ 191 /* For saving the address of __make_request for request based dm */
185 make_request_fn *saved_make_request_fn; 192 make_request_fn *saved_make_request_fn;
186 193
@@ -275,6 +282,7 @@ static int (*_inits[])(void) __initdata = {
275 dm_target_init, 282 dm_target_init,
276 dm_linear_init, 283 dm_linear_init,
277 dm_stripe_init, 284 dm_stripe_init,
285 dm_io_init,
278 dm_kcopyd_init, 286 dm_kcopyd_init,
279 dm_interface_init, 287 dm_interface_init,
280}; 288};
@@ -284,6 +292,7 @@ static void (*_exits[])(void) = {
284 dm_target_exit, 292 dm_target_exit,
285 dm_linear_exit, 293 dm_linear_exit,
286 dm_stripe_exit, 294 dm_stripe_exit,
295 dm_io_exit,
287 dm_kcopyd_exit, 296 dm_kcopyd_exit,
288 dm_interface_exit, 297 dm_interface_exit,
289}; 298};
@@ -320,6 +329,11 @@ static void __exit dm_exit(void)
320/* 329/*
321 * Block device functions 330 * Block device functions
322 */ 331 */
332int dm_deleting_md(struct mapped_device *md)
333{
334 return test_bit(DMF_DELETING, &md->flags);
335}
336
323static int dm_blk_open(struct block_device *bdev, fmode_t mode) 337static int dm_blk_open(struct block_device *bdev, fmode_t mode)
324{ 338{
325 struct mapped_device *md; 339 struct mapped_device *md;
@@ -331,7 +345,7 @@ static int dm_blk_open(struct block_device *bdev, fmode_t mode)
331 goto out; 345 goto out;
332 346
333 if (test_bit(DMF_FREEING, &md->flags) || 347 if (test_bit(DMF_FREEING, &md->flags) ||
334 test_bit(DMF_DELETING, &md->flags)) { 348 dm_deleting_md(md)) {
335 md = NULL; 349 md = NULL;
336 goto out; 350 goto out;
337 } 351 }
@@ -388,7 +402,7 @@ static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,
388 unsigned int cmd, unsigned long arg) 402 unsigned int cmd, unsigned long arg)
389{ 403{
390 struct mapped_device *md = bdev->bd_disk->private_data; 404 struct mapped_device *md = bdev->bd_disk->private_data;
391 struct dm_table *map = dm_get_table(md); 405 struct dm_table *map = dm_get_live_table(md);
392 struct dm_target *tgt; 406 struct dm_target *tgt;
393 int r = -ENOTTY; 407 int r = -ENOTTY;
394 408
@@ -401,7 +415,7 @@ static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,
401 415
402 tgt = dm_table_get_target(map, 0); 416 tgt = dm_table_get_target(map, 0);
403 417
404 if (dm_suspended(md)) { 418 if (dm_suspended_md(md)) {
405 r = -EAGAIN; 419 r = -EAGAIN;
406 goto out; 420 goto out;
407 } 421 }
@@ -430,9 +444,10 @@ static void free_tio(struct mapped_device *md, struct dm_target_io *tio)
430 mempool_free(tio, md->tio_pool); 444 mempool_free(tio, md->tio_pool);
431} 445}
432 446
433static struct dm_rq_target_io *alloc_rq_tio(struct mapped_device *md) 447static struct dm_rq_target_io *alloc_rq_tio(struct mapped_device *md,
448 gfp_t gfp_mask)
434{ 449{
435 return mempool_alloc(md->tio_pool, GFP_ATOMIC); 450 return mempool_alloc(md->tio_pool, gfp_mask);
436} 451}
437 452
438static void free_rq_tio(struct dm_rq_target_io *tio) 453static void free_rq_tio(struct dm_rq_target_io *tio)
@@ -450,6 +465,12 @@ static void free_bio_info(struct dm_rq_clone_bio_info *info)
450 mempool_free(info, info->tio->md->io_pool); 465 mempool_free(info, info->tio->md->io_pool);
451} 466}
452 467
468static int md_in_flight(struct mapped_device *md)
469{
470 return atomic_read(&md->pending[READ]) +
471 atomic_read(&md->pending[WRITE]);
472}
473
453static void start_io_acct(struct dm_io *io) 474static void start_io_acct(struct dm_io *io)
454{ 475{
455 struct mapped_device *md = io->md; 476 struct mapped_device *md = io->md;
@@ -512,7 +533,7 @@ static void queue_io(struct mapped_device *md, struct bio *bio)
512 * function to access the md->map field, and make sure they call 533 * function to access the md->map field, and make sure they call
513 * dm_table_put() when finished. 534 * dm_table_put() when finished.
514 */ 535 */
515struct dm_table *dm_get_table(struct mapped_device *md) 536struct dm_table *dm_get_live_table(struct mapped_device *md)
516{ 537{
517 struct dm_table *t; 538 struct dm_table *t;
518 unsigned long flags; 539 unsigned long flags;
@@ -614,8 +635,10 @@ static void dec_pending(struct dm_io *io, int error)
614 if (!md->barrier_error && io_error != -EOPNOTSUPP) 635 if (!md->barrier_error && io_error != -EOPNOTSUPP)
615 md->barrier_error = io_error; 636 md->barrier_error = io_error;
616 end_io_acct(io); 637 end_io_acct(io);
638 free_io(md, io);
617 } else { 639 } else {
618 end_io_acct(io); 640 end_io_acct(io);
641 free_io(md, io);
619 642
620 if (io_error != DM_ENDIO_REQUEUE) { 643 if (io_error != DM_ENDIO_REQUEUE) {
621 trace_block_bio_complete(md->queue, bio); 644 trace_block_bio_complete(md->queue, bio);
@@ -623,8 +646,6 @@ static void dec_pending(struct dm_io *io, int error)
623 bio_endio(bio, io_error); 646 bio_endio(bio, io_error);
624 } 647 }
625 } 648 }
626
627 free_io(md, io);
628 } 649 }
629} 650}
630 651
@@ -716,28 +737,38 @@ static void end_clone_bio(struct bio *clone, int error)
716 blk_update_request(tio->orig, 0, nr_bytes); 737 blk_update_request(tio->orig, 0, nr_bytes);
717} 738}
718 739
740static void store_barrier_error(struct mapped_device *md, int error)
741{
742 unsigned long flags;
743
744 spin_lock_irqsave(&md->barrier_error_lock, flags);
745 /*
746 * Basically, the first error is taken, but:
747 * -EOPNOTSUPP supersedes any I/O error.
748 * Requeue request supersedes any I/O error but -EOPNOTSUPP.
749 */
750 if (!md->barrier_error || error == -EOPNOTSUPP ||
751 (md->barrier_error != -EOPNOTSUPP &&
752 error == DM_ENDIO_REQUEUE))
753 md->barrier_error = error;
754 spin_unlock_irqrestore(&md->barrier_error_lock, flags);
755}
756
719/* 757/*
720 * Don't touch any member of the md after calling this function because 758 * Don't touch any member of the md after calling this function because
721 * the md may be freed in dm_put() at the end of this function. 759 * the md may be freed in dm_put() at the end of this function.
722 * Or do dm_get() before calling this function and dm_put() later. 760 * Or do dm_get() before calling this function and dm_put() later.
723 */ 761 */
724static void rq_completed(struct mapped_device *md, int run_queue) 762static void rq_completed(struct mapped_device *md, int rw, int run_queue)
725{ 763{
726 int wakeup_waiters = 0; 764 atomic_dec(&md->pending[rw]);
727 struct request_queue *q = md->queue;
728 unsigned long flags;
729
730 spin_lock_irqsave(q->queue_lock, flags);
731 if (!queue_in_flight(q))
732 wakeup_waiters = 1;
733 spin_unlock_irqrestore(q->queue_lock, flags);
734 765
735 /* nudge anyone waiting on suspend queue */ 766 /* nudge anyone waiting on suspend queue */
736 if (wakeup_waiters) 767 if (!md_in_flight(md))
737 wake_up(&md->wait); 768 wake_up(&md->wait);
738 769
739 if (run_queue) 770 if (run_queue)
740 blk_run_queue(q); 771 blk_run_queue(md->queue);
741 772
742 /* 773 /*
743 * dm_put() must be at the end of this function. See the comment above 774 * dm_put() must be at the end of this function. See the comment above
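The precedence encoded in store_barrier_error() is easy to misread, so here is a small user-space model of the merge rule; the numeric values used for -EOPNOTSUPP and DM_ENDIO_REQUEUE are stand-ins, not taken from the kernel headers:

#include <stdio.h>

#define ERR_EOPNOTSUPP   (-95)	/* stand-in for -EOPNOTSUPP */
#define DM_ENDIO_REQUEUE 1	/* stand-in for the requeue code */

/* First error wins, except -EOPNOTSUPP beats everything and a requeue
 * request beats any plain I/O error. */
static int merge_barrier_error(int cur, int new_err)
{
	if (!cur || new_err == ERR_EOPNOTSUPP ||
	    (cur != ERR_EOPNOTSUPP && new_err == DM_ENDIO_REQUEUE))
		return new_err;
	return cur;
}

int main(void)
{
	int err = 0;

	err = merge_barrier_error(err, -5);			/* first error kept */
	err = merge_barrier_error(err, DM_ENDIO_REQUEUE);	/* requeue beats -EIO */
	err = merge_barrier_error(err, ERR_EOPNOTSUPP);		/* -EOPNOTSUPP beats all */
	printf("final barrier error: %d\n", err);
	return 0;
}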
@@ -753,6 +784,44 @@ static void free_rq_clone(struct request *clone)
753 free_rq_tio(tio); 784 free_rq_tio(tio);
754} 785}
755 786
787/*
788 * Complete the clone and the original request.
789 * Must be called without queue lock.
790 */
791static void dm_end_request(struct request *clone, int error)
792{
793 int rw = rq_data_dir(clone);
794 int run_queue = 1;
795 bool is_barrier = blk_barrier_rq(clone);
796 struct dm_rq_target_io *tio = clone->end_io_data;
797 struct mapped_device *md = tio->md;
798 struct request *rq = tio->orig;
799
800 if (blk_pc_request(rq) && !is_barrier) {
801 rq->errors = clone->errors;
802 rq->resid_len = clone->resid_len;
803
804 if (rq->sense)
805 /*
806 * We are using the sense buffer of the original
807 * request.
808 * So setting the length of the sense data is enough.
809 */
810 rq->sense_len = clone->sense_len;
811 }
812
813 free_rq_clone(clone);
814
815 if (unlikely(is_barrier)) {
816 if (unlikely(error))
817 store_barrier_error(md, error);
818 run_queue = 0;
819 } else
820 blk_end_request_all(rq, error);
821
822 rq_completed(md, rw, run_queue);
823}
824
756static void dm_unprep_request(struct request *rq) 825static void dm_unprep_request(struct request *rq)
757{ 826{
758 struct request *clone = rq->special; 827 struct request *clone = rq->special;
@@ -768,12 +837,23 @@ static void dm_unprep_request(struct request *rq)
768 */ 837 */
769void dm_requeue_unmapped_request(struct request *clone) 838void dm_requeue_unmapped_request(struct request *clone)
770{ 839{
840 int rw = rq_data_dir(clone);
771 struct dm_rq_target_io *tio = clone->end_io_data; 841 struct dm_rq_target_io *tio = clone->end_io_data;
772 struct mapped_device *md = tio->md; 842 struct mapped_device *md = tio->md;
773 struct request *rq = tio->orig; 843 struct request *rq = tio->orig;
774 struct request_queue *q = rq->q; 844 struct request_queue *q = rq->q;
775 unsigned long flags; 845 unsigned long flags;
776 846
847 if (unlikely(blk_barrier_rq(clone))) {
848 /*
849 * Barrier clones share an original request.
850 * Leave it to dm_end_request(), which handles this special
851 * case.
852 */
853 dm_end_request(clone, DM_ENDIO_REQUEUE);
854 return;
855 }
856
777 dm_unprep_request(rq); 857 dm_unprep_request(rq);
778 858
779 spin_lock_irqsave(q->queue_lock, flags); 859 spin_lock_irqsave(q->queue_lock, flags);
@@ -782,7 +862,7 @@ void dm_requeue_unmapped_request(struct request *clone)
782 blk_requeue_request(q, rq); 862 blk_requeue_request(q, rq);
783 spin_unlock_irqrestore(q->queue_lock, flags); 863 spin_unlock_irqrestore(q->queue_lock, flags);
784 864
785 rq_completed(md, 0); 865 rq_completed(md, rw, 0);
786} 866}
787EXPORT_SYMBOL_GPL(dm_requeue_unmapped_request); 867EXPORT_SYMBOL_GPL(dm_requeue_unmapped_request);
788 868
@@ -815,34 +895,28 @@ static void start_queue(struct request_queue *q)
815 spin_unlock_irqrestore(q->queue_lock, flags); 895 spin_unlock_irqrestore(q->queue_lock, flags);
816} 896}
817 897
818/* 898static void dm_done(struct request *clone, int error, bool mapped)
819 * Complete the clone and the original request.
820 * Must be called without queue lock.
821 */
822static void dm_end_request(struct request *clone, int error)
823{ 899{
900 int r = error;
824 struct dm_rq_target_io *tio = clone->end_io_data; 901 struct dm_rq_target_io *tio = clone->end_io_data;
825 struct mapped_device *md = tio->md; 902 dm_request_endio_fn rq_end_io = tio->ti->type->rq_end_io;
826 struct request *rq = tio->orig;
827 903
828 if (blk_pc_request(rq)) { 904 if (mapped && rq_end_io)
829 rq->errors = clone->errors; 905 r = rq_end_io(tio->ti, clone, error, &tio->info);
830 rq->resid_len = clone->resid_len;
831 906
832 if (rq->sense) 907 if (r <= 0)
833 /* 908 /* The target wants to complete the I/O */
834 * We are using the sense buffer of the original 909 dm_end_request(clone, r);
835 * request. 910 else if (r == DM_ENDIO_INCOMPLETE)
836 * So setting the length of the sense data is enough. 911 /* The target will handle the I/O */
837 */ 912 return;
838 rq->sense_len = clone->sense_len; 913 else if (r == DM_ENDIO_REQUEUE)
914 /* The target wants to requeue the I/O */
915 dm_requeue_unmapped_request(clone);
916 else {
917 DMWARN("unimplemented target endio return value: %d", r);
918 BUG();
839 } 919 }
840
841 free_rq_clone(clone);
842
843 blk_end_request_all(rq, error);
844
845 rq_completed(md, 1);
846} 920}
847 921
848/* 922/*
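dm_done() keeps the old dispatch on the end_io return value (<= 0 completes the request, DM_ENDIO_INCOMPLETE leaves it with the target, DM_ENDIO_REQUEUE requeues it, anything else is a bug) and now also skips the target hook when the clone was never mapped. An illustrative request-based end_io hook, not taken from the tree, that asks for a requeue on a transient error:

static int example_rq_end_io(struct dm_target *ti, struct request *clone,
			     int error, union map_info *map_context)
{
	if (error == -EBUSY)
		return DM_ENDIO_REQUEUE;	/* dm_done() requeues the original */

	return error;				/* <= 0: complete the request as-is */
}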
@@ -850,27 +924,14 @@ static void dm_end_request(struct request *clone, int error)
850 */ 924 */
851static void dm_softirq_done(struct request *rq) 925static void dm_softirq_done(struct request *rq)
852{ 926{
927 bool mapped = true;
853 struct request *clone = rq->completion_data; 928 struct request *clone = rq->completion_data;
854 struct dm_rq_target_io *tio = clone->end_io_data; 929 struct dm_rq_target_io *tio = clone->end_io_data;
855 dm_request_endio_fn rq_end_io = tio->ti->type->rq_end_io;
856 int error = tio->error;
857 930
858 if (!(rq->cmd_flags & REQ_FAILED) && rq_end_io) 931 if (rq->cmd_flags & REQ_FAILED)
859 error = rq_end_io(tio->ti, clone, error, &tio->info); 932 mapped = false;
860 933
861 if (error <= 0) 934 dm_done(clone, tio->error, mapped);
862 /* The target wants to complete the I/O */
863 dm_end_request(clone, error);
864 else if (error == DM_ENDIO_INCOMPLETE)
865 /* The target will handle the I/O */
866 return;
867 else if (error == DM_ENDIO_REQUEUE)
868 /* The target wants to requeue the I/O */
869 dm_requeue_unmapped_request(clone);
870 else {
871 DMWARN("unimplemented target endio return value: %d", error);
872 BUG();
873 }
874} 935}
875 936
876/* 937/*
@@ -882,6 +943,19 @@ static void dm_complete_request(struct request *clone, int error)
882 struct dm_rq_target_io *tio = clone->end_io_data; 943 struct dm_rq_target_io *tio = clone->end_io_data;
883 struct request *rq = tio->orig; 944 struct request *rq = tio->orig;
884 945
946 if (unlikely(blk_barrier_rq(clone))) {
947 /*
948 * Barrier clones share an original request. So can't use
949 * softirq_done with the original.
950 * Pass the clone to dm_done() directly in this special case.
951 * It is safe (even if clone->q->queue_lock is held here)
952 * because there is no I/O dispatching during the completion
953 * of barrier clone.
954 */
955 dm_done(clone, error, true);
956 return;
957 }
958
885 tio->error = error; 959 tio->error = error;
886 rq->completion_data = clone; 960 rq->completion_data = clone;
887 blk_complete_request(rq); 961 blk_complete_request(rq);
@@ -898,6 +972,17 @@ void dm_kill_unmapped_request(struct request *clone, int error)
898 struct dm_rq_target_io *tio = clone->end_io_data; 972 struct dm_rq_target_io *tio = clone->end_io_data;
899 struct request *rq = tio->orig; 973 struct request *rq = tio->orig;
900 974
975 if (unlikely(blk_barrier_rq(clone))) {
976 /*
977 * Barrier clones share an original request.
978 * Leave it to dm_end_request(), which handles this special
979 * case.
980 */
981 BUG_ON(error > 0);
982 dm_end_request(clone, error);
983 return;
984 }
985
901 rq->cmd_flags |= REQ_FAILED; 986 rq->cmd_flags |= REQ_FAILED;
902 dm_complete_request(clone, error); 987 dm_complete_request(clone, error);
903} 988}
@@ -1214,7 +1299,7 @@ static void __split_and_process_bio(struct mapped_device *md, struct bio *bio)
1214 struct clone_info ci; 1299 struct clone_info ci;
1215 int error = 0; 1300 int error = 0;
1216 1301
1217 ci.map = dm_get_table(md); 1302 ci.map = dm_get_live_table(md);
1218 if (unlikely(!ci.map)) { 1303 if (unlikely(!ci.map)) {
1219 if (!bio_rw_flagged(bio, BIO_RW_BARRIER)) 1304 if (!bio_rw_flagged(bio, BIO_RW_BARRIER))
1220 bio_io_error(bio); 1305 bio_io_error(bio);
@@ -1255,7 +1340,7 @@ static int dm_merge_bvec(struct request_queue *q,
1255 struct bio_vec *biovec) 1340 struct bio_vec *biovec)
1256{ 1341{
1257 struct mapped_device *md = q->queuedata; 1342 struct mapped_device *md = q->queuedata;
1258 struct dm_table *map = dm_get_table(md); 1343 struct dm_table *map = dm_get_live_table(md);
1259 struct dm_target *ti; 1344 struct dm_target *ti;
1260 sector_t max_sectors; 1345 sector_t max_sectors;
1261 int max_size = 0; 1346 int max_size = 0;
@@ -1352,11 +1437,6 @@ static int dm_make_request(struct request_queue *q, struct bio *bio)
1352{ 1437{
1353 struct mapped_device *md = q->queuedata; 1438 struct mapped_device *md = q->queuedata;
1354 1439
1355 if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) {
1356 bio_endio(bio, -EOPNOTSUPP);
1357 return 0;
1358 }
1359
1360 return md->saved_make_request_fn(q, bio); /* call __make_request() */ 1440 return md->saved_make_request_fn(q, bio); /* call __make_request() */
1361} 1441}
1362 1442
@@ -1375,6 +1455,25 @@ static int dm_request(struct request_queue *q, struct bio *bio)
1375 return _dm_request(q, bio); 1455 return _dm_request(q, bio);
1376} 1456}
1377 1457
1458/*
1459 * Mark this request as flush request, so that dm_request_fn() can
1460 * recognize.
1461 */
1462static void dm_rq_prepare_flush(struct request_queue *q, struct request *rq)
1463{
1464 rq->cmd_type = REQ_TYPE_LINUX_BLOCK;
1465 rq->cmd[0] = REQ_LB_OP_FLUSH;
1466}
1467
1468static bool dm_rq_is_flush_request(struct request *rq)
1469{
1470 if (rq->cmd_type == REQ_TYPE_LINUX_BLOCK &&
1471 rq->cmd[0] == REQ_LB_OP_FLUSH)
1472 return true;
1473 else
1474 return false;
1475}
1476
1378void dm_dispatch_request(struct request *rq) 1477void dm_dispatch_request(struct request *rq)
1379{ 1478{
1380 int r; 1479 int r;
@@ -1420,25 +1519,54 @@ static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig,
1420static int setup_clone(struct request *clone, struct request *rq, 1519static int setup_clone(struct request *clone, struct request *rq,
1421 struct dm_rq_target_io *tio) 1520 struct dm_rq_target_io *tio)
1422{ 1521{
1423 int r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC, 1522 int r;
1424 dm_rq_bio_constructor, tio);
1425 1523
1426 if (r) 1524 if (dm_rq_is_flush_request(rq)) {
1427 return r; 1525 blk_rq_init(NULL, clone);
1526 clone->cmd_type = REQ_TYPE_FS;
1527 clone->cmd_flags |= (REQ_HARDBARRIER | WRITE);
1528 } else {
1529 r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC,
1530 dm_rq_bio_constructor, tio);
1531 if (r)
1532 return r;
1533
1534 clone->cmd = rq->cmd;
1535 clone->cmd_len = rq->cmd_len;
1536 clone->sense = rq->sense;
1537 clone->buffer = rq->buffer;
1538 }
1428 1539
1429 clone->cmd = rq->cmd;
1430 clone->cmd_len = rq->cmd_len;
1431 clone->sense = rq->sense;
1432 clone->buffer = rq->buffer;
1433 clone->end_io = end_clone_request; 1540 clone->end_io = end_clone_request;
1434 clone->end_io_data = tio; 1541 clone->end_io_data = tio;
1435 1542
1436 return 0; 1543 return 0;
1437} 1544}
1438 1545
1439static int dm_rq_flush_suspending(struct mapped_device *md) 1546static struct request *clone_rq(struct request *rq, struct mapped_device *md,
1547 gfp_t gfp_mask)
1440{ 1548{
1441 return !md->suspend_rq.special; 1549 struct request *clone;
1550 struct dm_rq_target_io *tio;
1551
1552 tio = alloc_rq_tio(md, gfp_mask);
1553 if (!tio)
1554 return NULL;
1555
1556 tio->md = md;
1557 tio->ti = NULL;
1558 tio->orig = rq;
1559 tio->error = 0;
1560 memset(&tio->info, 0, sizeof(tio->info));
1561
1562 clone = &tio->clone;
1563 if (setup_clone(clone, rq, tio)) {
1564 /* -ENOMEM */
1565 free_rq_tio(tio);
1566 return NULL;
1567 }
1568
1569 return clone;
1442} 1570}
1443 1571
1444/* 1572/*
@@ -1447,51 +1575,35 @@ static int dm_rq_flush_suspending(struct mapped_device *md)
1447static int dm_prep_fn(struct request_queue *q, struct request *rq) 1575static int dm_prep_fn(struct request_queue *q, struct request *rq)
1448{ 1576{
1449 struct mapped_device *md = q->queuedata; 1577 struct mapped_device *md = q->queuedata;
1450 struct dm_rq_target_io *tio;
1451 struct request *clone; 1578 struct request *clone;
1452 1579
1453 if (unlikely(rq == &md->suspend_rq)) { 1580 if (unlikely(dm_rq_is_flush_request(rq)))
1454 if (dm_rq_flush_suspending(md)) 1581 return BLKPREP_OK;
1455 return BLKPREP_OK;
1456 else
1457 /* The flush suspend was interrupted */
1458 return BLKPREP_KILL;
1459 }
1460 1582
1461 if (unlikely(rq->special)) { 1583 if (unlikely(rq->special)) {
1462 DMWARN("Already has something in rq->special."); 1584 DMWARN("Already has something in rq->special.");
1463 return BLKPREP_KILL; 1585 return BLKPREP_KILL;
1464 } 1586 }
1465 1587
1466 tio = alloc_rq_tio(md); /* Only one for each original request */ 1588 clone = clone_rq(rq, md, GFP_ATOMIC);
1467 if (!tio) 1589 if (!clone)
1468 /* -ENOMEM */
1469 return BLKPREP_DEFER; 1590 return BLKPREP_DEFER;
1470 1591
1471 tio->md = md;
1472 tio->ti = NULL;
1473 tio->orig = rq;
1474 tio->error = 0;
1475 memset(&tio->info, 0, sizeof(tio->info));
1476
1477 clone = &tio->clone;
1478 if (setup_clone(clone, rq, tio)) {
1479 /* -ENOMEM */
1480 free_rq_tio(tio);
1481 return BLKPREP_DEFER;
1482 }
1483
1484 rq->special = clone; 1592 rq->special = clone;
1485 rq->cmd_flags |= REQ_DONTPREP; 1593 rq->cmd_flags |= REQ_DONTPREP;
1486 1594
1487 return BLKPREP_OK; 1595 return BLKPREP_OK;
1488} 1596}
1489 1597
1490static void map_request(struct dm_target *ti, struct request *rq, 1598/*
1491 struct mapped_device *md) 1599 * Returns:
1600 * 0 : the request has been processed (not requeued)
1601 * !0 : the request has been requeued
1602 */
1603static int map_request(struct dm_target *ti, struct request *clone,
1604 struct mapped_device *md)
1492{ 1605{
1493 int r; 1606 int r, requeued = 0;
1494 struct request *clone = rq->special;
1495 struct dm_rq_target_io *tio = clone->end_io_data; 1607 struct dm_rq_target_io *tio = clone->end_io_data;
1496 1608
1497 /* 1609 /*
@@ -1511,11 +1623,14 @@ static void map_request(struct dm_target *ti, struct request *rq,
1511 break; 1623 break;
1512 case DM_MAPIO_REMAPPED: 1624 case DM_MAPIO_REMAPPED:
1513 /* The target has remapped the I/O so dispatch it */ 1625 /* The target has remapped the I/O so dispatch it */
1626 trace_block_rq_remap(clone->q, clone, disk_devt(dm_disk(md)),
1627 blk_rq_pos(tio->orig));
1514 dm_dispatch_request(clone); 1628 dm_dispatch_request(clone);
1515 break; 1629 break;
1516 case DM_MAPIO_REQUEUE: 1630 case DM_MAPIO_REQUEUE:
1517 /* The target wants to requeue the I/O */ 1631 /* The target wants to requeue the I/O */
1518 dm_requeue_unmapped_request(clone); 1632 dm_requeue_unmapped_request(clone);
1633 requeued = 1;
1519 break; 1634 break;
1520 default: 1635 default:
1521 if (r > 0) { 1636 if (r > 0) {
@@ -1527,6 +1642,8 @@ static void map_request(struct dm_target *ti, struct request *rq,
1527 dm_kill_unmapped_request(clone, r); 1642 dm_kill_unmapped_request(clone, r);
1528 break; 1643 break;
1529 } 1644 }
1645
1646 return requeued;
1530} 1647}
1531 1648
1532/* 1649/*
@@ -1536,29 +1653,26 @@ static void map_request(struct dm_target *ti, struct request *rq,
1536static void dm_request_fn(struct request_queue *q) 1653static void dm_request_fn(struct request_queue *q)
1537{ 1654{
1538 struct mapped_device *md = q->queuedata; 1655 struct mapped_device *md = q->queuedata;
1539 struct dm_table *map = dm_get_table(md); 1656 struct dm_table *map = dm_get_live_table(md);
1540 struct dm_target *ti; 1657 struct dm_target *ti;
1541 struct request *rq; 1658 struct request *rq, *clone;
1542 1659
1543 /* 1660 /*
1544 * For noflush suspend, check blk_queue_stopped() to immediately 1661 * For suspend, check blk_queue_stopped() and increment
1545 * quit I/O dispatching. 1662 * ->pending within a single queue_lock not to increment the
1663 * number of in-flight I/Os after the queue is stopped in
1664 * dm_suspend().
1546 */ 1665 */
1547 while (!blk_queue_plugged(q) && !blk_queue_stopped(q)) { 1666 while (!blk_queue_plugged(q) && !blk_queue_stopped(q)) {
1548 rq = blk_peek_request(q); 1667 rq = blk_peek_request(q);
1549 if (!rq) 1668 if (!rq)
1550 goto plug_and_out; 1669 goto plug_and_out;
1551 1670
1552 if (unlikely(rq == &md->suspend_rq)) { /* Flush suspend maker */ 1671 if (unlikely(dm_rq_is_flush_request(rq))) {
1553 if (queue_in_flight(q)) 1672 BUG_ON(md->flush_request);
1554 /* Not quiet yet. Wait more */ 1673 md->flush_request = rq;
1555 goto plug_and_out;
1556
1557 /* This device should be quiet now */
1558 __stop_queue(q);
1559 blk_start_request(rq); 1674 blk_start_request(rq);
1560 __blk_end_request_all(rq, 0); 1675 queue_work(md->wq, &md->barrier_work);
1561 wake_up(&md->wait);
1562 goto out; 1676 goto out;
1563 } 1677 }
1564 1678
@@ -1567,13 +1681,21 @@ static void dm_request_fn(struct request_queue *q)
1567 goto plug_and_out; 1681 goto plug_and_out;
1568 1682
1569 blk_start_request(rq); 1683 blk_start_request(rq);
1684 clone = rq->special;
1685 atomic_inc(&md->pending[rq_data_dir(clone)]);
1686
1570 spin_unlock(q->queue_lock); 1687 spin_unlock(q->queue_lock);
1571 map_request(ti, rq, md); 1688 if (map_request(ti, clone, md))
1689 goto requeued;
1690
1572 spin_lock_irq(q->queue_lock); 1691 spin_lock_irq(q->queue_lock);
1573 } 1692 }
1574 1693
1575 goto out; 1694 goto out;
1576 1695
1696requeued:
1697 spin_lock_irq(q->queue_lock);
1698
1577plug_and_out: 1699plug_and_out:
1578 if (!elv_queue_empty(q)) 1700 if (!elv_queue_empty(q))
1579 /* Some requests still remain, retry later */ 1701 /* Some requests still remain, retry later */
@@ -1595,7 +1717,7 @@ static int dm_lld_busy(struct request_queue *q)
1595{ 1717{
1596 int r; 1718 int r;
1597 struct mapped_device *md = q->queuedata; 1719 struct mapped_device *md = q->queuedata;
1598 struct dm_table *map = dm_get_table(md); 1720 struct dm_table *map = dm_get_live_table(md);
1599 1721
1600 if (!map || test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) 1722 if (!map || test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))
1601 r = 1; 1723 r = 1;
@@ -1610,7 +1732,7 @@ static int dm_lld_busy(struct request_queue *q)
1610static void dm_unplug_all(struct request_queue *q) 1732static void dm_unplug_all(struct request_queue *q)
1611{ 1733{
1612 struct mapped_device *md = q->queuedata; 1734 struct mapped_device *md = q->queuedata;
1613 struct dm_table *map = dm_get_table(md); 1735 struct dm_table *map = dm_get_live_table(md);
1614 1736
1615 if (map) { 1737 if (map) {
1616 if (dm_request_based(md)) 1738 if (dm_request_based(md))
@@ -1628,7 +1750,7 @@ static int dm_any_congested(void *congested_data, int bdi_bits)
1628 struct dm_table *map; 1750 struct dm_table *map;
1629 1751
1630 if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) { 1752 if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
1631 map = dm_get_table(md); 1753 map = dm_get_live_table(md);
1632 if (map) { 1754 if (map) {
1633 /* 1755 /*
1634 * Request-based dm cares about only own queue for 1756 * Request-based dm cares about only own queue for
@@ -1725,6 +1847,7 @@ out:
1725static const struct block_device_operations dm_blk_dops; 1847static const struct block_device_operations dm_blk_dops;
1726 1848
1727static void dm_wq_work(struct work_struct *work); 1849static void dm_wq_work(struct work_struct *work);
1850static void dm_rq_barrier_work(struct work_struct *work);
1728 1851
1729/* 1852/*
1730 * Allocate and initialise a blank device with a given minor. 1853 * Allocate and initialise a blank device with a given minor.
@@ -1754,6 +1877,7 @@ static struct mapped_device *alloc_dev(int minor)
1754 init_rwsem(&md->io_lock); 1877 init_rwsem(&md->io_lock);
1755 mutex_init(&md->suspend_lock); 1878 mutex_init(&md->suspend_lock);
1756 spin_lock_init(&md->deferred_lock); 1879 spin_lock_init(&md->deferred_lock);
1880 spin_lock_init(&md->barrier_error_lock);
1757 rwlock_init(&md->map_lock); 1881 rwlock_init(&md->map_lock);
1758 atomic_set(&md->holders, 1); 1882 atomic_set(&md->holders, 1);
1759 atomic_set(&md->open_count, 0); 1883 atomic_set(&md->open_count, 0);
@@ -1788,6 +1912,8 @@ static struct mapped_device *alloc_dev(int minor)
1788 blk_queue_softirq_done(md->queue, dm_softirq_done); 1912 blk_queue_softirq_done(md->queue, dm_softirq_done);
1789 blk_queue_prep_rq(md->queue, dm_prep_fn); 1913 blk_queue_prep_rq(md->queue, dm_prep_fn);
1790 blk_queue_lld_busy(md->queue, dm_lld_busy); 1914 blk_queue_lld_busy(md->queue, dm_lld_busy);
1915 blk_queue_ordered(md->queue, QUEUE_ORDERED_DRAIN_FLUSH,
1916 dm_rq_prepare_flush);
1791 1917
1792 md->disk = alloc_disk(1); 1918 md->disk = alloc_disk(1);
1793 if (!md->disk) 1919 if (!md->disk)
@@ -1797,6 +1923,7 @@ static struct mapped_device *alloc_dev(int minor)
1797 atomic_set(&md->pending[1], 0); 1923 atomic_set(&md->pending[1], 0);
1798 init_waitqueue_head(&md->wait); 1924 init_waitqueue_head(&md->wait);
1799 INIT_WORK(&md->work, dm_wq_work); 1925 INIT_WORK(&md->work, dm_wq_work);
1926 INIT_WORK(&md->barrier_work, dm_rq_barrier_work);
1800 init_waitqueue_head(&md->eventq); 1927 init_waitqueue_head(&md->eventq);
1801 1928
1802 md->disk->major = _major; 1929 md->disk->major = _major;
@@ -1921,9 +2048,13 @@ static void __set_size(struct mapped_device *md, sector_t size)
1921 mutex_unlock(&md->bdev->bd_inode->i_mutex); 2048 mutex_unlock(&md->bdev->bd_inode->i_mutex);
1922} 2049}
1923 2050
1924static int __bind(struct mapped_device *md, struct dm_table *t, 2051/*
1925 struct queue_limits *limits) 2052 * Returns old map, which caller must destroy.
2053 */
2054static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
2055 struct queue_limits *limits)
1926{ 2056{
2057 struct dm_table *old_map;
1927 struct request_queue *q = md->queue; 2058 struct request_queue *q = md->queue;
1928 sector_t size; 2059 sector_t size;
1929 unsigned long flags; 2060 unsigned long flags;
@@ -1938,11 +2069,6 @@ static int __bind(struct mapped_device *md, struct dm_table *t,
1938 2069
1939 __set_size(md, size); 2070 __set_size(md, size);
1940 2071
1941 if (!size) {
1942 dm_table_destroy(t);
1943 return 0;
1944 }
1945
1946 dm_table_event_callback(t, event_callback, md); 2072 dm_table_event_callback(t, event_callback, md);
1947 2073
1948 /* 2074 /*
@@ -1958,26 +2084,31 @@ static int __bind(struct mapped_device *md, struct dm_table *t,
1958 __bind_mempools(md, t); 2084 __bind_mempools(md, t);
1959 2085
1960 write_lock_irqsave(&md->map_lock, flags); 2086 write_lock_irqsave(&md->map_lock, flags);
2087 old_map = md->map;
1961 md->map = t; 2088 md->map = t;
1962 dm_table_set_restrictions(t, q, limits); 2089 dm_table_set_restrictions(t, q, limits);
1963 write_unlock_irqrestore(&md->map_lock, flags); 2090 write_unlock_irqrestore(&md->map_lock, flags);
1964 2091
1965 return 0; 2092 return old_map;
1966} 2093}
1967 2094
1968static void __unbind(struct mapped_device *md) 2095/*
2096 * Returns unbound table for the caller to free.
2097 */
2098static struct dm_table *__unbind(struct mapped_device *md)
1969{ 2099{
1970 struct dm_table *map = md->map; 2100 struct dm_table *map = md->map;
1971 unsigned long flags; 2101 unsigned long flags;
1972 2102
1973 if (!map) 2103 if (!map)
1974 return; 2104 return NULL;
1975 2105
1976 dm_table_event_callback(map, NULL, NULL); 2106 dm_table_event_callback(map, NULL, NULL);
1977 write_lock_irqsave(&md->map_lock, flags); 2107 write_lock_irqsave(&md->map_lock, flags);
1978 md->map = NULL; 2108 md->map = NULL;
1979 write_unlock_irqrestore(&md->map_lock, flags); 2109 write_unlock_irqrestore(&md->map_lock, flags);
1980 dm_table_destroy(map); 2110
2111 return map;
1981} 2112}
1982 2113
1983/* 2114/*
@@ -2059,18 +2190,18 @@ void dm_put(struct mapped_device *md)
2059 BUG_ON(test_bit(DMF_FREEING, &md->flags)); 2190 BUG_ON(test_bit(DMF_FREEING, &md->flags));
2060 2191
2061 if (atomic_dec_and_lock(&md->holders, &_minor_lock)) { 2192 if (atomic_dec_and_lock(&md->holders, &_minor_lock)) {
2062 map = dm_get_table(md); 2193 map = dm_get_live_table(md);
2063 idr_replace(&_minor_idr, MINOR_ALLOCED, 2194 idr_replace(&_minor_idr, MINOR_ALLOCED,
2064 MINOR(disk_devt(dm_disk(md)))); 2195 MINOR(disk_devt(dm_disk(md))));
2065 set_bit(DMF_FREEING, &md->flags); 2196 set_bit(DMF_FREEING, &md->flags);
2066 spin_unlock(&_minor_lock); 2197 spin_unlock(&_minor_lock);
2067 if (!dm_suspended(md)) { 2198 if (!dm_suspended_md(md)) {
2068 dm_table_presuspend_targets(map); 2199 dm_table_presuspend_targets(map);
2069 dm_table_postsuspend_targets(map); 2200 dm_table_postsuspend_targets(map);
2070 } 2201 }
2071 dm_sysfs_exit(md); 2202 dm_sysfs_exit(md);
2072 dm_table_put(map); 2203 dm_table_put(map);
2073 __unbind(md); 2204 dm_table_destroy(__unbind(md));
2074 free_dev(md); 2205 free_dev(md);
2075 } 2206 }
2076} 2207}
@@ -2080,8 +2211,6 @@ static int dm_wait_for_completion(struct mapped_device *md, int interruptible)
2080{ 2211{
2081 int r = 0; 2212 int r = 0;
2082 DECLARE_WAITQUEUE(wait, current); 2213 DECLARE_WAITQUEUE(wait, current);
2083 struct request_queue *q = md->queue;
2084 unsigned long flags;
2085 2214
2086 dm_unplug_all(md->queue); 2215 dm_unplug_all(md->queue);
2087 2216
@@ -2091,15 +2220,7 @@ static int dm_wait_for_completion(struct mapped_device *md, int interruptible)
2091 set_current_state(interruptible); 2220 set_current_state(interruptible);
2092 2221
2093 smp_mb(); 2222 smp_mb();
2094 if (dm_request_based(md)) { 2223 if (!md_in_flight(md))
2095 spin_lock_irqsave(q->queue_lock, flags);
2096 if (!queue_in_flight(q) && blk_queue_stopped(q)) {
2097 spin_unlock_irqrestore(q->queue_lock, flags);
2098 break;
2099 }
2100 spin_unlock_irqrestore(q->queue_lock, flags);
2101 } else if (!atomic_read(&md->pending[0]) &&
2102 !atomic_read(&md->pending[1]))
2103 break; 2224 break;
2104 2225
2105 if (interruptible == TASK_INTERRUPTIBLE && 2226 if (interruptible == TASK_INTERRUPTIBLE &&
@@ -2194,98 +2315,106 @@ static void dm_queue_flush(struct mapped_device *md)
2194 queue_work(md->wq, &md->work); 2315 queue_work(md->wq, &md->work);
2195} 2316}
2196 2317
2197/* 2318static void dm_rq_set_flush_nr(struct request *clone, unsigned flush_nr)
2198 * Swap in a new table (destroying old one).
2199 */
2200int dm_swap_table(struct mapped_device *md, struct dm_table *table)
2201{ 2319{
2202 struct queue_limits limits; 2320 struct dm_rq_target_io *tio = clone->end_io_data;
2203 int r = -EINVAL;
2204 2321
2205 mutex_lock(&md->suspend_lock); 2322 tio->info.flush_request = flush_nr;
2323}
2206 2324
2207 /* device must be suspended */ 2325/* Issue barrier requests to targets and wait for their completion. */
2208 if (!dm_suspended(md)) 2326static int dm_rq_barrier(struct mapped_device *md)
2209 goto out; 2327{
2328 int i, j;
2329 struct dm_table *map = dm_get_live_table(md);
2330 unsigned num_targets = dm_table_get_num_targets(map);
2331 struct dm_target *ti;
2332 struct request *clone;
2210 2333
2211 r = dm_calculate_queue_limits(table, &limits); 2334 md->barrier_error = 0;
2212 if (r)
2213 goto out;
2214 2335
2215 /* cannot change the device type, once a table is bound */ 2336 for (i = 0; i < num_targets; i++) {
2216 if (md->map && 2337 ti = dm_table_get_target(map, i);
2217 (dm_table_get_type(md->map) != dm_table_get_type(table))) { 2338 for (j = 0; j < ti->num_flush_requests; j++) {
2218 DMWARN("can't change the device type after a table is bound"); 2339 clone = clone_rq(md->flush_request, md, GFP_NOIO);
2219 goto out; 2340 dm_rq_set_flush_nr(clone, j);
2341 atomic_inc(&md->pending[rq_data_dir(clone)]);
2342 map_request(ti, clone, md);
2343 }
2220 } 2344 }
2221 2345
2222 __unbind(md); 2346 dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
2223 r = __bind(md, table, &limits); 2347 dm_table_put(map);
2224
2225out:
2226 mutex_unlock(&md->suspend_lock);
2227 return r;
2228}
2229 2348
2230static void dm_rq_invalidate_suspend_marker(struct mapped_device *md) 2349 return md->barrier_error;
2231{
2232 md->suspend_rq.special = (void *)0x1;
2233} 2350}
2234 2351
2235static void dm_rq_abort_suspend(struct mapped_device *md, int noflush) 2352static void dm_rq_barrier_work(struct work_struct *work)
2236{ 2353{
2354 int error;
2355 struct mapped_device *md = container_of(work, struct mapped_device,
2356 barrier_work);
2237 struct request_queue *q = md->queue; 2357 struct request_queue *q = md->queue;
2358 struct request *rq;
2238 unsigned long flags; 2359 unsigned long flags;
2239 2360
2240 spin_lock_irqsave(q->queue_lock, flags); 2361 /*
2241 if (!noflush) 2362 * Hold the md reference here and leave it at the last part so that
2242 dm_rq_invalidate_suspend_marker(md); 2363 * the md can't be deleted by device opener when the barrier request
2243 __start_queue(q); 2364 * completes.
2244 spin_unlock_irqrestore(q->queue_lock, flags); 2365 */
2245} 2366 dm_get(md);
2246 2367
2247static void dm_rq_start_suspend(struct mapped_device *md, int noflush) 2368 error = dm_rq_barrier(md);
2248{
2249 struct request *rq = &md->suspend_rq;
2250 struct request_queue *q = md->queue;
2251 2369
2252 if (noflush) 2370 rq = md->flush_request;
2253 stop_queue(q); 2371 md->flush_request = NULL;
2254 else { 2372
2255 blk_rq_init(q, rq); 2373 if (error == DM_ENDIO_REQUEUE) {
2256 blk_insert_request(q, rq, 0, NULL); 2374 spin_lock_irqsave(q->queue_lock, flags);
2257 } 2375 blk_requeue_request(q, rq);
2376 spin_unlock_irqrestore(q->queue_lock, flags);
2377 } else
2378 blk_end_request_all(rq, error);
2379
2380 blk_run_queue(q);
2381
2382 dm_put(md);
2258} 2383}
2259 2384
2260static int dm_rq_suspend_available(struct mapped_device *md, int noflush) 2385/*
2386 * Swap in a new table, returning the old one for the caller to destroy.
2387 */
2388struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table)
2261{ 2389{
2262 int r = 1; 2390 struct dm_table *map = ERR_PTR(-EINVAL);
2263 struct request *rq = &md->suspend_rq; 2391 struct queue_limits limits;
2264 struct request_queue *q = md->queue; 2392 int r;
2265 unsigned long flags;
2266 2393
2267 if (noflush) 2394 mutex_lock(&md->suspend_lock);
2268 return r;
2269 2395
2270 /* The marker must be protected by queue lock if it is in use */ 2396 /* device must be suspended */
2271 spin_lock_irqsave(q->queue_lock, flags); 2397 if (!dm_suspended_md(md))
2272 if (unlikely(rq->ref_count)) { 2398 goto out;
2273 /* 2399
2274 * This can happen, when the previous flush suspend was 2400 r = dm_calculate_queue_limits(table, &limits);
2275 * interrupted, the marker is still in the queue and 2401 if (r) {
2276 * this flush suspend has been invoked, because we don't 2402 map = ERR_PTR(r);
2277 * remove the marker at the time of suspend interruption. 2403 goto out;
2278 * We have only one marker per mapped_device, so we can't
2279 * start another flush suspend while it is in use.
2280 */
2281 BUG_ON(!rq->special); /* The marker should be invalidated */
2282 DMWARN("Invalidating the previous flush suspend is still in"
2283 " progress. Please retry later.");
2284 r = 0;
2285 } 2404 }
2286 spin_unlock_irqrestore(q->queue_lock, flags);
2287 2405
2288 return r; 2406 /* cannot change the device type, once a table is bound */
2407 if (md->map &&
2408 (dm_table_get_type(md->map) != dm_table_get_type(table))) {
2409 DMWARN("can't change the device type after a table is bound");
2410 goto out;
2411 }
2412
2413 map = __bind(md, table, &limits);
2414
2415out:
2416 mutex_unlock(&md->suspend_lock);
2417 return map;
2289} 2418}
2290 2419
2291/* 2420/*
@@ -2330,49 +2459,11 @@ static void unlock_fs(struct mapped_device *md)
2330/* 2459/*
2331 * Suspend mechanism in request-based dm. 2460 * Suspend mechanism in request-based dm.
2332 * 2461 *
2333 * After the suspend starts, further incoming requests are kept in 2462 * 1. Flush all I/Os by lock_fs() if needed.
2334 * the request_queue and deferred. 2463 * 2. Stop dispatching any I/O by stopping the request_queue.
2335 * Remaining requests in the request_queue at the start of suspend are flushed 2464 * 3. Wait for all in-flight I/Os to be completed or requeued.
2336 * if it is flush suspend.
2337 * The suspend completes when the following conditions have been satisfied,
2338 * so wait for it:
2339 * 1. q->in_flight is 0 (which means no in_flight request)
2340 * 2. queue has been stopped (which means no request dispatching)
2341 * 2465 *
2342 * 2466 * To abort suspend, start the request_queue.
2343 * Noflush suspend
2344 * ---------------
2345 * Noflush suspend doesn't need to dispatch remaining requests.
2346 * So stop the queue immediately. Then, wait for all in_flight requests
2347 * to be completed or requeued.
2348 *
2349 * To abort noflush suspend, start the queue.
2350 *
2351 *
2352 * Flush suspend
2353 * -------------
2354 * Flush suspend needs to dispatch remaining requests. So stop the queue
2355 * after the remaining requests are completed. (Requeued request must be also
2356 * re-dispatched and completed. Until then, we can't stop the queue.)
2357 *
2358 * During flushing the remaining requests, further incoming requests are also
2359 * inserted to the same queue. To distinguish which requests are to be
2360 * flushed, we insert a marker request to the queue at the time of starting
2361 * flush suspend, like a barrier.
2362 * The dispatching is blocked when the marker is found on the top of the queue.
2363 * And the queue is stopped when all in_flight requests are completed, since
2364 * that means the remaining requests are completely flushed.
2365 * Then, the marker is removed from the queue.
2366 *
2367 * To abort flush suspend, we also need to take care of the marker, not only
2368 * starting the queue.
2369 * We don't remove the marker forcibly from the queue since it's against
2370 * the block-layer manner. Instead, we put a invalidated mark on the marker.
2371 * When the invalidated marker is found on the top of the queue, it is
2372 * immediately removed from the queue, so it doesn't block dispatching.
2373 * Because we have only one marker per mapped_device, we can't start another
2374 * flush suspend until the invalidated marker is removed from the queue.
2375 * So fail and return with -EBUSY in such a case.
2376 */ 2467 */
2377int dm_suspend(struct mapped_device *md, unsigned suspend_flags) 2468int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
2378{ 2469{
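The replacement comment reduces request-based suspend to three steps; condensed as code below (error handling and the bio-based details are omitted, and the call sites are a sketch of dm_suspend(), not a copy of it):

if (!noflush && do_lockfs)
	r = lock_fs(md);			/* 1. flush filesystem I/O */

if (dm_request_based(md))
	stop_queue(md->queue);			/* 2. stop dispatching new requests */
flush_workqueue(md->wq);			/*    barrier work needs a stopped queue */

r = dm_wait_for_completion(md, TASK_INTERRUPTIBLE);	/* 3. drain in-flight I/O */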
@@ -2383,17 +2474,12 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
2383 2474
2384 mutex_lock(&md->suspend_lock); 2475 mutex_lock(&md->suspend_lock);
2385 2476
2386 if (dm_suspended(md)) { 2477 if (dm_suspended_md(md)) {
2387 r = -EINVAL; 2478 r = -EINVAL;
2388 goto out_unlock; 2479 goto out_unlock;
2389 } 2480 }
2390 2481
2391 if (dm_request_based(md) && !dm_rq_suspend_available(md, noflush)) { 2482 map = dm_get_live_table(md);
2392 r = -EBUSY;
2393 goto out_unlock;
2394 }
2395
2396 map = dm_get_table(md);
2397 2483
2398 /* 2484 /*
2399 * DMF_NOFLUSH_SUSPENDING must be set before presuspend. 2485 * DMF_NOFLUSH_SUSPENDING must be set before presuspend.
@@ -2406,8 +2492,10 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
2406 dm_table_presuspend_targets(map); 2492 dm_table_presuspend_targets(map);
2407 2493
2408 /* 2494 /*
2409 * Flush I/O to the device. noflush supersedes do_lockfs, 2495 * Flush I/O to the device.
2410 * because lock_fs() needs to flush I/Os. 2496 * Any I/O submitted after lock_fs() may not be flushed.
2497 * noflush takes precedence over do_lockfs.
2498 * (lock_fs() flushes I/Os and waits for them to complete.)
2411 */ 2499 */
2412 if (!noflush && do_lockfs) { 2500 if (!noflush && do_lockfs) {
2413 r = lock_fs(md); 2501 r = lock_fs(md);
@@ -2436,10 +2524,15 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
2436 set_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags); 2524 set_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags);
2437 up_write(&md->io_lock); 2525 up_write(&md->io_lock);
2438 2526
2439 flush_workqueue(md->wq); 2527 /*
2440 2528 * Request-based dm uses md->wq for barrier (dm_rq_barrier_work) which
2529 * can be kicked until md->queue is stopped. So stop md->queue before
2530 * flushing md->wq.
2531 */
2441 if (dm_request_based(md)) 2532 if (dm_request_based(md))
2442 dm_rq_start_suspend(md, noflush); 2533 stop_queue(md->queue);
2534
2535 flush_workqueue(md->wq);
2443 2536
2444 /* 2537 /*
2445 * At this point no more requests are entering target request routines. 2538 * At this point no more requests are entering target request routines.
@@ -2458,7 +2551,7 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
2458 dm_queue_flush(md); 2551 dm_queue_flush(md);
2459 2552
2460 if (dm_request_based(md)) 2553 if (dm_request_based(md))
2461 dm_rq_abort_suspend(md, noflush); 2554 start_queue(md->queue);
2462 2555
2463 unlock_fs(md); 2556 unlock_fs(md);
2464 goto out; /* pushback list is already flushed, so skip flush */ 2557 goto out; /* pushback list is already flushed, so skip flush */
@@ -2470,10 +2563,10 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
2470 * requests are being added to md->deferred list. 2563 * requests are being added to md->deferred list.
2471 */ 2564 */
2472 2565
2473 dm_table_postsuspend_targets(map);
2474
2475 set_bit(DMF_SUSPENDED, &md->flags); 2566 set_bit(DMF_SUSPENDED, &md->flags);
2476 2567
2568 dm_table_postsuspend_targets(map);
2569
2477out: 2570out:
2478 dm_table_put(map); 2571 dm_table_put(map);
2479 2572
@@ -2488,10 +2581,10 @@ int dm_resume(struct mapped_device *md)
2488 struct dm_table *map = NULL; 2581 struct dm_table *map = NULL;
2489 2582
2490 mutex_lock(&md->suspend_lock); 2583 mutex_lock(&md->suspend_lock);
2491 if (!dm_suspended(md)) 2584 if (!dm_suspended_md(md))
2492 goto out; 2585 goto out;
2493 2586
2494 map = dm_get_table(md); 2587 map = dm_get_live_table(md);
2495 if (!map || !dm_table_get_size(map)) 2588 if (!map || !dm_table_get_size(map))
2496 goto out; 2589 goto out;
2497 2590
@@ -2525,18 +2618,19 @@ out:
2525/*----------------------------------------------------------------- 2618/*-----------------------------------------------------------------
2526 * Event notification. 2619 * Event notification.
2527 *---------------------------------------------------------------*/ 2620 *---------------------------------------------------------------*/
2528void dm_kobject_uevent(struct mapped_device *md, enum kobject_action action, 2621int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
2529 unsigned cookie) 2622 unsigned cookie)
2530{ 2623{
2531 char udev_cookie[DM_COOKIE_LENGTH]; 2624 char udev_cookie[DM_COOKIE_LENGTH];
2532 char *envp[] = { udev_cookie, NULL }; 2625 char *envp[] = { udev_cookie, NULL };
2533 2626
2534 if (!cookie) 2627 if (!cookie)
2535 kobject_uevent(&disk_to_dev(md->disk)->kobj, action); 2628 return kobject_uevent(&disk_to_dev(md->disk)->kobj, action);
2536 else { 2629 else {
2537 snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u", 2630 snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u",
2538 DM_COOKIE_ENV_VAR_NAME, cookie); 2631 DM_COOKIE_ENV_VAR_NAME, cookie);
2539 kobject_uevent_env(&disk_to_dev(md->disk)->kobj, action, envp); 2632 return kobject_uevent_env(&disk_to_dev(md->disk)->kobj,
2633 action, envp);
2540 } 2634 }
2541} 2635}
2542 2636
@@ -2592,26 +2686,27 @@ struct mapped_device *dm_get_from_kobject(struct kobject *kobj)
2592 return NULL; 2686 return NULL;
2593 2687
2594 if (test_bit(DMF_FREEING, &md->flags) || 2688 if (test_bit(DMF_FREEING, &md->flags) ||
2595 test_bit(DMF_DELETING, &md->flags)) 2689 dm_deleting_md(md))
2596 return NULL; 2690 return NULL;
2597 2691
2598 dm_get(md); 2692 dm_get(md);
2599 return md; 2693 return md;
2600} 2694}
2601 2695
2602int dm_suspended(struct mapped_device *md) 2696int dm_suspended_md(struct mapped_device *md)
2603{ 2697{
2604 return test_bit(DMF_SUSPENDED, &md->flags); 2698 return test_bit(DMF_SUSPENDED, &md->flags);
2605} 2699}
2606 2700
2607int dm_noflush_suspending(struct dm_target *ti) 2701int dm_suspended(struct dm_target *ti)
2608{ 2702{
2609 struct mapped_device *md = dm_table_get_md(ti->table); 2703 return dm_suspended_md(dm_table_get_md(ti->table));
2610 int r = __noflush_suspending(md); 2704}
2611 2705EXPORT_SYMBOL_GPL(dm_suspended);
2612 dm_put(md);
2613 2706
2614 return r; 2707int dm_noflush_suspending(struct dm_target *ti)
2708{
2709 return __noflush_suspending(dm_table_get_md(ti->table));
2615} 2710}
2616EXPORT_SYMBOL_GPL(dm_noflush_suspending); 2711EXPORT_SYMBOL_GPL(dm_noflush_suspending);
2617 2712
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index a7663eba17e2..bad1724d4869 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -89,6 +89,16 @@ int dm_target_iterate(void (*iter_func)(struct target_type *tt,
89int dm_split_args(int *argc, char ***argvp, char *input); 89int dm_split_args(int *argc, char ***argvp, char *input);
90 90
91/* 91/*
92 * Is this mapped_device being deleted?
93 */
94int dm_deleting_md(struct mapped_device *md);
95
96/*
97 * Is this mapped_device suspended?
98 */
99int dm_suspended_md(struct mapped_device *md);
100
101/*
92 * The device-mapper can be driven through one of two interfaces; 102 * The device-mapper can be driven through one of two interfaces;
93 * ioctl or filesystem, depending which patch you have applied. 103 * ioctl or filesystem, depending which patch you have applied.
94 */ 104 */
@@ -115,8 +125,11 @@ void dm_stripe_exit(void);
115int dm_open_count(struct mapped_device *md); 125int dm_open_count(struct mapped_device *md);
116int dm_lock_for_deletion(struct mapped_device *md); 126int dm_lock_for_deletion(struct mapped_device *md);
117 127
118void dm_kobject_uevent(struct mapped_device *md, enum kobject_action action, 128int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
119 unsigned cookie); 129 unsigned cookie);
130
131int dm_io_init(void);
132void dm_io_exit(void);
120 133
121int dm_kcopyd_init(void); 134int dm_kcopyd_init(void);
122void dm_kcopyd_exit(void); 135void dm_kcopyd_exit(void);
diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c
index 87d88dbb667f..8e3850b98cca 100644
--- a/drivers/md/faulty.c
+++ b/drivers/md/faulty.c
@@ -64,6 +64,7 @@
64#define MaxFault 50 64#define MaxFault 50
65#include <linux/blkdev.h> 65#include <linux/blkdev.h>
66#include <linux/raid/md_u.h> 66#include <linux/raid/md_u.h>
67#include <linux/slab.h>
67#include "md.h" 68#include "md.h"
68#include <linux/seq_file.h> 69#include <linux/seq_file.h>
69 70
@@ -360,6 +361,7 @@ static void raid_exit(void)
360module_init(raid_init); 361module_init(raid_init);
361module_exit(raid_exit); 362module_exit(raid_exit);
362MODULE_LICENSE("GPL"); 363MODULE_LICENSE("GPL");
364MODULE_DESCRIPTION("Fault injection personality for MD");
363MODULE_ALIAS("md-personality-10"); /* faulty */ 365MODULE_ALIAS("md-personality-10"); /* faulty */
364MODULE_ALIAS("md-faulty"); 366MODULE_ALIAS("md-faulty");
365MODULE_ALIAS("md-level--5"); 367MODULE_ALIAS("md-level--5");
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 1ceceb334d5e..09437e958235 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -19,6 +19,7 @@
19#include <linux/blkdev.h> 19#include <linux/blkdev.h>
20#include <linux/raid/md_u.h> 20#include <linux/raid/md_u.h>
21#include <linux/seq_file.h> 21#include <linux/seq_file.h>
22#include <linux/slab.h>
22#include "md.h" 23#include "md.h"
23#include "linear.h" 24#include "linear.h"
24 25
@@ -172,12 +173,14 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
172 disk_stack_limits(mddev->gendisk, rdev->bdev, 173 disk_stack_limits(mddev->gendisk, rdev->bdev,
173 rdev->data_offset << 9); 174 rdev->data_offset << 9);
174 /* as we don't honour merge_bvec_fn, we must never risk 175 /* as we don't honour merge_bvec_fn, we must never risk
175 * violating it, so limit ->max_sector to one PAGE, as 176 * violating it, so limit max_segments to 1 lying within
176 * a one page request is never in violation. 177 * a single page.
177 */ 178 */
178 if (rdev->bdev->bd_disk->queue->merge_bvec_fn && 179 if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
179 queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9)) 180 blk_queue_max_segments(mddev->queue, 1);
180 blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); 181 blk_queue_segment_boundary(mddev->queue,
182 PAGE_CACHE_SIZE - 1);
183 }
181 184
182 conf->array_sectors += rdev->sectors; 185 conf->array_sectors += rdev->sectors;
183 cnt++; 186 cnt++;
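
The hunk above changes how linear copes with member devices that define a merge_bvec_fn: instead of clamping max_sectors to one page, it now allows at most one segment per request and keeps that segment within a single PAGE_CACHE_SIZE window, preserving the old guarantee (a request never spans more than one page) while expressing it through the segment limits rather than max_sectors. A tiny standalone illustration of why the boundary rule is sufficient; segment_fits_one_page() and the constants below are demo code, not kernel helpers.

#include <stdbool.h>
#include <stdio.h>

#define DEMO_PAGE_SIZE		4096UL
#define DEMO_BOUNDARY_MASK	(DEMO_PAGE_SIZE - 1)	/* cf. blk_queue_segment_boundary() */

static bool segment_fits_one_page(unsigned long start, unsigned long len)
{
	/* first and last byte must fall inside the same page-sized window */
	return (start & ~DEMO_BOUNDARY_MASK) ==
	       ((start + len - 1) & ~DEMO_BOUNDARY_MASK);
}

int main(void)
{
	printf("%d\n", segment_fits_one_page(0x1000, 4096));	/* 1: exactly one page */
	printf("%d\n", segment_fits_one_page(0x1800, 4096));	/* 0: straddles a page boundary */
	return 0;
}
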
@@ -292,7 +295,7 @@ static int linear_make_request (struct request_queue *q, struct bio *bio)
292 int cpu; 295 int cpu;
293 296
294 if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) { 297 if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) {
295 bio_endio(bio, -EOPNOTSUPP); 298 md_barrier_request(mddev, bio);
296 return 0; 299 return 0;
297 } 300 }
298 301
@@ -383,6 +386,7 @@ static void linear_exit (void)
383module_init(linear_init); 386module_init(linear_init);
384module_exit(linear_exit); 387module_exit(linear_exit);
385MODULE_LICENSE("GPL"); 388MODULE_LICENSE("GPL");
389MODULE_DESCRIPTION("Linear device concatenation personality for MD");
386MODULE_ALIAS("md-personality-1"); /* LINEAR - deprecated*/ 390MODULE_ALIAS("md-personality-1"); /* LINEAR - deprecated*/
387MODULE_ALIAS("md-linear"); 391MODULE_ALIAS("md-linear");
388MODULE_ALIAS("md-level--1"); 392MODULE_ALIAS("md-level--1");
diff --git a/drivers/md/md.c b/drivers/md/md.c
index b182f86a19dd..cefd63daff31 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -39,14 +39,17 @@
39#include <linux/buffer_head.h> /* for invalidate_bdev */ 39#include <linux/buffer_head.h> /* for invalidate_bdev */
40#include <linux/poll.h> 40#include <linux/poll.h>
41#include <linux/ctype.h> 41#include <linux/ctype.h>
42#include <linux/string.h>
42#include <linux/hdreg.h> 43#include <linux/hdreg.h>
43#include <linux/proc_fs.h> 44#include <linux/proc_fs.h>
44#include <linux/random.h> 45#include <linux/random.h>
45#include <linux/reboot.h> 46#include <linux/reboot.h>
46#include <linux/file.h> 47#include <linux/file.h>
48#include <linux/compat.h>
47#include <linux/delay.h> 49#include <linux/delay.h>
48#include <linux/raid/md_p.h> 50#include <linux/raid/md_p.h>
49#include <linux/raid/md_u.h> 51#include <linux/raid/md_u.h>
52#include <linux/slab.h>
50#include "md.h" 53#include "md.h"
51#include "bitmap.h" 54#include "bitmap.h"
52 55
@@ -68,6 +71,12 @@ static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
68#define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); } 71#define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); }
69 72
70/* 73/*
74 * Default number of read corrections we'll attempt on an rdev
75 * before ejecting it from the array. We divide the read error
76 * count by 2 for every hour elapsed between read errors.
77 */
78#define MD_DEFAULT_MAX_CORRECTED_READ_ERRORS 20
79/*
71 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit' 80 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
72 * is 1000 KB/sec, so the extra system load does not show up that much. 81 * is 1000 KB/sec, so the extra system load does not show up that much.
73 * Increase it if you want to have more _guaranteed_ speed. Note that 82 * Increase it if you want to have more _guaranteed_ speed. Note that
@@ -98,44 +107,40 @@ static struct ctl_table_header *raid_table_header;
98 107
99static ctl_table raid_table[] = { 108static ctl_table raid_table[] = {
100 { 109 {
101 .ctl_name = DEV_RAID_SPEED_LIMIT_MIN,
102 .procname = "speed_limit_min", 110 .procname = "speed_limit_min",
103 .data = &sysctl_speed_limit_min, 111 .data = &sysctl_speed_limit_min,
104 .maxlen = sizeof(int), 112 .maxlen = sizeof(int),
105 .mode = S_IRUGO|S_IWUSR, 113 .mode = S_IRUGO|S_IWUSR,
106 .proc_handler = &proc_dointvec, 114 .proc_handler = proc_dointvec,
107 }, 115 },
108 { 116 {
109 .ctl_name = DEV_RAID_SPEED_LIMIT_MAX,
110 .procname = "speed_limit_max", 117 .procname = "speed_limit_max",
111 .data = &sysctl_speed_limit_max, 118 .data = &sysctl_speed_limit_max,
112 .maxlen = sizeof(int), 119 .maxlen = sizeof(int),
113 .mode = S_IRUGO|S_IWUSR, 120 .mode = S_IRUGO|S_IWUSR,
114 .proc_handler = &proc_dointvec, 121 .proc_handler = proc_dointvec,
115 }, 122 },
116 { .ctl_name = 0 } 123 { }
117}; 124};
118 125
119static ctl_table raid_dir_table[] = { 126static ctl_table raid_dir_table[] = {
120 { 127 {
121 .ctl_name = DEV_RAID,
122 .procname = "raid", 128 .procname = "raid",
123 .maxlen = 0, 129 .maxlen = 0,
124 .mode = S_IRUGO|S_IXUGO, 130 .mode = S_IRUGO|S_IXUGO,
125 .child = raid_table, 131 .child = raid_table,
126 }, 132 },
127 { .ctl_name = 0 } 133 { }
128}; 134};
129 135
130static ctl_table raid_root_table[] = { 136static ctl_table raid_root_table[] = {
131 { 137 {
132 .ctl_name = CTL_DEV,
133 .procname = "dev", 138 .procname = "dev",
134 .maxlen = 0, 139 .maxlen = 0,
135 .mode = 0555, 140 .mode = 0555,
136 .child = raid_dir_table, 141 .child = raid_dir_table,
137 }, 142 },
138 { .ctl_name = 0 } 143 { }
139}; 144};
140 145
141static const struct block_device_operations md_fops; 146static const struct block_device_operations md_fops;
@@ -217,12 +222,12 @@ static int md_make_request(struct request_queue *q, struct bio *bio)
217 return 0; 222 return 0;
218 } 223 }
219 rcu_read_lock(); 224 rcu_read_lock();
220 if (mddev->suspended) { 225 if (mddev->suspended || mddev->barrier) {
221 DEFINE_WAIT(__wait); 226 DEFINE_WAIT(__wait);
222 for (;;) { 227 for (;;) {
223 prepare_to_wait(&mddev->sb_wait, &__wait, 228 prepare_to_wait(&mddev->sb_wait, &__wait,
224 TASK_UNINTERRUPTIBLE); 229 TASK_UNINTERRUPTIBLE);
225 if (!mddev->suspended) 230 if (!mddev->suspended && !mddev->barrier)
226 break; 231 break;
227 rcu_read_unlock(); 232 rcu_read_unlock();
228 schedule(); 233 schedule();
@@ -264,10 +269,110 @@ static void mddev_resume(mddev_t *mddev)
264 269
265int mddev_congested(mddev_t *mddev, int bits) 270int mddev_congested(mddev_t *mddev, int bits)
266{ 271{
272 if (mddev->barrier)
273 return 1;
267 return mddev->suspended; 274 return mddev->suspended;
268} 275}
269EXPORT_SYMBOL(mddev_congested); 276EXPORT_SYMBOL(mddev_congested);
270 277
278/*
279 * Generic barrier handling for md
280 */
281
282#define POST_REQUEST_BARRIER ((void*)1)
283
284static void md_end_barrier(struct bio *bio, int err)
285{
286 mdk_rdev_t *rdev = bio->bi_private;
287 mddev_t *mddev = rdev->mddev;
288 if (err == -EOPNOTSUPP && mddev->barrier != POST_REQUEST_BARRIER)
289 set_bit(BIO_EOPNOTSUPP, &mddev->barrier->bi_flags);
290
291 rdev_dec_pending(rdev, mddev);
292
293 if (atomic_dec_and_test(&mddev->flush_pending)) {
294 if (mddev->barrier == POST_REQUEST_BARRIER) {
295 /* This was a post-request barrier */
296 mddev->barrier = NULL;
297 wake_up(&mddev->sb_wait);
298 } else
299 /* The pre-request barrier has finished */
300 schedule_work(&mddev->barrier_work);
301 }
302 bio_put(bio);
303}
304
305static void submit_barriers(mddev_t *mddev)
306{
307 mdk_rdev_t *rdev;
308
309 rcu_read_lock();
310 list_for_each_entry_rcu(rdev, &mddev->disks, same_set)
311 if (rdev->raid_disk >= 0 &&
312 !test_bit(Faulty, &rdev->flags)) {
313 /* Take two references, one is dropped
314 * when request finishes, one after
315 * we reclaim rcu_read_lock
316 */
317 struct bio *bi;
318 atomic_inc(&rdev->nr_pending);
319 atomic_inc(&rdev->nr_pending);
320 rcu_read_unlock();
321 bi = bio_alloc(GFP_KERNEL, 0);
322 bi->bi_end_io = md_end_barrier;
323 bi->bi_private = rdev;
324 bi->bi_bdev = rdev->bdev;
325 atomic_inc(&mddev->flush_pending);
326 submit_bio(WRITE_BARRIER, bi);
327 rcu_read_lock();
328 rdev_dec_pending(rdev, mddev);
329 }
330 rcu_read_unlock();
331}
332
333static void md_submit_barrier(struct work_struct *ws)
334{
335 mddev_t *mddev = container_of(ws, mddev_t, barrier_work);
336 struct bio *bio = mddev->barrier;
337
338 atomic_set(&mddev->flush_pending, 1);
339
340 if (test_bit(BIO_EOPNOTSUPP, &bio->bi_flags))
341 bio_endio(bio, -EOPNOTSUPP);
342 else if (bio->bi_size == 0)
343 /* an empty barrier - all done */
344 bio_endio(bio, 0);
345 else {
346 bio->bi_rw &= ~(1<<BIO_RW_BARRIER);
347 if (mddev->pers->make_request(mddev->queue, bio))
348 generic_make_request(bio);
349 mddev->barrier = POST_REQUEST_BARRIER;
350 submit_barriers(mddev);
351 }
352 if (atomic_dec_and_test(&mddev->flush_pending)) {
353 mddev->barrier = NULL;
354 wake_up(&mddev->sb_wait);
355 }
356}
357
358void md_barrier_request(mddev_t *mddev, struct bio *bio)
359{
360 spin_lock_irq(&mddev->write_lock);
361 wait_event_lock_irq(mddev->sb_wait,
362 !mddev->barrier,
363 mddev->write_lock, /*nothing*/);
364 mddev->barrier = bio;
365 spin_unlock_irq(&mddev->write_lock);
366
367 atomic_set(&mddev->flush_pending, 1);
368 INIT_WORK(&mddev->barrier_work, md_submit_barrier);
369
370 submit_barriers(mddev);
371
372 if (atomic_dec_and_test(&mddev->flush_pending))
373 schedule_work(&mddev->barrier_work);
374}
375EXPORT_SYMBOL(md_barrier_request);
271 376
272static inline mddev_t *mddev_get(mddev_t *mddev) 377static inline mddev_t *mddev_get(mddev_t *mddev)
273{ 378{
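
The hunk above adds generic barrier handling: md_barrier_request() parks the incoming barrier bio in mddev->barrier, issues a zero-length WRITE_BARRIER to every active rdev, and whichever path drops flush_pending to zero advances the sequence (md_submit_barrier() then resubmits the payload without the barrier flag and repeats the flush as POST_REQUEST_BARRIER). The counter is pre-loaded with 1 so the submitter cannot race with completions that finish before it has issued everything. Below is a minimal userspace sketch of that "extra reference, last one out advances the state" pattern; the thread model, function names and timings are illustrative only, not kernel code.

/*
 * Userspace sketch of the counting pattern used above: flush_pending starts
 * at 1 so the submitter holds its own reference while it issues the
 * per-device flushes; whichever decrement reaches zero runs the next stage.
 * Build with: cc -pthread demo.c
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

static atomic_int flush_pending;

static void next_stage(const char *who)
{
	printf("%s: all flushes finished, advancing barrier state\n", who);
}

static void *flush_done(void *arg)
{
	usleep(1000 * (uintptr_t)arg);		/* pretend the flush took a while */
	if (atomic_fetch_sub(&flush_pending, 1) == 1)
		next_stage("completion handler");	/* cf. md_end_barrier() */
	return NULL;
}

int main(void)
{
	pthread_t t[3];

	atomic_store(&flush_pending, 1);	/* submitter's own reference */
	for (int i = 0; i < 3; i++) {
		atomic_fetch_add(&flush_pending, 1);
		pthread_create(&t[i], NULL, flush_done, (void *)(uintptr_t)(i + 1));
	}
	/* Drop the submitter's reference; if every flush already completed,
	 * the submitter advances the state itself (cf. md_barrier_request()). */
	if (atomic_fetch_sub(&flush_pending, 1) == 1)
		next_stage("submitter");

	for (int i = 0; i < 3; i++)
		pthread_join(t[i], NULL);
	return 0;
}
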
@@ -282,7 +387,9 @@ static void mddev_put(mddev_t *mddev)
282 if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock)) 387 if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
283 return; 388 return;
284 if (!mddev->raid_disks && list_empty(&mddev->disks) && 389 if (!mddev->raid_disks && list_empty(&mddev->disks) &&
285 !mddev->hold_active) { 390 mddev->ctime == 0 && !mddev->hold_active) {
391 /* Array is not configured at all, and not held active,
392 * so destroy it */
286 list_del(&mddev->all_mddevs); 393 list_del(&mddev->all_mddevs);
287 if (mddev->gendisk) { 394 if (mddev->gendisk) {
288 /* we did a probe so need to clean up. 395 /* we did a probe so need to clean up.
@@ -367,6 +474,7 @@ static mddev_t * mddev_find(dev_t unit)
367 474
368 mutex_init(&new->open_mutex); 475 mutex_init(&new->open_mutex);
369 mutex_init(&new->reconfig_mutex); 476 mutex_init(&new->reconfig_mutex);
477 mutex_init(&new->bitmap_info.mutex);
370 INIT_LIST_HEAD(&new->disks); 478 INIT_LIST_HEAD(&new->disks);
371 INIT_LIST_HEAD(&new->all_mddevs); 479 INIT_LIST_HEAD(&new->all_mddevs);
372 init_timer(&new->safemode_timer); 480 init_timer(&new->safemode_timer);
@@ -374,6 +482,7 @@ static mddev_t * mddev_find(dev_t unit)
374 atomic_set(&new->openers, 0); 482 atomic_set(&new->openers, 0);
375 atomic_set(&new->active_io, 0); 483 atomic_set(&new->active_io, 0);
376 spin_lock_init(&new->write_lock); 484 spin_lock_init(&new->write_lock);
485 atomic_set(&new->flush_pending, 0);
377 init_waitqueue_head(&new->sb_wait); 486 init_waitqueue_head(&new->sb_wait);
378 init_waitqueue_head(&new->recovery_wait); 487 init_waitqueue_head(&new->recovery_wait);
379 new->reshape_position = MaxSector; 488 new->reshape_position = MaxSector;
@@ -752,7 +861,7 @@ struct super_type {
752 */ 861 */
753int md_check_no_bitmap(mddev_t *mddev) 862int md_check_no_bitmap(mddev_t *mddev)
754{ 863{
755 if (!mddev->bitmap_file && !mddev->bitmap_offset) 864 if (!mddev->bitmap_info.file && !mddev->bitmap_info.offset)
756 return 0; 865 return 0;
757 printk(KERN_ERR "%s: bitmaps are not supported for %s\n", 866 printk(KERN_ERR "%s: bitmaps are not supported for %s\n",
758 mdname(mddev), mddev->pers->name); 867 mdname(mddev), mddev->pers->name);
@@ -880,8 +989,8 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
880 mddev->raid_disks = sb->raid_disks; 989 mddev->raid_disks = sb->raid_disks;
881 mddev->dev_sectors = sb->size * 2; 990 mddev->dev_sectors = sb->size * 2;
882 mddev->events = ev1; 991 mddev->events = ev1;
883 mddev->bitmap_offset = 0; 992 mddev->bitmap_info.offset = 0;
884 mddev->default_bitmap_offset = MD_SB_BYTES >> 9; 993 mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
885 994
886 if (mddev->minor_version >= 91) { 995 if (mddev->minor_version >= 91) {
887 mddev->reshape_position = sb->reshape_position; 996 mddev->reshape_position = sb->reshape_position;
@@ -915,8 +1024,9 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
915 mddev->max_disks = MD_SB_DISKS; 1024 mddev->max_disks = MD_SB_DISKS;
916 1025
917 if (sb->state & (1<<MD_SB_BITMAP_PRESENT) && 1026 if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
918 mddev->bitmap_file == NULL) 1027 mddev->bitmap_info.file == NULL)
919 mddev->bitmap_offset = mddev->default_bitmap_offset; 1028 mddev->bitmap_info.offset =
1029 mddev->bitmap_info.default_offset;
920 1030
921 } else if (mddev->pers == NULL) { 1031 } else if (mddev->pers == NULL) {
922 /* Insist on good event counter while assembling */ 1032 /* Insist on good event counter while assembling */
@@ -1033,7 +1143,7 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
1033 sb->layout = mddev->layout; 1143 sb->layout = mddev->layout;
1034 sb->chunk_size = mddev->chunk_sectors << 9; 1144 sb->chunk_size = mddev->chunk_sectors << 9;
1035 1145
1036 if (mddev->bitmap && mddev->bitmap_file == NULL) 1146 if (mddev->bitmap && mddev->bitmap_info.file == NULL)
1037 sb->state |= (1<<MD_SB_BITMAP_PRESENT); 1147 sb->state |= (1<<MD_SB_BITMAP_PRESENT);
1038 1148
1039 sb->disks[0].state = (1<<MD_DISK_REMOVED); 1149 sb->disks[0].state = (1<<MD_DISK_REMOVED);
@@ -1111,7 +1221,7 @@ super_90_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
1111{ 1221{
1112 if (num_sectors && num_sectors < rdev->mddev->dev_sectors) 1222 if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
1113 return 0; /* component must fit device */ 1223 return 0; /* component must fit device */
1114 if (rdev->mddev->bitmap_offset) 1224 if (rdev->mddev->bitmap_info.offset)
1115 return 0; /* can't move bitmap */ 1225 return 0; /* can't move bitmap */
1116 rdev->sb_start = calc_dev_sboffset(rdev->bdev); 1226 rdev->sb_start = calc_dev_sboffset(rdev->bdev);
1117 if (!num_sectors || num_sectors > rdev->sb_start) 1227 if (!num_sectors || num_sectors > rdev->sb_start)
@@ -1290,8 +1400,8 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
1290 mddev->raid_disks = le32_to_cpu(sb->raid_disks); 1400 mddev->raid_disks = le32_to_cpu(sb->raid_disks);
1291 mddev->dev_sectors = le64_to_cpu(sb->size); 1401 mddev->dev_sectors = le64_to_cpu(sb->size);
1292 mddev->events = ev1; 1402 mddev->events = ev1;
1293 mddev->bitmap_offset = 0; 1403 mddev->bitmap_info.offset = 0;
1294 mddev->default_bitmap_offset = 1024 >> 9; 1404 mddev->bitmap_info.default_offset = 1024 >> 9;
1295 1405
1296 mddev->recovery_cp = le64_to_cpu(sb->resync_offset); 1406 mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
1297 memcpy(mddev->uuid, sb->set_uuid, 16); 1407 memcpy(mddev->uuid, sb->set_uuid, 16);
@@ -1299,8 +1409,9 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
1299 mddev->max_disks = (4096-256)/2; 1409 mddev->max_disks = (4096-256)/2;
1300 1410
1301 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) && 1411 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) &&
1302 mddev->bitmap_file == NULL ) 1412 mddev->bitmap_info.file == NULL )
1303 mddev->bitmap_offset = (__s32)le32_to_cpu(sb->bitmap_offset); 1413 mddev->bitmap_info.offset =
1414 (__s32)le32_to_cpu(sb->bitmap_offset);
1304 1415
1305 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) { 1416 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
1306 mddev->reshape_position = le64_to_cpu(sb->reshape_position); 1417 mddev->reshape_position = le64_to_cpu(sb->reshape_position);
@@ -1394,19 +1505,17 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
1394 sb->level = cpu_to_le32(mddev->level); 1505 sb->level = cpu_to_le32(mddev->level);
1395 sb->layout = cpu_to_le32(mddev->layout); 1506 sb->layout = cpu_to_le32(mddev->layout);
1396 1507
1397 if (mddev->bitmap && mddev->bitmap_file == NULL) { 1508 if (mddev->bitmap && mddev->bitmap_info.file == NULL) {
1398 sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset); 1509 sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset);
1399 sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET); 1510 sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET);
1400 } 1511 }
1401 1512
1402 if (rdev->raid_disk >= 0 && 1513 if (rdev->raid_disk >= 0 &&
1403 !test_bit(In_sync, &rdev->flags)) { 1514 !test_bit(In_sync, &rdev->flags)) {
1404 if (rdev->recovery_offset > 0) { 1515 sb->feature_map |=
1405 sb->feature_map |= 1516 cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
1406 cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET); 1517 sb->recovery_offset =
1407 sb->recovery_offset = 1518 cpu_to_le64(rdev->recovery_offset);
1408 cpu_to_le64(rdev->recovery_offset);
1409 }
1410 } 1519 }
1411 1520
1412 if (mddev->reshape_position != MaxSector) { 1521 if (mddev->reshape_position != MaxSector) {
@@ -1440,7 +1549,7 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
1440 sb->dev_roles[i] = cpu_to_le16(0xfffe); 1549 sb->dev_roles[i] = cpu_to_le16(0xfffe);
1441 else if (test_bit(In_sync, &rdev2->flags)) 1550 else if (test_bit(In_sync, &rdev2->flags))
1442 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); 1551 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
1443 else if (rdev2->raid_disk >= 0 && rdev2->recovery_offset > 0) 1552 else if (rdev2->raid_disk >= 0)
1444 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); 1553 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
1445 else 1554 else
1446 sb->dev_roles[i] = cpu_to_le16(0xffff); 1555 sb->dev_roles[i] = cpu_to_le16(0xffff);
@@ -1462,7 +1571,7 @@ super_1_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
1462 max_sectors -= rdev->data_offset; 1571 max_sectors -= rdev->data_offset;
1463 if (!num_sectors || num_sectors > max_sectors) 1572 if (!num_sectors || num_sectors > max_sectors)
1464 num_sectors = max_sectors; 1573 num_sectors = max_sectors;
1465 } else if (rdev->mddev->bitmap_offset) { 1574 } else if (rdev->mddev->bitmap_info.offset) {
1466 /* minor version 0 with bitmap we can't move */ 1575 /* minor version 0 with bitmap we can't move */
1467 return 0; 1576 return 0;
1468 } else { 1577 } else {
@@ -1830,15 +1939,11 @@ static void print_sb_1(struct mdp_superblock_1 *sb)
1830 1939
1831 uuid = sb->set_uuid; 1940 uuid = sb->set_uuid;
1832 printk(KERN_INFO 1941 printk(KERN_INFO
1833 "md: SB: (V:%u) (F:0x%08x) Array-ID:<%02x%02x%02x%02x" 1942 "md: SB: (V:%u) (F:0x%08x) Array-ID:<%pU>\n"
1834 ":%02x%02x:%02x%02x:%02x%02x:%02x%02x%02x%02x%02x%02x>\n"
1835 "md: Name: \"%s\" CT:%llu\n", 1943 "md: Name: \"%s\" CT:%llu\n",
1836 le32_to_cpu(sb->major_version), 1944 le32_to_cpu(sb->major_version),
1837 le32_to_cpu(sb->feature_map), 1945 le32_to_cpu(sb->feature_map),
1838 uuid[0], uuid[1], uuid[2], uuid[3], 1946 uuid,
1839 uuid[4], uuid[5], uuid[6], uuid[7],
1840 uuid[8], uuid[9], uuid[10], uuid[11],
1841 uuid[12], uuid[13], uuid[14], uuid[15],
1842 sb->set_name, 1947 sb->set_name,
1843 (unsigned long long)le64_to_cpu(sb->ctime) 1948 (unsigned long long)le64_to_cpu(sb->ctime)
1844 & MD_SUPERBLOCK_1_TIME_SEC_MASK); 1949 & MD_SUPERBLOCK_1_TIME_SEC_MASK);
@@ -1847,8 +1952,7 @@ static void print_sb_1(struct mdp_superblock_1 *sb)
1847 printk(KERN_INFO 1952 printk(KERN_INFO
1848 "md: L%u SZ%llu RD:%u LO:%u CS:%u DO:%llu DS:%llu SO:%llu" 1953 "md: L%u SZ%llu RD:%u LO:%u CS:%u DO:%llu DS:%llu SO:%llu"
1849 " RO:%llu\n" 1954 " RO:%llu\n"
1850 "md: Dev:%08x UUID: %02x%02x%02x%02x:%02x%02x:%02x%02x:%02x%02x" 1955 "md: Dev:%08x UUID: %pU\n"
1851 ":%02x%02x%02x%02x%02x%02x\n"
1852 "md: (F:0x%08x) UT:%llu Events:%llu ResyncOffset:%llu CSUM:0x%08x\n" 1956 "md: (F:0x%08x) UT:%llu Events:%llu ResyncOffset:%llu CSUM:0x%08x\n"
1853 "md: (MaxDev:%u) \n", 1957 "md: (MaxDev:%u) \n",
1854 le32_to_cpu(sb->level), 1958 le32_to_cpu(sb->level),
@@ -1861,10 +1965,7 @@ static void print_sb_1(struct mdp_superblock_1 *sb)
1861 (unsigned long long)le64_to_cpu(sb->super_offset), 1965 (unsigned long long)le64_to_cpu(sb->super_offset),
1862 (unsigned long long)le64_to_cpu(sb->recovery_offset), 1966 (unsigned long long)le64_to_cpu(sb->recovery_offset),
1863 le32_to_cpu(sb->dev_number), 1967 le32_to_cpu(sb->dev_number),
1864 uuid[0], uuid[1], uuid[2], uuid[3], 1968 uuid,
1865 uuid[4], uuid[5], uuid[6], uuid[7],
1866 uuid[8], uuid[9], uuid[10], uuid[11],
1867 uuid[12], uuid[13], uuid[14], uuid[15],
1868 sb->devflags, 1969 sb->devflags,
1869 (unsigned long long)le64_to_cpu(sb->utime) & MD_SUPERBLOCK_1_TIME_SEC_MASK, 1970 (unsigned long long)le64_to_cpu(sb->utime) & MD_SUPERBLOCK_1_TIME_SEC_MASK,
1870 (unsigned long long)le64_to_cpu(sb->events), 1971 (unsigned long long)le64_to_cpu(sb->events),
@@ -2008,12 +2109,18 @@ repeat:
2008 if (!mddev->in_sync || mddev->recovery_cp != MaxSector) { /* not clean */ 2109 if (!mddev->in_sync || mddev->recovery_cp != MaxSector) { /* not clean */
2009 /* .. if the array isn't clean, an 'even' event must also go 2110 /* .. if the array isn't clean, an 'even' event must also go
2010 * to spares. */ 2111 * to spares. */
2011 if ((mddev->events&1)==0) 2112 if ((mddev->events&1)==0) {
2012 nospares = 0; 2113 nospares = 0;
2114 sync_req = 2; /* force a second update to get the
2115 * even/odd in sync */
2116 }
2013 } else { 2117 } else {
2014 /* otherwise an 'odd' event must go to spares */ 2118 /* otherwise an 'odd' event must go to spares */
2015 if ((mddev->events&1)) 2119 if ((mddev->events&1)) {
2016 nospares = 0; 2120 nospares = 0;
2121 sync_req = 2; /* force a second update to get the
2122 * even/odd in sync */
2123 }
2017 } 2124 }
2018 } 2125 }
2019 2126
@@ -2446,12 +2553,49 @@ rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2446static struct rdev_sysfs_entry rdev_size = 2553static struct rdev_sysfs_entry rdev_size =
2447__ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store); 2554__ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store);
2448 2555
2556
2557static ssize_t recovery_start_show(mdk_rdev_t *rdev, char *page)
2558{
2559 unsigned long long recovery_start = rdev->recovery_offset;
2560
2561 if (test_bit(In_sync, &rdev->flags) ||
2562 recovery_start == MaxSector)
2563 return sprintf(page, "none\n");
2564
2565 return sprintf(page, "%llu\n", recovery_start);
2566}
2567
2568static ssize_t recovery_start_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2569{
2570 unsigned long long recovery_start;
2571
2572 if (cmd_match(buf, "none"))
2573 recovery_start = MaxSector;
2574 else if (strict_strtoull(buf, 10, &recovery_start))
2575 return -EINVAL;
2576
2577 if (rdev->mddev->pers &&
2578 rdev->raid_disk >= 0)
2579 return -EBUSY;
2580
2581 rdev->recovery_offset = recovery_start;
2582 if (recovery_start == MaxSector)
2583 set_bit(In_sync, &rdev->flags);
2584 else
2585 clear_bit(In_sync, &rdev->flags);
2586 return len;
2587}
2588
2589static struct rdev_sysfs_entry rdev_recovery_start =
2590__ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store);
2591
2449static struct attribute *rdev_default_attrs[] = { 2592static struct attribute *rdev_default_attrs[] = {
2450 &rdev_state.attr, 2593 &rdev_state.attr,
2451 &rdev_errors.attr, 2594 &rdev_errors.attr,
2452 &rdev_slot.attr, 2595 &rdev_slot.attr,
2453 &rdev_offset.attr, 2596 &rdev_offset.attr,
2454 &rdev_size.attr, 2597 &rdev_size.attr,
2598 &rdev_recovery_start.attr,
2455 NULL, 2599 NULL,
2456}; 2600};
2457static ssize_t 2601static ssize_t
@@ -2505,7 +2649,7 @@ static void rdev_free(struct kobject *ko)
2505 mdk_rdev_t *rdev = container_of(ko, mdk_rdev_t, kobj); 2649 mdk_rdev_t *rdev = container_of(ko, mdk_rdev_t, kobj);
2506 kfree(rdev); 2650 kfree(rdev);
2507} 2651}
2508static struct sysfs_ops rdev_sysfs_ops = { 2652static const struct sysfs_ops rdev_sysfs_ops = {
2509 .show = rdev_attr_show, 2653 .show = rdev_attr_show,
2510 .store = rdev_attr_store, 2654 .store = rdev_attr_store,
2511}; 2655};
@@ -2553,6 +2697,8 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi
2553 rdev->flags = 0; 2697 rdev->flags = 0;
2554 rdev->data_offset = 0; 2698 rdev->data_offset = 0;
2555 rdev->sb_events = 0; 2699 rdev->sb_events = 0;
2700 rdev->last_read_error.tv_sec = 0;
2701 rdev->last_read_error.tv_nsec = 0;
2556 atomic_set(&rdev->nr_pending, 0); 2702 atomic_set(&rdev->nr_pending, 0);
2557 atomic_set(&rdev->read_errors, 0); 2703 atomic_set(&rdev->read_errors, 0);
2558 atomic_set(&rdev->corrected_errors, 0); 2704 atomic_set(&rdev->corrected_errors, 0);
@@ -2663,6 +2809,47 @@ static void analyze_sbs(mddev_t * mddev)
2663 } 2809 }
2664} 2810}
2665 2811
2812/* Read a fixed-point number.
2813 * Numbers in sysfs attributes should be in "standard" units where
2814 * possible, so time should be in seconds.
2815 * However we internally use a much smaller unit such as
2816 * milliseconds or jiffies.
2817 * This function takes a decimal number with a possible fractional
2818 * component, and produces an integer which is the result of
2819 * multiplying that number by 10^'scale'.
2820 * all without any floating-point arithmetic.
2821 */
2822int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale)
2823{
2824 unsigned long result = 0;
2825 long decimals = -1;
2826 while (isdigit(*cp) || (*cp == '.' && decimals < 0)) {
2827 if (*cp == '.')
2828 decimals = 0;
2829 else if (decimals < scale) {
2830 unsigned int value;
2831 value = *cp - '0';
2832 result = result * 10 + value;
2833 if (decimals >= 0)
2834 decimals++;
2835 }
2836 cp++;
2837 }
2838 if (*cp == '\n')
2839 cp++;
2840 if (*cp)
2841 return -EINVAL;
2842 if (decimals < 0)
2843 decimals = 0;
2844 while (decimals < scale) {
2845 result *= 10;
2846 decimals ++;
2847 }
2848 *res = result;
2849 return 0;
2850}
2851
2852
2666static void md_safemode_timeout(unsigned long data); 2853static void md_safemode_timeout(unsigned long data);
2667 2854
2668static ssize_t 2855static ssize_t
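
strict_strtoul_scaled(), added above, accumulates digits while remembering how many it has seen after the decimal point, then pads with zeroes until it has produced the value times 10^scale, all in integer arithmetic; safe_delay_store() uses it with scale 3 so "0.05" seconds becomes 50 milliseconds. The standalone re-implementation below mirrors that logic for illustration; strtoul_scaled() and main() are demo code, not part of the patch (the kernel version returns -EINVAL where this returns -1).

#include <ctype.h>
#include <stdio.h>

static int strtoul_scaled(const char *cp, unsigned long *res, int scale)
{
	unsigned long result = 0;
	long decimals = -1;			/* -1: no '.' seen yet */

	while (isdigit((unsigned char)*cp) || (*cp == '.' && decimals < 0)) {
		if (*cp == '.')
			decimals = 0;
		else if (decimals < scale) {	/* ignore digits beyond the scale */
			result = result * 10 + (*cp - '0');
			if (decimals >= 0)
				decimals++;
		}
		cp++;
	}
	if (*cp == '\n')
		cp++;
	if (*cp)
		return -1;			/* trailing junk */
	if (decimals < 0)
		decimals = 0;
	while (decimals < scale) {		/* pad up to 10^scale */
		result *= 10;
		decimals++;
	}
	*res = result;
	return 0;
}

int main(void)
{
	unsigned long v;

	/* "0.05" seconds with scale 3 gives 50 (milliseconds), exactly how
	 * the new safe_delay_store() uses the helper. */
	if (strtoul_scaled("0.05\n", &v, 3) == 0)
		printf("%lu\n", v);		/* prints 50 */
	return 0;
}
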
@@ -2674,31 +2861,10 @@ safe_delay_show(mddev_t *mddev, char *page)
2674static ssize_t 2861static ssize_t
2675safe_delay_store(mddev_t *mddev, const char *cbuf, size_t len) 2862safe_delay_store(mddev_t *mddev, const char *cbuf, size_t len)
2676{ 2863{
2677 int scale=1;
2678 int dot=0;
2679 int i;
2680 unsigned long msec; 2864 unsigned long msec;
2681 char buf[30];
2682 2865
2683 /* remove a period, and count digits after it */ 2866 if (strict_strtoul_scaled(cbuf, &msec, 3) < 0)
2684 if (len >= sizeof(buf))
2685 return -EINVAL;
2686 strlcpy(buf, cbuf, sizeof(buf));
2687 for (i=0; i<len; i++) {
2688 if (dot) {
2689 if (isdigit(buf[i])) {
2690 buf[i-1] = buf[i];
2691 scale *= 10;
2692 }
2693 buf[i] = 0;
2694 } else if (buf[i] == '.') {
2695 dot=1;
2696 buf[i] = 0;
2697 }
2698 }
2699 if (strict_strtoul(buf, 10, &msec) < 0)
2700 return -EINVAL; 2867 return -EINVAL;
2701 msec = (msec * 1000) / scale;
2702 if (msec == 0) 2868 if (msec == 0)
2703 mddev->safemode_delay = 0; 2869 mddev->safemode_delay = 0;
2704 else { 2870 else {
@@ -2974,7 +3140,9 @@ resync_start_store(mddev_t *mddev, const char *buf, size_t len)
2974 3140
2975 if (mddev->pers) 3141 if (mddev->pers)
2976 return -EBUSY; 3142 return -EBUSY;
2977 if (!*buf || (*e && *e != '\n')) 3143 if (cmd_match(buf, "none"))
3144 n = MaxSector;
3145 else if (!*buf || (*e && *e != '\n'))
2978 return -EINVAL; 3146 return -EINVAL;
2979 3147
2980 mddev->recovery_cp = n; 3148 mddev->recovery_cp = n;
@@ -3170,6 +3338,29 @@ static struct md_sysfs_entry md_array_state =
3170__ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store); 3338__ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store);
3171 3339
3172static ssize_t 3340static ssize_t
3341max_corrected_read_errors_show(mddev_t *mddev, char *page) {
3342 return sprintf(page, "%d\n",
3343 atomic_read(&mddev->max_corr_read_errors));
3344}
3345
3346static ssize_t
3347max_corrected_read_errors_store(mddev_t *mddev, const char *buf, size_t len)
3348{
3349 char *e;
3350 unsigned long n = simple_strtoul(buf, &e, 10);
3351
3352 if (*buf && (*e == 0 || *e == '\n')) {
3353 atomic_set(&mddev->max_corr_read_errors, n);
3354 return len;
3355 }
3356 return -EINVAL;
3357}
3358
3359static struct md_sysfs_entry max_corr_read_errors =
3360__ATTR(max_read_errors, S_IRUGO|S_IWUSR, max_corrected_read_errors_show,
3361 max_corrected_read_errors_store);
3362
3363static ssize_t
3173null_show(mddev_t *mddev, char *page) 3364null_show(mddev_t *mddev, char *page)
3174{ 3365{
3175 return -EINVAL; 3366 return -EINVAL;
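
The new max_read_errors attribute exposes mddev->max_corr_read_errors, the per-array ceiling (defaulting to MD_DEFAULT_MAX_CORRECTED_READ_ERRORS, 20, set in do_md_run later in this patch) on read corrections attempted on an rdev before it is ejected. Once the array is assembled the attribute should appear with the other md attributes, typically under /sys/block/mdN/md/. A small userspace sketch of reading and raising it follows; the path and error handling are illustrative assumptions.

#include <stdio.h>

#define ATTR "/sys/block/md0/md/max_read_errors"	/* assumed array name/path */

int main(void)
{
	unsigned long cur;
	FILE *f = fopen(ATTR, "r");

	if (!f) {
		perror(ATTR);
		return 1;
	}
	if (fscanf(f, "%lu", &cur) != 1) {
		fprintf(stderr, "could not parse %s\n", ATTR);
		fclose(f);
		return 1;
	}
	fclose(f);
	printf("current max_read_errors: %lu\n", cur);

	f = fopen(ATTR, "w");			/* writing needs root */
	if (!f) {
		perror(ATTR);
		return 1;
	}
	fprintf(f, "%lu\n", cur * 2);		/* the store hook parses with simple_strtoul() */
	return fclose(f) ? 1 : 0;
}
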
@@ -3250,8 +3441,7 @@ bitmap_store(mddev_t *mddev, const char *buf, size_t len)
3250 } 3441 }
3251 if (*end && !isspace(*end)) break; 3442 if (*end && !isspace(*end)) break;
3252 bitmap_dirty_bits(mddev->bitmap, chunk, end_chunk); 3443 bitmap_dirty_bits(mddev->bitmap, chunk, end_chunk);
3253 buf = end; 3444 buf = skip_spaces(end);
3254 while (isspace(*buf)) buf++;
3255 } 3445 }
3256 bitmap_unplug(mddev->bitmap); /* flush the bits to disk */ 3446 bitmap_unplug(mddev->bitmap); /* flush the bits to disk */
3257out: 3447out:
@@ -3794,6 +3984,7 @@ static struct attribute *md_default_attrs[] = {
3794 &md_array_state.attr, 3984 &md_array_state.attr,
3795 &md_reshape_position.attr, 3985 &md_reshape_position.attr,
3796 &md_array_size.attr, 3986 &md_array_size.attr,
3987 &max_corr_read_errors.attr,
3797 NULL, 3988 NULL,
3798}; 3989};
3799 3990
@@ -3875,7 +4066,7 @@ static void md_free(struct kobject *ko)
3875 kfree(mddev); 4066 kfree(mddev);
3876} 4067}
3877 4068
3878static struct sysfs_ops md_sysfs_ops = { 4069static const struct sysfs_ops md_sysfs_ops = {
3879 .show = md_attr_show, 4070 .show = md_attr_show,
3880 .store = md_attr_store, 4071 .store = md_attr_store,
3881}; 4072};
@@ -3891,13 +4082,16 @@ static void mddev_delayed_delete(struct work_struct *ws)
3891{ 4082{
3892 mddev_t *mddev = container_of(ws, mddev_t, del_work); 4083 mddev_t *mddev = container_of(ws, mddev_t, del_work);
3893 4084
3894 if (mddev->private == &md_redundancy_group) { 4085 if (mddev->private) {
3895 sysfs_remove_group(&mddev->kobj, &md_redundancy_group); 4086 sysfs_remove_group(&mddev->kobj, &md_redundancy_group);
4087 if (mddev->private != (void*)1)
4088 sysfs_remove_group(&mddev->kobj, mddev->private);
3896 if (mddev->sysfs_action) 4089 if (mddev->sysfs_action)
3897 sysfs_put(mddev->sysfs_action); 4090 sysfs_put(mddev->sysfs_action);
3898 mddev->sysfs_action = NULL; 4091 mddev->sysfs_action = NULL;
3899 mddev->private = NULL; 4092 mddev->private = NULL;
3900 } 4093 }
4094 sysfs_remove_group(&mddev->kobj, &md_bitmap_group);
3901 kobject_del(&mddev->kobj); 4095 kobject_del(&mddev->kobj);
3902 kobject_put(&mddev->kobj); 4096 kobject_put(&mddev->kobj);
3903} 4097}
@@ -3989,6 +4183,8 @@ static int md_alloc(dev_t dev, char *name)
3989 disk->disk_name); 4183 disk->disk_name);
3990 error = 0; 4184 error = 0;
3991 } 4185 }
4186 if (sysfs_create_group(&mddev->kobj, &md_bitmap_group))
4187 printk(KERN_DEBUG "pointless warning\n");
3992 abort: 4188 abort:
3993 mutex_unlock(&disks_mutex); 4189 mutex_unlock(&disks_mutex);
3994 if (!error) { 4190 if (!error) {
@@ -4100,10 +4296,7 @@ static int do_md_run(mddev_t * mddev)
4100 sysfs_notify_dirent(rdev->sysfs_state); 4296 sysfs_notify_dirent(rdev->sysfs_state);
4101 } 4297 }
4102 4298
4103 md_probe(mddev->unit, NULL, NULL);
4104 disk = mddev->gendisk; 4299 disk = mddev->gendisk;
4105 if (!disk)
4106 return -ENOMEM;
4107 4300
4108 spin_lock(&pers_lock); 4301 spin_lock(&pers_lock);
4109 pers = find_pers(mddev->level, mddev->clevel); 4302 pers = find_pers(mddev->level, mddev->clevel);
@@ -4170,7 +4363,7 @@ static int do_md_run(mddev_t * mddev)
4170 mddev->barriers_work = 1; 4363 mddev->barriers_work = 1;
4171 mddev->ok_start_degraded = start_dirty_degraded; 4364 mddev->ok_start_degraded = start_dirty_degraded;
4172 4365
4173 if (start_readonly) 4366 if (start_readonly && mddev->ro == 0)
4174 mddev->ro = 2; /* read-only, but switch on first write */ 4367 mddev->ro = 2; /* read-only, but switch on first write */
4175 4368
4176 err = mddev->pers->run(mddev); 4369 err = mddev->pers->run(mddev);
@@ -4210,6 +4403,8 @@ static int do_md_run(mddev_t * mddev)
4210 mddev->ro = 0; 4403 mddev->ro = 0;
4211 4404
4212 atomic_set(&mddev->writes_pending,0); 4405 atomic_set(&mddev->writes_pending,0);
4406 atomic_set(&mddev->max_corr_read_errors,
4407 MD_DEFAULT_MAX_CORRECTED_READ_ERRORS);
4213 mddev->safemode = 0; 4408 mddev->safemode = 0;
4214 mddev->safemode_timer.function = md_safemode_timeout; 4409 mddev->safemode_timer.function = md_safemode_timeout;
4215 mddev->safemode_timer.data = (unsigned long) mddev; 4410 mddev->safemode_timer.data = (unsigned long) mddev;
@@ -4232,33 +4427,6 @@ static int do_md_run(mddev_t * mddev)
4232 4427
4233 set_capacity(disk, mddev->array_sectors); 4428 set_capacity(disk, mddev->array_sectors);
4234 4429
4235 /* If there is a partially-recovered drive we need to
4236 * start recovery here. If we leave it to md_check_recovery,
4237 * it will remove the drives and not do the right thing
4238 */
4239 if (mddev->degraded && !mddev->sync_thread) {
4240 int spares = 0;
4241 list_for_each_entry(rdev, &mddev->disks, same_set)
4242 if (rdev->raid_disk >= 0 &&
4243 !test_bit(In_sync, &rdev->flags) &&
4244 !test_bit(Faulty, &rdev->flags))
4245 /* complete an interrupted recovery */
4246 spares++;
4247 if (spares && mddev->pers->sync_request) {
4248 mddev->recovery = 0;
4249 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
4250 mddev->sync_thread = md_register_thread(md_do_sync,
4251 mddev,
4252 "resync");
4253 if (!mddev->sync_thread) {
4254 printk(KERN_ERR "%s: could not start resync"
4255 " thread...\n",
4256 mdname(mddev));
4257 /* leave the spares where they are, it shouldn't hurt */
4258 mddev->recovery = 0;
4259 }
4260 }
4261 }
4262 md_wakeup_thread(mddev->thread); 4430 md_wakeup_thread(mddev->thread);
4263 md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ 4431 md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
4264 4432
@@ -4314,7 +4482,7 @@ static int deny_bitmap_write_access(struct file * file)
4314 return 0; 4482 return 0;
4315} 4483}
4316 4484
4317static void restore_bitmap_write_access(struct file *file) 4485void restore_bitmap_write_access(struct file *file)
4318{ 4486{
4319 struct inode *inode = file->f_mapping->host; 4487 struct inode *inode = file->f_mapping->host;
4320 4488
@@ -4368,8 +4536,8 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
4368 mddev->queue->unplug_fn = NULL; 4536 mddev->queue->unplug_fn = NULL;
4369 mddev->queue->backing_dev_info.congested_fn = NULL; 4537 mddev->queue->backing_dev_info.congested_fn = NULL;
4370 module_put(mddev->pers->owner); 4538 module_put(mddev->pers->owner);
4371 if (mddev->pers->sync_request) 4539 if (mddev->pers->sync_request && mddev->private == NULL)
4372 mddev->private = &md_redundancy_group; 4540 mddev->private = (void*)1;
4373 mddev->pers = NULL; 4541 mddev->pers = NULL;
4374 /* tell userspace to handle 'inactive' */ 4542 /* tell userspace to handle 'inactive' */
4375 sysfs_notify_dirent(mddev->sysfs_state); 4543 sysfs_notify_dirent(mddev->sysfs_state);
@@ -4409,15 +4577,12 @@ out:
4409 printk(KERN_INFO "md: %s stopped.\n", mdname(mddev)); 4577 printk(KERN_INFO "md: %s stopped.\n", mdname(mddev));
4410 4578
4411 bitmap_destroy(mddev); 4579 bitmap_destroy(mddev);
4412 if (mddev->bitmap_file) { 4580 if (mddev->bitmap_info.file) {
4413 restore_bitmap_write_access(mddev->bitmap_file); 4581 restore_bitmap_write_access(mddev->bitmap_info.file);
4414 fput(mddev->bitmap_file); 4582 fput(mddev->bitmap_info.file);
4415 mddev->bitmap_file = NULL; 4583 mddev->bitmap_info.file = NULL;
4416 } 4584 }
4417 mddev->bitmap_offset = 0; 4585 mddev->bitmap_info.offset = 0;
4418
4419 /* make sure all md_delayed_delete calls have finished */
4420 flush_scheduled_work();
4421 4586
4422 export_array(mddev); 4587 export_array(mddev);
4423 4588
@@ -4455,6 +4620,11 @@ out:
4455 mddev->degraded = 0; 4620 mddev->degraded = 0;
4456 mddev->barriers_work = 0; 4621 mddev->barriers_work = 0;
4457 mddev->safemode = 0; 4622 mddev->safemode = 0;
4623 mddev->bitmap_info.offset = 0;
4624 mddev->bitmap_info.default_offset = 0;
4625 mddev->bitmap_info.chunksize = 0;
4626 mddev->bitmap_info.daemon_sleep = 0;
4627 mddev->bitmap_info.max_write_behind = 0;
4458 kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); 4628 kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
4459 if (mddev->hold_active == UNTIL_STOP) 4629 if (mddev->hold_active == UNTIL_STOP)
4460 mddev->hold_active = 0; 4630 mddev->hold_active = 0;
@@ -4640,7 +4810,7 @@ static int get_array_info(mddev_t * mddev, void __user * arg)
4640 info.state = 0; 4810 info.state = 0;
4641 if (mddev->in_sync) 4811 if (mddev->in_sync)
4642 info.state = (1<<MD_SB_CLEAN); 4812 info.state = (1<<MD_SB_CLEAN);
4643 if (mddev->bitmap && mddev->bitmap_offset) 4813 if (mddev->bitmap && mddev->bitmap_info.offset)
4644 info.state = (1<<MD_SB_BITMAP_PRESENT); 4814 info.state = (1<<MD_SB_BITMAP_PRESENT);
4645 info.active_disks = insync; 4815 info.active_disks = insync;
4646 info.working_disks = working; 4816 info.working_disks = working;
@@ -4998,23 +5168,23 @@ static int set_bitmap_file(mddev_t *mddev, int fd)
4998 if (fd >= 0) { 5168 if (fd >= 0) {
4999 if (mddev->bitmap) 5169 if (mddev->bitmap)
5000 return -EEXIST; /* cannot add when bitmap is present */ 5170 return -EEXIST; /* cannot add when bitmap is present */
5001 mddev->bitmap_file = fget(fd); 5171 mddev->bitmap_info.file = fget(fd);
5002 5172
5003 if (mddev->bitmap_file == NULL) { 5173 if (mddev->bitmap_info.file == NULL) {
5004 printk(KERN_ERR "%s: error: failed to get bitmap file\n", 5174 printk(KERN_ERR "%s: error: failed to get bitmap file\n",
5005 mdname(mddev)); 5175 mdname(mddev));
5006 return -EBADF; 5176 return -EBADF;
5007 } 5177 }
5008 5178
5009 err = deny_bitmap_write_access(mddev->bitmap_file); 5179 err = deny_bitmap_write_access(mddev->bitmap_info.file);
5010 if (err) { 5180 if (err) {
5011 printk(KERN_ERR "%s: error: bitmap file is already in use\n", 5181 printk(KERN_ERR "%s: error: bitmap file is already in use\n",
5012 mdname(mddev)); 5182 mdname(mddev));
5013 fput(mddev->bitmap_file); 5183 fput(mddev->bitmap_info.file);
5014 mddev->bitmap_file = NULL; 5184 mddev->bitmap_info.file = NULL;
5015 return err; 5185 return err;
5016 } 5186 }
5017 mddev->bitmap_offset = 0; /* file overrides offset */ 5187 mddev->bitmap_info.offset = 0; /* file overrides offset */
5018 } else if (mddev->bitmap == NULL) 5188 } else if (mddev->bitmap == NULL)
5019 return -ENOENT; /* cannot remove what isn't there */ 5189 return -ENOENT; /* cannot remove what isn't there */
5020 err = 0; 5190 err = 0;
@@ -5029,11 +5199,11 @@ static int set_bitmap_file(mddev_t *mddev, int fd)
5029 mddev->pers->quiesce(mddev, 0); 5199 mddev->pers->quiesce(mddev, 0);
5030 } 5200 }
5031 if (fd < 0) { 5201 if (fd < 0) {
5032 if (mddev->bitmap_file) { 5202 if (mddev->bitmap_info.file) {
5033 restore_bitmap_write_access(mddev->bitmap_file); 5203 restore_bitmap_write_access(mddev->bitmap_info.file);
5034 fput(mddev->bitmap_file); 5204 fput(mddev->bitmap_info.file);
5035 } 5205 }
5036 mddev->bitmap_file = NULL; 5206 mddev->bitmap_info.file = NULL;
5037 } 5207 }
5038 5208
5039 return err; 5209 return err;
@@ -5070,6 +5240,10 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
5070 mddev->minor_version = info->minor_version; 5240 mddev->minor_version = info->minor_version;
5071 mddev->patch_version = info->patch_version; 5241 mddev->patch_version = info->patch_version;
5072 mddev->persistent = !info->not_persistent; 5242 mddev->persistent = !info->not_persistent;
5243 /* ensure mddev_put doesn't delete this now that there
5244 * is some minimal configuration.
5245 */
5246 mddev->ctime = get_seconds();
5073 return 0; 5247 return 0;
5074 } 5248 }
5075 mddev->major_version = MD_MAJOR_VERSION; 5249 mddev->major_version = MD_MAJOR_VERSION;
@@ -5100,8 +5274,8 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
5100 mddev->flags = 0; 5274 mddev->flags = 0;
5101 set_bit(MD_CHANGE_DEVS, &mddev->flags); 5275 set_bit(MD_CHANGE_DEVS, &mddev->flags);
5102 5276
5103 mddev->default_bitmap_offset = MD_SB_BYTES >> 9; 5277 mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
5104 mddev->bitmap_offset = 0; 5278 mddev->bitmap_info.offset = 0;
5105 5279
5106 mddev->reshape_position = MaxSector; 5280 mddev->reshape_position = MaxSector;
5107 5281
@@ -5201,7 +5375,7 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info)
5201 int state = 0; 5375 int state = 0;
5202 5376
5203 /* calculate expected state,ignoring low bits */ 5377 /* calculate expected state,ignoring low bits */
5204 if (mddev->bitmap && mddev->bitmap_offset) 5378 if (mddev->bitmap && mddev->bitmap_info.offset)
5205 state |= (1 << MD_SB_BITMAP_PRESENT); 5379 state |= (1 << MD_SB_BITMAP_PRESENT);
5206 5380
5207 if (mddev->major_version != info->major_version || 5381 if (mddev->major_version != info->major_version ||
@@ -5260,9 +5434,10 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info)
5260 /* add the bitmap */ 5434 /* add the bitmap */
5261 if (mddev->bitmap) 5435 if (mddev->bitmap)
5262 return -EEXIST; 5436 return -EEXIST;
5263 if (mddev->default_bitmap_offset == 0) 5437 if (mddev->bitmap_info.default_offset == 0)
5264 return -EINVAL; 5438 return -EINVAL;
5265 mddev->bitmap_offset = mddev->default_bitmap_offset; 5439 mddev->bitmap_info.offset =
5440 mddev->bitmap_info.default_offset;
5266 mddev->pers->quiesce(mddev, 1); 5441 mddev->pers->quiesce(mddev, 1);
5267 rv = bitmap_create(mddev); 5442 rv = bitmap_create(mddev);
5268 if (rv) 5443 if (rv)
@@ -5277,7 +5452,7 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info)
5277 mddev->pers->quiesce(mddev, 1); 5452 mddev->pers->quiesce(mddev, 1);
5278 bitmap_destroy(mddev); 5453 bitmap_destroy(mddev);
5279 mddev->pers->quiesce(mddev, 0); 5454 mddev->pers->quiesce(mddev, 0);
5280 mddev->bitmap_offset = 0; 5455 mddev->bitmap_info.offset = 0;
5281 } 5456 }
5282 } 5457 }
5283 md_update_sb(mddev, 1); 5458 md_update_sb(mddev, 1);
@@ -5528,6 +5703,25 @@ done:
5528abort: 5703abort:
5529 return err; 5704 return err;
5530} 5705}
5706#ifdef CONFIG_COMPAT
5707static int md_compat_ioctl(struct block_device *bdev, fmode_t mode,
5708 unsigned int cmd, unsigned long arg)
5709{
5710 switch (cmd) {
5711 case HOT_REMOVE_DISK:
5712 case HOT_ADD_DISK:
5713 case SET_DISK_FAULTY:
5714 case SET_BITMAP_FILE:
5715 /* These take in integer arg, do not convert */
5716 break;
5717 default:
5718 arg = (unsigned long)compat_ptr(arg);
5719 break;
5720 }
5721
5722 return md_ioctl(bdev, mode, cmd, arg);
5723}
5724#endif /* CONFIG_COMPAT */
5531 5725
5532static int md_open(struct block_device *bdev, fmode_t mode) 5726static int md_open(struct block_device *bdev, fmode_t mode)
5533{ 5727{
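
md_compat_ioctl() exists because a 32-bit process on a 64-bit kernel hands ioctl arguments over as 32-bit values: the listed commands (HOT_REMOVE_DISK, HOT_ADD_DISK, SET_DISK_FAULTY, SET_BITMAP_FILE) carry plain integers and must not be converted, while everything else is a user pointer that needs widening via compat_ptr() before md_ioctl() uses it. The toy model below shows only the dispatch shape; compat_uptr_t, mock_compat_ptr() and the command numbers are stand-ins, not the kernel definitions.

#include <stdint.h>
#include <stdio.h>

typedef uint32_t compat_uptr_t;

enum { HOT_ADD_DISK = 1, SET_BITMAP_FILE, GET_ARRAY_INFO };	/* fake numbers */

/* In the kernel, compat_ptr() widens a 32-bit user pointer to a native one. */
static void *mock_compat_ptr(compat_uptr_t uptr)
{
	return (void *)(uintptr_t)uptr;
}

static void dispatch(unsigned int cmd, unsigned long arg)
{
	switch (cmd) {
	case HOT_ADD_DISK:
	case SET_BITMAP_FILE:
		/* integer argument: pass through untouched */
		printf("cmd %u: integer arg %lu\n", cmd, arg);
		break;
	default:
		/* pointer argument: truncate to 32 bits, then convert before use */
		printf("cmd %u: user pointer %p\n", cmd,
		       mock_compat_ptr((compat_uptr_t)arg));
		break;
	}
}

int main(void)
{
	dispatch(HOT_ADD_DISK, 0x0903);		/* dev_t-style number */
	dispatch(GET_ARRAY_INFO, 0xbffff000);	/* 32-bit user pointer */
	return 0;
}
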
@@ -5593,6 +5787,9 @@ static const struct block_device_operations md_fops =
5593 .open = md_open, 5787 .open = md_open,
5594 .release = md_release, 5788 .release = md_release,
5595 .ioctl = md_ioctl, 5789 .ioctl = md_ioctl,
5790#ifdef CONFIG_COMPAT
5791 .compat_ioctl = md_compat_ioctl,
5792#endif
5596 .getgeo = md_getgeo, 5793 .getgeo = md_getgeo,
5597 .media_changed = md_media_changed, 5794 .media_changed = md_media_changed,
5598 .revalidate_disk= md_revalidate, 5795 .revalidate_disk= md_revalidate,
@@ -5986,14 +6183,14 @@ static int md_seq_show(struct seq_file *seq, void *v)
5986 unsigned long chunk_kb; 6183 unsigned long chunk_kb;
5987 unsigned long flags; 6184 unsigned long flags;
5988 spin_lock_irqsave(&bitmap->lock, flags); 6185 spin_lock_irqsave(&bitmap->lock, flags);
5989 chunk_kb = bitmap->chunksize >> 10; 6186 chunk_kb = mddev->bitmap_info.chunksize >> 10;
5990 seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], " 6187 seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], "
5991 "%lu%s chunk", 6188 "%lu%s chunk",
5992 bitmap->pages - bitmap->missing_pages, 6189 bitmap->pages - bitmap->missing_pages,
5993 bitmap->pages, 6190 bitmap->pages,
5994 (bitmap->pages - bitmap->missing_pages) 6191 (bitmap->pages - bitmap->missing_pages)
5995 << (PAGE_SHIFT - 10), 6192 << (PAGE_SHIFT - 10),
5996 chunk_kb ? chunk_kb : bitmap->chunksize, 6193 chunk_kb ? chunk_kb : mddev->bitmap_info.chunksize,
5997 chunk_kb ? "KB" : "B"); 6194 chunk_kb ? "KB" : "B");
5998 if (bitmap->file) { 6195 if (bitmap->file) {
5999 seq_printf(seq, ", file: "); 6196 seq_printf(seq, ", file: ");
@@ -6279,10 +6476,11 @@ void md_do_sync(mddev_t *mddev)
6279 mddev->curr_resync = 2; 6476 mddev->curr_resync = 2;
6280 6477
6281 try_again: 6478 try_again:
6282 if (kthread_should_stop()) { 6479 if (kthread_should_stop())
6283 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 6480 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
6481
6482 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
6284 goto skip; 6483 goto skip;
6285 }
6286 for_each_mddev(mddev2, tmp) { 6484 for_each_mddev(mddev2, tmp) {
6287 if (mddev2 == mddev) 6485 if (mddev2 == mddev)
6288 continue; 6486 continue;
@@ -6342,12 +6540,14 @@ void md_do_sync(mddev_t *mddev)
6342 /* recovery follows the physical size of devices */ 6540 /* recovery follows the physical size of devices */
6343 max_sectors = mddev->dev_sectors; 6541 max_sectors = mddev->dev_sectors;
6344 j = MaxSector; 6542 j = MaxSector;
6345 list_for_each_entry(rdev, &mddev->disks, same_set) 6543 rcu_read_lock();
6544 list_for_each_entry_rcu(rdev, &mddev->disks, same_set)
6346 if (rdev->raid_disk >= 0 && 6545 if (rdev->raid_disk >= 0 &&
6347 !test_bit(Faulty, &rdev->flags) && 6546 !test_bit(Faulty, &rdev->flags) &&
6348 !test_bit(In_sync, &rdev->flags) && 6547 !test_bit(In_sync, &rdev->flags) &&
6349 rdev->recovery_offset < j) 6548 rdev->recovery_offset < j)
6350 j = rdev->recovery_offset; 6549 j = rdev->recovery_offset;
6550 rcu_read_unlock();
6351 } 6551 }
6352 6552
6353 printk(KERN_INFO "md: %s of RAID array %s\n", desc, mdname(mddev)); 6553 printk(KERN_INFO "md: %s of RAID array %s\n", desc, mdname(mddev));
@@ -6384,6 +6584,7 @@ void md_do_sync(mddev_t *mddev)
6384 desc, mdname(mddev)); 6584 desc, mdname(mddev));
6385 mddev->curr_resync = j; 6585 mddev->curr_resync = j;
6386 } 6586 }
6587 mddev->curr_resync_completed = mddev->curr_resync;
6387 6588
6388 while (j < max_sectors) { 6589 while (j < max_sectors) {
6389 sector_t sectors; 6590 sector_t sectors;
@@ -6516,22 +6717,29 @@ void md_do_sync(mddev_t *mddev)
6516 } else { 6717 } else {
6517 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 6718 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
6518 mddev->curr_resync = MaxSector; 6719 mddev->curr_resync = MaxSector;
6519 list_for_each_entry(rdev, &mddev->disks, same_set) 6720 rcu_read_lock();
6721 list_for_each_entry_rcu(rdev, &mddev->disks, same_set)
6520 if (rdev->raid_disk >= 0 && 6722 if (rdev->raid_disk >= 0 &&
6521 !test_bit(Faulty, &rdev->flags) && 6723 !test_bit(Faulty, &rdev->flags) &&
6522 !test_bit(In_sync, &rdev->flags) && 6724 !test_bit(In_sync, &rdev->flags) &&
6523 rdev->recovery_offset < mddev->curr_resync) 6725 rdev->recovery_offset < mddev->curr_resync)
6524 rdev->recovery_offset = mddev->curr_resync; 6726 rdev->recovery_offset = mddev->curr_resync;
6727 rcu_read_unlock();
6525 } 6728 }
6526 } 6729 }
6527 set_bit(MD_CHANGE_DEVS, &mddev->flags); 6730 set_bit(MD_CHANGE_DEVS, &mddev->flags);
6528 6731
6529 skip: 6732 skip:
6733 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
6734 /* We completed so min/max setting can be forgotten if used. */
6735 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
6736 mddev->resync_min = 0;
6737 mddev->resync_max = MaxSector;
6738 } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
6739 mddev->resync_min = mddev->curr_resync_completed;
6530 mddev->curr_resync = 0; 6740 mddev->curr_resync = 0;
6531 mddev->curr_resync_completed = 0;
6532 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 6741 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
6533 /* We completed so max setting can be forgotten. */ 6742 mddev->curr_resync_completed = 0;
6534 mddev->resync_max = MaxSector;
6535 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 6743 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
6536 wake_up(&resync_wait); 6744 wake_up(&resync_wait);
6537 set_bit(MD_RECOVERY_DONE, &mddev->recovery); 6745 set_bit(MD_RECOVERY_DONE, &mddev->recovery);
@@ -6594,6 +6802,7 @@ static int remove_and_add_spares(mddev_t *mddev)
6594 nm, mdname(mddev)); 6802 nm, mdname(mddev));
6595 spares++; 6803 spares++;
6596 md_new_event(mddev); 6804 md_new_event(mddev);
6805 set_bit(MD_CHANGE_DEVS, &mddev->flags);
6597 } else 6806 } else
6598 break; 6807 break;
6599 } 6808 }
@@ -6629,7 +6838,7 @@ void md_check_recovery(mddev_t *mddev)
6629 6838
6630 6839
6631 if (mddev->bitmap) 6840 if (mddev->bitmap)
6632 bitmap_daemon_work(mddev->bitmap); 6841 bitmap_daemon_work(mddev);
6633 6842
6634 if (mddev->ro) 6843 if (mddev->ro)
6635 return; 6844 return;
@@ -6999,5 +7208,6 @@ EXPORT_SYMBOL(md_unregister_thread);
6999EXPORT_SYMBOL(md_wakeup_thread); 7208EXPORT_SYMBOL(md_wakeup_thread);
7000EXPORT_SYMBOL(md_check_recovery); 7209EXPORT_SYMBOL(md_check_recovery);
7001MODULE_LICENSE("GPL"); 7210MODULE_LICENSE("GPL");
7211MODULE_DESCRIPTION("MD RAID framework");
7002MODULE_ALIAS("md"); 7212MODULE_ALIAS("md");
7003MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR); 7213MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR);
diff --git a/drivers/md/md.h b/drivers/md/md.h
index f184b69ef337..8e4c75c00d46 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -97,6 +97,9 @@ struct mdk_rdev_s
97 atomic_t read_errors; /* number of consecutive read errors that 97 atomic_t read_errors; /* number of consecutive read errors that
98 * we have tried to ignore. 98 * we have tried to ignore.
99 */ 99 */
100 struct timespec last_read_error; /* monotonic time since our
101 * last read error
102 */
100 atomic_t corrected_errors; /* number of corrected read errors, 103 atomic_t corrected_errors; /* number of corrected read errors,
101 * for reporting to userspace and storing 104 * for reporting to userspace and storing
102 * in superblock. 105 * in superblock.
@@ -280,17 +283,38 @@ struct mddev_s
280 unsigned int max_write_behind; /* 0 = sync */ 283 unsigned int max_write_behind; /* 0 = sync */
281 284
282 struct bitmap *bitmap; /* the bitmap for the device */ 285 struct bitmap *bitmap; /* the bitmap for the device */
283 struct file *bitmap_file; /* the bitmap file */ 286 struct {
284 long bitmap_offset; /* offset from superblock of 287 struct file *file; /* the bitmap file */
285 * start of bitmap. May be 288 loff_t offset; /* offset from superblock of
286 * negative, but not '0' 289 * start of bitmap. May be
287 */ 290 * negative, but not '0'
288 long default_bitmap_offset; /* this is the offset to use when 291 * For external metadata, offset
289 * hot-adding a bitmap. It should 292 * from start of device.
290 * eventually be settable by sysfs. 293 */
291 */ 294 loff_t default_offset; /* this is the offset to use when
292 295 * hot-adding a bitmap. It should
296 * eventually be settable by sysfs.
297 */
298 struct mutex mutex;
299 unsigned long chunksize;
300 unsigned long daemon_sleep; /* how many seconds between updates? */
301 unsigned long max_write_behind; /* write-behind mode */
302 int external;
303 } bitmap_info;
304
305 atomic_t max_corr_read_errors; /* max read retries */
293 struct list_head all_mddevs; 306 struct list_head all_mddevs;
307
308 /* Generic barrier handling.
309 * If there is a pending barrier request, all other
310 * writes are blocked while the devices are flushed.
311 * The last to finish a flush schedules a worker to
312 * submit the barrier request (without the barrier flag),
313 * then submit more flush requests.
314 */
315 struct bio *barrier;
316 atomic_t flush_pending;
317 struct work_struct barrier_work;
294}; 318};
295 319
296 320
@@ -353,7 +377,7 @@ struct md_sysfs_entry {
353 ssize_t (*show)(mddev_t *, char *); 377 ssize_t (*show)(mddev_t *, char *);
354 ssize_t (*store)(mddev_t *, const char *, size_t); 378 ssize_t (*store)(mddev_t *, const char *, size_t);
355}; 379};
356 380extern struct attribute_group md_bitmap_group;
357 381
358static inline char * mdname (mddev_t * mddev) 382static inline char * mdname (mddev_t * mddev)
359{ 383{
@@ -431,6 +455,7 @@ extern void md_done_sync(mddev_t *mddev, int blocks, int ok);
431extern void md_error(mddev_t *mddev, mdk_rdev_t *rdev); 455extern void md_error(mddev_t *mddev, mdk_rdev_t *rdev);
432 456
433extern int mddev_congested(mddev_t *mddev, int bits); 457extern int mddev_congested(mddev_t *mddev, int bits);
458extern void md_barrier_request(mddev_t *mddev, struct bio *bio);
434extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, 459extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
435 sector_t sector, int size, struct page *page); 460 sector_t sector, int size, struct page *page);
436extern void md_super_wait(mddev_t *mddev); 461extern void md_super_wait(mddev_t *mddev);
@@ -443,6 +468,8 @@ extern void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev);
443extern void md_set_array_sectors(mddev_t *mddev, sector_t array_sectors); 468extern void md_set_array_sectors(mddev_t *mddev, sector_t array_sectors);
444extern int md_check_no_bitmap(mddev_t *mddev); 469extern int md_check_no_bitmap(mddev_t *mddev);
445extern int md_integrity_register(mddev_t *mddev); 470extern int md_integrity_register(mddev_t *mddev);
446void md_integrity_add_rdev(mdk_rdev_t *rdev, mddev_t *mddev); 471extern void md_integrity_add_rdev(mdk_rdev_t *rdev, mddev_t *mddev);
472extern int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale);
473extern void restore_bitmap_write_access(struct file *file);
447 474
448#endif /* _MD_MD_H */ 475#endif /* _MD_MD_H */
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index ee7646f974a0..789bf535d29c 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -22,6 +22,7 @@
22#include <linux/blkdev.h> 22#include <linux/blkdev.h>
23#include <linux/raid/md_u.h> 23#include <linux/raid/md_u.h>
24#include <linux/seq_file.h> 24#include <linux/seq_file.h>
25#include <linux/slab.h>
25#include "md.h" 26#include "md.h"
26#include "multipath.h" 27#include "multipath.h"
27 28
@@ -145,7 +146,7 @@ static int multipath_make_request (struct request_queue *q, struct bio * bio)
145 int cpu; 146 int cpu;
146 147
147 if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) { 148 if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) {
148 bio_endio(bio, -EOPNOTSUPP); 149 md_barrier_request(mddev, bio);
149 return 0; 150 return 0;
150 } 151 }
151 152
@@ -301,14 +302,16 @@ static int multipath_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
301 rdev->data_offset << 9); 302 rdev->data_offset << 9);
302 303
303 /* as we don't honour merge_bvec_fn, we must never risk 304 /* as we don't honour merge_bvec_fn, we must never risk
304 * violating it, so limit ->max_sector to one PAGE, as 305 * violating it, so limit ->max_segments to one, lying
305 * a one page request is never in violation. 306 * within a single page.
306 * (Note: it is very unlikely that a device with 307 * (Note: it is very unlikely that a device with
307 * merge_bvec_fn will be involved in multipath.) 308 * merge_bvec_fn will be involved in multipath.)
308 */ 309 */
309 if (q->merge_bvec_fn && 310 if (q->merge_bvec_fn) {
310 queue_max_sectors(q) > (PAGE_SIZE>>9)) 311 blk_queue_max_segments(mddev->queue, 1);
311 blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); 312 blk_queue_segment_boundary(mddev->queue,
313 PAGE_CACHE_SIZE - 1);
314 }
312 315
313 conf->working_disks++; 316 conf->working_disks++;
314 mddev->degraded--; 317 mddev->degraded--;
@@ -476,9 +479,11 @@ static int multipath_run (mddev_t *mddev)
476 /* as we don't honour merge_bvec_fn, we must never risk 479 /* as we don't honour merge_bvec_fn, we must never risk
477 * violating it, not that we ever expect a device with 480 * violating it, not that we ever expect a device with
478 * a merge_bvec_fn to be involved in multipath */ 481 * a merge_bvec_fn to be involved in multipath */
479 if (rdev->bdev->bd_disk->queue->merge_bvec_fn && 482 if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
480 queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9)) 483 blk_queue_max_segments(mddev->queue, 1);
481 blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); 484 blk_queue_segment_boundary(mddev->queue,
485 PAGE_CACHE_SIZE - 1);
486 }
482 487
483 if (!test_bit(Faulty, &rdev->flags)) 488 if (!test_bit(Faulty, &rdev->flags))
484 conf->working_disks++; 489 conf->working_disks++;
@@ -581,6 +586,7 @@ static void __exit multipath_exit (void)
581module_init(multipath_init); 586module_init(multipath_init);
582module_exit(multipath_exit); 587module_exit(multipath_exit);
583MODULE_LICENSE("GPL"); 588MODULE_LICENSE("GPL");
589MODULE_DESCRIPTION("simple multi-path personality for MD");
584MODULE_ALIAS("md-personality-7"); /* MULTIPATH */ 590MODULE_ALIAS("md-personality-7"); /* MULTIPATH */
585MODULE_ALIAS("md-multipath"); 591MODULE_ALIAS("md-multipath");
586MODULE_ALIAS("md-level--4"); 592MODULE_ALIAS("md-level--4");
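
The other change repeated throughout this patch (here and in the raid0/raid1/raid10 hunks below) replaces the old one-page cap on max_sectors with an explicit limit of one segment that may not cross a page boundary. The one-page cap bounded only the total size, so a bio could still be built from several sub-page vectors; a single segment confined to a single page is the stronger guarantee that a member device's merge_bvec_fn, which md never calls, can never be violated. A sketch of the resulting queue setup, reusing only the calls visible in the diff (q here stands for a member device's queue and is an assumption of the example):

    /* If any member device has a merge_bvec_fn that md will never call,
     * constrain md's own queue so every bio it accepts consists of exactly
     * one segment lying inside a single page. */
    if (q->merge_bvec_fn) {
            blk_queue_max_segments(mddev->queue, 1);
            blk_queue_segment_boundary(mddev->queue, PAGE_CACHE_SIZE - 1);
    }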
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index d3a4ce06015a..c3bec024612e 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -20,6 +20,7 @@
20 20
21#include <linux/blkdev.h> 21#include <linux/blkdev.h>
22#include <linux/seq_file.h> 22#include <linux/seq_file.h>
23#include <linux/slab.h>
23#include "md.h" 24#include "md.h"
24#include "raid0.h" 25#include "raid0.h"
25 26
@@ -176,14 +177,15 @@ static int create_strip_zones(mddev_t *mddev)
176 disk_stack_limits(mddev->gendisk, rdev1->bdev, 177 disk_stack_limits(mddev->gendisk, rdev1->bdev,
177 rdev1->data_offset << 9); 178 rdev1->data_offset << 9);
178 /* as we don't honour merge_bvec_fn, we must never risk 179 /* as we don't honour merge_bvec_fn, we must never risk
179 * violating it, so limit ->max_sector to one PAGE, as 180 * violating it, so limit ->max_segments to 1, lying within
180 * a one page request is never in violation. 181 * a single page.
181 */ 182 */
182 183
183 if (rdev1->bdev->bd_disk->queue->merge_bvec_fn && 184 if (rdev1->bdev->bd_disk->queue->merge_bvec_fn) {
184 queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9)) 185 blk_queue_max_segments(mddev->queue, 1);
185 blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); 186 blk_queue_segment_boundary(mddev->queue,
186 187 PAGE_CACHE_SIZE - 1);
188 }
187 if (!smallest || (rdev1->sectors < smallest->sectors)) 189 if (!smallest || (rdev1->sectors < smallest->sectors))
188 smallest = rdev1; 190 smallest = rdev1;
189 cnt++; 191 cnt++;
@@ -325,7 +327,7 @@ static int raid0_run(mddev_t *mddev)
325 } 327 }
326 if (md_check_no_bitmap(mddev)) 328 if (md_check_no_bitmap(mddev))
327 return -EINVAL; 329 return -EINVAL;
328 blk_queue_max_sectors(mddev->queue, mddev->chunk_sectors); 330 blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors);
329 mddev->queue->queue_lock = &mddev->queue->__queue_lock; 331 mddev->queue->queue_lock = &mddev->queue->__queue_lock;
330 332
331 ret = create_strip_zones(mddev); 333 ret = create_strip_zones(mddev);
@@ -453,7 +455,7 @@ static int raid0_make_request(struct request_queue *q, struct bio *bio)
453 int cpu; 455 int cpu;
454 456
455 if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) { 457 if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) {
456 bio_endio(bio, -EOPNOTSUPP); 458 md_barrier_request(mddev, bio);
457 return 0; 459 return 0;
458 } 460 }
459 461
@@ -567,6 +569,7 @@ static void raid0_exit (void)
567module_init(raid0_init); 569module_init(raid0_init);
568module_exit(raid0_exit); 570module_exit(raid0_exit);
569MODULE_LICENSE("GPL"); 571MODULE_LICENSE("GPL");
572MODULE_DESCRIPTION("RAID0 (striping) personality for MD");
570MODULE_ALIAS("md-personality-2"); /* RAID0 */ 573MODULE_ALIAS("md-personality-2"); /* RAID0 */
571MODULE_ALIAS("md-raid0"); 574MODULE_ALIAS("md-raid0");
572MODULE_ALIAS("md-level-0"); 575MODULE_ALIAS("md-level-0");
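
raid0_run() above also picks up the block-layer rename of blk_queue_max_sectors() to blk_queue_max_hw_sectors(); the argument is still mddev->chunk_sectors, which md keeps in 512-byte sectors, so each bio is size-capped at one chunk's worth of data. A tiny worked example of that unit conversion, with chunk_kib as an illustrative value rather than a field from this code:

    #include <stdio.h>

    int main(void)
    {
            unsigned int chunk_kib = 64;                   /* 64 KiB chunk size */
            unsigned int chunk_sectors = chunk_kib * 1024 / 512;

            printf("chunk_sectors = %u\n", chunk_sectors); /* prints 128 */
            return 0;
    }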
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index e07ce2e033a9..e59b10e66edb 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -31,6 +31,7 @@
31 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 31 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
32 */ 32 */
33 33
34#include <linux/slab.h>
34#include <linux/delay.h> 35#include <linux/delay.h>
35#include <linux/blkdev.h> 36#include <linux/blkdev.h>
36#include <linux/seq_file.h> 37#include <linux/seq_file.h>
@@ -677,6 +678,7 @@ static void raise_barrier(conf_t *conf)
677static void lower_barrier(conf_t *conf) 678static void lower_barrier(conf_t *conf)
678{ 679{
679 unsigned long flags; 680 unsigned long flags;
681 BUG_ON(conf->barrier <= 0);
680 spin_lock_irqsave(&conf->resync_lock, flags); 682 spin_lock_irqsave(&conf->resync_lock, flags);
681 conf->barrier--; 683 conf->barrier--;
682 spin_unlock_irqrestore(&conf->resync_lock, flags); 684 spin_unlock_irqrestore(&conf->resync_lock, flags);
@@ -801,6 +803,25 @@ static int make_request(struct request_queue *q, struct bio * bio)
801 803
802 md_write_start(mddev, bio); /* wait on superblock update early */ 804 md_write_start(mddev, bio); /* wait on superblock update early */
803 805
806 if (bio_data_dir(bio) == WRITE &&
807 bio->bi_sector + bio->bi_size/512 > mddev->suspend_lo &&
808 bio->bi_sector < mddev->suspend_hi) {
809 /* As the suspend_* range is controlled by
810 * userspace, we want an interruptible
811 * wait.
812 */
813 DEFINE_WAIT(w);
814 for (;;) {
815 flush_signals(current);
816 prepare_to_wait(&conf->wait_barrier,
817 &w, TASK_INTERRUPTIBLE);
818 if (bio->bi_sector + bio->bi_size/512 <= mddev->suspend_lo ||
819 bio->bi_sector >= mddev->suspend_hi)
820 break;
821 schedule();
822 }
823 finish_wait(&conf->wait_barrier, &w);
824 }
804 if (unlikely(!mddev->barriers_work && 825 if (unlikely(!mddev->barriers_work &&
805 bio_rw_flagged(bio, BIO_RW_BARRIER))) { 826 bio_rw_flagged(bio, BIO_RW_BARRIER))) {
806 if (rw == WRITE) 827 if (rw == WRITE)
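
The wait added above keeps new writes out of the mddev->suspend_lo..suspend_hi window, and it is deliberately interruptible because that window is driven from user space. The condition itself is a plain half-open interval overlap test on sector ranges; a small stand-alone model of just that check, where overlaps_suspended() and the sample values are illustrative rather than kernel code:

    #include <assert.h>
    #include <stdio.h>

    typedef unsigned long long sector_t;

    /* Mirrors the test in make_request(): the write [start, start+len) must
     * wait while it overlaps the suspended range [suspend_lo, suspend_hi). */
    static int overlaps_suspended(sector_t start, sector_t len,
                                  sector_t suspend_lo, sector_t suspend_hi)
    {
            sector_t end = start + len;          /* bi_sector + bi_size/512 */

            return end > suspend_lo && start < suspend_hi;
    }

    int main(void)
    {
            assert(overlaps_suspended(100, 8, 104, 200));   /* straddles suspend_lo */
            assert(!overlaps_suspended(100, 8, 108, 200));  /* ends exactly at suspend_lo */
            assert(!overlaps_suspended(300, 8, 104, 200));  /* entirely above suspend_hi */
            printf("overlap checks ok\n");
            return 0;
    }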
@@ -923,7 +944,8 @@ static int make_request(struct request_queue *q, struct bio * bio)
923 944
924 /* do behind I/O ? */ 945 /* do behind I/O ? */
925 if (bitmap && 946 if (bitmap &&
926 atomic_read(&bitmap->behind_writes) < bitmap->max_write_behind && 947 (atomic_read(&bitmap->behind_writes)
948 < mddev->bitmap_info.max_write_behind) &&
927 (behind_pages = alloc_behind_pages(bio)) != NULL) 949 (behind_pages = alloc_behind_pages(bio)) != NULL)
928 set_bit(R1BIO_BehindIO, &r1_bio->state); 950 set_bit(R1BIO_BehindIO, &r1_bio->state);
929 951
@@ -1131,13 +1153,17 @@ static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
1131 1153
1132 disk_stack_limits(mddev->gendisk, rdev->bdev, 1154 disk_stack_limits(mddev->gendisk, rdev->bdev,
1133 rdev->data_offset << 9); 1155 rdev->data_offset << 9);
1134 /* as we don't honour merge_bvec_fn, we must never risk 1156 /* as we don't honour merge_bvec_fn, we must
1135 * violating it, so limit ->max_sector to one PAGE, as 1157 * never risk violating it, so limit
1136 * a one page request is never in violation. 1158 * ->max_segments to one lying within a single
1159 * page, as a one page request is never in
1160 * violation.
1137 */ 1161 */
1138 if (rdev->bdev->bd_disk->queue->merge_bvec_fn && 1162 if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
1139 queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9)) 1163 blk_queue_max_segments(mddev->queue, 1);
1140 blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); 1164 blk_queue_segment_boundary(mddev->queue,
1165 PAGE_CACHE_SIZE - 1);
1166 }
1141 1167
1142 p->head_position = 0; 1168 p->head_position = 0;
1143 rdev->raid_disk = mirror; 1169 rdev->raid_disk = mirror;
@@ -1941,74 +1967,48 @@ static sector_t raid1_size(mddev_t *mddev, sector_t sectors, int raid_disks)
1941 return mddev->dev_sectors; 1967 return mddev->dev_sectors;
1942} 1968}
1943 1969
1944static int run(mddev_t *mddev) 1970static conf_t *setup_conf(mddev_t *mddev)
1945{ 1971{
1946 conf_t *conf; 1972 conf_t *conf;
1947 int i, j, disk_idx; 1973 int i;
1948 mirror_info_t *disk; 1974 mirror_info_t *disk;
1949 mdk_rdev_t *rdev; 1975 mdk_rdev_t *rdev;
1976 int err = -ENOMEM;
1950 1977
1951 if (mddev->level != 1) {
1952 printk("raid1: %s: raid level not set to mirroring (%d)\n",
1953 mdname(mddev), mddev->level);
1954 goto out;
1955 }
1956 if (mddev->reshape_position != MaxSector) {
1957 printk("raid1: %s: reshape_position set but not supported\n",
1958 mdname(mddev));
1959 goto out;
1960 }
1961 /*
1962 * copy the already verified devices into our private RAID1
1963 * bookkeeping area. [whatever we allocate in run(),
1964 * should be freed in stop()]
1965 */
1966 conf = kzalloc(sizeof(conf_t), GFP_KERNEL); 1978 conf = kzalloc(sizeof(conf_t), GFP_KERNEL);
1967 mddev->private = conf;
1968 if (!conf) 1979 if (!conf)
1969 goto out_no_mem; 1980 goto abort;
1970 1981
1971 conf->mirrors = kzalloc(sizeof(struct mirror_info)*mddev->raid_disks, 1982 conf->mirrors = kzalloc(sizeof(struct mirror_info)*mddev->raid_disks,
1972 GFP_KERNEL); 1983 GFP_KERNEL);
1973 if (!conf->mirrors) 1984 if (!conf->mirrors)
1974 goto out_no_mem; 1985 goto abort;
1975 1986
1976 conf->tmppage = alloc_page(GFP_KERNEL); 1987 conf->tmppage = alloc_page(GFP_KERNEL);
1977 if (!conf->tmppage) 1988 if (!conf->tmppage)
1978 goto out_no_mem; 1989 goto abort;
1979 1990
1980 conf->poolinfo = kmalloc(sizeof(*conf->poolinfo), GFP_KERNEL); 1991 conf->poolinfo = kzalloc(sizeof(*conf->poolinfo), GFP_KERNEL);
1981 if (!conf->poolinfo) 1992 if (!conf->poolinfo)
1982 goto out_no_mem; 1993 goto abort;
1983 conf->poolinfo->mddev = NULL;
1984 conf->poolinfo->raid_disks = mddev->raid_disks; 1994 conf->poolinfo->raid_disks = mddev->raid_disks;
1985 conf->r1bio_pool = mempool_create(NR_RAID1_BIOS, r1bio_pool_alloc, 1995 conf->r1bio_pool = mempool_create(NR_RAID1_BIOS, r1bio_pool_alloc,
1986 r1bio_pool_free, 1996 r1bio_pool_free,
1987 conf->poolinfo); 1997 conf->poolinfo);
1988 if (!conf->r1bio_pool) 1998 if (!conf->r1bio_pool)
1989 goto out_no_mem; 1999 goto abort;
2000
1990 conf->poolinfo->mddev = mddev; 2001 conf->poolinfo->mddev = mddev;
1991 2002
1992 spin_lock_init(&conf->device_lock); 2003 spin_lock_init(&conf->device_lock);
1993 mddev->queue->queue_lock = &conf->device_lock;
1994
1995 list_for_each_entry(rdev, &mddev->disks, same_set) { 2004 list_for_each_entry(rdev, &mddev->disks, same_set) {
1996 disk_idx = rdev->raid_disk; 2005 int disk_idx = rdev->raid_disk;
1997 if (disk_idx >= mddev->raid_disks 2006 if (disk_idx >= mddev->raid_disks
1998 || disk_idx < 0) 2007 || disk_idx < 0)
1999 continue; 2008 continue;
2000 disk = conf->mirrors + disk_idx; 2009 disk = conf->mirrors + disk_idx;
2001 2010
2002 disk->rdev = rdev; 2011 disk->rdev = rdev;
2003 disk_stack_limits(mddev->gendisk, rdev->bdev,
2004 rdev->data_offset << 9);
2005 /* as we don't honour merge_bvec_fn, we must never risk
2006 * violating it, so limit ->max_sector to one PAGE, as
2007 * a one page request is never in violation.
2008 */
2009 if (rdev->bdev->bd_disk->queue->merge_bvec_fn &&
2010 queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9))
2011 blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
2012 2012
2013 disk->head_position = 0; 2013 disk->head_position = 0;
2014 } 2014 }
@@ -2022,8 +2022,7 @@ static int run(mddev_t *mddev)
2022 bio_list_init(&conf->pending_bio_list); 2022 bio_list_init(&conf->pending_bio_list);
2023 bio_list_init(&conf->flushing_bio_list); 2023 bio_list_init(&conf->flushing_bio_list);
2024 2024
2025 2025 conf->last_used = -1;
2026 mddev->degraded = 0;
2027 for (i = 0; i < conf->raid_disks; i++) { 2026 for (i = 0; i < conf->raid_disks; i++) {
2028 2027
2029 disk = conf->mirrors + i; 2028 disk = conf->mirrors + i;
@@ -2031,38 +2030,99 @@ static int run(mddev_t *mddev)
2031 if (!disk->rdev || 2030 if (!disk->rdev ||
2032 !test_bit(In_sync, &disk->rdev->flags)) { 2031 !test_bit(In_sync, &disk->rdev->flags)) {
2033 disk->head_position = 0; 2032 disk->head_position = 0;
2034 mddev->degraded++;
2035 if (disk->rdev) 2033 if (disk->rdev)
2036 conf->fullsync = 1; 2034 conf->fullsync = 1;
2037 } 2035 } else if (conf->last_used < 0)
2036 /*
2037 * The first working device is used as a
2038 * starting point for read balancing.
2039 */
2040 conf->last_used = i;
2038 } 2041 }
2039 if (mddev->degraded == conf->raid_disks) { 2042
2043 err = -EIO;
2044 if (conf->last_used < 0) {
2040 printk(KERN_ERR "raid1: no operational mirrors for %s\n", 2045 printk(KERN_ERR "raid1: no operational mirrors for %s\n",
2041 mdname(mddev)); 2046 mdname(mddev));
2042 goto out_free_conf; 2047 goto abort;
2043 } 2048 }
2044 if (conf->raid_disks - mddev->degraded == 1) 2049 err = -ENOMEM;
2045 mddev->recovery_cp = MaxSector; 2050 conf->thread = md_register_thread(raid1d, mddev, NULL);
2051 if (!conf->thread) {
2052 printk(KERN_ERR
2053 "raid1: couldn't allocate thread for %s\n",
2054 mdname(mddev));
2055 goto abort;
2056 }
2057
2058 return conf;
2059
2060 abort:
2061 if (conf) {
2062 if (conf->r1bio_pool)
2063 mempool_destroy(conf->r1bio_pool);
2064 kfree(conf->mirrors);
2065 safe_put_page(conf->tmppage);
2066 kfree(conf->poolinfo);
2067 kfree(conf);
2068 }
2069 return ERR_PTR(err);
2070}
2046 2071
2072static int run(mddev_t *mddev)
2073{
2074 conf_t *conf;
2075 int i;
2076 mdk_rdev_t *rdev;
2077
2078 if (mddev->level != 1) {
2079 printk("raid1: %s: raid level not set to mirroring (%d)\n",
2080 mdname(mddev), mddev->level);
2081 return -EIO;
2082 }
2083 if (mddev->reshape_position != MaxSector) {
2084 printk("raid1: %s: reshape_position set but not supported\n",
2085 mdname(mddev));
2086 return -EIO;
2087 }
2047 /* 2088 /*
2048 * find the first working one and use it as a starting point 2089 * copy the already verified devices into our private RAID1
2049 * to read balancing. 2090 * bookkeeping area. [whatever we allocate in run(),
2091 * should be freed in stop()]
2050 */ 2092 */
2051 for (j = 0; j < conf->raid_disks && 2093 if (mddev->private == NULL)
2052 (!conf->mirrors[j].rdev || 2094 conf = setup_conf(mddev);
2053 !test_bit(In_sync, &conf->mirrors[j].rdev->flags)) ; j++) 2095 else
2054 /* nothing */; 2096 conf = mddev->private;
2055 conf->last_used = j;
2056 2097
2098 if (IS_ERR(conf))
2099 return PTR_ERR(conf);
2057 2100
2058 mddev->thread = md_register_thread(raid1d, mddev, NULL); 2101 mddev->queue->queue_lock = &conf->device_lock;
2059 if (!mddev->thread) { 2102 list_for_each_entry(rdev, &mddev->disks, same_set) {
2060 printk(KERN_ERR 2103 disk_stack_limits(mddev->gendisk, rdev->bdev,
2061 "raid1: couldn't allocate thread for %s\n", 2104 rdev->data_offset << 9);
2062 mdname(mddev)); 2105 /* as we don't honour merge_bvec_fn, we must never risk
2063 goto out_free_conf; 2106 * violating it, so limit ->max_segments to 1 lying within
2107 * a single page, as a one page request is never in violation.
2108 */
2109 if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
2110 blk_queue_max_segments(mddev->queue, 1);
2111 blk_queue_segment_boundary(mddev->queue,
2112 PAGE_CACHE_SIZE - 1);
2113 }
2064 } 2114 }
2065 2115
2116 mddev->degraded = 0;
2117 for (i=0; i < conf->raid_disks; i++)
2118 if (conf->mirrors[i].rdev == NULL ||
2119 !test_bit(In_sync, &conf->mirrors[i].rdev->flags) ||
2120 test_bit(Faulty, &conf->mirrors[i].rdev->flags))
2121 mddev->degraded++;
2122
2123 if (conf->raid_disks - mddev->degraded == 1)
2124 mddev->recovery_cp = MaxSector;
2125
2066 if (mddev->recovery_cp != MaxSector) 2126 if (mddev->recovery_cp != MaxSector)
2067 printk(KERN_NOTICE "raid1: %s is not clean" 2127 printk(KERN_NOTICE "raid1: %s is not clean"
2068 " -- starting background reconstruction\n", 2128 " -- starting background reconstruction\n",
@@ -2071,9 +2131,14 @@ static int run(mddev_t *mddev)
2071 "raid1: raid set %s active with %d out of %d mirrors\n", 2131 "raid1: raid set %s active with %d out of %d mirrors\n",
2072 mdname(mddev), mddev->raid_disks - mddev->degraded, 2132 mdname(mddev), mddev->raid_disks - mddev->degraded,
2073 mddev->raid_disks); 2133 mddev->raid_disks);
2134
2074 /* 2135 /*
2075 * Ok, everything is just fine now 2136 * Ok, everything is just fine now
2076 */ 2137 */
2138 mddev->thread = conf->thread;
2139 conf->thread = NULL;
2140 mddev->private = conf;
2141
2077 md_set_array_sectors(mddev, raid1_size(mddev, 0, 0)); 2142 md_set_array_sectors(mddev, raid1_size(mddev, 0, 0));
2078 2143
2079 mddev->queue->unplug_fn = raid1_unplug; 2144 mddev->queue->unplug_fn = raid1_unplug;
@@ -2081,23 +2146,6 @@ static int run(mddev_t *mddev)
2081 mddev->queue->backing_dev_info.congested_data = mddev; 2146 mddev->queue->backing_dev_info.congested_data = mddev;
2082 md_integrity_register(mddev); 2147 md_integrity_register(mddev);
2083 return 0; 2148 return 0;
2084
2085out_no_mem:
2086 printk(KERN_ERR "raid1: couldn't allocate memory for %s\n",
2087 mdname(mddev));
2088
2089out_free_conf:
2090 if (conf) {
2091 if (conf->r1bio_pool)
2092 mempool_destroy(conf->r1bio_pool);
2093 kfree(conf->mirrors);
2094 safe_put_page(conf->tmppage);
2095 kfree(conf->poolinfo);
2096 kfree(conf);
2097 mddev->private = NULL;
2098 }
2099out:
2100 return -EIO;
2101} 2149}
2102 2150
2103static int stop(mddev_t *mddev) 2151static int stop(mddev_t *mddev)
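
setup_conf() above reports failure the way most kernel constructors do: the errno is encoded in the returned pointer with ERR_PTR(), and run() unwraps it via IS_ERR()/PTR_ERR() instead of juggling a NULL check plus a separate error code. A small user-space model of that convention, purely for illustration; err_ptr()/is_err()/ptr_err() are stand-ins for the kernel macros and setup_demo() is a made-up constructor:

    #include <assert.h>
    #include <errno.h>
    #include <stdio.h>

    #define MAX_ERRNO 4095                /* same window of poisoned pointer values */

    static void *err_ptr(long error)      { return (void *)error; }
    static long  ptr_err(const void *ptr) { return (long)ptr; }
    static int   is_err(const void *ptr)
    {
            return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
    }

    static int real_conf;                 /* stand-in for an allocated conf_t */

    static void *setup_demo(int fail)
    {
            if (fail)
                    return err_ptr(-ENOMEM);       /* "could not allocate the conf" */
            return &real_conf;
    }

    int main(void)
    {
            void *conf = setup_demo(1);

            if (is_err(conf))
                    printf("setup failed: %ld\n", ptr_err(conf));  /* prints -12 */
            conf = setup_demo(0);
            assert(!is_err(conf));
            return 0;
    }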
@@ -2271,6 +2319,9 @@ static void raid1_quiesce(mddev_t *mddev, int state)
2271 conf_t *conf = mddev->private; 2319 conf_t *conf = mddev->private;
2272 2320
2273 switch(state) { 2321 switch(state) {
2322 case 2: /* wake for suspend */
2323 wake_up(&conf->wait_barrier);
2324 break;
2274 case 1: 2325 case 1:
2275 raise_barrier(conf); 2326 raise_barrier(conf);
2276 break; 2327 break;
@@ -2280,6 +2331,23 @@ static void raid1_quiesce(mddev_t *mddev, int state)
2280 } 2331 }
2281} 2332}
2282 2333
2334static void *raid1_takeover(mddev_t *mddev)
2335{
2336 /* raid1 can take over:
2337 * raid5 with 2 devices, any layout or chunk size
2338 */
2339 if (mddev->level == 5 && mddev->raid_disks == 2) {
2340 conf_t *conf;
2341 mddev->new_level = 1;
2342 mddev->new_layout = 0;
2343 mddev->new_chunk_sectors = 0;
2344 conf = setup_conf(mddev);
2345 if (!IS_ERR(conf))
2346 conf->barrier = 1;
2347 return conf;
2348 }
2349 return ERR_PTR(-EINVAL);
2350}
2283 2351
2284static struct mdk_personality raid1_personality = 2352static struct mdk_personality raid1_personality =
2285{ 2353{
@@ -2299,6 +2367,7 @@ static struct mdk_personality raid1_personality =
2299 .size = raid1_size, 2367 .size = raid1_size,
2300 .check_reshape = raid1_reshape, 2368 .check_reshape = raid1_reshape,
2301 .quiesce = raid1_quiesce, 2369 .quiesce = raid1_quiesce,
2370 .takeover = raid1_takeover,
2302}; 2371};
2303 2372
2304static int __init raid_init(void) 2373static int __init raid_init(void)
@@ -2314,6 +2383,7 @@ static void raid_exit(void)
2314module_init(raid_init); 2383module_init(raid_init);
2315module_exit(raid_exit); 2384module_exit(raid_exit);
2316MODULE_LICENSE("GPL"); 2385MODULE_LICENSE("GPL");
2386MODULE_DESCRIPTION("RAID1 (mirroring) personality for MD");
2317MODULE_ALIAS("md-personality-3"); /* RAID1 */ 2387MODULE_ALIAS("md-personality-3"); /* RAID1 */
2318MODULE_ALIAS("md-raid1"); 2388MODULE_ALIAS("md-raid1");
2319MODULE_ALIAS("md-level-1"); 2389MODULE_ALIAS("md-level-1");
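
raid1_takeover() above is the other reason run() was split out of setup_conf(): the md core can now ask raid1 to adopt an existing two-disk raid5 array and get back a ready-made conf. A rough caller-side sketch of that contract, inferred from the shape of the function rather than taken from this diff (the core's actual level-change path lives in md.c and is not shown here):

    /* Sketch only: what a caller of ->takeover() is expected to see. */
    void *priv = raid1_personality.takeover(mddev);

    if (IS_ERR(priv))
            return PTR_ERR(priv);   /* -EINVAL for anything but a 2-disk raid5 */
    /* On success raid1_takeover() has already staged mddev->new_level,
     * new_layout and new_chunk_sectors, and the returned conf has
     * conf->barrier raised, so regular I/O stays blocked until the array
     * is un-quiesced after the personality switch completes. */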
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
index e87b84deff68..5f2d443ae28a 100644
--- a/drivers/md/raid1.h
+++ b/drivers/md/raid1.h
@@ -59,6 +59,11 @@ struct r1_private_data_s {
59 59
60 mempool_t *r1bio_pool; 60 mempool_t *r1bio_pool;
61 mempool_t *r1buf_pool; 61 mempool_t *r1buf_pool;
62
63 /* When taking over an array from a different personality, we store
64 * the new thread here until we fully activate the array.
65 */
66 struct mdk_thread_s *thread;
62}; 67};
63 68
64typedef struct r1_private_data_s conf_t; 69typedef struct r1_private_data_s conf_t;
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index c2cb7b87b440..e2766d8251a1 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -18,6 +18,7 @@
18 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 18 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 */ 19 */
20 20
21#include <linux/slab.h>
21#include <linux/delay.h> 22#include <linux/delay.h>
22#include <linux/blkdev.h> 23#include <linux/blkdev.h>
23#include <linux/seq_file.h> 24#include <linux/seq_file.h>
@@ -804,7 +805,7 @@ static int make_request(struct request_queue *q, struct bio * bio)
804 mdk_rdev_t *blocked_rdev; 805 mdk_rdev_t *blocked_rdev;
805 806
806 if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) { 807 if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) {
807 bio_endio(bio, -EOPNOTSUPP); 808 md_barrier_request(mddev, bio);
808 return 0; 809 return 0;
809 } 810 }
810 811
@@ -1155,13 +1156,17 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
1155 1156
1156 disk_stack_limits(mddev->gendisk, rdev->bdev, 1157 disk_stack_limits(mddev->gendisk, rdev->bdev,
1157 rdev->data_offset << 9); 1158 rdev->data_offset << 9);
1158 /* as we don't honour merge_bvec_fn, we must never risk 1159 /* as we don't honour merge_bvec_fn, we must
1159 * violating it, so limit ->max_sector to one PAGE, as 1160 * never risk violating it, so limit
1160 * a one page request is never in violation. 1161 * ->max_segments to one lying within a single
1162 * page, as a one page request is never in
1163 * violation.
1161 */ 1164 */
1162 if (rdev->bdev->bd_disk->queue->merge_bvec_fn && 1165 if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
1163 queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9)) 1166 blk_queue_max_segments(mddev->queue, 1);
1164 blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); 1167 blk_queue_segment_boundary(mddev->queue,
1168 PAGE_CACHE_SIZE - 1);
1169 }
1165 1170
1166 p->head_position = 0; 1171 p->head_position = 0;
1167 rdev->raid_disk = mirror; 1172 rdev->raid_disk = mirror;
@@ -1432,6 +1437,43 @@ static void recovery_request_write(mddev_t *mddev, r10bio_t *r10_bio)
1432 1437
1433 1438
1434/* 1439/*
1440 * Used by fix_read_error() to decay the per rdev read_errors.
1441 * We halve the read error count for every hour that has elapsed
1442 * since the last recorded read error.
1443 *
1444 */
1445static void check_decay_read_errors(mddev_t *mddev, mdk_rdev_t *rdev)
1446{
1447 struct timespec cur_time_mon;
1448 unsigned long hours_since_last;
1449 unsigned int read_errors = atomic_read(&rdev->read_errors);
1450
1451 ktime_get_ts(&cur_time_mon);
1452
1453 if (rdev->last_read_error.tv_sec == 0 &&
1454 rdev->last_read_error.tv_nsec == 0) {
1455 /* first time we've seen a read error */
1456 rdev->last_read_error = cur_time_mon;
1457 return;
1458 }
1459
1460 hours_since_last = (cur_time_mon.tv_sec -
1461 rdev->last_read_error.tv_sec) / 3600;
1462
1463 rdev->last_read_error = cur_time_mon;
1464
1465 /*
1466 * if hours_since_last is > the number of bits in read_errors
1467 * just set read errors to 0. We do this to avoid
1468 * overflowing the shift of read_errors by hours_since_last.
1469 */
1470 if (hours_since_last >= 8 * sizeof(read_errors))
1471 atomic_set(&rdev->read_errors, 0);
1472 else
1473 atomic_set(&rdev->read_errors, read_errors >> hours_since_last);
1474}
1475
1476/*
1435 * This is a kernel thread which: 1477 * This is a kernel thread which:
1436 * 1478 *
1437 * 1. Retries failed read operations on working mirrors. 1479 * 1. Retries failed read operations on working mirrors.
@@ -1444,6 +1486,43 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
1444 int sect = 0; /* Offset from r10_bio->sector */ 1486 int sect = 0; /* Offset from r10_bio->sector */
1445 int sectors = r10_bio->sectors; 1487 int sectors = r10_bio->sectors;
1446 mdk_rdev_t*rdev; 1488 mdk_rdev_t*rdev;
1489 int max_read_errors = atomic_read(&mddev->max_corr_read_errors);
1490
1491 rcu_read_lock();
1492 {
1493 int d = r10_bio->devs[r10_bio->read_slot].devnum;
1494 char b[BDEVNAME_SIZE];
1495 int cur_read_error_count = 0;
1496
1497 rdev = rcu_dereference(conf->mirrors[d].rdev);
1498 bdevname(rdev->bdev, b);
1499
1500 if (test_bit(Faulty, &rdev->flags)) {
1501 rcu_read_unlock();
1502 /* drive has already been failed, just ignore any
1503 more fix_read_error() attempts */
1504 return;
1505 }
1506
1507 check_decay_read_errors(mddev, rdev);
1508 atomic_inc(&rdev->read_errors);
1509 cur_read_error_count = atomic_read(&rdev->read_errors);
1510 if (cur_read_error_count > max_read_errors) {
1511 rcu_read_unlock();
1512 printk(KERN_NOTICE
1513 "raid10: %s: Raid device exceeded "
1514 "read_error threshold "
1515 "[cur %d:max %d]\n",
1516 b, cur_read_error_count, max_read_errors);
1517 printk(KERN_NOTICE
1518 "raid10: %s: Failing raid "
1519 "device\n", b);
1520 md_error(mddev, conf->mirrors[d].rdev);
1521 return;
1522 }
1523 }
1524 rcu_read_unlock();
1525
1447 while(sectors) { 1526 while(sectors) {
1448 int s = sectors; 1527 int s = sectors;
1449 int sl = r10_bio->read_slot; 1528 int sl = r10_bio->read_slot;
@@ -1488,6 +1567,7 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
1488 /* write it back and re-read */ 1567 /* write it back and re-read */
1489 rcu_read_lock(); 1568 rcu_read_lock();
1490 while (sl != r10_bio->read_slot) { 1569 while (sl != r10_bio->read_slot) {
1570 char b[BDEVNAME_SIZE];
1491 int d; 1571 int d;
1492 if (sl==0) 1572 if (sl==0)
1493 sl = conf->copies; 1573 sl = conf->copies;
@@ -1503,9 +1583,21 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
1503 r10_bio->devs[sl].addr + 1583 r10_bio->devs[sl].addr +
1504 sect + rdev->data_offset, 1584 sect + rdev->data_offset,
1505 s<<9, conf->tmppage, WRITE) 1585 s<<9, conf->tmppage, WRITE)
1506 == 0) 1586 == 0) {
1507 /* Well, this device is dead */ 1587 /* Well, this device is dead */
1588 printk(KERN_NOTICE
1589 "raid10:%s: read correction "
1590 "write failed"
1591 " (%d sectors at %llu on %s)\n",
1592 mdname(mddev), s,
1593 (unsigned long long)(sect+
1594 rdev->data_offset),
1595 bdevname(rdev->bdev, b));
1596 printk(KERN_NOTICE "raid10:%s: failing "
1597 "drive\n",
1598 bdevname(rdev->bdev, b));
1508 md_error(mddev, rdev); 1599 md_error(mddev, rdev);
1600 }
1509 rdev_dec_pending(rdev, mddev); 1601 rdev_dec_pending(rdev, mddev);
1510 rcu_read_lock(); 1602 rcu_read_lock();
1511 } 1603 }
@@ -1526,10 +1618,22 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
1526 if (sync_page_io(rdev->bdev, 1618 if (sync_page_io(rdev->bdev,
1527 r10_bio->devs[sl].addr + 1619 r10_bio->devs[sl].addr +
1528 sect + rdev->data_offset, 1620 sect + rdev->data_offset,
1529 s<<9, conf->tmppage, READ) == 0) 1621 s<<9, conf->tmppage,
1622 READ) == 0) {
1530 /* Well, this device is dead */ 1623 /* Well, this device is dead */
1624 printk(KERN_NOTICE
1625 "raid10:%s: unable to read back "
1626 "corrected sectors"
1627 " (%d sectors at %llu on %s)\n",
1628 mdname(mddev), s,
1629 (unsigned long long)(sect+
1630 rdev->data_offset),
1631 bdevname(rdev->bdev, b));
1632 printk(KERN_NOTICE "raid10:%s: failing drive\n",
1633 bdevname(rdev->bdev, b));
1634
1531 md_error(mddev, rdev); 1635 md_error(mddev, rdev);
1532 else 1636 } else {
1533 printk(KERN_INFO 1637 printk(KERN_INFO
1534 "raid10:%s: read error corrected" 1638 "raid10:%s: read error corrected"
1535 " (%d sectors at %llu on %s)\n", 1639 " (%d sectors at %llu on %s)\n",
@@ -1537,6 +1641,7 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
1537 (unsigned long long)(sect+ 1641 (unsigned long long)(sect+
1538 rdev->data_offset), 1642 rdev->data_offset),
1539 bdevname(rdev->bdev, b)); 1643 bdevname(rdev->bdev, b));
1644 }
1540 1645
1541 rdev_dec_pending(rdev, mddev); 1646 rdev_dec_pending(rdev, mddev);
1542 rcu_read_lock(); 1647 rcu_read_lock();
@@ -2155,12 +2260,14 @@ static int run(mddev_t *mddev)
2155 disk_stack_limits(mddev->gendisk, rdev->bdev, 2260 disk_stack_limits(mddev->gendisk, rdev->bdev,
2156 rdev->data_offset << 9); 2261 rdev->data_offset << 9);
2157 /* as we don't honour merge_bvec_fn, we must never risk 2262 /* as we don't honour merge_bvec_fn, we must never risk
2158 * violating it, so limit ->max_sector to one PAGE, as 2263 * violating it, so limit max_segments to 1 lying
2159 * a one page request is never in violation. 2264 * within a single page.
2160 */ 2265 */
2161 if (rdev->bdev->bd_disk->queue->merge_bvec_fn && 2266 if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
2162 queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9)) 2267 blk_queue_max_segments(mddev->queue, 1);
2163 blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); 2268 blk_queue_segment_boundary(mddev->queue,
2269 PAGE_CACHE_SIZE - 1);
2270 }
2164 2271
2165 disk->head_position = 0; 2272 disk->head_position = 0;
2166 } 2273 }
@@ -2275,13 +2382,6 @@ static void raid10_quiesce(mddev_t *mddev, int state)
2275 lower_barrier(conf); 2382 lower_barrier(conf);
2276 break; 2383 break;
2277 } 2384 }
2278 if (mddev->thread) {
2279 if (mddev->bitmap)
2280 mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ;
2281 else
2282 mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT;
2283 md_wakeup_thread(mddev->thread);
2284 }
2285} 2385}
2286 2386
2287static struct mdk_personality raid10_personality = 2387static struct mdk_personality raid10_personality =
@@ -2315,6 +2415,7 @@ static void raid_exit(void)
2315module_init(raid_init); 2415module_init(raid_init);
2316module_exit(raid_exit); 2416module_exit(raid_exit);
2317MODULE_LICENSE("GPL"); 2417MODULE_LICENSE("GPL");
2418MODULE_DESCRIPTION("RAID10 (striped mirror) personality for MD");
2318MODULE_ALIAS("md-personality-9"); /* RAID10 */ 2419MODULE_ALIAS("md-personality-9"); /* RAID10 */
2319MODULE_ALIAS("md-raid10"); 2420MODULE_ALIAS("md-raid10");
2320MODULE_ALIAS("md-level-10"); 2421MODULE_ALIAS("md-level-10");
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index d29215d966da..15348c393b5d 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -50,6 +50,7 @@
50#include <linux/async.h> 50#include <linux/async.h>
51#include <linux/seq_file.h> 51#include <linux/seq_file.h>
52#include <linux/cpu.h> 52#include <linux/cpu.h>
53#include <linux/slab.h>
53#include "md.h" 54#include "md.h"
54#include "raid5.h" 55#include "raid5.h"
55#include "bitmap.h" 56#include "bitmap.h"
@@ -1526,7 +1527,7 @@ static void raid5_end_read_request(struct bio * bi, int error)
1526 1527
1527 clear_bit(R5_UPTODATE, &sh->dev[i].flags); 1528 clear_bit(R5_UPTODATE, &sh->dev[i].flags);
1528 atomic_inc(&rdev->read_errors); 1529 atomic_inc(&rdev->read_errors);
1529 if (conf->mddev->degraded) 1530 if (conf->mddev->degraded >= conf->max_degraded)
1530 printk_rl(KERN_WARNING 1531 printk_rl(KERN_WARNING
1531 "raid5:%s: read error not correctable " 1532 "raid5:%s: read error not correctable "
1532 "(sector %llu on %s).\n", 1533 "(sector %llu on %s).\n",
@@ -1649,8 +1650,8 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector,
1649 int previous, int *dd_idx, 1650 int previous, int *dd_idx,
1650 struct stripe_head *sh) 1651 struct stripe_head *sh)
1651{ 1652{
1652 long stripe; 1653 sector_t stripe, stripe2;
1653 unsigned long chunk_number; 1654 sector_t chunk_number;
1654 unsigned int chunk_offset; 1655 unsigned int chunk_offset;
1655 int pd_idx, qd_idx; 1656 int pd_idx, qd_idx;
1656 int ddf_layout = 0; 1657 int ddf_layout = 0;
@@ -1670,18 +1671,13 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector,
1670 */ 1671 */
1671 chunk_offset = sector_div(r_sector, sectors_per_chunk); 1672 chunk_offset = sector_div(r_sector, sectors_per_chunk);
1672 chunk_number = r_sector; 1673 chunk_number = r_sector;
1673 BUG_ON(r_sector != chunk_number);
1674 1674
1675 /* 1675 /*
1676 * Compute the stripe number 1676 * Compute the stripe number
1677 */ 1677 */
1678 stripe = chunk_number / data_disks; 1678 stripe = chunk_number;
1679 1679 *dd_idx = sector_div(stripe, data_disks);
1680 /* 1680 stripe2 = stripe;
1681 * Compute the data disk and parity disk indexes inside the stripe
1682 */
1683 *dd_idx = chunk_number % data_disks;
1684
1685 /* 1681 /*
1686 * Select the parity disk based on the user selected algorithm. 1682 * Select the parity disk based on the user selected algorithm.
1687 */ 1683 */
@@ -1693,21 +1689,21 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector,
1693 case 5: 1689 case 5:
1694 switch (algorithm) { 1690 switch (algorithm) {
1695 case ALGORITHM_LEFT_ASYMMETRIC: 1691 case ALGORITHM_LEFT_ASYMMETRIC:
1696 pd_idx = data_disks - stripe % raid_disks; 1692 pd_idx = data_disks - sector_div(stripe2, raid_disks);
1697 if (*dd_idx >= pd_idx) 1693 if (*dd_idx >= pd_idx)
1698 (*dd_idx)++; 1694 (*dd_idx)++;
1699 break; 1695 break;
1700 case ALGORITHM_RIGHT_ASYMMETRIC: 1696 case ALGORITHM_RIGHT_ASYMMETRIC:
1701 pd_idx = stripe % raid_disks; 1697 pd_idx = sector_div(stripe2, raid_disks);
1702 if (*dd_idx >= pd_idx) 1698 if (*dd_idx >= pd_idx)
1703 (*dd_idx)++; 1699 (*dd_idx)++;
1704 break; 1700 break;
1705 case ALGORITHM_LEFT_SYMMETRIC: 1701 case ALGORITHM_LEFT_SYMMETRIC:
1706 pd_idx = data_disks - stripe % raid_disks; 1702 pd_idx = data_disks - sector_div(stripe2, raid_disks);
1707 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; 1703 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
1708 break; 1704 break;
1709 case ALGORITHM_RIGHT_SYMMETRIC: 1705 case ALGORITHM_RIGHT_SYMMETRIC:
1710 pd_idx = stripe % raid_disks; 1706 pd_idx = sector_div(stripe2, raid_disks);
1711 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; 1707 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
1712 break; 1708 break;
1713 case ALGORITHM_PARITY_0: 1709 case ALGORITHM_PARITY_0:
@@ -1727,7 +1723,7 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector,
1727 1723
1728 switch (algorithm) { 1724 switch (algorithm) {
1729 case ALGORITHM_LEFT_ASYMMETRIC: 1725 case ALGORITHM_LEFT_ASYMMETRIC:
1730 pd_idx = raid_disks - 1 - (stripe % raid_disks); 1726 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
1731 qd_idx = pd_idx + 1; 1727 qd_idx = pd_idx + 1;
1732 if (pd_idx == raid_disks-1) { 1728 if (pd_idx == raid_disks-1) {
1733 (*dd_idx)++; /* Q D D D P */ 1729 (*dd_idx)++; /* Q D D D P */
@@ -1736,7 +1732,7 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector,
1736 (*dd_idx) += 2; /* D D P Q D */ 1732 (*dd_idx) += 2; /* D D P Q D */
1737 break; 1733 break;
1738 case ALGORITHM_RIGHT_ASYMMETRIC: 1734 case ALGORITHM_RIGHT_ASYMMETRIC:
1739 pd_idx = stripe % raid_disks; 1735 pd_idx = sector_div(stripe2, raid_disks);
1740 qd_idx = pd_idx + 1; 1736 qd_idx = pd_idx + 1;
1741 if (pd_idx == raid_disks-1) { 1737 if (pd_idx == raid_disks-1) {
1742 (*dd_idx)++; /* Q D D D P */ 1738 (*dd_idx)++; /* Q D D D P */
@@ -1745,12 +1741,12 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector,
1745 (*dd_idx) += 2; /* D D P Q D */ 1741 (*dd_idx) += 2; /* D D P Q D */
1746 break; 1742 break;
1747 case ALGORITHM_LEFT_SYMMETRIC: 1743 case ALGORITHM_LEFT_SYMMETRIC:
1748 pd_idx = raid_disks - 1 - (stripe % raid_disks); 1744 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
1749 qd_idx = (pd_idx + 1) % raid_disks; 1745 qd_idx = (pd_idx + 1) % raid_disks;
1750 *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks; 1746 *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks;
1751 break; 1747 break;
1752 case ALGORITHM_RIGHT_SYMMETRIC: 1748 case ALGORITHM_RIGHT_SYMMETRIC:
1753 pd_idx = stripe % raid_disks; 1749 pd_idx = sector_div(stripe2, raid_disks);
1754 qd_idx = (pd_idx + 1) % raid_disks; 1750 qd_idx = (pd_idx + 1) % raid_disks;
1755 *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks; 1751 *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks;
1756 break; 1752 break;
@@ -1769,7 +1765,7 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector,
1769 /* Exactly the same as RIGHT_ASYMMETRIC, but or 1765 /* Exactly the same as RIGHT_ASYMMETRIC, but or
1770 * of blocks for computing Q is different. 1766 * of blocks for computing Q is different.
1771 */ 1767 */
1772 pd_idx = stripe % raid_disks; 1768 pd_idx = sector_div(stripe2, raid_disks);
1773 qd_idx = pd_idx + 1; 1769 qd_idx = pd_idx + 1;
1774 if (pd_idx == raid_disks-1) { 1770 if (pd_idx == raid_disks-1) {
1775 (*dd_idx)++; /* Q D D D P */ 1771 (*dd_idx)++; /* Q D D D P */
@@ -1784,7 +1780,8 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector,
1784 * D D D P Q rather than 1780 * D D D P Q rather than
1785 * Q D D D P 1781 * Q D D D P
1786 */ 1782 */
1787 pd_idx = raid_disks - 1 - ((stripe + 1) % raid_disks); 1783 stripe2 += 1;
1784 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
1788 qd_idx = pd_idx + 1; 1785 qd_idx = pd_idx + 1;
1789 if (pd_idx == raid_disks-1) { 1786 if (pd_idx == raid_disks-1) {
1790 (*dd_idx)++; /* Q D D D P */ 1787 (*dd_idx)++; /* Q D D D P */
@@ -1796,7 +1793,7 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector,
1796 1793
1797 case ALGORITHM_ROTATING_N_CONTINUE: 1794 case ALGORITHM_ROTATING_N_CONTINUE:
1798 /* Same as left_symmetric but Q is before P */ 1795 /* Same as left_symmetric but Q is before P */
1799 pd_idx = raid_disks - 1 - (stripe % raid_disks); 1796 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
1800 qd_idx = (pd_idx + raid_disks - 1) % raid_disks; 1797 qd_idx = (pd_idx + raid_disks - 1) % raid_disks;
1801 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; 1798 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
1802 ddf_layout = 1; 1799 ddf_layout = 1;
@@ -1804,27 +1801,27 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector,
1804 1801
1805 case ALGORITHM_LEFT_ASYMMETRIC_6: 1802 case ALGORITHM_LEFT_ASYMMETRIC_6:
1806 /* RAID5 left_asymmetric, with Q on last device */ 1803 /* RAID5 left_asymmetric, with Q on last device */
1807 pd_idx = data_disks - stripe % (raid_disks-1); 1804 pd_idx = data_disks - sector_div(stripe2, raid_disks-1);
1808 if (*dd_idx >= pd_idx) 1805 if (*dd_idx >= pd_idx)
1809 (*dd_idx)++; 1806 (*dd_idx)++;
1810 qd_idx = raid_disks - 1; 1807 qd_idx = raid_disks - 1;
1811 break; 1808 break;
1812 1809
1813 case ALGORITHM_RIGHT_ASYMMETRIC_6: 1810 case ALGORITHM_RIGHT_ASYMMETRIC_6:
1814 pd_idx = stripe % (raid_disks-1); 1811 pd_idx = sector_div(stripe2, raid_disks-1);
1815 if (*dd_idx >= pd_idx) 1812 if (*dd_idx >= pd_idx)
1816 (*dd_idx)++; 1813 (*dd_idx)++;
1817 qd_idx = raid_disks - 1; 1814 qd_idx = raid_disks - 1;
1818 break; 1815 break;
1819 1816
1820 case ALGORITHM_LEFT_SYMMETRIC_6: 1817 case ALGORITHM_LEFT_SYMMETRIC_6:
1821 pd_idx = data_disks - stripe % (raid_disks-1); 1818 pd_idx = data_disks - sector_div(stripe2, raid_disks-1);
1822 *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1); 1819 *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1);
1823 qd_idx = raid_disks - 1; 1820 qd_idx = raid_disks - 1;
1824 break; 1821 break;
1825 1822
1826 case ALGORITHM_RIGHT_SYMMETRIC_6: 1823 case ALGORITHM_RIGHT_SYMMETRIC_6:
1827 pd_idx = stripe % (raid_disks-1); 1824 pd_idx = sector_div(stripe2, raid_disks-1);
1828 *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1); 1825 *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1);
1829 qd_idx = raid_disks - 1; 1826 qd_idx = raid_disks - 1;
1830 break; 1827 break;
@@ -1869,14 +1866,14 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous)
1869 : conf->algorithm; 1866 : conf->algorithm;
1870 sector_t stripe; 1867 sector_t stripe;
1871 int chunk_offset; 1868 int chunk_offset;
1872 int chunk_number, dummy1, dd_idx = i; 1869 sector_t chunk_number;
1870 int dummy1, dd_idx = i;
1873 sector_t r_sector; 1871 sector_t r_sector;
1874 struct stripe_head sh2; 1872 struct stripe_head sh2;
1875 1873
1876 1874
1877 chunk_offset = sector_div(new_sector, sectors_per_chunk); 1875 chunk_offset = sector_div(new_sector, sectors_per_chunk);
1878 stripe = new_sector; 1876 stripe = new_sector;
1879 BUG_ON(new_sector != stripe);
1880 1877
1881 if (i == sh->pd_idx) 1878 if (i == sh->pd_idx)
1882 return 0; 1879 return 0;
@@ -1969,7 +1966,7 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous)
1969 } 1966 }
1970 1967
1971 chunk_number = stripe * data_disks + i; 1968 chunk_number = stripe * data_disks + i;
1972 r_sector = (sector_t)chunk_number * sectors_per_chunk + chunk_offset; 1969 r_sector = chunk_number * sectors_per_chunk + chunk_offset;
1973 1970
1974 check = raid5_compute_sector(conf, r_sector, 1971 check = raid5_compute_sector(conf, r_sector,
1975 previous, &dummy1, &sh2); 1972 previous, &dummy1, &sh2);
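
The raid5_compute_sector()/compute_blocknr() hunks above widen stripe and chunk_number to sector_t and replace every '%' on them with sector_div(): plain 64-bit division is not available to 32-bit kernel builds, and the old 32-bit arithmetic could not describe arrays needing more than 32 bits of chunks (the removed BUG_ON guarded exactly that case). sector_div(n, base) divides n in place and returns the remainder, which is why the code now keeps the scratch copy stripe2. A small user-space model of that contract; sector_div_model() is a stand-in, not the kernel macro:

    #include <stdint.h>
    #include <stdio.h>

    typedef uint64_t sector_t;

    /* Divide in place and hand back the remainder, like the kernel's
     * sector_div(); here we can simply use C's 64-bit operators, which the
     * kernel proper cannot rely on for 32-bit builds. */
    static uint32_t sector_div_model(sector_t *dividend, uint32_t divisor)
    {
            uint32_t rem = (uint32_t)(*dividend % divisor);

            *dividend /= divisor;
            return rem;
    }

    int main(void)
    {
            sector_t stripe2 = 10000000000ULL;   /* far more stripes than fit in 32 bits */
            uint32_t raid_disks = 6;
            /* pd_idx as computed for ALGORITHM_LEFT_ASYMMETRIC above */
            uint32_t pd_idx = raid_disks - 1 - sector_div_model(&stripe2, raid_disks);

            printf("pd_idx=%u, stripe2 now %llu\n",
                   pd_idx, (unsigned long long)stripe2);
            return 0;
    }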
@@ -2947,6 +2944,7 @@ static void handle_stripe5(struct stripe_head *sh)
2947 struct r5dev *dev; 2944 struct r5dev *dev;
2948 mdk_rdev_t *blocked_rdev = NULL; 2945 mdk_rdev_t *blocked_rdev = NULL;
2949 int prexor; 2946 int prexor;
2947 int dec_preread_active = 0;
2950 2948
2951 memset(&s, 0, sizeof(s)); 2949 memset(&s, 0, sizeof(s));
2952 pr_debug("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d check:%d " 2950 pr_debug("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d check:%d "
@@ -3096,12 +3094,8 @@ static void handle_stripe5(struct stripe_head *sh)
3096 set_bit(STRIPE_INSYNC, &sh->state); 3094 set_bit(STRIPE_INSYNC, &sh->state);
3097 } 3095 }
3098 } 3096 }
3099 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { 3097 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
3100 atomic_dec(&conf->preread_active_stripes); 3098 dec_preread_active = 1;
3101 if (atomic_read(&conf->preread_active_stripes) <
3102 IO_THRESHOLD)
3103 md_wakeup_thread(conf->mddev->thread);
3104 }
3105 } 3099 }
3106 3100
3107 /* Now to consider new write requests and what else, if anything 3101 /* Now to consider new write requests and what else, if anything
@@ -3208,6 +3202,16 @@ static void handle_stripe5(struct stripe_head *sh)
3208 3202
3209 ops_run_io(sh, &s); 3203 ops_run_io(sh, &s);
3210 3204
3205 if (dec_preread_active) {
3206 /* We delay this until after ops_run_io so that if make_request
3207 * is waiting on a barrier, it won't continue until the writes
3208 * have actually been submitted.
3209 */
3210 atomic_dec(&conf->preread_active_stripes);
3211 if (atomic_read(&conf->preread_active_stripes) <
3212 IO_THRESHOLD)
3213 md_wakeup_thread(conf->mddev->thread);
3214 }
3211 return_io(return_bi); 3215 return_io(return_bi);
3212} 3216}
3213 3217
@@ -3221,6 +3225,7 @@ static void handle_stripe6(struct stripe_head *sh)
3221 struct r6_state r6s; 3225 struct r6_state r6s;
3222 struct r5dev *dev, *pdev, *qdev; 3226 struct r5dev *dev, *pdev, *qdev;
3223 mdk_rdev_t *blocked_rdev = NULL; 3227 mdk_rdev_t *blocked_rdev = NULL;
3228 int dec_preread_active = 0;
3224 3229
3225 pr_debug("handling stripe %llu, state=%#lx cnt=%d, " 3230 pr_debug("handling stripe %llu, state=%#lx cnt=%d, "
3226 "pd_idx=%d, qd_idx=%d\n, check:%d, reconstruct:%d\n", 3231 "pd_idx=%d, qd_idx=%d\n, check:%d, reconstruct:%d\n",
@@ -3358,7 +3363,6 @@ static void handle_stripe6(struct stripe_head *sh)
3358 * completed 3363 * completed
3359 */ 3364 */
3360 if (sh->reconstruct_state == reconstruct_state_drain_result) { 3365 if (sh->reconstruct_state == reconstruct_state_drain_result) {
3361 int qd_idx = sh->qd_idx;
3362 3366
3363 sh->reconstruct_state = reconstruct_state_idle; 3367 sh->reconstruct_state = reconstruct_state_idle;
3364 /* All the 'written' buffers and the parity blocks are ready to 3368 /* All the 'written' buffers and the parity blocks are ready to
@@ -3380,12 +3384,8 @@ static void handle_stripe6(struct stripe_head *sh)
3380 set_bit(STRIPE_INSYNC, &sh->state); 3384 set_bit(STRIPE_INSYNC, &sh->state);
3381 } 3385 }
3382 } 3386 }
3383 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { 3387 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
3384 atomic_dec(&conf->preread_active_stripes); 3388 dec_preread_active = 1;
3385 if (atomic_read(&conf->preread_active_stripes) <
3386 IO_THRESHOLD)
3387 md_wakeup_thread(conf->mddev->thread);
3388 }
3389 } 3389 }
3390 3390
3391 /* Now to consider new write requests and what else, if anything 3391 /* Now to consider new write requests and what else, if anything
@@ -3494,6 +3494,18 @@ static void handle_stripe6(struct stripe_head *sh)
3494 3494
3495 ops_run_io(sh, &s); 3495 ops_run_io(sh, &s);
3496 3496
3497
3498 if (dec_preread_active) {
3499 /* We delay this until after ops_run_io so that if make_request
3500 * is waiting on a barrier, it won't continue until the writes
3501 * have actually been submitted.
3502 */
3503 atomic_dec(&conf->preread_active_stripes);
3504 if (atomic_read(&conf->preread_active_stripes) <
3505 IO_THRESHOLD)
3506 md_wakeup_thread(conf->mddev->thread);
3507 }
3508
3497 return_io(return_bi); 3509 return_io(return_bi);
3498} 3510}
3499 3511
@@ -3724,7 +3736,7 @@ static int bio_fits_rdev(struct bio *bi)
3724 if ((bi->bi_size>>9) > queue_max_sectors(q)) 3736 if ((bi->bi_size>>9) > queue_max_sectors(q))
3725 return 0; 3737 return 0;
3726 blk_recount_segments(q, bi); 3738 blk_recount_segments(q, bi);
3727 if (bi->bi_phys_segments > queue_max_phys_segments(q)) 3739 if (bi->bi_phys_segments > queue_max_segments(q))
3728 return 0; 3740 return 0;
3729 3741
3730 if (q->merge_bvec_fn) 3742 if (q->merge_bvec_fn)
@@ -3741,7 +3753,7 @@ static int chunk_aligned_read(struct request_queue *q, struct bio * raid_bio)
3741{ 3753{
3742 mddev_t *mddev = q->queuedata; 3754 mddev_t *mddev = q->queuedata;
3743 raid5_conf_t *conf = mddev->private; 3755 raid5_conf_t *conf = mddev->private;
3744 unsigned int dd_idx; 3756 int dd_idx;
3745 struct bio* align_bi; 3757 struct bio* align_bi;
3746 mdk_rdev_t *rdev; 3758 mdk_rdev_t *rdev;
3747 3759
@@ -3866,7 +3878,13 @@ static int make_request(struct request_queue *q, struct bio * bi)
3866 int cpu, remaining; 3878 int cpu, remaining;
3867 3879
3868 if (unlikely(bio_rw_flagged(bi, BIO_RW_BARRIER))) { 3880 if (unlikely(bio_rw_flagged(bi, BIO_RW_BARRIER))) {
3869 bio_endio(bi, -EOPNOTSUPP); 3881 /* Drain all pending writes. We only really need
3882 * to ensure they have been submitted, but this is
3883 * easier.
3884 */
3885 mddev->pers->quiesce(mddev, 1);
3886 mddev->pers->quiesce(mddev, 0);
3887 md_barrier_request(mddev, bi);
3870 return 0; 3888 return 0;
3871 } 3889 }
3872 3890
@@ -3990,6 +4008,9 @@ static int make_request(struct request_queue *q, struct bio * bi)
3990 finish_wait(&conf->wait_for_overlap, &w); 4008 finish_wait(&conf->wait_for_overlap, &w);
3991 set_bit(STRIPE_HANDLE, &sh->state); 4009 set_bit(STRIPE_HANDLE, &sh->state);
3992 clear_bit(STRIPE_DELAYED, &sh->state); 4010 clear_bit(STRIPE_DELAYED, &sh->state);
4011 if (mddev->barrier &&
4012 !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
4013 atomic_inc(&conf->preread_active_stripes);
3993 release_stripe(sh); 4014 release_stripe(sh);
3994 } else { 4015 } else {
3995 /* cannot get stripe for read-ahead, just give-up */ 4016 /* cannot get stripe for read-ahead, just give-up */
@@ -4009,6 +4030,14 @@ static int make_request(struct request_queue *q, struct bio * bi)
4009 4030
4010 bio_endio(bi, 0); 4031 bio_endio(bi, 0);
4011 } 4032 }
4033
4034 if (mddev->barrier) {
4035 /* We need to wait for the stripes to all be handled.
4036 * So: wait for preread_active_stripes to drop to 0.
4037 */
4038 wait_event(mddev->thread->wqueue,
4039 atomic_read(&conf->preread_active_stripes) == 0);
4040 }
4012 return 0; 4041 return 0;
4013} 4042}
4014 4043
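
raid5 cannot simply queue a barrier bio the way the simpler personalities do, so the hunks above combine three pieces: make_request() first quiesces and resumes the array (the in-line comment concedes this drains more than is strictly needed, but it is simple) before handing the empty barrier to md_barrier_request(); stripes written while a barrier is pending are marked STRIPE_PREREAD_ACTIVE so the final wait_event() holds the caller until they drain; and handle_stripe5/6 (earlier hunks) defer their atomic_dec() of preread_active_stripes until after ops_run_io(), so the count cannot reach zero before the writes have actually been submitted. In outline, using only symbols that appear in this diff:

    /* Sketch of the raid5 barrier flow added above; not a complete function. */
    if (unlikely(bio_rw_flagged(bi, BIO_RW_BARRIER))) {
            mddev->pers->quiesce(mddev, 1);    /* drain every pending stripe */
            mddev->pers->quiesce(mddev, 0);
            md_barrier_request(mddev, bi);     /* md core orders the barrier itself */
            return 0;
    }
    /* ... per-stripe write path ... */
    if (mddev->barrier &&
        !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
            atomic_inc(&conf->preread_active_stripes);
    /* ... after all stripes have been queued ... */
    if (mddev->barrier)
            wait_event(mddev->thread->wqueue,
                       atomic_read(&conf->preread_active_stripes) == 0);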
@@ -4648,7 +4677,7 @@ static int raid5_alloc_percpu(raid5_conf_t *conf)
4648{ 4677{
4649 unsigned long cpu; 4678 unsigned long cpu;
4650 struct page *spare_page; 4679 struct page *spare_page;
4651 struct raid5_percpu *allcpus; 4680 struct raid5_percpu __percpu *allcpus;
4652 void *scribble; 4681 void *scribble;
4653 int err; 4682 int err;
4654 4683
@@ -5104,9 +5133,8 @@ static int stop(mddev_t *mddev)
5104 mddev->thread = NULL; 5133 mddev->thread = NULL;
5105 mddev->queue->backing_dev_info.congested_fn = NULL; 5134 mddev->queue->backing_dev_info.congested_fn = NULL;
5106 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ 5135 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
5107 sysfs_remove_group(&mddev->kobj, &raid5_attrs_group);
5108 free_conf(conf); 5136 free_conf(conf);
5109 mddev->private = NULL; 5137 mddev->private = &raid5_attrs_group;
5110 return 0; 5138 return 0;
5111} 5139}
5112 5140
@@ -5432,11 +5460,11 @@ static int raid5_start_reshape(mddev_t *mddev)
5432 !test_bit(Faulty, &rdev->flags)) { 5460 !test_bit(Faulty, &rdev->flags)) {
5433 if (raid5_add_disk(mddev, rdev) == 0) { 5461 if (raid5_add_disk(mddev, rdev) == 0) {
5434 char nm[20]; 5462 char nm[20];
5435 if (rdev->raid_disk >= conf->previous_raid_disks) 5463 if (rdev->raid_disk >= conf->previous_raid_disks) {
5436 set_bit(In_sync, &rdev->flags); 5464 set_bit(In_sync, &rdev->flags);
5437 else 5465 added_devices++;
5466 } else
5438 rdev->recovery_offset = 0; 5467 rdev->recovery_offset = 0;
5439 added_devices++;
5440 sprintf(nm, "rd%d", rdev->raid_disk); 5468 sprintf(nm, "rd%d", rdev->raid_disk);
5441 if (sysfs_create_link(&mddev->kobj, 5469 if (sysfs_create_link(&mddev->kobj,
5442 &rdev->kobj, nm)) 5470 &rdev->kobj, nm))
@@ -5448,9 +5476,12 @@ static int raid5_start_reshape(mddev_t *mddev)
5448 break; 5476 break;
5449 } 5477 }
5450 5478
5479 /* When a reshape changes the number of devices, ->degraded
5480 * is measured against the larger of the pre and post number of
5481 * devices.*/
5451 if (mddev->delta_disks > 0) { 5482 if (mddev->delta_disks > 0) {
5452 spin_lock_irqsave(&conf->device_lock, flags); 5483 spin_lock_irqsave(&conf->device_lock, flags);
5453 mddev->degraded = (conf->raid_disks - conf->previous_raid_disks) 5484 mddev->degraded += (conf->raid_disks - conf->previous_raid_disks)
5454 - added_devices; 5485 - added_devices;
5455 spin_unlock_irqrestore(&conf->device_lock, flags); 5486 spin_unlock_irqrestore(&conf->device_lock, flags);
5456 } 5487 }
@@ -5860,6 +5891,7 @@ static void raid5_exit(void)
5860module_init(raid5_init); 5891module_init(raid5_init);
5861module_exit(raid5_exit); 5892module_exit(raid5_exit);
5862MODULE_LICENSE("GPL"); 5893MODULE_LICENSE("GPL");
5894MODULE_DESCRIPTION("RAID4/5/6 (striping with parity) personality for MD");
5863MODULE_ALIAS("md-personality-4"); /* RAID5 */ 5895MODULE_ALIAS("md-personality-4"); /* RAID5 */
5864MODULE_ALIAS("md-raid5"); 5896MODULE_ALIAS("md-raid5");
5865MODULE_ALIAS("md-raid4"); 5897MODULE_ALIAS("md-raid4");
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index dd708359b451..0f86f5e36724 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -405,7 +405,7 @@ struct raid5_private_data {
405 * lists and performing address 405 * lists and performing address
406 * conversions 406 * conversions
407 */ 407 */
408 } *percpu; 408 } __percpu *percpu;
409 size_t scribble_len; /* size of scribble region must be 409 size_t scribble_len; /* size of scribble region must be
410 * associated with conf to handle 410 * associated with conf to handle
411 * cpu hotplug while reshaping 411 * cpu hotplug while reshaping
diff --git a/drivers/md/raid6algos.c b/drivers/md/raid6algos.c
index 866215ac7f25..1f8784bfd44d 100644
--- a/drivers/md/raid6algos.c
+++ b/drivers/md/raid6algos.c
@@ -17,6 +17,7 @@
17 */ 17 */
18 18
19#include <linux/raid/pq.h> 19#include <linux/raid/pq.h>
20#include <linux/gfp.h>
20#ifndef __KERNEL__ 21#ifndef __KERNEL__
21#include <sys/mman.h> 22#include <sys/mman.h>
22#include <stdio.h> 23#include <stdio.h>
@@ -31,25 +32,6 @@ EXPORT_SYMBOL(raid6_empty_zero_page);
31struct raid6_calls raid6_call; 32struct raid6_calls raid6_call;
32EXPORT_SYMBOL_GPL(raid6_call); 33EXPORT_SYMBOL_GPL(raid6_call);
33 34
34/* Various routine sets */
35extern const struct raid6_calls raid6_intx1;
36extern const struct raid6_calls raid6_intx2;
37extern const struct raid6_calls raid6_intx4;
38extern const struct raid6_calls raid6_intx8;
39extern const struct raid6_calls raid6_intx16;
40extern const struct raid6_calls raid6_intx32;
41extern const struct raid6_calls raid6_mmxx1;
42extern const struct raid6_calls raid6_mmxx2;
43extern const struct raid6_calls raid6_sse1x1;
44extern const struct raid6_calls raid6_sse1x2;
45extern const struct raid6_calls raid6_sse2x1;
46extern const struct raid6_calls raid6_sse2x2;
47extern const struct raid6_calls raid6_sse2x4;
48extern const struct raid6_calls raid6_altivec1;
49extern const struct raid6_calls raid6_altivec2;
50extern const struct raid6_calls raid6_altivec4;
51extern const struct raid6_calls raid6_altivec8;
52
53const struct raid6_calls * const raid6_algos[] = { 35const struct raid6_calls * const raid6_algos[] = {
54 &raid6_intx1, 36 &raid6_intx1,
55 &raid6_intx2, 37 &raid6_intx2,
@@ -169,3 +151,4 @@ static void raid6_exit(void)
169subsys_initcall(raid6_select_algo); 151subsys_initcall(raid6_select_algo);
170module_exit(raid6_exit); 152module_exit(raid6_exit);
171MODULE_LICENSE("GPL"); 153MODULE_LICENSE("GPL");
154MODULE_DESCRIPTION("RAID6 Q-syndrome calculations");