author     Glenn Elliott <gelliott@cs.unc.edu>  2012-03-04 19:47:13 -0500
committer  Glenn Elliott <gelliott@cs.unc.edu>  2012-03-04 19:47:13 -0500
commit     c71c03bda1e86c9d5198c5d83f712e695c4f2a1e (patch)
tree       ecb166cb3e2b7e2adb3b5e292245fefd23381ac8 /drivers/md
parent     ea53c912f8a86a8567697115b6a0d8152beee5c8 (diff)
parent     6a00f206debf8a5c8899055726ad127dbeeed098 (diff)

Merge branch 'mpi-master' into wip-k-fmlp
Conflicts: litmus/sched_cedf.c
Diffstat (limited to 'drivers/md')
-rw-r--r--  drivers/md/Kconfig                      |  30
-rw-r--r--  drivers/md/Makefile                     |   2
-rw-r--r--  drivers/md/bitmap.c                     | 165
-rw-r--r--  drivers/md/bitmap.h                     |  16
-rw-r--r--  drivers/md/dm-crypt.c                   | 626
-rw-r--r--  drivers/md/dm-delay.c                   |   2
-rw-r--r--  drivers/md/dm-flakey.c                  | 212
-rw-r--r--  drivers/md/dm-io.c                      |  49
-rw-r--r--  drivers/md/dm-ioctl.c                   | 158
-rw-r--r--  drivers/md/dm-kcopyd.c                  | 176
-rw-r--r--  drivers/md/dm-log-userspace-base.c      | 139
-rw-r--r--  drivers/md/dm-log-userspace-transfer.c  |   3
-rw-r--r--  drivers/md/dm-log.c                     |  17
-rw-r--r--  drivers/md/dm-mpath.c                   | 104
-rw-r--r--  drivers/md/dm-raid.c                    | 689
-rw-r--r--  drivers/md/dm-raid1.c                   |  37
-rw-r--r--  drivers/md/dm-region-hash.c             |  18
-rw-r--r--  drivers/md/dm-snap-persistent.c         |  21
-rw-r--r--  drivers/md/dm-snap.c                    |  82
-rw-r--r--  drivers/md/dm-stripe.c                  |  50
-rw-r--r--  drivers/md/dm-table.c                   | 207
-rw-r--r--  drivers/md/dm.c                         | 477
-rw-r--r--  drivers/md/dm.h                         |   2
-rw-r--r--  drivers/md/faulty.c                     |   4
-rw-r--r--  drivers/md/linear.c                     |  25
-rw-r--r--  drivers/md/md.c                         | 787
-rw-r--r--  drivers/md/md.h                         |  78
-rw-r--r--  drivers/md/multipath.c                  | 103
-rw-r--r--  drivers/md/multipath.h                  |   1
-rw-r--r--  drivers/md/raid0.c                      |  65
-rw-r--r--  drivers/md/raid1.c                      | 920
-rw-r--r--  drivers/md/raid1.h                      |  10
-rw-r--r--  drivers/md/raid10.c                     | 603
-rw-r--r--  drivers/md/raid10.h                     |   4
-rw-r--r--  drivers/md/raid5.c                      | 282
-rw-r--r--  drivers/md/raid5.h                      |   5
36 files changed, 3682 insertions(+), 2487 deletions(-)
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index bf1a95e31559..8420129fc5ee 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -240,6 +240,30 @@ config DM_MIRROR
           Allow volume managers to mirror logical volumes, also
           needed for live data migration tools such as 'pvmove'.
 
+config DM_RAID
+        tristate "RAID 4/5/6 target (EXPERIMENTAL)"
+        depends on BLK_DEV_DM && EXPERIMENTAL
+        select MD_RAID456
+        select BLK_DEV_MD
+        ---help---
+          A dm target that supports RAID4, RAID5 and RAID6 mappings
+
+          A RAID-5 set of N drives with a capacity of C MB per drive provides
+          the capacity of C * (N - 1) MB, and protects against a failure
+          of a single drive. For a given sector (row) number, (N - 1) drives
+          contain data sectors, and one drive contains the parity protection.
+          For a RAID-4 set, the parity blocks are present on a single drive,
+          while a RAID-5 set distributes the parity across the drives in one
+          of the available parity distribution methods.
+
+          A RAID-6 set of N drives with a capacity of C MB per drive
+          provides the capacity of C * (N - 2) MB, and protects
+          against a failure of any two drives. For a given sector
+          (row) number, (N - 2) drives contain data sectors, and two
+          drives contains two independent redundancy syndromes. Like
+          RAID-5, RAID-6 distributes the syndromes across the drives
+          in one of the available parity distribution methods.
+
 config DM_LOG_USERSPACE
         tristate "Mirror userspace logging (EXPERIMENTAL)"
         depends on DM_MIRROR && EXPERIMENTAL && NET
@@ -303,4 +327,10 @@ config DM_UEVENT
         ---help---
           Generate udev events for DM events.
 
+config DM_FLAKEY
+        tristate "Flakey target (EXPERIMENTAL)"
+        depends on BLK_DEV_DM && EXPERIMENTAL
+        ---help---
+          A target that intermittently fails I/O for debugging purposes.
+
 endif # MD
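
The DM_RAID help text above reduces to a simple capacity rule: with N drives of C MB each, RAID-5 keeps one drive's worth of parity (usable capacity C * (N - 1) MB) and RAID-6 keeps two syndromes (C * (N - 2) MB). A minimal standalone C sketch of that arithmetic (illustrative only, not part of the patch):

#include <stdio.h>

/* Usable capacity per the DM_RAID help text: RAID-5 keeps one parity block
 * per stripe, RAID-6 keeps two redundancy syndromes.
 */
static unsigned long long usable_mb(unsigned n_drives, unsigned long long c_mb,
                                    unsigned parity_drives)
{
    if (n_drives <= parity_drives)
        return 0;
    return c_mb * (n_drives - parity_drives);
}

int main(void)
{
    /* Example: five 1000 MB drives */
    printf("RAID-5: %llu MB usable\n", usable_mb(5, 1000, 1));
    printf("RAID-6: %llu MB usable\n", usable_mb(5, 1000, 2));
    return 0;
}

With five 1000 MB drives this prints 4000 MB for RAID-5 and 3000 MB for RAID-6, matching the help text.
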
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 5e3aac41919d..448838b1f92a 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -29,6 +29,7 @@ obj-$(CONFIG_BLK_DEV_MD) += md-mod.o
 obj-$(CONFIG_BLK_DEV_DM) += dm-mod.o
 obj-$(CONFIG_DM_CRYPT) += dm-crypt.o
 obj-$(CONFIG_DM_DELAY) += dm-delay.o
+obj-$(CONFIG_DM_FLAKEY) += dm-flakey.o
 obj-$(CONFIG_DM_MULTIPATH) += dm-multipath.o dm-round-robin.o
 obj-$(CONFIG_DM_MULTIPATH_QL) += dm-queue-length.o
 obj-$(CONFIG_DM_MULTIPATH_ST) += dm-service-time.o
@@ -36,6 +37,7 @@ obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o
 obj-$(CONFIG_DM_MIRROR) += dm-mirror.o dm-log.o dm-region-hash.o
 obj-$(CONFIG_DM_LOG_USERSPACE) += dm-log-userspace.o
 obj-$(CONFIG_DM_ZERO) += dm-zero.o
+obj-$(CONFIG_DM_RAID) += dm-raid.o
 
 ifeq ($(CONFIG_DM_UEVENT),y)
 dm-mod-objs += dm-uevent.o
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index e4fb58db5454..574b09afedd3 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -210,11 +210,11 @@ static struct page *read_sb_page(mddev_t *mddev, loff_t offset,
210 || test_bit(Faulty, &rdev->flags)) 210 || test_bit(Faulty, &rdev->flags))
211 continue; 211 continue;
212 212
213 target = rdev->sb_start + offset + index * (PAGE_SIZE/512); 213 target = offset + index * (PAGE_SIZE/512);
214 214
215 if (sync_page_io(rdev->bdev, target, 215 if (sync_page_io(rdev, target,
216 roundup(size, bdev_logical_block_size(rdev->bdev)), 216 roundup(size, bdev_logical_block_size(rdev->bdev)),
217 page, READ)) { 217 page, READ, true)) {
218 page->index = index; 218 page->index = index;
219 attach_page_buffers(page, NULL); /* so that free_buffer will 219 attach_page_buffers(page, NULL); /* so that free_buffer will
220 * quietly no-op */ 220 * quietly no-op */
@@ -264,14 +264,18 @@ static mdk_rdev_t *next_active_rdev(mdk_rdev_t *rdev, mddev_t *mddev)
264static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait) 264static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
265{ 265{
266 mdk_rdev_t *rdev = NULL; 266 mdk_rdev_t *rdev = NULL;
267 struct block_device *bdev;
267 mddev_t *mddev = bitmap->mddev; 268 mddev_t *mddev = bitmap->mddev;
268 269
269 while ((rdev = next_active_rdev(rdev, mddev)) != NULL) { 270 while ((rdev = next_active_rdev(rdev, mddev)) != NULL) {
270 int size = PAGE_SIZE; 271 int size = PAGE_SIZE;
271 loff_t offset = mddev->bitmap_info.offset; 272 loff_t offset = mddev->bitmap_info.offset;
273
274 bdev = (rdev->meta_bdev) ? rdev->meta_bdev : rdev->bdev;
275
272 if (page->index == bitmap->file_pages-1) 276 if (page->index == bitmap->file_pages-1)
273 size = roundup(bitmap->last_page_size, 277 size = roundup(bitmap->last_page_size,
274 bdev_logical_block_size(rdev->bdev)); 278 bdev_logical_block_size(bdev));
275 /* Just make sure we aren't corrupting data or 279 /* Just make sure we aren't corrupting data or
276 * metadata 280 * metadata
277 */ 281 */
@@ -343,7 +347,7 @@ static void write_page(struct bitmap *bitmap, struct page *page, int wait)
343 atomic_inc(&bitmap->pending_writes); 347 atomic_inc(&bitmap->pending_writes);
344 set_buffer_locked(bh); 348 set_buffer_locked(bh);
345 set_buffer_mapped(bh); 349 set_buffer_mapped(bh);
346 submit_bh(WRITE, bh); 350 submit_bh(WRITE | REQ_SYNC, bh);
347 bh = bh->b_this_page; 351 bh = bh->b_this_page;
348 } 352 }
349 353
@@ -489,11 +493,11 @@ void bitmap_update_sb(struct bitmap *bitmap)
489 spin_unlock_irqrestore(&bitmap->lock, flags); 493 spin_unlock_irqrestore(&bitmap->lock, flags);
490 sb = kmap_atomic(bitmap->sb_page, KM_USER0); 494 sb = kmap_atomic(bitmap->sb_page, KM_USER0);
491 sb->events = cpu_to_le64(bitmap->mddev->events); 495 sb->events = cpu_to_le64(bitmap->mddev->events);
492 if (bitmap->mddev->events < bitmap->events_cleared) { 496 if (bitmap->mddev->events < bitmap->events_cleared)
493 /* rocking back to read-only */ 497 /* rocking back to read-only */
494 bitmap->events_cleared = bitmap->mddev->events; 498 bitmap->events_cleared = bitmap->mddev->events;
495 sb->events_cleared = cpu_to_le64(bitmap->events_cleared); 499 sb->events_cleared = cpu_to_le64(bitmap->events_cleared);
496 } 500 sb->state = cpu_to_le32(bitmap->flags);
497 /* Just in case these have been changed via sysfs: */ 501 /* Just in case these have been changed via sysfs: */
498 sb->daemon_sleep = cpu_to_le32(bitmap->mddev->bitmap_info.daemon_sleep/HZ); 502 sb->daemon_sleep = cpu_to_le32(bitmap->mddev->bitmap_info.daemon_sleep/HZ);
499 sb->write_behind = cpu_to_le32(bitmap->mddev->bitmap_info.max_write_behind); 503 sb->write_behind = cpu_to_le32(bitmap->mddev->bitmap_info.max_write_behind);
@@ -530,6 +534,82 @@ void bitmap_print_sb(struct bitmap *bitmap)
530 kunmap_atomic(sb, KM_USER0); 534 kunmap_atomic(sb, KM_USER0);
531} 535}
532 536
537/*
538 * bitmap_new_disk_sb
539 * @bitmap
540 *
541 * This function is somewhat the reverse of bitmap_read_sb. bitmap_read_sb
542 * reads and verifies the on-disk bitmap superblock and populates bitmap_info.
543 * This function verifies 'bitmap_info' and populates the on-disk bitmap
544 * structure, which is to be written to disk.
545 *
546 * Returns: 0 on success, -Exxx on error
547 */
548static int bitmap_new_disk_sb(struct bitmap *bitmap)
549{
550 bitmap_super_t *sb;
551 unsigned long chunksize, daemon_sleep, write_behind;
552 int err = -EINVAL;
553
554 bitmap->sb_page = alloc_page(GFP_KERNEL);
555 if (IS_ERR(bitmap->sb_page)) {
556 err = PTR_ERR(bitmap->sb_page);
557 bitmap->sb_page = NULL;
558 return err;
559 }
560 bitmap->sb_page->index = 0;
561
562 sb = kmap_atomic(bitmap->sb_page, KM_USER0);
563
564 sb->magic = cpu_to_le32(BITMAP_MAGIC);
565 sb->version = cpu_to_le32(BITMAP_MAJOR_HI);
566
567 chunksize = bitmap->mddev->bitmap_info.chunksize;
568 BUG_ON(!chunksize);
569 if (!is_power_of_2(chunksize)) {
570 kunmap_atomic(sb, KM_USER0);
571 printk(KERN_ERR "bitmap chunksize not a power of 2\n");
572 return -EINVAL;
573 }
574 sb->chunksize = cpu_to_le32(chunksize);
575
576 daemon_sleep = bitmap->mddev->bitmap_info.daemon_sleep;
577 if (!daemon_sleep ||
578 (daemon_sleep < 1) || (daemon_sleep > MAX_SCHEDULE_TIMEOUT)) {
579 printk(KERN_INFO "Choosing daemon_sleep default (5 sec)\n");
580 daemon_sleep = 5 * HZ;
581 }
582 sb->daemon_sleep = cpu_to_le32(daemon_sleep);
583 bitmap->mddev->bitmap_info.daemon_sleep = daemon_sleep;
584
585 /*
586 * FIXME: write_behind for RAID1. If not specified, what
587 * is a good choice? We choose COUNTER_MAX / 2 arbitrarily.
588 */
589 write_behind = bitmap->mddev->bitmap_info.max_write_behind;
590 if (write_behind > COUNTER_MAX)
591 write_behind = COUNTER_MAX / 2;
592 sb->write_behind = cpu_to_le32(write_behind);
593 bitmap->mddev->bitmap_info.max_write_behind = write_behind;
594
595 /* keep the array size field of the bitmap superblock up to date */
596 sb->sync_size = cpu_to_le64(bitmap->mddev->resync_max_sectors);
597
598 memcpy(sb->uuid, bitmap->mddev->uuid, 16);
599
600 bitmap->flags |= BITMAP_STALE;
601 sb->state |= cpu_to_le32(BITMAP_STALE);
602 bitmap->events_cleared = bitmap->mddev->events;
603 sb->events_cleared = cpu_to_le64(bitmap->mddev->events);
604
605 bitmap->flags |= BITMAP_HOSTENDIAN;
606 sb->version = cpu_to_le32(BITMAP_MAJOR_HOSTENDIAN);
607
608 kunmap_atomic(sb, KM_USER0);
609
610 return 0;
611}
612
533/* read the superblock from the bitmap file and initialize some bitmap fields */ 613/* read the superblock from the bitmap file and initialize some bitmap fields */
534static int bitmap_read_sb(struct bitmap *bitmap) 614static int bitmap_read_sb(struct bitmap *bitmap)
535{ 615{
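
The new bitmap_new_disk_sb() above validates bitmap_info before writing a fresh superblock: the chunk size must be a power of two, an unusable daemon_sleep falls back to a 5 second default, and write_behind is halved back into range if it exceeds the counter maximum. A standalone sketch of that validation (HZ, COUNTER_MAX and the upper sleep bound are stand-in values here, not the kernel's definitions):

#include <stdio.h>
#include <stdbool.h>

#define HZ              1000    /* illustrative tick rate */
#define COUNTER_MAX     16383   /* assumed write-behind counter limit */
#define MAX_SLEEP_TICKS (24UL * 60 * 60 * HZ)   /* stand-in for MAX_SCHEDULE_TIMEOUT */

static bool is_power_of_2(unsigned long n)
{
    return n != 0 && (n & (n - 1)) == 0;
}

/* Mirrors the sanity checks in the bitmap_new_disk_sb() hunk above. */
static int validate_bitmap_info(unsigned long chunksize,
                                unsigned long *daemon_sleep,
                                unsigned long *write_behind)
{
    if (!is_power_of_2(chunksize)) {
        fprintf(stderr, "bitmap chunksize not a power of 2\n");
        return -1;
    }
    if (*daemon_sleep < 1 || *daemon_sleep > MAX_SLEEP_TICKS)
        *daemon_sleep = 5 * HZ;             /* the patch's 5 second default */
    if (*write_behind > COUNTER_MAX)
        *write_behind = COUNTER_MAX / 2;    /* arbitrary midpoint, as in the patch */
    return 0;
}

int main(void)
{
    unsigned long sleep = 0, write_behind = 100000;

    if (!validate_bitmap_info(4096, &sleep, &write_behind))
        printf("daemon_sleep=%lu ticks, write_behind=%lu\n", sleep, write_behind);
    return 0;
}
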
@@ -571,7 +651,7 @@ static int bitmap_read_sb(struct bitmap *bitmap)
571 reason = "unrecognized superblock version"; 651 reason = "unrecognized superblock version";
572 else if (chunksize < 512) 652 else if (chunksize < 512)
573 reason = "bitmap chunksize too small"; 653 reason = "bitmap chunksize too small";
574 else if ((1 << ffz(~chunksize)) != chunksize) 654 else if (!is_power_of_2(chunksize))
575 reason = "bitmap chunksize not a power of 2"; 655 reason = "bitmap chunksize not a power of 2";
576 else if (daemon_sleep < 1 || daemon_sleep > MAX_SCHEDULE_TIMEOUT) 656 else if (daemon_sleep < 1 || daemon_sleep > MAX_SCHEDULE_TIMEOUT)
577 reason = "daemon sleep period out of range"; 657 reason = "daemon sleep period out of range";
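
The hunk above replaces the open-coded test `(1 << ffz(~chunksize)) != chunksize` with `!is_power_of_2(chunksize)`. For a non-zero value, ffz(~x) is the index of the lowest set bit of x, so shifting 1 back up reproduces x exactly when x has a single bit set, which is what is_power_of_2() checks. A standalone sketch that exercises the equivalence (ffz(~x) emulated with a count-trailing-zeros builtin; illustrative only):

#include <assert.h>
#include <stdio.h>
#include <stdbool.h>

/* ffz(~x) is the index of the lowest set bit of x (x must be non-zero here) */
static unsigned ffz_of_not(unsigned long x)
{
    return __builtin_ctzl(x);
}

static bool is_power_of_2(unsigned long n)
{
    return n != 0 && (n & (n - 1)) == 0;
}

int main(void)
{
    for (unsigned long x = 1; x < (1UL << 20); x++) {
        bool old_way = ((1UL << ffz_of_not(x)) == x);

        assert(old_way == is_power_of_2(x));
    }
    printf("old and new power-of-2 tests agree\n");
    return 0;
}
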
@@ -614,7 +694,7 @@ success:
614 if (le32_to_cpu(sb->version) == BITMAP_MAJOR_HOSTENDIAN) 694 if (le32_to_cpu(sb->version) == BITMAP_MAJOR_HOSTENDIAN)
615 bitmap->flags |= BITMAP_HOSTENDIAN; 695 bitmap->flags |= BITMAP_HOSTENDIAN;
616 bitmap->events_cleared = le64_to_cpu(sb->events_cleared); 696 bitmap->events_cleared = le64_to_cpu(sb->events_cleared);
617 if (sb->state & cpu_to_le32(BITMAP_STALE)) 697 if (bitmap->flags & BITMAP_STALE)
618 bitmap->events_cleared = bitmap->mddev->events; 698 bitmap->events_cleared = bitmap->mddev->events;
619 err = 0; 699 err = 0;
620out: 700out:
@@ -648,9 +728,11 @@ static int bitmap_mask_state(struct bitmap *bitmap, enum bitmap_state bits,
648 switch (op) { 728 switch (op) {
649 case MASK_SET: 729 case MASK_SET:
650 sb->state |= cpu_to_le32(bits); 730 sb->state |= cpu_to_le32(bits);
731 bitmap->flags |= bits;
651 break; 732 break;
652 case MASK_UNSET: 733 case MASK_UNSET:
653 sb->state &= cpu_to_le32(~bits); 734 sb->state &= cpu_to_le32(~bits);
735 bitmap->flags &= ~bits;
654 break; 736 break;
655 default: 737 default:
656 BUG(); 738 BUG();
@@ -850,7 +932,7 @@ static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block)
850 if (bitmap->flags & BITMAP_HOSTENDIAN) 932 if (bitmap->flags & BITMAP_HOSTENDIAN)
851 set_bit(bit, kaddr); 933 set_bit(bit, kaddr);
852 else 934 else
853 ext2_set_bit(bit, kaddr); 935 __test_and_set_bit_le(bit, kaddr);
854 kunmap_atomic(kaddr, KM_USER0); 936 kunmap_atomic(kaddr, KM_USER0);
855 PRINTK("set file bit %lu page %lu\n", bit, page->index); 937 PRINTK("set file bit %lu page %lu\n", bit, page->index);
856 } 938 }
@@ -1046,7 +1128,7 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
1046 if (bitmap->flags & BITMAP_HOSTENDIAN) 1128 if (bitmap->flags & BITMAP_HOSTENDIAN)
1047 b = test_bit(bit, paddr); 1129 b = test_bit(bit, paddr);
1048 else 1130 else
1049 b = ext2_test_bit(bit, paddr); 1131 b = test_bit_le(bit, paddr);
1050 kunmap_atomic(paddr, KM_USER0); 1132 kunmap_atomic(paddr, KM_USER0);
1051 if (b) { 1133 if (b) {
1052 /* if the disk bit is set, set the memory bit */ 1134 /* if the disk bit is set, set the memory bit */
@@ -1070,8 +1152,8 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
1070 } 1152 }
1071 1153
1072 printk(KERN_INFO "%s: bitmap initialized from disk: " 1154 printk(KERN_INFO "%s: bitmap initialized from disk: "
1073 "read %lu/%lu pages, set %lu bits\n", 1155 "read %lu/%lu pages, set %lu of %lu bits\n",
1074 bmname(bitmap), bitmap->file_pages, num_pages, bit_cnt); 1156 bmname(bitmap), bitmap->file_pages, num_pages, bit_cnt, chunks);
1075 1157
1076 return 0; 1158 return 0;
1077 1159
@@ -1101,7 +1183,7 @@ static void bitmap_count_page(struct bitmap *bitmap, sector_t offset, int inc)
1101 bitmap_checkfree(bitmap, page); 1183 bitmap_checkfree(bitmap, page);
1102} 1184}
1103static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap, 1185static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap,
1104 sector_t offset, int *blocks, 1186 sector_t offset, sector_t *blocks,
1105 int create); 1187 int create);
1106 1188
1107/* 1189/*
@@ -1115,7 +1197,7 @@ void bitmap_daemon_work(mddev_t *mddev)
1115 unsigned long j; 1197 unsigned long j;
1116 unsigned long flags; 1198 unsigned long flags;
1117 struct page *page = NULL, *lastpage = NULL; 1199 struct page *page = NULL, *lastpage = NULL;
1118 int blocks; 1200 sector_t blocks;
1119 void *paddr; 1201 void *paddr;
1120 struct dm_dirty_log *log = mddev->bitmap_info.log; 1202 struct dm_dirty_log *log = mddev->bitmap_info.log;
1121 1203
@@ -1222,7 +1304,7 @@ void bitmap_daemon_work(mddev_t *mddev)
1222 clear_bit(file_page_offset(bitmap, j), 1304 clear_bit(file_page_offset(bitmap, j),
1223 paddr); 1305 paddr);
1224 else 1306 else
1225 ext2_clear_bit(file_page_offset(bitmap, j), 1307 __test_and_clear_bit_le(file_page_offset(bitmap, j),
1226 paddr); 1308 paddr);
1227 kunmap_atomic(paddr, KM_USER0); 1309 kunmap_atomic(paddr, KM_USER0);
1228 } else 1310 } else
@@ -1258,7 +1340,7 @@ void bitmap_daemon_work(mddev_t *mddev)
1258} 1340}
1259 1341
1260static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap, 1342static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap,
1261 sector_t offset, int *blocks, 1343 sector_t offset, sector_t *blocks,
1262 int create) 1344 int create)
1263__releases(bitmap->lock) 1345__releases(bitmap->lock)
1264__acquires(bitmap->lock) 1346__acquires(bitmap->lock)
@@ -1316,7 +1398,7 @@ int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sect
1316 } 1398 }
1317 1399
1318 while (sectors) { 1400 while (sectors) {
1319 int blocks; 1401 sector_t blocks;
1320 bitmap_counter_t *bmc; 1402 bitmap_counter_t *bmc;
1321 1403
1322 spin_lock_irq(&bitmap->lock); 1404 spin_lock_irq(&bitmap->lock);
@@ -1326,7 +1408,7 @@ int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sect
1326 return 0; 1408 return 0;
1327 } 1409 }
1328 1410
1329 if (unlikely((*bmc & COUNTER_MAX) == COUNTER_MAX)) { 1411 if (unlikely(COUNTER(*bmc) == COUNTER_MAX)) {
1330 DEFINE_WAIT(__wait); 1412 DEFINE_WAIT(__wait);
1331 /* note that it is safe to do the prepare_to_wait 1413 /* note that it is safe to do the prepare_to_wait
1332 * after the test as long as we do it before dropping 1414 * after the test as long as we do it before dropping
@@ -1335,8 +1417,7 @@ int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sect
1335 prepare_to_wait(&bitmap->overflow_wait, &__wait, 1417 prepare_to_wait(&bitmap->overflow_wait, &__wait,
1336 TASK_UNINTERRUPTIBLE); 1418 TASK_UNINTERRUPTIBLE);
1337 spin_unlock_irq(&bitmap->lock); 1419 spin_unlock_irq(&bitmap->lock);
1338 md_unplug(bitmap->mddev); 1420 io_schedule();
1339 schedule();
1340 finish_wait(&bitmap->overflow_wait, &__wait); 1421 finish_wait(&bitmap->overflow_wait, &__wait);
1341 continue; 1422 continue;
1342 } 1423 }
@@ -1381,7 +1462,7 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto
1381 success = 0; 1462 success = 0;
1382 1463
1383 while (sectors) { 1464 while (sectors) {
1384 int blocks; 1465 sector_t blocks;
1385 unsigned long flags; 1466 unsigned long flags;
1386 bitmap_counter_t *bmc; 1467 bitmap_counter_t *bmc;
1387 1468
@@ -1399,10 +1480,10 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto
1399 sysfs_notify_dirent_safe(bitmap->sysfs_can_clear); 1480 sysfs_notify_dirent_safe(bitmap->sysfs_can_clear);
1400 } 1481 }
1401 1482
1402 if (!success && ! (*bmc & NEEDED_MASK)) 1483 if (!success && !NEEDED(*bmc))
1403 *bmc |= NEEDED_MASK; 1484 *bmc |= NEEDED_MASK;
1404 1485
1405 if ((*bmc & COUNTER_MAX) == COUNTER_MAX) 1486 if (COUNTER(*bmc) == COUNTER_MAX)
1406 wake_up(&bitmap->overflow_wait); 1487 wake_up(&bitmap->overflow_wait);
1407 1488
1408 (*bmc)--; 1489 (*bmc)--;
@@ -1423,7 +1504,7 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto
1423} 1504}
1424EXPORT_SYMBOL(bitmap_endwrite); 1505EXPORT_SYMBOL(bitmap_endwrite);
1425 1506
1426static int __bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks, 1507static int __bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks,
1427 int degraded) 1508 int degraded)
1428{ 1509{
1429 bitmap_counter_t *bmc; 1510 bitmap_counter_t *bmc;
@@ -1452,7 +1533,7 @@ static int __bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *bloc
1452 return rv; 1533 return rv;
1453} 1534}
1454 1535
1455int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks, 1536int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks,
1456 int degraded) 1537 int degraded)
1457{ 1538{
1458 /* bitmap_start_sync must always report on multiples of whole 1539 /* bitmap_start_sync must always report on multiples of whole
@@ -1463,7 +1544,7 @@ int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks,
1463 * Return the 'or' of the result. 1544 * Return the 'or' of the result.
1464 */ 1545 */
1465 int rv = 0; 1546 int rv = 0;
1466 int blocks1; 1547 sector_t blocks1;
1467 1548
1468 *blocks = 0; 1549 *blocks = 0;
1469 while (*blocks < (PAGE_SIZE>>9)) { 1550 while (*blocks < (PAGE_SIZE>>9)) {
@@ -1476,7 +1557,7 @@ int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks,
1476} 1557}
1477EXPORT_SYMBOL(bitmap_start_sync); 1558EXPORT_SYMBOL(bitmap_start_sync);
1478 1559
1479void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, int *blocks, int aborted) 1560void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int aborted)
1480{ 1561{
1481 bitmap_counter_t *bmc; 1562 bitmap_counter_t *bmc;
1482 unsigned long flags; 1563 unsigned long flags;
@@ -1515,7 +1596,7 @@ void bitmap_close_sync(struct bitmap *bitmap)
1515 * RESYNC bit wherever it is still on 1596 * RESYNC bit wherever it is still on
1516 */ 1597 */
1517 sector_t sector = 0; 1598 sector_t sector = 0;
1518 int blocks; 1599 sector_t blocks;
1519 if (!bitmap) 1600 if (!bitmap)
1520 return; 1601 return;
1521 while (sector < bitmap->mddev->resync_max_sectors) { 1602 while (sector < bitmap->mddev->resync_max_sectors) {
@@ -1528,7 +1609,7 @@ EXPORT_SYMBOL(bitmap_close_sync);
1528void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector) 1609void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector)
1529{ 1610{
1530 sector_t s = 0; 1611 sector_t s = 0;
1531 int blocks; 1612 sector_t blocks;
1532 1613
1533 if (!bitmap) 1614 if (!bitmap)
1534 return; 1615 return;
@@ -1542,7 +1623,7 @@ void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector)
1542 wait_event(bitmap->mddev->recovery_wait, 1623 wait_event(bitmap->mddev->recovery_wait,
1543 atomic_read(&bitmap->mddev->recovery_active) == 0); 1624 atomic_read(&bitmap->mddev->recovery_active) == 0);
1544 1625
1545 bitmap->mddev->curr_resync_completed = bitmap->mddev->curr_resync; 1626 bitmap->mddev->curr_resync_completed = sector;
1546 set_bit(MD_CHANGE_CLEAN, &bitmap->mddev->flags); 1627 set_bit(MD_CHANGE_CLEAN, &bitmap->mddev->flags);
1547 sector &= ~((1ULL << CHUNK_BLOCK_SHIFT(bitmap)) - 1); 1628 sector &= ~((1ULL << CHUNK_BLOCK_SHIFT(bitmap)) - 1);
1548 s = 0; 1629 s = 0;
@@ -1562,7 +1643,7 @@ static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int n
1562 * be 0 at this point 1643 * be 0 at this point
1563 */ 1644 */
1564 1645
1565 int secs; 1646 sector_t secs;
1566 bitmap_counter_t *bmc; 1647 bitmap_counter_t *bmc;
1567 spin_lock_irq(&bitmap->lock); 1648 spin_lock_irq(&bitmap->lock);
1568 bmc = bitmap_get_counter(bitmap, offset, &secs, 1); 1649 bmc = bitmap_get_counter(bitmap, offset, &secs, 1);
@@ -1723,9 +1804,16 @@ int bitmap_create(mddev_t *mddev)
1723 vfs_fsync(file, 1); 1804 vfs_fsync(file, 1);
1724 } 1805 }
1725 /* read superblock from bitmap file (this sets mddev->bitmap_info.chunksize) */ 1806 /* read superblock from bitmap file (this sets mddev->bitmap_info.chunksize) */
1726 if (!mddev->bitmap_info.external) 1807 if (!mddev->bitmap_info.external) {
1727 err = bitmap_read_sb(bitmap); 1808 /*
1728 else { 1809 * If 'MD_ARRAY_FIRST_USE' is set, then device-mapper is
1810 * instructing us to create a new on-disk bitmap instance.
1811 */
1812 if (test_and_clear_bit(MD_ARRAY_FIRST_USE, &mddev->flags))
1813 err = bitmap_new_disk_sb(bitmap);
1814 else
1815 err = bitmap_read_sb(bitmap);
1816 } else {
1729 err = 0; 1817 err = 0;
1730 if (mddev->bitmap_info.chunksize == 0 || 1818 if (mddev->bitmap_info.chunksize == 0 ||
1731 mddev->bitmap_info.daemon_sleep == 0) 1819 mddev->bitmap_info.daemon_sleep == 0)
@@ -1749,9 +1837,6 @@ int bitmap_create(mddev_t *mddev)
1749 bitmap->chunks = chunks; 1837 bitmap->chunks = chunks;
1750 bitmap->pages = pages; 1838 bitmap->pages = pages;
1751 bitmap->missing_pages = pages; 1839 bitmap->missing_pages = pages;
1752 bitmap->counter_bits = COUNTER_BITS;
1753
1754 bitmap->syncchunk = ~0UL;
1755 1840
1756#ifdef INJECT_FATAL_FAULT_1 1841#ifdef INJECT_FATAL_FAULT_1
1757 bitmap->bp = NULL; 1842 bitmap->bp = NULL;
@@ -1790,7 +1875,7 @@ int bitmap_load(mddev_t *mddev)
1790 * All chunks should be clean, but some might need_sync. 1875 * All chunks should be clean, but some might need_sync.
1791 */ 1876 */
1792 while (sector < mddev->resync_max_sectors) { 1877 while (sector < mddev->resync_max_sectors) {
1793 int blocks; 1878 sector_t blocks;
1794 bitmap_start_sync(bitmap, sector, &blocks, 0); 1879 bitmap_start_sync(bitmap, sector, &blocks, 0);
1795 sector += blocks; 1880 sector += blocks;
1796 } 1881 }
diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h
index e872a7bad6b8..b2a127e891ac 100644
--- a/drivers/md/bitmap.h
+++ b/drivers/md/bitmap.h
@@ -45,7 +45,7 @@
  *
  * The counter counts pending write requests, plus the on-disk bit.
  * When the counter is '1' and the resync bits are clear, the on-disk
- * bit can be cleared aswell, thus setting the counter to 0.
+ * bit can be cleared as well, thus setting the counter to 0.
  * When we set a bit, or in the counter (to start a write), if the fields is
  * 0, we first set the disk bit and set the counter to 1.
  *
@@ -85,7 +85,6 @@
 typedef __u16 bitmap_counter_t;
 #define COUNTER_BITS 16
 #define COUNTER_BIT_SHIFT 4
-#define COUNTER_BYTE_RATIO (COUNTER_BITS / 8)
 #define COUNTER_BYTE_SHIFT (COUNTER_BIT_SHIFT - 3)
 
 #define NEEDED_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 1)))
@@ -196,19 +195,10 @@ struct bitmap {
 
         mddev_t *mddev; /* the md device that the bitmap is for */
 
-        int counter_bits; /* how many bits per block counter */
-
         /* bitmap chunksize -- how much data does each bit represent? */
         unsigned long chunkshift; /* chunksize = 2^chunkshift (for bitops) */
         unsigned long chunks; /* total number of data chunks for the array */
 
-        /* We hold a count on the chunk currently being synced, and drop
-         * it when the last block is started. If the resync is aborted
-         * midway, we need to be able to drop that count, so we remember
-         * the counted chunk..
-         */
-        unsigned long syncchunk;
-
         __u64 events_cleared;
         int need_sync;
 
@@ -271,8 +261,8 @@ int bitmap_startwrite(struct bitmap *bitmap, sector_t offset,
                       unsigned long sectors, int behind);
 void bitmap_endwrite(struct bitmap *bitmap, sector_t offset,
                      unsigned long sectors, int success, int behind);
-int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks, int degraded);
-void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, int *blocks, int aborted);
+int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int degraded);
+void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int aborted);
 void bitmap_close_sync(struct bitmap *bitmap);
 void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector);
 
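
The bitmap.h comment above describes a 16-bit per-chunk counter that tracks pending writes plus the on-disk bit, and the earlier bitmap.c hunks switch from masking `*bmc & COUNTER_MAX` to the COUNTER()/NEEDED() helpers. A standalone sketch of the presumed bit layout, one flag bit for "needed", one for "resync", and a 14-bit write counter; the two lower masks are an assumption based on NEEDED_MASK above, not copied from the header:

#include <stdio.h>
#include <stdint.h>

typedef uint16_t bitmap_counter_t;

#define COUNTER_BITS 16
#define NEEDED_MASK  ((bitmap_counter_t)(1 << (COUNTER_BITS - 1)))
#define RESYNC_MASK  ((bitmap_counter_t)(1 << (COUNTER_BITS - 2)))  /* assumed layout */
#define COUNTER_MAX  ((bitmap_counter_t)(RESYNC_MASK - 1))          /* assumed: low 14 bits */

#define NEEDED(x)  ((x) & NEEDED_MASK)
#define RESYNC(x)  ((x) & RESYNC_MASK)
#define COUNTER(x) ((x) & COUNTER_MAX)

int main(void)
{
    /* chunk flagged as needing writeout, with 3 writes currently in flight */
    bitmap_counter_t bmc = NEEDED_MASK | 3;

    printf("needed=%d resync=%d pending=%u (max %u)\n",
           !!NEEDED(bmc), !!RESYNC(bmc),
           (unsigned)COUNTER(bmc), (unsigned)COUNTER_MAX);
    /* bitmap_startwrite() blocks on overflow_wait when COUNTER(*bmc) == COUNTER_MAX */
    return 0;
}
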
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 368e8e98f705..c8827ffd85bb 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -18,10 +18,14 @@
18#include <linux/crypto.h> 18#include <linux/crypto.h>
19#include <linux/workqueue.h> 19#include <linux/workqueue.h>
20#include <linux/backing-dev.h> 20#include <linux/backing-dev.h>
21#include <linux/percpu.h>
21#include <asm/atomic.h> 22#include <asm/atomic.h>
22#include <linux/scatterlist.h> 23#include <linux/scatterlist.h>
23#include <asm/page.h> 24#include <asm/page.h>
24#include <asm/unaligned.h> 25#include <asm/unaligned.h>
26#include <crypto/hash.h>
27#include <crypto/md5.h>
28#include <crypto/algapi.h>
25 29
26#include <linux/device-mapper.h> 30#include <linux/device-mapper.h>
27 31
@@ -63,6 +67,7 @@ struct dm_crypt_request {
63 struct convert_context *ctx; 67 struct convert_context *ctx;
64 struct scatterlist sg_in; 68 struct scatterlist sg_in;
65 struct scatterlist sg_out; 69 struct scatterlist sg_out;
70 sector_t iv_sector;
66}; 71};
67 72
68struct crypt_config; 73struct crypt_config;
@@ -73,11 +78,13 @@ struct crypt_iv_operations {
73 void (*dtr)(struct crypt_config *cc); 78 void (*dtr)(struct crypt_config *cc);
74 int (*init)(struct crypt_config *cc); 79 int (*init)(struct crypt_config *cc);
75 int (*wipe)(struct crypt_config *cc); 80 int (*wipe)(struct crypt_config *cc);
76 int (*generator)(struct crypt_config *cc, u8 *iv, sector_t sector); 81 int (*generator)(struct crypt_config *cc, u8 *iv,
82 struct dm_crypt_request *dmreq);
83 int (*post)(struct crypt_config *cc, u8 *iv,
84 struct dm_crypt_request *dmreq);
77}; 85};
78 86
79struct iv_essiv_private { 87struct iv_essiv_private {
80 struct crypto_cipher *tfm;
81 struct crypto_hash *hash_tfm; 88 struct crypto_hash *hash_tfm;
82 u8 *salt; 89 u8 *salt;
83}; 90};
@@ -86,11 +93,32 @@ struct iv_benbi_private {
86 int shift; 93 int shift;
87}; 94};
88 95
96#define LMK_SEED_SIZE 64 /* hash + 0 */
97struct iv_lmk_private {
98 struct crypto_shash *hash_tfm;
99 u8 *seed;
100};
101
89/* 102/*
90 * Crypt: maps a linear range of a block device 103 * Crypt: maps a linear range of a block device
91 * and encrypts / decrypts at the same time. 104 * and encrypts / decrypts at the same time.
92 */ 105 */
93enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID }; 106enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID };
107
108/*
109 * Duplicated per-CPU state for cipher.
110 */
111struct crypt_cpu {
112 struct ablkcipher_request *req;
113 /* ESSIV: struct crypto_cipher *essiv_tfm */
114 void *iv_private;
115 struct crypto_ablkcipher *tfms[0];
116};
117
118/*
119 * The fields in here must be read only after initialization,
120 * changing state should be in crypt_cpu.
121 */
94struct crypt_config { 122struct crypt_config {
95 struct dm_dev *dev; 123 struct dm_dev *dev;
96 sector_t start; 124 sector_t start;
@@ -108,17 +136,25 @@ struct crypt_config {
108 struct workqueue_struct *crypt_queue; 136 struct workqueue_struct *crypt_queue;
109 137
110 char *cipher; 138 char *cipher;
111 char *cipher_mode; 139 char *cipher_string;
112 140
113 struct crypt_iv_operations *iv_gen_ops; 141 struct crypt_iv_operations *iv_gen_ops;
114 union { 142 union {
115 struct iv_essiv_private essiv; 143 struct iv_essiv_private essiv;
116 struct iv_benbi_private benbi; 144 struct iv_benbi_private benbi;
145 struct iv_lmk_private lmk;
117 } iv_gen_private; 146 } iv_gen_private;
118 sector_t iv_offset; 147 sector_t iv_offset;
119 unsigned int iv_size; 148 unsigned int iv_size;
120 149
121 /* 150 /*
151 * Duplicated per cpu state. Access through
152 * per_cpu_ptr() only.
153 */
154 struct crypt_cpu __percpu *cpu;
155 unsigned tfms_count;
156
157 /*
122 * Layout of each crypto request: 158 * Layout of each crypto request:
123 * 159 *
124 * struct ablkcipher_request 160 * struct ablkcipher_request
@@ -132,11 +168,10 @@ struct crypt_config {
132 * correctly aligned. 168 * correctly aligned.
133 */ 169 */
134 unsigned int dmreq_start; 170 unsigned int dmreq_start;
135 struct ablkcipher_request *req;
136 171
137 struct crypto_ablkcipher *tfm;
138 unsigned long flags; 172 unsigned long flags;
139 unsigned int key_size; 173 unsigned int key_size;
174 unsigned int key_parts;
140 u8 key[0]; 175 u8 key[0];
141}; 176};
142 177
@@ -148,6 +183,20 @@ static struct kmem_cache *_crypt_io_pool;
148 183
149static void clone_init(struct dm_crypt_io *, struct bio *); 184static void clone_init(struct dm_crypt_io *, struct bio *);
150static void kcryptd_queue_crypt(struct dm_crypt_io *io); 185static void kcryptd_queue_crypt(struct dm_crypt_io *io);
186static u8 *iv_of_dmreq(struct crypt_config *cc, struct dm_crypt_request *dmreq);
187
188static struct crypt_cpu *this_crypt_config(struct crypt_config *cc)
189{
190 return this_cpu_ptr(cc->cpu);
191}
192
193/*
194 * Use this to access cipher attributes that are the same for each CPU.
195 */
196static struct crypto_ablkcipher *any_tfm(struct crypt_config *cc)
197{
198 return __this_cpu_ptr(cc->cpu)->tfms[0];
199}
151 200
152/* 201/*
153 * Different IV generation algorithms: 202 * Different IV generation algorithms:
@@ -168,23 +217,38 @@ static void kcryptd_queue_crypt(struct dm_crypt_io *io);
168 * null: the initial vector is always zero. Provides compatibility with 217 * null: the initial vector is always zero. Provides compatibility with
169 * obsolete loop_fish2 devices. Do not use for new devices. 218 * obsolete loop_fish2 devices. Do not use for new devices.
170 * 219 *
220 * lmk: Compatible implementation of the block chaining mode used
221 * by the Loop-AES block device encryption system
222 * designed by Jari Ruusu. See http://loop-aes.sourceforge.net/
223 * It operates on full 512 byte sectors and uses CBC
224 * with an IV derived from the sector number, the data and
225 * optionally extra IV seed.
226 * This means that after decryption the first block
227 * of sector must be tweaked according to decrypted data.
228 * Loop-AES can use three encryption schemes:
229 * version 1: is plain aes-cbc mode
230 * version 2: uses 64 multikey scheme with lmk IV generator
231 * version 3: the same as version 2 with additional IV seed
232 * (it uses 65 keys, last key is used as IV seed)
233 *
171 * plumb: unimplemented, see: 234 * plumb: unimplemented, see:
172 * http://article.gmane.org/gmane.linux.kernel.device-mapper.dm-crypt/454 235 * http://article.gmane.org/gmane.linux.kernel.device-mapper.dm-crypt/454
173 */ 236 */
174 237
175static int crypt_iv_plain_gen(struct crypt_config *cc, u8 *iv, sector_t sector) 238static int crypt_iv_plain_gen(struct crypt_config *cc, u8 *iv,
239 struct dm_crypt_request *dmreq)
176{ 240{
177 memset(iv, 0, cc->iv_size); 241 memset(iv, 0, cc->iv_size);
178 *(u32 *)iv = cpu_to_le32(sector & 0xffffffff); 242 *(u32 *)iv = cpu_to_le32(dmreq->iv_sector & 0xffffffff);
179 243
180 return 0; 244 return 0;
181} 245}
182 246
183static int crypt_iv_plain64_gen(struct crypt_config *cc, u8 *iv, 247static int crypt_iv_plain64_gen(struct crypt_config *cc, u8 *iv,
184 sector_t sector) 248 struct dm_crypt_request *dmreq)
185{ 249{
186 memset(iv, 0, cc->iv_size); 250 memset(iv, 0, cc->iv_size);
187 *(u64 *)iv = cpu_to_le64(sector); 251 *(u64 *)iv = cpu_to_le64(dmreq->iv_sector);
188 252
189 return 0; 253 return 0;
190} 254}
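
With the IV generators above now taking the whole dm_crypt_request, plain and plain64 read the sector from dmreq->iv_sector: both zero the IV and store the sector little-endian, 32 bits for plain and the full 64 bits for plain64 (which is why plain64 exists for devices larger than 2^32 sectors). A standalone userspace sketch of what the two generators compute (not the dm-crypt code itself):

#include <stdio.h>
#include <string.h>
#include <stdint.h>

/* store a value little-endian, byte by byte, so the sketch is endian-neutral */
static void put_le(uint8_t *dst, uint64_t v, int bytes)
{
    for (int i = 0; i < bytes; i++)
        dst[i] = (uint8_t)(v >> (8 * i));
}

/* "plain": low 32 bits of the sector, little-endian, rest of the IV zero */
static void iv_plain(uint8_t *iv, unsigned iv_size, uint64_t sector)
{
    memset(iv, 0, iv_size);
    put_le(iv, sector & 0xffffffff, 4);
}

/* "plain64": full 64-bit sector, little-endian, rest zero */
static void iv_plain64(uint8_t *iv, unsigned iv_size, uint64_t sector)
{
    memset(iv, 0, iv_size);
    put_le(iv, sector, 8);
}

int main(void)
{
    uint8_t iv[16];
    uint64_t sector = 0x123456789ULL;   /* offset needing more than 32 bits */

    iv_plain(iv, sizeof(iv), sector);
    printf("plain   first byte 0x%02x (upper 32 bits lost)\n", iv[0]);
    iv_plain64(iv, sizeof(iv), sector);
    printf("plain64 fifth byte 0x%02x (upper bits kept)\n", iv[4]);
    return 0;
}
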
@@ -195,7 +259,8 @@ static int crypt_iv_essiv_init(struct crypt_config *cc)
195 struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv; 259 struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv;
196 struct hash_desc desc; 260 struct hash_desc desc;
197 struct scatterlist sg; 261 struct scatterlist sg;
198 int err; 262 struct crypto_cipher *essiv_tfm;
263 int err, cpu;
199 264
200 sg_init_one(&sg, cc->key, cc->key_size); 265 sg_init_one(&sg, cc->key, cc->key_size);
201 desc.tfm = essiv->hash_tfm; 266 desc.tfm = essiv->hash_tfm;
@@ -205,8 +270,16 @@ static int crypt_iv_essiv_init(struct crypt_config *cc)
205 if (err) 270 if (err)
206 return err; 271 return err;
207 272
208 return crypto_cipher_setkey(essiv->tfm, essiv->salt, 273 for_each_possible_cpu(cpu) {
274 essiv_tfm = per_cpu_ptr(cc->cpu, cpu)->iv_private,
275
276 err = crypto_cipher_setkey(essiv_tfm, essiv->salt,
209 crypto_hash_digestsize(essiv->hash_tfm)); 277 crypto_hash_digestsize(essiv->hash_tfm));
278 if (err)
279 return err;
280 }
281
282 return 0;
210} 283}
211 284
212/* Wipe salt and reset key derived from volume key */ 285/* Wipe salt and reset key derived from volume key */
@@ -214,24 +287,76 @@ static int crypt_iv_essiv_wipe(struct crypt_config *cc)
214{ 287{
215 struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv; 288 struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv;
216 unsigned salt_size = crypto_hash_digestsize(essiv->hash_tfm); 289 unsigned salt_size = crypto_hash_digestsize(essiv->hash_tfm);
290 struct crypto_cipher *essiv_tfm;
291 int cpu, r, err = 0;
217 292
218 memset(essiv->salt, 0, salt_size); 293 memset(essiv->salt, 0, salt_size);
219 294
220 return crypto_cipher_setkey(essiv->tfm, essiv->salt, salt_size); 295 for_each_possible_cpu(cpu) {
296 essiv_tfm = per_cpu_ptr(cc->cpu, cpu)->iv_private;
297 r = crypto_cipher_setkey(essiv_tfm, essiv->salt, salt_size);
298 if (r)
299 err = r;
300 }
301
302 return err;
303}
304
305/* Set up per cpu cipher state */
306static struct crypto_cipher *setup_essiv_cpu(struct crypt_config *cc,
307 struct dm_target *ti,
308 u8 *salt, unsigned saltsize)
309{
310 struct crypto_cipher *essiv_tfm;
311 int err;
312
313 /* Setup the essiv_tfm with the given salt */
314 essiv_tfm = crypto_alloc_cipher(cc->cipher, 0, CRYPTO_ALG_ASYNC);
315 if (IS_ERR(essiv_tfm)) {
316 ti->error = "Error allocating crypto tfm for ESSIV";
317 return essiv_tfm;
318 }
319
320 if (crypto_cipher_blocksize(essiv_tfm) !=
321 crypto_ablkcipher_ivsize(any_tfm(cc))) {
322 ti->error = "Block size of ESSIV cipher does "
323 "not match IV size of block cipher";
324 crypto_free_cipher(essiv_tfm);
325 return ERR_PTR(-EINVAL);
326 }
327
328 err = crypto_cipher_setkey(essiv_tfm, salt, saltsize);
329 if (err) {
330 ti->error = "Failed to set key for ESSIV cipher";
331 crypto_free_cipher(essiv_tfm);
332 return ERR_PTR(err);
333 }
334
335 return essiv_tfm;
221} 336}
222 337
223static void crypt_iv_essiv_dtr(struct crypt_config *cc) 338static void crypt_iv_essiv_dtr(struct crypt_config *cc)
224{ 339{
340 int cpu;
341 struct crypt_cpu *cpu_cc;
342 struct crypto_cipher *essiv_tfm;
225 struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv; 343 struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv;
226 344
227 crypto_free_cipher(essiv->tfm);
228 essiv->tfm = NULL;
229
230 crypto_free_hash(essiv->hash_tfm); 345 crypto_free_hash(essiv->hash_tfm);
231 essiv->hash_tfm = NULL; 346 essiv->hash_tfm = NULL;
232 347
233 kzfree(essiv->salt); 348 kzfree(essiv->salt);
234 essiv->salt = NULL; 349 essiv->salt = NULL;
350
351 for_each_possible_cpu(cpu) {
352 cpu_cc = per_cpu_ptr(cc->cpu, cpu);
353 essiv_tfm = cpu_cc->iv_private;
354
355 if (essiv_tfm)
356 crypto_free_cipher(essiv_tfm);
357
358 cpu_cc->iv_private = NULL;
359 }
235} 360}
236 361
237static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti, 362static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
@@ -240,7 +365,7 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
240 struct crypto_cipher *essiv_tfm = NULL; 365 struct crypto_cipher *essiv_tfm = NULL;
241 struct crypto_hash *hash_tfm = NULL; 366 struct crypto_hash *hash_tfm = NULL;
242 u8 *salt = NULL; 367 u8 *salt = NULL;
243 int err; 368 int err, cpu;
244 369
245 if (!opts) { 370 if (!opts) {
246 ti->error = "Digest algorithm missing for ESSIV mode"; 371 ti->error = "Digest algorithm missing for ESSIV mode";
@@ -262,48 +387,44 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
262 goto bad; 387 goto bad;
263 } 388 }
264 389
265 /* Allocate essiv_tfm */
266 essiv_tfm = crypto_alloc_cipher(cc->cipher, 0, CRYPTO_ALG_ASYNC);
267 if (IS_ERR(essiv_tfm)) {
268 ti->error = "Error allocating crypto tfm for ESSIV";
269 err = PTR_ERR(essiv_tfm);
270 goto bad;
271 }
272 if (crypto_cipher_blocksize(essiv_tfm) !=
273 crypto_ablkcipher_ivsize(cc->tfm)) {
274 ti->error = "Block size of ESSIV cipher does "
275 "not match IV size of block cipher";
276 err = -EINVAL;
277 goto bad;
278 }
279
280 cc->iv_gen_private.essiv.salt = salt; 390 cc->iv_gen_private.essiv.salt = salt;
281 cc->iv_gen_private.essiv.tfm = essiv_tfm;
282 cc->iv_gen_private.essiv.hash_tfm = hash_tfm; 391 cc->iv_gen_private.essiv.hash_tfm = hash_tfm;
283 392
393 for_each_possible_cpu(cpu) {
394 essiv_tfm = setup_essiv_cpu(cc, ti, salt,
395 crypto_hash_digestsize(hash_tfm));
396 if (IS_ERR(essiv_tfm)) {
397 crypt_iv_essiv_dtr(cc);
398 return PTR_ERR(essiv_tfm);
399 }
400 per_cpu_ptr(cc->cpu, cpu)->iv_private = essiv_tfm;
401 }
402
284 return 0; 403 return 0;
285 404
286bad: 405bad:
287 if (essiv_tfm && !IS_ERR(essiv_tfm))
288 crypto_free_cipher(essiv_tfm);
289 if (hash_tfm && !IS_ERR(hash_tfm)) 406 if (hash_tfm && !IS_ERR(hash_tfm))
290 crypto_free_hash(hash_tfm); 407 crypto_free_hash(hash_tfm);
291 kfree(salt); 408 kfree(salt);
292 return err; 409 return err;
293} 410}
294 411
295static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv, sector_t sector) 412static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv,
413 struct dm_crypt_request *dmreq)
296{ 414{
415 struct crypto_cipher *essiv_tfm = this_crypt_config(cc)->iv_private;
416
297 memset(iv, 0, cc->iv_size); 417 memset(iv, 0, cc->iv_size);
298 *(u64 *)iv = cpu_to_le64(sector); 418 *(u64 *)iv = cpu_to_le64(dmreq->iv_sector);
299 crypto_cipher_encrypt_one(cc->iv_gen_private.essiv.tfm, iv, iv); 419 crypto_cipher_encrypt_one(essiv_tfm, iv, iv);
420
300 return 0; 421 return 0;
301} 422}
302 423
303static int crypt_iv_benbi_ctr(struct crypt_config *cc, struct dm_target *ti, 424static int crypt_iv_benbi_ctr(struct crypt_config *cc, struct dm_target *ti,
304 const char *opts) 425 const char *opts)
305{ 426{
306 unsigned bs = crypto_ablkcipher_blocksize(cc->tfm); 427 unsigned bs = crypto_ablkcipher_blocksize(any_tfm(cc));
307 int log = ilog2(bs); 428 int log = ilog2(bs);
308 429
309 /* we need to calculate how far we must shift the sector count 430 /* we need to calculate how far we must shift the sector count
@@ -328,25 +449,177 @@ static void crypt_iv_benbi_dtr(struct crypt_config *cc)
328{ 449{
329} 450}
330 451
331static int crypt_iv_benbi_gen(struct crypt_config *cc, u8 *iv, sector_t sector) 452static int crypt_iv_benbi_gen(struct crypt_config *cc, u8 *iv,
453 struct dm_crypt_request *dmreq)
332{ 454{
333 __be64 val; 455 __be64 val;
334 456
335 memset(iv, 0, cc->iv_size - sizeof(u64)); /* rest is cleared below */ 457 memset(iv, 0, cc->iv_size - sizeof(u64)); /* rest is cleared below */
336 458
337 val = cpu_to_be64(((u64)sector << cc->iv_gen_private.benbi.shift) + 1); 459 val = cpu_to_be64(((u64)dmreq->iv_sector << cc->iv_gen_private.benbi.shift) + 1);
338 put_unaligned(val, (__be64 *)(iv + cc->iv_size - sizeof(u64))); 460 put_unaligned(val, (__be64 *)(iv + cc->iv_size - sizeof(u64)));
339 461
340 return 0; 462 return 0;
341} 463}
342 464
343static int crypt_iv_null_gen(struct crypt_config *cc, u8 *iv, sector_t sector) 465static int crypt_iv_null_gen(struct crypt_config *cc, u8 *iv,
466 struct dm_crypt_request *dmreq)
344{ 467{
345 memset(iv, 0, cc->iv_size); 468 memset(iv, 0, cc->iv_size);
346 469
347 return 0; 470 return 0;
348} 471}
349 472
473static void crypt_iv_lmk_dtr(struct crypt_config *cc)
474{
475 struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk;
476
477 if (lmk->hash_tfm && !IS_ERR(lmk->hash_tfm))
478 crypto_free_shash(lmk->hash_tfm);
479 lmk->hash_tfm = NULL;
480
481 kzfree(lmk->seed);
482 lmk->seed = NULL;
483}
484
485static int crypt_iv_lmk_ctr(struct crypt_config *cc, struct dm_target *ti,
486 const char *opts)
487{
488 struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk;
489
490 lmk->hash_tfm = crypto_alloc_shash("md5", 0, 0);
491 if (IS_ERR(lmk->hash_tfm)) {
492 ti->error = "Error initializing LMK hash";
493 return PTR_ERR(lmk->hash_tfm);
494 }
495
496 /* No seed in LMK version 2 */
497 if (cc->key_parts == cc->tfms_count) {
498 lmk->seed = NULL;
499 return 0;
500 }
501
502 lmk->seed = kzalloc(LMK_SEED_SIZE, GFP_KERNEL);
503 if (!lmk->seed) {
504 crypt_iv_lmk_dtr(cc);
505 ti->error = "Error kmallocing seed storage in LMK";
506 return -ENOMEM;
507 }
508
509 return 0;
510}
511
512static int crypt_iv_lmk_init(struct crypt_config *cc)
513{
514 struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk;
515 int subkey_size = cc->key_size / cc->key_parts;
516
517 /* LMK seed is on the position of LMK_KEYS + 1 key */
518 if (lmk->seed)
519 memcpy(lmk->seed, cc->key + (cc->tfms_count * subkey_size),
520 crypto_shash_digestsize(lmk->hash_tfm));
521
522 return 0;
523}
524
525static int crypt_iv_lmk_wipe(struct crypt_config *cc)
526{
527 struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk;
528
529 if (lmk->seed)
530 memset(lmk->seed, 0, LMK_SEED_SIZE);
531
532 return 0;
533}
534
535static int crypt_iv_lmk_one(struct crypt_config *cc, u8 *iv,
536 struct dm_crypt_request *dmreq,
537 u8 *data)
538{
539 struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk;
540 struct {
541 struct shash_desc desc;
542 char ctx[crypto_shash_descsize(lmk->hash_tfm)];
543 } sdesc;
544 struct md5_state md5state;
545 u32 buf[4];
546 int i, r;
547
548 sdesc.desc.tfm = lmk->hash_tfm;
549 sdesc.desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
550
551 r = crypto_shash_init(&sdesc.desc);
552 if (r)
553 return r;
554
555 if (lmk->seed) {
556 r = crypto_shash_update(&sdesc.desc, lmk->seed, LMK_SEED_SIZE);
557 if (r)
558 return r;
559 }
560
561 /* Sector is always 512B, block size 16, add data of blocks 1-31 */
562 r = crypto_shash_update(&sdesc.desc, data + 16, 16 * 31);
563 if (r)
564 return r;
565
566 /* Sector is cropped to 56 bits here */
567 buf[0] = cpu_to_le32(dmreq->iv_sector & 0xFFFFFFFF);
568 buf[1] = cpu_to_le32((((u64)dmreq->iv_sector >> 32) & 0x00FFFFFF) | 0x80000000);
569 buf[2] = cpu_to_le32(4024);
570 buf[3] = 0;
571 r = crypto_shash_update(&sdesc.desc, (u8 *)buf, sizeof(buf));
572 if (r)
573 return r;
574
575 /* No MD5 padding here */
576 r = crypto_shash_export(&sdesc.desc, &md5state);
577 if (r)
578 return r;
579
580 for (i = 0; i < MD5_HASH_WORDS; i++)
581 __cpu_to_le32s(&md5state.hash[i]);
582 memcpy(iv, &md5state.hash, cc->iv_size);
583
584 return 0;
585}
586
587static int crypt_iv_lmk_gen(struct crypt_config *cc, u8 *iv,
588 struct dm_crypt_request *dmreq)
589{
590 u8 *src;
591 int r = 0;
592
593 if (bio_data_dir(dmreq->ctx->bio_in) == WRITE) {
594 src = kmap_atomic(sg_page(&dmreq->sg_in), KM_USER0);
595 r = crypt_iv_lmk_one(cc, iv, dmreq, src + dmreq->sg_in.offset);
596 kunmap_atomic(src, KM_USER0);
597 } else
598 memset(iv, 0, cc->iv_size);
599
600 return r;
601}
602
603static int crypt_iv_lmk_post(struct crypt_config *cc, u8 *iv,
604 struct dm_crypt_request *dmreq)
605{
606 u8 *dst;
607 int r;
608
609 if (bio_data_dir(dmreq->ctx->bio_in) == WRITE)
610 return 0;
611
612 dst = kmap_atomic(sg_page(&dmreq->sg_out), KM_USER0);
613 r = crypt_iv_lmk_one(cc, iv, dmreq, dst + dmreq->sg_out.offset);
614
615 /* Tweak the first block of plaintext sector */
616 if (!r)
617 crypto_xor(dst + dmreq->sg_out.offset, iv, cc->iv_size);
618
619 kunmap_atomic(dst, KM_USER0);
620 return r;
621}
622
350static struct crypt_iv_operations crypt_iv_plain_ops = { 623static struct crypt_iv_operations crypt_iv_plain_ops = {
351 .generator = crypt_iv_plain_gen 624 .generator = crypt_iv_plain_gen
352}; 625};
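
crypt_iv_benbi_gen above stores a big-endian, 1-based cipher-block number at the tail of the IV: the 512-byte sector count is shifted by benbi.shift to turn sectors into cipher blocks, then incremented by one. The shift itself comes from the cipher block size, as the crypt_iv_benbi_ctr comment earlier explains; for a 16-byte block cipher that works out to 9 - ilog2(16) = 5, i.e. 32 blocks per sector. A standalone sketch of that arithmetic (the shift derivation is inferred from the comment, illustrative only):

#include <stdio.h>
#include <string.h>
#include <stdint.h>

#define SECTOR_SHIFT 9  /* 512-byte sectors */

static unsigned ilog2_u(unsigned v)
{
    unsigned l = 0;

    while (v >>= 1)
        l++;
    return l;
}

/* store the 64-bit big-endian block number in the last 8 bytes of the IV */
static void iv_benbi(uint8_t *iv, unsigned iv_size, uint64_t sector,
                     unsigned cipher_blocksize)
{
    unsigned shift = SECTOR_SHIFT - ilog2_u(cipher_blocksize);
    uint64_t val = (sector << shift) + 1;   /* benbi counts blocks from 1 */

    memset(iv, 0, iv_size);
    for (int i = 0; i < 8; i++)
        iv[iv_size - 1 - i] = (uint8_t)(val >> (8 * i));
}

int main(void)
{
    uint8_t iv[16];

    iv_benbi(iv, sizeof(iv), 2, 16);    /* sector 2, 16-byte cipher blocks */
    printf("benbi block number in IV tail: %u\n", (unsigned)iv[15]);   /* 2*32 + 1 = 65 */
    return 0;
}
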
@@ -373,6 +646,15 @@ static struct crypt_iv_operations crypt_iv_null_ops = {
373 .generator = crypt_iv_null_gen 646 .generator = crypt_iv_null_gen
374}; 647};
375 648
649static struct crypt_iv_operations crypt_iv_lmk_ops = {
650 .ctr = crypt_iv_lmk_ctr,
651 .dtr = crypt_iv_lmk_dtr,
652 .init = crypt_iv_lmk_init,
653 .wipe = crypt_iv_lmk_wipe,
654 .generator = crypt_iv_lmk_gen,
655 .post = crypt_iv_lmk_post
656};
657
376static void crypt_convert_init(struct crypt_config *cc, 658static void crypt_convert_init(struct crypt_config *cc,
377 struct convert_context *ctx, 659 struct convert_context *ctx,
378 struct bio *bio_out, struct bio *bio_in, 660 struct bio *bio_out, struct bio *bio_in,
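
crypt_iv_lmk_one above folds the sector number into the MD5 state the way Loop-AES does: the low 32 bits go into buf[0], only 24 further bits survive in buf[1] (hence the "cropped to 56 bits" comment) with the top bit forced on, and buf[2] carries the constant 4024. A standalone sketch of just that buffer construction, with the hashing and le32 conversion left out:

#include <stdio.h>
#include <stdint.h>

/* Build the 16-byte trailer that crypt_iv_lmk_one() feeds into MD5.
 * Values are kept as host integers here; the kernel stores them as le32.
 */
static void lmk_sector_words(uint32_t buf[4], uint64_t sector)
{
    buf[0] = (uint32_t)(sector & 0xFFFFFFFF);
    buf[1] = (uint32_t)(((sector >> 32) & 0x00FFFFFF) | 0x80000000);
    buf[2] = 4024;
    buf[3] = 0;
}

int main(void)
{
    uint32_t buf[4];

    /* a sector beyond 2^56: the top 8 bits are silently dropped */
    lmk_sector_words(buf, 0xFF00000012345678ULL);
    printf("buf[0]=0x%08x buf[1]=0x%08x\n", (unsigned)buf[0], (unsigned)buf[1]);
    return 0;
}
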
@@ -400,6 +682,13 @@ static struct ablkcipher_request *req_of_dmreq(struct crypt_config *cc,
400 return (struct ablkcipher_request *)((char *)dmreq - cc->dmreq_start); 682 return (struct ablkcipher_request *)((char *)dmreq - cc->dmreq_start);
401} 683}
402 684
685static u8 *iv_of_dmreq(struct crypt_config *cc,
686 struct dm_crypt_request *dmreq)
687{
688 return (u8 *)ALIGN((unsigned long)(dmreq + 1),
689 crypto_ablkcipher_alignmask(any_tfm(cc)) + 1);
690}
691
403static int crypt_convert_block(struct crypt_config *cc, 692static int crypt_convert_block(struct crypt_config *cc,
404 struct convert_context *ctx, 693 struct convert_context *ctx,
405 struct ablkcipher_request *req) 694 struct ablkcipher_request *req)
@@ -411,9 +700,9 @@ static int crypt_convert_block(struct crypt_config *cc,
411 int r = 0; 700 int r = 0;
412 701
413 dmreq = dmreq_of_req(cc, req); 702 dmreq = dmreq_of_req(cc, req);
414 iv = (u8 *)ALIGN((unsigned long)(dmreq + 1), 703 iv = iv_of_dmreq(cc, dmreq);
415 crypto_ablkcipher_alignmask(cc->tfm) + 1);
416 704
705 dmreq->iv_sector = ctx->sector;
417 dmreq->ctx = ctx; 706 dmreq->ctx = ctx;
418 sg_init_table(&dmreq->sg_in, 1); 707 sg_init_table(&dmreq->sg_in, 1);
419 sg_set_page(&dmreq->sg_in, bv_in->bv_page, 1 << SECTOR_SHIFT, 708 sg_set_page(&dmreq->sg_in, bv_in->bv_page, 1 << SECTOR_SHIFT,
@@ -436,7 +725,7 @@ static int crypt_convert_block(struct crypt_config *cc,
436 } 725 }
437 726
438 if (cc->iv_gen_ops) { 727 if (cc->iv_gen_ops) {
439 r = cc->iv_gen_ops->generator(cc, iv, ctx->sector); 728 r = cc->iv_gen_ops->generator(cc, iv, dmreq);
440 if (r < 0) 729 if (r < 0)
441 return r; 730 return r;
442 } 731 }
@@ -449,21 +738,28 @@ static int crypt_convert_block(struct crypt_config *cc,
449 else 738 else
450 r = crypto_ablkcipher_decrypt(req); 739 r = crypto_ablkcipher_decrypt(req);
451 740
741 if (!r && cc->iv_gen_ops && cc->iv_gen_ops->post)
742 r = cc->iv_gen_ops->post(cc, iv, dmreq);
743
452 return r; 744 return r;
453} 745}
454 746
455static void kcryptd_async_done(struct crypto_async_request *async_req, 747static void kcryptd_async_done(struct crypto_async_request *async_req,
456 int error); 748 int error);
749
457static void crypt_alloc_req(struct crypt_config *cc, 750static void crypt_alloc_req(struct crypt_config *cc,
458 struct convert_context *ctx) 751 struct convert_context *ctx)
459{ 752{
460 if (!cc->req) 753 struct crypt_cpu *this_cc = this_crypt_config(cc);
461 cc->req = mempool_alloc(cc->req_pool, GFP_NOIO); 754 unsigned key_index = ctx->sector & (cc->tfms_count - 1);
462 ablkcipher_request_set_tfm(cc->req, cc->tfm); 755
463 ablkcipher_request_set_callback(cc->req, CRYPTO_TFM_REQ_MAY_BACKLOG | 756 if (!this_cc->req)
464 CRYPTO_TFM_REQ_MAY_SLEEP, 757 this_cc->req = mempool_alloc(cc->req_pool, GFP_NOIO);
465 kcryptd_async_done, 758
466 dmreq_of_req(cc, cc->req)); 759 ablkcipher_request_set_tfm(this_cc->req, this_cc->tfms[key_index]);
760 ablkcipher_request_set_callback(this_cc->req,
761 CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
762 kcryptd_async_done, dmreq_of_req(cc, this_cc->req));
467} 763}
468 764
469/* 765/*
@@ -472,6 +768,7 @@ static void crypt_alloc_req(struct crypt_config *cc,
472static int crypt_convert(struct crypt_config *cc, 768static int crypt_convert(struct crypt_config *cc,
473 struct convert_context *ctx) 769 struct convert_context *ctx)
474{ 770{
771 struct crypt_cpu *this_cc = this_crypt_config(cc);
475 int r; 772 int r;
476 773
477 atomic_set(&ctx->pending, 1); 774 atomic_set(&ctx->pending, 1);
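
crypt_alloc_req in the hunks above selects which per-CPU cipher handle encrypts a given sector: for the multikey (Loop-AES style) schemes the sector number is masked with tfms_count - 1, which cycles through the keys and assumes tfms_count is a power of two (64 keys in Loop-AES versions 2 and 3). A standalone sketch of that selection:

#include <stdio.h>
#include <stdint.h>

/* With a power-of-two key count, masking the sector cycles through the keys
 * exactly like "sector % tfms_count" but without a division.
 */
static unsigned multikey_index(uint64_t sector, unsigned tfms_count)
{
    return (unsigned)(sector & (tfms_count - 1));
}

int main(void)
{
    const unsigned tfms_count = 64; /* Loop-AES multikey: 64 keys */

    for (uint64_t sector = 62; sector < 68; sector++)
        printf("sector %llu -> key %u\n",
               (unsigned long long)sector,
               multikey_index(sector, tfms_count));
    return 0;
}
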
@@ -483,7 +780,7 @@ static int crypt_convert(struct crypt_config *cc,
483 780
484 atomic_inc(&ctx->pending); 781 atomic_inc(&ctx->pending);
485 782
486 r = crypt_convert_block(cc, ctx, cc->req); 783 r = crypt_convert_block(cc, ctx, this_cc->req);
487 784
488 switch (r) { 785 switch (r) {
489 /* async */ 786 /* async */
@@ -492,7 +789,7 @@ static int crypt_convert(struct crypt_config *cc,
492 INIT_COMPLETION(ctx->restart); 789 INIT_COMPLETION(ctx->restart);
493 /* fall through*/ 790 /* fall through*/
494 case -EINPROGRESS: 791 case -EINPROGRESS:
495 cc->req = NULL; 792 this_cc->req = NULL;
496 ctx->sector++; 793 ctx->sector++;
497 continue; 794 continue;
498 795
@@ -651,6 +948,9 @@ static void crypt_dec_pending(struct dm_crypt_io *io)
651 * They must be separated as otherwise the final stages could be 948 * They must be separated as otherwise the final stages could be
652 * starved by new requests which can block in the first stages due 949 * starved by new requests which can block in the first stages due
653 * to memory allocation. 950 * to memory allocation.
951 *
952 * The work is done per CPU global for all dm-crypt instances.
953 * They should not depend on each other and do not block.
654 */ 954 */
655static void crypt_endio(struct bio *clone, int error) 955static void crypt_endio(struct bio *clone, int error)
656{ 956{
@@ -691,25 +991,22 @@ static void clone_init(struct dm_crypt_io *io, struct bio *clone)
691 clone->bi_destructor = dm_crypt_bio_destructor; 991 clone->bi_destructor = dm_crypt_bio_destructor;
692} 992}
693 993
694static void kcryptd_io_read(struct dm_crypt_io *io) 994static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp)
695{ 995{
696 struct crypt_config *cc = io->target->private; 996 struct crypt_config *cc = io->target->private;
697 struct bio *base_bio = io->base_bio; 997 struct bio *base_bio = io->base_bio;
698 struct bio *clone; 998 struct bio *clone;
699 999
700 crypt_inc_pending(io);
701
702 /* 1000 /*
703 * The block layer might modify the bvec array, so always 1001 * The block layer might modify the bvec array, so always
704 * copy the required bvecs because we need the original 1002 * copy the required bvecs because we need the original
705 * one in order to decrypt the whole bio data *afterwards*. 1003 * one in order to decrypt the whole bio data *afterwards*.
706 */ 1004 */
707 clone = bio_alloc_bioset(GFP_NOIO, bio_segments(base_bio), cc->bs); 1005 clone = bio_alloc_bioset(gfp, bio_segments(base_bio), cc->bs);
708 if (unlikely(!clone)) { 1006 if (!clone)
709 io->error = -ENOMEM; 1007 return 1;
710 crypt_dec_pending(io); 1008
711 return; 1009 crypt_inc_pending(io);
712 }
713 1010
714 clone_init(io, clone); 1011 clone_init(io, clone);
715 clone->bi_idx = 0; 1012 clone->bi_idx = 0;
@@ -720,6 +1017,7 @@ static void kcryptd_io_read(struct dm_crypt_io *io)
720 sizeof(struct bio_vec) * clone->bi_vcnt); 1017 sizeof(struct bio_vec) * clone->bi_vcnt);
721 1018
722 generic_make_request(clone); 1019 generic_make_request(clone);
1020 return 0;
723} 1021}
724 1022
725static void kcryptd_io_write(struct dm_crypt_io *io) 1023static void kcryptd_io_write(struct dm_crypt_io *io)
@@ -732,9 +1030,12 @@ static void kcryptd_io(struct work_struct *work)
732{ 1030{
733 struct dm_crypt_io *io = container_of(work, struct dm_crypt_io, work); 1031 struct dm_crypt_io *io = container_of(work, struct dm_crypt_io, work);
734 1032
735 if (bio_data_dir(io->base_bio) == READ) 1033 if (bio_data_dir(io->base_bio) == READ) {
736 kcryptd_io_read(io); 1034 crypt_inc_pending(io);
737 else 1035 if (kcryptd_io_read(io, GFP_NOIO))
1036 io->error = -ENOMEM;
1037 crypt_dec_pending(io);
1038 } else
738 kcryptd_io_write(io); 1039 kcryptd_io_write(io);
739} 1040}
740 1041
@@ -901,6 +1202,9 @@ static void kcryptd_async_done(struct crypto_async_request *async_req,
901 return; 1202 return;
902 } 1203 }
903 1204
1205 if (!error && cc->iv_gen_ops && cc->iv_gen_ops->post)
1206 error = cc->iv_gen_ops->post(cc, iv_of_dmreq(cc, dmreq), dmreq);
1207
904 mempool_free(req_of_dmreq(cc, dmreq), cc->req_pool); 1208 mempool_free(req_of_dmreq(cc, dmreq), cc->req_pool);
905 1209
906 if (!atomic_dec_and_test(&ctx->pending)) 1210 if (!atomic_dec_and_test(&ctx->pending))
@@ -971,34 +1275,93 @@ static void crypt_encode_key(char *hex, u8 *key, unsigned int size)
971 } 1275 }
972} 1276}
973 1277
1278static void crypt_free_tfms(struct crypt_config *cc, int cpu)
1279{
1280 struct crypt_cpu *cpu_cc = per_cpu_ptr(cc->cpu, cpu);
1281 unsigned i;
1282
1283 for (i = 0; i < cc->tfms_count; i++)
1284 if (cpu_cc->tfms[i] && !IS_ERR(cpu_cc->tfms[i])) {
1285 crypto_free_ablkcipher(cpu_cc->tfms[i]);
1286 cpu_cc->tfms[i] = NULL;
1287 }
1288}
1289
1290static int crypt_alloc_tfms(struct crypt_config *cc, int cpu, char *ciphermode)
1291{
1292 struct crypt_cpu *cpu_cc = per_cpu_ptr(cc->cpu, cpu);
1293 unsigned i;
1294 int err;
1295
1296 for (i = 0; i < cc->tfms_count; i++) {
1297 cpu_cc->tfms[i] = crypto_alloc_ablkcipher(ciphermode, 0, 0);
1298 if (IS_ERR(cpu_cc->tfms[i])) {
1299 err = PTR_ERR(cpu_cc->tfms[i]);
1300 crypt_free_tfms(cc, cpu);
1301 return err;
1302 }
1303 }
1304
1305 return 0;
1306}
1307
1308static int crypt_setkey_allcpus(struct crypt_config *cc)
1309{
1310 unsigned subkey_size = cc->key_size >> ilog2(cc->tfms_count);
1311 int cpu, err = 0, i, r;
1312
1313 for_each_possible_cpu(cpu) {
1314 for (i = 0; i < cc->tfms_count; i++) {
1315 r = crypto_ablkcipher_setkey(per_cpu_ptr(cc->cpu, cpu)->tfms[i],
1316 cc->key + (i * subkey_size), subkey_size);
1317 if (r)
1318 err = r;
1319 }
1320 }
1321
1322 return err;
1323}
1324
974static int crypt_set_key(struct crypt_config *cc, char *key) 1325static int crypt_set_key(struct crypt_config *cc, char *key)
975{ 1326{
976 unsigned key_size = strlen(key) >> 1; 1327 int r = -EINVAL;
1328 int key_string_len = strlen(key);
977 1329
978 if (cc->key_size && cc->key_size != key_size) 1330 /* The key size may not be changed. */
979 return -EINVAL; 1331 if (cc->key_size != (key_string_len >> 1))
1332 goto out;
980 1333
981 cc->key_size = key_size; /* initial settings */ 1334 /* Hyphen (which gives a key_size of zero) means there is no key. */
1335 if (!cc->key_size && strcmp(key, "-"))
1336 goto out;
982 1337
983 if ((!key_size && strcmp(key, "-")) || 1338 if (cc->key_size && crypt_decode_key(cc->key, key, cc->key_size) < 0)
984 (key_size && crypt_decode_key(cc->key, key, key_size) < 0)) 1339 goto out;
985 return -EINVAL;
986 1340
987 set_bit(DM_CRYPT_KEY_VALID, &cc->flags); 1341 set_bit(DM_CRYPT_KEY_VALID, &cc->flags);
988 1342
989 return crypto_ablkcipher_setkey(cc->tfm, cc->key, cc->key_size); 1343 r = crypt_setkey_allcpus(cc);
1344
1345out:
1346 /* Hex key string not needed after here, so wipe it. */
1347 memset(key, '0', key_string_len);
1348
1349 return r;
990} 1350}
991 1351
992static int crypt_wipe_key(struct crypt_config *cc) 1352static int crypt_wipe_key(struct crypt_config *cc)
993{ 1353{
994 clear_bit(DM_CRYPT_KEY_VALID, &cc->flags); 1354 clear_bit(DM_CRYPT_KEY_VALID, &cc->flags);
995 memset(&cc->key, 0, cc->key_size * sizeof(u8)); 1355 memset(&cc->key, 0, cc->key_size * sizeof(u8));
996 return crypto_ablkcipher_setkey(cc->tfm, cc->key, cc->key_size); 1356
1357 return crypt_setkey_allcpus(cc);
997} 1358}
998 1359
999static void crypt_dtr(struct dm_target *ti) 1360static void crypt_dtr(struct dm_target *ti)
1000{ 1361{
1001 struct crypt_config *cc = ti->private; 1362 struct crypt_config *cc = ti->private;
1363 struct crypt_cpu *cpu_cc;
1364 int cpu;
1002 1365
1003 ti->private = NULL; 1366 ti->private = NULL;
1004 1367
@@ -1010,6 +1373,14 @@ static void crypt_dtr(struct dm_target *ti)
1010 if (cc->crypt_queue) 1373 if (cc->crypt_queue)
1011 destroy_workqueue(cc->crypt_queue); 1374 destroy_workqueue(cc->crypt_queue);
1012 1375
1376 if (cc->cpu)
1377 for_each_possible_cpu(cpu) {
1378 cpu_cc = per_cpu_ptr(cc->cpu, cpu);
1379 if (cpu_cc->req)
1380 mempool_free(cpu_cc->req, cc->req_pool);
1381 crypt_free_tfms(cc, cpu);
1382 }
1383
1013 if (cc->bs) 1384 if (cc->bs)
1014 bioset_free(cc->bs); 1385 bioset_free(cc->bs);
1015 1386
@@ -1023,14 +1394,14 @@ static void crypt_dtr(struct dm_target *ti)
1023 if (cc->iv_gen_ops && cc->iv_gen_ops->dtr) 1394 if (cc->iv_gen_ops && cc->iv_gen_ops->dtr)
1024 cc->iv_gen_ops->dtr(cc); 1395 cc->iv_gen_ops->dtr(cc);
1025 1396
1026 if (cc->tfm && !IS_ERR(cc->tfm))
1027 crypto_free_ablkcipher(cc->tfm);
1028
1029 if (cc->dev) 1397 if (cc->dev)
1030 dm_put_device(ti, cc->dev); 1398 dm_put_device(ti, cc->dev);
1031 1399
1400 if (cc->cpu)
1401 free_percpu(cc->cpu);
1402
1032 kzfree(cc->cipher); 1403 kzfree(cc->cipher);
1033 kzfree(cc->cipher_mode); 1404 kzfree(cc->cipher_string);
1034 1405
1035 /* Must zero key material before freeing */ 1406 /* Must zero key material before freeing */
1036 kzfree(cc); 1407 kzfree(cc);
@@ -1040,9 +1411,9 @@ static int crypt_ctr_cipher(struct dm_target *ti,
1040 char *cipher_in, char *key) 1411 char *cipher_in, char *key)
1041{ 1412{
1042 struct crypt_config *cc = ti->private; 1413 struct crypt_config *cc = ti->private;
1043 char *tmp, *cipher, *chainmode, *ivmode, *ivopts; 1414 char *tmp, *cipher, *chainmode, *ivmode, *ivopts, *keycount;
1044 char *cipher_api = NULL; 1415 char *cipher_api = NULL;
1045 int ret = -EINVAL; 1416 int cpu, ret = -EINVAL;
1046 1417
1047 /* Convert to crypto api definition? */ 1418 /* Convert to crypto api definition? */
1048 if (strchr(cipher_in, '(')) { 1419 if (strchr(cipher_in, '(')) {
@@ -1050,23 +1421,31 @@ static int crypt_ctr_cipher(struct dm_target *ti,
1050 return -EINVAL; 1421 return -EINVAL;
1051 } 1422 }
1052 1423
1424 cc->cipher_string = kstrdup(cipher_in, GFP_KERNEL);
1425 if (!cc->cipher_string)
1426 goto bad_mem;
1427
1053 /* 1428 /*
1054 * Legacy dm-crypt cipher specification 1429 * Legacy dm-crypt cipher specification
1055 * cipher-mode-iv:ivopts 1430 * cipher[:keycount]-mode-iv:ivopts
1056 */ 1431 */
1057 tmp = cipher_in; 1432 tmp = cipher_in;
1058 cipher = strsep(&tmp, "-"); 1433 keycount = strsep(&tmp, "-");
1434 cipher = strsep(&keycount, ":");
1435
1436 if (!keycount)
1437 cc->tfms_count = 1;
1438 else if (sscanf(keycount, "%u", &cc->tfms_count) != 1 ||
1439 !is_power_of_2(cc->tfms_count)) {
1440 ti->error = "Bad cipher key count specification";
1441 return -EINVAL;
1442 }
1443 cc->key_parts = cc->tfms_count;
1059 1444
1060 cc->cipher = kstrdup(cipher, GFP_KERNEL); 1445 cc->cipher = kstrdup(cipher, GFP_KERNEL);
1061 if (!cc->cipher) 1446 if (!cc->cipher)
1062 goto bad_mem; 1447 goto bad_mem;
1063 1448
1064 if (tmp) {
1065 cc->cipher_mode = kstrdup(tmp, GFP_KERNEL);
1066 if (!cc->cipher_mode)
1067 goto bad_mem;
1068 }
1069
1070 chainmode = strsep(&tmp, "-"); 1449 chainmode = strsep(&tmp, "-");
1071 ivopts = strsep(&tmp, "-"); 1450 ivopts = strsep(&tmp, "-");
1072 ivmode = strsep(&ivopts, ":"); 1451 ivmode = strsep(&ivopts, ":");
@@ -1074,10 +1453,19 @@ static int crypt_ctr_cipher(struct dm_target *ti,
1074 if (tmp) 1453 if (tmp)
1075 DMWARN("Ignoring unexpected additional cipher options"); 1454 DMWARN("Ignoring unexpected additional cipher options");
1076 1455
1077 /* Compatibility mode for old dm-crypt mappings */ 1456 cc->cpu = __alloc_percpu(sizeof(*(cc->cpu)) +
1457 cc->tfms_count * sizeof(*(cc->cpu->tfms)),
1458 __alignof__(struct crypt_cpu));
1459 if (!cc->cpu) {
1460 ti->error = "Cannot allocate per cpu state";
1461 goto bad_mem;
1462 }
1463
1464 /*
1465 * For compatibility with the original dm-crypt mapping format, if
1466 * only the cipher name is supplied, use cbc-plain.
1467 */
1078 if (!chainmode || (!strcmp(chainmode, "plain") && !ivmode)) { 1468 if (!chainmode || (!strcmp(chainmode, "plain") && !ivmode)) {
1079 kfree(cc->cipher_mode);
1080 cc->cipher_mode = kstrdup("cbc-plain", GFP_KERNEL);
1081 chainmode = "cbc"; 1469 chainmode = "cbc";
1082 ivmode = "plain"; 1470 ivmode = "plain";
1083 } 1471 }
@@ -1099,11 +1487,12 @@ static int crypt_ctr_cipher(struct dm_target *ti,
1099 } 1487 }
1100 1488
1101 /* Allocate cipher */ 1489 /* Allocate cipher */
1102 cc->tfm = crypto_alloc_ablkcipher(cipher_api, 0, 0); 1490 for_each_possible_cpu(cpu) {
1103 if (IS_ERR(cc->tfm)) { 1491 ret = crypt_alloc_tfms(cc, cpu, cipher_api);
1104 ret = PTR_ERR(cc->tfm); 1492 if (ret < 0) {
1105 ti->error = "Error allocating crypto tfm"; 1493 ti->error = "Error allocating crypto tfm";
1106 goto bad; 1494 goto bad;
1495 }
1107 } 1496 }
1108 1497
1109 /* Initialize and set key */ 1498 /* Initialize and set key */
@@ -1114,7 +1503,7 @@ static int crypt_ctr_cipher(struct dm_target *ti,
1114 } 1503 }
1115 1504
1116 /* Initialize IV */ 1505 /* Initialize IV */
1117 cc->iv_size = crypto_ablkcipher_ivsize(cc->tfm); 1506 cc->iv_size = crypto_ablkcipher_ivsize(any_tfm(cc));
1118 if (cc->iv_size) 1507 if (cc->iv_size)
1119 /* at least a 64 bit sector number should fit in our buffer */ 1508 /* at least a 64 bit sector number should fit in our buffer */
1120 cc->iv_size = max(cc->iv_size, 1509 cc->iv_size = max(cc->iv_size,
@@ -1137,7 +1526,15 @@ static int crypt_ctr_cipher(struct dm_target *ti,
1137 cc->iv_gen_ops = &crypt_iv_benbi_ops; 1526 cc->iv_gen_ops = &crypt_iv_benbi_ops;
1138 else if (strcmp(ivmode, "null") == 0) 1527 else if (strcmp(ivmode, "null") == 0)
1139 cc->iv_gen_ops = &crypt_iv_null_ops; 1528 cc->iv_gen_ops = &crypt_iv_null_ops;
1140 else { 1529 else if (strcmp(ivmode, "lmk") == 0) {
1530 cc->iv_gen_ops = &crypt_iv_lmk_ops;
1531 /* Version 2 and 3 is recognised according
1532 * to length of provided multi-key string.
1533 * If present (version 3), last key is used as IV seed.
1534 */
1535 if (cc->key_size % cc->key_parts)
1536 cc->key_parts++;
1537 } else {
1141 ret = -EINVAL; 1538 ret = -EINVAL;
1142 ti->error = "Invalid IV mode"; 1539 ti->error = "Invalid IV mode";
1143 goto bad; 1540 goto bad;
@@ -1194,6 +1591,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1194 ti->error = "Cannot allocate encryption context"; 1591 ti->error = "Cannot allocate encryption context";
1195 return -ENOMEM; 1592 return -ENOMEM;
1196 } 1593 }
1594 cc->key_size = key_size;
1197 1595
1198 ti->private = cc; 1596 ti->private = cc;
1199 ret = crypt_ctr_cipher(ti, argv[0], argv[1]); 1597 ret = crypt_ctr_cipher(ti, argv[0], argv[1]);
@@ -1208,9 +1606,9 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1208 } 1606 }
1209 1607
1210 cc->dmreq_start = sizeof(struct ablkcipher_request); 1608 cc->dmreq_start = sizeof(struct ablkcipher_request);
1211 cc->dmreq_start += crypto_ablkcipher_reqsize(cc->tfm); 1609 cc->dmreq_start += crypto_ablkcipher_reqsize(any_tfm(cc));
1212 cc->dmreq_start = ALIGN(cc->dmreq_start, crypto_tfm_ctx_alignment()); 1610 cc->dmreq_start = ALIGN(cc->dmreq_start, crypto_tfm_ctx_alignment());
1213 cc->dmreq_start += crypto_ablkcipher_alignmask(cc->tfm) & 1611 cc->dmreq_start += crypto_ablkcipher_alignmask(any_tfm(cc)) &
1214 ~(crypto_tfm_ctx_alignment() - 1); 1612 ~(crypto_tfm_ctx_alignment() - 1);
1215 1613
1216 cc->req_pool = mempool_create_kmalloc_pool(MIN_IOS, cc->dmreq_start + 1614 cc->req_pool = mempool_create_kmalloc_pool(MIN_IOS, cc->dmreq_start +
@@ -1219,7 +1617,6 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1219 ti->error = "Cannot allocate crypt request mempool"; 1617 ti->error = "Cannot allocate crypt request mempool";
1220 goto bad; 1618 goto bad;
1221 } 1619 }
1222 cc->req = NULL;
1223 1620
1224 cc->page_pool = mempool_create_page_pool(MIN_POOL_PAGES, 0); 1621 cc->page_pool = mempool_create_page_pool(MIN_POOL_PAGES, 0);
1225 if (!cc->page_pool) { 1622 if (!cc->page_pool) {
@@ -1252,13 +1649,20 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1252 cc->start = tmpll; 1649 cc->start = tmpll;
1253 1650
1254 ret = -ENOMEM; 1651 ret = -ENOMEM;
1255 cc->io_queue = create_singlethread_workqueue("kcryptd_io"); 1652 cc->io_queue = alloc_workqueue("kcryptd_io",
1653 WQ_NON_REENTRANT|
1654 WQ_MEM_RECLAIM,
1655 1);
1256 if (!cc->io_queue) { 1656 if (!cc->io_queue) {
1257 ti->error = "Couldn't create kcryptd io queue"; 1657 ti->error = "Couldn't create kcryptd io queue";
1258 goto bad; 1658 goto bad;
1259 } 1659 }
1260 1660
1261 cc->crypt_queue = create_singlethread_workqueue("kcryptd"); 1661 cc->crypt_queue = alloc_workqueue("kcryptd",
1662 WQ_NON_REENTRANT|
1663 WQ_CPU_INTENSIVE|
1664 WQ_MEM_RECLAIM,
1665 1);
1262 if (!cc->crypt_queue) { 1666 if (!cc->crypt_queue) {
1263 ti->error = "Couldn't create kcryptd queue"; 1667 ti->error = "Couldn't create kcryptd queue";
1264 goto bad; 1668 goto bad;
@@ -1278,7 +1682,7 @@ static int crypt_map(struct dm_target *ti, struct bio *bio,
1278 struct dm_crypt_io *io; 1682 struct dm_crypt_io *io;
1279 struct crypt_config *cc; 1683 struct crypt_config *cc;
1280 1684
1281 if (unlikely(bio_empty_barrier(bio))) { 1685 if (bio->bi_rw & REQ_FLUSH) {
1282 cc = ti->private; 1686 cc = ti->private;
1283 bio->bi_bdev = cc->dev->bdev; 1687 bio->bi_bdev = cc->dev->bdev;
1284 return DM_MAPIO_REMAPPED; 1688 return DM_MAPIO_REMAPPED;
@@ -1286,9 +1690,10 @@ static int crypt_map(struct dm_target *ti, struct bio *bio,
1286 1690
1287 io = crypt_io_alloc(ti, bio, dm_target_offset(ti, bio->bi_sector)); 1691 io = crypt_io_alloc(ti, bio, dm_target_offset(ti, bio->bi_sector));
1288 1692
1289 if (bio_data_dir(io->base_bio) == READ) 1693 if (bio_data_dir(io->base_bio) == READ) {
1290 kcryptd_queue_io(io); 1694 if (kcryptd_io_read(io, GFP_NOWAIT))
1291 else 1695 kcryptd_queue_io(io);
1696 } else
1292 kcryptd_queue_crypt(io); 1697 kcryptd_queue_crypt(io);
1293 1698
1294 return DM_MAPIO_SUBMITTED; 1699 return DM_MAPIO_SUBMITTED;
@@ -1306,10 +1711,7 @@ static int crypt_status(struct dm_target *ti, status_type_t type,
1306 break; 1711 break;
1307 1712
1308 case STATUSTYPE_TABLE: 1713 case STATUSTYPE_TABLE:
1309 if (cc->cipher_mode) 1714 DMEMIT("%s ", cc->cipher_string);
1310 DMEMIT("%s-%s ", cc->cipher, cc->cipher_mode);
1311 else
1312 DMEMIT("%s ", cc->cipher);
1313 1715
1314 if (cc->key_size > 0) { 1716 if (cc->key_size > 0) {
1315 if ((maxlen - sz) < ((cc->key_size << 1) + 1)) 1717 if ((maxlen - sz) < ((cc->key_size << 1) + 1))
@@ -1421,7 +1823,7 @@ static int crypt_iterate_devices(struct dm_target *ti,
1421 1823
1422static struct target_type crypt_target = { 1824static struct target_type crypt_target = {
1423 .name = "crypt", 1825 .name = "crypt",
1424 .version = {1, 7, 0}, 1826 .version = {1, 10, 0},
1425 .module = THIS_MODULE, 1827 .module = THIS_MODULE,
1426 .ctr = crypt_ctr, 1828 .ctr = crypt_ctr,
1427 .dtr = crypt_dtr, 1829 .dtr = crypt_dtr,
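The dm-crypt changes above replace the single cc->tfm with per-CPU transform arrays and extend the table syntax to cipher[:keycount]-mode-iv:ivopts, splitting one decoded key across keycount transforms. Below is a minimal sketch, not taken from the patch, of the subkey arithmetic used by crypt_setkey_allcpus(); the helper name and callback are hypothetical, and only key_size, tfms_count and the ilog2() shift follow the code above (tfms_count is a power of two, as crypt_ctr_cipher() enforces).

#include <linux/log2.h>
#include <linux/types.h>

/*
 * Illustrative only: carve one key buffer into equal per-transform slices,
 * the way crypt_setkey_allcpus() feeds crypto_ablkcipher_setkey() above.
 */
static void demo_split_key(const u8 *key, unsigned key_size,
			   unsigned tfms_count,
			   void (*set_subkey)(unsigned idx, const u8 *subkey,
					      unsigned subkey_size))
{
	unsigned subkey_size = key_size >> ilog2(tfms_count);
	unsigned i;

	for (i = 0; i < tfms_count; i++)
		set_subkey(i, key + i * subkey_size, subkey_size);
}

With tfms_count == 1 this degenerates to the old single-key behaviour, which is why existing tables without a :keycount suffix keep working.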
diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c
index baa11912cc94..f18375dcedd9 100644
--- a/drivers/md/dm-delay.c
+++ b/drivers/md/dm-delay.c
@@ -352,7 +352,7 @@ static int __init dm_delay_init(void)
352{ 352{
353 int r = -ENOMEM; 353 int r = -ENOMEM;
354 354
355 kdelayd_wq = create_workqueue("kdelayd"); 355 kdelayd_wq = alloc_workqueue("kdelayd", WQ_MEM_RECLAIM, 0);
356 if (!kdelayd_wq) { 356 if (!kdelayd_wq) {
357 DMERR("Couldn't start kdelayd"); 357 DMERR("Couldn't start kdelayd");
358 goto bad_queue; 358 goto bad_queue;
diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c
new file mode 100644
index 000000000000..ea790623c30b
--- /dev/null
+++ b/drivers/md/dm-flakey.c
@@ -0,0 +1,212 @@
1/*
2 * Copyright (C) 2003 Sistina Software (UK) Limited.
3 * Copyright (C) 2004, 2010 Red Hat, Inc. All rights reserved.
4 *
5 * This file is released under the GPL.
6 */
7
8#include <linux/device-mapper.h>
9
10#include <linux/module.h>
11#include <linux/init.h>
12#include <linux/blkdev.h>
13#include <linux/bio.h>
14#include <linux/slab.h>
15
16#define DM_MSG_PREFIX "flakey"
17
18/*
19 * Flakey: Used for testing only, simulates intermittent,
20 * catastrophic device failure.
21 */
22struct flakey_c {
23 struct dm_dev *dev;
24 unsigned long start_time;
25 sector_t start;
26 unsigned up_interval;
27 unsigned down_interval;
28};
29
30/*
31 * Construct a flakey mapping: <dev_path> <offset> <up interval> <down interval>
32 */
33static int flakey_ctr(struct dm_target *ti, unsigned int argc, char **argv)
34{
35 struct flakey_c *fc;
36 unsigned long long tmp;
37
38 if (argc != 4) {
39 ti->error = "dm-flakey: Invalid argument count";
40 return -EINVAL;
41 }
42
43 fc = kmalloc(sizeof(*fc), GFP_KERNEL);
44 if (!fc) {
45 ti->error = "dm-flakey: Cannot allocate linear context";
46 return -ENOMEM;
47 }
48 fc->start_time = jiffies;
49
50 if (sscanf(argv[1], "%llu", &tmp) != 1) {
51 ti->error = "dm-flakey: Invalid device sector";
52 goto bad;
53 }
54 fc->start = tmp;
55
56 if (sscanf(argv[2], "%u", &fc->up_interval) != 1) {
57 ti->error = "dm-flakey: Invalid up interval";
58 goto bad;
59 }
60
61 if (sscanf(argv[3], "%u", &fc->down_interval) != 1) {
62 ti->error = "dm-flakey: Invalid down interval";
63 goto bad;
64 }
65
66 if (!(fc->up_interval + fc->down_interval)) {
67 ti->error = "dm-flakey: Total (up + down) interval is zero";
68 goto bad;
69 }
70
71 if (fc->up_interval + fc->down_interval < fc->up_interval) {
72 ti->error = "dm-flakey: Interval overflow";
73 goto bad;
74 }
75
76 if (dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &fc->dev)) {
77 ti->error = "dm-flakey: Device lookup failed";
78 goto bad;
79 }
80
81 ti->num_flush_requests = 1;
82 ti->private = fc;
83 return 0;
84
85bad:
86 kfree(fc);
87 return -EINVAL;
88}
89
90static void flakey_dtr(struct dm_target *ti)
91{
92 struct flakey_c *fc = ti->private;
93
94 dm_put_device(ti, fc->dev);
95 kfree(fc);
96}
97
98static sector_t flakey_map_sector(struct dm_target *ti, sector_t bi_sector)
99{
100 struct flakey_c *fc = ti->private;
101
102 return fc->start + (bi_sector - ti->begin);
103}
104
105static void flakey_map_bio(struct dm_target *ti, struct bio *bio)
106{
107 struct flakey_c *fc = ti->private;
108
109 bio->bi_bdev = fc->dev->bdev;
110 if (bio_sectors(bio))
111 bio->bi_sector = flakey_map_sector(ti, bio->bi_sector);
112}
113
114static int flakey_map(struct dm_target *ti, struct bio *bio,
115 union map_info *map_context)
116{
117 struct flakey_c *fc = ti->private;
118 unsigned elapsed;
119
120 /* Are we alive ? */
121 elapsed = (jiffies - fc->start_time) / HZ;
122 if (elapsed % (fc->up_interval + fc->down_interval) >= fc->up_interval)
123 return -EIO;
124
125 flakey_map_bio(ti, bio);
126
127 return DM_MAPIO_REMAPPED;
128}
129
130static int flakey_status(struct dm_target *ti, status_type_t type,
131 char *result, unsigned int maxlen)
132{
133 struct flakey_c *fc = ti->private;
134
135 switch (type) {
136 case STATUSTYPE_INFO:
137 result[0] = '\0';
138 break;
139
140 case STATUSTYPE_TABLE:
141 snprintf(result, maxlen, "%s %llu %u %u", fc->dev->name,
142 (unsigned long long)fc->start, fc->up_interval,
143 fc->down_interval);
144 break;
145 }
146 return 0;
147}
148
149static int flakey_ioctl(struct dm_target *ti, unsigned int cmd, unsigned long arg)
150{
151 struct flakey_c *fc = ti->private;
152
153 return __blkdev_driver_ioctl(fc->dev->bdev, fc->dev->mode, cmd, arg);
154}
155
156static int flakey_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
157 struct bio_vec *biovec, int max_size)
158{
159 struct flakey_c *fc = ti->private;
160 struct request_queue *q = bdev_get_queue(fc->dev->bdev);
161
162 if (!q->merge_bvec_fn)
163 return max_size;
164
165 bvm->bi_bdev = fc->dev->bdev;
166 bvm->bi_sector = flakey_map_sector(ti, bvm->bi_sector);
167
168 return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
169}
170
171static int flakey_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data)
172{
173 struct flakey_c *fc = ti->private;
174
175 return fn(ti, fc->dev, fc->start, ti->len, data);
176}
177
178static struct target_type flakey_target = {
179 .name = "flakey",
180 .version = {1, 1, 0},
181 .module = THIS_MODULE,
182 .ctr = flakey_ctr,
183 .dtr = flakey_dtr,
184 .map = flakey_map,
185 .status = flakey_status,
186 .ioctl = flakey_ioctl,
187 .merge = flakey_merge,
188 .iterate_devices = flakey_iterate_devices,
189};
190
191static int __init dm_flakey_init(void)
192{
193 int r = dm_register_target(&flakey_target);
194
195 if (r < 0)
196 DMERR("register failed %d", r);
197
198 return r;
199}
200
201static void __exit dm_flakey_exit(void)
202{
203 dm_unregister_target(&flakey_target);
204}
205
206/* Module hooks */
207module_init(dm_flakey_init);
208module_exit(dm_flakey_exit);
209
210MODULE_DESCRIPTION(DM_NAME " flakey target");
211MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
212MODULE_LICENSE("GPL");
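The new dm-flakey target above takes <dev_path> <offset> <up interval> <down interval> and alternates between passing I/O through and failing it with -EIO. Here is a minimal sketch, not from the patch, of the availability test in flakey_map(); the helper name is hypothetical, the modulo arithmetic mirrors the driver.

#include <linux/jiffies.h>
#include <linux/types.h>

/*
 * Illustrative only: within each (up + down) second cycle the device is
 * "up" for the first up_interval seconds and fails everything afterwards.
 */
static bool demo_flakey_is_up(unsigned long start_time,
			      unsigned up_interval, unsigned down_interval)
{
	unsigned elapsed = (jiffies - start_time) / HZ;

	return elapsed % (up_interval + down_interval) < up_interval;
}

The constructor rejects up_interval + down_interval == 0 and interval overflow precisely so this modulo never divides by zero or wraps.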
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index 0590c75b0ab6..2067288f61f9 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -19,6 +19,8 @@
19#define DM_MSG_PREFIX "io" 19#define DM_MSG_PREFIX "io"
20 20
21#define DM_IO_MAX_REGIONS BITS_PER_LONG 21#define DM_IO_MAX_REGIONS BITS_PER_LONG
22#define MIN_IOS 16
23#define MIN_BIOS 16
22 24
23struct dm_io_client { 25struct dm_io_client {
24 mempool_t *pool; 26 mempool_t *pool;
@@ -31,7 +33,6 @@ struct dm_io_client {
31 */ 33 */
32struct io { 34struct io {
33 unsigned long error_bits; 35 unsigned long error_bits;
34 unsigned long eopnotsupp_bits;
35 atomic_t count; 36 atomic_t count;
36 struct task_struct *sleeper; 37 struct task_struct *sleeper;
37 struct dm_io_client *client; 38 struct dm_io_client *client;
@@ -42,33 +43,21 @@ struct io {
42static struct kmem_cache *_dm_io_cache; 43static struct kmem_cache *_dm_io_cache;
43 44
44/* 45/*
45 * io contexts are only dynamically allocated for asynchronous
46 * io. Since async io is likely to be the majority of io we'll
47 * have the same number of io contexts as bios! (FIXME: must reduce this).
48 */
49
50static unsigned int pages_to_ios(unsigned int pages)
51{
52 return 4 * pages; /* too many ? */
53}
54
55/*
56 * Create a client with mempool and bioset. 46 * Create a client with mempool and bioset.
57 */ 47 */
58struct dm_io_client *dm_io_client_create(unsigned num_pages) 48struct dm_io_client *dm_io_client_create(void)
59{ 49{
60 unsigned ios = pages_to_ios(num_pages);
61 struct dm_io_client *client; 50 struct dm_io_client *client;
62 51
63 client = kmalloc(sizeof(*client), GFP_KERNEL); 52 client = kmalloc(sizeof(*client), GFP_KERNEL);
64 if (!client) 53 if (!client)
65 return ERR_PTR(-ENOMEM); 54 return ERR_PTR(-ENOMEM);
66 55
67 client->pool = mempool_create_slab_pool(ios, _dm_io_cache); 56 client->pool = mempool_create_slab_pool(MIN_IOS, _dm_io_cache);
68 if (!client->pool) 57 if (!client->pool)
69 goto bad; 58 goto bad;
70 59
71 client->bios = bioset_create(16, 0); 60 client->bios = bioset_create(MIN_BIOS, 0);
72 if (!client->bios) 61 if (!client->bios)
73 goto bad; 62 goto bad;
74 63
@@ -82,13 +71,6 @@ struct dm_io_client *dm_io_client_create(unsigned num_pages)
82} 71}
83EXPORT_SYMBOL(dm_io_client_create); 72EXPORT_SYMBOL(dm_io_client_create);
84 73
85int dm_io_client_resize(unsigned num_pages, struct dm_io_client *client)
86{
87 return mempool_resize(client->pool, pages_to_ios(num_pages),
88 GFP_KERNEL);
89}
90EXPORT_SYMBOL(dm_io_client_resize);
91
92void dm_io_client_destroy(struct dm_io_client *client) 74void dm_io_client_destroy(struct dm_io_client *client)
93{ 75{
94 mempool_destroy(client->pool); 76 mempool_destroy(client->pool);
@@ -130,11 +112,8 @@ static void retrieve_io_and_region_from_bio(struct bio *bio, struct io **io,
130 *---------------------------------------------------------------*/ 112 *---------------------------------------------------------------*/
131static void dec_count(struct io *io, unsigned int region, int error) 113static void dec_count(struct io *io, unsigned int region, int error)
132{ 114{
133 if (error) { 115 if (error)
134 set_bit(region, &io->error_bits); 116 set_bit(region, &io->error_bits);
135 if (error == -EOPNOTSUPP)
136 set_bit(region, &io->eopnotsupp_bits);
137 }
138 117
139 if (atomic_dec_and_test(&io->count)) { 118 if (atomic_dec_and_test(&io->count)) {
140 if (io->sleeper) 119 if (io->sleeper)
@@ -310,8 +289,8 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where,
310 sector_t remaining = where->count; 289 sector_t remaining = where->count;
311 290
312 /* 291 /*
313 * where->count may be zero if rw holds a write barrier and we 292 * where->count may be zero if rw holds a flush and we need to
314 * need to send a zero-sized barrier. 293 * send a zero-sized flush.
315 */ 294 */
316 do { 295 do {
317 /* 296 /*
@@ -356,7 +335,7 @@ static void dispatch_io(int rw, unsigned int num_regions,
356 BUG_ON(num_regions > DM_IO_MAX_REGIONS); 335 BUG_ON(num_regions > DM_IO_MAX_REGIONS);
357 336
358 if (sync) 337 if (sync)
359 rw |= REQ_SYNC | REQ_UNPLUG; 338 rw |= REQ_SYNC;
360 339
361 /* 340 /*
362 * For multiple regions we need to be careful to rewind 341 * For multiple regions we need to be careful to rewind
@@ -364,7 +343,7 @@ static void dispatch_io(int rw, unsigned int num_regions,
364 */ 343 */
365 for (i = 0; i < num_regions; i++) { 344 for (i = 0; i < num_regions; i++) {
366 *dp = old_pages; 345 *dp = old_pages;
367 if (where[i].count || (rw & REQ_HARDBARRIER)) 346 if (where[i].count || (rw & REQ_FLUSH))
368 do_region(rw, i, where + i, dp, io); 347 do_region(rw, i, where + i, dp, io);
369 } 348 }
370 349
@@ -393,9 +372,7 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions,
393 return -EIO; 372 return -EIO;
394 } 373 }
395 374
396retry:
397 io->error_bits = 0; 375 io->error_bits = 0;
398 io->eopnotsupp_bits = 0;
399 atomic_set(&io->count, 1); /* see dispatch_io() */ 376 atomic_set(&io->count, 1); /* see dispatch_io() */
400 io->sleeper = current; 377 io->sleeper = current;
401 io->client = client; 378 io->client = client;
@@ -412,11 +389,6 @@ retry:
412 } 389 }
413 set_current_state(TASK_RUNNING); 390 set_current_state(TASK_RUNNING);
414 391
415 if (io->eopnotsupp_bits && (rw & REQ_HARDBARRIER)) {
416 rw &= ~REQ_HARDBARRIER;
417 goto retry;
418 }
419
420 if (error_bits) 392 if (error_bits)
421 *error_bits = io->error_bits; 393 *error_bits = io->error_bits;
422 394
@@ -437,7 +409,6 @@ static int async_io(struct dm_io_client *client, unsigned int num_regions,
437 409
438 io = mempool_alloc(client->pool, GFP_NOIO); 410 io = mempool_alloc(client->pool, GFP_NOIO);
439 io->error_bits = 0; 411 io->error_bits = 0;
440 io->eopnotsupp_bits = 0;
441 atomic_set(&io->count, 1); /* see dispatch_io() */ 412 atomic_set(&io->count, 1); /* see dispatch_io() */
442 io->sleeper = NULL; 413 io->sleeper = NULL;
443 io->client = client; 414 io->client = client;
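The dm-io changes above drop the barrier retry logic, size the io mempool at a fixed MIN_IOS, and change dm_io_client_create() to take no arguments and return an ERR_PTR on failure. Below is a minimal sketch, not from the patch, of how a caller is now expected to create a client; the wrapper is hypothetical.

#include <linux/dm-io.h>
#include <linux/err.h>

/* Illustrative only: the post-change client creation pattern. */
static struct dm_io_client *demo_create_io_client(void)
{
	struct dm_io_client *client = dm_io_client_create();

	if (IS_ERR(client))
		return NULL;	/* real callers propagate PTR_ERR(client) */

	return client;
}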
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 3e39193e5036..4cacdad2270a 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -295,19 +295,55 @@ retry:
295 DMWARN("remove_all left %d open device(s)", dev_skipped); 295 DMWARN("remove_all left %d open device(s)", dev_skipped);
296} 296}
297 297
298/*
299 * Set the uuid of a hash_cell that isn't already set.
300 */
301static void __set_cell_uuid(struct hash_cell *hc, char *new_uuid)
302{
303 mutex_lock(&dm_hash_cells_mutex);
304 hc->uuid = new_uuid;
305 mutex_unlock(&dm_hash_cells_mutex);
306
307 list_add(&hc->uuid_list, _uuid_buckets + hash_str(new_uuid));
308}
309
310/*
311 * Changes the name of a hash_cell and returns the old name for
312 * the caller to free.
313 */
314static char *__change_cell_name(struct hash_cell *hc, char *new_name)
315{
316 char *old_name;
317
318 /*
319 * Rename and move the name cell.
320 */
321 list_del(&hc->name_list);
322 old_name = hc->name;
323
324 mutex_lock(&dm_hash_cells_mutex);
325 hc->name = new_name;
326 mutex_unlock(&dm_hash_cells_mutex);
327
328 list_add(&hc->name_list, _name_buckets + hash_str(new_name));
329
330 return old_name;
331}
332
298static struct mapped_device *dm_hash_rename(struct dm_ioctl *param, 333static struct mapped_device *dm_hash_rename(struct dm_ioctl *param,
299 const char *new) 334 const char *new)
300{ 335{
301 char *new_name, *old_name; 336 char *new_data, *old_name = NULL;
302 struct hash_cell *hc; 337 struct hash_cell *hc;
303 struct dm_table *table; 338 struct dm_table *table;
304 struct mapped_device *md; 339 struct mapped_device *md;
340 unsigned change_uuid = (param->flags & DM_UUID_FLAG) ? 1 : 0;
305 341
306 /* 342 /*
307 * duplicate new. 343 * duplicate new.
308 */ 344 */
309 new_name = kstrdup(new, GFP_KERNEL); 345 new_data = kstrdup(new, GFP_KERNEL);
310 if (!new_name) 346 if (!new_data)
311 return ERR_PTR(-ENOMEM); 347 return ERR_PTR(-ENOMEM);
312 348
313 down_write(&_hash_lock); 349 down_write(&_hash_lock);
@@ -315,13 +351,19 @@ static struct mapped_device *dm_hash_rename(struct dm_ioctl *param,
315 /* 351 /*
316 * Is new free ? 352 * Is new free ?
317 */ 353 */
318 hc = __get_name_cell(new); 354 if (change_uuid)
355 hc = __get_uuid_cell(new);
356 else
357 hc = __get_name_cell(new);
358
319 if (hc) { 359 if (hc) {
320 DMWARN("asked to rename to an already-existing name %s -> %s", 360 DMWARN("Unable to change %s on mapped device %s to one that "
361 "already exists: %s",
362 change_uuid ? "uuid" : "name",
321 param->name, new); 363 param->name, new);
322 dm_put(hc->md); 364 dm_put(hc->md);
323 up_write(&_hash_lock); 365 up_write(&_hash_lock);
324 kfree(new_name); 366 kfree(new_data);
325 return ERR_PTR(-EBUSY); 367 return ERR_PTR(-EBUSY);
326 } 368 }
327 369
@@ -330,22 +372,30 @@ static struct mapped_device *dm_hash_rename(struct dm_ioctl *param,
330 */ 372 */
331 hc = __get_name_cell(param->name); 373 hc = __get_name_cell(param->name);
332 if (!hc) { 374 if (!hc) {
333 DMWARN("asked to rename a non-existent device %s -> %s", 375 DMWARN("Unable to rename non-existent device, %s to %s%s",
334 param->name, new); 376 param->name, change_uuid ? "uuid " : "", new);
335 up_write(&_hash_lock); 377 up_write(&_hash_lock);
336 kfree(new_name); 378 kfree(new_data);
337 return ERR_PTR(-ENXIO); 379 return ERR_PTR(-ENXIO);
338 } 380 }
339 381
340 /* 382 /*
341 * rename and move the name cell. 383 * Does this device already have a uuid?
342 */ 384 */
343 list_del(&hc->name_list); 385 if (change_uuid && hc->uuid) {
344 old_name = hc->name; 386 DMWARN("Unable to change uuid of mapped device %s to %s "
345 mutex_lock(&dm_hash_cells_mutex); 387 "because uuid is already set to %s",
346 hc->name = new_name; 388 param->name, new, hc->uuid);
347 mutex_unlock(&dm_hash_cells_mutex); 389 dm_put(hc->md);
348 list_add(&hc->name_list, _name_buckets + hash_str(new_name)); 390 up_write(&_hash_lock);
391 kfree(new_data);
392 return ERR_PTR(-EINVAL);
393 }
394
395 if (change_uuid)
396 __set_cell_uuid(hc, new_data);
397 else
398 old_name = __change_cell_name(hc, new_data);
349 399
350 /* 400 /*
351 * Wake up any dm event waiters. 401 * Wake up any dm event waiters.
@@ -729,7 +779,7 @@ static int dev_remove(struct dm_ioctl *param, size_t param_size)
729 hc = __find_device_hash_cell(param); 779 hc = __find_device_hash_cell(param);
730 780
731 if (!hc) { 781 if (!hc) {
732 DMWARN("device doesn't appear to be in the dev hash table."); 782 DMDEBUG_LIMIT("device doesn't appear to be in the dev hash table.");
733 up_write(&_hash_lock); 783 up_write(&_hash_lock);
734 return -ENXIO; 784 return -ENXIO;
735 } 785 }
@@ -741,7 +791,7 @@ static int dev_remove(struct dm_ioctl *param, size_t param_size)
741 */ 791 */
742 r = dm_lock_for_deletion(md); 792 r = dm_lock_for_deletion(md);
743 if (r) { 793 if (r) {
744 DMWARN("unable to remove open device %s", hc->name); 794 DMDEBUG_LIMIT("unable to remove open device %s", hc->name);
745 up_write(&_hash_lock); 795 up_write(&_hash_lock);
746 dm_put(md); 796 dm_put(md);
747 return r; 797 return r;
@@ -774,21 +824,24 @@ static int invalid_str(char *str, void *end)
774static int dev_rename(struct dm_ioctl *param, size_t param_size) 824static int dev_rename(struct dm_ioctl *param, size_t param_size)
775{ 825{
776 int r; 826 int r;
777 char *new_name = (char *) param + param->data_start; 827 char *new_data = (char *) param + param->data_start;
778 struct mapped_device *md; 828 struct mapped_device *md;
829 unsigned change_uuid = (param->flags & DM_UUID_FLAG) ? 1 : 0;
779 830
780 if (new_name < param->data || 831 if (new_data < param->data ||
781 invalid_str(new_name, (void *) param + param_size) || 832 invalid_str(new_data, (void *) param + param_size) ||
782 strlen(new_name) > DM_NAME_LEN - 1) { 833 strlen(new_data) > (change_uuid ? DM_UUID_LEN - 1 : DM_NAME_LEN - 1)) {
783 DMWARN("Invalid new logical volume name supplied."); 834 DMWARN("Invalid new mapped device name or uuid string supplied.");
784 return -EINVAL; 835 return -EINVAL;
785 } 836 }
786 837
787 r = check_name(new_name); 838 if (!change_uuid) {
788 if (r) 839 r = check_name(new_data);
789 return r; 840 if (r)
841 return r;
842 }
790 843
791 md = dm_hash_rename(param, new_name); 844 md = dm_hash_rename(param, new_data);
792 if (IS_ERR(md)) 845 if (IS_ERR(md))
793 return PTR_ERR(md); 846 return PTR_ERR(md);
794 847
@@ -885,7 +938,7 @@ static int do_resume(struct dm_ioctl *param)
885 938
886 hc = __find_device_hash_cell(param); 939 hc = __find_device_hash_cell(param);
887 if (!hc) { 940 if (!hc) {
888 DMWARN("device doesn't appear to be in the dev hash table."); 941 DMDEBUG_LIMIT("device doesn't appear to be in the dev hash table.");
889 up_write(&_hash_lock); 942 up_write(&_hash_lock);
890 return -ENXIO; 943 return -ENXIO;
891 } 944 }
@@ -1212,7 +1265,7 @@ static int table_clear(struct dm_ioctl *param, size_t param_size)
1212 1265
1213 hc = __find_device_hash_cell(param); 1266 hc = __find_device_hash_cell(param);
1214 if (!hc) { 1267 if (!hc) {
1215 DMWARN("device doesn't appear to be in the dev hash table."); 1268 DMDEBUG_LIMIT("device doesn't appear to be in the dev hash table.");
1216 up_write(&_hash_lock); 1269 up_write(&_hash_lock);
1217 return -ENXIO; 1270 return -ENXIO;
1218 } 1271 }
@@ -1448,14 +1501,10 @@ static int check_version(unsigned int cmd, struct dm_ioctl __user *user)
1448 return r; 1501 return r;
1449} 1502}
1450 1503
1451static void free_params(struct dm_ioctl *param)
1452{
1453 vfree(param);
1454}
1455
1456static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl **param) 1504static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl **param)
1457{ 1505{
1458 struct dm_ioctl tmp, *dmi; 1506 struct dm_ioctl tmp, *dmi;
1507 int secure_data;
1459 1508
1460 if (copy_from_user(&tmp, user, sizeof(tmp) - sizeof(tmp.data))) 1509 if (copy_from_user(&tmp, user, sizeof(tmp) - sizeof(tmp.data)))
1461 return -EFAULT; 1510 return -EFAULT;
@@ -1463,17 +1512,30 @@ static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl **param)
1463 if (tmp.data_size < (sizeof(tmp) - sizeof(tmp.data))) 1512 if (tmp.data_size < (sizeof(tmp) - sizeof(tmp.data)))
1464 return -EINVAL; 1513 return -EINVAL;
1465 1514
1515 secure_data = tmp.flags & DM_SECURE_DATA_FLAG;
1516
1466 dmi = vmalloc(tmp.data_size); 1517 dmi = vmalloc(tmp.data_size);
1467 if (!dmi) 1518 if (!dmi) {
1519 if (secure_data && clear_user(user, tmp.data_size))
1520 return -EFAULT;
1468 return -ENOMEM; 1521 return -ENOMEM;
1469
1470 if (copy_from_user(dmi, user, tmp.data_size)) {
1471 vfree(dmi);
1472 return -EFAULT;
1473 } 1522 }
1474 1523
1524 if (copy_from_user(dmi, user, tmp.data_size))
1525 goto bad;
1526
1527 /* Wipe the user buffer so we do not return it to userspace */
1528 if (secure_data && clear_user(user, tmp.data_size))
1529 goto bad;
1530
1475 *param = dmi; 1531 *param = dmi;
1476 return 0; 1532 return 0;
1533
1534bad:
1535 if (secure_data)
1536 memset(dmi, 0, tmp.data_size);
1537 vfree(dmi);
1538 return -EFAULT;
1477} 1539}
1478 1540
1479static int validate_params(uint cmd, struct dm_ioctl *param) 1541static int validate_params(uint cmd, struct dm_ioctl *param)
@@ -1481,6 +1543,7 @@ static int validate_params(uint cmd, struct dm_ioctl *param)
1481 /* Always clear this flag */ 1543 /* Always clear this flag */
1482 param->flags &= ~DM_BUFFER_FULL_FLAG; 1544 param->flags &= ~DM_BUFFER_FULL_FLAG;
1483 param->flags &= ~DM_UEVENT_GENERATED_FLAG; 1545 param->flags &= ~DM_UEVENT_GENERATED_FLAG;
1546 param->flags &= ~DM_SECURE_DATA_FLAG;
1484 1547
1485 /* Ignores parameters */ 1548 /* Ignores parameters */
1486 if (cmd == DM_REMOVE_ALL_CMD || 1549 if (cmd == DM_REMOVE_ALL_CMD ||
@@ -1508,10 +1571,11 @@ static int validate_params(uint cmd, struct dm_ioctl *param)
1508static int ctl_ioctl(uint command, struct dm_ioctl __user *user) 1571static int ctl_ioctl(uint command, struct dm_ioctl __user *user)
1509{ 1572{
1510 int r = 0; 1573 int r = 0;
1574 int wipe_buffer;
1511 unsigned int cmd; 1575 unsigned int cmd;
1512 struct dm_ioctl *uninitialized_var(param); 1576 struct dm_ioctl *uninitialized_var(param);
1513 ioctl_fn fn = NULL; 1577 ioctl_fn fn = NULL;
1514 size_t param_size; 1578 size_t input_param_size;
1515 1579
1516 /* only root can play with this */ 1580 /* only root can play with this */
1517 if (!capable(CAP_SYS_ADMIN)) 1581 if (!capable(CAP_SYS_ADMIN))
@@ -1558,13 +1622,15 @@ static int ctl_ioctl(uint command, struct dm_ioctl __user *user)
1558 if (r) 1622 if (r)
1559 return r; 1623 return r;
1560 1624
1625 input_param_size = param->data_size;
1626 wipe_buffer = param->flags & DM_SECURE_DATA_FLAG;
1627
1561 r = validate_params(cmd, param); 1628 r = validate_params(cmd, param);
1562 if (r) 1629 if (r)
1563 goto out; 1630 goto out;
1564 1631
1565 param_size = param->data_size;
1566 param->data_size = sizeof(*param); 1632 param->data_size = sizeof(*param);
1567 r = fn(param, param_size); 1633 r = fn(param, input_param_size);
1568 1634
1569 /* 1635 /*
1570 * Copy the results back to userland. 1636 * Copy the results back to userland.
@@ -1572,8 +1638,11 @@ static int ctl_ioctl(uint command, struct dm_ioctl __user *user)
1572 if (!r && copy_to_user(user, param, param->data_size)) 1638 if (!r && copy_to_user(user, param, param->data_size))
1573 r = -EFAULT; 1639 r = -EFAULT;
1574 1640
1575 out: 1641out:
1576 free_params(param); 1642 if (wipe_buffer)
1643 memset(param, 0, input_param_size);
1644
1645 vfree(param);
1577 return r; 1646 return r;
1578} 1647}
1579 1648
@@ -1596,6 +1665,7 @@ static const struct file_operations _ctl_fops = {
1596 .unlocked_ioctl = dm_ctl_ioctl, 1665 .unlocked_ioctl = dm_ctl_ioctl,
1597 .compat_ioctl = dm_compat_ctl_ioctl, 1666 .compat_ioctl = dm_compat_ctl_ioctl,
1598 .owner = THIS_MODULE, 1667 .owner = THIS_MODULE,
1668 .llseek = noop_llseek,
1599}; 1669};
1600 1670
1601static struct miscdevice _dm_misc = { 1671static struct miscdevice _dm_misc = {
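The dm-ioctl changes above add DM_UUID_FLAG handling to rename and a DM_SECURE_DATA_FLAG that makes the kernel wipe both the userspace ioctl buffer and its kernel copy, so key material is not left behind. A minimal sketch, not from the patch, of that wiping pattern follows; the function and parameters are hypothetical, the sequence mirrors copy_params() and ctl_ioctl().

#include <linux/string.h>
#include <linux/types.h>
#include <linux/uaccess.h>

/* Illustrative only: copy in, scrub the userspace source, scrub on error. */
static int demo_copy_secure(void __user *user, void *kbuf, size_t size)
{
	if (copy_from_user(kbuf, user, size))
		goto bad;

	/* Wipe the user buffer so secrets are not returned to userspace. */
	if (clear_user(user, size))
		goto bad;

	return 0;

bad:
	memset(kbuf, 0, size);
	return -EFAULT;
}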
diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c
index d8587bac5682..819e37eaaeba 100644
--- a/drivers/md/dm-kcopyd.c
+++ b/drivers/md/dm-kcopyd.c
@@ -27,15 +27,19 @@
27 27
28#include "dm.h" 28#include "dm.h"
29 29
30#define SUB_JOB_SIZE 128
31#define SPLIT_COUNT 8
32#define MIN_JOBS 8
33#define RESERVE_PAGES (DIV_ROUND_UP(SUB_JOB_SIZE << SECTOR_SHIFT, PAGE_SIZE))
34
30/*----------------------------------------------------------------- 35/*-----------------------------------------------------------------
31 * Each kcopyd client has its own little pool of preallocated 36 * Each kcopyd client has its own little pool of preallocated
32 * pages for kcopyd io. 37 * pages for kcopyd io.
33 *---------------------------------------------------------------*/ 38 *---------------------------------------------------------------*/
34struct dm_kcopyd_client { 39struct dm_kcopyd_client {
35 spinlock_t lock;
36 struct page_list *pages; 40 struct page_list *pages;
37 unsigned int nr_pages; 41 unsigned nr_reserved_pages;
38 unsigned int nr_free_pages; 42 unsigned nr_free_pages;
39 43
40 struct dm_io_client *io_client; 44 struct dm_io_client *io_client;
41 45
@@ -67,15 +71,18 @@ static void wake(struct dm_kcopyd_client *kc)
67 queue_work(kc->kcopyd_wq, &kc->kcopyd_work); 71 queue_work(kc->kcopyd_wq, &kc->kcopyd_work);
68} 72}
69 73
70static struct page_list *alloc_pl(void) 74/*
75 * Obtain one page for the use of kcopyd.
76 */
77static struct page_list *alloc_pl(gfp_t gfp)
71{ 78{
72 struct page_list *pl; 79 struct page_list *pl;
73 80
74 pl = kmalloc(sizeof(*pl), GFP_KERNEL); 81 pl = kmalloc(sizeof(*pl), gfp);
75 if (!pl) 82 if (!pl)
76 return NULL; 83 return NULL;
77 84
78 pl->page = alloc_page(GFP_KERNEL); 85 pl->page = alloc_page(gfp);
79 if (!pl->page) { 86 if (!pl->page) {
80 kfree(pl); 87 kfree(pl);
81 return NULL; 88 return NULL;
@@ -90,41 +97,56 @@ static void free_pl(struct page_list *pl)
90 kfree(pl); 97 kfree(pl);
91} 98}
92 99
93static int kcopyd_get_pages(struct dm_kcopyd_client *kc, 100/*
94 unsigned int nr, struct page_list **pages) 101 * Add the provided pages to a client's free page list, releasing
102 * back to the system any beyond the reserved_pages limit.
103 */
104static void kcopyd_put_pages(struct dm_kcopyd_client *kc, struct page_list *pl)
95{ 105{
96 struct page_list *pl; 106 struct page_list *next;
97
98 spin_lock(&kc->lock);
99 if (kc->nr_free_pages < nr) {
100 spin_unlock(&kc->lock);
101 return -ENOMEM;
102 }
103
104 kc->nr_free_pages -= nr;
105 for (*pages = pl = kc->pages; --nr; pl = pl->next)
106 ;
107 107
108 kc->pages = pl->next; 108 do {
109 pl->next = NULL; 109 next = pl->next;
110 110
111 spin_unlock(&kc->lock); 111 if (kc->nr_free_pages >= kc->nr_reserved_pages)
112 free_pl(pl);
113 else {
114 pl->next = kc->pages;
115 kc->pages = pl;
116 kc->nr_free_pages++;
117 }
112 118
113 return 0; 119 pl = next;
120 } while (pl);
114} 121}
115 122
116static void kcopyd_put_pages(struct dm_kcopyd_client *kc, struct page_list *pl) 123static int kcopyd_get_pages(struct dm_kcopyd_client *kc,
124 unsigned int nr, struct page_list **pages)
117{ 125{
118 struct page_list *cursor; 126 struct page_list *pl;
127
128 *pages = NULL;
129
130 do {
131 pl = alloc_pl(__GFP_NOWARN | __GFP_NORETRY);
132 if (unlikely(!pl)) {
133 /* Use reserved pages */
134 pl = kc->pages;
135 if (unlikely(!pl))
136 goto out_of_memory;
137 kc->pages = pl->next;
138 kc->nr_free_pages--;
139 }
140 pl->next = *pages;
141 *pages = pl;
142 } while (--nr);
119 143
120 spin_lock(&kc->lock); 144 return 0;
121 for (cursor = pl; cursor->next; cursor = cursor->next)
122 kc->nr_free_pages++;
123 145
124 kc->nr_free_pages++; 146out_of_memory:
125 cursor->next = kc->pages; 147 if (*pages)
126 kc->pages = pl; 148 kcopyd_put_pages(kc, *pages);
127 spin_unlock(&kc->lock); 149 return -ENOMEM;
128} 150}
129 151
130/* 152/*
@@ -141,13 +163,16 @@ static void drop_pages(struct page_list *pl)
141 } 163 }
142} 164}
143 165
144static int client_alloc_pages(struct dm_kcopyd_client *kc, unsigned int nr) 166/*
167 * Allocate and reserve nr_pages for the use of a specific client.
168 */
169static int client_reserve_pages(struct dm_kcopyd_client *kc, unsigned nr_pages)
145{ 170{
146 unsigned int i; 171 unsigned i;
147 struct page_list *pl = NULL, *next; 172 struct page_list *pl = NULL, *next;
148 173
149 for (i = 0; i < nr; i++) { 174 for (i = 0; i < nr_pages; i++) {
150 next = alloc_pl(); 175 next = alloc_pl(GFP_KERNEL);
151 if (!next) { 176 if (!next) {
152 if (pl) 177 if (pl)
153 drop_pages(pl); 178 drop_pages(pl);
@@ -157,17 +182,18 @@ static int client_alloc_pages(struct dm_kcopyd_client *kc, unsigned int nr)
157 pl = next; 182 pl = next;
158 } 183 }
159 184
185 kc->nr_reserved_pages += nr_pages;
160 kcopyd_put_pages(kc, pl); 186 kcopyd_put_pages(kc, pl);
161 kc->nr_pages += nr; 187
162 return 0; 188 return 0;
163} 189}
164 190
165static void client_free_pages(struct dm_kcopyd_client *kc) 191static void client_free_pages(struct dm_kcopyd_client *kc)
166{ 192{
167 BUG_ON(kc->nr_free_pages != kc->nr_pages); 193 BUG_ON(kc->nr_free_pages != kc->nr_reserved_pages);
168 drop_pages(kc->pages); 194 drop_pages(kc->pages);
169 kc->pages = NULL; 195 kc->pages = NULL;
170 kc->nr_free_pages = kc->nr_pages = 0; 196 kc->nr_free_pages = kc->nr_reserved_pages = 0;
171} 197}
172 198
173/*----------------------------------------------------------------- 199/*-----------------------------------------------------------------
@@ -216,16 +242,17 @@ struct kcopyd_job {
216 struct mutex lock; 242 struct mutex lock;
217 atomic_t sub_jobs; 243 atomic_t sub_jobs;
218 sector_t progress; 244 sector_t progress;
219};
220 245
221/* FIXME: this should scale with the number of pages */ 246 struct kcopyd_job *master_job;
222#define MIN_JOBS 512 247};
223 248
224static struct kmem_cache *_job_cache; 249static struct kmem_cache *_job_cache;
225 250
226int __init dm_kcopyd_init(void) 251int __init dm_kcopyd_init(void)
227{ 252{
228 _job_cache = KMEM_CACHE(kcopyd_job, 0); 253 _job_cache = kmem_cache_create("kcopyd_job",
254 sizeof(struct kcopyd_job) * (SPLIT_COUNT + 1),
255 __alignof__(struct kcopyd_job), 0, NULL);
229 if (!_job_cache) 256 if (!_job_cache)
230 return -ENOMEM; 257 return -ENOMEM;
231 258
@@ -299,7 +326,12 @@ static int run_complete_job(struct kcopyd_job *job)
299 326
300 if (job->pages) 327 if (job->pages)
301 kcopyd_put_pages(kc, job->pages); 328 kcopyd_put_pages(kc, job->pages);
302 mempool_free(job, kc->job_pool); 329 /*
330 * If this is the master job, the sub jobs have already
331 * completed so we can free everything.
332 */
333 if (job->master_job == job)
334 mempool_free(job, kc->job_pool);
303 fn(read_err, write_err, context); 335 fn(read_err, write_err, context);
304 336
305 if (atomic_dec_and_test(&kc->nr_jobs)) 337 if (atomic_dec_and_test(&kc->nr_jobs))
@@ -345,7 +377,7 @@ static int run_io_job(struct kcopyd_job *job)
345{ 377{
346 int r; 378 int r;
347 struct dm_io_request io_req = { 379 struct dm_io_request io_req = {
348 .bi_rw = job->rw | REQ_SYNC | REQ_UNPLUG, 380 .bi_rw = job->rw,
349 .mem.type = DM_IO_PAGE_LIST, 381 .mem.type = DM_IO_PAGE_LIST,
350 .mem.ptr.pl = job->pages, 382 .mem.ptr.pl = job->pages,
351 .mem.offset = job->offset, 383 .mem.offset = job->offset,
@@ -428,6 +460,7 @@ static void do_work(struct work_struct *work)
428{ 460{
429 struct dm_kcopyd_client *kc = container_of(work, 461 struct dm_kcopyd_client *kc = container_of(work,
430 struct dm_kcopyd_client, kcopyd_work); 462 struct dm_kcopyd_client, kcopyd_work);
463 struct blk_plug plug;
431 464
432 /* 465 /*
433 * The order that these are called is *very* important. 466 * The order that these are called is *very* important.
@@ -436,9 +469,11 @@ static void do_work(struct work_struct *work)
436 * list. io jobs call wake when they complete and it all 469 * list. io jobs call wake when they complete and it all
437 * starts again. 470 * starts again.
438 */ 471 */
472 blk_start_plug(&plug);
439 process_jobs(&kc->complete_jobs, kc, run_complete_job); 473 process_jobs(&kc->complete_jobs, kc, run_complete_job);
440 process_jobs(&kc->pages_jobs, kc, run_pages_job); 474 process_jobs(&kc->pages_jobs, kc, run_pages_job);
441 process_jobs(&kc->io_jobs, kc, run_io_job); 475 process_jobs(&kc->io_jobs, kc, run_io_job);
476 blk_finish_plug(&plug);
442} 477}
443 478
444/* 479/*
@@ -457,14 +492,14 @@ static void dispatch_job(struct kcopyd_job *job)
457 wake(kc); 492 wake(kc);
458} 493}
459 494
460#define SUB_JOB_SIZE 128
461static void segment_complete(int read_err, unsigned long write_err, 495static void segment_complete(int read_err, unsigned long write_err,
462 void *context) 496 void *context)
463{ 497{
464 /* FIXME: tidy this function */ 498 /* FIXME: tidy this function */
465 sector_t progress = 0; 499 sector_t progress = 0;
466 sector_t count = 0; 500 sector_t count = 0;
467 struct kcopyd_job *job = (struct kcopyd_job *) context; 501 struct kcopyd_job *sub_job = (struct kcopyd_job *) context;
502 struct kcopyd_job *job = sub_job->master_job;
468 struct dm_kcopyd_client *kc = job->kc; 503 struct dm_kcopyd_client *kc = job->kc;
469 504
470 mutex_lock(&job->lock); 505 mutex_lock(&job->lock);
@@ -495,8 +530,6 @@ static void segment_complete(int read_err, unsigned long write_err,
495 530
496 if (count) { 531 if (count) {
497 int i; 532 int i;
498 struct kcopyd_job *sub_job = mempool_alloc(kc->job_pool,
499 GFP_NOIO);
500 533
501 *sub_job = *job; 534 *sub_job = *job;
502 sub_job->source.sector += progress; 535 sub_job->source.sector += progress;
@@ -508,7 +541,7 @@ static void segment_complete(int read_err, unsigned long write_err,
508 } 541 }
509 542
510 sub_job->fn = segment_complete; 543 sub_job->fn = segment_complete;
511 sub_job->context = job; 544 sub_job->context = sub_job;
512 dispatch_job(sub_job); 545 dispatch_job(sub_job);
513 546
514 } else if (atomic_dec_and_test(&job->sub_jobs)) { 547 } else if (atomic_dec_and_test(&job->sub_jobs)) {
@@ -528,19 +561,19 @@ static void segment_complete(int read_err, unsigned long write_err,
528} 561}
529 562
530/* 563/*
531 * Create some little jobs that will do the move between 564 * Create some sub jobs to share the work between them.
532 * them.
533 */ 565 */
534#define SPLIT_COUNT 8 566static void split_job(struct kcopyd_job *master_job)
535static void split_job(struct kcopyd_job *job)
536{ 567{
537 int i; 568 int i;
538 569
539 atomic_inc(&job->kc->nr_jobs); 570 atomic_inc(&master_job->kc->nr_jobs);
540 571
541 atomic_set(&job->sub_jobs, SPLIT_COUNT); 572 atomic_set(&master_job->sub_jobs, SPLIT_COUNT);
542 for (i = 0; i < SPLIT_COUNT; i++) 573 for (i = 0; i < SPLIT_COUNT; i++) {
543 segment_complete(0, 0u, job); 574 master_job[i + 1].master_job = master_job;
575 segment_complete(0, 0u, &master_job[i + 1]);
576 }
544} 577}
545 578
546int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from, 579int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from,
@@ -550,7 +583,8 @@ int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from,
550 struct kcopyd_job *job; 583 struct kcopyd_job *job;
551 584
552 /* 585 /*
553 * Allocate a new job. 586 * Allocate an array of jobs consisting of one master job
587 * followed by SPLIT_COUNT sub jobs.
554 */ 588 */
555 job = mempool_alloc(kc->job_pool, GFP_NOIO); 589 job = mempool_alloc(kc->job_pool, GFP_NOIO);
556 590
@@ -574,10 +608,10 @@ int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from,
574 608
575 job->fn = fn; 609 job->fn = fn;
576 job->context = context; 610 job->context = context;
611 job->master_job = job;
577 612
578 if (job->source.count < SUB_JOB_SIZE) 613 if (job->source.count <= SUB_JOB_SIZE)
579 dispatch_job(job); 614 dispatch_job(job);
580
581 else { 615 else {
582 mutex_init(&job->lock); 616 mutex_init(&job->lock);
583 job->progress = 0; 617 job->progress = 0;
@@ -603,17 +637,15 @@ int kcopyd_cancel(struct kcopyd_job *job, int block)
603/*----------------------------------------------------------------- 637/*-----------------------------------------------------------------
604 * Client setup 638 * Client setup
605 *---------------------------------------------------------------*/ 639 *---------------------------------------------------------------*/
606int dm_kcopyd_client_create(unsigned int nr_pages, 640struct dm_kcopyd_client *dm_kcopyd_client_create(void)
607 struct dm_kcopyd_client **result)
608{ 641{
609 int r = -ENOMEM; 642 int r = -ENOMEM;
610 struct dm_kcopyd_client *kc; 643 struct dm_kcopyd_client *kc;
611 644
612 kc = kmalloc(sizeof(*kc), GFP_KERNEL); 645 kc = kmalloc(sizeof(*kc), GFP_KERNEL);
613 if (!kc) 646 if (!kc)
614 return -ENOMEM; 647 return ERR_PTR(-ENOMEM);
615 648
616 spin_lock_init(&kc->lock);
617 spin_lock_init(&kc->job_lock); 649 spin_lock_init(&kc->job_lock);
618 INIT_LIST_HEAD(&kc->complete_jobs); 650 INIT_LIST_HEAD(&kc->complete_jobs);
619 INIT_LIST_HEAD(&kc->io_jobs); 651 INIT_LIST_HEAD(&kc->io_jobs);
@@ -624,17 +656,18 @@ int dm_kcopyd_client_create(unsigned int nr_pages,
624 goto bad_slab; 656 goto bad_slab;
625 657
626 INIT_WORK(&kc->kcopyd_work, do_work); 658 INIT_WORK(&kc->kcopyd_work, do_work);
627 kc->kcopyd_wq = create_singlethread_workqueue("kcopyd"); 659 kc->kcopyd_wq = alloc_workqueue("kcopyd",
660 WQ_NON_REENTRANT | WQ_MEM_RECLAIM, 0);
628 if (!kc->kcopyd_wq) 661 if (!kc->kcopyd_wq)
629 goto bad_workqueue; 662 goto bad_workqueue;
630 663
631 kc->pages = NULL; 664 kc->pages = NULL;
632 kc->nr_pages = kc->nr_free_pages = 0; 665 kc->nr_reserved_pages = kc->nr_free_pages = 0;
633 r = client_alloc_pages(kc, nr_pages); 666 r = client_reserve_pages(kc, RESERVE_PAGES);
634 if (r) 667 if (r)
635 goto bad_client_pages; 668 goto bad_client_pages;
636 669
637 kc->io_client = dm_io_client_create(nr_pages); 670 kc->io_client = dm_io_client_create();
638 if (IS_ERR(kc->io_client)) { 671 if (IS_ERR(kc->io_client)) {
639 r = PTR_ERR(kc->io_client); 672 r = PTR_ERR(kc->io_client);
640 goto bad_io_client; 673 goto bad_io_client;
@@ -643,8 +676,7 @@ int dm_kcopyd_client_create(unsigned int nr_pages,
643 init_waitqueue_head(&kc->destroyq); 676 init_waitqueue_head(&kc->destroyq);
644 atomic_set(&kc->nr_jobs, 0); 677 atomic_set(&kc->nr_jobs, 0);
645 678
646 *result = kc; 679 return kc;
647 return 0;
648 680
649bad_io_client: 681bad_io_client:
650 client_free_pages(kc); 682 client_free_pages(kc);
@@ -655,7 +687,7 @@ bad_workqueue:
655bad_slab: 687bad_slab:
656 kfree(kc); 688 kfree(kc);
657 689
658 return r; 690 return ERR_PTR(r);
659} 691}
660EXPORT_SYMBOL(dm_kcopyd_client_create); 692EXPORT_SYMBOL(dm_kcopyd_client_create);
661 693
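The dm-kcopyd changes above stop preallocating a large fixed page pool, allocate pages on demand with a small reserve (RESERVE_PAGES), and allocate each job as one slab object holding the master job plus SPLIT_COUNT sub jobs. Below is a minimal sketch, not from the patch, of the sub-job indexing that split_job() now relies on; everything except SPLIT_COUNT and the master_job field is hypothetical.

/*
 * Illustrative only: sub jobs live immediately after the master job in the
 * same allocation, so split_job() can hand out &master_job[i + 1] without
 * calling into the mempool again from the completion path.
 */
#define DEMO_SPLIT_COUNT 8

struct demo_job {
	struct demo_job *master_job;
	/* ... source, dests, progress, etc. ... */
};

static void demo_wire_sub_jobs(struct demo_job *master_job)
{
	int i;

	for (i = 0; i < DEMO_SPLIT_COUNT; i++)
		master_job[i + 1].master_job = master_job;
}

This layout is why the job cache is created with size (SPLIT_COUNT + 1) * sizeof(struct kcopyd_job) and why run_complete_job() only frees the allocation when job->master_job == job.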
diff --git a/drivers/md/dm-log-userspace-base.c b/drivers/md/dm-log-userspace-base.c
index 1ed0094f064b..aa2e0c374ab3 100644
--- a/drivers/md/dm-log-userspace-base.c
+++ b/drivers/md/dm-log-userspace-base.c
@@ -12,12 +12,22 @@
12 12
13#include "dm-log-userspace-transfer.h" 13#include "dm-log-userspace-transfer.h"
14 14
15#define DM_LOG_USERSPACE_VSN "1.1.0"
16
15struct flush_entry { 17struct flush_entry {
16 int type; 18 int type;
17 region_t region; 19 region_t region;
18 struct list_head list; 20 struct list_head list;
19}; 21};
20 22
23/*
24 * This limit on the number of mark and clear request is, to a degree,
25 * arbitrary. However, there is some basis for the choice in the limits
26 * imposed on the size of data payload by dm-log-userspace-transfer.c:
27 * dm_consult_userspace().
28 */
29#define MAX_FLUSH_GROUP_COUNT 32
30
21struct log_c { 31struct log_c {
22 struct dm_target *ti; 32 struct dm_target *ti;
23 uint32_t region_size; 33 uint32_t region_size;
@@ -37,8 +47,15 @@ struct log_c {
37 */ 47 */
38 uint64_t in_sync_hint; 48 uint64_t in_sync_hint;
39 49
50 /*
51 * Mark and clear requests are held until a flush is issued
52 * so that we can group, and thereby limit, the amount of
53 * network traffic between kernel and userspace. The 'flush_lock'
54 * is used to protect these lists.
55 */
40 spinlock_t flush_lock; 56 spinlock_t flush_lock;
41 struct list_head flush_list; /* only for clear and mark requests */ 57 struct list_head mark_list;
58 struct list_head clear_list;
42}; 59};
43 60
44static mempool_t *flush_entry_pool; 61static mempool_t *flush_entry_pool;
@@ -169,7 +186,8 @@ static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti,
169 186
170 strncpy(lc->uuid, argv[0], DM_UUID_LEN); 187 strncpy(lc->uuid, argv[0], DM_UUID_LEN);
171 spin_lock_init(&lc->flush_lock); 188 spin_lock_init(&lc->flush_lock);
172 INIT_LIST_HEAD(&lc->flush_list); 189 INIT_LIST_HEAD(&lc->mark_list);
190 INIT_LIST_HEAD(&lc->clear_list);
173 191
174 str_size = build_constructor_string(ti, argc - 1, argv + 1, &ctr_str); 192 str_size = build_constructor_string(ti, argc - 1, argv + 1, &ctr_str);
175 if (str_size < 0) { 193 if (str_size < 0) {
@@ -181,8 +199,11 @@ static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti,
181 r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_CTR, 199 r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_CTR,
182 ctr_str, str_size, NULL, NULL); 200 ctr_str, str_size, NULL, NULL);
183 201
184 if (r == -ESRCH) { 202 if (r < 0) {
185 DMERR("Userspace log server not found"); 203 if (r == -ESRCH)
204 DMERR("Userspace log server not found");
205 else
206 DMERR("Userspace log server failed to create log");
186 goto out; 207 goto out;
187 } 208 }
188 209
@@ -214,10 +235,9 @@ out:
214 235
215static void userspace_dtr(struct dm_dirty_log *log) 236static void userspace_dtr(struct dm_dirty_log *log)
216{ 237{
217 int r;
218 struct log_c *lc = log->context; 238 struct log_c *lc = log->context;
219 239
220 r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_DTR, 240 (void) dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_DTR,
221 NULL, 0, 241 NULL, 0,
222 NULL, NULL); 242 NULL, NULL);
223 243
@@ -338,6 +358,71 @@ static int userspace_in_sync(struct dm_dirty_log *log, region_t region,
338 return (r) ? 0 : (int)in_sync; 358 return (r) ? 0 : (int)in_sync;
339} 359}
340 360
361static int flush_one_by_one(struct log_c *lc, struct list_head *flush_list)
362{
363 int r = 0;
364 struct flush_entry *fe;
365
366 list_for_each_entry(fe, flush_list, list) {
367 r = userspace_do_request(lc, lc->uuid, fe->type,
368 (char *)&fe->region,
369 sizeof(fe->region),
370 NULL, NULL);
371 if (r)
372 break;
373 }
374
375 return r;
376}
377
378static int flush_by_group(struct log_c *lc, struct list_head *flush_list)
379{
380 int r = 0;
381 int count;
382 uint32_t type = 0;
383 struct flush_entry *fe, *tmp_fe;
384 LIST_HEAD(tmp_list);
385 uint64_t group[MAX_FLUSH_GROUP_COUNT];
386
387 /*
388 * Group process the requests
389 */
390 while (!list_empty(flush_list)) {
391 count = 0;
392
393 list_for_each_entry_safe(fe, tmp_fe, flush_list, list) {
394 group[count] = fe->region;
395 count++;
396
397 list_del(&fe->list);
398 list_add(&fe->list, &tmp_list);
399
400 type = fe->type;
401 if (count >= MAX_FLUSH_GROUP_COUNT)
402 break;
403 }
404
405 r = userspace_do_request(lc, lc->uuid, type,
406 (char *)(group),
407 count * sizeof(uint64_t),
408 NULL, NULL);
409 if (r) {
410 /* Group send failed. Attempt one-by-one. */
411 list_splice_init(&tmp_list, flush_list);
412 r = flush_one_by_one(lc, flush_list);
413 break;
414 }
415 }
416
417 /*
418 * Must collect flush_entrys that were successfully processed
419 * as a group so that they will be free'd by the caller.
420 */
421 list_splice_init(&tmp_list, flush_list);
422
423 return r;
424}
425
341/* 426/*
342 * userspace_flush 427 * userspace_flush
343 * 428 *
@@ -360,31 +445,25 @@ static int userspace_flush(struct dm_dirty_log *log)
360 int r = 0; 445 int r = 0;
361 unsigned long flags; 446 unsigned long flags;
362 struct log_c *lc = log->context; 447 struct log_c *lc = log->context;
363 LIST_HEAD(flush_list); 448 LIST_HEAD(mark_list);
449 LIST_HEAD(clear_list);
364 struct flush_entry *fe, *tmp_fe; 450 struct flush_entry *fe, *tmp_fe;
365 451
366 spin_lock_irqsave(&lc->flush_lock, flags); 452 spin_lock_irqsave(&lc->flush_lock, flags);
367 list_splice_init(&lc->flush_list, &flush_list); 453 list_splice_init(&lc->mark_list, &mark_list);
454 list_splice_init(&lc->clear_list, &clear_list);
368 spin_unlock_irqrestore(&lc->flush_lock, flags); 455 spin_unlock_irqrestore(&lc->flush_lock, flags);
369 456
370 if (list_empty(&flush_list)) 457 if (list_empty(&mark_list) && list_empty(&clear_list))
371 return 0; 458 return 0;
372 459
373 /* 460 r = flush_by_group(lc, &mark_list);
374 * FIXME: Count up requests, group request types, 461 if (r)
375 * allocate memory to stick all requests in and 462 goto fail;
376 * send to server in one go. Failing the allocation,
377 * do it one by one.
378 */
379 463
380 list_for_each_entry(fe, &flush_list, list) { 464 r = flush_by_group(lc, &clear_list);
381 r = userspace_do_request(lc, lc->uuid, fe->type, 465 if (r)
382 (char *)&fe->region, 466 goto fail;
383 sizeof(fe->region),
384 NULL, NULL);
385 if (r)
386 goto fail;
387 }
388 467
389 r = userspace_do_request(lc, lc->uuid, DM_ULOG_FLUSH, 468 r = userspace_do_request(lc, lc->uuid, DM_ULOG_FLUSH,
390 NULL, 0, NULL, NULL); 469 NULL, 0, NULL, NULL);
@@ -395,7 +474,11 @@ fail:
395 * Calling code will receive an error and will know that 474 * Calling code will receive an error and will know that
396 * the log facility has failed. 475 * the log facility has failed.
397 */ 476 */
398 list_for_each_entry_safe(fe, tmp_fe, &flush_list, list) { 477 list_for_each_entry_safe(fe, tmp_fe, &mark_list, list) {
478 list_del(&fe->list);
479 mempool_free(fe, flush_entry_pool);
480 }
481 list_for_each_entry_safe(fe, tmp_fe, &clear_list, list) {
399 list_del(&fe->list); 482 list_del(&fe->list);
400 mempool_free(fe, flush_entry_pool); 483 mempool_free(fe, flush_entry_pool);
401 } 484 }
@@ -425,7 +508,7 @@ static void userspace_mark_region(struct dm_dirty_log *log, region_t region)
425 spin_lock_irqsave(&lc->flush_lock, flags); 508 spin_lock_irqsave(&lc->flush_lock, flags);
426 fe->type = DM_ULOG_MARK_REGION; 509 fe->type = DM_ULOG_MARK_REGION;
427 fe->region = region; 510 fe->region = region;
428 list_add(&fe->list, &lc->flush_list); 511 list_add(&fe->list, &lc->mark_list);
429 spin_unlock_irqrestore(&lc->flush_lock, flags); 512 spin_unlock_irqrestore(&lc->flush_lock, flags);
430 513
431 return; 514 return;
@@ -462,7 +545,7 @@ static void userspace_clear_region(struct dm_dirty_log *log, region_t region)
462 spin_lock_irqsave(&lc->flush_lock, flags); 545 spin_lock_irqsave(&lc->flush_lock, flags);
463 fe->type = DM_ULOG_CLEAR_REGION; 546 fe->type = DM_ULOG_CLEAR_REGION;
464 fe->region = region; 547 fe->region = region;
465 list_add(&fe->list, &lc->flush_list); 548 list_add(&fe->list, &lc->clear_list);
466 spin_unlock_irqrestore(&lc->flush_lock, flags); 549 spin_unlock_irqrestore(&lc->flush_lock, flags);
467 550
468 return; 551 return;
@@ -684,7 +767,7 @@ static int __init userspace_dirty_log_init(void)
684 return r; 767 return r;
685 } 768 }
686 769
687 DMINFO("version 1.0.0 loaded"); 770 DMINFO("version " DM_LOG_USERSPACE_VSN " loaded");
688 return 0; 771 return 0;
689} 772}
690 773
@@ -694,7 +777,7 @@ static void __exit userspace_dirty_log_exit(void)
694 dm_ulog_tfr_exit(); 777 dm_ulog_tfr_exit();
695 mempool_destroy(flush_entry_pool); 778 mempool_destroy(flush_entry_pool);
696 779
697 DMINFO("version 1.0.0 unloaded"); 780 DMINFO("version " DM_LOG_USERSPACE_VSN " unloaded");
698 return; 781 return;
699} 782}
700 783
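The hunks above split the old flush_list into a mark_list and a clear_list so that mark and clear requests can each be sent to the userspace log server in batches rather than one message per region. The following condensed sketch shows only the grouping idea; it borrows userspace_do_request(), struct flush_entry and MAX_FLUSH_GROUP_COUNT from the code above, assumes lc and flush_list from the surrounding function, and omits the temporary list that the real flush_by_group() keeps so the caller can free the entries:

	int r;
	uint32_t type = 0;
	unsigned count = 0;
	uint64_t group[MAX_FLUSH_GROUP_COUNT];
	struct flush_entry *fe;

	list_for_each_entry(fe, flush_list, list) {
		group[count++] = fe->region;	/* pack region numbers */
		type = fe->type;		/* each list holds a single request type */
		if (count >= MAX_FLUSH_GROUP_COUNT)
			break;			/* send at most 32 regions per request */
	}
	r = userspace_do_request(lc, lc->uuid, type, (char *)group,
				 count * sizeof(uint64_t), NULL, NULL);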
diff --git a/drivers/md/dm-log-userspace-transfer.c b/drivers/md/dm-log-userspace-transfer.c
index 075cbcf8a9f5..1f23e048f077 100644
--- a/drivers/md/dm-log-userspace-transfer.c
+++ b/drivers/md/dm-log-userspace-transfer.c
@@ -134,7 +134,7 @@ static void cn_ulog_callback(struct cn_msg *msg, struct netlink_skb_parms *nsp)
134{ 134{
135 struct dm_ulog_request *tfr = (struct dm_ulog_request *)(msg + 1); 135 struct dm_ulog_request *tfr = (struct dm_ulog_request *)(msg + 1);
136 136
137 if (!cap_raised(nsp->eff_cap, CAP_SYS_ADMIN)) 137 if (!cap_raised(current_cap(), CAP_SYS_ADMIN))
138 return; 138 return;
139 139
140 spin_lock(&receiving_list_lock); 140 spin_lock(&receiving_list_lock);
@@ -198,6 +198,7 @@ resend:
198 198
199 memset(tfr, 0, DM_ULOG_PREALLOCED_SIZE - sizeof(struct cn_msg)); 199 memset(tfr, 0, DM_ULOG_PREALLOCED_SIZE - sizeof(struct cn_msg));
200 memcpy(tfr->uuid, uuid, DM_UUID_LEN); 200 memcpy(tfr->uuid, uuid, DM_UUID_LEN);
201 tfr->version = DM_ULOG_REQUEST_VERSION;
201 tfr->luid = luid; 202 tfr->luid = luid;
202 tfr->seq = dm_ulog_seq++; 203 tfr->seq = dm_ulog_seq++;
203 204
diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c
index 5a08be0222db..948e3f4925bf 100644
--- a/drivers/md/dm-log.c
+++ b/drivers/md/dm-log.c
@@ -251,20 +251,20 @@ struct log_c {
251 */ 251 */
252static inline int log_test_bit(uint32_t *bs, unsigned bit) 252static inline int log_test_bit(uint32_t *bs, unsigned bit)
253{ 253{
254 return ext2_test_bit(bit, (unsigned long *) bs) ? 1 : 0; 254 return test_bit_le(bit, (unsigned long *) bs) ? 1 : 0;
255} 255}
256 256
257static inline void log_set_bit(struct log_c *l, 257static inline void log_set_bit(struct log_c *l,
258 uint32_t *bs, unsigned bit) 258 uint32_t *bs, unsigned bit)
259{ 259{
260 ext2_set_bit(bit, (unsigned long *) bs); 260 __test_and_set_bit_le(bit, (unsigned long *) bs);
261 l->touched_cleaned = 1; 261 l->touched_cleaned = 1;
262} 262}
263 263
264static inline void log_clear_bit(struct log_c *l, 264static inline void log_clear_bit(struct log_c *l,
265 uint32_t *bs, unsigned bit) 265 uint32_t *bs, unsigned bit)
266{ 266{
267 ext2_clear_bit(bit, (unsigned long *) bs); 267 __test_and_clear_bit_le(bit, (unsigned long *) bs);
268 l->touched_dirtied = 1; 268 l->touched_dirtied = 1;
269} 269}
270 270
@@ -300,7 +300,7 @@ static int flush_header(struct log_c *lc)
300 .count = 0, 300 .count = 0,
301 }; 301 };
302 302
303 lc->io_req.bi_rw = WRITE_BARRIER; 303 lc->io_req.bi_rw = WRITE_FLUSH;
304 304
305 return dm_io(&lc->io_req, 1, &null_location, NULL); 305 return dm_io(&lc->io_req, 1, &null_location, NULL);
306} 306}
@@ -449,13 +449,12 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti,
449 449
450 lc->io_req.mem.type = DM_IO_VMA; 450 lc->io_req.mem.type = DM_IO_VMA;
451 lc->io_req.notify.fn = NULL; 451 lc->io_req.notify.fn = NULL;
452 lc->io_req.client = dm_io_client_create(dm_div_up(buf_size, 452 lc->io_req.client = dm_io_client_create();
453 PAGE_SIZE));
454 if (IS_ERR(lc->io_req.client)) { 453 if (IS_ERR(lc->io_req.client)) {
455 r = PTR_ERR(lc->io_req.client); 454 r = PTR_ERR(lc->io_req.client);
456 DMWARN("couldn't allocate disk io client"); 455 DMWARN("couldn't allocate disk io client");
457 kfree(lc); 456 kfree(lc);
458 return -ENOMEM; 457 return r;
459 } 458 }
460 459
461 lc->disk_header = vmalloc(buf_size); 460 lc->disk_header = vmalloc(buf_size);
@@ -543,7 +542,7 @@ static int disk_ctr(struct dm_dirty_log *log, struct dm_target *ti,
543 return -EINVAL; 542 return -EINVAL;
544 } 543 }
545 544
546 r = dm_get_device(ti, argv[0], FMODE_READ | FMODE_WRITE, &dev); 545 r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &dev);
547 if (r) 546 if (r)
548 return r; 547 return r;
549 548
@@ -740,7 +739,7 @@ static int core_get_resync_work(struct dm_dirty_log *log, region_t *region)
740 return 0; 739 return 0;
741 740
742 do { 741 do {
743 *region = ext2_find_next_zero_bit( 742 *region = find_next_zero_bit_le(
744 (unsigned long *) lc->sync_bits, 743 (unsigned long *) lc->sync_bits,
745 lc->region_count, 744 lc->region_count,
746 lc->sync_search); 745 lc->sync_search);
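The dm-log.c hunks above move from the old ext2_* bitmap helpers to the generic little-endian bit operations and drop the page-count argument that dm_io_client_create() no longer takes. The *_le helpers address bits in little-endian byte order on every architecture, so the on-disk layout of the clean/sync bitmaps is unchanged. A minimal, self-contained illustration of the helpers themselves (not driver code):

	unsigned long bits[2] = { 0 };			/* small scratch bitmap */

	__test_and_set_bit_le(5, bits);			/* set bit 5 in LE bit order */
	if (test_bit_le(5, bits))			/* query it */
		__test_and_clear_bit_le(5, bits);	/* and clear it again */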
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 487ecda90ad4..aa4e570c2cb5 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -23,6 +23,8 @@
23 23
24#define DM_MSG_PREFIX "multipath" 24#define DM_MSG_PREFIX "multipath"
25#define MESG_STR(x) x, sizeof(x) 25#define MESG_STR(x) x, sizeof(x)
26#define DM_PG_INIT_DELAY_MSECS 2000
27#define DM_PG_INIT_DELAY_DEFAULT ((unsigned) -1)
26 28
27/* Path properties */ 29/* Path properties */
28struct pgpath { 30struct pgpath {
@@ -33,8 +35,7 @@ struct pgpath {
33 unsigned fail_count; /* Cumulative failure count */ 35 unsigned fail_count; /* Cumulative failure count */
34 36
35 struct dm_path path; 37 struct dm_path path;
36 struct work_struct deactivate_path; 38 struct delayed_work activate_path;
37 struct work_struct activate_path;
38}; 39};
39 40
40#define path_to_pgpath(__pgp) container_of((__pgp), struct pgpath, path) 41#define path_to_pgpath(__pgp) container_of((__pgp), struct pgpath, path)
@@ -65,11 +66,15 @@ struct multipath {
65 66
66 const char *hw_handler_name; 67 const char *hw_handler_name;
67 char *hw_handler_params; 68 char *hw_handler_params;
69
68 unsigned nr_priority_groups; 70 unsigned nr_priority_groups;
69 struct list_head priority_groups; 71 struct list_head priority_groups;
72
73 wait_queue_head_t pg_init_wait; /* Wait for pg_init completion */
74
70 unsigned pg_init_required; /* pg_init needs calling? */ 75 unsigned pg_init_required; /* pg_init needs calling? */
71 unsigned pg_init_in_progress; /* Only one pg_init allowed at once */ 76 unsigned pg_init_in_progress; /* Only one pg_init allowed at once */
72 wait_queue_head_t pg_init_wait; /* Wait for pg_init completion */ 77 unsigned pg_init_delay_retry; /* Delay pg_init retry? */
73 78
74 unsigned nr_valid_paths; /* Total number of usable paths */ 79 unsigned nr_valid_paths; /* Total number of usable paths */
75 struct pgpath *current_pgpath; 80 struct pgpath *current_pgpath;
@@ -82,6 +87,7 @@ struct multipath {
82 unsigned saved_queue_if_no_path;/* Saved state during suspension */ 87 unsigned saved_queue_if_no_path;/* Saved state during suspension */
83 unsigned pg_init_retries; /* Number of times to retry pg_init */ 88 unsigned pg_init_retries; /* Number of times to retry pg_init */
84 unsigned pg_init_count; /* Number of times pg_init called */ 89 unsigned pg_init_count; /* Number of times pg_init called */
90 unsigned pg_init_delay_msecs; /* Number of msecs before pg_init retry */
85 91
86 struct work_struct process_queued_ios; 92 struct work_struct process_queued_ios;
87 struct list_head queued_ios; 93 struct list_head queued_ios;
@@ -116,7 +122,6 @@ static struct workqueue_struct *kmultipathd, *kmpath_handlerd;
116static void process_queued_ios(struct work_struct *work); 122static void process_queued_ios(struct work_struct *work);
117static void trigger_event(struct work_struct *work); 123static void trigger_event(struct work_struct *work);
118static void activate_path(struct work_struct *work); 124static void activate_path(struct work_struct *work);
119static void deactivate_path(struct work_struct *work);
120 125
121 126
122/*----------------------------------------------- 127/*-----------------------------------------------
@@ -129,8 +134,7 @@ static struct pgpath *alloc_pgpath(void)
129 134
130 if (pgpath) { 135 if (pgpath) {
131 pgpath->is_active = 1; 136 pgpath->is_active = 1;
132 INIT_WORK(&pgpath->deactivate_path, deactivate_path); 137 INIT_DELAYED_WORK(&pgpath->activate_path, activate_path);
133 INIT_WORK(&pgpath->activate_path, activate_path);
134 } 138 }
135 139
136 return pgpath; 140 return pgpath;
@@ -141,14 +145,6 @@ static void free_pgpath(struct pgpath *pgpath)
141 kfree(pgpath); 145 kfree(pgpath);
142} 146}
143 147
144static void deactivate_path(struct work_struct *work)
145{
146 struct pgpath *pgpath =
147 container_of(work, struct pgpath, deactivate_path);
148
149 blk_abort_queue(pgpath->path.dev->bdev->bd_disk->queue);
150}
151
152static struct priority_group *alloc_priority_group(void) 148static struct priority_group *alloc_priority_group(void)
153{ 149{
154 struct priority_group *pg; 150 struct priority_group *pg;
@@ -199,6 +195,7 @@ static struct multipath *alloc_multipath(struct dm_target *ti)
199 INIT_LIST_HEAD(&m->queued_ios); 195 INIT_LIST_HEAD(&m->queued_ios);
200 spin_lock_init(&m->lock); 196 spin_lock_init(&m->lock);
201 m->queue_io = 1; 197 m->queue_io = 1;
198 m->pg_init_delay_msecs = DM_PG_INIT_DELAY_DEFAULT;
202 INIT_WORK(&m->process_queued_ios, process_queued_ios); 199 INIT_WORK(&m->process_queued_ios, process_queued_ios);
203 INIT_WORK(&m->trigger_event, trigger_event); 200 INIT_WORK(&m->trigger_event, trigger_event);
204 init_waitqueue_head(&m->pg_init_wait); 201 init_waitqueue_head(&m->pg_init_wait);
@@ -238,14 +235,19 @@ static void free_multipath(struct multipath *m)
238static void __pg_init_all_paths(struct multipath *m) 235static void __pg_init_all_paths(struct multipath *m)
239{ 236{
240 struct pgpath *pgpath; 237 struct pgpath *pgpath;
238 unsigned long pg_init_delay = 0;
241 239
242 m->pg_init_count++; 240 m->pg_init_count++;
243 m->pg_init_required = 0; 241 m->pg_init_required = 0;
242 if (m->pg_init_delay_retry)
243 pg_init_delay = msecs_to_jiffies(m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT ?
244 m->pg_init_delay_msecs : DM_PG_INIT_DELAY_MSECS);
244 list_for_each_entry(pgpath, &m->current_pg->pgpaths, list) { 245 list_for_each_entry(pgpath, &m->current_pg->pgpaths, list) {
245 /* Skip failed paths */ 246 /* Skip failed paths */
246 if (!pgpath->is_active) 247 if (!pgpath->is_active)
247 continue; 248 continue;
248 if (queue_work(kmpath_handlerd, &pgpath->activate_path)) 249 if (queue_delayed_work(kmpath_handlerd, &pgpath->activate_path,
250 pg_init_delay))
249 m->pg_init_in_progress++; 251 m->pg_init_in_progress++;
250 } 252 }
251} 253}
@@ -793,8 +795,9 @@ static int parse_features(struct arg_set *as, struct multipath *m)
793 const char *param_name; 795 const char *param_name;
794 796
795 static struct param _params[] = { 797 static struct param _params[] = {
796 {0, 3, "invalid number of feature args"}, 798 {0, 5, "invalid number of feature args"},
797 {1, 50, "pg_init_retries must be between 1 and 50"}, 799 {1, 50, "pg_init_retries must be between 1 and 50"},
800 {0, 60000, "pg_init_delay_msecs must be between 0 and 60000"},
798 }; 801 };
799 802
800 r = read_param(_params, shift(as), &argc, &ti->error); 803 r = read_param(_params, shift(as), &argc, &ti->error);
@@ -821,6 +824,14 @@ static int parse_features(struct arg_set *as, struct multipath *m)
821 continue; 824 continue;
822 } 825 }
823 826
827 if (!strnicmp(param_name, MESG_STR("pg_init_delay_msecs")) &&
828 (argc >= 1)) {
829 r = read_param(_params + 2, shift(as),
830 &m->pg_init_delay_msecs, &ti->error);
831 argc--;
832 continue;
833 }
834
824 ti->error = "Unrecognised multipath feature request"; 835 ti->error = "Unrecognised multipath feature request";
825 r = -EINVAL; 836 r = -EINVAL;
826 } while (argc && !r); 837 } while (argc && !r);
@@ -833,8 +844,8 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc,
833{ 844{
834 /* target parameters */ 845 /* target parameters */
835 static struct param _params[] = { 846 static struct param _params[] = {
836 {1, 1024, "invalid number of priority groups"}, 847 {0, 1024, "invalid number of priority groups"},
837 {1, 1024, "invalid initial priority group number"}, 848 {0, 1024, "invalid initial priority group number"},
838 }; 849 };
839 850
840 int r; 851 int r;
@@ -868,6 +879,13 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc,
868 if (r) 879 if (r)
869 goto bad; 880 goto bad;
870 881
882 if ((!m->nr_priority_groups && next_pg_num) ||
883 (m->nr_priority_groups && !next_pg_num)) {
884 ti->error = "invalid initial priority group";
885 r = -EINVAL;
886 goto bad;
887 }
888
871 /* parse the priority groups */ 889 /* parse the priority groups */
872 while (as.argc) { 890 while (as.argc) {
873 struct priority_group *pg; 891 struct priority_group *pg;
@@ -931,7 +949,7 @@ static void flush_multipath_work(struct multipath *m)
931 flush_workqueue(kmpath_handlerd); 949 flush_workqueue(kmpath_handlerd);
932 multipath_wait_for_pg_init_completion(m); 950 multipath_wait_for_pg_init_completion(m);
933 flush_workqueue(kmultipathd); 951 flush_workqueue(kmultipathd);
934 flush_scheduled_work(); 952 flush_work_sync(&m->trigger_event);
935} 953}
936 954
937static void multipath_dtr(struct dm_target *ti) 955static void multipath_dtr(struct dm_target *ti)
@@ -995,7 +1013,6 @@ static int fail_path(struct pgpath *pgpath)
995 pgpath->path.dev->name, m->nr_valid_paths); 1013 pgpath->path.dev->name, m->nr_valid_paths);
996 1014
997 schedule_work(&m->trigger_event); 1015 schedule_work(&m->trigger_event);
998 queue_work(kmultipathd, &pgpath->deactivate_path);
999 1016
1000out: 1017out:
1001 spin_unlock_irqrestore(&m->lock, flags); 1018 spin_unlock_irqrestore(&m->lock, flags);
@@ -1034,7 +1051,7 @@ static int reinstate_path(struct pgpath *pgpath)
1034 m->current_pgpath = NULL; 1051 m->current_pgpath = NULL;
1035 queue_work(kmultipathd, &m->process_queued_ios); 1052 queue_work(kmultipathd, &m->process_queued_ios);
1036 } else if (m->hw_handler_name && (m->current_pg == pgpath->pg)) { 1053 } else if (m->hw_handler_name && (m->current_pg == pgpath->pg)) {
1037 if (queue_work(kmpath_handlerd, &pgpath->activate_path)) 1054 if (queue_work(kmpath_handlerd, &pgpath->activate_path.work))
1038 m->pg_init_in_progress++; 1055 m->pg_init_in_progress++;
1039 } 1056 }
1040 1057
@@ -1055,7 +1072,7 @@ out:
1055static int action_dev(struct multipath *m, struct dm_dev *dev, 1072static int action_dev(struct multipath *m, struct dm_dev *dev,
1056 action_fn action) 1073 action_fn action)
1057{ 1074{
1058 int r = 0; 1075 int r = -EINVAL;
1059 struct pgpath *pgpath; 1076 struct pgpath *pgpath;
1060 struct priority_group *pg; 1077 struct priority_group *pg;
1061 1078
@@ -1169,6 +1186,7 @@ static void pg_init_done(void *data, int errors)
1169 struct priority_group *pg = pgpath->pg; 1186 struct priority_group *pg = pgpath->pg;
1170 struct multipath *m = pg->m; 1187 struct multipath *m = pg->m;
1171 unsigned long flags; 1188 unsigned long flags;
1189 unsigned delay_retry = 0;
1172 1190
1173 /* device or driver problems */ 1191 /* device or driver problems */
1174 switch (errors) { 1192 switch (errors) {
@@ -1193,8 +1211,9 @@ static void pg_init_done(void *data, int errors)
1193 */ 1211 */
1194 bypass_pg(m, pg, 1); 1212 bypass_pg(m, pg, 1);
1195 break; 1213 break;
1196 /* TODO: For SCSI_DH_RETRY we should wait a couple seconds */
1197 case SCSI_DH_RETRY: 1214 case SCSI_DH_RETRY:
1215 /* Wait before retrying. */
1216 delay_retry = 1;
1198 case SCSI_DH_IMM_RETRY: 1217 case SCSI_DH_IMM_RETRY:
1199 case SCSI_DH_RES_TEMP_UNAVAIL: 1218 case SCSI_DH_RES_TEMP_UNAVAIL:
1200 if (pg_init_limit_reached(m, pgpath)) 1219 if (pg_init_limit_reached(m, pgpath))
@@ -1227,6 +1246,7 @@ static void pg_init_done(void *data, int errors)
1227 if (!m->pg_init_required) 1246 if (!m->pg_init_required)
1228 m->queue_io = 0; 1247 m->queue_io = 0;
1229 1248
1249 m->pg_init_delay_retry = delay_retry;
1230 queue_work(kmultipathd, &m->process_queued_ios); 1250 queue_work(kmultipathd, &m->process_queued_ios);
1231 1251
1232 /* 1252 /*
@@ -1241,7 +1261,7 @@ out:
1241static void activate_path(struct work_struct *work) 1261static void activate_path(struct work_struct *work)
1242{ 1262{
1243 struct pgpath *pgpath = 1263 struct pgpath *pgpath =
1244 container_of(work, struct pgpath, activate_path); 1264 container_of(work, struct pgpath, activate_path.work);
1245 1265
1246 scsi_dh_activate(bdev_get_queue(pgpath->path.dev->bdev), 1266 scsi_dh_activate(bdev_get_queue(pgpath->path.dev->bdev),
1247 pg_init_done, pgpath); 1267 pg_init_done, pgpath);
@@ -1270,24 +1290,22 @@ static int do_end_io(struct multipath *m, struct request *clone,
1270 if (!error && !clone->errors) 1290 if (!error && !clone->errors)
1271 return 0; /* I/O complete */ 1291 return 0; /* I/O complete */
1272 1292
1273 if (error == -EOPNOTSUPP) 1293 if (error == -EOPNOTSUPP || error == -EREMOTEIO || error == -EILSEQ)
1274 return error;
1275
1276 if (clone->cmd_flags & REQ_DISCARD)
1277 /*
1278 * Pass all discard request failures up.
1279 * FIXME: only fail_path if the discard failed due to a
1280 * transport problem. This requires precise understanding
1281 * of the underlying failure (e.g. the SCSI sense).
1282 */
1283 return error; 1294 return error;
1284 1295
1285 if (mpio->pgpath) 1296 if (mpio->pgpath)
1286 fail_path(mpio->pgpath); 1297 fail_path(mpio->pgpath);
1287 1298
1288 spin_lock_irqsave(&m->lock, flags); 1299 spin_lock_irqsave(&m->lock, flags);
1289 if (!m->nr_valid_paths && !m->queue_if_no_path && !__must_push_back(m)) 1300 if (!m->nr_valid_paths) {
1290 r = -EIO; 1301 if (!m->queue_if_no_path) {
1302 if (!__must_push_back(m))
1303 r = -EIO;
1304 } else {
1305 if (error == -EBADE)
1306 r = error;
1307 }
1308 }
1291 spin_unlock_irqrestore(&m->lock, flags); 1309 spin_unlock_irqrestore(&m->lock, flags);
1292 1310
1293 return r; 1311 return r;
@@ -1382,11 +1400,14 @@ static int multipath_status(struct dm_target *ti, status_type_t type,
1382 DMEMIT("2 %u %u ", m->queue_size, m->pg_init_count); 1400 DMEMIT("2 %u %u ", m->queue_size, m->pg_init_count);
1383 else { 1401 else {
1384 DMEMIT("%u ", m->queue_if_no_path + 1402 DMEMIT("%u ", m->queue_if_no_path +
1385 (m->pg_init_retries > 0) * 2); 1403 (m->pg_init_retries > 0) * 2 +
1404 (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT) * 2);
1386 if (m->queue_if_no_path) 1405 if (m->queue_if_no_path)
1387 DMEMIT("queue_if_no_path "); 1406 DMEMIT("queue_if_no_path ");
1388 if (m->pg_init_retries) 1407 if (m->pg_init_retries)
1389 DMEMIT("pg_init_retries %u ", m->pg_init_retries); 1408 DMEMIT("pg_init_retries %u ", m->pg_init_retries);
1409 if (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT)
1410 DMEMIT("pg_init_delay_msecs %u ", m->pg_init_delay_msecs);
1390 } 1411 }
1391 1412
1392 if (!m->hw_handler_name || type == STATUSTYPE_INFO) 1413 if (!m->hw_handler_name || type == STATUSTYPE_INFO)
@@ -1401,7 +1422,7 @@ static int multipath_status(struct dm_target *ti, status_type_t type,
1401 else if (m->current_pg) 1422 else if (m->current_pg)
1402 pg_num = m->current_pg->pg_num; 1423 pg_num = m->current_pg->pg_num;
1403 else 1424 else
1404 pg_num = 1; 1425 pg_num = (m->nr_priority_groups ? 1 : 0);
1405 1426
1406 DMEMIT("%u ", pg_num); 1427 DMEMIT("%u ", pg_num);
1407 1428
@@ -1655,7 +1676,7 @@ out:
1655 *---------------------------------------------------------------*/ 1676 *---------------------------------------------------------------*/
1656static struct target_type multipath_target = { 1677static struct target_type multipath_target = {
1657 .name = "multipath", 1678 .name = "multipath",
1658 .version = {1, 1, 1}, 1679 .version = {1, 3, 0},
1659 .module = THIS_MODULE, 1680 .module = THIS_MODULE,
1660 .ctr = multipath_ctr, 1681 .ctr = multipath_ctr,
1661 .dtr = multipath_dtr, 1682 .dtr = multipath_dtr,
@@ -1687,7 +1708,7 @@ static int __init dm_multipath_init(void)
1687 return -EINVAL; 1708 return -EINVAL;
1688 } 1709 }
1689 1710
1690 kmultipathd = create_workqueue("kmpathd"); 1711 kmultipathd = alloc_workqueue("kmpathd", WQ_MEM_RECLAIM, 0);
1691 if (!kmultipathd) { 1712 if (!kmultipathd) {
1692 DMERR("failed to create workqueue kmpathd"); 1713 DMERR("failed to create workqueue kmpathd");
1693 dm_unregister_target(&multipath_target); 1714 dm_unregister_target(&multipath_target);
@@ -1701,7 +1722,8 @@ static int __init dm_multipath_init(void)
1701 * old workqueue would also create a bottleneck in the 1722 * old workqueue would also create a bottleneck in the
1702 * path of the storage hardware device activation. 1723 * path of the storage hardware device activation.
1703 */ 1724 */
1704 kmpath_handlerd = create_singlethread_workqueue("kmpath_handlerd"); 1725 kmpath_handlerd = alloc_ordered_workqueue("kmpath_handlerd",
1726 WQ_MEM_RECLAIM);
1705 if (!kmpath_handlerd) { 1727 if (!kmpath_handlerd) {
1706 DMERR("failed to create workqueue kmpath_handlerd"); 1728 DMERR("failed to create workqueue kmpath_handlerd");
1707 destroy_workqueue(kmultipathd); 1729 destroy_workqueue(kmultipathd);
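The multipath hunks above convert activate_path into a delayed_work and add the pg_init_delay_msecs feature (0 to 60000 ms, with DM_PG_INIT_DELAY_MSECS used when no value was given), so a path-group initialization that failed with SCSI_DH_RETRY is retried after a pause instead of immediately. A rough sketch of that retry path follows; queue_pg_init() is a hypothetical helper name, but the fields and constants are the ones shown in the hunks above:

	/* hypothetical helper mirroring the logic of __pg_init_all_paths() */
	static void queue_pg_init(struct multipath *m, struct pgpath *pgpath)
	{
		unsigned long delay = 0;

		if (m->pg_init_delay_retry)	/* set by pg_init_done() on SCSI_DH_RETRY */
			delay = msecs_to_jiffies(m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT ?
						 m->pg_init_delay_msecs :
						 DM_PG_INIT_DELAY_MSECS);

		if (queue_delayed_work(kmpath_handlerd, &pgpath->activate_path, delay))
			m->pg_init_in_progress++;
	}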
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
new file mode 100644
index 000000000000..e5d8904fc8f6
--- /dev/null
+++ b/drivers/md/dm-raid.c
@@ -0,0 +1,689 @@
1/*
2 * Copyright (C) 2010-2011 Neil Brown
3 * Copyright (C) 2010-2011 Red Hat, Inc. All rights reserved.
4 *
5 * This file is released under the GPL.
6 */
7
8#include <linux/slab.h>
9
10#include "md.h"
11#include "raid5.h"
12#include "dm.h"
13#include "bitmap.h"
14
15#define DM_MSG_PREFIX "raid"
16
17/*
18 * If the MD doesn't support MD_SYNC_STATE_FORCED yet, then
19 * make it so the flag doesn't set anything.
20 */
21#ifndef MD_SYNC_STATE_FORCED
22#define MD_SYNC_STATE_FORCED 0
23#endif
24
25struct raid_dev {
26 /*
27 * Two DM devices, one to hold metadata and one to hold the
28 * actual data/parity. The reason for this is to not confuse
29 * ti->len and give more flexibility in altering size and
30 * characteristics.
31 *
32 * While it is possible for this device to be associated
33 * with a different physical device than the data_dev, it
34 * is intended for it to be the same.
35 * |--------- Physical Device ---------|
36 * |- meta_dev -|------ data_dev ------|
37 */
38 struct dm_dev *meta_dev;
39 struct dm_dev *data_dev;
40 struct mdk_rdev_s rdev;
41};
42
43/*
44 * Flags for rs->print_flags field.
45 */
46#define DMPF_DAEMON_SLEEP 0x1
47#define DMPF_MAX_WRITE_BEHIND 0x2
48#define DMPF_SYNC 0x4
49#define DMPF_NOSYNC 0x8
50#define DMPF_STRIPE_CACHE 0x10
51#define DMPF_MIN_RECOVERY_RATE 0x20
52#define DMPF_MAX_RECOVERY_RATE 0x40
53
54struct raid_set {
55 struct dm_target *ti;
56
57 uint64_t print_flags;
58
59 struct mddev_s md;
60 struct raid_type *raid_type;
61 struct dm_target_callbacks callbacks;
62
63 struct raid_dev dev[0];
64};
65
66/* Supported raid types and properties. */
67static struct raid_type {
68 const char *name; /* RAID algorithm. */
69 const char *descr; /* Descriptor text for logging. */
70 const unsigned parity_devs; /* # of parity devices. */
71 const unsigned minimal_devs; /* minimal # of devices in set. */
72 const unsigned level; /* RAID level. */
73 const unsigned algorithm; /* RAID algorithm. */
74} raid_types[] = {
75 {"raid4", "RAID4 (dedicated parity disk)", 1, 2, 5, ALGORITHM_PARITY_0},
76 {"raid5_la", "RAID5 (left asymmetric)", 1, 2, 5, ALGORITHM_LEFT_ASYMMETRIC},
77 {"raid5_ra", "RAID5 (right asymmetric)", 1, 2, 5, ALGORITHM_RIGHT_ASYMMETRIC},
78 {"raid5_ls", "RAID5 (left symmetric)", 1, 2, 5, ALGORITHM_LEFT_SYMMETRIC},
79 {"raid5_rs", "RAID5 (right symmetric)", 1, 2, 5, ALGORITHM_RIGHT_SYMMETRIC},
80 {"raid6_zr", "RAID6 (zero restart)", 2, 4, 6, ALGORITHM_ROTATING_ZERO_RESTART},
81 {"raid6_nr", "RAID6 (N restart)", 2, 4, 6, ALGORITHM_ROTATING_N_RESTART},
82 {"raid6_nc", "RAID6 (N continue)", 2, 4, 6, ALGORITHM_ROTATING_N_CONTINUE}
83};
84
85static struct raid_type *get_raid_type(char *name)
86{
87 int i;
88
89 for (i = 0; i < ARRAY_SIZE(raid_types); i++)
90 if (!strcmp(raid_types[i].name, name))
91 return &raid_types[i];
92
93 return NULL;
94}
95
96static struct raid_set *context_alloc(struct dm_target *ti, struct raid_type *raid_type, unsigned raid_devs)
97{
98 unsigned i;
99 struct raid_set *rs;
100 sector_t sectors_per_dev;
101
102 if (raid_devs <= raid_type->parity_devs) {
103 ti->error = "Insufficient number of devices";
104 return ERR_PTR(-EINVAL);
105 }
106
107 sectors_per_dev = ti->len;
108 if (sector_div(sectors_per_dev, (raid_devs - raid_type->parity_devs))) {
109 ti->error = "Target length not divisible by number of data devices";
110 return ERR_PTR(-EINVAL);
111 }
112
113 rs = kzalloc(sizeof(*rs) + raid_devs * sizeof(rs->dev[0]), GFP_KERNEL);
114 if (!rs) {
115 ti->error = "Cannot allocate raid context";
116 return ERR_PTR(-ENOMEM);
117 }
118
119 mddev_init(&rs->md);
120
121 rs->ti = ti;
122 rs->raid_type = raid_type;
123 rs->md.raid_disks = raid_devs;
124 rs->md.level = raid_type->level;
125 rs->md.new_level = rs->md.level;
126 rs->md.dev_sectors = sectors_per_dev;
127 rs->md.layout = raid_type->algorithm;
128 rs->md.new_layout = rs->md.layout;
129 rs->md.delta_disks = 0;
130 rs->md.recovery_cp = 0;
131
132 for (i = 0; i < raid_devs; i++)
133 md_rdev_init(&rs->dev[i].rdev);
134
135 /*
136 * Remaining items to be initialized by further RAID params:
137 * rs->md.persistent
138 * rs->md.external
139 * rs->md.chunk_sectors
140 * rs->md.new_chunk_sectors
141 */
142
143 return rs;
144}
145
146static void context_free(struct raid_set *rs)
147{
148 int i;
149
150 for (i = 0; i < rs->md.raid_disks; i++)
151 if (rs->dev[i].data_dev)
152 dm_put_device(rs->ti, rs->dev[i].data_dev);
153
154 kfree(rs);
155}
156
157/*
158 * For every device we have two words
159 * <meta_dev>: meta device name or '-' if missing
160 * <data_dev>: data device name or '-' if missing
161 *
162 * This code parses those words.
163 */
164static int dev_parms(struct raid_set *rs, char **argv)
165{
166 int i;
167 int rebuild = 0;
168 int metadata_available = 0;
169 int ret = 0;
170
171 for (i = 0; i < rs->md.raid_disks; i++, argv += 2) {
172 rs->dev[i].rdev.raid_disk = i;
173
174 rs->dev[i].meta_dev = NULL;
175 rs->dev[i].data_dev = NULL;
176
177 /*
178 * There are no offsets, since there is a separate device
179 * for data and metadata.
180 */
181 rs->dev[i].rdev.data_offset = 0;
182 rs->dev[i].rdev.mddev = &rs->md;
183
184 if (strcmp(argv[0], "-")) {
185 rs->ti->error = "Metadata devices not supported";
186 return -EINVAL;
187 }
188
189 if (!strcmp(argv[1], "-")) {
190 if (!test_bit(In_sync, &rs->dev[i].rdev.flags) &&
191 (!rs->dev[i].rdev.recovery_offset)) {
192 rs->ti->error = "Drive designated for rebuild not specified";
193 return -EINVAL;
194 }
195
196 continue;
197 }
198
199 ret = dm_get_device(rs->ti, argv[1],
200 dm_table_get_mode(rs->ti->table),
201 &rs->dev[i].data_dev);
202 if (ret) {
203 rs->ti->error = "RAID device lookup failure";
204 return ret;
205 }
206
207 rs->dev[i].rdev.bdev = rs->dev[i].data_dev->bdev;
208 list_add(&rs->dev[i].rdev.same_set, &rs->md.disks);
209 if (!test_bit(In_sync, &rs->dev[i].rdev.flags))
210 rebuild++;
211 }
212
213 if (metadata_available) {
214 rs->md.external = 0;
215 rs->md.persistent = 1;
216 rs->md.major_version = 2;
217 } else if (rebuild && !rs->md.recovery_cp) {
218 /*
219 * Without metadata, we will not be able to tell if the array
220 * is in-sync or not - we must assume it is not. Therefore,
221 * it is impossible to rebuild a drive.
222 *
223 * Even if there is metadata, the on-disk information may
224 * indicate that the array is not in-sync and it will then
225 * fail at that time.
226 *
227 * User could specify 'nosync' option if desperate.
228 */
229 DMERR("Unable to rebuild drive while array is not in-sync");
230		rs->ti->error = "Unable to rebuild drive while array is not in-sync";
231 return -EINVAL;
232 }
233
234 return 0;
235}
236
237/*
238 * Possible arguments are...
239 * RAID456:
240 * <chunk_size> [optional_args]
241 *
242 * Optional args:
243 * [[no]sync] Force or prevent recovery of the entire array
244 * [rebuild <idx>] Rebuild the drive indicated by the index
245 * [daemon_sleep <ms>] Time between bitmap daemon work to clear bits
246 * [min_recovery_rate <kB/sec/disk>] Throttle RAID initialization
247 * [max_recovery_rate <kB/sec/disk>] Throttle RAID initialization
248 * [max_write_behind <sectors>] See '-write-behind=' (man mdadm)
249 * [stripe_cache <sectors>] Stripe cache size for higher RAIDs
250 */
251static int parse_raid_params(struct raid_set *rs, char **argv,
252 unsigned num_raid_params)
253{
254 unsigned i, rebuild_cnt = 0;
255 unsigned long value;
256 char *key;
257
258 /*
259 * First, parse the in-order required arguments
260 */
261 if ((strict_strtoul(argv[0], 10, &value) < 0) ||
262 !is_power_of_2(value) || (value < 8)) {
263 rs->ti->error = "Bad chunk size";
264 return -EINVAL;
265 }
266
267 rs->md.new_chunk_sectors = rs->md.chunk_sectors = value;
268 argv++;
269 num_raid_params--;
270
271 /*
272 * Second, parse the unordered optional arguments
273 */
274 for (i = 0; i < rs->md.raid_disks; i++)
275 set_bit(In_sync, &rs->dev[i].rdev.flags);
276
277 for (i = 0; i < num_raid_params; i++) {
278 if (!strcmp(argv[i], "nosync")) {
279 rs->md.recovery_cp = MaxSector;
280 rs->print_flags |= DMPF_NOSYNC;
281 rs->md.flags |= MD_SYNC_STATE_FORCED;
282 continue;
283 }
284 if (!strcmp(argv[i], "sync")) {
285 rs->md.recovery_cp = 0;
286 rs->print_flags |= DMPF_SYNC;
287 rs->md.flags |= MD_SYNC_STATE_FORCED;
288 continue;
289 }
290
291 /* The rest of the optional arguments come in key/value pairs */
292 if ((i + 1) >= num_raid_params) {
293 rs->ti->error = "Wrong number of raid parameters given";
294 return -EINVAL;
295 }
296
297 key = argv[i++];
298 if (strict_strtoul(argv[i], 10, &value) < 0) {
299 rs->ti->error = "Bad numerical argument given in raid params";
300 return -EINVAL;
301 }
302
303 if (!strcmp(key, "rebuild")) {
304 if (++rebuild_cnt > rs->raid_type->parity_devs) {
305 rs->ti->error = "Too many rebuild drives given";
306 return -EINVAL;
307 }
308 if (value > rs->md.raid_disks) {
309 rs->ti->error = "Invalid rebuild index given";
310 return -EINVAL;
311 }
312 clear_bit(In_sync, &rs->dev[value].rdev.flags);
313 rs->dev[value].rdev.recovery_offset = 0;
314 } else if (!strcmp(key, "max_write_behind")) {
315 rs->print_flags |= DMPF_MAX_WRITE_BEHIND;
316
317 /*
318 * In device-mapper, we specify things in sectors, but
319 * MD records this value in kB
320 */
321 value /= 2;
322 if (value > COUNTER_MAX) {
323 rs->ti->error = "Max write-behind limit out of range";
324 return -EINVAL;
325 }
326 rs->md.bitmap_info.max_write_behind = value;
327 } else if (!strcmp(key, "daemon_sleep")) {
328 rs->print_flags |= DMPF_DAEMON_SLEEP;
329 if (!value || (value > MAX_SCHEDULE_TIMEOUT)) {
330 rs->ti->error = "daemon sleep period out of range";
331 return -EINVAL;
332 }
333 rs->md.bitmap_info.daemon_sleep = value;
334 } else if (!strcmp(key, "stripe_cache")) {
335 rs->print_flags |= DMPF_STRIPE_CACHE;
336
337 /*
338 * In device-mapper, we specify things in sectors, but
339 * MD records this value in kB
340 */
341 value /= 2;
342
343 if (rs->raid_type->level < 5) {
344 rs->ti->error = "Inappropriate argument: stripe_cache";
345 return -EINVAL;
346 }
347 if (raid5_set_cache_size(&rs->md, (int)value)) {
348 rs->ti->error = "Bad stripe_cache size";
349 return -EINVAL;
350 }
351 } else if (!strcmp(key, "min_recovery_rate")) {
352 rs->print_flags |= DMPF_MIN_RECOVERY_RATE;
353 if (value > INT_MAX) {
354 rs->ti->error = "min_recovery_rate out of range";
355 return -EINVAL;
356 }
357 rs->md.sync_speed_min = (int)value;
358 } else if (!strcmp(key, "max_recovery_rate")) {
359 rs->print_flags |= DMPF_MAX_RECOVERY_RATE;
360 if (value > INT_MAX) {
361 rs->ti->error = "max_recovery_rate out of range";
362 return -EINVAL;
363 }
364 rs->md.sync_speed_max = (int)value;
365 } else {
366 DMERR("Unable to parse RAID parameter: %s", key);
367 rs->ti->error = "Unable to parse RAID parameters";
368 return -EINVAL;
369 }
370 }
371
372 /* Assume there are no metadata devices until the drives are parsed */
373 rs->md.persistent = 0;
374 rs->md.external = 1;
375
376 return 0;
377}
378
379static void do_table_event(struct work_struct *ws)
380{
381 struct raid_set *rs = container_of(ws, struct raid_set, md.event_work);
382
383 dm_table_event(rs->ti->table);
384}
385
386static int raid_is_congested(struct dm_target_callbacks *cb, int bits)
387{
388 struct raid_set *rs = container_of(cb, struct raid_set, callbacks);
389
390 return md_raid5_congested(&rs->md, bits);
391}
392
393/*
394 * Construct a RAID4/5/6 mapping:
395 * Args:
396 * <raid_type> <#raid_params> <raid_params> \
397 * <#raid_devs> { <meta_dev1> <dev1> .. <meta_devN> <devN> }
398 *
399 * ** metadata devices are not supported yet, use '-' instead **
400 *
401 * <raid_params> varies by <raid_type>. See 'parse_raid_params' for
402 * details on possible <raid_params>.
403 */
404static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
405{
406 int ret;
407 struct raid_type *rt;
408 unsigned long num_raid_params, num_raid_devs;
409 struct raid_set *rs = NULL;
410
411 /* Must have at least <raid_type> <#raid_params> */
412 if (argc < 2) {
413 ti->error = "Too few arguments";
414 return -EINVAL;
415 }
416
417 /* raid type */
418 rt = get_raid_type(argv[0]);
419 if (!rt) {
420 ti->error = "Unrecognised raid_type";
421 return -EINVAL;
422 }
423 argc--;
424 argv++;
425
426 /* number of RAID parameters */
427 if (strict_strtoul(argv[0], 10, &num_raid_params) < 0) {
428 ti->error = "Cannot understand number of RAID parameters";
429 return -EINVAL;
430 }
431 argc--;
432 argv++;
433
434 /* Skip over RAID params for now and find out # of devices */
435 if (num_raid_params + 1 > argc) {
436 ti->error = "Arguments do not agree with counts given";
437 return -EINVAL;
438 }
439
440 if ((strict_strtoul(argv[num_raid_params], 10, &num_raid_devs) < 0) ||
441 (num_raid_devs >= INT_MAX)) {
442 ti->error = "Cannot understand number of raid devices";
443 return -EINVAL;
444 }
445
446 rs = context_alloc(ti, rt, (unsigned)num_raid_devs);
447 if (IS_ERR(rs))
448 return PTR_ERR(rs);
449
450 ret = parse_raid_params(rs, argv, (unsigned)num_raid_params);
451 if (ret)
452 goto bad;
453
454 ret = -EINVAL;
455
456 argc -= num_raid_params + 1; /* +1: we already have num_raid_devs */
457 argv += num_raid_params + 1;
458
459 if (argc != (num_raid_devs * 2)) {
460		ti->error = "Supplied RAID devices do not match the count given";
461 goto bad;
462 }
463
464 ret = dev_parms(rs, argv);
465 if (ret)
466 goto bad;
467
468 INIT_WORK(&rs->md.event_work, do_table_event);
469 ti->split_io = rs->md.chunk_sectors;
470 ti->private = rs;
471
472 mutex_lock(&rs->md.reconfig_mutex);
473 ret = md_run(&rs->md);
474 rs->md.in_sync = 0; /* Assume already marked dirty */
475 mutex_unlock(&rs->md.reconfig_mutex);
476
477 if (ret) {
478		ti->error = "Failed to run raid array";
479 goto bad;
480 }
481
482 rs->callbacks.congested_fn = raid_is_congested;
483 dm_table_add_target_callbacks(ti->table, &rs->callbacks);
484
485 return 0;
486
487bad:
488 context_free(rs);
489
490 return ret;
491}
492
493static void raid_dtr(struct dm_target *ti)
494{
495 struct raid_set *rs = ti->private;
496
497 list_del_init(&rs->callbacks.list);
498 md_stop(&rs->md);
499 context_free(rs);
500}
501
502static int raid_map(struct dm_target *ti, struct bio *bio, union map_info *map_context)
503{
504 struct raid_set *rs = ti->private;
505 mddev_t *mddev = &rs->md;
506
507 mddev->pers->make_request(mddev, bio);
508
509 return DM_MAPIO_SUBMITTED;
510}
511
512static int raid_status(struct dm_target *ti, status_type_t type,
513 char *result, unsigned maxlen)
514{
515 struct raid_set *rs = ti->private;
516 unsigned raid_param_cnt = 1; /* at least 1 for chunksize */
517 unsigned sz = 0;
518 int i;
519 sector_t sync;
520
521 switch (type) {
522 case STATUSTYPE_INFO:
523 DMEMIT("%s %d ", rs->raid_type->name, rs->md.raid_disks);
524
525 for (i = 0; i < rs->md.raid_disks; i++) {
526 if (test_bit(Faulty, &rs->dev[i].rdev.flags))
527 DMEMIT("D");
528 else if (test_bit(In_sync, &rs->dev[i].rdev.flags))
529 DMEMIT("A");
530 else
531 DMEMIT("a");
532 }
533
534 if (test_bit(MD_RECOVERY_RUNNING, &rs->md.recovery))
535 sync = rs->md.curr_resync_completed;
536 else
537 sync = rs->md.recovery_cp;
538
539 if (sync > rs->md.resync_max_sectors)
540 sync = rs->md.resync_max_sectors;
541
542 DMEMIT(" %llu/%llu",
543 (unsigned long long) sync,
544 (unsigned long long) rs->md.resync_max_sectors);
545
546 break;
547 case STATUSTYPE_TABLE:
548 /* The string you would use to construct this array */
549 for (i = 0; i < rs->md.raid_disks; i++)
550 if (rs->dev[i].data_dev &&
551 !test_bit(In_sync, &rs->dev[i].rdev.flags))
552 raid_param_cnt++; /* for rebuilds */
553
554 raid_param_cnt += (hweight64(rs->print_flags) * 2);
555 if (rs->print_flags & (DMPF_SYNC | DMPF_NOSYNC))
556 raid_param_cnt--;
557
558 DMEMIT("%s %u %u", rs->raid_type->name,
559 raid_param_cnt, rs->md.chunk_sectors);
560
561 if ((rs->print_flags & DMPF_SYNC) &&
562 (rs->md.recovery_cp == MaxSector))
563 DMEMIT(" sync");
564 if (rs->print_flags & DMPF_NOSYNC)
565 DMEMIT(" nosync");
566
567 for (i = 0; i < rs->md.raid_disks; i++)
568 if (rs->dev[i].data_dev &&
569 !test_bit(In_sync, &rs->dev[i].rdev.flags))
570 DMEMIT(" rebuild %u", i);
571
572 if (rs->print_flags & DMPF_DAEMON_SLEEP)
573 DMEMIT(" daemon_sleep %lu",
574 rs->md.bitmap_info.daemon_sleep);
575
576 if (rs->print_flags & DMPF_MIN_RECOVERY_RATE)
577 DMEMIT(" min_recovery_rate %d", rs->md.sync_speed_min);
578
579 if (rs->print_flags & DMPF_MAX_RECOVERY_RATE)
580 DMEMIT(" max_recovery_rate %d", rs->md.sync_speed_max);
581
582 if (rs->print_flags & DMPF_MAX_WRITE_BEHIND)
583 DMEMIT(" max_write_behind %lu",
584 rs->md.bitmap_info.max_write_behind);
585
586 if (rs->print_flags & DMPF_STRIPE_CACHE) {
587 raid5_conf_t *conf = rs->md.private;
588
589 /* convert from kiB to sectors */
590 DMEMIT(" stripe_cache %d",
591 conf ? conf->max_nr_stripes * 2 : 0);
592 }
593
594 DMEMIT(" %d", rs->md.raid_disks);
595 for (i = 0; i < rs->md.raid_disks; i++) {
596 DMEMIT(" -"); /* metadata device */
597
598 if (rs->dev[i].data_dev)
599 DMEMIT(" %s", rs->dev[i].data_dev->name);
600 else
601 DMEMIT(" -");
602 }
603 }
604
605 return 0;
606}
607
608static int raid_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data)
609{
610 struct raid_set *rs = ti->private;
611 unsigned i;
612 int ret = 0;
613
614 for (i = 0; !ret && i < rs->md.raid_disks; i++)
615 if (rs->dev[i].data_dev)
616 ret = fn(ti,
617 rs->dev[i].data_dev,
618 0, /* No offset on data devs */
619 rs->md.dev_sectors,
620 data);
621
622 return ret;
623}
624
625static void raid_io_hints(struct dm_target *ti, struct queue_limits *limits)
626{
627 struct raid_set *rs = ti->private;
628 unsigned chunk_size = rs->md.chunk_sectors << 9;
629 raid5_conf_t *conf = rs->md.private;
630
631 blk_limits_io_min(limits, chunk_size);
632 blk_limits_io_opt(limits, chunk_size * (conf->raid_disks - conf->max_degraded));
633}
634
635static void raid_presuspend(struct dm_target *ti)
636{
637 struct raid_set *rs = ti->private;
638
639 md_stop_writes(&rs->md);
640}
641
642static void raid_postsuspend(struct dm_target *ti)
643{
644 struct raid_set *rs = ti->private;
645
646 mddev_suspend(&rs->md);
647}
648
649static void raid_resume(struct dm_target *ti)
650{
651 struct raid_set *rs = ti->private;
652
653 mddev_resume(&rs->md);
654}
655
656static struct target_type raid_target = {
657 .name = "raid",
658 .version = {1, 0, 0},
659 .module = THIS_MODULE,
660 .ctr = raid_ctr,
661 .dtr = raid_dtr,
662 .map = raid_map,
663 .status = raid_status,
664 .iterate_devices = raid_iterate_devices,
665 .io_hints = raid_io_hints,
666 .presuspend = raid_presuspend,
667 .postsuspend = raid_postsuspend,
668 .resume = raid_resume,
669};
670
671static int __init dm_raid_init(void)
672{
673 return dm_register_target(&raid_target);
674}
675
676static void __exit dm_raid_exit(void)
677{
678 dm_unregister_target(&raid_target);
679}
680
681module_init(dm_raid_init);
682module_exit(dm_raid_exit);
683
684MODULE_DESCRIPTION(DM_NAME " raid4/5/6 target");
685MODULE_ALIAS("dm-raid4");
686MODULE_ALIAS("dm-raid5");
687MODULE_ALIAS("dm-raid6");
688MODULE_AUTHOR("Neil Brown <dm-devel@redhat.com>");
689MODULE_LICENSE("GPL");
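Per the constructor comment in raid_ctr() above, a table line for the new "raid" target takes the form <raid_type> <#raid_params> <raid_params> <#raid_devs> { <meta_dev1> <dev1> .. }, with '-' standing in for the not-yet-supported metadata devices. A hypothetical example for a four-disk raid5_ls set with a 128-sector (64 KiB) chunk and no optional parameters; the device names and target length are placeholders, and the length must divide evenly by the three data disks:

	0 5860533168 raid raid5_ls 1 128 4 - /dev/sda1 - /dev/sdb1 - /dev/sdc1 - /dev/sdd1

Here the '1' says that only the mandatory chunk size follows, and the '4' introduces the four metadata/data device pairs.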
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 7c081bcbc3cf..9bfd057be686 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -22,8 +22,6 @@
22#define DM_MSG_PREFIX "raid1" 22#define DM_MSG_PREFIX "raid1"
23 23
24#define MAX_RECOVERY 1 /* Maximum number of regions recovered in parallel. */ 24#define MAX_RECOVERY 1 /* Maximum number of regions recovered in parallel. */
25#define DM_IO_PAGES 64
26#define DM_KCOPYD_PAGES 64
27 25
28#define DM_RAID1_HANDLE_ERRORS 0x01 26#define DM_RAID1_HANDLE_ERRORS 0x01
29#define errors_handled(p) ((p)->features & DM_RAID1_HANDLE_ERRORS) 27#define errors_handled(p) ((p)->features & DM_RAID1_HANDLE_ERRORS)
@@ -259,9 +257,9 @@ static int mirror_flush(struct dm_target *ti)
259 struct dm_io_region io[ms->nr_mirrors]; 257 struct dm_io_region io[ms->nr_mirrors];
260 struct mirror *m; 258 struct mirror *m;
261 struct dm_io_request io_req = { 259 struct dm_io_request io_req = {
262 .bi_rw = WRITE_BARRIER, 260 .bi_rw = WRITE_FLUSH,
263 .mem.type = DM_IO_KMEM, 261 .mem.type = DM_IO_KMEM,
264 .mem.ptr.bvec = NULL, 262 .mem.ptr.addr = NULL,
265 .client = ms->io_client, 263 .client = ms->io_client,
266 }; 264 };
267 265
@@ -629,7 +627,7 @@ static void do_write(struct mirror_set *ms, struct bio *bio)
629 struct dm_io_region io[ms->nr_mirrors], *dest = io; 627 struct dm_io_region io[ms->nr_mirrors], *dest = io;
630 struct mirror *m; 628 struct mirror *m;
631 struct dm_io_request io_req = { 629 struct dm_io_request io_req = {
632 .bi_rw = WRITE | (bio->bi_rw & WRITE_BARRIER), 630 .bi_rw = WRITE | (bio->bi_rw & WRITE_FLUSH_FUA),
633 .mem.type = DM_IO_BVEC, 631 .mem.type = DM_IO_BVEC,
634 .mem.ptr.bvec = bio->bi_io_vec + bio->bi_idx, 632 .mem.ptr.bvec = bio->bi_io_vec + bio->bi_idx,
635 .notify.fn = write_callback, 633 .notify.fn = write_callback,
@@ -637,6 +635,12 @@ static void do_write(struct mirror_set *ms, struct bio *bio)
637 .client = ms->io_client, 635 .client = ms->io_client,
638 }; 636 };
639 637
638 if (bio->bi_rw & REQ_DISCARD) {
639 io_req.bi_rw |= REQ_DISCARD;
640 io_req.mem.type = DM_IO_KMEM;
641 io_req.mem.ptr.addr = NULL;
642 }
643
640 for (i = 0, m = ms->mirror; i < ms->nr_mirrors; i++, m++) 644 for (i = 0, m = ms->mirror; i < ms->nr_mirrors; i++, m++)
641 map_region(dest++, m, bio); 645 map_region(dest++, m, bio);
642 646
@@ -670,7 +674,8 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes)
670 bio_list_init(&requeue); 674 bio_list_init(&requeue);
671 675
672 while ((bio = bio_list_pop(writes))) { 676 while ((bio = bio_list_pop(writes))) {
673 if (unlikely(bio_empty_barrier(bio))) { 677 if ((bio->bi_rw & REQ_FLUSH) ||
678 (bio->bi_rw & REQ_DISCARD)) {
674 bio_list_add(&sync, bio); 679 bio_list_add(&sync, bio);
675 continue; 680 continue;
676 } 681 }
@@ -835,8 +840,6 @@ static void do_mirror(struct work_struct *work)
835 do_reads(ms, &reads); 840 do_reads(ms, &reads);
836 do_writes(ms, &writes); 841 do_writes(ms, &writes);
837 do_failures(ms, &failures); 842 do_failures(ms, &failures);
838
839 dm_table_unplug_all(ms->ti->table);
840} 843}
841 844
842/*----------------------------------------------------------------- 845/*-----------------------------------------------------------------
@@ -882,7 +885,7 @@ static struct mirror_set *alloc_context(unsigned int nr_mirrors,
882 return NULL; 885 return NULL;
883 } 886 }
884 887
885 ms->io_client = dm_io_client_create(DM_IO_PAGES); 888 ms->io_client = dm_io_client_create();
886 if (IS_ERR(ms->io_client)) { 889 if (IS_ERR(ms->io_client)) {
887 ti->error = "Error creating dm_io client"; 890 ti->error = "Error creating dm_io client";
888 mempool_destroy(ms->read_record_pool); 891 mempool_destroy(ms->read_record_pool);
@@ -1076,8 +1079,10 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1076 ti->private = ms; 1079 ti->private = ms;
1077 ti->split_io = dm_rh_get_region_size(ms->rh); 1080 ti->split_io = dm_rh_get_region_size(ms->rh);
1078 ti->num_flush_requests = 1; 1081 ti->num_flush_requests = 1;
1082 ti->num_discard_requests = 1;
1079 1083
1080 ms->kmirrord_wq = create_singlethread_workqueue("kmirrord"); 1084 ms->kmirrord_wq = alloc_workqueue("kmirrord",
1085 WQ_NON_REENTRANT | WQ_MEM_RECLAIM, 0);
1081 if (!ms->kmirrord_wq) { 1086 if (!ms->kmirrord_wq) {
1082 DMERR("couldn't start kmirrord"); 1087 DMERR("couldn't start kmirrord");
1083 r = -ENOMEM; 1088 r = -ENOMEM;
@@ -1110,9 +1115,11 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1110 goto err_destroy_wq; 1115 goto err_destroy_wq;
1111 } 1116 }
1112 1117
1113 r = dm_kcopyd_client_create(DM_KCOPYD_PAGES, &ms->kcopyd_client); 1118 ms->kcopyd_client = dm_kcopyd_client_create();
1114 if (r) 1119 if (IS_ERR(ms->kcopyd_client)) {
1120 r = PTR_ERR(ms->kcopyd_client);
1115 goto err_destroy_wq; 1121 goto err_destroy_wq;
1122 }
1116 1123
1117 wakeup_mirrord(ms); 1124 wakeup_mirrord(ms);
1118 return 0; 1125 return 0;
@@ -1130,7 +1137,7 @@ static void mirror_dtr(struct dm_target *ti)
1130 1137
1131 del_timer_sync(&ms->timer); 1138 del_timer_sync(&ms->timer);
1132 flush_workqueue(ms->kmirrord_wq); 1139 flush_workqueue(ms->kmirrord_wq);
1133 flush_scheduled_work(); 1140 flush_work_sync(&ms->trigger_event);
1134 dm_kcopyd_client_destroy(ms->kcopyd_client); 1141 dm_kcopyd_client_destroy(ms->kcopyd_client);
1135 destroy_workqueue(ms->kmirrord_wq); 1142 destroy_workqueue(ms->kmirrord_wq);
1136 free_context(ms, ti, ms->nr_mirrors); 1143 free_context(ms, ti, ms->nr_mirrors);
@@ -1203,7 +1210,7 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio,
1203 * We need to dec pending if this was a write. 1210 * We need to dec pending if this was a write.
1204 */ 1211 */
1205 if (rw == WRITE) { 1212 if (rw == WRITE) {
1206 if (likely(!bio_empty_barrier(bio))) 1213 if (!(bio->bi_rw & REQ_FLUSH))
1207 dm_rh_dec(ms->rh, map_context->ll); 1214 dm_rh_dec(ms->rh, map_context->ll);
1208 return error; 1215 return error;
1209 } 1216 }
@@ -1406,7 +1413,7 @@ static int mirror_iterate_devices(struct dm_target *ti,
1406 1413
1407static struct target_type mirror_target = { 1414static struct target_type mirror_target = {
1408 .name = "mirror", 1415 .name = "mirror",
1409 .version = {1, 12, 0}, 1416 .version = {1, 12, 1},
1410 .module = THIS_MODULE, 1417 .module = THIS_MODULE,
1411 .ctr = mirror_ctr, 1418 .ctr = mirror_ctr,
1412 .dtr = mirror_dtr, 1419 .dtr = mirror_dtr,
diff --git a/drivers/md/dm-region-hash.c b/drivers/md/dm-region-hash.c
index bd5c58b28868..7771ed212182 100644
--- a/drivers/md/dm-region-hash.c
+++ b/drivers/md/dm-region-hash.c
@@ -81,9 +81,9 @@ struct dm_region_hash {
81 struct list_head failed_recovered_regions; 81 struct list_head failed_recovered_regions;
82 82
83 /* 83 /*
84 * If there was a barrier failure no regions can be marked clean. 84 * If there was a flush failure no regions can be marked clean.
85 */ 85 */
86 int barrier_failure; 86 int flush_failure;
87 87
88 void *context; 88 void *context;
89 sector_t target_begin; 89 sector_t target_begin;
@@ -217,7 +217,7 @@ struct dm_region_hash *dm_region_hash_create(
217 INIT_LIST_HEAD(&rh->quiesced_regions); 217 INIT_LIST_HEAD(&rh->quiesced_regions);
218 INIT_LIST_HEAD(&rh->recovered_regions); 218 INIT_LIST_HEAD(&rh->recovered_regions);
219 INIT_LIST_HEAD(&rh->failed_recovered_regions); 219 INIT_LIST_HEAD(&rh->failed_recovered_regions);
220 rh->barrier_failure = 0; 220 rh->flush_failure = 0;
221 221
222 rh->region_pool = mempool_create_kmalloc_pool(MIN_REGIONS, 222 rh->region_pool = mempool_create_kmalloc_pool(MIN_REGIONS,
223 sizeof(struct dm_region)); 223 sizeof(struct dm_region));
@@ -399,8 +399,8 @@ void dm_rh_mark_nosync(struct dm_region_hash *rh, struct bio *bio)
399 region_t region = dm_rh_bio_to_region(rh, bio); 399 region_t region = dm_rh_bio_to_region(rh, bio);
400 int recovering = 0; 400 int recovering = 0;
401 401
402 if (bio_empty_barrier(bio)) { 402 if (bio->bi_rw & REQ_FLUSH) {
403 rh->barrier_failure = 1; 403 rh->flush_failure = 1;
404 return; 404 return;
405 } 405 }
406 406
@@ -419,7 +419,7 @@ void dm_rh_mark_nosync(struct dm_region_hash *rh, struct bio *bio)
419 /* 419 /*
420 * Possible cases: 420 * Possible cases:
421 * 1) DM_RH_DIRTY 421 * 1) DM_RH_DIRTY
422 * 2) DM_RH_NOSYNC: was dirty, other preceeding writes failed 422 * 2) DM_RH_NOSYNC: was dirty, other preceding writes failed
423 * 3) DM_RH_RECOVERING: flushing pending writes 423 * 3) DM_RH_RECOVERING: flushing pending writes
424 * Either case, the region should have not been connected to list. 424 * Either case, the region should have not been connected to list.
425 */ 425 */
@@ -524,7 +524,7 @@ void dm_rh_inc_pending(struct dm_region_hash *rh, struct bio_list *bios)
524 struct bio *bio; 524 struct bio *bio;
525 525
526 for (bio = bios->head; bio; bio = bio->bi_next) { 526 for (bio = bios->head; bio; bio = bio->bi_next) {
527 if (bio_empty_barrier(bio)) 527 if (bio->bi_rw & REQ_FLUSH)
528 continue; 528 continue;
529 rh_inc(rh, dm_rh_bio_to_region(rh, bio)); 529 rh_inc(rh, dm_rh_bio_to_region(rh, bio));
530 } 530 }
@@ -555,9 +555,9 @@ void dm_rh_dec(struct dm_region_hash *rh, region_t region)
555 */ 555 */
556 556
557 /* do nothing for DM_RH_NOSYNC */ 557 /* do nothing for DM_RH_NOSYNC */
558 if (unlikely(rh->barrier_failure)) { 558 if (unlikely(rh->flush_failure)) {
559 /* 559 /*
560 * If a write barrier failed some time ago, we 560 * If a write flush failed some time ago, we
561 * don't know whether or not this write made it 561 * don't know whether or not this write made it
562 * to the disk, so we must resync the device. 562 * to the disk, so we must resync the device.
563 */ 563 */
diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c
index cc2bdb83f9ad..135c2f1fdbfc 100644
--- a/drivers/md/dm-snap-persistent.c
+++ b/drivers/md/dm-snap-persistent.c
@@ -154,11 +154,6 @@ struct pstore {
154 struct workqueue_struct *metadata_wq; 154 struct workqueue_struct *metadata_wq;
155}; 155};
156 156
157static unsigned sectors_to_pages(unsigned sectors)
158{
159 return DIV_ROUND_UP(sectors, PAGE_SIZE >> 9);
160}
161
162static int alloc_area(struct pstore *ps) 157static int alloc_area(struct pstore *ps)
163{ 158{
164 int r = -ENOMEM; 159 int r = -ENOMEM;
@@ -254,9 +249,9 @@ static int chunk_io(struct pstore *ps, void *area, chunk_t chunk, int rw,
254 * Issue the synchronous I/O from a different thread 249 * Issue the synchronous I/O from a different thread
255 * to avoid generic_make_request recursion. 250 * to avoid generic_make_request recursion.
256 */ 251 */
257 INIT_WORK_ON_STACK(&req.work, do_metadata); 252 INIT_WORK_ONSTACK(&req.work, do_metadata);
258 queue_work(ps->metadata_wq, &req.work); 253 queue_work(ps->metadata_wq, &req.work);
259 flush_workqueue(ps->metadata_wq); 254 flush_work(&req.work);
260 255
261 return req.result; 256 return req.result;
262} 257}
@@ -318,8 +313,7 @@ static int read_header(struct pstore *ps, int *new_snapshot)
318 chunk_size_supplied = 0; 313 chunk_size_supplied = 0;
319 } 314 }
320 315
321 ps->io_client = dm_io_client_create(sectors_to_pages(ps->store-> 316 ps->io_client = dm_io_client_create();
322 chunk_size));
323 if (IS_ERR(ps->io_client)) 317 if (IS_ERR(ps->io_client))
324 return PTR_ERR(ps->io_client); 318 return PTR_ERR(ps->io_client);
325 319
@@ -368,11 +362,6 @@ static int read_header(struct pstore *ps, int *new_snapshot)
368 return r; 362 return r;
369 } 363 }
370 364
371 r = dm_io_client_resize(sectors_to_pages(ps->store->chunk_size),
372 ps->io_client);
373 if (r)
374 return r;
375
376 r = alloc_area(ps); 365 r = alloc_area(ps);
377 return r; 366 return r;
378 367
@@ -687,7 +676,7 @@ static void persistent_commit_exception(struct dm_exception_store *store,
687 /* 676 /*
688 * Commit exceptions to disk. 677 * Commit exceptions to disk.
689 */ 678 */
690 if (ps->valid && area_io(ps, WRITE_BARRIER)) 679 if (ps->valid && area_io(ps, WRITE_FLUSH_FUA))
691 ps->valid = 0; 680 ps->valid = 0;
692 681
693 /* 682 /*
@@ -818,7 +807,7 @@ static int persistent_ctr(struct dm_exception_store *store,
818 atomic_set(&ps->pending_count, 0); 807 atomic_set(&ps->pending_count, 0);
819 ps->callbacks = NULL; 808 ps->callbacks = NULL;
820 809
821 ps->metadata_wq = create_singlethread_workqueue("ksnaphd"); 810 ps->metadata_wq = alloc_workqueue("ksnaphd", WQ_MEM_RECLAIM, 0);
822 if (!ps->metadata_wq) { 811 if (!ps->metadata_wq) {
823 kfree(ps); 812 kfree(ps);
824 DMERR("couldn't start header metadata update thread"); 813 DMERR("couldn't start header metadata update thread");
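In the chunk_io() hunk above, the synchronous metadata I/O now waits on its single on-stack work item with flush_work() rather than draining the whole metadata workqueue, and the initializer follows the INIT_WORK_ON_STACK() to INIT_WORK_ONSTACK() rename. A minimal sketch of that pattern, assuming the struct mdata_req request structure that chunk_io() already uses and a where/io_req pair prepared by the caller:

	struct mdata_req req = {
		.where	= &where,	/* dm_io_region prepared earlier */
		.io_req	= &io_req,	/* dm_io_request prepared earlier */
	};

	INIT_WORK_ONSTACK(&req.work, do_metadata);
	queue_work(ps->metadata_wq, &req.work);
	flush_work(&req.work);		/* wait for this one item only */
	return req.result;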
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 5974d3094d97..9ecff5f3023a 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -19,7 +19,6 @@
19#include <linux/vmalloc.h> 19#include <linux/vmalloc.h>
20#include <linux/log2.h> 20#include <linux/log2.h>
21#include <linux/dm-kcopyd.h> 21#include <linux/dm-kcopyd.h>
22#include <linux/workqueue.h>
23 22
24#include "dm-exception-store.h" 23#include "dm-exception-store.h"
25 24
@@ -41,11 +40,6 @@ static const char dm_snapshot_merge_target_name[] = "snapshot-merge";
41#define SNAPSHOT_COPY_PRIORITY 2 40#define SNAPSHOT_COPY_PRIORITY 2
42 41
43/* 42/*
44 * Reserve 1MB for each snapshot initially (with minimum of 1 page).
45 */
46#define SNAPSHOT_PAGES (((1UL << 20) >> PAGE_SHIFT) ? : 1)
47
48/*
49 * The size of the mempool used to track chunks in use. 43 * The size of the mempool used to track chunks in use.
50 */ 44 */
51#define MIN_IOS 256 45#define MIN_IOS 256
@@ -80,9 +74,6 @@ struct dm_snapshot {
80 /* Origin writes don't trigger exceptions until this is set */ 74 /* Origin writes don't trigger exceptions until this is set */
81 int active; 75 int active;
82 76
83 /* Whether or not owning mapped_device is suspended */
84 int suspended;
85
86 atomic_t pending_exceptions_count; 77 atomic_t pending_exceptions_count;
87 78
88 mempool_t *pending_pool; 79 mempool_t *pending_pool;
@@ -106,10 +97,6 @@ struct dm_snapshot {
106 97
107 struct dm_kcopyd_client *kcopyd_client; 98 struct dm_kcopyd_client *kcopyd_client;
108 99
109 /* Queue of snapshot writes for ksnapd to flush */
110 struct bio_list queued_bios;
111 struct work_struct queued_bios_work;
112
113 /* Wait for events based on state_bits */ 100 /* Wait for events based on state_bits */
114 unsigned long state_bits; 101 unsigned long state_bits;
115 102
@@ -160,9 +147,6 @@ struct dm_dev *dm_snap_cow(struct dm_snapshot *s)
160} 147}
161EXPORT_SYMBOL(dm_snap_cow); 148EXPORT_SYMBOL(dm_snap_cow);
162 149
163static struct workqueue_struct *ksnapd;
164static void flush_queued_bios(struct work_struct *work);
165
166static sector_t chunk_to_sector(struct dm_exception_store *store, 150static sector_t chunk_to_sector(struct dm_exception_store *store,
167 chunk_t chunk) 151 chunk_t chunk)
168{ 152{
@@ -706,8 +690,6 @@ static int dm_add_exception(void *context, chunk_t old, chunk_t new)
706 return 0; 690 return 0;
707} 691}
708 692
709#define min_not_zero(l, r) (((l) == 0) ? (r) : (((r) == 0) ? (l) : min(l, r)))
710
711/* 693/*
712 * Return a minimum chunk size of all snapshots that have the specified origin. 694 * Return a minimum chunk size of all snapshots that have the specified origin.
713 * Return zero if the origin has no snapshots. 695 * Return zero if the origin has no snapshots.
@@ -1093,7 +1075,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1093 argv++; 1075 argv++;
1094 argc--; 1076 argc--;
1095 1077
1096 r = dm_get_device(ti, cow_path, FMODE_READ | FMODE_WRITE, &s->cow); 1078 r = dm_get_device(ti, cow_path, dm_table_get_mode(ti->table), &s->cow);
1097 if (r) { 1079 if (r) {
1098 ti->error = "Cannot get COW device"; 1080 ti->error = "Cannot get COW device";
1099 goto bad_cow; 1081 goto bad_cow;
@@ -1112,7 +1094,6 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1112 s->ti = ti; 1094 s->ti = ti;
1113 s->valid = 1; 1095 s->valid = 1;
1114 s->active = 0; 1096 s->active = 0;
1115 s->suspended = 0;
1116 atomic_set(&s->pending_exceptions_count, 0); 1097 atomic_set(&s->pending_exceptions_count, 0);
1117 init_rwsem(&s->lock); 1098 init_rwsem(&s->lock);
1118 INIT_LIST_HEAD(&s->list); 1099 INIT_LIST_HEAD(&s->list);
@@ -1130,8 +1111,9 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1130 goto bad_hash_tables; 1111 goto bad_hash_tables;
1131 } 1112 }
1132 1113
1133 r = dm_kcopyd_client_create(SNAPSHOT_PAGES, &s->kcopyd_client); 1114 s->kcopyd_client = dm_kcopyd_client_create();
1134 if (r) { 1115 if (IS_ERR(s->kcopyd_client)) {
1116 r = PTR_ERR(s->kcopyd_client);
1135 ti->error = "Could not create kcopyd client"; 1117 ti->error = "Could not create kcopyd client";
1136 goto bad_kcopyd; 1118 goto bad_kcopyd;
1137 } 1119 }
@@ -1155,9 +1137,6 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1155 1137
1156 spin_lock_init(&s->tracked_chunk_lock); 1138 spin_lock_init(&s->tracked_chunk_lock);
1157 1139
1158 bio_list_init(&s->queued_bios);
1159 INIT_WORK(&s->queued_bios_work, flush_queued_bios);
1160
1161 ti->private = s; 1140 ti->private = s;
1162 ti->num_flush_requests = num_flush_requests; 1141 ti->num_flush_requests = num_flush_requests;
1163 1142
@@ -1281,8 +1260,6 @@ static void snapshot_dtr(struct dm_target *ti)
1281 struct dm_snapshot *s = ti->private; 1260 struct dm_snapshot *s = ti->private;
1282 struct dm_snapshot *snap_src = NULL, *snap_dest = NULL; 1261 struct dm_snapshot *snap_src = NULL, *snap_dest = NULL;
1283 1262
1284 flush_workqueue(ksnapd);
1285
1286 down_read(&_origins_lock); 1263 down_read(&_origins_lock);
1287 /* Check whether exception handover must be cancelled */ 1264 /* Check whether exception handover must be cancelled */
1288 (void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL); 1265 (void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL);
@@ -1344,20 +1321,6 @@ static void flush_bios(struct bio *bio)
1344 } 1321 }
1345} 1322}
1346 1323
1347static void flush_queued_bios(struct work_struct *work)
1348{
1349 struct dm_snapshot *s =
1350 container_of(work, struct dm_snapshot, queued_bios_work);
1351 struct bio *queued_bios;
1352 unsigned long flags;
1353
1354 spin_lock_irqsave(&s->pe_lock, flags);
1355 queued_bios = bio_list_get(&s->queued_bios);
1356 spin_unlock_irqrestore(&s->pe_lock, flags);
1357
1358 flush_bios(queued_bios);
1359}
1360
1361static int do_origin(struct dm_dev *origin, struct bio *bio); 1324static int do_origin(struct dm_dev *origin, struct bio *bio);
1362 1325
1363/* 1326/*
@@ -1587,7 +1550,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio,
1587 chunk_t chunk; 1550 chunk_t chunk;
1588 struct dm_snap_pending_exception *pe = NULL; 1551 struct dm_snap_pending_exception *pe = NULL;
1589 1552
1590 if (unlikely(bio_empty_barrier(bio))) { 1553 if (bio->bi_rw & REQ_FLUSH) {
1591 bio->bi_bdev = s->cow->bdev; 1554 bio->bi_bdev = s->cow->bdev;
1592 return DM_MAPIO_REMAPPED; 1555 return DM_MAPIO_REMAPPED;
1593 } 1556 }
@@ -1691,7 +1654,7 @@ static int snapshot_merge_map(struct dm_target *ti, struct bio *bio,
1691 int r = DM_MAPIO_REMAPPED; 1654 int r = DM_MAPIO_REMAPPED;
1692 chunk_t chunk; 1655 chunk_t chunk;
1693 1656
1694 if (unlikely(bio_empty_barrier(bio))) { 1657 if (bio->bi_rw & REQ_FLUSH) {
1695 if (!map_context->target_request_nr) 1658 if (!map_context->target_request_nr)
1696 bio->bi_bdev = s->origin->bdev; 1659 bio->bi_bdev = s->origin->bdev;
1697 else 1660 else
@@ -1762,15 +1725,6 @@ static void snapshot_merge_presuspend(struct dm_target *ti)
1762 stop_merge(s); 1725 stop_merge(s);
1763} 1726}
1764 1727
1765static void snapshot_postsuspend(struct dm_target *ti)
1766{
1767 struct dm_snapshot *s = ti->private;
1768
1769 down_write(&s->lock);
1770 s->suspended = 1;
1771 up_write(&s->lock);
1772}
1773
1774static int snapshot_preresume(struct dm_target *ti) 1728static int snapshot_preresume(struct dm_target *ti)
1775{ 1729{
1776 int r = 0; 1730 int r = 0;
@@ -1785,7 +1739,7 @@ static int snapshot_preresume(struct dm_target *ti)
1785 DMERR("Unable to resume snapshot source until " 1739 DMERR("Unable to resume snapshot source until "
1786 "handover completes."); 1740 "handover completes.");
1787 r = -EINVAL; 1741 r = -EINVAL;
1788 } else if (!snap_src->suspended) { 1742 } else if (!dm_suspended(snap_src->ti)) {
1789 DMERR("Unable to perform snapshot handover until " 1743 DMERR("Unable to perform snapshot handover until "
1790 "source is suspended."); 1744 "source is suspended.");
1791 r = -EINVAL; 1745 r = -EINVAL;
@@ -1818,7 +1772,6 @@ static void snapshot_resume(struct dm_target *ti)
1818 1772
1819 down_write(&s->lock); 1773 down_write(&s->lock);
1820 s->active = 1; 1774 s->active = 1;
1821 s->suspended = 0;
1822 up_write(&s->lock); 1775 up_write(&s->lock);
1823} 1776}
1824 1777
@@ -2135,7 +2088,7 @@ static int origin_map(struct dm_target *ti, struct bio *bio,
2135 struct dm_dev *dev = ti->private; 2088 struct dm_dev *dev = ti->private;
2136 bio->bi_bdev = dev->bdev; 2089 bio->bi_bdev = dev->bdev;
2137 2090
2138 if (unlikely(bio_empty_barrier(bio))) 2091 if (bio->bi_rw & REQ_FLUSH)
2139 return DM_MAPIO_REMAPPED; 2092 return DM_MAPIO_REMAPPED;
2140 2093
2141 /* Only tell snapshots if this is a write */ 2094 /* Only tell snapshots if this is a write */
@@ -2196,7 +2149,7 @@ static int origin_iterate_devices(struct dm_target *ti,
2196 2149
2197static struct target_type origin_target = { 2150static struct target_type origin_target = {
2198 .name = "snapshot-origin", 2151 .name = "snapshot-origin",
2199 .version = {1, 7, 0}, 2152 .version = {1, 7, 1},
2200 .module = THIS_MODULE, 2153 .module = THIS_MODULE,
2201 .ctr = origin_ctr, 2154 .ctr = origin_ctr,
2202 .dtr = origin_dtr, 2155 .dtr = origin_dtr,
@@ -2209,13 +2162,12 @@ static struct target_type origin_target = {
2209 2162
2210static struct target_type snapshot_target = { 2163static struct target_type snapshot_target = {
2211 .name = "snapshot", 2164 .name = "snapshot",
2212 .version = {1, 9, 0}, 2165 .version = {1, 10, 0},
2213 .module = THIS_MODULE, 2166 .module = THIS_MODULE,
2214 .ctr = snapshot_ctr, 2167 .ctr = snapshot_ctr,
2215 .dtr = snapshot_dtr, 2168 .dtr = snapshot_dtr,
2216 .map = snapshot_map, 2169 .map = snapshot_map,
2217 .end_io = snapshot_end_io, 2170 .end_io = snapshot_end_io,
2218 .postsuspend = snapshot_postsuspend,
2219 .preresume = snapshot_preresume, 2171 .preresume = snapshot_preresume,
2220 .resume = snapshot_resume, 2172 .resume = snapshot_resume,
2221 .status = snapshot_status, 2173 .status = snapshot_status,
@@ -2224,14 +2176,13 @@ static struct target_type snapshot_target = {
2224 2176
2225static struct target_type merge_target = { 2177static struct target_type merge_target = {
2226 .name = dm_snapshot_merge_target_name, 2178 .name = dm_snapshot_merge_target_name,
2227 .version = {1, 0, 0}, 2179 .version = {1, 1, 0},
2228 .module = THIS_MODULE, 2180 .module = THIS_MODULE,
2229 .ctr = snapshot_ctr, 2181 .ctr = snapshot_ctr,
2230 .dtr = snapshot_dtr, 2182 .dtr = snapshot_dtr,
2231 .map = snapshot_merge_map, 2183 .map = snapshot_merge_map,
2232 .end_io = snapshot_end_io, 2184 .end_io = snapshot_end_io,
2233 .presuspend = snapshot_merge_presuspend, 2185 .presuspend = snapshot_merge_presuspend,
2234 .postsuspend = snapshot_postsuspend,
2235 .preresume = snapshot_preresume, 2186 .preresume = snapshot_preresume,
2236 .resume = snapshot_merge_resume, 2187 .resume = snapshot_merge_resume,
2237 .status = snapshot_status, 2188 .status = snapshot_status,
@@ -2293,17 +2244,8 @@ static int __init dm_snapshot_init(void)
2293 goto bad_tracked_chunk_cache; 2244 goto bad_tracked_chunk_cache;
2294 } 2245 }
2295 2246
2296 ksnapd = create_singlethread_workqueue("ksnapd");
2297 if (!ksnapd) {
2298 DMERR("Failed to create ksnapd workqueue.");
2299 r = -ENOMEM;
2300 goto bad_pending_pool;
2301 }
2302
2303 return 0; 2247 return 0;
2304 2248
2305bad_pending_pool:
2306 kmem_cache_destroy(tracked_chunk_cache);
2307bad_tracked_chunk_cache: 2249bad_tracked_chunk_cache:
2308 kmem_cache_destroy(pending_cache); 2250 kmem_cache_destroy(pending_cache);
2309bad_pending_cache: 2251bad_pending_cache:
@@ -2324,8 +2266,6 @@ bad_register_snapshot_target:
2324 2266
2325static void __exit dm_snapshot_exit(void) 2267static void __exit dm_snapshot_exit(void)
2326{ 2268{
2327 destroy_workqueue(ksnapd);
2328
2329 dm_unregister_target(&snapshot_target); 2269 dm_unregister_target(&snapshot_target);
2330 dm_unregister_target(&origin_target); 2270 dm_unregister_target(&origin_target);
2331 dm_unregister_target(&merge_target); 2271 dm_unregister_target(&merge_target);
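Throughout dm-snap.c the old bio_empty_barrier() tests become plain bit tests on bio->bi_rw: a flush is recognized by the REQ_FLUSH flag and simply remapped, and the ksnapd queueing detour disappears. A tiny userspace model of the flag test, using made-up flag values (the real constants live in the block layer headers):

    #include <stdio.h>

    /* Hypothetical flag bits standing in for the kernel's REQ_* values. */
    #define REQ_WRITE (1UL << 0)
    #define REQ_FLUSH (1UL << 1)

    struct fake_bio {
            unsigned long bi_rw;   /* request flags */
            unsigned int  bi_size; /* bytes of payload */
    };

    /* Mirrors the shape of the new checks: test the flag, then branch. */
    static const char *classify(const struct fake_bio *bio)
    {
            if (bio->bi_rw & REQ_FLUSH)
                    return bio->bi_size ? "flush with data" : "empty flush";
            return "ordinary I/O";
    }

    int main(void)
    {
            struct fake_bio flush = { REQ_WRITE | REQ_FLUSH, 0 };
            struct fake_bio write = { REQ_WRITE, 4096 };

            printf("%s\n%s\n", classify(&flush), classify(&write));
            return 0;
    }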
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index c297f6da91ea..3d80cf0c152d 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -39,23 +39,20 @@ struct stripe_c {
39 struct dm_target *ti; 39 struct dm_target *ti;
40 40
41 /* Work struct used for triggering events*/ 41 /* Work struct used for triggering events*/
42 struct work_struct kstriped_ws; 42 struct work_struct trigger_event;
43 43
44 struct stripe stripe[0]; 44 struct stripe stripe[0];
45}; 45};
46 46
47static struct workqueue_struct *kstriped;
48
49/* 47/*
50 * An event is triggered whenever a drive 48 * An event is triggered whenever a drive
51 * drops out of a stripe volume. 49 * drops out of a stripe volume.
52 */ 50 */
53static void trigger_event(struct work_struct *work) 51static void trigger_event(struct work_struct *work)
54{ 52{
55 struct stripe_c *sc = container_of(work, struct stripe_c, kstriped_ws); 53 struct stripe_c *sc = container_of(work, struct stripe_c,
56 54 trigger_event);
57 dm_table_event(sc->ti->table); 55 dm_table_event(sc->ti->table);
58
59} 56}
60 57
61static inline struct stripe_c *alloc_context(unsigned int stripes) 58static inline struct stripe_c *alloc_context(unsigned int stripes)
@@ -160,7 +157,7 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv)
160 return -ENOMEM; 157 return -ENOMEM;
161 } 158 }
162 159
163 INIT_WORK(&sc->kstriped_ws, trigger_event); 160 INIT_WORK(&sc->trigger_event, trigger_event);
164 161
165 /* Set pointer to dm target; used in trigger_event */ 162 /* Set pointer to dm target; used in trigger_event */
166 sc->ti = ti; 163 sc->ti = ti;
@@ -211,7 +208,7 @@ static void stripe_dtr(struct dm_target *ti)
211 for (i = 0; i < sc->stripes; i++) 208 for (i = 0; i < sc->stripes; i++)
212 dm_put_device(ti, sc->stripe[i].dev); 209 dm_put_device(ti, sc->stripe[i].dev);
213 210
214 flush_workqueue(kstriped); 211 flush_work_sync(&sc->trigger_event);
215 kfree(sc); 212 kfree(sc);
216} 213}
217 214
@@ -271,7 +268,7 @@ static int stripe_map(struct dm_target *ti, struct bio *bio,
271 uint32_t stripe; 268 uint32_t stripe;
272 unsigned target_request_nr; 269 unsigned target_request_nr;
273 270
274 if (unlikely(bio_empty_barrier(bio))) { 271 if (bio->bi_rw & REQ_FLUSH) {
275 target_request_nr = map_context->target_request_nr; 272 target_request_nr = map_context->target_request_nr;
276 BUG_ON(target_request_nr >= sc->stripes); 273 BUG_ON(target_request_nr >= sc->stripes);
277 bio->bi_bdev = sc->stripe[target_request_nr].dev->bdev; 274 bio->bi_bdev = sc->stripe[target_request_nr].dev->bdev;
@@ -367,7 +364,7 @@ static int stripe_end_io(struct dm_target *ti, struct bio *bio,
367 atomic_inc(&(sc->stripe[i].error_count)); 364 atomic_inc(&(sc->stripe[i].error_count));
368 if (atomic_read(&(sc->stripe[i].error_count)) < 365 if (atomic_read(&(sc->stripe[i].error_count)) <
369 DM_IO_ERROR_THRESHOLD) 366 DM_IO_ERROR_THRESHOLD)
370 queue_work(kstriped, &sc->kstriped_ws); 367 schedule_work(&sc->trigger_event);
371 } 368 }
372 369
373 return error; 370 return error;
@@ -399,9 +396,29 @@ static void stripe_io_hints(struct dm_target *ti,
399 blk_limits_io_opt(limits, chunk_size * sc->stripes); 396 blk_limits_io_opt(limits, chunk_size * sc->stripes);
400} 397}
401 398
399static int stripe_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
400 struct bio_vec *biovec, int max_size)
401{
402 struct stripe_c *sc = ti->private;
403 sector_t bvm_sector = bvm->bi_sector;
404 uint32_t stripe;
405 struct request_queue *q;
406
407 stripe_map_sector(sc, bvm_sector, &stripe, &bvm_sector);
408
409 q = bdev_get_queue(sc->stripe[stripe].dev->bdev);
410 if (!q->merge_bvec_fn)
411 return max_size;
412
413 bvm->bi_bdev = sc->stripe[stripe].dev->bdev;
414 bvm->bi_sector = sc->stripe[stripe].physical_start + bvm_sector;
415
416 return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
417}
418
402static struct target_type stripe_target = { 419static struct target_type stripe_target = {
403 .name = "striped", 420 .name = "striped",
404 .version = {1, 3, 0}, 421 .version = {1, 4, 0},
405 .module = THIS_MODULE, 422 .module = THIS_MODULE,
406 .ctr = stripe_ctr, 423 .ctr = stripe_ctr,
407 .dtr = stripe_dtr, 424 .dtr = stripe_dtr,
@@ -410,6 +427,7 @@ static struct target_type stripe_target = {
410 .status = stripe_status, 427 .status = stripe_status,
411 .iterate_devices = stripe_iterate_devices, 428 .iterate_devices = stripe_iterate_devices,
412 .io_hints = stripe_io_hints, 429 .io_hints = stripe_io_hints,
430 .merge = stripe_merge,
413}; 431};
414 432
415int __init dm_stripe_init(void) 433int __init dm_stripe_init(void)
@@ -422,20 +440,10 @@ int __init dm_stripe_init(void)
422 return r; 440 return r;
423 } 441 }
424 442
425 kstriped = create_singlethread_workqueue("kstriped");
426 if (!kstriped) {
427 DMERR("failed to create workqueue kstriped");
428 dm_unregister_target(&stripe_target);
429 return -ENOMEM;
430 }
431
432 return r; 443 return r;
433} 444}
434 445
435void dm_stripe_exit(void) 446void dm_stripe_exit(void)
436{ 447{
437 dm_unregister_target(&stripe_target); 448 dm_unregister_target(&stripe_target);
438 destroy_workqueue(kstriped);
439
440 return;
441} 449}
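stripe_merge() feeds the bvec's sector through stripe_map_sector() to pick the underlying device, rewrites bi_bdev and bi_sector, and then defers to that device's merge_bvec_fn. The in-kernel mapping helper is written with the target's precomputed chunk parameters; the sketch below shows the equivalent divide-and-modulo striping arithmetic in plain userspace C:

    #include <stdio.h>
    #include <stdint.h>

    /*
     * Map a logical sector to (stripe index, sector on that stripe's device).
     * chunk_sectors is the chunk size in 512-byte sectors.
     */
    static void map_sector(uint64_t sector, uint64_t chunk_sectors,
                           uint32_t stripes, uint32_t *stripe,
                           uint64_t *dev_sector)
    {
            uint64_t chunk  = sector / chunk_sectors; /* which chunk overall */
            uint64_t offset = sector % chunk_sectors; /* offset inside chunk */

            *stripe     = chunk % stripes;            /* round-robin device  */
            *dev_sector = (chunk / stripes) * chunk_sectors + offset;
    }

    int main(void)
    {
            uint32_t stripe;
            uint64_t dev_sector;

            /* 3 devices, 8-sector chunks: sector 20 is chunk 2 -> device 2 */
            map_sector(20, 8, 3, &stripe, &dev_sector);
            printf("stripe=%u dev_sector=%llu\n", stripe,
                   (unsigned long long)dev_sector);
            return 0;
    }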
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index f9fc07d7a4b9..451c3bb176d2 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -55,6 +55,7 @@ struct dm_table {
55 struct dm_target *targets; 55 struct dm_target *targets;
56 56
57 unsigned discards_supported:1; 57 unsigned discards_supported:1;
58 unsigned integrity_supported:1;
58 59
59 /* 60 /*
60 * Indicates the rw permissions for the new logical 61 * Indicates the rw permissions for the new logical
@@ -71,6 +72,8 @@ struct dm_table {
71 void *event_context; 72 void *event_context;
72 73
73 struct dm_md_mempools *mempools; 74 struct dm_md_mempools *mempools;
75
76 struct list_head target_callbacks;
74}; 77};
75 78
76/* 79/*
@@ -204,6 +207,7 @@ int dm_table_create(struct dm_table **result, fmode_t mode,
204 return -ENOMEM; 207 return -ENOMEM;
205 208
206 INIT_LIST_HEAD(&t->devices); 209 INIT_LIST_HEAD(&t->devices);
210 INIT_LIST_HEAD(&t->target_callbacks);
207 atomic_set(&t->holders, 0); 211 atomic_set(&t->holders, 0);
208 t->discards_supported = 1; 212 t->discards_supported = 1;
209 213
@@ -325,15 +329,18 @@ static int open_dev(struct dm_dev_internal *d, dev_t dev,
325 329
326 BUG_ON(d->dm_dev.bdev); 330 BUG_ON(d->dm_dev.bdev);
327 331
328 bdev = open_by_devnum(dev, d->dm_dev.mode); 332 bdev = blkdev_get_by_dev(dev, d->dm_dev.mode | FMODE_EXCL, _claim_ptr);
329 if (IS_ERR(bdev)) 333 if (IS_ERR(bdev))
330 return PTR_ERR(bdev); 334 return PTR_ERR(bdev);
331 r = bd_claim_by_disk(bdev, _claim_ptr, dm_disk(md)); 335
332 if (r) 336 r = bd_link_disk_holder(bdev, dm_disk(md));
333 blkdev_put(bdev, d->dm_dev.mode); 337 if (r) {
334 else 338 blkdev_put(bdev, d->dm_dev.mode | FMODE_EXCL);
335 d->dm_dev.bdev = bdev; 339 return r;
336 return r; 340 }
341
342 d->dm_dev.bdev = bdev;
343 return 0;
337} 344}
338 345
339/* 346/*
@@ -344,8 +351,8 @@ static void close_dev(struct dm_dev_internal *d, struct mapped_device *md)
344 if (!d->dm_dev.bdev) 351 if (!d->dm_dev.bdev)
345 return; 352 return;
346 353
347 bd_release_from_disk(d->dm_dev.bdev, dm_disk(md)); 354 bd_unlink_disk_holder(d->dm_dev.bdev, dm_disk(md));
348 blkdev_put(d->dm_dev.bdev, d->dm_dev.mode); 355 blkdev_put(d->dm_dev.bdev, d->dm_dev.mode | FMODE_EXCL);
349 d->dm_dev.bdev = NULL; 356 d->dm_dev.bdev = NULL;
350} 357}
351 358
@@ -355,6 +362,7 @@ static void close_dev(struct dm_dev_internal *d, struct mapped_device *md)
355static int device_area_is_invalid(struct dm_target *ti, struct dm_dev *dev, 362static int device_area_is_invalid(struct dm_target *ti, struct dm_dev *dev,
356 sector_t start, sector_t len, void *data) 363 sector_t start, sector_t len, void *data)
357{ 364{
365 struct request_queue *q;
358 struct queue_limits *limits = data; 366 struct queue_limits *limits = data;
359 struct block_device *bdev = dev->bdev; 367 struct block_device *bdev = dev->bdev;
360 sector_t dev_size = 368 sector_t dev_size =
@@ -363,6 +371,22 @@ static int device_area_is_invalid(struct dm_target *ti, struct dm_dev *dev,
363 limits->logical_block_size >> SECTOR_SHIFT; 371 limits->logical_block_size >> SECTOR_SHIFT;
364 char b[BDEVNAME_SIZE]; 372 char b[BDEVNAME_SIZE];
365 373
374 /*
375 * Some devices exist without request functions,
376 * such as loop devices not yet bound to backing files.
377 * Forbid the use of such devices.
378 */
379 q = bdev_get_queue(bdev);
380 if (!q || !q->make_request_fn) {
381 DMWARN("%s: %s is not yet initialised: "
382 "start=%llu, len=%llu, dev_size=%llu",
383 dm_device_name(ti->table->md), bdevname(bdev, b),
384 (unsigned long long)start,
385 (unsigned long long)len,
386 (unsigned long long)dev_size);
387 return 1;
388 }
389
366 if (!dev_size) 390 if (!dev_size)
367 return 0; 391 return 0;
368 392
@@ -486,11 +510,6 @@ static int __table_get_device(struct dm_table *t, struct dm_target *ti,
486 return 0; 510 return 0;
487} 511}
488 512
489/*
490 * Returns the minimum that is _not_ zero, unless both are zero.
491 */
492#define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r))
493
494int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev, 513int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev,
495 sector_t start, sector_t len, void *data) 514 sector_t start, sector_t len, void *data)
496{ 515{
@@ -522,9 +541,8 @@ int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev,
522 */ 541 */
523 542
524 if (q->merge_bvec_fn && !ti->type->merge) 543 if (q->merge_bvec_fn && !ti->type->merge)
525 limits->max_sectors = 544 blk_limits_max_hw_sectors(limits,
526 min_not_zero(limits->max_sectors, 545 (unsigned int) (PAGE_SIZE >> 9));
527 (unsigned int) (PAGE_SIZE >> 9));
528 return 0; 546 return 0;
529} 547}
530EXPORT_SYMBOL_GPL(dm_set_device_limits); 548EXPORT_SYMBOL_GPL(dm_set_device_limits);
@@ -859,7 +877,7 @@ int dm_table_alloc_md_mempools(struct dm_table *t)
859 return -EINVAL; 877 return -EINVAL;
860 } 878 }
861 879
862 t->mempools = dm_alloc_md_mempools(type); 880 t->mempools = dm_alloc_md_mempools(type, t->integrity_supported);
863 if (!t->mempools) 881 if (!t->mempools)
864 return -ENOMEM; 882 return -ENOMEM;
865 883
@@ -926,18 +944,80 @@ static int dm_table_build_index(struct dm_table *t)
926} 944}
927 945
928/* 946/*
947 * Get a disk whose integrity profile reflects the table's profile.
948 * If %match_all is true, all devices' profiles must match.
949 * If %match_all is false, all devices must at least have an
950 * allocated integrity profile; but uninitialized is ok.
951 * Returns NULL if integrity support was inconsistent or unavailable.
952 */
953static struct gendisk * dm_table_get_integrity_disk(struct dm_table *t,
954 bool match_all)
955{
956 struct list_head *devices = dm_table_get_devices(t);
957 struct dm_dev_internal *dd = NULL;
958 struct gendisk *prev_disk = NULL, *template_disk = NULL;
959
960 list_for_each_entry(dd, devices, list) {
961 template_disk = dd->dm_dev.bdev->bd_disk;
962 if (!blk_get_integrity(template_disk))
963 goto no_integrity;
964 if (!match_all && !blk_integrity_is_initialized(template_disk))
965 continue; /* skip uninitialized profiles */
966 else if (prev_disk &&
967 blk_integrity_compare(prev_disk, template_disk) < 0)
968 goto no_integrity;
969 prev_disk = template_disk;
970 }
971
972 return template_disk;
973
974no_integrity:
975 if (prev_disk)
976 DMWARN("%s: integrity not set: %s and %s profile mismatch",
977 dm_device_name(t->md),
978 prev_disk->disk_name,
979 template_disk->disk_name);
980 return NULL;
981}
982
983/*
929 * Register the mapped device for blk_integrity support if 984 * Register the mapped device for blk_integrity support if
930 * the underlying devices support it. 985 * the underlying devices have an integrity profile. But all devices
986 * may not have matching profiles (checking all devices isn't reliable
987 * during table load because this table may use other DM device(s) which
988 * must be resumed before they will have an initialized integrity profile).
989 * Stacked DM devices force a 2 stage integrity profile validation:
990 * 1 - during load, validate all initialized integrity profiles match
991 * 2 - during resume, validate all integrity profiles match
931 */ 992 */
932static int dm_table_prealloc_integrity(struct dm_table *t, struct mapped_device *md) 993static int dm_table_prealloc_integrity(struct dm_table *t, struct mapped_device *md)
933{ 994{
934 struct list_head *devices = dm_table_get_devices(t); 995 struct gendisk *template_disk = NULL;
935 struct dm_dev_internal *dd; 996
997 template_disk = dm_table_get_integrity_disk(t, false);
998 if (!template_disk)
999 return 0;
936 1000
937 list_for_each_entry(dd, devices, list) 1001 if (!blk_integrity_is_initialized(dm_disk(md))) {
938 if (bdev_get_integrity(dd->dm_dev.bdev)) 1002 t->integrity_supported = 1;
939 return blk_integrity_register(dm_disk(md), NULL); 1003 return blk_integrity_register(dm_disk(md), NULL);
1004 }
940 1005
1006 /*
1007 * If DM device already has an initialized integrity
1008 * profile the new profile should not conflict.
1009 */
1010 if (blk_integrity_is_initialized(template_disk) &&
1011 blk_integrity_compare(dm_disk(md), template_disk) < 0) {
1012 DMWARN("%s: conflict with existing integrity profile: "
1013 "%s profile mismatch",
1014 dm_device_name(t->md),
1015 template_disk->disk_name);
1016 return 1;
1017 }
1018
1019 /* Preserve existing initialized integrity profile */
1020 t->integrity_supported = 1;
941 return 0; 1021 return 0;
942} 1022}
943 1023
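dm_table_get_integrity_disk() walks every device in the table: with match_all set, all profiles must compare equal; without it, devices whose profile is allocated but not yet initialized (typically stacked DM devices that have not been resumed) are skipped and re-checked later. A compact userspace model of that filtering, with invented structs in place of gendisk and the blk_integrity helpers:

    #include <stdio.h>
    #include <stdbool.h>
    #include <stddef.h>

    /* Invented stand-in for a device's integrity state. */
    struct dev_profile {
            const char *name;
            bool has_profile;  /* integrity profile allocated        */
            bool initialized;  /* profile fully initialized          */
            int  kind;         /* profiles compare equal if kinds do */
    };

    /* Returns a template device name, or NULL when support is inconsistent. */
    static const char *pick_template(const struct dev_profile *devs, size_t n,
                                     bool match_all)
    {
            const struct dev_profile *prev = NULL, *tmpl = NULL;
            size_t i;

            for (i = 0; i < n; i++) {
                    tmpl = &devs[i];
                    if (!tmpl->has_profile)
                            return NULL;           /* no integrity at all */
                    if (!match_all && !tmpl->initialized)
                            continue;              /* defer until resume  */
                    if (prev && prev->kind != tmpl->kind)
                            return NULL;           /* profiles disagree   */
                    prev = tmpl;
            }
            return tmpl ? tmpl->name : NULL;
    }

    int main(void)
    {
            struct dev_profile devs[] = {
                    { "sda",  true, true,  1 },
                    { "dm-3", true, false, 0 }, /* stacked dev, not resumed */
            };

            /* load-time pass (match_all == false) succeeds despite dm-3 */
            printf("%s\n", pick_template(devs, 2, false));
            return 0;
    }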
@@ -1091,41 +1171,27 @@ combine_limits:
1091 1171
1092/* 1172/*
1093 * Set the integrity profile for this device if all devices used have 1173 * Set the integrity profile for this device if all devices used have
1094 * matching profiles. 1174 * matching profiles. We're quite deep in the resume path but still
1175 * don't know if all devices (particularly DM devices this device
1176 * may be stacked on) have matching profiles. Even if the profiles
1177 * don't match we have no way to fail (to resume) at this point.
1095 */ 1178 */
1096static void dm_table_set_integrity(struct dm_table *t) 1179static void dm_table_set_integrity(struct dm_table *t)
1097{ 1180{
1098 struct list_head *devices = dm_table_get_devices(t); 1181 struct gendisk *template_disk = NULL;
1099 struct dm_dev_internal *prev = NULL, *dd = NULL;
1100 1182
1101 if (!blk_get_integrity(dm_disk(t->md))) 1183 if (!blk_get_integrity(dm_disk(t->md)))
1102 return; 1184 return;
1103 1185
1104 list_for_each_entry(dd, devices, list) { 1186 template_disk = dm_table_get_integrity_disk(t, true);
1105 if (prev && 1187 if (!template_disk &&
1106 blk_integrity_compare(prev->dm_dev.bdev->bd_disk, 1188 blk_integrity_is_initialized(dm_disk(t->md))) {
1107 dd->dm_dev.bdev->bd_disk) < 0) { 1189 DMWARN("%s: device no longer has a valid integrity profile",
1108 DMWARN("%s: integrity not set: %s and %s mismatch", 1190 dm_device_name(t->md));
1109 dm_device_name(t->md), 1191 return;
1110 prev->dm_dev.bdev->bd_disk->disk_name,
1111 dd->dm_dev.bdev->bd_disk->disk_name);
1112 goto no_integrity;
1113 }
1114 prev = dd;
1115 } 1192 }
1116
1117 if (!prev || !bdev_get_integrity(prev->dm_dev.bdev))
1118 goto no_integrity;
1119
1120 blk_integrity_register(dm_disk(t->md), 1193 blk_integrity_register(dm_disk(t->md),
1121 bdev_get_integrity(prev->dm_dev.bdev)); 1194 blk_get_integrity(template_disk));
1122
1123 return;
1124
1125no_integrity:
1126 blk_integrity_register(dm_disk(t->md), NULL);
1127
1128 return;
1129} 1195}
1130 1196
1131void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, 1197void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
@@ -1136,11 +1202,6 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
1136 */ 1202 */
1137 q->limits = *limits; 1203 q->limits = *limits;
1138 1204
1139 if (limits->no_cluster)
1140 queue_flag_clear_unlocked(QUEUE_FLAG_CLUSTER, q);
1141 else
1142 queue_flag_set_unlocked(QUEUE_FLAG_CLUSTER, q);
1143
1144 if (!dm_table_supports_discards(t)) 1205 if (!dm_table_supports_discards(t))
1145 queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, q); 1206 queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, q);
1146 else 1207 else
@@ -1234,10 +1295,17 @@ int dm_table_resume_targets(struct dm_table *t)
1234 return 0; 1295 return 0;
1235} 1296}
1236 1297
1298void dm_table_add_target_callbacks(struct dm_table *t, struct dm_target_callbacks *cb)
1299{
1300 list_add(&cb->list, &t->target_callbacks);
1301}
1302EXPORT_SYMBOL_GPL(dm_table_add_target_callbacks);
1303
1237int dm_table_any_congested(struct dm_table *t, int bdi_bits) 1304int dm_table_any_congested(struct dm_table *t, int bdi_bits)
1238{ 1305{
1239 struct dm_dev_internal *dd; 1306 struct dm_dev_internal *dd;
1240 struct list_head *devices = dm_table_get_devices(t); 1307 struct list_head *devices = dm_table_get_devices(t);
1308 struct dm_target_callbacks *cb;
1241 int r = 0; 1309 int r = 0;
1242 1310
1243 list_for_each_entry(dd, devices, list) { 1311 list_for_each_entry(dd, devices, list) {
@@ -1252,6 +1320,10 @@ int dm_table_any_congested(struct dm_table *t, int bdi_bits)
1252 bdevname(dd->dm_dev.bdev, b)); 1320 bdevname(dd->dm_dev.bdev, b));
1253 } 1321 }
1254 1322
1323 list_for_each_entry(cb, &t->target_callbacks, list)
1324 if (cb->congested_fn)
1325 r |= cb->congested_fn(cb, bdi_bits);
1326
1255 return r; 1327 return r;
1256} 1328}
1257 1329
@@ -1269,24 +1341,6 @@ int dm_table_any_busy_target(struct dm_table *t)
1269 return 0; 1341 return 0;
1270} 1342}
1271 1343
1272void dm_table_unplug_all(struct dm_table *t)
1273{
1274 struct dm_dev_internal *dd;
1275 struct list_head *devices = dm_table_get_devices(t);
1276
1277 list_for_each_entry(dd, devices, list) {
1278 struct request_queue *q = bdev_get_queue(dd->dm_dev.bdev);
1279 char b[BDEVNAME_SIZE];
1280
1281 if (likely(q))
1282 blk_unplug(q);
1283 else
1284 DMWARN_LIMIT("%s: Cannot unplug nonexistent device %s",
1285 dm_device_name(t->md),
1286 bdevname(dd->dm_dev.bdev, b));
1287 }
1288}
1289
1290struct mapped_device *dm_table_get_md(struct dm_table *t) 1344struct mapped_device *dm_table_get_md(struct dm_table *t)
1291{ 1345{
1292 return t->md; 1346 return t->md;
@@ -1309,7 +1363,8 @@ bool dm_table_supports_discards(struct dm_table *t)
1309 return 0; 1363 return 0;
1310 1364
1311 /* 1365 /*
1312 * Ensure that at least one underlying device supports discards. 1366 * Unless any target used by the table set discards_supported,
1367 * require at least one underlying device to support discards.
1313 * t->devices includes internal dm devices such as mirror logs 1368 * t->devices includes internal dm devices such as mirror logs
1314 * so we need to use iterate_devices here, which targets 1369 * so we need to use iterate_devices here, which targets
1315 * supporting discard must provide. 1370 * supporting discard must provide.
@@ -1317,6 +1372,9 @@ bool dm_table_supports_discards(struct dm_table *t)
1317 while (i < dm_table_get_num_targets(t)) { 1372 while (i < dm_table_get_num_targets(t)) {
1318 ti = dm_table_get_target(t, i++); 1373 ti = dm_table_get_target(t, i++);
1319 1374
1375 if (ti->discards_supported)
1376 return 1;
1377
1320 if (ti->type->iterate_devices && 1378 if (ti->type->iterate_devices &&
1321 ti->type->iterate_devices(ti, device_discard_capable, NULL)) 1379 ti->type->iterate_devices(ti, device_discard_capable, NULL))
1322 return 1; 1380 return 1;
@@ -1334,4 +1392,3 @@ EXPORT_SYMBOL(dm_table_get_mode);
1334EXPORT_SYMBOL(dm_table_get_md); 1392EXPORT_SYMBOL(dm_table_get_md);
1335EXPORT_SYMBOL(dm_table_put); 1393EXPORT_SYMBOL(dm_table_put);
1336EXPORT_SYMBOL(dm_table_get); 1394EXPORT_SYMBOL(dm_table_get);
1337EXPORT_SYMBOL(dm_table_unplug_all);
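dm_table_any_congested() now also walks the table's new target_callbacks list and ORs each registered congested_fn result into the return value, so a target can report congestion for devices the table does not list directly. A minimal userspace sketch of that OR-accumulating walk, using a plain singly linked list instead of the kernel's list_head:

    #include <stdio.h>

    /* Simplified callback node; the kernel embeds a struct list_head instead. */
    struct target_callbacks {
            struct target_callbacks *next;
            int (*congested_fn)(struct target_callbacks *cb, int bdi_bits);
    };

    static int always_busy(struct target_callbacks *cb, int bdi_bits)
    {
            (void)cb;
            return bdi_bits; /* report every queried bit as congested */
    }

    /* Mirrors the new loop at the end of dm_table_any_congested(). */
    static int any_congested(struct target_callbacks *head, int bdi_bits)
    {
            int r = 0;

            for (; head; head = head->next)
                    if (head->congested_fn)
                            r |= head->congested_fn(head, bdi_bits);
            return r;
    }

    int main(void)
    {
            struct target_callbacks cb2 = { NULL, always_busy };
            struct target_callbacks cb1 = { &cb2, NULL }; /* nothing registered */

            printf("congested bits: 0x%x\n", any_congested(&cb1, 0x3));
            return 0;
    }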
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index ac384b2a6a33..0cf68b478878 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -15,7 +15,6 @@
15#include <linux/blkpg.h> 15#include <linux/blkpg.h>
16#include <linux/bio.h> 16#include <linux/bio.h>
17#include <linux/buffer_head.h> 17#include <linux/buffer_head.h>
18#include <linux/smp_lock.h>
19#include <linux/mempool.h> 18#include <linux/mempool.h>
20#include <linux/slab.h> 19#include <linux/slab.h>
21#include <linux/idr.h> 20#include <linux/idr.h>
@@ -110,7 +109,6 @@ EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo);
110#define DMF_FREEING 3 109#define DMF_FREEING 3
111#define DMF_DELETING 4 110#define DMF_DELETING 4
112#define DMF_NOFLUSH_SUSPENDING 5 111#define DMF_NOFLUSH_SUSPENDING 5
113#define DMF_QUEUE_IO_TO_THREAD 6
114 112
115/* 113/*
116 * Work processed by per-device workqueue. 114 * Work processed by per-device workqueue.
@@ -144,24 +142,9 @@ struct mapped_device {
144 spinlock_t deferred_lock; 142 spinlock_t deferred_lock;
145 143
146 /* 144 /*
147 * An error from the barrier request currently being processed. 145 * Processing queue (flush)
148 */
149 int barrier_error;
150
151 /*
152 * Protect barrier_error from concurrent endio processing
153 * in request-based dm.
154 */
155 spinlock_t barrier_error_lock;
156
157 /*
158 * Processing queue (flush/barriers)
159 */ 146 */
160 struct workqueue_struct *wq; 147 struct workqueue_struct *wq;
161 struct work_struct barrier_work;
162
163 /* A pointer to the currently processing pre/post flush request */
164 struct request *flush_request;
165 148
166 /* 149 /*
167 * The current mapping. 150 * The current mapping.
@@ -200,8 +183,8 @@ struct mapped_device {
200 /* sysfs handle */ 183 /* sysfs handle */
201 struct kobject kobj; 184 struct kobject kobj;
202 185
203 /* zero-length barrier that will be cloned and submitted to targets */ 186 /* zero-length flush that will be cloned and submitted to targets */
204 struct bio barrier_bio; 187 struct bio flush_bio;
205}; 188};
206 189
207/* 190/*
@@ -344,7 +327,6 @@ static int dm_blk_open(struct block_device *bdev, fmode_t mode)
344{ 327{
345 struct mapped_device *md; 328 struct mapped_device *md;
346 329
347 lock_kernel();
348 spin_lock(&_minor_lock); 330 spin_lock(&_minor_lock);
349 331
350 md = bdev->bd_disk->private_data; 332 md = bdev->bd_disk->private_data;
@@ -362,7 +344,6 @@ static int dm_blk_open(struct block_device *bdev, fmode_t mode)
362 344
363out: 345out:
364 spin_unlock(&_minor_lock); 346 spin_unlock(&_minor_lock);
365 unlock_kernel();
366 347
367 return md ? 0 : -ENXIO; 348 return md ? 0 : -ENXIO;
368} 349}
@@ -371,10 +352,12 @@ static int dm_blk_close(struct gendisk *disk, fmode_t mode)
371{ 352{
372 struct mapped_device *md = disk->private_data; 353 struct mapped_device *md = disk->private_data;
373 354
374 lock_kernel(); 355 spin_lock(&_minor_lock);
356
375 atomic_dec(&md->open_count); 357 atomic_dec(&md->open_count);
376 dm_put(md); 358 dm_put(md);
377 unlock_kernel(); 359
360 spin_unlock(&_minor_lock);
378 361
379 return 0; 362 return 0;
380} 363}
@@ -494,7 +477,8 @@ static void start_io_acct(struct dm_io *io)
494 cpu = part_stat_lock(); 477 cpu = part_stat_lock();
495 part_round_stats(cpu, &dm_disk(md)->part0); 478 part_round_stats(cpu, &dm_disk(md)->part0);
496 part_stat_unlock(); 479 part_stat_unlock();
497 dm_disk(md)->part0.in_flight[rw] = atomic_inc_return(&md->pending[rw]); 480 atomic_set(&dm_disk(md)->part0.in_flight[rw],
481 atomic_inc_return(&md->pending[rw]));
498} 482}
499 483
500static void end_io_acct(struct dm_io *io) 484static void end_io_acct(struct dm_io *io)
@@ -512,10 +496,10 @@ static void end_io_acct(struct dm_io *io)
512 496
513 /* 497 /*
514 * After this is decremented the bio must not be touched if it is 498 * After this is decremented the bio must not be touched if it is
515 * a barrier. 499 * a flush.
516 */ 500 */
517 dm_disk(md)->part0.in_flight[rw] = pending = 501 pending = atomic_dec_return(&md->pending[rw]);
518 atomic_dec_return(&md->pending[rw]); 502 atomic_set(&dm_disk(md)->part0.in_flight[rw], pending);
519 pending += atomic_read(&md->pending[rw^0x1]); 503 pending += atomic_read(&md->pending[rw^0x1]);
520 504
521 /* nudge anyone waiting on suspend queue */ 505 /* nudge anyone waiting on suspend queue */
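start_io_acct() and end_io_acct() now derive the published per-direction in-flight count from atomic_inc_return()/atomic_dec_return() on md->pending and store it with atomic_set(), rather than assigning to part0.in_flight[rw] directly. A userspace sketch of the same count-then-publish pattern with C11 atomics (the names are illustrative, not the kernel API):

    #include <stdio.h>
    #include <stdatomic.h>

    /* pending[] models md->pending; in_flight[] models part0.in_flight[rw]. */
    static atomic_int pending[2];
    static atomic_int in_flight[2];

    static void start_io(int rw)
    {
            /* atomic_inc_return() equivalent: add one, then use the new value */
            int now = atomic_fetch_add(&pending[rw], 1) + 1;

            atomic_store(&in_flight[rw], now);
    }

    static void end_io(int rw)
    {
            int now = atomic_fetch_sub(&pending[rw], 1) - 1;

            atomic_store(&in_flight[rw], now);
    }

    int main(void)
    {
            start_io(1);
            start_io(1);
            end_io(1);
            printf("writes in flight: %d\n", atomic_load(&in_flight[1]));
            return 0;
    }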
@@ -528,16 +512,12 @@ static void end_io_acct(struct dm_io *io)
528 */ 512 */
529static void queue_io(struct mapped_device *md, struct bio *bio) 513static void queue_io(struct mapped_device *md, struct bio *bio)
530{ 514{
531 down_write(&md->io_lock); 515 unsigned long flags;
532 516
533 spin_lock_irq(&md->deferred_lock); 517 spin_lock_irqsave(&md->deferred_lock, flags);
534 bio_list_add(&md->deferred, bio); 518 bio_list_add(&md->deferred, bio);
535 spin_unlock_irq(&md->deferred_lock); 519 spin_unlock_irqrestore(&md->deferred_lock, flags);
536 520 queue_work(md->wq, &md->work);
537 if (!test_and_set_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags))
538 queue_work(md->wq, &md->work);
539
540 up_write(&md->io_lock);
541} 521}
542 522
543/* 523/*
@@ -625,11 +605,9 @@ static void dec_pending(struct dm_io *io, int error)
625 * Target requested pushing back the I/O. 605 * Target requested pushing back the I/O.
626 */ 606 */
627 spin_lock_irqsave(&md->deferred_lock, flags); 607 spin_lock_irqsave(&md->deferred_lock, flags);
628 if (__noflush_suspending(md)) { 608 if (__noflush_suspending(md))
629 if (!(io->bio->bi_rw & REQ_HARDBARRIER)) 609 bio_list_add_head(&md->deferred, io->bio);
630 bio_list_add_head(&md->deferred, 610 else
631 io->bio);
632 } else
633 /* noflush suspend was interrupted. */ 611 /* noflush suspend was interrupted. */
634 io->error = -EIO; 612 io->error = -EIO;
635 spin_unlock_irqrestore(&md->deferred_lock, flags); 613 spin_unlock_irqrestore(&md->deferred_lock, flags);
@@ -637,32 +615,23 @@ static void dec_pending(struct dm_io *io, int error)
637 615
638 io_error = io->error; 616 io_error = io->error;
639 bio = io->bio; 617 bio = io->bio;
618 end_io_acct(io);
619 free_io(md, io);
620
621 if (io_error == DM_ENDIO_REQUEUE)
622 return;
640 623
641 if (bio->bi_rw & REQ_HARDBARRIER) { 624 if ((bio->bi_rw & REQ_FLUSH) && bio->bi_size) {
642 /* 625 /*
643 * There can be just one barrier request so we use 626 * Preflush done for flush with data, reissue
644 * a per-device variable for error reporting. 627 * without REQ_FLUSH.
645 * Note that you can't touch the bio after end_io_acct
646 *
647 * We ignore -EOPNOTSUPP for empty flush reported by
648 * underlying devices. We assume that if the device
649 * doesn't support empty barriers, it doesn't need
650 * cache flushing commands.
651 */ 628 */
652 if (!md->barrier_error && 629 bio->bi_rw &= ~REQ_FLUSH;
653 !(bio_empty_barrier(bio) && io_error == -EOPNOTSUPP)) 630 queue_io(md, bio);
654 md->barrier_error = io_error;
655 end_io_acct(io);
656 free_io(md, io);
657 } else { 631 } else {
658 end_io_acct(io); 632 /* done with normal IO or empty flush */
659 free_io(md, io); 633 trace_block_bio_complete(md->queue, bio, io_error);
660 634 bio_endio(bio, io_error);
661 if (io_error != DM_ENDIO_REQUEUE) {
662 trace_block_bio_complete(md->queue, bio);
663
664 bio_endio(bio, io_error);
665 }
666 } 635 }
667 } 636 }
668} 637}
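In dec_pending(), a completed preflush that still carries data is resubmitted with REQ_FLUSH cleared, so the payload is written as ordinary I/O once the flush itself has finished; only empty flushes and normal I/O are ended directly. A small userspace sketch of that decision, again with a made-up REQ_FLUSH value:

    #include <stdio.h>

    #define REQ_FLUSH (1UL << 1) /* hypothetical flag bit */

    struct fake_bio {
            unsigned long bi_rw;
            unsigned int  bi_size; /* remaining payload in bytes */
    };

    /* Returns 1 when the bio must be requeued for its data phase. */
    static int finish_or_reissue(struct fake_bio *bio)
    {
            if ((bio->bi_rw & REQ_FLUSH) && bio->bi_size) {
                    bio->bi_rw &= ~REQ_FLUSH; /* preflush done, drop the flag */
                    return 1;                 /* hand back to queue_io()      */
            }
            return 0;                         /* normal I/O or empty flush    */
    }

    int main(void)
    {
            struct fake_bio b = { REQ_FLUSH, 4096 };

            printf("reissue=%d flags=%#lx\n", finish_or_reissue(&b), b.bi_rw);
            return 0;
    }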
@@ -755,23 +724,6 @@ static void end_clone_bio(struct bio *clone, int error)
755 blk_update_request(tio->orig, 0, nr_bytes); 724 blk_update_request(tio->orig, 0, nr_bytes);
756} 725}
757 726
758static void store_barrier_error(struct mapped_device *md, int error)
759{
760 unsigned long flags;
761
762 spin_lock_irqsave(&md->barrier_error_lock, flags);
763 /*
764 * Basically, the first error is taken, but:
765 * -EOPNOTSUPP supersedes any I/O error.
766 * Requeue request supersedes any I/O error but -EOPNOTSUPP.
767 */
768 if (!md->barrier_error || error == -EOPNOTSUPP ||
769 (md->barrier_error != -EOPNOTSUPP &&
770 error == DM_ENDIO_REQUEUE))
771 md->barrier_error = error;
772 spin_unlock_irqrestore(&md->barrier_error_lock, flags);
773}
774
775/* 727/*
776 * Don't touch any member of the md after calling this function because 728 * Don't touch any member of the md after calling this function because
777 * the md may be freed in dm_put() at the end of this function. 729 * the md may be freed in dm_put() at the end of this function.
@@ -809,13 +761,11 @@ static void free_rq_clone(struct request *clone)
809static void dm_end_request(struct request *clone, int error) 761static void dm_end_request(struct request *clone, int error)
810{ 762{
811 int rw = rq_data_dir(clone); 763 int rw = rq_data_dir(clone);
812 int run_queue = 1;
813 bool is_barrier = clone->cmd_flags & REQ_HARDBARRIER;
814 struct dm_rq_target_io *tio = clone->end_io_data; 764 struct dm_rq_target_io *tio = clone->end_io_data;
815 struct mapped_device *md = tio->md; 765 struct mapped_device *md = tio->md;
816 struct request *rq = tio->orig; 766 struct request *rq = tio->orig;
817 767
818 if (rq->cmd_type == REQ_TYPE_BLOCK_PC && !is_barrier) { 768 if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
819 rq->errors = clone->errors; 769 rq->errors = clone->errors;
820 rq->resid_len = clone->resid_len; 770 rq->resid_len = clone->resid_len;
821 771
@@ -829,15 +779,8 @@ static void dm_end_request(struct request *clone, int error)
829 } 779 }
830 780
831 free_rq_clone(clone); 781 free_rq_clone(clone);
832 782 blk_end_request_all(rq, error);
833 if (unlikely(is_barrier)) { 783 rq_completed(md, rw, true);
834 if (unlikely(error))
835 store_barrier_error(md, error);
836 run_queue = 0;
837 } else
838 blk_end_request_all(rq, error);
839
840 rq_completed(md, rw, run_queue);
841} 784}
842 785
843static void dm_unprep_request(struct request *rq) 786static void dm_unprep_request(struct request *rq)
@@ -862,21 +805,9 @@ void dm_requeue_unmapped_request(struct request *clone)
862 struct request_queue *q = rq->q; 805 struct request_queue *q = rq->q;
863 unsigned long flags; 806 unsigned long flags;
864 807
865 if (unlikely(clone->cmd_flags & REQ_HARDBARRIER)) {
866 /*
867 * Barrier clones share an original request.
868 * Leave it to dm_end_request(), which handles this special
869 * case.
870 */
871 dm_end_request(clone, DM_ENDIO_REQUEUE);
872 return;
873 }
874
875 dm_unprep_request(rq); 808 dm_unprep_request(rq);
876 809
877 spin_lock_irqsave(q->queue_lock, flags); 810 spin_lock_irqsave(q->queue_lock, flags);
878 if (elv_queue_empty(q))
879 blk_plug_device(q);
880 blk_requeue_request(q, rq); 811 blk_requeue_request(q, rq);
881 spin_unlock_irqrestore(q->queue_lock, flags); 812 spin_unlock_irqrestore(q->queue_lock, flags);
882 813
@@ -961,19 +892,6 @@ static void dm_complete_request(struct request *clone, int error)
961 struct dm_rq_target_io *tio = clone->end_io_data; 892 struct dm_rq_target_io *tio = clone->end_io_data;
962 struct request *rq = tio->orig; 893 struct request *rq = tio->orig;
963 894
964 if (unlikely(clone->cmd_flags & REQ_HARDBARRIER)) {
965 /*
966 * Barrier clones share an original request. So can't use
967 * softirq_done with the original.
968 * Pass the clone to dm_done() directly in this special case.
969 * It is safe (even if clone->q->queue_lock is held here)
970 * because there is no I/O dispatching during the completion
971 * of barrier clone.
972 */
973 dm_done(clone, error, true);
974 return;
975 }
976
977 tio->error = error; 895 tio->error = error;
978 rq->completion_data = clone; 896 rq->completion_data = clone;
979 blk_complete_request(rq); 897 blk_complete_request(rq);
@@ -990,17 +908,6 @@ void dm_kill_unmapped_request(struct request *clone, int error)
990 struct dm_rq_target_io *tio = clone->end_io_data; 908 struct dm_rq_target_io *tio = clone->end_io_data;
991 struct request *rq = tio->orig; 909 struct request *rq = tio->orig;
992 910
993 if (unlikely(clone->cmd_flags & REQ_HARDBARRIER)) {
994 /*
995 * Barrier clones share an original request.
996 * Leave it to dm_end_request(), which handles this special
997 * case.
998 */
999 BUG_ON(error > 0);
1000 dm_end_request(clone, error);
1001 return;
1002 }
1003
1004 rq->cmd_flags |= REQ_FAILED; 911 rq->cmd_flags |= REQ_FAILED;
1005 dm_complete_request(clone, error); 912 dm_complete_request(clone, error);
1006} 913}
@@ -1081,8 +988,8 @@ static void __map_bio(struct dm_target *ti, struct bio *clone,
1081 if (r == DM_MAPIO_REMAPPED) { 988 if (r == DM_MAPIO_REMAPPED) {
1082 /* the bio has been remapped so dispatch it */ 989 /* the bio has been remapped so dispatch it */
1083 990
1084 trace_block_remap(bdev_get_queue(clone->bi_bdev), clone, 991 trace_block_bio_remap(bdev_get_queue(clone->bi_bdev), clone,
1085 tio->io->bio->bi_bdev->bd_dev, sector); 992 tio->io->bio->bi_bdev->bd_dev, sector);
1086 993
1087 generic_make_request(clone); 994 generic_make_request(clone);
1088 } else if (r < 0 || r == DM_MAPIO_REQUEUE) { 995 } else if (r < 0 || r == DM_MAPIO_REQUEUE) {
@@ -1119,7 +1026,7 @@ static void dm_bio_destructor(struct bio *bio)
1119} 1026}
1120 1027
1121/* 1028/*
1122 * Creates a little bio that is just does part of a bvec. 1029 * Creates a little bio that just does part of a bvec.
1123 */ 1030 */
1124static struct bio *split_bvec(struct bio *bio, sector_t sector, 1031static struct bio *split_bvec(struct bio *bio, sector_t sector,
1125 unsigned short idx, unsigned int offset, 1032 unsigned short idx, unsigned int offset,
@@ -1134,7 +1041,7 @@ static struct bio *split_bvec(struct bio *bio, sector_t sector,
1134 1041
1135 clone->bi_sector = sector; 1042 clone->bi_sector = sector;
1136 clone->bi_bdev = bio->bi_bdev; 1043 clone->bi_bdev = bio->bi_bdev;
1137 clone->bi_rw = bio->bi_rw & ~REQ_HARDBARRIER; 1044 clone->bi_rw = bio->bi_rw;
1138 clone->bi_vcnt = 1; 1045 clone->bi_vcnt = 1;
1139 clone->bi_size = to_bytes(len); 1046 clone->bi_size = to_bytes(len);
1140 clone->bi_io_vec->bv_offset = offset; 1047 clone->bi_io_vec->bv_offset = offset;
@@ -1161,7 +1068,6 @@ static struct bio *clone_bio(struct bio *bio, sector_t sector,
1161 1068
1162 clone = bio_alloc_bioset(GFP_NOIO, bio->bi_max_vecs, bs); 1069 clone = bio_alloc_bioset(GFP_NOIO, bio->bi_max_vecs, bs);
1163 __bio_clone(clone, bio); 1070 __bio_clone(clone, bio);
1164 clone->bi_rw &= ~REQ_HARDBARRIER;
1165 clone->bi_destructor = dm_bio_destructor; 1071 clone->bi_destructor = dm_bio_destructor;
1166 clone->bi_sector = sector; 1072 clone->bi_sector = sector;
1167 clone->bi_idx = idx; 1073 clone->bi_idx = idx;
@@ -1225,16 +1131,15 @@ static void __issue_target_requests(struct clone_info *ci, struct dm_target *ti,
1225 __issue_target_request(ci, ti, request_nr, len); 1131 __issue_target_request(ci, ti, request_nr, len);
1226} 1132}
1227 1133
1228static int __clone_and_map_empty_barrier(struct clone_info *ci) 1134static int __clone_and_map_empty_flush(struct clone_info *ci)
1229{ 1135{
1230 unsigned target_nr = 0; 1136 unsigned target_nr = 0;
1231 struct dm_target *ti; 1137 struct dm_target *ti;
1232 1138
1139 BUG_ON(bio_has_data(ci->bio));
1233 while ((ti = dm_table_get_target(ci->map, target_nr++))) 1140 while ((ti = dm_table_get_target(ci->map, target_nr++)))
1234 __issue_target_requests(ci, ti, ti->num_flush_requests, 0); 1141 __issue_target_requests(ci, ti, ti->num_flush_requests, 0);
1235 1142
1236 ci->sector_count = 0;
1237
1238 return 0; 1143 return 0;
1239} 1144}
1240 1145
@@ -1289,9 +1194,6 @@ static int __clone_and_map(struct clone_info *ci)
1289 sector_t len = 0, max; 1194 sector_t len = 0, max;
1290 struct dm_target_io *tio; 1195 struct dm_target_io *tio;
1291 1196
1292 if (unlikely(bio_empty_barrier(bio)))
1293 return __clone_and_map_empty_barrier(ci);
1294
1295 if (unlikely(bio->bi_rw & REQ_DISCARD)) 1197 if (unlikely(bio->bi_rw & REQ_DISCARD))
1296 return __clone_and_map_discard(ci); 1198 return __clone_and_map_discard(ci);
1297 1199
@@ -1383,16 +1285,11 @@ static void __split_and_process_bio(struct mapped_device *md, struct bio *bio)
1383 1285
1384 ci.map = dm_get_live_table(md); 1286 ci.map = dm_get_live_table(md);
1385 if (unlikely(!ci.map)) { 1287 if (unlikely(!ci.map)) {
1386 if (!(bio->bi_rw & REQ_HARDBARRIER)) 1288 bio_io_error(bio);
1387 bio_io_error(bio);
1388 else
1389 if (!md->barrier_error)
1390 md->barrier_error = -EIO;
1391 return; 1289 return;
1392 } 1290 }
1393 1291
1394 ci.md = md; 1292 ci.md = md;
1395 ci.bio = bio;
1396 ci.io = alloc_io(md); 1293 ci.io = alloc_io(md);
1397 ci.io->error = 0; 1294 ci.io->error = 0;
1398 atomic_set(&ci.io->io_count, 1); 1295 atomic_set(&ci.io->io_count, 1);
@@ -1400,14 +1297,20 @@ static void __split_and_process_bio(struct mapped_device *md, struct bio *bio)
1400 ci.io->md = md; 1297 ci.io->md = md;
1401 spin_lock_init(&ci.io->endio_lock); 1298 spin_lock_init(&ci.io->endio_lock);
1402 ci.sector = bio->bi_sector; 1299 ci.sector = bio->bi_sector;
1403 ci.sector_count = bio_sectors(bio);
1404 if (unlikely(bio_empty_barrier(bio)))
1405 ci.sector_count = 1;
1406 ci.idx = bio->bi_idx; 1300 ci.idx = bio->bi_idx;
1407 1301
1408 start_io_acct(ci.io); 1302 start_io_acct(ci.io);
1409 while (ci.sector_count && !error) 1303 if (bio->bi_rw & REQ_FLUSH) {
1410 error = __clone_and_map(&ci); 1304 ci.bio = &ci.md->flush_bio;
1305 ci.sector_count = 0;
1306 error = __clone_and_map_empty_flush(&ci);
1307 /* dec_pending submits any data associated with flush */
1308 } else {
1309 ci.bio = bio;
1310 ci.sector_count = bio_sectors(bio);
1311 while (ci.sector_count && !error)
1312 error = __clone_and_map(&ci);
1313 }
1411 1314
1412 /* drop the extra reference count */ 1315 /* drop the extra reference count */
1413 dec_pending(ci.io, error); 1316 dec_pending(ci.io, error);
@@ -1491,22 +1394,14 @@ static int _dm_request(struct request_queue *q, struct bio *bio)
1491 part_stat_add(cpu, &dm_disk(md)->part0, sectors[rw], bio_sectors(bio)); 1394 part_stat_add(cpu, &dm_disk(md)->part0, sectors[rw], bio_sectors(bio));
1492 part_stat_unlock(); 1395 part_stat_unlock();
1493 1396
1494 /* 1397 /* if we're suspended, we have to queue this io for later */
1495 * If we're suspended or the thread is processing barriers 1398 if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) {
1496 * we have to queue this io for later.
1497 */
1498 if (unlikely(test_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags)) ||
1499 unlikely(bio->bi_rw & REQ_HARDBARRIER)) {
1500 up_read(&md->io_lock); 1399 up_read(&md->io_lock);
1501 1400
1502 if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) && 1401 if (bio_rw(bio) != READA)
1503 bio_rw(bio) == READA) { 1402 queue_io(md, bio);
1403 else
1504 bio_io_error(bio); 1404 bio_io_error(bio);
1505 return 0;
1506 }
1507
1508 queue_io(md, bio);
1509
1510 return 0; 1405 return 0;
1511 } 1406 }
1512 1407
@@ -1537,14 +1432,6 @@ static int dm_request(struct request_queue *q, struct bio *bio)
1537 return _dm_request(q, bio); 1432 return _dm_request(q, bio);
1538} 1433}
1539 1434
1540static bool dm_rq_is_flush_request(struct request *rq)
1541{
1542 if (rq->cmd_flags & REQ_FLUSH)
1543 return true;
1544 else
1545 return false;
1546}
1547
1548void dm_dispatch_request(struct request *rq) 1435void dm_dispatch_request(struct request *rq)
1549{ 1436{
1550 int r; 1437 int r;
@@ -1592,22 +1479,15 @@ static int setup_clone(struct request *clone, struct request *rq,
1592{ 1479{
1593 int r; 1480 int r;
1594 1481
1595 if (dm_rq_is_flush_request(rq)) { 1482 r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC,
1596 blk_rq_init(NULL, clone); 1483 dm_rq_bio_constructor, tio);
1597 clone->cmd_type = REQ_TYPE_FS; 1484 if (r)
1598 clone->cmd_flags |= (REQ_HARDBARRIER | WRITE); 1485 return r;
1599 } else {
1600 r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC,
1601 dm_rq_bio_constructor, tio);
1602 if (r)
1603 return r;
1604
1605 clone->cmd = rq->cmd;
1606 clone->cmd_len = rq->cmd_len;
1607 clone->sense = rq->sense;
1608 clone->buffer = rq->buffer;
1609 }
1610 1486
1487 clone->cmd = rq->cmd;
1488 clone->cmd_len = rq->cmd_len;
1489 clone->sense = rq->sense;
1490 clone->buffer = rq->buffer;
1611 clone->end_io = end_clone_request; 1491 clone->end_io = end_clone_request;
1612 clone->end_io_data = tio; 1492 clone->end_io_data = tio;
1613 1493
@@ -1648,9 +1528,6 @@ static int dm_prep_fn(struct request_queue *q, struct request *rq)
1648 struct mapped_device *md = q->queuedata; 1528 struct mapped_device *md = q->queuedata;
1649 struct request *clone; 1529 struct request *clone;
1650 1530
1651 if (unlikely(dm_rq_is_flush_request(rq)))
1652 return BLKPREP_OK;
1653
1654 if (unlikely(rq->special)) { 1531 if (unlikely(rq->special)) {
1655 DMWARN("Already has something in rq->special."); 1532 DMWARN("Already has something in rq->special.");
1656 return BLKPREP_KILL; 1533 return BLKPREP_KILL;
@@ -1727,6 +1604,7 @@ static void dm_request_fn(struct request_queue *q)
1727 struct dm_table *map = dm_get_live_table(md); 1604 struct dm_table *map = dm_get_live_table(md);
1728 struct dm_target *ti; 1605 struct dm_target *ti;
1729 struct request *rq, *clone; 1606 struct request *rq, *clone;
1607 sector_t pos;
1730 1608
1731 /* 1609 /*
1732 * For suspend, check blk_queue_stopped() and increment 1610 * For suspend, check blk_queue_stopped() and increment
@@ -1734,22 +1612,21 @@ static void dm_request_fn(struct request_queue *q)
1734 * number of in-flight I/Os after the queue is stopped in 1612 * number of in-flight I/Os after the queue is stopped in
1735 * dm_suspend(). 1613 * dm_suspend().
1736 */ 1614 */
1737 while (!blk_queue_plugged(q) && !blk_queue_stopped(q)) { 1615 while (!blk_queue_stopped(q)) {
1738 rq = blk_peek_request(q); 1616 rq = blk_peek_request(q);
1739 if (!rq) 1617 if (!rq)
1740 goto plug_and_out; 1618 goto delay_and_out;
1741 1619
1742 if (unlikely(dm_rq_is_flush_request(rq))) { 1620 /* always use block 0 to find the target for flushes for now */
1743 BUG_ON(md->flush_request); 1621 pos = 0;
1744 md->flush_request = rq; 1622 if (!(rq->cmd_flags & REQ_FLUSH))
1745 blk_start_request(rq); 1623 pos = blk_rq_pos(rq);
1746 queue_work(md->wq, &md->barrier_work); 1624
1747 goto out; 1625 ti = dm_table_find_target(map, pos);
1748 } 1626 BUG_ON(!dm_target_is_valid(ti));
1749 1627
1750 ti = dm_table_find_target(map, blk_rq_pos(rq));
1751 if (ti->type->busy && ti->type->busy(ti)) 1628 if (ti->type->busy && ti->type->busy(ti))
1752 goto plug_and_out; 1629 goto delay_and_out;
1753 1630
1754 blk_start_request(rq); 1631 blk_start_request(rq);
1755 clone = rq->special; 1632 clone = rq->special;
@@ -1759,19 +1636,18 @@ static void dm_request_fn(struct request_queue *q)
1759 if (map_request(ti, clone, md)) 1636 if (map_request(ti, clone, md))
1760 goto requeued; 1637 goto requeued;
1761 1638
1762 spin_lock_irq(q->queue_lock); 1639 BUG_ON(!irqs_disabled());
1640 spin_lock(q->queue_lock);
1763 } 1641 }
1764 1642
1765 goto out; 1643 goto out;
1766 1644
1767requeued: 1645requeued:
1768 spin_lock_irq(q->queue_lock); 1646 BUG_ON(!irqs_disabled());
1769 1647 spin_lock(q->queue_lock);
1770plug_and_out:
1771 if (!elv_queue_empty(q))
1772 /* Some requests still remain, retry later */
1773 blk_plug_device(q);
1774 1648
1649delay_and_out:
1650 blk_delay_queue(q, HZ / 10);
1775out: 1651out:
1776 dm_table_put(map); 1652 dm_table_put(map);
1777 1653
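dm_request_fn() now resolves the target for a flush at sector 0 (a flush has no meaningful position of its own) and, when nothing can be dispatched, backs off with blk_delay_queue() instead of the removed queue-plugging calls. A toy sketch of the position selection and target lookup, with an invented two-target linear table:

    #include <stdio.h>
    #include <stdint.h>

    #define REQ_FLUSH (1UL << 1) /* hypothetical flag bit */

    /* A toy table: each target owns [begin, begin + len) in sectors. */
    struct toy_target {
            const char *name;
            uint64_t begin, len;
    };

    static const struct toy_target table[] = {
            { "linear-a", 0,    1024 },
            { "linear-b", 1024, 1024 },
    };

    static const struct toy_target *find_target(uint64_t sector)
    {
            size_t i;

            for (i = 0; i < sizeof(table) / sizeof(table[0]); i++)
                    if (sector >= table[i].begin &&
                        sector < table[i].begin + table[i].len)
                            return &table[i];
            return NULL;
    }

    int main(void)
    {
            unsigned long cmd_flags = REQ_FLUSH;
            uint64_t rq_pos = 1500;

            /* flushes are routed via sector 0; everything else by position */
            uint64_t pos = (cmd_flags & REQ_FLUSH) ? 0 : rq_pos;

            printf("dispatch to %s\n", find_target(pos)->name);
            return 0;
    }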
@@ -1800,20 +1676,6 @@ static int dm_lld_busy(struct request_queue *q)
1800 return r; 1676 return r;
1801} 1677}
1802 1678
1803static void dm_unplug_all(struct request_queue *q)
1804{
1805 struct mapped_device *md = q->queuedata;
1806 struct dm_table *map = dm_get_live_table(md);
1807
1808 if (map) {
1809 if (dm_request_based(md))
1810 generic_unplug_device(q);
1811
1812 dm_table_unplug_all(map);
1813 dm_table_put(map);
1814 }
1815}
1816
1817static int dm_any_congested(void *congested_data, int bdi_bits) 1679static int dm_any_congested(void *congested_data, int bdi_bits)
1818{ 1680{
1819 int r = bdi_bits; 1681 int r = bdi_bits;
@@ -1918,7 +1780,6 @@ out:
1918static const struct block_device_operations dm_blk_dops; 1780static const struct block_device_operations dm_blk_dops;
1919 1781
1920static void dm_wq_work(struct work_struct *work); 1782static void dm_wq_work(struct work_struct *work);
1921static void dm_rq_barrier_work(struct work_struct *work);
1922 1783
1923static void dm_init_md_queue(struct mapped_device *md) 1784static void dm_init_md_queue(struct mapped_device *md)
1924{ 1785{
@@ -1938,8 +1799,8 @@ static void dm_init_md_queue(struct mapped_device *md)
1938 md->queue->backing_dev_info.congested_data = md; 1799 md->queue->backing_dev_info.congested_data = md;
1939 blk_queue_make_request(md->queue, dm_request); 1800 blk_queue_make_request(md->queue, dm_request);
1940 blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY); 1801 blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY);
1941 md->queue->unplug_fn = dm_unplug_all;
1942 blk_queue_merge_bvec(md->queue, dm_merge_bvec); 1802 blk_queue_merge_bvec(md->queue, dm_merge_bvec);
1803 blk_queue_flush(md->queue, REQ_FLUSH | REQ_FUA);
1943} 1804}
1944 1805
1945/* 1806/*
@@ -1972,7 +1833,6 @@ static struct mapped_device *alloc_dev(int minor)
1972 mutex_init(&md->suspend_lock); 1833 mutex_init(&md->suspend_lock);
1973 mutex_init(&md->type_lock); 1834 mutex_init(&md->type_lock);
1974 spin_lock_init(&md->deferred_lock); 1835 spin_lock_init(&md->deferred_lock);
1975 spin_lock_init(&md->barrier_error_lock);
1976 rwlock_init(&md->map_lock); 1836 rwlock_init(&md->map_lock);
1977 atomic_set(&md->holders, 1); 1837 atomic_set(&md->holders, 1);
1978 atomic_set(&md->open_count, 0); 1838 atomic_set(&md->open_count, 0);
@@ -1995,7 +1855,6 @@ static struct mapped_device *alloc_dev(int minor)
1995 atomic_set(&md->pending[1], 0); 1855 atomic_set(&md->pending[1], 0);
1996 init_waitqueue_head(&md->wait); 1856 init_waitqueue_head(&md->wait);
1997 INIT_WORK(&md->work, dm_wq_work); 1857 INIT_WORK(&md->work, dm_wq_work);
1998 INIT_WORK(&md->barrier_work, dm_rq_barrier_work);
1999 init_waitqueue_head(&md->eventq); 1858 init_waitqueue_head(&md->eventq);
2000 1859
2001 md->disk->major = _major; 1860 md->disk->major = _major;
@@ -2007,7 +1866,8 @@ static struct mapped_device *alloc_dev(int minor)
2007 add_disk(md->disk); 1866 add_disk(md->disk);
2008 format_dev_t(md->name, MKDEV(_major, minor)); 1867 format_dev_t(md->name, MKDEV(_major, minor));
2009 1868
2010 md->wq = create_singlethread_workqueue("kdmflush"); 1869 md->wq = alloc_workqueue("kdmflush",
1870 WQ_NON_REENTRANT | WQ_MEM_RECLAIM, 0);
2011 if (!md->wq) 1871 if (!md->wq)
2012 goto bad_thread; 1872 goto bad_thread;
2013 1873
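
Here kdmflush moves from create_singlethread_workqueue() to alloc_workqueue() with WQ_MEM_RECLAIM (plus WQ_NON_REENTRANT), so the flush worker keeps a rescuer thread and can make forward progress under memory pressure. A small, self-contained sketch of that allocation pattern, with purely illustrative names:

#include <linux/init.h>
#include <linux/errno.h>
#include <linux/workqueue.h>

static struct workqueue_struct *example_wq;
static struct work_struct example_work;

static void example_work_fn(struct work_struct *work)
{
        /* ... drain whatever was deferred to the workqueue ... */
}

static int __init example_init(void)
{
        /* WQ_MEM_RECLAIM guarantees a rescuer thread, which matters for work
         * items that sit in the block I/O path; WQ_NON_REENTRANT keeps a given
         * work item from running on two CPUs at once. */
        example_wq = alloc_workqueue("example",
                                     WQ_NON_REENTRANT | WQ_MEM_RECLAIM, 0);
        if (!example_wq)
                return -ENOMEM;
        INIT_WORK(&example_work, example_work_fn);
        queue_work(example_wq, &example_work);
        return 0;
}

static void __exit example_exit(void)
{
        flush_workqueue(example_wq);
        destroy_workqueue(example_wq);
}
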
@@ -2015,6 +1875,10 @@ static struct mapped_device *alloc_dev(int minor)
2015 if (!md->bdev) 1875 if (!md->bdev)
2016 goto bad_bdev; 1876 goto bad_bdev;
2017 1877
1878 bio_init(&md->flush_bio);
1879 md->flush_bio.bi_bdev = md->bdev;
1880 md->flush_bio.bi_rw = WRITE_FLUSH;
1881
2018 /* Populate the mapping, nobody knows we exist yet */ 1882 /* Populate the mapping, nobody knows we exist yet */
2019 spin_lock(&_minor_lock); 1883 spin_lock(&_minor_lock);
2020 old_md = idr_replace(&_minor_idr, md, minor); 1884 old_md = idr_replace(&_minor_idr, md, minor);
@@ -2111,13 +1975,14 @@ static void event_callback(void *context)
2111 wake_up(&md->eventq); 1975 wake_up(&md->eventq);
2112} 1976}
2113 1977
1978/*
1979 * Protected by md->suspend_lock obtained by dm_swap_table().
1980 */
2114static void __set_size(struct mapped_device *md, sector_t size) 1981static void __set_size(struct mapped_device *md, sector_t size)
2115{ 1982{
2116 set_capacity(md->disk, size); 1983 set_capacity(md->disk, size);
2117 1984
2118 mutex_lock(&md->bdev->bd_inode->i_mutex);
2119 i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT); 1985 i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT);
2120 mutex_unlock(&md->bdev->bd_inode->i_mutex);
2121} 1986}
2122 1987
2123/* 1988/*
@@ -2245,7 +2110,6 @@ static int dm_init_request_based_queue(struct mapped_device *md)
2245 blk_queue_softirq_done(md->queue, dm_softirq_done); 2110 blk_queue_softirq_done(md->queue, dm_softirq_done);
2246 blk_queue_prep_rq(md->queue, dm_prep_fn); 2111 blk_queue_prep_rq(md->queue, dm_prep_fn);
2247 blk_queue_lld_busy(md->queue, dm_lld_busy); 2112 blk_queue_lld_busy(md->queue, dm_lld_busy);
2248 blk_queue_ordered(md->queue, QUEUE_ORDERED_DRAIN_FLUSH);
2249 2113
2250 elv_register_queue(md->queue); 2114 elv_register_queue(md->queue);
2251 2115
@@ -2380,8 +2244,6 @@ static int dm_wait_for_completion(struct mapped_device *md, int interruptible)
2380 int r = 0; 2244 int r = 0;
2381 DECLARE_WAITQUEUE(wait, current); 2245 DECLARE_WAITQUEUE(wait, current);
2382 2246
2383 dm_unplug_all(md->queue);
2384
2385 add_wait_queue(&md->wait, &wait); 2247 add_wait_queue(&md->wait, &wait);
2386 2248
2387 while (1) { 2249 while (1) {
@@ -2406,43 +2268,6 @@ static int dm_wait_for_completion(struct mapped_device *md, int interruptible)
2406 return r; 2268 return r;
2407} 2269}
2408 2270
2409static void dm_flush(struct mapped_device *md)
2410{
2411 dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
2412
2413 bio_init(&md->barrier_bio);
2414 md->barrier_bio.bi_bdev = md->bdev;
2415 md->barrier_bio.bi_rw = WRITE_BARRIER;
2416 __split_and_process_bio(md, &md->barrier_bio);
2417
2418 dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
2419}
2420
2421static void process_barrier(struct mapped_device *md, struct bio *bio)
2422{
2423 md->barrier_error = 0;
2424
2425 dm_flush(md);
2426
2427 if (!bio_empty_barrier(bio)) {
2428 __split_and_process_bio(md, bio);
2429 /*
2430 * If the request isn't supported, don't waste time with
2431 * the second flush.
2432 */
2433 if (md->barrier_error != -EOPNOTSUPP)
2434 dm_flush(md);
2435 }
2436
2437 if (md->barrier_error != DM_ENDIO_REQUEUE)
2438 bio_endio(bio, md->barrier_error);
2439 else {
2440 spin_lock_irq(&md->deferred_lock);
2441 bio_list_add_head(&md->deferred, bio);
2442 spin_unlock_irq(&md->deferred_lock);
2443 }
2444}
2445
2446/* 2271/*
2447 * Process the deferred bios 2272 * Process the deferred bios
2448 */ 2273 */
@@ -2452,33 +2277,27 @@ static void dm_wq_work(struct work_struct *work)
2452 work); 2277 work);
2453 struct bio *c; 2278 struct bio *c;
2454 2279
2455 down_write(&md->io_lock); 2280 down_read(&md->io_lock);
2456 2281
2457 while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) { 2282 while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
2458 spin_lock_irq(&md->deferred_lock); 2283 spin_lock_irq(&md->deferred_lock);
2459 c = bio_list_pop(&md->deferred); 2284 c = bio_list_pop(&md->deferred);
2460 spin_unlock_irq(&md->deferred_lock); 2285 spin_unlock_irq(&md->deferred_lock);
2461 2286
2462 if (!c) { 2287 if (!c)
2463 clear_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags);
2464 break; 2288 break;
2465 }
2466 2289
2467 up_write(&md->io_lock); 2290 up_read(&md->io_lock);
2468 2291
2469 if (dm_request_based(md)) 2292 if (dm_request_based(md))
2470 generic_make_request(c); 2293 generic_make_request(c);
2471 else { 2294 else
2472 if (c->bi_rw & REQ_HARDBARRIER) 2295 __split_and_process_bio(md, c);
2473 process_barrier(md, c);
2474 else
2475 __split_and_process_bio(md, c);
2476 }
2477 2296
2478 down_write(&md->io_lock); 2297 down_read(&md->io_lock);
2479 } 2298 }
2480 2299
2481 up_write(&md->io_lock); 2300 up_read(&md->io_lock);
2482} 2301}
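
dm_wq_work() now takes md->io_lock only for read and no longer special-cases barriers; it still pops deferred bios under deferred_lock and drops io_lock while each bio is in flight. A standalone sketch of that deferred-bio worker shape, using an illustrative example_dev structure rather than struct mapped_device:

#include <linux/bio.h>
#include <linux/bitops.h>
#include <linux/blkdev.h>
#include <linux/rwsem.h>
#include <linux/spinlock.h>

/* Illustrative device structure, not struct mapped_device. */
struct example_dev {
        struct rw_semaphore io_lock;
        spinlock_t deferred_lock;
        struct bio_list deferred;
        unsigned long flags;            /* bit 0: block further I/O */
};

static void example_process_deferred(struct example_dev *d)
{
        struct bio *bio;

        down_read(&d->io_lock);
        while (!test_bit(0, &d->flags)) {
                spin_lock_irq(&d->deferred_lock);
                bio = bio_list_pop(&d->deferred);
                spin_unlock_irq(&d->deferred_lock);

                if (!bio)
                        break;

                /* Drop the semaphore while the bio is in flight so new I/O
                 * submission and suspend can make progress. */
                up_read(&d->io_lock);
                generic_make_request(bio);
                down_read(&d->io_lock);
        }
        up_read(&d->io_lock);
}
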
2483 2302
2484static void dm_queue_flush(struct mapped_device *md) 2303static void dm_queue_flush(struct mapped_device *md)
@@ -2488,73 +2307,6 @@ static void dm_queue_flush(struct mapped_device *md)
2488 queue_work(md->wq, &md->work); 2307 queue_work(md->wq, &md->work);
2489} 2308}
2490 2309
2491static void dm_rq_set_target_request_nr(struct request *clone, unsigned request_nr)
2492{
2493 struct dm_rq_target_io *tio = clone->end_io_data;
2494
2495 tio->info.target_request_nr = request_nr;
2496}
2497
2498/* Issue barrier requests to targets and wait for their completion. */
2499static int dm_rq_barrier(struct mapped_device *md)
2500{
2501 int i, j;
2502 struct dm_table *map = dm_get_live_table(md);
2503 unsigned num_targets = dm_table_get_num_targets(map);
2504 struct dm_target *ti;
2505 struct request *clone;
2506
2507 md->barrier_error = 0;
2508
2509 for (i = 0; i < num_targets; i++) {
2510 ti = dm_table_get_target(map, i);
2511 for (j = 0; j < ti->num_flush_requests; j++) {
2512 clone = clone_rq(md->flush_request, md, GFP_NOIO);
2513 dm_rq_set_target_request_nr(clone, j);
2514 atomic_inc(&md->pending[rq_data_dir(clone)]);
2515 map_request(ti, clone, md);
2516 }
2517 }
2518
2519 dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
2520 dm_table_put(map);
2521
2522 return md->barrier_error;
2523}
2524
2525static void dm_rq_barrier_work(struct work_struct *work)
2526{
2527 int error;
2528 struct mapped_device *md = container_of(work, struct mapped_device,
2529 barrier_work);
2530 struct request_queue *q = md->queue;
2531 struct request *rq;
2532 unsigned long flags;
2533
2534 /*
2535 * Hold the md reference here and leave it at the last part so that
2536 * the md can't be deleted by device opener when the barrier request
2537 * completes.
2538 */
2539 dm_get(md);
2540
2541 error = dm_rq_barrier(md);
2542
2543 rq = md->flush_request;
2544 md->flush_request = NULL;
2545
2546 if (error == DM_ENDIO_REQUEUE) {
2547 spin_lock_irqsave(q->queue_lock, flags);
2548 blk_requeue_request(q, rq);
2549 spin_unlock_irqrestore(q->queue_lock, flags);
2550 } else
2551 blk_end_request_all(rq, error);
2552
2553 blk_run_queue(q);
2554
2555 dm_put(md);
2556}
2557
2558/* 2310/*
2559 * Swap in a new table, returning the old one for the caller to destroy. 2311 * Swap in a new table, returning the old one for the caller to destroy.
2560 */ 2312 */
@@ -2677,23 +2429,17 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
2677 * 2429 *
2678 * To get all processes out of __split_and_process_bio in dm_request, 2430 * To get all processes out of __split_and_process_bio in dm_request,
2679 * we take the write lock. To prevent any process from reentering 2431 * we take the write lock. To prevent any process from reentering
2680 * __split_and_process_bio from dm_request, we set 2432 * __split_and_process_bio from dm_request and quiesce the thread
 2681 * DMF_QUEUE_IO_TO_THREAD. 2433 * (dm_wq_work), we set DMF_BLOCK_IO_FOR_SUSPEND and call
2682 * 2434 * flush_workqueue(md->wq).
2683 * To quiesce the thread (dm_wq_work), we set DMF_BLOCK_IO_FOR_SUSPEND
2684 * and call flush_workqueue(md->wq). flush_workqueue will wait until
2685 * dm_wq_work exits and DMF_BLOCK_IO_FOR_SUSPEND will prevent any
2686 * further calls to __split_and_process_bio from dm_wq_work.
2687 */ 2435 */
2688 down_write(&md->io_lock); 2436 down_write(&md->io_lock);
2689 set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags); 2437 set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
2690 set_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags);
2691 up_write(&md->io_lock); 2438 up_write(&md->io_lock);
2692 2439
2693 /* 2440 /*
2694 * Request-based dm uses md->wq for barrier (dm_rq_barrier_work) which 2441 * Stop md->queue before flushing md->wq in case request-based
2695 * can be kicked until md->queue is stopped. So stop md->queue before 2442 * dm defers requests to md->wq from md->queue.
2696 * flushing md->wq.
2697 */ 2443 */
2698 if (dm_request_based(md)) 2444 if (dm_request_based(md))
2699 stop_queue(md->queue); 2445 stop_queue(md->queue);
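
The rewritten comment describes the suspend side of the same scheme: set DMF_BLOCK_IO_FOR_SUSPEND under the write lock, then flush_workqueue(md->wq) so any running worker finishes and nothing further is processed. A hedged, standalone sketch of that quiesce step with illustrative names (the bit here merely stands in for DMF_BLOCK_IO_FOR_SUSPEND):

#include <linux/bitops.h>
#include <linux/rwsem.h>
#include <linux/workqueue.h>

struct example_suspend_ctx {
        struct rw_semaphore io_lock;
        unsigned long flags;            /* bit 0: block further I/O */
};

static void example_quiesce(struct example_suspend_ctx *c,
                            struct workqueue_struct *wq)
{
        /* From now on new bios are deferred instead of processed. */
        down_write(&c->io_lock);
        set_bit(0, &c->flags);          /* stands in for DMF_BLOCK_IO_FOR_SUSPEND */
        up_write(&c->io_lock);

        /* flush_workqueue() waits until a running worker returns; the flag
         * keeps it from picking up any further deferred bios afterwards. */
        flush_workqueue(wq);
}
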
@@ -2772,7 +2518,6 @@ int dm_resume(struct mapped_device *md)
2772 2518
2773 clear_bit(DMF_SUSPENDED, &md->flags); 2519 clear_bit(DMF_SUSPENDED, &md->flags);
2774 2520
2775 dm_table_unplug_all(map);
2776 r = 0; 2521 r = 0;
2777out: 2522out:
2778 dm_table_put(map); 2523 dm_table_put(map);
@@ -2876,9 +2621,10 @@ int dm_noflush_suspending(struct dm_target *ti)
2876} 2621}
2877EXPORT_SYMBOL_GPL(dm_noflush_suspending); 2622EXPORT_SYMBOL_GPL(dm_noflush_suspending);
2878 2623
2879struct dm_md_mempools *dm_alloc_md_mempools(unsigned type) 2624struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity)
2880{ 2625{
2881 struct dm_md_mempools *pools = kmalloc(sizeof(*pools), GFP_KERNEL); 2626 struct dm_md_mempools *pools = kmalloc(sizeof(*pools), GFP_KERNEL);
2627 unsigned int pool_size = (type == DM_TYPE_BIO_BASED) ? 16 : MIN_IOS;
2882 2628
2883 if (!pools) 2629 if (!pools)
2884 return NULL; 2630 return NULL;
@@ -2895,13 +2641,18 @@ struct dm_md_mempools *dm_alloc_md_mempools(unsigned type)
2895 if (!pools->tio_pool) 2641 if (!pools->tio_pool)
2896 goto free_io_pool_and_out; 2642 goto free_io_pool_and_out;
2897 2643
2898 pools->bs = (type == DM_TYPE_BIO_BASED) ? 2644 pools->bs = bioset_create(pool_size, 0);
2899 bioset_create(16, 0) : bioset_create(MIN_IOS, 0);
2900 if (!pools->bs) 2645 if (!pools->bs)
2901 goto free_tio_pool_and_out; 2646 goto free_tio_pool_and_out;
2902 2647
2648 if (integrity && bioset_integrity_create(pools->bs, pool_size))
2649 goto free_bioset_and_out;
2650
2903 return pools; 2651 return pools;
2904 2652
2653free_bioset_and_out:
2654 bioset_free(pools->bs);
2655
2905free_tio_pool_and_out: 2656free_tio_pool_and_out:
2906 mempool_destroy(pools->tio_pool); 2657 mempool_destroy(pools->tio_pool);
2907 2658
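
dm_alloc_md_mempools() now sizes the bioset from the table type and optionally attaches an integrity payload pool, unwinding through free_bioset_and_out on failure. A compact sketch of the same allocate-then-unwind idiom, with made-up structure and label names:

#include <linux/bio.h>
#include <linux/slab.h>

struct example_pools {
        struct bio_set *bs;
};

static struct example_pools *example_alloc_pools(unsigned pool_size,
                                                 bool want_integrity)
{
        struct example_pools *p = kmalloc(sizeof(*p), GFP_KERNEL);

        if (!p)
                return NULL;

        p->bs = bioset_create(pool_size, 0);
        if (!p->bs)
                goto free_pools;

        /* Only pay for integrity payloads when the caller needs them. */
        if (want_integrity && bioset_integrity_create(p->bs, pool_size))
                goto free_bioset;

        return p;

free_bioset:
        bioset_free(p->bs);
free_pools:
        kfree(p);
        return NULL;
}
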
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 0c2dd5f4af76..1aaf16746da8 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -149,7 +149,7 @@ void dm_kcopyd_exit(void);
149/* 149/*
150 * Mempool operations 150 * Mempool operations
151 */ 151 */
152struct dm_md_mempools *dm_alloc_md_mempools(unsigned type); 152struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity);
153void dm_free_md_mempools(struct dm_md_mempools *pools); 153void dm_free_md_mempools(struct dm_md_mempools *pools);
154 154
155#endif 155#endif
diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c
index 1a8987884614..23078dabb6df 100644
--- a/drivers/md/faulty.c
+++ b/drivers/md/faulty.c
@@ -30,7 +30,7 @@
30 * 30 *
31 * Different modes can be active at a time, but only 31 * Different modes can be active at a time, but only
32 * one can be set at array creation. Others can be added later. 32 * one can be set at array creation. Others can be added later.
33 * A mode can be one-shot or recurrent with the recurrance being 33 * A mode can be one-shot or recurrent with the recurrence being
34 * once in every N requests. 34 * once in every N requests.
35 * The bottom 5 bits of the "layout" indicate the mode. The 35 * The bottom 5 bits of the "layout" indicate the mode. The
36 * remainder indicate a period, or 0 for one-shot. 36 * remainder indicate a period, or 0 for one-shot.
@@ -210,7 +210,7 @@ static int make_request(mddev_t *mddev, struct bio *bio)
210 } 210 }
211 } 211 }
212 if (failit) { 212 if (failit) {
213 struct bio *b = bio_clone(bio, GFP_NOIO); 213 struct bio *b = bio_clone_mddev(bio, GFP_NOIO, mddev);
214 b->bi_bdev = conf->rdev->bdev; 214 b->bi_bdev = conf->rdev->bdev;
215 b->bi_private = bio; 215 b->bi_private = bio;
216 b->bi_end_io = faulty_fail; 216 b->bi_end_io = faulty_fail;
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index ba19060bcf3f..abfb59a61ede 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -87,22 +87,6 @@ static int linear_mergeable_bvec(struct request_queue *q,
87 return maxsectors << 9; 87 return maxsectors << 9;
88} 88}
89 89
90static void linear_unplug(struct request_queue *q)
91{
92 mddev_t *mddev = q->queuedata;
93 linear_conf_t *conf;
94 int i;
95
96 rcu_read_lock();
97 conf = rcu_dereference(mddev->private);
98
99 for (i=0; i < mddev->raid_disks; i++) {
100 struct request_queue *r_queue = bdev_get_queue(conf->disks[i].rdev->bdev);
101 blk_unplug(r_queue);
102 }
103 rcu_read_unlock();
104}
105
106static int linear_congested(void *data, int bits) 90static int linear_congested(void *data, int bits)
107{ 91{
108 mddev_t *mddev = data; 92 mddev_t *mddev = data;
@@ -216,7 +200,6 @@ static int linear_run (mddev_t *mddev)
216 200
217 if (md_check_no_bitmap(mddev)) 201 if (md_check_no_bitmap(mddev))
218 return -EINVAL; 202 return -EINVAL;
219 mddev->queue->queue_lock = &mddev->queue->__queue_lock;
220 conf = linear_conf(mddev, mddev->raid_disks); 203 conf = linear_conf(mddev, mddev->raid_disks);
221 204
222 if (!conf) 205 if (!conf)
@@ -225,11 +208,9 @@ static int linear_run (mddev_t *mddev)
225 md_set_array_sectors(mddev, linear_size(mddev, 0, 0)); 208 md_set_array_sectors(mddev, linear_size(mddev, 0, 0));
226 209
227 blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec); 210 blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec);
228 mddev->queue->unplug_fn = linear_unplug;
229 mddev->queue->backing_dev_info.congested_fn = linear_congested; 211 mddev->queue->backing_dev_info.congested_fn = linear_congested;
230 mddev->queue->backing_dev_info.congested_data = mddev; 212 mddev->queue->backing_dev_info.congested_data = mddev;
231 md_integrity_register(mddev); 213 return md_integrity_register(mddev);
232 return 0;
233} 214}
234 215
235static void free_conf(struct rcu_head *head) 216static void free_conf(struct rcu_head *head)
@@ -294,8 +275,8 @@ static int linear_make_request (mddev_t *mddev, struct bio *bio)
294 dev_info_t *tmp_dev; 275 dev_info_t *tmp_dev;
295 sector_t start_sector; 276 sector_t start_sector;
296 277
297 if (unlikely(bio->bi_rw & REQ_HARDBARRIER)) { 278 if (unlikely(bio->bi_rw & REQ_FLUSH)) {
298 md_barrier_request(mddev, bio); 279 md_flush_request(mddev, bio);
299 return 0; 280 return 0;
300 } 281 }
301 282
diff --git a/drivers/md/md.c b/drivers/md/md.c
index f20d13e717d5..91e31e260b4a 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -36,7 +36,7 @@
36#include <linux/blkdev.h> 36#include <linux/blkdev.h>
37#include <linux/sysctl.h> 37#include <linux/sysctl.h>
38#include <linux/seq_file.h> 38#include <linux/seq_file.h>
39#include <linux/smp_lock.h> 39#include <linux/mutex.h>
40#include <linux/buffer_head.h> /* for invalidate_bdev */ 40#include <linux/buffer_head.h> /* for invalidate_bdev */
41#include <linux/poll.h> 41#include <linux/poll.h>
42#include <linux/ctype.h> 42#include <linux/ctype.h>
@@ -57,7 +57,6 @@
57#define DEBUG 0 57#define DEBUG 0
58#define dprintk(x...) ((void)(DEBUG && printk(x))) 58#define dprintk(x...) ((void)(DEBUG && printk(x)))
59 59
60
61#ifndef MODULE 60#ifndef MODULE
62static void autostart_arrays(int part); 61static void autostart_arrays(int part);
63#endif 62#endif
@@ -68,6 +67,8 @@ static DEFINE_SPINLOCK(pers_lock);
68static void md_print_devices(void); 67static void md_print_devices(void);
69 68
70static DECLARE_WAIT_QUEUE_HEAD(resync_wait); 69static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
70static struct workqueue_struct *md_wq;
71static struct workqueue_struct *md_misc_wq;
71 72
72#define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); } 73#define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); }
73 74
@@ -148,6 +149,72 @@ static const struct block_device_operations md_fops;
148 149
149static int start_readonly; 150static int start_readonly;
150 151
152/* bio_clone_mddev
153 * like bio_clone, but with a local bio set
154 */
155
156static void mddev_bio_destructor(struct bio *bio)
157{
158 mddev_t *mddev, **mddevp;
159
160 mddevp = (void*)bio;
161 mddev = mddevp[-1];
162
163 bio_free(bio, mddev->bio_set);
164}
165
166struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
167 mddev_t *mddev)
168{
169 struct bio *b;
170 mddev_t **mddevp;
171
172 if (!mddev || !mddev->bio_set)
173 return bio_alloc(gfp_mask, nr_iovecs);
174
175 b = bio_alloc_bioset(gfp_mask, nr_iovecs,
176 mddev->bio_set);
177 if (!b)
178 return NULL;
179 mddevp = (void*)b;
180 mddevp[-1] = mddev;
181 b->bi_destructor = mddev_bio_destructor;
182 return b;
183}
184EXPORT_SYMBOL_GPL(bio_alloc_mddev);
185
186struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask,
187 mddev_t *mddev)
188{
189 struct bio *b;
190 mddev_t **mddevp;
191
192 if (!mddev || !mddev->bio_set)
193 return bio_clone(bio, gfp_mask);
194
195 b = bio_alloc_bioset(gfp_mask, bio->bi_max_vecs,
196 mddev->bio_set);
197 if (!b)
198 return NULL;
199 mddevp = (void*)b;
200 mddevp[-1] = mddev;
201 b->bi_destructor = mddev_bio_destructor;
202 __bio_clone(b, bio);
203 if (bio_integrity(bio)) {
204 int ret;
205
206 ret = bio_integrity_clone(b, bio, gfp_mask, mddev->bio_set);
207
208 if (ret < 0) {
209 bio_put(b);
210 return NULL;
211 }
212 }
213
214 return b;
215}
216EXPORT_SYMBOL_GPL(bio_clone_mddev);
217
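
bio_alloc_mddev()/bio_clone_mddev() depend on mddev->bio_set having been created with enough front padding for one pointer in front of each bio; that is what lets mddevp[-1] store the owning mddev and lets the destructor recover the right bio_set. A standalone sketch of the same front-pad technique with a generic context pointer; the names and the bioset_create() call are assumptions for illustration, not md code:

#include <linux/bio.h>

/* Assumes the set was created with room for one pointer of front padding,
 * e.g. example_bs = bioset_create(64, sizeof(void *)). */
static struct bio_set *example_bs;

static void example_bio_destructor(struct bio *bio)
{
        void *ctx = ((void **)bio)[-1];   /* context stored at allocation time */

        /* md's variant uses this pointer to find the owning device's bio_set;
         * this sketch only has the one static set. */
        (void)ctx;
        bio_free(bio, example_bs);
}

static struct bio *example_bio_alloc(gfp_t gfp, int nr_iovecs, void *ctx)
{
        struct bio *bio = bio_alloc_bioset(gfp, nr_iovecs, example_bs);
        void **ctxp;

        if (!bio)
                return NULL;
        ctxp = (void **)bio;
        ctxp[-1] = ctx;                   /* lives in the front_pad area */
        bio->bi_destructor = example_bio_destructor;
        return bio;
}
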
151/* 218/*
152 * We have a system wide 'event count' that is incremented 219 * We have a system wide 'event count' that is incremented
153 * on any 'interesting' event, and readers of /proc/mdstat 220 * on any 'interesting' event, and readers of /proc/mdstat
@@ -220,18 +287,21 @@ static int md_make_request(struct request_queue *q, struct bio *bio)
220 mddev_t *mddev = q->queuedata; 287 mddev_t *mddev = q->queuedata;
221 int rv; 288 int rv;
222 int cpu; 289 int cpu;
290 unsigned int sectors;
223 291
224 if (mddev == NULL || mddev->pers == NULL) { 292 if (mddev == NULL || mddev->pers == NULL
293 || !mddev->ready) {
225 bio_io_error(bio); 294 bio_io_error(bio);
226 return 0; 295 return 0;
227 } 296 }
297 smp_rmb(); /* Ensure implications of 'active' are visible */
228 rcu_read_lock(); 298 rcu_read_lock();
229 if (mddev->suspended || mddev->barrier) { 299 if (mddev->suspended) {
230 DEFINE_WAIT(__wait); 300 DEFINE_WAIT(__wait);
231 for (;;) { 301 for (;;) {
232 prepare_to_wait(&mddev->sb_wait, &__wait, 302 prepare_to_wait(&mddev->sb_wait, &__wait,
233 TASK_UNINTERRUPTIBLE); 303 TASK_UNINTERRUPTIBLE);
234 if (!mddev->suspended && !mddev->barrier) 304 if (!mddev->suspended)
235 break; 305 break;
236 rcu_read_unlock(); 306 rcu_read_unlock();
237 schedule(); 307 schedule();
@@ -242,12 +312,16 @@ static int md_make_request(struct request_queue *q, struct bio *bio)
242 atomic_inc(&mddev->active_io); 312 atomic_inc(&mddev->active_io);
243 rcu_read_unlock(); 313 rcu_read_unlock();
244 314
315 /*
316 * save the sectors now since our bio can
317 * go away inside make_request
318 */
319 sectors = bio_sectors(bio);
245 rv = mddev->pers->make_request(mddev, bio); 320 rv = mddev->pers->make_request(mddev, bio);
246 321
247 cpu = part_stat_lock(); 322 cpu = part_stat_lock();
248 part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]); 323 part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);
249 part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], 324 part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], sectors);
250 bio_sectors(bio));
251 part_stat_unlock(); 325 part_stat_unlock();
252 326
253 if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended) 327 if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended)
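
The new 'sectors' local exists because the personality's make_request may complete and free the bio before control returns, so bio_sectors(bio) must be sampled first. A small illustrative wrapper showing that ordering (the disk pointer and submit callback are assumptions, not md internals):

#include <linux/bio.h>
#include <linux/genhd.h>

static void example_account_and_submit(struct gendisk *disk, struct bio *bio,
                                       int (*submit)(struct bio *))
{
        int rw = bio_data_dir(bio);
        /* Sample the size now: once submit() runs, the bio may complete and
         * be freed, so reading bio_sectors(bio) afterwards would be unsafe. */
        unsigned int sectors = bio_sectors(bio);
        int cpu;

        submit(bio);

        cpu = part_stat_lock();
        part_stat_inc(cpu, &disk->part0, ios[rw]);
        part_stat_add(cpu, &disk->part0, sectors[rw], sectors);
        part_stat_unlock();
}
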
@@ -277,48 +351,45 @@ void mddev_resume(mddev_t *mddev)
277 mddev->suspended = 0; 351 mddev->suspended = 0;
278 wake_up(&mddev->sb_wait); 352 wake_up(&mddev->sb_wait);
279 mddev->pers->quiesce(mddev, 0); 353 mddev->pers->quiesce(mddev, 0);
354
355 md_wakeup_thread(mddev->thread);
356 md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
280} 357}
281EXPORT_SYMBOL_GPL(mddev_resume); 358EXPORT_SYMBOL_GPL(mddev_resume);
282 359
283int mddev_congested(mddev_t *mddev, int bits) 360int mddev_congested(mddev_t *mddev, int bits)
284{ 361{
285 if (mddev->barrier)
286 return 1;
287 return mddev->suspended; 362 return mddev->suspended;
288} 363}
289EXPORT_SYMBOL(mddev_congested); 364EXPORT_SYMBOL(mddev_congested);
290 365
291/* 366/*
292 * Generic barrier handling for md 367 * Generic flush handling for md
293 */ 368 */
294 369
295#define POST_REQUEST_BARRIER ((void*)1) 370static void md_end_flush(struct bio *bio, int err)
296
297static void md_end_barrier(struct bio *bio, int err)
298{ 371{
299 mdk_rdev_t *rdev = bio->bi_private; 372 mdk_rdev_t *rdev = bio->bi_private;
300 mddev_t *mddev = rdev->mddev; 373 mddev_t *mddev = rdev->mddev;
301 if (err == -EOPNOTSUPP && mddev->barrier != POST_REQUEST_BARRIER)
302 set_bit(BIO_EOPNOTSUPP, &mddev->barrier->bi_flags);
303 374
304 rdev_dec_pending(rdev, mddev); 375 rdev_dec_pending(rdev, mddev);
305 376
306 if (atomic_dec_and_test(&mddev->flush_pending)) { 377 if (atomic_dec_and_test(&mddev->flush_pending)) {
307 if (mddev->barrier == POST_REQUEST_BARRIER) { 378 /* The pre-request flush has finished */
308 /* This was a post-request barrier */ 379 queue_work(md_wq, &mddev->flush_work);
309 mddev->barrier = NULL;
310 wake_up(&mddev->sb_wait);
311 } else
312 /* The pre-request barrier has finished */
313 schedule_work(&mddev->barrier_work);
314 } 380 }
315 bio_put(bio); 381 bio_put(bio);
316} 382}
317 383
318static void submit_barriers(mddev_t *mddev) 384static void md_submit_flush_data(struct work_struct *ws);
385
386static void submit_flushes(struct work_struct *ws)
319{ 387{
388 mddev_t *mddev = container_of(ws, mddev_t, flush_work);
320 mdk_rdev_t *rdev; 389 mdk_rdev_t *rdev;
321 390
391 INIT_WORK(&mddev->flush_work, md_submit_flush_data);
392 atomic_set(&mddev->flush_pending, 1);
322 rcu_read_lock(); 393 rcu_read_lock();
323 list_for_each_entry_rcu(rdev, &mddev->disks, same_set) 394 list_for_each_entry_rcu(rdev, &mddev->disks, same_set)
324 if (rdev->raid_disk >= 0 && 395 if (rdev->raid_disk >= 0 &&
@@ -331,106 +402,107 @@ static void submit_barriers(mddev_t *mddev)
331 atomic_inc(&rdev->nr_pending); 402 atomic_inc(&rdev->nr_pending);
332 atomic_inc(&rdev->nr_pending); 403 atomic_inc(&rdev->nr_pending);
333 rcu_read_unlock(); 404 rcu_read_unlock();
334 bi = bio_alloc(GFP_KERNEL, 0); 405 bi = bio_alloc_mddev(GFP_KERNEL, 0, mddev);
335 bi->bi_end_io = md_end_barrier; 406 bi->bi_end_io = md_end_flush;
336 bi->bi_private = rdev; 407 bi->bi_private = rdev;
337 bi->bi_bdev = rdev->bdev; 408 bi->bi_bdev = rdev->bdev;
338 atomic_inc(&mddev->flush_pending); 409 atomic_inc(&mddev->flush_pending);
339 submit_bio(WRITE_BARRIER, bi); 410 submit_bio(WRITE_FLUSH, bi);
340 rcu_read_lock(); 411 rcu_read_lock();
341 rdev_dec_pending(rdev, mddev); 412 rdev_dec_pending(rdev, mddev);
342 } 413 }
343 rcu_read_unlock(); 414 rcu_read_unlock();
415 if (atomic_dec_and_test(&mddev->flush_pending))
416 queue_work(md_wq, &mddev->flush_work);
344} 417}
345 418
346static void md_submit_barrier(struct work_struct *ws) 419static void md_submit_flush_data(struct work_struct *ws)
347{ 420{
348 mddev_t *mddev = container_of(ws, mddev_t, barrier_work); 421 mddev_t *mddev = container_of(ws, mddev_t, flush_work);
349 struct bio *bio = mddev->barrier; 422 struct bio *bio = mddev->flush_bio;
350
351 atomic_set(&mddev->flush_pending, 1);
352 423
353 if (test_bit(BIO_EOPNOTSUPP, &bio->bi_flags)) 424 if (bio->bi_size == 0)
354 bio_endio(bio, -EOPNOTSUPP);
355 else if (bio->bi_size == 0)
356 /* an empty barrier - all done */ 425 /* an empty barrier - all done */
357 bio_endio(bio, 0); 426 bio_endio(bio, 0);
358 else { 427 else {
359 bio->bi_rw &= ~REQ_HARDBARRIER; 428 bio->bi_rw &= ~REQ_FLUSH;
360 if (mddev->pers->make_request(mddev, bio)) 429 if (mddev->pers->make_request(mddev, bio))
361 generic_make_request(bio); 430 generic_make_request(bio);
362 mddev->barrier = POST_REQUEST_BARRIER;
363 submit_barriers(mddev);
364 }
365 if (atomic_dec_and_test(&mddev->flush_pending)) {
366 mddev->barrier = NULL;
367 wake_up(&mddev->sb_wait);
368 } 431 }
432
433 mddev->flush_bio = NULL;
434 wake_up(&mddev->sb_wait);
369} 435}
370 436
371void md_barrier_request(mddev_t *mddev, struct bio *bio) 437void md_flush_request(mddev_t *mddev, struct bio *bio)
372{ 438{
373 spin_lock_irq(&mddev->write_lock); 439 spin_lock_irq(&mddev->write_lock);
374 wait_event_lock_irq(mddev->sb_wait, 440 wait_event_lock_irq(mddev->sb_wait,
375 !mddev->barrier, 441 !mddev->flush_bio,
376 mddev->write_lock, /*nothing*/); 442 mddev->write_lock, /*nothing*/);
377 mddev->barrier = bio; 443 mddev->flush_bio = bio;
378 spin_unlock_irq(&mddev->write_lock); 444 spin_unlock_irq(&mddev->write_lock);
379 445
380 atomic_set(&mddev->flush_pending, 1); 446 INIT_WORK(&mddev->flush_work, submit_flushes);
381 INIT_WORK(&mddev->barrier_work, md_submit_barrier); 447 queue_work(md_wq, &mddev->flush_work);
382
383 submit_barriers(mddev);
384
385 if (atomic_dec_and_test(&mddev->flush_pending))
386 schedule_work(&mddev->barrier_work);
387} 448}
388EXPORT_SYMBOL(md_barrier_request); 449EXPORT_SYMBOL(md_flush_request);
389 450
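
The flush path replaces the barrier machinery with a fan-out/fan-in pattern: mddev->flush_pending starts at 1, is incremented per member flush bio, and whichever dec-and-test reaches zero queues flush_work exactly once. A self-contained sketch of that counting pattern, with illustrative names and plain bio_alloc()/schedule_work() in place of the md helpers:

#include <linux/atomic.h>
#include <linux/bio.h>
#include <linux/fs.h>
#include <linux/workqueue.h>

struct example_flush {
        atomic_t pending;
        struct work_struct done_work;   /* runs once every sub-flush is done */
};

static void example_flush_endio(struct bio *bio, int error)
{
        struct example_flush *f = bio->bi_private;

        if (atomic_dec_and_test(&f->pending))
                schedule_work(&f->done_work);
        bio_put(bio);
}

static void example_send_flushes(struct example_flush *f,
                                 struct block_device **bdevs, int n)
{
        int i;

        /* Bias the count by one so the dec-and-test at the bottom fires the
         * completion work exactly once, even if every bio finishes early. */
        atomic_set(&f->pending, 1);
        for (i = 0; i < n; i++) {
                struct bio *bio = bio_alloc(GFP_KERNEL, 0);

                bio->bi_end_io = example_flush_endio;
                bio->bi_private = f;
                bio->bi_bdev = bdevs[i];
                atomic_inc(&f->pending);
                submit_bio(WRITE_FLUSH, bio);
        }
        if (atomic_dec_and_test(&f->pending))
                schedule_work(&f->done_work);
}
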
390/* Support for plugging. 451/* Support for plugging.
391 * This mirrors the plugging support in request_queue, but does not 452 * This mirrors the plugging support in request_queue, but does not
392 * require having a whole queue 453 * require having a whole queue or request structures.
454 * We allocate an md_plug_cb for each md device and each thread it gets
455 * plugged on. This links to the private plug_handle structure in the
456 * personality data where we keep a count of the number of outstanding
457 * plugs so other code can see if a plug is active.
393 */ 458 */
394static void plugger_work(struct work_struct *work) 459struct md_plug_cb {
395{ 460 struct blk_plug_cb cb;
396 struct plug_handle *plug = 461 mddev_t *mddev;
397 container_of(work, struct plug_handle, unplug_work); 462};
398 plug->unplug_fn(plug);
399}
400static void plugger_timeout(unsigned long data)
401{
402 struct plug_handle *plug = (void *)data;
403 kblockd_schedule_work(NULL, &plug->unplug_work);
404}
405void plugger_init(struct plug_handle *plug,
406 void (*unplug_fn)(struct plug_handle *))
407{
408 plug->unplug_flag = 0;
409 plug->unplug_fn = unplug_fn;
410 init_timer(&plug->unplug_timer);
411 plug->unplug_timer.function = plugger_timeout;
412 plug->unplug_timer.data = (unsigned long)plug;
413 INIT_WORK(&plug->unplug_work, plugger_work);
414}
415EXPORT_SYMBOL_GPL(plugger_init);
416 463
417void plugger_set_plug(struct plug_handle *plug) 464static void plugger_unplug(struct blk_plug_cb *cb)
418{ 465{
419 if (!test_and_set_bit(PLUGGED_FLAG, &plug->unplug_flag)) 466 struct md_plug_cb *mdcb = container_of(cb, struct md_plug_cb, cb);
420 mod_timer(&plug->unplug_timer, jiffies + msecs_to_jiffies(3)+1); 467 if (atomic_dec_and_test(&mdcb->mddev->plug_cnt))
468 md_wakeup_thread(mdcb->mddev->thread);
469 kfree(mdcb);
421} 470}
422EXPORT_SYMBOL_GPL(plugger_set_plug);
423 471
424int plugger_remove_plug(struct plug_handle *plug) 472/* Check that an unplug wakeup will come shortly.
473 * If not, wakeup the md thread immediately
474 */
475int mddev_check_plugged(mddev_t *mddev)
425{ 476{
426 if (test_and_clear_bit(PLUGGED_FLAG, &plug->unplug_flag)) { 477 struct blk_plug *plug = current->plug;
427 del_timer(&plug->unplug_timer); 478 struct md_plug_cb *mdcb;
428 return 1; 479
429 } else 480 if (!plug)
430 return 0; 481 return 0;
431}
432EXPORT_SYMBOL_GPL(plugger_remove_plug);
433 482
483 list_for_each_entry(mdcb, &plug->cb_list, cb.list) {
484 if (mdcb->cb.callback == plugger_unplug &&
485 mdcb->mddev == mddev) {
486 /* Already on the list, move to top */
487 if (mdcb != list_first_entry(&plug->cb_list,
488 struct md_plug_cb,
489 cb.list))
490 list_move(&mdcb->cb.list, &plug->cb_list);
491 return 1;
492 }
493 }
494 /* Not currently on the callback list */
495 mdcb = kmalloc(sizeof(*mdcb), GFP_ATOMIC);
496 if (!mdcb)
497 return 0;
498
499 mdcb->mddev = mddev;
500 mdcb->cb.callback = plugger_unplug;
501 atomic_inc(&mddev->plug_cnt);
502 list_add(&mdcb->cb.list, &plug->cb_list);
503 return 1;
504}
505EXPORT_SYMBOL_GPL(mddev_check_plugged);
434 506
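
mddev_check_plugged() hooks into the submitting task's on-stack blk_plug so the md thread is only woken when that task unplugs. From the submitter's side this is the usual 2.6.39+ plugging pattern; a hedged sketch (not md code, the batch contents are an assumption):

#include <linux/bio.h>
#include <linux/blkdev.h>

static void example_submit_batch(struct bio **bios, int n)
{
        struct blk_plug plug;
        int i;

        /* While the plug is active, callbacks hung off it (such as md's
         * plugger_unplug above) are deferred until blk_finish_plug(). */
        blk_start_plug(&plug);
        for (i = 0; i < n; i++)
                submit_bio(READ, bios[i]);
        blk_finish_plug(&plug);
}
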
435static inline mddev_t *mddev_get(mddev_t *mddev) 507static inline mddev_t *mddev_get(mddev_t *mddev)
436{ 508{
@@ -442,6 +514,8 @@ static void mddev_delayed_delete(struct work_struct *ws);
442 514
443static void mddev_put(mddev_t *mddev) 515static void mddev_put(mddev_t *mddev)
444{ 516{
517 struct bio_set *bs = NULL;
518
445 if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock)) 519 if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
446 return; 520 return;
447 if (!mddev->raid_disks && list_empty(&mddev->disks) && 521 if (!mddev->raid_disks && list_empty(&mddev->disks) &&
@@ -449,19 +523,22 @@ static void mddev_put(mddev_t *mddev)
449 /* Array is not configured at all, and not held active, 523 /* Array is not configured at all, and not held active,
450 * so destroy it */ 524 * so destroy it */
451 list_del(&mddev->all_mddevs); 525 list_del(&mddev->all_mddevs);
526 bs = mddev->bio_set;
527 mddev->bio_set = NULL;
452 if (mddev->gendisk) { 528 if (mddev->gendisk) {
453 /* we did a probe so need to clean up. 529 /* We did a probe so need to clean up. Call
454 * Call schedule_work inside the spinlock 530 * queue_work inside the spinlock so that
455 * so that flush_scheduled_work() after 531 * flush_workqueue() after mddev_find will
456 * mddev_find will succeed in waiting for the 532 * succeed in waiting for the work to be done.
457 * work to be done.
458 */ 533 */
459 INIT_WORK(&mddev->del_work, mddev_delayed_delete); 534 INIT_WORK(&mddev->del_work, mddev_delayed_delete);
460 schedule_work(&mddev->del_work); 535 queue_work(md_misc_wq, &mddev->del_work);
461 } else 536 } else
462 kfree(mddev); 537 kfree(mddev);
463 } 538 }
464 spin_unlock(&all_mddevs_lock); 539 spin_unlock(&all_mddevs_lock);
540 if (bs)
541 bioset_free(bs);
465} 542}
466 543
467void mddev_init(mddev_t *mddev) 544void mddev_init(mddev_t *mddev)
@@ -475,6 +552,7 @@ void mddev_init(mddev_t *mddev)
475 atomic_set(&mddev->active, 1); 552 atomic_set(&mddev->active, 1);
476 atomic_set(&mddev->openers, 0); 553 atomic_set(&mddev->openers, 0);
477 atomic_set(&mddev->active_io, 0); 554 atomic_set(&mddev->active_io, 0);
555 atomic_set(&mddev->plug_cnt, 0);
478 spin_lock_init(&mddev->write_lock); 556 spin_lock_init(&mddev->write_lock);
479 atomic_set(&mddev->flush_pending, 0); 557 atomic_set(&mddev->flush_pending, 0);
480 init_waitqueue_head(&mddev->sb_wait); 558 init_waitqueue_head(&mddev->sb_wait);
@@ -490,6 +568,9 @@ static mddev_t * mddev_find(dev_t unit)
490{ 568{
491 mddev_t *mddev, *new = NULL; 569 mddev_t *mddev, *new = NULL;
492 570
571 if (unit && MAJOR(unit) != MD_MAJOR)
572 unit &= ~((1<<MdpMinorShift)-1);
573
493 retry: 574 retry:
494 spin_lock(&all_mddevs_lock); 575 spin_lock(&all_mddevs_lock);
495 576
@@ -647,9 +728,9 @@ static struct mdk_personality *find_pers(int level, char *clevel)
647} 728}
648 729
649/* return the offset of the super block in 512byte sectors */ 730/* return the offset of the super block in 512byte sectors */
650static inline sector_t calc_dev_sboffset(struct block_device *bdev) 731static inline sector_t calc_dev_sboffset(mdk_rdev_t *rdev)
651{ 732{
652 sector_t num_sectors = bdev->bd_inode->i_size / 512; 733 sector_t num_sectors = i_size_read(rdev->bdev->bd_inode) / 512;
653 return MD_NEW_SIZE_SECTORS(num_sectors); 734 return MD_NEW_SIZE_SECTORS(num_sectors);
654} 735}
655 736
@@ -696,31 +777,6 @@ static void super_written(struct bio *bio, int error)
696 bio_put(bio); 777 bio_put(bio);
697} 778}
698 779
699static void super_written_barrier(struct bio *bio, int error)
700{
701 struct bio *bio2 = bio->bi_private;
702 mdk_rdev_t *rdev = bio2->bi_private;
703 mddev_t *mddev = rdev->mddev;
704
705 if (!test_bit(BIO_UPTODATE, &bio->bi_flags) &&
706 error == -EOPNOTSUPP) {
707 unsigned long flags;
708 /* barriers don't appear to be supported :-( */
709 set_bit(BarriersNotsupp, &rdev->flags);
710 mddev->barriers_work = 0;
711 spin_lock_irqsave(&mddev->write_lock, flags);
712 bio2->bi_next = mddev->biolist;
713 mddev->biolist = bio2;
714 spin_unlock_irqrestore(&mddev->write_lock, flags);
715 wake_up(&mddev->sb_wait);
716 bio_put(bio);
717 } else {
718 bio_put(bio2);
719 bio->bi_private = rdev;
720 super_written(bio, error);
721 }
722}
723
724void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, 780void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
725 sector_t sector, int size, struct page *page) 781 sector_t sector, int size, struct page *page)
726{ 782{
@@ -729,51 +785,27 @@ void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
729 * and decrement it on completion, waking up sb_wait 785 * and decrement it on completion, waking up sb_wait
730 * if zero is reached. 786 * if zero is reached.
731 * If an error occurred, call md_error 787 * If an error occurred, call md_error
732 *
733 * As we might need to resubmit the request if REQ_HARDBARRIER
734 * causes ENOTSUPP, we allocate a spare bio...
735 */ 788 */
736 struct bio *bio = bio_alloc(GFP_NOIO, 1); 789 struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, mddev);
737 int rw = REQ_WRITE | REQ_SYNC | REQ_UNPLUG;
738 790
739 bio->bi_bdev = rdev->bdev; 791 bio->bi_bdev = rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev;
740 bio->bi_sector = sector; 792 bio->bi_sector = sector;
741 bio_add_page(bio, page, size, 0); 793 bio_add_page(bio, page, size, 0);
742 bio->bi_private = rdev; 794 bio->bi_private = rdev;
743 bio->bi_end_io = super_written; 795 bio->bi_end_io = super_written;
744 bio->bi_rw = rw;
745 796
746 atomic_inc(&mddev->pending_writes); 797 atomic_inc(&mddev->pending_writes);
747 if (!test_bit(BarriersNotsupp, &rdev->flags)) { 798 submit_bio(REQ_WRITE | REQ_SYNC | REQ_FLUSH | REQ_FUA, bio);
748 struct bio *rbio;
749 rw |= REQ_HARDBARRIER;
750 rbio = bio_clone(bio, GFP_NOIO);
751 rbio->bi_private = bio;
752 rbio->bi_end_io = super_written_barrier;
753 submit_bio(rw, rbio);
754 } else
755 submit_bio(rw, bio);
756} 799}
757 800
758void md_super_wait(mddev_t *mddev) 801void md_super_wait(mddev_t *mddev)
759{ 802{
760 /* wait for all superblock writes that were scheduled to complete. 803 /* wait for all superblock writes that were scheduled to complete */
761 * if any had to be retried (due to BARRIER problems), retry them
762 */
763 DEFINE_WAIT(wq); 804 DEFINE_WAIT(wq);
764 for(;;) { 805 for(;;) {
765 prepare_to_wait(&mddev->sb_wait, &wq, TASK_UNINTERRUPTIBLE); 806 prepare_to_wait(&mddev->sb_wait, &wq, TASK_UNINTERRUPTIBLE);
766 if (atomic_read(&mddev->pending_writes)==0) 807 if (atomic_read(&mddev->pending_writes)==0)
767 break; 808 break;
768 while (mddev->biolist) {
769 struct bio *bio;
770 spin_lock_irq(&mddev->write_lock);
771 bio = mddev->biolist;
772 mddev->biolist = bio->bi_next ;
773 bio->bi_next = NULL;
774 spin_unlock_irq(&mddev->write_lock);
775 submit_bio(bio->bi_rw, bio);
776 }
777 schedule(); 809 schedule();
778 } 810 }
779 finish_wait(&mddev->sb_wait, &wq); 811 finish_wait(&mddev->sb_wait, &wq);
@@ -784,17 +816,21 @@ static void bi_complete(struct bio *bio, int error)
784 complete((struct completion*)bio->bi_private); 816 complete((struct completion*)bio->bi_private);
785} 817}
786 818
787int sync_page_io(struct block_device *bdev, sector_t sector, int size, 819int sync_page_io(mdk_rdev_t *rdev, sector_t sector, int size,
788 struct page *page, int rw) 820 struct page *page, int rw, bool metadata_op)
789{ 821{
790 struct bio *bio = bio_alloc(GFP_NOIO, 1); 822 struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, rdev->mddev);
791 struct completion event; 823 struct completion event;
792 int ret; 824 int ret;
793 825
794 rw |= REQ_SYNC | REQ_UNPLUG; 826 rw |= REQ_SYNC;
795 827
796 bio->bi_bdev = bdev; 828 bio->bi_bdev = (metadata_op && rdev->meta_bdev) ?
797 bio->bi_sector = sector; 829 rdev->meta_bdev : rdev->bdev;
830 if (metadata_op)
831 bio->bi_sector = sector + rdev->sb_start;
832 else
833 bio->bi_sector = sector + rdev->data_offset;
798 bio_add_page(bio, page, size, 0); 834 bio_add_page(bio, page, size, 0);
799 init_completion(&event); 835 init_completion(&event);
800 bio->bi_private = &event; 836 bio->bi_private = &event;
@@ -819,7 +855,7 @@ static int read_disk_sb(mdk_rdev_t * rdev, int size)
819 return 0; 855 return 0;
820 856
821 857
822 if (!sync_page_io(rdev->bdev, rdev->sb_start, size, rdev->sb_page, READ)) 858 if (!sync_page_io(rdev, 0, size, rdev->sb_page, READ, true))
823 goto fail; 859 goto fail;
824 rdev->sb_loaded = 1; 860 rdev->sb_loaded = 1;
825 return 0; 861 return 0;
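
sync_page_io() now takes the rdev itself, picks meta_bdev/sb_start for metadata reads and data_offset otherwise, and keeps the classic synchronous single-bio idiom built on a completion. A standalone sketch of that idiom as a generic wrapper; the function name and bool return are illustrative, not the md signature:

#include <linux/bio.h>
#include <linux/completion.h>

static void example_bi_complete(struct bio *bio, int error)
{
        complete((struct completion *)bio->bi_private);
}

/* Returns true if the single-page I/O completed successfully. */
static bool example_sync_page_io(struct block_device *bdev, sector_t sector,
                                 int size, struct page *page, int rw)
{
        struct bio *bio = bio_alloc(GFP_NOIO, 1);
        struct completion event;
        bool ok;

        bio->bi_bdev = bdev;
        bio->bi_sector = sector;
        bio_add_page(bio, page, size, 0);
        init_completion(&event);
        bio->bi_private = &event;
        bio->bi_end_io = example_bi_complete;
        submit_bio(rw | REQ_SYNC, bio);
        wait_for_completion(&event);

        ok = test_bit(BIO_UPTODATE, &bio->bi_flags);
        bio_put(bio);
        return ok;
}
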
@@ -981,7 +1017,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version
981 * 1017 *
982 * It also happens to be a multiple of 4Kb. 1018 * It also happens to be a multiple of 4Kb.
983 */ 1019 */
984 rdev->sb_start = calc_dev_sboffset(rdev->bdev); 1020 rdev->sb_start = calc_dev_sboffset(rdev);
985 1021
986 ret = read_disk_sb(rdev, MD_SB_BYTES); 1022 ret = read_disk_sb(rdev, MD_SB_BYTES);
987 if (ret) return ret; 1023 if (ret) return ret;
@@ -1070,7 +1106,6 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
1070 clear_bit(Faulty, &rdev->flags); 1106 clear_bit(Faulty, &rdev->flags);
1071 clear_bit(In_sync, &rdev->flags); 1107 clear_bit(In_sync, &rdev->flags);
1072 clear_bit(WriteMostly, &rdev->flags); 1108 clear_bit(WriteMostly, &rdev->flags);
1073 clear_bit(BarriersNotsupp, &rdev->flags);
1074 1109
1075 if (mddev->raid_disks == 0) { 1110 if (mddev->raid_disks == 0) {
1076 mddev->major_version = 0; 1111 mddev->major_version = 0;
@@ -1323,13 +1358,13 @@ super_90_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
1323 return 0; /* component must fit device */ 1358 return 0; /* component must fit device */
1324 if (rdev->mddev->bitmap_info.offset) 1359 if (rdev->mddev->bitmap_info.offset)
1325 return 0; /* can't move bitmap */ 1360 return 0; /* can't move bitmap */
1326 rdev->sb_start = calc_dev_sboffset(rdev->bdev); 1361 rdev->sb_start = calc_dev_sboffset(rdev);
1327 if (!num_sectors || num_sectors > rdev->sb_start) 1362 if (!num_sectors || num_sectors > rdev->sb_start)
1328 num_sectors = rdev->sb_start; 1363 num_sectors = rdev->sb_start;
1329 md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, 1364 md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
1330 rdev->sb_page); 1365 rdev->sb_page);
1331 md_super_wait(rdev->mddev); 1366 md_super_wait(rdev->mddev);
1332 return num_sectors / 2; /* kB for sysfs */ 1367 return num_sectors;
1333} 1368}
1334 1369
1335 1370
@@ -1378,7 +1413,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
1378 */ 1413 */
1379 switch(minor_version) { 1414 switch(minor_version) {
1380 case 0: 1415 case 0:
1381 sb_start = rdev->bdev->bd_inode->i_size >> 9; 1416 sb_start = i_size_read(rdev->bdev->bd_inode) >> 9;
1382 sb_start -= 8*2; 1417 sb_start -= 8*2;
1383 sb_start &= ~(sector_t)(4*2-1); 1418 sb_start &= ~(sector_t)(4*2-1);
1384 break; 1419 break;
@@ -1464,7 +1499,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
1464 ret = 0; 1499 ret = 0;
1465 } 1500 }
1466 if (minor_version) 1501 if (minor_version)
1467 rdev->sectors = (rdev->bdev->bd_inode->i_size >> 9) - 1502 rdev->sectors = (i_size_read(rdev->bdev->bd_inode) >> 9) -
1468 le64_to_cpu(sb->data_offset); 1503 le64_to_cpu(sb->data_offset);
1469 else 1504 else
1470 rdev->sectors = rdev->sb_start; 1505 rdev->sectors = rdev->sb_start;
@@ -1485,7 +1520,6 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
1485 clear_bit(Faulty, &rdev->flags); 1520 clear_bit(Faulty, &rdev->flags);
1486 clear_bit(In_sync, &rdev->flags); 1521 clear_bit(In_sync, &rdev->flags);
1487 clear_bit(WriteMostly, &rdev->flags); 1522 clear_bit(WriteMostly, &rdev->flags);
1488 clear_bit(BarriersNotsupp, &rdev->flags);
1489 1523
1490 if (mddev->raid_disks == 0) { 1524 if (mddev->raid_disks == 0) {
1491 mddev->major_version = 1; 1525 mddev->major_version = 1;
@@ -1673,7 +1707,7 @@ super_1_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
1673 return 0; /* component must fit device */ 1707 return 0; /* component must fit device */
1674 if (rdev->sb_start < rdev->data_offset) { 1708 if (rdev->sb_start < rdev->data_offset) {
1675 /* minor versions 1 and 2; superblock before data */ 1709 /* minor versions 1 and 2; superblock before data */
1676 max_sectors = rdev->bdev->bd_inode->i_size >> 9; 1710 max_sectors = i_size_read(rdev->bdev->bd_inode) >> 9;
1677 max_sectors -= rdev->data_offset; 1711 max_sectors -= rdev->data_offset;
1678 if (!num_sectors || num_sectors > max_sectors) 1712 if (!num_sectors || num_sectors > max_sectors)
1679 num_sectors = max_sectors; 1713 num_sectors = max_sectors;
@@ -1683,7 +1717,7 @@ super_1_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
1683 } else { 1717 } else {
1684 /* minor version 0; superblock after data */ 1718 /* minor version 0; superblock after data */
1685 sector_t sb_start; 1719 sector_t sb_start;
1686 sb_start = (rdev->bdev->bd_inode->i_size >> 9) - 8*2; 1720 sb_start = (i_size_read(rdev->bdev->bd_inode) >> 9) - 8*2;
1687 sb_start &= ~(sector_t)(4*2 - 1); 1721 sb_start &= ~(sector_t)(4*2 - 1);
1688 max_sectors = rdev->sectors + sb_start - rdev->sb_start; 1722 max_sectors = rdev->sectors + sb_start - rdev->sb_start;
1689 if (!num_sectors || num_sectors > max_sectors) 1723 if (!num_sectors || num_sectors > max_sectors)
@@ -1697,7 +1731,7 @@ super_1_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
1697 md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, 1731 md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
1698 rdev->sb_page); 1732 rdev->sb_page);
1699 md_super_wait(rdev->mddev); 1733 md_super_wait(rdev->mddev);
1700 return num_sectors / 2; /* kB for sysfs */ 1734 return num_sectors;
1701} 1735}
1702 1736
1703static struct super_type super_types[] = { 1737static struct super_type super_types[] = {
@@ -1719,6 +1753,18 @@ static struct super_type super_types[] = {
1719 }, 1753 },
1720}; 1754};
1721 1755
1756static void sync_super(mddev_t *mddev, mdk_rdev_t *rdev)
1757{
1758 if (mddev->sync_super) {
1759 mddev->sync_super(mddev, rdev);
1760 return;
1761 }
1762
1763 BUG_ON(mddev->major_version >= ARRAY_SIZE(super_types));
1764
1765 super_types[mddev->major_version].sync_super(mddev, rdev);
1766}
1767
1722static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2) 1768static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
1723{ 1769{
1724 mdk_rdev_t *rdev, *rdev2; 1770 mdk_rdev_t *rdev, *rdev2;
@@ -1750,20 +1796,14 @@ int md_integrity_register(mddev_t *mddev)
1750 1796
1751 if (list_empty(&mddev->disks)) 1797 if (list_empty(&mddev->disks))
1752 return 0; /* nothing to do */ 1798 return 0; /* nothing to do */
1753 if (blk_get_integrity(mddev->gendisk)) 1799 if (!mddev->gendisk || blk_get_integrity(mddev->gendisk))
1754 return 0; /* already registered */ 1800 return 0; /* shouldn't register, or already is */
1755 list_for_each_entry(rdev, &mddev->disks, same_set) { 1801 list_for_each_entry(rdev, &mddev->disks, same_set) {
1756 /* skip spares and non-functional disks */ 1802 /* skip spares and non-functional disks */
1757 if (test_bit(Faulty, &rdev->flags)) 1803 if (test_bit(Faulty, &rdev->flags))
1758 continue; 1804 continue;
1759 if (rdev->raid_disk < 0) 1805 if (rdev->raid_disk < 0)
1760 continue; 1806 continue;
1761 /*
1762 * If at least one rdev is not integrity capable, we can not
1763 * enable data integrity for the md device.
1764 */
1765 if (!bdev_get_integrity(rdev->bdev))
1766 return -EINVAL;
1767 if (!reference) { 1807 if (!reference) {
1768 /* Use the first rdev as the reference */ 1808 /* Use the first rdev as the reference */
1769 reference = rdev; 1809 reference = rdev;
@@ -1774,6 +1814,8 @@ int md_integrity_register(mddev_t *mddev)
1774 rdev->bdev->bd_disk) < 0) 1814 rdev->bdev->bd_disk) < 0)
1775 return -EINVAL; 1815 return -EINVAL;
1776 } 1816 }
1817 if (!reference || !bdev_get_integrity(reference->bdev))
1818 return 0;
1777 /* 1819 /*
1778 * All component devices are integrity capable and have matching 1820 * All component devices are integrity capable and have matching
1779 * profiles, register the common profile for the md device. 1821 * profiles, register the common profile for the md device.
@@ -1784,8 +1826,12 @@ int md_integrity_register(mddev_t *mddev)
1784 mdname(mddev)); 1826 mdname(mddev));
1785 return -EINVAL; 1827 return -EINVAL;
1786 } 1828 }
1787 printk(KERN_NOTICE "md: data integrity on %s enabled\n", 1829 printk(KERN_NOTICE "md: data integrity enabled on %s\n", mdname(mddev));
1788 mdname(mddev)); 1830 if (bioset_integrity_create(mddev->bio_set, BIO_POOL_SIZE)) {
1831 printk(KERN_ERR "md: failed to create integrity pool for %s\n",
1832 mdname(mddev));
1833 return -EINVAL;
1834 }
1789 return 0; 1835 return 0;
1790} 1836}
1791EXPORT_SYMBOL(md_integrity_register); 1837EXPORT_SYMBOL(md_integrity_register);
@@ -1873,7 +1919,7 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
1873 rdev->sysfs_state = sysfs_get_dirent_safe(rdev->kobj.sd, "state"); 1919 rdev->sysfs_state = sysfs_get_dirent_safe(rdev->kobj.sd, "state");
1874 1920
1875 list_add_rcu(&rdev->same_set, &mddev->disks); 1921 list_add_rcu(&rdev->same_set, &mddev->disks);
1876 bd_claim_by_disk(rdev->bdev, rdev->bdev->bd_holder, mddev->gendisk); 1922 bd_link_disk_holder(rdev->bdev, mddev->gendisk);
1877 1923
1878 /* May as well allow recovery to be retried once */ 1924 /* May as well allow recovery to be retried once */
1879 mddev->recovery_disabled = 0; 1925 mddev->recovery_disabled = 0;
@@ -1900,7 +1946,7 @@ static void unbind_rdev_from_array(mdk_rdev_t * rdev)
1900 MD_BUG(); 1946 MD_BUG();
1901 return; 1947 return;
1902 } 1948 }
1903 bd_release_from_disk(rdev->bdev, rdev->mddev->gendisk); 1949 bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk);
1904 list_del_rcu(&rdev->same_set); 1950 list_del_rcu(&rdev->same_set);
1905 printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b)); 1951 printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b));
1906 rdev->mddev = NULL; 1952 rdev->mddev = NULL;
@@ -1914,7 +1960,7 @@ static void unbind_rdev_from_array(mdk_rdev_t * rdev)
1914 synchronize_rcu(); 1960 synchronize_rcu();
1915 INIT_WORK(&rdev->del_work, md_delayed_delete); 1961 INIT_WORK(&rdev->del_work, md_delayed_delete);
1916 kobject_get(&rdev->kobj); 1962 kobject_get(&rdev->kobj);
1917 schedule_work(&rdev->del_work); 1963 queue_work(md_misc_wq, &rdev->del_work);
1918} 1964}
1919 1965
1920/* 1966/*
@@ -1928,21 +1974,13 @@ static int lock_rdev(mdk_rdev_t *rdev, dev_t dev, int shared)
1928 struct block_device *bdev; 1974 struct block_device *bdev;
1929 char b[BDEVNAME_SIZE]; 1975 char b[BDEVNAME_SIZE];
1930 1976
1931 bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE); 1977 bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL,
1978 shared ? (mdk_rdev_t *)lock_rdev : rdev);
1932 if (IS_ERR(bdev)) { 1979 if (IS_ERR(bdev)) {
1933 printk(KERN_ERR "md: could not open %s.\n", 1980 printk(KERN_ERR "md: could not open %s.\n",
1934 __bdevname(dev, b)); 1981 __bdevname(dev, b));
1935 return PTR_ERR(bdev); 1982 return PTR_ERR(bdev);
1936 } 1983 }
1937 err = bd_claim(bdev, shared ? (mdk_rdev_t *)lock_rdev : rdev);
1938 if (err) {
1939 printk(KERN_ERR "md: could not bd_claim %s.\n",
1940 bdevname(bdev, b));
1941 blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
1942 return err;
1943 }
1944 if (!shared)
1945 set_bit(AllReserved, &rdev->flags);
1946 rdev->bdev = bdev; 1984 rdev->bdev = bdev;
1947 return err; 1985 return err;
1948} 1986}
@@ -1953,8 +1991,7 @@ static void unlock_rdev(mdk_rdev_t *rdev)
1953 rdev->bdev = NULL; 1991 rdev->bdev = NULL;
1954 if (!bdev) 1992 if (!bdev)
1955 MD_BUG(); 1993 MD_BUG();
1956 bd_release(bdev); 1994 blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
1957 blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
1958} 1995}
1959 1996
1960void md_autodetect_dev(dev_t dev); 1997void md_autodetect_dev(dev_t dev);
@@ -2146,8 +2183,7 @@ static void sync_sbs(mddev_t * mddev, int nospares)
2146 /* Don't update this superblock */ 2183 /* Don't update this superblock */
2147 rdev->sb_loaded = 2; 2184 rdev->sb_loaded = 2;
2148 } else { 2185 } else {
2149 super_types[mddev->major_version]. 2186 sync_super(mddev, rdev);
2150 sync_super(mddev, rdev);
2151 rdev->sb_loaded = 1; 2187 rdev->sb_loaded = 1;
2152 } 2188 }
2153 } 2189 }
@@ -2172,6 +2208,8 @@ repeat:
2172 if (!mddev->persistent) { 2208 if (!mddev->persistent) {
2173 clear_bit(MD_CHANGE_CLEAN, &mddev->flags); 2209 clear_bit(MD_CHANGE_CLEAN, &mddev->flags);
2174 clear_bit(MD_CHANGE_DEVS, &mddev->flags); 2210 clear_bit(MD_CHANGE_DEVS, &mddev->flags);
2211 if (!mddev->external)
2212 clear_bit(MD_CHANGE_PENDING, &mddev->flags);
2175 wake_up(&mddev->sb_wait); 2213 wake_up(&mddev->sb_wait);
2176 return; 2214 return;
2177 } 2215 }
@@ -2438,7 +2476,7 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2438 if (rdev->raid_disk == -1) 2476 if (rdev->raid_disk == -1)
2439 return -EEXIST; 2477 return -EEXIST;
2440 /* personality does all needed checks */ 2478 /* personality does all needed checks */
2441 if (rdev->mddev->pers->hot_add_disk == NULL) 2479 if (rdev->mddev->pers->hot_remove_disk == NULL)
2442 return -EINVAL; 2480 return -EINVAL;
2443 err = rdev->mddev->pers-> 2481 err = rdev->mddev->pers->
2444 hot_remove_disk(rdev->mddev, rdev->raid_disk); 2482 hot_remove_disk(rdev->mddev, rdev->raid_disk);
@@ -2458,6 +2496,9 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2458 if (rdev->raid_disk != -1) 2496 if (rdev->raid_disk != -1)
2459 return -EBUSY; 2497 return -EBUSY;
2460 2498
2499 if (test_bit(MD_RECOVERY_RUNNING, &rdev->mddev->recovery))
2500 return -EBUSY;
2501
2461 if (rdev->mddev->pers->hot_add_disk == NULL) 2502 if (rdev->mddev->pers->hot_add_disk == NULL)
2462 return -EINVAL; 2503 return -EINVAL;
2463 2504
@@ -2465,6 +2506,10 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2465 if (rdev2->raid_disk == slot) 2506 if (rdev2->raid_disk == slot)
2466 return -EEXIST; 2507 return -EEXIST;
2467 2508
2509 if (slot >= rdev->mddev->raid_disks &&
2510 slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks)
2511 return -ENOSPC;
2512
2468 rdev->raid_disk = slot; 2513 rdev->raid_disk = slot;
2469 if (test_bit(In_sync, &rdev->flags)) 2514 if (test_bit(In_sync, &rdev->flags))
2470 rdev->saved_raid_disk = slot; 2515 rdev->saved_raid_disk = slot;
@@ -2482,7 +2527,8 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2482 /* failure here is OK */; 2527 /* failure here is OK */;
2483 /* don't wakeup anyone, leave that to userspace. */ 2528 /* don't wakeup anyone, leave that to userspace. */
2484 } else { 2529 } else {
2485 if (slot >= rdev->mddev->raid_disks) 2530 if (slot >= rdev->mddev->raid_disks &&
2531 slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks)
2486 return -ENOSPC; 2532 return -ENOSPC;
2487 rdev->raid_disk = slot; 2533 rdev->raid_disk = slot;
2488 /* assume it is working */ 2534 /* assume it is working */
@@ -2575,7 +2621,7 @@ rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2575 if (!sectors) 2621 if (!sectors)
2576 return -EBUSY; 2622 return -EBUSY;
2577 } else if (!sectors) 2623 } else if (!sectors)
2578 sectors = (rdev->bdev->bd_inode->i_size >> 9) - 2624 sectors = (i_size_read(rdev->bdev->bd_inode) >> 9) -
2579 rdev->data_offset; 2625 rdev->data_offset;
2580 } 2626 }
2581 if (sectors < my_mddev->dev_sectors) 2627 if (sectors < my_mddev->dev_sectors)
@@ -2598,12 +2644,11 @@ rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2598 2644
2599 mddev_lock(mddev); 2645 mddev_lock(mddev);
2600 list_for_each_entry(rdev2, &mddev->disks, same_set) 2646 list_for_each_entry(rdev2, &mddev->disks, same_set)
2601 if (test_bit(AllReserved, &rdev2->flags) || 2647 if (rdev->bdev == rdev2->bdev &&
2602 (rdev->bdev == rdev2->bdev && 2648 rdev != rdev2 &&
2603 rdev != rdev2 && 2649 overlaps(rdev->data_offset, rdev->sectors,
2604 overlaps(rdev->data_offset, rdev->sectors, 2650 rdev2->data_offset,
2605 rdev2->data_offset, 2651 rdev2->sectors)) {
2606 rdev2->sectors))) {
2607 overlap = 1; 2652 overlap = 1;
2608 break; 2653 break;
2609 } 2654 }
@@ -2788,7 +2833,7 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi
2788 2833
2789 kobject_init(&rdev->kobj, &rdev_ktype); 2834 kobject_init(&rdev->kobj, &rdev_ktype);
2790 2835
2791 size = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; 2836 size = i_size_read(rdev->bdev->bd_inode) >> BLOCK_SIZE_BITS;
2792 if (!size) { 2837 if (!size) {
2793 printk(KERN_WARNING 2838 printk(KERN_WARNING
2794 "md: %s has zero or unknown size, marking faulty!\n", 2839 "md: %s has zero or unknown size, marking faulty!\n",
@@ -3107,7 +3152,7 @@ level_store(mddev_t *mddev, const char *buf, size_t len)
3107 char nm[20]; 3152 char nm[20];
3108 if (rdev->raid_disk < 0) 3153 if (rdev->raid_disk < 0)
3109 continue; 3154 continue;
3110 if (rdev->new_raid_disk > mddev->raid_disks) 3155 if (rdev->new_raid_disk >= mddev->raid_disks)
3111 rdev->new_raid_disk = -1; 3156 rdev->new_raid_disk = -1;
3112 if (rdev->new_raid_disk == rdev->raid_disk) 3157 if (rdev->new_raid_disk == rdev->raid_disk)
3113 continue; 3158 continue;
@@ -3139,6 +3184,7 @@ level_store(mddev_t *mddev, const char *buf, size_t len)
3139 mddev->layout = mddev->new_layout; 3184 mddev->layout = mddev->new_layout;
3140 mddev->chunk_sectors = mddev->new_chunk_sectors; 3185 mddev->chunk_sectors = mddev->new_chunk_sectors;
3141 mddev->delta_disks = 0; 3186 mddev->delta_disks = 0;
3187 mddev->degraded = 0;
3142 if (mddev->pers->sync_request == NULL) { 3188 if (mddev->pers->sync_request == NULL) {
3143 /* this is now an array without redundancy, so 3189 /* this is now an array without redundancy, so
3144 * it must always be in_sync 3190 * it must always be in_sync
@@ -3292,7 +3338,7 @@ resync_start_store(mddev_t *mddev, const char *buf, size_t len)
3292 char *e; 3338 char *e;
3293 unsigned long long n = simple_strtoull(buf, &e, 10); 3339 unsigned long long n = simple_strtoull(buf, &e, 10);
3294 3340
3295 if (mddev->pers) 3341 if (mddev->pers && !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
3296 return -EBUSY; 3342 return -EBUSY;
3297 if (cmd_match(buf, "none")) 3343 if (cmd_match(buf, "none"))
3298 n = MaxSector; 3344 n = MaxSector;
@@ -3736,6 +3782,8 @@ action_show(mddev_t *mddev, char *page)
3736 return sprintf(page, "%s\n", type); 3782 return sprintf(page, "%s\n", type);
3737} 3783}
3738 3784
3785static void reap_sync_thread(mddev_t *mddev);
3786
3739static ssize_t 3787static ssize_t
3740action_store(mddev_t *mddev, const char *page, size_t len) 3788action_store(mddev_t *mddev, const char *page, size_t len)
3741{ 3789{
@@ -3750,9 +3798,7 @@ action_store(mddev_t *mddev, const char *page, size_t len)
3750 if (cmd_match(page, "idle") || cmd_match(page, "frozen")) { 3798 if (cmd_match(page, "idle") || cmd_match(page, "frozen")) {
3751 if (mddev->sync_thread) { 3799 if (mddev->sync_thread) {
3752 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 3800 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
3753 md_unregister_thread(mddev->sync_thread); 3801 reap_sync_thread(mddev);
3754 mddev->sync_thread = NULL;
3755 mddev->recovery = 0;
3756 } 3802 }
3757 } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 3803 } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
3758 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) 3804 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
@@ -3904,7 +3950,7 @@ static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed);
3904static ssize_t 3950static ssize_t
3905sync_completed_show(mddev_t *mddev, char *page) 3951sync_completed_show(mddev_t *mddev, char *page)
3906{ 3952{
3907 unsigned long max_sectors, resync; 3953 unsigned long long max_sectors, resync;
3908 3954
3909 if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 3955 if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
3910 return sprintf(page, "none\n"); 3956 return sprintf(page, "none\n");
@@ -3915,7 +3961,7 @@ sync_completed_show(mddev_t *mddev, char *page)
3915 max_sectors = mddev->dev_sectors; 3961 max_sectors = mddev->dev_sectors;
3916 3962
3917 resync = mddev->curr_resync_completed; 3963 resync = mddev->curr_resync_completed;
3918 return sprintf(page, "%lu / %lu\n", resync, max_sectors); 3964 return sprintf(page, "%llu / %llu\n", resync, max_sectors);
3919} 3965}
3920 3966
3921static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed); 3967static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed);
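The two sync_completed_show() hunks above widen max_sectors and resync from unsigned long to unsigned long long and switch the format to %llu: on 32-bit builds unsigned long is only 32 bits, so sector counts past 2^32 (arrays over 2 TiB) would otherwise wrap in this sysfs file. A standalone illustration of the width issue (plain C, not kernel code; the values are made up):

#include <stdio.h>

int main(void)
{
    /* 5Gi sectors (2.5 TiB) does not fit in a 32-bit unsigned long. */
    unsigned long long resync      = 5ULL * 1024 * 1024 * 1024;
    unsigned long long max_sectors = 8ULL * 1024 * 1024 * 1024;

    /* "%lu" with 32-bit longs would truncate these values; "%llu" with
     * unsigned long long is correct on both 32- and 64-bit builds. */
    printf("%llu / %llu\n", resync, max_sectors);
    return 0;
}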
@@ -4002,19 +4048,24 @@ suspend_lo_store(mddev_t *mddev, const char *buf, size_t len)
4002{ 4048{
4003 char *e; 4049 char *e;
4004 unsigned long long new = simple_strtoull(buf, &e, 10); 4050 unsigned long long new = simple_strtoull(buf, &e, 10);
4051 unsigned long long old = mddev->suspend_lo;
4005 4052
4006 if (mddev->pers == NULL || 4053 if (mddev->pers == NULL ||
4007 mddev->pers->quiesce == NULL) 4054 mddev->pers->quiesce == NULL)
4008 return -EINVAL; 4055 return -EINVAL;
4009 if (buf == e || (*e && *e != '\n')) 4056 if (buf == e || (*e && *e != '\n'))
4010 return -EINVAL; 4057 return -EINVAL;
4011 if (new >= mddev->suspend_hi || 4058
4012 (new > mddev->suspend_lo && new < mddev->suspend_hi)) { 4059 mddev->suspend_lo = new;
4013 mddev->suspend_lo = new; 4060 if (new >= old)
4061 /* Shrinking suspended region */
4014 mddev->pers->quiesce(mddev, 2); 4062 mddev->pers->quiesce(mddev, 2);
4015 return len; 4063 else {
4016 } else 4064 /* Expanding suspended region - need to wait */
4017 return -EINVAL; 4065 mddev->pers->quiesce(mddev, 1);
4066 mddev->pers->quiesce(mddev, 0);
4067 }
4068 return len;
4018} 4069}
4019static struct md_sysfs_entry md_suspend_lo = 4070static struct md_sysfs_entry md_suspend_lo =
4020__ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store); 4071__ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store);
@@ -4031,20 +4082,24 @@ suspend_hi_store(mddev_t *mddev, const char *buf, size_t len)
4031{ 4082{
4032 char *e; 4083 char *e;
4033 unsigned long long new = simple_strtoull(buf, &e, 10); 4084 unsigned long long new = simple_strtoull(buf, &e, 10);
4085 unsigned long long old = mddev->suspend_hi;
4034 4086
4035 if (mddev->pers == NULL || 4087 if (mddev->pers == NULL ||
4036 mddev->pers->quiesce == NULL) 4088 mddev->pers->quiesce == NULL)
4037 return -EINVAL; 4089 return -EINVAL;
4038 if (buf == e || (*e && *e != '\n')) 4090 if (buf == e || (*e && *e != '\n'))
4039 return -EINVAL; 4091 return -EINVAL;
4040 if ((new <= mddev->suspend_lo && mddev->suspend_lo >= mddev->suspend_hi) || 4092
4041 (new > mddev->suspend_lo && new > mddev->suspend_hi)) { 4093 mddev->suspend_hi = new;
4042 mddev->suspend_hi = new; 4094 if (new <= old)
4095 /* Shrinking suspended region */
4096 mddev->pers->quiesce(mddev, 2);
4097 else {
4098 /* Expanding suspended region - need to wait */
4043 mddev->pers->quiesce(mddev, 1); 4099 mddev->pers->quiesce(mddev, 1);
4044 mddev->pers->quiesce(mddev, 0); 4100 mddev->pers->quiesce(mddev, 0);
4045 return len; 4101 }
4046 } else 4102 return len;
4047 return -EINVAL;
4048} 4103}
4049static struct md_sysfs_entry md_suspend_hi = 4104static struct md_sysfs_entry md_suspend_hi =
4050__ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store); 4105__ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store);
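The suspend_lo/suspend_hi stores above now always accept the new bound and then choose the quiesce call from how the suspended region moved: shrinking it only needs quiesce(mddev, 2), which re-checks waiters, while growing it needs the quiesce(1)/quiesce(0) pair so writes already in flight inside the newly covered range drain first. A rough userspace model of that decision follows; the region type and the two helper functions are stand-ins, not the md API:

#include <stdio.h>

struct region { unsigned long long lo, hi; };

/* Stand-ins for mddev->pers->quiesce(mddev, n). */
static void rewake_waiters(void)       { puts("quiesce(2): re-check waiters"); }
static void wait_for_inflight_io(void) { puts("quiesce(1)+quiesce(0): drain I/O"); }

static void store_suspend_lo(struct region *r, unsigned long long new_lo)
{
    unsigned long long old = r->lo;

    r->lo = new_lo;                 /* publish the new bound first */
    if (new_lo >= old)
        rewake_waiters();           /* region shrank (or stayed put) */
    else
        wait_for_inflight_io();     /* region grew downwards */
}

static void store_suspend_hi(struct region *r, unsigned long long new_hi)
{
    unsigned long long old = r->hi;

    r->hi = new_hi;
    if (new_hi <= old)
        rewake_waiters();           /* region shrank */
    else
        wait_for_inflight_io();     /* region grew upwards */
}

int main(void)
{
    struct region r = { 100, 200 };
    store_suspend_lo(&r, 150);      /* shrink from below */
    store_suspend_hi(&r, 300);      /* grow from above */
    return 0;
}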
@@ -4112,10 +4167,10 @@ array_size_store(mddev_t *mddev, const char *buf, size_t len)
4112 } 4167 }
4113 4168
4114 mddev->array_sectors = sectors; 4169 mddev->array_sectors = sectors;
4115 set_capacity(mddev->gendisk, mddev->array_sectors); 4170 if (mddev->pers) {
4116 if (mddev->pers) 4171 set_capacity(mddev->gendisk, mddev->array_sectors);
4117 revalidate_disk(mddev->gendisk); 4172 revalidate_disk(mddev->gendisk);
4118 4173 }
4119 return len; 4174 return len;
4120} 4175}
4121 4176
@@ -4256,10 +4311,10 @@ static int md_alloc(dev_t dev, char *name)
4256 shift = partitioned ? MdpMinorShift : 0; 4311 shift = partitioned ? MdpMinorShift : 0;
4257 unit = MINOR(mddev->unit) >> shift; 4312 unit = MINOR(mddev->unit) >> shift;
4258 4313
4259 /* wait for any previous instance if this device 4314 /* wait for any previous instance of this device to be
4260 * to be completed removed (mddev_delayed_delete). 4315 * completely removed (mddev_delayed_delete).
4261 */ 4316 */
4262 flush_scheduled_work(); 4317 flush_workqueue(md_misc_wq);
4263 4318
4264 mutex_lock(&disks_mutex); 4319 mutex_lock(&disks_mutex);
4265 error = -EEXIST; 4320 error = -EEXIST;
@@ -4287,9 +4342,6 @@ static int md_alloc(dev_t dev, char *name)
4287 goto abort; 4342 goto abort;
4288 mddev->queue->queuedata = mddev; 4343 mddev->queue->queuedata = mddev;
4289 4344
4290 /* Can be unlocked because the queue is new: no concurrency */
4291 queue_flag_set_unlocked(QUEUE_FLAG_CLUSTER, mddev->queue);
4292
4293 blk_queue_make_request(mddev->queue, md_make_request); 4345 blk_queue_make_request(mddev->queue, md_make_request);
4294 4346
4295 disk = alloc_disk(1 << shift); 4347 disk = alloc_disk(1 << shift);
@@ -4309,13 +4361,19 @@ static int md_alloc(dev_t dev, char *name)
4309 disk->fops = &md_fops; 4361 disk->fops = &md_fops;
4310 disk->private_data = mddev; 4362 disk->private_data = mddev;
4311 disk->queue = mddev->queue; 4363 disk->queue = mddev->queue;
4364 blk_queue_flush(mddev->queue, REQ_FLUSH | REQ_FUA);
4312 /* Allow extended partitions. This makes the 4365 /* Allow extended partitions. This makes the
4313 * 'mdp' device redundant, but we can't really 4366 * 'mdp' device redundant, but we can't really
4314 * remove it now. 4367 * remove it now.
4315 */ 4368 */
4316 disk->flags |= GENHD_FL_EXT_DEVT; 4369 disk->flags |= GENHD_FL_EXT_DEVT;
4317 add_disk(disk);
4318 mddev->gendisk = disk; 4370 mddev->gendisk = disk;
4371 /* As soon as we call add_disk(), another thread could get
4372 * through to md_open, so make sure it doesn't get too far
4373 */
4374 mutex_lock(&mddev->open_mutex);
4375 add_disk(disk);
4376
4319 error = kobject_init_and_add(&mddev->kobj, &md_ktype, 4377 error = kobject_init_and_add(&mddev->kobj, &md_ktype,
4320 &disk_to_dev(disk)->kobj, "%s", "md"); 4378 &disk_to_dev(disk)->kobj, "%s", "md");
4321 if (error) { 4379 if (error) {
@@ -4329,6 +4387,7 @@ static int md_alloc(dev_t dev, char *name)
4329 if (mddev->kobj.sd && 4387 if (mddev->kobj.sd &&
4330 sysfs_create_group(&mddev->kobj, &md_bitmap_group)) 4388 sysfs_create_group(&mddev->kobj, &md_bitmap_group))
4331 printk(KERN_DEBUG "pointless warning\n"); 4389 printk(KERN_DEBUG "pointless warning\n");
4390 mutex_unlock(&mddev->open_mutex);
4332 abort: 4391 abort:
4333 mutex_unlock(&disks_mutex); 4392 mutex_unlock(&disks_mutex);
4334 if (!error && mddev->kobj.sd) { 4393 if (!error && mddev->kobj.sd) {
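The md_alloc() hunks above take mddev->open_mutex before add_disk() and release it only after the md kobject and bitmap group exist, because a concurrent md_open() can find the disk the moment add_disk() returns. A minimal pthread model of that publish-under-lock ordering (illustrative variables, not the kernel's types; compile with -pthread):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t open_mutex = PTHREAD_MUTEX_INITIALIZER;
static bool disk_visible;   /* models add_disk() having run */
static bool sysfs_ready;    /* models kobject/sysfs setup */

static void *opener(void *arg)
{
    (void)arg;
    pthread_mutex_lock(&open_mutex);    /* md_open() takes open_mutex */
    if (disk_visible)
        printf("open sees sysfs_ready=%d\n", sysfs_ready);   /* always 1 */
    pthread_mutex_unlock(&open_mutex);
    return NULL;
}

int main(void)
{
    pthread_t t;

    pthread_mutex_lock(&open_mutex);    /* taken before add_disk() */
    disk_visible = true;                /* add_disk(): device becomes findable */
    pthread_create(&t, NULL, opener, NULL);
    sysfs_ready = true;                 /* kobject_init_and_add() etc. */
    pthread_mutex_unlock(&open_mutex);  /* only now can the opener proceed */

    pthread_join(t, NULL);
    return 0;
}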
@@ -4423,7 +4482,9 @@ int md_run(mddev_t *mddev)
4423 * We don't want the data to overlap the metadata, 4482 * We don't want the data to overlap the metadata,
4424 * Internal Bitmap issues have been handled elsewhere. 4483 * Internal Bitmap issues have been handled elsewhere.
4425 */ 4484 */
4426 if (rdev->data_offset < rdev->sb_start) { 4485 if (rdev->meta_bdev) {
4486 /* Nothing to check */;
4487 } else if (rdev->data_offset < rdev->sb_start) {
4427 if (mddev->dev_sectors && 4488 if (mddev->dev_sectors &&
4428 rdev->data_offset + mddev->dev_sectors 4489 rdev->data_offset + mddev->dev_sectors
4429 > rdev->sb_start) { 4490 > rdev->sb_start) {
@@ -4442,6 +4503,9 @@ int md_run(mddev_t *mddev)
4442 sysfs_notify_dirent_safe(rdev->sysfs_state); 4503 sysfs_notify_dirent_safe(rdev->sysfs_state);
4443 } 4504 }
4444 4505
4506 if (mddev->bio_set == NULL)
4507 mddev->bio_set = bioset_create(BIO_POOL_SIZE, sizeof(mddev));
4508
4445 spin_lock(&pers_lock); 4509 spin_lock(&pers_lock);
4446 pers = find_pers(mddev->level, mddev->clevel); 4510 pers = find_pers(mddev->level, mddev->clevel);
4447 if (!pers || !try_module_get(pers->owner)) { 4511 if (!pers || !try_module_get(pers->owner)) {
@@ -4504,7 +4568,6 @@ int md_run(mddev_t *mddev)
4504 /* may be over-ridden by personality */ 4568 /* may be over-ridden by personality */
4505 mddev->resync_max_sectors = mddev->dev_sectors; 4569 mddev->resync_max_sectors = mddev->dev_sectors;
4506 4570
4507 mddev->barriers_work = 1;
4508 mddev->ok_start_degraded = start_dirty_degraded; 4571 mddev->ok_start_degraded = start_dirty_degraded;
4509 4572
4510 if (start_readonly && mddev->ro == 0) 4573 if (start_readonly && mddev->ro == 0)
@@ -4555,7 +4618,8 @@ int md_run(mddev_t *mddev)
4555 mddev->safemode_timer.data = (unsigned long) mddev; 4618 mddev->safemode_timer.data = (unsigned long) mddev;
4556 mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */ 4619 mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */
4557 mddev->in_sync = 1; 4620 mddev->in_sync = 1;
4558 4621 smp_wmb();
4622 mddev->ready = 1;
4559 list_for_each_entry(rdev, &mddev->disks, same_set) 4623 list_for_each_entry(rdev, &mddev->disks, same_set)
4560 if (rdev->raid_disk >= 0) { 4624 if (rdev->raid_disk >= 0) {
4561 char nm[20]; 4625 char nm[20];
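The new mddev->ready flag is published with smp_wmb() after in_sync and the rest of the setup, so any path that observes ready == 1 also sees a fully initialised array (readers pair this with a matching read barrier). In portable C11 the same publish idiom is a release store paired with an acquire load; this is a sketch of the idiom, not the kernel code:

#include <stdatomic.h>
#include <stdio.h>

struct array_state {
    int in_sync;          /* ordinary data written before publication */
    atomic_int ready;     /* publication flag */
};

static void publish(struct array_state *s)
{
    s->in_sync = 1;                                  /* set up state first */
    atomic_store_explicit(&s->ready, 1,
                          memory_order_release);     /* ~ smp_wmb(); ready = 1 */
}

static void observe(struct array_state *s)
{
    if (atomic_load_explicit(&s->ready, memory_order_acquire))
        printf("in_sync=%d\n", s->in_sync);          /* guaranteed to print 1 */
}

int main(void)
{
    struct array_state s;

    s.in_sync = 0;
    atomic_init(&s.ready, 0);
    publish(&s);
    observe(&s);
    return 0;
}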
@@ -4569,9 +4633,6 @@ int md_run(mddev_t *mddev)
4569 if (mddev->flags) 4633 if (mddev->flags)
4570 md_update_sb(mddev, 0); 4634 md_update_sb(mddev, 0);
4571 4635
4572 md_wakeup_thread(mddev->thread);
4573 md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
4574
4575 md_new_event(mddev); 4636 md_new_event(mddev);
4576 sysfs_notify_dirent_safe(mddev->sysfs_state); 4637 sysfs_notify_dirent_safe(mddev->sysfs_state);
4577 sysfs_notify_dirent_safe(mddev->sysfs_action); 4638 sysfs_notify_dirent_safe(mddev->sysfs_action);
@@ -4592,8 +4653,13 @@ static int do_md_run(mddev_t *mddev)
4592 bitmap_destroy(mddev); 4653 bitmap_destroy(mddev);
4593 goto out; 4654 goto out;
4594 } 4655 }
4656
4657 md_wakeup_thread(mddev->thread);
4658 md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
4659
4595 set_capacity(mddev->gendisk, mddev->array_sectors); 4660 set_capacity(mddev->gendisk, mddev->array_sectors);
4596 revalidate_disk(mddev->gendisk); 4661 revalidate_disk(mddev->gendisk);
4662 mddev->changed = 1;
4597 kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); 4663 kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
4598out: 4664out:
4599 return err; 4665 return err;
@@ -4682,24 +4748,22 @@ static void md_clean(mddev_t *mddev)
4682 mddev->sync_speed_min = mddev->sync_speed_max = 0; 4748 mddev->sync_speed_min = mddev->sync_speed_max = 0;
4683 mddev->recovery = 0; 4749 mddev->recovery = 0;
4684 mddev->in_sync = 0; 4750 mddev->in_sync = 0;
4751 mddev->changed = 0;
4685 mddev->degraded = 0; 4752 mddev->degraded = 0;
4686 mddev->barriers_work = 0;
4687 mddev->safemode = 0; 4753 mddev->safemode = 0;
4688 mddev->bitmap_info.offset = 0; 4754 mddev->bitmap_info.offset = 0;
4689 mddev->bitmap_info.default_offset = 0; 4755 mddev->bitmap_info.default_offset = 0;
4690 mddev->bitmap_info.chunksize = 0; 4756 mddev->bitmap_info.chunksize = 0;
4691 mddev->bitmap_info.daemon_sleep = 0; 4757 mddev->bitmap_info.daemon_sleep = 0;
4692 mddev->bitmap_info.max_write_behind = 0; 4758 mddev->bitmap_info.max_write_behind = 0;
4693 mddev->plug = NULL;
4694} 4759}
4695 4760
4696void md_stop_writes(mddev_t *mddev) 4761static void __md_stop_writes(mddev_t *mddev)
4697{ 4762{
4698 if (mddev->sync_thread) { 4763 if (mddev->sync_thread) {
4699 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4764 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4700 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 4765 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
4701 md_unregister_thread(mddev->sync_thread); 4766 reap_sync_thread(mddev);
4702 mddev->sync_thread = NULL;
4703 } 4767 }
4704 4768
4705 del_timer_sync(&mddev->safemode_timer); 4769 del_timer_sync(&mddev->safemode_timer);
@@ -4713,10 +4777,18 @@ void md_stop_writes(mddev_t *mddev)
4713 md_update_sb(mddev, 1); 4777 md_update_sb(mddev, 1);
4714 } 4778 }
4715} 4779}
4780
4781void md_stop_writes(mddev_t *mddev)
4782{
4783 mddev_lock(mddev);
4784 __md_stop_writes(mddev);
4785 mddev_unlock(mddev);
4786}
4716EXPORT_SYMBOL_GPL(md_stop_writes); 4787EXPORT_SYMBOL_GPL(md_stop_writes);
4717 4788
4718void md_stop(mddev_t *mddev) 4789void md_stop(mddev_t *mddev)
4719{ 4790{
4791 mddev->ready = 0;
4720 mddev->pers->stop(mddev); 4792 mddev->pers->stop(mddev);
4721 if (mddev->pers->sync_request && mddev->to_remove == NULL) 4793 if (mddev->pers->sync_request && mddev->to_remove == NULL)
4722 mddev->to_remove = &md_redundancy_group; 4794 mddev->to_remove = &md_redundancy_group;
@@ -4736,7 +4808,7 @@ static int md_set_readonly(mddev_t *mddev, int is_open)
4736 goto out; 4808 goto out;
4737 } 4809 }
4738 if (mddev->pers) { 4810 if (mddev->pers) {
4739 md_stop_writes(mddev); 4811 __md_stop_writes(mddev);
4740 4812
4741 err = -ENXIO; 4813 err = -ENXIO;
4742 if (mddev->ro==1) 4814 if (mddev->ro==1)
@@ -4773,10 +4845,9 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
4773 if (mddev->ro) 4845 if (mddev->ro)
4774 set_disk_ro(disk, 0); 4846 set_disk_ro(disk, 0);
4775 4847
4776 md_stop_writes(mddev); 4848 __md_stop_writes(mddev);
4777 md_stop(mddev); 4849 md_stop(mddev);
4778 mddev->queue->merge_bvec_fn = NULL; 4850 mddev->queue->merge_bvec_fn = NULL;
4779 mddev->queue->unplug_fn = NULL;
4780 mddev->queue->backing_dev_info.congested_fn = NULL; 4851 mddev->queue->backing_dev_info.congested_fn = NULL;
4781 4852
4782 /* tell userspace to handle 'inactive' */ 4853 /* tell userspace to handle 'inactive' */
@@ -4791,6 +4862,7 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
4791 4862
4792 set_capacity(disk, 0); 4863 set_capacity(disk, 0);
4793 mutex_unlock(&mddev->open_mutex); 4864 mutex_unlock(&mddev->open_mutex);
4865 mddev->changed = 1;
4794 revalidate_disk(disk); 4866 revalidate_disk(disk);
4795 4867
4796 if (mddev->ro) 4868 if (mddev->ro)
@@ -5148,17 +5220,31 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
5148 PTR_ERR(rdev)); 5220 PTR_ERR(rdev));
5149 return PTR_ERR(rdev); 5221 return PTR_ERR(rdev);
5150 } 5222 }
5151 /* set save_raid_disk if appropriate */ 5223 /* set saved_raid_disk if appropriate */
5152 if (!mddev->persistent) { 5224 if (!mddev->persistent) {
5153 if (info->state & (1<<MD_DISK_SYNC) && 5225 if (info->state & (1<<MD_DISK_SYNC) &&
5154 info->raid_disk < mddev->raid_disks) 5226 info->raid_disk < mddev->raid_disks) {
5155 rdev->raid_disk = info->raid_disk; 5227 rdev->raid_disk = info->raid_disk;
5156 else 5228 set_bit(In_sync, &rdev->flags);
5229 } else
5157 rdev->raid_disk = -1; 5230 rdev->raid_disk = -1;
5158 } else 5231 } else
5159 super_types[mddev->major_version]. 5232 super_types[mddev->major_version].
5160 validate_super(mddev, rdev); 5233 validate_super(mddev, rdev);
5161 rdev->saved_raid_disk = rdev->raid_disk; 5234 if ((info->state & (1<<MD_DISK_SYNC)) &&
5235 (!test_bit(In_sync, &rdev->flags) ||
5236 rdev->raid_disk != info->raid_disk)) {
5237 /* This was a hot-add request, but events doesn't
5238 * match, so reject it.
5239 */
5240 export_rdev(rdev);
5241 return -EINVAL;
5242 }
5243
5244 if (test_bit(In_sync, &rdev->flags))
5245 rdev->saved_raid_disk = rdev->raid_disk;
5246 else
5247 rdev->saved_raid_disk = -1;
5162 5248
5163 clear_bit(In_sync, &rdev->flags); /* just to be sure */ 5249 clear_bit(In_sync, &rdev->flags); /* just to be sure */
5164 if (info->state & (1<<MD_DISK_WRITEMOSTLY)) 5250 if (info->state & (1<<MD_DISK_WRITEMOSTLY))
@@ -5188,6 +5274,8 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
5188 if (mddev->degraded) 5274 if (mddev->degraded)
5189 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 5275 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
5190 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5276 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5277 if (!err)
5278 md_new_event(mddev);
5191 md_wakeup_thread(mddev->thread); 5279 md_wakeup_thread(mddev->thread);
5192 return err; 5280 return err;
5193 } 5281 }
@@ -5225,9 +5313,9 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
5225 5313
5226 if (!mddev->persistent) { 5314 if (!mddev->persistent) {
5227 printk(KERN_INFO "md: nonpersistent superblock ...\n"); 5315 printk(KERN_INFO "md: nonpersistent superblock ...\n");
5228 rdev->sb_start = rdev->bdev->bd_inode->i_size / 512; 5316 rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512;
5229 } else 5317 } else
5230 rdev->sb_start = calc_dev_sboffset(rdev->bdev); 5318 rdev->sb_start = calc_dev_sboffset(rdev);
5231 rdev->sectors = rdev->sb_start; 5319 rdev->sectors = rdev->sb_start;
5232 5320
5233 err = bind_rdev_to_array(rdev, mddev); 5321 err = bind_rdev_to_array(rdev, mddev);
@@ -5294,9 +5382,9 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev)
5294 } 5382 }
5295 5383
5296 if (mddev->persistent) 5384 if (mddev->persistent)
5297 rdev->sb_start = calc_dev_sboffset(rdev->bdev); 5385 rdev->sb_start = calc_dev_sboffset(rdev);
5298 else 5386 else
5299 rdev->sb_start = rdev->bdev->bd_inode->i_size / 512; 5387 rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512;
5300 5388
5301 rdev->sectors = rdev->sb_start; 5389 rdev->sectors = rdev->sb_start;
5302 5390
@@ -5507,7 +5595,6 @@ static int update_size(mddev_t *mddev, sector_t num_sectors)
5507 * sb_start or, if that is <data_offset, it must fit before the size 5595 * sb_start or, if that is <data_offset, it must fit before the size
5508 * of each device. If num_sectors is zero, we find the largest size 5596 * of each device. If num_sectors is zero, we find the largest size
5509 * that fits. 5597 * that fits.
5510
5511 */ 5598 */
5512 if (mddev->sync_thread) 5599 if (mddev->sync_thread)
5513 return -EBUSY; 5600 return -EBUSY;
@@ -5544,6 +5631,8 @@ static int update_raid_disks(mddev_t *mddev, int raid_disks)
5544 mddev->delta_disks = raid_disks - mddev->raid_disks; 5631 mddev->delta_disks = raid_disks - mddev->raid_disks;
5545 5632
5546 rv = mddev->pers->check_reshape(mddev); 5633 rv = mddev->pers->check_reshape(mddev);
5634 if (rv < 0)
5635 mddev->delta_disks = 0;
5547 return rv; 5636 return rv;
5548} 5637}
5549 5638
@@ -5951,16 +6040,14 @@ static int md_open(struct block_device *bdev, fmode_t mode)
5951 mddev_t *mddev = mddev_find(bdev->bd_dev); 6040 mddev_t *mddev = mddev_find(bdev->bd_dev);
5952 int err; 6041 int err;
5953 6042
5954 lock_kernel();
5955 if (mddev->gendisk != bdev->bd_disk) { 6043 if (mddev->gendisk != bdev->bd_disk) {
5956 /* we are racing with mddev_put which is discarding this 6044 /* we are racing with mddev_put which is discarding this
5957 * bd_disk. 6045 * bd_disk.
5958 */ 6046 */
5959 mddev_put(mddev); 6047 mddev_put(mddev);
5960 /* Wait until bdev->bd_disk is definitely gone */ 6048 /* Wait until bdev->bd_disk is definitely gone */
5961 flush_scheduled_work(); 6049 flush_workqueue(md_misc_wq);
5962 /* Then retry the open from the top */ 6050 /* Then retry the open from the top */
5963 unlock_kernel();
5964 return -ERESTARTSYS; 6051 return -ERESTARTSYS;
5965 } 6052 }
5966 BUG_ON(mddev != bdev->bd_disk->private_data); 6053 BUG_ON(mddev != bdev->bd_disk->private_data);
@@ -5972,9 +6059,8 @@ static int md_open(struct block_device *bdev, fmode_t mode)
5972 atomic_inc(&mddev->openers); 6059 atomic_inc(&mddev->openers);
5973 mutex_unlock(&mddev->open_mutex); 6060 mutex_unlock(&mddev->open_mutex);
5974 6061
5975 check_disk_size_change(mddev->gendisk, bdev); 6062 check_disk_change(bdev);
5976 out: 6063 out:
5977 unlock_kernel();
5978 return err; 6064 return err;
5979} 6065}
5980 6066
@@ -5983,13 +6069,26 @@ static int md_release(struct gendisk *disk, fmode_t mode)
5983 mddev_t *mddev = disk->private_data; 6069 mddev_t *mddev = disk->private_data;
5984 6070
5985 BUG_ON(!mddev); 6071 BUG_ON(!mddev);
5986 lock_kernel();
5987 atomic_dec(&mddev->openers); 6072 atomic_dec(&mddev->openers);
5988 mddev_put(mddev); 6073 mddev_put(mddev);
5989 unlock_kernel();
5990 6074
5991 return 0; 6075 return 0;
5992} 6076}
6077
6078static int md_media_changed(struct gendisk *disk)
6079{
6080 mddev_t *mddev = disk->private_data;
6081
6082 return mddev->changed;
6083}
6084
6085static int md_revalidate(struct gendisk *disk)
6086{
6087 mddev_t *mddev = disk->private_data;
6088
6089 mddev->changed = 0;
6090 return 0;
6091}
5993static const struct block_device_operations md_fops = 6092static const struct block_device_operations md_fops =
5994{ 6093{
5995 .owner = THIS_MODULE, 6094 .owner = THIS_MODULE,
@@ -6000,6 +6099,8 @@ static const struct block_device_operations md_fops =
6000 .compat_ioctl = md_compat_ioctl, 6099 .compat_ioctl = md_compat_ioctl,
6001#endif 6100#endif
6002 .getgeo = md_getgeo, 6101 .getgeo = md_getgeo,
6102 .media_changed = md_media_changed,
6103 .revalidate_disk= md_revalidate,
6003}; 6104};
6004 6105
6005static int md_thread(void * arg) 6106static int md_thread(void * arg)
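md_media_changed() and md_revalidate(), wired into md_fops above, implement the check_disk_change() contract together with the mddev->changed assignments elsewhere in this diff: the flag is set when the array is started, stopped or resized, reported by ->media_changed, and cleared by ->revalidate_disk so the partition table is re-read exactly once. A compact userspace model of that handshake; the struct and functions here are simplified stand-ins for the block-layer hooks:

#include <stdbool.h>
#include <stdio.h>

struct dev { bool changed; };

/* ->media_changed(): report the flag, but do not clear it */
static bool media_changed(struct dev *d) { return d->changed; }

/* ->revalidate_disk(): re-read partitions and clear the flag */
static void revalidate(struct dev *d)
{
    printf("re-reading partition table\n");
    d->changed = false;
}

/* Model of check_disk_change() as called from open() */
static void check_change(struct dev *d)
{
    if (media_changed(d))
        revalidate(d);
}

int main(void)
{
    struct dev d = { false };

    d.changed = true;        /* the array was started/stopped and resized */
    check_change(&d);        /* first open after the change revalidates */
    check_change(&d);        /* subsequent opens see no change */
    return 0;
}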
@@ -6036,8 +6137,8 @@ static int md_thread(void * arg)
6036 thread->timeout); 6137 thread->timeout);
6037 6138
6038 clear_bit(THREAD_WAKEUP, &thread->flags); 6139 clear_bit(THREAD_WAKEUP, &thread->flags);
6039 6140 if (!kthread_should_stop())
6040 thread->run(thread->mddev); 6141 thread->run(thread->mddev);
6041 } 6142 }
6042 6143
6043 return 0; 6144 return 0;
@@ -6118,7 +6219,7 @@ void md_error(mddev_t *mddev, mdk_rdev_t *rdev)
6118 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6219 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6119 md_wakeup_thread(mddev->thread); 6220 md_wakeup_thread(mddev->thread);
6120 if (mddev->event_work.func) 6221 if (mddev->event_work.func)
6121 schedule_work(&mddev->event_work); 6222 queue_work(md_misc_wq, &mddev->event_work);
6122 md_new_event_inintr(mddev); 6223 md_new_event_inintr(mddev);
6123} 6224}
6124 6225
@@ -6209,7 +6310,7 @@ static void status_resync(struct seq_file *seq, mddev_t * mddev)
6209 * rt is a sector_t, so could be 32bit or 64bit. 6310 * rt is a sector_t, so could be 32bit or 64bit.
6210 * So we divide before multiply in case it is 32bit and close 6311 * So we divide before multiply in case it is 32bit and close
6211 * to the limit. 6312 * to the limit.
6212 * We scale the divisor (db) by 32 to avoid loosing precision 6313 * We scale the divisor (db) by 32 to avoid losing precision
6213 * near the end of resync when the number of remaining sectors 6314 * near the end of resync when the number of remaining sectors
6214 * is close to 'db'. 6315 * is close to 'db'.
6215 * We then divide rt by 32 after multiplying by db to compensate. 6316 * We then divide rt by 32 after multiplying by db to compensate.
@@ -6631,14 +6732,6 @@ int md_allow_write(mddev_t *mddev)
6631} 6732}
6632EXPORT_SYMBOL_GPL(md_allow_write); 6733EXPORT_SYMBOL_GPL(md_allow_write);
6633 6734
6634void md_unplug(mddev_t *mddev)
6635{
6636 if (mddev->queue)
6637 blk_unplug(mddev->queue);
6638 if (mddev->plug)
6639 mddev->plug->unplug_fn(mddev->plug);
6640}
6641
6642#define SYNC_MARKS 10 6735#define SYNC_MARKS 10
6643#define SYNC_MARK_STEP (3*HZ) 6736#define SYNC_MARK_STEP (3*HZ)
6644void md_do_sync(mddev_t *mddev) 6737void md_do_sync(mddev_t *mddev)
@@ -6790,8 +6883,8 @@ void md_do_sync(mddev_t *mddev)
6790 * Tune reconstruction: 6883 * Tune reconstruction:
6791 */ 6884 */
6792 window = 32*(PAGE_SIZE/512); 6885 window = 32*(PAGE_SIZE/512);
6793 printk(KERN_INFO "md: using %dk window, over a total of %llu blocks.\n", 6886 printk(KERN_INFO "md: using %dk window, over a total of %lluk.\n",
6794 window/2,(unsigned long long) max_sectors/2); 6887 window/2, (unsigned long long)max_sectors/2);
6795 6888
6796 atomic_set(&mddev->recovery_active, 0); 6889 atomic_set(&mddev->recovery_active, 0);
6797 last_check = 0; 6890 last_check = 0;
@@ -6802,7 +6895,7 @@ void md_do_sync(mddev_t *mddev)
6802 desc, mdname(mddev)); 6895 desc, mdname(mddev));
6803 mddev->curr_resync = j; 6896 mddev->curr_resync = j;
6804 } 6897 }
6805 mddev->curr_resync_completed = mddev->curr_resync; 6898 mddev->curr_resync_completed = j;
6806 6899
6807 while (j < max_sectors) { 6900 while (j < max_sectors) {
6808 sector_t sectors; 6901 sector_t sectors;
@@ -6817,11 +6910,9 @@ void md_do_sync(mddev_t *mddev)
6817 >= mddev->resync_max - mddev->curr_resync_completed 6910 >= mddev->resync_max - mddev->curr_resync_completed
6818 )) { 6911 )) {
6819 /* time to update curr_resync_completed */ 6912 /* time to update curr_resync_completed */
6820 md_unplug(mddev);
6821 wait_event(mddev->recovery_wait, 6913 wait_event(mddev->recovery_wait,
6822 atomic_read(&mddev->recovery_active) == 0); 6914 atomic_read(&mddev->recovery_active) == 0);
6823 mddev->curr_resync_completed = 6915 mddev->curr_resync_completed = j;
6824 mddev->curr_resync;
6825 set_bit(MD_CHANGE_CLEAN, &mddev->flags); 6916 set_bit(MD_CHANGE_CLEAN, &mddev->flags);
6826 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 6917 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
6827 } 6918 }
@@ -6894,7 +6985,6 @@ void md_do_sync(mddev_t *mddev)
6894 * about not overloading the IO subsystem. (things like an 6985 * about not overloading the IO subsystem. (things like an
6895 * e2fsck being done on the RAID array should execute fast) 6986 * e2fsck being done on the RAID array should execute fast)
6896 */ 6987 */
6897 md_unplug(mddev);
6898 cond_resched(); 6988 cond_resched();
6899 6989
6900 currspeed = ((unsigned long)(io_sectors-mddev->resync_mark_cnt))/2 6990 currspeed = ((unsigned long)(io_sectors-mddev->resync_mark_cnt))/2
@@ -6913,8 +7003,6 @@ void md_do_sync(mddev_t *mddev)
6913 * this also signals 'finished resyncing' to md_stop 7003 * this also signals 'finished resyncing' to md_stop
6914 */ 7004 */
6915 out: 7005 out:
6916 md_unplug(mddev);
6917
6918 wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); 7006 wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));
6919 7007
6920 /* tell personality that we are finished */ 7008 /* tell personality that we are finished */
@@ -6957,9 +7045,6 @@ void md_do_sync(mddev_t *mddev)
6957 } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 7045 } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
6958 mddev->resync_min = mddev->curr_resync_completed; 7046 mddev->resync_min = mddev->curr_resync_completed;
6959 mddev->curr_resync = 0; 7047 mddev->curr_resync = 0;
6960 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
6961 mddev->curr_resync_completed = 0;
6962 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
6963 wake_up(&resync_wait); 7048 wake_up(&resync_wait);
6964 set_bit(MD_RECOVERY_DONE, &mddev->recovery); 7049 set_bit(MD_RECOVERY_DONE, &mddev->recovery);
6965 md_wakeup_thread(mddev->thread); 7050 md_wakeup_thread(mddev->thread);
@@ -6977,7 +7062,6 @@ void md_do_sync(mddev_t *mddev)
6977} 7062}
6978EXPORT_SYMBOL_GPL(md_do_sync); 7063EXPORT_SYMBOL_GPL(md_do_sync);
6979 7064
6980
6981static int remove_and_add_spares(mddev_t *mddev) 7065static int remove_and_add_spares(mddev_t *mddev)
6982{ 7066{
6983 mdk_rdev_t *rdev; 7067 mdk_rdev_t *rdev;
@@ -7000,10 +7084,11 @@ static int remove_and_add_spares(mddev_t *mddev)
7000 } 7084 }
7001 } 7085 }
7002 7086
7003 if (mddev->degraded && ! mddev->ro && !mddev->recovery_disabled) { 7087 if (mddev->degraded && !mddev->recovery_disabled) {
7004 list_for_each_entry(rdev, &mddev->disks, same_set) { 7088 list_for_each_entry(rdev, &mddev->disks, same_set) {
7005 if (rdev->raid_disk >= 0 && 7089 if (rdev->raid_disk >= 0 &&
7006 !test_bit(In_sync, &rdev->flags) && 7090 !test_bit(In_sync, &rdev->flags) &&
7091 !test_bit(Faulty, &rdev->flags) &&
7007 !test_bit(Blocked, &rdev->flags)) 7092 !test_bit(Blocked, &rdev->flags))
7008 spares++; 7093 spares++;
7009 if (rdev->raid_disk < 0 7094 if (rdev->raid_disk < 0
@@ -7026,6 +7111,45 @@ static int remove_and_add_spares(mddev_t *mddev)
7026 } 7111 }
7027 return spares; 7112 return spares;
7028} 7113}
7114
7115static void reap_sync_thread(mddev_t *mddev)
7116{
7117 mdk_rdev_t *rdev;
7118
7119 /* resync has finished, collect result */
7120 md_unregister_thread(mddev->sync_thread);
7121 mddev->sync_thread = NULL;
7122 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
7123 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
7124 /* success...*/
7125 /* activate any spares */
7126 if (mddev->pers->spare_active(mddev))
7127 sysfs_notify(&mddev->kobj, NULL,
7128 "degraded");
7129 }
7130 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
7131 mddev->pers->finish_reshape)
7132 mddev->pers->finish_reshape(mddev);
7133 md_update_sb(mddev, 1);
7134
7135 /* if the array is no longer degraded, then any saved_raid_disk
7136 * information must be scrapped
7137 */
7138 if (!mddev->degraded)
7139 list_for_each_entry(rdev, &mddev->disks, same_set)
7140 rdev->saved_raid_disk = -1;
7141
7142 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
7143 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
7144 clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
7145 clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
7146 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
7147 /* flag recovery needed just to double check */
7148 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
7149 sysfs_notify_dirent_safe(mddev->sysfs_action);
7150 md_new_event(mddev);
7151}
7152
7029/* 7153/*
7030 * This routine is regularly called by all per-raid-array threads to 7154 * This routine is regularly called by all per-raid-array threads to
7031 * deal with generic issues like resync and super-block update. 7155 * deal with generic issues like resync and super-block update.
@@ -7050,8 +7174,8 @@ static int remove_and_add_spares(mddev_t *mddev)
7050 */ 7174 */
7051void md_check_recovery(mddev_t *mddev) 7175void md_check_recovery(mddev_t *mddev)
7052{ 7176{
7053 mdk_rdev_t *rdev; 7177 if (mddev->suspended)
7054 7178 return;
7055 7179
7056 if (mddev->bitmap) 7180 if (mddev->bitmap)
7057 bitmap_daemon_work(mddev); 7181 bitmap_daemon_work(mddev);
@@ -7087,7 +7211,20 @@ void md_check_recovery(mddev_t *mddev)
7087 /* Only thing we do on a ro array is remove 7211 /* Only thing we do on a ro array is remove
7088 * failed devices. 7212 * failed devices.
7089 */ 7213 */
7090 remove_and_add_spares(mddev); 7214 mdk_rdev_t *rdev;
7215 list_for_each_entry(rdev, &mddev->disks, same_set)
7216 if (rdev->raid_disk >= 0 &&
7217 !test_bit(Blocked, &rdev->flags) &&
7218 test_bit(Faulty, &rdev->flags) &&
7219 atomic_read(&rdev->nr_pending)==0) {
7220 if (mddev->pers->hot_remove_disk(
7221 mddev, rdev->raid_disk)==0) {
7222 char nm[20];
7223 sprintf(nm,"rd%d", rdev->raid_disk);
7224 sysfs_remove_link(&mddev->kobj, nm);
7225 rdev->raid_disk = -1;
7226 }
7227 }
7091 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 7228 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
7092 goto unlock; 7229 goto unlock;
7093 } 7230 }
@@ -7120,34 +7257,7 @@ void md_check_recovery(mddev_t *mddev)
7120 goto unlock; 7257 goto unlock;
7121 } 7258 }
7122 if (mddev->sync_thread) { 7259 if (mddev->sync_thread) {
7123 /* resync has finished, collect result */ 7260 reap_sync_thread(mddev);
7124 md_unregister_thread(mddev->sync_thread);
7125 mddev->sync_thread = NULL;
7126 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
7127 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
7128 /* success...*/
7129 /* activate any spares */
7130 if (mddev->pers->spare_active(mddev))
7131 sysfs_notify(&mddev->kobj, NULL,
7132 "degraded");
7133 }
7134 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
7135 mddev->pers->finish_reshape)
7136 mddev->pers->finish_reshape(mddev);
7137 md_update_sb(mddev, 1);
7138
7139 /* if array is no-longer degraded, then any saved_raid_disk
7140 * information must be scrapped
7141 */
7142 if (!mddev->degraded)
7143 list_for_each_entry(rdev, &mddev->disks, same_set)
7144 rdev->saved_raid_disk = -1;
7145
7146 mddev->recovery = 0;
7147 /* flag recovery needed just to double check */
7148 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
7149 sysfs_notify_dirent_safe(mddev->sysfs_action);
7150 md_new_event(mddev);
7151 goto unlock; 7261 goto unlock;
7152 } 7262 }
7153 /* Set RUNNING before clearing NEEDED to avoid 7263 /* Set RUNNING before clearing NEEDED to avoid
@@ -7205,7 +7315,11 @@ void md_check_recovery(mddev_t *mddev)
7205 " thread...\n", 7315 " thread...\n",
7206 mdname(mddev)); 7316 mdname(mddev));
7207 /* leave the spares where they are, it shouldn't hurt */ 7317 /* leave the spares where they are, it shouldn't hurt */
7208 mddev->recovery = 0; 7318 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
7319 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
7320 clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
7321 clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
7322 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
7209 } else 7323 } else
7210 md_wakeup_thread(mddev->sync_thread); 7324 md_wakeup_thread(mddev->sync_thread);
7211 sysfs_notify_dirent_safe(mddev->sysfs_action); 7325 sysfs_notify_dirent_safe(mddev->sysfs_action);
@@ -7278,12 +7392,23 @@ static void md_geninit(void)
7278 7392
7279static int __init md_init(void) 7393static int __init md_init(void)
7280{ 7394{
7281 if (register_blkdev(MD_MAJOR, "md")) 7395 int ret = -ENOMEM;
7282 return -1; 7396
7283 if ((mdp_major=register_blkdev(0, "mdp"))<=0) { 7397 md_wq = alloc_workqueue("md", WQ_MEM_RECLAIM, 0);
7284 unregister_blkdev(MD_MAJOR, "md"); 7398 if (!md_wq)
7285 return -1; 7399 goto err_wq;
7286 } 7400
7401 md_misc_wq = alloc_workqueue("md_misc", 0, 0);
7402 if (!md_misc_wq)
7403 goto err_misc_wq;
7404
7405 if ((ret = register_blkdev(MD_MAJOR, "md")) < 0)
7406 goto err_md;
7407
7408 if ((ret = register_blkdev(0, "mdp")) < 0)
7409 goto err_mdp;
7410 mdp_major = ret;
7411
7287 blk_register_region(MKDEV(MD_MAJOR, 0), 1UL<<MINORBITS, THIS_MODULE, 7412 blk_register_region(MKDEV(MD_MAJOR, 0), 1UL<<MINORBITS, THIS_MODULE,
7288 md_probe, NULL, NULL); 7413 md_probe, NULL, NULL);
7289 blk_register_region(MKDEV(mdp_major, 0), 1UL<<MINORBITS, THIS_MODULE, 7414 blk_register_region(MKDEV(mdp_major, 0), 1UL<<MINORBITS, THIS_MODULE,
@@ -7294,8 +7419,16 @@ static int __init md_init(void)
7294 7419
7295 md_geninit(); 7420 md_geninit();
7296 return 0; 7421 return 0;
7297}
7298 7422
7423err_mdp:
7424 unregister_blkdev(MD_MAJOR, "md");
7425err_md:
7426 destroy_workqueue(md_misc_wq);
7427err_misc_wq:
7428 destroy_workqueue(md_wq);
7429err_wq:
7430 return ret;
7431}
7299 7432
7300#ifndef MODULE 7433#ifndef MODULE
7301 7434
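md_init() above is reworked into the usual goto-unwind error handling: the two workqueues and the two block majors are acquired in order, and each failure label releases exactly what was acquired before it, in reverse. The shape of that pattern in ordinary C, with malloc() and printf stand-ins for the workqueues and register_blkdev():

#include <stdio.h>
#include <stdlib.h>

/* Stand-ins for register_blkdev(); return >= 0 on success. */
static int register_major(const char *name)   { printf("register %s\n", name); return 9; }
static void unregister_major(const char *name){ printf("unregister %s\n", name); }

static int md_like_init(void)
{
    int ret = -1;
    void *wq, *misc_wq;

    wq = malloc(64);                     /* ~ alloc_workqueue("md", ...) */
    if (!wq)
        goto err_wq;

    misc_wq = malloc(64);                /* ~ alloc_workqueue("md_misc", ...) */
    if (!misc_wq)
        goto err_misc_wq;

    ret = register_major("md");
    if (ret < 0)
        goto err_md;

    ret = register_major("mdp");
    if (ret < 0)
        goto err_mdp;

    /* On success the resources stay allocated for the module's lifetime. */
    return 0;

err_mdp:
    unregister_major("md");              /* unwind strictly in reverse order */
err_md:
    free(misc_wq);
err_misc_wq:
    free(wq);
err_wq:
    return ret;
}

int main(void) { return md_like_init() ? 1 : 0; }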
@@ -7382,6 +7515,8 @@ static __exit void md_exit(void)
7382 export_array(mddev); 7515 export_array(mddev);
7383 mddev->hold_active = 0; 7516 mddev->hold_active = 0;
7384 } 7517 }
7518 destroy_workqueue(md_misc_wq);
7519 destroy_workqueue(md_wq);
7385} 7520}
7386 7521
7387subsys_initcall(md_init); 7522subsys_initcall(md_init);
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 3931299788dc..1c26c7a08ae6 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -29,26 +29,6 @@
29typedef struct mddev_s mddev_t; 29typedef struct mddev_s mddev_t;
30typedef struct mdk_rdev_s mdk_rdev_t; 30typedef struct mdk_rdev_s mdk_rdev_t;
31 31
32/* generic plugging support - like that provided with request_queue,
33 * but does not require a request_queue
34 */
35struct plug_handle {
36 void (*unplug_fn)(struct plug_handle *);
37 struct timer_list unplug_timer;
38 struct work_struct unplug_work;
39 unsigned long unplug_flag;
40};
41#define PLUGGED_FLAG 1
42void plugger_init(struct plug_handle *plug,
43 void (*unplug_fn)(struct plug_handle *));
44void plugger_set_plug(struct plug_handle *plug);
45int plugger_remove_plug(struct plug_handle *plug);
46static inline void plugger_flush(struct plug_handle *plug)
47{
48 del_timer_sync(&plug->unplug_timer);
49 cancel_work_sync(&plug->unplug_work);
50}
51
52/* 32/*
53 * MD's 'extended' device 33 * MD's 'extended' device
54 */ 34 */
@@ -60,6 +40,12 @@ struct mdk_rdev_s
60 mddev_t *mddev; /* RAID array if running */ 40 mddev_t *mddev; /* RAID array if running */
61 int last_events; /* IO event timestamp */ 41 int last_events; /* IO event timestamp */
62 42
43 /*
44 * If meta_bdev is non-NULL, it means that a separate device is
45 * being used to store the metadata (superblock/bitmap) which
46 * would otherwise be contained on the same device as the data (bdev).
47 */
48 struct block_device *meta_bdev;
63 struct block_device *bdev; /* block device handle */ 49 struct block_device *bdev; /* block device handle */
64 50
65 struct page *sb_page; 51 struct page *sb_page;
@@ -87,11 +73,8 @@ struct mdk_rdev_s
87#define Faulty 1 /* device is known to have a fault */ 73#define Faulty 1 /* device is known to have a fault */
88#define In_sync 2 /* device is in_sync with rest of array */ 74#define In_sync 2 /* device is in_sync with rest of array */
89#define WriteMostly 4 /* Avoid reading if at all possible */ 75#define WriteMostly 4 /* Avoid reading if at all possible */
90#define BarriersNotsupp 5 /* REQ_HARDBARRIER is not supported */
91#define AllReserved 6 /* If whole device is reserved for
92 * one array */
93#define AutoDetected 7 /* added by auto-detect */ 76#define AutoDetected 7 /* added by auto-detect */
94#define Blocked 8 /* An error occured on an externally 77#define Blocked 8 /* An error occurred on an externally
95 * managed array, don't allow writes 78 * managed array, don't allow writes
96 * until it is cleared */ 79 * until it is cleared */
97 wait_queue_head_t blocked_wait; 80 wait_queue_head_t blocked_wait;
@@ -141,6 +124,7 @@ struct mddev_s
141#define MD_CHANGE_DEVS 0 /* Some device status has changed */ 124#define MD_CHANGE_DEVS 0 /* Some device status has changed */
142#define MD_CHANGE_CLEAN 1 /* transition to or from 'clean' */ 125#define MD_CHANGE_CLEAN 1 /* transition to or from 'clean' */
143#define MD_CHANGE_PENDING 2 /* switch from 'clean' to 'active' in progress */ 126#define MD_CHANGE_PENDING 2 /* switch from 'clean' to 'active' in progress */
127#define MD_ARRAY_FIRST_USE 3 /* First use of array, needs initialization */
144 128
145 int suspended; 129 int suspended;
146 atomic_t active_io; 130 atomic_t active_io;
@@ -149,7 +133,8 @@ struct mddev_s
149 * are happening, so run/ 133 * are happening, so run/
150 * takeover/stop are not safe 134 * takeover/stop are not safe
151 */ 135 */
152 136 int ready; /* See when safe to pass
137 * IO requests down */
153 struct gendisk *gendisk; 138 struct gendisk *gendisk;
154 139
155 struct kobject kobj; 140 struct kobject kobj;
@@ -195,6 +180,9 @@ struct mddev_s
195 int delta_disks, new_level, new_layout; 180 int delta_disks, new_level, new_layout;
196 int new_chunk_sectors; 181 int new_chunk_sectors;
197 182
183 atomic_t plug_cnt; /* If device is expecting
184 * more bios soon.
185 */
198 struct mdk_thread_s *thread; /* management thread */ 186 struct mdk_thread_s *thread; /* management thread */
199 struct mdk_thread_s *sync_thread; /* doing resync or reconstruct */ 187 struct mdk_thread_s *sync_thread; /* doing resync or reconstruct */
200 sector_t curr_resync; /* last block scheduled */ 188 sector_t curr_resync; /* last block scheduled */
@@ -270,16 +258,11 @@ struct mddev_s
270 atomic_t active; /* general refcount */ 258 atomic_t active; /* general refcount */
271 atomic_t openers; /* number of active opens */ 259 atomic_t openers; /* number of active opens */
272 260
261 int changed; /* True if we might need to
262 * reread partition info */
273 int degraded; /* whether md should consider 263 int degraded; /* whether md should consider
274 * adding a spare 264 * adding a spare
275 */ 265 */
276 int barriers_work; /* initialised to true, cleared as soon
277 * as a barrier request to slave
278 * fails. Only supported
279 */
280 struct bio *biolist; /* bios that need to be retried
281 * because REQ_HARDBARRIER is not supported
282 */
283 266
284 atomic_t recovery_active; /* blocks scheduled, but not written */ 267 atomic_t recovery_active; /* blocks scheduled, but not written */
285 wait_queue_head_t recovery_wait; 268 wait_queue_head_t recovery_wait;
@@ -337,19 +320,18 @@ struct mddev_s
337 struct list_head all_mddevs; 320 struct list_head all_mddevs;
338 321
339 struct attribute_group *to_remove; 322 struct attribute_group *to_remove;
340 struct plug_handle *plug; /* if used by personality */ 323
341 324 struct bio_set *bio_set;
342 /* Generic barrier handling. 325
343 * If there is a pending barrier request, all other 326 /* Generic flush handling.
344 * writes are blocked while the devices are flushed. 327 * The last to finish preflush schedules a worker to submit
345 * The last to finish a flush schedules a worker to 328 * the rest of the request (without the REQ_FLUSH flag).
346 * submit the barrier request (without the barrier flag),
347 * then submit more flush requests.
348 */ 329 */
349 struct bio *barrier; 330 struct bio *flush_bio;
350 atomic_t flush_pending; 331 atomic_t flush_pending;
351 struct work_struct barrier_work; 332 struct work_struct flush_work;
352 struct work_struct event_work; /* used by dm to report failure event */ 333 struct work_struct event_work; /* used by dm to report failure event */
334 void (*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev);
353}; 335};
354 336
355 337
@@ -502,12 +484,12 @@ extern void md_done_sync(mddev_t *mddev, int blocks, int ok);
502extern void md_error(mddev_t *mddev, mdk_rdev_t *rdev); 484extern void md_error(mddev_t *mddev, mdk_rdev_t *rdev);
503 485
504extern int mddev_congested(mddev_t *mddev, int bits); 486extern int mddev_congested(mddev_t *mddev, int bits);
505extern void md_barrier_request(mddev_t *mddev, struct bio *bio); 487extern void md_flush_request(mddev_t *mddev, struct bio *bio);
506extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, 488extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
507 sector_t sector, int size, struct page *page); 489 sector_t sector, int size, struct page *page);
508extern void md_super_wait(mddev_t *mddev); 490extern void md_super_wait(mddev_t *mddev);
509extern int sync_page_io(struct block_device *bdev, sector_t sector, int size, 491extern int sync_page_io(mdk_rdev_t *rdev, sector_t sector, int size,
510 struct page *page, int rw); 492 struct page *page, int rw, bool metadata_op);
511extern void md_do_sync(mddev_t *mddev); 493extern void md_do_sync(mddev_t *mddev);
512extern void md_new_event(mddev_t *mddev); 494extern void md_new_event(mddev_t *mddev);
513extern int md_allow_write(mddev_t *mddev); 495extern int md_allow_write(mddev_t *mddev);
@@ -518,7 +500,6 @@ extern int md_integrity_register(mddev_t *mddev);
518extern void md_integrity_add_rdev(mdk_rdev_t *rdev, mddev_t *mddev); 500extern void md_integrity_add_rdev(mdk_rdev_t *rdev, mddev_t *mddev);
519extern int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale); 501extern int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale);
520extern void restore_bitmap_write_access(struct file *file); 502extern void restore_bitmap_write_access(struct file *file);
521extern void md_unplug(mddev_t *mddev);
522 503
523extern void mddev_init(mddev_t *mddev); 504extern void mddev_init(mddev_t *mddev);
524extern int md_run(mddev_t *mddev); 505extern int md_run(mddev_t *mddev);
@@ -528,4 +509,9 @@ extern void md_rdev_init(mdk_rdev_t *rdev);
528 509
529extern void mddev_suspend(mddev_t *mddev); 510extern void mddev_suspend(mddev_t *mddev);
530extern void mddev_resume(mddev_t *mddev); 511extern void mddev_resume(mddev_t *mddev);
512extern struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask,
513 mddev_t *mddev);
514extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
515 mddev_t *mddev);
516extern int mddev_check_plugged(mddev_t *mddev);
531#endif /* _MD_MD_H */ 517#endif /* _MD_MD_H */
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index 0307d217e7a4..3535c23af288 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -106,44 +106,14 @@ static void multipath_end_request(struct bio *bio, int error)
106 rdev_dec_pending(rdev, conf->mddev); 106 rdev_dec_pending(rdev, conf->mddev);
107} 107}
108 108
109static void unplug_slaves(mddev_t *mddev)
110{
111 multipath_conf_t *conf = mddev->private;
112 int i;
113
114 rcu_read_lock();
115 for (i=0; i<mddev->raid_disks; i++) {
116 mdk_rdev_t *rdev = rcu_dereference(conf->multipaths[i].rdev);
117 if (rdev && !test_bit(Faulty, &rdev->flags)
118 && atomic_read(&rdev->nr_pending)) {
119 struct request_queue *r_queue = bdev_get_queue(rdev->bdev);
120
121 atomic_inc(&rdev->nr_pending);
122 rcu_read_unlock();
123
124 blk_unplug(r_queue);
125
126 rdev_dec_pending(rdev, mddev);
127 rcu_read_lock();
128 }
129 }
130 rcu_read_unlock();
131}
132
133static void multipath_unplug(struct request_queue *q)
134{
135 unplug_slaves(q->queuedata);
136}
137
138
139static int multipath_make_request(mddev_t *mddev, struct bio * bio) 109static int multipath_make_request(mddev_t *mddev, struct bio * bio)
140{ 110{
141 multipath_conf_t *conf = mddev->private; 111 multipath_conf_t *conf = mddev->private;
142 struct multipath_bh * mp_bh; 112 struct multipath_bh * mp_bh;
143 struct multipath_info *multipath; 113 struct multipath_info *multipath;
144 114
145 if (unlikely(bio->bi_rw & REQ_HARDBARRIER)) { 115 if (unlikely(bio->bi_rw & REQ_FLUSH)) {
146 md_barrier_request(mddev, bio); 116 md_flush_request(mddev, bio);
147 return 0; 117 return 0;
148 } 118 }
149 119
@@ -176,7 +146,7 @@ static void multipath_status (struct seq_file *seq, mddev_t *mddev)
176 int i; 146 int i;
177 147
178 seq_printf (seq, " [%d/%d] [", conf->raid_disks, 148 seq_printf (seq, " [%d/%d] [", conf->raid_disks,
179 conf->working_disks); 149 conf->raid_disks - mddev->degraded);
180 for (i = 0; i < conf->raid_disks; i++) 150 for (i = 0; i < conf->raid_disks; i++)
181 seq_printf (seq, "%s", 151 seq_printf (seq, "%s",
182 conf->multipaths[i].rdev && 152 conf->multipaths[i].rdev &&
@@ -216,35 +186,36 @@ static int multipath_congested(void *data, int bits)
216static void multipath_error (mddev_t *mddev, mdk_rdev_t *rdev) 186static void multipath_error (mddev_t *mddev, mdk_rdev_t *rdev)
217{ 187{
218 multipath_conf_t *conf = mddev->private; 188 multipath_conf_t *conf = mddev->private;
189 char b[BDEVNAME_SIZE];
219 190
220 if (conf->working_disks <= 1) { 191 if (conf->raid_disks - mddev->degraded <= 1) {
221 /* 192 /*
222 * Uh oh, we can do nothing if this is our last path, but 193 * Uh oh, we can do nothing if this is our last path, but
223 * first check if this is a queued request for a device 194 * first check if this is a queued request for a device
224 * which has just failed. 195 * which has just failed.
225 */ 196 */
226 printk(KERN_ALERT 197 printk(KERN_ALERT
227 "multipath: only one IO path left and IO error.\n"); 198 "multipath: only one IO path left and IO error.\n");
228 /* leave it active... it's all we have */ 199 /* leave it active... it's all we have */
229 } else { 200 return;
230 /*
231 * Mark disk as unusable
232 */
233 if (!test_bit(Faulty, &rdev->flags)) {
234 char b[BDEVNAME_SIZE];
235 clear_bit(In_sync, &rdev->flags);
236 set_bit(Faulty, &rdev->flags);
237 set_bit(MD_CHANGE_DEVS, &mddev->flags);
238 conf->working_disks--;
239 mddev->degraded++;
240 printk(KERN_ALERT "multipath: IO failure on %s,"
241 " disabling IO path.\n"
242 "multipath: Operation continuing"
243 " on %d IO paths.\n",
244 bdevname (rdev->bdev,b),
245 conf->working_disks);
246 }
247 } 201 }
202 /*
203 * Mark disk as unusable
204 */
205 if (test_and_clear_bit(In_sync, &rdev->flags)) {
206 unsigned long flags;
207 spin_lock_irqsave(&conf->device_lock, flags);
208 mddev->degraded++;
209 spin_unlock_irqrestore(&conf->device_lock, flags);
210 }
211 set_bit(Faulty, &rdev->flags);
212 set_bit(MD_CHANGE_DEVS, &mddev->flags);
213 printk(KERN_ALERT "multipath: IO failure on %s,"
214 " disabling IO path.\n"
215 "multipath: Operation continuing"
216 " on %d IO paths.\n",
217 bdevname(rdev->bdev, b),
218 conf->raid_disks - mddev->degraded);
248} 219}
249 220
250static void print_multipath_conf (multipath_conf_t *conf) 221static void print_multipath_conf (multipath_conf_t *conf)
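multipath_error() above drops the private working_disks counter in favour of mddev->degraded and makes the In_sync -> Faulty transition exactly-once: test_and_clear_bit(In_sync, ...) lets only one caller win the race, and device_lock protects the degraded count it then updates. A small C11 model of that exactly-once accounting (illustrative names; the kernel uses rdev flag bits and a spinlock; compile with -pthread):

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t device_lock = PTHREAD_MUTEX_INITIALIZER;
static atomic_bool in_sync = true;   /* models the In_sync rdev flag */
static int degraded;                 /* models mddev->degraded */

/* Called on every I/O error against this path; may race with itself. */
static void path_error(void)
{
    /* Only the caller that actually clears In_sync updates the count. */
    if (atomic_exchange(&in_sync, false)) {
        pthread_mutex_lock(&device_lock);
        degraded++;
        pthread_mutex_unlock(&device_lock);
    }
}

int main(void)
{
    path_error();
    path_error();                        /* second error: no double count */
    printf("degraded=%d\n", degraded);   /* prints 1 */
    return 0;
}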
@@ -257,7 +228,7 @@ static void print_multipath_conf (multipath_conf_t *conf)
257 printk("(conf==NULL)\n"); 228 printk("(conf==NULL)\n");
258 return; 229 return;
259 } 230 }
260 printk(" --- wd:%d rd:%d\n", conf->working_disks, 231 printk(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded,
261 conf->raid_disks); 232 conf->raid_disks);
262 233
263 for (i = 0; i < conf->raid_disks; i++) { 234 for (i = 0; i < conf->raid_disks; i++) {
@@ -304,10 +275,11 @@ static int multipath_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
304 PAGE_CACHE_SIZE - 1); 275 PAGE_CACHE_SIZE - 1);
305 } 276 }
306 277
307 conf->working_disks++; 278 spin_lock_irq(&conf->device_lock);
308 mddev->degraded--; 279 mddev->degraded--;
309 rdev->raid_disk = path; 280 rdev->raid_disk = path;
310 set_bit(In_sync, &rdev->flags); 281 set_bit(In_sync, &rdev->flags);
282 spin_unlock_irq(&conf->device_lock);
311 rcu_assign_pointer(p->rdev, rdev); 283 rcu_assign_pointer(p->rdev, rdev);
312 err = 0; 284 err = 0;
313 md_integrity_add_rdev(rdev, mddev); 285 md_integrity_add_rdev(rdev, mddev);
@@ -345,7 +317,7 @@ static int multipath_remove_disk(mddev_t *mddev, int number)
345 p->rdev = rdev; 317 p->rdev = rdev;
346 goto abort; 318 goto abort;
347 } 319 }
348 md_integrity_register(mddev); 320 err = md_integrity_register(mddev);
349 } 321 }
350abort: 322abort:
351 323
@@ -421,6 +393,7 @@ static int multipath_run (mddev_t *mddev)
421 int disk_idx; 393 int disk_idx;
422 struct multipath_info *disk; 394 struct multipath_info *disk;
423 mdk_rdev_t *rdev; 395 mdk_rdev_t *rdev;
396 int working_disks;
424 397
425 if (md_check_no_bitmap(mddev)) 398 if (md_check_no_bitmap(mddev))
426 return -EINVAL; 399 return -EINVAL;
@@ -435,7 +408,6 @@ static int multipath_run (mddev_t *mddev)
435 * bookkeeping area. [whatever we allocate in multipath_run(), 408 * bookkeeping area. [whatever we allocate in multipath_run(),
436 * should be freed in multipath_stop()] 409 * should be freed in multipath_stop()]
437 */ 410 */
438 mddev->queue->queue_lock = &mddev->queue->__queue_lock;
439 411
440 conf = kzalloc(sizeof(multipath_conf_t), GFP_KERNEL); 412 conf = kzalloc(sizeof(multipath_conf_t), GFP_KERNEL);
441 mddev->private = conf; 413 mddev->private = conf;
@@ -455,7 +427,7 @@ static int multipath_run (mddev_t *mddev)
455 goto out_free_conf; 427 goto out_free_conf;
456 } 428 }
457 429
458 conf->working_disks = 0; 430 working_disks = 0;
459 list_for_each_entry(rdev, &mddev->disks, same_set) { 431 list_for_each_entry(rdev, &mddev->disks, same_set) {
460 disk_idx = rdev->raid_disk; 432 disk_idx = rdev->raid_disk;
461 if (disk_idx < 0 || 433 if (disk_idx < 0 ||
@@ -477,7 +449,7 @@ static int multipath_run (mddev_t *mddev)
477 } 449 }
478 450
479 if (!test_bit(Faulty, &rdev->flags)) 451 if (!test_bit(Faulty, &rdev->flags))
480 conf->working_disks++; 452 working_disks++;
481 } 453 }
482 454
483 conf->raid_disks = mddev->raid_disks; 455 conf->raid_disks = mddev->raid_disks;
@@ -485,12 +457,12 @@ static int multipath_run (mddev_t *mddev)
485 spin_lock_init(&conf->device_lock); 457 spin_lock_init(&conf->device_lock);
486 INIT_LIST_HEAD(&conf->retry_list); 458 INIT_LIST_HEAD(&conf->retry_list);
487 459
488 if (!conf->working_disks) { 460 if (!working_disks) {
489 printk(KERN_ERR "multipath: no operational IO paths for %s\n", 461 printk(KERN_ERR "multipath: no operational IO paths for %s\n",
490 mdname(mddev)); 462 mdname(mddev));
491 goto out_free_conf; 463 goto out_free_conf;
492 } 464 }
493 mddev->degraded = conf->raid_disks - conf->working_disks; 465 mddev->degraded = conf->raid_disks - working_disks;
494 466
495 conf->pool = mempool_create_kmalloc_pool(NR_RESERVED_BUFS, 467 conf->pool = mempool_create_kmalloc_pool(NR_RESERVED_BUFS,
496 sizeof(struct multipath_bh)); 468 sizeof(struct multipath_bh));
@@ -512,16 +484,19 @@ static int multipath_run (mddev_t *mddev)
512 484
513 printk(KERN_INFO 485 printk(KERN_INFO
514 "multipath: array %s active with %d out of %d IO paths\n", 486 "multipath: array %s active with %d out of %d IO paths\n",
515 mdname(mddev), conf->working_disks, mddev->raid_disks); 487 mdname(mddev), conf->raid_disks - mddev->degraded,
488 mddev->raid_disks);
516 /* 489 /*
517 * Ok, everything is just fine now 490 * Ok, everything is just fine now
518 */ 491 */
519 md_set_array_sectors(mddev, multipath_size(mddev, 0, 0)); 492 md_set_array_sectors(mddev, multipath_size(mddev, 0, 0));
520 493
521 mddev->queue->unplug_fn = multipath_unplug;
522 mddev->queue->backing_dev_info.congested_fn = multipath_congested; 494 mddev->queue->backing_dev_info.congested_fn = multipath_congested;
523 mddev->queue->backing_dev_info.congested_data = mddev; 495 mddev->queue->backing_dev_info.congested_data = mddev;
524 md_integrity_register(mddev); 496
497 if (md_integrity_register(mddev))
498 goto out_free_conf;
499
525 return 0; 500 return 0;
526 501
527out_free_conf: 502out_free_conf:
diff --git a/drivers/md/multipath.h b/drivers/md/multipath.h
index d1c2a8d78395..3c5a45eb5f8a 100644
--- a/drivers/md/multipath.h
+++ b/drivers/md/multipath.h
@@ -9,7 +9,6 @@ struct multipath_private_data {
9 mddev_t *mddev; 9 mddev_t *mddev;
10 struct multipath_info *multipaths; 10 struct multipath_info *multipaths;
11 int raid_disks; 11 int raid_disks;
12 int working_disks;
13 spinlock_t device_lock; 12 spinlock_t device_lock;
14 struct list_head retry_list; 13 struct list_head retry_list;
15 14
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 6f7af46d623c..e86bf3682e1e 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -25,21 +25,6 @@
25#include "raid0.h" 25#include "raid0.h"
26#include "raid5.h" 26#include "raid5.h"
27 27
28static void raid0_unplug(struct request_queue *q)
29{
30 mddev_t *mddev = q->queuedata;
31 raid0_conf_t *conf = mddev->private;
32 mdk_rdev_t **devlist = conf->devlist;
33 int raid_disks = conf->strip_zone[0].nb_dev;
34 int i;
35
36 for (i=0; i < raid_disks; i++) {
37 struct request_queue *r_queue = bdev_get_queue(devlist[i]->bdev);
38
39 blk_unplug(r_queue);
40 }
41}
42
43static int raid0_congested(void *data, int bits) 28static int raid0_congested(void *data, int bits)
44{ 29{
45 mddev_t *mddev = data; 30 mddev_t *mddev = data;
@@ -179,6 +164,14 @@ static int create_strip_zones(mddev_t *mddev, raid0_conf_t **private_conf)
179 rdev1->new_raid_disk = j; 164 rdev1->new_raid_disk = j;
180 } 165 }
181 166
167 if (mddev->level == 1) {
168 /* taking over a raid1 array -
169 * we have only one active disk
170 */
171 j = 0;
172 rdev1->new_raid_disk = j;
173 }
174
182 if (j < 0 || j >= mddev->raid_disks) { 175 if (j < 0 || j >= mddev->raid_disks) {
183 printk(KERN_ERR "md/raid0:%s: bad disk number %d - " 176 printk(KERN_ERR "md/raid0:%s: bad disk number %d - "
184 "aborting!\n", mdname(mddev), j); 177 "aborting!\n", mdname(mddev), j);
@@ -264,7 +257,6 @@ static int create_strip_zones(mddev_t *mddev, raid0_conf_t **private_conf)
264 mdname(mddev), 257 mdname(mddev),
265 (unsigned long long)smallest->sectors); 258 (unsigned long long)smallest->sectors);
266 } 259 }
267 mddev->queue->unplug_fn = raid0_unplug;
268 mddev->queue->backing_dev_info.congested_fn = raid0_congested; 260 mddev->queue->backing_dev_info.congested_fn = raid0_congested;
269 mddev->queue->backing_dev_info.congested_data = mddev; 261 mddev->queue->backing_dev_info.congested_data = mddev;
270 262
@@ -353,7 +345,6 @@ static int raid0_run(mddev_t *mddev)
353 if (md_check_no_bitmap(mddev)) 345 if (md_check_no_bitmap(mddev))
354 return -EINVAL; 346 return -EINVAL;
355 blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors); 347 blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors);
356 mddev->queue->queue_lock = &mddev->queue->__queue_lock;
357 348
358 /* if private is not null, we are here after takeover */ 349 /* if private is not null, we are here after takeover */
359 if (mddev->private == NULL) { 350 if (mddev->private == NULL) {
@@ -388,8 +379,7 @@ static int raid0_run(mddev_t *mddev)
388 379
389 blk_queue_merge_bvec(mddev->queue, raid0_mergeable_bvec); 380 blk_queue_merge_bvec(mddev->queue, raid0_mergeable_bvec);
390 dump_zones(mddev); 381 dump_zones(mddev);
391 md_integrity_register(mddev); 382 return md_integrity_register(mddev);
392 return 0;
393} 383}
394 384
395static int raid0_stop(mddev_t *mddev) 385static int raid0_stop(mddev_t *mddev)
@@ -483,8 +473,8 @@ static int raid0_make_request(mddev_t *mddev, struct bio *bio)
483 struct strip_zone *zone; 473 struct strip_zone *zone;
484 mdk_rdev_t *tmp_dev; 474 mdk_rdev_t *tmp_dev;
485 475
486 if (unlikely(bio->bi_rw & REQ_HARDBARRIER)) { 476 if (unlikely(bio->bi_rw & REQ_FLUSH)) {
487 md_barrier_request(mddev, bio); 477 md_flush_request(mddev, bio);
488 return 0; 478 return 0;
489 } 479 }
490 480
@@ -644,12 +634,39 @@ static void *raid0_takeover_raid10(mddev_t *mddev)
644 return priv_conf; 634 return priv_conf;
645} 635}
646 636
637static void *raid0_takeover_raid1(mddev_t *mddev)
638{
639 raid0_conf_t *priv_conf;
640
641 /* Check layout:
642 * - (N - 1) mirror drives must be already faulty
643 */
644 if ((mddev->raid_disks - 1) != mddev->degraded) {
 645 printk(KERN_ERR "md/raid0:%s: (N - 1) mirror drives must already be faulty!\n",
646 mdname(mddev));
647 return ERR_PTR(-EINVAL);
648 }
649
650 /* Set new parameters */
651 mddev->new_level = 0;
652 mddev->new_layout = 0;
653 mddev->new_chunk_sectors = 128; /* by default set chunk size to 64k */
654 mddev->delta_disks = 1 - mddev->raid_disks;
655 mddev->raid_disks = 1;
656 /* make sure it will be not marked as dirty */
657 mddev->recovery_cp = MaxSector;
658
659 create_strip_zones(mddev, &priv_conf);
660 return priv_conf;
661}
662
647static void *raid0_takeover(mddev_t *mddev) 663static void *raid0_takeover(mddev_t *mddev)
648{ 664{
649 /* raid0 can take over: 665 /* raid0 can take over:
650 * raid4 - if all data disks are active. 666 * raid4 - if all data disks are active.
651 * raid5 - providing it is Raid4 layout and one disk is faulty 667 * raid5 - providing it is Raid4 layout and one disk is faulty
652 * raid10 - assuming we have all necessary active disks 668 * raid10 - assuming we have all necessary active disks
 669 * raid1 - with (N - 1) mirror drives faulty
653 */ 670 */
654 if (mddev->level == 4) 671 if (mddev->level == 4)
655 return raid0_takeover_raid45(mddev); 672 return raid0_takeover_raid45(mddev);
@@ -665,6 +682,12 @@ static void *raid0_takeover(mddev_t *mddev)
665 if (mddev->level == 10) 682 if (mddev->level == 10)
666 return raid0_takeover_raid10(mddev); 683 return raid0_takeover_raid10(mddev);
667 684
685 if (mddev->level == 1)
686 return raid0_takeover_raid1(mddev);
687
688 printk(KERN_ERR "Takeover from raid%i to raid0 not supported\n",
689 mddev->level);
690
668 return ERR_PTR(-EINVAL); 691 return ERR_PTR(-EINVAL);
669} 692}
670 693
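
A minimal userspace sketch of the raid1-to-raid0 takeover rule added above: the conversion is only accepted once (N - 1) mirrors are faulty, and the surviving disk becomes a single-drive raid0 with a 64 KiB chunk (128 sectors of 512 bytes). The struct and field names below are illustrative stand-ins for mddev_t, not the driver's types.

#include <stdio.h>

/* Illustrative stand-in for the md device state; not the kernel's mddev_t. */
struct md_state {
	int level;        /* current personality: 1 means raid1 */
	int raid_disks;   /* N mirrors */
	int degraded;     /* how many mirrors are faulty */
};

/* Model of the takeover precondition: exactly one mirror may still be active. */
static int raid1_to_raid0_allowed(const struct md_state *md)
{
	return md->level == 1 && md->degraded == md->raid_disks - 1;
}

int main(void)
{
	struct md_state md = { .level = 1, .raid_disks = 2, .degraded = 1 };
	int chunk_sectors = 128;  /* 128 * 512 B = 64 KiB default chunk */

	if (raid1_to_raid0_allowed(&md))
		printf("takeover ok: 1 data disk, %d KiB chunk\n",
		       chunk_sectors * 512 / 1024);
	else
		printf("takeover refused: %d of %d mirrors faulty\n",
		       md.degraded, md.raid_disks);
	return 0;
}
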
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 0b830bbe1d8b..f7431b6d8447 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -52,23 +52,16 @@
52#define NR_RAID1_BIOS 256 52#define NR_RAID1_BIOS 256
53 53
54 54
55static void unplug_slaves(mddev_t *mddev);
56
57static void allow_barrier(conf_t *conf); 55static void allow_barrier(conf_t *conf);
58static void lower_barrier(conf_t *conf); 56static void lower_barrier(conf_t *conf);
59 57
60static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data) 58static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data)
61{ 59{
62 struct pool_info *pi = data; 60 struct pool_info *pi = data;
63 r1bio_t *r1_bio;
64 int size = offsetof(r1bio_t, bios[pi->raid_disks]); 61 int size = offsetof(r1bio_t, bios[pi->raid_disks]);
65 62
66 /* allocate a r1bio with room for raid_disks entries in the bios array */ 63 /* allocate a r1bio with room for raid_disks entries in the bios array */
67 r1_bio = kzalloc(size, gfp_flags); 64 return kzalloc(size, gfp_flags);
68 if (!r1_bio && pi->mddev)
69 unplug_slaves(pi->mddev);
70
71 return r1_bio;
72} 65}
73 66
74static void r1bio_pool_free(void *r1_bio, void *data) 67static void r1bio_pool_free(void *r1_bio, void *data)
@@ -91,16 +84,14 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
91 int i, j; 84 int i, j;
92 85
93 r1_bio = r1bio_pool_alloc(gfp_flags, pi); 86 r1_bio = r1bio_pool_alloc(gfp_flags, pi);
94 if (!r1_bio) { 87 if (!r1_bio)
95 unplug_slaves(pi->mddev);
96 return NULL; 88 return NULL;
97 }
98 89
99 /* 90 /*
100 * Allocate bios : 1 for reading, n-1 for writing 91 * Allocate bios : 1 for reading, n-1 for writing
101 */ 92 */
102 for (j = pi->raid_disks ; j-- ; ) { 93 for (j = pi->raid_disks ; j-- ; ) {
103 bio = bio_alloc(gfp_flags, RESYNC_PAGES); 94 bio = bio_kmalloc(gfp_flags, RESYNC_PAGES);
104 if (!bio) 95 if (!bio)
105 goto out_free_bio; 96 goto out_free_bio;
106 r1_bio->bios[j] = bio; 97 r1_bio->bios[j] = bio;
@@ -306,6 +297,29 @@ static void raid1_end_read_request(struct bio *bio, int error)
306 rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev); 297 rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev);
307} 298}
308 299
300static void r1_bio_write_done(r1bio_t *r1_bio)
301{
302 if (atomic_dec_and_test(&r1_bio->remaining))
303 {
304 /* it really is the end of this request */
305 if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
306 /* free extra copy of the data pages */
307 int i = r1_bio->behind_page_count;
308 while (i--)
309 safe_put_page(r1_bio->behind_pages[i]);
310 kfree(r1_bio->behind_pages);
311 r1_bio->behind_pages = NULL;
312 }
313 /* clear the bitmap if all writes complete successfully */
314 bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector,
315 r1_bio->sectors,
316 !test_bit(R1BIO_Degraded, &r1_bio->state),
317 test_bit(R1BIO_BehindIO, &r1_bio->state));
318 md_write_end(r1_bio->mddev);
319 raid_end_bio_io(r1_bio);
320 }
321}
322
309static void raid1_end_write_request(struct bio *bio, int error) 323static void raid1_end_write_request(struct bio *bio, int error)
310{ 324{
311 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 325 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
@@ -319,84 +333,61 @@ static void raid1_end_write_request(struct bio *bio, int error)
319 if (r1_bio->bios[mirror] == bio) 333 if (r1_bio->bios[mirror] == bio)
320 break; 334 break;
321 335
322 if (error == -EOPNOTSUPP && test_bit(R1BIO_Barrier, &r1_bio->state)) { 336 /*
323 set_bit(BarriersNotsupp, &conf->mirrors[mirror].rdev->flags); 337 * 'one mirror IO has finished' event handler:
324 set_bit(R1BIO_BarrierRetry, &r1_bio->state); 338 */
325 r1_bio->mddev->barriers_work = 0; 339 r1_bio->bios[mirror] = NULL;
326 /* Don't rdev_dec_pending in this branch - keep it for the retry */ 340 to_put = bio;
327 } else { 341 if (!uptodate) {
342 md_error(r1_bio->mddev, conf->mirrors[mirror].rdev);
343 /* an I/O failed, we can't clear the bitmap */
344 set_bit(R1BIO_Degraded, &r1_bio->state);
345 } else
328 /* 346 /*
329 * this branch is our 'one mirror IO has finished' event handler: 347 * Set R1BIO_Uptodate in our master bio, so that we
 348 * will return a good error code to the higher
349 * levels even if IO on some other mirrored buffer
350 * fails.
351 *
352 * The 'master' represents the composite IO operation
353 * to user-side. So if something waits for IO, then it
354 * will wait for the 'master' bio.
330 */ 355 */
331 r1_bio->bios[mirror] = NULL; 356 set_bit(R1BIO_Uptodate, &r1_bio->state);
332 to_put = bio; 357
333 if (!uptodate) { 358 update_head_pos(mirror, r1_bio);
334 md_error(r1_bio->mddev, conf->mirrors[mirror].rdev); 359
335 /* an I/O failed, we can't clear the bitmap */ 360 if (behind) {
336 set_bit(R1BIO_Degraded, &r1_bio->state); 361 if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags))
337 } else 362 atomic_dec(&r1_bio->behind_remaining);
338 /* 363
339 * Set R1BIO_Uptodate in our master bio, so that 364 /*
340 * we will return a good error code for to the higher 365 * In behind mode, we ACK the master bio once the I/O
341 * levels even if IO on some other mirrored buffer fails. 366 * has safely reached all non-writemostly
342 * 367 * disks. Setting the Returned bit ensures that this
343 * The 'master' represents the composite IO operation to 368 * gets done only once -- we don't ever want to return
344 * user-side. So if something waits for IO, then it will 369 * -EIO here, instead we'll wait
345 * wait for the 'master' bio. 370 */
346 */ 371 if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) &&
347 set_bit(R1BIO_Uptodate, &r1_bio->state); 372 test_bit(R1BIO_Uptodate, &r1_bio->state)) {
348 373 /* Maybe we can return now */
349 update_head_pos(mirror, r1_bio); 374 if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
350 375 struct bio *mbio = r1_bio->master_bio;
351 if (behind) { 376 PRINTK(KERN_DEBUG "raid1: behind end write sectors %llu-%llu\n",
352 if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags)) 377 (unsigned long long) mbio->bi_sector,
353 atomic_dec(&r1_bio->behind_remaining); 378 (unsigned long long) mbio->bi_sector +
354 379 (mbio->bi_size >> 9) - 1);
355 /* In behind mode, we ACK the master bio once the I/O has safely 380 bio_endio(mbio, 0);
356 * reached all non-writemostly disks. Setting the Returned bit
357 * ensures that this gets done only once -- we don't ever want to
358 * return -EIO here, instead we'll wait */
359
360 if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) &&
361 test_bit(R1BIO_Uptodate, &r1_bio->state)) {
362 /* Maybe we can return now */
363 if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
364 struct bio *mbio = r1_bio->master_bio;
365 PRINTK(KERN_DEBUG "raid1: behind end write sectors %llu-%llu\n",
366 (unsigned long long) mbio->bi_sector,
367 (unsigned long long) mbio->bi_sector +
368 (mbio->bi_size >> 9) - 1);
369 bio_endio(mbio, 0);
370 }
371 } 381 }
372 } 382 }
373 rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev);
374 } 383 }
384 rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev);
385
375 /* 386 /*
376 *
377 * Let's see if all mirrored write operations have finished 387 * Let's see if all mirrored write operations have finished
378 * already. 388 * already.
379 */ 389 */
380 if (atomic_dec_and_test(&r1_bio->remaining)) { 390 r1_bio_write_done(r1_bio);
381 if (test_bit(R1BIO_BarrierRetry, &r1_bio->state))
382 reschedule_retry(r1_bio);
383 else {
384 /* it really is the end of this request */
385 if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
386 /* free extra copy of the data pages */
387 int i = bio->bi_vcnt;
388 while (i--)
389 safe_put_page(bio->bi_io_vec[i].bv_page);
390 }
391 /* clear the bitmap if all writes complete successfully */
392 bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector,
393 r1_bio->sectors,
394 !test_bit(R1BIO_Degraded, &r1_bio->state),
395 behind);
396 md_write_end(r1_bio->mddev);
397 raid_end_bio_io(r1_bio);
398 }
399 }
400 391
401 if (to_put) 392 if (to_put)
402 bio_put(to_put); 393 bio_put(to_put);
@@ -420,11 +411,13 @@ static void raid1_end_write_request(struct bio *bio, int error)
420static int read_balance(conf_t *conf, r1bio_t *r1_bio) 411static int read_balance(conf_t *conf, r1bio_t *r1_bio)
421{ 412{
422 const sector_t this_sector = r1_bio->sector; 413 const sector_t this_sector = r1_bio->sector;
423 int new_disk = conf->last_used, disk = new_disk;
424 int wonly_disk = -1;
425 const int sectors = r1_bio->sectors; 414 const int sectors = r1_bio->sectors;
426 sector_t new_distance, current_distance; 415 int start_disk;
416 int best_disk;
417 int i;
418 sector_t best_dist;
427 mdk_rdev_t *rdev; 419 mdk_rdev_t *rdev;
420 int choose_first;
428 421
429 rcu_read_lock(); 422 rcu_read_lock();
430 /* 423 /*
@@ -433,100 +426,63 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
433 * We take the first readable disk when above the resync window. 426 * We take the first readable disk when above the resync window.
434 */ 427 */
435 retry: 428 retry:
429 best_disk = -1;
430 best_dist = MaxSector;
436 if (conf->mddev->recovery_cp < MaxSector && 431 if (conf->mddev->recovery_cp < MaxSector &&
437 (this_sector + sectors >= conf->next_resync)) { 432 (this_sector + sectors >= conf->next_resync)) {
438 /* Choose the first operational device, for consistancy */ 433 choose_first = 1;
439 new_disk = 0; 434 start_disk = 0;
440 435 } else {
441 for (rdev = rcu_dereference(conf->mirrors[new_disk].rdev); 436 choose_first = 0;
442 r1_bio->bios[new_disk] == IO_BLOCKED || 437 start_disk = conf->last_used;
443 !rdev || !test_bit(In_sync, &rdev->flags)
444 || test_bit(WriteMostly, &rdev->flags);
445 rdev = rcu_dereference(conf->mirrors[++new_disk].rdev)) {
446
447 if (rdev && test_bit(In_sync, &rdev->flags) &&
448 r1_bio->bios[new_disk] != IO_BLOCKED)
449 wonly_disk = new_disk;
450
451 if (new_disk == conf->raid_disks - 1) {
452 new_disk = wonly_disk;
453 break;
454 }
455 }
456 goto rb_out;
457 }
458
459
460 /* make sure the disk is operational */
461 for (rdev = rcu_dereference(conf->mirrors[new_disk].rdev);
462 r1_bio->bios[new_disk] == IO_BLOCKED ||
463 !rdev || !test_bit(In_sync, &rdev->flags) ||
464 test_bit(WriteMostly, &rdev->flags);
465 rdev = rcu_dereference(conf->mirrors[new_disk].rdev)) {
466
467 if (rdev && test_bit(In_sync, &rdev->flags) &&
468 r1_bio->bios[new_disk] != IO_BLOCKED)
469 wonly_disk = new_disk;
470
471 if (new_disk <= 0)
472 new_disk = conf->raid_disks;
473 new_disk--;
474 if (new_disk == disk) {
475 new_disk = wonly_disk;
476 break;
477 }
478 } 438 }
479 439
480 if (new_disk < 0) 440 for (i = 0 ; i < conf->raid_disks ; i++) {
481 goto rb_out; 441 sector_t dist;
482 442 int disk = start_disk + i;
483 disk = new_disk; 443 if (disk >= conf->raid_disks)
484 /* now disk == new_disk == starting point for search */ 444 disk -= conf->raid_disks;
485
486 /*
487 * Don't change to another disk for sequential reads:
488 */
489 if (conf->next_seq_sect == this_sector)
490 goto rb_out;
491 if (this_sector == conf->mirrors[new_disk].head_position)
492 goto rb_out;
493
494 current_distance = abs(this_sector - conf->mirrors[disk].head_position);
495
496 /* Find the disk whose head is closest */
497
498 do {
499 if (disk <= 0)
500 disk = conf->raid_disks;
501 disk--;
502 445
503 rdev = rcu_dereference(conf->mirrors[disk].rdev); 446 rdev = rcu_dereference(conf->mirrors[disk].rdev);
504 447 if (r1_bio->bios[disk] == IO_BLOCKED
505 if (!rdev || r1_bio->bios[disk] == IO_BLOCKED || 448 || rdev == NULL
506 !test_bit(In_sync, &rdev->flags) || 449 || test_bit(Faulty, &rdev->flags))
507 test_bit(WriteMostly, &rdev->flags))
508 continue; 450 continue;
509 451 if (!test_bit(In_sync, &rdev->flags) &&
510 if (!atomic_read(&rdev->nr_pending)) { 452 rdev->recovery_offset < this_sector + sectors)
511 new_disk = disk; 453 continue;
454 if (test_bit(WriteMostly, &rdev->flags)) {
455 /* Don't balance among write-mostly, just
456 * use the first as a last resort */
457 if (best_disk < 0)
458 best_disk = disk;
459 continue;
460 }
461 /* This is a reasonable device to use. It might
462 * even be best.
463 */
464 dist = abs(this_sector - conf->mirrors[disk].head_position);
465 if (choose_first
466 /* Don't change to another disk for sequential reads */
467 || conf->next_seq_sect == this_sector
468 || dist == 0
469 /* If device is idle, use it */
470 || atomic_read(&rdev->nr_pending) == 0) {
471 best_disk = disk;
512 break; 472 break;
513 } 473 }
514 new_distance = abs(this_sector - conf->mirrors[disk].head_position); 474 if (dist < best_dist) {
515 if (new_distance < current_distance) { 475 best_dist = dist;
516 current_distance = new_distance; 476 best_disk = disk;
517 new_disk = disk;
518 } 477 }
519 } while (disk != conf->last_used); 478 }
520
521 rb_out:
522
523 479
524 if (new_disk >= 0) { 480 if (best_disk >= 0) {
525 rdev = rcu_dereference(conf->mirrors[new_disk].rdev); 481 rdev = rcu_dereference(conf->mirrors[best_disk].rdev);
526 if (!rdev) 482 if (!rdev)
527 goto retry; 483 goto retry;
528 atomic_inc(&rdev->nr_pending); 484 atomic_inc(&rdev->nr_pending);
529 if (!test_bit(In_sync, &rdev->flags)) { 485 if (test_bit(Faulty, &rdev->flags)) {
530 /* cannot risk returning a device that failed 486 /* cannot risk returning a device that failed
531 * before we inc'ed nr_pending 487 * before we inc'ed nr_pending
532 */ 488 */
@@ -534,59 +490,26 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
534 goto retry; 490 goto retry;
535 } 491 }
536 conf->next_seq_sect = this_sector + sectors; 492 conf->next_seq_sect = this_sector + sectors;
537 conf->last_used = new_disk; 493 conf->last_used = best_disk;
538 } 494 }
539 rcu_read_unlock(); 495 rcu_read_unlock();
540 496
541 return new_disk; 497 return best_disk;
542} 498}
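
The rewritten read_balance() above reduces to a single pass over the mirrors: skip faulty, out-of-sync and write-mostly devices, stop immediately on a zero-distance or idle disk, and otherwise remember the smallest seek distance. The userspace model below restates that selection (the resync-window and sequential-read special cases are omitted); the mirror array and its fields are illustrative, not the driver's conf_t/mirror_info types.

#include <stdio.h>
#include <stdlib.h>

struct mirror {
	long long head_position;  /* last sector this disk serviced */
	int usable;               /* in sync and not faulty */
	int write_mostly;         /* only used as a last resort */
	int pending;              /* outstanding requests */
};

/* Pick a read target: an idle or zero-distance disk wins outright,
 * otherwise the smallest head distance; write-mostly disks are a fallback. */
static int pick_read_disk(struct mirror *m, int n, long long sector)
{
	long long best_dist = -1;
	int best = -1, i;

	for (i = 0; i < n; i++) {
		long long dist;

		if (!m[i].usable)
			continue;
		if (m[i].write_mostly) {
			if (best < 0)
				best = i;          /* last resort only */
			continue;
		}
		dist = llabs(sector - m[i].head_position);
		if (dist == 0 || m[i].pending == 0)
			return i;                  /* good enough, stop searching */
		if (best_dist < 0 || dist < best_dist) {
			best_dist = dist;
			best = i;
		}
	}
	return best;                               /* -1: nothing readable */
}

int main(void)
{
	struct mirror m[3] = {
		{ .head_position = 1000, .usable = 1, .pending = 2 },
		{ .head_position = 5000, .usable = 1, .pending = 1 },
		{ .head_position = 9000, .usable = 0 },
	};

	printf("read from mirror %d\n", pick_read_disk(m, 3, 4096));
	return 0;
}
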
543 499
544static void unplug_slaves(mddev_t *mddev) 500int md_raid1_congested(mddev_t *mddev, int bits)
545{ 501{
546 conf_t *conf = mddev->private; 502 conf_t *conf = mddev->private;
547 int i;
548
549 rcu_read_lock();
550 for (i=0; i<mddev->raid_disks; i++) {
551 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
552 if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) {
553 struct request_queue *r_queue = bdev_get_queue(rdev->bdev);
554
555 atomic_inc(&rdev->nr_pending);
556 rcu_read_unlock();
557
558 blk_unplug(r_queue);
559
560 rdev_dec_pending(rdev, mddev);
561 rcu_read_lock();
562 }
563 }
564 rcu_read_unlock();
565}
566
567static void raid1_unplug(struct request_queue *q)
568{
569 mddev_t *mddev = q->queuedata;
570
571 unplug_slaves(mddev);
572 md_wakeup_thread(mddev->thread);
573}
574
575static int raid1_congested(void *data, int bits)
576{
577 mddev_t *mddev = data;
578 conf_t *conf = mddev->private;
579 int i, ret = 0; 503 int i, ret = 0;
580 504
581 if (mddev_congested(mddev, bits))
582 return 1;
583
584 rcu_read_lock(); 505 rcu_read_lock();
585 for (i = 0; i < mddev->raid_disks; i++) { 506 for (i = 0; i < mddev->raid_disks; i++) {
586 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); 507 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
587 if (rdev && !test_bit(Faulty, &rdev->flags)) { 508 if (rdev && !test_bit(Faulty, &rdev->flags)) {
588 struct request_queue *q = bdev_get_queue(rdev->bdev); 509 struct request_queue *q = bdev_get_queue(rdev->bdev);
589 510
511 BUG_ON(!q);
512
590 /* Note the '|| 1' - when read_balance prefers 513 /* Note the '|| 1' - when read_balance prefers
591 * non-congested targets, it can be removed 514 * non-congested targets, it can be removed
592 */ 515 */
@@ -599,22 +522,26 @@ static int raid1_congested(void *data, int bits)
599 rcu_read_unlock(); 522 rcu_read_unlock();
600 return ret; 523 return ret;
601} 524}
525EXPORT_SYMBOL_GPL(md_raid1_congested);
602 526
527static int raid1_congested(void *data, int bits)
528{
529 mddev_t *mddev = data;
603 530
604static int flush_pending_writes(conf_t *conf) 531 return mddev_congested(mddev, bits) ||
532 md_raid1_congested(mddev, bits);
533}
534
535static void flush_pending_writes(conf_t *conf)
605{ 536{
606 /* Any writes that have been queued but are awaiting 537 /* Any writes that have been queued but are awaiting
607 * bitmap updates get flushed here. 538 * bitmap updates get flushed here.
608 * We return 1 if any requests were actually submitted.
609 */ 539 */
610 int rv = 0;
611
612 spin_lock_irq(&conf->device_lock); 540 spin_lock_irq(&conf->device_lock);
613 541
614 if (conf->pending_bio_list.head) { 542 if (conf->pending_bio_list.head) {
615 struct bio *bio; 543 struct bio *bio;
616 bio = bio_list_get(&conf->pending_bio_list); 544 bio = bio_list_get(&conf->pending_bio_list);
617 blk_remove_plug(conf->mddev->queue);
618 spin_unlock_irq(&conf->device_lock); 545 spin_unlock_irq(&conf->device_lock);
619 /* flush any pending bitmap writes to 546 /* flush any pending bitmap writes to
620 * disk before proceeding w/ I/O */ 547 * disk before proceeding w/ I/O */
@@ -626,10 +553,8 @@ static int flush_pending_writes(conf_t *conf)
626 generic_make_request(bio); 553 generic_make_request(bio);
627 bio = next; 554 bio = next;
628 } 555 }
629 rv = 1;
630 } else 556 } else
631 spin_unlock_irq(&conf->device_lock); 557 spin_unlock_irq(&conf->device_lock);
632 return rv;
633} 558}
634 559
635/* Barriers.... 560/* Barriers....
@@ -661,17 +586,15 @@ static void raise_barrier(conf_t *conf)
661 586
662 /* Wait until no block IO is waiting */ 587 /* Wait until no block IO is waiting */
663 wait_event_lock_irq(conf->wait_barrier, !conf->nr_waiting, 588 wait_event_lock_irq(conf->wait_barrier, !conf->nr_waiting,
664 conf->resync_lock, 589 conf->resync_lock, );
665 raid1_unplug(conf->mddev->queue));
666 590
667 /* block any new IO from starting */ 591 /* block any new IO from starting */
668 conf->barrier++; 592 conf->barrier++;
669 593
670 /* No wait for all pending IO to complete */ 594 /* Now wait for all pending IO to complete */
671 wait_event_lock_irq(conf->wait_barrier, 595 wait_event_lock_irq(conf->wait_barrier,
672 !conf->nr_pending && conf->barrier < RESYNC_DEPTH, 596 !conf->nr_pending && conf->barrier < RESYNC_DEPTH,
673 conf->resync_lock, 597 conf->resync_lock, );
674 raid1_unplug(conf->mddev->queue));
675 598
676 spin_unlock_irq(&conf->resync_lock); 599 spin_unlock_irq(&conf->resync_lock);
677} 600}
@@ -693,7 +616,7 @@ static void wait_barrier(conf_t *conf)
693 conf->nr_waiting++; 616 conf->nr_waiting++;
694 wait_event_lock_irq(conf->wait_barrier, !conf->barrier, 617 wait_event_lock_irq(conf->wait_barrier, !conf->barrier,
695 conf->resync_lock, 618 conf->resync_lock,
696 raid1_unplug(conf->mddev->queue)); 619 );
697 conf->nr_waiting--; 620 conf->nr_waiting--;
698 } 621 }
699 conf->nr_pending++; 622 conf->nr_pending++;
@@ -729,8 +652,7 @@ static void freeze_array(conf_t *conf)
729 wait_event_lock_irq(conf->wait_barrier, 652 wait_event_lock_irq(conf->wait_barrier,
730 conf->nr_pending == conf->nr_queued+1, 653 conf->nr_pending == conf->nr_queued+1,
731 conf->resync_lock, 654 conf->resync_lock,
732 ({ flush_pending_writes(conf); 655 flush_pending_writes(conf));
733 raid1_unplug(conf->mddev->queue); }));
734 spin_unlock_irq(&conf->resync_lock); 656 spin_unlock_irq(&conf->resync_lock);
735} 657}
736static void unfreeze_array(conf_t *conf) 658static void unfreeze_array(conf_t *conf)
@@ -744,15 +666,16 @@ static void unfreeze_array(conf_t *conf)
744} 666}
745 667
746 668
747/* duplicate the data pages for behind I/O */ 669/* duplicate the data pages for behind I/O
748static struct page **alloc_behind_pages(struct bio *bio) 670 */
671static void alloc_behind_pages(struct bio *bio, r1bio_t *r1_bio)
749{ 672{
750 int i; 673 int i;
751 struct bio_vec *bvec; 674 struct bio_vec *bvec;
752 struct page **pages = kzalloc(bio->bi_vcnt * sizeof(struct page *), 675 struct page **pages = kzalloc(bio->bi_vcnt * sizeof(struct page*),
753 GFP_NOIO); 676 GFP_NOIO);
754 if (unlikely(!pages)) 677 if (unlikely(!pages))
755 goto do_sync_io; 678 return;
756 679
757 bio_for_each_segment(bvec, bio, i) { 680 bio_for_each_segment(bvec, bio, i) {
758 pages[i] = alloc_page(GFP_NOIO); 681 pages[i] = alloc_page(GFP_NOIO);
@@ -763,16 +686,17 @@ static struct page **alloc_behind_pages(struct bio *bio)
763 kunmap(pages[i]); 686 kunmap(pages[i]);
764 kunmap(bvec->bv_page); 687 kunmap(bvec->bv_page);
765 } 688 }
766 689 r1_bio->behind_pages = pages;
767 return pages; 690 r1_bio->behind_page_count = bio->bi_vcnt;
691 set_bit(R1BIO_BehindIO, &r1_bio->state);
692 return;
768 693
769do_sync_io: 694do_sync_io:
770 if (pages) 695 for (i = 0; i < bio->bi_vcnt; i++)
771 for (i = 0; i < bio->bi_vcnt && pages[i]; i++) 696 if (pages[i])
772 put_page(pages[i]); 697 put_page(pages[i]);
773 kfree(pages); 698 kfree(pages);
774 PRINTK("%dB behind alloc failed, doing sync I/O\n", bio->bi_size); 699 PRINTK("%dB behind alloc failed, doing sync I/O\n", bio->bi_size);
775 return NULL;
776} 700}
777 701
778static int make_request(mddev_t *mddev, struct bio * bio) 702static int make_request(mddev_t *mddev, struct bio * bio)
@@ -784,20 +708,16 @@ static int make_request(mddev_t *mddev, struct bio * bio)
784 int i, targets = 0, disks; 708 int i, targets = 0, disks;
785 struct bitmap *bitmap; 709 struct bitmap *bitmap;
786 unsigned long flags; 710 unsigned long flags;
787 struct bio_list bl;
788 struct page **behind_pages = NULL;
789 const int rw = bio_data_dir(bio); 711 const int rw = bio_data_dir(bio);
790 const unsigned long do_sync = (bio->bi_rw & REQ_SYNC); 712 const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
791 unsigned long do_barriers; 713 const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA));
792 mdk_rdev_t *blocked_rdev; 714 mdk_rdev_t *blocked_rdev;
715 int plugged;
793 716
794 /* 717 /*
795 * Register the new request and wait if the reconstruction 718 * Register the new request and wait if the reconstruction
796 * thread has put up a bar for new requests. 719 * thread has put up a bar for new requests.
797 * Continue immediately if no resync is active currently. 720 * Continue immediately if no resync is active currently.
798 * We test barriers_work *after* md_write_start as md_write_start
799 * may cause the first superblock write, and that will check out
800 * if barriers work.
801 */ 721 */
802 722
803 md_write_start(mddev, bio); /* wait on superblock update early */ 723 md_write_start(mddev, bio); /* wait on superblock update early */
@@ -821,13 +741,6 @@ static int make_request(mddev_t *mddev, struct bio * bio)
821 } 741 }
822 finish_wait(&conf->wait_barrier, &w); 742 finish_wait(&conf->wait_barrier, &w);
823 } 743 }
824 if (unlikely(!mddev->barriers_work &&
825 (bio->bi_rw & REQ_HARDBARRIER))) {
826 if (rw == WRITE)
827 md_write_end(mddev);
828 bio_endio(bio, -EOPNOTSUPP);
829 return 0;
830 }
831 744
832 wait_barrier(conf); 745 wait_barrier(conf);
833 746
@@ -870,7 +783,7 @@ static int make_request(mddev_t *mddev, struct bio * bio)
870 } 783 }
871 r1_bio->read_disk = rdisk; 784 r1_bio->read_disk = rdisk;
872 785
873 read_bio = bio_clone(bio, GFP_NOIO); 786 read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
874 787
875 r1_bio->bios[rdisk] = read_bio; 788 r1_bio->bios[rdisk] = read_bio;
876 789
@@ -891,14 +804,9 @@ static int make_request(mddev_t *mddev, struct bio * bio)
891 * inc refcount on their rdev. Record them by setting 804 * inc refcount on their rdev. Record them by setting
892 * bios[x] to bio 805 * bios[x] to bio
893 */ 806 */
807 plugged = mddev_check_plugged(mddev);
808
894 disks = conf->raid_disks; 809 disks = conf->raid_disks;
895#if 0
896 { static int first=1;
897 if (first) printk("First Write sector %llu disks %d\n",
898 (unsigned long long)r1_bio->sector, disks);
899 first = 0;
900 }
901#endif
902 retry_write: 810 retry_write:
903 blocked_rdev = NULL; 811 blocked_rdev = NULL;
904 rcu_read_lock(); 812 rcu_read_lock();
@@ -952,33 +860,29 @@ static int make_request(mddev_t *mddev, struct bio * bio)
952 if (bitmap && 860 if (bitmap &&
953 (atomic_read(&bitmap->behind_writes) 861 (atomic_read(&bitmap->behind_writes)
954 < mddev->bitmap_info.max_write_behind) && 862 < mddev->bitmap_info.max_write_behind) &&
955 !waitqueue_active(&bitmap->behind_wait) && 863 !waitqueue_active(&bitmap->behind_wait))
956 (behind_pages = alloc_behind_pages(bio)) != NULL) 864 alloc_behind_pages(bio, r1_bio);
957 set_bit(R1BIO_BehindIO, &r1_bio->state);
958 865
959 atomic_set(&r1_bio->remaining, 0); 866 atomic_set(&r1_bio->remaining, 1);
960 atomic_set(&r1_bio->behind_remaining, 0); 867 atomic_set(&r1_bio->behind_remaining, 0);
961 868
962 do_barriers = bio->bi_rw & REQ_HARDBARRIER; 869 bitmap_startwrite(bitmap, bio->bi_sector, r1_bio->sectors,
963 if (do_barriers) 870 test_bit(R1BIO_BehindIO, &r1_bio->state));
964 set_bit(R1BIO_Barrier, &r1_bio->state);
965
966 bio_list_init(&bl);
967 for (i = 0; i < disks; i++) { 871 for (i = 0; i < disks; i++) {
968 struct bio *mbio; 872 struct bio *mbio;
969 if (!r1_bio->bios[i]) 873 if (!r1_bio->bios[i])
970 continue; 874 continue;
971 875
972 mbio = bio_clone(bio, GFP_NOIO); 876 mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
973 r1_bio->bios[i] = mbio; 877 r1_bio->bios[i] = mbio;
974 878
975 mbio->bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset; 879 mbio->bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset;
976 mbio->bi_bdev = conf->mirrors[i].rdev->bdev; 880 mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
977 mbio->bi_end_io = raid1_end_write_request; 881 mbio->bi_end_io = raid1_end_write_request;
978 mbio->bi_rw = WRITE | do_barriers | do_sync; 882 mbio->bi_rw = WRITE | do_flush_fua | do_sync;
979 mbio->bi_private = r1_bio; 883 mbio->bi_private = r1_bio;
980 884
981 if (behind_pages) { 885 if (r1_bio->behind_pages) {
982 struct bio_vec *bvec; 886 struct bio_vec *bvec;
983 int j; 887 int j;
984 888
@@ -986,39 +890,27 @@ static int make_request(mddev_t *mddev, struct bio * bio)
986 * we clear any unused pointer in the io_vec, rather 890 * we clear any unused pointer in the io_vec, rather
987 * than leave them unchanged. This is important 891 * than leave them unchanged. This is important
988 * because when we come to free the pages, we won't 892 * because when we come to free the pages, we won't
989 * know the originial bi_idx, so we just free 893 * know the original bi_idx, so we just free
990 * them all 894 * them all
991 */ 895 */
992 __bio_for_each_segment(bvec, mbio, j, 0) 896 __bio_for_each_segment(bvec, mbio, j, 0)
993 bvec->bv_page = behind_pages[j]; 897 bvec->bv_page = r1_bio->behind_pages[j];
994 if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags)) 898 if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags))
995 atomic_inc(&r1_bio->behind_remaining); 899 atomic_inc(&r1_bio->behind_remaining);
996 } 900 }
997 901
998 atomic_inc(&r1_bio->remaining); 902 atomic_inc(&r1_bio->remaining);
999 903 spin_lock_irqsave(&conf->device_lock, flags);
1000 bio_list_add(&bl, mbio); 904 bio_list_add(&conf->pending_bio_list, mbio);
905 spin_unlock_irqrestore(&conf->device_lock, flags);
1001 } 906 }
1002 kfree(behind_pages); /* the behind pages are attached to the bios now */ 907 r1_bio_write_done(r1_bio);
1003
1004 bitmap_startwrite(bitmap, bio->bi_sector, r1_bio->sectors,
1005 test_bit(R1BIO_BehindIO, &r1_bio->state));
1006 spin_lock_irqsave(&conf->device_lock, flags);
1007 bio_list_merge(&conf->pending_bio_list, &bl);
1008 bio_list_init(&bl);
1009 908
1010 blk_plug_device(mddev->queue); 909 /* In case raid1d snuck in to freeze_array */
1011 spin_unlock_irqrestore(&conf->device_lock, flags);
1012
1013 /* In case raid1d snuck into freeze_array */
1014 wake_up(&conf->wait_barrier); 910 wake_up(&conf->wait_barrier);
1015 911
1016 if (do_sync) 912 if (do_sync || !bitmap || !plugged)
1017 md_wakeup_thread(mddev->thread); 913 md_wakeup_thread(mddev->thread);
1018#if 0
1019 while ((bio = bio_list_pop(&bl)) != NULL)
1020 generic_make_request(bio);
1021#endif
1022 914
1023 return 0; 915 return 0;
1024} 916}
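
The reworked write path above primes r1_bio->remaining at 1, takes one extra count for every mirror bio it queues, and drops that initial count through r1_bio_write_done() once submission is finished, so the request cannot complete while it is still being assembled. The toy program below shows the same counting idiom in plain C; the kernel uses atomic_inc()/atomic_dec_and_test() rather than plain integers, and the names here are illustrative.

#include <stdio.h>

/* Toy completion counter mirroring the r1_bio->remaining idiom. */
struct request_state {
	int remaining;   /* starts at 1: the "still being built" reference */
	int completed;
};

static void write_done(struct request_state *rq)
{
	if (--rq->remaining == 0) {
		rq->completed = 1;   /* all mirror writes have finished */
		printf("request complete\n");
	}
}

int main(void)
{
	struct request_state rq = { .remaining = 1, .completed = 0 };
	int mirrors = 3, i;

	for (i = 0; i < mirrors; i++)
		rq.remaining++;      /* one count per queued mirror bio */

	write_done(&rq);             /* drop the setup reference: not done yet */

	for (i = 0; i < mirrors; i++)
		write_done(&rq);     /* each mirror completion drops one count */

	return rq.completed ? 0 : 1;
}
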
@@ -1076,8 +968,9 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
1076 } else 968 } else
1077 set_bit(Faulty, &rdev->flags); 969 set_bit(Faulty, &rdev->flags);
1078 set_bit(MD_CHANGE_DEVS, &mddev->flags); 970 set_bit(MD_CHANGE_DEVS, &mddev->flags);
1079 printk(KERN_ALERT "md/raid1:%s: Disk failure on %s, disabling device.\n" 971 printk(KERN_ALERT
1080 KERN_ALERT "md/raid1:%s: Operation continuing on %d devices.\n", 972 "md/raid1:%s: Disk failure on %s, disabling device.\n"
973 "md/raid1:%s: Operation continuing on %d devices.\n",
1081 mdname(mddev), bdevname(rdev->bdev, b), 974 mdname(mddev), bdevname(rdev->bdev, b),
1082 mdname(mddev), conf->raid_disks - mddev->degraded); 975 mdname(mddev), conf->raid_disks - mddev->degraded);
1083} 976}
@@ -1206,10 +1099,11 @@ static int raid1_remove_disk(mddev_t *mddev, int number)
1206 err = -EBUSY; 1099 err = -EBUSY;
1207 goto abort; 1100 goto abort;
1208 } 1101 }
1209 /* Only remove non-faulty devices is recovery 1102 /* Only remove non-faulty devices if recovery
1210 * is not possible. 1103 * is not possible.
1211 */ 1104 */
1212 if (!test_bit(Faulty, &rdev->flags) && 1105 if (!test_bit(Faulty, &rdev->flags) &&
1106 !mddev->recovery_disabled &&
1213 mddev->degraded < conf->raid_disks) { 1107 mddev->degraded < conf->raid_disks) {
1214 err = -EBUSY; 1108 err = -EBUSY;
1215 goto abort; 1109 goto abort;
@@ -1222,7 +1116,7 @@ static int raid1_remove_disk(mddev_t *mddev, int number)
1222 p->rdev = rdev; 1116 p->rdev = rdev;
1223 goto abort; 1117 goto abort;
1224 } 1118 }
1225 md_integrity_register(mddev); 1119 err = md_integrity_register(mddev);
1226 } 1120 }
1227abort: 1121abort:
1228 1122
@@ -1268,7 +1162,7 @@ static void end_sync_write(struct bio *bio, int error)
1268 break; 1162 break;
1269 } 1163 }
1270 if (!uptodate) { 1164 if (!uptodate) {
1271 int sync_blocks = 0; 1165 sector_t sync_blocks = 0;
1272 sector_t s = r1_bio->sector; 1166 sector_t s = r1_bio->sector;
1273 long sectors_to_go = r1_bio->sectors; 1167 long sectors_to_go = r1_bio->sectors;
1274 /* make sure these bits doesn't get cleared. */ 1168 /* make sure these bits doesn't get cleared. */
@@ -1290,194 +1184,210 @@ static void end_sync_write(struct bio *bio, int error)
1290 } 1184 }
1291} 1185}
1292 1186
1293static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio) 1187static int fix_sync_read_error(r1bio_t *r1_bio)
1294{ 1188{
1189 /* Try some synchronous reads of other devices to get
1190 * good data, much like with normal read errors. Only
1191 * read into the pages we already have so we don't
1192 * need to re-issue the read request.
1193 * We don't need to freeze the array, because being in an
1194 * active sync request, there is no normal IO, and
1195 * no overlapping syncs.
1196 */
1197 mddev_t *mddev = r1_bio->mddev;
1295 conf_t *conf = mddev->private; 1198 conf_t *conf = mddev->private;
1296 int i; 1199 struct bio *bio = r1_bio->bios[r1_bio->read_disk];
1297 int disks = conf->raid_disks; 1200 sector_t sect = r1_bio->sector;
1298 struct bio *bio, *wbio; 1201 int sectors = r1_bio->sectors;
1299 1202 int idx = 0;
1300 bio = r1_bio->bios[r1_bio->read_disk];
1301 1203
1204 while(sectors) {
1205 int s = sectors;
1206 int d = r1_bio->read_disk;
1207 int success = 0;
1208 mdk_rdev_t *rdev;
1209 int start;
1302 1210
1303 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { 1211 if (s > (PAGE_SIZE>>9))
1304 /* We have read all readable devices. If we haven't 1212 s = PAGE_SIZE >> 9;
1305 * got the block, then there is no hope left. 1213 do {
1306 * If we have, then we want to do a comparison 1214 if (r1_bio->bios[d]->bi_end_io == end_sync_read) {
1307 * and skip the write if everything is the same. 1215 /* No rcu protection needed here devices
1308 * If any blocks failed to read, then we need to 1216 * can only be removed when no resync is
1309 * attempt an over-write 1217 * active, and resync is currently active
1310 */ 1218 */
1311 int primary; 1219 rdev = conf->mirrors[d].rdev;
1312 if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) { 1220 if (sync_page_io(rdev,
1313 for (i=0; i<mddev->raid_disks; i++) 1221 sect,
1314 if (r1_bio->bios[i]->bi_end_io == end_sync_read) 1222 s<<9,
1315 md_error(mddev, conf->mirrors[i].rdev); 1223 bio->bi_io_vec[idx].bv_page,
1224 READ, false)) {
1225 success = 1;
1226 break;
1227 }
1228 }
1229 d++;
1230 if (d == conf->raid_disks)
1231 d = 0;
1232 } while (!success && d != r1_bio->read_disk);
1316 1233
1317 md_done_sync(mddev, r1_bio->sectors, 1); 1234 if (!success) {
1235 char b[BDEVNAME_SIZE];
1236 /* Cannot read from anywhere, array is toast */
1237 md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev);
1238 printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O read error"
1239 " for block %llu\n",
1240 mdname(mddev),
1241 bdevname(bio->bi_bdev, b),
1242 (unsigned long long)r1_bio->sector);
1243 md_done_sync(mddev, r1_bio->sectors, 0);
1318 put_buf(r1_bio); 1244 put_buf(r1_bio);
1319 return; 1245 return 0;
1320 } 1246 }
1321 for (primary=0; primary<mddev->raid_disks; primary++)
1322 if (r1_bio->bios[primary]->bi_end_io == end_sync_read &&
1323 test_bit(BIO_UPTODATE, &r1_bio->bios[primary]->bi_flags)) {
1324 r1_bio->bios[primary]->bi_end_io = NULL;
1325 rdev_dec_pending(conf->mirrors[primary].rdev, mddev);
1326 break;
1327 }
1328 r1_bio->read_disk = primary;
1329 for (i=0; i<mddev->raid_disks; i++)
1330 if (r1_bio->bios[i]->bi_end_io == end_sync_read) {
1331 int j;
1332 int vcnt = r1_bio->sectors >> (PAGE_SHIFT- 9);
1333 struct bio *pbio = r1_bio->bios[primary];
1334 struct bio *sbio = r1_bio->bios[i];
1335
1336 if (test_bit(BIO_UPTODATE, &sbio->bi_flags)) {
1337 for (j = vcnt; j-- ; ) {
1338 struct page *p, *s;
1339 p = pbio->bi_io_vec[j].bv_page;
1340 s = sbio->bi_io_vec[j].bv_page;
1341 if (memcmp(page_address(p),
1342 page_address(s),
1343 PAGE_SIZE))
1344 break;
1345 }
1346 } else
1347 j = 0;
1348 if (j >= 0)
1349 mddev->resync_mismatches += r1_bio->sectors;
1350 if (j < 0 || (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)
1351 && test_bit(BIO_UPTODATE, &sbio->bi_flags))) {
1352 sbio->bi_end_io = NULL;
1353 rdev_dec_pending(conf->mirrors[i].rdev, mddev);
1354 } else {
1355 /* fixup the bio for reuse */
1356 int size;
1357 sbio->bi_vcnt = vcnt;
1358 sbio->bi_size = r1_bio->sectors << 9;
1359 sbio->bi_idx = 0;
1360 sbio->bi_phys_segments = 0;
1361 sbio->bi_flags &= ~(BIO_POOL_MASK - 1);
1362 sbio->bi_flags |= 1 << BIO_UPTODATE;
1363 sbio->bi_next = NULL;
1364 sbio->bi_sector = r1_bio->sector +
1365 conf->mirrors[i].rdev->data_offset;
1366 sbio->bi_bdev = conf->mirrors[i].rdev->bdev;
1367 size = sbio->bi_size;
1368 for (j = 0; j < vcnt ; j++) {
1369 struct bio_vec *bi;
1370 bi = &sbio->bi_io_vec[j];
1371 bi->bv_offset = 0;
1372 if (size > PAGE_SIZE)
1373 bi->bv_len = PAGE_SIZE;
1374 else
1375 bi->bv_len = size;
1376 size -= PAGE_SIZE;
1377 memcpy(page_address(bi->bv_page),
1378 page_address(pbio->bi_io_vec[j].bv_page),
1379 PAGE_SIZE);
1380 }
1381 1247
1382 } 1248 start = d;
1383 } 1249 /* write it back and re-read */
1250 while (d != r1_bio->read_disk) {
1251 if (d == 0)
1252 d = conf->raid_disks;
1253 d--;
1254 if (r1_bio->bios[d]->bi_end_io != end_sync_read)
1255 continue;
1256 rdev = conf->mirrors[d].rdev;
1257 if (sync_page_io(rdev,
1258 sect,
1259 s<<9,
1260 bio->bi_io_vec[idx].bv_page,
1261 WRITE, false) == 0) {
1262 r1_bio->bios[d]->bi_end_io = NULL;
1263 rdev_dec_pending(rdev, mddev);
1264 md_error(mddev, rdev);
1265 } else
1266 atomic_add(s, &rdev->corrected_errors);
1267 }
1268 d = start;
1269 while (d != r1_bio->read_disk) {
1270 if (d == 0)
1271 d = conf->raid_disks;
1272 d--;
1273 if (r1_bio->bios[d]->bi_end_io != end_sync_read)
1274 continue;
1275 rdev = conf->mirrors[d].rdev;
1276 if (sync_page_io(rdev,
1277 sect,
1278 s<<9,
1279 bio->bi_io_vec[idx].bv_page,
1280 READ, false) == 0)
1281 md_error(mddev, rdev);
1282 }
1283 sectors -= s;
1284 sect += s;
1285 idx ++;
1384 } 1286 }
1385 if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) { 1287 set_bit(R1BIO_Uptodate, &r1_bio->state);
1386 /* ouch - failed to read all of that. 1288 set_bit(BIO_UPTODATE, &bio->bi_flags);
1387 * Try some synchronous reads of other devices to get 1289 return 1;
1388 * good data, much like with normal read errors. Only 1290}
1389 * read into the pages we already have so we don't 1291
1390 * need to re-issue the read request. 1292static int process_checks(r1bio_t *r1_bio)
1391 * We don't need to freeze the array, because being in an 1293{
1392 * active sync request, there is no normal IO, and 1294 /* We have read all readable devices. If we haven't
1393 * no overlapping syncs. 1295 * got the block, then there is no hope left.
1394 */ 1296 * If we have, then we want to do a comparison
1395 sector_t sect = r1_bio->sector; 1297 * and skip the write if everything is the same.
1396 int sectors = r1_bio->sectors; 1298 * If any blocks failed to read, then we need to
1397 int idx = 0; 1299 * attempt an over-write
1398 1300 */
1399 while(sectors) { 1301 mddev_t *mddev = r1_bio->mddev;
1400 int s = sectors; 1302 conf_t *conf = mddev->private;
1401 int d = r1_bio->read_disk; 1303 int primary;
1402 int success = 0; 1304 int i;
1403 mdk_rdev_t *rdev; 1305
1404 1306 for (primary = 0; primary < conf->raid_disks; primary++)
1405 if (s > (PAGE_SIZE>>9)) 1307 if (r1_bio->bios[primary]->bi_end_io == end_sync_read &&
1406 s = PAGE_SIZE >> 9; 1308 test_bit(BIO_UPTODATE, &r1_bio->bios[primary]->bi_flags)) {
1407 do { 1309 r1_bio->bios[primary]->bi_end_io = NULL;
1408 if (r1_bio->bios[d]->bi_end_io == end_sync_read) { 1310 rdev_dec_pending(conf->mirrors[primary].rdev, mddev);
1409 /* No rcu protection needed here devices 1311 break;
1410 * can only be removed when no resync is 1312 }
1411 * active, and resync is currently active 1313 r1_bio->read_disk = primary;
1412 */ 1314 for (i = 0; i < conf->raid_disks; i++) {
1413 rdev = conf->mirrors[d].rdev; 1315 int j;
1414 if (sync_page_io(rdev->bdev, 1316 int vcnt = r1_bio->sectors >> (PAGE_SHIFT- 9);
1415 sect + rdev->data_offset, 1317 struct bio *pbio = r1_bio->bios[primary];
1416 s<<9, 1318 struct bio *sbio = r1_bio->bios[i];
1417 bio->bi_io_vec[idx].bv_page, 1319 int size;
1418 READ)) { 1320
1419 success = 1; 1321 if (r1_bio->bios[i]->bi_end_io != end_sync_read)
1420 break; 1322 continue;
1421 } 1323
1422 } 1324 if (test_bit(BIO_UPTODATE, &sbio->bi_flags)) {
1423 d++; 1325 for (j = vcnt; j-- ; ) {
1424 if (d == conf->raid_disks) 1326 struct page *p, *s;
1425 d = 0; 1327 p = pbio->bi_io_vec[j].bv_page;
1426 } while (!success && d != r1_bio->read_disk); 1328 s = sbio->bi_io_vec[j].bv_page;
1427 1329 if (memcmp(page_address(p),
1428 if (success) { 1330 page_address(s),
1429 int start = d; 1331 PAGE_SIZE))
1430 /* write it back and re-read */ 1332 break;
1431 set_bit(R1BIO_Uptodate, &r1_bio->state);
1432 while (d != r1_bio->read_disk) {
1433 if (d == 0)
1434 d = conf->raid_disks;
1435 d--;
1436 if (r1_bio->bios[d]->bi_end_io != end_sync_read)
1437 continue;
1438 rdev = conf->mirrors[d].rdev;
1439 atomic_add(s, &rdev->corrected_errors);
1440 if (sync_page_io(rdev->bdev,
1441 sect + rdev->data_offset,
1442 s<<9,
1443 bio->bi_io_vec[idx].bv_page,
1444 WRITE) == 0)
1445 md_error(mddev, rdev);
1446 }
1447 d = start;
1448 while (d != r1_bio->read_disk) {
1449 if (d == 0)
1450 d = conf->raid_disks;
1451 d--;
1452 if (r1_bio->bios[d]->bi_end_io != end_sync_read)
1453 continue;
1454 rdev = conf->mirrors[d].rdev;
1455 if (sync_page_io(rdev->bdev,
1456 sect + rdev->data_offset,
1457 s<<9,
1458 bio->bi_io_vec[idx].bv_page,
1459 READ) == 0)
1460 md_error(mddev, rdev);
1461 }
1462 } else {
1463 char b[BDEVNAME_SIZE];
1464 /* Cannot read from anywhere, array is toast */
1465 md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev);
1466 printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O read error"
1467 " for block %llu\n",
1468 mdname(mddev),
1469 bdevname(bio->bi_bdev, b),
1470 (unsigned long long)r1_bio->sector);
1471 md_done_sync(mddev, r1_bio->sectors, 0);
1472 put_buf(r1_bio);
1473 return;
1474 } 1333 }
1475 sectors -= s; 1334 } else
1476 sect += s; 1335 j = 0;
1477 idx ++; 1336 if (j >= 0)
1337 mddev->resync_mismatches += r1_bio->sectors;
1338 if (j < 0 || (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)
1339 && test_bit(BIO_UPTODATE, &sbio->bi_flags))) {
1340 /* No need to write to this device. */
1341 sbio->bi_end_io = NULL;
1342 rdev_dec_pending(conf->mirrors[i].rdev, mddev);
1343 continue;
1344 }
1345 /* fixup the bio for reuse */
1346 sbio->bi_vcnt = vcnt;
1347 sbio->bi_size = r1_bio->sectors << 9;
1348 sbio->bi_idx = 0;
1349 sbio->bi_phys_segments = 0;
1350 sbio->bi_flags &= ~(BIO_POOL_MASK - 1);
1351 sbio->bi_flags |= 1 << BIO_UPTODATE;
1352 sbio->bi_next = NULL;
1353 sbio->bi_sector = r1_bio->sector +
1354 conf->mirrors[i].rdev->data_offset;
1355 sbio->bi_bdev = conf->mirrors[i].rdev->bdev;
1356 size = sbio->bi_size;
1357 for (j = 0; j < vcnt ; j++) {
1358 struct bio_vec *bi;
1359 bi = &sbio->bi_io_vec[j];
1360 bi->bv_offset = 0;
1361 if (size > PAGE_SIZE)
1362 bi->bv_len = PAGE_SIZE;
1363 else
1364 bi->bv_len = size;
1365 size -= PAGE_SIZE;
1366 memcpy(page_address(bi->bv_page),
1367 page_address(pbio->bi_io_vec[j].bv_page),
1368 PAGE_SIZE);
1478 } 1369 }
1479 } 1370 }
1371 return 0;
1372}
1373
1374static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
1375{
1376 conf_t *conf = mddev->private;
1377 int i;
1378 int disks = conf->raid_disks;
1379 struct bio *bio, *wbio;
1380
1381 bio = r1_bio->bios[r1_bio->read_disk];
1480 1382
1383 if (!test_bit(R1BIO_Uptodate, &r1_bio->state))
1384 /* ouch - failed to read all of that. */
1385 if (!fix_sync_read_error(r1_bio))
1386 return;
1387
1388 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
1389 if (process_checks(r1_bio) < 0)
1390 return;
1481 /* 1391 /*
1482 * schedule writes 1392 * schedule writes
1483 */ 1393 */
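
fix_sync_read_error(), factored out above, walks round-robin through the other mirrors until one synchronous read of the bad block succeeds, and only gives up (failing the array member) when every copy is unreadable. Below is a stripped-down model of that retry loop; try_read() is a placeholder for the real sync_page_io() call and the disk count is made up.

#include <stdio.h>

#define NDISKS 3

/* Placeholder for a synchronous per-device read; returns 1 on success. */
static int try_read(int disk, long long sector)
{
	(void)sector;           /* a real read would use the sector */
	return disk == 2;       /* pretend only disk 2 still has good data */
}

/* Start at the disk that failed and walk through the others until a
 * read succeeds; -1 means no copy of the block is readable at all. */
static int find_good_copy(int failed_disk, long long sector)
{
	int d = failed_disk;

	do {
		if (try_read(d, sector))
			return d;
		d = (d + 1) % NDISKS;
	} while (d != failed_disk);

	return -1;
}

int main(void)
{
	int src = find_good_copy(0, 4096);

	if (src < 0)
		printf("unrecoverable read error\n");
	else
		printf("good copy on disk %d; rewrite the other mirrors from it\n", src);
	return 0;
}
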
@@ -1536,10 +1446,8 @@ static void fix_read_error(conf_t *conf, int read_disk,
1536 rdev = conf->mirrors[d].rdev; 1446 rdev = conf->mirrors[d].rdev;
1537 if (rdev && 1447 if (rdev &&
1538 test_bit(In_sync, &rdev->flags) && 1448 test_bit(In_sync, &rdev->flags) &&
1539 sync_page_io(rdev->bdev, 1449 sync_page_io(rdev, sect, s<<9,
1540 sect + rdev->data_offset, 1450 conf->tmppage, READ, false))
1541 s<<9,
1542 conf->tmppage, READ))
1543 success = 1; 1451 success = 1;
1544 else { 1452 else {
1545 d++; 1453 d++;
@@ -1562,9 +1470,8 @@ static void fix_read_error(conf_t *conf, int read_disk,
1562 rdev = conf->mirrors[d].rdev; 1470 rdev = conf->mirrors[d].rdev;
1563 if (rdev && 1471 if (rdev &&
1564 test_bit(In_sync, &rdev->flags)) { 1472 test_bit(In_sync, &rdev->flags)) {
1565 if (sync_page_io(rdev->bdev, 1473 if (sync_page_io(rdev, sect, s<<9,
1566 sect + rdev->data_offset, 1474 conf->tmppage, WRITE, false)
1567 s<<9, conf->tmppage, WRITE)
1568 == 0) 1475 == 0)
1569 /* Well, this device is dead */ 1476 /* Well, this device is dead */
1570 md_error(mddev, rdev); 1477 md_error(mddev, rdev);
@@ -1579,9 +1486,8 @@ static void fix_read_error(conf_t *conf, int read_disk,
1579 rdev = conf->mirrors[d].rdev; 1486 rdev = conf->mirrors[d].rdev;
1580 if (rdev && 1487 if (rdev &&
1581 test_bit(In_sync, &rdev->flags)) { 1488 test_bit(In_sync, &rdev->flags)) {
1582 if (sync_page_io(rdev->bdev, 1489 if (sync_page_io(rdev, sect, s<<9,
1583 sect + rdev->data_offset, 1490 conf->tmppage, READ, false)
1584 s<<9, conf->tmppage, READ)
1585 == 0) 1491 == 0)
1586 /* Well, this device is dead */ 1492 /* Well, this device is dead */
1587 md_error(mddev, rdev); 1493 md_error(mddev, rdev);
@@ -1609,15 +1515,17 @@ static void raid1d(mddev_t *mddev)
1609 unsigned long flags; 1515 unsigned long flags;
1610 conf_t *conf = mddev->private; 1516 conf_t *conf = mddev->private;
1611 struct list_head *head = &conf->retry_list; 1517 struct list_head *head = &conf->retry_list;
1612 int unplug=0;
1613 mdk_rdev_t *rdev; 1518 mdk_rdev_t *rdev;
1519 struct blk_plug plug;
1614 1520
1615 md_check_recovery(mddev); 1521 md_check_recovery(mddev);
1616 1522
1523 blk_start_plug(&plug);
1617 for (;;) { 1524 for (;;) {
1618 char b[BDEVNAME_SIZE]; 1525 char b[BDEVNAME_SIZE];
1619 1526
1620 unplug += flush_pending_writes(conf); 1527 if (atomic_read(&mddev->plug_cnt) == 0)
1528 flush_pending_writes(conf);
1621 1529
1622 spin_lock_irqsave(&conf->device_lock, flags); 1530 spin_lock_irqsave(&conf->device_lock, flags);
1623 if (list_empty(head)) { 1531 if (list_empty(head)) {
@@ -1631,45 +1539,9 @@ static void raid1d(mddev_t *mddev)
1631 1539
1632 mddev = r1_bio->mddev; 1540 mddev = r1_bio->mddev;
1633 conf = mddev->private; 1541 conf = mddev->private;
1634 if (test_bit(R1BIO_IsSync, &r1_bio->state)) { 1542 if (test_bit(R1BIO_IsSync, &r1_bio->state))
1635 sync_request_write(mddev, r1_bio); 1543 sync_request_write(mddev, r1_bio);
1636 unplug = 1; 1544 else {
1637 } else if (test_bit(R1BIO_BarrierRetry, &r1_bio->state)) {
1638 /* some requests in the r1bio were REQ_HARDBARRIER
1639 * requests which failed with -EOPNOTSUPP. Hohumm..
1640 * Better resubmit without the barrier.
1641 * We know which devices to resubmit for, because
1642 * all others have had their bios[] entry cleared.
1643 * We already have a nr_pending reference on these rdevs.
1644 */
1645 int i;
1646 const unsigned long do_sync = (r1_bio->master_bio->bi_rw & REQ_SYNC);
1647 clear_bit(R1BIO_BarrierRetry, &r1_bio->state);
1648 clear_bit(R1BIO_Barrier, &r1_bio->state);
1649 for (i=0; i < conf->raid_disks; i++)
1650 if (r1_bio->bios[i])
1651 atomic_inc(&r1_bio->remaining);
1652 for (i=0; i < conf->raid_disks; i++)
1653 if (r1_bio->bios[i]) {
1654 struct bio_vec *bvec;
1655 int j;
1656
1657 bio = bio_clone(r1_bio->master_bio, GFP_NOIO);
1658 /* copy pages from the failed bio, as
1659 * this might be a write-behind device */
1660 __bio_for_each_segment(bvec, bio, j, 0)
1661 bvec->bv_page = bio_iovec_idx(r1_bio->bios[i], j)->bv_page;
1662 bio_put(r1_bio->bios[i]);
1663 bio->bi_sector = r1_bio->sector +
1664 conf->mirrors[i].rdev->data_offset;
1665 bio->bi_bdev = conf->mirrors[i].rdev->bdev;
1666 bio->bi_end_io = raid1_end_write_request;
1667 bio->bi_rw = WRITE | do_sync;
1668 bio->bi_private = r1_bio;
1669 r1_bio->bios[i] = bio;
1670 generic_make_request(bio);
1671 }
1672 } else {
1673 int disk; 1545 int disk;
1674 1546
1675 /* we got a read error. Maybe the drive is bad. Maybe just 1547 /* we got a read error. Maybe the drive is bad. Maybe just
@@ -1704,7 +1576,8 @@ static void raid1d(mddev_t *mddev)
1704 mddev->ro ? IO_BLOCKED : NULL; 1576 mddev->ro ? IO_BLOCKED : NULL;
1705 r1_bio->read_disk = disk; 1577 r1_bio->read_disk = disk;
1706 bio_put(bio); 1578 bio_put(bio);
1707 bio = bio_clone(r1_bio->master_bio, GFP_NOIO); 1579 bio = bio_clone_mddev(r1_bio->master_bio,
1580 GFP_NOIO, mddev);
1708 r1_bio->bios[r1_bio->read_disk] = bio; 1581 r1_bio->bios[r1_bio->read_disk] = bio;
1709 rdev = conf->mirrors[disk].rdev; 1582 rdev = conf->mirrors[disk].rdev;
1710 if (printk_ratelimit()) 1583 if (printk_ratelimit())
@@ -1718,14 +1591,12 @@ static void raid1d(mddev_t *mddev)
1718 bio->bi_end_io = raid1_end_read_request; 1591 bio->bi_end_io = raid1_end_read_request;
1719 bio->bi_rw = READ | do_sync; 1592 bio->bi_rw = READ | do_sync;
1720 bio->bi_private = r1_bio; 1593 bio->bi_private = r1_bio;
1721 unplug = 1;
1722 generic_make_request(bio); 1594 generic_make_request(bio);
1723 } 1595 }
1724 } 1596 }
1725 cond_resched(); 1597 cond_resched();
1726 } 1598 }
1727 if (unplug) 1599 blk_finish_plug(&plug);
1728 unplug_slaves(mddev);
1729} 1600}
1730 1601
1731 1602
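
raid1d now wraps its whole retry loop in blk_start_plug()/blk_finish_plug() instead of tracking a hand-rolled unplug flag, so deferred submissions are released in one batch when the loop ends. The sketch below models only the batch-then-flush idea in userspace; it is a conceptual stand-in, not the block layer plugging API.

#include <stdio.h>

#define MAX_BATCH 16

/* Conceptual "plug": submissions accumulate here and are issued together. */
struct plug {
	int pending[MAX_BATCH];
	int count;
};

static void plug_submit(struct plug *p, int request_id)
{
	if (p->count < MAX_BATCH)
		p->pending[p->count++] = request_id;  /* defer instead of issuing now */
}

static void plug_flush(struct plug *p)
{
	int i;

	for (i = 0; i < p->count; i++)
		printf("issuing deferred request %d\n", p->pending[i]);
	p->count = 0;
}

int main(void)
{
	struct plug p = { .count = 0 };
	int i;

	/* work loop: queue several retries, then issue them in one batch */
	for (i = 0; i < 4; i++)
		plug_submit(&p, i);
	plug_flush(&p);
	return 0;
}
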
@@ -1763,7 +1634,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1763 int i; 1634 int i;
1764 int wonly = -1; 1635 int wonly = -1;
1765 int write_targets = 0, read_targets = 0; 1636 int write_targets = 0, read_targets = 0;
1766 int sync_blocks; 1637 sector_t sync_blocks;
1767 int still_degraded = 0; 1638 int still_degraded = 0;
1768 1639
1769 if (!conf->r1buf_pool) 1640 if (!conf->r1buf_pool)
@@ -1813,11 +1684,11 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1813 msleep_interruptible(1000); 1684 msleep_interruptible(1000);
1814 1685
1815 bitmap_cond_end_sync(mddev->bitmap, sector_nr); 1686 bitmap_cond_end_sync(mddev->bitmap, sector_nr);
1687 r1_bio = mempool_alloc(conf->r1buf_pool, GFP_NOIO);
1816 raise_barrier(conf); 1688 raise_barrier(conf);
1817 1689
1818 conf->next_resync = sector_nr; 1690 conf->next_resync = sector_nr;
1819 1691
1820 r1_bio = mempool_alloc(conf->r1buf_pool, GFP_NOIO);
1821 rcu_read_lock(); 1692 rcu_read_lock();
1822 /* 1693 /*
1823 * If we get a correctably read error during resync or recovery, 1694 * If we get a correctably read error during resync or recovery,
@@ -2029,7 +1900,6 @@ static conf_t *setup_conf(mddev_t *mddev)
2029 init_waitqueue_head(&conf->wait_barrier); 1900 init_waitqueue_head(&conf->wait_barrier);
2030 1901
2031 bio_list_init(&conf->pending_bio_list); 1902 bio_list_init(&conf->pending_bio_list);
2032 bio_list_init(&conf->flushing_bio_list);
2033 1903
2034 conf->last_used = -1; 1904 conf->last_used = -1;
2035 for (i = 0; i < conf->raid_disks; i++) { 1905 for (i = 0; i < conf->raid_disks; i++) {
@@ -2107,8 +1977,9 @@ static int run(mddev_t *mddev)
2107 if (IS_ERR(conf)) 1977 if (IS_ERR(conf))
2108 return PTR_ERR(conf); 1978 return PTR_ERR(conf);
2109 1979
2110 mddev->queue->queue_lock = &conf->device_lock;
2111 list_for_each_entry(rdev, &mddev->disks, same_set) { 1980 list_for_each_entry(rdev, &mddev->disks, same_set) {
1981 if (!mddev->gendisk)
1982 continue;
2112 disk_stack_limits(mddev->gendisk, rdev->bdev, 1983 disk_stack_limits(mddev->gendisk, rdev->bdev,
2113 rdev->data_offset << 9); 1984 rdev->data_offset << 9);
2114 /* as we don't honour merge_bvec_fn, we must never risk 1985 /* as we don't honour merge_bvec_fn, we must never risk
@@ -2150,11 +2021,11 @@ static int run(mddev_t *mddev)
2150 2021
2151 md_set_array_sectors(mddev, raid1_size(mddev, 0, 0)); 2022 md_set_array_sectors(mddev, raid1_size(mddev, 0, 0));
2152 2023
2153 mddev->queue->unplug_fn = raid1_unplug; 2024 if (mddev->queue) {
2154 mddev->queue->backing_dev_info.congested_fn = raid1_congested; 2025 mddev->queue->backing_dev_info.congested_fn = raid1_congested;
2155 mddev->queue->backing_dev_info.congested_data = mddev; 2026 mddev->queue->backing_dev_info.congested_data = mddev;
2156 md_integrity_register(mddev); 2027 }
2157 return 0; 2028 return md_integrity_register(mddev);
2158} 2029}
2159 2030
2160static int stop(mddev_t *mddev) 2031static int stop(mddev_t *mddev)
@@ -2176,7 +2047,6 @@ static int stop(mddev_t *mddev)
2176 2047
2177 md_unregister_thread(mddev->thread); 2048 md_unregister_thread(mddev->thread);
2178 mddev->thread = NULL; 2049 mddev->thread = NULL;
2179 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
2180 if (conf->r1bio_pool) 2050 if (conf->r1bio_pool)
2181 mempool_destroy(conf->r1bio_pool); 2051 mempool_destroy(conf->r1bio_pool);
2182 kfree(conf->mirrors); 2052 kfree(conf->mirrors);
@@ -2201,7 +2071,7 @@ static int raid1_resize(mddev_t *mddev, sector_t sectors)
2201 set_capacity(mddev->gendisk, mddev->array_sectors); 2071 set_capacity(mddev->gendisk, mddev->array_sectors);
2202 revalidate_disk(mddev->gendisk); 2072 revalidate_disk(mddev->gendisk);
2203 if (sectors > mddev->dev_sectors && 2073 if (sectors > mddev->dev_sectors &&
2204 mddev->recovery_cp == MaxSector) { 2074 mddev->recovery_cp > mddev->dev_sectors) {
2205 mddev->recovery_cp = mddev->dev_sectors; 2075 mddev->recovery_cp = mddev->dev_sectors;
2206 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 2076 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
2207 } 2077 }
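
The raid1_resize() change above clamps recovery_cp whenever it lies beyond the old per-device size, not only when the array was completely clean, so the newly exposed sectors are always resynced after a grow. A small arithmetic model of that decision, with made-up sizes and a stand-in for MaxSector:

#include <stdio.h>

#define MAX_SECTOR (~0ULL)   /* stand-in for MaxSector: "fully in sync" */

/* After growing the per-device size, decide where resync must restart. */
static unsigned long long resize_recovery_cp(unsigned long long old_size,
					     unsigned long long new_size,
					     unsigned long long recovery_cp)
{
	if (new_size > old_size && recovery_cp > old_size)
		return old_size;  /* resync everything beyond the old end */
	return recovery_cp;
}

int main(void)
{
	/* grow a clean array from 1 GiB to 2 GiB per device (512 B sectors) */
	unsigned long long cp = resize_recovery_cp(2097152, 4194304, MAX_SECTOR);

	printf("resync restarts at sector %llu\n", cp);
	return 0;
}
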
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
index 5f2d443ae28a..e743a64fac4f 100644
--- a/drivers/md/raid1.h
+++ b/drivers/md/raid1.h
@@ -35,8 +35,6 @@ struct r1_private_data_s {
35 struct list_head retry_list; 35 struct list_head retry_list;
36 /* queue pending writes and submit them on unplug */ 36 /* queue pending writes and submit them on unplug */
37 struct bio_list pending_bio_list; 37 struct bio_list pending_bio_list;
38 /* queue of writes that have been unplugged */
39 struct bio_list flushing_bio_list;
40 38
41 /* for use when syncing mirrors: */ 39 /* for use when syncing mirrors: */
42 40
@@ -96,7 +94,9 @@ struct r1bio_s {
96 int read_disk; 94 int read_disk;
97 95
98 struct list_head retry_list; 96 struct list_head retry_list;
99 struct bitmap_update *bitmap_update; 97 /* Next two are only valid when R1BIO_BehindIO is set */
98 struct page **behind_pages;
99 int behind_page_count;
100 /* 100 /*
101 * if the IO is in WRITE direction, then multiple bios are used. 101 * if the IO is in WRITE direction, then multiple bios are used.
102 * We choose the number when they are allocated. 102 * We choose the number when they are allocated.
@@ -117,8 +117,6 @@ struct r1bio_s {
117#define R1BIO_IsSync 1 117#define R1BIO_IsSync 1
118#define R1BIO_Degraded 2 118#define R1BIO_Degraded 2
119#define R1BIO_BehindIO 3 119#define R1BIO_BehindIO 3
120#define R1BIO_Barrier 4
121#define R1BIO_BarrierRetry 5
122/* For write-behind requests, we call bi_end_io when 120/* For write-behind requests, we call bi_end_io when
123 * the last non-write-behind device completes, providing 121 * the last non-write-behind device completes, providing
124 * any write was successful. Otherwise we call when 122 * any write was successful. Otherwise we call when
@@ -128,4 +126,6 @@ struct r1bio_s {
128 */ 126 */
129#define R1BIO_Returned 6 127#define R1BIO_Returned 6
130 128
129extern int md_raid1_congested(mddev_t *mddev, int bits);
130
131#endif 131#endif
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 84718383124d..6e846688962f 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -5,7 +5,7 @@
5 * 5 *
6 * RAID-10 support for md. 6 * RAID-10 support for md.
7 * 7 *
8 * Base on code in raid1.c. See raid1.c for futher copyright information. 8 * Base on code in raid1.c. See raid1.c for further copyright information.
9 * 9 *
10 * 10 *
11 * This program is free software; you can redistribute it and/or modify 11 * This program is free software; you can redistribute it and/or modify
@@ -57,23 +57,16 @@
57 */ 57 */
58#define NR_RAID10_BIOS 256 58#define NR_RAID10_BIOS 256
59 59
60static void unplug_slaves(mddev_t *mddev);
61
62static void allow_barrier(conf_t *conf); 60static void allow_barrier(conf_t *conf);
63static void lower_barrier(conf_t *conf); 61static void lower_barrier(conf_t *conf);
64 62
65static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data) 63static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
66{ 64{
67 conf_t *conf = data; 65 conf_t *conf = data;
68 r10bio_t *r10_bio;
69 int size = offsetof(struct r10bio_s, devs[conf->copies]); 66 int size = offsetof(struct r10bio_s, devs[conf->copies]);
70 67
71 /* allocate a r10bio with room for raid_disks entries in the bios array */ 68 /* allocate a r10bio with room for raid_disks entries in the bios array */
72 r10_bio = kzalloc(size, gfp_flags); 69 return kzalloc(size, gfp_flags);
73 if (!r10_bio && conf->mddev)
74 unplug_slaves(conf->mddev);
75
76 return r10_bio;
77} 70}
78 71
79static void r10bio_pool_free(void *r10_bio, void *data) 72static void r10bio_pool_free(void *r10_bio, void *data)
@@ -106,10 +99,8 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
106 int nalloc; 99 int nalloc;
107 100
108 r10_bio = r10bio_pool_alloc(gfp_flags, conf); 101 r10_bio = r10bio_pool_alloc(gfp_flags, conf);
109 if (!r10_bio) { 102 if (!r10_bio)
110 unplug_slaves(conf->mddev);
111 return NULL; 103 return NULL;
112 }
113 104
114 if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery)) 105 if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery))
115 nalloc = conf->copies; /* resync */ 106 nalloc = conf->copies; /* resync */
@@ -120,7 +111,7 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
120 * Allocate bios. 111 * Allocate bios.
121 */ 112 */
122 for (j = nalloc ; j-- ; ) { 113 for (j = nalloc ; j-- ; ) {
123 bio = bio_alloc(gfp_flags, RESYNC_PAGES); 114 bio = bio_kmalloc(gfp_flags, RESYNC_PAGES);
124 if (!bio) 115 if (!bio)
125 goto out_free_bio; 116 goto out_free_bio;
126 r10_bio->devs[j].bio = bio; 117 r10_bio->devs[j].bio = bio;
@@ -280,9 +271,10 @@ static void raid10_end_read_request(struct bio *bio, int error)
280 */ 271 */
281 set_bit(R10BIO_Uptodate, &r10_bio->state); 272 set_bit(R10BIO_Uptodate, &r10_bio->state);
282 raid_end_bio_io(r10_bio); 273 raid_end_bio_io(r10_bio);
274 rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev);
283 } else { 275 } else {
284 /* 276 /*
285 * oops, read error: 277 * oops, read error - keep the refcount on the rdev
286 */ 278 */
287 char b[BDEVNAME_SIZE]; 279 char b[BDEVNAME_SIZE];
288 if (printk_ratelimit()) 280 if (printk_ratelimit())
@@ -291,8 +283,6 @@ static void raid10_end_read_request(struct bio *bio, int error)
291 bdevname(conf->mirrors[dev].rdev->bdev,b), (unsigned long long)r10_bio->sector); 283 bdevname(conf->mirrors[dev].rdev->bdev,b), (unsigned long long)r10_bio->sector);
292 reschedule_retry(r10_bio); 284 reschedule_retry(r10_bio);
293 } 285 }
294
295 rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev);
296} 286}
297 287
298static void raid10_end_write_request(struct bio *bio, int error) 288static void raid10_end_write_request(struct bio *bio, int error)
@@ -349,14 +339,14 @@ static void raid10_end_write_request(struct bio *bio, int error)
349 339
350/* 340/*
351 * RAID10 layout manager 341 * RAID10 layout manager
352 * Aswell as the chunksize and raid_disks count, there are two 342 * As well as the chunksize and raid_disks count, there are two
353 * parameters: near_copies and far_copies. 343 * parameters: near_copies and far_copies.
354 * near_copies * far_copies must be <= raid_disks. 344 * near_copies * far_copies must be <= raid_disks.
355 * Normally one of these will be 1. 345 * Normally one of these will be 1.
356 * If both are 1, we get raid0. 346 * If both are 1, we get raid0.
357 * If near_copies == raid_disks, we get raid1. 347 * If near_copies == raid_disks, we get raid1.
358 * 348 *
359 * Chunks are layed out in raid0 style with near_copies copies of the 349 * Chunks are laid out in raid0 style with near_copies copies of the
360 * first chunk, followed by near_copies copies of the next chunk and 350 * first chunk, followed by near_copies copies of the next chunk and
361 * so on. 351 * so on.
362 * If far_copies > 1, then after 1/far_copies of the array has been assigned 352 * If far_copies > 1, then after 1/far_copies of the array has been assigned
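
The geometry described in the layout comment above (truncated here by the hunk) reduces to simple modular arithmetic in the common 'near' case. A toy model, not the driver's raid10_find_phys(), assuming far_copies == 1 and raid_disks a multiple of near_copies:

/* Toy model of the 'near' layout: logical chunk 'chunk' is stored
 * near_copies times on consecutive devices, raid0-style. */
static void near_layout_copy(int chunk, int copy, int raid_disks,
                             int near_copies, int *dev, int *dev_chunk)
{
        int n = chunk * near_copies + copy;     /* n-th physical chunk overall */

        *dev = n % raid_disks;                  /* which member holds this copy */
        *dev_chunk = n / raid_disks;            /* chunk offset on that member */
}

For near_copies == 2 on four disks this places chunk 0 on disks 0 and 1, chunk 1 on disks 2 and 3, chunk 2 back on disks 0 and 1, and so on.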
@@ -497,13 +487,19 @@ static int raid10_mergeable_bvec(struct request_queue *q,
497static int read_balance(conf_t *conf, r10bio_t *r10_bio) 487static int read_balance(conf_t *conf, r10bio_t *r10_bio)
498{ 488{
499 const sector_t this_sector = r10_bio->sector; 489 const sector_t this_sector = r10_bio->sector;
500 int disk, slot, nslot; 490 int disk, slot;
501 const int sectors = r10_bio->sectors; 491 const int sectors = r10_bio->sectors;
502 sector_t new_distance, current_distance; 492 sector_t new_distance, best_dist;
503 mdk_rdev_t *rdev; 493 mdk_rdev_t *rdev;
494 int do_balance;
495 int best_slot;
504 496
505 raid10_find_phys(conf, r10_bio); 497 raid10_find_phys(conf, r10_bio);
506 rcu_read_lock(); 498 rcu_read_lock();
499retry:
500 best_slot = -1;
501 best_dist = MaxSector;
502 do_balance = 1;
507 /* 503 /*
508 * Check if we can balance. We can balance on the whole 504 * Check if we can balance. We can balance on the whole
509 * device if no resync is going on (recovery is ok), or below 505 * device if no resync is going on (recovery is ok), or below
@@ -511,123 +507,64 @@ static int read_balance(conf_t *conf, r10bio_t *r10_bio)
511 * above the resync window. 507 * above the resync window.
512 */ 508 */
513 if (conf->mddev->recovery_cp < MaxSector 509 if (conf->mddev->recovery_cp < MaxSector
514 && (this_sector + sectors >= conf->next_resync)) { 510 && (this_sector + sectors >= conf->next_resync))
515 /* make sure that disk is operational */ 511 do_balance = 0;
516 slot = 0;
517 disk = r10_bio->devs[slot].devnum;
518 512
519 while ((rdev = rcu_dereference(conf->mirrors[disk].rdev)) == NULL || 513 for (slot = 0; slot < conf->copies ; slot++) {
520 r10_bio->devs[slot].bio == IO_BLOCKED || 514 if (r10_bio->devs[slot].bio == IO_BLOCKED)
521 !test_bit(In_sync, &rdev->flags)) { 515 continue;
522 slot++;
523 if (slot == conf->copies) {
524 slot = 0;
525 disk = -1;
526 break;
527 }
528 disk = r10_bio->devs[slot].devnum;
529 }
530 goto rb_out;
531 }
532
533
534 /* make sure the disk is operational */
535 slot = 0;
536 disk = r10_bio->devs[slot].devnum;
537 while ((rdev=rcu_dereference(conf->mirrors[disk].rdev)) == NULL ||
538 r10_bio->devs[slot].bio == IO_BLOCKED ||
539 !test_bit(In_sync, &rdev->flags)) {
540 slot ++;
541 if (slot == conf->copies) {
542 disk = -1;
543 goto rb_out;
544 }
545 disk = r10_bio->devs[slot].devnum; 516 disk = r10_bio->devs[slot].devnum;
546 } 517 rdev = rcu_dereference(conf->mirrors[disk].rdev);
547 518 if (rdev == NULL)
548 519 continue;
549 current_distance = abs(r10_bio->devs[slot].addr - 520 if (!test_bit(In_sync, &rdev->flags))
550 conf->mirrors[disk].head_position);
551
552 /* Find the disk whose head is closest,
553 * or - for far > 1 - find the closest to partition beginning */
554
555 for (nslot = slot; nslot < conf->copies; nslot++) {
556 int ndisk = r10_bio->devs[nslot].devnum;
557
558
559 if ((rdev=rcu_dereference(conf->mirrors[ndisk].rdev)) == NULL ||
560 r10_bio->devs[nslot].bio == IO_BLOCKED ||
561 !test_bit(In_sync, &rdev->flags))
562 continue; 521 continue;
563 522
523 if (!do_balance)
524 break;
525
564 /* This optimisation is debatable, and completely destroys 526 /* This optimisation is debatable, and completely destroys
565 * sequential read speed for 'far copies' arrays. So only 527 * sequential read speed for 'far copies' arrays. So only
566 * keep it for 'near' arrays, and review those later. 528 * keep it for 'near' arrays, and review those later.
567 */ 529 */
568 if (conf->near_copies > 1 && !atomic_read(&rdev->nr_pending)) { 530 if (conf->near_copies > 1 && !atomic_read(&rdev->nr_pending))
569 disk = ndisk;
570 slot = nslot;
571 break; 531 break;
572 }
573 532
574 /* for far > 1 always use the lowest address */ 533 /* for far > 1 always use the lowest address */
575 if (conf->far_copies > 1) 534 if (conf->far_copies > 1)
576 new_distance = r10_bio->devs[nslot].addr; 535 new_distance = r10_bio->devs[slot].addr;
577 else 536 else
578 new_distance = abs(r10_bio->devs[nslot].addr - 537 new_distance = abs(r10_bio->devs[slot].addr -
579 conf->mirrors[ndisk].head_position); 538 conf->mirrors[disk].head_position);
580 if (new_distance < current_distance) { 539 if (new_distance < best_dist) {
581 current_distance = new_distance; 540 best_dist = new_distance;
582 disk = ndisk; 541 best_slot = slot;
583 slot = nslot;
584 } 542 }
585 } 543 }
544 if (slot == conf->copies)
545 slot = best_slot;
586 546
587rb_out: 547 if (slot >= 0) {
588 r10_bio->read_slot = slot; 548 disk = r10_bio->devs[slot].devnum;
589/* conf->next_seq_sect = this_sector + sectors;*/ 549 rdev = rcu_dereference(conf->mirrors[disk].rdev);
590 550 if (!rdev)
591 if (disk >= 0 && (rdev=rcu_dereference(conf->mirrors[disk].rdev))!= NULL) 551 goto retry;
592 atomic_inc(&conf->mirrors[disk].rdev->nr_pending); 552 atomic_inc(&rdev->nr_pending);
593 else 553 if (test_bit(Faulty, &rdev->flags)) {
554 /* Cannot risk returning a device that failed
555 * before we inc'ed nr_pending
556 */
557 rdev_dec_pending(rdev, conf->mddev);
558 goto retry;
559 }
560 r10_bio->read_slot = slot;
561 } else
594 disk = -1; 562 disk = -1;
595 rcu_read_unlock(); 563 rcu_read_unlock();
596 564
597 return disk; 565 return disk;
598} 566}
599 567
600static void unplug_slaves(mddev_t *mddev)
601{
602 conf_t *conf = mddev->private;
603 int i;
604
605 rcu_read_lock();
606 for (i=0; i < conf->raid_disks; i++) {
607 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
608 if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) {
609 struct request_queue *r_queue = bdev_get_queue(rdev->bdev);
610
611 atomic_inc(&rdev->nr_pending);
612 rcu_read_unlock();
613
614 blk_unplug(r_queue);
615
616 rdev_dec_pending(rdev, mddev);
617 rcu_read_lock();
618 }
619 }
620 rcu_read_unlock();
621}
622
623static void raid10_unplug(struct request_queue *q)
624{
625 mddev_t *mddev = q->queuedata;
626
627 unplug_slaves(q->queuedata);
628 md_wakeup_thread(mddev->thread);
629}
630
631static int raid10_congested(void *data, int bits) 568static int raid10_congested(void *data, int bits)
632{ 569{
633 mddev_t *mddev = data; 570 mddev_t *mddev = data;
@@ -649,20 +586,16 @@ static int raid10_congested(void *data, int bits)
649 return ret; 586 return ret;
650} 587}
651 588
652static int flush_pending_writes(conf_t *conf) 589static void flush_pending_writes(conf_t *conf)
653{ 590{
654 /* Any writes that have been queued but are awaiting 591 /* Any writes that have been queued but are awaiting
655 * bitmap updates get flushed here. 592 * bitmap updates get flushed here.
656 * We return 1 if any requests were actually submitted.
657 */ 593 */
658 int rv = 0;
659
660 spin_lock_irq(&conf->device_lock); 594 spin_lock_irq(&conf->device_lock);
661 595
662 if (conf->pending_bio_list.head) { 596 if (conf->pending_bio_list.head) {
663 struct bio *bio; 597 struct bio *bio;
664 bio = bio_list_get(&conf->pending_bio_list); 598 bio = bio_list_get(&conf->pending_bio_list);
665 blk_remove_plug(conf->mddev->queue);
666 spin_unlock_irq(&conf->device_lock); 599 spin_unlock_irq(&conf->device_lock);
667 /* flush any pending bitmap writes to disk 600 /* flush any pending bitmap writes to disk
668 * before proceeding w/ I/O */ 601 * before proceeding w/ I/O */
@@ -674,11 +607,10 @@ static int flush_pending_writes(conf_t *conf)
674 generic_make_request(bio); 607 generic_make_request(bio);
675 bio = next; 608 bio = next;
676 } 609 }
677 rv = 1;
678 } else 610 } else
679 spin_unlock_irq(&conf->device_lock); 611 spin_unlock_irq(&conf->device_lock);
680 return rv;
681} 612}
613
682/* Barriers.... 614/* Barriers....
683 * Sometimes we need to suspend IO while we do something else, 615 * Sometimes we need to suspend IO while we do something else,
684 * either some resync/recovery, or reconfigure the array. 616 * either some resync/recovery, or reconfigure the array.
@@ -708,17 +640,15 @@ static void raise_barrier(conf_t *conf, int force)
708 640
709 /* Wait until no block IO is waiting (unless 'force') */ 641 /* Wait until no block IO is waiting (unless 'force') */
710 wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting, 642 wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting,
711 conf->resync_lock, 643 conf->resync_lock, );
712 raid10_unplug(conf->mddev->queue));
713 644
714 /* block any new IO from starting */ 645 /* block any new IO from starting */
715 conf->barrier++; 646 conf->barrier++;
716 647
717 /* No wait for all pending IO to complete */ 648 /* Now wait for all pending IO to complete */
718 wait_event_lock_irq(conf->wait_barrier, 649 wait_event_lock_irq(conf->wait_barrier,
719 !conf->nr_pending && conf->barrier < RESYNC_DEPTH, 650 !conf->nr_pending && conf->barrier < RESYNC_DEPTH,
720 conf->resync_lock, 651 conf->resync_lock, );
721 raid10_unplug(conf->mddev->queue));
722 652
723 spin_unlock_irq(&conf->resync_lock); 653 spin_unlock_irq(&conf->resync_lock);
724} 654}
@@ -739,7 +669,7 @@ static void wait_barrier(conf_t *conf)
739 conf->nr_waiting++; 669 conf->nr_waiting++;
740 wait_event_lock_irq(conf->wait_barrier, !conf->barrier, 670 wait_event_lock_irq(conf->wait_barrier, !conf->barrier,
741 conf->resync_lock, 671 conf->resync_lock,
742 raid10_unplug(conf->mddev->queue)); 672 );
743 conf->nr_waiting--; 673 conf->nr_waiting--;
744 } 674 }
745 conf->nr_pending++; 675 conf->nr_pending++;
@@ -775,8 +705,8 @@ static void freeze_array(conf_t *conf)
775 wait_event_lock_irq(conf->wait_barrier, 705 wait_event_lock_irq(conf->wait_barrier,
776 conf->nr_pending == conf->nr_queued+1, 706 conf->nr_pending == conf->nr_queued+1,
777 conf->resync_lock, 707 conf->resync_lock,
778 ({ flush_pending_writes(conf); 708 flush_pending_writes(conf));
779 raid10_unplug(conf->mddev->queue); })); 709
780 spin_unlock_irq(&conf->resync_lock); 710 spin_unlock_irq(&conf->resync_lock);
781} 711}
782 712
@@ -800,12 +730,13 @@ static int make_request(mddev_t *mddev, struct bio * bio)
800 int chunk_sects = conf->chunk_mask + 1; 730 int chunk_sects = conf->chunk_mask + 1;
801 const int rw = bio_data_dir(bio); 731 const int rw = bio_data_dir(bio);
802 const unsigned long do_sync = (bio->bi_rw & REQ_SYNC); 732 const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
803 struct bio_list bl; 733 const unsigned long do_fua = (bio->bi_rw & REQ_FUA);
804 unsigned long flags; 734 unsigned long flags;
805 mdk_rdev_t *blocked_rdev; 735 mdk_rdev_t *blocked_rdev;
736 int plugged;
806 737
807 if (unlikely(bio->bi_rw & REQ_HARDBARRIER)) { 738 if (unlikely(bio->bi_rw & REQ_FLUSH)) {
808 md_barrier_request(mddev, bio); 739 md_flush_request(mddev, bio);
809 return 0; 740 return 0;
810 } 741 }
811 742
@@ -889,7 +820,7 @@ static int make_request(mddev_t *mddev, struct bio * bio)
889 } 820 }
890 mirror = conf->mirrors + disk; 821 mirror = conf->mirrors + disk;
891 822
892 read_bio = bio_clone(bio, GFP_NOIO); 823 read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
893 824
894 r10_bio->devs[slot].bio = read_bio; 825 r10_bio->devs[slot].bio = read_bio;
895 826
@@ -911,6 +842,8 @@ static int make_request(mddev_t *mddev, struct bio * bio)
911 * inc refcount on their rdev. Record them by setting 842 * inc refcount on their rdev. Record them by setting
912 * bios[x] to bio 843 * bios[x] to bio
913 */ 844 */
845 plugged = mddev_check_plugged(mddev);
846
914 raid10_find_phys(conf, r10_bio); 847 raid10_find_phys(conf, r10_bio);
915 retry_write: 848 retry_write:
916 blocked_rdev = NULL; 849 blocked_rdev = NULL;
@@ -949,48 +882,46 @@ static int make_request(mddev_t *mddev, struct bio * bio)
949 goto retry_write; 882 goto retry_write;
950 } 883 }
951 884
952 atomic_set(&r10_bio->remaining, 0); 885 atomic_set(&r10_bio->remaining, 1);
886 bitmap_startwrite(mddev->bitmap, bio->bi_sector, r10_bio->sectors, 0);
953 887
954 bio_list_init(&bl);
955 for (i = 0; i < conf->copies; i++) { 888 for (i = 0; i < conf->copies; i++) {
956 struct bio *mbio; 889 struct bio *mbio;
957 int d = r10_bio->devs[i].devnum; 890 int d = r10_bio->devs[i].devnum;
958 if (!r10_bio->devs[i].bio) 891 if (!r10_bio->devs[i].bio)
959 continue; 892 continue;
960 893
961 mbio = bio_clone(bio, GFP_NOIO); 894 mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
962 r10_bio->devs[i].bio = mbio; 895 r10_bio->devs[i].bio = mbio;
963 896
964 mbio->bi_sector = r10_bio->devs[i].addr+ 897 mbio->bi_sector = r10_bio->devs[i].addr+
965 conf->mirrors[d].rdev->data_offset; 898 conf->mirrors[d].rdev->data_offset;
966 mbio->bi_bdev = conf->mirrors[d].rdev->bdev; 899 mbio->bi_bdev = conf->mirrors[d].rdev->bdev;
967 mbio->bi_end_io = raid10_end_write_request; 900 mbio->bi_end_io = raid10_end_write_request;
968 mbio->bi_rw = WRITE | do_sync; 901 mbio->bi_rw = WRITE | do_sync | do_fua;
969 mbio->bi_private = r10_bio; 902 mbio->bi_private = r10_bio;
970 903
971 atomic_inc(&r10_bio->remaining); 904 atomic_inc(&r10_bio->remaining);
972 bio_list_add(&bl, mbio); 905 spin_lock_irqsave(&conf->device_lock, flags);
906 bio_list_add(&conf->pending_bio_list, mbio);
907 spin_unlock_irqrestore(&conf->device_lock, flags);
973 } 908 }
974 909
975 if (unlikely(!atomic_read(&r10_bio->remaining))) { 910 if (atomic_dec_and_test(&r10_bio->remaining)) {
976 /* the array is dead */ 911 /* This matches the end of raid10_end_write_request() */
912 bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector,
913 r10_bio->sectors,
914 !test_bit(R10BIO_Degraded, &r10_bio->state),
915 0);
977 md_write_end(mddev); 916 md_write_end(mddev);
978 raid_end_bio_io(r10_bio); 917 raid_end_bio_io(r10_bio);
979 return 0;
980 } 918 }
981 919
982 bitmap_startwrite(mddev->bitmap, bio->bi_sector, r10_bio->sectors, 0);
983 spin_lock_irqsave(&conf->device_lock, flags);
984 bio_list_merge(&conf->pending_bio_list, &bl);
985 blk_plug_device(mddev->queue);
986 spin_unlock_irqrestore(&conf->device_lock, flags);
987
988 /* In case raid10d snuck in to freeze_array */ 920 /* In case raid10d snuck in to freeze_array */
989 wake_up(&conf->wait_barrier); 921 wake_up(&conf->wait_barrier);
990 922
991 if (do_sync) 923 if (do_sync || !mddev->bitmap || !plugged)
992 md_wakeup_thread(mddev->thread); 924 md_wakeup_thread(mddev->thread);
993
994 return 0; 925 return 0;
995} 926}
996 927
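
One behavioural change in the write path above is easy to miss: r10_bio->remaining now starts at 1 rather than 0, and the separate "array is dead" special case is gone. That is the standard submitter-bias idiom for completion counters; a generic sketch with hypothetical names (pending_io, submit_one_copy, complete_all):

/* Generic sketch of the submitter-bias idiom: the counter starts at 1 so the
 * completion work runs exactly once — in the last end_io or in the
 * submitter's final drop — never while copies are still being queued. */
struct pending_io {
        atomic_t remaining;
};

static void complete_all(struct pending_io *io);           /* hypothetical */
static void submit_one_copy(struct pending_io *io, int i); /* hypothetical */

static void put_pending(struct pending_io *io)
{
        if (atomic_dec_and_test(&io->remaining))
                complete_all(io);
}

static void submit_copies(struct pending_io *io, int ncopies)
{
        int i;

        atomic_set(&io->remaining, 1);          /* bias: submitter holds one ref */
        for (i = 0; i < ncopies; i++) {
                atomic_inc(&io->remaining);
                submit_one_copy(io, i);         /* may complete at any time */
        }
        put_pending(io);                        /* drop the submitter's ref */
}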
@@ -1051,8 +982,9 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
1051 } 982 }
1052 set_bit(Faulty, &rdev->flags); 983 set_bit(Faulty, &rdev->flags);
1053 set_bit(MD_CHANGE_DEVS, &mddev->flags); 984 set_bit(MD_CHANGE_DEVS, &mddev->flags);
1054 printk(KERN_ALERT "md/raid10:%s: Disk failure on %s, disabling device.\n" 985 printk(KERN_ALERT
1055 KERN_ALERT "md/raid10:%s: Operation continuing on %d devices.\n", 986 "md/raid10:%s: Disk failure on %s, disabling device.\n"
987 "md/raid10:%s: Operation continuing on %d devices.\n",
1056 mdname(mddev), bdevname(rdev->bdev, b), 988 mdname(mddev), bdevname(rdev->bdev, b),
1057 mdname(mddev), conf->raid_disks - mddev->degraded); 989 mdname(mddev), conf->raid_disks - mddev->degraded);
1058} 990}
@@ -1229,7 +1161,7 @@ static int raid10_remove_disk(mddev_t *mddev, int number)
1229 p->rdev = rdev; 1161 p->rdev = rdev;
1230 goto abort; 1162 goto abort;
1231 } 1163 }
1232 md_integrity_register(mddev); 1164 err = md_integrity_register(mddev);
1233 } 1165 }
1234abort: 1166abort:
1235 1167
@@ -1505,40 +1437,33 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
1505 int max_read_errors = atomic_read(&mddev->max_corr_read_errors); 1437 int max_read_errors = atomic_read(&mddev->max_corr_read_errors);
1506 int d = r10_bio->devs[r10_bio->read_slot].devnum; 1438 int d = r10_bio->devs[r10_bio->read_slot].devnum;
1507 1439
1508 rcu_read_lock(); 1440 /* still own a reference to this rdev, so it cannot
1509 rdev = rcu_dereference(conf->mirrors[d].rdev); 1441 * have been cleared recently.
1510 if (rdev) { /* If rdev is not NULL */ 1442 */
1511 char b[BDEVNAME_SIZE]; 1443 rdev = conf->mirrors[d].rdev;
1512 int cur_read_error_count = 0;
1513 1444
1514 bdevname(rdev->bdev, b); 1445 if (test_bit(Faulty, &rdev->flags))
1446 /* drive has already been failed, just ignore any
1447 more fix_read_error() attempts */
1448 return;
1515 1449
1516 if (test_bit(Faulty, &rdev->flags)) { 1450 check_decay_read_errors(mddev, rdev);
1517 rcu_read_unlock(); 1451 atomic_inc(&rdev->read_errors);
1518 /* drive has already been failed, just ignore any 1452 if (atomic_read(&rdev->read_errors) > max_read_errors) {
1519 more fix_read_error() attempts */ 1453 char b[BDEVNAME_SIZE];
1520 return; 1454 bdevname(rdev->bdev, b);
1521 }
1522 1455
1523 check_decay_read_errors(mddev, rdev); 1456 printk(KERN_NOTICE
1524 atomic_inc(&rdev->read_errors); 1457 "md/raid10:%s: %s: Raid device exceeded "
1525 cur_read_error_count = atomic_read(&rdev->read_errors); 1458 "read_error threshold [cur %d:max %d]\n",
1526 if (cur_read_error_count > max_read_errors) { 1459 mdname(mddev), b,
1527 rcu_read_unlock(); 1460 atomic_read(&rdev->read_errors), max_read_errors);
1528 printk(KERN_NOTICE 1461 printk(KERN_NOTICE
1529 "md/raid10:%s: %s: Raid device exceeded " 1462 "md/raid10:%s: %s: Failing raid device\n",
1530 "read_error threshold " 1463 mdname(mddev), b);
1531 "[cur %d:max %d]\n", 1464 md_error(mddev, conf->mirrors[d].rdev);
1532 mdname(mddev), 1465 return;
1533 b, cur_read_error_count, max_read_errors);
1534 printk(KERN_NOTICE
1535 "md/raid10:%s: %s: Failing raid "
1536 "device\n", mdname(mddev), b);
1537 md_error(mddev, conf->mirrors[d].rdev);
1538 return;
1539 }
1540 } 1466 }
1541 rcu_read_unlock();
1542 1467
1543 while(sectors) { 1468 while(sectors) {
1544 int s = sectors; 1469 int s = sectors;
@@ -1557,11 +1482,11 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
1557 test_bit(In_sync, &rdev->flags)) { 1482 test_bit(In_sync, &rdev->flags)) {
1558 atomic_inc(&rdev->nr_pending); 1483 atomic_inc(&rdev->nr_pending);
1559 rcu_read_unlock(); 1484 rcu_read_unlock();
1560 success = sync_page_io(rdev->bdev, 1485 success = sync_page_io(rdev,
1561 r10_bio->devs[sl].addr + 1486 r10_bio->devs[sl].addr +
1562 sect + rdev->data_offset, 1487 sect,
1563 s<<9, 1488 s<<9,
1564 conf->tmppage, READ); 1489 conf->tmppage, READ, false);
1565 rdev_dec_pending(rdev, mddev); 1490 rdev_dec_pending(rdev, mddev);
1566 rcu_read_lock(); 1491 rcu_read_lock();
1567 if (success) 1492 if (success)
@@ -1596,10 +1521,10 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
1596 atomic_inc(&rdev->nr_pending); 1521 atomic_inc(&rdev->nr_pending);
1597 rcu_read_unlock(); 1522 rcu_read_unlock();
1598 atomic_add(s, &rdev->corrected_errors); 1523 atomic_add(s, &rdev->corrected_errors);
1599 if (sync_page_io(rdev->bdev, 1524 if (sync_page_io(rdev,
1600 r10_bio->devs[sl].addr + 1525 r10_bio->devs[sl].addr +
1601 sect + rdev->data_offset, 1526 sect,
1602 s<<9, conf->tmppage, WRITE) 1527 s<<9, conf->tmppage, WRITE, false)
1603 == 0) { 1528 == 0) {
1604 /* Well, this device is dead */ 1529 /* Well, this device is dead */
1605 printk(KERN_NOTICE 1530 printk(KERN_NOTICE
@@ -1607,8 +1532,8 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
1607 "write failed" 1532 "write failed"
1608 " (%d sectors at %llu on %s)\n", 1533 " (%d sectors at %llu on %s)\n",
1609 mdname(mddev), s, 1534 mdname(mddev), s,
1610 (unsigned long long)(sect+ 1535 (unsigned long long)(
1611 rdev->data_offset), 1536 sect + rdev->data_offset),
1612 bdevname(rdev->bdev, b)); 1537 bdevname(rdev->bdev, b));
1613 printk(KERN_NOTICE "md/raid10:%s: %s: failing " 1538 printk(KERN_NOTICE "md/raid10:%s: %s: failing "
1614 "drive\n", 1539 "drive\n",
@@ -1633,19 +1558,19 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
1633 char b[BDEVNAME_SIZE]; 1558 char b[BDEVNAME_SIZE];
1634 atomic_inc(&rdev->nr_pending); 1559 atomic_inc(&rdev->nr_pending);
1635 rcu_read_unlock(); 1560 rcu_read_unlock();
1636 if (sync_page_io(rdev->bdev, 1561 if (sync_page_io(rdev,
1637 r10_bio->devs[sl].addr + 1562 r10_bio->devs[sl].addr +
1638 sect + rdev->data_offset, 1563 sect,
1639 s<<9, conf->tmppage, 1564 s<<9, conf->tmppage,
1640 READ) == 0) { 1565 READ, false) == 0) {
1641 /* Well, this device is dead */ 1566 /* Well, this device is dead */
1642 printk(KERN_NOTICE 1567 printk(KERN_NOTICE
1643 "md/raid10:%s: unable to read back " 1568 "md/raid10:%s: unable to read back "
1644 "corrected sectors" 1569 "corrected sectors"
1645 " (%d sectors at %llu on %s)\n", 1570 " (%d sectors at %llu on %s)\n",
1646 mdname(mddev), s, 1571 mdname(mddev), s,
1647 (unsigned long long)(sect+ 1572 (unsigned long long)(
1648 rdev->data_offset), 1573 sect + rdev->data_offset),
1649 bdevname(rdev->bdev, b)); 1574 bdevname(rdev->bdev, b));
1650 printk(KERN_NOTICE "md/raid10:%s: %s: failing drive\n", 1575 printk(KERN_NOTICE "md/raid10:%s: %s: failing drive\n",
1651 mdname(mddev), 1576 mdname(mddev),
@@ -1657,8 +1582,8 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
1657 "md/raid10:%s: read error corrected" 1582 "md/raid10:%s: read error corrected"
1658 " (%d sectors at %llu on %s)\n", 1583 " (%d sectors at %llu on %s)\n",
1659 mdname(mddev), s, 1584 mdname(mddev), s,
1660 (unsigned long long)(sect+ 1585 (unsigned long long)(
1661 rdev->data_offset), 1586 sect + rdev->data_offset),
1662 bdevname(rdev->bdev, b)); 1587 bdevname(rdev->bdev, b));
1663 } 1588 }
1664 1589
@@ -1680,15 +1605,16 @@ static void raid10d(mddev_t *mddev)
1680 unsigned long flags; 1605 unsigned long flags;
1681 conf_t *conf = mddev->private; 1606 conf_t *conf = mddev->private;
1682 struct list_head *head = &conf->retry_list; 1607 struct list_head *head = &conf->retry_list;
1683 int unplug=0;
1684 mdk_rdev_t *rdev; 1608 mdk_rdev_t *rdev;
1609 struct blk_plug plug;
1685 1610
1686 md_check_recovery(mddev); 1611 md_check_recovery(mddev);
1687 1612
1613 blk_start_plug(&plug);
1688 for (;;) { 1614 for (;;) {
1689 char b[BDEVNAME_SIZE]; 1615 char b[BDEVNAME_SIZE];
1690 1616
1691 unplug += flush_pending_writes(conf); 1617 flush_pending_writes(conf);
1692 1618
1693 spin_lock_irqsave(&conf->device_lock, flags); 1619 spin_lock_irqsave(&conf->device_lock, flags);
1694 if (list_empty(head)) { 1620 if (list_empty(head)) {
@@ -1702,14 +1628,13 @@ static void raid10d(mddev_t *mddev)
1702 1628
1703 mddev = r10_bio->mddev; 1629 mddev = r10_bio->mddev;
1704 conf = mddev->private; 1630 conf = mddev->private;
1705 if (test_bit(R10BIO_IsSync, &r10_bio->state)) { 1631 if (test_bit(R10BIO_IsSync, &r10_bio->state))
1706 sync_request_write(mddev, r10_bio); 1632 sync_request_write(mddev, r10_bio);
1707 unplug = 1; 1633 else if (test_bit(R10BIO_IsRecover, &r10_bio->state))
1708 } else if (test_bit(R10BIO_IsRecover, &r10_bio->state)) {
1709 recovery_request_write(mddev, r10_bio); 1634 recovery_request_write(mddev, r10_bio);
1710 unplug = 1; 1635 else {
1711 } else { 1636 int slot = r10_bio->read_slot;
1712 int mirror; 1637 int mirror = r10_bio->devs[slot].devnum;
1713 /* we got a read error. Maybe the drive is bad. Maybe just 1638 /* we got a read error. Maybe the drive is bad. Maybe just
1714 * the block and we can fix it. 1639 * the block and we can fix it.
1715 * We freeze all other IO, and try reading the block from 1640 * We freeze all other IO, and try reading the block from
@@ -1723,9 +1648,10 @@ static void raid10d(mddev_t *mddev)
1723 fix_read_error(conf, mddev, r10_bio); 1648 fix_read_error(conf, mddev, r10_bio);
1724 unfreeze_array(conf); 1649 unfreeze_array(conf);
1725 } 1650 }
1651 rdev_dec_pending(conf->mirrors[mirror].rdev, mddev);
1726 1652
1727 bio = r10_bio->devs[r10_bio->read_slot].bio; 1653 bio = r10_bio->devs[slot].bio;
1728 r10_bio->devs[r10_bio->read_slot].bio = 1654 r10_bio->devs[slot].bio =
1729 mddev->ro ? IO_BLOCKED : NULL; 1655 mddev->ro ? IO_BLOCKED : NULL;
1730 mirror = read_balance(conf, r10_bio); 1656 mirror = read_balance(conf, r10_bio);
1731 if (mirror == -1) { 1657 if (mirror == -1) {
@@ -1739,6 +1665,7 @@ static void raid10d(mddev_t *mddev)
1739 } else { 1665 } else {
1740 const unsigned long do_sync = (r10_bio->master_bio->bi_rw & REQ_SYNC); 1666 const unsigned long do_sync = (r10_bio->master_bio->bi_rw & REQ_SYNC);
1741 bio_put(bio); 1667 bio_put(bio);
1668 slot = r10_bio->read_slot;
1742 rdev = conf->mirrors[mirror].rdev; 1669 rdev = conf->mirrors[mirror].rdev;
1743 if (printk_ratelimit()) 1670 if (printk_ratelimit())
1744 printk(KERN_ERR "md/raid10:%s: %s: redirecting sector %llu to" 1671 printk(KERN_ERR "md/raid10:%s: %s: redirecting sector %llu to"
@@ -1746,22 +1673,21 @@ static void raid10d(mddev_t *mddev)
1746 mdname(mddev), 1673 mdname(mddev),
1747 bdevname(rdev->bdev,b), 1674 bdevname(rdev->bdev,b),
1748 (unsigned long long)r10_bio->sector); 1675 (unsigned long long)r10_bio->sector);
1749 bio = bio_clone(r10_bio->master_bio, GFP_NOIO); 1676 bio = bio_clone_mddev(r10_bio->master_bio,
1750 r10_bio->devs[r10_bio->read_slot].bio = bio; 1677 GFP_NOIO, mddev);
1751 bio->bi_sector = r10_bio->devs[r10_bio->read_slot].addr 1678 r10_bio->devs[slot].bio = bio;
1679 bio->bi_sector = r10_bio->devs[slot].addr
1752 + rdev->data_offset; 1680 + rdev->data_offset;
1753 bio->bi_bdev = rdev->bdev; 1681 bio->bi_bdev = rdev->bdev;
1754 bio->bi_rw = READ | do_sync; 1682 bio->bi_rw = READ | do_sync;
1755 bio->bi_private = r10_bio; 1683 bio->bi_private = r10_bio;
1756 bio->bi_end_io = raid10_end_read_request; 1684 bio->bi_end_io = raid10_end_read_request;
1757 unplug = 1;
1758 generic_make_request(bio); 1685 generic_make_request(bio);
1759 } 1686 }
1760 } 1687 }
1761 cond_resched(); 1688 cond_resched();
1762 } 1689 }
1763 if (unplug) 1690 blk_finish_plug(&plug);
1764 unplug_slaves(mddev);
1765} 1691}
1766 1692
1767 1693
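
raid10d() above now brackets its work loop with blk_start_plug()/blk_finish_plug(), the per-task plugging API that replaces the removed unplug_fn/unplug_slaves() machinery. The basic shape of that API, as a minimal sketch (the submission loop here is illustrative only):

/* Minimal shape of on-stack plugging: bios submitted between start and
 * finish are batched on the task's plug list and issued together. */
static void submit_plugged(struct bio *list)
{
        struct blk_plug plug;

        blk_start_plug(&plug);
        while (list) {
                struct bio *next = list->bi_next;
                list->bi_next = NULL;
                generic_make_request(list);     /* collected behind the plug */
                list = next;
        }
        blk_finish_plug(&plug);                 /* flush the batch to the devices */
}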
@@ -1810,16 +1736,16 @@ static int init_resync(conf_t *conf)
1810 * 1736 *
1811 */ 1737 */
1812 1738
1813static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster) 1739static sector_t sync_request(mddev_t *mddev, sector_t sector_nr,
1740 int *skipped, int go_faster)
1814{ 1741{
1815 conf_t *conf = mddev->private; 1742 conf_t *conf = mddev->private;
1816 r10bio_t *r10_bio; 1743 r10bio_t *r10_bio;
1817 struct bio *biolist = NULL, *bio; 1744 struct bio *biolist = NULL, *bio;
1818 sector_t max_sector, nr_sectors; 1745 sector_t max_sector, nr_sectors;
1819 int disk;
1820 int i; 1746 int i;
1821 int max_sync; 1747 int max_sync;
1822 int sync_blocks; 1748 sector_t sync_blocks;
1823 1749
1824 sector_t sectors_skipped = 0; 1750 sector_t sectors_skipped = 0;
1825 int chunks_skipped = 0; 1751 int chunks_skipped = 0;
@@ -1905,108 +1831,114 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1905 int j, k; 1831 int j, k;
1906 r10_bio = NULL; 1832 r10_bio = NULL;
1907 1833
1908 for (i=0 ; i<conf->raid_disks; i++) 1834 for (i=0 ; i<conf->raid_disks; i++) {
1909 if (conf->mirrors[i].rdev && 1835 int still_degraded;
1910 !test_bit(In_sync, &conf->mirrors[i].rdev->flags)) { 1836 r10bio_t *rb2;
1911 int still_degraded = 0; 1837 sector_t sect;
1912 /* want to reconstruct this device */ 1838 int must_sync;
1913 r10bio_t *rb2 = r10_bio;
1914 sector_t sect = raid10_find_virt(conf, sector_nr, i);
1915 int must_sync;
1916 /* Unless we are doing a full sync, we only need
1917 * to recover the block if it is set in the bitmap
1918 */
1919 must_sync = bitmap_start_sync(mddev->bitmap, sect,
1920 &sync_blocks, 1);
1921 if (sync_blocks < max_sync)
1922 max_sync = sync_blocks;
1923 if (!must_sync &&
1924 !conf->fullsync) {
1925 /* yep, skip the sync_blocks here, but don't assume
1926 * that there will never be anything to do here
1927 */
1928 chunks_skipped = -1;
1929 continue;
1930 }
1931 1839
1932 r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO); 1840 if (conf->mirrors[i].rdev == NULL ||
1933 raise_barrier(conf, rb2 != NULL); 1841 test_bit(In_sync, &conf->mirrors[i].rdev->flags))
1934 atomic_set(&r10_bio->remaining, 0); 1842 continue;
1935 1843
1936 r10_bio->master_bio = (struct bio*)rb2; 1844 still_degraded = 0;
1937 if (rb2) 1845 /* want to reconstruct this device */
1938 atomic_inc(&rb2->remaining); 1846 rb2 = r10_bio;
1939 r10_bio->mddev = mddev; 1847 sect = raid10_find_virt(conf, sector_nr, i);
1940 set_bit(R10BIO_IsRecover, &r10_bio->state); 1848 /* Unless we are doing a full sync, we only need
1941 r10_bio->sector = sect; 1849 * to recover the block if it is set in the bitmap
1850 */
1851 must_sync = bitmap_start_sync(mddev->bitmap, sect,
1852 &sync_blocks, 1);
1853 if (sync_blocks < max_sync)
1854 max_sync = sync_blocks;
1855 if (!must_sync &&
1856 !conf->fullsync) {
1857 /* yep, skip the sync_blocks here, but don't assume
1858 * that there will never be anything to do here
1859 */
1860 chunks_skipped = -1;
1861 continue;
1862 }
1942 1863
1943 raid10_find_phys(conf, r10_bio); 1864 r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
1865 raise_barrier(conf, rb2 != NULL);
1866 atomic_set(&r10_bio->remaining, 0);
1944 1867
1945 /* Need to check if the array will still be 1868 r10_bio->master_bio = (struct bio*)rb2;
1946 * degraded 1869 if (rb2)
1947 */ 1870 atomic_inc(&rb2->remaining);
1948 for (j=0; j<conf->raid_disks; j++) 1871 r10_bio->mddev = mddev;
1949 if (conf->mirrors[j].rdev == NULL || 1872 set_bit(R10BIO_IsRecover, &r10_bio->state);
1950 test_bit(Faulty, &conf->mirrors[j].rdev->flags)) { 1873 r10_bio->sector = sect;
1951 still_degraded = 1;
1952 break;
1953 }
1954
1955 must_sync = bitmap_start_sync(mddev->bitmap, sect,
1956 &sync_blocks, still_degraded);
1957
1958 for (j=0; j<conf->copies;j++) {
1959 int d = r10_bio->devs[j].devnum;
1960 if (conf->mirrors[d].rdev &&
1961 test_bit(In_sync, &conf->mirrors[d].rdev->flags)) {
1962 /* This is where we read from */
1963 bio = r10_bio->devs[0].bio;
1964 bio->bi_next = biolist;
1965 biolist = bio;
1966 bio->bi_private = r10_bio;
1967 bio->bi_end_io = end_sync_read;
1968 bio->bi_rw = READ;
1969 bio->bi_sector = r10_bio->devs[j].addr +
1970 conf->mirrors[d].rdev->data_offset;
1971 bio->bi_bdev = conf->mirrors[d].rdev->bdev;
1972 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
1973 atomic_inc(&r10_bio->remaining);
1974 /* and we write to 'i' */
1975
1976 for (k=0; k<conf->copies; k++)
1977 if (r10_bio->devs[k].devnum == i)
1978 break;
1979 BUG_ON(k == conf->copies);
1980 bio = r10_bio->devs[1].bio;
1981 bio->bi_next = biolist;
1982 biolist = bio;
1983 bio->bi_private = r10_bio;
1984 bio->bi_end_io = end_sync_write;
1985 bio->bi_rw = WRITE;
1986 bio->bi_sector = r10_bio->devs[k].addr +
1987 conf->mirrors[i].rdev->data_offset;
1988 bio->bi_bdev = conf->mirrors[i].rdev->bdev;
1989
1990 r10_bio->devs[0].devnum = d;
1991 r10_bio->devs[1].devnum = i;
1992 1874
1993 break; 1875 raid10_find_phys(conf, r10_bio);
1994 } 1876
1995 } 1877 /* Need to check if the array will still be
1996 if (j == conf->copies) { 1878 * degraded
1997 /* Cannot recover, so abort the recovery */ 1879 */
1998 put_buf(r10_bio); 1880 for (j=0; j<conf->raid_disks; j++)
1999 if (rb2) 1881 if (conf->mirrors[j].rdev == NULL ||
2000 atomic_dec(&rb2->remaining); 1882 test_bit(Faulty, &conf->mirrors[j].rdev->flags)) {
2001 r10_bio = rb2; 1883 still_degraded = 1;
2002 if (!test_and_set_bit(MD_RECOVERY_INTR,
2003 &mddev->recovery))
2004 printk(KERN_INFO "md/raid10:%s: insufficient "
2005 "working devices for recovery.\n",
2006 mdname(mddev));
2007 break; 1884 break;
2008 } 1885 }
1886
1887 must_sync = bitmap_start_sync(mddev->bitmap, sect,
1888 &sync_blocks, still_degraded);
1889
1890 for (j=0; j<conf->copies;j++) {
1891 int d = r10_bio->devs[j].devnum;
1892 if (!conf->mirrors[d].rdev ||
1893 !test_bit(In_sync, &conf->mirrors[d].rdev->flags))
1894 continue;
1895 /* This is where we read from */
1896 bio = r10_bio->devs[0].bio;
1897 bio->bi_next = biolist;
1898 biolist = bio;
1899 bio->bi_private = r10_bio;
1900 bio->bi_end_io = end_sync_read;
1901 bio->bi_rw = READ;
1902 bio->bi_sector = r10_bio->devs[j].addr +
1903 conf->mirrors[d].rdev->data_offset;
1904 bio->bi_bdev = conf->mirrors[d].rdev->bdev;
1905 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
1906 atomic_inc(&r10_bio->remaining);
1907 /* and we write to 'i' */
1908
1909 for (k=0; k<conf->copies; k++)
1910 if (r10_bio->devs[k].devnum == i)
1911 break;
1912 BUG_ON(k == conf->copies);
1913 bio = r10_bio->devs[1].bio;
1914 bio->bi_next = biolist;
1915 biolist = bio;
1916 bio->bi_private = r10_bio;
1917 bio->bi_end_io = end_sync_write;
1918 bio->bi_rw = WRITE;
1919 bio->bi_sector = r10_bio->devs[k].addr +
1920 conf->mirrors[i].rdev->data_offset;
1921 bio->bi_bdev = conf->mirrors[i].rdev->bdev;
1922
1923 r10_bio->devs[0].devnum = d;
1924 r10_bio->devs[1].devnum = i;
1925
1926 break;
1927 }
1928 if (j == conf->copies) {
1929 /* Cannot recover, so abort the recovery */
1930 put_buf(r10_bio);
1931 if (rb2)
1932 atomic_dec(&rb2->remaining);
1933 r10_bio = rb2;
1934 if (!test_and_set_bit(MD_RECOVERY_INTR,
1935 &mddev->recovery))
1936 printk(KERN_INFO "md/raid10:%s: insufficient "
1937 "working devices for recovery.\n",
1938 mdname(mddev));
1939 break;
2009 } 1940 }
1941 }
2010 if (biolist == NULL) { 1942 if (biolist == NULL) {
2011 while (r10_bio) { 1943 while (r10_bio) {
2012 r10bio_t *rb2 = r10_bio; 1944 r10bio_t *rb2 = r10_bio;
@@ -2024,7 +1956,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
2024 1956
2025 if (!bitmap_start_sync(mddev->bitmap, sector_nr, 1957 if (!bitmap_start_sync(mddev->bitmap, sector_nr,
2026 &sync_blocks, mddev->degraded) && 1958 &sync_blocks, mddev->degraded) &&
2027 !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { 1959 !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED,
1960 &mddev->recovery)) {
2028 /* We can skip this block */ 1961 /* We can skip this block */
2029 *skipped = 1; 1962 *skipped = 1;
2030 return sync_blocks + sectors_skipped; 1963 return sync_blocks + sectors_skipped;
@@ -2069,7 +2002,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
2069 for (i=0; i<conf->copies; i++) { 2002 for (i=0; i<conf->copies; i++) {
2070 int d = r10_bio->devs[i].devnum; 2003 int d = r10_bio->devs[i].devnum;
2071 if (r10_bio->devs[i].bio->bi_end_io) 2004 if (r10_bio->devs[i].bio->bi_end_io)
2072 rdev_dec_pending(conf->mirrors[d].rdev, mddev); 2005 rdev_dec_pending(conf->mirrors[d].rdev,
2006 mddev);
2073 } 2007 }
2074 put_buf(r10_bio); 2008 put_buf(r10_bio);
2075 biolist = NULL; 2009 biolist = NULL;
@@ -2094,26 +2028,27 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
2094 do { 2028 do {
2095 struct page *page; 2029 struct page *page;
2096 int len = PAGE_SIZE; 2030 int len = PAGE_SIZE;
2097 disk = 0;
2098 if (sector_nr + (len>>9) > max_sector) 2031 if (sector_nr + (len>>9) > max_sector)
2099 len = (max_sector - sector_nr) << 9; 2032 len = (max_sector - sector_nr) << 9;
2100 if (len == 0) 2033 if (len == 0)
2101 break; 2034 break;
2102 for (bio= biolist ; bio ; bio=bio->bi_next) { 2035 for (bio= biolist ; bio ; bio=bio->bi_next) {
2036 struct bio *bio2;
2103 page = bio->bi_io_vec[bio->bi_vcnt].bv_page; 2037 page = bio->bi_io_vec[bio->bi_vcnt].bv_page;
2104 if (bio_add_page(bio, page, len, 0) == 0) { 2038 if (bio_add_page(bio, page, len, 0))
2105 /* stop here */ 2039 continue;
2106 struct bio *bio2; 2040
2107 bio->bi_io_vec[bio->bi_vcnt].bv_page = page; 2041 /* stop here */
2108 for (bio2 = biolist; bio2 && bio2 != bio; bio2 = bio2->bi_next) { 2042 bio->bi_io_vec[bio->bi_vcnt].bv_page = page;
2109 /* remove last page from this bio */ 2043 for (bio2 = biolist;
2110 bio2->bi_vcnt--; 2044 bio2 && bio2 != bio;
2111 bio2->bi_size -= len; 2045 bio2 = bio2->bi_next) {
2112 bio2->bi_flags &= ~(1<< BIO_SEG_VALID); 2046 /* remove last page from this bio */
2113 } 2047 bio2->bi_vcnt--;
2114 goto bio_full; 2048 bio2->bi_size -= len;
2049 bio2->bi_flags &= ~(1<< BIO_SEG_VALID);
2115 } 2050 }
2116 disk = i; 2051 goto bio_full;
2117 } 2052 }
2118 nr_sectors += len>>9; 2053 nr_sectors += len>>9;
2119 sector_nr += len>>9; 2054 sector_nr += len>>9;
@@ -2302,8 +2237,6 @@ static int run(mddev_t *mddev)
2302 if (!conf) 2237 if (!conf)
2303 goto out; 2238 goto out;
2304 2239
2305 mddev->queue->queue_lock = &conf->device_lock;
2306
2307 mddev->thread = conf->thread; 2240 mddev->thread = conf->thread;
2308 conf->thread = NULL; 2241 conf->thread = NULL;
2309 2242
@@ -2374,7 +2307,6 @@ static int run(mddev_t *mddev)
2374 md_set_array_sectors(mddev, size); 2307 md_set_array_sectors(mddev, size);
2375 mddev->resync_max_sectors = size; 2308 mddev->resync_max_sectors = size;
2376 2309
2377 mddev->queue->unplug_fn = raid10_unplug;
2378 mddev->queue->backing_dev_info.congested_fn = raid10_congested; 2310 mddev->queue->backing_dev_info.congested_fn = raid10_congested;
2379 mddev->queue->backing_dev_info.congested_data = mddev; 2311 mddev->queue->backing_dev_info.congested_data = mddev;
2380 2312
@@ -2392,17 +2324,20 @@ static int run(mddev_t *mddev)
2392 2324
2393 if (conf->near_copies < conf->raid_disks) 2325 if (conf->near_copies < conf->raid_disks)
2394 blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec); 2326 blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec);
2395 md_integrity_register(mddev); 2327
2328 if (md_integrity_register(mddev))
2329 goto out_free_conf;
2330
2396 return 0; 2331 return 0;
2397 2332
2398out_free_conf: 2333out_free_conf:
2334 md_unregister_thread(mddev->thread);
2399 if (conf->r10bio_pool) 2335 if (conf->r10bio_pool)
2400 mempool_destroy(conf->r10bio_pool); 2336 mempool_destroy(conf->r10bio_pool);
2401 safe_put_page(conf->tmppage); 2337 safe_put_page(conf->tmppage);
2402 kfree(conf->mirrors); 2338 kfree(conf->mirrors);
2403 kfree(conf); 2339 kfree(conf);
2404 mddev->private = NULL; 2340 mddev->private = NULL;
2405 md_unregister_thread(mddev->thread);
2406out: 2341out:
2407 return -EIO; 2342 return -EIO;
2408} 2343}
@@ -2461,11 +2396,13 @@ static void *raid10_takeover_raid0(mddev_t *mddev)
2461 mddev->recovery_cp = MaxSector; 2396 mddev->recovery_cp = MaxSector;
2462 2397
2463 conf = setup_conf(mddev); 2398 conf = setup_conf(mddev);
2464 if (!IS_ERR(conf)) 2399 if (!IS_ERR(conf)) {
2465 list_for_each_entry(rdev, &mddev->disks, same_set) 2400 list_for_each_entry(rdev, &mddev->disks, same_set)
2466 if (rdev->raid_disk >= 0) 2401 if (rdev->raid_disk >= 0)
2467 rdev->new_raid_disk = rdev->raid_disk * 2; 2402 rdev->new_raid_disk = rdev->raid_disk * 2;
2468 2403 conf->barrier = 1;
2404 }
2405
2469 return conf; 2406 return conf;
2470} 2407}
2471 2408
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index 2316ac2e8e21..944b1104d3b4 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -17,8 +17,8 @@ struct r10_private_data_s {
17 spinlock_t device_lock; 17 spinlock_t device_lock;
18 18
19 /* geometry */ 19 /* geometry */
20 int near_copies; /* number of copies layed out raid0 style */ 20 int near_copies; /* number of copies laid out raid0 style */
21 int far_copies; /* number of copies layed out 21 int far_copies; /* number of copies laid out
22 * at large strides across drives 22 * at large strides across drives
23 */ 23 */
24 int far_offset; /* far_copies are offset by 1 stripe 24 int far_offset; /* far_copies are offset by 1 stripe
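
A quick way to sanity-check the geometry fields above: each chunk exists near_copies * far_copies times, so that factor divides the usable capacity. Illustration only (rounding and 32-bit division helpers ignored):

/* Illustration: usable sectors of a raid10 array where every member
 * contributes dev_sectors sectors. */
static sector_t raid10_usable_sectors(sector_t dev_sectors, int raid_disks,
                                      int near_copies, int far_copies)
{
        return dev_sectors * raid_disks / (near_copies * far_copies);
}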
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 69b0a169e43d..b72edf35ec54 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -27,12 +27,12 @@
27 * 27 *
28 * We group bitmap updates into batches. Each batch has a number. 28 * We group bitmap updates into batches. Each batch has a number.
29 * We may write out several batches at once, but that isn't very important. 29 * We may write out several batches at once, but that isn't very important.
30 * conf->bm_write is the number of the last batch successfully written. 30 * conf->seq_write is the number of the last batch successfully written.
31 * conf->bm_flush is the number of the last batch that was closed to 31 * conf->seq_flush is the number of the last batch that was closed to
32 * new additions. 32 * new additions.
33 * When we discover that we will need to write to any block in a stripe 33 * When we discover that we will need to write to any block in a stripe
34 * (in add_stripe_bio) we update the in-memory bitmap and record in sh->bm_seq 34 * (in add_stripe_bio) we update the in-memory bitmap and record in sh->bm_seq
35 * the number of the batch it will be in. This is bm_flush+1. 35 * the number of the batch it will be in. This is seq_flush+1.
36 * When we are ready to do a write, if that batch hasn't been written yet, 36 * When we are ready to do a write, if that batch hasn't been written yet,
37 * we plug the array and queue the stripe for later. 37 * we plug the array and queue the stripe for later.
38 * When an unplug happens, we increment bm_flush, thus closing the current 38 * When an unplug happens, we increment bm_flush, thus closing the current
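
The bm_write/bm_flush pair renamed above (seq_write/seq_flush) are free-running batch numbers, so "has my batch been written yet" is tested by subtraction rather than direct comparison — see the sh->bm_seq - conf->seq_write > 0 test in __release_stripe() below. The idiom in isolation, assuming plain int counters that may wrap:

/* Wrap-tolerant "is my batch still unwritten?" test: the difference stays
 * meaningful across wraparound as long as the two counters never drift
 * more than INT_MAX apart. */
static inline int batch_not_written(int my_seq, int last_written)
{
        return my_seq - last_written > 0;
}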
@@ -129,7 +129,7 @@ static inline int raid5_dec_bi_hw_segments(struct bio *bio)
129 129
130static inline void raid5_set_bi_hw_segments(struct bio *bio, unsigned int cnt) 130static inline void raid5_set_bi_hw_segments(struct bio *bio, unsigned int cnt)
131{ 131{
132 bio->bi_phys_segments = raid5_bi_phys_segments(bio) || (cnt << 16); 132 bio->bi_phys_segments = raid5_bi_phys_segments(bio) | (cnt << 16);
133} 133}
134 134
135/* Find first data disk in a raid6 stripe */ 135/* Find first data disk in a raid6 stripe */
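
The one-character change above (logical OR to bitwise OR) matters because bi_phys_segments is overloaded as two 16-bit counters: the low half counts active stripes, the high half holds the hw-segment count. With ||, the expression collapsed to 0 or 1 and the count was lost. The packing, spelled out as a standalone illustration:

/* bi_phys_segments as overloaded by raid5: low 16 bits = active-stripe
 * count, high 16 bits = hw-segment count. */
static inline unsigned int pack_segment_counts(unsigned int active,
                                               unsigned int hw)
{
        return (active & 0xffff) | (hw << 16);
}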
@@ -199,14 +199,12 @@ static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
199 BUG_ON(!list_empty(&sh->lru)); 199 BUG_ON(!list_empty(&sh->lru));
200 BUG_ON(atomic_read(&conf->active_stripes)==0); 200 BUG_ON(atomic_read(&conf->active_stripes)==0);
201 if (test_bit(STRIPE_HANDLE, &sh->state)) { 201 if (test_bit(STRIPE_HANDLE, &sh->state)) {
202 if (test_bit(STRIPE_DELAYED, &sh->state)) { 202 if (test_bit(STRIPE_DELAYED, &sh->state))
203 list_add_tail(&sh->lru, &conf->delayed_list); 203 list_add_tail(&sh->lru, &conf->delayed_list);
204 plugger_set_plug(&conf->plug); 204 else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
205 } else if (test_bit(STRIPE_BIT_DELAY, &sh->state) && 205 sh->bm_seq - conf->seq_write > 0)
206 sh->bm_seq - conf->seq_write > 0) {
207 list_add_tail(&sh->lru, &conf->bitmap_list); 206 list_add_tail(&sh->lru, &conf->bitmap_list);
208 plugger_set_plug(&conf->plug); 207 else {
209 } else {
210 clear_bit(STRIPE_BIT_DELAY, &sh->state); 208 clear_bit(STRIPE_BIT_DELAY, &sh->state);
211 list_add_tail(&sh->lru, &conf->handle_list); 209 list_add_tail(&sh->lru, &conf->handle_list);
212 } 210 }
@@ -433,8 +431,6 @@ static int has_failed(raid5_conf_t *conf)
433 return 0; 431 return 0;
434} 432}
435 433
436static void unplug_slaves(mddev_t *mddev);
437
438static struct stripe_head * 434static struct stripe_head *
439get_active_stripe(raid5_conf_t *conf, sector_t sector, 435get_active_stripe(raid5_conf_t *conf, sector_t sector,
440 int previous, int noblock, int noquiesce) 436 int previous, int noblock, int noquiesce)
@@ -463,8 +459,7 @@ get_active_stripe(raid5_conf_t *conf, sector_t sector,
463 < (conf->max_nr_stripes *3/4) 459 < (conf->max_nr_stripes *3/4)
464 || !conf->inactive_blocked), 460 || !conf->inactive_blocked),
465 conf->device_lock, 461 conf->device_lock,
466 md_raid5_unplug_device(conf) 462 );
467 );
468 conf->inactive_blocked = 0; 463 conf->inactive_blocked = 0;
469 } else 464 } else
470 init_stripe(sh, sector, previous); 465 init_stripe(sh, sector, previous);
@@ -506,9 +501,12 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
506 int rw; 501 int rw;
507 struct bio *bi; 502 struct bio *bi;
508 mdk_rdev_t *rdev; 503 mdk_rdev_t *rdev;
509 if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) 504 if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) {
510 rw = WRITE; 505 if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags))
511 else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags)) 506 rw = WRITE_FUA;
507 else
508 rw = WRITE;
509 } else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
512 rw = READ; 510 rw = READ;
513 else 511 else
514 continue; 512 continue;
@@ -516,7 +514,7 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
516 bi = &sh->dev[i].req; 514 bi = &sh->dev[i].req;
517 515
518 bi->bi_rw = rw; 516 bi->bi_rw = rw;
519 if (rw == WRITE) 517 if (rw & WRITE)
520 bi->bi_end_io = raid5_end_write_request; 518 bi->bi_end_io = raid5_end_write_request;
521 else 519 else
522 bi->bi_end_io = raid5_end_read_request; 520 bi->bi_end_io = raid5_end_read_request;
@@ -550,13 +548,13 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
550 bi->bi_io_vec[0].bv_offset = 0; 548 bi->bi_io_vec[0].bv_offset = 0;
551 bi->bi_size = STRIPE_SIZE; 549 bi->bi_size = STRIPE_SIZE;
552 bi->bi_next = NULL; 550 bi->bi_next = NULL;
553 if (rw == WRITE && 551 if ((rw & WRITE) &&
554 test_bit(R5_ReWrite, &sh->dev[i].flags)) 552 test_bit(R5_ReWrite, &sh->dev[i].flags))
555 atomic_add(STRIPE_SECTORS, 553 atomic_add(STRIPE_SECTORS,
556 &rdev->corrected_errors); 554 &rdev->corrected_errors);
557 generic_make_request(bi); 555 generic_make_request(bi);
558 } else { 556 } else {
559 if (rw == WRITE) 557 if (rw & WRITE)
560 set_bit(STRIPE_DEGRADED, &sh->state); 558 set_bit(STRIPE_DEGRADED, &sh->state);
561 pr_debug("skip op %ld on disc %d for sector %llu\n", 559 pr_debug("skip op %ld on disc %d for sector %llu\n",
562 bi->bi_rw, i, (unsigned long long)sh->sector); 560 bi->bi_rw, i, (unsigned long long)sh->sector);
@@ -587,7 +585,7 @@ async_copy_data(int frombio, struct bio *bio, struct page *page,
587 init_async_submit(&submit, flags, tx, NULL, NULL, NULL); 585 init_async_submit(&submit, flags, tx, NULL, NULL, NULL);
588 586
589 bio_for_each_segment(bvl, bio, i) { 587 bio_for_each_segment(bvl, bio, i) {
590 int len = bio_iovec_idx(bio, i)->bv_len; 588 int len = bvl->bv_len;
591 int clen; 589 int clen;
592 int b_offset = 0; 590 int b_offset = 0;
593 591
@@ -603,8 +601,8 @@ async_copy_data(int frombio, struct bio *bio, struct page *page,
603 clen = len; 601 clen = len;
604 602
605 if (clen > 0) { 603 if (clen > 0) {
606 b_offset += bio_iovec_idx(bio, i)->bv_offset; 604 b_offset += bvl->bv_offset;
607 bio_page = bio_iovec_idx(bio, i)->bv_page; 605 bio_page = bvl->bv_page;
608 if (frombio) 606 if (frombio)
609 tx = async_memcpy(page, bio_page, page_offset, 607 tx = async_memcpy(page, bio_page, page_offset,
610 b_offset, clen, &submit); 608 b_offset, clen, &submit);
@@ -1031,6 +1029,8 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
1031 1029
1032 while (wbi && wbi->bi_sector < 1030 while (wbi && wbi->bi_sector <
1033 dev->sector + STRIPE_SECTORS) { 1031 dev->sector + STRIPE_SECTORS) {
1032 if (wbi->bi_rw & REQ_FUA)
1033 set_bit(R5_WantFUA, &dev->flags);
1034 tx = async_copy_data(1, wbi, dev->page, 1034 tx = async_copy_data(1, wbi, dev->page,
1035 dev->sector, tx); 1035 dev->sector, tx);
1036 wbi = r5_next_bio(wbi, dev->sector); 1036 wbi = r5_next_bio(wbi, dev->sector);
@@ -1048,15 +1048,22 @@ static void ops_complete_reconstruct(void *stripe_head_ref)
1048 int pd_idx = sh->pd_idx; 1048 int pd_idx = sh->pd_idx;
1049 int qd_idx = sh->qd_idx; 1049 int qd_idx = sh->qd_idx;
1050 int i; 1050 int i;
1051 bool fua = false;
1051 1052
1052 pr_debug("%s: stripe %llu\n", __func__, 1053 pr_debug("%s: stripe %llu\n", __func__,
1053 (unsigned long long)sh->sector); 1054 (unsigned long long)sh->sector);
1054 1055
1056 for (i = disks; i--; )
1057 fua |= test_bit(R5_WantFUA, &sh->dev[i].flags);
1058
1055 for (i = disks; i--; ) { 1059 for (i = disks; i--; ) {
1056 struct r5dev *dev = &sh->dev[i]; 1060 struct r5dev *dev = &sh->dev[i];
1057 1061
1058 if (dev->written || i == pd_idx || i == qd_idx) 1062 if (dev->written || i == pd_idx || i == qd_idx) {
1059 set_bit(R5_UPTODATE, &dev->flags); 1063 set_bit(R5_UPTODATE, &dev->flags);
1064 if (fua)
1065 set_bit(R5_WantFUA, &dev->flags);
1066 }
1060 } 1067 }
1061 1068
1062 if (sh->reconstruct_state == reconstruct_state_drain_run) 1069 if (sh->reconstruct_state == reconstruct_state_drain_run)
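
The R5_WantFUA handling added in this and the two preceding hunks propagates REQ_FUA from the bios drained into a stripe to every block the stripe eventually writes, parity included; otherwise the parity update could sit in a volatile cache while the data it protects is already durable. The per-stripe test, as a standalone sketch with a hypothetical helper name:

/* Sketch: did any bio drained into this stripe ask for FUA?  If so, all of
 * the stripe's writes (data and parity) should go out as WRITE_FUA. */
static bool stripe_wants_fua(struct stripe_head *sh, int disks)
{
        int i;

        for (i = disks; i--; )
                if (test_bit(R5_WantFUA, &sh->dev[i].flags))
                        return true;
        return false;
}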
@@ -1461,8 +1468,7 @@ static int resize_stripes(raid5_conf_t *conf, int newsize)
1461 wait_event_lock_irq(conf->wait_for_stripe, 1468 wait_event_lock_irq(conf->wait_for_stripe,
1462 !list_empty(&conf->inactive_list), 1469 !list_empty(&conf->inactive_list),
1463 conf->device_lock, 1470 conf->device_lock,
1464 unplug_slaves(conf->mddev) 1471 );
1465 );
1466 osh = get_free_stripe(conf); 1472 osh = get_free_stripe(conf);
1467 spin_unlock_irq(&conf->device_lock); 1473 spin_unlock_irq(&conf->device_lock);
1468 atomic_set(&nsh->count, 1); 1474 atomic_set(&nsh->count, 1);
@@ -1694,28 +1700,25 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
1694 raid5_conf_t *conf = mddev->private; 1700 raid5_conf_t *conf = mddev->private;
1695 pr_debug("raid456: error called\n"); 1701 pr_debug("raid456: error called\n");
1696 1702
1697 if (!test_bit(Faulty, &rdev->flags)) { 1703 if (test_and_clear_bit(In_sync, &rdev->flags)) {
1698 set_bit(MD_CHANGE_DEVS, &mddev->flags); 1704 unsigned long flags;
1699 if (test_and_clear_bit(In_sync, &rdev->flags)) { 1705 spin_lock_irqsave(&conf->device_lock, flags);
1700 unsigned long flags; 1706 mddev->degraded++;
1701 spin_lock_irqsave(&conf->device_lock, flags); 1707 spin_unlock_irqrestore(&conf->device_lock, flags);
1702 mddev->degraded++; 1708 /*
1703 spin_unlock_irqrestore(&conf->device_lock, flags); 1709 * if recovery was running, make sure it aborts.
1704 /* 1710 */
1705 * if recovery was running, make sure it aborts. 1711 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
1706 */
1707 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
1708 }
1709 set_bit(Faulty, &rdev->flags);
1710 printk(KERN_ALERT
1711 "md/raid:%s: Disk failure on %s, disabling device.\n"
1712 KERN_ALERT
1713 "md/raid:%s: Operation continuing on %d devices.\n",
1714 mdname(mddev),
1715 bdevname(rdev->bdev, b),
1716 mdname(mddev),
1717 conf->raid_disks - mddev->degraded);
1718 } 1712 }
1713 set_bit(Faulty, &rdev->flags);
1714 set_bit(MD_CHANGE_DEVS, &mddev->flags);
1715 printk(KERN_ALERT
1716 "md/raid:%s: Disk failure on %s, disabling device.\n"
1717 "md/raid:%s: Operation continuing on %d devices.\n",
1718 mdname(mddev),
1719 bdevname(rdev->bdev, b),
1720 mdname(mddev),
1721 conf->raid_disks - mddev->degraded);
1719} 1722}
1720 1723
1721/* 1724/*
@@ -3281,7 +3284,7 @@ static void handle_stripe5(struct stripe_head *sh)
3281 3284
3282 if (dec_preread_active) { 3285 if (dec_preread_active) {
3283 /* We delay this until after ops_run_io so that if make_request 3286 /* We delay this until after ops_run_io so that if make_request
3284 * is waiting on a barrier, it won't continue until the writes 3287 * is waiting on a flush, it won't continue until the writes
3285 * have actually been submitted. 3288 * have actually been submitted.
3286 */ 3289 */
3287 atomic_dec(&conf->preread_active_stripes); 3290 atomic_dec(&conf->preread_active_stripes);
@@ -3583,7 +3586,7 @@ static void handle_stripe6(struct stripe_head *sh)
3583 3586
3584 if (dec_preread_active) { 3587 if (dec_preread_active) {
3585 /* We delay this until after ops_run_io so that if make_request 3588 /* We delay this until after ops_run_io so that if make_request
3586 * is waiting on a barrier, it won't continue until the writes 3589 * is waiting on a flush, it won't continue until the writes
3587 * have actually been submitted. 3590 * have actually been submitted.
3588 */ 3591 */
3589 atomic_dec(&conf->preread_active_stripes); 3592 atomic_dec(&conf->preread_active_stripes);
@@ -3616,8 +3619,7 @@ static void raid5_activate_delayed(raid5_conf_t *conf)
3616 atomic_inc(&conf->preread_active_stripes); 3619 atomic_inc(&conf->preread_active_stripes);
3617 list_add_tail(&sh->lru, &conf->hold_list); 3620 list_add_tail(&sh->lru, &conf->hold_list);
3618 } 3621 }
3619 } else 3622 }
3620 plugger_set_plug(&conf->plug);
3621} 3623}
3622 3624
3623static void activate_bit_delay(raid5_conf_t *conf) 3625static void activate_bit_delay(raid5_conf_t *conf)
@@ -3634,60 +3636,6 @@ static void activate_bit_delay(raid5_conf_t *conf)
3634 } 3636 }
3635} 3637}
3636 3638
3637static void unplug_slaves(mddev_t *mddev)
3638{
3639 raid5_conf_t *conf = mddev->private;
3640 int i;
3641 int devs = max(conf->raid_disks, conf->previous_raid_disks);
3642
3643 rcu_read_lock();
3644 for (i = 0; i < devs; i++) {
3645 mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev);
3646 if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) {
3647 struct request_queue *r_queue = bdev_get_queue(rdev->bdev);
3648
3649 atomic_inc(&rdev->nr_pending);
3650 rcu_read_unlock();
3651
3652 blk_unplug(r_queue);
3653
3654 rdev_dec_pending(rdev, mddev);
3655 rcu_read_lock();
3656 }
3657 }
3658 rcu_read_unlock();
3659}
3660
3661void md_raid5_unplug_device(raid5_conf_t *conf)
3662{
3663 unsigned long flags;
3664
3665 spin_lock_irqsave(&conf->device_lock, flags);
3666
3667 if (plugger_remove_plug(&conf->plug)) {
3668 conf->seq_flush++;
3669 raid5_activate_delayed(conf);
3670 }
3671 md_wakeup_thread(conf->mddev->thread);
3672
3673 spin_unlock_irqrestore(&conf->device_lock, flags);
3674
3675 unplug_slaves(conf->mddev);
3676}
3677EXPORT_SYMBOL_GPL(md_raid5_unplug_device);
3678
3679static void raid5_unplug(struct plug_handle *plug)
3680{
3681 raid5_conf_t *conf = container_of(plug, raid5_conf_t, plug);
3682 md_raid5_unplug_device(conf);
3683}
3684
3685static void raid5_unplug_queue(struct request_queue *q)
3686{
3687 mddev_t *mddev = q->queuedata;
3688 md_raid5_unplug_device(mddev->private);
3689}
3690
3691int md_raid5_congested(mddev_t *mddev, int bits) 3639int md_raid5_congested(mddev_t *mddev, int bits)
3692{ 3640{
3693 raid5_conf_t *conf = mddev->private; 3641 raid5_conf_t *conf = mddev->private;
@@ -3864,9 +3812,9 @@ static int chunk_aligned_read(mddev_t *mddev, struct bio * raid_bio)
3864 return 0; 3812 return 0;
3865 } 3813 }
3866 /* 3814 /*
3867 * use bio_clone to make a copy of the bio 3815 * use bio_clone_mddev to make a copy of the bio
3868 */ 3816 */
3869 align_bi = bio_clone(raid_bio, GFP_NOIO); 3817 align_bi = bio_clone_mddev(raid_bio, GFP_NOIO, mddev);
3870 if (!align_bi) 3818 if (!align_bi)
3871 return 0; 3819 return 0;
3872 /* 3820 /*
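bio_clone() is replaced here by bio_clone_mddev(), the md-local helper this series exports from md.c, so the clone is allocated from the array's own bioset rather than the shared global pool. A short hedged sketch of the call shape, with an invented wrapper name:

        /* Sketch: cloning a bio for the device-aligned read fast path.
         * bio_clone_mddev(bio, gfp, mddev) is the helper the hunk above calls;
         * the wrapper name is made up for illustration.
         */
        static struct bio *clone_for_aligned_read(mddev_t *mddev, struct bio *raid_bio)
        {
                struct bio *align_bi = bio_clone_mddev(raid_bio, GFP_NOIO, mddev);

                if (!align_bi)
                        return NULL;    /* caller falls back to the stripe path */

                return align_bi;
        }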
@@ -3977,15 +3925,10 @@ static int make_request(mddev_t *mddev, struct bio * bi)
3977 struct stripe_head *sh; 3925 struct stripe_head *sh;
3978 const int rw = bio_data_dir(bi); 3926 const int rw = bio_data_dir(bi);
3979 int remaining; 3927 int remaining;
3928 int plugged;
3980 3929
3981 if (unlikely(bi->bi_rw & REQ_HARDBARRIER)) { 3930 if (unlikely(bi->bi_rw & REQ_FLUSH)) {
3982 /* Drain all pending writes. We only really need 3931 md_flush_request(mddev, bi);
3983 * to ensure they have been submitted, but this is
3984 * easier.
3985 */
3986 mddev->pers->quiesce(mddev, 1);
3987 mddev->pers->quiesce(mddev, 0);
3988 md_barrier_request(mddev, bi);
3989 return 0; 3932 return 0;
3990 } 3933 }
3991 3934
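With REQ_HARDBARRIER gone from the block layer, make_request() no longer quiesces the whole array for an empty barrier; a REQ_FLUSH bio is handed straight to md_flush_request(), which in the md core of this series issues flushes to the member devices and resubmits any data payload of the bio afterwards. A minimal sketch of the new entry check, everything past the early return elided:

        /* Sketch of the new flush handling at the top of make_request(). */
        static int make_request_flush_sketch(mddev_t *mddev, struct bio *bi)
        {
                if (unlikely(bi->bi_rw & REQ_FLUSH)) {
                        /* Hand the flush to the md core instead of quiescing
                         * the whole array as the REQ_HARDBARRIER path did.
                         */
                        md_flush_request(mddev, bi);
                        return 0;
                }

                /* ... normal stripe submission ... */
                return 0;
        }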
@@ -4001,6 +3944,7 @@ static int make_request(mddev_t *mddev, struct bio * bi)
4001 bi->bi_next = NULL; 3944 bi->bi_next = NULL;
4002 bi->bi_phys_segments = 1; /* over-loaded to count active stripes */ 3945 bi->bi_phys_segments = 1; /* over-loaded to count active stripes */
4003 3946
3947 plugged = mddev_check_plugged(mddev);
4004 for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) { 3948 for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
4005 DEFINE_WAIT(w); 3949 DEFINE_WAIT(w);
4006 int disks, data_disks; 3950 int disks, data_disks;
@@ -4014,7 +3958,7 @@ static int make_request(mddev_t *mddev, struct bio * bi)
4014 /* spinlock is needed as reshape_progress may be 3958 /* spinlock is needed as reshape_progress may be
4015 * 64bit on a 32bit platform, and so it might be 3959 * 64bit on a 32bit platform, and so it might be
4016 * possible to see a half-updated value 3960 * possible to see a half-updated value
4017 * Ofcourse reshape_progress could change after 3961 * Of course reshape_progress could change after
4018 * the lock is dropped, so once we get a reference 3962 * the lock is dropped, so once we get a reference
4019 * to the stripe that we think it is, we will have 3963 * to the stripe that we think it is, we will have
4020 * to check again. 3964 * to check again.
@@ -4095,7 +4039,7 @@ static int make_request(mddev_t *mddev, struct bio * bi)
4095 * add failed due to overlap. Flush everything 4039 * add failed due to overlap. Flush everything
4096 * and wait a while 4040 * and wait a while
4097 */ 4041 */
4098 md_raid5_unplug_device(conf); 4042 md_wakeup_thread(mddev->thread);
4099 release_stripe(sh); 4043 release_stripe(sh);
4100 schedule(); 4044 schedule();
4101 goto retry; 4045 goto retry;
@@ -4103,7 +4047,7 @@ static int make_request(mddev_t *mddev, struct bio * bi)
4103 finish_wait(&conf->wait_for_overlap, &w); 4047 finish_wait(&conf->wait_for_overlap, &w);
4104 set_bit(STRIPE_HANDLE, &sh->state); 4048 set_bit(STRIPE_HANDLE, &sh->state);
4105 clear_bit(STRIPE_DELAYED, &sh->state); 4049 clear_bit(STRIPE_DELAYED, &sh->state);
4106 if (mddev->barrier && 4050 if ((bi->bi_rw & REQ_SYNC) &&
4107 !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 4051 !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
4108 atomic_inc(&conf->preread_active_stripes); 4052 atomic_inc(&conf->preread_active_stripes);
4109 release_stripe(sh); 4053 release_stripe(sh);
@@ -4115,6 +4059,9 @@ static int make_request(mddev_t *mddev, struct bio * bi)
4115 } 4059 }
4116 4060
4117 } 4061 }
4062 if (!plugged)
4063 md_wakeup_thread(mddev->thread);
4064
4118 spin_lock_irq(&conf->device_lock); 4065 spin_lock_irq(&conf->device_lock);
4119 remaining = raid5_dec_bi_phys_segments(bi); 4066 remaining = raid5_dec_bi_phys_segments(bi);
4120 spin_unlock_irq(&conf->device_lock); 4067 spin_unlock_irq(&conf->device_lock);
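The two make_request() hunks above cooperate: mddev_check_plugged() records at entry whether the submitting task already has a block-layer plug active, and once the bio has been split into stripes the raid5d thread is woken explicitly only when no plug was in effect, since then no later unplug event will do it. A hedged sketch of that shape, assuming (as the hunks use it) that mddev_check_plugged() returns non-zero when a plug is active:

        /* Sketch: wake raid5d directly only when the submitter was not plugged. */
        static int make_request_plug_sketch(mddev_t *mddev, struct bio *bi)
        {
                int plugged = mddev_check_plugged(mddev);

                /* ... queue each STRIPE_SECTORS chunk of the bio ... */

                if (!plugged)
                        /* No unplug callback is coming, so kick the thread now. */
                        md_wakeup_thread(mddev->thread);

                return 0;
        }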
@@ -4126,13 +4073,6 @@ static int make_request(mddev_t *mddev, struct bio * bi)
4126 bio_endio(bi, 0); 4073 bio_endio(bi, 0);
4127 } 4074 }
4128 4075
4129 if (mddev->barrier) {
4130 /* We need to wait for the stripes to all be handled.
4131 * So: wait for preread_active_stripes to drop to 0.
4132 */
4133 wait_event(mddev->thread->wqueue,
4134 atomic_read(&conf->preread_active_stripes) == 0);
4135 }
4136 return 0; 4076 return 0;
4137} 4077}
4138 4078
@@ -4238,7 +4178,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
4238 wait_event(conf->wait_for_overlap, 4178 wait_event(conf->wait_for_overlap,
4239 atomic_read(&conf->reshape_stripes)==0); 4179 atomic_read(&conf->reshape_stripes)==0);
4240 mddev->reshape_position = conf->reshape_progress; 4180 mddev->reshape_position = conf->reshape_progress;
4241 mddev->curr_resync_completed = mddev->curr_resync; 4181 mddev->curr_resync_completed = sector_nr;
4242 conf->reshape_checkpoint = jiffies; 4182 conf->reshape_checkpoint = jiffies;
4243 set_bit(MD_CHANGE_DEVS, &mddev->flags); 4183 set_bit(MD_CHANGE_DEVS, &mddev->flags);
4244 md_wakeup_thread(mddev->thread); 4184 md_wakeup_thread(mddev->thread);
@@ -4339,7 +4279,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
4339 wait_event(conf->wait_for_overlap, 4279 wait_event(conf->wait_for_overlap,
4340 atomic_read(&conf->reshape_stripes) == 0); 4280 atomic_read(&conf->reshape_stripes) == 0);
4341 mddev->reshape_position = conf->reshape_progress; 4281 mddev->reshape_position = conf->reshape_progress;
4342 mddev->curr_resync_completed = mddev->curr_resync + reshape_sectors; 4282 mddev->curr_resync_completed = sector_nr;
4343 conf->reshape_checkpoint = jiffies; 4283 conf->reshape_checkpoint = jiffies;
4344 set_bit(MD_CHANGE_DEVS, &mddev->flags); 4284 set_bit(MD_CHANGE_DEVS, &mddev->flags);
4345 md_wakeup_thread(mddev->thread); 4285 md_wakeup_thread(mddev->thread);
@@ -4361,13 +4301,12 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski
4361 raid5_conf_t *conf = mddev->private; 4301 raid5_conf_t *conf = mddev->private;
4362 struct stripe_head *sh; 4302 struct stripe_head *sh;
4363 sector_t max_sector = mddev->dev_sectors; 4303 sector_t max_sector = mddev->dev_sectors;
4364 int sync_blocks; 4304 sector_t sync_blocks;
4365 int still_degraded = 0; 4305 int still_degraded = 0;
4366 int i; 4306 int i;
4367 4307
4368 if (sector_nr >= max_sector) { 4308 if (sector_nr >= max_sector) {
4369 /* just being told to finish up .. nothing much to do */ 4309 /* just being told to finish up .. nothing much to do */
4370 unplug_slaves(mddev);
4371 4310
4372 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) { 4311 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
4373 end_reshape(conf); 4312 end_reshape(conf);
@@ -4524,24 +4463,30 @@ static void raid5d(mddev_t *mddev)
4524 struct stripe_head *sh; 4463 struct stripe_head *sh;
4525 raid5_conf_t *conf = mddev->private; 4464 raid5_conf_t *conf = mddev->private;
4526 int handled; 4465 int handled;
4466 struct blk_plug plug;
4527 4467
4528 pr_debug("+++ raid5d active\n"); 4468 pr_debug("+++ raid5d active\n");
4529 4469
4530 md_check_recovery(mddev); 4470 md_check_recovery(mddev);
4531 4471
4472 blk_start_plug(&plug);
4532 handled = 0; 4473 handled = 0;
4533 spin_lock_irq(&conf->device_lock); 4474 spin_lock_irq(&conf->device_lock);
4534 while (1) { 4475 while (1) {
4535 struct bio *bio; 4476 struct bio *bio;
4536 4477
4537 if (conf->seq_flush != conf->seq_write) { 4478 if (atomic_read(&mddev->plug_cnt) == 0 &&
4538 int seq = conf->seq_flush; 4479 !list_empty(&conf->bitmap_list)) {
4480 /* Now is a good time to flush some bitmap updates */
4481 conf->seq_flush++;
4539 spin_unlock_irq(&conf->device_lock); 4482 spin_unlock_irq(&conf->device_lock);
4540 bitmap_unplug(mddev->bitmap); 4483 bitmap_unplug(mddev->bitmap);
4541 spin_lock_irq(&conf->device_lock); 4484 spin_lock_irq(&conf->device_lock);
4542 conf->seq_write = seq; 4485 conf->seq_write = conf->seq_flush;
4543 activate_bit_delay(conf); 4486 activate_bit_delay(conf);
4544 } 4487 }
4488 if (atomic_read(&mddev->plug_cnt) == 0)
4489 raid5_activate_delayed(conf);
4545 4490
4546 while ((bio = remove_bio_from_retry(conf))) { 4491 while ((bio = remove_bio_from_retry(conf))) {
4547 int ok; 4492 int ok;
@@ -4571,7 +4516,7 @@ static void raid5d(mddev_t *mddev)
4571 spin_unlock_irq(&conf->device_lock); 4516 spin_unlock_irq(&conf->device_lock);
4572 4517
4573 async_tx_issue_pending_all(); 4518 async_tx_issue_pending_all();
4574 unplug_slaves(mddev); 4519 blk_finish_plug(&plug);
4575 4520
4576 pr_debug("--- raid5d inactive\n"); 4521 pr_debug("--- raid5d inactive\n");
4577} 4522}
@@ -4913,7 +4858,7 @@ static raid5_conf_t *setup_conf(mddev_t *mddev)
4913 printk(KERN_INFO "md/raid:%s: device %s operational as raid" 4858 printk(KERN_INFO "md/raid:%s: device %s operational as raid"
4914 " disk %d\n", 4859 " disk %d\n",
4915 mdname(mddev), bdevname(rdev->bdev, b), raid_disk); 4860 mdname(mddev), bdevname(rdev->bdev, b), raid_disk);
4916 } else 4861 } else if (rdev->saved_raid_disk != raid_disk)
4917 /* Cannot rely on bitmap to complete recovery */ 4862 /* Cannot rely on bitmap to complete recovery */
4918 conf->fullsync = 1; 4863 conf->fullsync = 1;
4919 } 4864 }
@@ -5188,8 +5133,6 @@ static int run(mddev_t *mddev)
5188 mdname(mddev)); 5133 mdname(mddev));
5189 md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); 5134 md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));
5190 5135
5191 plugger_init(&conf->plug, raid5_unplug);
5192 mddev->plug = &conf->plug;
5193 if (mddev->queue) { 5136 if (mddev->queue) {
5194 int chunk_size; 5137 int chunk_size;
5195 /* read-ahead size must cover two whole stripes, which 5138 /* read-ahead size must cover two whole stripes, which
@@ -5206,8 +5149,6 @@ static int run(mddev_t *mddev)
5206 5149
5207 mddev->queue->backing_dev_info.congested_data = mddev; 5150 mddev->queue->backing_dev_info.congested_data = mddev;
5208 mddev->queue->backing_dev_info.congested_fn = raid5_congested; 5151 mddev->queue->backing_dev_info.congested_fn = raid5_congested;
5209 mddev->queue->queue_lock = &conf->device_lock;
5210 mddev->queue->unplug_fn = raid5_unplug_queue;
5211 5152
5212 chunk_size = mddev->chunk_sectors << 9; 5153 chunk_size = mddev->chunk_sectors << 9;
5213 blk_queue_io_min(mddev->queue, chunk_size); 5154 blk_queue_io_min(mddev->queue, chunk_size);
@@ -5240,7 +5181,6 @@ static int stop(mddev_t *mddev)
5240 mddev->thread = NULL; 5181 mddev->thread = NULL;
5241 if (mddev->queue) 5182 if (mddev->queue)
5242 mddev->queue->backing_dev_info.congested_fn = NULL; 5183 mddev->queue->backing_dev_info.congested_fn = NULL;
5243 plugger_flush(&conf->plug); /* the unplug fn references 'conf'*/
5244 free_conf(conf); 5184 free_conf(conf);
5245 mddev->private = NULL; 5185 mddev->private = NULL;
5246 mddev->to_remove = &raid5_attrs_group; 5186 mddev->to_remove = &raid5_attrs_group;
@@ -5340,7 +5280,7 @@ static int raid5_spare_active(mddev_t *mddev)
5340 && !test_bit(Faulty, &tmp->rdev->flags) 5280 && !test_bit(Faulty, &tmp->rdev->flags)
5341 && !test_and_set_bit(In_sync, &tmp->rdev->flags)) { 5281 && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
5342 count++; 5282 count++;
5343 sysfs_notify_dirent(tmp->rdev->sysfs_state); 5283 sysfs_notify_dirent_safe(tmp->rdev->sysfs_state);
5344 } 5284 }
5345 } 5285 }
5346 spin_lock_irqsave(&conf->device_lock, flags); 5286 spin_lock_irqsave(&conf->device_lock, flags);
@@ -5449,7 +5389,8 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors)
5449 return -EINVAL; 5389 return -EINVAL;
5450 set_capacity(mddev->gendisk, mddev->array_sectors); 5390 set_capacity(mddev->gendisk, mddev->array_sectors);
5451 revalidate_disk(mddev->gendisk); 5391 revalidate_disk(mddev->gendisk);
5452 if (sectors > mddev->dev_sectors && mddev->recovery_cp == MaxSector) { 5392 if (sectors > mddev->dev_sectors &&
5393 mddev->recovery_cp > mddev->dev_sectors) {
5453 mddev->recovery_cp = mddev->dev_sectors; 5394 mddev->recovery_cp = mddev->dev_sectors;
5454 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5395 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5455 } 5396 }
@@ -5519,7 +5460,6 @@ static int raid5_start_reshape(mddev_t *mddev)
5519 raid5_conf_t *conf = mddev->private; 5460 raid5_conf_t *conf = mddev->private;
5520 mdk_rdev_t *rdev; 5461 mdk_rdev_t *rdev;
5521 int spares = 0; 5462 int spares = 0;
5522 int added_devices = 0;
5523 unsigned long flags; 5463 unsigned long flags;
5524 5464
5525 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 5465 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
@@ -5529,8 +5469,8 @@ static int raid5_start_reshape(mddev_t *mddev)
5529 return -ENOSPC; 5469 return -ENOSPC;
5530 5470
5531 list_for_each_entry(rdev, &mddev->disks, same_set) 5471 list_for_each_entry(rdev, &mddev->disks, same_set)
5532 if (rdev->raid_disk < 0 && 5472 if (!test_bit(In_sync, &rdev->flags)
5533 !test_bit(Faulty, &rdev->flags)) 5473 && !test_bit(Faulty, &rdev->flags))
5534 spares++; 5474 spares++;
5535 5475
5536 if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded) 5476 if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded)
@@ -5573,29 +5513,35 @@ static int raid5_start_reshape(mddev_t *mddev)
5573 * to correctly record the "partially reconstructed" state of 5513 * to correctly record the "partially reconstructed" state of
5574 * such devices during the reshape and confusion could result. 5514 * such devices during the reshape and confusion could result.
5575 */ 5515 */
5576 if (mddev->delta_disks >= 0) 5516 if (mddev->delta_disks >= 0) {
5577 list_for_each_entry(rdev, &mddev->disks, same_set) 5517 int added_devices = 0;
5578 if (rdev->raid_disk < 0 && 5518 list_for_each_entry(rdev, &mddev->disks, same_set)
5579 !test_bit(Faulty, &rdev->flags)) { 5519 if (rdev->raid_disk < 0 &&
5580 if (raid5_add_disk(mddev, rdev) == 0) { 5520 !test_bit(Faulty, &rdev->flags)) {
5581 char nm[20]; 5521 if (raid5_add_disk(mddev, rdev) == 0) {
5582 if (rdev->raid_disk >= conf->previous_raid_disks) { 5522 char nm[20];
5583 set_bit(In_sync, &rdev->flags); 5523 if (rdev->raid_disk
5584 added_devices++; 5524 >= conf->previous_raid_disks) {
5585 } else 5525 set_bit(In_sync, &rdev->flags);
5586 rdev->recovery_offset = 0; 5526 added_devices++;
5587 sprintf(nm, "rd%d", rdev->raid_disk); 5527 } else
5588 if (sysfs_create_link(&mddev->kobj, 5528 rdev->recovery_offset = 0;
5589 &rdev->kobj, nm)) 5529 sprintf(nm, "rd%d", rdev->raid_disk);
5590 /* Failure here is OK */; 5530 if (sysfs_create_link(&mddev->kobj,
5591 } else 5531 &rdev->kobj, nm))
5592 break; 5532 /* Failure here is OK */;
5593 } 5533 }
5534 } else if (rdev->raid_disk >= conf->previous_raid_disks
5535 && !test_bit(Faulty, &rdev->flags)) {
5536 /* This is a spare that was manually added */
5537 set_bit(In_sync, &rdev->flags);
5538 added_devices++;
5539 }
5594 5540
5595 /* When a reshape changes the number of devices, ->degraded 5541 /* When a reshape changes the number of devices,
5596 * is measured against the larger of the pre and post number of 5542 * ->degraded is measured against the larger of the
5597 * devices.*/ 5543 * pre and post number of devices.
5598 if (mddev->delta_disks > 0) { 5544 */
5599 spin_lock_irqsave(&conf->device_lock, flags); 5545 spin_lock_irqsave(&conf->device_lock, flags);
5600 mddev->degraded += (conf->raid_disks - conf->previous_raid_disks) 5546 mddev->degraded += (conf->raid_disks - conf->previous_raid_disks)
5601 - added_devices; 5547 - added_devices;
@@ -5731,6 +5677,7 @@ static void raid5_quiesce(mddev_t *mddev, int state)
5731static void *raid45_takeover_raid0(mddev_t *mddev, int level) 5677static void *raid45_takeover_raid0(mddev_t *mddev, int level)
5732{ 5678{
5733 struct raid0_private_data *raid0_priv = mddev->private; 5679 struct raid0_private_data *raid0_priv = mddev->private;
5680 sector_t sectors;
5734 5681
5735 /* for raid0 takeover only one zone is supported */ 5682 /* for raid0 takeover only one zone is supported */
5736 if (raid0_priv->nr_strip_zones > 1) { 5683 if (raid0_priv->nr_strip_zones > 1) {
@@ -5739,6 +5686,9 @@ static void *raid45_takeover_raid0(mddev_t *mddev, int level)
5739 return ERR_PTR(-EINVAL); 5686 return ERR_PTR(-EINVAL);
5740 } 5687 }
5741 5688
5689 sectors = raid0_priv->strip_zone[0].zone_end;
5690 sector_div(sectors, raid0_priv->strip_zone[0].nb_dev);
5691 mddev->dev_sectors = sectors;
5742 mddev->new_level = level; 5692 mddev->new_level = level;
5743 mddev->new_layout = ALGORITHM_PARITY_N; 5693 mddev->new_layout = ALGORITHM_PARITY_N;
5744 mddev->new_chunk_sectors = mddev->chunk_sectors; 5694 mddev->new_chunk_sectors = mddev->chunk_sectors;
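The takeover path now seeds mddev->dev_sectors before switching levels: for a single-zone raid0, strip_zone[0].zone_end is the zone's total size in sectors across all members, so dividing it by nb_dev yields the per-device size that raid4/5 will expect. sector_div() performs that division in place (sector_t by a 32-bit divisor) and returns the remainder. A small hedged sketch with an invented helper name:

        /* Sketch: per-device sectors of a single-zone raid0 array.
         * zone_end is the cumulative sector count of the zone over all
         * nb_dev member devices, as in the hunk above.
         */
        static sector_t raid0_zone_dev_sectors(sector_t zone_end, unsigned int nb_dev)
        {
                sector_t sectors = zone_end;

                sector_div(sectors, nb_dev);    /* in-place divide, remainder returned */
                return sectors;
        }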
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 36eaed5dfd6e..3ca77a2613ba 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -275,6 +275,7 @@ struct r6_state {
275 * filling 275 * filling
276 */ 276 */
277#define R5_Wantdrain 13 /* dev->towrite needs to be drained */ 277#define R5_Wantdrain 13 /* dev->towrite needs to be drained */
278#define R5_WantFUA 14 /* Write should be FUA */
278/* 279/*
279 * Write method 280 * Write method
280 */ 281 */
@@ -399,8 +400,6 @@ struct raid5_private_data {
399 * Cleared when a sync completes. 400 * Cleared when a sync completes.
400 */ 401 */
401 402
402 struct plug_handle plug;
403
404 /* per cpu variables */ 403 /* per cpu variables */
405 struct raid5_percpu { 404 struct raid5_percpu {
406 struct page *spare_page; /* Used when checking P/Q in raid6 */ 405 struct page *spare_page; /* Used when checking P/Q in raid6 */
@@ -502,6 +501,6 @@ static inline int algorithm_is_DDF(int layout)
502} 501}
503 502
504extern int md_raid5_congested(mddev_t *mddev, int bits); 503extern int md_raid5_congested(mddev_t *mddev, int bits);
505extern void md_raid5_unplug_device(raid5_conf_t *conf); 504extern void md_raid5_kick_device(raid5_conf_t *conf);
506extern int raid5_set_cache_size(mddev_t *mddev, int size); 505extern int raid5_set_cache_size(mddev_t *mddev, int size);
507#endif 506#endif