Diffstat (limited to 'drivers/md')
36 files changed, 3682 insertions, 2487 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index bf1a95e31559..8420129fc5ee 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -240,6 +240,30 @@ config DM_MIRROR | |||
240 | Allow volume managers to mirror logical volumes, also | 240 | Allow volume managers to mirror logical volumes, also |
241 | needed for live data migration tools such as 'pvmove'. | 241 | needed for live data migration tools such as 'pvmove'. |
242 | 242 | ||
243 | config DM_RAID | ||
244 | tristate "RAID 4/5/6 target (EXPERIMENTAL)" | ||
245 | depends on BLK_DEV_DM && EXPERIMENTAL | ||
246 | select MD_RAID456 | ||
247 | select BLK_DEV_MD | ||
248 | ---help--- | ||
249 | A dm target that supports RAID4, RAID5 and RAID6 mappings | ||
250 | |||
251 | A RAID-5 set of N drives with a capacity of C MB per drive provides | ||
252 | the capacity of C * (N - 1) MB, and protects against a failure | ||
253 | of a single drive. For a given sector (row) number, (N - 1) drives | ||
254 | contain data sectors, and one drive contains the parity protection. | ||
255 | For a RAID-4 set, the parity blocks are present on a single drive, | ||
256 | while a RAID-5 set distributes the parity across the drives in one | ||
257 | of the available parity distribution methods. | ||
258 | |||
259 | A RAID-6 set of N drives with a capacity of C MB per drive | ||
260 | provides the capacity of C * (N - 2) MB, and protects | ||
261 | against a failure of any two drives. For a given sector | ||
262 | (row) number, (N - 2) drives contain data sectors, and two | ||
263 | drives contains two independent redundancy syndromes. Like | ||
264 | RAID-5, RAID-6 distributes the syndromes across the drives | ||
265 | in one of the available parity distribution methods. | ||
266 | |||
243 | config DM_LOG_USERSPACE | 267 | config DM_LOG_USERSPACE |
244 | tristate "Mirror userspace logging (EXPERIMENTAL)" | 268 | tristate "Mirror userspace logging (EXPERIMENTAL)" |
245 | depends on DM_MIRROR && EXPERIMENTAL && NET | 269 | depends on DM_MIRROR && EXPERIMENTAL && NET |
@@ -303,4 +327,10 @@ config DM_UEVENT | |||
303 | ---help--- | 327 | ---help--- |
304 | Generate udev events for DM events. | 328 | Generate udev events for DM events. |
305 | 329 | ||
330 | config DM_FLAKEY | ||
331 | tristate "Flakey target (EXPERIMENTAL)" | ||
332 | depends on BLK_DEV_DM && EXPERIMENTAL | ||
333 | ---help--- | ||
334 | A target that intermittently fails I/O for debugging purposes. | ||
335 | |||
306 | endif # MD | 336 | endif # MD |
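
To make the capacity arithmetic in the DM_RAID help text above concrete, here is a small stand-alone illustration. The drive count and size are invented numbers and the snippet is editorial commentary, not part of the patch:

/* Illustration of the DM_RAID help-text arithmetic: N drives of C MB each. */
#include <stdio.h>

int main(void)
{
	unsigned long n = 6, c = 500;	/* assumed example: 6 drives, 500 MB each */

	/* RAID-4/5: one drive's worth of parity, survives one failure. */
	printf("raid4/5 usable: %lu MB\n", c * (n - 1));	/* 2500 MB */
	/* RAID-6: two redundancy syndromes, survives any two failures. */
	printf("raid6 usable:   %lu MB\n", c * (n - 2));	/* 2000 MB */
	return 0;
}
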
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 5e3aac41919d..448838b1f92a 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -29,6 +29,7 @@ obj-$(CONFIG_BLK_DEV_MD) += md-mod.o | |||
29 | obj-$(CONFIG_BLK_DEV_DM) += dm-mod.o | 29 | obj-$(CONFIG_BLK_DEV_DM) += dm-mod.o |
30 | obj-$(CONFIG_DM_CRYPT) += dm-crypt.o | 30 | obj-$(CONFIG_DM_CRYPT) += dm-crypt.o |
31 | obj-$(CONFIG_DM_DELAY) += dm-delay.o | 31 | obj-$(CONFIG_DM_DELAY) += dm-delay.o |
32 | obj-$(CONFIG_DM_FLAKEY) += dm-flakey.o | ||
32 | obj-$(CONFIG_DM_MULTIPATH) += dm-multipath.o dm-round-robin.o | 33 | obj-$(CONFIG_DM_MULTIPATH) += dm-multipath.o dm-round-robin.o |
33 | obj-$(CONFIG_DM_MULTIPATH_QL) += dm-queue-length.o | 34 | obj-$(CONFIG_DM_MULTIPATH_QL) += dm-queue-length.o |
34 | obj-$(CONFIG_DM_MULTIPATH_ST) += dm-service-time.o | 35 | obj-$(CONFIG_DM_MULTIPATH_ST) += dm-service-time.o |
@@ -36,6 +37,7 @@ obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o | |||
36 | obj-$(CONFIG_DM_MIRROR) += dm-mirror.o dm-log.o dm-region-hash.o | 37 | obj-$(CONFIG_DM_MIRROR) += dm-mirror.o dm-log.o dm-region-hash.o |
37 | obj-$(CONFIG_DM_LOG_USERSPACE) += dm-log-userspace.o | 38 | obj-$(CONFIG_DM_LOG_USERSPACE) += dm-log-userspace.o |
38 | obj-$(CONFIG_DM_ZERO) += dm-zero.o | 39 | obj-$(CONFIG_DM_ZERO) += dm-zero.o |
40 | obj-$(CONFIG_DM_RAID) += dm-raid.o | ||
39 | 41 | ||
40 | ifeq ($(CONFIG_DM_UEVENT),y) | 42 | ifeq ($(CONFIG_DM_UEVENT),y) |
41 | dm-mod-objs += dm-uevent.o | 43 | dm-mod-objs += dm-uevent.o |
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index e4fb58db5454..574b09afedd3 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -210,11 +210,11 @@ static struct page *read_sb_page(mddev_t *mddev, loff_t offset, | |||
210 | || test_bit(Faulty, &rdev->flags)) | 210 | || test_bit(Faulty, &rdev->flags)) |
211 | continue; | 211 | continue; |
212 | 212 | ||
213 | target = rdev->sb_start + offset + index * (PAGE_SIZE/512); | 213 | target = offset + index * (PAGE_SIZE/512); |
214 | 214 | ||
215 | if (sync_page_io(rdev->bdev, target, | 215 | if (sync_page_io(rdev, target, |
216 | roundup(size, bdev_logical_block_size(rdev->bdev)), | 216 | roundup(size, bdev_logical_block_size(rdev->bdev)), |
217 | page, READ)) { | 217 | page, READ, true)) { |
218 | page->index = index; | 218 | page->index = index; |
219 | attach_page_buffers(page, NULL); /* so that free_buffer will | 219 | attach_page_buffers(page, NULL); /* so that free_buffer will |
220 | * quietly no-op */ | 220 | * quietly no-op */ |
@@ -264,14 +264,18 @@ static mdk_rdev_t *next_active_rdev(mdk_rdev_t *rdev, mddev_t *mddev) | |||
264 | static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait) | 264 | static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait) |
265 | { | 265 | { |
266 | mdk_rdev_t *rdev = NULL; | 266 | mdk_rdev_t *rdev = NULL; |
267 | struct block_device *bdev; | ||
267 | mddev_t *mddev = bitmap->mddev; | 268 | mddev_t *mddev = bitmap->mddev; |
268 | 269 | ||
269 | while ((rdev = next_active_rdev(rdev, mddev)) != NULL) { | 270 | while ((rdev = next_active_rdev(rdev, mddev)) != NULL) { |
270 | int size = PAGE_SIZE; | 271 | int size = PAGE_SIZE; |
271 | loff_t offset = mddev->bitmap_info.offset; | 272 | loff_t offset = mddev->bitmap_info.offset; |
273 | |||
274 | bdev = (rdev->meta_bdev) ? rdev->meta_bdev : rdev->bdev; | ||
275 | |||
272 | if (page->index == bitmap->file_pages-1) | 276 | if (page->index == bitmap->file_pages-1) |
273 | size = roundup(bitmap->last_page_size, | 277 | size = roundup(bitmap->last_page_size, |
274 | bdev_logical_block_size(rdev->bdev)); | 278 | bdev_logical_block_size(bdev)); |
275 | /* Just make sure we aren't corrupting data or | 279 | /* Just make sure we aren't corrupting data or |
276 | * metadata | 280 | * metadata |
277 | */ | 281 | */ |
@@ -343,7 +347,7 @@ static void write_page(struct bitmap *bitmap, struct page *page, int wait) | |||
343 | atomic_inc(&bitmap->pending_writes); | 347 | atomic_inc(&bitmap->pending_writes); |
344 | set_buffer_locked(bh); | 348 | set_buffer_locked(bh); |
345 | set_buffer_mapped(bh); | 349 | set_buffer_mapped(bh); |
346 | submit_bh(WRITE, bh); | 350 | submit_bh(WRITE | REQ_SYNC, bh); |
347 | bh = bh->b_this_page; | 351 | bh = bh->b_this_page; |
348 | } | 352 | } |
349 | 353 | ||
@@ -489,11 +493,11 @@ void bitmap_update_sb(struct bitmap *bitmap) | |||
489 | spin_unlock_irqrestore(&bitmap->lock, flags); | 493 | spin_unlock_irqrestore(&bitmap->lock, flags); |
490 | sb = kmap_atomic(bitmap->sb_page, KM_USER0); | 494 | sb = kmap_atomic(bitmap->sb_page, KM_USER0); |
491 | sb->events = cpu_to_le64(bitmap->mddev->events); | 495 | sb->events = cpu_to_le64(bitmap->mddev->events); |
492 | if (bitmap->mddev->events < bitmap->events_cleared) { | 496 | if (bitmap->mddev->events < bitmap->events_cleared) |
493 | /* rocking back to read-only */ | 497 | /* rocking back to read-only */ |
494 | bitmap->events_cleared = bitmap->mddev->events; | 498 | bitmap->events_cleared = bitmap->mddev->events; |
495 | sb->events_cleared = cpu_to_le64(bitmap->events_cleared); | 499 | sb->events_cleared = cpu_to_le64(bitmap->events_cleared); |
496 | } | 500 | sb->state = cpu_to_le32(bitmap->flags); |
497 | /* Just in case these have been changed via sysfs: */ | 501 | /* Just in case these have been changed via sysfs: */ |
498 | sb->daemon_sleep = cpu_to_le32(bitmap->mddev->bitmap_info.daemon_sleep/HZ); | 502 | sb->daemon_sleep = cpu_to_le32(bitmap->mddev->bitmap_info.daemon_sleep/HZ); |
499 | sb->write_behind = cpu_to_le32(bitmap->mddev->bitmap_info.max_write_behind); | 503 | sb->write_behind = cpu_to_le32(bitmap->mddev->bitmap_info.max_write_behind); |
@@ -530,6 +534,82 @@ void bitmap_print_sb(struct bitmap *bitmap) | |||
530 | kunmap_atomic(sb, KM_USER0); | 534 | kunmap_atomic(sb, KM_USER0); |
531 | } | 535 | } |
532 | 536 | ||
537 | /* | ||
538 | * bitmap_new_disk_sb | ||
539 | * @bitmap | ||
540 | * | ||
541 | * This function is somewhat the reverse of bitmap_read_sb. bitmap_read_sb | ||
542 | * reads and verifies the on-disk bitmap superblock and populates bitmap_info. | ||
543 | * This function verifies 'bitmap_info' and populates the on-disk bitmap | ||
544 | * structure, which is to be written to disk. | ||
545 | * | ||
546 | * Returns: 0 on success, -Exxx on error | ||
547 | */ | ||
548 | static int bitmap_new_disk_sb(struct bitmap *bitmap) | ||
549 | { | ||
550 | bitmap_super_t *sb; | ||
551 | unsigned long chunksize, daemon_sleep, write_behind; | ||
552 | int err = -EINVAL; | ||
553 | |||
554 | bitmap->sb_page = alloc_page(GFP_KERNEL); | ||
555 | if (IS_ERR(bitmap->sb_page)) { | ||
556 | err = PTR_ERR(bitmap->sb_page); | ||
557 | bitmap->sb_page = NULL; | ||
558 | return err; | ||
559 | } | ||
560 | bitmap->sb_page->index = 0; | ||
561 | |||
562 | sb = kmap_atomic(bitmap->sb_page, KM_USER0); | ||
563 | |||
564 | sb->magic = cpu_to_le32(BITMAP_MAGIC); | ||
565 | sb->version = cpu_to_le32(BITMAP_MAJOR_HI); | ||
566 | |||
567 | chunksize = bitmap->mddev->bitmap_info.chunksize; | ||
568 | BUG_ON(!chunksize); | ||
569 | if (!is_power_of_2(chunksize)) { | ||
570 | kunmap_atomic(sb, KM_USER0); | ||
571 | printk(KERN_ERR "bitmap chunksize not a power of 2\n"); | ||
572 | return -EINVAL; | ||
573 | } | ||
574 | sb->chunksize = cpu_to_le32(chunksize); | ||
575 | |||
576 | daemon_sleep = bitmap->mddev->bitmap_info.daemon_sleep; | ||
577 | if (!daemon_sleep || | ||
578 | (daemon_sleep < 1) || (daemon_sleep > MAX_SCHEDULE_TIMEOUT)) { | ||
579 | printk(KERN_INFO "Choosing daemon_sleep default (5 sec)\n"); | ||
580 | daemon_sleep = 5 * HZ; | ||
581 | } | ||
582 | sb->daemon_sleep = cpu_to_le32(daemon_sleep); | ||
583 | bitmap->mddev->bitmap_info.daemon_sleep = daemon_sleep; | ||
584 | |||
585 | /* | ||
586 | * FIXME: write_behind for RAID1. If not specified, what | ||
587 | * is a good choice? We choose COUNTER_MAX / 2 arbitrarily. | ||
588 | */ | ||
589 | write_behind = bitmap->mddev->bitmap_info.max_write_behind; | ||
590 | if (write_behind > COUNTER_MAX) | ||
591 | write_behind = COUNTER_MAX / 2; | ||
592 | sb->write_behind = cpu_to_le32(write_behind); | ||
593 | bitmap->mddev->bitmap_info.max_write_behind = write_behind; | ||
594 | |||
595 | /* keep the array size field of the bitmap superblock up to date */ | ||
596 | sb->sync_size = cpu_to_le64(bitmap->mddev->resync_max_sectors); | ||
597 | |||
598 | memcpy(sb->uuid, bitmap->mddev->uuid, 16); | ||
599 | |||
600 | bitmap->flags |= BITMAP_STALE; | ||
601 | sb->state |= cpu_to_le32(BITMAP_STALE); | ||
602 | bitmap->events_cleared = bitmap->mddev->events; | ||
603 | sb->events_cleared = cpu_to_le64(bitmap->mddev->events); | ||
604 | |||
605 | bitmap->flags |= BITMAP_HOSTENDIAN; | ||
606 | sb->version = cpu_to_le32(BITMAP_MAJOR_HOSTENDIAN); | ||
607 | |||
608 | kunmap_atomic(sb, KM_USER0); | ||
609 | |||
610 | return 0; | ||
611 | } | ||
612 | |||
533 | /* read the superblock from the bitmap file and initialize some bitmap fields */ | 613 | /* read the superblock from the bitmap file and initialize some bitmap fields */ |
534 | static int bitmap_read_sb(struct bitmap *bitmap) | 614 | static int bitmap_read_sb(struct bitmap *bitmap) |
535 | { | 615 | { |
@@ -571,7 +651,7 @@ static int bitmap_read_sb(struct bitmap *bitmap) | |||
571 | reason = "unrecognized superblock version"; | 651 | reason = "unrecognized superblock version"; |
572 | else if (chunksize < 512) | 652 | else if (chunksize < 512) |
573 | reason = "bitmap chunksize too small"; | 653 | reason = "bitmap chunksize too small"; |
574 | else if ((1 << ffz(~chunksize)) != chunksize) | 654 | else if (!is_power_of_2(chunksize)) |
575 | reason = "bitmap chunksize not a power of 2"; | 655 | reason = "bitmap chunksize not a power of 2"; |
576 | else if (daemon_sleep < 1 || daemon_sleep > MAX_SCHEDULE_TIMEOUT) | 656 | else if (daemon_sleep < 1 || daemon_sleep > MAX_SCHEDULE_TIMEOUT) |
577 | reason = "daemon sleep period out of range"; | 657 | reason = "daemon sleep period out of range"; |
@@ -614,7 +694,7 @@ success: | |||
614 | if (le32_to_cpu(sb->version) == BITMAP_MAJOR_HOSTENDIAN) | 694 | if (le32_to_cpu(sb->version) == BITMAP_MAJOR_HOSTENDIAN) |
615 | bitmap->flags |= BITMAP_HOSTENDIAN; | 695 | bitmap->flags |= BITMAP_HOSTENDIAN; |
616 | bitmap->events_cleared = le64_to_cpu(sb->events_cleared); | 696 | bitmap->events_cleared = le64_to_cpu(sb->events_cleared); |
617 | if (sb->state & cpu_to_le32(BITMAP_STALE)) | 697 | if (bitmap->flags & BITMAP_STALE) |
618 | bitmap->events_cleared = bitmap->mddev->events; | 698 | bitmap->events_cleared = bitmap->mddev->events; |
619 | err = 0; | 699 | err = 0; |
620 | out: | 700 | out: |
@@ -648,9 +728,11 @@ static int bitmap_mask_state(struct bitmap *bitmap, enum bitmap_state bits, | |||
648 | switch (op) { | 728 | switch (op) { |
649 | case MASK_SET: | 729 | case MASK_SET: |
650 | sb->state |= cpu_to_le32(bits); | 730 | sb->state |= cpu_to_le32(bits); |
731 | bitmap->flags |= bits; | ||
651 | break; | 732 | break; |
652 | case MASK_UNSET: | 733 | case MASK_UNSET: |
653 | sb->state &= cpu_to_le32(~bits); | 734 | sb->state &= cpu_to_le32(~bits); |
735 | bitmap->flags &= ~bits; | ||
654 | break; | 736 | break; |
655 | default: | 737 | default: |
656 | BUG(); | 738 | BUG(); |
@@ -850,7 +932,7 @@ static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block) | |||
850 | if (bitmap->flags & BITMAP_HOSTENDIAN) | 932 | if (bitmap->flags & BITMAP_HOSTENDIAN) |
851 | set_bit(bit, kaddr); | 933 | set_bit(bit, kaddr); |
852 | else | 934 | else |
853 | ext2_set_bit(bit, kaddr); | 935 | __test_and_set_bit_le(bit, kaddr); |
854 | kunmap_atomic(kaddr, KM_USER0); | 936 | kunmap_atomic(kaddr, KM_USER0); |
855 | PRINTK("set file bit %lu page %lu\n", bit, page->index); | 937 | PRINTK("set file bit %lu page %lu\n", bit, page->index); |
856 | } | 938 | } |
@@ -1046,7 +1128,7 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) | |||
1046 | if (bitmap->flags & BITMAP_HOSTENDIAN) | 1128 | if (bitmap->flags & BITMAP_HOSTENDIAN) |
1047 | b = test_bit(bit, paddr); | 1129 | b = test_bit(bit, paddr); |
1048 | else | 1130 | else |
1049 | b = ext2_test_bit(bit, paddr); | 1131 | b = test_bit_le(bit, paddr); |
1050 | kunmap_atomic(paddr, KM_USER0); | 1132 | kunmap_atomic(paddr, KM_USER0); |
1051 | if (b) { | 1133 | if (b) { |
1052 | /* if the disk bit is set, set the memory bit */ | 1134 | /* if the disk bit is set, set the memory bit */ |
@@ -1070,8 +1152,8 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) | |||
1070 | } | 1152 | } |
1071 | 1153 | ||
1072 | printk(KERN_INFO "%s: bitmap initialized from disk: " | 1154 | printk(KERN_INFO "%s: bitmap initialized from disk: " |
1073 | "read %lu/%lu pages, set %lu bits\n", | 1155 | "read %lu/%lu pages, set %lu of %lu bits\n", |
1074 | bmname(bitmap), bitmap->file_pages, num_pages, bit_cnt); | 1156 | bmname(bitmap), bitmap->file_pages, num_pages, bit_cnt, chunks); |
1075 | 1157 | ||
1076 | return 0; | 1158 | return 0; |
1077 | 1159 | ||
@@ -1101,7 +1183,7 @@ static void bitmap_count_page(struct bitmap *bitmap, sector_t offset, int inc) | |||
1101 | bitmap_checkfree(bitmap, page); | 1183 | bitmap_checkfree(bitmap, page); |
1102 | } | 1184 | } |
1103 | static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap, | 1185 | static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap, |
1104 | sector_t offset, int *blocks, | 1186 | sector_t offset, sector_t *blocks, |
1105 | int create); | 1187 | int create); |
1106 | 1188 | ||
1107 | /* | 1189 | /* |
@@ -1115,7 +1197,7 @@ void bitmap_daemon_work(mddev_t *mddev) | |||
1115 | unsigned long j; | 1197 | unsigned long j; |
1116 | unsigned long flags; | 1198 | unsigned long flags; |
1117 | struct page *page = NULL, *lastpage = NULL; | 1199 | struct page *page = NULL, *lastpage = NULL; |
1118 | int blocks; | 1200 | sector_t blocks; |
1119 | void *paddr; | 1201 | void *paddr; |
1120 | struct dm_dirty_log *log = mddev->bitmap_info.log; | 1202 | struct dm_dirty_log *log = mddev->bitmap_info.log; |
1121 | 1203 | ||
@@ -1222,7 +1304,7 @@ void bitmap_daemon_work(mddev_t *mddev) | |||
1222 | clear_bit(file_page_offset(bitmap, j), | 1304 | clear_bit(file_page_offset(bitmap, j), |
1223 | paddr); | 1305 | paddr); |
1224 | else | 1306 | else |
1225 | ext2_clear_bit(file_page_offset(bitmap, j), | 1307 | __test_and_clear_bit_le(file_page_offset(bitmap, j), |
1226 | paddr); | 1308 | paddr); |
1227 | kunmap_atomic(paddr, KM_USER0); | 1309 | kunmap_atomic(paddr, KM_USER0); |
1228 | } else | 1310 | } else |
@@ -1258,7 +1340,7 @@ void bitmap_daemon_work(mddev_t *mddev) | |||
1258 | } | 1340 | } |
1259 | 1341 | ||
1260 | static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap, | 1342 | static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap, |
1261 | sector_t offset, int *blocks, | 1343 | sector_t offset, sector_t *blocks, |
1262 | int create) | 1344 | int create) |
1263 | __releases(bitmap->lock) | 1345 | __releases(bitmap->lock) |
1264 | __acquires(bitmap->lock) | 1346 | __acquires(bitmap->lock) |
@@ -1316,7 +1398,7 @@ int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sect | |||
1316 | } | 1398 | } |
1317 | 1399 | ||
1318 | while (sectors) { | 1400 | while (sectors) { |
1319 | int blocks; | 1401 | sector_t blocks; |
1320 | bitmap_counter_t *bmc; | 1402 | bitmap_counter_t *bmc; |
1321 | 1403 | ||
1322 | spin_lock_irq(&bitmap->lock); | 1404 | spin_lock_irq(&bitmap->lock); |
@@ -1326,7 +1408,7 @@ int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sect | |||
1326 | return 0; | 1408 | return 0; |
1327 | } | 1409 | } |
1328 | 1410 | ||
1329 | if (unlikely((*bmc & COUNTER_MAX) == COUNTER_MAX)) { | 1411 | if (unlikely(COUNTER(*bmc) == COUNTER_MAX)) { |
1330 | DEFINE_WAIT(__wait); | 1412 | DEFINE_WAIT(__wait); |
1331 | /* note that it is safe to do the prepare_to_wait | 1413 | /* note that it is safe to do the prepare_to_wait |
1332 | * after the test as long as we do it before dropping | 1414 | * after the test as long as we do it before dropping |
@@ -1335,8 +1417,7 @@ int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sect | |||
1335 | prepare_to_wait(&bitmap->overflow_wait, &__wait, | 1417 | prepare_to_wait(&bitmap->overflow_wait, &__wait, |
1336 | TASK_UNINTERRUPTIBLE); | 1418 | TASK_UNINTERRUPTIBLE); |
1337 | spin_unlock_irq(&bitmap->lock); | 1419 | spin_unlock_irq(&bitmap->lock); |
1338 | md_unplug(bitmap->mddev); | 1420 | io_schedule(); |
1339 | schedule(); | ||
1340 | finish_wait(&bitmap->overflow_wait, &__wait); | 1421 | finish_wait(&bitmap->overflow_wait, &__wait); |
1341 | continue; | 1422 | continue; |
1342 | } | 1423 | } |
@@ -1381,7 +1462,7 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto | |||
1381 | success = 0; | 1462 | success = 0; |
1382 | 1463 | ||
1383 | while (sectors) { | 1464 | while (sectors) { |
1384 | int blocks; | 1465 | sector_t blocks; |
1385 | unsigned long flags; | 1466 | unsigned long flags; |
1386 | bitmap_counter_t *bmc; | 1467 | bitmap_counter_t *bmc; |
1387 | 1468 | ||
@@ -1399,10 +1480,10 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto | |||
1399 | sysfs_notify_dirent_safe(bitmap->sysfs_can_clear); | 1480 | sysfs_notify_dirent_safe(bitmap->sysfs_can_clear); |
1400 | } | 1481 | } |
1401 | 1482 | ||
1402 | if (!success && ! (*bmc & NEEDED_MASK)) | 1483 | if (!success && !NEEDED(*bmc)) |
1403 | *bmc |= NEEDED_MASK; | 1484 | *bmc |= NEEDED_MASK; |
1404 | 1485 | ||
1405 | if ((*bmc & COUNTER_MAX) == COUNTER_MAX) | 1486 | if (COUNTER(*bmc) == COUNTER_MAX) |
1406 | wake_up(&bitmap->overflow_wait); | 1487 | wake_up(&bitmap->overflow_wait); |
1407 | 1488 | ||
1408 | (*bmc)--; | 1489 | (*bmc)--; |
@@ -1423,7 +1504,7 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto | |||
1423 | } | 1504 | } |
1424 | EXPORT_SYMBOL(bitmap_endwrite); | 1505 | EXPORT_SYMBOL(bitmap_endwrite); |
1425 | 1506 | ||
1426 | static int __bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks, | 1507 | static int __bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, |
1427 | int degraded) | 1508 | int degraded) |
1428 | { | 1509 | { |
1429 | bitmap_counter_t *bmc; | 1510 | bitmap_counter_t *bmc; |
@@ -1452,7 +1533,7 @@ static int __bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *bloc | |||
1452 | return rv; | 1533 | return rv; |
1453 | } | 1534 | } |
1454 | 1535 | ||
1455 | int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks, | 1536 | int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, |
1456 | int degraded) | 1537 | int degraded) |
1457 | { | 1538 | { |
1458 | /* bitmap_start_sync must always report on multiples of whole | 1539 | /* bitmap_start_sync must always report on multiples of whole |
@@ -1463,7 +1544,7 @@ int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks, | |||
1463 | * Return the 'or' of the result. | 1544 | * Return the 'or' of the result. |
1464 | */ | 1545 | */ |
1465 | int rv = 0; | 1546 | int rv = 0; |
1466 | int blocks1; | 1547 | sector_t blocks1; |
1467 | 1548 | ||
1468 | *blocks = 0; | 1549 | *blocks = 0; |
1469 | while (*blocks < (PAGE_SIZE>>9)) { | 1550 | while (*blocks < (PAGE_SIZE>>9)) { |
@@ -1476,7 +1557,7 @@ int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks, | |||
1476 | } | 1557 | } |
1477 | EXPORT_SYMBOL(bitmap_start_sync); | 1558 | EXPORT_SYMBOL(bitmap_start_sync); |
1478 | 1559 | ||
1479 | void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, int *blocks, int aborted) | 1560 | void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int aborted) |
1480 | { | 1561 | { |
1481 | bitmap_counter_t *bmc; | 1562 | bitmap_counter_t *bmc; |
1482 | unsigned long flags; | 1563 | unsigned long flags; |
@@ -1515,7 +1596,7 @@ void bitmap_close_sync(struct bitmap *bitmap) | |||
1515 | * RESYNC bit wherever it is still on | 1596 | * RESYNC bit wherever it is still on |
1516 | */ | 1597 | */ |
1517 | sector_t sector = 0; | 1598 | sector_t sector = 0; |
1518 | int blocks; | 1599 | sector_t blocks; |
1519 | if (!bitmap) | 1600 | if (!bitmap) |
1520 | return; | 1601 | return; |
1521 | while (sector < bitmap->mddev->resync_max_sectors) { | 1602 | while (sector < bitmap->mddev->resync_max_sectors) { |
@@ -1528,7 +1609,7 @@ EXPORT_SYMBOL(bitmap_close_sync); | |||
1528 | void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector) | 1609 | void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector) |
1529 | { | 1610 | { |
1530 | sector_t s = 0; | 1611 | sector_t s = 0; |
1531 | int blocks; | 1612 | sector_t blocks; |
1532 | 1613 | ||
1533 | if (!bitmap) | 1614 | if (!bitmap) |
1534 | return; | 1615 | return; |
@@ -1542,7 +1623,7 @@ void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector) | |||
1542 | wait_event(bitmap->mddev->recovery_wait, | 1623 | wait_event(bitmap->mddev->recovery_wait, |
1543 | atomic_read(&bitmap->mddev->recovery_active) == 0); | 1624 | atomic_read(&bitmap->mddev->recovery_active) == 0); |
1544 | 1625 | ||
1545 | bitmap->mddev->curr_resync_completed = bitmap->mddev->curr_resync; | 1626 | bitmap->mddev->curr_resync_completed = sector; |
1546 | set_bit(MD_CHANGE_CLEAN, &bitmap->mddev->flags); | 1627 | set_bit(MD_CHANGE_CLEAN, &bitmap->mddev->flags); |
1547 | sector &= ~((1ULL << CHUNK_BLOCK_SHIFT(bitmap)) - 1); | 1628 | sector &= ~((1ULL << CHUNK_BLOCK_SHIFT(bitmap)) - 1); |
1548 | s = 0; | 1629 | s = 0; |
@@ -1562,7 +1643,7 @@ static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int n | |||
1562 | * be 0 at this point | 1643 | * be 0 at this point |
1563 | */ | 1644 | */ |
1564 | 1645 | ||
1565 | int secs; | 1646 | sector_t secs; |
1566 | bitmap_counter_t *bmc; | 1647 | bitmap_counter_t *bmc; |
1567 | spin_lock_irq(&bitmap->lock); | 1648 | spin_lock_irq(&bitmap->lock); |
1568 | bmc = bitmap_get_counter(bitmap, offset, &secs, 1); | 1649 | bmc = bitmap_get_counter(bitmap, offset, &secs, 1); |
@@ -1723,9 +1804,16 @@ int bitmap_create(mddev_t *mddev) | |||
1723 | vfs_fsync(file, 1); | 1804 | vfs_fsync(file, 1); |
1724 | } | 1805 | } |
1725 | /* read superblock from bitmap file (this sets mddev->bitmap_info.chunksize) */ | 1806 | /* read superblock from bitmap file (this sets mddev->bitmap_info.chunksize) */ |
1726 | if (!mddev->bitmap_info.external) | 1807 | if (!mddev->bitmap_info.external) { |
1727 | err = bitmap_read_sb(bitmap); | 1808 | /* |
1728 | else { | 1809 | * If 'MD_ARRAY_FIRST_USE' is set, then device-mapper is |
1810 | * instructing us to create a new on-disk bitmap instance. | ||
1811 | */ | ||
1812 | if (test_and_clear_bit(MD_ARRAY_FIRST_USE, &mddev->flags)) | ||
1813 | err = bitmap_new_disk_sb(bitmap); | ||
1814 | else | ||
1815 | err = bitmap_read_sb(bitmap); | ||
1816 | } else { | ||
1729 | err = 0; | 1817 | err = 0; |
1730 | if (mddev->bitmap_info.chunksize == 0 || | 1818 | if (mddev->bitmap_info.chunksize == 0 || |
1731 | mddev->bitmap_info.daemon_sleep == 0) | 1819 | mddev->bitmap_info.daemon_sleep == 0) |
@@ -1749,9 +1837,6 @@ int bitmap_create(mddev_t *mddev) | |||
1749 | bitmap->chunks = chunks; | 1837 | bitmap->chunks = chunks; |
1750 | bitmap->pages = pages; | 1838 | bitmap->pages = pages; |
1751 | bitmap->missing_pages = pages; | 1839 | bitmap->missing_pages = pages; |
1752 | bitmap->counter_bits = COUNTER_BITS; | ||
1753 | |||
1754 | bitmap->syncchunk = ~0UL; | ||
1755 | 1840 | ||
1756 | #ifdef INJECT_FATAL_FAULT_1 | 1841 | #ifdef INJECT_FATAL_FAULT_1 |
1757 | bitmap->bp = NULL; | 1842 | bitmap->bp = NULL; |
@@ -1790,7 +1875,7 @@ int bitmap_load(mddev_t *mddev) | |||
1790 | * All chunks should be clean, but some might need_sync. | 1875 | * All chunks should be clean, but some might need_sync. |
1791 | */ | 1876 | */ |
1792 | while (sector < mddev->resync_max_sectors) { | 1877 | while (sector < mddev->resync_max_sectors) { |
1793 | int blocks; | 1878 | sector_t blocks; |
1794 | bitmap_start_sync(bitmap, sector, &blocks, 0); | 1879 | bitmap_start_sync(bitmap, sector, &blocks, 0); |
1795 | sector += blocks; | 1880 | sector += blocks; |
1796 | } | 1881 | } |
diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h
index e872a7bad6b8..b2a127e891ac 100644
--- a/drivers/md/bitmap.h
+++ b/drivers/md/bitmap.h
@@ -45,7 +45,7 @@ | |||
45 | * | 45 | * |
46 | * The counter counts pending write requests, plus the on-disk bit. | 46 | * The counter counts pending write requests, plus the on-disk bit. |
47 | * When the counter is '1' and the resync bits are clear, the on-disk | 47 | * When the counter is '1' and the resync bits are clear, the on-disk |
48 | * bit can be cleared aswell, thus setting the counter to 0. | 48 | * bit can be cleared as well, thus setting the counter to 0. |
49 | * When we set a bit, or in the counter (to start a write), if the fields is | 49 | * When we set a bit, or in the counter (to start a write), if the fields is |
50 | * 0, we first set the disk bit and set the counter to 1. | 50 | * 0, we first set the disk bit and set the counter to 1. |
51 | * | 51 | * |
@@ -85,7 +85,6 @@ | |||
85 | typedef __u16 bitmap_counter_t; | 85 | typedef __u16 bitmap_counter_t; |
86 | #define COUNTER_BITS 16 | 86 | #define COUNTER_BITS 16 |
87 | #define COUNTER_BIT_SHIFT 4 | 87 | #define COUNTER_BIT_SHIFT 4 |
88 | #define COUNTER_BYTE_RATIO (COUNTER_BITS / 8) | ||
89 | #define COUNTER_BYTE_SHIFT (COUNTER_BIT_SHIFT - 3) | 88 | #define COUNTER_BYTE_SHIFT (COUNTER_BIT_SHIFT - 3) |
90 | 89 | ||
91 | #define NEEDED_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 1))) | 90 | #define NEEDED_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 1))) |
@@ -196,19 +195,10 @@ struct bitmap { | |||
196 | 195 | ||
197 | mddev_t *mddev; /* the md device that the bitmap is for */ | 196 | mddev_t *mddev; /* the md device that the bitmap is for */ |
198 | 197 | ||
199 | int counter_bits; /* how many bits per block counter */ | ||
200 | |||
201 | /* bitmap chunksize -- how much data does each bit represent? */ | 198 | /* bitmap chunksize -- how much data does each bit represent? */ |
202 | unsigned long chunkshift; /* chunksize = 2^chunkshift (for bitops) */ | 199 | unsigned long chunkshift; /* chunksize = 2^chunkshift (for bitops) */ |
203 | unsigned long chunks; /* total number of data chunks for the array */ | 200 | unsigned long chunks; /* total number of data chunks for the array */ |
204 | 201 | ||
205 | /* We hold a count on the chunk currently being synced, and drop | ||
206 | * it when the last block is started. If the resync is aborted | ||
207 | * midway, we need to be able to drop that count, so we remember | ||
208 | * the counted chunk.. | ||
209 | */ | ||
210 | unsigned long syncchunk; | ||
211 | |||
212 | __u64 events_cleared; | 202 | __u64 events_cleared; |
213 | int need_sync; | 203 | int need_sync; |
214 | 204 | ||
@@ -271,8 +261,8 @@ int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, | |||
271 | unsigned long sectors, int behind); | 261 | unsigned long sectors, int behind); |
272 | void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, | 262 | void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, |
273 | unsigned long sectors, int success, int behind); | 263 | unsigned long sectors, int success, int behind); |
274 | int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks, int degraded); | 264 | int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int degraded); |
275 | void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, int *blocks, int aborted); | 265 | void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int aborted); |
276 | void bitmap_close_sync(struct bitmap *bitmap); | 266 | void bitmap_close_sync(struct bitmap *bitmap); |
277 | void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector); | 267 | void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector); |
278 | 268 | ||
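
The bitmap.c hunks above test counters with COUNTER(*bmc), NEEDED(*bmc) and COUNTER_MAX, but only NEEDED_MASK is visible in this excerpt. The following is a minimal sketch of the 16-bit layout those accessors imply; the resync bit and COUNTER_MAX definitions are assumptions for illustration, not quotes from the header:

/* Sketch of the counter layout implied above (illustration only). */
#include <linux/types.h>

typedef __u16 bitmap_counter_t;

#define COUNTER_BITS 16
#define NEEDED_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 1)))
#define RESYNC_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 2)))	/* assumed */
#define COUNTER_MAX ((bitmap_counter_t) RESYNC_MASK - 1)		/* assumed */

#define NEEDED(x) (((bitmap_counter_t) x) & NEEDED_MASK)
#define RESYNC(x) (((bitmap_counter_t) x) & RESYNC_MASK)
#define COUNTER(x) (((bitmap_counter_t) x) & COUNTER_MAX)
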
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 368e8e98f705..c8827ffd85bb 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -18,10 +18,14 @@ | |||
18 | #include <linux/crypto.h> | 18 | #include <linux/crypto.h> |
19 | #include <linux/workqueue.h> | 19 | #include <linux/workqueue.h> |
20 | #include <linux/backing-dev.h> | 20 | #include <linux/backing-dev.h> |
21 | #include <linux/percpu.h> | ||
21 | #include <asm/atomic.h> | 22 | #include <asm/atomic.h> |
22 | #include <linux/scatterlist.h> | 23 | #include <linux/scatterlist.h> |
23 | #include <asm/page.h> | 24 | #include <asm/page.h> |
24 | #include <asm/unaligned.h> | 25 | #include <asm/unaligned.h> |
26 | #include <crypto/hash.h> | ||
27 | #include <crypto/md5.h> | ||
28 | #include <crypto/algapi.h> | ||
25 | 29 | ||
26 | #include <linux/device-mapper.h> | 30 | #include <linux/device-mapper.h> |
27 | 31 | ||
@@ -63,6 +67,7 @@ struct dm_crypt_request { | |||
63 | struct convert_context *ctx; | 67 | struct convert_context *ctx; |
64 | struct scatterlist sg_in; | 68 | struct scatterlist sg_in; |
65 | struct scatterlist sg_out; | 69 | struct scatterlist sg_out; |
70 | sector_t iv_sector; | ||
66 | }; | 71 | }; |
67 | 72 | ||
68 | struct crypt_config; | 73 | struct crypt_config; |
@@ -73,11 +78,13 @@ struct crypt_iv_operations { | |||
73 | void (*dtr)(struct crypt_config *cc); | 78 | void (*dtr)(struct crypt_config *cc); |
74 | int (*init)(struct crypt_config *cc); | 79 | int (*init)(struct crypt_config *cc); |
75 | int (*wipe)(struct crypt_config *cc); | 80 | int (*wipe)(struct crypt_config *cc); |
76 | int (*generator)(struct crypt_config *cc, u8 *iv, sector_t sector); | 81 | int (*generator)(struct crypt_config *cc, u8 *iv, |
82 | struct dm_crypt_request *dmreq); | ||
83 | int (*post)(struct crypt_config *cc, u8 *iv, | ||
84 | struct dm_crypt_request *dmreq); | ||
77 | }; | 85 | }; |
78 | 86 | ||
79 | struct iv_essiv_private { | 87 | struct iv_essiv_private { |
80 | struct crypto_cipher *tfm; | ||
81 | struct crypto_hash *hash_tfm; | 88 | struct crypto_hash *hash_tfm; |
82 | u8 *salt; | 89 | u8 *salt; |
83 | }; | 90 | }; |
@@ -86,11 +93,32 @@ struct iv_benbi_private { | |||
86 | int shift; | 93 | int shift; |
87 | }; | 94 | }; |
88 | 95 | ||
96 | #define LMK_SEED_SIZE 64 /* hash + 0 */ | ||
97 | struct iv_lmk_private { | ||
98 | struct crypto_shash *hash_tfm; | ||
99 | u8 *seed; | ||
100 | }; | ||
101 | |||
89 | /* | 102 | /* |
90 | * Crypt: maps a linear range of a block device | 103 | * Crypt: maps a linear range of a block device |
91 | * and encrypts / decrypts at the same time. | 104 | * and encrypts / decrypts at the same time. |
92 | */ | 105 | */ |
93 | enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID }; | 106 | enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID }; |
107 | |||
108 | /* | ||
109 | * Duplicated per-CPU state for cipher. | ||
110 | */ | ||
111 | struct crypt_cpu { | ||
112 | struct ablkcipher_request *req; | ||
113 | /* ESSIV: struct crypto_cipher *essiv_tfm */ | ||
114 | void *iv_private; | ||
115 | struct crypto_ablkcipher *tfms[0]; | ||
116 | }; | ||
117 | |||
118 | /* | ||
119 | * The fields in here must be read only after initialization, | ||
120 | * changing state should be in crypt_cpu. | ||
121 | */ | ||
94 | struct crypt_config { | 122 | struct crypt_config { |
95 | struct dm_dev *dev; | 123 | struct dm_dev *dev; |
96 | sector_t start; | 124 | sector_t start; |
@@ -108,17 +136,25 @@ struct crypt_config { | |||
108 | struct workqueue_struct *crypt_queue; | 136 | struct workqueue_struct *crypt_queue; |
109 | 137 | ||
110 | char *cipher; | 138 | char *cipher; |
111 | char *cipher_mode; | 139 | char *cipher_string; |
112 | 140 | ||
113 | struct crypt_iv_operations *iv_gen_ops; | 141 | struct crypt_iv_operations *iv_gen_ops; |
114 | union { | 142 | union { |
115 | struct iv_essiv_private essiv; | 143 | struct iv_essiv_private essiv; |
116 | struct iv_benbi_private benbi; | 144 | struct iv_benbi_private benbi; |
145 | struct iv_lmk_private lmk; | ||
117 | } iv_gen_private; | 146 | } iv_gen_private; |
118 | sector_t iv_offset; | 147 | sector_t iv_offset; |
119 | unsigned int iv_size; | 148 | unsigned int iv_size; |
120 | 149 | ||
121 | /* | 150 | /* |
151 | * Duplicated per cpu state. Access through | ||
152 | * per_cpu_ptr() only. | ||
153 | */ | ||
154 | struct crypt_cpu __percpu *cpu; | ||
155 | unsigned tfms_count; | ||
156 | |||
157 | /* | ||
122 | * Layout of each crypto request: | 158 | * Layout of each crypto request: |
123 | * | 159 | * |
124 | * struct ablkcipher_request | 160 | * struct ablkcipher_request |
@@ -132,11 +168,10 @@ struct crypt_config { | |||
132 | * correctly aligned. | 168 | * correctly aligned. |
133 | */ | 169 | */ |
134 | unsigned int dmreq_start; | 170 | unsigned int dmreq_start; |
135 | struct ablkcipher_request *req; | ||
136 | 171 | ||
137 | struct crypto_ablkcipher *tfm; | ||
138 | unsigned long flags; | 172 | unsigned long flags; |
139 | unsigned int key_size; | 173 | unsigned int key_size; |
174 | unsigned int key_parts; | ||
140 | u8 key[0]; | 175 | u8 key[0]; |
141 | }; | 176 | }; |
142 | 177 | ||
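
The comment added above notes that the duplicated cipher state must be reached through per_cpu_ptr() only. For readers less familiar with the kernel's per-CPU API, here is a generic, minimal sketch of that pattern; the struct and function names are placeholders, not taken from the patch, and dm-crypt itself relies on per-CPU workqueues so that this_cpu_ptr() runs on a stable CPU:

/* Generic per-CPU state pattern (placeholder names; illustration only). */
#include <linux/percpu.h>
#include <linux/cpumask.h>
#include <linux/errno.h>

struct example_cpu_state {
	unsigned long uses;
};

static struct example_cpu_state __percpu *example_state;

static int example_init(void)
{
	int cpu;

	example_state = alloc_percpu(struct example_cpu_state);
	if (!example_state)
		return -ENOMEM;

	/* Slow path: visit every CPU's copy, as the ESSIV setup below does. */
	for_each_possible_cpu(cpu)
		per_cpu_ptr(example_state, cpu)->uses = 0;

	return 0;
}

static void example_fast_path(void)
{
	/* Touch only this CPU's copy; caller must not migrate meanwhile. */
	this_cpu_ptr(example_state)->uses++;
}

static void example_exit(void)
{
	free_percpu(example_state);
}
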
@@ -148,6 +183,20 @@ static struct kmem_cache *_crypt_io_pool; | |||
148 | 183 | ||
149 | static void clone_init(struct dm_crypt_io *, struct bio *); | 184 | static void clone_init(struct dm_crypt_io *, struct bio *); |
150 | static void kcryptd_queue_crypt(struct dm_crypt_io *io); | 185 | static void kcryptd_queue_crypt(struct dm_crypt_io *io); |
186 | static u8 *iv_of_dmreq(struct crypt_config *cc, struct dm_crypt_request *dmreq); | ||
187 | |||
188 | static struct crypt_cpu *this_crypt_config(struct crypt_config *cc) | ||
189 | { | ||
190 | return this_cpu_ptr(cc->cpu); | ||
191 | } | ||
192 | |||
193 | /* | ||
194 | * Use this to access cipher attributes that are the same for each CPU. | ||
195 | */ | ||
196 | static struct crypto_ablkcipher *any_tfm(struct crypt_config *cc) | ||
197 | { | ||
198 | return __this_cpu_ptr(cc->cpu)->tfms[0]; | ||
199 | } | ||
151 | 200 | ||
152 | /* | 201 | /* |
153 | * Different IV generation algorithms: | 202 | * Different IV generation algorithms: |
@@ -168,23 +217,38 @@ static void kcryptd_queue_crypt(struct dm_crypt_io *io); | |||
168 | * null: the initial vector is always zero. Provides compatibility with | 217 | * null: the initial vector is always zero. Provides compatibility with |
169 | * obsolete loop_fish2 devices. Do not use for new devices. | 218 | * obsolete loop_fish2 devices. Do not use for new devices. |
170 | * | 219 | * |
220 | * lmk: Compatible implementation of the block chaining mode used | ||
221 | * by the Loop-AES block device encryption system | ||
222 | * designed by Jari Ruusu. See http://loop-aes.sourceforge.net/ | ||
223 | * It operates on full 512 byte sectors and uses CBC | ||
224 | * with an IV derived from the sector number, the data and | ||
225 | * optionally extra IV seed. | ||
226 | * This means that after decryption the first block | ||
227 | * of sector must be tweaked according to decrypted data. | ||
228 | * Loop-AES can use three encryption schemes: | ||
229 | * version 1: is plain aes-cbc mode | ||
230 | * version 2: uses 64 multikey scheme with lmk IV generator | ||
231 | * version 3: the same as version 2 with additional IV seed | ||
232 | * (it uses 65 keys, last key is used as IV seed) | ||
233 | * | ||
171 | * plumb: unimplemented, see: | 234 | * plumb: unimplemented, see: |
172 | * http://article.gmane.org/gmane.linux.kernel.device-mapper.dm-crypt/454 | 235 | * http://article.gmane.org/gmane.linux.kernel.device-mapper.dm-crypt/454 |
173 | */ | 236 | */ |
174 | 237 | ||
175 | static int crypt_iv_plain_gen(struct crypt_config *cc, u8 *iv, sector_t sector) | 238 | static int crypt_iv_plain_gen(struct crypt_config *cc, u8 *iv, |
239 | struct dm_crypt_request *dmreq) | ||
176 | { | 240 | { |
177 | memset(iv, 0, cc->iv_size); | 241 | memset(iv, 0, cc->iv_size); |
178 | *(u32 *)iv = cpu_to_le32(sector & 0xffffffff); | 242 | *(u32 *)iv = cpu_to_le32(dmreq->iv_sector & 0xffffffff); |
179 | 243 | ||
180 | return 0; | 244 | return 0; |
181 | } | 245 | } |
182 | 246 | ||
183 | static int crypt_iv_plain64_gen(struct crypt_config *cc, u8 *iv, | 247 | static int crypt_iv_plain64_gen(struct crypt_config *cc, u8 *iv, |
184 | sector_t sector) | 248 | struct dm_crypt_request *dmreq) |
185 | { | 249 | { |
186 | memset(iv, 0, cc->iv_size); | 250 | memset(iv, 0, cc->iv_size); |
187 | *(u64 *)iv = cpu_to_le64(sector); | 251 | *(u64 *)iv = cpu_to_le64(dmreq->iv_sector); |
188 | 252 | ||
189 | return 0; | 253 | return 0; |
190 | } | 254 | } |
@@ -195,7 +259,8 @@ static int crypt_iv_essiv_init(struct crypt_config *cc) | |||
195 | struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv; | 259 | struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv; |
196 | struct hash_desc desc; | 260 | struct hash_desc desc; |
197 | struct scatterlist sg; | 261 | struct scatterlist sg; |
198 | int err; | 262 | struct crypto_cipher *essiv_tfm; |
263 | int err, cpu; | ||
199 | 264 | ||
200 | sg_init_one(&sg, cc->key, cc->key_size); | 265 | sg_init_one(&sg, cc->key, cc->key_size); |
201 | desc.tfm = essiv->hash_tfm; | 266 | desc.tfm = essiv->hash_tfm; |
@@ -205,8 +270,16 @@ static int crypt_iv_essiv_init(struct crypt_config *cc) | |||
205 | if (err) | 270 | if (err) |
206 | return err; | 271 | return err; |
207 | 272 | ||
208 | return crypto_cipher_setkey(essiv->tfm, essiv->salt, | 273 | for_each_possible_cpu(cpu) { |
274 | essiv_tfm = per_cpu_ptr(cc->cpu, cpu)->iv_private, | ||
275 | |||
276 | err = crypto_cipher_setkey(essiv_tfm, essiv->salt, | ||
209 | crypto_hash_digestsize(essiv->hash_tfm)); | 277 | crypto_hash_digestsize(essiv->hash_tfm)); |
278 | if (err) | ||
279 | return err; | ||
280 | } | ||
281 | |||
282 | return 0; | ||
210 | } | 283 | } |
211 | 284 | ||
212 | /* Wipe salt and reset key derived from volume key */ | 285 | /* Wipe salt and reset key derived from volume key */ |
@@ -214,24 +287,76 @@ static int crypt_iv_essiv_wipe(struct crypt_config *cc) | |||
214 | { | 287 | { |
215 | struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv; | 288 | struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv; |
216 | unsigned salt_size = crypto_hash_digestsize(essiv->hash_tfm); | 289 | unsigned salt_size = crypto_hash_digestsize(essiv->hash_tfm); |
290 | struct crypto_cipher *essiv_tfm; | ||
291 | int cpu, r, err = 0; | ||
217 | 292 | ||
218 | memset(essiv->salt, 0, salt_size); | 293 | memset(essiv->salt, 0, salt_size); |
219 | 294 | ||
220 | return crypto_cipher_setkey(essiv->tfm, essiv->salt, salt_size); | 295 | for_each_possible_cpu(cpu) { |
296 | essiv_tfm = per_cpu_ptr(cc->cpu, cpu)->iv_private; | ||
297 | r = crypto_cipher_setkey(essiv_tfm, essiv->salt, salt_size); | ||
298 | if (r) | ||
299 | err = r; | ||
300 | } | ||
301 | |||
302 | return err; | ||
303 | } | ||
304 | |||
305 | /* Set up per cpu cipher state */ | ||
306 | static struct crypto_cipher *setup_essiv_cpu(struct crypt_config *cc, | ||
307 | struct dm_target *ti, | ||
308 | u8 *salt, unsigned saltsize) | ||
309 | { | ||
310 | struct crypto_cipher *essiv_tfm; | ||
311 | int err; | ||
312 | |||
313 | /* Setup the essiv_tfm with the given salt */ | ||
314 | essiv_tfm = crypto_alloc_cipher(cc->cipher, 0, CRYPTO_ALG_ASYNC); | ||
315 | if (IS_ERR(essiv_tfm)) { | ||
316 | ti->error = "Error allocating crypto tfm for ESSIV"; | ||
317 | return essiv_tfm; | ||
318 | } | ||
319 | |||
320 | if (crypto_cipher_blocksize(essiv_tfm) != | ||
321 | crypto_ablkcipher_ivsize(any_tfm(cc))) { | ||
322 | ti->error = "Block size of ESSIV cipher does " | ||
323 | "not match IV size of block cipher"; | ||
324 | crypto_free_cipher(essiv_tfm); | ||
325 | return ERR_PTR(-EINVAL); | ||
326 | } | ||
327 | |||
328 | err = crypto_cipher_setkey(essiv_tfm, salt, saltsize); | ||
329 | if (err) { | ||
330 | ti->error = "Failed to set key for ESSIV cipher"; | ||
331 | crypto_free_cipher(essiv_tfm); | ||
332 | return ERR_PTR(err); | ||
333 | } | ||
334 | |||
335 | return essiv_tfm; | ||
221 | } | 336 | } |
222 | 337 | ||
223 | static void crypt_iv_essiv_dtr(struct crypt_config *cc) | 338 | static void crypt_iv_essiv_dtr(struct crypt_config *cc) |
224 | { | 339 | { |
340 | int cpu; | ||
341 | struct crypt_cpu *cpu_cc; | ||
342 | struct crypto_cipher *essiv_tfm; | ||
225 | struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv; | 343 | struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv; |
226 | 344 | ||
227 | crypto_free_cipher(essiv->tfm); | ||
228 | essiv->tfm = NULL; | ||
229 | |||
230 | crypto_free_hash(essiv->hash_tfm); | 345 | crypto_free_hash(essiv->hash_tfm); |
231 | essiv->hash_tfm = NULL; | 346 | essiv->hash_tfm = NULL; |
232 | 347 | ||
233 | kzfree(essiv->salt); | 348 | kzfree(essiv->salt); |
234 | essiv->salt = NULL; | 349 | essiv->salt = NULL; |
350 | |||
351 | for_each_possible_cpu(cpu) { | ||
352 | cpu_cc = per_cpu_ptr(cc->cpu, cpu); | ||
353 | essiv_tfm = cpu_cc->iv_private; | ||
354 | |||
355 | if (essiv_tfm) | ||
356 | crypto_free_cipher(essiv_tfm); | ||
357 | |||
358 | cpu_cc->iv_private = NULL; | ||
359 | } | ||
235 | } | 360 | } |
236 | 361 | ||
237 | static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti, | 362 | static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti, |
@@ -240,7 +365,7 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti, | |||
240 | struct crypto_cipher *essiv_tfm = NULL; | 365 | struct crypto_cipher *essiv_tfm = NULL; |
241 | struct crypto_hash *hash_tfm = NULL; | 366 | struct crypto_hash *hash_tfm = NULL; |
242 | u8 *salt = NULL; | 367 | u8 *salt = NULL; |
243 | int err; | 368 | int err, cpu; |
244 | 369 | ||
245 | if (!opts) { | 370 | if (!opts) { |
246 | ti->error = "Digest algorithm missing for ESSIV mode"; | 371 | ti->error = "Digest algorithm missing for ESSIV mode"; |
@@ -262,48 +387,44 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti, | |||
262 | goto bad; | 387 | goto bad; |
263 | } | 388 | } |
264 | 389 | ||
265 | /* Allocate essiv_tfm */ | ||
266 | essiv_tfm = crypto_alloc_cipher(cc->cipher, 0, CRYPTO_ALG_ASYNC); | ||
267 | if (IS_ERR(essiv_tfm)) { | ||
268 | ti->error = "Error allocating crypto tfm for ESSIV"; | ||
269 | err = PTR_ERR(essiv_tfm); | ||
270 | goto bad; | ||
271 | } | ||
272 | if (crypto_cipher_blocksize(essiv_tfm) != | ||
273 | crypto_ablkcipher_ivsize(cc->tfm)) { | ||
274 | ti->error = "Block size of ESSIV cipher does " | ||
275 | "not match IV size of block cipher"; | ||
276 | err = -EINVAL; | ||
277 | goto bad; | ||
278 | } | ||
279 | |||
280 | cc->iv_gen_private.essiv.salt = salt; | 390 | cc->iv_gen_private.essiv.salt = salt; |
281 | cc->iv_gen_private.essiv.tfm = essiv_tfm; | ||
282 | cc->iv_gen_private.essiv.hash_tfm = hash_tfm; | 391 | cc->iv_gen_private.essiv.hash_tfm = hash_tfm; |
283 | 392 | ||
393 | for_each_possible_cpu(cpu) { | ||
394 | essiv_tfm = setup_essiv_cpu(cc, ti, salt, | ||
395 | crypto_hash_digestsize(hash_tfm)); | ||
396 | if (IS_ERR(essiv_tfm)) { | ||
397 | crypt_iv_essiv_dtr(cc); | ||
398 | return PTR_ERR(essiv_tfm); | ||
399 | } | ||
400 | per_cpu_ptr(cc->cpu, cpu)->iv_private = essiv_tfm; | ||
401 | } | ||
402 | |||
284 | return 0; | 403 | return 0; |
285 | 404 | ||
286 | bad: | 405 | bad: |
287 | if (essiv_tfm && !IS_ERR(essiv_tfm)) | ||
288 | crypto_free_cipher(essiv_tfm); | ||
289 | if (hash_tfm && !IS_ERR(hash_tfm)) | 406 | if (hash_tfm && !IS_ERR(hash_tfm)) |
290 | crypto_free_hash(hash_tfm); | 407 | crypto_free_hash(hash_tfm); |
291 | kfree(salt); | 408 | kfree(salt); |
292 | return err; | 409 | return err; |
293 | } | 410 | } |
294 | 411 | ||
295 | static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv, sector_t sector) | 412 | static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv, |
413 | struct dm_crypt_request *dmreq) | ||
296 | { | 414 | { |
415 | struct crypto_cipher *essiv_tfm = this_crypt_config(cc)->iv_private; | ||
416 | |||
297 | memset(iv, 0, cc->iv_size); | 417 | memset(iv, 0, cc->iv_size); |
298 | *(u64 *)iv = cpu_to_le64(sector); | 418 | *(u64 *)iv = cpu_to_le64(dmreq->iv_sector); |
299 | crypto_cipher_encrypt_one(cc->iv_gen_private.essiv.tfm, iv, iv); | 419 | crypto_cipher_encrypt_one(essiv_tfm, iv, iv); |
420 | |||
300 | return 0; | 421 | return 0; |
301 | } | 422 | } |
302 | 423 | ||
303 | static int crypt_iv_benbi_ctr(struct crypt_config *cc, struct dm_target *ti, | 424 | static int crypt_iv_benbi_ctr(struct crypt_config *cc, struct dm_target *ti, |
304 | const char *opts) | 425 | const char *opts) |
305 | { | 426 | { |
306 | unsigned bs = crypto_ablkcipher_blocksize(cc->tfm); | 427 | unsigned bs = crypto_ablkcipher_blocksize(any_tfm(cc)); |
307 | int log = ilog2(bs); | 428 | int log = ilog2(bs); |
308 | 429 | ||
309 | /* we need to calculate how far we must shift the sector count | 430 | /* we need to calculate how far we must shift the sector count |
@@ -328,25 +449,177 @@ static void crypt_iv_benbi_dtr(struct crypt_config *cc) | |||
328 | { | 449 | { |
329 | } | 450 | } |
330 | 451 | ||
331 | static int crypt_iv_benbi_gen(struct crypt_config *cc, u8 *iv, sector_t sector) | 452 | static int crypt_iv_benbi_gen(struct crypt_config *cc, u8 *iv, |
453 | struct dm_crypt_request *dmreq) | ||
332 | { | 454 | { |
333 | __be64 val; | 455 | __be64 val; |
334 | 456 | ||
335 | memset(iv, 0, cc->iv_size - sizeof(u64)); /* rest is cleared below */ | 457 | memset(iv, 0, cc->iv_size - sizeof(u64)); /* rest is cleared below */ |
336 | 458 | ||
337 | val = cpu_to_be64(((u64)sector << cc->iv_gen_private.benbi.shift) + 1); | 459 | val = cpu_to_be64(((u64)dmreq->iv_sector << cc->iv_gen_private.benbi.shift) + 1); |
338 | put_unaligned(val, (__be64 *)(iv + cc->iv_size - sizeof(u64))); | 460 | put_unaligned(val, (__be64 *)(iv + cc->iv_size - sizeof(u64))); |
339 | 461 | ||
340 | return 0; | 462 | return 0; |
341 | } | 463 | } |
342 | 464 | ||
343 | static int crypt_iv_null_gen(struct crypt_config *cc, u8 *iv, sector_t sector) | 465 | static int crypt_iv_null_gen(struct crypt_config *cc, u8 *iv, |
466 | struct dm_crypt_request *dmreq) | ||
344 | { | 467 | { |
345 | memset(iv, 0, cc->iv_size); | 468 | memset(iv, 0, cc->iv_size); |
346 | 469 | ||
347 | return 0; | 470 | return 0; |
348 | } | 471 | } |
349 | 472 | ||
473 | static void crypt_iv_lmk_dtr(struct crypt_config *cc) | ||
474 | { | ||
475 | struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk; | ||
476 | |||
477 | if (lmk->hash_tfm && !IS_ERR(lmk->hash_tfm)) | ||
478 | crypto_free_shash(lmk->hash_tfm); | ||
479 | lmk->hash_tfm = NULL; | ||
480 | |||
481 | kzfree(lmk->seed); | ||
482 | lmk->seed = NULL; | ||
483 | } | ||
484 | |||
485 | static int crypt_iv_lmk_ctr(struct crypt_config *cc, struct dm_target *ti, | ||
486 | const char *opts) | ||
487 | { | ||
488 | struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk; | ||
489 | |||
490 | lmk->hash_tfm = crypto_alloc_shash("md5", 0, 0); | ||
491 | if (IS_ERR(lmk->hash_tfm)) { | ||
492 | ti->error = "Error initializing LMK hash"; | ||
493 | return PTR_ERR(lmk->hash_tfm); | ||
494 | } | ||
495 | |||
496 | /* No seed in LMK version 2 */ | ||
497 | if (cc->key_parts == cc->tfms_count) { | ||
498 | lmk->seed = NULL; | ||
499 | return 0; | ||
500 | } | ||
501 | |||
502 | lmk->seed = kzalloc(LMK_SEED_SIZE, GFP_KERNEL); | ||
503 | if (!lmk->seed) { | ||
504 | crypt_iv_lmk_dtr(cc); | ||
505 | ti->error = "Error kmallocing seed storage in LMK"; | ||
506 | return -ENOMEM; | ||
507 | } | ||
508 | |||
509 | return 0; | ||
510 | } | ||
511 | |||
512 | static int crypt_iv_lmk_init(struct crypt_config *cc) | ||
513 | { | ||
514 | struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk; | ||
515 | int subkey_size = cc->key_size / cc->key_parts; | ||
516 | |||
517 | /* LMK seed is on the position of LMK_KEYS + 1 key */ | ||
518 | if (lmk->seed) | ||
519 | memcpy(lmk->seed, cc->key + (cc->tfms_count * subkey_size), | ||
520 | crypto_shash_digestsize(lmk->hash_tfm)); | ||
521 | |||
522 | return 0; | ||
523 | } | ||
524 | |||
525 | static int crypt_iv_lmk_wipe(struct crypt_config *cc) | ||
526 | { | ||
527 | struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk; | ||
528 | |||
529 | if (lmk->seed) | ||
530 | memset(lmk->seed, 0, LMK_SEED_SIZE); | ||
531 | |||
532 | return 0; | ||
533 | } | ||
534 | |||
535 | static int crypt_iv_lmk_one(struct crypt_config *cc, u8 *iv, | ||
536 | struct dm_crypt_request *dmreq, | ||
537 | u8 *data) | ||
538 | { | ||
539 | struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk; | ||
540 | struct { | ||
541 | struct shash_desc desc; | ||
542 | char ctx[crypto_shash_descsize(lmk->hash_tfm)]; | ||
543 | } sdesc; | ||
544 | struct md5_state md5state; | ||
545 | u32 buf[4]; | ||
546 | int i, r; | ||
547 | |||
548 | sdesc.desc.tfm = lmk->hash_tfm; | ||
549 | sdesc.desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; | ||
550 | |||
551 | r = crypto_shash_init(&sdesc.desc); | ||
552 | if (r) | ||
553 | return r; | ||
554 | |||
555 | if (lmk->seed) { | ||
556 | r = crypto_shash_update(&sdesc.desc, lmk->seed, LMK_SEED_SIZE); | ||
557 | if (r) | ||
558 | return r; | ||
559 | } | ||
560 | |||
561 | /* Sector is always 512B, block size 16, add data of blocks 1-31 */ | ||
562 | r = crypto_shash_update(&sdesc.desc, data + 16, 16 * 31); | ||
563 | if (r) | ||
564 | return r; | ||
565 | |||
566 | /* Sector is cropped to 56 bits here */ | ||
567 | buf[0] = cpu_to_le32(dmreq->iv_sector & 0xFFFFFFFF); | ||
568 | buf[1] = cpu_to_le32((((u64)dmreq->iv_sector >> 32) & 0x00FFFFFF) | 0x80000000); | ||
569 | buf[2] = cpu_to_le32(4024); | ||
570 | buf[3] = 0; | ||
571 | r = crypto_shash_update(&sdesc.desc, (u8 *)buf, sizeof(buf)); | ||
572 | if (r) | ||
573 | return r; | ||
574 | |||
575 | /* No MD5 padding here */ | ||
576 | r = crypto_shash_export(&sdesc.desc, &md5state); | ||
577 | if (r) | ||
578 | return r; | ||
579 | |||
580 | for (i = 0; i < MD5_HASH_WORDS; i++) | ||
581 | __cpu_to_le32s(&md5state.hash[i]); | ||
582 | memcpy(iv, &md5state.hash, cc->iv_size); | ||
583 | |||
584 | return 0; | ||
585 | } | ||
586 | |||
587 | static int crypt_iv_lmk_gen(struct crypt_config *cc, u8 *iv, | ||
588 | struct dm_crypt_request *dmreq) | ||
589 | { | ||
590 | u8 *src; | ||
591 | int r = 0; | ||
592 | |||
593 | if (bio_data_dir(dmreq->ctx->bio_in) == WRITE) { | ||
594 | src = kmap_atomic(sg_page(&dmreq->sg_in), KM_USER0); | ||
595 | r = crypt_iv_lmk_one(cc, iv, dmreq, src + dmreq->sg_in.offset); | ||
596 | kunmap_atomic(src, KM_USER0); | ||
597 | } else | ||
598 | memset(iv, 0, cc->iv_size); | ||
599 | |||
600 | return r; | ||
601 | } | ||
602 | |||
603 | static int crypt_iv_lmk_post(struct crypt_config *cc, u8 *iv, | ||
604 | struct dm_crypt_request *dmreq) | ||
605 | { | ||
606 | u8 *dst; | ||
607 | int r; | ||
608 | |||
609 | if (bio_data_dir(dmreq->ctx->bio_in) == WRITE) | ||
610 | return 0; | ||
611 | |||
612 | dst = kmap_atomic(sg_page(&dmreq->sg_out), KM_USER0); | ||
613 | r = crypt_iv_lmk_one(cc, iv, dmreq, dst + dmreq->sg_out.offset); | ||
614 | |||
615 | /* Tweak the first block of plaintext sector */ | ||
616 | if (!r) | ||
617 | crypto_xor(dst + dmreq->sg_out.offset, iv, cc->iv_size); | ||
618 | |||
619 | kunmap_atomic(dst, KM_USER0); | ||
620 | return r; | ||
621 | } | ||
622 | |||
350 | static struct crypt_iv_operations crypt_iv_plain_ops = { | 623 | static struct crypt_iv_operations crypt_iv_plain_ops = { |
351 | .generator = crypt_iv_plain_gen | 624 | .generator = crypt_iv_plain_gen |
352 | }; | 625 | }; |
@@ -373,6 +646,15 @@ static struct crypt_iv_operations crypt_iv_null_ops = { | |||
373 | .generator = crypt_iv_null_gen | 646 | .generator = crypt_iv_null_gen |
374 | }; | 647 | }; |
375 | 648 | ||
649 | static struct crypt_iv_operations crypt_iv_lmk_ops = { | ||
650 | .ctr = crypt_iv_lmk_ctr, | ||
651 | .dtr = crypt_iv_lmk_dtr, | ||
652 | .init = crypt_iv_lmk_init, | ||
653 | .wipe = crypt_iv_lmk_wipe, | ||
654 | .generator = crypt_iv_lmk_gen, | ||
655 | .post = crypt_iv_lmk_post | ||
656 | }; | ||
657 | |||
376 | static void crypt_convert_init(struct crypt_config *cc, | 658 | static void crypt_convert_init(struct crypt_config *cc, |
377 | struct convert_context *ctx, | 659 | struct convert_context *ctx, |
378 | struct bio *bio_out, struct bio *bio_in, | 660 | struct bio *bio_out, struct bio *bio_in, |
@@ -400,6 +682,13 @@ static struct ablkcipher_request *req_of_dmreq(struct crypt_config *cc, | |||
400 | return (struct ablkcipher_request *)((char *)dmreq - cc->dmreq_start); | 682 | return (struct ablkcipher_request *)((char *)dmreq - cc->dmreq_start); |
401 | } | 683 | } |
402 | 684 | ||
685 | static u8 *iv_of_dmreq(struct crypt_config *cc, | ||
686 | struct dm_crypt_request *dmreq) | ||
687 | { | ||
688 | return (u8 *)ALIGN((unsigned long)(dmreq + 1), | ||
689 | crypto_ablkcipher_alignmask(any_tfm(cc)) + 1); | ||
690 | } | ||
691 | |||
403 | static int crypt_convert_block(struct crypt_config *cc, | 692 | static int crypt_convert_block(struct crypt_config *cc, |
404 | struct convert_context *ctx, | 693 | struct convert_context *ctx, |
405 | struct ablkcipher_request *req) | 694 | struct ablkcipher_request *req) |
@@ -411,9 +700,9 @@ static int crypt_convert_block(struct crypt_config *cc, | |||
411 | int r = 0; | 700 | int r = 0; |
412 | 701 | ||
413 | dmreq = dmreq_of_req(cc, req); | 702 | dmreq = dmreq_of_req(cc, req); |
414 | iv = (u8 *)ALIGN((unsigned long)(dmreq + 1), | 703 | iv = iv_of_dmreq(cc, dmreq); |
415 | crypto_ablkcipher_alignmask(cc->tfm) + 1); | ||
416 | 704 | ||
705 | dmreq->iv_sector = ctx->sector; | ||
417 | dmreq->ctx = ctx; | 706 | dmreq->ctx = ctx; |
418 | sg_init_table(&dmreq->sg_in, 1); | 707 | sg_init_table(&dmreq->sg_in, 1); |
419 | sg_set_page(&dmreq->sg_in, bv_in->bv_page, 1 << SECTOR_SHIFT, | 708 | sg_set_page(&dmreq->sg_in, bv_in->bv_page, 1 << SECTOR_SHIFT, |
@@ -436,7 +725,7 @@ static int crypt_convert_block(struct crypt_config *cc, | |||
436 | } | 725 | } |
437 | 726 | ||
438 | if (cc->iv_gen_ops) { | 727 | if (cc->iv_gen_ops) { |
439 | r = cc->iv_gen_ops->generator(cc, iv, ctx->sector); | 728 | r = cc->iv_gen_ops->generator(cc, iv, dmreq); |
440 | if (r < 0) | 729 | if (r < 0) |
441 | return r; | 730 | return r; |
442 | } | 731 | } |
@@ -449,21 +738,28 @@ static int crypt_convert_block(struct crypt_config *cc, | |||
449 | else | 738 | else |
450 | r = crypto_ablkcipher_decrypt(req); | 739 | r = crypto_ablkcipher_decrypt(req); |
451 | 740 | ||
741 | if (!r && cc->iv_gen_ops && cc->iv_gen_ops->post) | ||
742 | r = cc->iv_gen_ops->post(cc, iv, dmreq); | ||
743 | |||
452 | return r; | 744 | return r; |
453 | } | 745 | } |
454 | 746 | ||
455 | static void kcryptd_async_done(struct crypto_async_request *async_req, | 747 | static void kcryptd_async_done(struct crypto_async_request *async_req, |
456 | int error); | 748 | int error); |
749 | |||
457 | static void crypt_alloc_req(struct crypt_config *cc, | 750 | static void crypt_alloc_req(struct crypt_config *cc, |
458 | struct convert_context *ctx) | 751 | struct convert_context *ctx) |
459 | { | 752 | { |
460 | if (!cc->req) | 753 | struct crypt_cpu *this_cc = this_crypt_config(cc); |
461 | cc->req = mempool_alloc(cc->req_pool, GFP_NOIO); | 754 | unsigned key_index = ctx->sector & (cc->tfms_count - 1); |
462 | ablkcipher_request_set_tfm(cc->req, cc->tfm); | 755 | |
463 | ablkcipher_request_set_callback(cc->req, CRYPTO_TFM_REQ_MAY_BACKLOG | | 756 | if (!this_cc->req) |
464 | CRYPTO_TFM_REQ_MAY_SLEEP, | 757 | this_cc->req = mempool_alloc(cc->req_pool, GFP_NOIO); |
465 | kcryptd_async_done, | 758 | |
466 | dmreq_of_req(cc, cc->req)); | 759 | ablkcipher_request_set_tfm(this_cc->req, this_cc->tfms[key_index]); |
760 | ablkcipher_request_set_callback(this_cc->req, | ||
761 | CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, | ||
762 | kcryptd_async_done, dmreq_of_req(cc, this_cc->req)); | ||
467 | } | 763 | } |
468 | 764 | ||
469 | /* | 765 | /* |
@@ -472,6 +768,7 @@ static void crypt_alloc_req(struct crypt_config *cc, | |||
472 | static int crypt_convert(struct crypt_config *cc, | 768 | static int crypt_convert(struct crypt_config *cc, |
473 | struct convert_context *ctx) | 769 | struct convert_context *ctx) |
474 | { | 770 | { |
771 | struct crypt_cpu *this_cc = this_crypt_config(cc); | ||
475 | int r; | 772 | int r; |
476 | 773 | ||
477 | atomic_set(&ctx->pending, 1); | 774 | atomic_set(&ctx->pending, 1); |
@@ -483,7 +780,7 @@ static int crypt_convert(struct crypt_config *cc, | |||
483 | 780 | ||
484 | atomic_inc(&ctx->pending); | 781 | atomic_inc(&ctx->pending); |
485 | 782 | ||
486 | r = crypt_convert_block(cc, ctx, cc->req); | 783 | r = crypt_convert_block(cc, ctx, this_cc->req); |
487 | 784 | ||
488 | switch (r) { | 785 | switch (r) { |
489 | /* async */ | 786 | /* async */ |
@@ -492,7 +789,7 @@ static int crypt_convert(struct crypt_config *cc, | |||
492 | INIT_COMPLETION(ctx->restart); | 789 | INIT_COMPLETION(ctx->restart); |
493 | /* fall through*/ | 790 | /* fall through*/ |
494 | case -EINPROGRESS: | 791 | case -EINPROGRESS: |
495 | cc->req = NULL; | 792 | this_cc->req = NULL; |
496 | ctx->sector++; | 793 | ctx->sector++; |
497 | continue; | 794 | continue; |
498 | 795 | ||
@@ -651,6 +948,9 @@ static void crypt_dec_pending(struct dm_crypt_io *io) | |||
651 | * They must be separated as otherwise the final stages could be | 948 | * They must be separated as otherwise the final stages could be |
652 | * starved by new requests which can block in the first stages due | 949 | * starved by new requests which can block in the first stages due |
653 | * to memory allocation. | 950 | * to memory allocation. |
951 | * | ||
952 | * The work is done on per-CPU workqueues shared by all dm-crypt instances. | ||
953 | * The instances must not depend on each other and must not block. | ||
654 | */ | 954 | */ |
655 | static void crypt_endio(struct bio *clone, int error) | 955 | static void crypt_endio(struct bio *clone, int error) |
656 | { | 956 | { |
@@ -691,25 +991,22 @@ static void clone_init(struct dm_crypt_io *io, struct bio *clone) | |||
691 | clone->bi_destructor = dm_crypt_bio_destructor; | 991 | clone->bi_destructor = dm_crypt_bio_destructor; |
692 | } | 992 | } |
693 | 993 | ||
694 | static void kcryptd_io_read(struct dm_crypt_io *io) | 994 | static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp) |
695 | { | 995 | { |
696 | struct crypt_config *cc = io->target->private; | 996 | struct crypt_config *cc = io->target->private; |
697 | struct bio *base_bio = io->base_bio; | 997 | struct bio *base_bio = io->base_bio; |
698 | struct bio *clone; | 998 | struct bio *clone; |
699 | 999 | ||
700 | crypt_inc_pending(io); | ||
701 | |||
702 | /* | 1000 | /* |
703 | * The block layer might modify the bvec array, so always | 1001 | * The block layer might modify the bvec array, so always |
704 | * copy the required bvecs because we need the original | 1002 | * copy the required bvecs because we need the original |
705 | * one in order to decrypt the whole bio data *afterwards*. | 1003 | * one in order to decrypt the whole bio data *afterwards*. |
706 | */ | 1004 | */ |
707 | clone = bio_alloc_bioset(GFP_NOIO, bio_segments(base_bio), cc->bs); | 1005 | clone = bio_alloc_bioset(gfp, bio_segments(base_bio), cc->bs); |
708 | if (unlikely(!clone)) { | 1006 | if (!clone) |
709 | io->error = -ENOMEM; | 1007 | return 1; |
710 | crypt_dec_pending(io); | 1008 | |
711 | return; | 1009 | crypt_inc_pending(io); |
712 | } | ||
713 | 1010 | ||
714 | clone_init(io, clone); | 1011 | clone_init(io, clone); |
715 | clone->bi_idx = 0; | 1012 | clone->bi_idx = 0; |
@@ -720,6 +1017,7 @@ static void kcryptd_io_read(struct dm_crypt_io *io) | |||
720 | sizeof(struct bio_vec) * clone->bi_vcnt); | 1017 | sizeof(struct bio_vec) * clone->bi_vcnt); |
721 | 1018 | ||
722 | generic_make_request(clone); | 1019 | generic_make_request(clone); |
1020 | return 0; | ||
723 | } | 1021 | } |
724 | 1022 | ||
725 | static void kcryptd_io_write(struct dm_crypt_io *io) | 1023 | static void kcryptd_io_write(struct dm_crypt_io *io) |
@@ -732,9 +1030,12 @@ static void kcryptd_io(struct work_struct *work) | |||
732 | { | 1030 | { |
733 | struct dm_crypt_io *io = container_of(work, struct dm_crypt_io, work); | 1031 | struct dm_crypt_io *io = container_of(work, struct dm_crypt_io, work); |
734 | 1032 | ||
735 | if (bio_data_dir(io->base_bio) == READ) | 1033 | if (bio_data_dir(io->base_bio) == READ) { |
736 | kcryptd_io_read(io); | 1034 | crypt_inc_pending(io); |
737 | else | 1035 | if (kcryptd_io_read(io, GFP_NOIO)) |
1036 | io->error = -ENOMEM; | ||
1037 | crypt_dec_pending(io); | ||
1038 | } else | ||
738 | kcryptd_io_write(io); | 1039 | kcryptd_io_write(io); |
739 | } | 1040 | } |
740 | 1041 | ||
@@ -901,6 +1202,9 @@ static void kcryptd_async_done(struct crypto_async_request *async_req, | |||
901 | return; | 1202 | return; |
902 | } | 1203 | } |
903 | 1204 | ||
1205 | if (!error && cc->iv_gen_ops && cc->iv_gen_ops->post) | ||
1206 | error = cc->iv_gen_ops->post(cc, iv_of_dmreq(cc, dmreq), dmreq); | ||
1207 | |||
904 | mempool_free(req_of_dmreq(cc, dmreq), cc->req_pool); | 1208 | mempool_free(req_of_dmreq(cc, dmreq), cc->req_pool); |
905 | 1209 | ||
906 | if (!atomic_dec_and_test(&ctx->pending)) | 1210 | if (!atomic_dec_and_test(&ctx->pending)) |
@@ -971,34 +1275,93 @@ static void crypt_encode_key(char *hex, u8 *key, unsigned int size) | |||
971 | } | 1275 | } |
972 | } | 1276 | } |
973 | 1277 | ||
1278 | static void crypt_free_tfms(struct crypt_config *cc, int cpu) | ||
1279 | { | ||
1280 | struct crypt_cpu *cpu_cc = per_cpu_ptr(cc->cpu, cpu); | ||
1281 | unsigned i; | ||
1282 | |||
1283 | for (i = 0; i < cc->tfms_count; i++) | ||
1284 | if (cpu_cc->tfms[i] && !IS_ERR(cpu_cc->tfms[i])) { | ||
1285 | crypto_free_ablkcipher(cpu_cc->tfms[i]); | ||
1286 | cpu_cc->tfms[i] = NULL; | ||
1287 | } | ||
1288 | } | ||
1289 | |||
1290 | static int crypt_alloc_tfms(struct crypt_config *cc, int cpu, char *ciphermode) | ||
1291 | { | ||
1292 | struct crypt_cpu *cpu_cc = per_cpu_ptr(cc->cpu, cpu); | ||
1293 | unsigned i; | ||
1294 | int err; | ||
1295 | |||
1296 | for (i = 0; i < cc->tfms_count; i++) { | ||
1297 | cpu_cc->tfms[i] = crypto_alloc_ablkcipher(ciphermode, 0, 0); | ||
1298 | if (IS_ERR(cpu_cc->tfms[i])) { | ||
1299 | err = PTR_ERR(cpu_cc->tfms[i]); | ||
1300 | crypt_free_tfms(cc, cpu); | ||
1301 | return err; | ||
1302 | } | ||
1303 | } | ||
1304 | |||
1305 | return 0; | ||
1306 | } | ||
1307 | |||
1308 | static int crypt_setkey_allcpus(struct crypt_config *cc) | ||
1309 | { | ||
1310 | unsigned subkey_size = cc->key_size >> ilog2(cc->tfms_count); | ||
1311 | int cpu, err = 0, i, r; | ||
1312 | |||
1313 | for_each_possible_cpu(cpu) { | ||
1314 | for (i = 0; i < cc->tfms_count; i++) { | ||
1315 | r = crypto_ablkcipher_setkey(per_cpu_ptr(cc->cpu, cpu)->tfms[i], | ||
1316 | cc->key + (i * subkey_size), subkey_size); | ||
1317 | if (r) | ||
1318 | err = r; | ||
1319 | } | ||
1320 | } | ||
1321 | |||
1322 | return err; | ||
1323 | } | ||
1324 | |||
974 | static int crypt_set_key(struct crypt_config *cc, char *key) | 1325 | static int crypt_set_key(struct crypt_config *cc, char *key) |
975 | { | 1326 | { |
976 | unsigned key_size = strlen(key) >> 1; | 1327 | int r = -EINVAL; |
1328 | int key_string_len = strlen(key); | ||
977 | 1329 | ||
978 | if (cc->key_size && cc->key_size != key_size) | 1330 | /* The key size may not be changed. */ |
979 | return -EINVAL; | 1331 | if (cc->key_size != (key_string_len >> 1)) |
1332 | goto out; | ||
980 | 1333 | ||
981 | cc->key_size = key_size; /* initial settings */ | 1334 | /* Hyphen (which gives a key_size of zero) means there is no key. */ |
1335 | if (!cc->key_size && strcmp(key, "-")) | ||
1336 | goto out; | ||
982 | 1337 | ||
983 | if ((!key_size && strcmp(key, "-")) || | 1338 | if (cc->key_size && crypt_decode_key(cc->key, key, cc->key_size) < 0) |
984 | (key_size && crypt_decode_key(cc->key, key, key_size) < 0)) | 1339 | goto out; |
985 | return -EINVAL; | ||
986 | 1340 | ||
987 | set_bit(DM_CRYPT_KEY_VALID, &cc->flags); | 1341 | set_bit(DM_CRYPT_KEY_VALID, &cc->flags); |
988 | 1342 | ||
989 | return crypto_ablkcipher_setkey(cc->tfm, cc->key, cc->key_size); | 1343 | r = crypt_setkey_allcpus(cc); |
1344 | |||
1345 | out: | ||
1346 | /* Hex key string not needed after here, so wipe it. */ | ||
1347 | memset(key, '0', key_string_len); | ||
1348 | |||
1349 | return r; | ||
990 | } | 1350 | } |
991 | 1351 | ||
992 | static int crypt_wipe_key(struct crypt_config *cc) | 1352 | static int crypt_wipe_key(struct crypt_config *cc) |
993 | { | 1353 | { |
994 | clear_bit(DM_CRYPT_KEY_VALID, &cc->flags); | 1354 | clear_bit(DM_CRYPT_KEY_VALID, &cc->flags); |
995 | memset(&cc->key, 0, cc->key_size * sizeof(u8)); | 1355 | memset(&cc->key, 0, cc->key_size * sizeof(u8)); |
996 | return crypto_ablkcipher_setkey(cc->tfm, cc->key, cc->key_size); | 1356 | |
1357 | return crypt_setkey_allcpus(cc); | ||
997 | } | 1358 | } |
998 | 1359 | ||
999 | static void crypt_dtr(struct dm_target *ti) | 1360 | static void crypt_dtr(struct dm_target *ti) |
1000 | { | 1361 | { |
1001 | struct crypt_config *cc = ti->private; | 1362 | struct crypt_config *cc = ti->private; |
1363 | struct crypt_cpu *cpu_cc; | ||
1364 | int cpu; | ||
1002 | 1365 | ||
1003 | ti->private = NULL; | 1366 | ti->private = NULL; |
1004 | 1367 | ||
@@ -1010,6 +1373,14 @@ static void crypt_dtr(struct dm_target *ti) | |||
1010 | if (cc->crypt_queue) | 1373 | if (cc->crypt_queue) |
1011 | destroy_workqueue(cc->crypt_queue); | 1374 | destroy_workqueue(cc->crypt_queue); |
1012 | 1375 | ||
1376 | if (cc->cpu) | ||
1377 | for_each_possible_cpu(cpu) { | ||
1378 | cpu_cc = per_cpu_ptr(cc->cpu, cpu); | ||
1379 | if (cpu_cc->req) | ||
1380 | mempool_free(cpu_cc->req, cc->req_pool); | ||
1381 | crypt_free_tfms(cc, cpu); | ||
1382 | } | ||
1383 | |||
1013 | if (cc->bs) | 1384 | if (cc->bs) |
1014 | bioset_free(cc->bs); | 1385 | bioset_free(cc->bs); |
1015 | 1386 | ||
@@ -1023,14 +1394,14 @@ static void crypt_dtr(struct dm_target *ti) | |||
1023 | if (cc->iv_gen_ops && cc->iv_gen_ops->dtr) | 1394 | if (cc->iv_gen_ops && cc->iv_gen_ops->dtr) |
1024 | cc->iv_gen_ops->dtr(cc); | 1395 | cc->iv_gen_ops->dtr(cc); |
1025 | 1396 | ||
1026 | if (cc->tfm && !IS_ERR(cc->tfm)) | ||
1027 | crypto_free_ablkcipher(cc->tfm); | ||
1028 | |||
1029 | if (cc->dev) | 1397 | if (cc->dev) |
1030 | dm_put_device(ti, cc->dev); | 1398 | dm_put_device(ti, cc->dev); |
1031 | 1399 | ||
1400 | if (cc->cpu) | ||
1401 | free_percpu(cc->cpu); | ||
1402 | |||
1032 | kzfree(cc->cipher); | 1403 | kzfree(cc->cipher); |
1033 | kzfree(cc->cipher_mode); | 1404 | kzfree(cc->cipher_string); |
1034 | 1405 | ||
1035 | /* Must zero key material before freeing */ | 1406 | /* Must zero key material before freeing */ |
1036 | kzfree(cc); | 1407 | kzfree(cc); |
@@ -1040,9 +1411,9 @@ static int crypt_ctr_cipher(struct dm_target *ti, | |||
1040 | char *cipher_in, char *key) | 1411 | char *cipher_in, char *key) |
1041 | { | 1412 | { |
1042 | struct crypt_config *cc = ti->private; | 1413 | struct crypt_config *cc = ti->private; |
1043 | char *tmp, *cipher, *chainmode, *ivmode, *ivopts; | 1414 | char *tmp, *cipher, *chainmode, *ivmode, *ivopts, *keycount; |
1044 | char *cipher_api = NULL; | 1415 | char *cipher_api = NULL; |
1045 | int ret = -EINVAL; | 1416 | int cpu, ret = -EINVAL; |
1046 | 1417 | ||
1047 | /* Convert to crypto api definition? */ | 1418 | /* Convert to crypto api definition? */ |
1048 | if (strchr(cipher_in, '(')) { | 1419 | if (strchr(cipher_in, '(')) { |
@@ -1050,23 +1421,31 @@ static int crypt_ctr_cipher(struct dm_target *ti, | |||
1050 | return -EINVAL; | 1421 | return -EINVAL; |
1051 | } | 1422 | } |
1052 | 1423 | ||
1424 | cc->cipher_string = kstrdup(cipher_in, GFP_KERNEL); | ||
1425 | if (!cc->cipher_string) | ||
1426 | goto bad_mem; | ||
1427 | |||
1053 | /* | 1428 | /* |
1054 | * Legacy dm-crypt cipher specification | 1429 | * Legacy dm-crypt cipher specification |
1055 | * cipher-mode-iv:ivopts | 1430 | * cipher[:keycount]-mode-iv:ivopts |
1056 | */ | 1431 | */ |
1057 | tmp = cipher_in; | 1432 | tmp = cipher_in; |
1058 | cipher = strsep(&tmp, "-"); | 1433 | keycount = strsep(&tmp, "-"); |
1434 | cipher = strsep(&keycount, ":"); | ||
1435 | |||
1436 | if (!keycount) | ||
1437 | cc->tfms_count = 1; | ||
1438 | else if (sscanf(keycount, "%u", &cc->tfms_count) != 1 || | ||
1439 | !is_power_of_2(cc->tfms_count)) { | ||
1440 | ti->error = "Bad cipher key count specification"; | ||
1441 | return -EINVAL; | ||
1442 | } | ||
1443 | cc->key_parts = cc->tfms_count; | ||
1059 | 1444 | ||
1060 | cc->cipher = kstrdup(cipher, GFP_KERNEL); | 1445 | cc->cipher = kstrdup(cipher, GFP_KERNEL); |
1061 | if (!cc->cipher) | 1446 | if (!cc->cipher) |
1062 | goto bad_mem; | 1447 | goto bad_mem; |
1063 | 1448 | ||
1064 | if (tmp) { | ||
1065 | cc->cipher_mode = kstrdup(tmp, GFP_KERNEL); | ||
1066 | if (!cc->cipher_mode) | ||
1067 | goto bad_mem; | ||
1068 | } | ||
1069 | |||
1070 | chainmode = strsep(&tmp, "-"); | 1449 | chainmode = strsep(&tmp, "-"); |
1071 | ivopts = strsep(&tmp, "-"); | 1450 | ivopts = strsep(&tmp, "-"); |
1072 | ivmode = strsep(&ivopts, ":"); | 1451 | ivmode = strsep(&ivopts, ":"); |
@@ -1074,10 +1453,19 @@ static int crypt_ctr_cipher(struct dm_target *ti, | |||
1074 | if (tmp) | 1453 | if (tmp) |
1075 | DMWARN("Ignoring unexpected additional cipher options"); | 1454 | DMWARN("Ignoring unexpected additional cipher options"); |
1076 | 1455 | ||
1077 | /* Compatibility mode for old dm-crypt mappings */ | 1456 | cc->cpu = __alloc_percpu(sizeof(*(cc->cpu)) + |
1457 | cc->tfms_count * sizeof(*(cc->cpu->tfms)), | ||
1458 | __alignof__(struct crypt_cpu)); | ||
1459 | if (!cc->cpu) { | ||
1460 | ti->error = "Cannot allocate per cpu state"; | ||
1461 | goto bad_mem; | ||
1462 | } | ||
1463 | |||
1464 | /* | ||
1465 | * For compatibility with the original dm-crypt mapping format, if | ||
1466 | * only the cipher name is supplied, use cbc-plain. | ||
1467 | */ | ||
1078 | if (!chainmode || (!strcmp(chainmode, "plain") && !ivmode)) { | 1468 | if (!chainmode || (!strcmp(chainmode, "plain") && !ivmode)) { |
1079 | kfree(cc->cipher_mode); | ||
1080 | cc->cipher_mode = kstrdup("cbc-plain", GFP_KERNEL); | ||
1081 | chainmode = "cbc"; | 1469 | chainmode = "cbc"; |
1082 | ivmode = "plain"; | 1470 | ivmode = "plain"; |
1083 | } | 1471 | } |
@@ -1099,11 +1487,12 @@ static int crypt_ctr_cipher(struct dm_target *ti, | |||
1099 | } | 1487 | } |
1100 | 1488 | ||
1101 | /* Allocate cipher */ | 1489 | /* Allocate cipher */ |
1102 | cc->tfm = crypto_alloc_ablkcipher(cipher_api, 0, 0); | 1490 | for_each_possible_cpu(cpu) { |
1103 | if (IS_ERR(cc->tfm)) { | 1491 | ret = crypt_alloc_tfms(cc, cpu, cipher_api); |
1104 | ret = PTR_ERR(cc->tfm); | 1492 | if (ret < 0) { |
1105 | ti->error = "Error allocating crypto tfm"; | 1493 | ti->error = "Error allocating crypto tfm"; |
1106 | goto bad; | 1494 | goto bad; |
1495 | } | ||
1107 | } | 1496 | } |
1108 | 1497 | ||
1109 | /* Initialize and set key */ | 1498 | /* Initialize and set key */ |
@@ -1114,7 +1503,7 @@ static int crypt_ctr_cipher(struct dm_target *ti, | |||
1114 | } | 1503 | } |
1115 | 1504 | ||
1116 | /* Initialize IV */ | 1505 | /* Initialize IV */ |
1117 | cc->iv_size = crypto_ablkcipher_ivsize(cc->tfm); | 1506 | cc->iv_size = crypto_ablkcipher_ivsize(any_tfm(cc)); |
1118 | if (cc->iv_size) | 1507 | if (cc->iv_size) |
1119 | /* at least a 64 bit sector number should fit in our buffer */ | 1508 | /* at least a 64 bit sector number should fit in our buffer */ |
1120 | cc->iv_size = max(cc->iv_size, | 1509 | cc->iv_size = max(cc->iv_size, |
@@ -1137,7 +1526,15 @@ static int crypt_ctr_cipher(struct dm_target *ti, | |||
1137 | cc->iv_gen_ops = &crypt_iv_benbi_ops; | 1526 | cc->iv_gen_ops = &crypt_iv_benbi_ops; |
1138 | else if (strcmp(ivmode, "null") == 0) | 1527 | else if (strcmp(ivmode, "null") == 0) |
1139 | cc->iv_gen_ops = &crypt_iv_null_ops; | 1528 | cc->iv_gen_ops = &crypt_iv_null_ops; |
1140 | else { | 1529 | else if (strcmp(ivmode, "lmk") == 0) { |
1530 | cc->iv_gen_ops = &crypt_iv_lmk_ops; | ||
1531 | /* Versions 2 and 3 are recognised according | ||
1532 | * to the length of the provided multi-key string. | ||
1533 | * If present (version 3), the last key is used as the IV seed. | ||
1534 | */ | ||
1535 | if (cc->key_size % cc->key_parts) | ||
1536 | cc->key_parts++; | ||
1537 | } else { | ||
1141 | ret = -EINVAL; | 1538 | ret = -EINVAL; |
1142 | ti->error = "Invalid IV mode"; | 1539 | ti->error = "Invalid IV mode"; |
1143 | goto bad; | 1540 | goto bad; |
@@ -1194,6 +1591,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
1194 | ti->error = "Cannot allocate encryption context"; | 1591 | ti->error = "Cannot allocate encryption context"; |
1195 | return -ENOMEM; | 1592 | return -ENOMEM; |
1196 | } | 1593 | } |
1594 | cc->key_size = key_size; | ||
1197 | 1595 | ||
1198 | ti->private = cc; | 1596 | ti->private = cc; |
1199 | ret = crypt_ctr_cipher(ti, argv[0], argv[1]); | 1597 | ret = crypt_ctr_cipher(ti, argv[0], argv[1]); |
@@ -1208,9 +1606,9 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
1208 | } | 1606 | } |
1209 | 1607 | ||
1210 | cc->dmreq_start = sizeof(struct ablkcipher_request); | 1608 | cc->dmreq_start = sizeof(struct ablkcipher_request); |
1211 | cc->dmreq_start += crypto_ablkcipher_reqsize(cc->tfm); | 1609 | cc->dmreq_start += crypto_ablkcipher_reqsize(any_tfm(cc)); |
1212 | cc->dmreq_start = ALIGN(cc->dmreq_start, crypto_tfm_ctx_alignment()); | 1610 | cc->dmreq_start = ALIGN(cc->dmreq_start, crypto_tfm_ctx_alignment()); |
1213 | cc->dmreq_start += crypto_ablkcipher_alignmask(cc->tfm) & | 1611 | cc->dmreq_start += crypto_ablkcipher_alignmask(any_tfm(cc)) & |
1214 | ~(crypto_tfm_ctx_alignment() - 1); | 1612 | ~(crypto_tfm_ctx_alignment() - 1); |
1215 | 1613 | ||
1216 | cc->req_pool = mempool_create_kmalloc_pool(MIN_IOS, cc->dmreq_start + | 1614 | cc->req_pool = mempool_create_kmalloc_pool(MIN_IOS, cc->dmreq_start + |
@@ -1219,7 +1617,6 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
1219 | ti->error = "Cannot allocate crypt request mempool"; | 1617 | ti->error = "Cannot allocate crypt request mempool"; |
1220 | goto bad; | 1618 | goto bad; |
1221 | } | 1619 | } |
1222 | cc->req = NULL; | ||
1223 | 1620 | ||
1224 | cc->page_pool = mempool_create_page_pool(MIN_POOL_PAGES, 0); | 1621 | cc->page_pool = mempool_create_page_pool(MIN_POOL_PAGES, 0); |
1225 | if (!cc->page_pool) { | 1622 | if (!cc->page_pool) { |
@@ -1252,13 +1649,20 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
1252 | cc->start = tmpll; | 1649 | cc->start = tmpll; |
1253 | 1650 | ||
1254 | ret = -ENOMEM; | 1651 | ret = -ENOMEM; |
1255 | cc->io_queue = create_singlethread_workqueue("kcryptd_io"); | 1652 | cc->io_queue = alloc_workqueue("kcryptd_io", |
1653 | WQ_NON_REENTRANT| | ||
1654 | WQ_MEM_RECLAIM, | ||
1655 | 1); | ||
1256 | if (!cc->io_queue) { | 1656 | if (!cc->io_queue) { |
1257 | ti->error = "Couldn't create kcryptd io queue"; | 1657 | ti->error = "Couldn't create kcryptd io queue"; |
1258 | goto bad; | 1658 | goto bad; |
1259 | } | 1659 | } |
1260 | 1660 | ||
1261 | cc->crypt_queue = create_singlethread_workqueue("kcryptd"); | 1661 | cc->crypt_queue = alloc_workqueue("kcryptd", |
1662 | WQ_NON_REENTRANT| | ||
1663 | WQ_CPU_INTENSIVE| | ||
1664 | WQ_MEM_RECLAIM, | ||
1665 | 1); | ||
1262 | if (!cc->crypt_queue) { | 1666 | if (!cc->crypt_queue) { |
1263 | ti->error = "Couldn't create kcryptd queue"; | 1667 | ti->error = "Couldn't create kcryptd queue"; |
1264 | goto bad; | 1668 | goto bad; |
@@ -1278,7 +1682,7 @@ static int crypt_map(struct dm_target *ti, struct bio *bio, | |||
1278 | struct dm_crypt_io *io; | 1682 | struct dm_crypt_io *io; |
1279 | struct crypt_config *cc; | 1683 | struct crypt_config *cc; |
1280 | 1684 | ||
1281 | if (unlikely(bio_empty_barrier(bio))) { | 1685 | if (bio->bi_rw & REQ_FLUSH) { |
1282 | cc = ti->private; | 1686 | cc = ti->private; |
1283 | bio->bi_bdev = cc->dev->bdev; | 1687 | bio->bi_bdev = cc->dev->bdev; |
1284 | return DM_MAPIO_REMAPPED; | 1688 | return DM_MAPIO_REMAPPED; |
@@ -1286,9 +1690,10 @@ static int crypt_map(struct dm_target *ti, struct bio *bio, | |||
1286 | 1690 | ||
1287 | io = crypt_io_alloc(ti, bio, dm_target_offset(ti, bio->bi_sector)); | 1691 | io = crypt_io_alloc(ti, bio, dm_target_offset(ti, bio->bi_sector)); |
1288 | 1692 | ||
1289 | if (bio_data_dir(io->base_bio) == READ) | 1693 | if (bio_data_dir(io->base_bio) == READ) { |
1290 | kcryptd_queue_io(io); | 1694 | if (kcryptd_io_read(io, GFP_NOWAIT)) |
1291 | else | 1695 | kcryptd_queue_io(io); |
1696 | } else | ||
1292 | kcryptd_queue_crypt(io); | 1697 | kcryptd_queue_crypt(io); |
1293 | 1698 | ||
1294 | return DM_MAPIO_SUBMITTED; | 1699 | return DM_MAPIO_SUBMITTED; |
@@ -1306,10 +1711,7 @@ static int crypt_status(struct dm_target *ti, status_type_t type, | |||
1306 | break; | 1711 | break; |
1307 | 1712 | ||
1308 | case STATUSTYPE_TABLE: | 1713 | case STATUSTYPE_TABLE: |
1309 | if (cc->cipher_mode) | 1714 | DMEMIT("%s ", cc->cipher_string); |
1310 | DMEMIT("%s-%s ", cc->cipher, cc->cipher_mode); | ||
1311 | else | ||
1312 | DMEMIT("%s ", cc->cipher); | ||
1313 | 1715 | ||
1314 | if (cc->key_size > 0) { | 1716 | if (cc->key_size > 0) { |
1315 | if ((maxlen - sz) < ((cc->key_size << 1) + 1)) | 1717 | if ((maxlen - sz) < ((cc->key_size << 1) + 1)) |
@@ -1421,7 +1823,7 @@ static int crypt_iterate_devices(struct dm_target *ti, | |||
1421 | 1823 | ||
1422 | static struct target_type crypt_target = { | 1824 | static struct target_type crypt_target = { |
1423 | .name = "crypt", | 1825 | .name = "crypt", |
1424 | .version = {1, 7, 0}, | 1826 | .version = {1, 10, 0}, |
1425 | .module = THIS_MODULE, | 1827 | .module = THIS_MODULE, |
1426 | .ctr = crypt_ctr, | 1828 | .ctr = crypt_ctr, |
1427 | .dtr = crypt_dtr, | 1829 | .dtr = crypt_dtr, |
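The constructor changes above extend the legacy cipher specification to cipher[:keycount]-chainmode-ivmode[:ivopts]: keycount must be a power of two and selects how many independent tfms (key parts) are allocated per CPU, and with the new lmk IV generator an optional extra trailing key, detected from the key length, is used as the IV seed. As an illustration only (device path, sector count and key are placeholders, not taken from this patch), a crypt table line using the extended syntax would look roughly like:

    0 2097152 crypt aes:64-cbc-lmk <64 concatenated hex subkeys> 0 /dev/sdb 0

The parameter order (cipher, key, iv_offset, device, offset) is unchanged from earlier dm-crypt versions; only the cipher field gains the optional :keycount and the lmk ivmode, which crypt_status now reports back verbatim from cc->cipher_string.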
diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c index baa11912cc94..f18375dcedd9 100644 --- a/drivers/md/dm-delay.c +++ b/drivers/md/dm-delay.c | |||
@@ -352,7 +352,7 @@ static int __init dm_delay_init(void) | |||
352 | { | 352 | { |
353 | int r = -ENOMEM; | 353 | int r = -ENOMEM; |
354 | 354 | ||
355 | kdelayd_wq = create_workqueue("kdelayd"); | 355 | kdelayd_wq = alloc_workqueue("kdelayd", WQ_MEM_RECLAIM, 0); |
356 | if (!kdelayd_wq) { | 356 | if (!kdelayd_wq) { |
357 | DMERR("Couldn't start kdelayd"); | 357 | DMERR("Couldn't start kdelayd"); |
358 | goto bad_queue; | 358 | goto bad_queue; |
diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c new file mode 100644 index 000000000000..ea790623c30b --- /dev/null +++ b/drivers/md/dm-flakey.c | |||
@@ -0,0 +1,212 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2003 Sistina Software (UK) Limited. | ||
3 | * Copyright (C) 2004, 2010 Red Hat, Inc. All rights reserved. | ||
4 | * | ||
5 | * This file is released under the GPL. | ||
6 | */ | ||
7 | |||
8 | #include <linux/device-mapper.h> | ||
9 | |||
10 | #include <linux/module.h> | ||
11 | #include <linux/init.h> | ||
12 | #include <linux/blkdev.h> | ||
13 | #include <linux/bio.h> | ||
14 | #include <linux/slab.h> | ||
15 | |||
16 | #define DM_MSG_PREFIX "flakey" | ||
17 | |||
18 | /* | ||
19 | * Flakey: Used for testing only, simulates intermittent, | ||
20 | * catastrophic device failure. | ||
21 | */ | ||
22 | struct flakey_c { | ||
23 | struct dm_dev *dev; | ||
24 | unsigned long start_time; | ||
25 | sector_t start; | ||
26 | unsigned up_interval; | ||
27 | unsigned down_interval; | ||
28 | }; | ||
29 | |||
30 | /* | ||
31 | * Construct a flakey mapping: <dev_path> <offset> <up interval> <down interval> | ||
32 | */ | ||
33 | static int flakey_ctr(struct dm_target *ti, unsigned int argc, char **argv) | ||
34 | { | ||
35 | struct flakey_c *fc; | ||
36 | unsigned long long tmp; | ||
37 | |||
38 | if (argc != 4) { | ||
39 | ti->error = "dm-flakey: Invalid argument count"; | ||
40 | return -EINVAL; | ||
41 | } | ||
42 | |||
43 | fc = kmalloc(sizeof(*fc), GFP_KERNEL); | ||
44 | if (!fc) { | ||
45 | ti->error = "dm-flakey: Cannot allocate linear context"; | ||
46 | return -ENOMEM; | ||
47 | } | ||
48 | fc->start_time = jiffies; | ||
49 | |||
50 | if (sscanf(argv[1], "%llu", &tmp) != 1) { | ||
51 | ti->error = "dm-flakey: Invalid device sector"; | ||
52 | goto bad; | ||
53 | } | ||
54 | fc->start = tmp; | ||
55 | |||
56 | if (sscanf(argv[2], "%u", &fc->up_interval) != 1) { | ||
57 | ti->error = "dm-flakey: Invalid up interval"; | ||
58 | goto bad; | ||
59 | } | ||
60 | |||
61 | if (sscanf(argv[3], "%u", &fc->down_interval) != 1) { | ||
62 | ti->error = "dm-flakey: Invalid down interval"; | ||
63 | goto bad; | ||
64 | } | ||
65 | |||
66 | if (!(fc->up_interval + fc->down_interval)) { | ||
67 | ti->error = "dm-flakey: Total (up + down) interval is zero"; | ||
68 | goto bad; | ||
69 | } | ||
70 | |||
71 | if (fc->up_interval + fc->down_interval < fc->up_interval) { | ||
72 | ti->error = "dm-flakey: Interval overflow"; | ||
73 | goto bad; | ||
74 | } | ||
75 | |||
76 | if (dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &fc->dev)) { | ||
77 | ti->error = "dm-flakey: Device lookup failed"; | ||
78 | goto bad; | ||
79 | } | ||
80 | |||
81 | ti->num_flush_requests = 1; | ||
82 | ti->private = fc; | ||
83 | return 0; | ||
84 | |||
85 | bad: | ||
86 | kfree(fc); | ||
87 | return -EINVAL; | ||
88 | } | ||
89 | |||
90 | static void flakey_dtr(struct dm_target *ti) | ||
91 | { | ||
92 | struct flakey_c *fc = ti->private; | ||
93 | |||
94 | dm_put_device(ti, fc->dev); | ||
95 | kfree(fc); | ||
96 | } | ||
97 | |||
98 | static sector_t flakey_map_sector(struct dm_target *ti, sector_t bi_sector) | ||
99 | { | ||
100 | struct flakey_c *fc = ti->private; | ||
101 | |||
102 | return fc->start + (bi_sector - ti->begin); | ||
103 | } | ||
104 | |||
105 | static void flakey_map_bio(struct dm_target *ti, struct bio *bio) | ||
106 | { | ||
107 | struct flakey_c *fc = ti->private; | ||
108 | |||
109 | bio->bi_bdev = fc->dev->bdev; | ||
110 | if (bio_sectors(bio)) | ||
111 | bio->bi_sector = flakey_map_sector(ti, bio->bi_sector); | ||
112 | } | ||
113 | |||
114 | static int flakey_map(struct dm_target *ti, struct bio *bio, | ||
115 | union map_info *map_context) | ||
116 | { | ||
117 | struct flakey_c *fc = ti->private; | ||
118 | unsigned elapsed; | ||
119 | |||
120 | /* Are we alive ? */ | ||
121 | elapsed = (jiffies - fc->start_time) / HZ; | ||
122 | if (elapsed % (fc->up_interval + fc->down_interval) >= fc->up_interval) | ||
123 | return -EIO; | ||
124 | |||
125 | flakey_map_bio(ti, bio); | ||
126 | |||
127 | return DM_MAPIO_REMAPPED; | ||
128 | } | ||
129 | |||
130 | static int flakey_status(struct dm_target *ti, status_type_t type, | ||
131 | char *result, unsigned int maxlen) | ||
132 | { | ||
133 | struct flakey_c *fc = ti->private; | ||
134 | |||
135 | switch (type) { | ||
136 | case STATUSTYPE_INFO: | ||
137 | result[0] = '\0'; | ||
138 | break; | ||
139 | |||
140 | case STATUSTYPE_TABLE: | ||
141 | snprintf(result, maxlen, "%s %llu %u %u", fc->dev->name, | ||
142 | (unsigned long long)fc->start, fc->up_interval, | ||
143 | fc->down_interval); | ||
144 | break; | ||
145 | } | ||
146 | return 0; | ||
147 | } | ||
148 | |||
149 | static int flakey_ioctl(struct dm_target *ti, unsigned int cmd, unsigned long arg) | ||
150 | { | ||
151 | struct flakey_c *fc = ti->private; | ||
152 | |||
153 | return __blkdev_driver_ioctl(fc->dev->bdev, fc->dev->mode, cmd, arg); | ||
154 | } | ||
155 | |||
156 | static int flakey_merge(struct dm_target *ti, struct bvec_merge_data *bvm, | ||
157 | struct bio_vec *biovec, int max_size) | ||
158 | { | ||
159 | struct flakey_c *fc = ti->private; | ||
160 | struct request_queue *q = bdev_get_queue(fc->dev->bdev); | ||
161 | |||
162 | if (!q->merge_bvec_fn) | ||
163 | return max_size; | ||
164 | |||
165 | bvm->bi_bdev = fc->dev->bdev; | ||
166 | bvm->bi_sector = flakey_map_sector(ti, bvm->bi_sector); | ||
167 | |||
168 | return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); | ||
169 | } | ||
170 | |||
171 | static int flakey_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data) | ||
172 | { | ||
173 | struct flakey_c *fc = ti->private; | ||
174 | |||
175 | return fn(ti, fc->dev, fc->start, ti->len, data); | ||
176 | } | ||
177 | |||
178 | static struct target_type flakey_target = { | ||
179 | .name = "flakey", | ||
180 | .version = {1, 1, 0}, | ||
181 | .module = THIS_MODULE, | ||
182 | .ctr = flakey_ctr, | ||
183 | .dtr = flakey_dtr, | ||
184 | .map = flakey_map, | ||
185 | .status = flakey_status, | ||
186 | .ioctl = flakey_ioctl, | ||
187 | .merge = flakey_merge, | ||
188 | .iterate_devices = flakey_iterate_devices, | ||
189 | }; | ||
190 | |||
191 | static int __init dm_flakey_init(void) | ||
192 | { | ||
193 | int r = dm_register_target(&flakey_target); | ||
194 | |||
195 | if (r < 0) | ||
196 | DMERR("register failed %d", r); | ||
197 | |||
198 | return r; | ||
199 | } | ||
200 | |||
201 | static void __exit dm_flakey_exit(void) | ||
202 | { | ||
203 | dm_unregister_target(&flakey_target); | ||
204 | } | ||
205 | |||
206 | /* Module hooks */ | ||
207 | module_init(dm_flakey_init); | ||
208 | module_exit(dm_flakey_exit); | ||
209 | |||
210 | MODULE_DESCRIPTION(DM_NAME " flakey target"); | ||
211 | MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>"); | ||
212 | MODULE_LICENSE("GPL"); | ||
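For orientation, the flakey constructor takes <dev_path> <offset> <up interval> <down interval>, with both intervals in seconds (flakey_map divides elapsed jiffies by HZ). A hypothetical mapping that passes I/O through for 30 seconds and then returns -EIO for 5 seconds, repeating, could be created along these lines (device name and size are invented for the example):

    echo "0 409600 flakey /dev/sdc 0 30 5" | dmsetup create flaky0

Both intervals count from the moment the target is constructed (fc->start_time = jiffies), so with a non-zero up interval the device starts in the up state.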
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c index 0590c75b0ab6..2067288f61f9 100644 --- a/drivers/md/dm-io.c +++ b/drivers/md/dm-io.c | |||
@@ -19,6 +19,8 @@ | |||
19 | #define DM_MSG_PREFIX "io" | 19 | #define DM_MSG_PREFIX "io" |
20 | 20 | ||
21 | #define DM_IO_MAX_REGIONS BITS_PER_LONG | 21 | #define DM_IO_MAX_REGIONS BITS_PER_LONG |
22 | #define MIN_IOS 16 | ||
23 | #define MIN_BIOS 16 | ||
22 | 24 | ||
23 | struct dm_io_client { | 25 | struct dm_io_client { |
24 | mempool_t *pool; | 26 | mempool_t *pool; |
@@ -31,7 +33,6 @@ struct dm_io_client { | |||
31 | */ | 33 | */ |
32 | struct io { | 34 | struct io { |
33 | unsigned long error_bits; | 35 | unsigned long error_bits; |
34 | unsigned long eopnotsupp_bits; | ||
35 | atomic_t count; | 36 | atomic_t count; |
36 | struct task_struct *sleeper; | 37 | struct task_struct *sleeper; |
37 | struct dm_io_client *client; | 38 | struct dm_io_client *client; |
@@ -42,33 +43,21 @@ struct io { | |||
42 | static struct kmem_cache *_dm_io_cache; | 43 | static struct kmem_cache *_dm_io_cache; |
43 | 44 | ||
44 | /* | 45 | /* |
45 | * io contexts are only dynamically allocated for asynchronous | ||
46 | * io. Since async io is likely to be the majority of io we'll | ||
47 | * have the same number of io contexts as bios! (FIXME: must reduce this). | ||
48 | */ | ||
49 | |||
50 | static unsigned int pages_to_ios(unsigned int pages) | ||
51 | { | ||
52 | return 4 * pages; /* too many ? */ | ||
53 | } | ||
54 | |||
55 | /* | ||
56 | * Create a client with mempool and bioset. | 46 | * Create a client with mempool and bioset. |
57 | */ | 47 | */ |
58 | struct dm_io_client *dm_io_client_create(unsigned num_pages) | 48 | struct dm_io_client *dm_io_client_create(void) |
59 | { | 49 | { |
60 | unsigned ios = pages_to_ios(num_pages); | ||
61 | struct dm_io_client *client; | 50 | struct dm_io_client *client; |
62 | 51 | ||
63 | client = kmalloc(sizeof(*client), GFP_KERNEL); | 52 | client = kmalloc(sizeof(*client), GFP_KERNEL); |
64 | if (!client) | 53 | if (!client) |
65 | return ERR_PTR(-ENOMEM); | 54 | return ERR_PTR(-ENOMEM); |
66 | 55 | ||
67 | client->pool = mempool_create_slab_pool(ios, _dm_io_cache); | 56 | client->pool = mempool_create_slab_pool(MIN_IOS, _dm_io_cache); |
68 | if (!client->pool) | 57 | if (!client->pool) |
69 | goto bad; | 58 | goto bad; |
70 | 59 | ||
71 | client->bios = bioset_create(16, 0); | 60 | client->bios = bioset_create(MIN_BIOS, 0); |
72 | if (!client->bios) | 61 | if (!client->bios) |
73 | goto bad; | 62 | goto bad; |
74 | 63 | ||
@@ -82,13 +71,6 @@ struct dm_io_client *dm_io_client_create(unsigned num_pages) | |||
82 | } | 71 | } |
83 | EXPORT_SYMBOL(dm_io_client_create); | 72 | EXPORT_SYMBOL(dm_io_client_create); |
84 | 73 | ||
85 | int dm_io_client_resize(unsigned num_pages, struct dm_io_client *client) | ||
86 | { | ||
87 | return mempool_resize(client->pool, pages_to_ios(num_pages), | ||
88 | GFP_KERNEL); | ||
89 | } | ||
90 | EXPORT_SYMBOL(dm_io_client_resize); | ||
91 | |||
92 | void dm_io_client_destroy(struct dm_io_client *client) | 74 | void dm_io_client_destroy(struct dm_io_client *client) |
93 | { | 75 | { |
94 | mempool_destroy(client->pool); | 76 | mempool_destroy(client->pool); |
@@ -130,11 +112,8 @@ static void retrieve_io_and_region_from_bio(struct bio *bio, struct io **io, | |||
130 | *---------------------------------------------------------------*/ | 112 | *---------------------------------------------------------------*/ |
131 | static void dec_count(struct io *io, unsigned int region, int error) | 113 | static void dec_count(struct io *io, unsigned int region, int error) |
132 | { | 114 | { |
133 | if (error) { | 115 | if (error) |
134 | set_bit(region, &io->error_bits); | 116 | set_bit(region, &io->error_bits); |
135 | if (error == -EOPNOTSUPP) | ||
136 | set_bit(region, &io->eopnotsupp_bits); | ||
137 | } | ||
138 | 117 | ||
139 | if (atomic_dec_and_test(&io->count)) { | 118 | if (atomic_dec_and_test(&io->count)) { |
140 | if (io->sleeper) | 119 | if (io->sleeper) |
@@ -310,8 +289,8 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where, | |||
310 | sector_t remaining = where->count; | 289 | sector_t remaining = where->count; |
311 | 290 | ||
312 | /* | 291 | /* |
313 | * where->count may be zero if rw holds a write barrier and we | 292 | * where->count may be zero if rw holds a flush and we need to |
314 | * need to send a zero-sized barrier. | 293 | * send a zero-sized flush. |
315 | */ | 294 | */ |
316 | do { | 295 | do { |
317 | /* | 296 | /* |
@@ -356,7 +335,7 @@ static void dispatch_io(int rw, unsigned int num_regions, | |||
356 | BUG_ON(num_regions > DM_IO_MAX_REGIONS); | 335 | BUG_ON(num_regions > DM_IO_MAX_REGIONS); |
357 | 336 | ||
358 | if (sync) | 337 | if (sync) |
359 | rw |= REQ_SYNC | REQ_UNPLUG; | 338 | rw |= REQ_SYNC; |
360 | 339 | ||
361 | /* | 340 | /* |
362 | * For multiple regions we need to be careful to rewind | 341 | * For multiple regions we need to be careful to rewind |
@@ -364,7 +343,7 @@ static void dispatch_io(int rw, unsigned int num_regions, | |||
364 | */ | 343 | */ |
365 | for (i = 0; i < num_regions; i++) { | 344 | for (i = 0; i < num_regions; i++) { |
366 | *dp = old_pages; | 345 | *dp = old_pages; |
367 | if (where[i].count || (rw & REQ_HARDBARRIER)) | 346 | if (where[i].count || (rw & REQ_FLUSH)) |
368 | do_region(rw, i, where + i, dp, io); | 347 | do_region(rw, i, where + i, dp, io); |
369 | } | 348 | } |
370 | 349 | ||
@@ -393,9 +372,7 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions, | |||
393 | return -EIO; | 372 | return -EIO; |
394 | } | 373 | } |
395 | 374 | ||
396 | retry: | ||
397 | io->error_bits = 0; | 375 | io->error_bits = 0; |
398 | io->eopnotsupp_bits = 0; | ||
399 | atomic_set(&io->count, 1); /* see dispatch_io() */ | 376 | atomic_set(&io->count, 1); /* see dispatch_io() */ |
400 | io->sleeper = current; | 377 | io->sleeper = current; |
401 | io->client = client; | 378 | io->client = client; |
@@ -412,11 +389,6 @@ retry: | |||
412 | } | 389 | } |
413 | set_current_state(TASK_RUNNING); | 390 | set_current_state(TASK_RUNNING); |
414 | 391 | ||
415 | if (io->eopnotsupp_bits && (rw & REQ_HARDBARRIER)) { | ||
416 | rw &= ~REQ_HARDBARRIER; | ||
417 | goto retry; | ||
418 | } | ||
419 | |||
420 | if (error_bits) | 392 | if (error_bits) |
421 | *error_bits = io->error_bits; | 393 | *error_bits = io->error_bits; |
422 | 394 | ||
@@ -437,7 +409,6 @@ static int async_io(struct dm_io_client *client, unsigned int num_regions, | |||
437 | 409 | ||
438 | io = mempool_alloc(client->pool, GFP_NOIO); | 410 | io = mempool_alloc(client->pool, GFP_NOIO); |
439 | io->error_bits = 0; | 411 | io->error_bits = 0; |
440 | io->eopnotsupp_bits = 0; | ||
441 | atomic_set(&io->count, 1); /* see dispatch_io() */ | 412 | atomic_set(&io->count, 1); /* see dispatch_io() */ |
442 | io->sleeper = NULL; | 413 | io->sleeper = NULL; |
443 | io->client = client; | 414 | io->client = client; |
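A minimal caller-side sketch of the reworked dm-io client interface (the surrounding target code is hypothetical; only the dm-io calls come from this patch): dm_io_client_create() no longer takes a page estimate and always reserves MIN_IOS entries, and dm_io_client_resize() is removed, so callers simply drop that call.

    struct dm_io_client *client;

    client = dm_io_client_create();        /* was dm_io_client_create(num_pages) */
    if (IS_ERR(client))
            return PTR_ERR(client);
    /* ... issue dm_io() requests against client ... */
    dm_io_client_destroy(client);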
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index 3e39193e5036..4cacdad2270a 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c | |||
@@ -295,19 +295,55 @@ retry: | |||
295 | DMWARN("remove_all left %d open device(s)", dev_skipped); | 295 | DMWARN("remove_all left %d open device(s)", dev_skipped); |
296 | } | 296 | } |
297 | 297 | ||
298 | /* | ||
300 | * Set the uuid of a hash_cell whose uuid is not already set. | ||
300 | */ | ||
301 | static void __set_cell_uuid(struct hash_cell *hc, char *new_uuid) | ||
302 | { | ||
303 | mutex_lock(&dm_hash_cells_mutex); | ||
304 | hc->uuid = new_uuid; | ||
305 | mutex_unlock(&dm_hash_cells_mutex); | ||
306 | |||
307 | list_add(&hc->uuid_list, _uuid_buckets + hash_str(new_uuid)); | ||
308 | } | ||
309 | |||
310 | /* | ||
311 | * Changes the name of a hash_cell and returns the old name for | ||
312 | * the caller to free. | ||
313 | */ | ||
314 | static char *__change_cell_name(struct hash_cell *hc, char *new_name) | ||
315 | { | ||
316 | char *old_name; | ||
317 | |||
318 | /* | ||
319 | * Rename and move the name cell. | ||
320 | */ | ||
321 | list_del(&hc->name_list); | ||
322 | old_name = hc->name; | ||
323 | |||
324 | mutex_lock(&dm_hash_cells_mutex); | ||
325 | hc->name = new_name; | ||
326 | mutex_unlock(&dm_hash_cells_mutex); | ||
327 | |||
328 | list_add(&hc->name_list, _name_buckets + hash_str(new_name)); | ||
329 | |||
330 | return old_name; | ||
331 | } | ||
332 | |||
298 | static struct mapped_device *dm_hash_rename(struct dm_ioctl *param, | 333 | static struct mapped_device *dm_hash_rename(struct dm_ioctl *param, |
299 | const char *new) | 334 | const char *new) |
300 | { | 335 | { |
301 | char *new_name, *old_name; | 336 | char *new_data, *old_name = NULL; |
302 | struct hash_cell *hc; | 337 | struct hash_cell *hc; |
303 | struct dm_table *table; | 338 | struct dm_table *table; |
304 | struct mapped_device *md; | 339 | struct mapped_device *md; |
340 | unsigned change_uuid = (param->flags & DM_UUID_FLAG) ? 1 : 0; | ||
305 | 341 | ||
306 | /* | 342 | /* |
307 | * duplicate new. | 343 | * duplicate new. |
308 | */ | 344 | */ |
309 | new_name = kstrdup(new, GFP_KERNEL); | 345 | new_data = kstrdup(new, GFP_KERNEL); |
310 | if (!new_name) | 346 | if (!new_data) |
311 | return ERR_PTR(-ENOMEM); | 347 | return ERR_PTR(-ENOMEM); |
312 | 348 | ||
313 | down_write(&_hash_lock); | 349 | down_write(&_hash_lock); |
@@ -315,13 +351,19 @@ static struct mapped_device *dm_hash_rename(struct dm_ioctl *param, | |||
315 | /* | 351 | /* |
316 | * Is new free ? | 352 | * Is new free ? |
317 | */ | 353 | */ |
318 | hc = __get_name_cell(new); | 354 | if (change_uuid) |
355 | hc = __get_uuid_cell(new); | ||
356 | else | ||
357 | hc = __get_name_cell(new); | ||
358 | |||
319 | if (hc) { | 359 | if (hc) { |
320 | DMWARN("asked to rename to an already-existing name %s -> %s", | 360 | DMWARN("Unable to change %s on mapped device %s to one that " |
361 | "already exists: %s", | ||
362 | change_uuid ? "uuid" : "name", | ||
321 | param->name, new); | 363 | param->name, new); |
322 | dm_put(hc->md); | 364 | dm_put(hc->md); |
323 | up_write(&_hash_lock); | 365 | up_write(&_hash_lock); |
324 | kfree(new_name); | 366 | kfree(new_data); |
325 | return ERR_PTR(-EBUSY); | 367 | return ERR_PTR(-EBUSY); |
326 | } | 368 | } |
327 | 369 | ||
@@ -330,22 +372,30 @@ static struct mapped_device *dm_hash_rename(struct dm_ioctl *param, | |||
330 | */ | 372 | */ |
331 | hc = __get_name_cell(param->name); | 373 | hc = __get_name_cell(param->name); |
332 | if (!hc) { | 374 | if (!hc) { |
333 | DMWARN("asked to rename a non-existent device %s -> %s", | 375 | DMWARN("Unable to rename non-existent device, %s to %s%s", |
334 | param->name, new); | 376 | param->name, change_uuid ? "uuid " : "", new); |
335 | up_write(&_hash_lock); | 377 | up_write(&_hash_lock); |
336 | kfree(new_name); | 378 | kfree(new_data); |
337 | return ERR_PTR(-ENXIO); | 379 | return ERR_PTR(-ENXIO); |
338 | } | 380 | } |
339 | 381 | ||
340 | /* | 382 | /* |
341 | * rename and move the name cell. | 383 | * Does this device already have a uuid? |
342 | */ | 384 | */ |
343 | list_del(&hc->name_list); | 385 | if (change_uuid && hc->uuid) { |
344 | old_name = hc->name; | 386 | DMWARN("Unable to change uuid of mapped device %s to %s " |
345 | mutex_lock(&dm_hash_cells_mutex); | 387 | "because uuid is already set to %s", |
346 | hc->name = new_name; | 388 | param->name, new, hc->uuid); |
347 | mutex_unlock(&dm_hash_cells_mutex); | 389 | dm_put(hc->md); |
348 | list_add(&hc->name_list, _name_buckets + hash_str(new_name)); | 390 | up_write(&_hash_lock); |
391 | kfree(new_data); | ||
392 | return ERR_PTR(-EINVAL); | ||
393 | } | ||
394 | |||
395 | if (change_uuid) | ||
396 | __set_cell_uuid(hc, new_data); | ||
397 | else | ||
398 | old_name = __change_cell_name(hc, new_data); | ||
349 | 399 | ||
350 | /* | 400 | /* |
351 | * Wake up any dm event waiters. | 401 | * Wake up any dm event waiters. |
@@ -729,7 +779,7 @@ static int dev_remove(struct dm_ioctl *param, size_t param_size) | |||
729 | hc = __find_device_hash_cell(param); | 779 | hc = __find_device_hash_cell(param); |
730 | 780 | ||
731 | if (!hc) { | 781 | if (!hc) { |
732 | DMWARN("device doesn't appear to be in the dev hash table."); | 782 | DMDEBUG_LIMIT("device doesn't appear to be in the dev hash table."); |
733 | up_write(&_hash_lock); | 783 | up_write(&_hash_lock); |
734 | return -ENXIO; | 784 | return -ENXIO; |
735 | } | 785 | } |
@@ -741,7 +791,7 @@ static int dev_remove(struct dm_ioctl *param, size_t param_size) | |||
741 | */ | 791 | */ |
742 | r = dm_lock_for_deletion(md); | 792 | r = dm_lock_for_deletion(md); |
743 | if (r) { | 793 | if (r) { |
744 | DMWARN("unable to remove open device %s", hc->name); | 794 | DMDEBUG_LIMIT("unable to remove open device %s", hc->name); |
745 | up_write(&_hash_lock); | 795 | up_write(&_hash_lock); |
746 | dm_put(md); | 796 | dm_put(md); |
747 | return r; | 797 | return r; |
@@ -774,21 +824,24 @@ static int invalid_str(char *str, void *end) | |||
774 | static int dev_rename(struct dm_ioctl *param, size_t param_size) | 824 | static int dev_rename(struct dm_ioctl *param, size_t param_size) |
775 | { | 825 | { |
776 | int r; | 826 | int r; |
777 | char *new_name = (char *) param + param->data_start; | 827 | char *new_data = (char *) param + param->data_start; |
778 | struct mapped_device *md; | 828 | struct mapped_device *md; |
829 | unsigned change_uuid = (param->flags & DM_UUID_FLAG) ? 1 : 0; | ||
779 | 830 | ||
780 | if (new_name < param->data || | 831 | if (new_data < param->data || |
781 | invalid_str(new_name, (void *) param + param_size) || | 832 | invalid_str(new_data, (void *) param + param_size) || |
782 | strlen(new_name) > DM_NAME_LEN - 1) { | 833 | strlen(new_data) > (change_uuid ? DM_UUID_LEN - 1 : DM_NAME_LEN - 1)) { |
783 | DMWARN("Invalid new logical volume name supplied."); | 834 | DMWARN("Invalid new mapped device name or uuid string supplied."); |
784 | return -EINVAL; | 835 | return -EINVAL; |
785 | } | 836 | } |
786 | 837 | ||
787 | r = check_name(new_name); | 838 | if (!change_uuid) { |
788 | if (r) | 839 | r = check_name(new_data); |
789 | return r; | 840 | if (r) |
841 | return r; | ||
842 | } | ||
790 | 843 | ||
791 | md = dm_hash_rename(param, new_name); | 844 | md = dm_hash_rename(param, new_data); |
792 | if (IS_ERR(md)) | 845 | if (IS_ERR(md)) |
793 | return PTR_ERR(md); | 846 | return PTR_ERR(md); |
794 | 847 | ||
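With DM_UUID_FLAG set, the same DM_DEV_RENAME path now assigns a uuid to a device that does not yet have one instead of renaming it. A rough userspace sketch, assuming the DM_UUID_FLAG/DM_DEV_RENAME definitions from the matching dm-ioctl.h update (not shown here); the buffer, device name and uuid are placeholders, and version/error handling is omitted:

    struct dm_ioctl *dmi = buf;                        /* header followed by data area */
    strncpy(dmi->name, "test-dev", sizeof(dmi->name)); /* existing device name */
    dmi->flags |= DM_UUID_FLAG;                        /* payload is a uuid, not a new name */
    strcpy((char *)dmi + dmi->data_start, "TEST-UUID-0001");
    ioctl(control_fd, DM_DEV_RENAME, dmi);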
@@ -885,7 +938,7 @@ static int do_resume(struct dm_ioctl *param) | |||
885 | 938 | ||
886 | hc = __find_device_hash_cell(param); | 939 | hc = __find_device_hash_cell(param); |
887 | if (!hc) { | 940 | if (!hc) { |
888 | DMWARN("device doesn't appear to be in the dev hash table."); | 941 | DMDEBUG_LIMIT("device doesn't appear to be in the dev hash table."); |
889 | up_write(&_hash_lock); | 942 | up_write(&_hash_lock); |
890 | return -ENXIO; | 943 | return -ENXIO; |
891 | } | 944 | } |
@@ -1212,7 +1265,7 @@ static int table_clear(struct dm_ioctl *param, size_t param_size) | |||
1212 | 1265 | ||
1213 | hc = __find_device_hash_cell(param); | 1266 | hc = __find_device_hash_cell(param); |
1214 | if (!hc) { | 1267 | if (!hc) { |
1215 | DMWARN("device doesn't appear to be in the dev hash table."); | 1268 | DMDEBUG_LIMIT("device doesn't appear to be in the dev hash table."); |
1216 | up_write(&_hash_lock); | 1269 | up_write(&_hash_lock); |
1217 | return -ENXIO; | 1270 | return -ENXIO; |
1218 | } | 1271 | } |
@@ -1448,14 +1501,10 @@ static int check_version(unsigned int cmd, struct dm_ioctl __user *user) | |||
1448 | return r; | 1501 | return r; |
1449 | } | 1502 | } |
1450 | 1503 | ||
1451 | static void free_params(struct dm_ioctl *param) | ||
1452 | { | ||
1453 | vfree(param); | ||
1454 | } | ||
1455 | |||
1456 | static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl **param) | 1504 | static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl **param) |
1457 | { | 1505 | { |
1458 | struct dm_ioctl tmp, *dmi; | 1506 | struct dm_ioctl tmp, *dmi; |
1507 | int secure_data; | ||
1459 | 1508 | ||
1460 | if (copy_from_user(&tmp, user, sizeof(tmp) - sizeof(tmp.data))) | 1509 | if (copy_from_user(&tmp, user, sizeof(tmp) - sizeof(tmp.data))) |
1461 | return -EFAULT; | 1510 | return -EFAULT; |
@@ -1463,17 +1512,30 @@ static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl **param) | |||
1463 | if (tmp.data_size < (sizeof(tmp) - sizeof(tmp.data))) | 1512 | if (tmp.data_size < (sizeof(tmp) - sizeof(tmp.data))) |
1464 | return -EINVAL; | 1513 | return -EINVAL; |
1465 | 1514 | ||
1515 | secure_data = tmp.flags & DM_SECURE_DATA_FLAG; | ||
1516 | |||
1466 | dmi = vmalloc(tmp.data_size); | 1517 | dmi = vmalloc(tmp.data_size); |
1467 | if (!dmi) | 1518 | if (!dmi) { |
1519 | if (secure_data && clear_user(user, tmp.data_size)) | ||
1520 | return -EFAULT; | ||
1468 | return -ENOMEM; | 1521 | return -ENOMEM; |
1469 | |||
1470 | if (copy_from_user(dmi, user, tmp.data_size)) { | ||
1471 | vfree(dmi); | ||
1472 | return -EFAULT; | ||
1473 | } | 1522 | } |
1474 | 1523 | ||
1524 | if (copy_from_user(dmi, user, tmp.data_size)) | ||
1525 | goto bad; | ||
1526 | |||
1527 | /* Wipe the user buffer so we do not return it to userspace */ | ||
1528 | if (secure_data && clear_user(user, tmp.data_size)) | ||
1529 | goto bad; | ||
1530 | |||
1475 | *param = dmi; | 1531 | *param = dmi; |
1476 | return 0; | 1532 | return 0; |
1533 | |||
1534 | bad: | ||
1535 | if (secure_data) | ||
1536 | memset(dmi, 0, tmp.data_size); | ||
1537 | vfree(dmi); | ||
1538 | return -EFAULT; | ||
1477 | } | 1539 | } |
1478 | 1540 | ||
1479 | static int validate_params(uint cmd, struct dm_ioctl *param) | 1541 | static int validate_params(uint cmd, struct dm_ioctl *param) |
@@ -1481,6 +1543,7 @@ static int validate_params(uint cmd, struct dm_ioctl *param) | |||
1481 | /* Always clear this flag */ | 1543 | /* Always clear this flag */ |
1482 | param->flags &= ~DM_BUFFER_FULL_FLAG; | 1544 | param->flags &= ~DM_BUFFER_FULL_FLAG; |
1483 | param->flags &= ~DM_UEVENT_GENERATED_FLAG; | 1545 | param->flags &= ~DM_UEVENT_GENERATED_FLAG; |
1546 | param->flags &= ~DM_SECURE_DATA_FLAG; | ||
1484 | 1547 | ||
1485 | /* Ignores parameters */ | 1548 | /* Ignores parameters */ |
1486 | if (cmd == DM_REMOVE_ALL_CMD || | 1549 | if (cmd == DM_REMOVE_ALL_CMD || |
@@ -1508,10 +1571,11 @@ static int validate_params(uint cmd, struct dm_ioctl *param) | |||
1508 | static int ctl_ioctl(uint command, struct dm_ioctl __user *user) | 1571 | static int ctl_ioctl(uint command, struct dm_ioctl __user *user) |
1509 | { | 1572 | { |
1510 | int r = 0; | 1573 | int r = 0; |
1574 | int wipe_buffer; | ||
1511 | unsigned int cmd; | 1575 | unsigned int cmd; |
1512 | struct dm_ioctl *uninitialized_var(param); | 1576 | struct dm_ioctl *uninitialized_var(param); |
1513 | ioctl_fn fn = NULL; | 1577 | ioctl_fn fn = NULL; |
1514 | size_t param_size; | 1578 | size_t input_param_size; |
1515 | 1579 | ||
1516 | /* only root can play with this */ | 1580 | /* only root can play with this */ |
1517 | if (!capable(CAP_SYS_ADMIN)) | 1581 | if (!capable(CAP_SYS_ADMIN)) |
@@ -1558,13 +1622,15 @@ static int ctl_ioctl(uint command, struct dm_ioctl __user *user) | |||
1558 | if (r) | 1622 | if (r) |
1559 | return r; | 1623 | return r; |
1560 | 1624 | ||
1625 | input_param_size = param->data_size; | ||
1626 | wipe_buffer = param->flags & DM_SECURE_DATA_FLAG; | ||
1627 | |||
1561 | r = validate_params(cmd, param); | 1628 | r = validate_params(cmd, param); |
1562 | if (r) | 1629 | if (r) |
1563 | goto out; | 1630 | goto out; |
1564 | 1631 | ||
1565 | param_size = param->data_size; | ||
1566 | param->data_size = sizeof(*param); | 1632 | param->data_size = sizeof(*param); |
1567 | r = fn(param, param_size); | 1633 | r = fn(param, input_param_size); |
1568 | 1634 | ||
1569 | /* | 1635 | /* |
1570 | * Copy the results back to userland. | 1636 | * Copy the results back to userland. |
@@ -1572,8 +1638,11 @@ static int ctl_ioctl(uint command, struct dm_ioctl __user *user) | |||
1572 | if (!r && copy_to_user(user, param, param->data_size)) | 1638 | if (!r && copy_to_user(user, param, param->data_size)) |
1573 | r = -EFAULT; | 1639 | r = -EFAULT; |
1574 | 1640 | ||
1575 | out: | 1641 | out: |
1576 | free_params(param); | 1642 | if (wipe_buffer) |
1643 | memset(param, 0, input_param_size); | ||
1644 | |||
1645 | vfree(param); | ||
1577 | return r; | 1646 | return r; |
1578 | } | 1647 | } |
1579 | 1648 | ||
@@ -1596,6 +1665,7 @@ static const struct file_operations _ctl_fops = { | |||
1596 | .unlocked_ioctl = dm_ctl_ioctl, | 1665 | .unlocked_ioctl = dm_ctl_ioctl, |
1597 | .compat_ioctl = dm_compat_ctl_ioctl, | 1666 | .compat_ioctl = dm_compat_ctl_ioctl, |
1598 | .owner = THIS_MODULE, | 1667 | .owner = THIS_MODULE, |
1668 | .llseek = noop_llseek, | ||
1599 | }; | 1669 | }; |
1600 | 1670 | ||
1601 | static struct miscdevice _dm_misc = { | 1671 | static struct miscdevice _dm_misc = { |
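Taken together, the DM_SECURE_DATA_FLAG handling above lets a caller request that no copy of the ioctl parameter block survives the call: copy_params() clears the user buffer as soon as it has been copied in (and on its failure paths), and ctl_ioctl() memsets the kernel copy before vfree(). This matters mainly for table loads carrying key material, such as crypt tables. Sketch only, assuming the flag definition from the accompanying dm-ioctl.h change:

    dmi->flags |= DM_SECURE_DATA_FLAG;    /* wipe user and kernel copies of the params */
    ioctl(control_fd, DM_TABLE_LOAD, dmi);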
diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c index d8587bac5682..819e37eaaeba 100644 --- a/drivers/md/dm-kcopyd.c +++ b/drivers/md/dm-kcopyd.c | |||
@@ -27,15 +27,19 @@ | |||
27 | 27 | ||
28 | #include "dm.h" | 28 | #include "dm.h" |
29 | 29 | ||
30 | #define SUB_JOB_SIZE 128 | ||
31 | #define SPLIT_COUNT 8 | ||
32 | #define MIN_JOBS 8 | ||
33 | #define RESERVE_PAGES (DIV_ROUND_UP(SUB_JOB_SIZE << SECTOR_SHIFT, PAGE_SIZE)) | ||
34 | |||
30 | /*----------------------------------------------------------------- | 35 | /*----------------------------------------------------------------- |
31 | * Each kcopyd client has its own little pool of preallocated | 36 | * Each kcopyd client has its own little pool of preallocated |
32 | * pages for kcopyd io. | 37 | * pages for kcopyd io. |
33 | *---------------------------------------------------------------*/ | 38 | *---------------------------------------------------------------*/ |
34 | struct dm_kcopyd_client { | 39 | struct dm_kcopyd_client { |
35 | spinlock_t lock; | ||
36 | struct page_list *pages; | 40 | struct page_list *pages; |
37 | unsigned int nr_pages; | 41 | unsigned nr_reserved_pages; |
38 | unsigned int nr_free_pages; | 42 | unsigned nr_free_pages; |
39 | 43 | ||
40 | struct dm_io_client *io_client; | 44 | struct dm_io_client *io_client; |
41 | 45 | ||
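A quick check on the new constants above (assuming 512-byte sectors and 4 KiB pages, the common configuration rather than anything guaranteed by the patch):

    /*
     * SUB_JOB_SIZE << SECTOR_SHIFT = 128 * 512 = 65536 bytes (64 KiB)
     * RESERVE_PAGES = DIV_ROUND_UP(65536, 4096) = 16 pages per client
     */

So each client keeps only a 16-page reserve; kcopyd_get_pages() below allocates fresh pages with __GFP_NOWARN | __GFP_NORETRY first and dips into this reserve only when that fails, replacing the old scheme of preallocating the entire page pool up front.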
@@ -67,15 +71,18 @@ static void wake(struct dm_kcopyd_client *kc) | |||
67 | queue_work(kc->kcopyd_wq, &kc->kcopyd_work); | 71 | queue_work(kc->kcopyd_wq, &kc->kcopyd_work); |
68 | } | 72 | } |
69 | 73 | ||
70 | static struct page_list *alloc_pl(void) | 74 | /* |
75 | * Obtain one page for the use of kcopyd. | ||
76 | */ | ||
77 | static struct page_list *alloc_pl(gfp_t gfp) | ||
71 | { | 78 | { |
72 | struct page_list *pl; | 79 | struct page_list *pl; |
73 | 80 | ||
74 | pl = kmalloc(sizeof(*pl), GFP_KERNEL); | 81 | pl = kmalloc(sizeof(*pl), gfp); |
75 | if (!pl) | 82 | if (!pl) |
76 | return NULL; | 83 | return NULL; |
77 | 84 | ||
78 | pl->page = alloc_page(GFP_KERNEL); | 85 | pl->page = alloc_page(gfp); |
79 | if (!pl->page) { | 86 | if (!pl->page) { |
80 | kfree(pl); | 87 | kfree(pl); |
81 | return NULL; | 88 | return NULL; |
@@ -90,41 +97,56 @@ static void free_pl(struct page_list *pl) | |||
90 | kfree(pl); | 97 | kfree(pl); |
91 | } | 98 | } |
92 | 99 | ||
93 | static int kcopyd_get_pages(struct dm_kcopyd_client *kc, | 100 | /* |
94 | unsigned int nr, struct page_list **pages) | 101 | * Add the provided pages to a client's free page list, releasing |
102 | * back to the system any beyond the reserved_pages limit. | ||
103 | */ | ||
104 | static void kcopyd_put_pages(struct dm_kcopyd_client *kc, struct page_list *pl) | ||
95 | { | 105 | { |
96 | struct page_list *pl; | 106 | struct page_list *next; |
97 | |||
98 | spin_lock(&kc->lock); | ||
99 | if (kc->nr_free_pages < nr) { | ||
100 | spin_unlock(&kc->lock); | ||
101 | return -ENOMEM; | ||
102 | } | ||
103 | |||
104 | kc->nr_free_pages -= nr; | ||
105 | for (*pages = pl = kc->pages; --nr; pl = pl->next) | ||
106 | ; | ||
107 | 107 | ||
108 | kc->pages = pl->next; | 108 | do { |
109 | pl->next = NULL; | 109 | next = pl->next; |
110 | 110 | ||
111 | spin_unlock(&kc->lock); | 111 | if (kc->nr_free_pages >= kc->nr_reserved_pages) |
112 | free_pl(pl); | ||
113 | else { | ||
114 | pl->next = kc->pages; | ||
115 | kc->pages = pl; | ||
116 | kc->nr_free_pages++; | ||
117 | } | ||
112 | 118 | ||
113 | return 0; | 119 | pl = next; |
120 | } while (pl); | ||
114 | } | 121 | } |
115 | 122 | ||
116 | static void kcopyd_put_pages(struct dm_kcopyd_client *kc, struct page_list *pl) | 123 | static int kcopyd_get_pages(struct dm_kcopyd_client *kc, |
124 | unsigned int nr, struct page_list **pages) | ||
117 | { | 125 | { |
118 | struct page_list *cursor; | 126 | struct page_list *pl; |
127 | |||
128 | *pages = NULL; | ||
129 | |||
130 | do { | ||
131 | pl = alloc_pl(__GFP_NOWARN | __GFP_NORETRY); | ||
132 | if (unlikely(!pl)) { | ||
133 | /* Use reserved pages */ | ||
134 | pl = kc->pages; | ||
135 | if (unlikely(!pl)) | ||
136 | goto out_of_memory; | ||
137 | kc->pages = pl->next; | ||
138 | kc->nr_free_pages--; | ||
139 | } | ||
140 | pl->next = *pages; | ||
141 | *pages = pl; | ||
142 | } while (--nr); | ||
119 | 143 | ||
120 | spin_lock(&kc->lock); | 144 | return 0; |
121 | for (cursor = pl; cursor->next; cursor = cursor->next) | ||
122 | kc->nr_free_pages++; | ||
123 | 145 | ||
124 | kc->nr_free_pages++; | 146 | out_of_memory: |
125 | cursor->next = kc->pages; | 147 | if (*pages) |
126 | kc->pages = pl; | 148 | kcopyd_put_pages(kc, *pages); |
127 | spin_unlock(&kc->lock); | 149 | return -ENOMEM; |
128 | } | 150 | } |
129 | 151 | ||
130 | /* | 152 | /* |
@@ -141,13 +163,16 @@ static void drop_pages(struct page_list *pl) | |||
141 | } | 163 | } |
142 | } | 164 | } |
143 | 165 | ||
144 | static int client_alloc_pages(struct dm_kcopyd_client *kc, unsigned int nr) | 166 | /* |
167 | * Allocate and reserve nr_pages for the use of a specific client. | ||
168 | */ | ||
169 | static int client_reserve_pages(struct dm_kcopyd_client *kc, unsigned nr_pages) | ||
145 | { | 170 | { |
146 | unsigned int i; | 171 | unsigned i; |
147 | struct page_list *pl = NULL, *next; | 172 | struct page_list *pl = NULL, *next; |
148 | 173 | ||
149 | for (i = 0; i < nr; i++) { | 174 | for (i = 0; i < nr_pages; i++) { |
150 | next = alloc_pl(); | 175 | next = alloc_pl(GFP_KERNEL); |
151 | if (!next) { | 176 | if (!next) { |
152 | if (pl) | 177 | if (pl) |
153 | drop_pages(pl); | 178 | drop_pages(pl); |
@@ -157,17 +182,18 @@ static int client_alloc_pages(struct dm_kcopyd_client *kc, unsigned int nr) | |||
157 | pl = next; | 182 | pl = next; |
158 | } | 183 | } |
159 | 184 | ||
185 | kc->nr_reserved_pages += nr_pages; | ||
160 | kcopyd_put_pages(kc, pl); | 186 | kcopyd_put_pages(kc, pl); |
161 | kc->nr_pages += nr; | 187 | |
162 | return 0; | 188 | return 0; |
163 | } | 189 | } |
164 | 190 | ||
165 | static void client_free_pages(struct dm_kcopyd_client *kc) | 191 | static void client_free_pages(struct dm_kcopyd_client *kc) |
166 | { | 192 | { |
167 | BUG_ON(kc->nr_free_pages != kc->nr_pages); | 193 | BUG_ON(kc->nr_free_pages != kc->nr_reserved_pages); |
168 | drop_pages(kc->pages); | 194 | drop_pages(kc->pages); |
169 | kc->pages = NULL; | 195 | kc->pages = NULL; |
170 | kc->nr_free_pages = kc->nr_pages = 0; | 196 | kc->nr_free_pages = kc->nr_reserved_pages = 0; |
171 | } | 197 | } |
172 | 198 | ||
173 | /*----------------------------------------------------------------- | 199 | /*----------------------------------------------------------------- |
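The net effect of the two hunks above is that kcopyd no longer carves out a large fixed page pool per client and fails a copy with -ENOMEM when it runs dry. kcopyd_get_pages() now tries an opportunistic allocation first (__GFP_NOWARN | __GFP_NORETRY, so it gives up quickly under memory pressure without log noise) and only then dips into the small per-client reserve, while kcopyd_put_pages() refills the reserve up to nr_reserved_pages and hands anything beyond that back to the system. A minimal user-space sketch of that reserve-with-fallback pattern (pool_get()/pool_put() and RESERVE are illustrative names, not kernel API):

#include <stdlib.h>

#define RESERVE 256

struct node { struct node *next; };

struct pool {
    struct node *free;      /* reserved, immediately reusable buffers */
    unsigned nr_free;
    unsigned nr_reserved;   /* upper bound kept on the free list */
};

/* Prefer a fresh allocation; dip into the reserve only if it fails. */
static struct node *pool_get(struct pool *p)
{
    struct node *n = malloc(sizeof(*n)); /* kernel: alloc_pl(__GFP_NOWARN | __GFP_NORETRY) */

    if (!n && p->free) {
        n = p->free;                     /* fall back to the reserve */
        p->free = n->next;
        p->nr_free--;
    }
    return n;                            /* NULL only if both paths fail */
}

/* Keep at most nr_reserved buffers on the list; release the excess. */
static void pool_put(struct pool *p, struct node *n)
{
    if (p->nr_free >= p->nr_reserved) {
        free(n);
    } else {
        n->next = p->free;
        p->free = n;
        p->nr_free++;
    }
}

The reserve is only a forward-progress guarantee for the memory-reclaim path; in the common case pages come straight from the allocator and go straight back.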
@@ -216,16 +242,17 @@ struct kcopyd_job { | |||
216 | struct mutex lock; | 242 | struct mutex lock; |
217 | atomic_t sub_jobs; | 243 | atomic_t sub_jobs; |
218 | sector_t progress; | 244 | sector_t progress; |
219 | }; | ||
220 | 245 | ||
221 | /* FIXME: this should scale with the number of pages */ | 246 | struct kcopyd_job *master_job; |
222 | #define MIN_JOBS 512 | 247 | }; |
223 | 248 | ||
224 | static struct kmem_cache *_job_cache; | 249 | static struct kmem_cache *_job_cache; |
225 | 250 | ||
226 | int __init dm_kcopyd_init(void) | 251 | int __init dm_kcopyd_init(void) |
227 | { | 252 | { |
228 | _job_cache = KMEM_CACHE(kcopyd_job, 0); | 253 | _job_cache = kmem_cache_create("kcopyd_job", |
254 | sizeof(struct kcopyd_job) * (SPLIT_COUNT + 1), | ||
255 | __alignof__(struct kcopyd_job), 0, NULL); | ||
229 | if (!_job_cache) | 256 | if (!_job_cache) |
230 | return -ENOMEM; | 257 | return -ENOMEM; |
231 | 258 | ||
@@ -299,7 +326,12 @@ static int run_complete_job(struct kcopyd_job *job) | |||
299 | 326 | ||
300 | if (job->pages) | 327 | if (job->pages) |
301 | kcopyd_put_pages(kc, job->pages); | 328 | kcopyd_put_pages(kc, job->pages); |
302 | mempool_free(job, kc->job_pool); | 329 | /* |
330 | * If this is the master job, the sub jobs have already | ||
331 | * completed so we can free everything. | ||
332 | */ | ||
333 | if (job->master_job == job) | ||
334 | mempool_free(job, kc->job_pool); | ||
303 | fn(read_err, write_err, context); | 335 | fn(read_err, write_err, context); |
304 | 336 | ||
305 | if (atomic_dec_and_test(&kc->nr_jobs)) | 337 | if (atomic_dec_and_test(&kc->nr_jobs)) |
@@ -345,7 +377,7 @@ static int run_io_job(struct kcopyd_job *job) | |||
345 | { | 377 | { |
346 | int r; | 378 | int r; |
347 | struct dm_io_request io_req = { | 379 | struct dm_io_request io_req = { |
348 | .bi_rw = job->rw | REQ_SYNC | REQ_UNPLUG, | 380 | .bi_rw = job->rw, |
349 | .mem.type = DM_IO_PAGE_LIST, | 381 | .mem.type = DM_IO_PAGE_LIST, |
350 | .mem.ptr.pl = job->pages, | 382 | .mem.ptr.pl = job->pages, |
351 | .mem.offset = job->offset, | 383 | .mem.offset = job->offset, |
@@ -428,6 +460,7 @@ static void do_work(struct work_struct *work) | |||
428 | { | 460 | { |
429 | struct dm_kcopyd_client *kc = container_of(work, | 461 | struct dm_kcopyd_client *kc = container_of(work, |
430 | struct dm_kcopyd_client, kcopyd_work); | 462 | struct dm_kcopyd_client, kcopyd_work); |
463 | struct blk_plug plug; | ||
431 | 464 | ||
432 | /* | 465 | /* |
433 | * The order that these are called is *very* important. | 466 | * The order that these are called is *very* important. |
@@ -436,9 +469,11 @@ static void do_work(struct work_struct *work) | |||
436 | * list. io jobs call wake when they complete and it all | 469 | * list. io jobs call wake when they complete and it all |
437 | * starts again. | 470 | * starts again. |
438 | */ | 471 | */ |
472 | blk_start_plug(&plug); | ||
439 | process_jobs(&kc->complete_jobs, kc, run_complete_job); | 473 | process_jobs(&kc->complete_jobs, kc, run_complete_job); |
440 | process_jobs(&kc->pages_jobs, kc, run_pages_job); | 474 | process_jobs(&kc->pages_jobs, kc, run_pages_job); |
441 | process_jobs(&kc->io_jobs, kc, run_io_job); | 475 | process_jobs(&kc->io_jobs, kc, run_io_job); |
476 | blk_finish_plug(&plug); | ||
442 | } | 477 | } |
443 | 478 | ||
444 | /* | 479 | /* |
@@ -457,14 +492,14 @@ static void dispatch_job(struct kcopyd_job *job) | |||
457 | wake(kc); | 492 | wake(kc); |
458 | } | 493 | } |
459 | 494 | ||
460 | #define SUB_JOB_SIZE 128 | ||
461 | static void segment_complete(int read_err, unsigned long write_err, | 495 | static void segment_complete(int read_err, unsigned long write_err, |
462 | void *context) | 496 | void *context) |
463 | { | 497 | { |
464 | /* FIXME: tidy this function */ | 498 | /* FIXME: tidy this function */ |
465 | sector_t progress = 0; | 499 | sector_t progress = 0; |
466 | sector_t count = 0; | 500 | sector_t count = 0; |
467 | struct kcopyd_job *job = (struct kcopyd_job *) context; | 501 | struct kcopyd_job *sub_job = (struct kcopyd_job *) context; |
502 | struct kcopyd_job *job = sub_job->master_job; | ||
468 | struct dm_kcopyd_client *kc = job->kc; | 503 | struct dm_kcopyd_client *kc = job->kc; |
469 | 504 | ||
470 | mutex_lock(&job->lock); | 505 | mutex_lock(&job->lock); |
@@ -495,8 +530,6 @@ static void segment_complete(int read_err, unsigned long write_err, | |||
495 | 530 | ||
496 | if (count) { | 531 | if (count) { |
497 | int i; | 532 | int i; |
498 | struct kcopyd_job *sub_job = mempool_alloc(kc->job_pool, | ||
499 | GFP_NOIO); | ||
500 | 533 | ||
501 | *sub_job = *job; | 534 | *sub_job = *job; |
502 | sub_job->source.sector += progress; | 535 | sub_job->source.sector += progress; |
@@ -508,7 +541,7 @@ static void segment_complete(int read_err, unsigned long write_err, | |||
508 | } | 541 | } |
509 | 542 | ||
510 | sub_job->fn = segment_complete; | 543 | sub_job->fn = segment_complete; |
511 | sub_job->context = job; | 544 | sub_job->context = sub_job; |
512 | dispatch_job(sub_job); | 545 | dispatch_job(sub_job); |
513 | 546 | ||
514 | } else if (atomic_dec_and_test(&job->sub_jobs)) { | 547 | } else if (atomic_dec_and_test(&job->sub_jobs)) { |
@@ -528,19 +561,19 @@ static void segment_complete(int read_err, unsigned long write_err, | |||
528 | } | 561 | } |
529 | 562 | ||
530 | /* | 563 | /* |
531 | * Create some little jobs that will do the move between | 564 | * Create some sub jobs to share the work between them. |
532 | * them. | ||
533 | */ | 565 | */ |
534 | #define SPLIT_COUNT 8 | 566 | static void split_job(struct kcopyd_job *master_job) |
535 | static void split_job(struct kcopyd_job *job) | ||
536 | { | 567 | { |
537 | int i; | 568 | int i; |
538 | 569 | ||
539 | atomic_inc(&job->kc->nr_jobs); | 570 | atomic_inc(&master_job->kc->nr_jobs); |
540 | 571 | ||
541 | atomic_set(&job->sub_jobs, SPLIT_COUNT); | 572 | atomic_set(&master_job->sub_jobs, SPLIT_COUNT); |
542 | for (i = 0; i < SPLIT_COUNT; i++) | 573 | for (i = 0; i < SPLIT_COUNT; i++) { |
543 | segment_complete(0, 0u, job); | 574 | master_job[i + 1].master_job = master_job; |
575 | segment_complete(0, 0u, &master_job[i + 1]); | ||
576 | } | ||
544 | } | 577 | } |
545 | 578 | ||
546 | int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from, | 579 | int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from, |
@@ -550,7 +583,8 @@ int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from, | |||
550 | struct kcopyd_job *job; | 583 | struct kcopyd_job *job; |
551 | 584 | ||
552 | /* | 585 | /* |
553 | * Allocate a new job. | 586 | * Allocate an array of jobs consisting of one master job |
587 | * followed by SPLIT_COUNT sub jobs. | ||
554 | */ | 588 | */ |
555 | job = mempool_alloc(kc->job_pool, GFP_NOIO); | 589 | job = mempool_alloc(kc->job_pool, GFP_NOIO); |
556 | 590 | ||
@@ -574,10 +608,10 @@ int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from, | |||
574 | 608 | ||
575 | job->fn = fn; | 609 | job->fn = fn; |
576 | job->context = context; | 610 | job->context = context; |
611 | job->master_job = job; | ||
577 | 612 | ||
578 | if (job->source.count < SUB_JOB_SIZE) | 613 | if (job->source.count <= SUB_JOB_SIZE) |
579 | dispatch_job(job); | 614 | dispatch_job(job); |
580 | |||
581 | else { | 615 | else { |
582 | mutex_init(&job->lock); | 616 | mutex_init(&job->lock); |
583 | job->progress = 0; | 617 | job->progress = 0; |
@@ -603,17 +637,15 @@ int kcopyd_cancel(struct kcopyd_job *job, int block) | |||
603 | /*----------------------------------------------------------------- | 637 | /*----------------------------------------------------------------- |
604 | * Client setup | 638 | * Client setup |
605 | *---------------------------------------------------------------*/ | 639 | *---------------------------------------------------------------*/ |
606 | int dm_kcopyd_client_create(unsigned int nr_pages, | 640 | struct dm_kcopyd_client *dm_kcopyd_client_create(void) |
607 | struct dm_kcopyd_client **result) | ||
608 | { | 641 | { |
609 | int r = -ENOMEM; | 642 | int r = -ENOMEM; |
610 | struct dm_kcopyd_client *kc; | 643 | struct dm_kcopyd_client *kc; |
611 | 644 | ||
612 | kc = kmalloc(sizeof(*kc), GFP_KERNEL); | 645 | kc = kmalloc(sizeof(*kc), GFP_KERNEL); |
613 | if (!kc) | 646 | if (!kc) |
614 | return -ENOMEM; | 647 | return ERR_PTR(-ENOMEM); |
615 | 648 | ||
616 | spin_lock_init(&kc->lock); | ||
617 | spin_lock_init(&kc->job_lock); | 649 | spin_lock_init(&kc->job_lock); |
618 | INIT_LIST_HEAD(&kc->complete_jobs); | 650 | INIT_LIST_HEAD(&kc->complete_jobs); |
619 | INIT_LIST_HEAD(&kc->io_jobs); | 651 | INIT_LIST_HEAD(&kc->io_jobs); |
@@ -624,17 +656,18 @@ int dm_kcopyd_client_create(unsigned int nr_pages, | |||
624 | goto bad_slab; | 656 | goto bad_slab; |
625 | 657 | ||
626 | INIT_WORK(&kc->kcopyd_work, do_work); | 658 | INIT_WORK(&kc->kcopyd_work, do_work); |
627 | kc->kcopyd_wq = create_singlethread_workqueue("kcopyd"); | 659 | kc->kcopyd_wq = alloc_workqueue("kcopyd", |
660 | WQ_NON_REENTRANT | WQ_MEM_RECLAIM, 0); | ||
628 | if (!kc->kcopyd_wq) | 661 | if (!kc->kcopyd_wq) |
629 | goto bad_workqueue; | 662 | goto bad_workqueue; |
630 | 663 | ||
631 | kc->pages = NULL; | 664 | kc->pages = NULL; |
632 | kc->nr_pages = kc->nr_free_pages = 0; | 665 | kc->nr_reserved_pages = kc->nr_free_pages = 0; |
633 | r = client_alloc_pages(kc, nr_pages); | 666 | r = client_reserve_pages(kc, RESERVE_PAGES); |
634 | if (r) | 667 | if (r) |
635 | goto bad_client_pages; | 668 | goto bad_client_pages; |
636 | 669 | ||
637 | kc->io_client = dm_io_client_create(nr_pages); | 670 | kc->io_client = dm_io_client_create(); |
638 | if (IS_ERR(kc->io_client)) { | 671 | if (IS_ERR(kc->io_client)) { |
639 | r = PTR_ERR(kc->io_client); | 672 | r = PTR_ERR(kc->io_client); |
640 | goto bad_io_client; | 673 | goto bad_io_client; |
@@ -643,8 +676,7 @@ int dm_kcopyd_client_create(unsigned int nr_pages, | |||
643 | init_waitqueue_head(&kc->destroyq); | 676 | init_waitqueue_head(&kc->destroyq); |
644 | atomic_set(&kc->nr_jobs, 0); | 677 | atomic_set(&kc->nr_jobs, 0); |
645 | 678 | ||
646 | *result = kc; | 679 | return kc; |
647 | return 0; | ||
648 | 680 | ||
649 | bad_io_client: | 681 | bad_io_client: |
650 | client_free_pages(kc); | 682 | client_free_pages(kc); |
@@ -655,7 +687,7 @@ bad_workqueue: | |||
655 | bad_slab: | 687 | bad_slab: |
656 | kfree(kc); | 688 | kfree(kc); |
657 | 689 | ||
658 | return r; | 690 | return ERR_PTR(r); |
659 | } | 691 | } |
660 | EXPORT_SYMBOL(dm_kcopyd_client_create); | 692 | EXPORT_SYMBOL(dm_kcopyd_client_create); |
661 | 693 | ||
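Two related changes run through the rest of this file. First, the job slab now holds a master job followed by SPLIT_COUNT sub-jobs in a single object, so segment_complete() reuses the embedded sub-jobs instead of doing a nested mempool_alloc() on the completion path, and only the master job is returned to the mempool. Second, dm_kcopyd_client_create() drops the nr_pages argument (the page budget is now the internal RESERVE_PAGES) and returns the client pointer or an ERR_PTR() instead of an errno plus out-parameter. A sketch of how a caller adapts to the new constructor, using a hypothetical per-target context struct for illustration:

#include <linux/dm-kcopyd.h>
#include <linux/err.h>

/* Hypothetical target context, not part of the patch. */
struct foo_ctx {
        struct dm_kcopyd_client *kcopyd_client;
};

static int foo_ctx_init(struct foo_ctx *ctx)
{
        /*
         * Old interface: r = dm_kcopyd_client_create(nr_pages, &client);
         * New interface: errors come back ERR_PTR-encoded in the pointer.
         */
        ctx->kcopyd_client = dm_kcopyd_client_create();
        if (IS_ERR(ctx->kcopyd_client))
                return PTR_ERR(ctx->kcopyd_client);

        return 0;
}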
diff --git a/drivers/md/dm-log-userspace-base.c b/drivers/md/dm-log-userspace-base.c index 1ed0094f064b..aa2e0c374ab3 100644 --- a/drivers/md/dm-log-userspace-base.c +++ b/drivers/md/dm-log-userspace-base.c | |||
@@ -12,12 +12,22 @@ | |||
12 | 12 | ||
13 | #include "dm-log-userspace-transfer.h" | 13 | #include "dm-log-userspace-transfer.h" |
14 | 14 | ||
15 | #define DM_LOG_USERSPACE_VSN "1.1.0" | ||
16 | |||
15 | struct flush_entry { | 17 | struct flush_entry { |
16 | int type; | 18 | int type; |
17 | region_t region; | 19 | region_t region; |
18 | struct list_head list; | 20 | struct list_head list; |
19 | }; | 21 | }; |
20 | 22 | ||
23 | /* | ||
24 | * This limit on the number of mark and clear requests is, to a degree, | ||
25 | * arbitrary. However, there is some basis for the choice in the limits | ||
24 | * arbitrary limit on the number of mark and clear requests. However, | ||
26 | * imposed on the size of data payload by dm-log-userspace-transfer.c: | ||
27 | * dm_consult_userspace(). | ||
28 | */ | ||
29 | #define MAX_FLUSH_GROUP_COUNT 32 | ||
30 | |||
21 | struct log_c { | 31 | struct log_c { |
22 | struct dm_target *ti; | 32 | struct dm_target *ti; |
23 | uint32_t region_size; | 33 | uint32_t region_size; |
@@ -37,8 +47,15 @@ struct log_c { | |||
37 | */ | 47 | */ |
38 | uint64_t in_sync_hint; | 48 | uint64_t in_sync_hint; |
39 | 49 | ||
50 | /* | ||
51 | * Mark and clear requests are held until a flush is issued | ||
52 | * so that we can group, and thereby limit, the amount of | ||
53 | * network traffic between kernel and userspace. The 'flush_lock' | ||
54 | * is used to protect these lists. | ||
55 | */ | ||
40 | spinlock_t flush_lock; | 56 | spinlock_t flush_lock; |
41 | struct list_head flush_list; /* only for clear and mark requests */ | 57 | struct list_head mark_list; |
58 | struct list_head clear_list; | ||
42 | }; | 59 | }; |
43 | 60 | ||
44 | static mempool_t *flush_entry_pool; | 61 | static mempool_t *flush_entry_pool; |
@@ -169,7 +186,8 @@ static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti, | |||
169 | 186 | ||
170 | strncpy(lc->uuid, argv[0], DM_UUID_LEN); | 187 | strncpy(lc->uuid, argv[0], DM_UUID_LEN); |
171 | spin_lock_init(&lc->flush_lock); | 188 | spin_lock_init(&lc->flush_lock); |
172 | INIT_LIST_HEAD(&lc->flush_list); | 189 | INIT_LIST_HEAD(&lc->mark_list); |
190 | INIT_LIST_HEAD(&lc->clear_list); | ||
173 | 191 | ||
174 | str_size = build_constructor_string(ti, argc - 1, argv + 1, &ctr_str); | 192 | str_size = build_constructor_string(ti, argc - 1, argv + 1, &ctr_str); |
175 | if (str_size < 0) { | 193 | if (str_size < 0) { |
@@ -181,8 +199,11 @@ static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti, | |||
181 | r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_CTR, | 199 | r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_CTR, |
182 | ctr_str, str_size, NULL, NULL); | 200 | ctr_str, str_size, NULL, NULL); |
183 | 201 | ||
184 | if (r == -ESRCH) { | 202 | if (r < 0) { |
185 | DMERR("Userspace log server not found"); | 203 | if (r == -ESRCH) |
204 | DMERR("Userspace log server not found"); | ||
205 | else | ||
206 | DMERR("Userspace log server failed to create log"); | ||
186 | goto out; | 207 | goto out; |
187 | } | 208 | } |
188 | 209 | ||
@@ -214,10 +235,9 @@ out: | |||
214 | 235 | ||
215 | static void userspace_dtr(struct dm_dirty_log *log) | 236 | static void userspace_dtr(struct dm_dirty_log *log) |
216 | { | 237 | { |
217 | int r; | ||
218 | struct log_c *lc = log->context; | 238 | struct log_c *lc = log->context; |
219 | 239 | ||
220 | r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_DTR, | 240 | (void) dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_DTR, |
221 | NULL, 0, | 241 | NULL, 0, |
222 | NULL, NULL); | 242 | NULL, NULL); |
223 | 243 | ||
@@ -338,6 +358,71 @@ static int userspace_in_sync(struct dm_dirty_log *log, region_t region, | |||
338 | return (r) ? 0 : (int)in_sync; | 358 | return (r) ? 0 : (int)in_sync; |
339 | } | 359 | } |
340 | 360 | ||
361 | static int flush_one_by_one(struct log_c *lc, struct list_head *flush_list) | ||
362 | { | ||
363 | int r = 0; | ||
364 | struct flush_entry *fe; | ||
365 | |||
366 | list_for_each_entry(fe, flush_list, list) { | ||
367 | r = userspace_do_request(lc, lc->uuid, fe->type, | ||
368 | (char *)&fe->region, | ||
369 | sizeof(fe->region), | ||
370 | NULL, NULL); | ||
371 | if (r) | ||
372 | break; | ||
373 | } | ||
374 | |||
375 | return r; | ||
376 | } | ||
377 | |||
378 | static int flush_by_group(struct log_c *lc, struct list_head *flush_list) | ||
379 | { | ||
380 | int r = 0; | ||
381 | int count; | ||
382 | uint32_t type = 0; | ||
383 | struct flush_entry *fe, *tmp_fe; | ||
384 | LIST_HEAD(tmp_list); | ||
385 | uint64_t group[MAX_FLUSH_GROUP_COUNT]; | ||
386 | |||
387 | /* | ||
388 | * Group process the requests | ||
389 | */ | ||
390 | while (!list_empty(flush_list)) { | ||
391 | count = 0; | ||
392 | |||
393 | list_for_each_entry_safe(fe, tmp_fe, flush_list, list) { | ||
394 | group[count] = fe->region; | ||
395 | count++; | ||
396 | |||
397 | list_del(&fe->list); | ||
398 | list_add(&fe->list, &tmp_list); | ||
399 | |||
400 | type = fe->type; | ||
401 | if (count >= MAX_FLUSH_GROUP_COUNT) | ||
402 | break; | ||
403 | } | ||
404 | |||
405 | r = userspace_do_request(lc, lc->uuid, type, | ||
406 | (char *)(group), | ||
407 | count * sizeof(uint64_t), | ||
408 | NULL, NULL); | ||
409 | if (r) { | ||
410 | /* Group send failed. Attempt one-by-one. */ | ||
411 | list_splice_init(&tmp_list, flush_list); | ||
412 | r = flush_one_by_one(lc, flush_list); | ||
413 | break; | ||
414 | } | ||
415 | } | ||
416 | |||
417 | /* | ||
418 | * Must collect flush_entries that were successfully processed | ||
419 | * as a group so that they will be freed by the caller. | ||
420 | */ | ||
421 | list_splice_init(&tmp_list, flush_list); | ||
422 | |||
423 | return r; | ||
424 | } | ||
425 | |||
341 | /* | 426 | /* |
342 | * userspace_flush | 427 | * userspace_flush |
343 | * | 428 | * |
@@ -360,31 +445,25 @@ static int userspace_flush(struct dm_dirty_log *log) | |||
360 | int r = 0; | 445 | int r = 0; |
361 | unsigned long flags; | 446 | unsigned long flags; |
362 | struct log_c *lc = log->context; | 447 | struct log_c *lc = log->context; |
363 | LIST_HEAD(flush_list); | 448 | LIST_HEAD(mark_list); |
449 | LIST_HEAD(clear_list); | ||
364 | struct flush_entry *fe, *tmp_fe; | 450 | struct flush_entry *fe, *tmp_fe; |
365 | 451 | ||
366 | spin_lock_irqsave(&lc->flush_lock, flags); | 452 | spin_lock_irqsave(&lc->flush_lock, flags); |
367 | list_splice_init(&lc->flush_list, &flush_list); | 453 | list_splice_init(&lc->mark_list, &mark_list); |
454 | list_splice_init(&lc->clear_list, &clear_list); | ||
368 | spin_unlock_irqrestore(&lc->flush_lock, flags); | 455 | spin_unlock_irqrestore(&lc->flush_lock, flags); |
369 | 456 | ||
370 | if (list_empty(&flush_list)) | 457 | if (list_empty(&mark_list) && list_empty(&clear_list)) |
371 | return 0; | 458 | return 0; |
372 | 459 | ||
373 | /* | 460 | r = flush_by_group(lc, &mark_list); |
374 | * FIXME: Count up requests, group request types, | 461 | if (r) |
375 | * allocate memory to stick all requests in and | 462 | goto fail; |
376 | * send to server in one go. Failing the allocation, | ||
377 | * do it one by one. | ||
378 | */ | ||
379 | 463 | ||
380 | list_for_each_entry(fe, &flush_list, list) { | 464 | r = flush_by_group(lc, &clear_list); |
381 | r = userspace_do_request(lc, lc->uuid, fe->type, | 465 | if (r) |
382 | (char *)&fe->region, | 466 | goto fail; |
383 | sizeof(fe->region), | ||
384 | NULL, NULL); | ||
385 | if (r) | ||
386 | goto fail; | ||
387 | } | ||
388 | 467 | ||
389 | r = userspace_do_request(lc, lc->uuid, DM_ULOG_FLUSH, | 468 | r = userspace_do_request(lc, lc->uuid, DM_ULOG_FLUSH, |
390 | NULL, 0, NULL, NULL); | 469 | NULL, 0, NULL, NULL); |
@@ -395,7 +474,11 @@ fail: | |||
395 | * Calling code will receive an error and will know that | 474 | * Calling code will receive an error and will know that |
396 | * the log facility has failed. | 475 | * the log facility has failed. |
397 | */ | 476 | */ |
398 | list_for_each_entry_safe(fe, tmp_fe, &flush_list, list) { | 477 | list_for_each_entry_safe(fe, tmp_fe, &mark_list, list) { |
478 | list_del(&fe->list); | ||
479 | mempool_free(fe, flush_entry_pool); | ||
480 | } | ||
481 | list_for_each_entry_safe(fe, tmp_fe, &clear_list, list) { | ||
399 | list_del(&fe->list); | 482 | list_del(&fe->list); |
400 | mempool_free(fe, flush_entry_pool); | 483 | mempool_free(fe, flush_entry_pool); |
401 | } | 484 | } |
@@ -425,7 +508,7 @@ static void userspace_mark_region(struct dm_dirty_log *log, region_t region) | |||
425 | spin_lock_irqsave(&lc->flush_lock, flags); | 508 | spin_lock_irqsave(&lc->flush_lock, flags); |
426 | fe->type = DM_ULOG_MARK_REGION; | 509 | fe->type = DM_ULOG_MARK_REGION; |
427 | fe->region = region; | 510 | fe->region = region; |
428 | list_add(&fe->list, &lc->flush_list); | 511 | list_add(&fe->list, &lc->mark_list); |
429 | spin_unlock_irqrestore(&lc->flush_lock, flags); | 512 | spin_unlock_irqrestore(&lc->flush_lock, flags); |
430 | 513 | ||
431 | return; | 514 | return; |
@@ -462,7 +545,7 @@ static void userspace_clear_region(struct dm_dirty_log *log, region_t region) | |||
462 | spin_lock_irqsave(&lc->flush_lock, flags); | 545 | spin_lock_irqsave(&lc->flush_lock, flags); |
463 | fe->type = DM_ULOG_CLEAR_REGION; | 546 | fe->type = DM_ULOG_CLEAR_REGION; |
464 | fe->region = region; | 547 | fe->region = region; |
465 | list_add(&fe->list, &lc->flush_list); | 548 | list_add(&fe->list, &lc->clear_list); |
466 | spin_unlock_irqrestore(&lc->flush_lock, flags); | 549 | spin_unlock_irqrestore(&lc->flush_lock, flags); |
467 | 550 | ||
468 | return; | 551 | return; |
@@ -684,7 +767,7 @@ static int __init userspace_dirty_log_init(void) | |||
684 | return r; | 767 | return r; |
685 | } | 768 | } |
686 | 769 | ||
687 | DMINFO("version 1.0.0 loaded"); | 770 | DMINFO("version " DM_LOG_USERSPACE_VSN " loaded"); |
688 | return 0; | 771 | return 0; |
689 | } | 772 | } |
690 | 773 | ||
@@ -694,7 +777,7 @@ static void __exit userspace_dirty_log_exit(void) | |||
694 | dm_ulog_tfr_exit(); | 777 | dm_ulog_tfr_exit(); |
695 | mempool_destroy(flush_entry_pool); | 778 | mempool_destroy(flush_entry_pool); |
696 | 779 | ||
697 | DMINFO("version 1.0.0 unloaded"); | 780 | DMINFO("version " DM_LOG_USERSPACE_VSN " unloaded"); |
698 | return; | 781 | return; |
699 | } | 782 | } |
700 | 783 | ||
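The log-userspace changes above split the old flush_list into separate mark and clear lists so that a flush can batch requests of one type together. flush_by_group() packs up to MAX_FLUSH_GROUP_COUNT region numbers into a single dm_consult_userspace() payload and, if a grouped send fails, falls back to issuing the remaining entries one at a time. A user-space sketch of that batch-then-fallback shape (send_batch()/send_one() are hypothetical stand-ins for the userspace transfer call):

#include <stdint.h>
#include <stddef.h>

#define GROUP_MAX 32

int send_batch(const uint64_t *regions, size_t count); /* returns 0 on success */
int send_one(uint64_t region);

static int flush_regions(const uint64_t *regions, size_t nr)
{
    size_t i, j;

    for (i = 0; i < nr; i += GROUP_MAX) {
        size_t count = (nr - i < GROUP_MAX) ? nr - i : GROUP_MAX;

        if (send_batch(regions + i, count) == 0)
            continue;

        /* Group send failed: retry the remainder one region at a time. */
        for (j = i; j < nr; j++) {
            int r = send_one(regions[j]);
            if (r)
                return r;
        }
        return 0;
    }
    return 0;
}

Grouping matters because each mark or clear otherwise costs a netlink round trip to the userspace log server; the 32-entry cap keeps the payload within the size limits imposed by dm-log-userspace-transfer.c.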
diff --git a/drivers/md/dm-log-userspace-transfer.c b/drivers/md/dm-log-userspace-transfer.c index 075cbcf8a9f5..1f23e048f077 100644 --- a/drivers/md/dm-log-userspace-transfer.c +++ b/drivers/md/dm-log-userspace-transfer.c | |||
@@ -134,7 +134,7 @@ static void cn_ulog_callback(struct cn_msg *msg, struct netlink_skb_parms *nsp) | |||
134 | { | 134 | { |
135 | struct dm_ulog_request *tfr = (struct dm_ulog_request *)(msg + 1); | 135 | struct dm_ulog_request *tfr = (struct dm_ulog_request *)(msg + 1); |
136 | 136 | ||
137 | if (!cap_raised(nsp->eff_cap, CAP_SYS_ADMIN)) | 137 | if (!cap_raised(current_cap(), CAP_SYS_ADMIN)) |
138 | return; | 138 | return; |
139 | 139 | ||
140 | spin_lock(&receiving_list_lock); | 140 | spin_lock(&receiving_list_lock); |
@@ -198,6 +198,7 @@ resend: | |||
198 | 198 | ||
199 | memset(tfr, 0, DM_ULOG_PREALLOCED_SIZE - sizeof(struct cn_msg)); | 199 | memset(tfr, 0, DM_ULOG_PREALLOCED_SIZE - sizeof(struct cn_msg)); |
200 | memcpy(tfr->uuid, uuid, DM_UUID_LEN); | 200 | memcpy(tfr->uuid, uuid, DM_UUID_LEN); |
201 | tfr->version = DM_ULOG_REQUEST_VERSION; | ||
201 | tfr->luid = luid; | 202 | tfr->luid = luid; |
202 | tfr->seq = dm_ulog_seq++; | 203 | tfr->seq = dm_ulog_seq++; |
203 | 204 | ||
diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c index 5a08be0222db..948e3f4925bf 100644 --- a/drivers/md/dm-log.c +++ b/drivers/md/dm-log.c | |||
@@ -251,20 +251,20 @@ struct log_c { | |||
251 | */ | 251 | */ |
252 | static inline int log_test_bit(uint32_t *bs, unsigned bit) | 252 | static inline int log_test_bit(uint32_t *bs, unsigned bit) |
253 | { | 253 | { |
254 | return ext2_test_bit(bit, (unsigned long *) bs) ? 1 : 0; | 254 | return test_bit_le(bit, (unsigned long *) bs) ? 1 : 0; |
255 | } | 255 | } |
256 | 256 | ||
257 | static inline void log_set_bit(struct log_c *l, | 257 | static inline void log_set_bit(struct log_c *l, |
258 | uint32_t *bs, unsigned bit) | 258 | uint32_t *bs, unsigned bit) |
259 | { | 259 | { |
260 | ext2_set_bit(bit, (unsigned long *) bs); | 260 | __test_and_set_bit_le(bit, (unsigned long *) bs); |
261 | l->touched_cleaned = 1; | 261 | l->touched_cleaned = 1; |
262 | } | 262 | } |
263 | 263 | ||
264 | static inline void log_clear_bit(struct log_c *l, | 264 | static inline void log_clear_bit(struct log_c *l, |
265 | uint32_t *bs, unsigned bit) | 265 | uint32_t *bs, unsigned bit) |
266 | { | 266 | { |
267 | ext2_clear_bit(bit, (unsigned long *) bs); | 267 | __test_and_clear_bit_le(bit, (unsigned long *) bs); |
268 | l->touched_dirtied = 1; | 268 | l->touched_dirtied = 1; |
269 | } | 269 | } |
270 | 270 | ||
@@ -300,7 +300,7 @@ static int flush_header(struct log_c *lc) | |||
300 | .count = 0, | 300 | .count = 0, |
301 | }; | 301 | }; |
302 | 302 | ||
303 | lc->io_req.bi_rw = WRITE_BARRIER; | 303 | lc->io_req.bi_rw = WRITE_FLUSH; |
304 | 304 | ||
305 | return dm_io(&lc->io_req, 1, &null_location, NULL); | 305 | return dm_io(&lc->io_req, 1, &null_location, NULL); |
306 | } | 306 | } |
@@ -449,13 +449,12 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti, | |||
449 | 449 | ||
450 | lc->io_req.mem.type = DM_IO_VMA; | 450 | lc->io_req.mem.type = DM_IO_VMA; |
451 | lc->io_req.notify.fn = NULL; | 451 | lc->io_req.notify.fn = NULL; |
452 | lc->io_req.client = dm_io_client_create(dm_div_up(buf_size, | 452 | lc->io_req.client = dm_io_client_create(); |
453 | PAGE_SIZE)); | ||
454 | if (IS_ERR(lc->io_req.client)) { | 453 | if (IS_ERR(lc->io_req.client)) { |
455 | r = PTR_ERR(lc->io_req.client); | 454 | r = PTR_ERR(lc->io_req.client); |
456 | DMWARN("couldn't allocate disk io client"); | 455 | DMWARN("couldn't allocate disk io client"); |
457 | kfree(lc); | 456 | kfree(lc); |
458 | return -ENOMEM; | 457 | return r; |
459 | } | 458 | } |
460 | 459 | ||
461 | lc->disk_header = vmalloc(buf_size); | 460 | lc->disk_header = vmalloc(buf_size); |
@@ -543,7 +542,7 @@ static int disk_ctr(struct dm_dirty_log *log, struct dm_target *ti, | |||
543 | return -EINVAL; | 542 | return -EINVAL; |
544 | } | 543 | } |
545 | 544 | ||
546 | r = dm_get_device(ti, argv[0], FMODE_READ | FMODE_WRITE, &dev); | 545 | r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &dev); |
547 | if (r) | 546 | if (r) |
548 | return r; | 547 | return r; |
549 | 548 | ||
@@ -740,7 +739,7 @@ static int core_get_resync_work(struct dm_dirty_log *log, region_t *region) | |||
740 | return 0; | 739 | return 0; |
741 | 740 | ||
742 | do { | 741 | do { |
743 | *region = ext2_find_next_zero_bit( | 742 | *region = find_next_zero_bit_le( |
744 | (unsigned long *) lc->sync_bits, | 743 | (unsigned long *) lc->sync_bits, |
745 | lc->region_count, | 744 | lc->region_count, |
746 | lc->sync_search); | 745 | lc->sync_search); |
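The bit-ops hunks in dm-log.c are a rename rather than a behaviour change: the old ext2_* helpers were the little-endian bitmap operations, and the generic *_le names (test_bit_le(), __test_and_set_bit_le(), find_next_zero_bit_le()) keep the on-disk sync and clean bitmaps laid out identically on big- and little-endian hosts. The remaining hunks switch the header flush from WRITE_BARRIER to WRITE_FLUSH and drop the now-unused page count from dm_io_client_create(). A user-space sketch of the addressing the *_le helpers guarantee, with illustrative names (bit n lives in byte n/8 at bit position n%8, regardless of host word size or endianness):

#include <stdint.h>

static int test_bit_le_sketch(unsigned nr, const uint8_t *bitmap)
{
    return (bitmap[nr / 8] >> (nr % 8)) & 1;
}

static void set_bit_le_sketch(unsigned nr, uint8_t *bitmap)
{
    bitmap[nr / 8] |= (uint8_t)(1u << (nr % 8));
}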
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 487ecda90ad4..aa4e570c2cb5 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c | |||
@@ -23,6 +23,8 @@ | |||
23 | 23 | ||
24 | #define DM_MSG_PREFIX "multipath" | 24 | #define DM_MSG_PREFIX "multipath" |
25 | #define MESG_STR(x) x, sizeof(x) | 25 | #define MESG_STR(x) x, sizeof(x) |
26 | #define DM_PG_INIT_DELAY_MSECS 2000 | ||
27 | #define DM_PG_INIT_DELAY_DEFAULT ((unsigned) -1) | ||
26 | 28 | ||
27 | /* Path properties */ | 29 | /* Path properties */ |
28 | struct pgpath { | 30 | struct pgpath { |
@@ -33,8 +35,7 @@ struct pgpath { | |||
33 | unsigned fail_count; /* Cumulative failure count */ | 35 | unsigned fail_count; /* Cumulative failure count */ |
34 | 36 | ||
35 | struct dm_path path; | 37 | struct dm_path path; |
36 | struct work_struct deactivate_path; | 38 | struct delayed_work activate_path; |
37 | struct work_struct activate_path; | ||
38 | }; | 39 | }; |
39 | 40 | ||
40 | #define path_to_pgpath(__pgp) container_of((__pgp), struct pgpath, path) | 41 | #define path_to_pgpath(__pgp) container_of((__pgp), struct pgpath, path) |
@@ -65,11 +66,15 @@ struct multipath { | |||
65 | 66 | ||
66 | const char *hw_handler_name; | 67 | const char *hw_handler_name; |
67 | char *hw_handler_params; | 68 | char *hw_handler_params; |
69 | |||
68 | unsigned nr_priority_groups; | 70 | unsigned nr_priority_groups; |
69 | struct list_head priority_groups; | 71 | struct list_head priority_groups; |
72 | |||
73 | wait_queue_head_t pg_init_wait; /* Wait for pg_init completion */ | ||
74 | |||
70 | unsigned pg_init_required; /* pg_init needs calling? */ | 75 | unsigned pg_init_required; /* pg_init needs calling? */ |
71 | unsigned pg_init_in_progress; /* Only one pg_init allowed at once */ | 76 | unsigned pg_init_in_progress; /* Only one pg_init allowed at once */ |
72 | wait_queue_head_t pg_init_wait; /* Wait for pg_init completion */ | 77 | unsigned pg_init_delay_retry; /* Delay pg_init retry? */ |
73 | 78 | ||
74 | unsigned nr_valid_paths; /* Total number of usable paths */ | 79 | unsigned nr_valid_paths; /* Total number of usable paths */ |
75 | struct pgpath *current_pgpath; | 80 | struct pgpath *current_pgpath; |
@@ -82,6 +87,7 @@ struct multipath { | |||
82 | unsigned saved_queue_if_no_path;/* Saved state during suspension */ | 87 | unsigned saved_queue_if_no_path;/* Saved state during suspension */ |
83 | unsigned pg_init_retries; /* Number of times to retry pg_init */ | 88 | unsigned pg_init_retries; /* Number of times to retry pg_init */ |
84 | unsigned pg_init_count; /* Number of times pg_init called */ | 89 | unsigned pg_init_count; /* Number of times pg_init called */ |
90 | unsigned pg_init_delay_msecs; /* Number of msecs before pg_init retry */ | ||
85 | 91 | ||
86 | struct work_struct process_queued_ios; | 92 | struct work_struct process_queued_ios; |
87 | struct list_head queued_ios; | 93 | struct list_head queued_ios; |
@@ -116,7 +122,6 @@ static struct workqueue_struct *kmultipathd, *kmpath_handlerd; | |||
116 | static void process_queued_ios(struct work_struct *work); | 122 | static void process_queued_ios(struct work_struct *work); |
117 | static void trigger_event(struct work_struct *work); | 123 | static void trigger_event(struct work_struct *work); |
118 | static void activate_path(struct work_struct *work); | 124 | static void activate_path(struct work_struct *work); |
119 | static void deactivate_path(struct work_struct *work); | ||
120 | 125 | ||
121 | 126 | ||
122 | /*----------------------------------------------- | 127 | /*----------------------------------------------- |
@@ -129,8 +134,7 @@ static struct pgpath *alloc_pgpath(void) | |||
129 | 134 | ||
130 | if (pgpath) { | 135 | if (pgpath) { |
131 | pgpath->is_active = 1; | 136 | pgpath->is_active = 1; |
132 | INIT_WORK(&pgpath->deactivate_path, deactivate_path); | 137 | INIT_DELAYED_WORK(&pgpath->activate_path, activate_path); |
133 | INIT_WORK(&pgpath->activate_path, activate_path); | ||
134 | } | 138 | } |
135 | 139 | ||
136 | return pgpath; | 140 | return pgpath; |
@@ -141,14 +145,6 @@ static void free_pgpath(struct pgpath *pgpath) | |||
141 | kfree(pgpath); | 145 | kfree(pgpath); |
142 | } | 146 | } |
143 | 147 | ||
144 | static void deactivate_path(struct work_struct *work) | ||
145 | { | ||
146 | struct pgpath *pgpath = | ||
147 | container_of(work, struct pgpath, deactivate_path); | ||
148 | |||
149 | blk_abort_queue(pgpath->path.dev->bdev->bd_disk->queue); | ||
150 | } | ||
151 | |||
152 | static struct priority_group *alloc_priority_group(void) | 148 | static struct priority_group *alloc_priority_group(void) |
153 | { | 149 | { |
154 | struct priority_group *pg; | 150 | struct priority_group *pg; |
@@ -199,6 +195,7 @@ static struct multipath *alloc_multipath(struct dm_target *ti) | |||
199 | INIT_LIST_HEAD(&m->queued_ios); | 195 | INIT_LIST_HEAD(&m->queued_ios); |
200 | spin_lock_init(&m->lock); | 196 | spin_lock_init(&m->lock); |
201 | m->queue_io = 1; | 197 | m->queue_io = 1; |
198 | m->pg_init_delay_msecs = DM_PG_INIT_DELAY_DEFAULT; | ||
202 | INIT_WORK(&m->process_queued_ios, process_queued_ios); | 199 | INIT_WORK(&m->process_queued_ios, process_queued_ios); |
203 | INIT_WORK(&m->trigger_event, trigger_event); | 200 | INIT_WORK(&m->trigger_event, trigger_event); |
204 | init_waitqueue_head(&m->pg_init_wait); | 201 | init_waitqueue_head(&m->pg_init_wait); |
@@ -238,14 +235,19 @@ static void free_multipath(struct multipath *m) | |||
238 | static void __pg_init_all_paths(struct multipath *m) | 235 | static void __pg_init_all_paths(struct multipath *m) |
239 | { | 236 | { |
240 | struct pgpath *pgpath; | 237 | struct pgpath *pgpath; |
238 | unsigned long pg_init_delay = 0; | ||
241 | 239 | ||
242 | m->pg_init_count++; | 240 | m->pg_init_count++; |
243 | m->pg_init_required = 0; | 241 | m->pg_init_required = 0; |
242 | if (m->pg_init_delay_retry) | ||
243 | pg_init_delay = msecs_to_jiffies(m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT ? | ||
244 | m->pg_init_delay_msecs : DM_PG_INIT_DELAY_MSECS); | ||
244 | list_for_each_entry(pgpath, &m->current_pg->pgpaths, list) { | 245 | list_for_each_entry(pgpath, &m->current_pg->pgpaths, list) { |
245 | /* Skip failed paths */ | 246 | /* Skip failed paths */ |
246 | if (!pgpath->is_active) | 247 | if (!pgpath->is_active) |
247 | continue; | 248 | continue; |
248 | if (queue_work(kmpath_handlerd, &pgpath->activate_path)) | 249 | if (queue_delayed_work(kmpath_handlerd, &pgpath->activate_path, |
250 | pg_init_delay)) | ||
249 | m->pg_init_in_progress++; | 251 | m->pg_init_in_progress++; |
250 | } | 252 | } |
251 | } | 253 | } |
@@ -793,8 +795,9 @@ static int parse_features(struct arg_set *as, struct multipath *m) | |||
793 | const char *param_name; | 795 | const char *param_name; |
794 | 796 | ||
795 | static struct param _params[] = { | 797 | static struct param _params[] = { |
796 | {0, 3, "invalid number of feature args"}, | 798 | {0, 5, "invalid number of feature args"}, |
797 | {1, 50, "pg_init_retries must be between 1 and 50"}, | 799 | {1, 50, "pg_init_retries must be between 1 and 50"}, |
800 | {0, 60000, "pg_init_delay_msecs must be between 0 and 60000"}, | ||
798 | }; | 801 | }; |
799 | 802 | ||
800 | r = read_param(_params, shift(as), &argc, &ti->error); | 803 | r = read_param(_params, shift(as), &argc, &ti->error); |
@@ -821,6 +824,14 @@ static int parse_features(struct arg_set *as, struct multipath *m) | |||
821 | continue; | 824 | continue; |
822 | } | 825 | } |
823 | 826 | ||
827 | if (!strnicmp(param_name, MESG_STR("pg_init_delay_msecs")) && | ||
828 | (argc >= 1)) { | ||
829 | r = read_param(_params + 2, shift(as), | ||
830 | &m->pg_init_delay_msecs, &ti->error); | ||
831 | argc--; | ||
832 | continue; | ||
833 | } | ||
834 | |||
824 | ti->error = "Unrecognised multipath feature request"; | 835 | ti->error = "Unrecognised multipath feature request"; |
825 | r = -EINVAL; | 836 | r = -EINVAL; |
826 | } while (argc && !r); | 837 | } while (argc && !r); |
@@ -833,8 +844,8 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc, | |||
833 | { | 844 | { |
834 | /* target parameters */ | 845 | /* target parameters */ |
835 | static struct param _params[] = { | 846 | static struct param _params[] = { |
836 | {1, 1024, "invalid number of priority groups"}, | 847 | {0, 1024, "invalid number of priority groups"}, |
837 | {1, 1024, "invalid initial priority group number"}, | 848 | {0, 1024, "invalid initial priority group number"}, |
838 | }; | 849 | }; |
839 | 850 | ||
840 | int r; | 851 | int r; |
@@ -868,6 +879,13 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc, | |||
868 | if (r) | 879 | if (r) |
869 | goto bad; | 880 | goto bad; |
870 | 881 | ||
882 | if ((!m->nr_priority_groups && next_pg_num) || | ||
883 | (m->nr_priority_groups && !next_pg_num)) { | ||
884 | ti->error = "invalid initial priority group"; | ||
885 | r = -EINVAL; | ||
886 | goto bad; | ||
887 | } | ||
888 | |||
871 | /* parse the priority groups */ | 889 | /* parse the priority groups */ |
872 | while (as.argc) { | 890 | while (as.argc) { |
873 | struct priority_group *pg; | 891 | struct priority_group *pg; |
@@ -931,7 +949,7 @@ static void flush_multipath_work(struct multipath *m) | |||
931 | flush_workqueue(kmpath_handlerd); | 949 | flush_workqueue(kmpath_handlerd); |
932 | multipath_wait_for_pg_init_completion(m); | 950 | multipath_wait_for_pg_init_completion(m); |
933 | flush_workqueue(kmultipathd); | 951 | flush_workqueue(kmultipathd); |
934 | flush_scheduled_work(); | 952 | flush_work_sync(&m->trigger_event); |
935 | } | 953 | } |
936 | 954 | ||
937 | static void multipath_dtr(struct dm_target *ti) | 955 | static void multipath_dtr(struct dm_target *ti) |
@@ -995,7 +1013,6 @@ static int fail_path(struct pgpath *pgpath) | |||
995 | pgpath->path.dev->name, m->nr_valid_paths); | 1013 | pgpath->path.dev->name, m->nr_valid_paths); |
996 | 1014 | ||
997 | schedule_work(&m->trigger_event); | 1015 | schedule_work(&m->trigger_event); |
998 | queue_work(kmultipathd, &pgpath->deactivate_path); | ||
999 | 1016 | ||
1000 | out: | 1017 | out: |
1001 | spin_unlock_irqrestore(&m->lock, flags); | 1018 | spin_unlock_irqrestore(&m->lock, flags); |
@@ -1034,7 +1051,7 @@ static int reinstate_path(struct pgpath *pgpath) | |||
1034 | m->current_pgpath = NULL; | 1051 | m->current_pgpath = NULL; |
1035 | queue_work(kmultipathd, &m->process_queued_ios); | 1052 | queue_work(kmultipathd, &m->process_queued_ios); |
1036 | } else if (m->hw_handler_name && (m->current_pg == pgpath->pg)) { | 1053 | } else if (m->hw_handler_name && (m->current_pg == pgpath->pg)) { |
1037 | if (queue_work(kmpath_handlerd, &pgpath->activate_path)) | 1054 | if (queue_work(kmpath_handlerd, &pgpath->activate_path.work)) |
1038 | m->pg_init_in_progress++; | 1055 | m->pg_init_in_progress++; |
1039 | } | 1056 | } |
1040 | 1057 | ||
@@ -1055,7 +1072,7 @@ out: | |||
1055 | static int action_dev(struct multipath *m, struct dm_dev *dev, | 1072 | static int action_dev(struct multipath *m, struct dm_dev *dev, |
1056 | action_fn action) | 1073 | action_fn action) |
1057 | { | 1074 | { |
1058 | int r = 0; | 1075 | int r = -EINVAL; |
1059 | struct pgpath *pgpath; | 1076 | struct pgpath *pgpath; |
1060 | struct priority_group *pg; | 1077 | struct priority_group *pg; |
1061 | 1078 | ||
@@ -1169,6 +1186,7 @@ static void pg_init_done(void *data, int errors) | |||
1169 | struct priority_group *pg = pgpath->pg; | 1186 | struct priority_group *pg = pgpath->pg; |
1170 | struct multipath *m = pg->m; | 1187 | struct multipath *m = pg->m; |
1171 | unsigned long flags; | 1188 | unsigned long flags; |
1189 | unsigned delay_retry = 0; | ||
1172 | 1190 | ||
1173 | /* device or driver problems */ | 1191 | /* device or driver problems */ |
1174 | switch (errors) { | 1192 | switch (errors) { |
@@ -1193,8 +1211,9 @@ static void pg_init_done(void *data, int errors) | |||
1193 | */ | 1211 | */ |
1194 | bypass_pg(m, pg, 1); | 1212 | bypass_pg(m, pg, 1); |
1195 | break; | 1213 | break; |
1196 | /* TODO: For SCSI_DH_RETRY we should wait a couple seconds */ | ||
1197 | case SCSI_DH_RETRY: | 1214 | case SCSI_DH_RETRY: |
1215 | /* Wait before retrying. */ | ||
1216 | delay_retry = 1; | ||
1198 | case SCSI_DH_IMM_RETRY: | 1217 | case SCSI_DH_IMM_RETRY: |
1199 | case SCSI_DH_RES_TEMP_UNAVAIL: | 1218 | case SCSI_DH_RES_TEMP_UNAVAIL: |
1200 | if (pg_init_limit_reached(m, pgpath)) | 1219 | if (pg_init_limit_reached(m, pgpath)) |
@@ -1227,6 +1246,7 @@ static void pg_init_done(void *data, int errors) | |||
1227 | if (!m->pg_init_required) | 1246 | if (!m->pg_init_required) |
1228 | m->queue_io = 0; | 1247 | m->queue_io = 0; |
1229 | 1248 | ||
1249 | m->pg_init_delay_retry = delay_retry; | ||
1230 | queue_work(kmultipathd, &m->process_queued_ios); | 1250 | queue_work(kmultipathd, &m->process_queued_ios); |
1231 | 1251 | ||
1232 | /* | 1252 | /* |
@@ -1241,7 +1261,7 @@ out: | |||
1241 | static void activate_path(struct work_struct *work) | 1261 | static void activate_path(struct work_struct *work) |
1242 | { | 1262 | { |
1243 | struct pgpath *pgpath = | 1263 | struct pgpath *pgpath = |
1244 | container_of(work, struct pgpath, activate_path); | 1264 | container_of(work, struct pgpath, activate_path.work); |
1245 | 1265 | ||
1246 | scsi_dh_activate(bdev_get_queue(pgpath->path.dev->bdev), | 1266 | scsi_dh_activate(bdev_get_queue(pgpath->path.dev->bdev), |
1247 | pg_init_done, pgpath); | 1267 | pg_init_done, pgpath); |
@@ -1270,24 +1290,22 @@ static int do_end_io(struct multipath *m, struct request *clone, | |||
1270 | if (!error && !clone->errors) | 1290 | if (!error && !clone->errors) |
1271 | return 0; /* I/O complete */ | 1291 | return 0; /* I/O complete */ |
1272 | 1292 | ||
1273 | if (error == -EOPNOTSUPP) | 1293 | if (error == -EOPNOTSUPP || error == -EREMOTEIO || error == -EILSEQ) |
1274 | return error; | ||
1275 | |||
1276 | if (clone->cmd_flags & REQ_DISCARD) | ||
1277 | /* | ||
1278 | * Pass all discard request failures up. | ||
1279 | * FIXME: only fail_path if the discard failed due to a | ||
1280 | * transport problem. This requires precise understanding | ||
1281 | * of the underlying failure (e.g. the SCSI sense). | ||
1282 | */ | ||
1283 | return error; | 1294 | return error; |
1284 | 1295 | ||
1285 | if (mpio->pgpath) | 1296 | if (mpio->pgpath) |
1286 | fail_path(mpio->pgpath); | 1297 | fail_path(mpio->pgpath); |
1287 | 1298 | ||
1288 | spin_lock_irqsave(&m->lock, flags); | 1299 | spin_lock_irqsave(&m->lock, flags); |
1289 | if (!m->nr_valid_paths && !m->queue_if_no_path && !__must_push_back(m)) | 1300 | if (!m->nr_valid_paths) { |
1290 | r = -EIO; | 1301 | if (!m->queue_if_no_path) { |
1302 | if (!__must_push_back(m)) | ||
1303 | r = -EIO; | ||
1304 | } else { | ||
1305 | if (error == -EBADE) | ||
1306 | r = error; | ||
1307 | } | ||
1308 | } | ||
1291 | spin_unlock_irqrestore(&m->lock, flags); | 1309 | spin_unlock_irqrestore(&m->lock, flags); |
1292 | 1310 | ||
1293 | return r; | 1311 | return r; |
@@ -1382,11 +1400,14 @@ static int multipath_status(struct dm_target *ti, status_type_t type, | |||
1382 | DMEMIT("2 %u %u ", m->queue_size, m->pg_init_count); | 1400 | DMEMIT("2 %u %u ", m->queue_size, m->pg_init_count); |
1383 | else { | 1401 | else { |
1384 | DMEMIT("%u ", m->queue_if_no_path + | 1402 | DMEMIT("%u ", m->queue_if_no_path + |
1385 | (m->pg_init_retries > 0) * 2); | 1403 | (m->pg_init_retries > 0) * 2 + |
1404 | (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT) * 2); | ||
1386 | if (m->queue_if_no_path) | 1405 | if (m->queue_if_no_path) |
1387 | DMEMIT("queue_if_no_path "); | 1406 | DMEMIT("queue_if_no_path "); |
1388 | if (m->pg_init_retries) | 1407 | if (m->pg_init_retries) |
1389 | DMEMIT("pg_init_retries %u ", m->pg_init_retries); | 1408 | DMEMIT("pg_init_retries %u ", m->pg_init_retries); |
1409 | if (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT) | ||
1410 | DMEMIT("pg_init_delay_msecs %u ", m->pg_init_delay_msecs); | ||
1390 | } | 1411 | } |
1391 | 1412 | ||
1392 | if (!m->hw_handler_name || type == STATUSTYPE_INFO) | 1413 | if (!m->hw_handler_name || type == STATUSTYPE_INFO) |
@@ -1401,7 +1422,7 @@ static int multipath_status(struct dm_target *ti, status_type_t type, | |||
1401 | else if (m->current_pg) | 1422 | else if (m->current_pg) |
1402 | pg_num = m->current_pg->pg_num; | 1423 | pg_num = m->current_pg->pg_num; |
1403 | else | 1424 | else |
1404 | pg_num = 1; | 1425 | pg_num = (m->nr_priority_groups ? 1 : 0); |
1405 | 1426 | ||
1406 | DMEMIT("%u ", pg_num); | 1427 | DMEMIT("%u ", pg_num); |
1407 | 1428 | ||
@@ -1655,7 +1676,7 @@ out: | |||
1655 | *---------------------------------------------------------------*/ | 1676 | *---------------------------------------------------------------*/ |
1656 | static struct target_type multipath_target = { | 1677 | static struct target_type multipath_target = { |
1657 | .name = "multipath", | 1678 | .name = "multipath", |
1658 | .version = {1, 1, 1}, | 1679 | .version = {1, 3, 0}, |
1659 | .module = THIS_MODULE, | 1680 | .module = THIS_MODULE, |
1660 | .ctr = multipath_ctr, | 1681 | .ctr = multipath_ctr, |
1661 | .dtr = multipath_dtr, | 1682 | .dtr = multipath_dtr, |
@@ -1687,7 +1708,7 @@ static int __init dm_multipath_init(void) | |||
1687 | return -EINVAL; | 1708 | return -EINVAL; |
1688 | } | 1709 | } |
1689 | 1710 | ||
1690 | kmultipathd = create_workqueue("kmpathd"); | 1711 | kmultipathd = alloc_workqueue("kmpathd", WQ_MEM_RECLAIM, 0); |
1691 | if (!kmultipathd) { | 1712 | if (!kmultipathd) { |
1692 | DMERR("failed to create workqueue kmpathd"); | 1713 | DMERR("failed to create workqueue kmpathd"); |
1693 | dm_unregister_target(&multipath_target); | 1714 | dm_unregister_target(&multipath_target); |
@@ -1701,7 +1722,8 @@ static int __init dm_multipath_init(void) | |||
1701 | * old workqueue would also create a bottleneck in the | 1722 | * old workqueue would also create a bottleneck in the |
1702 | * path of the storage hardware device activation. | 1723 | * path of the storage hardware device activation. |
1703 | */ | 1724 | */ |
1704 | kmpath_handlerd = create_singlethread_workqueue("kmpath_handlerd"); | 1725 | kmpath_handlerd = alloc_ordered_workqueue("kmpath_handlerd", |
1726 | WQ_MEM_RECLAIM); | ||
1705 | if (!kmpath_handlerd) { | 1727 | if (!kmpath_handlerd) { |
1706 | DMERR("failed to create workqueue kmpath_handlerd"); | 1728 | DMERR("failed to create workqueue kmpath_handlerd"); |
1707 | destroy_workqueue(kmultipathd); | 1729 | destroy_workqueue(kmultipathd); |
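Most of the multipath diff is plumbing for one feature: pg_init_done() can now ask for a delayed retry (the SCSI_DH_RETRY case), and __pg_init_all_paths() queues activate_path as delayed work using either the table-supplied pg_init_delay_msecs or a 2000 ms default. In table terms this adds an optional two-word feature pair, counted in the feature-argument total as the status hunk shows. A restatement of the delay selection as a stand-alone helper (the function name is illustrative; the constants are the ones defined at the top of dm-mpath.c):

#include <linux/jiffies.h>

#define DM_PG_INIT_DELAY_MSECS   2000
#define DM_PG_INIT_DELAY_DEFAULT ((unsigned) -1)

/* Returns 0 for an immediate retry, otherwise the delay in jiffies. */
static unsigned long pg_init_delay_jiffies(unsigned delay_retry,
                                           unsigned pg_init_delay_msecs)
{
        if (!delay_retry)
                return 0;

        return msecs_to_jiffies(pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT ?
                                pg_init_delay_msecs : DM_PG_INIT_DELAY_MSECS);
}

The other notable behaviour changes in this file are passing -EREMOTEIO and -EILSEQ straight up instead of failing the path, honouring -EBADE while queue_if_no_path is set, and allowing a table with zero priority groups.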
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c new file mode 100644 index 000000000000..e5d8904fc8f6 --- /dev/null +++ b/drivers/md/dm-raid.c | |||
@@ -0,0 +1,689 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2010-2011 Neil Brown | ||
3 | * Copyright (C) 2010-2011 Red Hat, Inc. All rights reserved. | ||
4 | * | ||
5 | * This file is released under the GPL. | ||
6 | */ | ||
7 | |||
8 | #include <linux/slab.h> | ||
9 | |||
10 | #include "md.h" | ||
11 | #include "raid5.h" | ||
12 | #include "dm.h" | ||
13 | #include "bitmap.h" | ||
14 | |||
15 | #define DM_MSG_PREFIX "raid" | ||
16 | |||
17 | /* | ||
18 | * If the MD doesn't support MD_SYNC_STATE_FORCED yet, then | ||
19 | * make it so the flag doesn't set anything. | ||
20 | */ | ||
21 | #ifndef MD_SYNC_STATE_FORCED | ||
22 | #define MD_SYNC_STATE_FORCED 0 | ||
23 | #endif | ||
24 | |||
25 | struct raid_dev { | ||
26 | /* | ||
27 | * Two DM devices, one to hold metadata and one to hold the | ||
28 | * actual data/parity. The reason for this is to not confuse | ||
29 | * ti->len and give more flexibility in altering size and | ||
30 | * characteristics. | ||
31 | * | ||
32 | * While it is possible for this device to be associated | ||
33 | * with a different physical device than the data_dev, it | ||
34 | * is intended for it to be the same. | ||
35 | * |--------- Physical Device ---------| | ||
36 | * |- meta_dev -|------ data_dev ------| | ||
37 | */ | ||
38 | struct dm_dev *meta_dev; | ||
39 | struct dm_dev *data_dev; | ||
40 | struct mdk_rdev_s rdev; | ||
41 | }; | ||
42 | |||
43 | /* | ||
44 | * Flags for rs->print_flags field. | ||
45 | */ | ||
46 | #define DMPF_DAEMON_SLEEP 0x1 | ||
47 | #define DMPF_MAX_WRITE_BEHIND 0x2 | ||
48 | #define DMPF_SYNC 0x4 | ||
49 | #define DMPF_NOSYNC 0x8 | ||
50 | #define DMPF_STRIPE_CACHE 0x10 | ||
51 | #define DMPF_MIN_RECOVERY_RATE 0x20 | ||
52 | #define DMPF_MAX_RECOVERY_RATE 0x40 | ||
53 | |||
54 | struct raid_set { | ||
55 | struct dm_target *ti; | ||
56 | |||
57 | uint64_t print_flags; | ||
58 | |||
59 | struct mddev_s md; | ||
60 | struct raid_type *raid_type; | ||
61 | struct dm_target_callbacks callbacks; | ||
62 | |||
63 | struct raid_dev dev[0]; | ||
64 | }; | ||
65 | |||
66 | /* Supported raid types and properties. */ | ||
67 | static struct raid_type { | ||
68 | const char *name; /* RAID algorithm. */ | ||
69 | const char *descr; /* Descriptor text for logging. */ | ||
70 | const unsigned parity_devs; /* # of parity devices. */ | ||
71 | const unsigned minimal_devs; /* minimal # of devices in set. */ | ||
72 | const unsigned level; /* RAID level. */ | ||
73 | const unsigned algorithm; /* RAID algorithm. */ | ||
74 | } raid_types[] = { | ||
75 | {"raid4", "RAID4 (dedicated parity disk)", 1, 2, 5, ALGORITHM_PARITY_0}, | ||
76 | {"raid5_la", "RAID5 (left asymmetric)", 1, 2, 5, ALGORITHM_LEFT_ASYMMETRIC}, | ||
77 | {"raid5_ra", "RAID5 (right asymmetric)", 1, 2, 5, ALGORITHM_RIGHT_ASYMMETRIC}, | ||
78 | {"raid5_ls", "RAID5 (left symmetric)", 1, 2, 5, ALGORITHM_LEFT_SYMMETRIC}, | ||
79 | {"raid5_rs", "RAID5 (right symmetric)", 1, 2, 5, ALGORITHM_RIGHT_SYMMETRIC}, | ||
80 | {"raid6_zr", "RAID6 (zero restart)", 2, 4, 6, ALGORITHM_ROTATING_ZERO_RESTART}, | ||
81 | {"raid6_nr", "RAID6 (N restart)", 2, 4, 6, ALGORITHM_ROTATING_N_RESTART}, | ||
82 | {"raid6_nc", "RAID6 (N continue)", 2, 4, 6, ALGORITHM_ROTATING_N_CONTINUE} | ||
83 | }; | ||
84 | |||
85 | static struct raid_type *get_raid_type(char *name) | ||
86 | { | ||
87 | int i; | ||
88 | |||
89 | for (i = 0; i < ARRAY_SIZE(raid_types); i++) | ||
90 | if (!strcmp(raid_types[i].name, name)) | ||
91 | return &raid_types[i]; | ||
92 | |||
93 | return NULL; | ||
94 | } | ||
95 | |||
96 | static struct raid_set *context_alloc(struct dm_target *ti, struct raid_type *raid_type, unsigned raid_devs) | ||
97 | { | ||
98 | unsigned i; | ||
99 | struct raid_set *rs; | ||
100 | sector_t sectors_per_dev; | ||
101 | |||
102 | if (raid_devs <= raid_type->parity_devs) { | ||
103 | ti->error = "Insufficient number of devices"; | ||
104 | return ERR_PTR(-EINVAL); | ||
105 | } | ||
106 | |||
107 | sectors_per_dev = ti->len; | ||
108 | if (sector_div(sectors_per_dev, (raid_devs - raid_type->parity_devs))) { | ||
109 | ti->error = "Target length not divisible by number of data devices"; | ||
110 | return ERR_PTR(-EINVAL); | ||
111 | } | ||
112 | |||
113 | rs = kzalloc(sizeof(*rs) + raid_devs * sizeof(rs->dev[0]), GFP_KERNEL); | ||
114 | if (!rs) { | ||
115 | ti->error = "Cannot allocate raid context"; | ||
116 | return ERR_PTR(-ENOMEM); | ||
117 | } | ||
118 | |||
119 | mddev_init(&rs->md); | ||
120 | |||
121 | rs->ti = ti; | ||
122 | rs->raid_type = raid_type; | ||
123 | rs->md.raid_disks = raid_devs; | ||
124 | rs->md.level = raid_type->level; | ||
125 | rs->md.new_level = rs->md.level; | ||
126 | rs->md.dev_sectors = sectors_per_dev; | ||
127 | rs->md.layout = raid_type->algorithm; | ||
128 | rs->md.new_layout = rs->md.layout; | ||
129 | rs->md.delta_disks = 0; | ||
130 | rs->md.recovery_cp = 0; | ||
131 | |||
132 | for (i = 0; i < raid_devs; i++) | ||
133 | md_rdev_init(&rs->dev[i].rdev); | ||
134 | |||
135 | /* | ||
136 | * Remaining items to be initialized by further RAID params: | ||
137 | * rs->md.persistent | ||
138 | * rs->md.external | ||
139 | * rs->md.chunk_sectors | ||
140 | * rs->md.new_chunk_sectors | ||
141 | */ | ||
142 | |||
143 | return rs; | ||
144 | } | ||
145 | |||
146 | static void context_free(struct raid_set *rs) | ||
147 | { | ||
148 | int i; | ||
149 | |||
150 | for (i = 0; i < rs->md.raid_disks; i++) | ||
151 | if (rs->dev[i].data_dev) | ||
152 | dm_put_device(rs->ti, rs->dev[i].data_dev); | ||
153 | |||
154 | kfree(rs); | ||
155 | } | ||
156 | |||
157 | /* | ||
158 | * For every device we have two words | ||
159 | * <meta_dev>: meta device name or '-' if missing | ||
160 | * <data_dev>: data device name or '-' if missing | ||
161 | * | ||
162 | * This code parses those words. | ||
163 | */ | ||
164 | static int dev_parms(struct raid_set *rs, char **argv) | ||
165 | { | ||
166 | int i; | ||
167 | int rebuild = 0; | ||
168 | int metadata_available = 0; | ||
169 | int ret = 0; | ||
170 | |||
171 | for (i = 0; i < rs->md.raid_disks; i++, argv += 2) { | ||
172 | rs->dev[i].rdev.raid_disk = i; | ||
173 | |||
174 | rs->dev[i].meta_dev = NULL; | ||
175 | rs->dev[i].data_dev = NULL; | ||
176 | |||
177 | /* | ||
178 | * There are no offsets, since there is a separate device | ||
179 | * for data and metadata. | ||
180 | */ | ||
181 | rs->dev[i].rdev.data_offset = 0; | ||
182 | rs->dev[i].rdev.mddev = &rs->md; | ||
183 | |||
184 | if (strcmp(argv[0], "-")) { | ||
185 | rs->ti->error = "Metadata devices not supported"; | ||
186 | return -EINVAL; | ||
187 | } | ||
188 | |||
189 | if (!strcmp(argv[1], "-")) { | ||
190 | if (!test_bit(In_sync, &rs->dev[i].rdev.flags) && | ||
191 | (!rs->dev[i].rdev.recovery_offset)) { | ||
192 | rs->ti->error = "Drive designated for rebuild not specified"; | ||
193 | return -EINVAL; | ||
194 | } | ||
195 | |||
196 | continue; | ||
197 | } | ||
198 | |||
199 | ret = dm_get_device(rs->ti, argv[1], | ||
200 | dm_table_get_mode(rs->ti->table), | ||
201 | &rs->dev[i].data_dev); | ||
202 | if (ret) { | ||
203 | rs->ti->error = "RAID device lookup failure"; | ||
204 | return ret; | ||
205 | } | ||
206 | |||
207 | rs->dev[i].rdev.bdev = rs->dev[i].data_dev->bdev; | ||
208 | list_add(&rs->dev[i].rdev.same_set, &rs->md.disks); | ||
209 | if (!test_bit(In_sync, &rs->dev[i].rdev.flags)) | ||
210 | rebuild++; | ||
211 | } | ||
212 | |||
213 | if (metadata_available) { | ||
214 | rs->md.external = 0; | ||
215 | rs->md.persistent = 1; | ||
216 | rs->md.major_version = 2; | ||
217 | } else if (rebuild && !rs->md.recovery_cp) { | ||
218 | /* | ||
219 | * Without metadata, we will not be able to tell if the array | ||
220 | * is in-sync or not - we must assume it is not. Therefore, | ||
221 | * it is impossible to rebuild a drive. | ||
222 | * | ||
223 | * Even if there is metadata, the on-disk information may | ||
224 | * indicate that the array is not in-sync and it will then | ||
225 | * fail at that time. | ||
226 | * | ||
227 | * User could specify 'nosync' option if desperate. | ||
228 | */ | ||
229 | DMERR("Unable to rebuild drive while array is not in-sync"); | ||
230 | rs->ti->error = "RAID device lookup failure"; | ||
231 | return -EINVAL; | ||
232 | } | ||
233 | |||
234 | return 0; | ||
235 | } | ||
236 | |||
237 | /* | ||
238 | * Possible arguments are... | ||
239 | * RAID456: | ||
240 | * <chunk_size> [optional_args] | ||
241 | * | ||
242 | * Optional args: | ||
243 | * [[no]sync] Force or prevent recovery of the entire array | ||
244 | * [rebuild <idx>] Rebuild the drive indicated by the index | ||
245 | * [daemon_sleep <ms>] Time between bitmap daemon work to clear bits | ||
246 | * [min_recovery_rate <kB/sec/disk>] Throttle RAID initialization | ||
247 | * [max_recovery_rate <kB/sec/disk>] Throttle RAID initialization | ||
248 | * [max_write_behind <sectors>] See '-write-behind=' (man mdadm) | ||
249 | * [stripe_cache <sectors>] Stripe cache size for higher RAIDs | ||
250 | */ | ||
251 | static int parse_raid_params(struct raid_set *rs, char **argv, | ||
252 | unsigned num_raid_params) | ||
253 | { | ||
254 | unsigned i, rebuild_cnt = 0; | ||
255 | unsigned long value; | ||
256 | char *key; | ||
257 | |||
258 | /* | ||
259 | * First, parse the in-order required arguments | ||
260 | */ | ||
261 | if ((strict_strtoul(argv[0], 10, &value) < 0) || | ||
262 | !is_power_of_2(value) || (value < 8)) { | ||
263 | rs->ti->error = "Bad chunk size"; | ||
264 | return -EINVAL; | ||
265 | } | ||
266 | |||
267 | rs->md.new_chunk_sectors = rs->md.chunk_sectors = value; | ||
268 | argv++; | ||
269 | num_raid_params--; | ||
270 | |||
271 | /* | ||
272 | * Second, parse the unordered optional arguments | ||
273 | */ | ||
274 | for (i = 0; i < rs->md.raid_disks; i++) | ||
275 | set_bit(In_sync, &rs->dev[i].rdev.flags); | ||
276 | |||
277 | for (i = 0; i < num_raid_params; i++) { | ||
278 | if (!strcmp(argv[i], "nosync")) { | ||
279 | rs->md.recovery_cp = MaxSector; | ||
280 | rs->print_flags |= DMPF_NOSYNC; | ||
281 | rs->md.flags |= MD_SYNC_STATE_FORCED; | ||
282 | continue; | ||
283 | } | ||
284 | if (!strcmp(argv[i], "sync")) { | ||
285 | rs->md.recovery_cp = 0; | ||
286 | rs->print_flags |= DMPF_SYNC; | ||
287 | rs->md.flags |= MD_SYNC_STATE_FORCED; | ||
288 | continue; | ||
289 | } | ||
290 | |||
291 | /* The rest of the optional arguments come in key/value pairs */ | ||
292 | if ((i + 1) >= num_raid_params) { | ||
293 | rs->ti->error = "Wrong number of raid parameters given"; | ||
294 | return -EINVAL; | ||
295 | } | ||
296 | |||
297 | key = argv[i++]; | ||
298 | if (strict_strtoul(argv[i], 10, &value) < 0) { | ||
299 | rs->ti->error = "Bad numerical argument given in raid params"; | ||
300 | return -EINVAL; | ||
301 | } | ||
302 | |||
303 | if (!strcmp(key, "rebuild")) { | ||
304 | if (++rebuild_cnt > rs->raid_type->parity_devs) { | ||
305 | rs->ti->error = "Too many rebuild drives given"; | ||
306 | return -EINVAL; | ||
307 | } | ||
308 | if (value >= rs->md.raid_disks) { | ||
309 | rs->ti->error = "Invalid rebuild index given"; | ||
310 | return -EINVAL; | ||
311 | } | ||
312 | clear_bit(In_sync, &rs->dev[value].rdev.flags); | ||
313 | rs->dev[value].rdev.recovery_offset = 0; | ||
314 | } else if (!strcmp(key, "max_write_behind")) { | ||
315 | rs->print_flags |= DMPF_MAX_WRITE_BEHIND; | ||
316 | |||
317 | /* | ||
318 | * In device-mapper, we specify things in sectors, but | ||
319 | * MD records this value in kB | ||
320 | */ | ||
321 | value /= 2; | ||
322 | if (value > COUNTER_MAX) { | ||
323 | rs->ti->error = "Max write-behind limit out of range"; | ||
324 | return -EINVAL; | ||
325 | } | ||
326 | rs->md.bitmap_info.max_write_behind = value; | ||
327 | } else if (!strcmp(key, "daemon_sleep")) { | ||
328 | rs->print_flags |= DMPF_DAEMON_SLEEP; | ||
329 | if (!value || (value > MAX_SCHEDULE_TIMEOUT)) { | ||
330 | rs->ti->error = "daemon sleep period out of range"; | ||
331 | return -EINVAL; | ||
332 | } | ||
333 | rs->md.bitmap_info.daemon_sleep = value; | ||
334 | } else if (!strcmp(key, "stripe_cache")) { | ||
335 | rs->print_flags |= DMPF_STRIPE_CACHE; | ||
336 | |||
337 | /* | ||
338 | * In device-mapper, we specify things in sectors, but | ||
339 | * MD records this value in kB | ||
340 | */ | ||
341 | value /= 2; | ||
342 | |||
343 | if (rs->raid_type->level < 5) { | ||
344 | rs->ti->error = "Inappropriate argument: stripe_cache"; | ||
345 | return -EINVAL; | ||
346 | } | ||
347 | if (raid5_set_cache_size(&rs->md, (int)value)) { | ||
348 | rs->ti->error = "Bad stripe_cache size"; | ||
349 | return -EINVAL; | ||
350 | } | ||
351 | } else if (!strcmp(key, "min_recovery_rate")) { | ||
352 | rs->print_flags |= DMPF_MIN_RECOVERY_RATE; | ||
353 | if (value > INT_MAX) { | ||
354 | rs->ti->error = "min_recovery_rate out of range"; | ||
355 | return -EINVAL; | ||
356 | } | ||
357 | rs->md.sync_speed_min = (int)value; | ||
358 | } else if (!strcmp(key, "max_recovery_rate")) { | ||
359 | rs->print_flags |= DMPF_MAX_RECOVERY_RATE; | ||
360 | if (value > INT_MAX) { | ||
361 | rs->ti->error = "max_recovery_rate out of range"; | ||
362 | return -EINVAL; | ||
363 | } | ||
364 | rs->md.sync_speed_max = (int)value; | ||
365 | } else { | ||
366 | DMERR("Unable to parse RAID parameter: %s", key); | ||
367 | rs->ti->error = "Unable to parse RAID parameters"; | ||
368 | return -EINVAL; | ||
369 | } | ||
370 | } | ||
371 | |||
372 | /* Assume there are no metadata devices until the drives are parsed */ | ||
373 | rs->md.persistent = 0; | ||
374 | rs->md.external = 1; | ||
375 | |||
376 | return 0; | ||
377 | } | ||
378 | |||
379 | static void do_table_event(struct work_struct *ws) | ||
380 | { | ||
381 | struct raid_set *rs = container_of(ws, struct raid_set, md.event_work); | ||
382 | |||
383 | dm_table_event(rs->ti->table); | ||
384 | } | ||
385 | |||
386 | static int raid_is_congested(struct dm_target_callbacks *cb, int bits) | ||
387 | { | ||
388 | struct raid_set *rs = container_of(cb, struct raid_set, callbacks); | ||
389 | |||
390 | return md_raid5_congested(&rs->md, bits); | ||
391 | } | ||
392 | |||
393 | /* | ||
394 | * Construct a RAID4/5/6 mapping: | ||
395 | * Args: | ||
396 | * <raid_type> <#raid_params> <raid_params> \ | ||
397 | * <#raid_devs> { <meta_dev1> <dev1> .. <meta_devN> <devN> } | ||
398 | * | ||
399 | * ** metadata devices are not supported yet, use '-' instead ** | ||
400 | * | ||
401 | * <raid_params> varies by <raid_type>. See 'parse_raid_params' for | ||
402 | * details on possible <raid_params>. | ||
403 | */ | ||
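A hedged example of how a complete constructor line might look in a device-mapper table, written here as a comment; the raid5_la type name, the target length, and the device paths are assumptions for illustration only:

    /*
     * 0 <len> raid raid5_la 2 128 nosync 3 - /dev/sdb1 - /dev/sdc1 - /dev/sdd1
     *
     * <raid_type>     = raid5_la
     * <#raid_params>  = 2        ("128 nosync": chunk size plus one option)
     * <#raid_devs>    = 3        followed by <meta_dev> <dev> pairs, with '-'
     *                            in the unsupported metadata slots
     */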
404 | static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv) | ||
405 | { | ||
406 | int ret; | ||
407 | struct raid_type *rt; | ||
408 | unsigned long num_raid_params, num_raid_devs; | ||
409 | struct raid_set *rs = NULL; | ||
410 | |||
411 | /* Must have at least <raid_type> <#raid_params> */ | ||
412 | if (argc < 2) { | ||
413 | ti->error = "Too few arguments"; | ||
414 | return -EINVAL; | ||
415 | } | ||
416 | |||
417 | /* raid type */ | ||
418 | rt = get_raid_type(argv[0]); | ||
419 | if (!rt) { | ||
420 | ti->error = "Unrecognised raid_type"; | ||
421 | return -EINVAL; | ||
422 | } | ||
423 | argc--; | ||
424 | argv++; | ||
425 | |||
426 | /* number of RAID parameters */ | ||
427 | if (strict_strtoul(argv[0], 10, &num_raid_params) < 0) { | ||
428 | ti->error = "Cannot understand number of RAID parameters"; | ||
429 | return -EINVAL; | ||
430 | } | ||
431 | argc--; | ||
432 | argv++; | ||
433 | |||
434 | /* Skip over RAID params for now and find out # of devices */ | ||
435 | if (num_raid_params + 1 > argc) { | ||
436 | ti->error = "Arguments do not agree with counts given"; | ||
437 | return -EINVAL; | ||
438 | } | ||
439 | |||
440 | if ((strict_strtoul(argv[num_raid_params], 10, &num_raid_devs) < 0) || | ||
441 | (num_raid_devs >= INT_MAX)) { | ||
442 | ti->error = "Cannot understand number of raid devices"; | ||
443 | return -EINVAL; | ||
444 | } | ||
445 | |||
446 | rs = context_alloc(ti, rt, (unsigned)num_raid_devs); | ||
447 | if (IS_ERR(rs)) | ||
448 | return PTR_ERR(rs); | ||
449 | |||
450 | ret = parse_raid_params(rs, argv, (unsigned)num_raid_params); | ||
451 | if (ret) | ||
452 | goto bad; | ||
453 | |||
454 | ret = -EINVAL; | ||
455 | |||
456 | argc -= num_raid_params + 1; /* +1: we already have num_raid_devs */ | ||
457 | argv += num_raid_params + 1; | ||
458 | |||
459 | if (argc != (num_raid_devs * 2)) { | ||
460 | ti->error = "Supplied RAID devices does not match the count given"; | ||
461 | goto bad; | ||
462 | } | ||
463 | |||
464 | ret = dev_parms(rs, argv); | ||
465 | if (ret) | ||
466 | goto bad; | ||
467 | |||
468 | INIT_WORK(&rs->md.event_work, do_table_event); | ||
469 | ti->split_io = rs->md.chunk_sectors; | ||
470 | ti->private = rs; | ||
471 | |||
472 | mutex_lock(&rs->md.reconfig_mutex); | ||
473 | ret = md_run(&rs->md); | ||
474 | rs->md.in_sync = 0; /* Assume already marked dirty */ | ||
475 | mutex_unlock(&rs->md.reconfig_mutex); | ||
476 | |||
477 | if (ret) { | ||
478 | ti->error = "Failed to run raid array"; | ||
479 | goto bad; | ||
480 | } | ||
481 | |||
482 | rs->callbacks.congested_fn = raid_is_congested; | ||
483 | dm_table_add_target_callbacks(ti->table, &rs->callbacks); | ||
484 | |||
485 | return 0; | ||
486 | |||
487 | bad: | ||
488 | context_free(rs); | ||
489 | |||
490 | return ret; | ||
491 | } | ||
492 | |||
493 | static void raid_dtr(struct dm_target *ti) | ||
494 | { | ||
495 | struct raid_set *rs = ti->private; | ||
496 | |||
497 | list_del_init(&rs->callbacks.list); | ||
498 | md_stop(&rs->md); | ||
499 | context_free(rs); | ||
500 | } | ||
501 | |||
502 | static int raid_map(struct dm_target *ti, struct bio *bio, union map_info *map_context) | ||
503 | { | ||
504 | struct raid_set *rs = ti->private; | ||
505 | mddev_t *mddev = &rs->md; | ||
506 | |||
507 | mddev->pers->make_request(mddev, bio); | ||
508 | |||
509 | return DM_MAPIO_SUBMITTED; | ||
510 | } | ||
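A brief note on the return value, since the contract is easy to miss: DM_MAPIO_SUBMITTED tells the core that the target has already dispatched the bio, which is exactly what handing it to MD's make_request does. A sketch of the contrast (not part of this patch):

    /*
     * A purely remapping target would instead do something like:
     *
     *     bio->bi_bdev = some_underlying_bdev;
     *     return DM_MAPIO_REMAPPED;    // dm core resubmits the bio
     *
     * raid_map() must not do that: MD already owns the bio after
     * make_request(), so returning DM_MAPIO_SUBMITTED keeps the core
     * from issuing it a second time.
     */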
511 | |||
512 | static int raid_status(struct dm_target *ti, status_type_t type, | ||
513 | char *result, unsigned maxlen) | ||
514 | { | ||
515 | struct raid_set *rs = ti->private; | ||
516 | unsigned raid_param_cnt = 1; /* at least 1 for chunksize */ | ||
517 | unsigned sz = 0; | ||
518 | int i; | ||
519 | sector_t sync; | ||
520 | |||
521 | switch (type) { | ||
522 | case STATUSTYPE_INFO: | ||
523 | DMEMIT("%s %d ", rs->raid_type->name, rs->md.raid_disks); | ||
524 | |||
525 | for (i = 0; i < rs->md.raid_disks; i++) { | ||
526 | if (test_bit(Faulty, &rs->dev[i].rdev.flags)) | ||
527 | DMEMIT("D"); | ||
528 | else if (test_bit(In_sync, &rs->dev[i].rdev.flags)) | ||
529 | DMEMIT("A"); | ||
530 | else | ||
531 | DMEMIT("a"); | ||
532 | } | ||
533 | |||
534 | if (test_bit(MD_RECOVERY_RUNNING, &rs->md.recovery)) | ||
535 | sync = rs->md.curr_resync_completed; | ||
536 | else | ||
537 | sync = rs->md.recovery_cp; | ||
538 | |||
539 | if (sync > rs->md.resync_max_sectors) | ||
540 | sync = rs->md.resync_max_sectors; | ||
541 | |||
542 | DMEMIT(" %llu/%llu", | ||
543 | (unsigned long long) sync, | ||
544 | (unsigned long long) rs->md.resync_max_sectors); | ||
545 | |||
546 | break; | ||
547 | case STATUSTYPE_TABLE: | ||
548 | /* The string you would use to construct this array */ | ||
549 | for (i = 0; i < rs->md.raid_disks; i++) | ||
550 | if (rs->dev[i].data_dev && | ||
551 | !test_bit(In_sync, &rs->dev[i].rdev.flags)) | ||
552 | raid_param_cnt++; /* for rebuilds */ | ||
553 | |||
554 | raid_param_cnt += (hweight64(rs->print_flags) * 2); | ||
555 | if (rs->print_flags & (DMPF_SYNC | DMPF_NOSYNC)) | ||
556 | raid_param_cnt--; | ||
557 | |||
558 | DMEMIT("%s %u %u", rs->raid_type->name, | ||
559 | raid_param_cnt, rs->md.chunk_sectors); | ||
560 | |||
561 | if ((rs->print_flags & DMPF_SYNC) && | ||
562 | (rs->md.recovery_cp == MaxSector)) | ||
563 | DMEMIT(" sync"); | ||
564 | if (rs->print_flags & DMPF_NOSYNC) | ||
565 | DMEMIT(" nosync"); | ||
566 | |||
567 | for (i = 0; i < rs->md.raid_disks; i++) | ||
568 | if (rs->dev[i].data_dev && | ||
569 | !test_bit(In_sync, &rs->dev[i].rdev.flags)) | ||
570 | DMEMIT(" rebuild %u", i); | ||
571 | |||
572 | if (rs->print_flags & DMPF_DAEMON_SLEEP) | ||
573 | DMEMIT(" daemon_sleep %lu", | ||
574 | rs->md.bitmap_info.daemon_sleep); | ||
575 | |||
576 | if (rs->print_flags & DMPF_MIN_RECOVERY_RATE) | ||
577 | DMEMIT(" min_recovery_rate %d", rs->md.sync_speed_min); | ||
578 | |||
579 | if (rs->print_flags & DMPF_MAX_RECOVERY_RATE) | ||
580 | DMEMIT(" max_recovery_rate %d", rs->md.sync_speed_max); | ||
581 | |||
582 | if (rs->print_flags & DMPF_MAX_WRITE_BEHIND) | ||
583 | DMEMIT(" max_write_behind %lu", | ||
584 | rs->md.bitmap_info.max_write_behind); | ||
585 | |||
586 | if (rs->print_flags & DMPF_STRIPE_CACHE) { | ||
587 | raid5_conf_t *conf = rs->md.private; | ||
588 | |||
589 | /* convert from kiB to sectors */ | ||
590 | DMEMIT(" stripe_cache %d", | ||
591 | conf ? conf->max_nr_stripes * 2 : 0); | ||
592 | } | ||
593 | |||
594 | DMEMIT(" %d", rs->md.raid_disks); | ||
595 | for (i = 0; i < rs->md.raid_disks; i++) { | ||
596 | DMEMIT(" -"); /* metadata device */ | ||
597 | |||
598 | if (rs->dev[i].data_dev) | ||
599 | DMEMIT(" %s", rs->dev[i].data_dev->name); | ||
600 | else | ||
601 | DMEMIT(" -"); | ||
602 | } | ||
603 | } | ||
604 | |||
605 | return 0; | ||
606 | } | ||
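For readers new to dm status output, an illustrative rendering of what the two cases above would emit for a healthy, fully synced three-drive set; the type name, sizes, and device numbers are invented:

    /*
     * STATUSTYPE_INFO:   "raid5_la 3 AAA 524288/524288"
     *   type, #disks, one char per disk (A = in sync, a = resyncing,
     *   D = faulty), then resync position / resync_max_sectors.
     *
     * STATUSTYPE_TABLE:  "raid5_la 1 128 3 - 8:17 - 8:33 - 8:49"
     *   the constructor string: type, raid_param_cnt (1 = chunk size only),
     *   chunk sectors, #disks, then the metadata/data device pairs.
     */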
607 | |||
608 | static int raid_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data) | ||
609 | { | ||
610 | struct raid_set *rs = ti->private; | ||
611 | unsigned i; | ||
612 | int ret = 0; | ||
613 | |||
614 | for (i = 0; !ret && i < rs->md.raid_disks; i++) | ||
615 | if (rs->dev[i].data_dev) | ||
616 | ret = fn(ti, | ||
617 | rs->dev[i].data_dev, | ||
618 | 0, /* No offset on data devs */ | ||
619 | rs->md.dev_sectors, | ||
620 | data); | ||
621 | |||
622 | return ret; | ||
623 | } | ||
624 | |||
625 | static void raid_io_hints(struct dm_target *ti, struct queue_limits *limits) | ||
626 | { | ||
627 | struct raid_set *rs = ti->private; | ||
628 | unsigned chunk_size = rs->md.chunk_sectors << 9; | ||
629 | raid5_conf_t *conf = rs->md.private; | ||
630 | |||
631 | blk_limits_io_min(limits, chunk_size); | ||
632 | blk_limits_io_opt(limits, chunk_size * (conf->raid_disks - conf->max_degraded)); | ||
633 | } | ||
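The hint arithmetic is easy to sanity-check by hand; a sketch assuming a four-drive RAID5 with a 64 KiB chunk (128 sectors):

    /*
     * chunk_size = 128 << 9              = 65536 bytes  -> io_min
     * io_opt     = 65536 * (4 - 1)       = 196608 bytes (one full data stripe)
     *
     * With RAID6 (max_degraded == 2) a six-drive set and the same chunk
     * would advertise io_opt = 65536 * 4 = 262144 bytes.
     */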
634 | |||
635 | static void raid_presuspend(struct dm_target *ti) | ||
636 | { | ||
637 | struct raid_set *rs = ti->private; | ||
638 | |||
639 | md_stop_writes(&rs->md); | ||
640 | } | ||
641 | |||
642 | static void raid_postsuspend(struct dm_target *ti) | ||
643 | { | ||
644 | struct raid_set *rs = ti->private; | ||
645 | |||
646 | mddev_suspend(&rs->md); | ||
647 | } | ||
648 | |||
649 | static void raid_resume(struct dm_target *ti) | ||
650 | { | ||
651 | struct raid_set *rs = ti->private; | ||
652 | |||
653 | mddev_resume(&rs->md); | ||
654 | } | ||
655 | |||
656 | static struct target_type raid_target = { | ||
657 | .name = "raid", | ||
658 | .version = {1, 0, 0}, | ||
659 | .module = THIS_MODULE, | ||
660 | .ctr = raid_ctr, | ||
661 | .dtr = raid_dtr, | ||
662 | .map = raid_map, | ||
663 | .status = raid_status, | ||
664 | .iterate_devices = raid_iterate_devices, | ||
665 | .io_hints = raid_io_hints, | ||
666 | .presuspend = raid_presuspend, | ||
667 | .postsuspend = raid_postsuspend, | ||
668 | .resume = raid_resume, | ||
669 | }; | ||
670 | |||
671 | static int __init dm_raid_init(void) | ||
672 | { | ||
673 | return dm_register_target(&raid_target); | ||
674 | } | ||
675 | |||
676 | static void __exit dm_raid_exit(void) | ||
677 | { | ||
678 | dm_unregister_target(&raid_target); | ||
679 | } | ||
680 | |||
681 | module_init(dm_raid_init); | ||
682 | module_exit(dm_raid_exit); | ||
683 | |||
684 | MODULE_DESCRIPTION(DM_NAME " raid4/5/6 target"); | ||
685 | MODULE_ALIAS("dm-raid4"); | ||
686 | MODULE_ALIAS("dm-raid5"); | ||
687 | MODULE_ALIAS("dm-raid6"); | ||
688 | MODULE_AUTHOR("Neil Brown <dm-devel@redhat.com>"); | ||
689 | MODULE_LICENSE("GPL"); | ||
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index 7c081bcbc3cf..9bfd057be686 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c | |||
@@ -22,8 +22,6 @@ | |||
22 | #define DM_MSG_PREFIX "raid1" | 22 | #define DM_MSG_PREFIX "raid1" |
23 | 23 | ||
24 | #define MAX_RECOVERY 1 /* Maximum number of regions recovered in parallel. */ | 24 | #define MAX_RECOVERY 1 /* Maximum number of regions recovered in parallel. */ |
25 | #define DM_IO_PAGES 64 | ||
26 | #define DM_KCOPYD_PAGES 64 | ||
27 | 25 | ||
28 | #define DM_RAID1_HANDLE_ERRORS 0x01 | 26 | #define DM_RAID1_HANDLE_ERRORS 0x01 |
29 | #define errors_handled(p) ((p)->features & DM_RAID1_HANDLE_ERRORS) | 27 | #define errors_handled(p) ((p)->features & DM_RAID1_HANDLE_ERRORS) |
@@ -259,9 +257,9 @@ static int mirror_flush(struct dm_target *ti) | |||
259 | struct dm_io_region io[ms->nr_mirrors]; | 257 | struct dm_io_region io[ms->nr_mirrors]; |
260 | struct mirror *m; | 258 | struct mirror *m; |
261 | struct dm_io_request io_req = { | 259 | struct dm_io_request io_req = { |
262 | .bi_rw = WRITE_BARRIER, | 260 | .bi_rw = WRITE_FLUSH, |
263 | .mem.type = DM_IO_KMEM, | 261 | .mem.type = DM_IO_KMEM, |
264 | .mem.ptr.bvec = NULL, | 262 | .mem.ptr.addr = NULL, |
265 | .client = ms->io_client, | 263 | .client = ms->io_client, |
266 | }; | 264 | }; |
267 | 265 | ||
@@ -629,7 +627,7 @@ static void do_write(struct mirror_set *ms, struct bio *bio) | |||
629 | struct dm_io_region io[ms->nr_mirrors], *dest = io; | 627 | struct dm_io_region io[ms->nr_mirrors], *dest = io; |
630 | struct mirror *m; | 628 | struct mirror *m; |
631 | struct dm_io_request io_req = { | 629 | struct dm_io_request io_req = { |
632 | .bi_rw = WRITE | (bio->bi_rw & WRITE_BARRIER), | 630 | .bi_rw = WRITE | (bio->bi_rw & WRITE_FLUSH_FUA), |
633 | .mem.type = DM_IO_BVEC, | 631 | .mem.type = DM_IO_BVEC, |
634 | .mem.ptr.bvec = bio->bi_io_vec + bio->bi_idx, | 632 | .mem.ptr.bvec = bio->bi_io_vec + bio->bi_idx, |
635 | .notify.fn = write_callback, | 633 | .notify.fn = write_callback, |
@@ -637,6 +635,12 @@ static void do_write(struct mirror_set *ms, struct bio *bio) | |||
637 | .client = ms->io_client, | 635 | .client = ms->io_client, |
638 | }; | 636 | }; |
639 | 637 | ||
638 | if (bio->bi_rw & REQ_DISCARD) { | ||
639 | io_req.bi_rw |= REQ_DISCARD; | ||
640 | io_req.mem.type = DM_IO_KMEM; | ||
641 | io_req.mem.ptr.addr = NULL; | ||
642 | } | ||
643 | |||
640 | for (i = 0, m = ms->mirror; i < ms->nr_mirrors; i++, m++) | 644 | for (i = 0, m = ms->mirror; i < ms->nr_mirrors; i++, m++) |
641 | map_region(dest++, m, bio); | 645 | map_region(dest++, m, bio); |
642 | 646 | ||
@@ -670,7 +674,8 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes) | |||
670 | bio_list_init(&requeue); | 674 | bio_list_init(&requeue); |
671 | 675 | ||
672 | while ((bio = bio_list_pop(writes))) { | 676 | while ((bio = bio_list_pop(writes))) { |
673 | if (unlikely(bio_empty_barrier(bio))) { | 677 | if ((bio->bi_rw & REQ_FLUSH) || |
678 | (bio->bi_rw & REQ_DISCARD)) { | ||
674 | bio_list_add(&sync, bio); | 679 | bio_list_add(&sync, bio); |
675 | continue; | 680 | continue; |
676 | } | 681 | } |
@@ -835,8 +840,6 @@ static void do_mirror(struct work_struct *work) | |||
835 | do_reads(ms, &reads); | 840 | do_reads(ms, &reads); |
836 | do_writes(ms, &writes); | 841 | do_writes(ms, &writes); |
837 | do_failures(ms, &failures); | 842 | do_failures(ms, &failures); |
838 | |||
839 | dm_table_unplug_all(ms->ti->table); | ||
840 | } | 843 | } |
841 | 844 | ||
842 | /*----------------------------------------------------------------- | 845 | /*----------------------------------------------------------------- |
@@ -882,7 +885,7 @@ static struct mirror_set *alloc_context(unsigned int nr_mirrors, | |||
882 | return NULL; | 885 | return NULL; |
883 | } | 886 | } |
884 | 887 | ||
885 | ms->io_client = dm_io_client_create(DM_IO_PAGES); | 888 | ms->io_client = dm_io_client_create(); |
886 | if (IS_ERR(ms->io_client)) { | 889 | if (IS_ERR(ms->io_client)) { |
887 | ti->error = "Error creating dm_io client"; | 890 | ti->error = "Error creating dm_io client"; |
888 | mempool_destroy(ms->read_record_pool); | 891 | mempool_destroy(ms->read_record_pool); |
@@ -1076,8 +1079,10 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
1076 | ti->private = ms; | 1079 | ti->private = ms; |
1077 | ti->split_io = dm_rh_get_region_size(ms->rh); | 1080 | ti->split_io = dm_rh_get_region_size(ms->rh); |
1078 | ti->num_flush_requests = 1; | 1081 | ti->num_flush_requests = 1; |
1082 | ti->num_discard_requests = 1; | ||
1079 | 1083 | ||
1080 | ms->kmirrord_wq = create_singlethread_workqueue("kmirrord"); | 1084 | ms->kmirrord_wq = alloc_workqueue("kmirrord", |
1085 | WQ_NON_REENTRANT | WQ_MEM_RECLAIM, 0); | ||
1081 | if (!ms->kmirrord_wq) { | 1086 | if (!ms->kmirrord_wq) { |
1082 | DMERR("couldn't start kmirrord"); | 1087 | DMERR("couldn't start kmirrord"); |
1083 | r = -ENOMEM; | 1088 | r = -ENOMEM; |
@@ -1110,9 +1115,11 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
1110 | goto err_destroy_wq; | 1115 | goto err_destroy_wq; |
1111 | } | 1116 | } |
1112 | 1117 | ||
1113 | r = dm_kcopyd_client_create(DM_KCOPYD_PAGES, &ms->kcopyd_client); | 1118 | ms->kcopyd_client = dm_kcopyd_client_create(); |
1114 | if (r) | 1119 | if (IS_ERR(ms->kcopyd_client)) { |
1120 | r = PTR_ERR(ms->kcopyd_client); | ||
1115 | goto err_destroy_wq; | 1121 | goto err_destroy_wq; |
1122 | } | ||
1116 | 1123 | ||
1117 | wakeup_mirrord(ms); | 1124 | wakeup_mirrord(ms); |
1118 | return 0; | 1125 | return 0; |
@@ -1130,7 +1137,7 @@ static void mirror_dtr(struct dm_target *ti) | |||
1130 | 1137 | ||
1131 | del_timer_sync(&ms->timer); | 1138 | del_timer_sync(&ms->timer); |
1132 | flush_workqueue(ms->kmirrord_wq); | 1139 | flush_workqueue(ms->kmirrord_wq); |
1133 | flush_scheduled_work(); | 1140 | flush_work_sync(&ms->trigger_event); |
1134 | dm_kcopyd_client_destroy(ms->kcopyd_client); | 1141 | dm_kcopyd_client_destroy(ms->kcopyd_client); |
1135 | destroy_workqueue(ms->kmirrord_wq); | 1142 | destroy_workqueue(ms->kmirrord_wq); |
1136 | free_context(ms, ti, ms->nr_mirrors); | 1143 | free_context(ms, ti, ms->nr_mirrors); |
@@ -1203,7 +1210,7 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio, | |||
1203 | * We need to dec pending if this was a write. | 1210 | * We need to dec pending if this was a write. |
1204 | */ | 1211 | */ |
1205 | if (rw == WRITE) { | 1212 | if (rw == WRITE) { |
1206 | if (likely(!bio_empty_barrier(bio))) | 1213 | if (!(bio->bi_rw & REQ_FLUSH)) |
1207 | dm_rh_dec(ms->rh, map_context->ll); | 1214 | dm_rh_dec(ms->rh, map_context->ll); |
1208 | return error; | 1215 | return error; |
1209 | } | 1216 | } |
@@ -1406,7 +1413,7 @@ static int mirror_iterate_devices(struct dm_target *ti, | |||
1406 | 1413 | ||
1407 | static struct target_type mirror_target = { | 1414 | static struct target_type mirror_target = { |
1408 | .name = "mirror", | 1415 | .name = "mirror", |
1409 | .version = {1, 12, 0}, | 1416 | .version = {1, 12, 1}, |
1410 | .module = THIS_MODULE, | 1417 | .module = THIS_MODULE, |
1411 | .ctr = mirror_ctr, | 1418 | .ctr = mirror_ctr, |
1412 | .dtr = mirror_dtr, | 1419 | .dtr = mirror_dtr, |
diff --git a/drivers/md/dm-region-hash.c b/drivers/md/dm-region-hash.c index bd5c58b28868..7771ed212182 100644 --- a/drivers/md/dm-region-hash.c +++ b/drivers/md/dm-region-hash.c | |||
@@ -81,9 +81,9 @@ struct dm_region_hash { | |||
81 | struct list_head failed_recovered_regions; | 81 | struct list_head failed_recovered_regions; |
82 | 82 | ||
83 | /* | 83 | /* |
84 | * If there was a barrier failure no regions can be marked clean. | 84 | * If there was a flush failure no regions can be marked clean. |
85 | */ | 85 | */ |
86 | int barrier_failure; | 86 | int flush_failure; |
87 | 87 | ||
88 | void *context; | 88 | void *context; |
89 | sector_t target_begin; | 89 | sector_t target_begin; |
@@ -217,7 +217,7 @@ struct dm_region_hash *dm_region_hash_create( | |||
217 | INIT_LIST_HEAD(&rh->quiesced_regions); | 217 | INIT_LIST_HEAD(&rh->quiesced_regions); |
218 | INIT_LIST_HEAD(&rh->recovered_regions); | 218 | INIT_LIST_HEAD(&rh->recovered_regions); |
219 | INIT_LIST_HEAD(&rh->failed_recovered_regions); | 219 | INIT_LIST_HEAD(&rh->failed_recovered_regions); |
220 | rh->barrier_failure = 0; | 220 | rh->flush_failure = 0; |
221 | 221 | ||
222 | rh->region_pool = mempool_create_kmalloc_pool(MIN_REGIONS, | 222 | rh->region_pool = mempool_create_kmalloc_pool(MIN_REGIONS, |
223 | sizeof(struct dm_region)); | 223 | sizeof(struct dm_region)); |
@@ -399,8 +399,8 @@ void dm_rh_mark_nosync(struct dm_region_hash *rh, struct bio *bio) | |||
399 | region_t region = dm_rh_bio_to_region(rh, bio); | 399 | region_t region = dm_rh_bio_to_region(rh, bio); |
400 | int recovering = 0; | 400 | int recovering = 0; |
401 | 401 | ||
402 | if (bio_empty_barrier(bio)) { | 402 | if (bio->bi_rw & REQ_FLUSH) { |
403 | rh->barrier_failure = 1; | 403 | rh->flush_failure = 1; |
404 | return; | 404 | return; |
405 | } | 405 | } |
406 | 406 | ||
@@ -419,7 +419,7 @@ void dm_rh_mark_nosync(struct dm_region_hash *rh, struct bio *bio) | |||
419 | /* | 419 | /* |
420 | * Possible cases: | 420 | * Possible cases: |
421 | * 1) DM_RH_DIRTY | 421 | * 1) DM_RH_DIRTY |
422 | * 2) DM_RH_NOSYNC: was dirty, other preceeding writes failed | 422 | * 2) DM_RH_NOSYNC: was dirty, other preceding writes failed |
423 | * 3) DM_RH_RECOVERING: flushing pending writes | 423 | * 3) DM_RH_RECOVERING: flushing pending writes |
424 | * Either case, the region should have not been connected to list. | 424 | * Either case, the region should have not been connected to list. |
425 | */ | 425 | */ |
@@ -524,7 +524,7 @@ void dm_rh_inc_pending(struct dm_region_hash *rh, struct bio_list *bios) | |||
524 | struct bio *bio; | 524 | struct bio *bio; |
525 | 525 | ||
526 | for (bio = bios->head; bio; bio = bio->bi_next) { | 526 | for (bio = bios->head; bio; bio = bio->bi_next) { |
527 | if (bio_empty_barrier(bio)) | 527 | if (bio->bi_rw & REQ_FLUSH) |
528 | continue; | 528 | continue; |
529 | rh_inc(rh, dm_rh_bio_to_region(rh, bio)); | 529 | rh_inc(rh, dm_rh_bio_to_region(rh, bio)); |
530 | } | 530 | } |
@@ -555,9 +555,9 @@ void dm_rh_dec(struct dm_region_hash *rh, region_t region) | |||
555 | */ | 555 | */ |
556 | 556 | ||
557 | /* do nothing for DM_RH_NOSYNC */ | 557 | /* do nothing for DM_RH_NOSYNC */ |
558 | if (unlikely(rh->barrier_failure)) { | 558 | if (unlikely(rh->flush_failure)) { |
559 | /* | 559 | /* |
560 | * If a write barrier failed some time ago, we | 560 | * If a write flush failed some time ago, we |
561 | * don't know whether or not this write made it | 561 | * don't know whether or not this write made it |
562 | * to the disk, so we must resync the device. | 562 | * to the disk, so we must resync the device. |
563 | */ | 563 | */ |
diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c index cc2bdb83f9ad..135c2f1fdbfc 100644 --- a/drivers/md/dm-snap-persistent.c +++ b/drivers/md/dm-snap-persistent.c | |||
@@ -154,11 +154,6 @@ struct pstore { | |||
154 | struct workqueue_struct *metadata_wq; | 154 | struct workqueue_struct *metadata_wq; |
155 | }; | 155 | }; |
156 | 156 | ||
157 | static unsigned sectors_to_pages(unsigned sectors) | ||
158 | { | ||
159 | return DIV_ROUND_UP(sectors, PAGE_SIZE >> 9); | ||
160 | } | ||
161 | |||
162 | static int alloc_area(struct pstore *ps) | 157 | static int alloc_area(struct pstore *ps) |
163 | { | 158 | { |
164 | int r = -ENOMEM; | 159 | int r = -ENOMEM; |
@@ -254,9 +249,9 @@ static int chunk_io(struct pstore *ps, void *area, chunk_t chunk, int rw, | |||
254 | * Issue the synchronous I/O from a different thread | 249 | * Issue the synchronous I/O from a different thread |
255 | * to avoid generic_make_request recursion. | 250 | * to avoid generic_make_request recursion. |
256 | */ | 251 | */ |
257 | INIT_WORK_ON_STACK(&req.work, do_metadata); | 252 | INIT_WORK_ONSTACK(&req.work, do_metadata); |
258 | queue_work(ps->metadata_wq, &req.work); | 253 | queue_work(ps->metadata_wq, &req.work); |
259 | flush_workqueue(ps->metadata_wq); | 254 | flush_work(&req.work); |
260 | 255 | ||
261 | return req.result; | 256 | return req.result; |
262 | } | 257 | } |
@@ -318,8 +313,7 @@ static int read_header(struct pstore *ps, int *new_snapshot) | |||
318 | chunk_size_supplied = 0; | 313 | chunk_size_supplied = 0; |
319 | } | 314 | } |
320 | 315 | ||
321 | ps->io_client = dm_io_client_create(sectors_to_pages(ps->store-> | 316 | ps->io_client = dm_io_client_create(); |
322 | chunk_size)); | ||
323 | if (IS_ERR(ps->io_client)) | 317 | if (IS_ERR(ps->io_client)) |
324 | return PTR_ERR(ps->io_client); | 318 | return PTR_ERR(ps->io_client); |
325 | 319 | ||
@@ -368,11 +362,6 @@ static int read_header(struct pstore *ps, int *new_snapshot) | |||
368 | return r; | 362 | return r; |
369 | } | 363 | } |
370 | 364 | ||
371 | r = dm_io_client_resize(sectors_to_pages(ps->store->chunk_size), | ||
372 | ps->io_client); | ||
373 | if (r) | ||
374 | return r; | ||
375 | |||
376 | r = alloc_area(ps); | 365 | r = alloc_area(ps); |
377 | return r; | 366 | return r; |
378 | 367 | ||
@@ -687,7 +676,7 @@ static void persistent_commit_exception(struct dm_exception_store *store, | |||
687 | /* | 676 | /* |
688 | * Commit exceptions to disk. | 677 | * Commit exceptions to disk. |
689 | */ | 678 | */ |
690 | if (ps->valid && area_io(ps, WRITE_BARRIER)) | 679 | if (ps->valid && area_io(ps, WRITE_FLUSH_FUA)) |
691 | ps->valid = 0; | 680 | ps->valid = 0; |
692 | 681 | ||
693 | /* | 682 | /* |
@@ -818,7 +807,7 @@ static int persistent_ctr(struct dm_exception_store *store, | |||
818 | atomic_set(&ps->pending_count, 0); | 807 | atomic_set(&ps->pending_count, 0); |
819 | ps->callbacks = NULL; | 808 | ps->callbacks = NULL; |
820 | 809 | ||
821 | ps->metadata_wq = create_singlethread_workqueue("ksnaphd"); | 810 | ps->metadata_wq = alloc_workqueue("ksnaphd", WQ_MEM_RECLAIM, 0); |
822 | if (!ps->metadata_wq) { | 811 | if (!ps->metadata_wq) { |
823 | kfree(ps); | 812 | kfree(ps); |
824 | DMERR("couldn't start header metadata update thread"); | 813 | DMERR("couldn't start header metadata update thread"); |
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index 5974d3094d97..9ecff5f3023a 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c | |||
@@ -19,7 +19,6 @@ | |||
19 | #include <linux/vmalloc.h> | 19 | #include <linux/vmalloc.h> |
20 | #include <linux/log2.h> | 20 | #include <linux/log2.h> |
21 | #include <linux/dm-kcopyd.h> | 21 | #include <linux/dm-kcopyd.h> |
22 | #include <linux/workqueue.h> | ||
23 | 22 | ||
24 | #include "dm-exception-store.h" | 23 | #include "dm-exception-store.h" |
25 | 24 | ||
@@ -41,11 +40,6 @@ static const char dm_snapshot_merge_target_name[] = "snapshot-merge"; | |||
41 | #define SNAPSHOT_COPY_PRIORITY 2 | 40 | #define SNAPSHOT_COPY_PRIORITY 2 |
42 | 41 | ||
43 | /* | 42 | /* |
44 | * Reserve 1MB for each snapshot initially (with minimum of 1 page). | ||
45 | */ | ||
46 | #define SNAPSHOT_PAGES (((1UL << 20) >> PAGE_SHIFT) ? : 1) | ||
47 | |||
48 | /* | ||
49 | * The size of the mempool used to track chunks in use. | 43 | * The size of the mempool used to track chunks in use. |
50 | */ | 44 | */ |
51 | #define MIN_IOS 256 | 45 | #define MIN_IOS 256 |
@@ -80,9 +74,6 @@ struct dm_snapshot { | |||
80 | /* Origin writes don't trigger exceptions until this is set */ | 74 | /* Origin writes don't trigger exceptions until this is set */ |
81 | int active; | 75 | int active; |
82 | 76 | ||
83 | /* Whether or not owning mapped_device is suspended */ | ||
84 | int suspended; | ||
85 | |||
86 | atomic_t pending_exceptions_count; | 77 | atomic_t pending_exceptions_count; |
87 | 78 | ||
88 | mempool_t *pending_pool; | 79 | mempool_t *pending_pool; |
@@ -106,10 +97,6 @@ struct dm_snapshot { | |||
106 | 97 | ||
107 | struct dm_kcopyd_client *kcopyd_client; | 98 | struct dm_kcopyd_client *kcopyd_client; |
108 | 99 | ||
109 | /* Queue of snapshot writes for ksnapd to flush */ | ||
110 | struct bio_list queued_bios; | ||
111 | struct work_struct queued_bios_work; | ||
112 | |||
113 | /* Wait for events based on state_bits */ | 100 | /* Wait for events based on state_bits */ |
114 | unsigned long state_bits; | 101 | unsigned long state_bits; |
115 | 102 | ||
@@ -160,9 +147,6 @@ struct dm_dev *dm_snap_cow(struct dm_snapshot *s) | |||
160 | } | 147 | } |
161 | EXPORT_SYMBOL(dm_snap_cow); | 148 | EXPORT_SYMBOL(dm_snap_cow); |
162 | 149 | ||
163 | static struct workqueue_struct *ksnapd; | ||
164 | static void flush_queued_bios(struct work_struct *work); | ||
165 | |||
166 | static sector_t chunk_to_sector(struct dm_exception_store *store, | 150 | static sector_t chunk_to_sector(struct dm_exception_store *store, |
167 | chunk_t chunk) | 151 | chunk_t chunk) |
168 | { | 152 | { |
@@ -706,8 +690,6 @@ static int dm_add_exception(void *context, chunk_t old, chunk_t new) | |||
706 | return 0; | 690 | return 0; |
707 | } | 691 | } |
708 | 692 | ||
709 | #define min_not_zero(l, r) (((l) == 0) ? (r) : (((r) == 0) ? (l) : min(l, r))) | ||
710 | |||
711 | /* | 693 | /* |
712 | * Return a minimum chunk size of all snapshots that have the specified origin. | 694 | * Return a minimum chunk size of all snapshots that have the specified origin. |
713 | * Return zero if the origin has no snapshots. | 695 | * Return zero if the origin has no snapshots. |
@@ -1093,7 +1075,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
1093 | argv++; | 1075 | argv++; |
1094 | argc--; | 1076 | argc--; |
1095 | 1077 | ||
1096 | r = dm_get_device(ti, cow_path, FMODE_READ | FMODE_WRITE, &s->cow); | 1078 | r = dm_get_device(ti, cow_path, dm_table_get_mode(ti->table), &s->cow); |
1097 | if (r) { | 1079 | if (r) { |
1098 | ti->error = "Cannot get COW device"; | 1080 | ti->error = "Cannot get COW device"; |
1099 | goto bad_cow; | 1081 | goto bad_cow; |
@@ -1112,7 +1094,6 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
1112 | s->ti = ti; | 1094 | s->ti = ti; |
1113 | s->valid = 1; | 1095 | s->valid = 1; |
1114 | s->active = 0; | 1096 | s->active = 0; |
1115 | s->suspended = 0; | ||
1116 | atomic_set(&s->pending_exceptions_count, 0); | 1097 | atomic_set(&s->pending_exceptions_count, 0); |
1117 | init_rwsem(&s->lock); | 1098 | init_rwsem(&s->lock); |
1118 | INIT_LIST_HEAD(&s->list); | 1099 | INIT_LIST_HEAD(&s->list); |
@@ -1130,8 +1111,9 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
1130 | goto bad_hash_tables; | 1111 | goto bad_hash_tables; |
1131 | } | 1112 | } |
1132 | 1113 | ||
1133 | r = dm_kcopyd_client_create(SNAPSHOT_PAGES, &s->kcopyd_client); | 1114 | s->kcopyd_client = dm_kcopyd_client_create(); |
1134 | if (r) { | 1115 | if (IS_ERR(s->kcopyd_client)) { |
1116 | r = PTR_ERR(s->kcopyd_client); | ||
1135 | ti->error = "Could not create kcopyd client"; | 1117 | ti->error = "Could not create kcopyd client"; |
1136 | goto bad_kcopyd; | 1118 | goto bad_kcopyd; |
1137 | } | 1119 | } |
@@ -1155,9 +1137,6 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
1155 | 1137 | ||
1156 | spin_lock_init(&s->tracked_chunk_lock); | 1138 | spin_lock_init(&s->tracked_chunk_lock); |
1157 | 1139 | ||
1158 | bio_list_init(&s->queued_bios); | ||
1159 | INIT_WORK(&s->queued_bios_work, flush_queued_bios); | ||
1160 | |||
1161 | ti->private = s; | 1140 | ti->private = s; |
1162 | ti->num_flush_requests = num_flush_requests; | 1141 | ti->num_flush_requests = num_flush_requests; |
1163 | 1142 | ||
@@ -1281,8 +1260,6 @@ static void snapshot_dtr(struct dm_target *ti) | |||
1281 | struct dm_snapshot *s = ti->private; | 1260 | struct dm_snapshot *s = ti->private; |
1282 | struct dm_snapshot *snap_src = NULL, *snap_dest = NULL; | 1261 | struct dm_snapshot *snap_src = NULL, *snap_dest = NULL; |
1283 | 1262 | ||
1284 | flush_workqueue(ksnapd); | ||
1285 | |||
1286 | down_read(&_origins_lock); | 1263 | down_read(&_origins_lock); |
1287 | /* Check whether exception handover must be cancelled */ | 1264 | /* Check whether exception handover must be cancelled */ |
1288 | (void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL); | 1265 | (void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL); |
@@ -1344,20 +1321,6 @@ static void flush_bios(struct bio *bio) | |||
1344 | } | 1321 | } |
1345 | } | 1322 | } |
1346 | 1323 | ||
1347 | static void flush_queued_bios(struct work_struct *work) | ||
1348 | { | ||
1349 | struct dm_snapshot *s = | ||
1350 | container_of(work, struct dm_snapshot, queued_bios_work); | ||
1351 | struct bio *queued_bios; | ||
1352 | unsigned long flags; | ||
1353 | |||
1354 | spin_lock_irqsave(&s->pe_lock, flags); | ||
1355 | queued_bios = bio_list_get(&s->queued_bios); | ||
1356 | spin_unlock_irqrestore(&s->pe_lock, flags); | ||
1357 | |||
1358 | flush_bios(queued_bios); | ||
1359 | } | ||
1360 | |||
1361 | static int do_origin(struct dm_dev *origin, struct bio *bio); | 1324 | static int do_origin(struct dm_dev *origin, struct bio *bio); |
1362 | 1325 | ||
1363 | /* | 1326 | /* |
@@ -1587,7 +1550,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio, | |||
1587 | chunk_t chunk; | 1550 | chunk_t chunk; |
1588 | struct dm_snap_pending_exception *pe = NULL; | 1551 | struct dm_snap_pending_exception *pe = NULL; |
1589 | 1552 | ||
1590 | if (unlikely(bio_empty_barrier(bio))) { | 1553 | if (bio->bi_rw & REQ_FLUSH) { |
1591 | bio->bi_bdev = s->cow->bdev; | 1554 | bio->bi_bdev = s->cow->bdev; |
1592 | return DM_MAPIO_REMAPPED; | 1555 | return DM_MAPIO_REMAPPED; |
1593 | } | 1556 | } |
@@ -1691,7 +1654,7 @@ static int snapshot_merge_map(struct dm_target *ti, struct bio *bio, | |||
1691 | int r = DM_MAPIO_REMAPPED; | 1654 | int r = DM_MAPIO_REMAPPED; |
1692 | chunk_t chunk; | 1655 | chunk_t chunk; |
1693 | 1656 | ||
1694 | if (unlikely(bio_empty_barrier(bio))) { | 1657 | if (bio->bi_rw & REQ_FLUSH) { |
1695 | if (!map_context->target_request_nr) | 1658 | if (!map_context->target_request_nr) |
1696 | bio->bi_bdev = s->origin->bdev; | 1659 | bio->bi_bdev = s->origin->bdev; |
1697 | else | 1660 | else |
@@ -1762,15 +1725,6 @@ static void snapshot_merge_presuspend(struct dm_target *ti) | |||
1762 | stop_merge(s); | 1725 | stop_merge(s); |
1763 | } | 1726 | } |
1764 | 1727 | ||
1765 | static void snapshot_postsuspend(struct dm_target *ti) | ||
1766 | { | ||
1767 | struct dm_snapshot *s = ti->private; | ||
1768 | |||
1769 | down_write(&s->lock); | ||
1770 | s->suspended = 1; | ||
1771 | up_write(&s->lock); | ||
1772 | } | ||
1773 | |||
1774 | static int snapshot_preresume(struct dm_target *ti) | 1728 | static int snapshot_preresume(struct dm_target *ti) |
1775 | { | 1729 | { |
1776 | int r = 0; | 1730 | int r = 0; |
@@ -1785,7 +1739,7 @@ static int snapshot_preresume(struct dm_target *ti) | |||
1785 | DMERR("Unable to resume snapshot source until " | 1739 | DMERR("Unable to resume snapshot source until " |
1786 | "handover completes."); | 1740 | "handover completes."); |
1787 | r = -EINVAL; | 1741 | r = -EINVAL; |
1788 | } else if (!snap_src->suspended) { | 1742 | } else if (!dm_suspended(snap_src->ti)) { |
1789 | DMERR("Unable to perform snapshot handover until " | 1743 | DMERR("Unable to perform snapshot handover until " |
1790 | "source is suspended."); | 1744 | "source is suspended."); |
1791 | r = -EINVAL; | 1745 | r = -EINVAL; |
@@ -1818,7 +1772,6 @@ static void snapshot_resume(struct dm_target *ti) | |||
1818 | 1772 | ||
1819 | down_write(&s->lock); | 1773 | down_write(&s->lock); |
1820 | s->active = 1; | 1774 | s->active = 1; |
1821 | s->suspended = 0; | ||
1822 | up_write(&s->lock); | 1775 | up_write(&s->lock); |
1823 | } | 1776 | } |
1824 | 1777 | ||
@@ -2135,7 +2088,7 @@ static int origin_map(struct dm_target *ti, struct bio *bio, | |||
2135 | struct dm_dev *dev = ti->private; | 2088 | struct dm_dev *dev = ti->private; |
2136 | bio->bi_bdev = dev->bdev; | 2089 | bio->bi_bdev = dev->bdev; |
2137 | 2090 | ||
2138 | if (unlikely(bio_empty_barrier(bio))) | 2091 | if (bio->bi_rw & REQ_FLUSH) |
2139 | return DM_MAPIO_REMAPPED; | 2092 | return DM_MAPIO_REMAPPED; |
2140 | 2093 | ||
2141 | /* Only tell snapshots if this is a write */ | 2094 | /* Only tell snapshots if this is a write */ |
@@ -2196,7 +2149,7 @@ static int origin_iterate_devices(struct dm_target *ti, | |||
2196 | 2149 | ||
2197 | static struct target_type origin_target = { | 2150 | static struct target_type origin_target = { |
2198 | .name = "snapshot-origin", | 2151 | .name = "snapshot-origin", |
2199 | .version = {1, 7, 0}, | 2152 | .version = {1, 7, 1}, |
2200 | .module = THIS_MODULE, | 2153 | .module = THIS_MODULE, |
2201 | .ctr = origin_ctr, | 2154 | .ctr = origin_ctr, |
2202 | .dtr = origin_dtr, | 2155 | .dtr = origin_dtr, |
@@ -2209,13 +2162,12 @@ static struct target_type origin_target = { | |||
2209 | 2162 | ||
2210 | static struct target_type snapshot_target = { | 2163 | static struct target_type snapshot_target = { |
2211 | .name = "snapshot", | 2164 | .name = "snapshot", |
2212 | .version = {1, 9, 0}, | 2165 | .version = {1, 10, 0}, |
2213 | .module = THIS_MODULE, | 2166 | .module = THIS_MODULE, |
2214 | .ctr = snapshot_ctr, | 2167 | .ctr = snapshot_ctr, |
2215 | .dtr = snapshot_dtr, | 2168 | .dtr = snapshot_dtr, |
2216 | .map = snapshot_map, | 2169 | .map = snapshot_map, |
2217 | .end_io = snapshot_end_io, | 2170 | .end_io = snapshot_end_io, |
2218 | .postsuspend = snapshot_postsuspend, | ||
2219 | .preresume = snapshot_preresume, | 2171 | .preresume = snapshot_preresume, |
2220 | .resume = snapshot_resume, | 2172 | .resume = snapshot_resume, |
2221 | .status = snapshot_status, | 2173 | .status = snapshot_status, |
@@ -2224,14 +2176,13 @@ static struct target_type snapshot_target = { | |||
2224 | 2176 | ||
2225 | static struct target_type merge_target = { | 2177 | static struct target_type merge_target = { |
2226 | .name = dm_snapshot_merge_target_name, | 2178 | .name = dm_snapshot_merge_target_name, |
2227 | .version = {1, 0, 0}, | 2179 | .version = {1, 1, 0}, |
2228 | .module = THIS_MODULE, | 2180 | .module = THIS_MODULE, |
2229 | .ctr = snapshot_ctr, | 2181 | .ctr = snapshot_ctr, |
2230 | .dtr = snapshot_dtr, | 2182 | .dtr = snapshot_dtr, |
2231 | .map = snapshot_merge_map, | 2183 | .map = snapshot_merge_map, |
2232 | .end_io = snapshot_end_io, | 2184 | .end_io = snapshot_end_io, |
2233 | .presuspend = snapshot_merge_presuspend, | 2185 | .presuspend = snapshot_merge_presuspend, |
2234 | .postsuspend = snapshot_postsuspend, | ||
2235 | .preresume = snapshot_preresume, | 2186 | .preresume = snapshot_preresume, |
2236 | .resume = snapshot_merge_resume, | 2187 | .resume = snapshot_merge_resume, |
2237 | .status = snapshot_status, | 2188 | .status = snapshot_status, |
@@ -2293,17 +2244,8 @@ static int __init dm_snapshot_init(void) | |||
2293 | goto bad_tracked_chunk_cache; | 2244 | goto bad_tracked_chunk_cache; |
2294 | } | 2245 | } |
2295 | 2246 | ||
2296 | ksnapd = create_singlethread_workqueue("ksnapd"); | ||
2297 | if (!ksnapd) { | ||
2298 | DMERR("Failed to create ksnapd workqueue."); | ||
2299 | r = -ENOMEM; | ||
2300 | goto bad_pending_pool; | ||
2301 | } | ||
2302 | |||
2303 | return 0; | 2247 | return 0; |
2304 | 2248 | ||
2305 | bad_pending_pool: | ||
2306 | kmem_cache_destroy(tracked_chunk_cache); | ||
2307 | bad_tracked_chunk_cache: | 2249 | bad_tracked_chunk_cache: |
2308 | kmem_cache_destroy(pending_cache); | 2250 | kmem_cache_destroy(pending_cache); |
2309 | bad_pending_cache: | 2251 | bad_pending_cache: |
@@ -2324,8 +2266,6 @@ bad_register_snapshot_target: | |||
2324 | 2266 | ||
2325 | static void __exit dm_snapshot_exit(void) | 2267 | static void __exit dm_snapshot_exit(void) |
2326 | { | 2268 | { |
2327 | destroy_workqueue(ksnapd); | ||
2328 | |||
2329 | dm_unregister_target(&snapshot_target); | 2269 | dm_unregister_target(&snapshot_target); |
2330 | dm_unregister_target(&origin_target); | 2270 | dm_unregister_target(&origin_target); |
2331 | dm_unregister_target(&merge_target); | 2271 | dm_unregister_target(&merge_target); |
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index c297f6da91ea..3d80cf0c152d 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c | |||
@@ -39,23 +39,20 @@ struct stripe_c { | |||
39 | struct dm_target *ti; | 39 | struct dm_target *ti; |
40 | 40 | ||
41 | /* Work struct used for triggering events*/ | 41 | /* Work struct used for triggering events*/ |
42 | struct work_struct kstriped_ws; | 42 | struct work_struct trigger_event; |
43 | 43 | ||
44 | struct stripe stripe[0]; | 44 | struct stripe stripe[0]; |
45 | }; | 45 | }; |
46 | 46 | ||
47 | static struct workqueue_struct *kstriped; | ||
48 | |||
49 | /* | 47 | /* |
50 | * An event is triggered whenever a drive | 48 | * An event is triggered whenever a drive |
51 | * drops out of a stripe volume. | 49 | * drops out of a stripe volume. |
52 | */ | 50 | */ |
53 | static void trigger_event(struct work_struct *work) | 51 | static void trigger_event(struct work_struct *work) |
54 | { | 52 | { |
55 | struct stripe_c *sc = container_of(work, struct stripe_c, kstriped_ws); | 53 | struct stripe_c *sc = container_of(work, struct stripe_c, |
56 | 54 | trigger_event); | |
57 | dm_table_event(sc->ti->table); | 55 | dm_table_event(sc->ti->table); |
58 | |||
59 | } | 56 | } |
60 | 57 | ||
61 | static inline struct stripe_c *alloc_context(unsigned int stripes) | 58 | static inline struct stripe_c *alloc_context(unsigned int stripes) |
@@ -160,7 +157,7 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
160 | return -ENOMEM; | 157 | return -ENOMEM; |
161 | } | 158 | } |
162 | 159 | ||
163 | INIT_WORK(&sc->kstriped_ws, trigger_event); | 160 | INIT_WORK(&sc->trigger_event, trigger_event); |
164 | 161 | ||
165 | /* Set pointer to dm target; used in trigger_event */ | 162 | /* Set pointer to dm target; used in trigger_event */ |
166 | sc->ti = ti; | 163 | sc->ti = ti; |
@@ -211,7 +208,7 @@ static void stripe_dtr(struct dm_target *ti) | |||
211 | for (i = 0; i < sc->stripes; i++) | 208 | for (i = 0; i < sc->stripes; i++) |
212 | dm_put_device(ti, sc->stripe[i].dev); | 209 | dm_put_device(ti, sc->stripe[i].dev); |
213 | 210 | ||
214 | flush_workqueue(kstriped); | 211 | flush_work_sync(&sc->trigger_event); |
215 | kfree(sc); | 212 | kfree(sc); |
216 | } | 213 | } |
217 | 214 | ||
@@ -271,7 +268,7 @@ static int stripe_map(struct dm_target *ti, struct bio *bio, | |||
271 | uint32_t stripe; | 268 | uint32_t stripe; |
272 | unsigned target_request_nr; | 269 | unsigned target_request_nr; |
273 | 270 | ||
274 | if (unlikely(bio_empty_barrier(bio))) { | 271 | if (bio->bi_rw & REQ_FLUSH) { |
275 | target_request_nr = map_context->target_request_nr; | 272 | target_request_nr = map_context->target_request_nr; |
276 | BUG_ON(target_request_nr >= sc->stripes); | 273 | BUG_ON(target_request_nr >= sc->stripes); |
277 | bio->bi_bdev = sc->stripe[target_request_nr].dev->bdev; | 274 | bio->bi_bdev = sc->stripe[target_request_nr].dev->bdev; |
@@ -367,7 +364,7 @@ static int stripe_end_io(struct dm_target *ti, struct bio *bio, | |||
367 | atomic_inc(&(sc->stripe[i].error_count)); | 364 | atomic_inc(&(sc->stripe[i].error_count)); |
368 | if (atomic_read(&(sc->stripe[i].error_count)) < | 365 | if (atomic_read(&(sc->stripe[i].error_count)) < |
369 | DM_IO_ERROR_THRESHOLD) | 366 | DM_IO_ERROR_THRESHOLD) |
370 | queue_work(kstriped, &sc->kstriped_ws); | 367 | schedule_work(&sc->trigger_event); |
371 | } | 368 | } |
372 | 369 | ||
373 | return error; | 370 | return error; |
@@ -399,9 +396,29 @@ static void stripe_io_hints(struct dm_target *ti, | |||
399 | blk_limits_io_opt(limits, chunk_size * sc->stripes); | 396 | blk_limits_io_opt(limits, chunk_size * sc->stripes); |
400 | } | 397 | } |
401 | 398 | ||
399 | static int stripe_merge(struct dm_target *ti, struct bvec_merge_data *bvm, | ||
400 | struct bio_vec *biovec, int max_size) | ||
401 | { | ||
402 | struct stripe_c *sc = ti->private; | ||
403 | sector_t bvm_sector = bvm->bi_sector; | ||
404 | uint32_t stripe; | ||
405 | struct request_queue *q; | ||
406 | |||
407 | stripe_map_sector(sc, bvm_sector, &stripe, &bvm_sector); | ||
408 | |||
409 | q = bdev_get_queue(sc->stripe[stripe].dev->bdev); | ||
410 | if (!q->merge_bvec_fn) | ||
411 | return max_size; | ||
412 | |||
413 | bvm->bi_bdev = sc->stripe[stripe].dev->bdev; | ||
414 | bvm->bi_sector = sc->stripe[stripe].physical_start + bvm_sector; | ||
415 | |||
416 | return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); | ||
417 | } | ||
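stripe_map_sector() itself is defined earlier in dm-stripe.c and is not part of this hunk; assuming a power-of-2 chunk size and stripe count, its effect is roughly the following (a sketch, not the exact kernel code):

    /*
     * chunk  = sector >> chunk_shift;               // which chunk of the volume
     * stripe = chunk & (nr_stripes - 1);            // which backing device
     * offset = ((chunk / nr_stripes) << chunk_shift)
     *          + (sector & chunk_mask);             // sector on that device
     *
     * stripe_merge() then forwards the merge query to that device's queue so
     * a lower-level merge_bvec_fn can bound the bio at its own limits.
     */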
418 | |||
402 | static struct target_type stripe_target = { | 419 | static struct target_type stripe_target = { |
403 | .name = "striped", | 420 | .name = "striped", |
404 | .version = {1, 3, 0}, | 421 | .version = {1, 4, 0}, |
405 | .module = THIS_MODULE, | 422 | .module = THIS_MODULE, |
406 | .ctr = stripe_ctr, | 423 | .ctr = stripe_ctr, |
407 | .dtr = stripe_dtr, | 424 | .dtr = stripe_dtr, |
@@ -410,6 +427,7 @@ static struct target_type stripe_target = { | |||
410 | .status = stripe_status, | 427 | .status = stripe_status, |
411 | .iterate_devices = stripe_iterate_devices, | 428 | .iterate_devices = stripe_iterate_devices, |
412 | .io_hints = stripe_io_hints, | 429 | .io_hints = stripe_io_hints, |
430 | .merge = stripe_merge, | ||
413 | }; | 431 | }; |
414 | 432 | ||
415 | int __init dm_stripe_init(void) | 433 | int __init dm_stripe_init(void) |
@@ -422,20 +440,10 @@ int __init dm_stripe_init(void) | |||
422 | return r; | 440 | return r; |
423 | } | 441 | } |
424 | 442 | ||
425 | kstriped = create_singlethread_workqueue("kstriped"); | ||
426 | if (!kstriped) { | ||
427 | DMERR("failed to create workqueue kstriped"); | ||
428 | dm_unregister_target(&stripe_target); | ||
429 | return -ENOMEM; | ||
430 | } | ||
431 | |||
432 | return r; | 443 | return r; |
433 | } | 444 | } |
434 | 445 | ||
435 | void dm_stripe_exit(void) | 446 | void dm_stripe_exit(void) |
436 | { | 447 | { |
437 | dm_unregister_target(&stripe_target); | 448 | dm_unregister_target(&stripe_target); |
438 | destroy_workqueue(kstriped); | ||
439 | |||
440 | return; | ||
441 | } | 449 | } |
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index f9fc07d7a4b9..451c3bb176d2 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c | |||
@@ -55,6 +55,7 @@ struct dm_table { | |||
55 | struct dm_target *targets; | 55 | struct dm_target *targets; |
56 | 56 | ||
57 | unsigned discards_supported:1; | 57 | unsigned discards_supported:1; |
58 | unsigned integrity_supported:1; | ||
58 | 59 | ||
59 | /* | 60 | /* |
60 | * Indicates the rw permissions for the new logical | 61 | * Indicates the rw permissions for the new logical |
@@ -71,6 +72,8 @@ struct dm_table { | |||
71 | void *event_context; | 72 | void *event_context; |
72 | 73 | ||
73 | struct dm_md_mempools *mempools; | 74 | struct dm_md_mempools *mempools; |
75 | |||
76 | struct list_head target_callbacks; | ||
74 | }; | 77 | }; |
75 | 78 | ||
76 | /* | 79 | /* |
@@ -204,6 +207,7 @@ int dm_table_create(struct dm_table **result, fmode_t mode, | |||
204 | return -ENOMEM; | 207 | return -ENOMEM; |
205 | 208 | ||
206 | INIT_LIST_HEAD(&t->devices); | 209 | INIT_LIST_HEAD(&t->devices); |
210 | INIT_LIST_HEAD(&t->target_callbacks); | ||
207 | atomic_set(&t->holders, 0); | 211 | atomic_set(&t->holders, 0); |
208 | t->discards_supported = 1; | 212 | t->discards_supported = 1; |
209 | 213 | ||
@@ -325,15 +329,18 @@ static int open_dev(struct dm_dev_internal *d, dev_t dev, | |||
325 | 329 | ||
326 | BUG_ON(d->dm_dev.bdev); | 330 | BUG_ON(d->dm_dev.bdev); |
327 | 331 | ||
328 | bdev = open_by_devnum(dev, d->dm_dev.mode); | 332 | bdev = blkdev_get_by_dev(dev, d->dm_dev.mode | FMODE_EXCL, _claim_ptr); |
329 | if (IS_ERR(bdev)) | 333 | if (IS_ERR(bdev)) |
330 | return PTR_ERR(bdev); | 334 | return PTR_ERR(bdev); |
331 | r = bd_claim_by_disk(bdev, _claim_ptr, dm_disk(md)); | 335 | |
332 | if (r) | 336 | r = bd_link_disk_holder(bdev, dm_disk(md)); |
333 | blkdev_put(bdev, d->dm_dev.mode); | 337 | if (r) { |
334 | else | 338 | blkdev_put(bdev, d->dm_dev.mode | FMODE_EXCL); |
335 | d->dm_dev.bdev = bdev; | 339 | return r; |
336 | return r; | 340 | } |
341 | |||
342 | d->dm_dev.bdev = bdev; | ||
343 | return 0; | ||
337 | } | 344 | } |
338 | 345 | ||
339 | /* | 346 | /* |
@@ -344,8 +351,8 @@ static void close_dev(struct dm_dev_internal *d, struct mapped_device *md) | |||
344 | if (!d->dm_dev.bdev) | 351 | if (!d->dm_dev.bdev) |
345 | return; | 352 | return; |
346 | 353 | ||
347 | bd_release_from_disk(d->dm_dev.bdev, dm_disk(md)); | 354 | bd_unlink_disk_holder(d->dm_dev.bdev, dm_disk(md)); |
348 | blkdev_put(d->dm_dev.bdev, d->dm_dev.mode); | 355 | blkdev_put(d->dm_dev.bdev, d->dm_dev.mode | FMODE_EXCL); |
349 | d->dm_dev.bdev = NULL; | 356 | d->dm_dev.bdev = NULL; |
350 | } | 357 | } |
351 | 358 | ||
@@ -355,6 +362,7 @@ static void close_dev(struct dm_dev_internal *d, struct mapped_device *md) | |||
355 | static int device_area_is_invalid(struct dm_target *ti, struct dm_dev *dev, | 362 | static int device_area_is_invalid(struct dm_target *ti, struct dm_dev *dev, |
356 | sector_t start, sector_t len, void *data) | 363 | sector_t start, sector_t len, void *data) |
357 | { | 364 | { |
365 | struct request_queue *q; | ||
358 | struct queue_limits *limits = data; | 366 | struct queue_limits *limits = data; |
359 | struct block_device *bdev = dev->bdev; | 367 | struct block_device *bdev = dev->bdev; |
360 | sector_t dev_size = | 368 | sector_t dev_size = |
@@ -363,6 +371,22 @@ static int device_area_is_invalid(struct dm_target *ti, struct dm_dev *dev, | |||
363 | limits->logical_block_size >> SECTOR_SHIFT; | 371 | limits->logical_block_size >> SECTOR_SHIFT; |
364 | char b[BDEVNAME_SIZE]; | 372 | char b[BDEVNAME_SIZE]; |
365 | 373 | ||
374 | /* | ||
375 | * Some devices exist without request functions, | ||
376 | * such as loop devices not yet bound to backing files. | ||
377 | * Forbid the use of such devices. | ||
378 | */ | ||
379 | q = bdev_get_queue(bdev); | ||
380 | if (!q || !q->make_request_fn) { | ||
381 | DMWARN("%s: %s is not yet initialised: " | ||
382 | "start=%llu, len=%llu, dev_size=%llu", | ||
383 | dm_device_name(ti->table->md), bdevname(bdev, b), | ||
384 | (unsigned long long)start, | ||
385 | (unsigned long long)len, | ||
386 | (unsigned long long)dev_size); | ||
387 | return 1; | ||
388 | } | ||
389 | |||
366 | if (!dev_size) | 390 | if (!dev_size) |
367 | return 0; | 391 | return 0; |
368 | 392 | ||
@@ -486,11 +510,6 @@ static int __table_get_device(struct dm_table *t, struct dm_target *ti, | |||
486 | return 0; | 510 | return 0; |
487 | } | 511 | } |
488 | 512 | ||
489 | /* | ||
490 | * Returns the minimum that is _not_ zero, unless both are zero. | ||
491 | */ | ||
492 | #define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r)) | ||
493 | |||
494 | int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev, | 513 | int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev, |
495 | sector_t start, sector_t len, void *data) | 514 | sector_t start, sector_t len, void *data) |
496 | { | 515 | { |
@@ -522,9 +541,8 @@ int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev, | |||
522 | */ | 541 | */ |
523 | 542 | ||
524 | if (q->merge_bvec_fn && !ti->type->merge) | 543 | if (q->merge_bvec_fn && !ti->type->merge) |
525 | limits->max_sectors = | 544 | blk_limits_max_hw_sectors(limits, |
526 | min_not_zero(limits->max_sectors, | 545 | (unsigned int) (PAGE_SIZE >> 9)); |
527 | (unsigned int) (PAGE_SIZE >> 9)); | ||
528 | return 0; | 546 | return 0; |
529 | } | 547 | } |
530 | EXPORT_SYMBOL_GPL(dm_set_device_limits); | 548 | EXPORT_SYMBOL_GPL(dm_set_device_limits); |
@@ -859,7 +877,7 @@ int dm_table_alloc_md_mempools(struct dm_table *t) | |||
859 | return -EINVAL; | 877 | return -EINVAL; |
860 | } | 878 | } |
861 | 879 | ||
862 | t->mempools = dm_alloc_md_mempools(type); | 880 | t->mempools = dm_alloc_md_mempools(type, t->integrity_supported); |
863 | if (!t->mempools) | 881 | if (!t->mempools) |
864 | return -ENOMEM; | 882 | return -ENOMEM; |
865 | 883 | ||
@@ -926,18 +944,80 @@ static int dm_table_build_index(struct dm_table *t) | |||
926 | } | 944 | } |
927 | 945 | ||
928 | /* | 946 | /* |
947 | * Get a disk whose integrity profile reflects the table's profile. | ||
948 | * If %match_all is true, all devices' profiles must match. | ||
949 | * If %match_all is false, all devices must at least have an | ||
950 | * allocated integrity profile; but uninitialized is ok. | ||
951 | * Returns NULL if integrity support was inconsistent or unavailable. | ||
952 | */ | ||
953 | static struct gendisk * dm_table_get_integrity_disk(struct dm_table *t, | ||
954 | bool match_all) | ||
955 | { | ||
956 | struct list_head *devices = dm_table_get_devices(t); | ||
957 | struct dm_dev_internal *dd = NULL; | ||
958 | struct gendisk *prev_disk = NULL, *template_disk = NULL; | ||
959 | |||
960 | list_for_each_entry(dd, devices, list) { | ||
961 | template_disk = dd->dm_dev.bdev->bd_disk; | ||
962 | if (!blk_get_integrity(template_disk)) | ||
963 | goto no_integrity; | ||
964 | if (!match_all && !blk_integrity_is_initialized(template_disk)) | ||
965 | continue; /* skip uninitialized profiles */ | ||
966 | else if (prev_disk && | ||
967 | blk_integrity_compare(prev_disk, template_disk) < 0) | ||
968 | goto no_integrity; | ||
969 | prev_disk = template_disk; | ||
970 | } | ||
971 | |||
972 | return template_disk; | ||
973 | |||
974 | no_integrity: | ||
975 | if (prev_disk) | ||
976 | DMWARN("%s: integrity not set: %s and %s profile mismatch", | ||
977 | dm_device_name(t->md), | ||
978 | prev_disk->disk_name, | ||
979 | template_disk->disk_name); | ||
980 | return NULL; | ||
981 | } | ||
982 | |||
983 | /* | ||
929 | * Register the mapped device for blk_integrity support if | 984 | * Register the mapped device for blk_integrity support if |
930 | * the underlying devices support it. | 985 | * the underlying devices have an integrity profile. But all devices |
986 | * may not have matching profiles (checking all devices isn't reliable | ||
987 | * during table load because this table may use other DM device(s) which | ||
988 | * must be resumed before they will have an initialized integrity profile). | ||
989 | * Stacked DM devices force a 2 stage integrity profile validation: | ||
990 | * 1 - during load, validate all initialized integrity profiles match | ||
991 | * 2 - during resume, validate all integrity profiles match | ||
931 | */ | 992 | */ |
932 | static int dm_table_prealloc_integrity(struct dm_table *t, struct mapped_device *md) | 993 | static int dm_table_prealloc_integrity(struct dm_table *t, struct mapped_device *md) |
933 | { | 994 | { |
934 | struct list_head *devices = dm_table_get_devices(t); | 995 | struct gendisk *template_disk = NULL; |
935 | struct dm_dev_internal *dd; | 996 | |
997 | template_disk = dm_table_get_integrity_disk(t, false); | ||
998 | if (!template_disk) | ||
999 | return 0; | ||
936 | 1000 | ||
937 | list_for_each_entry(dd, devices, list) | 1001 | if (!blk_integrity_is_initialized(dm_disk(md))) { |
938 | if (bdev_get_integrity(dd->dm_dev.bdev)) | 1002 | t->integrity_supported = 1; |
939 | return blk_integrity_register(dm_disk(md), NULL); | 1003 | return blk_integrity_register(dm_disk(md), NULL); |
1004 | } | ||
940 | 1005 | ||
1006 | /* | ||
1007 | * If DM device already has an initialized integrity | ||
1008 | * profile the new profile should not conflict. | ||
1009 | */ | ||
1010 | if (blk_integrity_is_initialized(template_disk) && | ||
1011 | blk_integrity_compare(dm_disk(md), template_disk) < 0) { | ||
1012 | DMWARN("%s: conflict with existing integrity profile: " | ||
1013 | "%s profile mismatch", | ||
1014 | dm_device_name(t->md), | ||
1015 | template_disk->disk_name); | ||
1016 | return 1; | ||
1017 | } | ||
1018 | |||
1019 | /* Preserve existing initialized integrity profile */ | ||
1020 | t->integrity_supported = 1; | ||
941 | return 0; | 1021 | return 0; |
942 | } | 1022 | } |
943 | 1023 | ||
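Note on the two-stage validation described in this hunk: the load-time and resume-time checks reduce to the same helper called with a different match_all argument. A condensed sketch of the two entry points (not literal patch code; it assumes the dm_table fields shown here and omits the conflict check against a pre-existing profile):

	/* stage 1: table load -- uninitialized profiles are tolerated */
	static int integrity_check_on_load(struct dm_table *t, struct mapped_device *md)
	{
		struct gendisk *disk = dm_table_get_integrity_disk(t, false);

		if (!disk)
			return 0;	/* no consistent integrity support */
		if (!blk_integrity_is_initialized(dm_disk(md))) {
			t->integrity_supported = 1;
			return blk_integrity_register(dm_disk(md), NULL);
		}
		return 0;		/* existing profile kept; conflict check omitted here */
	}

	/* stage 2: resume -- every device profile must now match */
	static void integrity_check_on_resume(struct dm_table *t)
	{
		struct gendisk *disk = dm_table_get_integrity_disk(t, true);

		if (disk)
			blk_integrity_register(dm_disk(t->md),
					       blk_get_integrity(disk));
	}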
@@ -1091,41 +1171,27 @@ combine_limits: | |||
1091 | 1171 | ||
1092 | /* | 1172 | /* |
1093 | * Set the integrity profile for this device if all devices used have | 1173 | * Set the integrity profile for this device if all devices used have |
1094 | * matching profiles. | 1174 | * matching profiles. We're quite deep in the resume path but still |
1175 | * don't know if all devices (particularly DM devices this device | ||
1176 | * may be stacked on) have matching profiles. Even if the profiles | ||
1177 | * don't match we have no way to fail (to resume) at this point. | ||
1095 | */ | 1178 | */ |
1096 | static void dm_table_set_integrity(struct dm_table *t) | 1179 | static void dm_table_set_integrity(struct dm_table *t) |
1097 | { | 1180 | { |
1098 | struct list_head *devices = dm_table_get_devices(t); | 1181 | struct gendisk *template_disk = NULL; |
1099 | struct dm_dev_internal *prev = NULL, *dd = NULL; | ||
1100 | 1182 | ||
1101 | if (!blk_get_integrity(dm_disk(t->md))) | 1183 | if (!blk_get_integrity(dm_disk(t->md))) |
1102 | return; | 1184 | return; |
1103 | 1185 | ||
1104 | list_for_each_entry(dd, devices, list) { | 1186 | template_disk = dm_table_get_integrity_disk(t, true); |
1105 | if (prev && | 1187 | if (!template_disk && |
1106 | blk_integrity_compare(prev->dm_dev.bdev->bd_disk, | 1188 | blk_integrity_is_initialized(dm_disk(t->md))) { |
1107 | dd->dm_dev.bdev->bd_disk) < 0) { | 1189 | DMWARN("%s: device no longer has a valid integrity profile", |
1108 | DMWARN("%s: integrity not set: %s and %s mismatch", | 1190 | dm_device_name(t->md)); |
1109 | dm_device_name(t->md), | 1191 | return; |
1110 | prev->dm_dev.bdev->bd_disk->disk_name, | ||
1111 | dd->dm_dev.bdev->bd_disk->disk_name); | ||
1112 | goto no_integrity; | ||
1113 | } | ||
1114 | prev = dd; | ||
1115 | } | 1192 | } |
1116 | |||
1117 | if (!prev || !bdev_get_integrity(prev->dm_dev.bdev)) | ||
1118 | goto no_integrity; | ||
1119 | |||
1120 | blk_integrity_register(dm_disk(t->md), | 1193 | blk_integrity_register(dm_disk(t->md), |
1121 | bdev_get_integrity(prev->dm_dev.bdev)); | 1194 | blk_get_integrity(template_disk)); |
1122 | |||
1123 | return; | ||
1124 | |||
1125 | no_integrity: | ||
1126 | blk_integrity_register(dm_disk(t->md), NULL); | ||
1127 | |||
1128 | return; | ||
1129 | } | 1195 | } |
1130 | 1196 | ||
1131 | void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, | 1197 | void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, |
@@ -1136,11 +1202,6 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, | |||
1136 | */ | 1202 | */ |
1137 | q->limits = *limits; | 1203 | q->limits = *limits; |
1138 | 1204 | ||
1139 | if (limits->no_cluster) | ||
1140 | queue_flag_clear_unlocked(QUEUE_FLAG_CLUSTER, q); | ||
1141 | else | ||
1142 | queue_flag_set_unlocked(QUEUE_FLAG_CLUSTER, q); | ||
1143 | |||
1144 | if (!dm_table_supports_discards(t)) | 1205 | if (!dm_table_supports_discards(t)) |
1145 | queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, q); | 1206 | queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, q); |
1146 | else | 1207 | else |
@@ -1234,10 +1295,17 @@ int dm_table_resume_targets(struct dm_table *t) | |||
1234 | return 0; | 1295 | return 0; |
1235 | } | 1296 | } |
1236 | 1297 | ||
1298 | void dm_table_add_target_callbacks(struct dm_table *t, struct dm_target_callbacks *cb) | ||
1299 | { | ||
1300 | list_add(&cb->list, &t->target_callbacks); | ||
1301 | } | ||
1302 | EXPORT_SYMBOL_GPL(dm_table_add_target_callbacks); | ||
1303 | |||
1237 | int dm_table_any_congested(struct dm_table *t, int bdi_bits) | 1304 | int dm_table_any_congested(struct dm_table *t, int bdi_bits) |
1238 | { | 1305 | { |
1239 | struct dm_dev_internal *dd; | 1306 | struct dm_dev_internal *dd; |
1240 | struct list_head *devices = dm_table_get_devices(t); | 1307 | struct list_head *devices = dm_table_get_devices(t); |
1308 | struct dm_target_callbacks *cb; | ||
1241 | int r = 0; | 1309 | int r = 0; |
1242 | 1310 | ||
1243 | list_for_each_entry(dd, devices, list) { | 1311 | list_for_each_entry(dd, devices, list) { |
@@ -1252,6 +1320,10 @@ int dm_table_any_congested(struct dm_table *t, int bdi_bits) | |||
1252 | bdevname(dd->dm_dev.bdev, b)); | 1320 | bdevname(dd->dm_dev.bdev, b)); |
1253 | } | 1321 | } |
1254 | 1322 | ||
1323 | list_for_each_entry(cb, &t->target_callbacks, list) | ||
1324 | if (cb->congested_fn) | ||
1325 | r |= cb->congested_fn(cb, bdi_bits); | ||
1326 | |||
1255 | return r; | 1327 | return r; |
1256 | } | 1328 | } |
1257 | 1329 | ||
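dm_table_add_target_callbacks() is exported so a stacked target can have its own congestion state consulted by the loop added to dm_table_any_congested() above. A hypothetical target might register a callback as below; my_target, my_target_congested and internal_queue_busy are illustrative only, reaching the table through ti->table is assumed, and destructor cleanup is omitted:

	struct my_target {
		struct dm_target_callbacks callbacks;
		bool internal_queue_busy;
		/* ... other target-private state ... */
	};

	static int my_target_congested(struct dm_target_callbacks *cb, int bdi_bits)
	{
		struct my_target *mt = container_of(cb, struct my_target, callbacks);

		/* report congestion while the target's internal queue is backed up */
		return mt->internal_queue_busy ? bdi_bits : 0;
	}

	static int my_target_ctr(struct dm_target *ti, unsigned argc, char **argv)
	{
		struct my_target *mt = kzalloc(sizeof(*mt), GFP_KERNEL);

		if (!mt)
			return -ENOMEM;
		mt->callbacks.congested_fn = my_target_congested;
		dm_table_add_target_callbacks(ti->table, &mt->callbacks);
		ti->private = mt;
		return 0;
	}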
@@ -1269,24 +1341,6 @@ int dm_table_any_busy_target(struct dm_table *t) | |||
1269 | return 0; | 1341 | return 0; |
1270 | } | 1342 | } |
1271 | 1343 | ||
1272 | void dm_table_unplug_all(struct dm_table *t) | ||
1273 | { | ||
1274 | struct dm_dev_internal *dd; | ||
1275 | struct list_head *devices = dm_table_get_devices(t); | ||
1276 | |||
1277 | list_for_each_entry(dd, devices, list) { | ||
1278 | struct request_queue *q = bdev_get_queue(dd->dm_dev.bdev); | ||
1279 | char b[BDEVNAME_SIZE]; | ||
1280 | |||
1281 | if (likely(q)) | ||
1282 | blk_unplug(q); | ||
1283 | else | ||
1284 | DMWARN_LIMIT("%s: Cannot unplug nonexistent device %s", | ||
1285 | dm_device_name(t->md), | ||
1286 | bdevname(dd->dm_dev.bdev, b)); | ||
1287 | } | ||
1288 | } | ||
1289 | |||
1290 | struct mapped_device *dm_table_get_md(struct dm_table *t) | 1344 | struct mapped_device *dm_table_get_md(struct dm_table *t) |
1291 | { | 1345 | { |
1292 | return t->md; | 1346 | return t->md; |
@@ -1309,7 +1363,8 @@ bool dm_table_supports_discards(struct dm_table *t) | |||
1309 | return 0; | 1363 | return 0; |
1310 | 1364 | ||
1311 | /* | 1365 | /* |
1312 | * Ensure that at least one underlying device supports discards. | 1366 | * Unless any target used by the table set discards_supported, |
1367 | * require at least one underlying device to support discards. | ||
1313 | * t->devices includes internal dm devices such as mirror logs | 1368 | * t->devices includes internal dm devices such as mirror logs |
1314 | * so we need to use iterate_devices here, which targets | 1369 | * so we need to use iterate_devices here, which targets |
1315 | * supporting discard must provide. | 1370 | * supporting discard must provide. |
@@ -1317,6 +1372,9 @@ bool dm_table_supports_discards(struct dm_table *t) | |||
1317 | while (i < dm_table_get_num_targets(t)) { | 1372 | while (i < dm_table_get_num_targets(t)) { |
1318 | ti = dm_table_get_target(t, i++); | 1373 | ti = dm_table_get_target(t, i++); |
1319 | 1374 | ||
1375 | if (ti->discards_supported) | ||
1376 | return 1; | ||
1377 | |||
1320 | if (ti->type->iterate_devices && | 1378 | if (ti->type->iterate_devices && |
1321 | ti->type->iterate_devices(ti, device_discard_capable, NULL)) | 1379 | ti->type->iterate_devices(ti, device_discard_capable, NULL)) |
1322 | return 1; | 1380 | return 1; |
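With the early return added above, discards_supported acts as a per-target override rather than a property derived from the underlying stack: a target that implements discards internally can advertise them even when no underlying device does. A minimal, hypothetical constructor fragment:

	static int my_discard_ctr(struct dm_target *ti, unsigned argc, char **argv)
	{
		/* ... parse arguments, dm_get_device(), etc. ... */

		ti->num_flush_requests = 1;
		ti->discards_supported = 1;	/* table reports discard support
						 * regardless of underlying devices */
		return 0;
	}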
@@ -1334,4 +1392,3 @@ EXPORT_SYMBOL(dm_table_get_mode); | |||
1334 | EXPORT_SYMBOL(dm_table_get_md); | 1392 | EXPORT_SYMBOL(dm_table_get_md); |
1335 | EXPORT_SYMBOL(dm_table_put); | 1393 | EXPORT_SYMBOL(dm_table_put); |
1336 | EXPORT_SYMBOL(dm_table_get); | 1394 | EXPORT_SYMBOL(dm_table_get); |
1337 | EXPORT_SYMBOL(dm_table_unplug_all); | ||
diff --git a/drivers/md/dm.c b/drivers/md/dm.c index ac384b2a6a33..0cf68b478878 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c | |||
@@ -15,7 +15,6 @@ | |||
15 | #include <linux/blkpg.h> | 15 | #include <linux/blkpg.h> |
16 | #include <linux/bio.h> | 16 | #include <linux/bio.h> |
17 | #include <linux/buffer_head.h> | 17 | #include <linux/buffer_head.h> |
18 | #include <linux/smp_lock.h> | ||
19 | #include <linux/mempool.h> | 18 | #include <linux/mempool.h> |
20 | #include <linux/slab.h> | 19 | #include <linux/slab.h> |
21 | #include <linux/idr.h> | 20 | #include <linux/idr.h> |
@@ -110,7 +109,6 @@ EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo); | |||
110 | #define DMF_FREEING 3 | 109 | #define DMF_FREEING 3 |
111 | #define DMF_DELETING 4 | 110 | #define DMF_DELETING 4 |
112 | #define DMF_NOFLUSH_SUSPENDING 5 | 111 | #define DMF_NOFLUSH_SUSPENDING 5 |
113 | #define DMF_QUEUE_IO_TO_THREAD 6 | ||
114 | 112 | ||
115 | /* | 113 | /* |
116 | * Work processed by per-device workqueue. | 114 | * Work processed by per-device workqueue. |
@@ -144,24 +142,9 @@ struct mapped_device { | |||
144 | spinlock_t deferred_lock; | 142 | spinlock_t deferred_lock; |
145 | 143 | ||
146 | /* | 144 | /* |
147 | * An error from the barrier request currently being processed. | 145 | * Processing queue (flush) |
148 | */ | ||
149 | int barrier_error; | ||
150 | |||
151 | /* | ||
152 | * Protect barrier_error from concurrent endio processing | ||
153 | * in request-based dm. | ||
154 | */ | ||
155 | spinlock_t barrier_error_lock; | ||
156 | |||
157 | /* | ||
158 | * Processing queue (flush/barriers) | ||
159 | */ | 146 | */ |
160 | struct workqueue_struct *wq; | 147 | struct workqueue_struct *wq; |
161 | struct work_struct barrier_work; | ||
162 | |||
163 | /* A pointer to the currently processing pre/post flush request */ | ||
164 | struct request *flush_request; | ||
165 | 148 | ||
166 | /* | 149 | /* |
167 | * The current mapping. | 150 | * The current mapping. |
@@ -200,8 +183,8 @@ struct mapped_device { | |||
200 | /* sysfs handle */ | 183 | /* sysfs handle */ |
201 | struct kobject kobj; | 184 | struct kobject kobj; |
202 | 185 | ||
203 | /* zero-length barrier that will be cloned and submitted to targets */ | 186 | /* zero-length flush that will be cloned and submitted to targets */ |
204 | struct bio barrier_bio; | 187 | struct bio flush_bio; |
205 | }; | 188 | }; |
206 | 189 | ||
207 | /* | 190 | /* |
@@ -344,7 +327,6 @@ static int dm_blk_open(struct block_device *bdev, fmode_t mode) | |||
344 | { | 327 | { |
345 | struct mapped_device *md; | 328 | struct mapped_device *md; |
346 | 329 | ||
347 | lock_kernel(); | ||
348 | spin_lock(&_minor_lock); | 330 | spin_lock(&_minor_lock); |
349 | 331 | ||
350 | md = bdev->bd_disk->private_data; | 332 | md = bdev->bd_disk->private_data; |
@@ -362,7 +344,6 @@ static int dm_blk_open(struct block_device *bdev, fmode_t mode) | |||
362 | 344 | ||
363 | out: | 345 | out: |
364 | spin_unlock(&_minor_lock); | 346 | spin_unlock(&_minor_lock); |
365 | unlock_kernel(); | ||
366 | 347 | ||
367 | return md ? 0 : -ENXIO; | 348 | return md ? 0 : -ENXIO; |
368 | } | 349 | } |
@@ -371,10 +352,12 @@ static int dm_blk_close(struct gendisk *disk, fmode_t mode) | |||
371 | { | 352 | { |
372 | struct mapped_device *md = disk->private_data; | 353 | struct mapped_device *md = disk->private_data; |
373 | 354 | ||
374 | lock_kernel(); | 355 | spin_lock(&_minor_lock); |
356 | |||
375 | atomic_dec(&md->open_count); | 357 | atomic_dec(&md->open_count); |
376 | dm_put(md); | 358 | dm_put(md); |
377 | unlock_kernel(); | 359 | |
360 | spin_unlock(&_minor_lock); | ||
378 | 361 | ||
379 | return 0; | 362 | return 0; |
380 | } | 363 | } |
@@ -494,7 +477,8 @@ static void start_io_acct(struct dm_io *io) | |||
494 | cpu = part_stat_lock(); | 477 | cpu = part_stat_lock(); |
495 | part_round_stats(cpu, &dm_disk(md)->part0); | 478 | part_round_stats(cpu, &dm_disk(md)->part0); |
496 | part_stat_unlock(); | 479 | part_stat_unlock(); |
497 | dm_disk(md)->part0.in_flight[rw] = atomic_inc_return(&md->pending[rw]); | 480 | atomic_set(&dm_disk(md)->part0.in_flight[rw], |
481 | atomic_inc_return(&md->pending[rw])); | ||
498 | } | 482 | } |
499 | 483 | ||
500 | static void end_io_acct(struct dm_io *io) | 484 | static void end_io_acct(struct dm_io *io) |
@@ -512,10 +496,10 @@ static void end_io_acct(struct dm_io *io) | |||
512 | 496 | ||
513 | /* | 497 | /* |
514 | * After this is decremented the bio must not be touched if it is | 498 | * After this is decremented the bio must not be touched if it is |
515 | * a barrier. | 499 | * a flush. |
516 | */ | 500 | */ |
517 | dm_disk(md)->part0.in_flight[rw] = pending = | 501 | pending = atomic_dec_return(&md->pending[rw]); |
518 | atomic_dec_return(&md->pending[rw]); | 502 | atomic_set(&dm_disk(md)->part0.in_flight[rw], pending); |
519 | pending += atomic_read(&md->pending[rw^0x1]); | 503 | pending += atomic_read(&md->pending[rw^0x1]); |
520 | 504 | ||
521 | /* nudge anyone waiting on suspend queue */ | 505 | /* nudge anyone waiting on suspend queue */ |
@@ -528,16 +512,12 @@ static void end_io_acct(struct dm_io *io) | |||
528 | */ | 512 | */ |
529 | static void queue_io(struct mapped_device *md, struct bio *bio) | 513 | static void queue_io(struct mapped_device *md, struct bio *bio) |
530 | { | 514 | { |
531 | down_write(&md->io_lock); | 515 | unsigned long flags; |
532 | 516 | ||
533 | spin_lock_irq(&md->deferred_lock); | 517 | spin_lock_irqsave(&md->deferred_lock, flags); |
534 | bio_list_add(&md->deferred, bio); | 518 | bio_list_add(&md->deferred, bio); |
535 | spin_unlock_irq(&md->deferred_lock); | 519 | spin_unlock_irqrestore(&md->deferred_lock, flags); |
536 | 520 | queue_work(md->wq, &md->work); | |
537 | if (!test_and_set_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags)) | ||
538 | queue_work(md->wq, &md->work); | ||
539 | |||
540 | up_write(&md->io_lock); | ||
541 | } | 521 | } |
542 | 522 | ||
543 | /* | 523 | /* |
@@ -625,11 +605,9 @@ static void dec_pending(struct dm_io *io, int error) | |||
625 | * Target requested pushing back the I/O. | 605 | * Target requested pushing back the I/O. |
626 | */ | 606 | */ |
627 | spin_lock_irqsave(&md->deferred_lock, flags); | 607 | spin_lock_irqsave(&md->deferred_lock, flags); |
628 | if (__noflush_suspending(md)) { | 608 | if (__noflush_suspending(md)) |
629 | if (!(io->bio->bi_rw & REQ_HARDBARRIER)) | 609 | bio_list_add_head(&md->deferred, io->bio); |
630 | bio_list_add_head(&md->deferred, | 610 | else |
631 | io->bio); | ||
632 | } else | ||
633 | /* noflush suspend was interrupted. */ | 611 | /* noflush suspend was interrupted. */ |
634 | io->error = -EIO; | 612 | io->error = -EIO; |
635 | spin_unlock_irqrestore(&md->deferred_lock, flags); | 613 | spin_unlock_irqrestore(&md->deferred_lock, flags); |
@@ -637,32 +615,23 @@ static void dec_pending(struct dm_io *io, int error) | |||
637 | 615 | ||
638 | io_error = io->error; | 616 | io_error = io->error; |
639 | bio = io->bio; | 617 | bio = io->bio; |
618 | end_io_acct(io); | ||
619 | free_io(md, io); | ||
620 | |||
621 | if (io_error == DM_ENDIO_REQUEUE) | ||
622 | return; | ||
640 | 623 | ||
641 | if (bio->bi_rw & REQ_HARDBARRIER) { | 624 | if ((bio->bi_rw & REQ_FLUSH) && bio->bi_size) { |
642 | /* | 625 | /* |
643 | * There can be just one barrier request so we use | 626 | * Preflush done for flush with data, reissue |
644 | * a per-device variable for error reporting. | 627 | * without REQ_FLUSH. |
645 | * Note that you can't touch the bio after end_io_acct | ||
646 | * | ||
647 | * We ignore -EOPNOTSUPP for empty flush reported by | ||
648 | * underlying devices. We assume that if the device | ||
649 | * doesn't support empty barriers, it doesn't need | ||
650 | * cache flushing commands. | ||
651 | */ | 628 | */ |
652 | if (!md->barrier_error && | 629 | bio->bi_rw &= ~REQ_FLUSH; |
653 | !(bio_empty_barrier(bio) && io_error == -EOPNOTSUPP)) | 630 | queue_io(md, bio); |
654 | md->barrier_error = io_error; | ||
655 | end_io_acct(io); | ||
656 | free_io(md, io); | ||
657 | } else { | 631 | } else { |
658 | end_io_acct(io); | 632 | /* done with normal IO or empty flush */ |
659 | free_io(md, io); | 633 | trace_block_bio_complete(md->queue, bio, io_error); |
660 | 634 | bio_endio(bio, io_error); | |
661 | if (io_error != DM_ENDIO_REQUEUE) { | ||
662 | trace_block_bio_complete(md->queue, bio); | ||
663 | |||
664 | bio_endio(bio, io_error); | ||
665 | } | ||
666 | } | 635 | } |
667 | } | 636 | } |
668 | } | 637 | } |
@@ -755,23 +724,6 @@ static void end_clone_bio(struct bio *clone, int error) | |||
755 | blk_update_request(tio->orig, 0, nr_bytes); | 724 | blk_update_request(tio->orig, 0, nr_bytes); |
756 | } | 725 | } |
757 | 726 | ||
758 | static void store_barrier_error(struct mapped_device *md, int error) | ||
759 | { | ||
760 | unsigned long flags; | ||
761 | |||
762 | spin_lock_irqsave(&md->barrier_error_lock, flags); | ||
763 | /* | ||
764 | * Basically, the first error is taken, but: | ||
765 | * -EOPNOTSUPP supersedes any I/O error. | ||
766 | * Requeue request supersedes any I/O error but -EOPNOTSUPP. | ||
767 | */ | ||
768 | if (!md->barrier_error || error == -EOPNOTSUPP || | ||
769 | (md->barrier_error != -EOPNOTSUPP && | ||
770 | error == DM_ENDIO_REQUEUE)) | ||
771 | md->barrier_error = error; | ||
772 | spin_unlock_irqrestore(&md->barrier_error_lock, flags); | ||
773 | } | ||
774 | |||
775 | /* | 727 | /* |
776 | * Don't touch any member of the md after calling this function because | 728 | * Don't touch any member of the md after calling this function because |
777 | * the md may be freed in dm_put() at the end of this function. | 729 | * the md may be freed in dm_put() at the end of this function. |
@@ -809,13 +761,11 @@ static void free_rq_clone(struct request *clone) | |||
809 | static void dm_end_request(struct request *clone, int error) | 761 | static void dm_end_request(struct request *clone, int error) |
810 | { | 762 | { |
811 | int rw = rq_data_dir(clone); | 763 | int rw = rq_data_dir(clone); |
812 | int run_queue = 1; | ||
813 | bool is_barrier = clone->cmd_flags & REQ_HARDBARRIER; | ||
814 | struct dm_rq_target_io *tio = clone->end_io_data; | 764 | struct dm_rq_target_io *tio = clone->end_io_data; |
815 | struct mapped_device *md = tio->md; | 765 | struct mapped_device *md = tio->md; |
816 | struct request *rq = tio->orig; | 766 | struct request *rq = tio->orig; |
817 | 767 | ||
818 | if (rq->cmd_type == REQ_TYPE_BLOCK_PC && !is_barrier) { | 768 | if (rq->cmd_type == REQ_TYPE_BLOCK_PC) { |
819 | rq->errors = clone->errors; | 769 | rq->errors = clone->errors; |
820 | rq->resid_len = clone->resid_len; | 770 | rq->resid_len = clone->resid_len; |
821 | 771 | ||
@@ -829,15 +779,8 @@ static void dm_end_request(struct request *clone, int error) | |||
829 | } | 779 | } |
830 | 780 | ||
831 | free_rq_clone(clone); | 781 | free_rq_clone(clone); |
832 | 782 | blk_end_request_all(rq, error); | |
833 | if (unlikely(is_barrier)) { | 783 | rq_completed(md, rw, true); |
834 | if (unlikely(error)) | ||
835 | store_barrier_error(md, error); | ||
836 | run_queue = 0; | ||
837 | } else | ||
838 | blk_end_request_all(rq, error); | ||
839 | |||
840 | rq_completed(md, rw, run_queue); | ||
841 | } | 784 | } |
842 | 785 | ||
843 | static void dm_unprep_request(struct request *rq) | 786 | static void dm_unprep_request(struct request *rq) |
@@ -862,21 +805,9 @@ void dm_requeue_unmapped_request(struct request *clone) | |||
862 | struct request_queue *q = rq->q; | 805 | struct request_queue *q = rq->q; |
863 | unsigned long flags; | 806 | unsigned long flags; |
864 | 807 | ||
865 | if (unlikely(clone->cmd_flags & REQ_HARDBARRIER)) { | ||
866 | /* | ||
867 | * Barrier clones share an original request. | ||
868 | * Leave it to dm_end_request(), which handles this special | ||
869 | * case. | ||
870 | */ | ||
871 | dm_end_request(clone, DM_ENDIO_REQUEUE); | ||
872 | return; | ||
873 | } | ||
874 | |||
875 | dm_unprep_request(rq); | 808 | dm_unprep_request(rq); |
876 | 809 | ||
877 | spin_lock_irqsave(q->queue_lock, flags); | 810 | spin_lock_irqsave(q->queue_lock, flags); |
878 | if (elv_queue_empty(q)) | ||
879 | blk_plug_device(q); | ||
880 | blk_requeue_request(q, rq); | 811 | blk_requeue_request(q, rq); |
881 | spin_unlock_irqrestore(q->queue_lock, flags); | 812 | spin_unlock_irqrestore(q->queue_lock, flags); |
882 | 813 | ||
@@ -961,19 +892,6 @@ static void dm_complete_request(struct request *clone, int error) | |||
961 | struct dm_rq_target_io *tio = clone->end_io_data; | 892 | struct dm_rq_target_io *tio = clone->end_io_data; |
962 | struct request *rq = tio->orig; | 893 | struct request *rq = tio->orig; |
963 | 894 | ||
964 | if (unlikely(clone->cmd_flags & REQ_HARDBARRIER)) { | ||
965 | /* | ||
966 | * Barrier clones share an original request. So can't use | ||
967 | * softirq_done with the original. | ||
968 | * Pass the clone to dm_done() directly in this special case. | ||
969 | * It is safe (even if clone->q->queue_lock is held here) | ||
970 | * because there is no I/O dispatching during the completion | ||
971 | * of barrier clone. | ||
972 | */ | ||
973 | dm_done(clone, error, true); | ||
974 | return; | ||
975 | } | ||
976 | |||
977 | tio->error = error; | 895 | tio->error = error; |
978 | rq->completion_data = clone; | 896 | rq->completion_data = clone; |
979 | blk_complete_request(rq); | 897 | blk_complete_request(rq); |
@@ -990,17 +908,6 @@ void dm_kill_unmapped_request(struct request *clone, int error) | |||
990 | struct dm_rq_target_io *tio = clone->end_io_data; | 908 | struct dm_rq_target_io *tio = clone->end_io_data; |
991 | struct request *rq = tio->orig; | 909 | struct request *rq = tio->orig; |
992 | 910 | ||
993 | if (unlikely(clone->cmd_flags & REQ_HARDBARRIER)) { | ||
994 | /* | ||
995 | * Barrier clones share an original request. | ||
996 | * Leave it to dm_end_request(), which handles this special | ||
997 | * case. | ||
998 | */ | ||
999 | BUG_ON(error > 0); | ||
1000 | dm_end_request(clone, error); | ||
1001 | return; | ||
1002 | } | ||
1003 | |||
1004 | rq->cmd_flags |= REQ_FAILED; | 911 | rq->cmd_flags |= REQ_FAILED; |
1005 | dm_complete_request(clone, error); | 912 | dm_complete_request(clone, error); |
1006 | } | 913 | } |
@@ -1081,8 +988,8 @@ static void __map_bio(struct dm_target *ti, struct bio *clone, | |||
1081 | if (r == DM_MAPIO_REMAPPED) { | 988 | if (r == DM_MAPIO_REMAPPED) { |
1082 | /* the bio has been remapped so dispatch it */ | 989 | /* the bio has been remapped so dispatch it */ |
1083 | 990 | ||
1084 | trace_block_remap(bdev_get_queue(clone->bi_bdev), clone, | 991 | trace_block_bio_remap(bdev_get_queue(clone->bi_bdev), clone, |
1085 | tio->io->bio->bi_bdev->bd_dev, sector); | 992 | tio->io->bio->bi_bdev->bd_dev, sector); |
1086 | 993 | ||
1087 | generic_make_request(clone); | 994 | generic_make_request(clone); |
1088 | } else if (r < 0 || r == DM_MAPIO_REQUEUE) { | 995 | } else if (r < 0 || r == DM_MAPIO_REQUEUE) { |
@@ -1119,7 +1026,7 @@ static void dm_bio_destructor(struct bio *bio) | |||
1119 | } | 1026 | } |
1120 | 1027 | ||
1121 | /* | 1028 | /* |
1122 | * Creates a little bio that is just does part of a bvec. | 1029 | * Creates a little bio that just does part of a bvec. |
1123 | */ | 1030 | */ |
1124 | static struct bio *split_bvec(struct bio *bio, sector_t sector, | 1031 | static struct bio *split_bvec(struct bio *bio, sector_t sector, |
1125 | unsigned short idx, unsigned int offset, | 1032 | unsigned short idx, unsigned int offset, |
@@ -1134,7 +1041,7 @@ static struct bio *split_bvec(struct bio *bio, sector_t sector, | |||
1134 | 1041 | ||
1135 | clone->bi_sector = sector; | 1042 | clone->bi_sector = sector; |
1136 | clone->bi_bdev = bio->bi_bdev; | 1043 | clone->bi_bdev = bio->bi_bdev; |
1137 | clone->bi_rw = bio->bi_rw & ~REQ_HARDBARRIER; | 1044 | clone->bi_rw = bio->bi_rw; |
1138 | clone->bi_vcnt = 1; | 1045 | clone->bi_vcnt = 1; |
1139 | clone->bi_size = to_bytes(len); | 1046 | clone->bi_size = to_bytes(len); |
1140 | clone->bi_io_vec->bv_offset = offset; | 1047 | clone->bi_io_vec->bv_offset = offset; |
@@ -1161,7 +1068,6 @@ static struct bio *clone_bio(struct bio *bio, sector_t sector, | |||
1161 | 1068 | ||
1162 | clone = bio_alloc_bioset(GFP_NOIO, bio->bi_max_vecs, bs); | 1069 | clone = bio_alloc_bioset(GFP_NOIO, bio->bi_max_vecs, bs); |
1163 | __bio_clone(clone, bio); | 1070 | __bio_clone(clone, bio); |
1164 | clone->bi_rw &= ~REQ_HARDBARRIER; | ||
1165 | clone->bi_destructor = dm_bio_destructor; | 1071 | clone->bi_destructor = dm_bio_destructor; |
1166 | clone->bi_sector = sector; | 1072 | clone->bi_sector = sector; |
1167 | clone->bi_idx = idx; | 1073 | clone->bi_idx = idx; |
@@ -1225,16 +1131,15 @@ static void __issue_target_requests(struct clone_info *ci, struct dm_target *ti, | |||
1225 | __issue_target_request(ci, ti, request_nr, len); | 1131 | __issue_target_request(ci, ti, request_nr, len); |
1226 | } | 1132 | } |
1227 | 1133 | ||
1228 | static int __clone_and_map_empty_barrier(struct clone_info *ci) | 1134 | static int __clone_and_map_empty_flush(struct clone_info *ci) |
1229 | { | 1135 | { |
1230 | unsigned target_nr = 0; | 1136 | unsigned target_nr = 0; |
1231 | struct dm_target *ti; | 1137 | struct dm_target *ti; |
1232 | 1138 | ||
1139 | BUG_ON(bio_has_data(ci->bio)); | ||
1233 | while ((ti = dm_table_get_target(ci->map, target_nr++))) | 1140 | while ((ti = dm_table_get_target(ci->map, target_nr++))) |
1234 | __issue_target_requests(ci, ti, ti->num_flush_requests, 0); | 1141 | __issue_target_requests(ci, ti, ti->num_flush_requests, 0); |
1235 | 1142 | ||
1236 | ci->sector_count = 0; | ||
1237 | |||
1238 | return 0; | 1143 | return 0; |
1239 | } | 1144 | } |
1240 | 1145 | ||
@@ -1289,9 +1194,6 @@ static int __clone_and_map(struct clone_info *ci) | |||
1289 | sector_t len = 0, max; | 1194 | sector_t len = 0, max; |
1290 | struct dm_target_io *tio; | 1195 | struct dm_target_io *tio; |
1291 | 1196 | ||
1292 | if (unlikely(bio_empty_barrier(bio))) | ||
1293 | return __clone_and_map_empty_barrier(ci); | ||
1294 | |||
1295 | if (unlikely(bio->bi_rw & REQ_DISCARD)) | 1197 | if (unlikely(bio->bi_rw & REQ_DISCARD)) |
1296 | return __clone_and_map_discard(ci); | 1198 | return __clone_and_map_discard(ci); |
1297 | 1199 | ||
@@ -1383,16 +1285,11 @@ static void __split_and_process_bio(struct mapped_device *md, struct bio *bio) | |||
1383 | 1285 | ||
1384 | ci.map = dm_get_live_table(md); | 1286 | ci.map = dm_get_live_table(md); |
1385 | if (unlikely(!ci.map)) { | 1287 | if (unlikely(!ci.map)) { |
1386 | if (!(bio->bi_rw & REQ_HARDBARRIER)) | 1288 | bio_io_error(bio); |
1387 | bio_io_error(bio); | ||
1388 | else | ||
1389 | if (!md->barrier_error) | ||
1390 | md->barrier_error = -EIO; | ||
1391 | return; | 1289 | return; |
1392 | } | 1290 | } |
1393 | 1291 | ||
1394 | ci.md = md; | 1292 | ci.md = md; |
1395 | ci.bio = bio; | ||
1396 | ci.io = alloc_io(md); | 1293 | ci.io = alloc_io(md); |
1397 | ci.io->error = 0; | 1294 | ci.io->error = 0; |
1398 | atomic_set(&ci.io->io_count, 1); | 1295 | atomic_set(&ci.io->io_count, 1); |
@@ -1400,14 +1297,20 @@ static void __split_and_process_bio(struct mapped_device *md, struct bio *bio) | |||
1400 | ci.io->md = md; | 1297 | ci.io->md = md; |
1401 | spin_lock_init(&ci.io->endio_lock); | 1298 | spin_lock_init(&ci.io->endio_lock); |
1402 | ci.sector = bio->bi_sector; | 1299 | ci.sector = bio->bi_sector; |
1403 | ci.sector_count = bio_sectors(bio); | ||
1404 | if (unlikely(bio_empty_barrier(bio))) | ||
1405 | ci.sector_count = 1; | ||
1406 | ci.idx = bio->bi_idx; | 1300 | ci.idx = bio->bi_idx; |
1407 | 1301 | ||
1408 | start_io_acct(ci.io); | 1302 | start_io_acct(ci.io); |
1409 | while (ci.sector_count && !error) | 1303 | if (bio->bi_rw & REQ_FLUSH) { |
1410 | error = __clone_and_map(&ci); | 1304 | ci.bio = &ci.md->flush_bio; |
1305 | ci.sector_count = 0; | ||
1306 | error = __clone_and_map_empty_flush(&ci); | ||
1307 | /* dec_pending submits any data associated with flush */ | ||
1308 | } else { | ||
1309 | ci.bio = bio; | ||
1310 | ci.sector_count = bio_sectors(bio); | ||
1311 | while (ci.sector_count && !error) | ||
1312 | error = __clone_and_map(&ci); | ||
1313 | } | ||
1411 | 1314 | ||
1412 | /* drop the extra reference count */ | 1315 | /* drop the extra reference count */ |
1413 | dec_pending(ci.io, error); | 1316 | dec_pending(ci.io, error); |
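Combined with the dec_pending() change earlier in this file, the flush handling for bio-based dm now works in two passes instead of draining the whole queue around a barrier. A rough summary of the path, condensed from the code in this patch:

	/*
	 * __split_and_process_bio(md, bio):
	 *   if (bio->bi_rw & REQ_FLUSH)
	 *       clone md->flush_bio to every target via
	 *       __clone_and_map_empty_flush() and set sector_count = 0
	 *   else
	 *       split and map the data as before
	 *
	 * dec_pending(io, error), once all clones complete:
	 *   if ((bio->bi_rw & REQ_FLUSH) && bio->bi_size)
	 *       preflush finished; clear REQ_FLUSH and queue_io(md, bio)
	 *       so the data portion is processed as a normal bio
	 *   else
	 *       bio_endio(bio, io_error)   /- plain I/O or empty flush -/
	 */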
@@ -1491,22 +1394,14 @@ static int _dm_request(struct request_queue *q, struct bio *bio) | |||
1491 | part_stat_add(cpu, &dm_disk(md)->part0, sectors[rw], bio_sectors(bio)); | 1394 | part_stat_add(cpu, &dm_disk(md)->part0, sectors[rw], bio_sectors(bio)); |
1492 | part_stat_unlock(); | 1395 | part_stat_unlock(); |
1493 | 1396 | ||
1494 | /* | 1397 | /* if we're suspended, we have to queue this io for later */ |
1495 | * If we're suspended or the thread is processing barriers | 1398 | if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) { |
1496 | * we have to queue this io for later. | ||
1497 | */ | ||
1498 | if (unlikely(test_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags)) || | ||
1499 | unlikely(bio->bi_rw & REQ_HARDBARRIER)) { | ||
1500 | up_read(&md->io_lock); | 1399 | up_read(&md->io_lock); |
1501 | 1400 | ||
1502 | if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) && | 1401 | if (bio_rw(bio) != READA) |
1503 | bio_rw(bio) == READA) { | 1402 | queue_io(md, bio); |
1403 | else | ||
1504 | bio_io_error(bio); | 1404 | bio_io_error(bio); |
1505 | return 0; | ||
1506 | } | ||
1507 | |||
1508 | queue_io(md, bio); | ||
1509 | |||
1510 | return 0; | 1405 | return 0; |
1511 | } | 1406 | } |
1512 | 1407 | ||
@@ -1537,14 +1432,6 @@ static int dm_request(struct request_queue *q, struct bio *bio) | |||
1537 | return _dm_request(q, bio); | 1432 | return _dm_request(q, bio); |
1538 | } | 1433 | } |
1539 | 1434 | ||
1540 | static bool dm_rq_is_flush_request(struct request *rq) | ||
1541 | { | ||
1542 | if (rq->cmd_flags & REQ_FLUSH) | ||
1543 | return true; | ||
1544 | else | ||
1545 | return false; | ||
1546 | } | ||
1547 | |||
1548 | void dm_dispatch_request(struct request *rq) | 1435 | void dm_dispatch_request(struct request *rq) |
1549 | { | 1436 | { |
1550 | int r; | 1437 | int r; |
@@ -1592,22 +1479,15 @@ static int setup_clone(struct request *clone, struct request *rq, | |||
1592 | { | 1479 | { |
1593 | int r; | 1480 | int r; |
1594 | 1481 | ||
1595 | if (dm_rq_is_flush_request(rq)) { | 1482 | r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC, |
1596 | blk_rq_init(NULL, clone); | 1483 | dm_rq_bio_constructor, tio); |
1597 | clone->cmd_type = REQ_TYPE_FS; | 1484 | if (r) |
1598 | clone->cmd_flags |= (REQ_HARDBARRIER | WRITE); | 1485 | return r; |
1599 | } else { | ||
1600 | r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC, | ||
1601 | dm_rq_bio_constructor, tio); | ||
1602 | if (r) | ||
1603 | return r; | ||
1604 | |||
1605 | clone->cmd = rq->cmd; | ||
1606 | clone->cmd_len = rq->cmd_len; | ||
1607 | clone->sense = rq->sense; | ||
1608 | clone->buffer = rq->buffer; | ||
1609 | } | ||
1610 | 1486 | ||
1487 | clone->cmd = rq->cmd; | ||
1488 | clone->cmd_len = rq->cmd_len; | ||
1489 | clone->sense = rq->sense; | ||
1490 | clone->buffer = rq->buffer; | ||
1611 | clone->end_io = end_clone_request; | 1491 | clone->end_io = end_clone_request; |
1612 | clone->end_io_data = tio; | 1492 | clone->end_io_data = tio; |
1613 | 1493 | ||
@@ -1648,9 +1528,6 @@ static int dm_prep_fn(struct request_queue *q, struct request *rq) | |||
1648 | struct mapped_device *md = q->queuedata; | 1528 | struct mapped_device *md = q->queuedata; |
1649 | struct request *clone; | 1529 | struct request *clone; |
1650 | 1530 | ||
1651 | if (unlikely(dm_rq_is_flush_request(rq))) | ||
1652 | return BLKPREP_OK; | ||
1653 | |||
1654 | if (unlikely(rq->special)) { | 1531 | if (unlikely(rq->special)) { |
1655 | DMWARN("Already has something in rq->special."); | 1532 | DMWARN("Already has something in rq->special."); |
1656 | return BLKPREP_KILL; | 1533 | return BLKPREP_KILL; |
@@ -1727,6 +1604,7 @@ static void dm_request_fn(struct request_queue *q) | |||
1727 | struct dm_table *map = dm_get_live_table(md); | 1604 | struct dm_table *map = dm_get_live_table(md); |
1728 | struct dm_target *ti; | 1605 | struct dm_target *ti; |
1729 | struct request *rq, *clone; | 1606 | struct request *rq, *clone; |
1607 | sector_t pos; | ||
1730 | 1608 | ||
1731 | /* | 1609 | /* |
1732 | * For suspend, check blk_queue_stopped() and increment | 1610 | * For suspend, check blk_queue_stopped() and increment |
@@ -1734,22 +1612,21 @@ static void dm_request_fn(struct request_queue *q) | |||
1734 | * number of in-flight I/Os after the queue is stopped in | 1612 | * number of in-flight I/Os after the queue is stopped in |
1735 | * dm_suspend(). | 1613 | * dm_suspend(). |
1736 | */ | 1614 | */ |
1737 | while (!blk_queue_plugged(q) && !blk_queue_stopped(q)) { | 1615 | while (!blk_queue_stopped(q)) { |
1738 | rq = blk_peek_request(q); | 1616 | rq = blk_peek_request(q); |
1739 | if (!rq) | 1617 | if (!rq) |
1740 | goto plug_and_out; | 1618 | goto delay_and_out; |
1741 | 1619 | ||
1742 | if (unlikely(dm_rq_is_flush_request(rq))) { | 1620 | /* always use block 0 to find the target for flushes for now */ |
1743 | BUG_ON(md->flush_request); | 1621 | pos = 0; |
1744 | md->flush_request = rq; | 1622 | if (!(rq->cmd_flags & REQ_FLUSH)) |
1745 | blk_start_request(rq); | 1623 | pos = blk_rq_pos(rq); |
1746 | queue_work(md->wq, &md->barrier_work); | 1624 | |
1747 | goto out; | 1625 | ti = dm_table_find_target(map, pos); |
1748 | } | 1626 | BUG_ON(!dm_target_is_valid(ti)); |
1749 | 1627 | ||
1750 | ti = dm_table_find_target(map, blk_rq_pos(rq)); | ||
1751 | if (ti->type->busy && ti->type->busy(ti)) | 1628 | if (ti->type->busy && ti->type->busy(ti)) |
1752 | goto plug_and_out; | 1629 | goto delay_and_out; |
1753 | 1630 | ||
1754 | blk_start_request(rq); | 1631 | blk_start_request(rq); |
1755 | clone = rq->special; | 1632 | clone = rq->special; |
@@ -1759,19 +1636,18 @@ static void dm_request_fn(struct request_queue *q) | |||
1759 | if (map_request(ti, clone, md)) | 1636 | if (map_request(ti, clone, md)) |
1760 | goto requeued; | 1637 | goto requeued; |
1761 | 1638 | ||
1762 | spin_lock_irq(q->queue_lock); | 1639 | BUG_ON(!irqs_disabled()); |
1640 | spin_lock(q->queue_lock); | ||
1763 | } | 1641 | } |
1764 | 1642 | ||
1765 | goto out; | 1643 | goto out; |
1766 | 1644 | ||
1767 | requeued: | 1645 | requeued: |
1768 | spin_lock_irq(q->queue_lock); | 1646 | BUG_ON(!irqs_disabled()); |
1769 | 1647 | spin_lock(q->queue_lock); | |
1770 | plug_and_out: | ||
1771 | if (!elv_queue_empty(q)) | ||
1772 | /* Some requests still remain, retry later */ | ||
1773 | blk_plug_device(q); | ||
1774 | 1648 | ||
1649 | delay_and_out: | ||
1650 | blk_delay_queue(q, HZ / 10); | ||
1775 | out: | 1651 | out: |
1776 | dm_table_put(map); | 1652 | dm_table_put(map); |
1777 | 1653 | ||
@@ -1800,20 +1676,6 @@ static int dm_lld_busy(struct request_queue *q) | |||
1800 | return r; | 1676 | return r; |
1801 | } | 1677 | } |
1802 | 1678 | ||
1803 | static void dm_unplug_all(struct request_queue *q) | ||
1804 | { | ||
1805 | struct mapped_device *md = q->queuedata; | ||
1806 | struct dm_table *map = dm_get_live_table(md); | ||
1807 | |||
1808 | if (map) { | ||
1809 | if (dm_request_based(md)) | ||
1810 | generic_unplug_device(q); | ||
1811 | |||
1812 | dm_table_unplug_all(map); | ||
1813 | dm_table_put(map); | ||
1814 | } | ||
1815 | } | ||
1816 | |||
1817 | static int dm_any_congested(void *congested_data, int bdi_bits) | 1679 | static int dm_any_congested(void *congested_data, int bdi_bits) |
1818 | { | 1680 | { |
1819 | int r = bdi_bits; | 1681 | int r = bdi_bits; |
@@ -1918,7 +1780,6 @@ out: | |||
1918 | static const struct block_device_operations dm_blk_dops; | 1780 | static const struct block_device_operations dm_blk_dops; |
1919 | 1781 | ||
1920 | static void dm_wq_work(struct work_struct *work); | 1782 | static void dm_wq_work(struct work_struct *work); |
1921 | static void dm_rq_barrier_work(struct work_struct *work); | ||
1922 | 1783 | ||
1923 | static void dm_init_md_queue(struct mapped_device *md) | 1784 | static void dm_init_md_queue(struct mapped_device *md) |
1924 | { | 1785 | { |
@@ -1938,8 +1799,8 @@ static void dm_init_md_queue(struct mapped_device *md) | |||
1938 | md->queue->backing_dev_info.congested_data = md; | 1799 | md->queue->backing_dev_info.congested_data = md; |
1939 | blk_queue_make_request(md->queue, dm_request); | 1800 | blk_queue_make_request(md->queue, dm_request); |
1940 | blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY); | 1801 | blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY); |
1941 | md->queue->unplug_fn = dm_unplug_all; | ||
1942 | blk_queue_merge_bvec(md->queue, dm_merge_bvec); | 1802 | blk_queue_merge_bvec(md->queue, dm_merge_bvec); |
1803 | blk_queue_flush(md->queue, REQ_FLUSH | REQ_FUA); | ||
1943 | } | 1804 | } |
1944 | 1805 | ||
1945 | /* | 1806 | /* |
@@ -1972,7 +1833,6 @@ static struct mapped_device *alloc_dev(int minor) | |||
1972 | mutex_init(&md->suspend_lock); | 1833 | mutex_init(&md->suspend_lock); |
1973 | mutex_init(&md->type_lock); | 1834 | mutex_init(&md->type_lock); |
1974 | spin_lock_init(&md->deferred_lock); | 1835 | spin_lock_init(&md->deferred_lock); |
1975 | spin_lock_init(&md->barrier_error_lock); | ||
1976 | rwlock_init(&md->map_lock); | 1836 | rwlock_init(&md->map_lock); |
1977 | atomic_set(&md->holders, 1); | 1837 | atomic_set(&md->holders, 1); |
1978 | atomic_set(&md->open_count, 0); | 1838 | atomic_set(&md->open_count, 0); |
@@ -1995,7 +1855,6 @@ static struct mapped_device *alloc_dev(int minor) | |||
1995 | atomic_set(&md->pending[1], 0); | 1855 | atomic_set(&md->pending[1], 0); |
1996 | init_waitqueue_head(&md->wait); | 1856 | init_waitqueue_head(&md->wait); |
1997 | INIT_WORK(&md->work, dm_wq_work); | 1857 | INIT_WORK(&md->work, dm_wq_work); |
1998 | INIT_WORK(&md->barrier_work, dm_rq_barrier_work); | ||
1999 | init_waitqueue_head(&md->eventq); | 1858 | init_waitqueue_head(&md->eventq); |
2000 | 1859 | ||
2001 | md->disk->major = _major; | 1860 | md->disk->major = _major; |
@@ -2007,7 +1866,8 @@ static struct mapped_device *alloc_dev(int minor) | |||
2007 | add_disk(md->disk); | 1866 | add_disk(md->disk); |
2008 | format_dev_t(md->name, MKDEV(_major, minor)); | 1867 | format_dev_t(md->name, MKDEV(_major, minor)); |
2009 | 1868 | ||
2010 | md->wq = create_singlethread_workqueue("kdmflush"); | 1869 | md->wq = alloc_workqueue("kdmflush", |
1870 | WQ_NON_REENTRANT | WQ_MEM_RECLAIM, 0); | ||
2011 | if (!md->wq) | 1871 | if (!md->wq) |
2012 | goto bad_thread; | 1872 | goto bad_thread; |
2013 | 1873 | ||
@@ -2015,6 +1875,10 @@ static struct mapped_device *alloc_dev(int minor) | |||
2015 | if (!md->bdev) | 1875 | if (!md->bdev) |
2016 | goto bad_bdev; | 1876 | goto bad_bdev; |
2017 | 1877 | ||
1878 | bio_init(&md->flush_bio); | ||
1879 | md->flush_bio.bi_bdev = md->bdev; | ||
1880 | md->flush_bio.bi_rw = WRITE_FLUSH; | ||
1881 | |||
2018 | /* Populate the mapping, nobody knows we exist yet */ | 1882 | /* Populate the mapping, nobody knows we exist yet */ |
2019 | spin_lock(&_minor_lock); | 1883 | spin_lock(&_minor_lock); |
2020 | old_md = idr_replace(&_minor_idr, md, minor); | 1884 | old_md = idr_replace(&_minor_idr, md, minor); |
@@ -2111,13 +1975,14 @@ static void event_callback(void *context) | |||
2111 | wake_up(&md->eventq); | 1975 | wake_up(&md->eventq); |
2112 | } | 1976 | } |
2113 | 1977 | ||
1978 | /* | ||
1979 | * Protected by md->suspend_lock obtained by dm_swap_table(). | ||
1980 | */ | ||
2114 | static void __set_size(struct mapped_device *md, sector_t size) | 1981 | static void __set_size(struct mapped_device *md, sector_t size) |
2115 | { | 1982 | { |
2116 | set_capacity(md->disk, size); | 1983 | set_capacity(md->disk, size); |
2117 | 1984 | ||
2118 | mutex_lock(&md->bdev->bd_inode->i_mutex); | ||
2119 | i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT); | 1985 | i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT); |
2120 | mutex_unlock(&md->bdev->bd_inode->i_mutex); | ||
2121 | } | 1986 | } |
2122 | 1987 | ||
2123 | /* | 1988 | /* |
@@ -2245,7 +2110,6 @@ static int dm_init_request_based_queue(struct mapped_device *md) | |||
2245 | blk_queue_softirq_done(md->queue, dm_softirq_done); | 2110 | blk_queue_softirq_done(md->queue, dm_softirq_done); |
2246 | blk_queue_prep_rq(md->queue, dm_prep_fn); | 2111 | blk_queue_prep_rq(md->queue, dm_prep_fn); |
2247 | blk_queue_lld_busy(md->queue, dm_lld_busy); | 2112 | blk_queue_lld_busy(md->queue, dm_lld_busy); |
2248 | blk_queue_ordered(md->queue, QUEUE_ORDERED_DRAIN_FLUSH); | ||
2249 | 2113 | ||
2250 | elv_register_queue(md->queue); | 2114 | elv_register_queue(md->queue); |
2251 | 2115 | ||
@@ -2380,8 +2244,6 @@ static int dm_wait_for_completion(struct mapped_device *md, int interruptible) | |||
2380 | int r = 0; | 2244 | int r = 0; |
2381 | DECLARE_WAITQUEUE(wait, current); | 2245 | DECLARE_WAITQUEUE(wait, current); |
2382 | 2246 | ||
2383 | dm_unplug_all(md->queue); | ||
2384 | |||
2385 | add_wait_queue(&md->wait, &wait); | 2247 | add_wait_queue(&md->wait, &wait); |
2386 | 2248 | ||
2387 | while (1) { | 2249 | while (1) { |
@@ -2406,43 +2268,6 @@ static int dm_wait_for_completion(struct mapped_device *md, int interruptible) | |||
2406 | return r; | 2268 | return r; |
2407 | } | 2269 | } |
2408 | 2270 | ||
2409 | static void dm_flush(struct mapped_device *md) | ||
2410 | { | ||
2411 | dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE); | ||
2412 | |||
2413 | bio_init(&md->barrier_bio); | ||
2414 | md->barrier_bio.bi_bdev = md->bdev; | ||
2415 | md->barrier_bio.bi_rw = WRITE_BARRIER; | ||
2416 | __split_and_process_bio(md, &md->barrier_bio); | ||
2417 | |||
2418 | dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE); | ||
2419 | } | ||
2420 | |||
2421 | static void process_barrier(struct mapped_device *md, struct bio *bio) | ||
2422 | { | ||
2423 | md->barrier_error = 0; | ||
2424 | |||
2425 | dm_flush(md); | ||
2426 | |||
2427 | if (!bio_empty_barrier(bio)) { | ||
2428 | __split_and_process_bio(md, bio); | ||
2429 | /* | ||
2430 | * If the request isn't supported, don't waste time with | ||
2431 | * the second flush. | ||
2432 | */ | ||
2433 | if (md->barrier_error != -EOPNOTSUPP) | ||
2434 | dm_flush(md); | ||
2435 | } | ||
2436 | |||
2437 | if (md->barrier_error != DM_ENDIO_REQUEUE) | ||
2438 | bio_endio(bio, md->barrier_error); | ||
2439 | else { | ||
2440 | spin_lock_irq(&md->deferred_lock); | ||
2441 | bio_list_add_head(&md->deferred, bio); | ||
2442 | spin_unlock_irq(&md->deferred_lock); | ||
2443 | } | ||
2444 | } | ||
2445 | |||
2446 | /* | 2271 | /* |
2447 | * Process the deferred bios | 2272 | * Process the deferred bios |
2448 | */ | 2273 | */ |
@@ -2452,33 +2277,27 @@ static void dm_wq_work(struct work_struct *work) | |||
2452 | work); | 2277 | work); |
2453 | struct bio *c; | 2278 | struct bio *c; |
2454 | 2279 | ||
2455 | down_write(&md->io_lock); | 2280 | down_read(&md->io_lock); |
2456 | 2281 | ||
2457 | while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) { | 2282 | while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) { |
2458 | spin_lock_irq(&md->deferred_lock); | 2283 | spin_lock_irq(&md->deferred_lock); |
2459 | c = bio_list_pop(&md->deferred); | 2284 | c = bio_list_pop(&md->deferred); |
2460 | spin_unlock_irq(&md->deferred_lock); | 2285 | spin_unlock_irq(&md->deferred_lock); |
2461 | 2286 | ||
2462 | if (!c) { | 2287 | if (!c) |
2463 | clear_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags); | ||
2464 | break; | 2288 | break; |
2465 | } | ||
2466 | 2289 | ||
2467 | up_write(&md->io_lock); | 2290 | up_read(&md->io_lock); |
2468 | 2291 | ||
2469 | if (dm_request_based(md)) | 2292 | if (dm_request_based(md)) |
2470 | generic_make_request(c); | 2293 | generic_make_request(c); |
2471 | else { | 2294 | else |
2472 | if (c->bi_rw & REQ_HARDBARRIER) | 2295 | __split_and_process_bio(md, c); |
2473 | process_barrier(md, c); | ||
2474 | else | ||
2475 | __split_and_process_bio(md, c); | ||
2476 | } | ||
2477 | 2296 | ||
2478 | down_write(&md->io_lock); | 2297 | down_read(&md->io_lock); |
2479 | } | 2298 | } |
2480 | 2299 | ||
2481 | up_write(&md->io_lock); | 2300 | up_read(&md->io_lock); |
2482 | } | 2301 | } |
2483 | 2302 | ||
2484 | static void dm_queue_flush(struct mapped_device *md) | 2303 | static void dm_queue_flush(struct mapped_device *md) |
@@ -2488,73 +2307,6 @@ static void dm_queue_flush(struct mapped_device *md) | |||
2488 | queue_work(md->wq, &md->work); | 2307 | queue_work(md->wq, &md->work); |
2489 | } | 2308 | } |
2490 | 2309 | ||
2491 | static void dm_rq_set_target_request_nr(struct request *clone, unsigned request_nr) | ||
2492 | { | ||
2493 | struct dm_rq_target_io *tio = clone->end_io_data; | ||
2494 | |||
2495 | tio->info.target_request_nr = request_nr; | ||
2496 | } | ||
2497 | |||
2498 | /* Issue barrier requests to targets and wait for their completion. */ | ||
2499 | static int dm_rq_barrier(struct mapped_device *md) | ||
2500 | { | ||
2501 | int i, j; | ||
2502 | struct dm_table *map = dm_get_live_table(md); | ||
2503 | unsigned num_targets = dm_table_get_num_targets(map); | ||
2504 | struct dm_target *ti; | ||
2505 | struct request *clone; | ||
2506 | |||
2507 | md->barrier_error = 0; | ||
2508 | |||
2509 | for (i = 0; i < num_targets; i++) { | ||
2510 | ti = dm_table_get_target(map, i); | ||
2511 | for (j = 0; j < ti->num_flush_requests; j++) { | ||
2512 | clone = clone_rq(md->flush_request, md, GFP_NOIO); | ||
2513 | dm_rq_set_target_request_nr(clone, j); | ||
2514 | atomic_inc(&md->pending[rq_data_dir(clone)]); | ||
2515 | map_request(ti, clone, md); | ||
2516 | } | ||
2517 | } | ||
2518 | |||
2519 | dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE); | ||
2520 | dm_table_put(map); | ||
2521 | |||
2522 | return md->barrier_error; | ||
2523 | } | ||
2524 | |||
2525 | static void dm_rq_barrier_work(struct work_struct *work) | ||
2526 | { | ||
2527 | int error; | ||
2528 | struct mapped_device *md = container_of(work, struct mapped_device, | ||
2529 | barrier_work); | ||
2530 | struct request_queue *q = md->queue; | ||
2531 | struct request *rq; | ||
2532 | unsigned long flags; | ||
2533 | |||
2534 | /* | ||
2535 | * Hold the md reference here and leave it at the last part so that | ||
2536 | * the md can't be deleted by device opener when the barrier request | ||
2537 | * completes. | ||
2538 | */ | ||
2539 | dm_get(md); | ||
2540 | |||
2541 | error = dm_rq_barrier(md); | ||
2542 | |||
2543 | rq = md->flush_request; | ||
2544 | md->flush_request = NULL; | ||
2545 | |||
2546 | if (error == DM_ENDIO_REQUEUE) { | ||
2547 | spin_lock_irqsave(q->queue_lock, flags); | ||
2548 | blk_requeue_request(q, rq); | ||
2549 | spin_unlock_irqrestore(q->queue_lock, flags); | ||
2550 | } else | ||
2551 | blk_end_request_all(rq, error); | ||
2552 | |||
2553 | blk_run_queue(q); | ||
2554 | |||
2555 | dm_put(md); | ||
2556 | } | ||
2557 | |||
2558 | /* | 2310 | /* |
2559 | * Swap in a new table, returning the old one for the caller to destroy. | 2311 | * Swap in a new table, returning the old one for the caller to destroy. |
2560 | */ | 2312 | */ |
@@ -2677,23 +2429,17 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) | |||
2677 | * | 2429 | * |
2678 | * To get all processes out of __split_and_process_bio in dm_request, | 2430 | * To get all processes out of __split_and_process_bio in dm_request, |
2679 | * we take the write lock. To prevent any process from reentering | 2431 | * we take the write lock. To prevent any process from reentering |
2680 | * __split_and_process_bio from dm_request, we set | 2432 | * __split_and_process_bio from dm_request and quiesce the thread |
2681 | * DMF_QUEUE_IO_TO_THREAD. | 2433 | * (dm_wq_work), we set DMF_BLOCK_IO_FOR_SUSPEND and call |
2682 | * | 2434 | * flush_workqueue(md->wq). |
2683 | * To quiesce the thread (dm_wq_work), we set DMF_BLOCK_IO_FOR_SUSPEND | ||
2684 | * and call flush_workqueue(md->wq). flush_workqueue will wait until | ||
2685 | * dm_wq_work exits and DMF_BLOCK_IO_FOR_SUSPEND will prevent any | ||
2686 | * further calls to __split_and_process_bio from dm_wq_work. | ||
2687 | */ | 2435 | */ |
2688 | down_write(&md->io_lock); | 2436 | down_write(&md->io_lock); |
2689 | set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags); | 2437 | set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags); |
2690 | set_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags); | ||
2691 | up_write(&md->io_lock); | 2438 | up_write(&md->io_lock); |
2692 | 2439 | ||
2693 | /* | 2440 | /* |
2694 | * Request-based dm uses md->wq for barrier (dm_rq_barrier_work) which | 2441 | * Stop md->queue before flushing md->wq in case request-based |
2695 | * can be kicked until md->queue is stopped. So stop md->queue before | 2442 | * dm defers requests to md->wq from md->queue. |
2696 | * flushing md->wq. | ||
2697 | */ | 2443 | */ |
2698 | if (dm_request_based(md)) | 2444 | if (dm_request_based(md)) |
2699 | stop_queue(md->queue); | 2445 | stop_queue(md->queue); |
@@ -2772,7 +2518,6 @@ int dm_resume(struct mapped_device *md) | |||
2772 | 2518 | ||
2773 | clear_bit(DMF_SUSPENDED, &md->flags); | 2519 | clear_bit(DMF_SUSPENDED, &md->flags); |
2774 | 2520 | ||
2775 | dm_table_unplug_all(map); | ||
2776 | r = 0; | 2521 | r = 0; |
2777 | out: | 2522 | out: |
2778 | dm_table_put(map); | 2523 | dm_table_put(map); |
@@ -2876,9 +2621,10 @@ int dm_noflush_suspending(struct dm_target *ti) | |||
2876 | } | 2621 | } |
2877 | EXPORT_SYMBOL_GPL(dm_noflush_suspending); | 2622 | EXPORT_SYMBOL_GPL(dm_noflush_suspending); |
2878 | 2623 | ||
2879 | struct dm_md_mempools *dm_alloc_md_mempools(unsigned type) | 2624 | struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity) |
2880 | { | 2625 | { |
2881 | struct dm_md_mempools *pools = kmalloc(sizeof(*pools), GFP_KERNEL); | 2626 | struct dm_md_mempools *pools = kmalloc(sizeof(*pools), GFP_KERNEL); |
2627 | unsigned int pool_size = (type == DM_TYPE_BIO_BASED) ? 16 : MIN_IOS; | ||
2882 | 2628 | ||
2883 | if (!pools) | 2629 | if (!pools) |
2884 | return NULL; | 2630 | return NULL; |
@@ -2895,13 +2641,18 @@ struct dm_md_mempools *dm_alloc_md_mempools(unsigned type) | |||
2895 | if (!pools->tio_pool) | 2641 | if (!pools->tio_pool) |
2896 | goto free_io_pool_and_out; | 2642 | goto free_io_pool_and_out; |
2897 | 2643 | ||
2898 | pools->bs = (type == DM_TYPE_BIO_BASED) ? | 2644 | pools->bs = bioset_create(pool_size, 0); |
2899 | bioset_create(16, 0) : bioset_create(MIN_IOS, 0); | ||
2900 | if (!pools->bs) | 2645 | if (!pools->bs) |
2901 | goto free_tio_pool_and_out; | 2646 | goto free_tio_pool_and_out; |
2902 | 2647 | ||
2648 | if (integrity && bioset_integrity_create(pools->bs, pool_size)) | ||
2649 | goto free_bioset_and_out; | ||
2650 | |||
2903 | return pools; | 2651 | return pools; |
2904 | 2652 | ||
2653 | free_bioset_and_out: | ||
2654 | bioset_free(pools->bs); | ||
2655 | |||
2905 | free_tio_pool_and_out: | 2656 | free_tio_pool_and_out: |
2906 | mempool_destroy(pools->tio_pool); | 2657 | mempool_destroy(pools->tio_pool); |
2907 | 2658 | ||
diff --git a/drivers/md/dm.h b/drivers/md/dm.h index 0c2dd5f4af76..1aaf16746da8 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h | |||
@@ -149,7 +149,7 @@ void dm_kcopyd_exit(void); | |||
149 | /* | 149 | /* |
150 | * Mempool operations | 150 | * Mempool operations |
151 | */ | 151 | */ |
152 | struct dm_md_mempools *dm_alloc_md_mempools(unsigned type); | 152 | struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity); |
153 | void dm_free_md_mempools(struct dm_md_mempools *pools); | 153 | void dm_free_md_mempools(struct dm_md_mempools *pools); |
154 | 154 | ||
155 | #endif | 155 | #endif |
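The new 'integrity' argument sizes an integrity payload pool alongside the bioset (see the bioset_integrity_create() call in the dm.c hunk above). The caller is not part of this excerpt, but presumably the table-load path now passes the flag set by dm_table_prealloc_integrity(), roughly:

	/* sketch of the expected caller, once the table type is known */
	t->mempools = dm_alloc_md_mempools(dm_table_get_type(t),
					   t->integrity_supported);
	if (!t->mempools)
		return -ENOMEM;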
diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c index 1a8987884614..23078dabb6df 100644 --- a/drivers/md/faulty.c +++ b/drivers/md/faulty.c | |||
@@ -30,7 +30,7 @@ | |||
30 | * | 30 | * |
31 | * Different modes can be active at a time, but only | 31 | * Different modes can be active at a time, but only |
32 | * one can be set at array creation. Others can be added later. | 32 | * one can be set at array creation. Others can be added later. |
33 | * A mode can be one-shot or recurrent with the recurrance being | 33 | * A mode can be one-shot or recurrent with the recurrence being |
34 | * once in every N requests. | 34 | * once in every N requests. |
35 | * The bottom 5 bits of the "layout" indicate the mode. The | 35 | * The bottom 5 bits of the "layout" indicate the mode. The |
36 | * remainder indicate a period, or 0 for one-shot. | 36 | * remainder indicate a period, or 0 for one-shot. |
@@ -210,7 +210,7 @@ static int make_request(mddev_t *mddev, struct bio *bio) | |||
210 | } | 210 | } |
211 | } | 211 | } |
212 | if (failit) { | 212 | if (failit) { |
213 | struct bio *b = bio_clone(bio, GFP_NOIO); | 213 | struct bio *b = bio_clone_mddev(bio, GFP_NOIO, mddev); |
214 | b->bi_bdev = conf->rdev->bdev; | 214 | b->bi_bdev = conf->rdev->bdev; |
215 | b->bi_private = bio; | 215 | b->bi_private = bio; |
216 | b->bi_end_io = faulty_fail; | 216 | b->bi_end_io = faulty_fail; |
diff --git a/drivers/md/linear.c b/drivers/md/linear.c index ba19060bcf3f..abfb59a61ede 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c | |||
@@ -87,22 +87,6 @@ static int linear_mergeable_bvec(struct request_queue *q, | |||
87 | return maxsectors << 9; | 87 | return maxsectors << 9; |
88 | } | 88 | } |
89 | 89 | ||
90 | static void linear_unplug(struct request_queue *q) | ||
91 | { | ||
92 | mddev_t *mddev = q->queuedata; | ||
93 | linear_conf_t *conf; | ||
94 | int i; | ||
95 | |||
96 | rcu_read_lock(); | ||
97 | conf = rcu_dereference(mddev->private); | ||
98 | |||
99 | for (i=0; i < mddev->raid_disks; i++) { | ||
100 | struct request_queue *r_queue = bdev_get_queue(conf->disks[i].rdev->bdev); | ||
101 | blk_unplug(r_queue); | ||
102 | } | ||
103 | rcu_read_unlock(); | ||
104 | } | ||
105 | |||
106 | static int linear_congested(void *data, int bits) | 90 | static int linear_congested(void *data, int bits) |
107 | { | 91 | { |
108 | mddev_t *mddev = data; | 92 | mddev_t *mddev = data; |
@@ -216,7 +200,6 @@ static int linear_run (mddev_t *mddev) | |||
216 | 200 | ||
217 | if (md_check_no_bitmap(mddev)) | 201 | if (md_check_no_bitmap(mddev)) |
218 | return -EINVAL; | 202 | return -EINVAL; |
219 | mddev->queue->queue_lock = &mddev->queue->__queue_lock; | ||
220 | conf = linear_conf(mddev, mddev->raid_disks); | 203 | conf = linear_conf(mddev, mddev->raid_disks); |
221 | 204 | ||
222 | if (!conf) | 205 | if (!conf) |
@@ -225,11 +208,9 @@ static int linear_run (mddev_t *mddev) | |||
225 | md_set_array_sectors(mddev, linear_size(mddev, 0, 0)); | 208 | md_set_array_sectors(mddev, linear_size(mddev, 0, 0)); |
226 | 209 | ||
227 | blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec); | 210 | blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec); |
228 | mddev->queue->unplug_fn = linear_unplug; | ||
229 | mddev->queue->backing_dev_info.congested_fn = linear_congested; | 211 | mddev->queue->backing_dev_info.congested_fn = linear_congested; |
230 | mddev->queue->backing_dev_info.congested_data = mddev; | 212 | mddev->queue->backing_dev_info.congested_data = mddev; |
231 | md_integrity_register(mddev); | 213 | return md_integrity_register(mddev); |
232 | return 0; | ||
233 | } | 214 | } |
234 | 215 | ||
235 | static void free_conf(struct rcu_head *head) | 216 | static void free_conf(struct rcu_head *head) |
@@ -294,8 +275,8 @@ static int linear_make_request (mddev_t *mddev, struct bio *bio) | |||
294 | dev_info_t *tmp_dev; | 275 | dev_info_t *tmp_dev; |
295 | sector_t start_sector; | 276 | sector_t start_sector; |
296 | 277 | ||
297 | if (unlikely(bio->bi_rw & REQ_HARDBARRIER)) { | 278 | if (unlikely(bio->bi_rw & REQ_FLUSH)) { |
298 | md_barrier_request(mddev, bio); | 279 | md_flush_request(mddev, bio); |
299 | return 0; | 280 | return 0; |
300 | } | 281 | } |
301 | 282 | ||
diff --git a/drivers/md/md.c b/drivers/md/md.c index f20d13e717d5..91e31e260b4a 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c | |||
@@ -36,7 +36,7 @@ | |||
36 | #include <linux/blkdev.h> | 36 | #include <linux/blkdev.h> |
37 | #include <linux/sysctl.h> | 37 | #include <linux/sysctl.h> |
38 | #include <linux/seq_file.h> | 38 | #include <linux/seq_file.h> |
39 | #include <linux/smp_lock.h> | 39 | #include <linux/mutex.h> |
40 | #include <linux/buffer_head.h> /* for invalidate_bdev */ | 40 | #include <linux/buffer_head.h> /* for invalidate_bdev */ |
41 | #include <linux/poll.h> | 41 | #include <linux/poll.h> |
42 | #include <linux/ctype.h> | 42 | #include <linux/ctype.h> |
@@ -57,7 +57,6 @@ | |||
57 | #define DEBUG 0 | 57 | #define DEBUG 0 |
58 | #define dprintk(x...) ((void)(DEBUG && printk(x))) | 58 | #define dprintk(x...) ((void)(DEBUG && printk(x))) |
59 | 59 | ||
60 | |||
61 | #ifndef MODULE | 60 | #ifndef MODULE |
62 | static void autostart_arrays(int part); | 61 | static void autostart_arrays(int part); |
63 | #endif | 62 | #endif |
@@ -68,6 +67,8 @@ static DEFINE_SPINLOCK(pers_lock); | |||
68 | static void md_print_devices(void); | 67 | static void md_print_devices(void); |
69 | 68 | ||
70 | static DECLARE_WAIT_QUEUE_HEAD(resync_wait); | 69 | static DECLARE_WAIT_QUEUE_HEAD(resync_wait); |
70 | static struct workqueue_struct *md_wq; | ||
71 | static struct workqueue_struct *md_misc_wq; | ||
71 | 72 | ||
72 | #define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); } | 73 | #define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); } |
73 | 74 | ||
@@ -148,6 +149,72 @@ static const struct block_device_operations md_fops; | |||
148 | 149 | ||
149 | static int start_readonly; | 150 | static int start_readonly; |
150 | 151 | ||
152 | /* bio_clone_mddev | ||
153 | * like bio_clone, but with a local bio set | ||
154 | */ | ||
155 | |||
156 | static void mddev_bio_destructor(struct bio *bio) | ||
157 | { | ||
158 | mddev_t *mddev, **mddevp; | ||
159 | |||
160 | mddevp = (void*)bio; | ||
161 | mddev = mddevp[-1]; | ||
162 | |||
163 | bio_free(bio, mddev->bio_set); | ||
164 | } | ||
165 | |||
166 | struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, | ||
167 | mddev_t *mddev) | ||
168 | { | ||
169 | struct bio *b; | ||
170 | mddev_t **mddevp; | ||
171 | |||
172 | if (!mddev || !mddev->bio_set) | ||
173 | return bio_alloc(gfp_mask, nr_iovecs); | ||
174 | |||
175 | b = bio_alloc_bioset(gfp_mask, nr_iovecs, | ||
176 | mddev->bio_set); | ||
177 | if (!b) | ||
178 | return NULL; | ||
179 | mddevp = (void*)b; | ||
180 | mddevp[-1] = mddev; | ||
181 | b->bi_destructor = mddev_bio_destructor; | ||
182 | return b; | ||
183 | } | ||
184 | EXPORT_SYMBOL_GPL(bio_alloc_mddev); | ||
185 | |||
186 | struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask, | ||
187 | mddev_t *mddev) | ||
188 | { | ||
189 | struct bio *b; | ||
190 | mddev_t **mddevp; | ||
191 | |||
192 | if (!mddev || !mddev->bio_set) | ||
193 | return bio_clone(bio, gfp_mask); | ||
194 | |||
195 | b = bio_alloc_bioset(gfp_mask, bio->bi_max_vecs, | ||
196 | mddev->bio_set); | ||
197 | if (!b) | ||
198 | return NULL; | ||
199 | mddevp = (void*)b; | ||
200 | mddevp[-1] = mddev; | ||
201 | b->bi_destructor = mddev_bio_destructor; | ||
202 | __bio_clone(b, bio); | ||
203 | if (bio_integrity(bio)) { | ||
204 | int ret; | ||
205 | |||
206 | ret = bio_integrity_clone(b, bio, gfp_mask, mddev->bio_set); | ||
207 | |||
208 | if (ret < 0) { | ||
209 | bio_put(b); | ||
210 | return NULL; | ||
211 | } | ||
212 | } | ||
213 | |||
214 | return b; | ||
215 | } | ||
216 | EXPORT_SYMBOL_GPL(bio_clone_mddev); | ||
217 | |||
151 | /* | 218 | /* |
152 | * We have a system wide 'event count' that is incremented | 219 | * We have a system wide 'event count' that is incremented |
153 | * on any 'interesting' event, and readers of /proc/mdstat | 220 | * on any 'interesting' event, and readers of /proc/mdstat |
@@ -220,18 +287,21 @@ static int md_make_request(struct request_queue *q, struct bio *bio) | |||
220 | mddev_t *mddev = q->queuedata; | 287 | mddev_t *mddev = q->queuedata; |
221 | int rv; | 288 | int rv; |
222 | int cpu; | 289 | int cpu; |
290 | unsigned int sectors; | ||
223 | 291 | ||
224 | if (mddev == NULL || mddev->pers == NULL) { | 292 | if (mddev == NULL || mddev->pers == NULL |
293 | || !mddev->ready) { | ||
225 | bio_io_error(bio); | 294 | bio_io_error(bio); |
226 | return 0; | 295 | return 0; |
227 | } | 296 | } |
297 | smp_rmb(); /* Ensure implications of 'active' are visible */ | ||
228 | rcu_read_lock(); | 298 | rcu_read_lock(); |
229 | if (mddev->suspended || mddev->barrier) { | 299 | if (mddev->suspended) { |
230 | DEFINE_WAIT(__wait); | 300 | DEFINE_WAIT(__wait); |
231 | for (;;) { | 301 | for (;;) { |
232 | prepare_to_wait(&mddev->sb_wait, &__wait, | 302 | prepare_to_wait(&mddev->sb_wait, &__wait, |
233 | TASK_UNINTERRUPTIBLE); | 303 | TASK_UNINTERRUPTIBLE); |
234 | if (!mddev->suspended && !mddev->barrier) | 304 | if (!mddev->suspended) |
235 | break; | 305 | break; |
236 | rcu_read_unlock(); | 306 | rcu_read_unlock(); |
237 | schedule(); | 307 | schedule(); |
@@ -242,12 +312,16 @@ static int md_make_request(struct request_queue *q, struct bio *bio) | |||
242 | atomic_inc(&mddev->active_io); | 312 | atomic_inc(&mddev->active_io); |
243 | rcu_read_unlock(); | 313 | rcu_read_unlock(); |
244 | 314 | ||
315 | /* | ||
316 | * save the sectors now since our bio can | ||
317 | * go away inside make_request | ||
318 | */ | ||
319 | sectors = bio_sectors(bio); | ||
245 | rv = mddev->pers->make_request(mddev, bio); | 320 | rv = mddev->pers->make_request(mddev, bio); |
246 | 321 | ||
247 | cpu = part_stat_lock(); | 322 | cpu = part_stat_lock(); |
248 | part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]); | 323 | part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]); |
249 | part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], | 324 | part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], sectors); |
250 | bio_sectors(bio)); | ||
251 | part_stat_unlock(); | 325 | part_stat_unlock(); |
252 | 326 | ||
253 | if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended) | 327 | if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended) |
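A hedged fragment restating the ordering the new comment describes: once the bio has been handed to the personality it may complete and be freed, so the I/O accounting must use the value captured beforehand.

        unsigned int sectors = bio_sectors(bio);        /* bio is still ours here */

        mddev->pers->make_request(mddev, bio);          /* bio may complete and be freed */

        /* account with the saved value; re-reading bio_sectors(bio) at this
         * point could be a use-after-free */
        part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], sectors);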
@@ -277,48 +351,45 @@ void mddev_resume(mddev_t *mddev) | |||
277 | mddev->suspended = 0; | 351 | mddev->suspended = 0; |
278 | wake_up(&mddev->sb_wait); | 352 | wake_up(&mddev->sb_wait); |
279 | mddev->pers->quiesce(mddev, 0); | 353 | mddev->pers->quiesce(mddev, 0); |
354 | |||
355 | md_wakeup_thread(mddev->thread); | ||
356 | md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ | ||
280 | } | 357 | } |
281 | EXPORT_SYMBOL_GPL(mddev_resume); | 358 | EXPORT_SYMBOL_GPL(mddev_resume); |
282 | 359 | ||
283 | int mddev_congested(mddev_t *mddev, int bits) | 360 | int mddev_congested(mddev_t *mddev, int bits) |
284 | { | 361 | { |
285 | if (mddev->barrier) | ||
286 | return 1; | ||
287 | return mddev->suspended; | 362 | return mddev->suspended; |
288 | } | 363 | } |
289 | EXPORT_SYMBOL(mddev_congested); | 364 | EXPORT_SYMBOL(mddev_congested); |
290 | 365 | ||
291 | /* | 366 | /* |
292 | * Generic barrier handling for md | 367 | * Generic flush handling for md |
293 | */ | 368 | */ |
294 | 369 | ||
295 | #define POST_REQUEST_BARRIER ((void*)1) | 370 | static void md_end_flush(struct bio *bio, int err) |
296 | |||
297 | static void md_end_barrier(struct bio *bio, int err) | ||
298 | { | 371 | { |
299 | mdk_rdev_t *rdev = bio->bi_private; | 372 | mdk_rdev_t *rdev = bio->bi_private; |
300 | mddev_t *mddev = rdev->mddev; | 373 | mddev_t *mddev = rdev->mddev; |
301 | if (err == -EOPNOTSUPP && mddev->barrier != POST_REQUEST_BARRIER) | ||
302 | set_bit(BIO_EOPNOTSUPP, &mddev->barrier->bi_flags); | ||
303 | 374 | ||
304 | rdev_dec_pending(rdev, mddev); | 375 | rdev_dec_pending(rdev, mddev); |
305 | 376 | ||
306 | if (atomic_dec_and_test(&mddev->flush_pending)) { | 377 | if (atomic_dec_and_test(&mddev->flush_pending)) { |
307 | if (mddev->barrier == POST_REQUEST_BARRIER) { | 378 | /* The pre-request flush has finished */ |
308 | /* This was a post-request barrier */ | 379 | queue_work(md_wq, &mddev->flush_work); |
309 | mddev->barrier = NULL; | ||
310 | wake_up(&mddev->sb_wait); | ||
311 | } else | ||
312 | /* The pre-request barrier has finished */ | ||
313 | schedule_work(&mddev->barrier_work); | ||
314 | } | 380 | } |
315 | bio_put(bio); | 381 | bio_put(bio); |
316 | } | 382 | } |
317 | 383 | ||
318 | static void submit_barriers(mddev_t *mddev) | 384 | static void md_submit_flush_data(struct work_struct *ws); |
385 | |||
386 | static void submit_flushes(struct work_struct *ws) | ||
319 | { | 387 | { |
388 | mddev_t *mddev = container_of(ws, mddev_t, flush_work); | ||
320 | mdk_rdev_t *rdev; | 389 | mdk_rdev_t *rdev; |
321 | 390 | ||
391 | INIT_WORK(&mddev->flush_work, md_submit_flush_data); | ||
392 | atomic_set(&mddev->flush_pending, 1); | ||
322 | rcu_read_lock(); | 393 | rcu_read_lock(); |
323 | list_for_each_entry_rcu(rdev, &mddev->disks, same_set) | 394 | list_for_each_entry_rcu(rdev, &mddev->disks, same_set) |
324 | if (rdev->raid_disk >= 0 && | 395 | if (rdev->raid_disk >= 0 && |
@@ -331,106 +402,107 @@ static void submit_barriers(mddev_t *mddev) | |||
331 | atomic_inc(&rdev->nr_pending); | 402 | atomic_inc(&rdev->nr_pending); |
332 | atomic_inc(&rdev->nr_pending); | 403 | atomic_inc(&rdev->nr_pending); |
333 | rcu_read_unlock(); | 404 | rcu_read_unlock(); |
334 | bi = bio_alloc(GFP_KERNEL, 0); | 405 | bi = bio_alloc_mddev(GFP_KERNEL, 0, mddev); |
335 | bi->bi_end_io = md_end_barrier; | 406 | bi->bi_end_io = md_end_flush; |
336 | bi->bi_private = rdev; | 407 | bi->bi_private = rdev; |
337 | bi->bi_bdev = rdev->bdev; | 408 | bi->bi_bdev = rdev->bdev; |
338 | atomic_inc(&mddev->flush_pending); | 409 | atomic_inc(&mddev->flush_pending); |
339 | submit_bio(WRITE_BARRIER, bi); | 410 | submit_bio(WRITE_FLUSH, bi); |
340 | rcu_read_lock(); | 411 | rcu_read_lock(); |
341 | rdev_dec_pending(rdev, mddev); | 412 | rdev_dec_pending(rdev, mddev); |
342 | } | 413 | } |
343 | rcu_read_unlock(); | 414 | rcu_read_unlock(); |
415 | if (atomic_dec_and_test(&mddev->flush_pending)) | ||
416 | queue_work(md_wq, &mddev->flush_work); | ||
344 | } | 417 | } |
345 | 418 | ||
346 | static void md_submit_barrier(struct work_struct *ws) | 419 | static void md_submit_flush_data(struct work_struct *ws) |
347 | { | 420 | { |
348 | mddev_t *mddev = container_of(ws, mddev_t, barrier_work); | 421 | mddev_t *mddev = container_of(ws, mddev_t, flush_work); |
349 | struct bio *bio = mddev->barrier; | 422 | struct bio *bio = mddev->flush_bio; |
350 | |||
351 | atomic_set(&mddev->flush_pending, 1); | ||
352 | 423 | ||
353 | if (test_bit(BIO_EOPNOTSUPP, &bio->bi_flags)) | 424 | if (bio->bi_size == 0) |
354 | bio_endio(bio, -EOPNOTSUPP); | ||
355 | else if (bio->bi_size == 0) | ||
356 | /* an empty barrier - all done */ | 425 | /* an empty barrier - all done */ |
357 | bio_endio(bio, 0); | 426 | bio_endio(bio, 0); |
358 | else { | 427 | else { |
359 | bio->bi_rw &= ~REQ_HARDBARRIER; | 428 | bio->bi_rw &= ~REQ_FLUSH; |
360 | if (mddev->pers->make_request(mddev, bio)) | 429 | if (mddev->pers->make_request(mddev, bio)) |
361 | generic_make_request(bio); | 430 | generic_make_request(bio); |
362 | mddev->barrier = POST_REQUEST_BARRIER; | ||
363 | submit_barriers(mddev); | ||
364 | } | ||
365 | if (atomic_dec_and_test(&mddev->flush_pending)) { | ||
366 | mddev->barrier = NULL; | ||
367 | wake_up(&mddev->sb_wait); | ||
368 | } | 431 | } |
432 | |||
433 | mddev->flush_bio = NULL; | ||
434 | wake_up(&mddev->sb_wait); | ||
369 | } | 435 | } |
370 | 436 | ||
371 | void md_barrier_request(mddev_t *mddev, struct bio *bio) | 437 | void md_flush_request(mddev_t *mddev, struct bio *bio) |
372 | { | 438 | { |
373 | spin_lock_irq(&mddev->write_lock); | 439 | spin_lock_irq(&mddev->write_lock); |
374 | wait_event_lock_irq(mddev->sb_wait, | 440 | wait_event_lock_irq(mddev->sb_wait, |
375 | !mddev->barrier, | 441 | !mddev->flush_bio, |
376 | mddev->write_lock, /*nothing*/); | 442 | mddev->write_lock, /*nothing*/); |
377 | mddev->barrier = bio; | 443 | mddev->flush_bio = bio; |
378 | spin_unlock_irq(&mddev->write_lock); | 444 | spin_unlock_irq(&mddev->write_lock); |
379 | 445 | ||
380 | atomic_set(&mddev->flush_pending, 1); | 446 | INIT_WORK(&mddev->flush_work, submit_flushes); |
381 | INIT_WORK(&mddev->barrier_work, md_submit_barrier); | 447 | queue_work(md_wq, &mddev->flush_work); |
382 | |||
383 | submit_barriers(mddev); | ||
384 | |||
385 | if (atomic_dec_and_test(&mddev->flush_pending)) | ||
386 | schedule_work(&mddev->barrier_work); | ||
387 | } | 448 | } |
388 | EXPORT_SYMBOL(md_barrier_request); | 449 | EXPORT_SYMBOL(md_flush_request); |
389 | 450 | ||
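A hypothetical personality hook showing the intended caller side of md_flush_request(); it mirrors the linear_make_request() change earlier in this diff. example_make_request is an illustrative name, not part of the patch.

static int example_make_request(mddev_t *mddev, struct bio *bio)
{
        if (unlikely(bio->bi_rw & REQ_FLUSH)) {
                /* md core sends an empty WRITE_FLUSH to every active rdev,
                 * then re-submits this bio with REQ_FLUSH cleared */
                md_flush_request(mddev, bio);
                return 0;
        }

        /* ... normal mapping of the bio onto member devices ... */
        return 0;
}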
390 | /* Support for plugging. | 451 | /* Support for plugging. |
391 | * This mirrors the plugging support in request_queue, but does not | 452 | * This mirrors the plugging support in request_queue, but does not |
392 | * require having a whole queue | 453 | * require having a whole queue or request structures. |
454 | * We allocate an md_plug_cb for each md device and each thread it gets | ||
455 | plugged on. This links to the private plug_handle structure in the | ||
456 | * personality data where we keep a count of the number of outstanding | ||
457 | * plugs so other code can see if a plug is active. | ||
393 | */ | 458 | */ |
394 | static void plugger_work(struct work_struct *work) | 459 | struct md_plug_cb { |
395 | { | 460 | struct blk_plug_cb cb; |
396 | struct plug_handle *plug = | 461 | mddev_t *mddev; |
397 | container_of(work, struct plug_handle, unplug_work); | 462 | }; |
398 | plug->unplug_fn(plug); | ||
399 | } | ||
400 | static void plugger_timeout(unsigned long data) | ||
401 | { | ||
402 | struct plug_handle *plug = (void *)data; | ||
403 | kblockd_schedule_work(NULL, &plug->unplug_work); | ||
404 | } | ||
405 | void plugger_init(struct plug_handle *plug, | ||
406 | void (*unplug_fn)(struct plug_handle *)) | ||
407 | { | ||
408 | plug->unplug_flag = 0; | ||
409 | plug->unplug_fn = unplug_fn; | ||
410 | init_timer(&plug->unplug_timer); | ||
411 | plug->unplug_timer.function = plugger_timeout; | ||
412 | plug->unplug_timer.data = (unsigned long)plug; | ||
413 | INIT_WORK(&plug->unplug_work, plugger_work); | ||
414 | } | ||
415 | EXPORT_SYMBOL_GPL(plugger_init); | ||
416 | 463 | ||
417 | void plugger_set_plug(struct plug_handle *plug) | 464 | static void plugger_unplug(struct blk_plug_cb *cb) |
418 | { | 465 | { |
419 | if (!test_and_set_bit(PLUGGED_FLAG, &plug->unplug_flag)) | 466 | struct md_plug_cb *mdcb = container_of(cb, struct md_plug_cb, cb); |
420 | mod_timer(&plug->unplug_timer, jiffies + msecs_to_jiffies(3)+1); | 467 | if (atomic_dec_and_test(&mdcb->mddev->plug_cnt)) |
468 | md_wakeup_thread(mdcb->mddev->thread); | ||
469 | kfree(mdcb); | ||
421 | } | 470 | } |
422 | EXPORT_SYMBOL_GPL(plugger_set_plug); | ||
423 | 471 | ||
424 | int plugger_remove_plug(struct plug_handle *plug) | 472 | /* Check that an unplug wakeup will come shortly. |
473 | * If not, wakeup the md thread immediately | ||
474 | */ | ||
475 | int mddev_check_plugged(mddev_t *mddev) | ||
425 | { | 476 | { |
426 | if (test_and_clear_bit(PLUGGED_FLAG, &plug->unplug_flag)) { | 477 | struct blk_plug *plug = current->plug; |
427 | del_timer(&plug->unplug_timer); | 478 | struct md_plug_cb *mdcb; |
428 | return 1; | 479 | |
429 | } else | 480 | if (!plug) |
430 | return 0; | 481 | return 0; |
431 | } | ||
432 | EXPORT_SYMBOL_GPL(plugger_remove_plug); | ||
433 | 482 | ||
483 | list_for_each_entry(mdcb, &plug->cb_list, cb.list) { | ||
484 | if (mdcb->cb.callback == plugger_unplug && | ||
485 | mdcb->mddev == mddev) { | ||
486 | /* Already on the list, move to top */ | ||
487 | if (mdcb != list_first_entry(&plug->cb_list, | ||
488 | struct md_plug_cb, | ||
489 | cb.list)) | ||
490 | list_move(&mdcb->cb.list, &plug->cb_list); | ||
491 | return 1; | ||
492 | } | ||
493 | } | ||
494 | /* Not currently on the callback list */ | ||
495 | mdcb = kmalloc(sizeof(*mdcb), GFP_ATOMIC); | ||
496 | if (!mdcb) | ||
497 | return 0; | ||
498 | |||
499 | mdcb->mddev = mddev; | ||
500 | mdcb->cb.callback = plugger_unplug; | ||
501 | atomic_inc(&mddev->plug_cnt); | ||
502 | list_add(&mdcb->cb.list, &plug->cb_list); | ||
503 | return 1; | ||
504 | } | ||
505 | EXPORT_SYMBOL_GPL(mddev_check_plugged); | ||
434 | 506 | ||
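A sketch of how a personality's write path is expected to use the new helper (example_queue_write is hypothetical; the raid personalities follow roughly this pattern): if the submitting task holds a blk_plug, defer the wakeup to plugger_unplug(), otherwise kick the md thread at once.

static void example_queue_write(mddev_t *mddev, struct bio *bio)
{
        int plugged = mddev_check_plugged(mddev);

        /* ... add the bio to the personality's pending list ... */

        if (!plugged)
                md_wakeup_thread(mddev->thread);
        /* otherwise plugger_unplug() wakes the thread once plug_cnt drops to 0 */
}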
435 | static inline mddev_t *mddev_get(mddev_t *mddev) | 507 | static inline mddev_t *mddev_get(mddev_t *mddev) |
436 | { | 508 | { |
@@ -442,6 +514,8 @@ static void mddev_delayed_delete(struct work_struct *ws); | |||
442 | 514 | ||
443 | static void mddev_put(mddev_t *mddev) | 515 | static void mddev_put(mddev_t *mddev) |
444 | { | 516 | { |
517 | struct bio_set *bs = NULL; | ||
518 | |||
445 | if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock)) | 519 | if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock)) |
446 | return; | 520 | return; |
447 | if (!mddev->raid_disks && list_empty(&mddev->disks) && | 521 | if (!mddev->raid_disks && list_empty(&mddev->disks) && |
@@ -449,19 +523,22 @@ static void mddev_put(mddev_t *mddev) | |||
449 | /* Array is not configured at all, and not held active, | 523 | /* Array is not configured at all, and not held active, |
450 | * so destroy it */ | 524 | * so destroy it */ |
451 | list_del(&mddev->all_mddevs); | 525 | list_del(&mddev->all_mddevs); |
526 | bs = mddev->bio_set; | ||
527 | mddev->bio_set = NULL; | ||
452 | if (mddev->gendisk) { | 528 | if (mddev->gendisk) { |
453 | /* we did a probe so need to clean up. | 529 | /* We did a probe so need to clean up. Call |
454 | * Call schedule_work inside the spinlock | 530 | * queue_work inside the spinlock so that |
455 | * so that flush_scheduled_work() after | 531 | * flush_workqueue() after mddev_find will |
456 | * mddev_find will succeed in waiting for the | 532 | * succeed in waiting for the work to be done. |
457 | * work to be done. | ||
458 | */ | 533 | */ |
459 | INIT_WORK(&mddev->del_work, mddev_delayed_delete); | 534 | INIT_WORK(&mddev->del_work, mddev_delayed_delete); |
460 | schedule_work(&mddev->del_work); | 535 | queue_work(md_misc_wq, &mddev->del_work); |
461 | } else | 536 | } else |
462 | kfree(mddev); | 537 | kfree(mddev); |
463 | } | 538 | } |
464 | spin_unlock(&all_mddevs_lock); | 539 | spin_unlock(&all_mddevs_lock); |
540 | if (bs) | ||
541 | bioset_free(bs); | ||
465 | } | 542 | } |
466 | 543 | ||
467 | void mddev_init(mddev_t *mddev) | 544 | void mddev_init(mddev_t *mddev) |
@@ -475,6 +552,7 @@ void mddev_init(mddev_t *mddev) | |||
475 | atomic_set(&mddev->active, 1); | 552 | atomic_set(&mddev->active, 1); |
476 | atomic_set(&mddev->openers, 0); | 553 | atomic_set(&mddev->openers, 0); |
477 | atomic_set(&mddev->active_io, 0); | 554 | atomic_set(&mddev->active_io, 0); |
555 | atomic_set(&mddev->plug_cnt, 0); | ||
478 | spin_lock_init(&mddev->write_lock); | 556 | spin_lock_init(&mddev->write_lock); |
479 | atomic_set(&mddev->flush_pending, 0); | 557 | atomic_set(&mddev->flush_pending, 0); |
480 | init_waitqueue_head(&mddev->sb_wait); | 558 | init_waitqueue_head(&mddev->sb_wait); |
@@ -490,6 +568,9 @@ static mddev_t * mddev_find(dev_t unit) | |||
490 | { | 568 | { |
491 | mddev_t *mddev, *new = NULL; | 569 | mddev_t *mddev, *new = NULL; |
492 | 570 | ||
571 | if (unit && MAJOR(unit) != MD_MAJOR) | ||
572 | unit &= ~((1<<MdpMinorShift)-1); | ||
573 | |||
493 | retry: | 574 | retry: |
494 | spin_lock(&all_mddevs_lock); | 575 | spin_lock(&all_mddevs_lock); |
495 | 576 | ||
@@ -647,9 +728,9 @@ static struct mdk_personality *find_pers(int level, char *clevel) | |||
647 | } | 728 | } |
648 | 729 | ||
649 | /* return the offset of the super block in 512byte sectors */ | 730 | /* return the offset of the super block in 512byte sectors */ |
650 | static inline sector_t calc_dev_sboffset(struct block_device *bdev) | 731 | static inline sector_t calc_dev_sboffset(mdk_rdev_t *rdev) |
651 | { | 732 | { |
652 | sector_t num_sectors = bdev->bd_inode->i_size / 512; | 733 | sector_t num_sectors = i_size_read(rdev->bdev->bd_inode) / 512; |
653 | return MD_NEW_SIZE_SECTORS(num_sectors); | 734 | return MD_NEW_SIZE_SECTORS(num_sectors); |
654 | } | 735 | } |
655 | 736 | ||
@@ -696,31 +777,6 @@ static void super_written(struct bio *bio, int error) | |||
696 | bio_put(bio); | 777 | bio_put(bio); |
697 | } | 778 | } |
698 | 779 | ||
699 | static void super_written_barrier(struct bio *bio, int error) | ||
700 | { | ||
701 | struct bio *bio2 = bio->bi_private; | ||
702 | mdk_rdev_t *rdev = bio2->bi_private; | ||
703 | mddev_t *mddev = rdev->mddev; | ||
704 | |||
705 | if (!test_bit(BIO_UPTODATE, &bio->bi_flags) && | ||
706 | error == -EOPNOTSUPP) { | ||
707 | unsigned long flags; | ||
708 | /* barriers don't appear to be supported :-( */ | ||
709 | set_bit(BarriersNotsupp, &rdev->flags); | ||
710 | mddev->barriers_work = 0; | ||
711 | spin_lock_irqsave(&mddev->write_lock, flags); | ||
712 | bio2->bi_next = mddev->biolist; | ||
713 | mddev->biolist = bio2; | ||
714 | spin_unlock_irqrestore(&mddev->write_lock, flags); | ||
715 | wake_up(&mddev->sb_wait); | ||
716 | bio_put(bio); | ||
717 | } else { | ||
718 | bio_put(bio2); | ||
719 | bio->bi_private = rdev; | ||
720 | super_written(bio, error); | ||
721 | } | ||
722 | } | ||
723 | |||
724 | void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, | 780 | void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, |
725 | sector_t sector, int size, struct page *page) | 781 | sector_t sector, int size, struct page *page) |
726 | { | 782 | { |
@@ -729,51 +785,27 @@ void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, | |||
729 | * and decrement it on completion, waking up sb_wait | 785 | * and decrement it on completion, waking up sb_wait |
730 | * if zero is reached. | 786 | * if zero is reached. |
731 | * If an error occurred, call md_error | 787 | * If an error occurred, call md_error |
732 | * | ||
733 | * As we might need to resubmit the request if REQ_HARDBARRIER | ||
734 | * causes ENOTSUPP, we allocate a spare bio... | ||
735 | */ | 788 | */ |
736 | struct bio *bio = bio_alloc(GFP_NOIO, 1); | 789 | struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, mddev); |
737 | int rw = REQ_WRITE | REQ_SYNC | REQ_UNPLUG; | ||
738 | 790 | ||
739 | bio->bi_bdev = rdev->bdev; | 791 | bio->bi_bdev = rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev; |
740 | bio->bi_sector = sector; | 792 | bio->bi_sector = sector; |
741 | bio_add_page(bio, page, size, 0); | 793 | bio_add_page(bio, page, size, 0); |
742 | bio->bi_private = rdev; | 794 | bio->bi_private = rdev; |
743 | bio->bi_end_io = super_written; | 795 | bio->bi_end_io = super_written; |
744 | bio->bi_rw = rw; | ||
745 | 796 | ||
746 | atomic_inc(&mddev->pending_writes); | 797 | atomic_inc(&mddev->pending_writes); |
747 | if (!test_bit(BarriersNotsupp, &rdev->flags)) { | 798 | submit_bio(REQ_WRITE | REQ_SYNC | REQ_FLUSH | REQ_FUA, bio); |
748 | struct bio *rbio; | ||
749 | rw |= REQ_HARDBARRIER; | ||
750 | rbio = bio_clone(bio, GFP_NOIO); | ||
751 | rbio->bi_private = bio; | ||
752 | rbio->bi_end_io = super_written_barrier; | ||
753 | submit_bio(rw, rbio); | ||
754 | } else | ||
755 | submit_bio(rw, bio); | ||
756 | } | 799 | } |
757 | 800 | ||
758 | void md_super_wait(mddev_t *mddev) | 801 | void md_super_wait(mddev_t *mddev) |
759 | { | 802 | { |
760 | /* wait for all superblock writes that were scheduled to complete. | 803 | /* wait for all superblock writes that were scheduled to complete */ |
761 | * if any had to be retried (due to BARRIER problems), retry them | ||
762 | */ | ||
763 | DEFINE_WAIT(wq); | 804 | DEFINE_WAIT(wq); |
764 | for(;;) { | 805 | for(;;) { |
765 | prepare_to_wait(&mddev->sb_wait, &wq, TASK_UNINTERRUPTIBLE); | 806 | prepare_to_wait(&mddev->sb_wait, &wq, TASK_UNINTERRUPTIBLE); |
766 | if (atomic_read(&mddev->pending_writes)==0) | 807 | if (atomic_read(&mddev->pending_writes)==0) |
767 | break; | 808 | break; |
768 | while (mddev->biolist) { | ||
769 | struct bio *bio; | ||
770 | spin_lock_irq(&mddev->write_lock); | ||
771 | bio = mddev->biolist; | ||
772 | mddev->biolist = bio->bi_next ; | ||
773 | bio->bi_next = NULL; | ||
774 | spin_unlock_irq(&mddev->write_lock); | ||
775 | submit_bio(bio->bi_rw, bio); | ||
776 | } | ||
777 | schedule(); | 809 | schedule(); |
778 | } | 810 | } |
779 | finish_wait(&mddev->sb_wait, &wq); | 811 | finish_wait(&mddev->sb_wait, &wq); |
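A rough sketch of the caller pattern these simplified helpers serve (md_update_sb() does essentially this; error handling and bookkeeping are elided): each member gets a single FLUSH+FUA superblock write and md_super_wait() blocks until pending_writes drains, with no barrier-retry path left to handle.

        list_for_each_entry(rdev, &mddev->disks, same_set) {
                if (rdev->sb_loaded != 1)
                        continue;               /* skipped or not prepared */
                md_super_write(mddev, rdev, rdev->sb_start,
                               rdev->sb_size, rdev->sb_page);
        }
        md_super_wait(mddev);   /* returns once pending_writes reaches zero */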
@@ -784,17 +816,21 @@ static void bi_complete(struct bio *bio, int error) | |||
784 | complete((struct completion*)bio->bi_private); | 816 | complete((struct completion*)bio->bi_private); |
785 | } | 817 | } |
786 | 818 | ||
787 | int sync_page_io(struct block_device *bdev, sector_t sector, int size, | 819 | int sync_page_io(mdk_rdev_t *rdev, sector_t sector, int size, |
788 | struct page *page, int rw) | 820 | struct page *page, int rw, bool metadata_op) |
789 | { | 821 | { |
790 | struct bio *bio = bio_alloc(GFP_NOIO, 1); | 822 | struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, rdev->mddev); |
791 | struct completion event; | 823 | struct completion event; |
792 | int ret; | 824 | int ret; |
793 | 825 | ||
794 | rw |= REQ_SYNC | REQ_UNPLUG; | 826 | rw |= REQ_SYNC; |
795 | 827 | ||
796 | bio->bi_bdev = bdev; | 828 | bio->bi_bdev = (metadata_op && rdev->meta_bdev) ? |
797 | bio->bi_sector = sector; | 829 | rdev->meta_bdev : rdev->bdev; |
830 | if (metadata_op) | ||
831 | bio->bi_sector = sector + rdev->sb_start; | ||
832 | else | ||
833 | bio->bi_sector = sector + rdev->data_offset; | ||
798 | bio_add_page(bio, page, size, 0); | 834 | bio_add_page(bio, page, size, 0); |
799 | init_completion(&event); | 835 | init_completion(&event); |
800 | bio->bi_private = &event; | 836 | bio->bi_private = &event; |
@@ -819,7 +855,7 @@ static int read_disk_sb(mdk_rdev_t * rdev, int size) | |||
819 | return 0; | 855 | return 0; |
820 | 856 | ||
821 | 857 | ||
822 | if (!sync_page_io(rdev->bdev, rdev->sb_start, size, rdev->sb_page, READ)) | 858 | if (!sync_page_io(rdev, 0, size, rdev->sb_page, READ, true)) |
823 | goto fail; | 859 | goto fail; |
824 | rdev->sb_loaded = 1; | 860 | rdev->sb_loaded = 1; |
825 | return 0; | 861 | return 0; |
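Illustrative calls against the new sync_page_io() signature: the helper now applies the offset itself, using sb_start (and meta_bdev, if set) when metadata_op is true and data_offset otherwise. The sect/page variables and the fail label are assumed to exist in the caller.

        /* metadata: read the superblock page at sector 0 relative to sb_start */
        if (!sync_page_io(rdev, 0, MD_SB_BYTES, rdev->sb_page, READ, true))
                goto fail;

        /* data: read one page at 'sect' relative to data_offset */
        if (!sync_page_io(rdev, sect, PAGE_SIZE, page, READ, false))
                goto fail;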
@@ -981,7 +1017,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version | |||
981 | * | 1017 | * |
982 | * It also happens to be a multiple of 4Kb. | 1018 | * It also happens to be a multiple of 4Kb. |
983 | */ | 1019 | */ |
984 | rdev->sb_start = calc_dev_sboffset(rdev->bdev); | 1020 | rdev->sb_start = calc_dev_sboffset(rdev); |
985 | 1021 | ||
986 | ret = read_disk_sb(rdev, MD_SB_BYTES); | 1022 | ret = read_disk_sb(rdev, MD_SB_BYTES); |
987 | if (ret) return ret; | 1023 | if (ret) return ret; |
@@ -1070,7 +1106,6 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) | |||
1070 | clear_bit(Faulty, &rdev->flags); | 1106 | clear_bit(Faulty, &rdev->flags); |
1071 | clear_bit(In_sync, &rdev->flags); | 1107 | clear_bit(In_sync, &rdev->flags); |
1072 | clear_bit(WriteMostly, &rdev->flags); | 1108 | clear_bit(WriteMostly, &rdev->flags); |
1073 | clear_bit(BarriersNotsupp, &rdev->flags); | ||
1074 | 1109 | ||
1075 | if (mddev->raid_disks == 0) { | 1110 | if (mddev->raid_disks == 0) { |
1076 | mddev->major_version = 0; | 1111 | mddev->major_version = 0; |
@@ -1323,13 +1358,13 @@ super_90_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors) | |||
1323 | return 0; /* component must fit device */ | 1358 | return 0; /* component must fit device */ |
1324 | if (rdev->mddev->bitmap_info.offset) | 1359 | if (rdev->mddev->bitmap_info.offset) |
1325 | return 0; /* can't move bitmap */ | 1360 | return 0; /* can't move bitmap */ |
1326 | rdev->sb_start = calc_dev_sboffset(rdev->bdev); | 1361 | rdev->sb_start = calc_dev_sboffset(rdev); |
1327 | if (!num_sectors || num_sectors > rdev->sb_start) | 1362 | if (!num_sectors || num_sectors > rdev->sb_start) |
1328 | num_sectors = rdev->sb_start; | 1363 | num_sectors = rdev->sb_start; |
1329 | md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, | 1364 | md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, |
1330 | rdev->sb_page); | 1365 | rdev->sb_page); |
1331 | md_super_wait(rdev->mddev); | 1366 | md_super_wait(rdev->mddev); |
1332 | return num_sectors / 2; /* kB for sysfs */ | 1367 | return num_sectors; |
1333 | } | 1368 | } |
1334 | 1369 | ||
1335 | 1370 | ||
@@ -1378,7 +1413,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) | |||
1378 | */ | 1413 | */ |
1379 | switch(minor_version) { | 1414 | switch(minor_version) { |
1380 | case 0: | 1415 | case 0: |
1381 | sb_start = rdev->bdev->bd_inode->i_size >> 9; | 1416 | sb_start = i_size_read(rdev->bdev->bd_inode) >> 9; |
1382 | sb_start -= 8*2; | 1417 | sb_start -= 8*2; |
1383 | sb_start &= ~(sector_t)(4*2-1); | 1418 | sb_start &= ~(sector_t)(4*2-1); |
1384 | break; | 1419 | break; |
@@ -1464,7 +1499,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) | |||
1464 | ret = 0; | 1499 | ret = 0; |
1465 | } | 1500 | } |
1466 | if (minor_version) | 1501 | if (minor_version) |
1467 | rdev->sectors = (rdev->bdev->bd_inode->i_size >> 9) - | 1502 | rdev->sectors = (i_size_read(rdev->bdev->bd_inode) >> 9) - |
1468 | le64_to_cpu(sb->data_offset); | 1503 | le64_to_cpu(sb->data_offset); |
1469 | else | 1504 | else |
1470 | rdev->sectors = rdev->sb_start; | 1505 | rdev->sectors = rdev->sb_start; |
@@ -1485,7 +1520,6 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) | |||
1485 | clear_bit(Faulty, &rdev->flags); | 1520 | clear_bit(Faulty, &rdev->flags); |
1486 | clear_bit(In_sync, &rdev->flags); | 1521 | clear_bit(In_sync, &rdev->flags); |
1487 | clear_bit(WriteMostly, &rdev->flags); | 1522 | clear_bit(WriteMostly, &rdev->flags); |
1488 | clear_bit(BarriersNotsupp, &rdev->flags); | ||
1489 | 1523 | ||
1490 | if (mddev->raid_disks == 0) { | 1524 | if (mddev->raid_disks == 0) { |
1491 | mddev->major_version = 1; | 1525 | mddev->major_version = 1; |
@@ -1673,7 +1707,7 @@ super_1_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors) | |||
1673 | return 0; /* component must fit device */ | 1707 | return 0; /* component must fit device */ |
1674 | if (rdev->sb_start < rdev->data_offset) { | 1708 | if (rdev->sb_start < rdev->data_offset) { |
1675 | /* minor versions 1 and 2; superblock before data */ | 1709 | /* minor versions 1 and 2; superblock before data */ |
1676 | max_sectors = rdev->bdev->bd_inode->i_size >> 9; | 1710 | max_sectors = i_size_read(rdev->bdev->bd_inode) >> 9; |
1677 | max_sectors -= rdev->data_offset; | 1711 | max_sectors -= rdev->data_offset; |
1678 | if (!num_sectors || num_sectors > max_sectors) | 1712 | if (!num_sectors || num_sectors > max_sectors) |
1679 | num_sectors = max_sectors; | 1713 | num_sectors = max_sectors; |
@@ -1683,7 +1717,7 @@ super_1_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors) | |||
1683 | } else { | 1717 | } else { |
1684 | /* minor version 0; superblock after data */ | 1718 | /* minor version 0; superblock after data */ |
1685 | sector_t sb_start; | 1719 | sector_t sb_start; |
1686 | sb_start = (rdev->bdev->bd_inode->i_size >> 9) - 8*2; | 1720 | sb_start = (i_size_read(rdev->bdev->bd_inode) >> 9) - 8*2; |
1687 | sb_start &= ~(sector_t)(4*2 - 1); | 1721 | sb_start &= ~(sector_t)(4*2 - 1); |
1688 | max_sectors = rdev->sectors + sb_start - rdev->sb_start; | 1722 | max_sectors = rdev->sectors + sb_start - rdev->sb_start; |
1689 | if (!num_sectors || num_sectors > max_sectors) | 1723 | if (!num_sectors || num_sectors > max_sectors) |
@@ -1697,7 +1731,7 @@ super_1_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors) | |||
1697 | md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, | 1731 | md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, |
1698 | rdev->sb_page); | 1732 | rdev->sb_page); |
1699 | md_super_wait(rdev->mddev); | 1733 | md_super_wait(rdev->mddev); |
1700 | return num_sectors / 2; /* kB for sysfs */ | 1734 | return num_sectors; |
1701 | } | 1735 | } |
1702 | 1736 | ||
1703 | static struct super_type super_types[] = { | 1737 | static struct super_type super_types[] = { |
@@ -1719,6 +1753,18 @@ static struct super_type super_types[] = { | |||
1719 | }, | 1753 | }, |
1720 | }; | 1754 | }; |
1721 | 1755 | ||
1756 | static void sync_super(mddev_t *mddev, mdk_rdev_t *rdev) | ||
1757 | { | ||
1758 | if (mddev->sync_super) { | ||
1759 | mddev->sync_super(mddev, rdev); | ||
1760 | return; | ||
1761 | } | ||
1762 | |||
1763 | BUG_ON(mddev->major_version >= ARRAY_SIZE(super_types)); | ||
1764 | |||
1765 | super_types[mddev->major_version].sync_super(mddev, rdev); | ||
1766 | } | ||
1767 | |||
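A hypothetical user of the new hook (my_sync_super/my_setup are illustrative names): an external wrapper can take over superblock updates by setting mddev->sync_super before the array is run; if the hook is left NULL, sync_sbs() falls back to super_types[] exactly as before.

static void my_sync_super(mddev_t *mddev, mdk_rdev_t *rdev)
{
        /* serialise the wrapper's own metadata format into rdev->sb_page */
}

static void my_setup(mddev_t *mddev)
{
        /* installed before md_run(); sync_sbs() then calls this instead of
         * super_types[mddev->major_version].sync_super() */
        mddev->sync_super = my_sync_super;
}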
1722 | static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2) | 1768 | static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2) |
1723 | { | 1769 | { |
1724 | mdk_rdev_t *rdev, *rdev2; | 1770 | mdk_rdev_t *rdev, *rdev2; |
@@ -1750,20 +1796,14 @@ int md_integrity_register(mddev_t *mddev) | |||
1750 | 1796 | ||
1751 | if (list_empty(&mddev->disks)) | 1797 | if (list_empty(&mddev->disks)) |
1752 | return 0; /* nothing to do */ | 1798 | return 0; /* nothing to do */ |
1753 | if (blk_get_integrity(mddev->gendisk)) | 1799 | if (!mddev->gendisk || blk_get_integrity(mddev->gendisk)) |
1754 | return 0; /* already registered */ | 1800 | return 0; /* shouldn't register, or already is */ |
1755 | list_for_each_entry(rdev, &mddev->disks, same_set) { | 1801 | list_for_each_entry(rdev, &mddev->disks, same_set) { |
1756 | /* skip spares and non-functional disks */ | 1802 | /* skip spares and non-functional disks */ |
1757 | if (test_bit(Faulty, &rdev->flags)) | 1803 | if (test_bit(Faulty, &rdev->flags)) |
1758 | continue; | 1804 | continue; |
1759 | if (rdev->raid_disk < 0) | 1805 | if (rdev->raid_disk < 0) |
1760 | continue; | 1806 | continue; |
1761 | /* | ||
1762 | * If at least one rdev is not integrity capable, we can not | ||
1763 | * enable data integrity for the md device. | ||
1764 | */ | ||
1765 | if (!bdev_get_integrity(rdev->bdev)) | ||
1766 | return -EINVAL; | ||
1767 | if (!reference) { | 1807 | if (!reference) { |
1768 | /* Use the first rdev as the reference */ | 1808 | /* Use the first rdev as the reference */ |
1769 | reference = rdev; | 1809 | reference = rdev; |
@@ -1774,6 +1814,8 @@ int md_integrity_register(mddev_t *mddev) | |||
1774 | rdev->bdev->bd_disk) < 0) | 1814 | rdev->bdev->bd_disk) < 0) |
1775 | return -EINVAL; | 1815 | return -EINVAL; |
1776 | } | 1816 | } |
1817 | if (!reference || !bdev_get_integrity(reference->bdev)) | ||
1818 | return 0; | ||
1777 | /* | 1819 | /* |
1778 | * All component devices are integrity capable and have matching | 1820 | * All component devices are integrity capable and have matching |
1779 | * profiles, register the common profile for the md device. | 1821 | * profiles, register the common profile for the md device. |
@@ -1784,8 +1826,12 @@ int md_integrity_register(mddev_t *mddev) | |||
1784 | mdname(mddev)); | 1826 | mdname(mddev)); |
1785 | return -EINVAL; | 1827 | return -EINVAL; |
1786 | } | 1828 | } |
1787 | printk(KERN_NOTICE "md: data integrity on %s enabled\n", | 1829 | printk(KERN_NOTICE "md: data integrity enabled on %s\n", mdname(mddev)); |
1788 | mdname(mddev)); | 1830 | if (bioset_integrity_create(mddev->bio_set, BIO_POOL_SIZE)) { |
1831 | printk(KERN_ERR "md: failed to create integrity pool for %s\n", | ||
1832 | mdname(mddev)); | ||
1833 | return -EINVAL; | ||
1834 | } | ||
1789 | return 0; | 1835 | return 0; |
1790 | } | 1836 | } |
1791 | EXPORT_SYMBOL(md_integrity_register); | 1837 | EXPORT_SYMBOL(md_integrity_register); |
@@ -1873,7 +1919,7 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) | |||
1873 | rdev->sysfs_state = sysfs_get_dirent_safe(rdev->kobj.sd, "state"); | 1919 | rdev->sysfs_state = sysfs_get_dirent_safe(rdev->kobj.sd, "state"); |
1874 | 1920 | ||
1875 | list_add_rcu(&rdev->same_set, &mddev->disks); | 1921 | list_add_rcu(&rdev->same_set, &mddev->disks); |
1876 | bd_claim_by_disk(rdev->bdev, rdev->bdev->bd_holder, mddev->gendisk); | 1922 | bd_link_disk_holder(rdev->bdev, mddev->gendisk); |
1877 | 1923 | ||
1878 | /* May as well allow recovery to be retried once */ | 1924 | /* May as well allow recovery to be retried once */ |
1879 | mddev->recovery_disabled = 0; | 1925 | mddev->recovery_disabled = 0; |
@@ -1900,7 +1946,7 @@ static void unbind_rdev_from_array(mdk_rdev_t * rdev) | |||
1900 | MD_BUG(); | 1946 | MD_BUG(); |
1901 | return; | 1947 | return; |
1902 | } | 1948 | } |
1903 | bd_release_from_disk(rdev->bdev, rdev->mddev->gendisk); | 1949 | bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk); |
1904 | list_del_rcu(&rdev->same_set); | 1950 | list_del_rcu(&rdev->same_set); |
1905 | printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b)); | 1951 | printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b)); |
1906 | rdev->mddev = NULL; | 1952 | rdev->mddev = NULL; |
@@ -1914,7 +1960,7 @@ static void unbind_rdev_from_array(mdk_rdev_t * rdev) | |||
1914 | synchronize_rcu(); | 1960 | synchronize_rcu(); |
1915 | INIT_WORK(&rdev->del_work, md_delayed_delete); | 1961 | INIT_WORK(&rdev->del_work, md_delayed_delete); |
1916 | kobject_get(&rdev->kobj); | 1962 | kobject_get(&rdev->kobj); |
1917 | schedule_work(&rdev->del_work); | 1963 | queue_work(md_misc_wq, &rdev->del_work); |
1918 | } | 1964 | } |
1919 | 1965 | ||
1920 | /* | 1966 | /* |
@@ -1928,21 +1974,13 @@ static int lock_rdev(mdk_rdev_t *rdev, dev_t dev, int shared) | |||
1928 | struct block_device *bdev; | 1974 | struct block_device *bdev; |
1929 | char b[BDEVNAME_SIZE]; | 1975 | char b[BDEVNAME_SIZE]; |
1930 | 1976 | ||
1931 | bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE); | 1977 | bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, |
1978 | shared ? (mdk_rdev_t *)lock_rdev : rdev); | ||
1932 | if (IS_ERR(bdev)) { | 1979 | if (IS_ERR(bdev)) { |
1933 | printk(KERN_ERR "md: could not open %s.\n", | 1980 | printk(KERN_ERR "md: could not open %s.\n", |
1934 | __bdevname(dev, b)); | 1981 | __bdevname(dev, b)); |
1935 | return PTR_ERR(bdev); | 1982 | return PTR_ERR(bdev); |
1936 | } | 1983 | } |
1937 | err = bd_claim(bdev, shared ? (mdk_rdev_t *)lock_rdev : rdev); | ||
1938 | if (err) { | ||
1939 | printk(KERN_ERR "md: could not bd_claim %s.\n", | ||
1940 | bdevname(bdev, b)); | ||
1941 | blkdev_put(bdev, FMODE_READ|FMODE_WRITE); | ||
1942 | return err; | ||
1943 | } | ||
1944 | if (!shared) | ||
1945 | set_bit(AllReserved, &rdev->flags); | ||
1946 | rdev->bdev = bdev; | 1984 | rdev->bdev = bdev; |
1947 | return err; | 1985 | return err; |
1948 | } | 1986 | } |
@@ -1953,8 +1991,7 @@ static void unlock_rdev(mdk_rdev_t *rdev) | |||
1953 | rdev->bdev = NULL; | 1991 | rdev->bdev = NULL; |
1954 | if (!bdev) | 1992 | if (!bdev) |
1955 | MD_BUG(); | 1993 | MD_BUG(); |
1956 | bd_release(bdev); | 1994 | blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); |
1957 | blkdev_put(bdev, FMODE_READ|FMODE_WRITE); | ||
1958 | } | 1995 | } |
1959 | 1996 | ||
1960 | void md_autodetect_dev(dev_t dev); | 1997 | void md_autodetect_dev(dev_t dev); |
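An illustrative pairing under the exclusive-open API used above (assuming the 2.6.38-era block layer): the holder passed to blkdev_get_by_dev() establishes the claim that bd_claim() used to, and passing FMODE_EXCL back to blkdev_put() releases it.

        struct block_device *bdev;

        bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, rdev);
        if (IS_ERR(bdev))
                return PTR_ERR(bdev);
        /* ... use the device ... */
        blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);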
@@ -2146,8 +2183,7 @@ static void sync_sbs(mddev_t * mddev, int nospares) | |||
2146 | /* Don't update this superblock */ | 2183 | /* Don't update this superblock */ |
2147 | rdev->sb_loaded = 2; | 2184 | rdev->sb_loaded = 2; |
2148 | } else { | 2185 | } else { |
2149 | super_types[mddev->major_version]. | 2186 | sync_super(mddev, rdev); |
2150 | sync_super(mddev, rdev); | ||
2151 | rdev->sb_loaded = 1; | 2187 | rdev->sb_loaded = 1; |
2152 | } | 2188 | } |
2153 | } | 2189 | } |
@@ -2172,6 +2208,8 @@ repeat: | |||
2172 | if (!mddev->persistent) { | 2208 | if (!mddev->persistent) { |
2173 | clear_bit(MD_CHANGE_CLEAN, &mddev->flags); | 2209 | clear_bit(MD_CHANGE_CLEAN, &mddev->flags); |
2174 | clear_bit(MD_CHANGE_DEVS, &mddev->flags); | 2210 | clear_bit(MD_CHANGE_DEVS, &mddev->flags); |
2211 | if (!mddev->external) | ||
2212 | clear_bit(MD_CHANGE_PENDING, &mddev->flags); | ||
2175 | wake_up(&mddev->sb_wait); | 2213 | wake_up(&mddev->sb_wait); |
2176 | return; | 2214 | return; |
2177 | } | 2215 | } |
@@ -2438,7 +2476,7 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len) | |||
2438 | if (rdev->raid_disk == -1) | 2476 | if (rdev->raid_disk == -1) |
2439 | return -EEXIST; | 2477 | return -EEXIST; |
2440 | /* personality does all needed checks */ | 2478 | /* personality does all needed checks */ |
2441 | if (rdev->mddev->pers->hot_add_disk == NULL) | 2479 | if (rdev->mddev->pers->hot_remove_disk == NULL) |
2442 | return -EINVAL; | 2480 | return -EINVAL; |
2443 | err = rdev->mddev->pers-> | 2481 | err = rdev->mddev->pers-> |
2444 | hot_remove_disk(rdev->mddev, rdev->raid_disk); | 2482 | hot_remove_disk(rdev->mddev, rdev->raid_disk); |
@@ -2458,6 +2496,9 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len) | |||
2458 | if (rdev->raid_disk != -1) | 2496 | if (rdev->raid_disk != -1) |
2459 | return -EBUSY; | 2497 | return -EBUSY; |
2460 | 2498 | ||
2499 | if (test_bit(MD_RECOVERY_RUNNING, &rdev->mddev->recovery)) | ||
2500 | return -EBUSY; | ||
2501 | |||
2461 | if (rdev->mddev->pers->hot_add_disk == NULL) | 2502 | if (rdev->mddev->pers->hot_add_disk == NULL) |
2462 | return -EINVAL; | 2503 | return -EINVAL; |
2463 | 2504 | ||
@@ -2465,6 +2506,10 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len) | |||
2465 | if (rdev2->raid_disk == slot) | 2506 | if (rdev2->raid_disk == slot) |
2466 | return -EEXIST; | 2507 | return -EEXIST; |
2467 | 2508 | ||
2509 | if (slot >= rdev->mddev->raid_disks && | ||
2510 | slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks) | ||
2511 | return -ENOSPC; | ||
2512 | |||
2468 | rdev->raid_disk = slot; | 2513 | rdev->raid_disk = slot; |
2469 | if (test_bit(In_sync, &rdev->flags)) | 2514 | if (test_bit(In_sync, &rdev->flags)) |
2470 | rdev->saved_raid_disk = slot; | 2515 | rdev->saved_raid_disk = slot; |
@@ -2482,7 +2527,8 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len) | |||
2482 | /* failure here is OK */; | 2527 | /* failure here is OK */; |
2483 | /* don't wakeup anyone, leave that to userspace. */ | 2528 | /* don't wakeup anyone, leave that to userspace. */ |
2484 | } else { | 2529 | } else { |
2485 | if (slot >= rdev->mddev->raid_disks) | 2530 | if (slot >= rdev->mddev->raid_disks && |
2531 | slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks) | ||
2486 | return -ENOSPC; | 2532 | return -ENOSPC; |
2487 | rdev->raid_disk = slot; | 2533 | rdev->raid_disk = slot; |
2488 | /* assume it is working */ | 2534 | /* assume it is working */ |
@@ -2575,7 +2621,7 @@ rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len) | |||
2575 | if (!sectors) | 2621 | if (!sectors) |
2576 | return -EBUSY; | 2622 | return -EBUSY; |
2577 | } else if (!sectors) | 2623 | } else if (!sectors) |
2578 | sectors = (rdev->bdev->bd_inode->i_size >> 9) - | 2624 | sectors = (i_size_read(rdev->bdev->bd_inode) >> 9) - |
2579 | rdev->data_offset; | 2625 | rdev->data_offset; |
2580 | } | 2626 | } |
2581 | if (sectors < my_mddev->dev_sectors) | 2627 | if (sectors < my_mddev->dev_sectors) |
@@ -2598,12 +2644,11 @@ rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len) | |||
2598 | 2644 | ||
2599 | mddev_lock(mddev); | 2645 | mddev_lock(mddev); |
2600 | list_for_each_entry(rdev2, &mddev->disks, same_set) | 2646 | list_for_each_entry(rdev2, &mddev->disks, same_set) |
2601 | if (test_bit(AllReserved, &rdev2->flags) || | 2647 | if (rdev->bdev == rdev2->bdev && |
2602 | (rdev->bdev == rdev2->bdev && | 2648 | rdev != rdev2 && |
2603 | rdev != rdev2 && | 2649 | overlaps(rdev->data_offset, rdev->sectors, |
2604 | overlaps(rdev->data_offset, rdev->sectors, | 2650 | rdev2->data_offset, |
2605 | rdev2->data_offset, | 2651 | rdev2->sectors)) { |
2606 | rdev2->sectors))) { | ||
2607 | overlap = 1; | 2652 | overlap = 1; |
2608 | break; | 2653 | break; |
2609 | } | 2654 | } |
@@ -2788,7 +2833,7 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi | |||
2788 | 2833 | ||
2789 | kobject_init(&rdev->kobj, &rdev_ktype); | 2834 | kobject_init(&rdev->kobj, &rdev_ktype); |
2790 | 2835 | ||
2791 | size = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; | 2836 | size = i_size_read(rdev->bdev->bd_inode) >> BLOCK_SIZE_BITS; |
2792 | if (!size) { | 2837 | if (!size) { |
2793 | printk(KERN_WARNING | 2838 | printk(KERN_WARNING |
2794 | "md: %s has zero or unknown size, marking faulty!\n", | 2839 | "md: %s has zero or unknown size, marking faulty!\n", |
@@ -3107,7 +3152,7 @@ level_store(mddev_t *mddev, const char *buf, size_t len) | |||
3107 | char nm[20]; | 3152 | char nm[20]; |
3108 | if (rdev->raid_disk < 0) | 3153 | if (rdev->raid_disk < 0) |
3109 | continue; | 3154 | continue; |
3110 | if (rdev->new_raid_disk > mddev->raid_disks) | 3155 | if (rdev->new_raid_disk >= mddev->raid_disks) |
3111 | rdev->new_raid_disk = -1; | 3156 | rdev->new_raid_disk = -1; |
3112 | if (rdev->new_raid_disk == rdev->raid_disk) | 3157 | if (rdev->new_raid_disk == rdev->raid_disk) |
3113 | continue; | 3158 | continue; |
@@ -3139,6 +3184,7 @@ level_store(mddev_t *mddev, const char *buf, size_t len) | |||
3139 | mddev->layout = mddev->new_layout; | 3184 | mddev->layout = mddev->new_layout; |
3140 | mddev->chunk_sectors = mddev->new_chunk_sectors; | 3185 | mddev->chunk_sectors = mddev->new_chunk_sectors; |
3141 | mddev->delta_disks = 0; | 3186 | mddev->delta_disks = 0; |
3187 | mddev->degraded = 0; | ||
3142 | if (mddev->pers->sync_request == NULL) { | 3188 | if (mddev->pers->sync_request == NULL) { |
3143 | /* this is now an array without redundancy, so | 3189 | /* this is now an array without redundancy, so |
3144 | * it must always be in_sync | 3190 | * it must always be in_sync |
@@ -3292,7 +3338,7 @@ resync_start_store(mddev_t *mddev, const char *buf, size_t len) | |||
3292 | char *e; | 3338 | char *e; |
3293 | unsigned long long n = simple_strtoull(buf, &e, 10); | 3339 | unsigned long long n = simple_strtoull(buf, &e, 10); |
3294 | 3340 | ||
3295 | if (mddev->pers) | 3341 | if (mddev->pers && !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) |
3296 | return -EBUSY; | 3342 | return -EBUSY; |
3297 | if (cmd_match(buf, "none")) | 3343 | if (cmd_match(buf, "none")) |
3298 | n = MaxSector; | 3344 | n = MaxSector; |
@@ -3736,6 +3782,8 @@ action_show(mddev_t *mddev, char *page) | |||
3736 | return sprintf(page, "%s\n", type); | 3782 | return sprintf(page, "%s\n", type); |
3737 | } | 3783 | } |
3738 | 3784 | ||
3785 | static void reap_sync_thread(mddev_t *mddev); | ||
3786 | |||
3739 | static ssize_t | 3787 | static ssize_t |
3740 | action_store(mddev_t *mddev, const char *page, size_t len) | 3788 | action_store(mddev_t *mddev, const char *page, size_t len) |
3741 | { | 3789 | { |
@@ -3750,9 +3798,7 @@ action_store(mddev_t *mddev, const char *page, size_t len) | |||
3750 | if (cmd_match(page, "idle") || cmd_match(page, "frozen")) { | 3798 | if (cmd_match(page, "idle") || cmd_match(page, "frozen")) { |
3751 | if (mddev->sync_thread) { | 3799 | if (mddev->sync_thread) { |
3752 | set_bit(MD_RECOVERY_INTR, &mddev->recovery); | 3800 | set_bit(MD_RECOVERY_INTR, &mddev->recovery); |
3753 | md_unregister_thread(mddev->sync_thread); | 3801 | reap_sync_thread(mddev); |
3754 | mddev->sync_thread = NULL; | ||
3755 | mddev->recovery = 0; | ||
3756 | } | 3802 | } |
3757 | } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || | 3803 | } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || |
3758 | test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) | 3804 | test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) |
@@ -3904,7 +3950,7 @@ static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed); | |||
3904 | static ssize_t | 3950 | static ssize_t |
3905 | sync_completed_show(mddev_t *mddev, char *page) | 3951 | sync_completed_show(mddev_t *mddev, char *page) |
3906 | { | 3952 | { |
3907 | unsigned long max_sectors, resync; | 3953 | unsigned long long max_sectors, resync; |
3908 | 3954 | ||
3909 | if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) | 3955 | if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) |
3910 | return sprintf(page, "none\n"); | 3956 | return sprintf(page, "none\n"); |
@@ -3915,7 +3961,7 @@ sync_completed_show(mddev_t *mddev, char *page) | |||
3915 | max_sectors = mddev->dev_sectors; | 3961 | max_sectors = mddev->dev_sectors; |
3916 | 3962 | ||
3917 | resync = mddev->curr_resync_completed; | 3963 | resync = mddev->curr_resync_completed; |
3918 | return sprintf(page, "%lu / %lu\n", resync, max_sectors); | 3964 | return sprintf(page, "%llu / %llu\n", resync, max_sectors); |
3919 | } | 3965 | } |
3920 | 3966 | ||
3921 | static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed); | 3967 | static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed); |
@@ -4002,19 +4048,24 @@ suspend_lo_store(mddev_t *mddev, const char *buf, size_t len) | |||
4002 | { | 4048 | { |
4003 | char *e; | 4049 | char *e; |
4004 | unsigned long long new = simple_strtoull(buf, &e, 10); | 4050 | unsigned long long new = simple_strtoull(buf, &e, 10); |
4051 | unsigned long long old = mddev->suspend_lo; | ||
4005 | 4052 | ||
4006 | if (mddev->pers == NULL || | 4053 | if (mddev->pers == NULL || |
4007 | mddev->pers->quiesce == NULL) | 4054 | mddev->pers->quiesce == NULL) |
4008 | return -EINVAL; | 4055 | return -EINVAL; |
4009 | if (buf == e || (*e && *e != '\n')) | 4056 | if (buf == e || (*e && *e != '\n')) |
4010 | return -EINVAL; | 4057 | return -EINVAL; |
4011 | if (new >= mddev->suspend_hi || | 4058 | |
4012 | (new > mddev->suspend_lo && new < mddev->suspend_hi)) { | 4059 | mddev->suspend_lo = new; |
4013 | mddev->suspend_lo = new; | 4060 | if (new >= old) |
4061 | /* Shrinking suspended region */ | ||
4014 | mddev->pers->quiesce(mddev, 2); | 4062 | mddev->pers->quiesce(mddev, 2); |
4015 | return len; | 4063 | else { |
4016 | } else | 4064 | /* Expanding suspended region - need to wait */ |
4017 | return -EINVAL; | 4065 | mddev->pers->quiesce(mddev, 1); |
4066 | mddev->pers->quiesce(mddev, 0); | ||
4067 | } | ||
4068 | return len; | ||
4018 | } | 4069 | } |
4019 | static struct md_sysfs_entry md_suspend_lo = | 4070 | static struct md_sysfs_entry md_suspend_lo = |
4020 | __ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store); | 4071 | __ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store); |
@@ -4031,20 +4082,24 @@ suspend_hi_store(mddev_t *mddev, const char *buf, size_t len) | |||
4031 | { | 4082 | { |
4032 | char *e; | 4083 | char *e; |
4033 | unsigned long long new = simple_strtoull(buf, &e, 10); | 4084 | unsigned long long new = simple_strtoull(buf, &e, 10); |
4085 | unsigned long long old = mddev->suspend_hi; | ||
4034 | 4086 | ||
4035 | if (mddev->pers == NULL || | 4087 | if (mddev->pers == NULL || |
4036 | mddev->pers->quiesce == NULL) | 4088 | mddev->pers->quiesce == NULL) |
4037 | return -EINVAL; | 4089 | return -EINVAL; |
4038 | if (buf == e || (*e && *e != '\n')) | 4090 | if (buf == e || (*e && *e != '\n')) |
4039 | return -EINVAL; | 4091 | return -EINVAL; |
4040 | if ((new <= mddev->suspend_lo && mddev->suspend_lo >= mddev->suspend_hi) || | 4092 | |
4041 | (new > mddev->suspend_lo && new > mddev->suspend_hi)) { | 4093 | mddev->suspend_hi = new; |
4042 | mddev->suspend_hi = new; | 4094 | if (new <= old) |
4095 | /* Shrinking suspended region */ | ||
4096 | mddev->pers->quiesce(mddev, 2); | ||
4097 | else { | ||
4098 | /* Expanding suspended region - need to wait */ | ||
4043 | mddev->pers->quiesce(mddev, 1); | 4099 | mddev->pers->quiesce(mddev, 1); |
4044 | mddev->pers->quiesce(mddev, 0); | 4100 | mddev->pers->quiesce(mddev, 0); |
4045 | return len; | 4101 | } |
4046 | } else | 4102 | return len; |
4047 | return -EINVAL; | ||
4048 | } | 4103 | } |
4049 | static struct md_sysfs_entry md_suspend_hi = | 4104 | static struct md_sysfs_entry md_suspend_hi = |
4050 | __ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store); | 4105 | __ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store); |
@@ -4112,10 +4167,10 @@ array_size_store(mddev_t *mddev, const char *buf, size_t len) | |||
4112 | } | 4167 | } |
4113 | 4168 | ||
4114 | mddev->array_sectors = sectors; | 4169 | mddev->array_sectors = sectors; |
4115 | set_capacity(mddev->gendisk, mddev->array_sectors); | 4170 | if (mddev->pers) { |
4116 | if (mddev->pers) | 4171 | set_capacity(mddev->gendisk, mddev->array_sectors); |
4117 | revalidate_disk(mddev->gendisk); | 4172 | revalidate_disk(mddev->gendisk); |
4118 | 4173 | } | |
4119 | return len; | 4174 | return len; |
4120 | } | 4175 | } |
4121 | 4176 | ||
@@ -4256,10 +4311,10 @@ static int md_alloc(dev_t dev, char *name) | |||
4256 | shift = partitioned ? MdpMinorShift : 0; | 4311 | shift = partitioned ? MdpMinorShift : 0; |
4257 | unit = MINOR(mddev->unit) >> shift; | 4312 | unit = MINOR(mddev->unit) >> shift; |
4258 | 4313 | ||
4259 | /* wait for any previous instance if this device | 4314 | /* wait for any previous instance of this device to be |
4260 | * to be completed removed (mddev_delayed_delete). | 4315 | * completely removed (mddev_delayed_delete). |
4261 | */ | 4316 | */ |
4262 | flush_scheduled_work(); | 4317 | flush_workqueue(md_misc_wq); |
4263 | 4318 | ||
4264 | mutex_lock(&disks_mutex); | 4319 | mutex_lock(&disks_mutex); |
4265 | error = -EEXIST; | 4320 | error = -EEXIST; |
@@ -4287,9 +4342,6 @@ static int md_alloc(dev_t dev, char *name) | |||
4287 | goto abort; | 4342 | goto abort; |
4288 | mddev->queue->queuedata = mddev; | 4343 | mddev->queue->queuedata = mddev; |
4289 | 4344 | ||
4290 | /* Can be unlocked because the queue is new: no concurrency */ | ||
4291 | queue_flag_set_unlocked(QUEUE_FLAG_CLUSTER, mddev->queue); | ||
4292 | |||
4293 | blk_queue_make_request(mddev->queue, md_make_request); | 4345 | blk_queue_make_request(mddev->queue, md_make_request); |
4294 | 4346 | ||
4295 | disk = alloc_disk(1 << shift); | 4347 | disk = alloc_disk(1 << shift); |
@@ -4309,13 +4361,19 @@ static int md_alloc(dev_t dev, char *name) | |||
4309 | disk->fops = &md_fops; | 4361 | disk->fops = &md_fops; |
4310 | disk->private_data = mddev; | 4362 | disk->private_data = mddev; |
4311 | disk->queue = mddev->queue; | 4363 | disk->queue = mddev->queue; |
4364 | blk_queue_flush(mddev->queue, REQ_FLUSH | REQ_FUA); | ||
4312 | /* Allow extended partitions. This makes the | 4365 | /* Allow extended partitions. This makes the |
4313 | * 'mdp' device redundant, but we can't really | 4366 | * 'mdp' device redundant, but we can't really |
4314 | * remove it now. | 4367 | * remove it now. |
4315 | */ | 4368 | */ |
4316 | disk->flags |= GENHD_FL_EXT_DEVT; | 4369 | disk->flags |= GENHD_FL_EXT_DEVT; |
4317 | add_disk(disk); | ||
4318 | mddev->gendisk = disk; | 4370 | mddev->gendisk = disk; |
4371 | /* As soon as we call add_disk(), another thread could get | ||
4372 | * through to md_open, so make sure it doesn't get too far | ||
4373 | */ | ||
4374 | mutex_lock(&mddev->open_mutex); | ||
4375 | add_disk(disk); | ||
4376 | |||
4319 | error = kobject_init_and_add(&mddev->kobj, &md_ktype, | 4377 | error = kobject_init_and_add(&mddev->kobj, &md_ktype, |
4320 | &disk_to_dev(disk)->kobj, "%s", "md"); | 4378 | &disk_to_dev(disk)->kobj, "%s", "md"); |
4321 | if (error) { | 4379 | if (error) { |
@@ -4329,6 +4387,7 @@ static int md_alloc(dev_t dev, char *name) | |||
4329 | if (mddev->kobj.sd && | 4387 | if (mddev->kobj.sd && |
4330 | sysfs_create_group(&mddev->kobj, &md_bitmap_group)) | 4388 | sysfs_create_group(&mddev->kobj, &md_bitmap_group)) |
4331 | printk(KERN_DEBUG "pointless warning\n"); | 4389 | printk(KERN_DEBUG "pointless warning\n"); |
4390 | mutex_unlock(&mddev->open_mutex); | ||
4332 | abort: | 4391 | abort: |
4333 | mutex_unlock(&disks_mutex); | 4392 | mutex_unlock(&disks_mutex); |
4334 | if (!error && mddev->kobj.sd) { | 4393 | if (!error && mddev->kobj.sd) { |
@@ -4423,7 +4482,9 @@ int md_run(mddev_t *mddev) | |||
4423 | * We don't want the data to overlap the metadata, | 4482 | * We don't want the data to overlap the metadata, |
4424 | * Internal Bitmap issues have been handled elsewhere. | 4483 | * Internal Bitmap issues have been handled elsewhere. |
4425 | */ | 4484 | */ |
4426 | if (rdev->data_offset < rdev->sb_start) { | 4485 | if (rdev->meta_bdev) { |
4486 | /* Nothing to check */; | ||
4487 | } else if (rdev->data_offset < rdev->sb_start) { | ||
4427 | if (mddev->dev_sectors && | 4488 | if (mddev->dev_sectors && |
4428 | rdev->data_offset + mddev->dev_sectors | 4489 | rdev->data_offset + mddev->dev_sectors |
4429 | > rdev->sb_start) { | 4490 | > rdev->sb_start) { |
@@ -4442,6 +4503,9 @@ int md_run(mddev_t *mddev) | |||
4442 | sysfs_notify_dirent_safe(rdev->sysfs_state); | 4503 | sysfs_notify_dirent_safe(rdev->sysfs_state); |
4443 | } | 4504 | } |
4444 | 4505 | ||
4506 | if (mddev->bio_set == NULL) | ||
4507 | mddev->bio_set = bioset_create(BIO_POOL_SIZE, sizeof(mddev)); | ||
4508 | |||
4445 | spin_lock(&pers_lock); | 4509 | spin_lock(&pers_lock); |
4446 | pers = find_pers(mddev->level, mddev->clevel); | 4510 | pers = find_pers(mddev->level, mddev->clevel); |
4447 | if (!pers || !try_module_get(pers->owner)) { | 4511 | if (!pers || !try_module_get(pers->owner)) { |
@@ -4504,7 +4568,6 @@ int md_run(mddev_t *mddev) | |||
4504 | /* may be over-ridden by personality */ | 4568 | /* may be over-ridden by personality */ |
4505 | mddev->resync_max_sectors = mddev->dev_sectors; | 4569 | mddev->resync_max_sectors = mddev->dev_sectors; |
4506 | 4570 | ||
4507 | mddev->barriers_work = 1; | ||
4508 | mddev->ok_start_degraded = start_dirty_degraded; | 4571 | mddev->ok_start_degraded = start_dirty_degraded; |
4509 | 4572 | ||
4510 | if (start_readonly && mddev->ro == 0) | 4573 | if (start_readonly && mddev->ro == 0) |
@@ -4555,7 +4618,8 @@ int md_run(mddev_t *mddev) | |||
4555 | mddev->safemode_timer.data = (unsigned long) mddev; | 4618 | mddev->safemode_timer.data = (unsigned long) mddev; |
4556 | mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */ | 4619 | mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */ |
4557 | mddev->in_sync = 1; | 4620 | mddev->in_sync = 1; |
4558 | 4621 | smp_wmb(); | |
4622 | mddev->ready = 1; | ||
4559 | list_for_each_entry(rdev, &mddev->disks, same_set) | 4623 | list_for_each_entry(rdev, &mddev->disks, same_set) |
4560 | if (rdev->raid_disk >= 0) { | 4624 | if (rdev->raid_disk >= 0) { |
4561 | char nm[20]; | 4625 | char nm[20]; |
@@ -4569,9 +4633,6 @@ int md_run(mddev_t *mddev) | |||
4569 | if (mddev->flags) | 4633 | if (mddev->flags) |
4570 | md_update_sb(mddev, 0); | 4634 | md_update_sb(mddev, 0); |
4571 | 4635 | ||
4572 | md_wakeup_thread(mddev->thread); | ||
4573 | md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ | ||
4574 | |||
4575 | md_new_event(mddev); | 4636 | md_new_event(mddev); |
4576 | sysfs_notify_dirent_safe(mddev->sysfs_state); | 4637 | sysfs_notify_dirent_safe(mddev->sysfs_state); |
4577 | sysfs_notify_dirent_safe(mddev->sysfs_action); | 4638 | sysfs_notify_dirent_safe(mddev->sysfs_action); |
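The smp_wmb() before setting mddev->ready in the hunk above is a publish barrier: every field a reader may inspect is made visible before the flag itself. A minimal userspace sketch of the same publish pattern using C11 release/acquire atomics; the struct and field names are invented for illustration and are not md's:

```c
/* Sketch only: models the smp_wmb()/mddev->ready publication in md_run()
 * with C11 release/acquire atomics.  Names are illustrative, not md API. */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

struct array_state {
	int in_sync;
	int safemode_delay;
	atomic_int ready;	/* readers may only touch the rest once ready == 1 */
};

static struct array_state st;

static void *reader(void *arg)
{
	(void)arg;
	/* acquire pairs with the writer's release, like the implied read
	 * barrier after observing mddev->ready */
	while (!atomic_load_explicit(&st.ready, memory_order_acquire))
		;
	printf("in_sync=%d delay=%d\n", st.in_sync, st.safemode_delay);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, reader, NULL);

	st.in_sync = 1;
	st.safemode_delay = 200;
	/* release acts like smp_wmb() plus the plain store of mddev->ready = 1 */
	atomic_store_explicit(&st.ready, 1, memory_order_release);

	pthread_join(t, NULL);
	return 0;
}
```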
@@ -4592,8 +4653,13 @@ static int do_md_run(mddev_t *mddev) | |||
4592 | bitmap_destroy(mddev); | 4653 | bitmap_destroy(mddev); |
4593 | goto out; | 4654 | goto out; |
4594 | } | 4655 | } |
4656 | |||
4657 | md_wakeup_thread(mddev->thread); | ||
4658 | md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ | ||
4659 | |||
4595 | set_capacity(mddev->gendisk, mddev->array_sectors); | 4660 | set_capacity(mddev->gendisk, mddev->array_sectors); |
4596 | revalidate_disk(mddev->gendisk); | 4661 | revalidate_disk(mddev->gendisk); |
4662 | mddev->changed = 1; | ||
4597 | kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); | 4663 | kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); |
4598 | out: | 4664 | out: |
4599 | return err; | 4665 | return err; |
@@ -4682,24 +4748,22 @@ static void md_clean(mddev_t *mddev) | |||
4682 | mddev->sync_speed_min = mddev->sync_speed_max = 0; | 4748 | mddev->sync_speed_min = mddev->sync_speed_max = 0; |
4683 | mddev->recovery = 0; | 4749 | mddev->recovery = 0; |
4684 | mddev->in_sync = 0; | 4750 | mddev->in_sync = 0; |
4751 | mddev->changed = 0; | ||
4685 | mddev->degraded = 0; | 4752 | mddev->degraded = 0; |
4686 | mddev->barriers_work = 0; | ||
4687 | mddev->safemode = 0; | 4753 | mddev->safemode = 0; |
4688 | mddev->bitmap_info.offset = 0; | 4754 | mddev->bitmap_info.offset = 0; |
4689 | mddev->bitmap_info.default_offset = 0; | 4755 | mddev->bitmap_info.default_offset = 0; |
4690 | mddev->bitmap_info.chunksize = 0; | 4756 | mddev->bitmap_info.chunksize = 0; |
4691 | mddev->bitmap_info.daemon_sleep = 0; | 4757 | mddev->bitmap_info.daemon_sleep = 0; |
4692 | mddev->bitmap_info.max_write_behind = 0; | 4758 | mddev->bitmap_info.max_write_behind = 0; |
4693 | mddev->plug = NULL; | ||
4694 | } | 4759 | } |
4695 | 4760 | ||
4696 | void md_stop_writes(mddev_t *mddev) | 4761 | static void __md_stop_writes(mddev_t *mddev) |
4697 | { | 4762 | { |
4698 | if (mddev->sync_thread) { | 4763 | if (mddev->sync_thread) { |
4699 | set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); | 4764 | set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); |
4700 | set_bit(MD_RECOVERY_INTR, &mddev->recovery); | 4765 | set_bit(MD_RECOVERY_INTR, &mddev->recovery); |
4701 | md_unregister_thread(mddev->sync_thread); | 4766 | reap_sync_thread(mddev); |
4702 | mddev->sync_thread = NULL; | ||
4703 | } | 4767 | } |
4704 | 4768 | ||
4705 | del_timer_sync(&mddev->safemode_timer); | 4769 | del_timer_sync(&mddev->safemode_timer); |
@@ -4713,10 +4777,18 @@ void md_stop_writes(mddev_t *mddev) | |||
4713 | md_update_sb(mddev, 1); | 4777 | md_update_sb(mddev, 1); |
4714 | } | 4778 | } |
4715 | } | 4779 | } |
4780 | |||
4781 | void md_stop_writes(mddev_t *mddev) | ||
4782 | { | ||
4783 | mddev_lock(mddev); | ||
4784 | __md_stop_writes(mddev); | ||
4785 | mddev_unlock(mddev); | ||
4786 | } | ||
4716 | EXPORT_SYMBOL_GPL(md_stop_writes); | 4787 | EXPORT_SYMBOL_GPL(md_stop_writes); |
4717 | 4788 | ||
4718 | void md_stop(mddev_t *mddev) | 4789 | void md_stop(mddev_t *mddev) |
4719 | { | 4790 | { |
4791 | mddev->ready = 0; | ||
4720 | mddev->pers->stop(mddev); | 4792 | mddev->pers->stop(mddev); |
4721 | if (mddev->pers->sync_request && mddev->to_remove == NULL) | 4793 | if (mddev->pers->sync_request && mddev->to_remove == NULL) |
4722 | mddev->to_remove = &md_redundancy_group; | 4794 | mddev->to_remove = &md_redundancy_group; |
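The split into __md_stop_writes() and a locking md_stop_writes() wrapper in the hunk above is the usual locked/unlocked pairing: callers that already hold the reconfig mutex (md_set_readonly(), do_md_stop()) use the bare helper, external callers get the wrapper. A rough userspace model of that shape, with invented names and a pthread mutex standing in for mddev_lock():

```c
/* Sketch of the __md_stop_writes()/md_stop_writes() split; types and names
 * are invented for illustration. */
#include <pthread.h>
#include <stdbool.h>

struct array {
	pthread_mutex_t lock;
	bool sync_running;
	bool in_sync;
};

/* caller must already hold a->lock */
static void stop_writes_locked(struct array *a)
{
	if (a->sync_running)
		a->sync_running = false;	/* stands in for reap_sync_thread() */
	a->in_sync = true;			/* stands in for the final md_update_sb() */
}

/* public entry point: internal callers that already hold the lock
 * call stop_writes_locked() directly instead */
void stop_writes(struct array *a)
{
	pthread_mutex_lock(&a->lock);
	stop_writes_locked(a);
	pthread_mutex_unlock(&a->lock);
}
```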
@@ -4736,7 +4808,7 @@ static int md_set_readonly(mddev_t *mddev, int is_open) | |||
4736 | goto out; | 4808 | goto out; |
4737 | } | 4809 | } |
4738 | if (mddev->pers) { | 4810 | if (mddev->pers) { |
4739 | md_stop_writes(mddev); | 4811 | __md_stop_writes(mddev); |
4740 | 4812 | ||
4741 | err = -ENXIO; | 4813 | err = -ENXIO; |
4742 | if (mddev->ro==1) | 4814 | if (mddev->ro==1) |
@@ -4773,10 +4845,9 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) | |||
4773 | if (mddev->ro) | 4845 | if (mddev->ro) |
4774 | set_disk_ro(disk, 0); | 4846 | set_disk_ro(disk, 0); |
4775 | 4847 | ||
4776 | md_stop_writes(mddev); | 4848 | __md_stop_writes(mddev); |
4777 | md_stop(mddev); | 4849 | md_stop(mddev); |
4778 | mddev->queue->merge_bvec_fn = NULL; | 4850 | mddev->queue->merge_bvec_fn = NULL; |
4779 | mddev->queue->unplug_fn = NULL; | ||
4780 | mddev->queue->backing_dev_info.congested_fn = NULL; | 4851 | mddev->queue->backing_dev_info.congested_fn = NULL; |
4781 | 4852 | ||
4782 | /* tell userspace to handle 'inactive' */ | 4853 | /* tell userspace to handle 'inactive' */ |
@@ -4791,6 +4862,7 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) | |||
4791 | 4862 | ||
4792 | set_capacity(disk, 0); | 4863 | set_capacity(disk, 0); |
4793 | mutex_unlock(&mddev->open_mutex); | 4864 | mutex_unlock(&mddev->open_mutex); |
4865 | mddev->changed = 1; | ||
4794 | revalidate_disk(disk); | 4866 | revalidate_disk(disk); |
4795 | 4867 | ||
4796 | if (mddev->ro) | 4868 | if (mddev->ro) |
@@ -5148,17 +5220,31 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) | |||
5148 | PTR_ERR(rdev)); | 5220 | PTR_ERR(rdev)); |
5149 | return PTR_ERR(rdev); | 5221 | return PTR_ERR(rdev); |
5150 | } | 5222 | } |
5151 | /* set save_raid_disk if appropriate */ | 5223 | /* set saved_raid_disk if appropriate */ |
5152 | if (!mddev->persistent) { | 5224 | if (!mddev->persistent) { |
5153 | if (info->state & (1<<MD_DISK_SYNC) && | 5225 | if (info->state & (1<<MD_DISK_SYNC) && |
5154 | info->raid_disk < mddev->raid_disks) | 5226 | info->raid_disk < mddev->raid_disks) { |
5155 | rdev->raid_disk = info->raid_disk; | 5227 | rdev->raid_disk = info->raid_disk; |
5156 | else | 5228 | set_bit(In_sync, &rdev->flags); |
5229 | } else | ||
5157 | rdev->raid_disk = -1; | 5230 | rdev->raid_disk = -1; |
5158 | } else | 5231 | } else |
5159 | super_types[mddev->major_version]. | 5232 | super_types[mddev->major_version]. |
5160 | validate_super(mddev, rdev); | 5233 | validate_super(mddev, rdev); |
5161 | rdev->saved_raid_disk = rdev->raid_disk; | 5234 | if ((info->state & (1<<MD_DISK_SYNC)) && |
5235 | (!test_bit(In_sync, &rdev->flags) || | ||
5236 | rdev->raid_disk != info->raid_disk)) { | ||
5237 | /* This was a hot-add request, but events don't | ||
5238 | * match, so reject it. | ||
5239 | */ | ||
5240 | export_rdev(rdev); | ||
5241 | return -EINVAL; | ||
5242 | } | ||
5243 | |||
5244 | if (test_bit(In_sync, &rdev->flags)) | ||
5245 | rdev->saved_raid_disk = rdev->raid_disk; | ||
5246 | else | ||
5247 | rdev->saved_raid_disk = -1; | ||
5162 | 5248 | ||
5163 | clear_bit(In_sync, &rdev->flags); /* just to be sure */ | 5249 | clear_bit(In_sync, &rdev->flags); /* just to be sure */ |
5164 | if (info->state & (1<<MD_DISK_WRITEMOSTLY)) | 5250 | if (info->state & (1<<MD_DISK_WRITEMOSTLY)) |
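The new add_new_disk() logic in the hunk above rejects a hot-add that claims MD_DISK_SYNC when the validated superblock disagrees, and keeps saved_raid_disk as a recovery hint only for a genuinely in-sync device. A simplified sketch of that decision; the structs are stand-ins, not the real mdu_disk_info_t or mdk_rdev_t layouts:

```c
/* Rough model of the hot-add check: a request that claims the disk is
 * in-sync is rejected unless the validated superblock agrees on both the
 * sync state and the slot.  Fields are simplified stand-ins. */
#include <errno.h>
#include <stdbool.h>

struct disk_info  { bool want_sync; int raid_disk; };
struct rdev_state { bool in_sync;  int raid_disk; int saved_raid_disk; };

int check_hot_add(const struct disk_info *info, struct rdev_state *rdev)
{
	if (info->want_sync &&
	    (!rdev->in_sync || rdev->raid_disk != info->raid_disk))
		return -EINVAL;			/* events don't match: reject */

	/* only a genuinely in-sync device keeps its old slot as a hint for
	 * recovery; otherwise start from scratch */
	rdev->saved_raid_disk = rdev->in_sync ? rdev->raid_disk : -1;
	return 0;
}
```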
@@ -5188,6 +5274,8 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) | |||
5188 | if (mddev->degraded) | 5274 | if (mddev->degraded) |
5189 | set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); | 5275 | set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); |
5190 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | 5276 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); |
5277 | if (!err) | ||
5278 | md_new_event(mddev); | ||
5191 | md_wakeup_thread(mddev->thread); | 5279 | md_wakeup_thread(mddev->thread); |
5192 | return err; | 5280 | return err; |
5193 | } | 5281 | } |
@@ -5225,9 +5313,9 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) | |||
5225 | 5313 | ||
5226 | if (!mddev->persistent) { | 5314 | if (!mddev->persistent) { |
5227 | printk(KERN_INFO "md: nonpersistent superblock ...\n"); | 5315 | printk(KERN_INFO "md: nonpersistent superblock ...\n"); |
5228 | rdev->sb_start = rdev->bdev->bd_inode->i_size / 512; | 5316 | rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512; |
5229 | } else | 5317 | } else |
5230 | rdev->sb_start = calc_dev_sboffset(rdev->bdev); | 5318 | rdev->sb_start = calc_dev_sboffset(rdev); |
5231 | rdev->sectors = rdev->sb_start; | 5319 | rdev->sectors = rdev->sb_start; |
5232 | 5320 | ||
5233 | err = bind_rdev_to_array(rdev, mddev); | 5321 | err = bind_rdev_to_array(rdev, mddev); |
@@ -5294,9 +5382,9 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev) | |||
5294 | } | 5382 | } |
5295 | 5383 | ||
5296 | if (mddev->persistent) | 5384 | if (mddev->persistent) |
5297 | rdev->sb_start = calc_dev_sboffset(rdev->bdev); | 5385 | rdev->sb_start = calc_dev_sboffset(rdev); |
5298 | else | 5386 | else |
5299 | rdev->sb_start = rdev->bdev->bd_inode->i_size / 512; | 5387 | rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512; |
5300 | 5388 | ||
5301 | rdev->sectors = rdev->sb_start; | 5389 | rdev->sectors = rdev->sb_start; |
5302 | 5390 | ||
@@ -5507,7 +5595,6 @@ static int update_size(mddev_t *mddev, sector_t num_sectors) | |||
5507 | * sb_start or, if that is <data_offset, it must fit before the size | 5595 | * sb_start or, if that is <data_offset, it must fit before the size |
5508 | * of each device. If num_sectors is zero, we find the largest size | 5596 | * of each device. If num_sectors is zero, we find the largest size |
5509 | * that fits. | 5597 | * that fits. |
5510 | |||
5511 | */ | 5598 | */ |
5512 | if (mddev->sync_thread) | 5599 | if (mddev->sync_thread) |
5513 | return -EBUSY; | 5600 | return -EBUSY; |
@@ -5544,6 +5631,8 @@ static int update_raid_disks(mddev_t *mddev, int raid_disks) | |||
5544 | mddev->delta_disks = raid_disks - mddev->raid_disks; | 5631 | mddev->delta_disks = raid_disks - mddev->raid_disks; |
5545 | 5632 | ||
5546 | rv = mddev->pers->check_reshape(mddev); | 5633 | rv = mddev->pers->check_reshape(mddev); |
5634 | if (rv < 0) | ||
5635 | mddev->delta_disks = 0; | ||
5547 | return rv; | 5636 | return rv; |
5548 | } | 5637 | } |
5549 | 5638 | ||
@@ -5951,16 +6040,14 @@ static int md_open(struct block_device *bdev, fmode_t mode) | |||
5951 | mddev_t *mddev = mddev_find(bdev->bd_dev); | 6040 | mddev_t *mddev = mddev_find(bdev->bd_dev); |
5952 | int err; | 6041 | int err; |
5953 | 6042 | ||
5954 | lock_kernel(); | ||
5955 | if (mddev->gendisk != bdev->bd_disk) { | 6043 | if (mddev->gendisk != bdev->bd_disk) { |
5956 | /* we are racing with mddev_put which is discarding this | 6044 | /* we are racing with mddev_put which is discarding this |
5957 | * bd_disk. | 6045 | * bd_disk. |
5958 | */ | 6046 | */ |
5959 | mddev_put(mddev); | 6047 | mddev_put(mddev); |
5960 | /* Wait until bdev->bd_disk is definitely gone */ | 6048 | /* Wait until bdev->bd_disk is definitely gone */ |
5961 | flush_scheduled_work(); | 6049 | flush_workqueue(md_misc_wq); |
5962 | /* Then retry the open from the top */ | 6050 | /* Then retry the open from the top */ |
5963 | unlock_kernel(); | ||
5964 | return -ERESTARTSYS; | 6051 | return -ERESTARTSYS; |
5965 | } | 6052 | } |
5966 | BUG_ON(mddev != bdev->bd_disk->private_data); | 6053 | BUG_ON(mddev != bdev->bd_disk->private_data); |
@@ -5972,9 +6059,8 @@ static int md_open(struct block_device *bdev, fmode_t mode) | |||
5972 | atomic_inc(&mddev->openers); | 6059 | atomic_inc(&mddev->openers); |
5973 | mutex_unlock(&mddev->open_mutex); | 6060 | mutex_unlock(&mddev->open_mutex); |
5974 | 6061 | ||
5975 | check_disk_size_change(mddev->gendisk, bdev); | 6062 | check_disk_change(bdev); |
5976 | out: | 6063 | out: |
5977 | unlock_kernel(); | ||
5978 | return err; | 6064 | return err; |
5979 | } | 6065 | } |
5980 | 6066 | ||
@@ -5983,13 +6069,26 @@ static int md_release(struct gendisk *disk, fmode_t mode) | |||
5983 | mddev_t *mddev = disk->private_data; | 6069 | mddev_t *mddev = disk->private_data; |
5984 | 6070 | ||
5985 | BUG_ON(!mddev); | 6071 | BUG_ON(!mddev); |
5986 | lock_kernel(); | ||
5987 | atomic_dec(&mddev->openers); | 6072 | atomic_dec(&mddev->openers); |
5988 | mddev_put(mddev); | 6073 | mddev_put(mddev); |
5989 | unlock_kernel(); | ||
5990 | 6074 | ||
5991 | return 0; | 6075 | return 0; |
5992 | } | 6076 | } |
6077 | |||
6078 | static int md_media_changed(struct gendisk *disk) | ||
6079 | { | ||
6080 | mddev_t *mddev = disk->private_data; | ||
6081 | |||
6082 | return mddev->changed; | ||
6083 | } | ||
6084 | |||
6085 | static int md_revalidate(struct gendisk *disk) | ||
6086 | { | ||
6087 | mddev_t *mddev = disk->private_data; | ||
6088 | |||
6089 | mddev->changed = 0; | ||
6090 | return 0; | ||
6091 | } | ||
5993 | static const struct block_device_operations md_fops = | 6092 | static const struct block_device_operations md_fops = |
5994 | { | 6093 | { |
5995 | .owner = THIS_MODULE, | 6094 | .owner = THIS_MODULE, |
@@ -6000,6 +6099,8 @@ static const struct block_device_operations md_fops = | |||
6000 | .compat_ioctl = md_compat_ioctl, | 6099 | .compat_ioctl = md_compat_ioctl, |
6001 | #endif | 6100 | #endif |
6002 | .getgeo = md_getgeo, | 6101 | .getgeo = md_getgeo, |
6102 | .media_changed = md_media_changed, | ||
6103 | .revalidate_disk= md_revalidate, | ||
6003 | }; | 6104 | }; |
6004 | 6105 | ||
6005 | static int md_thread(void * arg) | 6106 | static int md_thread(void * arg) |
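The md_media_changed()/md_revalidate() hooks wired into md_fops above give the block layer a way to notice that the array size changed: the run/stop/resize paths set mddev->changed, media_changed() reports it, and revalidate clears it again. A plain-C model of that handshake; the names are illustrative only, not the kernel interface:

```c
/* Sketch of the ->media_changed/->revalidate_disk handshake added to
 * md_fops, modelled in userspace. */
#include <stdbool.h>

struct array {
	bool changed;		/* set by run/stop/resize paths */
	long long sectors;
};

void array_resize(struct array *a, long long sectors)
{
	a->sectors = sectors;
	a->changed = true;	/* mirrors mddev->changed = 1 after set_capacity() */
}

int media_changed(struct array *a)
{
	return a->changed;	/* tells the caller partitions may need rereading */
}

int revalidate(struct array *a)
{
	a->changed = false;	/* acknowledged; the next open sees a stable size */
	return 0;
}
```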
@@ -6036,8 +6137,8 @@ static int md_thread(void * arg) | |||
6036 | thread->timeout); | 6137 | thread->timeout); |
6037 | 6138 | ||
6038 | clear_bit(THREAD_WAKEUP, &thread->flags); | 6139 | clear_bit(THREAD_WAKEUP, &thread->flags); |
6039 | 6140 | if (!kthread_should_stop()) | |
6040 | thread->run(thread->mddev); | 6141 | thread->run(thread->mddev); |
6041 | } | 6142 | } |
6042 | 6143 | ||
6043 | return 0; | 6144 | return 0; |
@@ -6118,7 +6219,7 @@ void md_error(mddev_t *mddev, mdk_rdev_t *rdev) | |||
6118 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | 6219 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); |
6119 | md_wakeup_thread(mddev->thread); | 6220 | md_wakeup_thread(mddev->thread); |
6120 | if (mddev->event_work.func) | 6221 | if (mddev->event_work.func) |
6121 | schedule_work(&mddev->event_work); | 6222 | queue_work(md_misc_wq, &mddev->event_work); |
6122 | md_new_event_inintr(mddev); | 6223 | md_new_event_inintr(mddev); |
6123 | } | 6224 | } |
6124 | 6225 | ||
@@ -6209,7 +6310,7 @@ static void status_resync(struct seq_file *seq, mddev_t * mddev) | |||
6209 | * rt is a sector_t, so could be 32bit or 64bit. | 6310 | * rt is a sector_t, so could be 32bit or 64bit. |
6210 | * So we divide before multiply in case it is 32bit and close | 6311 | * So we divide before multiply in case it is 32bit and close |
6211 | * to the limit. | 6312 | * to the limit. |
6212 | * We scale the divisor (db) by 32 to avoid loosing precision | 6313 | * We scale the divisor (db) by 32 to avoid losing precision |
6213 | * near the end of resync when the number of remaining sectors | 6314 | * near the end of resync when the number of remaining sectors |
6214 | * is close to 'db'. | 6315 | * is close to 'db'. |
6215 | * We then divide rt by 32 after multiplying by db to compensate. | 6316 | * We then divide rt by 32 after multiplying by db to compensate. |
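The comment above explains why status_resync() divides before multiplying and scales the divisor by 32. A small standalone illustration of that arithmetic with made-up numbers; this shows the idea, not the exact kernel expression:

```c
/* Divide-before-multiply with a x32-scaled divisor: keeps the intermediate
 * value small on a 32-bit sector_t while not losing precision when the
 * remaining count gets close to the per-interval throughput. */
#include <stdint.h>
#include <stdio.h>

static uint32_t eta_seconds(uint32_t remaining, uint32_t db, uint32_t dt)
{
	uint32_t rt = remaining;

	rt /= db / 32 + 1;	/* divide first so rt stays small ... */
	rt *= dt;		/* ... then multiply by the elapsed time */
	return rt >> 5;		/* undo the x32 scaling of the divisor */
}

int main(void)
{
	/* 1,000,000 sectors left, 2,000 sectors synced over the last 3 seconds */
	printf("eta ~ %u min\n", eta_seconds(1000000, 2000, 3) / 60);
	return 0;
}
```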
@@ -6631,14 +6732,6 @@ int md_allow_write(mddev_t *mddev) | |||
6631 | } | 6732 | } |
6632 | EXPORT_SYMBOL_GPL(md_allow_write); | 6733 | EXPORT_SYMBOL_GPL(md_allow_write); |
6633 | 6734 | ||
6634 | void md_unplug(mddev_t *mddev) | ||
6635 | { | ||
6636 | if (mddev->queue) | ||
6637 | blk_unplug(mddev->queue); | ||
6638 | if (mddev->plug) | ||
6639 | mddev->plug->unplug_fn(mddev->plug); | ||
6640 | } | ||
6641 | |||
6642 | #define SYNC_MARKS 10 | 6735 | #define SYNC_MARKS 10 |
6643 | #define SYNC_MARK_STEP (3*HZ) | 6736 | #define SYNC_MARK_STEP (3*HZ) |
6644 | void md_do_sync(mddev_t *mddev) | 6737 | void md_do_sync(mddev_t *mddev) |
@@ -6790,8 +6883,8 @@ void md_do_sync(mddev_t *mddev) | |||
6790 | * Tune reconstruction: | 6883 | * Tune reconstruction: |
6791 | */ | 6884 | */ |
6792 | window = 32*(PAGE_SIZE/512); | 6885 | window = 32*(PAGE_SIZE/512); |
6793 | printk(KERN_INFO "md: using %dk window, over a total of %llu blocks.\n", | 6886 | printk(KERN_INFO "md: using %dk window, over a total of %lluk.\n", |
6794 | window/2,(unsigned long long) max_sectors/2); | 6887 | window/2, (unsigned long long)max_sectors/2); |
6795 | 6888 | ||
6796 | atomic_set(&mddev->recovery_active, 0); | 6889 | atomic_set(&mddev->recovery_active, 0); |
6797 | last_check = 0; | 6890 | last_check = 0; |
@@ -6802,7 +6895,7 @@ void md_do_sync(mddev_t *mddev) | |||
6802 | desc, mdname(mddev)); | 6895 | desc, mdname(mddev)); |
6803 | mddev->curr_resync = j; | 6896 | mddev->curr_resync = j; |
6804 | } | 6897 | } |
6805 | mddev->curr_resync_completed = mddev->curr_resync; | 6898 | mddev->curr_resync_completed = j; |
6806 | 6899 | ||
6807 | while (j < max_sectors) { | 6900 | while (j < max_sectors) { |
6808 | sector_t sectors; | 6901 | sector_t sectors; |
@@ -6817,11 +6910,9 @@ void md_do_sync(mddev_t *mddev) | |||
6817 | >= mddev->resync_max - mddev->curr_resync_completed | 6910 | >= mddev->resync_max - mddev->curr_resync_completed |
6818 | )) { | 6911 | )) { |
6819 | /* time to update curr_resync_completed */ | 6912 | /* time to update curr_resync_completed */ |
6820 | md_unplug(mddev); | ||
6821 | wait_event(mddev->recovery_wait, | 6913 | wait_event(mddev->recovery_wait, |
6822 | atomic_read(&mddev->recovery_active) == 0); | 6914 | atomic_read(&mddev->recovery_active) == 0); |
6823 | mddev->curr_resync_completed = | 6915 | mddev->curr_resync_completed = j; |
6824 | mddev->curr_resync; | ||
6825 | set_bit(MD_CHANGE_CLEAN, &mddev->flags); | 6916 | set_bit(MD_CHANGE_CLEAN, &mddev->flags); |
6826 | sysfs_notify(&mddev->kobj, NULL, "sync_completed"); | 6917 | sysfs_notify(&mddev->kobj, NULL, "sync_completed"); |
6827 | } | 6918 | } |
@@ -6894,7 +6985,6 @@ void md_do_sync(mddev_t *mddev) | |||
6894 | * about not overloading the IO subsystem. (things like an | 6985 | * about not overloading the IO subsystem. (things like an |
6895 | * e2fsck being done on the RAID array should execute fast) | 6986 | * e2fsck being done on the RAID array should execute fast) |
6896 | */ | 6987 | */ |
6897 | md_unplug(mddev); | ||
6898 | cond_resched(); | 6988 | cond_resched(); |
6899 | 6989 | ||
6900 | currspeed = ((unsigned long)(io_sectors-mddev->resync_mark_cnt))/2 | 6990 | currspeed = ((unsigned long)(io_sectors-mddev->resync_mark_cnt))/2 |
@@ -6913,8 +7003,6 @@ void md_do_sync(mddev_t *mddev) | |||
6913 | * this also signals 'finished resyncing' to md_stop | 7003 | * this also signals 'finished resyncing' to md_stop |
6914 | */ | 7004 | */ |
6915 | out: | 7005 | out: |
6916 | md_unplug(mddev); | ||
6917 | |||
6918 | wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); | 7006 | wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); |
6919 | 7007 | ||
6920 | /* tell personality that we are finished */ | 7008 | /* tell personality that we are finished */ |
@@ -6957,9 +7045,6 @@ void md_do_sync(mddev_t *mddev) | |||
6957 | } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) | 7045 | } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) |
6958 | mddev->resync_min = mddev->curr_resync_completed; | 7046 | mddev->resync_min = mddev->curr_resync_completed; |
6959 | mddev->curr_resync = 0; | 7047 | mddev->curr_resync = 0; |
6960 | if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) | ||
6961 | mddev->curr_resync_completed = 0; | ||
6962 | sysfs_notify(&mddev->kobj, NULL, "sync_completed"); | ||
6963 | wake_up(&resync_wait); | 7048 | wake_up(&resync_wait); |
6964 | set_bit(MD_RECOVERY_DONE, &mddev->recovery); | 7049 | set_bit(MD_RECOVERY_DONE, &mddev->recovery); |
6965 | md_wakeup_thread(mddev->thread); | 7050 | md_wakeup_thread(mddev->thread); |
@@ -6977,7 +7062,6 @@ void md_do_sync(mddev_t *mddev) | |||
6977 | } | 7062 | } |
6978 | EXPORT_SYMBOL_GPL(md_do_sync); | 7063 | EXPORT_SYMBOL_GPL(md_do_sync); |
6979 | 7064 | ||
6980 | |||
6981 | static int remove_and_add_spares(mddev_t *mddev) | 7065 | static int remove_and_add_spares(mddev_t *mddev) |
6982 | { | 7066 | { |
6983 | mdk_rdev_t *rdev; | 7067 | mdk_rdev_t *rdev; |
@@ -7000,10 +7084,11 @@ static int remove_and_add_spares(mddev_t *mddev) | |||
7000 | } | 7084 | } |
7001 | } | 7085 | } |
7002 | 7086 | ||
7003 | if (mddev->degraded && ! mddev->ro && !mddev->recovery_disabled) { | 7087 | if (mddev->degraded && !mddev->recovery_disabled) { |
7004 | list_for_each_entry(rdev, &mddev->disks, same_set) { | 7088 | list_for_each_entry(rdev, &mddev->disks, same_set) { |
7005 | if (rdev->raid_disk >= 0 && | 7089 | if (rdev->raid_disk >= 0 && |
7006 | !test_bit(In_sync, &rdev->flags) && | 7090 | !test_bit(In_sync, &rdev->flags) && |
7091 | !test_bit(Faulty, &rdev->flags) && | ||
7007 | !test_bit(Blocked, &rdev->flags)) | 7092 | !test_bit(Blocked, &rdev->flags)) |
7008 | spares++; | 7093 | spares++; |
7009 | if (rdev->raid_disk < 0 | 7094 | if (rdev->raid_disk < 0 |
@@ -7026,6 +7111,45 @@ static int remove_and_add_spares(mddev_t *mddev) | |||
7026 | } | 7111 | } |
7027 | return spares; | 7112 | return spares; |
7028 | } | 7113 | } |
7114 | |||
7115 | static void reap_sync_thread(mddev_t *mddev) | ||
7116 | { | ||
7117 | mdk_rdev_t *rdev; | ||
7118 | |||
7119 | /* resync has finished, collect result */ | ||
7120 | md_unregister_thread(mddev->sync_thread); | ||
7121 | mddev->sync_thread = NULL; | ||
7122 | if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && | ||
7123 | !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { | ||
7124 | /* success...*/ | ||
7125 | /* activate any spares */ | ||
7126 | if (mddev->pers->spare_active(mddev)) | ||
7127 | sysfs_notify(&mddev->kobj, NULL, | ||
7128 | "degraded"); | ||
7129 | } | ||
7130 | if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && | ||
7131 | mddev->pers->finish_reshape) | ||
7132 | mddev->pers->finish_reshape(mddev); | ||
7133 | md_update_sb(mddev, 1); | ||
7134 | |||
7135 | /* if array is no-longer degraded, then any saved_raid_disk | ||
7136 | * information must be scrapped | ||
7137 | */ | ||
7138 | if (!mddev->degraded) | ||
7139 | list_for_each_entry(rdev, &mddev->disks, same_set) | ||
7140 | rdev->saved_raid_disk = -1; | ||
7141 | |||
7142 | clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); | ||
7143 | clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); | ||
7144 | clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); | ||
7145 | clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); | ||
7146 | clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); | ||
7147 | /* flag recovery needed just to double check */ | ||
7148 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | ||
7149 | sysfs_notify_dirent_safe(mddev->sysfs_action); | ||
7150 | md_new_event(mddev); | ||
7151 | } | ||
7152 | |||
7029 | /* | 7153 | /* |
7030 | * This routine is regularly called by all per-raid-array threads to | 7154 | * This routine is regularly called by all per-raid-array threads to |
7031 | * deal with generic issues like resync and super-block update. | 7155 | * deal with generic issues like resync and super-block update. |
@@ -7050,8 +7174,8 @@ static int remove_and_add_spares(mddev_t *mddev) | |||
7050 | */ | 7174 | */ |
7051 | void md_check_recovery(mddev_t *mddev) | 7175 | void md_check_recovery(mddev_t *mddev) |
7052 | { | 7176 | { |
7053 | mdk_rdev_t *rdev; | 7177 | if (mddev->suspended) |
7054 | 7178 | return; | |
7055 | 7179 | ||
7056 | if (mddev->bitmap) | 7180 | if (mddev->bitmap) |
7057 | bitmap_daemon_work(mddev); | 7181 | bitmap_daemon_work(mddev); |
@@ -7087,7 +7211,20 @@ void md_check_recovery(mddev_t *mddev) | |||
7087 | /* Only thing we do on a ro array is remove | 7211 | /* Only thing we do on a ro array is remove |
7088 | * failed devices. | 7212 | * failed devices. |
7089 | */ | 7213 | */ |
7090 | remove_and_add_spares(mddev); | 7214 | mdk_rdev_t *rdev; |
7215 | list_for_each_entry(rdev, &mddev->disks, same_set) | ||
7216 | if (rdev->raid_disk >= 0 && | ||
7217 | !test_bit(Blocked, &rdev->flags) && | ||
7218 | test_bit(Faulty, &rdev->flags) && | ||
7219 | atomic_read(&rdev->nr_pending)==0) { | ||
7220 | if (mddev->pers->hot_remove_disk( | ||
7221 | mddev, rdev->raid_disk)==0) { | ||
7222 | char nm[20]; | ||
7223 | sprintf(nm,"rd%d", rdev->raid_disk); | ||
7224 | sysfs_remove_link(&mddev->kobj, nm); | ||
7225 | rdev->raid_disk = -1; | ||
7226 | } | ||
7227 | } | ||
7091 | clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | 7228 | clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); |
7092 | goto unlock; | 7229 | goto unlock; |
7093 | } | 7230 | } |
@@ -7120,34 +7257,7 @@ void md_check_recovery(mddev_t *mddev) | |||
7120 | goto unlock; | 7257 | goto unlock; |
7121 | } | 7258 | } |
7122 | if (mddev->sync_thread) { | 7259 | if (mddev->sync_thread) { |
7123 | /* resync has finished, collect result */ | 7260 | reap_sync_thread(mddev); |
7124 | md_unregister_thread(mddev->sync_thread); | ||
7125 | mddev->sync_thread = NULL; | ||
7126 | if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && | ||
7127 | !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { | ||
7128 | /* success...*/ | ||
7129 | /* activate any spares */ | ||
7130 | if (mddev->pers->spare_active(mddev)) | ||
7131 | sysfs_notify(&mddev->kobj, NULL, | ||
7132 | "degraded"); | ||
7133 | } | ||
7134 | if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && | ||
7135 | mddev->pers->finish_reshape) | ||
7136 | mddev->pers->finish_reshape(mddev); | ||
7137 | md_update_sb(mddev, 1); | ||
7138 | |||
7139 | /* if array is no-longer degraded, then any saved_raid_disk | ||
7140 | * information must be scrapped | ||
7141 | */ | ||
7142 | if (!mddev->degraded) | ||
7143 | list_for_each_entry(rdev, &mddev->disks, same_set) | ||
7144 | rdev->saved_raid_disk = -1; | ||
7145 | |||
7146 | mddev->recovery = 0; | ||
7147 | /* flag recovery needed just to double check */ | ||
7148 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | ||
7149 | sysfs_notify_dirent_safe(mddev->sysfs_action); | ||
7150 | md_new_event(mddev); | ||
7151 | goto unlock; | 7261 | goto unlock; |
7152 | } | 7262 | } |
7153 | /* Set RUNNING before clearing NEEDED to avoid | 7263 | /* Set RUNNING before clearing NEEDED to avoid |
@@ -7205,7 +7315,11 @@ void md_check_recovery(mddev_t *mddev) | |||
7205 | " thread...\n", | 7315 | " thread...\n", |
7206 | mdname(mddev)); | 7316 | mdname(mddev)); |
7207 | /* leave the spares where they are, it shouldn't hurt */ | 7317 | /* leave the spares where they are, it shouldn't hurt */ |
7208 | mddev->recovery = 0; | 7318 | clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); |
7319 | clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); | ||
7320 | clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); | ||
7321 | clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); | ||
7322 | clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); | ||
7209 | } else | 7323 | } else |
7210 | md_wakeup_thread(mddev->sync_thread); | 7324 | md_wakeup_thread(mddev->sync_thread); |
7211 | sysfs_notify_dirent_safe(mddev->sysfs_action); | 7325 | sysfs_notify_dirent_safe(mddev->sysfs_action); |
@@ -7278,12 +7392,23 @@ static void md_geninit(void) | |||
7278 | 7392 | ||
7279 | static int __init md_init(void) | 7393 | static int __init md_init(void) |
7280 | { | 7394 | { |
7281 | if (register_blkdev(MD_MAJOR, "md")) | 7395 | int ret = -ENOMEM; |
7282 | return -1; | 7396 | |
7283 | if ((mdp_major=register_blkdev(0, "mdp"))<=0) { | 7397 | md_wq = alloc_workqueue("md", WQ_MEM_RECLAIM, 0); |
7284 | unregister_blkdev(MD_MAJOR, "md"); | 7398 | if (!md_wq) |
7285 | return -1; | 7399 | goto err_wq; |
7286 | } | 7400 | |
7401 | md_misc_wq = alloc_workqueue("md_misc", 0, 0); | ||
7402 | if (!md_misc_wq) | ||
7403 | goto err_misc_wq; | ||
7404 | |||
7405 | if ((ret = register_blkdev(MD_MAJOR, "md")) < 0) | ||
7406 | goto err_md; | ||
7407 | |||
7408 | if ((ret = register_blkdev(0, "mdp")) < 0) | ||
7409 | goto err_mdp; | ||
7410 | mdp_major = ret; | ||
7411 | |||
7287 | blk_register_region(MKDEV(MD_MAJOR, 0), 1UL<<MINORBITS, THIS_MODULE, | 7412 | blk_register_region(MKDEV(MD_MAJOR, 0), 1UL<<MINORBITS, THIS_MODULE, |
7288 | md_probe, NULL, NULL); | 7413 | md_probe, NULL, NULL); |
7289 | blk_register_region(MKDEV(mdp_major, 0), 1UL<<MINORBITS, THIS_MODULE, | 7414 | blk_register_region(MKDEV(mdp_major, 0), 1UL<<MINORBITS, THIS_MODULE, |
@@ -7294,8 +7419,16 @@ static int __init md_init(void) | |||
7294 | 7419 | ||
7295 | md_geninit(); | 7420 | md_geninit(); |
7296 | return 0; | 7421 | return 0; |
7297 | } | ||
7298 | 7422 | ||
7423 | err_mdp: | ||
7424 | unregister_blkdev(MD_MAJOR, "md"); | ||
7425 | err_md: | ||
7426 | destroy_workqueue(md_misc_wq); | ||
7427 | err_misc_wq: | ||
7428 | destroy_workqueue(md_wq); | ||
7429 | err_wq: | ||
7430 | return ret; | ||
7431 | } | ||
7299 | 7432 | ||
7300 | #ifndef MODULE | 7433 | #ifndef MODULE |
7301 | 7434 | ||
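The reworked md_init() above unwinds with cascading goto labels so each resource is released in reverse order of acquisition. A generic sketch of that error-path shape, with placeholder helpers rather than the real workqueue and blkdev calls:

```c
/* Generic shape of a goto-based init error path: undo only what succeeded,
 * in reverse order.  The init_xxx/cleanup_xxx helpers are placeholders. */
#include <stdbool.h>

static bool init_wq(void)      { return true; }
static bool init_misc_wq(void) { return true; }
static bool init_blkdev(void)  { return true; }
static void cleanup_wq(void)      { }
static void cleanup_misc_wq(void) { }

int subsystem_init(void)
{
	int ret = -1;

	if (!init_wq())
		goto err_wq;
	if (!init_misc_wq())
		goto err_misc_wq;
	if (!init_blkdev())
		goto err_blkdev;

	return 0;		/* everything registered */

err_blkdev:
	cleanup_misc_wq();	/* undo only what succeeded, in reverse order */
err_misc_wq:
	cleanup_wq();
err_wq:
	return ret;
}
```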
@@ -7382,6 +7515,8 @@ static __exit void md_exit(void) | |||
7382 | export_array(mddev); | 7515 | export_array(mddev); |
7383 | mddev->hold_active = 0; | 7516 | mddev->hold_active = 0; |
7384 | } | 7517 | } |
7518 | destroy_workqueue(md_misc_wq); | ||
7519 | destroy_workqueue(md_wq); | ||
7385 | } | 7520 | } |
7386 | 7521 | ||
7387 | subsys_initcall(md_init); | 7522 | subsys_initcall(md_init); |
diff --git a/drivers/md/md.h b/drivers/md/md.h index 3931299788dc..1c26c7a08ae6 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h | |||
@@ -29,26 +29,6 @@ | |||
29 | typedef struct mddev_s mddev_t; | 29 | typedef struct mddev_s mddev_t; |
30 | typedef struct mdk_rdev_s mdk_rdev_t; | 30 | typedef struct mdk_rdev_s mdk_rdev_t; |
31 | 31 | ||
32 | /* generic plugging support - like that provided with request_queue, | ||
33 | * but does not require a request_queue | ||
34 | */ | ||
35 | struct plug_handle { | ||
36 | void (*unplug_fn)(struct plug_handle *); | ||
37 | struct timer_list unplug_timer; | ||
38 | struct work_struct unplug_work; | ||
39 | unsigned long unplug_flag; | ||
40 | }; | ||
41 | #define PLUGGED_FLAG 1 | ||
42 | void plugger_init(struct plug_handle *plug, | ||
43 | void (*unplug_fn)(struct plug_handle *)); | ||
44 | void plugger_set_plug(struct plug_handle *plug); | ||
45 | int plugger_remove_plug(struct plug_handle *plug); | ||
46 | static inline void plugger_flush(struct plug_handle *plug) | ||
47 | { | ||
48 | del_timer_sync(&plug->unplug_timer); | ||
49 | cancel_work_sync(&plug->unplug_work); | ||
50 | } | ||
51 | |||
52 | /* | 32 | /* |
53 | * MD's 'extended' device | 33 | * MD's 'extended' device |
54 | */ | 34 | */ |
@@ -60,6 +40,12 @@ struct mdk_rdev_s | |||
60 | mddev_t *mddev; /* RAID array if running */ | 40 | mddev_t *mddev; /* RAID array if running */ |
61 | int last_events; /* IO event timestamp */ | 41 | int last_events; /* IO event timestamp */ |
62 | 42 | ||
43 | /* | ||
44 | * If meta_bdev is non-NULL, it means that a separate device is | ||
45 | * being used to store the metadata (superblock/bitmap) which | ||
46 | * would otherwise be contained on the same device as the data (bdev). | ||
47 | */ | ||
48 | struct block_device *meta_bdev; | ||
63 | struct block_device *bdev; /* block device handle */ | 49 | struct block_device *bdev; /* block device handle */ |
64 | 50 | ||
65 | struct page *sb_page; | 51 | struct page *sb_page; |
@@ -87,11 +73,8 @@ struct mdk_rdev_s | |||
87 | #define Faulty 1 /* device is known to have a fault */ | 73 | #define Faulty 1 /* device is known to have a fault */ |
88 | #define In_sync 2 /* device is in_sync with rest of array */ | 74 | #define In_sync 2 /* device is in_sync with rest of array */ |
89 | #define WriteMostly 4 /* Avoid reading if at all possible */ | 75 | #define WriteMostly 4 /* Avoid reading if at all possible */ |
90 | #define BarriersNotsupp 5 /* REQ_HARDBARRIER is not supported */ | ||
91 | #define AllReserved 6 /* If whole device is reserved for | ||
92 | * one array */ | ||
93 | #define AutoDetected 7 /* added by auto-detect */ | 76 | #define AutoDetected 7 /* added by auto-detect */ |
94 | #define Blocked 8 /* An error occured on an externally | 77 | #define Blocked 8 /* An error occurred on an externally |
95 | * managed array, don't allow writes | 78 | * managed array, don't allow writes |
96 | * until it is cleared */ | 79 | * until it is cleared */ |
97 | wait_queue_head_t blocked_wait; | 80 | wait_queue_head_t blocked_wait; |
@@ -141,6 +124,7 @@ struct mddev_s | |||
141 | #define MD_CHANGE_DEVS 0 /* Some device status has changed */ | 124 | #define MD_CHANGE_DEVS 0 /* Some device status has changed */ |
142 | #define MD_CHANGE_CLEAN 1 /* transition to or from 'clean' */ | 125 | #define MD_CHANGE_CLEAN 1 /* transition to or from 'clean' */ |
143 | #define MD_CHANGE_PENDING 2 /* switch from 'clean' to 'active' in progress */ | 126 | #define MD_CHANGE_PENDING 2 /* switch from 'clean' to 'active' in progress */ |
127 | #define MD_ARRAY_FIRST_USE 3 /* First use of array, needs initialization */ | ||
144 | 128 | ||
145 | int suspended; | 129 | int suspended; |
146 | atomic_t active_io; | 130 | atomic_t active_io; |
@@ -149,7 +133,8 @@ struct mddev_s | |||
149 | * are happening, so run/ | 133 | * are happening, so run/ |
150 | * takeover/stop are not safe | 134 | * takeover/stop are not safe |
151 | */ | 135 | */ |
152 | 136 | int ready; /* See when safe to pass | |
137 | * IO requests down */ | ||
153 | struct gendisk *gendisk; | 138 | struct gendisk *gendisk; |
154 | 139 | ||
155 | struct kobject kobj; | 140 | struct kobject kobj; |
@@ -195,6 +180,9 @@ struct mddev_s | |||
195 | int delta_disks, new_level, new_layout; | 180 | int delta_disks, new_level, new_layout; |
196 | int new_chunk_sectors; | 181 | int new_chunk_sectors; |
197 | 182 | ||
183 | atomic_t plug_cnt; /* If device is expecting | ||
184 | * more bios soon. | ||
185 | */ | ||
198 | struct mdk_thread_s *thread; /* management thread */ | 186 | struct mdk_thread_s *thread; /* management thread */ |
199 | struct mdk_thread_s *sync_thread; /* doing resync or reconstruct */ | 187 | struct mdk_thread_s *sync_thread; /* doing resync or reconstruct */ |
200 | sector_t curr_resync; /* last block scheduled */ | 188 | sector_t curr_resync; /* last block scheduled */ |
@@ -270,16 +258,11 @@ struct mddev_s | |||
270 | atomic_t active; /* general refcount */ | 258 | atomic_t active; /* general refcount */ |
271 | atomic_t openers; /* number of active opens */ | 259 | atomic_t openers; /* number of active opens */ |
272 | 260 | ||
261 | int changed; /* True if we might need to | ||
262 | * reread partition info */ | ||
273 | int degraded; /* whether md should consider | 263 | int degraded; /* whether md should consider |
274 | * adding a spare | 264 | * adding a spare |
275 | */ | 265 | */ |
276 | int barriers_work; /* initialised to true, cleared as soon | ||
277 | * as a barrier request to slave | ||
278 | * fails. Only supported | ||
279 | */ | ||
280 | struct bio *biolist; /* bios that need to be retried | ||
281 | * because REQ_HARDBARRIER is not supported | ||
282 | */ | ||
283 | 266 | ||
284 | atomic_t recovery_active; /* blocks scheduled, but not written */ | 267 | atomic_t recovery_active; /* blocks scheduled, but not written */ |
285 | wait_queue_head_t recovery_wait; | 268 | wait_queue_head_t recovery_wait; |
@@ -337,19 +320,18 @@ struct mddev_s | |||
337 | struct list_head all_mddevs; | 320 | struct list_head all_mddevs; |
338 | 321 | ||
339 | struct attribute_group *to_remove; | 322 | struct attribute_group *to_remove; |
340 | struct plug_handle *plug; /* if used by personality */ | 323 | |
341 | 324 | struct bio_set *bio_set; | |
342 | /* Generic barrier handling. | 325 | |
343 | * If there is a pending barrier request, all other | 326 | /* Generic flush handling. |
344 | * writes are blocked while the devices are flushed. | 327 | * The last to finish preflush schedules a worker to submit |
345 | * The last to finish a flush schedules a worker to | 328 | * the rest of the request (without the REQ_FLUSH flag). |
346 | * submit the barrier request (without the barrier flag), | ||
347 | * then submit more flush requests. | ||
348 | */ | 329 | */ |
349 | struct bio *barrier; | 330 | struct bio *flush_bio; |
350 | atomic_t flush_pending; | 331 | atomic_t flush_pending; |
351 | struct work_struct barrier_work; | 332 | struct work_struct flush_work; |
352 | struct work_struct event_work; /* used by dm to report failure event */ | 333 | struct work_struct event_work; /* used by dm to report failure event */ |
334 | void (*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev); | ||
353 | }; | 335 | }; |
354 | 336 | ||
355 | 337 | ||
@@ -502,12 +484,12 @@ extern void md_done_sync(mddev_t *mddev, int blocks, int ok); | |||
502 | extern void md_error(mddev_t *mddev, mdk_rdev_t *rdev); | 484 | extern void md_error(mddev_t *mddev, mdk_rdev_t *rdev); |
503 | 485 | ||
504 | extern int mddev_congested(mddev_t *mddev, int bits); | 486 | extern int mddev_congested(mddev_t *mddev, int bits); |
505 | extern void md_barrier_request(mddev_t *mddev, struct bio *bio); | 487 | extern void md_flush_request(mddev_t *mddev, struct bio *bio); |
506 | extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, | 488 | extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, |
507 | sector_t sector, int size, struct page *page); | 489 | sector_t sector, int size, struct page *page); |
508 | extern void md_super_wait(mddev_t *mddev); | 490 | extern void md_super_wait(mddev_t *mddev); |
509 | extern int sync_page_io(struct block_device *bdev, sector_t sector, int size, | 491 | extern int sync_page_io(mdk_rdev_t *rdev, sector_t sector, int size, |
510 | struct page *page, int rw); | 492 | struct page *page, int rw, bool metadata_op); |
511 | extern void md_do_sync(mddev_t *mddev); | 493 | extern void md_do_sync(mddev_t *mddev); |
512 | extern void md_new_event(mddev_t *mddev); | 494 | extern void md_new_event(mddev_t *mddev); |
513 | extern int md_allow_write(mddev_t *mddev); | 495 | extern int md_allow_write(mddev_t *mddev); |
@@ -518,7 +500,6 @@ extern int md_integrity_register(mddev_t *mddev); | |||
518 | extern void md_integrity_add_rdev(mdk_rdev_t *rdev, mddev_t *mddev); | 500 | extern void md_integrity_add_rdev(mdk_rdev_t *rdev, mddev_t *mddev); |
519 | extern int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale); | 501 | extern int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale); |
520 | extern void restore_bitmap_write_access(struct file *file); | 502 | extern void restore_bitmap_write_access(struct file *file); |
521 | extern void md_unplug(mddev_t *mddev); | ||
522 | 503 | ||
523 | extern void mddev_init(mddev_t *mddev); | 504 | extern void mddev_init(mddev_t *mddev); |
524 | extern int md_run(mddev_t *mddev); | 505 | extern int md_run(mddev_t *mddev); |
@@ -528,4 +509,9 @@ extern void md_rdev_init(mdk_rdev_t *rdev); | |||
528 | 509 | ||
529 | extern void mddev_suspend(mddev_t *mddev); | 510 | extern void mddev_suspend(mddev_t *mddev); |
530 | extern void mddev_resume(mddev_t *mddev); | 511 | extern void mddev_resume(mddev_t *mddev); |
512 | extern struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask, | ||
513 | mddev_t *mddev); | ||
514 | extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, | ||
515 | mddev_t *mddev); | ||
516 | extern int mddev_check_plugged(mddev_t *mddev); | ||
531 | #endif /* _MD_MD_H */ | 517 | #endif /* _MD_MD_H */ |
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index 0307d217e7a4..3535c23af288 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c | |||
@@ -106,44 +106,14 @@ static void multipath_end_request(struct bio *bio, int error) | |||
106 | rdev_dec_pending(rdev, conf->mddev); | 106 | rdev_dec_pending(rdev, conf->mddev); |
107 | } | 107 | } |
108 | 108 | ||
109 | static void unplug_slaves(mddev_t *mddev) | ||
110 | { | ||
111 | multipath_conf_t *conf = mddev->private; | ||
112 | int i; | ||
113 | |||
114 | rcu_read_lock(); | ||
115 | for (i=0; i<mddev->raid_disks; i++) { | ||
116 | mdk_rdev_t *rdev = rcu_dereference(conf->multipaths[i].rdev); | ||
117 | if (rdev && !test_bit(Faulty, &rdev->flags) | ||
118 | && atomic_read(&rdev->nr_pending)) { | ||
119 | struct request_queue *r_queue = bdev_get_queue(rdev->bdev); | ||
120 | |||
121 | atomic_inc(&rdev->nr_pending); | ||
122 | rcu_read_unlock(); | ||
123 | |||
124 | blk_unplug(r_queue); | ||
125 | |||
126 | rdev_dec_pending(rdev, mddev); | ||
127 | rcu_read_lock(); | ||
128 | } | ||
129 | } | ||
130 | rcu_read_unlock(); | ||
131 | } | ||
132 | |||
133 | static void multipath_unplug(struct request_queue *q) | ||
134 | { | ||
135 | unplug_slaves(q->queuedata); | ||
136 | } | ||
137 | |||
138 | |||
139 | static int multipath_make_request(mddev_t *mddev, struct bio * bio) | 109 | static int multipath_make_request(mddev_t *mddev, struct bio * bio) |
140 | { | 110 | { |
141 | multipath_conf_t *conf = mddev->private; | 111 | multipath_conf_t *conf = mddev->private; |
142 | struct multipath_bh * mp_bh; | 112 | struct multipath_bh * mp_bh; |
143 | struct multipath_info *multipath; | 113 | struct multipath_info *multipath; |
144 | 114 | ||
145 | if (unlikely(bio->bi_rw & REQ_HARDBARRIER)) { | 115 | if (unlikely(bio->bi_rw & REQ_FLUSH)) { |
146 | md_barrier_request(mddev, bio); | 116 | md_flush_request(mddev, bio); |
147 | return 0; | 117 | return 0; |
148 | } | 118 | } |
149 | 119 | ||
@@ -176,7 +146,7 @@ static void multipath_status (struct seq_file *seq, mddev_t *mddev) | |||
176 | int i; | 146 | int i; |
177 | 147 | ||
178 | seq_printf (seq, " [%d/%d] [", conf->raid_disks, | 148 | seq_printf (seq, " [%d/%d] [", conf->raid_disks, |
179 | conf->working_disks); | 149 | conf->raid_disks - mddev->degraded); |
180 | for (i = 0; i < conf->raid_disks; i++) | 150 | for (i = 0; i < conf->raid_disks; i++) |
181 | seq_printf (seq, "%s", | 151 | seq_printf (seq, "%s", |
182 | conf->multipaths[i].rdev && | 152 | conf->multipaths[i].rdev && |
@@ -216,35 +186,36 @@ static int multipath_congested(void *data, int bits) | |||
216 | static void multipath_error (mddev_t *mddev, mdk_rdev_t *rdev) | 186 | static void multipath_error (mddev_t *mddev, mdk_rdev_t *rdev) |
217 | { | 187 | { |
218 | multipath_conf_t *conf = mddev->private; | 188 | multipath_conf_t *conf = mddev->private; |
189 | char b[BDEVNAME_SIZE]; | ||
219 | 190 | ||
220 | if (conf->working_disks <= 1) { | 191 | if (conf->raid_disks - mddev->degraded <= 1) { |
221 | /* | 192 | /* |
222 | * Uh oh, we can do nothing if this is our last path, but | 193 | * Uh oh, we can do nothing if this is our last path, but |
223 | * first check if this is a queued request for a device | 194 | * first check if this is a queued request for a device |
224 | * which has just failed. | 195 | * which has just failed. |
225 | */ | 196 | */ |
226 | printk(KERN_ALERT | 197 | printk(KERN_ALERT |
227 | "multipath: only one IO path left and IO error.\n"); | 198 | "multipath: only one IO path left and IO error.\n"); |
228 | /* leave it active... it's all we have */ | 199 | /* leave it active... it's all we have */ |
229 | } else { | 200 | return; |
230 | /* | ||
231 | * Mark disk as unusable | ||
232 | */ | ||
233 | if (!test_bit(Faulty, &rdev->flags)) { | ||
234 | char b[BDEVNAME_SIZE]; | ||
235 | clear_bit(In_sync, &rdev->flags); | ||
236 | set_bit(Faulty, &rdev->flags); | ||
237 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | ||
238 | conf->working_disks--; | ||
239 | mddev->degraded++; | ||
240 | printk(KERN_ALERT "multipath: IO failure on %s," | ||
241 | " disabling IO path.\n" | ||
242 | "multipath: Operation continuing" | ||
243 | " on %d IO paths.\n", | ||
244 | bdevname (rdev->bdev,b), | ||
245 | conf->working_disks); | ||
246 | } | ||
247 | } | 201 | } |
202 | /* | ||
203 | * Mark disk as unusable | ||
204 | */ | ||
205 | if (test_and_clear_bit(In_sync, &rdev->flags)) { | ||
206 | unsigned long flags; | ||
207 | spin_lock_irqsave(&conf->device_lock, flags); | ||
208 | mddev->degraded++; | ||
209 | spin_unlock_irqrestore(&conf->device_lock, flags); | ||
210 | } | ||
211 | set_bit(Faulty, &rdev->flags); | ||
212 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | ||
213 | printk(KERN_ALERT "multipath: IO failure on %s," | ||
214 | " disabling IO path.\n" | ||
215 | "multipath: Operation continuing" | ||
216 | " on %d IO paths.\n", | ||
217 | bdevname(rdev->bdev, b), | ||
218 | conf->raid_disks - mddev->degraded); | ||
248 | } | 219 | } |
249 | 220 | ||
250 | static void print_multipath_conf (multipath_conf_t *conf) | 221 | static void print_multipath_conf (multipath_conf_t *conf) |
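The reworked multipath_error() above clears In_sync exactly once and bumps mddev->degraded under the device lock, so the working-path count can always be derived as raid_disks - degraded instead of keeping a separate working_disks counter. A simplified userspace sketch of that accounting; the types and the locking are stand-ins:

```c
/* Sketch of the degraded-counter accounting in the reworked
 * multipath_error(); one lock stands in for both test_and_clear_bit()
 * and the device spinlock. */
#include <pthread.h>
#include <stdbool.h>

struct path  { bool in_sync; bool faulty; };
struct mpath {
	pthread_mutex_t lock;
	int raid_disks;
	int degraded;
	struct path *paths;
};

void path_error(struct mpath *c, struct path *p)
{
	if (c->raid_disks - c->degraded <= 1)
		return;			/* last path: keep it active */

	pthread_mutex_lock(&c->lock);
	if (p->in_sync) {		/* models test_and_clear_bit(In_sync) */
		p->in_sync = false;
		c->degraded++;		/* working paths = raid_disks - degraded */
	}
	pthread_mutex_unlock(&c->lock);
	p->faulty = true;
}
```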
@@ -257,7 +228,7 @@ static void print_multipath_conf (multipath_conf_t *conf) | |||
257 | printk("(conf==NULL)\n"); | 228 | printk("(conf==NULL)\n"); |
258 | return; | 229 | return; |
259 | } | 230 | } |
260 | printk(" --- wd:%d rd:%d\n", conf->working_disks, | 231 | printk(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded, |
261 | conf->raid_disks); | 232 | conf->raid_disks); |
262 | 233 | ||
263 | for (i = 0; i < conf->raid_disks; i++) { | 234 | for (i = 0; i < conf->raid_disks; i++) { |
@@ -304,10 +275,11 @@ static int multipath_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) | |||
304 | PAGE_CACHE_SIZE - 1); | 275 | PAGE_CACHE_SIZE - 1); |
305 | } | 276 | } |
306 | 277 | ||
307 | conf->working_disks++; | 278 | spin_lock_irq(&conf->device_lock); |
308 | mddev->degraded--; | 279 | mddev->degraded--; |
309 | rdev->raid_disk = path; | 280 | rdev->raid_disk = path; |
310 | set_bit(In_sync, &rdev->flags); | 281 | set_bit(In_sync, &rdev->flags); |
282 | spin_unlock_irq(&conf->device_lock); | ||
311 | rcu_assign_pointer(p->rdev, rdev); | 283 | rcu_assign_pointer(p->rdev, rdev); |
312 | err = 0; | 284 | err = 0; |
313 | md_integrity_add_rdev(rdev, mddev); | 285 | md_integrity_add_rdev(rdev, mddev); |
@@ -345,7 +317,7 @@ static int multipath_remove_disk(mddev_t *mddev, int number) | |||
345 | p->rdev = rdev; | 317 | p->rdev = rdev; |
346 | goto abort; | 318 | goto abort; |
347 | } | 319 | } |
348 | md_integrity_register(mddev); | 320 | err = md_integrity_register(mddev); |
349 | } | 321 | } |
350 | abort: | 322 | abort: |
351 | 323 | ||
@@ -421,6 +393,7 @@ static int multipath_run (mddev_t *mddev) | |||
421 | int disk_idx; | 393 | int disk_idx; |
422 | struct multipath_info *disk; | 394 | struct multipath_info *disk; |
423 | mdk_rdev_t *rdev; | 395 | mdk_rdev_t *rdev; |
396 | int working_disks; | ||
424 | 397 | ||
425 | if (md_check_no_bitmap(mddev)) | 398 | if (md_check_no_bitmap(mddev)) |
426 | return -EINVAL; | 399 | return -EINVAL; |
@@ -435,7 +408,6 @@ static int multipath_run (mddev_t *mddev) | |||
435 | * bookkeeping area. [whatever we allocate in multipath_run(), | 408 | * bookkeeping area. [whatever we allocate in multipath_run(), |
436 | * should be freed in multipath_stop()] | 409 | * should be freed in multipath_stop()] |
437 | */ | 410 | */ |
438 | mddev->queue->queue_lock = &mddev->queue->__queue_lock; | ||
439 | 411 | ||
440 | conf = kzalloc(sizeof(multipath_conf_t), GFP_KERNEL); | 412 | conf = kzalloc(sizeof(multipath_conf_t), GFP_KERNEL); |
441 | mddev->private = conf; | 413 | mddev->private = conf; |
@@ -455,7 +427,7 @@ static int multipath_run (mddev_t *mddev) | |||
455 | goto out_free_conf; | 427 | goto out_free_conf; |
456 | } | 428 | } |
457 | 429 | ||
458 | conf->working_disks = 0; | 430 | working_disks = 0; |
459 | list_for_each_entry(rdev, &mddev->disks, same_set) { | 431 | list_for_each_entry(rdev, &mddev->disks, same_set) { |
460 | disk_idx = rdev->raid_disk; | 432 | disk_idx = rdev->raid_disk; |
461 | if (disk_idx < 0 || | 433 | if (disk_idx < 0 || |
@@ -477,7 +449,7 @@ static int multipath_run (mddev_t *mddev) | |||
477 | } | 449 | } |
478 | 450 | ||
479 | if (!test_bit(Faulty, &rdev->flags)) | 451 | if (!test_bit(Faulty, &rdev->flags)) |
480 | conf->working_disks++; | 452 | working_disks++; |
481 | } | 453 | } |
482 | 454 | ||
483 | conf->raid_disks = mddev->raid_disks; | 455 | conf->raid_disks = mddev->raid_disks; |
@@ -485,12 +457,12 @@ static int multipath_run (mddev_t *mddev) | |||
485 | spin_lock_init(&conf->device_lock); | 457 | spin_lock_init(&conf->device_lock); |
486 | INIT_LIST_HEAD(&conf->retry_list); | 458 | INIT_LIST_HEAD(&conf->retry_list); |
487 | 459 | ||
488 | if (!conf->working_disks) { | 460 | if (!working_disks) { |
489 | printk(KERN_ERR "multipath: no operational IO paths for %s\n", | 461 | printk(KERN_ERR "multipath: no operational IO paths for %s\n", |
490 | mdname(mddev)); | 462 | mdname(mddev)); |
491 | goto out_free_conf; | 463 | goto out_free_conf; |
492 | } | 464 | } |
493 | mddev->degraded = conf->raid_disks - conf->working_disks; | 465 | mddev->degraded = conf->raid_disks - working_disks; |
494 | 466 | ||
495 | conf->pool = mempool_create_kmalloc_pool(NR_RESERVED_BUFS, | 467 | conf->pool = mempool_create_kmalloc_pool(NR_RESERVED_BUFS, |
496 | sizeof(struct multipath_bh)); | 468 | sizeof(struct multipath_bh)); |
@@ -512,16 +484,19 @@ static int multipath_run (mddev_t *mddev) | |||
512 | 484 | ||
513 | printk(KERN_INFO | 485 | printk(KERN_INFO |
514 | "multipath: array %s active with %d out of %d IO paths\n", | 486 | "multipath: array %s active with %d out of %d IO paths\n", |
515 | mdname(mddev), conf->working_disks, mddev->raid_disks); | 487 | mdname(mddev), conf->raid_disks - mddev->degraded, |
488 | mddev->raid_disks); | ||
516 | /* | 489 | /* |
517 | * Ok, everything is just fine now | 490 | * Ok, everything is just fine now |
518 | */ | 491 | */ |
519 | md_set_array_sectors(mddev, multipath_size(mddev, 0, 0)); | 492 | md_set_array_sectors(mddev, multipath_size(mddev, 0, 0)); |
520 | 493 | ||
521 | mddev->queue->unplug_fn = multipath_unplug; | ||
522 | mddev->queue->backing_dev_info.congested_fn = multipath_congested; | 494 | mddev->queue->backing_dev_info.congested_fn = multipath_congested; |
523 | mddev->queue->backing_dev_info.congested_data = mddev; | 495 | mddev->queue->backing_dev_info.congested_data = mddev; |
524 | md_integrity_register(mddev); | 496 | |
497 | if (md_integrity_register(mddev)) | ||
498 | goto out_free_conf; | ||
499 | |||
525 | return 0; | 500 | return 0; |
526 | 501 | ||
527 | out_free_conf: | 502 | out_free_conf: |
diff --git a/drivers/md/multipath.h b/drivers/md/multipath.h index d1c2a8d78395..3c5a45eb5f8a 100644 --- a/drivers/md/multipath.h +++ b/drivers/md/multipath.h | |||
@@ -9,7 +9,6 @@ struct multipath_private_data { | |||
9 | mddev_t *mddev; | 9 | mddev_t *mddev; |
10 | struct multipath_info *multipaths; | 10 | struct multipath_info *multipaths; |
11 | int raid_disks; | 11 | int raid_disks; |
12 | int working_disks; | ||
13 | spinlock_t device_lock; | 12 | spinlock_t device_lock; |
14 | struct list_head retry_list; | 13 | struct list_head retry_list; |
15 | 14 | ||
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 6f7af46d623c..e86bf3682e1e 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c | |||
@@ -25,21 +25,6 @@ | |||
25 | #include "raid0.h" | 25 | #include "raid0.h" |
26 | #include "raid5.h" | 26 | #include "raid5.h" |
27 | 27 | ||
28 | static void raid0_unplug(struct request_queue *q) | ||
29 | { | ||
30 | mddev_t *mddev = q->queuedata; | ||
31 | raid0_conf_t *conf = mddev->private; | ||
32 | mdk_rdev_t **devlist = conf->devlist; | ||
33 | int raid_disks = conf->strip_zone[0].nb_dev; | ||
34 | int i; | ||
35 | |||
36 | for (i=0; i < raid_disks; i++) { | ||
37 | struct request_queue *r_queue = bdev_get_queue(devlist[i]->bdev); | ||
38 | |||
39 | blk_unplug(r_queue); | ||
40 | } | ||
41 | } | ||
42 | |||
43 | static int raid0_congested(void *data, int bits) | 28 | static int raid0_congested(void *data, int bits) |
44 | { | 29 | { |
45 | mddev_t *mddev = data; | 30 | mddev_t *mddev = data; |
@@ -179,6 +164,14 @@ static int create_strip_zones(mddev_t *mddev, raid0_conf_t **private_conf) | |||
179 | rdev1->new_raid_disk = j; | 164 | rdev1->new_raid_disk = j; |
180 | } | 165 | } |
181 | 166 | ||
167 | if (mddev->level == 1) { | ||
168 | /* taking over a raid1 array - | ||
169 | * we have only one active disk | ||
170 | */ | ||
171 | j = 0; | ||
172 | rdev1->new_raid_disk = j; | ||
173 | } | ||
174 | |||
182 | if (j < 0 || j >= mddev->raid_disks) { | 175 | if (j < 0 || j >= mddev->raid_disks) { |
183 | printk(KERN_ERR "md/raid0:%s: bad disk number %d - " | 176 | printk(KERN_ERR "md/raid0:%s: bad disk number %d - " |
184 | "aborting!\n", mdname(mddev), j); | 177 | "aborting!\n", mdname(mddev), j); |
@@ -264,7 +257,6 @@ static int create_strip_zones(mddev_t *mddev, raid0_conf_t **private_conf) | |||
264 | mdname(mddev), | 257 | mdname(mddev), |
265 | (unsigned long long)smallest->sectors); | 258 | (unsigned long long)smallest->sectors); |
266 | } | 259 | } |
267 | mddev->queue->unplug_fn = raid0_unplug; | ||
268 | mddev->queue->backing_dev_info.congested_fn = raid0_congested; | 260 | mddev->queue->backing_dev_info.congested_fn = raid0_congested; |
269 | mddev->queue->backing_dev_info.congested_data = mddev; | 261 | mddev->queue->backing_dev_info.congested_data = mddev; |
270 | 262 | ||
@@ -353,7 +345,6 @@ static int raid0_run(mddev_t *mddev) | |||
353 | if (md_check_no_bitmap(mddev)) | 345 | if (md_check_no_bitmap(mddev)) |
354 | return -EINVAL; | 346 | return -EINVAL; |
355 | blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors); | 347 | blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors); |
356 | mddev->queue->queue_lock = &mddev->queue->__queue_lock; | ||
357 | 348 | ||
358 | /* if private is not null, we are here after takeover */ | 349 | /* if private is not null, we are here after takeover */ |
359 | if (mddev->private == NULL) { | 350 | if (mddev->private == NULL) { |
@@ -388,8 +379,7 @@ static int raid0_run(mddev_t *mddev) | |||
388 | 379 | ||
389 | blk_queue_merge_bvec(mddev->queue, raid0_mergeable_bvec); | 380 | blk_queue_merge_bvec(mddev->queue, raid0_mergeable_bvec); |
390 | dump_zones(mddev); | 381 | dump_zones(mddev); |
391 | md_integrity_register(mddev); | 382 | return md_integrity_register(mddev); |
392 | return 0; | ||
393 | } | 383 | } |
394 | 384 | ||
395 | static int raid0_stop(mddev_t *mddev) | 385 | static int raid0_stop(mddev_t *mddev) |
@@ -483,8 +473,8 @@ static int raid0_make_request(mddev_t *mddev, struct bio *bio) | |||
483 | struct strip_zone *zone; | 473 | struct strip_zone *zone; |
484 | mdk_rdev_t *tmp_dev; | 474 | mdk_rdev_t *tmp_dev; |
485 | 475 | ||
486 | if (unlikely(bio->bi_rw & REQ_HARDBARRIER)) { | 476 | if (unlikely(bio->bi_rw & REQ_FLUSH)) { |
487 | md_barrier_request(mddev, bio); | 477 | md_flush_request(mddev, bio); |
488 | return 0; | 478 | return 0; |
489 | } | 479 | } |
490 | 480 | ||
@@ -644,12 +634,39 @@ static void *raid0_takeover_raid10(mddev_t *mddev) | |||
644 | return priv_conf; | 634 | return priv_conf; |
645 | } | 635 | } |
646 | 636 | ||
637 | static void *raid0_takeover_raid1(mddev_t *mddev) | ||
638 | { | ||
639 | raid0_conf_t *priv_conf; | ||
640 | |||
641 | /* Check layout: | ||
642 | * - (N - 1) mirror drives must be already faulty | ||
643 | */ | ||
644 | if ((mddev->raid_disks - 1) != mddev->degraded) { | ||
645 | printk(KERN_ERR "md/raid0:%s: (N - 1) mirror drives must be already faulty!\n", | ||
646 | mdname(mddev)); | ||
647 | return ERR_PTR(-EINVAL); | ||
648 | } | ||
649 | |||
650 | /* Set new parameters */ | ||
651 | mddev->new_level = 0; | ||
652 | mddev->new_layout = 0; | ||
653 | mddev->new_chunk_sectors = 128; /* by default set chunk size to 64k */ | ||
654 | mddev->delta_disks = 1 - mddev->raid_disks; | ||
655 | mddev->raid_disks = 1; | ||
656 | /* make sure it will be not marked as dirty */ | ||
657 | mddev->recovery_cp = MaxSector; | ||
658 | |||
659 | create_strip_zones(mddev, &priv_conf); | ||
660 | return priv_conf; | ||
661 | } | ||
662 | |||
647 | static void *raid0_takeover(mddev_t *mddev) | 663 | static void *raid0_takeover(mddev_t *mddev) |
648 | { | 664 | { |
649 | /* raid0 can take over: | 665 | /* raid0 can take over: |
650 | * raid4 - if all data disks are active. | 666 | * raid4 - if all data disks are active. |
651 | * raid5 - providing it is Raid4 layout and one disk is faulty | 667 | * raid5 - providing it is Raid4 layout and one disk is faulty |
652 | * raid10 - assuming we have all necessary active disks | 668 | * raid10 - assuming we have all necessary active disks |
669 | * raid1 - with (N -1) mirror drives faulty | ||
653 | */ | 670 | */ |
654 | if (mddev->level == 4) | 671 | if (mddev->level == 4) |
655 | return raid0_takeover_raid45(mddev); | 672 | return raid0_takeover_raid45(mddev); |
@@ -665,6 +682,12 @@ static void *raid0_takeover(mddev_t *mddev) | |||
665 | if (mddev->level == 10) | 682 | if (mddev->level == 10) |
666 | return raid0_takeover_raid10(mddev); | 683 | return raid0_takeover_raid10(mddev); |
667 | 684 | ||
685 | if (mddev->level == 1) | ||
686 | return raid0_takeover_raid1(mddev); | ||
687 | |||
688 | printk(KERN_ERR "Takeover from raid%i to raid0 not supported\n", | ||
689 | mddev->level); | ||
690 | |||
668 | return ERR_PTR(-EINVAL); | 691 | return ERR_PTR(-EINVAL); |
669 | } | 692 | } |
670 | 693 | ||
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 0b830bbe1d8b..f7431b6d8447 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c | |||
@@ -52,23 +52,16 @@ | |||
52 | #define NR_RAID1_BIOS 256 | 52 | #define NR_RAID1_BIOS 256 |
53 | 53 | ||
54 | 54 | ||
55 | static void unplug_slaves(mddev_t *mddev); | ||
56 | |||
57 | static void allow_barrier(conf_t *conf); | 55 | static void allow_barrier(conf_t *conf); |
58 | static void lower_barrier(conf_t *conf); | 56 | static void lower_barrier(conf_t *conf); |
59 | 57 | ||
60 | static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data) | 58 | static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data) |
61 | { | 59 | { |
62 | struct pool_info *pi = data; | 60 | struct pool_info *pi = data; |
63 | r1bio_t *r1_bio; | ||
64 | int size = offsetof(r1bio_t, bios[pi->raid_disks]); | 61 | int size = offsetof(r1bio_t, bios[pi->raid_disks]); |
65 | 62 | ||
66 | /* allocate a r1bio with room for raid_disks entries in the bios array */ | 63 | /* allocate a r1bio with room for raid_disks entries in the bios array */ |
67 | r1_bio = kzalloc(size, gfp_flags); | 64 | return kzalloc(size, gfp_flags); |
68 | if (!r1_bio && pi->mddev) | ||
69 | unplug_slaves(pi->mddev); | ||
70 | |||
71 | return r1_bio; | ||
72 | } | 65 | } |
73 | 66 | ||
74 | static void r1bio_pool_free(void *r1_bio, void *data) | 67 | static void r1bio_pool_free(void *r1_bio, void *data) |
@@ -91,16 +84,14 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data) | |||
91 | int i, j; | 84 | int i, j; |
92 | 85 | ||
93 | r1_bio = r1bio_pool_alloc(gfp_flags, pi); | 86 | r1_bio = r1bio_pool_alloc(gfp_flags, pi); |
94 | if (!r1_bio) { | 87 | if (!r1_bio) |
95 | unplug_slaves(pi->mddev); | ||
96 | return NULL; | 88 | return NULL; |
97 | } | ||
98 | 89 | ||
99 | /* | 90 | /* |
100 | * Allocate bios : 1 for reading, n-1 for writing | 91 | * Allocate bios : 1 for reading, n-1 for writing |
101 | */ | 92 | */ |
102 | for (j = pi->raid_disks ; j-- ; ) { | 93 | for (j = pi->raid_disks ; j-- ; ) { |
103 | bio = bio_alloc(gfp_flags, RESYNC_PAGES); | 94 | bio = bio_kmalloc(gfp_flags, RESYNC_PAGES); |
104 | if (!bio) | 95 | if (!bio) |
105 | goto out_free_bio; | 96 | goto out_free_bio; |
106 | r1_bio->bios[j] = bio; | 97 | r1_bio->bios[j] = bio; |
@@ -306,6 +297,29 @@ static void raid1_end_read_request(struct bio *bio, int error) | |||
306 | rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev); | 297 | rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev); |
307 | } | 298 | } |
308 | 299 | ||
300 | static void r1_bio_write_done(r1bio_t *r1_bio) | ||
301 | { | ||
302 | if (atomic_dec_and_test(&r1_bio->remaining)) | ||
303 | { | ||
304 | /* it really is the end of this request */ | ||
305 | if (test_bit(R1BIO_BehindIO, &r1_bio->state)) { | ||
306 | /* free extra copy of the data pages */ | ||
307 | int i = r1_bio->behind_page_count; | ||
308 | while (i--) | ||
309 | safe_put_page(r1_bio->behind_pages[i]); | ||
310 | kfree(r1_bio->behind_pages); | ||
311 | r1_bio->behind_pages = NULL; | ||
312 | } | ||
313 | /* clear the bitmap if all writes complete successfully */ | ||
314 | bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector, | ||
315 | r1_bio->sectors, | ||
316 | !test_bit(R1BIO_Degraded, &r1_bio->state), | ||
317 | test_bit(R1BIO_BehindIO, &r1_bio->state)); | ||
318 | md_write_end(r1_bio->mddev); | ||
319 | raid_end_bio_io(r1_bio); | ||
320 | } | ||
321 | } | ||
322 | |||
309 | static void raid1_end_write_request(struct bio *bio, int error) | 323 | static void raid1_end_write_request(struct bio *bio, int error) |
310 | { | 324 | { |
311 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); | 325 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); |
@@ -319,84 +333,61 @@ static void raid1_end_write_request(struct bio *bio, int error) | |||
319 | if (r1_bio->bios[mirror] == bio) | 333 | if (r1_bio->bios[mirror] == bio) |
320 | break; | 334 | break; |
321 | 335 | ||
322 | if (error == -EOPNOTSUPP && test_bit(R1BIO_Barrier, &r1_bio->state)) { | 336 | /* |
323 | set_bit(BarriersNotsupp, &conf->mirrors[mirror].rdev->flags); | 337 | * 'one mirror IO has finished' event handler: |
324 | set_bit(R1BIO_BarrierRetry, &r1_bio->state); | 338 | */ |
325 | r1_bio->mddev->barriers_work = 0; | 339 | r1_bio->bios[mirror] = NULL; |
326 | /* Don't rdev_dec_pending in this branch - keep it for the retry */ | 340 | to_put = bio; |
327 | } else { | 341 | if (!uptodate) { |
342 | md_error(r1_bio->mddev, conf->mirrors[mirror].rdev); | ||
343 | /* an I/O failed, we can't clear the bitmap */ | ||
344 | set_bit(R1BIO_Degraded, &r1_bio->state); | ||
345 | } else | ||
328 | /* | 346 | /* |
329 | * this branch is our 'one mirror IO has finished' event handler: | 347 | * Set R1BIO_Uptodate in our master bio, so that we |
348 | * will return a good error code to the higher ||
349 | * levels even if IO on some other mirrored buffer | ||
350 | * fails. | ||
351 | * | ||
352 | * The 'master' represents the composite IO operation | ||
353 | * to user-side. So if something waits for IO, then it | ||
354 | * will wait for the 'master' bio. | ||
330 | */ | 355 | */ |
331 | r1_bio->bios[mirror] = NULL; | 356 | set_bit(R1BIO_Uptodate, &r1_bio->state); |
332 | to_put = bio; | 357 | |
333 | if (!uptodate) { | 358 | update_head_pos(mirror, r1_bio); |
334 | md_error(r1_bio->mddev, conf->mirrors[mirror].rdev); | 359 | |
335 | /* an I/O failed, we can't clear the bitmap */ | 360 | if (behind) { |
336 | set_bit(R1BIO_Degraded, &r1_bio->state); | 361 | if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags)) |
337 | } else | 362 | atomic_dec(&r1_bio->behind_remaining); |
338 | /* | 363 | |
339 | * Set R1BIO_Uptodate in our master bio, so that | 364 | /* |
340 | * we will return a good error code for to the higher | 365 | * In behind mode, we ACK the master bio once the I/O |
341 | * levels even if IO on some other mirrored buffer fails. | 366 | * has safely reached all non-writemostly |
342 | * | 367 | * disks. Setting the Returned bit ensures that this |
343 | * The 'master' represents the composite IO operation to | 368 | * gets done only once -- we don't ever want to return |
344 | * user-side. So if something waits for IO, then it will | 369 | * -EIO here, instead we'll wait |
345 | * wait for the 'master' bio. | 370 | */ |
346 | */ | 371 | if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) && |
347 | set_bit(R1BIO_Uptodate, &r1_bio->state); | 372 | test_bit(R1BIO_Uptodate, &r1_bio->state)) { |
348 | 373 | /* Maybe we can return now */ | |
349 | update_head_pos(mirror, r1_bio); | 374 | if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) { |
350 | 375 | struct bio *mbio = r1_bio->master_bio; | |
351 | if (behind) { | 376 | PRINTK(KERN_DEBUG "raid1: behind end write sectors %llu-%llu\n", |
352 | if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags)) | 377 | (unsigned long long) mbio->bi_sector, |
353 | atomic_dec(&r1_bio->behind_remaining); | 378 | (unsigned long long) mbio->bi_sector + |
354 | 379 | (mbio->bi_size >> 9) - 1); | |
355 | /* In behind mode, we ACK the master bio once the I/O has safely | 380 | bio_endio(mbio, 0); |
356 | * reached all non-writemostly disks. Setting the Returned bit | ||
357 | * ensures that this gets done only once -- we don't ever want to | ||
358 | * return -EIO here, instead we'll wait */ | ||
359 | |||
360 | if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) && | ||
361 | test_bit(R1BIO_Uptodate, &r1_bio->state)) { | ||
362 | /* Maybe we can return now */ | ||
363 | if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) { | ||
364 | struct bio *mbio = r1_bio->master_bio; | ||
365 | PRINTK(KERN_DEBUG "raid1: behind end write sectors %llu-%llu\n", | ||
366 | (unsigned long long) mbio->bi_sector, | ||
367 | (unsigned long long) mbio->bi_sector + | ||
368 | (mbio->bi_size >> 9) - 1); | ||
369 | bio_endio(mbio, 0); | ||
370 | } | ||
371 | } | 381 | } |
372 | } | 382 | } |
373 | rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev); | ||
374 | } | 383 | } |
384 | rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev); | ||
385 | |||
375 | /* | 386 | /* |
376 | * | ||
377 | * Let's see if all mirrored write operations have finished | 387 | * Let's see if all mirrored write operations have finished |
378 | * already. | 388 | * already. |
379 | */ | 389 | */ |
380 | if (atomic_dec_and_test(&r1_bio->remaining)) { | 390 | r1_bio_write_done(r1_bio); |
381 | if (test_bit(R1BIO_BarrierRetry, &r1_bio->state)) | ||
382 | reschedule_retry(r1_bio); | ||
383 | else { | ||
384 | /* it really is the end of this request */ | ||
385 | if (test_bit(R1BIO_BehindIO, &r1_bio->state)) { | ||
386 | /* free extra copy of the data pages */ | ||
387 | int i = bio->bi_vcnt; | ||
388 | while (i--) | ||
389 | safe_put_page(bio->bi_io_vec[i].bv_page); | ||
390 | } | ||
391 | /* clear the bitmap if all writes complete successfully */ | ||
392 | bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector, | ||
393 | r1_bio->sectors, | ||
394 | !test_bit(R1BIO_Degraded, &r1_bio->state), | ||
395 | behind); | ||
396 | md_write_end(r1_bio->mddev); | ||
397 | raid_end_bio_io(r1_bio); | ||
398 | } | ||
399 | } | ||
400 | 391 | ||
401 | if (to_put) | 392 | if (to_put) |
402 | bio_put(to_put); | 393 | bio_put(to_put); |
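All of the completion bookkeeping in raid1_end_write_request() now funnels through the new r1_bio_write_done() helper, which only acts when the last outstanding mirror write drops the remaining count to zero. A rough userspace sketch of that pattern, with C11 atomics standing in for the kernel's atomic_t:

/* Sketch of the "last completer finishes the request" pattern used by
 * r1_bio_write_done(); plain userspace C, not kernel code. */
#include <stdatomic.h>
#include <stdio.h>

struct request {
    atomic_int remaining;    /* primed to 1, +1 per submitted mirror write */
};

static void write_done(struct request *r)
{
    /* only the caller that drops the count to zero ends the request */
    if (atomic_fetch_sub(&r->remaining, 1) == 1)
        printf("all mirror writes finished, ending master bio\n");
}

int main(void)
{
    struct request r = { .remaining = 1 };
    atomic_fetch_add(&r.remaining, 1);    /* first mirror write queued */
    atomic_fetch_add(&r.remaining, 1);    /* second mirror write queued */
    write_done(&r);    /* submitter drops its priming reference */
    write_done(&r);    /* first mirror completes */
    write_done(&r);    /* second mirror completes -> request ends */
    return 0;
}

make_request() below primes remaining to 1 and calls r1_bio_write_done() itself once all clones are queued, so the request cannot complete while writes are still being set up.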
@@ -420,11 +411,13 @@ static void raid1_end_write_request(struct bio *bio, int error) | |||
420 | static int read_balance(conf_t *conf, r1bio_t *r1_bio) | 411 | static int read_balance(conf_t *conf, r1bio_t *r1_bio) |
421 | { | 412 | { |
422 | const sector_t this_sector = r1_bio->sector; | 413 | const sector_t this_sector = r1_bio->sector; |
423 | int new_disk = conf->last_used, disk = new_disk; | ||
424 | int wonly_disk = -1; | ||
425 | const int sectors = r1_bio->sectors; | 414 | const int sectors = r1_bio->sectors; |
426 | sector_t new_distance, current_distance; | 415 | int start_disk; |
416 | int best_disk; | ||
417 | int i; | ||
418 | sector_t best_dist; | ||
427 | mdk_rdev_t *rdev; | 419 | mdk_rdev_t *rdev; |
420 | int choose_first; | ||
428 | 421 | ||
429 | rcu_read_lock(); | 422 | rcu_read_lock(); |
430 | /* | 423 | /* |
@@ -433,100 +426,63 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio) | |||
433 | * We take the first readable disk when above the resync window. | 426 | * We take the first readable disk when above the resync window. |
434 | */ | 427 | */ |
435 | retry: | 428 | retry: |
429 | best_disk = -1; | ||
430 | best_dist = MaxSector; | ||
436 | if (conf->mddev->recovery_cp < MaxSector && | 431 | if (conf->mddev->recovery_cp < MaxSector && |
437 | (this_sector + sectors >= conf->next_resync)) { | 432 | (this_sector + sectors >= conf->next_resync)) { |
438 | /* Choose the first operational device, for consistancy */ | 433 | choose_first = 1; |
439 | new_disk = 0; | 434 | start_disk = 0; |
440 | 435 | } else { | |
441 | for (rdev = rcu_dereference(conf->mirrors[new_disk].rdev); | 436 | choose_first = 0; |
442 | r1_bio->bios[new_disk] == IO_BLOCKED || | 437 | start_disk = conf->last_used; |
443 | !rdev || !test_bit(In_sync, &rdev->flags) | ||
444 | || test_bit(WriteMostly, &rdev->flags); | ||
445 | rdev = rcu_dereference(conf->mirrors[++new_disk].rdev)) { | ||
446 | |||
447 | if (rdev && test_bit(In_sync, &rdev->flags) && | ||
448 | r1_bio->bios[new_disk] != IO_BLOCKED) | ||
449 | wonly_disk = new_disk; | ||
450 | |||
451 | if (new_disk == conf->raid_disks - 1) { | ||
452 | new_disk = wonly_disk; | ||
453 | break; | ||
454 | } | ||
455 | } | ||
456 | goto rb_out; | ||
457 | } | ||
458 | |||
459 | |||
460 | /* make sure the disk is operational */ | ||
461 | for (rdev = rcu_dereference(conf->mirrors[new_disk].rdev); | ||
462 | r1_bio->bios[new_disk] == IO_BLOCKED || | ||
463 | !rdev || !test_bit(In_sync, &rdev->flags) || | ||
464 | test_bit(WriteMostly, &rdev->flags); | ||
465 | rdev = rcu_dereference(conf->mirrors[new_disk].rdev)) { | ||
466 | |||
467 | if (rdev && test_bit(In_sync, &rdev->flags) && | ||
468 | r1_bio->bios[new_disk] != IO_BLOCKED) | ||
469 | wonly_disk = new_disk; | ||
470 | |||
471 | if (new_disk <= 0) | ||
472 | new_disk = conf->raid_disks; | ||
473 | new_disk--; | ||
474 | if (new_disk == disk) { | ||
475 | new_disk = wonly_disk; | ||
476 | break; | ||
477 | } | ||
478 | } | 438 | } |
479 | 439 | ||
480 | if (new_disk < 0) | 440 | for (i = 0 ; i < conf->raid_disks ; i++) { |
481 | goto rb_out; | 441 | sector_t dist; |
482 | 442 | int disk = start_disk + i; | |
483 | disk = new_disk; | 443 | if (disk >= conf->raid_disks) |
484 | /* now disk == new_disk == starting point for search */ | 444 | disk -= conf->raid_disks; |
485 | |||
486 | /* | ||
487 | * Don't change to another disk for sequential reads: | ||
488 | */ | ||
489 | if (conf->next_seq_sect == this_sector) | ||
490 | goto rb_out; | ||
491 | if (this_sector == conf->mirrors[new_disk].head_position) | ||
492 | goto rb_out; | ||
493 | |||
494 | current_distance = abs(this_sector - conf->mirrors[disk].head_position); | ||
495 | |||
496 | /* Find the disk whose head is closest */ | ||
497 | |||
498 | do { | ||
499 | if (disk <= 0) | ||
500 | disk = conf->raid_disks; | ||
501 | disk--; | ||
502 | 445 | ||
503 | rdev = rcu_dereference(conf->mirrors[disk].rdev); | 446 | rdev = rcu_dereference(conf->mirrors[disk].rdev); |
504 | 447 | if (r1_bio->bios[disk] == IO_BLOCKED | |
505 | if (!rdev || r1_bio->bios[disk] == IO_BLOCKED || | 448 | || rdev == NULL |
506 | !test_bit(In_sync, &rdev->flags) || | 449 | || test_bit(Faulty, &rdev->flags)) |
507 | test_bit(WriteMostly, &rdev->flags)) | ||
508 | continue; | 450 | continue; |
509 | 451 | if (!test_bit(In_sync, &rdev->flags) && | |
510 | if (!atomic_read(&rdev->nr_pending)) { | 452 | rdev->recovery_offset < this_sector + sectors) |
511 | new_disk = disk; | 453 | continue; |
454 | if (test_bit(WriteMostly, &rdev->flags)) { | ||
455 | /* Don't balance among write-mostly, just | ||
456 | * use the first as a last resort */ | ||
457 | if (best_disk < 0) | ||
458 | best_disk = disk; | ||
459 | continue; | ||
460 | } | ||
461 | /* This is a reasonable device to use. It might | ||
462 | * even be best. | ||
463 | */ | ||
464 | dist = abs(this_sector - conf->mirrors[disk].head_position); | ||
465 | if (choose_first | ||
466 | /* Don't change to another disk for sequential reads */ | ||
467 | || conf->next_seq_sect == this_sector | ||
468 | || dist == 0 | ||
469 | /* If device is idle, use it */ | ||
470 | || atomic_read(&rdev->nr_pending) == 0) { | ||
471 | best_disk = disk; | ||
512 | break; | 472 | break; |
513 | } | 473 | } |
514 | new_distance = abs(this_sector - conf->mirrors[disk].head_position); | 474 | if (dist < best_dist) { |
515 | if (new_distance < current_distance) { | 475 | best_dist = dist; |
516 | current_distance = new_distance; | 476 | best_disk = disk; |
517 | new_disk = disk; | ||
518 | } | 477 | } |
519 | } while (disk != conf->last_used); | 478 | } |
520 | |||
521 | rb_out: | ||
522 | |||
523 | 479 | ||
524 | if (new_disk >= 0) { | 480 | if (best_disk >= 0) { |
525 | rdev = rcu_dereference(conf->mirrors[new_disk].rdev); | 481 | rdev = rcu_dereference(conf->mirrors[best_disk].rdev); |
526 | if (!rdev) | 482 | if (!rdev) |
527 | goto retry; | 483 | goto retry; |
528 | atomic_inc(&rdev->nr_pending); | 484 | atomic_inc(&rdev->nr_pending); |
529 | if (!test_bit(In_sync, &rdev->flags)) { | 485 | if (test_bit(Faulty, &rdev->flags)) { |
530 | /* cannot risk returning a device that failed | 486 | /* cannot risk returning a device that failed |
531 | * before we inc'ed nr_pending | 487 | * before we inc'ed nr_pending |
532 | */ | 488 | */ |
@@ -534,59 +490,26 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio) | |||
534 | goto retry; | 490 | goto retry; |
535 | } | 491 | } |
536 | conf->next_seq_sect = this_sector + sectors; | 492 | conf->next_seq_sect = this_sector + sectors; |
537 | conf->last_used = new_disk; | 493 | conf->last_used = best_disk; |
538 | } | 494 | } |
539 | rcu_read_unlock(); | 495 | rcu_read_unlock(); |
540 | 496 | ||
541 | return new_disk; | 497 | return best_disk; |
542 | } | 498 | } |
543 | 499 | ||
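The rewritten read_balance() collapses the old multi-pass search into a single loop over the mirrors. A simplified userspace model is shown below; the resync-window choose_first case is omitted and the disk array is an illustrative stand-in for conf->mirrors[].

/* Userspace model of the rewritten read_balance() selection loop. */
#include <stdio.h>
#include <stdlib.h>
#include <limits.h>

struct disk {
    int faulty;
    int write_mostly;
    int pending;            /* in-flight requests */
    long head_position;     /* last serviced sector */
};

static int pick_disk(struct disk *d, int ndisks, int start, long sector)
{
    int best = -1;
    long best_dist = LONG_MAX;

    for (int i = 0; i < ndisks; i++) {
        int disk = (start + i) % ndisks;
        long dist;

        if (d[disk].faulty)
            continue;
        if (d[disk].write_mostly) {
            if (best < 0)
                best = disk;        /* last resort only */
            continue;
        }
        dist = labs(sector - d[disk].head_position);
        if (dist == 0 || d[disk].pending == 0)
            return disk;            /* sequential or idle: take it now */
        if (dist < best_dist) {
            best_dist = dist;
            best = disk;
        }
    }
    return best;
}

int main(void)
{
    struct disk d[2] = { { .head_position = 100, .pending = 3 },
                         { .head_position = 5000, .pending = 1 } };
    printf("chose disk %d\n", pick_disk(d, 2, 0, 120));
    return 0;
}

Returning immediately for a zero distance or an idle device keeps the old preference for sequential readers and unloaded disks, while write-mostly members are only ever used when nothing else qualifies.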
544 | static void unplug_slaves(mddev_t *mddev) | 500 | int md_raid1_congested(mddev_t *mddev, int bits) |
545 | { | 501 | { |
546 | conf_t *conf = mddev->private; | 502 | conf_t *conf = mddev->private; |
547 | int i; | ||
548 | |||
549 | rcu_read_lock(); | ||
550 | for (i=0; i<mddev->raid_disks; i++) { | ||
551 | mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); | ||
552 | if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) { | ||
553 | struct request_queue *r_queue = bdev_get_queue(rdev->bdev); | ||
554 | |||
555 | atomic_inc(&rdev->nr_pending); | ||
556 | rcu_read_unlock(); | ||
557 | |||
558 | blk_unplug(r_queue); | ||
559 | |||
560 | rdev_dec_pending(rdev, mddev); | ||
561 | rcu_read_lock(); | ||
562 | } | ||
563 | } | ||
564 | rcu_read_unlock(); | ||
565 | } | ||
566 | |||
567 | static void raid1_unplug(struct request_queue *q) | ||
568 | { | ||
569 | mddev_t *mddev = q->queuedata; | ||
570 | |||
571 | unplug_slaves(mddev); | ||
572 | md_wakeup_thread(mddev->thread); | ||
573 | } | ||
574 | |||
575 | static int raid1_congested(void *data, int bits) | ||
576 | { | ||
577 | mddev_t *mddev = data; | ||
578 | conf_t *conf = mddev->private; | ||
579 | int i, ret = 0; | 503 | int i, ret = 0; |
580 | 504 | ||
581 | if (mddev_congested(mddev, bits)) | ||
582 | return 1; | ||
583 | |||
584 | rcu_read_lock(); | 505 | rcu_read_lock(); |
585 | for (i = 0; i < mddev->raid_disks; i++) { | 506 | for (i = 0; i < mddev->raid_disks; i++) { |
586 | mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); | 507 | mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); |
587 | if (rdev && !test_bit(Faulty, &rdev->flags)) { | 508 | if (rdev && !test_bit(Faulty, &rdev->flags)) { |
588 | struct request_queue *q = bdev_get_queue(rdev->bdev); | 509 | struct request_queue *q = bdev_get_queue(rdev->bdev); |
589 | 510 | ||
511 | BUG_ON(!q); | ||
512 | |||
590 | /* Note the '|| 1' - when read_balance prefers | 513 | /* Note the '|| 1' - when read_balance prefers |
591 | * non-congested targets, it can be removed | 514 | * non-congested targets, it can be removed |
592 | */ | 515 | */ |
@@ -599,22 +522,26 @@ static int raid1_congested(void *data, int bits) | |||
599 | rcu_read_unlock(); | 522 | rcu_read_unlock(); |
600 | return ret; | 523 | return ret; |
601 | } | 524 | } |
525 | EXPORT_SYMBOL_GPL(md_raid1_congested); | ||
602 | 526 | ||
527 | static int raid1_congested(void *data, int bits) | ||
528 | { | ||
529 | mddev_t *mddev = data; | ||
603 | 530 | ||
604 | static int flush_pending_writes(conf_t *conf) | 531 | return mddev_congested(mddev, bits) || |
532 | md_raid1_congested(mddev, bits); | ||
533 | } | ||
534 | |||
535 | static void flush_pending_writes(conf_t *conf) | ||
605 | { | 536 | { |
606 | /* Any writes that have been queued but are awaiting | 537 | /* Any writes that have been queued but are awaiting |
607 | * bitmap updates get flushed here. | 538 | * bitmap updates get flushed here. |
608 | * We return 1 if any requests were actually submitted. | ||
609 | */ | 539 | */ |
610 | int rv = 0; | ||
611 | |||
612 | spin_lock_irq(&conf->device_lock); | 540 | spin_lock_irq(&conf->device_lock); |
613 | 541 | ||
614 | if (conf->pending_bio_list.head) { | 542 | if (conf->pending_bio_list.head) { |
615 | struct bio *bio; | 543 | struct bio *bio; |
616 | bio = bio_list_get(&conf->pending_bio_list); | 544 | bio = bio_list_get(&conf->pending_bio_list); |
617 | blk_remove_plug(conf->mddev->queue); | ||
618 | spin_unlock_irq(&conf->device_lock); | 545 | spin_unlock_irq(&conf->device_lock); |
619 | /* flush any pending bitmap writes to | 546 | /* flush any pending bitmap writes to |
620 | * disk before proceeding w/ I/O */ | 547 | * disk before proceeding w/ I/O */ |
@@ -626,10 +553,8 @@ static int flush_pending_writes(conf_t *conf) | |||
626 | generic_make_request(bio); | 553 | generic_make_request(bio); |
627 | bio = next; | 554 | bio = next; |
628 | } | 555 | } |
629 | rv = 1; | ||
630 | } else | 556 | } else |
631 | spin_unlock_irq(&conf->device_lock); | 557 | spin_unlock_irq(&conf->device_lock); |
632 | return rv; | ||
633 | } | 558 | } |
634 | 559 | ||
635 | /* Barriers.... | 560 | /* Barriers.... |
@@ -661,17 +586,15 @@ static void raise_barrier(conf_t *conf) | |||
661 | 586 | ||
662 | /* Wait until no block IO is waiting */ | 587 | /* Wait until no block IO is waiting */ |
663 | wait_event_lock_irq(conf->wait_barrier, !conf->nr_waiting, | 588 | wait_event_lock_irq(conf->wait_barrier, !conf->nr_waiting, |
664 | conf->resync_lock, | 589 | conf->resync_lock, ); |
665 | raid1_unplug(conf->mddev->queue)); | ||
666 | 590 | ||
667 | /* block any new IO from starting */ | 591 | /* block any new IO from starting */ |
668 | conf->barrier++; | 592 | conf->barrier++; |
669 | 593 | ||
670 | /* No wait for all pending IO to complete */ | 594 | /* Now wait for all pending IO to complete */ |
671 | wait_event_lock_irq(conf->wait_barrier, | 595 | wait_event_lock_irq(conf->wait_barrier, |
672 | !conf->nr_pending && conf->barrier < RESYNC_DEPTH, | 596 | !conf->nr_pending && conf->barrier < RESYNC_DEPTH, |
673 | conf->resync_lock, | 597 | conf->resync_lock, ); |
674 | raid1_unplug(conf->mddev->queue)); | ||
675 | 598 | ||
676 | spin_unlock_irq(&conf->resync_lock); | 599 | spin_unlock_irq(&conf->resync_lock); |
677 | } | 600 | } |
@@ -693,7 +616,7 @@ static void wait_barrier(conf_t *conf) | |||
693 | conf->nr_waiting++; | 616 | conf->nr_waiting++; |
694 | wait_event_lock_irq(conf->wait_barrier, !conf->barrier, | 617 | wait_event_lock_irq(conf->wait_barrier, !conf->barrier, |
695 | conf->resync_lock, | 618 | conf->resync_lock, |
696 | raid1_unplug(conf->mddev->queue)); | 619 | ); |
697 | conf->nr_waiting--; | 620 | conf->nr_waiting--; |
698 | } | 621 | } |
699 | conf->nr_pending++; | 622 | conf->nr_pending++; |
@@ -729,8 +652,7 @@ static void freeze_array(conf_t *conf) | |||
729 | wait_event_lock_irq(conf->wait_barrier, | 652 | wait_event_lock_irq(conf->wait_barrier, |
730 | conf->nr_pending == conf->nr_queued+1, | 653 | conf->nr_pending == conf->nr_queued+1, |
731 | conf->resync_lock, | 654 | conf->resync_lock, |
732 | ({ flush_pending_writes(conf); | 655 | flush_pending_writes(conf)); |
733 | raid1_unplug(conf->mddev->queue); })); | ||
734 | spin_unlock_irq(&conf->resync_lock); | 656 | spin_unlock_irq(&conf->resync_lock); |
735 | } | 657 | } |
736 | static void unfreeze_array(conf_t *conf) | 658 | static void unfreeze_array(conf_t *conf) |
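The barrier machinery itself is untouched here; only the unplug call in the last wait_event_lock_irq() argument disappears, since the block layer no longer needs explicit unplugging. For reference, a simplified pthread model of the raise_barrier()/wait_barrier() counters (lower_barrier() and allow_barrier(), not shown, would decrement the counters and broadcast):

/* Pthread model of the resync barrier counters; illustrative only, the
 * real code uses resync_lock, wait_barrier and wait_event_lock_irq(). */
#include <pthread.h>

#define RESYNC_DEPTH 32

struct barrier_state {
    pthread_mutex_t lock;
    pthread_cond_t wait;
    int nr_pending;    /* normal I/O in flight */
    int nr_waiting;    /* normal I/O blocked behind a barrier */
    int barrier;       /* active resync requests */
};

static void raise_barrier(struct barrier_state *b)
{
    pthread_mutex_lock(&b->lock);
    while (b->nr_waiting)               /* wait until no block I/O is waiting */
        pthread_cond_wait(&b->wait, &b->lock);
    b->barrier++;                       /* block any new I/O from starting */
    while (b->nr_pending || b->barrier >= RESYNC_DEPTH)
        pthread_cond_wait(&b->wait, &b->lock);
    pthread_mutex_unlock(&b->lock);
}

static void wait_barrier(struct barrier_state *b)
{
    pthread_mutex_lock(&b->lock);
    if (b->barrier) {
        b->nr_waiting++;
        while (b->barrier)
            pthread_cond_wait(&b->wait, &b->lock);
        b->nr_waiting--;
    }
    b->nr_pending++;
    pthread_mutex_unlock(&b->lock);
}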
@@ -744,15 +666,16 @@ static void unfreeze_array(conf_t *conf) | |||
744 | } | 666 | } |
745 | 667 | ||
746 | 668 | ||
747 | /* duplicate the data pages for behind I/O */ | 669 | /* duplicate the data pages for behind I/O |
748 | static struct page **alloc_behind_pages(struct bio *bio) | 670 | */ |
671 | static void alloc_behind_pages(struct bio *bio, r1bio_t *r1_bio) | ||
749 | { | 672 | { |
750 | int i; | 673 | int i; |
751 | struct bio_vec *bvec; | 674 | struct bio_vec *bvec; |
752 | struct page **pages = kzalloc(bio->bi_vcnt * sizeof(struct page *), | 675 | struct page **pages = kzalloc(bio->bi_vcnt * sizeof(struct page*), |
753 | GFP_NOIO); | 676 | GFP_NOIO); |
754 | if (unlikely(!pages)) | 677 | if (unlikely(!pages)) |
755 | goto do_sync_io; | 678 | return; |
756 | 679 | ||
757 | bio_for_each_segment(bvec, bio, i) { | 680 | bio_for_each_segment(bvec, bio, i) { |
758 | pages[i] = alloc_page(GFP_NOIO); | 681 | pages[i] = alloc_page(GFP_NOIO); |
@@ -763,16 +686,17 @@ static struct page **alloc_behind_pages(struct bio *bio) | |||
763 | kunmap(pages[i]); | 686 | kunmap(pages[i]); |
764 | kunmap(bvec->bv_page); | 687 | kunmap(bvec->bv_page); |
765 | } | 688 | } |
766 | 689 | r1_bio->behind_pages = pages; | |
767 | return pages; | 690 | r1_bio->behind_page_count = bio->bi_vcnt; |
691 | set_bit(R1BIO_BehindIO, &r1_bio->state); | ||
692 | return; | ||
768 | 693 | ||
769 | do_sync_io: | 694 | do_sync_io: |
770 | if (pages) | 695 | for (i = 0; i < bio->bi_vcnt; i++) |
771 | for (i = 0; i < bio->bi_vcnt && pages[i]; i++) | 696 | if (pages[i]) |
772 | put_page(pages[i]); | 697 | put_page(pages[i]); |
773 | kfree(pages); | 698 | kfree(pages); |
774 | PRINTK("%dB behind alloc failed, doing sync I/O\n", bio->bi_size); | 699 | PRINTK("%dB behind alloc failed, doing sync I/O\n", bio->bi_size); |
775 | return NULL; | ||
776 | } | 700 | } |
777 | 701 | ||
778 | static int make_request(mddev_t *mddev, struct bio * bio) | 702 | static int make_request(mddev_t *mddev, struct bio * bio) |
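alloc_behind_pages() now stashes the duplicated pages in the r1_bio itself (behind_pages and behind_page_count) rather than returning an array, which lets r1_bio_write_done() free them when the last write completes. A hedged userspace sketch of the duplicate-and-attach step, with illustrative names:

/* Userspace sketch of the behind-write page duplication: copy each source
 * buffer and attach the copies to the request so completion can free them.
 * Names are illustrative, not the kernel's bio/r1_bio fields. */
#include <stdlib.h>
#include <string.h>

struct request {
    void **behind_pages;
    int behind_page_count;
};

static void alloc_behind_copies(struct request *req,
                                void **src, size_t *len, int count)
{
    void **pages = calloc(count, sizeof(*pages));
    if (!pages)
        return;                 /* fall back to a normal (sync) write */

    for (int i = 0; i < count; i++) {
        pages[i] = malloc(len[i]);
        if (!pages[i])
            goto fail;
        memcpy(pages[i], src[i], len[i]);
    }
    req->behind_pages = pages;  /* freed when the last write completes */
    req->behind_page_count = count;
    return;

fail:
    for (int i = 0; i < count; i++)
        free(pages[i]);
    free(pages);
}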
@@ -784,20 +708,16 @@ static int make_request(mddev_t *mddev, struct bio * bio) | |||
784 | int i, targets = 0, disks; | 708 | int i, targets = 0, disks; |
785 | struct bitmap *bitmap; | 709 | struct bitmap *bitmap; |
786 | unsigned long flags; | 710 | unsigned long flags; |
787 | struct bio_list bl; | ||
788 | struct page **behind_pages = NULL; | ||
789 | const int rw = bio_data_dir(bio); | 711 | const int rw = bio_data_dir(bio); |
790 | const unsigned long do_sync = (bio->bi_rw & REQ_SYNC); | 712 | const unsigned long do_sync = (bio->bi_rw & REQ_SYNC); |
791 | unsigned long do_barriers; | 713 | const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA)); |
792 | mdk_rdev_t *blocked_rdev; | 714 | mdk_rdev_t *blocked_rdev; |
715 | int plugged; | ||
793 | 716 | ||
794 | /* | 717 | /* |
795 | * Register the new request and wait if the reconstruction | 718 | * Register the new request and wait if the reconstruction |
796 | * thread has put up a bar for new requests. | 719 | * thread has put up a bar for new requests. |
797 | * Continue immediately if no resync is active currently. | 720 | * Continue immediately if no resync is active currently. |
798 | * We test barriers_work *after* md_write_start as md_write_start | ||
799 | * may cause the first superblock write, and that will check out | ||
800 | * if barriers work. | ||
801 | */ | 721 | */ |
802 | 722 | ||
803 | md_write_start(mddev, bio); /* wait on superblock update early */ | 723 | md_write_start(mddev, bio); /* wait on superblock update early */ |
@@ -821,13 +741,6 @@ static int make_request(mddev_t *mddev, struct bio * bio) | |||
821 | } | 741 | } |
822 | finish_wait(&conf->wait_barrier, &w); | 742 | finish_wait(&conf->wait_barrier, &w); |
823 | } | 743 | } |
824 | if (unlikely(!mddev->barriers_work && | ||
825 | (bio->bi_rw & REQ_HARDBARRIER))) { | ||
826 | if (rw == WRITE) | ||
827 | md_write_end(mddev); | ||
828 | bio_endio(bio, -EOPNOTSUPP); | ||
829 | return 0; | ||
830 | } | ||
831 | 744 | ||
832 | wait_barrier(conf); | 745 | wait_barrier(conf); |
833 | 746 | ||
@@ -870,7 +783,7 @@ static int make_request(mddev_t *mddev, struct bio * bio) | |||
870 | } | 783 | } |
871 | r1_bio->read_disk = rdisk; | 784 | r1_bio->read_disk = rdisk; |
872 | 785 | ||
873 | read_bio = bio_clone(bio, GFP_NOIO); | 786 | read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev); |
874 | 787 | ||
875 | r1_bio->bios[rdisk] = read_bio; | 788 | r1_bio->bios[rdisk] = read_bio; |
876 | 789 | ||
@@ -891,14 +804,9 @@ static int make_request(mddev_t *mddev, struct bio * bio) | |||
891 | * inc refcount on their rdev. Record them by setting | 804 | * inc refcount on their rdev. Record them by setting |
892 | * bios[x] to bio | 805 | * bios[x] to bio |
893 | */ | 806 | */ |
807 | plugged = mddev_check_plugged(mddev); | ||
808 | |||
894 | disks = conf->raid_disks; | 809 | disks = conf->raid_disks; |
895 | #if 0 | ||
896 | { static int first=1; | ||
897 | if (first) printk("First Write sector %llu disks %d\n", | ||
898 | (unsigned long long)r1_bio->sector, disks); | ||
899 | first = 0; | ||
900 | } | ||
901 | #endif | ||
902 | retry_write: | 810 | retry_write: |
903 | blocked_rdev = NULL; | 811 | blocked_rdev = NULL; |
904 | rcu_read_lock(); | 812 | rcu_read_lock(); |
@@ -952,33 +860,29 @@ static int make_request(mddev_t *mddev, struct bio * bio) | |||
952 | if (bitmap && | 860 | if (bitmap && |
953 | (atomic_read(&bitmap->behind_writes) | 861 | (atomic_read(&bitmap->behind_writes) |
954 | < mddev->bitmap_info.max_write_behind) && | 862 | < mddev->bitmap_info.max_write_behind) && |
955 | !waitqueue_active(&bitmap->behind_wait) && | 863 | !waitqueue_active(&bitmap->behind_wait)) |
956 | (behind_pages = alloc_behind_pages(bio)) != NULL) | 864 | alloc_behind_pages(bio, r1_bio); |
957 | set_bit(R1BIO_BehindIO, &r1_bio->state); | ||
958 | 865 | ||
959 | atomic_set(&r1_bio->remaining, 0); | 866 | atomic_set(&r1_bio->remaining, 1); |
960 | atomic_set(&r1_bio->behind_remaining, 0); | 867 | atomic_set(&r1_bio->behind_remaining, 0); |
961 | 868 | ||
962 | do_barriers = bio->bi_rw & REQ_HARDBARRIER; | 869 | bitmap_startwrite(bitmap, bio->bi_sector, r1_bio->sectors, |
963 | if (do_barriers) | 870 | test_bit(R1BIO_BehindIO, &r1_bio->state)); |
964 | set_bit(R1BIO_Barrier, &r1_bio->state); | ||
965 | |||
966 | bio_list_init(&bl); | ||
967 | for (i = 0; i < disks; i++) { | 871 | for (i = 0; i < disks; i++) { |
968 | struct bio *mbio; | 872 | struct bio *mbio; |
969 | if (!r1_bio->bios[i]) | 873 | if (!r1_bio->bios[i]) |
970 | continue; | 874 | continue; |
971 | 875 | ||
972 | mbio = bio_clone(bio, GFP_NOIO); | 876 | mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); |
973 | r1_bio->bios[i] = mbio; | 877 | r1_bio->bios[i] = mbio; |
974 | 878 | ||
975 | mbio->bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset; | 879 | mbio->bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset; |
976 | mbio->bi_bdev = conf->mirrors[i].rdev->bdev; | 880 | mbio->bi_bdev = conf->mirrors[i].rdev->bdev; |
977 | mbio->bi_end_io = raid1_end_write_request; | 881 | mbio->bi_end_io = raid1_end_write_request; |
978 | mbio->bi_rw = WRITE | do_barriers | do_sync; | 882 | mbio->bi_rw = WRITE | do_flush_fua | do_sync; |
979 | mbio->bi_private = r1_bio; | 883 | mbio->bi_private = r1_bio; |
980 | 884 | ||
981 | if (behind_pages) { | 885 | if (r1_bio->behind_pages) { |
982 | struct bio_vec *bvec; | 886 | struct bio_vec *bvec; |
983 | int j; | 887 | int j; |
984 | 888 | ||
@@ -986,39 +890,27 @@ static int make_request(mddev_t *mddev, struct bio * bio) | |||
986 | * we clear any unused pointer in the io_vec, rather | 890 | * we clear any unused pointer in the io_vec, rather |
987 | * than leave them unchanged. This is important | 891 | * than leave them unchanged. This is important |
988 | * because when we come to free the pages, we won't | 892 | * because when we come to free the pages, we won't |
989 | * know the originial bi_idx, so we just free | 893 | * know the original bi_idx, so we just free |
990 | * them all | 894 | * them all |
991 | */ | 895 | */ |
992 | __bio_for_each_segment(bvec, mbio, j, 0) | 896 | __bio_for_each_segment(bvec, mbio, j, 0) |
993 | bvec->bv_page = behind_pages[j]; | 897 | bvec->bv_page = r1_bio->behind_pages[j]; |
994 | if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags)) | 898 | if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags)) |
995 | atomic_inc(&r1_bio->behind_remaining); | 899 | atomic_inc(&r1_bio->behind_remaining); |
996 | } | 900 | } |
997 | 901 | ||
998 | atomic_inc(&r1_bio->remaining); | 902 | atomic_inc(&r1_bio->remaining); |
999 | 903 | spin_lock_irqsave(&conf->device_lock, flags); | |
1000 | bio_list_add(&bl, mbio); | 904 | bio_list_add(&conf->pending_bio_list, mbio); |
905 | spin_unlock_irqrestore(&conf->device_lock, flags); | ||
1001 | } | 906 | } |
1002 | kfree(behind_pages); /* the behind pages are attached to the bios now */ | 907 | r1_bio_write_done(r1_bio); |
1003 | |||
1004 | bitmap_startwrite(bitmap, bio->bi_sector, r1_bio->sectors, | ||
1005 | test_bit(R1BIO_BehindIO, &r1_bio->state)); | ||
1006 | spin_lock_irqsave(&conf->device_lock, flags); | ||
1007 | bio_list_merge(&conf->pending_bio_list, &bl); | ||
1008 | bio_list_init(&bl); | ||
1009 | 908 | ||
1010 | blk_plug_device(mddev->queue); | 909 | /* In case raid1d snuck in to freeze_array */ |
1011 | spin_unlock_irqrestore(&conf->device_lock, flags); | ||
1012 | |||
1013 | /* In case raid1d snuck into freeze_array */ | ||
1014 | wake_up(&conf->wait_barrier); | 910 | wake_up(&conf->wait_barrier); |
1015 | 911 | ||
1016 | if (do_sync) | 912 | if (do_sync || !bitmap || !plugged) |
1017 | md_wakeup_thread(mddev->thread); | 913 | md_wakeup_thread(mddev->thread); |
1018 | #if 0 | ||
1019 | while ((bio = bio_list_pop(&bl)) != NULL) | ||
1020 | generic_make_request(bio); | ||
1021 | #endif | ||
1022 | 914 | ||
1023 | return 0; | 915 | return 0; |
1024 | } | 916 | } |
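The write path now adds every cloned mirror bio to conf->pending_bio_list under device_lock and leaves submission to flush_pending_writes(), which runs after the bitmap has been written out, rather than plugging the device queue by hand. A simplified pthread model of that producer/drainer split (the node struct and printf stand in for bios and generic_make_request()):

/* Pthread model of the deferred write submission used here: writers queue
 * cloned bios under a lock, a single daemon drains and submits them after
 * the bitmap update.  Illustrative only. */
#include <pthread.h>
#include <stdio.h>

struct node { struct node *next; int id; };

static pthread_mutex_t pending_lock = PTHREAD_MUTEX_INITIALIZER;
static struct node *pending_head;

static void queue_write(struct node *n)
{
    pthread_mutex_lock(&pending_lock);
    n->next = pending_head;     /* the kernel uses a FIFO bio_list */
    pending_head = n;
    pthread_mutex_unlock(&pending_lock);
}

static void flush_pending_writes(void)
{
    pthread_mutex_lock(&pending_lock);
    struct node *n = pending_head;      /* grab the whole list at once */
    pending_head = NULL;
    pthread_mutex_unlock(&pending_lock);

    /* bitmap_unplug() would run here, before any data write is issued */
    while (n) {
        printf("submitting queued write %d\n", n->id);
        n = n->next;
    }
}

int main(void)
{
    struct node a = { .id = 1 }, b = { .id = 2 };
    queue_write(&a);
    queue_write(&b);
    flush_pending_writes();
    return 0;
}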
@@ -1076,8 +968,9 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) | |||
1076 | } else | 968 | } else |
1077 | set_bit(Faulty, &rdev->flags); | 969 | set_bit(Faulty, &rdev->flags); |
1078 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | 970 | set_bit(MD_CHANGE_DEVS, &mddev->flags); |
1079 | printk(KERN_ALERT "md/raid1:%s: Disk failure on %s, disabling device.\n" | 971 | printk(KERN_ALERT |
1080 | KERN_ALERT "md/raid1:%s: Operation continuing on %d devices.\n", | 972 | "md/raid1:%s: Disk failure on %s, disabling device.\n" |
973 | "md/raid1:%s: Operation continuing on %d devices.\n", | ||
1081 | mdname(mddev), bdevname(rdev->bdev, b), | 974 | mdname(mddev), bdevname(rdev->bdev, b), |
1082 | mdname(mddev), conf->raid_disks - mddev->degraded); | 975 | mdname(mddev), conf->raid_disks - mddev->degraded); |
1083 | } | 976 | } |
@@ -1206,10 +1099,11 @@ static int raid1_remove_disk(mddev_t *mddev, int number) | |||
1206 | err = -EBUSY; | 1099 | err = -EBUSY; |
1207 | goto abort; | 1100 | goto abort; |
1208 | } | 1101 | } |
1209 | /* Only remove non-faulty devices is recovery | 1102 | /* Only remove non-faulty devices if recovery |
1210 | * is not possible. | 1103 | * is not possible. |
1211 | */ | 1104 | */ |
1212 | if (!test_bit(Faulty, &rdev->flags) && | 1105 | if (!test_bit(Faulty, &rdev->flags) && |
1106 | !mddev->recovery_disabled && | ||
1213 | mddev->degraded < conf->raid_disks) { | 1107 | mddev->degraded < conf->raid_disks) { |
1214 | err = -EBUSY; | 1108 | err = -EBUSY; |
1215 | goto abort; | 1109 | goto abort; |
@@ -1222,7 +1116,7 @@ static int raid1_remove_disk(mddev_t *mddev, int number) | |||
1222 | p->rdev = rdev; | 1116 | p->rdev = rdev; |
1223 | goto abort; | 1117 | goto abort; |
1224 | } | 1118 | } |
1225 | md_integrity_register(mddev); | 1119 | err = md_integrity_register(mddev); |
1226 | } | 1120 | } |
1227 | abort: | 1121 | abort: |
1228 | 1122 | ||
@@ -1268,7 +1162,7 @@ static void end_sync_write(struct bio *bio, int error) | |||
1268 | break; | 1162 | break; |
1269 | } | 1163 | } |
1270 | if (!uptodate) { | 1164 | if (!uptodate) { |
1271 | int sync_blocks = 0; | 1165 | sector_t sync_blocks = 0; |
1272 | sector_t s = r1_bio->sector; | 1166 | sector_t s = r1_bio->sector; |
1273 | long sectors_to_go = r1_bio->sectors; | 1167 | long sectors_to_go = r1_bio->sectors; |
1274 | /* make sure these bits don't get cleared. */ | 1168 | /* make sure these bits don't get cleared. */ |
@@ -1290,194 +1184,210 @@ static void end_sync_write(struct bio *bio, int error) | |||
1290 | } | 1184 | } |
1291 | } | 1185 | } |
1292 | 1186 | ||
1293 | static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio) | 1187 | static int fix_sync_read_error(r1bio_t *r1_bio) |
1294 | { | 1188 | { |
1189 | /* Try some synchronous reads of other devices to get | ||
1190 | * good data, much like with normal read errors. Only | ||
1191 | * read into the pages we already have so we don't | ||
1192 | * need to re-issue the read request. | ||
1193 | * We don't need to freeze the array, because being in an | ||
1194 | * active sync request, there is no normal IO, and | ||
1195 | * no overlapping syncs. | ||
1196 | */ | ||
1197 | mddev_t *mddev = r1_bio->mddev; | ||
1295 | conf_t *conf = mddev->private; | 1198 | conf_t *conf = mddev->private; |
1296 | int i; | 1199 | struct bio *bio = r1_bio->bios[r1_bio->read_disk]; |
1297 | int disks = conf->raid_disks; | 1200 | sector_t sect = r1_bio->sector; |
1298 | struct bio *bio, *wbio; | 1201 | int sectors = r1_bio->sectors; |
1299 | 1202 | int idx = 0; | |
1300 | bio = r1_bio->bios[r1_bio->read_disk]; | ||
1301 | 1203 | ||
1204 | while(sectors) { | ||
1205 | int s = sectors; | ||
1206 | int d = r1_bio->read_disk; | ||
1207 | int success = 0; | ||
1208 | mdk_rdev_t *rdev; | ||
1209 | int start; | ||
1302 | 1210 | ||
1303 | if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { | 1211 | if (s > (PAGE_SIZE>>9)) |
1304 | /* We have read all readable devices. If we haven't | 1212 | s = PAGE_SIZE >> 9; |
1305 | * got the block, then there is no hope left. | 1213 | do { |
1306 | * If we have, then we want to do a comparison | 1214 | if (r1_bio->bios[d]->bi_end_io == end_sync_read) { |
1307 | * and skip the write if everything is the same. | 1215 | /* No rcu protection needed here devices |
1308 | * If any blocks failed to read, then we need to | 1216 | * can only be removed when no resync is |
1309 | * attempt an over-write | 1217 | * active, and resync is currently active |
1310 | */ | 1218 | */ |
1311 | int primary; | 1219 | rdev = conf->mirrors[d].rdev; |
1312 | if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) { | 1220 | if (sync_page_io(rdev, |
1313 | for (i=0; i<mddev->raid_disks; i++) | 1221 | sect, |
1314 | if (r1_bio->bios[i]->bi_end_io == end_sync_read) | 1222 | s<<9, |
1315 | md_error(mddev, conf->mirrors[i].rdev); | 1223 | bio->bi_io_vec[idx].bv_page, |
1224 | READ, false)) { | ||
1225 | success = 1; | ||
1226 | break; | ||
1227 | } | ||
1228 | } | ||
1229 | d++; | ||
1230 | if (d == conf->raid_disks) | ||
1231 | d = 0; | ||
1232 | } while (!success && d != r1_bio->read_disk); | ||
1316 | 1233 | ||
1317 | md_done_sync(mddev, r1_bio->sectors, 1); | 1234 | if (!success) { |
1235 | char b[BDEVNAME_SIZE]; | ||
1236 | /* Cannot read from anywhere, array is toast */ | ||
1237 | md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev); | ||
1238 | printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O read error" | ||
1239 | " for block %llu\n", | ||
1240 | mdname(mddev), | ||
1241 | bdevname(bio->bi_bdev, b), | ||
1242 | (unsigned long long)r1_bio->sector); | ||
1243 | md_done_sync(mddev, r1_bio->sectors, 0); | ||
1318 | put_buf(r1_bio); | 1244 | put_buf(r1_bio); |
1319 | return; | 1245 | return 0; |
1320 | } | 1246 | } |
1321 | for (primary=0; primary<mddev->raid_disks; primary++) | ||
1322 | if (r1_bio->bios[primary]->bi_end_io == end_sync_read && | ||
1323 | test_bit(BIO_UPTODATE, &r1_bio->bios[primary]->bi_flags)) { | ||
1324 | r1_bio->bios[primary]->bi_end_io = NULL; | ||
1325 | rdev_dec_pending(conf->mirrors[primary].rdev, mddev); | ||
1326 | break; | ||
1327 | } | ||
1328 | r1_bio->read_disk = primary; | ||
1329 | for (i=0; i<mddev->raid_disks; i++) | ||
1330 | if (r1_bio->bios[i]->bi_end_io == end_sync_read) { | ||
1331 | int j; | ||
1332 | int vcnt = r1_bio->sectors >> (PAGE_SHIFT- 9); | ||
1333 | struct bio *pbio = r1_bio->bios[primary]; | ||
1334 | struct bio *sbio = r1_bio->bios[i]; | ||
1335 | |||
1336 | if (test_bit(BIO_UPTODATE, &sbio->bi_flags)) { | ||
1337 | for (j = vcnt; j-- ; ) { | ||
1338 | struct page *p, *s; | ||
1339 | p = pbio->bi_io_vec[j].bv_page; | ||
1340 | s = sbio->bi_io_vec[j].bv_page; | ||
1341 | if (memcmp(page_address(p), | ||
1342 | page_address(s), | ||
1343 | PAGE_SIZE)) | ||
1344 | break; | ||
1345 | } | ||
1346 | } else | ||
1347 | j = 0; | ||
1348 | if (j >= 0) | ||
1349 | mddev->resync_mismatches += r1_bio->sectors; | ||
1350 | if (j < 0 || (test_bit(MD_RECOVERY_CHECK, &mddev->recovery) | ||
1351 | && test_bit(BIO_UPTODATE, &sbio->bi_flags))) { | ||
1352 | sbio->bi_end_io = NULL; | ||
1353 | rdev_dec_pending(conf->mirrors[i].rdev, mddev); | ||
1354 | } else { | ||
1355 | /* fixup the bio for reuse */ | ||
1356 | int size; | ||
1357 | sbio->bi_vcnt = vcnt; | ||
1358 | sbio->bi_size = r1_bio->sectors << 9; | ||
1359 | sbio->bi_idx = 0; | ||
1360 | sbio->bi_phys_segments = 0; | ||
1361 | sbio->bi_flags &= ~(BIO_POOL_MASK - 1); | ||
1362 | sbio->bi_flags |= 1 << BIO_UPTODATE; | ||
1363 | sbio->bi_next = NULL; | ||
1364 | sbio->bi_sector = r1_bio->sector + | ||
1365 | conf->mirrors[i].rdev->data_offset; | ||
1366 | sbio->bi_bdev = conf->mirrors[i].rdev->bdev; | ||
1367 | size = sbio->bi_size; | ||
1368 | for (j = 0; j < vcnt ; j++) { | ||
1369 | struct bio_vec *bi; | ||
1370 | bi = &sbio->bi_io_vec[j]; | ||
1371 | bi->bv_offset = 0; | ||
1372 | if (size > PAGE_SIZE) | ||
1373 | bi->bv_len = PAGE_SIZE; | ||
1374 | else | ||
1375 | bi->bv_len = size; | ||
1376 | size -= PAGE_SIZE; | ||
1377 | memcpy(page_address(bi->bv_page), | ||
1378 | page_address(pbio->bi_io_vec[j].bv_page), | ||
1379 | PAGE_SIZE); | ||
1380 | } | ||
1381 | 1247 | ||
1382 | } | 1248 | start = d; |
1383 | } | 1249 | /* write it back and re-read */ |
1250 | while (d != r1_bio->read_disk) { | ||
1251 | if (d == 0) | ||
1252 | d = conf->raid_disks; | ||
1253 | d--; | ||
1254 | if (r1_bio->bios[d]->bi_end_io != end_sync_read) | ||
1255 | continue; | ||
1256 | rdev = conf->mirrors[d].rdev; | ||
1257 | if (sync_page_io(rdev, | ||
1258 | sect, | ||
1259 | s<<9, | ||
1260 | bio->bi_io_vec[idx].bv_page, | ||
1261 | WRITE, false) == 0) { | ||
1262 | r1_bio->bios[d]->bi_end_io = NULL; | ||
1263 | rdev_dec_pending(rdev, mddev); | ||
1264 | md_error(mddev, rdev); | ||
1265 | } else | ||
1266 | atomic_add(s, &rdev->corrected_errors); | ||
1267 | } | ||
1268 | d = start; | ||
1269 | while (d != r1_bio->read_disk) { | ||
1270 | if (d == 0) | ||
1271 | d = conf->raid_disks; | ||
1272 | d--; | ||
1273 | if (r1_bio->bios[d]->bi_end_io != end_sync_read) | ||
1274 | continue; | ||
1275 | rdev = conf->mirrors[d].rdev; | ||
1276 | if (sync_page_io(rdev, | ||
1277 | sect, | ||
1278 | s<<9, | ||
1279 | bio->bi_io_vec[idx].bv_page, | ||
1280 | READ, false) == 0) | ||
1281 | md_error(mddev, rdev); | ||
1282 | } | ||
1283 | sectors -= s; | ||
1284 | sect += s; | ||
1285 | idx ++; | ||
1384 | } | 1286 | } |
1385 | if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) { | 1287 | set_bit(R1BIO_Uptodate, &r1_bio->state); |
1386 | /* ouch - failed to read all of that. | 1288 | set_bit(BIO_UPTODATE, &bio->bi_flags); |
1387 | * Try some synchronous reads of other devices to get | 1289 | return 1; |
1388 | * good data, much like with normal read errors. Only | 1290 | } |
1389 | * read into the pages we already have so we don't | 1291 | |
1390 | * need to re-issue the read request. | 1292 | static int process_checks(r1bio_t *r1_bio) |
1391 | * We don't need to freeze the array, because being in an | 1293 | { |
1392 | * active sync request, there is no normal IO, and | 1294 | /* We have read all readable devices. If we haven't |
1393 | * no overlapping syncs. | 1295 | * got the block, then there is no hope left. |
1394 | */ | 1296 | * If we have, then we want to do a comparison |
1395 | sector_t sect = r1_bio->sector; | 1297 | * and skip the write if everything is the same. |
1396 | int sectors = r1_bio->sectors; | 1298 | * If any blocks failed to read, then we need to |
1397 | int idx = 0; | 1299 | * attempt an over-write |
1398 | 1300 | */ | |
1399 | while(sectors) { | 1301 | mddev_t *mddev = r1_bio->mddev; |
1400 | int s = sectors; | 1302 | conf_t *conf = mddev->private; |
1401 | int d = r1_bio->read_disk; | 1303 | int primary; |
1402 | int success = 0; | 1304 | int i; |
1403 | mdk_rdev_t *rdev; | 1305 | |
1404 | 1306 | for (primary = 0; primary < conf->raid_disks; primary++) | |
1405 | if (s > (PAGE_SIZE>>9)) | 1307 | if (r1_bio->bios[primary]->bi_end_io == end_sync_read && |
1406 | s = PAGE_SIZE >> 9; | 1308 | test_bit(BIO_UPTODATE, &r1_bio->bios[primary]->bi_flags)) { |
1407 | do { | 1309 | r1_bio->bios[primary]->bi_end_io = NULL; |
1408 | if (r1_bio->bios[d]->bi_end_io == end_sync_read) { | 1310 | rdev_dec_pending(conf->mirrors[primary].rdev, mddev); |
1409 | /* No rcu protection needed here devices | 1311 | break; |
1410 | * can only be removed when no resync is | 1312 | } |
1411 | * active, and resync is currently active | 1313 | r1_bio->read_disk = primary; |
1412 | */ | 1314 | for (i = 0; i < conf->raid_disks; i++) { |
1413 | rdev = conf->mirrors[d].rdev; | 1315 | int j; |
1414 | if (sync_page_io(rdev->bdev, | 1316 | int vcnt = r1_bio->sectors >> (PAGE_SHIFT- 9); |
1415 | sect + rdev->data_offset, | 1317 | struct bio *pbio = r1_bio->bios[primary]; |
1416 | s<<9, | 1318 | struct bio *sbio = r1_bio->bios[i]; |
1417 | bio->bi_io_vec[idx].bv_page, | 1319 | int size; |
1418 | READ)) { | 1320 | |
1419 | success = 1; | 1321 | if (r1_bio->bios[i]->bi_end_io != end_sync_read) |
1420 | break; | 1322 | continue; |
1421 | } | 1323 | |
1422 | } | 1324 | if (test_bit(BIO_UPTODATE, &sbio->bi_flags)) { |
1423 | d++; | 1325 | for (j = vcnt; j-- ; ) { |
1424 | if (d == conf->raid_disks) | 1326 | struct page *p, *s; |
1425 | d = 0; | 1327 | p = pbio->bi_io_vec[j].bv_page; |
1426 | } while (!success && d != r1_bio->read_disk); | 1328 | s = sbio->bi_io_vec[j].bv_page; |
1427 | 1329 | if (memcmp(page_address(p), | |
1428 | if (success) { | 1330 | page_address(s), |
1429 | int start = d; | 1331 | PAGE_SIZE)) |
1430 | /* write it back and re-read */ | 1332 | break; |
1431 | set_bit(R1BIO_Uptodate, &r1_bio->state); | ||
1432 | while (d != r1_bio->read_disk) { | ||
1433 | if (d == 0) | ||
1434 | d = conf->raid_disks; | ||
1435 | d--; | ||
1436 | if (r1_bio->bios[d]->bi_end_io != end_sync_read) | ||
1437 | continue; | ||
1438 | rdev = conf->mirrors[d].rdev; | ||
1439 | atomic_add(s, &rdev->corrected_errors); | ||
1440 | if (sync_page_io(rdev->bdev, | ||
1441 | sect + rdev->data_offset, | ||
1442 | s<<9, | ||
1443 | bio->bi_io_vec[idx].bv_page, | ||
1444 | WRITE) == 0) | ||
1445 | md_error(mddev, rdev); | ||
1446 | } | ||
1447 | d = start; | ||
1448 | while (d != r1_bio->read_disk) { | ||
1449 | if (d == 0) | ||
1450 | d = conf->raid_disks; | ||
1451 | d--; | ||
1452 | if (r1_bio->bios[d]->bi_end_io != end_sync_read) | ||
1453 | continue; | ||
1454 | rdev = conf->mirrors[d].rdev; | ||
1455 | if (sync_page_io(rdev->bdev, | ||
1456 | sect + rdev->data_offset, | ||
1457 | s<<9, | ||
1458 | bio->bi_io_vec[idx].bv_page, | ||
1459 | READ) == 0) | ||
1460 | md_error(mddev, rdev); | ||
1461 | } | ||
1462 | } else { | ||
1463 | char b[BDEVNAME_SIZE]; | ||
1464 | /* Cannot read from anywhere, array is toast */ | ||
1465 | md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev); | ||
1466 | printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O read error" | ||
1467 | " for block %llu\n", | ||
1468 | mdname(mddev), | ||
1469 | bdevname(bio->bi_bdev, b), | ||
1470 | (unsigned long long)r1_bio->sector); | ||
1471 | md_done_sync(mddev, r1_bio->sectors, 0); | ||
1472 | put_buf(r1_bio); | ||
1473 | return; | ||
1474 | } | 1333 | } |
1475 | sectors -= s; | 1334 | } else |
1476 | sect += s; | 1335 | j = 0; |
1477 | idx ++; | 1336 | if (j >= 0) |
1337 | mddev->resync_mismatches += r1_bio->sectors; | ||
1338 | if (j < 0 || (test_bit(MD_RECOVERY_CHECK, &mddev->recovery) | ||
1339 | && test_bit(BIO_UPTODATE, &sbio->bi_flags))) { | ||
1340 | /* No need to write to this device. */ | ||
1341 | sbio->bi_end_io = NULL; | ||
1342 | rdev_dec_pending(conf->mirrors[i].rdev, mddev); | ||
1343 | continue; | ||
1344 | } | ||
1345 | /* fixup the bio for reuse */ | ||
1346 | sbio->bi_vcnt = vcnt; | ||
1347 | sbio->bi_size = r1_bio->sectors << 9; | ||
1348 | sbio->bi_idx = 0; | ||
1349 | sbio->bi_phys_segments = 0; | ||
1350 | sbio->bi_flags &= ~(BIO_POOL_MASK - 1); | ||
1351 | sbio->bi_flags |= 1 << BIO_UPTODATE; | ||
1352 | sbio->bi_next = NULL; | ||
1353 | sbio->bi_sector = r1_bio->sector + | ||
1354 | conf->mirrors[i].rdev->data_offset; | ||
1355 | sbio->bi_bdev = conf->mirrors[i].rdev->bdev; | ||
1356 | size = sbio->bi_size; | ||
1357 | for (j = 0; j < vcnt ; j++) { | ||
1358 | struct bio_vec *bi; | ||
1359 | bi = &sbio->bi_io_vec[j]; | ||
1360 | bi->bv_offset = 0; | ||
1361 | if (size > PAGE_SIZE) | ||
1362 | bi->bv_len = PAGE_SIZE; | ||
1363 | else | ||
1364 | bi->bv_len = size; | ||
1365 | size -= PAGE_SIZE; | ||
1366 | memcpy(page_address(bi->bv_page), | ||
1367 | page_address(pbio->bi_io_vec[j].bv_page), | ||
1368 | PAGE_SIZE); | ||
1478 | } | 1369 | } |
1479 | } | 1370 | } |
1371 | return 0; | ||
1372 | } | ||
1373 | |||
1374 | static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio) | ||
1375 | { | ||
1376 | conf_t *conf = mddev->private; | ||
1377 | int i; | ||
1378 | int disks = conf->raid_disks; | ||
1379 | struct bio *bio, *wbio; | ||
1380 | |||
1381 | bio = r1_bio->bios[r1_bio->read_disk]; | ||
1480 | 1382 | ||
1383 | if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) | ||
1384 | /* ouch - failed to read all of that. */ | ||
1385 | if (!fix_sync_read_error(r1_bio)) | ||
1386 | return; | ||
1387 | |||
1388 | if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) | ||
1389 | if (process_checks(r1_bio) < 0) | ||
1390 | return; | ||
1481 | /* | 1391 | /* |
1482 | * schedule writes | 1392 | * schedule writes |
1483 | */ | 1393 | */ |
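fix_sync_read_error(), factored out above, repairs a failed resync read in three passes over each chunk: read from alternate mirrors until one succeeds, write the good data back to the other members, then re-read from them to verify. A simplified userspace model with stubbed device I/O (error handling such as md_error() is omitted):

/* Userspace model of the fix_sync_read_error() sequence; dev_read() and
 * dev_write() are illustrative stubs, not kernel interfaces. */
#include <stdbool.h>
#include <stdio.h>

#define NDISKS 3

static bool dev_read(int d, long sect)  { (void)sect; return d != 0; }
static bool dev_write(int d, long sect) { (void)d; (void)sect; return true; }

static bool repair_chunk(int read_disk, long sect)
{
    int d = read_disk;
    bool ok = false;

    /* 1. try every mirror in turn, starting with the one that failed */
    do {
        if (dev_read(d, sect)) {
            ok = true;
            break;
        }
        d = (d + 1) % NDISKS;
    } while (d != read_disk);
    if (!ok)
        return false;           /* nowhere left to read from */

    /* 2. write the good data back to the other mirrors */
    for (int i = 0; i < NDISKS; i++)
        if (i != d && !dev_write(i, sect))
            printf("disk %d failed on write-back\n", i);

    /* 3. re-read what was just written to verify the repair */
    for (int i = 0; i < NDISKS; i++)
        if (i != d && !dev_read(i, sect))
            printf("disk %d failed on verify read\n", i);

    return true;
}

int main(void)
{
    printf("repair %s\n", repair_chunk(0, 4096) ? "succeeded" : "failed");
    return 0;
}

In the real code the write-back and verify loops walk backwards from the disk that supplied the data and skip any member whose bio is not an end_sync_read bio; a failure in either loop fails that member via md_error().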
@@ -1536,10 +1446,8 @@ static void fix_read_error(conf_t *conf, int read_disk, | |||
1536 | rdev = conf->mirrors[d].rdev; | 1446 | rdev = conf->mirrors[d].rdev; |
1537 | if (rdev && | 1447 | if (rdev && |
1538 | test_bit(In_sync, &rdev->flags) && | 1448 | test_bit(In_sync, &rdev->flags) && |
1539 | sync_page_io(rdev->bdev, | 1449 | sync_page_io(rdev, sect, s<<9, |
1540 | sect + rdev->data_offset, | 1450 | conf->tmppage, READ, false)) |
1541 | s<<9, | ||
1542 | conf->tmppage, READ)) | ||
1543 | success = 1; | 1451 | success = 1; |
1544 | else { | 1452 | else { |
1545 | d++; | 1453 | d++; |
@@ -1562,9 +1470,8 @@ static void fix_read_error(conf_t *conf, int read_disk, | |||
1562 | rdev = conf->mirrors[d].rdev; | 1470 | rdev = conf->mirrors[d].rdev; |
1563 | if (rdev && | 1471 | if (rdev && |
1564 | test_bit(In_sync, &rdev->flags)) { | 1472 | test_bit(In_sync, &rdev->flags)) { |
1565 | if (sync_page_io(rdev->bdev, | 1473 | if (sync_page_io(rdev, sect, s<<9, |
1566 | sect + rdev->data_offset, | 1474 | conf->tmppage, WRITE, false) |
1567 | s<<9, conf->tmppage, WRITE) | ||
1568 | == 0) | 1475 | == 0) |
1569 | /* Well, this device is dead */ | 1476 | /* Well, this device is dead */ |
1570 | md_error(mddev, rdev); | 1477 | md_error(mddev, rdev); |
@@ -1579,9 +1486,8 @@ static void fix_read_error(conf_t *conf, int read_disk, | |||
1579 | rdev = conf->mirrors[d].rdev; | 1486 | rdev = conf->mirrors[d].rdev; |
1580 | if (rdev && | 1487 | if (rdev && |
1581 | test_bit(In_sync, &rdev->flags)) { | 1488 | test_bit(In_sync, &rdev->flags)) { |
1582 | if (sync_page_io(rdev->bdev, | 1489 | if (sync_page_io(rdev, sect, s<<9, |
1583 | sect + rdev->data_offset, | 1490 | conf->tmppage, READ, false) |
1584 | s<<9, conf->tmppage, READ) | ||
1585 | == 0) | 1491 | == 0) |
1586 | /* Well, this device is dead */ | 1492 | /* Well, this device is dead */ |
1587 | md_error(mddev, rdev); | 1493 | md_error(mddev, rdev); |
@@ -1609,15 +1515,17 @@ static void raid1d(mddev_t *mddev) | |||
1609 | unsigned long flags; | 1515 | unsigned long flags; |
1610 | conf_t *conf = mddev->private; | 1516 | conf_t *conf = mddev->private; |
1611 | struct list_head *head = &conf->retry_list; | 1517 | struct list_head *head = &conf->retry_list; |
1612 | int unplug=0; | ||
1613 | mdk_rdev_t *rdev; | 1518 | mdk_rdev_t *rdev; |
1519 | struct blk_plug plug; | ||
1614 | 1520 | ||
1615 | md_check_recovery(mddev); | 1521 | md_check_recovery(mddev); |
1616 | 1522 | ||
1523 | blk_start_plug(&plug); | ||
1617 | for (;;) { | 1524 | for (;;) { |
1618 | char b[BDEVNAME_SIZE]; | 1525 | char b[BDEVNAME_SIZE]; |
1619 | 1526 | ||
1620 | unplug += flush_pending_writes(conf); | 1527 | if (atomic_read(&mddev->plug_cnt) == 0) |
1528 | flush_pending_writes(conf); | ||
1621 | 1529 | ||
1622 | spin_lock_irqsave(&conf->device_lock, flags); | 1530 | spin_lock_irqsave(&conf->device_lock, flags); |
1623 | if (list_empty(head)) { | 1531 | if (list_empty(head)) { |
@@ -1631,45 +1539,9 @@ static void raid1d(mddev_t *mddev) | |||
1631 | 1539 | ||
1632 | mddev = r1_bio->mddev; | 1540 | mddev = r1_bio->mddev; |
1633 | conf = mddev->private; | 1541 | conf = mddev->private; |
1634 | if (test_bit(R1BIO_IsSync, &r1_bio->state)) { | 1542 | if (test_bit(R1BIO_IsSync, &r1_bio->state)) |
1635 | sync_request_write(mddev, r1_bio); | 1543 | sync_request_write(mddev, r1_bio); |
1636 | unplug = 1; | 1544 | else { |
1637 | } else if (test_bit(R1BIO_BarrierRetry, &r1_bio->state)) { | ||
1638 | /* some requests in the r1bio were REQ_HARDBARRIER | ||
1639 | * requests which failed with -EOPNOTSUPP. Hohumm.. | ||
1640 | * Better resubmit without the barrier. | ||
1641 | * We know which devices to resubmit for, because | ||
1642 | * all others have had their bios[] entry cleared. | ||
1643 | * We already have a nr_pending reference on these rdevs. | ||
1644 | */ | ||
1645 | int i; | ||
1646 | const unsigned long do_sync = (r1_bio->master_bio->bi_rw & REQ_SYNC); | ||
1647 | clear_bit(R1BIO_BarrierRetry, &r1_bio->state); | ||
1648 | clear_bit(R1BIO_Barrier, &r1_bio->state); | ||
1649 | for (i=0; i < conf->raid_disks; i++) | ||
1650 | if (r1_bio->bios[i]) | ||
1651 | atomic_inc(&r1_bio->remaining); | ||
1652 | for (i=0; i < conf->raid_disks; i++) | ||
1653 | if (r1_bio->bios[i]) { | ||
1654 | struct bio_vec *bvec; | ||
1655 | int j; | ||
1656 | |||
1657 | bio = bio_clone(r1_bio->master_bio, GFP_NOIO); | ||
1658 | /* copy pages from the failed bio, as | ||
1659 | * this might be a write-behind device */ | ||
1660 | __bio_for_each_segment(bvec, bio, j, 0) | ||
1661 | bvec->bv_page = bio_iovec_idx(r1_bio->bios[i], j)->bv_page; | ||
1662 | bio_put(r1_bio->bios[i]); | ||
1663 | bio->bi_sector = r1_bio->sector + | ||
1664 | conf->mirrors[i].rdev->data_offset; | ||
1665 | bio->bi_bdev = conf->mirrors[i].rdev->bdev; | ||
1666 | bio->bi_end_io = raid1_end_write_request; | ||
1667 | bio->bi_rw = WRITE | do_sync; | ||
1668 | bio->bi_private = r1_bio; | ||
1669 | r1_bio->bios[i] = bio; | ||
1670 | generic_make_request(bio); | ||
1671 | } | ||
1672 | } else { | ||
1673 | int disk; | 1545 | int disk; |
1674 | 1546 | ||
1675 | /* we got a read error. Maybe the drive is bad. Maybe just | 1547 | /* we got a read error. Maybe the drive is bad. Maybe just |
@@ -1704,7 +1576,8 @@ static void raid1d(mddev_t *mddev) | |||
1704 | mddev->ro ? IO_BLOCKED : NULL; | 1576 | mddev->ro ? IO_BLOCKED : NULL; |
1705 | r1_bio->read_disk = disk; | 1577 | r1_bio->read_disk = disk; |
1706 | bio_put(bio); | 1578 | bio_put(bio); |
1707 | bio = bio_clone(r1_bio->master_bio, GFP_NOIO); | 1579 | bio = bio_clone_mddev(r1_bio->master_bio, |
1580 | GFP_NOIO, mddev); | ||
1708 | r1_bio->bios[r1_bio->read_disk] = bio; | 1581 | r1_bio->bios[r1_bio->read_disk] = bio; |
1709 | rdev = conf->mirrors[disk].rdev; | 1582 | rdev = conf->mirrors[disk].rdev; |
1710 | if (printk_ratelimit()) | 1583 | if (printk_ratelimit()) |
@@ -1718,14 +1591,12 @@ static void raid1d(mddev_t *mddev) | |||
1718 | bio->bi_end_io = raid1_end_read_request; | 1591 | bio->bi_end_io = raid1_end_read_request; |
1719 | bio->bi_rw = READ | do_sync; | 1592 | bio->bi_rw = READ | do_sync; |
1720 | bio->bi_private = r1_bio; | 1593 | bio->bi_private = r1_bio; |
1721 | unplug = 1; | ||
1722 | generic_make_request(bio); | 1594 | generic_make_request(bio); |
1723 | } | 1595 | } |
1724 | } | 1596 | } |
1725 | cond_resched(); | 1597 | cond_resched(); |
1726 | } | 1598 | } |
1727 | if (unplug) | 1599 | blk_finish_plug(&plug); |
1728 | unplug_slaves(mddev); | ||
1729 | } | 1600 | } |
1730 | 1601 | ||
1731 | 1602 | ||
@@ -1763,7 +1634,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i | |||
1763 | int i; | 1634 | int i; |
1764 | int wonly = -1; | 1635 | int wonly = -1; |
1765 | int write_targets = 0, read_targets = 0; | 1636 | int write_targets = 0, read_targets = 0; |
1766 | int sync_blocks; | 1637 | sector_t sync_blocks; |
1767 | int still_degraded = 0; | 1638 | int still_degraded = 0; |
1768 | 1639 | ||
1769 | if (!conf->r1buf_pool) | 1640 | if (!conf->r1buf_pool) |
@@ -1813,11 +1684,11 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i | |||
1813 | msleep_interruptible(1000); | 1684 | msleep_interruptible(1000); |
1814 | 1685 | ||
1815 | bitmap_cond_end_sync(mddev->bitmap, sector_nr); | 1686 | bitmap_cond_end_sync(mddev->bitmap, sector_nr); |
1687 | r1_bio = mempool_alloc(conf->r1buf_pool, GFP_NOIO); | ||
1816 | raise_barrier(conf); | 1688 | raise_barrier(conf); |
1817 | 1689 | ||
1818 | conf->next_resync = sector_nr; | 1690 | conf->next_resync = sector_nr; |
1819 | 1691 | ||
1820 | r1_bio = mempool_alloc(conf->r1buf_pool, GFP_NOIO); | ||
1821 | rcu_read_lock(); | 1692 | rcu_read_lock(); |
1822 | /* | 1693 | /* |
1823 | * If we get a correctably read error during resync or recovery, | 1694 | * If we get a correctably read error during resync or recovery, |
@@ -2029,7 +1900,6 @@ static conf_t *setup_conf(mddev_t *mddev) | |||
2029 | init_waitqueue_head(&conf->wait_barrier); | 1900 | init_waitqueue_head(&conf->wait_barrier); |
2030 | 1901 | ||
2031 | bio_list_init(&conf->pending_bio_list); | 1902 | bio_list_init(&conf->pending_bio_list); |
2032 | bio_list_init(&conf->flushing_bio_list); | ||
2033 | 1903 | ||
2034 | conf->last_used = -1; | 1904 | conf->last_used = -1; |
2035 | for (i = 0; i < conf->raid_disks; i++) { | 1905 | for (i = 0; i < conf->raid_disks; i++) { |
@@ -2107,8 +1977,9 @@ static int run(mddev_t *mddev) | |||
2107 | if (IS_ERR(conf)) | 1977 | if (IS_ERR(conf)) |
2108 | return PTR_ERR(conf); | 1978 | return PTR_ERR(conf); |
2109 | 1979 | ||
2110 | mddev->queue->queue_lock = &conf->device_lock; | ||
2111 | list_for_each_entry(rdev, &mddev->disks, same_set) { | 1980 | list_for_each_entry(rdev, &mddev->disks, same_set) { |
1981 | if (!mddev->gendisk) | ||
1982 | continue; | ||
2112 | disk_stack_limits(mddev->gendisk, rdev->bdev, | 1983 | disk_stack_limits(mddev->gendisk, rdev->bdev, |
2113 | rdev->data_offset << 9); | 1984 | rdev->data_offset << 9); |
2114 | /* as we don't honour merge_bvec_fn, we must never risk | 1985 | /* as we don't honour merge_bvec_fn, we must never risk |
@@ -2150,11 +2021,11 @@ static int run(mddev_t *mddev) | |||
2150 | 2021 | ||
2151 | md_set_array_sectors(mddev, raid1_size(mddev, 0, 0)); | 2022 | md_set_array_sectors(mddev, raid1_size(mddev, 0, 0)); |
2152 | 2023 | ||
2153 | mddev->queue->unplug_fn = raid1_unplug; | 2024 | if (mddev->queue) { |
2154 | mddev->queue->backing_dev_info.congested_fn = raid1_congested; | 2025 | mddev->queue->backing_dev_info.congested_fn = raid1_congested; |
2155 | mddev->queue->backing_dev_info.congested_data = mddev; | 2026 | mddev->queue->backing_dev_info.congested_data = mddev; |
2156 | md_integrity_register(mddev); | 2027 | } |
2157 | return 0; | 2028 | return md_integrity_register(mddev); |
2158 | } | 2029 | } |
2159 | 2030 | ||
2160 | static int stop(mddev_t *mddev) | 2031 | static int stop(mddev_t *mddev) |
@@ -2176,7 +2047,6 @@ static int stop(mddev_t *mddev) | |||
2176 | 2047 | ||
2177 | md_unregister_thread(mddev->thread); | 2048 | md_unregister_thread(mddev->thread); |
2178 | mddev->thread = NULL; | 2049 | mddev->thread = NULL; |
2179 | blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ | ||
2180 | if (conf->r1bio_pool) | 2050 | if (conf->r1bio_pool) |
2181 | mempool_destroy(conf->r1bio_pool); | 2051 | mempool_destroy(conf->r1bio_pool); |
2182 | kfree(conf->mirrors); | 2052 | kfree(conf->mirrors); |
@@ -2201,7 +2071,7 @@ static int raid1_resize(mddev_t *mddev, sector_t sectors) | |||
2201 | set_capacity(mddev->gendisk, mddev->array_sectors); | 2071 | set_capacity(mddev->gendisk, mddev->array_sectors); |
2202 | revalidate_disk(mddev->gendisk); | 2072 | revalidate_disk(mddev->gendisk); |
2203 | if (sectors > mddev->dev_sectors && | 2073 | if (sectors > mddev->dev_sectors && |
2204 | mddev->recovery_cp == MaxSector) { | 2074 | mddev->recovery_cp > mddev->dev_sectors) { |
2205 | mddev->recovery_cp = mddev->dev_sectors; | 2075 | mddev->recovery_cp = mddev->dev_sectors; |
2206 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | 2076 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); |
2207 | } | 2077 | } |
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h index 5f2d443ae28a..e743a64fac4f 100644 --- a/drivers/md/raid1.h +++ b/drivers/md/raid1.h | |||
@@ -35,8 +35,6 @@ struct r1_private_data_s { | |||
35 | struct list_head retry_list; | 35 | struct list_head retry_list; |
36 | /* queue pending writes and submit them on unplug */ | 36 | /* queue pending writes and submit them on unplug */ |
37 | struct bio_list pending_bio_list; | 37 | struct bio_list pending_bio_list; |
38 | /* queue of writes that have been unplugged */ | ||
39 | struct bio_list flushing_bio_list; | ||
40 | 38 | ||
41 | /* for use when syncing mirrors: */ | 39 | /* for use when syncing mirrors: */ |
42 | 40 | ||
@@ -96,7 +94,9 @@ struct r1bio_s { | |||
96 | int read_disk; | 94 | int read_disk; |
97 | 95 | ||
98 | struct list_head retry_list; | 96 | struct list_head retry_list; |
99 | struct bitmap_update *bitmap_update; | 97 | /* Next two are only valid when R1BIO_BehindIO is set */ |
98 | struct page **behind_pages; | ||
99 | int behind_page_count; | ||
100 | /* | 100 | /* |
101 | * if the IO is in WRITE direction, then multiple bios are used. | 101 | * if the IO is in WRITE direction, then multiple bios are used. |
102 | * We choose the number when they are allocated. | 102 | * We choose the number when they are allocated. |
@@ -117,8 +117,6 @@ struct r1bio_s { | |||
117 | #define R1BIO_IsSync 1 | 117 | #define R1BIO_IsSync 1 |
118 | #define R1BIO_Degraded 2 | 118 | #define R1BIO_Degraded 2 |
119 | #define R1BIO_BehindIO 3 | 119 | #define R1BIO_BehindIO 3 |
120 | #define R1BIO_Barrier 4 | ||
121 | #define R1BIO_BarrierRetry 5 | ||
122 | /* For write-behind requests, we call bi_end_io when | 120 | /* For write-behind requests, we call bi_end_io when |
123 | * the last non-write-behind device completes, providing | 121 | * the last non-write-behind device completes, providing |
124 | * any write was successful. Otherwise we call when | 122 | * any write was successful. Otherwise we call when |
@@ -128,4 +126,6 @@ struct r1bio_s { | |||
128 | */ | 126 | */ |
129 | #define R1BIO_Returned 6 | 127 | #define R1BIO_Returned 6 |
130 | 128 | ||
129 | extern int md_raid1_congested(mddev_t *mddev, int bits); | ||
130 | |||
131 | #endif | 131 | #endif |
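Editor's note on the raid1.h comment above about write-behind completion: the original request is acknowledged when the last non-write-behind copy completes, while write-behind copies are allowed to finish later. A minimal userspace model of that counting rule follows; the structure and function names are invented for the sketch, and the error-path nuance ("providing any write was successful") is deliberately left out.

/*
 * Hedged sketch (userspace model, not the md code): acknowledge the
 * original bio once every copy that is NOT marked write-behind has
 * completed; write-behind copies may still be in flight and finish
 * later without the caller waiting for them.
 */
#include <stdio.h>

struct r1_write {
    int remaining_fast;   /* non-write-behind copies still in flight */
    int acked;            /* upper layer already notified? */
};

static void copy_done(struct r1_write *w, int write_behind)
{
    if (write_behind) {
        printf("write-behind copy finished (caller not waiting)\n");
        return;
    }
    if (--w->remaining_fast == 0 && !w->acked) {
        w->acked = 1;
        printf("last non-write-behind copy done: ack original bio\n");
    }
}

int main(void)
{
    /* two ordinary mirrors plus one write-behind (write-mostly) mirror */
    struct r1_write w = { .remaining_fast = 2, .acked = 0 };

    copy_done(&w, 0);   /* first ordinary copy */
    copy_done(&w, 0);   /* second ordinary copy: original bio acked */
    copy_done(&w, 1);   /* write-behind copy trickles in afterwards */
    return 0;
}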
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 84718383124d..6e846688962f 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c | |||
@@ -5,7 +5,7 @@ | |||
5 | * | 5 | * |
6 | * RAID-10 support for md. | 6 | * RAID-10 support for md. |
7 | * | 7 | * |
8 | * Base on code in raid1.c. See raid1.c for futher copyright information. | 8 | * Base on code in raid1.c. See raid1.c for further copyright information. |
9 | * | 9 | * |
10 | * | 10 | * |
11 | * This program is free software; you can redistribute it and/or modify | 11 | * This program is free software; you can redistribute it and/or modify |
@@ -57,23 +57,16 @@ | |||
57 | */ | 57 | */ |
58 | #define NR_RAID10_BIOS 256 | 58 | #define NR_RAID10_BIOS 256 |
59 | 59 | ||
60 | static void unplug_slaves(mddev_t *mddev); | ||
61 | |||
62 | static void allow_barrier(conf_t *conf); | 60 | static void allow_barrier(conf_t *conf); |
63 | static void lower_barrier(conf_t *conf); | 61 | static void lower_barrier(conf_t *conf); |
64 | 62 | ||
65 | static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data) | 63 | static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data) |
66 | { | 64 | { |
67 | conf_t *conf = data; | 65 | conf_t *conf = data; |
68 | r10bio_t *r10_bio; | ||
69 | int size = offsetof(struct r10bio_s, devs[conf->copies]); | 66 | int size = offsetof(struct r10bio_s, devs[conf->copies]); |
70 | 67 | ||
71 | /* allocate a r10bio with room for raid_disks entries in the bios array */ | 68 | /* allocate a r10bio with room for raid_disks entries in the bios array */ |
72 | r10_bio = kzalloc(size, gfp_flags); | 69 | return kzalloc(size, gfp_flags); |
73 | if (!r10_bio && conf->mddev) | ||
74 | unplug_slaves(conf->mddev); | ||
75 | |||
76 | return r10_bio; | ||
77 | } | 70 | } |
78 | 71 | ||
79 | static void r10bio_pool_free(void *r10_bio, void *data) | 72 | static void r10bio_pool_free(void *r10_bio, void *data) |
@@ -106,10 +99,8 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data) | |||
106 | int nalloc; | 99 | int nalloc; |
107 | 100 | ||
108 | r10_bio = r10bio_pool_alloc(gfp_flags, conf); | 101 | r10_bio = r10bio_pool_alloc(gfp_flags, conf); |
109 | if (!r10_bio) { | 102 | if (!r10_bio) |
110 | unplug_slaves(conf->mddev); | ||
111 | return NULL; | 103 | return NULL; |
112 | } | ||
113 | 104 | ||
114 | if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery)) | 105 | if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery)) |
115 | nalloc = conf->copies; /* resync */ | 106 | nalloc = conf->copies; /* resync */ |
@@ -120,7 +111,7 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data) | |||
120 | * Allocate bios. | 111 | * Allocate bios. |
121 | */ | 112 | */ |
122 | for (j = nalloc ; j-- ; ) { | 113 | for (j = nalloc ; j-- ; ) { |
123 | bio = bio_alloc(gfp_flags, RESYNC_PAGES); | 114 | bio = bio_kmalloc(gfp_flags, RESYNC_PAGES); |
124 | if (!bio) | 115 | if (!bio) |
125 | goto out_free_bio; | 116 | goto out_free_bio; |
126 | r10_bio->devs[j].bio = bio; | 117 | r10_bio->devs[j].bio = bio; |
@@ -280,9 +271,10 @@ static void raid10_end_read_request(struct bio *bio, int error) | |||
280 | */ | 271 | */ |
281 | set_bit(R10BIO_Uptodate, &r10_bio->state); | 272 | set_bit(R10BIO_Uptodate, &r10_bio->state); |
282 | raid_end_bio_io(r10_bio); | 273 | raid_end_bio_io(r10_bio); |
274 | rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev); | ||
283 | } else { | 275 | } else { |
284 | /* | 276 | /* |
285 | * oops, read error: | 277 | * oops, read error - keep the refcount on the rdev |
286 | */ | 278 | */ |
287 | char b[BDEVNAME_SIZE]; | 279 | char b[BDEVNAME_SIZE]; |
288 | if (printk_ratelimit()) | 280 | if (printk_ratelimit()) |
@@ -291,8 +283,6 @@ static void raid10_end_read_request(struct bio *bio, int error) | |||
291 | bdevname(conf->mirrors[dev].rdev->bdev,b), (unsigned long long)r10_bio->sector); | 283 | bdevname(conf->mirrors[dev].rdev->bdev,b), (unsigned long long)r10_bio->sector); |
292 | reschedule_retry(r10_bio); | 284 | reschedule_retry(r10_bio); |
293 | } | 285 | } |
294 | |||
295 | rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev); | ||
296 | } | 286 | } |
297 | 287 | ||
298 | static void raid10_end_write_request(struct bio *bio, int error) | 288 | static void raid10_end_write_request(struct bio *bio, int error) |
@@ -349,14 +339,14 @@ static void raid10_end_write_request(struct bio *bio, int error) | |||
349 | 339 | ||
350 | /* | 340 | /* |
351 | * RAID10 layout manager | 341 | * RAID10 layout manager |
352 | * Aswell as the chunksize and raid_disks count, there are two | 342 | * As well as the chunksize and raid_disks count, there are two |
353 | * parameters: near_copies and far_copies. | 343 | * parameters: near_copies and far_copies. |
354 | * near_copies * far_copies must be <= raid_disks. | 344 | * near_copies * far_copies must be <= raid_disks. |
355 | * Normally one of these will be 1. | 345 | * Normally one of these will be 1. |
356 | * If both are 1, we get raid0. | 346 | * If both are 1, we get raid0. |
357 | * If near_copies == raid_disks, we get raid1. | 347 | * If near_copies == raid_disks, we get raid1. |
358 | * | 348 | * |
359 | * Chunks are layed out in raid0 style with near_copies copies of the | 349 | * Chunks are laid out in raid0 style with near_copies copies of the |
360 | * first chunk, followed by near_copies copies of the next chunk and | 350 | * first chunk, followed by near_copies copies of the next chunk and |
361 | * so on. | 351 | * so on. |
362 | * If far_copies > 1, then after 1/far_copies of the array has been assigned | 352 | * If far_copies > 1, then after 1/far_copies of the array has been assigned |
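Editor's note on the layout-manager comment above: a tiny userspace illustration of the "near copies" placement can make the geometry concrete. This sketch assumes raid_disks is a multiple of near_copies and ignores far_copies and far_offset entirely, so it is not raid10_find_phys(); it only shows that each chunk lands on near_copies adjacent devices at the same stripe row.

/*
 * Hedged sketch: simplified near-copies placement, illustration only.
 */
#include <stdio.h>

int main(void)
{
    const int raid_disks = 4;
    const int near_copies = 2;
    const int groups = raid_disks / near_copies;  /* distinct chunks per row */

    for (int chunk = 0; chunk < 6; chunk++) {
        int row = chunk / groups;
        int first_dev = (chunk % groups) * near_copies;

        printf("chunk %d: row %d on devices", chunk, row);
        for (int copy = 0; copy < near_copies; copy++)
            printf(" %d", first_dev + copy);
        printf("\n");
    }
    return 0;
}

With 4 disks and near_copies = 2 this prints chunk 0 on devices 0 and 1, chunk 1 on devices 2 and 3, chunk 2 back on devices 0 and 1 in the next row, and so on, which is the raid0-style layout with duplicated chunks the comment describes.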
@@ -497,13 +487,19 @@ static int raid10_mergeable_bvec(struct request_queue *q, | |||
497 | static int read_balance(conf_t *conf, r10bio_t *r10_bio) | 487 | static int read_balance(conf_t *conf, r10bio_t *r10_bio) |
498 | { | 488 | { |
499 | const sector_t this_sector = r10_bio->sector; | 489 | const sector_t this_sector = r10_bio->sector; |
500 | int disk, slot, nslot; | 490 | int disk, slot; |
501 | const int sectors = r10_bio->sectors; | 491 | const int sectors = r10_bio->sectors; |
502 | sector_t new_distance, current_distance; | 492 | sector_t new_distance, best_dist; |
503 | mdk_rdev_t *rdev; | 493 | mdk_rdev_t *rdev; |
494 | int do_balance; | ||
495 | int best_slot; | ||
504 | 496 | ||
505 | raid10_find_phys(conf, r10_bio); | 497 | raid10_find_phys(conf, r10_bio); |
506 | rcu_read_lock(); | 498 | rcu_read_lock(); |
499 | retry: | ||
500 | best_slot = -1; | ||
501 | best_dist = MaxSector; | ||
502 | do_balance = 1; | ||
507 | /* | 503 | /* |
508 | * Check if we can balance. We can balance on the whole | 504 | * Check if we can balance. We can balance on the whole |
509 | * device if no resync is going on (recovery is ok), or below | 505 | * device if no resync is going on (recovery is ok), or below |
@@ -511,123 +507,64 @@ static int read_balance(conf_t *conf, r10bio_t *r10_bio) | |||
511 | * above the resync window. | 507 | * above the resync window. |
512 | */ | 508 | */ |
513 | if (conf->mddev->recovery_cp < MaxSector | 509 | if (conf->mddev->recovery_cp < MaxSector |
514 | && (this_sector + sectors >= conf->next_resync)) { | 510 | && (this_sector + sectors >= conf->next_resync)) |
515 | /* make sure that disk is operational */ | 511 | do_balance = 0; |
516 | slot = 0; | ||
517 | disk = r10_bio->devs[slot].devnum; | ||
518 | 512 | ||
519 | while ((rdev = rcu_dereference(conf->mirrors[disk].rdev)) == NULL || | 513 | for (slot = 0; slot < conf->copies ; slot++) { |
520 | r10_bio->devs[slot].bio == IO_BLOCKED || | 514 | if (r10_bio->devs[slot].bio == IO_BLOCKED) |
521 | !test_bit(In_sync, &rdev->flags)) { | 515 | continue; |
522 | slot++; | ||
523 | if (slot == conf->copies) { | ||
524 | slot = 0; | ||
525 | disk = -1; | ||
526 | break; | ||
527 | } | ||
528 | disk = r10_bio->devs[slot].devnum; | ||
529 | } | ||
530 | goto rb_out; | ||
531 | } | ||
532 | |||
533 | |||
534 | /* make sure the disk is operational */ | ||
535 | slot = 0; | ||
536 | disk = r10_bio->devs[slot].devnum; | ||
537 | while ((rdev=rcu_dereference(conf->mirrors[disk].rdev)) == NULL || | ||
538 | r10_bio->devs[slot].bio == IO_BLOCKED || | ||
539 | !test_bit(In_sync, &rdev->flags)) { | ||
540 | slot ++; | ||
541 | if (slot == conf->copies) { | ||
542 | disk = -1; | ||
543 | goto rb_out; | ||
544 | } | ||
545 | disk = r10_bio->devs[slot].devnum; | 516 | disk = r10_bio->devs[slot].devnum; |
546 | } | 517 | rdev = rcu_dereference(conf->mirrors[disk].rdev); |
547 | 518 | if (rdev == NULL) | |
548 | 519 | continue; | |
549 | current_distance = abs(r10_bio->devs[slot].addr - | 520 | if (!test_bit(In_sync, &rdev->flags)) |
550 | conf->mirrors[disk].head_position); | ||
551 | |||
552 | /* Find the disk whose head is closest, | ||
553 | * or - for far > 1 - find the closest to partition beginning */ | ||
554 | |||
555 | for (nslot = slot; nslot < conf->copies; nslot++) { | ||
556 | int ndisk = r10_bio->devs[nslot].devnum; | ||
557 | |||
558 | |||
559 | if ((rdev=rcu_dereference(conf->mirrors[ndisk].rdev)) == NULL || | ||
560 | r10_bio->devs[nslot].bio == IO_BLOCKED || | ||
561 | !test_bit(In_sync, &rdev->flags)) | ||
562 | continue; | 521 | continue; |
563 | 522 | ||
523 | if (!do_balance) | ||
524 | break; | ||
525 | |||
564 | /* This optimisation is debatable, and completely destroys | 526 | /* This optimisation is debatable, and completely destroys |
565 | * sequential read speed for 'far copies' arrays. So only | 527 | * sequential read speed for 'far copies' arrays. So only |
566 | * keep it for 'near' arrays, and review those later. | 528 | * keep it for 'near' arrays, and review those later. |
567 | */ | 529 | */ |
568 | if (conf->near_copies > 1 && !atomic_read(&rdev->nr_pending)) { | 530 | if (conf->near_copies > 1 && !atomic_read(&rdev->nr_pending)) |
569 | disk = ndisk; | ||
570 | slot = nslot; | ||
571 | break; | 531 | break; |
572 | } | ||
573 | 532 | ||
574 | /* for far > 1 always use the lowest address */ | 533 | /* for far > 1 always use the lowest address */ |
575 | if (conf->far_copies > 1) | 534 | if (conf->far_copies > 1) |
576 | new_distance = r10_bio->devs[nslot].addr; | 535 | new_distance = r10_bio->devs[slot].addr; |
577 | else | 536 | else |
578 | new_distance = abs(r10_bio->devs[nslot].addr - | 537 | new_distance = abs(r10_bio->devs[slot].addr - |
579 | conf->mirrors[ndisk].head_position); | 538 | conf->mirrors[disk].head_position); |
580 | if (new_distance < current_distance) { | 539 | if (new_distance < best_dist) { |
581 | current_distance = new_distance; | 540 | best_dist = new_distance; |
582 | disk = ndisk; | 541 | best_slot = slot; |
583 | slot = nslot; | ||
584 | } | 542 | } |
585 | } | 543 | } |
544 | if (slot == conf->copies) | ||
545 | slot = best_slot; | ||
586 | 546 | ||
587 | rb_out: | 547 | if (slot >= 0) { |
588 | r10_bio->read_slot = slot; | 548 | disk = r10_bio->devs[slot].devnum; |
589 | /* conf->next_seq_sect = this_sector + sectors;*/ | 549 | rdev = rcu_dereference(conf->mirrors[disk].rdev); |
590 | 550 | if (!rdev) | |
591 | if (disk >= 0 && (rdev=rcu_dereference(conf->mirrors[disk].rdev))!= NULL) | 551 | goto retry; |
592 | atomic_inc(&conf->mirrors[disk].rdev->nr_pending); | 552 | atomic_inc(&rdev->nr_pending); |
593 | else | 553 | if (test_bit(Faulty, &rdev->flags)) { |
554 | /* Cannot risk returning a device that failed | ||
555 | * before we inc'ed nr_pending | ||
556 | */ | ||
557 | rdev_dec_pending(rdev, conf->mddev); | ||
558 | goto retry; | ||
559 | } | ||
560 | r10_bio->read_slot = slot; | ||
561 | } else | ||
594 | disk = -1; | 562 | disk = -1; |
595 | rcu_read_unlock(); | 563 | rcu_read_unlock(); |
596 | 564 | ||
597 | return disk; | 565 | return disk; |
598 | } | 566 | } |
599 | 567 | ||
600 | static void unplug_slaves(mddev_t *mddev) | ||
601 | { | ||
602 | conf_t *conf = mddev->private; | ||
603 | int i; | ||
604 | |||
605 | rcu_read_lock(); | ||
606 | for (i=0; i < conf->raid_disks; i++) { | ||
607 | mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); | ||
608 | if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) { | ||
609 | struct request_queue *r_queue = bdev_get_queue(rdev->bdev); | ||
610 | |||
611 | atomic_inc(&rdev->nr_pending); | ||
612 | rcu_read_unlock(); | ||
613 | |||
614 | blk_unplug(r_queue); | ||
615 | |||
616 | rdev_dec_pending(rdev, mddev); | ||
617 | rcu_read_lock(); | ||
618 | } | ||
619 | } | ||
620 | rcu_read_unlock(); | ||
621 | } | ||
622 | |||
623 | static void raid10_unplug(struct request_queue *q) | ||
624 | { | ||
625 | mddev_t *mddev = q->queuedata; | ||
626 | |||
627 | unplug_slaves(q->queuedata); | ||
628 | md_wakeup_thread(mddev->thread); | ||
629 | } | ||
630 | |||
631 | static int raid10_congested(void *data, int bits) | 568 | static int raid10_congested(void *data, int bits) |
632 | { | 569 | { |
633 | mddev_t *mddev = data; | 570 | mddev_t *mddev = data; |
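Editor's note on the rewritten read_balance() above: the new loop walks every copy once, skips unusable slots, and remembers the slot whose device head is closest to the target address (or simply the lowest address when far_copies > 1). The sketch below is a userspace model of just that selection; the rdev reference counting, RCU, and the retry taken when a device turns Faulty after nr_pending is raised are left out, and all names are invented for the sketch.

/*
 * Hedged sketch (userspace model): distance-based slot selection.
 */
#include <stdio.h>
#include <stdlib.h>

struct copy {
    long long addr;           /* sector of this copy on its device */
    long long head_position;  /* where that device's head last was */
    int usable;               /* in-sync and not blocked */
};

static int pick_slot(const struct copy *c, int ncopies, int far_copies)
{
    long long best_dist = -1;
    int best_slot = -1;

    for (int slot = 0; slot < ncopies; slot++) {
        long long dist;

        if (!c[slot].usable)
            continue;
        if (far_copies > 1)
            dist = c[slot].addr;  /* always prefer the lowest address */
        else
            dist = llabs(c[slot].addr - c[slot].head_position);
        if (best_slot < 0 || dist < best_dist) {
            best_dist = dist;
            best_slot = slot;
        }
    }
    return best_slot;             /* -1 means no readable copy */
}

int main(void)
{
    struct copy copies[2] = {
        { .addr = 1000, .head_position = 5000, .usable = 1 },
        { .addr = 1000, .head_position = 1100, .usable = 1 },
    };

    printf("chosen slot: %d\n", pick_slot(copies, 2, 1)); /* prints 1 */
    return 0;
}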
@@ -649,20 +586,16 @@ static int raid10_congested(void *data, int bits) | |||
649 | return ret; | 586 | return ret; |
650 | } | 587 | } |
651 | 588 | ||
652 | static int flush_pending_writes(conf_t *conf) | 589 | static void flush_pending_writes(conf_t *conf) |
653 | { | 590 | { |
654 | /* Any writes that have been queued but are awaiting | 591 | /* Any writes that have been queued but are awaiting |
655 | * bitmap updates get flushed here. | 592 | * bitmap updates get flushed here. |
656 | * We return 1 if any requests were actually submitted. | ||
657 | */ | 593 | */ |
658 | int rv = 0; | ||
659 | |||
660 | spin_lock_irq(&conf->device_lock); | 594 | spin_lock_irq(&conf->device_lock); |
661 | 595 | ||
662 | if (conf->pending_bio_list.head) { | 596 | if (conf->pending_bio_list.head) { |
663 | struct bio *bio; | 597 | struct bio *bio; |
664 | bio = bio_list_get(&conf->pending_bio_list); | 598 | bio = bio_list_get(&conf->pending_bio_list); |
665 | blk_remove_plug(conf->mddev->queue); | ||
666 | spin_unlock_irq(&conf->device_lock); | 599 | spin_unlock_irq(&conf->device_lock); |
667 | /* flush any pending bitmap writes to disk | 600 | /* flush any pending bitmap writes to disk |
668 | * before proceeding w/ I/O */ | 601 | * before proceeding w/ I/O */ |
@@ -674,11 +607,10 @@ static int flush_pending_writes(conf_t *conf) | |||
674 | generic_make_request(bio); | 607 | generic_make_request(bio); |
675 | bio = next; | 608 | bio = next; |
676 | } | 609 | } |
677 | rv = 1; | ||
678 | } else | 610 | } else |
679 | spin_unlock_irq(&conf->device_lock); | 611 | spin_unlock_irq(&conf->device_lock); |
680 | return rv; | ||
681 | } | 612 | } |
613 | |||
682 | /* Barriers.... | 614 | /* Barriers.... |
683 | * Sometimes we need to suspend IO while we do something else, | 615 | * Sometimes we need to suspend IO while we do something else, |
684 | * either some resync/recovery, or reconfigure the array. | 616 | * either some resync/recovery, or reconfigure the array. |
@@ -708,17 +640,15 @@ static void raise_barrier(conf_t *conf, int force) | |||
708 | 640 | ||
709 | /* Wait until no block IO is waiting (unless 'force') */ | 641 | /* Wait until no block IO is waiting (unless 'force') */ |
710 | wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting, | 642 | wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting, |
711 | conf->resync_lock, | 643 | conf->resync_lock, ); |
712 | raid10_unplug(conf->mddev->queue)); | ||
713 | 644 | ||
714 | /* block any new IO from starting */ | 645 | /* block any new IO from starting */ |
715 | conf->barrier++; | 646 | conf->barrier++; |
716 | 647 | ||
717 | /* No wait for all pending IO to complete */ | 648 | /* Now wait for all pending IO to complete */ |
718 | wait_event_lock_irq(conf->wait_barrier, | 649 | wait_event_lock_irq(conf->wait_barrier, |
719 | !conf->nr_pending && conf->barrier < RESYNC_DEPTH, | 650 | !conf->nr_pending && conf->barrier < RESYNC_DEPTH, |
720 | conf->resync_lock, | 651 | conf->resync_lock, ); |
721 | raid10_unplug(conf->mddev->queue)); | ||
722 | 652 | ||
723 | spin_unlock_irq(&conf->resync_lock); | 653 | spin_unlock_irq(&conf->resync_lock); |
724 | } | 654 | } |
@@ -739,7 +669,7 @@ static void wait_barrier(conf_t *conf) | |||
739 | conf->nr_waiting++; | 669 | conf->nr_waiting++; |
740 | wait_event_lock_irq(conf->wait_barrier, !conf->barrier, | 670 | wait_event_lock_irq(conf->wait_barrier, !conf->barrier, |
741 | conf->resync_lock, | 671 | conf->resync_lock, |
742 | raid10_unplug(conf->mddev->queue)); | 672 | ); |
743 | conf->nr_waiting--; | 673 | conf->nr_waiting--; |
744 | } | 674 | } |
745 | conf->nr_pending++; | 675 | conf->nr_pending++; |
@@ -775,8 +705,8 @@ static void freeze_array(conf_t *conf) | |||
775 | wait_event_lock_irq(conf->wait_barrier, | 705 | wait_event_lock_irq(conf->wait_barrier, |
776 | conf->nr_pending == conf->nr_queued+1, | 706 | conf->nr_pending == conf->nr_queued+1, |
777 | conf->resync_lock, | 707 | conf->resync_lock, |
778 | ({ flush_pending_writes(conf); | 708 | flush_pending_writes(conf)); |
779 | raid10_unplug(conf->mddev->queue); })); | 709 | |
780 | spin_unlock_irq(&conf->resync_lock); | 710 | spin_unlock_irq(&conf->resync_lock); |
781 | } | 711 | } |
782 | 712 | ||
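Editor's note on the barrier hunks above (raise_barrier, wait_barrier, freeze_array): normal I/O bumps nr_pending when admitted, and resync raises a barrier count that blocks new I/O until it is lowered again. The real code sleeps on wait queues under resync_lock; the userspace model below only reports what each caller would do, and its names are invented for the sketch.

/*
 * Hedged sketch (userspace model): the counter discipline behind the
 * raid10 resync barrier.
 */
#include <stdio.h>

struct barrier_state {
    int nr_pending;   /* normal I/O currently in flight */
    int barrier;      /* resync/reconfig sections raised */
};

static int try_start_io(struct barrier_state *s)
{
    if (s->barrier) {
        printf("I/O must wait: barrier=%d\n", s->barrier);
        return 0;                 /* wait_barrier() would sleep here */
    }
    s->nr_pending++;
    return 1;
}

static void end_io(struct barrier_state *s)
{
    s->nr_pending--;              /* allow_barrier() wakes the waiters */
}

static int try_raise_barrier(struct barrier_state *s)
{
    s->barrier++;                 /* block any new I/O from starting */
    if (s->nr_pending) {
        printf("resync must wait: %d I/Os still pending\n", s->nr_pending);
        return 0;                 /* raise_barrier() would sleep here */
    }
    return 1;
}

int main(void)
{
    struct barrier_state s = { 0, 0 };

    try_start_io(&s);             /* admitted, nr_pending = 1 */
    try_raise_barrier(&s);        /* barrier raised, but must wait */
    end_io(&s);                   /* last pending I/O finishes */
    printf("resync may proceed: pending=%d barrier=%d\n",
           s.nr_pending, s.barrier);
    try_start_io(&s);             /* refused until barrier is lowered */
    return 0;
}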
@@ -800,12 +730,13 @@ static int make_request(mddev_t *mddev, struct bio * bio) | |||
800 | int chunk_sects = conf->chunk_mask + 1; | 730 | int chunk_sects = conf->chunk_mask + 1; |
801 | const int rw = bio_data_dir(bio); | 731 | const int rw = bio_data_dir(bio); |
802 | const unsigned long do_sync = (bio->bi_rw & REQ_SYNC); | 732 | const unsigned long do_sync = (bio->bi_rw & REQ_SYNC); |
803 | struct bio_list bl; | 733 | const unsigned long do_fua = (bio->bi_rw & REQ_FUA); |
804 | unsigned long flags; | 734 | unsigned long flags; |
805 | mdk_rdev_t *blocked_rdev; | 735 | mdk_rdev_t *blocked_rdev; |
736 | int plugged; | ||
806 | 737 | ||
807 | if (unlikely(bio->bi_rw & REQ_HARDBARRIER)) { | 738 | if (unlikely(bio->bi_rw & REQ_FLUSH)) { |
808 | md_barrier_request(mddev, bio); | 739 | md_flush_request(mddev, bio); |
809 | return 0; | 740 | return 0; |
810 | } | 741 | } |
811 | 742 | ||
@@ -889,7 +820,7 @@ static int make_request(mddev_t *mddev, struct bio * bio) | |||
889 | } | 820 | } |
890 | mirror = conf->mirrors + disk; | 821 | mirror = conf->mirrors + disk; |
891 | 822 | ||
892 | read_bio = bio_clone(bio, GFP_NOIO); | 823 | read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev); |
893 | 824 | ||
894 | r10_bio->devs[slot].bio = read_bio; | 825 | r10_bio->devs[slot].bio = read_bio; |
895 | 826 | ||
@@ -911,6 +842,8 @@ static int make_request(mddev_t *mddev, struct bio * bio) | |||
911 | * inc refcount on their rdev. Record them by setting | 842 | * inc refcount on their rdev. Record them by setting |
912 | * bios[x] to bio | 843 | * bios[x] to bio |
913 | */ | 844 | */ |
845 | plugged = mddev_check_plugged(mddev); | ||
846 | |||
914 | raid10_find_phys(conf, r10_bio); | 847 | raid10_find_phys(conf, r10_bio); |
915 | retry_write: | 848 | retry_write: |
916 | blocked_rdev = NULL; | 849 | blocked_rdev = NULL; |
@@ -949,48 +882,46 @@ static int make_request(mddev_t *mddev, struct bio * bio) | |||
949 | goto retry_write; | 882 | goto retry_write; |
950 | } | 883 | } |
951 | 884 | ||
952 | atomic_set(&r10_bio->remaining, 0); | 885 | atomic_set(&r10_bio->remaining, 1); |
886 | bitmap_startwrite(mddev->bitmap, bio->bi_sector, r10_bio->sectors, 0); | ||
953 | 887 | ||
954 | bio_list_init(&bl); | ||
955 | for (i = 0; i < conf->copies; i++) { | 888 | for (i = 0; i < conf->copies; i++) { |
956 | struct bio *mbio; | 889 | struct bio *mbio; |
957 | int d = r10_bio->devs[i].devnum; | 890 | int d = r10_bio->devs[i].devnum; |
958 | if (!r10_bio->devs[i].bio) | 891 | if (!r10_bio->devs[i].bio) |
959 | continue; | 892 | continue; |
960 | 893 | ||
961 | mbio = bio_clone(bio, GFP_NOIO); | 894 | mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); |
962 | r10_bio->devs[i].bio = mbio; | 895 | r10_bio->devs[i].bio = mbio; |
963 | 896 | ||
964 | mbio->bi_sector = r10_bio->devs[i].addr+ | 897 | mbio->bi_sector = r10_bio->devs[i].addr+ |
965 | conf->mirrors[d].rdev->data_offset; | 898 | conf->mirrors[d].rdev->data_offset; |
966 | mbio->bi_bdev = conf->mirrors[d].rdev->bdev; | 899 | mbio->bi_bdev = conf->mirrors[d].rdev->bdev; |
967 | mbio->bi_end_io = raid10_end_write_request; | 900 | mbio->bi_end_io = raid10_end_write_request; |
968 | mbio->bi_rw = WRITE | do_sync; | 901 | mbio->bi_rw = WRITE | do_sync | do_fua; |
969 | mbio->bi_private = r10_bio; | 902 | mbio->bi_private = r10_bio; |
970 | 903 | ||
971 | atomic_inc(&r10_bio->remaining); | 904 | atomic_inc(&r10_bio->remaining); |
972 | bio_list_add(&bl, mbio); | 905 | spin_lock_irqsave(&conf->device_lock, flags); |
906 | bio_list_add(&conf->pending_bio_list, mbio); | ||
907 | spin_unlock_irqrestore(&conf->device_lock, flags); | ||
973 | } | 908 | } |
974 | 909 | ||
975 | if (unlikely(!atomic_read(&r10_bio->remaining))) { | 910 | if (atomic_dec_and_test(&r10_bio->remaining)) { |
976 | /* the array is dead */ | 911 | /* This matches the end of raid10_end_write_request() */ |
912 | bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector, | ||
913 | r10_bio->sectors, | ||
914 | !test_bit(R10BIO_Degraded, &r10_bio->state), | ||
915 | 0); | ||
977 | md_write_end(mddev); | 916 | md_write_end(mddev); |
978 | raid_end_bio_io(r10_bio); | 917 | raid_end_bio_io(r10_bio); |
979 | return 0; | ||
980 | } | 918 | } |
981 | 919 | ||
982 | bitmap_startwrite(mddev->bitmap, bio->bi_sector, r10_bio->sectors, 0); | ||
983 | spin_lock_irqsave(&conf->device_lock, flags); | ||
984 | bio_list_merge(&conf->pending_bio_list, &bl); | ||
985 | blk_plug_device(mddev->queue); | ||
986 | spin_unlock_irqrestore(&conf->device_lock, flags); | ||
987 | |||
988 | /* In case raid10d snuck in to freeze_array */ | 920 | /* In case raid10d snuck in to freeze_array */ |
989 | wake_up(&conf->wait_barrier); | 921 | wake_up(&conf->wait_barrier); |
990 | 922 | ||
991 | if (do_sync) | 923 | if (do_sync || !mddev->bitmap || !plugged) |
992 | md_wakeup_thread(mddev->thread); | 924 | md_wakeup_thread(mddev->thread); |
993 | |||
994 | return 0; | 925 | return 0; |
995 | } | 926 | } |
996 | 927 | ||
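Editor's note on the write fan-out hunk above: r10_bio->remaining now starts at 1 rather than 0, so the submitting context holds a reference of its own while it queues the per-device clones and drops it afterwards with atomic_dec_and_test(). Whoever drops the count to zero, submitter or last completing clone, finishes the master request, which removes the window where all clones could complete before queuing was done. A userspace model of the pattern, with names invented for the sketch:

/*
 * Hedged sketch (userspace model): the "remaining starts at 1" pattern.
 */
#include <stdio.h>

struct master {
    int remaining;
    int finished;
};

static void put_ref(struct master *m)
{
    if (--m->remaining == 0 && !m->finished) {
        m->finished = 1;
        printf("master request completed\n");
    }
}

int main(void)
{
    struct master m = { .remaining = 1, .finished = 0 };  /* submitter's ref */
    int clones = 0;

    for (int copy = 0; copy < 2; copy++) {
        m.remaining++;                   /* one ref per queued clone */
        clones++;
    }
    printf("queued %d clones, remaining=%d\n", clones, m.remaining);

    put_ref(&m);   /* submitter drops its reference */
    put_ref(&m);   /* first clone completes */
    put_ref(&m);   /* last clone completes: master done */
    return 0;
}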
@@ -1051,8 +982,9 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) | |||
1051 | } | 982 | } |
1052 | set_bit(Faulty, &rdev->flags); | 983 | set_bit(Faulty, &rdev->flags); |
1053 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | 984 | set_bit(MD_CHANGE_DEVS, &mddev->flags); |
1054 | printk(KERN_ALERT "md/raid10:%s: Disk failure on %s, disabling device.\n" | 985 | printk(KERN_ALERT |
1055 | KERN_ALERT "md/raid10:%s: Operation continuing on %d devices.\n", | 986 | "md/raid10:%s: Disk failure on %s, disabling device.\n" |
987 | "md/raid10:%s: Operation continuing on %d devices.\n", | ||
1056 | mdname(mddev), bdevname(rdev->bdev, b), | 988 | mdname(mddev), bdevname(rdev->bdev, b), |
1057 | mdname(mddev), conf->raid_disks - mddev->degraded); | 989 | mdname(mddev), conf->raid_disks - mddev->degraded); |
1058 | } | 990 | } |
@@ -1229,7 +1161,7 @@ static int raid10_remove_disk(mddev_t *mddev, int number) | |||
1229 | p->rdev = rdev; | 1161 | p->rdev = rdev; |
1230 | goto abort; | 1162 | goto abort; |
1231 | } | 1163 | } |
1232 | md_integrity_register(mddev); | 1164 | err = md_integrity_register(mddev); |
1233 | } | 1165 | } |
1234 | abort: | 1166 | abort: |
1235 | 1167 | ||
@@ -1505,40 +1437,33 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) | |||
1505 | int max_read_errors = atomic_read(&mddev->max_corr_read_errors); | 1437 | int max_read_errors = atomic_read(&mddev->max_corr_read_errors); |
1506 | int d = r10_bio->devs[r10_bio->read_slot].devnum; | 1438 | int d = r10_bio->devs[r10_bio->read_slot].devnum; |
1507 | 1439 | ||
1508 | rcu_read_lock(); | 1440 | /* still own a reference to this rdev, so it cannot |
1509 | rdev = rcu_dereference(conf->mirrors[d].rdev); | 1441 | * have been cleared recently. |
1510 | if (rdev) { /* If rdev is not NULL */ | 1442 | */ |
1511 | char b[BDEVNAME_SIZE]; | 1443 | rdev = conf->mirrors[d].rdev; |
1512 | int cur_read_error_count = 0; | ||
1513 | 1444 | ||
1514 | bdevname(rdev->bdev, b); | 1445 | if (test_bit(Faulty, &rdev->flags)) |
1446 | /* drive has already been failed, just ignore any | ||
1447 | more fix_read_error() attempts */ | ||
1448 | return; | ||
1515 | 1449 | ||
1516 | if (test_bit(Faulty, &rdev->flags)) { | 1450 | check_decay_read_errors(mddev, rdev); |
1517 | rcu_read_unlock(); | 1451 | atomic_inc(&rdev->read_errors); |
1518 | /* drive has already been failed, just ignore any | 1452 | if (atomic_read(&rdev->read_errors) > max_read_errors) { |
1519 | more fix_read_error() attempts */ | 1453 | char b[BDEVNAME_SIZE]; |
1520 | return; | 1454 | bdevname(rdev->bdev, b); |
1521 | } | ||
1522 | 1455 | ||
1523 | check_decay_read_errors(mddev, rdev); | 1456 | printk(KERN_NOTICE |
1524 | atomic_inc(&rdev->read_errors); | 1457 | "md/raid10:%s: %s: Raid device exceeded " |
1525 | cur_read_error_count = atomic_read(&rdev->read_errors); | 1458 | "read_error threshold [cur %d:max %d]\n", |
1526 | if (cur_read_error_count > max_read_errors) { | 1459 | mdname(mddev), b, |
1527 | rcu_read_unlock(); | 1460 | atomic_read(&rdev->read_errors), max_read_errors); |
1528 | printk(KERN_NOTICE | 1461 | printk(KERN_NOTICE |
1529 | "md/raid10:%s: %s: Raid device exceeded " | 1462 | "md/raid10:%s: %s: Failing raid device\n", |
1530 | "read_error threshold " | 1463 | mdname(mddev), b); |
1531 | "[cur %d:max %d]\n", | 1464 | md_error(mddev, conf->mirrors[d].rdev); |
1532 | mdname(mddev), | 1465 | return; |
1533 | b, cur_read_error_count, max_read_errors); | ||
1534 | printk(KERN_NOTICE | ||
1535 | "md/raid10:%s: %s: Failing raid " | ||
1536 | "device\n", mdname(mddev), b); | ||
1537 | md_error(mddev, conf->mirrors[d].rdev); | ||
1538 | return; | ||
1539 | } | ||
1540 | } | 1466 | } |
1541 | rcu_read_unlock(); | ||
1542 | 1467 | ||
1543 | while(sectors) { | 1468 | while(sectors) { |
1544 | int s = sectors; | 1469 | int s = sectors; |
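Editor's note on the restructured error path above: each correctable read error bumps a per-device counter that also decays over time (check_decay_read_errors), and the device is failed outright once the counter passes max_corr_read_errors. The userspace model below illustrates only that bookkeeping; the halve-per-period decay is an illustrative assumption, not a statement of the kernel's exact policy, and the names are invented for the sketch.

/*
 * Hedged sketch (userspace model): decaying read-error counter with a
 * hard failure threshold.
 */
#include <stdio.h>

struct dev_errs {
    int read_errors;
    long last_seen;     /* "time" of the previous error, in periods */
};

static int note_read_error(struct dev_errs *d, long now, int max_errors)
{
    /* decay: halve once per elapsed period (assumed policy) */
    for (long t = d->last_seen; t < now; t++)
        d->read_errors /= 2;
    d->last_seen = now;

    d->read_errors++;
    if (d->read_errors > max_errors) {
        printf("exceeded read_error threshold [cur %d:max %d], failing device\n",
               d->read_errors, max_errors);
        return -1;
    }
    printf("read error corrected, count now %d\n", d->read_errors);
    return 0;
}

int main(void)
{
    struct dev_errs d = { 0, 0 };

    for (int i = 0; i < 4; i++)
        note_read_error(&d, 0, 3);   /* a burst of errors trips the limit */
    return 0;
}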
@@ -1557,11 +1482,11 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) | |||
1557 | test_bit(In_sync, &rdev->flags)) { | 1482 | test_bit(In_sync, &rdev->flags)) { |
1558 | atomic_inc(&rdev->nr_pending); | 1483 | atomic_inc(&rdev->nr_pending); |
1559 | rcu_read_unlock(); | 1484 | rcu_read_unlock(); |
1560 | success = sync_page_io(rdev->bdev, | 1485 | success = sync_page_io(rdev, |
1561 | r10_bio->devs[sl].addr + | 1486 | r10_bio->devs[sl].addr + |
1562 | sect + rdev->data_offset, | 1487 | sect, |
1563 | s<<9, | 1488 | s<<9, |
1564 | conf->tmppage, READ); | 1489 | conf->tmppage, READ, false); |
1565 | rdev_dec_pending(rdev, mddev); | 1490 | rdev_dec_pending(rdev, mddev); |
1566 | rcu_read_lock(); | 1491 | rcu_read_lock(); |
1567 | if (success) | 1492 | if (success) |
@@ -1596,10 +1521,10 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) | |||
1596 | atomic_inc(&rdev->nr_pending); | 1521 | atomic_inc(&rdev->nr_pending); |
1597 | rcu_read_unlock(); | 1522 | rcu_read_unlock(); |
1598 | atomic_add(s, &rdev->corrected_errors); | 1523 | atomic_add(s, &rdev->corrected_errors); |
1599 | if (sync_page_io(rdev->bdev, | 1524 | if (sync_page_io(rdev, |
1600 | r10_bio->devs[sl].addr + | 1525 | r10_bio->devs[sl].addr + |
1601 | sect + rdev->data_offset, | 1526 | sect, |
1602 | s<<9, conf->tmppage, WRITE) | 1527 | s<<9, conf->tmppage, WRITE, false) |
1603 | == 0) { | 1528 | == 0) { |
1604 | /* Well, this device is dead */ | 1529 | /* Well, this device is dead */ |
1605 | printk(KERN_NOTICE | 1530 | printk(KERN_NOTICE |
@@ -1607,8 +1532,8 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) | |||
1607 | "write failed" | 1532 | "write failed" |
1608 | " (%d sectors at %llu on %s)\n", | 1533 | " (%d sectors at %llu on %s)\n", |
1609 | mdname(mddev), s, | 1534 | mdname(mddev), s, |
1610 | (unsigned long long)(sect+ | 1535 | (unsigned long long)( |
1611 | rdev->data_offset), | 1536 | sect + rdev->data_offset), |
1612 | bdevname(rdev->bdev, b)); | 1537 | bdevname(rdev->bdev, b)); |
1613 | printk(KERN_NOTICE "md/raid10:%s: %s: failing " | 1538 | printk(KERN_NOTICE "md/raid10:%s: %s: failing " |
1614 | "drive\n", | 1539 | "drive\n", |
@@ -1633,19 +1558,19 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) | |||
1633 | char b[BDEVNAME_SIZE]; | 1558 | char b[BDEVNAME_SIZE]; |
1634 | atomic_inc(&rdev->nr_pending); | 1559 | atomic_inc(&rdev->nr_pending); |
1635 | rcu_read_unlock(); | 1560 | rcu_read_unlock(); |
1636 | if (sync_page_io(rdev->bdev, | 1561 | if (sync_page_io(rdev, |
1637 | r10_bio->devs[sl].addr + | 1562 | r10_bio->devs[sl].addr + |
1638 | sect + rdev->data_offset, | 1563 | sect, |
1639 | s<<9, conf->tmppage, | 1564 | s<<9, conf->tmppage, |
1640 | READ) == 0) { | 1565 | READ, false) == 0) { |
1641 | /* Well, this device is dead */ | 1566 | /* Well, this device is dead */ |
1642 | printk(KERN_NOTICE | 1567 | printk(KERN_NOTICE |
1643 | "md/raid10:%s: unable to read back " | 1568 | "md/raid10:%s: unable to read back " |
1644 | "corrected sectors" | 1569 | "corrected sectors" |
1645 | " (%d sectors at %llu on %s)\n", | 1570 | " (%d sectors at %llu on %s)\n", |
1646 | mdname(mddev), s, | 1571 | mdname(mddev), s, |
1647 | (unsigned long long)(sect+ | 1572 | (unsigned long long)( |
1648 | rdev->data_offset), | 1573 | sect + rdev->data_offset), |
1649 | bdevname(rdev->bdev, b)); | 1574 | bdevname(rdev->bdev, b)); |
1650 | printk(KERN_NOTICE "md/raid10:%s: %s: failing drive\n", | 1575 | printk(KERN_NOTICE "md/raid10:%s: %s: failing drive\n", |
1651 | mdname(mddev), | 1576 | mdname(mddev), |
@@ -1657,8 +1582,8 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) | |||
1657 | "md/raid10:%s: read error corrected" | 1582 | "md/raid10:%s: read error corrected" |
1658 | " (%d sectors at %llu on %s)\n", | 1583 | " (%d sectors at %llu on %s)\n", |
1659 | mdname(mddev), s, | 1584 | mdname(mddev), s, |
1660 | (unsigned long long)(sect+ | 1585 | (unsigned long long)( |
1661 | rdev->data_offset), | 1586 | sect + rdev->data_offset), |
1662 | bdevname(rdev->bdev, b)); | 1587 | bdevname(rdev->bdev, b)); |
1663 | } | 1588 | } |
1664 | 1589 | ||
@@ -1680,15 +1605,16 @@ static void raid10d(mddev_t *mddev) | |||
1680 | unsigned long flags; | 1605 | unsigned long flags; |
1681 | conf_t *conf = mddev->private; | 1606 | conf_t *conf = mddev->private; |
1682 | struct list_head *head = &conf->retry_list; | 1607 | struct list_head *head = &conf->retry_list; |
1683 | int unplug=0; | ||
1684 | mdk_rdev_t *rdev; | 1608 | mdk_rdev_t *rdev; |
1609 | struct blk_plug plug; | ||
1685 | 1610 | ||
1686 | md_check_recovery(mddev); | 1611 | md_check_recovery(mddev); |
1687 | 1612 | ||
1613 | blk_start_plug(&plug); | ||
1688 | for (;;) { | 1614 | for (;;) { |
1689 | char b[BDEVNAME_SIZE]; | 1615 | char b[BDEVNAME_SIZE]; |
1690 | 1616 | ||
1691 | unplug += flush_pending_writes(conf); | 1617 | flush_pending_writes(conf); |
1692 | 1618 | ||
1693 | spin_lock_irqsave(&conf->device_lock, flags); | 1619 | spin_lock_irqsave(&conf->device_lock, flags); |
1694 | if (list_empty(head)) { | 1620 | if (list_empty(head)) { |
@@ -1702,14 +1628,13 @@ static void raid10d(mddev_t *mddev) | |||
1702 | 1628 | ||
1703 | mddev = r10_bio->mddev; | 1629 | mddev = r10_bio->mddev; |
1704 | conf = mddev->private; | 1630 | conf = mddev->private; |
1705 | if (test_bit(R10BIO_IsSync, &r10_bio->state)) { | 1631 | if (test_bit(R10BIO_IsSync, &r10_bio->state)) |
1706 | sync_request_write(mddev, r10_bio); | 1632 | sync_request_write(mddev, r10_bio); |
1707 | unplug = 1; | 1633 | else if (test_bit(R10BIO_IsRecover, &r10_bio->state)) |
1708 | } else if (test_bit(R10BIO_IsRecover, &r10_bio->state)) { | ||
1709 | recovery_request_write(mddev, r10_bio); | 1634 | recovery_request_write(mddev, r10_bio); |
1710 | unplug = 1; | 1635 | else { |
1711 | } else { | 1636 | int slot = r10_bio->read_slot; |
1712 | int mirror; | 1637 | int mirror = r10_bio->devs[slot].devnum; |
1713 | /* we got a read error. Maybe the drive is bad. Maybe just | 1638 | /* we got a read error. Maybe the drive is bad. Maybe just |
1714 | * the block and we can fix it. | 1639 | * the block and we can fix it. |
1715 | * We freeze all other IO, and try reading the block from | 1640 | * We freeze all other IO, and try reading the block from |
@@ -1723,9 +1648,10 @@ static void raid10d(mddev_t *mddev) | |||
1723 | fix_read_error(conf, mddev, r10_bio); | 1648 | fix_read_error(conf, mddev, r10_bio); |
1724 | unfreeze_array(conf); | 1649 | unfreeze_array(conf); |
1725 | } | 1650 | } |
1651 | rdev_dec_pending(conf->mirrors[mirror].rdev, mddev); | ||
1726 | 1652 | ||
1727 | bio = r10_bio->devs[r10_bio->read_slot].bio; | 1653 | bio = r10_bio->devs[slot].bio; |
1728 | r10_bio->devs[r10_bio->read_slot].bio = | 1654 | r10_bio->devs[slot].bio = |
1729 | mddev->ro ? IO_BLOCKED : NULL; | 1655 | mddev->ro ? IO_BLOCKED : NULL; |
1730 | mirror = read_balance(conf, r10_bio); | 1656 | mirror = read_balance(conf, r10_bio); |
1731 | if (mirror == -1) { | 1657 | if (mirror == -1) { |
@@ -1739,6 +1665,7 @@ static void raid10d(mddev_t *mddev) | |||
1739 | } else { | 1665 | } else { |
1740 | const unsigned long do_sync = (r10_bio->master_bio->bi_rw & REQ_SYNC); | 1666 | const unsigned long do_sync = (r10_bio->master_bio->bi_rw & REQ_SYNC); |
1741 | bio_put(bio); | 1667 | bio_put(bio); |
1668 | slot = r10_bio->read_slot; | ||
1742 | rdev = conf->mirrors[mirror].rdev; | 1669 | rdev = conf->mirrors[mirror].rdev; |
1743 | if (printk_ratelimit()) | 1670 | if (printk_ratelimit()) |
1744 | printk(KERN_ERR "md/raid10:%s: %s: redirecting sector %llu to" | 1671 | printk(KERN_ERR "md/raid10:%s: %s: redirecting sector %llu to" |
@@ -1746,22 +1673,21 @@ static void raid10d(mddev_t *mddev) | |||
1746 | mdname(mddev), | 1673 | mdname(mddev), |
1747 | bdevname(rdev->bdev,b), | 1674 | bdevname(rdev->bdev,b), |
1748 | (unsigned long long)r10_bio->sector); | 1675 | (unsigned long long)r10_bio->sector); |
1749 | bio = bio_clone(r10_bio->master_bio, GFP_NOIO); | 1676 | bio = bio_clone_mddev(r10_bio->master_bio, |
1750 | r10_bio->devs[r10_bio->read_slot].bio = bio; | 1677 | GFP_NOIO, mddev); |
1751 | bio->bi_sector = r10_bio->devs[r10_bio->read_slot].addr | 1678 | r10_bio->devs[slot].bio = bio; |
1679 | bio->bi_sector = r10_bio->devs[slot].addr | ||
1752 | + rdev->data_offset; | 1680 | + rdev->data_offset; |
1753 | bio->bi_bdev = rdev->bdev; | 1681 | bio->bi_bdev = rdev->bdev; |
1754 | bio->bi_rw = READ | do_sync; | 1682 | bio->bi_rw = READ | do_sync; |
1755 | bio->bi_private = r10_bio; | 1683 | bio->bi_private = r10_bio; |
1756 | bio->bi_end_io = raid10_end_read_request; | 1684 | bio->bi_end_io = raid10_end_read_request; |
1757 | unplug = 1; | ||
1758 | generic_make_request(bio); | 1685 | generic_make_request(bio); |
1759 | } | 1686 | } |
1760 | } | 1687 | } |
1761 | cond_resched(); | 1688 | cond_resched(); |
1762 | } | 1689 | } |
1763 | if (unplug) | 1690 | blk_finish_plug(&plug); |
1764 | unplug_slaves(mddev); | ||
1765 | } | 1691 | } |
1766 | 1692 | ||
1767 | 1693 | ||
@@ -1810,16 +1736,16 @@ static int init_resync(conf_t *conf) | |||
1810 | * | 1736 | * |
1811 | */ | 1737 | */ |
1812 | 1738 | ||
1813 | static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster) | 1739 | static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, |
1740 | int *skipped, int go_faster) | ||
1814 | { | 1741 | { |
1815 | conf_t *conf = mddev->private; | 1742 | conf_t *conf = mddev->private; |
1816 | r10bio_t *r10_bio; | 1743 | r10bio_t *r10_bio; |
1817 | struct bio *biolist = NULL, *bio; | 1744 | struct bio *biolist = NULL, *bio; |
1818 | sector_t max_sector, nr_sectors; | 1745 | sector_t max_sector, nr_sectors; |
1819 | int disk; | ||
1820 | int i; | 1746 | int i; |
1821 | int max_sync; | 1747 | int max_sync; |
1822 | int sync_blocks; | 1748 | sector_t sync_blocks; |
1823 | 1749 | ||
1824 | sector_t sectors_skipped = 0; | 1750 | sector_t sectors_skipped = 0; |
1825 | int chunks_skipped = 0; | 1751 | int chunks_skipped = 0; |
@@ -1905,108 +1831,114 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i | |||
1905 | int j, k; | 1831 | int j, k; |
1906 | r10_bio = NULL; | 1832 | r10_bio = NULL; |
1907 | 1833 | ||
1908 | for (i=0 ; i<conf->raid_disks; i++) | 1834 | for (i=0 ; i<conf->raid_disks; i++) { |
1909 | if (conf->mirrors[i].rdev && | 1835 | int still_degraded; |
1910 | !test_bit(In_sync, &conf->mirrors[i].rdev->flags)) { | 1836 | r10bio_t *rb2; |
1911 | int still_degraded = 0; | 1837 | sector_t sect; |
1912 | /* want to reconstruct this device */ | 1838 | int must_sync; |
1913 | r10bio_t *rb2 = r10_bio; | ||
1914 | sector_t sect = raid10_find_virt(conf, sector_nr, i); | ||
1915 | int must_sync; | ||
1916 | /* Unless we are doing a full sync, we only need | ||
1917 | * to recover the block if it is set in the bitmap | ||
1918 | */ | ||
1919 | must_sync = bitmap_start_sync(mddev->bitmap, sect, | ||
1920 | &sync_blocks, 1); | ||
1921 | if (sync_blocks < max_sync) | ||
1922 | max_sync = sync_blocks; | ||
1923 | if (!must_sync && | ||
1924 | !conf->fullsync) { | ||
1925 | /* yep, skip the sync_blocks here, but don't assume | ||
1926 | * that there will never be anything to do here | ||
1927 | */ | ||
1928 | chunks_skipped = -1; | ||
1929 | continue; | ||
1930 | } | ||
1931 | 1839 | ||
1932 | r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO); | 1840 | if (conf->mirrors[i].rdev == NULL || |
1933 | raise_barrier(conf, rb2 != NULL); | 1841 | test_bit(In_sync, &conf->mirrors[i].rdev->flags)) |
1934 | atomic_set(&r10_bio->remaining, 0); | 1842 | continue; |
1935 | 1843 | ||
1936 | r10_bio->master_bio = (struct bio*)rb2; | 1844 | still_degraded = 0; |
1937 | if (rb2) | 1845 | /* want to reconstruct this device */ |
1938 | atomic_inc(&rb2->remaining); | 1846 | rb2 = r10_bio; |
1939 | r10_bio->mddev = mddev; | 1847 | sect = raid10_find_virt(conf, sector_nr, i); |
1940 | set_bit(R10BIO_IsRecover, &r10_bio->state); | 1848 | /* Unless we are doing a full sync, we only need |
1941 | r10_bio->sector = sect; | 1849 | * to recover the block if it is set in the bitmap |
1850 | */ | ||
1851 | must_sync = bitmap_start_sync(mddev->bitmap, sect, | ||
1852 | &sync_blocks, 1); | ||
1853 | if (sync_blocks < max_sync) | ||
1854 | max_sync = sync_blocks; | ||
1855 | if (!must_sync && | ||
1856 | !conf->fullsync) { | ||
1857 | /* yep, skip the sync_blocks here, but don't assume | ||
1858 | * that there will never be anything to do here | ||
1859 | */ | ||
1860 | chunks_skipped = -1; | ||
1861 | continue; | ||
1862 | } | ||
1942 | 1863 | ||
1943 | raid10_find_phys(conf, r10_bio); | 1864 | r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO); |
1865 | raise_barrier(conf, rb2 != NULL); | ||
1866 | atomic_set(&r10_bio->remaining, 0); | ||
1944 | 1867 | ||
1945 | /* Need to check if the array will still be | 1868 | r10_bio->master_bio = (struct bio*)rb2; |
1946 | * degraded | 1869 | if (rb2) |
1947 | */ | 1870 | atomic_inc(&rb2->remaining); |
1948 | for (j=0; j<conf->raid_disks; j++) | 1871 | r10_bio->mddev = mddev; |
1949 | if (conf->mirrors[j].rdev == NULL || | 1872 | set_bit(R10BIO_IsRecover, &r10_bio->state); |
1950 | test_bit(Faulty, &conf->mirrors[j].rdev->flags)) { | 1873 | r10_bio->sector = sect; |
1951 | still_degraded = 1; | ||
1952 | break; | ||
1953 | } | ||
1954 | |||
1955 | must_sync = bitmap_start_sync(mddev->bitmap, sect, | ||
1956 | &sync_blocks, still_degraded); | ||
1957 | |||
1958 | for (j=0; j<conf->copies;j++) { | ||
1959 | int d = r10_bio->devs[j].devnum; | ||
1960 | if (conf->mirrors[d].rdev && | ||
1961 | test_bit(In_sync, &conf->mirrors[d].rdev->flags)) { | ||
1962 | /* This is where we read from */ | ||
1963 | bio = r10_bio->devs[0].bio; | ||
1964 | bio->bi_next = biolist; | ||
1965 | biolist = bio; | ||
1966 | bio->bi_private = r10_bio; | ||
1967 | bio->bi_end_io = end_sync_read; | ||
1968 | bio->bi_rw = READ; | ||
1969 | bio->bi_sector = r10_bio->devs[j].addr + | ||
1970 | conf->mirrors[d].rdev->data_offset; | ||
1971 | bio->bi_bdev = conf->mirrors[d].rdev->bdev; | ||
1972 | atomic_inc(&conf->mirrors[d].rdev->nr_pending); | ||
1973 | atomic_inc(&r10_bio->remaining); | ||
1974 | /* and we write to 'i' */ | ||
1975 | |||
1976 | for (k=0; k<conf->copies; k++) | ||
1977 | if (r10_bio->devs[k].devnum == i) | ||
1978 | break; | ||
1979 | BUG_ON(k == conf->copies); | ||
1980 | bio = r10_bio->devs[1].bio; | ||
1981 | bio->bi_next = biolist; | ||
1982 | biolist = bio; | ||
1983 | bio->bi_private = r10_bio; | ||
1984 | bio->bi_end_io = end_sync_write; | ||
1985 | bio->bi_rw = WRITE; | ||
1986 | bio->bi_sector = r10_bio->devs[k].addr + | ||
1987 | conf->mirrors[i].rdev->data_offset; | ||
1988 | bio->bi_bdev = conf->mirrors[i].rdev->bdev; | ||
1989 | |||
1990 | r10_bio->devs[0].devnum = d; | ||
1991 | r10_bio->devs[1].devnum = i; | ||
1992 | 1874 | ||
1993 | break; | 1875 | raid10_find_phys(conf, r10_bio); |
1994 | } | 1876 | |
1995 | } | 1877 | /* Need to check if the array will still be |
1996 | if (j == conf->copies) { | 1878 | * degraded |
1997 | /* Cannot recover, so abort the recovery */ | 1879 | */ |
1998 | put_buf(r10_bio); | 1880 | for (j=0; j<conf->raid_disks; j++) |
1999 | if (rb2) | 1881 | if (conf->mirrors[j].rdev == NULL || |
2000 | atomic_dec(&rb2->remaining); | 1882 | test_bit(Faulty, &conf->mirrors[j].rdev->flags)) { |
2001 | r10_bio = rb2; | 1883 | still_degraded = 1; |
2002 | if (!test_and_set_bit(MD_RECOVERY_INTR, | ||
2003 | &mddev->recovery)) | ||
2004 | printk(KERN_INFO "md/raid10:%s: insufficient " | ||
2005 | "working devices for recovery.\n", | ||
2006 | mdname(mddev)); | ||
2007 | break; | 1884 | break; |
2008 | } | 1885 | } |
1886 | |||
1887 | must_sync = bitmap_start_sync(mddev->bitmap, sect, | ||
1888 | &sync_blocks, still_degraded); | ||
1889 | |||
1890 | for (j=0; j<conf->copies;j++) { | ||
1891 | int d = r10_bio->devs[j].devnum; | ||
1892 | if (!conf->mirrors[d].rdev || | ||
1893 | !test_bit(In_sync, &conf->mirrors[d].rdev->flags)) | ||
1894 | continue; | ||
1895 | /* This is where we read from */ | ||
1896 | bio = r10_bio->devs[0].bio; | ||
1897 | bio->bi_next = biolist; | ||
1898 | biolist = bio; | ||
1899 | bio->bi_private = r10_bio; | ||
1900 | bio->bi_end_io = end_sync_read; | ||
1901 | bio->bi_rw = READ; | ||
1902 | bio->bi_sector = r10_bio->devs[j].addr + | ||
1903 | conf->mirrors[d].rdev->data_offset; | ||
1904 | bio->bi_bdev = conf->mirrors[d].rdev->bdev; | ||
1905 | atomic_inc(&conf->mirrors[d].rdev->nr_pending); | ||
1906 | atomic_inc(&r10_bio->remaining); | ||
1907 | /* and we write to 'i' */ | ||
1908 | |||
1909 | for (k=0; k<conf->copies; k++) | ||
1910 | if (r10_bio->devs[k].devnum == i) | ||
1911 | break; | ||
1912 | BUG_ON(k == conf->copies); | ||
1913 | bio = r10_bio->devs[1].bio; | ||
1914 | bio->bi_next = biolist; | ||
1915 | biolist = bio; | ||
1916 | bio->bi_private = r10_bio; | ||
1917 | bio->bi_end_io = end_sync_write; | ||
1918 | bio->bi_rw = WRITE; | ||
1919 | bio->bi_sector = r10_bio->devs[k].addr + | ||
1920 | conf->mirrors[i].rdev->data_offset; | ||
1921 | bio->bi_bdev = conf->mirrors[i].rdev->bdev; | ||
1922 | |||
1923 | r10_bio->devs[0].devnum = d; | ||
1924 | r10_bio->devs[1].devnum = i; | ||
1925 | |||
1926 | break; | ||
1927 | } | ||
1928 | if (j == conf->copies) { | ||
1929 | /* Cannot recover, so abort the recovery */ | ||
1930 | put_buf(r10_bio); | ||
1931 | if (rb2) | ||
1932 | atomic_dec(&rb2->remaining); | ||
1933 | r10_bio = rb2; | ||
1934 | if (!test_and_set_bit(MD_RECOVERY_INTR, | ||
1935 | &mddev->recovery)) | ||
1936 | printk(KERN_INFO "md/raid10:%s: insufficient " | ||
1937 | "working devices for recovery.\n", | ||
1938 | mdname(mddev)); | ||
1939 | break; | ||
2009 | } | 1940 | } |
1941 | } | ||
2010 | if (biolist == NULL) { | 1942 | if (biolist == NULL) { |
2011 | while (r10_bio) { | 1943 | while (r10_bio) { |
2012 | r10bio_t *rb2 = r10_bio; | 1944 | r10bio_t *rb2 = r10_bio; |
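Editor's note on the restructured recovery loop above: for a device being rebuilt, the code looks through the copies of the virtual sector, reads from a copy that lives on an in-sync device, and writes to the copy that lives on the device under reconstruction; if no in-sync source exists, recovery is aborted. A trivial userspace model of that pairing, with names invented for the sketch:

/*
 * Hedged sketch (userspace model): pick the read source and write
 * target for one recovered sector.
 */
#include <stdio.h>

struct copy {
    int devnum;
    int in_sync;
};

int main(void)
{
    const int rebuilding = 2;                    /* device being recovered */
    struct copy copies[2] = {
        { .devnum = 0, .in_sync = 1 },
        { .devnum = 2, .in_sync = 0 },           /* the new/empty device */
    };
    int read_from = -1, write_to = -1;

    for (int j = 0; j < 2; j++) {
        if (read_from < 0 && copies[j].in_sync)
            read_from = copies[j].devnum;        /* where we read from */
        if (copies[j].devnum == rebuilding)
            write_to = copies[j].devnum;         /* and we write to 'i' */
    }

    if (read_from < 0)
        printf("insufficient working devices for recovery\n");
    else
        printf("recover by reading dev %d and writing dev %d\n",
               read_from, write_to);
    return 0;
}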
@@ -2024,7 +1956,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i | |||
2024 | 1956 | ||
2025 | if (!bitmap_start_sync(mddev->bitmap, sector_nr, | 1957 | if (!bitmap_start_sync(mddev->bitmap, sector_nr, |
2026 | &sync_blocks, mddev->degraded) && | 1958 | &sync_blocks, mddev->degraded) && |
2027 | !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { | 1959 | !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, |
1960 | &mddev->recovery)) { | ||
2028 | /* We can skip this block */ | 1961 | /* We can skip this block */ |
2029 | *skipped = 1; | 1962 | *skipped = 1; |
2030 | return sync_blocks + sectors_skipped; | 1963 | return sync_blocks + sectors_skipped; |
@@ -2069,7 +2002,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i | |||
2069 | for (i=0; i<conf->copies; i++) { | 2002 | for (i=0; i<conf->copies; i++) { |
2070 | int d = r10_bio->devs[i].devnum; | 2003 | int d = r10_bio->devs[i].devnum; |
2071 | if (r10_bio->devs[i].bio->bi_end_io) | 2004 | if (r10_bio->devs[i].bio->bi_end_io) |
2072 | rdev_dec_pending(conf->mirrors[d].rdev, mddev); | 2005 | rdev_dec_pending(conf->mirrors[d].rdev, |
2006 | mddev); | ||
2073 | } | 2007 | } |
2074 | put_buf(r10_bio); | 2008 | put_buf(r10_bio); |
2075 | biolist = NULL; | 2009 | biolist = NULL; |
@@ -2094,26 +2028,27 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i | |||
2094 | do { | 2028 | do { |
2095 | struct page *page; | 2029 | struct page *page; |
2096 | int len = PAGE_SIZE; | 2030 | int len = PAGE_SIZE; |
2097 | disk = 0; | ||
2098 | if (sector_nr + (len>>9) > max_sector) | 2031 | if (sector_nr + (len>>9) > max_sector) |
2099 | len = (max_sector - sector_nr) << 9; | 2032 | len = (max_sector - sector_nr) << 9; |
2100 | if (len == 0) | 2033 | if (len == 0) |
2101 | break; | 2034 | break; |
2102 | for (bio= biolist ; bio ; bio=bio->bi_next) { | 2035 | for (bio= biolist ; bio ; bio=bio->bi_next) { |
2036 | struct bio *bio2; | ||
2103 | page = bio->bi_io_vec[bio->bi_vcnt].bv_page; | 2037 | page = bio->bi_io_vec[bio->bi_vcnt].bv_page; |
2104 | if (bio_add_page(bio, page, len, 0) == 0) { | 2038 | if (bio_add_page(bio, page, len, 0)) |
2105 | /* stop here */ | 2039 | continue; |
2106 | struct bio *bio2; | 2040 | |
2107 | bio->bi_io_vec[bio->bi_vcnt].bv_page = page; | 2041 | /* stop here */ |
2108 | for (bio2 = biolist; bio2 && bio2 != bio; bio2 = bio2->bi_next) { | 2042 | bio->bi_io_vec[bio->bi_vcnt].bv_page = page; |
2109 | /* remove last page from this bio */ | 2043 | for (bio2 = biolist; |
2110 | bio2->bi_vcnt--; | 2044 | bio2 && bio2 != bio; |
2111 | bio2->bi_size -= len; | 2045 | bio2 = bio2->bi_next) { |
2112 | bio2->bi_flags &= ~(1<< BIO_SEG_VALID); | 2046 | /* remove last page from this bio */ |
2113 | } | 2047 | bio2->bi_vcnt--; |
2114 | goto bio_full; | 2048 | bio2->bi_size -= len; |
2049 | bio2->bi_flags &= ~(1<< BIO_SEG_VALID); | ||
2115 | } | 2050 | } |
2116 | disk = i; | 2051 | goto bio_full; |
2117 | } | 2052 | } |
2118 | nr_sectors += len>>9; | 2053 | nr_sectors += len>>9; |
2119 | sector_nr += len>>9; | 2054 | sector_nr += len>>9; |
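Editor's note on the reworked page-filling loop above: a page is added to every bio in the resync list for this window; if any bio refuses it (bio_add_page() returning 0), the page is stripped back off the bios that already took it and the window is closed at its current size (the bio_full path). A userspace model of that all-or-nothing append, with names invented for the sketch:

/*
 * Hedged sketch (userspace model): grow all buffers in lockstep and
 * roll back the current round if any one of them is full.
 */
#include <stdio.h>

#define MAX_PAGES 4

struct fake_bio {
    int vcnt;        /* pages currently attached */
    int limit;       /* device-imposed limit */
};

static int add_page(struct fake_bio *b)
{
    if (b->vcnt >= b->limit)
        return 0;    /* refused, like bio_add_page() returning 0 */
    b->vcnt++;
    return 1;
}

int main(void)
{
    struct fake_bio bios[3] = { { 0, 4 }, { 0, 2 }, { 0, 4 } };
    int pages = 0;

    for (int round = 0; round < MAX_PAGES; round++) {
        for (int i = 0; i < 3; i++) {
            if (add_page(&bios[i]))
                continue;
            /* stop here: undo this round on the bios already grown */
            for (int j = 0; j < i; j++)
                bios[j].vcnt--;
            goto bio_full;
        }
        pages++;
    }
bio_full:
    printf("window holds %d pages; vcnts = %d %d %d\n",
           pages, bios[0].vcnt, bios[1].vcnt, bios[2].vcnt);
    return 0;
}

With limits of 4, 2 and 4 pages this stops after two rounds and leaves every bio at two pages, which mirrors the kernel loop keeping all resync bios the same length.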
@@ -2302,8 +2237,6 @@ static int run(mddev_t *mddev) | |||
2302 | if (!conf) | 2237 | if (!conf) |
2303 | goto out; | 2238 | goto out; |
2304 | 2239 | ||
2305 | mddev->queue->queue_lock = &conf->device_lock; | ||
2306 | |||
2307 | mddev->thread = conf->thread; | 2240 | mddev->thread = conf->thread; |
2308 | conf->thread = NULL; | 2241 | conf->thread = NULL; |
2309 | 2242 | ||
@@ -2374,7 +2307,6 @@ static int run(mddev_t *mddev) | |||
2374 | md_set_array_sectors(mddev, size); | 2307 | md_set_array_sectors(mddev, size); |
2375 | mddev->resync_max_sectors = size; | 2308 | mddev->resync_max_sectors = size; |
2376 | 2309 | ||
2377 | mddev->queue->unplug_fn = raid10_unplug; | ||
2378 | mddev->queue->backing_dev_info.congested_fn = raid10_congested; | 2310 | mddev->queue->backing_dev_info.congested_fn = raid10_congested; |
2379 | mddev->queue->backing_dev_info.congested_data = mddev; | 2311 | mddev->queue->backing_dev_info.congested_data = mddev; |
2380 | 2312 | ||
@@ -2392,17 +2324,20 @@ static int run(mddev_t *mddev) | |||
2392 | 2324 | ||
2393 | if (conf->near_copies < conf->raid_disks) | 2325 | if (conf->near_copies < conf->raid_disks) |
2394 | blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec); | 2326 | blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec); |
2395 | md_integrity_register(mddev); | 2327 | |
2328 | if (md_integrity_register(mddev)) | ||
2329 | goto out_free_conf; | ||
2330 | |||
2396 | return 0; | 2331 | return 0; |
2397 | 2332 | ||
2398 | out_free_conf: | 2333 | out_free_conf: |
2334 | md_unregister_thread(mddev->thread); | ||
2399 | if (conf->r10bio_pool) | 2335 | if (conf->r10bio_pool) |
2400 | mempool_destroy(conf->r10bio_pool); | 2336 | mempool_destroy(conf->r10bio_pool); |
2401 | safe_put_page(conf->tmppage); | 2337 | safe_put_page(conf->tmppage); |
2402 | kfree(conf->mirrors); | 2338 | kfree(conf->mirrors); |
2403 | kfree(conf); | 2339 | kfree(conf); |
2404 | mddev->private = NULL; | 2340 | mddev->private = NULL; |
2405 | md_unregister_thread(mddev->thread); | ||
2406 | out: | 2341 | out: |
2407 | return -EIO; | 2342 | return -EIO; |
2408 | } | 2343 | } |
@@ -2461,11 +2396,13 @@ static void *raid10_takeover_raid0(mddev_t *mddev) | |||
2461 | mddev->recovery_cp = MaxSector; | 2396 | mddev->recovery_cp = MaxSector; |
2462 | 2397 | ||
2463 | conf = setup_conf(mddev); | 2398 | conf = setup_conf(mddev); |
2464 | if (!IS_ERR(conf)) | 2399 | if (!IS_ERR(conf)) { |
2465 | list_for_each_entry(rdev, &mddev->disks, same_set) | 2400 | list_for_each_entry(rdev, &mddev->disks, same_set) |
2466 | if (rdev->raid_disk >= 0) | 2401 | if (rdev->raid_disk >= 0) |
2467 | rdev->new_raid_disk = rdev->raid_disk * 2; | 2402 | rdev->new_raid_disk = rdev->raid_disk * 2; |
2468 | 2403 | conf->barrier = 1; | |
2404 | } | ||
2405 | |||
2469 | return conf; | 2406 | return conf; |
2470 | } | 2407 | } |
2471 | 2408 | ||
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h index 2316ac2e8e21..944b1104d3b4 100644 --- a/drivers/md/raid10.h +++ b/drivers/md/raid10.h | |||
@@ -17,8 +17,8 @@ struct r10_private_data_s { | |||
17 | spinlock_t device_lock; | 17 | spinlock_t device_lock; |
18 | 18 | ||
19 | /* geometry */ | 19 | /* geometry */ |
20 | int near_copies; /* number of copies layed out raid0 style */ | 20 | int near_copies; /* number of copies laid out raid0 style */ |
21 | int far_copies; /* number of copies layed out | 21 | int far_copies; /* number of copies laid out |
22 | * at large strides across drives | 22 | * at large strides across drives |
23 | */ | 23 | */ |
24 | int far_offset; /* far_copies are offset by 1 stripe | 24 | int far_offset; /* far_copies are offset by 1 stripe |
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 69b0a169e43d..b72edf35ec54 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c | |||
@@ -27,12 +27,12 @@ | |||
27 | * | 27 | * |
28 | * We group bitmap updates into batches. Each batch has a number. | 28 | * We group bitmap updates into batches. Each batch has a number. |
29 | * We may write out several batches at once, but that isn't very important. | 29 | * We may write out several batches at once, but that isn't very important. |
30 | * conf->bm_write is the number of the last batch successfully written. | 30 | * conf->seq_write is the number of the last batch successfully written. |
31 | * conf->bm_flush is the number of the last batch that was closed to | 31 | * conf->seq_flush is the number of the last batch that was closed to |
32 | * new additions. | 32 | * new additions. |
33 | * When we discover that we will need to write to any block in a stripe | 33 | * When we discover that we will need to write to any block in a stripe |
34 | * (in add_stripe_bio) we update the in-memory bitmap and record in sh->bm_seq | 34 | * (in add_stripe_bio) we update the in-memory bitmap and record in sh->bm_seq |
35 | * the number of the batch it will be in. This is bm_flush+1. | 35 | * the number of the batch it will be in. This is seq_flush+1. |
36 | * When we are ready to do a write, if that batch hasn't been written yet, | 36 | * When we are ready to do a write, if that batch hasn't been written yet, |
37 | * we plug the array and queue the stripe for later. | 37 | * we plug the array and queue the stripe for later. |
38 | * When an unplug happens, we increment bm_flush, thus closing the current | 38 | * When an unplug happens, we increment bm_flush, thus closing the current |
@@ -129,7 +129,7 @@ static inline int raid5_dec_bi_hw_segments(struct bio *bio) | |||
129 | 129 | ||
130 | static inline void raid5_set_bi_hw_segments(struct bio *bio, unsigned int cnt) | 130 | static inline void raid5_set_bi_hw_segments(struct bio *bio, unsigned int cnt) |
131 | { | 131 | { |
132 | bio->bi_phys_segments = raid5_bi_phys_segments(bio) || (cnt << 16); | 132 | bio->bi_phys_segments = raid5_bi_phys_segments(bio) | (cnt << 16); |
133 | } | 133 | } |
134 | 134 | ||
135 | /* Find first data disk in a raid6 stripe */ | 135 | /* Find first data disk in a raid6 stripe */ |
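The one-character fix in the hunk above matters because bi_phys_segments is overloaded in raid5 as two packed 16-bit counters: with the logical || the shifted count collapses to 0 or 1 and the upper half is lost. A standalone illustration of the difference (plain C, made-up values):

    #include <stdio.h>

    int main(void)
    {
            unsigned int low = 3;   /* stand-in for raid5_bi_phys_segments(bio) */
            unsigned int cnt = 5;   /* value meant for the upper 16 bits */

            unsigned int buggy = low || (cnt << 16);  /* logical OR: just 0 or 1 */
            unsigned int fixed = low |  (cnt << 16);  /* bitwise OR: 0x50003 */

            printf("buggy=%#x fixed=%#x\n", buggy, fixed);
            return 0;
    }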
@@ -199,14 +199,12 @@ static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh) | |||
199 | BUG_ON(!list_empty(&sh->lru)); | 199 | BUG_ON(!list_empty(&sh->lru)); |
200 | BUG_ON(atomic_read(&conf->active_stripes)==0); | 200 | BUG_ON(atomic_read(&conf->active_stripes)==0); |
201 | if (test_bit(STRIPE_HANDLE, &sh->state)) { | 201 | if (test_bit(STRIPE_HANDLE, &sh->state)) { |
202 | if (test_bit(STRIPE_DELAYED, &sh->state)) { | 202 | if (test_bit(STRIPE_DELAYED, &sh->state)) |
203 | list_add_tail(&sh->lru, &conf->delayed_list); | 203 | list_add_tail(&sh->lru, &conf->delayed_list); |
204 | plugger_set_plug(&conf->plug); | 204 | else if (test_bit(STRIPE_BIT_DELAY, &sh->state) && |
205 | } else if (test_bit(STRIPE_BIT_DELAY, &sh->state) && | 205 | sh->bm_seq - conf->seq_write > 0) |
206 | sh->bm_seq - conf->seq_write > 0) { | ||
207 | list_add_tail(&sh->lru, &conf->bitmap_list); | 206 | list_add_tail(&sh->lru, &conf->bitmap_list); |
208 | plugger_set_plug(&conf->plug); | 207 | else { |
209 | } else { | ||
210 | clear_bit(STRIPE_BIT_DELAY, &sh->state); | 208 | clear_bit(STRIPE_BIT_DELAY, &sh->state); |
211 | list_add_tail(&sh->lru, &conf->handle_list); | 209 | list_add_tail(&sh->lru, &conf->handle_list); |
212 | } | 210 | } |
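The test retained in the hunk above, sh->bm_seq - conf->seq_write > 0, is the usual wraparound-tolerant way of asking "is this stripe's batch newer than the last batch written?", in the same style as the kernel's jiffies comparisons. A standalone sketch of the idiom (illustrative types, not the raid5 fields themselves):

    #include <stdio.h>

    /* "a comes after b", stated as a difference so it keeps working
     * when the counters eventually wrap around.
     */
    static int seq_after(unsigned int a, unsigned int b)
    {
            return (int)(a - b) > 0;
    }

    int main(void)
    {
            printf("%d\n", seq_after(5, 3));             /* 1: batch 5 still pending */
            printf("%d\n", seq_after(3, 5));             /* 0: batch 3 already written */
            printf("%d\n", seq_after(2, 0xfffffffeu));   /* 1: correct across the wrap */
            return 0;
    }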
@@ -433,8 +431,6 @@ static int has_failed(raid5_conf_t *conf) | |||
433 | return 0; | 431 | return 0; |
434 | } | 432 | } |
435 | 433 | ||
436 | static void unplug_slaves(mddev_t *mddev); | ||
437 | |||
438 | static struct stripe_head * | 434 | static struct stripe_head * |
439 | get_active_stripe(raid5_conf_t *conf, sector_t sector, | 435 | get_active_stripe(raid5_conf_t *conf, sector_t sector, |
440 | int previous, int noblock, int noquiesce) | 436 | int previous, int noblock, int noquiesce) |
@@ -463,8 +459,7 @@ get_active_stripe(raid5_conf_t *conf, sector_t sector, | |||
463 | < (conf->max_nr_stripes *3/4) | 459 | < (conf->max_nr_stripes *3/4) |
464 | || !conf->inactive_blocked), | 460 | || !conf->inactive_blocked), |
465 | conf->device_lock, | 461 | conf->device_lock, |
466 | md_raid5_unplug_device(conf) | 462 | ); |
467 | ); | ||
468 | conf->inactive_blocked = 0; | 463 | conf->inactive_blocked = 0; |
469 | } else | 464 | } else |
470 | init_stripe(sh, sector, previous); | 465 | init_stripe(sh, sector, previous); |
@@ -506,9 +501,12 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) | |||
506 | int rw; | 501 | int rw; |
507 | struct bio *bi; | 502 | struct bio *bi; |
508 | mdk_rdev_t *rdev; | 503 | mdk_rdev_t *rdev; |
509 | if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) | 504 | if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) { |
510 | rw = WRITE; | 505 | if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags)) |
511 | else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags)) | 506 | rw = WRITE_FUA; |
507 | else | ||
508 | rw = WRITE; | ||
509 | } else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags)) | ||
512 | rw = READ; | 510 | rw = READ; |
513 | else | 511 | else |
514 | continue; | 512 | continue; |
@@ -516,7 +514,7 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) | |||
516 | bi = &sh->dev[i].req; | 514 | bi = &sh->dev[i].req; |
517 | 515 | ||
518 | bi->bi_rw = rw; | 516 | bi->bi_rw = rw; |
519 | if (rw == WRITE) | 517 | if (rw & WRITE) |
520 | bi->bi_end_io = raid5_end_write_request; | 518 | bi->bi_end_io = raid5_end_write_request; |
521 | else | 519 | else |
522 | bi->bi_end_io = raid5_end_read_request; | 520 | bi->bi_end_io = raid5_end_read_request; |
@@ -550,13 +548,13 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) | |||
550 | bi->bi_io_vec[0].bv_offset = 0; | 548 | bi->bi_io_vec[0].bv_offset = 0; |
551 | bi->bi_size = STRIPE_SIZE; | 549 | bi->bi_size = STRIPE_SIZE; |
552 | bi->bi_next = NULL; | 550 | bi->bi_next = NULL; |
553 | if (rw == WRITE && | 551 | if ((rw & WRITE) && |
554 | test_bit(R5_ReWrite, &sh->dev[i].flags)) | 552 | test_bit(R5_ReWrite, &sh->dev[i].flags)) |
555 | atomic_add(STRIPE_SECTORS, | 553 | atomic_add(STRIPE_SECTORS, |
556 | &rdev->corrected_errors); | 554 | &rdev->corrected_errors); |
557 | generic_make_request(bi); | 555 | generic_make_request(bi); |
558 | } else { | 556 | } else { |
559 | if (rw == WRITE) | 557 | if (rw & WRITE) |
560 | set_bit(STRIPE_DEGRADED, &sh->state); | 558 | set_bit(STRIPE_DEGRADED, &sh->state); |
561 | pr_debug("skip op %ld on disc %d for sector %llu\n", | 559 | pr_debug("skip op %ld on disc %d for sector %llu\n", |
562 | bi->bi_rw, i, (unsigned long long)sh->sector); | 560 | bi->bi_rw, i, (unsigned long long)sh->sector); |
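The rw == WRITE comparisons in ops_run_io() become rw & WRITE in the hunks above because rw can now be WRITE_FUA, which is WRITE plus extra request flags; an equality test would no longer recognise such a request as a write. A standalone illustration with made-up flag values (the real REQ_* values are not reproduced here):

    #include <stdio.h>

    #define XWRITE      0x1u            /* made-up stand-ins for illustration */
    #define XFUA        0x2u
    #define XWRITE_FUA  (XWRITE | XFUA)

    int main(void)
    {
            unsigned long rw = XWRITE_FUA;

            printf("rw == WRITE -> %d\n", rw == XWRITE);        /* 0: FUA write missed */
            printf("rw & WRITE  -> %d\n", (rw & XWRITE) != 0);  /* 1: still a write */
            return 0;
    }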
@@ -587,7 +585,7 @@ async_copy_data(int frombio, struct bio *bio, struct page *page, | |||
587 | init_async_submit(&submit, flags, tx, NULL, NULL, NULL); | 585 | init_async_submit(&submit, flags, tx, NULL, NULL, NULL); |
588 | 586 | ||
589 | bio_for_each_segment(bvl, bio, i) { | 587 | bio_for_each_segment(bvl, bio, i) { |
590 | int len = bio_iovec_idx(bio, i)->bv_len; | 588 | int len = bvl->bv_len; |
591 | int clen; | 589 | int clen; |
592 | int b_offset = 0; | 590 | int b_offset = 0; |
593 | 591 | ||
@@ -603,8 +601,8 @@ async_copy_data(int frombio, struct bio *bio, struct page *page, | |||
603 | clen = len; | 601 | clen = len; |
604 | 602 | ||
605 | if (clen > 0) { | 603 | if (clen > 0) { |
606 | b_offset += bio_iovec_idx(bio, i)->bv_offset; | 604 | b_offset += bvl->bv_offset; |
607 | bio_page = bio_iovec_idx(bio, i)->bv_page; | 605 | bio_page = bvl->bv_page; |
608 | if (frombio) | 606 | if (frombio) |
609 | tx = async_memcpy(page, bio_page, page_offset, | 607 | tx = async_memcpy(page, bio_page, page_offset, |
610 | b_offset, clen, &submit); | 608 | b_offset, clen, &submit); |
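The async_copy_data() hunk above is a simplification rather than a behaviour change: bio_for_each_segment() already hands the loop the current bio_vec, so re-deriving it with bio_iovec_idx(bio, i) was redundant. A minimal sketch of the idiom as it looked in this era of the bio API (assumed helper, not from the patch):

    #include <linux/bio.h>

    /* Sum a bio's payload by walking its segments; on each iteration
     * bvl is exactly bio_iovec_idx(bio, i).
     */
    static unsigned int bio_payload_bytes(struct bio *bio)
    {
            struct bio_vec *bvl;
            unsigned int bytes = 0;
            int i;

            bio_for_each_segment(bvl, bio, i)
                    bytes += bvl->bv_len;

            return bytes;
    }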
@@ -1031,6 +1029,8 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) | |||
1031 | 1029 | ||
1032 | while (wbi && wbi->bi_sector < | 1030 | while (wbi && wbi->bi_sector < |
1033 | dev->sector + STRIPE_SECTORS) { | 1031 | dev->sector + STRIPE_SECTORS) { |
1032 | if (wbi->bi_rw & REQ_FUA) | ||
1033 | set_bit(R5_WantFUA, &dev->flags); | ||
1034 | tx = async_copy_data(1, wbi, dev->page, | 1034 | tx = async_copy_data(1, wbi, dev->page, |
1035 | dev->sector, tx); | 1035 | dev->sector, tx); |
1036 | wbi = r5_next_bio(wbi, dev->sector); | 1036 | wbi = r5_next_bio(wbi, dev->sector); |
@@ -1048,15 +1048,22 @@ static void ops_complete_reconstruct(void *stripe_head_ref) | |||
1048 | int pd_idx = sh->pd_idx; | 1048 | int pd_idx = sh->pd_idx; |
1049 | int qd_idx = sh->qd_idx; | 1049 | int qd_idx = sh->qd_idx; |
1050 | int i; | 1050 | int i; |
1051 | bool fua = false; | ||
1051 | 1052 | ||
1052 | pr_debug("%s: stripe %llu\n", __func__, | 1053 | pr_debug("%s: stripe %llu\n", __func__, |
1053 | (unsigned long long)sh->sector); | 1054 | (unsigned long long)sh->sector); |
1054 | 1055 | ||
1056 | for (i = disks; i--; ) | ||
1057 | fua |= test_bit(R5_WantFUA, &sh->dev[i].flags); | ||
1058 | |||
1055 | for (i = disks; i--; ) { | 1059 | for (i = disks; i--; ) { |
1056 | struct r5dev *dev = &sh->dev[i]; | 1060 | struct r5dev *dev = &sh->dev[i]; |
1057 | 1061 | ||
1058 | if (dev->written || i == pd_idx || i == qd_idx) | 1062 | if (dev->written || i == pd_idx || i == qd_idx) { |
1059 | set_bit(R5_UPTODATE, &dev->flags); | 1063 | set_bit(R5_UPTODATE, &dev->flags); |
1064 | if (fua) | ||
1065 | set_bit(R5_WantFUA, &dev->flags); | ||
1066 | } | ||
1060 | } | 1067 | } |
1061 | 1068 | ||
1062 | if (sh->reconstruct_state == reconstruct_state_drain_run) | 1069 | if (sh->reconstruct_state == reconstruct_state_drain_run) |
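The loop added at the top of ops_complete_reconstruct() ORs R5_WantFUA across the stripe so that, if any of the drained writes carried REQ_FUA, the freshly computed parity (and Q) blocks are written with FUA as well; otherwise the data could reach stable media while the matching parity still sat in a volatile drive cache. A trivial standalone illustration of that reduce-then-propagate step (plain C, illustrative only):

    #include <stdbool.h>
    #include <stdio.h>

    int main(void)
    {
            bool want_fua[4] = { false, true, false, false };  /* per-device flags */
            bool fua = false;

            for (int i = 0; i < 4; i++)
                    fua |= want_fua[i];     /* any FUA request taints the stripe */

            printf("write parity with FUA: %s\n", fua ? "yes" : "no");
            return 0;
    }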
@@ -1461,8 +1468,7 @@ static int resize_stripes(raid5_conf_t *conf, int newsize) | |||
1461 | wait_event_lock_irq(conf->wait_for_stripe, | 1468 | wait_event_lock_irq(conf->wait_for_stripe, |
1462 | !list_empty(&conf->inactive_list), | 1469 | !list_empty(&conf->inactive_list), |
1463 | conf->device_lock, | 1470 | conf->device_lock, |
1464 | unplug_slaves(conf->mddev) | 1471 | ); |
1465 | ); | ||
1466 | osh = get_free_stripe(conf); | 1472 | osh = get_free_stripe(conf); |
1467 | spin_unlock_irq(&conf->device_lock); | 1473 | spin_unlock_irq(&conf->device_lock); |
1468 | atomic_set(&nsh->count, 1); | 1474 | atomic_set(&nsh->count, 1); |
@@ -1694,28 +1700,25 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) | |||
1694 | raid5_conf_t *conf = mddev->private; | 1700 | raid5_conf_t *conf = mddev->private; |
1695 | pr_debug("raid456: error called\n"); | 1701 | pr_debug("raid456: error called\n"); |
1696 | 1702 | ||
1697 | if (!test_bit(Faulty, &rdev->flags)) { | 1703 | if (test_and_clear_bit(In_sync, &rdev->flags)) { |
1698 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | 1704 | unsigned long flags; |
1699 | if (test_and_clear_bit(In_sync, &rdev->flags)) { | 1705 | spin_lock_irqsave(&conf->device_lock, flags); |
1700 | unsigned long flags; | 1706 | mddev->degraded++; |
1701 | spin_lock_irqsave(&conf->device_lock, flags); | 1707 | spin_unlock_irqrestore(&conf->device_lock, flags); |
1702 | mddev->degraded++; | 1708 | /* |
1703 | spin_unlock_irqrestore(&conf->device_lock, flags); | 1709 | * if recovery was running, make sure it aborts. |
1704 | /* | 1710 | */ |
1705 | * if recovery was running, make sure it aborts. | 1711 | set_bit(MD_RECOVERY_INTR, &mddev->recovery); |
1706 | */ | ||
1707 | set_bit(MD_RECOVERY_INTR, &mddev->recovery); | ||
1708 | } | ||
1709 | set_bit(Faulty, &rdev->flags); | ||
1710 | printk(KERN_ALERT | ||
1711 | "md/raid:%s: Disk failure on %s, disabling device.\n" | ||
1712 | KERN_ALERT | ||
1713 | "md/raid:%s: Operation continuing on %d devices.\n", | ||
1714 | mdname(mddev), | ||
1715 | bdevname(rdev->bdev, b), | ||
1716 | mdname(mddev), | ||
1717 | conf->raid_disks - mddev->degraded); | ||
1718 | } | 1712 | } |
1713 | set_bit(Faulty, &rdev->flags); | ||
1714 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | ||
1715 | printk(KERN_ALERT | ||
1716 | "md/raid:%s: Disk failure on %s, disabling device.\n" | ||
1717 | "md/raid:%s: Operation continuing on %d devices.\n", | ||
1718 | mdname(mddev), | ||
1719 | bdevname(rdev->bdev, b), | ||
1720 | mdname(mddev), | ||
1721 | conf->raid_disks - mddev->degraded); | ||
1719 | } | 1722 | } |
1720 | 1723 | ||
1721 | /* | 1724 | /* |
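One detail of the error() rewrite above is the printk(): the old call concatenated a second KERN_ALERT into the middle of the format string, and since adjacent string literals simply merge, that prefix became literal text inside a single message rather than marking the second line's log level. A userspace illustration of the concatenation effect (the "<1>" value is what KERN_ALERT expanded to in kernels of this era):

    #include <stdio.h>

    #define KERN_ALERT "<1>"

    int main(void)
    {
            /* Adjacent literals merge into one format string, so the second
             * prefix is printed verbatim inside the message.
             */
            printf(KERN_ALERT "Disk failure on sdX, disabling device.\n"
                   KERN_ALERT "Operation continuing on 3 devices.\n");
            return 0;
    }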
@@ -3281,7 +3284,7 @@ static void handle_stripe5(struct stripe_head *sh) | |||
3281 | 3284 | ||
3282 | if (dec_preread_active) { | 3285 | if (dec_preread_active) { |
3283 | /* We delay this until after ops_run_io so that if make_request | 3286 | /* We delay this until after ops_run_io so that if make_request |
3284 | * is waiting on a barrier, it won't continue until the writes | 3287 | * is waiting on a flush, it won't continue until the writes |
3285 | * have actually been submitted. | 3288 | * have actually been submitted. |
3286 | */ | 3289 | */ |
3287 | atomic_dec(&conf->preread_active_stripes); | 3290 | atomic_dec(&conf->preread_active_stripes); |
@@ -3583,7 +3586,7 @@ static void handle_stripe6(struct stripe_head *sh) | |||
3583 | 3586 | ||
3584 | if (dec_preread_active) { | 3587 | if (dec_preread_active) { |
3585 | /* We delay this until after ops_run_io so that if make_request | 3588 | /* We delay this until after ops_run_io so that if make_request |
3586 | * is waiting on a barrier, it won't continue until the writes | 3589 | * is waiting on a flush, it won't continue until the writes |
3587 | * have actually been submitted. | 3590 | * have actually been submitted. |
3588 | */ | 3591 | */ |
3589 | atomic_dec(&conf->preread_active_stripes); | 3592 | atomic_dec(&conf->preread_active_stripes); |
@@ -3616,8 +3619,7 @@ static void raid5_activate_delayed(raid5_conf_t *conf) | |||
3616 | atomic_inc(&conf->preread_active_stripes); | 3619 | atomic_inc(&conf->preread_active_stripes); |
3617 | list_add_tail(&sh->lru, &conf->hold_list); | 3620 | list_add_tail(&sh->lru, &conf->hold_list); |
3618 | } | 3621 | } |
3619 | } else | 3622 | } |
3620 | plugger_set_plug(&conf->plug); | ||
3621 | } | 3623 | } |
3622 | 3624 | ||
3623 | static void activate_bit_delay(raid5_conf_t *conf) | 3625 | static void activate_bit_delay(raid5_conf_t *conf) |
@@ -3634,60 +3636,6 @@ static void activate_bit_delay(raid5_conf_t *conf) | |||
3634 | } | 3636 | } |
3635 | } | 3637 | } |
3636 | 3638 | ||
3637 | static void unplug_slaves(mddev_t *mddev) | ||
3638 | { | ||
3639 | raid5_conf_t *conf = mddev->private; | ||
3640 | int i; | ||
3641 | int devs = max(conf->raid_disks, conf->previous_raid_disks); | ||
3642 | |||
3643 | rcu_read_lock(); | ||
3644 | for (i = 0; i < devs; i++) { | ||
3645 | mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev); | ||
3646 | if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) { | ||
3647 | struct request_queue *r_queue = bdev_get_queue(rdev->bdev); | ||
3648 | |||
3649 | atomic_inc(&rdev->nr_pending); | ||
3650 | rcu_read_unlock(); | ||
3651 | |||
3652 | blk_unplug(r_queue); | ||
3653 | |||
3654 | rdev_dec_pending(rdev, mddev); | ||
3655 | rcu_read_lock(); | ||
3656 | } | ||
3657 | } | ||
3658 | rcu_read_unlock(); | ||
3659 | } | ||
3660 | |||
3661 | void md_raid5_unplug_device(raid5_conf_t *conf) | ||
3662 | { | ||
3663 | unsigned long flags; | ||
3664 | |||
3665 | spin_lock_irqsave(&conf->device_lock, flags); | ||
3666 | |||
3667 | if (plugger_remove_plug(&conf->plug)) { | ||
3668 | conf->seq_flush++; | ||
3669 | raid5_activate_delayed(conf); | ||
3670 | } | ||
3671 | md_wakeup_thread(conf->mddev->thread); | ||
3672 | |||
3673 | spin_unlock_irqrestore(&conf->device_lock, flags); | ||
3674 | |||
3675 | unplug_slaves(conf->mddev); | ||
3676 | } | ||
3677 | EXPORT_SYMBOL_GPL(md_raid5_unplug_device); | ||
3678 | |||
3679 | static void raid5_unplug(struct plug_handle *plug) | ||
3680 | { | ||
3681 | raid5_conf_t *conf = container_of(plug, raid5_conf_t, plug); | ||
3682 | md_raid5_unplug_device(conf); | ||
3683 | } | ||
3684 | |||
3685 | static void raid5_unplug_queue(struct request_queue *q) | ||
3686 | { | ||
3687 | mddev_t *mddev = q->queuedata; | ||
3688 | md_raid5_unplug_device(mddev->private); | ||
3689 | } | ||
3690 | |||
3691 | int md_raid5_congested(mddev_t *mddev, int bits) | 3639 | int md_raid5_congested(mddev_t *mddev, int bits) |
3692 | { | 3640 | { |
3693 | raid5_conf_t *conf = mddev->private; | 3641 | raid5_conf_t *conf = mddev->private; |
@@ -3864,9 +3812,9 @@ static int chunk_aligned_read(mddev_t *mddev, struct bio * raid_bio) | |||
3864 | return 0; | 3812 | return 0; |
3865 | } | 3813 | } |
3866 | /* | 3814 | /* |
3867 | * use bio_clone to make a copy of the bio | 3815 | * use bio_clone_mddev to make a copy of the bio |
3868 | */ | 3816 | */ |
3869 | align_bi = bio_clone(raid_bio, GFP_NOIO); | 3817 | align_bi = bio_clone_mddev(raid_bio, GFP_NOIO, mddev); |
3870 | if (!align_bi) | 3818 | if (!align_bi) |
3871 | return 0; | 3819 | return 0; |
3872 | /* | 3820 | /* |
@@ -3977,15 +3925,10 @@ static int make_request(mddev_t *mddev, struct bio * bi) | |||
3977 | struct stripe_head *sh; | 3925 | struct stripe_head *sh; |
3978 | const int rw = bio_data_dir(bi); | 3926 | const int rw = bio_data_dir(bi); |
3979 | int remaining; | 3927 | int remaining; |
3928 | int plugged; | ||
3980 | 3929 | ||
3981 | if (unlikely(bi->bi_rw & REQ_HARDBARRIER)) { | 3930 | if (unlikely(bi->bi_rw & REQ_FLUSH)) { |
3982 | /* Drain all pending writes. We only really need | 3931 | md_flush_request(mddev, bi); |
3983 | * to ensure they have been submitted, but this is | ||
3984 | * easier. | ||
3985 | */ | ||
3986 | mddev->pers->quiesce(mddev, 1); | ||
3987 | mddev->pers->quiesce(mddev, 0); | ||
3988 | md_barrier_request(mddev, bi); | ||
3989 | return 0; | 3932 | return 0; |
3990 | } | 3933 | } |
3991 | 3934 | ||
@@ -4001,6 +3944,7 @@ static int make_request(mddev_t *mddev, struct bio * bi) | |||
4001 | bi->bi_next = NULL; | 3944 | bi->bi_next = NULL; |
4002 | bi->bi_phys_segments = 1; /* over-loaded to count active stripes */ | 3945 | bi->bi_phys_segments = 1; /* over-loaded to count active stripes */ |
4003 | 3946 | ||
3947 | plugged = mddev_check_plugged(mddev); | ||
4004 | for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) { | 3948 | for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) { |
4005 | DEFINE_WAIT(w); | 3949 | DEFINE_WAIT(w); |
4006 | int disks, data_disks; | 3950 | int disks, data_disks; |
@@ -4014,7 +3958,7 @@ static int make_request(mddev_t *mddev, struct bio * bi) | |||
4014 | /* spinlock is needed as reshape_progress may be | 3958 | /* spinlock is needed as reshape_progress may be |
4015 | * 64bit on a 32bit platform, and so it might be | 3959 | * 64bit on a 32bit platform, and so it might be |
4016 | * possible to see a half-updated value | 3960 | * possible to see a half-updated value |
4017 | * Ofcourse reshape_progress could change after | 3961 | * Of course reshape_progress could change after |
4018 | * the lock is dropped, so once we get a reference | 3962 | * the lock is dropped, so once we get a reference |
4019 | * to the stripe that we think it is, we will have | 3963 | * to the stripe that we think it is, we will have |
4020 | * to check again. | 3964 | * to check again. |
@@ -4095,7 +4039,7 @@ static int make_request(mddev_t *mddev, struct bio * bi) | |||
4095 | * add failed due to overlap. Flush everything | 4039 | * add failed due to overlap. Flush everything |
4096 | * and wait a while | 4040 | * and wait a while |
4097 | */ | 4041 | */ |
4098 | md_raid5_unplug_device(conf); | 4042 | md_wakeup_thread(mddev->thread); |
4099 | release_stripe(sh); | 4043 | release_stripe(sh); |
4100 | schedule(); | 4044 | schedule(); |
4101 | goto retry; | 4045 | goto retry; |
@@ -4103,7 +4047,7 @@ static int make_request(mddev_t *mddev, struct bio * bi) | |||
4103 | finish_wait(&conf->wait_for_overlap, &w); | 4047 | finish_wait(&conf->wait_for_overlap, &w); |
4104 | set_bit(STRIPE_HANDLE, &sh->state); | 4048 | set_bit(STRIPE_HANDLE, &sh->state); |
4105 | clear_bit(STRIPE_DELAYED, &sh->state); | 4049 | clear_bit(STRIPE_DELAYED, &sh->state); |
4106 | if (mddev->barrier && | 4050 | if ((bi->bi_rw & REQ_SYNC) && |
4107 | !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) | 4051 | !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) |
4108 | atomic_inc(&conf->preread_active_stripes); | 4052 | atomic_inc(&conf->preread_active_stripes); |
4109 | release_stripe(sh); | 4053 | release_stripe(sh); |
@@ -4115,6 +4059,9 @@ static int make_request(mddev_t *mddev, struct bio * bi) | |||
4115 | } | 4059 | } |
4116 | 4060 | ||
4117 | } | 4061 | } |
4062 | if (!plugged) | ||
4063 | md_wakeup_thread(mddev->thread); | ||
4064 | |||
4118 | spin_lock_irq(&conf->device_lock); | 4065 | spin_lock_irq(&conf->device_lock); |
4119 | remaining = raid5_dec_bi_phys_segments(bi); | 4066 | remaining = raid5_dec_bi_phys_segments(bi); |
4120 | spin_unlock_irq(&conf->device_lock); | 4067 | spin_unlock_irq(&conf->device_lock); |
@@ -4126,13 +4073,6 @@ static int make_request(mddev_t *mddev, struct bio * bi) | |||
4126 | bio_endio(bi, 0); | 4073 | bio_endio(bi, 0); |
4127 | } | 4074 | } |
4128 | 4075 | ||
4129 | if (mddev->barrier) { | ||
4130 | /* We need to wait for the stripes to all be handled. | ||
4131 | * So: wait for preread_active_stripes to drop to 0. | ||
4132 | */ | ||
4133 | wait_event(mddev->thread->wqueue, | ||
4134 | atomic_read(&conf->preread_active_stripes) == 0); | ||
4135 | } | ||
4136 | return 0; | 4076 | return 0; |
4137 | } | 4077 | } |
4138 | 4078 | ||
@@ -4238,7 +4178,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped | |||
4238 | wait_event(conf->wait_for_overlap, | 4178 | wait_event(conf->wait_for_overlap, |
4239 | atomic_read(&conf->reshape_stripes)==0); | 4179 | atomic_read(&conf->reshape_stripes)==0); |
4240 | mddev->reshape_position = conf->reshape_progress; | 4180 | mddev->reshape_position = conf->reshape_progress; |
4241 | mddev->curr_resync_completed = mddev->curr_resync; | 4181 | mddev->curr_resync_completed = sector_nr; |
4242 | conf->reshape_checkpoint = jiffies; | 4182 | conf->reshape_checkpoint = jiffies; |
4243 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | 4183 | set_bit(MD_CHANGE_DEVS, &mddev->flags); |
4244 | md_wakeup_thread(mddev->thread); | 4184 | md_wakeup_thread(mddev->thread); |
@@ -4339,7 +4279,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped | |||
4339 | wait_event(conf->wait_for_overlap, | 4279 | wait_event(conf->wait_for_overlap, |
4340 | atomic_read(&conf->reshape_stripes) == 0); | 4280 | atomic_read(&conf->reshape_stripes) == 0); |
4341 | mddev->reshape_position = conf->reshape_progress; | 4281 | mddev->reshape_position = conf->reshape_progress; |
4342 | mddev->curr_resync_completed = mddev->curr_resync + reshape_sectors; | 4282 | mddev->curr_resync_completed = sector_nr; |
4343 | conf->reshape_checkpoint = jiffies; | 4283 | conf->reshape_checkpoint = jiffies; |
4344 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | 4284 | set_bit(MD_CHANGE_DEVS, &mddev->flags); |
4345 | md_wakeup_thread(mddev->thread); | 4285 | md_wakeup_thread(mddev->thread); |
@@ -4361,13 +4301,12 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski | |||
4361 | raid5_conf_t *conf = mddev->private; | 4301 | raid5_conf_t *conf = mddev->private; |
4362 | struct stripe_head *sh; | 4302 | struct stripe_head *sh; |
4363 | sector_t max_sector = mddev->dev_sectors; | 4303 | sector_t max_sector = mddev->dev_sectors; |
4364 | int sync_blocks; | 4304 | sector_t sync_blocks; |
4365 | int still_degraded = 0; | 4305 | int still_degraded = 0; |
4366 | int i; | 4306 | int i; |
4367 | 4307 | ||
4368 | if (sector_nr >= max_sector) { | 4308 | if (sector_nr >= max_sector) { |
4369 | /* just being told to finish up .. nothing much to do */ | 4309 | /* just being told to finish up .. nothing much to do */ |
4370 | unplug_slaves(mddev); | ||
4371 | 4310 | ||
4372 | if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) { | 4311 | if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) { |
4373 | end_reshape(conf); | 4312 | end_reshape(conf); |
@@ -4524,24 +4463,30 @@ static void raid5d(mddev_t *mddev) | |||
4524 | struct stripe_head *sh; | 4463 | struct stripe_head *sh; |
4525 | raid5_conf_t *conf = mddev->private; | 4464 | raid5_conf_t *conf = mddev->private; |
4526 | int handled; | 4465 | int handled; |
4466 | struct blk_plug plug; | ||
4527 | 4467 | ||
4528 | pr_debug("+++ raid5d active\n"); | 4468 | pr_debug("+++ raid5d active\n"); |
4529 | 4469 | ||
4530 | md_check_recovery(mddev); | 4470 | md_check_recovery(mddev); |
4531 | 4471 | ||
4472 | blk_start_plug(&plug); | ||
4532 | handled = 0; | 4473 | handled = 0; |
4533 | spin_lock_irq(&conf->device_lock); | 4474 | spin_lock_irq(&conf->device_lock); |
4534 | while (1) { | 4475 | while (1) { |
4535 | struct bio *bio; | 4476 | struct bio *bio; |
4536 | 4477 | ||
4537 | if (conf->seq_flush != conf->seq_write) { | 4478 | if (atomic_read(&mddev->plug_cnt) == 0 && |
4538 | int seq = conf->seq_flush; | 4479 | !list_empty(&conf->bitmap_list)) { |
4480 | /* Now is a good time to flush some bitmap updates */ | ||
4481 | conf->seq_flush++; | ||
4539 | spin_unlock_irq(&conf->device_lock); | 4482 | spin_unlock_irq(&conf->device_lock); |
4540 | bitmap_unplug(mddev->bitmap); | 4483 | bitmap_unplug(mddev->bitmap); |
4541 | spin_lock_irq(&conf->device_lock); | 4484 | spin_lock_irq(&conf->device_lock); |
4542 | conf->seq_write = seq; | 4485 | conf->seq_write = conf->seq_flush; |
4543 | activate_bit_delay(conf); | 4486 | activate_bit_delay(conf); |
4544 | } | 4487 | } |
4488 | if (atomic_read(&mddev->plug_cnt) == 0) | ||
4489 | raid5_activate_delayed(conf); | ||
4545 | 4490 | ||
4546 | while ((bio = remove_bio_from_retry(conf))) { | 4491 | while ((bio = remove_bio_from_retry(conf))) { |
4547 | int ok; | 4492 | int ok; |
@@ -4571,7 +4516,7 @@ static void raid5d(mddev_t *mddev) | |||
4571 | spin_unlock_irq(&conf->device_lock); | 4516 | spin_unlock_irq(&conf->device_lock); |
4572 | 4517 | ||
4573 | async_tx_issue_pending_all(); | 4518 | async_tx_issue_pending_all(); |
4574 | unplug_slaves(mddev); | 4519 | blk_finish_plug(&plug); |
4575 | 4520 | ||
4576 | pr_debug("--- raid5d inactive\n"); | 4521 | pr_debug("--- raid5d inactive\n"); |
4577 | } | 4522 | } |
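raid5d() now brackets its work with the block layer's on-stack plugging (blk_start_plug()/blk_finish_plug()), which is what replaces the md-private plug_handle and the unplug_* helpers deleted earlier in this patch. A minimal sketch of the general pattern (assumed helper, not the raid5d code itself):

    #include <linux/bio.h>
    #include <linux/blkdev.h>

    /* Batch a chain of bios behind an on-stack plug; the block layer holds
     * and merges them until the plug is finished (or the task sleeps).
     */
    static void submit_chain_plugged(struct bio *bios)
    {
            struct blk_plug plug;

            blk_start_plug(&plug);
            while (bios) {
                    struct bio *next = bios->bi_next;

                    bios->bi_next = NULL;
                    generic_make_request(bios);
                    bios = next;
            }
            blk_finish_plug(&plug);
    }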
@@ -4913,7 +4858,7 @@ static raid5_conf_t *setup_conf(mddev_t *mddev) | |||
4913 | printk(KERN_INFO "md/raid:%s: device %s operational as raid" | 4858 | printk(KERN_INFO "md/raid:%s: device %s operational as raid" |
4914 | " disk %d\n", | 4859 | " disk %d\n", |
4915 | mdname(mddev), bdevname(rdev->bdev, b), raid_disk); | 4860 | mdname(mddev), bdevname(rdev->bdev, b), raid_disk); |
4916 | } else | 4861 | } else if (rdev->saved_raid_disk != raid_disk) |
4917 | /* Cannot rely on bitmap to complete recovery */ | 4862 | /* Cannot rely on bitmap to complete recovery */ |
4918 | conf->fullsync = 1; | 4863 | conf->fullsync = 1; |
4919 | } | 4864 | } |
@@ -5188,8 +5133,6 @@ static int run(mddev_t *mddev) | |||
5188 | mdname(mddev)); | 5133 | mdname(mddev)); |
5189 | md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); | 5134 | md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); |
5190 | 5135 | ||
5191 | plugger_init(&conf->plug, raid5_unplug); | ||
5192 | mddev->plug = &conf->plug; | ||
5193 | if (mddev->queue) { | 5136 | if (mddev->queue) { |
5194 | int chunk_size; | 5137 | int chunk_size; |
5195 | /* read-ahead size must cover two whole stripes, which | 5138 | /* read-ahead size must cover two whole stripes, which |
@@ -5206,8 +5149,6 @@ static int run(mddev_t *mddev) | |||
5206 | 5149 | ||
5207 | mddev->queue->backing_dev_info.congested_data = mddev; | 5150 | mddev->queue->backing_dev_info.congested_data = mddev; |
5208 | mddev->queue->backing_dev_info.congested_fn = raid5_congested; | 5151 | mddev->queue->backing_dev_info.congested_fn = raid5_congested; |
5209 | mddev->queue->queue_lock = &conf->device_lock; | ||
5210 | mddev->queue->unplug_fn = raid5_unplug_queue; | ||
5211 | 5152 | ||
5212 | chunk_size = mddev->chunk_sectors << 9; | 5153 | chunk_size = mddev->chunk_sectors << 9; |
5213 | blk_queue_io_min(mddev->queue, chunk_size); | 5154 | blk_queue_io_min(mddev->queue, chunk_size); |
@@ -5240,7 +5181,6 @@ static int stop(mddev_t *mddev) | |||
5240 | mddev->thread = NULL; | 5181 | mddev->thread = NULL; |
5241 | if (mddev->queue) | 5182 | if (mddev->queue) |
5242 | mddev->queue->backing_dev_info.congested_fn = NULL; | 5183 | mddev->queue->backing_dev_info.congested_fn = NULL; |
5243 | plugger_flush(&conf->plug); /* the unplug fn references 'conf'*/ | ||
5244 | free_conf(conf); | 5184 | free_conf(conf); |
5245 | mddev->private = NULL; | 5185 | mddev->private = NULL; |
5246 | mddev->to_remove = &raid5_attrs_group; | 5186 | mddev->to_remove = &raid5_attrs_group; |
@@ -5340,7 +5280,7 @@ static int raid5_spare_active(mddev_t *mddev) | |||
5340 | && !test_bit(Faulty, &tmp->rdev->flags) | 5280 | && !test_bit(Faulty, &tmp->rdev->flags) |
5341 | && !test_and_set_bit(In_sync, &tmp->rdev->flags)) { | 5281 | && !test_and_set_bit(In_sync, &tmp->rdev->flags)) { |
5342 | count++; | 5282 | count++; |
5343 | sysfs_notify_dirent(tmp->rdev->sysfs_state); | 5283 | sysfs_notify_dirent_safe(tmp->rdev->sysfs_state); |
5344 | } | 5284 | } |
5345 | } | 5285 | } |
5346 | spin_lock_irqsave(&conf->device_lock, flags); | 5286 | spin_lock_irqsave(&conf->device_lock, flags); |
@@ -5449,7 +5389,8 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors) | |||
5449 | return -EINVAL; | 5389 | return -EINVAL; |
5450 | set_capacity(mddev->gendisk, mddev->array_sectors); | 5390 | set_capacity(mddev->gendisk, mddev->array_sectors); |
5451 | revalidate_disk(mddev->gendisk); | 5391 | revalidate_disk(mddev->gendisk); |
5452 | if (sectors > mddev->dev_sectors && mddev->recovery_cp == MaxSector) { | 5392 | if (sectors > mddev->dev_sectors && |
5393 | mddev->recovery_cp > mddev->dev_sectors) { | ||
5453 | mddev->recovery_cp = mddev->dev_sectors; | 5394 | mddev->recovery_cp = mddev->dev_sectors; |
5454 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | 5395 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); |
5455 | } | 5396 | } |
@@ -5519,7 +5460,6 @@ static int raid5_start_reshape(mddev_t *mddev) | |||
5519 | raid5_conf_t *conf = mddev->private; | 5460 | raid5_conf_t *conf = mddev->private; |
5520 | mdk_rdev_t *rdev; | 5461 | mdk_rdev_t *rdev; |
5521 | int spares = 0; | 5462 | int spares = 0; |
5522 | int added_devices = 0; | ||
5523 | unsigned long flags; | 5463 | unsigned long flags; |
5524 | 5464 | ||
5525 | if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) | 5465 | if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) |
@@ -5529,8 +5469,8 @@ static int raid5_start_reshape(mddev_t *mddev) | |||
5529 | return -ENOSPC; | 5469 | return -ENOSPC; |
5530 | 5470 | ||
5531 | list_for_each_entry(rdev, &mddev->disks, same_set) | 5471 | list_for_each_entry(rdev, &mddev->disks, same_set) |
5532 | if (rdev->raid_disk < 0 && | 5472 | if (!test_bit(In_sync, &rdev->flags) |
5533 | !test_bit(Faulty, &rdev->flags)) | 5473 | && !test_bit(Faulty, &rdev->flags)) |
5534 | spares++; | 5474 | spares++; |
5535 | 5475 | ||
5536 | if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded) | 5476 | if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded) |
@@ -5573,29 +5513,35 @@ static int raid5_start_reshape(mddev_t *mddev) | |||
5573 | * to correctly record the "partially reconstructed" state of | 5513 | * to correctly record the "partially reconstructed" state of |
5574 | * such devices during the reshape and confusion could result. | 5514 | * such devices during the reshape and confusion could result. |
5575 | */ | 5515 | */ |
5576 | if (mddev->delta_disks >= 0) | 5516 | if (mddev->delta_disks >= 0) { |
5577 | list_for_each_entry(rdev, &mddev->disks, same_set) | 5517 | int added_devices = 0; |
5578 | if (rdev->raid_disk < 0 && | 5518 | list_for_each_entry(rdev, &mddev->disks, same_set) |
5579 | !test_bit(Faulty, &rdev->flags)) { | 5519 | if (rdev->raid_disk < 0 && |
5580 | if (raid5_add_disk(mddev, rdev) == 0) { | 5520 | !test_bit(Faulty, &rdev->flags)) { |
5581 | char nm[20]; | 5521 | if (raid5_add_disk(mddev, rdev) == 0) { |
5582 | if (rdev->raid_disk >= conf->previous_raid_disks) { | 5522 | char nm[20]; |
5583 | set_bit(In_sync, &rdev->flags); | 5523 | if (rdev->raid_disk |
5584 | added_devices++; | 5524 | >= conf->previous_raid_disks) { |
5585 | } else | 5525 | set_bit(In_sync, &rdev->flags); |
5586 | rdev->recovery_offset = 0; | 5526 | added_devices++; |
5587 | sprintf(nm, "rd%d", rdev->raid_disk); | 5527 | } else |
5588 | if (sysfs_create_link(&mddev->kobj, | 5528 | rdev->recovery_offset = 0; |
5589 | &rdev->kobj, nm)) | 5529 | sprintf(nm, "rd%d", rdev->raid_disk); |
5590 | /* Failure here is OK */; | 5530 | if (sysfs_create_link(&mddev->kobj, |
5591 | } else | 5531 | &rdev->kobj, nm)) |
5592 | break; | 5532 | /* Failure here is OK */; |
5593 | } | 5533 | } |
5534 | } else if (rdev->raid_disk >= conf->previous_raid_disks | ||
5535 | && !test_bit(Faulty, &rdev->flags)) { | ||
5536 | /* This is a spare that was manually added */ | ||
5537 | set_bit(In_sync, &rdev->flags); | ||
5538 | added_devices++; | ||
5539 | } | ||
5594 | 5540 | ||
5595 | /* When a reshape changes the number of devices, ->degraded | 5541 | /* When a reshape changes the number of devices, |
5596 | * is measured against the larger of the pre and post number of | 5542 | * ->degraded is measured against the larger of the |
5597 | * devices.*/ | 5543 | * pre and post number of devices. |
5598 | if (mddev->delta_disks > 0) { | 5544 | */ |
5599 | spin_lock_irqsave(&conf->device_lock, flags); | 5545 | spin_lock_irqsave(&conf->device_lock, flags); |
5600 | mddev->degraded += (conf->raid_disks - conf->previous_raid_disks) | 5546 | mddev->degraded += (conf->raid_disks - conf->previous_raid_disks) |
5601 | - added_devices; | 5547 | - added_devices; |
@@ -5731,6 +5677,7 @@ static void raid5_quiesce(mddev_t *mddev, int state) | |||
5731 | static void *raid45_takeover_raid0(mddev_t *mddev, int level) | 5677 | static void *raid45_takeover_raid0(mddev_t *mddev, int level) |
5732 | { | 5678 | { |
5733 | struct raid0_private_data *raid0_priv = mddev->private; | 5679 | struct raid0_private_data *raid0_priv = mddev->private; |
5680 | sector_t sectors; | ||
5734 | 5681 | ||
5735 | /* for raid0 takeover only one zone is supported */ | 5682 | /* for raid0 takeover only one zone is supported */ |
5736 | if (raid0_priv->nr_strip_zones > 1) { | 5683 | if (raid0_priv->nr_strip_zones > 1) { |
@@ -5739,6 +5686,9 @@ static void *raid45_takeover_raid0(mddev_t *mddev, int level) | |||
5739 | return ERR_PTR(-EINVAL); | 5686 | return ERR_PTR(-EINVAL); |
5740 | } | 5687 | } |
5741 | 5688 | ||
5689 | sectors = raid0_priv->strip_zone[0].zone_end; | ||
5690 | sector_div(sectors, raid0_priv->strip_zone[0].nb_dev); | ||
5691 | mddev->dev_sectors = sectors; | ||
5742 | mddev->new_level = level; | 5692 | mddev->new_level = level; |
5743 | mddev->new_layout = ALGORITHM_PARITY_N; | 5693 | mddev->new_layout = ALGORITHM_PARITY_N; |
5744 | mddev->new_chunk_sectors = mddev->chunk_sectors; | 5694 | mddev->new_chunk_sectors = mddev->chunk_sectors; |
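The lines added to raid45_takeover_raid0() derive mddev->dev_sectors from the single raid0 zone: zone_end counts the zone's sectors summed over all members, so dividing by nb_dev (sector_div() being the kernel's 64-by-32-bit divide helper) yields the per-device contribution. A standalone arithmetic sketch with illustrative numbers:

    #include <stdio.h>

    int main(void)
    {
            /* Illustrative: one-zone raid0 of 4 members, 1 TiB (2^31 sectors) each */
            unsigned long long zone_end = 4ULL * 2147483648ULL;
            unsigned int nb_dev = 4;

            printf("dev_sectors = %llu\n", zone_end / nb_dev);  /* 2147483648 */
            return 0;
    }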
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 36eaed5dfd6e..3ca77a2613ba 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h | |||
@@ -275,6 +275,7 @@ struct r6_state { | |||
275 | * filling | 275 | * filling |
276 | */ | 276 | */ |
277 | #define R5_Wantdrain 13 /* dev->towrite needs to be drained */ | 277 | #define R5_Wantdrain 13 /* dev->towrite needs to be drained */ |
278 | #define R5_WantFUA 14 /* Write should be FUA */ | ||
278 | /* | 279 | /* |
279 | * Write method | 280 | * Write method |
280 | */ | 281 | */ |
@@ -399,8 +400,6 @@ struct raid5_private_data { | |||
399 | * Cleared when a sync completes. | 400 | * Cleared when a sync completes. |
400 | */ | 401 | */ |
401 | 402 | ||
402 | struct plug_handle plug; | ||
403 | |||
404 | /* per cpu variables */ | 403 | /* per cpu variables */ |
405 | struct raid5_percpu { | 404 | struct raid5_percpu { |
406 | struct page *spare_page; /* Used when checking P/Q in raid6 */ | 405 | struct page *spare_page; /* Used when checking P/Q in raid6 */ |
@@ -502,6 +501,6 @@ static inline int algorithm_is_DDF(int layout) | |||
502 | } | 501 | } |
503 | 502 | ||
504 | extern int md_raid5_congested(mddev_t *mddev, int bits); | 503 | extern int md_raid5_congested(mddev_t *mddev, int bits); |
505 | extern void md_raid5_unplug_device(raid5_conf_t *conf); | 504 | extern void md_raid5_kick_device(raid5_conf_t *conf); |
506 | extern int raid5_set_cache_size(mddev_t *mddev, int size); | 505 | extern int raid5_set_cache_size(mddev_t *mddev, int size); |
507 | #endif | 506 | #endif |