author     Shaohua Li <shli@fb.com>    2016-12-13 15:40:15 -0500
committer  Shaohua Li <shli@fb.com>    2016-12-13 15:40:15 -0500
commit     20737738d397dfadbca1ea50dcc00d7259f500cf (patch)
tree       5765b1815331bac9ca32208963c850e60806d6de
parent     b78b499a67c3f77aeb6cd0b54724bc38b141255d (diff)
parent     2953079c692da067aeb6345659875b97378f9b0a (diff)
Merge branch 'md-next' into md-linus
-rw-r--r--  drivers/md/bitmap.c              166
-rw-r--r--  drivers/md/dm-raid.c               4
-rw-r--r--  drivers/md/linear.c               31
-rw-r--r--  drivers/md/md.c                  701
-rw-r--r--  drivers/md/md.h                  108
-rw-r--r--  drivers/md/multipath.c            92
-rw-r--r--  drivers/md/raid0.c               107
-rw-r--r--  drivers/md/raid1.c               247
-rw-r--r--  drivers/md/raid1.h                19
-rw-r--r--  drivers/md/raid10.c              295
-rw-r--r--  drivers/md/raid10.h                2
-rw-r--r--  drivers/md/raid5-cache.c        1833
-rw-r--r--  drivers/md/raid5.c               623
-rw-r--r--  drivers/md/raid5.h               172
-rw-r--r--  include/uapi/linux/raid/md_p.h     7
-rw-r--r--  lib/raid6/avx2.c                 232
16 files changed, 3403 insertions, 1236 deletions
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 2d826927a3bf..9fb2ccac958a 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -27,6 +27,7 @@ | |||
27 | #include <linux/mount.h> | 27 | #include <linux/mount.h> |
28 | #include <linux/buffer_head.h> | 28 | #include <linux/buffer_head.h> |
29 | #include <linux/seq_file.h> | 29 | #include <linux/seq_file.h> |
30 | #include <trace/events/block.h> | ||
30 | #include "md.h" | 31 | #include "md.h" |
31 | #include "bitmap.h" | 32 | #include "bitmap.h" |
32 | 33 | ||
@@ -208,11 +209,13 @@ static struct md_rdev *next_active_rdev(struct md_rdev *rdev, struct mddev *mdde | |||
208 | 209 | ||
209 | static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait) | 210 | static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait) |
210 | { | 211 | { |
211 | struct md_rdev *rdev = NULL; | 212 | struct md_rdev *rdev; |
212 | struct block_device *bdev; | 213 | struct block_device *bdev; |
213 | struct mddev *mddev = bitmap->mddev; | 214 | struct mddev *mddev = bitmap->mddev; |
214 | struct bitmap_storage *store = &bitmap->storage; | 215 | struct bitmap_storage *store = &bitmap->storage; |
215 | 216 | ||
217 | restart: | ||
218 | rdev = NULL; | ||
216 | while ((rdev = next_active_rdev(rdev, mddev)) != NULL) { | 219 | while ((rdev = next_active_rdev(rdev, mddev)) != NULL) { |
217 | int size = PAGE_SIZE; | 220 | int size = PAGE_SIZE; |
218 | loff_t offset = mddev->bitmap_info.offset; | 221 | loff_t offset = mddev->bitmap_info.offset; |
@@ -268,8 +271,8 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait) | |||
268 | page); | 271 | page); |
269 | } | 272 | } |
270 | 273 | ||
271 | if (wait) | 274 | if (wait && md_super_wait(mddev) < 0) |
272 | md_super_wait(mddev); | 275 | goto restart; |
273 | return 0; | 276 | return 0; |
274 | 277 | ||
275 | bad_alignment: | 278 | bad_alignment: |
@@ -405,10 +408,10 @@ static int read_page(struct file *file, unsigned long index, | |||
405 | ret = -EIO; | 408 | ret = -EIO; |
406 | out: | 409 | out: |
407 | if (ret) | 410 | if (ret) |
408 | printk(KERN_ALERT "md: bitmap read error: (%dB @ %llu): %d\n", | 411 | pr_err("md: bitmap read error: (%dB @ %llu): %d\n", |
409 | (int)PAGE_SIZE, | 412 | (int)PAGE_SIZE, |
410 | (unsigned long long)index << PAGE_SHIFT, | 413 | (unsigned long long)index << PAGE_SHIFT, |
411 | ret); | 414 | ret); |
412 | return ret; | 415 | return ret; |
413 | } | 416 | } |
414 | 417 | ||
@@ -416,6 +419,28 @@ out: | |||
416 | * bitmap file superblock operations | 419 | * bitmap file superblock operations |
417 | */ | 420 | */ |
418 | 421 | ||
422 | /* | ||
423 | * bitmap_wait_writes() should be called before writing any bitmap | ||
424 | * blocks, to ensure previous writes, particularly from | ||
425 | * bitmap_daemon_work(), have completed. | ||
426 | */ | ||
427 | static void bitmap_wait_writes(struct bitmap *bitmap) | ||
428 | { | ||
429 | if (bitmap->storage.file) | ||
430 | wait_event(bitmap->write_wait, | ||
431 | atomic_read(&bitmap->pending_writes)==0); | ||
432 | else | ||
433 | /* Note that we ignore the return value. The writes | ||
434 | * might have failed, but that would just mean that | ||
435 | * some bits which should be cleared haven't been, | ||
436 | * which is safe. The relevant bitmap blocks will | ||
437 | * probably get written again, but there is no great | ||
438 | * loss if they aren't. | ||
439 | */ | ||
440 | md_super_wait(bitmap->mddev); | ||
441 | } | ||
442 | |||
443 | |||
419 | /* update the event counter and sync the superblock to disk */ | 444 | /* update the event counter and sync the superblock to disk */ |
420 | void bitmap_update_sb(struct bitmap *bitmap) | 445 | void bitmap_update_sb(struct bitmap *bitmap) |
421 | { | 446 | { |
@@ -455,24 +480,24 @@ void bitmap_print_sb(struct bitmap *bitmap) | |||
455 | if (!bitmap || !bitmap->storage.sb_page) | 480 | if (!bitmap || !bitmap->storage.sb_page) |
456 | return; | 481 | return; |
457 | sb = kmap_atomic(bitmap->storage.sb_page); | 482 | sb = kmap_atomic(bitmap->storage.sb_page); |
458 | printk(KERN_DEBUG "%s: bitmap file superblock:\n", bmname(bitmap)); | 483 | pr_debug("%s: bitmap file superblock:\n", bmname(bitmap)); |
459 | printk(KERN_DEBUG " magic: %08x\n", le32_to_cpu(sb->magic)); | 484 | pr_debug(" magic: %08x\n", le32_to_cpu(sb->magic)); |
460 | printk(KERN_DEBUG " version: %d\n", le32_to_cpu(sb->version)); | 485 | pr_debug(" version: %d\n", le32_to_cpu(sb->version)); |
461 | printk(KERN_DEBUG " uuid: %08x.%08x.%08x.%08x\n", | 486 | pr_debug(" uuid: %08x.%08x.%08x.%08x\n", |
462 | *(__u32 *)(sb->uuid+0), | 487 | *(__u32 *)(sb->uuid+0), |
463 | *(__u32 *)(sb->uuid+4), | 488 | *(__u32 *)(sb->uuid+4), |
464 | *(__u32 *)(sb->uuid+8), | 489 | *(__u32 *)(sb->uuid+8), |
465 | *(__u32 *)(sb->uuid+12)); | 490 | *(__u32 *)(sb->uuid+12)); |
466 | printk(KERN_DEBUG " events: %llu\n", | 491 | pr_debug(" events: %llu\n", |
467 | (unsigned long long) le64_to_cpu(sb->events)); | 492 | (unsigned long long) le64_to_cpu(sb->events)); |
468 | printk(KERN_DEBUG "events cleared: %llu\n", | 493 | pr_debug("events cleared: %llu\n", |
469 | (unsigned long long) le64_to_cpu(sb->events_cleared)); | 494 | (unsigned long long) le64_to_cpu(sb->events_cleared)); |
470 | printk(KERN_DEBUG " state: %08x\n", le32_to_cpu(sb->state)); | 495 | pr_debug(" state: %08x\n", le32_to_cpu(sb->state)); |
471 | printk(KERN_DEBUG " chunksize: %d B\n", le32_to_cpu(sb->chunksize)); | 496 | pr_debug(" chunksize: %d B\n", le32_to_cpu(sb->chunksize)); |
472 | printk(KERN_DEBUG " daemon sleep: %ds\n", le32_to_cpu(sb->daemon_sleep)); | 497 | pr_debug(" daemon sleep: %ds\n", le32_to_cpu(sb->daemon_sleep)); |
473 | printk(KERN_DEBUG " sync size: %llu KB\n", | 498 | pr_debug(" sync size: %llu KB\n", |
474 | (unsigned long long)le64_to_cpu(sb->sync_size)/2); | 499 | (unsigned long long)le64_to_cpu(sb->sync_size)/2); |
475 | printk(KERN_DEBUG "max write behind: %d\n", le32_to_cpu(sb->write_behind)); | 500 | pr_debug("max write behind: %d\n", le32_to_cpu(sb->write_behind)); |
476 | kunmap_atomic(sb); | 501 | kunmap_atomic(sb); |
477 | } | 502 | } |
478 | 503 | ||
@@ -506,14 +531,14 @@ static int bitmap_new_disk_sb(struct bitmap *bitmap) | |||
506 | BUG_ON(!chunksize); | 531 | BUG_ON(!chunksize); |
507 | if (!is_power_of_2(chunksize)) { | 532 | if (!is_power_of_2(chunksize)) { |
508 | kunmap_atomic(sb); | 533 | kunmap_atomic(sb); |
509 | printk(KERN_ERR "bitmap chunksize not a power of 2\n"); | 534 | pr_warn("bitmap chunksize not a power of 2\n"); |
510 | return -EINVAL; | 535 | return -EINVAL; |
511 | } | 536 | } |
512 | sb->chunksize = cpu_to_le32(chunksize); | 537 | sb->chunksize = cpu_to_le32(chunksize); |
513 | 538 | ||
514 | daemon_sleep = bitmap->mddev->bitmap_info.daemon_sleep; | 539 | daemon_sleep = bitmap->mddev->bitmap_info.daemon_sleep; |
515 | if (!daemon_sleep || (daemon_sleep > MAX_SCHEDULE_TIMEOUT)) { | 540 | if (!daemon_sleep || (daemon_sleep > MAX_SCHEDULE_TIMEOUT)) { |
516 | printk(KERN_INFO "Choosing daemon_sleep default (5 sec)\n"); | 541 | pr_debug("Choosing daemon_sleep default (5 sec)\n"); |
517 | daemon_sleep = 5 * HZ; | 542 | daemon_sleep = 5 * HZ; |
518 | } | 543 | } |
519 | sb->daemon_sleep = cpu_to_le32(daemon_sleep); | 544 | sb->daemon_sleep = cpu_to_le32(daemon_sleep); |
@@ -584,7 +609,7 @@ re_read: | |||
584 | /* to 4k blocks */ | 609 | /* to 4k blocks */ |
585 | bm_blocks = DIV_ROUND_UP_SECTOR_T(bm_blocks, 4096); | 610 | bm_blocks = DIV_ROUND_UP_SECTOR_T(bm_blocks, 4096); |
586 | offset = bitmap->mddev->bitmap_info.offset + (bitmap->cluster_slot * (bm_blocks << 3)); | 611 | offset = bitmap->mddev->bitmap_info.offset + (bitmap->cluster_slot * (bm_blocks << 3)); |
587 | pr_info("%s:%d bm slot: %d offset: %llu\n", __func__, __LINE__, | 612 | pr_debug("%s:%d bm slot: %d offset: %llu\n", __func__, __LINE__, |
588 | bitmap->cluster_slot, offset); | 613 | bitmap->cluster_slot, offset); |
589 | } | 614 | } |
590 | 615 | ||
@@ -634,7 +659,7 @@ re_read: | |||
634 | else if (write_behind > COUNTER_MAX) | 659 | else if (write_behind > COUNTER_MAX) |
635 | reason = "write-behind limit out of range (0 - 16383)"; | 660 | reason = "write-behind limit out of range (0 - 16383)"; |
636 | if (reason) { | 661 | if (reason) { |
637 | printk(KERN_INFO "%s: invalid bitmap file superblock: %s\n", | 662 | pr_warn("%s: invalid bitmap file superblock: %s\n", |
638 | bmname(bitmap), reason); | 663 | bmname(bitmap), reason); |
639 | goto out; | 664 | goto out; |
640 | } | 665 | } |
@@ -648,18 +673,15 @@ re_read: | |||
648 | * bitmap's UUID and event counter to the mddev's | 673 | * bitmap's UUID and event counter to the mddev's |
649 | */ | 674 | */ |
650 | if (memcmp(sb->uuid, bitmap->mddev->uuid, 16)) { | 675 | if (memcmp(sb->uuid, bitmap->mddev->uuid, 16)) { |
651 | printk(KERN_INFO | 676 | pr_warn("%s: bitmap superblock UUID mismatch\n", |
652 | "%s: bitmap superblock UUID mismatch\n", | 677 | bmname(bitmap)); |
653 | bmname(bitmap)); | ||
654 | goto out; | 678 | goto out; |
655 | } | 679 | } |
656 | events = le64_to_cpu(sb->events); | 680 | events = le64_to_cpu(sb->events); |
657 | if (!nodes && (events < bitmap->mddev->events)) { | 681 | if (!nodes && (events < bitmap->mddev->events)) { |
658 | printk(KERN_INFO | 682 | pr_warn("%s: bitmap file is out of date (%llu < %llu) -- forcing full recovery\n", |
659 | "%s: bitmap file is out of date (%llu < %llu) " | 683 | bmname(bitmap), events, |
660 | "-- forcing full recovery\n", | 684 | (unsigned long long) bitmap->mddev->events); |
661 | bmname(bitmap), events, | ||
662 | (unsigned long long) bitmap->mddev->events); | ||
663 | set_bit(BITMAP_STALE, &bitmap->flags); | 685 | set_bit(BITMAP_STALE, &bitmap->flags); |
664 | } | 686 | } |
665 | } | 687 | } |
@@ -679,8 +701,8 @@ out: | |||
679 | if (err == 0 && nodes && (bitmap->cluster_slot < 0)) { | 701 | if (err == 0 && nodes && (bitmap->cluster_slot < 0)) { |
680 | err = md_setup_cluster(bitmap->mddev, nodes); | 702 | err = md_setup_cluster(bitmap->mddev, nodes); |
681 | if (err) { | 703 | if (err) { |
682 | pr_err("%s: Could not setup cluster service (%d)\n", | 704 | pr_warn("%s: Could not setup cluster service (%d)\n", |
683 | bmname(bitmap), err); | 705 | bmname(bitmap), err); |
684 | goto out_no_sb; | 706 | goto out_no_sb; |
685 | } | 707 | } |
686 | bitmap->cluster_slot = md_cluster_ops->slot_number(bitmap->mddev); | 708 | bitmap->cluster_slot = md_cluster_ops->slot_number(bitmap->mddev); |
@@ -847,15 +869,13 @@ static void bitmap_file_kick(struct bitmap *bitmap) | |||
847 | ptr = file_path(bitmap->storage.file, | 869 | ptr = file_path(bitmap->storage.file, |
848 | path, PAGE_SIZE); | 870 | path, PAGE_SIZE); |
849 | 871 | ||
850 | printk(KERN_ALERT | 872 | pr_warn("%s: kicking failed bitmap file %s from array!\n", |
851 | "%s: kicking failed bitmap file %s from array!\n", | 873 | bmname(bitmap), IS_ERR(ptr) ? "" : ptr); |
852 | bmname(bitmap), IS_ERR(ptr) ? "" : ptr); | ||
853 | 874 | ||
854 | kfree(path); | 875 | kfree(path); |
855 | } else | 876 | } else |
856 | printk(KERN_ALERT | 877 | pr_warn("%s: disabling internal bitmap due to errors\n", |
857 | "%s: disabling internal bitmap due to errors\n", | 878 | bmname(bitmap)); |
858 | bmname(bitmap)); | ||
859 | } | 879 | } |
860 | } | 880 | } |
861 | 881 | ||
@@ -983,6 +1003,7 @@ void bitmap_unplug(struct bitmap *bitmap) | |||
983 | { | 1003 | { |
984 | unsigned long i; | 1004 | unsigned long i; |
985 | int dirty, need_write; | 1005 | int dirty, need_write; |
1006 | int writing = 0; | ||
986 | 1007 | ||
987 | if (!bitmap || !bitmap->storage.filemap || | 1008 | if (!bitmap || !bitmap->storage.filemap || |
988 | test_bit(BITMAP_STALE, &bitmap->flags)) | 1009 | test_bit(BITMAP_STALE, &bitmap->flags)) |
@@ -997,15 +1018,19 @@ void bitmap_unplug(struct bitmap *bitmap) | |||
997 | need_write = test_and_clear_page_attr(bitmap, i, | 1018 | need_write = test_and_clear_page_attr(bitmap, i, |
998 | BITMAP_PAGE_NEEDWRITE); | 1019 | BITMAP_PAGE_NEEDWRITE); |
999 | if (dirty || need_write) { | 1020 | if (dirty || need_write) { |
1021 | if (!writing) { | ||
1022 | bitmap_wait_writes(bitmap); | ||
1023 | if (bitmap->mddev->queue) | ||
1024 | blk_add_trace_msg(bitmap->mddev->queue, | ||
1025 | "md bitmap_unplug"); | ||
1026 | } | ||
1000 | clear_page_attr(bitmap, i, BITMAP_PAGE_PENDING); | 1027 | clear_page_attr(bitmap, i, BITMAP_PAGE_PENDING); |
1001 | write_page(bitmap, bitmap->storage.filemap[i], 0); | 1028 | write_page(bitmap, bitmap->storage.filemap[i], 0); |
1029 | writing = 1; | ||
1002 | } | 1030 | } |
1003 | } | 1031 | } |
1004 | if (bitmap->storage.file) | 1032 | if (writing) |
1005 | wait_event(bitmap->write_wait, | 1033 | bitmap_wait_writes(bitmap); |
1006 | atomic_read(&bitmap->pending_writes)==0); | ||
1007 | else | ||
1008 | md_super_wait(bitmap->mddev); | ||
1009 | 1034 | ||
1010 | if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags)) | 1035 | if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags)) |
1011 | bitmap_file_kick(bitmap); | 1036 | bitmap_file_kick(bitmap); |
@@ -1056,14 +1081,13 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) | |||
1056 | 1081 | ||
1057 | outofdate = test_bit(BITMAP_STALE, &bitmap->flags); | 1082 | outofdate = test_bit(BITMAP_STALE, &bitmap->flags); |
1058 | if (outofdate) | 1083 | if (outofdate) |
1059 | printk(KERN_INFO "%s: bitmap file is out of date, doing full " | 1084 | pr_warn("%s: bitmap file is out of date, doing full recovery\n", bmname(bitmap)); |
1060 | "recovery\n", bmname(bitmap)); | ||
1061 | 1085 | ||
1062 | if (file && i_size_read(file->f_mapping->host) < store->bytes) { | 1086 | if (file && i_size_read(file->f_mapping->host) < store->bytes) { |
1063 | printk(KERN_INFO "%s: bitmap file too short %lu < %lu\n", | 1087 | pr_warn("%s: bitmap file too short %lu < %lu\n", |
1064 | bmname(bitmap), | 1088 | bmname(bitmap), |
1065 | (unsigned long) i_size_read(file->f_mapping->host), | 1089 | (unsigned long) i_size_read(file->f_mapping->host), |
1066 | store->bytes); | 1090 | store->bytes); |
1067 | goto err; | 1091 | goto err; |
1068 | } | 1092 | } |
1069 | 1093 | ||
@@ -1137,16 +1161,15 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) | |||
1137 | offset = 0; | 1161 | offset = 0; |
1138 | } | 1162 | } |
1139 | 1163 | ||
1140 | printk(KERN_INFO "%s: bitmap initialized from disk: " | 1164 | pr_debug("%s: bitmap initialized from disk: read %lu pages, set %lu of %lu bits\n", |
1141 | "read %lu pages, set %lu of %lu bits\n", | 1165 | bmname(bitmap), store->file_pages, |
1142 | bmname(bitmap), store->file_pages, | 1166 | bit_cnt, chunks); |
1143 | bit_cnt, chunks); | ||
1144 | 1167 | ||
1145 | return 0; | 1168 | return 0; |
1146 | 1169 | ||
1147 | err: | 1170 | err: |
1148 | printk(KERN_INFO "%s: bitmap initialisation failed: %d\n", | 1171 | pr_warn("%s: bitmap initialisation failed: %d\n", |
1149 | bmname(bitmap), ret); | 1172 | bmname(bitmap), ret); |
1150 | return ret; | 1173 | return ret; |
1151 | } | 1174 | } |
1152 | 1175 | ||
@@ -1225,6 +1248,10 @@ void bitmap_daemon_work(struct mddev *mddev) | |||
1225 | } | 1248 | } |
1226 | bitmap->allclean = 1; | 1249 | bitmap->allclean = 1; |
1227 | 1250 | ||
1251 | if (bitmap->mddev->queue) | ||
1252 | blk_add_trace_msg(bitmap->mddev->queue, | ||
1253 | "md bitmap_daemon_work"); | ||
1254 | |||
1228 | /* Any file-page which is PENDING now needs to be written. | 1255 | /* Any file-page which is PENDING now needs to be written. |
1229 | * So set NEEDWRITE now, then after we make any last-minute changes | 1256 | * So set NEEDWRITE now, then after we make any last-minute changes |
1230 | * we will write it. | 1257 | * we will write it. |
@@ -1289,6 +1316,7 @@ void bitmap_daemon_work(struct mddev *mddev) | |||
1289 | } | 1316 | } |
1290 | spin_unlock_irq(&counts->lock); | 1317 | spin_unlock_irq(&counts->lock); |
1291 | 1318 | ||
1319 | bitmap_wait_writes(bitmap); | ||
1292 | /* Now start writeout on any page in NEEDWRITE that isn't DIRTY. | 1320 | /* Now start writeout on any page in NEEDWRITE that isn't DIRTY. |
1293 | * DIRTY pages need to be written by bitmap_unplug so it can wait | 1321 | * DIRTY pages need to be written by bitmap_unplug so it can wait |
1294 | * for them. | 1322 | * for them. |
@@ -1595,7 +1623,7 @@ void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector, bool force) | |||
1595 | atomic_read(&bitmap->mddev->recovery_active) == 0); | 1623 | atomic_read(&bitmap->mddev->recovery_active) == 0); |
1596 | 1624 | ||
1597 | bitmap->mddev->curr_resync_completed = sector; | 1625 | bitmap->mddev->curr_resync_completed = sector; |
1598 | set_bit(MD_CHANGE_CLEAN, &bitmap->mddev->flags); | 1626 | set_bit(MD_SB_CHANGE_CLEAN, &bitmap->mddev->sb_flags); |
1599 | sector &= ~((1ULL << bitmap->counts.chunkshift) - 1); | 1627 | sector &= ~((1ULL << bitmap->counts.chunkshift) - 1); |
1600 | s = 0; | 1628 | s = 0; |
1601 | while (s < sector && s < bitmap->mddev->resync_max_sectors) { | 1629 | while (s < sector && s < bitmap->mddev->resync_max_sectors) { |
@@ -1825,8 +1853,8 @@ struct bitmap *bitmap_create(struct mddev *mddev, int slot) | |||
1825 | if (err) | 1853 | if (err) |
1826 | goto error; | 1854 | goto error; |
1827 | 1855 | ||
1828 | printk(KERN_INFO "created bitmap (%lu pages) for device %s\n", | 1856 | pr_debug("created bitmap (%lu pages) for device %s\n", |
1829 | bitmap->counts.pages, bmname(bitmap)); | 1857 | bitmap->counts.pages, bmname(bitmap)); |
1830 | 1858 | ||
1831 | err = test_bit(BITMAP_WRITE_ERROR, &bitmap->flags) ? -EIO : 0; | 1859 | err = test_bit(BITMAP_WRITE_ERROR, &bitmap->flags) ? -EIO : 0; |
1832 | if (err) | 1860 | if (err) |
@@ -2029,8 +2057,10 @@ int bitmap_resize(struct bitmap *bitmap, sector_t blocks, | |||
2029 | !bitmap->mddev->bitmap_info.external, | 2057 | !bitmap->mddev->bitmap_info.external, |
2030 | mddev_is_clustered(bitmap->mddev) | 2058 | mddev_is_clustered(bitmap->mddev) |
2031 | ? bitmap->cluster_slot : 0); | 2059 | ? bitmap->cluster_slot : 0); |
2032 | if (ret) | 2060 | if (ret) { |
2061 | bitmap_file_unmap(&store); | ||
2033 | goto err; | 2062 | goto err; |
2063 | } | ||
2034 | 2064 | ||
2035 | pages = DIV_ROUND_UP(chunks, PAGE_COUNTER_RATIO); | 2065 | pages = DIV_ROUND_UP(chunks, PAGE_COUNTER_RATIO); |
2036 | 2066 | ||
@@ -2089,7 +2119,7 @@ int bitmap_resize(struct bitmap *bitmap, sector_t blocks, | |||
2089 | bitmap->mddev->bitmap_info.chunksize = 1 << (old_counts.chunkshift + | 2119 | bitmap->mddev->bitmap_info.chunksize = 1 << (old_counts.chunkshift + |
2090 | BITMAP_BLOCK_SHIFT); | 2120 | BITMAP_BLOCK_SHIFT); |
2091 | blocks = old_counts.chunks << old_counts.chunkshift; | 2121 | blocks = old_counts.chunks << old_counts.chunkshift; |
2092 | pr_err("Could not pre-allocate in-memory bitmap for cluster raid\n"); | 2122 | pr_warn("Could not pre-allocate in-memory bitmap for cluster raid\n"); |
2093 | break; | 2123 | break; |
2094 | } else | 2124 | } else |
2095 | bitmap->counts.bp[page].count += 1; | 2125 | bitmap->counts.bp[page].count += 1; |
@@ -2266,7 +2296,7 @@ location_store(struct mddev *mddev, const char *buf, size_t len) | |||
2266 | /* Ensure new bitmap info is stored in | 2296 | /* Ensure new bitmap info is stored in |
2267 | * metadata promptly. | 2297 | * metadata promptly. |
2268 | */ | 2298 | */ |
2269 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | 2299 | set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); |
2270 | md_wakeup_thread(mddev->thread); | 2300 | md_wakeup_thread(mddev->thread); |
2271 | } | 2301 | } |
2272 | rv = 0; | 2302 | rv = 0; |
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 6d53810963f7..953159d9a825 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -2011,7 +2011,7 @@ static int super_load(struct md_rdev *rdev, struct md_rdev *refdev) | |||
2011 | sb->compat_features = cpu_to_le32(FEATURE_FLAG_SUPPORTS_V190); | 2011 | sb->compat_features = cpu_to_le32(FEATURE_FLAG_SUPPORTS_V190); |
2012 | 2012 | ||
2013 | /* Force writing of superblocks to disk */ | 2013 | /* Force writing of superblocks to disk */ |
2014 | set_bit(MD_CHANGE_DEVS, &rdev->mddev->flags); | 2014 | set_bit(MD_SB_CHANGE_DEVS, &rdev->mddev->sb_flags); |
2015 | 2015 | ||
2016 | /* Any superblock is better than none, choose that if given */ | 2016 | /* Any superblock is better than none, choose that if given */ |
2017 | return refdev ? 0 : 1; | 2017 | return refdev ? 0 : 1; |
@@ -3497,7 +3497,7 @@ static void rs_update_sbs(struct raid_set *rs) | |||
3497 | struct mddev *mddev = &rs->md; | 3497 | struct mddev *mddev = &rs->md; |
3498 | int ro = mddev->ro; | 3498 | int ro = mddev->ro; |
3499 | 3499 | ||
3500 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | 3500 | set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); |
3501 | mddev->ro = 0; | 3501 | mddev->ro = 0; |
3502 | md_update_sb(mddev, 1); | 3502 | md_update_sb(mddev, 1); |
3503 | mddev->ro = ro; | 3503 | mddev->ro = ro; |
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 86f5d435901d..5975c9915684 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -21,6 +21,7 @@ | |||
21 | #include <linux/seq_file.h> | 21 | #include <linux/seq_file.h> |
22 | #include <linux/module.h> | 22 | #include <linux/module.h> |
23 | #include <linux/slab.h> | 23 | #include <linux/slab.h> |
24 | #include <trace/events/block.h> | ||
24 | #include "md.h" | 25 | #include "md.h" |
25 | #include "linear.h" | 26 | #include "linear.h" |
26 | 27 | ||
@@ -101,8 +102,8 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks) | |||
101 | sector_t sectors; | 102 | sector_t sectors; |
102 | 103 | ||
103 | if (j < 0 || j >= raid_disks || disk->rdev) { | 104 | if (j < 0 || j >= raid_disks || disk->rdev) { |
104 | printk(KERN_ERR "md/linear:%s: disk numbering problem. Aborting!\n", | 105 | pr_warn("md/linear:%s: disk numbering problem. Aborting!\n", |
105 | mdname(mddev)); | 106 | mdname(mddev)); |
106 | goto out; | 107 | goto out; |
107 | } | 108 | } |
108 | 109 | ||
@@ -123,8 +124,8 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks) | |||
123 | discard_supported = true; | 124 | discard_supported = true; |
124 | } | 125 | } |
125 | if (cnt != raid_disks) { | 126 | if (cnt != raid_disks) { |
126 | printk(KERN_ERR "md/linear:%s: not enough drives present. Aborting!\n", | 127 | pr_warn("md/linear:%s: not enough drives present. Aborting!\n", |
127 | mdname(mddev)); | 128 | mdname(mddev)); |
128 | goto out; | 129 | goto out; |
129 | } | 130 | } |
130 | 131 | ||
@@ -227,22 +228,22 @@ static void linear_make_request(struct mddev *mddev, struct bio *bio) | |||
227 | } | 228 | } |
228 | 229 | ||
229 | do { | 230 | do { |
230 | tmp_dev = which_dev(mddev, bio->bi_iter.bi_sector); | 231 | sector_t bio_sector = bio->bi_iter.bi_sector; |
232 | tmp_dev = which_dev(mddev, bio_sector); | ||
231 | start_sector = tmp_dev->end_sector - tmp_dev->rdev->sectors; | 233 | start_sector = tmp_dev->end_sector - tmp_dev->rdev->sectors; |
232 | end_sector = tmp_dev->end_sector; | 234 | end_sector = tmp_dev->end_sector; |
233 | data_offset = tmp_dev->rdev->data_offset; | 235 | data_offset = tmp_dev->rdev->data_offset; |
234 | bio->bi_bdev = tmp_dev->rdev->bdev; | 236 | bio->bi_bdev = tmp_dev->rdev->bdev; |
235 | 237 | ||
236 | if (unlikely(bio->bi_iter.bi_sector >= end_sector || | 238 | if (unlikely(bio_sector >= end_sector || |
237 | bio->bi_iter.bi_sector < start_sector)) | 239 | bio_sector < start_sector)) |
238 | goto out_of_bounds; | 240 | goto out_of_bounds; |
239 | 241 | ||
240 | if (unlikely(bio_end_sector(bio) > end_sector)) { | 242 | if (unlikely(bio_end_sector(bio) > end_sector)) { |
241 | /* This bio crosses a device boundary, so we have to | 243 | /* This bio crosses a device boundary, so we have to |
242 | * split it. | 244 | * split it. |
243 | */ | 245 | */ |
244 | split = bio_split(bio, end_sector - | 246 | split = bio_split(bio, end_sector - bio_sector, |
245 | bio->bi_iter.bi_sector, | ||
246 | GFP_NOIO, fs_bio_set); | 247 | GFP_NOIO, fs_bio_set); |
247 | bio_chain(split, bio); | 248 | bio_chain(split, bio); |
248 | } else { | 249 | } else { |
@@ -256,15 +257,18 @@ static void linear_make_request(struct mddev *mddev, struct bio *bio) | |||
256 | !blk_queue_discard(bdev_get_queue(split->bi_bdev)))) { | 257 | !blk_queue_discard(bdev_get_queue(split->bi_bdev)))) { |
257 | /* Just ignore it */ | 258 | /* Just ignore it */ |
258 | bio_endio(split); | 259 | bio_endio(split); |
259 | } else | 260 | } else { |
261 | if (mddev->gendisk) | ||
262 | trace_block_bio_remap(bdev_get_queue(split->bi_bdev), | ||
263 | split, disk_devt(mddev->gendisk), | ||
264 | bio_sector); | ||
260 | generic_make_request(split); | 265 | generic_make_request(split); |
266 | } | ||
261 | } while (split != bio); | 267 | } while (split != bio); |
262 | return; | 268 | return; |
263 | 269 | ||
264 | out_of_bounds: | 270 | out_of_bounds: |
265 | printk(KERN_ERR | 271 | pr_err("md/linear:%s: make_request: Sector %llu out of bounds on dev %s: %llu sectors, offset %llu\n", |
266 | "md/linear:%s: make_request: Sector %llu out of bounds on " | ||
267 | "dev %s: %llu sectors, offset %llu\n", | ||
268 | mdname(mddev), | 272 | mdname(mddev), |
269 | (unsigned long long)bio->bi_iter.bi_sector, | 273 | (unsigned long long)bio->bi_iter.bi_sector, |
270 | bdevname(tmp_dev->rdev->bdev, b), | 274 | bdevname(tmp_dev->rdev->bdev, b), |
@@ -275,7 +279,6 @@ out_of_bounds: | |||
275 | 279 | ||
276 | static void linear_status (struct seq_file *seq, struct mddev *mddev) | 280 | static void linear_status (struct seq_file *seq, struct mddev *mddev) |
277 | { | 281 | { |
278 | |||
279 | seq_printf(seq, " %dk rounding", mddev->chunk_sectors / 2); | 282 | seq_printf(seq, " %dk rounding", mddev->chunk_sectors / 2); |
280 | } | 283 | } |
281 | 284 | ||
diff --git a/drivers/md/md.c b/drivers/md/md.c
index f975cd08923d..82821ee0d57f 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -30,6 +30,18 @@ | |||
30 | You should have received a copy of the GNU General Public License | 30 | You should have received a copy of the GNU General Public License |
31 | (for example /usr/src/linux/COPYING); if not, write to the Free | 31 | (for example /usr/src/linux/COPYING); if not, write to the Free |
32 | Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | 32 | Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. |
33 | |||
34 | Errors, Warnings, etc. | ||
35 | Please use: | ||
36 | pr_crit() for error conditions that risk data loss | ||
37 | pr_err() for error conditions that are unexpected, like an IO error | ||
38 | or internal inconsistency | ||
39 | pr_warn() for error conditions that could have been predicated, like | ||
40 | adding a device to an array when it has incompatible metadata | ||
41 | pr_info() for every interesting, very rare events, like an array starting | ||
42 | or stopping, or resync starting or stopping | ||
43 | pr_debug() for everything else. | ||
44 | |||
33 | */ | 45 | */ |
34 | 46 | ||
35 | #include <linux/kthread.h> | 47 | #include <linux/kthread.h> |
@@ -52,6 +64,7 @@ | |||
52 | #include <linux/raid/md_p.h> | 64 | #include <linux/raid/md_p.h> |
53 | #include <linux/raid/md_u.h> | 65 | #include <linux/raid/md_u.h> |
54 | #include <linux/slab.h> | 66 | #include <linux/slab.h> |
67 | #include <trace/events/block.h> | ||
55 | #include "md.h" | 68 | #include "md.h" |
56 | #include "bitmap.h" | 69 | #include "bitmap.h" |
57 | #include "md-cluster.h" | 70 | #include "md-cluster.h" |
@@ -684,11 +697,8 @@ static inline sector_t calc_dev_sboffset(struct md_rdev *rdev) | |||
684 | static int alloc_disk_sb(struct md_rdev *rdev) | 697 | static int alloc_disk_sb(struct md_rdev *rdev) |
685 | { | 698 | { |
686 | rdev->sb_page = alloc_page(GFP_KERNEL); | 699 | rdev->sb_page = alloc_page(GFP_KERNEL); |
687 | if (!rdev->sb_page) { | 700 | if (!rdev->sb_page) |
688 | printk(KERN_ALERT "md: out of memory.\n"); | ||
689 | return -ENOMEM; | 701 | return -ENOMEM; |
690 | } | ||
691 | |||
692 | return 0; | 702 | return 0; |
693 | } | 703 | } |
694 | 704 | ||
@@ -715,9 +725,15 @@ static void super_written(struct bio *bio) | |||
715 | struct mddev *mddev = rdev->mddev; | 725 | struct mddev *mddev = rdev->mddev; |
716 | 726 | ||
717 | if (bio->bi_error) { | 727 | if (bio->bi_error) { |
718 | printk("md: super_written gets error=%d\n", bio->bi_error); | 728 | pr_err("md: super_written gets error=%d\n", bio->bi_error); |
719 | md_error(mddev, rdev); | 729 | md_error(mddev, rdev); |
720 | } | 730 | if (!test_bit(Faulty, &rdev->flags) |
731 | && (bio->bi_opf & MD_FAILFAST)) { | ||
732 | set_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags); | ||
733 | set_bit(LastDev, &rdev->flags); | ||
734 | } | ||
735 | } else | ||
736 | clear_bit(LastDev, &rdev->flags); | ||
721 | 737 | ||
722 | if (atomic_dec_and_test(&mddev->pending_writes)) | 738 | if (atomic_dec_and_test(&mddev->pending_writes)) |
723 | wake_up(&mddev->sb_wait); | 739 | wake_up(&mddev->sb_wait); |
@@ -734,7 +750,13 @@ void md_super_write(struct mddev *mddev, struct md_rdev *rdev, | |||
734 | * if zero is reached. | 750 | * if zero is reached. |
735 | * If an error occurred, call md_error | 751 | * If an error occurred, call md_error |
736 | */ | 752 | */ |
737 | struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, mddev); | 753 | struct bio *bio; |
754 | int ff = 0; | ||
755 | |||
756 | if (test_bit(Faulty, &rdev->flags)) | ||
757 | return; | ||
758 | |||
759 | bio = bio_alloc_mddev(GFP_NOIO, 1, mddev); | ||
738 | 760 | ||
739 | atomic_inc(&rdev->nr_pending); | 761 | atomic_inc(&rdev->nr_pending); |
740 | 762 | ||
@@ -743,16 +765,24 @@ void md_super_write(struct mddev *mddev, struct md_rdev *rdev, | |||
743 | bio_add_page(bio, page, size, 0); | 765 | bio_add_page(bio, page, size, 0); |
744 | bio->bi_private = rdev; | 766 | bio->bi_private = rdev; |
745 | bio->bi_end_io = super_written; | 767 | bio->bi_end_io = super_written; |
746 | bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_FUA; | 768 | |
769 | if (test_bit(MD_FAILFAST_SUPPORTED, &mddev->flags) && | ||
770 | test_bit(FailFast, &rdev->flags) && | ||
771 | !test_bit(LastDev, &rdev->flags)) | ||
772 | ff = MD_FAILFAST; | ||
773 | bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_FUA | ff; | ||
747 | 774 | ||
748 | atomic_inc(&mddev->pending_writes); | 775 | atomic_inc(&mddev->pending_writes); |
749 | submit_bio(bio); | 776 | submit_bio(bio); |
750 | } | 777 | } |
751 | 778 | ||
752 | void md_super_wait(struct mddev *mddev) | 779 | int md_super_wait(struct mddev *mddev) |
753 | { | 780 | { |
754 | /* wait for all superblock writes that were scheduled to complete */ | 781 | /* wait for all superblock writes that were scheduled to complete */ |
755 | wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes)==0); | 782 | wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes)==0); |
783 | if (test_and_clear_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags)) | ||
784 | return -EAGAIN; | ||
785 | return 0; | ||
756 | } | 786 | } |
757 | 787 | ||
758 | int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, | 788 | int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, |
@@ -795,8 +825,8 @@ static int read_disk_sb(struct md_rdev *rdev, int size) | |||
795 | return 0; | 825 | return 0; |
796 | 826 | ||
797 | fail: | 827 | fail: |
798 | printk(KERN_WARNING "md: disabled device %s, could not read superblock.\n", | 828 | pr_err("md: disabled device %s, could not read superblock.\n", |
799 | bdevname(rdev->bdev,b)); | 829 | bdevname(rdev->bdev,b)); |
800 | return -EINVAL; | 830 | return -EINVAL; |
801 | } | 831 | } |
802 | 832 | ||
@@ -818,7 +848,6 @@ static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2) | |||
818 | 848 | ||
819 | if (!tmp1 || !tmp2) { | 849 | if (!tmp1 || !tmp2) { |
820 | ret = 0; | 850 | ret = 0; |
821 | printk(KERN_INFO "md.c sb_equal(): failed to allocate memory!\n"); | ||
822 | goto abort; | 851 | goto abort; |
823 | } | 852 | } |
824 | 853 | ||
@@ -932,7 +961,7 @@ int md_check_no_bitmap(struct mddev *mddev) | |||
932 | { | 961 | { |
933 | if (!mddev->bitmap_info.file && !mddev->bitmap_info.offset) | 962 | if (!mddev->bitmap_info.file && !mddev->bitmap_info.offset) |
934 | return 0; | 963 | return 0; |
935 | printk(KERN_ERR "%s: bitmaps are not supported for %s\n", | 964 | pr_warn("%s: bitmaps are not supported for %s\n", |
936 | mdname(mddev), mddev->pers->name); | 965 | mdname(mddev), mddev->pers->name); |
937 | return 1; | 966 | return 1; |
938 | } | 967 | } |
@@ -956,7 +985,8 @@ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor | |||
956 | rdev->sb_start = calc_dev_sboffset(rdev); | 985 | rdev->sb_start = calc_dev_sboffset(rdev); |
957 | 986 | ||
958 | ret = read_disk_sb(rdev, MD_SB_BYTES); | 987 | ret = read_disk_sb(rdev, MD_SB_BYTES); |
959 | if (ret) return ret; | 988 | if (ret) |
989 | return ret; | ||
960 | 990 | ||
961 | ret = -EINVAL; | 991 | ret = -EINVAL; |
962 | 992 | ||
@@ -964,17 +994,15 @@ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor | |||
964 | sb = page_address(rdev->sb_page); | 994 | sb = page_address(rdev->sb_page); |
965 | 995 | ||
966 | if (sb->md_magic != MD_SB_MAGIC) { | 996 | if (sb->md_magic != MD_SB_MAGIC) { |
967 | printk(KERN_ERR "md: invalid raid superblock magic on %s\n", | 997 | pr_warn("md: invalid raid superblock magic on %s\n", b); |
968 | b); | ||
969 | goto abort; | 998 | goto abort; |
970 | } | 999 | } |
971 | 1000 | ||
972 | if (sb->major_version != 0 || | 1001 | if (sb->major_version != 0 || |
973 | sb->minor_version < 90 || | 1002 | sb->minor_version < 90 || |
974 | sb->minor_version > 91) { | 1003 | sb->minor_version > 91) { |
975 | printk(KERN_WARNING "Bad version number %d.%d on %s\n", | 1004 | pr_warn("Bad version number %d.%d on %s\n", |
976 | sb->major_version, sb->minor_version, | 1005 | sb->major_version, sb->minor_version, b); |
977 | b); | ||
978 | goto abort; | 1006 | goto abort; |
979 | } | 1007 | } |
980 | 1008 | ||
@@ -982,8 +1010,7 @@ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor | |||
982 | goto abort; | 1010 | goto abort; |
983 | 1011 | ||
984 | if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum)) { | 1012 | if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum)) { |
985 | printk(KERN_WARNING "md: invalid superblock checksum on %s\n", | 1013 | pr_warn("md: invalid superblock checksum on %s\n", b); |
986 | b); | ||
987 | goto abort; | 1014 | goto abort; |
988 | } | 1015 | } |
989 | 1016 | ||
@@ -1004,14 +1031,13 @@ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor | |||
1004 | __u64 ev1, ev2; | 1031 | __u64 ev1, ev2; |
1005 | mdp_super_t *refsb = page_address(refdev->sb_page); | 1032 | mdp_super_t *refsb = page_address(refdev->sb_page); |
1006 | if (!uuid_equal(refsb, sb)) { | 1033 | if (!uuid_equal(refsb, sb)) { |
1007 | printk(KERN_WARNING "md: %s has different UUID to %s\n", | 1034 | pr_warn("md: %s has different UUID to %s\n", |
1008 | b, bdevname(refdev->bdev,b2)); | 1035 | b, bdevname(refdev->bdev,b2)); |
1009 | goto abort; | 1036 | goto abort; |
1010 | } | 1037 | } |
1011 | if (!sb_equal(refsb, sb)) { | 1038 | if (!sb_equal(refsb, sb)) { |
1012 | printk(KERN_WARNING "md: %s has same UUID" | 1039 | pr_warn("md: %s has same UUID but different superblock to %s\n", |
1013 | " but different superblock to %s\n", | 1040 | b, bdevname(refdev->bdev, b2)); |
1014 | b, bdevname(refdev->bdev, b2)); | ||
1015 | goto abort; | 1041 | goto abort; |
1016 | } | 1042 | } |
1017 | ev1 = md_event(sb); | 1043 | ev1 = md_event(sb); |
@@ -1158,6 +1184,8 @@ static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev) | |||
1158 | } | 1184 | } |
1159 | if (desc->state & (1<<MD_DISK_WRITEMOSTLY)) | 1185 | if (desc->state & (1<<MD_DISK_WRITEMOSTLY)) |
1160 | set_bit(WriteMostly, &rdev->flags); | 1186 | set_bit(WriteMostly, &rdev->flags); |
1187 | if (desc->state & (1<<MD_DISK_FAILFAST)) | ||
1188 | set_bit(FailFast, &rdev->flags); | ||
1161 | } else /* MULTIPATH are always insync */ | 1189 | } else /* MULTIPATH are always insync */ |
1162 | set_bit(In_sync, &rdev->flags); | 1190 | set_bit(In_sync, &rdev->flags); |
1163 | return 0; | 1191 | return 0; |
@@ -1283,6 +1311,8 @@ static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev) | |||
1283 | } | 1311 | } |
1284 | if (test_bit(WriteMostly, &rdev2->flags)) | 1312 | if (test_bit(WriteMostly, &rdev2->flags)) |
1285 | d->state |= (1<<MD_DISK_WRITEMOSTLY); | 1313 | d->state |= (1<<MD_DISK_WRITEMOSTLY); |
1314 | if (test_bit(FailFast, &rdev2->flags)) | ||
1315 | d->state |= (1<<MD_DISK_FAILFAST); | ||
1286 | } | 1316 | } |
1287 | /* now set the "removed" and "faulty" bits on any missing devices */ | 1317 | /* now set the "removed" and "faulty" bits on any missing devices */ |
1288 | for (i=0 ; i < mddev->raid_disks ; i++) { | 1318 | for (i=0 ; i < mddev->raid_disks ; i++) { |
@@ -1324,9 +1354,10 @@ super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) | |||
1324 | if (IS_ENABLED(CONFIG_LBDAF) && (u64)num_sectors >= (2ULL << 32) && | 1354 | if (IS_ENABLED(CONFIG_LBDAF) && (u64)num_sectors >= (2ULL << 32) && |
1325 | rdev->mddev->level >= 1) | 1355 | rdev->mddev->level >= 1) |
1326 | num_sectors = (sector_t)(2ULL << 32) - 2; | 1356 | num_sectors = (sector_t)(2ULL << 32) - 2; |
1327 | md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, | 1357 | do { |
1358 | md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, | ||
1328 | rdev->sb_page); | 1359 | rdev->sb_page); |
1329 | md_super_wait(rdev->mddev); | 1360 | } while (md_super_wait(rdev->mddev) < 0); |
1330 | return num_sectors; | 1361 | return num_sectors; |
1331 | } | 1362 | } |
1332 | 1363 | ||
@@ -1413,13 +1444,13 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_ | |||
1413 | return -EINVAL; | 1444 | return -EINVAL; |
1414 | 1445 | ||
1415 | if (calc_sb_1_csum(sb) != sb->sb_csum) { | 1446 | if (calc_sb_1_csum(sb) != sb->sb_csum) { |
1416 | printk("md: invalid superblock checksum on %s\n", | 1447 | pr_warn("md: invalid superblock checksum on %s\n", |
1417 | bdevname(rdev->bdev,b)); | 1448 | bdevname(rdev->bdev,b)); |
1418 | return -EINVAL; | 1449 | return -EINVAL; |
1419 | } | 1450 | } |
1420 | if (le64_to_cpu(sb->data_size) < 10) { | 1451 | if (le64_to_cpu(sb->data_size) < 10) { |
1421 | printk("md: data_size too small on %s\n", | 1452 | pr_warn("md: data_size too small on %s\n", |
1422 | bdevname(rdev->bdev,b)); | 1453 | bdevname(rdev->bdev,b)); |
1423 | return -EINVAL; | 1454 | return -EINVAL; |
1424 | } | 1455 | } |
1425 | if (sb->pad0 || | 1456 | if (sb->pad0 || |
@@ -1503,8 +1534,7 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_ | |||
1503 | sb->level != refsb->level || | 1534 | sb->level != refsb->level || |
1504 | sb->layout != refsb->layout || | 1535 | sb->layout != refsb->layout || |
1505 | sb->chunksize != refsb->chunksize) { | 1536 | sb->chunksize != refsb->chunksize) { |
1506 | printk(KERN_WARNING "md: %s has strangely different" | 1537 | pr_warn("md: %s has strangely different superblock to %s\n", |
1507 | " superblock to %s\n", | ||
1508 | bdevname(rdev->bdev,b), | 1538 | bdevname(rdev->bdev,b), |
1509 | bdevname(refdev->bdev,b2)); | 1539 | bdevname(refdev->bdev,b2)); |
1510 | return -EINVAL; | 1540 | return -EINVAL; |
@@ -1646,8 +1676,7 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) | |||
1646 | case MD_DISK_ROLE_JOURNAL: /* journal device */ | 1676 | case MD_DISK_ROLE_JOURNAL: /* journal device */ |
1647 | if (!(le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)) { | 1677 | if (!(le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)) { |
1648 | /* journal device without journal feature */ | 1678 | /* journal device without journal feature */ |
1649 | printk(KERN_WARNING | 1679 | pr_warn("md: journal device provided without journal feature, ignoring the device\n"); |
1650 | "md: journal device provided without journal feature, ignoring the device\n"); | ||
1651 | return -EINVAL; | 1680 | return -EINVAL; |
1652 | } | 1681 | } |
1653 | set_bit(Journal, &rdev->flags); | 1682 | set_bit(Journal, &rdev->flags); |
@@ -1669,6 +1698,8 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) | |||
1669 | } | 1698 | } |
1670 | if (sb->devflags & WriteMostly1) | 1699 | if (sb->devflags & WriteMostly1) |
1671 | set_bit(WriteMostly, &rdev->flags); | 1700 | set_bit(WriteMostly, &rdev->flags); |
1701 | if (sb->devflags & FailFast1) | ||
1702 | set_bit(FailFast, &rdev->flags); | ||
1672 | if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT) | 1703 | if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT) |
1673 | set_bit(Replacement, &rdev->flags); | 1704 | set_bit(Replacement, &rdev->flags); |
1674 | } else /* MULTIPATH are always insync */ | 1705 | } else /* MULTIPATH are always insync */ |
@@ -1707,6 +1738,10 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev) | |||
1707 | sb->chunksize = cpu_to_le32(mddev->chunk_sectors); | 1738 | sb->chunksize = cpu_to_le32(mddev->chunk_sectors); |
1708 | sb->level = cpu_to_le32(mddev->level); | 1739 | sb->level = cpu_to_le32(mddev->level); |
1709 | sb->layout = cpu_to_le32(mddev->layout); | 1740 | sb->layout = cpu_to_le32(mddev->layout); |
1741 | if (test_bit(FailFast, &rdev->flags)) | ||
1742 | sb->devflags |= FailFast1; | ||
1743 | else | ||
1744 | sb->devflags &= ~FailFast1; | ||
1710 | 1745 | ||
1711 | if (test_bit(WriteMostly, &rdev->flags)) | 1746 | if (test_bit(WriteMostly, &rdev->flags)) |
1712 | sb->devflags |= WriteMostly1; | 1747 | sb->devflags |= WriteMostly1; |
@@ -1863,9 +1898,10 @@ super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) | |||
1863 | sb->data_size = cpu_to_le64(num_sectors); | 1898 | sb->data_size = cpu_to_le64(num_sectors); |
1864 | sb->super_offset = rdev->sb_start; | 1899 | sb->super_offset = rdev->sb_start; |
1865 | sb->sb_csum = calc_sb_1_csum(sb); | 1900 | sb->sb_csum = calc_sb_1_csum(sb); |
1866 | md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, | 1901 | do { |
1867 | rdev->sb_page); | 1902 | md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, |
1868 | md_super_wait(rdev->mddev); | 1903 | rdev->sb_page); |
1904 | } while (md_super_wait(rdev->mddev) < 0); | ||
1869 | return num_sectors; | 1905 | return num_sectors; |
1870 | 1906 | ||
1871 | } | 1907 | } |
@@ -2004,9 +2040,9 @@ int md_integrity_register(struct mddev *mddev) | |||
2004 | blk_integrity_register(mddev->gendisk, | 2040 | blk_integrity_register(mddev->gendisk, |
2005 | bdev_get_integrity(reference->bdev)); | 2041 | bdev_get_integrity(reference->bdev)); |
2006 | 2042 | ||
2007 | printk(KERN_NOTICE "md: data integrity enabled on %s\n", mdname(mddev)); | 2043 | pr_debug("md: data integrity enabled on %s\n", mdname(mddev)); |
2008 | if (bioset_integrity_create(mddev->bio_set, BIO_POOL_SIZE)) { | 2044 | if (bioset_integrity_create(mddev->bio_set, BIO_POOL_SIZE)) { |
2009 | printk(KERN_ERR "md: failed to create integrity pool for %s\n", | 2045 | pr_err("md: failed to create integrity pool for %s\n", |
2010 | mdname(mddev)); | 2046 | mdname(mddev)); |
2011 | return -EINVAL; | 2047 | return -EINVAL; |
2012 | } | 2048 | } |
@@ -2034,8 +2070,8 @@ int md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev) | |||
2034 | return 0; | 2070 | return 0; |
2035 | 2071 | ||
2036 | if (blk_integrity_compare(mddev->gendisk, rdev->bdev->bd_disk) != 0) { | 2072 | if (blk_integrity_compare(mddev->gendisk, rdev->bdev->bd_disk) != 0) { |
2037 | printk(KERN_NOTICE "%s: incompatible integrity profile for %s\n", | 2073 | pr_err("%s: incompatible integrity profile for %s\n", |
2038 | mdname(mddev), bdevname(rdev->bdev, name)); | 2074 | mdname(mddev), bdevname(rdev->bdev, name)); |
2039 | return -ENXIO; | 2075 | return -ENXIO; |
2040 | } | 2076 | } |
2041 | 2077 | ||
@@ -2089,15 +2125,15 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev) | |||
2089 | rcu_read_unlock(); | 2125 | rcu_read_unlock(); |
2090 | if (!test_bit(Journal, &rdev->flags) && | 2126 | if (!test_bit(Journal, &rdev->flags) && |
2091 | mddev->max_disks && rdev->desc_nr >= mddev->max_disks) { | 2127 | mddev->max_disks && rdev->desc_nr >= mddev->max_disks) { |
2092 | printk(KERN_WARNING "md: %s: array is limited to %d devices\n", | 2128 | pr_warn("md: %s: array is limited to %d devices\n", |
2093 | mdname(mddev), mddev->max_disks); | 2129 | mdname(mddev), mddev->max_disks); |
2094 | return -EBUSY; | 2130 | return -EBUSY; |
2095 | } | 2131 | } |
2096 | bdevname(rdev->bdev,b); | 2132 | bdevname(rdev->bdev,b); |
2097 | strreplace(b, '/', '!'); | 2133 | strreplace(b, '/', '!'); |
2098 | 2134 | ||
2099 | rdev->mddev = mddev; | 2135 | rdev->mddev = mddev; |
2100 | printk(KERN_INFO "md: bind<%s>\n", b); | 2136 | pr_debug("md: bind<%s>\n", b); |
2101 | 2137 | ||
2102 | if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b))) | 2138 | if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b))) |
2103 | goto fail; | 2139 | goto fail; |
@@ -2116,8 +2152,8 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev) | |||
2116 | return 0; | 2152 | return 0; |
2117 | 2153 | ||
2118 | fail: | 2154 | fail: |
2119 | printk(KERN_WARNING "md: failed to register dev-%s for %s\n", | 2155 | pr_warn("md: failed to register dev-%s for %s\n", |
2120 | b, mdname(mddev)); | 2156 | b, mdname(mddev)); |
2121 | return err; | 2157 | return err; |
2122 | } | 2158 | } |
2123 | 2159 | ||
@@ -2134,7 +2170,7 @@ static void unbind_rdev_from_array(struct md_rdev *rdev) | |||
2134 | 2170 | ||
2135 | bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk); | 2171 | bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk); |
2136 | list_del_rcu(&rdev->same_set); | 2172 | list_del_rcu(&rdev->same_set); |
2137 | printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b)); | 2173 | pr_debug("md: unbind<%s>\n", bdevname(rdev->bdev,b)); |
2138 | rdev->mddev = NULL; | 2174 | rdev->mddev = NULL; |
2139 | sysfs_remove_link(&rdev->kobj, "block"); | 2175 | sysfs_remove_link(&rdev->kobj, "block"); |
2140 | sysfs_put(rdev->sysfs_state); | 2176 | sysfs_put(rdev->sysfs_state); |
@@ -2164,8 +2200,7 @@ static int lock_rdev(struct md_rdev *rdev, dev_t dev, int shared) | |||
2164 | bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, | 2200 | bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, |
2165 | shared ? (struct md_rdev *)lock_rdev : rdev); | 2201 | shared ? (struct md_rdev *)lock_rdev : rdev); |
2166 | if (IS_ERR(bdev)) { | 2202 | if (IS_ERR(bdev)) { |
2167 | printk(KERN_ERR "md: could not open %s.\n", | 2203 | pr_warn("md: could not open %s.\n", __bdevname(dev, b)); |
2168 | __bdevname(dev, b)); | ||
2169 | return PTR_ERR(bdev); | 2204 | return PTR_ERR(bdev); |
2170 | } | 2205 | } |
2171 | rdev->bdev = bdev; | 2206 | rdev->bdev = bdev; |
@@ -2185,8 +2220,7 @@ static void export_rdev(struct md_rdev *rdev) | |||
2185 | { | 2220 | { |
2186 | char b[BDEVNAME_SIZE]; | 2221 | char b[BDEVNAME_SIZE]; |
2187 | 2222 | ||
2188 | printk(KERN_INFO "md: export_rdev(%s)\n", | 2223 | pr_debug("md: export_rdev(%s)\n", bdevname(rdev->bdev,b)); |
2189 | bdevname(rdev->bdev,b)); | ||
2190 | md_rdev_clear(rdev); | 2224 | md_rdev_clear(rdev); |
2191 | #ifndef MODULE | 2225 | #ifndef MODULE |
2192 | if (test_bit(AutoDetected, &rdev->flags)) | 2226 | if (test_bit(AutoDetected, &rdev->flags)) |
@@ -2288,24 +2322,24 @@ void md_update_sb(struct mddev *mddev, int force_change) | |||
2288 | 2322 | ||
2289 | if (mddev->ro) { | 2323 | if (mddev->ro) { |
2290 | if (force_change) | 2324 | if (force_change) |
2291 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | 2325 | set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); |
2292 | return; | 2326 | return; |
2293 | } | 2327 | } |
2294 | 2328 | ||
2295 | repeat: | 2329 | repeat: |
2296 | if (mddev_is_clustered(mddev)) { | 2330 | if (mddev_is_clustered(mddev)) { |
2297 | if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags)) | 2331 | if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)) |
2298 | force_change = 1; | 2332 | force_change = 1; |
2299 | if (test_and_clear_bit(MD_CHANGE_CLEAN, &mddev->flags)) | 2333 | if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags)) |
2300 | nospares = 1; | 2334 | nospares = 1; |
2301 | ret = md_cluster_ops->metadata_update_start(mddev); | 2335 | ret = md_cluster_ops->metadata_update_start(mddev); |
2302 | /* Has someone else has updated the sb */ | 2336 | /* Has someone else has updated the sb */ |
2303 | if (!does_sb_need_changing(mddev)) { | 2337 | if (!does_sb_need_changing(mddev)) { |
2304 | if (ret == 0) | 2338 | if (ret == 0) |
2305 | md_cluster_ops->metadata_update_cancel(mddev); | 2339 | md_cluster_ops->metadata_update_cancel(mddev); |
2306 | bit_clear_unless(&mddev->flags, BIT(MD_CHANGE_PENDING), | 2340 | bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING), |
2307 | BIT(MD_CHANGE_DEVS) | | 2341 | BIT(MD_SB_CHANGE_DEVS) | |
2308 | BIT(MD_CHANGE_CLEAN)); | 2342 | BIT(MD_SB_CHANGE_CLEAN)); |
2309 | return; | 2343 | return; |
2310 | } | 2344 | } |
2311 | } | 2345 | } |
@@ -2321,10 +2355,10 @@ repeat: | |||
2321 | 2355 | ||
2322 | } | 2356 | } |
2323 | if (!mddev->persistent) { | 2357 | if (!mddev->persistent) { |
2324 | clear_bit(MD_CHANGE_CLEAN, &mddev->flags); | 2358 | clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); |
2325 | clear_bit(MD_CHANGE_DEVS, &mddev->flags); | 2359 | clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); |
2326 | if (!mddev->external) { | 2360 | if (!mddev->external) { |
2327 | clear_bit(MD_CHANGE_PENDING, &mddev->flags); | 2361 | clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); |
2328 | rdev_for_each(rdev, mddev) { | 2362 | rdev_for_each(rdev, mddev) { |
2329 | if (rdev->badblocks.changed) { | 2363 | if (rdev->badblocks.changed) { |
2330 | rdev->badblocks.changed = 0; | 2364 | rdev->badblocks.changed = 0; |
@@ -2344,9 +2378,9 @@ repeat: | |||
2344 | 2378 | ||
2345 | mddev->utime = ktime_get_real_seconds(); | 2379 | mddev->utime = ktime_get_real_seconds(); |
2346 | 2380 | ||
2347 | if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags)) | 2381 | if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)) |
2348 | force_change = 1; | 2382 | force_change = 1; |
2349 | if (test_and_clear_bit(MD_CHANGE_CLEAN, &mddev->flags)) | 2383 | if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags)) |
2350 | /* just a clean<-> dirty transition, possibly leave spares alone, | 2384 | /* just a clean<-> dirty transition, possibly leave spares alone, |
2351 | * though if events isn't the right even/odd, we will have to do | 2385 | * though if events isn't the right even/odd, we will have to do |
2352 | * spares after all | 2386 | * spares after all |
@@ -2402,6 +2436,9 @@ repeat: | |||
2402 | pr_debug("md: updating %s RAID superblock on device (in sync %d)\n", | 2436 | pr_debug("md: updating %s RAID superblock on device (in sync %d)\n", |
2403 | mdname(mddev), mddev->in_sync); | 2437 | mdname(mddev), mddev->in_sync); |
2404 | 2438 | ||
2439 | if (mddev->queue) | ||
2440 | blk_add_trace_msg(mddev->queue, "md md_update_sb"); | ||
2441 | rewrite: | ||
2405 | bitmap_update_sb(mddev->bitmap); | 2442 | bitmap_update_sb(mddev->bitmap); |
2406 | rdev_for_each(rdev, mddev) { | 2443 | rdev_for_each(rdev, mddev) { |
2407 | char b[BDEVNAME_SIZE]; | 2444 | char b[BDEVNAME_SIZE]; |
@@ -2433,15 +2470,16 @@ repeat: | |||
2433 | /* only need to write one superblock... */ | 2470 | /* only need to write one superblock... */ |
2434 | break; | 2471 | break; |
2435 | } | 2472 | } |
2436 | md_super_wait(mddev); | 2473 | if (md_super_wait(mddev) < 0) |
2437 | /* if there was a failure, MD_CHANGE_DEVS was set, and we re-write super */ | 2474 | goto rewrite; |
2475 | /* if there was a failure, MD_SB_CHANGE_DEVS was set, and we re-write super */ | ||
2438 | 2476 | ||
2439 | if (mddev_is_clustered(mddev) && ret == 0) | 2477 | if (mddev_is_clustered(mddev) && ret == 0) |
2440 | md_cluster_ops->metadata_update_finish(mddev); | 2478 | md_cluster_ops->metadata_update_finish(mddev); |
2441 | 2479 | ||
2442 | if (mddev->in_sync != sync_req || | 2480 | if (mddev->in_sync != sync_req || |
2443 | !bit_clear_unless(&mddev->flags, BIT(MD_CHANGE_PENDING), | 2481 | !bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING), |
2444 | BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_CLEAN))) | 2482 | BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_CLEAN))) |
2445 | /* have to write it out again */ | 2483 | /* have to write it out again */ |
2446 | goto repeat; | 2484 | goto repeat; |
2447 | wake_up(&mddev->sb_wait); | 2485 | wake_up(&mddev->sb_wait); |
@@ -2485,7 +2523,7 @@ static int add_bound_rdev(struct md_rdev *rdev) | |||
2485 | } | 2523 | } |
2486 | sysfs_notify_dirent_safe(rdev->sysfs_state); | 2524 | sysfs_notify_dirent_safe(rdev->sysfs_state); |
2487 | 2525 | ||
2488 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | 2526 | set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); |
2489 | if (mddev->degraded) | 2527 | if (mddev->degraded) |
2490 | set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); | 2528 | set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); |
2491 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | 2529 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); |
@@ -2523,51 +2561,41 @@ struct rdev_sysfs_entry { | |||
2523 | static ssize_t | 2561 | static ssize_t |
2524 | state_show(struct md_rdev *rdev, char *page) | 2562 | state_show(struct md_rdev *rdev, char *page) |
2525 | { | 2563 | { |
2526 | char *sep = ""; | 2564 | char *sep = ","; |
2527 | size_t len = 0; | 2565 | size_t len = 0; |
2528 | unsigned long flags = ACCESS_ONCE(rdev->flags); | 2566 | unsigned long flags = ACCESS_ONCE(rdev->flags); |
2529 | 2567 | ||
2530 | if (test_bit(Faulty, &flags) || | 2568 | if (test_bit(Faulty, &flags) || |
2531 | rdev->badblocks.unacked_exist) { | 2569 | (!test_bit(ExternalBbl, &flags) && |
2532 | len+= sprintf(page+len, "%sfaulty",sep); | 2570 | rdev->badblocks.unacked_exist)) |
2533 | sep = ","; | 2571 | len += sprintf(page+len, "faulty%s", sep); |
2534 | } | 2572 | if (test_bit(In_sync, &flags)) |
2535 | if (test_bit(In_sync, &flags)) { | 2573 | len += sprintf(page+len, "in_sync%s", sep); |
2536 | len += sprintf(page+len, "%sin_sync",sep); | 2574 | if (test_bit(Journal, &flags)) |
2537 | sep = ","; | 2575 | len += sprintf(page+len, "journal%s", sep); |
2538 | } | 2576 | if (test_bit(WriteMostly, &flags)) |
2539 | if (test_bit(Journal, &flags)) { | 2577 | len += sprintf(page+len, "write_mostly%s", sep); |
2540 | len += sprintf(page+len, "%sjournal",sep); | ||
2541 | sep = ","; | ||
2542 | } | ||
2543 | if (test_bit(WriteMostly, &flags)) { | ||
2544 | len += sprintf(page+len, "%swrite_mostly",sep); | ||
2545 | sep = ","; | ||
2546 | } | ||
2547 | if (test_bit(Blocked, &flags) || | 2578 | if (test_bit(Blocked, &flags) || |
2548 | (rdev->badblocks.unacked_exist | 2579 | (rdev->badblocks.unacked_exist |
2549 | && !test_bit(Faulty, &flags))) { | 2580 | && !test_bit(Faulty, &flags))) |
2550 | len += sprintf(page+len, "%sblocked", sep); | 2581 | len += sprintf(page+len, "blocked%s", sep); |
2551 | sep = ","; | ||
2552 | } | ||
2553 | if (!test_bit(Faulty, &flags) && | 2582 | if (!test_bit(Faulty, &flags) && |
2554 | !test_bit(Journal, &flags) && | 2583 | !test_bit(Journal, &flags) && |
2555 | !test_bit(In_sync, &flags)) { | 2584 | !test_bit(In_sync, &flags)) |
2556 | len += sprintf(page+len, "%sspare", sep); | 2585 | len += sprintf(page+len, "spare%s", sep); |
2557 | sep = ","; | 2586 | if (test_bit(WriteErrorSeen, &flags)) |
2558 | } | 2587 | len += sprintf(page+len, "write_error%s", sep); |
2559 | if (test_bit(WriteErrorSeen, &flags)) { | 2588 | if (test_bit(WantReplacement, &flags)) |
2560 | len += sprintf(page+len, "%swrite_error", sep); | 2589 | len += sprintf(page+len, "want_replacement%s", sep); |
2561 | sep = ","; | 2590 | if (test_bit(Replacement, &flags)) |
2562 | } | 2591 | len += sprintf(page+len, "replacement%s", sep); |
2563 | if (test_bit(WantReplacement, &flags)) { | 2592 | if (test_bit(ExternalBbl, &flags)) |
2564 | len += sprintf(page+len, "%swant_replacement", sep); | 2593 | len += sprintf(page+len, "external_bbl%s", sep); |
2565 | sep = ","; | 2594 | if (test_bit(FailFast, &flags)) |
2566 | } | 2595 | len += sprintf(page+len, "failfast%s", sep); |
2567 | if (test_bit(Replacement, &flags)) { | 2596 | |
2568 | len += sprintf(page+len, "%sreplacement", sep); | 2597 | if (len) |
2569 | sep = ","; | 2598 | len -= strlen(sep); |
2570 | } | ||
2571 | 2599 | ||
2572 | return len+sprintf(page+len, "\n"); | 2600 | return len+sprintf(page+len, "\n"); |
2573 | } | 2601 | } |
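The state_show() rewrite above drops the per-flag "sep" bookkeeping: every flag name is printed with a trailing separator, and one separator length is trimmed at the end before the newline is appended. A small stand-alone sketch of that join pattern, with made-up flag bits rather than the real rdev flags:

    #include <stdio.h>
    #include <string.h>

    /* Append "name," for every set flag, then drop the final separator. */
    static int show_flags(char *page, unsigned long flags)
    {
        const char *sep = ",";
        int len = 0;

        if (flags & 0x1)
            len += sprintf(page + len, "faulty%s", sep);
        if (flags & 0x2)
            len += sprintf(page + len, "in_sync%s", sep);
        if (flags & 0x4)
            len += sprintf(page + len, "write_mostly%s", sep);

        if (len)                    /* strip the trailing "," */
            len -= strlen(sep);
        return len + sprintf(page + len, "\n");
    }

    int main(void)
    {
        char page[64];

        show_flags(page, 0x5);
        fputs(page, stdout);        /* prints "faulty,write_mostly" */
        return 0;
    }

The newline written at the trimmed offset overwrites the last separator, so no extra memmove is needed.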
@@ -2587,6 +2615,7 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len) | |||
2587 | * so that it gets rebuilt based on bitmap | 2615 | * so that it gets rebuilt based on bitmap |
2588 | * write_error - sets WriteErrorSeen | 2616 | * write_error - sets WriteErrorSeen |
2589 | * -write_error - clears WriteErrorSeen | 2617 | * -write_error - clears WriteErrorSeen |
2618 | * {,-}failfast - set/clear FailFast | ||
2590 | */ | 2619 | */ |
2591 | int err = -EINVAL; | 2620 | int err = -EINVAL; |
2592 | if (cmd_match(buf, "faulty") && rdev->mddev->pers) { | 2621 | if (cmd_match(buf, "faulty") && rdev->mddev->pers) { |
@@ -2610,8 +2639,10 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len) | |||
2610 | 2639 | ||
2611 | if (err == 0) { | 2640 | if (err == 0) { |
2612 | md_kick_rdev_from_array(rdev); | 2641 | md_kick_rdev_from_array(rdev); |
2613 | if (mddev->pers) | 2642 | if (mddev->pers) { |
2614 | md_update_sb(mddev, 1); | 2643 | set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); |
2644 | md_wakeup_thread(mddev->thread); | ||
2645 | } | ||
2615 | md_new_event(mddev); | 2646 | md_new_event(mddev); |
2616 | } | 2647 | } |
2617 | } | 2648 | } |
@@ -2626,6 +2657,7 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len) | |||
2626 | err = 0; | 2657 | err = 0; |
2627 | } else if (cmd_match(buf, "-blocked")) { | 2658 | } else if (cmd_match(buf, "-blocked")) { |
2628 | if (!test_bit(Faulty, &rdev->flags) && | 2659 | if (!test_bit(Faulty, &rdev->flags) && |
2660 | !test_bit(ExternalBbl, &rdev->flags) && | ||
2629 | rdev->badblocks.unacked_exist) { | 2661 | rdev->badblocks.unacked_exist) { |
2630 | /* metadata handler doesn't understand badblocks, | 2662 | /* metadata handler doesn't understand badblocks, |
2631 | * so we need to fail the device | 2663 | * so we need to fail the device |
@@ -2642,6 +2674,12 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len) | |||
2642 | } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) { | 2674 | } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) { |
2643 | set_bit(In_sync, &rdev->flags); | 2675 | set_bit(In_sync, &rdev->flags); |
2644 | err = 0; | 2676 | err = 0; |
2677 | } else if (cmd_match(buf, "failfast")) { | ||
2678 | set_bit(FailFast, &rdev->flags); | ||
2679 | err = 0; | ||
2680 | } else if (cmd_match(buf, "-failfast")) { | ||
2681 | clear_bit(FailFast, &rdev->flags); | ||
2682 | err = 0; | ||
2645 | } else if (cmd_match(buf, "-insync") && rdev->raid_disk >= 0 && | 2683 | } else if (cmd_match(buf, "-insync") && rdev->raid_disk >= 0 && |
2646 | !test_bit(Journal, &rdev->flags)) { | 2684 | !test_bit(Journal, &rdev->flags)) { |
2647 | if (rdev->mddev->pers == NULL) { | 2685 | if (rdev->mddev->pers == NULL) { |
@@ -2708,6 +2746,13 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len) | |||
2708 | } | 2746 | } |
2709 | } else | 2747 | } else |
2710 | err = -EBUSY; | 2748 | err = -EBUSY; |
2749 | } else if (cmd_match(buf, "external_bbl") && (rdev->mddev->external)) { | ||
2750 | set_bit(ExternalBbl, &rdev->flags); | ||
2751 | rdev->badblocks.shift = 0; | ||
2752 | err = 0; | ||
2753 | } else if (cmd_match(buf, "-external_bbl") && (rdev->mddev->external)) { | ||
2754 | clear_bit(ExternalBbl, &rdev->flags); | ||
2755 | err = 0; | ||
2711 | } | 2756 | } |
2712 | if (!err) | 2757 | if (!err) |
2713 | sysfs_notify_dirent_safe(rdev->sysfs_state); | 2758 | sysfs_notify_dirent_safe(rdev->sysfs_state); |
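The new "failfast"/"-failfast" and "external_bbl"/"-external_bbl" cases above follow the existing state_store() shape: match the written token and set or clear the corresponding rdev flag, with the "-" prefix undoing what the bare name does. A simplified sketch of that dispatch in plain C; cmd_match() here is a stand-in for the kernel helper (which also tolerates a trailing newline), and the flag value is invented.

    #include <stdbool.h>
    #include <stdio.h>
    #include <string.h>

    /* Simplified stand-in for the kernel's cmd_match(). */
    static bool cmd_match(const char *cmd, const char *str)
    {
        size_t n = strlen(str);

        return strncmp(cmd, str, n) == 0 &&
               (cmd[n] == '\0' || cmd[n] == '\n');
    }

    static int store_state(const char *buf, unsigned long *flags)
    {
        const unsigned long FAILFAST = 0x1;

        if (cmd_match(buf, "failfast"))
            *flags |= FAILFAST;
        else if (cmd_match(buf, "-failfast"))
            *flags &= ~FAILFAST;
        else
            return -1;              /* -EINVAL in the kernel */
        return 0;
    }

    int main(void)
    {
        unsigned long flags = 0;

        store_state("failfast\n", &flags);
        printf("flags=%#lx\n", flags);   /* flags=0x1 */
        store_state("-failfast", &flags);
        printf("flags=%#lx\n", flags);   /* flags=0 */
        return 0;
    }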
@@ -3211,10 +3256,8 @@ static struct md_rdev *md_import_device(dev_t newdev, int super_format, int supe | |||
3211 | sector_t size; | 3256 | sector_t size; |
3212 | 3257 | ||
3213 | rdev = kzalloc(sizeof(*rdev), GFP_KERNEL); | 3258 | rdev = kzalloc(sizeof(*rdev), GFP_KERNEL); |
3214 | if (!rdev) { | 3259 | if (!rdev) |
3215 | printk(KERN_ERR "md: could not alloc mem for new device!\n"); | ||
3216 | return ERR_PTR(-ENOMEM); | 3260 | return ERR_PTR(-ENOMEM); |
3217 | } | ||
3218 | 3261 | ||
3219 | err = md_rdev_init(rdev); | 3262 | err = md_rdev_init(rdev); |
3220 | if (err) | 3263 | if (err) |
@@ -3231,8 +3274,7 @@ static struct md_rdev *md_import_device(dev_t newdev, int super_format, int supe | |||
3231 | 3274 | ||
3232 | size = i_size_read(rdev->bdev->bd_inode) >> BLOCK_SIZE_BITS; | 3275 | size = i_size_read(rdev->bdev->bd_inode) >> BLOCK_SIZE_BITS; |
3233 | if (!size) { | 3276 | if (!size) { |
3234 | printk(KERN_WARNING | 3277 | pr_warn("md: %s has zero or unknown size, marking faulty!\n", |
3235 | "md: %s has zero or unknown size, marking faulty!\n", | ||
3236 | bdevname(rdev->bdev,b)); | 3278 | bdevname(rdev->bdev,b)); |
3237 | err = -EINVAL; | 3279 | err = -EINVAL; |
3238 | goto abort_free; | 3280 | goto abort_free; |
@@ -3242,16 +3284,13 @@ static struct md_rdev *md_import_device(dev_t newdev, int super_format, int supe | |||
3242 | err = super_types[super_format]. | 3284 | err = super_types[super_format]. |
3243 | load_super(rdev, NULL, super_minor); | 3285 | load_super(rdev, NULL, super_minor); |
3244 | if (err == -EINVAL) { | 3286 | if (err == -EINVAL) { |
3245 | printk(KERN_WARNING | 3287 | pr_warn("md: %s does not have a valid v%d.%d superblock, not importing!\n", |
3246 | "md: %s does not have a valid v%d.%d " | ||
3247 | "superblock, not importing!\n", | ||
3248 | bdevname(rdev->bdev,b), | 3288 | bdevname(rdev->bdev,b), |
3249 | super_format, super_minor); | 3289 | super_format, super_minor); |
3250 | goto abort_free; | 3290 | goto abort_free; |
3251 | } | 3291 | } |
3252 | if (err < 0) { | 3292 | if (err < 0) { |
3253 | printk(KERN_WARNING | 3293 | pr_warn("md: could not read %s's sb, not importing!\n", |
3254 | "md: could not read %s's sb, not importing!\n", | ||
3255 | bdevname(rdev->bdev,b)); | 3294 | bdevname(rdev->bdev,b)); |
3256 | goto abort_free; | 3295 | goto abort_free; |
3257 | } | 3296 | } |
@@ -3287,9 +3326,7 @@ static void analyze_sbs(struct mddev *mddev) | |||
3287 | case 0: | 3326 | case 0: |
3288 | break; | 3327 | break; |
3289 | default: | 3328 | default: |
3290 | printk( KERN_ERR \ | 3329 | pr_warn("md: fatal superblock inconsistency in %s -- removing from array\n", |
3291 | "md: fatal superblock inconsistency in %s" | ||
3292 | " -- removing from array\n", | ||
3293 | bdevname(rdev->bdev,b)); | 3330 | bdevname(rdev->bdev,b)); |
3294 | md_kick_rdev_from_array(rdev); | 3331 | md_kick_rdev_from_array(rdev); |
3295 | } | 3332 | } |
@@ -3302,18 +3339,16 @@ static void analyze_sbs(struct mddev *mddev) | |||
3302 | if (mddev->max_disks && | 3339 | if (mddev->max_disks && |
3303 | (rdev->desc_nr >= mddev->max_disks || | 3340 | (rdev->desc_nr >= mddev->max_disks || |
3304 | i > mddev->max_disks)) { | 3341 | i > mddev->max_disks)) { |
3305 | printk(KERN_WARNING | 3342 | pr_warn("md: %s: %s: only %d devices permitted\n", |
3306 | "md: %s: %s: only %d devices permitted\n", | 3343 | mdname(mddev), bdevname(rdev->bdev, b), |
3307 | mdname(mddev), bdevname(rdev->bdev, b), | 3344 | mddev->max_disks); |
3308 | mddev->max_disks); | ||
3309 | md_kick_rdev_from_array(rdev); | 3345 | md_kick_rdev_from_array(rdev); |
3310 | continue; | 3346 | continue; |
3311 | } | 3347 | } |
3312 | if (rdev != freshest) { | 3348 | if (rdev != freshest) { |
3313 | if (super_types[mddev->major_version]. | 3349 | if (super_types[mddev->major_version]. |
3314 | validate_super(mddev, rdev)) { | 3350 | validate_super(mddev, rdev)) { |
3315 | printk(KERN_WARNING "md: kicking non-fresh %s" | 3351 | pr_warn("md: kicking non-fresh %s from array!\n", |
3316 | " from array!\n", | ||
3317 | bdevname(rdev->bdev,b)); | 3352 | bdevname(rdev->bdev,b)); |
3318 | md_kick_rdev_from_array(rdev); | 3353 | md_kick_rdev_from_array(rdev); |
3319 | continue; | 3354 | continue; |
@@ -3384,7 +3419,7 @@ safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len) | |||
3384 | unsigned long msec; | 3419 | unsigned long msec; |
3385 | 3420 | ||
3386 | if (mddev_is_clustered(mddev)) { | 3421 | if (mddev_is_clustered(mddev)) { |
3387 | pr_info("md: Safemode is disabled for clustered mode\n"); | 3422 | pr_warn("md: Safemode is disabled for clustered mode\n"); |
3388 | return -EINVAL; | 3423 | return -EINVAL; |
3389 | } | 3424 | } |
3390 | 3425 | ||
@@ -3472,8 +3507,8 @@ level_store(struct mddev *mddev, const char *buf, size_t len) | |||
3472 | 3507 | ||
3473 | rv = -EINVAL; | 3508 | rv = -EINVAL; |
3474 | if (!mddev->pers->quiesce) { | 3509 | if (!mddev->pers->quiesce) { |
3475 | printk(KERN_WARNING "md: %s: %s does not support online personality change\n", | 3510 | pr_warn("md: %s: %s does not support online personality change\n", |
3476 | mdname(mddev), mddev->pers->name); | 3511 | mdname(mddev), mddev->pers->name); |
3477 | goto out_unlock; | 3512 | goto out_unlock; |
3478 | } | 3513 | } |
3479 | 3514 | ||
@@ -3491,7 +3526,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len) | |||
3491 | pers = find_pers(level, clevel); | 3526 | pers = find_pers(level, clevel); |
3492 | if (!pers || !try_module_get(pers->owner)) { | 3527 | if (!pers || !try_module_get(pers->owner)) { |
3493 | spin_unlock(&pers_lock); | 3528 | spin_unlock(&pers_lock); |
3494 | printk(KERN_WARNING "md: personality %s not loaded\n", clevel); | 3529 | pr_warn("md: personality %s not loaded\n", clevel); |
3495 | rv = -EINVAL; | 3530 | rv = -EINVAL; |
3496 | goto out_unlock; | 3531 | goto out_unlock; |
3497 | } | 3532 | } |
@@ -3505,8 +3540,8 @@ level_store(struct mddev *mddev, const char *buf, size_t len) | |||
3505 | } | 3540 | } |
3506 | if (!pers->takeover) { | 3541 | if (!pers->takeover) { |
3507 | module_put(pers->owner); | 3542 | module_put(pers->owner); |
3508 | printk(KERN_WARNING "md: %s: %s does not support personality takeover\n", | 3543 | pr_warn("md: %s: %s does not support personality takeover\n", |
3509 | mdname(mddev), clevel); | 3544 | mdname(mddev), clevel); |
3510 | rv = -EINVAL; | 3545 | rv = -EINVAL; |
3511 | goto out_unlock; | 3546 | goto out_unlock; |
3512 | } | 3547 | } |
@@ -3526,8 +3561,8 @@ level_store(struct mddev *mddev, const char *buf, size_t len) | |||
3526 | mddev->delta_disks = 0; | 3561 | mddev->delta_disks = 0; |
3527 | mddev->reshape_backwards = 0; | 3562 | mddev->reshape_backwards = 0; |
3528 | module_put(pers->owner); | 3563 | module_put(pers->owner); |
3529 | printk(KERN_WARNING "md: %s: %s would not accept array\n", | 3564 | pr_warn("md: %s: %s would not accept array\n", |
3530 | mdname(mddev), clevel); | 3565 | mdname(mddev), clevel); |
3531 | rv = PTR_ERR(priv); | 3566 | rv = PTR_ERR(priv); |
3532 | goto out_unlock; | 3567 | goto out_unlock; |
3533 | } | 3568 | } |
@@ -3570,9 +3605,8 @@ level_store(struct mddev *mddev, const char *buf, size_t len) | |||
3570 | pers->sync_request != NULL) { | 3605 | pers->sync_request != NULL) { |
3571 | /* need to add the md_redundancy_group */ | 3606 | /* need to add the md_redundancy_group */ |
3572 | if (sysfs_create_group(&mddev->kobj, &md_redundancy_group)) | 3607 | if (sysfs_create_group(&mddev->kobj, &md_redundancy_group)) |
3573 | printk(KERN_WARNING | 3608 | pr_warn("md: cannot register extra attributes for %s\n", |
3574 | "md: cannot register extra attributes for %s\n", | 3609 | mdname(mddev)); |
3575 | mdname(mddev)); | ||
3576 | mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action"); | 3610 | mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action"); |
3577 | } | 3611 | } |
3578 | if (oldpers->sync_request != NULL && | 3612 | if (oldpers->sync_request != NULL && |
@@ -3603,9 +3637,8 @@ level_store(struct mddev *mddev, const char *buf, size_t len) | |||
3603 | clear_bit(In_sync, &rdev->flags); | 3637 | clear_bit(In_sync, &rdev->flags); |
3604 | else { | 3638 | else { |
3605 | if (sysfs_link_rdev(mddev, rdev)) | 3639 | if (sysfs_link_rdev(mddev, rdev)) |
3606 | printk(KERN_WARNING "md: cannot register rd%d" | 3640 | pr_warn("md: cannot register rd%d for %s after level change\n", |
3607 | " for %s after level change\n", | 3641 | rdev->raid_disk, mdname(mddev)); |
3608 | rdev->raid_disk, mdname(mddev)); | ||
3609 | } | 3642 | } |
3610 | } | 3643 | } |
3611 | 3644 | ||
@@ -3618,7 +3651,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len) | |||
3618 | } | 3651 | } |
3619 | blk_set_stacking_limits(&mddev->queue->limits); | 3652 | blk_set_stacking_limits(&mddev->queue->limits); |
3620 | pers->run(mddev); | 3653 | pers->run(mddev); |
3621 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | 3654 | set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); |
3622 | mddev_resume(mddev); | 3655 | mddev_resume(mddev); |
3623 | if (!mddev->thread) | 3656 | if (!mddev->thread) |
3624 | md_update_sb(mddev, 1); | 3657 | md_update_sb(mddev, 1); |
@@ -3813,7 +3846,7 @@ resync_start_store(struct mddev *mddev, const char *buf, size_t len) | |||
3813 | if (!err) { | 3846 | if (!err) { |
3814 | mddev->recovery_cp = n; | 3847 | mddev->recovery_cp = n; |
3815 | if (mddev->pers) | 3848 | if (mddev->pers) |
3816 | set_bit(MD_CHANGE_CLEAN, &mddev->flags); | 3849 | set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); |
3817 | } | 3850 | } |
3818 | mddev_unlock(mddev); | 3851 | mddev_unlock(mddev); |
3819 | return err ?: len; | 3852 | return err ?: len; |
@@ -3887,7 +3920,7 @@ array_state_show(struct mddev *mddev, char *page) | |||
3887 | st = read_auto; | 3920 | st = read_auto; |
3888 | break; | 3921 | break; |
3889 | case 0: | 3922 | case 0: |
3890 | if (test_bit(MD_CHANGE_PENDING, &mddev->flags)) | 3923 | if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) |
3891 | st = write_pending; | 3924 | st = write_pending; |
3892 | else if (mddev->in_sync) | 3925 | else if (mddev->in_sync) |
3893 | st = clean; | 3926 | st = clean; |
@@ -3925,7 +3958,8 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len) | |||
3925 | spin_lock(&mddev->lock); | 3958 | spin_lock(&mddev->lock); |
3926 | if (st == active) { | 3959 | if (st == active) { |
3927 | restart_array(mddev); | 3960 | restart_array(mddev); |
3928 | clear_bit(MD_CHANGE_PENDING, &mddev->flags); | 3961 | clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); |
3962 | md_wakeup_thread(mddev->thread); | ||
3929 | wake_up(&mddev->sb_wait); | 3963 | wake_up(&mddev->sb_wait); |
3930 | err = 0; | 3964 | err = 0; |
3931 | } else /* st == clean */ { | 3965 | } else /* st == clean */ { |
@@ -3935,7 +3969,7 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len) | |||
3935 | mddev->in_sync = 1; | 3969 | mddev->in_sync = 1; |
3936 | if (mddev->safemode == 1) | 3970 | if (mddev->safemode == 1) |
3937 | mddev->safemode = 0; | 3971 | mddev->safemode = 0; |
3938 | set_bit(MD_CHANGE_CLEAN, &mddev->flags); | 3972 | set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); |
3939 | } | 3973 | } |
3940 | err = 0; | 3974 | err = 0; |
3941 | } else | 3975 | } else |
@@ -4001,7 +4035,7 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len) | |||
4001 | mddev->in_sync = 1; | 4035 | mddev->in_sync = 1; |
4002 | if (mddev->safemode == 1) | 4036 | if (mddev->safemode == 1) |
4003 | mddev->safemode = 0; | 4037 | mddev->safemode = 0; |
4004 | set_bit(MD_CHANGE_CLEAN, &mddev->flags); | 4038 | set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); |
4005 | } | 4039 | } |
4006 | err = 0; | 4040 | err = 0; |
4007 | } else | 4041 | } else |
@@ -4015,7 +4049,7 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len) | |||
4015 | err = restart_array(mddev); | 4049 | err = restart_array(mddev); |
4016 | if (err) | 4050 | if (err) |
4017 | break; | 4051 | break; |
4018 | clear_bit(MD_CHANGE_PENDING, &mddev->flags); | 4052 | clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); |
4019 | wake_up(&mddev->sb_wait); | 4053 | wake_up(&mddev->sb_wait); |
4020 | err = 0; | 4054 | err = 0; |
4021 | } else { | 4055 | } else { |
@@ -5071,13 +5105,13 @@ static int md_alloc(dev_t dev, char *name) | |||
5071 | /* This isn't possible, but as kobject_init_and_add is marked | 5105 | /* This isn't possible, but as kobject_init_and_add is marked |
5072 | * __must_check, we must do something with the result | 5106 | * __must_check, we must do something with the result |
5073 | */ | 5107 | */ |
5074 | printk(KERN_WARNING "md: cannot register %s/md - name in use\n", | 5108 | pr_debug("md: cannot register %s/md - name in use\n", |
5075 | disk->disk_name); | 5109 | disk->disk_name); |
5076 | error = 0; | 5110 | error = 0; |
5077 | } | 5111 | } |
5078 | if (mddev->kobj.sd && | 5112 | if (mddev->kobj.sd && |
5079 | sysfs_create_group(&mddev->kobj, &md_bitmap_group)) | 5113 | sysfs_create_group(&mddev->kobj, &md_bitmap_group)) |
5080 | printk(KERN_DEBUG "pointless warning\n"); | 5114 | pr_debug("pointless warning\n"); |
5081 | mutex_unlock(&mddev->open_mutex); | 5115 | mutex_unlock(&mddev->open_mutex); |
5082 | abort: | 5116 | abort: |
5083 | mutex_unlock(&disks_mutex); | 5117 | mutex_unlock(&disks_mutex); |
@@ -5179,15 +5213,15 @@ int md_run(struct mddev *mddev) | |||
5179 | if (mddev->dev_sectors && | 5213 | if (mddev->dev_sectors && |
5180 | rdev->data_offset + mddev->dev_sectors | 5214 | rdev->data_offset + mddev->dev_sectors |
5181 | > rdev->sb_start) { | 5215 | > rdev->sb_start) { |
5182 | printk("md: %s: data overlaps metadata\n", | 5216 | pr_warn("md: %s: data overlaps metadata\n", |
5183 | mdname(mddev)); | 5217 | mdname(mddev)); |
5184 | return -EINVAL; | 5218 | return -EINVAL; |
5185 | } | 5219 | } |
5186 | } else { | 5220 | } else { |
5187 | if (rdev->sb_start + rdev->sb_size/512 | 5221 | if (rdev->sb_start + rdev->sb_size/512 |
5188 | > rdev->data_offset) { | 5222 | > rdev->data_offset) { |
5189 | printk("md: %s: metadata overlaps data\n", | 5223 | pr_warn("md: %s: metadata overlaps data\n", |
5190 | mdname(mddev)); | 5224 | mdname(mddev)); |
5191 | return -EINVAL; | 5225 | return -EINVAL; |
5192 | } | 5226 | } |
5193 | } | 5227 | } |
@@ -5202,11 +5236,11 @@ int md_run(struct mddev *mddev) | |||
5202 | if (!pers || !try_module_get(pers->owner)) { | 5236 | if (!pers || !try_module_get(pers->owner)) { |
5203 | spin_unlock(&pers_lock); | 5237 | spin_unlock(&pers_lock); |
5204 | if (mddev->level != LEVEL_NONE) | 5238 | if (mddev->level != LEVEL_NONE) |
5205 | printk(KERN_WARNING "md: personality for level %d is not loaded!\n", | 5239 | pr_warn("md: personality for level %d is not loaded!\n", |
5206 | mddev->level); | 5240 | mddev->level); |
5207 | else | 5241 | else |
5208 | printk(KERN_WARNING "md: personality for level %s is not loaded!\n", | 5242 | pr_warn("md: personality for level %s is not loaded!\n", |
5209 | mddev->clevel); | 5243 | mddev->clevel); |
5210 | return -EINVAL; | 5244 | return -EINVAL; |
5211 | } | 5245 | } |
5212 | spin_unlock(&pers_lock); | 5246 | spin_unlock(&pers_lock); |
@@ -5236,21 +5270,16 @@ int md_run(struct mddev *mddev) | |||
5236 | if (rdev < rdev2 && | 5270 | if (rdev < rdev2 && |
5237 | rdev->bdev->bd_contains == | 5271 | rdev->bdev->bd_contains == |
5238 | rdev2->bdev->bd_contains) { | 5272 | rdev2->bdev->bd_contains) { |
5239 | printk(KERN_WARNING | 5273 | pr_warn("%s: WARNING: %s appears to be on the same physical disk as %s.\n", |
5240 | "%s: WARNING: %s appears to be" | 5274 | mdname(mddev), |
5241 | " on the same physical disk as" | 5275 | bdevname(rdev->bdev,b), |
5242 | " %s.\n", | 5276 | bdevname(rdev2->bdev,b2)); |
5243 | mdname(mddev), | ||
5244 | bdevname(rdev->bdev,b), | ||
5245 | bdevname(rdev2->bdev,b2)); | ||
5246 | warned = 1; | 5277 | warned = 1; |
5247 | } | 5278 | } |
5248 | } | 5279 | } |
5249 | 5280 | ||
5250 | if (warned) | 5281 | if (warned) |
5251 | printk(KERN_WARNING | 5282 | pr_warn("True protection against single-disk failure might be compromised.\n"); |
5252 | "True protection against single-disk" | ||
5253 | " failure might be compromised.\n"); | ||
5254 | } | 5283 | } |
5255 | 5284 | ||
5256 | mddev->recovery = 0; | 5285 | mddev->recovery = 0; |
@@ -5264,14 +5293,14 @@ int md_run(struct mddev *mddev) | |||
5264 | 5293 | ||
5265 | err = pers->run(mddev); | 5294 | err = pers->run(mddev); |
5266 | if (err) | 5295 | if (err) |
5267 | printk(KERN_ERR "md: pers->run() failed ...\n"); | 5296 | pr_warn("md: pers->run() failed ...\n"); |
5268 | else if (pers->size(mddev, 0, 0) < mddev->array_sectors) { | 5297 | else if (pers->size(mddev, 0, 0) < mddev->array_sectors) { |
5269 | WARN_ONCE(!mddev->external_size, "%s: default size too small," | 5298 | WARN_ONCE(!mddev->external_size, |
5270 | " but 'external_size' not in effect?\n", __func__); | 5299 | "%s: default size too small, but 'external_size' not in effect?\n", |
5271 | printk(KERN_ERR | 5300 | __func__); |
5272 | "md: invalid array_size %llu > default size %llu\n", | 5301 | pr_warn("md: invalid array_size %llu > default size %llu\n", |
5273 | (unsigned long long)mddev->array_sectors / 2, | 5302 | (unsigned long long)mddev->array_sectors / 2, |
5274 | (unsigned long long)pers->size(mddev, 0, 0) / 2); | 5303 | (unsigned long long)pers->size(mddev, 0, 0) / 2); |
5275 | err = -EINVAL; | 5304 | err = -EINVAL; |
5276 | } | 5305 | } |
5277 | if (err == 0 && pers->sync_request && | 5306 | if (err == 0 && pers->sync_request && |
@@ -5281,8 +5310,8 @@ int md_run(struct mddev *mddev) | |||
5281 | bitmap = bitmap_create(mddev, -1); | 5310 | bitmap = bitmap_create(mddev, -1); |
5282 | if (IS_ERR(bitmap)) { | 5311 | if (IS_ERR(bitmap)) { |
5283 | err = PTR_ERR(bitmap); | 5312 | err = PTR_ERR(bitmap); |
5284 | printk(KERN_ERR "%s: failed to create bitmap (%d)\n", | 5313 | pr_warn("%s: failed to create bitmap (%d)\n", |
5285 | mdname(mddev), err); | 5314 | mdname(mddev), err); |
5286 | } else | 5315 | } else |
5287 | mddev->bitmap = bitmap; | 5316 | mddev->bitmap = bitmap; |
5288 | 5317 | ||
@@ -5318,9 +5347,8 @@ int md_run(struct mddev *mddev) | |||
5318 | if (pers->sync_request) { | 5347 | if (pers->sync_request) { |
5319 | if (mddev->kobj.sd && | 5348 | if (mddev->kobj.sd && |
5320 | sysfs_create_group(&mddev->kobj, &md_redundancy_group)) | 5349 | sysfs_create_group(&mddev->kobj, &md_redundancy_group)) |
5321 | printk(KERN_WARNING | 5350 | pr_warn("md: cannot register extra attributes for %s\n", |
5322 | "md: cannot register extra attributes for %s\n", | 5351 | mdname(mddev)); |
5323 | mdname(mddev)); | ||
5324 | mddev->sysfs_action = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_action"); | 5352 | mddev->sysfs_action = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_action"); |
5325 | } else if (mddev->ro == 2) /* auto-readonly not meaningful */ | 5353 | } else if (mddev->ro == 2) /* auto-readonly not meaningful */ |
5326 | mddev->ro = 0; | 5354 | mddev->ro = 0; |
@@ -5350,7 +5378,7 @@ int md_run(struct mddev *mddev) | |||
5350 | set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); | 5378 | set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); |
5351 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | 5379 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); |
5352 | 5380 | ||
5353 | if (mddev->flags & MD_UPDATE_SB_FLAGS) | 5381 | if (mddev->sb_flags) |
5354 | md_update_sb(mddev, 0); | 5382 | md_update_sb(mddev, 0); |
5355 | 5383 | ||
5356 | md_new_event(mddev); | 5384 | md_new_event(mddev); |
@@ -5421,8 +5449,7 @@ static int restart_array(struct mddev *mddev) | |||
5421 | mddev->safemode = 0; | 5449 | mddev->safemode = 0; |
5422 | mddev->ro = 0; | 5450 | mddev->ro = 0; |
5423 | set_disk_ro(disk, 0); | 5451 | set_disk_ro(disk, 0); |
5424 | printk(KERN_INFO "md: %s switched to read-write mode.\n", | 5452 | pr_debug("md: %s switched to read-write mode.\n", mdname(mddev)); |
5425 | mdname(mddev)); | ||
5426 | /* Kick recovery or resync if necessary */ | 5453 | /* Kick recovery or resync if necessary */ |
5427 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | 5454 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); |
5428 | md_wakeup_thread(mddev->thread); | 5455 | md_wakeup_thread(mddev->thread); |
@@ -5446,6 +5473,7 @@ static void md_clean(struct mddev *mddev) | |||
5446 | mddev->level = LEVEL_NONE; | 5473 | mddev->level = LEVEL_NONE; |
5447 | mddev->clevel[0] = 0; | 5474 | mddev->clevel[0] = 0; |
5448 | mddev->flags = 0; | 5475 | mddev->flags = 0; |
5476 | mddev->sb_flags = 0; | ||
5449 | mddev->ro = 0; | 5477 | mddev->ro = 0; |
5450 | mddev->metadata_type[0] = 0; | 5478 | mddev->metadata_type[0] = 0; |
5451 | mddev->chunk_sectors = 0; | 5479 | mddev->chunk_sectors = 0; |
@@ -5490,12 +5518,15 @@ static void __md_stop_writes(struct mddev *mddev) | |||
5490 | 5518 | ||
5491 | del_timer_sync(&mddev->safemode_timer); | 5519 | del_timer_sync(&mddev->safemode_timer); |
5492 | 5520 | ||
5521 | if (mddev->pers && mddev->pers->quiesce) { | ||
5522 | mddev->pers->quiesce(mddev, 1); | ||
5523 | mddev->pers->quiesce(mddev, 0); | ||
5524 | } | ||
5493 | bitmap_flush(mddev); | 5525 | bitmap_flush(mddev); |
5494 | md_super_wait(mddev); | ||
5495 | 5526 | ||
5496 | if (mddev->ro == 0 && | 5527 | if (mddev->ro == 0 && |
5497 | ((!mddev->in_sync && !mddev_is_clustered(mddev)) || | 5528 | ((!mddev->in_sync && !mddev_is_clustered(mddev)) || |
5498 | (mddev->flags & MD_UPDATE_SB_FLAGS))) { | 5529 | mddev->sb_flags)) { |
5499 | /* mark array as shutdown cleanly */ | 5530 | /* mark array as shutdown cleanly */ |
5500 | if (!mddev_is_clustered(mddev)) | 5531 | if (!mddev_is_clustered(mddev)) |
5501 | mddev->in_sync = 1; | 5532 | mddev->in_sync = 1; |
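In the __md_stop_writes() hunk above, the md_super_wait() call is replaced by a quiesce/resume bracket of the personality, so in-flight writes are drained before the bitmap is flushed and the array is marked clean via the new sb_flags bits. A trivial sketch of that bracketing order; the struct and helpers are invented stand-ins, not the md interfaces.

    #include <stdio.h>

    struct pers {
        /* 1 = block new I/O and wait for in-flight I/O, 0 = resume */
        void (*quiesce)(int how);
    };

    static void demo_quiesce(int how)
    {
        printf(how ? "quiesced\n" : "resumed\n");
    }

    static void flush_bitmap(void)
    {
        printf("bitmap flushed with no writes in flight\n");
    }

    int main(void)
    {
        struct pers p = { demo_quiesce };

        /* Drain in-flight writes by quiescing and resuming,
         * then flush the bitmap, as __md_stop_writes now does. */
        p.quiesce(1);
        p.quiesce(0);
        flush_bitmap();
        return 0;
    }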
@@ -5516,8 +5547,8 @@ static void mddev_detach(struct mddev *mddev) | |||
5516 | struct bitmap *bitmap = mddev->bitmap; | 5547 | struct bitmap *bitmap = mddev->bitmap; |
5517 | /* wait for behind writes to complete */ | 5548 | /* wait for behind writes to complete */ |
5518 | if (bitmap && atomic_read(&bitmap->behind_writes) > 0) { | 5549 | if (bitmap && atomic_read(&bitmap->behind_writes) > 0) { |
5519 | printk(KERN_INFO "md:%s: behind writes in progress - waiting to stop.\n", | 5550 | pr_debug("md:%s: behind writes in progress - waiting to stop.\n", |
5520 | mdname(mddev)); | 5551 | mdname(mddev)); |
5521 | /* need to kick something here to make sure I/O goes? */ | 5552 | /* need to kick something here to make sure I/O goes? */ |
5522 | wait_event(bitmap->behind_wait, | 5553 | wait_event(bitmap->behind_wait, |
5523 | atomic_read(&bitmap->behind_writes) == 0); | 5554 | atomic_read(&bitmap->behind_writes) == 0); |
@@ -5578,20 +5609,20 @@ static int md_set_readonly(struct mddev *mddev, struct block_device *bdev) | |||
5578 | * which will now never happen */ | 5609 | * which will now never happen */ |
5579 | wake_up_process(mddev->sync_thread->tsk); | 5610 | wake_up_process(mddev->sync_thread->tsk); |
5580 | 5611 | ||
5581 | if (mddev->external && test_bit(MD_CHANGE_PENDING, &mddev->flags)) | 5612 | if (mddev->external && test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) |
5582 | return -EBUSY; | 5613 | return -EBUSY; |
5583 | mddev_unlock(mddev); | 5614 | mddev_unlock(mddev); |
5584 | wait_event(resync_wait, !test_bit(MD_RECOVERY_RUNNING, | 5615 | wait_event(resync_wait, !test_bit(MD_RECOVERY_RUNNING, |
5585 | &mddev->recovery)); | 5616 | &mddev->recovery)); |
5586 | wait_event(mddev->sb_wait, | 5617 | wait_event(mddev->sb_wait, |
5587 | !test_bit(MD_CHANGE_PENDING, &mddev->flags)); | 5618 | !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); |
5588 | mddev_lock_nointr(mddev); | 5619 | mddev_lock_nointr(mddev); |
5589 | 5620 | ||
5590 | mutex_lock(&mddev->open_mutex); | 5621 | mutex_lock(&mddev->open_mutex); |
5591 | if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) || | 5622 | if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) || |
5592 | mddev->sync_thread || | 5623 | mddev->sync_thread || |
5593 | test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { | 5624 | test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { |
5594 | printk("md: %s still in use.\n",mdname(mddev)); | 5625 | pr_warn("md: %s still in use.\n",mdname(mddev)); |
5595 | if (did_freeze) { | 5626 | if (did_freeze) { |
5596 | clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); | 5627 | clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); |
5597 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | 5628 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); |
@@ -5653,7 +5684,7 @@ static int do_md_stop(struct mddev *mddev, int mode, | |||
5653 | mddev->sysfs_active || | 5684 | mddev->sysfs_active || |
5654 | mddev->sync_thread || | 5685 | mddev->sync_thread || |
5655 | test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { | 5686 | test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { |
5656 | printk("md: %s still in use.\n",mdname(mddev)); | 5687 | pr_warn("md: %s still in use.\n",mdname(mddev)); |
5657 | mutex_unlock(&mddev->open_mutex); | 5688 | mutex_unlock(&mddev->open_mutex); |
5658 | if (did_freeze) { | 5689 | if (did_freeze) { |
5659 | clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); | 5690 | clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); |
@@ -5690,7 +5721,7 @@ static int do_md_stop(struct mddev *mddev, int mode, | |||
5690 | * Free resources if final stop | 5721 | * Free resources if final stop |
5691 | */ | 5722 | */ |
5692 | if (mode == 0) { | 5723 | if (mode == 0) { |
5693 | printk(KERN_INFO "md: %s stopped.\n", mdname(mddev)); | 5724 | pr_info("md: %s stopped.\n", mdname(mddev)); |
5694 | 5725 | ||
5695 | bitmap_destroy(mddev); | 5726 | bitmap_destroy(mddev); |
5696 | if (mddev->bitmap_info.file) { | 5727 | if (mddev->bitmap_info.file) { |
@@ -5722,17 +5753,17 @@ static void autorun_array(struct mddev *mddev) | |||
5722 | if (list_empty(&mddev->disks)) | 5753 | if (list_empty(&mddev->disks)) |
5723 | return; | 5754 | return; |
5724 | 5755 | ||
5725 | printk(KERN_INFO "md: running: "); | 5756 | pr_info("md: running: "); |
5726 | 5757 | ||
5727 | rdev_for_each(rdev, mddev) { | 5758 | rdev_for_each(rdev, mddev) { |
5728 | char b[BDEVNAME_SIZE]; | 5759 | char b[BDEVNAME_SIZE]; |
5729 | printk("<%s>", bdevname(rdev->bdev,b)); | 5760 | pr_cont("<%s>", bdevname(rdev->bdev,b)); |
5730 | } | 5761 | } |
5731 | printk("\n"); | 5762 | pr_cont("\n"); |
5732 | 5763 | ||
5733 | err = do_md_run(mddev); | 5764 | err = do_md_run(mddev); |
5734 | if (err) { | 5765 | if (err) { |
5735 | printk(KERN_WARNING "md: do_md_run() returned %d\n", err); | 5766 | pr_warn("md: do_md_run() returned %d\n", err); |
5736 | do_md_stop(mddev, 0, NULL); | 5767 | do_md_stop(mddev, 0, NULL); |
5737 | } | 5768 | } |
5738 | } | 5769 | } |
@@ -5755,7 +5786,7 @@ static void autorun_devices(int part) | |||
5755 | struct mddev *mddev; | 5786 | struct mddev *mddev; |
5756 | char b[BDEVNAME_SIZE]; | 5787 | char b[BDEVNAME_SIZE]; |
5757 | 5788 | ||
5758 | printk(KERN_INFO "md: autorun ...\n"); | 5789 | pr_info("md: autorun ...\n"); |
5759 | while (!list_empty(&pending_raid_disks)) { | 5790 | while (!list_empty(&pending_raid_disks)) { |
5760 | int unit; | 5791 | int unit; |
5761 | dev_t dev; | 5792 | dev_t dev; |
@@ -5763,13 +5794,12 @@ static void autorun_devices(int part) | |||
5763 | rdev0 = list_entry(pending_raid_disks.next, | 5794 | rdev0 = list_entry(pending_raid_disks.next, |
5764 | struct md_rdev, same_set); | 5795 | struct md_rdev, same_set); |
5765 | 5796 | ||
5766 | printk(KERN_INFO "md: considering %s ...\n", | 5797 | pr_debug("md: considering %s ...\n", bdevname(rdev0->bdev,b)); |
5767 | bdevname(rdev0->bdev,b)); | ||
5768 | INIT_LIST_HEAD(&candidates); | 5798 | INIT_LIST_HEAD(&candidates); |
5769 | rdev_for_each_list(rdev, tmp, &pending_raid_disks) | 5799 | rdev_for_each_list(rdev, tmp, &pending_raid_disks) |
5770 | if (super_90_load(rdev, rdev0, 0) >= 0) { | 5800 | if (super_90_load(rdev, rdev0, 0) >= 0) { |
5771 | printk(KERN_INFO "md: adding %s ...\n", | 5801 | pr_debug("md: adding %s ...\n", |
5772 | bdevname(rdev->bdev,b)); | 5802 | bdevname(rdev->bdev,b)); |
5773 | list_move(&rdev->same_set, &candidates); | 5803 | list_move(&rdev->same_set, &candidates); |
5774 | } | 5804 | } |
5775 | /* | 5805 | /* |
@@ -5786,8 +5816,8 @@ static void autorun_devices(int part) | |||
5786 | unit = MINOR(dev); | 5816 | unit = MINOR(dev); |
5787 | } | 5817 | } |
5788 | if (rdev0->preferred_minor != unit) { | 5818 | if (rdev0->preferred_minor != unit) { |
5789 | printk(KERN_INFO "md: unit number in %s is bad: %d\n", | 5819 | pr_warn("md: unit number in %s is bad: %d\n", |
5790 | bdevname(rdev0->bdev, b), rdev0->preferred_minor); | 5820 | bdevname(rdev0->bdev, b), rdev0->preferred_minor); |
5791 | break; | 5821 | break; |
5792 | } | 5822 | } |
5793 | 5823 | ||
@@ -5796,21 +5826,17 @@ static void autorun_devices(int part) | |||
5796 | if (!mddev || !mddev->gendisk) { | 5826 | if (!mddev || !mddev->gendisk) { |
5797 | if (mddev) | 5827 | if (mddev) |
5798 | mddev_put(mddev); | 5828 | mddev_put(mddev); |
5799 | printk(KERN_ERR | ||
5800 | "md: cannot allocate memory for md drive.\n"); | ||
5801 | break; | 5829 | break; |
5802 | } | 5830 | } |
5803 | if (mddev_lock(mddev)) | 5831 | if (mddev_lock(mddev)) |
5804 | printk(KERN_WARNING "md: %s locked, cannot run\n", | 5832 | pr_warn("md: %s locked, cannot run\n", mdname(mddev)); |
5805 | mdname(mddev)); | ||
5806 | else if (mddev->raid_disks || mddev->major_version | 5833 | else if (mddev->raid_disks || mddev->major_version |
5807 | || !list_empty(&mddev->disks)) { | 5834 | || !list_empty(&mddev->disks)) { |
5808 | printk(KERN_WARNING | 5835 | pr_warn("md: %s already running, cannot run %s\n", |
5809 | "md: %s already running, cannot run %s\n", | ||
5810 | mdname(mddev), bdevname(rdev0->bdev,b)); | 5836 | mdname(mddev), bdevname(rdev0->bdev,b)); |
5811 | mddev_unlock(mddev); | 5837 | mddev_unlock(mddev); |
5812 | } else { | 5838 | } else { |
5813 | printk(KERN_INFO "md: created %s\n", mdname(mddev)); | 5839 | pr_debug("md: created %s\n", mdname(mddev)); |
5814 | mddev->persistent = 1; | 5840 | mddev->persistent = 1; |
5815 | rdev_for_each_list(rdev, tmp, &candidates) { | 5841 | rdev_for_each_list(rdev, tmp, &candidates) { |
5816 | list_del_init(&rdev->same_set); | 5842 | list_del_init(&rdev->same_set); |
@@ -5829,7 +5855,7 @@ static void autorun_devices(int part) | |||
5829 | } | 5855 | } |
5830 | mddev_put(mddev); | 5856 | mddev_put(mddev); |
5831 | } | 5857 | } |
5832 | printk(KERN_INFO "md: ... autorun DONE.\n"); | 5858 | pr_info("md: ... autorun DONE.\n"); |
5833 | } | 5859 | } |
5834 | #endif /* !MODULE */ | 5860 | #endif /* !MODULE */ |
5835 | 5861 | ||
@@ -5964,6 +5990,8 @@ static int get_disk_info(struct mddev *mddev, void __user * arg) | |||
5964 | info.state |= (1<<MD_DISK_JOURNAL); | 5990 | info.state |= (1<<MD_DISK_JOURNAL); |
5965 | if (test_bit(WriteMostly, &rdev->flags)) | 5991 | if (test_bit(WriteMostly, &rdev->flags)) |
5966 | info.state |= (1<<MD_DISK_WRITEMOSTLY); | 5992 | info.state |= (1<<MD_DISK_WRITEMOSTLY); |
5993 | if (test_bit(FailFast, &rdev->flags)) | ||
5994 | info.state |= (1<<MD_DISK_FAILFAST); | ||
5967 | } else { | 5995 | } else { |
5968 | info.major = info.minor = 0; | 5996 | info.major = info.minor = 0; |
5969 | info.raid_disk = -1; | 5997 | info.raid_disk = -1; |
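get_disk_info() above (and add_new_disk() further down) carries the new fail-fast setting across the ioctl boundary by translating between the MD_DISK_FAILFAST bit in info->state and the FailFast rdev flag. A compact sketch of that two-way mapping; the bit positions and flag values here are illustrative, not the uapi numbers.

    #include <stdio.h>

    #define DISK_WRITEMOSTLY 0          /* ioctl state bit positions */
    #define DISK_FAILFAST    1

    #define RDEV_WRITEMOSTLY 0x1UL      /* in-kernel flag bits */
    #define RDEV_FAILFAST    0x2UL

    static unsigned long state_to_flags(unsigned int state)
    {
        unsigned long flags = 0;

        if (state & (1u << DISK_WRITEMOSTLY))
            flags |= RDEV_WRITEMOSTLY;
        if (state & (1u << DISK_FAILFAST))
            flags |= RDEV_FAILFAST;
        return flags;
    }

    static unsigned int flags_to_state(unsigned long flags)
    {
        unsigned int state = 0;

        if (flags & RDEV_WRITEMOSTLY)
            state |= 1u << DISK_WRITEMOSTLY;
        if (flags & RDEV_FAILFAST)
            state |= 1u << DISK_FAILFAST;
        return state;
    }

    int main(void)
    {
        unsigned long flags = state_to_flags(1u << DISK_FAILFAST);

        printf("flags=%#lx state=%#x\n", flags, flags_to_state(flags));
        return 0;
    }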
@@ -5985,8 +6013,8 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info) | |||
5985 | 6013 | ||
5986 | if (mddev_is_clustered(mddev) && | 6014 | if (mddev_is_clustered(mddev) && |
5987 | !(info->state & ((1 << MD_DISK_CLUSTER_ADD) | (1 << MD_DISK_CANDIDATE)))) { | 6015 | !(info->state & ((1 << MD_DISK_CLUSTER_ADD) | (1 << MD_DISK_CANDIDATE)))) { |
5988 | pr_err("%s: Cannot add to clustered mddev.\n", | 6016 | pr_warn("%s: Cannot add to clustered mddev.\n", |
5989 | mdname(mddev)); | 6017 | mdname(mddev)); |
5990 | return -EINVAL; | 6018 | return -EINVAL; |
5991 | } | 6019 | } |
5992 | 6020 | ||
@@ -5998,8 +6026,7 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info) | |||
5998 | /* expecting a device which has a superblock */ | 6026 | /* expecting a device which has a superblock */ |
5999 | rdev = md_import_device(dev, mddev->major_version, mddev->minor_version); | 6027 | rdev = md_import_device(dev, mddev->major_version, mddev->minor_version); |
6000 | if (IS_ERR(rdev)) { | 6028 | if (IS_ERR(rdev)) { |
6001 | printk(KERN_WARNING | 6029 | pr_warn("md: md_import_device returned %ld\n", |
6002 | "md: md_import_device returned %ld\n", | ||
6003 | PTR_ERR(rdev)); | 6030 | PTR_ERR(rdev)); |
6004 | return PTR_ERR(rdev); | 6031 | return PTR_ERR(rdev); |
6005 | } | 6032 | } |
@@ -6010,8 +6037,7 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info) | |||
6010 | err = super_types[mddev->major_version] | 6037 | err = super_types[mddev->major_version] |
6011 | .load_super(rdev, rdev0, mddev->minor_version); | 6038 | .load_super(rdev, rdev0, mddev->minor_version); |
6012 | if (err < 0) { | 6039 | if (err < 0) { |
6013 | printk(KERN_WARNING | 6040 | pr_warn("md: %s has different UUID to %s\n", |
6014 | "md: %s has different UUID to %s\n", | ||
6015 | bdevname(rdev->bdev,b), | 6041 | bdevname(rdev->bdev,b), |
6016 | bdevname(rdev0->bdev,b2)); | 6042 | bdevname(rdev0->bdev,b2)); |
6017 | export_rdev(rdev); | 6043 | export_rdev(rdev); |
@@ -6032,9 +6058,8 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info) | |||
6032 | if (mddev->pers) { | 6058 | if (mddev->pers) { |
6033 | int err; | 6059 | int err; |
6034 | if (!mddev->pers->hot_add_disk) { | 6060 | if (!mddev->pers->hot_add_disk) { |
6035 | printk(KERN_WARNING | 6061 | pr_warn("%s: personality does not support diskops!\n", |
6036 | "%s: personality does not support diskops!\n", | 6062 | mdname(mddev)); |
6037 | mdname(mddev)); | ||
6038 | return -EINVAL; | 6063 | return -EINVAL; |
6039 | } | 6064 | } |
6040 | if (mddev->persistent) | 6065 | if (mddev->persistent) |
@@ -6043,8 +6068,7 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info) | |||
6043 | else | 6068 | else |
6044 | rdev = md_import_device(dev, -1, -1); | 6069 | rdev = md_import_device(dev, -1, -1); |
6045 | if (IS_ERR(rdev)) { | 6070 | if (IS_ERR(rdev)) { |
6046 | printk(KERN_WARNING | 6071 | pr_warn("md: md_import_device returned %ld\n", |
6047 | "md: md_import_device returned %ld\n", | ||
6048 | PTR_ERR(rdev)); | 6072 | PTR_ERR(rdev)); |
6049 | return PTR_ERR(rdev); | 6073 | return PTR_ERR(rdev); |
6050 | } | 6074 | } |
@@ -6075,6 +6099,10 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info) | |||
6075 | set_bit(WriteMostly, &rdev->flags); | 6099 | set_bit(WriteMostly, &rdev->flags); |
6076 | else | 6100 | else |
6077 | clear_bit(WriteMostly, &rdev->flags); | 6101 | clear_bit(WriteMostly, &rdev->flags); |
6102 | if (info->state & (1<<MD_DISK_FAILFAST)) | ||
6103 | set_bit(FailFast, &rdev->flags); | ||
6104 | else | ||
6105 | clear_bit(FailFast, &rdev->flags); | ||
6078 | 6106 | ||
6079 | if (info->state & (1<<MD_DISK_JOURNAL)) { | 6107 | if (info->state & (1<<MD_DISK_JOURNAL)) { |
6080 | struct md_rdev *rdev2; | 6108 | struct md_rdev *rdev2; |
@@ -6140,8 +6168,7 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info) | |||
6140 | * for major_version==0 superblocks | 6168 | * for major_version==0 superblocks |
6141 | */ | 6169 | */ |
6142 | if (mddev->major_version != 0) { | 6170 | if (mddev->major_version != 0) { |
6143 | printk(KERN_WARNING "%s: ADD_NEW_DISK not supported\n", | 6171 | pr_warn("%s: ADD_NEW_DISK not supported\n", mdname(mddev)); |
6144 | mdname(mddev)); | ||
6145 | return -EINVAL; | 6172 | return -EINVAL; |
6146 | } | 6173 | } |
6147 | 6174 | ||
@@ -6149,8 +6176,7 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info) | |||
6149 | int err; | 6176 | int err; |
6150 | rdev = md_import_device(dev, -1, 0); | 6177 | rdev = md_import_device(dev, -1, 0); |
6151 | if (IS_ERR(rdev)) { | 6178 | if (IS_ERR(rdev)) { |
6152 | printk(KERN_WARNING | 6179 | pr_warn("md: error, md_import_device() returned %ld\n", |
6153 | "md: error, md_import_device() returned %ld\n", | ||
6154 | PTR_ERR(rdev)); | 6180 | PTR_ERR(rdev)); |
6155 | return PTR_ERR(rdev); | 6181 | return PTR_ERR(rdev); |
6156 | } | 6182 | } |
@@ -6166,9 +6192,11 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info) | |||
6166 | 6192 | ||
6167 | if (info->state & (1<<MD_DISK_WRITEMOSTLY)) | 6193 | if (info->state & (1<<MD_DISK_WRITEMOSTLY)) |
6168 | set_bit(WriteMostly, &rdev->flags); | 6194 | set_bit(WriteMostly, &rdev->flags); |
6195 | if (info->state & (1<<MD_DISK_FAILFAST)) | ||
6196 | set_bit(FailFast, &rdev->flags); | ||
6169 | 6197 | ||
6170 | if (!mddev->persistent) { | 6198 | if (!mddev->persistent) { |
6171 | printk(KERN_INFO "md: nonpersistent superblock ...\n"); | 6199 | pr_debug("md: nonpersistent superblock ...\n"); |
6172 | rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512; | 6200 | rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512; |
6173 | } else | 6201 | } else |
6174 | rdev->sb_start = calc_dev_sboffset(rdev); | 6202 | rdev->sb_start = calc_dev_sboffset(rdev); |
@@ -6207,13 +6235,17 @@ kick_rdev: | |||
6207 | md_cluster_ops->remove_disk(mddev, rdev); | 6235 | md_cluster_ops->remove_disk(mddev, rdev); |
6208 | 6236 | ||
6209 | md_kick_rdev_from_array(rdev); | 6237 | md_kick_rdev_from_array(rdev); |
6210 | md_update_sb(mddev, 1); | 6238 | set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); |
6239 | if (mddev->thread) | ||
6240 | md_wakeup_thread(mddev->thread); | ||
6241 | else | ||
6242 | md_update_sb(mddev, 1); | ||
6211 | md_new_event(mddev); | 6243 | md_new_event(mddev); |
6212 | 6244 | ||
6213 | return 0; | 6245 | return 0; |
6214 | busy: | 6246 | busy: |
6215 | printk(KERN_WARNING "md: cannot remove active disk %s from %s ...\n", | 6247 | pr_debug("md: cannot remove active disk %s from %s ...\n", |
6216 | bdevname(rdev->bdev,b), mdname(mddev)); | 6248 | bdevname(rdev->bdev,b), mdname(mddev)); |
6217 | return -EBUSY; | 6249 | return -EBUSY; |
6218 | } | 6250 | } |
6219 | 6251 | ||
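The hot_remove_disk() hunk above (and hot_add_disk() below) stops calling md_update_sb() directly when the array has a helper thread: it sets MD_SB_CHANGE_DEVS and wakes the thread, and only writes the superblock synchronously when no thread exists. A compilable sketch of that decision, with stubbed types and helpers standing in for the kernel ones:

    #include <stdbool.h>
    #include <stdio.h>

    #define SB_CHANGE_DEVS 0x1UL

    struct array {
        unsigned long sb_flags;
        bool has_thread;            /* stands in for mddev->thread */
    };

    static void wake_helper_thread(struct array *a)
    {
        printf("woke helper thread, sb_flags=%#lx\n", a->sb_flags);
    }

    static void write_superblock_now(struct array *a)
    {
        printf("synchronous superblock write, sb_flags=%#lx\n", a->sb_flags);
        a->sb_flags = 0;
    }

    /* Defer the write when a helper thread exists, else do it now. */
    static void request_sb_update(struct array *a)
    {
        a->sb_flags |= SB_CHANGE_DEVS;
        if (a->has_thread)
            wake_helper_thread(a);
        else
            write_superblock_now(a);
    }

    int main(void)
    {
        struct array running  = { 0, true };
        struct array threadless = { 0, false };

        request_sb_update(&running);
        request_sb_update(&threadless);
        return 0;
    }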
@@ -6227,22 +6259,19 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev) | |||
6227 | return -ENODEV; | 6259 | return -ENODEV; |
6228 | 6260 | ||
6229 | if (mddev->major_version != 0) { | 6261 | if (mddev->major_version != 0) { |
6230 | printk(KERN_WARNING "%s: HOT_ADD may only be used with" | 6262 | pr_warn("%s: HOT_ADD may only be used with version-0 superblocks.\n", |
6231 | " version-0 superblocks.\n", | ||
6232 | mdname(mddev)); | 6263 | mdname(mddev)); |
6233 | return -EINVAL; | 6264 | return -EINVAL; |
6234 | } | 6265 | } |
6235 | if (!mddev->pers->hot_add_disk) { | 6266 | if (!mddev->pers->hot_add_disk) { |
6236 | printk(KERN_WARNING | 6267 | pr_warn("%s: personality does not support diskops!\n", |
6237 | "%s: personality does not support diskops!\n", | ||
6238 | mdname(mddev)); | 6268 | mdname(mddev)); |
6239 | return -EINVAL; | 6269 | return -EINVAL; |
6240 | } | 6270 | } |
6241 | 6271 | ||
6242 | rdev = md_import_device(dev, -1, 0); | 6272 | rdev = md_import_device(dev, -1, 0); |
6243 | if (IS_ERR(rdev)) { | 6273 | if (IS_ERR(rdev)) { |
6244 | printk(KERN_WARNING | 6274 | pr_warn("md: error, md_import_device() returned %ld\n", |
6245 | "md: error, md_import_device() returned %ld\n", | ||
6246 | PTR_ERR(rdev)); | 6275 | PTR_ERR(rdev)); |
6247 | return -EINVAL; | 6276 | return -EINVAL; |
6248 | } | 6277 | } |
@@ -6255,8 +6284,7 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev) | |||
6255 | rdev->sectors = rdev->sb_start; | 6284 | rdev->sectors = rdev->sb_start; |
6256 | 6285 | ||
6257 | if (test_bit(Faulty, &rdev->flags)) { | 6286 | if (test_bit(Faulty, &rdev->flags)) { |
6258 | printk(KERN_WARNING | 6287 | pr_warn("md: can not hot-add faulty %s disk to %s!\n", |
6259 | "md: can not hot-add faulty %s disk to %s!\n", | ||
6260 | bdevname(rdev->bdev,b), mdname(mddev)); | 6288 | bdevname(rdev->bdev,b), mdname(mddev)); |
6261 | err = -EINVAL; | 6289 | err = -EINVAL; |
6262 | goto abort_export; | 6290 | goto abort_export; |
@@ -6276,7 +6304,9 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev) | |||
6276 | 6304 | ||
6277 | rdev->raid_disk = -1; | 6305 | rdev->raid_disk = -1; |
6278 | 6306 | ||
6279 | md_update_sb(mddev, 1); | 6307 | set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); |
6308 | if (!mddev->thread) | ||
6309 | md_update_sb(mddev, 1); | ||
6280 | /* | 6310 | /* |
6281 | * Kick recovery, maybe this spare has to be added to the | 6311 | * Kick recovery, maybe this spare has to be added to the |
6282 | * array immediately. | 6312 | * array immediately. |
@@ -6312,23 +6342,23 @@ static int set_bitmap_file(struct mddev *mddev, int fd) | |||
6312 | f = fget(fd); | 6342 | f = fget(fd); |
6313 | 6343 | ||
6314 | if (f == NULL) { | 6344 | if (f == NULL) { |
6315 | printk(KERN_ERR "%s: error: failed to get bitmap file\n", | 6345 | pr_warn("%s: error: failed to get bitmap file\n", |
6316 | mdname(mddev)); | 6346 | mdname(mddev)); |
6317 | return -EBADF; | 6347 | return -EBADF; |
6318 | } | 6348 | } |
6319 | 6349 | ||
6320 | inode = f->f_mapping->host; | 6350 | inode = f->f_mapping->host; |
6321 | if (!S_ISREG(inode->i_mode)) { | 6351 | if (!S_ISREG(inode->i_mode)) { |
6322 | printk(KERN_ERR "%s: error: bitmap file must be a regular file\n", | 6352 | pr_warn("%s: error: bitmap file must be a regular file\n", |
6323 | mdname(mddev)); | 6353 | mdname(mddev)); |
6324 | err = -EBADF; | 6354 | err = -EBADF; |
6325 | } else if (!(f->f_mode & FMODE_WRITE)) { | 6355 | } else if (!(f->f_mode & FMODE_WRITE)) { |
6326 | printk(KERN_ERR "%s: error: bitmap file must open for write\n", | 6356 | pr_warn("%s: error: bitmap file must open for write\n", |
6327 | mdname(mddev)); | 6357 | mdname(mddev)); |
6328 | err = -EBADF; | 6358 | err = -EBADF; |
6329 | } else if (atomic_read(&inode->i_writecount) != 1) { | 6359 | } else if (atomic_read(&inode->i_writecount) != 1) { |
6330 | printk(KERN_ERR "%s: error: bitmap file is already in use\n", | 6360 | pr_warn("%s: error: bitmap file is already in use\n", |
6331 | mdname(mddev)); | 6361 | mdname(mddev)); |
6332 | err = -EBUSY; | 6362 | err = -EBUSY; |
6333 | } | 6363 | } |
6334 | if (err) { | 6364 | if (err) { |
@@ -6393,8 +6423,7 @@ static int set_array_info(struct mddev *mddev, mdu_array_info_t *info) | |||
6393 | info->major_version >= ARRAY_SIZE(super_types) || | 6423 | info->major_version >= ARRAY_SIZE(super_types) || |
6394 | super_types[info->major_version].name == NULL) { | 6424 | super_types[info->major_version].name == NULL) { |
6395 | /* maybe try to auto-load a module? */ | 6425 | /* maybe try to auto-load a module? */ |
6396 | printk(KERN_INFO | 6426 | pr_warn("md: superblock version %d not known\n", |
6397 | "md: superblock version %d not known\n", | ||
6398 | info->major_version); | 6427 | info->major_version); |
6399 | return -EINVAL; | 6428 | return -EINVAL; |
6400 | } | 6429 | } |
@@ -6432,9 +6461,11 @@ static int set_array_info(struct mddev *mddev, mdu_array_info_t *info) | |||
6432 | 6461 | ||
6433 | mddev->max_disks = MD_SB_DISKS; | 6462 | mddev->max_disks = MD_SB_DISKS; |
6434 | 6463 | ||
6435 | if (mddev->persistent) | 6464 | if (mddev->persistent) { |
6436 | mddev->flags = 0; | 6465 | mddev->flags = 0; |
6437 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | 6466 | mddev->sb_flags = 0; |
6467 | } | ||
6468 | set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); | ||
6438 | 6469 | ||
6439 | mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9; | 6470 | mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9; |
6440 | mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9); | 6471 | mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9); |
@@ -6660,8 +6691,7 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) | |||
6660 | if (mddev->bitmap_info.nodes) { | 6691 | if (mddev->bitmap_info.nodes) { |
6661 | /* hold PW on all the bitmap lock */ | 6692 | /* hold PW on all the bitmap lock */ |
6662 | if (md_cluster_ops->lock_all_bitmaps(mddev) <= 0) { | 6693 | if (md_cluster_ops->lock_all_bitmaps(mddev) <= 0) { |
6663 | printk("md: can't change bitmap to none since the" | 6694 | pr_warn("md: can't change bitmap to none since the array is in use by more than one node\n"); |
6664 | " array is in use by more than one node\n"); | ||
6665 | rv = -EPERM; | 6695 | rv = -EPERM; |
6666 | md_cluster_ops->unlock_all_bitmaps(mddev); | 6696 | md_cluster_ops->unlock_all_bitmaps(mddev); |
6667 | goto err; | 6697 | goto err; |
@@ -6829,7 +6859,7 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, | |||
6829 | /* need to ensure recovery thread has run */ | 6859 | /* need to ensure recovery thread has run */ |
6830 | wait_event_interruptible_timeout(mddev->sb_wait, | 6860 | wait_event_interruptible_timeout(mddev->sb_wait, |
6831 | !test_bit(MD_RECOVERY_NEEDED, | 6861 | !test_bit(MD_RECOVERY_NEEDED, |
6832 | &mddev->flags), | 6862 | &mddev->recovery), |
6833 | msecs_to_jiffies(5000)); | 6863 | msecs_to_jiffies(5000)); |
6834 | if (cmd == STOP_ARRAY || cmd == STOP_ARRAY_RO) { | 6864 | if (cmd == STOP_ARRAY || cmd == STOP_ARRAY_RO) { |
6835 | /* Need to flush page cache, and ensure no-one else opens | 6865 | /* Need to flush page cache, and ensure no-one else opens |
@@ -6847,9 +6877,8 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, | |||
6847 | } | 6877 | } |
6848 | err = mddev_lock(mddev); | 6878 | err = mddev_lock(mddev); |
6849 | if (err) { | 6879 | if (err) { |
6850 | printk(KERN_INFO | 6880 | pr_debug("md: ioctl lock interrupted, reason %d, cmd %d\n", |
6851 | "md: ioctl lock interrupted, reason %d, cmd %d\n", | 6881 | err, cmd); |
6852 | err, cmd); | ||
6853 | goto out; | 6882 | goto out; |
6854 | } | 6883 | } |
6855 | 6884 | ||
@@ -6864,30 +6893,24 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, | |||
6864 | if (mddev->pers) { | 6893 | if (mddev->pers) { |
6865 | err = update_array_info(mddev, &info); | 6894 | err = update_array_info(mddev, &info); |
6866 | if (err) { | 6895 | if (err) { |
6867 | printk(KERN_WARNING "md: couldn't update" | 6896 | pr_warn("md: couldn't update array info. %d\n", err); |
6868 | " array info. %d\n", err); | ||
6869 | goto unlock; | 6897 | goto unlock; |
6870 | } | 6898 | } |
6871 | goto unlock; | 6899 | goto unlock; |
6872 | } | 6900 | } |
6873 | if (!list_empty(&mddev->disks)) { | 6901 | if (!list_empty(&mddev->disks)) { |
6874 | printk(KERN_WARNING | 6902 | pr_warn("md: array %s already has disks!\n", mdname(mddev)); |
6875 | "md: array %s already has disks!\n", | ||
6876 | mdname(mddev)); | ||
6877 | err = -EBUSY; | 6903 | err = -EBUSY; |
6878 | goto unlock; | 6904 | goto unlock; |
6879 | } | 6905 | } |
6880 | if (mddev->raid_disks) { | 6906 | if (mddev->raid_disks) { |
6881 | printk(KERN_WARNING | 6907 | pr_warn("md: array %s already initialised!\n", mdname(mddev)); |
6882 | "md: array %s already initialised!\n", | ||
6883 | mdname(mddev)); | ||
6884 | err = -EBUSY; | 6908 | err = -EBUSY; |
6885 | goto unlock; | 6909 | goto unlock; |
6886 | } | 6910 | } |
6887 | err = set_array_info(mddev, &info); | 6911 | err = set_array_info(mddev, &info); |
6888 | if (err) { | 6912 | if (err) { |
6889 | printk(KERN_WARNING "md: couldn't set" | 6913 | pr_warn("md: couldn't set array info. %d\n", err); |
6890 | " array info. %d\n", err); | ||
6891 | goto unlock; | 6914 | goto unlock; |
6892 | } | 6915 | } |
6893 | goto unlock; | 6916 | goto unlock; |
@@ -6987,11 +7010,11 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, | |||
6987 | /* If a device failed while we were read-only, we | 7010 | /* If a device failed while we were read-only, we |
6988 | * need to make sure the metadata is updated now. | 7011 | * need to make sure the metadata is updated now. |
6989 | */ | 7012 | */ |
6990 | if (test_bit(MD_CHANGE_DEVS, &mddev->flags)) { | 7013 | if (test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)) { |
6991 | mddev_unlock(mddev); | 7014 | mddev_unlock(mddev); |
6992 | wait_event(mddev->sb_wait, | 7015 | wait_event(mddev->sb_wait, |
6993 | !test_bit(MD_CHANGE_DEVS, &mddev->flags) && | 7016 | !test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags) && |
6994 | !test_bit(MD_CHANGE_PENDING, &mddev->flags)); | 7017 | !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); |
6995 | mddev_lock_nointr(mddev); | 7018 | mddev_lock_nointr(mddev); |
6996 | } | 7019 | } |
6997 | } else { | 7020 | } else { |
@@ -7092,7 +7115,8 @@ static int md_open(struct block_device *bdev, fmode_t mode) | |||
7092 | 7115 | ||
7093 | if (test_bit(MD_CLOSING, &mddev->flags)) { | 7116 | if (test_bit(MD_CLOSING, &mddev->flags)) { |
7094 | mutex_unlock(&mddev->open_mutex); | 7117 | mutex_unlock(&mddev->open_mutex); |
7095 | return -ENODEV; | 7118 | err = -ENODEV; |
7119 | goto out; | ||
7096 | } | 7120 | } |
7097 | 7121 | ||
7098 | err = 0; | 7122 | err = 0; |
@@ -7101,6 +7125,8 @@ static int md_open(struct block_device *bdev, fmode_t mode) | |||
7101 | 7125 | ||
7102 | check_disk_change(bdev); | 7126 | check_disk_change(bdev); |
7103 | out: | 7127 | out: |
7128 | if (err) | ||
7129 | mddev_put(mddev); | ||
7104 | return err; | 7130 | return err; |
7105 | } | 7131 | } |
7106 | 7132 | ||
@@ -7171,10 +7197,12 @@ static int md_thread(void *arg) | |||
7171 | wait_event_interruptible_timeout | 7197 | wait_event_interruptible_timeout |
7172 | (thread->wqueue, | 7198 | (thread->wqueue, |
7173 | test_bit(THREAD_WAKEUP, &thread->flags) | 7199 | test_bit(THREAD_WAKEUP, &thread->flags) |
7174 | || kthread_should_stop(), | 7200 | || kthread_should_stop() || kthread_should_park(), |
7175 | thread->timeout); | 7201 | thread->timeout); |
7176 | 7202 | ||
7177 | clear_bit(THREAD_WAKEUP, &thread->flags); | 7203 | clear_bit(THREAD_WAKEUP, &thread->flags); |
7204 | if (kthread_should_park()) | ||
7205 | kthread_parkme(); | ||
7178 | if (!kthread_should_stop()) | 7206 | if (!kthread_should_stop()) |
7179 | thread->run(thread); | 7207 | thread->run(thread); |
7180 | } | 7208 | } |
@@ -7588,8 +7616,8 @@ static const struct file_operations md_seq_fops = { | |||
7588 | 7616 | ||
7589 | int register_md_personality(struct md_personality *p) | 7617 | int register_md_personality(struct md_personality *p) |
7590 | { | 7618 | { |
7591 | printk(KERN_INFO "md: %s personality registered for level %d\n", | 7619 | pr_debug("md: %s personality registered for level %d\n", |
7592 | p->name, p->level); | 7620 | p->name, p->level); |
7593 | spin_lock(&pers_lock); | 7621 | spin_lock(&pers_lock); |
7594 | list_add_tail(&p->list, &pers_list); | 7622 | list_add_tail(&p->list, &pers_list); |
7595 | spin_unlock(&pers_lock); | 7623 | spin_unlock(&pers_lock); |
@@ -7599,7 +7627,7 @@ EXPORT_SYMBOL(register_md_personality); | |||
7599 | 7627 | ||
7600 | int unregister_md_personality(struct md_personality *p) | 7628 | int unregister_md_personality(struct md_personality *p) |
7601 | { | 7629 | { |
7602 | printk(KERN_INFO "md: %s personality unregistered\n", p->name); | 7630 | pr_debug("md: %s personality unregistered\n", p->name); |
7603 | spin_lock(&pers_lock); | 7631 | spin_lock(&pers_lock); |
7604 | list_del_init(&p->list); | 7632 | list_del_init(&p->list); |
7605 | spin_unlock(&pers_lock); | 7633 | spin_unlock(&pers_lock); |
@@ -7639,7 +7667,7 @@ int md_setup_cluster(struct mddev *mddev, int nodes) | |||
7639 | spin_lock(&pers_lock); | 7667 | spin_lock(&pers_lock); |
7640 | /* ensure module won't be unloaded */ | 7668 | /* ensure module won't be unloaded */ |
7641 | if (!md_cluster_ops || !try_module_get(md_cluster_mod)) { | 7669 | if (!md_cluster_ops || !try_module_get(md_cluster_mod)) { |
7642 | pr_err("can't find md-cluster module or get it's reference.\n"); | 7670 | pr_warn("can't find md-cluster module or get it's reference.\n"); |
7643 | spin_unlock(&pers_lock); | 7671 | spin_unlock(&pers_lock); |
7644 | return -ENOENT; | 7672 | return -ENOENT; |
7645 | } | 7673 | } |
@@ -7741,8 +7769,8 @@ void md_write_start(struct mddev *mddev, struct bio *bi) | |||
7741 | spin_lock(&mddev->lock); | 7769 | spin_lock(&mddev->lock); |
7742 | if (mddev->in_sync) { | 7770 | if (mddev->in_sync) { |
7743 | mddev->in_sync = 0; | 7771 | mddev->in_sync = 0; |
7744 | set_bit(MD_CHANGE_CLEAN, &mddev->flags); | 7772 | set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); |
7745 | set_bit(MD_CHANGE_PENDING, &mddev->flags); | 7773 | set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); |
7746 | md_wakeup_thread(mddev->thread); | 7774 | md_wakeup_thread(mddev->thread); |
7747 | did_change = 1; | 7775 | did_change = 1; |
7748 | } | 7776 | } |
@@ -7751,7 +7779,7 @@ void md_write_start(struct mddev *mddev, struct bio *bi) | |||
7751 | if (did_change) | 7779 | if (did_change) |
7752 | sysfs_notify_dirent_safe(mddev->sysfs_state); | 7780 | sysfs_notify_dirent_safe(mddev->sysfs_state); |
7753 | wait_event(mddev->sb_wait, | 7781 | wait_event(mddev->sb_wait, |
7754 | !test_bit(MD_CHANGE_PENDING, &mddev->flags)); | 7782 | !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); |
7755 | } | 7783 | } |
7756 | EXPORT_SYMBOL(md_write_start); | 7784 | EXPORT_SYMBOL(md_write_start); |
7757 | 7785 | ||
@@ -7772,7 +7800,7 @@ EXPORT_SYMBOL(md_write_end); | |||
7772 | * attempting a GFP_KERNEL allocation while holding the mddev lock. | 7800 | * attempting a GFP_KERNEL allocation while holding the mddev lock. |
7773 | * Must be called with mddev_lock held. | 7801 | * Must be called with mddev_lock held. |
7774 | * | 7802 | * |
7775 | * In the ->external case MD_CHANGE_PENDING can not be cleared until mddev->lock | 7803 | * In the ->external case MD_SB_CHANGE_PENDING can not be cleared until mddev->lock |
7776 | * is dropped, so return -EAGAIN after notifying userspace. | 7804 | * is dropped, so return -EAGAIN after notifying userspace. |
7777 | */ | 7805 | */ |
7778 | int md_allow_write(struct mddev *mddev) | 7806 | int md_allow_write(struct mddev *mddev) |
@@ -7787,8 +7815,8 @@ int md_allow_write(struct mddev *mddev) | |||
7787 | spin_lock(&mddev->lock); | 7815 | spin_lock(&mddev->lock); |
7788 | if (mddev->in_sync) { | 7816 | if (mddev->in_sync) { |
7789 | mddev->in_sync = 0; | 7817 | mddev->in_sync = 0; |
7790 | set_bit(MD_CHANGE_CLEAN, &mddev->flags); | 7818 | set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); |
7791 | set_bit(MD_CHANGE_PENDING, &mddev->flags); | 7819 | set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); |
7792 | if (mddev->safemode_delay && | 7820 | if (mddev->safemode_delay && |
7793 | mddev->safemode == 0) | 7821 | mddev->safemode == 0) |
7794 | mddev->safemode = 1; | 7822 | mddev->safemode = 1; |
@@ -7798,7 +7826,7 @@ int md_allow_write(struct mddev *mddev) | |||
7798 | } else | 7826 | } else |
7799 | spin_unlock(&mddev->lock); | 7827 | spin_unlock(&mddev->lock); |
7800 | 7828 | ||
7801 | if (test_bit(MD_CHANGE_PENDING, &mddev->flags)) | 7829 | if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) |
7802 | return -EAGAIN; | 7830 | return -EAGAIN; |
7803 | else | 7831 | else |
7804 | return 0; | 7832 | return 0; |
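The comment above spells out the contract: call md_allow_write() before doing a GFP_KERNEL allocation under the mddev lock, and treat -EAGAIN (external metadata, MD_SB_CHANGE_PENDING still set after notifying userspace) as "proceed or bail as appropriate". A minimal caller-side sketch, not from this patch; example_prealloc() and do_prealloc() are hypothetical:

    /* Sketch only: mark the array active before allocating under mddev_lock. */
    static int example_prealloc(struct mddev *mddev)         /* hypothetical */
    {
            int err = md_allow_write(mddev);

            if (err)        /* -EAGAIN: userspace notified, change still pending */
                    return err;
            return do_prealloc(mddev);      /* hypothetical GFP_KERNEL allocation step */
    }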
@@ -7914,11 +7942,9 @@ void md_do_sync(struct md_thread *thread) | |||
7914 | mddev2->curr_resync >= mddev->curr_resync) { | 7942 | mddev2->curr_resync >= mddev->curr_resync) { |
7915 | if (mddev2_minor != mddev2->md_minor) { | 7943 | if (mddev2_minor != mddev2->md_minor) { |
7916 | mddev2_minor = mddev2->md_minor; | 7944 | mddev2_minor = mddev2->md_minor; |
7917 | printk(KERN_INFO "md: delaying %s of %s" | 7945 | pr_info("md: delaying %s of %s until %s has finished (they share one or more physical units)\n", |
7918 | " until %s has finished (they" | 7946 | desc, mdname(mddev), |
7919 | " share one or more physical units)\n", | 7947 | mdname(mddev2)); |
7920 | desc, mdname(mddev), | ||
7921 | mdname(mddev2)); | ||
7922 | } | 7948 | } |
7923 | mddev_put(mddev2); | 7949 | mddev_put(mddev2); |
7924 | if (signal_pending(current)) | 7950 | if (signal_pending(current)) |
@@ -7975,12 +8001,10 @@ void md_do_sync(struct md_thread *thread) | |||
7975 | } | 8001 | } |
7976 | } | 8002 | } |
7977 | 8003 | ||
7978 | printk(KERN_INFO "md: %s of RAID array %s\n", desc, mdname(mddev)); | 8004 | pr_info("md: %s of RAID array %s\n", desc, mdname(mddev)); |
7979 | printk(KERN_INFO "md: minimum _guaranteed_ speed:" | 8005 | pr_debug("md: minimum _guaranteed_ speed: %d KB/sec/disk.\n", speed_min(mddev)); |
7980 | " %d KB/sec/disk.\n", speed_min(mddev)); | 8006 | pr_debug("md: using maximum available idle IO bandwidth (but not more than %d KB/sec) for %s.\n", |
7981 | printk(KERN_INFO "md: using maximum available idle IO bandwidth " | 8007 | speed_max(mddev), desc); |
7982 | "(but not more than %d KB/sec) for %s.\n", | ||
7983 | speed_max(mddev), desc); | ||
7984 | 8008 | ||
7985 | is_mddev_idle(mddev, 1); /* this initializes IO event counters */ | 8009 | is_mddev_idle(mddev, 1); /* this initializes IO event counters */ |
7986 | 8010 | ||
@@ -7997,16 +8021,15 @@ void md_do_sync(struct md_thread *thread) | |||
7997 | * Tune reconstruction: | 8021 | * Tune reconstruction: |
7998 | */ | 8022 | */ |
7999 | window = 32*(PAGE_SIZE/512); | 8023 | window = 32*(PAGE_SIZE/512); |
8000 | printk(KERN_INFO "md: using %dk window, over a total of %lluk.\n", | 8024 | pr_debug("md: using %dk window, over a total of %lluk.\n", |
8001 | window/2, (unsigned long long)max_sectors/2); | 8025 | window/2, (unsigned long long)max_sectors/2); |
8002 | 8026 | ||
8003 | atomic_set(&mddev->recovery_active, 0); | 8027 | atomic_set(&mddev->recovery_active, 0); |
8004 | last_check = 0; | 8028 | last_check = 0; |
8005 | 8029 | ||
8006 | if (j>2) { | 8030 | if (j>2) { |
8007 | printk(KERN_INFO | 8031 | pr_debug("md: resuming %s of %s from checkpoint.\n", |
8008 | "md: resuming %s of %s from checkpoint.\n", | 8032 | desc, mdname(mddev)); |
8009 | desc, mdname(mddev)); | ||
8010 | mddev->curr_resync = j; | 8033 | mddev->curr_resync = j; |
8011 | } else | 8034 | } else |
8012 | mddev->curr_resync = 3; /* no longer delayed */ | 8035 | mddev->curr_resync = 3; /* no longer delayed */ |
@@ -8038,7 +8061,7 @@ void md_do_sync(struct md_thread *thread) | |||
8038 | j > mddev->recovery_cp) | 8061 | j > mddev->recovery_cp) |
8039 | mddev->recovery_cp = j; | 8062 | mddev->recovery_cp = j; |
8040 | update_time = jiffies; | 8063 | update_time = jiffies; |
8041 | set_bit(MD_CHANGE_CLEAN, &mddev->flags); | 8064 | set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); |
8042 | sysfs_notify(&mddev->kobj, NULL, "sync_completed"); | 8065 | sysfs_notify(&mddev->kobj, NULL, "sync_completed"); |
8043 | } | 8066 | } |
8044 | 8067 | ||
@@ -8133,9 +8156,9 @@ void md_do_sync(struct md_thread *thread) | |||
8133 | } | 8156 | } |
8134 | } | 8157 | } |
8135 | } | 8158 | } |
8136 | printk(KERN_INFO "md: %s: %s %s.\n",mdname(mddev), desc, | 8159 | pr_info("md: %s: %s %s.\n",mdname(mddev), desc, |
8137 | test_bit(MD_RECOVERY_INTR, &mddev->recovery) | 8160 | test_bit(MD_RECOVERY_INTR, &mddev->recovery) |
8138 | ? "interrupted" : "done"); | 8161 | ? "interrupted" : "done"); |
8139 | /* | 8162 | /* |
8140 | * this also signals 'finished resyncing' to md_stop | 8163 | * this also signals 'finished resyncing' to md_stop |
8141 | */ | 8164 | */ |
@@ -8155,9 +8178,8 @@ void md_do_sync(struct md_thread *thread) | |||
8155 | if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { | 8178 | if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { |
8156 | if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { | 8179 | if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { |
8157 | if (mddev->curr_resync >= mddev->recovery_cp) { | 8180 | if (mddev->curr_resync >= mddev->recovery_cp) { |
8158 | printk(KERN_INFO | 8181 | pr_debug("md: checkpointing %s of %s.\n", |
8159 | "md: checkpointing %s of %s.\n", | 8182 | desc, mdname(mddev)); |
8160 | desc, mdname(mddev)); | ||
8161 | if (test_bit(MD_RECOVERY_ERROR, | 8183 | if (test_bit(MD_RECOVERY_ERROR, |
8162 | &mddev->recovery)) | 8184 | &mddev->recovery)) |
8163 | mddev->recovery_cp = | 8185 | mddev->recovery_cp = |
@@ -8187,8 +8209,8 @@ void md_do_sync(struct md_thread *thread) | |||
8187 | /* set CHANGE_PENDING here since maybe another update is needed, | 8209 | /* set CHANGE_PENDING here since maybe another update is needed, |
8188 | * so other nodes are informed. It should be harmless for normal | 8210 | * so other nodes are informed. It should be harmless for normal |
8189 | * raid */ | 8211 | * raid */ |
8190 | set_mask_bits(&mddev->flags, 0, | 8212 | set_mask_bits(&mddev->sb_flags, 0, |
8191 | BIT(MD_CHANGE_PENDING) | BIT(MD_CHANGE_DEVS)); | 8213 | BIT(MD_SB_CHANGE_PENDING) | BIT(MD_SB_CHANGE_DEVS)); |
8192 | 8214 | ||
8193 | spin_lock(&mddev->lock); | 8215 | spin_lock(&mddev->lock); |
8194 | if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { | 8216 | if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { |
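set_mask_bits(&mddev->sb_flags, 0, bits) is an atomic read-modify-write, so both superblock flags become visible to md_check_recovery() together rather than through two separate set_bit() calls. A rough equivalent of the call above, not from this patch; mark_sb_dirty() is a hypothetical helper:

    /* Sketch only: atomically OR several sb_flags bits, then kick the md thread. */
    static void mark_sb_dirty(struct mddev *mddev)           /* hypothetical */
    {
            set_mask_bits(&mddev->sb_flags, 0,
                          BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING));
            md_wakeup_thread(mddev->thread);
    }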
@@ -8288,12 +8310,12 @@ static int remove_and_add_spares(struct mddev *mddev, | |||
8288 | if (!test_bit(Journal, &rdev->flags)) | 8310 | if (!test_bit(Journal, &rdev->flags)) |
8289 | spares++; | 8311 | spares++; |
8290 | md_new_event(mddev); | 8312 | md_new_event(mddev); |
8291 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | 8313 | set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); |
8292 | } | 8314 | } |
8293 | } | 8315 | } |
8294 | no_add: | 8316 | no_add: |
8295 | if (removed) | 8317 | if (removed) |
8296 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | 8318 | set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); |
8297 | return spares; | 8319 | return spares; |
8298 | } | 8320 | } |
8299 | 8321 | ||
@@ -8305,8 +8327,8 @@ static void md_start_sync(struct work_struct *ws) | |||
8305 | mddev, | 8327 | mddev, |
8306 | "resync"); | 8328 | "resync"); |
8307 | if (!mddev->sync_thread) { | 8329 | if (!mddev->sync_thread) { |
8308 | printk(KERN_ERR "%s: could not start resync thread...\n", | 8330 | pr_warn("%s: could not start resync thread...\n", |
8309 | mdname(mddev)); | 8331 | mdname(mddev)); |
8310 | /* leave the spares where they are, it shouldn't hurt */ | 8332 | /* leave the spares where they are, it shouldn't hurt */ |
8311 | clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); | 8333 | clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); |
8312 | clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); | 8334 | clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); |
@@ -8356,8 +8378,8 @@ void md_check_recovery(struct mddev *mddev) | |||
8356 | 8378 | ||
8357 | if (signal_pending(current)) { | 8379 | if (signal_pending(current)) { |
8358 | if (mddev->pers->sync_request && !mddev->external) { | 8380 | if (mddev->pers->sync_request && !mddev->external) { |
8359 | printk(KERN_INFO "md: %s in immediate safe mode\n", | 8381 | pr_debug("md: %s in immediate safe mode\n", |
8360 | mdname(mddev)); | 8382 | mdname(mddev)); |
8361 | mddev->safemode = 2; | 8383 | mddev->safemode = 2; |
8362 | } | 8384 | } |
8363 | flush_signals(current); | 8385 | flush_signals(current); |
@@ -8366,7 +8388,7 @@ void md_check_recovery(struct mddev *mddev) | |||
8366 | if (mddev->ro && !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) | 8388 | if (mddev->ro && !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) |
8367 | return; | 8389 | return; |
8368 | if ( ! ( | 8390 | if ( ! ( |
8369 | (mddev->flags & MD_UPDATE_SB_FLAGS & ~ (1<<MD_CHANGE_PENDING)) || | 8391 | (mddev->sb_flags & ~ (1<<MD_SB_CHANGE_PENDING)) || |
8370 | test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || | 8392 | test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || |
8371 | test_bit(MD_RECOVERY_DONE, &mddev->recovery) || | 8393 | test_bit(MD_RECOVERY_DONE, &mddev->recovery) || |
8372 | test_bit(MD_RELOAD_SB, &mddev->flags) || | 8394 | test_bit(MD_RELOAD_SB, &mddev->flags) || |
@@ -8404,7 +8426,7 @@ void md_check_recovery(struct mddev *mddev) | |||
8404 | md_reap_sync_thread(mddev); | 8426 | md_reap_sync_thread(mddev); |
8405 | clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); | 8427 | clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); |
8406 | clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | 8428 | clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); |
8407 | clear_bit(MD_CHANGE_PENDING, &mddev->flags); | 8429 | clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); |
8408 | goto unlock; | 8430 | goto unlock; |
8409 | } | 8431 | } |
8410 | 8432 | ||
@@ -8432,7 +8454,7 @@ void md_check_recovery(struct mddev *mddev) | |||
8432 | mddev->recovery_cp == MaxSector) { | 8454 | mddev->recovery_cp == MaxSector) { |
8433 | mddev->in_sync = 1; | 8455 | mddev->in_sync = 1; |
8434 | did_change = 1; | 8456 | did_change = 1; |
8435 | set_bit(MD_CHANGE_CLEAN, &mddev->flags); | 8457 | set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); |
8436 | } | 8458 | } |
8437 | if (mddev->safemode == 1) | 8459 | if (mddev->safemode == 1) |
8438 | mddev->safemode = 0; | 8460 | mddev->safemode = 0; |
@@ -8441,7 +8463,7 @@ void md_check_recovery(struct mddev *mddev) | |||
8441 | sysfs_notify_dirent_safe(mddev->sysfs_state); | 8463 | sysfs_notify_dirent_safe(mddev->sysfs_state); |
8442 | } | 8464 | } |
8443 | 8465 | ||
8444 | if (mddev->flags & MD_UPDATE_SB_FLAGS) | 8466 | if (mddev->sb_flags) |
8445 | md_update_sb(mddev, 0); | 8467 | md_update_sb(mddev, 0); |
8446 | 8468 | ||
8447 | if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && | 8469 | if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && |
@@ -8537,7 +8559,7 @@ void md_reap_sync_thread(struct mddev *mddev) | |||
8537 | if (mddev->pers->spare_active(mddev)) { | 8559 | if (mddev->pers->spare_active(mddev)) { |
8538 | sysfs_notify(&mddev->kobj, NULL, | 8560 | sysfs_notify(&mddev->kobj, NULL, |
8539 | "degraded"); | 8561 | "degraded"); |
8540 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | 8562 | set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); |
8541 | } | 8563 | } |
8542 | } | 8564 | } |
8543 | if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && | 8565 | if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && |
@@ -8552,7 +8574,7 @@ void md_reap_sync_thread(struct mddev *mddev) | |||
8552 | rdev->saved_raid_disk = -1; | 8574 | rdev->saved_raid_disk = -1; |
8553 | 8575 | ||
8554 | md_update_sb(mddev, 1); | 8576 | md_update_sb(mddev, 1); |
8555 | /* MD_CHANGE_PENDING should be cleared by md_update_sb, so we can | 8577 | /* MD_SB_CHANGE_PENDING should be cleared by md_update_sb, so we can |
8556 | * call resync_finish here if MD_CLUSTER_RESYNC_LOCKED is set by | 8578 | * call resync_finish here if MD_CLUSTER_RESYNC_LOCKED is set by |
8557 | * clustered raid */ | 8579 | * clustered raid */ |
8558 | if (test_and_clear_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags)) | 8580 | if (test_and_clear_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags)) |
@@ -8614,9 +8636,12 @@ int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, | |||
8614 | rv = badblocks_set(&rdev->badblocks, s, sectors, 0); | 8636 | rv = badblocks_set(&rdev->badblocks, s, sectors, 0); |
8615 | if (rv == 0) { | 8637 | if (rv == 0) { |
8616 | /* Make sure they get written out promptly */ | 8638 | /* Make sure they get written out promptly */ |
8639 | if (test_bit(ExternalBbl, &rdev->flags)) | ||
8640 | sysfs_notify(&rdev->kobj, NULL, | ||
8641 | "unacknowledged_bad_blocks"); | ||
8617 | sysfs_notify_dirent_safe(rdev->sysfs_state); | 8642 | sysfs_notify_dirent_safe(rdev->sysfs_state); |
8618 | set_mask_bits(&mddev->flags, 0, | 8643 | set_mask_bits(&mddev->sb_flags, 0, |
8619 | BIT(MD_CHANGE_CLEAN) | BIT(MD_CHANGE_PENDING)); | 8644 | BIT(MD_SB_CHANGE_CLEAN) | BIT(MD_SB_CHANGE_PENDING)); |
8620 | md_wakeup_thread(rdev->mddev->thread); | 8645 | md_wakeup_thread(rdev->mddev->thread); |
8621 | return 1; | 8646 | return 1; |
8622 | } else | 8647 | } else |
@@ -8627,12 +8652,15 @@ EXPORT_SYMBOL_GPL(rdev_set_badblocks); | |||
8627 | int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, | 8652 | int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, |
8628 | int is_new) | 8653 | int is_new) |
8629 | { | 8654 | { |
8655 | int rv; | ||
8630 | if (is_new) | 8656 | if (is_new) |
8631 | s += rdev->new_data_offset; | 8657 | s += rdev->new_data_offset; |
8632 | else | 8658 | else |
8633 | s += rdev->data_offset; | 8659 | s += rdev->data_offset; |
8634 | return badblocks_clear(&rdev->badblocks, | 8660 | rv = badblocks_clear(&rdev->badblocks, s, sectors); |
8635 | s, sectors); | 8661 | if ((rv == 0) && test_bit(ExternalBbl, &rdev->flags)) |
8662 | sysfs_notify(&rdev->kobj, NULL, "bad_blocks"); | ||
8663 | return rv; | ||
8636 | } | 8664 | } |
8637 | EXPORT_SYMBOL_GPL(rdev_clear_badblocks); | 8665 | EXPORT_SYMBOL_GPL(rdev_clear_badblocks); |
8638 | 8666 | ||
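With the two hunks above, both directions of bad-block bookkeeping now poke sysfs when ExternalBbl is set, so an external metadata manager polling "unacknowledged_bad_blocks" and "bad_blocks" sees every transition. A rough sketch of the usual personality-side calling pattern, not from this patch; example_bbl_update() is hypothetical:

    /* Sketch only: record a failed write range, or clear it after a good rewrite. */
    static void example_bbl_update(struct md_rdev *rdev, sector_t s, int sectors,
                                   bool write_failed)        /* hypothetical */
    {
            if (write_failed) {
                    if (!rdev_set_badblocks(rdev, s, sectors, 0))
                            md_error(rdev->mddev, rdev);     /* could not record the range */
            } else {
                    rdev_clear_badblocks(rdev, s, sectors, 0);
            }
    }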
@@ -8749,7 +8777,7 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev) | |||
8749 | rdev2->saved_raid_disk = role; | 8777 | rdev2->saved_raid_disk = role; |
8750 | ret = remove_and_add_spares(mddev, rdev2); | 8778 | ret = remove_and_add_spares(mddev, rdev2); |
8751 | pr_info("Activated spare: %s\n", | 8779 | pr_info("Activated spare: %s\n", |
8752 | bdevname(rdev2->bdev,b)); | 8780 | bdevname(rdev2->bdev,b)); |
8753 | /* wakeup mddev->thread here, so array could | 8781 | /* wakeup mddev->thread here, so array could |
8754 | * perform resync with the new activated disk */ | 8782 | * perform resync with the new activated disk */ |
8755 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | 8783 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); |
@@ -8785,15 +8813,18 @@ static int read_rdev(struct mddev *mddev, struct md_rdev *rdev) | |||
8785 | * variable in case we err in the future | 8813 | * variable in case we err in the future |
8786 | */ | 8814 | */ |
8787 | rdev->sb_page = NULL; | 8815 | rdev->sb_page = NULL; |
8788 | alloc_disk_sb(rdev); | 8816 | err = alloc_disk_sb(rdev); |
8789 | ClearPageUptodate(rdev->sb_page); | 8817 | if (err == 0) { |
8790 | rdev->sb_loaded = 0; | 8818 | ClearPageUptodate(rdev->sb_page); |
8791 | err = super_types[mddev->major_version].load_super(rdev, NULL, mddev->minor_version); | 8819 | rdev->sb_loaded = 0; |
8792 | 8820 | err = super_types[mddev->major_version]. | |
8821 | load_super(rdev, NULL, mddev->minor_version); | ||
8822 | } | ||
8793 | if (err < 0) { | 8823 | if (err < 0) { |
8794 | pr_warn("%s: %d Could not reload rdev(%d) err: %d. Restoring old values\n", | 8824 | pr_warn("%s: %d Could not reload rdev(%d) err: %d. Restoring old values\n", |
8795 | __func__, __LINE__, rdev->desc_nr, err); | 8825 | __func__, __LINE__, rdev->desc_nr, err); |
8796 | put_page(rdev->sb_page); | 8826 | if (rdev->sb_page) |
8827 | put_page(rdev->sb_page); | ||
8797 | rdev->sb_page = swapout; | 8828 | rdev->sb_page = swapout; |
8798 | rdev->sb_loaded = 1; | 8829 | rdev->sb_loaded = 1; |
8799 | return err; | 8830 | return err; |
@@ -8871,9 +8902,6 @@ void md_autodetect_dev(dev_t dev) | |||
8871 | mutex_lock(&detected_devices_mutex); | 8902 | mutex_lock(&detected_devices_mutex); |
8872 | list_add_tail(&node_detected_dev->list, &all_detected_devices); | 8903 | list_add_tail(&node_detected_dev->list, &all_detected_devices); |
8873 | mutex_unlock(&detected_devices_mutex); | 8904 | mutex_unlock(&detected_devices_mutex); |
8874 | } else { | ||
8875 | printk(KERN_CRIT "md: md_autodetect_dev: kzalloc failed" | ||
8876 | ", skipping dev(%d,%d)\n", MAJOR(dev), MINOR(dev)); | ||
8877 | } | 8905 | } |
8878 | } | 8906 | } |
8879 | 8907 | ||
@@ -8887,7 +8915,7 @@ static void autostart_arrays(int part) | |||
8887 | i_scanned = 0; | 8915 | i_scanned = 0; |
8888 | i_passed = 0; | 8916 | i_passed = 0; |
8889 | 8917 | ||
8890 | printk(KERN_INFO "md: Autodetecting RAID arrays.\n"); | 8918 | pr_info("md: Autodetecting RAID arrays.\n"); |
8891 | 8919 | ||
8892 | mutex_lock(&detected_devices_mutex); | 8920 | mutex_lock(&detected_devices_mutex); |
8893 | while (!list_empty(&all_detected_devices) && i_scanned < INT_MAX) { | 8921 | while (!list_empty(&all_detected_devices) && i_scanned < INT_MAX) { |
@@ -8912,8 +8940,7 @@ static void autostart_arrays(int part) | |||
8912 | } | 8940 | } |
8913 | mutex_unlock(&detected_devices_mutex); | 8941 | mutex_unlock(&detected_devices_mutex); |
8914 | 8942 | ||
8915 | printk(KERN_INFO "md: Scanned %d and added %d devices.\n", | 8943 | pr_debug("md: Scanned %d and added %d devices.\n", i_scanned, i_passed); |
8916 | i_scanned, i_passed); | ||
8917 | 8944 | ||
8918 | autorun_devices(part); | 8945 | autorun_devices(part); |
8919 | } | 8946 | } |
diff --git a/drivers/md/md.h b/drivers/md/md.h index 2b2041773e79..e38936d05df1 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h | |||
@@ -30,6 +30,16 @@ | |||
30 | #define MaxSector (~(sector_t)0) | 30 | #define MaxSector (~(sector_t)0) |
31 | 31 | ||
32 | /* | 32 | /* |
33 | * These flags should really be called "NO_RETRY" rather than | ||
34 | * "FAILFAST" because they don't make any promise about time lapse, | ||
35 | * only about the number of retries, which will be zero. | ||
36 | * REQ_FAILFAST_DRIVER is not included because | ||
37 | * Commit: 4a27446f3e39 ("[SCSI] modify scsi to handle new fail fast flags.") | ||
38 | * seems to suggest that the errors it avoids retrying should usually | ||
39 | * be retried. | ||
40 | */ | ||
41 | #define MD_FAILFAST (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT) | ||
42 | /* | ||
33 | * MD's 'extended' device | 43 | * MD's 'extended' device |
34 | */ | 44 | */ |
35 | struct md_rdev { | 45 | struct md_rdev { |
@@ -168,6 +178,19 @@ enum flag_bits { | |||
168 | * so it is safe to remove without | 178 | * so it is safe to remove without |
169 | * another synchronize_rcu() call. | 179 | * another synchronize_rcu() call. |
170 | */ | 180 | */ |
181 | ExternalBbl, /* External metadata provides bad | ||
182 | * block management for a disk | ||
183 | */ | ||
184 | FailFast, /* Minimal retries should be attempted on | ||
185 | * this device, so use REQ_FAILFAST_DEV. | ||
186 | * Also don't try to repair failed reads. | ||
187 | * It is expected that no bad block log | ||

188 | * is present. | ||
189 | */ | ||
190 | LastDev, /* Seems to be the last working dev as | ||
191 | * it didn't fail, so don't use FailFast | ||
192 | * any more for metadata | ||
193 | */ | ||
171 | }; | 194 | }; |
172 | 195 | ||
173 | static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors, | 196 | static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors, |
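MD_FAILFAST and the FailFast/LastDev rdev bits added above follow one rule throughout the series: only request fail-fast I/O when there is somewhere else to go on failure. A minimal sketch of that pattern, not from this patch; example_submit_read() and have_other_copy are hypothetical:

    /* Sketch only: OR MD_FAILFAST into a read only when another copy exists. */
    static void example_submit_read(struct md_rdev *rdev, struct bio *bio,
                                    bool have_other_copy)    /* hypothetical */
    {
            bio->bi_bdev = rdev->bdev;
            if (test_bit(FailFast, &rdev->flags) && have_other_copy)
                    bio->bi_opf |= MD_FAILFAST;
            generic_make_request(bio);
    }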
@@ -189,6 +212,31 @@ extern int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, | |||
189 | int is_new); | 212 | int is_new); |
190 | struct md_cluster_info; | 213 | struct md_cluster_info; |
191 | 214 | ||
215 | enum mddev_flags { | ||
216 | MD_ARRAY_FIRST_USE, /* First use of array, needs initialization */ | ||
217 | MD_CLOSING, /* If set, we are closing the array, do not open | ||
218 | * it then */ | ||
219 | MD_JOURNAL_CLEAN, /* A raid with journal is already clean */ | ||
220 | MD_HAS_JOURNAL, /* The raid array has journal feature set */ | ||
221 | MD_RELOAD_SB, /* Reload the superblock because another node | ||
222 | * updated it. | ||
223 | */ | ||
224 | MD_CLUSTER_RESYNC_LOCKED, /* cluster raid only, which means node | ||
225 | * already took resync lock, need to | ||
226 | * release the lock */ | ||
227 | MD_FAILFAST_SUPPORTED, /* Using MD_FAILFAST on metadata writes is | ||
228 | * supported as calls to md_error() will | ||
229 | * never cause the array to become failed. | ||
230 | */ | ||
231 | }; | ||
232 | |||
233 | enum mddev_sb_flags { | ||
234 | MD_SB_CHANGE_DEVS, /* Some device status has changed */ | ||
235 | MD_SB_CHANGE_CLEAN, /* transition to or from 'clean' */ | ||
236 | MD_SB_CHANGE_PENDING, /* switch from 'clean' to 'active' in progress */ | ||
237 | MD_SB_NEED_REWRITE, /* metadata write needs to be repeated */ | ||
238 | }; | ||
239 | |||
192 | struct mddev { | 240 | struct mddev { |
193 | void *private; | 241 | void *private; |
194 | struct md_personality *pers; | 242 | struct md_personality *pers; |
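Splitting superblock state out of mddev->flags into its own mddev->sb_flags word removes the old MD_UPDATE_SB_FLAGS masking: "does the superblock need writing" becomes a plain non-zero test. Sketch, not from this patch; example_sb_needs_update() is hypothetical:

    /* Sketch only: any bit set in sb_flags means md_update_sb() has work to do. */
    static inline bool example_sb_needs_update(struct mddev *mddev)  /* hypothetical */
    {
            return READ_ONCE(mddev->sb_flags) != 0;
    }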
@@ -196,21 +244,7 @@ struct mddev { | |||
196 | int md_minor; | 244 | int md_minor; |
197 | struct list_head disks; | 245 | struct list_head disks; |
198 | unsigned long flags; | 246 | unsigned long flags; |
199 | #define MD_CHANGE_DEVS 0 /* Some device status has changed */ | 247 | unsigned long sb_flags; |
200 | #define MD_CHANGE_CLEAN 1 /* transition to or from 'clean' */ | ||
201 | #define MD_CHANGE_PENDING 2 /* switch from 'clean' to 'active' in progress */ | ||
202 | #define MD_UPDATE_SB_FLAGS (1 | 2 | 4) /* If these are set, md_update_sb needed */ | ||
203 | #define MD_ARRAY_FIRST_USE 3 /* First use of array, needs initialization */ | ||
204 | #define MD_CLOSING 4 /* If set, we are closing the array, do not open | ||
205 | * it then */ | ||
206 | #define MD_JOURNAL_CLEAN 5 /* A raid with journal is already clean */ | ||
207 | #define MD_HAS_JOURNAL 6 /* The raid array has journal feature set */ | ||
208 | #define MD_RELOAD_SB 7 /* Reload the superblock because another node | ||
209 | * updated it. | ||
210 | */ | ||
211 | #define MD_CLUSTER_RESYNC_LOCKED 8 /* cluster raid only, which means node | ||
212 | * already took resync lock, need to | ||
213 | * release the lock */ | ||
214 | 248 | ||
215 | int suspended; | 249 | int suspended; |
216 | atomic_t active_io; | 250 | atomic_t active_io; |
@@ -304,31 +338,6 @@ struct mddev { | |||
304 | int parallel_resync; | 338 | int parallel_resync; |
305 | 339 | ||
306 | int ok_start_degraded; | 340 | int ok_start_degraded; |
307 | /* recovery/resync flags | ||
308 | * NEEDED: we might need to start a resync/recover | ||
309 | * RUNNING: a thread is running, or about to be started | ||
310 | * SYNC: actually doing a resync, not a recovery | ||
311 | * RECOVER: doing recovery, or need to try it. | ||
312 | * INTR: resync needs to be aborted for some reason | ||
313 | * DONE: thread is done and is waiting to be reaped | ||
314 | * REQUEST: user-space has requested a sync (used with SYNC) | ||
315 | * CHECK: user-space request for check-only, no repair | ||
316 | * RESHAPE: A reshape is happening | ||
317 | * ERROR: sync-action interrupted because io-error | ||
318 | * | ||
319 | * If neither SYNC or RESHAPE are set, then it is a recovery. | ||
320 | */ | ||
321 | #define MD_RECOVERY_RUNNING 0 | ||
322 | #define MD_RECOVERY_SYNC 1 | ||
323 | #define MD_RECOVERY_RECOVER 2 | ||
324 | #define MD_RECOVERY_INTR 3 | ||
325 | #define MD_RECOVERY_DONE 4 | ||
326 | #define MD_RECOVERY_NEEDED 5 | ||
327 | #define MD_RECOVERY_REQUESTED 6 | ||
328 | #define MD_RECOVERY_CHECK 7 | ||
329 | #define MD_RECOVERY_RESHAPE 8 | ||
330 | #define MD_RECOVERY_FROZEN 9 | ||
331 | #define MD_RECOVERY_ERROR 10 | ||
332 | 341 | ||
333 | unsigned long recovery; | 342 | unsigned long recovery; |
334 | /* If a RAID personality determines that recovery (of a particular | 343 | /* If a RAID personality determines that recovery (of a particular |
@@ -442,6 +451,23 @@ struct mddev { | |||
442 | unsigned int good_device_nr; /* good device num within cluster raid */ | 451 | unsigned int good_device_nr; /* good device num within cluster raid */ |
443 | }; | 452 | }; |
444 | 453 | ||
454 | enum recovery_flags { | ||
455 | /* | ||
456 | * If neither SYNC nor RESHAPE is set, then it is a recovery. | ||

457 | */ | ||
458 | MD_RECOVERY_RUNNING, /* a thread is running, or about to be started */ | ||
459 | MD_RECOVERY_SYNC, /* actually doing a resync, not a recovery */ | ||
460 | MD_RECOVERY_RECOVER, /* doing recovery, or need to try it. */ | ||
461 | MD_RECOVERY_INTR, /* resync needs to be aborted for some reason */ | ||
462 | MD_RECOVERY_DONE, /* thread is done and is waiting to be reaped */ | ||
463 | MD_RECOVERY_NEEDED, /* we might need to start a resync/recover */ | ||
464 | MD_RECOVERY_REQUESTED, /* user-space has requested a sync (used with SYNC) */ | ||
465 | MD_RECOVERY_CHECK, /* user-space request for check-only, no repair */ | ||
466 | MD_RECOVERY_RESHAPE, /* A reshape is happening */ | ||
467 | MD_RECOVERY_FROZEN, /* User request to abort, and not restart, any action */ | ||
468 | MD_RECOVERY_ERROR, /* sync-action interrupted because io-error */ | ||
469 | }; | ||
470 | |||
445 | static inline int __must_check mddev_lock(struct mddev *mddev) | 471 | static inline int __must_check mddev_lock(struct mddev *mddev) |
446 | { | 472 | { |
447 | return mutex_lock_interruptible(&mddev->reconfig_mutex); | 473 | return mutex_lock_interruptible(&mddev->reconfig_mutex); |
@@ -623,7 +649,7 @@ extern int mddev_congested(struct mddev *mddev, int bits); | |||
623 | extern void md_flush_request(struct mddev *mddev, struct bio *bio); | 649 | extern void md_flush_request(struct mddev *mddev, struct bio *bio); |
624 | extern void md_super_write(struct mddev *mddev, struct md_rdev *rdev, | 650 | extern void md_super_write(struct mddev *mddev, struct md_rdev *rdev, |
625 | sector_t sector, int size, struct page *page); | 651 | sector_t sector, int size, struct page *page); |
626 | extern void md_super_wait(struct mddev *mddev); | 652 | extern int md_super_wait(struct mddev *mddev); |
627 | extern int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, | 653 | extern int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, |
628 | struct page *page, int op, int op_flags, | 654 | struct page *page, int op, int op_flags, |
629 | bool metadata_op); | 655 | bool metadata_op); |
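md_super_wait() changing from void to int lets metadata writers learn that a fail-fast superblock write was dropped and must be repeated (see MD_SB_NEED_REWRITE above). A rough caller-side sketch, not from this patch; example_write_sb() is hypothetical:

    /* Sketch only: resubmit the superblock write until md_super_wait() succeeds. */
    static void example_write_sb(struct mddev *mddev, struct md_rdev *rdev,
                                 sector_t sector, int size, struct page *page)  /* hypothetical */
    {
            do {
                    md_super_write(mddev, rdev, sector, size, page);
            } while (md_super_wait(mddev) < 0);      /* < 0: rewrite requested */
    }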
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index 4da06d813b8f..aa8c4e5c1ee2 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c | |||
@@ -52,7 +52,7 @@ static int multipath_map (struct mpconf *conf) | |||
52 | } | 52 | } |
53 | rcu_read_unlock(); | 53 | rcu_read_unlock(); |
54 | 54 | ||
55 | printk(KERN_ERR "multipath_map(): no more operational IO paths?\n"); | 55 | pr_crit_ratelimited("multipath_map(): no more operational IO paths?\n"); |
56 | return (-1); | 56 | return (-1); |
57 | } | 57 | } |
58 | 58 | ||
@@ -97,9 +97,9 @@ static void multipath_end_request(struct bio *bio) | |||
97 | */ | 97 | */ |
98 | char b[BDEVNAME_SIZE]; | 98 | char b[BDEVNAME_SIZE]; |
99 | md_error (mp_bh->mddev, rdev); | 99 | md_error (mp_bh->mddev, rdev); |
100 | printk(KERN_ERR "multipath: %s: rescheduling sector %llu\n", | 100 | pr_info("multipath: %s: rescheduling sector %llu\n", |
101 | bdevname(rdev->bdev,b), | 101 | bdevname(rdev->bdev,b), |
102 | (unsigned long long)bio->bi_iter.bi_sector); | 102 | (unsigned long long)bio->bi_iter.bi_sector); |
103 | multipath_reschedule_retry(mp_bh); | 103 | multipath_reschedule_retry(mp_bh); |
104 | } else | 104 | } else |
105 | multipath_end_bh_io(mp_bh, bio->bi_error); | 105 | multipath_end_bh_io(mp_bh, bio->bi_error); |
@@ -194,8 +194,7 @@ static void multipath_error (struct mddev *mddev, struct md_rdev *rdev) | |||
194 | * first check if this is a queued request for a device | 194 | * first check if this is a queued request for a device |
195 | * which has just failed. | 195 | * which has just failed. |
196 | */ | 196 | */ |
197 | printk(KERN_ALERT | 197 | pr_warn("multipath: only one IO path left and IO error.\n"); |
198 | "multipath: only one IO path left and IO error.\n"); | ||
199 | /* leave it active... it's all we have */ | 198 | /* leave it active... it's all we have */ |
200 | return; | 199 | return; |
201 | } | 200 | } |
@@ -209,11 +208,9 @@ static void multipath_error (struct mddev *mddev, struct md_rdev *rdev) | |||
209 | spin_unlock_irqrestore(&conf->device_lock, flags); | 208 | spin_unlock_irqrestore(&conf->device_lock, flags); |
210 | } | 209 | } |
211 | set_bit(Faulty, &rdev->flags); | 210 | set_bit(Faulty, &rdev->flags); |
212 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | 211 | set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); |
213 | printk(KERN_ALERT "multipath: IO failure on %s," | 212 | pr_err("multipath: IO failure on %s, disabling IO path.\n" |
214 | " disabling IO path.\n" | 213 | "multipath: Operation continuing on %d IO paths.\n", |
215 | "multipath: Operation continuing" | ||
216 | " on %d IO paths.\n", | ||
217 | bdevname(rdev->bdev, b), | 214 | bdevname(rdev->bdev, b), |
218 | conf->raid_disks - mddev->degraded); | 215 | conf->raid_disks - mddev->degraded); |
219 | } | 216 | } |
@@ -223,21 +220,21 @@ static void print_multipath_conf (struct mpconf *conf) | |||
223 | int i; | 220 | int i; |
224 | struct multipath_info *tmp; | 221 | struct multipath_info *tmp; |
225 | 222 | ||
226 | printk("MULTIPATH conf printout:\n"); | 223 | pr_debug("MULTIPATH conf printout:\n"); |
227 | if (!conf) { | 224 | if (!conf) { |
228 | printk("(conf==NULL)\n"); | 225 | pr_debug("(conf==NULL)\n"); |
229 | return; | 226 | return; |
230 | } | 227 | } |
231 | printk(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded, | 228 | pr_debug(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded, |
232 | conf->raid_disks); | 229 | conf->raid_disks); |
233 | 230 | ||
234 | for (i = 0; i < conf->raid_disks; i++) { | 231 | for (i = 0; i < conf->raid_disks; i++) { |
235 | char b[BDEVNAME_SIZE]; | 232 | char b[BDEVNAME_SIZE]; |
236 | tmp = conf->multipaths + i; | 233 | tmp = conf->multipaths + i; |
237 | if (tmp->rdev) | 234 | if (tmp->rdev) |
238 | printk(" disk%d, o:%d, dev:%s\n", | 235 | pr_debug(" disk%d, o:%d, dev:%s\n", |
239 | i,!test_bit(Faulty, &tmp->rdev->flags), | 236 | i,!test_bit(Faulty, &tmp->rdev->flags), |
240 | bdevname(tmp->rdev->bdev,b)); | 237 | bdevname(tmp->rdev->bdev,b)); |
241 | } | 238 | } |
242 | } | 239 | } |
243 | 240 | ||
@@ -292,8 +289,7 @@ static int multipath_remove_disk(struct mddev *mddev, struct md_rdev *rdev) | |||
292 | if (rdev == p->rdev) { | 289 | if (rdev == p->rdev) { |
293 | if (test_bit(In_sync, &rdev->flags) || | 290 | if (test_bit(In_sync, &rdev->flags) || |
294 | atomic_read(&rdev->nr_pending)) { | 291 | atomic_read(&rdev->nr_pending)) { |
295 | printk(KERN_ERR "hot-remove-disk, slot %d is identified" | 292 | pr_warn("hot-remove-disk, slot %d is identified but is still operational!\n", number); |
296 | " but is still operational!\n", number); | ||
297 | err = -EBUSY; | 293 | err = -EBUSY; |
298 | goto abort; | 294 | goto abort; |
299 | } | 295 | } |
@@ -346,16 +342,14 @@ static void multipathd(struct md_thread *thread) | |||
346 | bio->bi_iter.bi_sector = mp_bh->master_bio->bi_iter.bi_sector; | 342 | bio->bi_iter.bi_sector = mp_bh->master_bio->bi_iter.bi_sector; |
347 | 343 | ||
348 | if ((mp_bh->path = multipath_map (conf))<0) { | 344 | if ((mp_bh->path = multipath_map (conf))<0) { |
349 | printk(KERN_ALERT "multipath: %s: unrecoverable IO read" | 345 | pr_err("multipath: %s: unrecoverable IO read error for block %llu\n", |
350 | " error for block %llu\n", | 346 | bdevname(bio->bi_bdev,b), |
351 | bdevname(bio->bi_bdev,b), | 347 | (unsigned long long)bio->bi_iter.bi_sector); |
352 | (unsigned long long)bio->bi_iter.bi_sector); | ||
353 | multipath_end_bh_io(mp_bh, -EIO); | 348 | multipath_end_bh_io(mp_bh, -EIO); |
354 | } else { | 349 | } else { |
355 | printk(KERN_ERR "multipath: %s: redirecting sector %llu" | 350 | pr_err("multipath: %s: redirecting sector %llu to another IO path\n", |
356 | " to another IO path\n", | 351 | bdevname(bio->bi_bdev,b), |
357 | bdevname(bio->bi_bdev,b), | 352 | (unsigned long long)bio->bi_iter.bi_sector); |
358 | (unsigned long long)bio->bi_iter.bi_sector); | ||
359 | *bio = *(mp_bh->master_bio); | 353 | *bio = *(mp_bh->master_bio); |
360 | bio->bi_iter.bi_sector += | 354 | bio->bi_iter.bi_sector += |
361 | conf->multipaths[mp_bh->path].rdev->data_offset; | 355 | conf->multipaths[mp_bh->path].rdev->data_offset; |
@@ -389,8 +383,8 @@ static int multipath_run (struct mddev *mddev) | |||
389 | return -EINVAL; | 383 | return -EINVAL; |
390 | 384 | ||
391 | if (mddev->level != LEVEL_MULTIPATH) { | 385 | if (mddev->level != LEVEL_MULTIPATH) { |
392 | printk("multipath: %s: raid level not set to multipath IO (%d)\n", | 386 | pr_warn("multipath: %s: raid level not set to multipath IO (%d)\n", |
393 | mdname(mddev), mddev->level); | 387 | mdname(mddev), mddev->level); |
394 | goto out; | 388 | goto out; |
395 | } | 389 | } |
396 | /* | 390 | /* |
@@ -401,21 +395,13 @@ static int multipath_run (struct mddev *mddev) | |||
401 | 395 | ||
402 | conf = kzalloc(sizeof(struct mpconf), GFP_KERNEL); | 396 | conf = kzalloc(sizeof(struct mpconf), GFP_KERNEL); |
403 | mddev->private = conf; | 397 | mddev->private = conf; |
404 | if (!conf) { | 398 | if (!conf) |
405 | printk(KERN_ERR | ||
406 | "multipath: couldn't allocate memory for %s\n", | ||
407 | mdname(mddev)); | ||
408 | goto out; | 399 | goto out; |
409 | } | ||
410 | 400 | ||
411 | conf->multipaths = kzalloc(sizeof(struct multipath_info)*mddev->raid_disks, | 401 | conf->multipaths = kzalloc(sizeof(struct multipath_info)*mddev->raid_disks, |
412 | GFP_KERNEL); | 402 | GFP_KERNEL); |
413 | if (!conf->multipaths) { | 403 | if (!conf->multipaths) |
414 | printk(KERN_ERR | ||
415 | "multipath: couldn't allocate memory for %s\n", | ||
416 | mdname(mddev)); | ||
417 | goto out_free_conf; | 404 | goto out_free_conf; |
418 | } | ||
419 | 405 | ||
420 | working_disks = 0; | 406 | working_disks = 0; |
421 | rdev_for_each(rdev, mddev) { | 407 | rdev_for_each(rdev, mddev) { |
@@ -439,7 +425,7 @@ static int multipath_run (struct mddev *mddev) | |||
439 | INIT_LIST_HEAD(&conf->retry_list); | 425 | INIT_LIST_HEAD(&conf->retry_list); |
440 | 426 | ||
441 | if (!working_disks) { | 427 | if (!working_disks) { |
442 | printk(KERN_ERR "multipath: no operational IO paths for %s\n", | 428 | pr_warn("multipath: no operational IO paths for %s\n", |
443 | mdname(mddev)); | 429 | mdname(mddev)); |
444 | goto out_free_conf; | 430 | goto out_free_conf; |
445 | } | 431 | } |
@@ -447,27 +433,17 @@ static int multipath_run (struct mddev *mddev) | |||
447 | 433 | ||
448 | conf->pool = mempool_create_kmalloc_pool(NR_RESERVED_BUFS, | 434 | conf->pool = mempool_create_kmalloc_pool(NR_RESERVED_BUFS, |
449 | sizeof(struct multipath_bh)); | 435 | sizeof(struct multipath_bh)); |
450 | if (conf->pool == NULL) { | 436 | if (conf->pool == NULL) |
451 | printk(KERN_ERR | ||
452 | "multipath: couldn't allocate memory for %s\n", | ||
453 | mdname(mddev)); | ||
454 | goto out_free_conf; | 437 | goto out_free_conf; |
455 | } | ||
456 | 438 | ||
457 | { | 439 | mddev->thread = md_register_thread(multipathd, mddev, |
458 | mddev->thread = md_register_thread(multipathd, mddev, | 440 | "multipath"); |
459 | "multipath"); | 441 | if (!mddev->thread) |
460 | if (!mddev->thread) { | 442 | goto out_free_conf; |
461 | printk(KERN_ERR "multipath: couldn't allocate thread" | ||
462 | " for %s\n", mdname(mddev)); | ||
463 | goto out_free_conf; | ||
464 | } | ||
465 | } | ||
466 | 443 | ||
467 | printk(KERN_INFO | 444 | pr_info("multipath: array %s active with %d out of %d IO paths\n", |
468 | "multipath: array %s active with %d out of %d IO paths\n", | ||
469 | mdname(mddev), conf->raid_disks - mddev->degraded, | 445 | mdname(mddev), conf->raid_disks - mddev->degraded, |
470 | mddev->raid_disks); | 446 | mddev->raid_disks); |
471 | /* | 447 | /* |
472 | * Ok, everything is just fine now | 448 | * Ok, everything is just fine now |
473 | */ | 449 | */ |
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 258986a2699d..a162fedeb51a 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c | |||
@@ -21,6 +21,7 @@ | |||
21 | #include <linux/seq_file.h> | 21 | #include <linux/seq_file.h> |
22 | #include <linux/module.h> | 22 | #include <linux/module.h> |
23 | #include <linux/slab.h> | 23 | #include <linux/slab.h> |
24 | #include <trace/events/block.h> | ||
24 | #include "md.h" | 25 | #include "md.h" |
25 | #include "raid0.h" | 26 | #include "raid0.h" |
26 | #include "raid5.h" | 27 | #include "raid5.h" |
@@ -51,20 +52,21 @@ static void dump_zones(struct mddev *mddev) | |||
51 | char b[BDEVNAME_SIZE]; | 52 | char b[BDEVNAME_SIZE]; |
52 | struct r0conf *conf = mddev->private; | 53 | struct r0conf *conf = mddev->private; |
53 | int raid_disks = conf->strip_zone[0].nb_dev; | 54 | int raid_disks = conf->strip_zone[0].nb_dev; |
54 | printk(KERN_INFO "md: RAID0 configuration for %s - %d zone%s\n", | 55 | pr_debug("md: RAID0 configuration for %s - %d zone%s\n", |
55 | mdname(mddev), | 56 | mdname(mddev), |
56 | conf->nr_strip_zones, conf->nr_strip_zones==1?"":"s"); | 57 | conf->nr_strip_zones, conf->nr_strip_zones==1?"":"s"); |
57 | for (j = 0; j < conf->nr_strip_zones; j++) { | 58 | for (j = 0; j < conf->nr_strip_zones; j++) { |
58 | printk(KERN_INFO "md: zone%d=[", j); | 59 | char line[200]; |
60 | int len = 0; | ||
61 | |||
59 | for (k = 0; k < conf->strip_zone[j].nb_dev; k++) | 62 | for (k = 0; k < conf->strip_zone[j].nb_dev; k++) |
60 | printk(KERN_CONT "%s%s", k?"/":"", | 63 | len += snprintf(line+len, 200-len, "%s%s", k?"/":"", |
61 | bdevname(conf->devlist[j*raid_disks | 64 | bdevname(conf->devlist[j*raid_disks |
62 | + k]->bdev, b)); | 65 | + k]->bdev, b)); |
63 | printk(KERN_CONT "]\n"); | 66 | pr_debug("md: zone%d=[%s]\n", j, line); |
64 | 67 | ||
65 | zone_size = conf->strip_zone[j].zone_end - zone_start; | 68 | zone_size = conf->strip_zone[j].zone_end - zone_start; |
66 | printk(KERN_INFO " zone-offset=%10lluKB, " | 69 | pr_debug(" zone-offset=%10lluKB, device-offset=%10lluKB, size=%10lluKB\n", |
67 | "device-offset=%10lluKB, size=%10lluKB\n", | ||
68 | (unsigned long long)zone_start>>1, | 70 | (unsigned long long)zone_start>>1, |
69 | (unsigned long long)conf->strip_zone[j].dev_start>>1, | 71 | (unsigned long long)conf->strip_zone[j].dev_start>>1, |
70 | (unsigned long long)zone_size>>1); | 72 | (unsigned long long)zone_size>>1); |
@@ -142,9 +144,9 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) | |||
142 | * chunk size is a multiple of that sector size | 144 | * chunk size is a multiple of that sector size |
143 | */ | 145 | */ |
144 | if ((mddev->chunk_sectors << 9) % blksize) { | 146 | if ((mddev->chunk_sectors << 9) % blksize) { |
145 | printk(KERN_ERR "md/raid0:%s: chunk_size of %d not multiple of block size %d\n", | 147 | pr_warn("md/raid0:%s: chunk_size of %d not multiple of block size %d\n", |
146 | mdname(mddev), | 148 | mdname(mddev), |
147 | mddev->chunk_sectors << 9, blksize); | 149 | mddev->chunk_sectors << 9, blksize); |
148 | err = -EINVAL; | 150 | err = -EINVAL; |
149 | goto abort; | 151 | goto abort; |
150 | } | 152 | } |
@@ -186,19 +188,18 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) | |||
186 | } | 188 | } |
187 | 189 | ||
188 | if (j < 0) { | 190 | if (j < 0) { |
189 | printk(KERN_ERR | 191 | pr_warn("md/raid0:%s: remove inactive devices before converting to RAID0\n", |
190 | "md/raid0:%s: remove inactive devices before converting to RAID0\n", | 192 | mdname(mddev)); |
191 | mdname(mddev)); | ||
192 | goto abort; | 193 | goto abort; |
193 | } | 194 | } |
194 | if (j >= mddev->raid_disks) { | 195 | if (j >= mddev->raid_disks) { |
195 | printk(KERN_ERR "md/raid0:%s: bad disk number %d - " | 196 | pr_warn("md/raid0:%s: bad disk number %d - aborting!\n", |
196 | "aborting!\n", mdname(mddev), j); | 197 | mdname(mddev), j); |
197 | goto abort; | 198 | goto abort; |
198 | } | 199 | } |
199 | if (dev[j]) { | 200 | if (dev[j]) { |
200 | printk(KERN_ERR "md/raid0:%s: multiple devices for %d - " | 201 | pr_warn("md/raid0:%s: multiple devices for %d - aborting!\n", |
201 | "aborting!\n", mdname(mddev), j); | 202 | mdname(mddev), j); |
202 | goto abort; | 203 | goto abort; |
203 | } | 204 | } |
204 | dev[j] = rdev1; | 205 | dev[j] = rdev1; |
@@ -208,8 +209,8 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) | |||
208 | cnt++; | 209 | cnt++; |
209 | } | 210 | } |
210 | if (cnt != mddev->raid_disks) { | 211 | if (cnt != mddev->raid_disks) { |
211 | printk(KERN_ERR "md/raid0:%s: too few disks (%d of %d) - " | 212 | pr_warn("md/raid0:%s: too few disks (%d of %d) - aborting!\n", |
212 | "aborting!\n", mdname(mddev), cnt, mddev->raid_disks); | 213 | mdname(mddev), cnt, mddev->raid_disks); |
213 | goto abort; | 214 | goto abort; |
214 | } | 215 | } |
215 | zone->nb_dev = cnt; | 216 | zone->nb_dev = cnt; |
@@ -357,8 +358,7 @@ static int raid0_run(struct mddev *mddev) | |||
357 | int ret; | 358 | int ret; |
358 | 359 | ||
359 | if (mddev->chunk_sectors == 0) { | 360 | if (mddev->chunk_sectors == 0) { |
360 | printk(KERN_ERR "md/raid0:%s: chunk size must be set.\n", | 361 | pr_warn("md/raid0:%s: chunk size must be set.\n", mdname(mddev)); |
361 | mdname(mddev)); | ||
362 | return -EINVAL; | 362 | return -EINVAL; |
363 | } | 363 | } |
364 | if (md_check_no_bitmap(mddev)) | 364 | if (md_check_no_bitmap(mddev)) |
@@ -399,9 +399,9 @@ static int raid0_run(struct mddev *mddev) | |||
399 | /* calculate array device size */ | 399 | /* calculate array device size */ |
400 | md_set_array_sectors(mddev, raid0_size(mddev, 0, 0)); | 400 | md_set_array_sectors(mddev, raid0_size(mddev, 0, 0)); |
401 | 401 | ||
402 | printk(KERN_INFO "md/raid0:%s: md_size is %llu sectors.\n", | 402 | pr_debug("md/raid0:%s: md_size is %llu sectors.\n", |
403 | mdname(mddev), | 403 | mdname(mddev), |
404 | (unsigned long long)mddev->array_sectors); | 404 | (unsigned long long)mddev->array_sectors); |
405 | 405 | ||
406 | if (mddev->queue) { | 406 | if (mddev->queue) { |
407 | /* calculate the max read-ahead size. | 407 | /* calculate the max read-ahead size. |
@@ -464,7 +464,8 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio) | |||
464 | } | 464 | } |
465 | 465 | ||
466 | do { | 466 | do { |
467 | sector_t sector = bio->bi_iter.bi_sector; | 467 | sector_t bio_sector = bio->bi_iter.bi_sector; |
468 | sector_t sector = bio_sector; | ||
468 | unsigned chunk_sects = mddev->chunk_sectors; | 469 | unsigned chunk_sects = mddev->chunk_sectors; |
469 | 470 | ||
470 | unsigned sectors = chunk_sects - | 471 | unsigned sectors = chunk_sects - |
@@ -473,7 +474,7 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio) | |||
473 | : sector_div(sector, chunk_sects)); | 474 | : sector_div(sector, chunk_sects)); |
474 | 475 | ||
475 | /* Restore due to sector_div */ | 476 | /* Restore due to sector_div */ |
476 | sector = bio->bi_iter.bi_sector; | 477 | sector = bio_sector; |
477 | 478 | ||
478 | if (sectors < bio_sectors(bio)) { | 479 | if (sectors < bio_sectors(bio)) { |
479 | split = bio_split(bio, sectors, GFP_NOIO, fs_bio_set); | 480 | split = bio_split(bio, sectors, GFP_NOIO, fs_bio_set); |
@@ -492,8 +493,13 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio) | |||
492 | !blk_queue_discard(bdev_get_queue(split->bi_bdev)))) { | 493 | !blk_queue_discard(bdev_get_queue(split->bi_bdev)))) { |
493 | /* Just ignore it */ | 494 | /* Just ignore it */ |
494 | bio_endio(split); | 495 | bio_endio(split); |
495 | } else | 496 | } else { |
497 | if (mddev->gendisk) | ||
498 | trace_block_bio_remap(bdev_get_queue(split->bi_bdev), | ||
499 | split, disk_devt(mddev->gendisk), | ||
500 | bio_sector); | ||
496 | generic_make_request(split); | 501 | generic_make_request(split); |
502 | } | ||
497 | } while (split != bio); | 503 | } while (split != bio); |
498 | } | 504 | } |
499 | 505 | ||
@@ -509,17 +515,17 @@ static void *raid0_takeover_raid45(struct mddev *mddev) | |||
509 | struct r0conf *priv_conf; | 515 | struct r0conf *priv_conf; |
510 | 516 | ||
511 | if (mddev->degraded != 1) { | 517 | if (mddev->degraded != 1) { |
512 | printk(KERN_ERR "md/raid0:%s: raid5 must be degraded! Degraded disks: %d\n", | 518 | pr_warn("md/raid0:%s: raid5 must be degraded! Degraded disks: %d\n", |
513 | mdname(mddev), | 519 | mdname(mddev), |
514 | mddev->degraded); | 520 | mddev->degraded); |
515 | return ERR_PTR(-EINVAL); | 521 | return ERR_PTR(-EINVAL); |
516 | } | 522 | } |
517 | 523 | ||
518 | rdev_for_each(rdev, mddev) { | 524 | rdev_for_each(rdev, mddev) { |
519 | /* check slot number for a disk */ | 525 | /* check slot number for a disk */ |
520 | if (rdev->raid_disk == mddev->raid_disks-1) { | 526 | if (rdev->raid_disk == mddev->raid_disks-1) { |
521 | printk(KERN_ERR "md/raid0:%s: raid5 must have missing parity disk!\n", | 527 | pr_warn("md/raid0:%s: raid5 must have missing parity disk!\n", |
522 | mdname(mddev)); | 528 | mdname(mddev)); |
523 | return ERR_PTR(-EINVAL); | 529 | return ERR_PTR(-EINVAL); |
524 | } | 530 | } |
525 | rdev->sectors = mddev->dev_sectors; | 531 | rdev->sectors = mddev->dev_sectors; |
@@ -533,8 +539,11 @@ static void *raid0_takeover_raid45(struct mddev *mddev) | |||
533 | mddev->delta_disks = -1; | 539 | mddev->delta_disks = -1; |
534 | /* make sure it will be not marked as dirty */ | 540 | /* make sure it will be not marked as dirty */ |
535 | mddev->recovery_cp = MaxSector; | 541 | mddev->recovery_cp = MaxSector; |
542 | clear_bit(MD_HAS_JOURNAL, &mddev->flags); | ||
543 | clear_bit(MD_JOURNAL_CLEAN, &mddev->flags); | ||
536 | 544 | ||
537 | create_strip_zones(mddev, &priv_conf); | 545 | create_strip_zones(mddev, &priv_conf); |
546 | |||
538 | return priv_conf; | 547 | return priv_conf; |
539 | } | 548 | } |
540 | 549 | ||
@@ -549,19 +558,19 @@ static void *raid0_takeover_raid10(struct mddev *mddev) | |||
549 | * - all mirrors must be already degraded | 558 | * - all mirrors must be already degraded |
550 | */ | 559 | */ |
551 | if (mddev->layout != ((1 << 8) + 2)) { | 560 | if (mddev->layout != ((1 << 8) + 2)) { |
552 | printk(KERN_ERR "md/raid0:%s:: Raid0 cannot takeover layout: 0x%x\n", | 561 | pr_warn("md/raid0:%s:: Raid0 cannot takeover layout: 0x%x\n", |
553 | mdname(mddev), | 562 | mdname(mddev), |
554 | mddev->layout); | 563 | mddev->layout); |
555 | return ERR_PTR(-EINVAL); | 564 | return ERR_PTR(-EINVAL); |
556 | } | 565 | } |
557 | if (mddev->raid_disks & 1) { | 566 | if (mddev->raid_disks & 1) { |
558 | printk(KERN_ERR "md/raid0:%s: Raid0 cannot takeover Raid10 with odd disk number.\n", | 567 | pr_warn("md/raid0:%s: Raid0 cannot takeover Raid10 with odd disk number.\n", |
559 | mdname(mddev)); | 568 | mdname(mddev)); |
560 | return ERR_PTR(-EINVAL); | 569 | return ERR_PTR(-EINVAL); |
561 | } | 570 | } |
562 | if (mddev->degraded != (mddev->raid_disks>>1)) { | 571 | if (mddev->degraded != (mddev->raid_disks>>1)) { |
563 | printk(KERN_ERR "md/raid0:%s: All mirrors must be already degraded!\n", | 572 | pr_warn("md/raid0:%s: All mirrors must be already degraded!\n", |
564 | mdname(mddev)); | 573 | mdname(mddev)); |
565 | return ERR_PTR(-EINVAL); | 574 | return ERR_PTR(-EINVAL); |
566 | } | 575 | } |
567 | 576 | ||
@@ -574,6 +583,7 @@ static void *raid0_takeover_raid10(struct mddev *mddev) | |||
574 | mddev->degraded = 0; | 583 | mddev->degraded = 0; |
575 | /* make sure it will be not marked as dirty */ | 584 | /* make sure it will be not marked as dirty */ |
576 | mddev->recovery_cp = MaxSector; | 585 | mddev->recovery_cp = MaxSector; |
586 | clear_bit(MD_FAILFAST_SUPPORTED, &mddev->flags); | ||
577 | 587 | ||
578 | create_strip_zones(mddev, &priv_conf); | 588 | create_strip_zones(mddev, &priv_conf); |
579 | return priv_conf; | 589 | return priv_conf; |
@@ -588,7 +598,7 @@ static void *raid0_takeover_raid1(struct mddev *mddev) | |||
588 | * - (N - 1) mirror drives must be already faulty | 598 | * - (N - 1) mirror drives must be already faulty |
589 | */ | 599 | */ |
590 | if ((mddev->raid_disks - 1) != mddev->degraded) { | 600 | if ((mddev->raid_disks - 1) != mddev->degraded) { |
591 | printk(KERN_ERR "md/raid0:%s: (N - 1) mirrors drives must be already faulty!\n", | 601 | pr_err("md/raid0:%s: (N - 1) mirrors drives must be already faulty!\n", |
592 | mdname(mddev)); | 602 | mdname(mddev)); |
593 | return ERR_PTR(-EINVAL); | 603 | return ERR_PTR(-EINVAL); |
594 | } | 604 | } |
@@ -616,6 +626,7 @@ static void *raid0_takeover_raid1(struct mddev *mddev) | |||
616 | mddev->raid_disks = 1; | 626 | mddev->raid_disks = 1; |
617 | /* make sure it will be not marked as dirty */ | 627 | /* make sure it will be not marked as dirty */ |
618 | mddev->recovery_cp = MaxSector; | 628 | mddev->recovery_cp = MaxSector; |
629 | clear_bit(MD_FAILFAST_SUPPORTED, &mddev->flags); | ||
619 | 630 | ||
620 | create_strip_zones(mddev, &priv_conf); | 631 | create_strip_zones(mddev, &priv_conf); |
621 | return priv_conf; | 632 | return priv_conf; |
@@ -631,8 +642,8 @@ static void *raid0_takeover(struct mddev *mddev) | |||
631 | */ | 642 | */ |
632 | 643 | ||
633 | if (mddev->bitmap) { | 644 | if (mddev->bitmap) { |
634 | printk(KERN_ERR "md/raid0: %s: cannot takeover array with bitmap\n", | 645 | pr_warn("md/raid0: %s: cannot takeover array with bitmap\n", |
635 | mdname(mddev)); | 646 | mdname(mddev)); |
636 | return ERR_PTR(-EBUSY); | 647 | return ERR_PTR(-EBUSY); |
637 | } | 648 | } |
638 | if (mddev->level == 4) | 649 | if (mddev->level == 4) |
@@ -642,8 +653,8 @@ static void *raid0_takeover(struct mddev *mddev) | |||
642 | if (mddev->layout == ALGORITHM_PARITY_N) | 653 | if (mddev->layout == ALGORITHM_PARITY_N) |
643 | return raid0_takeover_raid45(mddev); | 654 | return raid0_takeover_raid45(mddev); |
644 | 655 | ||
645 | printk(KERN_ERR "md/raid0:%s: Raid can only takeover Raid5 with layout: %d\n", | 656 | pr_warn("md/raid0:%s: Raid can only takeover Raid5 with layout: %d\n", |
646 | mdname(mddev), ALGORITHM_PARITY_N); | 657 | mdname(mddev), ALGORITHM_PARITY_N); |
647 | } | 658 | } |
648 | 659 | ||
649 | if (mddev->level == 10) | 660 | if (mddev->level == 10) |
@@ -652,7 +663,7 @@ static void *raid0_takeover(struct mddev *mddev) | |||
652 | if (mddev->level == 1) | 663 | if (mddev->level == 1) |
653 | return raid0_takeover_raid1(mddev); | 664 | return raid0_takeover_raid1(mddev); |
654 | 665 | ||
655 | printk(KERN_ERR "Takeover from raid%i to raid0 not supported\n", | 666 | pr_warn("Takeover from raid%i to raid0 not supported\n", |
656 | mddev->level); | 667 | mddev->level); |
657 | 668 | ||
658 | return ERR_PTR(-EINVAL); | 669 | return ERR_PTR(-EINVAL); |
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 29e2df5cd77b..a1f3fbed9100 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c | |||
@@ -37,6 +37,7 @@ | |||
37 | #include <linux/module.h> | 37 | #include <linux/module.h> |
38 | #include <linux/seq_file.h> | 38 | #include <linux/seq_file.h> |
39 | #include <linux/ratelimit.h> | 39 | #include <linux/ratelimit.h> |
40 | #include <trace/events/block.h> | ||
40 | #include "md.h" | 41 | #include "md.h" |
41 | #include "raid1.h" | 42 | #include "raid1.h" |
42 | #include "bitmap.h" | 43 | #include "bitmap.h" |
@@ -70,6 +71,9 @@ static void allow_barrier(struct r1conf *conf, sector_t start_next_window, | |||
70 | sector_t bi_sector); | 71 | sector_t bi_sector); |
71 | static void lower_barrier(struct r1conf *conf); | 72 | static void lower_barrier(struct r1conf *conf); |
72 | 73 | ||
74 | #define raid1_log(md, fmt, args...) \ | ||
75 | do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid1 " fmt, ##args); } while (0) | ||
76 | |||
73 | static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data) | 77 | static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data) |
74 | { | 78 | { |
75 | struct pool_info *pi = data; | 79 | struct pool_info *pi = data; |
@@ -325,6 +329,11 @@ static void raid1_end_read_request(struct bio *bio) | |||
325 | 329 | ||
326 | if (uptodate) | 330 | if (uptodate) |
327 | set_bit(R1BIO_Uptodate, &r1_bio->state); | 331 | set_bit(R1BIO_Uptodate, &r1_bio->state); |
332 | else if (test_bit(FailFast, &rdev->flags) && | ||
333 | test_bit(R1BIO_FailFast, &r1_bio->state)) | ||
334 | /* This was a fail-fast read so we definitely | ||
335 | * want to retry */ | ||
336 | ; | ||
328 | else { | 337 | else { |
329 | /* If all other devices have failed, we want to return | 338 | /* If all other devices have failed, we want to return |
330 | * the error upwards rather than fail the last device. | 339 | * the error upwards rather than fail the last device. |
@@ -347,13 +356,10 @@ static void raid1_end_read_request(struct bio *bio) | |||
347 | * oops, read error: | 356 | * oops, read error: |
348 | */ | 357 | */ |
349 | char b[BDEVNAME_SIZE]; | 358 | char b[BDEVNAME_SIZE]; |
350 | printk_ratelimited( | 359 | pr_err_ratelimited("md/raid1:%s: %s: rescheduling sector %llu\n", |
351 | KERN_ERR "md/raid1:%s: %s: " | 360 | mdname(conf->mddev), |
352 | "rescheduling sector %llu\n", | 361 | bdevname(rdev->bdev, b), |
353 | mdname(conf->mddev), | 362 | (unsigned long long)r1_bio->sector); |
354 | bdevname(rdev->bdev, | ||
355 | b), | ||
356 | (unsigned long long)r1_bio->sector); | ||
357 | set_bit(R1BIO_ReadError, &r1_bio->state); | 363 | set_bit(R1BIO_ReadError, &r1_bio->state); |
358 | reschedule_retry(r1_bio); | 364 | reschedule_retry(r1_bio); |
359 | /* don't drop the reference on read_disk yet */ | 365 | /* don't drop the reference on read_disk yet */ |
@@ -416,7 +422,24 @@ static void raid1_end_write_request(struct bio *bio) | |||
416 | set_bit(MD_RECOVERY_NEEDED, & | 422 | set_bit(MD_RECOVERY_NEEDED, & |
417 | conf->mddev->recovery); | 423 | conf->mddev->recovery); |
418 | 424 | ||
419 | set_bit(R1BIO_WriteError, &r1_bio->state); | 425 | if (test_bit(FailFast, &rdev->flags) && |
426 | (bio->bi_opf & MD_FAILFAST) && | ||
427 | /* We never try FailFast to WriteMostly devices */ | ||
428 | !test_bit(WriteMostly, &rdev->flags)) { | ||
429 | md_error(r1_bio->mddev, rdev); | ||
430 | if (!test_bit(Faulty, &rdev->flags)) | ||
431 | /* This is the only remaining device, | ||
432 | * We need to retry the write without | ||
433 | * FailFast | ||
434 | */ | ||
435 | set_bit(R1BIO_WriteError, &r1_bio->state); | ||
436 | else { | ||
437 | /* Finished with this branch */ | ||
438 | r1_bio->bios[mirror] = NULL; | ||
439 | to_put = bio; | ||
440 | } | ||
441 | } else | ||
442 | set_bit(R1BIO_WriteError, &r1_bio->state); | ||
420 | } else { | 443 | } else { |
421 | /* | 444 | /* |
422 | * Set R1BIO_Uptodate in our master bio, so that we | 445 | * Set R1BIO_Uptodate in our master bio, so that we |
@@ -534,6 +557,7 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect | |||
534 | best_good_sectors = 0; | 557 | best_good_sectors = 0; |
535 | has_nonrot_disk = 0; | 558 | has_nonrot_disk = 0; |
536 | choose_next_idle = 0; | 559 | choose_next_idle = 0; |
560 | clear_bit(R1BIO_FailFast, &r1_bio->state); | ||
537 | 561 | ||
538 | if ((conf->mddev->recovery_cp < this_sector + sectors) || | 562 | if ((conf->mddev->recovery_cp < this_sector + sectors) || |
539 | (mddev_is_clustered(conf->mddev) && | 563 | (mddev_is_clustered(conf->mddev) && |
@@ -607,6 +631,10 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect | |||
607 | } else | 631 | } else |
608 | best_good_sectors = sectors; | 632 | best_good_sectors = sectors; |
609 | 633 | ||
634 | if (best_disk >= 0) | ||
635 | /* At least two disks to choose from so failfast is OK */ | ||
636 | set_bit(R1BIO_FailFast, &r1_bio->state); | ||
637 | |||
610 | nonrot = blk_queue_nonrot(bdev_get_queue(rdev->bdev)); | 638 | nonrot = blk_queue_nonrot(bdev_get_queue(rdev->bdev)); |
611 | has_nonrot_disk |= nonrot; | 639 | has_nonrot_disk |= nonrot; |
612 | pending = atomic_read(&rdev->nr_pending); | 640 | pending = atomic_read(&rdev->nr_pending); |
@@ -645,11 +673,6 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect | |||
645 | } | 673 | } |
646 | break; | 674 | break; |
647 | } | 675 | } |
648 | /* If device is idle, use it */ | ||
649 | if (pending == 0) { | ||
650 | best_disk = disk; | ||
651 | break; | ||
652 | } | ||
653 | 676 | ||
654 | if (choose_next_idle) | 677 | if (choose_next_idle) |
655 | continue; | 678 | continue; |
@@ -672,7 +695,7 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect | |||
672 | * mixed rotation/non-rotational disks depending on workload. | 695 | * mixed rotation/non-rotational disks depending on workload. |
673 | */ | 696 | */ |
674 | if (best_disk == -1) { | 697 | if (best_disk == -1) { |
675 | if (has_nonrot_disk) | 698 | if (has_nonrot_disk || min_pending == 0) |
676 | best_disk = best_pending_disk; | 699 | best_disk = best_pending_disk; |
677 | else | 700 | else |
678 | best_disk = best_dist_disk; | 701 | best_disk = best_dist_disk; |
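read_balance() no longer short-circuits on the first idle disk; it finishes scoring every candidate and, when no disk won outright, falls back to the least-loaded mirror whenever an SSD is present or some disk is idle (min_pending == 0), and to the shortest seek distance otherwise. A small sketch of that fallback with simplified inputs:

/* Sketch of the fallback choice at the end of read_balance(). */
#include <stdbool.h>
#include <stdio.h>

static int pick_fallback_disk(int best_disk, bool has_nonrot_disk,
                              int min_pending, int best_pending_disk,
                              int best_dist_disk)
{
    if (best_disk != -1)
        return best_disk;             /* an explicit winner was found */
    if (has_nonrot_disk || min_pending == 0)
        return best_pending_disk;     /* queue depth matters most */
    return best_dist_disk;            /* rotational only: seek distance wins */
}

int main(void)
{
    printf("%d\n", pick_fallback_disk(-1, true, 3, 1, 2));  /* 1: SSD present */
    printf("%d\n", pick_fallback_disk(-1, false, 3, 1, 2)); /* 2: by distance */
    return 0;
}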
@@ -745,9 +768,14 @@ static void flush_pending_writes(struct r1conf *conf) | |||
745 | 768 | ||
746 | while (bio) { /* submit pending writes */ | 769 | while (bio) { /* submit pending writes */ |
747 | struct bio *next = bio->bi_next; | 770 | struct bio *next = bio->bi_next; |
771 | struct md_rdev *rdev = (void*)bio->bi_bdev; | ||
748 | bio->bi_next = NULL; | 772 | bio->bi_next = NULL; |
749 | if (unlikely((bio_op(bio) == REQ_OP_DISCARD) && | 773 | bio->bi_bdev = rdev->bdev; |
750 | !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) | 774 | if (test_bit(Faulty, &rdev->flags)) { |
775 | bio->bi_error = -EIO; | ||
776 | bio_endio(bio); | ||
777 | } else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) && | ||
778 | !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) | ||
751 | /* Just ignore it */ | 779 | /* Just ignore it */ |
752 | bio_endio(bio); | 780 | bio_endio(bio); |
753 | else | 781 | else |
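While a write bio sits on the pending list, the patch stashes the struct md_rdev pointer in bio->bi_bdev; flush_pending_writes() then restores the real block device at submit time and completes the bio with -EIO if the rdev went Faulty in the meantime. A userspace sketch of that stash/restore pattern (the structs and the submit()/complete_with_error() helpers are simplified stand-ins):

/* Sketch of "stash a pointer in an unused field while queued". */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct rdev { void *bdev; bool faulty; };
struct bio  { void *bi_bdev; struct bio *bi_next; };

static void complete_with_error(struct bio *b) { printf("EIO %p\n", (void *)b); }
static void submit(struct bio *b)              { printf("submit %p\n", (void *)b); }

static void queue_write(struct bio *b, struct rdev *r)
{
    b->bi_bdev = r;               /* temporarily holds the rdev, not a bdev */
}

static void flush_one(struct bio *b)
{
    struct rdev *r = b->bi_bdev;  /* recover the stashed rdev */
    b->bi_bdev = r->bdev;         /* restore the real device before submit */
    if (r->faulty)
        complete_with_error(b);   /* device died while the bio was queued */
    else
        submit(b);
}

int main(void)
{
    struct rdev good = { .bdev = &good, .faulty = false };
    struct rdev dead = { .bdev = &dead, .faulty = true  };
    struct bio a = {0}, b = {0};

    queue_write(&a, &good);
    queue_write(&b, &dead);
    flush_one(&a);
    flush_one(&b);
    return 0;
}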
@@ -832,7 +860,7 @@ static bool need_to_wait_for_sync(struct r1conf *conf, struct bio *bio) | |||
832 | else if (conf->barrier && bio_data_dir(bio) == WRITE) { | 860 | else if (conf->barrier && bio_data_dir(bio) == WRITE) { |
833 | if ((conf->mddev->curr_resync_completed | 861 | if ((conf->mddev->curr_resync_completed |
834 | >= bio_end_sector(bio)) || | 862 | >= bio_end_sector(bio)) || |
835 | (conf->next_resync + NEXT_NORMALIO_DISTANCE | 863 | (conf->start_next_window + NEXT_NORMALIO_DISTANCE |
836 | <= bio->bi_iter.bi_sector)) | 864 | <= bio->bi_iter.bi_sector)) |
837 | wait = false; | 865 | wait = false; |
838 | else | 866 | else |
@@ -858,6 +886,7 @@ static sector_t wait_barrier(struct r1conf *conf, struct bio *bio) | |||
858 | * that queue to allow conf->start_next_window | 886 | * that queue to allow conf->start_next_window |
859 | * to increase. | 887 | * to increase. |
860 | */ | 888 | */ |
889 | raid1_log(conf->mddev, "wait barrier"); | ||
861 | wait_event_lock_irq(conf->wait_barrier, | 890 | wait_event_lock_irq(conf->wait_barrier, |
862 | !conf->array_frozen && | 891 | !conf->array_frozen && |
863 | (!conf->barrier || | 892 | (!conf->barrier || |
@@ -937,6 +966,7 @@ static void freeze_array(struct r1conf *conf, int extra) | |||
937 | */ | 966 | */ |
938 | spin_lock_irq(&conf->resync_lock); | 967 | spin_lock_irq(&conf->resync_lock); |
939 | conf->array_frozen = 1; | 968 | conf->array_frozen = 1; |
969 | raid1_log(conf->mddev, "wait freeze"); | ||
940 | wait_event_lock_irq_cmd(conf->wait_barrier, | 970 | wait_event_lock_irq_cmd(conf->wait_barrier, |
941 | conf->nr_pending == conf->nr_queued+extra, | 971 | conf->nr_pending == conf->nr_queued+extra, |
942 | conf->resync_lock, | 972 | conf->resync_lock, |
@@ -1019,9 +1049,14 @@ static void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule) | |||
1019 | 1049 | ||
1020 | while (bio) { /* submit pending writes */ | 1050 | while (bio) { /* submit pending writes */ |
1021 | struct bio *next = bio->bi_next; | 1051 | struct bio *next = bio->bi_next; |
1052 | struct md_rdev *rdev = (void*)bio->bi_bdev; | ||
1022 | bio->bi_next = NULL; | 1053 | bio->bi_next = NULL; |
1023 | if (unlikely((bio_op(bio) == REQ_OP_DISCARD) && | 1054 | bio->bi_bdev = rdev->bdev; |
1024 | !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) | 1055 | if (test_bit(Faulty, &rdev->flags)) { |
1056 | bio->bi_error = -EIO; | ||
1057 | bio_endio(bio); | ||
1058 | } else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) && | ||
1059 | !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) | ||
1025 | /* Just ignore it */ | 1060 | /* Just ignore it */ |
1026 | bio_endio(bio); | 1061 | bio_endio(bio); |
1027 | else | 1062 | else |
@@ -1136,6 +1171,7 @@ read_again: | |||
1136 | * take care not to over-take any writes | 1171 | * take care not to over-take any writes |
1137 | * that are 'behind' | 1172 | * that are 'behind' |
1138 | */ | 1173 | */ |
1174 | raid1_log(mddev, "wait behind writes"); | ||
1139 | wait_event(bitmap->behind_wait, | 1175 | wait_event(bitmap->behind_wait, |
1140 | atomic_read(&bitmap->behind_writes) == 0); | 1176 | atomic_read(&bitmap->behind_writes) == 0); |
1141 | } | 1177 | } |
@@ -1153,8 +1189,16 @@ read_again: | |||
1153 | read_bio->bi_bdev = mirror->rdev->bdev; | 1189 | read_bio->bi_bdev = mirror->rdev->bdev; |
1154 | read_bio->bi_end_io = raid1_end_read_request; | 1190 | read_bio->bi_end_io = raid1_end_read_request; |
1155 | bio_set_op_attrs(read_bio, op, do_sync); | 1191 | bio_set_op_attrs(read_bio, op, do_sync); |
1192 | if (test_bit(FailFast, &mirror->rdev->flags) && | ||
1193 | test_bit(R1BIO_FailFast, &r1_bio->state)) | ||
1194 | read_bio->bi_opf |= MD_FAILFAST; | ||
1156 | read_bio->bi_private = r1_bio; | 1195 | read_bio->bi_private = r1_bio; |
1157 | 1196 | ||
1197 | if (mddev->gendisk) | ||
1198 | trace_block_bio_remap(bdev_get_queue(read_bio->bi_bdev), | ||
1199 | read_bio, disk_devt(mddev->gendisk), | ||
1200 | r1_bio->sector); | ||
1201 | |||
1158 | if (max_sectors < r1_bio->sectors) { | 1202 | if (max_sectors < r1_bio->sectors) { |
1159 | /* could not read all from this device, so we will | 1203 | /* could not read all from this device, so we will |
1160 | * need another r1_bio. | 1204 | * need another r1_bio. |
@@ -1195,6 +1239,7 @@ read_again: | |||
1195 | */ | 1239 | */ |
1196 | if (conf->pending_count >= max_queued_requests) { | 1240 | if (conf->pending_count >= max_queued_requests) { |
1197 | md_wakeup_thread(mddev->thread); | 1241 | md_wakeup_thread(mddev->thread); |
1242 | raid1_log(mddev, "wait queued"); | ||
1198 | wait_event(conf->wait_barrier, | 1243 | wait_event(conf->wait_barrier, |
1199 | conf->pending_count < max_queued_requests); | 1244 | conf->pending_count < max_queued_requests); |
1200 | } | 1245 | } |
@@ -1286,6 +1331,7 @@ read_again: | |||
1286 | rdev_dec_pending(conf->mirrors[j].rdev, mddev); | 1331 | rdev_dec_pending(conf->mirrors[j].rdev, mddev); |
1287 | r1_bio->state = 0; | 1332 | r1_bio->state = 0; |
1288 | allow_barrier(conf, start_next_window, bio->bi_iter.bi_sector); | 1333 | allow_barrier(conf, start_next_window, bio->bi_iter.bi_sector); |
1334 | raid1_log(mddev, "wait rdev %d blocked", blocked_rdev->raid_disk); | ||
1289 | md_wait_for_blocked_rdev(blocked_rdev, mddev); | 1335 | md_wait_for_blocked_rdev(blocked_rdev, mddev); |
1290 | start_next_window = wait_barrier(conf, bio); | 1336 | start_next_window = wait_barrier(conf, bio); |
1291 | /* | 1337 | /* |
@@ -1363,10 +1409,21 @@ read_again: | |||
1363 | mbio->bi_bdev = conf->mirrors[i].rdev->bdev; | 1409 | mbio->bi_bdev = conf->mirrors[i].rdev->bdev; |
1364 | mbio->bi_end_io = raid1_end_write_request; | 1410 | mbio->bi_end_io = raid1_end_write_request; |
1365 | bio_set_op_attrs(mbio, op, do_flush_fua | do_sync); | 1411 | bio_set_op_attrs(mbio, op, do_flush_fua | do_sync); |
1412 | if (test_bit(FailFast, &conf->mirrors[i].rdev->flags) && | ||
1413 | !test_bit(WriteMostly, &conf->mirrors[i].rdev->flags) && | ||
1414 | conf->raid_disks - mddev->degraded > 1) | ||
1415 | mbio->bi_opf |= MD_FAILFAST; | ||
1366 | mbio->bi_private = r1_bio; | 1416 | mbio->bi_private = r1_bio; |
1367 | 1417 | ||
1368 | atomic_inc(&r1_bio->remaining); | 1418 | atomic_inc(&r1_bio->remaining); |
1369 | 1419 | ||
1420 | if (mddev->gendisk) | ||
1421 | trace_block_bio_remap(bdev_get_queue(mbio->bi_bdev), | ||
1422 | mbio, disk_devt(mddev->gendisk), | ||
1423 | r1_bio->sector); | ||
1424 | /* flush_pending_writes() needs access to the rdev so...*/ | ||
1425 | mbio->bi_bdev = (void*)conf->mirrors[i].rdev; | ||
1426 | |||
1370 | cb = blk_check_plugged(raid1_unplug, mddev, sizeof(*plug)); | 1427 | cb = blk_check_plugged(raid1_unplug, mddev, sizeof(*plug)); |
1371 | if (cb) | 1428 | if (cb) |
1372 | plug = container_of(cb, struct raid1_plug_cb, cb); | 1429 | plug = container_of(cb, struct raid1_plug_cb, cb); |
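For writes, MD_FAILFAST is only set when the target device is FailFast, is not WriteMostly, and the array still has more than one non-degraded member, so a fast failure can always be absorbed by another copy. A sketch of that eligibility test:

/* Sketch of the failfast eligibility check used when building write bios. */
#include <stdbool.h>
#include <stdio.h>

static bool write_may_failfast(bool dev_failfast, bool dev_write_mostly,
                               int raid_disks, int degraded)
{
    return dev_failfast &&
           !dev_write_mostly &&
           raid_disks - degraded > 1;   /* another working mirror exists */
}

int main(void)
{
    printf("%d\n", write_may_failfast(true, false, 2, 0));  /* 1 */
    printf("%d\n", write_may_failfast(true, false, 2, 1));  /* 0: lone disk */
    printf("%d\n", write_may_failfast(true, true, 3, 0));   /* 0: write-mostly */
    return 0;
}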
@@ -1436,6 +1493,7 @@ static void raid1_error(struct mddev *mddev, struct md_rdev *rdev) | |||
1436 | * next level up know. | 1493 | * next level up know. |
1437 | * else mark the drive as failed | 1494 | * else mark the drive as failed |
1438 | */ | 1495 | */ |
1496 | spin_lock_irqsave(&conf->device_lock, flags); | ||
1439 | if (test_bit(In_sync, &rdev->flags) | 1497 | if (test_bit(In_sync, &rdev->flags) |
1440 | && (conf->raid_disks - mddev->degraded) == 1) { | 1498 | && (conf->raid_disks - mddev->degraded) == 1) { |
1441 | /* | 1499 | /* |
@@ -1445,10 +1503,10 @@ static void raid1_error(struct mddev *mddev, struct md_rdev *rdev) | |||
1445 | * it is very likely to fail. | 1503 | * it is very likely to fail. |
1446 | */ | 1504 | */ |
1447 | conf->recovery_disabled = mddev->recovery_disabled; | 1505 | conf->recovery_disabled = mddev->recovery_disabled; |
1506 | spin_unlock_irqrestore(&conf->device_lock, flags); | ||
1448 | return; | 1507 | return; |
1449 | } | 1508 | } |
1450 | set_bit(Blocked, &rdev->flags); | 1509 | set_bit(Blocked, &rdev->flags); |
1451 | spin_lock_irqsave(&conf->device_lock, flags); | ||
1452 | if (test_and_clear_bit(In_sync, &rdev->flags)) { | 1510 | if (test_and_clear_bit(In_sync, &rdev->flags)) { |
1453 | mddev->degraded++; | 1511 | mddev->degraded++; |
1454 | set_bit(Faulty, &rdev->flags); | 1512 | set_bit(Faulty, &rdev->flags); |
@@ -1459,36 +1517,35 @@ static void raid1_error(struct mddev *mddev, struct md_rdev *rdev) | |||
1459 | * if recovery is running, make sure it aborts. | 1517 | * if recovery is running, make sure it aborts. |
1460 | */ | 1518 | */ |
1461 | set_bit(MD_RECOVERY_INTR, &mddev->recovery); | 1519 | set_bit(MD_RECOVERY_INTR, &mddev->recovery); |
1462 | set_mask_bits(&mddev->flags, 0, | 1520 | set_mask_bits(&mddev->sb_flags, 0, |
1463 | BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING)); | 1521 | BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING)); |
1464 | printk(KERN_ALERT | 1522 | pr_crit("md/raid1:%s: Disk failure on %s, disabling device.\n" |
1465 | "md/raid1:%s: Disk failure on %s, disabling device.\n" | 1523 | "md/raid1:%s: Operation continuing on %d devices.\n", |
1466 | "md/raid1:%s: Operation continuing on %d devices.\n", | 1524 | mdname(mddev), bdevname(rdev->bdev, b), |
1467 | mdname(mddev), bdevname(rdev->bdev, b), | 1525 | mdname(mddev), conf->raid_disks - mddev->degraded); |
1468 | mdname(mddev), conf->raid_disks - mddev->degraded); | ||
1469 | } | 1526 | } |
1470 | 1527 | ||
1471 | static void print_conf(struct r1conf *conf) | 1528 | static void print_conf(struct r1conf *conf) |
1472 | { | 1529 | { |
1473 | int i; | 1530 | int i; |
1474 | 1531 | ||
1475 | printk(KERN_DEBUG "RAID1 conf printout:\n"); | 1532 | pr_debug("RAID1 conf printout:\n"); |
1476 | if (!conf) { | 1533 | if (!conf) { |
1477 | printk(KERN_DEBUG "(!conf)\n"); | 1534 | pr_debug("(!conf)\n"); |
1478 | return; | 1535 | return; |
1479 | } | 1536 | } |
1480 | printk(KERN_DEBUG " --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded, | 1537 | pr_debug(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded, |
1481 | conf->raid_disks); | 1538 | conf->raid_disks); |
1482 | 1539 | ||
1483 | rcu_read_lock(); | 1540 | rcu_read_lock(); |
1484 | for (i = 0; i < conf->raid_disks; i++) { | 1541 | for (i = 0; i < conf->raid_disks; i++) { |
1485 | char b[BDEVNAME_SIZE]; | 1542 | char b[BDEVNAME_SIZE]; |
1486 | struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); | 1543 | struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); |
1487 | if (rdev) | 1544 | if (rdev) |
1488 | printk(KERN_DEBUG " disk %d, wo:%d, o:%d, dev:%s\n", | 1545 | pr_debug(" disk %d, wo:%d, o:%d, dev:%s\n", |
1489 | i, !test_bit(In_sync, &rdev->flags), | 1546 | i, !test_bit(In_sync, &rdev->flags), |
1490 | !test_bit(Faulty, &rdev->flags), | 1547 | !test_bit(Faulty, &rdev->flags), |
1491 | bdevname(rdev->bdev,b)); | 1548 | bdevname(rdev->bdev,b)); |
1492 | } | 1549 | } |
1493 | rcu_read_unlock(); | 1550 | rcu_read_unlock(); |
1494 | } | 1551 | } |
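raid1_error() now takes conf->device_lock before checking whether the failing device is the last In_sync member, so that check and the degraded accounting form one critical section instead of racing with concurrent failures. A userspace sketch of the reordered path, with a pthread mutex standing in for device_lock and the r1conf fields reduced to two counters; it returns false when the device must be kept because it is the last one:

/* Sketch of the lock-then-decide ordering in the error handler. */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct conf_sketch {
    pthread_mutex_t lock;
    int raid_disks;
    int degraded;
};

static bool fail_device(struct conf_sketch *c, bool in_sync)
{
    pthread_mutex_lock(&c->lock);
    if (in_sync && c->raid_disks - c->degraded == 1) {
        pthread_mutex_unlock(&c->lock);
        return false;                 /* refuse: array would lose all data */
    }
    if (in_sync)
        c->degraded++;                /* accounting under the same lock */
    pthread_mutex_unlock(&c->lock);
    return true;                      /* device is marked failed */
}

int main(void)
{
    struct conf_sketch c = {
        .lock = PTHREAD_MUTEX_INITIALIZER, .raid_disks = 2, .degraded = 1,
    };

    printf("%d\n", fail_device(&c, true));   /* 0: last working mirror */
    c.degraded = 0;
    printf("%d\n", fail_device(&c, true));   /* 1: a spare copy remains */
    return 0;
}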
@@ -1788,12 +1845,24 @@ static int fix_sync_read_error(struct r1bio *r1_bio) | |||
1788 | sector_t sect = r1_bio->sector; | 1845 | sector_t sect = r1_bio->sector; |
1789 | int sectors = r1_bio->sectors; | 1846 | int sectors = r1_bio->sectors; |
1790 | int idx = 0; | 1847 | int idx = 0; |
1848 | struct md_rdev *rdev; | ||
1849 | |||
1850 | rdev = conf->mirrors[r1_bio->read_disk].rdev; | ||
1851 | if (test_bit(FailFast, &rdev->flags)) { | ||
1852 | /* Don't try recovering from here - just fail it | ||
1853 | * ... unless it is the last working device of course */ | ||
1854 | md_error(mddev, rdev); | ||
1855 | if (test_bit(Faulty, &rdev->flags)) | ||
1856 | /* Don't try to read from here, but make sure | ||
1857 | * put_buf does its thing | ||
1858 | */ | ||
1859 | bio->bi_end_io = end_sync_write; | ||
1860 | } | ||
1791 | 1861 | ||
1792 | while(sectors) { | 1862 | while(sectors) { |
1793 | int s = sectors; | 1863 | int s = sectors; |
1794 | int d = r1_bio->read_disk; | 1864 | int d = r1_bio->read_disk; |
1795 | int success = 0; | 1865 | int success = 0; |
1796 | struct md_rdev *rdev; | ||
1797 | int start; | 1866 | int start; |
1798 | 1867 | ||
1799 | if (s > (PAGE_SIZE>>9)) | 1868 | if (s > (PAGE_SIZE>>9)) |
@@ -1825,11 +1894,10 @@ static int fix_sync_read_error(struct r1bio *r1_bio) | |||
1825 | * work just disable and interrupt the recovery. | 1894 | * work just disable and interrupt the recovery. |
1826 | * Don't fail devices as that won't really help. | 1895 | * Don't fail devices as that won't really help. |
1827 | */ | 1896 | */ |
1828 | printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O read error" | 1897 | pr_crit_ratelimited("md/raid1:%s: %s: unrecoverable I/O read error for block %llu\n", |
1829 | " for block %llu\n", | 1898 | mdname(mddev), |
1830 | mdname(mddev), | 1899 | bdevname(bio->bi_bdev, b), |
1831 | bdevname(bio->bi_bdev, b), | 1900 | (unsigned long long)r1_bio->sector); |
1832 | (unsigned long long)r1_bio->sector); | ||
1833 | for (d = 0; d < conf->raid_disks * 2; d++) { | 1901 | for (d = 0; d < conf->raid_disks * 2; d++) { |
1834 | rdev = conf->mirrors[d].rdev; | 1902 | rdev = conf->mirrors[d].rdev; |
1835 | if (!rdev || test_bit(Faulty, &rdev->flags)) | 1903 | if (!rdev || test_bit(Faulty, &rdev->flags)) |
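At the top of fix_sync_read_error(), a FailFast source device is now failed outright via md_error(); only when the device survives (because it was the last one) does the code fall back to the sector-by-sector repair loop, and a successfully failed device has its bio redirected to end_sync_write so put_buf() accounting still balances. A sketch of that escalation, with the md_error()/Faulty round trip modeled as a callback:

/* Sketch of the escalation at the top of fix_sync_read_error(). */
#include <stdbool.h>
#include <stdio.h>

enum action { REPAIR_SECTORS, DEVICE_FAILED };

static enum action on_sync_read_error(bool failfast, bool (*try_fail)(void))
{
    if (failfast && try_fail())
        return DEVICE_FAILED;   /* skip repair; just account and move on */
    return REPAIR_SECTORS;      /* last device (or not failfast): try to fix */
}

static bool fail_ok(void)   { return true;  }   /* array can spare the disk */
static bool fail_last(void) { return false; }   /* last working disk */

int main(void)
{
    printf("%d\n", on_sync_read_error(true, fail_ok));    /* 1 */
    printf("%d\n", on_sync_read_error(true, fail_last));  /* 0 */
    printf("%d\n", on_sync_read_error(false, fail_ok));   /* 0 */
    return 0;
}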
@@ -2013,6 +2081,9 @@ static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio) | |||
2013 | continue; | 2081 | continue; |
2014 | 2082 | ||
2015 | bio_set_op_attrs(wbio, REQ_OP_WRITE, 0); | 2083 | bio_set_op_attrs(wbio, REQ_OP_WRITE, 0); |
2084 | if (test_bit(FailFast, &conf->mirrors[i].rdev->flags)) | ||
2085 | wbio->bi_opf |= MD_FAILFAST; | ||
2086 | |||
2016 | wbio->bi_end_io = end_sync_write; | 2087 | wbio->bi_end_io = end_sync_write; |
2017 | atomic_inc(&r1_bio->remaining); | 2088 | atomic_inc(&r1_bio->remaining); |
2018 | md_sync_acct(conf->mirrors[i].rdev->bdev, bio_sectors(wbio)); | 2089 | md_sync_acct(conf->mirrors[i].rdev->bdev, bio_sectors(wbio)); |
@@ -2122,13 +2193,11 @@ static void fix_read_error(struct r1conf *conf, int read_disk, | |||
2122 | if (r1_sync_page_io(rdev, sect, s, | 2193 | if (r1_sync_page_io(rdev, sect, s, |
2123 | conf->tmppage, READ)) { | 2194 | conf->tmppage, READ)) { |
2124 | atomic_add(s, &rdev->corrected_errors); | 2195 | atomic_add(s, &rdev->corrected_errors); |
2125 | printk(KERN_INFO | 2196 | pr_info("md/raid1:%s: read error corrected (%d sectors at %llu on %s)\n", |
2126 | "md/raid1:%s: read error corrected " | 2197 | mdname(mddev), s, |
2127 | "(%d sectors at %llu on %s)\n", | 2198 | (unsigned long long)(sect + |
2128 | mdname(mddev), s, | 2199 | rdev->data_offset), |
2129 | (unsigned long long)(sect + | 2200 | bdevname(rdev->bdev, b)); |
2130 | rdev->data_offset), | ||
2131 | bdevname(rdev->bdev, b)); | ||
2132 | } | 2201 | } |
2133 | rdev_dec_pending(rdev, mddev); | 2202 | rdev_dec_pending(rdev, mddev); |
2134 | } else | 2203 | } else |
@@ -2287,6 +2356,8 @@ static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio) | |||
2287 | struct bio *bio; | 2356 | struct bio *bio; |
2288 | char b[BDEVNAME_SIZE]; | 2357 | char b[BDEVNAME_SIZE]; |
2289 | struct md_rdev *rdev; | 2358 | struct md_rdev *rdev; |
2359 | dev_t bio_dev; | ||
2360 | sector_t bio_sector; | ||
2290 | 2361 | ||
2291 | clear_bit(R1BIO_ReadError, &r1_bio->state); | 2362 | clear_bit(R1BIO_ReadError, &r1_bio->state); |
2292 | /* we got a read error. Maybe the drive is bad. Maybe just | 2363 | /* we got a read error. Maybe the drive is bad. Maybe just |
@@ -2300,10 +2371,14 @@ static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio) | |||
2300 | 2371 | ||
2301 | bio = r1_bio->bios[r1_bio->read_disk]; | 2372 | bio = r1_bio->bios[r1_bio->read_disk]; |
2302 | bdevname(bio->bi_bdev, b); | 2373 | bdevname(bio->bi_bdev, b); |
2374 | bio_dev = bio->bi_bdev->bd_dev; | ||
2375 | bio_sector = conf->mirrors[r1_bio->read_disk].rdev->data_offset + r1_bio->sector; | ||
2303 | bio_put(bio); | 2376 | bio_put(bio); |
2304 | r1_bio->bios[r1_bio->read_disk] = NULL; | 2377 | r1_bio->bios[r1_bio->read_disk] = NULL; |
2305 | 2378 | ||
2306 | if (mddev->ro == 0) { | 2379 | rdev = conf->mirrors[r1_bio->read_disk].rdev; |
2380 | if (mddev->ro == 0 | ||
2381 | && !test_bit(FailFast, &rdev->flags)) { | ||
2307 | freeze_array(conf, 1); | 2382 | freeze_array(conf, 1); |
2308 | fix_read_error(conf, r1_bio->read_disk, | 2383 | fix_read_error(conf, r1_bio->read_disk, |
2309 | r1_bio->sector, r1_bio->sectors); | 2384 | r1_bio->sector, r1_bio->sectors); |
@@ -2312,14 +2387,13 @@ static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio) | |||
2312 | r1_bio->bios[r1_bio->read_disk] = IO_BLOCKED; | 2387 | r1_bio->bios[r1_bio->read_disk] = IO_BLOCKED; |
2313 | } | 2388 | } |
2314 | 2389 | ||
2315 | rdev_dec_pending(conf->mirrors[r1_bio->read_disk].rdev, conf->mddev); | 2390 | rdev_dec_pending(rdev, conf->mddev); |
2316 | 2391 | ||
2317 | read_more: | 2392 | read_more: |
2318 | disk = read_balance(conf, r1_bio, &max_sectors); | 2393 | disk = read_balance(conf, r1_bio, &max_sectors); |
2319 | if (disk == -1) { | 2394 | if (disk == -1) { |
2320 | printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O" | 2395 | pr_crit_ratelimited("md/raid1:%s: %s: unrecoverable I/O read error for block %llu\n", |
2321 | " read error for block %llu\n", | 2396 | mdname(mddev), b, (unsigned long long)r1_bio->sector); |
2322 | mdname(mddev), b, (unsigned long long)r1_bio->sector); | ||
2323 | raid_end_bio_io(r1_bio); | 2397 | raid_end_bio_io(r1_bio); |
2324 | } else { | 2398 | } else { |
2325 | const unsigned long do_sync | 2399 | const unsigned long do_sync |
@@ -2330,16 +2404,17 @@ read_more: | |||
2330 | max_sectors); | 2404 | max_sectors); |
2331 | r1_bio->bios[r1_bio->read_disk] = bio; | 2405 | r1_bio->bios[r1_bio->read_disk] = bio; |
2332 | rdev = conf->mirrors[disk].rdev; | 2406 | rdev = conf->mirrors[disk].rdev; |
2333 | printk_ratelimited(KERN_ERR | 2407 | pr_info_ratelimited("md/raid1:%s: redirecting sector %llu to other mirror: %s\n", |
2334 | "md/raid1:%s: redirecting sector %llu" | 2408 | mdname(mddev), |
2335 | " to other mirror: %s\n", | 2409 | (unsigned long long)r1_bio->sector, |
2336 | mdname(mddev), | 2410 | bdevname(rdev->bdev, b)); |
2337 | (unsigned long long)r1_bio->sector, | ||
2338 | bdevname(rdev->bdev, b)); | ||
2339 | bio->bi_iter.bi_sector = r1_bio->sector + rdev->data_offset; | 2411 | bio->bi_iter.bi_sector = r1_bio->sector + rdev->data_offset; |
2340 | bio->bi_bdev = rdev->bdev; | 2412 | bio->bi_bdev = rdev->bdev; |
2341 | bio->bi_end_io = raid1_end_read_request; | 2413 | bio->bi_end_io = raid1_end_read_request; |
2342 | bio_set_op_attrs(bio, REQ_OP_READ, do_sync); | 2414 | bio_set_op_attrs(bio, REQ_OP_READ, do_sync); |
2415 | if (test_bit(FailFast, &rdev->flags) && | ||
2416 | test_bit(R1BIO_FailFast, &r1_bio->state)) | ||
2417 | bio->bi_opf |= MD_FAILFAST; | ||
2343 | bio->bi_private = r1_bio; | 2418 | bio->bi_private = r1_bio; |
2344 | if (max_sectors < r1_bio->sectors) { | 2419 | if (max_sectors < r1_bio->sectors) { |
2345 | /* Drat - have to split this up more */ | 2420 | /* Drat - have to split this up more */ |
@@ -2353,6 +2428,8 @@ read_more: | |||
2353 | else | 2428 | else |
2354 | mbio->bi_phys_segments++; | 2429 | mbio->bi_phys_segments++; |
2355 | spin_unlock_irq(&conf->device_lock); | 2430 | spin_unlock_irq(&conf->device_lock); |
2431 | trace_block_bio_remap(bdev_get_queue(bio->bi_bdev), | ||
2432 | bio, bio_dev, bio_sector); | ||
2356 | generic_make_request(bio); | 2433 | generic_make_request(bio); |
2357 | bio = NULL; | 2434 | bio = NULL; |
2358 | 2435 | ||
@@ -2367,8 +2444,11 @@ read_more: | |||
2367 | sectors_handled; | 2444 | sectors_handled; |
2368 | 2445 | ||
2369 | goto read_more; | 2446 | goto read_more; |
2370 | } else | 2447 | } else { |
2448 | trace_block_bio_remap(bdev_get_queue(bio->bi_bdev), | ||
2449 | bio, bio_dev, bio_sector); | ||
2371 | generic_make_request(bio); | 2450 | generic_make_request(bio); |
2451 | } | ||
2372 | } | 2452 | } |
2373 | } | 2453 | } |
2374 | 2454 | ||
@@ -2384,10 +2464,10 @@ static void raid1d(struct md_thread *thread) | |||
2384 | md_check_recovery(mddev); | 2464 | md_check_recovery(mddev); |
2385 | 2465 | ||
2386 | if (!list_empty_careful(&conf->bio_end_io_list) && | 2466 | if (!list_empty_careful(&conf->bio_end_io_list) && |
2387 | !test_bit(MD_CHANGE_PENDING, &mddev->flags)) { | 2467 | !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) { |
2388 | LIST_HEAD(tmp); | 2468 | LIST_HEAD(tmp); |
2389 | spin_lock_irqsave(&conf->device_lock, flags); | 2469 | spin_lock_irqsave(&conf->device_lock, flags); |
2390 | if (!test_bit(MD_CHANGE_PENDING, &mddev->flags)) { | 2470 | if (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) { |
2391 | while (!list_empty(&conf->bio_end_io_list)) { | 2471 | while (!list_empty(&conf->bio_end_io_list)) { |
2392 | list_move(conf->bio_end_io_list.prev, &tmp); | 2472 | list_move(conf->bio_end_io_list.prev, &tmp); |
2393 | conf->nr_queued--; | 2473 | conf->nr_queued--; |
@@ -2441,7 +2521,7 @@ static void raid1d(struct md_thread *thread) | |||
2441 | generic_make_request(r1_bio->bios[r1_bio->read_disk]); | 2521 | generic_make_request(r1_bio->bios[r1_bio->read_disk]); |
2442 | 2522 | ||
2443 | cond_resched(); | 2523 | cond_resched(); |
2444 | if (mddev->flags & ~(1<<MD_CHANGE_PENDING)) | 2524 | if (mddev->sb_flags & ~(1<<MD_SB_CHANGE_PENDING)) |
2445 | md_check_recovery(mddev); | 2525 | md_check_recovery(mddev); |
2446 | } | 2526 | } |
2447 | blk_finish_plug(&plug); | 2527 | blk_finish_plug(&plug); |
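The extra md_check_recovery() call in raid1d() is gated on "any superblock flag other than CHANGE_PENDING is set", now tested against the new mddev->sb_flags word. A tiny sketch of that bitmask test; the bit numbers here are illustrative stand-ins for the MD_SB_* flags:

/* Sketch of the sb_flags mask test from the raid1d() loop. */
#include <stdbool.h>
#include <stdio.h>

enum { SB_CHANGE_DEVS, SB_CHANGE_CLEAN, SB_CHANGE_PENDING };

static bool needs_superblock_work(unsigned long sb_flags)
{
    return sb_flags & ~(1UL << SB_CHANGE_PENDING);
}

int main(void)
{
    printf("%d\n", needs_superblock_work(1UL << SB_CHANGE_PENDING)); /* 0 */
    printf("%d\n", needs_superblock_work(1UL << SB_CHANGE_DEVS));    /* 1 */
    return 0;
}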
@@ -2623,6 +2703,8 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, | |||
2623 | bio->bi_iter.bi_sector = sector_nr + rdev->data_offset; | 2703 | bio->bi_iter.bi_sector = sector_nr + rdev->data_offset; |
2624 | bio->bi_bdev = rdev->bdev; | 2704 | bio->bi_bdev = rdev->bdev; |
2625 | bio->bi_private = r1_bio; | 2705 | bio->bi_private = r1_bio; |
2706 | if (test_bit(FailFast, &rdev->flags)) | ||
2707 | bio->bi_opf |= MD_FAILFAST; | ||
2626 | } | 2708 | } |
2627 | } | 2709 | } |
2628 | rcu_read_unlock(); | 2710 | rcu_read_unlock(); |
@@ -2642,7 +2724,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, | |||
2642 | min_bad, 0 | 2724 | min_bad, 0 |
2643 | ) && ok; | 2725 | ) && ok; |
2644 | } | 2726 | } |
2645 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | 2727 | set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); |
2646 | *skipped = 1; | 2728 | *skipped = 1; |
2647 | put_buf(r1_bio); | 2729 | put_buf(r1_bio); |
2648 | 2730 | ||
@@ -2753,6 +2835,8 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, | |||
2753 | if (bio->bi_end_io == end_sync_read) { | 2835 | if (bio->bi_end_io == end_sync_read) { |
2754 | read_targets--; | 2836 | read_targets--; |
2755 | md_sync_acct(bio->bi_bdev, nr_sectors); | 2837 | md_sync_acct(bio->bi_bdev, nr_sectors); |
2838 | if (read_targets == 1) | ||
2839 | bio->bi_opf &= ~MD_FAILFAST; | ||
2756 | generic_make_request(bio); | 2840 | generic_make_request(bio); |
2757 | } | 2841 | } |
2758 | } | 2842 | } |
@@ -2760,6 +2844,8 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, | |||
2760 | atomic_set(&r1_bio->remaining, 1); | 2844 | atomic_set(&r1_bio->remaining, 1); |
2761 | bio = r1_bio->bios[r1_bio->read_disk]; | 2845 | bio = r1_bio->bios[r1_bio->read_disk]; |
2762 | md_sync_acct(bio->bi_bdev, nr_sectors); | 2846 | md_sync_acct(bio->bi_bdev, nr_sectors); |
2847 | if (read_targets == 1) | ||
2848 | bio->bi_opf &= ~MD_FAILFAST; | ||
2763 | generic_make_request(bio); | 2849 | generic_make_request(bio); |
2764 | 2850 | ||
2765 | } | 2851 | } |
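During resync the MD_FAILFAST hint is stripped whenever only one read target remains, since a lone source has nowhere to fail over to. A tiny sketch of that submit-time rule (FAILFAST_HINT is an illustrative bit, not the kernel's flag value):

/* Sketch of the "keep failfast only with >1 read target" rule. */
#include <stdio.h>

#define FAILFAST_HINT 0x1u

static unsigned int submit_flags(unsigned int opf, int read_targets)
{
    if (read_targets == 1)
        opf &= ~FAILFAST_HINT;   /* only source left: must not fail fast */
    return opf;
}

int main(void)
{
    printf("%u\n", submit_flags(FAILFAST_HINT, 3));  /* 1: hint kept */
    printf("%u\n", submit_flags(FAILFAST_HINT, 1));  /* 0: hint stripped */
    return 0;
}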
@@ -2875,12 +2961,8 @@ static struct r1conf *setup_conf(struct mddev *mddev) | |||
2875 | 2961 | ||
2876 | err = -ENOMEM; | 2962 | err = -ENOMEM; |
2877 | conf->thread = md_register_thread(raid1d, mddev, "raid1"); | 2963 | conf->thread = md_register_thread(raid1d, mddev, "raid1"); |
2878 | if (!conf->thread) { | 2964 | if (!conf->thread) |
2879 | printk(KERN_ERR | ||
2880 | "md/raid1:%s: couldn't allocate thread\n", | ||
2881 | mdname(mddev)); | ||
2882 | goto abort; | 2965 | goto abort; |
2883 | } | ||
2884 | 2966 | ||
2885 | return conf; | 2967 | return conf; |
2886 | 2968 | ||
@@ -2905,13 +2987,13 @@ static int raid1_run(struct mddev *mddev) | |||
2905 | bool discard_supported = false; | 2987 | bool discard_supported = false; |
2906 | 2988 | ||
2907 | if (mddev->level != 1) { | 2989 | if (mddev->level != 1) { |
2908 | printk(KERN_ERR "md/raid1:%s: raid level not set to mirroring (%d)\n", | 2990 | pr_warn("md/raid1:%s: raid level not set to mirroring (%d)\n", |
2909 | mdname(mddev), mddev->level); | 2991 | mdname(mddev), mddev->level); |
2910 | return -EIO; | 2992 | return -EIO; |
2911 | } | 2993 | } |
2912 | if (mddev->reshape_position != MaxSector) { | 2994 | if (mddev->reshape_position != MaxSector) { |
2913 | printk(KERN_ERR "md/raid1:%s: reshape_position set but not supported\n", | 2995 | pr_warn("md/raid1:%s: reshape_position set but not supported\n", |
2914 | mdname(mddev)); | 2996 | mdname(mddev)); |
2915 | return -EIO; | 2997 | return -EIO; |
2916 | } | 2998 | } |
2917 | /* | 2999 | /* |
@@ -2950,11 +3032,9 @@ static int raid1_run(struct mddev *mddev) | |||
2950 | mddev->recovery_cp = MaxSector; | 3032 | mddev->recovery_cp = MaxSector; |
2951 | 3033 | ||
2952 | if (mddev->recovery_cp != MaxSector) | 3034 | if (mddev->recovery_cp != MaxSector) |
2953 | printk(KERN_NOTICE "md/raid1:%s: not clean" | 3035 | pr_info("md/raid1:%s: not clean -- starting background reconstruction\n", |
2954 | " -- starting background reconstruction\n", | 3036 | mdname(mddev)); |
2955 | mdname(mddev)); | 3037 | pr_info("md/raid1:%s: active with %d out of %d mirrors\n",
2956 | printk(KERN_INFO | ||
2957 | "md/raid1:%s: active with %d out of %d mirrors\n", | ||
2958 | mdname(mddev), mddev->raid_disks - mddev->degraded, | 3038 | mdname(mddev), mddev->raid_disks - mddev->degraded, |
2959 | mddev->raid_disks); | 3039 | mddev->raid_disks); |
2960 | 3040 | ||
@@ -2964,6 +3044,7 @@ static int raid1_run(struct mddev *mddev) | |||
2964 | mddev->thread = conf->thread; | 3044 | mddev->thread = conf->thread; |
2965 | conf->thread = NULL; | 3045 | conf->thread = NULL; |
2966 | mddev->private = conf; | 3046 | mddev->private = conf; |
3047 | set_bit(MD_FAILFAST_SUPPORTED, &mddev->flags); | ||
2967 | 3048 | ||
2968 | md_set_array_sectors(mddev, raid1_size(mddev, 0, 0)); | 3049 | md_set_array_sectors(mddev, raid1_size(mddev, 0, 0)); |
2969 | 3050 | ||
@@ -3107,9 +3188,8 @@ static int raid1_reshape(struct mddev *mddev) | |||
3107 | rdev->raid_disk = d2; | 3188 | rdev->raid_disk = d2; |
3108 | sysfs_unlink_rdev(mddev, rdev); | 3189 | sysfs_unlink_rdev(mddev, rdev); |
3109 | if (sysfs_link_rdev(mddev, rdev)) | 3190 | if (sysfs_link_rdev(mddev, rdev)) |
3110 | printk(KERN_WARNING | 3191 | pr_warn("md/raid1:%s: cannot register rd%d\n", |
3111 | "md/raid1:%s: cannot register rd%d\n", | 3192 | mdname(mddev), rdev->raid_disk); |
3112 | mdname(mddev), rdev->raid_disk); | ||
3113 | } | 3193 | } |
3114 | if (rdev) | 3194 | if (rdev) |
3115 | newmirrors[d2++].rdev = rdev; | 3195 | newmirrors[d2++].rdev = rdev; |
@@ -3163,9 +3243,12 @@ static void *raid1_takeover(struct mddev *mddev) | |||
3163 | mddev->new_layout = 0; | 3243 | mddev->new_layout = 0; |
3164 | mddev->new_chunk_sectors = 0; | 3244 | mddev->new_chunk_sectors = 0; |
3165 | conf = setup_conf(mddev); | 3245 | conf = setup_conf(mddev); |
3166 | if (!IS_ERR(conf)) | 3246 | if (!IS_ERR(conf)) { |
3167 | /* Array must appear to be quiesced */ | 3247 | /* Array must appear to be quiesced */ |
3168 | conf->array_frozen = 1; | 3248 | conf->array_frozen = 1; |
3249 | clear_bit(MD_HAS_JOURNAL, &mddev->flags); | ||
3250 | clear_bit(MD_JOURNAL_CLEAN, &mddev->flags); | ||
3251 | } | ||
3169 | return conf; | 3252 | return conf; |
3170 | } | 3253 | } |
3171 | return ERR_PTR(-EINVAL); | 3254 | return ERR_PTR(-EINVAL); |
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h index 61c39b390cd8..c52ef424a24b 100644 --- a/drivers/md/raid1.h +++ b/drivers/md/raid1.h | |||
@@ -161,14 +161,15 @@ struct r1bio { | |||
161 | }; | 161 | }; |
162 | 162 | ||
163 | /* bits for r1bio.state */ | 163 | /* bits for r1bio.state */ |
164 | #define R1BIO_Uptodate 0 | 164 | enum r1bio_state { |
165 | #define R1BIO_IsSync 1 | 165 | R1BIO_Uptodate, |
166 | #define R1BIO_Degraded 2 | 166 | R1BIO_IsSync, |
167 | #define R1BIO_BehindIO 3 | 167 | R1BIO_Degraded, |
168 | R1BIO_BehindIO, | ||
168 | /* Set ReadError on bios that experience a readerror so that | 169 | /* Set ReadError on bios that experience a readerror so that |
169 | * raid1d knows what to do with them. | 170 | * raid1d knows what to do with them. |
170 | */ | 171 | */ |
171 | #define R1BIO_ReadError 4 | 172 | R1BIO_ReadError, |
172 | /* For write-behind requests, we call bi_end_io when | 173 | /* For write-behind requests, we call bi_end_io when |
173 | * the last non-write-behind device completes, providing | 174 | * the last non-write-behind device completes, providing |
174 | * any write was successful. Otherwise we call when | 175 | * any write was successful. Otherwise we call when |
@@ -176,10 +177,12 @@ struct r1bio { | |||
176 | * with failure when last write completes (and all failed). | 177 | * with failure when last write completes (and all failed). |
177 | * Record that bi_end_io was called with this flag... | 178 | * Record that bi_end_io was called with this flag... |
178 | */ | 179 | */ |
179 | #define R1BIO_Returned 6 | 180 | R1BIO_Returned, |
180 | /* If a write for this request means we can clear some | 181 | /* If a write for this request means we can clear some |
181 | * known-bad-block records, we set this flag | 182 | * known-bad-block records, we set this flag |
182 | */ | 183 | */ |
183 | #define R1BIO_MadeGood 7 | 184 | R1BIO_MadeGood, |
184 | #define R1BIO_WriteError 8 | 185 | R1BIO_WriteError, |
186 | R1BIO_FailFast, | ||
187 | }; | ||
185 | #endif | 188 | #endif |
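raid1.h now expresses the r1bio state bits as an enum rather than #defines; since r1_bio->state is purely in-memory, the resulting contiguous renumbering (the old defines skipped bit 5) should be harmless. A userspace sketch of driving a state word with enum bit indices, with set_state()/test_state() standing in for the kernel's set_bit()/test_bit():

/* Sketch of enum-indexed state bits on a plain unsigned long word. */
#include <stdbool.h>
#include <stdio.h>

enum r1bio_state_sketch {
    S_UPTODATE, S_ISSYNC, S_DEGRADED, S_BEHINDIO,
    S_READERROR, S_RETURNED, S_MADEGOOD, S_WRITEERROR, S_FAILFAST,
};

static void set_state(unsigned long *state, enum r1bio_state_sketch bit)
{
    *state |= 1UL << bit;
}

static bool test_state(unsigned long state, enum r1bio_state_sketch bit)
{
    return state & (1UL << bit);
}

int main(void)
{
    unsigned long state = 0;

    set_state(&state, S_FAILFAST);
    printf("failfast=%d uptodate=%d\n",
           test_state(state, S_FAILFAST), test_state(state, S_UPTODATE));
    return 0;
}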
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 39fddda2fef2..ab5e86209322 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c | |||
@@ -25,6 +25,7 @@ | |||
25 | #include <linux/seq_file.h> | 25 | #include <linux/seq_file.h> |
26 | #include <linux/ratelimit.h> | 26 | #include <linux/ratelimit.h> |
27 | #include <linux/kthread.h> | 27 | #include <linux/kthread.h> |
28 | #include <trace/events/block.h> | ||
28 | #include "md.h" | 29 | #include "md.h" |
29 | #include "raid10.h" | 30 | #include "raid10.h" |
30 | #include "raid0.h" | 31 | #include "raid0.h" |
@@ -99,12 +100,16 @@ static int max_queued_requests = 1024; | |||
99 | static void allow_barrier(struct r10conf *conf); | 100 | static void allow_barrier(struct r10conf *conf); |
100 | static void lower_barrier(struct r10conf *conf); | 101 | static void lower_barrier(struct r10conf *conf); |
101 | static int _enough(struct r10conf *conf, int previous, int ignore); | 102 | static int _enough(struct r10conf *conf, int previous, int ignore); |
103 | static int enough(struct r10conf *conf, int ignore); | ||
102 | static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, | 104 | static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, |
103 | int *skipped); | 105 | int *skipped); |
104 | static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio); | 106 | static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio); |
105 | static void end_reshape_write(struct bio *bio); | 107 | static void end_reshape_write(struct bio *bio); |
106 | static void end_reshape(struct r10conf *conf); | 108 | static void end_reshape(struct r10conf *conf); |
107 | 109 | ||
110 | #define raid10_log(md, fmt, args...) \ | ||
111 | do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid10 " fmt, ##args); } while (0) | ||
112 | |||
108 | static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data) | 113 | static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data) |
109 | { | 114 | { |
110 | struct r10conf *conf = data; | 115 | struct r10conf *conf = data; |
@@ -404,8 +409,7 @@ static void raid10_end_read_request(struct bio *bio) | |||
404 | * oops, read error - keep the refcount on the rdev | 409 | * oops, read error - keep the refcount on the rdev |
405 | */ | 410 | */ |
406 | char b[BDEVNAME_SIZE]; | 411 | char b[BDEVNAME_SIZE]; |
407 | printk_ratelimited(KERN_ERR | 412 | pr_err_ratelimited("md/raid10:%s: %s: rescheduling sector %llu\n", |
408 | "md/raid10:%s: %s: rescheduling sector %llu\n", | ||
409 | mdname(conf->mddev), | 413 | mdname(conf->mddev), |
410 | bdevname(rdev->bdev, b), | 414 | bdevname(rdev->bdev, b), |
411 | (unsigned long long)r10_bio->sector); | 415 | (unsigned long long)r10_bio->sector); |
@@ -447,6 +451,7 @@ static void raid10_end_write_request(struct bio *bio) | |||
447 | struct r10conf *conf = r10_bio->mddev->private; | 451 | struct r10conf *conf = r10_bio->mddev->private; |
448 | int slot, repl; | 452 | int slot, repl; |
449 | struct md_rdev *rdev = NULL; | 453 | struct md_rdev *rdev = NULL; |
454 | struct bio *to_put = NULL; | ||
450 | bool discard_error; | 455 | bool discard_error; |
451 | 456 | ||
452 | discard_error = bio->bi_error && bio_op(bio) == REQ_OP_DISCARD; | 457 | discard_error = bio->bi_error && bio_op(bio) == REQ_OP_DISCARD; |
@@ -474,8 +479,24 @@ static void raid10_end_write_request(struct bio *bio) | |||
474 | if (!test_and_set_bit(WantReplacement, &rdev->flags)) | 479 | if (!test_and_set_bit(WantReplacement, &rdev->flags)) |
475 | set_bit(MD_RECOVERY_NEEDED, | 480 | set_bit(MD_RECOVERY_NEEDED, |
476 | &rdev->mddev->recovery); | 481 | &rdev->mddev->recovery); |
477 | set_bit(R10BIO_WriteError, &r10_bio->state); | 482 | |
478 | dec_rdev = 0; | 483 | dec_rdev = 0; |
484 | if (test_bit(FailFast, &rdev->flags) && | ||
485 | (bio->bi_opf & MD_FAILFAST)) { | ||
486 | md_error(rdev->mddev, rdev); | ||
487 | if (!test_bit(Faulty, &rdev->flags)) | ||
488 | /* This is the only remaining device, | ||
489 | * We need to retry the write without | ||
490 | * FailFast | ||
491 | */ | ||
492 | set_bit(R10BIO_WriteError, &r10_bio->state); | ||
493 | else { | ||
494 | r10_bio->devs[slot].bio = NULL; | ||
495 | to_put = bio; | ||
496 | dec_rdev = 1; | ||
497 | } | ||
498 | } else | ||
499 | set_bit(R10BIO_WriteError, &r10_bio->state); | ||
479 | } | 500 | } |
480 | } else { | 501 | } else { |
481 | /* | 502 | /* |
@@ -525,6 +546,8 @@ static void raid10_end_write_request(struct bio *bio) | |||
525 | one_write_done(r10_bio); | 546 | one_write_done(r10_bio); |
526 | if (dec_rdev) | 547 | if (dec_rdev) |
527 | rdev_dec_pending(rdev, conf->mddev); | 548 | rdev_dec_pending(rdev, conf->mddev); |
549 | if (to_put) | ||
550 | bio_put(to_put); | ||
528 | } | 551 | } |
529 | 552 | ||
530 | /* | 553 | /* |
@@ -716,6 +739,7 @@ static struct md_rdev *read_balance(struct r10conf *conf, | |||
716 | best_dist = MaxSector; | 739 | best_dist = MaxSector; |
717 | best_good_sectors = 0; | 740 | best_good_sectors = 0; |
718 | do_balance = 1; | 741 | do_balance = 1; |
742 | clear_bit(R10BIO_FailFast, &r10_bio->state); | ||
719 | /* | 743 | /* |
720 | * Check if we can balance. We can balance on the whole | 744 | * Check if we can balance. We can balance on the whole |
721 | * device if no resync is going on (recovery is ok), or below | 745 | * device if no resync is going on (recovery is ok), or below |
@@ -780,15 +804,18 @@ static struct md_rdev *read_balance(struct r10conf *conf, | |||
780 | if (!do_balance) | 804 | if (!do_balance) |
781 | break; | 805 | break; |
782 | 806 | ||
807 | if (best_slot >= 0) | ||
808 | /* At least 2 disks to choose from so failfast is OK */ | ||
809 | set_bit(R10BIO_FailFast, &r10_bio->state); | ||
783 | /* This optimisation is debatable, and completely destroys | 810 | /* This optimisation is debatable, and completely destroys |
784 | * sequential read speed for 'far copies' arrays. So only | 811 | * sequential read speed for 'far copies' arrays. So only |
785 | * keep it for 'near' arrays, and review those later. | 812 | * keep it for 'near' arrays, and review those later. |
786 | */ | 813 | */ |
787 | if (geo->near_copies > 1 && !atomic_read(&rdev->nr_pending)) | 814 | if (geo->near_copies > 1 && !atomic_read(&rdev->nr_pending)) |
788 | break; | 815 | new_distance = 0; |
789 | 816 | ||
790 | /* for far > 1 always use the lowest address */ | 817 | /* for far > 1 always use the lowest address */ |
791 | if (geo->far_copies > 1) | 818 | else if (geo->far_copies > 1) |
792 | new_distance = r10_bio->devs[slot].addr; | 819 | new_distance = r10_bio->devs[slot].addr; |
793 | else | 820 | else |
794 | new_distance = abs(r10_bio->devs[slot].addr - | 821 | new_distance = abs(r10_bio->devs[slot].addr - |
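In raid10's read_balance() an idle disk in a 'near' layout no longer ends the scan outright; it just scores a distance of zero, so the failfast eligibility check above it still runs for every candidate. A sketch of the revised distance metric with simplified inputs:

/* Sketch of the new_distance scoring in raid10's read_balance(). */
#include <stdio.h>
#include <stdlib.h>

static long candidate_distance(int near_copies, int far_copies,
                               int pending, long slot_addr, long head_position)
{
    if (near_copies > 1 && pending == 0)
        return 0;                      /* idle disk: best possible score */
    if (far_copies > 1)
        return slot_addr;              /* far layouts: lowest address wins */
    return labs(slot_addr - head_position);
}

int main(void)
{
    printf("%ld\n", candidate_distance(2, 1, 0, 1000, 4000)); /* 0: idle near */
    printf("%ld\n", candidate_distance(1, 2, 3, 1000, 4000)); /* 1000: far */
    printf("%ld\n", candidate_distance(1, 1, 3, 1000, 4000)); /* 3000: seek */
    return 0;
}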
@@ -859,9 +886,14 @@ static void flush_pending_writes(struct r10conf *conf) | |||
859 | 886 | ||
860 | while (bio) { /* submit pending writes */ | 887 | while (bio) { /* submit pending writes */ |
861 | struct bio *next = bio->bi_next; | 888 | struct bio *next = bio->bi_next; |
889 | struct md_rdev *rdev = (void*)bio->bi_bdev; | ||
862 | bio->bi_next = NULL; | 890 | bio->bi_next = NULL; |
863 | if (unlikely((bio_op(bio) == REQ_OP_DISCARD) && | 891 | bio->bi_bdev = rdev->bdev; |
864 | !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) | 892 | if (test_bit(Faulty, &rdev->flags)) { |
893 | bio->bi_error = -EIO; | ||
894 | bio_endio(bio); | ||
895 | } else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) && | ||
896 | !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) | ||
865 | /* Just ignore it */ | 897 | /* Just ignore it */ |
866 | bio_endio(bio); | 898 | bio_endio(bio); |
867 | else | 899 | else |
@@ -937,6 +969,7 @@ static void wait_barrier(struct r10conf *conf) | |||
937 | * that queue to get the nr_pending | 969 | * that queue to get the nr_pending |
938 | * count down. | 970 | * count down. |
939 | */ | 971 | */ |
972 | raid10_log(conf->mddev, "wait barrier"); | ||
940 | wait_event_lock_irq(conf->wait_barrier, | 973 | wait_event_lock_irq(conf->wait_barrier, |
941 | !conf->barrier || | 974 | !conf->barrier || |
942 | (atomic_read(&conf->nr_pending) && | 975 | (atomic_read(&conf->nr_pending) && |
@@ -1037,9 +1070,14 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule) | |||
1037 | 1070 | ||
1038 | while (bio) { /* submit pending writes */ | 1071 | while (bio) { /* submit pending writes */ |
1039 | struct bio *next = bio->bi_next; | 1072 | struct bio *next = bio->bi_next; |
1073 | struct md_rdev *rdev = (void*)bio->bi_bdev; | ||
1040 | bio->bi_next = NULL; | 1074 | bio->bi_next = NULL; |
1041 | if (unlikely((bio_op(bio) == REQ_OP_DISCARD) && | 1075 | bio->bi_bdev = rdev->bdev; |
1042 | !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) | 1076 | if (test_bit(Faulty, &rdev->flags)) { |
1077 | bio->bi_error = -EIO; | ||
1078 | bio_endio(bio); | ||
1079 | } else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) && | ||
1080 | !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) | ||
1043 | /* Just ignore it */ | 1081 | /* Just ignore it */ |
1044 | bio_endio(bio); | 1082 | bio_endio(bio); |
1045 | else | 1083 | else |
@@ -1083,6 +1121,7 @@ static void __make_request(struct mddev *mddev, struct bio *bio) | |||
1083 | /* IO spans the reshape position. Need to wait for | 1121 | /* IO spans the reshape position. Need to wait for |
1084 | * reshape to pass | 1122 | * reshape to pass |
1085 | */ | 1123 | */ |
1124 | raid10_log(conf->mddev, "wait reshape"); | ||
1086 | allow_barrier(conf); | 1125 | allow_barrier(conf); |
1087 | wait_event(conf->wait_barrier, | 1126 | wait_event(conf->wait_barrier, |
1088 | conf->reshape_progress <= bio->bi_iter.bi_sector || | 1127 | conf->reshape_progress <= bio->bi_iter.bi_sector || |
@@ -1099,11 +1138,12 @@ static void __make_request(struct mddev *mddev, struct bio *bio) | |||
1099 | bio->bi_iter.bi_sector < conf->reshape_progress))) { | 1138 | bio->bi_iter.bi_sector < conf->reshape_progress))) { |
1100 | /* Need to update reshape_position in metadata */ | 1139 | /* Need to update reshape_position in metadata */ |
1101 | mddev->reshape_position = conf->reshape_progress; | 1140 | mddev->reshape_position = conf->reshape_progress; |
1102 | set_mask_bits(&mddev->flags, 0, | 1141 | set_mask_bits(&mddev->sb_flags, 0, |
1103 | BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING)); | 1142 | BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING)); |
1104 | md_wakeup_thread(mddev->thread); | 1143 | md_wakeup_thread(mddev->thread); |
1144 | raid10_log(conf->mddev, "wait reshape metadata"); | ||
1105 | wait_event(mddev->sb_wait, | 1145 | wait_event(mddev->sb_wait, |
1106 | !test_bit(MD_CHANGE_PENDING, &mddev->flags)); | 1146 | !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); |
1107 | 1147 | ||
1108 | conf->reshape_safe = mddev->reshape_position; | 1148 | conf->reshape_safe = mddev->reshape_position; |
1109 | } | 1149 | } |
@@ -1154,8 +1194,15 @@ read_again: | |||
1154 | read_bio->bi_bdev = rdev->bdev; | 1194 | read_bio->bi_bdev = rdev->bdev; |
1155 | read_bio->bi_end_io = raid10_end_read_request; | 1195 | read_bio->bi_end_io = raid10_end_read_request; |
1156 | bio_set_op_attrs(read_bio, op, do_sync); | 1196 | bio_set_op_attrs(read_bio, op, do_sync); |
1197 | if (test_bit(FailFast, &rdev->flags) && | ||
1198 | test_bit(R10BIO_FailFast, &r10_bio->state)) | ||
1199 | read_bio->bi_opf |= MD_FAILFAST; | ||
1157 | read_bio->bi_private = r10_bio; | 1200 | read_bio->bi_private = r10_bio; |
1158 | 1201 | ||
1202 | if (mddev->gendisk) | ||
1203 | trace_block_bio_remap(bdev_get_queue(read_bio->bi_bdev), | ||
1204 | read_bio, disk_devt(mddev->gendisk), | ||
1205 | r10_bio->sector); | ||
1159 | if (max_sectors < r10_bio->sectors) { | 1206 | if (max_sectors < r10_bio->sectors) { |
1160 | /* Could not read all from this device, so we will | 1207 | /* Could not read all from this device, so we will |
1161 | * need another r10_bio. | 1208 | * need another r10_bio. |
@@ -1195,6 +1242,7 @@ read_again: | |||
1195 | */ | 1242 | */ |
1196 | if (conf->pending_count >= max_queued_requests) { | 1243 | if (conf->pending_count >= max_queued_requests) { |
1197 | md_wakeup_thread(mddev->thread); | 1244 | md_wakeup_thread(mddev->thread); |
1245 | raid10_log(mddev, "wait queued"); | ||
1198 | wait_event(conf->wait_barrier, | 1246 | wait_event(conf->wait_barrier, |
1199 | conf->pending_count < max_queued_requests); | 1247 | conf->pending_count < max_queued_requests); |
1200 | } | 1248 | } |
@@ -1322,6 +1370,7 @@ retry_write: | |||
1322 | } | 1370 | } |
1323 | } | 1371 | } |
1324 | allow_barrier(conf); | 1372 | allow_barrier(conf); |
1373 | raid10_log(conf->mddev, "wait rdev %d blocked", blocked_rdev->raid_disk); | ||
1325 | md_wait_for_blocked_rdev(blocked_rdev, mddev); | 1374 | md_wait_for_blocked_rdev(blocked_rdev, mddev); |
1326 | wait_barrier(conf); | 1375 | wait_barrier(conf); |
1327 | goto retry_write; | 1376 | goto retry_write; |
@@ -1361,8 +1410,18 @@ retry_write: | |||
1361 | mbio->bi_bdev = rdev->bdev; | 1410 | mbio->bi_bdev = rdev->bdev; |
1362 | mbio->bi_end_io = raid10_end_write_request; | 1411 | mbio->bi_end_io = raid10_end_write_request; |
1363 | bio_set_op_attrs(mbio, op, do_sync | do_fua); | 1412 | bio_set_op_attrs(mbio, op, do_sync | do_fua); |
1413 | if (test_bit(FailFast, &conf->mirrors[d].rdev->flags) && | ||
1414 | enough(conf, d)) | ||
1415 | mbio->bi_opf |= MD_FAILFAST; | ||
1364 | mbio->bi_private = r10_bio; | 1416 | mbio->bi_private = r10_bio; |
1365 | 1417 | ||
1418 | if (conf->mddev->gendisk) | ||
1419 | trace_block_bio_remap(bdev_get_queue(mbio->bi_bdev), | ||
1420 | mbio, disk_devt(conf->mddev->gendisk), | ||
1421 | r10_bio->sector); | ||
1422 | /* flush_pending_writes() needs access to the rdev so...*/ | ||
1423 | mbio->bi_bdev = (void*)rdev; | ||
1424 | |||
1366 | atomic_inc(&r10_bio->remaining); | 1425 | atomic_inc(&r10_bio->remaining); |
1367 | 1426 | ||
1368 | cb = blk_check_plugged(raid10_unplug, mddev, | 1427 | cb = blk_check_plugged(raid10_unplug, mddev, |
@@ -1405,6 +1464,13 @@ retry_write: | |||
1405 | bio_set_op_attrs(mbio, op, do_sync | do_fua); | 1464 | bio_set_op_attrs(mbio, op, do_sync | do_fua); |
1406 | mbio->bi_private = r10_bio; | 1465 | mbio->bi_private = r10_bio; |
1407 | 1466 | ||
1467 | if (conf->mddev->gendisk) | ||
1468 | trace_block_bio_remap(bdev_get_queue(mbio->bi_bdev), | ||
1469 | mbio, disk_devt(conf->mddev->gendisk), | ||
1470 | r10_bio->sector); | ||
1471 | /* flush_pending_writes() needs access to the rdev so...*/ | ||
1472 | mbio->bi_bdev = (void*)rdev; | ||
1473 | |||
1408 | atomic_inc(&r10_bio->remaining); | 1474 | atomic_inc(&r10_bio->remaining); |
1409 | spin_lock_irqsave(&conf->device_lock, flags); | 1475 | spin_lock_irqsave(&conf->device_lock, flags); |
1410 | bio_list_add(&conf->pending_bio_list, mbio); | 1476 | bio_list_add(&conf->pending_bio_list, mbio); |
@@ -1586,14 +1652,13 @@ static void raid10_error(struct mddev *mddev, struct md_rdev *rdev) | |||
1586 | set_bit(MD_RECOVERY_INTR, &mddev->recovery); | 1652 | set_bit(MD_RECOVERY_INTR, &mddev->recovery); |
1587 | set_bit(Blocked, &rdev->flags); | 1653 | set_bit(Blocked, &rdev->flags); |
1588 | set_bit(Faulty, &rdev->flags); | 1654 | set_bit(Faulty, &rdev->flags); |
1589 | set_mask_bits(&mddev->flags, 0, | 1655 | set_mask_bits(&mddev->sb_flags, 0, |
1590 | BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING)); | 1656 | BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING)); |
1591 | spin_unlock_irqrestore(&conf->device_lock, flags); | 1657 | spin_unlock_irqrestore(&conf->device_lock, flags); |
1592 | printk(KERN_ALERT | 1658 | pr_crit("md/raid10:%s: Disk failure on %s, disabling device.\n" |
1593 | "md/raid10:%s: Disk failure on %s, disabling device.\n" | 1659 | "md/raid10:%s: Operation continuing on %d devices.\n", |
1594 | "md/raid10:%s: Operation continuing on %d devices.\n", | 1660 | mdname(mddev), bdevname(rdev->bdev, b), |
1595 | mdname(mddev), bdevname(rdev->bdev, b), | 1661 | mdname(mddev), conf->geo.raid_disks - mddev->degraded); |
1596 | mdname(mddev), conf->geo.raid_disks - mddev->degraded); | ||
1597 | } | 1662 | } |
1598 | 1663 | ||
1599 | static void print_conf(struct r10conf *conf) | 1664 | static void print_conf(struct r10conf *conf) |
@@ -1601,13 +1666,13 @@ static void print_conf(struct r10conf *conf) | |||
1601 | int i; | 1666 | int i; |
1602 | struct md_rdev *rdev; | 1667 | struct md_rdev *rdev; |
1603 | 1668 | ||
1604 | printk(KERN_DEBUG "RAID10 conf printout:\n"); | 1669 | pr_debug("RAID10 conf printout:\n"); |
1605 | if (!conf) { | 1670 | if (!conf) { |
1606 | printk(KERN_DEBUG "(!conf)\n"); | 1671 | pr_debug("(!conf)\n"); |
1607 | return; | 1672 | return; |
1608 | } | 1673 | } |
1609 | printk(KERN_DEBUG " --- wd:%d rd:%d\n", conf->geo.raid_disks - conf->mddev->degraded, | 1674 | pr_debug(" --- wd:%d rd:%d\n", conf->geo.raid_disks - conf->mddev->degraded, |
1610 | conf->geo.raid_disks); | 1675 | conf->geo.raid_disks); |
1611 | 1676 | ||
1612 | /* This is only called with ->reconfig_mutex held, so | 1677 | /* This is only called with ->reconfig_mutex held, so |
1613 | * rcu protection of rdev is not needed */ | 1678 | * rcu protection of rdev is not needed */ |
@@ -1615,10 +1680,10 @@ static void print_conf(struct r10conf *conf) | |||
1615 | char b[BDEVNAME_SIZE]; | 1680 | char b[BDEVNAME_SIZE]; |
1616 | rdev = conf->mirrors[i].rdev; | 1681 | rdev = conf->mirrors[i].rdev; |
1617 | if (rdev) | 1682 | if (rdev) |
1618 | printk(KERN_DEBUG " disk %d, wo:%d, o:%d, dev:%s\n", | 1683 | pr_debug(" disk %d, wo:%d, o:%d, dev:%s\n", |
1619 | i, !test_bit(In_sync, &rdev->flags), | 1684 | i, !test_bit(In_sync, &rdev->flags), |
1620 | !test_bit(Faulty, &rdev->flags), | 1685 | !test_bit(Faulty, &rdev->flags), |
1621 | bdevname(rdev->bdev,b)); | 1686 | bdevname(rdev->bdev,b)); |
1622 | } | 1687 | } |
1623 | } | 1688 | } |
1624 | 1689 | ||
@@ -1953,6 +2018,7 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio) | |||
1953 | /* now find blocks with errors */ | 2018 | /* now find blocks with errors */ |
1954 | for (i=0 ; i < conf->copies ; i++) { | 2019 | for (i=0 ; i < conf->copies ; i++) { |
1955 | int j, d; | 2020 | int j, d; |
2021 | struct md_rdev *rdev; | ||
1956 | 2022 | ||
1957 | tbio = r10_bio->devs[i].bio; | 2023 | tbio = r10_bio->devs[i].bio; |
1958 | 2024 | ||
@@ -1960,6 +2026,8 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio) | |||
1960 | continue; | 2026 | continue; |
1961 | if (i == first) | 2027 | if (i == first) |
1962 | continue; | 2028 | continue; |
2029 | d = r10_bio->devs[i].devnum; | ||
2030 | rdev = conf->mirrors[d].rdev; | ||
1963 | if (!r10_bio->devs[i].bio->bi_error) { | 2031 | if (!r10_bio->devs[i].bio->bi_error) { |
1964 | /* We know that the bi_io_vec layout is the same for | 2032 | /* We know that the bi_io_vec layout is the same for |
1965 | * both 'first' and 'i', so we just compare them. | 2033 | * both 'first' and 'i', so we just compare them. |
@@ -1982,6 +2050,10 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio) | |||
1982 | if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) | 2050 | if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) |
1983 | /* Don't fix anything. */ | 2051 | /* Don't fix anything. */ |
1984 | continue; | 2052 | continue; |
2053 | } else if (test_bit(FailFast, &rdev->flags)) { | ||
2054 | /* Just give up on this device */ | ||
2055 | md_error(rdev->mddev, rdev); | ||
2056 | continue; | ||
1985 | } | 2057 | } |
1986 | /* Ok, we need to write this bio, either to correct an | 2058 | /* Ok, we need to write this bio, either to correct an |
1987 | * inconsistency or to correct an unreadable block. | 2059 | * inconsistency or to correct an unreadable block. |
@@ -1999,11 +2071,12 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio) | |||
1999 | 2071 | ||
2000 | bio_copy_data(tbio, fbio); | 2072 | bio_copy_data(tbio, fbio); |
2001 | 2073 | ||
2002 | d = r10_bio->devs[i].devnum; | ||
2003 | atomic_inc(&conf->mirrors[d].rdev->nr_pending); | 2074 | atomic_inc(&conf->mirrors[d].rdev->nr_pending); |
2004 | atomic_inc(&r10_bio->remaining); | 2075 | atomic_inc(&r10_bio->remaining); |
2005 | md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(tbio)); | 2076 | md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(tbio)); |
2006 | 2077 | ||
2078 | if (test_bit(FailFast, &conf->mirrors[d].rdev->flags)) | ||
2079 | tbio->bi_opf |= MD_FAILFAST; | ||
2007 | tbio->bi_iter.bi_sector += conf->mirrors[d].rdev->data_offset; | 2080 | tbio->bi_iter.bi_sector += conf->mirrors[d].rdev->data_offset; |
2008 | tbio->bi_bdev = conf->mirrors[d].rdev->bdev; | 2081 | tbio->bi_bdev = conf->mirrors[d].rdev->bdev; |
2009 | generic_make_request(tbio); | 2082 | generic_make_request(tbio); |
@@ -2109,10 +2182,8 @@ static void fix_recovery_read_error(struct r10bio *r10_bio) | |||
2109 | ok = rdev_set_badblocks(rdev2, addr, s, 0); | 2182 | ok = rdev_set_badblocks(rdev2, addr, s, 0); |
2110 | if (!ok) { | 2183 | if (!ok) { |
2111 | /* just abort the recovery */ | 2184 | /* just abort the recovery */ |
2112 | printk(KERN_NOTICE | 2185 | pr_notice("md/raid10:%s: recovery aborted due to read error\n", |
2113 | "md/raid10:%s: recovery aborted" | 2186 | mdname(mddev)); |
2114 | " due to read error\n", | ||
2115 | mdname(mddev)); | ||
2116 | 2187 | ||
2117 | conf->mirrors[dw].recovery_disabled | 2188 | conf->mirrors[dw].recovery_disabled |
2118 | = mddev->recovery_disabled; | 2189 | = mddev->recovery_disabled; |
@@ -2259,14 +2330,11 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 | |||
2259 | char b[BDEVNAME_SIZE]; | 2330 | char b[BDEVNAME_SIZE]; |
2260 | bdevname(rdev->bdev, b); | 2331 | bdevname(rdev->bdev, b); |
2261 | 2332 | ||
2262 | printk(KERN_NOTICE | 2333 | pr_notice("md/raid10:%s: %s: Raid device exceeded read_error threshold [cur %d:max %d]\n", |
2263 | "md/raid10:%s: %s: Raid device exceeded " | 2334 | mdname(mddev), b, |
2264 | "read_error threshold [cur %d:max %d]\n", | 2335 | atomic_read(&rdev->read_errors), max_read_errors); |
2265 | mdname(mddev), b, | 2336 | pr_notice("md/raid10:%s: %s: Failing raid device\n", |
2266 | atomic_read(&rdev->read_errors), max_read_errors); | 2337 | mdname(mddev), b); |
2267 | printk(KERN_NOTICE | ||
2268 | "md/raid10:%s: %s: Failing raid device\n", | ||
2269 | mdname(mddev), b); | ||
2270 | md_error(mddev, rdev); | 2338 | md_error(mddev, rdev); |
2271 | r10_bio->devs[r10_bio->read_slot].bio = IO_BLOCKED; | 2339 | r10_bio->devs[r10_bio->read_slot].bio = IO_BLOCKED; |
2272 | return; | 2340 | return; |
@@ -2356,20 +2424,16 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 | |||
2356 | s, conf->tmppage, WRITE) | 2424 | s, conf->tmppage, WRITE) |
2357 | == 0) { | 2425 | == 0) { |
2358 | /* Well, this device is dead */ | 2426 | /* Well, this device is dead */ |
2359 | printk(KERN_NOTICE | 2427 | pr_notice("md/raid10:%s: read correction write failed (%d sectors at %llu on %s)\n", |
2360 | "md/raid10:%s: read correction " | 2428 | mdname(mddev), s, |
2361 | "write failed" | 2429 | (unsigned long long)( |
2362 | " (%d sectors at %llu on %s)\n", | 2430 | sect + |
2363 | mdname(mddev), s, | 2431 | choose_data_offset(r10_bio, |
2364 | (unsigned long long)( | 2432 | rdev)), |
2365 | sect + | 2433 | bdevname(rdev->bdev, b)); |
2366 | choose_data_offset(r10_bio, | 2434 | pr_notice("md/raid10:%s: %s: failing drive\n", |
2367 | rdev)), | 2435 | mdname(mddev), |
2368 | bdevname(rdev->bdev, b)); | 2436 | bdevname(rdev->bdev, b)); |
2369 | printk(KERN_NOTICE "md/raid10:%s: %s: failing " | ||
2370 | "drive\n", | ||
2371 | mdname(mddev), | ||
2372 | bdevname(rdev->bdev, b)); | ||
2373 | } | 2437 | } |
2374 | rdev_dec_pending(rdev, mddev); | 2438 | rdev_dec_pending(rdev, mddev); |
2375 | rcu_read_lock(); | 2439 | rcu_read_lock(); |
@@ -2397,24 +2461,18 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 | |||
2397 | READ)) { | 2461 | READ)) { |
2398 | case 0: | 2462 | case 0: |
2399 | /* Well, this device is dead */ | 2463 | /* Well, this device is dead */ |
2400 | printk(KERN_NOTICE | 2464 | pr_notice("md/raid10:%s: unable to read back corrected sectors (%d sectors at %llu on %s)\n", |
2401 | "md/raid10:%s: unable to read back " | ||
2402 | "corrected sectors" | ||
2403 | " (%d sectors at %llu on %s)\n", | ||
2404 | mdname(mddev), s, | 2465 | mdname(mddev), s, |
2405 | (unsigned long long)( | 2466 | (unsigned long long)( |
2406 | sect + | 2467 | sect + |
2407 | choose_data_offset(r10_bio, rdev)), | 2468 | choose_data_offset(r10_bio, rdev)), |
2408 | bdevname(rdev->bdev, b)); | 2469 | bdevname(rdev->bdev, b)); |
2409 | printk(KERN_NOTICE "md/raid10:%s: %s: failing " | 2470 | pr_notice("md/raid10:%s: %s: failing drive\n", |
2410 | "drive\n", | ||
2411 | mdname(mddev), | 2471 | mdname(mddev), |
2412 | bdevname(rdev->bdev, b)); | 2472 | bdevname(rdev->bdev, b)); |
2413 | break; | 2473 | break; |
2414 | case 1: | 2474 | case 1: |
2415 | printk(KERN_INFO | 2475 | pr_info("md/raid10:%s: read error corrected (%d sectors at %llu on %s)\n", |
2416 | "md/raid10:%s: read error corrected" | ||
2417 | " (%d sectors at %llu on %s)\n", | ||
2418 | mdname(mddev), s, | 2476 | mdname(mddev), s, |
2419 | (unsigned long long)( | 2477 | (unsigned long long)( |
2420 | sect + | 2478 | sect + |
@@ -2503,6 +2561,8 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio) | |||
2503 | char b[BDEVNAME_SIZE]; | 2561 | char b[BDEVNAME_SIZE]; |
2504 | unsigned long do_sync; | 2562 | unsigned long do_sync; |
2505 | int max_sectors; | 2563 | int max_sectors; |
2564 | dev_t bio_dev; | ||
2565 | sector_t bio_last_sector; | ||
2506 | 2566 | ||
2507 | /* we got a read error. Maybe the drive is bad. Maybe just | 2567 | /* we got a read error. Maybe the drive is bad. Maybe just |
2508 | * the block and we can fix it. | 2568 | * the block and we can fix it. |
@@ -2514,38 +2574,38 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio) | |||
2514 | */ | 2574 | */ |
2515 | bio = r10_bio->devs[slot].bio; | 2575 | bio = r10_bio->devs[slot].bio; |
2516 | bdevname(bio->bi_bdev, b); | 2576 | bdevname(bio->bi_bdev, b); |
2577 | bio_dev = bio->bi_bdev->bd_dev; | ||
2578 | bio_last_sector = r10_bio->devs[slot].addr + rdev->data_offset + r10_bio->sectors; | ||
2517 | bio_put(bio); | 2579 | bio_put(bio); |
2518 | r10_bio->devs[slot].bio = NULL; | 2580 | r10_bio->devs[slot].bio = NULL; |
2519 | 2581 | ||
2520 | if (mddev->ro == 0) { | 2582 | if (mddev->ro) |
2583 | r10_bio->devs[slot].bio = IO_BLOCKED; | ||
2584 | else if (!test_bit(FailFast, &rdev->flags)) { | ||
2521 | freeze_array(conf, 1); | 2585 | freeze_array(conf, 1); |
2522 | fix_read_error(conf, mddev, r10_bio); | 2586 | fix_read_error(conf, mddev, r10_bio); |
2523 | unfreeze_array(conf); | 2587 | unfreeze_array(conf); |
2524 | } else | 2588 | } else |
2525 | r10_bio->devs[slot].bio = IO_BLOCKED; | 2589 | md_error(mddev, rdev); |
2526 | 2590 | ||
2527 | rdev_dec_pending(rdev, mddev); | 2591 | rdev_dec_pending(rdev, mddev); |
2528 | 2592 | ||
2529 | read_more: | 2593 | read_more: |
2530 | rdev = read_balance(conf, r10_bio, &max_sectors); | 2594 | rdev = read_balance(conf, r10_bio, &max_sectors); |
2531 | if (rdev == NULL) { | 2595 | if (rdev == NULL) { |
2532 | printk(KERN_ALERT "md/raid10:%s: %s: unrecoverable I/O" | 2596 | pr_crit_ratelimited("md/raid10:%s: %s: unrecoverable I/O read error for block %llu\n", |
2533 | " read error for block %llu\n", | 2597 | mdname(mddev), b, |
2534 | mdname(mddev), b, | 2598 | (unsigned long long)r10_bio->sector); |
2535 | (unsigned long long)r10_bio->sector); | ||
2536 | raid_end_bio_io(r10_bio); | 2599 | raid_end_bio_io(r10_bio); |
2537 | return; | 2600 | return; |
2538 | } | 2601 | } |
2539 | 2602 | ||
2540 | do_sync = (r10_bio->master_bio->bi_opf & REQ_SYNC); | 2603 | do_sync = (r10_bio->master_bio->bi_opf & REQ_SYNC); |
2541 | slot = r10_bio->read_slot; | 2604 | slot = r10_bio->read_slot; |
2542 | printk_ratelimited( | 2605 | pr_err_ratelimited("md/raid10:%s: %s: redirecting sector %llu to another mirror\n", |
2543 | KERN_ERR | 2606 | mdname(mddev), |
2544 | "md/raid10:%s: %s: redirecting " | 2607 | bdevname(rdev->bdev, b), |
2545 | "sector %llu to another mirror\n", | 2608 | (unsigned long long)r10_bio->sector); |
2546 | mdname(mddev), | ||
2547 | bdevname(rdev->bdev, b), | ||
2548 | (unsigned long long)r10_bio->sector); | ||
2549 | bio = bio_clone_mddev(r10_bio->master_bio, | 2609 | bio = bio_clone_mddev(r10_bio->master_bio, |
2550 | GFP_NOIO, mddev); | 2610 | GFP_NOIO, mddev); |
2551 | bio_trim(bio, r10_bio->sector - bio->bi_iter.bi_sector, max_sectors); | 2611 | bio_trim(bio, r10_bio->sector - bio->bi_iter.bi_sector, max_sectors); |
@@ -2555,8 +2615,15 @@ read_more: | |||
2555 | + choose_data_offset(r10_bio, rdev); | 2615 | + choose_data_offset(r10_bio, rdev); |
2556 | bio->bi_bdev = rdev->bdev; | 2616 | bio->bi_bdev = rdev->bdev; |
2557 | bio_set_op_attrs(bio, REQ_OP_READ, do_sync); | 2617 | bio_set_op_attrs(bio, REQ_OP_READ, do_sync); |
2618 | if (test_bit(FailFast, &rdev->flags) && | ||
2619 | test_bit(R10BIO_FailFast, &r10_bio->state)) | ||
2620 | bio->bi_opf |= MD_FAILFAST; | ||
2558 | bio->bi_private = r10_bio; | 2621 | bio->bi_private = r10_bio; |
2559 | bio->bi_end_io = raid10_end_read_request; | 2622 | bio->bi_end_io = raid10_end_read_request; |
2623 | trace_block_bio_remap(bdev_get_queue(bio->bi_bdev), | ||
2624 | bio, bio_dev, | ||
2625 | bio_last_sector - r10_bio->sectors); | ||
2626 | |||
2560 | if (max_sectors < r10_bio->sectors) { | 2627 | if (max_sectors < r10_bio->sectors) { |
2561 | /* Drat - have to split this up more */ | 2628 | /* Drat - have to split this up more */ |
2562 | struct bio *mbio = r10_bio->master_bio; | 2629 | struct bio *mbio = r10_bio->master_bio; |
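The rewritten error path above is a three-way decision: a read-only array simply blocks the failing slot, a device flagged FailFast is failed straight away, and everything else goes through the freeze / fix_read_error / unfreeze cycle. A minimal userspace sketch of that ordering, with stand-in types invented for illustration (not the kernel structures):

#include <stdbool.h>

/* simplified stand-ins for the kernel structures involved */
struct rdev_model  { bool failfast; };
struct array_model { bool read_only; };

enum read_error_action { BLOCK_SLOT, FIX_READ_ERROR, FAIL_DEVICE };

/* mirrors the new handle_read_error() ordering: ro check first,
 * then the FailFast shortcut, otherwise try to repair the block */
static enum read_error_action
classify_read_error(const struct array_model *mddev, const struct rdev_model *rdev)
{
	if (mddev->read_only)
		return BLOCK_SLOT;	/* r10_bio->devs[slot].bio = IO_BLOCKED */
	if (!rdev->failfast)
		return FIX_READ_ERROR;	/* freeze_array() + fix_read_error() + unfreeze_array() */
	return FAIL_DEVICE;		/* md_error(mddev, rdev) */
}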
@@ -2694,10 +2761,10 @@ static void raid10d(struct md_thread *thread) | |||
2694 | md_check_recovery(mddev); | 2761 | md_check_recovery(mddev); |
2695 | 2762 | ||
2696 | if (!list_empty_careful(&conf->bio_end_io_list) && | 2763 | if (!list_empty_careful(&conf->bio_end_io_list) && |
2697 | !test_bit(MD_CHANGE_PENDING, &mddev->flags)) { | 2764 | !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) { |
2698 | LIST_HEAD(tmp); | 2765 | LIST_HEAD(tmp); |
2699 | spin_lock_irqsave(&conf->device_lock, flags); | 2766 | spin_lock_irqsave(&conf->device_lock, flags); |
2700 | if (!test_bit(MD_CHANGE_PENDING, &mddev->flags)) { | 2767 | if (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) { |
2701 | while (!list_empty(&conf->bio_end_io_list)) { | 2768 | while (!list_empty(&conf->bio_end_io_list)) { |
2702 | list_move(conf->bio_end_io_list.prev, &tmp); | 2769 | list_move(conf->bio_end_io_list.prev, &tmp); |
2703 | conf->nr_queued--; | 2770 | conf->nr_queued--; |
@@ -2755,7 +2822,7 @@ static void raid10d(struct md_thread *thread) | |||
2755 | } | 2822 | } |
2756 | 2823 | ||
2757 | cond_resched(); | 2824 | cond_resched(); |
2758 | if (mddev->flags & ~(1<<MD_CHANGE_PENDING)) | 2825 | if (mddev->sb_flags & ~(1<<MD_SB_CHANGE_PENDING)) |
2759 | md_check_recovery(mddev); | 2826 | md_check_recovery(mddev); |
2760 | } | 2827 | } |
2761 | blk_finish_plug(&plug); | 2828 | blk_finish_plug(&plug); |
@@ -3072,6 +3139,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, | |||
3072 | bio->bi_private = r10_bio; | 3139 | bio->bi_private = r10_bio; |
3073 | bio->bi_end_io = end_sync_read; | 3140 | bio->bi_end_io = end_sync_read; |
3074 | bio_set_op_attrs(bio, REQ_OP_READ, 0); | 3141 | bio_set_op_attrs(bio, REQ_OP_READ, 0); |
3142 | if (test_bit(FailFast, &rdev->flags)) | ||
3143 | bio->bi_opf |= MD_FAILFAST; | ||
3075 | from_addr = r10_bio->devs[j].addr; | 3144 | from_addr = r10_bio->devs[j].addr; |
3076 | bio->bi_iter.bi_sector = from_addr + | 3145 | bio->bi_iter.bi_sector = from_addr + |
3077 | rdev->data_offset; | 3146 | rdev->data_offset; |
@@ -3160,8 +3229,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, | |||
3160 | if (!any_working) { | 3229 | if (!any_working) { |
3161 | if (!test_and_set_bit(MD_RECOVERY_INTR, | 3230 | if (!test_and_set_bit(MD_RECOVERY_INTR, |
3162 | &mddev->recovery)) | 3231 | &mddev->recovery)) |
3163 | printk(KERN_INFO "md/raid10:%s: insufficient " | 3232 | pr_warn("md/raid10:%s: insufficient working devices for recovery.\n", |
3164 | "working devices for recovery.\n", | ||
3165 | mdname(mddev)); | 3233 | mdname(mddev)); |
3166 | mirror->recovery_disabled | 3234 | mirror->recovery_disabled |
3167 | = mddev->recovery_disabled; | 3235 | = mddev->recovery_disabled; |
@@ -3178,6 +3246,23 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, | |||
3178 | rdev_dec_pending(mrdev, mddev); | 3246 | rdev_dec_pending(mrdev, mddev); |
3179 | if (mreplace) | 3247 | if (mreplace) |
3180 | rdev_dec_pending(mreplace, mddev); | 3248 | rdev_dec_pending(mreplace, mddev); |
3249 | if (r10_bio->devs[0].bio->bi_opf & MD_FAILFAST) { | ||
3250 | /* Only want this if there is elsewhere to | ||
3251 | * read from. 'j' is currently the first | ||
3252 | * readable copy. | ||
3253 | */ | ||
3254 | int targets = 1; | ||
3255 | for (; j < conf->copies; j++) { | ||
3256 | int d = r10_bio->devs[j].devnum; | ||
3257 | if (conf->mirrors[d].rdev && | ||
3258 | test_bit(In_sync, | ||
3259 | &conf->mirrors[d].rdev->flags)) | ||
3260 | targets++; | ||
3261 | } | ||
3262 | if (targets == 1) | ||
3263 | r10_bio->devs[0].bio->bi_opf | ||
3264 | &= ~MD_FAILFAST; | ||
3265 | } | ||
3181 | } | 3266 | } |
3182 | if (biolist == NULL) { | 3267 | if (biolist == NULL) { |
3183 | while (r10_bio) { | 3268 | while (r10_bio) { |
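The hunk above strips MD_FAILFAST from the recovery read when the copy at 'j' turns out to be the only in-sync source, so a fast failure cannot leave recovery with nowhere else to read from. A hedged sketch of that counting rule, using simplified stand-in fields:

#include <stdbool.h>

struct mirror_model { bool present; bool in_sync; };

/* Count every remaining in-sync copy, starting from the one being read.
 * With a single readable copy, a fast-fail would leave no fallback, so
 * the caller should clear MD_FAILFAST on the recovery read. */
static bool keep_failfast(const struct mirror_model *mirrors, int first_readable, int copies)
{
	int targets = 0;
	int j;

	for (j = first_readable; j < copies; j++)
		if (mirrors[j].present && mirrors[j].in_sync)
			targets++;
	return targets > 1;
}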
@@ -3256,6 +3341,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, | |||
3256 | bio->bi_private = r10_bio; | 3341 | bio->bi_private = r10_bio; |
3257 | bio->bi_end_io = end_sync_read; | 3342 | bio->bi_end_io = end_sync_read; |
3258 | bio_set_op_attrs(bio, REQ_OP_READ, 0); | 3343 | bio_set_op_attrs(bio, REQ_OP_READ, 0); |
3344 | if (test_bit(FailFast, &conf->mirrors[d].rdev->flags)) | ||
3345 | bio->bi_opf |= MD_FAILFAST; | ||
3259 | bio->bi_iter.bi_sector = sector + rdev->data_offset; | 3346 | bio->bi_iter.bi_sector = sector + rdev->data_offset; |
3260 | bio->bi_bdev = rdev->bdev; | 3347 | bio->bi_bdev = rdev->bdev; |
3261 | count++; | 3348 | count++; |
@@ -3279,6 +3366,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, | |||
3279 | bio->bi_private = r10_bio; | 3366 | bio->bi_private = r10_bio; |
3280 | bio->bi_end_io = end_sync_write; | 3367 | bio->bi_end_io = end_sync_write; |
3281 | bio_set_op_attrs(bio, REQ_OP_WRITE, 0); | 3368 | bio_set_op_attrs(bio, REQ_OP_WRITE, 0); |
3369 | if (test_bit(FailFast, &conf->mirrors[d].rdev->flags)) | ||
3370 | bio->bi_opf |= MD_FAILFAST; | ||
3282 | bio->bi_iter.bi_sector = sector + rdev->data_offset; | 3371 | bio->bi_iter.bi_sector = sector + rdev->data_offset; |
3283 | bio->bi_bdev = rdev->bdev; | 3372 | bio->bi_bdev = rdev->bdev; |
3284 | count++; | 3373 | count++; |
@@ -3489,15 +3578,14 @@ static struct r10conf *setup_conf(struct mddev *mddev) | |||
3489 | copies = setup_geo(&geo, mddev, geo_new); | 3578 | copies = setup_geo(&geo, mddev, geo_new); |
3490 | 3579 | ||
3491 | if (copies == -2) { | 3580 | if (copies == -2) { |
3492 | printk(KERN_ERR "md/raid10:%s: chunk size must be " | 3581 | pr_warn("md/raid10:%s: chunk size must be at least PAGE_SIZE(%ld) and be a power of 2.\n", |
3493 | "at least PAGE_SIZE(%ld) and be a power of 2.\n", | 3582 | mdname(mddev), PAGE_SIZE); |
3494 | mdname(mddev), PAGE_SIZE); | ||
3495 | goto out; | 3583 | goto out; |
3496 | } | 3584 | } |
3497 | 3585 | ||
3498 | if (copies < 2 || copies > mddev->raid_disks) { | 3586 | if (copies < 2 || copies > mddev->raid_disks) { |
3499 | printk(KERN_ERR "md/raid10:%s: unsupported raid10 layout: 0x%8x\n", | 3587 | pr_warn("md/raid10:%s: unsupported raid10 layout: 0x%8x\n", |
3500 | mdname(mddev), mddev->new_layout); | 3588 | mdname(mddev), mddev->new_layout); |
3501 | goto out; | 3589 | goto out; |
3502 | } | 3590 | } |
3503 | 3591 | ||
@@ -3557,9 +3645,6 @@ static struct r10conf *setup_conf(struct mddev *mddev) | |||
3557 | return conf; | 3645 | return conf; |
3558 | 3646 | ||
3559 | out: | 3647 | out: |
3560 | if (err == -ENOMEM) | ||
3561 | printk(KERN_ERR "md/raid10:%s: couldn't allocate memory.\n", | ||
3562 | mdname(mddev)); | ||
3563 | if (conf) { | 3648 | if (conf) { |
3564 | mempool_destroy(conf->r10bio_pool); | 3649 | mempool_destroy(conf->r10bio_pool); |
3565 | kfree(conf->mirrors); | 3650 | kfree(conf->mirrors); |
@@ -3656,7 +3741,7 @@ static int raid10_run(struct mddev *mddev) | |||
3656 | } | 3741 | } |
3657 | /* need to check that every block has at least one working mirror */ | 3742 | /* need to check that every block has at least one working mirror */ |
3658 | if (!enough(conf, -1)) { | 3743 | if (!enough(conf, -1)) { |
3659 | printk(KERN_ERR "md/raid10:%s: not enough operational mirrors.\n", | 3744 | pr_err("md/raid10:%s: not enough operational mirrors.\n", |
3660 | mdname(mddev)); | 3745 | mdname(mddev)); |
3661 | goto out_free_conf; | 3746 | goto out_free_conf; |
3662 | } | 3747 | } |
@@ -3698,11 +3783,9 @@ static int raid10_run(struct mddev *mddev) | |||
3698 | } | 3783 | } |
3699 | 3784 | ||
3700 | if (mddev->recovery_cp != MaxSector) | 3785 | if (mddev->recovery_cp != MaxSector) |
3701 | printk(KERN_NOTICE "md/raid10:%s: not clean" | 3786 | pr_notice("md/raid10:%s: not clean -- starting background reconstruction\n", |
3702 | " -- starting background reconstruction\n", | 3787 | mdname(mddev)); |
3703 | mdname(mddev)); | 3788 | pr_info("md/raid10:%s: active with %d out of %d devices\n", |
3704 | printk(KERN_INFO | ||
3705 | "md/raid10:%s: active with %d out of %d devices\n", | ||
3706 | mdname(mddev), conf->geo.raid_disks - mddev->degraded, | 3789 | mdname(mddev), conf->geo.raid_disks - mddev->degraded, |
3707 | conf->geo.raid_disks); | 3790 | conf->geo.raid_disks); |
3708 | /* | 3791 | /* |
@@ -3712,6 +3795,7 @@ static int raid10_run(struct mddev *mddev) | |||
3712 | size = raid10_size(mddev, 0, 0); | 3795 | size = raid10_size(mddev, 0, 0); |
3713 | md_set_array_sectors(mddev, size); | 3796 | md_set_array_sectors(mddev, size); |
3714 | mddev->resync_max_sectors = size; | 3797 | mddev->resync_max_sectors = size; |
3798 | set_bit(MD_FAILFAST_SUPPORTED, &mddev->flags); | ||
3715 | 3799 | ||
3716 | if (mddev->queue) { | 3800 | if (mddev->queue) { |
3717 | int stripe = conf->geo.raid_disks * | 3801 | int stripe = conf->geo.raid_disks * |
@@ -3739,7 +3823,7 @@ static int raid10_run(struct mddev *mddev) | |||
3739 | 3823 | ||
3740 | if (max(before_length, after_length) > min_offset_diff) { | 3824 | if (max(before_length, after_length) > min_offset_diff) { |
3741 | /* This cannot work */ | 3825 | /* This cannot work */ |
3742 | printk("md/raid10: offset difference not enough to continue reshape\n"); | 3826 | pr_warn("md/raid10: offset difference not enough to continue reshape\n"); |
3743 | goto out_free_conf; | 3827 | goto out_free_conf; |
3744 | } | 3828 | } |
3745 | conf->offset_diff = min_offset_diff; | 3829 | conf->offset_diff = min_offset_diff; |
@@ -3846,8 +3930,8 @@ static void *raid10_takeover_raid0(struct mddev *mddev, sector_t size, int devs) | |||
3846 | struct r10conf *conf; | 3930 | struct r10conf *conf; |
3847 | 3931 | ||
3848 | if (mddev->degraded > 0) { | 3932 | if (mddev->degraded > 0) { |
3849 | printk(KERN_ERR "md/raid10:%s: Error: degraded raid0!\n", | 3933 | pr_warn("md/raid10:%s: Error: degraded raid0!\n", |
3850 | mdname(mddev)); | 3934 | mdname(mddev)); |
3851 | return ERR_PTR(-EINVAL); | 3935 | return ERR_PTR(-EINVAL); |
3852 | } | 3936 | } |
3853 | sector_div(size, devs); | 3937 | sector_div(size, devs); |
@@ -3887,9 +3971,8 @@ static void *raid10_takeover(struct mddev *mddev) | |||
3887 | /* for raid0 takeover only one zone is supported */ | 3971 | /* for raid0 takeover only one zone is supported */ |
3888 | raid0_conf = mddev->private; | 3972 | raid0_conf = mddev->private; |
3889 | if (raid0_conf->nr_strip_zones > 1) { | 3973 | if (raid0_conf->nr_strip_zones > 1) { |
3890 | printk(KERN_ERR "md/raid10:%s: cannot takeover raid 0" | 3974 | pr_warn("md/raid10:%s: cannot takeover raid 0 with more than one zone.\n", |
3891 | " with more than one zone.\n", | 3975 | mdname(mddev)); |
3892 | mdname(mddev)); | ||
3893 | return ERR_PTR(-EINVAL); | 3976 | return ERR_PTR(-EINVAL); |
3894 | } | 3977 | } |
3895 | return raid10_takeover_raid0(mddev, | 3978 | return raid10_takeover_raid0(mddev, |
@@ -4078,8 +4161,8 @@ static int raid10_start_reshape(struct mddev *mddev) | |||
4078 | sector_t size = raid10_size(mddev, 0, 0); | 4161 | sector_t size = raid10_size(mddev, 0, 0); |
4079 | if (size < mddev->array_sectors) { | 4162 | if (size < mddev->array_sectors) { |
4080 | spin_unlock_irq(&conf->device_lock); | 4163 | spin_unlock_irq(&conf->device_lock); |
4081 | printk(KERN_ERR "md/raid10:%s: array size must be reduce before number of disks\n", | 4164 | pr_warn("md/raid10:%s: array size must be reduce before number of disks\n", |
4082 | mdname(mddev)); | 4165 | mdname(mddev)); |
4083 | return -EINVAL; | 4166 | return -EINVAL; |
4084 | } | 4167 | } |
4085 | mddev->resync_max_sectors = size; | 4168 | mddev->resync_max_sectors = size; |
@@ -4126,7 +4209,7 @@ static int raid10_start_reshape(struct mddev *mddev) | |||
4126 | spin_unlock_irq(&conf->device_lock); | 4209 | spin_unlock_irq(&conf->device_lock); |
4127 | mddev->raid_disks = conf->geo.raid_disks; | 4210 | mddev->raid_disks = conf->geo.raid_disks; |
4128 | mddev->reshape_position = conf->reshape_progress; | 4211 | mddev->reshape_position = conf->reshape_progress; |
4129 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | 4212 | set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); |
4130 | 4213 | ||
4131 | clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); | 4214 | clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); |
4132 | clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); | 4215 | clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); |
@@ -4321,9 +4404,9 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, | |||
4321 | else | 4404 | else |
4322 | mddev->curr_resync_completed = conf->reshape_progress; | 4405 | mddev->curr_resync_completed = conf->reshape_progress; |
4323 | conf->reshape_checkpoint = jiffies; | 4406 | conf->reshape_checkpoint = jiffies; |
4324 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | 4407 | set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); |
4325 | md_wakeup_thread(mddev->thread); | 4408 | md_wakeup_thread(mddev->thread); |
4326 | wait_event(mddev->sb_wait, mddev->flags == 0 || | 4409 | wait_event(mddev->sb_wait, mddev->sb_flags == 0 || |
4327 | test_bit(MD_RECOVERY_INTR, &mddev->recovery)); | 4410 | test_bit(MD_RECOVERY_INTR, &mddev->recovery)); |
4328 | if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { | 4411 | if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { |
4329 | allow_barrier(conf); | 4412 | allow_barrier(conf); |
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h index 18ec1f7a98bf..3162615e57bd 100644 --- a/drivers/md/raid10.h +++ b/drivers/md/raid10.h | |||
@@ -156,5 +156,7 @@ enum r10bio_state { | |||
156 | * flag is set | 156 | * flag is set |
157 | */ | 157 | */ |
158 | R10BIO_Previous, | 158 | R10BIO_Previous, |
159 | /* failfast devices did receive failfast requests. */ | ||
160 | R10BIO_FailFast, | ||
159 | }; | 161 | }; |
160 | #endif | 162 | #endif |
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 8491edcfb5a6..d7bfb6fc8aef 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c | |||
@@ -1,5 +1,6 @@ | |||
1 | /* | 1 | /* |
2 | * Copyright (C) 2015 Shaohua Li <shli@fb.com> | 2 | * Copyright (C) 2015 Shaohua Li <shli@fb.com> |
3 | * Copyright (C) 2016 Song Liu <songliubraving@fb.com> | ||
3 | * | 4 | * |
4 | * This program is free software; you can redistribute it and/or modify it | 5 | * This program is free software; you can redistribute it and/or modify it |
5 | * under the terms and conditions of the GNU General Public License, | 6 | * under the terms and conditions of the GNU General Public License, |
@@ -18,8 +19,10 @@ | |||
18 | #include <linux/raid/md_p.h> | 19 | #include <linux/raid/md_p.h> |
19 | #include <linux/crc32c.h> | 20 | #include <linux/crc32c.h> |
20 | #include <linux/random.h> | 21 | #include <linux/random.h> |
22 | #include <linux/kthread.h> | ||
21 | #include "md.h" | 23 | #include "md.h" |
22 | #include "raid5.h" | 24 | #include "raid5.h" |
25 | #include "bitmap.h" | ||
23 | 26 | ||
24 | /* | 27 | /* |
25 | * metadata/data stored in disk with 4k size unit (a block) regardless | 28 | * metadata/data stored in disk with 4k size unit (a block) regardless |
@@ -28,18 +31,70 @@ | |||
28 | #define BLOCK_SECTORS (8) | 31 | #define BLOCK_SECTORS (8) |
29 | 32 | ||
30 | /* | 33 | /* |
31 | * reclaim runs every 1/4 disk size or 10G reclaimable space. This can prevent | 34 | * log->max_free_space is min(1/4 disk size, 10G reclaimable space). |
32 | * recovery scans a very long log | 35 | * |
36 | * In write through mode, the reclaim runs every log->max_free_space. | ||
37 | * This prevents recovery from having to scan a very long log | ||
33 | */ | 38 | */ |
34 | #define RECLAIM_MAX_FREE_SPACE (10 * 1024 * 1024 * 2) /* sector */ | 39 | #define RECLAIM_MAX_FREE_SPACE (10 * 1024 * 1024 * 2) /* sector */ |
35 | #define RECLAIM_MAX_FREE_SPACE_SHIFT (2) | 40 | #define RECLAIM_MAX_FREE_SPACE_SHIFT (2) |
36 | 41 | ||
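Per the updated comment, log->max_free_space works out to min(device size >> RECLAIM_MAX_FREE_SPACE_SHIFT, RECLAIM_MAX_FREE_SPACE). A small standalone sketch of that clamp, assuming sector_t as a 64-bit count of 512-byte sectors; the main() example is only illustrative:

#include <stdint.h>
#include <stdio.h>

typedef uint64_t sector_t;

#define RECLAIM_MAX_FREE_SPACE		(10 * 1024 * 1024 * 2)	/* 10GiB, in 512B sectors */
#define RECLAIM_MAX_FREE_SPACE_SHIFT	(2)			/* 1/4 of the log device */

/* assumed behaviour: the reclaim target is the smaller of the two limits */
static sector_t max_free_space(sector_t log_device_sectors)
{
	sector_t quarter = log_device_sectors >> RECLAIM_MAX_FREE_SPACE_SHIFT;

	return quarter < RECLAIM_MAX_FREE_SPACE ? quarter : RECLAIM_MAX_FREE_SPACE;
}

int main(void)
{
	/* a 100GiB journal device: 1/4 = 25GiB, so the 10GiB cap wins */
	printf("%llu sectors\n",
	       (unsigned long long)max_free_space(100ULL * 1024 * 1024 * 2));
	return 0;
}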
42 | /* wake up reclaim thread periodically */ | ||
43 | #define R5C_RECLAIM_WAKEUP_INTERVAL (30 * HZ) | ||
44 | /* start flush with these full stripes */ | ||
45 | #define R5C_FULL_STRIPE_FLUSH_BATCH 256 | ||
46 | /* reclaim stripes in groups */ | ||
47 | #define R5C_RECLAIM_STRIPE_GROUP (NR_STRIPE_HASH_LOCKS * 2) | ||
48 | |||
37 | /* | 49 | /* |
38 | * We only need 2 bios per I/O unit to make progress, but ensure we | 50 | * We only need 2 bios per I/O unit to make progress, but ensure we |
39 | * have a few more available to not get too tight. | 51 | * have a few more available to not get too tight. |
40 | */ | 52 | */ |
41 | #define R5L_POOL_SIZE 4 | 53 | #define R5L_POOL_SIZE 4 |
42 | 54 | ||
55 | /* | ||
56 | * r5c journal modes of the array: write-back or write-through. | ||
57 | * write-through mode has identical behavior as existing log only | ||
58 | * implementation. | ||
59 | */ | ||
60 | enum r5c_journal_mode { | ||
61 | R5C_JOURNAL_MODE_WRITE_THROUGH = 0, | ||
62 | R5C_JOURNAL_MODE_WRITE_BACK = 1, | ||
63 | }; | ||
64 | |||
65 | static char *r5c_journal_mode_str[] = {"write-through", | ||
66 | "write-back"}; | ||
67 | /* | ||
68 | * raid5 cache state machine | ||
69 | * | ||
70 | * With the RAID cache, each stripe works in two phases: | ||
71 | * - caching phase | ||
72 | * - writing-out phase | ||
73 | * | ||
74 | * These two phases are controlled by bit STRIPE_R5C_CACHING: | ||
75 | * if STRIPE_R5C_CACHING == 0, the stripe is in writing-out phase | ||
76 | * if STRIPE_R5C_CACHING == 1, the stripe is in caching phase | ||
77 | * | ||
78 | * When there is no journal, or the journal is in write-through mode, | ||
79 | * the stripe is always in writing-out phase. | ||
80 | * | ||
81 | * For write-back journal, the stripe is sent to caching phase on write | ||
82 | * (r5c_try_caching_write). r5c_make_stripe_write_out() kicks off | ||
83 | * the write-out phase by clearing STRIPE_R5C_CACHING. | ||
84 | * | ||
85 | * Stripes in caching phase do not write the raid disks. Instead, all | ||
86 | * writes are committed from the log device. Therefore, a stripe in | ||
87 | * caching phase handles writes as: | ||
88 | * - write to log device | ||
89 | * - return IO | ||
90 | * | ||
91 | * Stripes in writing-out phase handle writes as: | ||
92 | * - calculate parity | ||
93 | * - write pending data and parity to journal | ||
94 | * - write data and parity to raid disks | ||
95 | * - return IO for pending writes | ||
96 | */ | ||
97 | |||
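The state machine described above reduces to a single per-stripe bit. A compact, hedged model of how a write is routed in each mode (a sketch, not the in-kernel handlers):

#include <stdbool.h>

enum journal_mode { WRITE_THROUGH, WRITE_BACK };

/* what a stripe does with an incoming write, keyed off STRIPE_R5C_CACHING */
enum write_path { LOG_ONLY_THEN_RETURN, PARITY_LOG_RAID_THEN_RETURN };

static enum write_path route_write(enum journal_mode mode, bool stripe_r5c_caching)
{
	/* no journal / write-through: always writing-out phase */
	if (mode == WRITE_THROUGH || !stripe_r5c_caching)
		return PARITY_LOG_RAID_THEN_RETURN;
	/* write-back + caching phase: commit to the log device and complete the bio */
	return LOG_ONLY_THEN_RETURN;
}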
43 | struct r5l_log { | 98 | struct r5l_log { |
44 | struct md_rdev *rdev; | 99 | struct md_rdev *rdev; |
45 | 100 | ||
@@ -58,7 +113,6 @@ struct r5l_log { | |||
58 | u64 seq; /* log head sequence */ | 113 | u64 seq; /* log head sequence */ |
59 | 114 | ||
60 | sector_t next_checkpoint; | 115 | sector_t next_checkpoint; |
61 | u64 next_cp_seq; | ||
62 | 116 | ||
63 | struct mutex io_mutex; | 117 | struct mutex io_mutex; |
64 | struct r5l_io_unit *current_io; /* current io_unit accepting new data */ | 118 | struct r5l_io_unit *current_io; /* current io_unit accepting new data */ |
@@ -96,6 +150,18 @@ struct r5l_log { | |||
96 | spinlock_t no_space_stripes_lock; | 150 | spinlock_t no_space_stripes_lock; |
97 | 151 | ||
98 | bool need_cache_flush; | 152 | bool need_cache_flush; |
153 | |||
154 | /* for r5c_cache */ | ||
155 | enum r5c_journal_mode r5c_journal_mode; | ||
156 | |||
157 | /* all stripes in r5cache, in the order of seq at sh->log_start */ | ||
158 | struct list_head stripe_in_journal_list; | ||
159 | |||
160 | spinlock_t stripe_in_journal_lock; | ||
161 | atomic_t stripe_in_journal_count; | ||
162 | |||
163 | /* to submit async io_units, to fulfill ordering of flush */ | ||
164 | struct work_struct deferred_io_work; | ||
99 | }; | 165 | }; |
100 | 166 | ||
101 | /* | 167 | /* |
@@ -122,6 +188,18 @@ struct r5l_io_unit { | |||
122 | 188 | ||
123 | int state; | 189 | int state; |
124 | bool need_split_bio; | 190 | bool need_split_bio; |
191 | struct bio *split_bio; | ||
192 | |||
193 | unsigned int has_flush:1; /* include flush request */ | ||
194 | unsigned int has_fua:1; /* include fua request */ | ||
195 | unsigned int has_null_flush:1; /* include empty flush request */ | ||
196 | /* | ||
197 | * io isn't sent yet; a flush/fua request can only be submitted once it is | ||
198 | * the first IO in the running_ios list | ||
199 | */ | ||
200 | unsigned int io_deferred:1; | ||
201 | |||
202 | struct bio_list flush_barriers; /* size == 0 flush bios */ | ||
125 | }; | 203 | }; |
126 | 204 | ||
127 | /* r5l_io_unit state */ | 205 | /* r5l_io_unit state */ |
@@ -133,6 +211,12 @@ enum r5l_io_unit_state { | |||
133 | IO_UNIT_STRIPE_END = 3, /* stripes data finished writing to raid */ | 211 | IO_UNIT_STRIPE_END = 3, /* stripes data finished writing to raid */ |
134 | }; | 212 | }; |
135 | 213 | ||
214 | bool r5c_is_writeback(struct r5l_log *log) | ||
215 | { | ||
216 | return (log != NULL && | ||
217 | log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK); | ||
218 | } | ||
219 | |||
136 | static sector_t r5l_ring_add(struct r5l_log *log, sector_t start, sector_t inc) | 220 | static sector_t r5l_ring_add(struct r5l_log *log, sector_t start, sector_t inc) |
137 | { | 221 | { |
138 | start += inc; | 222 | start += inc; |
@@ -168,12 +252,235 @@ static void __r5l_set_io_unit_state(struct r5l_io_unit *io, | |||
168 | io->state = state; | 252 | io->state = state; |
169 | } | 253 | } |
170 | 254 | ||
255 | static void | ||
256 | r5c_return_dev_pending_writes(struct r5conf *conf, struct r5dev *dev, | ||
257 | struct bio_list *return_bi) | ||
258 | { | ||
259 | struct bio *wbi, *wbi2; | ||
260 | |||
261 | wbi = dev->written; | ||
262 | dev->written = NULL; | ||
263 | while (wbi && wbi->bi_iter.bi_sector < | ||
264 | dev->sector + STRIPE_SECTORS) { | ||
265 | wbi2 = r5_next_bio(wbi, dev->sector); | ||
266 | if (!raid5_dec_bi_active_stripes(wbi)) { | ||
267 | md_write_end(conf->mddev); | ||
268 | bio_list_add(return_bi, wbi); | ||
269 | } | ||
270 | wbi = wbi2; | ||
271 | } | ||
272 | } | ||
273 | |||
274 | void r5c_handle_cached_data_endio(struct r5conf *conf, | ||
275 | struct stripe_head *sh, int disks, struct bio_list *return_bi) | ||
276 | { | ||
277 | int i; | ||
278 | |||
279 | for (i = sh->disks; i--; ) { | ||
280 | if (sh->dev[i].written) { | ||
281 | set_bit(R5_UPTODATE, &sh->dev[i].flags); | ||
282 | r5c_return_dev_pending_writes(conf, &sh->dev[i], | ||
283 | return_bi); | ||
284 | bitmap_endwrite(conf->mddev->bitmap, sh->sector, | ||
285 | STRIPE_SECTORS, | ||
286 | !test_bit(STRIPE_DEGRADED, &sh->state), | ||
287 | 0); | ||
288 | } | ||
289 | } | ||
290 | } | ||
291 | |||
292 | /* Check whether we should flush some stripes to free up stripe cache */ | ||
293 | void r5c_check_stripe_cache_usage(struct r5conf *conf) | ||
294 | { | ||
295 | int total_cached; | ||
296 | |||
297 | if (!r5c_is_writeback(conf->log)) | ||
298 | return; | ||
299 | |||
300 | total_cached = atomic_read(&conf->r5c_cached_partial_stripes) + | ||
301 | atomic_read(&conf->r5c_cached_full_stripes); | ||
302 | |||
303 | /* | ||
304 | * The following condition is true for either of the following: | ||
305 | * - stripe cache pressure high: | ||
306 | * total_cached > 3/4 min_nr_stripes || | ||
307 | * empty_inactive_list_nr > 0 | ||
308 | * - stripe cache pressure moderate: | ||
309 | * total_cached > 1/2 min_nr_stripes | ||
310 | */ | ||
311 | if (total_cached > conf->min_nr_stripes * 1 / 2 || | ||
312 | atomic_read(&conf->empty_inactive_list_nr) > 0) | ||
313 | r5l_wake_reclaim(conf->log, 0); | ||
314 | } | ||
315 | |||
316 | /* | ||
317 | * flush cache when there are R5C_FULL_STRIPE_FLUSH_BATCH or more full | ||
318 | * stripes in the cache | ||
319 | */ | ||
320 | void r5c_check_cached_full_stripe(struct r5conf *conf) | ||
321 | { | ||
322 | if (!r5c_is_writeback(conf->log)) | ||
323 | return; | ||
324 | |||
325 | /* | ||
326 | * wake up reclaim for R5C_FULL_STRIPE_FLUSH_BATCH cached stripes | ||
327 | * or a full stripe (chunk size / 4k stripes). | ||
328 | */ | ||
329 | if (atomic_read(&conf->r5c_cached_full_stripes) >= | ||
330 | min(R5C_FULL_STRIPE_FLUSH_BATCH, | ||
331 | conf->chunk_sectors >> STRIPE_SHIFT)) | ||
332 | r5l_wake_reclaim(conf->log, 0); | ||
333 | } | ||
334 | |||
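Both helpers above boil down to simple thresholds: wake reclaim once cached stripes exceed half of min_nr_stripes (or an inactive list runs empty), and once the full-stripe count reaches min(R5C_FULL_STRIPE_FLUSH_BATCH, chunk_sectors >> STRIPE_SHIFT). A hedged sketch of the two triggers, assuming STRIPE_SHIFT == 3 (4K stripe units over 512-byte sectors):

#include <stdbool.h>

#define R5C_FULL_STRIPE_FLUSH_BATCH	256
#define STRIPE_SHIFT			3	/* assumed: PAGE_SHIFT - 9 with 4K pages */

/* trigger 1: overall stripe-cache pressure (r5c_check_stripe_cache_usage) */
static bool cache_pressure_wakes_reclaim(int cached_partial, int cached_full,
					 int min_nr_stripes, int empty_inactive_lists)
{
	int total_cached = cached_partial + cached_full;

	return total_cached > min_nr_stripes / 2 || empty_inactive_lists > 0;
}

/* trigger 2: enough full stripes batched up (r5c_check_cached_full_stripe) */
static bool full_stripes_wake_reclaim(int cached_full, int chunk_sectors)
{
	int batch = chunk_sectors >> STRIPE_SHIFT;

	if (batch > R5C_FULL_STRIPE_FLUSH_BATCH)
		batch = R5C_FULL_STRIPE_FLUSH_BATCH;
	return cached_full >= batch;
}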
335 | /* | ||
336 | * Total log space (in sectors) needed to flush all data in cache | ||
337 | * | ||
338 | * Currently, writing-out phase automatically includes all pending writes | ||
339 | * to the same sector. So the reclaim of each stripe takes up to | ||
340 | * (conf->raid_disks + 1) pages of log space. | ||
341 | * | ||
342 | * To totally avoid deadlock due to log space, the code reserves | ||
343 | * (conf->raid_disks + 1) pages for each stripe in cache, which is not | ||
344 | * necessary in most cases. | ||
345 | * | ||
346 | * To improve this, we will need writing-out phase to be able to NOT include | ||
347 | * pending writes, which will reduce the requirement to | ||
348 | * (conf->max_degraded + 1) pages per stripe in cache. | ||
349 | */ | ||
350 | static sector_t r5c_log_required_to_flush_cache(struct r5conf *conf) | ||
351 | { | ||
352 | struct r5l_log *log = conf->log; | ||
353 | |||
354 | if (!r5c_is_writeback(log)) | ||
355 | return 0; | ||
356 | |||
357 | return BLOCK_SECTORS * (conf->raid_disks + 1) * | ||
358 | atomic_read(&log->stripe_in_journal_count); | ||
359 | } | ||
360 | |||
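Worked through, the reservation above is BLOCK_SECTORS * (raid_disks + 1) sectors for every stripe still sitting in the journal; with 8 disks and 1000 cached stripes that is 8 * 9 * 1000 = 72,000 sectors, roughly 35 MiB. A minimal sketch of the same arithmetic:

#include <stdint.h>

typedef uint64_t sector_t;

#define BLOCK_SECTORS 8		/* one 4K log block, in 512B sectors */

/*
 * Conservative space needed to flush everything that exists only in the
 * cache: each cached stripe may have to log up to (raid_disks + 1) pages.
 */
static sector_t log_required_to_flush_cache(int raid_disks, int stripes_in_journal)
{
	return (sector_t)BLOCK_SECTORS * (raid_disks + 1) * stripes_in_journal;
}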
361 | /* | ||
362 | * evaluate log space usage and update R5C_LOG_TIGHT and R5C_LOG_CRITICAL | ||
363 | * | ||
364 | * R5C_LOG_TIGHT is set when free space on the log device is less than 3x of | ||
365 | * reclaim_required_space. R5C_LOG_CRITICAL is set when free space on the log | ||
366 | * device is less than 2x of reclaim_required_space. | ||
367 | */ | ||
368 | static inline void r5c_update_log_state(struct r5l_log *log) | ||
369 | { | ||
370 | struct r5conf *conf = log->rdev->mddev->private; | ||
371 | sector_t free_space; | ||
372 | sector_t reclaim_space; | ||
373 | bool wake_reclaim = false; | ||
374 | |||
375 | if (!r5c_is_writeback(log)) | ||
376 | return; | ||
377 | |||
378 | free_space = r5l_ring_distance(log, log->log_start, | ||
379 | log->last_checkpoint); | ||
380 | reclaim_space = r5c_log_required_to_flush_cache(conf); | ||
381 | if (free_space < 2 * reclaim_space) | ||
382 | set_bit(R5C_LOG_CRITICAL, &conf->cache_state); | ||
383 | else { | ||
384 | if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state)) | ||
385 | wake_reclaim = true; | ||
386 | clear_bit(R5C_LOG_CRITICAL, &conf->cache_state); | ||
387 | } | ||
388 | if (free_space < 3 * reclaim_space) | ||
389 | set_bit(R5C_LOG_TIGHT, &conf->cache_state); | ||
390 | else | ||
391 | clear_bit(R5C_LOG_TIGHT, &conf->cache_state); | ||
392 | |||
393 | if (wake_reclaim) | ||
394 | r5l_wake_reclaim(log, 0); | ||
395 | } | ||
396 | |||
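The two flags above are plain ratio checks against the space needed to flush the cache: R5C_LOG_CRITICAL below 2x of it, R5C_LOG_TIGHT below 3x. A hedged userspace model, with the ring distance simplified to wrap-around arithmetic:

#include <stdint.h>
#include <stdbool.h>

typedef uint64_t sector_t;

struct log_state_model {
	sector_t device_size;		/* usable log size, sectors */
	sector_t log_start;		/* where new log writes go */
	sector_t last_checkpoint;	/* oldest data still needed */
	bool tight, critical;
};

/* simplified ring distance: sectors from the write head forward to the checkpoint */
static sector_t ring_free(const struct log_state_model *l)
{
	return (l->last_checkpoint + l->device_size - l->log_start) % l->device_size;
}

static void update_log_state(struct log_state_model *l, sector_t reclaim_space)
{
	sector_t free_space = ring_free(l);

	l->critical = free_space < 2 * reclaim_space;	/* R5C_LOG_CRITICAL */
	l->tight    = free_space < 3 * reclaim_space;	/* R5C_LOG_TIGHT */
}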
397 | /* | ||
398 | * Put the stripe into writing-out phase by clearing STRIPE_R5C_CACHING. | ||
399 | * This function should only be called in write-back mode. | ||
400 | */ | ||
401 | void r5c_make_stripe_write_out(struct stripe_head *sh) | ||
402 | { | ||
403 | struct r5conf *conf = sh->raid_conf; | ||
404 | struct r5l_log *log = conf->log; | ||
405 | |||
406 | BUG_ON(!r5c_is_writeback(log)); | ||
407 | |||
408 | WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state)); | ||
409 | clear_bit(STRIPE_R5C_CACHING, &sh->state); | ||
410 | |||
411 | if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) | ||
412 | atomic_inc(&conf->preread_active_stripes); | ||
413 | |||
414 | if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) { | ||
415 | BUG_ON(atomic_read(&conf->r5c_cached_partial_stripes) == 0); | ||
416 | atomic_dec(&conf->r5c_cached_partial_stripes); | ||
417 | } | ||
418 | |||
419 | if (test_and_clear_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) { | ||
420 | BUG_ON(atomic_read(&conf->r5c_cached_full_stripes) == 0); | ||
421 | atomic_dec(&conf->r5c_cached_full_stripes); | ||
422 | } | ||
423 | } | ||
424 | |||
425 | static void r5c_handle_data_cached(struct stripe_head *sh) | ||
426 | { | ||
427 | int i; | ||
428 | |||
429 | for (i = sh->disks; i--; ) | ||
430 | if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) { | ||
431 | set_bit(R5_InJournal, &sh->dev[i].flags); | ||
432 | clear_bit(R5_LOCKED, &sh->dev[i].flags); | ||
433 | } | ||
434 | clear_bit(STRIPE_LOG_TRAPPED, &sh->state); | ||
435 | } | ||
436 | |||
437 | /* | ||
438 | * this journal write must contain full parity, | ||
439 | * it may also contain some data pages | ||
440 | */ | ||
441 | static void r5c_handle_parity_cached(struct stripe_head *sh) | ||
442 | { | ||
443 | int i; | ||
444 | |||
445 | for (i = sh->disks; i--; ) | ||
446 | if (test_bit(R5_InJournal, &sh->dev[i].flags)) | ||
447 | set_bit(R5_Wantwrite, &sh->dev[i].flags); | ||
448 | } | ||
449 | |||
450 | /* | ||
451 | * Setting proper flags after writing (or flushing) data and/or parity to the | ||
452 | * log device. This is called from r5l_log_endio() or r5l_log_flush_endio(). | ||
453 | */ | ||
454 | static void r5c_finish_cache_stripe(struct stripe_head *sh) | ||
455 | { | ||
456 | struct r5l_log *log = sh->raid_conf->log; | ||
457 | |||
458 | if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) { | ||
459 | BUG_ON(test_bit(STRIPE_R5C_CACHING, &sh->state)); | ||
460 | /* | ||
461 | * Set R5_InJournal for parity dev[pd_idx]. This means | ||
462 | * all data AND parity in the journal. For RAID 6, it is | ||
463 | * NOT necessary to set the flag for dev[qd_idx], as the | ||
464 | * two parities are written out together. | ||
465 | */ | ||
466 | set_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags); | ||
467 | } else if (test_bit(STRIPE_R5C_CACHING, &sh->state)) { | ||
468 | r5c_handle_data_cached(sh); | ||
469 | } else { | ||
470 | r5c_handle_parity_cached(sh); | ||
471 | set_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags); | ||
472 | } | ||
473 | } | ||
474 | |||
171 | static void r5l_io_run_stripes(struct r5l_io_unit *io) | 475 | static void r5l_io_run_stripes(struct r5l_io_unit *io) |
172 | { | 476 | { |
173 | struct stripe_head *sh, *next; | 477 | struct stripe_head *sh, *next; |
174 | 478 | ||
175 | list_for_each_entry_safe(sh, next, &io->stripe_list, log_list) { | 479 | list_for_each_entry_safe(sh, next, &io->stripe_list, log_list) { |
176 | list_del_init(&sh->log_list); | 480 | list_del_init(&sh->log_list); |
481 | |||
482 | r5c_finish_cache_stripe(sh); | ||
483 | |||
177 | set_bit(STRIPE_HANDLE, &sh->state); | 484 | set_bit(STRIPE_HANDLE, &sh->state); |
178 | raid5_release_stripe(sh); | 485 | raid5_release_stripe(sh); |
179 | } | 486 | } |
@@ -209,9 +516,11 @@ static void r5l_move_to_end_ios(struct r5l_log *log) | |||
209 | } | 516 | } |
210 | } | 517 | } |
211 | 518 | ||
519 | static void __r5l_stripe_write_finished(struct r5l_io_unit *io); | ||
212 | static void r5l_log_endio(struct bio *bio) | 520 | static void r5l_log_endio(struct bio *bio) |
213 | { | 521 | { |
214 | struct r5l_io_unit *io = bio->bi_private; | 522 | struct r5l_io_unit *io = bio->bi_private; |
523 | struct r5l_io_unit *io_deferred; | ||
215 | struct r5l_log *log = io->log; | 524 | struct r5l_log *log = io->log; |
216 | unsigned long flags; | 525 | unsigned long flags; |
217 | 526 | ||
@@ -227,18 +536,89 @@ static void r5l_log_endio(struct bio *bio) | |||
227 | r5l_move_to_end_ios(log); | 536 | r5l_move_to_end_ios(log); |
228 | else | 537 | else |
229 | r5l_log_run_stripes(log); | 538 | r5l_log_run_stripes(log); |
539 | if (!list_empty(&log->running_ios)) { | ||
540 | /* | ||
541 | * FLUSH/FUA io_unit is deferred because of ordering, now we | ||
542 | * can dispatch it | ||
543 | */ | ||
544 | io_deferred = list_first_entry(&log->running_ios, | ||
545 | struct r5l_io_unit, log_sibling); | ||
546 | if (io_deferred->io_deferred) | ||
547 | schedule_work(&log->deferred_io_work); | ||
548 | } | ||
549 | |||
230 | spin_unlock_irqrestore(&log->io_list_lock, flags); | 550 | spin_unlock_irqrestore(&log->io_list_lock, flags); |
231 | 551 | ||
232 | if (log->need_cache_flush) | 552 | if (log->need_cache_flush) |
233 | md_wakeup_thread(log->rdev->mddev->thread); | 553 | md_wakeup_thread(log->rdev->mddev->thread); |
554 | |||
555 | if (io->has_null_flush) { | ||
556 | struct bio *bi; | ||
557 | |||
558 | WARN_ON(bio_list_empty(&io->flush_barriers)); | ||
559 | while ((bi = bio_list_pop(&io->flush_barriers)) != NULL) { | ||
560 | bio_endio(bi); | ||
561 | atomic_dec(&io->pending_stripe); | ||
562 | } | ||
563 | if (atomic_read(&io->pending_stripe) == 0) | ||
564 | __r5l_stripe_write_finished(io); | ||
565 | } | ||
566 | } | ||
567 | |||
568 | static void r5l_do_submit_io(struct r5l_log *log, struct r5l_io_unit *io) | ||
569 | { | ||
570 | unsigned long flags; | ||
571 | |||
572 | spin_lock_irqsave(&log->io_list_lock, flags); | ||
573 | __r5l_set_io_unit_state(io, IO_UNIT_IO_START); | ||
574 | spin_unlock_irqrestore(&log->io_list_lock, flags); | ||
575 | |||
576 | if (io->has_flush) | ||
577 | io->current_bio->bi_opf |= REQ_PREFLUSH; | ||
578 | if (io->has_fua) | ||
579 | io->current_bio->bi_opf |= REQ_FUA; | ||
580 | submit_bio(io->current_bio); | ||
581 | |||
582 | if (!io->split_bio) | ||
583 | return; | ||
584 | |||
585 | if (io->has_flush) | ||
586 | io->split_bio->bi_opf |= REQ_PREFLUSH; | ||
587 | if (io->has_fua) | ||
588 | io->split_bio->bi_opf |= REQ_FUA; | ||
589 | submit_bio(io->split_bio); | ||
590 | } | ||
591 | |||
592 | /* deferred io_unit will be dispatched here */ | ||
593 | static void r5l_submit_io_async(struct work_struct *work) | ||
594 | { | ||
595 | struct r5l_log *log = container_of(work, struct r5l_log, | ||
596 | deferred_io_work); | ||
597 | struct r5l_io_unit *io = NULL; | ||
598 | unsigned long flags; | ||
599 | |||
600 | spin_lock_irqsave(&log->io_list_lock, flags); | ||
601 | if (!list_empty(&log->running_ios)) { | ||
602 | io = list_first_entry(&log->running_ios, struct r5l_io_unit, | ||
603 | log_sibling); | ||
604 | if (!io->io_deferred) | ||
605 | io = NULL; | ||
606 | else | ||
607 | io->io_deferred = 0; | ||
608 | } | ||
609 | spin_unlock_irqrestore(&log->io_list_lock, flags); | ||
610 | if (io) | ||
611 | r5l_do_submit_io(log, io); | ||
234 | } | 612 | } |
235 | 613 | ||
236 | static void r5l_submit_current_io(struct r5l_log *log) | 614 | static void r5l_submit_current_io(struct r5l_log *log) |
237 | { | 615 | { |
238 | struct r5l_io_unit *io = log->current_io; | 616 | struct r5l_io_unit *io = log->current_io; |
617 | struct bio *bio; | ||
239 | struct r5l_meta_block *block; | 618 | struct r5l_meta_block *block; |
240 | unsigned long flags; | 619 | unsigned long flags; |
241 | u32 crc; | 620 | u32 crc; |
621 | bool do_submit = true; | ||
242 | 622 | ||
243 | if (!io) | 623 | if (!io) |
244 | return; | 624 | return; |
@@ -247,13 +627,20 @@ static void r5l_submit_current_io(struct r5l_log *log) | |||
247 | block->meta_size = cpu_to_le32(io->meta_offset); | 627 | block->meta_size = cpu_to_le32(io->meta_offset); |
248 | crc = crc32c_le(log->uuid_checksum, block, PAGE_SIZE); | 628 | crc = crc32c_le(log->uuid_checksum, block, PAGE_SIZE); |
249 | block->checksum = cpu_to_le32(crc); | 629 | block->checksum = cpu_to_le32(crc); |
630 | bio = io->current_bio; | ||
250 | 631 | ||
251 | log->current_io = NULL; | 632 | log->current_io = NULL; |
252 | spin_lock_irqsave(&log->io_list_lock, flags); | 633 | spin_lock_irqsave(&log->io_list_lock, flags); |
253 | __r5l_set_io_unit_state(io, IO_UNIT_IO_START); | 634 | if (io->has_flush || io->has_fua) { |
635 | if (io != list_first_entry(&log->running_ios, | ||
636 | struct r5l_io_unit, log_sibling)) { | ||
637 | io->io_deferred = 1; | ||
638 | do_submit = false; | ||
639 | } | ||
640 | } | ||
254 | spin_unlock_irqrestore(&log->io_list_lock, flags); | 641 | spin_unlock_irqrestore(&log->io_list_lock, flags); |
255 | 642 | if (do_submit) | |
256 | submit_bio(io->current_bio); | 643 | r5l_do_submit_io(log, io); |
257 | } | 644 | } |
258 | 645 | ||
259 | static struct bio *r5l_bio_alloc(struct r5l_log *log) | 646 | static struct bio *r5l_bio_alloc(struct r5l_log *log) |
@@ -271,6 +658,7 @@ static void r5_reserve_log_entry(struct r5l_log *log, struct r5l_io_unit *io) | |||
271 | { | 658 | { |
272 | log->log_start = r5l_ring_add(log, log->log_start, BLOCK_SECTORS); | 659 | log->log_start = r5l_ring_add(log, log->log_start, BLOCK_SECTORS); |
273 | 660 | ||
661 | r5c_update_log_state(log); | ||
274 | /* | 662 | /* |
275 | * If we filled up the log device start from the beginning again, | 663 | * If we filled up the log device start from the beginning again, |
276 | * which will require a new bio. | 664 | * which will require a new bio. |
@@ -297,6 +685,7 @@ static struct r5l_io_unit *r5l_new_meta(struct r5l_log *log) | |||
297 | io->log = log; | 685 | io->log = log; |
298 | INIT_LIST_HEAD(&io->log_sibling); | 686 | INIT_LIST_HEAD(&io->log_sibling); |
299 | INIT_LIST_HEAD(&io->stripe_list); | 687 | INIT_LIST_HEAD(&io->stripe_list); |
688 | bio_list_init(&io->flush_barriers); | ||
300 | io->state = IO_UNIT_RUNNING; | 689 | io->state = IO_UNIT_RUNNING; |
301 | 690 | ||
302 | io->meta_page = mempool_alloc(log->meta_pool, GFP_NOIO); | 691 | io->meta_page = mempool_alloc(log->meta_pool, GFP_NOIO); |
@@ -367,12 +756,11 @@ static void r5l_append_payload_page(struct r5l_log *log, struct page *page) | |||
367 | struct r5l_io_unit *io = log->current_io; | 756 | struct r5l_io_unit *io = log->current_io; |
368 | 757 | ||
369 | if (io->need_split_bio) { | 758 | if (io->need_split_bio) { |
370 | struct bio *prev = io->current_bio; | 759 | BUG_ON(io->split_bio); |
371 | 760 | io->split_bio = io->current_bio; | |
372 | io->current_bio = r5l_bio_alloc(log); | 761 | io->current_bio = r5l_bio_alloc(log); |
373 | bio_chain(io->current_bio, prev); | 762 | bio_chain(io->current_bio, io->split_bio); |
374 | 763 | io->need_split_bio = false; | |
375 | submit_bio(prev); | ||
376 | } | 764 | } |
377 | 765 | ||
378 | if (!bio_add_page(io->current_bio, page, PAGE_SIZE, 0)) | 766 | if (!bio_add_page(io->current_bio, page, PAGE_SIZE, 0)) |
@@ -401,50 +789,85 @@ static int r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh, | |||
401 | 789 | ||
402 | io = log->current_io; | 790 | io = log->current_io; |
403 | 791 | ||
792 | if (test_and_clear_bit(STRIPE_R5C_PREFLUSH, &sh->state)) | ||
793 | io->has_flush = 1; | ||
794 | |||
404 | for (i = 0; i < sh->disks; i++) { | 795 | for (i = 0; i < sh->disks; i++) { |
405 | if (!test_bit(R5_Wantwrite, &sh->dev[i].flags)) | 796 | if (!test_bit(R5_Wantwrite, &sh->dev[i].flags) || |
797 | test_bit(R5_InJournal, &sh->dev[i].flags)) | ||
406 | continue; | 798 | continue; |
407 | if (i == sh->pd_idx || i == sh->qd_idx) | 799 | if (i == sh->pd_idx || i == sh->qd_idx) |
408 | continue; | 800 | continue; |
801 | if (test_bit(R5_WantFUA, &sh->dev[i].flags) && | ||
802 | log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK) { | ||
803 | io->has_fua = 1; | ||
804 | /* | ||
805 | * we need to flush journal to make sure recovery can | ||
806 | * reach the data with fua flag | ||
807 | */ | ||
808 | io->has_flush = 1; | ||
809 | } | ||
409 | r5l_append_payload_meta(log, R5LOG_PAYLOAD_DATA, | 810 | r5l_append_payload_meta(log, R5LOG_PAYLOAD_DATA, |
410 | raid5_compute_blocknr(sh, i, 0), | 811 | raid5_compute_blocknr(sh, i, 0), |
411 | sh->dev[i].log_checksum, 0, false); | 812 | sh->dev[i].log_checksum, 0, false); |
412 | r5l_append_payload_page(log, sh->dev[i].page); | 813 | r5l_append_payload_page(log, sh->dev[i].page); |
413 | } | 814 | } |
414 | 815 | ||
415 | if (sh->qd_idx >= 0) { | 816 | if (parity_pages == 2) { |
416 | r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY, | 817 | r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY, |
417 | sh->sector, sh->dev[sh->pd_idx].log_checksum, | 818 | sh->sector, sh->dev[sh->pd_idx].log_checksum, |
418 | sh->dev[sh->qd_idx].log_checksum, true); | 819 | sh->dev[sh->qd_idx].log_checksum, true); |
419 | r5l_append_payload_page(log, sh->dev[sh->pd_idx].page); | 820 | r5l_append_payload_page(log, sh->dev[sh->pd_idx].page); |
420 | r5l_append_payload_page(log, sh->dev[sh->qd_idx].page); | 821 | r5l_append_payload_page(log, sh->dev[sh->qd_idx].page); |
421 | } else { | 822 | } else if (parity_pages == 1) { |
422 | r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY, | 823 | r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY, |
423 | sh->sector, sh->dev[sh->pd_idx].log_checksum, | 824 | sh->sector, sh->dev[sh->pd_idx].log_checksum, |
424 | 0, false); | 825 | 0, false); |
425 | r5l_append_payload_page(log, sh->dev[sh->pd_idx].page); | 826 | r5l_append_payload_page(log, sh->dev[sh->pd_idx].page); |
426 | } | 827 | } else /* Just writing data, not parity, in caching phase */ |
828 | BUG_ON(parity_pages != 0); | ||
427 | 829 | ||
428 | list_add_tail(&sh->log_list, &io->stripe_list); | 830 | list_add_tail(&sh->log_list, &io->stripe_list); |
429 | atomic_inc(&io->pending_stripe); | 831 | atomic_inc(&io->pending_stripe); |
430 | sh->log_io = io; | 832 | sh->log_io = io; |
431 | 833 | ||
834 | if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) | ||
835 | return 0; | ||
836 | |||
837 | if (sh->log_start == MaxSector) { | ||
838 | BUG_ON(!list_empty(&sh->r5c)); | ||
839 | sh->log_start = io->log_start; | ||
840 | spin_lock_irq(&log->stripe_in_journal_lock); | ||
841 | list_add_tail(&sh->r5c, | ||
842 | &log->stripe_in_journal_list); | ||
843 | spin_unlock_irq(&log->stripe_in_journal_lock); | ||
844 | atomic_inc(&log->stripe_in_journal_count); | ||
845 | } | ||
432 | return 0; | 846 | return 0; |
433 | } | 847 | } |
434 | 848 | ||
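The added flag handling means a FUA write in write-back mode sets both has_fua and has_flush on the io_unit, so the journal itself is flushed before recovery may depend on that data. A sketch of how the per-stripe bits fold into the io_unit flags (simplified booleans in place of the real flag words):

#include <stdbool.h>

enum journal_mode { WRITE_THROUGH, WRITE_BACK };

struct io_unit_model { bool has_flush, has_fua; };

/* fold one stripe's request flags into the io_unit that will carry it */
static void fold_stripe_flags(struct io_unit_model *io, enum journal_mode mode,
			      bool stripe_had_preflush, bool dev_wants_fua)
{
	if (stripe_had_preflush)			/* STRIPE_R5C_PREFLUSH */
		io->has_flush = true;

	if (dev_wants_fua && mode == WRITE_BACK) {	/* R5_WantFUA */
		io->has_fua = true;
		/* recovery must be able to reach FUA data, so flush the log too */
		io->has_flush = true;
	}
}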
435 | static void r5l_wake_reclaim(struct r5l_log *log, sector_t space); | 849 | /* add stripe to no_space_stripes, and then wake up reclaim */ |
850 | static inline void r5l_add_no_space_stripe(struct r5l_log *log, | ||
851 | struct stripe_head *sh) | ||
852 | { | ||
853 | spin_lock(&log->no_space_stripes_lock); | ||
854 | list_add_tail(&sh->log_list, &log->no_space_stripes); | ||
855 | spin_unlock(&log->no_space_stripes_lock); | ||
856 | } | ||
857 | |||
436 | /* | 858 | /* |
437 | * running in raid5d, where reclaim could wait for raid5d too (when it flushes | 859 | * running in raid5d, where reclaim could wait for raid5d too (when it flushes |
438 | * data from log to raid disks), so we shouldn't wait for reclaim here | 860 | * data from log to raid disks), so we shouldn't wait for reclaim here |
439 | */ | 861 | */ |
440 | int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh) | 862 | int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh) |
441 | { | 863 | { |
864 | struct r5conf *conf = sh->raid_conf; | ||
442 | int write_disks = 0; | 865 | int write_disks = 0; |
443 | int data_pages, parity_pages; | 866 | int data_pages, parity_pages; |
444 | int meta_size; | ||
445 | int reserve; | 867 | int reserve; |
446 | int i; | 868 | int i; |
447 | int ret = 0; | 869 | int ret = 0; |
870 | bool wake_reclaim = false; | ||
448 | 871 | ||
449 | if (!log) | 872 | if (!log) |
450 | return -EAGAIN; | 873 | return -EAGAIN; |
@@ -456,11 +879,15 @@ int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh) | |||
456 | return -EAGAIN; | 879 | return -EAGAIN; |
457 | } | 880 | } |
458 | 881 | ||
882 | WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state)); | ||
883 | |||
459 | for (i = 0; i < sh->disks; i++) { | 884 | for (i = 0; i < sh->disks; i++) { |
460 | void *addr; | 885 | void *addr; |
461 | 886 | ||
462 | if (!test_bit(R5_Wantwrite, &sh->dev[i].flags)) | 887 | if (!test_bit(R5_Wantwrite, &sh->dev[i].flags) || |
888 | test_bit(R5_InJournal, &sh->dev[i].flags)) | ||
463 | continue; | 889 | continue; |
890 | |||
464 | write_disks++; | 891 | write_disks++; |
465 | /* checksum is already calculated in last run */ | 892 | /* checksum is already calculated in last run */ |
466 | if (test_bit(STRIPE_LOG_TRAPPED, &sh->state)) | 893 | if (test_bit(STRIPE_LOG_TRAPPED, &sh->state)) |
@@ -473,15 +900,6 @@ int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh) | |||
473 | parity_pages = 1 + !!(sh->qd_idx >= 0); | 900 | parity_pages = 1 + !!(sh->qd_idx >= 0); |
474 | data_pages = write_disks - parity_pages; | 901 | data_pages = write_disks - parity_pages; |
475 | 902 | ||
476 | meta_size = | ||
477 | ((sizeof(struct r5l_payload_data_parity) + sizeof(__le32)) | ||
478 | * data_pages) + | ||
479 | sizeof(struct r5l_payload_data_parity) + | ||
480 | sizeof(__le32) * parity_pages; | ||
481 | /* Doesn't work with very big raid array */ | ||
482 | if (meta_size + sizeof(struct r5l_meta_block) > PAGE_SIZE) | ||
483 | return -EINVAL; | ||
484 | |||
485 | set_bit(STRIPE_LOG_TRAPPED, &sh->state); | 903 | set_bit(STRIPE_LOG_TRAPPED, &sh->state); |
486 | /* | 904 | /* |
487 | * The stripe must enter state machine again to finish the write, so | 905 | * The stripe must enter state machine again to finish the write, so |
@@ -493,22 +911,49 @@ int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh) | |||
493 | mutex_lock(&log->io_mutex); | 911 | mutex_lock(&log->io_mutex); |
494 | /* meta + data */ | 912 | /* meta + data */ |
495 | reserve = (1 + write_disks) << (PAGE_SHIFT - 9); | 913 | reserve = (1 + write_disks) << (PAGE_SHIFT - 9); |
496 | if (!r5l_has_free_space(log, reserve)) { | ||
497 | spin_lock(&log->no_space_stripes_lock); | ||
498 | list_add_tail(&sh->log_list, &log->no_space_stripes); | ||
499 | spin_unlock(&log->no_space_stripes_lock); | ||
500 | 914 | ||
501 | r5l_wake_reclaim(log, reserve); | 915 | if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) { |
502 | } else { | 916 | if (!r5l_has_free_space(log, reserve)) { |
503 | ret = r5l_log_stripe(log, sh, data_pages, parity_pages); | 917 | r5l_add_no_space_stripe(log, sh); |
504 | if (ret) { | 918 | wake_reclaim = true; |
505 | spin_lock_irq(&log->io_list_lock); | 919 | } else { |
506 | list_add_tail(&sh->log_list, &log->no_mem_stripes); | 920 | ret = r5l_log_stripe(log, sh, data_pages, parity_pages); |
507 | spin_unlock_irq(&log->io_list_lock); | 921 | if (ret) { |
922 | spin_lock_irq(&log->io_list_lock); | ||
923 | list_add_tail(&sh->log_list, | ||
924 | &log->no_mem_stripes); | ||
925 | spin_unlock_irq(&log->io_list_lock); | ||
926 | } | ||
927 | } | ||
928 | } else { /* R5C_JOURNAL_MODE_WRITE_BACK */ | ||
929 | /* | ||
930 | * log space critical, do not process stripes that are | ||
931 | * not in cache yet (sh->log_start == MaxSector). | ||
932 | */ | ||
933 | if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) && | ||
934 | sh->log_start == MaxSector) { | ||
935 | r5l_add_no_space_stripe(log, sh); | ||
936 | wake_reclaim = true; | ||
937 | reserve = 0; | ||
938 | } else if (!r5l_has_free_space(log, reserve)) { | ||
939 | if (sh->log_start == log->last_checkpoint) | ||
940 | BUG(); | ||
941 | else | ||
942 | r5l_add_no_space_stripe(log, sh); | ||
943 | } else { | ||
944 | ret = r5l_log_stripe(log, sh, data_pages, parity_pages); | ||
945 | if (ret) { | ||
946 | spin_lock_irq(&log->io_list_lock); | ||
947 | list_add_tail(&sh->log_list, | ||
948 | &log->no_mem_stripes); | ||
949 | spin_unlock_irq(&log->io_list_lock); | ||
950 | } | ||
508 | } | 951 | } |
509 | } | 952 | } |
510 | 953 | ||
511 | mutex_unlock(&log->io_mutex); | 954 | mutex_unlock(&log->io_mutex); |
955 | if (wake_reclaim) | ||
956 | r5l_wake_reclaim(log, reserve); | ||
512 | return 0; | 957 | return 0; |
513 | } | 958 | } |
514 | 959 | ||
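The reservation is (1 + write_disks) pages expressed in sectors, and the two journal modes then apply different policies when that much log space is not available; sh->log_start == MaxSector marks a stripe that has no data in the journal yet. A hedged sketch of the reservation and placement decision (PAGE_SHIFT and the MaxSector stand-in are assumptions):

#include <stdint.h>
#include <stdbool.h>

typedef uint64_t sector_t;

#define PAGE_SHIFT	12			/* assumed 4K pages */
#define MAX_SECTOR	((sector_t)~0ULL)	/* stand-in for MaxSector */

enum journal_mode { WRITE_THROUGH, WRITE_BACK };
enum stripe_disposition { LOG_NOW, QUEUE_NO_SPACE, QUEUE_NO_SPACE_AND_WAKE };

/* meta block + one page per written disk, converted to 512B sectors */
static sector_t log_reserve(int write_disks)
{
	return (sector_t)(1 + write_disks) << (PAGE_SHIFT - 9);
}

static enum stripe_disposition place_stripe(enum journal_mode mode, bool log_critical,
					    bool has_free_space, sector_t stripe_log_start)
{
	if (mode == WRITE_THROUGH)
		return has_free_space ? LOG_NOW : QUEUE_NO_SPACE_AND_WAKE;

	/* write-back: under critical pressure, don't admit brand-new stripes */
	if (log_critical && stripe_log_start == MAX_SECTOR)
		return QUEUE_NO_SPACE_AND_WAKE;
	return has_free_space ? LOG_NOW : QUEUE_NO_SPACE;
}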
@@ -525,17 +970,34 @@ int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio) | |||
525 | { | 970 | { |
526 | if (!log) | 971 | if (!log) |
527 | return -ENODEV; | 972 | return -ENODEV; |
528 | /* | 973 | |
529 | * we flush log disk cache first, then write stripe data to raid disks. | 974 | if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) { |
530 | * So if bio is finished, the log disk cache is flushed already. The | 975 | /* |
531 | * recovery guarantees we can recovery the bio from log disk, so we | 976 | * in write through (journal only) |
532 | * don't need to flush again | 977 | * we flush log disk cache first, then write stripe data to |
533 | */ | 978 | * raid disks. So if bio is finished, the log disk cache is |
534 | if (bio->bi_iter.bi_size == 0) { | 979 | * flushed already. The recovery guarantees we can recovery |
535 | bio_endio(bio); | 980 | * the bio from log disk, so we don't need to flush again |
536 | return 0; | 981 | */ |
982 | if (bio->bi_iter.bi_size == 0) { | ||
983 | bio_endio(bio); | ||
984 | return 0; | ||
985 | } | ||
986 | bio->bi_opf &= ~REQ_PREFLUSH; | ||
987 | } else { | ||
988 | /* write back (with cache) */ | ||
989 | if (bio->bi_iter.bi_size == 0) { | ||
990 | mutex_lock(&log->io_mutex); | ||
991 | r5l_get_meta(log, 0); | ||
992 | bio_list_add(&log->current_io->flush_barriers, bio); | ||
993 | log->current_io->has_flush = 1; | ||
994 | log->current_io->has_null_flush = 1; | ||
995 | atomic_inc(&log->current_io->pending_stripe); | ||
996 | r5l_submit_current_io(log); | ||
997 | mutex_unlock(&log->io_mutex); | ||
998 | return 0; | ||
999 | } | ||
537 | } | 1000 | } |
538 | bio->bi_opf &= ~REQ_PREFLUSH; | ||
539 | return -EAGAIN; | 1001 | return -EAGAIN; |
540 | } | 1002 | } |
541 | 1003 | ||
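The rewritten flush handler treats an empty PREFLUSH bio differently per mode: write-through can complete it immediately because the log device cache was already flushed before data reached the raid disks, while write-back parks it on the current io_unit's flush_barriers list and completes it when that io_unit (submitted with PREFLUSH) finishes. A hedged decision table in code form:

#include <stdbool.h>

enum journal_mode { WRITE_THROUGH, WRITE_BACK };

enum flush_action {
	COMPLETE_NOW,		/* write-through, empty flush: log cache already flushed */
	ATTACH_TO_IO_UNIT,	/* write-back, empty flush: queue on flush_barriers */
	STRIP_FLAG_AND_RETRY,	/* write-through, flush+data: PREFLUSH no longer needed */
	KEEP_FLAG_AND_RETRY,	/* write-back, flush+data: handled later in the stripe path */
};

static enum flush_action handle_flush(enum journal_mode mode, bool bio_has_data)
{
	if (mode == WRITE_THROUGH)
		return bio_has_data ? STRIP_FLAG_AND_RETRY : COMPLETE_NOW;
	return bio_has_data ? KEEP_FLAG_AND_RETRY : ATTACH_TO_IO_UNIT;
}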
@@ -555,10 +1017,40 @@ static void r5l_run_no_space_stripes(struct r5l_log *log) | |||
555 | spin_unlock(&log->no_space_stripes_lock); | 1017 | spin_unlock(&log->no_space_stripes_lock); |
556 | } | 1018 | } |
557 | 1019 | ||
1020 | /* | ||
1021 | * calculate new last_checkpoint | ||
1022 | * for write through mode, returns log->next_checkpoint | ||
1023 | * for write back, returns log_start of first sh in stripe_in_journal_list | ||
1024 | */ | ||
1025 | static sector_t r5c_calculate_new_cp(struct r5conf *conf) | ||
1026 | { | ||
1027 | struct stripe_head *sh; | ||
1028 | struct r5l_log *log = conf->log; | ||
1029 | sector_t new_cp; | ||
1030 | unsigned long flags; | ||
1031 | |||
1032 | if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) | ||
1033 | return log->next_checkpoint; | ||
1034 | |||
1035 | spin_lock_irqsave(&log->stripe_in_journal_lock, flags); | ||
1036 | if (list_empty(&conf->log->stripe_in_journal_list)) { | ||
1037 | /* all stripes flushed */ | ||
1038 | spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags); | ||
1039 | return log->next_checkpoint; | ||
1040 | } | ||
1041 | sh = list_first_entry(&conf->log->stripe_in_journal_list, | ||
1042 | struct stripe_head, r5c); | ||
1043 | new_cp = sh->log_start; | ||
1044 | spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags); | ||
1045 | return new_cp; | ||
1046 | } | ||
1047 | |||
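r5c_calculate_new_cp() makes the reclaimable window mode-dependent: write-through keeps using next_checkpoint, while write-back can only reclaim up to the log_start of the oldest stripe still cached, which therefore pins the log. A standalone sketch of the resulting reclaimable-space calculation (the struct layout is an illustrative stand-in):

#include <stdint.h>
#include <stddef.h>

typedef uint64_t sector_t;

enum journal_mode { WRITE_THROUGH, WRITE_BACK };

struct log_model {
	enum journal_mode mode;
	sector_t device_size;
	sector_t last_checkpoint;
	sector_t next_checkpoint;
	const sector_t *journal_stripe_starts;	/* oldest first; empty if none cached */
	size_t nr_journal_stripes;
};

static sector_t ring_distance(const struct log_model *l, sector_t start, sector_t end)
{
	return end >= start ? end - start : end + l->device_size - start;
}

/* end of the reclaimable window: the journal mode decides which checkpoint applies */
static sector_t new_checkpoint(const struct log_model *l)
{
	if (l->mode == WRITE_THROUGH || l->nr_journal_stripes == 0)
		return l->next_checkpoint;
	return l->journal_stripe_starts[0];	/* oldest cached stripe pins the log */
}

static sector_t reclaimable_space(const struct log_model *l)
{
	return ring_distance(l, l->last_checkpoint, new_checkpoint(l));
}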
558 | static sector_t r5l_reclaimable_space(struct r5l_log *log) | 1048 | static sector_t r5l_reclaimable_space(struct r5l_log *log) |
559 | { | 1049 | { |
1050 | struct r5conf *conf = log->rdev->mddev->private; | ||
1051 | |||
560 | return r5l_ring_distance(log, log->last_checkpoint, | 1052 | return r5l_ring_distance(log, log->last_checkpoint, |
561 | log->next_checkpoint); | 1053 | r5c_calculate_new_cp(conf)); |
562 | } | 1054 | } |
563 | 1055 | ||
564 | static void r5l_run_no_mem_stripe(struct r5l_log *log) | 1056 | static void r5l_run_no_mem_stripe(struct r5l_log *log) |
@@ -589,7 +1081,6 @@ static bool r5l_complete_finished_ios(struct r5l_log *log) | |||
589 | break; | 1081 | break; |
590 | 1082 | ||
591 | log->next_checkpoint = io->log_start; | 1083 | log->next_checkpoint = io->log_start; |
592 | log->next_cp_seq = io->seq; | ||
593 | 1084 | ||
594 | list_del(&io->log_sibling); | 1085 | list_del(&io->log_sibling); |
595 | mempool_free(io, log->io_pool); | 1086 | mempool_free(io, log->io_pool); |
@@ -604,6 +1095,7 @@ static bool r5l_complete_finished_ios(struct r5l_log *log) | |||
604 | static void __r5l_stripe_write_finished(struct r5l_io_unit *io) | 1095 | static void __r5l_stripe_write_finished(struct r5l_io_unit *io) |
605 | { | 1096 | { |
606 | struct r5l_log *log = io->log; | 1097 | struct r5l_log *log = io->log; |
1098 | struct r5conf *conf = log->rdev->mddev->private; | ||
607 | unsigned long flags; | 1099 | unsigned long flags; |
608 | 1100 | ||
609 | spin_lock_irqsave(&log->io_list_lock, flags); | 1101 | spin_lock_irqsave(&log->io_list_lock, flags); |
@@ -614,7 +1106,8 @@ static void __r5l_stripe_write_finished(struct r5l_io_unit *io) | |||
614 | return; | 1106 | return; |
615 | } | 1107 | } |
616 | 1108 | ||
617 | if (r5l_reclaimable_space(log) > log->max_free_space) | 1109 | if (r5l_reclaimable_space(log) > log->max_free_space || |
1110 | test_bit(R5C_LOG_TIGHT, &conf->cache_state)) | ||
618 | r5l_wake_reclaim(log, 0); | 1111 | r5l_wake_reclaim(log, 0); |
619 | 1112 | ||
620 | spin_unlock_irqrestore(&log->io_list_lock, flags); | 1113 | spin_unlock_irqrestore(&log->io_list_lock, flags); |
@@ -713,8 +1206,8 @@ static void r5l_write_super_and_discard_space(struct r5l_log *log, | |||
713 | * there is a deadlock. We workaround this issue with a trylock. | 1206 | * there is a deadlock. We workaround this issue with a trylock. |
714 | * FIXME: we could miss discard if we can't take reconfig mutex | 1207 | * FIXME: we could miss discard if we can't take reconfig mutex |
715 | */ | 1208 | */ |
716 | set_mask_bits(&mddev->flags, 0, | 1209 | set_mask_bits(&mddev->sb_flags, 0, |
717 | BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING)); | 1210 | BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING)); |
718 | if (!mddev_trylock(mddev)) | 1211 | if (!mddev_trylock(mddev)) |
719 | return; | 1212 | return; |
720 | md_update_sb(mddev, 1); | 1213 | md_update_sb(mddev, 1); |
@@ -735,15 +1228,148 @@ static void r5l_write_super_and_discard_space(struct r5l_log *log, | |||
735 | } | 1228 | } |
736 | } | 1229 | } |
737 | 1230 | ||
1231 | /* | ||
1232 | * r5c_flush_stripe moves stripe from cached list to handle_list. When called, | ||
1233 | * the stripe must be on r5c_cached_full_stripes or r5c_cached_partial_stripes. | ||
1234 | * | ||
1235 | * must hold conf->device_lock | ||
1236 | */ | ||
1237 | static void r5c_flush_stripe(struct r5conf *conf, struct stripe_head *sh) | ||
1238 | { | ||
1239 | BUG_ON(list_empty(&sh->lru)); | ||
1240 | BUG_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state)); | ||
1241 | BUG_ON(test_bit(STRIPE_HANDLE, &sh->state)); | ||
1242 | |||
1243 | /* | ||
1244 | * The stripe is not ON_RELEASE_LIST, so it is safe to call | ||
1245 | * raid5_release_stripe() while holding conf->device_lock | ||
1246 | */ | ||
1247 | BUG_ON(test_bit(STRIPE_ON_RELEASE_LIST, &sh->state)); | ||
1248 | assert_spin_locked(&conf->device_lock); | ||
1249 | |||
1250 | list_del_init(&sh->lru); | ||
1251 | atomic_inc(&sh->count); | ||
1252 | |||
1253 | set_bit(STRIPE_HANDLE, &sh->state); | ||
1254 | atomic_inc(&conf->active_stripes); | ||
1255 | r5c_make_stripe_write_out(sh); | ||
1256 | |||
1257 | raid5_release_stripe(sh); | ||
1258 | } | ||
1259 | |||
1260 | /* | ||
1261 | * if num == 0, flush all full stripes | ||
1262 | * if num > 0, flush all full stripes. If less than num full stripes are | ||
1263 | * flushed, flush some partial stripes until a total of num stripes | ||
1264 | * are flushed or there are no more cached stripes. | ||
1265 | */ | ||
1266 | void r5c_flush_cache(struct r5conf *conf, int num) | ||
1267 | { | ||
1268 | int count; | ||
1269 | struct stripe_head *sh, *next; | ||
1270 | |||
1271 | assert_spin_locked(&conf->device_lock); | ||
1272 | if (!conf->log) | ||
1273 | return; | ||
1274 | |||
1275 | count = 0; | ||
1276 | list_for_each_entry_safe(sh, next, &conf->r5c_full_stripe_list, lru) { | ||
1277 | r5c_flush_stripe(conf, sh); | ||
1278 | count++; | ||
1279 | } | ||
1280 | |||
1281 | if (count >= num) | ||
1282 | return; | ||
1283 | list_for_each_entry_safe(sh, next, | ||
1284 | &conf->r5c_partial_stripe_list, lru) { | ||
1285 | r5c_flush_stripe(conf, sh); | ||
1286 | if (++count >= num) | ||
1287 | break; | ||
1288 | } | ||
1289 | } | ||
1290 | |||
1291 | static void r5c_do_reclaim(struct r5conf *conf) | ||
1292 | { | ||
1293 | struct r5l_log *log = conf->log; | ||
1294 | struct stripe_head *sh; | ||
1295 | int count = 0; | ||
1296 | unsigned long flags; | ||
1297 | int total_cached; | ||
1298 | int stripes_to_flush; | ||
1299 | |||
1300 | if (!r5c_is_writeback(log)) | ||
1301 | return; | ||
1302 | |||
1303 | total_cached = atomic_read(&conf->r5c_cached_partial_stripes) + | ||
1304 | atomic_read(&conf->r5c_cached_full_stripes); | ||
1305 | |||
1306 | if (total_cached > conf->min_nr_stripes * 3 / 4 || | ||
1307 | atomic_read(&conf->empty_inactive_list_nr) > 0) | ||
1308 | /* | ||
1309 | * if stripe cache pressure is high, flush all full stripes and | ||
1310 | * some partial stripes | ||
1311 | */ | ||
1312 | stripes_to_flush = R5C_RECLAIM_STRIPE_GROUP; | ||
1313 | else if (total_cached > conf->min_nr_stripes * 1 / 2 || | ||
1314 | atomic_read(&conf->r5c_cached_full_stripes) > | ||
1315 | R5C_FULL_STRIPE_FLUSH_BATCH) | ||
1316 | /* | ||
1317 | * if stripe cache pressure is moderate, or if there are many full | ||
1318 | * stripes, flush all full stripes | ||
1319 | */ | ||
1320 | stripes_to_flush = 0; | ||
1321 | else | ||
1322 | /* no need to flush */ | ||
1323 | stripes_to_flush = -1; | ||
1324 | |||
1325 | if (stripes_to_flush >= 0) { | ||
1326 | spin_lock_irqsave(&conf->device_lock, flags); | ||
1327 | r5c_flush_cache(conf, stripes_to_flush); | ||
1328 | spin_unlock_irqrestore(&conf->device_lock, flags); | ||
1329 | } | ||
1330 | |||
1331 | /* if log space is tight, flush stripes on stripe_in_journal_list */ | ||
1332 | if (test_bit(R5C_LOG_TIGHT, &conf->cache_state)) { | ||
1333 | spin_lock_irqsave(&log->stripe_in_journal_lock, flags); | ||
1334 | spin_lock(&conf->device_lock); | ||
1335 | list_for_each_entry(sh, &log->stripe_in_journal_list, r5c) { | ||
1336 | /* | ||
1337 | * stripes on stripe_in_journal_list could be in any | ||
1338 | * state of the stripe_cache state machine. In this | ||
1339 | * case, we only want to flush stripe on | ||
1340 | * r5c_cached_full/partial_stripes. The following | ||
1341 | * condition makes sure the stripe is on one of the | ||
1342 | * two lists. | ||
1343 | */ | ||
1344 | if (!list_empty(&sh->lru) && | ||
1345 | !test_bit(STRIPE_HANDLE, &sh->state) && | ||
1346 | atomic_read(&sh->count) == 0) { | ||
1347 | r5c_flush_stripe(conf, sh); | ||
1348 | } | ||
1349 | if (count++ >= R5C_RECLAIM_STRIPE_GROUP) | ||
1350 | break; | ||
1351 | } | ||
1352 | spin_unlock(&conf->device_lock); | ||
1353 | spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags); | ||
1354 | } | ||
1355 | |||
1356 | if (!test_bit(R5C_LOG_CRITICAL, &conf->cache_state)) | ||
1357 | r5l_run_no_space_stripes(log); | ||
1358 | |||
1359 | md_wakeup_thread(conf->mddev->thread); | ||
1360 | } | ||
738 | 1361 | ||
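The cache-pressure policy in r5c_do_reclaim() above can be condensed into the following standalone sketch (editorial illustration only; the thresholds mirror the code above, while the function and parameter names are invented):

/* Returns how many cached stripes to flush: a whole reclaim group under high
 * pressure, 0 meaning "full stripes only" under moderate pressure, or -1 when
 * no flushing is needed. */
int stripes_to_flush_policy(int total_cached, int full_cached,
			    int min_nr_stripes, int empty_inactive,
			    int reclaim_group, int full_flush_batch)
{
	if (total_cached > min_nr_stripes * 3 / 4 || empty_inactive > 0)
		return reclaim_group;
	if (total_cached > min_nr_stripes / 2 || full_cached > full_flush_batch)
		return 0;
	return -1;
}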
739 | static void r5l_do_reclaim(struct r5l_log *log) | 1362 | static void r5l_do_reclaim(struct r5l_log *log) |
740 | { | 1363 | { |
1364 | struct r5conf *conf = log->rdev->mddev->private; | ||
741 | sector_t reclaim_target = xchg(&log->reclaim_target, 0); | 1365 | sector_t reclaim_target = xchg(&log->reclaim_target, 0); |
742 | sector_t reclaimable; | 1366 | sector_t reclaimable; |
743 | sector_t next_checkpoint; | 1367 | sector_t next_checkpoint; |
744 | u64 next_cp_seq; | 1368 | bool write_super; |
745 | 1369 | ||
746 | spin_lock_irq(&log->io_list_lock); | 1370 | spin_lock_irq(&log->io_list_lock); |
1371 | write_super = r5l_reclaimable_space(log) > log->max_free_space || | ||
1372 | reclaim_target != 0 || !list_empty(&log->no_space_stripes); | ||
747 | /* | 1373 | /* |
748 | * move proper io_unit to reclaim list. We should not change the order. | 1374 | * move proper io_unit to reclaim list. We should not change the order. |
749 | * reclaimable/unreclaimable io_unit can be mixed in the list, we | 1375 | * reclaimable/unreclaimable io_unit can be mixed in the list, we |
@@ -764,12 +1390,12 @@ static void r5l_do_reclaim(struct r5l_log *log) | |||
764 | log->io_list_lock); | 1390 | log->io_list_lock); |
765 | } | 1391 | } |
766 | 1392 | ||
767 | next_checkpoint = log->next_checkpoint; | 1393 | next_checkpoint = r5c_calculate_new_cp(conf); |
768 | next_cp_seq = log->next_cp_seq; | ||
769 | spin_unlock_irq(&log->io_list_lock); | 1394 | spin_unlock_irq(&log->io_list_lock); |
770 | 1395 | ||
771 | BUG_ON(reclaimable < 0); | 1396 | BUG_ON(reclaimable < 0); |
772 | if (reclaimable == 0) | 1397 | |
1398 | if (reclaimable == 0 || !write_super) | ||
773 | return; | 1399 | return; |
774 | 1400 | ||
775 | /* | 1401 | /* |
@@ -781,7 +1407,7 @@ static void r5l_do_reclaim(struct r5l_log *log) | |||
781 | 1407 | ||
782 | mutex_lock(&log->io_mutex); | 1408 | mutex_lock(&log->io_mutex); |
783 | log->last_checkpoint = next_checkpoint; | 1409 | log->last_checkpoint = next_checkpoint; |
784 | log->last_cp_seq = next_cp_seq; | 1410 | r5c_update_log_state(log); |
785 | mutex_unlock(&log->io_mutex); | 1411 | mutex_unlock(&log->io_mutex); |
786 | 1412 | ||
787 | r5l_run_no_space_stripes(log); | 1413 | r5l_run_no_space_stripes(log); |
@@ -795,14 +1421,17 @@ static void r5l_reclaim_thread(struct md_thread *thread) | |||
795 | 1421 | ||
796 | if (!log) | 1422 | if (!log) |
797 | return; | 1423 | return; |
1424 | r5c_do_reclaim(conf); | ||
798 | r5l_do_reclaim(log); | 1425 | r5l_do_reclaim(log); |
799 | } | 1426 | } |
800 | 1427 | ||
801 | static void r5l_wake_reclaim(struct r5l_log *log, sector_t space) | 1428 | void r5l_wake_reclaim(struct r5l_log *log, sector_t space) |
802 | { | 1429 | { |
803 | unsigned long target; | 1430 | unsigned long target; |
804 | unsigned long new = (unsigned long)space; /* overflow in theory */ | 1431 | unsigned long new = (unsigned long)space; /* overflow in theory */ |
805 | 1432 | ||
1433 | if (!log) | ||
1434 | return; | ||
806 | do { | 1435 | do { |
807 | target = log->reclaim_target; | 1436 | target = log->reclaim_target; |
808 | if (new < target) | 1437 | if (new < target) |
@@ -816,22 +1445,14 @@ void r5l_quiesce(struct r5l_log *log, int state) | |||
816 | struct mddev *mddev; | 1445 | struct mddev *mddev; |
817 | if (!log || state == 2) | 1446 | if (!log || state == 2) |
818 | return; | 1447 | return; |
819 | if (state == 0) { | 1448 | if (state == 0) |
820 | /* | 1449 | kthread_unpark(log->reclaim_thread->tsk); |
821 | * This is a special case for hotadd. In suspend, the array has | 1450 | else if (state == 1) { |
822 | * no journal. In resume, journal is initialized as well as the | ||
823 | * reclaim thread. | ||
824 | */ | ||
825 | if (log->reclaim_thread) | ||
826 | return; | ||
827 | log->reclaim_thread = md_register_thread(r5l_reclaim_thread, | ||
828 | log->rdev->mddev, "reclaim"); | ||
829 | } else if (state == 1) { | ||
830 | /* make sure r5l_write_super_and_discard_space exits */ | 1451 | /* make sure r5l_write_super_and_discard_space exits */ |
831 | mddev = log->rdev->mddev; | 1452 | mddev = log->rdev->mddev; |
832 | wake_up(&mddev->sb_wait); | 1453 | wake_up(&mddev->sb_wait); |
833 | r5l_wake_reclaim(log, -1L); | 1454 | kthread_park(log->reclaim_thread->tsk); |
834 | md_unregister_thread(&log->reclaim_thread); | 1455 | r5l_wake_reclaim(log, MaxSector); |
835 | r5l_do_reclaim(log); | 1456 | r5l_do_reclaim(log); |
836 | } | 1457 | } |
837 | } | 1458 | } |
@@ -857,10 +1478,13 @@ struct r5l_recovery_ctx { | |||
857 | sector_t meta_total_blocks; /* total size of current meta and data */ | 1478 | sector_t meta_total_blocks; /* total size of current meta and data */ |
858 | sector_t pos; /* recovery position */ | 1479 | sector_t pos; /* recovery position */ |
859 | u64 seq; /* recovery position seq */ | 1480 | u64 seq; /* recovery position seq */ |
1481 | int data_parity_stripes; /* number of data_parity stripes */ | ||
1482 | int data_only_stripes; /* number of data_only stripes */ | ||
1483 | struct list_head cached_list; | ||
860 | }; | 1484 | }; |
861 | 1485 | ||
862 | static int r5l_read_meta_block(struct r5l_log *log, | 1486 | static int r5l_recovery_read_meta_block(struct r5l_log *log, |
863 | struct r5l_recovery_ctx *ctx) | 1487 | struct r5l_recovery_ctx *ctx) |
864 | { | 1488 | { |
865 | struct page *page = ctx->meta_page; | 1489 | struct page *page = ctx->meta_page; |
866 | struct r5l_meta_block *mb; | 1490 | struct r5l_meta_block *mb; |
@@ -892,170 +1516,618 @@ static int r5l_read_meta_block(struct r5l_log *log, | |||
892 | return 0; | 1516 | return 0; |
893 | } | 1517 | } |
894 | 1518 | ||
895 | static int r5l_recovery_flush_one_stripe(struct r5l_log *log, | 1519 | static void |
896 | struct r5l_recovery_ctx *ctx, | 1520 | r5l_recovery_create_empty_meta_block(struct r5l_log *log, |
897 | sector_t stripe_sect, | 1521 | struct page *page, |
898 | int *offset, sector_t *log_offset) | 1522 | sector_t pos, u64 seq) |
899 | { | 1523 | { |
900 | struct r5conf *conf = log->rdev->mddev->private; | 1524 | struct r5l_meta_block *mb; |
901 | struct stripe_head *sh; | ||
902 | struct r5l_payload_data_parity *payload; | ||
903 | int disk_index; | ||
904 | 1525 | ||
905 | sh = raid5_get_active_stripe(conf, stripe_sect, 0, 0, 0); | 1526 | mb = page_address(page); |
906 | while (1) { | 1527 | clear_page(mb); |
907 | payload = page_address(ctx->meta_page) + *offset; | 1528 | mb->magic = cpu_to_le32(R5LOG_MAGIC); |
1529 | mb->version = R5LOG_VERSION; | ||
1530 | mb->meta_size = cpu_to_le32(sizeof(struct r5l_meta_block)); | ||
1531 | mb->seq = cpu_to_le64(seq); | ||
1532 | mb->position = cpu_to_le64(pos); | ||
1533 | } | ||
908 | 1534 | ||
909 | if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) { | 1535 | static int r5l_log_write_empty_meta_block(struct r5l_log *log, sector_t pos, |
910 | raid5_compute_sector(conf, | 1536 | u64 seq) |
911 | le64_to_cpu(payload->location), 0, | 1537 | { |
912 | &disk_index, sh); | 1538 | struct page *page; |
1539 | struct r5l_meta_block *mb; | ||
913 | 1540 | ||
914 | sync_page_io(log->rdev, *log_offset, PAGE_SIZE, | 1541 | page = alloc_page(GFP_KERNEL); |
915 | sh->dev[disk_index].page, REQ_OP_READ, 0, | 1542 | if (!page) |
916 | false); | 1543 | return -ENOMEM; |
917 | sh->dev[disk_index].log_checksum = | 1544 | r5l_recovery_create_empty_meta_block(log, page, pos, seq); |
918 | le32_to_cpu(payload->checksum[0]); | 1545 | mb = page_address(page); |
919 | set_bit(R5_Wantwrite, &sh->dev[disk_index].flags); | 1546 | mb->checksum = cpu_to_le32(crc32c_le(log->uuid_checksum, |
920 | ctx->meta_total_blocks += BLOCK_SECTORS; | 1547 | mb, PAGE_SIZE)); |
921 | } else { | 1548 | if (!sync_page_io(log->rdev, pos, PAGE_SIZE, page, REQ_OP_WRITE, |
922 | disk_index = sh->pd_idx; | 1549 | REQ_FUA, false)) { |
923 | sync_page_io(log->rdev, *log_offset, PAGE_SIZE, | 1550 | __free_page(page); |
924 | sh->dev[disk_index].page, REQ_OP_READ, 0, | 1551 | return -EIO; |
925 | false); | 1552 | } |
926 | sh->dev[disk_index].log_checksum = | 1553 | __free_page(page); |
927 | le32_to_cpu(payload->checksum[0]); | 1554 | return 0; |
928 | set_bit(R5_Wantwrite, &sh->dev[disk_index].flags); | 1555 | } |
929 | |||
930 | if (sh->qd_idx >= 0) { | ||
931 | disk_index = sh->qd_idx; | ||
932 | sync_page_io(log->rdev, | ||
933 | r5l_ring_add(log, *log_offset, BLOCK_SECTORS), | ||
934 | PAGE_SIZE, sh->dev[disk_index].page, | ||
935 | REQ_OP_READ, 0, false); | ||
936 | sh->dev[disk_index].log_checksum = | ||
937 | le32_to_cpu(payload->checksum[1]); | ||
938 | set_bit(R5_Wantwrite, | ||
939 | &sh->dev[disk_index].flags); | ||
940 | } | ||
941 | ctx->meta_total_blocks += BLOCK_SECTORS * conf->max_degraded; | ||
942 | } | ||
943 | 1556 | ||
944 | *log_offset = r5l_ring_add(log, *log_offset, | 1557 | /* |
945 | le32_to_cpu(payload->size)); | 1558 | * r5l_recovery_load_data and r5l_recovery_load_parity uses flag R5_Wantwrite |
946 | *offset += sizeof(struct r5l_payload_data_parity) + | 1559 | * to mark valid (potentially not flushed) data in the journal. |
947 | sizeof(__le32) * | 1560 | * |
948 | (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9)); | 1561 | * We already verified checksum in r5l_recovery_verify_data_checksum_for_mb, |
949 | if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_PARITY) | 1562 | * so there should not be any mismatch here. |
950 | break; | 1563 | */ |
1564 | static void r5l_recovery_load_data(struct r5l_log *log, | ||
1565 | struct stripe_head *sh, | ||
1566 | struct r5l_recovery_ctx *ctx, | ||
1567 | struct r5l_payload_data_parity *payload, | ||
1568 | sector_t log_offset) | ||
1569 | { | ||
1570 | struct mddev *mddev = log->rdev->mddev; | ||
1571 | struct r5conf *conf = mddev->private; | ||
1572 | int dd_idx; | ||
1573 | |||
1574 | raid5_compute_sector(conf, | ||
1575 | le64_to_cpu(payload->location), 0, | ||
1576 | &dd_idx, sh); | ||
1577 | sync_page_io(log->rdev, log_offset, PAGE_SIZE, | ||
1578 | sh->dev[dd_idx].page, REQ_OP_READ, 0, false); | ||
1579 | sh->dev[dd_idx].log_checksum = | ||
1580 | le32_to_cpu(payload->checksum[0]); | ||
1581 | ctx->meta_total_blocks += BLOCK_SECTORS; | ||
1582 | |||
1583 | set_bit(R5_Wantwrite, &sh->dev[dd_idx].flags); | ||
1584 | set_bit(STRIPE_R5C_CACHING, &sh->state); | ||
1585 | } | ||
1586 | |||
1587 | static void r5l_recovery_load_parity(struct r5l_log *log, | ||
1588 | struct stripe_head *sh, | ||
1589 | struct r5l_recovery_ctx *ctx, | ||
1590 | struct r5l_payload_data_parity *payload, | ||
1591 | sector_t log_offset) | ||
1592 | { | ||
1593 | struct mddev *mddev = log->rdev->mddev; | ||
1594 | struct r5conf *conf = mddev->private; | ||
1595 | |||
1596 | ctx->meta_total_blocks += BLOCK_SECTORS * conf->max_degraded; | ||
1597 | sync_page_io(log->rdev, log_offset, PAGE_SIZE, | ||
1598 | sh->dev[sh->pd_idx].page, REQ_OP_READ, 0, false); | ||
1599 | sh->dev[sh->pd_idx].log_checksum = | ||
1600 | le32_to_cpu(payload->checksum[0]); | ||
1601 | set_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags); | ||
1602 | |||
1603 | if (sh->qd_idx >= 0) { | ||
1604 | sync_page_io(log->rdev, | ||
1605 | r5l_ring_add(log, log_offset, BLOCK_SECTORS), | ||
1606 | PAGE_SIZE, sh->dev[sh->qd_idx].page, | ||
1607 | REQ_OP_READ, 0, false); | ||
1608 | sh->dev[sh->qd_idx].log_checksum = | ||
1609 | le32_to_cpu(payload->checksum[1]); | ||
1610 | set_bit(R5_Wantwrite, &sh->dev[sh->qd_idx].flags); | ||
951 | } | 1611 | } |
1612 | clear_bit(STRIPE_R5C_CACHING, &sh->state); | ||
1613 | } | ||
952 | 1614 | ||
953 | for (disk_index = 0; disk_index < sh->disks; disk_index++) { | 1615 | static void r5l_recovery_reset_stripe(struct stripe_head *sh) |
954 | void *addr; | 1616 | { |
955 | u32 checksum; | 1617 | int i; |
956 | 1618 | ||
1619 | sh->state = 0; | ||
1620 | sh->log_start = MaxSector; | ||
1621 | for (i = sh->disks; i--; ) | ||
1622 | sh->dev[i].flags = 0; | ||
1623 | } | ||
1624 | |||
1625 | static void | ||
1626 | r5l_recovery_replay_one_stripe(struct r5conf *conf, | ||
1627 | struct stripe_head *sh, | ||
1628 | struct r5l_recovery_ctx *ctx) | ||
1629 | { | ||
1630 | struct md_rdev *rdev, *rrdev; | ||
1631 | int disk_index; | ||
1632 | int data_count = 0; | ||
1633 | |||
1634 | for (disk_index = 0; disk_index < sh->disks; disk_index++) { | ||
957 | if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags)) | 1635 | if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags)) |
958 | continue; | 1636 | continue; |
959 | addr = kmap_atomic(sh->dev[disk_index].page); | 1637 | if (disk_index == sh->qd_idx || disk_index == sh->pd_idx) |
960 | checksum = crc32c_le(log->uuid_checksum, addr, PAGE_SIZE); | 1638 | continue; |
961 | kunmap_atomic(addr); | 1639 | data_count++; |
962 | if (checksum != sh->dev[disk_index].log_checksum) | ||
963 | goto error; | ||
964 | } | 1640 | } |
965 | 1641 | ||
966 | for (disk_index = 0; disk_index < sh->disks; disk_index++) { | 1642 | /* |
967 | struct md_rdev *rdev, *rrdev; | 1643 | * stripes that only have parity must have been flushed |
1644 | * before the crash that we are now recovering from, so | ||
1645 | * there is nothing more to recover. | ||
1646 | */ | ||
1647 | if (data_count == 0) | ||
1648 | goto out; | ||
968 | 1649 | ||
969 | if (!test_and_clear_bit(R5_Wantwrite, | 1650 | for (disk_index = 0; disk_index < sh->disks; disk_index++) { |
970 | &sh->dev[disk_index].flags)) | 1651 | if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags)) |
971 | continue; | 1652 | continue; |
972 | 1653 | ||
973 | /* in case device is broken */ | 1654 | /* in case device is broken */ |
1655 | rcu_read_lock(); | ||
974 | rdev = rcu_dereference(conf->disks[disk_index].rdev); | 1656 | rdev = rcu_dereference(conf->disks[disk_index].rdev); |
975 | if (rdev) | 1657 | if (rdev) { |
976 | sync_page_io(rdev, stripe_sect, PAGE_SIZE, | 1658 | atomic_inc(&rdev->nr_pending); |
1659 | rcu_read_unlock(); | ||
1660 | sync_page_io(rdev, sh->sector, PAGE_SIZE, | ||
977 | sh->dev[disk_index].page, REQ_OP_WRITE, 0, | 1661 | sh->dev[disk_index].page, REQ_OP_WRITE, 0, |
978 | false); | 1662 | false); |
1663 | rdev_dec_pending(rdev, rdev->mddev); | ||
1664 | rcu_read_lock(); | ||
1665 | } | ||
979 | rrdev = rcu_dereference(conf->disks[disk_index].replacement); | 1666 | rrdev = rcu_dereference(conf->disks[disk_index].replacement); |
980 | if (rrdev) | 1667 | if (rrdev) { |
981 | sync_page_io(rrdev, stripe_sect, PAGE_SIZE, | 1668 | atomic_inc(&rrdev->nr_pending); |
1669 | rcu_read_unlock(); | ||
1670 | sync_page_io(rrdev, sh->sector, PAGE_SIZE, | ||
982 | sh->dev[disk_index].page, REQ_OP_WRITE, 0, | 1671 | sh->dev[disk_index].page, REQ_OP_WRITE, 0, |
983 | false); | 1672 | false); |
1673 | rdev_dec_pending(rrdev, rrdev->mddev); | ||
1674 | rcu_read_lock(); | ||
1675 | } | ||
1676 | rcu_read_unlock(); | ||
984 | } | 1677 | } |
985 | raid5_release_stripe(sh); | 1678 | ctx->data_parity_stripes++; |
1679 | out: | ||
1680 | r5l_recovery_reset_stripe(sh); | ||
1681 | } | ||
1682 | |||
1683 | static struct stripe_head * | ||
1684 | r5c_recovery_alloc_stripe(struct r5conf *conf, | ||
1685 | sector_t stripe_sect, | ||
1686 | sector_t log_start) | ||
1687 | { | ||
1688 | struct stripe_head *sh; | ||
1689 | |||
1690 | sh = raid5_get_active_stripe(conf, stripe_sect, 0, 1, 0); | ||
1691 | if (!sh) | ||
1692 | return NULL; /* no more stripe available */ | ||
1693 | |||
1694 | r5l_recovery_reset_stripe(sh); | ||
1695 | sh->log_start = log_start; | ||
1696 | |||
1697 | return sh; | ||
1698 | } | ||
1699 | |||
1700 | static struct stripe_head * | ||
1701 | r5c_recovery_lookup_stripe(struct list_head *list, sector_t sect) | ||
1702 | { | ||
1703 | struct stripe_head *sh; | ||
1704 | |||
1705 | list_for_each_entry(sh, list, lru) | ||
1706 | if (sh->sector == sect) | ||
1707 | return sh; | ||
1708 | return NULL; | ||
1709 | } | ||
1710 | |||
1711 | static void | ||
1712 | r5c_recovery_drop_stripes(struct list_head *cached_stripe_list, | ||
1713 | struct r5l_recovery_ctx *ctx) | ||
1714 | { | ||
1715 | struct stripe_head *sh, *next; | ||
1716 | |||
1717 | list_for_each_entry_safe(sh, next, cached_stripe_list, lru) { | ||
1718 | r5l_recovery_reset_stripe(sh); | ||
1719 | list_del_init(&sh->lru); | ||
1720 | raid5_release_stripe(sh); | ||
1721 | } | ||
1722 | } | ||
1723 | |||
1724 | static void | ||
1725 | r5c_recovery_replay_stripes(struct list_head *cached_stripe_list, | ||
1726 | struct r5l_recovery_ctx *ctx) | ||
1727 | { | ||
1728 | struct stripe_head *sh, *next; | ||
1729 | |||
1730 | list_for_each_entry_safe(sh, next, cached_stripe_list, lru) | ||
1731 | if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) { | ||
1732 | r5l_recovery_replay_one_stripe(sh->raid_conf, sh, ctx); | ||
1733 | list_del_init(&sh->lru); | ||
1734 | raid5_release_stripe(sh); | ||
1735 | } | ||
1736 | } | ||
1737 | |||
1738 | /* if it matches, return 0; otherwise return -EINVAL */ | ||
1739 | static int | ||
1740 | r5l_recovery_verify_data_checksum(struct r5l_log *log, struct page *page, | ||
1741 | sector_t log_offset, __le32 log_checksum) | ||
1742 | { | ||
1743 | void *addr; | ||
1744 | u32 checksum; | ||
1745 | |||
1746 | sync_page_io(log->rdev, log_offset, PAGE_SIZE, | ||
1747 | page, REQ_OP_READ, 0, false); | ||
1748 | addr = kmap_atomic(page); | ||
1749 | checksum = crc32c_le(log->uuid_checksum, addr, PAGE_SIZE); | ||
1750 | kunmap_atomic(addr); | ||
1751 | return (le32_to_cpu(log_checksum) == checksum) ? 0 : -EINVAL; | ||
1752 | } | ||
1753 | |||
1754 | /* | ||
1755 | * before loading data to the stripe cache, we need to verify checksums for all | ||
1756 | * data; if there is a mismatch for any data page, we drop all data in the meta block | ||
1757 | */ | ||
1758 | static int | ||
1759 | r5l_recovery_verify_data_checksum_for_mb(struct r5l_log *log, | ||
1760 | struct r5l_recovery_ctx *ctx) | ||
1761 | { | ||
1762 | struct mddev *mddev = log->rdev->mddev; | ||
1763 | struct r5conf *conf = mddev->private; | ||
1764 | struct r5l_meta_block *mb = page_address(ctx->meta_page); | ||
1765 | sector_t mb_offset = sizeof(struct r5l_meta_block); | ||
1766 | sector_t log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS); | ||
1767 | struct page *page; | ||
1768 | struct r5l_payload_data_parity *payload; | ||
1769 | |||
1770 | page = alloc_page(GFP_KERNEL); | ||
1771 | if (!page) | ||
1772 | return -ENOMEM; | ||
1773 | |||
1774 | while (mb_offset < le32_to_cpu(mb->meta_size)) { | ||
1775 | payload = (void *)mb + mb_offset; | ||
1776 | |||
1777 | if (payload->header.type == R5LOG_PAYLOAD_DATA) { | ||
1778 | if (r5l_recovery_verify_data_checksum( | ||
1779 | log, page, log_offset, | ||
1780 | payload->checksum[0]) < 0) | ||
1781 | goto mismatch; | ||
1782 | } else if (payload->header.type == R5LOG_PAYLOAD_PARITY) { | ||
1783 | if (r5l_recovery_verify_data_checksum( | ||
1784 | log, page, log_offset, | ||
1785 | payload->checksum[0]) < 0) | ||
1786 | goto mismatch; | ||
1787 | if (conf->max_degraded == 2 && /* q for RAID 6 */ | ||
1788 | r5l_recovery_verify_data_checksum( | ||
1789 | log, page, | ||
1790 | r5l_ring_add(log, log_offset, | ||
1791 | BLOCK_SECTORS), | ||
1792 | payload->checksum[1]) < 0) | ||
1793 | goto mismatch; | ||
1794 | } else /* not R5LOG_PAYLOAD_DATA or R5LOG_PAYLOAD_PARITY */ | ||
1795 | goto mismatch; | ||
1796 | |||
1797 | log_offset = r5l_ring_add(log, log_offset, | ||
1798 | le32_to_cpu(payload->size)); | ||
1799 | |||
1800 | mb_offset += sizeof(struct r5l_payload_data_parity) + | ||
1801 | sizeof(__le32) * | ||
1802 | (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9)); | ||
1803 | } | ||
1804 | |||
1805 | put_page(page); | ||
986 | return 0; | 1806 | return 0; |
987 | 1807 | ||
988 | error: | 1808 | mismatch: |
989 | for (disk_index = 0; disk_index < sh->disks; disk_index++) | 1809 | put_page(page); |
990 | sh->dev[disk_index].flags = 0; | ||
991 | raid5_release_stripe(sh); | ||
992 | return -EINVAL; | 1810 | return -EINVAL; |
993 | } | 1811 | } |
994 | 1812 | ||
995 | static int r5l_recovery_flush_one_meta(struct r5l_log *log, | 1813 | /* |
996 | struct r5l_recovery_ctx *ctx) | 1814 | * Analyze all data/parity pages in one meta block |
1815 | * Returns: | ||
1816 | * 0 for success | ||
1817 | * -EINVAL for unknown payload type | ||
1818 | * -EAGAIN for checksum mismatch of data page | ||
1819 | * -ENOMEM for running out of memory (alloc_page failed or out of stripes) | ||
1820 | */ | ||
1821 | static int | ||
1822 | r5c_recovery_analyze_meta_block(struct r5l_log *log, | ||
1823 | struct r5l_recovery_ctx *ctx, | ||
1824 | struct list_head *cached_stripe_list) | ||
997 | { | 1825 | { |
998 | struct r5conf *conf = log->rdev->mddev->private; | 1826 | struct mddev *mddev = log->rdev->mddev; |
999 | struct r5l_payload_data_parity *payload; | 1827 | struct r5conf *conf = mddev->private; |
1000 | struct r5l_meta_block *mb; | 1828 | struct r5l_meta_block *mb; |
1001 | int offset; | 1829 | struct r5l_payload_data_parity *payload; |
1830 | int mb_offset; | ||
1002 | sector_t log_offset; | 1831 | sector_t log_offset; |
1003 | sector_t stripe_sector; | 1832 | sector_t stripe_sect; |
1833 | struct stripe_head *sh; | ||
1834 | int ret; | ||
1835 | |||
1836 | /* | ||
1837 | * for mismatch in data blocks, we will drop all data in this mb, but | ||
1838 | * we will still read next mb for other data with FLUSH flag, as | ||
1839 | * io_unit could finish out of order. | ||
1840 | */ | ||
1841 | ret = r5l_recovery_verify_data_checksum_for_mb(log, ctx); | ||
1842 | if (ret == -EINVAL) | ||
1843 | return -EAGAIN; | ||
1844 | else if (ret) | ||
1845 | return ret; /* -ENOMEM due to alloc_page() failure */ | ||
1004 | 1846 | ||
1005 | mb = page_address(ctx->meta_page); | 1847 | mb = page_address(ctx->meta_page); |
1006 | offset = sizeof(struct r5l_meta_block); | 1848 | mb_offset = sizeof(struct r5l_meta_block); |
1007 | log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS); | 1849 | log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS); |
1008 | 1850 | ||
1009 | while (offset < le32_to_cpu(mb->meta_size)) { | 1851 | while (mb_offset < le32_to_cpu(mb->meta_size)) { |
1010 | int dd; | 1852 | int dd; |
1011 | 1853 | ||
1012 | payload = (void *)mb + offset; | 1854 | payload = (void *)mb + mb_offset; |
1013 | stripe_sector = raid5_compute_sector(conf, | 1855 | stripe_sect = (payload->header.type == R5LOG_PAYLOAD_DATA) ? |
1014 | le64_to_cpu(payload->location), 0, &dd, NULL); | 1856 | raid5_compute_sector( |
1015 | if (r5l_recovery_flush_one_stripe(log, ctx, stripe_sector, | 1857 | conf, le64_to_cpu(payload->location), 0, &dd, |
1016 | &offset, &log_offset)) | 1858 | NULL) |
1859 | : le64_to_cpu(payload->location); | ||
1860 | |||
1861 | sh = r5c_recovery_lookup_stripe(cached_stripe_list, | ||
1862 | stripe_sect); | ||
1863 | |||
1864 | if (!sh) { | ||
1865 | sh = r5c_recovery_alloc_stripe(conf, stripe_sect, ctx->pos); | ||
1866 | /* | ||
1867 | * cannot get stripe from raid5_get_active_stripe | ||
1868 | * try replay some stripes | ||
1869 | */ | ||
1870 | if (!sh) { | ||
1871 | r5c_recovery_replay_stripes( | ||
1872 | cached_stripe_list, ctx); | ||
1873 | sh = r5c_recovery_alloc_stripe( | ||
1874 | conf, stripe_sect, ctx->pos); | ||
1875 | } | ||
1876 | if (!sh) { | ||
1877 | pr_debug("md/raid:%s: Increasing stripe cache size to %d to recover data on journal.\n", | ||
1878 | mdname(mddev), | ||
1879 | conf->min_nr_stripes * 2); | ||
1880 | raid5_set_cache_size(mddev, | ||
1881 | conf->min_nr_stripes * 2); | ||
1882 | sh = r5c_recovery_alloc_stripe( | ||
1883 | conf, stripe_sect, ctx->pos); | ||
1884 | } | ||
1885 | if (!sh) { | ||
1886 | pr_err("md/raid:%s: Cannot get enough stripes due to memory pressure. Recovery failed.\n", | ||
1887 | mdname(mddev)); | ||
1888 | return -ENOMEM; | ||
1889 | } | ||
1890 | list_add_tail(&sh->lru, cached_stripe_list); | ||
1891 | } | ||
1892 | |||
1893 | if (payload->header.type == R5LOG_PAYLOAD_DATA) { | ||
1894 | if (!test_bit(STRIPE_R5C_CACHING, &sh->state) && | ||
1895 | test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags)) { | ||
1896 | r5l_recovery_replay_one_stripe(conf, sh, ctx); | ||
1897 | sh->log_start = ctx->pos; | ||
1898 | list_move_tail(&sh->lru, cached_stripe_list); | ||
1899 | } | ||
1900 | r5l_recovery_load_data(log, sh, ctx, payload, | ||
1901 | log_offset); | ||
1902 | } else if (payload->header.type == R5LOG_PAYLOAD_PARITY) | ||
1903 | r5l_recovery_load_parity(log, sh, ctx, payload, | ||
1904 | log_offset); | ||
1905 | else | ||
1017 | return -EINVAL; | 1906 | return -EINVAL; |
1907 | |||
1908 | log_offset = r5l_ring_add(log, log_offset, | ||
1909 | le32_to_cpu(payload->size)); | ||
1910 | |||
1911 | mb_offset += sizeof(struct r5l_payload_data_parity) + | ||
1912 | sizeof(__le32) * | ||
1913 | (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9)); | ||
1018 | } | 1914 | } |
1915 | |||
1019 | return 0; | 1916 | return 0; |
1020 | } | 1917 | } |
1021 | 1918 | ||
1022 | /* copy data/parity from log to raid disks */ | 1919 | /* |
1023 | static void r5l_recovery_flush_log(struct r5l_log *log, | 1920 | * Load the stripe into cache. The stripe will be written out later by |
1024 | struct r5l_recovery_ctx *ctx) | 1921 | * the stripe cache state machine. |
1922 | */ | ||
1923 | static void r5c_recovery_load_one_stripe(struct r5l_log *log, | ||
1924 | struct stripe_head *sh) | ||
1025 | { | 1925 | { |
1926 | struct r5dev *dev; | ||
1927 | int i; | ||
1928 | |||
1929 | for (i = sh->disks; i--; ) { | ||
1930 | dev = sh->dev + i; | ||
1931 | if (test_and_clear_bit(R5_Wantwrite, &dev->flags)) { | ||
1932 | set_bit(R5_InJournal, &dev->flags); | ||
1933 | set_bit(R5_UPTODATE, &dev->flags); | ||
1934 | } | ||
1935 | } | ||
1936 | list_add_tail(&sh->r5c, &log->stripe_in_journal_list); | ||
1937 | atomic_inc(&log->stripe_in_journal_count); | ||
1938 | } | ||
1939 | |||
1940 | /* | ||
1941 | * Scan through the log for all to-be-flushed data | ||
1942 | * | ||
1943 | * For stripes with data and parity, namely Data-Parity stripe | ||
1944 | * (STRIPE_R5C_CACHING == 0), we simply replay all the writes. | ||
1945 | * | ||
1946 | * For stripes with only data, namely Data-Only stripe | ||
1947 | * (STRIPE_R5C_CACHING == 1), we load them to stripe cache state machine. | ||
1948 | * | ||
1949 | * For a stripe, if we see data after parity, we should discard all previous | ||
1950 | * data and parity for this stripe, as these data are already flushed to | ||
1951 | * the array. | ||
1952 | * | ||
1953 | * At the end of the scan, we return the new journal_tail, which points to | ||
1954 | * first data-only stripe on the journal device, or next invalid meta block. | ||
1955 | */ | ||
1956 | static int r5c_recovery_flush_log(struct r5l_log *log, | ||
1957 | struct r5l_recovery_ctx *ctx) | ||
1958 | { | ||
1959 | struct stripe_head *sh; | ||
1960 | int ret = 0; | ||
1961 | |||
1962 | /* scan through the log */ | ||
1026 | while (1) { | 1963 | while (1) { |
1027 | if (r5l_read_meta_block(log, ctx)) | 1964 | if (r5l_recovery_read_meta_block(log, ctx)) |
1028 | return; | 1965 | break; |
1029 | if (r5l_recovery_flush_one_meta(log, ctx)) | 1966 | |
1030 | return; | 1967 | ret = r5c_recovery_analyze_meta_block(log, ctx, |
1968 | &ctx->cached_list); | ||
1969 | /* | ||
1970 | * -EAGAIN means mismatch in data block, in this case, we still | ||
1971 | * try scan the next metablock | ||
1972 | */ | ||
1973 | if (ret && ret != -EAGAIN) | ||
1974 | break; /* ret == -EINVAL or -ENOMEM */ | ||
1031 | ctx->seq++; | 1975 | ctx->seq++; |
1032 | ctx->pos = r5l_ring_add(log, ctx->pos, ctx->meta_total_blocks); | 1976 | ctx->pos = r5l_ring_add(log, ctx->pos, ctx->meta_total_blocks); |
1033 | } | 1977 | } |
1978 | |||
1979 | if (ret == -ENOMEM) { | ||
1980 | r5c_recovery_drop_stripes(&ctx->cached_list, ctx); | ||
1981 | return ret; | ||
1982 | } | ||
1983 | |||
1984 | /* replay data-parity stripes */ | ||
1985 | r5c_recovery_replay_stripes(&ctx->cached_list, ctx); | ||
1986 | |||
1987 | /* load data-only stripes to stripe cache */ | ||
1988 | list_for_each_entry(sh, &ctx->cached_list, lru) { | ||
1989 | WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state)); | ||
1990 | r5c_recovery_load_one_stripe(log, sh); | ||
1991 | ctx->data_only_stripes++; | ||
1992 | } | ||
1993 | |||
1994 | return 0; | ||
1034 | } | 1995 | } |
1035 | 1996 | ||
1036 | static int r5l_log_write_empty_meta_block(struct r5l_log *log, sector_t pos, | 1997 | /* |
1037 | u64 seq) | 1998 | * we did a recovery. Now ctx.pos points to an invalid meta block. New |
1999 | * log will start here, but we can't let the superblock point to the last | ||
2000 | * valid meta block. The log might look like: | ||
2001 | * | meta 1| meta 2| meta 3| | ||
2002 | * meta 1 is valid, meta 2 is invalid. meta 3 could be valid. If the | ||
2003 | * superblock points to meta 1, we write a new valid meta 2n. If a crash | ||
2004 | * happens again, the new recovery will start from meta 1. Since meta 2n is | ||
2005 | * valid now, recovery will think meta 3 is valid, which is wrong. | ||
2006 | * The solution is to create a new meta in meta2 with its seq == meta | ||
2007 | * 1's seq + 10000 and let the superblock point to meta2. The same recovery | ||
2008 | * will not think meta 3 is a valid meta, because its seq doesn't match. | ||
2009 | */ | ||
2010 | |||
2011 | /* | ||
2012 | * Before recovery, the log looks like the following | ||
2013 | * | ||
2014 | * --------------------------------------------- | ||
2015 | * | valid log | invalid log | | ||
2016 | * --------------------------------------------- | ||
2017 | * ^ | ||
2018 | * |- log->last_checkpoint | ||
2019 | * |- log->last_cp_seq | ||
2020 | * | ||
2021 | * Now we scan through the log until we see invalid entry | ||
2022 | * | ||
2023 | * --------------------------------------------- | ||
2024 | * | valid log | invalid log | | ||
2025 | * --------------------------------------------- | ||
2026 | * ^ ^ | ||
2027 | * |- log->last_checkpoint |- ctx->pos | ||
2028 | * |- log->last_cp_seq |- ctx->seq | ||
2029 | * | ||
2030 | * From this point, we need to increase the seq number by 10000 to avoid | ||
2031 | * confusing the next recovery. | ||
2032 | * | ||
2033 | * --------------------------------------------- | ||
2034 | * | valid log | invalid log | | ||
2035 | * --------------------------------------------- | ||
2036 | * ^ ^ | ||
2037 | * |- log->last_checkpoint |- ctx->pos+1 | ||
2038 | * |- log->last_cp_seq |- ctx->seq+10001 | ||
2039 | * | ||
2040 | * However, it is not safe to start the state machine yet, because the data in | ||
2041 | * data-only stripes is not yet secured in RAID. To save these data-only | ||
2042 | * stripes, we rewrite them from seq+11. | ||
2043 | * | ||
2044 | * ----------------------------------------------------------------- | ||
2045 | * | valid log | data only stripes | invalid log | | ||
2046 | * ----------------------------------------------------------------- | ||
2047 | * ^ ^ | ||
2048 | * |- log->last_checkpoint |- ctx->pos+n | ||
2049 | * |- log->last_cp_seq |- ctx->seq+10000+n | ||
2050 | * | ||
2051 | * If failure happens again during this process, the recovery can safely start | ||
2052 | * again from log->last_checkpoint. | ||
2053 | * | ||
2054 | * Once the data-only stripes are rewritten to the journal, we move the log_tail | ||
2055 | * | ||
2056 | * ----------------------------------------------------------------- | ||
2057 | * | old log | data only stripes | invalid log | | ||
2058 | * ----------------------------------------------------------------- | ||
2059 | * ^ ^ | ||
2060 | * |- log->last_checkpoint |- ctx->pos+n | ||
2061 | * |- log->last_cp_seq |- ctx->seq+10000+n | ||
2062 | * | ||
2063 | * Then we can safely start the state machine. If failure happens from this | ||
2064 | * point on, the recovery will start from the new log->last_checkpoint. | ||
2065 | */ | ||
2066 | static int | ||
2067 | r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log, | ||
2068 | struct r5l_recovery_ctx *ctx) | ||
1038 | { | 2069 | { |
2070 | struct stripe_head *sh, *next; | ||
2071 | struct mddev *mddev = log->rdev->mddev; | ||
1039 | struct page *page; | 2072 | struct page *page; |
1040 | struct r5l_meta_block *mb; | ||
1041 | u32 crc; | ||
1042 | 2073 | ||
1043 | page = alloc_page(GFP_KERNEL | __GFP_ZERO); | 2074 | page = alloc_page(GFP_KERNEL); |
1044 | if (!page) | 2075 | if (!page) { |
2076 | pr_err("md/raid:%s: cannot allocate memory to rewrite data only stripes\n", | ||
2077 | mdname(mddev)); | ||
1045 | return -ENOMEM; | 2078 | return -ENOMEM; |
1046 | mb = page_address(page); | 2079 | } |
1047 | mb->magic = cpu_to_le32(R5LOG_MAGIC); | ||
1048 | mb->version = R5LOG_VERSION; | ||
1049 | mb->meta_size = cpu_to_le32(sizeof(struct r5l_meta_block)); | ||
1050 | mb->seq = cpu_to_le64(seq); | ||
1051 | mb->position = cpu_to_le64(pos); | ||
1052 | crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE); | ||
1053 | mb->checksum = cpu_to_le32(crc); | ||
1054 | 2080 | ||
1055 | if (!sync_page_io(log->rdev, pos, PAGE_SIZE, page, REQ_OP_WRITE, | 2081 | list_for_each_entry_safe(sh, next, &ctx->cached_list, lru) { |
1056 | REQ_FUA, false)) { | 2082 | struct r5l_meta_block *mb; |
1057 | __free_page(page); | 2083 | int i; |
1058 | return -EIO; | 2084 | int offset; |
2085 | sector_t write_pos; | ||
2086 | |||
2087 | WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state)); | ||
2088 | r5l_recovery_create_empty_meta_block(log, page, | ||
2089 | ctx->pos, ctx->seq); | ||
2090 | mb = page_address(page); | ||
2091 | offset = le32_to_cpu(mb->meta_size); | ||
2092 | write_pos = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS); | ||
2093 | |||
2094 | for (i = sh->disks; i--; ) { | ||
2095 | struct r5dev *dev = &sh->dev[i]; | ||
2096 | struct r5l_payload_data_parity *payload; | ||
2097 | void *addr; | ||
2098 | |||
2099 | if (test_bit(R5_InJournal, &dev->flags)) { | ||
2100 | payload = (void *)mb + offset; | ||
2101 | payload->header.type = cpu_to_le16( | ||
2102 | R5LOG_PAYLOAD_DATA); | ||
2103 | payload->size = BLOCK_SECTORS; | ||
2104 | payload->location = cpu_to_le64( | ||
2105 | raid5_compute_blocknr(sh, i, 0)); | ||
2106 | addr = kmap_atomic(dev->page); | ||
2107 | payload->checksum[0] = cpu_to_le32( | ||
2108 | crc32c_le(log->uuid_checksum, addr, | ||
2109 | PAGE_SIZE)); | ||
2110 | kunmap_atomic(addr); | ||
2111 | sync_page_io(log->rdev, write_pos, PAGE_SIZE, | ||
2112 | dev->page, REQ_OP_WRITE, 0, false); | ||
2113 | write_pos = r5l_ring_add(log, write_pos, | ||
2114 | BLOCK_SECTORS); | ||
2115 | offset += sizeof(__le32) + | ||
2116 | sizeof(struct r5l_payload_data_parity); | ||
2117 | |||
2118 | } | ||
2119 | } | ||
2120 | mb->meta_size = cpu_to_le32(offset); | ||
2121 | mb->checksum = cpu_to_le32(crc32c_le(log->uuid_checksum, | ||
2122 | mb, PAGE_SIZE)); | ||
2123 | sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page, | ||
2124 | REQ_OP_WRITE, REQ_FUA, false); | ||
2125 | sh->log_start = ctx->pos; | ||
2126 | ctx->pos = write_pos; | ||
2127 | ctx->seq += 1; | ||
2128 | |||
2129 | list_del_init(&sh->lru); | ||
2130 | raid5_release_stripe(sh); | ||
1059 | } | 2131 | } |
1060 | __free_page(page); | 2132 | __free_page(page); |
1061 | return 0; | 2133 | return 0; |
@@ -1063,45 +2135,60 @@ static int r5l_log_write_empty_meta_block(struct r5l_log *log, sector_t pos, | |||
1063 | 2135 | ||
1064 | static int r5l_recovery_log(struct r5l_log *log) | 2136 | static int r5l_recovery_log(struct r5l_log *log) |
1065 | { | 2137 | { |
2138 | struct mddev *mddev = log->rdev->mddev; | ||
1066 | struct r5l_recovery_ctx ctx; | 2139 | struct r5l_recovery_ctx ctx; |
2140 | int ret; | ||
2141 | sector_t pos; | ||
2142 | struct stripe_head *sh; | ||
1067 | 2143 | ||
1068 | ctx.pos = log->last_checkpoint; | 2144 | ctx.pos = log->last_checkpoint; |
1069 | ctx.seq = log->last_cp_seq; | 2145 | ctx.seq = log->last_cp_seq; |
1070 | ctx.meta_page = alloc_page(GFP_KERNEL); | 2146 | ctx.meta_page = alloc_page(GFP_KERNEL); |
2147 | ctx.data_only_stripes = 0; | ||
2148 | ctx.data_parity_stripes = 0; | ||
2149 | INIT_LIST_HEAD(&ctx.cached_list); | ||
2150 | |||
1071 | if (!ctx.meta_page) | 2151 | if (!ctx.meta_page) |
1072 | return -ENOMEM; | 2152 | return -ENOMEM; |
1073 | 2153 | ||
1074 | r5l_recovery_flush_log(log, &ctx); | 2154 | ret = r5c_recovery_flush_log(log, &ctx); |
1075 | __free_page(ctx.meta_page); | 2155 | __free_page(ctx.meta_page); |
1076 | 2156 | ||
1077 | /* | 2157 | if (ret) |
1078 | * we did a recovery. Now ctx.pos points to an invalid meta block. New | 2158 | return ret; |
1079 | * log will start here. but we can't let superblock point to last valid | 2159 | |
1080 | * meta block. The log might looks like: | 2160 | pos = ctx.pos; |
1081 | * | meta 1| meta 2| meta 3| | 2161 | ctx.seq += 10000; |
1082 | * meta 1 is valid, meta 2 is invalid. meta 3 could be valid. If | 2162 | |
1083 | * superblock points to meta 1, we write a new valid meta 2n. if crash | 2163 | if (ctx.data_only_stripes == 0) { |
1084 | * happens again, new recovery will start from meta 1. Since meta 2n is | ||
1085 | * valid now, recovery will think meta 3 is valid, which is wrong. | ||
1086 | * The solution is we create a new meta in meta2 with its seq == meta | ||
1087 | * 1's seq + 10 and let superblock points to meta2. The same recovery will | ||
1088 | * not think meta 3 is a valid meta, because its seq doesn't match | ||
1089 | */ | ||
1090 | if (ctx.seq > log->last_cp_seq) { | ||
1091 | int ret; | ||
1092 | |||
1093 | ret = r5l_log_write_empty_meta_block(log, ctx.pos, ctx.seq + 10); | ||
1094 | if (ret) | ||
1095 | return ret; | ||
1096 | log->seq = ctx.seq + 11; | ||
1097 | log->log_start = r5l_ring_add(log, ctx.pos, BLOCK_SECTORS); | ||
1098 | r5l_write_super(log, ctx.pos); | ||
1099 | log->last_checkpoint = ctx.pos; | ||
1100 | log->next_checkpoint = ctx.pos; | 2164 | log->next_checkpoint = ctx.pos; |
2165 | r5l_log_write_empty_meta_block(log, ctx.pos, ctx.seq++); | ||
2166 | ctx.pos = r5l_ring_add(log, ctx.pos, BLOCK_SECTORS); | ||
1101 | } else { | 2167 | } else { |
1102 | log->log_start = ctx.pos; | 2168 | sh = list_last_entry(&ctx.cached_list, struct stripe_head, lru); |
1103 | log->seq = ctx.seq; | 2169 | log->next_checkpoint = sh->log_start; |
1104 | } | 2170 | } |
2171 | |||
2172 | if ((ctx.data_only_stripes == 0) && (ctx.data_parity_stripes == 0)) | ||
2173 | pr_debug("md/raid:%s: starting from clean shutdown\n", | ||
2174 | mdname(mddev)); | ||
2175 | else { | ||
2176 | pr_debug("md/raid:%s: recovering %d data-only stripes and %d data-parity stripes\n", | ||
2177 | mdname(mddev), ctx.data_only_stripes, | ||
2178 | ctx.data_parity_stripes); | ||
2179 | |||
2180 | if (ctx.data_only_stripes > 0) | ||
2181 | if (r5c_recovery_rewrite_data_only_stripes(log, &ctx)) { | ||
2182 | pr_err("md/raid:%s: failed to rewrite stripes to journal\n", | ||
2183 | mdname(mddev)); | ||
2184 | return -EIO; | ||
2185 | } | ||
2186 | } | ||
2187 | |||
2188 | log->log_start = ctx.pos; | ||
2189 | log->seq = ctx.seq; | ||
2190 | log->last_checkpoint = pos; | ||
2191 | r5l_write_super(log, pos); | ||
1105 | return 0; | 2192 | return 0; |
1106 | } | 2193 | } |
1107 | 2194 | ||
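A concrete walk-through of the sequence bump performed in r5l_recovery_log() above (editorial, with made-up numbers): suppose the scan stops with ctx.seq == 105 at the first invalid meta block. After ctx.seq += 10000, the meta block written at ctx.pos (the empty meta block, or the first rewritten data-only stripe) carries seq 10105. If the machine crashes again before the new log grows past that point, a stale meta block further ahead that still carries seq 106 can no longer be mistaken for a continuation of the new log, because its seq does not follow 10105.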
@@ -1110,7 +2197,293 @@ static void r5l_write_super(struct r5l_log *log, sector_t cp) | |||
1110 | struct mddev *mddev = log->rdev->mddev; | 2197 | struct mddev *mddev = log->rdev->mddev; |
1111 | 2198 | ||
1112 | log->rdev->journal_tail = cp; | 2199 | log->rdev->journal_tail = cp; |
1113 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | 2200 | set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); |
2201 | } | ||
2202 | |||
2203 | static ssize_t r5c_journal_mode_show(struct mddev *mddev, char *page) | ||
2204 | { | ||
2205 | struct r5conf *conf = mddev->private; | ||
2206 | int ret; | ||
2207 | |||
2208 | if (!conf->log) | ||
2209 | return 0; | ||
2210 | |||
2211 | switch (conf->log->r5c_journal_mode) { | ||
2212 | case R5C_JOURNAL_MODE_WRITE_THROUGH: | ||
2213 | ret = snprintf( | ||
2214 | page, PAGE_SIZE, "[%s] %s\n", | ||
2215 | r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_THROUGH], | ||
2216 | r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_BACK]); | ||
2217 | break; | ||
2218 | case R5C_JOURNAL_MODE_WRITE_BACK: | ||
2219 | ret = snprintf( | ||
2220 | page, PAGE_SIZE, "%s [%s]\n", | ||
2221 | r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_THROUGH], | ||
2222 | r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_BACK]); | ||
2223 | break; | ||
2224 | default: | ||
2225 | ret = 0; | ||
2226 | } | ||
2227 | return ret; | ||
2228 | } | ||
2229 | |||
2230 | static ssize_t r5c_journal_mode_store(struct mddev *mddev, | ||
2231 | const char *page, size_t length) | ||
2232 | { | ||
2233 | struct r5conf *conf = mddev->private; | ||
2234 | struct r5l_log *log = conf->log; | ||
2235 | int val = -1, i; | ||
2236 | int len = length; | ||
2237 | |||
2238 | if (!log) | ||
2239 | return -ENODEV; | ||
2240 | |||
2241 | if (len && page[len - 1] == '\n') | ||
2242 | len -= 1; | ||
2243 | for (i = 0; i < ARRAY_SIZE(r5c_journal_mode_str); i++) | ||
2244 | if (strlen(r5c_journal_mode_str[i]) == len && | ||
2245 | strncmp(page, r5c_journal_mode_str[i], len) == 0) { | ||
2246 | val = i; | ||
2247 | break; | ||
2248 | } | ||
2249 | if (val < R5C_JOURNAL_MODE_WRITE_THROUGH || | ||
2250 | val > R5C_JOURNAL_MODE_WRITE_BACK) | ||
2251 | return -EINVAL; | ||
2252 | |||
2253 | mddev_suspend(mddev); | ||
2254 | conf->log->r5c_journal_mode = val; | ||
2255 | mddev_resume(mddev); | ||
2256 | |||
2257 | pr_debug("md/raid:%s: setting r5c cache mode to %d: %s\n", | ||
2258 | mdname(mddev), val, r5c_journal_mode_str[val]); | ||
2259 | return length; | ||
2260 | } | ||
2261 | |||
2262 | struct md_sysfs_entry | ||
2263 | r5c_journal_mode = __ATTR(journal_mode, 0644, | ||
2264 | r5c_journal_mode_show, r5c_journal_mode_store); | ||
2265 | |||
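A hypothetical userspace usage example for the journal_mode attribute defined above (editorial sketch, not part of the patch): the attribute name comes from the __ATTR() line, but the device name md0, the exact sysfs location, and the accepted strings "write-through"/"write-back" (taken from r5c_journal_mode_str[], which is defined earlier in this file and not shown in this hunk) are assumptions.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/* Assumed path for an array named md0; adjust for your device. */
	const char *path = "/sys/block/md0/md/journal_mode";
	const char *mode = "write-back";
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, mode, strlen(mode)) < 0)
		perror("write");
	close(fd);
	return 0;
}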
2266 | /* | ||
2267 | * Try to handle the write operation in the caching phase. This function should only | ||
2268 | * be called in write-back mode. | ||
2269 | * | ||
2270 | * If all outstanding writes can be handled in caching phase, returns 0 | ||
2271 | * If the writes require the write-out phase, call r5c_make_stripe_write_out() | ||
2272 | * and return -EAGAIN | ||
2273 | */ | ||
2274 | int r5c_try_caching_write(struct r5conf *conf, | ||
2275 | struct stripe_head *sh, | ||
2276 | struct stripe_head_state *s, | ||
2277 | int disks) | ||
2278 | { | ||
2279 | struct r5l_log *log = conf->log; | ||
2280 | int i; | ||
2281 | struct r5dev *dev; | ||
2282 | int to_cache = 0; | ||
2283 | |||
2284 | BUG_ON(!r5c_is_writeback(log)); | ||
2285 | |||
2286 | if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) { | ||
2287 | /* | ||
2288 | * There are two different scenarios here: | ||
2289 | * 1. The stripe has some data cached, and it is sent to | ||
2290 | * write-out phase for reclaim | ||
2291 | * 2. The stripe is clean, and this is the first write | ||
2292 | * | ||
2293 | * For 1, return -EAGAIN, so we continue with | ||
2294 | * handle_stripe_dirtying(). | ||
2295 | * | ||
2296 | * For 2, set STRIPE_R5C_CACHING and continue with caching | ||
2297 | * write. | ||
2298 | */ | ||
2299 | |||
2300 | /* case 1: anything in journal or anything written */ | ||
2301 | if (s->injournal > 0 || s->written > 0) | ||
2302 | return -EAGAIN; | ||
2303 | /* case 2 */ | ||
2304 | set_bit(STRIPE_R5C_CACHING, &sh->state); | ||
2305 | } | ||
2306 | |||
2307 | for (i = disks; i--; ) { | ||
2308 | dev = &sh->dev[i]; | ||
2309 | /* if non-overwrite, use writing-out phase */ | ||
2310 | if (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags) && | ||
2311 | !test_bit(R5_InJournal, &dev->flags)) { | ||
2312 | r5c_make_stripe_write_out(sh); | ||
2313 | return -EAGAIN; | ||
2314 | } | ||
2315 | } | ||
2316 | |||
2317 | for (i = disks; i--; ) { | ||
2318 | dev = &sh->dev[i]; | ||
2319 | if (dev->towrite) { | ||
2320 | set_bit(R5_Wantwrite, &dev->flags); | ||
2321 | set_bit(R5_Wantdrain, &dev->flags); | ||
2322 | set_bit(R5_LOCKED, &dev->flags); | ||
2323 | to_cache++; | ||
2324 | } | ||
2325 | } | ||
2326 | |||
2327 | if (to_cache) { | ||
2328 | set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); | ||
2329 | /* | ||
2330 | * set STRIPE_LOG_TRAPPED, which triggers r5c_cache_data() | ||
2331 | * in ops_run_io(). STRIPE_LOG_TRAPPED will be cleared in | ||
2332 | * r5c_handle_data_cached() | ||
2333 | */ | ||
2334 | set_bit(STRIPE_LOG_TRAPPED, &sh->state); | ||
2335 | } | ||
2336 | |||
2337 | return 0; | ||
2338 | } | ||
2339 | |||
2340 | /* | ||
2341 | * free extra pages (orig_page) we allocated for prexor | ||
2342 | */ | ||
2343 | void r5c_release_extra_page(struct stripe_head *sh) | ||
2344 | { | ||
2345 | struct r5conf *conf = sh->raid_conf; | ||
2346 | int i; | ||
2347 | bool using_disk_info_extra_page; | ||
2348 | |||
2349 | using_disk_info_extra_page = | ||
2350 | sh->dev[0].orig_page == conf->disks[0].extra_page; | ||
2351 | |||
2352 | for (i = sh->disks; i--; ) | ||
2353 | if (sh->dev[i].page != sh->dev[i].orig_page) { | ||
2354 | struct page *p = sh->dev[i].orig_page; | ||
2355 | |||
2356 | sh->dev[i].orig_page = sh->dev[i].page; | ||
2357 | if (!using_disk_info_extra_page) | ||
2358 | put_page(p); | ||
2359 | } | ||
2360 | |||
2361 | if (using_disk_info_extra_page) { | ||
2362 | clear_bit(R5C_EXTRA_PAGE_IN_USE, &conf->cache_state); | ||
2363 | md_wakeup_thread(conf->mddev->thread); | ||
2364 | } | ||
2365 | } | ||
2366 | |||
2367 | void r5c_use_extra_page(struct stripe_head *sh) | ||
2368 | { | ||
2369 | struct r5conf *conf = sh->raid_conf; | ||
2370 | int i; | ||
2371 | struct r5dev *dev; | ||
2372 | |||
2373 | for (i = sh->disks; i--; ) { | ||
2374 | dev = &sh->dev[i]; | ||
2375 | if (dev->orig_page != dev->page) | ||
2376 | put_page(dev->orig_page); | ||
2377 | dev->orig_page = conf->disks[i].extra_page; | ||
2378 | } | ||
2379 | } | ||
2380 | |||
2381 | /* | ||
2382 | * clean up the stripe (clear R5_InJournal for dev[pd_idx] etc.) after the | ||
2383 | * stripe is committed to RAID disks. | ||
2384 | */ | ||
2385 | void r5c_finish_stripe_write_out(struct r5conf *conf, | ||
2386 | struct stripe_head *sh, | ||
2387 | struct stripe_head_state *s) | ||
2388 | { | ||
2389 | int i; | ||
2390 | int do_wakeup = 0; | ||
2391 | |||
2392 | if (!conf->log || | ||
2393 | !test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags)) | ||
2394 | return; | ||
2395 | |||
2396 | WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state)); | ||
2397 | clear_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags); | ||
2398 | |||
2399 | if (conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) | ||
2400 | return; | ||
2401 | |||
2402 | for (i = sh->disks; i--; ) { | ||
2403 | clear_bit(R5_InJournal, &sh->dev[i].flags); | ||
2404 | if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) | ||
2405 | do_wakeup = 1; | ||
2406 | } | ||
2407 | |||
2408 | /* | ||
2409 | * analyse_stripe() runs before r5c_finish_stripe_write_out(). | ||
2410 | * We updated R5_InJournal, so we also update s->injournal. | ||
2411 | */ | ||
2412 | s->injournal = 0; | ||
2413 | |||
2414 | if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) | ||
2415 | if (atomic_dec_and_test(&conf->pending_full_writes)) | ||
2416 | md_wakeup_thread(conf->mddev->thread); | ||
2417 | |||
2418 | if (do_wakeup) | ||
2419 | wake_up(&conf->wait_for_overlap); | ||
2420 | |||
2421 | if (conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) | ||
2422 | return; | ||
2423 | |||
2424 | spin_lock_irq(&conf->log->stripe_in_journal_lock); | ||
2425 | list_del_init(&sh->r5c); | ||
2426 | spin_unlock_irq(&conf->log->stripe_in_journal_lock); | ||
2427 | sh->log_start = MaxSector; | ||
2428 | atomic_dec(&conf->log->stripe_in_journal_count); | ||
2429 | r5c_update_log_state(conf->log); | ||
2430 | } | ||
2431 | |||
2432 | int | ||
2433 | r5c_cache_data(struct r5l_log *log, struct stripe_head *sh, | ||
2434 | struct stripe_head_state *s) | ||
2435 | { | ||
2436 | struct r5conf *conf = sh->raid_conf; | ||
2437 | int pages = 0; | ||
2438 | int reserve; | ||
2439 | int i; | ||
2440 | int ret = 0; | ||
2441 | |||
2442 | BUG_ON(!log); | ||
2443 | |||
2444 | for (i = 0; i < sh->disks; i++) { | ||
2445 | void *addr; | ||
2446 | |||
2447 | if (!test_bit(R5_Wantwrite, &sh->dev[i].flags)) | ||
2448 | continue; | ||
2449 | addr = kmap_atomic(sh->dev[i].page); | ||
2450 | sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum, | ||
2451 | addr, PAGE_SIZE); | ||
2452 | kunmap_atomic(addr); | ||
2453 | pages++; | ||
2454 | } | ||
2455 | WARN_ON(pages == 0); | ||
2456 | |||
2457 | /* | ||
2458 | * The stripe must enter state machine again to call endio, so | ||
2459 | * don't delay. | ||
2460 | */ | ||
2461 | clear_bit(STRIPE_DELAYED, &sh->state); | ||
2462 | atomic_inc(&sh->count); | ||
2463 | |||
2464 | mutex_lock(&log->io_mutex); | ||
2465 | /* meta + data */ | ||
2466 | reserve = (1 + pages) << (PAGE_SHIFT - 9); | ||
2467 | |||
2468 | if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) && | ||
2469 | sh->log_start == MaxSector) | ||
2470 | r5l_add_no_space_stripe(log, sh); | ||
2471 | else if (!r5l_has_free_space(log, reserve)) { | ||
2472 | if (sh->log_start == log->last_checkpoint) | ||
2473 | BUG(); | ||
2474 | else | ||
2475 | r5l_add_no_space_stripe(log, sh); | ||
2476 | } else { | ||
2477 | ret = r5l_log_stripe(log, sh, pages, 0); | ||
2478 | if (ret) { | ||
2479 | spin_lock_irq(&log->io_list_lock); | ||
2480 | list_add_tail(&sh->log_list, &log->no_mem_stripes); | ||
2481 | spin_unlock_irq(&log->io_list_lock); | ||
2482 | } | ||
2483 | } | ||
2484 | |||
2485 | mutex_unlock(&log->io_mutex); | ||
2486 | return 0; | ||
1114 | } | 2487 | } |
1115 | 2488 | ||
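A worked example of the space reservation in r5c_cache_data() above (editorial note): with 4 KiB pages, PAGE_SHIFT - 9 == 3, so a stripe that caches 3 data pages reserves (1 + 3) << 3 == 32 journal sectors, i.e. one page of journal space for the meta block plus one page per cached data page.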
1116 | static int r5l_load_log(struct r5l_log *log) | 2489 | static int r5l_load_log(struct r5l_log *log) |
@@ -1121,7 +2494,7 @@ static int r5l_load_log(struct r5l_log *log) | |||
1121 | sector_t cp = log->rdev->journal_tail; | 2494 | sector_t cp = log->rdev->journal_tail; |
1122 | u32 stored_crc, expected_crc; | 2495 | u32 stored_crc, expected_crc; |
1123 | bool create_super = false; | 2496 | bool create_super = false; |
1124 | int ret; | 2497 | int ret = 0; |
1125 | 2498 | ||
1126 | /* Make sure it's valid */ | 2499 | /* Make sure it's valid */ |
1127 | if (cp >= rdev->sectors || round_down(cp, BLOCK_SECTORS) != cp) | 2500 | if (cp >= rdev->sectors || round_down(cp, BLOCK_SECTORS) != cp) |
@@ -1171,11 +2544,18 @@ create: | |||
1171 | if (log->max_free_space > RECLAIM_MAX_FREE_SPACE) | 2544 | if (log->max_free_space > RECLAIM_MAX_FREE_SPACE) |
1172 | log->max_free_space = RECLAIM_MAX_FREE_SPACE; | 2545 | log->max_free_space = RECLAIM_MAX_FREE_SPACE; |
1173 | log->last_checkpoint = cp; | 2546 | log->last_checkpoint = cp; |
1174 | log->next_checkpoint = cp; | ||
1175 | 2547 | ||
1176 | __free_page(page); | 2548 | __free_page(page); |
1177 | 2549 | ||
1178 | return r5l_recovery_log(log); | 2550 | if (create_super) { |
2551 | log->log_start = r5l_ring_add(log, cp, BLOCK_SECTORS); | ||
2552 | log->seq = log->last_cp_seq + 1; | ||
2553 | log->next_checkpoint = cp; | ||
2554 | } else | ||
2555 | ret = r5l_recovery_log(log); | ||
2556 | |||
2557 | r5c_update_log_state(log); | ||
2558 | return ret; | ||
1179 | ioerr: | 2559 | ioerr: |
1180 | __free_page(page); | 2560 | __free_page(page); |
1181 | return ret; | 2561 | return ret; |
@@ -1188,6 +2568,22 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev) | |||
1188 | 2568 | ||
1189 | if (PAGE_SIZE != 4096) | 2569 | if (PAGE_SIZE != 4096) |
1190 | return -EINVAL; | 2570 | return -EINVAL; |
2571 | |||
2572 | /* | ||
2573 | * PAGE_SIZE must be big enough to hold one r5l_meta_block plus | ||
2574 | * raid_disks r5l_payload_data_parity entries. | ||
2575 | * | ||
2576 | * The write journal and cache therefore do not work for very large | ||
2577 | * arrays (raid_disks > 203). | ||
2578 | */ | ||
2579 | if (sizeof(struct r5l_meta_block) + | ||
2580 | ((sizeof(struct r5l_payload_data_parity) + sizeof(__le32)) * | ||
2581 | conf->raid_disks) > PAGE_SIZE) { | ||
2582 | pr_err("md/raid:%s: write journal/cache doesn't work for array with %d disks\n", | ||
2583 | mdname(conf->mddev), conf->raid_disks); | ||
2584 | return -EINVAL; | ||
2585 | } | ||
2586 | |||
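As a back-of-the-envelope check of the 203-disk limit noted in the comment above: assuming the packed on-disk layouts (roughly 32 bytes for struct r5l_meta_block and about 20 bytes for one r5l_payload_data_parity descriptor plus its __le32 checksum per member disk — sizes taken as assumptions for this sketch, not re-verified against md_p.h), a 4096-byte page fits floor((4096 - 32) / 20) = 203 payload entries:

#include <stdio.h>

/* Rough check of the raid_disks limit for a 4K page. The struct sizes are
 * assumptions for this sketch (meta block ~32 bytes, one data/parity payload
 * descriptor plus its checksum ~20 bytes per member disk). */
int main(void)
{
	const unsigned int page_size  = 4096;
	const unsigned int meta_block = 32;
	const unsigned int per_disk   = 20;

	printf("max raid_disks = %u\n", (page_size - meta_block) / per_disk); /* 203 */
	return 0;
}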
1191 | log = kzalloc(sizeof(*log), GFP_KERNEL); | 2587 | log = kzalloc(sizeof(*log), GFP_KERNEL); |
1192 | if (!log) | 2588 | if (!log) |
1193 | return -ENOMEM; | 2589 | return -ENOMEM; |
@@ -1227,6 +2623,8 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev) | |||
1227 | log->rdev->mddev, "reclaim"); | 2623 | log->rdev->mddev, "reclaim"); |
1228 | if (!log->reclaim_thread) | 2624 | if (!log->reclaim_thread) |
1229 | goto reclaim_thread; | 2625 | goto reclaim_thread; |
2626 | log->reclaim_thread->timeout = R5C_RECLAIM_WAKEUP_INTERVAL; | ||
2627 | |||
1230 | init_waitqueue_head(&log->iounit_wait); | 2628 | init_waitqueue_head(&log->iounit_wait); |
1231 | 2629 | ||
1232 | INIT_LIST_HEAD(&log->no_mem_stripes); | 2630 | INIT_LIST_HEAD(&log->no_mem_stripes); |
@@ -1234,6 +2632,13 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev) | |||
1234 | INIT_LIST_HEAD(&log->no_space_stripes); | 2632 | INIT_LIST_HEAD(&log->no_space_stripes); |
1235 | spin_lock_init(&log->no_space_stripes_lock); | 2633 | spin_lock_init(&log->no_space_stripes_lock); |
1236 | 2634 | ||
2635 | INIT_WORK(&log->deferred_io_work, r5l_submit_io_async); | ||
2636 | |||
2637 | log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH; | ||
2638 | INIT_LIST_HEAD(&log->stripe_in_journal_list); | ||
2639 | spin_lock_init(&log->stripe_in_journal_lock); | ||
2640 | atomic_set(&log->stripe_in_journal_count, 0); | ||
2641 | |||
1237 | if (r5l_load_log(log)) | 2642 | if (r5l_load_log(log)) |
1238 | goto error; | 2643 | goto error; |
1239 | 2644 | ||
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 5f9e28443c8a..06d7279bdd04 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c | |||
@@ -70,19 +70,6 @@ module_param(devices_handle_discard_safely, bool, 0644); | |||
70 | MODULE_PARM_DESC(devices_handle_discard_safely, | 70 | MODULE_PARM_DESC(devices_handle_discard_safely, |
71 | "Set to Y if all devices in each array reliably return zeroes on reads from discarded regions"); | 71 | "Set to Y if all devices in each array reliably return zeroes on reads from discarded regions"); |
72 | static struct workqueue_struct *raid5_wq; | 72 | static struct workqueue_struct *raid5_wq; |
73 | /* | ||
74 | * Stripe cache | ||
75 | */ | ||
76 | |||
77 | #define NR_STRIPES 256 | ||
78 | #define STRIPE_SIZE PAGE_SIZE | ||
79 | #define STRIPE_SHIFT (PAGE_SHIFT - 9) | ||
80 | #define STRIPE_SECTORS (STRIPE_SIZE>>9) | ||
81 | #define IO_THRESHOLD 1 | ||
82 | #define BYPASS_THRESHOLD 1 | ||
83 | #define NR_HASH (PAGE_SIZE / sizeof(struct hlist_head)) | ||
84 | #define HASH_MASK (NR_HASH - 1) | ||
85 | #define MAX_STRIPE_BATCH 8 | ||
86 | 73 | ||
87 | static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect) | 74 | static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect) |
88 | { | 75 | { |
@@ -126,64 +113,6 @@ static inline void unlock_all_device_hash_locks_irq(struct r5conf *conf) | |||
126 | local_irq_enable(); | 113 | local_irq_enable(); |
127 | } | 114 | } |
128 | 115 | ||
129 | /* bio's attached to a stripe+device for I/O are linked together in bi_sector | ||
130 | * order without overlap. There may be several bio's per stripe+device, and | ||
131 | * a bio could span several devices. | ||
132 | * When walking this list for a particular stripe+device, we must never proceed | ||
133 | * beyond a bio that extends past this device, as the next bio might no longer | ||
134 | * be valid. | ||
135 | * This function is used to determine the 'next' bio in the list, given the sector | ||
136 | * of the current stripe+device | ||
137 | */ | ||
138 | static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector) | ||
139 | { | ||
140 | int sectors = bio_sectors(bio); | ||
141 | if (bio->bi_iter.bi_sector + sectors < sector + STRIPE_SECTORS) | ||
142 | return bio->bi_next; | ||
143 | else | ||
144 | return NULL; | ||
145 | } | ||
146 | |||
147 | /* | ||
148 | * We maintain a biased count of active stripes in the bottom 16 bits of | ||
149 | * bi_phys_segments, and a count of processed stripes in the upper 16 bits | ||
150 | */ | ||
151 | static inline int raid5_bi_processed_stripes(struct bio *bio) | ||
152 | { | ||
153 | atomic_t *segments = (atomic_t *)&bio->bi_phys_segments; | ||
154 | return (atomic_read(segments) >> 16) & 0xffff; | ||
155 | } | ||
156 | |||
157 | static inline int raid5_dec_bi_active_stripes(struct bio *bio) | ||
158 | { | ||
159 | atomic_t *segments = (atomic_t *)&bio->bi_phys_segments; | ||
160 | return atomic_sub_return(1, segments) & 0xffff; | ||
161 | } | ||
162 | |||
163 | static inline void raid5_inc_bi_active_stripes(struct bio *bio) | ||
164 | { | ||
165 | atomic_t *segments = (atomic_t *)&bio->bi_phys_segments; | ||
166 | atomic_inc(segments); | ||
167 | } | ||
168 | |||
169 | static inline void raid5_set_bi_processed_stripes(struct bio *bio, | ||
170 | unsigned int cnt) | ||
171 | { | ||
172 | atomic_t *segments = (atomic_t *)&bio->bi_phys_segments; | ||
173 | int old, new; | ||
174 | |||
175 | do { | ||
176 | old = atomic_read(segments); | ||
177 | new = (old & 0xffff) | (cnt << 16); | ||
178 | } while (atomic_cmpxchg(segments, old, new) != old); | ||
179 | } | ||
180 | |||
181 | static inline void raid5_set_bi_stripes(struct bio *bio, unsigned int cnt) | ||
182 | { | ||
183 | atomic_t *segments = (atomic_t *)&bio->bi_phys_segments; | ||
184 | atomic_set(segments, cnt); | ||
185 | } | ||
186 | |||
187 | /* Find first data disk in a raid6 stripe */ | 116 | /* Find first data disk in a raid6 stripe */ |
188 | static inline int raid6_d0(struct stripe_head *sh) | 117 | static inline int raid6_d0(struct stripe_head *sh) |
189 | { | 118 | { |
@@ -289,8 +218,27 @@ static void raid5_wakeup_stripe_thread(struct stripe_head *sh) | |||
289 | static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh, | 218 | static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh, |
290 | struct list_head *temp_inactive_list) | 219 | struct list_head *temp_inactive_list) |
291 | { | 220 | { |
221 | int i; | ||
222 | int injournal = 0; /* number of data pages with R5_InJournal */ | ||
223 | |||
292 | BUG_ON(!list_empty(&sh->lru)); | 224 | BUG_ON(!list_empty(&sh->lru)); |
293 | BUG_ON(atomic_read(&conf->active_stripes)==0); | 225 | BUG_ON(atomic_read(&conf->active_stripes)==0); |
226 | |||
227 | if (r5c_is_writeback(conf->log)) | ||
228 | for (i = sh->disks; i--; ) | ||
229 | if (test_bit(R5_InJournal, &sh->dev[i].flags)) | ||
230 | injournal++; | ||
231 | /* | ||
232 | * When quiescing in r5c write-back mode, set STRIPE_HANDLE for stripes | ||
233 | * with data in the journal so they are not released to the cached lists | ||
234 | */ | ||
235 | if (conf->quiesce && r5c_is_writeback(conf->log) && | ||
236 | !test_bit(STRIPE_HANDLE, &sh->state) && injournal != 0) { | ||
237 | if (test_bit(STRIPE_R5C_CACHING, &sh->state)) | ||
238 | r5c_make_stripe_write_out(sh); | ||
239 | set_bit(STRIPE_HANDLE, &sh->state); | ||
240 | } | ||
241 | |||
294 | if (test_bit(STRIPE_HANDLE, &sh->state)) { | 242 | if (test_bit(STRIPE_HANDLE, &sh->state)) { |
295 | if (test_bit(STRIPE_DELAYED, &sh->state) && | 243 | if (test_bit(STRIPE_DELAYED, &sh->state) && |
296 | !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) | 244 | !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) |
@@ -316,8 +264,30 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh, | |||
316 | < IO_THRESHOLD) | 264 | < IO_THRESHOLD) |
317 | md_wakeup_thread(conf->mddev->thread); | 265 | md_wakeup_thread(conf->mddev->thread); |
318 | atomic_dec(&conf->active_stripes); | 266 | atomic_dec(&conf->active_stripes); |
319 | if (!test_bit(STRIPE_EXPANDING, &sh->state)) | 267 | if (!test_bit(STRIPE_EXPANDING, &sh->state)) { |
320 | list_add_tail(&sh->lru, temp_inactive_list); | 268 | if (!r5c_is_writeback(conf->log)) |
269 | list_add_tail(&sh->lru, temp_inactive_list); | ||
270 | else { | ||
271 | WARN_ON(test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags)); | ||
272 | if (injournal == 0) | ||
273 | list_add_tail(&sh->lru, temp_inactive_list); | ||
274 | else if (injournal == conf->raid_disks - conf->max_degraded) { | ||
275 | /* full stripe */ | ||
276 | if (!test_and_set_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) | ||
277 | atomic_inc(&conf->r5c_cached_full_stripes); | ||
278 | if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) | ||
279 | atomic_dec(&conf->r5c_cached_partial_stripes); | ||
280 | list_add_tail(&sh->lru, &conf->r5c_full_stripe_list); | ||
281 | r5c_check_cached_full_stripe(conf); | ||
282 | } else { | ||
283 | /* partial stripe */ | ||
284 | if (!test_and_set_bit(STRIPE_R5C_PARTIAL_STRIPE, | ||
285 | &sh->state)) | ||
286 | atomic_inc(&conf->r5c_cached_partial_stripes); | ||
287 | list_add_tail(&sh->lru, &conf->r5c_partial_stripe_list); | ||
288 | } | ||
289 | } | ||
290 | } | ||
321 | } | 291 | } |
322 | } | 292 | } |
323 | 293 | ||
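The list placement added to do_release_stripe() above depends only on how many data pages of the stripe are already in the journal. A self-contained sketch of that classification (the 6-disk RAID6 numbers in the example are made up for illustration, not taken from the patch):

#include <stdio.h>

/* Sketch of the write-back list choice in do_release_stripe(). */
enum r5c_list { R5C_INACTIVE, R5C_PARTIAL, R5C_FULL };

static enum r5c_list r5c_pick_list(int injournal, int raid_disks, int max_degraded)
{
	int data_disks = raid_disks - max_degraded;

	if (injournal == 0)
		return R5C_INACTIVE;	/* nothing cached: normal inactive list */
	if (injournal == data_disks)
		return R5C_FULL;	/* every data block cached: full-stripe list */
	return R5C_PARTIAL;		/* some blocks cached: partial-stripe list */
}

int main(void)
{
	/* e.g. a 6-disk RAID6: 4 data disks, so 4 cached pages make a full stripe */
	printf("%d\n", r5c_pick_list(4, 6, 2) == R5C_FULL);	/* prints 1 */
	printf("%d\n", r5c_pick_list(2, 6, 2) == R5C_PARTIAL);	/* prints 1 */
	return 0;
}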
@@ -541,7 +511,7 @@ retry: | |||
541 | 511 | ||
542 | if (dev->toread || dev->read || dev->towrite || dev->written || | 512 | if (dev->toread || dev->read || dev->towrite || dev->written || |
543 | test_bit(R5_LOCKED, &dev->flags)) { | 513 | test_bit(R5_LOCKED, &dev->flags)) { |
544 | printk(KERN_ERR "sector=%llx i=%d %p %p %p %p %d\n", | 514 | pr_err("sector=%llx i=%d %p %p %p %p %d\n", |
545 | (unsigned long long)sh->sector, i, dev->toread, | 515 | (unsigned long long)sh->sector, i, dev->toread, |
546 | dev->read, dev->towrite, dev->written, | 516 | dev->read, dev->towrite, dev->written, |
547 | test_bit(R5_LOCKED, &dev->flags)); | 517 | test_bit(R5_LOCKED, &dev->flags)); |
@@ -680,9 +650,12 @@ raid5_get_active_stripe(struct r5conf *conf, sector_t sector, | |||
680 | } | 650 | } |
681 | if (noblock && sh == NULL) | 651 | if (noblock && sh == NULL) |
682 | break; | 652 | break; |
653 | |||
654 | r5c_check_stripe_cache_usage(conf); | ||
683 | if (!sh) { | 655 | if (!sh) { |
684 | set_bit(R5_INACTIVE_BLOCKED, | 656 | set_bit(R5_INACTIVE_BLOCKED, |
685 | &conf->cache_state); | 657 | &conf->cache_state); |
658 | r5l_wake_reclaim(conf->log, 0); | ||
686 | wait_event_lock_irq( | 659 | wait_event_lock_irq( |
687 | conf->wait_for_stripe, | 660 | conf->wait_for_stripe, |
688 | !list_empty(conf->inactive_list + hash) && | 661 | !list_empty(conf->inactive_list + hash) && |
@@ -901,8 +874,19 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) | |||
901 | 874 | ||
902 | might_sleep(); | 875 | might_sleep(); |
903 | 876 | ||
904 | if (r5l_write_stripe(conf->log, sh) == 0) | 877 | if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) { |
905 | return; | 878 | /* writing out phase */ |
879 | if (s->waiting_extra_page) | ||
880 | return; | ||
881 | if (r5l_write_stripe(conf->log, sh) == 0) | ||
882 | return; | ||
883 | } else { /* caching phase */ | ||
884 | if (test_bit(STRIPE_LOG_TRAPPED, &sh->state)) { | ||
885 | r5c_cache_data(conf->log, sh, s); | ||
886 | return; | ||
887 | } | ||
888 | } | ||
889 | |||
906 | for (i = disks; i--; ) { | 890 | for (i = disks; i--; ) { |
907 | int op, op_flags = 0; | 891 | int op, op_flags = 0; |
908 | int replace_only = 0; | 892 | int replace_only = 0; |
@@ -977,7 +961,7 @@ again: | |||
977 | if (bad < 0) { | 961 | if (bad < 0) { |
978 | set_bit(BlockedBadBlocks, &rdev->flags); | 962 | set_bit(BlockedBadBlocks, &rdev->flags); |
979 | if (!conf->mddev->external && | 963 | if (!conf->mddev->external && |
980 | conf->mddev->flags) { | 964 | conf->mddev->sb_flags) { |
981 | /* It is very unlikely, but we might | 965 | /* It is very unlikely, but we might |
982 | * still need to write out the | 966 | * still need to write out the |
983 | * bad block log - better give it | 967 | * bad block log - better give it |
@@ -1115,7 +1099,7 @@ again: | |||
1115 | static struct dma_async_tx_descriptor * | 1099 | static struct dma_async_tx_descriptor * |
1116 | async_copy_data(int frombio, struct bio *bio, struct page **page, | 1100 | async_copy_data(int frombio, struct bio *bio, struct page **page, |
1117 | sector_t sector, struct dma_async_tx_descriptor *tx, | 1101 | sector_t sector, struct dma_async_tx_descriptor *tx, |
1118 | struct stripe_head *sh) | 1102 | struct stripe_head *sh, int no_skipcopy) |
1119 | { | 1103 | { |
1120 | struct bio_vec bvl; | 1104 | struct bio_vec bvl; |
1121 | struct bvec_iter iter; | 1105 | struct bvec_iter iter; |
@@ -1155,7 +1139,8 @@ async_copy_data(int frombio, struct bio *bio, struct page **page, | |||
1155 | if (frombio) { | 1139 | if (frombio) { |
1156 | if (sh->raid_conf->skip_copy && | 1140 | if (sh->raid_conf->skip_copy && |
1157 | b_offset == 0 && page_offset == 0 && | 1141 | b_offset == 0 && page_offset == 0 && |
1158 | clen == STRIPE_SIZE) | 1142 | clen == STRIPE_SIZE && |
1143 | !no_skipcopy) | ||
1159 | *page = bio_page; | 1144 | *page = bio_page; |
1160 | else | 1145 | else |
1161 | tx = async_memcpy(*page, bio_page, page_offset, | 1146 | tx = async_memcpy(*page, bio_page, page_offset, |
@@ -1237,7 +1222,7 @@ static void ops_run_biofill(struct stripe_head *sh) | |||
1237 | while (rbi && rbi->bi_iter.bi_sector < | 1222 | while (rbi && rbi->bi_iter.bi_sector < |
1238 | dev->sector + STRIPE_SECTORS) { | 1223 | dev->sector + STRIPE_SECTORS) { |
1239 | tx = async_copy_data(0, rbi, &dev->page, | 1224 | tx = async_copy_data(0, rbi, &dev->page, |
1240 | dev->sector, tx, sh); | 1225 | dev->sector, tx, sh, 0); |
1241 | rbi = r5_next_bio(rbi, dev->sector); | 1226 | rbi = r5_next_bio(rbi, dev->sector); |
1242 | } | 1227 | } |
1243 | } | 1228 | } |
@@ -1364,10 +1349,15 @@ static int set_syndrome_sources(struct page **srcs, | |||
1364 | if (i == sh->qd_idx || i == sh->pd_idx || | 1349 | if (i == sh->qd_idx || i == sh->pd_idx || |
1365 | (srctype == SYNDROME_SRC_ALL) || | 1350 | (srctype == SYNDROME_SRC_ALL) || |
1366 | (srctype == SYNDROME_SRC_WANT_DRAIN && | 1351 | (srctype == SYNDROME_SRC_WANT_DRAIN && |
1367 | test_bit(R5_Wantdrain, &dev->flags)) || | 1352 | (test_bit(R5_Wantdrain, &dev->flags) || |
1353 | test_bit(R5_InJournal, &dev->flags))) || | ||
1368 | (srctype == SYNDROME_SRC_WRITTEN && | 1354 | (srctype == SYNDROME_SRC_WRITTEN && |
1369 | dev->written)) | 1355 | dev->written)) { |
1370 | srcs[slot] = sh->dev[i].page; | 1356 | if (test_bit(R5_InJournal, &dev->flags)) |
1357 | srcs[slot] = sh->dev[i].orig_page; | ||
1358 | else | ||
1359 | srcs[slot] = sh->dev[i].page; | ||
1360 | } | ||
1371 | i = raid6_next_disk(i, disks); | 1361 | i = raid6_next_disk(i, disks); |
1372 | } while (i != d0_idx); | 1362 | } while (i != d0_idx); |
1373 | 1363 | ||
@@ -1546,6 +1536,13 @@ static void ops_complete_prexor(void *stripe_head_ref) | |||
1546 | 1536 | ||
1547 | pr_debug("%s: stripe %llu\n", __func__, | 1537 | pr_debug("%s: stripe %llu\n", __func__, |
1548 | (unsigned long long)sh->sector); | 1538 | (unsigned long long)sh->sector); |
1539 | |||
1540 | if (r5c_is_writeback(sh->raid_conf->log)) | ||
1541 | /* | ||
1542 | * raid5-cache write-back uses orig_page during prexor; | ||
1543 | * once prexor completes, orig_page can be freed. | ||
1544 | */ | ||
1545 | r5c_release_extra_page(sh); | ||
1549 | } | 1546 | } |
1550 | 1547 | ||
1551 | static struct dma_async_tx_descriptor * | 1548 | static struct dma_async_tx_descriptor * |
@@ -1567,7 +1564,9 @@ ops_run_prexor5(struct stripe_head *sh, struct raid5_percpu *percpu, | |||
1567 | for (i = disks; i--; ) { | 1564 | for (i = disks; i--; ) { |
1568 | struct r5dev *dev = &sh->dev[i]; | 1565 | struct r5dev *dev = &sh->dev[i]; |
1569 | /* Only process blocks that are known to be uptodate */ | 1566 | /* Only process blocks that are known to be uptodate */ |
1570 | if (test_bit(R5_Wantdrain, &dev->flags)) | 1567 | if (test_bit(R5_InJournal, &dev->flags)) |
1568 | xor_srcs[count++] = dev->orig_page; | ||
1569 | else if (test_bit(R5_Wantdrain, &dev->flags)) | ||
1571 | xor_srcs[count++] = dev->page; | 1570 | xor_srcs[count++] = dev->page; |
1572 | } | 1571 | } |
1573 | 1572 | ||
@@ -1601,6 +1600,7 @@ ops_run_prexor6(struct stripe_head *sh, struct raid5_percpu *percpu, | |||
1601 | static struct dma_async_tx_descriptor * | 1600 | static struct dma_async_tx_descriptor * |
1602 | ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) | 1601 | ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) |
1603 | { | 1602 | { |
1603 | struct r5conf *conf = sh->raid_conf; | ||
1604 | int disks = sh->disks; | 1604 | int disks = sh->disks; |
1605 | int i; | 1605 | int i; |
1606 | struct stripe_head *head_sh = sh; | 1606 | struct stripe_head *head_sh = sh; |
@@ -1618,6 +1618,11 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) | |||
1618 | 1618 | ||
1619 | again: | 1619 | again: |
1620 | dev = &sh->dev[i]; | 1620 | dev = &sh->dev[i]; |
1621 | /* | ||
1622 | * Clear R5_InJournal so that, when the page is rewritten to | ||
1623 | * the journal, it is not skipped by r5l_log_stripe(). | ||
1624 | */ | ||
1625 | clear_bit(R5_InJournal, &dev->flags); | ||
1621 | spin_lock_irq(&sh->stripe_lock); | 1626 | spin_lock_irq(&sh->stripe_lock); |
1622 | chosen = dev->towrite; | 1627 | chosen = dev->towrite; |
1623 | dev->towrite = NULL; | 1628 | dev->towrite = NULL; |
@@ -1637,8 +1642,10 @@ again: | |||
1637 | set_bit(R5_Discard, &dev->flags); | 1642 | set_bit(R5_Discard, &dev->flags); |
1638 | else { | 1643 | else { |
1639 | tx = async_copy_data(1, wbi, &dev->page, | 1644 | tx = async_copy_data(1, wbi, &dev->page, |
1640 | dev->sector, tx, sh); | 1645 | dev->sector, tx, sh, |
1641 | if (dev->page != dev->orig_page) { | 1646 | r5c_is_writeback(conf->log)); |
1647 | if (dev->page != dev->orig_page && | ||
1648 | !r5c_is_writeback(conf->log)) { | ||
1642 | set_bit(R5_SkipCopy, &dev->flags); | 1649 | set_bit(R5_SkipCopy, &dev->flags); |
1643 | clear_bit(R5_UPTODATE, &dev->flags); | 1650 | clear_bit(R5_UPTODATE, &dev->flags); |
1644 | clear_bit(R5_OVERWRITE, &dev->flags); | 1651 | clear_bit(R5_OVERWRITE, &dev->flags); |
@@ -1746,7 +1753,8 @@ again: | |||
1746 | xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; | 1753 | xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; |
1747 | for (i = disks; i--; ) { | 1754 | for (i = disks; i--; ) { |
1748 | struct r5dev *dev = &sh->dev[i]; | 1755 | struct r5dev *dev = &sh->dev[i]; |
1749 | if (head_sh->dev[i].written) | 1756 | if (head_sh->dev[i].written || |
1757 | test_bit(R5_InJournal, &head_sh->dev[i].flags)) | ||
1750 | xor_srcs[count++] = dev->page; | 1758 | xor_srcs[count++] = dev->page; |
1751 | } | 1759 | } |
1752 | } else { | 1760 | } else { |
@@ -2000,7 +2008,10 @@ static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp, | |||
2000 | spin_lock_init(&sh->batch_lock); | 2008 | spin_lock_init(&sh->batch_lock); |
2001 | INIT_LIST_HEAD(&sh->batch_list); | 2009 | INIT_LIST_HEAD(&sh->batch_list); |
2002 | INIT_LIST_HEAD(&sh->lru); | 2010 | INIT_LIST_HEAD(&sh->lru); |
2011 | INIT_LIST_HEAD(&sh->r5c); | ||
2012 | INIT_LIST_HEAD(&sh->log_list); | ||
2003 | atomic_set(&sh->count, 1); | 2013 | atomic_set(&sh->count, 1); |
2014 | sh->log_start = MaxSector; | ||
2004 | for (i = 0; i < disks; i++) { | 2015 | for (i = 0; i < disks; i++) { |
2005 | struct r5dev *dev = &sh->dev[i]; | 2016 | struct r5dev *dev = &sh->dev[i]; |
2006 | 2017 | ||
@@ -2240,10 +2251,24 @@ static int resize_stripes(struct r5conf *conf, int newsize) | |||
2240 | */ | 2251 | */ |
2241 | ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO); | 2252 | ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO); |
2242 | if (ndisks) { | 2253 | if (ndisks) { |
2243 | for (i=0; i<conf->raid_disks; i++) | 2254 | for (i = 0; i < conf->pool_size; i++) |
2244 | ndisks[i] = conf->disks[i]; | 2255 | ndisks[i] = conf->disks[i]; |
2245 | kfree(conf->disks); | 2256 | |
2246 | conf->disks = ndisks; | 2257 | for (i = conf->pool_size; i < newsize; i++) { |
2258 | ndisks[i].extra_page = alloc_page(GFP_NOIO); | ||
2259 | if (!ndisks[i].extra_page) | ||
2260 | err = -ENOMEM; | ||
2261 | } | ||
2262 | |||
2263 | if (err) { | ||
2264 | for (i = conf->pool_size; i < newsize; i++) | ||
2265 | if (ndisks[i].extra_page) | ||
2266 | put_page(ndisks[i].extra_page); | ||
2267 | kfree(ndisks); | ||
2268 | } else { | ||
2269 | kfree(conf->disks); | ||
2270 | conf->disks = ndisks; | ||
2271 | } | ||
2247 | } else | 2272 | } else |
2248 | err = -ENOMEM; | 2273 | err = -ENOMEM; |
2249 | 2274 | ||
@@ -2342,10 +2367,8 @@ static void raid5_end_read_request(struct bio * bi) | |||
2342 | * replacement device. We just fail those on | 2367 | * replacement device. We just fail those on |
2343 | * any error | 2368 | * any error |
2344 | */ | 2369 | */ |
2345 | printk_ratelimited( | 2370 | pr_info_ratelimited( |
2346 | KERN_INFO | 2371 | "md/raid:%s: read error corrected (%lu sectors at %llu on %s)\n", |
2347 | "md/raid:%s: read error corrected" | ||
2348 | " (%lu sectors at %llu on %s)\n", | ||
2349 | mdname(conf->mddev), STRIPE_SECTORS, | 2372 | mdname(conf->mddev), STRIPE_SECTORS, |
2350 | (unsigned long long)s, | 2373 | (unsigned long long)s, |
2351 | bdevname(rdev->bdev, b)); | 2374 | bdevname(rdev->bdev, b)); |
@@ -2365,36 +2388,29 @@ static void raid5_end_read_request(struct bio * bi) | |||
2365 | clear_bit(R5_UPTODATE, &sh->dev[i].flags); | 2388 | clear_bit(R5_UPTODATE, &sh->dev[i].flags); |
2366 | atomic_inc(&rdev->read_errors); | 2389 | atomic_inc(&rdev->read_errors); |
2367 | if (test_bit(R5_ReadRepl, &sh->dev[i].flags)) | 2390 | if (test_bit(R5_ReadRepl, &sh->dev[i].flags)) |
2368 | printk_ratelimited( | 2391 | pr_warn_ratelimited( |
2369 | KERN_WARNING | 2392 | "md/raid:%s: read error on replacement device (sector %llu on %s).\n", |
2370 | "md/raid:%s: read error on replacement device " | ||
2371 | "(sector %llu on %s).\n", | ||
2372 | mdname(conf->mddev), | 2393 | mdname(conf->mddev), |
2373 | (unsigned long long)s, | 2394 | (unsigned long long)s, |
2374 | bdn); | 2395 | bdn); |
2375 | else if (conf->mddev->degraded >= conf->max_degraded) { | 2396 | else if (conf->mddev->degraded >= conf->max_degraded) { |
2376 | set_bad = 1; | 2397 | set_bad = 1; |
2377 | printk_ratelimited( | 2398 | pr_warn_ratelimited( |
2378 | KERN_WARNING | 2399 | "md/raid:%s: read error not correctable (sector %llu on %s).\n", |
2379 | "md/raid:%s: read error not correctable " | ||
2380 | "(sector %llu on %s).\n", | ||
2381 | mdname(conf->mddev), | 2400 | mdname(conf->mddev), |
2382 | (unsigned long long)s, | 2401 | (unsigned long long)s, |
2383 | bdn); | 2402 | bdn); |
2384 | } else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) { | 2403 | } else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) { |
2385 | /* Oh, no!!! */ | 2404 | /* Oh, no!!! */ |
2386 | set_bad = 1; | 2405 | set_bad = 1; |
2387 | printk_ratelimited( | 2406 | pr_warn_ratelimited( |
2388 | KERN_WARNING | 2407 | "md/raid:%s: read error NOT corrected!! (sector %llu on %s).\n", |
2389 | "md/raid:%s: read error NOT corrected!! " | ||
2390 | "(sector %llu on %s).\n", | ||
2391 | mdname(conf->mddev), | 2408 | mdname(conf->mddev), |
2392 | (unsigned long long)s, | 2409 | (unsigned long long)s, |
2393 | bdn); | 2410 | bdn); |
2394 | } else if (atomic_read(&rdev->read_errors) | 2411 | } else if (atomic_read(&rdev->read_errors) |
2395 | > conf->max_nr_stripes) | 2412 | > conf->max_nr_stripes) |
2396 | printk(KERN_WARNING | 2413 | pr_warn("md/raid:%s: Too many read errors, failing device %s.\n", |
2397 | "md/raid:%s: Too many read errors, failing device %s.\n", | ||
2398 | mdname(conf->mddev), bdn); | 2414 | mdname(conf->mddev), bdn); |
2399 | else | 2415 | else |
2400 | retry = 1; | 2416 | retry = 1; |
@@ -2526,15 +2542,14 @@ static void raid5_error(struct mddev *mddev, struct md_rdev *rdev) | |||
2526 | 2542 | ||
2527 | set_bit(Blocked, &rdev->flags); | 2543 | set_bit(Blocked, &rdev->flags); |
2528 | set_bit(Faulty, &rdev->flags); | 2544 | set_bit(Faulty, &rdev->flags); |
2529 | set_mask_bits(&mddev->flags, 0, | 2545 | set_mask_bits(&mddev->sb_flags, 0, |
2530 | BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING)); | 2546 | BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING)); |
2531 | printk(KERN_ALERT | 2547 | pr_crit("md/raid:%s: Disk failure on %s, disabling device.\n" |
2532 | "md/raid:%s: Disk failure on %s, disabling device.\n" | 2548 | "md/raid:%s: Operation continuing on %d devices.\n", |
2533 | "md/raid:%s: Operation continuing on %d devices.\n", | 2549 | mdname(mddev), |
2534 | mdname(mddev), | 2550 | bdevname(rdev->bdev, b), |
2535 | bdevname(rdev->bdev, b), | 2551 | mdname(mddev), |
2536 | mdname(mddev), | 2552 | conf->raid_disks - mddev->degraded); |
2537 | conf->raid_disks - mddev->degraded); | ||
2538 | } | 2553 | } |
2539 | 2554 | ||
2540 | /* | 2555 | /* |
@@ -2856,8 +2871,8 @@ sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous) | |||
2856 | previous, &dummy1, &sh2); | 2871 | previous, &dummy1, &sh2); |
2857 | if (check != sh->sector || dummy1 != dd_idx || sh2.pd_idx != sh->pd_idx | 2872 | if (check != sh->sector || dummy1 != dd_idx || sh2.pd_idx != sh->pd_idx |
2858 | || sh2.qd_idx != sh->qd_idx) { | 2873 | || sh2.qd_idx != sh->qd_idx) { |
2859 | printk(KERN_ERR "md/raid:%s: compute_blocknr: map not correct\n", | 2874 | pr_warn("md/raid:%s: compute_blocknr: map not correct\n", |
2860 | mdname(conf->mddev)); | 2875 | mdname(conf->mddev)); |
2861 | return 0; | 2876 | return 0; |
2862 | } | 2877 | } |
2863 | return r_sector; | 2878 | return r_sector; |
@@ -2872,6 +2887,13 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s, | |||
2872 | int level = conf->level; | 2887 | int level = conf->level; |
2873 | 2888 | ||
2874 | if (rcw) { | 2889 | if (rcw) { |
2890 | /* | ||
2891 | * In some cases handle_stripe_dirtying() initially decided to | ||
2892 | * run rmw and allocated an extra page for prexor, but rcw turned | ||
2893 | * out to be cheaper later on. Free the extra page now, because | ||
2894 | * we won't be able to do that in ops_complete_prexor(). | ||
2895 | */ | ||
2896 | r5c_release_extra_page(sh); | ||
2875 | 2897 | ||
2876 | for (i = disks; i--; ) { | 2898 | for (i = disks; i--; ) { |
2877 | struct r5dev *dev = &sh->dev[i]; | 2899 | struct r5dev *dev = &sh->dev[i]; |
@@ -2882,6 +2904,9 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s, | |||
2882 | if (!expand) | 2904 | if (!expand) |
2883 | clear_bit(R5_UPTODATE, &dev->flags); | 2905 | clear_bit(R5_UPTODATE, &dev->flags); |
2884 | s->locked++; | 2906 | s->locked++; |
2907 | } else if (test_bit(R5_InJournal, &dev->flags)) { | ||
2908 | set_bit(R5_LOCKED, &dev->flags); | ||
2909 | s->locked++; | ||
2885 | } | 2910 | } |
2886 | } | 2911 | } |
2887 | /* if we are not expanding this is a proper write request, and | 2912 | /* if we are not expanding this is a proper write request, and |
@@ -2921,6 +2946,9 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s, | |||
2921 | set_bit(R5_LOCKED, &dev->flags); | 2946 | set_bit(R5_LOCKED, &dev->flags); |
2922 | clear_bit(R5_UPTODATE, &dev->flags); | 2947 | clear_bit(R5_UPTODATE, &dev->flags); |
2923 | s->locked++; | 2948 | s->locked++; |
2949 | } else if (test_bit(R5_InJournal, &dev->flags)) { | ||
2950 | set_bit(R5_LOCKED, &dev->flags); | ||
2951 | s->locked++; | ||
2924 | } | 2952 | } |
2925 | } | 2953 | } |
2926 | if (!s->locked) | 2954 | if (!s->locked) |
@@ -3564,10 +3592,10 @@ unhash: | |||
3564 | break_stripe_batch_list(head_sh, STRIPE_EXPAND_SYNC_FLAGS); | 3592 | break_stripe_batch_list(head_sh, STRIPE_EXPAND_SYNC_FLAGS); |
3565 | } | 3593 | } |
3566 | 3594 | ||
3567 | static void handle_stripe_dirtying(struct r5conf *conf, | 3595 | static int handle_stripe_dirtying(struct r5conf *conf, |
3568 | struct stripe_head *sh, | 3596 | struct stripe_head *sh, |
3569 | struct stripe_head_state *s, | 3597 | struct stripe_head_state *s, |
3570 | int disks) | 3598 | int disks) |
3571 | { | 3599 | { |
3572 | int rmw = 0, rcw = 0, i; | 3600 | int rmw = 0, rcw = 0, i; |
3573 | sector_t recovery_cp = conf->mddev->recovery_cp; | 3601 | sector_t recovery_cp = conf->mddev->recovery_cp; |
@@ -3592,9 +3620,12 @@ static void handle_stripe_dirtying(struct r5conf *conf, | |||
3592 | } else for (i = disks; i--; ) { | 3620 | } else for (i = disks; i--; ) { |
3593 | /* would I have to read this buffer for read_modify_write */ | 3621 | /* would I have to read this buffer for read_modify_write */ |
3594 | struct r5dev *dev = &sh->dev[i]; | 3622 | struct r5dev *dev = &sh->dev[i]; |
3595 | if ((dev->towrite || i == sh->pd_idx || i == sh->qd_idx) && | 3623 | if ((dev->towrite || i == sh->pd_idx || i == sh->qd_idx || |
3624 | test_bit(R5_InJournal, &dev->flags)) && | ||
3596 | !test_bit(R5_LOCKED, &dev->flags) && | 3625 | !test_bit(R5_LOCKED, &dev->flags) && |
3597 | !(test_bit(R5_UPTODATE, &dev->flags) || | 3626 | !((test_bit(R5_UPTODATE, &dev->flags) && |
3627 | (!test_bit(R5_InJournal, &dev->flags) || | ||
3628 | dev->page != dev->orig_page)) || | ||
3598 | test_bit(R5_Wantcompute, &dev->flags))) { | 3629 | test_bit(R5_Wantcompute, &dev->flags))) { |
3599 | if (test_bit(R5_Insync, &dev->flags)) | 3630 | if (test_bit(R5_Insync, &dev->flags)) |
3600 | rmw++; | 3631 | rmw++; |
@@ -3606,13 +3637,15 @@ static void handle_stripe_dirtying(struct r5conf *conf, | |||
3606 | i != sh->pd_idx && i != sh->qd_idx && | 3637 | i != sh->pd_idx && i != sh->qd_idx && |
3607 | !test_bit(R5_LOCKED, &dev->flags) && | 3638 | !test_bit(R5_LOCKED, &dev->flags) && |
3608 | !(test_bit(R5_UPTODATE, &dev->flags) || | 3639 | !(test_bit(R5_UPTODATE, &dev->flags) || |
3609 | test_bit(R5_Wantcompute, &dev->flags))) { | 3640 | test_bit(R5_InJournal, &dev->flags) || |
3641 | test_bit(R5_Wantcompute, &dev->flags))) { | ||
3610 | if (test_bit(R5_Insync, &dev->flags)) | 3642 | if (test_bit(R5_Insync, &dev->flags)) |
3611 | rcw++; | 3643 | rcw++; |
3612 | else | 3644 | else |
3613 | rcw += 2*disks; | 3645 | rcw += 2*disks; |
3614 | } | 3646 | } |
3615 | } | 3647 | } |
3648 | |||
3616 | pr_debug("for sector %llu, rmw=%d rcw=%d\n", | 3649 | pr_debug("for sector %llu, rmw=%d rcw=%d\n", |
3617 | (unsigned long long)sh->sector, rmw, rcw); | 3650 | (unsigned long long)sh->sector, rmw, rcw); |
3618 | set_bit(STRIPE_HANDLE, &sh->state); | 3651 | set_bit(STRIPE_HANDLE, &sh->state); |
@@ -3624,10 +3657,44 @@ static void handle_stripe_dirtying(struct r5conf *conf, | |||
3624 | (unsigned long long)sh->sector, rmw); | 3657 | (unsigned long long)sh->sector, rmw); |
3625 | for (i = disks; i--; ) { | 3658 | for (i = disks; i--; ) { |
3626 | struct r5dev *dev = &sh->dev[i]; | 3659 | struct r5dev *dev = &sh->dev[i]; |
3627 | if ((dev->towrite || i == sh->pd_idx || i == sh->qd_idx) && | 3660 | if (test_bit(R5_InJournal, &dev->flags) && |
3661 | dev->page == dev->orig_page && | ||
3662 | !test_bit(R5_LOCKED, &sh->dev[sh->pd_idx].flags)) { | ||
3663 | /* alloc page for prexor */ | ||
3664 | struct page *p = alloc_page(GFP_NOIO); | ||
3665 | |||
3666 | if (p) { | ||
3667 | dev->orig_page = p; | ||
3668 | continue; | ||
3669 | } | ||
3670 | |||
3671 | /* | ||
3672 | * alloc_page() failed; try to use | ||
3673 | * disk_info->extra_page instead | ||
3674 | */ | ||
3675 | if (!test_and_set_bit(R5C_EXTRA_PAGE_IN_USE, | ||
3676 | &conf->cache_state)) { | ||
3677 | r5c_use_extra_page(sh); | ||
3678 | break; | ||
3679 | } | ||
3680 | |||
3681 | /* extra_page in use, add to delayed_list */ | ||
3682 | set_bit(STRIPE_DELAYED, &sh->state); | ||
3683 | s->waiting_extra_page = 1; | ||
3684 | return -EAGAIN; | ||
3685 | } | ||
3686 | } | ||
3687 | |||
3688 | for (i = disks; i--; ) { | ||
3689 | struct r5dev *dev = &sh->dev[i]; | ||
3690 | if ((dev->towrite || | ||
3691 | i == sh->pd_idx || i == sh->qd_idx || | ||
3692 | test_bit(R5_InJournal, &dev->flags)) && | ||
3628 | !test_bit(R5_LOCKED, &dev->flags) && | 3693 | !test_bit(R5_LOCKED, &dev->flags) && |
3629 | !(test_bit(R5_UPTODATE, &dev->flags) || | 3694 | !((test_bit(R5_UPTODATE, &dev->flags) && |
3630 | test_bit(R5_Wantcompute, &dev->flags)) && | 3695 | (!test_bit(R5_InJournal, &dev->flags) || |
3696 | dev->page != dev->orig_page)) || | ||
3697 | test_bit(R5_Wantcompute, &dev->flags)) && | ||
3631 | test_bit(R5_Insync, &dev->flags)) { | 3698 | test_bit(R5_Insync, &dev->flags)) { |
3632 | if (test_bit(STRIPE_PREREAD_ACTIVE, | 3699 | if (test_bit(STRIPE_PREREAD_ACTIVE, |
3633 | &sh->state)) { | 3700 | &sh->state)) { |
@@ -3653,6 +3720,7 @@ static void handle_stripe_dirtying(struct r5conf *conf, | |||
3653 | i != sh->pd_idx && i != sh->qd_idx && | 3720 | i != sh->pd_idx && i != sh->qd_idx && |
3654 | !test_bit(R5_LOCKED, &dev->flags) && | 3721 | !test_bit(R5_LOCKED, &dev->flags) && |
3655 | !(test_bit(R5_UPTODATE, &dev->flags) || | 3722 | !(test_bit(R5_UPTODATE, &dev->flags) || |
3723 | test_bit(R5_InJournal, &dev->flags) || | ||
3656 | test_bit(R5_Wantcompute, &dev->flags))) { | 3724 | test_bit(R5_Wantcompute, &dev->flags))) { |
3657 | rcw++; | 3725 | rcw++; |
3658 | if (test_bit(R5_Insync, &dev->flags) && | 3726 | if (test_bit(R5_Insync, &dev->flags) && |
@@ -3692,8 +3760,9 @@ static void handle_stripe_dirtying(struct r5conf *conf, | |||
3692 | */ | 3760 | */ |
3693 | if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) && | 3761 | if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) && |
3694 | (s->locked == 0 && (rcw == 0 || rmw == 0) && | 3762 | (s->locked == 0 && (rcw == 0 || rmw == 0) && |
3695 | !test_bit(STRIPE_BIT_DELAY, &sh->state))) | 3763 | !test_bit(STRIPE_BIT_DELAY, &sh->state))) |
3696 | schedule_reconstruction(sh, s, rcw == 0, 0); | 3764 | schedule_reconstruction(sh, s, rcw == 0, 0); |
3765 | return 0; | ||
3697 | } | 3766 | } |
3698 | 3767 | ||
3699 | static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh, | 3768 | static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh, |
@@ -3777,7 +3846,7 @@ static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh, | |||
3777 | case check_state_compute_run: | 3846 | case check_state_compute_run: |
3778 | break; | 3847 | break; |
3779 | default: | 3848 | default: |
3780 | printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n", | 3849 | pr_err("%s: unknown check_state: %d sector: %llu\n", |
3781 | __func__, sh->check_state, | 3850 | __func__, sh->check_state, |
3782 | (unsigned long long) sh->sector); | 3851 | (unsigned long long) sh->sector); |
3783 | BUG(); | 3852 | BUG(); |
@@ -3941,9 +4010,9 @@ static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh, | |||
3941 | case check_state_compute_run: | 4010 | case check_state_compute_run: |
3942 | break; | 4011 | break; |
3943 | default: | 4012 | default: |
3944 | printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n", | 4013 | pr_warn("%s: unknown check_state: %d sector: %llu\n", |
3945 | __func__, sh->check_state, | 4014 | __func__, sh->check_state, |
3946 | (unsigned long long) sh->sector); | 4015 | (unsigned long long) sh->sector); |
3947 | BUG(); | 4016 | BUG(); |
3948 | } | 4017 | } |
3949 | } | 4018 | } |
@@ -4183,6 +4252,11 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) | |||
4183 | if (rdev && !test_bit(Faulty, &rdev->flags)) | 4252 | if (rdev && !test_bit(Faulty, &rdev->flags)) |
4184 | do_recovery = 1; | 4253 | do_recovery = 1; |
4185 | } | 4254 | } |
4255 | |||
4256 | if (test_bit(R5_InJournal, &dev->flags)) | ||
4257 | s->injournal++; | ||
4258 | if (test_bit(R5_InJournal, &dev->flags) && dev->written) | ||
4259 | s->just_cached++; | ||
4186 | } | 4260 | } |
4187 | if (test_bit(STRIPE_SYNCING, &sh->state)) { | 4261 | if (test_bit(STRIPE_SYNCING, &sh->state)) { |
4188 | /* If there is a failed device being replaced, | 4262 | /* If there is a failed device being replaced, |
@@ -4411,7 +4485,8 @@ static void handle_stripe(struct stripe_head *sh) | |||
4411 | struct r5dev *dev = &sh->dev[i]; | 4485 | struct r5dev *dev = &sh->dev[i]; |
4412 | if (test_bit(R5_LOCKED, &dev->flags) && | 4486 | if (test_bit(R5_LOCKED, &dev->flags) && |
4413 | (i == sh->pd_idx || i == sh->qd_idx || | 4487 | (i == sh->pd_idx || i == sh->qd_idx || |
4414 | dev->written)) { | 4488 | dev->written || test_bit(R5_InJournal, |
4489 | &dev->flags))) { | ||
4415 | pr_debug("Writing block %d\n", i); | 4490 | pr_debug("Writing block %d\n", i); |
4416 | set_bit(R5_Wantwrite, &dev->flags); | 4491 | set_bit(R5_Wantwrite, &dev->flags); |
4417 | if (prexor) | 4492 | if (prexor) |
@@ -4451,6 +4526,10 @@ static void handle_stripe(struct stripe_head *sh) | |||
4451 | test_bit(R5_Discard, &qdev->flags)))))) | 4526 | test_bit(R5_Discard, &qdev->flags)))))) |
4452 | handle_stripe_clean_event(conf, sh, disks, &s.return_bi); | 4527 | handle_stripe_clean_event(conf, sh, disks, &s.return_bi); |
4453 | 4528 | ||
4529 | if (s.just_cached) | ||
4530 | r5c_handle_cached_data_endio(conf, sh, disks, &s.return_bi); | ||
4531 | r5l_stripe_write_finished(sh); | ||
4532 | |||
4454 | /* Now we might consider reading some blocks, either to check/generate | 4533 | /* Now we might consider reading some blocks, either to check/generate |
4455 | * parity, or to satisfy requests | 4534 | * parity, or to satisfy requests |
4456 | * or to load a block that is being partially written. | 4535 | * or to load a block that is being partially written. |
@@ -4462,14 +4541,51 @@ static void handle_stripe(struct stripe_head *sh) | |||
4462 | || s.expanding) | 4541 | || s.expanding) |
4463 | handle_stripe_fill(sh, &s, disks); | 4542 | handle_stripe_fill(sh, &s, disks); |
4464 | 4543 | ||
4465 | /* Now to consider new write requests and what else, if anything | 4544 | /* |
4466 | * should be read. We do not handle new writes when: | 4545 | * When the stripe finishes full journal write cycle (write to journal |
4546 | * and raid disk), this is the clean up procedure so it is ready for | ||
4547 | * next operation. | ||
4548 | */ | ||
4549 | r5c_finish_stripe_write_out(conf, sh, &s); | ||
4550 | |||
4551 | /* | ||
4552 | * Now to consider new write requests, cache write back and what else, | ||
4553 | * if anything should be read. We do not handle new writes when: | ||
4467 | * 1/ A 'write' operation (copy+xor) is already in flight. | 4554 | * 1/ A 'write' operation (copy+xor) is already in flight. |
4468 | * 2/ A 'check' operation is in flight, as it may clobber the parity | 4555 | * 2/ A 'check' operation is in flight, as it may clobber the parity |
4469 | * block. | 4556 | * block. |
4557 | * 3/ A r5c cache log write is in flight. | ||
4470 | */ | 4558 | */ |
4471 | if (s.to_write && !sh->reconstruct_state && !sh->check_state) | 4559 | |
4472 | handle_stripe_dirtying(conf, sh, &s, disks); | 4560 | if (!sh->reconstruct_state && !sh->check_state && !sh->log_io) { |
4561 | if (!r5c_is_writeback(conf->log)) { | ||
4562 | if (s.to_write) | ||
4563 | handle_stripe_dirtying(conf, sh, &s, disks); | ||
4564 | } else { /* write back cache */ | ||
4565 | int ret = 0; | ||
4566 | |||
4567 | /* First, try to handle writes in the caching phase */ | ||
4568 | if (s.to_write) | ||
4569 | ret = r5c_try_caching_write(conf, sh, &s, | ||
4570 | disks); | ||
4571 | /* | ||
4572 | * If the caching phase failed (ret == -EAGAIN) | ||
4573 | * OR | ||
4574 | * the stripe is under reclaim (!caching && injournal), | ||
4575 | * | ||
4576 | * fall back to handle_stripe_dirtying(). | ||
4577 | */ | ||
4578 | if (ret == -EAGAIN || | ||
4579 | /* stripe under reclaim: !caching && injournal */ | ||
4580 | (!test_bit(STRIPE_R5C_CACHING, &sh->state) && | ||
4581 | s.injournal > 0)) { | ||
4582 | ret = handle_stripe_dirtying(conf, sh, &s, | ||
4583 | disks); | ||
4584 | if (ret == -EAGAIN) | ||
4585 | goto finish; | ||
4586 | } | ||
4587 | } | ||
4588 | } | ||
4473 | 4589 | ||
4474 | /* maybe we need to check and possibly fix the parity for this stripe | 4590 | /* maybe we need to check and possibly fix the parity for this stripe |
4475 | * Any reads will already have been scheduled, so we just see if enough | 4591 | * Any reads will already have been scheduled, so we just see if enough |
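To summarize the new write decision in handle_stripe(): write-through arrays still go straight to handle_stripe_dirtying(), while write-back arrays first attempt r5c_try_caching_write() and only fall back to write-out when caching returns -EAGAIN or the stripe is under reclaim. A small, illustration-only model of that fallback test (plain flags stand in for the stripe state bits; this is not kernel code):

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

/* Illustration-only model of the fallback test in handle_stripe():
 * push a write-back stripe through handle_stripe_dirtying() when the caching
 * attempt returned -EAGAIN, or when the stripe is under reclaim (it has left
 * the caching phase but still has data in the journal). */
static bool fall_back_to_writeout(int caching_ret, bool caching_phase, int injournal)
{
	return caching_ret == -EAGAIN || (!caching_phase && injournal > 0);
}

int main(void)
{
	printf("%d\n", fall_back_to_writeout(0, false, 2));	/* 1: reclaim path */
	printf("%d\n", fall_back_to_writeout(0, true, 2));	/* 0: keep caching */
	return 0;
}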
@@ -4640,9 +4756,7 @@ finish: | |||
4640 | } | 4756 | } |
4641 | 4757 | ||
4642 | if (!bio_list_empty(&s.return_bi)) { | 4758 | if (!bio_list_empty(&s.return_bi)) { |
4643 | if (test_bit(MD_CHANGE_PENDING, &conf->mddev->flags) && | 4759 | if (test_bit(MD_SB_CHANGE_PENDING, &conf->mddev->sb_flags)) { |
4644 | (s.failed <= conf->max_degraded || | ||
4645 | conf->mddev->external == 0)) { | ||
4646 | spin_lock_irq(&conf->device_lock); | 4760 | spin_lock_irq(&conf->device_lock); |
4647 | bio_list_merge(&conf->return_bi, &s.return_bi); | 4761 | bio_list_merge(&conf->return_bi, &s.return_bi); |
4648 | spin_unlock_irq(&conf->device_lock); | 4762 | spin_unlock_irq(&conf->device_lock); |
@@ -4698,6 +4812,10 @@ static int raid5_congested(struct mddev *mddev, int bits) | |||
4698 | 4812 | ||
4699 | if (test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) | 4813 | if (test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) |
4700 | return 1; | 4814 | return 1; |
4815 | |||
4816 | /* Also check whether there is pressure on r5cache log space */ | ||
4817 | if (test_bit(R5C_LOG_TIGHT, &conf->cache_state)) | ||
4818 | return 1; | ||
4701 | if (conf->quiesce) | 4819 | if (conf->quiesce) |
4702 | return 1; | 4820 | return 1; |
4703 | if (atomic_read(&conf->empty_inactive_list_nr)) | 4821 | if (atomic_read(&conf->empty_inactive_list_nr)) |
@@ -5167,6 +5285,7 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi) | |||
5167 | int remaining; | 5285 | int remaining; |
5168 | DEFINE_WAIT(w); | 5286 | DEFINE_WAIT(w); |
5169 | bool do_prepare; | 5287 | bool do_prepare; |
5288 | bool do_flush = false; | ||
5170 | 5289 | ||
5171 | if (unlikely(bi->bi_opf & REQ_PREFLUSH)) { | 5290 | if (unlikely(bi->bi_opf & REQ_PREFLUSH)) { |
5172 | int ret = r5l_handle_flush_request(conf->log, bi); | 5291 | int ret = r5l_handle_flush_request(conf->log, bi); |
@@ -5178,6 +5297,11 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi) | |||
5178 | return; | 5297 | return; |
5179 | } | 5298 | } |
5180 | /* ret == -EAGAIN, fallback */ | 5299 | /* ret == -EAGAIN, fallback */ |
5300 | /* | ||
5301 | * If r5l_handle_flush_request() didn't clear REQ_PREFLUSH, | ||
5302 | * we need to flush the journal device. | ||
5303 | */ | ||
5304 | do_flush = bi->bi_opf & REQ_PREFLUSH; | ||
5181 | } | 5305 | } |
5182 | 5306 | ||
5183 | md_write_start(mddev, bi); | 5307 | md_write_start(mddev, bi); |
@@ -5188,6 +5312,7 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi) | |||
5188 | * data on failed drives. | 5312 | * data on failed drives. |
5189 | */ | 5313 | */ |
5190 | if (rw == READ && mddev->degraded == 0 && | 5314 | if (rw == READ && mddev->degraded == 0 && |
5315 | !r5c_is_writeback(conf->log) && | ||
5191 | mddev->reshape_position == MaxSector) { | 5316 | mddev->reshape_position == MaxSector) { |
5192 | bi = chunk_aligned_read(mddev, bi); | 5317 | bi = chunk_aligned_read(mddev, bi); |
5193 | if (!bi) | 5318 | if (!bi) |
@@ -5316,6 +5441,12 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi) | |||
5316 | do_prepare = true; | 5441 | do_prepare = true; |
5317 | goto retry; | 5442 | goto retry; |
5318 | } | 5443 | } |
5444 | if (do_flush) { | ||
5445 | set_bit(STRIPE_R5C_PREFLUSH, &sh->state); | ||
5446 | /* we only need to flush one stripe */ | ||
5447 | do_flush = false; | ||
5448 | } | ||
5449 | |||
5319 | set_bit(STRIPE_HANDLE, &sh->state); | 5450 | set_bit(STRIPE_HANDLE, &sh->state); |
5320 | clear_bit(STRIPE_DELAYED, &sh->state); | 5451 | clear_bit(STRIPE_DELAYED, &sh->state); |
5321 | if ((!sh->batch_head || sh == sh->batch_head) && | 5452 | if ((!sh->batch_head || sh == sh->batch_head) && |
@@ -5481,9 +5612,9 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk | |||
5481 | mddev->reshape_position = conf->reshape_progress; | 5612 | mddev->reshape_position = conf->reshape_progress; |
5482 | mddev->curr_resync_completed = sector_nr; | 5613 | mddev->curr_resync_completed = sector_nr; |
5483 | conf->reshape_checkpoint = jiffies; | 5614 | conf->reshape_checkpoint = jiffies; |
5484 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | 5615 | set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); |
5485 | md_wakeup_thread(mddev->thread); | 5616 | md_wakeup_thread(mddev->thread); |
5486 | wait_event(mddev->sb_wait, mddev->flags == 0 || | 5617 | wait_event(mddev->sb_wait, mddev->sb_flags == 0 || |
5487 | test_bit(MD_RECOVERY_INTR, &mddev->recovery)); | 5618 | test_bit(MD_RECOVERY_INTR, &mddev->recovery)); |
5488 | if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) | 5619 | if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) |
5489 | return 0; | 5620 | return 0; |
@@ -5579,10 +5710,10 @@ finish: | |||
5579 | mddev->reshape_position = conf->reshape_progress; | 5710 | mddev->reshape_position = conf->reshape_progress; |
5580 | mddev->curr_resync_completed = sector_nr; | 5711 | mddev->curr_resync_completed = sector_nr; |
5581 | conf->reshape_checkpoint = jiffies; | 5712 | conf->reshape_checkpoint = jiffies; |
5582 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | 5713 | set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); |
5583 | md_wakeup_thread(mddev->thread); | 5714 | md_wakeup_thread(mddev->thread); |
5584 | wait_event(mddev->sb_wait, | 5715 | wait_event(mddev->sb_wait, |
5585 | !test_bit(MD_CHANGE_DEVS, &mddev->flags) | 5716 | !test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags) |
5586 | || test_bit(MD_RECOVERY_INTR, &mddev->recovery)); | 5717 | || test_bit(MD_RECOVERY_INTR, &mddev->recovery)); |
5587 | if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) | 5718 | if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) |
5588 | goto ret; | 5719 | goto ret; |
@@ -5857,10 +5988,10 @@ static void raid5d(struct md_thread *thread) | |||
5857 | md_check_recovery(mddev); | 5988 | md_check_recovery(mddev); |
5858 | 5989 | ||
5859 | if (!bio_list_empty(&conf->return_bi) && | 5990 | if (!bio_list_empty(&conf->return_bi) && |
5860 | !test_bit(MD_CHANGE_PENDING, &mddev->flags)) { | 5991 | !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) { |
5861 | struct bio_list tmp = BIO_EMPTY_LIST; | 5992 | struct bio_list tmp = BIO_EMPTY_LIST; |
5862 | spin_lock_irq(&conf->device_lock); | 5993 | spin_lock_irq(&conf->device_lock); |
5863 | if (!test_bit(MD_CHANGE_PENDING, &mddev->flags)) { | 5994 | if (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) { |
5864 | bio_list_merge(&tmp, &conf->return_bi); | 5995 | bio_list_merge(&tmp, &conf->return_bi); |
5865 | bio_list_init(&conf->return_bi); | 5996 | bio_list_init(&conf->return_bi); |
5866 | } | 5997 | } |
@@ -5907,7 +6038,7 @@ static void raid5d(struct md_thread *thread) | |||
5907 | break; | 6038 | break; |
5908 | handled += batch_size; | 6039 | handled += batch_size; |
5909 | 6040 | ||
5910 | if (mddev->flags & ~(1<<MD_CHANGE_PENDING)) { | 6041 | if (mddev->sb_flags & ~(1 << MD_SB_CHANGE_PENDING)) { |
5911 | spin_unlock_irq(&conf->device_lock); | 6042 | spin_unlock_irq(&conf->device_lock); |
5912 | md_check_recovery(mddev); | 6043 | md_check_recovery(mddev); |
5913 | spin_lock_irq(&conf->device_lock); | 6044 | spin_lock_irq(&conf->device_lock); |
@@ -6237,6 +6368,7 @@ static struct attribute *raid5_attrs[] = { | |||
6237 | &raid5_group_thread_cnt.attr, | 6368 | &raid5_group_thread_cnt.attr, |
6238 | &raid5_skip_copy.attr, | 6369 | &raid5_skip_copy.attr, |
6239 | &raid5_rmw_level.attr, | 6370 | &raid5_rmw_level.attr, |
6371 | &r5c_journal_mode.attr, | ||
6240 | NULL, | 6372 | NULL, |
6241 | }; | 6373 | }; |
6242 | static struct attribute_group raid5_attrs_group = { | 6374 | static struct attribute_group raid5_attrs_group = { |
@@ -6363,6 +6495,8 @@ static void raid5_free_percpu(struct r5conf *conf) | |||
6363 | 6495 | ||
6364 | static void free_conf(struct r5conf *conf) | 6496 | static void free_conf(struct r5conf *conf) |
6365 | { | 6497 | { |
6498 | int i; | ||
6499 | |||
6366 | if (conf->log) | 6500 | if (conf->log) |
6367 | r5l_exit_log(conf->log); | 6501 | r5l_exit_log(conf->log); |
6368 | if (conf->shrinker.nr_deferred) | 6502 | if (conf->shrinker.nr_deferred) |
@@ -6371,6 +6505,9 @@ static void free_conf(struct r5conf *conf) | |||
6371 | free_thread_groups(conf); | 6505 | free_thread_groups(conf); |
6372 | shrink_stripes(conf); | 6506 | shrink_stripes(conf); |
6373 | raid5_free_percpu(conf); | 6507 | raid5_free_percpu(conf); |
6508 | for (i = 0; i < conf->pool_size; i++) | ||
6509 | if (conf->disks[i].extra_page) | ||
6510 | put_page(conf->disks[i].extra_page); | ||
6374 | kfree(conf->disks); | 6511 | kfree(conf->disks); |
6375 | kfree(conf->stripe_hashtbl); | 6512 | kfree(conf->stripe_hashtbl); |
6376 | kfree(conf); | 6513 | kfree(conf); |
@@ -6382,8 +6519,8 @@ static int raid456_cpu_up_prepare(unsigned int cpu, struct hlist_node *node) | |||
6382 | struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu); | 6519 | struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu); |
6383 | 6520 | ||
6384 | if (alloc_scratch_buffer(conf, percpu)) { | 6521 | if (alloc_scratch_buffer(conf, percpu)) { |
6385 | pr_err("%s: failed memory allocation for cpu%u\n", | 6522 | pr_warn("%s: failed memory allocation for cpu%u\n", |
6386 | __func__, cpu); | 6523 | __func__, cpu); |
6387 | return -ENOMEM; | 6524 | return -ENOMEM; |
6388 | } | 6525 | } |
6389 | return 0; | 6526 | return 0; |
@@ -6453,29 +6590,29 @@ static struct r5conf *setup_conf(struct mddev *mddev) | |||
6453 | if (mddev->new_level != 5 | 6590 | if (mddev->new_level != 5 |
6454 | && mddev->new_level != 4 | 6591 | && mddev->new_level != 4 |
6455 | && mddev->new_level != 6) { | 6592 | && mddev->new_level != 6) { |
6456 | printk(KERN_ERR "md/raid:%s: raid level not set to 4/5/6 (%d)\n", | 6593 | pr_warn("md/raid:%s: raid level not set to 4/5/6 (%d)\n", |
6457 | mdname(mddev), mddev->new_level); | 6594 | mdname(mddev), mddev->new_level); |
6458 | return ERR_PTR(-EIO); | 6595 | return ERR_PTR(-EIO); |
6459 | } | 6596 | } |
6460 | if ((mddev->new_level == 5 | 6597 | if ((mddev->new_level == 5 |
6461 | && !algorithm_valid_raid5(mddev->new_layout)) || | 6598 | && !algorithm_valid_raid5(mddev->new_layout)) || |
6462 | (mddev->new_level == 6 | 6599 | (mddev->new_level == 6 |
6463 | && !algorithm_valid_raid6(mddev->new_layout))) { | 6600 | && !algorithm_valid_raid6(mddev->new_layout))) { |
6464 | printk(KERN_ERR "md/raid:%s: layout %d not supported\n", | 6601 | pr_warn("md/raid:%s: layout %d not supported\n", |
6465 | mdname(mddev), mddev->new_layout); | 6602 | mdname(mddev), mddev->new_layout); |
6466 | return ERR_PTR(-EIO); | 6603 | return ERR_PTR(-EIO); |
6467 | } | 6604 | } |
6468 | if (mddev->new_level == 6 && mddev->raid_disks < 4) { | 6605 | if (mddev->new_level == 6 && mddev->raid_disks < 4) { |
6469 | printk(KERN_ERR "md/raid:%s: not enough configured devices (%d, minimum 4)\n", | 6606 | pr_warn("md/raid:%s: not enough configured devices (%d, minimum 4)\n", |
6470 | mdname(mddev), mddev->raid_disks); | 6607 | mdname(mddev), mddev->raid_disks); |
6471 | return ERR_PTR(-EINVAL); | 6608 | return ERR_PTR(-EINVAL); |
6472 | } | 6609 | } |
6473 | 6610 | ||
6474 | if (!mddev->new_chunk_sectors || | 6611 | if (!mddev->new_chunk_sectors || |
6475 | (mddev->new_chunk_sectors << 9) % PAGE_SIZE || | 6612 | (mddev->new_chunk_sectors << 9) % PAGE_SIZE || |
6476 | !is_power_of_2(mddev->new_chunk_sectors)) { | 6613 | !is_power_of_2(mddev->new_chunk_sectors)) { |
6477 | printk(KERN_ERR "md/raid:%s: invalid chunk size %d\n", | 6614 | pr_warn("md/raid:%s: invalid chunk size %d\n", |
6478 | mdname(mddev), mddev->new_chunk_sectors << 9); | 6615 | mdname(mddev), mddev->new_chunk_sectors << 9); |
6479 | return ERR_PTR(-EINVAL); | 6616 | return ERR_PTR(-EINVAL); |
6480 | } | 6617 | } |
6481 | 6618 | ||
@@ -6517,9 +6654,16 @@ static struct r5conf *setup_conf(struct mddev *mddev) | |||
6517 | 6654 | ||
6518 | conf->disks = kzalloc(max_disks * sizeof(struct disk_info), | 6655 | conf->disks = kzalloc(max_disks * sizeof(struct disk_info), |
6519 | GFP_KERNEL); | 6656 | GFP_KERNEL); |
6657 | |||
6520 | if (!conf->disks) | 6658 | if (!conf->disks) |
6521 | goto abort; | 6659 | goto abort; |
6522 | 6660 | ||
6661 | for (i = 0; i < max_disks; i++) { | ||
6662 | conf->disks[i].extra_page = alloc_page(GFP_KERNEL); | ||
6663 | if (!conf->disks[i].extra_page) | ||
6664 | goto abort; | ||
6665 | } | ||
6666 | |||
6523 | conf->mddev = mddev; | 6667 | conf->mddev = mddev; |
6524 | 6668 | ||
6525 | if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL) | 6669 | if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL) |
@@ -6540,6 +6684,11 @@ static struct r5conf *setup_conf(struct mddev *mddev) | |||
6540 | for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) | 6684 | for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) |
6541 | INIT_LIST_HEAD(conf->temp_inactive_list + i); | 6685 | INIT_LIST_HEAD(conf->temp_inactive_list + i); |
6542 | 6686 | ||
6687 | atomic_set(&conf->r5c_cached_full_stripes, 0); | ||
6688 | INIT_LIST_HEAD(&conf->r5c_full_stripe_list); | ||
6689 | atomic_set(&conf->r5c_cached_partial_stripes, 0); | ||
6690 | INIT_LIST_HEAD(&conf->r5c_partial_stripe_list); | ||
6691 | |||
6543 | conf->level = mddev->new_level; | 6692 | conf->level = mddev->new_level; |
6544 | conf->chunk_sectors = mddev->new_chunk_sectors; | 6693 | conf->chunk_sectors = mddev->new_chunk_sectors; |
6545 | if (raid5_alloc_percpu(conf) != 0) | 6694 | if (raid5_alloc_percpu(conf) != 0) |
@@ -6566,9 +6715,8 @@ static struct r5conf *setup_conf(struct mddev *mddev) | |||
6566 | 6715 | ||
6567 | if (test_bit(In_sync, &rdev->flags)) { | 6716 | if (test_bit(In_sync, &rdev->flags)) { |
6568 | char b[BDEVNAME_SIZE]; | 6717 | char b[BDEVNAME_SIZE]; |
6569 | printk(KERN_INFO "md/raid:%s: device %s operational as raid" | 6718 | pr_info("md/raid:%s: device %s operational as raid disk %d\n", |
6570 | " disk %d\n", | 6719 | mdname(mddev), bdevname(rdev->bdev, b), raid_disk); |
6571 | mdname(mddev), bdevname(rdev->bdev, b), raid_disk); | ||
6572 | } else if (rdev->saved_raid_disk != raid_disk) | 6720 | } else if (rdev->saved_raid_disk != raid_disk) |
6573 | /* Cannot rely on bitmap to complete recovery */ | 6721 | /* Cannot rely on bitmap to complete recovery */ |
6574 | conf->fullsync = 1; | 6722 | conf->fullsync = 1; |
@@ -6602,21 +6750,18 @@ static struct r5conf *setup_conf(struct mddev *mddev) | |||
6602 | ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4); | 6750 | ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4); |
6603 | conf->min_nr_stripes = max(NR_STRIPES, stripes); | 6751 | conf->min_nr_stripes = max(NR_STRIPES, stripes); |
6604 | if (conf->min_nr_stripes != NR_STRIPES) | 6752 | if (conf->min_nr_stripes != NR_STRIPES) |
6605 | printk(KERN_INFO | 6753 | pr_info("md/raid:%s: force stripe size %d for reshape\n", |
6606 | "md/raid:%s: force stripe size %d for reshape\n", | ||
6607 | mdname(mddev), conf->min_nr_stripes); | 6754 | mdname(mddev), conf->min_nr_stripes); |
6608 | } | 6755 | } |
6609 | memory = conf->min_nr_stripes * (sizeof(struct stripe_head) + | 6756 | memory = conf->min_nr_stripes * (sizeof(struct stripe_head) + |
6610 | max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; | 6757 | max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; |
6611 | atomic_set(&conf->empty_inactive_list_nr, NR_STRIPE_HASH_LOCKS); | 6758 | atomic_set(&conf->empty_inactive_list_nr, NR_STRIPE_HASH_LOCKS); |
6612 | if (grow_stripes(conf, conf->min_nr_stripes)) { | 6759 | if (grow_stripes(conf, conf->min_nr_stripes)) { |
6613 | printk(KERN_ERR | 6760 | pr_warn("md/raid:%s: couldn't allocate %dkB for buffers\n", |
6614 | "md/raid:%s: couldn't allocate %dkB for buffers\n", | 6761 | mdname(mddev), memory); |
6615 | mdname(mddev), memory); | ||
6616 | goto abort; | 6762 | goto abort; |
6617 | } else | 6763 | } else |
6618 | printk(KERN_INFO "md/raid:%s: allocated %dkB\n", | 6764 | pr_debug("md/raid:%s: allocated %dkB\n", mdname(mddev), memory); |
6619 | mdname(mddev), memory); | ||
6620 | /* | 6765 | /* |
6621 | * Losing a stripe head costs more than the time to refill it, | 6766 | * Losing a stripe head costs more than the time to refill it, |
6622 | * it reduces the queue depth and so can hurt throughput. | 6767 | * it reduces the queue depth and so can hurt throughput. |
@@ -6628,18 +6773,16 @@ static struct r5conf *setup_conf(struct mddev *mddev) | |||
6628 | conf->shrinker.batch = 128; | 6773 | conf->shrinker.batch = 128; |
6629 | conf->shrinker.flags = 0; | 6774 | conf->shrinker.flags = 0; |
6630 | if (register_shrinker(&conf->shrinker)) { | 6775 | if (register_shrinker(&conf->shrinker)) { |
6631 | printk(KERN_ERR | 6776 | pr_warn("md/raid:%s: couldn't register shrinker.\n", |
6632 | "md/raid:%s: couldn't register shrinker.\n", | 6777 | mdname(mddev)); |
6633 | mdname(mddev)); | ||
6634 | goto abort; | 6778 | goto abort; |
6635 | } | 6779 | } |
6636 | 6780 | ||
6637 | sprintf(pers_name, "raid%d", mddev->new_level); | 6781 | sprintf(pers_name, "raid%d", mddev->new_level); |
6638 | conf->thread = md_register_thread(raid5d, mddev, pers_name); | 6782 | conf->thread = md_register_thread(raid5d, mddev, pers_name); |
6639 | if (!conf->thread) { | 6783 | if (!conf->thread) { |
6640 | printk(KERN_ERR | 6784 | pr_warn("md/raid:%s: couldn't allocate thread.\n", |
6641 | "md/raid:%s: couldn't allocate thread.\n", | 6785 | mdname(mddev)); |
6642 | mdname(mddev)); | ||
6643 | goto abort; | 6786 | goto abort; |
6644 | } | 6787 | } |
6645 | 6788 | ||
@@ -6692,9 +6835,8 @@ static int raid5_run(struct mddev *mddev) | |||
6692 | int first = 1; | 6835 | int first = 1; |
6693 | 6836 | ||
6694 | if (mddev->recovery_cp != MaxSector) | 6837 | if (mddev->recovery_cp != MaxSector) |
6695 | printk(KERN_NOTICE "md/raid:%s: not clean" | 6838 | pr_notice("md/raid:%s: not clean -- starting background reconstruction\n", |
6696 | " -- starting background reconstruction\n", | 6839 | mdname(mddev)); |
6697 | mdname(mddev)); | ||
6698 | 6840 | ||
6699 | rdev_for_each(rdev, mddev) { | 6841 | rdev_for_each(rdev, mddev) { |
6700 | long long diff; | 6842 | long long diff; |
@@ -6737,15 +6879,14 @@ static int raid5_run(struct mddev *mddev) | |||
6737 | int new_data_disks; | 6879 | int new_data_disks; |
6738 | 6880 | ||
6739 | if (journal_dev) { | 6881 | if (journal_dev) { |
6740 | printk(KERN_ERR "md/raid:%s: don't support reshape with journal - aborting.\n", | 6882 | pr_warn("md/raid:%s: don't support reshape with journal - aborting.\n", |
6741 | mdname(mddev)); | 6883 | mdname(mddev)); |
6742 | return -EINVAL; | 6884 | return -EINVAL; |
6743 | } | 6885 | } |
6744 | 6886 | ||
6745 | if (mddev->new_level != mddev->level) { | 6887 | if (mddev->new_level != mddev->level) { |
6746 | printk(KERN_ERR "md/raid:%s: unsupported reshape " | 6888 | pr_warn("md/raid:%s: unsupported reshape required - aborting.\n", |
6747 | "required - aborting.\n", | 6889 | mdname(mddev)); |
6748 | mdname(mddev)); | ||
6749 | return -EINVAL; | 6890 | return -EINVAL; |
6750 | } | 6891 | } |
6751 | old_disks = mddev->raid_disks - mddev->delta_disks; | 6892 | old_disks = mddev->raid_disks - mddev->delta_disks; |
@@ -6760,8 +6901,8 @@ static int raid5_run(struct mddev *mddev) | |||
6760 | chunk_sectors = max(mddev->chunk_sectors, mddev->new_chunk_sectors); | 6901 | chunk_sectors = max(mddev->chunk_sectors, mddev->new_chunk_sectors); |
6761 | new_data_disks = mddev->raid_disks - max_degraded; | 6902 | new_data_disks = mddev->raid_disks - max_degraded; |
6762 | if (sector_div(here_new, chunk_sectors * new_data_disks)) { | 6903 | if (sector_div(here_new, chunk_sectors * new_data_disks)) { |
6763 | printk(KERN_ERR "md/raid:%s: reshape_position not " | 6904 | pr_warn("md/raid:%s: reshape_position not on a stripe boundary\n", |
6764 | "on a stripe boundary\n", mdname(mddev)); | 6905 | mdname(mddev)); |
6765 | return -EINVAL; | 6906 | return -EINVAL; |
6766 | } | 6907 | } |
6767 | reshape_offset = here_new * chunk_sectors; | 6908 | reshape_offset = here_new * chunk_sectors; |
@@ -6782,10 +6923,8 @@ static int raid5_run(struct mddev *mddev) | |||
6782 | abs(min_offset_diff) >= mddev->new_chunk_sectors) | 6923 | abs(min_offset_diff) >= mddev->new_chunk_sectors) |
6783 | /* not really in-place - so OK */; | 6924 | /* not really in-place - so OK */; |
6784 | else if (mddev->ro == 0) { | 6925 | else if (mddev->ro == 0) { |
6785 | printk(KERN_ERR "md/raid:%s: in-place reshape " | 6926 | pr_warn("md/raid:%s: in-place reshape must be started in read-only mode - aborting\n", |
6786 | "must be started in read-only mode " | 6927 | mdname(mddev)); |
6787 | "- aborting\n", | ||
6788 | mdname(mddev)); | ||
6789 | return -EINVAL; | 6928 | return -EINVAL; |
6790 | } | 6929 | } |
6791 | } else if (mddev->reshape_backwards | 6930 | } else if (mddev->reshape_backwards |
@@ -6794,13 +6933,11 @@ static int raid5_run(struct mddev *mddev) | |||
6794 | : (here_new * chunk_sectors >= | 6933 | : (here_new * chunk_sectors >= |
6795 | here_old * chunk_sectors + (-min_offset_diff))) { | 6934 | here_old * chunk_sectors + (-min_offset_diff))) { |
6796 | /* Reading from the same stripe as writing to - bad */ | 6935 | /* Reading from the same stripe as writing to - bad */ |
6797 | printk(KERN_ERR "md/raid:%s: reshape_position too early for " | 6936 | pr_warn("md/raid:%s: reshape_position too early for auto-recovery - aborting.\n", |
6798 | "auto-recovery - aborting.\n", | 6937 | mdname(mddev)); |
6799 | mdname(mddev)); | ||
6800 | return -EINVAL; | 6938 | return -EINVAL; |
6801 | } | 6939 | } |
6802 | printk(KERN_INFO "md/raid:%s: reshape will continue\n", | 6940 | pr_debug("md/raid:%s: reshape will continue\n", mdname(mddev)); |
6803 | mdname(mddev)); | ||
6804 | /* OK, we should be able to continue; */ | 6941 | /* OK, we should be able to continue; */ |
6805 | } else { | 6942 | } else { |
6806 | BUG_ON(mddev->level != mddev->new_level); | 6943 | BUG_ON(mddev->level != mddev->new_level); |
@@ -6819,8 +6956,8 @@ static int raid5_run(struct mddev *mddev) | |||
6819 | 6956 | ||
6820 | if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) { | 6957 | if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) { |
6821 | if (!journal_dev) { | 6958 | if (!journal_dev) { |
6822 | pr_err("md/raid:%s: journal disk is missing, force array readonly\n", | 6959 | pr_warn("md/raid:%s: journal disk is missing, force array readonly\n", |
6823 | mdname(mddev)); | 6960 | mdname(mddev)); |
6824 | mddev->ro = 1; | 6961 | mddev->ro = 1; |
6825 | set_disk_ro(mddev->gendisk, 1); | 6962 | set_disk_ro(mddev->gendisk, 1); |
6826 | } else if (mddev->recovery_cp == MaxSector) | 6963 | } else if (mddev->recovery_cp == MaxSector) |
@@ -6847,8 +6984,7 @@ static int raid5_run(struct mddev *mddev) | |||
6847 | if (conf->disks[i].replacement && | 6984 | if (conf->disks[i].replacement && |
6848 | conf->reshape_progress != MaxSector) { | 6985 | conf->reshape_progress != MaxSector) { |
6849 | /* replacements and reshape simply do not mix. */ | 6986 | /* replacements and reshape simply do not mix. */ |
6850 | printk(KERN_ERR "md: cannot handle concurrent " | 6987 | pr_warn("md: cannot handle concurrent replacement and reshape.\n"); |
6851 | "replacement and reshape.\n"); | ||
6852 | goto abort; | 6988 | goto abort; |
6853 | } | 6989 | } |
6854 | if (test_bit(In_sync, &rdev->flags)) { | 6990 | if (test_bit(In_sync, &rdev->flags)) { |
@@ -6890,8 +7026,7 @@ static int raid5_run(struct mddev *mddev) | |||
6890 | mddev->degraded = calc_degraded(conf); | 7026 | mddev->degraded = calc_degraded(conf); |
6891 | 7027 | ||
6892 | if (has_failed(conf)) { | 7028 | if (has_failed(conf)) { |
6893 | printk(KERN_ERR "md/raid:%s: not enough operational devices" | 7029 | pr_crit("md/raid:%s: not enough operational devices (%d/%d failed)\n", |
6894 | " (%d/%d failed)\n", | ||
6895 | mdname(mddev), mddev->degraded, conf->raid_disks); | 7030 | mdname(mddev), mddev->degraded, conf->raid_disks); |
6896 | goto abort; | 7031 | goto abort; |
6897 | } | 7032 | } |
@@ -6903,29 +7038,19 @@ static int raid5_run(struct mddev *mddev) | |||
6903 | if (mddev->degraded > dirty_parity_disks && | 7038 | if (mddev->degraded > dirty_parity_disks && |
6904 | mddev->recovery_cp != MaxSector) { | 7039 | mddev->recovery_cp != MaxSector) { |
6905 | if (mddev->ok_start_degraded) | 7040 | if (mddev->ok_start_degraded) |
6906 | printk(KERN_WARNING | 7041 | pr_crit("md/raid:%s: starting dirty degraded array - data corruption possible.\n", |
6907 | "md/raid:%s: starting dirty degraded array" | 7042 | mdname(mddev)); |
6908 | " - data corruption possible.\n", | ||
6909 | mdname(mddev)); | ||
6910 | else { | 7043 | else { |
6911 | printk(KERN_ERR | 7044 | pr_crit("md/raid:%s: cannot start dirty degraded array.\n", |
6912 | "md/raid:%s: cannot start dirty degraded array.\n", | 7045 | mdname(mddev)); |
6913 | mdname(mddev)); | ||
6914 | goto abort; | 7046 | goto abort; |
6915 | } | 7047 | } |
6916 | } | 7048 | } |
6917 | 7049 | ||
6918 | if (mddev->degraded == 0) | 7050 | pr_info("md/raid:%s: raid level %d active with %d out of %d devices, algorithm %d\n", |
6919 | printk(KERN_INFO "md/raid:%s: raid level %d active with %d out of %d" | 7051 | mdname(mddev), conf->level, |
6920 | " devices, algorithm %d\n", mdname(mddev), conf->level, | 7052 | mddev->raid_disks-mddev->degraded, mddev->raid_disks, |
6921 | mddev->raid_disks-mddev->degraded, mddev->raid_disks, | 7053 | mddev->new_layout); |
6922 | mddev->new_layout); | ||
6923 | else | ||
6924 | printk(KERN_ALERT "md/raid:%s: raid level %d active with %d" | ||
6925 | " out of %d devices, algorithm %d\n", | ||
6926 | mdname(mddev), conf->level, | ||
6927 | mddev->raid_disks - mddev->degraded, | ||
6928 | mddev->raid_disks, mddev->new_layout); | ||
6929 | 7054 | ||
6930 | print_raid5_conf(conf); | 7055 | print_raid5_conf(conf); |
6931 | 7056 | ||
@@ -6945,9 +7070,8 @@ static int raid5_run(struct mddev *mddev) | |||
6945 | mddev->to_remove = NULL; | 7070 | mddev->to_remove = NULL; |
6946 | else if (mddev->kobj.sd && | 7071 | else if (mddev->kobj.sd && |
6947 | sysfs_create_group(&mddev->kobj, &raid5_attrs_group)) | 7072 | sysfs_create_group(&mddev->kobj, &raid5_attrs_group)) |
6948 | printk(KERN_WARNING | 7073 | pr_warn("raid5: failed to create sysfs attributes for %s\n", |
6949 | "raid5: failed to create sysfs attributes for %s\n", | 7074 | mdname(mddev)); |
6950 | mdname(mddev)); | ||
6951 | md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); | 7075 | md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); |
6952 | 7076 | ||
6953 | if (mddev->queue) { | 7077 | if (mddev->queue) { |
@@ -6979,6 +7103,15 @@ static int raid5_run(struct mddev *mddev) | |||
6979 | stripe = (stripe | (stripe-1)) + 1; | 7103 | stripe = (stripe | (stripe-1)) + 1; |
6980 | mddev->queue->limits.discard_alignment = stripe; | 7104 | mddev->queue->limits.discard_alignment = stripe; |
6981 | mddev->queue->limits.discard_granularity = stripe; | 7105 | mddev->queue->limits.discard_granularity = stripe; |
7106 | |||
7107 | /* | ||
7108 | * We use a 16-bit counter of active stripes in bi_phys_segments | ||
7109 | * (minus one, since the initial value is overloaded) | ||
7110 | */ | ||
7111 | blk_queue_max_hw_sectors(mddev->queue, 0xfffe * STRIPE_SECTORS); | ||
7112 | blk_queue_max_discard_sectors(mddev->queue, | ||
7113 | 0xfffe * STRIPE_SECTORS); | ||
7114 | |||
6982 | /* | 7115 | /* |
6983 | * unaligned part of discard request will be ignored, so can't | 7116 | * unaligned part of discard request will be ignored, so can't |
6984 | * guarantee discard_zeroes_data | 7117 | * guarantee discard_zeroes_data |
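
The two queue-limit calls added here keep a single request small enough that the 16-bit active-stripe count packed into bio->bi_phys_segments (see the helpers in the raid5.h hunk below) cannot overflow: at most 0xfffe stripes of STRIPE_SECTORS sectors each. With a 4 KiB page that is 524272 sectors, just under 256 MiB per request. A small worked check of that arithmetic, assuming PAGE_SIZE is 4096:

/* Hedged sketch (userspace): the arithmetic behind the 0xfffe *
 * STRIPE_SECTORS cap, assuming a 4 KiB PAGE_SIZE.
 */
#include <stdio.h>

#define DEMO_PAGE_SIZE		4096UL
#define DEMO_STRIPE_SECTORS	(DEMO_PAGE_SIZE >> 9)	/* 8 sectors per stripe page */

int main(void)
{
	unsigned long max_sectors = 0xfffe * DEMO_STRIPE_SECTORS;

	printf("max sectors per request: %lu\n", max_sectors);		/* 524272 */
	printf("max bytes per request:   %lu\n", max_sectors << 9);	/* 268427264, just under 256 MiB */
	return 0;
}
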
@@ -7035,9 +7168,10 @@ static int raid5_run(struct mddev *mddev) | |||
7035 | if (journal_dev) { | 7168 | if (journal_dev) { |
7036 | char b[BDEVNAME_SIZE]; | 7169 | char b[BDEVNAME_SIZE]; |
7037 | 7170 | ||
7038 | printk(KERN_INFO"md/raid:%s: using device %s as journal\n", | 7171 | pr_debug("md/raid:%s: using device %s as journal\n", |
7039 | mdname(mddev), bdevname(journal_dev->bdev, b)); | 7172 | mdname(mddev), bdevname(journal_dev->bdev, b)); |
7040 | r5l_init_log(conf, journal_dev); | 7173 | if (r5l_init_log(conf, journal_dev)) |
7174 | goto abort; | ||
7041 | } | 7175 | } |
7042 | 7176 | ||
7043 | return 0; | 7177 | return 0; |
@@ -7046,7 +7180,7 @@ abort: | |||
7046 | print_raid5_conf(conf); | 7180 | print_raid5_conf(conf); |
7047 | free_conf(conf); | 7181 | free_conf(conf); |
7048 | mddev->private = NULL; | 7182 | mddev->private = NULL; |
7049 | printk(KERN_ALERT "md/raid:%s: failed to run raid set.\n", mdname(mddev)); | 7183 | pr_warn("md/raid:%s: failed to run raid set.\n", mdname(mddev)); |
7050 | return -EIO; | 7184 | return -EIO; |
7051 | } | 7185 | } |
7052 | 7186 | ||
@@ -7080,12 +7214,12 @@ static void print_raid5_conf (struct r5conf *conf) | |||
7080 | int i; | 7214 | int i; |
7081 | struct disk_info *tmp; | 7215 | struct disk_info *tmp; |
7082 | 7216 | ||
7083 | printk(KERN_DEBUG "RAID conf printout:\n"); | 7217 | pr_debug("RAID conf printout:\n"); |
7084 | if (!conf) { | 7218 | if (!conf) { |
7085 | printk("(conf==NULL)\n"); | 7219 | pr_debug("(conf==NULL)\n"); |
7086 | return; | 7220 | return; |
7087 | } | 7221 | } |
7088 | printk(KERN_DEBUG " --- level:%d rd:%d wd:%d\n", conf->level, | 7222 | pr_debug(" --- level:%d rd:%d wd:%d\n", conf->level, |
7089 | conf->raid_disks, | 7223 | conf->raid_disks, |
7090 | conf->raid_disks - conf->mddev->degraded); | 7224 | conf->raid_disks - conf->mddev->degraded); |
7091 | 7225 | ||
@@ -7093,7 +7227,7 @@ static void print_raid5_conf (struct r5conf *conf) | |||
7093 | char b[BDEVNAME_SIZE]; | 7227 | char b[BDEVNAME_SIZE]; |
7094 | tmp = conf->disks + i; | 7228 | tmp = conf->disks + i; |
7095 | if (tmp->rdev) | 7229 | if (tmp->rdev) |
7096 | printk(KERN_DEBUG " disk %d, o:%d, dev:%s\n", | 7230 | pr_debug(" disk %d, o:%d, dev:%s\n", |
7097 | i, !test_bit(Faulty, &tmp->rdev->flags), | 7231 | i, !test_bit(Faulty, &tmp->rdev->flags), |
7098 | bdevname(tmp->rdev->bdev, b)); | 7232 | bdevname(tmp->rdev->bdev, b)); |
7099 | } | 7233 | } |
@@ -7241,8 +7375,8 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev) | |||
7241 | * write requests running. We should be safe | 7375 | * write requests running. We should be safe |
7242 | */ | 7376 | */ |
7243 | r5l_init_log(conf, rdev); | 7377 | r5l_init_log(conf, rdev); |
7244 | printk(KERN_INFO"md/raid:%s: using device %s as journal\n", | 7378 | pr_debug("md/raid:%s: using device %s as journal\n", |
7245 | mdname(mddev), bdevname(rdev->bdev, b)); | 7379 | mdname(mddev), bdevname(rdev->bdev, b)); |
7246 | return 0; | 7380 | return 0; |
7247 | } | 7381 | } |
7248 | if (mddev->recovery_disabled == conf->recovery_disabled) | 7382 | if (mddev->recovery_disabled == conf->recovery_disabled) |
@@ -7346,10 +7480,10 @@ static int check_stripe_cache(struct mddev *mddev) | |||
7346 | > conf->min_nr_stripes || | 7480 | > conf->min_nr_stripes || |
7347 | ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4 | 7481 | ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4 |
7348 | > conf->min_nr_stripes) { | 7482 | > conf->min_nr_stripes) { |
7349 | printk(KERN_WARNING "md/raid:%s: reshape: not enough stripes. Needed %lu\n", | 7483 | pr_warn("md/raid:%s: reshape: not enough stripes. Needed %lu\n", |
7350 | mdname(mddev), | 7484 | mdname(mddev), |
7351 | ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9) | 7485 | ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9) |
7352 | / STRIPE_SIZE)*4); | 7486 | / STRIPE_SIZE)*4); |
7353 | return 0; | 7487 | return 0; |
7354 | } | 7488 | } |
7355 | return 1; | 7489 | return 1; |
@@ -7430,8 +7564,8 @@ static int raid5_start_reshape(struct mddev *mddev) | |||
7430 | */ | 7564 | */ |
7431 | if (raid5_size(mddev, 0, conf->raid_disks + mddev->delta_disks) | 7565 | if (raid5_size(mddev, 0, conf->raid_disks + mddev->delta_disks) |
7432 | < mddev->array_sectors) { | 7566 | < mddev->array_sectors) { |
7433 | printk(KERN_ERR "md/raid:%s: array size must be reduced " | 7567 | pr_warn("md/raid:%s: array size must be reduced before number of disks\n", |
7434 | "before number of disks\n", mdname(mddev)); | 7568 | mdname(mddev)); |
7435 | return -EINVAL; | 7569 | return -EINVAL; |
7436 | } | 7570 | } |
7437 | 7571 | ||
@@ -7501,7 +7635,7 @@ static int raid5_start_reshape(struct mddev *mddev) | |||
7501 | } | 7635 | } |
7502 | mddev->raid_disks = conf->raid_disks; | 7636 | mddev->raid_disks = conf->raid_disks; |
7503 | mddev->reshape_position = conf->reshape_progress; | 7637 | mddev->reshape_position = conf->reshape_progress; |
7504 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | 7638 | set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); |
7505 | 7639 | ||
7506 | clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); | 7640 | clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); |
7507 | clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); | 7641 | clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); |
@@ -7619,6 +7753,7 @@ static void raid5_quiesce(struct mddev *mddev, int state) | |||
7619 | /* '2' tells resync/reshape to pause so that all | 7753 | /* '2' tells resync/reshape to pause so that all |
7620 | * active stripes can drain | 7754 | * active stripes can drain |
7621 | */ | 7755 | */ |
7756 | r5c_flush_cache(conf, INT_MAX); | ||
7622 | conf->quiesce = 2; | 7757 | conf->quiesce = 2; |
7623 | wait_event_cmd(conf->wait_for_quiescent, | 7758 | wait_event_cmd(conf->wait_for_quiescent, |
7624 | atomic_read(&conf->active_stripes) == 0 && | 7759 | atomic_read(&conf->active_stripes) == 0 && |
@@ -7649,8 +7784,8 @@ static void *raid45_takeover_raid0(struct mddev *mddev, int level) | |||
7649 | 7784 | ||
7650 | /* for raid0 takeover only one zone is supported */ | 7785 | /* for raid0 takeover only one zone is supported */ |
7651 | if (raid0_conf->nr_strip_zones > 1) { | 7786 | if (raid0_conf->nr_strip_zones > 1) { |
7652 | printk(KERN_ERR "md/raid:%s: cannot takeover raid0 with more than one zone.\n", | 7787 | pr_warn("md/raid:%s: cannot takeover raid0 with more than one zone.\n", |
7653 | mdname(mddev)); | 7788 | mdname(mddev)); |
7654 | return ERR_PTR(-EINVAL); | 7789 | return ERR_PTR(-EINVAL); |
7655 | } | 7790 | } |
7656 | 7791 | ||
@@ -7671,6 +7806,7 @@ static void *raid45_takeover_raid0(struct mddev *mddev, int level) | |||
7671 | static void *raid5_takeover_raid1(struct mddev *mddev) | 7806 | static void *raid5_takeover_raid1(struct mddev *mddev) |
7672 | { | 7807 | { |
7673 | int chunksect; | 7808 | int chunksect; |
7809 | void *ret; | ||
7674 | 7810 | ||
7675 | if (mddev->raid_disks != 2 || | 7811 | if (mddev->raid_disks != 2 || |
7676 | mddev->degraded > 1) | 7812 | mddev->degraded > 1) |
@@ -7692,7 +7828,10 @@ static void *raid5_takeover_raid1(struct mddev *mddev) | |||
7692 | mddev->new_layout = ALGORITHM_LEFT_SYMMETRIC; | 7828 | mddev->new_layout = ALGORITHM_LEFT_SYMMETRIC; |
7693 | mddev->new_chunk_sectors = chunksect; | 7829 | mddev->new_chunk_sectors = chunksect; |
7694 | 7830 | ||
7695 | return setup_conf(mddev); | 7831 | ret = setup_conf(mddev); |
7832 | if (!IS_ERR_VALUE(ret)) | ||
7833 | clear_bit(MD_FAILFAST_SUPPORTED, &mddev->flags); | ||
7834 | return ret; | ||
7696 | } | 7835 | } |
7697 | 7836 | ||
7698 | static void *raid5_takeover_raid6(struct mddev *mddev) | 7837 | static void *raid5_takeover_raid6(struct mddev *mddev) |
@@ -7762,7 +7901,7 @@ static int raid5_check_reshape(struct mddev *mddev) | |||
7762 | conf->chunk_sectors = new_chunk ; | 7901 | conf->chunk_sectors = new_chunk ; |
7763 | mddev->chunk_sectors = new_chunk; | 7902 | mddev->chunk_sectors = new_chunk; |
7764 | } | 7903 | } |
7765 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | 7904 | set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); |
7766 | md_wakeup_thread(mddev->thread); | 7905 | md_wakeup_thread(mddev->thread); |
7767 | } | 7906 | } |
7768 | return check_reshape(mddev); | 7907 | return check_reshape(mddev); |
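
Note that both flag updates in this file now set MD_SB_CHANGE_DEVS in the new mddev->sb_flags word rather than MD_CHANGE_DEVS in mddev->flags; this series splits the superblock-dirty bits into their own field. The calling pattern itself is unchanged; a minimal sketch of it (the wrapper name is illustrative, not a kernel API):

/* Hedged sketch: the mark-superblock-dirty idiom after the flags split;
 * demo_mark_devs_changed() is an illustrative wrapper.
 */
#include "md.h"

static void demo_mark_devs_changed(struct mddev *mddev)
{
	set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);	/* superblock needs rewriting */
	md_wakeup_thread(mddev->thread);		/* md thread writes it out */
}
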
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 57ec49f0839e..ed8e1362ab36 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h | |||
@@ -226,6 +226,8 @@ struct stripe_head { | |||
226 | 226 | ||
227 | struct r5l_io_unit *log_io; | 227 | struct r5l_io_unit *log_io; |
228 | struct list_head log_list; | 228 | struct list_head log_list; |
229 | sector_t log_start; /* first meta block on the journal */ | ||
230 | struct list_head r5c; /* for r5c_cache->stripe_in_journal */ | ||
229 | /** | 231 | /** |
230 | * struct stripe_operations | 232 | * struct stripe_operations |
231 | * @target - STRIPE_OP_COMPUTE_BLK target | 233 | * @target - STRIPE_OP_COMPUTE_BLK target |
@@ -264,6 +266,7 @@ struct stripe_head_state { | |||
264 | int syncing, expanding, expanded, replacing; | 266 | int syncing, expanding, expanded, replacing; |
265 | int locked, uptodate, to_read, to_write, failed, written; | 267 | int locked, uptodate, to_read, to_write, failed, written; |
266 | int to_fill, compute, req_compute, non_overwrite; | 268 | int to_fill, compute, req_compute, non_overwrite; |
269 | int injournal, just_cached; | ||
267 | int failed_num[2]; | 270 | int failed_num[2]; |
268 | int p_failed, q_failed; | 271 | int p_failed, q_failed; |
269 | int dec_preread_active; | 272 | int dec_preread_active; |
@@ -273,6 +276,7 @@ struct stripe_head_state { | |||
273 | struct md_rdev *blocked_rdev; | 276 | struct md_rdev *blocked_rdev; |
274 | int handle_bad_blocks; | 277 | int handle_bad_blocks; |
275 | int log_failed; | 278 | int log_failed; |
279 | int waiting_extra_page; | ||
276 | }; | 280 | }; |
277 | 281 | ||
278 | /* Flags for struct r5dev.flags */ | 282 | /* Flags for struct r5dev.flags */ |
@@ -313,6 +317,11 @@ enum r5dev_flags { | |||
313 | */ | 317 | */ |
314 | R5_Discard, /* Discard the stripe */ | 318 | R5_Discard, /* Discard the stripe */ |
315 | R5_SkipCopy, /* Don't copy data from bio to stripe cache */ | 319 | R5_SkipCopy, /* Don't copy data from bio to stripe cache */ |
320 | R5_InJournal, /* data being written is in the journal device. | ||
321 | * if R5_InJournal is set for parity pd_idx, all the | ||
322 | * data and parity being written are in the journal | ||
323 | * device | ||
324 | */ | ||
316 | }; | 325 | }; |
317 | 326 | ||
318 | /* | 327 | /* |
@@ -345,7 +354,30 @@ enum { | |||
345 | STRIPE_BITMAP_PENDING, /* Being added to bitmap, don't add | 354 | STRIPE_BITMAP_PENDING, /* Being added to bitmap, don't add |
346 | * to batch yet. | 355 | * to batch yet. |
347 | */ | 356 | */ |
348 | STRIPE_LOG_TRAPPED, /* trapped into log */ | 357 | STRIPE_LOG_TRAPPED, /* trapped into log (see raid5-cache.c) |
358 | * this bit is used in two scenarios: | ||
359 | * | ||
360 | * 1. write-out phase | ||
361 | * set in first entry of r5l_write_stripe | ||
362 | * clear in second entry of r5l_write_stripe | ||
363 | * used to bypass logic in handle_stripe | ||
364 | * | ||
365 | * 2. caching phase | ||
366 | * set in r5c_try_caching_write() | ||
367 | * clear when journal write is done | ||
368 | * used to initiate r5c_cache_data() | ||
369 | * also used to bypass logic in handle_stripe | ||
370 | */ | ||
371 | STRIPE_R5C_CACHING, /* the stripe is in caching phase | ||
372 | * see more detail in raid5-cache.c | ||
373 | */ | ||
374 | STRIPE_R5C_PARTIAL_STRIPE, /* in r5c cache (to-be/being handled or | ||
375 | * in conf->r5c_partial_stripe_list) | ||
376 | */ | ||
377 | STRIPE_R5C_FULL_STRIPE, /* in r5c cache (to-be/being handled or | ||
378 | * in conf->r5c_full_stripe_list) | ||
379 | */ | ||
380 | STRIPE_R5C_PREFLUSH, /* need to flush journal device */ | ||
349 | }; | 381 | }; |
350 | 382 | ||
351 | #define STRIPE_EXPAND_SYNC_FLAGS \ | 383 | #define STRIPE_EXPAND_SYNC_FLAGS \ |
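
The expanded STRIPE_LOG_TRAPPED comment and the new STRIPE_R5C_* bits describe the cache's two-phase model: while STRIPE_R5C_CACHING is set, dirty data is written only to the journal; once the stripe is switched to write-out the bit is cleared and the normal parity path runs. A simplified sketch of a consumer of that bit follows; the driver's real decision logic in handle_stripe()/raid5-cache.c weighs many more conditions.

/* Hedged sketch: branching on STRIPE_R5C_CACHING; demo_handle_dirty() is
 * illustrative and much simpler than the driver's handle_stripe() logic.
 */
#include "raid5.h"

static void demo_handle_dirty(struct r5conf *conf, struct stripe_head *sh,
			      struct stripe_head_state *s)
{
	if (test_bit(STRIPE_R5C_CACHING, &sh->state)) {
		/* caching phase: dirty data goes to the journal device only */
		r5c_cache_data(conf->log, sh, s);
	} else {
		/* write-out phase: normal parity computation and member-disk writes */
		/* ... schedule reconstruct/prexor as before ... */
	}
}
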
@@ -408,8 +440,86 @@ enum { | |||
408 | 440 | ||
409 | struct disk_info { | 441 | struct disk_info { |
410 | struct md_rdev *rdev, *replacement; | 442 | struct md_rdev *rdev, *replacement; |
443 | struct page *extra_page; /* extra page to use in prexor */ | ||
411 | }; | 444 | }; |
412 | 445 | ||
446 | /* | ||
447 | * Stripe cache | ||
448 | */ | ||
449 | |||
450 | #define NR_STRIPES 256 | ||
451 | #define STRIPE_SIZE PAGE_SIZE | ||
452 | #define STRIPE_SHIFT (PAGE_SHIFT - 9) | ||
453 | #define STRIPE_SECTORS (STRIPE_SIZE>>9) | ||
454 | #define IO_THRESHOLD 1 | ||
455 | #define BYPASS_THRESHOLD 1 | ||
456 | #define NR_HASH (PAGE_SIZE / sizeof(struct hlist_head)) | ||
457 | #define HASH_MASK (NR_HASH - 1) | ||
458 | #define MAX_STRIPE_BATCH 8 | ||
459 | |||
460 | /* bio's attached to a stripe+device for I/O are linked together in bi_sector | ||
461 | * order without overlap. There may be several bio's per stripe+device, and | ||
462 | * a bio could span several devices. | ||
463 | * When walking this list for a particular stripe+device, we must never proceed | ||
464 | * beyond a bio that extends past this device, as the next bio might no longer | ||
465 | * be valid. | ||
466 | * This function is used to determine the 'next' bio in the list, given the | ||
467 | * sector of the current stripe+device | ||
468 | */ | ||
469 | static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector) | ||
470 | { | ||
471 | int sectors = bio_sectors(bio); | ||
472 | |||
473 | if (bio->bi_iter.bi_sector + sectors < sector + STRIPE_SECTORS) | ||
474 | return bio->bi_next; | ||
475 | else | ||
476 | return NULL; | ||
477 | } | ||
478 | |||
479 | /* | ||
480 | * We maintain a biased count of active stripes in the bottom 16 bits of | ||
481 | * bi_phys_segments, and a count of processed stripes in the upper 16 bits | ||
482 | */ | ||
483 | static inline int raid5_bi_processed_stripes(struct bio *bio) | ||
484 | { | ||
485 | atomic_t *segments = (atomic_t *)&bio->bi_phys_segments; | ||
486 | |||
487 | return (atomic_read(segments) >> 16) & 0xffff; | ||
488 | } | ||
489 | |||
490 | static inline int raid5_dec_bi_active_stripes(struct bio *bio) | ||
491 | { | ||
492 | atomic_t *segments = (atomic_t *)&bio->bi_phys_segments; | ||
493 | |||
494 | return atomic_sub_return(1, segments) & 0xffff; | ||
495 | } | ||
496 | |||
497 | static inline void raid5_inc_bi_active_stripes(struct bio *bio) | ||
498 | { | ||
499 | atomic_t *segments = (atomic_t *)&bio->bi_phys_segments; | ||
500 | |||
501 | atomic_inc(segments); | ||
502 | } | ||
503 | |||
504 | static inline void raid5_set_bi_processed_stripes(struct bio *bio, | ||
505 | unsigned int cnt) | ||
506 | { | ||
507 | atomic_t *segments = (atomic_t *)&bio->bi_phys_segments; | ||
508 | int old, new; | ||
509 | |||
510 | do { | ||
511 | old = atomic_read(segments); | ||
512 | new = (old & 0xffff) | (cnt << 16); | ||
513 | } while (atomic_cmpxchg(segments, old, new) != old); | ||
514 | } | ||
515 | |||
516 | static inline void raid5_set_bi_stripes(struct bio *bio, unsigned int cnt) | ||
517 | { | ||
518 | atomic_t *segments = (atomic_t *)&bio->bi_phys_segments; | ||
519 | |||
520 | atomic_set(segments, cnt); | ||
521 | } | ||
522 | |||
413 | /* NOTE NR_STRIPE_HASH_LOCKS must remain below 64. | 523 | /* NOTE NR_STRIPE_HASH_LOCKS must remain below 64. |
414 | * This is because we sometimes take all the spinlocks | 524 | * This is because we sometimes take all the spinlocks |
415 | * and creating that much locking depth can cause | 525 | * and creating that much locking depth can cause |
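
The helpers moved into the header above treat bio->bi_phys_segments as two packed 16-bit fields: the low half is a (biased) count of stripes still referencing the bio, the high half counts stripes already processed. A small standalone sketch of the same packing arithmetic, non-atomic for clarity:

/* Hedged sketch (userspace, non-atomic): the 16/16 packing the helpers
 * above maintain inside bio->bi_phys_segments.
 */
#include <assert.h>

int main(void)
{
	unsigned int segs;

	segs = 3;				/* raid5_set_bi_stripes(bio, 3) */
	segs = (segs & 0xffff) | (5u << 16);	/* raid5_set_bi_processed_stripes(bio, 5) */
	segs++;					/* raid5_inc_bi_active_stripes(bio) */
	segs--;					/* raid5_dec_bi_active_stripes(bio) */

	assert((segs & 0xffff) == 3);		/* active stripes (low half) */
	assert(((segs >> 16) & 0xffff) == 5);	/* processed stripes (high half) */
	return 0;
}

An increment past 0xffff in the low half would spill into the processed count, which is what the 0xfffe * STRIPE_SECTORS request cap added in raid5_run() above guards against.
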
@@ -432,6 +542,30 @@ struct r5worker_group { | |||
432 | int stripes_cnt; | 542 | int stripes_cnt; |
433 | }; | 543 | }; |
434 | 544 | ||
545 | enum r5_cache_state { | ||
546 | R5_INACTIVE_BLOCKED, /* release of inactive stripes blocked, | ||
547 | * waiting for 25% to be free | ||
548 | */ | ||
549 | R5_ALLOC_MORE, /* It might help to allocate another | ||
550 | * stripe. | ||
551 | */ | ||
552 | R5_DID_ALLOC, /* A stripe was allocated, don't allocate | ||
553 | * more until at least one has been | ||
554 | * released. This avoids flooding | ||
555 | * the cache. | ||
556 | */ | ||
557 | R5C_LOG_TIGHT, /* log device space tight, need to | ||
558 | * prioritize stripes at last_checkpoint | ||
559 | */ | ||
560 | R5C_LOG_CRITICAL, /* log device is running out of space, | ||
561 | * only process stripes that are already | ||
562 | * occupying the log | ||
563 | */ | ||
564 | R5C_EXTRA_PAGE_IN_USE, /* a stripe is using disk_info.extra_page | ||
565 | * for prexor | ||
566 | */ | ||
567 | }; | ||
568 | |||
435 | struct r5conf { | 569 | struct r5conf { |
436 | struct hlist_head *stripe_hashtbl; | 570 | struct hlist_head *stripe_hashtbl; |
437 | /* only protect corresponding hash list and inactive_list */ | 571 | /* only protect corresponding hash list and inactive_list */ |
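
The cache_state bit numbers move from raw #defines into this enum and gain three r5c-specific bits: R5C_LOG_TIGHT and R5C_LOG_CRITICAL for journal-space pressure, and R5C_EXTRA_PAGE_IN_USE to serialise use of the per-disk scratch page. They remain plain bits in conf->cache_state; a minimal sketch of the usage pattern, with the policy deliberately simplified (the real thresholds are derived from the journal's free space in raid5-cache.c):

/* Hedged sketch: cache_state is still manipulated with the usual bitops;
 * demo_log_space_pressure() and its policy are illustrative only.
 */
#include "raid5.h"

static void demo_log_space_pressure(struct r5conf *conf, bool tight, bool critical)
{
	if (critical)
		set_bit(R5C_LOG_CRITICAL, &conf->cache_state);
	else
		clear_bit(R5C_LOG_CRITICAL, &conf->cache_state);

	if (tight || critical)
		set_bit(R5C_LOG_TIGHT, &conf->cache_state);
	else
		clear_bit(R5C_LOG_TIGHT, &conf->cache_state);
}
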
@@ -519,23 +653,18 @@ struct r5conf { | |||
519 | */ | 653 | */ |
520 | atomic_t active_stripes; | 654 | atomic_t active_stripes; |
521 | struct list_head inactive_list[NR_STRIPE_HASH_LOCKS]; | 655 | struct list_head inactive_list[NR_STRIPE_HASH_LOCKS]; |
656 | |||
657 | atomic_t r5c_cached_full_stripes; | ||
658 | struct list_head r5c_full_stripe_list; | ||
659 | atomic_t r5c_cached_partial_stripes; | ||
660 | struct list_head r5c_partial_stripe_list; | ||
661 | |||
522 | atomic_t empty_inactive_list_nr; | 662 | atomic_t empty_inactive_list_nr; |
523 | struct llist_head released_stripes; | 663 | struct llist_head released_stripes; |
524 | wait_queue_head_t wait_for_quiescent; | 664 | wait_queue_head_t wait_for_quiescent; |
525 | wait_queue_head_t wait_for_stripe; | 665 | wait_queue_head_t wait_for_stripe; |
526 | wait_queue_head_t wait_for_overlap; | 666 | wait_queue_head_t wait_for_overlap; |
527 | unsigned long cache_state; | 667 | unsigned long cache_state; |
528 | #define R5_INACTIVE_BLOCKED 1 /* release of inactive stripes blocked, | ||
529 | * waiting for 25% to be free | ||
530 | */ | ||
531 | #define R5_ALLOC_MORE 2 /* It might help to allocate another | ||
532 | * stripe. | ||
533 | */ | ||
534 | #define R5_DID_ALLOC 4 /* A stripe was allocated, don't allocate | ||
535 | * more until at least one has been | ||
536 | * released. This avoids flooding | ||
537 | * the cache. | ||
538 | */ | ||
539 | struct shrinker shrinker; | 668 | struct shrinker shrinker; |
540 | int pool_size; /* number of disks in stripeheads in pool */ | 669 | int pool_size; /* number of disks in stripeheads in pool */ |
541 | spinlock_t device_lock; | 670 | spinlock_t device_lock; |
@@ -633,4 +762,23 @@ extern void r5l_stripe_write_finished(struct stripe_head *sh); | |||
633 | extern int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio); | 762 | extern int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio); |
634 | extern void r5l_quiesce(struct r5l_log *log, int state); | 763 | extern void r5l_quiesce(struct r5l_log *log, int state); |
635 | extern bool r5l_log_disk_error(struct r5conf *conf); | 764 | extern bool r5l_log_disk_error(struct r5conf *conf); |
765 | extern bool r5c_is_writeback(struct r5l_log *log); | ||
766 | extern int | ||
767 | r5c_try_caching_write(struct r5conf *conf, struct stripe_head *sh, | ||
768 | struct stripe_head_state *s, int disks); | ||
769 | extern void | ||
770 | r5c_finish_stripe_write_out(struct r5conf *conf, struct stripe_head *sh, | ||
771 | struct stripe_head_state *s); | ||
772 | extern void r5c_release_extra_page(struct stripe_head *sh); | ||
773 | extern void r5c_use_extra_page(struct stripe_head *sh); | ||
774 | extern void r5l_wake_reclaim(struct r5l_log *log, sector_t space); | ||
775 | extern void r5c_handle_cached_data_endio(struct r5conf *conf, | ||
776 | struct stripe_head *sh, int disks, struct bio_list *return_bi); | ||
777 | extern int r5c_cache_data(struct r5l_log *log, struct stripe_head *sh, | ||
778 | struct stripe_head_state *s); | ||
779 | extern void r5c_make_stripe_write_out(struct stripe_head *sh); | ||
780 | extern void r5c_flush_cache(struct r5conf *conf, int num); | ||
781 | extern void r5c_check_stripe_cache_usage(struct r5conf *conf); | ||
782 | extern void r5c_check_cached_full_stripe(struct r5conf *conf); | ||
783 | extern struct md_sysfs_entry r5c_journal_mode; | ||
636 | #endif | 784 | #endif |
diff --git a/include/uapi/linux/raid/md_p.h b/include/uapi/linux/raid/md_p.h index c3e654c6d518..9930f3e9040f 100644 --- a/include/uapi/linux/raid/md_p.h +++ b/include/uapi/linux/raid/md_p.h | |||
@@ -84,6 +84,10 @@ | |||
84 | #define MD_DISK_CANDIDATE 5 /* disk is added as spare (local) until confirmed | 84 | #define MD_DISK_CANDIDATE 5 /* disk is added as spare (local) until confirmed |
85 | * For clustered environments only. | 85 | * For clustered environments only. |
86 | */ | 86 | */ |
87 | #define MD_DISK_FAILFAST 10 /* Send REQ_FAILFAST if there are multiple | ||
88 | * devices available - and don't try to | ||
89 | * correct read errors. | ||
90 | */ | ||
87 | 91 | ||
88 | #define MD_DISK_WRITEMOSTLY 9 /* disk is "write-mostly" in RAID1 config. | 92 | #define MD_DISK_WRITEMOSTLY 9 /* disk is "write-mostly" in RAID1 config. |
89 | * read requests will only be sent here in | 93 | * read requests will only be sent here in |
@@ -265,8 +269,9 @@ struct mdp_superblock_1 { | |||
265 | __le32 dev_number; /* permanent identifier of this device - not role in raid */ | 269 | __le32 dev_number; /* permanent identifier of this device - not role in raid */ |
266 | __le32 cnt_corrected_read; /* number of read errors that were corrected by re-writing */ | 270 | __le32 cnt_corrected_read; /* number of read errors that were corrected by re-writing */ |
267 | __u8 device_uuid[16]; /* user-space settable, ignored by kernel */ | 271 | __u8 device_uuid[16]; /* user-space settable, ignored by kernel */ |
268 | __u8 devflags; /* per-device flags. Only one defined...*/ | 272 | __u8 devflags; /* per-device flags. Only two defined...*/ |
269 | #define WriteMostly1 1 /* mask for writemostly flag in above */ | 273 | #define WriteMostly1 1 /* mask for writemostly flag in above */ |
274 | #define FailFast1 2 /* Should avoid retries and fixups and just fail */ | ||
270 | /* Bad block log. If there are any bad blocks the feature flag is set. | 275 | /* Bad block log. If there are any bad blocks the feature flag is set. |
271 | * If offset and size are non-zero, that space is reserved and available | 276 | * If offset and size are non-zero, that space is reserved and available |
272 | */ | 277 | */ |
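
After this hunk the failfast request lives in two places: MD_DISK_FAILFAST is a bit in the per-disk state word passed through md's ioctl interface, while FailFast1 is the corresponding mask in the v1 superblock's per-device devflags byte. A hedged sketch of bridging the two when filling in devflags; the helper is illustrative, not the kernel's actual superblock writer.

/* Hedged sketch: mapping the MD_DISK_* state bits onto the v1 superblock
 * devflags masks defined above; demo_devflags() is illustrative only.
 */
#include <linux/raid/md_p.h>

static inline __u8 demo_devflags(unsigned int disk_state)
{
	__u8 flags = 0;

	if (disk_state & (1 << MD_DISK_WRITEMOSTLY))
		flags |= WriteMostly1;
	if (disk_state & (1 << MD_DISK_FAILFAST))
		flags |= FailFast1;
	return flags;
}
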
diff --git a/lib/raid6/avx2.c b/lib/raid6/avx2.c index 76734004358d..20bca3d44f67 100644 --- a/lib/raid6/avx2.c +++ b/lib/raid6/avx2.c | |||
@@ -87,9 +87,57 @@ static void raid6_avx21_gen_syndrome(int disks, size_t bytes, void **ptrs) | |||
87 | kernel_fpu_end(); | 87 | kernel_fpu_end(); |
88 | } | 88 | } |
89 | 89 | ||
90 | static void raid6_avx21_xor_syndrome(int disks, int start, int stop, | ||
91 | size_t bytes, void **ptrs) | ||
92 | { | ||
93 | u8 **dptr = (u8 **)ptrs; | ||
94 | u8 *p, *q; | ||
95 | int d, z, z0; | ||
96 | |||
97 | z0 = stop; /* P/Q right side optimization */ | ||
98 | p = dptr[disks-2]; /* XOR parity */ | ||
99 | q = dptr[disks-1]; /* RS syndrome */ | ||
100 | |||
101 | kernel_fpu_begin(); | ||
102 | |||
103 | asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0])); | ||
104 | |||
105 | for (d = 0 ; d < bytes ; d += 32) { | ||
106 | asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d])); | ||
107 | asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d])); | ||
108 | asm volatile("vpxor %ymm4,%ymm2,%ymm2"); | ||
109 | /* P/Q data pages */ | ||
110 | for (z = z0-1 ; z >= start ; z--) { | ||
111 | asm volatile("vpxor %ymm5,%ymm5,%ymm5"); | ||
112 | asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5"); | ||
113 | asm volatile("vpaddb %ymm4,%ymm4,%ymm4"); | ||
114 | asm volatile("vpand %ymm0,%ymm5,%ymm5"); | ||
115 | asm volatile("vpxor %ymm5,%ymm4,%ymm4"); | ||
116 | asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d])); | ||
117 | asm volatile("vpxor %ymm5,%ymm2,%ymm2"); | ||
118 | asm volatile("vpxor %ymm5,%ymm4,%ymm4"); | ||
119 | } | ||
120 | /* P/Q left side optimization */ | ||
121 | for (z = start-1 ; z >= 0 ; z--) { | ||
122 | asm volatile("vpxor %ymm5,%ymm5,%ymm5"); | ||
123 | asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5"); | ||
124 | asm volatile("vpaddb %ymm4,%ymm4,%ymm4"); | ||
125 | asm volatile("vpand %ymm0,%ymm5,%ymm5"); | ||
126 | asm volatile("vpxor %ymm5,%ymm4,%ymm4"); | ||
127 | } | ||
128 | asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d])); | ||
129 | /* Don't use movntdq for r/w memory area < cache line */ | ||
130 | asm volatile("vmovdqa %%ymm4,%0" : "=m" (q[d])); | ||
131 | asm volatile("vmovdqa %%ymm2,%0" : "=m" (p[d])); | ||
132 | } | ||
133 | |||
134 | asm volatile("sfence" : : : "memory"); | ||
135 | kernel_fpu_end(); | ||
136 | } | ||
137 | |||
90 | const struct raid6_calls raid6_avx2x1 = { | 138 | const struct raid6_calls raid6_avx2x1 = { |
91 | raid6_avx21_gen_syndrome, | 139 | raid6_avx21_gen_syndrome, |
92 | NULL, /* XOR not yet implemented */ | 140 | raid6_avx21_xor_syndrome, |
93 | raid6_have_avx2, | 141 | raid6_have_avx2, |
94 | "avx2x1", | 142 | "avx2x1", |
95 | 1 /* Has cache hints */ | 143 | 1 /* Has cache hints */ |
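
raid6_avx21_xor_syndrome() fills the previously missing xor slot of raid6_avx2x1: given that only disks start..stop changed, it folds their blocks into the existing P (plain XOR) and Q (Reed-Solomon) parity using Horner's rule from the highest disk down, applying a multiply-by-x in GF(2^8) for each disk below start (the "left side optimization"). The vpcmpgtb/vpaddb/vpand/vpxor sequence is that multiply done 32 bytes at a time against the 0x1d reduction constant. A byte-at-a-time reference sketch of the same computation, with illustrative names:

/* Hedged sketch (userspace): byte-at-a-time reference for the
 * xor_syndrome operation; gf2_mul2() is the scalar form of the
 * vpcmpgtb/vpaddb/vpand/vpxor sequence above.
 */
#include <stddef.h>
#include <stdint.h>

static inline uint8_t gf2_mul2(uint8_t v)
{
	/* multiply by x in GF(2^8): shift, then reduce by 0x1d if the top bit was set */
	return (uint8_t)((v << 1) ^ ((v & 0x80) ? 0x1d : 0));
}

static void ref_xor_syndrome(int disks, int start, int stop,
			     size_t bytes, void **ptrs)
{
	uint8_t **dptr = (uint8_t **)ptrs;
	uint8_t *p = dptr[disks - 2];	/* XOR parity */
	uint8_t *q = dptr[disks - 1];	/* RS syndrome */
	size_t d;
	int z;

	for (d = 0; d < bytes; d++) {
		uint8_t wp = dptr[stop][d];
		uint8_t wq = wp;

		for (z = stop - 1; z >= start; z--) {	/* changed data range */
			wp ^= dptr[z][d];
			wq = (uint8_t)(gf2_mul2(wq) ^ dptr[z][d]);
		}
		for (z = start - 1; z >= 0; z--)	/* left side: powers of x only */
			wq = gf2_mul2(wq);
		p[d] ^= wp;
		q[d] ^= wq;
	}
}

raid6_avx22_xor_syndrome() and raid6_avx24_xor_syndrome() below are the same computation unrolled two and four register pairs wide, the 4-way version additionally using prefetchnta and non-temporal vmovntdq stores.
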
@@ -149,9 +197,77 @@ static void raid6_avx22_gen_syndrome(int disks, size_t bytes, void **ptrs) | |||
149 | kernel_fpu_end(); | 197 | kernel_fpu_end(); |
150 | } | 198 | } |
151 | 199 | ||
200 | static void raid6_avx22_xor_syndrome(int disks, int start, int stop, | ||
201 | size_t bytes, void **ptrs) | ||
202 | { | ||
203 | u8 **dptr = (u8 **)ptrs; | ||
204 | u8 *p, *q; | ||
205 | int d, z, z0; | ||
206 | |||
207 | z0 = stop; /* P/Q right side optimization */ | ||
208 | p = dptr[disks-2]; /* XOR parity */ | ||
209 | q = dptr[disks-1]; /* RS syndrome */ | ||
210 | |||
211 | kernel_fpu_begin(); | ||
212 | |||
213 | asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0])); | ||
214 | |||
215 | for (d = 0 ; d < bytes ; d += 64) { | ||
216 | asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d])); | ||
217 | asm volatile("vmovdqa %0,%%ymm6" :: "m" (dptr[z0][d+32])); | ||
218 | asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d])); | ||
219 | asm volatile("vmovdqa %0,%%ymm3" : : "m" (p[d+32])); | ||
220 | asm volatile("vpxor %ymm4,%ymm2,%ymm2"); | ||
221 | asm volatile("vpxor %ymm6,%ymm3,%ymm3"); | ||
222 | /* P/Q data pages */ | ||
223 | for (z = z0-1 ; z >= start ; z--) { | ||
224 | asm volatile("vpxor %ymm5,%ymm5,%ymm5"); | ||
225 | asm volatile("vpxor %ymm7,%ymm7,%ymm7"); | ||
226 | asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5"); | ||
227 | asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7"); | ||
228 | asm volatile("vpaddb %ymm4,%ymm4,%ymm4"); | ||
229 | asm volatile("vpaddb %ymm6,%ymm6,%ymm6"); | ||
230 | asm volatile("vpand %ymm0,%ymm5,%ymm5"); | ||
231 | asm volatile("vpand %ymm0,%ymm7,%ymm7"); | ||
232 | asm volatile("vpxor %ymm5,%ymm4,%ymm4"); | ||
233 | asm volatile("vpxor %ymm7,%ymm6,%ymm6"); | ||
234 | asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d])); | ||
235 | asm volatile("vmovdqa %0,%%ymm7" | ||
236 | :: "m" (dptr[z][d+32])); | ||
237 | asm volatile("vpxor %ymm5,%ymm2,%ymm2"); | ||
238 | asm volatile("vpxor %ymm7,%ymm3,%ymm3"); | ||
239 | asm volatile("vpxor %ymm5,%ymm4,%ymm4"); | ||
240 | asm volatile("vpxor %ymm7,%ymm6,%ymm6"); | ||
241 | } | ||
242 | /* P/Q left side optimization */ | ||
243 | for (z = start-1 ; z >= 0 ; z--) { | ||
244 | asm volatile("vpxor %ymm5,%ymm5,%ymm5"); | ||
245 | asm volatile("vpxor %ymm7,%ymm7,%ymm7"); | ||
246 | asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5"); | ||
247 | asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7"); | ||
248 | asm volatile("vpaddb %ymm4,%ymm4,%ymm4"); | ||
249 | asm volatile("vpaddb %ymm6,%ymm6,%ymm6"); | ||
250 | asm volatile("vpand %ymm0,%ymm5,%ymm5"); | ||
251 | asm volatile("vpand %ymm0,%ymm7,%ymm7"); | ||
252 | asm volatile("vpxor %ymm5,%ymm4,%ymm4"); | ||
253 | asm volatile("vpxor %ymm7,%ymm6,%ymm6"); | ||
254 | } | ||
255 | asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d])); | ||
256 | asm volatile("vpxor %0,%%ymm6,%%ymm6" : : "m" (q[d+32])); | ||
257 | /* Don't use movntdq for r/w memory area < cache line */ | ||
258 | asm volatile("vmovdqa %%ymm4,%0" : "=m" (q[d])); | ||
259 | asm volatile("vmovdqa %%ymm6,%0" : "=m" (q[d+32])); | ||
260 | asm volatile("vmovdqa %%ymm2,%0" : "=m" (p[d])); | ||
261 | asm volatile("vmovdqa %%ymm3,%0" : "=m" (p[d+32])); | ||
262 | } | ||
263 | |||
264 | asm volatile("sfence" : : : "memory"); | ||
265 | kernel_fpu_end(); | ||
266 | } | ||
267 | |||
152 | const struct raid6_calls raid6_avx2x2 = { | 268 | const struct raid6_calls raid6_avx2x2 = { |
153 | raid6_avx22_gen_syndrome, | 269 | raid6_avx22_gen_syndrome, |
154 | NULL, /* XOR not yet implemented */ | 270 | raid6_avx22_xor_syndrome, |
155 | raid6_have_avx2, | 271 | raid6_have_avx2, |
156 | "avx2x2", | 272 | "avx2x2", |
157 | 1 /* Has cache hints */ | 273 | 1 /* Has cache hints */ |
@@ -242,9 +358,119 @@ static void raid6_avx24_gen_syndrome(int disks, size_t bytes, void **ptrs) | |||
242 | kernel_fpu_end(); | 358 | kernel_fpu_end(); |
243 | } | 359 | } |
244 | 360 | ||
361 | static void raid6_avx24_xor_syndrome(int disks, int start, int stop, | ||
362 | size_t bytes, void **ptrs) | ||
363 | { | ||
364 | u8 **dptr = (u8 **)ptrs; | ||
365 | u8 *p, *q; | ||
366 | int d, z, z0; | ||
367 | |||
368 | z0 = stop; /* P/Q right side optimization */ | ||
369 | p = dptr[disks-2]; /* XOR parity */ | ||
370 | q = dptr[disks-1]; /* RS syndrome */ | ||
371 | |||
372 | kernel_fpu_begin(); | ||
373 | |||
374 | asm volatile("vmovdqa %0,%%ymm0" :: "m" (raid6_avx2_constants.x1d[0])); | ||
375 | |||
376 | for (d = 0 ; d < bytes ; d += 128) { | ||
377 | asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d])); | ||
378 | asm volatile("vmovdqa %0,%%ymm6" :: "m" (dptr[z0][d+32])); | ||
379 | asm volatile("vmovdqa %0,%%ymm12" :: "m" (dptr[z0][d+64])); | ||
380 | asm volatile("vmovdqa %0,%%ymm14" :: "m" (dptr[z0][d+96])); | ||
381 | asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d])); | ||
382 | asm volatile("vmovdqa %0,%%ymm3" : : "m" (p[d+32])); | ||
383 | asm volatile("vmovdqa %0,%%ymm10" : : "m" (p[d+64])); | ||
384 | asm volatile("vmovdqa %0,%%ymm11" : : "m" (p[d+96])); | ||
385 | asm volatile("vpxor %ymm4,%ymm2,%ymm2"); | ||
386 | asm volatile("vpxor %ymm6,%ymm3,%ymm3"); | ||
387 | asm volatile("vpxor %ymm12,%ymm10,%ymm10"); | ||
388 | asm volatile("vpxor %ymm14,%ymm11,%ymm11"); | ||
389 | /* P/Q data pages */ | ||
390 | for (z = z0-1 ; z >= start ; z--) { | ||
391 | asm volatile("prefetchnta %0" :: "m" (dptr[z][d])); | ||
392 | asm volatile("prefetchnta %0" :: "m" (dptr[z][d+64])); | ||
393 | asm volatile("vpxor %ymm5,%ymm5,%ymm5"); | ||
394 | asm volatile("vpxor %ymm7,%ymm7,%ymm7"); | ||
395 | asm volatile("vpxor %ymm13,%ymm13,%ymm13"); | ||
396 | asm volatile("vpxor %ymm15,%ymm15,%ymm15"); | ||
397 | asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5"); | ||
398 | asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7"); | ||
399 | asm volatile("vpcmpgtb %ymm12,%ymm13,%ymm13"); | ||
400 | asm volatile("vpcmpgtb %ymm14,%ymm15,%ymm15"); | ||
401 | asm volatile("vpaddb %ymm4,%ymm4,%ymm4"); | ||
402 | asm volatile("vpaddb %ymm6,%ymm6,%ymm6"); | ||
403 | asm volatile("vpaddb %ymm12,%ymm12,%ymm12"); | ||
404 | asm volatile("vpaddb %ymm14,%ymm14,%ymm14"); | ||
405 | asm volatile("vpand %ymm0,%ymm5,%ymm5"); | ||
406 | asm volatile("vpand %ymm0,%ymm7,%ymm7"); | ||
407 | asm volatile("vpand %ymm0,%ymm13,%ymm13"); | ||
408 | asm volatile("vpand %ymm0,%ymm15,%ymm15"); | ||
409 | asm volatile("vpxor %ymm5,%ymm4,%ymm4"); | ||
410 | asm volatile("vpxor %ymm7,%ymm6,%ymm6"); | ||
411 | asm volatile("vpxor %ymm13,%ymm12,%ymm12"); | ||
412 | asm volatile("vpxor %ymm15,%ymm14,%ymm14"); | ||
413 | asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d])); | ||
414 | asm volatile("vmovdqa %0,%%ymm7" | ||
415 | :: "m" (dptr[z][d+32])); | ||
416 | asm volatile("vmovdqa %0,%%ymm13" | ||
417 | :: "m" (dptr[z][d+64])); | ||
418 | asm volatile("vmovdqa %0,%%ymm15" | ||
419 | :: "m" (dptr[z][d+96])); | ||
420 | asm volatile("vpxor %ymm5,%ymm2,%ymm2"); | ||
421 | asm volatile("vpxor %ymm7,%ymm3,%ymm3"); | ||
422 | asm volatile("vpxor %ymm13,%ymm10,%ymm10"); | ||
423 | asm volatile("vpxor %ymm15,%ymm11,%ymm11"); | ||
424 | asm volatile("vpxor %ymm5,%ymm4,%ymm4"); | ||
425 | asm volatile("vpxor %ymm7,%ymm6,%ymm6"); | ||
426 | asm volatile("vpxor %ymm13,%ymm12,%ymm12"); | ||
427 | asm volatile("vpxor %ymm15,%ymm14,%ymm14"); | ||
428 | } | ||
429 | asm volatile("prefetchnta %0" :: "m" (q[d])); | ||
430 | asm volatile("prefetchnta %0" :: "m" (q[d+64])); | ||
431 | /* P/Q left side optimization */ | ||
432 | for (z = start-1 ; z >= 0 ; z--) { | ||
433 | asm volatile("vpxor %ymm5,%ymm5,%ymm5"); | ||
434 | asm volatile("vpxor %ymm7,%ymm7,%ymm7"); | ||
435 | asm volatile("vpxor %ymm13,%ymm13,%ymm13"); | ||
436 | asm volatile("vpxor %ymm15,%ymm15,%ymm15"); | ||
437 | asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5"); | ||
438 | asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7"); | ||
439 | asm volatile("vpcmpgtb %ymm12,%ymm13,%ymm13"); | ||
440 | asm volatile("vpcmpgtb %ymm14,%ymm15,%ymm15"); | ||
441 | asm volatile("vpaddb %ymm4,%ymm4,%ymm4"); | ||
442 | asm volatile("vpaddb %ymm6,%ymm6,%ymm6"); | ||
443 | asm volatile("vpaddb %ymm12,%ymm12,%ymm12"); | ||
444 | asm volatile("vpaddb %ymm14,%ymm14,%ymm14"); | ||
445 | asm volatile("vpand %ymm0,%ymm5,%ymm5"); | ||
446 | asm volatile("vpand %ymm0,%ymm7,%ymm7"); | ||
447 | asm volatile("vpand %ymm0,%ymm13,%ymm13"); | ||
448 | asm volatile("vpand %ymm0,%ymm15,%ymm15"); | ||
449 | asm volatile("vpxor %ymm5,%ymm4,%ymm4"); | ||
450 | asm volatile("vpxor %ymm7,%ymm6,%ymm6"); | ||
451 | asm volatile("vpxor %ymm13,%ymm12,%ymm12"); | ||
452 | asm volatile("vpxor %ymm15,%ymm14,%ymm14"); | ||
453 | } | ||
454 | asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d])); | ||
455 | asm volatile("vmovntdq %%ymm3,%0" : "=m" (p[d+32])); | ||
456 | asm volatile("vmovntdq %%ymm10,%0" : "=m" (p[d+64])); | ||
457 | asm volatile("vmovntdq %%ymm11,%0" : "=m" (p[d+96])); | ||
458 | asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d])); | ||
459 | asm volatile("vpxor %0,%%ymm6,%%ymm6" : : "m" (q[d+32])); | ||
460 | asm volatile("vpxor %0,%%ymm12,%%ymm12" : : "m" (q[d+64])); | ||
461 | asm volatile("vpxor %0,%%ymm14,%%ymm14" : : "m" (q[d+96])); | ||
462 | asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d])); | ||
463 | asm volatile("vmovntdq %%ymm6,%0" : "=m" (q[d+32])); | ||
464 | asm volatile("vmovntdq %%ymm12,%0" : "=m" (q[d+64])); | ||
465 | asm volatile("vmovntdq %%ymm14,%0" : "=m" (q[d+96])); | ||
466 | } | ||
467 | asm volatile("sfence" : : : "memory"); | ||
468 | kernel_fpu_end(); | ||
469 | } | ||
470 | |||
245 | const struct raid6_calls raid6_avx2x4 = { | 471 | const struct raid6_calls raid6_avx2x4 = { |
246 | raid6_avx24_gen_syndrome, | 472 | raid6_avx24_gen_syndrome, |
247 | NULL, /* XOR not yet implemented */ | 473 | raid6_avx24_xor_syndrome, |
248 | raid6_have_avx2, | 474 | raid6_have_avx2, |
249 | "avx2x4", | 475 | "avx2x4", |
250 | 1 /* Has cache hints */ | 476 | 1 /* Has cache hints */ |