 -rw-r--r--  drivers/md/bitmap.c              |  166
 -rw-r--r--  drivers/md/dm-raid.c             |    4
 -rw-r--r--  drivers/md/linear.c              |   31
 -rw-r--r--  drivers/md/md.c                  |  701
 -rw-r--r--  drivers/md/md.h                  |  108
 -rw-r--r--  drivers/md/multipath.c           |   92
 -rw-r--r--  drivers/md/raid0.c               |  107
 -rw-r--r--  drivers/md/raid1.c               |  247
 -rw-r--r--  drivers/md/raid1.h               |   19
 -rw-r--r--  drivers/md/raid10.c              |  295
 -rw-r--r--  drivers/md/raid10.h              |    2
 -rw-r--r--  drivers/md/raid5-cache.c         | 1833
 -rw-r--r--  drivers/md/raid5.c               |  623
 -rw-r--r--  drivers/md/raid5.h               |  172
 -rw-r--r--  include/uapi/linux/raid/md_p.h   |    7
 -rw-r--r--  lib/raid6/avx2.c                 |  232
 16 files changed, 3403 insertions, 1236 deletions
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 2d826927a3bf..9fb2ccac958a 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
| @@ -27,6 +27,7 @@ | |||
| 27 | #include <linux/mount.h> | 27 | #include <linux/mount.h> |
| 28 | #include <linux/buffer_head.h> | 28 | #include <linux/buffer_head.h> |
| 29 | #include <linux/seq_file.h> | 29 | #include <linux/seq_file.h> |
| 30 | #include <trace/events/block.h> | ||
| 30 | #include "md.h" | 31 | #include "md.h" |
| 31 | #include "bitmap.h" | 32 | #include "bitmap.h" |
| 32 | 33 | ||
| @@ -208,11 +209,13 @@ static struct md_rdev *next_active_rdev(struct md_rdev *rdev, struct mddev *mdde | |||
| 208 | 209 | ||
| 209 | static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait) | 210 | static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait) |
| 210 | { | 211 | { |
| 211 | struct md_rdev *rdev = NULL; | 212 | struct md_rdev *rdev; |
| 212 | struct block_device *bdev; | 213 | struct block_device *bdev; |
| 213 | struct mddev *mddev = bitmap->mddev; | 214 | struct mddev *mddev = bitmap->mddev; |
| 214 | struct bitmap_storage *store = &bitmap->storage; | 215 | struct bitmap_storage *store = &bitmap->storage; |
| 215 | 216 | ||
| 217 | restart: | ||
| 218 | rdev = NULL; | ||
| 216 | while ((rdev = next_active_rdev(rdev, mddev)) != NULL) { | 219 | while ((rdev = next_active_rdev(rdev, mddev)) != NULL) { |
| 217 | int size = PAGE_SIZE; | 220 | int size = PAGE_SIZE; |
| 218 | loff_t offset = mddev->bitmap_info.offset; | 221 | loff_t offset = mddev->bitmap_info.offset; |
| @@ -268,8 +271,8 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait) | |||
| 268 | page); | 271 | page); |
| 269 | } | 272 | } |
| 270 | 273 | ||
| 271 | if (wait) | 274 | if (wait && md_super_wait(mddev) < 0) |
| 272 | md_super_wait(mddev); | 275 | goto restart; |
| 273 | return 0; | 276 | return 0; |
| 274 | 277 | ||
| 275 | bad_alignment: | 278 | bad_alignment: |
| @@ -405,10 +408,10 @@ static int read_page(struct file *file, unsigned long index, | |||
| 405 | ret = -EIO; | 408 | ret = -EIO; |
| 406 | out: | 409 | out: |
| 407 | if (ret) | 410 | if (ret) |
| 408 | printk(KERN_ALERT "md: bitmap read error: (%dB @ %llu): %d\n", | 411 | pr_err("md: bitmap read error: (%dB @ %llu): %d\n", |
| 409 | (int)PAGE_SIZE, | 412 | (int)PAGE_SIZE, |
| 410 | (unsigned long long)index << PAGE_SHIFT, | 413 | (unsigned long long)index << PAGE_SHIFT, |
| 411 | ret); | 414 | ret); |
| 412 | return ret; | 415 | return ret; |
| 413 | } | 416 | } |
| 414 | 417 | ||
| @@ -416,6 +419,28 @@ out: | |||
| 416 | * bitmap file superblock operations | 419 | * bitmap file superblock operations |
| 417 | */ | 420 | */ |
| 418 | 421 | ||
| 422 | /* | ||
| 423 | * bitmap_wait_writes() should be called before writing any bitmap | ||
| 424 | * blocks, to ensure previous writes, particularly from | ||
| 425 | * bitmap_daemon_work(), have completed. | ||
| 426 | */ | ||
| 427 | static void bitmap_wait_writes(struct bitmap *bitmap) | ||
| 428 | { | ||
| 429 | if (bitmap->storage.file) | ||
| 430 | wait_event(bitmap->write_wait, | ||
| 431 | atomic_read(&bitmap->pending_writes)==0); | ||
| 432 | else | ||
| 433 | /* Note that we ignore the return value. The writes | ||
| 434 | * might have failed, but that would just mean that | ||
| 435 | * some bits which should be cleared haven't been, | ||
| 436 | * which is safe. The relevant bitmap blocks will | ||
| 437 | * probably get written again, but there is no great | ||
| 438 | * loss if they aren't. | ||
| 439 | */ | ||
| 440 | md_super_wait(bitmap->mddev); | ||
| 441 | } | ||
| 442 | |||
| 443 | |||
| 419 | /* update the event counter and sync the superblock to disk */ | 444 | /* update the event counter and sync the superblock to disk */ |
| 420 | void bitmap_update_sb(struct bitmap *bitmap) | 445 | void bitmap_update_sb(struct bitmap *bitmap) |
| 421 | { | 446 | { |
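The bitmap_wait_writes() helper introduced above blocks until every previously issued bitmap write has finished: for file-backed bitmaps it waits on the pending_writes counter, otherwise it falls back to md_super_wait(). The userspace sketch below models only the counter-based case with C11 atomics and a condition variable; pending_writes, submit_write() and wait_writes() are illustrative names here, not kernel API.

```c
/* Userspace sketch (not kernel code): count in-flight writes, wait for zero. */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <unistd.h>

static atomic_int pending_writes;              /* analogue of bitmap->pending_writes */
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t write_wait = PTHREAD_COND_INITIALIZER; /* analogue of bitmap->write_wait */

static void *do_write(void *arg)
{
	usleep(1000);                          /* pretend the I/O takes a while */
	pthread_mutex_lock(&lock);
	if (atomic_fetch_sub(&pending_writes, 1) == 1)
		pthread_cond_broadcast(&write_wait); /* last write done: wake waiters */
	pthread_mutex_unlock(&lock);
	return NULL;
}

static void submit_write(pthread_t *t)
{
	atomic_fetch_add(&pending_writes, 1);  /* count the write before issuing it */
	pthread_create(t, NULL, do_write, NULL);
}

static void wait_writes(void)
{
	/* analogue of wait_event(bitmap->write_wait, pending_writes == 0) */
	pthread_mutex_lock(&lock);
	while (atomic_load(&pending_writes) != 0)
		pthread_cond_wait(&write_wait, &lock);
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	pthread_t t[4];

	for (int i = 0; i < 4; i++)
		submit_write(&t[i]);
	wait_writes();                         /* returns only after all four "writes" finish */
	printf("all writes completed\n");
	for (int i = 0; i < 4; i++)
		pthread_join(t[i], NULL);
	return 0;
}
```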
| @@ -455,24 +480,24 @@ void bitmap_print_sb(struct bitmap *bitmap) | |||
| 455 | if (!bitmap || !bitmap->storage.sb_page) | 480 | if (!bitmap || !bitmap->storage.sb_page) |
| 456 | return; | 481 | return; |
| 457 | sb = kmap_atomic(bitmap->storage.sb_page); | 482 | sb = kmap_atomic(bitmap->storage.sb_page); |
| 458 | printk(KERN_DEBUG "%s: bitmap file superblock:\n", bmname(bitmap)); | 483 | pr_debug("%s: bitmap file superblock:\n", bmname(bitmap)); |
| 459 | printk(KERN_DEBUG " magic: %08x\n", le32_to_cpu(sb->magic)); | 484 | pr_debug(" magic: %08x\n", le32_to_cpu(sb->magic)); |
| 460 | printk(KERN_DEBUG " version: %d\n", le32_to_cpu(sb->version)); | 485 | pr_debug(" version: %d\n", le32_to_cpu(sb->version)); |
| 461 | printk(KERN_DEBUG " uuid: %08x.%08x.%08x.%08x\n", | 486 | pr_debug(" uuid: %08x.%08x.%08x.%08x\n", |
| 462 | *(__u32 *)(sb->uuid+0), | 487 | *(__u32 *)(sb->uuid+0), |
| 463 | *(__u32 *)(sb->uuid+4), | 488 | *(__u32 *)(sb->uuid+4), |
| 464 | *(__u32 *)(sb->uuid+8), | 489 | *(__u32 *)(sb->uuid+8), |
| 465 | *(__u32 *)(sb->uuid+12)); | 490 | *(__u32 *)(sb->uuid+12)); |
| 466 | printk(KERN_DEBUG " events: %llu\n", | 491 | pr_debug(" events: %llu\n", |
| 467 | (unsigned long long) le64_to_cpu(sb->events)); | 492 | (unsigned long long) le64_to_cpu(sb->events)); |
| 468 | printk(KERN_DEBUG "events cleared: %llu\n", | 493 | pr_debug("events cleared: %llu\n", |
| 469 | (unsigned long long) le64_to_cpu(sb->events_cleared)); | 494 | (unsigned long long) le64_to_cpu(sb->events_cleared)); |
| 470 | printk(KERN_DEBUG " state: %08x\n", le32_to_cpu(sb->state)); | 495 | pr_debug(" state: %08x\n", le32_to_cpu(sb->state)); |
| 471 | printk(KERN_DEBUG " chunksize: %d B\n", le32_to_cpu(sb->chunksize)); | 496 | pr_debug(" chunksize: %d B\n", le32_to_cpu(sb->chunksize)); |
| 472 | printk(KERN_DEBUG " daemon sleep: %ds\n", le32_to_cpu(sb->daemon_sleep)); | 497 | pr_debug(" daemon sleep: %ds\n", le32_to_cpu(sb->daemon_sleep)); |
| 473 | printk(KERN_DEBUG " sync size: %llu KB\n", | 498 | pr_debug(" sync size: %llu KB\n", |
| 474 | (unsigned long long)le64_to_cpu(sb->sync_size)/2); | 499 | (unsigned long long)le64_to_cpu(sb->sync_size)/2); |
| 475 | printk(KERN_DEBUG "max write behind: %d\n", le32_to_cpu(sb->write_behind)); | 500 | pr_debug("max write behind: %d\n", le32_to_cpu(sb->write_behind)); |
| 476 | kunmap_atomic(sb); | 501 | kunmap_atomic(sb); |
| 477 | } | 502 | } |
| 478 | 503 | ||
| @@ -506,14 +531,14 @@ static int bitmap_new_disk_sb(struct bitmap *bitmap) | |||
| 506 | BUG_ON(!chunksize); | 531 | BUG_ON(!chunksize); |
| 507 | if (!is_power_of_2(chunksize)) { | 532 | if (!is_power_of_2(chunksize)) { |
| 508 | kunmap_atomic(sb); | 533 | kunmap_atomic(sb); |
| 509 | printk(KERN_ERR "bitmap chunksize not a power of 2\n"); | 534 | pr_warn("bitmap chunksize not a power of 2\n"); |
| 510 | return -EINVAL; | 535 | return -EINVAL; |
| 511 | } | 536 | } |
| 512 | sb->chunksize = cpu_to_le32(chunksize); | 537 | sb->chunksize = cpu_to_le32(chunksize); |
| 513 | 538 | ||
| 514 | daemon_sleep = bitmap->mddev->bitmap_info.daemon_sleep; | 539 | daemon_sleep = bitmap->mddev->bitmap_info.daemon_sleep; |
| 515 | if (!daemon_sleep || (daemon_sleep > MAX_SCHEDULE_TIMEOUT)) { | 540 | if (!daemon_sleep || (daemon_sleep > MAX_SCHEDULE_TIMEOUT)) { |
| 516 | printk(KERN_INFO "Choosing daemon_sleep default (5 sec)\n"); | 541 | pr_debug("Choosing daemon_sleep default (5 sec)\n"); |
| 517 | daemon_sleep = 5 * HZ; | 542 | daemon_sleep = 5 * HZ; |
| 518 | } | 543 | } |
| 519 | sb->daemon_sleep = cpu_to_le32(daemon_sleep); | 544 | sb->daemon_sleep = cpu_to_le32(daemon_sleep); |
| @@ -584,7 +609,7 @@ re_read: | |||
| 584 | /* to 4k blocks */ | 609 | /* to 4k blocks */ |
| 585 | bm_blocks = DIV_ROUND_UP_SECTOR_T(bm_blocks, 4096); | 610 | bm_blocks = DIV_ROUND_UP_SECTOR_T(bm_blocks, 4096); |
| 586 | offset = bitmap->mddev->bitmap_info.offset + (bitmap->cluster_slot * (bm_blocks << 3)); | 611 | offset = bitmap->mddev->bitmap_info.offset + (bitmap->cluster_slot * (bm_blocks << 3)); |
| 587 | pr_info("%s:%d bm slot: %d offset: %llu\n", __func__, __LINE__, | 612 | pr_debug("%s:%d bm slot: %d offset: %llu\n", __func__, __LINE__, |
| 588 | bitmap->cluster_slot, offset); | 613 | bitmap->cluster_slot, offset); |
| 589 | } | 614 | } |
| 590 | 615 | ||
| @@ -634,7 +659,7 @@ re_read: | |||
| 634 | else if (write_behind > COUNTER_MAX) | 659 | else if (write_behind > COUNTER_MAX) |
| 635 | reason = "write-behind limit out of range (0 - 16383)"; | 660 | reason = "write-behind limit out of range (0 - 16383)"; |
| 636 | if (reason) { | 661 | if (reason) { |
| 637 | printk(KERN_INFO "%s: invalid bitmap file superblock: %s\n", | 662 | pr_warn("%s: invalid bitmap file superblock: %s\n", |
| 638 | bmname(bitmap), reason); | 663 | bmname(bitmap), reason); |
| 639 | goto out; | 664 | goto out; |
| 640 | } | 665 | } |
| @@ -648,18 +673,15 @@ re_read: | |||
| 648 | * bitmap's UUID and event counter to the mddev's | 673 | * bitmap's UUID and event counter to the mddev's |
| 649 | */ | 674 | */ |
| 650 | if (memcmp(sb->uuid, bitmap->mddev->uuid, 16)) { | 675 | if (memcmp(sb->uuid, bitmap->mddev->uuid, 16)) { |
| 651 | printk(KERN_INFO | 676 | pr_warn("%s: bitmap superblock UUID mismatch\n", |
| 652 | "%s: bitmap superblock UUID mismatch\n", | 677 | bmname(bitmap)); |
| 653 | bmname(bitmap)); | ||
| 654 | goto out; | 678 | goto out; |
| 655 | } | 679 | } |
| 656 | events = le64_to_cpu(sb->events); | 680 | events = le64_to_cpu(sb->events); |
| 657 | if (!nodes && (events < bitmap->mddev->events)) { | 681 | if (!nodes && (events < bitmap->mddev->events)) { |
| 658 | printk(KERN_INFO | 682 | pr_warn("%s: bitmap file is out of date (%llu < %llu) -- forcing full recovery\n", |
| 659 | "%s: bitmap file is out of date (%llu < %llu) " | 683 | bmname(bitmap), events, |
| 660 | "-- forcing full recovery\n", | 684 | (unsigned long long) bitmap->mddev->events); |
| 661 | bmname(bitmap), events, | ||
| 662 | (unsigned long long) bitmap->mddev->events); | ||
| 663 | set_bit(BITMAP_STALE, &bitmap->flags); | 685 | set_bit(BITMAP_STALE, &bitmap->flags); |
| 664 | } | 686 | } |
| 665 | } | 687 | } |
| @@ -679,8 +701,8 @@ out: | |||
| 679 | if (err == 0 && nodes && (bitmap->cluster_slot < 0)) { | 701 | if (err == 0 && nodes && (bitmap->cluster_slot < 0)) { |
| 680 | err = md_setup_cluster(bitmap->mddev, nodes); | 702 | err = md_setup_cluster(bitmap->mddev, nodes); |
| 681 | if (err) { | 703 | if (err) { |
| 682 | pr_err("%s: Could not setup cluster service (%d)\n", | 704 | pr_warn("%s: Could not setup cluster service (%d)\n", |
| 683 | bmname(bitmap), err); | 705 | bmname(bitmap), err); |
| 684 | goto out_no_sb; | 706 | goto out_no_sb; |
| 685 | } | 707 | } |
| 686 | bitmap->cluster_slot = md_cluster_ops->slot_number(bitmap->mddev); | 708 | bitmap->cluster_slot = md_cluster_ops->slot_number(bitmap->mddev); |
| @@ -847,15 +869,13 @@ static void bitmap_file_kick(struct bitmap *bitmap) | |||
| 847 | ptr = file_path(bitmap->storage.file, | 869 | ptr = file_path(bitmap->storage.file, |
| 848 | path, PAGE_SIZE); | 870 | path, PAGE_SIZE); |
| 849 | 871 | ||
| 850 | printk(KERN_ALERT | 872 | pr_warn("%s: kicking failed bitmap file %s from array!\n", |
| 851 | "%s: kicking failed bitmap file %s from array!\n", | 873 | bmname(bitmap), IS_ERR(ptr) ? "" : ptr); |
| 852 | bmname(bitmap), IS_ERR(ptr) ? "" : ptr); | ||
| 853 | 874 | ||
| 854 | kfree(path); | 875 | kfree(path); |
| 855 | } else | 876 | } else |
| 856 | printk(KERN_ALERT | 877 | pr_warn("%s: disabling internal bitmap due to errors\n", |
| 857 | "%s: disabling internal bitmap due to errors\n", | 878 | bmname(bitmap)); |
| 858 | bmname(bitmap)); | ||
| 859 | } | 879 | } |
| 860 | } | 880 | } |
| 861 | 881 | ||
| @@ -983,6 +1003,7 @@ void bitmap_unplug(struct bitmap *bitmap) | |||
| 983 | { | 1003 | { |
| 984 | unsigned long i; | 1004 | unsigned long i; |
| 985 | int dirty, need_write; | 1005 | int dirty, need_write; |
| 1006 | int writing = 0; | ||
| 986 | 1007 | ||
| 987 | if (!bitmap || !bitmap->storage.filemap || | 1008 | if (!bitmap || !bitmap->storage.filemap || |
| 988 | test_bit(BITMAP_STALE, &bitmap->flags)) | 1009 | test_bit(BITMAP_STALE, &bitmap->flags)) |
| @@ -997,15 +1018,19 @@ void bitmap_unplug(struct bitmap *bitmap) | |||
| 997 | need_write = test_and_clear_page_attr(bitmap, i, | 1018 | need_write = test_and_clear_page_attr(bitmap, i, |
| 998 | BITMAP_PAGE_NEEDWRITE); | 1019 | BITMAP_PAGE_NEEDWRITE); |
| 999 | if (dirty || need_write) { | 1020 | if (dirty || need_write) { |
| 1021 | if (!writing) { | ||
| 1022 | bitmap_wait_writes(bitmap); | ||
| 1023 | if (bitmap->mddev->queue) | ||
| 1024 | blk_add_trace_msg(bitmap->mddev->queue, | ||
| 1025 | "md bitmap_unplug"); | ||
| 1026 | } | ||
| 1000 | clear_page_attr(bitmap, i, BITMAP_PAGE_PENDING); | 1027 | clear_page_attr(bitmap, i, BITMAP_PAGE_PENDING); |
| 1001 | write_page(bitmap, bitmap->storage.filemap[i], 0); | 1028 | write_page(bitmap, bitmap->storage.filemap[i], 0); |
| 1029 | writing = 1; | ||
| 1002 | } | 1030 | } |
| 1003 | } | 1031 | } |
| 1004 | if (bitmap->storage.file) | 1032 | if (writing) |
| 1005 | wait_event(bitmap->write_wait, | 1033 | bitmap_wait_writes(bitmap); |
| 1006 | atomic_read(&bitmap->pending_writes)==0); | ||
| 1007 | else | ||
| 1008 | md_super_wait(bitmap->mddev); | ||
| 1009 | 1034 | ||
| 1010 | if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags)) | 1035 | if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags)) |
| 1011 | bitmap_file_kick(bitmap); | 1036 | bitmap_file_kick(bitmap); |
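With that helper in place, bitmap_unplug() above now waits for older writes only once, just before the first page that actually needs writing, issues all page writes without waiting on each one, and waits again at the end only if anything was written. A minimal sketch of that control flow; page_dirty(), write_page_stub() and wait_writes_stub() are hypothetical stand-ins for the bitmap page machinery.

```c
#include <stdio.h>

static int page_dirty(int i)       { return i % 3 == 0; }          /* pretend some pages need writing */
static void write_page_stub(int i) { printf("write page %d\n", i); }
static void wait_writes_stub(void) { printf("wait for outstanding writes\n"); }

/* Sketch of the unplug flow: wait once before the first write, once more after the last. */
static int unplug_pages(int npages, int (*needs_write)(int),
			void (*write_page)(int), void (*wait_writes)(void))
{
	int writing = 0;

	for (int i = 0; i < npages; i++) {
		if (!needs_write(i))
			continue;
		if (!writing)
			wait_writes();   /* make sure older writes have finished first */
		write_page(i);           /* issue without waiting per page */
		writing = 1;
	}
	if (writing)
		wait_writes();           /* wait for this batch before returning */
	return writing;
}

int main(void)
{
	unplug_pages(7, page_dirty, write_page_stub, wait_writes_stub);
	return 0;
}
```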
| @@ -1056,14 +1081,13 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) | |||
| 1056 | 1081 | ||
| 1057 | outofdate = test_bit(BITMAP_STALE, &bitmap->flags); | 1082 | outofdate = test_bit(BITMAP_STALE, &bitmap->flags); |
| 1058 | if (outofdate) | 1083 | if (outofdate) |
| 1059 | printk(KERN_INFO "%s: bitmap file is out of date, doing full " | 1084 | pr_warn("%s: bitmap file is out of date, doing full recovery\n", bmname(bitmap)); |
| 1060 | "recovery\n", bmname(bitmap)); | ||
| 1061 | 1085 | ||
| 1062 | if (file && i_size_read(file->f_mapping->host) < store->bytes) { | 1086 | if (file && i_size_read(file->f_mapping->host) < store->bytes) { |
| 1063 | printk(KERN_INFO "%s: bitmap file too short %lu < %lu\n", | 1087 | pr_warn("%s: bitmap file too short %lu < %lu\n", |
| 1064 | bmname(bitmap), | 1088 | bmname(bitmap), |
| 1065 | (unsigned long) i_size_read(file->f_mapping->host), | 1089 | (unsigned long) i_size_read(file->f_mapping->host), |
| 1066 | store->bytes); | 1090 | store->bytes); |
| 1067 | goto err; | 1091 | goto err; |
| 1068 | } | 1092 | } |
| 1069 | 1093 | ||
| @@ -1137,16 +1161,15 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) | |||
| 1137 | offset = 0; | 1161 | offset = 0; |
| 1138 | } | 1162 | } |
| 1139 | 1163 | ||
| 1140 | printk(KERN_INFO "%s: bitmap initialized from disk: " | 1164 | pr_debug("%s: bitmap initialized from disk: read %lu pages, set %lu of %lu bits\n", |
| 1141 | "read %lu pages, set %lu of %lu bits\n", | 1165 | bmname(bitmap), store->file_pages, |
| 1142 | bmname(bitmap), store->file_pages, | 1166 | bit_cnt, chunks); |
| 1143 | bit_cnt, chunks); | ||
| 1144 | 1167 | ||
| 1145 | return 0; | 1168 | return 0; |
| 1146 | 1169 | ||
| 1147 | err: | 1170 | err: |
| 1148 | printk(KERN_INFO "%s: bitmap initialisation failed: %d\n", | 1171 | pr_warn("%s: bitmap initialisation failed: %d\n", |
| 1149 | bmname(bitmap), ret); | 1172 | bmname(bitmap), ret); |
| 1150 | return ret; | 1173 | return ret; |
| 1151 | } | 1174 | } |
| 1152 | 1175 | ||
| @@ -1225,6 +1248,10 @@ void bitmap_daemon_work(struct mddev *mddev) | |||
| 1225 | } | 1248 | } |
| 1226 | bitmap->allclean = 1; | 1249 | bitmap->allclean = 1; |
| 1227 | 1250 | ||
| 1251 | if (bitmap->mddev->queue) | ||
| 1252 | blk_add_trace_msg(bitmap->mddev->queue, | ||
| 1253 | "md bitmap_daemon_work"); | ||
| 1254 | |||
| 1228 | /* Any file-page which is PENDING now needs to be written. | 1255 | /* Any file-page which is PENDING now needs to be written. |
| 1229 | * So set NEEDWRITE now, then after we make any last-minute changes | 1256 | * So set NEEDWRITE now, then after we make any last-minute changes |
| 1230 | * we will write it. | 1257 | * we will write it. |
| @@ -1289,6 +1316,7 @@ void bitmap_daemon_work(struct mddev *mddev) | |||
| 1289 | } | 1316 | } |
| 1290 | spin_unlock_irq(&counts->lock); | 1317 | spin_unlock_irq(&counts->lock); |
| 1291 | 1318 | ||
| 1319 | bitmap_wait_writes(bitmap); | ||
| 1292 | /* Now start writeout on any page in NEEDWRITE that isn't DIRTY. | 1320 | /* Now start writeout on any page in NEEDWRITE that isn't DIRTY. |
| 1293 | * DIRTY pages need to be written by bitmap_unplug so it can wait | 1321 | * DIRTY pages need to be written by bitmap_unplug so it can wait |
| 1294 | * for them. | 1322 | * for them. |
| @@ -1595,7 +1623,7 @@ void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector, bool force) | |||
| 1595 | atomic_read(&bitmap->mddev->recovery_active) == 0); | 1623 | atomic_read(&bitmap->mddev->recovery_active) == 0); |
| 1596 | 1624 | ||
| 1597 | bitmap->mddev->curr_resync_completed = sector; | 1625 | bitmap->mddev->curr_resync_completed = sector; |
| 1598 | set_bit(MD_CHANGE_CLEAN, &bitmap->mddev->flags); | 1626 | set_bit(MD_SB_CHANGE_CLEAN, &bitmap->mddev->sb_flags); |
| 1599 | sector &= ~((1ULL << bitmap->counts.chunkshift) - 1); | 1627 | sector &= ~((1ULL << bitmap->counts.chunkshift) - 1); |
| 1600 | s = 0; | 1628 | s = 0; |
| 1601 | while (s < sector && s < bitmap->mddev->resync_max_sectors) { | 1629 | while (s < sector && s < bitmap->mddev->resync_max_sectors) { |
| @@ -1825,8 +1853,8 @@ struct bitmap *bitmap_create(struct mddev *mddev, int slot) | |||
| 1825 | if (err) | 1853 | if (err) |
| 1826 | goto error; | 1854 | goto error; |
| 1827 | 1855 | ||
| 1828 | printk(KERN_INFO "created bitmap (%lu pages) for device %s\n", | 1856 | pr_debug("created bitmap (%lu pages) for device %s\n", |
| 1829 | bitmap->counts.pages, bmname(bitmap)); | 1857 | bitmap->counts.pages, bmname(bitmap)); |
| 1830 | 1858 | ||
| 1831 | err = test_bit(BITMAP_WRITE_ERROR, &bitmap->flags) ? -EIO : 0; | 1859 | err = test_bit(BITMAP_WRITE_ERROR, &bitmap->flags) ? -EIO : 0; |
| 1832 | if (err) | 1860 | if (err) |
| @@ -2029,8 +2057,10 @@ int bitmap_resize(struct bitmap *bitmap, sector_t blocks, | |||
| 2029 | !bitmap->mddev->bitmap_info.external, | 2057 | !bitmap->mddev->bitmap_info.external, |
| 2030 | mddev_is_clustered(bitmap->mddev) | 2058 | mddev_is_clustered(bitmap->mddev) |
| 2031 | ? bitmap->cluster_slot : 0); | 2059 | ? bitmap->cluster_slot : 0); |
| 2032 | if (ret) | 2060 | if (ret) { |
| 2061 | bitmap_file_unmap(&store); | ||
| 2033 | goto err; | 2062 | goto err; |
| 2063 | } | ||
| 2034 | 2064 | ||
| 2035 | pages = DIV_ROUND_UP(chunks, PAGE_COUNTER_RATIO); | 2065 | pages = DIV_ROUND_UP(chunks, PAGE_COUNTER_RATIO); |
| 2036 | 2066 | ||
| @@ -2089,7 +2119,7 @@ int bitmap_resize(struct bitmap *bitmap, sector_t blocks, | |||
| 2089 | bitmap->mddev->bitmap_info.chunksize = 1 << (old_counts.chunkshift + | 2119 | bitmap->mddev->bitmap_info.chunksize = 1 << (old_counts.chunkshift + |
| 2090 | BITMAP_BLOCK_SHIFT); | 2120 | BITMAP_BLOCK_SHIFT); |
| 2091 | blocks = old_counts.chunks << old_counts.chunkshift; | 2121 | blocks = old_counts.chunks << old_counts.chunkshift; |
| 2092 | pr_err("Could not pre-allocate in-memory bitmap for cluster raid\n"); | 2122 | pr_warn("Could not pre-allocate in-memory bitmap for cluster raid\n"); |
| 2093 | break; | 2123 | break; |
| 2094 | } else | 2124 | } else |
| 2095 | bitmap->counts.bp[page].count += 1; | 2125 | bitmap->counts.bp[page].count += 1; |
| @@ -2266,7 +2296,7 @@ location_store(struct mddev *mddev, const char *buf, size_t len) | |||
| 2266 | /* Ensure new bitmap info is stored in | 2296 | /* Ensure new bitmap info is stored in |
| 2267 | * metadata promptly. | 2297 | * metadata promptly. |
| 2268 | */ | 2298 | */ |
| 2269 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | 2299 | set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); |
| 2270 | md_wakeup_thread(mddev->thread); | 2300 | md_wakeup_thread(mddev->thread); |
| 2271 | } | 2301 | } |
| 2272 | rv = 0; | 2302 | rv = 0; |
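A theme running through the whole series shows up in the last two hunks: superblock-update bits move out of mddev->flags into a dedicated mddev->sb_flags word and pick up an MD_SB_ prefix (MD_CHANGE_DEVS becomes MD_SB_CHANGE_DEVS, and so on), so "the on-disk superblock is out of date because..." state no longer shares a word with general array state. The toy program below pictures that split; the bit values are made up for illustration and plain C11 atomics stand in for the kernel's set_bit()/test_and_clear_bit().

```c
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Illustrative bit values only -- the kernel assigns its own. */
enum { MD_FAILFAST_SUPPORTED = 1u << 0 };                 /* general state   -> flags */
enum { MD_SB_CHANGE_DEVS = 1u << 0, MD_SB_CHANGE_CLEAN = 1u << 1,
       MD_SB_CHANGE_PENDING = 1u << 2 };                  /* sb dirtiness    -> sb_flags */

struct mddev_like {
	atomic_uint flags;      /* long-lived array properties */
	atomic_uint sb_flags;   /* reasons the on-disk superblock needs rewriting */
};

static void set_flag(atomic_uint *word, unsigned int bit)
{
	atomic_fetch_or(word, bit);
}

static bool test_and_clear(atomic_uint *word, unsigned int bit)
{
	return atomic_fetch_and(word, ~bit) & bit;  /* old value of the bit */
}

int main(void)
{
	struct mddev_like md = { 0, 0 };

	set_flag(&md.sb_flags, MD_SB_CHANGE_DEVS);  /* device list changed */
	set_flag(&md.flags, MD_FAILFAST_SUPPORTED); /* unrelated array property */

	/* The superblock writer consumes only sb_flags; flags is left alone. */
	if (test_and_clear(&md.sb_flags, MD_SB_CHANGE_DEVS))
		printf("would write updated superblocks now\n");
	printf("flags=%#x sb_flags=%#x\n",
	       atomic_load(&md.flags), atomic_load(&md.sb_flags));
	return 0;
}
```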
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 6d53810963f7..953159d9a825 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
| @@ -2011,7 +2011,7 @@ static int super_load(struct md_rdev *rdev, struct md_rdev *refdev) | |||
| 2011 | sb->compat_features = cpu_to_le32(FEATURE_FLAG_SUPPORTS_V190); | 2011 | sb->compat_features = cpu_to_le32(FEATURE_FLAG_SUPPORTS_V190); |
| 2012 | 2012 | ||
| 2013 | /* Force writing of superblocks to disk */ | 2013 | /* Force writing of superblocks to disk */ |
| 2014 | set_bit(MD_CHANGE_DEVS, &rdev->mddev->flags); | 2014 | set_bit(MD_SB_CHANGE_DEVS, &rdev->mddev->sb_flags); |
| 2015 | 2015 | ||
| 2016 | /* Any superblock is better than none, choose that if given */ | 2016 | /* Any superblock is better than none, choose that if given */ |
| 2017 | return refdev ? 0 : 1; | 2017 | return refdev ? 0 : 1; |
| @@ -3497,7 +3497,7 @@ static void rs_update_sbs(struct raid_set *rs) | |||
| 3497 | struct mddev *mddev = &rs->md; | 3497 | struct mddev *mddev = &rs->md; |
| 3498 | int ro = mddev->ro; | 3498 | int ro = mddev->ro; |
| 3499 | 3499 | ||
| 3500 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | 3500 | set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); |
| 3501 | mddev->ro = 0; | 3501 | mddev->ro = 0; |
| 3502 | md_update_sb(mddev, 1); | 3502 | md_update_sb(mddev, 1); |
| 3503 | mddev->ro = ro; | 3503 | mddev->ro = ro; |
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 86f5d435901d..5975c9915684 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
| @@ -21,6 +21,7 @@ | |||
| 21 | #include <linux/seq_file.h> | 21 | #include <linux/seq_file.h> |
| 22 | #include <linux/module.h> | 22 | #include <linux/module.h> |
| 23 | #include <linux/slab.h> | 23 | #include <linux/slab.h> |
| 24 | #include <trace/events/block.h> | ||
| 24 | #include "md.h" | 25 | #include "md.h" |
| 25 | #include "linear.h" | 26 | #include "linear.h" |
| 26 | 27 | ||
| @@ -101,8 +102,8 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks) | |||
| 101 | sector_t sectors; | 102 | sector_t sectors; |
| 102 | 103 | ||
| 103 | if (j < 0 || j >= raid_disks || disk->rdev) { | 104 | if (j < 0 || j >= raid_disks || disk->rdev) { |
| 104 | printk(KERN_ERR "md/linear:%s: disk numbering problem. Aborting!\n", | 105 | pr_warn("md/linear:%s: disk numbering problem. Aborting!\n", |
| 105 | mdname(mddev)); | 106 | mdname(mddev)); |
| 106 | goto out; | 107 | goto out; |
| 107 | } | 108 | } |
| 108 | 109 | ||
| @@ -123,8 +124,8 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks) | |||
| 123 | discard_supported = true; | 124 | discard_supported = true; |
| 124 | } | 125 | } |
| 125 | if (cnt != raid_disks) { | 126 | if (cnt != raid_disks) { |
| 126 | printk(KERN_ERR "md/linear:%s: not enough drives present. Aborting!\n", | 127 | pr_warn("md/linear:%s: not enough drives present. Aborting!\n", |
| 127 | mdname(mddev)); | 128 | mdname(mddev)); |
| 128 | goto out; | 129 | goto out; |
| 129 | } | 130 | } |
| 130 | 131 | ||
| @@ -227,22 +228,22 @@ static void linear_make_request(struct mddev *mddev, struct bio *bio) | |||
| 227 | } | 228 | } |
| 228 | 229 | ||
| 229 | do { | 230 | do { |
| 230 | tmp_dev = which_dev(mddev, bio->bi_iter.bi_sector); | 231 | sector_t bio_sector = bio->bi_iter.bi_sector; |
| 232 | tmp_dev = which_dev(mddev, bio_sector); | ||
| 231 | start_sector = tmp_dev->end_sector - tmp_dev->rdev->sectors; | 233 | start_sector = tmp_dev->end_sector - tmp_dev->rdev->sectors; |
| 232 | end_sector = tmp_dev->end_sector; | 234 | end_sector = tmp_dev->end_sector; |
| 233 | data_offset = tmp_dev->rdev->data_offset; | 235 | data_offset = tmp_dev->rdev->data_offset; |
| 234 | bio->bi_bdev = tmp_dev->rdev->bdev; | 236 | bio->bi_bdev = tmp_dev->rdev->bdev; |
| 235 | 237 | ||
| 236 | if (unlikely(bio->bi_iter.bi_sector >= end_sector || | 238 | if (unlikely(bio_sector >= end_sector || |
| 237 | bio->bi_iter.bi_sector < start_sector)) | 239 | bio_sector < start_sector)) |
| 238 | goto out_of_bounds; | 240 | goto out_of_bounds; |
| 239 | 241 | ||
| 240 | if (unlikely(bio_end_sector(bio) > end_sector)) { | 242 | if (unlikely(bio_end_sector(bio) > end_sector)) { |
| 241 | /* This bio crosses a device boundary, so we have to | 243 | /* This bio crosses a device boundary, so we have to |
| 242 | * split it. | 244 | * split it. |
| 243 | */ | 245 | */ |
| 244 | split = bio_split(bio, end_sector - | 246 | split = bio_split(bio, end_sector - bio_sector, |
| 245 | bio->bi_iter.bi_sector, | ||
| 246 | GFP_NOIO, fs_bio_set); | 247 | GFP_NOIO, fs_bio_set); |
| 247 | bio_chain(split, bio); | 248 | bio_chain(split, bio); |
| 248 | } else { | 249 | } else { |
| @@ -256,15 +257,18 @@ static void linear_make_request(struct mddev *mddev, struct bio *bio) | |||
| 256 | !blk_queue_discard(bdev_get_queue(split->bi_bdev)))) { | 257 | !blk_queue_discard(bdev_get_queue(split->bi_bdev)))) { |
| 257 | /* Just ignore it */ | 258 | /* Just ignore it */ |
| 258 | bio_endio(split); | 259 | bio_endio(split); |
| 259 | } else | 260 | } else { |
| 261 | if (mddev->gendisk) | ||
| 262 | trace_block_bio_remap(bdev_get_queue(split->bi_bdev), | ||
| 263 | split, disk_devt(mddev->gendisk), | ||
| 264 | bio_sector); | ||
| 260 | generic_make_request(split); | 265 | generic_make_request(split); |
| 266 | } | ||
| 261 | } while (split != bio); | 267 | } while (split != bio); |
| 262 | return; | 268 | return; |
| 263 | 269 | ||
| 264 | out_of_bounds: | 270 | out_of_bounds: |
| 265 | printk(KERN_ERR | 271 | pr_err("md/linear:%s: make_request: Sector %llu out of bounds on dev %s: %llu sectors, offset %llu\n", |
| 266 | "md/linear:%s: make_request: Sector %llu out of bounds on " | ||
| 267 | "dev %s: %llu sectors, offset %llu\n", | ||
| 268 | mdname(mddev), | 272 | mdname(mddev), |
| 269 | (unsigned long long)bio->bi_iter.bi_sector, | 273 | (unsigned long long)bio->bi_iter.bi_sector, |
| 270 | bdevname(tmp_dev->rdev->bdev, b), | 274 | bdevname(tmp_dev->rdev->bdev, b), |
| @@ -275,7 +279,6 @@ out_of_bounds: | |||
| 275 | 279 | ||
| 276 | static void linear_status (struct seq_file *seq, struct mddev *mddev) | 280 | static void linear_status (struct seq_file *seq, struct mddev *mddev) |
| 277 | { | 281 | { |
| 278 | |||
| 279 | seq_printf(seq, " %dk rounding", mddev->chunk_sectors / 2); | 282 | seq_printf(seq, " %dk rounding", mddev->chunk_sectors / 2); |
| 280 | } | 283 | } |
| 281 | 284 | ||
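In linear_make_request() above, bi_sector is cached in bio_sector before bio_split() is called, because splitting advances the parent bio's iterator past the front piece; the cached value is then reused both for the split length and for the new trace_block_bio_remap() call. The toy program below only shows the boundary arithmetic (end_sector - bio_sector) with plain integers; the device range and bio size are made-up numbers.

```c
#include <stdio.h>
#include <stdint.h>

typedef uint64_t sector_t;

/* Sectors of a request starting at bio_sector that still fit on a member device
 * ending at end_sector -- the split length used in the hunk above. */
static sector_t sectors_until_boundary(sector_t bio_sector, sector_t end_sector)
{
	return end_sector - bio_sector;
}

int main(void)
{
	sector_t end_sector = 2000;                   /* member device covers [1000, 2000) */
	sector_t bio_sector = 1990, bio_sectors = 64; /* 64-sector bio crossing the boundary */

	if (bio_sector + bio_sectors > end_sector) {
		sector_t first = sectors_until_boundary(bio_sector, end_sector);

		printf("split: %llu sectors stay on this device, %llu continue on the next\n",
		       (unsigned long long)first,
		       (unsigned long long)(bio_sectors - first));
	}
	return 0;
}
```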
diff --git a/drivers/md/md.c b/drivers/md/md.c
index f975cd08923d..82821ee0d57f 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
| @@ -30,6 +30,18 @@ | |||
| 30 | You should have received a copy of the GNU General Public License | 30 | You should have received a copy of the GNU General Public License |
| 31 | (for example /usr/src/linux/COPYING); if not, write to the Free | 31 | (for example /usr/src/linux/COPYING); if not, write to the Free |
| 32 | Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | 32 | Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. |
| 33 | |||
| 34 | Errors, Warnings, etc. | ||
| 35 | Please use: | ||
| 36 | pr_crit() for error conditions that risk data loss | ||
| 37 | pr_err() for error conditions that are unexpected, like an IO error | ||
| 38 | or internal inconsistency | ||
| 39 | pr_warn() for error conditions that could have been predicted, like | ||
| 40 | adding a device to an array when it has incompatible metadata | ||
| 41 | pr_info() for interesting, very rare events, like an array starting | ||
| 42 | or stopping, or resync starting or stopping | ||
| 43 | pr_debug() for everything else. | ||
| 44 | |||
| 33 | */ | 45 | */ |
| 34 | 46 | ||
| 35 | #include <linux/kthread.h> | 47 | #include <linux/kthread.h> |
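The header comment added above sets out which log level each kind of md message should use, and the printk()-to-pr_*() conversions throughout the rest of the series follow it. Purely to illustrate the convention, the snippet below uses userspace printf stand-ins for the pr_* macros; the message texts are examples (two of them echo real messages from this diff), not a fixed set.

```c
#include <stdio.h>

/* Userspace stand-ins so the convention can be shown outside the kernel. */
#define pr_crit(fmt, ...)  fprintf(stderr, "CRIT: "  fmt, ##__VA_ARGS__)
#define pr_err(fmt, ...)   fprintf(stderr, "ERR: "   fmt, ##__VA_ARGS__)
#define pr_warn(fmt, ...)  fprintf(stderr, "WARN: "  fmt, ##__VA_ARGS__)
#define pr_info(fmt, ...)  fprintf(stderr, "INFO: "  fmt, ##__VA_ARGS__)
#define pr_debug(fmt, ...) fprintf(stderr, "DEBUG: " fmt, ##__VA_ARGS__)

int main(void)
{
	pr_crit("md: array failure, data loss possible\n");   /* risks data loss */
	pr_err("md: super_written gets error=%d\n", -5);      /* unexpected IO error */
	pr_warn("md: device has incompatible metadata\n");    /* predictable user error */
	pr_info("md: resync started\n");                      /* rare, interesting event */
	pr_debug("md: bind<sda1>\n");                         /* everything else */
	return 0;
}
```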
| @@ -52,6 +64,7 @@ | |||
| 52 | #include <linux/raid/md_p.h> | 64 | #include <linux/raid/md_p.h> |
| 53 | #include <linux/raid/md_u.h> | 65 | #include <linux/raid/md_u.h> |
| 54 | #include <linux/slab.h> | 66 | #include <linux/slab.h> |
| 67 | #include <trace/events/block.h> | ||
| 55 | #include "md.h" | 68 | #include "md.h" |
| 56 | #include "bitmap.h" | 69 | #include "bitmap.h" |
| 57 | #include "md-cluster.h" | 70 | #include "md-cluster.h" |
| @@ -684,11 +697,8 @@ static inline sector_t calc_dev_sboffset(struct md_rdev *rdev) | |||
| 684 | static int alloc_disk_sb(struct md_rdev *rdev) | 697 | static int alloc_disk_sb(struct md_rdev *rdev) |
| 685 | { | 698 | { |
| 686 | rdev->sb_page = alloc_page(GFP_KERNEL); | 699 | rdev->sb_page = alloc_page(GFP_KERNEL); |
| 687 | if (!rdev->sb_page) { | 700 | if (!rdev->sb_page) |
| 688 | printk(KERN_ALERT "md: out of memory.\n"); | ||
| 689 | return -ENOMEM; | 701 | return -ENOMEM; |
| 690 | } | ||
| 691 | |||
| 692 | return 0; | 702 | return 0; |
| 693 | } | 703 | } |
| 694 | 704 | ||
| @@ -715,9 +725,15 @@ static void super_written(struct bio *bio) | |||
| 715 | struct mddev *mddev = rdev->mddev; | 725 | struct mddev *mddev = rdev->mddev; |
| 716 | 726 | ||
| 717 | if (bio->bi_error) { | 727 | if (bio->bi_error) { |
| 718 | printk("md: super_written gets error=%d\n", bio->bi_error); | 728 | pr_err("md: super_written gets error=%d\n", bio->bi_error); |
| 719 | md_error(mddev, rdev); | 729 | md_error(mddev, rdev); |
| 720 | } | 730 | if (!test_bit(Faulty, &rdev->flags) |
| 731 | && (bio->bi_opf & MD_FAILFAST)) { | ||
| 732 | set_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags); | ||
| 733 | set_bit(LastDev, &rdev->flags); | ||
| 734 | } | ||
| 735 | } else | ||
| 736 | clear_bit(LastDev, &rdev->flags); | ||
| 721 | 737 | ||
| 722 | if (atomic_dec_and_test(&mddev->pending_writes)) | 738 | if (atomic_dec_and_test(&mddev->pending_writes)) |
| 723 | wake_up(&mddev->sb_wait); | 739 | wake_up(&mddev->sb_wait); |
| @@ -734,7 +750,13 @@ void md_super_write(struct mddev *mddev, struct md_rdev *rdev, | |||
| 734 | * if zero is reached. | 750 | * if zero is reached. |
| 735 | * If an error occurred, call md_error | 751 | * If an error occurred, call md_error |
| 736 | */ | 752 | */ |
| 737 | struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, mddev); | 753 | struct bio *bio; |
| 754 | int ff = 0; | ||
| 755 | |||
| 756 | if (test_bit(Faulty, &rdev->flags)) | ||
| 757 | return; | ||
| 758 | |||
| 759 | bio = bio_alloc_mddev(GFP_NOIO, 1, mddev); | ||
| 738 | 760 | ||
| 739 | atomic_inc(&rdev->nr_pending); | 761 | atomic_inc(&rdev->nr_pending); |
| 740 | 762 | ||
| @@ -743,16 +765,24 @@ void md_super_write(struct mddev *mddev, struct md_rdev *rdev, | |||
| 743 | bio_add_page(bio, page, size, 0); | 765 | bio_add_page(bio, page, size, 0); |
| 744 | bio->bi_private = rdev; | 766 | bio->bi_private = rdev; |
| 745 | bio->bi_end_io = super_written; | 767 | bio->bi_end_io = super_written; |
| 746 | bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_FUA; | 768 | |
| 769 | if (test_bit(MD_FAILFAST_SUPPORTED, &mddev->flags) && | ||
| 770 | test_bit(FailFast, &rdev->flags) && | ||
| 771 | !test_bit(LastDev, &rdev->flags)) | ||
| 772 | ff = MD_FAILFAST; | ||
| 773 | bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_FUA | ff; | ||
| 747 | 774 | ||
| 748 | atomic_inc(&mddev->pending_writes); | 775 | atomic_inc(&mddev->pending_writes); |
| 749 | submit_bio(bio); | 776 | submit_bio(bio); |
| 750 | } | 777 | } |
| 751 | 778 | ||
| 752 | void md_super_wait(struct mddev *mddev) | 779 | int md_super_wait(struct mddev *mddev) |
| 753 | { | 780 | { |
| 754 | /* wait for all superblock writes that were scheduled to complete */ | 781 | /* wait for all superblock writes that were scheduled to complete */ |
| 755 | wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes)==0); | 782 | wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes)==0); |
| 783 | if (test_and_clear_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags)) | ||
| 784 | return -EAGAIN; | ||
| 785 | return 0; | ||
| 756 | } | 786 | } |
| 757 | 787 | ||
| 758 | int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, | 788 | int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, |
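Two cooperating changes appear in the hunks above: md_super_write() can now mark a superblock write MD_FAILFAST (when the array supports failfast, the device has FailFast set, and it is not the last working device), and md_super_wait() returns -EAGAIN when such a write failed and MD_SB_NEED_REWRITE was set, so the caller can issue the write again; later hunks wrap the rdev size-change and md_update_sb() paths in exactly that retry loop. The sketch below is a heavily simplified userspace rendering of the retry contract: super_write(), super_wait() and the explicit failfast boolean are illustrative, and in the kernel the retry is steered by the LastDev rdev flag rather than a local variable.

```c
#include <stdbool.h>
#include <stdio.h>

static bool need_rewrite;   /* analogue of MD_SB_NEED_REWRITE */
static int attempts;

/* Pretend write: the first, failfast attempt fails; the plain retry succeeds. */
static void super_write(bool failfast)
{
	attempts++;
	if (failfast && attempts == 1) {
		need_rewrite = true;   /* completion handler asks for a rewrite */
		printf("attempt %d (failfast): failed\n", attempts);
	} else {
		printf("attempt %d (%s): ok\n", attempts, failfast ? "failfast" : "normal");
	}
}

/* Analogue of md_super_wait(): negative when the write must be redone. */
static int super_wait(void)
{
	if (need_rewrite) {
		need_rewrite = false;
		return -1;             /* the kernel returns -EAGAIN here */
	}
	return 0;
}

int main(void)
{
	bool failfast = true;

	do {
		super_write(failfast);
		failfast = false;      /* the retry goes out without the failfast hint */
	} while (super_wait() < 0);
	return 0;
}
```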
| @@ -795,8 +825,8 @@ static int read_disk_sb(struct md_rdev *rdev, int size) | |||
| 795 | return 0; | 825 | return 0; |
| 796 | 826 | ||
| 797 | fail: | 827 | fail: |
| 798 | printk(KERN_WARNING "md: disabled device %s, could not read superblock.\n", | 828 | pr_err("md: disabled device %s, could not read superblock.\n", |
| 799 | bdevname(rdev->bdev,b)); | 829 | bdevname(rdev->bdev,b)); |
| 800 | return -EINVAL; | 830 | return -EINVAL; |
| 801 | } | 831 | } |
| 802 | 832 | ||
| @@ -818,7 +848,6 @@ static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2) | |||
| 818 | 848 | ||
| 819 | if (!tmp1 || !tmp2) { | 849 | if (!tmp1 || !tmp2) { |
| 820 | ret = 0; | 850 | ret = 0; |
| 821 | printk(KERN_INFO "md.c sb_equal(): failed to allocate memory!\n"); | ||
| 822 | goto abort; | 851 | goto abort; |
| 823 | } | 852 | } |
| 824 | 853 | ||
| @@ -932,7 +961,7 @@ int md_check_no_bitmap(struct mddev *mddev) | |||
| 932 | { | 961 | { |
| 933 | if (!mddev->bitmap_info.file && !mddev->bitmap_info.offset) | 962 | if (!mddev->bitmap_info.file && !mddev->bitmap_info.offset) |
| 934 | return 0; | 963 | return 0; |
| 935 | printk(KERN_ERR "%s: bitmaps are not supported for %s\n", | 964 | pr_warn("%s: bitmaps are not supported for %s\n", |
| 936 | mdname(mddev), mddev->pers->name); | 965 | mdname(mddev), mddev->pers->name); |
| 937 | return 1; | 966 | return 1; |
| 938 | } | 967 | } |
| @@ -956,7 +985,8 @@ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor | |||
| 956 | rdev->sb_start = calc_dev_sboffset(rdev); | 985 | rdev->sb_start = calc_dev_sboffset(rdev); |
| 957 | 986 | ||
| 958 | ret = read_disk_sb(rdev, MD_SB_BYTES); | 987 | ret = read_disk_sb(rdev, MD_SB_BYTES); |
| 959 | if (ret) return ret; | 988 | if (ret) |
| 989 | return ret; | ||
| 960 | 990 | ||
| 961 | ret = -EINVAL; | 991 | ret = -EINVAL; |
| 962 | 992 | ||
| @@ -964,17 +994,15 @@ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor | |||
| 964 | sb = page_address(rdev->sb_page); | 994 | sb = page_address(rdev->sb_page); |
| 965 | 995 | ||
| 966 | if (sb->md_magic != MD_SB_MAGIC) { | 996 | if (sb->md_magic != MD_SB_MAGIC) { |
| 967 | printk(KERN_ERR "md: invalid raid superblock magic on %s\n", | 997 | pr_warn("md: invalid raid superblock magic on %s\n", b); |
| 968 | b); | ||
| 969 | goto abort; | 998 | goto abort; |
| 970 | } | 999 | } |
| 971 | 1000 | ||
| 972 | if (sb->major_version != 0 || | 1001 | if (sb->major_version != 0 || |
| 973 | sb->minor_version < 90 || | 1002 | sb->minor_version < 90 || |
| 974 | sb->minor_version > 91) { | 1003 | sb->minor_version > 91) { |
| 975 | printk(KERN_WARNING "Bad version number %d.%d on %s\n", | 1004 | pr_warn("Bad version number %d.%d on %s\n", |
| 976 | sb->major_version, sb->minor_version, | 1005 | sb->major_version, sb->minor_version, b); |
| 977 | b); | ||
| 978 | goto abort; | 1006 | goto abort; |
| 979 | } | 1007 | } |
| 980 | 1008 | ||
| @@ -982,8 +1010,7 @@ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor | |||
| 982 | goto abort; | 1010 | goto abort; |
| 983 | 1011 | ||
| 984 | if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum)) { | 1012 | if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum)) { |
| 985 | printk(KERN_WARNING "md: invalid superblock checksum on %s\n", | 1013 | pr_warn("md: invalid superblock checksum on %s\n", b); |
| 986 | b); | ||
| 987 | goto abort; | 1014 | goto abort; |
| 988 | } | 1015 | } |
| 989 | 1016 | ||
| @@ -1004,14 +1031,13 @@ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor | |||
| 1004 | __u64 ev1, ev2; | 1031 | __u64 ev1, ev2; |
| 1005 | mdp_super_t *refsb = page_address(refdev->sb_page); | 1032 | mdp_super_t *refsb = page_address(refdev->sb_page); |
| 1006 | if (!uuid_equal(refsb, sb)) { | 1033 | if (!uuid_equal(refsb, sb)) { |
| 1007 | printk(KERN_WARNING "md: %s has different UUID to %s\n", | 1034 | pr_warn("md: %s has different UUID to %s\n", |
| 1008 | b, bdevname(refdev->bdev,b2)); | 1035 | b, bdevname(refdev->bdev,b2)); |
| 1009 | goto abort; | 1036 | goto abort; |
| 1010 | } | 1037 | } |
| 1011 | if (!sb_equal(refsb, sb)) { | 1038 | if (!sb_equal(refsb, sb)) { |
| 1012 | printk(KERN_WARNING "md: %s has same UUID" | 1039 | pr_warn("md: %s has same UUID but different superblock to %s\n", |
| 1013 | " but different superblock to %s\n", | 1040 | b, bdevname(refdev->bdev, b2)); |
| 1014 | b, bdevname(refdev->bdev, b2)); | ||
| 1015 | goto abort; | 1041 | goto abort; |
| 1016 | } | 1042 | } |
| 1017 | ev1 = md_event(sb); | 1043 | ev1 = md_event(sb); |
| @@ -1158,6 +1184,8 @@ static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev) | |||
| 1158 | } | 1184 | } |
| 1159 | if (desc->state & (1<<MD_DISK_WRITEMOSTLY)) | 1185 | if (desc->state & (1<<MD_DISK_WRITEMOSTLY)) |
| 1160 | set_bit(WriteMostly, &rdev->flags); | 1186 | set_bit(WriteMostly, &rdev->flags); |
| 1187 | if (desc->state & (1<<MD_DISK_FAILFAST)) | ||
| 1188 | set_bit(FailFast, &rdev->flags); | ||
| 1161 | } else /* MULTIPATH are always insync */ | 1189 | } else /* MULTIPATH are always insync */ |
| 1162 | set_bit(In_sync, &rdev->flags); | 1190 | set_bit(In_sync, &rdev->flags); |
| 1163 | return 0; | 1191 | return 0; |
| @@ -1283,6 +1311,8 @@ static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev) | |||
| 1283 | } | 1311 | } |
| 1284 | if (test_bit(WriteMostly, &rdev2->flags)) | 1312 | if (test_bit(WriteMostly, &rdev2->flags)) |
| 1285 | d->state |= (1<<MD_DISK_WRITEMOSTLY); | 1313 | d->state |= (1<<MD_DISK_WRITEMOSTLY); |
| 1314 | if (test_bit(FailFast, &rdev2->flags)) | ||
| 1315 | d->state |= (1<<MD_DISK_FAILFAST); | ||
| 1286 | } | 1316 | } |
| 1287 | /* now set the "removed" and "faulty" bits on any missing devices */ | 1317 | /* now set the "removed" and "faulty" bits on any missing devices */ |
| 1288 | for (i=0 ; i < mddev->raid_disks ; i++) { | 1318 | for (i=0 ; i < mddev->raid_disks ; i++) { |
| @@ -1324,9 +1354,10 @@ super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) | |||
| 1324 | if (IS_ENABLED(CONFIG_LBDAF) && (u64)num_sectors >= (2ULL << 32) && | 1354 | if (IS_ENABLED(CONFIG_LBDAF) && (u64)num_sectors >= (2ULL << 32) && |
| 1325 | rdev->mddev->level >= 1) | 1355 | rdev->mddev->level >= 1) |
| 1326 | num_sectors = (sector_t)(2ULL << 32) - 2; | 1356 | num_sectors = (sector_t)(2ULL << 32) - 2; |
| 1327 | md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, | 1357 | do { |
| 1358 | md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, | ||
| 1328 | rdev->sb_page); | 1359 | rdev->sb_page); |
| 1329 | md_super_wait(rdev->mddev); | 1360 | } while (md_super_wait(rdev->mddev) < 0); |
| 1330 | return num_sectors; | 1361 | return num_sectors; |
| 1331 | } | 1362 | } |
| 1332 | 1363 | ||
| @@ -1413,13 +1444,13 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_ | |||
| 1413 | return -EINVAL; | 1444 | return -EINVAL; |
| 1414 | 1445 | ||
| 1415 | if (calc_sb_1_csum(sb) != sb->sb_csum) { | 1446 | if (calc_sb_1_csum(sb) != sb->sb_csum) { |
| 1416 | printk("md: invalid superblock checksum on %s\n", | 1447 | pr_warn("md: invalid superblock checksum on %s\n", |
| 1417 | bdevname(rdev->bdev,b)); | 1448 | bdevname(rdev->bdev,b)); |
| 1418 | return -EINVAL; | 1449 | return -EINVAL; |
| 1419 | } | 1450 | } |
| 1420 | if (le64_to_cpu(sb->data_size) < 10) { | 1451 | if (le64_to_cpu(sb->data_size) < 10) { |
| 1421 | printk("md: data_size too small on %s\n", | 1452 | pr_warn("md: data_size too small on %s\n", |
| 1422 | bdevname(rdev->bdev,b)); | 1453 | bdevname(rdev->bdev,b)); |
| 1423 | return -EINVAL; | 1454 | return -EINVAL; |
| 1424 | } | 1455 | } |
| 1425 | if (sb->pad0 || | 1456 | if (sb->pad0 || |
| @@ -1503,8 +1534,7 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_ | |||
| 1503 | sb->level != refsb->level || | 1534 | sb->level != refsb->level || |
| 1504 | sb->layout != refsb->layout || | 1535 | sb->layout != refsb->layout || |
| 1505 | sb->chunksize != refsb->chunksize) { | 1536 | sb->chunksize != refsb->chunksize) { |
| 1506 | printk(KERN_WARNING "md: %s has strangely different" | 1537 | pr_warn("md: %s has strangely different superblock to %s\n", |
| 1507 | " superblock to %s\n", | ||
| 1508 | bdevname(rdev->bdev,b), | 1538 | bdevname(rdev->bdev,b), |
| 1509 | bdevname(refdev->bdev,b2)); | 1539 | bdevname(refdev->bdev,b2)); |
| 1510 | return -EINVAL; | 1540 | return -EINVAL; |
| @@ -1646,8 +1676,7 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) | |||
| 1646 | case MD_DISK_ROLE_JOURNAL: /* journal device */ | 1676 | case MD_DISK_ROLE_JOURNAL: /* journal device */ |
| 1647 | if (!(le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)) { | 1677 | if (!(le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)) { |
| 1648 | /* journal device without journal feature */ | 1678 | /* journal device without journal feature */ |
| 1649 | printk(KERN_WARNING | 1679 | pr_warn("md: journal device provided without journal feature, ignoring the device\n"); |
| 1650 | "md: journal device provided without journal feature, ignoring the device\n"); | ||
| 1651 | return -EINVAL; | 1680 | return -EINVAL; |
| 1652 | } | 1681 | } |
| 1653 | set_bit(Journal, &rdev->flags); | 1682 | set_bit(Journal, &rdev->flags); |
| @@ -1669,6 +1698,8 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) | |||
| 1669 | } | 1698 | } |
| 1670 | if (sb->devflags & WriteMostly1) | 1699 | if (sb->devflags & WriteMostly1) |
| 1671 | set_bit(WriteMostly, &rdev->flags); | 1700 | set_bit(WriteMostly, &rdev->flags); |
| 1701 | if (sb->devflags & FailFast1) | ||
| 1702 | set_bit(FailFast, &rdev->flags); | ||
| 1672 | if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT) | 1703 | if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT) |
| 1673 | set_bit(Replacement, &rdev->flags); | 1704 | set_bit(Replacement, &rdev->flags); |
| 1674 | } else /* MULTIPATH are always insync */ | 1705 | } else /* MULTIPATH are always insync */ |
| @@ -1707,6 +1738,10 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev) | |||
| 1707 | sb->chunksize = cpu_to_le32(mddev->chunk_sectors); | 1738 | sb->chunksize = cpu_to_le32(mddev->chunk_sectors); |
| 1708 | sb->level = cpu_to_le32(mddev->level); | 1739 | sb->level = cpu_to_le32(mddev->level); |
| 1709 | sb->layout = cpu_to_le32(mddev->layout); | 1740 | sb->layout = cpu_to_le32(mddev->layout); |
| 1741 | if (test_bit(FailFast, &rdev->flags)) | ||
| 1742 | sb->devflags |= FailFast1; | ||
| 1743 | else | ||
| 1744 | sb->devflags &= ~FailFast1; | ||
| 1710 | 1745 | ||
| 1711 | if (test_bit(WriteMostly, &rdev->flags)) | 1746 | if (test_bit(WriteMostly, &rdev->flags)) |
| 1712 | sb->devflags |= WriteMostly1; | 1747 | sb->devflags |= WriteMostly1; |
| @@ -1863,9 +1898,10 @@ super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) | |||
| 1863 | sb->data_size = cpu_to_le64(num_sectors); | 1898 | sb->data_size = cpu_to_le64(num_sectors); |
| 1864 | sb->super_offset = rdev->sb_start; | 1899 | sb->super_offset = rdev->sb_start; |
| 1865 | sb->sb_csum = calc_sb_1_csum(sb); | 1900 | sb->sb_csum = calc_sb_1_csum(sb); |
| 1866 | md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, | 1901 | do { |
| 1867 | rdev->sb_page); | 1902 | md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, |
| 1868 | md_super_wait(rdev->mddev); | 1903 | rdev->sb_page); |
| 1904 | } while (md_super_wait(rdev->mddev) < 0); | ||
| 1869 | return num_sectors; | 1905 | return num_sectors; |
| 1870 | 1906 | ||
| 1871 | } | 1907 | } |
| @@ -2004,9 +2040,9 @@ int md_integrity_register(struct mddev *mddev) | |||
| 2004 | blk_integrity_register(mddev->gendisk, | 2040 | blk_integrity_register(mddev->gendisk, |
| 2005 | bdev_get_integrity(reference->bdev)); | 2041 | bdev_get_integrity(reference->bdev)); |
| 2006 | 2042 | ||
| 2007 | printk(KERN_NOTICE "md: data integrity enabled on %s\n", mdname(mddev)); | 2043 | pr_debug("md: data integrity enabled on %s\n", mdname(mddev)); |
| 2008 | if (bioset_integrity_create(mddev->bio_set, BIO_POOL_SIZE)) { | 2044 | if (bioset_integrity_create(mddev->bio_set, BIO_POOL_SIZE)) { |
| 2009 | printk(KERN_ERR "md: failed to create integrity pool for %s\n", | 2045 | pr_err("md: failed to create integrity pool for %s\n", |
| 2010 | mdname(mddev)); | 2046 | mdname(mddev)); |
| 2011 | return -EINVAL; | 2047 | return -EINVAL; |
| 2012 | } | 2048 | } |
| @@ -2034,8 +2070,8 @@ int md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev) | |||
| 2034 | return 0; | 2070 | return 0; |
| 2035 | 2071 | ||
| 2036 | if (blk_integrity_compare(mddev->gendisk, rdev->bdev->bd_disk) != 0) { | 2072 | if (blk_integrity_compare(mddev->gendisk, rdev->bdev->bd_disk) != 0) { |
| 2037 | printk(KERN_NOTICE "%s: incompatible integrity profile for %s\n", | 2073 | pr_err("%s: incompatible integrity profile for %s\n", |
| 2038 | mdname(mddev), bdevname(rdev->bdev, name)); | 2074 | mdname(mddev), bdevname(rdev->bdev, name)); |
| 2039 | return -ENXIO; | 2075 | return -ENXIO; |
| 2040 | } | 2076 | } |
| 2041 | 2077 | ||
| @@ -2089,15 +2125,15 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev) | |||
| 2089 | rcu_read_unlock(); | 2125 | rcu_read_unlock(); |
| 2090 | if (!test_bit(Journal, &rdev->flags) && | 2126 | if (!test_bit(Journal, &rdev->flags) && |
| 2091 | mddev->max_disks && rdev->desc_nr >= mddev->max_disks) { | 2127 | mddev->max_disks && rdev->desc_nr >= mddev->max_disks) { |
| 2092 | printk(KERN_WARNING "md: %s: array is limited to %d devices\n", | 2128 | pr_warn("md: %s: array is limited to %d devices\n", |
| 2093 | mdname(mddev), mddev->max_disks); | 2129 | mdname(mddev), mddev->max_disks); |
| 2094 | return -EBUSY; | 2130 | return -EBUSY; |
| 2095 | } | 2131 | } |
| 2096 | bdevname(rdev->bdev,b); | 2132 | bdevname(rdev->bdev,b); |
| 2097 | strreplace(b, '/', '!'); | 2133 | strreplace(b, '/', '!'); |
| 2098 | 2134 | ||
| 2099 | rdev->mddev = mddev; | 2135 | rdev->mddev = mddev; |
| 2100 | printk(KERN_INFO "md: bind<%s>\n", b); | 2136 | pr_debug("md: bind<%s>\n", b); |
| 2101 | 2137 | ||
| 2102 | if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b))) | 2138 | if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b))) |
| 2103 | goto fail; | 2139 | goto fail; |
| @@ -2116,8 +2152,8 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev) | |||
| 2116 | return 0; | 2152 | return 0; |
| 2117 | 2153 | ||
| 2118 | fail: | 2154 | fail: |
| 2119 | printk(KERN_WARNING "md: failed to register dev-%s for %s\n", | 2155 | pr_warn("md: failed to register dev-%s for %s\n", |
| 2120 | b, mdname(mddev)); | 2156 | b, mdname(mddev)); |
| 2121 | return err; | 2157 | return err; |
| 2122 | } | 2158 | } |
| 2123 | 2159 | ||
| @@ -2134,7 +2170,7 @@ static void unbind_rdev_from_array(struct md_rdev *rdev) | |||
| 2134 | 2170 | ||
| 2135 | bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk); | 2171 | bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk); |
| 2136 | list_del_rcu(&rdev->same_set); | 2172 | list_del_rcu(&rdev->same_set); |
| 2137 | printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b)); | 2173 | pr_debug("md: unbind<%s>\n", bdevname(rdev->bdev,b)); |
| 2138 | rdev->mddev = NULL; | 2174 | rdev->mddev = NULL; |
| 2139 | sysfs_remove_link(&rdev->kobj, "block"); | 2175 | sysfs_remove_link(&rdev->kobj, "block"); |
| 2140 | sysfs_put(rdev->sysfs_state); | 2176 | sysfs_put(rdev->sysfs_state); |
| @@ -2164,8 +2200,7 @@ static int lock_rdev(struct md_rdev *rdev, dev_t dev, int shared) | |||
| 2164 | bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, | 2200 | bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, |
| 2165 | shared ? (struct md_rdev *)lock_rdev : rdev); | 2201 | shared ? (struct md_rdev *)lock_rdev : rdev); |
| 2166 | if (IS_ERR(bdev)) { | 2202 | if (IS_ERR(bdev)) { |
| 2167 | printk(KERN_ERR "md: could not open %s.\n", | 2203 | pr_warn("md: could not open %s.\n", __bdevname(dev, b)); |
| 2168 | __bdevname(dev, b)); | ||
| 2169 | return PTR_ERR(bdev); | 2204 | return PTR_ERR(bdev); |
| 2170 | } | 2205 | } |
| 2171 | rdev->bdev = bdev; | 2206 | rdev->bdev = bdev; |
| @@ -2185,8 +2220,7 @@ static void export_rdev(struct md_rdev *rdev) | |||
| 2185 | { | 2220 | { |
| 2186 | char b[BDEVNAME_SIZE]; | 2221 | char b[BDEVNAME_SIZE]; |
| 2187 | 2222 | ||
| 2188 | printk(KERN_INFO "md: export_rdev(%s)\n", | 2223 | pr_debug("md: export_rdev(%s)\n", bdevname(rdev->bdev,b)); |
| 2189 | bdevname(rdev->bdev,b)); | ||
| 2190 | md_rdev_clear(rdev); | 2224 | md_rdev_clear(rdev); |
| 2191 | #ifndef MODULE | 2225 | #ifndef MODULE |
| 2192 | if (test_bit(AutoDetected, &rdev->flags)) | 2226 | if (test_bit(AutoDetected, &rdev->flags)) |
| @@ -2288,24 +2322,24 @@ void md_update_sb(struct mddev *mddev, int force_change) | |||
| 2288 | 2322 | ||
| 2289 | if (mddev->ro) { | 2323 | if (mddev->ro) { |
| 2290 | if (force_change) | 2324 | if (force_change) |
| 2291 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | 2325 | set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); |
| 2292 | return; | 2326 | return; |
| 2293 | } | 2327 | } |
| 2294 | 2328 | ||
| 2295 | repeat: | 2329 | repeat: |
| 2296 | if (mddev_is_clustered(mddev)) { | 2330 | if (mddev_is_clustered(mddev)) { |
| 2297 | if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags)) | 2331 | if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)) |
| 2298 | force_change = 1; | 2332 | force_change = 1; |
| 2299 | if (test_and_clear_bit(MD_CHANGE_CLEAN, &mddev->flags)) | 2333 | if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags)) |
| 2300 | nospares = 1; | 2334 | nospares = 1; |
| 2301 | ret = md_cluster_ops->metadata_update_start(mddev); | 2335 | ret = md_cluster_ops->metadata_update_start(mddev); |
| 2302 | /* Has someone else has updated the sb */ | 2336 | /* Has someone else has updated the sb */ |
| 2303 | if (!does_sb_need_changing(mddev)) { | 2337 | if (!does_sb_need_changing(mddev)) { |
| 2304 | if (ret == 0) | 2338 | if (ret == 0) |
| 2305 | md_cluster_ops->metadata_update_cancel(mddev); | 2339 | md_cluster_ops->metadata_update_cancel(mddev); |
| 2306 | bit_clear_unless(&mddev->flags, BIT(MD_CHANGE_PENDING), | 2340 | bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING), |
| 2307 | BIT(MD_CHANGE_DEVS) | | 2341 | BIT(MD_SB_CHANGE_DEVS) | |
| 2308 | BIT(MD_CHANGE_CLEAN)); | 2342 | BIT(MD_SB_CHANGE_CLEAN)); |
| 2309 | return; | 2343 | return; |
| 2310 | } | 2344 | } |
| 2311 | } | 2345 | } |
| @@ -2321,10 +2355,10 @@ repeat: | |||
| 2321 | 2355 | ||
| 2322 | } | 2356 | } |
| 2323 | if (!mddev->persistent) { | 2357 | if (!mddev->persistent) { |
| 2324 | clear_bit(MD_CHANGE_CLEAN, &mddev->flags); | 2358 | clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); |
| 2325 | clear_bit(MD_CHANGE_DEVS, &mddev->flags); | 2359 | clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); |
| 2326 | if (!mddev->external) { | 2360 | if (!mddev->external) { |
| 2327 | clear_bit(MD_CHANGE_PENDING, &mddev->flags); | 2361 | clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); |
| 2328 | rdev_for_each(rdev, mddev) { | 2362 | rdev_for_each(rdev, mddev) { |
| 2329 | if (rdev->badblocks.changed) { | 2363 | if (rdev->badblocks.changed) { |
| 2330 | rdev->badblocks.changed = 0; | 2364 | rdev->badblocks.changed = 0; |
| @@ -2344,9 +2378,9 @@ repeat: | |||
| 2344 | 2378 | ||
| 2345 | mddev->utime = ktime_get_real_seconds(); | 2379 | mddev->utime = ktime_get_real_seconds(); |
| 2346 | 2380 | ||
| 2347 | if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags)) | 2381 | if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)) |
| 2348 | force_change = 1; | 2382 | force_change = 1; |
| 2349 | if (test_and_clear_bit(MD_CHANGE_CLEAN, &mddev->flags)) | 2383 | if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags)) |
| 2350 | /* just a clean<-> dirty transition, possibly leave spares alone, | 2384 | /* just a clean<-> dirty transition, possibly leave spares alone, |
| 2351 | * though if events isn't the right even/odd, we will have to do | 2385 | * though if events isn't the right even/odd, we will have to do |
| 2352 | * spares after all | 2386 | * spares after all |
| @@ -2402,6 +2436,9 @@ repeat: | |||
| 2402 | pr_debug("md: updating %s RAID superblock on device (in sync %d)\n", | 2436 | pr_debug("md: updating %s RAID superblock on device (in sync %d)\n", |
| 2403 | mdname(mddev), mddev->in_sync); | 2437 | mdname(mddev), mddev->in_sync); |
| 2404 | 2438 | ||
| 2439 | if (mddev->queue) | ||
| 2440 | blk_add_trace_msg(mddev->queue, "md md_update_sb"); | ||
| 2441 | rewrite: | ||
| 2405 | bitmap_update_sb(mddev->bitmap); | 2442 | bitmap_update_sb(mddev->bitmap); |
| 2406 | rdev_for_each(rdev, mddev) { | 2443 | rdev_for_each(rdev, mddev) { |
| 2407 | char b[BDEVNAME_SIZE]; | 2444 | char b[BDEVNAME_SIZE]; |
| @@ -2433,15 +2470,16 @@ repeat: | |||
| 2433 | /* only need to write one superblock... */ | 2470 | /* only need to write one superblock... */ |
| 2434 | break; | 2471 | break; |
| 2435 | } | 2472 | } |
| 2436 | md_super_wait(mddev); | 2473 | if (md_super_wait(mddev) < 0) |
| 2437 | /* if there was a failure, MD_CHANGE_DEVS was set, and we re-write super */ | 2474 | goto rewrite; |
| 2475 | /* if there was a failure, MD_SB_CHANGE_DEVS was set, and we re-write super */ | ||
| 2438 | 2476 | ||
| 2439 | if (mddev_is_clustered(mddev) && ret == 0) | 2477 | if (mddev_is_clustered(mddev) && ret == 0) |
| 2440 | md_cluster_ops->metadata_update_finish(mddev); | 2478 | md_cluster_ops->metadata_update_finish(mddev); |
| 2441 | 2479 | ||
| 2442 | if (mddev->in_sync != sync_req || | 2480 | if (mddev->in_sync != sync_req || |
| 2443 | !bit_clear_unless(&mddev->flags, BIT(MD_CHANGE_PENDING), | 2481 | !bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING), |
| 2444 | BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_CLEAN))) | 2482 | BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_CLEAN))) |
| 2445 | /* have to write it out again */ | 2483 | /* have to write it out again */ |
| 2446 | goto repeat; | 2484 | goto repeat; |
| 2447 | wake_up(&mddev->sb_wait); | 2485 | wake_up(&mddev->sb_wait); |
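The md_update_sb() hunks above add a rewrite label so that, if md_super_wait() reports a failed failfast superblock write, the superblocks are written again, while the pre-existing repeat path only terminates once bit_clear_unless() manages to clear MD_SB_CHANGE_PENDING without MD_SB_CHANGE_DEVS or MD_SB_CHANGE_CLEAN having been set again in the meantime. The function below is a plausible userspace rendering of that clear-unless primitive using a C11 compare-and-exchange loop; it illustrates the semantics rather than reproducing the kernel's implementation, and the bit values are made up.

```c
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* "Clear these bits, unless any of those bits is set."
 * Returns true when the clear happened (none of 'unless' was set). */
static bool bit_clear_unless(atomic_ulong *word, unsigned long clear, unsigned long unless)
{
	unsigned long old = atomic_load(word);

	do {
		if (old & unless)
			return false;   /* superblock re-dirtied: caller goes back to repeat */
	} while (!atomic_compare_exchange_weak(word, &old, old & ~clear));
	return true;
}

enum { SB_CHANGE_DEVS = 1, SB_CHANGE_CLEAN = 2, SB_CHANGE_PENDING = 4 }; /* illustrative bits */

int main(void)
{
	atomic_ulong sb_flags = SB_CHANGE_PENDING;

	/* Clean case: PENDING is cleared and the update loop can stop. */
	printf("cleared: %d, flags now %#lx\n",
	       bit_clear_unless(&sb_flags, SB_CHANGE_PENDING, SB_CHANGE_DEVS | SB_CHANGE_CLEAN),
	       atomic_load(&sb_flags));

	/* Raced case: DEVS was set again, so PENDING stays and the caller repeats. */
	atomic_fetch_or(&sb_flags, SB_CHANGE_PENDING | SB_CHANGE_DEVS);
	printf("cleared: %d, flags now %#lx\n",
	       bit_clear_unless(&sb_flags, SB_CHANGE_PENDING, SB_CHANGE_DEVS | SB_CHANGE_CLEAN),
	       atomic_load(&sb_flags));
	return 0;
}
```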
| @@ -2485,7 +2523,7 @@ static int add_bound_rdev(struct md_rdev *rdev) | |||
| 2485 | } | 2523 | } |
| 2486 | sysfs_notify_dirent_safe(rdev->sysfs_state); | 2524 | sysfs_notify_dirent_safe(rdev->sysfs_state); |
| 2487 | 2525 | ||
| 2488 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | 2526 | set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); |
| 2489 | if (mddev->degraded) | 2527 | if (mddev->degraded) |
| 2490 | set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); | 2528 | set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); |
| 2491 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | 2529 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); |
| @@ -2523,51 +2561,41 @@ struct rdev_sysfs_entry { | |||
| 2523 | static ssize_t | 2561 | static ssize_t |
| 2524 | state_show(struct md_rdev *rdev, char *page) | 2562 | state_show(struct md_rdev *rdev, char *page) |
| 2525 | { | 2563 | { |
| 2526 | char *sep = ""; | 2564 | char *sep = ","; |
| 2527 | size_t len = 0; | 2565 | size_t len = 0; |
| 2528 | unsigned long flags = ACCESS_ONCE(rdev->flags); | 2566 | unsigned long flags = ACCESS_ONCE(rdev->flags); |
| 2529 | 2567 | ||
| 2530 | if (test_bit(Faulty, &flags) || | 2568 | if (test_bit(Faulty, &flags) || |
| 2531 | rdev->badblocks.unacked_exist) { | 2569 | (!test_bit(ExternalBbl, &flags) && |
| 2532 | len+= sprintf(page+len, "%sfaulty",sep); | 2570 | rdev->badblocks.unacked_exist)) |
| 2533 | sep = ","; | 2571 | len += sprintf(page+len, "faulty%s", sep); |
| 2534 | } | 2572 | if (test_bit(In_sync, &flags)) |
| 2535 | if (test_bit(In_sync, &flags)) { | 2573 | len += sprintf(page+len, "in_sync%s", sep); |
| 2536 | len += sprintf(page+len, "%sin_sync",sep); | 2574 | if (test_bit(Journal, &flags)) |
| 2537 | sep = ","; | 2575 | len += sprintf(page+len, "journal%s", sep); |
| 2538 | } | 2576 | if (test_bit(WriteMostly, &flags)) |
| 2539 | if (test_bit(Journal, &flags)) { | 2577 | len += sprintf(page+len, "write_mostly%s", sep); |
| 2540 | len += sprintf(page+len, "%sjournal",sep); | ||
| 2541 | sep = ","; | ||
| 2542 | } | ||
| 2543 | if (test_bit(WriteMostly, &flags)) { | ||
| 2544 | len += sprintf(page+len, "%swrite_mostly",sep); | ||
| 2545 | sep = ","; | ||
| 2546 | } | ||
| 2547 | if (test_bit(Blocked, &flags) || | 2578 | if (test_bit(Blocked, &flags) || |
| 2548 | (rdev->badblocks.unacked_exist | 2579 | (rdev->badblocks.unacked_exist |
| 2549 | && !test_bit(Faulty, &flags))) { | 2580 | && !test_bit(Faulty, &flags))) |
| 2550 | len += sprintf(page+len, "%sblocked", sep); | 2581 | len += sprintf(page+len, "blocked%s", sep); |
| 2551 | sep = ","; | ||
| 2552 | } | ||
| 2553 | if (!test_bit(Faulty, &flags) && | 2582 | if (!test_bit(Faulty, &flags) && |
| 2554 | !test_bit(Journal, &flags) && | 2583 | !test_bit(Journal, &flags) && |
| 2555 | !test_bit(In_sync, &flags)) { | 2584 | !test_bit(In_sync, &flags)) |
| 2556 | len += sprintf(page+len, "%sspare", sep); | 2585 | len += sprintf(page+len, "spare%s", sep); |
| 2557 | sep = ","; | 2586 | if (test_bit(WriteErrorSeen, &flags)) |
| 2558 | } | 2587 | len += sprintf(page+len, "write_error%s", sep); |
| 2559 | if (test_bit(WriteErrorSeen, &flags)) { | 2588 | if (test_bit(WantReplacement, &flags)) |
| 2560 | len += sprintf(page+len, "%swrite_error", sep); | 2589 | len += sprintf(page+len, "want_replacement%s", sep); |
| 2561 | sep = ","; | 2590 | if (test_bit(Replacement, &flags)) |
| 2562 | } | 2591 | len += sprintf(page+len, "replacement%s", sep); |
| 2563 | if (test_bit(WantReplacement, &flags)) { | 2592 | if (test_bit(ExternalBbl, &flags)) |
| 2564 | len += sprintf(page+len, "%swant_replacement", sep); | 2593 | len += sprintf(page+len, "external_bbl%s", sep); |
| 2565 | sep = ","; | 2594 | if (test_bit(FailFast, &flags)) |
| 2566 | } | 2595 | len += sprintf(page+len, "failfast%s", sep); |
| 2567 | if (test_bit(Replacement, &flags)) { | 2596 | |
| 2568 | len += sprintf(page+len, "%sreplacement", sep); | 2597 | if (len) |
| 2569 | sep = ","; | 2598 | len -= strlen(sep); |
| 2570 | } | ||
| 2571 | 2599 | ||
| 2572 | return len+sprintf(page+len, "\n"); | 2600 | return len+sprintf(page+len, "\n"); |
| 2573 | } | 2601 | } |
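The rewritten state_show() above prints every matched flag with a trailing separator and then trims the final one with len -= strlen(sep), instead of threading a "print the separator before the next item" variable through every branch. A standalone sketch of the same trick, with arbitrary flag names and snprintf() for bounds safety (the kernel code uses sprintf into a sysfs page):

    #include <stdio.h>
    #include <string.h>

    enum { IN_SYNC = 1 << 0, WRITE_MOSTLY = 1 << 1, FAILFAST = 1 << 2 };

    static size_t show_state(unsigned flags, char *page, size_t size)
    {
        const char *sep = ",";
        size_t len = 0;

        if (flags & IN_SYNC)
            len += snprintf(page + len, size - len, "in_sync%s", sep);
        if (flags & WRITE_MOSTLY)
            len += snprintf(page + len, size - len, "write_mostly%s", sep);
        if (flags & FAILFAST)
            len += snprintf(page + len, size - len, "failfast%s", sep);

        if (len)                       /* drop the trailing separator */
            len -= strlen(sep);
        return len + snprintf(page + len, size - len, "\n");
    }

    int main(void)
    {
        char page[64];

        show_state(IN_SYNC | FAILFAST, page, sizeof(page));
        fputs(page, stdout);           /* prints "in_sync,failfast" plus newline */
        return 0;
    }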
| @@ -2587,6 +2615,7 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len) | |||
| 2587 | * so that it gets rebuilt based on bitmap | 2615 | * so that it gets rebuilt based on bitmap |
| 2588 | * write_error - sets WriteErrorSeen | 2616 | * write_error - sets WriteErrorSeen |
| 2589 | * -write_error - clears WriteErrorSeen | 2617 | * -write_error - clears WriteErrorSeen |
| 2618 | * {,-}failfast - set/clear FailFast | ||
| 2590 | */ | 2619 | */ |
| 2591 | int err = -EINVAL; | 2620 | int err = -EINVAL; |
| 2592 | if (cmd_match(buf, "faulty") && rdev->mddev->pers) { | 2621 | if (cmd_match(buf, "faulty") && rdev->mddev->pers) { |
| @@ -2610,8 +2639,10 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len) | |||
| 2610 | 2639 | ||
| 2611 | if (err == 0) { | 2640 | if (err == 0) { |
| 2612 | md_kick_rdev_from_array(rdev); | 2641 | md_kick_rdev_from_array(rdev); |
| 2613 | if (mddev->pers) | 2642 | if (mddev->pers) { |
| 2614 | md_update_sb(mddev, 1); | 2643 | set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); |
| 2644 | md_wakeup_thread(mddev->thread); | ||
| 2645 | } | ||
| 2615 | md_new_event(mddev); | 2646 | md_new_event(mddev); |
| 2616 | } | 2647 | } |
| 2617 | } | 2648 | } |
| @@ -2626,6 +2657,7 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len) | |||
| 2626 | err = 0; | 2657 | err = 0; |
| 2627 | } else if (cmd_match(buf, "-blocked")) { | 2658 | } else if (cmd_match(buf, "-blocked")) { |
| 2628 | if (!test_bit(Faulty, &rdev->flags) && | 2659 | if (!test_bit(Faulty, &rdev->flags) && |
| 2660 | !test_bit(ExternalBbl, &rdev->flags) && | ||
| 2629 | rdev->badblocks.unacked_exist) { | 2661 | rdev->badblocks.unacked_exist) { |
| 2630 | /* metadata handler doesn't understand badblocks, | 2662 | /* metadata handler doesn't understand badblocks, |
| 2631 | * so we need to fail the device | 2663 | * so we need to fail the device |
| @@ -2642,6 +2674,12 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len) | |||
| 2642 | } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) { | 2674 | } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) { |
| 2643 | set_bit(In_sync, &rdev->flags); | 2675 | set_bit(In_sync, &rdev->flags); |
| 2644 | err = 0; | 2676 | err = 0; |
| 2677 | } else if (cmd_match(buf, "failfast")) { | ||
| 2678 | set_bit(FailFast, &rdev->flags); | ||
| 2679 | err = 0; | ||
| 2680 | } else if (cmd_match(buf, "-failfast")) { | ||
| 2681 | clear_bit(FailFast, &rdev->flags); | ||
| 2682 | err = 0; | ||
| 2645 | } else if (cmd_match(buf, "-insync") && rdev->raid_disk >= 0 && | 2683 | } else if (cmd_match(buf, "-insync") && rdev->raid_disk >= 0 && |
| 2646 | !test_bit(Journal, &rdev->flags)) { | 2684 | !test_bit(Journal, &rdev->flags)) { |
| 2647 | if (rdev->mddev->pers == NULL) { | 2685 | if (rdev->mddev->pers == NULL) { |
| @@ -2708,6 +2746,13 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len) | |||
| 2708 | } | 2746 | } |
| 2709 | } else | 2747 | } else |
| 2710 | err = -EBUSY; | 2748 | err = -EBUSY; |
| 2749 | } else if (cmd_match(buf, "external_bbl") && (rdev->mddev->external)) { | ||
| 2750 | set_bit(ExternalBbl, &rdev->flags); | ||
| 2751 | rdev->badblocks.shift = 0; | ||
| 2752 | err = 0; | ||
| 2753 | } else if (cmd_match(buf, "-external_bbl") && (rdev->mddev->external)) { | ||
| 2754 | clear_bit(ExternalBbl, &rdev->flags); | ||
| 2755 | err = 0; | ||
| 2711 | } | 2756 | } |
| 2712 | if (!err) | 2757 | if (!err) |
| 2713 | sysfs_notify_dirent_safe(rdev->sysfs_state); | 2758 | sysfs_notify_dirent_safe(rdev->sysfs_state); |
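state_store() keeps growing the same shape: match the keyword (a leading '-' means clear) and flip the corresponding rdev flag, as the new failfast/-failfast and external_bbl/-external_bbl branches do. A compact userspace sketch of that keyword-to-bit dispatch, written table-driven rather than as an if/else chain; the names and bit values are invented for the example, and real cmd_match() also tolerates a trailing newline, which this sketch skips:

    #include <stdio.h>
    #include <string.h>

    enum { FAILFAST = 1 << 0, WRITE_MOSTLY = 1 << 1, EXTERNAL_BBL = 1 << 2 };

    static const struct { const char *name; unsigned bit; } keywords[] = {
        { "failfast",     FAILFAST },
        { "write_mostly", WRITE_MOSTLY },
        { "external_bbl", EXTERNAL_BBL },
    };

    /* "failfast" sets the bit, "-failfast" clears it; returns 0 or -1 on no match */
    static int state_store(unsigned *flags, const char *buf)
    {
        int clear = (buf[0] == '-');
        const char *word = buf + clear;

        for (size_t i = 0; i < sizeof(keywords) / sizeof(keywords[0]); i++) {
            if (strcmp(word, keywords[i].name) == 0) {
                if (clear)
                    *flags &= ~keywords[i].bit;
                else
                    *flags |= keywords[i].bit;
                return 0;
            }
        }
        return -1;
    }

    int main(void)
    {
        unsigned flags = 0;

        state_store(&flags, "failfast");
        state_store(&flags, "write_mostly");
        state_store(&flags, "-write_mostly");
        printf("flags=%#x\n", flags);      /* 0x1: only FAILFAST left set */
        return 0;
    }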
| @@ -3211,10 +3256,8 @@ static struct md_rdev *md_import_device(dev_t newdev, int super_format, int supe | |||
| 3211 | sector_t size; | 3256 | sector_t size; |
| 3212 | 3257 | ||
| 3213 | rdev = kzalloc(sizeof(*rdev), GFP_KERNEL); | 3258 | rdev = kzalloc(sizeof(*rdev), GFP_KERNEL); |
| 3214 | if (!rdev) { | 3259 | if (!rdev) |
| 3215 | printk(KERN_ERR "md: could not alloc mem for new device!\n"); | ||
| 3216 | return ERR_PTR(-ENOMEM); | 3260 | return ERR_PTR(-ENOMEM); |
| 3217 | } | ||
| 3218 | 3261 | ||
| 3219 | err = md_rdev_init(rdev); | 3262 | err = md_rdev_init(rdev); |
| 3220 | if (err) | 3263 | if (err) |
| @@ -3231,8 +3274,7 @@ static struct md_rdev *md_import_device(dev_t newdev, int super_format, int supe | |||
| 3231 | 3274 | ||
| 3232 | size = i_size_read(rdev->bdev->bd_inode) >> BLOCK_SIZE_BITS; | 3275 | size = i_size_read(rdev->bdev->bd_inode) >> BLOCK_SIZE_BITS; |
| 3233 | if (!size) { | 3276 | if (!size) { |
| 3234 | printk(KERN_WARNING | 3277 | pr_warn("md: %s has zero or unknown size, marking faulty!\n", |
| 3235 | "md: %s has zero or unknown size, marking faulty!\n", | ||
| 3236 | bdevname(rdev->bdev,b)); | 3278 | bdevname(rdev->bdev,b)); |
| 3237 | err = -EINVAL; | 3279 | err = -EINVAL; |
| 3238 | goto abort_free; | 3280 | goto abort_free; |
| @@ -3242,16 +3284,13 @@ static struct md_rdev *md_import_device(dev_t newdev, int super_format, int supe | |||
| 3242 | err = super_types[super_format]. | 3284 | err = super_types[super_format]. |
| 3243 | load_super(rdev, NULL, super_minor); | 3285 | load_super(rdev, NULL, super_minor); |
| 3244 | if (err == -EINVAL) { | 3286 | if (err == -EINVAL) { |
| 3245 | printk(KERN_WARNING | 3287 | pr_warn("md: %s does not have a valid v%d.%d superblock, not importing!\n", |
| 3246 | "md: %s does not have a valid v%d.%d " | ||
| 3247 | "superblock, not importing!\n", | ||
| 3248 | bdevname(rdev->bdev,b), | 3288 | bdevname(rdev->bdev,b), |
| 3249 | super_format, super_minor); | 3289 | super_format, super_minor); |
| 3250 | goto abort_free; | 3290 | goto abort_free; |
| 3251 | } | 3291 | } |
| 3252 | if (err < 0) { | 3292 | if (err < 0) { |
| 3253 | printk(KERN_WARNING | 3293 | pr_warn("md: could not read %s's sb, not importing!\n", |
| 3254 | "md: could not read %s's sb, not importing!\n", | ||
| 3255 | bdevname(rdev->bdev,b)); | 3294 | bdevname(rdev->bdev,b)); |
| 3256 | goto abort_free; | 3295 | goto abort_free; |
| 3257 | } | 3296 | } |
| @@ -3287,9 +3326,7 @@ static void analyze_sbs(struct mddev *mddev) | |||
| 3287 | case 0: | 3326 | case 0: |
| 3288 | break; | 3327 | break; |
| 3289 | default: | 3328 | default: |
| 3290 | printk( KERN_ERR \ | 3329 | pr_warn("md: fatal superblock inconsistency in %s -- removing from array\n", |
| 3291 | "md: fatal superblock inconsistency in %s" | ||
| 3292 | " -- removing from array\n", | ||
| 3293 | bdevname(rdev->bdev,b)); | 3330 | bdevname(rdev->bdev,b)); |
| 3294 | md_kick_rdev_from_array(rdev); | 3331 | md_kick_rdev_from_array(rdev); |
| 3295 | } | 3332 | } |
| @@ -3302,18 +3339,16 @@ static void analyze_sbs(struct mddev *mddev) | |||
| 3302 | if (mddev->max_disks && | 3339 | if (mddev->max_disks && |
| 3303 | (rdev->desc_nr >= mddev->max_disks || | 3340 | (rdev->desc_nr >= mddev->max_disks || |
| 3304 | i > mddev->max_disks)) { | 3341 | i > mddev->max_disks)) { |
| 3305 | printk(KERN_WARNING | 3342 | pr_warn("md: %s: %s: only %d devices permitted\n", |
| 3306 | "md: %s: %s: only %d devices permitted\n", | 3343 | mdname(mddev), bdevname(rdev->bdev, b), |
| 3307 | mdname(mddev), bdevname(rdev->bdev, b), | 3344 | mddev->max_disks); |
| 3308 | mddev->max_disks); | ||
| 3309 | md_kick_rdev_from_array(rdev); | 3345 | md_kick_rdev_from_array(rdev); |
| 3310 | continue; | 3346 | continue; |
| 3311 | } | 3347 | } |
| 3312 | if (rdev != freshest) { | 3348 | if (rdev != freshest) { |
| 3313 | if (super_types[mddev->major_version]. | 3349 | if (super_types[mddev->major_version]. |
| 3314 | validate_super(mddev, rdev)) { | 3350 | validate_super(mddev, rdev)) { |
| 3315 | printk(KERN_WARNING "md: kicking non-fresh %s" | 3351 | pr_warn("md: kicking non-fresh %s from array!\n", |
| 3316 | " from array!\n", | ||
| 3317 | bdevname(rdev->bdev,b)); | 3352 | bdevname(rdev->bdev,b)); |
| 3318 | md_kick_rdev_from_array(rdev); | 3353 | md_kick_rdev_from_array(rdev); |
| 3319 | continue; | 3354 | continue; |
| @@ -3384,7 +3419,7 @@ safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len) | |||
| 3384 | unsigned long msec; | 3419 | unsigned long msec; |
| 3385 | 3420 | ||
| 3386 | if (mddev_is_clustered(mddev)) { | 3421 | if (mddev_is_clustered(mddev)) { |
| 3387 | pr_info("md: Safemode is disabled for clustered mode\n"); | 3422 | pr_warn("md: Safemode is disabled for clustered mode\n"); |
| 3388 | return -EINVAL; | 3423 | return -EINVAL; |
| 3389 | } | 3424 | } |
| 3390 | 3425 | ||
| @@ -3472,8 +3507,8 @@ level_store(struct mddev *mddev, const char *buf, size_t len) | |||
| 3472 | 3507 | ||
| 3473 | rv = -EINVAL; | 3508 | rv = -EINVAL; |
| 3474 | if (!mddev->pers->quiesce) { | 3509 | if (!mddev->pers->quiesce) { |
| 3475 | printk(KERN_WARNING "md: %s: %s does not support online personality change\n", | 3510 | pr_warn("md: %s: %s does not support online personality change\n", |
| 3476 | mdname(mddev), mddev->pers->name); | 3511 | mdname(mddev), mddev->pers->name); |
| 3477 | goto out_unlock; | 3512 | goto out_unlock; |
| 3478 | } | 3513 | } |
| 3479 | 3514 | ||
| @@ -3491,7 +3526,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len) | |||
| 3491 | pers = find_pers(level, clevel); | 3526 | pers = find_pers(level, clevel); |
| 3492 | if (!pers || !try_module_get(pers->owner)) { | 3527 | if (!pers || !try_module_get(pers->owner)) { |
| 3493 | spin_unlock(&pers_lock); | 3528 | spin_unlock(&pers_lock); |
| 3494 | printk(KERN_WARNING "md: personality %s not loaded\n", clevel); | 3529 | pr_warn("md: personality %s not loaded\n", clevel); |
| 3495 | rv = -EINVAL; | 3530 | rv = -EINVAL; |
| 3496 | goto out_unlock; | 3531 | goto out_unlock; |
| 3497 | } | 3532 | } |
| @@ -3505,8 +3540,8 @@ level_store(struct mddev *mddev, const char *buf, size_t len) | |||
| 3505 | } | 3540 | } |
| 3506 | if (!pers->takeover) { | 3541 | if (!pers->takeover) { |
| 3507 | module_put(pers->owner); | 3542 | module_put(pers->owner); |
| 3508 | printk(KERN_WARNING "md: %s: %s does not support personality takeover\n", | 3543 | pr_warn("md: %s: %s does not support personality takeover\n", |
| 3509 | mdname(mddev), clevel); | 3544 | mdname(mddev), clevel); |
| 3510 | rv = -EINVAL; | 3545 | rv = -EINVAL; |
| 3511 | goto out_unlock; | 3546 | goto out_unlock; |
| 3512 | } | 3547 | } |
| @@ -3526,8 +3561,8 @@ level_store(struct mddev *mddev, const char *buf, size_t len) | |||
| 3526 | mddev->delta_disks = 0; | 3561 | mddev->delta_disks = 0; |
| 3527 | mddev->reshape_backwards = 0; | 3562 | mddev->reshape_backwards = 0; |
| 3528 | module_put(pers->owner); | 3563 | module_put(pers->owner); |
| 3529 | printk(KERN_WARNING "md: %s: %s would not accept array\n", | 3564 | pr_warn("md: %s: %s would not accept array\n", |
| 3530 | mdname(mddev), clevel); | 3565 | mdname(mddev), clevel); |
| 3531 | rv = PTR_ERR(priv); | 3566 | rv = PTR_ERR(priv); |
| 3532 | goto out_unlock; | 3567 | goto out_unlock; |
| 3533 | } | 3568 | } |
| @@ -3570,9 +3605,8 @@ level_store(struct mddev *mddev, const char *buf, size_t len) | |||
| 3570 | pers->sync_request != NULL) { | 3605 | pers->sync_request != NULL) { |
| 3571 | /* need to add the md_redundancy_group */ | 3606 | /* need to add the md_redundancy_group */ |
| 3572 | if (sysfs_create_group(&mddev->kobj, &md_redundancy_group)) | 3607 | if (sysfs_create_group(&mddev->kobj, &md_redundancy_group)) |
| 3573 | printk(KERN_WARNING | 3608 | pr_warn("md: cannot register extra attributes for %s\n", |
| 3574 | "md: cannot register extra attributes for %s\n", | 3609 | mdname(mddev)); |
| 3575 | mdname(mddev)); | ||
| 3576 | mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action"); | 3610 | mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action"); |
| 3577 | } | 3611 | } |
| 3578 | if (oldpers->sync_request != NULL && | 3612 | if (oldpers->sync_request != NULL && |
| @@ -3603,9 +3637,8 @@ level_store(struct mddev *mddev, const char *buf, size_t len) | |||
| 3603 | clear_bit(In_sync, &rdev->flags); | 3637 | clear_bit(In_sync, &rdev->flags); |
| 3604 | else { | 3638 | else { |
| 3605 | if (sysfs_link_rdev(mddev, rdev)) | 3639 | if (sysfs_link_rdev(mddev, rdev)) |
| 3606 | printk(KERN_WARNING "md: cannot register rd%d" | 3640 | pr_warn("md: cannot register rd%d for %s after level change\n", |
| 3607 | " for %s after level change\n", | 3641 | rdev->raid_disk, mdname(mddev)); |
| 3608 | rdev->raid_disk, mdname(mddev)); | ||
| 3609 | } | 3642 | } |
| 3610 | } | 3643 | } |
| 3611 | 3644 | ||
| @@ -3618,7 +3651,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len) | |||
| 3618 | } | 3651 | } |
| 3619 | blk_set_stacking_limits(&mddev->queue->limits); | 3652 | blk_set_stacking_limits(&mddev->queue->limits); |
| 3620 | pers->run(mddev); | 3653 | pers->run(mddev); |
| 3621 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | 3654 | set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); |
| 3622 | mddev_resume(mddev); | 3655 | mddev_resume(mddev); |
| 3623 | if (!mddev->thread) | 3656 | if (!mddev->thread) |
| 3624 | md_update_sb(mddev, 1); | 3657 | md_update_sb(mddev, 1); |
| @@ -3813,7 +3846,7 @@ resync_start_store(struct mddev *mddev, const char *buf, size_t len) | |||
| 3813 | if (!err) { | 3846 | if (!err) { |
| 3814 | mddev->recovery_cp = n; | 3847 | mddev->recovery_cp = n; |
| 3815 | if (mddev->pers) | 3848 | if (mddev->pers) |
| 3816 | set_bit(MD_CHANGE_CLEAN, &mddev->flags); | 3849 | set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); |
| 3817 | } | 3850 | } |
| 3818 | mddev_unlock(mddev); | 3851 | mddev_unlock(mddev); |
| 3819 | return err ?: len; | 3852 | return err ?: len; |
| @@ -3887,7 +3920,7 @@ array_state_show(struct mddev *mddev, char *page) | |||
| 3887 | st = read_auto; | 3920 | st = read_auto; |
| 3888 | break; | 3921 | break; |
| 3889 | case 0: | 3922 | case 0: |
| 3890 | if (test_bit(MD_CHANGE_PENDING, &mddev->flags)) | 3923 | if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) |
| 3891 | st = write_pending; | 3924 | st = write_pending; |
| 3892 | else if (mddev->in_sync) | 3925 | else if (mddev->in_sync) |
| 3893 | st = clean; | 3926 | st = clean; |
| @@ -3925,7 +3958,8 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len) | |||
| 3925 | spin_lock(&mddev->lock); | 3958 | spin_lock(&mddev->lock); |
| 3926 | if (st == active) { | 3959 | if (st == active) { |
| 3927 | restart_array(mddev); | 3960 | restart_array(mddev); |
| 3928 | clear_bit(MD_CHANGE_PENDING, &mddev->flags); | 3961 | clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); |
| 3962 | md_wakeup_thread(mddev->thread); | ||
| 3929 | wake_up(&mddev->sb_wait); | 3963 | wake_up(&mddev->sb_wait); |
| 3930 | err = 0; | 3964 | err = 0; |
| 3931 | } else /* st == clean */ { | 3965 | } else /* st == clean */ { |
| @@ -3935,7 +3969,7 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len) | |||
| 3935 | mddev->in_sync = 1; | 3969 | mddev->in_sync = 1; |
| 3936 | if (mddev->safemode == 1) | 3970 | if (mddev->safemode == 1) |
| 3937 | mddev->safemode = 0; | 3971 | mddev->safemode = 0; |
| 3938 | set_bit(MD_CHANGE_CLEAN, &mddev->flags); | 3972 | set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); |
| 3939 | } | 3973 | } |
| 3940 | err = 0; | 3974 | err = 0; |
| 3941 | } else | 3975 | } else |
| @@ -4001,7 +4035,7 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len) | |||
| 4001 | mddev->in_sync = 1; | 4035 | mddev->in_sync = 1; |
| 4002 | if (mddev->safemode == 1) | 4036 | if (mddev->safemode == 1) |
| 4003 | mddev->safemode = 0; | 4037 | mddev->safemode = 0; |
| 4004 | set_bit(MD_CHANGE_CLEAN, &mddev->flags); | 4038 | set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); |
| 4005 | } | 4039 | } |
| 4006 | err = 0; | 4040 | err = 0; |
| 4007 | } else | 4041 | } else |
| @@ -4015,7 +4049,7 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len) | |||
| 4015 | err = restart_array(mddev); | 4049 | err = restart_array(mddev); |
| 4016 | if (err) | 4050 | if (err) |
| 4017 | break; | 4051 | break; |
| 4018 | clear_bit(MD_CHANGE_PENDING, &mddev->flags); | 4052 | clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); |
| 4019 | wake_up(&mddev->sb_wait); | 4053 | wake_up(&mddev->sb_wait); |
| 4020 | err = 0; | 4054 | err = 0; |
| 4021 | } else { | 4055 | } else { |
| @@ -5071,13 +5105,13 @@ static int md_alloc(dev_t dev, char *name) | |||
| 5071 | /* This isn't possible, but as kobject_init_and_add is marked | 5105 | /* This isn't possible, but as kobject_init_and_add is marked |
| 5072 | * __must_check, we must do something with the result | 5106 | * __must_check, we must do something with the result |
| 5073 | */ | 5107 | */ |
| 5074 | printk(KERN_WARNING "md: cannot register %s/md - name in use\n", | 5108 | pr_debug("md: cannot register %s/md - name in use\n", |
| 5075 | disk->disk_name); | 5109 | disk->disk_name); |
| 5076 | error = 0; | 5110 | error = 0; |
| 5077 | } | 5111 | } |
| 5078 | if (mddev->kobj.sd && | 5112 | if (mddev->kobj.sd && |
| 5079 | sysfs_create_group(&mddev->kobj, &md_bitmap_group)) | 5113 | sysfs_create_group(&mddev->kobj, &md_bitmap_group)) |
| 5080 | printk(KERN_DEBUG "pointless warning\n"); | 5114 | pr_debug("pointless warning\n"); |
| 5081 | mutex_unlock(&mddev->open_mutex); | 5115 | mutex_unlock(&mddev->open_mutex); |
| 5082 | abort: | 5116 | abort: |
| 5083 | mutex_unlock(&disks_mutex); | 5117 | mutex_unlock(&disks_mutex); |
| @@ -5179,15 +5213,15 @@ int md_run(struct mddev *mddev) | |||
| 5179 | if (mddev->dev_sectors && | 5213 | if (mddev->dev_sectors && |
| 5180 | rdev->data_offset + mddev->dev_sectors | 5214 | rdev->data_offset + mddev->dev_sectors |
| 5181 | > rdev->sb_start) { | 5215 | > rdev->sb_start) { |
| 5182 | printk("md: %s: data overlaps metadata\n", | 5216 | pr_warn("md: %s: data overlaps metadata\n", |
| 5183 | mdname(mddev)); | 5217 | mdname(mddev)); |
| 5184 | return -EINVAL; | 5218 | return -EINVAL; |
| 5185 | } | 5219 | } |
| 5186 | } else { | 5220 | } else { |
| 5187 | if (rdev->sb_start + rdev->sb_size/512 | 5221 | if (rdev->sb_start + rdev->sb_size/512 |
| 5188 | > rdev->data_offset) { | 5222 | > rdev->data_offset) { |
| 5189 | printk("md: %s: metadata overlaps data\n", | 5223 | pr_warn("md: %s: metadata overlaps data\n", |
| 5190 | mdname(mddev)); | 5224 | mdname(mddev)); |
| 5191 | return -EINVAL; | 5225 | return -EINVAL; |
| 5192 | } | 5226 | } |
| 5193 | } | 5227 | } |
| @@ -5202,11 +5236,11 @@ int md_run(struct mddev *mddev) | |||
| 5202 | if (!pers || !try_module_get(pers->owner)) { | 5236 | if (!pers || !try_module_get(pers->owner)) { |
| 5203 | spin_unlock(&pers_lock); | 5237 | spin_unlock(&pers_lock); |
| 5204 | if (mddev->level != LEVEL_NONE) | 5238 | if (mddev->level != LEVEL_NONE) |
| 5205 | printk(KERN_WARNING "md: personality for level %d is not loaded!\n", | 5239 | pr_warn("md: personality for level %d is not loaded!\n", |
| 5206 | mddev->level); | 5240 | mddev->level); |
| 5207 | else | 5241 | else |
| 5208 | printk(KERN_WARNING "md: personality for level %s is not loaded!\n", | 5242 | pr_warn("md: personality for level %s is not loaded!\n", |
| 5209 | mddev->clevel); | 5243 | mddev->clevel); |
| 5210 | return -EINVAL; | 5244 | return -EINVAL; |
| 5211 | } | 5245 | } |
| 5212 | spin_unlock(&pers_lock); | 5246 | spin_unlock(&pers_lock); |
| @@ -5236,21 +5270,16 @@ int md_run(struct mddev *mddev) | |||
| 5236 | if (rdev < rdev2 && | 5270 | if (rdev < rdev2 && |
| 5237 | rdev->bdev->bd_contains == | 5271 | rdev->bdev->bd_contains == |
| 5238 | rdev2->bdev->bd_contains) { | 5272 | rdev2->bdev->bd_contains) { |
| 5239 | printk(KERN_WARNING | 5273 | pr_warn("%s: WARNING: %s appears to be on the same physical disk as %s.\n", |
| 5240 | "%s: WARNING: %s appears to be" | 5274 | mdname(mddev), |
| 5241 | " on the same physical disk as" | 5275 | bdevname(rdev->bdev,b), |
| 5242 | " %s.\n", | 5276 | bdevname(rdev2->bdev,b2)); |
| 5243 | mdname(mddev), | ||
| 5244 | bdevname(rdev->bdev,b), | ||
| 5245 | bdevname(rdev2->bdev,b2)); | ||
| 5246 | warned = 1; | 5277 | warned = 1; |
| 5247 | } | 5278 | } |
| 5248 | } | 5279 | } |
| 5249 | 5280 | ||
| 5250 | if (warned) | 5281 | if (warned) |
| 5251 | printk(KERN_WARNING | 5282 | pr_warn("True protection against single-disk failure might be compromised.\n"); |
| 5252 | "True protection against single-disk" | ||
| 5253 | " failure might be compromised.\n"); | ||
| 5254 | } | 5283 | } |
| 5255 | 5284 | ||
| 5256 | mddev->recovery = 0; | 5285 | mddev->recovery = 0; |
| @@ -5264,14 +5293,14 @@ int md_run(struct mddev *mddev) | |||
| 5264 | 5293 | ||
| 5265 | err = pers->run(mddev); | 5294 | err = pers->run(mddev); |
| 5266 | if (err) | 5295 | if (err) |
| 5267 | printk(KERN_ERR "md: pers->run() failed ...\n"); | 5296 | pr_warn("md: pers->run() failed ...\n"); |
| 5268 | else if (pers->size(mddev, 0, 0) < mddev->array_sectors) { | 5297 | else if (pers->size(mddev, 0, 0) < mddev->array_sectors) { |
| 5269 | WARN_ONCE(!mddev->external_size, "%s: default size too small," | 5298 | WARN_ONCE(!mddev->external_size, |
| 5270 | " but 'external_size' not in effect?\n", __func__); | 5299 | "%s: default size too small, but 'external_size' not in effect?\n", |
| 5271 | printk(KERN_ERR | 5300 | __func__); |
| 5272 | "md: invalid array_size %llu > default size %llu\n", | 5301 | pr_warn("md: invalid array_size %llu > default size %llu\n", |
| 5273 | (unsigned long long)mddev->array_sectors / 2, | 5302 | (unsigned long long)mddev->array_sectors / 2, |
| 5274 | (unsigned long long)pers->size(mddev, 0, 0) / 2); | 5303 | (unsigned long long)pers->size(mddev, 0, 0) / 2); |
| 5275 | err = -EINVAL; | 5304 | err = -EINVAL; |
| 5276 | } | 5305 | } |
| 5277 | if (err == 0 && pers->sync_request && | 5306 | if (err == 0 && pers->sync_request && |
| @@ -5281,8 +5310,8 @@ int md_run(struct mddev *mddev) | |||
| 5281 | bitmap = bitmap_create(mddev, -1); | 5310 | bitmap = bitmap_create(mddev, -1); |
| 5282 | if (IS_ERR(bitmap)) { | 5311 | if (IS_ERR(bitmap)) { |
| 5283 | err = PTR_ERR(bitmap); | 5312 | err = PTR_ERR(bitmap); |
| 5284 | printk(KERN_ERR "%s: failed to create bitmap (%d)\n", | 5313 | pr_warn("%s: failed to create bitmap (%d)\n", |
| 5285 | mdname(mddev), err); | 5314 | mdname(mddev), err); |
| 5286 | } else | 5315 | } else |
| 5287 | mddev->bitmap = bitmap; | 5316 | mddev->bitmap = bitmap; |
| 5288 | 5317 | ||
| @@ -5318,9 +5347,8 @@ int md_run(struct mddev *mddev) | |||
| 5318 | if (pers->sync_request) { | 5347 | if (pers->sync_request) { |
| 5319 | if (mddev->kobj.sd && | 5348 | if (mddev->kobj.sd && |
| 5320 | sysfs_create_group(&mddev->kobj, &md_redundancy_group)) | 5349 | sysfs_create_group(&mddev->kobj, &md_redundancy_group)) |
| 5321 | printk(KERN_WARNING | 5350 | pr_warn("md: cannot register extra attributes for %s\n", |
| 5322 | "md: cannot register extra attributes for %s\n", | 5351 | mdname(mddev)); |
| 5323 | mdname(mddev)); | ||
| 5324 | mddev->sysfs_action = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_action"); | 5352 | mddev->sysfs_action = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_action"); |
| 5325 | } else if (mddev->ro == 2) /* auto-readonly not meaningful */ | 5353 | } else if (mddev->ro == 2) /* auto-readonly not meaningful */ |
| 5326 | mddev->ro = 0; | 5354 | mddev->ro = 0; |
| @@ -5350,7 +5378,7 @@ int md_run(struct mddev *mddev) | |||
| 5350 | set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); | 5378 | set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); |
| 5351 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | 5379 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); |
| 5352 | 5380 | ||
| 5353 | if (mddev->flags & MD_UPDATE_SB_FLAGS) | 5381 | if (mddev->sb_flags) |
| 5354 | md_update_sb(mddev, 0); | 5382 | md_update_sb(mddev, 0); |
| 5355 | 5383 | ||
| 5356 | md_new_event(mddev); | 5384 | md_new_event(mddev); |
| @@ -5421,8 +5449,7 @@ static int restart_array(struct mddev *mddev) | |||
| 5421 | mddev->safemode = 0; | 5449 | mddev->safemode = 0; |
| 5422 | mddev->ro = 0; | 5450 | mddev->ro = 0; |
| 5423 | set_disk_ro(disk, 0); | 5451 | set_disk_ro(disk, 0); |
| 5424 | printk(KERN_INFO "md: %s switched to read-write mode.\n", | 5452 | pr_debug("md: %s switched to read-write mode.\n", mdname(mddev)); |
| 5425 | mdname(mddev)); | ||
| 5426 | /* Kick recovery or resync if necessary */ | 5453 | /* Kick recovery or resync if necessary */ |
| 5427 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | 5454 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); |
| 5428 | md_wakeup_thread(mddev->thread); | 5455 | md_wakeup_thread(mddev->thread); |
| @@ -5446,6 +5473,7 @@ static void md_clean(struct mddev *mddev) | |||
| 5446 | mddev->level = LEVEL_NONE; | 5473 | mddev->level = LEVEL_NONE; |
| 5447 | mddev->clevel[0] = 0; | 5474 | mddev->clevel[0] = 0; |
| 5448 | mddev->flags = 0; | 5475 | mddev->flags = 0; |
| 5476 | mddev->sb_flags = 0; | ||
| 5449 | mddev->ro = 0; | 5477 | mddev->ro = 0; |
| 5450 | mddev->metadata_type[0] = 0; | 5478 | mddev->metadata_type[0] = 0; |
| 5451 | mddev->chunk_sectors = 0; | 5479 | mddev->chunk_sectors = 0; |
| @@ -5490,12 +5518,15 @@ static void __md_stop_writes(struct mddev *mddev) | |||
| 5490 | 5518 | ||
| 5491 | del_timer_sync(&mddev->safemode_timer); | 5519 | del_timer_sync(&mddev->safemode_timer); |
| 5492 | 5520 | ||
| 5521 | if (mddev->pers && mddev->pers->quiesce) { | ||
| 5522 | mddev->pers->quiesce(mddev, 1); | ||
| 5523 | mddev->pers->quiesce(mddev, 0); | ||
| 5524 | } | ||
| 5493 | bitmap_flush(mddev); | 5525 | bitmap_flush(mddev); |
| 5494 | md_super_wait(mddev); | ||
| 5495 | 5526 | ||
| 5496 | if (mddev->ro == 0 && | 5527 | if (mddev->ro == 0 && |
| 5497 | ((!mddev->in_sync && !mddev_is_clustered(mddev)) || | 5528 | ((!mddev->in_sync && !mddev_is_clustered(mddev)) || |
| 5498 | (mddev->flags & MD_UPDATE_SB_FLAGS))) { | 5529 | mddev->sb_flags)) { |
| 5499 | /* mark array as shutdown cleanly */ | 5530 | /* mark array as shutdown cleanly */ |
| 5500 | if (!mddev_is_clustered(mddev)) | 5531 | if (!mddev_is_clustered(mddev)) |
| 5501 | mddev->in_sync = 1; | 5532 | mddev->in_sync = 1; |
| @@ -5516,8 +5547,8 @@ static void mddev_detach(struct mddev *mddev) | |||
| 5516 | struct bitmap *bitmap = mddev->bitmap; | 5547 | struct bitmap *bitmap = mddev->bitmap; |
| 5517 | /* wait for behind writes to complete */ | 5548 | /* wait for behind writes to complete */ |
| 5518 | if (bitmap && atomic_read(&bitmap->behind_writes) > 0) { | 5549 | if (bitmap && atomic_read(&bitmap->behind_writes) > 0) { |
| 5519 | printk(KERN_INFO "md:%s: behind writes in progress - waiting to stop.\n", | 5550 | pr_debug("md:%s: behind writes in progress - waiting to stop.\n", |
| 5520 | mdname(mddev)); | 5551 | mdname(mddev)); |
| 5521 | /* need to kick something here to make sure I/O goes? */ | 5552 | /* need to kick something here to make sure I/O goes? */ |
| 5522 | wait_event(bitmap->behind_wait, | 5553 | wait_event(bitmap->behind_wait, |
| 5523 | atomic_read(&bitmap->behind_writes) == 0); | 5554 | atomic_read(&bitmap->behind_writes) == 0); |
| @@ -5578,20 +5609,20 @@ static int md_set_readonly(struct mddev *mddev, struct block_device *bdev) | |||
| 5578 | * which will now never happen */ | 5609 | * which will now never happen */ |
| 5579 | wake_up_process(mddev->sync_thread->tsk); | 5610 | wake_up_process(mddev->sync_thread->tsk); |
| 5580 | 5611 | ||
| 5581 | if (mddev->external && test_bit(MD_CHANGE_PENDING, &mddev->flags)) | 5612 | if (mddev->external && test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) |
| 5582 | return -EBUSY; | 5613 | return -EBUSY; |
| 5583 | mddev_unlock(mddev); | 5614 | mddev_unlock(mddev); |
| 5584 | wait_event(resync_wait, !test_bit(MD_RECOVERY_RUNNING, | 5615 | wait_event(resync_wait, !test_bit(MD_RECOVERY_RUNNING, |
| 5585 | &mddev->recovery)); | 5616 | &mddev->recovery)); |
| 5586 | wait_event(mddev->sb_wait, | 5617 | wait_event(mddev->sb_wait, |
| 5587 | !test_bit(MD_CHANGE_PENDING, &mddev->flags)); | 5618 | !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); |
| 5588 | mddev_lock_nointr(mddev); | 5619 | mddev_lock_nointr(mddev); |
| 5589 | 5620 | ||
| 5590 | mutex_lock(&mddev->open_mutex); | 5621 | mutex_lock(&mddev->open_mutex); |
| 5591 | if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) || | 5622 | if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) || |
| 5592 | mddev->sync_thread || | 5623 | mddev->sync_thread || |
| 5593 | test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { | 5624 | test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { |
| 5594 | printk("md: %s still in use.\n",mdname(mddev)); | 5625 | pr_warn("md: %s still in use.\n",mdname(mddev)); |
| 5595 | if (did_freeze) { | 5626 | if (did_freeze) { |
| 5596 | clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); | 5627 | clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); |
| 5597 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | 5628 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); |
| @@ -5653,7 +5684,7 @@ static int do_md_stop(struct mddev *mddev, int mode, | |||
| 5653 | mddev->sysfs_active || | 5684 | mddev->sysfs_active || |
| 5654 | mddev->sync_thread || | 5685 | mddev->sync_thread || |
| 5655 | test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { | 5686 | test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { |
| 5656 | printk("md: %s still in use.\n",mdname(mddev)); | 5687 | pr_warn("md: %s still in use.\n",mdname(mddev)); |
| 5657 | mutex_unlock(&mddev->open_mutex); | 5688 | mutex_unlock(&mddev->open_mutex); |
| 5658 | if (did_freeze) { | 5689 | if (did_freeze) { |
| 5659 | clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); | 5690 | clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); |
| @@ -5690,7 +5721,7 @@ static int do_md_stop(struct mddev *mddev, int mode, | |||
| 5690 | * Free resources if final stop | 5721 | * Free resources if final stop |
| 5691 | */ | 5722 | */ |
| 5692 | if (mode == 0) { | 5723 | if (mode == 0) { |
| 5693 | printk(KERN_INFO "md: %s stopped.\n", mdname(mddev)); | 5724 | pr_info("md: %s stopped.\n", mdname(mddev)); |
| 5694 | 5725 | ||
| 5695 | bitmap_destroy(mddev); | 5726 | bitmap_destroy(mddev); |
| 5696 | if (mddev->bitmap_info.file) { | 5727 | if (mddev->bitmap_info.file) { |
| @@ -5722,17 +5753,17 @@ static void autorun_array(struct mddev *mddev) | |||
| 5722 | if (list_empty(&mddev->disks)) | 5753 | if (list_empty(&mddev->disks)) |
| 5723 | return; | 5754 | return; |
| 5724 | 5755 | ||
| 5725 | printk(KERN_INFO "md: running: "); | 5756 | pr_info("md: running: "); |
| 5726 | 5757 | ||
| 5727 | rdev_for_each(rdev, mddev) { | 5758 | rdev_for_each(rdev, mddev) { |
| 5728 | char b[BDEVNAME_SIZE]; | 5759 | char b[BDEVNAME_SIZE]; |
| 5729 | printk("<%s>", bdevname(rdev->bdev,b)); | 5760 | pr_cont("<%s>", bdevname(rdev->bdev,b)); |
| 5730 | } | 5761 | } |
| 5731 | printk("\n"); | 5762 | pr_cont("\n"); |
| 5732 | 5763 | ||
| 5733 | err = do_md_run(mddev); | 5764 | err = do_md_run(mddev); |
| 5734 | if (err) { | 5765 | if (err) { |
| 5735 | printk(KERN_WARNING "md: do_md_run() returned %d\n", err); | 5766 | pr_warn("md: do_md_run() returned %d\n", err); |
| 5736 | do_md_stop(mddev, 0, NULL); | 5767 | do_md_stop(mddev, 0, NULL); |
| 5737 | } | 5768 | } |
| 5738 | } | 5769 | } |
| @@ -5755,7 +5786,7 @@ static void autorun_devices(int part) | |||
| 5755 | struct mddev *mddev; | 5786 | struct mddev *mddev; |
| 5756 | char b[BDEVNAME_SIZE]; | 5787 | char b[BDEVNAME_SIZE]; |
| 5757 | 5788 | ||
| 5758 | printk(KERN_INFO "md: autorun ...\n"); | 5789 | pr_info("md: autorun ...\n"); |
| 5759 | while (!list_empty(&pending_raid_disks)) { | 5790 | while (!list_empty(&pending_raid_disks)) { |
| 5760 | int unit; | 5791 | int unit; |
| 5761 | dev_t dev; | 5792 | dev_t dev; |
| @@ -5763,13 +5794,12 @@ static void autorun_devices(int part) | |||
| 5763 | rdev0 = list_entry(pending_raid_disks.next, | 5794 | rdev0 = list_entry(pending_raid_disks.next, |
| 5764 | struct md_rdev, same_set); | 5795 | struct md_rdev, same_set); |
| 5765 | 5796 | ||
| 5766 | printk(KERN_INFO "md: considering %s ...\n", | 5797 | pr_debug("md: considering %s ...\n", bdevname(rdev0->bdev,b)); |
| 5767 | bdevname(rdev0->bdev,b)); | ||
| 5768 | INIT_LIST_HEAD(&candidates); | 5798 | INIT_LIST_HEAD(&candidates); |
| 5769 | rdev_for_each_list(rdev, tmp, &pending_raid_disks) | 5799 | rdev_for_each_list(rdev, tmp, &pending_raid_disks) |
| 5770 | if (super_90_load(rdev, rdev0, 0) >= 0) { | 5800 | if (super_90_load(rdev, rdev0, 0) >= 0) { |
| 5771 | printk(KERN_INFO "md: adding %s ...\n", | 5801 | pr_debug("md: adding %s ...\n", |
| 5772 | bdevname(rdev->bdev,b)); | 5802 | bdevname(rdev->bdev,b)); |
| 5773 | list_move(&rdev->same_set, &candidates); | 5803 | list_move(&rdev->same_set, &candidates); |
| 5774 | } | 5804 | } |
| 5775 | /* | 5805 | /* |
| @@ -5786,8 +5816,8 @@ static void autorun_devices(int part) | |||
| 5786 | unit = MINOR(dev); | 5816 | unit = MINOR(dev); |
| 5787 | } | 5817 | } |
| 5788 | if (rdev0->preferred_minor != unit) { | 5818 | if (rdev0->preferred_minor != unit) { |
| 5789 | printk(KERN_INFO "md: unit number in %s is bad: %d\n", | 5819 | pr_warn("md: unit number in %s is bad: %d\n", |
| 5790 | bdevname(rdev0->bdev, b), rdev0->preferred_minor); | 5820 | bdevname(rdev0->bdev, b), rdev0->preferred_minor); |
| 5791 | break; | 5821 | break; |
| 5792 | } | 5822 | } |
| 5793 | 5823 | ||
| @@ -5796,21 +5826,17 @@ static void autorun_devices(int part) | |||
| 5796 | if (!mddev || !mddev->gendisk) { | 5826 | if (!mddev || !mddev->gendisk) { |
| 5797 | if (mddev) | 5827 | if (mddev) |
| 5798 | mddev_put(mddev); | 5828 | mddev_put(mddev); |
| 5799 | printk(KERN_ERR | ||
| 5800 | "md: cannot allocate memory for md drive.\n"); | ||
| 5801 | break; | 5829 | break; |
| 5802 | } | 5830 | } |
| 5803 | if (mddev_lock(mddev)) | 5831 | if (mddev_lock(mddev)) |
| 5804 | printk(KERN_WARNING "md: %s locked, cannot run\n", | 5832 | pr_warn("md: %s locked, cannot run\n", mdname(mddev)); |
| 5805 | mdname(mddev)); | ||
| 5806 | else if (mddev->raid_disks || mddev->major_version | 5833 | else if (mddev->raid_disks || mddev->major_version |
| 5807 | || !list_empty(&mddev->disks)) { | 5834 | || !list_empty(&mddev->disks)) { |
| 5808 | printk(KERN_WARNING | 5835 | pr_warn("md: %s already running, cannot run %s\n", |
| 5809 | "md: %s already running, cannot run %s\n", | ||
| 5810 | mdname(mddev), bdevname(rdev0->bdev,b)); | 5836 | mdname(mddev), bdevname(rdev0->bdev,b)); |
| 5811 | mddev_unlock(mddev); | 5837 | mddev_unlock(mddev); |
| 5812 | } else { | 5838 | } else { |
| 5813 | printk(KERN_INFO "md: created %s\n", mdname(mddev)); | 5839 | pr_debug("md: created %s\n", mdname(mddev)); |
| 5814 | mddev->persistent = 1; | 5840 | mddev->persistent = 1; |
| 5815 | rdev_for_each_list(rdev, tmp, &candidates) { | 5841 | rdev_for_each_list(rdev, tmp, &candidates) { |
| 5816 | list_del_init(&rdev->same_set); | 5842 | list_del_init(&rdev->same_set); |
| @@ -5829,7 +5855,7 @@ static void autorun_devices(int part) | |||
| 5829 | } | 5855 | } |
| 5830 | mddev_put(mddev); | 5856 | mddev_put(mddev); |
| 5831 | } | 5857 | } |
| 5832 | printk(KERN_INFO "md: ... autorun DONE.\n"); | 5858 | pr_info("md: ... autorun DONE.\n"); |
| 5833 | } | 5859 | } |
| 5834 | #endif /* !MODULE */ | 5860 | #endif /* !MODULE */ |
| 5835 | 5861 | ||
| @@ -5964,6 +5990,8 @@ static int get_disk_info(struct mddev *mddev, void __user * arg) | |||
| 5964 | info.state |= (1<<MD_DISK_JOURNAL); | 5990 | info.state |= (1<<MD_DISK_JOURNAL); |
| 5965 | if (test_bit(WriteMostly, &rdev->flags)) | 5991 | if (test_bit(WriteMostly, &rdev->flags)) |
| 5966 | info.state |= (1<<MD_DISK_WRITEMOSTLY); | 5992 | info.state |= (1<<MD_DISK_WRITEMOSTLY); |
| 5993 | if (test_bit(FailFast, &rdev->flags)) | ||
| 5994 | info.state |= (1<<MD_DISK_FAILFAST); | ||
| 5967 | } else { | 5995 | } else { |
| 5968 | info.major = info.minor = 0; | 5996 | info.major = info.minor = 0; |
| 5969 | info.raid_disk = -1; | 5997 | info.raid_disk = -1; |
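With this hunk the FailFast rdev flag becomes visible to userspace: GET_DISK_INFO folds it into info.state as MD_DISK_FAILFAST, and add_new_disk (further down) performs the reverse mapping when a disk is added. A small sketch of that internal-flag / ioctl-bitmask round trip; the bit values here are placeholders, not the real constants from md_p.h:

    #include <stdio.h>

    /* internal per-device flags (illustrative values) */
    enum { IN_SYNC = 1 << 0, WRITE_MOSTLY = 1 << 1, FAILFAST = 1 << 2 };
    /* bits exported through the ioctl's disk info state word (also illustrative) */
    enum { DISK_SYNC = 1 << 0, DISK_WRITEMOSTLY = 1 << 1, DISK_FAILFAST = 1 << 2 };

    static unsigned export_state(unsigned flags)      /* GET_DISK_INFO direction */
    {
        unsigned state = 0;
        if (flags & IN_SYNC)      state |= DISK_SYNC;
        if (flags & WRITE_MOSTLY) state |= DISK_WRITEMOSTLY;
        if (flags & FAILFAST)     state |= DISK_FAILFAST;
        return state;
    }

    static unsigned import_state(unsigned state)       /* add_new_disk direction */
    {
        unsigned flags = 0;
        if (state & DISK_SYNC)        flags |= IN_SYNC;
        if (state & DISK_WRITEMOSTLY) flags |= WRITE_MOSTLY;
        if (state & DISK_FAILFAST)    flags |= FAILFAST;
        return flags;
    }

    int main(void)
    {
        unsigned flags = IN_SYNC | FAILFAST;
        unsigned state = export_state(flags);

        printf("state=%#x flags=%#x\n", state, import_state(state));
        return 0;
    }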
| @@ -5985,8 +6013,8 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info) | |||
| 5985 | 6013 | ||
| 5986 | if (mddev_is_clustered(mddev) && | 6014 | if (mddev_is_clustered(mddev) && |
| 5987 | !(info->state & ((1 << MD_DISK_CLUSTER_ADD) | (1 << MD_DISK_CANDIDATE)))) { | 6015 | !(info->state & ((1 << MD_DISK_CLUSTER_ADD) | (1 << MD_DISK_CANDIDATE)))) { |
| 5988 | pr_err("%s: Cannot add to clustered mddev.\n", | 6016 | pr_warn("%s: Cannot add to clustered mddev.\n", |
| 5989 | mdname(mddev)); | 6017 | mdname(mddev)); |
| 5990 | return -EINVAL; | 6018 | return -EINVAL; |
| 5991 | } | 6019 | } |
| 5992 | 6020 | ||
| @@ -5998,8 +6026,7 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info) | |||
| 5998 | /* expecting a device which has a superblock */ | 6026 | /* expecting a device which has a superblock */ |
| 5999 | rdev = md_import_device(dev, mddev->major_version, mddev->minor_version); | 6027 | rdev = md_import_device(dev, mddev->major_version, mddev->minor_version); |
| 6000 | if (IS_ERR(rdev)) { | 6028 | if (IS_ERR(rdev)) { |
| 6001 | printk(KERN_WARNING | 6029 | pr_warn("md: md_import_device returned %ld\n", |
| 6002 | "md: md_import_device returned %ld\n", | ||
| 6003 | PTR_ERR(rdev)); | 6030 | PTR_ERR(rdev)); |
| 6004 | return PTR_ERR(rdev); | 6031 | return PTR_ERR(rdev); |
| 6005 | } | 6032 | } |
| @@ -6010,8 +6037,7 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info) | |||
| 6010 | err = super_types[mddev->major_version] | 6037 | err = super_types[mddev->major_version] |
| 6011 | .load_super(rdev, rdev0, mddev->minor_version); | 6038 | .load_super(rdev, rdev0, mddev->minor_version); |
| 6012 | if (err < 0) { | 6039 | if (err < 0) { |
| 6013 | printk(KERN_WARNING | 6040 | pr_warn("md: %s has different UUID to %s\n", |
| 6014 | "md: %s has different UUID to %s\n", | ||
| 6015 | bdevname(rdev->bdev,b), | 6041 | bdevname(rdev->bdev,b), |
| 6016 | bdevname(rdev0->bdev,b2)); | 6042 | bdevname(rdev0->bdev,b2)); |
| 6017 | export_rdev(rdev); | 6043 | export_rdev(rdev); |
| @@ -6032,9 +6058,8 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info) | |||
| 6032 | if (mddev->pers) { | 6058 | if (mddev->pers) { |
| 6033 | int err; | 6059 | int err; |
| 6034 | if (!mddev->pers->hot_add_disk) { | 6060 | if (!mddev->pers->hot_add_disk) { |
| 6035 | printk(KERN_WARNING | 6061 | pr_warn("%s: personality does not support diskops!\n", |
| 6036 | "%s: personality does not support diskops!\n", | 6062 | mdname(mddev)); |
| 6037 | mdname(mddev)); | ||
| 6038 | return -EINVAL; | 6063 | return -EINVAL; |
| 6039 | } | 6064 | } |
| 6040 | if (mddev->persistent) | 6065 | if (mddev->persistent) |
| @@ -6043,8 +6068,7 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info) | |||
| 6043 | else | 6068 | else |
| 6044 | rdev = md_import_device(dev, -1, -1); | 6069 | rdev = md_import_device(dev, -1, -1); |
| 6045 | if (IS_ERR(rdev)) { | 6070 | if (IS_ERR(rdev)) { |
| 6046 | printk(KERN_WARNING | 6071 | pr_warn("md: md_import_device returned %ld\n", |
| 6047 | "md: md_import_device returned %ld\n", | ||
| 6048 | PTR_ERR(rdev)); | 6072 | PTR_ERR(rdev)); |
| 6049 | return PTR_ERR(rdev); | 6073 | return PTR_ERR(rdev); |
| 6050 | } | 6074 | } |
| @@ -6075,6 +6099,10 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info) | |||
| 6075 | set_bit(WriteMostly, &rdev->flags); | 6099 | set_bit(WriteMostly, &rdev->flags); |
| 6076 | else | 6100 | else |
| 6077 | clear_bit(WriteMostly, &rdev->flags); | 6101 | clear_bit(WriteMostly, &rdev->flags); |
| 6102 | if (info->state & (1<<MD_DISK_FAILFAST)) | ||
| 6103 | set_bit(FailFast, &rdev->flags); | ||
| 6104 | else | ||
| 6105 | clear_bit(FailFast, &rdev->flags); | ||
| 6078 | 6106 | ||
| 6079 | if (info->state & (1<<MD_DISK_JOURNAL)) { | 6107 | if (info->state & (1<<MD_DISK_JOURNAL)) { |
| 6080 | struct md_rdev *rdev2; | 6108 | struct md_rdev *rdev2; |
| @@ -6140,8 +6168,7 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info) | |||
| 6140 | * for major_version==0 superblocks | 6168 | * for major_version==0 superblocks |
| 6141 | */ | 6169 | */ |
| 6142 | if (mddev->major_version != 0) { | 6170 | if (mddev->major_version != 0) { |
| 6143 | printk(KERN_WARNING "%s: ADD_NEW_DISK not supported\n", | 6171 | pr_warn("%s: ADD_NEW_DISK not supported\n", mdname(mddev)); |
| 6144 | mdname(mddev)); | ||
| 6145 | return -EINVAL; | 6172 | return -EINVAL; |
| 6146 | } | 6173 | } |
| 6147 | 6174 | ||
| @@ -6149,8 +6176,7 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info) | |||
| 6149 | int err; | 6176 | int err; |
| 6150 | rdev = md_import_device(dev, -1, 0); | 6177 | rdev = md_import_device(dev, -1, 0); |
| 6151 | if (IS_ERR(rdev)) { | 6178 | if (IS_ERR(rdev)) { |
| 6152 | printk(KERN_WARNING | 6179 | pr_warn("md: error, md_import_device() returned %ld\n", |
| 6153 | "md: error, md_import_device() returned %ld\n", | ||
| 6154 | PTR_ERR(rdev)); | 6180 | PTR_ERR(rdev)); |
| 6155 | return PTR_ERR(rdev); | 6181 | return PTR_ERR(rdev); |
| 6156 | } | 6182 | } |
| @@ -6166,9 +6192,11 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info) | |||
| 6166 | 6192 | ||
| 6167 | if (info->state & (1<<MD_DISK_WRITEMOSTLY)) | 6193 | if (info->state & (1<<MD_DISK_WRITEMOSTLY)) |
| 6168 | set_bit(WriteMostly, &rdev->flags); | 6194 | set_bit(WriteMostly, &rdev->flags); |
| 6195 | if (info->state & (1<<MD_DISK_FAILFAST)) | ||
| 6196 | set_bit(FailFast, &rdev->flags); | ||
| 6169 | 6197 | ||
| 6170 | if (!mddev->persistent) { | 6198 | if (!mddev->persistent) { |
| 6171 | printk(KERN_INFO "md: nonpersistent superblock ...\n"); | 6199 | pr_debug("md: nonpersistent superblock ...\n"); |
| 6172 | rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512; | 6200 | rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512; |
| 6173 | } else | 6201 | } else |
| 6174 | rdev->sb_start = calc_dev_sboffset(rdev); | 6202 | rdev->sb_start = calc_dev_sboffset(rdev); |
| @@ -6207,13 +6235,17 @@ kick_rdev: | |||
| 6207 | md_cluster_ops->remove_disk(mddev, rdev); | 6235 | md_cluster_ops->remove_disk(mddev, rdev); |
| 6208 | 6236 | ||
| 6209 | md_kick_rdev_from_array(rdev); | 6237 | md_kick_rdev_from_array(rdev); |
| 6210 | md_update_sb(mddev, 1); | 6238 | set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); |
| 6239 | if (mddev->thread) | ||
| 6240 | md_wakeup_thread(mddev->thread); | ||
| 6241 | else | ||
| 6242 | md_update_sb(mddev, 1); | ||
| 6211 | md_new_event(mddev); | 6243 | md_new_event(mddev); |
| 6212 | 6244 | ||
| 6213 | return 0; | 6245 | return 0; |
| 6214 | busy: | 6246 | busy: |
| 6215 | printk(KERN_WARNING "md: cannot remove active disk %s from %s ...\n", | 6247 | pr_debug("md: cannot remove active disk %s from %s ...\n", |
| 6216 | bdevname(rdev->bdev,b), mdname(mddev)); | 6248 | bdevname(rdev->bdev,b), mdname(mddev)); |
| 6217 | return -EBUSY; | 6249 | return -EBUSY; |
| 6218 | } | 6250 | } |
| 6219 | 6251 | ||
| @@ -6227,22 +6259,19 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev) | |||
| 6227 | return -ENODEV; | 6259 | return -ENODEV; |
| 6228 | 6260 | ||
| 6229 | if (mddev->major_version != 0) { | 6261 | if (mddev->major_version != 0) { |
| 6230 | printk(KERN_WARNING "%s: HOT_ADD may only be used with" | 6262 | pr_warn("%s: HOT_ADD may only be used with version-0 superblocks.\n", |
| 6231 | " version-0 superblocks.\n", | ||
| 6232 | mdname(mddev)); | 6263 | mdname(mddev)); |
| 6233 | return -EINVAL; | 6264 | return -EINVAL; |
| 6234 | } | 6265 | } |
| 6235 | if (!mddev->pers->hot_add_disk) { | 6266 | if (!mddev->pers->hot_add_disk) { |
| 6236 | printk(KERN_WARNING | 6267 | pr_warn("%s: personality does not support diskops!\n", |
| 6237 | "%s: personality does not support diskops!\n", | ||
| 6238 | mdname(mddev)); | 6268 | mdname(mddev)); |
| 6239 | return -EINVAL; | 6269 | return -EINVAL; |
| 6240 | } | 6270 | } |
| 6241 | 6271 | ||
| 6242 | rdev = md_import_device(dev, -1, 0); | 6272 | rdev = md_import_device(dev, -1, 0); |
| 6243 | if (IS_ERR(rdev)) { | 6273 | if (IS_ERR(rdev)) { |
| 6244 | printk(KERN_WARNING | 6274 | pr_warn("md: error, md_import_device() returned %ld\n", |
| 6245 | "md: error, md_import_device() returned %ld\n", | ||
| 6246 | PTR_ERR(rdev)); | 6275 | PTR_ERR(rdev)); |
| 6247 | return -EINVAL; | 6276 | return -EINVAL; |
| 6248 | } | 6277 | } |
| @@ -6255,8 +6284,7 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev) | |||
| 6255 | rdev->sectors = rdev->sb_start; | 6284 | rdev->sectors = rdev->sb_start; |
| 6256 | 6285 | ||
| 6257 | if (test_bit(Faulty, &rdev->flags)) { | 6286 | if (test_bit(Faulty, &rdev->flags)) { |
| 6258 | printk(KERN_WARNING | 6287 | pr_warn("md: can not hot-add faulty %s disk to %s!\n", |
| 6259 | "md: can not hot-add faulty %s disk to %s!\n", | ||
| 6260 | bdevname(rdev->bdev,b), mdname(mddev)); | 6288 | bdevname(rdev->bdev,b), mdname(mddev)); |
| 6261 | err = -EINVAL; | 6289 | err = -EINVAL; |
| 6262 | goto abort_export; | 6290 | goto abort_export; |
| @@ -6276,7 +6304,9 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev) | |||
| 6276 | 6304 | ||
| 6277 | rdev->raid_disk = -1; | 6305 | rdev->raid_disk = -1; |
| 6278 | 6306 | ||
| 6279 | md_update_sb(mddev, 1); | 6307 | set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); |
| 6308 | if (!mddev->thread) | ||
| 6309 | md_update_sb(mddev, 1); | ||
| 6280 | /* | 6310 | /* |
| 6281 | * Kick recovery, maybe this spare has to be added to the | 6311 | * Kick recovery, maybe this spare has to be added to the |
| 6282 | * array immediately. | 6312 | * array immediately. |
| @@ -6312,23 +6342,23 @@ static int set_bitmap_file(struct mddev *mddev, int fd) | |||
| 6312 | f = fget(fd); | 6342 | f = fget(fd); |
| 6313 | 6343 | ||
| 6314 | if (f == NULL) { | 6344 | if (f == NULL) { |
| 6315 | printk(KERN_ERR "%s: error: failed to get bitmap file\n", | 6345 | pr_warn("%s: error: failed to get bitmap file\n", |
| 6316 | mdname(mddev)); | 6346 | mdname(mddev)); |
| 6317 | return -EBADF; | 6347 | return -EBADF; |
| 6318 | } | 6348 | } |
| 6319 | 6349 | ||
| 6320 | inode = f->f_mapping->host; | 6350 | inode = f->f_mapping->host; |
| 6321 | if (!S_ISREG(inode->i_mode)) { | 6351 | if (!S_ISREG(inode->i_mode)) { |
| 6322 | printk(KERN_ERR "%s: error: bitmap file must be a regular file\n", | 6352 | pr_warn("%s: error: bitmap file must be a regular file\n", |
| 6323 | mdname(mddev)); | 6353 | mdname(mddev)); |
| 6324 | err = -EBADF; | 6354 | err = -EBADF; |
| 6325 | } else if (!(f->f_mode & FMODE_WRITE)) { | 6355 | } else if (!(f->f_mode & FMODE_WRITE)) { |
| 6326 | printk(KERN_ERR "%s: error: bitmap file must open for write\n", | 6356 | pr_warn("%s: error: bitmap file must open for write\n", |
| 6327 | mdname(mddev)); | 6357 | mdname(mddev)); |
| 6328 | err = -EBADF; | 6358 | err = -EBADF; |
| 6329 | } else if (atomic_read(&inode->i_writecount) != 1) { | 6359 | } else if (atomic_read(&inode->i_writecount) != 1) { |
| 6330 | printk(KERN_ERR "%s: error: bitmap file is already in use\n", | 6360 | pr_warn("%s: error: bitmap file is already in use\n", |
| 6331 | mdname(mddev)); | 6361 | mdname(mddev)); |
| 6332 | err = -EBUSY; | 6362 | err = -EBUSY; |
| 6333 | } | 6363 | } |
| 6334 | if (err) { | 6364 | if (err) { |
| @@ -6393,8 +6423,7 @@ static int set_array_info(struct mddev *mddev, mdu_array_info_t *info) | |||
| 6393 | info->major_version >= ARRAY_SIZE(super_types) || | 6423 | info->major_version >= ARRAY_SIZE(super_types) || |
| 6394 | super_types[info->major_version].name == NULL) { | 6424 | super_types[info->major_version].name == NULL) { |
| 6395 | /* maybe try to auto-load a module? */ | 6425 | /* maybe try to auto-load a module? */ |
| 6396 | printk(KERN_INFO | 6426 | pr_warn("md: superblock version %d not known\n", |
| 6397 | "md: superblock version %d not known\n", | ||
| 6398 | info->major_version); | 6427 | info->major_version); |
| 6399 | return -EINVAL; | 6428 | return -EINVAL; |
| 6400 | } | 6429 | } |
| @@ -6432,9 +6461,11 @@ static int set_array_info(struct mddev *mddev, mdu_array_info_t *info) | |||
| 6432 | 6461 | ||
| 6433 | mddev->max_disks = MD_SB_DISKS; | 6462 | mddev->max_disks = MD_SB_DISKS; |
| 6434 | 6463 | ||
| 6435 | if (mddev->persistent) | 6464 | if (mddev->persistent) { |
| 6436 | mddev->flags = 0; | 6465 | mddev->flags = 0; |
| 6437 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | 6466 | mddev->sb_flags = 0; |
| 6467 | } | ||
| 6468 | set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); | ||
| 6438 | 6469 | ||
| 6439 | mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9; | 6470 | mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9; |
| 6440 | mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9); | 6471 | mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9); |
| @@ -6660,8 +6691,7 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) | |||
| 6660 | if (mddev->bitmap_info.nodes) { | 6691 | if (mddev->bitmap_info.nodes) { |
| 6661 | /* hold PW on all the bitmap lock */ | 6692 | /* hold PW on all the bitmap lock */ |
| 6662 | if (md_cluster_ops->lock_all_bitmaps(mddev) <= 0) { | 6693 | if (md_cluster_ops->lock_all_bitmaps(mddev) <= 0) { |
| 6663 | printk("md: can't change bitmap to none since the" | 6694 | pr_warn("md: can't change bitmap to none since the array is in use by more than one node\n"); |
| 6664 | " array is in use by more than one node\n"); | ||
| 6665 | rv = -EPERM; | 6695 | rv = -EPERM; |
| 6666 | md_cluster_ops->unlock_all_bitmaps(mddev); | 6696 | md_cluster_ops->unlock_all_bitmaps(mddev); |
| 6667 | goto err; | 6697 | goto err; |
| @@ -6829,7 +6859,7 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, | |||
| 6829 | /* need to ensure recovery thread has run */ | 6859 | /* need to ensure recovery thread has run */ |
| 6830 | wait_event_interruptible_timeout(mddev->sb_wait, | 6860 | wait_event_interruptible_timeout(mddev->sb_wait, |
| 6831 | !test_bit(MD_RECOVERY_NEEDED, | 6861 | !test_bit(MD_RECOVERY_NEEDED, |
| 6832 | &mddev->flags), | 6862 | &mddev->recovery), |
| 6833 | msecs_to_jiffies(5000)); | 6863 | msecs_to_jiffies(5000)); |
| 6834 | if (cmd == STOP_ARRAY || cmd == STOP_ARRAY_RO) { | 6864 | if (cmd == STOP_ARRAY || cmd == STOP_ARRAY_RO) { |
| 6835 | /* Need to flush page cache, and ensure no-one else opens | 6865 | /* Need to flush page cache, and ensure no-one else opens |
| @@ -6847,9 +6877,8 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, | |||
| 6847 | } | 6877 | } |
| 6848 | err = mddev_lock(mddev); | 6878 | err = mddev_lock(mddev); |
| 6849 | if (err) { | 6879 | if (err) { |
| 6850 | printk(KERN_INFO | 6880 | pr_debug("md: ioctl lock interrupted, reason %d, cmd %d\n", |
| 6851 | "md: ioctl lock interrupted, reason %d, cmd %d\n", | 6881 | err, cmd); |
| 6852 | err, cmd); | ||
| 6853 | goto out; | 6882 | goto out; |
| 6854 | } | 6883 | } |
| 6855 | 6884 | ||
| @@ -6864,30 +6893,24 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, | |||
| 6864 | if (mddev->pers) { | 6893 | if (mddev->pers) { |
| 6865 | err = update_array_info(mddev, &info); | 6894 | err = update_array_info(mddev, &info); |
| 6866 | if (err) { | 6895 | if (err) { |
| 6867 | printk(KERN_WARNING "md: couldn't update" | 6896 | pr_warn("md: couldn't update array info. %d\n", err); |
| 6868 | " array info. %d\n", err); | ||
| 6869 | goto unlock; | 6897 | goto unlock; |
| 6870 | } | 6898 | } |
| 6871 | goto unlock; | 6899 | goto unlock; |
| 6872 | } | 6900 | } |
| 6873 | if (!list_empty(&mddev->disks)) { | 6901 | if (!list_empty(&mddev->disks)) { |
| 6874 | printk(KERN_WARNING | 6902 | pr_warn("md: array %s already has disks!\n", mdname(mddev)); |
| 6875 | "md: array %s already has disks!\n", | ||
| 6876 | mdname(mddev)); | ||
| 6877 | err = -EBUSY; | 6903 | err = -EBUSY; |
| 6878 | goto unlock; | 6904 | goto unlock; |
| 6879 | } | 6905 | } |
| 6880 | if (mddev->raid_disks) { | 6906 | if (mddev->raid_disks) { |
| 6881 | printk(KERN_WARNING | 6907 | pr_warn("md: array %s already initialised!\n", mdname(mddev)); |
| 6882 | "md: array %s already initialised!\n", | ||
| 6883 | mdname(mddev)); | ||
| 6884 | err = -EBUSY; | 6908 | err = -EBUSY; |
| 6885 | goto unlock; | 6909 | goto unlock; |
| 6886 | } | 6910 | } |
| 6887 | err = set_array_info(mddev, &info); | 6911 | err = set_array_info(mddev, &info); |
| 6888 | if (err) { | 6912 | if (err) { |
| 6889 | printk(KERN_WARNING "md: couldn't set" | 6913 | pr_warn("md: couldn't set array info. %d\n", err); |
| 6890 | " array info. %d\n", err); | ||
| 6891 | goto unlock; | 6914 | goto unlock; |
| 6892 | } | 6915 | } |
| 6893 | goto unlock; | 6916 | goto unlock; |
| @@ -6987,11 +7010,11 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, | |||
| 6987 | /* If a device failed while we were read-only, we | 7010 | /* If a device failed while we were read-only, we |
| 6988 | * need to make sure the metadata is updated now. | 7011 | * need to make sure the metadata is updated now. |
| 6989 | */ | 7012 | */ |
| 6990 | if (test_bit(MD_CHANGE_DEVS, &mddev->flags)) { | 7013 | if (test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)) { |
| 6991 | mddev_unlock(mddev); | 7014 | mddev_unlock(mddev); |
| 6992 | wait_event(mddev->sb_wait, | 7015 | wait_event(mddev->sb_wait, |
| 6993 | !test_bit(MD_CHANGE_DEVS, &mddev->flags) && | 7016 | !test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags) && |
| 6994 | !test_bit(MD_CHANGE_PENDING, &mddev->flags)); | 7017 | !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); |
| 6995 | mddev_lock_nointr(mddev); | 7018 | mddev_lock_nointr(mddev); |
| 6996 | } | 7019 | } |
| 6997 | } else { | 7020 | } else { |
| @@ -7092,7 +7115,8 @@ static int md_open(struct block_device *bdev, fmode_t mode) | |||
| 7092 | 7115 | ||
| 7093 | if (test_bit(MD_CLOSING, &mddev->flags)) { | 7116 | if (test_bit(MD_CLOSING, &mddev->flags)) { |
| 7094 | mutex_unlock(&mddev->open_mutex); | 7117 | mutex_unlock(&mddev->open_mutex); |
| 7095 | return -ENODEV; | 7118 | err = -ENODEV; |
| 7119 | goto out; | ||
| 7096 | } | 7120 | } |
| 7097 | 7121 | ||
| 7098 | err = 0; | 7122 | err = 0; |
| @@ -7101,6 +7125,8 @@ static int md_open(struct block_device *bdev, fmode_t mode) | |||
| 7101 | 7125 | ||
| 7102 | check_disk_change(bdev); | 7126 | check_disk_change(bdev); |
| 7103 | out: | 7127 | out: |
| 7128 | if (err) | ||
| 7129 | mddev_put(mddev); | ||
| 7104 | return err; | 7130 | return err; |
| 7105 | } | 7131 | } |
| 7106 | 7132 | ||
| @@ -7171,10 +7197,12 @@ static int md_thread(void *arg) | |||
| 7171 | wait_event_interruptible_timeout | 7197 | wait_event_interruptible_timeout |
| 7172 | (thread->wqueue, | 7198 | (thread->wqueue, |
| 7173 | test_bit(THREAD_WAKEUP, &thread->flags) | 7199 | test_bit(THREAD_WAKEUP, &thread->flags) |
| 7174 | || kthread_should_stop(), | 7200 | || kthread_should_stop() || kthread_should_park(), |
| 7175 | thread->timeout); | 7201 | thread->timeout); |
| 7176 | 7202 | ||
| 7177 | clear_bit(THREAD_WAKEUP, &thread->flags); | 7203 | clear_bit(THREAD_WAKEUP, &thread->flags); |
| 7204 | if (kthread_should_park()) | ||
| 7205 | kthread_parkme(); | ||
| 7178 | if (!kthread_should_stop()) | 7206 | if (!kthread_should_stop()) |
| 7179 | thread->run(thread); | 7207 | thread->run(thread); |
| 7180 | } | 7208 | } |
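Editor's note on the hunk above: md_thread() now also wakes for kthread_should_park() and calls kthread_parkme(), so the per-array thread can be quiesced with kthread_park() and resumed with kthread_unpark() instead of being stopped and re-created. A minimal sketch of that park/unpark pattern, using nothing beyond the generic kthread API (demo_worker and demo_quiesce are invented names, not md code):

#include <linux/kthread.h>
#include <linux/delay.h>
#include <linux/sched.h>

static int demo_worker(void *arg)
{
        while (!kthread_should_stop()) {
                if (kthread_should_park())
                        kthread_parkme();       /* sleeps here until unparked */
                /* ... one unit of work ... */
                msleep(100);
        }
        return 0;
}

static void demo_quiesce(struct task_struct *tsk)
{
        if (kthread_park(tsk) == 0) {
                /* the worker is guaranteed to be parked (idle) here */
                kthread_unpark(tsk);
        }
}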
| @@ -7588,8 +7616,8 @@ static const struct file_operations md_seq_fops = { | |||
| 7588 | 7616 | ||
| 7589 | int register_md_personality(struct md_personality *p) | 7617 | int register_md_personality(struct md_personality *p) |
| 7590 | { | 7618 | { |
| 7591 | printk(KERN_INFO "md: %s personality registered for level %d\n", | 7619 | pr_debug("md: %s personality registered for level %d\n", |
| 7592 | p->name, p->level); | 7620 | p->name, p->level); |
| 7593 | spin_lock(&pers_lock); | 7621 | spin_lock(&pers_lock); |
| 7594 | list_add_tail(&p->list, &pers_list); | 7622 | list_add_tail(&p->list, &pers_list); |
| 7595 | spin_unlock(&pers_lock); | 7623 | spin_unlock(&pers_lock); |
| @@ -7599,7 +7627,7 @@ EXPORT_SYMBOL(register_md_personality); | |||
| 7599 | 7627 | ||
| 7600 | int unregister_md_personality(struct md_personality *p) | 7628 | int unregister_md_personality(struct md_personality *p) |
| 7601 | { | 7629 | { |
| 7602 | printk(KERN_INFO "md: %s personality unregistered\n", p->name); | 7630 | pr_debug("md: %s personality unregistered\n", p->name); |
| 7603 | spin_lock(&pers_lock); | 7631 | spin_lock(&pers_lock); |
| 7604 | list_del_init(&p->list); | 7632 | list_del_init(&p->list); |
| 7605 | spin_unlock(&pers_lock); | 7633 | spin_unlock(&pers_lock); |
| @@ -7639,7 +7667,7 @@ int md_setup_cluster(struct mddev *mddev, int nodes) | |||
| 7639 | spin_lock(&pers_lock); | 7667 | spin_lock(&pers_lock); |
| 7640 | /* ensure module won't be unloaded */ | 7668 | /* ensure module won't be unloaded */ |
| 7641 | if (!md_cluster_ops || !try_module_get(md_cluster_mod)) { | 7669 | if (!md_cluster_ops || !try_module_get(md_cluster_mod)) { |
| 7642 | pr_err("can't find md-cluster module or get it's reference.\n"); | 7670 | pr_warn("can't find md-cluster module or get it's reference.\n"); |
| 7643 | spin_unlock(&pers_lock); | 7671 | spin_unlock(&pers_lock); |
| 7644 | return -ENOENT; | 7672 | return -ENOENT; |
| 7645 | } | 7673 | } |
| @@ -7741,8 +7769,8 @@ void md_write_start(struct mddev *mddev, struct bio *bi) | |||
| 7741 | spin_lock(&mddev->lock); | 7769 | spin_lock(&mddev->lock); |
| 7742 | if (mddev->in_sync) { | 7770 | if (mddev->in_sync) { |
| 7743 | mddev->in_sync = 0; | 7771 | mddev->in_sync = 0; |
| 7744 | set_bit(MD_CHANGE_CLEAN, &mddev->flags); | 7772 | set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); |
| 7745 | set_bit(MD_CHANGE_PENDING, &mddev->flags); | 7773 | set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); |
| 7746 | md_wakeup_thread(mddev->thread); | 7774 | md_wakeup_thread(mddev->thread); |
| 7747 | did_change = 1; | 7775 | did_change = 1; |
| 7748 | } | 7776 | } |
| @@ -7751,7 +7779,7 @@ void md_write_start(struct mddev *mddev, struct bio *bi) | |||
| 7751 | if (did_change) | 7779 | if (did_change) |
| 7752 | sysfs_notify_dirent_safe(mddev->sysfs_state); | 7780 | sysfs_notify_dirent_safe(mddev->sysfs_state); |
| 7753 | wait_event(mddev->sb_wait, | 7781 | wait_event(mddev->sb_wait, |
| 7754 | !test_bit(MD_CHANGE_PENDING, &mddev->flags)); | 7782 | !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); |
| 7755 | } | 7783 | } |
| 7756 | EXPORT_SYMBOL(md_write_start); | 7784 | EXPORT_SYMBOL(md_write_start); |
| 7757 | 7785 | ||
| @@ -7772,7 +7800,7 @@ EXPORT_SYMBOL(md_write_end); | |||
| 7772 | * attempting a GFP_KERNEL allocation while holding the mddev lock. | 7800 | * attempting a GFP_KERNEL allocation while holding the mddev lock. |
| 7773 | * Must be called with mddev_lock held. | 7801 | * Must be called with mddev_lock held. |
| 7774 | * | 7802 | * |
| 7775 | * In the ->external case MD_CHANGE_PENDING can not be cleared until mddev->lock | 7803 | * In the ->external case MD_SB_CHANGE_PENDING can not be cleared until mddev->lock |
| 7776 | * is dropped, so return -EAGAIN after notifying userspace. | 7804 | * is dropped, so return -EAGAIN after notifying userspace. |
| 7777 | */ | 7805 | */ |
| 7778 | int md_allow_write(struct mddev *mddev) | 7806 | int md_allow_write(struct mddev *mddev) |
| @@ -7787,8 +7815,8 @@ int md_allow_write(struct mddev *mddev) | |||
| 7787 | spin_lock(&mddev->lock); | 7815 | spin_lock(&mddev->lock); |
| 7788 | if (mddev->in_sync) { | 7816 | if (mddev->in_sync) { |
| 7789 | mddev->in_sync = 0; | 7817 | mddev->in_sync = 0; |
| 7790 | set_bit(MD_CHANGE_CLEAN, &mddev->flags); | 7818 | set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); |
| 7791 | set_bit(MD_CHANGE_PENDING, &mddev->flags); | 7819 | set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); |
| 7792 | if (mddev->safemode_delay && | 7820 | if (mddev->safemode_delay && |
| 7793 | mddev->safemode == 0) | 7821 | mddev->safemode == 0) |
| 7794 | mddev->safemode = 1; | 7822 | mddev->safemode = 1; |
| @@ -7798,7 +7826,7 @@ int md_allow_write(struct mddev *mddev) | |||
| 7798 | } else | 7826 | } else |
| 7799 | spin_unlock(&mddev->lock); | 7827 | spin_unlock(&mddev->lock); |
| 7800 | 7828 | ||
| 7801 | if (test_bit(MD_CHANGE_PENDING, &mddev->flags)) | 7829 | if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) |
| 7802 | return -EAGAIN; | 7830 | return -EAGAIN; |
| 7803 | else | 7831 | else |
| 7804 | return 0; | 7832 | return 0; |
| @@ -7914,11 +7942,9 @@ void md_do_sync(struct md_thread *thread) | |||
| 7914 | mddev2->curr_resync >= mddev->curr_resync) { | 7942 | mddev2->curr_resync >= mddev->curr_resync) { |
| 7915 | if (mddev2_minor != mddev2->md_minor) { | 7943 | if (mddev2_minor != mddev2->md_minor) { |
| 7916 | mddev2_minor = mddev2->md_minor; | 7944 | mddev2_minor = mddev2->md_minor; |
| 7917 | printk(KERN_INFO "md: delaying %s of %s" | 7945 | pr_info("md: delaying %s of %s until %s has finished (they share one or more physical units)\n", |
| 7918 | " until %s has finished (they" | 7946 | desc, mdname(mddev), |
| 7919 | " share one or more physical units)\n", | 7947 | mdname(mddev2)); |
| 7920 | desc, mdname(mddev), | ||
| 7921 | mdname(mddev2)); | ||
| 7922 | } | 7948 | } |
| 7923 | mddev_put(mddev2); | 7949 | mddev_put(mddev2); |
| 7924 | if (signal_pending(current)) | 7950 | if (signal_pending(current)) |
| @@ -7975,12 +8001,10 @@ void md_do_sync(struct md_thread *thread) | |||
| 7975 | } | 8001 | } |
| 7976 | } | 8002 | } |
| 7977 | 8003 | ||
| 7978 | printk(KERN_INFO "md: %s of RAID array %s\n", desc, mdname(mddev)); | 8004 | pr_info("md: %s of RAID array %s\n", desc, mdname(mddev)); |
| 7979 | printk(KERN_INFO "md: minimum _guaranteed_ speed:" | 8005 | pr_debug("md: minimum _guaranteed_ speed: %d KB/sec/disk.\n", speed_min(mddev)); |
| 7980 | " %d KB/sec/disk.\n", speed_min(mddev)); | 8006 | pr_debug("md: using maximum available idle IO bandwidth (but not more than %d KB/sec) for %s.\n", |
| 7981 | printk(KERN_INFO "md: using maximum available idle IO bandwidth " | 8007 | speed_max(mddev), desc); |
| 7982 | "(but not more than %d KB/sec) for %s.\n", | ||
| 7983 | speed_max(mddev), desc); | ||
| 7984 | 8008 | ||
| 7985 | is_mddev_idle(mddev, 1); /* this initializes IO event counters */ | 8009 | is_mddev_idle(mddev, 1); /* this initializes IO event counters */ |
| 7986 | 8010 | ||
| @@ -7997,16 +8021,15 @@ void md_do_sync(struct md_thread *thread) | |||
| 7997 | * Tune reconstruction: | 8021 | * Tune reconstruction: |
| 7998 | */ | 8022 | */ |
| 7999 | window = 32*(PAGE_SIZE/512); | 8023 | window = 32*(PAGE_SIZE/512); |
| 8000 | printk(KERN_INFO "md: using %dk window, over a total of %lluk.\n", | 8024 | pr_debug("md: using %dk window, over a total of %lluk.\n", |
| 8001 | window/2, (unsigned long long)max_sectors/2); | 8025 | window/2, (unsigned long long)max_sectors/2); |
| 8002 | 8026 | ||
| 8003 | atomic_set(&mddev->recovery_active, 0); | 8027 | atomic_set(&mddev->recovery_active, 0); |
| 8004 | last_check = 0; | 8028 | last_check = 0; |
| 8005 | 8029 | ||
| 8006 | if (j>2) { | 8030 | if (j>2) { |
| 8007 | printk(KERN_INFO | 8031 | pr_debug("md: resuming %s of %s from checkpoint.\n", |
| 8008 | "md: resuming %s of %s from checkpoint.\n", | 8032 | desc, mdname(mddev)); |
| 8009 | desc, mdname(mddev)); | ||
| 8010 | mddev->curr_resync = j; | 8033 | mddev->curr_resync = j; |
| 8011 | } else | 8034 | } else |
| 8012 | mddev->curr_resync = 3; /* no longer delayed */ | 8035 | mddev->curr_resync = 3; /* no longer delayed */ |
| @@ -8038,7 +8061,7 @@ void md_do_sync(struct md_thread *thread) | |||
| 8038 | j > mddev->recovery_cp) | 8061 | j > mddev->recovery_cp) |
| 8039 | mddev->recovery_cp = j; | 8062 | mddev->recovery_cp = j; |
| 8040 | update_time = jiffies; | 8063 | update_time = jiffies; |
| 8041 | set_bit(MD_CHANGE_CLEAN, &mddev->flags); | 8064 | set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); |
| 8042 | sysfs_notify(&mddev->kobj, NULL, "sync_completed"); | 8065 | sysfs_notify(&mddev->kobj, NULL, "sync_completed"); |
| 8043 | } | 8066 | } |
| 8044 | 8067 | ||
| @@ -8133,9 +8156,9 @@ void md_do_sync(struct md_thread *thread) | |||
| 8133 | } | 8156 | } |
| 8134 | } | 8157 | } |
| 8135 | } | 8158 | } |
| 8136 | printk(KERN_INFO "md: %s: %s %s.\n",mdname(mddev), desc, | 8159 | pr_info("md: %s: %s %s.\n",mdname(mddev), desc, |
| 8137 | test_bit(MD_RECOVERY_INTR, &mddev->recovery) | 8160 | test_bit(MD_RECOVERY_INTR, &mddev->recovery) |
| 8138 | ? "interrupted" : "done"); | 8161 | ? "interrupted" : "done"); |
| 8139 | /* | 8162 | /* |
| 8140 | * this also signals 'finished resyncing' to md_stop | 8163 | * this also signals 'finished resyncing' to md_stop |
| 8141 | */ | 8164 | */ |
| @@ -8155,9 +8178,8 @@ void md_do_sync(struct md_thread *thread) | |||
| 8155 | if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { | 8178 | if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { |
| 8156 | if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { | 8179 | if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { |
| 8157 | if (mddev->curr_resync >= mddev->recovery_cp) { | 8180 | if (mddev->curr_resync >= mddev->recovery_cp) { |
| 8158 | printk(KERN_INFO | 8181 | pr_debug("md: checkpointing %s of %s.\n", |
| 8159 | "md: checkpointing %s of %s.\n", | 8182 | desc, mdname(mddev)); |
| 8160 | desc, mdname(mddev)); | ||
| 8161 | if (test_bit(MD_RECOVERY_ERROR, | 8183 | if (test_bit(MD_RECOVERY_ERROR, |
| 8162 | &mddev->recovery)) | 8184 | &mddev->recovery)) |
| 8163 | mddev->recovery_cp = | 8185 | mddev->recovery_cp = |
| @@ -8187,8 +8209,8 @@ void md_do_sync(struct md_thread *thread) | |||
| 8187 | /* set CHANGE_PENDING here since maybe another update is needed, | 8209 | /* set CHANGE_PENDING here since maybe another update is needed, |
| 8188 | * so other nodes are informed. It should be harmless for normal | 8210 | * so other nodes are informed. It should be harmless for normal |
| 8189 | * raid */ | 8211 | * raid */ |
| 8190 | set_mask_bits(&mddev->flags, 0, | 8212 | set_mask_bits(&mddev->sb_flags, 0, |
| 8191 | BIT(MD_CHANGE_PENDING) | BIT(MD_CHANGE_DEVS)); | 8213 | BIT(MD_SB_CHANGE_PENDING) | BIT(MD_SB_CHANGE_DEVS)); |
| 8192 | 8214 | ||
| 8193 | spin_lock(&mddev->lock); | 8215 | spin_lock(&mddev->lock); |
| 8194 | if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { | 8216 | if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { |
| @@ -8288,12 +8310,12 @@ static int remove_and_add_spares(struct mddev *mddev, | |||
| 8288 | if (!test_bit(Journal, &rdev->flags)) | 8310 | if (!test_bit(Journal, &rdev->flags)) |
| 8289 | spares++; | 8311 | spares++; |
| 8290 | md_new_event(mddev); | 8312 | md_new_event(mddev); |
| 8291 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | 8313 | set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); |
| 8292 | } | 8314 | } |
| 8293 | } | 8315 | } |
| 8294 | no_add: | 8316 | no_add: |
| 8295 | if (removed) | 8317 | if (removed) |
| 8296 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | 8318 | set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); |
| 8297 | return spares; | 8319 | return spares; |
| 8298 | } | 8320 | } |
| 8299 | 8321 | ||
| @@ -8305,8 +8327,8 @@ static void md_start_sync(struct work_struct *ws) | |||
| 8305 | mddev, | 8327 | mddev, |
| 8306 | "resync"); | 8328 | "resync"); |
| 8307 | if (!mddev->sync_thread) { | 8329 | if (!mddev->sync_thread) { |
| 8308 | printk(KERN_ERR "%s: could not start resync thread...\n", | 8330 | pr_warn("%s: could not start resync thread...\n", |
| 8309 | mdname(mddev)); | 8331 | mdname(mddev)); |
| 8310 | /* leave the spares where they are, it shouldn't hurt */ | 8332 | /* leave the spares where they are, it shouldn't hurt */ |
| 8311 | clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); | 8333 | clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); |
| 8312 | clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); | 8334 | clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); |
| @@ -8356,8 +8378,8 @@ void md_check_recovery(struct mddev *mddev) | |||
| 8356 | 8378 | ||
| 8357 | if (signal_pending(current)) { | 8379 | if (signal_pending(current)) { |
| 8358 | if (mddev->pers->sync_request && !mddev->external) { | 8380 | if (mddev->pers->sync_request && !mddev->external) { |
| 8359 | printk(KERN_INFO "md: %s in immediate safe mode\n", | 8381 | pr_debug("md: %s in immediate safe mode\n", |
| 8360 | mdname(mddev)); | 8382 | mdname(mddev)); |
| 8361 | mddev->safemode = 2; | 8383 | mddev->safemode = 2; |
| 8362 | } | 8384 | } |
| 8363 | flush_signals(current); | 8385 | flush_signals(current); |
| @@ -8366,7 +8388,7 @@ void md_check_recovery(struct mddev *mddev) | |||
| 8366 | if (mddev->ro && !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) | 8388 | if (mddev->ro && !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) |
| 8367 | return; | 8389 | return; |
| 8368 | if ( ! ( | 8390 | if ( ! ( |
| 8369 | (mddev->flags & MD_UPDATE_SB_FLAGS & ~ (1<<MD_CHANGE_PENDING)) || | 8391 | (mddev->sb_flags & ~ (1<<MD_SB_CHANGE_PENDING)) || |
| 8370 | test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || | 8392 | test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || |
| 8371 | test_bit(MD_RECOVERY_DONE, &mddev->recovery) || | 8393 | test_bit(MD_RECOVERY_DONE, &mddev->recovery) || |
| 8372 | test_bit(MD_RELOAD_SB, &mddev->flags) || | 8394 | test_bit(MD_RELOAD_SB, &mddev->flags) || |
| @@ -8404,7 +8426,7 @@ void md_check_recovery(struct mddev *mddev) | |||
| 8404 | md_reap_sync_thread(mddev); | 8426 | md_reap_sync_thread(mddev); |
| 8405 | clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); | 8427 | clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); |
| 8406 | clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | 8428 | clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); |
| 8407 | clear_bit(MD_CHANGE_PENDING, &mddev->flags); | 8429 | clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); |
| 8408 | goto unlock; | 8430 | goto unlock; |
| 8409 | } | 8431 | } |
| 8410 | 8432 | ||
| @@ -8432,7 +8454,7 @@ void md_check_recovery(struct mddev *mddev) | |||
| 8432 | mddev->recovery_cp == MaxSector) { | 8454 | mddev->recovery_cp == MaxSector) { |
| 8433 | mddev->in_sync = 1; | 8455 | mddev->in_sync = 1; |
| 8434 | did_change = 1; | 8456 | did_change = 1; |
| 8435 | set_bit(MD_CHANGE_CLEAN, &mddev->flags); | 8457 | set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); |
| 8436 | } | 8458 | } |
| 8437 | if (mddev->safemode == 1) | 8459 | if (mddev->safemode == 1) |
| 8438 | mddev->safemode = 0; | 8460 | mddev->safemode = 0; |
| @@ -8441,7 +8463,7 @@ void md_check_recovery(struct mddev *mddev) | |||
| 8441 | sysfs_notify_dirent_safe(mddev->sysfs_state); | 8463 | sysfs_notify_dirent_safe(mddev->sysfs_state); |
| 8442 | } | 8464 | } |
| 8443 | 8465 | ||
| 8444 | if (mddev->flags & MD_UPDATE_SB_FLAGS) | 8466 | if (mddev->sb_flags) |
| 8445 | md_update_sb(mddev, 0); | 8467 | md_update_sb(mddev, 0); |
| 8446 | 8468 | ||
| 8447 | if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && | 8469 | if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && |
| @@ -8537,7 +8559,7 @@ void md_reap_sync_thread(struct mddev *mddev) | |||
| 8537 | if (mddev->pers->spare_active(mddev)) { | 8559 | if (mddev->pers->spare_active(mddev)) { |
| 8538 | sysfs_notify(&mddev->kobj, NULL, | 8560 | sysfs_notify(&mddev->kobj, NULL, |
| 8539 | "degraded"); | 8561 | "degraded"); |
| 8540 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | 8562 | set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); |
| 8541 | } | 8563 | } |
| 8542 | } | 8564 | } |
| 8543 | if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && | 8565 | if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && |
| @@ -8552,7 +8574,7 @@ void md_reap_sync_thread(struct mddev *mddev) | |||
| 8552 | rdev->saved_raid_disk = -1; | 8574 | rdev->saved_raid_disk = -1; |
| 8553 | 8575 | ||
| 8554 | md_update_sb(mddev, 1); | 8576 | md_update_sb(mddev, 1); |
| 8555 | /* MD_CHANGE_PENDING should be cleared by md_update_sb, so we can | 8577 | /* MD_SB_CHANGE_PENDING should be cleared by md_update_sb, so we can |
| 8556 | * call resync_finish here if MD_CLUSTER_RESYNC_LOCKED is set by | 8578 | * call resync_finish here if MD_CLUSTER_RESYNC_LOCKED is set by |
| 8557 | * clustered raid */ | 8579 | * clustered raid */ |
| 8558 | if (test_and_clear_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags)) | 8580 | if (test_and_clear_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags)) |
| @@ -8614,9 +8636,12 @@ int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, | |||
| 8614 | rv = badblocks_set(&rdev->badblocks, s, sectors, 0); | 8636 | rv = badblocks_set(&rdev->badblocks, s, sectors, 0); |
| 8615 | if (rv == 0) { | 8637 | if (rv == 0) { |
| 8616 | /* Make sure they get written out promptly */ | 8638 | /* Make sure they get written out promptly */ |
| 8639 | if (test_bit(ExternalBbl, &rdev->flags)) | ||
| 8640 | sysfs_notify(&rdev->kobj, NULL, | ||
| 8641 | "unacknowledged_bad_blocks"); | ||
| 8617 | sysfs_notify_dirent_safe(rdev->sysfs_state); | 8642 | sysfs_notify_dirent_safe(rdev->sysfs_state); |
| 8618 | set_mask_bits(&mddev->flags, 0, | 8643 | set_mask_bits(&mddev->sb_flags, 0, |
| 8619 | BIT(MD_CHANGE_CLEAN) | BIT(MD_CHANGE_PENDING)); | 8644 | BIT(MD_SB_CHANGE_CLEAN) | BIT(MD_SB_CHANGE_PENDING)); |
| 8620 | md_wakeup_thread(rdev->mddev->thread); | 8645 | md_wakeup_thread(rdev->mddev->thread); |
| 8621 | return 1; | 8646 | return 1; |
| 8622 | } else | 8647 | } else |
| @@ -8627,12 +8652,15 @@ EXPORT_SYMBOL_GPL(rdev_set_badblocks); | |||
| 8627 | int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, | 8652 | int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, |
| 8628 | int is_new) | 8653 | int is_new) |
| 8629 | { | 8654 | { |
| 8655 | int rv; | ||
| 8630 | if (is_new) | 8656 | if (is_new) |
| 8631 | s += rdev->new_data_offset; | 8657 | s += rdev->new_data_offset; |
| 8632 | else | 8658 | else |
| 8633 | s += rdev->data_offset; | 8659 | s += rdev->data_offset; |
| 8634 | return badblocks_clear(&rdev->badblocks, | 8660 | rv = badblocks_clear(&rdev->badblocks, s, sectors); |
| 8635 | s, sectors); | 8661 | if ((rv == 0) && test_bit(ExternalBbl, &rdev->flags)) |
| 8662 | sysfs_notify(&rdev->kobj, NULL, "bad_blocks"); | ||
| 8663 | return rv; | ||
| 8636 | } | 8664 | } |
| 8637 | EXPORT_SYMBOL_GPL(rdev_clear_badblocks); | 8665 | EXPORT_SYMBOL_GPL(rdev_clear_badblocks); |
| 8638 | 8666 | ||
| @@ -8749,7 +8777,7 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev) | |||
| 8749 | rdev2->saved_raid_disk = role; | 8777 | rdev2->saved_raid_disk = role; |
| 8750 | ret = remove_and_add_spares(mddev, rdev2); | 8778 | ret = remove_and_add_spares(mddev, rdev2); |
| 8751 | pr_info("Activated spare: %s\n", | 8779 | pr_info("Activated spare: %s\n", |
| 8752 | bdevname(rdev2->bdev,b)); | 8780 | bdevname(rdev2->bdev,b)); |
| 8753 | /* wakeup mddev->thread here, so array could | 8781 | /* wakeup mddev->thread here, so array could |
| 8754 | * perform resync with the new activated disk */ | 8782 | * perform resync with the new activated disk */ |
| 8755 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | 8783 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); |
| @@ -8785,15 +8813,18 @@ static int read_rdev(struct mddev *mddev, struct md_rdev *rdev) | |||
| 8785 | * variable in case we err in the future | 8813 | * variable in case we err in the future |
| 8786 | */ | 8814 | */ |
| 8787 | rdev->sb_page = NULL; | 8815 | rdev->sb_page = NULL; |
| 8788 | alloc_disk_sb(rdev); | 8816 | err = alloc_disk_sb(rdev); |
| 8789 | ClearPageUptodate(rdev->sb_page); | 8817 | if (err == 0) { |
| 8790 | rdev->sb_loaded = 0; | 8818 | ClearPageUptodate(rdev->sb_page); |
| 8791 | err = super_types[mddev->major_version].load_super(rdev, NULL, mddev->minor_version); | 8819 | rdev->sb_loaded = 0; |
| 8792 | 8820 | err = super_types[mddev->major_version]. | |
| 8821 | load_super(rdev, NULL, mddev->minor_version); | ||
| 8822 | } | ||
| 8793 | if (err < 0) { | 8823 | if (err < 0) { |
| 8794 | pr_warn("%s: %d Could not reload rdev(%d) err: %d. Restoring old values\n", | 8824 | pr_warn("%s: %d Could not reload rdev(%d) err: %d. Restoring old values\n", |
| 8795 | __func__, __LINE__, rdev->desc_nr, err); | 8825 | __func__, __LINE__, rdev->desc_nr, err); |
| 8796 | put_page(rdev->sb_page); | 8826 | if (rdev->sb_page) |
| 8827 | put_page(rdev->sb_page); | ||
| 8797 | rdev->sb_page = swapout; | 8828 | rdev->sb_page = swapout; |
| 8798 | rdev->sb_loaded = 1; | 8829 | rdev->sb_loaded = 1; |
| 8799 | return err; | 8830 | return err; |
| @@ -8871,9 +8902,6 @@ void md_autodetect_dev(dev_t dev) | |||
| 8871 | mutex_lock(&detected_devices_mutex); | 8902 | mutex_lock(&detected_devices_mutex); |
| 8872 | list_add_tail(&node_detected_dev->list, &all_detected_devices); | 8903 | list_add_tail(&node_detected_dev->list, &all_detected_devices); |
| 8873 | mutex_unlock(&detected_devices_mutex); | 8904 | mutex_unlock(&detected_devices_mutex); |
| 8874 | } else { | ||
| 8875 | printk(KERN_CRIT "md: md_autodetect_dev: kzalloc failed" | ||
| 8876 | ", skipping dev(%d,%d)\n", MAJOR(dev), MINOR(dev)); | ||
| 8877 | } | 8905 | } |
| 8878 | } | 8906 | } |
| 8879 | 8907 | ||
| @@ -8887,7 +8915,7 @@ static void autostart_arrays(int part) | |||
| 8887 | i_scanned = 0; | 8915 | i_scanned = 0; |
| 8888 | i_passed = 0; | 8916 | i_passed = 0; |
| 8889 | 8917 | ||
| 8890 | printk(KERN_INFO "md: Autodetecting RAID arrays.\n"); | 8918 | pr_info("md: Autodetecting RAID arrays.\n"); |
| 8891 | 8919 | ||
| 8892 | mutex_lock(&detected_devices_mutex); | 8920 | mutex_lock(&detected_devices_mutex); |
| 8893 | while (!list_empty(&all_detected_devices) && i_scanned < INT_MAX) { | 8921 | while (!list_empty(&all_detected_devices) && i_scanned < INT_MAX) { |
| @@ -8912,8 +8940,7 @@ static void autostart_arrays(int part) | |||
| 8912 | } | 8940 | } |
| 8913 | mutex_unlock(&detected_devices_mutex); | 8941 | mutex_unlock(&detected_devices_mutex); |
| 8914 | 8942 | ||
| 8915 | printk(KERN_INFO "md: Scanned %d and added %d devices.\n", | 8943 | pr_debug("md: Scanned %d and added %d devices.\n", i_scanned, i_passed); |
| 8916 | i_scanned, i_passed); | ||
| 8917 | 8944 | ||
| 8918 | autorun_devices(part); | 8945 | autorun_devices(part); |
| 8919 | } | 8946 | } |
diff --git a/drivers/md/md.h b/drivers/md/md.h index 2b2041773e79..e38936d05df1 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h | |||
| @@ -30,6 +30,16 @@ | |||
| 30 | #define MaxSector (~(sector_t)0) | 30 | #define MaxSector (~(sector_t)0) |
| 31 | 31 | ||
| 32 | /* | 32 | /* |
| 33 | * These flags should really be called "NO_RETRY" rather than | ||
| 34 | * "FAILFAST" because they don't make any promise about time lapse, | ||
| 35 | * only about the number of retries, which will be zero. | ||
| 36 | * REQ_FAILFAST_DRIVER is not included because | ||
| 37 | * Commit: 4a27446f3e39 ("[SCSI] modify scsi to handle new fail fast flags.") | ||
| 38 | * seems to suggest that the errors it avoids retrying should usually | ||
| 39 | * be retried. | ||
| 40 | */ | ||
| 41 | #define MD_FAILFAST (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT) | ||
| 42 | /* | ||
| 33 | * MD's 'extended' device | 43 | * MD's 'extended' device |
| 34 | */ | 44 | */ |
| 35 | struct md_rdev { | 45 | struct md_rdev { |
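Editor's note: MD_FAILFAST above simply bundles the two block-layer fail-fast request flags. As a hedged illustration only, assuming the md.h definitions introduced in this patch (sketch_super_write is an invented helper, not the function this series actually changes), a metadata write might opt in like this when the target device carries FailFast and is not the last working device:

#include <linux/bio.h>
#include "md.h"         /* MD_FAILFAST, FailFast, LastDev, struct md_rdev */

static void sketch_super_write(struct md_rdev *rdev, struct bio *bio)
{
        int op_flags = REQ_SYNC | REQ_PREFLUSH | REQ_FUA;

        /* only skip retries where losing this leg cannot fail the array */
        if (test_bit(FailFast, &rdev->flags) &&
            !test_bit(LastDev, &rdev->flags))
                op_flags |= MD_FAILFAST;

        bio_set_op_attrs(bio, REQ_OP_WRITE, op_flags);
        submit_bio(bio);
}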
| @@ -168,6 +178,19 @@ enum flag_bits { | |||
| 168 | * so it is safe to remove without | 178 | * so it is safe to remove without |
| 169 | * another synchronize_rcu() call. | 179 | * another synchronize_rcu() call. |
| 170 | */ | 180 | */ |
| 181 | ExternalBbl, /* External metadata provides bad | ||
| 182 | * block management for a disk | ||
| 183 | */ | ||
| 184 | FailFast, /* Minimal retries should be attempted on | ||
| 185 | * this device, so use REQ_FAILFAST_DEV. | ||
| 186 | * Also don't try to repair failed reads. | ||
| 187 | * It is expected that no bad block log | ||
| 188 | * is present. | ||
| 189 | */ | ||
| 190 | LastDev, /* Seems to be the last working dev as | ||
| 191 | * it didn't fail, so don't use FailFast | ||
| 192 | * any more for metadata | ||
| 193 | */ | ||
| 171 | }; | 194 | }; |
| 172 | 195 | ||
| 173 | static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors, | 196 | static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors, |
| @@ -189,6 +212,31 @@ extern int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, | |||
| 189 | int is_new); | 212 | int is_new); |
| 190 | struct md_cluster_info; | 213 | struct md_cluster_info; |
| 191 | 214 | ||
| 215 | enum mddev_flags { | ||
| 216 | MD_ARRAY_FIRST_USE, /* First use of array, needs initialization */ | ||
| 217 | MD_CLOSING, /* If set, we are closing the array, do not open | ||
| 218 | * it then */ | ||
| 219 | MD_JOURNAL_CLEAN, /* A raid with journal is already clean */ | ||
| 220 | MD_HAS_JOURNAL, /* The raid array has journal feature set */ | ||
| 221 | MD_RELOAD_SB, /* Reload the superblock because another node | ||
| 222 | * updated it. | ||
| 223 | */ | ||
| 224 | MD_CLUSTER_RESYNC_LOCKED, /* cluster raid only, which means node | ||
| 225 | * already took resync lock, need to | ||
| 226 | * release the lock */ | ||
| 227 | MD_FAILFAST_SUPPORTED, /* Using MD_FAILFAST on metadata writes is | ||
| 228 | * supported as calls to md_error() will | ||
| 229 | * never cause the array to become failed. | ||
| 230 | */ | ||
| 231 | }; | ||
| 232 | |||
| 233 | enum mddev_sb_flags { | ||
| 234 | MD_SB_CHANGE_DEVS, /* Some device status has changed */ | ||
| 235 | MD_SB_CHANGE_CLEAN, /* transition to or from 'clean' */ | ||
| 236 | MD_SB_CHANGE_PENDING, /* switch from 'clean' to 'active' in progress */ | ||
| 237 | MD_SB_NEED_REWRITE, /* metadata write needs to be repeated */ | ||
| 238 | }; | ||
| 239 | |||
| 192 | struct mddev { | 240 | struct mddev { |
| 193 | void *private; | 241 | void *private; |
| 194 | struct md_personality *pers; | 242 | struct md_personality *pers; |
| @@ -196,21 +244,7 @@ struct mddev { | |||
| 196 | int md_minor; | 244 | int md_minor; |
| 197 | struct list_head disks; | 245 | struct list_head disks; |
| 198 | unsigned long flags; | 246 | unsigned long flags; |
| 199 | #define MD_CHANGE_DEVS 0 /* Some device status has changed */ | 247 | unsigned long sb_flags; |
| 200 | #define MD_CHANGE_CLEAN 1 /* transition to or from 'clean' */ | ||
| 201 | #define MD_CHANGE_PENDING 2 /* switch from 'clean' to 'active' in progress */ | ||
| 202 | #define MD_UPDATE_SB_FLAGS (1 | 2 | 4) /* If these are set, md_update_sb needed */ | ||
| 203 | #define MD_ARRAY_FIRST_USE 3 /* First use of array, needs initialization */ | ||
| 204 | #define MD_CLOSING 4 /* If set, we are closing the array, do not open | ||
| 205 | * it then */ | ||
| 206 | #define MD_JOURNAL_CLEAN 5 /* A raid with journal is already clean */ | ||
| 207 | #define MD_HAS_JOURNAL 6 /* The raid array has journal feature set */ | ||
| 208 | #define MD_RELOAD_SB 7 /* Reload the superblock because another node | ||
| 209 | * updated it. | ||
| 210 | */ | ||
| 211 | #define MD_CLUSTER_RESYNC_LOCKED 8 /* cluster raid only, which means node | ||
| 212 | * already took resync lock, need to | ||
| 213 | * release the lock */ | ||
| 214 | 248 | ||
| 215 | int suspended; | 249 | int suspended; |
| 216 | atomic_t active_io; | 250 | atomic_t active_io; |
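Editor's note: after this split, superblock-update state lives in the new mddev->sb_flags word under the MD_SB_CHANGE_* names, which is why the md.c hunks above test sb_flags directly instead of masking mddev->flags with MD_UPDATE_SB_FLAGS. A sketch of the resulting idiom, assuming only the definitions in this header (mark_sb_dirty and sb_update_needed are illustrative helpers, not functions added by the patch):

#include <linux/bitops.h>       /* BIT(), set_mask_bits() */
#include "md.h"

static void mark_sb_dirty(struct mddev *mddev)
{
        /* raise both bits in one atomic read-modify-write */
        set_mask_bits(&mddev->sb_flags, 0,
                      BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING));
        md_wakeup_thread(mddev->thread);
}

static bool sb_update_needed(struct mddev *mddev)
{
        /* any bit other than _PENDING means md_update_sb() has work to do */
        return mddev->sb_flags & ~BIT(MD_SB_CHANGE_PENDING);
}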
| @@ -304,31 +338,6 @@ struct mddev { | |||
| 304 | int parallel_resync; | 338 | int parallel_resync; |
| 305 | 339 | ||
| 306 | int ok_start_degraded; | 340 | int ok_start_degraded; |
| 307 | /* recovery/resync flags | ||
| 308 | * NEEDED: we might need to start a resync/recover | ||
| 309 | * RUNNING: a thread is running, or about to be started | ||
| 310 | * SYNC: actually doing a resync, not a recovery | ||
| 311 | * RECOVER: doing recovery, or need to try it. | ||
| 312 | * INTR: resync needs to be aborted for some reason | ||
| 313 | * DONE: thread is done and is waiting to be reaped | ||
| 314 | * REQUEST: user-space has requested a sync (used with SYNC) | ||
| 315 | * CHECK: user-space request for check-only, no repair | ||
| 316 | * RESHAPE: A reshape is happening | ||
| 317 | * ERROR: sync-action interrupted because io-error | ||
| 318 | * | ||
| 319 | * If neither SYNC or RESHAPE are set, then it is a recovery. | ||
| 320 | */ | ||
| 321 | #define MD_RECOVERY_RUNNING 0 | ||
| 322 | #define MD_RECOVERY_SYNC 1 | ||
| 323 | #define MD_RECOVERY_RECOVER 2 | ||
| 324 | #define MD_RECOVERY_INTR 3 | ||
| 325 | #define MD_RECOVERY_DONE 4 | ||
| 326 | #define MD_RECOVERY_NEEDED 5 | ||
| 327 | #define MD_RECOVERY_REQUESTED 6 | ||
| 328 | #define MD_RECOVERY_CHECK 7 | ||
| 329 | #define MD_RECOVERY_RESHAPE 8 | ||
| 330 | #define MD_RECOVERY_FROZEN 9 | ||
| 331 | #define MD_RECOVERY_ERROR 10 | ||
| 332 | 341 | ||
| 333 | unsigned long recovery; | 342 | unsigned long recovery; |
| 334 | /* If a RAID personality determines that recovery (of a particular | 343 | /* If a RAID personality determines that recovery (of a particular |
| @@ -442,6 +451,23 @@ struct mddev { | |||
| 442 | unsigned int good_device_nr; /* good device num within cluster raid */ | 451 | unsigned int good_device_nr; /* good device num within cluster raid */ |
| 443 | }; | 452 | }; |
| 444 | 453 | ||
| 454 | enum recovery_flags { | ||
| 455 | /* | ||
| 456 | * If neither SYNC or RESHAPE are set, then it is a recovery. | ||
| 457 | */ | ||
| 458 | MD_RECOVERY_RUNNING, /* a thread is running, or about to be started */ | ||
| 459 | MD_RECOVERY_SYNC, /* actually doing a resync, not a recovery */ | ||
| 460 | MD_RECOVERY_RECOVER, /* doing recovery, or need to try it. */ | ||
| 461 | MD_RECOVERY_INTR, /* resync needs to be aborted for some reason */ | ||
| 462 | MD_RECOVERY_DONE, /* thread is done and is waiting to be reaped */ | ||
| 463 | MD_RECOVERY_NEEDED, /* we might need to start a resync/recover */ | ||
| 464 | MD_RECOVERY_REQUESTED, /* user-space has requested a sync (used with SYNC) */ | ||
| 465 | MD_RECOVERY_CHECK, /* user-space request for check-only, no repair */ | ||
| 466 | MD_RECOVERY_RESHAPE, /* A reshape is happening */ | ||
| 467 | MD_RECOVERY_FROZEN, /* User request to abort, and not restart, any action */ | ||
| 468 | MD_RECOVERY_ERROR, /* sync-action interrupted because io-error */ | ||
| 469 | }; | ||
| 470 | |||
| 445 | static inline int __must_check mddev_lock(struct mddev *mddev) | 471 | static inline int __must_check mddev_lock(struct mddev *mddev) |
| 446 | { | 472 | { |
| 447 | return mutex_lock_interruptible(&mddev->reconfig_mutex); | 473 | return mutex_lock_interruptible(&mddev->reconfig_mutex); |
| @@ -623,7 +649,7 @@ extern int mddev_congested(struct mddev *mddev, int bits); | |||
| 623 | extern void md_flush_request(struct mddev *mddev, struct bio *bio); | 649 | extern void md_flush_request(struct mddev *mddev, struct bio *bio); |
| 624 | extern void md_super_write(struct mddev *mddev, struct md_rdev *rdev, | 650 | extern void md_super_write(struct mddev *mddev, struct md_rdev *rdev, |
| 625 | sector_t sector, int size, struct page *page); | 651 | sector_t sector, int size, struct page *page); |
| 626 | extern void md_super_wait(struct mddev *mddev); | 652 | extern int md_super_wait(struct mddev *mddev); |
| 627 | extern int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, | 653 | extern int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, |
| 628 | struct page *page, int op, int op_flags, | 654 | struct page *page, int op, int op_flags, |
| 629 | bool metadata_op); | 655 | bool metadata_op); |
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index 4da06d813b8f..aa8c4e5c1ee2 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c | |||
| @@ -52,7 +52,7 @@ static int multipath_map (struct mpconf *conf) | |||
| 52 | } | 52 | } |
| 53 | rcu_read_unlock(); | 53 | rcu_read_unlock(); |
| 54 | 54 | ||
| 55 | printk(KERN_ERR "multipath_map(): no more operational IO paths?\n"); | 55 | pr_crit_ratelimited("multipath_map(): no more operational IO paths?\n"); |
| 56 | return (-1); | 56 | return (-1); |
| 57 | } | 57 | } |
| 58 | 58 | ||
| @@ -97,9 +97,9 @@ static void multipath_end_request(struct bio *bio) | |||
| 97 | */ | 97 | */ |
| 98 | char b[BDEVNAME_SIZE]; | 98 | char b[BDEVNAME_SIZE]; |
| 99 | md_error (mp_bh->mddev, rdev); | 99 | md_error (mp_bh->mddev, rdev); |
| 100 | printk(KERN_ERR "multipath: %s: rescheduling sector %llu\n", | 100 | pr_info("multipath: %s: rescheduling sector %llu\n", |
| 101 | bdevname(rdev->bdev,b), | 101 | bdevname(rdev->bdev,b), |
| 102 | (unsigned long long)bio->bi_iter.bi_sector); | 102 | (unsigned long long)bio->bi_iter.bi_sector); |
| 103 | multipath_reschedule_retry(mp_bh); | 103 | multipath_reschedule_retry(mp_bh); |
| 104 | } else | 104 | } else |
| 105 | multipath_end_bh_io(mp_bh, bio->bi_error); | 105 | multipath_end_bh_io(mp_bh, bio->bi_error); |
| @@ -194,8 +194,7 @@ static void multipath_error (struct mddev *mddev, struct md_rdev *rdev) | |||
| 194 | * first check if this is a queued request for a device | 194 | * first check if this is a queued request for a device |
| 195 | * which has just failed. | 195 | * which has just failed. |
| 196 | */ | 196 | */ |
| 197 | printk(KERN_ALERT | 197 | pr_warn("multipath: only one IO path left and IO error.\n"); |
| 198 | "multipath: only one IO path left and IO error.\n"); | ||
| 199 | /* leave it active... it's all we have */ | 198 | /* leave it active... it's all we have */ |
| 200 | return; | 199 | return; |
| 201 | } | 200 | } |
| @@ -209,11 +208,9 @@ static void multipath_error (struct mddev *mddev, struct md_rdev *rdev) | |||
| 209 | spin_unlock_irqrestore(&conf->device_lock, flags); | 208 | spin_unlock_irqrestore(&conf->device_lock, flags); |
| 210 | } | 209 | } |
| 211 | set_bit(Faulty, &rdev->flags); | 210 | set_bit(Faulty, &rdev->flags); |
| 212 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | 211 | set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); |
| 213 | printk(KERN_ALERT "multipath: IO failure on %s," | 212 | pr_err("multipath: IO failure on %s, disabling IO path.\n" |
| 214 | " disabling IO path.\n" | 213 | "multipath: Operation continuing on %d IO paths.\n", |
| 215 | "multipath: Operation continuing" | ||
| 216 | " on %d IO paths.\n", | ||
| 217 | bdevname(rdev->bdev, b), | 214 | bdevname(rdev->bdev, b), |
| 218 | conf->raid_disks - mddev->degraded); | 215 | conf->raid_disks - mddev->degraded); |
| 219 | } | 216 | } |
| @@ -223,21 +220,21 @@ static void print_multipath_conf (struct mpconf *conf) | |||
| 223 | int i; | 220 | int i; |
| 224 | struct multipath_info *tmp; | 221 | struct multipath_info *tmp; |
| 225 | 222 | ||
| 226 | printk("MULTIPATH conf printout:\n"); | 223 | pr_debug("MULTIPATH conf printout:\n"); |
| 227 | if (!conf) { | 224 | if (!conf) { |
| 228 | printk("(conf==NULL)\n"); | 225 | pr_debug("(conf==NULL)\n"); |
| 229 | return; | 226 | return; |
| 230 | } | 227 | } |
| 231 | printk(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded, | 228 | pr_debug(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded, |
| 232 | conf->raid_disks); | 229 | conf->raid_disks); |
| 233 | 230 | ||
| 234 | for (i = 0; i < conf->raid_disks; i++) { | 231 | for (i = 0; i < conf->raid_disks; i++) { |
| 235 | char b[BDEVNAME_SIZE]; | 232 | char b[BDEVNAME_SIZE]; |
| 236 | tmp = conf->multipaths + i; | 233 | tmp = conf->multipaths + i; |
| 237 | if (tmp->rdev) | 234 | if (tmp->rdev) |
| 238 | printk(" disk%d, o:%d, dev:%s\n", | 235 | pr_debug(" disk%d, o:%d, dev:%s\n", |
| 239 | i,!test_bit(Faulty, &tmp->rdev->flags), | 236 | i,!test_bit(Faulty, &tmp->rdev->flags), |
| 240 | bdevname(tmp->rdev->bdev,b)); | 237 | bdevname(tmp->rdev->bdev,b)); |
| 241 | } | 238 | } |
| 242 | } | 239 | } |
| 243 | 240 | ||
| @@ -292,8 +289,7 @@ static int multipath_remove_disk(struct mddev *mddev, struct md_rdev *rdev) | |||
| 292 | if (rdev == p->rdev) { | 289 | if (rdev == p->rdev) { |
| 293 | if (test_bit(In_sync, &rdev->flags) || | 290 | if (test_bit(In_sync, &rdev->flags) || |
| 294 | atomic_read(&rdev->nr_pending)) { | 291 | atomic_read(&rdev->nr_pending)) { |
| 295 | printk(KERN_ERR "hot-remove-disk, slot %d is identified" | 292 | pr_warn("hot-remove-disk, slot %d is identified but is still operational!\n", number); |
| 296 | " but is still operational!\n", number); | ||
| 297 | err = -EBUSY; | 293 | err = -EBUSY; |
| 298 | goto abort; | 294 | goto abort; |
| 299 | } | 295 | } |
| @@ -346,16 +342,14 @@ static void multipathd(struct md_thread *thread) | |||
| 346 | bio->bi_iter.bi_sector = mp_bh->master_bio->bi_iter.bi_sector; | 342 | bio->bi_iter.bi_sector = mp_bh->master_bio->bi_iter.bi_sector; |
| 347 | 343 | ||
| 348 | if ((mp_bh->path = multipath_map (conf))<0) { | 344 | if ((mp_bh->path = multipath_map (conf))<0) { |
| 349 | printk(KERN_ALERT "multipath: %s: unrecoverable IO read" | 345 | pr_err("multipath: %s: unrecoverable IO read error for block %llu\n", |
| 350 | " error for block %llu\n", | 346 | bdevname(bio->bi_bdev,b), |
| 351 | bdevname(bio->bi_bdev,b), | 347 | (unsigned long long)bio->bi_iter.bi_sector); |
| 352 | (unsigned long long)bio->bi_iter.bi_sector); | ||
| 353 | multipath_end_bh_io(mp_bh, -EIO); | 348 | multipath_end_bh_io(mp_bh, -EIO); |
| 354 | } else { | 349 | } else { |
| 355 | printk(KERN_ERR "multipath: %s: redirecting sector %llu" | 350 | pr_err("multipath: %s: redirecting sector %llu to another IO path\n", |
| 356 | " to another IO path\n", | 351 | bdevname(bio->bi_bdev,b), |
| 357 | bdevname(bio->bi_bdev,b), | 352 | (unsigned long long)bio->bi_iter.bi_sector); |
| 358 | (unsigned long long)bio->bi_iter.bi_sector); | ||
| 359 | *bio = *(mp_bh->master_bio); | 353 | *bio = *(mp_bh->master_bio); |
| 360 | bio->bi_iter.bi_sector += | 354 | bio->bi_iter.bi_sector += |
| 361 | conf->multipaths[mp_bh->path].rdev->data_offset; | 355 | conf->multipaths[mp_bh->path].rdev->data_offset; |
| @@ -389,8 +383,8 @@ static int multipath_run (struct mddev *mddev) | |||
| 389 | return -EINVAL; | 383 | return -EINVAL; |
| 390 | 384 | ||
| 391 | if (mddev->level != LEVEL_MULTIPATH) { | 385 | if (mddev->level != LEVEL_MULTIPATH) { |
| 392 | printk("multipath: %s: raid level not set to multipath IO (%d)\n", | 386 | pr_warn("multipath: %s: raid level not set to multipath IO (%d)\n", |
| 393 | mdname(mddev), mddev->level); | 387 | mdname(mddev), mddev->level); |
| 394 | goto out; | 388 | goto out; |
| 395 | } | 389 | } |
| 396 | /* | 390 | /* |
| @@ -401,21 +395,13 @@ static int multipath_run (struct mddev *mddev) | |||
| 401 | 395 | ||
| 402 | conf = kzalloc(sizeof(struct mpconf), GFP_KERNEL); | 396 | conf = kzalloc(sizeof(struct mpconf), GFP_KERNEL); |
| 403 | mddev->private = conf; | 397 | mddev->private = conf; |
| 404 | if (!conf) { | 398 | if (!conf) |
| 405 | printk(KERN_ERR | ||
| 406 | "multipath: couldn't allocate memory for %s\n", | ||
| 407 | mdname(mddev)); | ||
| 408 | goto out; | 399 | goto out; |
| 409 | } | ||
| 410 | 400 | ||
| 411 | conf->multipaths = kzalloc(sizeof(struct multipath_info)*mddev->raid_disks, | 401 | conf->multipaths = kzalloc(sizeof(struct multipath_info)*mddev->raid_disks, |
| 412 | GFP_KERNEL); | 402 | GFP_KERNEL); |
| 413 | if (!conf->multipaths) { | 403 | if (!conf->multipaths) |
| 414 | printk(KERN_ERR | ||
| 415 | "multipath: couldn't allocate memory for %s\n", | ||
| 416 | mdname(mddev)); | ||
| 417 | goto out_free_conf; | 404 | goto out_free_conf; |
| 418 | } | ||
| 419 | 405 | ||
| 420 | working_disks = 0; | 406 | working_disks = 0; |
| 421 | rdev_for_each(rdev, mddev) { | 407 | rdev_for_each(rdev, mddev) { |
| @@ -439,7 +425,7 @@ static int multipath_run (struct mddev *mddev) | |||
| 439 | INIT_LIST_HEAD(&conf->retry_list); | 425 | INIT_LIST_HEAD(&conf->retry_list); |
| 440 | 426 | ||
| 441 | if (!working_disks) { | 427 | if (!working_disks) { |
| 442 | printk(KERN_ERR "multipath: no operational IO paths for %s\n", | 428 | pr_warn("multipath: no operational IO paths for %s\n", |
| 443 | mdname(mddev)); | 429 | mdname(mddev)); |
| 444 | goto out_free_conf; | 430 | goto out_free_conf; |
| 445 | } | 431 | } |
| @@ -447,27 +433,17 @@ static int multipath_run (struct mddev *mddev) | |||
| 447 | 433 | ||
| 448 | conf->pool = mempool_create_kmalloc_pool(NR_RESERVED_BUFS, | 434 | conf->pool = mempool_create_kmalloc_pool(NR_RESERVED_BUFS, |
| 449 | sizeof(struct multipath_bh)); | 435 | sizeof(struct multipath_bh)); |
| 450 | if (conf->pool == NULL) { | 436 | if (conf->pool == NULL) |
| 451 | printk(KERN_ERR | ||
| 452 | "multipath: couldn't allocate memory for %s\n", | ||
| 453 | mdname(mddev)); | ||
| 454 | goto out_free_conf; | 437 | goto out_free_conf; |
| 455 | } | ||
| 456 | 438 | ||
| 457 | { | 439 | mddev->thread = md_register_thread(multipathd, mddev, |
| 458 | mddev->thread = md_register_thread(multipathd, mddev, | 440 | "multipath"); |
| 459 | "multipath"); | 441 | if (!mddev->thread) |
| 460 | if (!mddev->thread) { | 442 | goto out_free_conf; |
| 461 | printk(KERN_ERR "multipath: couldn't allocate thread" | ||
| 462 | " for %s\n", mdname(mddev)); | ||
| 463 | goto out_free_conf; | ||
| 464 | } | ||
| 465 | } | ||
| 466 | 443 | ||
| 467 | printk(KERN_INFO | 444 | pr_info("multipath: array %s active with %d out of %d IO paths\n", |
| 468 | "multipath: array %s active with %d out of %d IO paths\n", | ||
| 469 | mdname(mddev), conf->raid_disks - mddev->degraded, | 445 | mdname(mddev), conf->raid_disks - mddev->degraded, |
| 470 | mddev->raid_disks); | 446 | mddev->raid_disks); |
| 471 | /* | 447 | /* |
| 472 | * Ok, everything is just fine now | 448 | * Ok, everything is just fine now |
| 473 | */ | 449 | */ |
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 258986a2699d..a162fedeb51a 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c | |||
| @@ -21,6 +21,7 @@ | |||
| 21 | #include <linux/seq_file.h> | 21 | #include <linux/seq_file.h> |
| 22 | #include <linux/module.h> | 22 | #include <linux/module.h> |
| 23 | #include <linux/slab.h> | 23 | #include <linux/slab.h> |
| 24 | #include <trace/events/block.h> | ||
| 24 | #include "md.h" | 25 | #include "md.h" |
| 25 | #include "raid0.h" | 26 | #include "raid0.h" |
| 26 | #include "raid5.h" | 27 | #include "raid5.h" |
| @@ -51,20 +52,21 @@ static void dump_zones(struct mddev *mddev) | |||
| 51 | char b[BDEVNAME_SIZE]; | 52 | char b[BDEVNAME_SIZE]; |
| 52 | struct r0conf *conf = mddev->private; | 53 | struct r0conf *conf = mddev->private; |
| 53 | int raid_disks = conf->strip_zone[0].nb_dev; | 54 | int raid_disks = conf->strip_zone[0].nb_dev; |
| 54 | printk(KERN_INFO "md: RAID0 configuration for %s - %d zone%s\n", | 55 | pr_debug("md: RAID0 configuration for %s - %d zone%s\n", |
| 55 | mdname(mddev), | 56 | mdname(mddev), |
| 56 | conf->nr_strip_zones, conf->nr_strip_zones==1?"":"s"); | 57 | conf->nr_strip_zones, conf->nr_strip_zones==1?"":"s"); |
| 57 | for (j = 0; j < conf->nr_strip_zones; j++) { | 58 | for (j = 0; j < conf->nr_strip_zones; j++) { |
| 58 | printk(KERN_INFO "md: zone%d=[", j); | 59 | char line[200]; |
| 60 | int len = 0; | ||
| 61 | |||
| 59 | for (k = 0; k < conf->strip_zone[j].nb_dev; k++) | 62 | for (k = 0; k < conf->strip_zone[j].nb_dev; k++) |
| 60 | printk(KERN_CONT "%s%s", k?"/":"", | 63 | len += snprintf(line+len, 200-len, "%s%s", k?"/":"", |
| 61 | bdevname(conf->devlist[j*raid_disks | 64 | bdevname(conf->devlist[j*raid_disks |
| 62 | + k]->bdev, b)); | 65 | + k]->bdev, b)); |
| 63 | printk(KERN_CONT "]\n"); | 66 | pr_debug("md: zone%d=[%s]\n", j, line); |
| 64 | 67 | ||
| 65 | zone_size = conf->strip_zone[j].zone_end - zone_start; | 68 | zone_size = conf->strip_zone[j].zone_end - zone_start; |
| 66 | printk(KERN_INFO " zone-offset=%10lluKB, " | 69 | pr_debug(" zone-offset=%10lluKB, device-offset=%10lluKB, size=%10lluKB\n", |
| 67 | "device-offset=%10lluKB, size=%10lluKB\n", | ||
| 68 | (unsigned long long)zone_start>>1, | 70 | (unsigned long long)zone_start>>1, |
| 69 | (unsigned long long)conf->strip_zone[j].dev_start>>1, | 71 | (unsigned long long)conf->strip_zone[j].dev_start>>1, |
| 70 | (unsigned long long)zone_size>>1); | 72 | (unsigned long long)zone_size>>1); |
| @@ -142,9 +144,9 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) | |||
| 142 | * chunk size is a multiple of that sector size | 144 | * chunk size is a multiple of that sector size |
| 143 | */ | 145 | */ |
| 144 | if ((mddev->chunk_sectors << 9) % blksize) { | 146 | if ((mddev->chunk_sectors << 9) % blksize) { |
| 145 | printk(KERN_ERR "md/raid0:%s: chunk_size of %d not multiple of block size %d\n", | 147 | pr_warn("md/raid0:%s: chunk_size of %d not multiple of block size %d\n", |
| 146 | mdname(mddev), | 148 | mdname(mddev), |
| 147 | mddev->chunk_sectors << 9, blksize); | 149 | mddev->chunk_sectors << 9, blksize); |
| 148 | err = -EINVAL; | 150 | err = -EINVAL; |
| 149 | goto abort; | 151 | goto abort; |
| 150 | } | 152 | } |
| @@ -186,19 +188,18 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) | |||
| 186 | } | 188 | } |
| 187 | 189 | ||
| 188 | if (j < 0) { | 190 | if (j < 0) { |
| 189 | printk(KERN_ERR | 191 | pr_warn("md/raid0:%s: remove inactive devices before converting to RAID0\n", |
| 190 | "md/raid0:%s: remove inactive devices before converting to RAID0\n", | 192 | mdname(mddev)); |
| 191 | mdname(mddev)); | ||
| 192 | goto abort; | 193 | goto abort; |
| 193 | } | 194 | } |
| 194 | if (j >= mddev->raid_disks) { | 195 | if (j >= mddev->raid_disks) { |
| 195 | printk(KERN_ERR "md/raid0:%s: bad disk number %d - " | 196 | pr_warn("md/raid0:%s: bad disk number %d - aborting!\n", |
| 196 | "aborting!\n", mdname(mddev), j); | 197 | mdname(mddev), j); |
| 197 | goto abort; | 198 | goto abort; |
| 198 | } | 199 | } |
| 199 | if (dev[j]) { | 200 | if (dev[j]) { |
| 200 | printk(KERN_ERR "md/raid0:%s: multiple devices for %d - " | 201 | pr_warn("md/raid0:%s: multiple devices for %d - aborting!\n", |
| 201 | "aborting!\n", mdname(mddev), j); | 202 | mdname(mddev), j); |
| 202 | goto abort; | 203 | goto abort; |
| 203 | } | 204 | } |
| 204 | dev[j] = rdev1; | 205 | dev[j] = rdev1; |
| @@ -208,8 +209,8 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) | |||
| 208 | cnt++; | 209 | cnt++; |
| 209 | } | 210 | } |
| 210 | if (cnt != mddev->raid_disks) { | 211 | if (cnt != mddev->raid_disks) { |
| 211 | printk(KERN_ERR "md/raid0:%s: too few disks (%d of %d) - " | 212 | pr_warn("md/raid0:%s: too few disks (%d of %d) - aborting!\n", |
| 212 | "aborting!\n", mdname(mddev), cnt, mddev->raid_disks); | 213 | mdname(mddev), cnt, mddev->raid_disks); |
| 213 | goto abort; | 214 | goto abort; |
| 214 | } | 215 | } |
| 215 | zone->nb_dev = cnt; | 216 | zone->nb_dev = cnt; |
| @@ -357,8 +358,7 @@ static int raid0_run(struct mddev *mddev) | |||
| 357 | int ret; | 358 | int ret; |
| 358 | 359 | ||
| 359 | if (mddev->chunk_sectors == 0) { | 360 | if (mddev->chunk_sectors == 0) { |
| 360 | printk(KERN_ERR "md/raid0:%s: chunk size must be set.\n", | 361 | pr_warn("md/raid0:%s: chunk size must be set.\n", mdname(mddev)); |
| 361 | mdname(mddev)); | ||
| 362 | return -EINVAL; | 362 | return -EINVAL; |
| 363 | } | 363 | } |
| 364 | if (md_check_no_bitmap(mddev)) | 364 | if (md_check_no_bitmap(mddev)) |
| @@ -399,9 +399,9 @@ static int raid0_run(struct mddev *mddev) | |||
| 399 | /* calculate array device size */ | 399 | /* calculate array device size */ |
| 400 | md_set_array_sectors(mddev, raid0_size(mddev, 0, 0)); | 400 | md_set_array_sectors(mddev, raid0_size(mddev, 0, 0)); |
| 401 | 401 | ||
| 402 | printk(KERN_INFO "md/raid0:%s: md_size is %llu sectors.\n", | 402 | pr_debug("md/raid0:%s: md_size is %llu sectors.\n", |
| 403 | mdname(mddev), | 403 | mdname(mddev), |
| 404 | (unsigned long long)mddev->array_sectors); | 404 | (unsigned long long)mddev->array_sectors); |
| 405 | 405 | ||
| 406 | if (mddev->queue) { | 406 | if (mddev->queue) { |
| 407 | /* calculate the max read-ahead size. | 407 | /* calculate the max read-ahead size. |
| @@ -464,7 +464,8 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio) | |||
| 464 | } | 464 | } |
| 465 | 465 | ||
| 466 | do { | 466 | do { |
| 467 | sector_t sector = bio->bi_iter.bi_sector; | 467 | sector_t bio_sector = bio->bi_iter.bi_sector; |
| 468 | sector_t sector = bio_sector; | ||
| 468 | unsigned chunk_sects = mddev->chunk_sectors; | 469 | unsigned chunk_sects = mddev->chunk_sectors; |
| 469 | 470 | ||
| 470 | unsigned sectors = chunk_sects - | 471 | unsigned sectors = chunk_sects - |
| @@ -473,7 +474,7 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio) | |||
| 473 | : sector_div(sector, chunk_sects)); | 474 | : sector_div(sector, chunk_sects)); |
| 474 | 475 | ||
| 475 | /* Restore due to sector_div */ | 476 | /* Restore due to sector_div */ |
| 476 | sector = bio->bi_iter.bi_sector; | 477 | sector = bio_sector; |
| 477 | 478 | ||
| 478 | if (sectors < bio_sectors(bio)) { | 479 | if (sectors < bio_sectors(bio)) { |
| 479 | split = bio_split(bio, sectors, GFP_NOIO, fs_bio_set); | 480 | split = bio_split(bio, sectors, GFP_NOIO, fs_bio_set); |
| @@ -492,8 +493,13 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio) | |||
| 492 | !blk_queue_discard(bdev_get_queue(split->bi_bdev)))) { | 493 | !blk_queue_discard(bdev_get_queue(split->bi_bdev)))) { |
| 493 | /* Just ignore it */ | 494 | /* Just ignore it */ |
| 494 | bio_endio(split); | 495 | bio_endio(split); |
| 495 | } else | 496 | } else { |
| 497 | if (mddev->gendisk) | ||
| 498 | trace_block_bio_remap(bdev_get_queue(split->bi_bdev), | ||
| 499 | split, disk_devt(mddev->gendisk), | ||
| 500 | bio_sector); | ||
| 496 | generic_make_request(split); | 501 | generic_make_request(split); |
| 502 | } | ||
| 497 | } while (split != bio); | 503 | } while (split != bio); |
| 498 | } | 504 | } |
| 499 | 505 | ||
| @@ -509,17 +515,17 @@ static void *raid0_takeover_raid45(struct mddev *mddev) | |||
| 509 | struct r0conf *priv_conf; | 515 | struct r0conf *priv_conf; |
| 510 | 516 | ||
| 511 | if (mddev->degraded != 1) { | 517 | if (mddev->degraded != 1) { |
| 512 | printk(KERN_ERR "md/raid0:%s: raid5 must be degraded! Degraded disks: %d\n", | 518 | pr_warn("md/raid0:%s: raid5 must be degraded! Degraded disks: %d\n", |
| 513 | mdname(mddev), | 519 | mdname(mddev), |
| 514 | mddev->degraded); | 520 | mddev->degraded); |
| 515 | return ERR_PTR(-EINVAL); | 521 | return ERR_PTR(-EINVAL); |
| 516 | } | 522 | } |
| 517 | 523 | ||
| 518 | rdev_for_each(rdev, mddev) { | 524 | rdev_for_each(rdev, mddev) { |
| 519 | /* check slot number for a disk */ | 525 | /* check slot number for a disk */ |
| 520 | if (rdev->raid_disk == mddev->raid_disks-1) { | 526 | if (rdev->raid_disk == mddev->raid_disks-1) { |
| 521 | printk(KERN_ERR "md/raid0:%s: raid5 must have missing parity disk!\n", | 527 | pr_warn("md/raid0:%s: raid5 must have missing parity disk!\n", |
| 522 | mdname(mddev)); | 528 | mdname(mddev)); |
| 523 | return ERR_PTR(-EINVAL); | 529 | return ERR_PTR(-EINVAL); |
| 524 | } | 530 | } |
| 525 | rdev->sectors = mddev->dev_sectors; | 531 | rdev->sectors = mddev->dev_sectors; |
| @@ -533,8 +539,11 @@ static void *raid0_takeover_raid45(struct mddev *mddev) | |||
| 533 | mddev->delta_disks = -1; | 539 | mddev->delta_disks = -1; |
| 534 | /* make sure it will be not marked as dirty */ | 540 | /* make sure it will be not marked as dirty */ |
| 535 | mddev->recovery_cp = MaxSector; | 541 | mddev->recovery_cp = MaxSector; |
| 542 | clear_bit(MD_HAS_JOURNAL, &mddev->flags); | ||
| 543 | clear_bit(MD_JOURNAL_CLEAN, &mddev->flags); | ||
| 536 | 544 | ||
| 537 | create_strip_zones(mddev, &priv_conf); | 545 | create_strip_zones(mddev, &priv_conf); |
| 546 | |||
| 538 | return priv_conf; | 547 | return priv_conf; |
| 539 | } | 548 | } |
| 540 | 549 | ||
| @@ -549,19 +558,19 @@ static void *raid0_takeover_raid10(struct mddev *mddev) | |||
| 549 | * - all mirrors must be already degraded | 558 | * - all mirrors must be already degraded |
| 550 | */ | 559 | */ |
| 551 | if (mddev->layout != ((1 << 8) + 2)) { | 560 | if (mddev->layout != ((1 << 8) + 2)) { |
| 552 | printk(KERN_ERR "md/raid0:%s:: Raid0 cannot takeover layout: 0x%x\n", | 561 | pr_warn("md/raid0:%s:: Raid0 cannot takeover layout: 0x%x\n", |
| 553 | mdname(mddev), | 562 | mdname(mddev), |
| 554 | mddev->layout); | 563 | mddev->layout); |
| 555 | return ERR_PTR(-EINVAL); | 564 | return ERR_PTR(-EINVAL); |
| 556 | } | 565 | } |
| 557 | if (mddev->raid_disks & 1) { | 566 | if (mddev->raid_disks & 1) { |
| 558 | printk(KERN_ERR "md/raid0:%s: Raid0 cannot takeover Raid10 with odd disk number.\n", | 567 | pr_warn("md/raid0:%s: Raid0 cannot takeover Raid10 with odd disk number.\n", |
| 559 | mdname(mddev)); | 568 | mdname(mddev)); |
| 560 | return ERR_PTR(-EINVAL); | 569 | return ERR_PTR(-EINVAL); |
| 561 | } | 570 | } |
| 562 | if (mddev->degraded != (mddev->raid_disks>>1)) { | 571 | if (mddev->degraded != (mddev->raid_disks>>1)) { |
| 563 | printk(KERN_ERR "md/raid0:%s: All mirrors must be already degraded!\n", | 572 | pr_warn("md/raid0:%s: All mirrors must be already degraded!\n", |
| 564 | mdname(mddev)); | 573 | mdname(mddev)); |
| 565 | return ERR_PTR(-EINVAL); | 574 | return ERR_PTR(-EINVAL); |
| 566 | } | 575 | } |
| 567 | 576 | ||
| @@ -574,6 +583,7 @@ static void *raid0_takeover_raid10(struct mddev *mddev) | |||
| 574 | mddev->degraded = 0; | 583 | mddev->degraded = 0; |
| 575 | /* make sure it will be not marked as dirty */ | 584 | /* make sure it will be not marked as dirty */ |
| 576 | mddev->recovery_cp = MaxSector; | 585 | mddev->recovery_cp = MaxSector; |
| 586 | clear_bit(MD_FAILFAST_SUPPORTED, &mddev->flags); | ||
| 577 | 587 | ||
| 578 | create_strip_zones(mddev, &priv_conf); | 588 | create_strip_zones(mddev, &priv_conf); |
| 579 | return priv_conf; | 589 | return priv_conf; |
| @@ -588,7 +598,7 @@ static void *raid0_takeover_raid1(struct mddev *mddev) | |||
| 588 | * - (N - 1) mirror drives must be already faulty | 598 | * - (N - 1) mirror drives must be already faulty |
| 589 | */ | 599 | */ |
| 590 | if ((mddev->raid_disks - 1) != mddev->degraded) { | 600 | if ((mddev->raid_disks - 1) != mddev->degraded) { |
| 591 | printk(KERN_ERR "md/raid0:%s: (N - 1) mirrors drives must be already faulty!\n", | 601 | pr_err("md/raid0:%s: (N - 1) mirrors drives must be already faulty!\n", |
| 592 | mdname(mddev)); | 602 | mdname(mddev)); |
| 593 | return ERR_PTR(-EINVAL); | 603 | return ERR_PTR(-EINVAL); |
| 594 | } | 604 | } |
| @@ -616,6 +626,7 @@ static void *raid0_takeover_raid1(struct mddev *mddev) | |||
| 616 | mddev->raid_disks = 1; | 626 | mddev->raid_disks = 1; |
| 617 | /* make sure it will be not marked as dirty */ | 627 | /* make sure it will be not marked as dirty */ |
| 618 | mddev->recovery_cp = MaxSector; | 628 | mddev->recovery_cp = MaxSector; |
| 629 | clear_bit(MD_FAILFAST_SUPPORTED, &mddev->flags); | ||
| 619 | 630 | ||
| 620 | create_strip_zones(mddev, &priv_conf); | 631 | create_strip_zones(mddev, &priv_conf); |
| 621 | return priv_conf; | 632 | return priv_conf; |
| @@ -631,8 +642,8 @@ static void *raid0_takeover(struct mddev *mddev) | |||
| 631 | */ | 642 | */ |
| 632 | 643 | ||
| 633 | if (mddev->bitmap) { | 644 | if (mddev->bitmap) { |
| 634 | printk(KERN_ERR "md/raid0: %s: cannot takeover array with bitmap\n", | 645 | pr_warn("md/raid0: %s: cannot takeover array with bitmap\n", |
| 635 | mdname(mddev)); | 646 | mdname(mddev)); |
| 636 | return ERR_PTR(-EBUSY); | 647 | return ERR_PTR(-EBUSY); |
| 637 | } | 648 | } |
| 638 | if (mddev->level == 4) | 649 | if (mddev->level == 4) |
| @@ -642,8 +653,8 @@ static void *raid0_takeover(struct mddev *mddev) | |||
| 642 | if (mddev->layout == ALGORITHM_PARITY_N) | 653 | if (mddev->layout == ALGORITHM_PARITY_N) |
| 643 | return raid0_takeover_raid45(mddev); | 654 | return raid0_takeover_raid45(mddev); |
| 644 | 655 | ||
| 645 | printk(KERN_ERR "md/raid0:%s: Raid can only takeover Raid5 with layout: %d\n", | 656 | pr_warn("md/raid0:%s: Raid can only takeover Raid5 with layout: %d\n", |
| 646 | mdname(mddev), ALGORITHM_PARITY_N); | 657 | mdname(mddev), ALGORITHM_PARITY_N); |
| 647 | } | 658 | } |
| 648 | 659 | ||
| 649 | if (mddev->level == 10) | 660 | if (mddev->level == 10) |
| @@ -652,7 +663,7 @@ static void *raid0_takeover(struct mddev *mddev) | |||
| 652 | if (mddev->level == 1) | 663 | if (mddev->level == 1) |
| 653 | return raid0_takeover_raid1(mddev); | 664 | return raid0_takeover_raid1(mddev); |
| 654 | 665 | ||
| 655 | printk(KERN_ERR "Takeover from raid%i to raid0 not supported\n", | 666 | pr_warn("Takeover from raid%i to raid0 not supported\n", |
| 656 | mddev->level); | 667 | mddev->level); |
| 657 | 668 | ||
| 658 | return ERR_PTR(-EINVAL); | 669 | return ERR_PTR(-EINVAL); |
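Each takeover path above follows the same shape: verify that the source array is already effectively a stripe set (no write-intent bitmap, RAID1 with all but one mirror faulty, RAID10 with an even disk count and exactly half the disks failed), reset the geometry fields, clear MD_FAILFAST_SUPPORTED, and only then call create_strip_zones(). A minimal stand-alone sketch of the RAID1 precondition check follows; struct array_state and raid0_can_take_over_raid1 are invented names used for illustration, not the kernel interface.

#include <errno.h>
#include <stdio.h>

/* Simplified model of the fields the takeover checks look at. */
struct array_state {
        int raid_disks;   /* total mirror legs            */
        int degraded;     /* legs already marked faulty   */
        int has_bitmap;   /* write-intent bitmap present? */
};

/* Mirrors the checks done in raid0_takeover()/raid0_takeover_raid1(). */
static int raid0_can_take_over_raid1(const struct array_state *s)
{
        if (s->has_bitmap)
                return -EBUSY;    /* bitmap must be removed first     */
        if (s->degraded != s->raid_disks - 1)
                return -EINVAL;   /* exactly one working leg required */
        return 0;
}

int main(void)
{
        struct array_state ok   = { .raid_disks = 2, .degraded = 1, .has_bitmap = 0 };
        struct array_state busy = { .raid_disks = 2, .degraded = 0, .has_bitmap = 1 };

        printf("2-way mirror, 1 faulty leg: %d\n", raid0_can_take_over_raid1(&ok));
        printf("mirror with bitmap:         %d\n", raid0_can_take_over_raid1(&busy));
        return 0;
}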
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 29e2df5cd77b..a1f3fbed9100 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c | |||
| @@ -37,6 +37,7 @@ | |||
| 37 | #include <linux/module.h> | 37 | #include <linux/module.h> |
| 38 | #include <linux/seq_file.h> | 38 | #include <linux/seq_file.h> |
| 39 | #include <linux/ratelimit.h> | 39 | #include <linux/ratelimit.h> |
| 40 | #include <trace/events/block.h> | ||
| 40 | #include "md.h" | 41 | #include "md.h" |
| 41 | #include "raid1.h" | 42 | #include "raid1.h" |
| 42 | #include "bitmap.h" | 43 | #include "bitmap.h" |
| @@ -70,6 +71,9 @@ static void allow_barrier(struct r1conf *conf, sector_t start_next_window, | |||
| 70 | sector_t bi_sector); | 71 | sector_t bi_sector); |
| 71 | static void lower_barrier(struct r1conf *conf); | 72 | static void lower_barrier(struct r1conf *conf); |
| 72 | 73 | ||
| 74 | #define raid1_log(md, fmt, args...) \ | ||
| 75 | do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid1 " fmt, ##args); } while (0) | ||
| 76 | |||
| 73 | static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data) | 77 | static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data) |
| 74 | { | 78 | { |
| 75 | struct pool_info *pi = data; | 79 | struct pool_info *pi = data; |
| @@ -325,6 +329,11 @@ static void raid1_end_read_request(struct bio *bio) | |||
| 325 | 329 | ||
| 326 | if (uptodate) | 330 | if (uptodate) |
| 327 | set_bit(R1BIO_Uptodate, &r1_bio->state); | 331 | set_bit(R1BIO_Uptodate, &r1_bio->state); |
| 332 | else if (test_bit(FailFast, &rdev->flags) && | ||
| 333 | test_bit(R1BIO_FailFast, &r1_bio->state)) | ||
| 334 | /* This was a fail-fast read so we definitely | ||
| 335 | * want to retry */ | ||
| 336 | ; | ||
| 328 | else { | 337 | else { |
| 329 | /* If all other devices have failed, we want to return | 338 | /* If all other devices have failed, we want to return |
| 330 | * the error upwards rather than fail the last device. | 339 | * the error upwards rather than fail the last device. |
| @@ -347,13 +356,10 @@ static void raid1_end_read_request(struct bio *bio) | |||
| 347 | * oops, read error: | 356 | * oops, read error: |
| 348 | */ | 357 | */ |
| 349 | char b[BDEVNAME_SIZE]; | 358 | char b[BDEVNAME_SIZE]; |
| 350 | printk_ratelimited( | 359 | pr_err_ratelimited("md/raid1:%s: %s: rescheduling sector %llu\n", |
| 351 | KERN_ERR "md/raid1:%s: %s: " | 360 | mdname(conf->mddev), |
| 352 | "rescheduling sector %llu\n", | 361 | bdevname(rdev->bdev, b), |
| 353 | mdname(conf->mddev), | 362 | (unsigned long long)r1_bio->sector); |
| 354 | bdevname(rdev->bdev, | ||
| 355 | b), | ||
| 356 | (unsigned long long)r1_bio->sector); | ||
| 357 | set_bit(R1BIO_ReadError, &r1_bio->state); | 363 | set_bit(R1BIO_ReadError, &r1_bio->state); |
| 358 | reschedule_retry(r1_bio); | 364 | reschedule_retry(r1_bio); |
| 359 | /* don't drop the reference on read_disk yet */ | 365 | /* don't drop the reference on read_disk yet */ |
| @@ -416,7 +422,24 @@ static void raid1_end_write_request(struct bio *bio) | |||
| 416 | set_bit(MD_RECOVERY_NEEDED, & | 422 | set_bit(MD_RECOVERY_NEEDED, & |
| 417 | conf->mddev->recovery); | 423 | conf->mddev->recovery); |
| 418 | 424 | ||
| 419 | set_bit(R1BIO_WriteError, &r1_bio->state); | 425 | if (test_bit(FailFast, &rdev->flags) && |
| 426 | (bio->bi_opf & MD_FAILFAST) && | ||
| 427 | /* We never try FailFast to WriteMostly devices */ | ||
| 428 | !test_bit(WriteMostly, &rdev->flags)) { | ||
| 429 | md_error(r1_bio->mddev, rdev); | ||
| 430 | if (!test_bit(Faulty, &rdev->flags)) | ||
| 431 | /* This is the only remaining device, | ||
| 432 | * We need to retry the write without | ||
| 433 | * FailFast | ||
| 434 | */ | ||
| 435 | set_bit(R1BIO_WriteError, &r1_bio->state); | ||
| 436 | else { | ||
| 437 | /* Finished with this branch */ | ||
| 438 | r1_bio->bios[mirror] = NULL; | ||
| 439 | to_put = bio; | ||
| 440 | } | ||
| 441 | } else | ||
| 442 | set_bit(R1BIO_WriteError, &r1_bio->state); | ||
| 420 | } else { | 443 | } else { |
| 421 | /* | 444 | /* |
| 422 | * Set R1BIO_Uptodate in our master bio, so that we | 445 | * Set R1BIO_Uptodate in our master bio, so that we |
| @@ -534,6 +557,7 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect | |||
| 534 | best_good_sectors = 0; | 557 | best_good_sectors = 0; |
| 535 | has_nonrot_disk = 0; | 558 | has_nonrot_disk = 0; |
| 536 | choose_next_idle = 0; | 559 | choose_next_idle = 0; |
| 560 | clear_bit(R1BIO_FailFast, &r1_bio->state); | ||
| 537 | 561 | ||
| 538 | if ((conf->mddev->recovery_cp < this_sector + sectors) || | 562 | if ((conf->mddev->recovery_cp < this_sector + sectors) || |
| 539 | (mddev_is_clustered(conf->mddev) && | 563 | (mddev_is_clustered(conf->mddev) && |
| @@ -607,6 +631,10 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect | |||
| 607 | } else | 631 | } else |
| 608 | best_good_sectors = sectors; | 632 | best_good_sectors = sectors; |
| 609 | 633 | ||
| 634 | if (best_disk >= 0) | ||
| 635 | /* At least two disks to choose from so failfast is OK */ | ||
| 636 | set_bit(R1BIO_FailFast, &r1_bio->state); | ||
| 637 | |||
| 610 | nonrot = blk_queue_nonrot(bdev_get_queue(rdev->bdev)); | 638 | nonrot = blk_queue_nonrot(bdev_get_queue(rdev->bdev)); |
| 611 | has_nonrot_disk |= nonrot; | 639 | has_nonrot_disk |= nonrot; |
| 612 | pending = atomic_read(&rdev->nr_pending); | 640 | pending = atomic_read(&rdev->nr_pending); |
| @@ -645,11 +673,6 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect | |||
| 645 | } | 673 | } |
| 646 | break; | 674 | break; |
| 647 | } | 675 | } |
| 648 | /* If device is idle, use it */ | ||
| 649 | if (pending == 0) { | ||
| 650 | best_disk = disk; | ||
| 651 | break; | ||
| 652 | } | ||
| 653 | 676 | ||
| 654 | if (choose_next_idle) | 677 | if (choose_next_idle) |
| 655 | continue; | 678 | continue; |
| @@ -672,7 +695,7 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect | |||
| 672 | * mixed rotational/non-rotational disks depending on workload. | 695 | * mixed rotational/non-rotational disks depending on workload. |
| 673 | */ | 696 | */ |
| 674 | if (best_disk == -1) { | 697 | if (best_disk == -1) { |
| 675 | if (has_nonrot_disk) | 698 | if (has_nonrot_disk || min_pending == 0) |
| 676 | best_disk = best_pending_disk; | 699 | best_disk = best_pending_disk; |
| 677 | else | 700 | else |
| 678 | best_disk = best_dist_disk; | 701 | best_disk = best_dist_disk; |
| @@ -745,9 +768,14 @@ static void flush_pending_writes(struct r1conf *conf) | |||
| 745 | 768 | ||
| 746 | while (bio) { /* submit pending writes */ | 769 | while (bio) { /* submit pending writes */ |
| 747 | struct bio *next = bio->bi_next; | 770 | struct bio *next = bio->bi_next; |
| 771 | struct md_rdev *rdev = (void*)bio->bi_bdev; | ||
| 748 | bio->bi_next = NULL; | 772 | bio->bi_next = NULL; |
| 749 | if (unlikely((bio_op(bio) == REQ_OP_DISCARD) && | 773 | bio->bi_bdev = rdev->bdev; |
| 750 | !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) | 774 | if (test_bit(Faulty, &rdev->flags)) { |
| 775 | bio->bi_error = -EIO; | ||
| 776 | bio_endio(bio); | ||
| 777 | } else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) && | ||
| 778 | !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) | ||
| 751 | /* Just ignore it */ | 779 | /* Just ignore it */ |
| 752 | bio_endio(bio); | 780 | bio_endio(bio); |
| 753 | else | 781 | else |
| @@ -832,7 +860,7 @@ static bool need_to_wait_for_sync(struct r1conf *conf, struct bio *bio) | |||
| 832 | else if (conf->barrier && bio_data_dir(bio) == WRITE) { | 860 | else if (conf->barrier && bio_data_dir(bio) == WRITE) { |
| 833 | if ((conf->mddev->curr_resync_completed | 861 | if ((conf->mddev->curr_resync_completed |
| 834 | >= bio_end_sector(bio)) || | 862 | >= bio_end_sector(bio)) || |
| 835 | (conf->next_resync + NEXT_NORMALIO_DISTANCE | 863 | (conf->start_next_window + NEXT_NORMALIO_DISTANCE |
| 836 | <= bio->bi_iter.bi_sector)) | 864 | <= bio->bi_iter.bi_sector)) |
| 837 | wait = false; | 865 | wait = false; |
| 838 | else | 866 | else |
| @@ -858,6 +886,7 @@ static sector_t wait_barrier(struct r1conf *conf, struct bio *bio) | |||
| 858 | * that queue to allow conf->start_next_window | 886 | * that queue to allow conf->start_next_window |
| 859 | * to increase. | 887 | * to increase. |
| 860 | */ | 888 | */ |
| 889 | raid1_log(conf->mddev, "wait barrier"); | ||
| 861 | wait_event_lock_irq(conf->wait_barrier, | 890 | wait_event_lock_irq(conf->wait_barrier, |
| 862 | !conf->array_frozen && | 891 | !conf->array_frozen && |
| 863 | (!conf->barrier || | 892 | (!conf->barrier || |
| @@ -937,6 +966,7 @@ static void freeze_array(struct r1conf *conf, int extra) | |||
| 937 | */ | 966 | */ |
| 938 | spin_lock_irq(&conf->resync_lock); | 967 | spin_lock_irq(&conf->resync_lock); |
| 939 | conf->array_frozen = 1; | 968 | conf->array_frozen = 1; |
| 969 | raid1_log(conf->mddev, "wait freeze"); | ||
| 940 | wait_event_lock_irq_cmd(conf->wait_barrier, | 970 | wait_event_lock_irq_cmd(conf->wait_barrier, |
| 941 | conf->nr_pending == conf->nr_queued+extra, | 971 | conf->nr_pending == conf->nr_queued+extra, |
| 942 | conf->resync_lock, | 972 | conf->resync_lock, |
| @@ -1019,9 +1049,14 @@ static void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule) | |||
| 1019 | 1049 | ||
| 1020 | while (bio) { /* submit pending writes */ | 1050 | while (bio) { /* submit pending writes */ |
| 1021 | struct bio *next = bio->bi_next; | 1051 | struct bio *next = bio->bi_next; |
| 1052 | struct md_rdev *rdev = (void*)bio->bi_bdev; | ||
| 1022 | bio->bi_next = NULL; | 1053 | bio->bi_next = NULL; |
| 1023 | if (unlikely((bio_op(bio) == REQ_OP_DISCARD) && | 1054 | bio->bi_bdev = rdev->bdev; |
| 1024 | !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) | 1055 | if (test_bit(Faulty, &rdev->flags)) { |
| 1056 | bio->bi_error = -EIO; | ||
| 1057 | bio_endio(bio); | ||
| 1058 | } else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) && | ||
| 1059 | !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) | ||
| 1025 | /* Just ignore it */ | 1060 | /* Just ignore it */ |
| 1026 | bio_endio(bio); | 1061 | bio_endio(bio); |
| 1027 | else | 1062 | else |
| @@ -1136,6 +1171,7 @@ read_again: | |||
| 1136 | * take care not to over-take any writes | 1171 | * take care not to over-take any writes |
| 1137 | * that are 'behind' | 1172 | * that are 'behind' |
| 1138 | */ | 1173 | */ |
| 1174 | raid1_log(mddev, "wait behind writes"); | ||
| 1139 | wait_event(bitmap->behind_wait, | 1175 | wait_event(bitmap->behind_wait, |
| 1140 | atomic_read(&bitmap->behind_writes) == 0); | 1176 | atomic_read(&bitmap->behind_writes) == 0); |
| 1141 | } | 1177 | } |
| @@ -1153,8 +1189,16 @@ read_again: | |||
| 1153 | read_bio->bi_bdev = mirror->rdev->bdev; | 1189 | read_bio->bi_bdev = mirror->rdev->bdev; |
| 1154 | read_bio->bi_end_io = raid1_end_read_request; | 1190 | read_bio->bi_end_io = raid1_end_read_request; |
| 1155 | bio_set_op_attrs(read_bio, op, do_sync); | 1191 | bio_set_op_attrs(read_bio, op, do_sync); |
| 1192 | if (test_bit(FailFast, &mirror->rdev->flags) && | ||
| 1193 | test_bit(R1BIO_FailFast, &r1_bio->state)) | ||
| 1194 | read_bio->bi_opf |= MD_FAILFAST; | ||
| 1156 | read_bio->bi_private = r1_bio; | 1195 | read_bio->bi_private = r1_bio; |
| 1157 | 1196 | ||
| 1197 | if (mddev->gendisk) | ||
| 1198 | trace_block_bio_remap(bdev_get_queue(read_bio->bi_bdev), | ||
| 1199 | read_bio, disk_devt(mddev->gendisk), | ||
| 1200 | r1_bio->sector); | ||
| 1201 | |||
| 1158 | if (max_sectors < r1_bio->sectors) { | 1202 | if (max_sectors < r1_bio->sectors) { |
| 1159 | /* could not read all from this device, so we will | 1203 | /* could not read all from this device, so we will |
| 1160 | * need another r1_bio. | 1204 | * need another r1_bio. |
| @@ -1195,6 +1239,7 @@ read_again: | |||
| 1195 | */ | 1239 | */ |
| 1196 | if (conf->pending_count >= max_queued_requests) { | 1240 | if (conf->pending_count >= max_queued_requests) { |
| 1197 | md_wakeup_thread(mddev->thread); | 1241 | md_wakeup_thread(mddev->thread); |
| 1242 | raid1_log(mddev, "wait queued"); | ||
| 1198 | wait_event(conf->wait_barrier, | 1243 | wait_event(conf->wait_barrier, |
| 1199 | conf->pending_count < max_queued_requests); | 1244 | conf->pending_count < max_queued_requests); |
| 1200 | } | 1245 | } |
| @@ -1286,6 +1331,7 @@ read_again: | |||
| 1286 | rdev_dec_pending(conf->mirrors[j].rdev, mddev); | 1331 | rdev_dec_pending(conf->mirrors[j].rdev, mddev); |
| 1287 | r1_bio->state = 0; | 1332 | r1_bio->state = 0; |
| 1288 | allow_barrier(conf, start_next_window, bio->bi_iter.bi_sector); | 1333 | allow_barrier(conf, start_next_window, bio->bi_iter.bi_sector); |
| 1334 | raid1_log(mddev, "wait rdev %d blocked", blocked_rdev->raid_disk); | ||
| 1289 | md_wait_for_blocked_rdev(blocked_rdev, mddev); | 1335 | md_wait_for_blocked_rdev(blocked_rdev, mddev); |
| 1290 | start_next_window = wait_barrier(conf, bio); | 1336 | start_next_window = wait_barrier(conf, bio); |
| 1291 | /* | 1337 | /* |
| @@ -1363,10 +1409,21 @@ read_again: | |||
| 1363 | mbio->bi_bdev = conf->mirrors[i].rdev->bdev; | 1409 | mbio->bi_bdev = conf->mirrors[i].rdev->bdev; |
| 1364 | mbio->bi_end_io = raid1_end_write_request; | 1410 | mbio->bi_end_io = raid1_end_write_request; |
| 1365 | bio_set_op_attrs(mbio, op, do_flush_fua | do_sync); | 1411 | bio_set_op_attrs(mbio, op, do_flush_fua | do_sync); |
| 1412 | if (test_bit(FailFast, &conf->mirrors[i].rdev->flags) && | ||
| 1413 | !test_bit(WriteMostly, &conf->mirrors[i].rdev->flags) && | ||
| 1414 | conf->raid_disks - mddev->degraded > 1) | ||
| 1415 | mbio->bi_opf |= MD_FAILFAST; | ||
| 1366 | mbio->bi_private = r1_bio; | 1416 | mbio->bi_private = r1_bio; |
| 1367 | 1417 | ||
| 1368 | atomic_inc(&r1_bio->remaining); | 1418 | atomic_inc(&r1_bio->remaining); |
| 1369 | 1419 | ||
| 1420 | if (mddev->gendisk) | ||
| 1421 | trace_block_bio_remap(bdev_get_queue(mbio->bi_bdev), | ||
| 1422 | mbio, disk_devt(mddev->gendisk), | ||
| 1423 | r1_bio->sector); | ||
| 1424 | /* flush_pending_writes() needs access to the rdev so...*/ | ||
| 1425 | mbio->bi_bdev = (void*)conf->mirrors[i].rdev; | ||
| 1426 | |||
| 1370 | cb = blk_check_plugged(raid1_unplug, mddev, sizeof(*plug)); | 1427 | cb = blk_check_plugged(raid1_unplug, mddev, sizeof(*plug)); |
| 1371 | if (cb) | 1428 | if (cb) |
| 1372 | plug = container_of(cb, struct raid1_plug_cb, cb); | 1429 | plug = container_of(cb, struct raid1_plug_cb, cb); |
| @@ -1436,6 +1493,7 @@ static void raid1_error(struct mddev *mddev, struct md_rdev *rdev) | |||
| 1436 | * next level up know. | 1493 | * next level up know. |
| 1437 | * else mark the drive as failed | 1494 | * else mark the drive as failed |
| 1438 | */ | 1495 | */ |
| 1496 | spin_lock_irqsave(&conf->device_lock, flags); | ||
| 1439 | if (test_bit(In_sync, &rdev->flags) | 1497 | if (test_bit(In_sync, &rdev->flags) |
| 1440 | && (conf->raid_disks - mddev->degraded) == 1) { | 1498 | && (conf->raid_disks - mddev->degraded) == 1) { |
| 1441 | /* | 1499 | /* |
| @@ -1445,10 +1503,10 @@ static void raid1_error(struct mddev *mddev, struct md_rdev *rdev) | |||
| 1445 | * it is very likely to fail. | 1503 | * it is very likely to fail. |
| 1446 | */ | 1504 | */ |
| 1447 | conf->recovery_disabled = mddev->recovery_disabled; | 1505 | conf->recovery_disabled = mddev->recovery_disabled; |
| 1506 | spin_unlock_irqrestore(&conf->device_lock, flags); | ||
| 1448 | return; | 1507 | return; |
| 1449 | } | 1508 | } |
| 1450 | set_bit(Blocked, &rdev->flags); | 1509 | set_bit(Blocked, &rdev->flags); |
| 1451 | spin_lock_irqsave(&conf->device_lock, flags); | ||
| 1452 | if (test_and_clear_bit(In_sync, &rdev->flags)) { | 1510 | if (test_and_clear_bit(In_sync, &rdev->flags)) { |
| 1453 | mddev->degraded++; | 1511 | mddev->degraded++; |
| 1454 | set_bit(Faulty, &rdev->flags); | 1512 | set_bit(Faulty, &rdev->flags); |
| @@ -1459,36 +1517,35 @@ static void raid1_error(struct mddev *mddev, struct md_rdev *rdev) | |||
| 1459 | * if recovery is running, make sure it aborts. | 1517 | * if recovery is running, make sure it aborts. |
| 1460 | */ | 1518 | */ |
| 1461 | set_bit(MD_RECOVERY_INTR, &mddev->recovery); | 1519 | set_bit(MD_RECOVERY_INTR, &mddev->recovery); |
| 1462 | set_mask_bits(&mddev->flags, 0, | 1520 | set_mask_bits(&mddev->sb_flags, 0, |
| 1463 | BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING)); | 1521 | BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING)); |
| 1464 | printk(KERN_ALERT | 1522 | pr_crit("md/raid1:%s: Disk failure on %s, disabling device.\n" |
| 1465 | "md/raid1:%s: Disk failure on %s, disabling device.\n" | 1523 | "md/raid1:%s: Operation continuing on %d devices.\n", |
| 1466 | "md/raid1:%s: Operation continuing on %d devices.\n", | 1524 | mdname(mddev), bdevname(rdev->bdev, b), |
| 1467 | mdname(mddev), bdevname(rdev->bdev, b), | 1525 | mdname(mddev), conf->raid_disks - mddev->degraded); |
| 1468 | mdname(mddev), conf->raid_disks - mddev->degraded); | ||
| 1469 | } | 1526 | } |
| 1470 | 1527 | ||
| 1471 | static void print_conf(struct r1conf *conf) | 1528 | static void print_conf(struct r1conf *conf) |
| 1472 | { | 1529 | { |
| 1473 | int i; | 1530 | int i; |
| 1474 | 1531 | ||
| 1475 | printk(KERN_DEBUG "RAID1 conf printout:\n"); | 1532 | pr_debug("RAID1 conf printout:\n"); |
| 1476 | if (!conf) { | 1533 | if (!conf) { |
| 1477 | printk(KERN_DEBUG "(!conf)\n"); | 1534 | pr_debug("(!conf)\n"); |
| 1478 | return; | 1535 | return; |
| 1479 | } | 1536 | } |
| 1480 | printk(KERN_DEBUG " --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded, | 1537 | pr_debug(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded, |
| 1481 | conf->raid_disks); | 1538 | conf->raid_disks); |
| 1482 | 1539 | ||
| 1483 | rcu_read_lock(); | 1540 | rcu_read_lock(); |
| 1484 | for (i = 0; i < conf->raid_disks; i++) { | 1541 | for (i = 0; i < conf->raid_disks; i++) { |
| 1485 | char b[BDEVNAME_SIZE]; | 1542 | char b[BDEVNAME_SIZE]; |
| 1486 | struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); | 1543 | struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); |
| 1487 | if (rdev) | 1544 | if (rdev) |
| 1488 | printk(KERN_DEBUG " disk %d, wo:%d, o:%d, dev:%s\n", | 1545 | pr_debug(" disk %d, wo:%d, o:%d, dev:%s\n", |
| 1489 | i, !test_bit(In_sync, &rdev->flags), | 1546 | i, !test_bit(In_sync, &rdev->flags), |
| 1490 | !test_bit(Faulty, &rdev->flags), | 1547 | !test_bit(Faulty, &rdev->flags), |
| 1491 | bdevname(rdev->bdev,b)); | 1548 | bdevname(rdev->bdev,b)); |
| 1492 | } | 1549 | } |
| 1493 | rcu_read_unlock(); | 1550 | rcu_read_unlock(); |
| 1494 | } | 1551 | } |
| @@ -1788,12 +1845,24 @@ static int fix_sync_read_error(struct r1bio *r1_bio) | |||
| 1788 | sector_t sect = r1_bio->sector; | 1845 | sector_t sect = r1_bio->sector; |
| 1789 | int sectors = r1_bio->sectors; | 1846 | int sectors = r1_bio->sectors; |
| 1790 | int idx = 0; | 1847 | int idx = 0; |
| 1848 | struct md_rdev *rdev; | ||
| 1849 | |||
| 1850 | rdev = conf->mirrors[r1_bio->read_disk].rdev; | ||
| 1851 | if (test_bit(FailFast, &rdev->flags)) { | ||
| 1852 | /* Don't try recovering from here - just fail it | ||
| 1853 | * ... unless it is the last working device of course */ | ||
| 1854 | md_error(mddev, rdev); | ||
| 1855 | if (test_bit(Faulty, &rdev->flags)) | ||
| 1856 | /* Don't try to read from here, but make sure | ||
| 1857 | * put_buf does its thing | ||
| 1858 | */ | ||
| 1859 | bio->bi_end_io = end_sync_write; | ||
| 1860 | } | ||
| 1791 | 1861 | ||
| 1792 | while(sectors) { | 1862 | while(sectors) { |
| 1793 | int s = sectors; | 1863 | int s = sectors; |
| 1794 | int d = r1_bio->read_disk; | 1864 | int d = r1_bio->read_disk; |
| 1795 | int success = 0; | 1865 | int success = 0; |
| 1796 | struct md_rdev *rdev; | ||
| 1797 | int start; | 1866 | int start; |
| 1798 | 1867 | ||
| 1799 | if (s > (PAGE_SIZE>>9)) | 1868 | if (s > (PAGE_SIZE>>9)) |
| @@ -1825,11 +1894,10 @@ static int fix_sync_read_error(struct r1bio *r1_bio) | |||
| 1825 | * work just disable and interrupt the recovery. | 1894 | * work just disable and interrupt the recovery. |
| 1826 | * Don't fail devices as that won't really help. | 1895 | * Don't fail devices as that won't really help. |
| 1827 | */ | 1896 | */ |
| 1828 | printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O read error" | 1897 | pr_crit_ratelimited("md/raid1:%s: %s: unrecoverable I/O read error for block %llu\n", |
| 1829 | " for block %llu\n", | 1898 | mdname(mddev), |
| 1830 | mdname(mddev), | 1899 | bdevname(bio->bi_bdev, b), |
| 1831 | bdevname(bio->bi_bdev, b), | 1900 | (unsigned long long)r1_bio->sector); |
| 1832 | (unsigned long long)r1_bio->sector); | ||
| 1833 | for (d = 0; d < conf->raid_disks * 2; d++) { | 1901 | for (d = 0; d < conf->raid_disks * 2; d++) { |
| 1834 | rdev = conf->mirrors[d].rdev; | 1902 | rdev = conf->mirrors[d].rdev; |
| 1835 | if (!rdev || test_bit(Faulty, &rdev->flags)) | 1903 | if (!rdev || test_bit(Faulty, &rdev->flags)) |
| @@ -2013,6 +2081,9 @@ static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio) | |||
| 2013 | continue; | 2081 | continue; |
| 2014 | 2082 | ||
| 2015 | bio_set_op_attrs(wbio, REQ_OP_WRITE, 0); | 2083 | bio_set_op_attrs(wbio, REQ_OP_WRITE, 0); |
| 2084 | if (test_bit(FailFast, &conf->mirrors[i].rdev->flags)) | ||
| 2085 | wbio->bi_opf |= MD_FAILFAST; | ||
| 2086 | |||
| 2016 | wbio->bi_end_io = end_sync_write; | 2087 | wbio->bi_end_io = end_sync_write; |
| 2017 | atomic_inc(&r1_bio->remaining); | 2088 | atomic_inc(&r1_bio->remaining); |
| 2018 | md_sync_acct(conf->mirrors[i].rdev->bdev, bio_sectors(wbio)); | 2089 | md_sync_acct(conf->mirrors[i].rdev->bdev, bio_sectors(wbio)); |
| @@ -2122,13 +2193,11 @@ static void fix_read_error(struct r1conf *conf, int read_disk, | |||
| 2122 | if (r1_sync_page_io(rdev, sect, s, | 2193 | if (r1_sync_page_io(rdev, sect, s, |
| 2123 | conf->tmppage, READ)) { | 2194 | conf->tmppage, READ)) { |
| 2124 | atomic_add(s, &rdev->corrected_errors); | 2195 | atomic_add(s, &rdev->corrected_errors); |
| 2125 | printk(KERN_INFO | 2196 | pr_info("md/raid1:%s: read error corrected (%d sectors at %llu on %s)\n", |
| 2126 | "md/raid1:%s: read error corrected " | 2197 | mdname(mddev), s, |
| 2127 | "(%d sectors at %llu on %s)\n", | 2198 | (unsigned long long)(sect + |
| 2128 | mdname(mddev), s, | 2199 | rdev->data_offset), |
| 2129 | (unsigned long long)(sect + | 2200 | bdevname(rdev->bdev, b)); |
| 2130 | rdev->data_offset), | ||
| 2131 | bdevname(rdev->bdev, b)); | ||
| 2132 | } | 2201 | } |
| 2133 | rdev_dec_pending(rdev, mddev); | 2202 | rdev_dec_pending(rdev, mddev); |
| 2134 | } else | 2203 | } else |
| @@ -2287,6 +2356,8 @@ static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio) | |||
| 2287 | struct bio *bio; | 2356 | struct bio *bio; |
| 2288 | char b[BDEVNAME_SIZE]; | 2357 | char b[BDEVNAME_SIZE]; |
| 2289 | struct md_rdev *rdev; | 2358 | struct md_rdev *rdev; |
| 2359 | dev_t bio_dev; | ||
| 2360 | sector_t bio_sector; | ||
| 2290 | 2361 | ||
| 2291 | clear_bit(R1BIO_ReadError, &r1_bio->state); | 2362 | clear_bit(R1BIO_ReadError, &r1_bio->state); |
| 2292 | /* we got a read error. Maybe the drive is bad. Maybe just | 2363 | /* we got a read error. Maybe the drive is bad. Maybe just |
| @@ -2300,10 +2371,14 @@ static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio) | |||
| 2300 | 2371 | ||
| 2301 | bio = r1_bio->bios[r1_bio->read_disk]; | 2372 | bio = r1_bio->bios[r1_bio->read_disk]; |
| 2302 | bdevname(bio->bi_bdev, b); | 2373 | bdevname(bio->bi_bdev, b); |
| 2374 | bio_dev = bio->bi_bdev->bd_dev; | ||
| 2375 | bio_sector = conf->mirrors[r1_bio->read_disk].rdev->data_offset + r1_bio->sector; | ||
| 2303 | bio_put(bio); | 2376 | bio_put(bio); |
| 2304 | r1_bio->bios[r1_bio->read_disk] = NULL; | 2377 | r1_bio->bios[r1_bio->read_disk] = NULL; |
| 2305 | 2378 | ||
| 2306 | if (mddev->ro == 0) { | 2379 | rdev = conf->mirrors[r1_bio->read_disk].rdev; |
| 2380 | if (mddev->ro == 0 | ||
| 2381 | && !test_bit(FailFast, &rdev->flags)) { | ||
| 2307 | freeze_array(conf, 1); | 2382 | freeze_array(conf, 1); |
| 2308 | fix_read_error(conf, r1_bio->read_disk, | 2383 | fix_read_error(conf, r1_bio->read_disk, |
| 2309 | r1_bio->sector, r1_bio->sectors); | 2384 | r1_bio->sector, r1_bio->sectors); |
| @@ -2312,14 +2387,13 @@ static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio) | |||
| 2312 | r1_bio->bios[r1_bio->read_disk] = IO_BLOCKED; | 2387 | r1_bio->bios[r1_bio->read_disk] = IO_BLOCKED; |
| 2313 | } | 2388 | } |
| 2314 | 2389 | ||
| 2315 | rdev_dec_pending(conf->mirrors[r1_bio->read_disk].rdev, conf->mddev); | 2390 | rdev_dec_pending(rdev, conf->mddev); |
| 2316 | 2391 | ||
| 2317 | read_more: | 2392 | read_more: |
| 2318 | disk = read_balance(conf, r1_bio, &max_sectors); | 2393 | disk = read_balance(conf, r1_bio, &max_sectors); |
| 2319 | if (disk == -1) { | 2394 | if (disk == -1) { |
| 2320 | printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O" | 2395 | pr_crit_ratelimited("md/raid1:%s: %s: unrecoverable I/O read error for block %llu\n", |
| 2321 | " read error for block %llu\n", | 2396 | mdname(mddev), b, (unsigned long long)r1_bio->sector); |
| 2322 | mdname(mddev), b, (unsigned long long)r1_bio->sector); | ||
| 2323 | raid_end_bio_io(r1_bio); | 2397 | raid_end_bio_io(r1_bio); |
| 2324 | } else { | 2398 | } else { |
| 2325 | const unsigned long do_sync | 2399 | const unsigned long do_sync |
| @@ -2330,16 +2404,17 @@ read_more: | |||
| 2330 | max_sectors); | 2404 | max_sectors); |
| 2331 | r1_bio->bios[r1_bio->read_disk] = bio; | 2405 | r1_bio->bios[r1_bio->read_disk] = bio; |
| 2332 | rdev = conf->mirrors[disk].rdev; | 2406 | rdev = conf->mirrors[disk].rdev; |
| 2333 | printk_ratelimited(KERN_ERR | 2407 | pr_info_ratelimited("md/raid1:%s: redirecting sector %llu to other mirror: %s\n", |
| 2334 | "md/raid1:%s: redirecting sector %llu" | 2408 | mdname(mddev), |
| 2335 | " to other mirror: %s\n", | 2409 | (unsigned long long)r1_bio->sector, |
| 2336 | mdname(mddev), | 2410 | bdevname(rdev->bdev, b)); |
| 2337 | (unsigned long long)r1_bio->sector, | ||
| 2338 | bdevname(rdev->bdev, b)); | ||
| 2339 | bio->bi_iter.bi_sector = r1_bio->sector + rdev->data_offset; | 2411 | bio->bi_iter.bi_sector = r1_bio->sector + rdev->data_offset; |
| 2340 | bio->bi_bdev = rdev->bdev; | 2412 | bio->bi_bdev = rdev->bdev; |
| 2341 | bio->bi_end_io = raid1_end_read_request; | 2413 | bio->bi_end_io = raid1_end_read_request; |
| 2342 | bio_set_op_attrs(bio, REQ_OP_READ, do_sync); | 2414 | bio_set_op_attrs(bio, REQ_OP_READ, do_sync); |
| 2415 | if (test_bit(FailFast, &rdev->flags) && | ||
| 2416 | test_bit(R1BIO_FailFast, &r1_bio->state)) | ||
| 2417 | bio->bi_opf |= MD_FAILFAST; | ||
| 2343 | bio->bi_private = r1_bio; | 2418 | bio->bi_private = r1_bio; |
| 2344 | if (max_sectors < r1_bio->sectors) { | 2419 | if (max_sectors < r1_bio->sectors) { |
| 2345 | /* Drat - have to split this up more */ | 2420 | /* Drat - have to split this up more */ |
| @@ -2353,6 +2428,8 @@ read_more: | |||
| 2353 | else | 2428 | else |
| 2354 | mbio->bi_phys_segments++; | 2429 | mbio->bi_phys_segments++; |
| 2355 | spin_unlock_irq(&conf->device_lock); | 2430 | spin_unlock_irq(&conf->device_lock); |
| 2431 | trace_block_bio_remap(bdev_get_queue(bio->bi_bdev), | ||
| 2432 | bio, bio_dev, bio_sector); | ||
| 2356 | generic_make_request(bio); | 2433 | generic_make_request(bio); |
| 2357 | bio = NULL; | 2434 | bio = NULL; |
| 2358 | 2435 | ||
| @@ -2367,8 +2444,11 @@ read_more: | |||
| 2367 | sectors_handled; | 2444 | sectors_handled; |
| 2368 | 2445 | ||
| 2369 | goto read_more; | 2446 | goto read_more; |
| 2370 | } else | 2447 | } else { |
| 2448 | trace_block_bio_remap(bdev_get_queue(bio->bi_bdev), | ||
| 2449 | bio, bio_dev, bio_sector); | ||
| 2371 | generic_make_request(bio); | 2450 | generic_make_request(bio); |
| 2451 | } | ||
| 2372 | } | 2452 | } |
| 2373 | } | 2453 | } |
| 2374 | 2454 | ||
| @@ -2384,10 +2464,10 @@ static void raid1d(struct md_thread *thread) | |||
| 2384 | md_check_recovery(mddev); | 2464 | md_check_recovery(mddev); |
| 2385 | 2465 | ||
| 2386 | if (!list_empty_careful(&conf->bio_end_io_list) && | 2466 | if (!list_empty_careful(&conf->bio_end_io_list) && |
| 2387 | !test_bit(MD_CHANGE_PENDING, &mddev->flags)) { | 2467 | !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) { |
| 2388 | LIST_HEAD(tmp); | 2468 | LIST_HEAD(tmp); |
| 2389 | spin_lock_irqsave(&conf->device_lock, flags); | 2469 | spin_lock_irqsave(&conf->device_lock, flags); |
| 2390 | if (!test_bit(MD_CHANGE_PENDING, &mddev->flags)) { | 2470 | if (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) { |
| 2391 | while (!list_empty(&conf->bio_end_io_list)) { | 2471 | while (!list_empty(&conf->bio_end_io_list)) { |
| 2392 | list_move(conf->bio_end_io_list.prev, &tmp); | 2472 | list_move(conf->bio_end_io_list.prev, &tmp); |
| 2393 | conf->nr_queued--; | 2473 | conf->nr_queued--; |
| @@ -2441,7 +2521,7 @@ static void raid1d(struct md_thread *thread) | |||
| 2441 | generic_make_request(r1_bio->bios[r1_bio->read_disk]); | 2521 | generic_make_request(r1_bio->bios[r1_bio->read_disk]); |
| 2442 | 2522 | ||
| 2443 | cond_resched(); | 2523 | cond_resched(); |
| 2444 | if (mddev->flags & ~(1<<MD_CHANGE_PENDING)) | 2524 | if (mddev->sb_flags & ~(1<<MD_SB_CHANGE_PENDING)) |
| 2445 | md_check_recovery(mddev); | 2525 | md_check_recovery(mddev); |
| 2446 | } | 2526 | } |
| 2447 | blk_finish_plug(&plug); | 2527 | blk_finish_plug(&plug); |
| @@ -2623,6 +2703,8 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, | |||
| 2623 | bio->bi_iter.bi_sector = sector_nr + rdev->data_offset; | 2703 | bio->bi_iter.bi_sector = sector_nr + rdev->data_offset; |
| 2624 | bio->bi_bdev = rdev->bdev; | 2704 | bio->bi_bdev = rdev->bdev; |
| 2625 | bio->bi_private = r1_bio; | 2705 | bio->bi_private = r1_bio; |
| 2706 | if (test_bit(FailFast, &rdev->flags)) | ||
| 2707 | bio->bi_opf |= MD_FAILFAST; | ||
| 2626 | } | 2708 | } |
| 2627 | } | 2709 | } |
| 2628 | rcu_read_unlock(); | 2710 | rcu_read_unlock(); |
| @@ -2642,7 +2724,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, | |||
| 2642 | min_bad, 0 | 2724 | min_bad, 0 |
| 2643 | ) && ok; | 2725 | ) && ok; |
| 2644 | } | 2726 | } |
| 2645 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | 2727 | set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); |
| 2646 | *skipped = 1; | 2728 | *skipped = 1; |
| 2647 | put_buf(r1_bio); | 2729 | put_buf(r1_bio); |
| 2648 | 2730 | ||
| @@ -2753,6 +2835,8 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, | |||
| 2753 | if (bio->bi_end_io == end_sync_read) { | 2835 | if (bio->bi_end_io == end_sync_read) { |
| 2754 | read_targets--; | 2836 | read_targets--; |
| 2755 | md_sync_acct(bio->bi_bdev, nr_sectors); | 2837 | md_sync_acct(bio->bi_bdev, nr_sectors); |
| 2838 | if (read_targets == 1) | ||
| 2839 | bio->bi_opf &= ~MD_FAILFAST; | ||
| 2756 | generic_make_request(bio); | 2840 | generic_make_request(bio); |
| 2757 | } | 2841 | } |
| 2758 | } | 2842 | } |
| @@ -2760,6 +2844,8 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, | |||
| 2760 | atomic_set(&r1_bio->remaining, 1); | 2844 | atomic_set(&r1_bio->remaining, 1); |
| 2761 | bio = r1_bio->bios[r1_bio->read_disk]; | 2845 | bio = r1_bio->bios[r1_bio->read_disk]; |
| 2762 | md_sync_acct(bio->bi_bdev, nr_sectors); | 2846 | md_sync_acct(bio->bi_bdev, nr_sectors); |
| 2847 | if (read_targets == 1) | ||
| 2848 | bio->bi_opf &= ~MD_FAILFAST; | ||
| 2763 | generic_make_request(bio); | 2849 | generic_make_request(bio); |
| 2764 | 2850 | ||
| 2765 | } | 2851 | } |
| @@ -2875,12 +2961,8 @@ static struct r1conf *setup_conf(struct mddev *mddev) | |||
| 2875 | 2961 | ||
| 2876 | err = -ENOMEM; | 2962 | err = -ENOMEM; |
| 2877 | conf->thread = md_register_thread(raid1d, mddev, "raid1"); | 2963 | conf->thread = md_register_thread(raid1d, mddev, "raid1"); |
| 2878 | if (!conf->thread) { | 2964 | if (!conf->thread) |
| 2879 | printk(KERN_ERR | ||
| 2880 | "md/raid1:%s: couldn't allocate thread\n", | ||
| 2881 | mdname(mddev)); | ||
| 2882 | goto abort; | 2965 | goto abort; |
| 2883 | } | ||
| 2884 | 2966 | ||
| 2885 | return conf; | 2967 | return conf; |
| 2886 | 2968 | ||
| @@ -2905,13 +2987,13 @@ static int raid1_run(struct mddev *mddev) | |||
| 2905 | bool discard_supported = false; | 2987 | bool discard_supported = false; |
| 2906 | 2988 | ||
| 2907 | if (mddev->level != 1) { | 2989 | if (mddev->level != 1) { |
| 2908 | printk(KERN_ERR "md/raid1:%s: raid level not set to mirroring (%d)\n", | 2990 | pr_warn("md/raid1:%s: raid level not set to mirroring (%d)\n", |
| 2909 | mdname(mddev), mddev->level); | 2991 | mdname(mddev), mddev->level); |
| 2910 | return -EIO; | 2992 | return -EIO; |
| 2911 | } | 2993 | } |
| 2912 | if (mddev->reshape_position != MaxSector) { | 2994 | if (mddev->reshape_position != MaxSector) { |
| 2913 | printk(KERN_ERR "md/raid1:%s: reshape_position set but not supported\n", | 2995 | pr_warn("md/raid1:%s: reshape_position set but not supported\n", |
| 2914 | mdname(mddev)); | 2996 | mdname(mddev)); |
| 2915 | return -EIO; | 2997 | return -EIO; |
| 2916 | } | 2998 | } |
| 2917 | /* | 2999 | /* |
| @@ -2950,11 +3032,9 @@ static int raid1_run(struct mddev *mddev) | |||
| 2950 | mddev->recovery_cp = MaxSector; | 3032 | mddev->recovery_cp = MaxSector; |
| 2951 | 3033 | ||
| 2952 | if (mddev->recovery_cp != MaxSector) | 3034 | if (mddev->recovery_cp != MaxSector) |
| 2953 | printk(KERN_NOTICE "md/raid1:%s: not clean" | 3035 | pr_info("md/raid1:%s: not clean -- starting background reconstruction\n", |
| 2954 | " -- starting background reconstruction\n", | 3036 | mdname(mddev)); |
| 2955 | mdname(mddev)); | 3037 | pr_info("md/raid1:%s: active with %d out of %d mirrors\n", |
| 2956 | printk(KERN_INFO | ||
| 2957 | "md/raid1:%s: active with %d out of %d mirrors\n", | ||
| 2958 | mdname(mddev), mddev->raid_disks - mddev->degraded, | 3038 | mdname(mddev), mddev->raid_disks - mddev->degraded, |
| 2959 | mddev->raid_disks); | 3039 | mddev->raid_disks); |
| 2960 | 3040 | ||
| @@ -2964,6 +3044,7 @@ static int raid1_run(struct mddev *mddev) | |||
| 2964 | mddev->thread = conf->thread; | 3044 | mddev->thread = conf->thread; |
| 2965 | conf->thread = NULL; | 3045 | conf->thread = NULL; |
| 2966 | mddev->private = conf; | 3046 | mddev->private = conf; |
| 3047 | set_bit(MD_FAILFAST_SUPPORTED, &mddev->flags); | ||
| 2967 | 3048 | ||
| 2968 | md_set_array_sectors(mddev, raid1_size(mddev, 0, 0)); | 3049 | md_set_array_sectors(mddev, raid1_size(mddev, 0, 0)); |
| 2969 | 3050 | ||
| @@ -3107,9 +3188,8 @@ static int raid1_reshape(struct mddev *mddev) | |||
| 3107 | rdev->raid_disk = d2; | 3188 | rdev->raid_disk = d2; |
| 3108 | sysfs_unlink_rdev(mddev, rdev); | 3189 | sysfs_unlink_rdev(mddev, rdev); |
| 3109 | if (sysfs_link_rdev(mddev, rdev)) | 3190 | if (sysfs_link_rdev(mddev, rdev)) |
| 3110 | printk(KERN_WARNING | 3191 | pr_warn("md/raid1:%s: cannot register rd%d\n", |
| 3111 | "md/raid1:%s: cannot register rd%d\n", | 3192 | mdname(mddev), rdev->raid_disk); |
| 3112 | mdname(mddev), rdev->raid_disk); | ||
| 3113 | } | 3193 | } |
| 3114 | if (rdev) | 3194 | if (rdev) |
| 3115 | newmirrors[d2++].rdev = rdev; | 3195 | newmirrors[d2++].rdev = rdev; |
| @@ -3163,9 +3243,12 @@ static void *raid1_takeover(struct mddev *mddev) | |||
| 3163 | mddev->new_layout = 0; | 3243 | mddev->new_layout = 0; |
| 3164 | mddev->new_chunk_sectors = 0; | 3244 | mddev->new_chunk_sectors = 0; |
| 3165 | conf = setup_conf(mddev); | 3245 | conf = setup_conf(mddev); |
| 3166 | if (!IS_ERR(conf)) | 3246 | if (!IS_ERR(conf)) { |
| 3167 | /* Array must appear to be quiesced */ | 3247 | /* Array must appear to be quiesced */ |
| 3168 | conf->array_frozen = 1; | 3248 | conf->array_frozen = 1; |
| 3249 | clear_bit(MD_HAS_JOURNAL, &mddev->flags); | ||
| 3250 | clear_bit(MD_JOURNAL_CLEAN, &mddev->flags); | ||
| 3251 | } | ||
| 3169 | return conf; | 3252 | return conf; |
| 3170 | } | 3253 | } |
| 3171 | return ERR_PTR(-EINVAL); | 3254 | return ERR_PTR(-EINVAL); |
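The raid1 changes above attach MD_FAILFAST in two places: reads get it only when read_balance() saw at least two candidates and recorded R1BIO_FailFast, and writes only when the target device is FailFast, not WriteMostly, and more than one non-degraded disk remains; if a fail-fast write fails on what turns out to be the last working device, it is retried without the flag. A stand-alone restatement of those two conditions, with invented names (struct mirror_state, want_failfast_read, want_failfast_write) rather than the kernel types:

#include <stdbool.h>
#include <stdio.h>

/* Simplified per-device state consulted by the decision. */
struct mirror_state {
        bool failfast;     /* device advertises fast failure */
        bool write_mostly; /* reads normally avoid this leg  */
};

/* Reads: only fail fast when read_balance() found a second candidate. */
static bool want_failfast_read(const struct mirror_state *dev, int candidates)
{
        return dev->failfast && candidates >= 2;
}

/* Writes: never to write-mostly devices, and never when this is the
 * only working copy (there would be nowhere left to retry). */
static bool want_failfast_write(const struct mirror_state *dev,
                                int raid_disks, int degraded)
{
        return dev->failfast && !dev->write_mostly &&
               (raid_disks - degraded) > 1;
}

int main(void)
{
        struct mirror_state d = { .failfast = true, .write_mostly = false };

        printf("read, 2 candidates: %d\n", want_failfast_read(&d, 2));
        printf("write, last copy:   %d\n", want_failfast_write(&d, 2, 1));
        return 0;
}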
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h index 61c39b390cd8..c52ef424a24b 100644 --- a/drivers/md/raid1.h +++ b/drivers/md/raid1.h | |||
| @@ -161,14 +161,15 @@ struct r1bio { | |||
| 161 | }; | 161 | }; |
| 162 | 162 | ||
| 163 | /* bits for r1bio.state */ | 163 | /* bits for r1bio.state */ |
| 164 | #define R1BIO_Uptodate 0 | 164 | enum r1bio_state { |
| 165 | #define R1BIO_IsSync 1 | 165 | R1BIO_Uptodate, |
| 166 | #define R1BIO_Degraded 2 | 166 | R1BIO_IsSync, |
| 167 | #define R1BIO_BehindIO 3 | 167 | R1BIO_Degraded, |
| 168 | R1BIO_BehindIO, | ||
| 168 | /* Set ReadError on bios that experience a readerror so that | 169 | /* Set ReadError on bios that experience a readerror so that |
| 169 | * raid1d knows what to do with them. | 170 | * raid1d knows what to do with them. |
| 170 | */ | 171 | */ |
| 171 | #define R1BIO_ReadError 4 | 172 | R1BIO_ReadError, |
| 172 | /* For write-behind requests, we call bi_end_io when | 173 | /* For write-behind requests, we call bi_end_io when |
| 173 | * the last non-write-behind device completes, providing | 174 | * the last non-write-behind device completes, providing |
| 174 | * any write was successful. Otherwise we call when | 175 | * any write was successful. Otherwise we call when |
| @@ -176,10 +177,12 @@ struct r1bio { | |||
| 176 | * with failure when last write completes (and all failed). | 177 | * with failure when last write completes (and all failed). |
| 177 | * Record that bi_end_io was called with this flag... | 178 | * Record that bi_end_io was called with this flag... |
| 178 | */ | 179 | */ |
| 179 | #define R1BIO_Returned 6 | 180 | R1BIO_Returned, |
| 180 | /* If a write for this request means we can clear some | 181 | /* If a write for this request means we can clear some |
| 181 | * known-bad-block records, we set this flag | 182 | * known-bad-block records, we set this flag |
| 182 | */ | 183 | */ |
| 183 | #define R1BIO_MadeGood 7 | 184 | R1BIO_MadeGood, |
| 184 | #define R1BIO_WriteError 8 | 185 | R1BIO_WriteError, |
| 186 | R1BIO_FailFast, | ||
| 187 | }; | ||
| 185 | #endif | 188 | #endif |
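The raid1.h hunk swaps bit-number macros for an enum; the enumerators are still plain bit indices passed to set_bit()/test_bit(). Note that the old list skipped the value 5, so R1BIO_Returned, R1BIO_MadeGood and R1BIO_WriteError each shift down by one under the contiguous enum, which is harmless as long as r1_bio->state is only ever interpreted in memory. A small userspace illustration of the same pattern, with trimmed, invented enumerator names and stand-in bit helpers (set_bit_ul/test_bit_ul, not the kernel bitops):

#include <stdbool.h>
#include <stdio.h>

/* Same shape as the new raid1.h: enumerators are bit indices. */
enum r1bio_state_model {
        MODEL_Uptodate,   /* 0 */
        MODEL_IsSync,     /* 1 */
        MODEL_ReadError,  /* 2 */
        MODEL_FailFast,   /* 3 */
};

/* Stand-ins for the kernel's set_bit()/test_bit() on a state word. */
static void set_bit_ul(int nr, unsigned long *word)  { *word |= 1UL << nr; }
static bool test_bit_ul(int nr, unsigned long *word) { return *word & (1UL << nr); }

int main(void)
{
        unsigned long state = 0;

        set_bit_ul(MODEL_FailFast, &state);
        printf("FailFast set: %d, ReadError set: %d\n",
               test_bit_ul(MODEL_FailFast, &state),
               test_bit_ul(MODEL_ReadError, &state));
        return 0;
}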
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 39fddda2fef2..ab5e86209322 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c | |||
| @@ -25,6 +25,7 @@ | |||
| 25 | #include <linux/seq_file.h> | 25 | #include <linux/seq_file.h> |
| 26 | #include <linux/ratelimit.h> | 26 | #include <linux/ratelimit.h> |
| 27 | #include <linux/kthread.h> | 27 | #include <linux/kthread.h> |
| 28 | #include <trace/events/block.h> | ||
| 28 | #include "md.h" | 29 | #include "md.h" |
| 29 | #include "raid10.h" | 30 | #include "raid10.h" |
| 30 | #include "raid0.h" | 31 | #include "raid0.h" |
| @@ -99,12 +100,16 @@ static int max_queued_requests = 1024; | |||
| 99 | static void allow_barrier(struct r10conf *conf); | 100 | static void allow_barrier(struct r10conf *conf); |
| 100 | static void lower_barrier(struct r10conf *conf); | 101 | static void lower_barrier(struct r10conf *conf); |
| 101 | static int _enough(struct r10conf *conf, int previous, int ignore); | 102 | static int _enough(struct r10conf *conf, int previous, int ignore); |
| 103 | static int enough(struct r10conf *conf, int ignore); | ||
| 102 | static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, | 104 | static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, |
| 103 | int *skipped); | 105 | int *skipped); |
| 104 | static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio); | 106 | static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio); |
| 105 | static void end_reshape_write(struct bio *bio); | 107 | static void end_reshape_write(struct bio *bio); |
| 106 | static void end_reshape(struct r10conf *conf); | 108 | static void end_reshape(struct r10conf *conf); |
| 107 | 109 | ||
| 110 | #define raid10_log(md, fmt, args...) \ | ||
| 111 | do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid10 " fmt, ##args); } while (0) | ||
| 112 | |||
| 108 | static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data) | 113 | static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data) |
| 109 | { | 114 | { |
| 110 | struct r10conf *conf = data; | 115 | struct r10conf *conf = data; |
| @@ -404,8 +409,7 @@ static void raid10_end_read_request(struct bio *bio) | |||
| 404 | * oops, read error - keep the refcount on the rdev | 409 | * oops, read error - keep the refcount on the rdev |
| 405 | */ | 410 | */ |
| 406 | char b[BDEVNAME_SIZE]; | 411 | char b[BDEVNAME_SIZE]; |
| 407 | printk_ratelimited(KERN_ERR | 412 | pr_err_ratelimited("md/raid10:%s: %s: rescheduling sector %llu\n", |
| 408 | "md/raid10:%s: %s: rescheduling sector %llu\n", | ||
| 409 | mdname(conf->mddev), | 413 | mdname(conf->mddev), |
| 410 | bdevname(rdev->bdev, b), | 414 | bdevname(rdev->bdev, b), |
| 411 | (unsigned long long)r10_bio->sector); | 415 | (unsigned long long)r10_bio->sector); |
| @@ -447,6 +451,7 @@ static void raid10_end_write_request(struct bio *bio) | |||
| 447 | struct r10conf *conf = r10_bio->mddev->private; | 451 | struct r10conf *conf = r10_bio->mddev->private; |
| 448 | int slot, repl; | 452 | int slot, repl; |
| 449 | struct md_rdev *rdev = NULL; | 453 | struct md_rdev *rdev = NULL; |
| 454 | struct bio *to_put = NULL; | ||
| 450 | bool discard_error; | 455 | bool discard_error; |
| 451 | 456 | ||
| 452 | discard_error = bio->bi_error && bio_op(bio) == REQ_OP_DISCARD; | 457 | discard_error = bio->bi_error && bio_op(bio) == REQ_OP_DISCARD; |
| @@ -474,8 +479,24 @@ static void raid10_end_write_request(struct bio *bio) | |||
| 474 | if (!test_and_set_bit(WantReplacement, &rdev->flags)) | 479 | if (!test_and_set_bit(WantReplacement, &rdev->flags)) |
| 475 | set_bit(MD_RECOVERY_NEEDED, | 480 | set_bit(MD_RECOVERY_NEEDED, |
| 476 | &rdev->mddev->recovery); | 481 | &rdev->mddev->recovery); |
| 477 | set_bit(R10BIO_WriteError, &r10_bio->state); | 482 | |
| 478 | dec_rdev = 0; | 483 | dec_rdev = 0; |
| 484 | if (test_bit(FailFast, &rdev->flags) && | ||
| 485 | (bio->bi_opf & MD_FAILFAST)) { | ||
| 486 | md_error(rdev->mddev, rdev); | ||
| 487 | if (!test_bit(Faulty, &rdev->flags)) | ||
| 488 | /* This is the only remaining device, | ||
| 489 | * We need to retry the write without | ||
| 490 | * FailFast | ||
| 491 | */ | ||
| 492 | set_bit(R10BIO_WriteError, &r10_bio->state); | ||
| 493 | else { | ||
| 494 | r10_bio->devs[slot].bio = NULL; | ||
| 495 | to_put = bio; | ||
| 496 | dec_rdev = 1; | ||
| 497 | } | ||
| 498 | } else | ||
| 499 | set_bit(R10BIO_WriteError, &r10_bio->state); | ||
| 479 | } | 500 | } |
| 480 | } else { | 501 | } else { |
| 481 | /* | 502 | /* |
| @@ -525,6 +546,8 @@ static void raid10_end_write_request(struct bio *bio) | |||
| 525 | one_write_done(r10_bio); | 546 | one_write_done(r10_bio); |
| 526 | if (dec_rdev) | 547 | if (dec_rdev) |
| 527 | rdev_dec_pending(rdev, conf->mddev); | 548 | rdev_dec_pending(rdev, conf->mddev); |
| 549 | if (to_put) | ||
| 550 | bio_put(to_put); | ||
| 528 | } | 551 | } |
| 529 | 552 | ||
| 530 | /* | 553 | /* |
| @@ -716,6 +739,7 @@ static struct md_rdev *read_balance(struct r10conf *conf, | |||
| 716 | best_dist = MaxSector; | 739 | best_dist = MaxSector; |
| 717 | best_good_sectors = 0; | 740 | best_good_sectors = 0; |
| 718 | do_balance = 1; | 741 | do_balance = 1; |
| 742 | clear_bit(R10BIO_FailFast, &r10_bio->state); | ||
| 719 | /* | 743 | /* |
| 720 | * Check if we can balance. We can balance on the whole | 744 | * Check if we can balance. We can balance on the whole |
| 721 | * device if no resync is going on (recovery is ok), or below | 745 | * device if no resync is going on (recovery is ok), or below |
| @@ -780,15 +804,18 @@ static struct md_rdev *read_balance(struct r10conf *conf, | |||
| 780 | if (!do_balance) | 804 | if (!do_balance) |
| 781 | break; | 805 | break; |
| 782 | 806 | ||
| 807 | if (best_slot >= 0) | ||
| 808 | /* At least 2 disks to choose from so failfast is OK */ | ||
| 809 | set_bit(R10BIO_FailFast, &r10_bio->state); | ||
| 783 | /* This optimisation is debatable, and completely destroys | 810 | /* This optimisation is debatable, and completely destroys |
| 784 | * sequential read speed for 'far copies' arrays. So only | 811 | * sequential read speed for 'far copies' arrays. So only |
| 785 | * keep it for 'near' arrays, and review those later. | 812 | * keep it for 'near' arrays, and review those later. |
| 786 | */ | 813 | */ |
| 787 | if (geo->near_copies > 1 && !atomic_read(&rdev->nr_pending)) | 814 | if (geo->near_copies > 1 && !atomic_read(&rdev->nr_pending)) |
| 788 | break; | 815 | new_distance = 0; |
| 789 | 816 | ||
| 790 | /* for far > 1 always use the lowest address */ | 817 | /* for far > 1 always use the lowest address */ |
| 791 | if (geo->far_copies > 1) | 818 | else if (geo->far_copies > 1) |
| 792 | new_distance = r10_bio->devs[slot].addr; | 819 | new_distance = r10_bio->devs[slot].addr; |
| 793 | else | 820 | else |
| 794 | new_distance = abs(r10_bio->devs[slot].addr - | 821 | new_distance = abs(r10_bio->devs[slot].addr - |
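The read_balance() hunk above changes the "idle disk on a near layout" case from an immediate break to new_distance = 0, so an idle near-copy now competes on distance instead of short-circuiting the scan, and R10BIO_FailFast is recorded as soon as a second usable slot has been seen. A simplified, stand-alone version of the distance rule is sketched below; struct slot_model and slot_distance are invented for illustration and omit the pending-count and bad-block handling of the real function.

#include <limits.h>
#include <stdio.h>
#include <stdlib.h>

/* Minimal slice of the per-slot data the heuristic looks at. */
struct slot_model {
        long addr;        /* device address of this copy    */
        int  pending;     /* in-flight requests on the disk */
};

/* Distance rule from the hunk above: an idle disk on a 'near' layout
 * counts as distance 0, 'far' layouts use the raw address, otherwise
 * use the distance from the current head position. */
static long slot_distance(const struct slot_model *s, int near_copies,
                          int far_copies, long head_pos)
{
        if (near_copies > 1 && s->pending == 0)
                return 0;
        if (far_copies > 1)
                return s->addr;
        return labs(s->addr - head_pos);
}

int main(void)
{
        struct slot_model slots[] = {
                { .addr = 1000, .pending = 3 },
                { .addr = 9000, .pending = 0 },  /* idle: wins on near layouts */
        };
        int best = -1;
        long best_dist = LONG_MAX;

        for (int i = 0; i < 2; i++) {
                long d = slot_distance(&slots[i], 2, 1, 1200);
                if (d < best_dist) {
                        best_dist = d;
                        best = i;
                }
        }
        printf("chosen slot: %d (distance %ld)\n", best, best_dist);
        return 0;
}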
| @@ -859,9 +886,14 @@ static void flush_pending_writes(struct r10conf *conf) | |||
| 859 | 886 | ||
| 860 | while (bio) { /* submit pending writes */ | 887 | while (bio) { /* submit pending writes */ |
| 861 | struct bio *next = bio->bi_next; | 888 | struct bio *next = bio->bi_next; |
| 889 | struct md_rdev *rdev = (void*)bio->bi_bdev; | ||
| 862 | bio->bi_next = NULL; | 890 | bio->bi_next = NULL; |
| 863 | if (unlikely((bio_op(bio) == REQ_OP_DISCARD) && | 891 | bio->bi_bdev = rdev->bdev; |
| 864 | !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) | 892 | if (test_bit(Faulty, &rdev->flags)) { |
| 893 | bio->bi_error = -EIO; | ||
| 894 | bio_endio(bio); | ||
| 895 | } else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) && | ||
| 896 | !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) | ||
| 865 | /* Just ignore it */ | 897 | /* Just ignore it */ |
| 866 | bio_endio(bio); | 898 | bio_endio(bio); |
| 867 | else | 899 | else |
| @@ -937,6 +969,7 @@ static void wait_barrier(struct r10conf *conf) | |||
| 937 | * that queue to get the nr_pending | 969 | * that queue to get the nr_pending |
| 938 | * count down. | 970 | * count down. |
| 939 | */ | 971 | */ |
| 972 | raid10_log(conf->mddev, "wait barrier"); | ||
| 940 | wait_event_lock_irq(conf->wait_barrier, | 973 | wait_event_lock_irq(conf->wait_barrier, |
| 941 | !conf->barrier || | 974 | !conf->barrier || |
| 942 | (atomic_read(&conf->nr_pending) && | 975 | (atomic_read(&conf->nr_pending) && |
| @@ -1037,9 +1070,14 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule) | |||
| 1037 | 1070 | ||
| 1038 | while (bio) { /* submit pending writes */ | 1071 | while (bio) { /* submit pending writes */ |
| 1039 | struct bio *next = bio->bi_next; | 1072 | struct bio *next = bio->bi_next; |
| 1073 | struct md_rdev *rdev = (void*)bio->bi_bdev; | ||
| 1040 | bio->bi_next = NULL; | 1074 | bio->bi_next = NULL; |
| 1041 | if (unlikely((bio_op(bio) == REQ_OP_DISCARD) && | 1075 | bio->bi_bdev = rdev->bdev; |
| 1042 | !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) | 1076 | if (test_bit(Faulty, &rdev->flags)) { |
| 1077 | bio->bi_error = -EIO; | ||
| 1078 | bio_endio(bio); | ||
| 1079 | } else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) && | ||
| 1080 | !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) | ||
| 1043 | /* Just ignore it */ | 1081 | /* Just ignore it */ |
| 1044 | bio_endio(bio); | 1082 | bio_endio(bio); |
| 1045 | else | 1083 | else |
| @@ -1083,6 +1121,7 @@ static void __make_request(struct mddev *mddev, struct bio *bio) | |||
| 1083 | /* IO spans the reshape position. Need to wait for | 1121 | /* IO spans the reshape position. Need to wait for |
| 1084 | * reshape to pass | 1122 | * reshape to pass |
| 1085 | */ | 1123 | */ |
| 1124 | raid10_log(conf->mddev, "wait reshape"); | ||
| 1086 | allow_barrier(conf); | 1125 | allow_barrier(conf); |
| 1087 | wait_event(conf->wait_barrier, | 1126 | wait_event(conf->wait_barrier, |
| 1088 | conf->reshape_progress <= bio->bi_iter.bi_sector || | 1127 | conf->reshape_progress <= bio->bi_iter.bi_sector || |
| @@ -1099,11 +1138,12 @@ static void __make_request(struct mddev *mddev, struct bio *bio) | |||
| 1099 | bio->bi_iter.bi_sector < conf->reshape_progress))) { | 1138 | bio->bi_iter.bi_sector < conf->reshape_progress))) { |
| 1100 | /* Need to update reshape_position in metadata */ | 1139 | /* Need to update reshape_position in metadata */ |
| 1101 | mddev->reshape_position = conf->reshape_progress; | 1140 | mddev->reshape_position = conf->reshape_progress; |
| 1102 | set_mask_bits(&mddev->flags, 0, | 1141 | set_mask_bits(&mddev->sb_flags, 0, |
| 1103 | BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING)); | 1142 | BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING)); |
| 1104 | md_wakeup_thread(mddev->thread); | 1143 | md_wakeup_thread(mddev->thread); |
| 1144 | raid10_log(conf->mddev, "wait reshape metadata"); | ||
| 1105 | wait_event(mddev->sb_wait, | 1145 | wait_event(mddev->sb_wait, |
| 1106 | !test_bit(MD_CHANGE_PENDING, &mddev->flags)); | 1146 | !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); |
| 1107 | 1147 | ||
| 1108 | conf->reshape_safe = mddev->reshape_position; | 1148 | conf->reshape_safe = mddev->reshape_position; |
| 1109 | } | 1149 | } |
| @@ -1154,8 +1194,15 @@ read_again: | |||
| 1154 | read_bio->bi_bdev = rdev->bdev; | 1194 | read_bio->bi_bdev = rdev->bdev; |
| 1155 | read_bio->bi_end_io = raid10_end_read_request; | 1195 | read_bio->bi_end_io = raid10_end_read_request; |
| 1156 | bio_set_op_attrs(read_bio, op, do_sync); | 1196 | bio_set_op_attrs(read_bio, op, do_sync); |
| 1197 | if (test_bit(FailFast, &rdev->flags) && | ||
| 1198 | test_bit(R10BIO_FailFast, &r10_bio->state)) | ||
| 1199 | read_bio->bi_opf |= MD_FAILFAST; | ||
| 1157 | read_bio->bi_private = r10_bio; | 1200 | read_bio->bi_private = r10_bio; |
| 1158 | 1201 | ||
| 1202 | if (mddev->gendisk) | ||
| 1203 | trace_block_bio_remap(bdev_get_queue(read_bio->bi_bdev), | ||
| 1204 | read_bio, disk_devt(mddev->gendisk), | ||
| 1205 | r10_bio->sector); | ||
| 1159 | if (max_sectors < r10_bio->sectors) { | 1206 | if (max_sectors < r10_bio->sectors) { |
| 1160 | /* Could not read all from this device, so we will | 1207 | /* Could not read all from this device, so we will |
| 1161 | * need another r10_bio. | 1208 | * need another r10_bio. |
| @@ -1195,6 +1242,7 @@ read_again: | |||
| 1195 | */ | 1242 | */ |
| 1196 | if (conf->pending_count >= max_queued_requests) { | 1243 | if (conf->pending_count >= max_queued_requests) { |
| 1197 | md_wakeup_thread(mddev->thread); | 1244 | md_wakeup_thread(mddev->thread); |
| 1245 | raid10_log(mddev, "wait queued"); | ||
| 1198 | wait_event(conf->wait_barrier, | 1246 | wait_event(conf->wait_barrier, |
| 1199 | conf->pending_count < max_queued_requests); | 1247 | conf->pending_count < max_queued_requests); |
| 1200 | } | 1248 | } |
| @@ -1322,6 +1370,7 @@ retry_write: | |||
| 1322 | } | 1370 | } |
| 1323 | } | 1371 | } |
| 1324 | allow_barrier(conf); | 1372 | allow_barrier(conf); |
| 1373 | raid10_log(conf->mddev, "wait rdev %d blocked", blocked_rdev->raid_disk); | ||
| 1325 | md_wait_for_blocked_rdev(blocked_rdev, mddev); | 1374 | md_wait_for_blocked_rdev(blocked_rdev, mddev); |
| 1326 | wait_barrier(conf); | 1375 | wait_barrier(conf); |
| 1327 | goto retry_write; | 1376 | goto retry_write; |
| @@ -1361,8 +1410,18 @@ retry_write: | |||
| 1361 | mbio->bi_bdev = rdev->bdev; | 1410 | mbio->bi_bdev = rdev->bdev; |
| 1362 | mbio->bi_end_io = raid10_end_write_request; | 1411 | mbio->bi_end_io = raid10_end_write_request; |
| 1363 | bio_set_op_attrs(mbio, op, do_sync | do_fua); | 1412 | bio_set_op_attrs(mbio, op, do_sync | do_fua); |
| 1413 | if (test_bit(FailFast, &conf->mirrors[d].rdev->flags) && | ||
| 1414 | enough(conf, d)) | ||
| 1415 | mbio->bi_opf |= MD_FAILFAST; | ||
| 1364 | mbio->bi_private = r10_bio; | 1416 | mbio->bi_private = r10_bio; |
| 1365 | 1417 | ||
| 1418 | if (conf->mddev->gendisk) | ||
| 1419 | trace_block_bio_remap(bdev_get_queue(mbio->bi_bdev), | ||
| 1420 | mbio, disk_devt(conf->mddev->gendisk), | ||
| 1421 | r10_bio->sector); | ||
| 1422 | /* flush_pending_writes() needs access to the rdev so...*/ | ||
| 1423 | mbio->bi_bdev = (void*)rdev; | ||
| 1424 | |||
| 1366 | atomic_inc(&r10_bio->remaining); | 1425 | atomic_inc(&r10_bio->remaining); |
| 1367 | 1426 | ||
| 1368 | cb = blk_check_plugged(raid10_unplug, mddev, | 1427 | cb = blk_check_plugged(raid10_unplug, mddev, |
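Because write bios are first queued on the per-array pending list and only issued later from flush_pending_writes() or the plug callback, the hunk above stores the md_rdev pointer in bio->bi_bdev while the bio sits on that list; the deferred submit path restores the real block device and can complete the bio with -EIO straight away if the device went Faulty in the meantime. The sketch below models that stash-and-restore hand-off with invented types (struct rdev_model, struct deferred_io, submit_deferred); it is not the kernel bio API.

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

/* The "real" submission target and the md-level descriptor wrapping it. */
struct blockdev_model { const char *name; };
struct rdev_model {
        struct blockdev_model *bdev;
        bool faulty;          /* device failed while the I/O was queued? */
};

/* Stand-in for a queued bio: 'bdev' is overloaded to carry the rdev
 * until submission time, as the hunks above do with bio->bi_bdev. */
struct deferred_io {
        void *bdev;           /* struct rdev_model * while queued */
        long  sector;
        int   error;
};

static void submit_deferred(struct deferred_io *io)
{
        struct rdev_model *rdev = io->bdev;

        io->bdev = rdev->bdev;        /* restore the real block device */
        if (rdev->faulty) {
                io->error = -EIO;     /* fail it without issuing       */
                printf("sector %ld: failed, %s is faulty\n",
                       io->sector, rdev->bdev->name);
        } else {
                printf("sector %ld: issued to %s\n",
                       io->sector, rdev->bdev->name);
        }
}

int main(void)
{
        struct blockdev_model sda = { "sda" }, sdb = { "sdb" };
        struct rdev_model r0 = { &sda, false }, r1 = { &sdb, true };
        struct deferred_io a = { &r0, 2048, 0 }, b = { &r1, 2048, 0 };

        submit_deferred(&a);
        submit_deferred(&b);
        return 0;
}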
| @@ -1405,6 +1464,13 @@ retry_write: | |||
| 1405 | bio_set_op_attrs(mbio, op, do_sync | do_fua); | 1464 | bio_set_op_attrs(mbio, op, do_sync | do_fua); |
| 1406 | mbio->bi_private = r10_bio; | 1465 | mbio->bi_private = r10_bio; |
| 1407 | 1466 | ||
| 1467 | if (conf->mddev->gendisk) | ||
| 1468 | trace_block_bio_remap(bdev_get_queue(mbio->bi_bdev), | ||
| 1469 | mbio, disk_devt(conf->mddev->gendisk), | ||
| 1470 | r10_bio->sector); | ||
| 1471 | /* flush_pending_writes() needs access to the rdev so...*/ | ||
| 1472 | mbio->bi_bdev = (void*)rdev; | ||
| 1473 | |||
| 1408 | atomic_inc(&r10_bio->remaining); | 1474 | atomic_inc(&r10_bio->remaining); |
| 1409 | spin_lock_irqsave(&conf->device_lock, flags); | 1475 | spin_lock_irqsave(&conf->device_lock, flags); |
| 1410 | bio_list_add(&conf->pending_bio_list, mbio); | 1476 | bio_list_add(&conf->pending_bio_list, mbio); |
| @@ -1586,14 +1652,13 @@ static void raid10_error(struct mddev *mddev, struct md_rdev *rdev) | |||
| 1586 | set_bit(MD_RECOVERY_INTR, &mddev->recovery); | 1652 | set_bit(MD_RECOVERY_INTR, &mddev->recovery); |
| 1587 | set_bit(Blocked, &rdev->flags); | 1653 | set_bit(Blocked, &rdev->flags); |
| 1588 | set_bit(Faulty, &rdev->flags); | 1654 | set_bit(Faulty, &rdev->flags); |
| 1589 | set_mask_bits(&mddev->flags, 0, | 1655 | set_mask_bits(&mddev->sb_flags, 0, |
| 1590 | BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING)); | 1656 | BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING)); |
| 1591 | spin_unlock_irqrestore(&conf->device_lock, flags); | 1657 | spin_unlock_irqrestore(&conf->device_lock, flags); |
| 1592 | printk(KERN_ALERT | 1658 | pr_crit("md/raid10:%s: Disk failure on %s, disabling device.\n" |
| 1593 | "md/raid10:%s: Disk failure on %s, disabling device.\n" | 1659 | "md/raid10:%s: Operation continuing on %d devices.\n", |
| 1594 | "md/raid10:%s: Operation continuing on %d devices.\n", | 1660 | mdname(mddev), bdevname(rdev->bdev, b), |
| 1595 | mdname(mddev), bdevname(rdev->bdev, b), | 1661 | mdname(mddev), conf->geo.raid_disks - mddev->degraded); |
| 1596 | mdname(mddev), conf->geo.raid_disks - mddev->degraded); | ||
| 1597 | } | 1662 | } |
| 1598 | 1663 | ||
| 1599 | static void print_conf(struct r10conf *conf) | 1664 | static void print_conf(struct r10conf *conf) |
| @@ -1601,13 +1666,13 @@ static void print_conf(struct r10conf *conf) | |||
| 1601 | int i; | 1666 | int i; |
| 1602 | struct md_rdev *rdev; | 1667 | struct md_rdev *rdev; |
| 1603 | 1668 | ||
| 1604 | printk(KERN_DEBUG "RAID10 conf printout:\n"); | 1669 | pr_debug("RAID10 conf printout:\n"); |
| 1605 | if (!conf) { | 1670 | if (!conf) { |
| 1606 | printk(KERN_DEBUG "(!conf)\n"); | 1671 | pr_debug("(!conf)\n"); |
| 1607 | return; | 1672 | return; |
| 1608 | } | 1673 | } |
| 1609 | printk(KERN_DEBUG " --- wd:%d rd:%d\n", conf->geo.raid_disks - conf->mddev->degraded, | 1674 | pr_debug(" --- wd:%d rd:%d\n", conf->geo.raid_disks - conf->mddev->degraded, |
| 1610 | conf->geo.raid_disks); | 1675 | conf->geo.raid_disks); |
| 1611 | 1676 | ||
| 1612 | /* This is only called with ->reconfig_mutex held, so | 1677 | /* This is only called with ->reconfig_mutex held, so |
| 1613 | * rcu protection of rdev is not needed */ | 1678 | * rcu protection of rdev is not needed */ |
| @@ -1615,10 +1680,10 @@ static void print_conf(struct r10conf *conf) | |||
| 1615 | char b[BDEVNAME_SIZE]; | 1680 | char b[BDEVNAME_SIZE]; |
| 1616 | rdev = conf->mirrors[i].rdev; | 1681 | rdev = conf->mirrors[i].rdev; |
| 1617 | if (rdev) | 1682 | if (rdev) |
| 1618 | printk(KERN_DEBUG " disk %d, wo:%d, o:%d, dev:%s\n", | 1683 | pr_debug(" disk %d, wo:%d, o:%d, dev:%s\n", |
| 1619 | i, !test_bit(In_sync, &rdev->flags), | 1684 | i, !test_bit(In_sync, &rdev->flags), |
| 1620 | !test_bit(Faulty, &rdev->flags), | 1685 | !test_bit(Faulty, &rdev->flags), |
| 1621 | bdevname(rdev->bdev,b)); | 1686 | bdevname(rdev->bdev,b)); |
| 1622 | } | 1687 | } |
| 1623 | } | 1688 | } |
| 1624 | 1689 | ||
| @@ -1953,6 +2018,7 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio) | |||
| 1953 | /* now find blocks with errors */ | 2018 | /* now find blocks with errors */ |
| 1954 | for (i=0 ; i < conf->copies ; i++) { | 2019 | for (i=0 ; i < conf->copies ; i++) { |
| 1955 | int j, d; | 2020 | int j, d; |
| 2021 | struct md_rdev *rdev; | ||
| 1956 | 2022 | ||
| 1957 | tbio = r10_bio->devs[i].bio; | 2023 | tbio = r10_bio->devs[i].bio; |
| 1958 | 2024 | ||
| @@ -1960,6 +2026,8 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio) | |||
| 1960 | continue; | 2026 | continue; |
| 1961 | if (i == first) | 2027 | if (i == first) |
| 1962 | continue; | 2028 | continue; |
| 2029 | d = r10_bio->devs[i].devnum; | ||
| 2030 | rdev = conf->mirrors[d].rdev; | ||
| 1963 | if (!r10_bio->devs[i].bio->bi_error) { | 2031 | if (!r10_bio->devs[i].bio->bi_error) { |
| 1964 | /* We know that the bi_io_vec layout is the same for | 2032 | /* We know that the bi_io_vec layout is the same for |
| 1965 | * both 'first' and 'i', so we just compare them. | 2033 | * both 'first' and 'i', so we just compare them. |
| @@ -1982,6 +2050,10 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio) | |||
| 1982 | if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) | 2050 | if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) |
| 1983 | /* Don't fix anything. */ | 2051 | /* Don't fix anything. */ |
| 1984 | continue; | 2052 | continue; |
| 2053 | } else if (test_bit(FailFast, &rdev->flags)) { | ||
| 2054 | /* Just give up on this device */ | ||
| 2055 | md_error(rdev->mddev, rdev); | ||
| 2056 | continue; | ||
| 1985 | } | 2057 | } |
| 1986 | /* Ok, we need to write this bio, either to correct an | 2058 | /* Ok, we need to write this bio, either to correct an |
| 1987 | * inconsistency or to correct an unreadable block. | 2059 | * inconsistency or to correct an unreadable block. |
| @@ -1999,11 +2071,12 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio) | |||
| 1999 | 2071 | ||
| 2000 | bio_copy_data(tbio, fbio); | 2072 | bio_copy_data(tbio, fbio); |
| 2001 | 2073 | ||
| 2002 | d = r10_bio->devs[i].devnum; | ||
| 2003 | atomic_inc(&conf->mirrors[d].rdev->nr_pending); | 2074 | atomic_inc(&conf->mirrors[d].rdev->nr_pending); |
| 2004 | atomic_inc(&r10_bio->remaining); | 2075 | atomic_inc(&r10_bio->remaining); |
| 2005 | md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(tbio)); | 2076 | md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(tbio)); |
| 2006 | 2077 | ||
| 2078 | if (test_bit(FailFast, &conf->mirrors[d].rdev->flags)) | ||
| 2079 | tbio->bi_opf |= MD_FAILFAST; | ||
| 2007 | tbio->bi_iter.bi_sector += conf->mirrors[d].rdev->data_offset; | 2080 | tbio->bi_iter.bi_sector += conf->mirrors[d].rdev->data_offset; |
| 2008 | tbio->bi_bdev = conf->mirrors[d].rdev->bdev; | 2081 | tbio->bi_bdev = conf->mirrors[d].rdev->bdev; |
| 2009 | generic_make_request(tbio); | 2082 | generic_make_request(tbio); |
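The sync_request_write() changes above make check/repair honour FailFast: a copy that read successfully is compared against the first good copy (and only rewritten outside of a pure "check" pass), while a copy whose read failed on a FailFast-marked device is handed to md_error() instead of being repaired, and rewrites to FailFast devices carry MD_FAILFAST. A small standalone model of that per-copy decision (names are local to this sketch):

/* Standalone model of the per-copy decision after this change. */
#include <stdbool.h>
#include <stdio.h>

enum copy_action { COPY_SKIP, COPY_REWRITE, COPY_FAIL_DEVICE };

static enum copy_action sync_copy_action(bool read_ok, bool data_matches,
					 bool check_only, bool failfast_dev)
{
	if (read_ok) {
		if (data_matches)
			return COPY_SKIP;	/* nothing to fix */
		if (check_only)
			return COPY_SKIP;	/* "check": count mismatches only */
		return COPY_REWRITE;		/* repair the mismatch */
	}
	if (failfast_dev)
		return COPY_FAIL_DEVICE;	/* give up on this device */
	return COPY_REWRITE;			/* rewrite the unreadable block */
}

int main(void)
{
	printf("%d %d %d\n",
	       sync_copy_action(true, false, false, false),	/* 1: rewrite */
	       sync_copy_action(false, false, false, true),	/* 2: fail device */
	       sync_copy_action(true, false, true, false));	/* 0: skip */
	return 0;
}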
| @@ -2109,10 +2182,8 @@ static void fix_recovery_read_error(struct r10bio *r10_bio) | |||
| 2109 | ok = rdev_set_badblocks(rdev2, addr, s, 0); | 2182 | ok = rdev_set_badblocks(rdev2, addr, s, 0); |
| 2110 | if (!ok) { | 2183 | if (!ok) { |
| 2111 | /* just abort the recovery */ | 2184 | /* just abort the recovery */ |
| 2112 | printk(KERN_NOTICE | 2185 | pr_notice("md/raid10:%s: recovery aborted due to read error\n", |
| 2113 | "md/raid10:%s: recovery aborted" | 2186 | mdname(mddev)); |
| 2114 | " due to read error\n", | ||
| 2115 | mdname(mddev)); | ||
| 2116 | 2187 | ||
| 2117 | conf->mirrors[dw].recovery_disabled | 2188 | conf->mirrors[dw].recovery_disabled |
| 2118 | = mddev->recovery_disabled; | 2189 | = mddev->recovery_disabled; |
| @@ -2259,14 +2330,11 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 | |||
| 2259 | char b[BDEVNAME_SIZE]; | 2330 | char b[BDEVNAME_SIZE]; |
| 2260 | bdevname(rdev->bdev, b); | 2331 | bdevname(rdev->bdev, b); |
| 2261 | 2332 | ||
| 2262 | printk(KERN_NOTICE | 2333 | pr_notice("md/raid10:%s: %s: Raid device exceeded read_error threshold [cur %d:max %d]\n", |
| 2263 | "md/raid10:%s: %s: Raid device exceeded " | 2334 | mdname(mddev), b, |
| 2264 | "read_error threshold [cur %d:max %d]\n", | 2335 | atomic_read(&rdev->read_errors), max_read_errors); |
| 2265 | mdname(mddev), b, | 2336 | pr_notice("md/raid10:%s: %s: Failing raid device\n", |
| 2266 | atomic_read(&rdev->read_errors), max_read_errors); | 2337 | mdname(mddev), b); |
| 2267 | printk(KERN_NOTICE | ||
| 2268 | "md/raid10:%s: %s: Failing raid device\n", | ||
| 2269 | mdname(mddev), b); | ||
| 2270 | md_error(mddev, rdev); | 2338 | md_error(mddev, rdev); |
| 2271 | r10_bio->devs[r10_bio->read_slot].bio = IO_BLOCKED; | 2339 | r10_bio->devs[r10_bio->read_slot].bio = IO_BLOCKED; |
| 2272 | return; | 2340 | return; |
| @@ -2356,20 +2424,16 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 | |||
| 2356 | s, conf->tmppage, WRITE) | 2424 | s, conf->tmppage, WRITE) |
| 2357 | == 0) { | 2425 | == 0) { |
| 2358 | /* Well, this device is dead */ | 2426 | /* Well, this device is dead */ |
| 2359 | printk(KERN_NOTICE | 2427 | pr_notice("md/raid10:%s: read correction write failed (%d sectors at %llu on %s)\n", |
| 2360 | "md/raid10:%s: read correction " | 2428 | mdname(mddev), s, |
| 2361 | "write failed" | 2429 | (unsigned long long)( |
| 2362 | " (%d sectors at %llu on %s)\n", | 2430 | sect + |
| 2363 | mdname(mddev), s, | 2431 | choose_data_offset(r10_bio, |
| 2364 | (unsigned long long)( | 2432 | rdev)), |
| 2365 | sect + | 2433 | bdevname(rdev->bdev, b)); |
| 2366 | choose_data_offset(r10_bio, | 2434 | pr_notice("md/raid10:%s: %s: failing drive\n", |
| 2367 | rdev)), | 2435 | mdname(mddev), |
| 2368 | bdevname(rdev->bdev, b)); | 2436 | bdevname(rdev->bdev, b)); |
| 2369 | printk(KERN_NOTICE "md/raid10:%s: %s: failing " | ||
| 2370 | "drive\n", | ||
| 2371 | mdname(mddev), | ||
| 2372 | bdevname(rdev->bdev, b)); | ||
| 2373 | } | 2437 | } |
| 2374 | rdev_dec_pending(rdev, mddev); | 2438 | rdev_dec_pending(rdev, mddev); |
| 2375 | rcu_read_lock(); | 2439 | rcu_read_lock(); |
| @@ -2397,24 +2461,18 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 | |||
| 2397 | READ)) { | 2461 | READ)) { |
| 2398 | case 0: | 2462 | case 0: |
| 2399 | /* Well, this device is dead */ | 2463 | /* Well, this device is dead */ |
| 2400 | printk(KERN_NOTICE | 2464 | pr_notice("md/raid10:%s: unable to read back corrected sectors (%d sectors at %llu on %s)\n", |
| 2401 | "md/raid10:%s: unable to read back " | ||
| 2402 | "corrected sectors" | ||
| 2403 | " (%d sectors at %llu on %s)\n", | ||
| 2404 | mdname(mddev), s, | 2465 | mdname(mddev), s, |
| 2405 | (unsigned long long)( | 2466 | (unsigned long long)( |
| 2406 | sect + | 2467 | sect + |
| 2407 | choose_data_offset(r10_bio, rdev)), | 2468 | choose_data_offset(r10_bio, rdev)), |
| 2408 | bdevname(rdev->bdev, b)); | 2469 | bdevname(rdev->bdev, b)); |
| 2409 | printk(KERN_NOTICE "md/raid10:%s: %s: failing " | 2470 | pr_notice("md/raid10:%s: %s: failing drive\n", |
| 2410 | "drive\n", | ||
| 2411 | mdname(mddev), | 2471 | mdname(mddev), |
| 2412 | bdevname(rdev->bdev, b)); | 2472 | bdevname(rdev->bdev, b)); |
| 2413 | break; | 2473 | break; |
| 2414 | case 1: | 2474 | case 1: |
| 2415 | printk(KERN_INFO | 2475 | pr_info("md/raid10:%s: read error corrected (%d sectors at %llu on %s)\n", |
| 2416 | "md/raid10:%s: read error corrected" | ||
| 2417 | " (%d sectors at %llu on %s)\n", | ||
| 2418 | mdname(mddev), s, | 2476 | mdname(mddev), s, |
| 2419 | (unsigned long long)( | 2477 | (unsigned long long)( |
| 2420 | sect + | 2478 | sect + |
| @@ -2503,6 +2561,8 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio) | |||
| 2503 | char b[BDEVNAME_SIZE]; | 2561 | char b[BDEVNAME_SIZE]; |
| 2504 | unsigned long do_sync; | 2562 | unsigned long do_sync; |
| 2505 | int max_sectors; | 2563 | int max_sectors; |
| 2564 | dev_t bio_dev; | ||
| 2565 | sector_t bio_last_sector; | ||
| 2506 | 2566 | ||
| 2507 | /* we got a read error. Maybe the drive is bad. Maybe just | 2567 | /* we got a read error. Maybe the drive is bad. Maybe just |
| 2508 | * the block and we can fix it. | 2568 | * the block and we can fix it. |
| @@ -2514,38 +2574,38 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio) | |||
| 2514 | */ | 2574 | */ |
| 2515 | bio = r10_bio->devs[slot].bio; | 2575 | bio = r10_bio->devs[slot].bio; |
| 2516 | bdevname(bio->bi_bdev, b); | 2576 | bdevname(bio->bi_bdev, b); |
| 2577 | bio_dev = bio->bi_bdev->bd_dev; | ||
| 2578 | bio_last_sector = r10_bio->devs[slot].addr + rdev->data_offset + r10_bio->sectors; | ||
| 2517 | bio_put(bio); | 2579 | bio_put(bio); |
| 2518 | r10_bio->devs[slot].bio = NULL; | 2580 | r10_bio->devs[slot].bio = NULL; |
| 2519 | 2581 | ||
| 2520 | if (mddev->ro == 0) { | 2582 | if (mddev->ro) |
| 2583 | r10_bio->devs[slot].bio = IO_BLOCKED; | ||
| 2584 | else if (!test_bit(FailFast, &rdev->flags)) { | ||
| 2521 | freeze_array(conf, 1); | 2585 | freeze_array(conf, 1); |
| 2522 | fix_read_error(conf, mddev, r10_bio); | 2586 | fix_read_error(conf, mddev, r10_bio); |
| 2523 | unfreeze_array(conf); | 2587 | unfreeze_array(conf); |
| 2524 | } else | 2588 | } else |
| 2525 | r10_bio->devs[slot].bio = IO_BLOCKED; | 2589 | md_error(mddev, rdev); |
| 2526 | 2590 | ||
| 2527 | rdev_dec_pending(rdev, mddev); | 2591 | rdev_dec_pending(rdev, mddev); |
| 2528 | 2592 | ||
| 2529 | read_more: | 2593 | read_more: |
| 2530 | rdev = read_balance(conf, r10_bio, &max_sectors); | 2594 | rdev = read_balance(conf, r10_bio, &max_sectors); |
| 2531 | if (rdev == NULL) { | 2595 | if (rdev == NULL) { |
| 2532 | printk(KERN_ALERT "md/raid10:%s: %s: unrecoverable I/O" | 2596 | pr_crit_ratelimited("md/raid10:%s: %s: unrecoverable I/O read error for block %llu\n", |
| 2533 | " read error for block %llu\n", | 2597 | mdname(mddev), b, |
| 2534 | mdname(mddev), b, | 2598 | (unsigned long long)r10_bio->sector); |
| 2535 | (unsigned long long)r10_bio->sector); | ||
| 2536 | raid_end_bio_io(r10_bio); | 2599 | raid_end_bio_io(r10_bio); |
| 2537 | return; | 2600 | return; |
| 2538 | } | 2601 | } |
| 2539 | 2602 | ||
| 2540 | do_sync = (r10_bio->master_bio->bi_opf & REQ_SYNC); | 2603 | do_sync = (r10_bio->master_bio->bi_opf & REQ_SYNC); |
| 2541 | slot = r10_bio->read_slot; | 2604 | slot = r10_bio->read_slot; |
| 2542 | printk_ratelimited( | 2605 | pr_err_ratelimited("md/raid10:%s: %s: redirecting sector %llu to another mirror\n", |
| 2543 | KERN_ERR | 2606 | mdname(mddev), |
| 2544 | "md/raid10:%s: %s: redirecting " | 2607 | bdevname(rdev->bdev, b), |
| 2545 | "sector %llu to another mirror\n", | 2608 | (unsigned long long)r10_bio->sector); |
| 2546 | mdname(mddev), | ||
| 2547 | bdevname(rdev->bdev, b), | ||
| 2548 | (unsigned long long)r10_bio->sector); | ||
| 2549 | bio = bio_clone_mddev(r10_bio->master_bio, | 2609 | bio = bio_clone_mddev(r10_bio->master_bio, |
| 2550 | GFP_NOIO, mddev); | 2610 | GFP_NOIO, mddev); |
| 2551 | bio_trim(bio, r10_bio->sector - bio->bi_iter.bi_sector, max_sectors); | 2611 | bio_trim(bio, r10_bio->sector - bio->bi_iter.bi_sector, max_sectors); |
| @@ -2555,8 +2615,15 @@ read_more: | |||
| 2555 | + choose_data_offset(r10_bio, rdev); | 2615 | + choose_data_offset(r10_bio, rdev); |
| 2556 | bio->bi_bdev = rdev->bdev; | 2616 | bio->bi_bdev = rdev->bdev; |
| 2557 | bio_set_op_attrs(bio, REQ_OP_READ, do_sync); | 2617 | bio_set_op_attrs(bio, REQ_OP_READ, do_sync); |
| 2618 | if (test_bit(FailFast, &rdev->flags) && | ||
| 2619 | test_bit(R10BIO_FailFast, &r10_bio->state)) | ||
| 2620 | bio->bi_opf |= MD_FAILFAST; | ||
| 2558 | bio->bi_private = r10_bio; | 2621 | bio->bi_private = r10_bio; |
| 2559 | bio->bi_end_io = raid10_end_read_request; | 2622 | bio->bi_end_io = raid10_end_read_request; |
| 2623 | trace_block_bio_remap(bdev_get_queue(bio->bi_bdev), | ||
| 2624 | bio, bio_dev, | ||
| 2625 | bio_last_sector - r10_bio->sectors); | ||
| 2626 | |||
| 2560 | if (max_sectors < r10_bio->sectors) { | 2627 | if (max_sectors < r10_bio->sectors) { |
| 2561 | /* Drat - have to split this up more */ | 2628 | /* Drat - have to split this up more */ |
| 2562 | struct bio *mbio = r10_bio->master_bio; | 2629 | struct bio *mbio = r10_bio->master_bio; |
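handle_read_error() now splits into three cases before retrying on another mirror: a read-only array simply blocks the failed slot, an ordinary device gets the freeze-array/fix_read_error() correction cycle, and a FailFast device is failed outright; the retried read then inherits MD_FAILFAST when a spare copy exists and is traced. A compact model of that branch (local names, not kernel code):

/* Compact model of the new three-way branch in handle_read_error(). */
#include <stdbool.h>
#include <stdio.h>

enum rderr_action { RD_BLOCK_SLOT, RD_FREEZE_AND_FIX, RD_FAIL_DEVICE };

static enum rderr_action read_error_action(bool array_read_only,
					   bool failfast_dev)
{
	if (array_read_only)
		return RD_BLOCK_SLOT;	  /* cannot rewrite; stop using this copy */
	if (!failfast_dev)
		return RD_FREEZE_AND_FIX; /* freeze array, try in-place correction */
	return RD_FAIL_DEVICE;		  /* failfast: kick the device out instead */
}

int main(void)
{
	printf("%d %d %d\n",
	       read_error_action(true, false),	/* 0 */
	       read_error_action(false, false),	/* 1 */
	       read_error_action(false, true));	/* 2 */
	return 0;
}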
| @@ -2694,10 +2761,10 @@ static void raid10d(struct md_thread *thread) | |||
| 2694 | md_check_recovery(mddev); | 2761 | md_check_recovery(mddev); |
| 2695 | 2762 | ||
| 2696 | if (!list_empty_careful(&conf->bio_end_io_list) && | 2763 | if (!list_empty_careful(&conf->bio_end_io_list) && |
| 2697 | !test_bit(MD_CHANGE_PENDING, &mddev->flags)) { | 2764 | !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) { |
| 2698 | LIST_HEAD(tmp); | 2765 | LIST_HEAD(tmp); |
| 2699 | spin_lock_irqsave(&conf->device_lock, flags); | 2766 | spin_lock_irqsave(&conf->device_lock, flags); |
| 2700 | if (!test_bit(MD_CHANGE_PENDING, &mddev->flags)) { | 2767 | if (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) { |
| 2701 | while (!list_empty(&conf->bio_end_io_list)) { | 2768 | while (!list_empty(&conf->bio_end_io_list)) { |
| 2702 | list_move(conf->bio_end_io_list.prev, &tmp); | 2769 | list_move(conf->bio_end_io_list.prev, &tmp); |
| 2703 | conf->nr_queued--; | 2770 | conf->nr_queued--; |
| @@ -2755,7 +2822,7 @@ static void raid10d(struct md_thread *thread) | |||
| 2755 | } | 2822 | } |
| 2756 | 2823 | ||
| 2757 | cond_resched(); | 2824 | cond_resched(); |
| 2758 | if (mddev->flags & ~(1<<MD_CHANGE_PENDING)) | 2825 | if (mddev->sb_flags & ~(1<<MD_SB_CHANGE_PENDING)) |
| 2759 | md_check_recovery(mddev); | 2826 | md_check_recovery(mddev); |
| 2760 | } | 2827 | } |
| 2761 | blk_finish_plug(&plug); | 2828 | blk_finish_plug(&plug); |
| @@ -3072,6 +3139,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, | |||
| 3072 | bio->bi_private = r10_bio; | 3139 | bio->bi_private = r10_bio; |
| 3073 | bio->bi_end_io = end_sync_read; | 3140 | bio->bi_end_io = end_sync_read; |
| 3074 | bio_set_op_attrs(bio, REQ_OP_READ, 0); | 3141 | bio_set_op_attrs(bio, REQ_OP_READ, 0); |
| 3142 | if (test_bit(FailFast, &rdev->flags)) | ||
| 3143 | bio->bi_opf |= MD_FAILFAST; | ||
| 3075 | from_addr = r10_bio->devs[j].addr; | 3144 | from_addr = r10_bio->devs[j].addr; |
| 3076 | bio->bi_iter.bi_sector = from_addr + | 3145 | bio->bi_iter.bi_sector = from_addr + |
| 3077 | rdev->data_offset; | 3146 | rdev->data_offset; |
| @@ -3160,8 +3229,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, | |||
| 3160 | if (!any_working) { | 3229 | if (!any_working) { |
| 3161 | if (!test_and_set_bit(MD_RECOVERY_INTR, | 3230 | if (!test_and_set_bit(MD_RECOVERY_INTR, |
| 3162 | &mddev->recovery)) | 3231 | &mddev->recovery)) |
| 3163 | printk(KERN_INFO "md/raid10:%s: insufficient " | 3232 | pr_warn("md/raid10:%s: insufficient working devices for recovery.\n", |
| 3164 | "working devices for recovery.\n", | ||
| 3165 | mdname(mddev)); | 3233 | mdname(mddev)); |
| 3166 | mirror->recovery_disabled | 3234 | mirror->recovery_disabled |
| 3167 | = mddev->recovery_disabled; | 3235 | = mddev->recovery_disabled; |
| @@ -3178,6 +3246,23 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, | |||
| 3178 | rdev_dec_pending(mrdev, mddev); | 3246 | rdev_dec_pending(mrdev, mddev); |
| 3179 | if (mreplace) | 3247 | if (mreplace) |
| 3180 | rdev_dec_pending(mreplace, mddev); | 3248 | rdev_dec_pending(mreplace, mddev); |
| 3249 | if (r10_bio->devs[0].bio->bi_opf & MD_FAILFAST) { | ||
| 3250 | /* Only want this if there is elsewhere to | ||
| 3251 | * read from. 'j' is currently the first | ||
| 3252 | * readable copy. | ||
| 3253 | */ | ||
| 3254 | int targets = 1; | ||
| 3255 | for (; j < conf->copies; j++) { | ||
| 3256 | int d = r10_bio->devs[j].devnum; | ||
| 3257 | if (conf->mirrors[d].rdev && | ||
| 3258 | test_bit(In_sync, | ||
| 3259 | &conf->mirrors[d].rdev->flags)) | ||
| 3260 | targets++; | ||
| 3261 | } | ||
| 3262 | if (targets == 1) | ||
| 3263 | r10_bio->devs[0].bio->bi_opf | ||
| 3264 | &= ~MD_FAILFAST; | ||
| 3265 | } | ||
| 3181 | } | 3266 | } |
| 3182 | if (biolist == NULL) { | 3267 | if (biolist == NULL) { |
| 3183 | while (r10_bio) { | 3268 | while (r10_bio) { |
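The recovery path above only keeps MD_FAILFAST on the source read when at least one other in-sync copy could serve the data; with a single readable copy, a transient transport error would needlessly abort recovery, so the flag is stripped. A standalone model of that rule (simplified to counting alternative in-sync copies):

/* Model of "only keep MD_FAILFAST if there is elsewhere to read from". */
#include <stdbool.h>
#include <stdio.h>

static bool keep_failfast_on_recovery_read(const bool in_sync[], int copies,
					   int source)
{
	int targets = 1;	/* the copy being read from */
	int j;

	for (j = 0; j < copies; j++)
		if (j != source && in_sync[j])
			targets++;
	return targets > 1;	/* more than one possible source */
}

int main(void)
{
	bool one_source[]  = { true, false, false };
	bool two_sources[] = { true, false, true };

	printf("%d %d\n",
	       keep_failfast_on_recovery_read(one_source, 3, 0),	/* 0 */
	       keep_failfast_on_recovery_read(two_sources, 3, 0));	/* 1 */
	return 0;
}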
| @@ -3256,6 +3341,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, | |||
| 3256 | bio->bi_private = r10_bio; | 3341 | bio->bi_private = r10_bio; |
| 3257 | bio->bi_end_io = end_sync_read; | 3342 | bio->bi_end_io = end_sync_read; |
| 3258 | bio_set_op_attrs(bio, REQ_OP_READ, 0); | 3343 | bio_set_op_attrs(bio, REQ_OP_READ, 0); |
| 3344 | if (test_bit(FailFast, &conf->mirrors[d].rdev->flags)) | ||
| 3345 | bio->bi_opf |= MD_FAILFAST; | ||
| 3259 | bio->bi_iter.bi_sector = sector + rdev->data_offset; | 3346 | bio->bi_iter.bi_sector = sector + rdev->data_offset; |
| 3260 | bio->bi_bdev = rdev->bdev; | 3347 | bio->bi_bdev = rdev->bdev; |
| 3261 | count++; | 3348 | count++; |
| @@ -3279,6 +3366,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, | |||
| 3279 | bio->bi_private = r10_bio; | 3366 | bio->bi_private = r10_bio; |
| 3280 | bio->bi_end_io = end_sync_write; | 3367 | bio->bi_end_io = end_sync_write; |
| 3281 | bio_set_op_attrs(bio, REQ_OP_WRITE, 0); | 3368 | bio_set_op_attrs(bio, REQ_OP_WRITE, 0); |
| 3369 | if (test_bit(FailFast, &conf->mirrors[d].rdev->flags)) | ||
| 3370 | bio->bi_opf |= MD_FAILFAST; | ||
| 3282 | bio->bi_iter.bi_sector = sector + rdev->data_offset; | 3371 | bio->bi_iter.bi_sector = sector + rdev->data_offset; |
| 3283 | bio->bi_bdev = rdev->bdev; | 3372 | bio->bi_bdev = rdev->bdev; |
| 3284 | count++; | 3373 | count++; |
| @@ -3489,15 +3578,14 @@ static struct r10conf *setup_conf(struct mddev *mddev) | |||
| 3489 | copies = setup_geo(&geo, mddev, geo_new); | 3578 | copies = setup_geo(&geo, mddev, geo_new); |
| 3490 | 3579 | ||
| 3491 | if (copies == -2) { | 3580 | if (copies == -2) { |
| 3492 | printk(KERN_ERR "md/raid10:%s: chunk size must be " | 3581 | pr_warn("md/raid10:%s: chunk size must be at least PAGE_SIZE(%ld) and be a power of 2.\n", |
| 3493 | "at least PAGE_SIZE(%ld) and be a power of 2.\n", | 3582 | mdname(mddev), PAGE_SIZE); |
| 3494 | mdname(mddev), PAGE_SIZE); | ||
| 3495 | goto out; | 3583 | goto out; |
| 3496 | } | 3584 | } |
| 3497 | 3585 | ||
| 3498 | if (copies < 2 || copies > mddev->raid_disks) { | 3586 | if (copies < 2 || copies > mddev->raid_disks) { |
| 3499 | printk(KERN_ERR "md/raid10:%s: unsupported raid10 layout: 0x%8x\n", | 3587 | pr_warn("md/raid10:%s: unsupported raid10 layout: 0x%8x\n", |
| 3500 | mdname(mddev), mddev->new_layout); | 3588 | mdname(mddev), mddev->new_layout); |
| 3501 | goto out; | 3589 | goto out; |
| 3502 | } | 3590 | } |
| 3503 | 3591 | ||
| @@ -3557,9 +3645,6 @@ static struct r10conf *setup_conf(struct mddev *mddev) | |||
| 3557 | return conf; | 3645 | return conf; |
| 3558 | 3646 | ||
| 3559 | out: | 3647 | out: |
| 3560 | if (err == -ENOMEM) | ||
| 3561 | printk(KERN_ERR "md/raid10:%s: couldn't allocate memory.\n", | ||
| 3562 | mdname(mddev)); | ||
| 3563 | if (conf) { | 3648 | if (conf) { |
| 3564 | mempool_destroy(conf->r10bio_pool); | 3649 | mempool_destroy(conf->r10bio_pool); |
| 3565 | kfree(conf->mirrors); | 3650 | kfree(conf->mirrors); |
| @@ -3656,7 +3741,7 @@ static int raid10_run(struct mddev *mddev) | |||
| 3656 | } | 3741 | } |
| 3657 | /* need to check that every block has at least one working mirror */ | 3742 | /* need to check that every block has at least one working mirror */ |
| 3658 | if (!enough(conf, -1)) { | 3743 | if (!enough(conf, -1)) { |
| 3659 | printk(KERN_ERR "md/raid10:%s: not enough operational mirrors.\n", | 3744 | pr_err("md/raid10:%s: not enough operational mirrors.\n", |
| 3660 | mdname(mddev)); | 3745 | mdname(mddev)); |
| 3661 | goto out_free_conf; | 3746 | goto out_free_conf; |
| 3662 | } | 3747 | } |
| @@ -3698,11 +3783,9 @@ static int raid10_run(struct mddev *mddev) | |||
| 3698 | } | 3783 | } |
| 3699 | 3784 | ||
| 3700 | if (mddev->recovery_cp != MaxSector) | 3785 | if (mddev->recovery_cp != MaxSector) |
| 3701 | printk(KERN_NOTICE "md/raid10:%s: not clean" | 3786 | pr_notice("md/raid10:%s: not clean -- starting background reconstruction\n", |
| 3702 | " -- starting background reconstruction\n", | 3787 | mdname(mddev)); |
| 3703 | mdname(mddev)); | 3788 | pr_info("md/raid10:%s: active with %d out of %d devices\n", |
| 3704 | printk(KERN_INFO | ||
| 3705 | "md/raid10:%s: active with %d out of %d devices\n", | ||
| 3706 | mdname(mddev), conf->geo.raid_disks - mddev->degraded, | 3789 | mdname(mddev), conf->geo.raid_disks - mddev->degraded, |
| 3707 | conf->geo.raid_disks); | 3790 | conf->geo.raid_disks); |
| 3708 | /* | 3791 | /* |
| @@ -3712,6 +3795,7 @@ static int raid10_run(struct mddev *mddev) | |||
| 3712 | size = raid10_size(mddev, 0, 0); | 3795 | size = raid10_size(mddev, 0, 0); |
| 3713 | md_set_array_sectors(mddev, size); | 3796 | md_set_array_sectors(mddev, size); |
| 3714 | mddev->resync_max_sectors = size; | 3797 | mddev->resync_max_sectors = size; |
| 3798 | set_bit(MD_FAILFAST_SUPPORTED, &mddev->flags); | ||
| 3715 | 3799 | ||
| 3716 | if (mddev->queue) { | 3800 | if (mddev->queue) { |
| 3717 | int stripe = conf->geo.raid_disks * | 3801 | int stripe = conf->geo.raid_disks * |
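raid10_run() advertising MD_FAILFAST_SUPPORTED lets the md core use fail-fast requests for its own metadata writes as well. For orientation, MD_FAILFAST is understood in this series to combine the block layer's fail-fast request flags; the snippet below is a userspace sketch with illustrative flag values (an assumption, not a quote from md.h), showing only how the per-device FailFast bit turns into per-bio fail-fast flags:

/* Sketch of the flag composition; values are illustrative. */
#include <stdio.h>

#define REQ_FAILFAST_DEV	(1u << 8)
#define REQ_FAILFAST_TRANSPORT	(1u << 9)
#define MD_FAILFAST	(REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT)

int main(void)
{
	unsigned int bi_opf = 0;	/* REQ_OP_READ in the real code */
	int rdev_failfast = 1, r10bio_failfast = 1;

	if (rdev_failfast && r10bio_failfast)
		bi_opf |= MD_FAILFAST;
	printf("bi_opf = %#x\n", bi_opf);
	return 0;
}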
| @@ -3739,7 +3823,7 @@ static int raid10_run(struct mddev *mddev) | |||
| 3739 | 3823 | ||
| 3740 | if (max(before_length, after_length) > min_offset_diff) { | 3824 | if (max(before_length, after_length) > min_offset_diff) { |
| 3741 | /* This cannot work */ | 3825 | /* This cannot work */ |
| 3742 | printk("md/raid10: offset difference not enough to continue reshape\n"); | 3826 | pr_warn("md/raid10: offset difference not enough to continue reshape\n"); |
| 3743 | goto out_free_conf; | 3827 | goto out_free_conf; |
| 3744 | } | 3828 | } |
| 3745 | conf->offset_diff = min_offset_diff; | 3829 | conf->offset_diff = min_offset_diff; |
| @@ -3846,8 +3930,8 @@ static void *raid10_takeover_raid0(struct mddev *mddev, sector_t size, int devs) | |||
| 3846 | struct r10conf *conf; | 3930 | struct r10conf *conf; |
| 3847 | 3931 | ||
| 3848 | if (mddev->degraded > 0) { | 3932 | if (mddev->degraded > 0) { |
| 3849 | printk(KERN_ERR "md/raid10:%s: Error: degraded raid0!\n", | 3933 | pr_warn("md/raid10:%s: Error: degraded raid0!\n", |
| 3850 | mdname(mddev)); | 3934 | mdname(mddev)); |
| 3851 | return ERR_PTR(-EINVAL); | 3935 | return ERR_PTR(-EINVAL); |
| 3852 | } | 3936 | } |
| 3853 | sector_div(size, devs); | 3937 | sector_div(size, devs); |
| @@ -3887,9 +3971,8 @@ static void *raid10_takeover(struct mddev *mddev) | |||
| 3887 | /* for raid0 takeover only one zone is supported */ | 3971 | /* for raid0 takeover only one zone is supported */ |
| 3888 | raid0_conf = mddev->private; | 3972 | raid0_conf = mddev->private; |
| 3889 | if (raid0_conf->nr_strip_zones > 1) { | 3973 | if (raid0_conf->nr_strip_zones > 1) { |
| 3890 | printk(KERN_ERR "md/raid10:%s: cannot takeover raid 0" | 3974 | pr_warn("md/raid10:%s: cannot takeover raid 0 with more than one zone.\n", |
| 3891 | " with more than one zone.\n", | 3975 | mdname(mddev)); |
| 3892 | mdname(mddev)); | ||
| 3893 | return ERR_PTR(-EINVAL); | 3976 | return ERR_PTR(-EINVAL); |
| 3894 | } | 3977 | } |
| 3895 | return raid10_takeover_raid0(mddev, | 3978 | return raid10_takeover_raid0(mddev, |
| @@ -4078,8 +4161,8 @@ static int raid10_start_reshape(struct mddev *mddev) | |||
| 4078 | sector_t size = raid10_size(mddev, 0, 0); | 4161 | sector_t size = raid10_size(mddev, 0, 0); |
| 4079 | if (size < mddev->array_sectors) { | 4162 | if (size < mddev->array_sectors) { |
| 4080 | spin_unlock_irq(&conf->device_lock); | 4163 | spin_unlock_irq(&conf->device_lock); |
| 4081 | printk(KERN_ERR "md/raid10:%s: array size must be reduced before number of disks\n", | 4164 | pr_warn("md/raid10:%s: array size must be reduced before number of disks\n", |
| 4082 | mdname(mddev)); | 4165 | mdname(mddev)); |
| 4083 | return -EINVAL; | 4166 | return -EINVAL; |
| 4084 | } | 4167 | } |
| 4085 | mddev->resync_max_sectors = size; | 4168 | mddev->resync_max_sectors = size; |
| @@ -4126,7 +4209,7 @@ static int raid10_start_reshape(struct mddev *mddev) | |||
| 4126 | spin_unlock_irq(&conf->device_lock); | 4209 | spin_unlock_irq(&conf->device_lock); |
| 4127 | mddev->raid_disks = conf->geo.raid_disks; | 4210 | mddev->raid_disks = conf->geo.raid_disks; |
| 4128 | mddev->reshape_position = conf->reshape_progress; | 4211 | mddev->reshape_position = conf->reshape_progress; |
| 4129 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | 4212 | set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); |
| 4130 | 4213 | ||
| 4131 | clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); | 4214 | clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); |
| 4132 | clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); | 4215 | clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); |
| @@ -4321,9 +4404,9 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, | |||
| 4321 | else | 4404 | else |
| 4322 | mddev->curr_resync_completed = conf->reshape_progress; | 4405 | mddev->curr_resync_completed = conf->reshape_progress; |
| 4323 | conf->reshape_checkpoint = jiffies; | 4406 | conf->reshape_checkpoint = jiffies; |
| 4324 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | 4407 | set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); |
| 4325 | md_wakeup_thread(mddev->thread); | 4408 | md_wakeup_thread(mddev->thread); |
| 4326 | wait_event(mddev->sb_wait, mddev->flags == 0 || | 4409 | wait_event(mddev->sb_wait, mddev->sb_flags == 0 || |
| 4327 | test_bit(MD_RECOVERY_INTR, &mddev->recovery)); | 4410 | test_bit(MD_RECOVERY_INTR, &mddev->recovery)); |
| 4328 | if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { | 4411 | if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { |
| 4329 | allow_barrier(conf); | 4412 | allow_barrier(conf); |
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h index 18ec1f7a98bf..3162615e57bd 100644 --- a/drivers/md/raid10.h +++ b/drivers/md/raid10.h | |||
| @@ -156,5 +156,7 @@ enum r10bio_state { | |||
| 156 | * flag is set | 156 | * flag is set |
| 157 | */ | 157 | */ |
| 158 | R10BIO_Previous, | 158 | R10BIO_Previous, |
| 159 | /* failfast devices did receive failfast requests. */ | ||
| 160 | R10BIO_FailFast, | ||
| 159 | }; | 161 | }; |
| 160 | #endif | 162 | #endif |
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 8491edcfb5a6..d7bfb6fc8aef 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c | |||
| @@ -1,5 +1,6 @@ | |||
| 1 | /* | 1 | /* |
| 2 | * Copyright (C) 2015 Shaohua Li <shli@fb.com> | 2 | * Copyright (C) 2015 Shaohua Li <shli@fb.com> |
| 3 | * Copyright (C) 2016 Song Liu <songliubraving@fb.com> | ||
| 3 | * | 4 | * |
| 4 | * This program is free software; you can redistribute it and/or modify it | 5 | * This program is free software; you can redistribute it and/or modify it |
| 5 | * under the terms and conditions of the GNU General Public License, | 6 | * under the terms and conditions of the GNU General Public License, |
| @@ -18,8 +19,10 @@ | |||
| 18 | #include <linux/raid/md_p.h> | 19 | #include <linux/raid/md_p.h> |
| 19 | #include <linux/crc32c.h> | 20 | #include <linux/crc32c.h> |
| 20 | #include <linux/random.h> | 21 | #include <linux/random.h> |
| 22 | #include <linux/kthread.h> | ||
| 21 | #include "md.h" | 23 | #include "md.h" |
| 22 | #include "raid5.h" | 24 | #include "raid5.h" |
| 25 | #include "bitmap.h" | ||
| 23 | 26 | ||
| 24 | /* | 27 | /* |
| 25 | * metadata/data stored in disk with 4k size unit (a block) regardless | 28 | * metadata/data stored in disk with 4k size unit (a block) regardless |
| @@ -28,18 +31,70 @@ | |||
| 28 | #define BLOCK_SECTORS (8) | 31 | #define BLOCK_SECTORS (8) |
| 29 | 32 | ||
| 30 | /* | 33 | /* |
| 31 | * reclaim runs every 1/4 disk size or 10G reclaimable space. This can prevent | 34 | * log->max_free_space is min(1/4 disk size, 10G reclaimable space). |
| 32 | * recovery scans a very long log | 35 | * |
| 36 | * In write-through mode, reclaim runs every log->max_free_space. | ||
| 37 | * This keeps recovery from having to scan a very long log. | ||
| 33 | */ | 38 | */ |
| 34 | #define RECLAIM_MAX_FREE_SPACE (10 * 1024 * 1024 * 2) /* sector */ | 39 | #define RECLAIM_MAX_FREE_SPACE (10 * 1024 * 1024 * 2) /* sector */ |
| 35 | #define RECLAIM_MAX_FREE_SPACE_SHIFT (2) | 40 | #define RECLAIM_MAX_FREE_SPACE_SHIFT (2) |
| 36 | 41 | ||
| 42 | /* wake up reclaim thread periodically */ | ||
| 43 | #define R5C_RECLAIM_WAKEUP_INTERVAL (30 * HZ) | ||
| 44 | /* start flush with these full stripes */ | ||
| 45 | #define R5C_FULL_STRIPE_FLUSH_BATCH 256 | ||
| 46 | /* reclaim stripes in groups */ | ||
| 47 | #define R5C_RECLAIM_STRIPE_GROUP (NR_STRIPE_HASH_LOCKS * 2) | ||
| 48 | |||
| 37 | /* | 49 | /* |
| 38 | * We only need 2 bios per I/O unit to make progress, but ensure we | 50 | * We only need 2 bios per I/O unit to make progress, but ensure we |
| 39 | * have a few more available to not get too tight. | 51 | * have a few more available to not get too tight. |
| 40 | */ | 52 | */ |
| 41 | #define R5L_POOL_SIZE 4 | 53 | #define R5L_POOL_SIZE 4 |
| 42 | 54 | ||
| 55 | /* | ||
| 56 | * r5c journal modes of the array: write-back or write-through. | ||
| 57 | * write-through mode has identical behavior as existing log only | ||
| 58 | * implementation. | ||
| 59 | */ | ||
| 60 | enum r5c_journal_mode { | ||
| 61 | R5C_JOURNAL_MODE_WRITE_THROUGH = 0, | ||
| 62 | R5C_JOURNAL_MODE_WRITE_BACK = 1, | ||
| 63 | }; | ||
| 64 | |||
| 65 | static char *r5c_journal_mode_str[] = {"write-through", | ||
| 66 | "write-back"}; | ||
| 67 | /* | ||
| 68 | * raid5 cache state machine | ||
| 69 | * | ||
| 70 | * With the RAID cache, each stripe works in two phases: | ||
| 71 | * - caching phase | ||
| 72 | * - writing-out phase | ||
| 73 | * | ||
| 74 | * These two phases are controlled by bit STRIPE_R5C_CACHING: | ||
| 75 | * if STRIPE_R5C_CACHING == 0, the stripe is in writing-out phase | ||
| 76 | * if STRIPE_R5C_CACHING == 1, the stripe is in caching phase | ||
| 77 | * | ||
| 78 | * When there is no journal, or the journal is in write-through mode, | ||
| 79 | * the stripe is always in writing-out phase. | ||
| 80 | * | ||
| 81 | * For write-back journal, the stripe is sent to caching phase on write | ||
| 82 | * (r5c_try_caching_write). r5c_make_stripe_write_out() kicks off | ||
| 83 | * the write-out phase by clearing STRIPE_R5C_CACHING. | ||
| 84 | * | ||
| 85 | * Stripes in caching phase do not write the raid disks. Instead, all | ||
| 86 | * writes are committed from the log device. Therefore, a stripe in | ||
| 87 | * caching phase handles writes as: | ||
| 88 | * - write to log device | ||
| 89 | * - return IO | ||
| 90 | * | ||
| 91 | * Stripes in writing-out phase handle writes as: | ||
| 92 | * - calculate parity | ||
| 93 | * - write pending data and parity to journal | ||
| 94 | * - write data and parity to raid disks | ||
| 95 | * - return IO for pending writes | ||
| 96 | */ | ||
| 97 | |||
| 43 | struct r5l_log { | 98 | struct r5l_log { |
| 44 | struct md_rdev *rdev; | 99 | struct md_rdev *rdev; |
| 45 | 100 | ||
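The comment block above defines the two phases a stripe moves through under the write-back cache: STRIPE_R5C_CACHING decides whether a write is merely committed to the journal (caching phase) or goes through parity calculation and out to the raid disks (writing-out phase). A standalone model of that routing decision (names are local to this sketch):

/* Standalone model of the phase selection described above. */
#include <stdbool.h>
#include <stdio.h>

enum mode { WRITE_THROUGH, WRITE_BACK };

static const char *route_write(enum mode journal_mode, bool stripe_caching)
{
	if (journal_mode == WRITE_THROUGH || !stripe_caching)
		/* writing-out phase: parity + journal + raid disks */
		return "compute parity, log data+parity, write raid disks";
	/* caching phase: data only goes to the journal, IO completes */
	return "log data only, complete the bio";
}

int main(void)
{
	printf("%s\n%s\n",
	       route_write(WRITE_THROUGH, false),
	       route_write(WRITE_BACK, true));
	return 0;
}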
| @@ -58,7 +113,6 @@ struct r5l_log { | |||
| 58 | u64 seq; /* log head sequence */ | 113 | u64 seq; /* log head sequence */ |
| 59 | 114 | ||
| 60 | sector_t next_checkpoint; | 115 | sector_t next_checkpoint; |
| 61 | u64 next_cp_seq; | ||
| 62 | 116 | ||
| 63 | struct mutex io_mutex; | 117 | struct mutex io_mutex; |
| 64 | struct r5l_io_unit *current_io; /* current io_unit accepting new data */ | 118 | struct r5l_io_unit *current_io; /* current io_unit accepting new data */ |
| @@ -96,6 +150,18 @@ struct r5l_log { | |||
| 96 | spinlock_t no_space_stripes_lock; | 150 | spinlock_t no_space_stripes_lock; |
| 97 | 151 | ||
| 98 | bool need_cache_flush; | 152 | bool need_cache_flush; |
| 153 | |||
| 154 | /* for r5c_cache */ | ||
| 155 | enum r5c_journal_mode r5c_journal_mode; | ||
| 156 | |||
| 157 | /* all stripes in r5cache, in the order of seq at sh->log_start */ | ||
| 158 | struct list_head stripe_in_journal_list; | ||
| 159 | |||
| 160 | spinlock_t stripe_in_journal_lock; | ||
| 161 | atomic_t stripe_in_journal_count; | ||
| 162 | |||
| 163 | /* to submit async io_units, to fulfill ordering of flush */ | ||
| 164 | struct work_struct deferred_io_work; | ||
| 99 | }; | 165 | }; |
| 100 | 166 | ||
| 101 | /* | 167 | /* |
| @@ -122,6 +188,18 @@ struct r5l_io_unit { | |||
| 122 | 188 | ||
| 123 | int state; | 189 | int state; |
| 124 | bool need_split_bio; | 190 | bool need_split_bio; |
| 191 | struct bio *split_bio; | ||
| 192 | |||
| 193 | unsigned int has_flush:1; /* include flush request */ | ||
| 194 | unsigned int has_fua:1; /* include fua request */ | ||
| 195 | unsigned int has_null_flush:1; /* include empty flush request */ | ||
| 196 | /* | ||
| 197 | * io isn't sent yet; a flush/fua request can only be submitted once it | ||
| 198 | * is the first IO in the running_ios list | ||
| 199 | */ | ||
| 200 | unsigned int io_deferred:1; | ||
| 201 | |||
| 202 | struct bio_list flush_barriers; /* size == 0 flush bios */ | ||
| 125 | }; | 203 | }; |
| 126 | 204 | ||
| 127 | /* r5l_io_unit state */ | 205 | /* r5l_io_unit state */ |
| @@ -133,6 +211,12 @@ enum r5l_io_unit_state { | |||
| 133 | IO_UNIT_STRIPE_END = 3, /* stripes data finished writing to raid */ | 211 | IO_UNIT_STRIPE_END = 3, /* stripes data finished writing to raid */ |
| 134 | }; | 212 | }; |
| 135 | 213 | ||
| 214 | bool r5c_is_writeback(struct r5l_log *log) | ||
| 215 | { | ||
| 216 | return (log != NULL && | ||
| 217 | log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK); | ||
| 218 | } | ||
| 219 | |||
| 136 | static sector_t r5l_ring_add(struct r5l_log *log, sector_t start, sector_t inc) | 220 | static sector_t r5l_ring_add(struct r5l_log *log, sector_t start, sector_t inc) |
| 137 | { | 221 | { |
| 138 | start += inc; | 222 | start += inc; |
| @@ -168,12 +252,235 @@ static void __r5l_set_io_unit_state(struct r5l_io_unit *io, | |||
| 168 | io->state = state; | 252 | io->state = state; |
| 169 | } | 253 | } |
| 170 | 254 | ||
| 255 | static void | ||
| 256 | r5c_return_dev_pending_writes(struct r5conf *conf, struct r5dev *dev, | ||
| 257 | struct bio_list *return_bi) | ||
| 258 | { | ||
| 259 | struct bio *wbi, *wbi2; | ||
| 260 | |||
| 261 | wbi = dev->written; | ||
| 262 | dev->written = NULL; | ||
| 263 | while (wbi && wbi->bi_iter.bi_sector < | ||
| 264 | dev->sector + STRIPE_SECTORS) { | ||
| 265 | wbi2 = r5_next_bio(wbi, dev->sector); | ||
| 266 | if (!raid5_dec_bi_active_stripes(wbi)) { | ||
| 267 | md_write_end(conf->mddev); | ||
| 268 | bio_list_add(return_bi, wbi); | ||
| 269 | } | ||
| 270 | wbi = wbi2; | ||
| 271 | } | ||
| 272 | } | ||
| 273 | |||
| 274 | void r5c_handle_cached_data_endio(struct r5conf *conf, | ||
| 275 | struct stripe_head *sh, int disks, struct bio_list *return_bi) | ||
| 276 | { | ||
| 277 | int i; | ||
| 278 | |||
| 279 | for (i = sh->disks; i--; ) { | ||
| 280 | if (sh->dev[i].written) { | ||
| 281 | set_bit(R5_UPTODATE, &sh->dev[i].flags); | ||
| 282 | r5c_return_dev_pending_writes(conf, &sh->dev[i], | ||
| 283 | return_bi); | ||
| 284 | bitmap_endwrite(conf->mddev->bitmap, sh->sector, | ||
| 285 | STRIPE_SECTORS, | ||
| 286 | !test_bit(STRIPE_DEGRADED, &sh->state), | ||
| 287 | 0); | ||
| 288 | } | ||
| 289 | } | ||
| 290 | } | ||
| 291 | |||
| 292 | /* Check whether we should flush some stripes to free up stripe cache */ | ||
| 293 | void r5c_check_stripe_cache_usage(struct r5conf *conf) | ||
| 294 | { | ||
| 295 | int total_cached; | ||
| 296 | |||
| 297 | if (!r5c_is_writeback(conf->log)) | ||
| 298 | return; | ||
| 299 | |||
| 300 | total_cached = atomic_read(&conf->r5c_cached_partial_stripes) + | ||
| 301 | atomic_read(&conf->r5c_cached_full_stripes); | ||
| 302 | |||
| 303 | /* | ||
| 304 | * The following condition is true for either of the following: | ||
| 305 | * - stripe cache pressure high: | ||
| 306 | * total_cached > 3/4 min_nr_stripes || | ||
| 307 | * empty_inactive_list_nr > 0 | ||
| 308 | * - stripe cache pressure moderate: | ||
| 309 | * total_cached > 1/2 min_nr_stripes | ||
| 310 | */ | ||
| 311 | if (total_cached > conf->min_nr_stripes * 1 / 2 || | ||
| 312 | atomic_read(&conf->empty_inactive_list_nr) > 0) | ||
| 313 | r5l_wake_reclaim(conf->log, 0); | ||
| 314 | } | ||
| 315 | |||
| 316 | /* | ||
| 317 | * flush cache when there are R5C_FULL_STRIPE_FLUSH_BATCH or more full | ||
| 318 | * stripes in the cache | ||
| 319 | */ | ||
| 320 | void r5c_check_cached_full_stripe(struct r5conf *conf) | ||
| 321 | { | ||
| 322 | if (!r5c_is_writeback(conf->log)) | ||
| 323 | return; | ||
| 324 | |||
| 325 | /* | ||
| 326 | * wake up reclaim for R5C_FULL_STRIPE_FLUSH_BATCH cached stripes | ||
| 327 | * or a full stripe (chunk size / 4k stripes). | ||
| 328 | */ | ||
| 329 | if (atomic_read(&conf->r5c_cached_full_stripes) >= | ||
| 330 | min(R5C_FULL_STRIPE_FLUSH_BATCH, | ||
| 331 | conf->chunk_sectors >> STRIPE_SHIFT)) | ||
| 332 | r5l_wake_reclaim(conf->log, 0); | ||
| 333 | } | ||
| 334 | |||
| 335 | /* | ||
| 336 | * Total log space (in sectors) needed to flush all data in cache | ||
| 337 | * | ||
| 338 | * Currently, writing-out phase automatically includes all pending writes | ||
| 339 | * to the same sector. So the reclaim of each stripe takes up to | ||
| 340 | * (conf->raid_disks + 1) pages of log space. | ||
| 341 | * | ||
| 342 | * To totally avoid deadlock due to log space, the code reserves | ||
| 343 | * (conf->raid_disks + 1) pages for each stripe in cache, which is not | ||
| 344 | * necessary in most cases. | ||
| 345 | * | ||
| 346 | * To improve this, we will need writing-out phase to be able to NOT include | ||
| 347 | * pending writes, which will reduce the requirement to | ||
| 348 | * (conf->max_degraded + 1) pages per stripe in cache. | ||
| 349 | */ | ||
| 350 | static sector_t r5c_log_required_to_flush_cache(struct r5conf *conf) | ||
| 351 | { | ||
| 352 | struct r5l_log *log = conf->log; | ||
| 353 | |||
| 354 | if (!r5c_is_writeback(log)) | ||
| 355 | return 0; | ||
| 356 | |||
| 357 | return BLOCK_SECTORS * (conf->raid_disks + 1) * | ||
| 358 | atomic_read(&log->stripe_in_journal_count); | ||
| 359 | } | ||
| 360 | |||
| 361 | /* | ||
| 362 | * evaluate log space usage and update R5C_LOG_TIGHT and R5C_LOG_CRITICAL | ||
| 363 | * | ||
| 364 | * R5C_LOG_TIGHT is set when free space on the log device is less than 3x of | ||
| 365 | * reclaim_required_space. R5C_LOG_CRITICAL is set when free space on the log | ||
| 366 | * device is less than 2x of reclaim_required_space. | ||
| 367 | */ | ||
| 368 | static inline void r5c_update_log_state(struct r5l_log *log) | ||
| 369 | { | ||
| 370 | struct r5conf *conf = log->rdev->mddev->private; | ||
| 371 | sector_t free_space; | ||
| 372 | sector_t reclaim_space; | ||
| 373 | bool wake_reclaim = false; | ||
| 374 | |||
| 375 | if (!r5c_is_writeback(log)) | ||
| 376 | return; | ||
| 377 | |||
| 378 | free_space = r5l_ring_distance(log, log->log_start, | ||
| 379 | log->last_checkpoint); | ||
| 380 | reclaim_space = r5c_log_required_to_flush_cache(conf); | ||
| 381 | if (free_space < 2 * reclaim_space) | ||
| 382 | set_bit(R5C_LOG_CRITICAL, &conf->cache_state); | ||
| 383 | else { | ||
| 384 | if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state)) | ||
| 385 | wake_reclaim = true; | ||
| 386 | clear_bit(R5C_LOG_CRITICAL, &conf->cache_state); | ||
| 387 | } | ||
| 388 | if (free_space < 3 * reclaim_space) | ||
| 389 | set_bit(R5C_LOG_TIGHT, &conf->cache_state); | ||
| 390 | else | ||
| 391 | clear_bit(R5C_LOG_TIGHT, &conf->cache_state); | ||
| 392 | |||
| 393 | if (wake_reclaim) | ||
| 394 | r5l_wake_reclaim(log, 0); | ||
| 395 | } | ||
| 396 | |||
| 397 | /* | ||
| 398 | * Put the stripe into writing-out phase by clearing STRIPE_R5C_CACHING. | ||
| 399 | * This function should only be called in write-back mode. | ||
| 400 | */ | ||
| 401 | void r5c_make_stripe_write_out(struct stripe_head *sh) | ||
| 402 | { | ||
| 403 | struct r5conf *conf = sh->raid_conf; | ||
| 404 | struct r5l_log *log = conf->log; | ||
| 405 | |||
| 406 | BUG_ON(!r5c_is_writeback(log)); | ||
| 407 | |||
| 408 | WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state)); | ||
| 409 | clear_bit(STRIPE_R5C_CACHING, &sh->state); | ||
| 410 | |||
| 411 | if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) | ||
| 412 | atomic_inc(&conf->preread_active_stripes); | ||
| 413 | |||
| 414 | if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) { | ||
| 415 | BUG_ON(atomic_read(&conf->r5c_cached_partial_stripes) == 0); | ||
| 416 | atomic_dec(&conf->r5c_cached_partial_stripes); | ||
| 417 | } | ||
| 418 | |||
| 419 | if (test_and_clear_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) { | ||
| 420 | BUG_ON(atomic_read(&conf->r5c_cached_full_stripes) == 0); | ||
| 421 | atomic_dec(&conf->r5c_cached_full_stripes); | ||
| 422 | } | ||
| 423 | } | ||
| 424 | |||
| 425 | static void r5c_handle_data_cached(struct stripe_head *sh) | ||
| 426 | { | ||
| 427 | int i; | ||
| 428 | |||
| 429 | for (i = sh->disks; i--; ) | ||
| 430 | if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) { | ||
| 431 | set_bit(R5_InJournal, &sh->dev[i].flags); | ||
| 432 | clear_bit(R5_LOCKED, &sh->dev[i].flags); | ||
| 433 | } | ||
| 434 | clear_bit(STRIPE_LOG_TRAPPED, &sh->state); | ||
| 435 | } | ||
| 436 | |||
| 437 | /* | ||
| 438 | * this journal write must contain full parity, | ||
| 439 | * it may also contain some data pages | ||
| 440 | */ | ||
| 441 | static void r5c_handle_parity_cached(struct stripe_head *sh) | ||
| 442 | { | ||
| 443 | int i; | ||
| 444 | |||
| 445 | for (i = sh->disks; i--; ) | ||
| 446 | if (test_bit(R5_InJournal, &sh->dev[i].flags)) | ||
| 447 | set_bit(R5_Wantwrite, &sh->dev[i].flags); | ||
| 448 | } | ||
| 449 | |||
| 450 | /* | ||
| 451 | * Setting proper flags after writing (or flushing) data and/or parity to the | ||
| 452 | * log device. This is called from r5l_log_endio() or r5l_log_flush_endio(). | ||
| 453 | */ | ||
| 454 | static void r5c_finish_cache_stripe(struct stripe_head *sh) | ||
| 455 | { | ||
| 456 | struct r5l_log *log = sh->raid_conf->log; | ||
| 457 | |||
| 458 | if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) { | ||
| 459 | BUG_ON(test_bit(STRIPE_R5C_CACHING, &sh->state)); | ||
| 460 | /* | ||
| 461 | * Set R5_InJournal for parity dev[pd_idx]. This means | ||
| 462 | * all data AND parity in the journal. For RAID 6, it is | ||
| 463 | * NOT necessary to set the flag for dev[qd_idx], as the | ||
| 464 | * two parities are written out together. | ||
| 465 | */ | ||
| 466 | set_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags); | ||
| 467 | } else if (test_bit(STRIPE_R5C_CACHING, &sh->state)) { | ||
| 468 | r5c_handle_data_cached(sh); | ||
| 469 | } else { | ||
| 470 | r5c_handle_parity_cached(sh); | ||
| 471 | set_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags); | ||
| 472 | } | ||
| 473 | } | ||
| 474 | |||
| 171 | static void r5l_io_run_stripes(struct r5l_io_unit *io) | 475 | static void r5l_io_run_stripes(struct r5l_io_unit *io) |
| 172 | { | 476 | { |
| 173 | struct stripe_head *sh, *next; | 477 | struct stripe_head *sh, *next; |
| 174 | 478 | ||
| 175 | list_for_each_entry_safe(sh, next, &io->stripe_list, log_list) { | 479 | list_for_each_entry_safe(sh, next, &io->stripe_list, log_list) { |
| 176 | list_del_init(&sh->log_list); | 480 | list_del_init(&sh->log_list); |
| 481 | |||
| 482 | r5c_finish_cache_stripe(sh); | ||
| 483 | |||
| 177 | set_bit(STRIPE_HANDLE, &sh->state); | 484 | set_bit(STRIPE_HANDLE, &sh->state); |
| 178 | raid5_release_stripe(sh); | 485 | raid5_release_stripe(sh); |
| 179 | } | 486 | } |
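The helpers above size the reclaim pressure: each journaled stripe pessimistically reserves BLOCK_SECTORS * (raid_disks + 1) of log space, and r5c_update_log_state() raises R5C_LOG_CRITICAL when the free ring space drops below 2x that reservation and R5C_LOG_TIGHT below 3x. A worked example with illustrative numbers (not taken from the patch):

/*
 * Worked example of the log-space thresholds: an 8-disk array, 1000
 * stripes in the journal, 4k blocks (BLOCK_SECTORS == 8), sizes in
 * 512-byte sectors.
 */
#include <stdbool.h>
#include <stdio.h>

#define BLOCK_SECTORS 8ULL

int main(void)
{
	unsigned long long raid_disks = 8, stripes_in_journal = 1000;
	unsigned long long free_space = 180000;	/* sectors left in the ring */

	unsigned long long reclaim_space =
		BLOCK_SECTORS * (raid_disks + 1) * stripes_in_journal; /* 72000 */

	bool log_critical = free_space < 2 * reclaim_space; /* 180000 < 144000: no  */
	bool log_tight    = free_space < 3 * reclaim_space; /* 180000 < 216000: yes */

	printf("reclaim_space=%llu critical=%d tight=%d\n",
	       reclaim_space, log_critical, log_tight);
	return 0;
}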
| @@ -209,9 +516,11 @@ static void r5l_move_to_end_ios(struct r5l_log *log) | |||
| 209 | } | 516 | } |
| 210 | } | 517 | } |
| 211 | 518 | ||
| 519 | static void __r5l_stripe_write_finished(struct r5l_io_unit *io); | ||
| 212 | static void r5l_log_endio(struct bio *bio) | 520 | static void r5l_log_endio(struct bio *bio) |
| 213 | { | 521 | { |
| 214 | struct r5l_io_unit *io = bio->bi_private; | 522 | struct r5l_io_unit *io = bio->bi_private; |
| 523 | struct r5l_io_unit *io_deferred; | ||
| 215 | struct r5l_log *log = io->log; | 524 | struct r5l_log *log = io->log; |
| 216 | unsigned long flags; | 525 | unsigned long flags; |
| 217 | 526 | ||
| @@ -227,18 +536,89 @@ static void r5l_log_endio(struct bio *bio) | |||
| 227 | r5l_move_to_end_ios(log); | 536 | r5l_move_to_end_ios(log); |
| 228 | else | 537 | else |
| 229 | r5l_log_run_stripes(log); | 538 | r5l_log_run_stripes(log); |
| 539 | if (!list_empty(&log->running_ios)) { | ||
| 540 | /* | ||
| 541 | * FLUSH/FUA io_unit is deferred because of ordering, now we | ||
| 542 | * can dispatch it | ||
| 543 | */ | ||
| 544 | io_deferred = list_first_entry(&log->running_ios, | ||
| 545 | struct r5l_io_unit, log_sibling); | ||
| 546 | if (io_deferred->io_deferred) | ||
| 547 | schedule_work(&log->deferred_io_work); | ||
| 548 | } | ||
| 549 | |||
| 230 | spin_unlock_irqrestore(&log->io_list_lock, flags); | 550 | spin_unlock_irqrestore(&log->io_list_lock, flags); |
| 231 | 551 | ||
| 232 | if (log->need_cache_flush) | 552 | if (log->need_cache_flush) |
| 233 | md_wakeup_thread(log->rdev->mddev->thread); | 553 | md_wakeup_thread(log->rdev->mddev->thread); |
| 554 | |||
| 555 | if (io->has_null_flush) { | ||
| 556 | struct bio *bi; | ||
| 557 | |||
| 558 | WARN_ON(bio_list_empty(&io->flush_barriers)); | ||
| 559 | while ((bi = bio_list_pop(&io->flush_barriers)) != NULL) { | ||
| 560 | bio_endio(bi); | ||
| 561 | atomic_dec(&io->pending_stripe); | ||
| 562 | } | ||
| 563 | if (atomic_read(&io->pending_stripe) == 0) | ||
| 564 | __r5l_stripe_write_finished(io); | ||
| 565 | } | ||
| 566 | } | ||
| 567 | |||
| 568 | static void r5l_do_submit_io(struct r5l_log *log, struct r5l_io_unit *io) | ||
| 569 | { | ||
| 570 | unsigned long flags; | ||
| 571 | |||
| 572 | spin_lock_irqsave(&log->io_list_lock, flags); | ||
| 573 | __r5l_set_io_unit_state(io, IO_UNIT_IO_START); | ||
| 574 | spin_unlock_irqrestore(&log->io_list_lock, flags); | ||
| 575 | |||
| 576 | if (io->has_flush) | ||
| 577 | io->current_bio->bi_opf |= REQ_PREFLUSH; | ||
| 578 | if (io->has_fua) | ||
| 579 | io->current_bio->bi_opf |= REQ_FUA; | ||
| 580 | submit_bio(io->current_bio); | ||
| 581 | |||
| 582 | if (!io->split_bio) | ||
| 583 | return; | ||
| 584 | |||
| 585 | if (io->has_flush) | ||
| 586 | io->split_bio->bi_opf |= REQ_PREFLUSH; | ||
| 587 | if (io->has_fua) | ||
| 588 | io->split_bio->bi_opf |= REQ_FUA; | ||
| 589 | submit_bio(io->split_bio); | ||
| 590 | } | ||
| 591 | |||
| 592 | /* deferred io_unit will be dispatched here */ | ||
| 593 | static void r5l_submit_io_async(struct work_struct *work) | ||
| 594 | { | ||
| 595 | struct r5l_log *log = container_of(work, struct r5l_log, | ||
| 596 | deferred_io_work); | ||
| 597 | struct r5l_io_unit *io = NULL; | ||
| 598 | unsigned long flags; | ||
| 599 | |||
| 600 | spin_lock_irqsave(&log->io_list_lock, flags); | ||
| 601 | if (!list_empty(&log->running_ios)) { | ||
| 602 | io = list_first_entry(&log->running_ios, struct r5l_io_unit, | ||
| 603 | log_sibling); | ||
| 604 | if (!io->io_deferred) | ||
| 605 | io = NULL; | ||
| 606 | else | ||
| 607 | io->io_deferred = 0; | ||
| 608 | } | ||
| 609 | spin_unlock_irqrestore(&log->io_list_lock, flags); | ||
| 610 | if (io) | ||
| 611 | r5l_do_submit_io(log, io); | ||
| 234 | } | 612 | } |
| 235 | 613 | ||
| 236 | static void r5l_submit_current_io(struct r5l_log *log) | 614 | static void r5l_submit_current_io(struct r5l_log *log) |
| 237 | { | 615 | { |
| 238 | struct r5l_io_unit *io = log->current_io; | 616 | struct r5l_io_unit *io = log->current_io; |
| 617 | struct bio *bio; | ||
| 239 | struct r5l_meta_block *block; | 618 | struct r5l_meta_block *block; |
| 240 | unsigned long flags; | 619 | unsigned long flags; |
| 241 | u32 crc; | 620 | u32 crc; |
| 621 | bool do_submit = true; | ||
| 242 | 622 | ||
| 243 | if (!io) | 623 | if (!io) |
| 244 | return; | 624 | return; |
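r5l_log_endio() and r5l_submit_io_async() above implement the ordering rule that an io_unit carrying PREFLUSH/FUA may only be issued once it is the oldest entry on running_ios; otherwise its flush could complete ahead of earlier log writes, so it is marked io_deferred and dispatched from the work item when it reaches the head. A standalone model of that rule:

/* Standalone model of "flush/fua only from the head of the queue". */
#include <stdbool.h>
#include <stdio.h>

struct io_unit { bool has_flush_or_fua; bool deferred; int id; };

/* returns true if the unit may be submitted now */
static bool try_submit(struct io_unit *io, bool is_queue_head)
{
	if (io->has_flush_or_fua && !is_queue_head) {
		io->deferred = true;	/* wait until earlier log IO completes */
		return false;
	}
	return true;
}

int main(void)
{
	struct io_unit a = { .has_flush_or_fua = true, .id = 1 };

	printf("submit now: %d, deferred: %d\n",
	       try_submit(&a, false), a.deferred);	/* 0, 1 */
	printf("submit now: %d\n", try_submit(&a, true)); /* 1: now at head */
	return 0;
}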
| @@ -247,13 +627,20 @@ static void r5l_submit_current_io(struct r5l_log *log) | |||
| 247 | block->meta_size = cpu_to_le32(io->meta_offset); | 627 | block->meta_size = cpu_to_le32(io->meta_offset); |
| 248 | crc = crc32c_le(log->uuid_checksum, block, PAGE_SIZE); | 628 | crc = crc32c_le(log->uuid_checksum, block, PAGE_SIZE); |
| 249 | block->checksum = cpu_to_le32(crc); | 629 | block->checksum = cpu_to_le32(crc); |
| 630 | bio = io->current_bio; | ||
| 250 | 631 | ||
| 251 | log->current_io = NULL; | 632 | log->current_io = NULL; |
| 252 | spin_lock_irqsave(&log->io_list_lock, flags); | 633 | spin_lock_irqsave(&log->io_list_lock, flags); |
| 253 | __r5l_set_io_unit_state(io, IO_UNIT_IO_START); | 634 | if (io->has_flush || io->has_fua) { |
| 635 | if (io != list_first_entry(&log->running_ios, | ||
| 636 | struct r5l_io_unit, log_sibling)) { | ||
| 637 | io->io_deferred = 1; | ||
| 638 | do_submit = false; | ||
| 639 | } | ||
| 640 | } | ||
| 254 | spin_unlock_irqrestore(&log->io_list_lock, flags); | 641 | spin_unlock_irqrestore(&log->io_list_lock, flags); |
| 255 | 642 | if (do_submit) | |
| 256 | submit_bio(io->current_bio); | 643 | r5l_do_submit_io(log, io); |
| 257 | } | 644 | } |
| 258 | 645 | ||
| 259 | static struct bio *r5l_bio_alloc(struct r5l_log *log) | 646 | static struct bio *r5l_bio_alloc(struct r5l_log *log) |
| @@ -271,6 +658,7 @@ static void r5_reserve_log_entry(struct r5l_log *log, struct r5l_io_unit *io) | |||
| 271 | { | 658 | { |
| 272 | log->log_start = r5l_ring_add(log, log->log_start, BLOCK_SECTORS); | 659 | log->log_start = r5l_ring_add(log, log->log_start, BLOCK_SECTORS); |
| 273 | 660 | ||
| 661 | r5c_update_log_state(log); | ||
| 274 | /* | 662 | /* |
| 275 | * If we filled up the log device start from the beginning again, | 663 | * If we filled up the log device start from the beginning again, |
| 276 | * which will require a new bio. | 664 | * which will require a new bio. |
| @@ -297,6 +685,7 @@ static struct r5l_io_unit *r5l_new_meta(struct r5l_log *log) | |||
| 297 | io->log = log; | 685 | io->log = log; |
| 298 | INIT_LIST_HEAD(&io->log_sibling); | 686 | INIT_LIST_HEAD(&io->log_sibling); |
| 299 | INIT_LIST_HEAD(&io->stripe_list); | 687 | INIT_LIST_HEAD(&io->stripe_list); |
| 688 | bio_list_init(&io->flush_barriers); | ||
| 300 | io->state = IO_UNIT_RUNNING; | 689 | io->state = IO_UNIT_RUNNING; |
| 301 | 690 | ||
| 302 | io->meta_page = mempool_alloc(log->meta_pool, GFP_NOIO); | 691 | io->meta_page = mempool_alloc(log->meta_pool, GFP_NOIO); |
| @@ -367,12 +756,11 @@ static void r5l_append_payload_page(struct r5l_log *log, struct page *page) | |||
| 367 | struct r5l_io_unit *io = log->current_io; | 756 | struct r5l_io_unit *io = log->current_io; |
| 368 | 757 | ||
| 369 | if (io->need_split_bio) { | 758 | if (io->need_split_bio) { |
| 370 | struct bio *prev = io->current_bio; | 759 | BUG_ON(io->split_bio); |
| 371 | 760 | io->split_bio = io->current_bio; | |
| 372 | io->current_bio = r5l_bio_alloc(log); | 761 | io->current_bio = r5l_bio_alloc(log); |
| 373 | bio_chain(io->current_bio, prev); | 762 | bio_chain(io->current_bio, io->split_bio); |
| 374 | 763 | io->need_split_bio = false; | |
| 375 | submit_bio(prev); | ||
| 376 | } | 764 | } |
| 377 | 765 | ||
| 378 | if (!bio_add_page(io->current_bio, page, PAGE_SIZE, 0)) | 766 | if (!bio_add_page(io->current_bio, page, PAGE_SIZE, 0)) |
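When the log wraps, r5l_append_payload_page() now keeps the first half as io->split_bio and chains the fresh bio to it instead of submitting the first half immediately, so both halves are issued together from r5l_do_submit_io() and can both pick up the PREFLUSH/FUA bits. A toy model of the chaining behaviour, assuming only that bio_chain(child, parent) makes the parent's completion wait for the child, which matches its use here:

/* Toy model of chained completion: the parent only completes after the
 * child does. */
#include <stdio.h>

struct toy_bio {
	int remaining;
	struct toy_bio *parent;
	const char *name;
};

static void toy_endio(struct toy_bio *b)
{
	if (--b->remaining == 0) {
		printf("%s completed\n", b->name);
		if (b->parent)
			toy_endio(b->parent);	/* propagate to the chained parent */
	}
}

int main(void)
{
	/* chaining holds an extra reference on the parent ("first half") */
	struct toy_bio first  = { .remaining = 2, .parent = NULL,   .name = "first half" };
	struct toy_bio second = { .remaining = 1, .parent = &first, .name = "second half" };

	toy_endio(&first);	/* device finished the first half: still waiting */
	toy_endio(&second);	/* child done -> parent completes as well */
	return 0;
}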
| @@ -401,50 +789,85 @@ static int r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh, | |||
| 401 | 789 | ||
| 402 | io = log->current_io; | 790 | io = log->current_io; |
| 403 | 791 | ||
| 792 | if (test_and_clear_bit(STRIPE_R5C_PREFLUSH, &sh->state)) | ||
| 793 | io->has_flush = 1; | ||
| 794 | |||
| 404 | for (i = 0; i < sh->disks; i++) { | 795 | for (i = 0; i < sh->disks; i++) { |
| 405 | if (!test_bit(R5_Wantwrite, &sh->dev[i].flags)) | 796 | if (!test_bit(R5_Wantwrite, &sh->dev[i].flags) || |
| 797 | test_bit(R5_InJournal, &sh->dev[i].flags)) | ||
| 406 | continue; | 798 | continue; |
| 407 | if (i == sh->pd_idx || i == sh->qd_idx) | 799 | if (i == sh->pd_idx || i == sh->qd_idx) |
| 408 | continue; | 800 | continue; |
| 801 | if (test_bit(R5_WantFUA, &sh->dev[i].flags) && | ||
| 802 | log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK) { | ||
| 803 | io->has_fua = 1; | ||
| 804 | /* | ||
| 805 | * we need to flush journal to make sure recovery can | ||
| 806 | * reach the data with fua flag | ||
| 807 | */ | ||
| 808 | io->has_flush = 1; | ||
| 809 | } | ||
| 409 | r5l_append_payload_meta(log, R5LOG_PAYLOAD_DATA, | 810 | r5l_append_payload_meta(log, R5LOG_PAYLOAD_DATA, |
| 410 | raid5_compute_blocknr(sh, i, 0), | 811 | raid5_compute_blocknr(sh, i, 0), |
| 411 | sh->dev[i].log_checksum, 0, false); | 812 | sh->dev[i].log_checksum, 0, false); |
| 412 | r5l_append_payload_page(log, sh->dev[i].page); | 813 | r5l_append_payload_page(log, sh->dev[i].page); |
| 413 | } | 814 | } |
| 414 | 815 | ||
| 415 | if (sh->qd_idx >= 0) { | 816 | if (parity_pages == 2) { |
| 416 | r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY, | 817 | r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY, |
| 417 | sh->sector, sh->dev[sh->pd_idx].log_checksum, | 818 | sh->sector, sh->dev[sh->pd_idx].log_checksum, |
| 418 | sh->dev[sh->qd_idx].log_checksum, true); | 819 | sh->dev[sh->qd_idx].log_checksum, true); |
| 419 | r5l_append_payload_page(log, sh->dev[sh->pd_idx].page); | 820 | r5l_append_payload_page(log, sh->dev[sh->pd_idx].page); |
| 420 | r5l_append_payload_page(log, sh->dev[sh->qd_idx].page); | 821 | r5l_append_payload_page(log, sh->dev[sh->qd_idx].page); |
| 421 | } else { | 822 | } else if (parity_pages == 1) { |
| 422 | r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY, | 823 | r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY, |
| 423 | sh->sector, sh->dev[sh->pd_idx].log_checksum, | 824 | sh->sector, sh->dev[sh->pd_idx].log_checksum, |
| 424 | 0, false); | 825 | 0, false); |
| 425 | r5l_append_payload_page(log, sh->dev[sh->pd_idx].page); | 826 | r5l_append_payload_page(log, sh->dev[sh->pd_idx].page); |
| 426 | } | 827 | } else /* Just writing data, not parity, in caching phase */ |
| 828 | BUG_ON(parity_pages != 0); | ||
| 427 | 829 | ||
| 428 | list_add_tail(&sh->log_list, &io->stripe_list); | 830 | list_add_tail(&sh->log_list, &io->stripe_list); |
| 429 | atomic_inc(&io->pending_stripe); | 831 | atomic_inc(&io->pending_stripe); |
| 430 | sh->log_io = io; | 832 | sh->log_io = io; |
| 431 | 833 | ||
| 834 | if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) | ||
| 835 | return 0; | ||
| 836 | |||
| 837 | if (sh->log_start == MaxSector) { | ||
| 838 | BUG_ON(!list_empty(&sh->r5c)); | ||
| 839 | sh->log_start = io->log_start; | ||
| 840 | spin_lock_irq(&log->stripe_in_journal_lock); | ||
| 841 | list_add_tail(&sh->r5c, | ||
| 842 | &log->stripe_in_journal_list); | ||
| 843 | spin_unlock_irq(&log->stripe_in_journal_lock); | ||
| 844 | atomic_inc(&log->stripe_in_journal_count); | ||
| 845 | } | ||
| 432 | return 0; | 846 | return 0; |
| 433 | } | 847 | } |
| 434 | 848 | ||
| 435 | static void r5l_wake_reclaim(struct r5l_log *log, sector_t space); | 849 | /* add stripe to no_space_stripes, and then wake up reclaim */ |
| 850 | static inline void r5l_add_no_space_stripe(struct r5l_log *log, | ||
| 851 | struct stripe_head *sh) | ||
| 852 | { | ||
| 853 | spin_lock(&log->no_space_stripes_lock); | ||
| 854 | list_add_tail(&sh->log_list, &log->no_space_stripes); | ||
| 855 | spin_unlock(&log->no_space_stripes_lock); | ||
| 856 | } | ||
| 857 | |||
| 436 | /* | 858 | /* |
| 437 | * running in raid5d, where reclaim could wait for raid5d too (when it flushes | 859 | * running in raid5d, where reclaim could wait for raid5d too (when it flushes |
| 438 | * data from log to raid disks), so we shouldn't wait for reclaim here | 860 | * data from log to raid disks), so we shouldn't wait for reclaim here |
| 439 | */ | 861 | */ |
| 440 | int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh) | 862 | int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh) |
| 441 | { | 863 | { |
| 864 | struct r5conf *conf = sh->raid_conf; | ||
| 442 | int write_disks = 0; | 865 | int write_disks = 0; |
| 443 | int data_pages, parity_pages; | 866 | int data_pages, parity_pages; |
| 444 | int meta_size; | ||
| 445 | int reserve; | 867 | int reserve; |
| 446 | int i; | 868 | int i; |
| 447 | int ret = 0; | 869 | int ret = 0; |
| 870 | bool wake_reclaim = false; | ||
| 448 | 871 | ||
| 449 | if (!log) | 872 | if (!log) |
| 450 | return -EAGAIN; | 873 | return -EAGAIN; |
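In write-back mode, r5l_log_stripe() above also records where a stripe first landed in the log (sh->log_start) and appends it to stripe_in_journal_list, which is what the earlier space accounting consumes; r5l_add_no_space_stripe() parks stripes whose reservation cannot be met until reclaim frees room. That reservation, computed a little further down, is one meta block plus one block per data/parity page written. A worked example with illustrative page counts:

/* Worked example of the per-stripe log reservation, in 512-byte sectors. */
#include <stdio.h>

#define PAGE_SHIFT 12	/* assume 4k pages for the example */

int main(void)
{
	int data_pages = 4, parity_pages = 2;	/* e.g. a RAID6 partial write */
	int write_disks = data_pages + parity_pages;

	int reserve = (1 + write_disks) << (PAGE_SHIFT - 9);

	printf("reserve = %d sectors (%d KiB)\n", reserve, reserve / 2); /* 56, 28 */
	return 0;
}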
| @@ -456,11 +879,15 @@ int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh) | |||
| 456 | return -EAGAIN; | 879 | return -EAGAIN; |
| 457 | } | 880 | } |
| 458 | 881 | ||
| 882 | WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state)); | ||
| 883 | |||
| 459 | for (i = 0; i < sh->disks; i++) { | 884 | for (i = 0; i < sh->disks; i++) { |
| 460 | void *addr; | 885 | void *addr; |
| 461 | 886 | ||
| 462 | if (!test_bit(R5_Wantwrite, &sh->dev[i].flags)) | 887 | if (!test_bit(R5_Wantwrite, &sh->dev[i].flags) || |
| 888 | test_bit(R5_InJournal, &sh->dev[i].flags)) | ||
| 463 | continue; | 889 | continue; |
| 890 | |||
| 464 | write_disks++; | 891 | write_disks++; |
| 465 | /* checksum is already calculated in last run */ | 892 | /* checksum is already calculated in last run */ |
| 466 | if (test_bit(STRIPE_LOG_TRAPPED, &sh->state)) | 893 | if (test_bit(STRIPE_LOG_TRAPPED, &sh->state)) |
| @@ -473,15 +900,6 @@ int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh) | |||
| 473 | parity_pages = 1 + !!(sh->qd_idx >= 0); | 900 | parity_pages = 1 + !!(sh->qd_idx >= 0); |
| 474 | data_pages = write_disks - parity_pages; | 901 | data_pages = write_disks - parity_pages; |
| 475 | 902 | ||
| 476 | meta_size = | ||
| 477 | ((sizeof(struct r5l_payload_data_parity) + sizeof(__le32)) | ||
| 478 | * data_pages) + | ||
| 479 | sizeof(struct r5l_payload_data_parity) + | ||
| 480 | sizeof(__le32) * parity_pages; | ||
| 481 | /* Doesn't work with very big raid array */ | ||
| 482 | if (meta_size + sizeof(struct r5l_meta_block) > PAGE_SIZE) | ||
| 483 | return -EINVAL; | ||
| 484 | |||
| 485 | set_bit(STRIPE_LOG_TRAPPED, &sh->state); | 903 | set_bit(STRIPE_LOG_TRAPPED, &sh->state); |
| 486 | /* | 904 | /* |
| 487 | * The stripe must enter state machine again to finish the write, so | 905 | * The stripe must enter state machine again to finish the write, so |
| @@ -493,22 +911,49 @@ int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh) | |||
| 493 | mutex_lock(&log->io_mutex); | 911 | mutex_lock(&log->io_mutex); |
| 494 | /* meta + data */ | 912 | /* meta + data */ |
| 495 | reserve = (1 + write_disks) << (PAGE_SHIFT - 9); | 913 | reserve = (1 + write_disks) << (PAGE_SHIFT - 9); |
| 496 | if (!r5l_has_free_space(log, reserve)) { | ||
| 497 | spin_lock(&log->no_space_stripes_lock); | ||
| 498 | list_add_tail(&sh->log_list, &log->no_space_stripes); | ||
| 499 | spin_unlock(&log->no_space_stripes_lock); | ||
| 500 | 914 | ||
| 501 | r5l_wake_reclaim(log, reserve); | 915 | if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) { |
| 502 | } else { | 916 | if (!r5l_has_free_space(log, reserve)) { |
| 503 | ret = r5l_log_stripe(log, sh, data_pages, parity_pages); | 917 | r5l_add_no_space_stripe(log, sh); |
| 504 | if (ret) { | 918 | wake_reclaim = true; |
| 505 | spin_lock_irq(&log->io_list_lock); | 919 | } else { |
| 506 | list_add_tail(&sh->log_list, &log->no_mem_stripes); | 920 | ret = r5l_log_stripe(log, sh, data_pages, parity_pages); |
| 507 | spin_unlock_irq(&log->io_list_lock); | 921 | if (ret) { |
| 922 | spin_lock_irq(&log->io_list_lock); | ||
| 923 | list_add_tail(&sh->log_list, | ||
| 924 | &log->no_mem_stripes); | ||
| 925 | spin_unlock_irq(&log->io_list_lock); | ||
| 926 | } | ||
| 927 | } | ||
| 928 | } else { /* R5C_JOURNAL_MODE_WRITE_BACK */ | ||
| 929 | /* | ||
| 930 | * log space critical, do not process stripes that are | ||
| 931 | * not in cache yet (sh->log_start == MaxSector). | ||
| 932 | */ | ||
| 933 | if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) && | ||
| 934 | sh->log_start == MaxSector) { | ||
| 935 | r5l_add_no_space_stripe(log, sh); | ||
| 936 | wake_reclaim = true; | ||
| 937 | reserve = 0; | ||
| 938 | } else if (!r5l_has_free_space(log, reserve)) { | ||
| 939 | if (sh->log_start == log->last_checkpoint) | ||
| 940 | BUG(); | ||
| 941 | else | ||
| 942 | r5l_add_no_space_stripe(log, sh); | ||
| 943 | } else { | ||
| 944 | ret = r5l_log_stripe(log, sh, data_pages, parity_pages); | ||
| 945 | if (ret) { | ||
| 946 | spin_lock_irq(&log->io_list_lock); | ||
| 947 | list_add_tail(&sh->log_list, | ||
| 948 | &log->no_mem_stripes); | ||
| 949 | spin_unlock_irq(&log->io_list_lock); | ||
| 950 | } | ||
| 508 | } | 951 | } |
| 509 | } | 952 | } |
| 510 | 953 | ||
| 511 | mutex_unlock(&log->io_mutex); | 954 | mutex_unlock(&log->io_mutex); |
| 955 | if (wake_reclaim) | ||
| 956 | r5l_wake_reclaim(log, reserve); | ||
| 512 | return 0; | 957 | return 0; |
| 513 | } | 958 | } |
| 514 | 959 | ||
| @@ -525,17 +970,34 @@ int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio) | |||
| 525 | { | 970 | { |
| 526 | if (!log) | 971 | if (!log) |
| 527 | return -ENODEV; | 972 | return -ENODEV; |
| 528 | /* | 973 | |
| 529 | * we flush log disk cache first, then write stripe data to raid disks. | 974 | if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) { |
| 530 | * So if bio is finished, the log disk cache is flushed already. The | 975 | /* |
| 531 | * recovery guarantees we can recovery the bio from log disk, so we | 976 | * in write through (journal only) |
| 532 | * don't need to flush again | 977 | * we flush log disk cache first, then write stripe data to |
| 533 | */ | 978 | * raid disks. So if bio is finished, the log disk cache is |
| 534 | if (bio->bi_iter.bi_size == 0) { | 979 | * flushed already. The recovery guarantees we can recover |
| 535 | bio_endio(bio); | 980 | * the bio from log disk, so we don't need to flush again |
| 536 | return 0; | 981 | */ |
| 982 | if (bio->bi_iter.bi_size == 0) { | ||
| 983 | bio_endio(bio); | ||
| 984 | return 0; | ||
| 985 | } | ||
| 986 | bio->bi_opf &= ~REQ_PREFLUSH; | ||
| 987 | } else { | ||
| 988 | /* write back (with cache) */ | ||
| 989 | if (bio->bi_iter.bi_size == 0) { | ||
| 990 | mutex_lock(&log->io_mutex); | ||
| 991 | r5l_get_meta(log, 0); | ||
| 992 | bio_list_add(&log->current_io->flush_barriers, bio); | ||
| 993 | log->current_io->has_flush = 1; | ||
| 994 | log->current_io->has_null_flush = 1; | ||
| 995 | atomic_inc(&log->current_io->pending_stripe); | ||
| 996 | r5l_submit_current_io(log); | ||
| 997 | mutex_unlock(&log->io_mutex); | ||
| 998 | return 0; | ||
| 999 | } | ||
| 537 | } | 1000 | } |
| 538 | bio->bi_opf &= ~REQ_PREFLUSH; | ||
| 539 | return -EAGAIN; | 1001 | return -EAGAIN; |
| 540 | } | 1002 | } |
| 541 | 1003 | ||
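For reference, a minimal sketch of the four outcomes r5l_handle_flush_request() can now take. handle_flush() and the enum names are invented for illustration; the real function returns 0 or -EAGAIN and manipulates the bio and io_unit directly:

#include <stdio.h>

enum mode { WRITE_THROUGH, WRITE_BACK };
enum flush_outcome {
    COMPLETE_NOW,            /* bio_endio() right away */
    QUEUE_AS_BARRIER,        /* attach to current io_unit (write-back) */
    RETRY_WITHOUT_PREFLUSH,  /* -EAGAIN after clearing REQ_PREFLUSH */
    RETRY_WITH_PREFLUSH      /* -EAGAIN, flag left in place */
};

static enum flush_outcome handle_flush(enum mode mode, unsigned int bi_size)
{
    if (mode == WRITE_THROUGH) {
        if (bi_size == 0)
            return COMPLETE_NOW;  /* log cache already flushed before raid */
        return RETRY_WITHOUT_PREFLUSH;
    }
    /* write-back */
    if (bi_size == 0)
        return QUEUE_AS_BARRIER;
    return RETRY_WITH_PREFLUSH;
}

int main(void)
{
    printf("%d %d\n", handle_flush(WRITE_THROUGH, 0),
           handle_flush(WRITE_BACK, 4096));
    return 0;
}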
| @@ -555,10 +1017,40 @@ static void r5l_run_no_space_stripes(struct r5l_log *log) | |||
| 555 | spin_unlock(&log->no_space_stripes_lock); | 1017 | spin_unlock(&log->no_space_stripes_lock); |
| 556 | } | 1018 | } |
| 557 | 1019 | ||
| 1020 | /* | ||
| 1021 | * calculate new last_checkpoint | ||
| 1022 | * for write through mode, returns log->next_checkpoint | ||
| 1023 | * for write back, returns log_start of first sh in stripe_in_journal_list | ||
| 1024 | */ | ||
| 1025 | static sector_t r5c_calculate_new_cp(struct r5conf *conf) | ||
| 1026 | { | ||
| 1027 | struct stripe_head *sh; | ||
| 1028 | struct r5l_log *log = conf->log; | ||
| 1029 | sector_t new_cp; | ||
| 1030 | unsigned long flags; | ||
| 1031 | |||
| 1032 | if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) | ||
| 1033 | return log->next_checkpoint; | ||
| 1034 | |||
| 1035 | spin_lock_irqsave(&log->stripe_in_journal_lock, flags); | ||
| 1036 | if (list_empty(&conf->log->stripe_in_journal_list)) { | ||
| 1037 | /* all stripes flushed */ | ||
| 1038 | spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags); | ||
| 1039 | return log->next_checkpoint; | ||
| 1040 | } | ||
| 1041 | sh = list_first_entry(&conf->log->stripe_in_journal_list, | ||
| 1042 | struct stripe_head, r5c); | ||
| 1043 | new_cp = sh->log_start; | ||
| 1044 | spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags); | ||
| 1045 | return new_cp; | ||
| 1046 | } | ||
| 1047 | |||
| 558 | static sector_t r5l_reclaimable_space(struct r5l_log *log) | 1048 | static sector_t r5l_reclaimable_space(struct r5l_log *log) |
| 559 | { | 1049 | { |
| 1050 | struct r5conf *conf = log->rdev->mddev->private; | ||
| 1051 | |||
| 560 | return r5l_ring_distance(log, log->last_checkpoint, | 1052 | return r5l_ring_distance(log, log->last_checkpoint, |
| 561 | log->next_checkpoint); | 1053 | r5c_calculate_new_cp(conf)); |
| 562 | } | 1054 | } |
| 563 | 1055 | ||
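r5l_reclaimable_space() now measures from last_checkpoint to the checkpoint computed by r5c_calculate_new_cp(). Below is a small user-space sketch of the circular-log distance involved; ring_distance() and the 1024-sector device size are assumptions standing in for r5l_ring_distance() and log->device_size:

#include <stdio.h>

typedef unsigned long long sector_t;

static sector_t ring_distance(sector_t dev_size, sector_t start, sector_t end)
{
    /* distance from start to end going forward around the ring */
    return (end >= start) ? end - start : end + dev_size - start;
}

int main(void)
{
    sector_t dev_size = 1024;            /* log device size in sectors */
    sector_t last_cp = 1000, new_cp = 40;

    /* reclaimable space is everything between the old checkpoint and the
     * newly computed one (which here has wrapped past the device end) */
    printf("reclaimable = %llu sectors\n",
           ring_distance(dev_size, last_cp, new_cp));   /* 64 */
    return 0;
}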
| 564 | static void r5l_run_no_mem_stripe(struct r5l_log *log) | 1056 | static void r5l_run_no_mem_stripe(struct r5l_log *log) |
| @@ -589,7 +1081,6 @@ static bool r5l_complete_finished_ios(struct r5l_log *log) | |||
| 589 | break; | 1081 | break; |
| 590 | 1082 | ||
| 591 | log->next_checkpoint = io->log_start; | 1083 | log->next_checkpoint = io->log_start; |
| 592 | log->next_cp_seq = io->seq; | ||
| 593 | 1084 | ||
| 594 | list_del(&io->log_sibling); | 1085 | list_del(&io->log_sibling); |
| 595 | mempool_free(io, log->io_pool); | 1086 | mempool_free(io, log->io_pool); |
| @@ -604,6 +1095,7 @@ static bool r5l_complete_finished_ios(struct r5l_log *log) | |||
| 604 | static void __r5l_stripe_write_finished(struct r5l_io_unit *io) | 1095 | static void __r5l_stripe_write_finished(struct r5l_io_unit *io) |
| 605 | { | 1096 | { |
| 606 | struct r5l_log *log = io->log; | 1097 | struct r5l_log *log = io->log; |
| 1098 | struct r5conf *conf = log->rdev->mddev->private; | ||
| 607 | unsigned long flags; | 1099 | unsigned long flags; |
| 608 | 1100 | ||
| 609 | spin_lock_irqsave(&log->io_list_lock, flags); | 1101 | spin_lock_irqsave(&log->io_list_lock, flags); |
| @@ -614,7 +1106,8 @@ static void __r5l_stripe_write_finished(struct r5l_io_unit *io) | |||
| 614 | return; | 1106 | return; |
| 615 | } | 1107 | } |
| 616 | 1108 | ||
| 617 | if (r5l_reclaimable_space(log) > log->max_free_space) | 1109 | if (r5l_reclaimable_space(log) > log->max_free_space || |
| 1110 | test_bit(R5C_LOG_TIGHT, &conf->cache_state)) | ||
| 618 | r5l_wake_reclaim(log, 0); | 1111 | r5l_wake_reclaim(log, 0); |
| 619 | 1112 | ||
| 620 | spin_unlock_irqrestore(&log->io_list_lock, flags); | 1113 | spin_unlock_irqrestore(&log->io_list_lock, flags); |
| @@ -713,8 +1206,8 @@ static void r5l_write_super_and_discard_space(struct r5l_log *log, | |||
| 713 | * there is a deadlock. We workaround this issue with a trylock. | 1206 | * there is a deadlock. We workaround this issue with a trylock. |
| 714 | * FIXME: we could miss discard if we can't take reconfig mutex | 1207 | * FIXME: we could miss discard if we can't take reconfig mutex |
| 715 | */ | 1208 | */ |
| 716 | set_mask_bits(&mddev->flags, 0, | 1209 | set_mask_bits(&mddev->sb_flags, 0, |
| 717 | BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING)); | 1210 | BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING)); |
| 718 | if (!mddev_trylock(mddev)) | 1211 | if (!mddev_trylock(mddev)) |
| 719 | return; | 1212 | return; |
| 720 | md_update_sb(mddev, 1); | 1213 | md_update_sb(mddev, 1); |
| @@ -735,15 +1228,148 @@ static void r5l_write_super_and_discard_space(struct r5l_log *log, | |||
| 735 | } | 1228 | } |
| 736 | } | 1229 | } |
| 737 | 1230 | ||
| 1231 | /* | ||
| 1232 | * r5c_flush_stripe moves stripe from cached list to handle_list. When called, | ||
| 1233 | * the stripe must be on r5c_cached_full_stripes or r5c_cached_partial_stripes. | ||
| 1234 | * | ||
| 1235 | * must hold conf->device_lock | ||
| 1236 | */ | ||
| 1237 | static void r5c_flush_stripe(struct r5conf *conf, struct stripe_head *sh) | ||
| 1238 | { | ||
| 1239 | BUG_ON(list_empty(&sh->lru)); | ||
| 1240 | BUG_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state)); | ||
| 1241 | BUG_ON(test_bit(STRIPE_HANDLE, &sh->state)); | ||
| 1242 | |||
| 1243 | /* | ||
| 1244 | * The stripe is not ON_RELEASE_LIST, so it is safe to call | ||
| 1245 | * raid5_release_stripe() while holding conf->device_lock | ||
| 1246 | */ | ||
| 1247 | BUG_ON(test_bit(STRIPE_ON_RELEASE_LIST, &sh->state)); | ||
| 1248 | assert_spin_locked(&conf->device_lock); | ||
| 1249 | |||
| 1250 | list_del_init(&sh->lru); | ||
| 1251 | atomic_inc(&sh->count); | ||
| 1252 | |||
| 1253 | set_bit(STRIPE_HANDLE, &sh->state); | ||
| 1254 | atomic_inc(&conf->active_stripes); | ||
| 1255 | r5c_make_stripe_write_out(sh); | ||
| 1256 | |||
| 1257 | raid5_release_stripe(sh); | ||
| 1258 | } | ||
| 1259 | |||
| 1260 | /* | ||
| 1261 | * if num == 0, flush all full stripes | ||
| 1262 | * if num > 0, flush all full stripes. If less than num full stripes are | ||
| 1263 | * flushed, flush some partial stripes until totally num stripes are | ||
| 1264 | * flushed or there is no more cached stripes. | ||
| 1265 | */ | ||
| 1266 | void r5c_flush_cache(struct r5conf *conf, int num) | ||
| 1267 | { | ||
| 1268 | int count; | ||
| 1269 | struct stripe_head *sh, *next; | ||
| 1270 | |||
| 1271 | assert_spin_locked(&conf->device_lock); | ||
| 1272 | if (!conf->log) | ||
| 1273 | return; | ||
| 1274 | |||
| 1275 | count = 0; | ||
| 1276 | list_for_each_entry_safe(sh, next, &conf->r5c_full_stripe_list, lru) { | ||
| 1277 | r5c_flush_stripe(conf, sh); | ||
| 1278 | count++; | ||
| 1279 | } | ||
| 1280 | |||
| 1281 | if (count >= num) | ||
| 1282 | return; | ||
| 1283 | list_for_each_entry_safe(sh, next, | ||
| 1284 | &conf->r5c_partial_stripe_list, lru) { | ||
| 1285 | r5c_flush_stripe(conf, sh); | ||
| 1286 | if (++count >= num) | ||
| 1287 | break; | ||
| 1288 | } | ||
| 1289 | } | ||
| 1290 | |||
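A tiny arithmetic sketch of the r5c_flush_cache() policy described in the comment above: every full stripe is always flushed, and partial stripes only top up the count until num are flushed. plan_flush() is an invented stand-in for the list walk, not driver code:

#include <stdio.h>

static void plan_flush(int full, int partial, int num,
                       int *flush_full, int *flush_partial)
{
    *flush_full = full;                  /* full stripes always go */
    *flush_partial = 0;
    if (num > full) {
        *flush_partial = num - full;
        if (*flush_partial > partial)
            *flush_partial = partial;    /* ran out of cached stripes */
    }
}

int main(void)
{
    int ff, fp;

    plan_flush(3, 10, 8, &ff, &fp);   /* flush 3 full + 5 partial */
    printf("full=%d partial=%d\n", ff, fp);
    plan_flush(3, 10, 0, &ff, &fp);   /* num == 0: full stripes only */
    printf("full=%d partial=%d\n", ff, fp);
    return 0;
}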
| 1291 | static void r5c_do_reclaim(struct r5conf *conf) | ||
| 1292 | { | ||
| 1293 | struct r5l_log *log = conf->log; | ||
| 1294 | struct stripe_head *sh; | ||
| 1295 | int count = 0; | ||
| 1296 | unsigned long flags; | ||
| 1297 | int total_cached; | ||
| 1298 | int stripes_to_flush; | ||
| 1299 | |||
| 1300 | if (!r5c_is_writeback(log)) | ||
| 1301 | return; | ||
| 1302 | |||
| 1303 | total_cached = atomic_read(&conf->r5c_cached_partial_stripes) + | ||
| 1304 | atomic_read(&conf->r5c_cached_full_stripes); | ||
| 1305 | |||
| 1306 | if (total_cached > conf->min_nr_stripes * 3 / 4 || | ||
| 1307 | atomic_read(&conf->empty_inactive_list_nr) > 0) | ||
| 1308 | /* | ||
| 1309 | * if stripe cache pressure high, flush all full stripes and | ||
| 1310 | * some partial stripes | ||
| 1311 | */ | ||
| 1312 | stripes_to_flush = R5C_RECLAIM_STRIPE_GROUP; | ||
| 1313 | else if (total_cached > conf->min_nr_stripes * 1 / 2 || | ||
| 1314 | atomic_read(&conf->r5c_cached_full_stripes) > | ||
| 1315 | R5C_FULL_STRIPE_FLUSH_BATCH) | ||
| 1316 | /* | ||
| 1317 | * if stripe cache pressure is moderate, or if there are many full | ||
| 1318 | * stripes, flush all full stripes | ||
| 1319 | */ | ||
| 1320 | stripes_to_flush = 0; | ||
| 1321 | else | ||
| 1322 | /* no need to flush */ | ||
| 1323 | stripes_to_flush = -1; | ||
| 1324 | |||
| 1325 | if (stripes_to_flush >= 0) { | ||
| 1326 | spin_lock_irqsave(&conf->device_lock, flags); | ||
| 1327 | r5c_flush_cache(conf, stripes_to_flush); | ||
| 1328 | spin_unlock_irqrestore(&conf->device_lock, flags); | ||
| 1329 | } | ||
| 1330 | |||
| 1331 | /* if log space is tight, flush stripes on stripe_in_journal_list */ | ||
| 1332 | if (test_bit(R5C_LOG_TIGHT, &conf->cache_state)) { | ||
| 1333 | spin_lock_irqsave(&log->stripe_in_journal_lock, flags); | ||
| 1334 | spin_lock(&conf->device_lock); | ||
| 1335 | list_for_each_entry(sh, &log->stripe_in_journal_list, r5c) { | ||
| 1336 | /* | ||
| 1337 | * stripes on stripe_in_journal_list could be in any | ||
| 1338 | * state of the stripe_cache state machine. In this | ||
| 1339 | * case, we only want to flush stripe on | ||
| 1340 | * r5c_cached_full/partial_stripes. The following | ||
| 1341 | * condition makes sure the stripe is on one of the | ||
| 1342 | * two lists. | ||
| 1343 | */ | ||
| 1344 | if (!list_empty(&sh->lru) && | ||
| 1345 | !test_bit(STRIPE_HANDLE, &sh->state) && | ||
| 1346 | atomic_read(&sh->count) == 0) { | ||
| 1347 | r5c_flush_stripe(conf, sh); | ||
| 1348 | } | ||
| 1349 | if (count++ >= R5C_RECLAIM_STRIPE_GROUP) | ||
| 1350 | break; | ||
| 1351 | } | ||
| 1352 | spin_unlock(&conf->device_lock); | ||
| 1353 | spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags); | ||
| 1354 | } | ||
| 1355 | |||
| 1356 | if (!test_bit(R5C_LOG_CRITICAL, &conf->cache_state)) | ||
| 1357 | r5l_run_no_space_stripes(log); | ||
| 1358 | |||
| 1359 | md_wakeup_thread(conf->mddev->thread); | ||
| 1360 | } | ||
| 738 | 1361 | ||
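The pressure thresholds used by r5c_do_reclaim() can be summarized as below. This is a user-space sketch; RECLAIM_GROUP and FULL_FLUSH_BATCH are placeholder values for R5C_RECLAIM_STRIPE_GROUP and R5C_FULL_STRIPE_FLUSH_BATCH, which are defined elsewhere in the driver:

#include <stdio.h>

#define RECLAIM_GROUP     64    /* placeholder constants */
#define FULL_FLUSH_BATCH  256

/* returns: >0 flush full + some partial, 0 flush full only, -1 no flush */
static int stripes_to_flush(int total_cached, int cached_full,
                            int min_nr_stripes, int empty_inactive_lists)
{
    if (total_cached > min_nr_stripes * 3 / 4 || empty_inactive_lists > 0)
        return RECLAIM_GROUP;            /* high cache pressure */
    if (total_cached > min_nr_stripes / 2 || cached_full > FULL_FLUSH_BATCH)
        return 0;                        /* moderate pressure */
    return -1;                           /* leave the cache alone */
}

int main(void)
{
    printf("%d\n", stripes_to_flush(200, 10, 256, 0));   /* 64 */
    printf("%d\n", stripes_to_flush(150, 10, 256, 0));   /* 0 */
    printf("%d\n", stripes_to_flush(50, 10, 256, 0));    /* -1 */
    return 0;
}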
| 739 | static void r5l_do_reclaim(struct r5l_log *log) | 1362 | static void r5l_do_reclaim(struct r5l_log *log) |
| 740 | { | 1363 | { |
| 1364 | struct r5conf *conf = log->rdev->mddev->private; | ||
| 741 | sector_t reclaim_target = xchg(&log->reclaim_target, 0); | 1365 | sector_t reclaim_target = xchg(&log->reclaim_target, 0); |
| 742 | sector_t reclaimable; | 1366 | sector_t reclaimable; |
| 743 | sector_t next_checkpoint; | 1367 | sector_t next_checkpoint; |
| 744 | u64 next_cp_seq; | 1368 | bool write_super; |
| 745 | 1369 | ||
| 746 | spin_lock_irq(&log->io_list_lock); | 1370 | spin_lock_irq(&log->io_list_lock); |
| 1371 | write_super = r5l_reclaimable_space(log) > log->max_free_space || | ||
| 1372 | reclaim_target != 0 || !list_empty(&log->no_space_stripes); | ||
| 747 | /* | 1373 | /* |
| 748 | * move proper io_unit to reclaim list. We should not change the order. | 1374 | * move proper io_unit to reclaim list. We should not change the order. |
| 749 | * reclaimable/unreclaimable io_unit can be mixed in the list, we | 1375 | * reclaimable/unreclaimable io_unit can be mixed in the list, we |
| @@ -764,12 +1390,12 @@ static void r5l_do_reclaim(struct r5l_log *log) | |||
| 764 | log->io_list_lock); | 1390 | log->io_list_lock); |
| 765 | } | 1391 | } |
| 766 | 1392 | ||
| 767 | next_checkpoint = log->next_checkpoint; | 1393 | next_checkpoint = r5c_calculate_new_cp(conf); |
| 768 | next_cp_seq = log->next_cp_seq; | ||
| 769 | spin_unlock_irq(&log->io_list_lock); | 1394 | spin_unlock_irq(&log->io_list_lock); |
| 770 | 1395 | ||
| 771 | BUG_ON(reclaimable < 0); | 1396 | BUG_ON(reclaimable < 0); |
| 772 | if (reclaimable == 0) | 1397 | |
| 1398 | if (reclaimable == 0 || !write_super) | ||
| 773 | return; | 1399 | return; |
| 774 | 1400 | ||
| 775 | /* | 1401 | /* |
| @@ -781,7 +1407,7 @@ static void r5l_do_reclaim(struct r5l_log *log) | |||
| 781 | 1407 | ||
| 782 | mutex_lock(&log->io_mutex); | 1408 | mutex_lock(&log->io_mutex); |
| 783 | log->last_checkpoint = next_checkpoint; | 1409 | log->last_checkpoint = next_checkpoint; |
| 784 | log->last_cp_seq = next_cp_seq; | 1410 | r5c_update_log_state(log); |
| 785 | mutex_unlock(&log->io_mutex); | 1411 | mutex_unlock(&log->io_mutex); |
| 786 | 1412 | ||
| 787 | r5l_run_no_space_stripes(log); | 1413 | r5l_run_no_space_stripes(log); |
| @@ -795,14 +1421,17 @@ static void r5l_reclaim_thread(struct md_thread *thread) | |||
| 795 | 1421 | ||
| 796 | if (!log) | 1422 | if (!log) |
| 797 | return; | 1423 | return; |
| 1424 | r5c_do_reclaim(conf); | ||
| 798 | r5l_do_reclaim(log); | 1425 | r5l_do_reclaim(log); |
| 799 | } | 1426 | } |
| 800 | 1427 | ||
| 801 | static void r5l_wake_reclaim(struct r5l_log *log, sector_t space) | 1428 | void r5l_wake_reclaim(struct r5l_log *log, sector_t space) |
| 802 | { | 1429 | { |
| 803 | unsigned long target; | 1430 | unsigned long target; |
| 804 | unsigned long new = (unsigned long)space; /* overflow in theory */ | 1431 | unsigned long new = (unsigned long)space; /* overflow in theory */ |
| 805 | 1432 | ||
| 1433 | if (!log) | ||
| 1434 | return; | ||
| 806 | do { | 1435 | do { |
| 807 | target = log->reclaim_target; | 1436 | target = log->reclaim_target; |
| 808 | if (new < target) | 1437 | if (new < target) |
| @@ -816,22 +1445,14 @@ void r5l_quiesce(struct r5l_log *log, int state) | |||
| 816 | struct mddev *mddev; | 1445 | struct mddev *mddev; |
| 817 | if (!log || state == 2) | 1446 | if (!log || state == 2) |
| 818 | return; | 1447 | return; |
| 819 | if (state == 0) { | 1448 | if (state == 0) |
| 820 | /* | 1449 | kthread_unpark(log->reclaim_thread->tsk); |
| 821 | * This is a special case for hotadd. In suspend, the array has | 1450 | else if (state == 1) { |
| 822 | * no journal. In resume, journal is initialized as well as the | ||
| 823 | * reclaim thread. | ||
| 824 | */ | ||
| 825 | if (log->reclaim_thread) | ||
| 826 | return; | ||
| 827 | log->reclaim_thread = md_register_thread(r5l_reclaim_thread, | ||
| 828 | log->rdev->mddev, "reclaim"); | ||
| 829 | } else if (state == 1) { | ||
| 830 | /* make sure r5l_write_super_and_discard_space exits */ | 1451 | /* make sure r5l_write_super_and_discard_space exits */ |
| 831 | mddev = log->rdev->mddev; | 1452 | mddev = log->rdev->mddev; |
| 832 | wake_up(&mddev->sb_wait); | 1453 | wake_up(&mddev->sb_wait); |
| 833 | r5l_wake_reclaim(log, -1L); | 1454 | kthread_park(log->reclaim_thread->tsk); |
| 834 | md_unregister_thread(&log->reclaim_thread); | 1455 | r5l_wake_reclaim(log, MaxSector); |
| 835 | r5l_do_reclaim(log); | 1456 | r5l_do_reclaim(log); |
| 836 | } | 1457 | } |
| 837 | } | 1458 | } |
| @@ -857,10 +1478,13 @@ struct r5l_recovery_ctx { | |||
| 857 | sector_t meta_total_blocks; /* total size of current meta and data */ | 1478 | sector_t meta_total_blocks; /* total size of current meta and data */ |
| 858 | sector_t pos; /* recovery position */ | 1479 | sector_t pos; /* recovery position */ |
| 859 | u64 seq; /* recovery position seq */ | 1480 | u64 seq; /* recovery position seq */ |
| 1481 | int data_parity_stripes; /* number of data_parity stripes */ | ||
| 1482 | int data_only_stripes; /* number of data_only stripes */ | ||
| 1483 | struct list_head cached_list; | ||
| 860 | }; | 1484 | }; |
| 861 | 1485 | ||
| 862 | static int r5l_read_meta_block(struct r5l_log *log, | 1486 | static int r5l_recovery_read_meta_block(struct r5l_log *log, |
| 863 | struct r5l_recovery_ctx *ctx) | 1487 | struct r5l_recovery_ctx *ctx) |
| 864 | { | 1488 | { |
| 865 | struct page *page = ctx->meta_page; | 1489 | struct page *page = ctx->meta_page; |
| 866 | struct r5l_meta_block *mb; | 1490 | struct r5l_meta_block *mb; |
| @@ -892,170 +1516,618 @@ static int r5l_read_meta_block(struct r5l_log *log, | |||
| 892 | return 0; | 1516 | return 0; |
| 893 | } | 1517 | } |
| 894 | 1518 | ||
| 895 | static int r5l_recovery_flush_one_stripe(struct r5l_log *log, | 1519 | static void |
| 896 | struct r5l_recovery_ctx *ctx, | 1520 | r5l_recovery_create_empty_meta_block(struct r5l_log *log, |
| 897 | sector_t stripe_sect, | 1521 | struct page *page, |
| 898 | int *offset, sector_t *log_offset) | 1522 | sector_t pos, u64 seq) |
| 899 | { | 1523 | { |
| 900 | struct r5conf *conf = log->rdev->mddev->private; | 1524 | struct r5l_meta_block *mb; |
| 901 | struct stripe_head *sh; | ||
| 902 | struct r5l_payload_data_parity *payload; | ||
| 903 | int disk_index; | ||
| 904 | 1525 | ||
| 905 | sh = raid5_get_active_stripe(conf, stripe_sect, 0, 0, 0); | 1526 | mb = page_address(page); |
| 906 | while (1) { | 1527 | clear_page(mb); |
| 907 | payload = page_address(ctx->meta_page) + *offset; | 1528 | mb->magic = cpu_to_le32(R5LOG_MAGIC); |
| 1529 | mb->version = R5LOG_VERSION; | ||
| 1530 | mb->meta_size = cpu_to_le32(sizeof(struct r5l_meta_block)); | ||
| 1531 | mb->seq = cpu_to_le64(seq); | ||
| 1532 | mb->position = cpu_to_le64(pos); | ||
| 1533 | } | ||
| 908 | 1534 | ||
| 909 | if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) { | 1535 | static int r5l_log_write_empty_meta_block(struct r5l_log *log, sector_t pos, |
| 910 | raid5_compute_sector(conf, | 1536 | u64 seq) |
| 911 | le64_to_cpu(payload->location), 0, | 1537 | { |
| 912 | &disk_index, sh); | 1538 | struct page *page; |
| 1539 | struct r5l_meta_block *mb; | ||
| 913 | 1540 | ||
| 914 | sync_page_io(log->rdev, *log_offset, PAGE_SIZE, | 1541 | page = alloc_page(GFP_KERNEL); |
| 915 | sh->dev[disk_index].page, REQ_OP_READ, 0, | 1542 | if (!page) |
| 916 | false); | 1543 | return -ENOMEM; |
| 917 | sh->dev[disk_index].log_checksum = | 1544 | r5l_recovery_create_empty_meta_block(log, page, pos, seq); |
| 918 | le32_to_cpu(payload->checksum[0]); | 1545 | mb = page_address(page); |
| 919 | set_bit(R5_Wantwrite, &sh->dev[disk_index].flags); | 1546 | mb->checksum = cpu_to_le32(crc32c_le(log->uuid_checksum, |
| 920 | ctx->meta_total_blocks += BLOCK_SECTORS; | 1547 | mb, PAGE_SIZE)); |
| 921 | } else { | 1548 | if (!sync_page_io(log->rdev, pos, PAGE_SIZE, page, REQ_OP_WRITE, |
| 922 | disk_index = sh->pd_idx; | 1549 | REQ_FUA, false)) { |
| 923 | sync_page_io(log->rdev, *log_offset, PAGE_SIZE, | 1550 | __free_page(page); |
| 924 | sh->dev[disk_index].page, REQ_OP_READ, 0, | 1551 | return -EIO; |
| 925 | false); | 1552 | } |
| 926 | sh->dev[disk_index].log_checksum = | 1553 | __free_page(page); |
| 927 | le32_to_cpu(payload->checksum[0]); | 1554 | return 0; |
| 928 | set_bit(R5_Wantwrite, &sh->dev[disk_index].flags); | 1555 | } |
| 929 | |||
| 930 | if (sh->qd_idx >= 0) { | ||
| 931 | disk_index = sh->qd_idx; | ||
| 932 | sync_page_io(log->rdev, | ||
| 933 | r5l_ring_add(log, *log_offset, BLOCK_SECTORS), | ||
| 934 | PAGE_SIZE, sh->dev[disk_index].page, | ||
| 935 | REQ_OP_READ, 0, false); | ||
| 936 | sh->dev[disk_index].log_checksum = | ||
| 937 | le32_to_cpu(payload->checksum[1]); | ||
| 938 | set_bit(R5_Wantwrite, | ||
| 939 | &sh->dev[disk_index].flags); | ||
| 940 | } | ||
| 941 | ctx->meta_total_blocks += BLOCK_SECTORS * conf->max_degraded; | ||
| 942 | } | ||
| 943 | 1556 | ||
| 944 | *log_offset = r5l_ring_add(log, *log_offset, | 1557 | /* |
| 945 | le32_to_cpu(payload->size)); | 1558 | * r5l_recovery_load_data and r5l_recovery_load_parity uses flag R5_Wantwrite |
| 946 | *offset += sizeof(struct r5l_payload_data_parity) + | 1559 | * to mark valid (potentially not flushed) data in the journal. |
| 947 | sizeof(__le32) * | 1560 | * |
| 948 | (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9)); | 1561 | * We already verified checksum in r5l_recovery_verify_data_checksum_for_mb, |
| 949 | if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_PARITY) | 1562 | * so there should not be any mismatch here. |
| 950 | break; | 1563 | */ |
| 1564 | static void r5l_recovery_load_data(struct r5l_log *log, | ||
| 1565 | struct stripe_head *sh, | ||
| 1566 | struct r5l_recovery_ctx *ctx, | ||
| 1567 | struct r5l_payload_data_parity *payload, | ||
| 1568 | sector_t log_offset) | ||
| 1569 | { | ||
| 1570 | struct mddev *mddev = log->rdev->mddev; | ||
| 1571 | struct r5conf *conf = mddev->private; | ||
| 1572 | int dd_idx; | ||
| 1573 | |||
| 1574 | raid5_compute_sector(conf, | ||
| 1575 | le64_to_cpu(payload->location), 0, | ||
| 1576 | &dd_idx, sh); | ||
| 1577 | sync_page_io(log->rdev, log_offset, PAGE_SIZE, | ||
| 1578 | sh->dev[dd_idx].page, REQ_OP_READ, 0, false); | ||
| 1579 | sh->dev[dd_idx].log_checksum = | ||
| 1580 | le32_to_cpu(payload->checksum[0]); | ||
| 1581 | ctx->meta_total_blocks += BLOCK_SECTORS; | ||
| 1582 | |||
| 1583 | set_bit(R5_Wantwrite, &sh->dev[dd_idx].flags); | ||
| 1584 | set_bit(STRIPE_R5C_CACHING, &sh->state); | ||
| 1585 | } | ||
| 1586 | |||
| 1587 | static void r5l_recovery_load_parity(struct r5l_log *log, | ||
| 1588 | struct stripe_head *sh, | ||
| 1589 | struct r5l_recovery_ctx *ctx, | ||
| 1590 | struct r5l_payload_data_parity *payload, | ||
| 1591 | sector_t log_offset) | ||
| 1592 | { | ||
| 1593 | struct mddev *mddev = log->rdev->mddev; | ||
| 1594 | struct r5conf *conf = mddev->private; | ||
| 1595 | |||
| 1596 | ctx->meta_total_blocks += BLOCK_SECTORS * conf->max_degraded; | ||
| 1597 | sync_page_io(log->rdev, log_offset, PAGE_SIZE, | ||
| 1598 | sh->dev[sh->pd_idx].page, REQ_OP_READ, 0, false); | ||
| 1599 | sh->dev[sh->pd_idx].log_checksum = | ||
| 1600 | le32_to_cpu(payload->checksum[0]); | ||
| 1601 | set_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags); | ||
| 1602 | |||
| 1603 | if (sh->qd_idx >= 0) { | ||
| 1604 | sync_page_io(log->rdev, | ||
| 1605 | r5l_ring_add(log, log_offset, BLOCK_SECTORS), | ||
| 1606 | PAGE_SIZE, sh->dev[sh->qd_idx].page, | ||
| 1607 | REQ_OP_READ, 0, false); | ||
| 1608 | sh->dev[sh->qd_idx].log_checksum = | ||
| 1609 | le32_to_cpu(payload->checksum[1]); | ||
| 1610 | set_bit(R5_Wantwrite, &sh->dev[sh->qd_idx].flags); | ||
| 951 | } | 1611 | } |
| 1612 | clear_bit(STRIPE_R5C_CACHING, &sh->state); | ||
| 1613 | } | ||
| 952 | 1614 | ||
| 953 | for (disk_index = 0; disk_index < sh->disks; disk_index++) { | 1615 | static void r5l_recovery_reset_stripe(struct stripe_head *sh) |
| 954 | void *addr; | 1616 | { |
| 955 | u32 checksum; | 1617 | int i; |
| 956 | 1618 | ||
| 1619 | sh->state = 0; | ||
| 1620 | sh->log_start = MaxSector; | ||
| 1621 | for (i = sh->disks; i--; ) | ||
| 1622 | sh->dev[i].flags = 0; | ||
| 1623 | } | ||
| 1624 | |||
| 1625 | static void | ||
| 1626 | r5l_recovery_replay_one_stripe(struct r5conf *conf, | ||
| 1627 | struct stripe_head *sh, | ||
| 1628 | struct r5l_recovery_ctx *ctx) | ||
| 1629 | { | ||
| 1630 | struct md_rdev *rdev, *rrdev; | ||
| 1631 | int disk_index; | ||
| 1632 | int data_count = 0; | ||
| 1633 | |||
| 1634 | for (disk_index = 0; disk_index < sh->disks; disk_index++) { | ||
| 957 | if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags)) | 1635 | if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags)) |
| 958 | continue; | 1636 | continue; |
| 959 | addr = kmap_atomic(sh->dev[disk_index].page); | 1637 | if (disk_index == sh->qd_idx || disk_index == sh->pd_idx) |
| 960 | checksum = crc32c_le(log->uuid_checksum, addr, PAGE_SIZE); | 1638 | continue; |
| 961 | kunmap_atomic(addr); | 1639 | data_count++; |
| 962 | if (checksum != sh->dev[disk_index].log_checksum) | ||
| 963 | goto error; | ||
| 964 | } | 1640 | } |
| 965 | 1641 | ||
| 966 | for (disk_index = 0; disk_index < sh->disks; disk_index++) { | 1642 | /* |
| 967 | struct md_rdev *rdev, *rrdev; | 1643 | * stripes that only have parity must have been flushed |
| 1644 | * before the crash that we are now recovering from, so | ||
| 1645 | * there is nothing more to recovery. | ||
| 1646 | */ | ||
| 1647 | if (data_count == 0) | ||
| 1648 | goto out; | ||
| 968 | 1649 | ||
| 969 | if (!test_and_clear_bit(R5_Wantwrite, | 1650 | for (disk_index = 0; disk_index < sh->disks; disk_index++) { |
| 970 | &sh->dev[disk_index].flags)) | 1651 | if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags)) |
| 971 | continue; | 1652 | continue; |
| 972 | 1653 | ||
| 973 | /* in case device is broken */ | 1654 | /* in case device is broken */ |
| 1655 | rcu_read_lock(); | ||
| 974 | rdev = rcu_dereference(conf->disks[disk_index].rdev); | 1656 | rdev = rcu_dereference(conf->disks[disk_index].rdev); |
| 975 | if (rdev) | 1657 | if (rdev) { |
| 976 | sync_page_io(rdev, stripe_sect, PAGE_SIZE, | 1658 | atomic_inc(&rdev->nr_pending); |
| 1659 | rcu_read_unlock(); | ||
| 1660 | sync_page_io(rdev, sh->sector, PAGE_SIZE, | ||
| 977 | sh->dev[disk_index].page, REQ_OP_WRITE, 0, | 1661 | sh->dev[disk_index].page, REQ_OP_WRITE, 0, |
| 978 | false); | 1662 | false); |
| 1663 | rdev_dec_pending(rdev, rdev->mddev); | ||
| 1664 | rcu_read_lock(); | ||
| 1665 | } | ||
| 979 | rrdev = rcu_dereference(conf->disks[disk_index].replacement); | 1666 | rrdev = rcu_dereference(conf->disks[disk_index].replacement); |
| 980 | if (rrdev) | 1667 | if (rrdev) { |
| 981 | sync_page_io(rrdev, stripe_sect, PAGE_SIZE, | 1668 | atomic_inc(&rrdev->nr_pending); |
| 1669 | rcu_read_unlock(); | ||
| 1670 | sync_page_io(rrdev, sh->sector, PAGE_SIZE, | ||
| 982 | sh->dev[disk_index].page, REQ_OP_WRITE, 0, | 1671 | sh->dev[disk_index].page, REQ_OP_WRITE, 0, |
| 983 | false); | 1672 | false); |
| 1673 | rdev_dec_pending(rrdev, rrdev->mddev); | ||
| 1674 | rcu_read_lock(); | ||
| 1675 | } | ||
| 1676 | rcu_read_unlock(); | ||
| 984 | } | 1677 | } |
| 985 | raid5_release_stripe(sh); | 1678 | ctx->data_parity_stripes++; |
| 1679 | out: | ||
| 1680 | r5l_recovery_reset_stripe(sh); | ||
| 1681 | } | ||
| 1682 | |||
| 1683 | static struct stripe_head * | ||
| 1684 | r5c_recovery_alloc_stripe(struct r5conf *conf, | ||
| 1685 | sector_t stripe_sect, | ||
| 1686 | sector_t log_start) | ||
| 1687 | { | ||
| 1688 | struct stripe_head *sh; | ||
| 1689 | |||
| 1690 | sh = raid5_get_active_stripe(conf, stripe_sect, 0, 1, 0); | ||
| 1691 | if (!sh) | ||
| 1692 | return NULL; /* no more stripe available */ | ||
| 1693 | |||
| 1694 | r5l_recovery_reset_stripe(sh); | ||
| 1695 | sh->log_start = log_start; | ||
| 1696 | |||
| 1697 | return sh; | ||
| 1698 | } | ||
| 1699 | |||
| 1700 | static struct stripe_head * | ||
| 1701 | r5c_recovery_lookup_stripe(struct list_head *list, sector_t sect) | ||
| 1702 | { | ||
| 1703 | struct stripe_head *sh; | ||
| 1704 | |||
| 1705 | list_for_each_entry(sh, list, lru) | ||
| 1706 | if (sh->sector == sect) | ||
| 1707 | return sh; | ||
| 1708 | return NULL; | ||
| 1709 | } | ||
| 1710 | |||
| 1711 | static void | ||
| 1712 | r5c_recovery_drop_stripes(struct list_head *cached_stripe_list, | ||
| 1713 | struct r5l_recovery_ctx *ctx) | ||
| 1714 | { | ||
| 1715 | struct stripe_head *sh, *next; | ||
| 1716 | |||
| 1717 | list_for_each_entry_safe(sh, next, cached_stripe_list, lru) { | ||
| 1718 | r5l_recovery_reset_stripe(sh); | ||
| 1719 | list_del_init(&sh->lru); | ||
| 1720 | raid5_release_stripe(sh); | ||
| 1721 | } | ||
| 1722 | } | ||
| 1723 | |||
| 1724 | static void | ||
| 1725 | r5c_recovery_replay_stripes(struct list_head *cached_stripe_list, | ||
| 1726 | struct r5l_recovery_ctx *ctx) | ||
| 1727 | { | ||
| 1728 | struct stripe_head *sh, *next; | ||
| 1729 | |||
| 1730 | list_for_each_entry_safe(sh, next, cached_stripe_list, lru) | ||
| 1731 | if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) { | ||
| 1732 | r5l_recovery_replay_one_stripe(sh->raid_conf, sh, ctx); | ||
| 1733 | list_del_init(&sh->lru); | ||
| 1734 | raid5_release_stripe(sh); | ||
| 1735 | } | ||
| 1736 | } | ||
| 1737 | |||
| 1738 | /* if matches return 0; otherwise return -EINVAL */ | ||
| 1739 | static int | ||
| 1740 | r5l_recovery_verify_data_checksum(struct r5l_log *log, struct page *page, | ||
| 1741 | sector_t log_offset, __le32 log_checksum) | ||
| 1742 | { | ||
| 1743 | void *addr; | ||
| 1744 | u32 checksum; | ||
| 1745 | |||
| 1746 | sync_page_io(log->rdev, log_offset, PAGE_SIZE, | ||
| 1747 | page, REQ_OP_READ, 0, false); | ||
| 1748 | addr = kmap_atomic(page); | ||
| 1749 | checksum = crc32c_le(log->uuid_checksum, addr, PAGE_SIZE); | ||
| 1750 | kunmap_atomic(addr); | ||
| 1751 | return (le32_to_cpu(log_checksum) == checksum) ? 0 : -EINVAL; | ||
| 1752 | } | ||
| 1753 | |||
| 1754 | /* | ||
| 1755 | * before loading data to stripe cache, we need to verify the checksum for all data; | ||
| 1756 | * if there is a mismatch for any data page, we drop all data in the meta block | ||
| 1757 | */ | ||
| 1758 | static int | ||
| 1759 | r5l_recovery_verify_data_checksum_for_mb(struct r5l_log *log, | ||
| 1760 | struct r5l_recovery_ctx *ctx) | ||
| 1761 | { | ||
| 1762 | struct mddev *mddev = log->rdev->mddev; | ||
| 1763 | struct r5conf *conf = mddev->private; | ||
| 1764 | struct r5l_meta_block *mb = page_address(ctx->meta_page); | ||
| 1765 | sector_t mb_offset = sizeof(struct r5l_meta_block); | ||
| 1766 | sector_t log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS); | ||
| 1767 | struct page *page; | ||
| 1768 | struct r5l_payload_data_parity *payload; | ||
| 1769 | |||
| 1770 | page = alloc_page(GFP_KERNEL); | ||
| 1771 | if (!page) | ||
| 1772 | return -ENOMEM; | ||
| 1773 | |||
| 1774 | while (mb_offset < le32_to_cpu(mb->meta_size)) { | ||
| 1775 | payload = (void *)mb + mb_offset; | ||
| 1776 | |||
| 1777 | if (payload->header.type == R5LOG_PAYLOAD_DATA) { | ||
| 1778 | if (r5l_recovery_verify_data_checksum( | ||
| 1779 | log, page, log_offset, | ||
| 1780 | payload->checksum[0]) < 0) | ||
| 1781 | goto mismatch; | ||
| 1782 | } else if (payload->header.type == R5LOG_PAYLOAD_PARITY) { | ||
| 1783 | if (r5l_recovery_verify_data_checksum( | ||
| 1784 | log, page, log_offset, | ||
| 1785 | payload->checksum[0]) < 0) | ||
| 1786 | goto mismatch; | ||
| 1787 | if (conf->max_degraded == 2 && /* q for RAID 6 */ | ||
| 1788 | r5l_recovery_verify_data_checksum( | ||
| 1789 | log, page, | ||
| 1790 | r5l_ring_add(log, log_offset, | ||
| 1791 | BLOCK_SECTORS), | ||
| 1792 | payload->checksum[1]) < 0) | ||
| 1793 | goto mismatch; | ||
| 1794 | } else /* not R5LOG_PAYLOAD_DATA or R5LOG_PAYLOAD_PARITY */ | ||
| 1795 | goto mismatch; | ||
| 1796 | |||
| 1797 | log_offset = r5l_ring_add(log, log_offset, | ||
| 1798 | le32_to_cpu(payload->size)); | ||
| 1799 | |||
| 1800 | mb_offset += sizeof(struct r5l_payload_data_parity) + | ||
| 1801 | sizeof(__le32) * | ||
| 1802 | (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9)); | ||
| 1803 | } | ||
| 1804 | |||
| 1805 | put_page(page); | ||
| 986 | return 0; | 1806 | return 0; |
| 987 | 1807 | ||
| 988 | error: | 1808 | mismatch: |
| 989 | for (disk_index = 0; disk_index < sh->disks; disk_index++) | 1809 | put_page(page); |
| 990 | sh->dev[disk_index].flags = 0; | ||
| 991 | raid5_release_stripe(sh); | ||
| 992 | return -EINVAL; | 1810 | return -EINVAL; |
| 993 | } | 1811 | } |
| 994 | 1812 | ||
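A simplified sketch of the per-page check done by r5l_recovery_verify_data_checksum(): read the block, CRC it with the log's UUID seed, and compare against the stored value. The crc32c() below is a dummy hash standing in for the kernel's crc32c_le(), and the little-endian conversion of the stored checksum is omitted:

#include <stdint.h>
#include <string.h>
#include <stdio.h>

#define PAGE_SIZE 4096

static uint32_t crc32c(uint32_t seed, const void *buf, size_t len)
{
    /* placeholder hash; the kernel uses the real CRC32C polynomial */
    uint32_t h = seed;
    const uint8_t *p = buf;

    while (len--)
        h = (h << 5) + h + *p++;
    return h;
}

static int verify_block(uint32_t uuid_seed, const void *page,
                        uint32_t stored_checksum)
{
    uint32_t sum = crc32c(uuid_seed, page, PAGE_SIZE);

    return (sum == stored_checksum) ? 0 : -1;   /* -EINVAL on mismatch */
}

int main(void)
{
    static uint8_t page[PAGE_SIZE];
    uint32_t sum;

    memset(page, 0xab, sizeof(page));
    sum = crc32c(0x1234, page, PAGE_SIZE);
    printf("%d %d\n", verify_block(0x1234, page, sum),
           verify_block(0x1234, page, sum ^ 1));   /* 0 -1 */
    return 0;
}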
| 995 | static int r5l_recovery_flush_one_meta(struct r5l_log *log, | 1813 | /* |
| 996 | struct r5l_recovery_ctx *ctx) | 1814 | * Analyze all data/parity pages in one meta block |
| 1815 | * Returns: | ||
| 1816 | * 0 for success | ||
| 1817 | * -EINVAL for unknown payload type | ||
| 1818 | * -EAGAIN for checksum mismatch of data page | ||
| 1819 | * -ENOMEM for running out of memory (alloc_page failed or ran out of stripes) | ||
| 1820 | */ | ||
| 1821 | static int | ||
| 1822 | r5c_recovery_analyze_meta_block(struct r5l_log *log, | ||
| 1823 | struct r5l_recovery_ctx *ctx, | ||
| 1824 | struct list_head *cached_stripe_list) | ||
| 997 | { | 1825 | { |
| 998 | struct r5conf *conf = log->rdev->mddev->private; | 1826 | struct mddev *mddev = log->rdev->mddev; |
| 999 | struct r5l_payload_data_parity *payload; | 1827 | struct r5conf *conf = mddev->private; |
| 1000 | struct r5l_meta_block *mb; | 1828 | struct r5l_meta_block *mb; |
| 1001 | int offset; | 1829 | struct r5l_payload_data_parity *payload; |
| 1830 | int mb_offset; | ||
| 1002 | sector_t log_offset; | 1831 | sector_t log_offset; |
| 1003 | sector_t stripe_sector; | 1832 | sector_t stripe_sect; |
| 1833 | struct stripe_head *sh; | ||
| 1834 | int ret; | ||
| 1835 | |||
| 1836 | /* | ||
| 1837 | * For a mismatch in data blocks, we will drop all data in this mb, but | ||
| 1838 | * we will still read the next mb for other data with the FLUSH flag, as | ||
| 1839 | * io_units could finish out of order. | ||
| 1840 | */ | ||
| 1841 | ret = r5l_recovery_verify_data_checksum_for_mb(log, ctx); | ||
| 1842 | if (ret == -EINVAL) | ||
| 1843 | return -EAGAIN; | ||
| 1844 | else if (ret) | ||
| 1845 | return ret; /* -ENOMEM due to alloc_page() failure */ | ||
| 1004 | 1846 | ||
| 1005 | mb = page_address(ctx->meta_page); | 1847 | mb = page_address(ctx->meta_page); |
| 1006 | offset = sizeof(struct r5l_meta_block); | 1848 | mb_offset = sizeof(struct r5l_meta_block); |
| 1007 | log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS); | 1849 | log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS); |
| 1008 | 1850 | ||
| 1009 | while (offset < le32_to_cpu(mb->meta_size)) { | 1851 | while (mb_offset < le32_to_cpu(mb->meta_size)) { |
| 1010 | int dd; | 1852 | int dd; |
| 1011 | 1853 | ||
| 1012 | payload = (void *)mb + offset; | 1854 | payload = (void *)mb + mb_offset; |
| 1013 | stripe_sector = raid5_compute_sector(conf, | 1855 | stripe_sect = (payload->header.type == R5LOG_PAYLOAD_DATA) ? |
| 1014 | le64_to_cpu(payload->location), 0, &dd, NULL); | 1856 | raid5_compute_sector( |
| 1015 | if (r5l_recovery_flush_one_stripe(log, ctx, stripe_sector, | 1857 | conf, le64_to_cpu(payload->location), 0, &dd, |
| 1016 | &offset, &log_offset)) | 1858 | NULL) |
| 1859 | : le64_to_cpu(payload->location); | ||
| 1860 | |||
| 1861 | sh = r5c_recovery_lookup_stripe(cached_stripe_list, | ||
| 1862 | stripe_sect); | ||
| 1863 | |||
| 1864 | if (!sh) { | ||
| 1865 | sh = r5c_recovery_alloc_stripe(conf, stripe_sect, ctx->pos); | ||
| 1866 | /* | ||
| 1867 | * cannot get stripe from raid5_get_active_stripe | ||
| 1868 | * try replay some stripes | ||
| 1869 | */ | ||
| 1870 | if (!sh) { | ||
| 1871 | r5c_recovery_replay_stripes( | ||
| 1872 | cached_stripe_list, ctx); | ||
| 1873 | sh = r5c_recovery_alloc_stripe( | ||
| 1874 | conf, stripe_sect, ctx->pos); | ||
| 1875 | } | ||
| 1876 | if (!sh) { | ||
| 1877 | pr_debug("md/raid:%s: Increasing stripe cache size to %d to recovery data on journal.\n", | ||
| 1878 | mdname(mddev), | ||
| 1879 | conf->min_nr_stripes * 2); | ||
| 1880 | raid5_set_cache_size(mddev, | ||
| 1881 | conf->min_nr_stripes * 2); | ||
| 1882 | sh = r5c_recovery_alloc_stripe( | ||
| 1883 | conf, stripe_sect, ctx->pos); | ||
| 1884 | } | ||
| 1885 | if (!sh) { | ||
| 1886 | pr_err("md/raid:%s: Cannot get enough stripes due to memory pressure. Recovery failed.\n", | ||
| 1887 | mdname(mddev)); | ||
| 1888 | return -ENOMEM; | ||
| 1889 | } | ||
| 1890 | list_add_tail(&sh->lru, cached_stripe_list); | ||
| 1891 | } | ||
| 1892 | |||
| 1893 | if (payload->header.type == R5LOG_PAYLOAD_DATA) { | ||
| 1894 | if (!test_bit(STRIPE_R5C_CACHING, &sh->state) && | ||
| 1895 | test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags)) { | ||
| 1896 | r5l_recovery_replay_one_stripe(conf, sh, ctx); | ||
| 1897 | sh->log_start = ctx->pos; | ||
| 1898 | list_move_tail(&sh->lru, cached_stripe_list); | ||
| 1899 | } | ||
| 1900 | r5l_recovery_load_data(log, sh, ctx, payload, | ||
| 1901 | log_offset); | ||
| 1902 | } else if (payload->header.type == R5LOG_PAYLOAD_PARITY) | ||
| 1903 | r5l_recovery_load_parity(log, sh, ctx, payload, | ||
| 1904 | log_offset); | ||
| 1905 | else | ||
| 1017 | return -EINVAL; | 1906 | return -EINVAL; |
| 1907 | |||
| 1908 | log_offset = r5l_ring_add(log, log_offset, | ||
| 1909 | le32_to_cpu(payload->size)); | ||
| 1910 | |||
| 1911 | mb_offset += sizeof(struct r5l_payload_data_parity) + | ||
| 1912 | sizeof(__le32) * | ||
| 1913 | (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9)); | ||
| 1018 | } | 1914 | } |
| 1915 | |||
| 1019 | return 0; | 1916 | return 0; |
| 1020 | } | 1917 | } |
| 1021 | 1918 | ||
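The offset arithmetic used while walking payloads inside one meta block (both in the verification pass above and in r5c_recovery_analyze_meta_block()) looks like the sketch below. struct payload_hdr is a simplified stand-in for r5l_payload_data_parity, and the running meta_size ignores the r5l_meta_block header that the real code starts from:

#include <stdint.h>
#include <stdio.h>

#define BLOCK_SECTORS 8                  /* 4 KiB in 512-byte sectors */

struct payload_hdr {
    uint16_t type;                       /* data or parity */
    uint16_t flags;
    uint32_t size;                       /* payload size in sectors */
    uint64_t location;
};

int main(void)
{
    uint32_t meta_size = 0;              /* bytes used by payload descriptors */
    uint64_t log_offset = 8;             /* first block after the meta block */
    uint32_t sizes[] = { 8, 16, 8 };     /* e.g. data, parity (P+Q), data */
    int i;

    for (i = 0; i < 3; i++) {
        uint32_t pages = sizes[i] / BLOCK_SECTORS;

        /* one __le32 checksum per page follows each payload header */
        meta_size  += sizeof(struct payload_hdr) + sizeof(uint32_t) * pages;
        /* real code wraps log_offset with r5l_ring_add() */
        log_offset += sizes[i];
    }
    printf("meta bytes=%u next log offset=%llu\n",
           (unsigned)meta_size, (unsigned long long)log_offset);
    return 0;
}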
| 1022 | /* copy data/parity from log to raid disks */ | 1919 | /* |
| 1023 | static void r5l_recovery_flush_log(struct r5l_log *log, | 1920 | * Load the stripe into cache. The stripe will be written out later by |
| 1024 | struct r5l_recovery_ctx *ctx) | 1921 | * the stripe cache state machine. |
| 1922 | */ | ||
| 1923 | static void r5c_recovery_load_one_stripe(struct r5l_log *log, | ||
| 1924 | struct stripe_head *sh) | ||
| 1025 | { | 1925 | { |
| 1926 | struct r5dev *dev; | ||
| 1927 | int i; | ||
| 1928 | |||
| 1929 | for (i = sh->disks; i--; ) { | ||
| 1930 | dev = sh->dev + i; | ||
| 1931 | if (test_and_clear_bit(R5_Wantwrite, &dev->flags)) { | ||
| 1932 | set_bit(R5_InJournal, &dev->flags); | ||
| 1933 | set_bit(R5_UPTODATE, &dev->flags); | ||
| 1934 | } | ||
| 1935 | } | ||
| 1936 | list_add_tail(&sh->r5c, &log->stripe_in_journal_list); | ||
| 1937 | atomic_inc(&log->stripe_in_journal_count); | ||
| 1938 | } | ||
| 1939 | |||
| 1940 | /* | ||
| 1941 | * Scan through the log for all to-be-flushed data | ||
| 1942 | * | ||
| 1943 | * For stripes with data and parity, namely Data-Parity stripe | ||
| 1944 | * (STRIPE_R5C_CACHING == 0), we simply replay all the writes. | ||
| 1945 | * | ||
| 1946 | * For stripes with only data, namely Data-Only stripe | ||
| 1947 | * (STRIPE_R5C_CACHING == 1), we load them to stripe cache state machine. | ||
| 1948 | * | ||
| 1949 | * For a stripe, if we see data after parity, we should discard all previous | ||
| 1950 | * data and parity for this stripe, as these data are already flushed to | ||
| 1951 | * the array. | ||
| 1952 | * | ||
| 1953 | * At the end of the scan, we return the new journal_tail, which points to | ||
| 1954 | * first data-only stripe on the journal device, or next invalid meta block. | ||
| 1955 | */ | ||
| 1956 | static int r5c_recovery_flush_log(struct r5l_log *log, | ||
| 1957 | struct r5l_recovery_ctx *ctx) | ||
| 1958 | { | ||
| 1959 | struct stripe_head *sh; | ||
| 1960 | int ret = 0; | ||
| 1961 | |||
| 1962 | /* scan through the log */ | ||
| 1026 | while (1) { | 1963 | while (1) { |
| 1027 | if (r5l_read_meta_block(log, ctx)) | 1964 | if (r5l_recovery_read_meta_block(log, ctx)) |
| 1028 | return; | 1965 | break; |
| 1029 | if (r5l_recovery_flush_one_meta(log, ctx)) | 1966 | |
| 1030 | return; | 1967 | ret = r5c_recovery_analyze_meta_block(log, ctx, |
| 1968 | &ctx->cached_list); | ||
| 1969 | /* | ||
| 1970 | * -EAGAIN means mismatch in data block, in this case, we still | ||
| 1971 | * try scan the next metablock | ||
| 1972 | */ | ||
| 1973 | if (ret && ret != -EAGAIN) | ||
| 1974 | break; /* ret == -EINVAL or -ENOMEM */ | ||
| 1031 | ctx->seq++; | 1975 | ctx->seq++; |
| 1032 | ctx->pos = r5l_ring_add(log, ctx->pos, ctx->meta_total_blocks); | 1976 | ctx->pos = r5l_ring_add(log, ctx->pos, ctx->meta_total_blocks); |
| 1033 | } | 1977 | } |
| 1978 | |||
| 1979 | if (ret == -ENOMEM) { | ||
| 1980 | r5c_recovery_drop_stripes(&ctx->cached_list, ctx); | ||
| 1981 | return ret; | ||
| 1982 | } | ||
| 1983 | |||
| 1984 | /* replay data-parity stripes */ | ||
| 1985 | r5c_recovery_replay_stripes(&ctx->cached_list, ctx); | ||
| 1986 | |||
| 1987 | /* load data-only stripes to stripe cache */ | ||
| 1988 | list_for_each_entry(sh, &ctx->cached_list, lru) { | ||
| 1989 | WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state)); | ||
| 1990 | r5c_recovery_load_one_stripe(log, sh); | ||
| 1991 | ctx->data_only_stripes++; | ||
| 1992 | } | ||
| 1993 | |||
| 1994 | return 0; | ||
| 1034 | } | 1995 | } |
| 1035 | 1996 | ||
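After the scan, cached stripes are disposed of in two ways, as the sketch below illustrates with an invented rec_stripe flag standing in for STRIPE_R5C_CACHING: stripes that already logged parity are replayed onto the member disks, while data-only stripes stay cached and are counted for the rewrite step that follows:

#include <stdio.h>
#include <stdbool.h>

struct rec_stripe { bool caching; };     /* caching == STRIPE_R5C_CACHING */

int main(void)
{
    struct rec_stripe stripes[] = { {false}, {true}, {true}, {false} };
    int data_parity = 0, data_only = 0;
    int i;

    for (i = 0; i < 4; i++) {
        if (!stripes[i].caching)
            data_parity++;   /* replay data + parity to member disks */
        else
            data_only++;     /* keep in cache, rewrite to the journal */
    }
    printf("replayed=%d cached=%d\n", data_parity, data_only);
    return 0;
}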
| 1036 | static int r5l_log_write_empty_meta_block(struct r5l_log *log, sector_t pos, | 1997 | /* |
| 1037 | u64 seq) | 1998 | * we did a recovery. Now ctx.pos points to an invalid meta block. New |
| 1999 | * log will start here, but we can't let the superblock point to the last valid | ||
| 2000 | * meta block. The log might look like: | ||
| 2001 | * | meta 1| meta 2| meta 3| | ||
| 2002 | * meta 1 is valid, meta 2 is invalid. meta 3 could be valid. If | ||
| 2003 | * superblock points to meta 1, we write a new valid meta 2n. If a crash | ||
| 2004 | * happens again, new recovery will start from meta 1. Since meta 2n is | ||
| 2005 | * valid now, recovery will think meta 3 is valid, which is wrong. | ||
| 2006 | * The solution is we create a new meta in meta2 with its seq == meta | ||
| 2007 | * 1's seq + 10000 and let the superblock point to meta2. The same recovery | ||
| 2008 | * will not think meta 3 is a valid meta, because its seq doesn't match | ||
| 2009 | */ | ||
| 2010 | |||
| 2011 | /* | ||
| 2012 | * Before recovery, the log looks like the following | ||
| 2013 | * | ||
| 2014 | * --------------------------------------------- | ||
| 2015 | * | valid log | invalid log | | ||
| 2016 | * --------------------------------------------- | ||
| 2017 | * ^ | ||
| 2018 | * |- log->last_checkpoint | ||
| 2019 | * |- log->last_cp_seq | ||
| 2020 | * | ||
| 2021 | * Now we scan through the log until we see an invalid entry | ||
| 2022 | * | ||
| 2023 | * --------------------------------------------- | ||
| 2024 | * | valid log | invalid log | | ||
| 2025 | * --------------------------------------------- | ||
| 2026 | * ^ ^ | ||
| 2027 | * |- log->last_checkpoint |- ctx->pos | ||
| 2028 | * |- log->last_cp_seq |- ctx->seq | ||
| 2029 | * | ||
| 2030 | * From this point, we need to increase seq number by 10 to avoid | ||
| 2031 | * confusing next recovery. | ||
| 2032 | * | ||
| 2033 | * --------------------------------------------- | ||
| 2034 | * | valid log | invalid log | | ||
| 2035 | * --------------------------------------------- | ||
| 2036 | * ^ ^ | ||
| 2037 | * |- log->last_checkpoint |- ctx->pos+1 | ||
| 2038 | * |- log->last_cp_seq |- ctx->seq+10001 | ||
| 2039 | * | ||
| 2040 | * However, it is not safe to start the state machine yet, because data only | ||
| 2041 | * parities are not yet secured in RAID. To save these data only parities, we | ||
| 2042 | * rewrite them from seq+11. | ||
| 2043 | * | ||
| 2044 | * ----------------------------------------------------------------- | ||
| 2045 | * | valid log | data only stripes | invalid log | | ||
| 2046 | * ----------------------------------------------------------------- | ||
| 2047 | * ^ ^ | ||
| 2048 | * |- log->last_checkpoint |- ctx->pos+n | ||
| 2049 | * |- log->last_cp_seq |- ctx->seq+10000+n | ||
| 2050 | * | ||
| 2051 | * If failure happens again during this process, the recovery can safely start | ||
| 2052 | * again from log->last_checkpoint. | ||
| 2053 | * | ||
| 2054 | * Once data only stripes are rewritten to journal, we move log_tail | ||
| 2055 | * | ||
| 2056 | * ----------------------------------------------------------------- | ||
| 2057 | * | old log | data only stripes | invalid log | | ||
| 2058 | * ----------------------------------------------------------------- | ||
| 2059 | * ^ ^ | ||
| 2060 | * |- log->last_checkpoint |- ctx->pos+n | ||
| 2061 | * |- log->last_cp_seq |- ctx->seq+10000+n | ||
| 2062 | * | ||
| 2063 | * Then we can safely start the state machine. If failure happens from this | ||
| 2064 | * point on, the recovery will start from new log->last_checkpoint. | ||
| 2065 | */ | ||
| 2066 | static int | ||
| 2067 | r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log, | ||
| 2068 | struct r5l_recovery_ctx *ctx) | ||
| 1038 | { | 2069 | { |
| 2070 | struct stripe_head *sh, *next; | ||
| 2071 | struct mddev *mddev = log->rdev->mddev; | ||
| 1039 | struct page *page; | 2072 | struct page *page; |
| 1040 | struct r5l_meta_block *mb; | ||
| 1041 | u32 crc; | ||
| 1042 | 2073 | ||
| 1043 | page = alloc_page(GFP_KERNEL | __GFP_ZERO); | 2074 | page = alloc_page(GFP_KERNEL); |
| 1044 | if (!page) | 2075 | if (!page) { |
| 2076 | pr_err("md/raid:%s: cannot allocate memory to rewrite data only stripes\n", | ||
| 2077 | mdname(mddev)); | ||
| 1045 | return -ENOMEM; | 2078 | return -ENOMEM; |
| 1046 | mb = page_address(page); | 2079 | } |
| 1047 | mb->magic = cpu_to_le32(R5LOG_MAGIC); | ||
| 1048 | mb->version = R5LOG_VERSION; | ||
| 1049 | mb->meta_size = cpu_to_le32(sizeof(struct r5l_meta_block)); | ||
| 1050 | mb->seq = cpu_to_le64(seq); | ||
| 1051 | mb->position = cpu_to_le64(pos); | ||
| 1052 | crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE); | ||
| 1053 | mb->checksum = cpu_to_le32(crc); | ||
| 1054 | 2080 | ||
| 1055 | if (!sync_page_io(log->rdev, pos, PAGE_SIZE, page, REQ_OP_WRITE, | 2081 | list_for_each_entry_safe(sh, next, &ctx->cached_list, lru) { |
| 1056 | REQ_FUA, false)) { | 2082 | struct r5l_meta_block *mb; |
| 1057 | __free_page(page); | 2083 | int i; |
| 1058 | return -EIO; | 2084 | int offset; |
| 2085 | sector_t write_pos; | ||
| 2086 | |||
| 2087 | WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state)); | ||
| 2088 | r5l_recovery_create_empty_meta_block(log, page, | ||
| 2089 | ctx->pos, ctx->seq); | ||
| 2090 | mb = page_address(page); | ||
| 2091 | offset = le32_to_cpu(mb->meta_size); | ||
| 2092 | write_pos = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS); | ||
| 2093 | |||
| 2094 | for (i = sh->disks; i--; ) { | ||
| 2095 | struct r5dev *dev = &sh->dev[i]; | ||
| 2096 | struct r5l_payload_data_parity *payload; | ||
| 2097 | void *addr; | ||
| 2098 | |||
| 2099 | if (test_bit(R5_InJournal, &dev->flags)) { | ||
| 2100 | payload = (void *)mb + offset; | ||
| 2101 | payload->header.type = cpu_to_le16( | ||
| 2102 | R5LOG_PAYLOAD_DATA); | ||
| 2103 | payload->size = BLOCK_SECTORS; | ||
| 2104 | payload->location = cpu_to_le64( | ||
| 2105 | raid5_compute_blocknr(sh, i, 0)); | ||
| 2106 | addr = kmap_atomic(dev->page); | ||
| 2107 | payload->checksum[0] = cpu_to_le32( | ||
| 2108 | crc32c_le(log->uuid_checksum, addr, | ||
| 2109 | PAGE_SIZE)); | ||
| 2110 | kunmap_atomic(addr); | ||
| 2111 | sync_page_io(log->rdev, write_pos, PAGE_SIZE, | ||
| 2112 | dev->page, REQ_OP_WRITE, 0, false); | ||
| 2113 | write_pos = r5l_ring_add(log, write_pos, | ||
| 2114 | BLOCK_SECTORS); | ||
| 2115 | offset += sizeof(__le32) + | ||
| 2116 | sizeof(struct r5l_payload_data_parity); | ||
| 2117 | |||
| 2118 | } | ||
| 2119 | } | ||
| 2120 | mb->meta_size = cpu_to_le32(offset); | ||
| 2121 | mb->checksum = cpu_to_le32(crc32c_le(log->uuid_checksum, | ||
| 2122 | mb, PAGE_SIZE)); | ||
| 2123 | sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page, | ||
| 2124 | REQ_OP_WRITE, REQ_FUA, false); | ||
| 2125 | sh->log_start = ctx->pos; | ||
| 2126 | ctx->pos = write_pos; | ||
| 2127 | ctx->seq += 1; | ||
| 2128 | |||
| 2129 | list_del_init(&sh->lru); | ||
| 2130 | raid5_release_stripe(sh); | ||
| 1059 | } | 2131 | } |
| 1060 | __free_page(page); | 2132 | __free_page(page); |
| 1061 | return 0; | 2133 | return 0; |
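The reason for the large sequence jump described in the comment above is that a meta block is only accepted during a later recovery if both its position and its sequence match what the scan expects. A minimal sketch of that acceptance test, with an invented struct meta and example numbers, mirroring the check in r5l_recovery_read_meta_block():

#include <stdint.h>
#include <stdio.h>
#include <stdbool.h>

struct meta { uint64_t position; uint64_t seq; };

static bool meta_valid(const struct meta *mb, uint64_t expect_pos,
                       uint64_t expect_seq)
{
    /* checksum verification omitted in this sketch */
    return mb->position == expect_pos && mb->seq == expect_seq;
}

int main(void)
{
    /* a stale meta block left on the ring from before the crash */
    struct meta stale = { 16, 43 };

    /* after recovery the block written at position 16 must carry the
     * bumped sequence (old seq + 10000), so the stale one is rejected */
    printf("%d\n", meta_valid(&stale, 16, 43 + 10000));   /* prints 0 */
    return 0;
}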
| @@ -1063,45 +2135,60 @@ static int r5l_log_write_empty_meta_block(struct r5l_log *log, sector_t pos, | |||
| 1063 | 2135 | ||
| 1064 | static int r5l_recovery_log(struct r5l_log *log) | 2136 | static int r5l_recovery_log(struct r5l_log *log) |
| 1065 | { | 2137 | { |
| 2138 | struct mddev *mddev = log->rdev->mddev; | ||
| 1066 | struct r5l_recovery_ctx ctx; | 2139 | struct r5l_recovery_ctx ctx; |
| 2140 | int ret; | ||
| 2141 | sector_t pos; | ||
| 2142 | struct stripe_head *sh; | ||
| 1067 | 2143 | ||
| 1068 | ctx.pos = log->last_checkpoint; | 2144 | ctx.pos = log->last_checkpoint; |
| 1069 | ctx.seq = log->last_cp_seq; | 2145 | ctx.seq = log->last_cp_seq; |
| 1070 | ctx.meta_page = alloc_page(GFP_KERNEL); | 2146 | ctx.meta_page = alloc_page(GFP_KERNEL); |
| 2147 | ctx.data_only_stripes = 0; | ||
| 2148 | ctx.data_parity_stripes = 0; | ||
| 2149 | INIT_LIST_HEAD(&ctx.cached_list); | ||
| 2150 | |||
| 1071 | if (!ctx.meta_page) | 2151 | if (!ctx.meta_page) |
| 1072 | return -ENOMEM; | 2152 | return -ENOMEM; |
| 1073 | 2153 | ||
| 1074 | r5l_recovery_flush_log(log, &ctx); | 2154 | ret = r5c_recovery_flush_log(log, &ctx); |
| 1075 | __free_page(ctx.meta_page); | 2155 | __free_page(ctx.meta_page); |
| 1076 | 2156 | ||
| 1077 | /* | 2157 | if (ret) |
| 1078 | * we did a recovery. Now ctx.pos points to an invalid meta block. New | 2158 | return ret; |
| 1079 | * log will start here. but we can't let superblock point to last valid | 2159 | |
| 1080 | * meta block. The log might looks like: | 2160 | pos = ctx.pos; |
| 1081 | * | meta 1| meta 2| meta 3| | 2161 | ctx.seq += 10000; |
| 1082 | * meta 1 is valid, meta 2 is invalid. meta 3 could be valid. If | 2162 | |
| 1083 | * superblock points to meta 1, we write a new valid meta 2n. if crash | 2163 | if (ctx.data_only_stripes == 0) { |
| 1084 | * happens again, new recovery will start from meta 1. Since meta 2n is | ||
| 1085 | * valid now, recovery will think meta 3 is valid, which is wrong. | ||
| 1086 | * The solution is we create a new meta in meta2 with its seq == meta | ||
| 1087 | * 1's seq + 10 and let superblock points to meta2. The same recovery will | ||
| 1088 | * not think meta 3 is a valid meta, because its seq doesn't match | ||
| 1089 | */ | ||
| 1090 | if (ctx.seq > log->last_cp_seq) { | ||
| 1091 | int ret; | ||
| 1092 | |||
| 1093 | ret = r5l_log_write_empty_meta_block(log, ctx.pos, ctx.seq + 10); | ||
| 1094 | if (ret) | ||
| 1095 | return ret; | ||
| 1096 | log->seq = ctx.seq + 11; | ||
| 1097 | log->log_start = r5l_ring_add(log, ctx.pos, BLOCK_SECTORS); | ||
| 1098 | r5l_write_super(log, ctx.pos); | ||
| 1099 | log->last_checkpoint = ctx.pos; | ||
| 1100 | log->next_checkpoint = ctx.pos; | 2164 | log->next_checkpoint = ctx.pos; |
| 2165 | r5l_log_write_empty_meta_block(log, ctx.pos, ctx.seq++); | ||
| 2166 | ctx.pos = r5l_ring_add(log, ctx.pos, BLOCK_SECTORS); | ||
| 1101 | } else { | 2167 | } else { |
| 1102 | log->log_start = ctx.pos; | 2168 | sh = list_last_entry(&ctx.cached_list, struct stripe_head, lru); |
| 1103 | log->seq = ctx.seq; | 2169 | log->next_checkpoint = sh->log_start; |
| 1104 | } | 2170 | } |
| 2171 | |||
| 2172 | if ((ctx.data_only_stripes == 0) && (ctx.data_parity_stripes == 0)) | ||
| 2173 | pr_debug("md/raid:%s: starting from clean shutdown\n", | ||
| 2174 | mdname(mddev)); | ||
| 2175 | else { | ||
| 2176 | pr_debug("md/raid:%s: recoverying %d data-only stripes and %d data-parity stripes\n", | ||
| 2177 | mdname(mddev), ctx.data_only_stripes, | ||
| 2178 | ctx.data_parity_stripes); | ||
| 2179 | |||
| 2180 | if (ctx.data_only_stripes > 0) | ||
| 2181 | if (r5c_recovery_rewrite_data_only_stripes(log, &ctx)) { | ||
| 2182 | pr_err("md/raid:%s: failed to rewrite stripes to journal\n", | ||
| 2183 | mdname(mddev)); | ||
| 2184 | return -EIO; | ||
| 2185 | } | ||
| 2186 | } | ||
| 2187 | |||
| 2188 | log->log_start = ctx.pos; | ||
| 2189 | log->seq = ctx.seq; | ||
| 2190 | log->last_checkpoint = pos; | ||
| 2191 | r5l_write_super(log, pos); | ||
| 1105 | return 0; | 2192 | return 0; |
| 1106 | } | 2193 | } |
| 1107 | 2194 | ||
| @@ -1110,7 +2197,293 @@ static void r5l_write_super(struct r5l_log *log, sector_t cp) | |||
| 1110 | struct mddev *mddev = log->rdev->mddev; | 2197 | struct mddev *mddev = log->rdev->mddev; |
| 1111 | 2198 | ||
| 1112 | log->rdev->journal_tail = cp; | 2199 | log->rdev->journal_tail = cp; |
| 1113 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | 2200 | set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); |
| 2201 | } | ||
| 2202 | |||
| 2203 | static ssize_t r5c_journal_mode_show(struct mddev *mddev, char *page) | ||
| 2204 | { | ||
| 2205 | struct r5conf *conf = mddev->private; | ||
| 2206 | int ret; | ||
| 2207 | |||
| 2208 | if (!conf->log) | ||
| 2209 | return 0; | ||
| 2210 | |||
| 2211 | switch (conf->log->r5c_journal_mode) { | ||
| 2212 | case R5C_JOURNAL_MODE_WRITE_THROUGH: | ||
| 2213 | ret = snprintf( | ||
| 2214 | page, PAGE_SIZE, "[%s] %s\n", | ||
| 2215 | r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_THROUGH], | ||
| 2216 | r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_BACK]); | ||
| 2217 | break; | ||
| 2218 | case R5C_JOURNAL_MODE_WRITE_BACK: | ||
| 2219 | ret = snprintf( | ||
| 2220 | page, PAGE_SIZE, "%s [%s]\n", | ||
| 2221 | r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_THROUGH], | ||
| 2222 | r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_BACK]); | ||
| 2223 | break; | ||
| 2224 | default: | ||
| 2225 | ret = 0; | ||
| 2226 | } | ||
| 2227 | return ret; | ||
| 2228 | } | ||
| 2229 | |||
| 2230 | static ssize_t r5c_journal_mode_store(struct mddev *mddev, | ||
| 2231 | const char *page, size_t length) | ||
| 2232 | { | ||
| 2233 | struct r5conf *conf = mddev->private; | ||
| 2234 | struct r5l_log *log = conf->log; | ||
| 2235 | int val = -1, i; | ||
| 2236 | int len = length; | ||
| 2237 | |||
| 2238 | if (!log) | ||
| 2239 | return -ENODEV; | ||
| 2240 | |||
| 2241 | if (len && page[len - 1] == '\n') | ||
| 2242 | len -= 1; | ||
| 2243 | for (i = 0; i < ARRAY_SIZE(r5c_journal_mode_str); i++) | ||
| 2244 | if (strlen(r5c_journal_mode_str[i]) == len && | ||
| 2245 | strncmp(page, r5c_journal_mode_str[i], len) == 0) { | ||
| 2246 | val = i; | ||
| 2247 | break; | ||
| 2248 | } | ||
| 2249 | if (val < R5C_JOURNAL_MODE_WRITE_THROUGH || | ||
| 2250 | val > R5C_JOURNAL_MODE_WRITE_BACK) | ||
| 2251 | return -EINVAL; | ||
| 2252 | |||
| 2253 | mddev_suspend(mddev); | ||
| 2254 | conf->log->r5c_journal_mode = val; | ||
| 2255 | mddev_resume(mddev); | ||
| 2256 | |||
| 2257 | pr_debug("md/raid:%s: setting r5c cache mode to %d: %s\n", | ||
| 2258 | mdname(mddev), val, r5c_journal_mode_str[val]); | ||
| 2259 | return length; | ||
| 2260 | } | ||
| 2261 | |||
| 2262 | struct md_sysfs_entry | ||
| 2263 | r5c_journal_mode = __ATTR(journal_mode, 0644, | ||
| 2264 | r5c_journal_mode_show, r5c_journal_mode_store); | ||
| 2265 | |||
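For context, the md_sysfs_entry declared above surfaces the cache policy to userspace as a journal_mode file under the array's md sysfs directory. A userspace sketch of flipping the mode follows; the /sys path and helper name are assumptions rather than anything defined by this patch, and the two accepted strings are the ones r5c_journal_mode_show() prints ("write-through" and "write-back").

/* Userspace sketch -- not part of this patch. */
#include <stdio.h>

static int set_r5c_journal_mode(const char *md_dev, const char *mode)
{
	char path[128];
	FILE *f;

	/* mode: "write-through" or "write-back" */
	snprintf(path, sizeof(path), "/sys/block/%s/md/journal_mode", md_dev);
	f = fopen(path, "w");
	if (!f)
		return -1;
	fprintf(f, "%s\n", mode);
	return fclose(f) ? -1 : 0;
}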
| 2266 | /* | ||
| 2267 | * Try to handle a write operation in the caching phase. This function | ||
| 2268 | * should only be called in write-back mode. | ||
| 2269 | * | ||
| 2270 | * If all outstanding writes can be handled in the caching phase, return 0. | ||
| 2271 | * If the writes require the write-out phase, call r5c_make_stripe_write_out() | ||
| 2272 | * and return -EAGAIN | ||
| 2273 | */ | ||
| 2274 | int r5c_try_caching_write(struct r5conf *conf, | ||
| 2275 | struct stripe_head *sh, | ||
| 2276 | struct stripe_head_state *s, | ||
| 2277 | int disks) | ||
| 2278 | { | ||
| 2279 | struct r5l_log *log = conf->log; | ||
| 2280 | int i; | ||
| 2281 | struct r5dev *dev; | ||
| 2282 | int to_cache = 0; | ||
| 2283 | |||
| 2284 | BUG_ON(!r5c_is_writeback(log)); | ||
| 2285 | |||
| 2286 | if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) { | ||
| 2287 | /* | ||
| 2288 | * There are two different scenarios here: | ||
| 2289 | * 1. The stripe has some data cached, and it is sent to | ||
| 2290 | * write-out phase for reclaim | ||
| 2291 | * 2. The stripe is clean, and this is the first write | ||
| 2292 | * | ||
| 2293 | * For 1, return -EAGAIN, so we continue with | ||
| 2294 | * handle_stripe_dirtying(). | ||
| 2295 | * | ||
| 2296 | * For 2, set STRIPE_R5C_CACHING and continue with caching | ||
| 2297 | * write. | ||
| 2298 | */ | ||
| 2299 | |||
| 2300 | /* case 1: anything in the journal (s->injournal) or anything written (s->written) */ | ||
| 2301 | if (s->injournal > 0 || s->written > 0) | ||
| 2302 | return -EAGAIN; | ||
| 2303 | /* case 2 */ | ||
| 2304 | set_bit(STRIPE_R5C_CACHING, &sh->state); | ||
| 2305 | } | ||
| 2306 | |||
| 2307 | for (i = disks; i--; ) { | ||
| 2308 | dev = &sh->dev[i]; | ||
| 2309 | /* if non-overwrite, use writing-out phase */ | ||
| 2310 | if (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags) && | ||
| 2311 | !test_bit(R5_InJournal, &dev->flags)) { | ||
| 2312 | r5c_make_stripe_write_out(sh); | ||
| 2313 | return -EAGAIN; | ||
| 2314 | } | ||
| 2315 | } | ||
| 2316 | |||
| 2317 | for (i = disks; i--; ) { | ||
| 2318 | dev = &sh->dev[i]; | ||
| 2319 | if (dev->towrite) { | ||
| 2320 | set_bit(R5_Wantwrite, &dev->flags); | ||
| 2321 | set_bit(R5_Wantdrain, &dev->flags); | ||
| 2322 | set_bit(R5_LOCKED, &dev->flags); | ||
| 2323 | to_cache++; | ||
| 2324 | } | ||
| 2325 | } | ||
| 2326 | |||
| 2327 | if (to_cache) { | ||
| 2328 | set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); | ||
| 2329 | /* | ||
| 2330 | * set STRIPE_LOG_TRAPPED, which triggers r5c_cache_data() | ||
| 2331 | * in ops_run_io(). STRIPE_LOG_TRAPPED will be cleared in | ||
| 2332 | * r5c_handle_data_cached() | ||
| 2333 | */ | ||
| 2334 | set_bit(STRIPE_LOG_TRAPPED, &sh->state); | ||
| 2335 | } | ||
| 2336 | |||
| 2337 | return 0; | ||
| 2338 | } | ||
| 2339 | |||
| 2340 | /* | ||
| 2341 | * free extra pages (orig_page) we allocated for prexor | ||
| 2342 | */ | ||
| 2343 | void r5c_release_extra_page(struct stripe_head *sh) | ||
| 2344 | { | ||
| 2345 | struct r5conf *conf = sh->raid_conf; | ||
| 2346 | int i; | ||
| 2347 | bool using_disk_info_extra_page; | ||
| 2348 | |||
| 2349 | using_disk_info_extra_page = | ||
| 2350 | sh->dev[0].orig_page == conf->disks[0].extra_page; | ||
| 2351 | |||
| 2352 | for (i = sh->disks; i--; ) | ||
| 2353 | if (sh->dev[i].page != sh->dev[i].orig_page) { | ||
| 2354 | struct page *p = sh->dev[i].orig_page; | ||
| 2355 | |||
| 2356 | sh->dev[i].orig_page = sh->dev[i].page; | ||
| 2357 | if (!using_disk_info_extra_page) | ||
| 2358 | put_page(p); | ||
| 2359 | } | ||
| 2360 | |||
| 2361 | if (using_disk_info_extra_page) { | ||
| 2362 | clear_bit(R5C_EXTRA_PAGE_IN_USE, &conf->cache_state); | ||
| 2363 | md_wakeup_thread(conf->mddev->thread); | ||
| 2364 | } | ||
| 2365 | } | ||
| 2366 | |||
| 2367 | void r5c_use_extra_page(struct stripe_head *sh) | ||
| 2368 | { | ||
| 2369 | struct r5conf *conf = sh->raid_conf; | ||
| 2370 | int i; | ||
| 2371 | struct r5dev *dev; | ||
| 2372 | |||
| 2373 | for (i = sh->disks; i--; ) { | ||
| 2374 | dev = &sh->dev[i]; | ||
| 2375 | if (dev->orig_page != dev->page) | ||
| 2376 | put_page(dev->orig_page); | ||
| 2377 | dev->orig_page = conf->disks[i].extra_page; | ||
| 2378 | } | ||
| 2379 | } | ||
| 2380 | |||
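r5c_release_extra_page() and r5c_use_extra_page() above form a borrow/return pair around the single shared extra_page kept per disk_info. A condensed sketch of how a caller falls back to it when alloc_page() fails -- simplified from the handle_stripe_dirtying() hunk later in this patch, with a hypothetical helper name:

/* Condensed caller-side sketch; not a function in this patch. */
static int r5c_get_prexor_page(struct r5conf *conf, struct stripe_head *sh,
			       struct r5dev *dev)
{
	struct page *p = alloc_page(GFP_NOIO);

	if (p) {
		dev->orig_page = p;		/* private copy for prexor */
		return 0;
	}
	/* only one stripe at a time may borrow the shared extra pages */
	if (!test_and_set_bit(R5C_EXTRA_PAGE_IN_USE, &conf->cache_state)) {
		r5c_use_extra_page(sh);
		return 0;
	}
	return -EAGAIN;				/* caller delays the stripe */
}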
| 2381 | /* | ||
| 2382 | * clean up the stripe (clear R5_InJournal for dev[pd_idx] etc.) after the | ||
| 2383 | * stripe is committed to RAID disks. | ||
| 2384 | */ | ||
| 2385 | void r5c_finish_stripe_write_out(struct r5conf *conf, | ||
| 2386 | struct stripe_head *sh, | ||
| 2387 | struct stripe_head_state *s) | ||
| 2388 | { | ||
| 2389 | int i; | ||
| 2390 | int do_wakeup = 0; | ||
| 2391 | |||
| 2392 | if (!conf->log || | ||
| 2393 | !test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags)) | ||
| 2394 | return; | ||
| 2395 | |||
| 2396 | WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state)); | ||
| 2397 | clear_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags); | ||
| 2398 | |||
| 2399 | if (conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) | ||
| 2400 | return; | ||
| 2401 | |||
| 2402 | for (i = sh->disks; i--; ) { | ||
| 2403 | clear_bit(R5_InJournal, &sh->dev[i].flags); | ||
| 2404 | if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) | ||
| 2405 | do_wakeup = 1; | ||
| 2406 | } | ||
| 2407 | |||
| 2408 | /* | ||
| 2409 | * analyse_stripe() ran before r5c_finish_stripe_write_out(), so it counted | ||
| 2410 | * R5_InJournal before we cleared it; update s->injournal to match. | ||
| 2411 | */ | ||
| 2412 | s->injournal = 0; | ||
| 2413 | |||
| 2414 | if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) | ||
| 2415 | if (atomic_dec_and_test(&conf->pending_full_writes)) | ||
| 2416 | md_wakeup_thread(conf->mddev->thread); | ||
| 2417 | |||
| 2418 | if (do_wakeup) | ||
| 2419 | wake_up(&conf->wait_for_overlap); | ||
| 2420 | |||
| 2421 | if (conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) | ||
| 2422 | return; | ||
| 2423 | |||
| 2424 | spin_lock_irq(&conf->log->stripe_in_journal_lock); | ||
| 2425 | list_del_init(&sh->r5c); | ||
| 2426 | spin_unlock_irq(&conf->log->stripe_in_journal_lock); | ||
| 2427 | sh->log_start = MaxSector; | ||
| 2428 | atomic_dec(&conf->log->stripe_in_journal_count); | ||
| 2429 | r5c_update_log_state(conf->log); | ||
| 2430 | } | ||
| 2431 | |||
| 2432 | int | ||
| 2433 | r5c_cache_data(struct r5l_log *log, struct stripe_head *sh, | ||
| 2434 | struct stripe_head_state *s) | ||
| 2435 | { | ||
| 2436 | struct r5conf *conf = sh->raid_conf; | ||
| 2437 | int pages = 0; | ||
| 2438 | int reserve; | ||
| 2439 | int i; | ||
| 2440 | int ret = 0; | ||
| 2441 | |||
| 2442 | BUG_ON(!log); | ||
| 2443 | |||
| 2444 | for (i = 0; i < sh->disks; i++) { | ||
| 2445 | void *addr; | ||
| 2446 | |||
| 2447 | if (!test_bit(R5_Wantwrite, &sh->dev[i].flags)) | ||
| 2448 | continue; | ||
| 2449 | addr = kmap_atomic(sh->dev[i].page); | ||
| 2450 | sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum, | ||
| 2451 | addr, PAGE_SIZE); | ||
| 2452 | kunmap_atomic(addr); | ||
| 2453 | pages++; | ||
| 2454 | } | ||
| 2455 | WARN_ON(pages == 0); | ||
| 2456 | |||
| 2457 | /* | ||
| 2458 | * The stripe must enter state machine again to call endio, so | ||
| 2459 | * don't delay. | ||
| 2460 | */ | ||
| 2461 | clear_bit(STRIPE_DELAYED, &sh->state); | ||
| 2462 | atomic_inc(&sh->count); | ||
| 2463 | |||
| 2464 | mutex_lock(&log->io_mutex); | ||
| 2465 | /* meta + data */ | ||
| 2466 | reserve = (1 + pages) << (PAGE_SHIFT - 9); | ||
| 2467 | |||
| 2468 | if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) && | ||
| 2469 | sh->log_start == MaxSector) | ||
| 2470 | r5l_add_no_space_stripe(log, sh); | ||
| 2471 | else if (!r5l_has_free_space(log, reserve)) { | ||
| 2472 | if (sh->log_start == log->last_checkpoint) | ||
| 2473 | BUG(); | ||
| 2474 | else | ||
| 2475 | r5l_add_no_space_stripe(log, sh); | ||
| 2476 | } else { | ||
| 2477 | ret = r5l_log_stripe(log, sh, pages, 0); | ||
| 2478 | if (ret) { | ||
| 2479 | spin_lock_irq(&log->io_list_lock); | ||
| 2480 | list_add_tail(&sh->log_list, &log->no_mem_stripes); | ||
| 2481 | spin_unlock_irq(&log->io_list_lock); | ||
| 2482 | } | ||
| 2483 | } | ||
| 2484 | |||
| 2485 | mutex_unlock(&log->io_mutex); | ||
| 2486 | return 0; | ||
| 1114 | } | 2487 | } |
| 1115 | 2488 | ||
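The reservation taken in r5c_cache_data() above is expressed in 512-byte sectors: one page for the meta block plus one per cached data page, each page contributing PAGE_SIZE >> 9 sectors. A worked restatement of that shift, assuming the 4 KiB pages that r5l_init_log() below insists on (names are illustrative):

/* Worked example of the reservation math; assumes PAGE_SIZE == 4096. */
#define EXAMPLE_PAGE_SHIFT 12

static unsigned int r5c_example_reserve(unsigned int data_pages)
{
	/* 1 meta page + data pages, each 4096 / 512 = 8 sectors */
	return (1 + data_pages) << (EXAMPLE_PAGE_SHIFT - 9);
}
/* e.g. caching 3 data pages reserves (1 + 3) * 8 = 32 sectors */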
| 1116 | static int r5l_load_log(struct r5l_log *log) | 2489 | static int r5l_load_log(struct r5l_log *log) |
| @@ -1121,7 +2494,7 @@ static int r5l_load_log(struct r5l_log *log) | |||
| 1121 | sector_t cp = log->rdev->journal_tail; | 2494 | sector_t cp = log->rdev->journal_tail; |
| 1122 | u32 stored_crc, expected_crc; | 2495 | u32 stored_crc, expected_crc; |
| 1123 | bool create_super = false; | 2496 | bool create_super = false; |
| 1124 | int ret; | 2497 | int ret = 0; |
| 1125 | 2498 | ||
| 1126 | /* Make sure it's valid */ | 2499 | /* Make sure it's valid */ |
| 1127 | if (cp >= rdev->sectors || round_down(cp, BLOCK_SECTORS) != cp) | 2500 | if (cp >= rdev->sectors || round_down(cp, BLOCK_SECTORS) != cp) |
| @@ -1171,11 +2544,18 @@ create: | |||
| 1171 | if (log->max_free_space > RECLAIM_MAX_FREE_SPACE) | 2544 | if (log->max_free_space > RECLAIM_MAX_FREE_SPACE) |
| 1172 | log->max_free_space = RECLAIM_MAX_FREE_SPACE; | 2545 | log->max_free_space = RECLAIM_MAX_FREE_SPACE; |
| 1173 | log->last_checkpoint = cp; | 2546 | log->last_checkpoint = cp; |
| 1174 | log->next_checkpoint = cp; | ||
| 1175 | 2547 | ||
| 1176 | __free_page(page); | 2548 | __free_page(page); |
| 1177 | 2549 | ||
| 1178 | return r5l_recovery_log(log); | 2550 | if (create_super) { |
| 2551 | log->log_start = r5l_ring_add(log, cp, BLOCK_SECTORS); | ||
| 2552 | log->seq = log->last_cp_seq + 1; | ||
| 2553 | log->next_checkpoint = cp; | ||
| 2554 | } else | ||
| 2555 | ret = r5l_recovery_log(log); | ||
| 2556 | |||
| 2557 | r5c_update_log_state(log); | ||
| 2558 | return ret; | ||
| 1179 | ioerr: | 2559 | ioerr: |
| 1180 | __free_page(page); | 2560 | __free_page(page); |
| 1181 | return ret; | 2561 | return ret; |
| @@ -1188,6 +2568,22 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev) | |||
| 1188 | 2568 | ||
| 1189 | if (PAGE_SIZE != 4096) | 2569 | if (PAGE_SIZE != 4096) |
| 1190 | return -EINVAL; | 2570 | return -EINVAL; |
| 2571 | |||
| 2572 | /* | ||
| 2573 | * The PAGE_SIZE must be big enough to hold 1 r5l_meta_block and | ||
| 2574 | * raid_disks r5l_payload_data_parity. | ||
| 2575 | * | ||
| 2576 | * Write journal and cache do not work for very big arrays | ||
| 2577 | * (raid_disks > 203) | ||
| 2578 | */ | ||
| 2579 | if (sizeof(struct r5l_meta_block) + | ||
| 2580 | ((sizeof(struct r5l_payload_data_parity) + sizeof(__le32)) * | ||
| 2581 | conf->raid_disks) > PAGE_SIZE) { | ||
| 2582 | pr_err("md/raid:%s: write journal/cache doesn't work for array with %d disks\n", | ||
| 2583 | mdname(conf->mddev), conf->raid_disks); | ||
| 2584 | return -EINVAL; | ||
| 2585 | } | ||
| 2586 | |||
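The 203 in the comment above drops out of the arithmetic in the check: with a 4 KiB page, a roughly 32-byte r5l_meta_block header and about 20 bytes per member disk (a packed r5l_payload_data_parity plus one __le32 checksum), (4096 - 32) / 20 = 203. The struct sizes are taken from md_p.h and should be treated as approximate here; the names below are illustrative only.

/* Rough restatement of the bound checked above. */
enum {
	R5L_EXAMPLE_META_BYTES    = 32,		/* ~sizeof(struct r5l_meta_block) */
	R5L_EXAMPLE_PAYLOAD_BYTES = 16 + 4,	/* payload + one __le32 checksum */
	R5L_EXAMPLE_MAX_DISKS     =
		(4096 - R5L_EXAMPLE_META_BYTES) / R5L_EXAMPLE_PAYLOAD_BYTES, /* 203 */
};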
| 1191 | log = kzalloc(sizeof(*log), GFP_KERNEL); | 2587 | log = kzalloc(sizeof(*log), GFP_KERNEL); |
| 1192 | if (!log) | 2588 | if (!log) |
| 1193 | return -ENOMEM; | 2589 | return -ENOMEM; |
| @@ -1227,6 +2623,8 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev) | |||
| 1227 | log->rdev->mddev, "reclaim"); | 2623 | log->rdev->mddev, "reclaim"); |
| 1228 | if (!log->reclaim_thread) | 2624 | if (!log->reclaim_thread) |
| 1229 | goto reclaim_thread; | 2625 | goto reclaim_thread; |
| 2626 | log->reclaim_thread->timeout = R5C_RECLAIM_WAKEUP_INTERVAL; | ||
| 2627 | |||
| 1230 | init_waitqueue_head(&log->iounit_wait); | 2628 | init_waitqueue_head(&log->iounit_wait); |
| 1231 | 2629 | ||
| 1232 | INIT_LIST_HEAD(&log->no_mem_stripes); | 2630 | INIT_LIST_HEAD(&log->no_mem_stripes); |
| @@ -1234,6 +2632,13 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev) | |||
| 1234 | INIT_LIST_HEAD(&log->no_space_stripes); | 2632 | INIT_LIST_HEAD(&log->no_space_stripes); |
| 1235 | spin_lock_init(&log->no_space_stripes_lock); | 2633 | spin_lock_init(&log->no_space_stripes_lock); |
| 1236 | 2634 | ||
| 2635 | INIT_WORK(&log->deferred_io_work, r5l_submit_io_async); | ||
| 2636 | |||
| 2637 | log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH; | ||
| 2638 | INIT_LIST_HEAD(&log->stripe_in_journal_list); | ||
| 2639 | spin_lock_init(&log->stripe_in_journal_lock); | ||
| 2640 | atomic_set(&log->stripe_in_journal_count, 0); | ||
| 2641 | |||
| 1237 | if (r5l_load_log(log)) | 2642 | if (r5l_load_log(log)) |
| 1238 | goto error; | 2643 | goto error; |
| 1239 | 2644 | ||
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 5f9e28443c8a..06d7279bdd04 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c | |||
| @@ -70,19 +70,6 @@ module_param(devices_handle_discard_safely, bool, 0644); | |||
| 70 | MODULE_PARM_DESC(devices_handle_discard_safely, | 70 | MODULE_PARM_DESC(devices_handle_discard_safely, |
| 71 | "Set to Y if all devices in each array reliably return zeroes on reads from discarded regions"); | 71 | "Set to Y if all devices in each array reliably return zeroes on reads from discarded regions"); |
| 72 | static struct workqueue_struct *raid5_wq; | 72 | static struct workqueue_struct *raid5_wq; |
| 73 | /* | ||
| 74 | * Stripe cache | ||
| 75 | */ | ||
| 76 | |||
| 77 | #define NR_STRIPES 256 | ||
| 78 | #define STRIPE_SIZE PAGE_SIZE | ||
| 79 | #define STRIPE_SHIFT (PAGE_SHIFT - 9) | ||
| 80 | #define STRIPE_SECTORS (STRIPE_SIZE>>9) | ||
| 81 | #define IO_THRESHOLD 1 | ||
| 82 | #define BYPASS_THRESHOLD 1 | ||
| 83 | #define NR_HASH (PAGE_SIZE / sizeof(struct hlist_head)) | ||
| 84 | #define HASH_MASK (NR_HASH - 1) | ||
| 85 | #define MAX_STRIPE_BATCH 8 | ||
| 86 | 73 | ||
| 87 | static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect) | 74 | static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect) |
| 88 | { | 75 | { |
| @@ -126,64 +113,6 @@ static inline void unlock_all_device_hash_locks_irq(struct r5conf *conf) | |||
| 126 | local_irq_enable(); | 113 | local_irq_enable(); |
| 127 | } | 114 | } |
| 128 | 115 | ||
| 129 | /* bio's attached to a stripe+device for I/O are linked together in bi_sector | ||
| 130 | * order without overlap. There may be several bio's per stripe+device, and | ||
| 131 | * a bio could span several devices. | ||
| 132 | * When walking this list for a particular stripe+device, we must never proceed | ||
| 133 | * beyond a bio that extends past this device, as the next bio might no longer | ||
| 134 | * be valid. | ||
| 135 | * This function is used to determine the 'next' bio in the list, given the sector | ||
| 136 | * of the current stripe+device | ||
| 137 | */ | ||
| 138 | static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector) | ||
| 139 | { | ||
| 140 | int sectors = bio_sectors(bio); | ||
| 141 | if (bio->bi_iter.bi_sector + sectors < sector + STRIPE_SECTORS) | ||
| 142 | return bio->bi_next; | ||
| 143 | else | ||
| 144 | return NULL; | ||
| 145 | } | ||
| 146 | |||
| 147 | /* | ||
| 148 | * We maintain a biased count of active stripes in the bottom 16 bits of | ||
| 149 | * bi_phys_segments, and a count of processed stripes in the upper 16 bits | ||
| 150 | */ | ||
| 151 | static inline int raid5_bi_processed_stripes(struct bio *bio) | ||
| 152 | { | ||
| 153 | atomic_t *segments = (atomic_t *)&bio->bi_phys_segments; | ||
| 154 | return (atomic_read(segments) >> 16) & 0xffff; | ||
| 155 | } | ||
| 156 | |||
| 157 | static inline int raid5_dec_bi_active_stripes(struct bio *bio) | ||
| 158 | { | ||
| 159 | atomic_t *segments = (atomic_t *)&bio->bi_phys_segments; | ||
| 160 | return atomic_sub_return(1, segments) & 0xffff; | ||
| 161 | } | ||
| 162 | |||
| 163 | static inline void raid5_inc_bi_active_stripes(struct bio *bio) | ||
| 164 | { | ||
| 165 | atomic_t *segments = (atomic_t *)&bio->bi_phys_segments; | ||
| 166 | atomic_inc(segments); | ||
| 167 | } | ||
| 168 | |||
| 169 | static inline void raid5_set_bi_processed_stripes(struct bio *bio, | ||
| 170 | unsigned int cnt) | ||
| 171 | { | ||
| 172 | atomic_t *segments = (atomic_t *)&bio->bi_phys_segments; | ||
| 173 | int old, new; | ||
| 174 | |||
| 175 | do { | ||
| 176 | old = atomic_read(segments); | ||
| 177 | new = (old & 0xffff) | (cnt << 16); | ||
| 178 | } while (atomic_cmpxchg(segments, old, new) != old); | ||
| 179 | } | ||
| 180 | |||
| 181 | static inline void raid5_set_bi_stripes(struct bio *bio, unsigned int cnt) | ||
| 182 | { | ||
| 183 | atomic_t *segments = (atomic_t *)&bio->bi_phys_segments; | ||
| 184 | atomic_set(segments, cnt); | ||
| 185 | } | ||
| 186 | |||
| 187 | /* Find first data disk in a raid6 stripe */ | 116 | /* Find first data disk in a raid6 stripe */ |
| 188 | static inline int raid6_d0(struct stripe_head *sh) | 117 | static inline int raid6_d0(struct stripe_head *sh) |
| 189 | { | 118 | { |
| @@ -289,8 +218,27 @@ static void raid5_wakeup_stripe_thread(struct stripe_head *sh) | |||
| 289 | static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh, | 218 | static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh, |
| 290 | struct list_head *temp_inactive_list) | 219 | struct list_head *temp_inactive_list) |
| 291 | { | 220 | { |
| 221 | int i; | ||
| 222 | int injournal = 0; /* number of data pages with R5_InJournal */ | ||
| 223 | |||
| 292 | BUG_ON(!list_empty(&sh->lru)); | 224 | BUG_ON(!list_empty(&sh->lru)); |
| 293 | BUG_ON(atomic_read(&conf->active_stripes)==0); | 225 | BUG_ON(atomic_read(&conf->active_stripes)==0); |
| 226 | |||
| 227 | if (r5c_is_writeback(conf->log)) | ||
| 228 | for (i = sh->disks; i--; ) | ||
| 229 | if (test_bit(R5_InJournal, &sh->dev[i].flags)) | ||
| 230 | injournal++; | ||
| 231 | /* | ||
| 232 | * When quiescing in r5c write back, set STRIPE_HANDLE for stripes with | ||
| 233 | * data in journal, so they are not released to cached lists | ||
| 234 | */ | ||
| 235 | if (conf->quiesce && r5c_is_writeback(conf->log) && | ||
| 236 | !test_bit(STRIPE_HANDLE, &sh->state) && injournal != 0) { | ||
| 237 | if (test_bit(STRIPE_R5C_CACHING, &sh->state)) | ||
| 238 | r5c_make_stripe_write_out(sh); | ||
| 239 | set_bit(STRIPE_HANDLE, &sh->state); | ||
| 240 | } | ||
| 241 | |||
| 294 | if (test_bit(STRIPE_HANDLE, &sh->state)) { | 242 | if (test_bit(STRIPE_HANDLE, &sh->state)) { |
| 295 | if (test_bit(STRIPE_DELAYED, &sh->state) && | 243 | if (test_bit(STRIPE_DELAYED, &sh->state) && |
| 296 | !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) | 244 | !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) |
| @@ -316,8 +264,30 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh, | |||
| 316 | < IO_THRESHOLD) | 264 | < IO_THRESHOLD) |
| 317 | md_wakeup_thread(conf->mddev->thread); | 265 | md_wakeup_thread(conf->mddev->thread); |
| 318 | atomic_dec(&conf->active_stripes); | 266 | atomic_dec(&conf->active_stripes); |
| 319 | if (!test_bit(STRIPE_EXPANDING, &sh->state)) | 267 | if (!test_bit(STRIPE_EXPANDING, &sh->state)) { |
| 320 | list_add_tail(&sh->lru, temp_inactive_list); | 268 | if (!r5c_is_writeback(conf->log)) |
| 269 | list_add_tail(&sh->lru, temp_inactive_list); | ||
| 270 | else { | ||
| 271 | WARN_ON(test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags)); | ||
| 272 | if (injournal == 0) | ||
| 273 | list_add_tail(&sh->lru, temp_inactive_list); | ||
| 274 | else if (injournal == conf->raid_disks - conf->max_degraded) { | ||
| 275 | /* full stripe */ | ||
| 276 | if (!test_and_set_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) | ||
| 277 | atomic_inc(&conf->r5c_cached_full_stripes); | ||
| 278 | if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) | ||
| 279 | atomic_dec(&conf->r5c_cached_partial_stripes); | ||
| 280 | list_add_tail(&sh->lru, &conf->r5c_full_stripe_list); | ||
| 281 | r5c_check_cached_full_stripe(conf); | ||
| 282 | } else { | ||
| 283 | /* partial stripe */ | ||
| 284 | if (!test_and_set_bit(STRIPE_R5C_PARTIAL_STRIPE, | ||
| 285 | &sh->state)) | ||
| 286 | atomic_inc(&conf->r5c_cached_partial_stripes); | ||
| 287 | list_add_tail(&sh->lru, &conf->r5c_partial_stripe_list); | ||
| 288 | } | ||
| 289 | } | ||
| 290 | } | ||
| 321 | } | 291 | } |
| 322 | } | 292 | } |
| 323 | 293 | ||
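The cached-list placement added to do_release_stripe() above is pure arithmetic: a stripe is "full" once every data block is in the journal, i.e. injournal equals raid_disks - max_degraded. A small illustration of that threshold (the helper and the disk counts in the comment are examples, not code from the patch):

/* Example of the full/partial classification used above.  For a 5-disk
 * RAID5 (max_degraded == 1) a stripe has 4 data blocks, so injournal == 4
 * sends it to r5c_full_stripe_list; 1..3 keep it on the partial list. */
static bool r5c_stripe_is_full(int raid_disks, int max_degraded, int injournal)
{
	return injournal == raid_disks - max_degraded;
}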
| @@ -541,7 +511,7 @@ retry: | |||
| 541 | 511 | ||
| 542 | if (dev->toread || dev->read || dev->towrite || dev->written || | 512 | if (dev->toread || dev->read || dev->towrite || dev->written || |
| 543 | test_bit(R5_LOCKED, &dev->flags)) { | 513 | test_bit(R5_LOCKED, &dev->flags)) { |
| 544 | printk(KERN_ERR "sector=%llx i=%d %p %p %p %p %d\n", | 514 | pr_err("sector=%llx i=%d %p %p %p %p %d\n", |
| 545 | (unsigned long long)sh->sector, i, dev->toread, | 515 | (unsigned long long)sh->sector, i, dev->toread, |
| 546 | dev->read, dev->towrite, dev->written, | 516 | dev->read, dev->towrite, dev->written, |
| 547 | test_bit(R5_LOCKED, &dev->flags)); | 517 | test_bit(R5_LOCKED, &dev->flags)); |
| @@ -680,9 +650,12 @@ raid5_get_active_stripe(struct r5conf *conf, sector_t sector, | |||
| 680 | } | 650 | } |
| 681 | if (noblock && sh == NULL) | 651 | if (noblock && sh == NULL) |
| 682 | break; | 652 | break; |
| 653 | |||
| 654 | r5c_check_stripe_cache_usage(conf); | ||
| 683 | if (!sh) { | 655 | if (!sh) { |
| 684 | set_bit(R5_INACTIVE_BLOCKED, | 656 | set_bit(R5_INACTIVE_BLOCKED, |
| 685 | &conf->cache_state); | 657 | &conf->cache_state); |
| 658 | r5l_wake_reclaim(conf->log, 0); | ||
| 686 | wait_event_lock_irq( | 659 | wait_event_lock_irq( |
| 687 | conf->wait_for_stripe, | 660 | conf->wait_for_stripe, |
| 688 | !list_empty(conf->inactive_list + hash) && | 661 | !list_empty(conf->inactive_list + hash) && |
| @@ -901,8 +874,19 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) | |||
| 901 | 874 | ||
| 902 | might_sleep(); | 875 | might_sleep(); |
| 903 | 876 | ||
| 904 | if (r5l_write_stripe(conf->log, sh) == 0) | 877 | if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) { |
| 905 | return; | 878 | /* writing out phase */ |
| 879 | if (s->waiting_extra_page) | ||
| 880 | return; | ||
| 881 | if (r5l_write_stripe(conf->log, sh) == 0) | ||
| 882 | return; | ||
| 883 | } else { /* caching phase */ | ||
| 884 | if (test_bit(STRIPE_LOG_TRAPPED, &sh->state)) { | ||
| 885 | r5c_cache_data(conf->log, sh, s); | ||
| 886 | return; | ||
| 887 | } | ||
| 888 | } | ||
| 889 | |||
| 906 | for (i = disks; i--; ) { | 890 | for (i = disks; i--; ) { |
| 907 | int op, op_flags = 0; | 891 | int op, op_flags = 0; |
| 908 | int replace_only = 0; | 892 | int replace_only = 0; |
| @@ -977,7 +961,7 @@ again: | |||
| 977 | if (bad < 0) { | 961 | if (bad < 0) { |
| 978 | set_bit(BlockedBadBlocks, &rdev->flags); | 962 | set_bit(BlockedBadBlocks, &rdev->flags); |
| 979 | if (!conf->mddev->external && | 963 | if (!conf->mddev->external && |
| 980 | conf->mddev->flags) { | 964 | conf->mddev->sb_flags) { |
| 981 | /* It is very unlikely, but we might | 965 | /* It is very unlikely, but we might |
| 982 | * still need to write out the | 966 | * still need to write out the |
| 983 | * bad block log - better give it | 967 | * bad block log - better give it |
| @@ -1115,7 +1099,7 @@ again: | |||
| 1115 | static struct dma_async_tx_descriptor * | 1099 | static struct dma_async_tx_descriptor * |
| 1116 | async_copy_data(int frombio, struct bio *bio, struct page **page, | 1100 | async_copy_data(int frombio, struct bio *bio, struct page **page, |
| 1117 | sector_t sector, struct dma_async_tx_descriptor *tx, | 1101 | sector_t sector, struct dma_async_tx_descriptor *tx, |
| 1118 | struct stripe_head *sh) | 1102 | struct stripe_head *sh, int no_skipcopy) |
| 1119 | { | 1103 | { |
| 1120 | struct bio_vec bvl; | 1104 | struct bio_vec bvl; |
| 1121 | struct bvec_iter iter; | 1105 | struct bvec_iter iter; |
| @@ -1155,7 +1139,8 @@ async_copy_data(int frombio, struct bio *bio, struct page **page, | |||
| 1155 | if (frombio) { | 1139 | if (frombio) { |
| 1156 | if (sh->raid_conf->skip_copy && | 1140 | if (sh->raid_conf->skip_copy && |
| 1157 | b_offset == 0 && page_offset == 0 && | 1141 | b_offset == 0 && page_offset == 0 && |
| 1158 | clen == STRIPE_SIZE) | 1142 | clen == STRIPE_SIZE && |
| 1143 | !no_skipcopy) | ||
| 1159 | *page = bio_page; | 1144 | *page = bio_page; |
| 1160 | else | 1145 | else |
| 1161 | tx = async_memcpy(*page, bio_page, page_offset, | 1146 | tx = async_memcpy(*page, bio_page, page_offset, |
| @@ -1237,7 +1222,7 @@ static void ops_run_biofill(struct stripe_head *sh) | |||
| 1237 | while (rbi && rbi->bi_iter.bi_sector < | 1222 | while (rbi && rbi->bi_iter.bi_sector < |
| 1238 | dev->sector + STRIPE_SECTORS) { | 1223 | dev->sector + STRIPE_SECTORS) { |
| 1239 | tx = async_copy_data(0, rbi, &dev->page, | 1224 | tx = async_copy_data(0, rbi, &dev->page, |
| 1240 | dev->sector, tx, sh); | 1225 | dev->sector, tx, sh, 0); |
| 1241 | rbi = r5_next_bio(rbi, dev->sector); | 1226 | rbi = r5_next_bio(rbi, dev->sector); |
| 1242 | } | 1227 | } |
| 1243 | } | 1228 | } |
| @@ -1364,10 +1349,15 @@ static int set_syndrome_sources(struct page **srcs, | |||
| 1364 | if (i == sh->qd_idx || i == sh->pd_idx || | 1349 | if (i == sh->qd_idx || i == sh->pd_idx || |
| 1365 | (srctype == SYNDROME_SRC_ALL) || | 1350 | (srctype == SYNDROME_SRC_ALL) || |
| 1366 | (srctype == SYNDROME_SRC_WANT_DRAIN && | 1351 | (srctype == SYNDROME_SRC_WANT_DRAIN && |
| 1367 | test_bit(R5_Wantdrain, &dev->flags)) || | 1352 | (test_bit(R5_Wantdrain, &dev->flags) || |
| 1353 | test_bit(R5_InJournal, &dev->flags))) || | ||
| 1368 | (srctype == SYNDROME_SRC_WRITTEN && | 1354 | (srctype == SYNDROME_SRC_WRITTEN && |
| 1369 | dev->written)) | 1355 | dev->written)) { |
| 1370 | srcs[slot] = sh->dev[i].page; | 1356 | if (test_bit(R5_InJournal, &dev->flags)) |
| 1357 | srcs[slot] = sh->dev[i].orig_page; | ||
| 1358 | else | ||
| 1359 | srcs[slot] = sh->dev[i].page; | ||
| 1360 | } | ||
| 1371 | i = raid6_next_disk(i, disks); | 1361 | i = raid6_next_disk(i, disks); |
| 1372 | } while (i != d0_idx); | 1362 | } while (i != d0_idx); |
| 1373 | 1363 | ||
| @@ -1546,6 +1536,13 @@ static void ops_complete_prexor(void *stripe_head_ref) | |||
| 1546 | 1536 | ||
| 1547 | pr_debug("%s: stripe %llu\n", __func__, | 1537 | pr_debug("%s: stripe %llu\n", __func__, |
| 1548 | (unsigned long long)sh->sector); | 1538 | (unsigned long long)sh->sector); |
| 1539 | |||
| 1540 | if (r5c_is_writeback(sh->raid_conf->log)) | ||
| 1541 | /* | ||
| 1542 | * raid5-cache write back uses orig_page during prexor. | ||
| 1543 | * After prexor, it is time to free orig_page | ||
| 1544 | */ | ||
| 1545 | r5c_release_extra_page(sh); | ||
| 1549 | } | 1546 | } |
| 1550 | 1547 | ||
| 1551 | static struct dma_async_tx_descriptor * | 1548 | static struct dma_async_tx_descriptor * |
| @@ -1567,7 +1564,9 @@ ops_run_prexor5(struct stripe_head *sh, struct raid5_percpu *percpu, | |||
| 1567 | for (i = disks; i--; ) { | 1564 | for (i = disks; i--; ) { |
| 1568 | struct r5dev *dev = &sh->dev[i]; | 1565 | struct r5dev *dev = &sh->dev[i]; |
| 1569 | /* Only process blocks that are known to be uptodate */ | 1566 | /* Only process blocks that are known to be uptodate */ |
| 1570 | if (test_bit(R5_Wantdrain, &dev->flags)) | 1567 | if (test_bit(R5_InJournal, &dev->flags)) |
| 1568 | xor_srcs[count++] = dev->orig_page; | ||
| 1569 | else if (test_bit(R5_Wantdrain, &dev->flags)) | ||
| 1571 | xor_srcs[count++] = dev->page; | 1570 | xor_srcs[count++] = dev->page; |
| 1572 | } | 1571 | } |
| 1573 | 1572 | ||
| @@ -1601,6 +1600,7 @@ ops_run_prexor6(struct stripe_head *sh, struct raid5_percpu *percpu, | |||
| 1601 | static struct dma_async_tx_descriptor * | 1600 | static struct dma_async_tx_descriptor * |
| 1602 | ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) | 1601 | ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) |
| 1603 | { | 1602 | { |
| 1603 | struct r5conf *conf = sh->raid_conf; | ||
| 1604 | int disks = sh->disks; | 1604 | int disks = sh->disks; |
| 1605 | int i; | 1605 | int i; |
| 1606 | struct stripe_head *head_sh = sh; | 1606 | struct stripe_head *head_sh = sh; |
| @@ -1618,6 +1618,11 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) | |||
| 1618 | 1618 | ||
| 1619 | again: | 1619 | again: |
| 1620 | dev = &sh->dev[i]; | 1620 | dev = &sh->dev[i]; |
| 1621 | /* | ||
| 1622 | * clear R5_InJournal, so when rewriting a page in | ||
| 1623 | * journal, it is not skipped by r5l_log_stripe() | ||
| 1624 | */ | ||
| 1625 | clear_bit(R5_InJournal, &dev->flags); | ||
| 1621 | spin_lock_irq(&sh->stripe_lock); | 1626 | spin_lock_irq(&sh->stripe_lock); |
| 1622 | chosen = dev->towrite; | 1627 | chosen = dev->towrite; |
| 1623 | dev->towrite = NULL; | 1628 | dev->towrite = NULL; |
| @@ -1637,8 +1642,10 @@ again: | |||
| 1637 | set_bit(R5_Discard, &dev->flags); | 1642 | set_bit(R5_Discard, &dev->flags); |
| 1638 | else { | 1643 | else { |
| 1639 | tx = async_copy_data(1, wbi, &dev->page, | 1644 | tx = async_copy_data(1, wbi, &dev->page, |
| 1640 | dev->sector, tx, sh); | 1645 | dev->sector, tx, sh, |
| 1641 | if (dev->page != dev->orig_page) { | 1646 | r5c_is_writeback(conf->log)); |
| 1647 | if (dev->page != dev->orig_page && | ||
| 1648 | !r5c_is_writeback(conf->log)) { | ||
| 1642 | set_bit(R5_SkipCopy, &dev->flags); | 1649 | set_bit(R5_SkipCopy, &dev->flags); |
| 1643 | clear_bit(R5_UPTODATE, &dev->flags); | 1650 | clear_bit(R5_UPTODATE, &dev->flags); |
| 1644 | clear_bit(R5_OVERWRITE, &dev->flags); | 1651 | clear_bit(R5_OVERWRITE, &dev->flags); |
| @@ -1746,7 +1753,8 @@ again: | |||
| 1746 | xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; | 1753 | xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; |
| 1747 | for (i = disks; i--; ) { | 1754 | for (i = disks; i--; ) { |
| 1748 | struct r5dev *dev = &sh->dev[i]; | 1755 | struct r5dev *dev = &sh->dev[i]; |
| 1749 | if (head_sh->dev[i].written) | 1756 | if (head_sh->dev[i].written || |
| 1757 | test_bit(R5_InJournal, &head_sh->dev[i].flags)) | ||
| 1750 | xor_srcs[count++] = dev->page; | 1758 | xor_srcs[count++] = dev->page; |
| 1751 | } | 1759 | } |
| 1752 | } else { | 1760 | } else { |
| @@ -2000,7 +2008,10 @@ static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp, | |||
| 2000 | spin_lock_init(&sh->batch_lock); | 2008 | spin_lock_init(&sh->batch_lock); |
| 2001 | INIT_LIST_HEAD(&sh->batch_list); | 2009 | INIT_LIST_HEAD(&sh->batch_list); |
| 2002 | INIT_LIST_HEAD(&sh->lru); | 2010 | INIT_LIST_HEAD(&sh->lru); |
| 2011 | INIT_LIST_HEAD(&sh->r5c); | ||
| 2012 | INIT_LIST_HEAD(&sh->log_list); | ||
| 2003 | atomic_set(&sh->count, 1); | 2013 | atomic_set(&sh->count, 1); |
| 2014 | sh->log_start = MaxSector; | ||
| 2004 | for (i = 0; i < disks; i++) { | 2015 | for (i = 0; i < disks; i++) { |
| 2005 | struct r5dev *dev = &sh->dev[i]; | 2016 | struct r5dev *dev = &sh->dev[i]; |
| 2006 | 2017 | ||
| @@ -2240,10 +2251,24 @@ static int resize_stripes(struct r5conf *conf, int newsize) | |||
| 2240 | */ | 2251 | */ |
| 2241 | ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO); | 2252 | ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO); |
| 2242 | if (ndisks) { | 2253 | if (ndisks) { |
| 2243 | for (i=0; i<conf->raid_disks; i++) | 2254 | for (i = 0; i < conf->pool_size; i++) |
| 2244 | ndisks[i] = conf->disks[i]; | 2255 | ndisks[i] = conf->disks[i]; |
| 2245 | kfree(conf->disks); | 2256 | |
| 2246 | conf->disks = ndisks; | 2257 | for (i = conf->pool_size; i < newsize; i++) { |
| 2258 | ndisks[i].extra_page = alloc_page(GFP_NOIO); | ||
| 2259 | if (!ndisks[i].extra_page) | ||
| 2260 | err = -ENOMEM; | ||
| 2261 | } | ||
| 2262 | |||
| 2263 | if (err) { | ||
| 2264 | for (i = conf->pool_size; i < newsize; i++) | ||
| 2265 | if (ndisks[i].extra_page) | ||
| 2266 | put_page(ndisks[i].extra_page); | ||
| 2267 | kfree(ndisks); | ||
| 2268 | } else { | ||
| 2269 | kfree(conf->disks); | ||
| 2270 | conf->disks = ndisks; | ||
| 2271 | } | ||
| 2247 | } else | 2272 | } else |
| 2248 | err = -ENOMEM; | 2273 | err = -ENOMEM; |
| 2249 | 2274 | ||
| @@ -2342,10 +2367,8 @@ static void raid5_end_read_request(struct bio * bi) | |||
| 2342 | * replacement device. We just fail those on | 2367 | * replacement device. We just fail those on |
| 2343 | * any error | 2368 | * any error |
| 2344 | */ | 2369 | */ |
| 2345 | printk_ratelimited( | 2370 | pr_info_ratelimited( |
| 2346 | KERN_INFO | 2371 | "md/raid:%s: read error corrected (%lu sectors at %llu on %s)\n", |
| 2347 | "md/raid:%s: read error corrected" | ||
| 2348 | " (%lu sectors at %llu on %s)\n", | ||
| 2349 | mdname(conf->mddev), STRIPE_SECTORS, | 2372 | mdname(conf->mddev), STRIPE_SECTORS, |
| 2350 | (unsigned long long)s, | 2373 | (unsigned long long)s, |
| 2351 | bdevname(rdev->bdev, b)); | 2374 | bdevname(rdev->bdev, b)); |
| @@ -2365,36 +2388,29 @@ static void raid5_end_read_request(struct bio * bi) | |||
| 2365 | clear_bit(R5_UPTODATE, &sh->dev[i].flags); | 2388 | clear_bit(R5_UPTODATE, &sh->dev[i].flags); |
| 2366 | atomic_inc(&rdev->read_errors); | 2389 | atomic_inc(&rdev->read_errors); |
| 2367 | if (test_bit(R5_ReadRepl, &sh->dev[i].flags)) | 2390 | if (test_bit(R5_ReadRepl, &sh->dev[i].flags)) |
| 2368 | printk_ratelimited( | 2391 | pr_warn_ratelimited( |
| 2369 | KERN_WARNING | 2392 | "md/raid:%s: read error on replacement device (sector %llu on %s).\n", |
| 2370 | "md/raid:%s: read error on replacement device " | ||
| 2371 | "(sector %llu on %s).\n", | ||
| 2372 | mdname(conf->mddev), | 2393 | mdname(conf->mddev), |
| 2373 | (unsigned long long)s, | 2394 | (unsigned long long)s, |
| 2374 | bdn); | 2395 | bdn); |
| 2375 | else if (conf->mddev->degraded >= conf->max_degraded) { | 2396 | else if (conf->mddev->degraded >= conf->max_degraded) { |
| 2376 | set_bad = 1; | 2397 | set_bad = 1; |
| 2377 | printk_ratelimited( | 2398 | pr_warn_ratelimited( |
| 2378 | KERN_WARNING | 2399 | "md/raid:%s: read error not correctable (sector %llu on %s).\n", |
| 2379 | "md/raid:%s: read error not correctable " | ||
| 2380 | "(sector %llu on %s).\n", | ||
| 2381 | mdname(conf->mddev), | 2400 | mdname(conf->mddev), |
| 2382 | (unsigned long long)s, | 2401 | (unsigned long long)s, |
| 2383 | bdn); | 2402 | bdn); |
| 2384 | } else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) { | 2403 | } else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) { |
| 2385 | /* Oh, no!!! */ | 2404 | /* Oh, no!!! */ |
| 2386 | set_bad = 1; | 2405 | set_bad = 1; |
| 2387 | printk_ratelimited( | 2406 | pr_warn_ratelimited( |
| 2388 | KERN_WARNING | 2407 | "md/raid:%s: read error NOT corrected!! (sector %llu on %s).\n", |
| 2389 | "md/raid:%s: read error NOT corrected!! " | ||
| 2390 | "(sector %llu on %s).\n", | ||
| 2391 | mdname(conf->mddev), | 2408 | mdname(conf->mddev), |
| 2392 | (unsigned long long)s, | 2409 | (unsigned long long)s, |
| 2393 | bdn); | 2410 | bdn); |
| 2394 | } else if (atomic_read(&rdev->read_errors) | 2411 | } else if (atomic_read(&rdev->read_errors) |
| 2395 | > conf->max_nr_stripes) | 2412 | > conf->max_nr_stripes) |
| 2396 | printk(KERN_WARNING | 2413 | pr_warn("md/raid:%s: Too many read errors, failing device %s.\n", |
| 2397 | "md/raid:%s: Too many read errors, failing device %s.\n", | ||
| 2398 | mdname(conf->mddev), bdn); | 2414 | mdname(conf->mddev), bdn); |
| 2399 | else | 2415 | else |
| 2400 | retry = 1; | 2416 | retry = 1; |
| @@ -2526,15 +2542,14 @@ static void raid5_error(struct mddev *mddev, struct md_rdev *rdev) | |||
| 2526 | 2542 | ||
| 2527 | set_bit(Blocked, &rdev->flags); | 2543 | set_bit(Blocked, &rdev->flags); |
| 2528 | set_bit(Faulty, &rdev->flags); | 2544 | set_bit(Faulty, &rdev->flags); |
| 2529 | set_mask_bits(&mddev->flags, 0, | 2545 | set_mask_bits(&mddev->sb_flags, 0, |
| 2530 | BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING)); | 2546 | BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING)); |
| 2531 | printk(KERN_ALERT | 2547 | pr_crit("md/raid:%s: Disk failure on %s, disabling device.\n" |
| 2532 | "md/raid:%s: Disk failure on %s, disabling device.\n" | 2548 | "md/raid:%s: Operation continuing on %d devices.\n", |
| 2533 | "md/raid:%s: Operation continuing on %d devices.\n", | 2549 | mdname(mddev), |
| 2534 | mdname(mddev), | 2550 | bdevname(rdev->bdev, b), |
| 2535 | bdevname(rdev->bdev, b), | 2551 | mdname(mddev), |
| 2536 | mdname(mddev), | 2552 | conf->raid_disks - mddev->degraded); |
| 2537 | conf->raid_disks - mddev->degraded); | ||
| 2538 | } | 2553 | } |
| 2539 | 2554 | ||
| 2540 | /* | 2555 | /* |
| @@ -2856,8 +2871,8 @@ sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous) | |||
| 2856 | previous, &dummy1, &sh2); | 2871 | previous, &dummy1, &sh2); |
| 2857 | if (check != sh->sector || dummy1 != dd_idx || sh2.pd_idx != sh->pd_idx | 2872 | if (check != sh->sector || dummy1 != dd_idx || sh2.pd_idx != sh->pd_idx |
| 2858 | || sh2.qd_idx != sh->qd_idx) { | 2873 | || sh2.qd_idx != sh->qd_idx) { |
| 2859 | printk(KERN_ERR "md/raid:%s: compute_blocknr: map not correct\n", | 2874 | pr_warn("md/raid:%s: compute_blocknr: map not correct\n", |
| 2860 | mdname(conf->mddev)); | 2875 | mdname(conf->mddev)); |
| 2861 | return 0; | 2876 | return 0; |
| 2862 | } | 2877 | } |
| 2863 | return r_sector; | 2878 | return r_sector; |
| @@ -2872,6 +2887,13 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s, | |||
| 2872 | int level = conf->level; | 2887 | int level = conf->level; |
| 2873 | 2888 | ||
| 2874 | if (rcw) { | 2889 | if (rcw) { |
| 2890 | /* | ||
| 2891 | * In some cases, handle_stripe_dirtying initially decided to | ||
| 2892 | * run rmw and allocated an extra page for prexor. However, rcw is | ||
| 2893 | * cheaper later on. We need to free the extra page now, | ||
| 2894 | * because we won't be able to do that in ops_complete_prexor(). | ||
| 2895 | */ | ||
| 2896 | r5c_release_extra_page(sh); | ||
| 2875 | 2897 | ||
| 2876 | for (i = disks; i--; ) { | 2898 | for (i = disks; i--; ) { |
| 2877 | struct r5dev *dev = &sh->dev[i]; | 2899 | struct r5dev *dev = &sh->dev[i]; |
| @@ -2882,6 +2904,9 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s, | |||
| 2882 | if (!expand) | 2904 | if (!expand) |
| 2883 | clear_bit(R5_UPTODATE, &dev->flags); | 2905 | clear_bit(R5_UPTODATE, &dev->flags); |
| 2884 | s->locked++; | 2906 | s->locked++; |
| 2907 | } else if (test_bit(R5_InJournal, &dev->flags)) { | ||
| 2908 | set_bit(R5_LOCKED, &dev->flags); | ||
| 2909 | s->locked++; | ||
| 2885 | } | 2910 | } |
| 2886 | } | 2911 | } |
| 2887 | /* if we are not expanding this is a proper write request, and | 2912 | /* if we are not expanding this is a proper write request, and |
| @@ -2921,6 +2946,9 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s, | |||
| 2921 | set_bit(R5_LOCKED, &dev->flags); | 2946 | set_bit(R5_LOCKED, &dev->flags); |
| 2922 | clear_bit(R5_UPTODATE, &dev->flags); | 2947 | clear_bit(R5_UPTODATE, &dev->flags); |
| 2923 | s->locked++; | 2948 | s->locked++; |
| 2949 | } else if (test_bit(R5_InJournal, &dev->flags)) { | ||
| 2950 | set_bit(R5_LOCKED, &dev->flags); | ||
| 2951 | s->locked++; | ||
| 2924 | } | 2952 | } |
| 2925 | } | 2953 | } |
| 2926 | if (!s->locked) | 2954 | if (!s->locked) |
| @@ -3564,10 +3592,10 @@ unhash: | |||
| 3564 | break_stripe_batch_list(head_sh, STRIPE_EXPAND_SYNC_FLAGS); | 3592 | break_stripe_batch_list(head_sh, STRIPE_EXPAND_SYNC_FLAGS); |
| 3565 | } | 3593 | } |
| 3566 | 3594 | ||
| 3567 | static void handle_stripe_dirtying(struct r5conf *conf, | 3595 | static int handle_stripe_dirtying(struct r5conf *conf, |
| 3568 | struct stripe_head *sh, | 3596 | struct stripe_head *sh, |
| 3569 | struct stripe_head_state *s, | 3597 | struct stripe_head_state *s, |
| 3570 | int disks) | 3598 | int disks) |
| 3571 | { | 3599 | { |
| 3572 | int rmw = 0, rcw = 0, i; | 3600 | int rmw = 0, rcw = 0, i; |
| 3573 | sector_t recovery_cp = conf->mddev->recovery_cp; | 3601 | sector_t recovery_cp = conf->mddev->recovery_cp; |
| @@ -3592,9 +3620,12 @@ static void handle_stripe_dirtying(struct r5conf *conf, | |||
| 3592 | } else for (i = disks; i--; ) { | 3620 | } else for (i = disks; i--; ) { |
| 3593 | /* would I have to read this buffer for read_modify_write */ | 3621 | /* would I have to read this buffer for read_modify_write */ |
| 3594 | struct r5dev *dev = &sh->dev[i]; | 3622 | struct r5dev *dev = &sh->dev[i]; |
| 3595 | if ((dev->towrite || i == sh->pd_idx || i == sh->qd_idx) && | 3623 | if ((dev->towrite || i == sh->pd_idx || i == sh->qd_idx || |
| 3624 | test_bit(R5_InJournal, &dev->flags)) && | ||
| 3596 | !test_bit(R5_LOCKED, &dev->flags) && | 3625 | !test_bit(R5_LOCKED, &dev->flags) && |
| 3597 | !(test_bit(R5_UPTODATE, &dev->flags) || | 3626 | !((test_bit(R5_UPTODATE, &dev->flags) && |
| 3627 | (!test_bit(R5_InJournal, &dev->flags) || | ||
| 3628 | dev->page != dev->orig_page)) || | ||
| 3598 | test_bit(R5_Wantcompute, &dev->flags))) { | 3629 | test_bit(R5_Wantcompute, &dev->flags))) { |
| 3599 | if (test_bit(R5_Insync, &dev->flags)) | 3630 | if (test_bit(R5_Insync, &dev->flags)) |
| 3600 | rmw++; | 3631 | rmw++; |
| @@ -3606,13 +3637,15 @@ static void handle_stripe_dirtying(struct r5conf *conf, | |||
| 3606 | i != sh->pd_idx && i != sh->qd_idx && | 3637 | i != sh->pd_idx && i != sh->qd_idx && |
| 3607 | !test_bit(R5_LOCKED, &dev->flags) && | 3638 | !test_bit(R5_LOCKED, &dev->flags) && |
| 3608 | !(test_bit(R5_UPTODATE, &dev->flags) || | 3639 | !(test_bit(R5_UPTODATE, &dev->flags) || |
| 3609 | test_bit(R5_Wantcompute, &dev->flags))) { | 3640 | test_bit(R5_InJournal, &dev->flags) || |
| 3641 | test_bit(R5_Wantcompute, &dev->flags))) { | ||
| 3610 | if (test_bit(R5_Insync, &dev->flags)) | 3642 | if (test_bit(R5_Insync, &dev->flags)) |
| 3611 | rcw++; | 3643 | rcw++; |
| 3612 | else | 3644 | else |
| 3613 | rcw += 2*disks; | 3645 | rcw += 2*disks; |
| 3614 | } | 3646 | } |
| 3615 | } | 3647 | } |
| 3648 | |||
| 3616 | pr_debug("for sector %llu, rmw=%d rcw=%d\n", | 3649 | pr_debug("for sector %llu, rmw=%d rcw=%d\n", |
| 3617 | (unsigned long long)sh->sector, rmw, rcw); | 3650 | (unsigned long long)sh->sector, rmw, rcw); |
| 3618 | set_bit(STRIPE_HANDLE, &sh->state); | 3651 | set_bit(STRIPE_HANDLE, &sh->state); |
| @@ -3624,10 +3657,44 @@ static void handle_stripe_dirtying(struct r5conf *conf, | |||
| 3624 | (unsigned long long)sh->sector, rmw); | 3657 | (unsigned long long)sh->sector, rmw); |
| 3625 | for (i = disks; i--; ) { | 3658 | for (i = disks; i--; ) { |
| 3626 | struct r5dev *dev = &sh->dev[i]; | 3659 | struct r5dev *dev = &sh->dev[i]; |
| 3627 | if ((dev->towrite || i == sh->pd_idx || i == sh->qd_idx) && | 3660 | if (test_bit(R5_InJournal, &dev->flags) && |
| 3661 | dev->page == dev->orig_page && | ||
| 3662 | !test_bit(R5_LOCKED, &sh->dev[sh->pd_idx].flags)) { | ||
| 3663 | /* alloc page for prexor */ | ||
| 3664 | struct page *p = alloc_page(GFP_NOIO); | ||
| 3665 | |||
| 3666 | if (p) { | ||
| 3667 | dev->orig_page = p; | ||
| 3668 | continue; | ||
| 3669 | } | ||
| 3670 | |||
| 3671 | /* | ||
| 3672 | * alloc_page() failed, try use | ||
| 3673 | * disk_info->extra_page | ||
| 3674 | */ | ||
| 3675 | if (!test_and_set_bit(R5C_EXTRA_PAGE_IN_USE, | ||
| 3676 | &conf->cache_state)) { | ||
| 3677 | r5c_use_extra_page(sh); | ||
| 3678 | break; | ||
| 3679 | } | ||
| 3680 | |||
| 3681 | /* extra_page in use, add to delayed_list */ | ||
| 3682 | set_bit(STRIPE_DELAYED, &sh->state); | ||
| 3683 | s->waiting_extra_page = 1; | ||
| 3684 | return -EAGAIN; | ||
| 3685 | } | ||
| 3686 | } | ||
| 3687 | |||
| 3688 | for (i = disks; i--; ) { | ||
| 3689 | struct r5dev *dev = &sh->dev[i]; | ||
| 3690 | if ((dev->towrite || | ||
| 3691 | i == sh->pd_idx || i == sh->qd_idx || | ||
| 3692 | test_bit(R5_InJournal, &dev->flags)) && | ||
| 3628 | !test_bit(R5_LOCKED, &dev->flags) && | 3693 | !test_bit(R5_LOCKED, &dev->flags) && |
| 3629 | !(test_bit(R5_UPTODATE, &dev->flags) || | 3694 | !((test_bit(R5_UPTODATE, &dev->flags) && |
| 3630 | test_bit(R5_Wantcompute, &dev->flags)) && | 3695 | (!test_bit(R5_InJournal, &dev->flags) || |
| 3696 | dev->page != dev->orig_page)) || | ||
| 3697 | test_bit(R5_Wantcompute, &dev->flags)) && | ||
| 3631 | test_bit(R5_Insync, &dev->flags)) { | 3698 | test_bit(R5_Insync, &dev->flags)) { |
| 3632 | if (test_bit(STRIPE_PREREAD_ACTIVE, | 3699 | if (test_bit(STRIPE_PREREAD_ACTIVE, |
| 3633 | &sh->state)) { | 3700 | &sh->state)) { |
| @@ -3653,6 +3720,7 @@ static void handle_stripe_dirtying(struct r5conf *conf, | |||
| 3653 | i != sh->pd_idx && i != sh->qd_idx && | 3720 | i != sh->pd_idx && i != sh->qd_idx && |
| 3654 | !test_bit(R5_LOCKED, &dev->flags) && | 3721 | !test_bit(R5_LOCKED, &dev->flags) && |
| 3655 | !(test_bit(R5_UPTODATE, &dev->flags) || | 3722 | !(test_bit(R5_UPTODATE, &dev->flags) || |
| 3723 | test_bit(R5_InJournal, &dev->flags) || | ||
| 3656 | test_bit(R5_Wantcompute, &dev->flags))) { | 3724 | test_bit(R5_Wantcompute, &dev->flags))) { |
| 3657 | rcw++; | 3725 | rcw++; |
| 3658 | if (test_bit(R5_Insync, &dev->flags) && | 3726 | if (test_bit(R5_Insync, &dev->flags) && |
| @@ -3692,8 +3760,9 @@ static void handle_stripe_dirtying(struct r5conf *conf, | |||
| 3692 | */ | 3760 | */ |
| 3693 | if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) && | 3761 | if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) && |
| 3694 | (s->locked == 0 && (rcw == 0 || rmw == 0) && | 3762 | (s->locked == 0 && (rcw == 0 || rmw == 0) && |
| 3695 | !test_bit(STRIPE_BIT_DELAY, &sh->state))) | 3763 | !test_bit(STRIPE_BIT_DELAY, &sh->state))) |
| 3696 | schedule_reconstruction(sh, s, rcw == 0, 0); | 3764 | schedule_reconstruction(sh, s, rcw == 0, 0); |
| 3765 | return 0; | ||
| 3697 | } | 3766 | } |
| 3698 | 3767 | ||
| 3699 | static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh, | 3768 | static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh, |
| @@ -3777,7 +3846,7 @@ static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh, | |||
| 3777 | case check_state_compute_run: | 3846 | case check_state_compute_run: |
| 3778 | break; | 3847 | break; |
| 3779 | default: | 3848 | default: |
| 3780 | printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n", | 3849 | pr_err("%s: unknown check_state: %d sector: %llu\n", |
| 3781 | __func__, sh->check_state, | 3850 | __func__, sh->check_state, |
| 3782 | (unsigned long long) sh->sector); | 3851 | (unsigned long long) sh->sector); |
| 3783 | BUG(); | 3852 | BUG(); |
| @@ -3941,9 +4010,9 @@ static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh, | |||
| 3941 | case check_state_compute_run: | 4010 | case check_state_compute_run: |
| 3942 | break; | 4011 | break; |
| 3943 | default: | 4012 | default: |
| 3944 | printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n", | 4013 | pr_warn("%s: unknown check_state: %d sector: %llu\n", |
| 3945 | __func__, sh->check_state, | 4014 | __func__, sh->check_state, |
| 3946 | (unsigned long long) sh->sector); | 4015 | (unsigned long long) sh->sector); |
| 3947 | BUG(); | 4016 | BUG(); |
| 3948 | } | 4017 | } |
| 3949 | } | 4018 | } |
| @@ -4183,6 +4252,11 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) | |||
| 4183 | if (rdev && !test_bit(Faulty, &rdev->flags)) | 4252 | if (rdev && !test_bit(Faulty, &rdev->flags)) |
| 4184 | do_recovery = 1; | 4253 | do_recovery = 1; |
| 4185 | } | 4254 | } |
| 4255 | |||
| 4256 | if (test_bit(R5_InJournal, &dev->flags)) | ||
| 4257 | s->injournal++; | ||
| 4258 | if (test_bit(R5_InJournal, &dev->flags) && dev->written) | ||
| 4259 | s->just_cached++; | ||
| 4186 | } | 4260 | } |
| 4187 | if (test_bit(STRIPE_SYNCING, &sh->state)) { | 4261 | if (test_bit(STRIPE_SYNCING, &sh->state)) { |
| 4188 | /* If there is a failed device being replaced, | 4262 | /* If there is a failed device being replaced, |
| @@ -4411,7 +4485,8 @@ static void handle_stripe(struct stripe_head *sh) | |||
| 4411 | struct r5dev *dev = &sh->dev[i]; | 4485 | struct r5dev *dev = &sh->dev[i]; |
| 4412 | if (test_bit(R5_LOCKED, &dev->flags) && | 4486 | if (test_bit(R5_LOCKED, &dev->flags) && |
| 4413 | (i == sh->pd_idx || i == sh->qd_idx || | 4487 | (i == sh->pd_idx || i == sh->qd_idx || |
| 4414 | dev->written)) { | 4488 | dev->written || test_bit(R5_InJournal, |
| 4489 | &dev->flags))) { | ||
| 4415 | pr_debug("Writing block %d\n", i); | 4490 | pr_debug("Writing block %d\n", i); |
| 4416 | set_bit(R5_Wantwrite, &dev->flags); | 4491 | set_bit(R5_Wantwrite, &dev->flags); |
| 4417 | if (prexor) | 4492 | if (prexor) |
| @@ -4451,6 +4526,10 @@ static void handle_stripe(struct stripe_head *sh) | |||
| 4451 | test_bit(R5_Discard, &qdev->flags)))))) | 4526 | test_bit(R5_Discard, &qdev->flags)))))) |
| 4452 | handle_stripe_clean_event(conf, sh, disks, &s.return_bi); | 4527 | handle_stripe_clean_event(conf, sh, disks, &s.return_bi); |
| 4453 | 4528 | ||
| 4529 | if (s.just_cached) | ||
| 4530 | r5c_handle_cached_data_endio(conf, sh, disks, &s.return_bi); | ||
| 4531 | r5l_stripe_write_finished(sh); | ||
| 4532 | |||
| 4454 | /* Now we might consider reading some blocks, either to check/generate | 4533 | /* Now we might consider reading some blocks, either to check/generate |
| 4455 | * parity, or to satisfy requests | 4534 | * parity, or to satisfy requests |
| 4456 | * or to load a block that is being partially written. | 4535 | * or to load a block that is being partially written. |
| @@ -4462,14 +4541,51 @@ static void handle_stripe(struct stripe_head *sh) | |||
| 4462 | || s.expanding) | 4541 | || s.expanding) |
| 4463 | handle_stripe_fill(sh, &s, disks); | 4542 | handle_stripe_fill(sh, &s, disks); |
| 4464 | 4543 | ||
| 4465 | /* Now to consider new write requests and what else, if anything | 4544 | /* |
| 4466 | * should be read. We do not handle new writes when: | 4545 | * When the stripe finishes full journal write cycle (write to journal |
| 4546 | * and raid disks), this is the clean-up procedure so it is ready for the | ||
| 4547 | * next operation. | ||
| 4548 | */ | ||
| 4549 | r5c_finish_stripe_write_out(conf, sh, &s); | ||
| 4550 | |||
| 4551 | /* | ||
| 4552 | * Now to consider new write requests, cache write back and what else, | ||
| 4553 | * if anything should be read. We do not handle new writes when: | ||
| 4467 | * 1/ A 'write' operation (copy+xor) is already in flight. | 4554 | * 1/ A 'write' operation (copy+xor) is already in flight. |
| 4468 | * 2/ A 'check' operation is in flight, as it may clobber the parity | 4555 | * 2/ A 'check' operation is in flight, as it may clobber the parity |
| 4469 | * block. | 4556 | * block. |
| 4557 | * 3/ A r5c cache log write is in flight. | ||
| 4470 | */ | 4558 | */ |
| 4471 | if (s.to_write && !sh->reconstruct_state && !sh->check_state) | 4559 | |
| 4472 | handle_stripe_dirtying(conf, sh, &s, disks); | 4560 | if (!sh->reconstruct_state && !sh->check_state && !sh->log_io) { |
| 4561 | if (!r5c_is_writeback(conf->log)) { | ||
| 4562 | if (s.to_write) | ||
| 4563 | handle_stripe_dirtying(conf, sh, &s, disks); | ||
| 4564 | } else { /* write back cache */ | ||
| 4565 | int ret = 0; | ||
| 4566 | |||
| 4567 | /* First, try handle writes in caching phase */ | ||
| 4568 | if (s.to_write) | ||
| 4569 | ret = r5c_try_caching_write(conf, sh, &s, | ||
| 4570 | disks); | ||
| 4571 | /* | ||
| 4572 | * If caching phase failed: ret == -EAGAIN | ||
| 4573 | * OR | ||
| 4574 | * stripe under reclaim: !caching && injournal | ||
| 4575 | * | ||
| 4576 | * fall back to handle_stripe_dirtying() | ||
| 4577 | */ | ||
| 4578 | if (ret == -EAGAIN || | ||
| 4579 | /* stripe under reclaim: !caching && injournal */ | ||
| 4580 | (!test_bit(STRIPE_R5C_CACHING, &sh->state) && | ||
| 4581 | s.injournal > 0)) { | ||
| 4582 | ret = handle_stripe_dirtying(conf, sh, &s, | ||
| 4583 | disks); | ||
| 4584 | if (ret == -EAGAIN) | ||
| 4585 | goto finish; | ||
| 4586 | } | ||
| 4587 | } | ||
| 4588 | } | ||
| 4473 | 4589 | ||
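To summarise the branch added above: write-through arrays still go straight to handle_stripe_dirtying(), while write-back arrays try the caching phase first and only fall through on -EAGAIN or when the stripe is already under reclaim (not caching, but with data in the journal). A condensed sketch of that control flow; it is simplified (the s.to_write, reconstruct/check-state and sh->log_io guards are omitted) and the function name is hypothetical:

/* Condensed restatement of the write-path decision; not the patch's code. */
static int r5c_example_write_path(struct r5conf *conf, struct stripe_head *sh,
				  struct stripe_head_state *s, int disks)
{
	int ret;

	if (!r5c_is_writeback(conf->log))
		return handle_stripe_dirtying(conf, sh, s, disks);

	ret = r5c_try_caching_write(conf, sh, s, disks);
	if (ret == -EAGAIN ||
	    (!test_bit(STRIPE_R5C_CACHING, &sh->state) && s->injournal > 0))
		ret = handle_stripe_dirtying(conf, sh, s, disks);

	return ret;	/* -EAGAIN here means "wait for the extra page" */
}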
| 4474 | /* maybe we need to check and possibly fix the parity for this stripe | 4590 | /* maybe we need to check and possibly fix the parity for this stripe |
| 4475 | * Any reads will already have been scheduled, so we just see if enough | 4591 | * Any reads will already have been scheduled, so we just see if enough |
| @@ -4640,9 +4756,7 @@ finish: | |||
| 4640 | } | 4756 | } |
| 4641 | 4757 | ||
| 4642 | if (!bio_list_empty(&s.return_bi)) { | 4758 | if (!bio_list_empty(&s.return_bi)) { |
| 4643 | if (test_bit(MD_CHANGE_PENDING, &conf->mddev->flags) && | 4759 | if (test_bit(MD_SB_CHANGE_PENDING, &conf->mddev->sb_flags)) { |
| 4644 | (s.failed <= conf->max_degraded || | ||
| 4645 | conf->mddev->external == 0)) { | ||
| 4646 | spin_lock_irq(&conf->device_lock); | 4760 | spin_lock_irq(&conf->device_lock); |
| 4647 | bio_list_merge(&conf->return_bi, &s.return_bi); | 4761 | bio_list_merge(&conf->return_bi, &s.return_bi); |
| 4648 | spin_unlock_irq(&conf->device_lock); | 4762 | spin_unlock_irq(&conf->device_lock); |
| @@ -4698,6 +4812,10 @@ static int raid5_congested(struct mddev *mddev, int bits) | |||
| 4698 | 4812 | ||
| 4699 | if (test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) | 4813 | if (test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) |
| 4700 | return 1; | 4814 | return 1; |
| 4815 | |||
| 4816 | /* Also checks whether there is pressure on r5cache log space */ | ||
| 4817 | if (test_bit(R5C_LOG_TIGHT, &conf->cache_state)) | ||
| 4818 | return 1; | ||
| 4701 | if (conf->quiesce) | 4819 | if (conf->quiesce) |
| 4702 | return 1; | 4820 | return 1; |
| 4703 | if (atomic_read(&conf->empty_inactive_list_nr)) | 4821 | if (atomic_read(&conf->empty_inactive_list_nr)) |
| @@ -5167,6 +5285,7 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi) | |||
| 5167 | int remaining; | 5285 | int remaining; |
| 5168 | DEFINE_WAIT(w); | 5286 | DEFINE_WAIT(w); |
| 5169 | bool do_prepare; | 5287 | bool do_prepare; |
| 5288 | bool do_flush = false; | ||
| 5170 | 5289 | ||
| 5171 | if (unlikely(bi->bi_opf & REQ_PREFLUSH)) { | 5290 | if (unlikely(bi->bi_opf & REQ_PREFLUSH)) { |
| 5172 | int ret = r5l_handle_flush_request(conf->log, bi); | 5291 | int ret = r5l_handle_flush_request(conf->log, bi); |
| @@ -5178,6 +5297,11 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi) | |||
| 5178 | return; | 5297 | return; |
| 5179 | } | 5298 | } |
| 5180 | /* ret == -EAGAIN, fallback */ | 5299 | /* ret == -EAGAIN, fallback */ |
| 5300 | /* | ||
| 5301 | * if r5l_handle_flush_request() didn't clear REQ_PREFLUSH, | ||
| 5302 | * we need to flush journal device | ||
| 5303 | */ | ||
| 5304 | do_flush = bi->bi_opf & REQ_PREFLUSH; | ||
| 5181 | } | 5305 | } |
| 5182 | 5306 | ||
| 5183 | md_write_start(mddev, bi); | 5307 | md_write_start(mddev, bi); |
| @@ -5188,6 +5312,7 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi) | |||
| 5188 | * data on failed drives. | 5312 | * data on failed drives. |
| 5189 | */ | 5313 | */ |
| 5190 | if (rw == READ && mddev->degraded == 0 && | 5314 | if (rw == READ && mddev->degraded == 0 && |
| 5315 | !r5c_is_writeback(conf->log) && | ||
| 5191 | mddev->reshape_position == MaxSector) { | 5316 | mddev->reshape_position == MaxSector) { |
| 5192 | bi = chunk_aligned_read(mddev, bi); | 5317 | bi = chunk_aligned_read(mddev, bi); |
| 5193 | if (!bi) | 5318 | if (!bi) |
| @@ -5316,6 +5441,12 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi) | |||
| 5316 | do_prepare = true; | 5441 | do_prepare = true; |
| 5317 | goto retry; | 5442 | goto retry; |
| 5318 | } | 5443 | } |
| 5444 | if (do_flush) { | ||
| 5445 | set_bit(STRIPE_R5C_PREFLUSH, &sh->state); | ||
| 5446 | /* we only need flush for one stripe */ | ||
| 5447 | do_flush = false; | ||
| 5448 | } | ||
| 5449 | |||
| 5319 | set_bit(STRIPE_HANDLE, &sh->state); | 5450 | set_bit(STRIPE_HANDLE, &sh->state); |
| 5320 | clear_bit(STRIPE_DELAYED, &sh->state); | 5451 | clear_bit(STRIPE_DELAYED, &sh->state); |
| 5321 | if ((!sh->batch_head || sh == sh->batch_head) && | 5452 | if ((!sh->batch_head || sh == sh->batch_head) && |
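
Together with the do_flush hunk earlier in raid5_make_request(), the lines above implement a one-shot hand-off: when r5l_handle_flush_request() falls back with REQ_PREFLUSH still set, the flag is remembered in do_flush and then attached to exactly one stripe as STRIPE_R5C_PREFLUSH, which is enough for the whole bio since the journal device only needs to be flushed once. A minimal illustration of the consume-once pattern (toy types only, not the kernel API):

#include <stdbool.h>
#include <stdio.h>

#define TOY_REQ_PREFLUSH (1u << 0)

struct toy_stripe {
	bool preflush;   /* models STRIPE_R5C_PREFLUSH */
};

int main(void)
{
	unsigned int bi_opf = TOY_REQ_PREFLUSH;   /* flag left set by the -EAGAIN path */
	bool do_flush = bi_opf & TOY_REQ_PREFLUSH;
	struct toy_stripe stripes[3] = { { false }, { false }, { false } };
	int i;

	for (i = 0; i < 3; i++) {
		if (do_flush) {
			stripes[i].preflush = true;   /* flush once ... */
			do_flush = false;             /* ... on behalf of the whole bio */
		}
	}
	for (i = 0; i < 3; i++)
		printf("stripe %d preflush=%d\n", i, (int)stripes[i].preflush);
	return 0;
}
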
| @@ -5481,9 +5612,9 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk | |||
| 5481 | mddev->reshape_position = conf->reshape_progress; | 5612 | mddev->reshape_position = conf->reshape_progress; |
| 5482 | mddev->curr_resync_completed = sector_nr; | 5613 | mddev->curr_resync_completed = sector_nr; |
| 5483 | conf->reshape_checkpoint = jiffies; | 5614 | conf->reshape_checkpoint = jiffies; |
| 5484 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | 5615 | set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); |
| 5485 | md_wakeup_thread(mddev->thread); | 5616 | md_wakeup_thread(mddev->thread); |
| 5486 | wait_event(mddev->sb_wait, mddev->flags == 0 || | 5617 | wait_event(mddev->sb_wait, mddev->sb_flags == 0 || |
| 5487 | test_bit(MD_RECOVERY_INTR, &mddev->recovery)); | 5618 | test_bit(MD_RECOVERY_INTR, &mddev->recovery)); |
| 5488 | if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) | 5619 | if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) |
| 5489 | return 0; | 5620 | return 0; |
| @@ -5579,10 +5710,10 @@ finish: | |||
| 5579 | mddev->reshape_position = conf->reshape_progress; | 5710 | mddev->reshape_position = conf->reshape_progress; |
| 5580 | mddev->curr_resync_completed = sector_nr; | 5711 | mddev->curr_resync_completed = sector_nr; |
| 5581 | conf->reshape_checkpoint = jiffies; | 5712 | conf->reshape_checkpoint = jiffies; |
| 5582 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | 5713 | set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); |
| 5583 | md_wakeup_thread(mddev->thread); | 5714 | md_wakeup_thread(mddev->thread); |
| 5584 | wait_event(mddev->sb_wait, | 5715 | wait_event(mddev->sb_wait, |
| 5585 | !test_bit(MD_CHANGE_DEVS, &mddev->flags) | 5716 | !test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags) |
| 5586 | || test_bit(MD_RECOVERY_INTR, &mddev->recovery)); | 5717 | || test_bit(MD_RECOVERY_INTR, &mddev->recovery)); |
| 5587 | if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) | 5718 | if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) |
| 5588 | goto ret; | 5719 | goto ret; |
| @@ -5857,10 +5988,10 @@ static void raid5d(struct md_thread *thread) | |||
| 5857 | md_check_recovery(mddev); | 5988 | md_check_recovery(mddev); |
| 5858 | 5989 | ||
| 5859 | if (!bio_list_empty(&conf->return_bi) && | 5990 | if (!bio_list_empty(&conf->return_bi) && |
| 5860 | !test_bit(MD_CHANGE_PENDING, &mddev->flags)) { | 5991 | !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) { |
| 5861 | struct bio_list tmp = BIO_EMPTY_LIST; | 5992 | struct bio_list tmp = BIO_EMPTY_LIST; |
| 5862 | spin_lock_irq(&conf->device_lock); | 5993 | spin_lock_irq(&conf->device_lock); |
| 5863 | if (!test_bit(MD_CHANGE_PENDING, &mddev->flags)) { | 5994 | if (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) { |
| 5864 | bio_list_merge(&tmp, &conf->return_bi); | 5995 | bio_list_merge(&tmp, &conf->return_bi); |
| 5865 | bio_list_init(&conf->return_bi); | 5996 | bio_list_init(&conf->return_bi); |
| 5866 | } | 5997 | } |
| @@ -5907,7 +6038,7 @@ static void raid5d(struct md_thread *thread) | |||
| 5907 | break; | 6038 | break; |
| 5908 | handled += batch_size; | 6039 | handled += batch_size; |
| 5909 | 6040 | ||
| 5910 | if (mddev->flags & ~(1<<MD_CHANGE_PENDING)) { | 6041 | if (mddev->sb_flags & ~(1 << MD_SB_CHANGE_PENDING)) { |
| 5911 | spin_unlock_irq(&conf->device_lock); | 6042 | spin_unlock_irq(&conf->device_lock); |
| 5912 | md_check_recovery(mddev); | 6043 | md_check_recovery(mddev); |
| 5913 | spin_lock_irq(&conf->device_lock); | 6044 | spin_lock_irq(&conf->device_lock); |
| @@ -6237,6 +6368,7 @@ static struct attribute *raid5_attrs[] = { | |||
| 6237 | &raid5_group_thread_cnt.attr, | 6368 | &raid5_group_thread_cnt.attr, |
| 6238 | &raid5_skip_copy.attr, | 6369 | &raid5_skip_copy.attr, |
| 6239 | &raid5_rmw_level.attr, | 6370 | &raid5_rmw_level.attr, |
| 6371 | &r5c_journal_mode.attr, | ||
| 6240 | NULL, | 6372 | NULL, |
| 6241 | }; | 6373 | }; |
| 6242 | static struct attribute_group raid5_attrs_group = { | 6374 | static struct attribute_group raid5_attrs_group = { |
| @@ -6363,6 +6495,8 @@ static void raid5_free_percpu(struct r5conf *conf) | |||
| 6363 | 6495 | ||
| 6364 | static void free_conf(struct r5conf *conf) | 6496 | static void free_conf(struct r5conf *conf) |
| 6365 | { | 6497 | { |
| 6498 | int i; | ||
| 6499 | |||
| 6366 | if (conf->log) | 6500 | if (conf->log) |
| 6367 | r5l_exit_log(conf->log); | 6501 | r5l_exit_log(conf->log); |
| 6368 | if (conf->shrinker.nr_deferred) | 6502 | if (conf->shrinker.nr_deferred) |
| @@ -6371,6 +6505,9 @@ static void free_conf(struct r5conf *conf) | |||
| 6371 | free_thread_groups(conf); | 6505 | free_thread_groups(conf); |
| 6372 | shrink_stripes(conf); | 6506 | shrink_stripes(conf); |
| 6373 | raid5_free_percpu(conf); | 6507 | raid5_free_percpu(conf); |
| 6508 | for (i = 0; i < conf->pool_size; i++) | ||
| 6509 | if (conf->disks[i].extra_page) | ||
| 6510 | put_page(conf->disks[i].extra_page); | ||
| 6374 | kfree(conf->disks); | 6511 | kfree(conf->disks); |
| 6375 | kfree(conf->stripe_hashtbl); | 6512 | kfree(conf->stripe_hashtbl); |
| 6376 | kfree(conf); | 6513 | kfree(conf); |
| @@ -6382,8 +6519,8 @@ static int raid456_cpu_up_prepare(unsigned int cpu, struct hlist_node *node) | |||
| 6382 | struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu); | 6519 | struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu); |
| 6383 | 6520 | ||
| 6384 | if (alloc_scratch_buffer(conf, percpu)) { | 6521 | if (alloc_scratch_buffer(conf, percpu)) { |
| 6385 | pr_err("%s: failed memory allocation for cpu%u\n", | 6522 | pr_warn("%s: failed memory allocation for cpu%u\n", |
| 6386 | __func__, cpu); | 6523 | __func__, cpu); |
| 6387 | return -ENOMEM; | 6524 | return -ENOMEM; |
| 6388 | } | 6525 | } |
| 6389 | return 0; | 6526 | return 0; |
| @@ -6453,29 +6590,29 @@ static struct r5conf *setup_conf(struct mddev *mddev) | |||
| 6453 | if (mddev->new_level != 5 | 6590 | if (mddev->new_level != 5 |
| 6454 | && mddev->new_level != 4 | 6591 | && mddev->new_level != 4 |
| 6455 | && mddev->new_level != 6) { | 6592 | && mddev->new_level != 6) { |
| 6456 | printk(KERN_ERR "md/raid:%s: raid level not set to 4/5/6 (%d)\n", | 6593 | pr_warn("md/raid:%s: raid level not set to 4/5/6 (%d)\n", |
| 6457 | mdname(mddev), mddev->new_level); | 6594 | mdname(mddev), mddev->new_level); |
| 6458 | return ERR_PTR(-EIO); | 6595 | return ERR_PTR(-EIO); |
| 6459 | } | 6596 | } |
| 6460 | if ((mddev->new_level == 5 | 6597 | if ((mddev->new_level == 5 |
| 6461 | && !algorithm_valid_raid5(mddev->new_layout)) || | 6598 | && !algorithm_valid_raid5(mddev->new_layout)) || |
| 6462 | (mddev->new_level == 6 | 6599 | (mddev->new_level == 6 |
| 6463 | && !algorithm_valid_raid6(mddev->new_layout))) { | 6600 | && !algorithm_valid_raid6(mddev->new_layout))) { |
| 6464 | printk(KERN_ERR "md/raid:%s: layout %d not supported\n", | 6601 | pr_warn("md/raid:%s: layout %d not supported\n", |
| 6465 | mdname(mddev), mddev->new_layout); | 6602 | mdname(mddev), mddev->new_layout); |
| 6466 | return ERR_PTR(-EIO); | 6603 | return ERR_PTR(-EIO); |
| 6467 | } | 6604 | } |
| 6468 | if (mddev->new_level == 6 && mddev->raid_disks < 4) { | 6605 | if (mddev->new_level == 6 && mddev->raid_disks < 4) { |
| 6469 | printk(KERN_ERR "md/raid:%s: not enough configured devices (%d, minimum 4)\n", | 6606 | pr_warn("md/raid:%s: not enough configured devices (%d, minimum 4)\n", |
| 6470 | mdname(mddev), mddev->raid_disks); | 6607 | mdname(mddev), mddev->raid_disks); |
| 6471 | return ERR_PTR(-EINVAL); | 6608 | return ERR_PTR(-EINVAL); |
| 6472 | } | 6609 | } |
| 6473 | 6610 | ||
| 6474 | if (!mddev->new_chunk_sectors || | 6611 | if (!mddev->new_chunk_sectors || |
| 6475 | (mddev->new_chunk_sectors << 9) % PAGE_SIZE || | 6612 | (mddev->new_chunk_sectors << 9) % PAGE_SIZE || |
| 6476 | !is_power_of_2(mddev->new_chunk_sectors)) { | 6613 | !is_power_of_2(mddev->new_chunk_sectors)) { |
| 6477 | printk(KERN_ERR "md/raid:%s: invalid chunk size %d\n", | 6614 | pr_warn("md/raid:%s: invalid chunk size %d\n", |
| 6478 | mdname(mddev), mddev->new_chunk_sectors << 9); | 6615 | mdname(mddev), mddev->new_chunk_sectors << 9); |
| 6479 | return ERR_PTR(-EINVAL); | 6616 | return ERR_PTR(-EINVAL); |
| 6480 | } | 6617 | } |
| 6481 | 6618 | ||
| @@ -6517,9 +6654,16 @@ static struct r5conf *setup_conf(struct mddev *mddev) | |||
| 6517 | 6654 | ||
| 6518 | conf->disks = kzalloc(max_disks * sizeof(struct disk_info), | 6655 | conf->disks = kzalloc(max_disks * sizeof(struct disk_info), |
| 6519 | GFP_KERNEL); | 6656 | GFP_KERNEL); |
| 6657 | |||
| 6520 | if (!conf->disks) | 6658 | if (!conf->disks) |
| 6521 | goto abort; | 6659 | goto abort; |
| 6522 | 6660 | ||
| 6661 | for (i = 0; i < max_disks; i++) { | ||
| 6662 | conf->disks[i].extra_page = alloc_page(GFP_KERNEL); | ||
| 6663 | if (!conf->disks[i].extra_page) | ||
| 6664 | goto abort; | ||
| 6665 | } | ||
| 6666 | |||
| 6523 | conf->mddev = mddev; | 6667 | conf->mddev = mddev; |
| 6524 | 6668 | ||
| 6525 | if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL) | 6669 | if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL) |
| @@ -6540,6 +6684,11 @@ static struct r5conf *setup_conf(struct mddev *mddev) | |||
| 6540 | for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) | 6684 | for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) |
| 6541 | INIT_LIST_HEAD(conf->temp_inactive_list + i); | 6685 | INIT_LIST_HEAD(conf->temp_inactive_list + i); |
| 6542 | 6686 | ||
| 6687 | atomic_set(&conf->r5c_cached_full_stripes, 0); | ||
| 6688 | INIT_LIST_HEAD(&conf->r5c_full_stripe_list); | ||
| 6689 | atomic_set(&conf->r5c_cached_partial_stripes, 0); | ||
| 6690 | INIT_LIST_HEAD(&conf->r5c_partial_stripe_list); | ||
| 6691 | |||
| 6543 | conf->level = mddev->new_level; | 6692 | conf->level = mddev->new_level; |
| 6544 | conf->chunk_sectors = mddev->new_chunk_sectors; | 6693 | conf->chunk_sectors = mddev->new_chunk_sectors; |
| 6545 | if (raid5_alloc_percpu(conf) != 0) | 6694 | if (raid5_alloc_percpu(conf) != 0) |
| @@ -6566,9 +6715,8 @@ static struct r5conf *setup_conf(struct mddev *mddev) | |||
| 6566 | 6715 | ||
| 6567 | if (test_bit(In_sync, &rdev->flags)) { | 6716 | if (test_bit(In_sync, &rdev->flags)) { |
| 6568 | char b[BDEVNAME_SIZE]; | 6717 | char b[BDEVNAME_SIZE]; |
| 6569 | printk(KERN_INFO "md/raid:%s: device %s operational as raid" | 6718 | pr_info("md/raid:%s: device %s operational as raid disk %d\n", |
| 6570 | " disk %d\n", | 6719 | mdname(mddev), bdevname(rdev->bdev, b), raid_disk); |
| 6571 | mdname(mddev), bdevname(rdev->bdev, b), raid_disk); | ||
| 6572 | } else if (rdev->saved_raid_disk != raid_disk) | 6720 | } else if (rdev->saved_raid_disk != raid_disk) |
| 6573 | /* Cannot rely on bitmap to complete recovery */ | 6721 | /* Cannot rely on bitmap to complete recovery */ |
| 6574 | conf->fullsync = 1; | 6722 | conf->fullsync = 1; |
| @@ -6602,21 +6750,18 @@ static struct r5conf *setup_conf(struct mddev *mddev) | |||
| 6602 | ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4); | 6750 | ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4); |
| 6603 | conf->min_nr_stripes = max(NR_STRIPES, stripes); | 6751 | conf->min_nr_stripes = max(NR_STRIPES, stripes); |
| 6604 | if (conf->min_nr_stripes != NR_STRIPES) | 6752 | if (conf->min_nr_stripes != NR_STRIPES) |
| 6605 | printk(KERN_INFO | 6753 | pr_info("md/raid:%s: force stripe size %d for reshape\n", |
| 6606 | "md/raid:%s: force stripe size %d for reshape\n", | ||
| 6607 | mdname(mddev), conf->min_nr_stripes); | 6754 | mdname(mddev), conf->min_nr_stripes); |
| 6608 | } | 6755 | } |
| 6609 | memory = conf->min_nr_stripes * (sizeof(struct stripe_head) + | 6756 | memory = conf->min_nr_stripes * (sizeof(struct stripe_head) + |
| 6610 | max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; | 6757 | max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; |
| 6611 | atomic_set(&conf->empty_inactive_list_nr, NR_STRIPE_HASH_LOCKS); | 6758 | atomic_set(&conf->empty_inactive_list_nr, NR_STRIPE_HASH_LOCKS); |
| 6612 | if (grow_stripes(conf, conf->min_nr_stripes)) { | 6759 | if (grow_stripes(conf, conf->min_nr_stripes)) { |
| 6613 | printk(KERN_ERR | 6760 | pr_warn("md/raid:%s: couldn't allocate %dkB for buffers\n", |
| 6614 | "md/raid:%s: couldn't allocate %dkB for buffers\n", | 6761 | mdname(mddev), memory); |
| 6615 | mdname(mddev), memory); | ||
| 6616 | goto abort; | 6762 | goto abort; |
| 6617 | } else | 6763 | } else |
| 6618 | printk(KERN_INFO "md/raid:%s: allocated %dkB\n", | 6764 | pr_debug("md/raid:%s: allocated %dkB\n", mdname(mddev), memory); |
| 6619 | mdname(mddev), memory); | ||
| 6620 | /* | 6765 | /* |
| 6621 | * Losing a stripe head costs more than the time to refill it, | 6766 | * Losing a stripe head costs more than the time to refill it, |
| 6622 | * it reduces the queue depth and so can hurt throughput. | 6767 | * it reduces the queue depth and so can hurt throughput. |
| @@ -6628,18 +6773,16 @@ static struct r5conf *setup_conf(struct mddev *mddev) | |||
| 6628 | conf->shrinker.batch = 128; | 6773 | conf->shrinker.batch = 128; |
| 6629 | conf->shrinker.flags = 0; | 6774 | conf->shrinker.flags = 0; |
| 6630 | if (register_shrinker(&conf->shrinker)) { | 6775 | if (register_shrinker(&conf->shrinker)) { |
| 6631 | printk(KERN_ERR | 6776 | pr_warn("md/raid:%s: couldn't register shrinker.\n", |
| 6632 | "md/raid:%s: couldn't register shrinker.\n", | 6777 | mdname(mddev)); |
| 6633 | mdname(mddev)); | ||
| 6634 | goto abort; | 6778 | goto abort; |
| 6635 | } | 6779 | } |
| 6636 | 6780 | ||
| 6637 | sprintf(pers_name, "raid%d", mddev->new_level); | 6781 | sprintf(pers_name, "raid%d", mddev->new_level); |
| 6638 | conf->thread = md_register_thread(raid5d, mddev, pers_name); | 6782 | conf->thread = md_register_thread(raid5d, mddev, pers_name); |
| 6639 | if (!conf->thread) { | 6783 | if (!conf->thread) { |
| 6640 | printk(KERN_ERR | 6784 | pr_warn("md/raid:%s: couldn't allocate thread.\n", |
| 6641 | "md/raid:%s: couldn't allocate thread.\n", | 6785 | mdname(mddev)); |
| 6642 | mdname(mddev)); | ||
| 6643 | goto abort; | 6786 | goto abort; |
| 6644 | } | 6787 | } |
| 6645 | 6788 | ||
| @@ -6692,9 +6835,8 @@ static int raid5_run(struct mddev *mddev) | |||
| 6692 | int first = 1; | 6835 | int first = 1; |
| 6693 | 6836 | ||
| 6694 | if (mddev->recovery_cp != MaxSector) | 6837 | if (mddev->recovery_cp != MaxSector) |
| 6695 | printk(KERN_NOTICE "md/raid:%s: not clean" | 6838 | pr_notice("md/raid:%s: not clean -- starting background reconstruction\n", |
| 6696 | " -- starting background reconstruction\n", | 6839 | mdname(mddev)); |
| 6697 | mdname(mddev)); | ||
| 6698 | 6840 | ||
| 6699 | rdev_for_each(rdev, mddev) { | 6841 | rdev_for_each(rdev, mddev) { |
| 6700 | long long diff; | 6842 | long long diff; |
| @@ -6737,15 +6879,14 @@ static int raid5_run(struct mddev *mddev) | |||
| 6737 | int new_data_disks; | 6879 | int new_data_disks; |
| 6738 | 6880 | ||
| 6739 | if (journal_dev) { | 6881 | if (journal_dev) { |
| 6740 | printk(KERN_ERR "md/raid:%s: don't support reshape with journal - aborting.\n", | 6882 | pr_warn("md/raid:%s: don't support reshape with journal - aborting.\n", |
| 6741 | mdname(mddev)); | 6883 | mdname(mddev)); |
| 6742 | return -EINVAL; | 6884 | return -EINVAL; |
| 6743 | } | 6885 | } |
| 6744 | 6886 | ||
| 6745 | if (mddev->new_level != mddev->level) { | 6887 | if (mddev->new_level != mddev->level) { |
| 6746 | printk(KERN_ERR "md/raid:%s: unsupported reshape " | 6888 | pr_warn("md/raid:%s: unsupported reshape required - aborting.\n", |
| 6747 | "required - aborting.\n", | 6889 | mdname(mddev)); |
| 6748 | mdname(mddev)); | ||
| 6749 | return -EINVAL; | 6890 | return -EINVAL; |
| 6750 | } | 6891 | } |
| 6751 | old_disks = mddev->raid_disks - mddev->delta_disks; | 6892 | old_disks = mddev->raid_disks - mddev->delta_disks; |
| @@ -6760,8 +6901,8 @@ static int raid5_run(struct mddev *mddev) | |||
| 6760 | chunk_sectors = max(mddev->chunk_sectors, mddev->new_chunk_sectors); | 6901 | chunk_sectors = max(mddev->chunk_sectors, mddev->new_chunk_sectors); |
| 6761 | new_data_disks = mddev->raid_disks - max_degraded; | 6902 | new_data_disks = mddev->raid_disks - max_degraded; |
| 6762 | if (sector_div(here_new, chunk_sectors * new_data_disks)) { | 6903 | if (sector_div(here_new, chunk_sectors * new_data_disks)) { |
| 6763 | printk(KERN_ERR "md/raid:%s: reshape_position not " | 6904 | pr_warn("md/raid:%s: reshape_position not on a stripe boundary\n", |
| 6764 | "on a stripe boundary\n", mdname(mddev)); | 6905 | mdname(mddev)); |
| 6765 | return -EINVAL; | 6906 | return -EINVAL; |
| 6766 | } | 6907 | } |
| 6767 | reshape_offset = here_new * chunk_sectors; | 6908 | reshape_offset = here_new * chunk_sectors; |
| @@ -6782,10 +6923,8 @@ static int raid5_run(struct mddev *mddev) | |||
| 6782 | abs(min_offset_diff) >= mddev->new_chunk_sectors) | 6923 | abs(min_offset_diff) >= mddev->new_chunk_sectors) |
| 6783 | /* not really in-place - so OK */; | 6924 | /* not really in-place - so OK */; |
| 6784 | else if (mddev->ro == 0) { | 6925 | else if (mddev->ro == 0) { |
| 6785 | printk(KERN_ERR "md/raid:%s: in-place reshape " | 6926 | pr_warn("md/raid:%s: in-place reshape must be started in read-only mode - aborting\n", |
| 6786 | "must be started in read-only mode " | 6927 | mdname(mddev)); |
| 6787 | "- aborting\n", | ||
| 6788 | mdname(mddev)); | ||
| 6789 | return -EINVAL; | 6928 | return -EINVAL; |
| 6790 | } | 6929 | } |
| 6791 | } else if (mddev->reshape_backwards | 6930 | } else if (mddev->reshape_backwards |
| @@ -6794,13 +6933,11 @@ static int raid5_run(struct mddev *mddev) | |||
| 6794 | : (here_new * chunk_sectors >= | 6933 | : (here_new * chunk_sectors >= |
| 6795 | here_old * chunk_sectors + (-min_offset_diff))) { | 6934 | here_old * chunk_sectors + (-min_offset_diff))) { |
| 6796 | /* Reading from the same stripe as writing to - bad */ | 6935 | /* Reading from the same stripe as writing to - bad */ |
| 6797 | printk(KERN_ERR "md/raid:%s: reshape_position too early for " | 6936 | pr_warn("md/raid:%s: reshape_position too early for auto-recovery - aborting.\n", |
| 6798 | "auto-recovery - aborting.\n", | 6937 | mdname(mddev)); |
| 6799 | mdname(mddev)); | ||
| 6800 | return -EINVAL; | 6938 | return -EINVAL; |
| 6801 | } | 6939 | } |
| 6802 | printk(KERN_INFO "md/raid:%s: reshape will continue\n", | 6940 | pr_debug("md/raid:%s: reshape will continue\n", mdname(mddev)); |
| 6803 | mdname(mddev)); | ||
| 6804 | /* OK, we should be able to continue; */ | 6941 | /* OK, we should be able to continue; */ |
| 6805 | } else { | 6942 | } else { |
| 6806 | BUG_ON(mddev->level != mddev->new_level); | 6943 | BUG_ON(mddev->level != mddev->new_level); |
| @@ -6819,8 +6956,8 @@ static int raid5_run(struct mddev *mddev) | |||
| 6819 | 6956 | ||
| 6820 | if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) { | 6957 | if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) { |
| 6821 | if (!journal_dev) { | 6958 | if (!journal_dev) { |
| 6822 | pr_err("md/raid:%s: journal disk is missing, force array readonly\n", | 6959 | pr_warn("md/raid:%s: journal disk is missing, force array readonly\n", |
| 6823 | mdname(mddev)); | 6960 | mdname(mddev)); |
| 6824 | mddev->ro = 1; | 6961 | mddev->ro = 1; |
| 6825 | set_disk_ro(mddev->gendisk, 1); | 6962 | set_disk_ro(mddev->gendisk, 1); |
| 6826 | } else if (mddev->recovery_cp == MaxSector) | 6963 | } else if (mddev->recovery_cp == MaxSector) |
| @@ -6847,8 +6984,7 @@ static int raid5_run(struct mddev *mddev) | |||
| 6847 | if (conf->disks[i].replacement && | 6984 | if (conf->disks[i].replacement && |
| 6848 | conf->reshape_progress != MaxSector) { | 6985 | conf->reshape_progress != MaxSector) { |
| 6849 | /* replacements and reshape simply do not mix. */ | 6986 | /* replacements and reshape simply do not mix. */ |
| 6850 | printk(KERN_ERR "md: cannot handle concurrent " | 6987 | pr_warn("md: cannot handle concurrent replacement and reshape.\n"); |
| 6851 | "replacement and reshape.\n"); | ||
| 6852 | goto abort; | 6988 | goto abort; |
| 6853 | } | 6989 | } |
| 6854 | if (test_bit(In_sync, &rdev->flags)) { | 6990 | if (test_bit(In_sync, &rdev->flags)) { |
| @@ -6890,8 +7026,7 @@ static int raid5_run(struct mddev *mddev) | |||
| 6890 | mddev->degraded = calc_degraded(conf); | 7026 | mddev->degraded = calc_degraded(conf); |
| 6891 | 7027 | ||
| 6892 | if (has_failed(conf)) { | 7028 | if (has_failed(conf)) { |
| 6893 | printk(KERN_ERR "md/raid:%s: not enough operational devices" | 7029 | pr_crit("md/raid:%s: not enough operational devices (%d/%d failed)\n", |
| 6894 | " (%d/%d failed)\n", | ||
| 6895 | mdname(mddev), mddev->degraded, conf->raid_disks); | 7030 | mdname(mddev), mddev->degraded, conf->raid_disks); |
| 6896 | goto abort; | 7031 | goto abort; |
| 6897 | } | 7032 | } |
| @@ -6903,29 +7038,19 @@ static int raid5_run(struct mddev *mddev) | |||
| 6903 | if (mddev->degraded > dirty_parity_disks && | 7038 | if (mddev->degraded > dirty_parity_disks && |
| 6904 | mddev->recovery_cp != MaxSector) { | 7039 | mddev->recovery_cp != MaxSector) { |
| 6905 | if (mddev->ok_start_degraded) | 7040 | if (mddev->ok_start_degraded) |
| 6906 | printk(KERN_WARNING | 7041 | pr_crit("md/raid:%s: starting dirty degraded array - data corruption possible.\n", |
| 6907 | "md/raid:%s: starting dirty degraded array" | 7042 | mdname(mddev)); |
| 6908 | " - data corruption possible.\n", | ||
| 6909 | mdname(mddev)); | ||
| 6910 | else { | 7043 | else { |
| 6911 | printk(KERN_ERR | 7044 | pr_crit("md/raid:%s: cannot start dirty degraded array.\n", |
| 6912 | "md/raid:%s: cannot start dirty degraded array.\n", | 7045 | mdname(mddev)); |
| 6913 | mdname(mddev)); | ||
| 6914 | goto abort; | 7046 | goto abort; |
| 6915 | } | 7047 | } |
| 6916 | } | 7048 | } |
| 6917 | 7049 | ||
| 6918 | if (mddev->degraded == 0) | 7050 | pr_info("md/raid:%s: raid level %d active with %d out of %d devices, algorithm %d\n", |
| 6919 | printk(KERN_INFO "md/raid:%s: raid level %d active with %d out of %d" | 7051 | mdname(mddev), conf->level, |
| 6920 | " devices, algorithm %d\n", mdname(mddev), conf->level, | 7052 | mddev->raid_disks-mddev->degraded, mddev->raid_disks, |
| 6921 | mddev->raid_disks-mddev->degraded, mddev->raid_disks, | 7053 | mddev->new_layout); |
| 6922 | mddev->new_layout); | ||
| 6923 | else | ||
| 6924 | printk(KERN_ALERT "md/raid:%s: raid level %d active with %d" | ||
| 6925 | " out of %d devices, algorithm %d\n", | ||
| 6926 | mdname(mddev), conf->level, | ||
| 6927 | mddev->raid_disks - mddev->degraded, | ||
| 6928 | mddev->raid_disks, mddev->new_layout); | ||
| 6929 | 7054 | ||
| 6930 | print_raid5_conf(conf); | 7055 | print_raid5_conf(conf); |
| 6931 | 7056 | ||
| @@ -6945,9 +7070,8 @@ static int raid5_run(struct mddev *mddev) | |||
| 6945 | mddev->to_remove = NULL; | 7070 | mddev->to_remove = NULL; |
| 6946 | else if (mddev->kobj.sd && | 7071 | else if (mddev->kobj.sd && |
| 6947 | sysfs_create_group(&mddev->kobj, &raid5_attrs_group)) | 7072 | sysfs_create_group(&mddev->kobj, &raid5_attrs_group)) |
| 6948 | printk(KERN_WARNING | 7073 | pr_warn("raid5: failed to create sysfs attributes for %s\n", |
| 6949 | "raid5: failed to create sysfs attributes for %s\n", | 7074 | mdname(mddev)); |
| 6950 | mdname(mddev)); | ||
| 6951 | md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); | 7075 | md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); |
| 6952 | 7076 | ||
| 6953 | if (mddev->queue) { | 7077 | if (mddev->queue) { |
| @@ -6979,6 +7103,15 @@ static int raid5_run(struct mddev *mddev) | |||
| 6979 | stripe = (stripe | (stripe-1)) + 1; | 7103 | stripe = (stripe | (stripe-1)) + 1; |
| 6980 | mddev->queue->limits.discard_alignment = stripe; | 7104 | mddev->queue->limits.discard_alignment = stripe; |
| 6981 | mddev->queue->limits.discard_granularity = stripe; | 7105 | mddev->queue->limits.discard_granularity = stripe; |
| 7106 | |||
| 7107 | /* | ||
| 7108 | * We use 16-bit counter of active stripes in bi_phys_segments | ||
| 7109 | * (minus one for over-loaded initialization) | ||
| 7110 | */ | ||
| 7111 | blk_queue_max_hw_sectors(mddev->queue, 0xfffe * STRIPE_SECTORS); | ||
| 7112 | blk_queue_max_discard_sectors(mddev->queue, | ||
| 7113 | 0xfffe * STRIPE_SECTORS); | ||
| 7114 | |||
| 6982 | /* | 7115 | /* |
| 6983 | * unaligned part of discard request will be ignored, so can't | 7116 | * unaligned part of discard request will be ignored, so can't |
| 6984 | * guarantee discard_zeroes_data | 7117 | * guarantee discard_zeroes_data |
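
Worked example for the new request-size caps above, assuming 4 KiB pages: STRIPE_SECTORS is PAGE_SIZE >> 9 = 8, so 0xfffe * STRIPE_SECTORS = 524272 sectors, i.e. just under 256 MiB per request. A single bio can therefore cover at most 0xfffe stripes, and the biased 16-bit active-stripe count kept in bi_phys_segments (initialised to 1 and incremented once per stripe; see the helpers added to raid5.h later in this patch) tops out at 0xffff, so it cannot overflow into the upper 16 bits that hold the processed-stripe count.
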
| @@ -7035,9 +7168,10 @@ static int raid5_run(struct mddev *mddev) | |||
| 7035 | if (journal_dev) { | 7168 | if (journal_dev) { |
| 7036 | char b[BDEVNAME_SIZE]; | 7169 | char b[BDEVNAME_SIZE]; |
| 7037 | 7170 | ||
| 7038 | printk(KERN_INFO"md/raid:%s: using device %s as journal\n", | 7171 | pr_debug("md/raid:%s: using device %s as journal\n", |
| 7039 | mdname(mddev), bdevname(journal_dev->bdev, b)); | 7172 | mdname(mddev), bdevname(journal_dev->bdev, b)); |
| 7040 | r5l_init_log(conf, journal_dev); | 7173 | if (r5l_init_log(conf, journal_dev)) |
| 7174 | goto abort; | ||
| 7041 | } | 7175 | } |
| 7042 | 7176 | ||
| 7043 | return 0; | 7177 | return 0; |
| @@ -7046,7 +7180,7 @@ abort: | |||
| 7046 | print_raid5_conf(conf); | 7180 | print_raid5_conf(conf); |
| 7047 | free_conf(conf); | 7181 | free_conf(conf); |
| 7048 | mddev->private = NULL; | 7182 | mddev->private = NULL; |
| 7049 | printk(KERN_ALERT "md/raid:%s: failed to run raid set.\n", mdname(mddev)); | 7183 | pr_warn("md/raid:%s: failed to run raid set.\n", mdname(mddev)); |
| 7050 | return -EIO; | 7184 | return -EIO; |
| 7051 | } | 7185 | } |
| 7052 | 7186 | ||
| @@ -7080,12 +7214,12 @@ static void print_raid5_conf (struct r5conf *conf) | |||
| 7080 | int i; | 7214 | int i; |
| 7081 | struct disk_info *tmp; | 7215 | struct disk_info *tmp; |
| 7082 | 7216 | ||
| 7083 | printk(KERN_DEBUG "RAID conf printout:\n"); | 7217 | pr_debug("RAID conf printout:\n"); |
| 7084 | if (!conf) { | 7218 | if (!conf) { |
| 7085 | printk("(conf==NULL)\n"); | 7219 | pr_debug("(conf==NULL)\n"); |
| 7086 | return; | 7220 | return; |
| 7087 | } | 7221 | } |
| 7088 | printk(KERN_DEBUG " --- level:%d rd:%d wd:%d\n", conf->level, | 7222 | pr_debug(" --- level:%d rd:%d wd:%d\n", conf->level, |
| 7089 | conf->raid_disks, | 7223 | conf->raid_disks, |
| 7090 | conf->raid_disks - conf->mddev->degraded); | 7224 | conf->raid_disks - conf->mddev->degraded); |
| 7091 | 7225 | ||
| @@ -7093,7 +7227,7 @@ static void print_raid5_conf (struct r5conf *conf) | |||
| 7093 | char b[BDEVNAME_SIZE]; | 7227 | char b[BDEVNAME_SIZE]; |
| 7094 | tmp = conf->disks + i; | 7228 | tmp = conf->disks + i; |
| 7095 | if (tmp->rdev) | 7229 | if (tmp->rdev) |
| 7096 | printk(KERN_DEBUG " disk %d, o:%d, dev:%s\n", | 7230 | pr_debug(" disk %d, o:%d, dev:%s\n", |
| 7097 | i, !test_bit(Faulty, &tmp->rdev->flags), | 7231 | i, !test_bit(Faulty, &tmp->rdev->flags), |
| 7098 | bdevname(tmp->rdev->bdev, b)); | 7232 | bdevname(tmp->rdev->bdev, b)); |
| 7099 | } | 7233 | } |
| @@ -7241,8 +7375,8 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev) | |||
| 7241 | * write requests running. We should be safe | 7375 | * write requests running. We should be safe |
| 7242 | */ | 7376 | */ |
| 7243 | r5l_init_log(conf, rdev); | 7377 | r5l_init_log(conf, rdev); |
| 7244 | printk(KERN_INFO"md/raid:%s: using device %s as journal\n", | 7378 | pr_debug("md/raid:%s: using device %s as journal\n", |
| 7245 | mdname(mddev), bdevname(rdev->bdev, b)); | 7379 | mdname(mddev), bdevname(rdev->bdev, b)); |
| 7246 | return 0; | 7380 | return 0; |
| 7247 | } | 7381 | } |
| 7248 | if (mddev->recovery_disabled == conf->recovery_disabled) | 7382 | if (mddev->recovery_disabled == conf->recovery_disabled) |
| @@ -7346,10 +7480,10 @@ static int check_stripe_cache(struct mddev *mddev) | |||
| 7346 | > conf->min_nr_stripes || | 7480 | > conf->min_nr_stripes || |
| 7347 | ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4 | 7481 | ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4 |
| 7348 | > conf->min_nr_stripes) { | 7482 | > conf->min_nr_stripes) { |
| 7349 | printk(KERN_WARNING "md/raid:%s: reshape: not enough stripes. Needed %lu\n", | 7483 | pr_warn("md/raid:%s: reshape: not enough stripes. Needed %lu\n", |
| 7350 | mdname(mddev), | 7484 | mdname(mddev), |
| 7351 | ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9) | 7485 | ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9) |
| 7352 | / STRIPE_SIZE)*4); | 7486 | / STRIPE_SIZE)*4); |
| 7353 | return 0; | 7487 | return 0; |
| 7354 | } | 7488 | } |
| 7355 | return 1; | 7489 | return 1; |
| @@ -7430,8 +7564,8 @@ static int raid5_start_reshape(struct mddev *mddev) | |||
| 7430 | */ | 7564 | */ |
| 7431 | if (raid5_size(mddev, 0, conf->raid_disks + mddev->delta_disks) | 7565 | if (raid5_size(mddev, 0, conf->raid_disks + mddev->delta_disks) |
| 7432 | < mddev->array_sectors) { | 7566 | < mddev->array_sectors) { |
| 7433 | printk(KERN_ERR "md/raid:%s: array size must be reduced " | 7567 | pr_warn("md/raid:%s: array size must be reduced before number of disks\n", |
| 7434 | "before number of disks\n", mdname(mddev)); | 7568 | mdname(mddev)); |
| 7435 | return -EINVAL; | 7569 | return -EINVAL; |
| 7436 | } | 7570 | } |
| 7437 | 7571 | ||
| @@ -7501,7 +7635,7 @@ static int raid5_start_reshape(struct mddev *mddev) | |||
| 7501 | } | 7635 | } |
| 7502 | mddev->raid_disks = conf->raid_disks; | 7636 | mddev->raid_disks = conf->raid_disks; |
| 7503 | mddev->reshape_position = conf->reshape_progress; | 7637 | mddev->reshape_position = conf->reshape_progress; |
| 7504 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | 7638 | set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); |
| 7505 | 7639 | ||
| 7506 | clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); | 7640 | clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); |
| 7507 | clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); | 7641 | clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); |
| @@ -7619,6 +7753,7 @@ static void raid5_quiesce(struct mddev *mddev, int state) | |||
| 7619 | /* '2' tells resync/reshape to pause so that all | 7753 | /* '2' tells resync/reshape to pause so that all |
| 7620 | * active stripes can drain | 7754 | * active stripes can drain |
| 7621 | */ | 7755 | */ |
| 7756 | r5c_flush_cache(conf, INT_MAX); | ||
| 7622 | conf->quiesce = 2; | 7757 | conf->quiesce = 2; |
| 7623 | wait_event_cmd(conf->wait_for_quiescent, | 7758 | wait_event_cmd(conf->wait_for_quiescent, |
| 7624 | atomic_read(&conf->active_stripes) == 0 && | 7759 | atomic_read(&conf->active_stripes) == 0 && |
| @@ -7649,8 +7784,8 @@ static void *raid45_takeover_raid0(struct mddev *mddev, int level) | |||
| 7649 | 7784 | ||
| 7650 | /* for raid0 takeover only one zone is supported */ | 7785 | /* for raid0 takeover only one zone is supported */ |
| 7651 | if (raid0_conf->nr_strip_zones > 1) { | 7786 | if (raid0_conf->nr_strip_zones > 1) { |
| 7652 | printk(KERN_ERR "md/raid:%s: cannot takeover raid0 with more than one zone.\n", | 7787 | pr_warn("md/raid:%s: cannot takeover raid0 with more than one zone.\n", |
| 7653 | mdname(mddev)); | 7788 | mdname(mddev)); |
| 7654 | return ERR_PTR(-EINVAL); | 7789 | return ERR_PTR(-EINVAL); |
| 7655 | } | 7790 | } |
| 7656 | 7791 | ||
| @@ -7671,6 +7806,7 @@ static void *raid45_takeover_raid0(struct mddev *mddev, int level) | |||
| 7671 | static void *raid5_takeover_raid1(struct mddev *mddev) | 7806 | static void *raid5_takeover_raid1(struct mddev *mddev) |
| 7672 | { | 7807 | { |
| 7673 | int chunksect; | 7808 | int chunksect; |
| 7809 | void *ret; | ||
| 7674 | 7810 | ||
| 7675 | if (mddev->raid_disks != 2 || | 7811 | if (mddev->raid_disks != 2 || |
| 7676 | mddev->degraded > 1) | 7812 | mddev->degraded > 1) |
| @@ -7692,7 +7828,10 @@ static void *raid5_takeover_raid1(struct mddev *mddev) | |||
| 7692 | mddev->new_layout = ALGORITHM_LEFT_SYMMETRIC; | 7828 | mddev->new_layout = ALGORITHM_LEFT_SYMMETRIC; |
| 7693 | mddev->new_chunk_sectors = chunksect; | 7829 | mddev->new_chunk_sectors = chunksect; |
| 7694 | 7830 | ||
| 7695 | return setup_conf(mddev); | 7831 | ret = setup_conf(mddev); |
| 7832 | if (!IS_ERR_VALUE(ret)) | ||
| 7833 | clear_bit(MD_FAILFAST_SUPPORTED, &mddev->flags); | ||
| 7834 | return ret; | ||
| 7696 | } | 7835 | } |
| 7697 | 7836 | ||
| 7698 | static void *raid5_takeover_raid6(struct mddev *mddev) | 7837 | static void *raid5_takeover_raid6(struct mddev *mddev) |
| @@ -7762,7 +7901,7 @@ static int raid5_check_reshape(struct mddev *mddev) | |||
| 7762 | conf->chunk_sectors = new_chunk ; | 7901 | conf->chunk_sectors = new_chunk ; |
| 7763 | mddev->chunk_sectors = new_chunk; | 7902 | mddev->chunk_sectors = new_chunk; |
| 7764 | } | 7903 | } |
| 7765 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | 7904 | set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); |
| 7766 | md_wakeup_thread(mddev->thread); | 7905 | md_wakeup_thread(mddev->thread); |
| 7767 | } | 7906 | } |
| 7768 | return check_reshape(mddev); | 7907 | return check_reshape(mddev); |
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 57ec49f0839e..ed8e1362ab36 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h | |||
| @@ -226,6 +226,8 @@ struct stripe_head { | |||
| 226 | 226 | ||
| 227 | struct r5l_io_unit *log_io; | 227 | struct r5l_io_unit *log_io; |
| 228 | struct list_head log_list; | 228 | struct list_head log_list; |
| 229 | sector_t log_start; /* first meta block on the journal */ | ||
| 230 | struct list_head r5c; /* for r5c_cache->stripe_in_journal */ | ||
| 229 | /** | 231 | /** |
| 230 | * struct stripe_operations | 232 | * struct stripe_operations |
| 231 | * @target - STRIPE_OP_COMPUTE_BLK target | 233 | * @target - STRIPE_OP_COMPUTE_BLK target |
| @@ -264,6 +266,7 @@ struct stripe_head_state { | |||
| 264 | int syncing, expanding, expanded, replacing; | 266 | int syncing, expanding, expanded, replacing; |
| 265 | int locked, uptodate, to_read, to_write, failed, written; | 267 | int locked, uptodate, to_read, to_write, failed, written; |
| 266 | int to_fill, compute, req_compute, non_overwrite; | 268 | int to_fill, compute, req_compute, non_overwrite; |
| 269 | int injournal, just_cached; | ||
| 267 | int failed_num[2]; | 270 | int failed_num[2]; |
| 268 | int p_failed, q_failed; | 271 | int p_failed, q_failed; |
| 269 | int dec_preread_active; | 272 | int dec_preread_active; |
| @@ -273,6 +276,7 @@ struct stripe_head_state { | |||
| 273 | struct md_rdev *blocked_rdev; | 276 | struct md_rdev *blocked_rdev; |
| 274 | int handle_bad_blocks; | 277 | int handle_bad_blocks; |
| 275 | int log_failed; | 278 | int log_failed; |
| 279 | int waiting_extra_page; | ||
| 276 | }; | 280 | }; |
| 277 | 281 | ||
| 278 | /* Flags for struct r5dev.flags */ | 282 | /* Flags for struct r5dev.flags */ |
| @@ -313,6 +317,11 @@ enum r5dev_flags { | |||
| 313 | */ | 317 | */ |
| 314 | R5_Discard, /* Discard the stripe */ | 318 | R5_Discard, /* Discard the stripe */ |
| 315 | R5_SkipCopy, /* Don't copy data from bio to stripe cache */ | 319 | R5_SkipCopy, /* Don't copy data from bio to stripe cache */ |
| 320 | R5_InJournal, /* data being written is in the journal device. | ||
| 321 | * if R5_InJournal is set for parity pd_idx, all the | ||
| 322 | * data and parity being written are in the journal | ||
| 323 | * device | ||
| 324 | */ | ||
| 316 | }; | 325 | }; |
| 317 | 326 | ||
| 318 | /* | 327 | /* |
| @@ -345,7 +354,30 @@ enum { | |||
| 345 | STRIPE_BITMAP_PENDING, /* Being added to bitmap, don't add | 354 | STRIPE_BITMAP_PENDING, /* Being added to bitmap, don't add |
| 346 | * to batch yet. | 355 | * to batch yet. |
| 347 | */ | 356 | */ |
| 348 | STRIPE_LOG_TRAPPED, /* trapped into log */ | 357 | STRIPE_LOG_TRAPPED, /* trapped into log (see raid5-cache.c) |
| 358 | * this bit is used in two scenarios: | ||
| 359 | * | ||
| 360 | * 1. write-out phase | ||
| 361 | * set in first entry of r5l_write_stripe | ||
| 362 | * clear in second entry of r5l_write_stripe | ||
| 363 | * used to bypass logic in handle_stripe | ||
| 364 | * | ||
| 365 | * 2. caching phase | ||
| 366 | * set in r5c_try_caching_write() | ||
| 367 | * clear when journal write is done | ||
| 368 | * used to initiate r5c_cache_data() | ||
| 369 | * also used to bypass logic in handle_stripe | ||
| 370 | */ | ||
| 371 | STRIPE_R5C_CACHING, /* the stripe is in caching phase | ||
| 372 | * see more detail in the raid5-cache.c | ||
| 373 | */ | ||
| 374 | STRIPE_R5C_PARTIAL_STRIPE, /* in r5c cache (to-be/being handled or | ||
| 375 | * in conf->r5c_partial_stripe_list) | ||
| 376 | */ | ||
| 377 | STRIPE_R5C_FULL_STRIPE, /* in r5c cache (to-be/being handled or | ||
| 378 | * in conf->r5c_full_stripe_list) | ||
| 379 | */ | ||
| 380 | STRIPE_R5C_PREFLUSH, /* need to flush journal device */ | ||
| 349 | }; | 381 | }; |
| 350 | 382 | ||
| 351 | #define STRIPE_EXPAND_SYNC_FLAGS \ | 383 | #define STRIPE_EXPAND_SYNC_FLAGS \ |
| @@ -408,8 +440,86 @@ enum { | |||
| 408 | 440 | ||
| 409 | struct disk_info { | 441 | struct disk_info { |
| 410 | struct md_rdev *rdev, *replacement; | 442 | struct md_rdev *rdev, *replacement; |
| 443 | struct page *extra_page; /* extra page to use in prexor */ | ||
| 411 | }; | 444 | }; |
| 412 | 445 | ||
| 446 | /* | ||
| 447 | * Stripe cache | ||
| 448 | */ | ||
| 449 | |||
| 450 | #define NR_STRIPES 256 | ||
| 451 | #define STRIPE_SIZE PAGE_SIZE | ||
| 452 | #define STRIPE_SHIFT (PAGE_SHIFT - 9) | ||
| 453 | #define STRIPE_SECTORS (STRIPE_SIZE>>9) | ||
| 454 | #define IO_THRESHOLD 1 | ||
| 455 | #define BYPASS_THRESHOLD 1 | ||
| 456 | #define NR_HASH (PAGE_SIZE / sizeof(struct hlist_head)) | ||
| 457 | #define HASH_MASK (NR_HASH - 1) | ||
| 458 | #define MAX_STRIPE_BATCH 8 | ||
| 459 | |||
| 460 | /* bio's attached to a stripe+device for I/O are linked together in bi_sector | ||
| 461 | * order without overlap. There may be several bio's per stripe+device, and | ||
| 462 | * a bio could span several devices. | ||
| 463 | * When walking this list for a particular stripe+device, we must never proceed | ||
| 464 | * beyond a bio that extends past this device, as the next bio might no longer | ||
| 465 | * be valid. | ||
| 466 | * This function is used to determine the 'next' bio in the list, given the | ||
| 467 | * sector of the current stripe+device | ||
| 468 | */ | ||
| 469 | static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector) | ||
| 470 | { | ||
| 471 | int sectors = bio_sectors(bio); | ||
| 472 | |||
| 473 | if (bio->bi_iter.bi_sector + sectors < sector + STRIPE_SECTORS) | ||
| 474 | return bio->bi_next; | ||
| 475 | else | ||
| 476 | return NULL; | ||
| 477 | } | ||
| 478 | |||
| 479 | /* | ||
| 480 | * We maintain a biased count of active stripes in the bottom 16 bits of | ||
| 481 | * bi_phys_segments, and a count of processed stripes in the upper 16 bits | ||
| 482 | */ | ||
| 483 | static inline int raid5_bi_processed_stripes(struct bio *bio) | ||
| 484 | { | ||
| 485 | atomic_t *segments = (atomic_t *)&bio->bi_phys_segments; | ||
| 486 | |||
| 487 | return (atomic_read(segments) >> 16) & 0xffff; | ||
| 488 | } | ||
| 489 | |||
| 490 | static inline int raid5_dec_bi_active_stripes(struct bio *bio) | ||
| 491 | { | ||
| 492 | atomic_t *segments = (atomic_t *)&bio->bi_phys_segments; | ||
| 493 | |||
| 494 | return atomic_sub_return(1, segments) & 0xffff; | ||
| 495 | } | ||
| 496 | |||
| 497 | static inline void raid5_inc_bi_active_stripes(struct bio *bio) | ||
| 498 | { | ||
| 499 | atomic_t *segments = (atomic_t *)&bio->bi_phys_segments; | ||
| 500 | |||
| 501 | atomic_inc(segments); | ||
| 502 | } | ||
| 503 | |||
| 504 | static inline void raid5_set_bi_processed_stripes(struct bio *bio, | ||
| 505 | unsigned int cnt) | ||
| 506 | { | ||
| 507 | atomic_t *segments = (atomic_t *)&bio->bi_phys_segments; | ||
| 508 | int old, new; | ||
| 509 | |||
| 510 | do { | ||
| 511 | old = atomic_read(segments); | ||
| 512 | new = (old & 0xffff) | (cnt << 16); | ||
| 513 | } while (atomic_cmpxchg(segments, old, new) != old); | ||
| 514 | } | ||
| 515 | |||
| 516 | static inline void raid5_set_bi_stripes(struct bio *bio, unsigned int cnt) | ||
| 517 | { | ||
| 518 | atomic_t *segments = (atomic_t *)&bio->bi_phys_segments; | ||
| 519 | |||
| 520 | atomic_set(segments, cnt); | ||
| 521 | } | ||
| 522 | |||
| 413 | /* NOTE NR_STRIPE_HASH_LOCKS must remain below 64. | 523 | /* NOTE NR_STRIPE_HASH_LOCKS must remain below 64. |
| 414 | * This is because we sometimes take all the spinlocks | 524 | * This is because we sometimes take all the spinlocks |
| 415 | * and creating that much locking depth can cause | 525 | * and creating that much locking depth can cause |
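
To make the bit-packing above concrete, here is a user-space toy that mirrors the same layout with plain unsigned arithmetic instead of the kernel's atomic_t. The function names are stand-ins, and the usage pattern (seed the biased count at 1, add one reference per stripe the bio is attached to, drop references as stripes finish, complete the bio when the low half reaches zero) is a sketch of how raid5.c drives these helpers rather than kernel code:

#include <assert.h>
#include <stdio.h>

static unsigned int seg;   /* stands in for bio->bi_phys_segments */

static void set_stripes(unsigned int cnt)   { seg = (seg & 0xffff0000u) | cnt; }
static void inc_active(void)                { seg++; }
static unsigned int dec_active(void)        { return --seg & 0xffff; }
static void set_processed(unsigned int cnt) { seg = (seg & 0xffffu) | (cnt << 16); }
static unsigned int processed(void)         { return (seg >> 16) & 0xffff; }

int main(void)
{
	unsigned int remaining;
	int i;

	set_stripes(1);                   /* bias: the submitter holds one reference */
	for (i = 0; i < 3; i++)
		inc_active();             /* one reference per stripe the bio maps to */
	set_processed(3);

	remaining = dec_active();         /* the submitter drops its bias ... */
	for (i = 0; i < 3; i++)
		remaining = dec_active(); /* ... and each stripe drops its reference */

	assert(remaining == 0);           /* zero means the bio may be completed */
	printf("processed=%u remaining=%u\n", processed(), remaining);
	return 0;
}
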
| @@ -432,6 +542,30 @@ struct r5worker_group { | |||
| 432 | int stripes_cnt; | 542 | int stripes_cnt; |
| 433 | }; | 543 | }; |
| 434 | 544 | ||
| 545 | enum r5_cache_state { | ||
| 546 | R5_INACTIVE_BLOCKED, /* release of inactive stripes blocked, | ||
| 547 | * waiting for 25% to be free | ||
| 548 | */ | ||
| 549 | R5_ALLOC_MORE, /* It might help to allocate another | ||
| 550 | * stripe. | ||
| 551 | */ | ||
| 552 | R5_DID_ALLOC, /* A stripe was allocated, don't allocate | ||
| 553 | * more until at least one has been | ||
| 554 | * released. This avoids flooding | ||
| 555 | * the cache. | ||
| 556 | */ | ||
| 557 | R5C_LOG_TIGHT, /* log device space tight, need to | ||
| 558 | * prioritize stripes at last_checkpoint | ||
| 559 | */ | ||
| 560 | R5C_LOG_CRITICAL, /* log device is running out of space, | ||
| 561 | * only process stripes that are already | ||
| 562 | * occupying the log | ||
| 563 | */ | ||
| 564 | R5C_EXTRA_PAGE_IN_USE, /* a stripe is using disk_info.extra_page | ||
| 565 | * for prexor | ||
| 566 | */ | ||
| 567 | }; | ||
| 568 | |||
| 435 | struct r5conf { | 569 | struct r5conf { |
| 436 | struct hlist_head *stripe_hashtbl; | 570 | struct hlist_head *stripe_hashtbl; |
| 437 | /* only protect corresponding hash list and inactive_list */ | 571 | /* only protect corresponding hash list and inactive_list */ |
| @@ -519,23 +653,18 @@ struct r5conf { | |||
| 519 | */ | 653 | */ |
| 520 | atomic_t active_stripes; | 654 | atomic_t active_stripes; |
| 521 | struct list_head inactive_list[NR_STRIPE_HASH_LOCKS]; | 655 | struct list_head inactive_list[NR_STRIPE_HASH_LOCKS]; |
| 656 | |||
| 657 | atomic_t r5c_cached_full_stripes; | ||
| 658 | struct list_head r5c_full_stripe_list; | ||
| 659 | atomic_t r5c_cached_partial_stripes; | ||
| 660 | struct list_head r5c_partial_stripe_list; | ||
| 661 | |||
| 522 | atomic_t empty_inactive_list_nr; | 662 | atomic_t empty_inactive_list_nr; |
| 523 | struct llist_head released_stripes; | 663 | struct llist_head released_stripes; |
| 524 | wait_queue_head_t wait_for_quiescent; | 664 | wait_queue_head_t wait_for_quiescent; |
| 525 | wait_queue_head_t wait_for_stripe; | 665 | wait_queue_head_t wait_for_stripe; |
| 526 | wait_queue_head_t wait_for_overlap; | 666 | wait_queue_head_t wait_for_overlap; |
| 527 | unsigned long cache_state; | 667 | unsigned long cache_state; |
| 528 | #define R5_INACTIVE_BLOCKED 1 /* release of inactive stripes blocked, | ||
| 529 | * waiting for 25% to be free | ||
| 530 | */ | ||
| 531 | #define R5_ALLOC_MORE 2 /* It might help to allocate another | ||
| 532 | * stripe. | ||
| 533 | */ | ||
| 534 | #define R5_DID_ALLOC 4 /* A stripe was allocated, don't allocate | ||
| 535 | * more until at least one has been | ||
| 536 | * released. This avoids flooding | ||
| 537 | * the cache. | ||
| 538 | */ | ||
| 539 | struct shrinker shrinker; | 668 | struct shrinker shrinker; |
| 540 | int pool_size; /* number of disks in stripeheads in pool */ | 669 | int pool_size; /* number of disks in stripeheads in pool */ |
| 541 | spinlock_t device_lock; | 670 | spinlock_t device_lock; |
| @@ -633,4 +762,23 @@ extern void r5l_stripe_write_finished(struct stripe_head *sh); | |||
| 633 | extern int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio); | 762 | extern int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio); |
| 634 | extern void r5l_quiesce(struct r5l_log *log, int state); | 763 | extern void r5l_quiesce(struct r5l_log *log, int state); |
| 635 | extern bool r5l_log_disk_error(struct r5conf *conf); | 764 | extern bool r5l_log_disk_error(struct r5conf *conf); |
| 765 | extern bool r5c_is_writeback(struct r5l_log *log); | ||
| 766 | extern int | ||
| 767 | r5c_try_caching_write(struct r5conf *conf, struct stripe_head *sh, | ||
| 768 | struct stripe_head_state *s, int disks); | ||
| 769 | extern void | ||
| 770 | r5c_finish_stripe_write_out(struct r5conf *conf, struct stripe_head *sh, | ||
| 771 | struct stripe_head_state *s); | ||
| 772 | extern void r5c_release_extra_page(struct stripe_head *sh); | ||
| 773 | extern void r5c_use_extra_page(struct stripe_head *sh); | ||
| 774 | extern void r5l_wake_reclaim(struct r5l_log *log, sector_t space); | ||
| 775 | extern void r5c_handle_cached_data_endio(struct r5conf *conf, | ||
| 776 | struct stripe_head *sh, int disks, struct bio_list *return_bi); | ||
| 777 | extern int r5c_cache_data(struct r5l_log *log, struct stripe_head *sh, | ||
| 778 | struct stripe_head_state *s); | ||
| 779 | extern void r5c_make_stripe_write_out(struct stripe_head *sh); | ||
| 780 | extern void r5c_flush_cache(struct r5conf *conf, int num); | ||
| 781 | extern void r5c_check_stripe_cache_usage(struct r5conf *conf); | ||
| 782 | extern void r5c_check_cached_full_stripe(struct r5conf *conf); | ||
| 783 | extern struct md_sysfs_entry r5c_journal_mode; | ||
| 636 | #endif | 784 | #endif |
diff --git a/include/uapi/linux/raid/md_p.h b/include/uapi/linux/raid/md_p.h index c3e654c6d518..9930f3e9040f 100644 --- a/include/uapi/linux/raid/md_p.h +++ b/include/uapi/linux/raid/md_p.h | |||
| @@ -84,6 +84,10 @@ | |||
| 84 | #define MD_DISK_CANDIDATE 5 /* disk is added as spare (local) until confirmed | 84 | #define MD_DISK_CANDIDATE 5 /* disk is added as spare (local) until confirmed |
| 85 | * For clustered environments only. | 85 | * For clustered environments only. |
| 86 | */ | 86 | */ |
| 87 | #define MD_DISK_FAILFAST 10 /* Send REQ_FAILFAST if there are multiple | ||
| 88 | * devices available - and don't try to | ||
| 89 | * correct read errors. | ||
| 90 | */ | ||
| 87 | 91 | ||
| 88 | #define MD_DISK_WRITEMOSTLY 9 /* disk is "write-mostly" in RAID1 config. | 92 | #define MD_DISK_WRITEMOSTLY 9 /* disk is "write-mostly" in RAID1 config. |
| 89 | * read requests will only be sent here in | 93 | * read requests will only be sent here in |
| @@ -265,8 +269,9 @@ struct mdp_superblock_1 { | |||
| 265 | __le32 dev_number; /* permanent identifier of this device - not role in raid */ | 269 | __le32 dev_number; /* permanent identifier of this device - not role in raid */ |
| 266 | __le32 cnt_corrected_read; /* number of read errors that were corrected by re-writing */ | 270 | __le32 cnt_corrected_read; /* number of read errors that were corrected by re-writing */ |
| 267 | __u8 device_uuid[16]; /* user-space settable, ignored by kernel */ | 271 | __u8 device_uuid[16]; /* user-space settable, ignored by kernel */ |
| 268 | __u8 devflags; /* per-device flags. Only one defined...*/ | 272 | __u8 devflags; /* per-device flags. Only two defined...*/ |
| 269 | #define WriteMostly1 1 /* mask for writemostly flag in above */ | 273 | #define WriteMostly1 1 /* mask for writemostly flag in above */ |
| 274 | #define FailFast1 2 /* Should avoid retries and fixups and just fail */ | ||
| 270 | /* Bad block log. If there are any bad blocks the feature flag is set. | 275 | /* Bad block log. If there are any bad blocks the feature flag is set. |
| 271 | * If offset and size are non-zero, that space is reserved and available | 276 | * If offset and size are non-zero, that space is reserved and available |
| 272 | */ | 277 | */ |
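
For completeness: MD_DISK_FAILFAST is a bit number used in per-device state words (tested as 1 << MD_DISK_FAILFAST), whereas FailFast1 and WriteMostly1 are masks applied directly to the devflags byte of the v1.x superblock. A minimal user-space check could look like the following sketch; it assumes the updated uapi header is installed as <linux/raid/md_p.h> and is purely illustrative, not part of mdadm:

#include <stdio.h>
#include <linux/raid/md_p.h>

int main(void)
{
	__u8 devflags = FailFast1;                   /* as read from sb->devflags */
	__u32 state = (1u << MD_DISK_ACTIVE) | (1u << MD_DISK_FAILFAST);

	printf("devflags: failfast=%d writemostly=%d\n",
	       !!(devflags & FailFast1), !!(devflags & WriteMostly1));
	printf("disk state: failfast=%d\n",
	       !!(state & (1u << MD_DISK_FAILFAST)));
	return 0;
}
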
diff --git a/lib/raid6/avx2.c b/lib/raid6/avx2.c index 76734004358d..20bca3d44f67 100644 --- a/lib/raid6/avx2.c +++ b/lib/raid6/avx2.c | |||
| @@ -87,9 +87,57 @@ static void raid6_avx21_gen_syndrome(int disks, size_t bytes, void **ptrs) | |||
| 87 | kernel_fpu_end(); | 87 | kernel_fpu_end(); |
| 88 | } | 88 | } |
| 89 | 89 | ||
| 90 | static void raid6_avx21_xor_syndrome(int disks, int start, int stop, | ||
| 91 | size_t bytes, void **ptrs) | ||
| 92 | { | ||
| 93 | u8 **dptr = (u8 **)ptrs; | ||
| 94 | u8 *p, *q; | ||
| 95 | int d, z, z0; | ||
| 96 | |||
| 97 | z0 = stop; /* P/Q right side optimization */ | ||
| 98 | p = dptr[disks-2]; /* XOR parity */ | ||
| 99 | q = dptr[disks-1]; /* RS syndrome */ | ||
| 100 | |||
| 101 | kernel_fpu_begin(); | ||
| 102 | |||
| 103 | asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0])); | ||
| 104 | |||
| 105 | for (d = 0 ; d < bytes ; d += 32) { | ||
| 106 | asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d])); | ||
| 107 | asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d])); | ||
| 108 | asm volatile("vpxor %ymm4,%ymm2,%ymm2"); | ||
| 109 | /* P/Q data pages */ | ||
| 110 | for (z = z0-1 ; z >= start ; z--) { | ||
| 111 | asm volatile("vpxor %ymm5,%ymm5,%ymm5"); | ||
| 112 | asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5"); | ||
| 113 | asm volatile("vpaddb %ymm4,%ymm4,%ymm4"); | ||
| 114 | asm volatile("vpand %ymm0,%ymm5,%ymm5"); | ||
| 115 | asm volatile("vpxor %ymm5,%ymm4,%ymm4"); | ||
| 116 | asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d])); | ||
| 117 | asm volatile("vpxor %ymm5,%ymm2,%ymm2"); | ||
| 118 | asm volatile("vpxor %ymm5,%ymm4,%ymm4"); | ||
| 119 | } | ||
| 120 | /* P/Q left side optimization */ | ||
| 121 | for (z = start-1 ; z >= 0 ; z--) { | ||
| 122 | asm volatile("vpxor %ymm5,%ymm5,%ymm5"); | ||
| 123 | asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5"); | ||
| 124 | asm volatile("vpaddb %ymm4,%ymm4,%ymm4"); | ||
| 125 | asm volatile("vpand %ymm0,%ymm5,%ymm5"); | ||
| 126 | asm volatile("vpxor %ymm5,%ymm4,%ymm4"); | ||
| 127 | } | ||
| 128 | asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d])); | ||
| 129 | /* Don't use movntdq for r/w memory area < cache line */ | ||
| 130 | asm volatile("vmovdqa %%ymm4,%0" : "=m" (q[d])); | ||
| 131 | asm volatile("vmovdqa %%ymm2,%0" : "=m" (p[d])); | ||
| 132 | } | ||
| 133 | |||
| 134 | asm volatile("sfence" : : : "memory"); | ||
| 135 | kernel_fpu_end(); | ||
| 136 | } | ||
| 137 | |||
| 90 | const struct raid6_calls raid6_avx2x1 = { | 138 | const struct raid6_calls raid6_avx2x1 = { |
| 91 | raid6_avx21_gen_syndrome, | 139 | raid6_avx21_gen_syndrome, |
| 92 | NULL, /* XOR not yet implemented */ | 140 | raid6_avx21_xor_syndrome, |
| 93 | raid6_have_avx2, | 141 | raid6_have_avx2, |
| 94 | "avx2x1", | 142 | "avx2x1", |
| 95 | 1 /* Has cache hints */ | 143 | 1 /* Has cache hints */ |
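
The AVX2 kernel above, and the 2x- and 4x-unrolled variants in the following hunks, all implement the same RAID-6 partial update: P is XORed with the data of columns start..stop, while Q accumulates those columns with one GF(2^8) multiply-by-x per step, using the byte-wise 0x1d reduction loaded into ymm0. As a readability aid, an unoptimized scalar rendering of the same computation (written for this write-up, not taken from lib/raid6; it assumes the usual RAID-6 field polynomial 0x11d that the 0x1d constant encodes):

#include <stddef.h>
#include <stdint.h>

/* Multiply by x (i.e. by 2) in GF(2^8): shift left and fold the carry
 * back in with 0x1d, which is what the vpcmpgtb/vpaddb/vpand/vpxor
 * sequence above does for 32 bytes at a time. */
static inline uint8_t gf2_mul_x(uint8_t v)
{
	return (uint8_t)((v << 1) ^ ((v & 0x80) ? 0x1d : 0));
}

static void xor_syndrome_ref(int disks, int start, int stop,
			     size_t bytes, void **ptrs)
{
	uint8_t **dptr = (uint8_t **)ptrs;
	uint8_t *p = dptr[disks - 2];   /* XOR parity */
	uint8_t *q = dptr[disks - 1];   /* RS syndrome */

	for (size_t d = 0; d < bytes; d++) {
		uint8_t wp = dptr[stop][d];
		uint8_t wq = dptr[stop][d];

		/* columns stop-1 .. start feed both P and Q */
		for (int z = stop - 1; z >= start; z--) {
			wq = (uint8_t)(gf2_mul_x(wq) ^ dptr[z][d]);
			wp ^= dptr[z][d];
		}
		/* columns start-1 .. 0 only shift Q into position */
		for (int z = start - 1; z >= 0; z--)
			wq = gf2_mul_x(wq);

		p[d] ^= wp;   /* update the stored parities in place */
		q[d] ^= wq;
	}
}
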
| @@ -149,9 +197,77 @@ static void raid6_avx22_gen_syndrome(int disks, size_t bytes, void **ptrs) | |||
| 149 | kernel_fpu_end(); | 197 | kernel_fpu_end(); |
| 150 | } | 198 | } |
| 151 | 199 | ||
| 200 | static void raid6_avx22_xor_syndrome(int disks, int start, int stop, | ||
| 201 | size_t bytes, void **ptrs) | ||
| 202 | { | ||
| 203 | u8 **dptr = (u8 **)ptrs; | ||
| 204 | u8 *p, *q; | ||
| 205 | int d, z, z0; | ||
| 206 | |||
| 207 | z0 = stop; /* P/Q right side optimization */ | ||
| 208 | p = dptr[disks-2]; /* XOR parity */ | ||
| 209 | q = dptr[disks-1]; /* RS syndrome */ | ||
| 210 | |||
| 211 | kernel_fpu_begin(); | ||
| 212 | |||
| 213 | asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0])); | ||
| 214 | |||
| 215 | for (d = 0 ; d < bytes ; d += 64) { | ||
| 216 | asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d])); | ||
| 217 | asm volatile("vmovdqa %0,%%ymm6" :: "m" (dptr[z0][d+32])); | ||
| 218 | asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d])); | ||
| 219 | asm volatile("vmovdqa %0,%%ymm3" : : "m" (p[d+32])); | ||
| 220 | asm volatile("vpxor %ymm4,%ymm2,%ymm2"); | ||
| 221 | asm volatile("vpxor %ymm6,%ymm3,%ymm3"); | ||
| 222 | /* P/Q data pages */ | ||
| 223 | for (z = z0-1 ; z >= start ; z--) { | ||
| 224 | asm volatile("vpxor %ymm5,%ymm5,%ymm5"); | ||
| 225 | asm volatile("vpxor %ymm7,%ymm7,%ymm7"); | ||
| 226 | asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5"); | ||
| 227 | asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7"); | ||
| 228 | asm volatile("vpaddb %ymm4,%ymm4,%ymm4"); | ||
| 229 | asm volatile("vpaddb %ymm6,%ymm6,%ymm6"); | ||
| 230 | asm volatile("vpand %ymm0,%ymm5,%ymm5"); | ||
| 231 | asm volatile("vpand %ymm0,%ymm7,%ymm7"); | ||
| 232 | asm volatile("vpxor %ymm5,%ymm4,%ymm4"); | ||
| 233 | asm volatile("vpxor %ymm7,%ymm6,%ymm6"); | ||
| 234 | asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d])); | ||
| 235 | asm volatile("vmovdqa %0,%%ymm7" | ||
| 236 | :: "m" (dptr[z][d+32])); | ||
| 237 | asm volatile("vpxor %ymm5,%ymm2,%ymm2"); | ||
| 238 | asm volatile("vpxor %ymm7,%ymm3,%ymm3"); | ||
| 239 | asm volatile("vpxor %ymm5,%ymm4,%ymm4"); | ||
| 240 | asm volatile("vpxor %ymm7,%ymm6,%ymm6"); | ||
| 241 | } | ||
| 242 | /* P/Q left side optimization */ | ||
| 243 | for (z = start-1 ; z >= 0 ; z--) { | ||
| 244 | asm volatile("vpxor %ymm5,%ymm5,%ymm5"); | ||
| 245 | asm volatile("vpxor %ymm7,%ymm7,%ymm7"); | ||
| 246 | asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5"); | ||
| 247 | asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7"); | ||
| 248 | asm volatile("vpaddb %ymm4,%ymm4,%ymm4"); | ||
| 249 | asm volatile("vpaddb %ymm6,%ymm6,%ymm6"); | ||
| 250 | asm volatile("vpand %ymm0,%ymm5,%ymm5"); | ||
| 251 | asm volatile("vpand %ymm0,%ymm7,%ymm7"); | ||
| 252 | asm volatile("vpxor %ymm5,%ymm4,%ymm4"); | ||
| 253 | asm volatile("vpxor %ymm7,%ymm6,%ymm6"); | ||
| 254 | } | ||
| 255 | asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d])); | ||
| 256 | asm volatile("vpxor %0,%%ymm6,%%ymm6" : : "m" (q[d+32])); | ||
| 257 | /* Don't use movntdq for r/w memory area < cache line */ | ||
| 258 | asm volatile("vmovdqa %%ymm4,%0" : "=m" (q[d])); | ||
| 259 | asm volatile("vmovdqa %%ymm6,%0" : "=m" (q[d+32])); | ||
| 260 | asm volatile("vmovdqa %%ymm2,%0" : "=m" (p[d])); | ||
| 261 | asm volatile("vmovdqa %%ymm3,%0" : "=m" (p[d+32])); | ||
| 262 | } | ||
| 263 | |||
| 264 | asm volatile("sfence" : : : "memory"); | ||
| 265 | kernel_fpu_end(); | ||
| 266 | } | ||
| 267 | |||
| 152 | const struct raid6_calls raid6_avx2x2 = { | 268 | const struct raid6_calls raid6_avx2x2 = { |
| 153 | raid6_avx22_gen_syndrome, | 269 | raid6_avx22_gen_syndrome, |
| 154 | NULL, /* XOR not yet implemented */ | 270 | raid6_avx22_xor_syndrome, |
| 155 | raid6_have_avx2, | 271 | raid6_have_avx2, |
| 156 | "avx2x2", | 272 | "avx2x2", |
| 157 | 1 /* Has cache hints */ | 273 | 1 /* Has cache hints */ |
| @@ -242,9 +358,119 @@ static void raid6_avx24_gen_syndrome(int disks, size_t bytes, void **ptrs) | |||
| 242 | kernel_fpu_end(); | 358 | kernel_fpu_end(); |
| 243 | } | 359 | } |
| 244 | 360 | ||
| 361 | static void raid6_avx24_xor_syndrome(int disks, int start, int stop, | ||
| 362 | size_t bytes, void **ptrs) | ||
| 363 | { | ||
| 364 | u8 **dptr = (u8 **)ptrs; | ||
| 365 | u8 *p, *q; | ||
| 366 | int d, z, z0; | ||
| 367 | |||
| 368 | z0 = stop; /* P/Q right side optimization */ | ||
| 369 | p = dptr[disks-2]; /* XOR parity */ | ||
| 370 | q = dptr[disks-1]; /* RS syndrome */ | ||
| 371 | |||
| 372 | kernel_fpu_begin(); | ||
| 373 | |||
| 374 | asm volatile("vmovdqa %0,%%ymm0" :: "m" (raid6_avx2_constants.x1d[0])); | ||
| 375 | |||
| 376 | for (d = 0 ; d < bytes ; d += 128) { | ||
| 377 | asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d])); | ||
| 378 | asm volatile("vmovdqa %0,%%ymm6" :: "m" (dptr[z0][d+32])); | ||
| 379 | asm volatile("vmovdqa %0,%%ymm12" :: "m" (dptr[z0][d+64])); | ||
| 380 | asm volatile("vmovdqa %0,%%ymm14" :: "m" (dptr[z0][d+96])); | ||
| 381 | asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d])); | ||
| 382 | asm volatile("vmovdqa %0,%%ymm3" : : "m" (p[d+32])); | ||
| 383 | asm volatile("vmovdqa %0,%%ymm10" : : "m" (p[d+64])); | ||
| 384 | asm volatile("vmovdqa %0,%%ymm11" : : "m" (p[d+96])); | ||
| 385 | asm volatile("vpxor %ymm4,%ymm2,%ymm2"); | ||
| 386 | asm volatile("vpxor %ymm6,%ymm3,%ymm3"); | ||
| 387 | asm volatile("vpxor %ymm12,%ymm10,%ymm10"); | ||
| 388 | asm volatile("vpxor %ymm14,%ymm11,%ymm11"); | ||
| 389 | /* P/Q data pages */ | ||
| 390 | for (z = z0-1 ; z >= start ; z--) { | ||
| 391 | asm volatile("prefetchnta %0" :: "m" (dptr[z][d])); | ||
| 392 | asm volatile("prefetchnta %0" :: "m" (dptr[z][d+64])); | ||
| 393 | asm volatile("vpxor %ymm5,%ymm5,%ymm5"); | ||
| 394 | asm volatile("vpxor %ymm7,%ymm7,%ymm7"); | ||
| 395 | asm volatile("vpxor %ymm13,%ymm13,%ymm13"); | ||
| 396 | asm volatile("vpxor %ymm15,%ymm15,%ymm15"); | ||
| 397 | asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5"); | ||
| 398 | asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7"); | ||
| 399 | asm volatile("vpcmpgtb %ymm12,%ymm13,%ymm13"); | ||
| 400 | asm volatile("vpcmpgtb %ymm14,%ymm15,%ymm15"); | ||
| 401 | asm volatile("vpaddb %ymm4,%ymm4,%ymm4"); | ||
| 402 | asm volatile("vpaddb %ymm6,%ymm6,%ymm6"); | ||
| 403 | asm volatile("vpaddb %ymm12,%ymm12,%ymm12"); | ||
| 404 | asm volatile("vpaddb %ymm14,%ymm14,%ymm14"); | ||
| 405 | asm volatile("vpand %ymm0,%ymm5,%ymm5"); | ||
| 406 | asm volatile("vpand %ymm0,%ymm7,%ymm7"); | ||
| 407 | asm volatile("vpand %ymm0,%ymm13,%ymm13"); | ||
| 408 | asm volatile("vpand %ymm0,%ymm15,%ymm15"); | ||
| 409 | asm volatile("vpxor %ymm5,%ymm4,%ymm4"); | ||
| 410 | asm volatile("vpxor %ymm7,%ymm6,%ymm6"); | ||
| 411 | asm volatile("vpxor %ymm13,%ymm12,%ymm12"); | ||
| 412 | asm volatile("vpxor %ymm15,%ymm14,%ymm14"); | ||
| 413 | asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d])); | ||
| 414 | asm volatile("vmovdqa %0,%%ymm7" | ||
| 415 | :: "m" (dptr[z][d+32])); | ||
| 416 | asm volatile("vmovdqa %0,%%ymm13" | ||
| 417 | :: "m" (dptr[z][d+64])); | ||
| 418 | asm volatile("vmovdqa %0,%%ymm15" | ||
| 419 | :: "m" (dptr[z][d+96])); | ||
| 420 | asm volatile("vpxor %ymm5,%ymm2,%ymm2"); | ||
| 421 | asm volatile("vpxor %ymm7,%ymm3,%ymm3"); | ||
| 422 | asm volatile("vpxor %ymm13,%ymm10,%ymm10"); | ||
| 423 | asm volatile("vpxor %ymm15,%ymm11,%ymm11"); | ||
| 424 | asm volatile("vpxor %ymm5,%ymm4,%ymm4"); | ||
| 425 | asm volatile("vpxor %ymm7,%ymm6,%ymm6"); | ||
| 426 | asm volatile("vpxor %ymm13,%ymm12,%ymm12"); | ||
| 427 | asm volatile("vpxor %ymm15,%ymm14,%ymm14"); | ||
| 428 | } | ||
| 429 | asm volatile("prefetchnta %0" :: "m" (q[d])); | ||
| 430 | asm volatile("prefetchnta %0" :: "m" (q[d+64])); | ||
| 431 | /* P/Q left side optimization */ | ||
| 432 | for (z = start-1 ; z >= 0 ; z--) { | ||
| 433 | asm volatile("vpxor %ymm5,%ymm5,%ymm5"); | ||
| 434 | asm volatile("vpxor %ymm7,%ymm7,%ymm7"); | ||
| 435 | asm volatile("vpxor %ymm13,%ymm13,%ymm13"); | ||
| 436 | asm volatile("vpxor %ymm15,%ymm15,%ymm15"); | ||
| 437 | asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5"); | ||
| 438 | asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7"); | ||
| 439 | asm volatile("vpcmpgtb %ymm12,%ymm13,%ymm13"); | ||
| 440 | asm volatile("vpcmpgtb %ymm14,%ymm15,%ymm15"); | ||
| 441 | asm volatile("vpaddb %ymm4,%ymm4,%ymm4"); | ||
| 442 | asm volatile("vpaddb %ymm6,%ymm6,%ymm6"); | ||
| 443 | asm volatile("vpaddb %ymm12,%ymm12,%ymm12"); | ||
| 444 | asm volatile("vpaddb %ymm14,%ymm14,%ymm14"); | ||
| 445 | asm volatile("vpand %ymm0,%ymm5,%ymm5"); | ||
| 446 | asm volatile("vpand %ymm0,%ymm7,%ymm7"); | ||
| 447 | asm volatile("vpand %ymm0,%ymm13,%ymm13"); | ||
| 448 | asm volatile("vpand %ymm0,%ymm15,%ymm15"); | ||
| 449 | asm volatile("vpxor %ymm5,%ymm4,%ymm4"); | ||
| 450 | asm volatile("vpxor %ymm7,%ymm6,%ymm6"); | ||
| 451 | asm volatile("vpxor %ymm13,%ymm12,%ymm12"); | ||
| 452 | asm volatile("vpxor %ymm15,%ymm14,%ymm14"); | ||
| 453 | } | ||
| 454 | asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d])); | ||
| 455 | asm volatile("vmovntdq %%ymm3,%0" : "=m" (p[d+32])); | ||
| 456 | asm volatile("vmovntdq %%ymm10,%0" : "=m" (p[d+64])); | ||
| 457 | asm volatile("vmovntdq %%ymm11,%0" : "=m" (p[d+96])); | ||
| 458 | asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d])); | ||
| 459 | asm volatile("vpxor %0,%%ymm6,%%ymm6" : : "m" (q[d+32])); | ||
| 460 | asm volatile("vpxor %0,%%ymm12,%%ymm12" : : "m" (q[d+64])); | ||
| 461 | asm volatile("vpxor %0,%%ymm14,%%ymm14" : : "m" (q[d+96])); | ||
| 462 | asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d])); | ||
| 463 | asm volatile("vmovntdq %%ymm6,%0" : "=m" (q[d+32])); | ||
| 464 | asm volatile("vmovntdq %%ymm12,%0" : "=m" (q[d+64])); | ||
| 465 | asm volatile("vmovntdq %%ymm14,%0" : "=m" (q[d+96])); | ||
| 466 | } | ||
| 467 | asm volatile("sfence" : : : "memory"); | ||
| 468 | kernel_fpu_end(); | ||
| 469 | } | ||
| 470 | |||
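Both AVX2 variants implement the same contract as the generic xor_syndrome routines: fold the P/Q contribution of data disks start..stop into the existing parity. Starting the Horner recurrence at stop is the "right side optimization" (disks above stop contribute nothing), and once below start only the multiply-by-x step is needed (the "left side optimization"). A byte-at-a-time sketch of that contract, illustrative only; the real code processes 64 or 128 bytes per loop iteration and the function names here are made up:

	#include <stddef.h>
	#include <stdint.h>

	/* GF(2^8) multiply by x, polynomial 0x11d (see the earlier sketch). */
	static inline uint8_t gf256_mul2(uint8_t b)
	{
		return (uint8_t)((b << 1) ^ ((b & 0x80) ? 0x1d : 0x00));
	}

	/*
	 * Scalar sketch of the xor_syndrome contract: ptrs holds the data
	 * disks first, then P at ptrs[disks-2] and Q at ptrs[disks-1].
	 * The contribution of data disks [start, stop] is XORed into the
	 * existing P and Q.
	 */
	static void xor_syndrome_ref(int disks, int start, int stop,
				     size_t bytes, void **ptrs)
	{
		uint8_t **d = (uint8_t **)ptrs;
		uint8_t *p = d[disks - 2];
		uint8_t *q = d[disks - 1];
		size_t i;
		int z;

		for (i = 0; i < bytes; i++) {
			uint8_t wp = d[stop][i];	/* running P delta */
			uint8_t wq = d[stop][i];	/* running Q delta */

			/* Horner evaluation over the disks carrying data. */
			for (z = stop - 1; z >= start; z--) {
				wp ^= d[z][i];
				wq = gf256_mul2(wq) ^ d[z][i];
			}
			/* Disks below start contribute no data: only keep
			 * multiplying by x to reach the right power. */
			for (z = start - 1; z >= 0; z--)
				wq = gf256_mul2(wq);

			p[i] ^= wp;
			q[i] ^= wq;
		}
	}

Because every step is an XOR over GF(2), a read-modify-write caller can run this once with the old contents of the rewritten disks (removing their contribution from P/Q) and once with the new contents (adding it back).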
| 245 | const struct raid6_calls raid6_avx2x4 = { | 471 | const struct raid6_calls raid6_avx2x4 = { |
| 246 | raid6_avx24_gen_syndrome, | 472 | raid6_avx24_gen_syndrome, |
| 247 | NULL, /* XOR not yet implemented */ | 473 | raid6_avx24_xor_syndrome, |
| 248 | raid6_have_avx2, | 474 | raid6_have_avx2, |
| 249 | "avx2x4", | 475 | "avx2x4", |
| 250 | 1 /* Has cache hints */ | 476 | 1 /* Has cache hints */ |
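With both table slots filled in, kernel code can reach these routines through the boot-selected dispatch table rather than naming a specific implementation. The fragment below is an illustrative sketch only and not the md/raid5 call path (which goes through the async_tx layer); it assumes the usual lib/raid6 pointer layout and page-sized, naturally aligned buffers, which satisfies the aligned-load and per-iteration-stride requirements of the AVX2 routines:

	#include <linux/raid/pq.h>

	/*
	 * Illustrative only: fold the P/Q contribution of data disks
	 * [start, stop] of one stripe into its parity, using whichever
	 * implementation lib/raid6/algos.c selected at boot (raid6_call).
	 *
	 * ptrs[0..disks-3] = data pages, ptrs[disks-2] = P, ptrs[disks-1] = Q.
	 */
	static void fold_partial_stripe(void **ptrs, int disks,
					int start, int stop, size_t bytes)
	{
		/*
		 * xor_syndrome may still be NULL for tables that predate the
		 * delta-syndrome support; a real caller would have to fall
		 * back to a full gen_syndrome() reconstruct-write instead.
		 */
		if (raid6_call.xor_syndrome)
			raid6_call.xor_syndrome(disks, start, stop, bytes, ptrs);
	}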
