 Documentation/md.txt        |  28
 drivers/md/bitmap.c         |  29
 drivers/md/linear.c         |   2
 drivers/md/md.c             | 340
 drivers/md/multipath.c      |  15
 drivers/md/raid1.c          |  19
 drivers/md/raid10.c         |  20
 drivers/md/raid5.c          | 729
 include/linux/raid/bitmap.h |   1
 include/linux/raid/md.h     |   2
 include/linux/raid/md_k.h   |   7
 include/linux/raid/raid5.h  |  64
 12 files changed, 662 insertions(+), 594 deletions(-)
diff --git a/Documentation/md.txt b/Documentation/md.txt
index a8b430627473..e06cc59437e4 100644
--- a/Documentation/md.txt
+++ b/Documentation/md.txt
@@ -236,6 +236,11 @@ All md devices contain:
236 writing the word for the desired state, however some states 236 writing the word for the desired state, however some states
237 cannot be explicitly set, and some transitions are not allowed. 237 cannot be explicitly set, and some transitions are not allowed.
238 238
239 Select/poll works on this file. All changes except between
240 active_idle and active (which can be frequent and are not
241 very interesting) are notified. active->active_idle is
242 reported if the metadata is externally managed.
243
239 clear 244 clear
240 No devices, no size, no level 245 No devices, no size, no level
241 Writing is equivalent to STOP_ARRAY ioctl 246 Writing is equivalent to STOP_ARRAY ioctl
@@ -292,6 +297,10 @@ Each directory contains:
292 writemostly - device will only be subject to read 297 writemostly - device will only be subject to read
293 requests if there are no other options. 298 requests if there are no other options.
294 This applies only to raid1 arrays. 299 This applies only to raid1 arrays.
300 blocked - device has failed, metadata is "external",
301 and the failure hasn't been acknowledged yet.
302 Writes that would write to this device if
303 it were not faulty are blocked.
295 spare - device is working, but not a full member. 304 spare - device is working, but not a full member.
296 This includes spares that are in the process 305 This includes spares that are in the process
297 of being recovered to 306 of being recovered to
@@ -301,6 +310,12 @@ Each directory contains:
301 Writing "remove" removes the device from the array. 310 Writing "remove" removes the device from the array.
302 Writing "writemostly" sets the writemostly flag. 311 Writing "writemostly" sets the writemostly flag.
303 Writing "-writemostly" clears the writemostly flag. 312 Writing "-writemostly" clears the writemostly flag.
313 Writing "blocked" sets the "blocked" flag.
 314 Writing "-blocked" clears the "blocked" flag and allows writes
315 to complete.
316
317 This file responds to select/poll. Any change to 'faulty'
318 or 'blocked' causes an event.
304 319
305 errors 320 errors
306 An approximate count of read errors that have been detected on 321 An approximate count of read errors that have been detected on
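
As a hedged illustration of the interface documented in the hunk above (not part of the patch itself): a userspace sketch that acknowledges a failed component by clearing its "blocked" flag and then waits for further state changes via poll. The array and device directory names (md0, dev-sda1) are placeholder examples.

/* Illustrative sketch only; md0 and dev-sda1 are example names. */
#include <fcntl.h>
#include <poll.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char *path = "/sys/block/md0/md/dev-sda1/state";
	char buf[128];
	int fd = open(path, O_RDWR);
	if (fd < 0)
		return 1;

	ssize_t n = read(fd, buf, sizeof(buf) - 1);	/* current flags, e.g. "faulty,blocked" */
	if (n > 0) {
		buf[n] = '\0';
		if (strstr(buf, "blocked"))
			/* acknowledge the failure so pending writes can complete */
			write(fd, "-blocked", 8);
	}

	struct pollfd pfd = { .fd = fd, .events = POLLPRI | POLLERR };
	poll(&pfd, 1, -1);		/* any change to 'faulty' or 'blocked' wakes us */
	lseek(fd, 0, SEEK_SET);		/* re-read to see the new state */
	n = read(fd, buf, sizeof(buf) - 1);
	if (n > 0) {
		buf[n] = '\0';
		printf("new state: %s", buf);
	}
	close(fd);
	return 0;
}
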
@@ -381,6 +396,19 @@ also have
381 'check' and 'repair' will start the appropriate process 396 'check' and 'repair' will start the appropriate process
382 providing the current state is 'idle'. 397 providing the current state is 'idle'.
383 398
399 This file responds to select/poll. Any important change in the value
400 triggers a poll event. Sometimes the value will briefly be
401 "recover" if a recovery seems to be needed, but cannot be
402 achieved. In that case, the transition to "recover" isn't
403 notified, but the transition away is.
404
405 degraded
406 This contains a count of the number of devices by which the
 407 array is degraded. So an optimal array will show '0'. A
408 single failed/missing drive will show '1', etc.
 409 This file responds to select/poll; any increase or decrease
410 in the count of missing devices will trigger an event.
411
384 mismatch_count 412 mismatch_count
385 When performing 'check' and 'repair', and possibly when 413 When performing 'check' and 'repair', and possibly when
386 performing 'resync', md will count the number of errors that are 414 performing 'resync', md will count the number of errors that are
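
The sync_action and degraded entries described above are pollable in the same way. The following standalone sketch (file path assumed, not taken from the patch) waits for the current resync/recovery to finish by re-reading sync_action after each poll event.

/* Sketch: block until /sys/block/md0/md/sync_action reads "idle".
 * "md0" is an example name; error handling is minimal.
 */
#include <fcntl.h>
#include <poll.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	char action[64];
	int fd = open("/sys/block/md0/md/sync_action", O_RDONLY);
	if (fd < 0)
		return 1;

	for (;;) {
		ssize_t n = pread(fd, action, sizeof(action) - 1, 0);
		if (n <= 0)
			break;
		action[n] = '\0';
		if (strncmp(action, "idle", 4) == 0)
			break;			/* nothing running any more */

		struct pollfd pfd = { .fd = fd, .events = POLLPRI | POLLERR };
		poll(&pfd, 1, -1);		/* sleeps until sync_action changes */
	}
	close(fd);
	return 0;
}
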
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index b26927ce889c..dedba16d42f7 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -454,8 +454,11 @@ void bitmap_update_sb(struct bitmap *bitmap)
454 spin_unlock_irqrestore(&bitmap->lock, flags); 454 spin_unlock_irqrestore(&bitmap->lock, flags);
455 sb = (bitmap_super_t *)kmap_atomic(bitmap->sb_page, KM_USER0); 455 sb = (bitmap_super_t *)kmap_atomic(bitmap->sb_page, KM_USER0);
456 sb->events = cpu_to_le64(bitmap->mddev->events); 456 sb->events = cpu_to_le64(bitmap->mddev->events);
457 if (!bitmap->mddev->degraded) 457 if (bitmap->mddev->events < bitmap->events_cleared) {
458 sb->events_cleared = cpu_to_le64(bitmap->mddev->events); 458 /* rocking back to read-only */
459 bitmap->events_cleared = bitmap->mddev->events;
460 sb->events_cleared = cpu_to_le64(bitmap->events_cleared);
461 }
459 kunmap_atomic(sb, KM_USER0); 462 kunmap_atomic(sb, KM_USER0);
460 write_page(bitmap, bitmap->sb_page, 1); 463 write_page(bitmap, bitmap->sb_page, 1);
461} 464}
@@ -1085,9 +1088,19 @@ void bitmap_daemon_work(struct bitmap *bitmap)
1085 } else 1088 } else
1086 spin_unlock_irqrestore(&bitmap->lock, flags); 1089 spin_unlock_irqrestore(&bitmap->lock, flags);
1087 lastpage = page; 1090 lastpage = page;
1088/* 1091
1089 printk("bitmap clean at page %lu\n", j); 1092 /* We are possibly going to clear some bits, so make
1090*/ 1093 * sure that events_cleared is up-to-date.
1094 */
1095 if (bitmap->need_sync) {
1096 bitmap_super_t *sb;
1097 bitmap->need_sync = 0;
1098 sb = kmap_atomic(bitmap->sb_page, KM_USER0);
1099 sb->events_cleared =
1100 cpu_to_le64(bitmap->events_cleared);
1101 kunmap_atomic(sb, KM_USER0);
1102 write_page(bitmap, bitmap->sb_page, 1);
1103 }
1091 spin_lock_irqsave(&bitmap->lock, flags); 1104 spin_lock_irqsave(&bitmap->lock, flags);
1092 clear_page_attr(bitmap, page, BITMAP_PAGE_CLEAN); 1105 clear_page_attr(bitmap, page, BITMAP_PAGE_CLEAN);
1093 } 1106 }
@@ -1257,6 +1270,12 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto
1257 return; 1270 return;
1258 } 1271 }
1259 1272
1273 if (success &&
1274 bitmap->events_cleared < bitmap->mddev->events) {
1275 bitmap->events_cleared = bitmap->mddev->events;
1276 bitmap->need_sync = 1;
1277 }
1278
1260 if (!success && ! (*bmc & NEEDED_MASK)) 1279 if (!success && ! (*bmc & NEEDED_MASK))
1261 *bmc |= NEEDED_MASK; 1280 *bmc |= NEEDED_MASK;
1262 1281
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 10748240cb2f..ec921f58fbb8 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -126,7 +126,7 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
126 int j = rdev->raid_disk; 126 int j = rdev->raid_disk;
127 dev_info_t *disk = conf->disks + j; 127 dev_info_t *disk = conf->disks + j;
128 128
129 if (j < 0 || j > raid_disks || disk->rdev) { 129 if (j < 0 || j >= raid_disks || disk->rdev) {
130 printk("linear: disk numbering problem. Aborting!\n"); 130 printk("linear: disk numbering problem. Aborting!\n");
131 goto out; 131 goto out;
132 } 132 }
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 2580ac1b9b0f..73399d17ac3e 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -169,7 +169,6 @@ void md_new_event(mddev_t *mddev)
169{ 169{
170 atomic_inc(&md_event_count); 170 atomic_inc(&md_event_count);
171 wake_up(&md_event_waiters); 171 wake_up(&md_event_waiters);
172 sysfs_notify(&mddev->kobj, NULL, "sync_action");
173} 172}
174EXPORT_SYMBOL_GPL(md_new_event); 173EXPORT_SYMBOL_GPL(md_new_event);
175 174
@@ -278,6 +277,7 @@ static mddev_t * mddev_find(dev_t unit)
278 init_waitqueue_head(&new->sb_wait); 277 init_waitqueue_head(&new->sb_wait);
279 init_waitqueue_head(&new->recovery_wait); 278 init_waitqueue_head(&new->recovery_wait);
280 new->reshape_position = MaxSector; 279 new->reshape_position = MaxSector;
280 new->resync_min = 0;
281 new->resync_max = MaxSector; 281 new->resync_max = MaxSector;
282 new->level = LEVEL_NONE; 282 new->level = LEVEL_NONE;
283 283
@@ -564,7 +564,7 @@ static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
564 564
565 if (!tmp1 || !tmp2) { 565 if (!tmp1 || !tmp2) {
566 ret = 0; 566 ret = 0;
567 printk(KERN_INFO "md.c: sb1 is not equal to sb2!\n"); 567 printk(KERN_INFO "md.c sb_equal(): failed to allocate memory!\n");
568 goto abort; 568 goto abort;
569 } 569 }
570 570
@@ -658,11 +658,14 @@ static unsigned int calc_sb_csum(mdp_super_t * sb)
658 */ 658 */
659 659
660struct super_type { 660struct super_type {
661 char *name; 661 char *name;
662 struct module *owner; 662 struct module *owner;
663 int (*load_super)(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version); 663 int (*load_super)(mdk_rdev_t *rdev, mdk_rdev_t *refdev,
664 int (*validate_super)(mddev_t *mddev, mdk_rdev_t *rdev); 664 int minor_version);
665 void (*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev); 665 int (*validate_super)(mddev_t *mddev, mdk_rdev_t *rdev);
666 void (*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev);
667 unsigned long long (*rdev_size_change)(mdk_rdev_t *rdev,
668 unsigned long long size);
666}; 669};
667 670
668/* 671/*
@@ -1004,6 +1007,27 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
1004} 1007}
1005 1008
1006/* 1009/*
1010 * rdev_size_change for 0.90.0
1011 */
1012static unsigned long long
1013super_90_rdev_size_change(mdk_rdev_t *rdev, unsigned long long size)
1014{
1015 if (size && size < rdev->mddev->size)
1016 return 0; /* component must fit device */
1017 size *= 2; /* convert to sectors */
1018 if (rdev->mddev->bitmap_offset)
1019 return 0; /* can't move bitmap */
1020 rdev->sb_offset = calc_dev_sboffset(rdev->bdev);
1021 if (!size || size > rdev->sb_offset*2)
1022 size = rdev->sb_offset*2;
1023 md_super_write(rdev->mddev, rdev, rdev->sb_offset << 1, rdev->sb_size,
1024 rdev->sb_page);
1025 md_super_wait(rdev->mddev);
1026 return size/2; /* kB for sysfs */
1027}
1028
1029
1030/*
1007 * version 1 superblock 1031 * version 1 superblock
1008 */ 1032 */
1009 1033
@@ -1328,21 +1352,59 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
1328 sb->sb_csum = calc_sb_1_csum(sb); 1352 sb->sb_csum = calc_sb_1_csum(sb);
1329} 1353}
1330 1354
1355static unsigned long long
1356super_1_rdev_size_change(mdk_rdev_t *rdev, unsigned long long size)
1357{
1358 struct mdp_superblock_1 *sb;
1359 unsigned long long max_size;
1360 if (size && size < rdev->mddev->size)
1361 return 0; /* component must fit device */
1362 size *= 2; /* convert to sectors */
1363 if (rdev->sb_offset < rdev->data_offset/2) {
1364 /* minor versions 1 and 2; superblock before data */
1365 max_size = (rdev->bdev->bd_inode->i_size >> 9);
1366 max_size -= rdev->data_offset;
1367 if (!size || size > max_size)
1368 size = max_size;
1369 } else if (rdev->mddev->bitmap_offset) {
1370 /* minor version 0 with bitmap we can't move */
1371 return 0;
1372 } else {
1373 /* minor version 0; superblock after data */
1374 sector_t sb_offset;
1375 sb_offset = (rdev->bdev->bd_inode->i_size >> 9) - 8*2;
1376 sb_offset &= ~(sector_t)(4*2 - 1);
1377 max_size = rdev->size*2 + sb_offset - rdev->sb_offset*2;
1378 if (!size || size > max_size)
1379 size = max_size;
1380 rdev->sb_offset = sb_offset/2;
1381 }
1382 sb = (struct mdp_superblock_1 *) page_address(rdev->sb_page);
1383 sb->data_size = cpu_to_le64(size);
1384 sb->super_offset = rdev->sb_offset*2;
1385 sb->sb_csum = calc_sb_1_csum(sb);
1386 md_super_write(rdev->mddev, rdev, rdev->sb_offset << 1, rdev->sb_size,
1387 rdev->sb_page);
1388 md_super_wait(rdev->mddev);
1389 return size/2; /* kB for sysfs */
1390}
1331 1391
1332static struct super_type super_types[] = { 1392static struct super_type super_types[] = {
1333 [0] = { 1393 [0] = {
1334 .name = "0.90.0", 1394 .name = "0.90.0",
1335 .owner = THIS_MODULE, 1395 .owner = THIS_MODULE,
1336 .load_super = super_90_load, 1396 .load_super = super_90_load,
1337 .validate_super = super_90_validate, 1397 .validate_super = super_90_validate,
1338 .sync_super = super_90_sync, 1398 .sync_super = super_90_sync,
1399 .rdev_size_change = super_90_rdev_size_change,
1339 }, 1400 },
1340 [1] = { 1401 [1] = {
1341 .name = "md-1", 1402 .name = "md-1",
1342 .owner = THIS_MODULE, 1403 .owner = THIS_MODULE,
1343 .load_super = super_1_load, 1404 .load_super = super_1_load,
1344 .validate_super = super_1_validate, 1405 .validate_super = super_1_validate,
1345 .sync_super = super_1_sync, 1406 .sync_super = super_1_sync,
1407 .rdev_size_change = super_1_rdev_size_change,
1346 }, 1408 },
1347}; 1409};
1348 1410
@@ -1787,7 +1849,7 @@ repeat:
1787 1849
1788} 1850}
1789 1851
1790/* words written to sysfs files may, or my not, be \n terminated. 1852/* words written to sysfs files may, or may not, be \n terminated.
1791 * We want to accept with case. For this we use cmd_match. 1853 * We want to accept with case. For this we use cmd_match.
1792 */ 1854 */
1793static int cmd_match(const char *cmd, const char *str) 1855static int cmd_match(const char *cmd, const char *str)
@@ -1886,6 +1948,8 @@ state_store(mdk_rdev_t *rdev, const char *buf, size_t len)
1886 1948
1887 err = 0; 1949 err = 0;
1888 } 1950 }
1951 if (!err)
1952 sysfs_notify(&rdev->kobj, NULL, "state");
1889 return err ? err : len; 1953 return err ? err : len;
1890} 1954}
1891static struct rdev_sysfs_entry rdev_state = 1955static struct rdev_sysfs_entry rdev_state =
@@ -1931,7 +1995,7 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
1931 slot = -1; 1995 slot = -1;
1932 else if (e==buf || (*e && *e!= '\n')) 1996 else if (e==buf || (*e && *e!= '\n'))
1933 return -EINVAL; 1997 return -EINVAL;
1934 if (rdev->mddev->pers) { 1998 if (rdev->mddev->pers && slot == -1) {
1935 /* Setting 'slot' on an active array requires also 1999 /* Setting 'slot' on an active array requires also
1936 * updating the 'rd%d' link, and communicating 2000 * updating the 'rd%d' link, and communicating
1937 * with the personality with ->hot_*_disk. 2001 * with the personality with ->hot_*_disk.
@@ -1939,8 +2003,6 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
1939 * failed/spare devices. This normally happens automatically, 2003 * failed/spare devices. This normally happens automatically,
1940 * but not when the metadata is externally managed. 2004 * but not when the metadata is externally managed.
1941 */ 2005 */
1942 if (slot != -1)
1943 return -EBUSY;
1944 if (rdev->raid_disk == -1) 2006 if (rdev->raid_disk == -1)
1945 return -EEXIST; 2007 return -EEXIST;
1946 /* personality does all needed checks */ 2008 /* personality does all needed checks */
@@ -1954,6 +2016,43 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
1954 sysfs_remove_link(&rdev->mddev->kobj, nm); 2016 sysfs_remove_link(&rdev->mddev->kobj, nm);
1955 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); 2017 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
1956 md_wakeup_thread(rdev->mddev->thread); 2018 md_wakeup_thread(rdev->mddev->thread);
2019 } else if (rdev->mddev->pers) {
2020 mdk_rdev_t *rdev2;
2021 struct list_head *tmp;
2022 /* Activating a spare .. or possibly reactivating
 2023 * if we ever get bitmaps working here.
2024 */
2025
2026 if (rdev->raid_disk != -1)
2027 return -EBUSY;
2028
2029 if (rdev->mddev->pers->hot_add_disk == NULL)
2030 return -EINVAL;
2031
2032 rdev_for_each(rdev2, tmp, rdev->mddev)
2033 if (rdev2->raid_disk == slot)
2034 return -EEXIST;
2035
2036 rdev->raid_disk = slot;
2037 if (test_bit(In_sync, &rdev->flags))
2038 rdev->saved_raid_disk = slot;
2039 else
2040 rdev->saved_raid_disk = -1;
2041 err = rdev->mddev->pers->
2042 hot_add_disk(rdev->mddev, rdev);
2043 if (err) {
2044 rdev->raid_disk = -1;
2045 return err;
2046 } else
2047 sysfs_notify(&rdev->kobj, NULL, "state");
2048 sprintf(nm, "rd%d", rdev->raid_disk);
2049 if (sysfs_create_link(&rdev->mddev->kobj, &rdev->kobj, nm))
2050 printk(KERN_WARNING
2051 "md: cannot register "
2052 "%s for %s\n",
2053 nm, mdname(rdev->mddev));
2054
2055 /* don't wakeup anyone, leave that to userspace. */
1957 } else { 2056 } else {
1958 if (slot >= rdev->mddev->raid_disks) 2057 if (slot >= rdev->mddev->raid_disks)
1959 return -ENOSPC; 2058 return -ENOSPC;
@@ -1962,6 +2061,7 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
1962 clear_bit(Faulty, &rdev->flags); 2061 clear_bit(Faulty, &rdev->flags);
1963 clear_bit(WriteMostly, &rdev->flags); 2062 clear_bit(WriteMostly, &rdev->flags);
1964 set_bit(In_sync, &rdev->flags); 2063 set_bit(In_sync, &rdev->flags);
2064 sysfs_notify(&rdev->kobj, NULL, "state");
1965 } 2065 }
1966 return len; 2066 return len;
1967} 2067}
@@ -1983,7 +2083,7 @@ offset_store(mdk_rdev_t *rdev, const char *buf, size_t len)
1983 unsigned long long offset = simple_strtoull(buf, &e, 10); 2083 unsigned long long offset = simple_strtoull(buf, &e, 10);
1984 if (e==buf || (*e && *e != '\n')) 2084 if (e==buf || (*e && *e != '\n'))
1985 return -EINVAL; 2085 return -EINVAL;
1986 if (rdev->mddev->pers) 2086 if (rdev->mddev->pers && rdev->raid_disk >= 0)
1987 return -EBUSY; 2087 return -EBUSY;
1988 if (rdev->size && rdev->mddev->external) 2088 if (rdev->size && rdev->mddev->external)
1989 /* Must set offset before size, so overlap checks 2089 /* Must set offset before size, so overlap checks
@@ -2022,8 +2122,20 @@ rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2022 2122
2023 if (e==buf || (*e && *e != '\n')) 2123 if (e==buf || (*e && *e != '\n'))
2024 return -EINVAL; 2124 return -EINVAL;
2025 if (my_mddev->pers) 2125 if (my_mddev->pers && rdev->raid_disk >= 0) {
2026 return -EBUSY; 2126 if (rdev->mddev->persistent) {
2127 size = super_types[rdev->mddev->major_version].
2128 rdev_size_change(rdev, size);
2129 if (!size)
2130 return -EBUSY;
2131 } else if (!size) {
2132 size = (rdev->bdev->bd_inode->i_size >> 10);
2133 size -= rdev->data_offset/2;
2134 }
2135 if (size < rdev->mddev->size)
2136 return -EINVAL; /* component must fit device */
2137 }
2138
2027 rdev->size = size; 2139 rdev->size = size;
2028 if (size > oldsize && rdev->mddev->external) { 2140 if (size > oldsize && rdev->mddev->external) {
2029 /* need to check that all other rdevs with the same ->bdev 2141 /* need to check that all other rdevs with the same ->bdev
@@ -2512,7 +2624,7 @@ __ATTR(resync_start, S_IRUGO|S_IWUSR, resync_start_show, resync_start_store);
2512 * When written, doesn't tear down array, but just stops it 2624 * When written, doesn't tear down array, but just stops it
2513 * suspended (not supported yet) 2625 * suspended (not supported yet)
2514 * All IO requests will block. The array can be reconfigured. 2626 * All IO requests will block. The array can be reconfigured.
2515 * Writing this, if accepted, will block until array is quiessent 2627 * Writing this, if accepted, will block until array is quiescent
2516 * readonly 2628 * readonly
2517 * no resync can happen. no superblocks get written. 2629 * no resync can happen. no superblocks get written.
2518 * write requests fail 2630 * write requests fail
@@ -2681,8 +2793,10 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len)
2681 } 2793 }
2682 if (err) 2794 if (err)
2683 return err; 2795 return err;
2684 else 2796 else {
2797 sysfs_notify(&mddev->kobj, NULL, "array_state");
2685 return len; 2798 return len;
2799 }
2686} 2800}
2687static struct md_sysfs_entry md_array_state = 2801static struct md_sysfs_entry md_array_state =
2688__ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store); 2802__ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store);
@@ -2899,7 +3013,7 @@ action_show(mddev_t *mddev, char *page)
2899 type = "check"; 3013 type = "check";
2900 else 3014 else
2901 type = "repair"; 3015 type = "repair";
2902 } else 3016 } else if (test_bit(MD_RECOVERY_RECOVER, &mddev->recovery))
2903 type = "recover"; 3017 type = "recover";
2904 } 3018 }
2905 return sprintf(page, "%s\n", type); 3019 return sprintf(page, "%s\n", type);
@@ -2921,15 +3035,19 @@ action_store(mddev_t *mddev, const char *page, size_t len)
2921 } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 3035 } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
2922 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) 3036 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
2923 return -EBUSY; 3037 return -EBUSY;
2924 else if (cmd_match(page, "resync") || cmd_match(page, "recover")) 3038 else if (cmd_match(page, "resync"))
3039 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3040 else if (cmd_match(page, "recover")) {
3041 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
2925 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3042 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
2926 else if (cmd_match(page, "reshape")) { 3043 } else if (cmd_match(page, "reshape")) {
2927 int err; 3044 int err;
2928 if (mddev->pers->start_reshape == NULL) 3045 if (mddev->pers->start_reshape == NULL)
2929 return -EINVAL; 3046 return -EINVAL;
2930 err = mddev->pers->start_reshape(mddev); 3047 err = mddev->pers->start_reshape(mddev);
2931 if (err) 3048 if (err)
2932 return err; 3049 return err;
3050 sysfs_notify(&mddev->kobj, NULL, "degraded");
2933 } else { 3051 } else {
2934 if (cmd_match(page, "check")) 3052 if (cmd_match(page, "check"))
2935 set_bit(MD_RECOVERY_CHECK, &mddev->recovery); 3053 set_bit(MD_RECOVERY_CHECK, &mddev->recovery);
@@ -2940,6 +3058,7 @@ action_store(mddev_t *mddev, const char *page, size_t len)
2940 } 3058 }
2941 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3059 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
2942 md_wakeup_thread(mddev->thread); 3060 md_wakeup_thread(mddev->thread);
3061 sysfs_notify(&mddev->kobj, NULL, "sync_action");
2943 return len; 3062 return len;
2944} 3063}
2945 3064
@@ -3049,11 +3168,11 @@ static ssize_t
3049sync_speed_show(mddev_t *mddev, char *page) 3168sync_speed_show(mddev_t *mddev, char *page)
3050{ 3169{
3051 unsigned long resync, dt, db; 3170 unsigned long resync, dt, db;
3052 resync = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active)); 3171 resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active);
3053 dt = ((jiffies - mddev->resync_mark) / HZ); 3172 dt = (jiffies - mddev->resync_mark) / HZ;
3054 if (!dt) dt++; 3173 if (!dt) dt++;
3055 db = resync - (mddev->resync_mark_cnt); 3174 db = resync - mddev->resync_mark_cnt;
3056 return sprintf(page, "%ld\n", db/dt/2); /* K/sec */ 3175 return sprintf(page, "%lu\n", db/dt/2); /* K/sec */
3057} 3176}
3058 3177
3059static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed); 3178static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed);
@@ -3075,6 +3194,36 @@ sync_completed_show(mddev_t *mddev, char *page)
3075static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed); 3194static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed);
3076 3195
3077static ssize_t 3196static ssize_t
3197min_sync_show(mddev_t *mddev, char *page)
3198{
3199 return sprintf(page, "%llu\n",
3200 (unsigned long long)mddev->resync_min);
3201}
3202static ssize_t
3203min_sync_store(mddev_t *mddev, const char *buf, size_t len)
3204{
3205 unsigned long long min;
3206 if (strict_strtoull(buf, 10, &min))
3207 return -EINVAL;
3208 if (min > mddev->resync_max)
3209 return -EINVAL;
3210 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
3211 return -EBUSY;
3212
3213 /* Must be a multiple of chunk_size */
3214 if (mddev->chunk_size) {
3215 if (min & (sector_t)((mddev->chunk_size>>9)-1))
3216 return -EINVAL;
3217 }
3218 mddev->resync_min = min;
3219
3220 return len;
3221}
3222
3223static struct md_sysfs_entry md_min_sync =
3224__ATTR(sync_min, S_IRUGO|S_IWUSR, min_sync_show, min_sync_store);
3225
3226static ssize_t
3078max_sync_show(mddev_t *mddev, char *page) 3227max_sync_show(mddev_t *mddev, char *page)
3079{ 3228{
3080 if (mddev->resync_max == MaxSector) 3229 if (mddev->resync_max == MaxSector)
@@ -3089,9 +3238,10 @@ max_sync_store(mddev_t *mddev, const char *buf, size_t len)
3089 if (strncmp(buf, "max", 3) == 0) 3238 if (strncmp(buf, "max", 3) == 0)
3090 mddev->resync_max = MaxSector; 3239 mddev->resync_max = MaxSector;
3091 else { 3240 else {
3092 char *ep; 3241 unsigned long long max;
3093 unsigned long long max = simple_strtoull(buf, &ep, 10); 3242 if (strict_strtoull(buf, 10, &max))
3094 if (ep == buf || (*ep != 0 && *ep != '\n')) 3243 return -EINVAL;
3244 if (max < mddev->resync_min)
3095 return -EINVAL; 3245 return -EINVAL;
3096 if (max < mddev->resync_max && 3246 if (max < mddev->resync_max &&
3097 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 3247 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
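
A minimal userspace sketch of how the new sync_min knob (together with the existing sync_max) might be used; the md0 path and the sector values are assumptions for illustration only, and sync_min must be a multiple of the chunk size in sectors, as the hunk above enforces.

/* Sketch: restrict a "check" pass to sectors [8192, 16384).
 * Write sync_min/sync_max while the array is idle, then start the check.
 */
#include <stdio.h>

static int write_attr(const char *attr, const char *val)
{
	char path[128];
	FILE *f;

	snprintf(path, sizeof(path), "/sys/block/md0/md/%s", attr);
	f = fopen(path, "w");
	if (!f)
		return -1;
	fprintf(f, "%s\n", val);
	return fclose(f);	/* store errors surface when the buffer is flushed */
}

int main(void)
{
	if (write_attr("sync_min", "8192"))	/* must be chunk-aligned */
		return 1;
	if (write_attr("sync_max", "16384"))
		return 1;
	return write_attr("sync_action", "check");
}
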
@@ -3222,6 +3372,7 @@ static struct attribute *md_redundancy_attrs[] = {
3222 &md_sync_speed.attr, 3372 &md_sync_speed.attr,
3223 &md_sync_force_parallel.attr, 3373 &md_sync_force_parallel.attr,
3224 &md_sync_completed.attr, 3374 &md_sync_completed.attr,
3375 &md_min_sync.attr,
3225 &md_max_sync.attr, 3376 &md_max_sync.attr,
3226 &md_suspend_lo.attr, 3377 &md_suspend_lo.attr,
3227 &md_suspend_hi.attr, 3378 &md_suspend_hi.attr,
@@ -3326,9 +3477,9 @@ static struct kobject *md_probe(dev_t dev, int *part, void *data)
3326 disk->queue = mddev->queue; 3477 disk->queue = mddev->queue;
3327 add_disk(disk); 3478 add_disk(disk);
3328 mddev->gendisk = disk; 3479 mddev->gendisk = disk;
3329 mutex_unlock(&disks_mutex);
3330 error = kobject_init_and_add(&mddev->kobj, &md_ktype, &disk->dev.kobj, 3480 error = kobject_init_and_add(&mddev->kobj, &md_ktype, &disk->dev.kobj,
3331 "%s", "md"); 3481 "%s", "md");
3482 mutex_unlock(&disks_mutex);
3332 if (error) 3483 if (error)
3333 printk(KERN_WARNING "md: cannot register %s/md - name in use\n", 3484 printk(KERN_WARNING "md: cannot register %s/md - name in use\n",
3334 disk->disk_name); 3485 disk->disk_name);
@@ -3341,7 +3492,11 @@ static void md_safemode_timeout(unsigned long data)
3341{ 3492{
3342 mddev_t *mddev = (mddev_t *) data; 3493 mddev_t *mddev = (mddev_t *) data;
3343 3494
3344 mddev->safemode = 1; 3495 if (!atomic_read(&mddev->writes_pending)) {
3496 mddev->safemode = 1;
3497 if (mddev->external)
3498 sysfs_notify(&mddev->kobj, NULL, "array_state");
3499 }
3345 md_wakeup_thread(mddev->thread); 3500 md_wakeup_thread(mddev->thread);
3346} 3501}
3347 3502
@@ -3448,6 +3603,7 @@ static int do_md_run(mddev_t * mddev)
3448 return -EINVAL; 3603 return -EINVAL;
3449 } 3604 }
3450 } 3605 }
3606 sysfs_notify(&rdev->kobj, NULL, "state");
3451 } 3607 }
3452 3608
3453 md_probe(mddev->unit, NULL, NULL); 3609 md_probe(mddev->unit, NULL, NULL);
@@ -3519,7 +3675,9 @@ static int do_md_run(mddev_t * mddev)
3519 mddev->ro = 2; /* read-only, but switch on first write */ 3675 mddev->ro = 2; /* read-only, but switch on first write */
3520 3676
3521 err = mddev->pers->run(mddev); 3677 err = mddev->pers->run(mddev);
3522 if (!err && mddev->pers->sync_request) { 3678 if (err)
3679 printk(KERN_ERR "md: pers->run() failed ...\n");
3680 else if (mddev->pers->sync_request) {
3523 err = bitmap_create(mddev); 3681 err = bitmap_create(mddev);
3524 if (err) { 3682 if (err) {
3525 printk(KERN_ERR "%s: failed to create bitmap (%d)\n", 3683 printk(KERN_ERR "%s: failed to create bitmap (%d)\n",
@@ -3528,7 +3686,6 @@ static int do_md_run(mddev_t * mddev)
3528 } 3686 }
3529 } 3687 }
3530 if (err) { 3688 if (err) {
3531 printk(KERN_ERR "md: pers->run() failed ...\n");
3532 module_put(mddev->pers->owner); 3689 module_put(mddev->pers->owner);
3533 mddev->pers = NULL; 3690 mddev->pers = NULL;
3534 bitmap_destroy(mddev); 3691 bitmap_destroy(mddev);
@@ -3608,6 +3765,9 @@ static int do_md_run(mddev_t * mddev)
3608 3765
3609 mddev->changed = 1; 3766 mddev->changed = 1;
3610 md_new_event(mddev); 3767 md_new_event(mddev);
3768 sysfs_notify(&mddev->kobj, NULL, "array_state");
3769 sysfs_notify(&mddev->kobj, NULL, "sync_action");
3770 sysfs_notify(&mddev->kobj, NULL, "degraded");
3611 kobject_uevent(&mddev->gendisk->dev.kobj, KOBJ_CHANGE); 3771 kobject_uevent(&mddev->gendisk->dev.kobj, KOBJ_CHANGE);
3612 return 0; 3772 return 0;
3613} 3773}
@@ -3642,6 +3802,8 @@ static int restart_array(mddev_t *mddev)
3642 md_wakeup_thread(mddev->thread); 3802 md_wakeup_thread(mddev->thread);
3643 md_wakeup_thread(mddev->sync_thread); 3803 md_wakeup_thread(mddev->sync_thread);
3644 err = 0; 3804 err = 0;
3805 sysfs_notify(&mddev->kobj, NULL, "array_state");
3806
3645 } else 3807 } else
3646 err = -EINVAL; 3808 err = -EINVAL;
3647 3809
@@ -3777,6 +3939,7 @@ static int do_md_stop(mddev_t * mddev, int mode)
3777 mddev->size = 0; 3939 mddev->size = 0;
3778 mddev->raid_disks = 0; 3940 mddev->raid_disks = 0;
3779 mddev->recovery_cp = 0; 3941 mddev->recovery_cp = 0;
3942 mddev->resync_min = 0;
3780 mddev->resync_max = MaxSector; 3943 mddev->resync_max = MaxSector;
3781 mddev->reshape_position = MaxSector; 3944 mddev->reshape_position = MaxSector;
3782 mddev->external = 0; 3945 mddev->external = 0;
@@ -3811,6 +3974,7 @@ static int do_md_stop(mddev_t * mddev, int mode)
3811 mdname(mddev)); 3974 mdname(mddev));
3812 err = 0; 3975 err = 0;
3813 md_new_event(mddev); 3976 md_new_event(mddev);
3977 sysfs_notify(&mddev->kobj, NULL, "array_state");
3814out: 3978out:
3815 return err; 3979 return err;
3816} 3980}
@@ -4009,9 +4173,11 @@ static int get_bitmap_file(mddev_t * mddev, void __user * arg)
4009 char *ptr, *buf = NULL; 4173 char *ptr, *buf = NULL;
4010 int err = -ENOMEM; 4174 int err = -ENOMEM;
4011 4175
4012 md_allow_write(mddev); 4176 if (md_allow_write(mddev))
4177 file = kmalloc(sizeof(*file), GFP_NOIO);
4178 else
4179 file = kmalloc(sizeof(*file), GFP_KERNEL);
4013 4180
4014 file = kmalloc(sizeof(*file), GFP_KERNEL);
4015 if (!file) 4181 if (!file)
4016 goto out; 4182 goto out;
4017 4183
@@ -4172,8 +4338,12 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
4172 } 4338 }
4173 if (err) 4339 if (err)
4174 export_rdev(rdev); 4340 export_rdev(rdev);
4341 else
4342 sysfs_notify(&rdev->kobj, NULL, "state");
4175 4343
4176 md_update_sb(mddev, 1); 4344 md_update_sb(mddev, 1);
4345 if (mddev->degraded)
4346 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
4177 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4347 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4178 md_wakeup_thread(mddev->thread); 4348 md_wakeup_thread(mddev->thread);
4179 return err; 4349 return err;
@@ -4232,9 +4402,6 @@ static int hot_remove_disk(mddev_t * mddev, dev_t dev)
4232 char b[BDEVNAME_SIZE]; 4402 char b[BDEVNAME_SIZE];
4233 mdk_rdev_t *rdev; 4403 mdk_rdev_t *rdev;
4234 4404
4235 if (!mddev->pers)
4236 return -ENODEV;
4237
4238 rdev = find_rdev(mddev, dev); 4405 rdev = find_rdev(mddev, dev);
4239 if (!rdev) 4406 if (!rdev)
4240 return -ENXIO; 4407 return -ENXIO;
@@ -4641,6 +4808,12 @@ static int set_disk_faulty(mddev_t *mddev, dev_t dev)
4641 return 0; 4808 return 0;
4642} 4809}
4643 4810
4811/*
4812 * We have a problem here : there is no easy way to give a CHS
4813 * virtual geometry. We currently pretend that we have a 2 heads
4814 * 4 sectors (with a BIG number of cylinders...). This drives
4815 * dosfs just mad... ;-)
4816 */
4644static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo) 4817static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo)
4645{ 4818{
4646 mddev_t *mddev = bdev->bd_disk->private_data; 4819 mddev_t *mddev = bdev->bd_disk->private_data;
@@ -4792,12 +4965,6 @@ static int md_ioctl(struct inode *inode, struct file *file,
4792 err = do_md_stop (mddev, 1); 4965 err = do_md_stop (mddev, 1);
4793 goto done_unlock; 4966 goto done_unlock;
4794 4967
4795 /*
4796 * We have a problem here : there is no easy way to give a CHS
4797 * virtual geometry. We currently pretend that we have a 2 heads
4798 * 4 sectors (with a BIG number of cylinders...). This drives
4799 * dosfs just mad... ;-)
4800 */
4801 } 4968 }
4802 4969
4803 /* 4970 /*
@@ -4807,13 +4974,12 @@ static int md_ioctl(struct inode *inode, struct file *file,
4807 * here and hit the 'default' below, so only disallow 4974 * here and hit the 'default' below, so only disallow
4808 * 'md' ioctls, and switch to rw mode if started auto-readonly. 4975 * 'md' ioctls, and switch to rw mode if started auto-readonly.
4809 */ 4976 */
4810 if (_IOC_TYPE(cmd) == MD_MAJOR && 4977 if (_IOC_TYPE(cmd) == MD_MAJOR && mddev->ro && mddev->pers) {
4811 mddev->ro && mddev->pers) {
4812 if (mddev->ro == 2) { 4978 if (mddev->ro == 2) {
4813 mddev->ro = 0; 4979 mddev->ro = 0;
4814 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4980 sysfs_notify(&mddev->kobj, NULL, "array_state");
4815 md_wakeup_thread(mddev->thread); 4981 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4816 4982 md_wakeup_thread(mddev->thread);
4817 } else { 4983 } else {
4818 err = -EROFS; 4984 err = -EROFS;
4819 goto abort_unlock; 4985 goto abort_unlock;
@@ -5029,6 +5195,9 @@ void md_error(mddev_t *mddev, mdk_rdev_t *rdev)
5029 if (!mddev->pers->error_handler) 5195 if (!mddev->pers->error_handler)
5030 return; 5196 return;
5031 mddev->pers->error_handler(mddev,rdev); 5197 mddev->pers->error_handler(mddev,rdev);
5198 if (mddev->degraded)
5199 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
5200 set_bit(StateChanged, &rdev->flags);
5032 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 5201 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
5033 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5202 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5034 md_wakeup_thread(mddev->thread); 5203 md_wakeup_thread(mddev->thread);
@@ -5451,6 +5620,7 @@ void md_done_sync(mddev_t *mddev, int blocks, int ok)
5451 */ 5620 */
5452void md_write_start(mddev_t *mddev, struct bio *bi) 5621void md_write_start(mddev_t *mddev, struct bio *bi)
5453{ 5622{
5623 int did_change = 0;
5454 if (bio_data_dir(bi) != WRITE) 5624 if (bio_data_dir(bi) != WRITE)
5455 return; 5625 return;
5456 5626
@@ -5461,6 +5631,7 @@ void md_write_start(mddev_t *mddev, struct bio *bi)
5461 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5631 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5462 md_wakeup_thread(mddev->thread); 5632 md_wakeup_thread(mddev->thread);
5463 md_wakeup_thread(mddev->sync_thread); 5633 md_wakeup_thread(mddev->sync_thread);
5634 did_change = 1;
5464 } 5635 }
5465 atomic_inc(&mddev->writes_pending); 5636 atomic_inc(&mddev->writes_pending);
5466 if (mddev->safemode == 1) 5637 if (mddev->safemode == 1)
@@ -5471,10 +5642,12 @@ void md_write_start(mddev_t *mddev, struct bio *bi)
5471 mddev->in_sync = 0; 5642 mddev->in_sync = 0;
5472 set_bit(MD_CHANGE_CLEAN, &mddev->flags); 5643 set_bit(MD_CHANGE_CLEAN, &mddev->flags);
5473 md_wakeup_thread(mddev->thread); 5644 md_wakeup_thread(mddev->thread);
5645 did_change = 1;
5474 } 5646 }
5475 spin_unlock_irq(&mddev->write_lock); 5647 spin_unlock_irq(&mddev->write_lock);
5476 sysfs_notify(&mddev->kobj, NULL, "array_state");
5477 } 5648 }
5649 if (did_change)
5650 sysfs_notify(&mddev->kobj, NULL, "array_state");
5478 wait_event(mddev->sb_wait, 5651 wait_event(mddev->sb_wait,
5479 !test_bit(MD_CHANGE_CLEAN, &mddev->flags) && 5652 !test_bit(MD_CHANGE_CLEAN, &mddev->flags) &&
5480 !test_bit(MD_CHANGE_PENDING, &mddev->flags)); 5653 !test_bit(MD_CHANGE_PENDING, &mddev->flags));
@@ -5495,13 +5668,18 @@ void md_write_end(mddev_t *mddev)
5495 * may proceed without blocking. It is important to call this before 5668 * may proceed without blocking. It is important to call this before
5496 * attempting a GFP_KERNEL allocation while holding the mddev lock. 5669 * attempting a GFP_KERNEL allocation while holding the mddev lock.
5497 * Must be called with mddev_lock held. 5670 * Must be called with mddev_lock held.
5671 *
5672 * In the ->external case MD_CHANGE_CLEAN can not be cleared until mddev->lock
5673 * is dropped, so return -EAGAIN after notifying userspace.
5498 */ 5674 */
5499void md_allow_write(mddev_t *mddev) 5675int md_allow_write(mddev_t *mddev)
5500{ 5676{
5501 if (!mddev->pers) 5677 if (!mddev->pers)
5502 return; 5678 return 0;
5503 if (mddev->ro) 5679 if (mddev->ro)
5504 return; 5680 return 0;
5681 if (!mddev->pers->sync_request)
5682 return 0;
5505 5683
5506 spin_lock_irq(&mddev->write_lock); 5684 spin_lock_irq(&mddev->write_lock);
5507 if (mddev->in_sync) { 5685 if (mddev->in_sync) {
@@ -5512,14 +5690,14 @@ void md_allow_write(mddev_t *mddev)
5512 mddev->safemode = 1; 5690 mddev->safemode = 1;
5513 spin_unlock_irq(&mddev->write_lock); 5691 spin_unlock_irq(&mddev->write_lock);
5514 md_update_sb(mddev, 0); 5692 md_update_sb(mddev, 0);
5515
5516 sysfs_notify(&mddev->kobj, NULL, "array_state"); 5693 sysfs_notify(&mddev->kobj, NULL, "array_state");
5517 /* wait for the dirty state to be recorded in the metadata */
5518 wait_event(mddev->sb_wait,
5519 !test_bit(MD_CHANGE_CLEAN, &mddev->flags) &&
5520 !test_bit(MD_CHANGE_PENDING, &mddev->flags));
5521 } else 5694 } else
5522 spin_unlock_irq(&mddev->write_lock); 5695 spin_unlock_irq(&mddev->write_lock);
5696
5697 if (test_bit(MD_CHANGE_CLEAN, &mddev->flags))
5698 return -EAGAIN;
5699 else
5700 return 0;
5523} 5701}
5524EXPORT_SYMBOL_GPL(md_allow_write); 5702EXPORT_SYMBOL_GPL(md_allow_write);
5525 5703
@@ -5625,9 +5803,11 @@ void md_do_sync(mddev_t *mddev)
5625 max_sectors = mddev->resync_max_sectors; 5803 max_sectors = mddev->resync_max_sectors;
5626 mddev->resync_mismatches = 0; 5804 mddev->resync_mismatches = 0;
5627 /* we don't use the checkpoint if there's a bitmap */ 5805 /* we don't use the checkpoint if there's a bitmap */
5628 if (!mddev->bitmap && 5806 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
5629 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 5807 j = mddev->resync_min;
5808 else if (!mddev->bitmap)
5630 j = mddev->recovery_cp; 5809 j = mddev->recovery_cp;
5810
5631 } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 5811 } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
5632 max_sectors = mddev->size << 1; 5812 max_sectors = mddev->size << 1;
5633 else { 5813 else {
@@ -5796,6 +5976,7 @@ void md_do_sync(mddev_t *mddev)
5796 5976
5797 skip: 5977 skip:
5798 mddev->curr_resync = 0; 5978 mddev->curr_resync = 0;
5979 mddev->resync_min = 0;
5799 mddev->resync_max = MaxSector; 5980 mddev->resync_max = MaxSector;
5800 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 5981 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
5801 wake_up(&resync_wait); 5982 wake_up(&resync_wait);
@@ -5845,7 +6026,8 @@ static int remove_and_add_spares(mddev_t *mddev)
5845 if (rdev->raid_disk < 0 6026 if (rdev->raid_disk < 0
5846 && !test_bit(Faulty, &rdev->flags)) { 6027 && !test_bit(Faulty, &rdev->flags)) {
5847 rdev->recovery_offset = 0; 6028 rdev->recovery_offset = 0;
5848 if (mddev->pers->hot_add_disk(mddev,rdev)) { 6029 if (mddev->pers->
6030 hot_add_disk(mddev, rdev) == 0) {
5849 char nm[20]; 6031 char nm[20];
5850 sprintf(nm, "rd%d", rdev->raid_disk); 6032 sprintf(nm, "rd%d", rdev->raid_disk);
5851 if (sysfs_create_link(&mddev->kobj, 6033 if (sysfs_create_link(&mddev->kobj,
@@ -5920,23 +6102,31 @@ void md_check_recovery(mddev_t *mddev)
5920 int spares = 0; 6102 int spares = 0;
5921 6103
5922 if (!mddev->external) { 6104 if (!mddev->external) {
6105 int did_change = 0;
5923 spin_lock_irq(&mddev->write_lock); 6106 spin_lock_irq(&mddev->write_lock);
5924 if (mddev->safemode && 6107 if (mddev->safemode &&
5925 !atomic_read(&mddev->writes_pending) && 6108 !atomic_read(&mddev->writes_pending) &&
5926 !mddev->in_sync && 6109 !mddev->in_sync &&
5927 mddev->recovery_cp == MaxSector) { 6110 mddev->recovery_cp == MaxSector) {
5928 mddev->in_sync = 1; 6111 mddev->in_sync = 1;
6112 did_change = 1;
5929 if (mddev->persistent) 6113 if (mddev->persistent)
5930 set_bit(MD_CHANGE_CLEAN, &mddev->flags); 6114 set_bit(MD_CHANGE_CLEAN, &mddev->flags);
5931 } 6115 }
5932 if (mddev->safemode == 1) 6116 if (mddev->safemode == 1)
5933 mddev->safemode = 0; 6117 mddev->safemode = 0;
5934 spin_unlock_irq(&mddev->write_lock); 6118 spin_unlock_irq(&mddev->write_lock);
6119 if (did_change)
6120 sysfs_notify(&mddev->kobj, NULL, "array_state");
5935 } 6121 }
5936 6122
5937 if (mddev->flags) 6123 if (mddev->flags)
5938 md_update_sb(mddev, 0); 6124 md_update_sb(mddev, 0);
5939 6125
6126 rdev_for_each(rdev, rtmp, mddev)
6127 if (test_and_clear_bit(StateChanged, &rdev->flags))
6128 sysfs_notify(&rdev->kobj, NULL, "state");
6129
5940 6130
5941 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && 6131 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
5942 !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) { 6132 !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
@@ -5951,7 +6141,9 @@ void md_check_recovery(mddev_t *mddev)
5951 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 6141 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
5952 /* success...*/ 6142 /* success...*/
5953 /* activate any spares */ 6143 /* activate any spares */
5954 mddev->pers->spare_active(mddev); 6144 if (mddev->pers->spare_active(mddev))
6145 sysfs_notify(&mddev->kobj, NULL,
6146 "degraded");
5955 } 6147 }
5956 md_update_sb(mddev, 1); 6148 md_update_sb(mddev, 1);
5957 6149
@@ -5965,13 +6157,18 @@ void md_check_recovery(mddev_t *mddev)
5965 mddev->recovery = 0; 6157 mddev->recovery = 0;
5966 /* flag recovery needed just to double check */ 6158 /* flag recovery needed just to double check */
5967 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6159 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6160 sysfs_notify(&mddev->kobj, NULL, "sync_action");
5968 md_new_event(mddev); 6161 md_new_event(mddev);
5969 goto unlock; 6162 goto unlock;
5970 } 6163 }
6164 /* Set RUNNING before clearing NEEDED to avoid
6165 * any transients in the value of "sync_action".
6166 */
6167 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
6168 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5971 /* Clear some bits that don't mean anything, but 6169 /* Clear some bits that don't mean anything, but
5972 * might be left set 6170 * might be left set
5973 */ 6171 */
5974 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5975 clear_bit(MD_RECOVERY_INTR, &mddev->recovery); 6172 clear_bit(MD_RECOVERY_INTR, &mddev->recovery);
5976 clear_bit(MD_RECOVERY_DONE, &mddev->recovery); 6173 clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
5977 6174
@@ -5989,17 +6186,19 @@ void md_check_recovery(mddev_t *mddev)
5989 /* Cannot proceed */ 6186 /* Cannot proceed */
5990 goto unlock; 6187 goto unlock;
5991 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 6188 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
6189 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
5992 } else if ((spares = remove_and_add_spares(mddev))) { 6190 } else if ((spares = remove_and_add_spares(mddev))) {
5993 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 6191 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
5994 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 6192 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
6193 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
5995 } else if (mddev->recovery_cp < MaxSector) { 6194 } else if (mddev->recovery_cp < MaxSector) {
5996 set_bit(MD_RECOVERY_SYNC, &mddev->recovery); 6195 set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
6196 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
5997 } else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 6197 } else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
5998 /* nothing to be done ... */ 6198 /* nothing to be done ... */
5999 goto unlock; 6199 goto unlock;
6000 6200
6001 if (mddev->pers->sync_request) { 6201 if (mddev->pers->sync_request) {
6002 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
6003 if (spares && mddev->bitmap && ! mddev->bitmap->file) { 6202 if (spares && mddev->bitmap && ! mddev->bitmap->file) {
6004 /* We are adding a device or devices to an array 6203 /* We are adding a device or devices to an array
6005 * which has the bitmap stored on all devices. 6204 * which has the bitmap stored on all devices.
@@ -6018,9 +6217,16 @@ void md_check_recovery(mddev_t *mddev)
6018 mddev->recovery = 0; 6217 mddev->recovery = 0;
6019 } else 6218 } else
6020 md_wakeup_thread(mddev->sync_thread); 6219 md_wakeup_thread(mddev->sync_thread);
6220 sysfs_notify(&mddev->kobj, NULL, "sync_action");
6021 md_new_event(mddev); 6221 md_new_event(mddev);
6022 } 6222 }
6023 unlock: 6223 unlock:
6224 if (!mddev->sync_thread) {
6225 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
6226 if (test_and_clear_bit(MD_RECOVERY_RECOVER,
6227 &mddev->recovery))
6228 sysfs_notify(&mddev->kobj, NULL, "sync_action");
6229 }
6024 mddev_unlock(mddev); 6230 mddev_unlock(mddev);
6025 } 6231 }
6026} 6232}
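
To illustrate the slot_store() change above (activating a spare on a running array with externally managed metadata), userspace might assign a spare to a specific slot roughly as follows. This is a hedged sketch, not the author's code; the device directory name and slot number are assumed examples.

/* Sketch: ask md to activate the spare behind dev-sdb1 as raid disk 2.
 * Paths are examples; on success the kernel creates the rd2 link and
 * notifies the rdev's "state" attribute, as added in this patch.
 */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/sys/block/md0/md/dev-sdb1/slot", "w");
	if (!f)
		return 1;
	if (fprintf(f, "2\n") < 0)
		return 1;
	return fclose(f);	/* non-zero if the kernel rejected the write */
}
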
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index e968116e0de9..541cbe3414bd 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -281,13 +281,18 @@ static int multipath_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
281{ 281{
282 multipath_conf_t *conf = mddev->private; 282 multipath_conf_t *conf = mddev->private;
283 struct request_queue *q; 283 struct request_queue *q;
284 int found = 0; 284 int err = -EEXIST;
285 int path; 285 int path;
286 struct multipath_info *p; 286 struct multipath_info *p;
287 int first = 0;
288 int last = mddev->raid_disks - 1;
289
290 if (rdev->raid_disk >= 0)
291 first = last = rdev->raid_disk;
287 292
288 print_multipath_conf(conf); 293 print_multipath_conf(conf);
289 294
290 for (path=0; path<mddev->raid_disks; path++) 295 for (path = first; path <= last; path++)
291 if ((p=conf->multipaths+path)->rdev == NULL) { 296 if ((p=conf->multipaths+path)->rdev == NULL) {
292 q = rdev->bdev->bd_disk->queue; 297 q = rdev->bdev->bd_disk->queue;
293 blk_queue_stack_limits(mddev->queue, q); 298 blk_queue_stack_limits(mddev->queue, q);
@@ -307,11 +312,13 @@ static int multipath_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
307 rdev->raid_disk = path; 312 rdev->raid_disk = path;
308 set_bit(In_sync, &rdev->flags); 313 set_bit(In_sync, &rdev->flags);
309 rcu_assign_pointer(p->rdev, rdev); 314 rcu_assign_pointer(p->rdev, rdev);
310 found = 1; 315 err = 0;
316 break;
311 } 317 }
312 318
313 print_multipath_conf(conf); 319 print_multipath_conf(conf);
314 return found; 320
321 return err;
315} 322}
316 323
317static int multipath_remove_disk(mddev_t *mddev, int number) 324static int multipath_remove_disk(mddev_t *mddev, int number)
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index c610b947218a..491dc2d4ad5f 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1100,11 +1100,16 @@ static int raid1_spare_active(mddev_t *mddev)
1100static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) 1100static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
1101{ 1101{
1102 conf_t *conf = mddev->private; 1102 conf_t *conf = mddev->private;
1103 int found = 0; 1103 int err = -EEXIST;
1104 int mirror = 0; 1104 int mirror = 0;
1105 mirror_info_t *p; 1105 mirror_info_t *p;
1106 int first = 0;
1107 int last = mddev->raid_disks - 1;
1106 1108
1107 for (mirror=0; mirror < mddev->raid_disks; mirror++) 1109 if (rdev->raid_disk >= 0)
1110 first = last = rdev->raid_disk;
1111
1112 for (mirror = first; mirror <= last; mirror++)
1108 if ( !(p=conf->mirrors+mirror)->rdev) { 1113 if ( !(p=conf->mirrors+mirror)->rdev) {
1109 1114
1110 blk_queue_stack_limits(mddev->queue, 1115 blk_queue_stack_limits(mddev->queue,
@@ -1119,7 +1124,7 @@ static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
1119 1124
1120 p->head_position = 0; 1125 p->head_position = 0;
1121 rdev->raid_disk = mirror; 1126 rdev->raid_disk = mirror;
1122 found = 1; 1127 err = 0;
1123 /* As all devices are equivalent, we don't need a full recovery 1128 /* As all devices are equivalent, we don't need a full recovery
1124 * if this was recently any drive of the array 1129 * if this was recently any drive of the array
1125 */ 1130 */
@@ -1130,7 +1135,7 @@ static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
1130 } 1135 }
1131 1136
1132 print_conf(conf); 1137 print_conf(conf);
1133 return found; 1138 return err;
1134} 1139}
1135 1140
1136static int raid1_remove_disk(mddev_t *mddev, int number) 1141static int raid1_remove_disk(mddev_t *mddev, int number)
@@ -2131,7 +2136,7 @@ static int raid1_reshape(mddev_t *mddev)
2131 conf_t *conf = mddev_to_conf(mddev); 2136 conf_t *conf = mddev_to_conf(mddev);
2132 int cnt, raid_disks; 2137 int cnt, raid_disks;
2133 unsigned long flags; 2138 unsigned long flags;
2134 int d, d2; 2139 int d, d2, err;
2135 2140
2136 /* Cannot change chunk_size, layout, or level */ 2141 /* Cannot change chunk_size, layout, or level */
2137 if (mddev->chunk_size != mddev->new_chunk || 2142 if (mddev->chunk_size != mddev->new_chunk ||
@@ -2143,7 +2148,9 @@ static int raid1_reshape(mddev_t *mddev)
2143 return -EINVAL; 2148 return -EINVAL;
2144 } 2149 }
2145 2150
2146 md_allow_write(mddev); 2151 err = md_allow_write(mddev);
2152 if (err)
2153 return err;
2147 2154
2148 raid_disks = mddev->raid_disks + mddev->delta_disks; 2155 raid_disks = mddev->raid_disks + mddev->delta_disks;
2149 2156
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index a71277b640ab..df08a9fa3a1f 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1113,24 +1113,30 @@ static int raid10_spare_active(mddev_t *mddev)
1113static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) 1113static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
1114{ 1114{
1115 conf_t *conf = mddev->private; 1115 conf_t *conf = mddev->private;
1116 int found = 0; 1116 int err = -EEXIST;
1117 int mirror; 1117 int mirror;
1118 mirror_info_t *p; 1118 mirror_info_t *p;
1119 int first = 0;
1120 int last = mddev->raid_disks - 1;
1119 1121
1120 if (mddev->recovery_cp < MaxSector) 1122 if (mddev->recovery_cp < MaxSector)
1121 /* only hot-add to in-sync arrays, as recovery is 1123 /* only hot-add to in-sync arrays, as recovery is
1122 * very different from resync 1124 * very different from resync
1123 */ 1125 */
1124 return 0; 1126 return -EBUSY;
1125 if (!enough(conf)) 1127 if (!enough(conf))
1126 return 0; 1128 return -EINVAL;
1129
1130 if (rdev->raid_disk)
1131 first = last = rdev->raid_disk;
1127 1132
1128 if (rdev->saved_raid_disk >= 0 && 1133 if (rdev->saved_raid_disk >= 0 &&
1134 rdev->saved_raid_disk >= first &&
1129 conf->mirrors[rdev->saved_raid_disk].rdev == NULL) 1135 conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
1130 mirror = rdev->saved_raid_disk; 1136 mirror = rdev->saved_raid_disk;
1131 else 1137 else
1132 mirror = 0; 1138 mirror = first;
1133 for ( ; mirror < mddev->raid_disks; mirror++) 1139 for ( ; mirror <= last ; mirror++)
1134 if ( !(p=conf->mirrors+mirror)->rdev) { 1140 if ( !(p=conf->mirrors+mirror)->rdev) {
1135 1141
1136 blk_queue_stack_limits(mddev->queue, 1142 blk_queue_stack_limits(mddev->queue,
@@ -1145,7 +1151,7 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
1145 1151
1146 p->head_position = 0; 1152 p->head_position = 0;
1147 rdev->raid_disk = mirror; 1153 rdev->raid_disk = mirror;
1148 found = 1; 1154 err = 0;
1149 if (rdev->saved_raid_disk != mirror) 1155 if (rdev->saved_raid_disk != mirror)
1150 conf->fullsync = 1; 1156 conf->fullsync = 1;
1151 rcu_assign_pointer(p->rdev, rdev); 1157 rcu_assign_pointer(p->rdev, rdev);
@@ -1153,7 +1159,7 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
1153 } 1159 }
1154 1160
1155 print_conf(conf); 1161 print_conf(conf);
1156 return found; 1162 return err;
1157} 1163}
1158 1164
1159static int raid10_remove_disk(mddev_t *mddev, int number) 1165static int raid10_remove_disk(mddev_t *mddev, int number)
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 3b27df52456b..8f4c70a53210 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -115,15 +115,20 @@ static void return_io(struct bio *return_bi)
115 return_bi = bi->bi_next; 115 return_bi = bi->bi_next;
116 bi->bi_next = NULL; 116 bi->bi_next = NULL;
117 bi->bi_size = 0; 117 bi->bi_size = 0;
118 bi->bi_end_io(bi, 118 bio_endio(bi, 0);
119 test_bit(BIO_UPTODATE, &bi->bi_flags)
120 ? 0 : -EIO);
121 bi = return_bi; 119 bi = return_bi;
122 } 120 }
123} 121}
124 122
125static void print_raid5_conf (raid5_conf_t *conf); 123static void print_raid5_conf (raid5_conf_t *conf);
126 124
125static int stripe_operations_active(struct stripe_head *sh)
126{
127 return sh->check_state || sh->reconstruct_state ||
128 test_bit(STRIPE_BIOFILL_RUN, &sh->state) ||
129 test_bit(STRIPE_COMPUTE_RUN, &sh->state);
130}
131
127static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh) 132static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
128{ 133{
129 if (atomic_dec_and_test(&sh->count)) { 134 if (atomic_dec_and_test(&sh->count)) {
@@ -143,7 +148,7 @@ static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
143 } 148 }
144 md_wakeup_thread(conf->mddev->thread); 149 md_wakeup_thread(conf->mddev->thread);
145 } else { 150 } else {
146 BUG_ON(sh->ops.pending); 151 BUG_ON(stripe_operations_active(sh));
147 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { 152 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
148 atomic_dec(&conf->preread_active_stripes); 153 atomic_dec(&conf->preread_active_stripes);
149 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) 154 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
@@ -245,7 +250,7 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx, int
245 250
246 BUG_ON(atomic_read(&sh->count) != 0); 251 BUG_ON(atomic_read(&sh->count) != 0);
247 BUG_ON(test_bit(STRIPE_HANDLE, &sh->state)); 252 BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
248 BUG_ON(sh->ops.pending || sh->ops.ack || sh->ops.complete); 253 BUG_ON(stripe_operations_active(sh));
249 254
250 CHECK_DEVLOCK(); 255 CHECK_DEVLOCK();
251 pr_debug("init_stripe called, stripe %llu\n", 256 pr_debug("init_stripe called, stripe %llu\n",
@@ -346,62 +351,18 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector
346 return sh; 351 return sh;
347} 352}
348 353
349/* test_and_ack_op() ensures that we only dequeue an operation once */
350#define test_and_ack_op(op, pend) \
351do { \
352 if (test_bit(op, &sh->ops.pending) && \
353 !test_bit(op, &sh->ops.complete)) { \
354 if (test_and_set_bit(op, &sh->ops.ack)) \
355 clear_bit(op, &pend); \
356 else \
357 ack++; \
358 } else \
359 clear_bit(op, &pend); \
360} while (0)
361
362/* find new work to run, do not resubmit work that is already
363 * in flight
364 */
365static unsigned long get_stripe_work(struct stripe_head *sh)
366{
367 unsigned long pending;
368 int ack = 0;
369
370 pending = sh->ops.pending;
371
372 test_and_ack_op(STRIPE_OP_BIOFILL, pending);
373 test_and_ack_op(STRIPE_OP_COMPUTE_BLK, pending);
374 test_and_ack_op(STRIPE_OP_PREXOR, pending);
375 test_and_ack_op(STRIPE_OP_BIODRAIN, pending);
376 test_and_ack_op(STRIPE_OP_POSTXOR, pending);
377 test_and_ack_op(STRIPE_OP_CHECK, pending);
378 if (test_and_clear_bit(STRIPE_OP_IO, &sh->ops.pending))
379 ack++;
380
381 sh->ops.count -= ack;
382 if (unlikely(sh->ops.count < 0)) {
383 printk(KERN_ERR "pending: %#lx ops.pending: %#lx ops.ack: %#lx "
384 "ops.complete: %#lx\n", pending, sh->ops.pending,
385 sh->ops.ack, sh->ops.complete);
386 BUG();
387 }
388
389 return pending;
390}
391
392static void 354static void
393raid5_end_read_request(struct bio *bi, int error); 355raid5_end_read_request(struct bio *bi, int error);
394static void 356static void
395raid5_end_write_request(struct bio *bi, int error); 357raid5_end_write_request(struct bio *bi, int error);
396 358
397static void ops_run_io(struct stripe_head *sh) 359static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
398{ 360{
399 raid5_conf_t *conf = sh->raid_conf; 361 raid5_conf_t *conf = sh->raid_conf;
400 int i, disks = sh->disks; 362 int i, disks = sh->disks;
401 363
402 might_sleep(); 364 might_sleep();
403 365
404 set_bit(STRIPE_IO_STARTED, &sh->state);
405 for (i = disks; i--; ) { 366 for (i = disks; i--; ) {
406 int rw; 367 int rw;
407 struct bio *bi; 368 struct bio *bi;
@@ -430,11 +391,11 @@ static void ops_run_io(struct stripe_head *sh)
430 rcu_read_unlock(); 391 rcu_read_unlock();
431 392
432 if (rdev) { 393 if (rdev) {
433 if (test_bit(STRIPE_SYNCING, &sh->state) || 394 if (s->syncing || s->expanding || s->expanded)
434 test_bit(STRIPE_EXPAND_SOURCE, &sh->state) ||
435 test_bit(STRIPE_EXPAND_READY, &sh->state))
436 md_sync_acct(rdev->bdev, STRIPE_SECTORS); 395 md_sync_acct(rdev->bdev, STRIPE_SECTORS);
437 396
397 set_bit(STRIPE_IO_STARTED, &sh->state);
398
438 bi->bi_bdev = rdev->bdev; 399 bi->bi_bdev = rdev->bdev;
439 pr_debug("%s: for %llu schedule op %ld on disc %d\n", 400 pr_debug("%s: for %llu schedule op %ld on disc %d\n",
440 __func__, (unsigned long long)sh->sector, 401 __func__, (unsigned long long)sh->sector,
@@ -528,38 +489,34 @@ static void ops_complete_biofill(void *stripe_head_ref)
528 (unsigned long long)sh->sector); 489 (unsigned long long)sh->sector);
529 490
530 /* clear completed biofills */ 491 /* clear completed biofills */
492 spin_lock_irq(&conf->device_lock);
531 for (i = sh->disks; i--; ) { 493 for (i = sh->disks; i--; ) {
532 struct r5dev *dev = &sh->dev[i]; 494 struct r5dev *dev = &sh->dev[i];
533 495
534 /* acknowledge completion of a biofill operation */ 496 /* acknowledge completion of a biofill operation */
535 /* and check if we need to reply to a read request, 497 /* and check if we need to reply to a read request,
536 * new R5_Wantfill requests are held off until 498 * new R5_Wantfill requests are held off until
537 * !test_bit(STRIPE_OP_BIOFILL, &sh->ops.pending) 499 * !STRIPE_BIOFILL_RUN
538 */ 500 */
539 if (test_and_clear_bit(R5_Wantfill, &dev->flags)) { 501 if (test_and_clear_bit(R5_Wantfill, &dev->flags)) {
540 struct bio *rbi, *rbi2; 502 struct bio *rbi, *rbi2;
541 503
542 /* The access to dev->read is outside of the
543 * spin_lock_irq(&conf->device_lock), but is protected
544 * by the STRIPE_OP_BIOFILL pending bit
545 */
546 BUG_ON(!dev->read); 504 BUG_ON(!dev->read);
547 rbi = dev->read; 505 rbi = dev->read;
548 dev->read = NULL; 506 dev->read = NULL;
549 while (rbi && rbi->bi_sector < 507 while (rbi && rbi->bi_sector <
550 dev->sector + STRIPE_SECTORS) { 508 dev->sector + STRIPE_SECTORS) {
551 rbi2 = r5_next_bio(rbi, dev->sector); 509 rbi2 = r5_next_bio(rbi, dev->sector);
552 spin_lock_irq(&conf->device_lock);
553 if (--rbi->bi_phys_segments == 0) { 510 if (--rbi->bi_phys_segments == 0) {
554 rbi->bi_next = return_bi; 511 rbi->bi_next = return_bi;
555 return_bi = rbi; 512 return_bi = rbi;
556 } 513 }
557 spin_unlock_irq(&conf->device_lock);
558 rbi = rbi2; 514 rbi = rbi2;
559 } 515 }
560 } 516 }
561 } 517 }
562 set_bit(STRIPE_OP_BIOFILL, &sh->ops.complete); 518 spin_unlock_irq(&conf->device_lock);
519 clear_bit(STRIPE_BIOFILL_RUN, &sh->state);
563 520
564 return_io(return_bi); 521 return_io(return_bi);
565 522
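ops_complete_biofill() now takes conf->device_lock once around the whole per-device walk instead of once per bio, and signals completion by clearing STRIPE_BIOFILL_RUN rather than setting a bit in the old ops.complete mask. Condensed shape of the resulting path (bio chaining details elided):

	spin_lock_irq(&conf->device_lock);
	for (i = sh->disks; i--; ) {
		/* move each completed read bio onto the local return_bi list */
	}
	spin_unlock_irq(&conf->device_lock);
	clear_bit(STRIPE_BIOFILL_RUN, &sh->state);	/* new fills may be requested again */

	return_io(return_bi);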
@@ -610,13 +567,14 @@ static void ops_complete_compute5(void *stripe_head_ref)
610 set_bit(R5_UPTODATE, &tgt->flags); 567 set_bit(R5_UPTODATE, &tgt->flags);
611 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); 568 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
612 clear_bit(R5_Wantcompute, &tgt->flags); 569 clear_bit(R5_Wantcompute, &tgt->flags);
613 set_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete); 570 clear_bit(STRIPE_COMPUTE_RUN, &sh->state);
571 if (sh->check_state == check_state_compute_run)
572 sh->check_state = check_state_compute_result;
614 set_bit(STRIPE_HANDLE, &sh->state); 573 set_bit(STRIPE_HANDLE, &sh->state);
615 release_stripe(sh); 574 release_stripe(sh);
616} 575}
617 576
618static struct dma_async_tx_descriptor * 577static struct dma_async_tx_descriptor *ops_run_compute5(struct stripe_head *sh)
619ops_run_compute5(struct stripe_head *sh, unsigned long pending)
620{ 578{
621 /* kernel stack size limits the total number of disks */ 579 /* kernel stack size limits the total number of disks */
622 int disks = sh->disks; 580 int disks = sh->disks;
@@ -646,10 +604,6 @@ ops_run_compute5(struct stripe_head *sh, unsigned long pending)
646 ASYNC_TX_XOR_ZERO_DST, NULL, 604 ASYNC_TX_XOR_ZERO_DST, NULL,
647 ops_complete_compute5, sh); 605 ops_complete_compute5, sh);
648 606
649 /* ack now if postxor is not set to be run */
650 if (tx && !test_bit(STRIPE_OP_POSTXOR, &pending))
651 async_tx_ack(tx);
652
653 return tx; 607 return tx;
654} 608}
655 609
@@ -659,8 +613,6 @@ static void ops_complete_prexor(void *stripe_head_ref)
659 613
660 pr_debug("%s: stripe %llu\n", __func__, 614 pr_debug("%s: stripe %llu\n", __func__,
661 (unsigned long long)sh->sector); 615 (unsigned long long)sh->sector);
662
663 set_bit(STRIPE_OP_PREXOR, &sh->ops.complete);
664} 616}
665 617
666static struct dma_async_tx_descriptor * 618static struct dma_async_tx_descriptor *
@@ -680,7 +632,7 @@ ops_run_prexor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
680 for (i = disks; i--; ) { 632 for (i = disks; i--; ) {
681 struct r5dev *dev = &sh->dev[i]; 633 struct r5dev *dev = &sh->dev[i];
682 /* Only process blocks that are known to be uptodate */ 634 /* Only process blocks that are known to be uptodate */
683 if (dev->towrite && test_bit(R5_Wantprexor, &dev->flags)) 635 if (test_bit(R5_Wantdrain, &dev->flags))
684 xor_srcs[count++] = dev->page; 636 xor_srcs[count++] = dev->page;
685 } 637 }
686 638
@@ -692,16 +644,10 @@ ops_run_prexor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
692} 644}
693 645
694static struct dma_async_tx_descriptor * 646static struct dma_async_tx_descriptor *
695ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx, 647ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
696 unsigned long pending)
697{ 648{
698 int disks = sh->disks; 649 int disks = sh->disks;
699 int pd_idx = sh->pd_idx, i; 650 int i;
700
701 /* check if prexor is active which means only process blocks
702 * that are part of a read-modify-write (Wantprexor)
703 */
704 int prexor = test_bit(STRIPE_OP_PREXOR, &pending);
705 651
706 pr_debug("%s: stripe %llu\n", __func__, 652 pr_debug("%s: stripe %llu\n", __func__,
707 (unsigned long long)sh->sector); 653 (unsigned long long)sh->sector);
@@ -709,20 +655,8 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx,
709 for (i = disks; i--; ) { 655 for (i = disks; i--; ) {
710 struct r5dev *dev = &sh->dev[i]; 656 struct r5dev *dev = &sh->dev[i];
711 struct bio *chosen; 657 struct bio *chosen;
712 int towrite;
713
714 towrite = 0;
715 if (prexor) { /* rmw */
716 if (dev->towrite &&
717 test_bit(R5_Wantprexor, &dev->flags))
718 towrite = 1;
719 } else { /* rcw */
720 if (i != pd_idx && dev->towrite &&
721 test_bit(R5_LOCKED, &dev->flags))
722 towrite = 1;
723 }
724 658
725 if (towrite) { 659 if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) {
726 struct bio *wbi; 660 struct bio *wbi;
727 661
728 spin_lock(&sh->lock); 662 spin_lock(&sh->lock);
@@ -747,18 +681,6 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx,
747static void ops_complete_postxor(void *stripe_head_ref) 681static void ops_complete_postxor(void *stripe_head_ref)
748{ 682{
749 struct stripe_head *sh = stripe_head_ref; 683 struct stripe_head *sh = stripe_head_ref;
750
751 pr_debug("%s: stripe %llu\n", __func__,
752 (unsigned long long)sh->sector);
753
754 set_bit(STRIPE_OP_POSTXOR, &sh->ops.complete);
755 set_bit(STRIPE_HANDLE, &sh->state);
756 release_stripe(sh);
757}
758
759static void ops_complete_write(void *stripe_head_ref)
760{
761 struct stripe_head *sh = stripe_head_ref;
762 int disks = sh->disks, i, pd_idx = sh->pd_idx; 684 int disks = sh->disks, i, pd_idx = sh->pd_idx;
763 685
764 pr_debug("%s: stripe %llu\n", __func__, 686 pr_debug("%s: stripe %llu\n", __func__,
@@ -770,16 +692,21 @@ static void ops_complete_write(void *stripe_head_ref)
770 set_bit(R5_UPTODATE, &dev->flags); 692 set_bit(R5_UPTODATE, &dev->flags);
771 } 693 }
772 694
773 set_bit(STRIPE_OP_BIODRAIN, &sh->ops.complete); 695 if (sh->reconstruct_state == reconstruct_state_drain_run)
774 set_bit(STRIPE_OP_POSTXOR, &sh->ops.complete); 696 sh->reconstruct_state = reconstruct_state_drain_result;
697 else if (sh->reconstruct_state == reconstruct_state_prexor_drain_run)
698 sh->reconstruct_state = reconstruct_state_prexor_drain_result;
699 else {
700 BUG_ON(sh->reconstruct_state != reconstruct_state_run);
701 sh->reconstruct_state = reconstruct_state_result;
702 }
775 703
776 set_bit(STRIPE_HANDLE, &sh->state); 704 set_bit(STRIPE_HANDLE, &sh->state);
777 release_stripe(sh); 705 release_stripe(sh);
778} 706}
779 707
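ops_complete_write() is folded into ops_complete_postxor(), which now both marks the drained blocks uptodate and advances the reconstruct state machine. A compact summary of the branches as written above:

	/*
	 * reconstruct_state_drain_run        -> reconstruct_state_drain_result
	 * reconstruct_state_prexor_drain_run -> reconstruct_state_prexor_drain_result
	 * reconstruct_state_run              -> reconstruct_state_result   (expand path)
	 *
	 * handle_stripe5() later returns the *_result states to idle once the
	 * written blocks and parity have been queued for disk.
	 */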
780static void 708static void
781ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx, 709ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
782 unsigned long pending)
783{ 710{
784 /* kernel stack size limits the total number of disks */ 711 /* kernel stack size limits the total number of disks */
785 int disks = sh->disks; 712 int disks = sh->disks;
@@ -787,9 +714,8 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx,
787 714
788 int count = 0, pd_idx = sh->pd_idx, i; 715 int count = 0, pd_idx = sh->pd_idx, i;
789 struct page *xor_dest; 716 struct page *xor_dest;
790 int prexor = test_bit(STRIPE_OP_PREXOR, &pending); 717 int prexor = 0;
791 unsigned long flags; 718 unsigned long flags;
792 dma_async_tx_callback callback;
793 719
794 pr_debug("%s: stripe %llu\n", __func__, 720 pr_debug("%s: stripe %llu\n", __func__,
795 (unsigned long long)sh->sector); 721 (unsigned long long)sh->sector);
@@ -797,7 +723,8 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx,
797 /* check if prexor is active which means only process blocks 723 /* check if prexor is active which means only process blocks
798 * that are part of a read-modify-write (written) 724 * that are part of a read-modify-write (written)
799 */ 725 */
800 if (prexor) { 726 if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
727 prexor = 1;
801 xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; 728 xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
802 for (i = disks; i--; ) { 729 for (i = disks; i--; ) {
803 struct r5dev *dev = &sh->dev[i]; 730 struct r5dev *dev = &sh->dev[i];
@@ -813,10 +740,6 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx,
813 } 740 }
814 } 741 }
815 742
816 /* check whether this postxor is part of a write */
817 callback = test_bit(STRIPE_OP_BIODRAIN, &pending) ?
818 ops_complete_write : ops_complete_postxor;
819
820 /* 1/ if we prexor'd then the dest is reused as a source 743 /* 1/ if we prexor'd then the dest is reused as a source
821 * 2/ if we did not prexor then we are redoing the parity 744 * 2/ if we did not prexor then we are redoing the parity
822 * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST 745 * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST
@@ -830,25 +753,20 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx,
830 if (unlikely(count == 1)) { 753 if (unlikely(count == 1)) {
831 flags &= ~(ASYNC_TX_XOR_DROP_DST | ASYNC_TX_XOR_ZERO_DST); 754 flags &= ~(ASYNC_TX_XOR_DROP_DST | ASYNC_TX_XOR_ZERO_DST);
832 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, 755 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE,
833 flags, tx, callback, sh); 756 flags, tx, ops_complete_postxor, sh);
834 } else 757 } else
835 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, 758 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
836 flags, tx, callback, sh); 759 flags, tx, ops_complete_postxor, sh);
837} 760}
838 761
839static void ops_complete_check(void *stripe_head_ref) 762static void ops_complete_check(void *stripe_head_ref)
840{ 763{
841 struct stripe_head *sh = stripe_head_ref; 764 struct stripe_head *sh = stripe_head_ref;
842 int pd_idx = sh->pd_idx;
843 765
844 pr_debug("%s: stripe %llu\n", __func__, 766 pr_debug("%s: stripe %llu\n", __func__,
845 (unsigned long long)sh->sector); 767 (unsigned long long)sh->sector);
846 768
847 if (test_and_clear_bit(STRIPE_OP_MOD_DMA_CHECK, &sh->ops.pending) && 769 sh->check_state = check_state_check_result;
848 sh->ops.zero_sum_result == 0)
849 set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
850
851 set_bit(STRIPE_OP_CHECK, &sh->ops.complete);
852 set_bit(STRIPE_HANDLE, &sh->state); 770 set_bit(STRIPE_HANDLE, &sh->state);
853 release_stripe(sh); 771 release_stripe(sh);
854} 772}
@@ -875,46 +793,42 @@ static void ops_run_check(struct stripe_head *sh)
875 tx = async_xor_zero_sum(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, 793 tx = async_xor_zero_sum(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
876 &sh->ops.zero_sum_result, 0, NULL, NULL, NULL); 794 &sh->ops.zero_sum_result, 0, NULL, NULL, NULL);
877 795
878 if (tx)
879 set_bit(STRIPE_OP_MOD_DMA_CHECK, &sh->ops.pending);
880 else
881 clear_bit(STRIPE_OP_MOD_DMA_CHECK, &sh->ops.pending);
882
883 atomic_inc(&sh->count); 796 atomic_inc(&sh->count);
884 tx = async_trigger_callback(ASYNC_TX_DEP_ACK | ASYNC_TX_ACK, tx, 797 tx = async_trigger_callback(ASYNC_TX_DEP_ACK | ASYNC_TX_ACK, tx,
885 ops_complete_check, sh); 798 ops_complete_check, sh);
886} 799}
887 800
888static void raid5_run_ops(struct stripe_head *sh, unsigned long pending) 801static void raid5_run_ops(struct stripe_head *sh, unsigned long ops_request)
889{ 802{
890 int overlap_clear = 0, i, disks = sh->disks; 803 int overlap_clear = 0, i, disks = sh->disks;
891 struct dma_async_tx_descriptor *tx = NULL; 804 struct dma_async_tx_descriptor *tx = NULL;
892 805
893 if (test_bit(STRIPE_OP_BIOFILL, &pending)) { 806 if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) {
894 ops_run_biofill(sh); 807 ops_run_biofill(sh);
895 overlap_clear++; 808 overlap_clear++;
896 } 809 }
897 810
898 if (test_bit(STRIPE_OP_COMPUTE_BLK, &pending)) 811 if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) {
899 tx = ops_run_compute5(sh, pending); 812 tx = ops_run_compute5(sh);
813 /* terminate the chain if postxor is not set to be run */
814 if (tx && !test_bit(STRIPE_OP_POSTXOR, &ops_request))
815 async_tx_ack(tx);
816 }
900 817
901 if (test_bit(STRIPE_OP_PREXOR, &pending)) 818 if (test_bit(STRIPE_OP_PREXOR, &ops_request))
902 tx = ops_run_prexor(sh, tx); 819 tx = ops_run_prexor(sh, tx);
903 820
904 if (test_bit(STRIPE_OP_BIODRAIN, &pending)) { 821 if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) {
905 tx = ops_run_biodrain(sh, tx, pending); 822 tx = ops_run_biodrain(sh, tx);
906 overlap_clear++; 823 overlap_clear++;
907 } 824 }
908 825
909 if (test_bit(STRIPE_OP_POSTXOR, &pending)) 826 if (test_bit(STRIPE_OP_POSTXOR, &ops_request))
910 ops_run_postxor(sh, tx, pending); 827 ops_run_postxor(sh, tx);
911 828
912 if (test_bit(STRIPE_OP_CHECK, &pending)) 829 if (test_bit(STRIPE_OP_CHECK, &ops_request))
913 ops_run_check(sh); 830 ops_run_check(sh);
914 831
915 if (test_bit(STRIPE_OP_IO, &pending))
916 ops_run_io(sh);
917
918 if (overlap_clear) 832 if (overlap_clear)
919 for (i = disks; i--; ) { 833 for (i = disks; i--; ) {
920 struct r5dev *dev = &sh->dev[i]; 834 struct r5dev *dev = &sh->dev[i];
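raid5_run_ops() is now driven purely by the ops_request mask assembled in stripe_head_state, and the decision to terminate a lone compute chain with async_tx_ack() moves here from ops_run_compute5(). Disk I/O is no longer one of the ops; it is submitted separately by ops_run_io(). The stages still execute in dependency order, threading one dma_async_tx_descriptor through the chain; as a reminder of what each request bit asks for:

	/* STRIPE_OP_BIOFILL     - copy cached stripe data out to waiting read bios
	 * STRIPE_OP_COMPUTE_BLK - rebuild one block by xor of the others
	 * STRIPE_OP_PREXOR      - xor the old data out of the parity (rmw only)
	 * STRIPE_OP_BIODRAIN    - copy new write data into the stripe cache
	 * STRIPE_OP_POSTXOR     - recompute the parity block
	 * STRIPE_OP_CHECK       - zero-sum xor to verify parity
	 */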
@@ -997,14 +911,16 @@ static int resize_stripes(raid5_conf_t *conf, int newsize)
997 struct stripe_head *osh, *nsh; 911 struct stripe_head *osh, *nsh;
998 LIST_HEAD(newstripes); 912 LIST_HEAD(newstripes);
999 struct disk_info *ndisks; 913 struct disk_info *ndisks;
1000 int err = 0; 914 int err;
1001 struct kmem_cache *sc; 915 struct kmem_cache *sc;
1002 int i; 916 int i;
1003 917
1004 if (newsize <= conf->pool_size) 918 if (newsize <= conf->pool_size)
1005 return 0; /* never bother to shrink */ 919 return 0; /* never bother to shrink */
1006 920
1007 md_allow_write(conf->mddev); 921 err = md_allow_write(conf->mddev);
922 if (err)
923 return err;
1008 924
1009 /* Step 1 */ 925 /* Step 1 */
1010 sc = kmem_cache_create(conf->cache_name[1-conf->active_name], 926 sc = kmem_cache_create(conf->cache_name[1-conf->active_name],
@@ -1703,11 +1619,11 @@ static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2)
1703 } 1619 }
1704} 1620}
1705 1621
1706static int 1622static void
1707handle_write_operations5(struct stripe_head *sh, int rcw, int expand) 1623schedule_reconstruction5(struct stripe_head *sh, struct stripe_head_state *s,
1624 int rcw, int expand)
1708{ 1625{
1709 int i, pd_idx = sh->pd_idx, disks = sh->disks; 1626 int i, pd_idx = sh->pd_idx, disks = sh->disks;
1710 int locked = 0;
1711 1627
1712 if (rcw) { 1628 if (rcw) {
1713 /* if we are not expanding this is a proper write request, and 1629 /* if we are not expanding this is a proper write request, and
@@ -1715,53 +1631,48 @@ handle_write_operations5(struct stripe_head *sh, int rcw, int expand)
1715 * stripe cache 1631 * stripe cache
1716 */ 1632 */
1717 if (!expand) { 1633 if (!expand) {
1718 set_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending); 1634 sh->reconstruct_state = reconstruct_state_drain_run;
1719 sh->ops.count++; 1635 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
1720 } 1636 } else
1637 sh->reconstruct_state = reconstruct_state_run;
1721 1638
1722 set_bit(STRIPE_OP_POSTXOR, &sh->ops.pending); 1639 set_bit(STRIPE_OP_POSTXOR, &s->ops_request);
1723 sh->ops.count++;
1724 1640
1725 for (i = disks; i--; ) { 1641 for (i = disks; i--; ) {
1726 struct r5dev *dev = &sh->dev[i]; 1642 struct r5dev *dev = &sh->dev[i];
1727 1643
1728 if (dev->towrite) { 1644 if (dev->towrite) {
1729 set_bit(R5_LOCKED, &dev->flags); 1645 set_bit(R5_LOCKED, &dev->flags);
1646 set_bit(R5_Wantdrain, &dev->flags);
1730 if (!expand) 1647 if (!expand)
1731 clear_bit(R5_UPTODATE, &dev->flags); 1648 clear_bit(R5_UPTODATE, &dev->flags);
1732 locked++; 1649 s->locked++;
1733 } 1650 }
1734 } 1651 }
1735 if (locked + 1 == disks) 1652 if (s->locked + 1 == disks)
1736 if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state)) 1653 if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state))
1737 atomic_inc(&sh->raid_conf->pending_full_writes); 1654 atomic_inc(&sh->raid_conf->pending_full_writes);
1738 } else { 1655 } else {
1739 BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) || 1656 BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) ||
1740 test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags))); 1657 test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags)));
1741 1658
1742 set_bit(STRIPE_OP_PREXOR, &sh->ops.pending); 1659 sh->reconstruct_state = reconstruct_state_prexor_drain_run;
1743 set_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending); 1660 set_bit(STRIPE_OP_PREXOR, &s->ops_request);
1744 set_bit(STRIPE_OP_POSTXOR, &sh->ops.pending); 1661 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
1745 1662 set_bit(STRIPE_OP_POSTXOR, &s->ops_request);
1746 sh->ops.count += 3;
1747 1663
1748 for (i = disks; i--; ) { 1664 for (i = disks; i--; ) {
1749 struct r5dev *dev = &sh->dev[i]; 1665 struct r5dev *dev = &sh->dev[i];
1750 if (i == pd_idx) 1666 if (i == pd_idx)
1751 continue; 1667 continue;
1752 1668
1753 /* For a read-modify write there may be blocks that are
1754 * locked for reading while others are ready to be
1755 * written so we distinguish these blocks by the
1756 * R5_Wantprexor bit
1757 */
1758 if (dev->towrite && 1669 if (dev->towrite &&
1759 (test_bit(R5_UPTODATE, &dev->flags) || 1670 (test_bit(R5_UPTODATE, &dev->flags) ||
1760 test_bit(R5_Wantcompute, &dev->flags))) { 1671 test_bit(R5_Wantcompute, &dev->flags))) {
1761 set_bit(R5_Wantprexor, &dev->flags); 1672 set_bit(R5_Wantdrain, &dev->flags);
1762 set_bit(R5_LOCKED, &dev->flags); 1673 set_bit(R5_LOCKED, &dev->flags);
1763 clear_bit(R5_UPTODATE, &dev->flags); 1674 clear_bit(R5_UPTODATE, &dev->flags);
1764 locked++; 1675 s->locked++;
1765 } 1676 }
1766 } 1677 }
1767 } 1678 }
@@ -1771,13 +1682,11 @@ handle_write_operations5(struct stripe_head *sh, int rcw, int expand)
1771 */ 1682 */
1772 set_bit(R5_LOCKED, &sh->dev[pd_idx].flags); 1683 set_bit(R5_LOCKED, &sh->dev[pd_idx].flags);
1773 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); 1684 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
1774 locked++; 1685 s->locked++;
1775 1686
1776 pr_debug("%s: stripe %llu locked: %d pending: %lx\n", 1687 pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n",
1777 __func__, (unsigned long long)sh->sector, 1688 __func__, (unsigned long long)sh->sector,
1778 locked, sh->ops.pending); 1689 s->locked, s->ops_request);
1779
1780 return locked;
1781} 1690}
1782 1691
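schedule_reconstruction5() (renamed from handle_write_operations5) no longer returns a lock count or touches ops.pending; it updates s->locked directly, records the chosen plan in sh->reconstruct_state, sets the matching bits in s->ops_request, and tags the affected blocks with R5_Wantdrain, which replaces R5_Wantprexor. Condensed, the two non-expand plans look like:

	if (rcw) {	/* reconstruct-write: drain new data, then redo parity */
		sh->reconstruct_state = reconstruct_state_drain_run;
		set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
		set_bit(STRIPE_OP_POSTXOR, &s->ops_request);
	} else {	/* read-modify-write: subtract the old data first */
		sh->reconstruct_state = reconstruct_state_prexor_drain_run;
		set_bit(STRIPE_OP_PREXOR, &s->ops_request);
		set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
		set_bit(STRIPE_OP_POSTXOR, &s->ops_request);
	}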
1783/* 1692/*
@@ -1876,7 +1785,7 @@ static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int disks)
1876} 1785}
1877 1786
1878static void 1787static void
1879handle_requests_to_failed_array(raid5_conf_t *conf, struct stripe_head *sh, 1788handle_failed_stripe(raid5_conf_t *conf, struct stripe_head *sh,
1880 struct stripe_head_state *s, int disks, 1789 struct stripe_head_state *s, int disks,
1881 struct bio **return_bi) 1790 struct bio **return_bi)
1882{ 1791{
@@ -1967,48 +1876,38 @@ handle_requests_to_failed_array(raid5_conf_t *conf, struct stripe_head *sh,
1967 md_wakeup_thread(conf->mddev->thread); 1876 md_wakeup_thread(conf->mddev->thread);
1968} 1877}
1969 1878
1970/* __handle_issuing_new_read_requests5 - returns 0 if there are no more disks 1879/* fetch_block5 - checks the given member device to see if its data needs
1971 * to process 1880 * to be read or computed to satisfy a request.
1881 *
1882 * Returns 1 when no more member devices need to be checked, otherwise returns
1883 * 0 to tell the loop in handle_stripe_fill5 to continue
1972 */ 1884 */
1973static int __handle_issuing_new_read_requests5(struct stripe_head *sh, 1885static int fetch_block5(struct stripe_head *sh, struct stripe_head_state *s,
1974 struct stripe_head_state *s, int disk_idx, int disks) 1886 int disk_idx, int disks)
1975{ 1887{
1976 struct r5dev *dev = &sh->dev[disk_idx]; 1888 struct r5dev *dev = &sh->dev[disk_idx];
1977 struct r5dev *failed_dev = &sh->dev[s->failed_num]; 1889 struct r5dev *failed_dev = &sh->dev[s->failed_num];
1978 1890
1979 /* don't schedule compute operations or reads on the parity block while
1980 * a check is in flight
1981 */
1982 if ((disk_idx == sh->pd_idx) &&
1983 test_bit(STRIPE_OP_CHECK, &sh->ops.pending))
1984 return ~0;
1985
1986 /* is the data in this block needed, and can we get it? */ 1891 /* is the data in this block needed, and can we get it? */
1987 if (!test_bit(R5_LOCKED, &dev->flags) && 1892 if (!test_bit(R5_LOCKED, &dev->flags) &&
1988 !test_bit(R5_UPTODATE, &dev->flags) && (dev->toread || 1893 !test_bit(R5_UPTODATE, &dev->flags) &&
1989 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || 1894 (dev->toread ||
1990 s->syncing || s->expanding || (s->failed && 1895 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
1991 (failed_dev->toread || (failed_dev->towrite && 1896 s->syncing || s->expanding ||
1992 !test_bit(R5_OVERWRITE, &failed_dev->flags) 1897 (s->failed &&
1993 ))))) { 1898 (failed_dev->toread ||
1994 /* 1/ We would like to get this block, possibly by computing it, 1899 (failed_dev->towrite &&
1995 * but we might not be able to. 1900 !test_bit(R5_OVERWRITE, &failed_dev->flags)))))) {
1996 * 1901 /* We would like to get this block, possibly by computing it,
1997 * 2/ Since parity check operations potentially make the parity 1902 * otherwise read it if the backing disk is insync
1998 * block !uptodate it will need to be refreshed before any
1999 * compute operations on data disks are scheduled.
2000 *
2001 * 3/ We hold off parity block re-reads until check operations
2002 * have quiesced.
2003 */ 1903 */
2004 if ((s->uptodate == disks - 1) && 1904 if ((s->uptodate == disks - 1) &&
2005 (s->failed && disk_idx == s->failed_num) && 1905 (s->failed && disk_idx == s->failed_num)) {
2006 !test_bit(STRIPE_OP_CHECK, &sh->ops.pending)) { 1906 set_bit(STRIPE_COMPUTE_RUN, &sh->state);
2007 set_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending); 1907 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
2008 set_bit(R5_Wantcompute, &dev->flags); 1908 set_bit(R5_Wantcompute, &dev->flags);
2009 sh->ops.target = disk_idx; 1909 sh->ops.target = disk_idx;
2010 s->req_compute = 1; 1910 s->req_compute = 1;
2011 sh->ops.count++;
2012 /* Careful: from this point on 'uptodate' is in the eye 1911 /* Careful: from this point on 'uptodate' is in the eye
2013 * of raid5_run_ops which services 'compute' operations 1912 * of raid5_run_ops which services 'compute' operations
2014 * before writes. R5_Wantcompute flags a block that will 1913 * before writes. R5_Wantcompute flags a block that will
@@ -2016,53 +1915,40 @@ static int __handle_issuing_new_read_requests5(struct stripe_head *sh,
2016 * subsequent operation. 1915 * subsequent operation.
2017 */ 1916 */
2018 s->uptodate++; 1917 s->uptodate++;
2019 return 0; /* uptodate + compute == disks */ 1918 return 1; /* uptodate + compute == disks */
2020 } else if (test_bit(R5_Insync, &dev->flags)) { 1919 } else if (test_bit(R5_Insync, &dev->flags)) {
2021 set_bit(R5_LOCKED, &dev->flags); 1920 set_bit(R5_LOCKED, &dev->flags);
2022 set_bit(R5_Wantread, &dev->flags); 1921 set_bit(R5_Wantread, &dev->flags);
2023 if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
2024 sh->ops.count++;
2025 s->locked++; 1922 s->locked++;
2026 pr_debug("Reading block %d (sync=%d)\n", disk_idx, 1923 pr_debug("Reading block %d (sync=%d)\n", disk_idx,
2027 s->syncing); 1924 s->syncing);
2028 } 1925 }
2029 } 1926 }
2030 1927
2031 return ~0; 1928 return 0;
2032} 1929}
2033 1930
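fetch_block5() flips the return convention: 1 now tells the loop in handle_stripe_fill5() to stop, replacing the old 0/~0 scheme, and the special-casing around in-flight check operations disappears because the per-disk loop is simply skipped while a compute, check_state or reconstruct_state is active. The core decision for a wanted block, condensed from the function above:

	if (s->uptodate == disks - 1 && s->failed && disk_idx == s->failed_num) {
		/* everything else is up to date: rebuild this block in memory */
		set_bit(STRIPE_COMPUTE_RUN, &sh->state);
		set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
		set_bit(R5_Wantcompute, &dev->flags);
	} else if (test_bit(R5_Insync, &dev->flags)) {
		/* otherwise read it from the backing device */
		set_bit(R5_LOCKED, &dev->flags);
		set_bit(R5_Wantread, &dev->flags);
	}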
2034static void handle_issuing_new_read_requests5(struct stripe_head *sh, 1931/**
1932 * handle_stripe_fill5 - read or compute data to satisfy pending requests.
1933 */
1934static void handle_stripe_fill5(struct stripe_head *sh,
2035 struct stripe_head_state *s, int disks) 1935 struct stripe_head_state *s, int disks)
2036{ 1936{
2037 int i; 1937 int i;
2038 1938
2039 /* Clear completed compute operations. Parity recovery
2040 * (STRIPE_OP_MOD_REPAIR_PD) implies a write-back which is handled
2041 * later on in this routine
2042 */
2043 if (test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete) &&
2044 !test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) {
2045 clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete);
2046 clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.ack);
2047 clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending);
2048 }
2049
2050 /* look for blocks to read/compute, skip this if a compute 1939 /* look for blocks to read/compute, skip this if a compute
2051 * is already in flight, or if the stripe contents are in the 1940 * is already in flight, or if the stripe contents are in the
2052 * midst of changing due to a write 1941 * midst of changing due to a write
2053 */ 1942 */
2054 if (!test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending) && 1943 if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state &&
2055 !test_bit(STRIPE_OP_PREXOR, &sh->ops.pending) && 1944 !sh->reconstruct_state)
2056 !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) {
2057 for (i = disks; i--; ) 1945 for (i = disks; i--; )
2058 if (__handle_issuing_new_read_requests5( 1946 if (fetch_block5(sh, s, i, disks))
2059 sh, s, i, disks) == 0)
2060 break; 1947 break;
2061 }
2062 set_bit(STRIPE_HANDLE, &sh->state); 1948 set_bit(STRIPE_HANDLE, &sh->state);
2063} 1949}
2064 1950
2065static void handle_issuing_new_read_requests6(struct stripe_head *sh, 1951static void handle_stripe_fill6(struct stripe_head *sh,
2066 struct stripe_head_state *s, struct r6_state *r6s, 1952 struct stripe_head_state *s, struct r6_state *r6s,
2067 int disks) 1953 int disks)
2068{ 1954{
@@ -2121,12 +2007,12 @@ static void handle_issuing_new_read_requests6(struct stripe_head *sh,
2121} 2007}
2122 2008
2123 2009
2124/* handle_completed_write_requests 2010/* handle_stripe_clean_event
2125 * any written block on an uptodate or failed drive can be returned. 2011 * any written block on an uptodate or failed drive can be returned.
2126 * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but 2012 * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but
2127 * never LOCKED, so we don't need to test 'failed' directly. 2013 * never LOCKED, so we don't need to test 'failed' directly.
2128 */ 2014 */
2129static void handle_completed_write_requests(raid5_conf_t *conf, 2015static void handle_stripe_clean_event(raid5_conf_t *conf,
2130 struct stripe_head *sh, int disks, struct bio **return_bi) 2016 struct stripe_head *sh, int disks, struct bio **return_bi)
2131{ 2017{
2132 int i; 2018 int i;
@@ -2171,7 +2057,7 @@ static void handle_completed_write_requests(raid5_conf_t *conf,
2171 md_wakeup_thread(conf->mddev->thread); 2057 md_wakeup_thread(conf->mddev->thread);
2172} 2058}
2173 2059
2174static void handle_issuing_new_write_requests5(raid5_conf_t *conf, 2060static void handle_stripe_dirtying5(raid5_conf_t *conf,
2175 struct stripe_head *sh, struct stripe_head_state *s, int disks) 2061 struct stripe_head *sh, struct stripe_head_state *s, int disks)
2176{ 2062{
2177 int rmw = 0, rcw = 0, i; 2063 int rmw = 0, rcw = 0, i;
@@ -2215,9 +2101,6 @@ static void handle_issuing_new_write_requests5(raid5_conf_t *conf,
2215 "%d for r-m-w\n", i); 2101 "%d for r-m-w\n", i);
2216 set_bit(R5_LOCKED, &dev->flags); 2102 set_bit(R5_LOCKED, &dev->flags);
2217 set_bit(R5_Wantread, &dev->flags); 2103 set_bit(R5_Wantread, &dev->flags);
2218 if (!test_and_set_bit(
2219 STRIPE_OP_IO, &sh->ops.pending))
2220 sh->ops.count++;
2221 s->locked++; 2104 s->locked++;
2222 } else { 2105 } else {
2223 set_bit(STRIPE_DELAYED, &sh->state); 2106 set_bit(STRIPE_DELAYED, &sh->state);
@@ -2241,9 +2124,6 @@ static void handle_issuing_new_write_requests5(raid5_conf_t *conf,
2241 "%d for Reconstruct\n", i); 2124 "%d for Reconstruct\n", i);
2242 set_bit(R5_LOCKED, &dev->flags); 2125 set_bit(R5_LOCKED, &dev->flags);
2243 set_bit(R5_Wantread, &dev->flags); 2126 set_bit(R5_Wantread, &dev->flags);
2244 if (!test_and_set_bit(
2245 STRIPE_OP_IO, &sh->ops.pending))
2246 sh->ops.count++;
2247 s->locked++; 2127 s->locked++;
2248 } else { 2128 } else {
2249 set_bit(STRIPE_DELAYED, &sh->state); 2129 set_bit(STRIPE_DELAYED, &sh->state);
@@ -2261,14 +2141,13 @@ static void handle_issuing_new_write_requests5(raid5_conf_t *conf,
2261 * simultaneously. If this is not the case then new writes need to be 2141 * simultaneously. If this is not the case then new writes need to be
2262 * held off until the compute completes. 2142 * held off until the compute completes.
2263 */ 2143 */
2264 if ((s->req_compute || 2144 if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) &&
2265 !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) && 2145 (s->locked == 0 && (rcw == 0 || rmw == 0) &&
2266 (s->locked == 0 && (rcw == 0 || rmw == 0) && 2146 !test_bit(STRIPE_BIT_DELAY, &sh->state)))
2267 !test_bit(STRIPE_BIT_DELAY, &sh->state))) 2147 schedule_reconstruction5(sh, s, rcw == 0, 0);
2268 s->locked += handle_write_operations5(sh, rcw == 0, 0);
2269} 2148}
2270 2149
2271static void handle_issuing_new_write_requests6(raid5_conf_t *conf, 2150static void handle_stripe_dirtying6(raid5_conf_t *conf,
2272 struct stripe_head *sh, struct stripe_head_state *s, 2151 struct stripe_head *sh, struct stripe_head_state *s,
2273 struct r6_state *r6s, int disks) 2152 struct r6_state *r6s, int disks)
2274{ 2153{
@@ -2371,92 +2250,86 @@ static void handle_issuing_new_write_requests6(raid5_conf_t *conf,
2371static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh, 2250static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh,
2372 struct stripe_head_state *s, int disks) 2251 struct stripe_head_state *s, int disks)
2373{ 2252{
2374 int canceled_check = 0; 2253 struct r5dev *dev = NULL;
2375 2254
2376 set_bit(STRIPE_HANDLE, &sh->state); 2255 set_bit(STRIPE_HANDLE, &sh->state);
2377 2256
2378 /* complete a check operation */ 2257 switch (sh->check_state) {
2379 if (test_and_clear_bit(STRIPE_OP_CHECK, &sh->ops.complete)) { 2258 case check_state_idle:
2380 clear_bit(STRIPE_OP_CHECK, &sh->ops.ack); 2259 /* start a new check operation if there are no failures */
2381 clear_bit(STRIPE_OP_CHECK, &sh->ops.pending);
2382 if (s->failed == 0) { 2260 if (s->failed == 0) {
2383 if (sh->ops.zero_sum_result == 0)
2384 /* parity is correct (on disc,
2385 * not in buffer any more)
2386 */
2387 set_bit(STRIPE_INSYNC, &sh->state);
2388 else {
2389 conf->mddev->resync_mismatches +=
2390 STRIPE_SECTORS;
2391 if (test_bit(
2392 MD_RECOVERY_CHECK, &conf->mddev->recovery))
2393 /* don't try to repair!! */
2394 set_bit(STRIPE_INSYNC, &sh->state);
2395 else {
2396 set_bit(STRIPE_OP_COMPUTE_BLK,
2397 &sh->ops.pending);
2398 set_bit(STRIPE_OP_MOD_REPAIR_PD,
2399 &sh->ops.pending);
2400 set_bit(R5_Wantcompute,
2401 &sh->dev[sh->pd_idx].flags);
2402 sh->ops.target = sh->pd_idx;
2403 sh->ops.count++;
2404 s->uptodate++;
2405 }
2406 }
2407 } else
2408 canceled_check = 1; /* STRIPE_INSYNC is not set */
2409 }
2410
2411 /* start a new check operation if there are no failures, the stripe is
2412 * not insync, and a repair is not in flight
2413 */
2414 if (s->failed == 0 &&
2415 !test_bit(STRIPE_INSYNC, &sh->state) &&
2416 !test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) {
2417 if (!test_and_set_bit(STRIPE_OP_CHECK, &sh->ops.pending)) {
2418 BUG_ON(s->uptodate != disks); 2261 BUG_ON(s->uptodate != disks);
2262 sh->check_state = check_state_run;
2263 set_bit(STRIPE_OP_CHECK, &s->ops_request);
2419 clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags); 2264 clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags);
2420 sh->ops.count++;
2421 s->uptodate--; 2265 s->uptodate--;
2266 break;
2422 } 2267 }
2423 } 2268 dev = &sh->dev[s->failed_num];
2424 2269 /* fall through */
2425 /* check if we can clear a parity disk reconstruct */ 2270 case check_state_compute_result:
2426 if (test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete) && 2271 sh->check_state = check_state_idle;
2427 test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) { 2272 if (!dev)
2428 2273 dev = &sh->dev[sh->pd_idx];
2429 clear_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending); 2274
2430 clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete); 2275 /* check that a write has not made the stripe insync */
2431 clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.ack); 2276 if (test_bit(STRIPE_INSYNC, &sh->state))
2432 clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending); 2277 break;
2433 }
2434
2435 2278
2436 /* Wait for check parity and compute block operations to complete
2437 * before write-back. If a failure occurred while the check operation
2438 * was in flight we need to cycle this stripe through handle_stripe
2439 * since the parity block may not be uptodate
2440 */
2441 if (!canceled_check && !test_bit(STRIPE_INSYNC, &sh->state) &&
2442 !test_bit(STRIPE_OP_CHECK, &sh->ops.pending) &&
2443 !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) {
2444 struct r5dev *dev;
2445 /* either failed parity check, or recovery is happening */ 2279 /* either failed parity check, or recovery is happening */
2446 if (s->failed == 0)
2447 s->failed_num = sh->pd_idx;
2448 dev = &sh->dev[s->failed_num];
2449 BUG_ON(!test_bit(R5_UPTODATE, &dev->flags)); 2280 BUG_ON(!test_bit(R5_UPTODATE, &dev->flags));
2450 BUG_ON(s->uptodate != disks); 2281 BUG_ON(s->uptodate != disks);
2451 2282
2452 set_bit(R5_LOCKED, &dev->flags); 2283 set_bit(R5_LOCKED, &dev->flags);
2284 s->locked++;
2453 set_bit(R5_Wantwrite, &dev->flags); 2285 set_bit(R5_Wantwrite, &dev->flags);
2454 if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
2455 sh->ops.count++;
2456 2286
2457 clear_bit(STRIPE_DEGRADED, &sh->state); 2287 clear_bit(STRIPE_DEGRADED, &sh->state);
2458 s->locked++;
2459 set_bit(STRIPE_INSYNC, &sh->state); 2288 set_bit(STRIPE_INSYNC, &sh->state);
2289 break;
2290 case check_state_run:
2291 break; /* we will be called again upon completion */
2292 case check_state_check_result:
2293 sh->check_state = check_state_idle;
2294
2295 /* if a failure occurred during the check operation, leave
2296 * STRIPE_INSYNC not set and let the stripe be handled again
2297 */
2298 if (s->failed)
2299 break;
2300
2301 /* handle a successful check operation, if parity is correct
2302 * we are done. Otherwise update the mismatch count and repair
2303 * parity if !MD_RECOVERY_CHECK
2304 */
2305 if (sh->ops.zero_sum_result == 0)
2306 /* parity is correct (on disc,
2307 * not in buffer any more)
2308 */
2309 set_bit(STRIPE_INSYNC, &sh->state);
2310 else {
2311 conf->mddev->resync_mismatches += STRIPE_SECTORS;
2312 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
2313 /* don't try to repair!! */
2314 set_bit(STRIPE_INSYNC, &sh->state);
2315 else {
2316 sh->check_state = check_state_compute_run;
2317 set_bit(STRIPE_COMPUTE_RUN, &sh->state);
2318 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
2319 set_bit(R5_Wantcompute,
2320 &sh->dev[sh->pd_idx].flags);
2321 sh->ops.target = sh->pd_idx;
2322 s->uptodate++;
2323 }
2324 }
2325 break;
2326 case check_state_compute_run:
2327 break;
2328 default:
2329 printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n",
2330 __func__, sh->check_state,
2331 (unsigned long long) sh->sector);
2332 BUG();
2460 } 2333 }
2461} 2334}
2462 2335
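handle_parity_checks5() becomes an explicit state machine driven by sh->check_state rather than by the old CHECK and MOD_REPAIR_PD pending bits; a failed device makes the idle case fall straight through to compute_result so the rebuilt block is written back. A compact summary of the transitions implemented above (the *_run states just wait for their async callbacks):

	/*
	 * check_state_idle           -> check_state_run
	 *	(no failures: queue STRIPE_OP_CHECK, un-uptodate the parity block)
	 * check_state_run            -> check_state_check_result
	 *	(set by ops_complete_check)
	 * check_state_check_result   -> idle                     parity correct, or
	 *                            -> check_state_compute_run
	 *	(mismatch and not MD_RECOVERY_CHECK: recompute parity in memory)
	 * check_state_compute_run    -> check_state_compute_result
	 *	(set by ops_complete_compute5)
	 * check_state_compute_result -> idle
	 *	(write the repaired block out and mark the stripe in sync)
	 */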
@@ -2641,15 +2514,14 @@ static void handle_stripe5(struct stripe_head *sh)
2641 struct bio *return_bi = NULL; 2514 struct bio *return_bi = NULL;
2642 struct stripe_head_state s; 2515 struct stripe_head_state s;
2643 struct r5dev *dev; 2516 struct r5dev *dev;
2644 unsigned long pending = 0;
2645 mdk_rdev_t *blocked_rdev = NULL; 2517 mdk_rdev_t *blocked_rdev = NULL;
2646 int prexor; 2518 int prexor;
2647 2519
2648 memset(&s, 0, sizeof(s)); 2520 memset(&s, 0, sizeof(s));
2649 pr_debug("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d " 2521 pr_debug("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d check:%d "
2650 "ops=%lx:%lx:%lx\n", (unsigned long long)sh->sector, sh->state, 2522 "reconstruct:%d\n", (unsigned long long)sh->sector, sh->state,
2651 atomic_read(&sh->count), sh->pd_idx, 2523 atomic_read(&sh->count), sh->pd_idx, sh->check_state,
2652 sh->ops.pending, sh->ops.ack, sh->ops.complete); 2524 sh->reconstruct_state);
2653 2525
2654 spin_lock(&sh->lock); 2526 spin_lock(&sh->lock);
2655 clear_bit(STRIPE_HANDLE, &sh->state); 2527 clear_bit(STRIPE_HANDLE, &sh->state);
@@ -2658,15 +2530,8 @@ static void handle_stripe5(struct stripe_head *sh)
2658 s.syncing = test_bit(STRIPE_SYNCING, &sh->state); 2530 s.syncing = test_bit(STRIPE_SYNCING, &sh->state);
2659 s.expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); 2531 s.expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
2660 s.expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); 2532 s.expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
2661 /* Now to look around and see what can be done */
2662
2663 /* clean-up completed biofill operations */
2664 if (test_bit(STRIPE_OP_BIOFILL, &sh->ops.complete)) {
2665 clear_bit(STRIPE_OP_BIOFILL, &sh->ops.pending);
2666 clear_bit(STRIPE_OP_BIOFILL, &sh->ops.ack);
2667 clear_bit(STRIPE_OP_BIOFILL, &sh->ops.complete);
2668 }
2669 2533
2534 /* Now to look around and see what can be done */
2670 rcu_read_lock(); 2535 rcu_read_lock();
2671 for (i=disks; i--; ) { 2536 for (i=disks; i--; ) {
2672 mdk_rdev_t *rdev; 2537 mdk_rdev_t *rdev;
@@ -2680,10 +2545,10 @@ static void handle_stripe5(struct stripe_head *sh)
2680 /* maybe we can request a biofill operation 2545 /* maybe we can request a biofill operation
2681 * 2546 *
2682 * new wantfill requests are only permitted while 2547 * new wantfill requests are only permitted while
2683 * STRIPE_OP_BIOFILL is clear 2548 * ops_complete_biofill is guaranteed to be inactive
2684 */ 2549 */
2685 if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread && 2550 if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread &&
2686 !test_bit(STRIPE_OP_BIOFILL, &sh->ops.pending)) 2551 !test_bit(STRIPE_BIOFILL_RUN, &sh->state))
2687 set_bit(R5_Wantfill, &dev->flags); 2552 set_bit(R5_Wantfill, &dev->flags);
2688 2553
2689 /* now count some things */ 2554 /* now count some things */
@@ -2727,8 +2592,10 @@ static void handle_stripe5(struct stripe_head *sh)
2727 goto unlock; 2592 goto unlock;
2728 } 2593 }
2729 2594
2730 if (s.to_fill && !test_and_set_bit(STRIPE_OP_BIOFILL, &sh->ops.pending)) 2595 if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) {
2731 sh->ops.count++; 2596 set_bit(STRIPE_OP_BIOFILL, &s.ops_request);
2597 set_bit(STRIPE_BIOFILL_RUN, &sh->state);
2598 }
2732 2599
2733 pr_debug("locked=%d uptodate=%d to_read=%d" 2600 pr_debug("locked=%d uptodate=%d to_read=%d"
2734 " to_write=%d failed=%d failed_num=%d\n", 2601 " to_write=%d failed=%d failed_num=%d\n",
@@ -2738,8 +2605,7 @@ static void handle_stripe5(struct stripe_head *sh)
2738 * need to be failed 2605 * need to be failed
2739 */ 2606 */
2740 if (s.failed > 1 && s.to_read+s.to_write+s.written) 2607 if (s.failed > 1 && s.to_read+s.to_write+s.written)
2741 handle_requests_to_failed_array(conf, sh, &s, disks, 2608 handle_failed_stripe(conf, sh, &s, disks, &return_bi);
2742 &return_bi);
2743 if (s.failed > 1 && s.syncing) { 2609 if (s.failed > 1 && s.syncing) {
2744 md_done_sync(conf->mddev, STRIPE_SECTORS,0); 2610 md_done_sync(conf->mddev, STRIPE_SECTORS,0);
2745 clear_bit(STRIPE_SYNCING, &sh->state); 2611 clear_bit(STRIPE_SYNCING, &sh->state);
@@ -2755,48 +2621,25 @@ static void handle_stripe5(struct stripe_head *sh)
2755 !test_bit(R5_LOCKED, &dev->flags) && 2621 !test_bit(R5_LOCKED, &dev->flags) &&
2756 test_bit(R5_UPTODATE, &dev->flags)) || 2622 test_bit(R5_UPTODATE, &dev->flags)) ||
2757 (s.failed == 1 && s.failed_num == sh->pd_idx))) 2623 (s.failed == 1 && s.failed_num == sh->pd_idx)))
2758 handle_completed_write_requests(conf, sh, disks, &return_bi); 2624 handle_stripe_clean_event(conf, sh, disks, &return_bi);
2759 2625
2760 /* Now we might consider reading some blocks, either to check/generate 2626 /* Now we might consider reading some blocks, either to check/generate
2761 * parity, or to satisfy requests 2627 * parity, or to satisfy requests
2762 * or to load a block that is being partially written. 2628 * or to load a block that is being partially written.
2763 */ 2629 */
2764 if (s.to_read || s.non_overwrite || 2630 if (s.to_read || s.non_overwrite ||
2765 (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding || 2631 (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding)
2766 test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) 2632 handle_stripe_fill5(sh, &s, disks);
2767 handle_issuing_new_read_requests5(sh, &s, disks);
2768 2633
2769 /* Now we check to see if any write operations have recently 2634 /* Now we check to see if any write operations have recently
2770 * completed 2635 * completed
2771 */ 2636 */
2772
2773 /* leave prexor set until postxor is done, allows us to distinguish
2774 * a rmw from a rcw during biodrain
2775 */
2776 prexor = 0; 2637 prexor = 0;
2777 if (test_bit(STRIPE_OP_PREXOR, &sh->ops.complete) && 2638 if (sh->reconstruct_state == reconstruct_state_prexor_drain_result)
2778 test_bit(STRIPE_OP_POSTXOR, &sh->ops.complete)) {
2779
2780 prexor = 1; 2639 prexor = 1;
2781 clear_bit(STRIPE_OP_PREXOR, &sh->ops.complete); 2640 if (sh->reconstruct_state == reconstruct_state_drain_result ||
2782 clear_bit(STRIPE_OP_PREXOR, &sh->ops.ack); 2641 sh->reconstruct_state == reconstruct_state_prexor_drain_result) {
2783 clear_bit(STRIPE_OP_PREXOR, &sh->ops.pending); 2642 sh->reconstruct_state = reconstruct_state_idle;
2784
2785 for (i = disks; i--; )
2786 clear_bit(R5_Wantprexor, &sh->dev[i].flags);
2787 }
2788
2789 /* if only POSTXOR is set then this is an 'expand' postxor */
2790 if (test_bit(STRIPE_OP_BIODRAIN, &sh->ops.complete) &&
2791 test_bit(STRIPE_OP_POSTXOR, &sh->ops.complete)) {
2792
2793 clear_bit(STRIPE_OP_BIODRAIN, &sh->ops.complete);
2794 clear_bit(STRIPE_OP_BIODRAIN, &sh->ops.ack);
2795 clear_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending);
2796
2797 clear_bit(STRIPE_OP_POSTXOR, &sh->ops.complete);
2798 clear_bit(STRIPE_OP_POSTXOR, &sh->ops.ack);
2799 clear_bit(STRIPE_OP_POSTXOR, &sh->ops.pending);
2800 2643
2801 /* All the 'written' buffers and the parity block are ready to 2644 /* All the 'written' buffers and the parity block are ready to
2802 * be written back to disk 2645 * be written back to disk
@@ -2808,9 +2651,6 @@ static void handle_stripe5(struct stripe_head *sh)
2808 (i == sh->pd_idx || dev->written)) { 2651 (i == sh->pd_idx || dev->written)) {
2809 pr_debug("Writing block %d\n", i); 2652 pr_debug("Writing block %d\n", i);
2810 set_bit(R5_Wantwrite, &dev->flags); 2653 set_bit(R5_Wantwrite, &dev->flags);
2811 if (!test_and_set_bit(
2812 STRIPE_OP_IO, &sh->ops.pending))
2813 sh->ops.count++;
2814 if (prexor) 2654 if (prexor)
2815 continue; 2655 continue;
2816 if (!test_bit(R5_Insync, &dev->flags) || 2656 if (!test_bit(R5_Insync, &dev->flags) ||
@@ -2832,20 +2672,18 @@ static void handle_stripe5(struct stripe_head *sh)
2832 * 2/ A 'check' operation is in flight, as it may clobber the parity 2672 * 2/ A 'check' operation is in flight, as it may clobber the parity
2833 * block. 2673 * block.
2834 */ 2674 */
2835 if (s.to_write && !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending) && 2675 if (s.to_write && !sh->reconstruct_state && !sh->check_state)
2836 !test_bit(STRIPE_OP_CHECK, &sh->ops.pending)) 2676 handle_stripe_dirtying5(conf, sh, &s, disks);
2837 handle_issuing_new_write_requests5(conf, sh, &s, disks);
2838 2677
2839 /* maybe we need to check and possibly fix the parity for this stripe 2678 /* maybe we need to check and possibly fix the parity for this stripe
2840 * Any reads will already have been scheduled, so we just see if enough 2679 * Any reads will already have been scheduled, so we just see if enough
2841 * data is available. The parity check is held off while parity 2680 * data is available. The parity check is held off while parity
2842 * dependent operations are in flight. 2681 * dependent operations are in flight.
2843 */ 2682 */
2844 if ((s.syncing && s.locked == 0 && 2683 if (sh->check_state ||
2845 !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending) && 2684 (s.syncing && s.locked == 0 &&
2846 !test_bit(STRIPE_INSYNC, &sh->state)) || 2685 !test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
2847 test_bit(STRIPE_OP_CHECK, &sh->ops.pending) || 2686 !test_bit(STRIPE_INSYNC, &sh->state)))
2848 test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending))
2849 handle_parity_checks5(conf, sh, &s, disks); 2687 handle_parity_checks5(conf, sh, &s, disks);
2850 2688
2851 if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { 2689 if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
@@ -2864,52 +2702,35 @@ static void handle_stripe5(struct stripe_head *sh)
2864 dev = &sh->dev[s.failed_num]; 2702 dev = &sh->dev[s.failed_num];
2865 if (!test_bit(R5_ReWrite, &dev->flags)) { 2703 if (!test_bit(R5_ReWrite, &dev->flags)) {
2866 set_bit(R5_Wantwrite, &dev->flags); 2704 set_bit(R5_Wantwrite, &dev->flags);
2867 if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
2868 sh->ops.count++;
2869 set_bit(R5_ReWrite, &dev->flags); 2705 set_bit(R5_ReWrite, &dev->flags);
2870 set_bit(R5_LOCKED, &dev->flags); 2706 set_bit(R5_LOCKED, &dev->flags);
2871 s.locked++; 2707 s.locked++;
2872 } else { 2708 } else {
2873 /* let's read it back */ 2709 /* let's read it back */
2874 set_bit(R5_Wantread, &dev->flags); 2710 set_bit(R5_Wantread, &dev->flags);
2875 if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
2876 sh->ops.count++;
2877 set_bit(R5_LOCKED, &dev->flags); 2711 set_bit(R5_LOCKED, &dev->flags);
2878 s.locked++; 2712 s.locked++;
2879 } 2713 }
2880 } 2714 }
2881 2715
2882 /* Finish postxor operations initiated by the expansion 2716 /* Finish reconstruct operations initiated by the expansion process */
2883 * process 2717 if (sh->reconstruct_state == reconstruct_state_result) {
2884 */ 2718 sh->reconstruct_state = reconstruct_state_idle;
2885 if (test_bit(STRIPE_OP_POSTXOR, &sh->ops.complete) &&
2886 !test_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending)) {
2887
2888 clear_bit(STRIPE_EXPANDING, &sh->state); 2719 clear_bit(STRIPE_EXPANDING, &sh->state);
2889 2720 for (i = conf->raid_disks; i--; )
2890 clear_bit(STRIPE_OP_POSTXOR, &sh->ops.pending);
2891 clear_bit(STRIPE_OP_POSTXOR, &sh->ops.ack);
2892 clear_bit(STRIPE_OP_POSTXOR, &sh->ops.complete);
2893
2894 for (i = conf->raid_disks; i--; ) {
2895 set_bit(R5_Wantwrite, &sh->dev[i].flags); 2721 set_bit(R5_Wantwrite, &sh->dev[i].flags);
2896 set_bit(R5_LOCKED, &dev->flags); 2722 set_bit(R5_LOCKED, &dev->flags);
2897 s.locked++; 2723 s.locked++;
2898 if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
2899 sh->ops.count++;
2900 }
2901 } 2724 }
2902 2725
2903 if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) && 2726 if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) &&
2904 !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) { 2727 !sh->reconstruct_state) {
2905 /* Need to write out all blocks after computing parity */ 2728 /* Need to write out all blocks after computing parity */
2906 sh->disks = conf->raid_disks; 2729 sh->disks = conf->raid_disks;
2907 sh->pd_idx = stripe_to_pdidx(sh->sector, conf, 2730 sh->pd_idx = stripe_to_pdidx(sh->sector, conf,
2908 conf->raid_disks); 2731 conf->raid_disks);
2909 s.locked += handle_write_operations5(sh, 1, 1); 2732 schedule_reconstruction5(sh, &s, 1, 1);
2910 } else if (s.expanded && 2733 } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) {
2911 s.locked == 0 &&
2912 !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) {
2913 clear_bit(STRIPE_EXPAND_READY, &sh->state); 2734 clear_bit(STRIPE_EXPAND_READY, &sh->state);
2914 atomic_dec(&conf->reshape_stripes); 2735 atomic_dec(&conf->reshape_stripes);
2915 wake_up(&conf->wait_for_overlap); 2736 wake_up(&conf->wait_for_overlap);
@@ -2917,12 +2738,9 @@ static void handle_stripe5(struct stripe_head *sh)
2917 } 2738 }
2918 2739
2919 if (s.expanding && s.locked == 0 && 2740 if (s.expanding && s.locked == 0 &&
2920 !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) 2741 !test_bit(STRIPE_COMPUTE_RUN, &sh->state))
2921 handle_stripe_expansion(conf, sh, NULL); 2742 handle_stripe_expansion(conf, sh, NULL);
2922 2743
2923 if (sh->ops.count)
2924 pending = get_stripe_work(sh);
2925
2926 unlock: 2744 unlock:
2927 spin_unlock(&sh->lock); 2745 spin_unlock(&sh->lock);
2928 2746
@@ -2930,11 +2748,12 @@ static void handle_stripe5(struct stripe_head *sh)
2930 if (unlikely(blocked_rdev)) 2748 if (unlikely(blocked_rdev))
2931 md_wait_for_blocked_rdev(blocked_rdev, conf->mddev); 2749 md_wait_for_blocked_rdev(blocked_rdev, conf->mddev);
2932 2750
2933 if (pending) 2751 if (s.ops_request)
2934 raid5_run_ops(sh, pending); 2752 raid5_run_ops(sh, s.ops_request);
2935 2753
2936 return_io(return_bi); 2754 ops_run_io(sh, &s);
2937 2755
2756 return_io(return_bi);
2938} 2757}
2939 2758
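With the per-pass ops_request in place, handle_stripe5() makes all of its decisions under sh->lock and only starts work after dropping it; handle_stripe6() likewise routes its submissions through ops_run_io() further down. Paraphrasing the new epilogue of handle_stripe5():

	spin_unlock(&sh->lock);

	/* wait outside the lock if a blocked rdev held things up */
	if (unlikely(blocked_rdev))
		md_wait_for_blocked_rdev(blocked_rdev, conf->mddev);

	if (s.ops_request)
		raid5_run_ops(sh, s.ops_request);	/* async copy/xor/check work */

	ops_run_io(sh, &s);				/* submit the disk reads/writes */

	return_io(return_bi);				/* complete the bios we finished */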
2940static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) 2759static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
@@ -3042,8 +2861,7 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
3042 * might need to be failed 2861 * might need to be failed
3043 */ 2862 */
3044 if (s.failed > 2 && s.to_read+s.to_write+s.written) 2863 if (s.failed > 2 && s.to_read+s.to_write+s.written)
3045 handle_requests_to_failed_array(conf, sh, &s, disks, 2864 handle_failed_stripe(conf, sh, &s, disks, &return_bi);
3046 &return_bi);
3047 if (s.failed > 2 && s.syncing) { 2865 if (s.failed > 2 && s.syncing) {
3048 md_done_sync(conf->mddev, STRIPE_SECTORS,0); 2866 md_done_sync(conf->mddev, STRIPE_SECTORS,0);
3049 clear_bit(STRIPE_SYNCING, &sh->state); 2867 clear_bit(STRIPE_SYNCING, &sh->state);
@@ -3068,7 +2886,7 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
3068 ( r6s.q_failed || ((test_bit(R5_Insync, &qdev->flags) 2886 ( r6s.q_failed || ((test_bit(R5_Insync, &qdev->flags)
3069 && !test_bit(R5_LOCKED, &qdev->flags) 2887 && !test_bit(R5_LOCKED, &qdev->flags)
3070 && test_bit(R5_UPTODATE, &qdev->flags))))) 2888 && test_bit(R5_UPTODATE, &qdev->flags)))))
3071 handle_completed_write_requests(conf, sh, disks, &return_bi); 2889 handle_stripe_clean_event(conf, sh, disks, &return_bi);
3072 2890
3073 /* Now we might consider reading some blocks, either to check/generate 2891 /* Now we might consider reading some blocks, either to check/generate
3074 * parity, or to satisfy requests 2892 * parity, or to satisfy requests
@@ -3076,11 +2894,11 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
3076 */ 2894 */
3077 if (s.to_read || s.non_overwrite || (s.to_write && s.failed) || 2895 if (s.to_read || s.non_overwrite || (s.to_write && s.failed) ||
3078 (s.syncing && (s.uptodate < disks)) || s.expanding) 2896 (s.syncing && (s.uptodate < disks)) || s.expanding)
3079 handle_issuing_new_read_requests6(sh, &s, &r6s, disks); 2897 handle_stripe_fill6(sh, &s, &r6s, disks);
3080 2898
3081 /* now to consider writing and what else, if anything should be read */ 2899 /* now to consider writing and what else, if anything should be read */
3082 if (s.to_write) 2900 if (s.to_write)
3083 handle_issuing_new_write_requests6(conf, sh, &s, &r6s, disks); 2901 handle_stripe_dirtying6(conf, sh, &s, &r6s, disks);
3084 2902
3085 /* maybe we need to check and possibly fix the parity for this stripe 2903 /* maybe we need to check and possibly fix the parity for this stripe
3086 * Any reads will already have been scheduled, so we just see if enough 2904 * Any reads will already have been scheduled, so we just see if enough
@@ -3136,7 +2954,7 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
3136 } 2954 }
3137 2955
3138 if (s.expanding && s.locked == 0 && 2956 if (s.expanding && s.locked == 0 &&
3139 !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) 2957 !test_bit(STRIPE_COMPUTE_RUN, &sh->state))
3140 handle_stripe_expansion(conf, sh, &r6s); 2958 handle_stripe_expansion(conf, sh, &r6s);
3141 2959
3142 unlock: 2960 unlock:
@@ -3146,68 +2964,9 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
3146 if (unlikely(blocked_rdev)) 2964 if (unlikely(blocked_rdev))
3147 md_wait_for_blocked_rdev(blocked_rdev, conf->mddev); 2965 md_wait_for_blocked_rdev(blocked_rdev, conf->mddev);
3148 2966
3149 return_io(return_bi); 2967 ops_run_io(sh, &s);
3150
3151 for (i=disks; i-- ;) {
3152 int rw;
3153 struct bio *bi;
3154 mdk_rdev_t *rdev;
3155 if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags))
3156 rw = WRITE;
3157 else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
3158 rw = READ;
3159 else
3160 continue;
3161
3162 set_bit(STRIPE_IO_STARTED, &sh->state);
3163
3164 bi = &sh->dev[i].req;
3165
3166 bi->bi_rw = rw;
3167 if (rw == WRITE)
3168 bi->bi_end_io = raid5_end_write_request;
3169 else
3170 bi->bi_end_io = raid5_end_read_request;
3171
3172 rcu_read_lock();
3173 rdev = rcu_dereference(conf->disks[i].rdev);
3174 if (rdev && test_bit(Faulty, &rdev->flags))
3175 rdev = NULL;
3176 if (rdev)
3177 atomic_inc(&rdev->nr_pending);
3178 rcu_read_unlock();
3179 2968
3180 if (rdev) { 2969 return_io(return_bi);
3181 if (s.syncing || s.expanding || s.expanded)
3182 md_sync_acct(rdev->bdev, STRIPE_SECTORS);
3183
3184 bi->bi_bdev = rdev->bdev;
3185 pr_debug("for %llu schedule op %ld on disc %d\n",
3186 (unsigned long long)sh->sector, bi->bi_rw, i);
3187 atomic_inc(&sh->count);
3188 bi->bi_sector = sh->sector + rdev->data_offset;
3189 bi->bi_flags = 1 << BIO_UPTODATE;
3190 bi->bi_vcnt = 1;
3191 bi->bi_max_vecs = 1;
3192 bi->bi_idx = 0;
3193 bi->bi_io_vec = &sh->dev[i].vec;
3194 bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
3195 bi->bi_io_vec[0].bv_offset = 0;
3196 bi->bi_size = STRIPE_SIZE;
3197 bi->bi_next = NULL;
3198 if (rw == WRITE &&
3199 test_bit(R5_ReWrite, &sh->dev[i].flags))
3200 atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
3201 generic_make_request(bi);
3202 } else {
3203 if (rw == WRITE)
3204 set_bit(STRIPE_DEGRADED, &sh->state);
3205 pr_debug("skip op %ld on disc %d for sector %llu\n",
3206 bi->bi_rw, i, (unsigned long long)sh->sector);
3207 clear_bit(R5_LOCKED, &sh->dev[i].flags);
3208 set_bit(STRIPE_HANDLE, &sh->state);
3209 }
3210 }
3211} 2970}
3212 2971
3213static void handle_stripe(struct stripe_head *sh, struct page *tmp_page) 2972static void handle_stripe(struct stripe_head *sh, struct page *tmp_page)
@@ -3695,9 +3454,7 @@ static int make_request(struct request_queue *q, struct bio * bi)
3695 if ( rw == WRITE ) 3454 if ( rw == WRITE )
3696 md_write_end(mddev); 3455 md_write_end(mddev);
3697 3456
3698 bi->bi_end_io(bi, 3457 bio_endio(bi, 0);
3699 test_bit(BIO_UPTODATE, &bi->bi_flags)
3700 ? 0 : -EIO);
3701 } 3458 }
3702 return 0; 3459 return 0;
3703} 3460}
@@ -4000,12 +3757,8 @@ static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio)
4000 spin_lock_irq(&conf->device_lock); 3757 spin_lock_irq(&conf->device_lock);
4001 remaining = --raid_bio->bi_phys_segments; 3758 remaining = --raid_bio->bi_phys_segments;
4002 spin_unlock_irq(&conf->device_lock); 3759 spin_unlock_irq(&conf->device_lock);
4003 if (remaining == 0) { 3760 if (remaining == 0)
4004 3761 bio_endio(raid_bio, 0);
4005 raid_bio->bi_end_io(raid_bio,
4006 test_bit(BIO_UPTODATE, &raid_bio->bi_flags)
4007 ? 0 : -EIO);
4008 }
4009 if (atomic_dec_and_test(&conf->active_aligned_reads)) 3762 if (atomic_dec_and_test(&conf->active_aligned_reads))
4010 wake_up(&conf->wait_for_stripe); 3763 wake_up(&conf->wait_for_stripe);
4011 return handled; 3764 return handled;
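Both completion paths drop the open-coded call through bi->bi_end_io() and use bio_endio() directly; bio_endio() already converts a cleared BIO_UPTODATE flag into -EIO before invoking the end_io method, so the explicit test the old code carried is redundant. The resulting pattern, as used in make_request() above as well:

	if (remaining == 0)
		bio_endio(raid_bio, 0);	/* error is derived from BIO_UPTODATE */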
@@ -4092,6 +3845,8 @@ raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len)
4092{ 3845{
4093 raid5_conf_t *conf = mddev_to_conf(mddev); 3846 raid5_conf_t *conf = mddev_to_conf(mddev);
4094 unsigned long new; 3847 unsigned long new;
3848 int err;
3849
4095 if (len >= PAGE_SIZE) 3850 if (len >= PAGE_SIZE)
4096 return -EINVAL; 3851 return -EINVAL;
4097 if (!conf) 3852 if (!conf)
@@ -4107,7 +3862,9 @@ raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len)
4107 else 3862 else
4108 break; 3863 break;
4109 } 3864 }
4110 md_allow_write(mddev); 3865 err = md_allow_write(mddev);
3866 if (err)
3867 return err;
4111 while (new > conf->max_nr_stripes) { 3868 while (new > conf->max_nr_stripes) {
4112 if (grow_one_stripe(conf)) 3869 if (grow_one_stripe(conf))
4113 conf->max_nr_stripes++; 3870 conf->max_nr_stripes++;
@@ -4607,35 +4364,41 @@ abort:
4607static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) 4364static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
4608{ 4365{
4609 raid5_conf_t *conf = mddev->private; 4366 raid5_conf_t *conf = mddev->private;
4610 int found = 0; 4367 int err = -EEXIST;
4611 int disk; 4368 int disk;
4612 struct disk_info *p; 4369 struct disk_info *p;
4370 int first = 0;
4371 int last = conf->raid_disks - 1;
4613 4372
4614 if (mddev->degraded > conf->max_degraded) 4373 if (mddev->degraded > conf->max_degraded)
4615 /* no point adding a device */ 4374 /* no point adding a device */
4616 return 0; 4375 return -EINVAL;
4376
4377 if (rdev->raid_disk >= 0)
4378 first = last = rdev->raid_disk;
4617 4379
4618 /* 4380 /*
4619 * find the disk ... but prefer rdev->saved_raid_disk 4381 * find the disk ... but prefer rdev->saved_raid_disk
4620 * if possible. 4382 * if possible.
4621 */ 4383 */
4622 if (rdev->saved_raid_disk >= 0 && 4384 if (rdev->saved_raid_disk >= 0 &&
4385 rdev->saved_raid_disk >= first &&
4623 conf->disks[rdev->saved_raid_disk].rdev == NULL) 4386 conf->disks[rdev->saved_raid_disk].rdev == NULL)
4624 disk = rdev->saved_raid_disk; 4387 disk = rdev->saved_raid_disk;
4625 else 4388 else
4626 disk = 0; 4389 disk = first;
4627 for ( ; disk < conf->raid_disks; disk++) 4390 for ( ; disk <= last ; disk++)
4628 if ((p=conf->disks + disk)->rdev == NULL) { 4391 if ((p=conf->disks + disk)->rdev == NULL) {
4629 clear_bit(In_sync, &rdev->flags); 4392 clear_bit(In_sync, &rdev->flags);
4630 rdev->raid_disk = disk; 4393 rdev->raid_disk = disk;
4631 found = 1; 4394 err = 0;
4632 if (rdev->saved_raid_disk != disk) 4395 if (rdev->saved_raid_disk != disk)
4633 conf->fullsync = 1; 4396 conf->fullsync = 1;
4634 rcu_assign_pointer(p->rdev, rdev); 4397 rcu_assign_pointer(p->rdev, rdev);
4635 break; 4398 break;
4636 } 4399 }
4637 print_raid5_conf(conf); 4400 print_raid5_conf(conf);
4638 return found; 4401 return err;
4639} 4402}
4640 4403
4641static int raid5_resize(mddev_t *mddev, sector_t sectors) 4404static int raid5_resize(mddev_t *mddev, sector_t sectors)
@@ -4736,7 +4499,7 @@ static int raid5_start_reshape(mddev_t *mddev)
4736 rdev_for_each(rdev, rtmp, mddev) 4499 rdev_for_each(rdev, rtmp, mddev)
4737 if (rdev->raid_disk < 0 && 4500 if (rdev->raid_disk < 0 &&
4738 !test_bit(Faulty, &rdev->flags)) { 4501 !test_bit(Faulty, &rdev->flags)) {
4739 if (raid5_add_disk(mddev, rdev)) { 4502 if (raid5_add_disk(mddev, rdev) == 0) {
4740 char nm[20]; 4503 char nm[20];
4741 set_bit(In_sync, &rdev->flags); 4504 set_bit(In_sync, &rdev->flags);
4742 added_devices++; 4505 added_devices++;
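raid5_add_disk() now follows the usual errno convention: 0 on success, -EINVAL when the array is already too degraded to benefit, -EEXIST when no candidate slot is free; a pre-assigned rdev->raid_disk narrows the search window to that single slot. The raid5_start_reshape() hunk above shows the matching caller change from truth-testing to "== 0". A small userspace sketch of the slot search and return convention (types and names are simplified stand-ins):

/* Userspace sketch of the new raid5_add_disk() return convention:
 * 0 on success, -EEXIST if no slot in [first, last] is free.
 * Types and names are simplified stand-ins. */
#include <errno.h>
#include <stdio.h>

#define NR_DISKS 4

struct slot { int used; };

static int add_disk(struct slot *slots, int preassigned)
{
        int first = 0, last = NR_DISKS - 1;
        int disk;

        if (preassigned >= 0)
                first = last = preassigned;  /* only that slot is acceptable */

        for (disk = first; disk <= last; disk++)
                if (!slots[disk].used) {
                        slots[disk].used = 1;
                        return 0;            /* success, not "found = 1" */
                }
        return -EEXIST;
}

int main(void)
{
        struct slot slots[NR_DISKS] = { {1}, {0}, {1}, {1} };

        /* caller style after this patch: check for == 0 */
        if (add_disk(slots, -1) == 0)
                printf("added to a free slot\n");
        if (add_disk(slots, 2) != 0)
                printf("slot 2 already in use: -EEXIST\n");
        return 0;
}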
diff --git a/include/linux/raid/bitmap.h b/include/linux/raid/bitmap.h
index 78bfdea24a8e..e98900671ca9 100644
--- a/include/linux/raid/bitmap.h
+++ b/include/linux/raid/bitmap.h
@@ -221,6 +221,7 @@ struct bitmap {
221 unsigned long syncchunk; 221 unsigned long syncchunk;
222 222
223 __u64 events_cleared; 223 __u64 events_cleared;
224 int need_sync;
224 225
225 /* bitmap spinlock */ 226 /* bitmap spinlock */
226 spinlock_t lock; 227 spinlock_t lock;
diff --git a/include/linux/raid/md.h b/include/linux/raid/md.h
index b7386ae9d288..dc0e3fcb9f28 100644
--- a/include/linux/raid/md.h
+++ b/include/linux/raid/md.h
@@ -95,7 +95,7 @@ extern int sync_page_io(struct block_device *bdev, sector_t sector, int size,
95 struct page *page, int rw); 95 struct page *page, int rw);
96extern void md_do_sync(mddev_t *mddev); 96extern void md_do_sync(mddev_t *mddev);
97extern void md_new_event(mddev_t *mddev); 97extern void md_new_event(mddev_t *mddev);
98extern void md_allow_write(mddev_t *mddev); 98extern int md_allow_write(mddev_t *mddev);
99extern void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev); 99extern void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev);
100 100
101#endif /* CONFIG_MD */ 101#endif /* CONFIG_MD */
diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h
index 3dea9f545c8f..df30c4395875 100644
--- a/include/linux/raid/md_k.h
+++ b/include/linux/raid/md_k.h
@@ -87,6 +87,9 @@ struct mdk_rdev_s
 87#define Blocked 8 /* An error occurred on an externally 87#define Blocked 8 /* An error occurred on an externally
88 * managed array, don't allow writes 88 * managed array, don't allow writes
89 * until it is cleared */ 89 * until it is cleared */
90#define StateChanged 9 /* Faulty or Blocked has changed during
91 * interrupt, so it needs to be
92 * notified by the thread */
90 wait_queue_head_t blocked_wait; 93 wait_queue_head_t blocked_wait;
91 94
92 int desc_nr; /* descriptor index in the superblock */ 95 int desc_nr; /* descriptor index in the superblock */
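The new StateChanged flag records, from contexts that cannot notify sysfs directly (the comment says "during interrupt"), that Faulty or Blocked changed, so the md thread can emit the event later. The underlying pattern is a "note it now, act on it later" flag. A hedged userspace model using C11 atomics in place of the kernel's set_bit()/test_and_clear_bit() (all names invented):

/* "Set in awkward context, consumed later by a thread" flag,
 * the pattern behind the new StateChanged bit.  C11 atomics stand
 * in for the kernel's set_bit()/test_and_clear_bit(). */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_bool state_changed;

/* would run in error/interrupt context: just record the event */
static void mark_faulty(void)
{
        atomic_store(&state_changed, true);
}

/* would run in the per-array thread: do the notification here */
static void thread_work(void)
{
        if (atomic_exchange(&state_changed, false))
                printf("notify 'state' attribute of rdev change\n");
        else
                printf("nothing to notify\n");
}

int main(void)
{
        thread_work();   /* nothing to notify */
        mark_faulty();
        thread_work();   /* notify ... */
        return 0;
}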
@@ -188,6 +191,7 @@ struct mddev_s
188 * NEEDED: we might need to start a resync/recover 191 * NEEDED: we might need to start a resync/recover
189 * RUNNING: a thread is running, or about to be started 192 * RUNNING: a thread is running, or about to be started
190 * SYNC: actually doing a resync, not a recovery 193 * SYNC: actually doing a resync, not a recovery
194 * RECOVER: doing recovery, or need to try it.
191 * INTR: resync needs to be aborted for some reason 195 * INTR: resync needs to be aborted for some reason
192 * DONE: thread is done and is waiting to be reaped 196 * DONE: thread is done and is waiting to be reaped
193 * REQUEST: user-space has requested a sync (used with SYNC) 197 * REQUEST: user-space has requested a sync (used with SYNC)
@@ -198,6 +202,7 @@ struct mddev_s
198 */ 202 */
199#define MD_RECOVERY_RUNNING 0 203#define MD_RECOVERY_RUNNING 0
200#define MD_RECOVERY_SYNC 1 204#define MD_RECOVERY_SYNC 1
205#define MD_RECOVERY_RECOVER 2
201#define MD_RECOVERY_INTR 3 206#define MD_RECOVERY_INTR 3
202#define MD_RECOVERY_DONE 4 207#define MD_RECOVERY_DONE 4
203#define MD_RECOVERY_NEEDED 5 208#define MD_RECOVERY_NEEDED 5
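MD_RECOVERY_RECOVER marks "doing recovery, or need to try it", which is what lets the sync_action file report "recover" distinctly from "resync", as described in the documentation hunk at the top of this patch. Purely as an illustration of that kind of mapping, and not the real sync_action_show(), a simplified flags-to-string sketch:

/* Illustrative mapping from recovery bits to a sync_action-style
 * string.  Simplified and invented; not the real sync_action_show(). */
#include <stdio.h>

#define RECOVERY_RUNNING (1 << 0)
#define RECOVERY_SYNC    (1 << 1)
#define RECOVERY_RECOVER (1 << 2)

static const char *sync_action(unsigned long flags)
{
        if (!(flags & RECOVERY_RUNNING))
                return "idle";
        if (flags & RECOVERY_SYNC)
                return "resync";
        if (flags & RECOVERY_RECOVER)
                return "recover";
        return "idle";
}

int main(void)
{
        printf("%s\n", sync_action(RECOVERY_RUNNING | RECOVERY_RECOVER));
        printf("%s\n", sync_action(RECOVERY_RUNNING | RECOVERY_SYNC));
        return 0;
}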
@@ -227,6 +232,8 @@ struct mddev_s
227 atomic_t recovery_active; /* blocks scheduled, but not written */ 232 atomic_t recovery_active; /* blocks scheduled, but not written */
228 wait_queue_head_t recovery_wait; 233 wait_queue_head_t recovery_wait;
229 sector_t recovery_cp; 234 sector_t recovery_cp;
235 sector_t resync_min; /* user requested sync
236 * starts here */
230 sector_t resync_max; /* resync should pause 237 sector_t resync_max; /* resync should pause
231 * when it gets here */ 238 * when it gets here */
232 239
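resync_min joins the existing resync_max, giving a user-settable window for where a resync runs: start no earlier than resync_min, pause on reaching resync_max. A toy walk over a sector range bounded by such a window (a simplification, not md_do_sync()):

/* Toy walk of a sector range bounded by a user supplied
 * [resync_min, resync_max) window.  Simplified illustration,
 * not md_do_sync(). */
#include <stdio.h>

typedef unsigned long long sector_t;

static void sync_range(sector_t dev_size, sector_t resync_min,
                       sector_t resync_max, sector_t chunk)
{
        sector_t pos = resync_min;           /* "sync starts here" */

        while (pos < dev_size) {
                if (pos >= resync_max) {     /* "pause when it gets here" */
                        printf("paused at %llu\n", pos);
                        return;
                }
                /* ... sync one chunk starting at pos ... */
                pos += chunk;
        }
        printf("finished at %llu\n", pos);
}

int main(void)
{
        sync_range(1000, 200, 600, 100);     /* pauses at 600 */
        return 0;
}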
diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h
index f0827d31ae6f..3b2672792457 100644
--- a/include/linux/raid/raid5.h
+++ b/include/linux/raid/raid5.h
@@ -158,6 +158,43 @@
158 * the compute block completes. 158 * the compute block completes.
159 */ 159 */
160 160
161/*
162 * Operations state - intermediate states that are visible outside of sh->lock
163 * In general _idle indicates nothing is running, _run indicates a data
164 * processing operation is active, and _result means the data processing result
165 * is stable and can be acted upon. For simple operations like biofill and
166 * compute that only have an _idle and _run state they are indicated with
167 * sh->state flags (STRIPE_BIOFILL_RUN and STRIPE_COMPUTE_RUN)
168 */
169/**
170 * enum check_states - handles syncing / repairing a stripe
171 * @check_state_idle - check operations are quiesced
172 * @check_state_run - check operation is running
 173 * @check_state_check_result - set outside lock when check result is valid
174 * @check_state_compute_run - check failed and we are repairing
175 * @check_state_compute_result - set outside lock when compute result is valid
176 */
177enum check_states {
178 check_state_idle = 0,
179 check_state_run, /* parity check */
180 check_state_check_result,
181 check_state_compute_run, /* parity repair */
182 check_state_compute_result,
183};
184
185/**
186 * enum reconstruct_states - handles writing or expanding a stripe
187 */
188enum reconstruct_states {
189 reconstruct_state_idle = 0,
190 reconstruct_state_prexor_drain_run, /* prexor-write */
191 reconstruct_state_drain_run, /* write */
192 reconstruct_state_run, /* expand */
193 reconstruct_state_prexor_drain_result,
194 reconstruct_state_drain_result,
195 reconstruct_state_result,
196};
197
161struct stripe_head { 198struct stripe_head {
162 struct hlist_node hash; 199 struct hlist_node hash;
163 struct list_head lru; /* inactive_list or handle_list */ 200 struct list_head lru; /* inactive_list or handle_list */
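The two enums make the intermediate states of a stripe's parity-check and write-out paths explicit, replacing the pending/ack/complete bookkeeping removed further down in this header. For the check path the kernel-doc above gives the flow: run, a result becomes valid outside the lock, then optionally a repair compute with its own result. A toy driver of exactly that sequence, reusing the enum names from the hunk (the points at which transitions happen are illustrative; in the driver they occur in the stripe-handling code under sh->lock):

/* Toy driver of the check_states sequence declared above:
 *   idle -> run -> check_result -> (repair) compute_run
 *        -> compute_result -> idle
 * Transition points are illustrative only. */
#include <stdbool.h>
#include <stdio.h>

enum check_states {
        check_state_idle = 0,
        check_state_run,
        check_state_check_result,
        check_state_compute_run,
        check_state_compute_result,
};

static enum check_states step(enum check_states s, bool parity_ok)
{
        switch (s) {
        case check_state_idle:
                return check_state_run;               /* start parity check */
        case check_state_run:
                return check_state_check_result;      /* async op finished */
        case check_state_check_result:
                /* clean stripe: done; mismatch: schedule repair */
                return parity_ok ? check_state_idle
                                 : check_state_compute_run;
        case check_state_compute_run:
                return check_state_compute_result;    /* repair finished */
        case check_state_compute_result:
                return check_state_idle;              /* write back parity */
        }
        return check_state_idle;
}

int main(void)
{
        enum check_states s = check_state_idle;
        int i;

        for (i = 0; i < 6; i++) {
                s = step(s, /* parity_ok */ false);
                printf("state %d\n", s);
        }
        return 0;
}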
@@ -169,19 +206,13 @@ struct stripe_head {
169 spinlock_t lock; 206 spinlock_t lock;
170 int bm_seq; /* sequence number for bitmap flushes */ 207 int bm_seq; /* sequence number for bitmap flushes */
171 int disks; /* disks in stripe */ 208 int disks; /* disks in stripe */
209 enum check_states check_state;
210 enum reconstruct_states reconstruct_state;
172 /* stripe_operations 211 /* stripe_operations
173 * @pending - pending ops flags (set for request->issue->complete)
174 * @ack - submitted ops flags (set for issue->complete)
175 * @complete - completed ops flags (set for complete)
176 * @target - STRIPE_OP_COMPUTE_BLK target 212 * @target - STRIPE_OP_COMPUTE_BLK target
177 * @count - raid5_runs_ops is set to run when this is non-zero
178 */ 213 */
179 struct stripe_operations { 214 struct stripe_operations {
180 unsigned long pending;
181 unsigned long ack;
182 unsigned long complete;
183 int target; 215 int target;
184 int count;
185 u32 zero_sum_result; 216 u32 zero_sum_result;
186 } ops; 217 } ops;
187 struct r5dev { 218 struct r5dev {
@@ -202,6 +233,7 @@ struct stripe_head_state {
202 int locked, uptodate, to_read, to_write, failed, written; 233 int locked, uptodate, to_read, to_write, failed, written;
203 int to_fill, compute, req_compute, non_overwrite; 234 int to_fill, compute, req_compute, non_overwrite;
204 int failed_num; 235 int failed_num;
236 unsigned long ops_request;
205}; 237};
206 238
207/* r6_state - extra state data only relevant to r6 */ 239/* r6_state - extra state data only relevant to r6 */
@@ -228,9 +260,7 @@ struct r6_state {
228#define R5_Wantfill 12 /* dev->toread contains a bio that needs 260#define R5_Wantfill 12 /* dev->toread contains a bio that needs
229 * filling 261 * filling
230 */ 262 */
231#define R5_Wantprexor 13 /* distinguish blocks ready for rmw from 263#define R5_Wantdrain 13 /* dev->towrite needs to be drained */
232 * other "towrites"
233 */
234/* 264/*
235 * Write method 265 * Write method
236 */ 266 */
@@ -254,8 +284,10 @@ struct r6_state {
254#define STRIPE_EXPAND_READY 11 284#define STRIPE_EXPAND_READY 11
255#define STRIPE_IO_STARTED 12 /* do not count towards 'bypass_count' */ 285#define STRIPE_IO_STARTED 12 /* do not count towards 'bypass_count' */
256#define STRIPE_FULL_WRITE 13 /* all blocks are set to be overwritten */ 286#define STRIPE_FULL_WRITE 13 /* all blocks are set to be overwritten */
287#define STRIPE_BIOFILL_RUN 14
288#define STRIPE_COMPUTE_RUN 15
257/* 289/*
258 * Operations flags (in issue order) 290 * Operation request flags
259 */ 291 */
260#define STRIPE_OP_BIOFILL 0 292#define STRIPE_OP_BIOFILL 0
261#define STRIPE_OP_COMPUTE_BLK 1 293#define STRIPE_OP_COMPUTE_BLK 1
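With the per-stripe pending/ack/complete masks gone, the renamed comment ("Operation request flags") and the new ops_request field in stripe_head_state suggest a two-pass split: the analysis pass collects the work it wants into ops_request, a later pass executes it, while the simple biofill and compute operations are tracked by the STRIPE_BIOFILL_RUN / STRIPE_COMPUTE_RUN bits added above. A toy version of that "collect requests, then run them" split (the STRIPE_OP_* values mirror the flags in this header; everything else is invented):

/* Toy version of the "collect requests, then run them" split:
 * analysis fills an ops_request bitmask, a later pass executes it.
 * STRIPE_OP_* values mirror the flags in this header; all other
 * names are invented for illustration. */
#include <stdio.h>

#define STRIPE_OP_BIOFILL  0
#define STRIPE_OP_BIODRAIN 3
#define STRIPE_OP_POSTXOR  4

struct head_state { unsigned long ops_request; };

static void analyse(struct head_state *s, int want_read, int want_write)
{
        if (want_read)
                s->ops_request |= 1UL << STRIPE_OP_BIOFILL;
        if (want_write) {
                s->ops_request |= 1UL << STRIPE_OP_BIODRAIN;
                s->ops_request |= 1UL << STRIPE_OP_POSTXOR;
        }
}

static void run_ops(unsigned long ops_request)
{
        if (ops_request & (1UL << STRIPE_OP_BIOFILL))
                printf("fill read bios\n");
        if (ops_request & (1UL << STRIPE_OP_BIODRAIN))
                printf("drain write bios into the stripe cache\n");
        if (ops_request & (1UL << STRIPE_OP_POSTXOR))
                printf("recompute parity\n");
}

int main(void)
{
        struct head_state s = { 0 };

        analyse(&s, 0, 1);
        run_ops(s.ops_request);
        return 0;
}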
@@ -263,14 +295,6 @@ struct r6_state {
263#define STRIPE_OP_BIODRAIN 3 295#define STRIPE_OP_BIODRAIN 3
264#define STRIPE_OP_POSTXOR 4 296#define STRIPE_OP_POSTXOR 4
265#define STRIPE_OP_CHECK 5 297#define STRIPE_OP_CHECK 5
266#define STRIPE_OP_IO 6
267
268/* modifiers to the base operations
269 * STRIPE_OP_MOD_REPAIR_PD - compute the parity block and write it back
270 * STRIPE_OP_MOD_DMA_CHECK - parity is not corrupted by the check
271 */
272#define STRIPE_OP_MOD_REPAIR_PD 7
273#define STRIPE_OP_MOD_DMA_CHECK 8
274 298
275/* 299/*
276 * Plugging: 300 * Plugging: