 Documentation/md.txt        |  28
 drivers/md/bitmap.c         |  29
 drivers/md/linear.c         |   2
 drivers/md/md.c             | 340
 drivers/md/multipath.c      |  15
 drivers/md/raid1.c          |  19
 drivers/md/raid10.c         |  20
 drivers/md/raid5.c          | 729
 include/linux/raid/bitmap.h |   1
 include/linux/raid/md.h     |   2
 include/linux/raid/md_k.h   |   7
 include/linux/raid/raid5.h  |  64
 12 files changed, 662 insertions(+), 594 deletions(-)
diff --git a/Documentation/md.txt b/Documentation/md.txt
index a8b430627473..e06cc59437e4 100644
--- a/Documentation/md.txt
+++ b/Documentation/md.txt
@@ -236,6 +236,11 @@ All md devices contain:
      writing the word for the desired state, however some states
      cannot be explicitly set, and some transitions are not allowed.
 
+     Select/poll works on this file.  All changes except between
+     active_idle and active (which can be frequent and are not
+     very interesting) are notified.  active->active_idle is
+     reported if the metadata is externally managed.
+
      clear
          No devices, no size, no level
          Writing is equivalent to STOP_ARRAY ioctl
@@ -292,6 +297,10 @@ Each directory contains:
       writemostly - device will only be subject to read
                  requests if there are no other options.
                  This applies only to raid1 arrays.
+      blocked     - device has failed, metadata is "external",
+                 and the failure hasn't been acknowledged yet.
+                 Writes that would write to this device if
+                 it were not faulty are blocked.
       spare - device is working, but not a full member.
              This includes spares that are in the process
              of being recovered to
@@ -301,6 +310,12 @@ Each directory contains:
       Writing "remove" removes the device from the array.
       Writing "writemostly" sets the writemostly flag.
       Writing "-writemostly" clears the writemostly flag.
+      Writing "blocked" sets the "blocked" flag.
+      Writing "-blocked" clears the "blocked" flag and allows writes
+      to complete.
+
+      This file responds to select/poll. Any change to 'faulty'
+      or 'blocked' causes an event.
 
    errors
       An approximate count of read errors that have been detected on
@@ -381,6 +396,19 @@ also have
       'check' and 'repair' will start the appropriate process
       providing the current state is 'idle'.
 
+      This file responds to select/poll.  Any important change in the value
+      triggers a poll event.  Sometimes the value will briefly be
+      "recover" if a recovery seems to be needed, but cannot be
+      achieved.  In that case, the transition to "recover" isn't
+      notified, but the transition away is.
+
+   degraded
+      This contains a count of the number of devices by which the
+      array is degraded.  So an optimal array will show '0'.  A
+      single failed/missing drive will show '1', etc.
+      This file responds to select/poll, any increase or decrease
+      in the count of missing devices will trigger an event.
+
    mismatch_count
       When performing 'check' and 'repair', and possibly when
       performing 'resync', md will count the number of errors that are
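
The select/poll behaviour documented above uses the standard sysfs
notification mechanism: sysfs_notify() makes a pending poll() on the
attribute return POLLPRI|POLLERR, and a watcher must re-read the file
from offset 0 to arm itself for the next event.  A minimal userspace
watcher, sketched here for illustration only (it is not part of the
patch, and the /sys/block/md0 path is an assumed example array):

    #include <fcntl.h>
    #include <poll.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
        char buf[64];
        ssize_t n;
        int fd = open("/sys/block/md0/md/sync_action", O_RDONLY);
        struct pollfd pfd = { .fd = fd, .events = POLLPRI };

        if (fd < 0)
            return 1;
        for (;;) {
            /* consume the current value; reading from offset 0
             * is what re-arms the sysfs notification */
            lseek(fd, 0, SEEK_SET);
            n = read(fd, buf, sizeof(buf) - 1);
            if (n > 0) {
                buf[n] = '\0';
                printf("sync_action: %s", buf);
            }
            poll(&pfd, 1, -1);  /* blocks until sysfs_notify() */
        }
    }

The same loop works for array_state, degraded, and the per-device
state files described above.
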
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index b26927ce889c..dedba16d42f7 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -454,8 +454,11 @@ void bitmap_update_sb(struct bitmap *bitmap)
 	spin_unlock_irqrestore(&bitmap->lock, flags);
 	sb = (bitmap_super_t *)kmap_atomic(bitmap->sb_page, KM_USER0);
 	sb->events = cpu_to_le64(bitmap->mddev->events);
-	if (!bitmap->mddev->degraded)
-		sb->events_cleared = cpu_to_le64(bitmap->mddev->events);
+	if (bitmap->mddev->events < bitmap->events_cleared) {
+		/* rocking back to read-only */
+		bitmap->events_cleared = bitmap->mddev->events;
+		sb->events_cleared = cpu_to_le64(bitmap->events_cleared);
+	}
 	kunmap_atomic(sb, KM_USER0);
 	write_page(bitmap, bitmap->sb_page, 1);
 }
@@ -1085,9 +1088,19 @@ void bitmap_daemon_work(struct bitmap *bitmap)
 			} else
 				spin_unlock_irqrestore(&bitmap->lock, flags);
 			lastpage = page;
-/*
-			printk("bitmap clean at page %lu\n", j);
-*/
+
+			/* We are possibly going to clear some bits, so make
+			 * sure that events_cleared is up-to-date.
+			 */
+			if (bitmap->need_sync) {
+				bitmap_super_t *sb;
+				bitmap->need_sync = 0;
+				sb = kmap_atomic(bitmap->sb_page, KM_USER0);
+				sb->events_cleared =
+					cpu_to_le64(bitmap->events_cleared);
+				kunmap_atomic(sb, KM_USER0);
+				write_page(bitmap, bitmap->sb_page, 1);
+			}
 			spin_lock_irqsave(&bitmap->lock, flags);
 			clear_page_attr(bitmap, page, BITMAP_PAGE_CLEAN);
 		}
@@ -1257,6 +1270,12 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto
 			return;
 		}
 
+		if (success &&
+		    bitmap->events_cleared < bitmap->mddev->events) {
+			bitmap->events_cleared = bitmap->mddev->events;
+			bitmap->need_sync = 1;
+		}
+
 		if (!success && ! (*bmc & NEEDED_MASK))
 			*bmc |= NEEDED_MASK;
 
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 10748240cb2f..ec921f58fbb8 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -126,7 +126,7 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
 		int j = rdev->raid_disk;
 		dev_info_t *disk = conf->disks + j;
 
-		if (j < 0 || j > raid_disks || disk->rdev) {
+		if (j < 0 || j >= raid_disks || disk->rdev) {
 			printk("linear: disk numbering problem. Aborting!\n");
 			goto out;
 		}
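
The one-character fix above closes an off-by-one: conf->disks holds
raid_disks entries, so the valid slot indices are 0 through
raid_disks - 1.  The old "j > raid_disks" test let an rdev claiming
slot raid_disks through, indexing one element past the end of the
array.
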
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 2580ac1b9b0f..73399d17ac3e 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -169,7 +169,6 @@ void md_new_event(mddev_t *mddev)
 {
 	atomic_inc(&md_event_count);
 	wake_up(&md_event_waiters);
-	sysfs_notify(&mddev->kobj, NULL, "sync_action");
 }
 EXPORT_SYMBOL_GPL(md_new_event);
 
@@ -278,6 +277,7 @@ static mddev_t * mddev_find(dev_t unit)
 	init_waitqueue_head(&new->sb_wait);
 	init_waitqueue_head(&new->recovery_wait);
 	new->reshape_position = MaxSector;
+	new->resync_min = 0;
 	new->resync_max = MaxSector;
 	new->level = LEVEL_NONE;
 
@@ -564,7 +564,7 @@ static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
 
 	if (!tmp1 || !tmp2) {
 		ret = 0;
-		printk(KERN_INFO "md.c: sb1 is not equal to sb2!\n");
+		printk(KERN_INFO "md.c sb_equal(): failed to allocate memory!\n");
 		goto abort;
 	}
 
@@ -658,11 +658,14 @@ static unsigned int calc_sb_csum(mdp_super_t * sb)
  */
 
 struct super_type {
 	char *name;
 	struct module *owner;
-	int (*load_super)(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version);
-	int (*validate_super)(mddev_t *mddev, mdk_rdev_t *rdev);
-	void (*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev);
+	int (*load_super)(mdk_rdev_t *rdev, mdk_rdev_t *refdev,
+			  int minor_version);
+	int (*validate_super)(mddev_t *mddev, mdk_rdev_t *rdev);
+	void (*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev);
+	unsigned long long (*rdev_size_change)(mdk_rdev_t *rdev,
+					       unsigned long long size);
 };
 
 /*
@@ -1004,6 +1007,27 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
 }
 
 /*
+ * rdev_size_change for 0.90.0
+ */
+static unsigned long long
+super_90_rdev_size_change(mdk_rdev_t *rdev, unsigned long long size)
+{
+	if (size && size < rdev->mddev->size)
+		return 0; /* component must fit device */
+	size *= 2; /* convert to sectors */
+	if (rdev->mddev->bitmap_offset)
+		return 0; /* can't move bitmap */
+	rdev->sb_offset = calc_dev_sboffset(rdev->bdev);
+	if (!size || size > rdev->sb_offset*2)
+		size = rdev->sb_offset*2;
+	md_super_write(rdev->mddev, rdev, rdev->sb_offset << 1, rdev->sb_size,
+		       rdev->sb_page);
+	md_super_wait(rdev->mddev);
+	return size/2; /* kB for sysfs */
+}
+
+
+/*
  * version 1 superblock
  */
 
@@ -1328,21 +1352,59 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
 	sb->sb_csum = calc_sb_1_csum(sb);
 }
 
+static unsigned long long
+super_1_rdev_size_change(mdk_rdev_t *rdev, unsigned long long size)
+{
+	struct mdp_superblock_1 *sb;
+	unsigned long long max_size;
+	if (size && size < rdev->mddev->size)
+		return 0; /* component must fit device */
+	size *= 2; /* convert to sectors */
+	if (rdev->sb_offset < rdev->data_offset/2) {
+		/* minor versions 1 and 2; superblock before data */
+		max_size = (rdev->bdev->bd_inode->i_size >> 9);
+		max_size -= rdev->data_offset;
+		if (!size || size > max_size)
+			size = max_size;
+	} else if (rdev->mddev->bitmap_offset) {
+		/* minor version 0 with bitmap we can't move */
+		return 0;
+	} else {
+		/* minor version 0; superblock after data */
+		sector_t sb_offset;
+		sb_offset = (rdev->bdev->bd_inode->i_size >> 9) - 8*2;
+		sb_offset &= ~(sector_t)(4*2 - 1);
+		max_size = rdev->size*2 + sb_offset - rdev->sb_offset*2;
+		if (!size || size > max_size)
+			size = max_size;
+		rdev->sb_offset = sb_offset/2;
+	}
+	sb = (struct mdp_superblock_1 *) page_address(rdev->sb_page);
+	sb->data_size = cpu_to_le64(size);
+	sb->super_offset = rdev->sb_offset*2;
+	sb->sb_csum = calc_sb_1_csum(sb);
+	md_super_write(rdev->mddev, rdev, rdev->sb_offset << 1, rdev->sb_size,
+		       rdev->sb_page);
+	md_super_wait(rdev->mddev);
+	return size/2; /* kB for sysfs */
+}
 
 static struct super_type super_types[] = {
 	[0] = {
 		.name		= "0.90.0",
 		.owner		= THIS_MODULE,
 		.load_super	= super_90_load,
 		.validate_super	= super_90_validate,
 		.sync_super	= super_90_sync,
+		.rdev_size_change = super_90_rdev_size_change,
 	},
 	[1] = {
 		.name		= "md-1",
 		.owner		= THIS_MODULE,
 		.load_super	= super_1_load,
 		.validate_super	= super_1_validate,
 		.sync_super	= super_1_sync,
+		.rdev_size_change = super_1_rdev_size_change,
 	},
 };
 
@@ -1787,7 +1849,7 @@ repeat:
 
 }
 
-/* words written to sysfs files may, or my not, be \n terminated.
+/* words written to sysfs files may, or may not, be \n terminated.
  * We want to accept with case. For this we use cmd_match.
  */
 static int cmd_match(const char *cmd, const char *str)
@@ -1886,6 +1948,8 @@ state_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 
 		err = 0;
 	}
+	if (!err)
+		sysfs_notify(&rdev->kobj, NULL, "state");
 	return err ? err : len;
 }
 static struct rdev_sysfs_entry rdev_state =
@@ -1931,7 +1995,7 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 		slot = -1;
 	else if (e==buf || (*e && *e!= '\n'))
 		return -EINVAL;
-	if (rdev->mddev->pers) {
+	if (rdev->mddev->pers && slot == -1) {
 		/* Setting 'slot' on an active array requires also
 		 * updating the 'rd%d' link, and communicating
 		 * with the personality with ->hot_*_disk.
@@ -1939,8 +2003,6 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 		 * failed/spare devices. This normally happens automatically,
 		 * but not when the metadata is externally managed.
 		 */
-		if (slot != -1)
-			return -EBUSY;
 		if (rdev->raid_disk == -1)
 			return -EEXIST;
 		/* personality does all needed checks */
@@ -1954,6 +2016,43 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 		sysfs_remove_link(&rdev->mddev->kobj, nm);
 		set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
 		md_wakeup_thread(rdev->mddev->thread);
+	} else if (rdev->mddev->pers) {
+		mdk_rdev_t *rdev2;
+		struct list_head *tmp;
+		/* Activating a spare .. or possibly reactivating
+		 * if we ever get bitmaps working here.
+		 */
+
+		if (rdev->raid_disk != -1)
+			return -EBUSY;
+
+		if (rdev->mddev->pers->hot_add_disk == NULL)
+			return -EINVAL;
+
+		rdev_for_each(rdev2, tmp, rdev->mddev)
+			if (rdev2->raid_disk == slot)
+				return -EEXIST;
+
+		rdev->raid_disk = slot;
+		if (test_bit(In_sync, &rdev->flags))
+			rdev->saved_raid_disk = slot;
+		else
+			rdev->saved_raid_disk = -1;
+		err = rdev->mddev->pers->
+			hot_add_disk(rdev->mddev, rdev);
+		if (err) {
+			rdev->raid_disk = -1;
+			return err;
+		} else
+			sysfs_notify(&rdev->kobj, NULL, "state");
+		sprintf(nm, "rd%d", rdev->raid_disk);
+		if (sysfs_create_link(&rdev->mddev->kobj, &rdev->kobj, nm))
+			printk(KERN_WARNING
+			       "md: cannot register "
+			       "%s for %s\n",
+			       nm, mdname(rdev->mddev));
+
+		/* don't wakeup anyone, leave that to userspace. */
 	} else {
 		if (slot >= rdev->mddev->raid_disks)
 			return -ENOSPC;
@@ -1962,6 +2061,7 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 		clear_bit(Faulty, &rdev->flags);
 		clear_bit(WriteMostly, &rdev->flags);
 		set_bit(In_sync, &rdev->flags);
+		sysfs_notify(&rdev->kobj, NULL, "state");
 	}
 	return len;
 }
@@ -1983,7 +2083,7 @@ offset_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 	unsigned long long offset = simple_strtoull(buf, &e, 10);
 	if (e==buf || (*e && *e != '\n'))
 		return -EINVAL;
-	if (rdev->mddev->pers)
+	if (rdev->mddev->pers && rdev->raid_disk >= 0)
 		return -EBUSY;
 	if (rdev->size && rdev->mddev->external)
 		/* Must set offset before size, so overlap checks
@@ -2022,8 +2122,20 @@ rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 
 	if (e==buf || (*e && *e != '\n'))
 		return -EINVAL;
-	if (my_mddev->pers)
-		return -EBUSY;
+	if (my_mddev->pers && rdev->raid_disk >= 0) {
+		if (rdev->mddev->persistent) {
+			size = super_types[rdev->mddev->major_version].
+				rdev_size_change(rdev, size);
+			if (!size)
+				return -EBUSY;
+		} else if (!size) {
+			size = (rdev->bdev->bd_inode->i_size >> 10);
+			size -= rdev->data_offset/2;
+		}
+		if (size < rdev->mddev->size)
+			return -EINVAL; /* component must fit device */
+	}
+
 	rdev->size = size;
 	if (size > oldsize && rdev->mddev->external) {
 		/* need to check that all other rdevs with the same ->bdev
@@ -2512,7 +2624,7 @@ __ATTR(resync_start, S_IRUGO|S_IWUSR, resync_start_show, resync_start_store);
  *     When written, doesn't tear down array, but just stops it
  * suspended (not supported yet)
  *     All IO requests will block. The array can be reconfigured.
- *     Writing this, if accepted, will block until array is quiessent
+ *     Writing this, if accepted, will block until array is quiescent
  * readonly
  *     no resync can happen. no superblocks get written.
  *     write requests fail
@@ -2681,8 +2793,10 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len)
 	}
 	if (err)
 		return err;
-	else
+	else {
+		sysfs_notify(&mddev->kobj, NULL, "array_state");
 		return len;
+	}
 }
 static struct md_sysfs_entry md_array_state =
 __ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store);
@@ -2899,7 +3013,7 @@ action_show(mddev_t *mddev, char *page)
 				type = "check";
 			else
 				type = "repair";
-		} else
+		} else if (test_bit(MD_RECOVERY_RECOVER, &mddev->recovery))
 			type = "recover";
 	}
 	return sprintf(page, "%s\n", type);
@@ -2921,15 +3035,19 @@ action_store(mddev_t *mddev, const char *page, size_t len)
 	} else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
 		   test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
 		return -EBUSY;
-	else if (cmd_match(page, "resync") || cmd_match(page, "recover"))
+	else if (cmd_match(page, "resync"))
+		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+	else if (cmd_match(page, "recover")) {
+		set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
 		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
-	else if (cmd_match(page, "reshape")) {
+	} else if (cmd_match(page, "reshape")) {
 		int err;
 		if (mddev->pers->start_reshape == NULL)
 			return -EINVAL;
 		err = mddev->pers->start_reshape(mddev);
 		if (err)
 			return err;
+		sysfs_notify(&mddev->kobj, NULL, "degraded");
 	} else {
 		if (cmd_match(page, "check"))
 			set_bit(MD_RECOVERY_CHECK, &mddev->recovery);
@@ -2940,6 +3058,7 @@ action_store(mddev_t *mddev, const char *page, size_t len)
 	}
 	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 	md_wakeup_thread(mddev->thread);
+	sysfs_notify(&mddev->kobj, NULL, "sync_action");
 	return len;
 }
 
@@ -3049,11 +3168,11 @@ static ssize_t
 sync_speed_show(mddev_t *mddev, char *page)
 {
 	unsigned long resync, dt, db;
-	resync = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active));
-	dt = ((jiffies - mddev->resync_mark) / HZ);
+	resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active);
+	dt = (jiffies - mddev->resync_mark) / HZ;
 	if (!dt) dt++;
-	db = resync - (mddev->resync_mark_cnt);
-	return sprintf(page, "%ld\n", db/dt/2); /* K/sec */
+	db = resync - mddev->resync_mark_cnt;
+	return sprintf(page, "%lu\n", db/dt/2); /* K/sec */
 }
 
 static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed);
@@ -3075,6 +3194,36 @@ sync_completed_show(mddev_t *mddev, char *page)
 static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed);
 
 static ssize_t
+min_sync_show(mddev_t *mddev, char *page)
+{
+	return sprintf(page, "%llu\n",
+		       (unsigned long long)mddev->resync_min);
+}
+static ssize_t
+min_sync_store(mddev_t *mddev, const char *buf, size_t len)
+{
+	unsigned long long min;
+	if (strict_strtoull(buf, 10, &min))
+		return -EINVAL;
+	if (min > mddev->resync_max)
+		return -EINVAL;
+	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
+		return -EBUSY;
+
+	/* Must be a multiple of chunk_size */
+	if (mddev->chunk_size) {
+		if (min & (sector_t)((mddev->chunk_size>>9)-1))
+			return -EINVAL;
+	}
+	mddev->resync_min = min;
+
+	return len;
+}
+
+static struct md_sysfs_entry md_min_sync =
+__ATTR(sync_min, S_IRUGO|S_IWUSR, min_sync_show, min_sync_store);
+
+static ssize_t
 max_sync_show(mddev_t *mddev, char *page)
 {
 	if (mddev->resync_max == MaxSector)
@@ -3089,9 +3238,10 @@ max_sync_store(mddev_t *mddev, const char *buf, size_t len)
 	if (strncmp(buf, "max", 3) == 0)
 		mddev->resync_max = MaxSector;
 	else {
-		char *ep;
-		unsigned long long max = simple_strtoull(buf, &ep, 10);
-		if (ep == buf || (*ep != 0 && *ep != '\n'))
+		unsigned long long max;
+		if (strict_strtoull(buf, 10, &max))
+			return -EINVAL;
+		if (max < mddev->resync_min)
 			return -EINVAL;
 		if (max < mddev->resync_max &&
 		    test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
@@ -3222,6 +3372,7 @@ static struct attribute *md_redundancy_attrs[] = {
 	&md_sync_speed.attr,
 	&md_sync_force_parallel.attr,
 	&md_sync_completed.attr,
+	&md_min_sync.attr,
 	&md_max_sync.attr,
 	&md_suspend_lo.attr,
 	&md_suspend_hi.attr,
@@ -3326,9 +3477,9 @@ static struct kobject *md_probe(dev_t dev, int *part, void *data)
 	disk->queue = mddev->queue;
 	add_disk(disk);
 	mddev->gendisk = disk;
-	mutex_unlock(&disks_mutex);
 	error = kobject_init_and_add(&mddev->kobj, &md_ktype, &disk->dev.kobj,
 				     "%s", "md");
+	mutex_unlock(&disks_mutex);
 	if (error)
 		printk(KERN_WARNING "md: cannot register %s/md - name in use\n",
 		       disk->disk_name);
@@ -3341,7 +3492,11 @@ static void md_safemode_timeout(unsigned long data)
 {
 	mddev_t *mddev = (mddev_t *) data;
 
-	mddev->safemode = 1;
+	if (!atomic_read(&mddev->writes_pending)) {
+		mddev->safemode = 1;
+		if (mddev->external)
+			sysfs_notify(&mddev->kobj, NULL, "array_state");
+	}
 	md_wakeup_thread(mddev->thread);
 }
 
@@ -3448,6 +3603,7 @@ static int do_md_run(mddev_t * mddev)
 				return -EINVAL;
 			}
 		}
+		sysfs_notify(&rdev->kobj, NULL, "state");
 	}
 
 	md_probe(mddev->unit, NULL, NULL);
@@ -3519,7 +3675,9 @@ static int do_md_run(mddev_t * mddev)
 		mddev->ro = 2; /* read-only, but switch on first write */
 
 	err = mddev->pers->run(mddev);
-	if (!err && mddev->pers->sync_request) {
+	if (err)
+		printk(KERN_ERR "md: pers->run() failed ...\n");
+	else if (mddev->pers->sync_request) {
 		err = bitmap_create(mddev);
 		if (err) {
 			printk(KERN_ERR "%s: failed to create bitmap (%d)\n",
@@ -3528,7 +3686,6 @@ static int do_md_run(mddev_t * mddev)
 		}
 	}
 	if (err) {
-		printk(KERN_ERR "md: pers->run() failed ...\n");
 		module_put(mddev->pers->owner);
 		mddev->pers = NULL;
 		bitmap_destroy(mddev);
@@ -3608,6 +3765,9 @@ static int do_md_run(mddev_t * mddev)
 
 	mddev->changed = 1;
 	md_new_event(mddev);
+	sysfs_notify(&mddev->kobj, NULL, "array_state");
+	sysfs_notify(&mddev->kobj, NULL, "sync_action");
+	sysfs_notify(&mddev->kobj, NULL, "degraded");
 	kobject_uevent(&mddev->gendisk->dev.kobj, KOBJ_CHANGE);
 	return 0;
 }
@@ -3642,6 +3802,8 @@ static int restart_array(mddev_t *mddev)
 		md_wakeup_thread(mddev->thread);
 		md_wakeup_thread(mddev->sync_thread);
 		err = 0;
+		sysfs_notify(&mddev->kobj, NULL, "array_state");
+
 	} else
 		err = -EINVAL;
 
@@ -3777,6 +3939,7 @@ static int do_md_stop(mddev_t * mddev, int mode)
 		mddev->size = 0;
 		mddev->raid_disks = 0;
 		mddev->recovery_cp = 0;
+		mddev->resync_min = 0;
 		mddev->resync_max = MaxSector;
 		mddev->reshape_position = MaxSector;
 		mddev->external = 0;
@@ -3811,6 +3974,7 @@ static int do_md_stop(mddev_t * mddev, int mode)
 			mdname(mddev));
 	err = 0;
 	md_new_event(mddev);
+	sysfs_notify(&mddev->kobj, NULL, "array_state");
 out:
 	return err;
 }
@@ -4009,9 +4173,11 @@ static int get_bitmap_file(mddev_t * mddev, void __user * arg)
 	char *ptr, *buf = NULL;
 	int err = -ENOMEM;
 
-	md_allow_write(mddev);
+	if (md_allow_write(mddev))
+		file = kmalloc(sizeof(*file), GFP_NOIO);
+	else
+		file = kmalloc(sizeof(*file), GFP_KERNEL);
 
-	file = kmalloc(sizeof(*file), GFP_KERNEL);
 	if (!file)
 		goto out;
 
@@ -4172,8 +4338,12 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
 		}
 		if (err)
 			export_rdev(rdev);
+		else
+			sysfs_notify(&rdev->kobj, NULL, "state");
 
 		md_update_sb(mddev, 1);
+		if (mddev->degraded)
+			set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
 		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 		md_wakeup_thread(mddev->thread);
 		return err;
@@ -4232,9 +4402,6 @@ static int hot_remove_disk(mddev_t * mddev, dev_t dev)
 	char b[BDEVNAME_SIZE];
 	mdk_rdev_t *rdev;
 
-	if (!mddev->pers)
-		return -ENODEV;
-
 	rdev = find_rdev(mddev, dev);
 	if (!rdev)
 		return -ENXIO;
@@ -4641,6 +4808,12 @@ static int set_disk_faulty(mddev_t *mddev, dev_t dev)
 	return 0;
 }
 
+/*
+ * We have a problem here : there is no easy way to give a CHS
+ * virtual geometry. We currently pretend that we have a 2 heads
+ * 4 sectors (with a BIG number of cylinders...). This drives
+ * dosfs just mad... ;-)
+ */
 static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 {
 	mddev_t *mddev = bdev->bd_disk->private_data;
@@ -4792,12 +4965,6 @@ static int md_ioctl(struct inode *inode, struct file *file,
 			err = do_md_stop (mddev, 1);
 			goto done_unlock;
 
-	/*
-	 * We have a problem here : there is no easy way to give a CHS
-	 * virtual geometry. We currently pretend that we have a 2 heads
-	 * 4 sectors (with a BIG number of cylinders...). This drives
-	 * dosfs just mad... ;-)
-	 */
 	}
 
 	/*
@@ -4807,13 +4974,12 @@ static int md_ioctl(struct inode *inode, struct file *file,
 	 * here and hit the 'default' below, so only disallow
 	 * 'md' ioctls, and switch to rw mode if started auto-readonly.
 	 */
-	if (_IOC_TYPE(cmd) == MD_MAJOR &&
-	    mddev->ro && mddev->pers) {
+	if (_IOC_TYPE(cmd) == MD_MAJOR && mddev->ro && mddev->pers) {
 		if (mddev->ro == 2) {
 			mddev->ro = 0;
-			set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
-			md_wakeup_thread(mddev->thread);
-
+			sysfs_notify(&mddev->kobj, NULL, "array_state");
+			set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+			md_wakeup_thread(mddev->thread);
 		} else {
 			err = -EROFS;
 			goto abort_unlock;
@@ -5029,6 +5195,9 @@ void md_error(mddev_t *mddev, mdk_rdev_t *rdev)
 	if (!mddev->pers->error_handler)
 		return;
 	mddev->pers->error_handler(mddev,rdev);
+	if (mddev->degraded)
+		set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
+	set_bit(StateChanged, &rdev->flags);
 	set_bit(MD_RECOVERY_INTR, &mddev->recovery);
 	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 	md_wakeup_thread(mddev->thread);
@@ -5451,6 +5620,7 @@ void md_done_sync(mddev_t *mddev, int blocks, int ok)
  */
 void md_write_start(mddev_t *mddev, struct bio *bi)
 {
+	int did_change = 0;
 	if (bio_data_dir(bi) != WRITE)
 		return;
 
@@ -5461,6 +5631,7 @@ void md_write_start(mddev_t *mddev, struct bio *bi)
 		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 		md_wakeup_thread(mddev->thread);
 		md_wakeup_thread(mddev->sync_thread);
+		did_change = 1;
 	}
 	atomic_inc(&mddev->writes_pending);
 	if (mddev->safemode == 1)
@@ -5471,10 +5642,12 @@ void md_write_start(mddev_t *mddev, struct bio *bi)
 			mddev->in_sync = 0;
 			set_bit(MD_CHANGE_CLEAN, &mddev->flags);
 			md_wakeup_thread(mddev->thread);
+			did_change = 1;
 		}
 		spin_unlock_irq(&mddev->write_lock);
-		sysfs_notify(&mddev->kobj, NULL, "array_state");
 	}
+	if (did_change)
+		sysfs_notify(&mddev->kobj, NULL, "array_state");
 	wait_event(mddev->sb_wait,
 		   !test_bit(MD_CHANGE_CLEAN, &mddev->flags) &&
 		   !test_bit(MD_CHANGE_PENDING, &mddev->flags));
@@ -5495,13 +5668,18 @@ void md_write_end(mddev_t *mddev)
  * may proceed without blocking.  It is important to call this before
  * attempting a GFP_KERNEL allocation while holding the mddev lock.
  * Must be called with mddev_lock held.
+ *
+ * In the ->external case MD_CHANGE_CLEAN can not be cleared until mddev->lock
+ * is dropped, so return -EAGAIN after notifying userspace.
  */
-void md_allow_write(mddev_t *mddev)
+int md_allow_write(mddev_t *mddev)
 {
 	if (!mddev->pers)
-		return;
+		return 0;
 	if (mddev->ro)
-		return;
+		return 0;
+	if (!mddev->pers->sync_request)
+		return 0;
 
 	spin_lock_irq(&mddev->write_lock);
 	if (mddev->in_sync) {
@@ -5512,14 +5690,14 @@ void md_allow_write(mddev_t *mddev)
 		mddev->safemode = 1;
 		spin_unlock_irq(&mddev->write_lock);
 		md_update_sb(mddev, 0);
-
 		sysfs_notify(&mddev->kobj, NULL, "array_state");
-		/* wait for the dirty state to be recorded in the metadata */
-		wait_event(mddev->sb_wait,
-			   !test_bit(MD_CHANGE_CLEAN, &mddev->flags) &&
-			   !test_bit(MD_CHANGE_PENDING, &mddev->flags));
 	} else
 		spin_unlock_irq(&mddev->write_lock);
+
+	if (test_bit(MD_CHANGE_CLEAN, &mddev->flags))
+		return -EAGAIN;
+	else
+		return 0;
 }
 EXPORT_SYMBOL_GPL(md_allow_write);
 
@@ -5625,9 +5803,11 @@ void md_do_sync(mddev_t *mddev)
 			max_sectors = mddev->resync_max_sectors;
 		mddev->resync_mismatches = 0;
 		/* we don't use the checkpoint if there's a bitmap */
-		if (!mddev->bitmap &&
-		    !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
+		if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
+			j = mddev->resync_min;
+		else if (!mddev->bitmap)
 			j = mddev->recovery_cp;
+
 	} else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
 		max_sectors = mddev->size << 1;
 	else {
@@ -5796,6 +5976,7 @@ void md_do_sync(mddev_t *mddev)
 
  skip:
 	mddev->curr_resync = 0;
+	mddev->resync_min = 0;
 	mddev->resync_max = MaxSector;
 	sysfs_notify(&mddev->kobj, NULL, "sync_completed");
 	wake_up(&resync_wait);
@@ -5845,7 +6026,8 @@ static int remove_and_add_spares(mddev_t *mddev)
 			if (rdev->raid_disk < 0
 			    && !test_bit(Faulty, &rdev->flags)) {
 				rdev->recovery_offset = 0;
-				if (mddev->pers->hot_add_disk(mddev,rdev)) {
+				if (mddev->pers->
+				    hot_add_disk(mddev, rdev) == 0) {
 					char nm[20];
 					sprintf(nm, "rd%d", rdev->raid_disk);
 					if (sysfs_create_link(&mddev->kobj,
@@ -5920,23 +6102,31 @@ void md_check_recovery(mddev_t *mddev)
 		int spares = 0;
 
 		if (!mddev->external) {
+			int did_change = 0;
 			spin_lock_irq(&mddev->write_lock);
 			if (mddev->safemode &&
 			    !atomic_read(&mddev->writes_pending) &&
 			    !mddev->in_sync &&
 			    mddev->recovery_cp == MaxSector) {
 				mddev->in_sync = 1;
+				did_change = 1;
 				if (mddev->persistent)
 					set_bit(MD_CHANGE_CLEAN, &mddev->flags);
 			}
 			if (mddev->safemode == 1)
 				mddev->safemode = 0;
 			spin_unlock_irq(&mddev->write_lock);
+			if (did_change)
+				sysfs_notify(&mddev->kobj, NULL, "array_state");
 		}
 
 		if (mddev->flags)
 			md_update_sb(mddev, 0);
 
+		rdev_for_each(rdev, rtmp, mddev)
+			if (test_and_clear_bit(StateChanged, &rdev->flags))
+				sysfs_notify(&rdev->kobj, NULL, "state");
+
 
 		if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
 		    !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
@@ -5951,7 +6141,9 @@ void md_check_recovery(mddev_t *mddev)
 			if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
 				/* success...*/
 				/* activate any spares */
-				mddev->pers->spare_active(mddev);
+				if (mddev->pers->spare_active(mddev))
+					sysfs_notify(&mddev->kobj, NULL,
+						     "degraded");
 			}
 			md_update_sb(mddev, 1);
 
@@ -5965,13 +6157,18 @@ void md_check_recovery(mddev_t *mddev)
 			mddev->recovery = 0;
 			/* flag recovery needed just to double check */
 			set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+			sysfs_notify(&mddev->kobj, NULL, "sync_action");
 			md_new_event(mddev);
 			goto unlock;
 		}
+		/* Set RUNNING before clearing NEEDED to avoid
+		 * any transients in the value of "sync_action".
+		 */
+		set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
+		clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 		/* Clear some bits that don't mean anything, but
 		 * might be left set
 		 */
-		clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 		clear_bit(MD_RECOVERY_INTR, &mddev->recovery);
 		clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
 
@@ -5989,17 +6186,19 @@
 			/* Cannot proceed */
 				goto unlock;
 			set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
+			clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
 		} else if ((spares = remove_and_add_spares(mddev))) {
 			clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
 			clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
+			set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
 		} else if (mddev->recovery_cp < MaxSector) {
 			set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
+			clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
 		} else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
 			/* nothing to be done ... */
 			goto unlock;
 
 		if (mddev->pers->sync_request) {
-			set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
 			if (spares && mddev->bitmap && ! mddev->bitmap->file) {
 				/* We are adding a device or devices to an array
 				 * which has the bitmap stored on all devices.
@@ -6018,9 +6217,16 @@
 				mddev->recovery = 0;
 			} else
 				md_wakeup_thread(mddev->sync_thread);
+			sysfs_notify(&mddev->kobj, NULL, "sync_action");
 			md_new_event(mddev);
 		}
 	unlock:
+		if (!mddev->sync_thread) {
+			clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
+			if (test_and_clear_bit(MD_RECOVERY_RECOVER,
+					       &mddev->recovery))
+				sysfs_notify(&mddev->kobj, NULL, "sync_action");
+		}
 		mddev_unlock(mddev);
 	}
 }
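
Since md_allow_write() now reports whether the array is immediately
write-ready, its callers must pick a strategy; both appear in this
patch.  get_bitmap_file() above falls back to a GFP_NOIO allocation,
while raid1_reshape() further down simply propagates the error.  A
sketch of the propagating style, using a hypothetical caller name:

    /* example_reconfig() is illustrative only.  -EAGAIN means the
     * metadata is externally managed and the clean->dirty transition
     * has been requested but not yet recorded, so a blocking
     * GFP_KERNEL allocation under the mddev lock is not yet safe. */
    static int example_reconfig(mddev_t *mddev)
    {
        int err = md_allow_write(mddev);

        if (err)
            return err;   /* typically -EAGAIN; userspace may retry */

        /* ... GFP_KERNEL allocations while holding the lock ... */
        return 0;
    }
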
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index e968116e0de9..541cbe3414bd 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -281,13 +281,18 @@ static int multipath_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
 {
 	multipath_conf_t *conf = mddev->private;
 	struct request_queue *q;
-	int found = 0;
+	int err = -EEXIST;
 	int path;
 	struct multipath_info *p;
+	int first = 0;
+	int last = mddev->raid_disks - 1;
+
+	if (rdev->raid_disk >= 0)
+		first = last = rdev->raid_disk;
 
 	print_multipath_conf(conf);
 
-	for (path=0; path<mddev->raid_disks; path++)
+	for (path = first; path <= last; path++)
 		if ((p=conf->multipaths+path)->rdev == NULL) {
 			q = rdev->bdev->bd_disk->queue;
 			blk_queue_stack_limits(mddev->queue, q);
@@ -307,11 +312,13 @@ static int multipath_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
 			rdev->raid_disk = path;
 			set_bit(In_sync, &rdev->flags);
 			rcu_assign_pointer(p->rdev, rdev);
-			found = 1;
+			err = 0;
+			break;
 		}
 
 	print_multipath_conf(conf);
-	return found;
+
+	return err;
 }
 
 static int multipath_remove_disk(mddev_t *mddev, int number)
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index c610b947218a..491dc2d4ad5f 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1100,11 +1100,16 @@ static int raid1_spare_active(mddev_t *mddev)
 static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
 {
 	conf_t *conf = mddev->private;
-	int found = 0;
+	int err = -EEXIST;
 	int mirror = 0;
 	mirror_info_t *p;
+	int first = 0;
+	int last = mddev->raid_disks - 1;
 
-	for (mirror=0; mirror < mddev->raid_disks; mirror++)
+	if (rdev->raid_disk >= 0)
+		first = last = rdev->raid_disk;
+
+	for (mirror = first; mirror <= last; mirror++)
 		if ( !(p=conf->mirrors+mirror)->rdev) {
 
 			blk_queue_stack_limits(mddev->queue,
@@ -1119,7 +1124,7 @@ static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
 
 			p->head_position = 0;
 			rdev->raid_disk = mirror;
-			found = 1;
+			err = 0;
 			/* As all devices are equivalent, we don't need a full recovery
 			 * if this was recently any drive of the array
 			 */
@@ -1130,7 +1135,7 @@ static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
 	}
 
 	print_conf(conf);
-	return found;
+	return err;
 }
 
 static int raid1_remove_disk(mddev_t *mddev, int number)
@@ -2131,7 +2136,7 @@ static int raid1_reshape(mddev_t *mddev)
 	conf_t *conf = mddev_to_conf(mddev);
 	int cnt, raid_disks;
 	unsigned long flags;
-	int d, d2;
+	int d, d2, err;
 
 	/* Cannot change chunk_size, layout, or level */
 	if (mddev->chunk_size != mddev->new_chunk ||
@@ -2143,7 +2148,9 @@ static int raid1_reshape(mddev_t *mddev)
 		return -EINVAL;
 	}
 
-	md_allow_write(mddev);
+	err = md_allow_write(mddev);
+	if (err)
+		return err;
 
 	raid_disks = mddev->raid_disks + mddev->delta_disks;
 
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index a71277b640ab..df08a9fa3a1f 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c | |||
@@ -1113,24 +1113,30 @@ static int raid10_spare_active(mddev_t *mddev) | |||
1113 | static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) | 1113 | static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) |
1114 | { | 1114 | { |
1115 | conf_t *conf = mddev->private; | 1115 | conf_t *conf = mddev->private; |
1116 | int found = 0; | 1116 | int err = -EEXIST; |
1117 | int mirror; | 1117 | int mirror; |
1118 | mirror_info_t *p; | 1118 | mirror_info_t *p; |
1119 | int first = 0; | ||
1120 | int last = mddev->raid_disks - 1; | ||
1119 | 1121 | ||
1120 | if (mddev->recovery_cp < MaxSector) | 1122 | if (mddev->recovery_cp < MaxSector) |
1121 | /* only hot-add to in-sync arrays, as recovery is | 1123 | /* only hot-add to in-sync arrays, as recovery is |
1122 | * very different from resync | 1124 | * very different from resync |
1123 | */ | 1125 | */ |
1124 | return 0; | 1126 | return -EBUSY; |
1125 | if (!enough(conf)) | 1127 | if (!enough(conf)) |
1126 | return 0; | 1128 | return -EINVAL; |
1129 | |||
1130 | if (rdev->raid_disk >= 0) |||
1131 | first = last = rdev->raid_disk; | ||
1127 | 1132 | ||
1128 | if (rdev->saved_raid_disk >= 0 && | 1133 | if (rdev->saved_raid_disk >= 0 && |
1134 | rdev->saved_raid_disk >= first && | ||
1129 | conf->mirrors[rdev->saved_raid_disk].rdev == NULL) | 1135 | conf->mirrors[rdev->saved_raid_disk].rdev == NULL) |
1130 | mirror = rdev->saved_raid_disk; | 1136 | mirror = rdev->saved_raid_disk; |
1131 | else | 1137 | else |
1132 | mirror = 0; | 1138 | mirror = first; |
1133 | for ( ; mirror < mddev->raid_disks; mirror++) | 1139 | for ( ; mirror <= last ; mirror++) |
1134 | if ( !(p=conf->mirrors+mirror)->rdev) { | 1140 | if ( !(p=conf->mirrors+mirror)->rdev) { |
1135 | 1141 | ||
1136 | blk_queue_stack_limits(mddev->queue, | 1142 | blk_queue_stack_limits(mddev->queue, |
@@ -1145,7 +1151,7 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) | |||
1145 | 1151 | ||
1146 | p->head_position = 0; | 1152 | p->head_position = 0; |
1147 | rdev->raid_disk = mirror; | 1153 | rdev->raid_disk = mirror; |
1148 | found = 1; | 1154 | err = 0; |
1149 | if (rdev->saved_raid_disk != mirror) | 1155 | if (rdev->saved_raid_disk != mirror) |
1150 | conf->fullsync = 1; | 1156 | conf->fullsync = 1; |
1151 | rcu_assign_pointer(p->rdev, rdev); | 1157 | rcu_assign_pointer(p->rdev, rdev); |
@@ -1153,7 +1159,7 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) | |||
1153 | } | 1159 | } |
1154 | 1160 | ||
1155 | print_conf(conf); | 1161 | print_conf(conf); |
1156 | return found; | 1162 | return err; |
1157 | } | 1163 | } |
1158 | 1164 | ||
1159 | static int raid10_remove_disk(mddev_t *mddev, int number) | 1165 | static int raid10_remove_disk(mddev_t *mddev, int number) |
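The raid10 hunk keeps the same shape but adds two preconditions with distinct errnos: -EBUSY while recovery_cp is still below MaxSector, and -EINVAL when too few working devices remain. A sketch of how a caller might decode the new return values; classify() and its strings are hypothetical, only the errno values come from the hunk above.

#include <errno.h>
#include <stdio.h>

static const char *classify(int err)
{
	switch (err) {
	case 0:       return "device added";
	case -EBUSY:  return "array mid-recovery; hot-add refused";
	case -EINVAL: return "not enough working devices";
	case -EEXIST: return "requested slot(s) already occupied";
	default:      return "unexpected error";
	}
}

int main(void)
{
	const int errs[] = { 0, -EBUSY, -EINVAL, -EEXIST };
	unsigned int i;

	for (i = 0; i < sizeof(errs) / sizeof(errs[0]); i++)
		printf("%d: %s\n", errs[i], classify(errs[i]));
	return 0;
}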
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 3b27df52456b..8f4c70a53210 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c | |||
@@ -115,15 +115,20 @@ static void return_io(struct bio *return_bi) | |||
115 | return_bi = bi->bi_next; | 115 | return_bi = bi->bi_next; |
116 | bi->bi_next = NULL; | 116 | bi->bi_next = NULL; |
117 | bi->bi_size = 0; | 117 | bi->bi_size = 0; |
118 | bi->bi_end_io(bi, | 118 | bio_endio(bi, 0); |
119 | test_bit(BIO_UPTODATE, &bi->bi_flags) | ||
120 | ? 0 : -EIO); | ||
121 | bi = return_bi; | 119 | bi = return_bi; |
122 | } | 120 | } |
123 | } | 121 | } |
124 | 122 | ||
125 | static void print_raid5_conf (raid5_conf_t *conf); | 123 | static void print_raid5_conf (raid5_conf_t *conf); |
126 | 124 | ||
125 | static int stripe_operations_active(struct stripe_head *sh) | ||
126 | { | ||
127 | return sh->check_state || sh->reconstruct_state || | ||
128 | test_bit(STRIPE_BIOFILL_RUN, &sh->state) || | ||
129 | test_bit(STRIPE_COMPUTE_RUN, &sh->state); | ||
130 | } | ||
131 | |||
127 | static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh) | 132 | static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh) |
128 | { | 133 | { |
129 | if (atomic_dec_and_test(&sh->count)) { | 134 | if (atomic_dec_and_test(&sh->count)) { |
@@ -143,7 +148,7 @@ static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh) | |||
143 | } | 148 | } |
144 | md_wakeup_thread(conf->mddev->thread); | 149 | md_wakeup_thread(conf->mddev->thread); |
145 | } else { | 150 | } else { |
146 | BUG_ON(sh->ops.pending); | 151 | BUG_ON(stripe_operations_active(sh)); |
147 | if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { | 152 | if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { |
148 | atomic_dec(&conf->preread_active_stripes); | 153 | atomic_dec(&conf->preread_active_stripes); |
149 | if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) | 154 | if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) |
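stripe_operations_active() folds what used to be scattered tests on the ops.pending/ack/complete bitmasks into one predicate: a stripe is busy if either state machine is out of idle or one of the self-contained run bits is set. A compilable model with a deliberately simplified struct; only the tested fields are represented.

#include <stdbool.h>
#include <stdio.h>

enum { STRIPE_BIOFILL_RUN = 1 << 0, STRIPE_COMPUTE_RUN = 1 << 1 };

struct stripe {
	int check_state;	/* 0 means idle */
	int reconstruct_state;	/* 0 means idle */
	unsigned long state;	/* run bits for one-shot async ops */
};

static bool operations_active(const struct stripe *sh)
{
	return sh->check_state || sh->reconstruct_state ||
	       (sh->state & STRIPE_BIOFILL_RUN) ||
	       (sh->state & STRIPE_COMPUTE_RUN);
}

int main(void)
{
	struct stripe idle = { 0, 0, 0 };
	struct stripe busy = { 0, 0, STRIPE_COMPUTE_RUN };

	printf("idle: %d busy: %d\n", operations_active(&idle),
	       operations_active(&busy));
	return 0;
}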
@@ -245,7 +250,7 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx, int | |||
245 | 250 | ||
246 | BUG_ON(atomic_read(&sh->count) != 0); | 251 | BUG_ON(atomic_read(&sh->count) != 0); |
247 | BUG_ON(test_bit(STRIPE_HANDLE, &sh->state)); | 252 | BUG_ON(test_bit(STRIPE_HANDLE, &sh->state)); |
248 | BUG_ON(sh->ops.pending || sh->ops.ack || sh->ops.complete); | 253 | BUG_ON(stripe_operations_active(sh)); |
249 | 254 | ||
250 | CHECK_DEVLOCK(); | 255 | CHECK_DEVLOCK(); |
251 | pr_debug("init_stripe called, stripe %llu\n", | 256 | pr_debug("init_stripe called, stripe %llu\n", |
@@ -346,62 +351,18 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector | |||
346 | return sh; | 351 | return sh; |
347 | } | 352 | } |
348 | 353 | ||
349 | /* test_and_ack_op() ensures that we only dequeue an operation once */ | ||
350 | #define test_and_ack_op(op, pend) \ | ||
351 | do { \ | ||
352 | if (test_bit(op, &sh->ops.pending) && \ | ||
353 | !test_bit(op, &sh->ops.complete)) { \ | ||
354 | if (test_and_set_bit(op, &sh->ops.ack)) \ | ||
355 | clear_bit(op, &pend); \ | ||
356 | else \ | ||
357 | ack++; \ | ||
358 | } else \ | ||
359 | clear_bit(op, &pend); \ | ||
360 | } while (0) | ||
361 | |||
362 | /* find new work to run, do not resubmit work that is already | ||
363 | * in flight | ||
364 | */ | ||
365 | static unsigned long get_stripe_work(struct stripe_head *sh) | ||
366 | { | ||
367 | unsigned long pending; | ||
368 | int ack = 0; | ||
369 | |||
370 | pending = sh->ops.pending; | ||
371 | |||
372 | test_and_ack_op(STRIPE_OP_BIOFILL, pending); | ||
373 | test_and_ack_op(STRIPE_OP_COMPUTE_BLK, pending); | ||
374 | test_and_ack_op(STRIPE_OP_PREXOR, pending); | ||
375 | test_and_ack_op(STRIPE_OP_BIODRAIN, pending); | ||
376 | test_and_ack_op(STRIPE_OP_POSTXOR, pending); | ||
377 | test_and_ack_op(STRIPE_OP_CHECK, pending); | ||
378 | if (test_and_clear_bit(STRIPE_OP_IO, &sh->ops.pending)) | ||
379 | ack++; | ||
380 | |||
381 | sh->ops.count -= ack; | ||
382 | if (unlikely(sh->ops.count < 0)) { | ||
383 | printk(KERN_ERR "pending: %#lx ops.pending: %#lx ops.ack: %#lx " | ||
384 | "ops.complete: %#lx\n", pending, sh->ops.pending, | ||
385 | sh->ops.ack, sh->ops.complete); | ||
386 | BUG(); | ||
387 | } | ||
388 | |||
389 | return pending; | ||
390 | } | ||
391 | |||
392 | static void | 354 | static void |
393 | raid5_end_read_request(struct bio *bi, int error); | 355 | raid5_end_read_request(struct bio *bi, int error); |
394 | static void | 356 | static void |
395 | raid5_end_write_request(struct bio *bi, int error); | 357 | raid5_end_write_request(struct bio *bi, int error); |
396 | 358 | ||
397 | static void ops_run_io(struct stripe_head *sh) | 359 | static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) |
398 | { | 360 | { |
399 | raid5_conf_t *conf = sh->raid_conf; | 361 | raid5_conf_t *conf = sh->raid_conf; |
400 | int i, disks = sh->disks; | 362 | int i, disks = sh->disks; |
401 | 363 | ||
402 | might_sleep(); | 364 | might_sleep(); |
403 | 365 | ||
404 | set_bit(STRIPE_IO_STARTED, &sh->state); | ||
405 | for (i = disks; i--; ) { | 366 | for (i = disks; i--; ) { |
406 | int rw; | 367 | int rw; |
407 | struct bio *bi; | 368 | struct bio *bi; |
@@ -430,11 +391,11 @@ static void ops_run_io(struct stripe_head *sh) | |||
430 | rcu_read_unlock(); | 391 | rcu_read_unlock(); |
431 | 392 | ||
432 | if (rdev) { | 393 | if (rdev) { |
433 | if (test_bit(STRIPE_SYNCING, &sh->state) || | 394 | if (s->syncing || s->expanding || s->expanded) |
434 | test_bit(STRIPE_EXPAND_SOURCE, &sh->state) || | ||
435 | test_bit(STRIPE_EXPAND_READY, &sh->state)) | ||
436 | md_sync_acct(rdev->bdev, STRIPE_SECTORS); | 395 | md_sync_acct(rdev->bdev, STRIPE_SECTORS); |
437 | 396 | ||
397 | set_bit(STRIPE_IO_STARTED, &sh->state); | ||
398 | |||
438 | bi->bi_bdev = rdev->bdev; | 399 | bi->bi_bdev = rdev->bdev; |
439 | pr_debug("%s: for %llu schedule op %ld on disc %d\n", | 400 | pr_debug("%s: for %llu schedule op %ld on disc %d\n", |
440 | __func__, (unsigned long long)sh->sector, | 401 | __func__, (unsigned long long)sh->sector, |
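get_stripe_work() and its test_and_ack_op() bookkeeping can be deleted because the request word is no longer persistent: each handle_stripe pass rebuilds it from scratch and raid5_run_ops consumes it once, so there is nothing left to acknowledge or dequeue. A toy illustration of that per-pass contract; all names here are illustrative.

#include <stdio.h>

enum { OP_BIOFILL = 1 << 0, OP_COMPUTE = 1 << 1, OP_POSTXOR = 1 << 2 };

static unsigned long plan_pass(int need_fill, int need_compute)
{
	unsigned long ops_request = 0;	/* starts empty on every pass */

	if (need_fill)
		ops_request |= OP_BIOFILL;
	if (need_compute)
		ops_request |= OP_COMPUTE | OP_POSTXOR;
	return ops_request;		/* consumed once, then forgotten */
}

int main(void)
{
	printf("pass 1: %#lx\n", plan_pass(1, 0));
	printf("pass 2: %#lx\n", plan_pass(0, 1));
	return 0;
}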
@@ -528,38 +489,34 @@ static void ops_complete_biofill(void *stripe_head_ref) | |||
528 | (unsigned long long)sh->sector); | 489 | (unsigned long long)sh->sector); |
529 | 490 | ||
530 | /* clear completed biofills */ | 491 | /* clear completed biofills */ |
492 | spin_lock_irq(&conf->device_lock); | ||
531 | for (i = sh->disks; i--; ) { | 493 | for (i = sh->disks; i--; ) { |
532 | struct r5dev *dev = &sh->dev[i]; | 494 | struct r5dev *dev = &sh->dev[i]; |
533 | 495 | ||
534 | /* acknowledge completion of a biofill operation */ | 496 | /* acknowledge completion of a biofill operation */ |
535 | /* and check if we need to reply to a read request, | 497 | /* and check if we need to reply to a read request, |
536 | * new R5_Wantfill requests are held off until | 498 | * new R5_Wantfill requests are held off until |
537 | * !test_bit(STRIPE_OP_BIOFILL, &sh->ops.pending) | 499 | * !STRIPE_BIOFILL_RUN |
538 | */ | 500 | */ |
539 | if (test_and_clear_bit(R5_Wantfill, &dev->flags)) { | 501 | if (test_and_clear_bit(R5_Wantfill, &dev->flags)) { |
540 | struct bio *rbi, *rbi2; | 502 | struct bio *rbi, *rbi2; |
541 | 503 | ||
542 | /* The access to dev->read is outside of the | ||
543 | * spin_lock_irq(&conf->device_lock), but is protected | ||
544 | * by the STRIPE_OP_BIOFILL pending bit | ||
545 | */ | ||
546 | BUG_ON(!dev->read); | 504 | BUG_ON(!dev->read); |
547 | rbi = dev->read; | 505 | rbi = dev->read; |
548 | dev->read = NULL; | 506 | dev->read = NULL; |
549 | while (rbi && rbi->bi_sector < | 507 | while (rbi && rbi->bi_sector < |
550 | dev->sector + STRIPE_SECTORS) { | 508 | dev->sector + STRIPE_SECTORS) { |
551 | rbi2 = r5_next_bio(rbi, dev->sector); | 509 | rbi2 = r5_next_bio(rbi, dev->sector); |
552 | spin_lock_irq(&conf->device_lock); | ||
553 | if (--rbi->bi_phys_segments == 0) { | 510 | if (--rbi->bi_phys_segments == 0) { |
554 | rbi->bi_next = return_bi; | 511 | rbi->bi_next = return_bi; |
555 | return_bi = rbi; | 512 | return_bi = rbi; |
556 | } | 513 | } |
557 | spin_unlock_irq(&conf->device_lock); | ||
558 | rbi = rbi2; | 514 | rbi = rbi2; |
559 | } | 515 | } |
560 | } | 516 | } |
561 | } | 517 | } |
562 | set_bit(STRIPE_OP_BIOFILL, &sh->ops.complete); | 518 | spin_unlock_irq(&conf->device_lock); |
519 | clear_bit(STRIPE_BIOFILL_RUN, &sh->state); | ||
563 | 520 | ||
564 | return_io(return_bi); | 521 | return_io(return_bi); |
565 | 522 | ||
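The locking change in ops_complete_biofill is a straight hoist: instead of taking device_lock once per completed bio, the whole scan runs under a single lock/unlock pair, which is safe now that the STRIPE_BIOFILL_RUN bit keeps new Wantfill requests out while the completion runs. A minimal sketch with a pthread mutex standing in for the kernel spinlock (build with -lpthread):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t device_lock = PTHREAD_MUTEX_INITIALIZER;

static void complete_biofills(int *segments, int ndevs)
{
	int i;

	pthread_mutex_lock(&device_lock);	/* one round-trip ... */
	for (i = 0; i < ndevs; i++)
		if (--segments[i] == 0)		/* ... covers every device */
			printf("dev %d: bio can be returned\n", i);
	pthread_mutex_unlock(&device_lock);
}

int main(void)
{
	int segments[3] = { 1, 2, 1 };

	complete_biofills(segments, 3);
	return 0;
}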
@@ -610,13 +567,14 @@ static void ops_complete_compute5(void *stripe_head_ref) | |||
610 | set_bit(R5_UPTODATE, &tgt->flags); | 567 | set_bit(R5_UPTODATE, &tgt->flags); |
611 | BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); | 568 | BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); |
612 | clear_bit(R5_Wantcompute, &tgt->flags); | 569 | clear_bit(R5_Wantcompute, &tgt->flags); |
613 | set_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete); | 570 | clear_bit(STRIPE_COMPUTE_RUN, &sh->state); |
571 | if (sh->check_state == check_state_compute_run) | ||
572 | sh->check_state = check_state_compute_result; | ||
614 | set_bit(STRIPE_HANDLE, &sh->state); | 573 | set_bit(STRIPE_HANDLE, &sh->state); |
615 | release_stripe(sh); | 574 | release_stripe(sh); |
616 | } | 575 | } |
617 | 576 | ||
618 | static struct dma_async_tx_descriptor * | 577 | static struct dma_async_tx_descriptor *ops_run_compute5(struct stripe_head *sh) |
619 | ops_run_compute5(struct stripe_head *sh, unsigned long pending) | ||
620 | { | 578 | { |
621 | /* kernel stack size limits the total number of disks */ | 579 | /* kernel stack size limits the total number of disks */ |
622 | int disks = sh->disks; | 580 | int disks = sh->disks; |
@@ -646,10 +604,6 @@ ops_run_compute5(struct stripe_head *sh, unsigned long pending) | |||
646 | ASYNC_TX_XOR_ZERO_DST, NULL, | 604 | ASYNC_TX_XOR_ZERO_DST, NULL, |
647 | ops_complete_compute5, sh); | 605 | ops_complete_compute5, sh); |
648 | 606 | ||
649 | /* ack now if postxor is not set to be run */ | ||
650 | if (tx && !test_bit(STRIPE_OP_POSTXOR, &pending)) | ||
651 | async_tx_ack(tx); | ||
652 | |||
653 | return tx; | 607 | return tx; |
654 | } | 608 | } |
655 | 609 | ||
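ops_complete_compute5 now advances check_state only when the compute was started on behalf of a parity repair; a compute issued to satisfy a read leaves the check machine untouched. A model of that conditional handoff; the enum names mirror the patch, the rest is scaffolding.

#include <stdio.h>

enum check_states {
	check_state_idle,
	check_state_compute_run,
	check_state_compute_result,
};

static enum check_states on_compute_done(enum check_states cs)
{
	if (cs == check_state_compute_run)	/* repair-driven compute */
		return check_state_compute_result;
	return cs;				/* read-driven: no change */
}

int main(void)
{
	printf("repair: %d\n", on_compute_done(check_state_compute_run));
	printf("read:   %d\n", on_compute_done(check_state_idle));
	return 0;
}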
@@ -659,8 +613,6 @@ static void ops_complete_prexor(void *stripe_head_ref) | |||
659 | 613 | ||
660 | pr_debug("%s: stripe %llu\n", __func__, | 614 | pr_debug("%s: stripe %llu\n", __func__, |
661 | (unsigned long long)sh->sector); | 615 | (unsigned long long)sh->sector); |
662 | |||
663 | set_bit(STRIPE_OP_PREXOR, &sh->ops.complete); | ||
664 | } | 616 | } |
665 | 617 | ||
666 | static struct dma_async_tx_descriptor * | 618 | static struct dma_async_tx_descriptor * |
@@ -680,7 +632,7 @@ ops_run_prexor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) | |||
680 | for (i = disks; i--; ) { | 632 | for (i = disks; i--; ) { |
681 | struct r5dev *dev = &sh->dev[i]; | 633 | struct r5dev *dev = &sh->dev[i]; |
682 | /* Only process blocks that are known to be uptodate */ | 634 | /* Only process blocks that are known to be uptodate */ |
683 | if (dev->towrite && test_bit(R5_Wantprexor, &dev->flags)) | 635 | if (test_bit(R5_Wantdrain, &dev->flags)) |
684 | xor_srcs[count++] = dev->page; | 636 | xor_srcs[count++] = dev->page; |
685 | } | 637 | } |
686 | 638 | ||
@@ -692,16 +644,10 @@ ops_run_prexor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) | |||
692 | } | 644 | } |
693 | 645 | ||
694 | static struct dma_async_tx_descriptor * | 646 | static struct dma_async_tx_descriptor * |
695 | ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx, | 647 | ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) |
696 | unsigned long pending) | ||
697 | { | 648 | { |
698 | int disks = sh->disks; | 649 | int disks = sh->disks; |
699 | int pd_idx = sh->pd_idx, i; | 650 | int i; |
700 | |||
701 | /* check if prexor is active which means only process blocks | ||
702 | * that are part of a read-modify-write (Wantprexor) | ||
703 | */ | ||
704 | int prexor = test_bit(STRIPE_OP_PREXOR, &pending); | ||
705 | 651 | ||
706 | pr_debug("%s: stripe %llu\n", __func__, | 652 | pr_debug("%s: stripe %llu\n", __func__, |
707 | (unsigned long long)sh->sector); | 653 | (unsigned long long)sh->sector); |
@@ -709,20 +655,8 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx, | |||
709 | for (i = disks; i--; ) { | 655 | for (i = disks; i--; ) { |
710 | struct r5dev *dev = &sh->dev[i]; | 656 | struct r5dev *dev = &sh->dev[i]; |
711 | struct bio *chosen; | 657 | struct bio *chosen; |
712 | int towrite; | ||
713 | |||
714 | towrite = 0; | ||
715 | if (prexor) { /* rmw */ | ||
716 | if (dev->towrite && | ||
717 | test_bit(R5_Wantprexor, &dev->flags)) | ||
718 | towrite = 1; | ||
719 | } else { /* rcw */ | ||
720 | if (i != pd_idx && dev->towrite && | ||
721 | test_bit(R5_LOCKED, &dev->flags)) | ||
722 | towrite = 1; | ||
723 | } | ||
724 | 658 | ||
725 | if (towrite) { | 659 | if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) { |
726 | struct bio *wbi; | 660 | struct bio *wbi; |
727 | 661 | ||
728 | spin_lock(&sh->lock); | 662 | spin_lock(&sh->lock); |
@@ -747,18 +681,6 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx, | |||
747 | static void ops_complete_postxor(void *stripe_head_ref) | 681 | static void ops_complete_postxor(void *stripe_head_ref) |
748 | { | 682 | { |
749 | struct stripe_head *sh = stripe_head_ref; | 683 | struct stripe_head *sh = stripe_head_ref; |
750 | |||
751 | pr_debug("%s: stripe %llu\n", __func__, | ||
752 | (unsigned long long)sh->sector); | ||
753 | |||
754 | set_bit(STRIPE_OP_POSTXOR, &sh->ops.complete); | ||
755 | set_bit(STRIPE_HANDLE, &sh->state); | ||
756 | release_stripe(sh); | ||
757 | } | ||
758 | |||
759 | static void ops_complete_write(void *stripe_head_ref) | ||
760 | { | ||
761 | struct stripe_head *sh = stripe_head_ref; | ||
762 | int disks = sh->disks, i, pd_idx = sh->pd_idx; | 684 | int disks = sh->disks, i, pd_idx = sh->pd_idx; |
763 | 685 | ||
764 | pr_debug("%s: stripe %llu\n", __func__, | 686 | pr_debug("%s: stripe %llu\n", __func__, |
@@ -770,16 +692,21 @@ static void ops_complete_write(void *stripe_head_ref) | |||
770 | set_bit(R5_UPTODATE, &dev->flags); | 692 | set_bit(R5_UPTODATE, &dev->flags); |
771 | } | 693 | } |
772 | 694 | ||
773 | set_bit(STRIPE_OP_BIODRAIN, &sh->ops.complete); | 695 | if (sh->reconstruct_state == reconstruct_state_drain_run) |
774 | set_bit(STRIPE_OP_POSTXOR, &sh->ops.complete); | 696 | sh->reconstruct_state = reconstruct_state_drain_result; |
697 | else if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) | ||
698 | sh->reconstruct_state = reconstruct_state_prexor_drain_result; | ||
699 | else { | ||
700 | BUG_ON(sh->reconstruct_state != reconstruct_state_run); | ||
701 | sh->reconstruct_state = reconstruct_state_result; | ||
702 | } | ||
775 | 703 | ||
776 | set_bit(STRIPE_HANDLE, &sh->state); | 704 | set_bit(STRIPE_HANDLE, &sh->state); |
777 | release_stripe(sh); | 705 | release_stripe(sh); |
778 | } | 706 | } |
779 | 707 | ||
780 | static void | 708 | static void |
781 | ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx, | 709 | ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) |
782 | unsigned long pending) | ||
783 | { | 710 | { |
784 | /* kernel stack size limits the total number of disks */ | 711 | /* kernel stack size limits the total number of disks */ |
785 | int disks = sh->disks; | 712 | int disks = sh->disks; |
@@ -787,9 +714,8 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx, | |||
787 | 714 | ||
788 | int count = 0, pd_idx = sh->pd_idx, i; | 715 | int count = 0, pd_idx = sh->pd_idx, i; |
789 | struct page *xor_dest; | 716 | struct page *xor_dest; |
790 | int prexor = test_bit(STRIPE_OP_PREXOR, &pending); | 717 | int prexor = 0; |
791 | unsigned long flags; | 718 | unsigned long flags; |
792 | dma_async_tx_callback callback; | ||
793 | 719 | ||
794 | pr_debug("%s: stripe %llu\n", __func__, | 720 | pr_debug("%s: stripe %llu\n", __func__, |
795 | (unsigned long long)sh->sector); | 721 | (unsigned long long)sh->sector); |
@@ -797,7 +723,8 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx, | |||
797 | /* check if prexor is active which means only process blocks | 723 | /* check if prexor is active which means only process blocks |
798 | * that are part of a read-modify-write (written) | 724 | * that are part of a read-modify-write (written) |
799 | */ | 725 | */ |
800 | if (prexor) { | 726 | if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) { |
727 | prexor = 1; | ||
801 | xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; | 728 | xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; |
802 | for (i = disks; i--; ) { | 729 | for (i = disks; i--; ) { |
803 | struct r5dev *dev = &sh->dev[i]; | 730 | struct r5dev *dev = &sh->dev[i]; |
@@ -813,10 +740,6 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx, | |||
813 | } | 740 | } |
814 | } | 741 | } |
815 | 742 | ||
816 | /* check whether this postxor is part of a write */ | ||
817 | callback = test_bit(STRIPE_OP_BIODRAIN, &pending) ? | ||
818 | ops_complete_write : ops_complete_postxor; | ||
819 | |||
820 | /* 1/ if we prexor'd then the dest is reused as a source | 743 | /* 1/ if we prexor'd then the dest is reused as a source |
821 | * 2/ if we did not prexor then we are redoing the parity | 744 | * 2/ if we did not prexor then we are redoing the parity |
822 | * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST | 745 | * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST |
@@ -830,25 +753,20 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx, | |||
830 | if (unlikely(count == 1)) { | 753 | if (unlikely(count == 1)) { |
831 | flags &= ~(ASYNC_TX_XOR_DROP_DST | ASYNC_TX_XOR_ZERO_DST); | 754 | flags &= ~(ASYNC_TX_XOR_DROP_DST | ASYNC_TX_XOR_ZERO_DST); |
832 | tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, | 755 | tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, |
833 | flags, tx, callback, sh); | 756 | flags, tx, ops_complete_postxor, sh); |
834 | } else | 757 | } else |
835 | tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, | 758 | tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, |
836 | flags, tx, callback, sh); | 759 | flags, tx, ops_complete_postxor, sh); |
837 | } | 760 | } |
838 | 761 | ||
839 | static void ops_complete_check(void *stripe_head_ref) | 762 | static void ops_complete_check(void *stripe_head_ref) |
840 | { | 763 | { |
841 | struct stripe_head *sh = stripe_head_ref; | 764 | struct stripe_head *sh = stripe_head_ref; |
842 | int pd_idx = sh->pd_idx; | ||
843 | 765 | ||
844 | pr_debug("%s: stripe %llu\n", __func__, | 766 | pr_debug("%s: stripe %llu\n", __func__, |
845 | (unsigned long long)sh->sector); | 767 | (unsigned long long)sh->sector); |
846 | 768 | ||
847 | if (test_and_clear_bit(STRIPE_OP_MOD_DMA_CHECK, &sh->ops.pending) && | 769 | sh->check_state = check_state_check_result; |
848 | sh->ops.zero_sum_result == 0) | ||
849 | set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); | ||
850 | |||
851 | set_bit(STRIPE_OP_CHECK, &sh->ops.complete); | ||
852 | set_bit(STRIPE_HANDLE, &sh->state); | 770 | set_bit(STRIPE_HANDLE, &sh->state); |
853 | release_stripe(sh); | 771 | release_stripe(sh); |
854 | } | 772 | } |
@@ -875,46 +793,42 @@ static void ops_run_check(struct stripe_head *sh) | |||
875 | tx = async_xor_zero_sum(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, | 793 | tx = async_xor_zero_sum(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, |
876 | &sh->ops.zero_sum_result, 0, NULL, NULL, NULL); | 794 | &sh->ops.zero_sum_result, 0, NULL, NULL, NULL); |
877 | 795 | ||
878 | if (tx) | ||
879 | set_bit(STRIPE_OP_MOD_DMA_CHECK, &sh->ops.pending); | ||
880 | else | ||
881 | clear_bit(STRIPE_OP_MOD_DMA_CHECK, &sh->ops.pending); | ||
882 | |||
883 | atomic_inc(&sh->count); | 796 | atomic_inc(&sh->count); |
884 | tx = async_trigger_callback(ASYNC_TX_DEP_ACK | ASYNC_TX_ACK, tx, | 797 | tx = async_trigger_callback(ASYNC_TX_DEP_ACK | ASYNC_TX_ACK, tx, |
885 | ops_complete_check, sh); | 798 | ops_complete_check, sh); |
886 | } | 799 | } |
887 | 800 | ||
888 | static void raid5_run_ops(struct stripe_head *sh, unsigned long pending) | 801 | static void raid5_run_ops(struct stripe_head *sh, unsigned long ops_request) |
889 | { | 802 | { |
890 | int overlap_clear = 0, i, disks = sh->disks; | 803 | int overlap_clear = 0, i, disks = sh->disks; |
891 | struct dma_async_tx_descriptor *tx = NULL; | 804 | struct dma_async_tx_descriptor *tx = NULL; |
892 | 805 | ||
893 | if (test_bit(STRIPE_OP_BIOFILL, &pending)) { | 806 | if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) { |
894 | ops_run_biofill(sh); | 807 | ops_run_biofill(sh); |
895 | overlap_clear++; | 808 | overlap_clear++; |
896 | } | 809 | } |
897 | 810 | ||
898 | if (test_bit(STRIPE_OP_COMPUTE_BLK, &pending)) | 811 | if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) { |
899 | tx = ops_run_compute5(sh, pending); | 812 | tx = ops_run_compute5(sh); |
813 | /* terminate the chain if postxor is not set to be run */ | ||
814 | if (tx && !test_bit(STRIPE_OP_POSTXOR, &ops_request)) | ||
815 | async_tx_ack(tx); | ||
816 | } | ||
900 | 817 | ||
901 | if (test_bit(STRIPE_OP_PREXOR, &pending)) | 818 | if (test_bit(STRIPE_OP_PREXOR, &ops_request)) |
902 | tx = ops_run_prexor(sh, tx); | 819 | tx = ops_run_prexor(sh, tx); |
903 | 820 | ||
904 | if (test_bit(STRIPE_OP_BIODRAIN, &pending)) { | 821 | if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) { |
905 | tx = ops_run_biodrain(sh, tx, pending); | 822 | tx = ops_run_biodrain(sh, tx); |
906 | overlap_clear++; | 823 | overlap_clear++; |
907 | } | 824 | } |
908 | 825 | ||
909 | if (test_bit(STRIPE_OP_POSTXOR, &pending)) | 826 | if (test_bit(STRIPE_OP_POSTXOR, &ops_request)) |
910 | ops_run_postxor(sh, tx, pending); | 827 | ops_run_postxor(sh, tx); |
911 | 828 | ||
912 | if (test_bit(STRIPE_OP_CHECK, &pending)) | 829 | if (test_bit(STRIPE_OP_CHECK, &ops_request)) |
913 | ops_run_check(sh); | 830 | ops_run_check(sh); |
914 | 831 | ||
915 | if (test_bit(STRIPE_OP_IO, &pending)) | ||
916 | ops_run_io(sh); | ||
917 | |||
918 | if (overlap_clear) | 832 | if (overlap_clear) |
919 | for (i = disks; i--; ) { | 833 | for (i = disks; i--; ) { |
920 | struct r5dev *dev = &sh->dev[i]; | 834 | struct r5dev *dev = &sh->dev[i]; |
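After this hunk, raid5_run_ops is a pure dispatcher over the per-pass ops_request word, in a fixed order, and the STRIPE_OP_IO leg is gone because I/O submission moved to ops_run_io at the end of handle_stripe. A userspace model of the dispatch order; "tx" chaining is reduced to the early-ack decision, with no real DMA descriptors.

#include <stdio.h>

enum { OP_BIOFILL = 1, OP_COMPUTE = 2, OP_PREXOR = 4,
       OP_BIODRAIN = 8, OP_POSTXOR = 16, OP_CHECK = 32 };

static void run_ops(unsigned long req)
{
	if (req & OP_BIOFILL)
		printf("biofill\n");
	if (req & OP_COMPUTE)
		/* terminate the chain if no postxor will consume it */
		printf((req & OP_POSTXOR) ? "compute (chained)\n"
					  : "compute (acked)\n");
	if (req & OP_PREXOR)
		printf("prexor\n");
	if (req & OP_BIODRAIN)
		printf("biodrain\n");
	if (req & OP_POSTXOR)
		printf("postxor\n");
	if (req & OP_CHECK)
		printf("check\n");
	/* no OP_IO case any more: I/O is submitted separately */
}

int main(void)
{
	run_ops(OP_PREXOR | OP_BIODRAIN | OP_POSTXOR);
	return 0;
}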
@@ -997,14 +911,16 @@ static int resize_stripes(raid5_conf_t *conf, int newsize) | |||
997 | struct stripe_head *osh, *nsh; | 911 | struct stripe_head *osh, *nsh; |
998 | LIST_HEAD(newstripes); | 912 | LIST_HEAD(newstripes); |
999 | struct disk_info *ndisks; | 913 | struct disk_info *ndisks; |
1000 | int err = 0; | 914 | int err; |
1001 | struct kmem_cache *sc; | 915 | struct kmem_cache *sc; |
1002 | int i; | 916 | int i; |
1003 | 917 | ||
1004 | if (newsize <= conf->pool_size) | 918 | if (newsize <= conf->pool_size) |
1005 | return 0; /* never bother to shrink */ | 919 | return 0; /* never bother to shrink */ |
1006 | 920 | ||
1007 | md_allow_write(conf->mddev); | 921 | err = md_allow_write(conf->mddev); |
922 | if (err) | ||
923 | return err; | ||
1008 | 924 | ||
1009 | /* Step 1 */ | 925 | /* Step 1 */ |
1010 | sc = kmem_cache_create(conf->cache_name[1-conf->active_name], | 926 | sc = kmem_cache_create(conf->cache_name[1-conf->active_name], |
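resize_stripes, like raid1_reshape earlier in this patch, must now check md_allow_write's return value instead of assuming the array became writable. A userspace stand-in for the new caller contract; the -EAGAIN value is only an illustrative errno, not taken from this diff.

#include <errno.h>
#include <stdio.h>

static int md_allow_write_model(int metadata_ro)
{
	return metadata_ro ? -EAGAIN : 0;
}

static int resize_model(int metadata_ro)
{
	int err = md_allow_write_model(metadata_ro);

	if (err)
		return err;	/* propagate instead of pressing on */
	/* ... grow the stripe cache here ... */
	return 0;
}

int main(void)
{
	printf("writable array:     %d\n", resize_model(0));
	printf("read-only metadata: %d\n", resize_model(1));
	return 0;
}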
@@ -1703,11 +1619,11 @@ static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2) | |||
1703 | } | 1619 | } |
1704 | } | 1620 | } |
1705 | 1621 | ||
1706 | static int | 1622 | static void |
1707 | handle_write_operations5(struct stripe_head *sh, int rcw, int expand) | 1623 | schedule_reconstruction5(struct stripe_head *sh, struct stripe_head_state *s, |
1624 | int rcw, int expand) | ||
1708 | { | 1625 | { |
1709 | int i, pd_idx = sh->pd_idx, disks = sh->disks; | 1626 | int i, pd_idx = sh->pd_idx, disks = sh->disks; |
1710 | int locked = 0; | ||
1711 | 1627 | ||
1712 | if (rcw) { | 1628 | if (rcw) { |
1713 | /* if we are not expanding this is a proper write request, and | 1629 | /* if we are not expanding this is a proper write request, and |
@@ -1715,53 +1631,48 @@ handle_write_operations5(struct stripe_head *sh, int rcw, int expand) | |||
1715 | * stripe cache | 1631 | * stripe cache |
1716 | */ | 1632 | */ |
1717 | if (!expand) { | 1633 | if (!expand) { |
1718 | set_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending); | 1634 | sh->reconstruct_state = reconstruct_state_drain_run; |
1719 | sh->ops.count++; | 1635 | set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); |
1720 | } | 1636 | } else |
1637 | sh->reconstruct_state = reconstruct_state_run; | ||
1721 | 1638 | ||
1722 | set_bit(STRIPE_OP_POSTXOR, &sh->ops.pending); | 1639 | set_bit(STRIPE_OP_POSTXOR, &s->ops_request); |
1723 | sh->ops.count++; | ||
1724 | 1640 | ||
1725 | for (i = disks; i--; ) { | 1641 | for (i = disks; i--; ) { |
1726 | struct r5dev *dev = &sh->dev[i]; | 1642 | struct r5dev *dev = &sh->dev[i]; |
1727 | 1643 | ||
1728 | if (dev->towrite) { | 1644 | if (dev->towrite) { |
1729 | set_bit(R5_LOCKED, &dev->flags); | 1645 | set_bit(R5_LOCKED, &dev->flags); |
1646 | set_bit(R5_Wantdrain, &dev->flags); | ||
1730 | if (!expand) | 1647 | if (!expand) |
1731 | clear_bit(R5_UPTODATE, &dev->flags); | 1648 | clear_bit(R5_UPTODATE, &dev->flags); |
1732 | locked++; | 1649 | s->locked++; |
1733 | } | 1650 | } |
1734 | } | 1651 | } |
1735 | if (locked + 1 == disks) | 1652 | if (s->locked + 1 == disks) |
1736 | if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state)) | 1653 | if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state)) |
1737 | atomic_inc(&sh->raid_conf->pending_full_writes); | 1654 | atomic_inc(&sh->raid_conf->pending_full_writes); |
1738 | } else { | 1655 | } else { |
1739 | BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) || | 1656 | BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) || |
1740 | test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags))); | 1657 | test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags))); |
1741 | 1658 | ||
1742 | set_bit(STRIPE_OP_PREXOR, &sh->ops.pending); | 1659 | sh->reconstruct_state = reconstruct_state_prexor_drain_run; |
1743 | set_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending); | 1660 | set_bit(STRIPE_OP_PREXOR, &s->ops_request); |
1744 | set_bit(STRIPE_OP_POSTXOR, &sh->ops.pending); | 1661 | set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); |
1745 | 1662 | set_bit(STRIPE_OP_POSTXOR, &s->ops_request); | |
1746 | sh->ops.count += 3; | ||
1747 | 1663 | ||
1748 | for (i = disks; i--; ) { | 1664 | for (i = disks; i--; ) { |
1749 | struct r5dev *dev = &sh->dev[i]; | 1665 | struct r5dev *dev = &sh->dev[i]; |
1750 | if (i == pd_idx) | 1666 | if (i == pd_idx) |
1751 | continue; | 1667 | continue; |
1752 | 1668 | ||
1753 | /* For a read-modify write there may be blocks that are | ||
1754 | * locked for reading while others are ready to be | ||
1755 | * written so we distinguish these blocks by the | ||
1756 | * R5_Wantprexor bit | ||
1757 | */ | ||
1758 | if (dev->towrite && | 1669 | if (dev->towrite && |
1759 | (test_bit(R5_UPTODATE, &dev->flags) || | 1670 | (test_bit(R5_UPTODATE, &dev->flags) || |
1760 | test_bit(R5_Wantcompute, &dev->flags))) { | 1671 | test_bit(R5_Wantcompute, &dev->flags))) { |
1761 | set_bit(R5_Wantprexor, &dev->flags); | 1672 | set_bit(R5_Wantdrain, &dev->flags); |
1762 | set_bit(R5_LOCKED, &dev->flags); | 1673 | set_bit(R5_LOCKED, &dev->flags); |
1763 | clear_bit(R5_UPTODATE, &dev->flags); | 1674 | clear_bit(R5_UPTODATE, &dev->flags); |
1764 | locked++; | 1675 | s->locked++; |
1765 | } | 1676 | } |
1766 | } | 1677 | } |
1767 | } | 1678 | } |
@@ -1771,13 +1682,11 @@ handle_write_operations5(struct stripe_head *sh, int rcw, int expand) | |||
1771 | */ | 1682 | */ |
1772 | set_bit(R5_LOCKED, &sh->dev[pd_idx].flags); | 1683 | set_bit(R5_LOCKED, &sh->dev[pd_idx].flags); |
1773 | clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); | 1684 | clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); |
1774 | locked++; | 1685 | s->locked++; |
1775 | 1686 | ||
1776 | pr_debug("%s: stripe %llu locked: %d pending: %lx\n", | 1687 | pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n", |
1777 | __func__, (unsigned long long)sh->sector, | 1688 | __func__, (unsigned long long)sh->sector, |
1778 | locked, sh->ops.pending); | 1689 | s->locked, s->ops_request); |
1779 | |||
1780 | return locked; | ||
1781 | } | 1690 | } |
1782 | 1691 | ||
1783 | /* | 1692 | /* |
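schedule_reconstruction5 replaces the three pending bits with a single starting state: rcw (reconstruct-write) enters drain_run unless it is an expand, while rmw (read-modify-write) always enters prexor_drain_run, and the locked count is now tallied in the caller-supplied stripe_head_state. A compact model of the state choice; names follow the patch, with the inputs reduced to two booleans.

#include <stdio.h>

enum recon {
	reconstruct_state_idle,
	reconstruct_state_run,
	reconstruct_state_drain_run,
	reconstruct_state_prexor_drain_run,
};

static enum recon schedule(int rcw, int expand)
{
	if (rcw)
		return expand ? reconstruct_state_run
			      : reconstruct_state_drain_run;
	return reconstruct_state_prexor_drain_run;
}

int main(void)
{
	printf("rcw write:  %d\n", schedule(1, 0));
	printf("rcw expand: %d\n", schedule(1, 1));
	printf("rmw write:  %d\n", schedule(0, 0));
	return 0;
}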
@@ -1876,7 +1785,7 @@ static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int disks) | |||
1876 | } | 1785 | } |
1877 | 1786 | ||
1878 | static void | 1787 | static void |
1879 | handle_requests_to_failed_array(raid5_conf_t *conf, struct stripe_head *sh, | 1788 | handle_failed_stripe(raid5_conf_t *conf, struct stripe_head *sh, |
1880 | struct stripe_head_state *s, int disks, | 1789 | struct stripe_head_state *s, int disks, |
1881 | struct bio **return_bi) | 1790 | struct bio **return_bi) |
1882 | { | 1791 | { |
@@ -1967,48 +1876,38 @@ handle_requests_to_failed_array(raid5_conf_t *conf, struct stripe_head *sh, | |||
1967 | md_wakeup_thread(conf->mddev->thread); | 1876 | md_wakeup_thread(conf->mddev->thread); |
1968 | } | 1877 | } |
1969 | 1878 | ||
1970 | /* __handle_issuing_new_read_requests5 - returns 0 if there are no more disks | 1879 | /* fetch_block5 - checks the given member device to see if its data needs |
1971 | * to process | 1880 | * to be read or computed to satisfy a request. |
1881 | * | ||
1882 | * Returns 1 when no more member devices need to be checked, otherwise returns | ||
1883 | * 0 to tell the loop in handle_stripe_fill5 to continue | ||
1972 | */ | 1884 | */ |
1973 | static int __handle_issuing_new_read_requests5(struct stripe_head *sh, | 1885 | static int fetch_block5(struct stripe_head *sh, struct stripe_head_state *s, |
1974 | struct stripe_head_state *s, int disk_idx, int disks) | 1886 | int disk_idx, int disks) |
1975 | { | 1887 | { |
1976 | struct r5dev *dev = &sh->dev[disk_idx]; | 1888 | struct r5dev *dev = &sh->dev[disk_idx]; |
1977 | struct r5dev *failed_dev = &sh->dev[s->failed_num]; | 1889 | struct r5dev *failed_dev = &sh->dev[s->failed_num]; |
1978 | 1890 | ||
1979 | /* don't schedule compute operations or reads on the parity block while | ||
1980 | * a check is in flight | ||
1981 | */ | ||
1982 | if ((disk_idx == sh->pd_idx) && | ||
1983 | test_bit(STRIPE_OP_CHECK, &sh->ops.pending)) | ||
1984 | return ~0; | ||
1985 | |||
1986 | /* is the data in this block needed, and can we get it? */ | 1891 | /* is the data in this block needed, and can we get it? */ |
1987 | if (!test_bit(R5_LOCKED, &dev->flags) && | 1892 | if (!test_bit(R5_LOCKED, &dev->flags) && |
1988 | !test_bit(R5_UPTODATE, &dev->flags) && (dev->toread || | 1893 | !test_bit(R5_UPTODATE, &dev->flags) && |
1989 | (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || | 1894 | (dev->toread || |
1990 | s->syncing || s->expanding || (s->failed && | 1895 | (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || |
1991 | (failed_dev->toread || (failed_dev->towrite && | 1896 | s->syncing || s->expanding || |
1992 | !test_bit(R5_OVERWRITE, &failed_dev->flags) | 1897 | (s->failed && |
1993 | ))))) { | 1898 | (failed_dev->toread || |
1994 | /* 1/ We would like to get this block, possibly by computing it, | 1899 | (failed_dev->towrite && |
1995 | * but we might not be able to. | 1900 | !test_bit(R5_OVERWRITE, &failed_dev->flags)))))) { |
1996 | * | 1901 | /* We would like to get this block, possibly by computing it, |
1997 | * 2/ Since parity check operations potentially make the parity | 1902 | * otherwise read it if the backing disk is insync |
1998 | * block !uptodate it will need to be refreshed before any | ||
1999 | * compute operations on data disks are scheduled. | ||
2000 | * | ||
2001 | * 3/ We hold off parity block re-reads until check operations | ||
2002 | * have quiesced. | ||
2003 | */ | 1903 | */ |
2004 | if ((s->uptodate == disks - 1) && | 1904 | if ((s->uptodate == disks - 1) && |
2005 | (s->failed && disk_idx == s->failed_num) && | 1905 | (s->failed && disk_idx == s->failed_num)) { |
2006 | !test_bit(STRIPE_OP_CHECK, &sh->ops.pending)) { | 1906 | set_bit(STRIPE_COMPUTE_RUN, &sh->state); |
2007 | set_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending); | 1907 | set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); |
2008 | set_bit(R5_Wantcompute, &dev->flags); | 1908 | set_bit(R5_Wantcompute, &dev->flags); |
2009 | sh->ops.target = disk_idx; | 1909 | sh->ops.target = disk_idx; |
2010 | s->req_compute = 1; | 1910 | s->req_compute = 1; |
2011 | sh->ops.count++; | ||
2012 | /* Careful: from this point on 'uptodate' is in the eye | 1911 | /* Careful: from this point on 'uptodate' is in the eye |
2013 | * of raid5_run_ops which services 'compute' operations | 1912 | * of raid5_run_ops which services 'compute' operations |
2014 | * before writes. R5_Wantcompute flags a block that will | 1913 | * before writes. R5_Wantcompute flags a block that will |
@@ -2016,53 +1915,40 @@ static int __handle_issuing_new_read_requests5(struct stripe_head *sh, | |||
2016 | * subsequent operation. | 1915 | * subsequent operation. |
2017 | */ | 1916 | */ |
2018 | s->uptodate++; | 1917 | s->uptodate++; |
2019 | return 0; /* uptodate + compute == disks */ | 1918 | return 1; /* uptodate + compute == disks */ |
2020 | } else if (test_bit(R5_Insync, &dev->flags)) { | 1919 | } else if (test_bit(R5_Insync, &dev->flags)) { |
2021 | set_bit(R5_LOCKED, &dev->flags); | 1920 | set_bit(R5_LOCKED, &dev->flags); |
2022 | set_bit(R5_Wantread, &dev->flags); | 1921 | set_bit(R5_Wantread, &dev->flags); |
2023 | if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending)) | ||
2024 | sh->ops.count++; | ||
2025 | s->locked++; | 1922 | s->locked++; |
2026 | pr_debug("Reading block %d (sync=%d)\n", disk_idx, | 1923 | pr_debug("Reading block %d (sync=%d)\n", disk_idx, |
2027 | s->syncing); | 1924 | s->syncing); |
2028 | } | 1925 | } |
2029 | } | 1926 | } |
2030 | 1927 | ||
2031 | return ~0; | 1928 | return 0; |
2032 | } | 1929 | } |
2033 | 1930 | ||
2034 | static void handle_issuing_new_read_requests5(struct stripe_head *sh, | 1931 | /** |
1932 | * handle_stripe_fill5 - read or compute data to satisfy pending requests. | ||
1933 | */ | ||
1934 | static void handle_stripe_fill5(struct stripe_head *sh, | ||
2035 | struct stripe_head_state *s, int disks) | 1935 | struct stripe_head_state *s, int disks) |
2036 | { | 1936 | { |
2037 | int i; | 1937 | int i; |
2038 | 1938 | ||
2039 | /* Clear completed compute operations. Parity recovery | ||
2040 | * (STRIPE_OP_MOD_REPAIR_PD) implies a write-back which is handled | ||
2041 | * later on in this routine | ||
2042 | */ | ||
2043 | if (test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete) && | ||
2044 | !test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) { | ||
2045 | clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete); | ||
2046 | clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.ack); | ||
2047 | clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending); | ||
2048 | } | ||
2049 | |||
2050 | /* look for blocks to read/compute, skip this if a compute | 1939 | /* look for blocks to read/compute, skip this if a compute |
2051 | * is already in flight, or if the stripe contents are in the | 1940 | * is already in flight, or if the stripe contents are in the |
2052 | * midst of changing due to a write | 1941 | * midst of changing due to a write |
2053 | */ | 1942 | */ |
2054 | if (!test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending) && | 1943 | if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state && |
2055 | !test_bit(STRIPE_OP_PREXOR, &sh->ops.pending) && | 1944 | !sh->reconstruct_state) |
2056 | !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) { | ||
2057 | for (i = disks; i--; ) | 1945 | for (i = disks; i--; ) |
2058 | if (__handle_issuing_new_read_requests5( | 1946 | if (fetch_block5(sh, s, i, disks)) |
2059 | sh, s, i, disks) == 0) | ||
2060 | break; | 1947 | break; |
2061 | } | ||
2062 | set_bit(STRIPE_HANDLE, &sh->state); | 1948 | set_bit(STRIPE_HANDLE, &sh->state); |
2063 | } | 1949 | } |
2064 | 1950 | ||
2065 | static void handle_issuing_new_read_requests6(struct stripe_head *sh, | 1951 | static void handle_stripe_fill6(struct stripe_head *sh, |
2066 | struct stripe_head_state *s, struct r6_state *r6s, | 1952 | struct stripe_head_state *s, struct r6_state *r6s, |
2067 | int disks) | 1953 | int disks) |
2068 | { | 1954 | { |
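The renamed fetch_block5 keeps the scan contract the new comment spells out: return 1 only when scheduling a compute makes further scanning pointless, because uptodate plus the pending compute now covers all disks. A sketch of that loop protocol; fetch_block_model is hypothetical and elides the real eligibility tests.

#include <stdio.h>

static int fetch_block_model(int idx, int *uptodate, int disks)
{
	if (*uptodate == disks - 1) {	/* last missing block: compute it */
		(*uptodate)++;
		printf("disk %d: schedule compute, stop scan\n", idx);
		return 1;
	}
	printf("disk %d: maybe read, keep scanning\n", idx);
	return 0;
}

int main(void)
{
	int disks = 4, uptodate = 3, i;

	for (i = disks; i--; )
		if (fetch_block_model(i, &uptodate, disks))
			break;
	return 0;
}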
@@ -2121,12 +2007,12 @@ static void handle_issuing_new_read_requests6(struct stripe_head *sh, | |||
2121 | } | 2007 | } |
2122 | 2008 | ||
2123 | 2009 | ||
2124 | /* handle_completed_write_requests | 2010 | /* handle_stripe_clean_event |
2125 | * any written block on an uptodate or failed drive can be returned. | 2011 | * any written block on an uptodate or failed drive can be returned. |
2126 | * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but | 2012 | * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but |
2127 | * never LOCKED, so we don't need to test 'failed' directly. | 2013 | * never LOCKED, so we don't need to test 'failed' directly. |
2128 | */ | 2014 | */ |
2129 | static void handle_completed_write_requests(raid5_conf_t *conf, | 2015 | static void handle_stripe_clean_event(raid5_conf_t *conf, |
2130 | struct stripe_head *sh, int disks, struct bio **return_bi) | 2016 | struct stripe_head *sh, int disks, struct bio **return_bi) |
2131 | { | 2017 | { |
2132 | int i; | 2018 | int i; |
@@ -2171,7 +2057,7 @@ static void handle_completed_write_requests(raid5_conf_t *conf, | |||
2171 | md_wakeup_thread(conf->mddev->thread); | 2057 | md_wakeup_thread(conf->mddev->thread); |
2172 | } | 2058 | } |
2173 | 2059 | ||
2174 | static void handle_issuing_new_write_requests5(raid5_conf_t *conf, | 2060 | static void handle_stripe_dirtying5(raid5_conf_t *conf, |
2175 | struct stripe_head *sh, struct stripe_head_state *s, int disks) | 2061 | struct stripe_head *sh, struct stripe_head_state *s, int disks) |
2176 | { | 2062 | { |
2177 | int rmw = 0, rcw = 0, i; | 2063 | int rmw = 0, rcw = 0, i; |
@@ -2215,9 +2101,6 @@ static void handle_issuing_new_write_requests5(raid5_conf_t *conf, | |||
2215 | "%d for r-m-w\n", i); | 2101 | "%d for r-m-w\n", i); |
2216 | set_bit(R5_LOCKED, &dev->flags); | 2102 | set_bit(R5_LOCKED, &dev->flags); |
2217 | set_bit(R5_Wantread, &dev->flags); | 2103 | set_bit(R5_Wantread, &dev->flags); |
2218 | if (!test_and_set_bit( | ||
2219 | STRIPE_OP_IO, &sh->ops.pending)) | ||
2220 | sh->ops.count++; | ||
2221 | s->locked++; | 2104 | s->locked++; |
2222 | } else { | 2105 | } else { |
2223 | set_bit(STRIPE_DELAYED, &sh->state); | 2106 | set_bit(STRIPE_DELAYED, &sh->state); |
@@ -2241,9 +2124,6 @@ static void handle_issuing_new_write_requests5(raid5_conf_t *conf, | |||
2241 | "%d for Reconstruct\n", i); | 2124 | "%d for Reconstruct\n", i); |
2242 | set_bit(R5_LOCKED, &dev->flags); | 2125 | set_bit(R5_LOCKED, &dev->flags); |
2243 | set_bit(R5_Wantread, &dev->flags); | 2126 | set_bit(R5_Wantread, &dev->flags); |
2244 | if (!test_and_set_bit( | ||
2245 | STRIPE_OP_IO, &sh->ops.pending)) | ||
2246 | sh->ops.count++; | ||
2247 | s->locked++; | 2127 | s->locked++; |
2248 | } else { | 2128 | } else { |
2249 | set_bit(STRIPE_DELAYED, &sh->state); | 2129 | set_bit(STRIPE_DELAYED, &sh->state); |
@@ -2261,14 +2141,13 @@ static void handle_issuing_new_write_requests5(raid5_conf_t *conf, | |||
2261 | * simultaneously. If this is not the case then new writes need to be | 2141 | * simultaneously. If this is not the case then new writes need to be |
2262 | * held off until the compute completes. | 2142 | * held off until the compute completes. |
2263 | */ | 2143 | */ |
2264 | if ((s->req_compute || | 2144 | if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) && |
2265 | !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) && | 2145 | (s->locked == 0 && (rcw == 0 || rmw == 0) && |
2266 | (s->locked == 0 && (rcw == 0 || rmw == 0) && | 2146 | !test_bit(STRIPE_BIT_DELAY, &sh->state))) |
2267 | !test_bit(STRIPE_BIT_DELAY, &sh->state))) | 2147 | schedule_reconstruction5(sh, s, rcw == 0, 0); |
2268 | s->locked += handle_write_operations5(sh, rcw == 0, 0); | ||
2269 | } | 2148 | } |
2270 | 2149 | ||
2271 | static void handle_issuing_new_write_requests6(raid5_conf_t *conf, | 2150 | static void handle_stripe_dirtying6(raid5_conf_t *conf, |
2272 | struct stripe_head *sh, struct stripe_head_state *s, | 2151 | struct stripe_head *sh, struct stripe_head_state *s, |
2273 | struct r6_state *r6s, int disks) | 2152 | struct r6_state *r6s, int disks) |
2274 | { | 2153 | { |
@@ -2371,92 +2250,86 @@ static void handle_issuing_new_write_requests6(raid5_conf_t *conf, | |||
2371 | static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh, | 2250 | static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh, |
2372 | struct stripe_head_state *s, int disks) | 2251 | struct stripe_head_state *s, int disks) |
2373 | { | 2252 | { |
2374 | int canceled_check = 0; | 2253 | struct r5dev *dev = NULL; |
2375 | 2254 | ||
2376 | set_bit(STRIPE_HANDLE, &sh->state); | 2255 | set_bit(STRIPE_HANDLE, &sh->state); |
2377 | 2256 | ||
2378 | /* complete a check operation */ | 2257 | switch (sh->check_state) { |
2379 | if (test_and_clear_bit(STRIPE_OP_CHECK, &sh->ops.complete)) { | 2258 | case check_state_idle: |
2380 | clear_bit(STRIPE_OP_CHECK, &sh->ops.ack); | 2259 | /* start a new check operation if there are no failures */ |
2381 | clear_bit(STRIPE_OP_CHECK, &sh->ops.pending); | ||
2382 | if (s->failed == 0) { | 2260 | if (s->failed == 0) { |
2383 | if (sh->ops.zero_sum_result == 0) | ||
2384 | /* parity is correct (on disc, | ||
2385 | * not in buffer any more) | ||
2386 | */ | ||
2387 | set_bit(STRIPE_INSYNC, &sh->state); | ||
2388 | else { | ||
2389 | conf->mddev->resync_mismatches += | ||
2390 | STRIPE_SECTORS; | ||
2391 | if (test_bit( | ||
2392 | MD_RECOVERY_CHECK, &conf->mddev->recovery)) | ||
2393 | /* don't try to repair!! */ | ||
2394 | set_bit(STRIPE_INSYNC, &sh->state); | ||
2395 | else { | ||
2396 | set_bit(STRIPE_OP_COMPUTE_BLK, | ||
2397 | &sh->ops.pending); | ||
2398 | set_bit(STRIPE_OP_MOD_REPAIR_PD, | ||
2399 | &sh->ops.pending); | ||
2400 | set_bit(R5_Wantcompute, | ||
2401 | &sh->dev[sh->pd_idx].flags); | ||
2402 | sh->ops.target = sh->pd_idx; | ||
2403 | sh->ops.count++; | ||
2404 | s->uptodate++; | ||
2405 | } | ||
2406 | } | ||
2407 | } else | ||
2408 | canceled_check = 1; /* STRIPE_INSYNC is not set */ | ||
2409 | } | ||
2410 | |||
2411 | /* start a new check operation if there are no failures, the stripe is | ||
2412 | * not insync, and a repair is not in flight | ||
2413 | */ | ||
2414 | if (s->failed == 0 && | ||
2415 | !test_bit(STRIPE_INSYNC, &sh->state) && | ||
2416 | !test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) { | ||
2417 | if (!test_and_set_bit(STRIPE_OP_CHECK, &sh->ops.pending)) { | ||
2418 | BUG_ON(s->uptodate != disks); | 2261 | BUG_ON(s->uptodate != disks); |
2262 | sh->check_state = check_state_run; | ||
2263 | set_bit(STRIPE_OP_CHECK, &s->ops_request); | ||
2419 | clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags); | 2264 | clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags); |
2420 | sh->ops.count++; | ||
2421 | s->uptodate--; | 2265 | s->uptodate--; |
2266 | break; | ||
2422 | } | 2267 | } |
2423 | } | 2268 | dev = &sh->dev[s->failed_num]; |
2424 | 2269 | /* fall through */ | |
2425 | /* check if we can clear a parity disk reconstruct */ | 2270 | case check_state_compute_result: |
2426 | if (test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete) && | 2271 | sh->check_state = check_state_idle; |
2427 | test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) { | 2272 | if (!dev) |
2428 | 2273 | dev = &sh->dev[sh->pd_idx]; | |
2429 | clear_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending); | 2274 | |
2430 | clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete); | 2275 | /* check that a write has not made the stripe insync */ |
2431 | clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.ack); | 2276 | if (test_bit(STRIPE_INSYNC, &sh->state)) |
2432 | clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending); | 2277 | break; |
2433 | } | ||
2434 | |||
2435 | 2278 | ||
2436 | /* Wait for check parity and compute block operations to complete | ||
2437 | * before write-back. If a failure occurred while the check operation | ||
2438 | * was in flight we need to cycle this stripe through handle_stripe | ||
2439 | * since the parity block may not be uptodate | ||
2440 | */ | ||
2441 | if (!canceled_check && !test_bit(STRIPE_INSYNC, &sh->state) && | ||
2442 | !test_bit(STRIPE_OP_CHECK, &sh->ops.pending) && | ||
2443 | !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) { | ||
2444 | struct r5dev *dev; | ||
2445 | /* either failed parity check, or recovery is happening */ | 2279 | /* either failed parity check, or recovery is happening */ |
2446 | if (s->failed == 0) | ||
2447 | s->failed_num = sh->pd_idx; | ||
2448 | dev = &sh->dev[s->failed_num]; | ||
2449 | BUG_ON(!test_bit(R5_UPTODATE, &dev->flags)); | 2280 | BUG_ON(!test_bit(R5_UPTODATE, &dev->flags)); |
2450 | BUG_ON(s->uptodate != disks); | 2281 | BUG_ON(s->uptodate != disks); |
2451 | 2282 | ||
2452 | set_bit(R5_LOCKED, &dev->flags); | 2283 | set_bit(R5_LOCKED, &dev->flags); |
2284 | s->locked++; | ||
2453 | set_bit(R5_Wantwrite, &dev->flags); | 2285 | set_bit(R5_Wantwrite, &dev->flags); |
2454 | if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending)) | ||
2455 | sh->ops.count++; | ||
2456 | 2286 | ||
2457 | clear_bit(STRIPE_DEGRADED, &sh->state); | 2287 | clear_bit(STRIPE_DEGRADED, &sh->state); |
2458 | s->locked++; | ||
2459 | set_bit(STRIPE_INSYNC, &sh->state); | 2288 | set_bit(STRIPE_INSYNC, &sh->state); |
2289 | break; | ||
2290 | case check_state_run: | ||
2291 | break; /* we will be called again upon completion */ | ||
2292 | case check_state_check_result: | ||
2293 | sh->check_state = check_state_idle; | ||
2294 | |||
2295 | /* if a failure occurred during the check operation, leave | ||
2296 | * STRIPE_INSYNC not set and let the stripe be handled again | ||
2297 | */ | ||
2298 | if (s->failed) | ||
2299 | break; | ||
2300 | |||
2301 | /* handle a successful check operation, if parity is correct | ||
2302 | * we are done. Otherwise update the mismatch count and repair | ||
2303 | * parity if !MD_RECOVERY_CHECK | ||
2304 | */ | ||
2305 | if (sh->ops.zero_sum_result == 0) | ||
2306 | /* parity is correct (on disc, | ||
2307 | * not in buffer any more) | ||
2308 | */ | ||
2309 | set_bit(STRIPE_INSYNC, &sh->state); | ||
2310 | else { | ||
2311 | conf->mddev->resync_mismatches += STRIPE_SECTORS; | ||
2312 | if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) | ||
2313 | /* don't try to repair!! */ | ||
2314 | set_bit(STRIPE_INSYNC, &sh->state); | ||
2315 | else { | ||
2316 | sh->check_state = check_state_compute_run; | ||
2317 | set_bit(STRIPE_COMPUTE_RUN, &sh->state); | ||
2318 | set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); | ||
2319 | set_bit(R5_Wantcompute, | ||
2320 | &sh->dev[sh->pd_idx].flags); | ||
2321 | sh->ops.target = sh->pd_idx; | ||
2322 | s->uptodate++; | ||
2323 | } | ||
2324 | } | ||
2325 | break; | ||
2326 | case check_state_compute_run: | ||
2327 | break; | ||
2328 | default: | ||
2329 | printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n", | ||
2330 | __func__, sh->check_state, | ||
2331 | (unsigned long long) sh->sector); | ||
2332 | BUG(); | ||
2460 | } | 2333 | } |
2461 | } | 2334 | } |
2462 | 2335 | ||
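The rewritten handle_parity_checks5 is easiest to read as the explicit state machine it now is: idle kicks off a check, the async completion lands in check_result, a mismatch under repair detours through the compute states, and everything funnels back to idle with the parity block queued for write-back. A compressed, compilable model; the handler/completion split is collapsed into one step function, "zero_sum_ok" stands in for the xor result and "repair" for !MD_RECOVERY_CHECK.

#include <stdio.h>

enum cs { cs_idle, cs_run, cs_check_result,
	  cs_compute_run, cs_compute_result };

static enum cs step(enum cs s, int zero_sum_ok, int repair)
{
	switch (s) {
	case cs_idle:		return cs_run;		/* start a check */
	case cs_run:		return cs_check_result;	/* async completion */
	case cs_check_result:
		if (zero_sum_ok || !repair)
			return cs_idle;			/* stripe in sync */
		return cs_compute_run;			/* rebuild parity */
	case cs_compute_run:	return cs_compute_result;
	case cs_compute_result:	return cs_idle;		/* write parity back */
	}
	return s;
}

int main(void)
{
	enum cs s = cs_idle;
	int i;

	for (i = 0; i < 5; i++) {
		enum cs n = step(s, /*zero_sum_ok=*/0, /*repair=*/1);

		printf("%d -> %d\n", s, n);
		s = n;
	}
	return 0;
}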
@@ -2641,15 +2514,14 @@ static void handle_stripe5(struct stripe_head *sh) | |||
2641 | struct bio *return_bi = NULL; | 2514 | struct bio *return_bi = NULL; |
2642 | struct stripe_head_state s; | 2515 | struct stripe_head_state s; |
2643 | struct r5dev *dev; | 2516 | struct r5dev *dev; |
2644 | unsigned long pending = 0; | ||
2645 | mdk_rdev_t *blocked_rdev = NULL; | 2517 | mdk_rdev_t *blocked_rdev = NULL; |
2646 | int prexor; | 2518 | int prexor; |
2647 | 2519 | ||
2648 | memset(&s, 0, sizeof(s)); | 2520 | memset(&s, 0, sizeof(s)); |
2649 | pr_debug("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d " | 2521 | pr_debug("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d check:%d " |
2650 | "ops=%lx:%lx:%lx\n", (unsigned long long)sh->sector, sh->state, | 2522 | "reconstruct:%d\n", (unsigned long long)sh->sector, sh->state, |
2651 | atomic_read(&sh->count), sh->pd_idx, | 2523 | atomic_read(&sh->count), sh->pd_idx, sh->check_state, |
2652 | sh->ops.pending, sh->ops.ack, sh->ops.complete); | 2524 | sh->reconstruct_state); |
2653 | 2525 | ||
2654 | spin_lock(&sh->lock); | 2526 | spin_lock(&sh->lock); |
2655 | clear_bit(STRIPE_HANDLE, &sh->state); | 2527 | clear_bit(STRIPE_HANDLE, &sh->state); |
@@ -2658,15 +2530,8 @@ static void handle_stripe5(struct stripe_head *sh) | |||
2658 | s.syncing = test_bit(STRIPE_SYNCING, &sh->state); | 2530 | s.syncing = test_bit(STRIPE_SYNCING, &sh->state); |
2659 | s.expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); | 2531 | s.expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); |
2660 | s.expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); | 2532 | s.expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); |
2661 | /* Now to look around and see what can be done */ | ||
2662 | |||
2663 | /* clean-up completed biofill operations */ | ||
2664 | if (test_bit(STRIPE_OP_BIOFILL, &sh->ops.complete)) { | ||
2665 | clear_bit(STRIPE_OP_BIOFILL, &sh->ops.pending); | ||
2666 | clear_bit(STRIPE_OP_BIOFILL, &sh->ops.ack); | ||
2667 | clear_bit(STRIPE_OP_BIOFILL, &sh->ops.complete); | ||
2668 | } | ||
2669 | 2533 | ||
2534 | /* Now to look around and see what can be done */ | ||
2670 | rcu_read_lock(); | 2535 | rcu_read_lock(); |
2671 | for (i=disks; i--; ) { | 2536 | for (i=disks; i--; ) { |
2672 | mdk_rdev_t *rdev; | 2537 | mdk_rdev_t *rdev; |
@@ -2680,10 +2545,10 @@ static void handle_stripe5(struct stripe_head *sh) | |||
2680 | /* maybe we can request a biofill operation | 2545 | /* maybe we can request a biofill operation |
2681 | * | 2546 | * |
2682 | * new wantfill requests are only permitted while | 2547 | * new wantfill requests are only permitted while |
2683 | * STRIPE_OP_BIOFILL is clear | 2548 | * ops_complete_biofill is guaranteed to be inactive |
2684 | */ | 2549 | */ |
2685 | if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread && | 2550 | if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread && |
2686 | !test_bit(STRIPE_OP_BIOFILL, &sh->ops.pending)) | 2551 | !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) |
2687 | set_bit(R5_Wantfill, &dev->flags); | 2552 | set_bit(R5_Wantfill, &dev->flags); |
2688 | 2553 | ||
2689 | /* now count some things */ | 2554 | /* now count some things */ |
@@ -2727,8 +2592,10 @@ static void handle_stripe5(struct stripe_head *sh) | |||
2727 | goto unlock; | 2592 | goto unlock; |
2728 | } | 2593 | } |
2729 | 2594 | ||
2730 | if (s.to_fill && !test_and_set_bit(STRIPE_OP_BIOFILL, &sh->ops.pending)) | 2595 | if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) { |
2731 | sh->ops.count++; | 2596 | set_bit(STRIPE_OP_BIOFILL, &s.ops_request); |
2597 | set_bit(STRIPE_BIOFILL_RUN, &sh->state); | ||
2598 | } | ||
2732 | 2599 | ||
2733 | pr_debug("locked=%d uptodate=%d to_read=%d" | 2600 | pr_debug("locked=%d uptodate=%d to_read=%d" |
2734 | " to_write=%d failed=%d failed_num=%d\n", | 2601 | " to_write=%d failed=%d failed_num=%d\n", |
@@ -2738,8 +2605,7 @@ static void handle_stripe5(struct stripe_head *sh) | |||
2738 | * need to be failed | 2605 | * need to be failed |
2739 | */ | 2606 | */ |
2740 | if (s.failed > 1 && s.to_read+s.to_write+s.written) | 2607 | if (s.failed > 1 && s.to_read+s.to_write+s.written) |
2741 | handle_requests_to_failed_array(conf, sh, &s, disks, | 2608 | handle_failed_stripe(conf, sh, &s, disks, &return_bi); |
2742 | &return_bi); | ||
2743 | if (s.failed > 1 && s.syncing) { | 2609 | if (s.failed > 1 && s.syncing) { |
2744 | md_done_sync(conf->mddev, STRIPE_SECTORS,0); | 2610 | md_done_sync(conf->mddev, STRIPE_SECTORS,0); |
2745 | clear_bit(STRIPE_SYNCING, &sh->state); | 2611 | clear_bit(STRIPE_SYNCING, &sh->state); |
@@ -2755,48 +2621,25 @@ static void handle_stripe5(struct stripe_head *sh) | |||
2755 | !test_bit(R5_LOCKED, &dev->flags) && | 2621 | !test_bit(R5_LOCKED, &dev->flags) && |
2756 | test_bit(R5_UPTODATE, &dev->flags)) || | 2622 | test_bit(R5_UPTODATE, &dev->flags)) || |
2757 | (s.failed == 1 && s.failed_num == sh->pd_idx))) | 2623 | (s.failed == 1 && s.failed_num == sh->pd_idx))) |
2758 | handle_completed_write_requests(conf, sh, disks, &return_bi); | 2624 | handle_stripe_clean_event(conf, sh, disks, &return_bi); |
2759 | 2625 | ||
2760 | /* Now we might consider reading some blocks, either to check/generate | 2626 | /* Now we might consider reading some blocks, either to check/generate |
2761 | * parity, or to satisfy requests | 2627 | * parity, or to satisfy requests |
2762 | * or to load a block that is being partially written. | 2628 | * or to load a block that is being partially written. |
2763 | */ | 2629 | */ |
2764 | if (s.to_read || s.non_overwrite || | 2630 | if (s.to_read || s.non_overwrite || |
2765 | (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding || | 2631 | (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding) |
2766 | test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) | 2632 | handle_stripe_fill5(sh, &s, disks); |
2767 | handle_issuing_new_read_requests5(sh, &s, disks); | ||
2768 | 2633 | ||
2769 | /* Now we check to see if any write operations have recently | 2634 | /* Now we check to see if any write operations have recently |
2770 | * completed | 2635 | * completed |
2771 | */ | 2636 | */ |
2772 | |||
2773 | /* leave prexor set until postxor is done, allows us to distinguish | ||
2774 | * a rmw from a rcw during biodrain | ||
2775 | */ | ||
2776 | prexor = 0; | 2637 | prexor = 0; |
2777 | if (test_bit(STRIPE_OP_PREXOR, &sh->ops.complete) && | 2638 | if (sh->reconstruct_state == reconstruct_state_prexor_drain_result) |
2778 | test_bit(STRIPE_OP_POSTXOR, &sh->ops.complete)) { | ||
2779 | |||
2780 | prexor = 1; | 2639 | prexor = 1; |
2781 | clear_bit(STRIPE_OP_PREXOR, &sh->ops.complete); | 2640 | if (sh->reconstruct_state == reconstruct_state_drain_result || |
2782 | clear_bit(STRIPE_OP_PREXOR, &sh->ops.ack); | 2641 | sh->reconstruct_state == reconstruct_state_prexor_drain_result) { |
2783 | clear_bit(STRIPE_OP_PREXOR, &sh->ops.pending); | 2642 | sh->reconstruct_state = reconstruct_state_idle; |
2784 | |||
2785 | for (i = disks; i--; ) | ||
2786 | clear_bit(R5_Wantprexor, &sh->dev[i].flags); | ||
2787 | } | ||
2788 | |||
2789 | /* if only POSTXOR is set then this is an 'expand' postxor */ | ||
2790 | if (test_bit(STRIPE_OP_BIODRAIN, &sh->ops.complete) && | ||
2791 | test_bit(STRIPE_OP_POSTXOR, &sh->ops.complete)) { | ||
2792 | |||
2793 | clear_bit(STRIPE_OP_BIODRAIN, &sh->ops.complete); | ||
2794 | clear_bit(STRIPE_OP_BIODRAIN, &sh->ops.ack); | ||
2795 | clear_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending); | ||
2796 | |||
2797 | clear_bit(STRIPE_OP_POSTXOR, &sh->ops.complete); | ||
2798 | clear_bit(STRIPE_OP_POSTXOR, &sh->ops.ack); | ||
2799 | clear_bit(STRIPE_OP_POSTXOR, &sh->ops.pending); | ||
2800 | 2643 | ||
2801 | /* All the 'written' buffers and the parity block are ready to | 2644 | /* All the 'written' buffers and the parity block are ready to |
2802 | * be written back to disk | 2645 | * be written back to disk |
@@ -2808,9 +2651,6 @@ static void handle_stripe5(struct stripe_head *sh) | |||
2808 | (i == sh->pd_idx || dev->written)) { | 2651 | (i == sh->pd_idx || dev->written)) { |
2809 | pr_debug("Writing block %d\n", i); | 2652 | pr_debug("Writing block %d\n", i); |
2810 | set_bit(R5_Wantwrite, &dev->flags); | 2653 | set_bit(R5_Wantwrite, &dev->flags); |
2811 | if (!test_and_set_bit( | ||
2812 | STRIPE_OP_IO, &sh->ops.pending)) | ||
2813 | sh->ops.count++; | ||
2814 | if (prexor) | 2654 | if (prexor) |
2815 | continue; | 2655 | continue; |
2816 | if (!test_bit(R5_Insync, &dev->flags) || | 2656 | if (!test_bit(R5_Insync, &dev->flags) || |
@@ -2832,20 +2672,18 @@ static void handle_stripe5(struct stripe_head *sh) | |||
2832 | * 2/ A 'check' operation is in flight, as it may clobber the parity | 2672 | * 2/ A 'check' operation is in flight, as it may clobber the parity |
2833 | * block. | 2673 | * block. |
2834 | */ | 2674 | */ |
2835 | if (s.to_write && !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending) && | 2675 | if (s.to_write && !sh->reconstruct_state && !sh->check_state) |
2836 | !test_bit(STRIPE_OP_CHECK, &sh->ops.pending)) | 2676 | handle_stripe_dirtying5(conf, sh, &s, disks); |
2837 | handle_issuing_new_write_requests5(conf, sh, &s, disks); | ||
2838 | 2677 | ||
2839 | /* maybe we need to check and possibly fix the parity for this stripe | 2678 | /* maybe we need to check and possibly fix the parity for this stripe |
2840 | * Any reads will already have been scheduled, so we just see if enough | 2679 | * Any reads will already have been scheduled, so we just see if enough |
2841 | * data is available. The parity check is held off while parity | 2680 | * data is available. The parity check is held off while parity |
2842 | * dependent operations are in flight. | 2681 | * dependent operations are in flight. |
2843 | */ | 2682 | */ |
2844 | if ((s.syncing && s.locked == 0 && | 2683 | if (sh->check_state || |
2845 | !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending) && | 2684 | (s.syncing && s.locked == 0 && |
2846 | !test_bit(STRIPE_INSYNC, &sh->state)) || | 2685 | !test_bit(STRIPE_COMPUTE_RUN, &sh->state) && |
2847 | test_bit(STRIPE_OP_CHECK, &sh->ops.pending) || | 2686 | !test_bit(STRIPE_INSYNC, &sh->state))) |
2848 | test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) | ||
2849 | handle_parity_checks5(conf, sh, &s, disks); | 2687 | handle_parity_checks5(conf, sh, &s, disks); |
2850 | 2688 | ||
2851 | if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { | 2689 | if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { |
@@ -2864,52 +2702,35 @@ static void handle_stripe5(struct stripe_head *sh) | |||
2864 | dev = &sh->dev[s.failed_num]; | 2702 | dev = &sh->dev[s.failed_num]; |
2865 | if (!test_bit(R5_ReWrite, &dev->flags)) { | 2703 | if (!test_bit(R5_ReWrite, &dev->flags)) { |
2866 | set_bit(R5_Wantwrite, &dev->flags); | 2704 | set_bit(R5_Wantwrite, &dev->flags); |
2867 | if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending)) | ||
2868 | sh->ops.count++; | ||
2869 | set_bit(R5_ReWrite, &dev->flags); | 2705 | set_bit(R5_ReWrite, &dev->flags); |
2870 | set_bit(R5_LOCKED, &dev->flags); | 2706 | set_bit(R5_LOCKED, &dev->flags); |
2871 | s.locked++; | 2707 | s.locked++; |
2872 | } else { | 2708 | } else { |
2873 | /* let's read it back */ | 2709 | /* let's read it back */ |
2874 | set_bit(R5_Wantread, &dev->flags); | 2710 | set_bit(R5_Wantread, &dev->flags); |
2875 | if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending)) | ||
2876 | sh->ops.count++; | ||
2877 | set_bit(R5_LOCKED, &dev->flags); | 2711 | set_bit(R5_LOCKED, &dev->flags); |
2878 | s.locked++; | 2712 | s.locked++; |
2879 | } | 2713 | } |
2880 | } | 2714 | } |
2881 | 2715 | ||
2882 | /* Finish postxor operations initiated by the expansion | 2716 | /* Finish reconstruct operations initiated by the expansion process */ |
2883 | * process | 2717 | if (sh->reconstruct_state == reconstruct_state_result) { |
2884 | */ | 2718 | sh->reconstruct_state = reconstruct_state_idle; |
2885 | if (test_bit(STRIPE_OP_POSTXOR, &sh->ops.complete) && | ||
2886 | !test_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending)) { | ||
2887 | |||
2888 | clear_bit(STRIPE_EXPANDING, &sh->state); | 2719 | clear_bit(STRIPE_EXPANDING, &sh->state); |
2889 | 2720 | for (i = conf->raid_disks; i--; ) | |
2890 | clear_bit(STRIPE_OP_POSTXOR, &sh->ops.pending); | ||
2891 | clear_bit(STRIPE_OP_POSTXOR, &sh->ops.ack); | ||
2892 | clear_bit(STRIPE_OP_POSTXOR, &sh->ops.complete); | ||
2893 | |||
2894 | for (i = conf->raid_disks; i--; ) { | ||
2895 | set_bit(R5_Wantwrite, &sh->dev[i].flags); | 2721 | set_bit(R5_Wantwrite, &sh->dev[i].flags); |
2896 | set_bit(R5_LOCKED, &dev->flags); | 2722 | set_bit(R5_LOCKED, &dev->flags); |
2897 | s.locked++; | 2723 | s.locked++; |
2898 | if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending)) | ||
2899 | sh->ops.count++; | ||
2900 | } | ||
2901 | } | 2724 | } |
2902 | 2725 | ||
2903 | if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) && | 2726 | if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) && |
2904 | !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) { | 2727 | !sh->reconstruct_state) { |
2905 | /* Need to write out all blocks after computing parity */ | 2728 | /* Need to write out all blocks after computing parity */ |
2906 | sh->disks = conf->raid_disks; | 2729 | sh->disks = conf->raid_disks; |
2907 | sh->pd_idx = stripe_to_pdidx(sh->sector, conf, | 2730 | sh->pd_idx = stripe_to_pdidx(sh->sector, conf, |
2908 | conf->raid_disks); | 2731 | conf->raid_disks); |
2909 | s.locked += handle_write_operations5(sh, 1, 1); | 2732 | schedule_reconstruction5(sh, &s, 1, 1); |
2910 | } else if (s.expanded && | 2733 | } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) { |
2911 | s.locked == 0 && | ||
2912 | !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) { | ||
2913 | clear_bit(STRIPE_EXPAND_READY, &sh->state); | 2734 | clear_bit(STRIPE_EXPAND_READY, &sh->state); |
2914 | atomic_dec(&conf->reshape_stripes); | 2735 | atomic_dec(&conf->reshape_stripes); |
2915 | wake_up(&conf->wait_for_overlap); | 2736 | wake_up(&conf->wait_for_overlap); |
@@ -2917,12 +2738,9 @@ static void handle_stripe5(struct stripe_head *sh) | |||
2917 | } | 2738 | } |
2918 | 2739 | ||
2919 | if (s.expanding && s.locked == 0 && | 2740 | if (s.expanding && s.locked == 0 && |
2920 | !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) | 2741 | !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) |
2921 | handle_stripe_expansion(conf, sh, NULL); | 2742 | handle_stripe_expansion(conf, sh, NULL); |
2922 | 2743 | ||
2923 | if (sh->ops.count) | ||
2924 | pending = get_stripe_work(sh); | ||
2925 | |||
2926 | unlock: | 2744 | unlock: |
2927 | spin_unlock(&sh->lock); | 2745 | spin_unlock(&sh->lock); |
2928 | 2746 | ||
@@ -2930,11 +2748,12 @@ static void handle_stripe5(struct stripe_head *sh) | |||
2930 | if (unlikely(blocked_rdev)) | 2748 | if (unlikely(blocked_rdev)) |
2931 | md_wait_for_blocked_rdev(blocked_rdev, conf->mddev); | 2749 | md_wait_for_blocked_rdev(blocked_rdev, conf->mddev); |
2932 | 2750 | ||
2933 | if (pending) | 2751 | if (s.ops_request) |
2934 | raid5_run_ops(sh, pending); | 2752 | raid5_run_ops(sh, s.ops_request); |
2935 | 2753 | ||
2936 | return_io(return_bi); | 2754 | ops_run_io(sh, &s); |
2937 | 2755 | ||
2756 | return_io(return_bi); | ||
2938 | } | 2757 | } |
2939 | 2758 | ||
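The hunk above retires the three-bitmask handshake (ops.pending / ops.ack / ops.complete) in handle_stripe5(): the handler now reads a single sh->reconstruct_state value, accumulates requested work in s.ops_request, and only after sh->lock is dropped does raid5_run_ops() execute those requests and ops_run_io() submit the resulting bios. A minimal, self-contained sketch of the new completion check, using toy stand-in types rather than the kernel's (an illustration, not the kernel function):

#include <stdio.h>

/* Toy model, not kernel code: simplified stand-ins for the stripe
 * fields involved in the new write-completion check. */
enum reconstruct_states {
	reconstruct_state_idle = 0,
	reconstruct_state_prexor_drain_result,
	reconstruct_state_drain_result,
};

struct toy_stripe {
	enum reconstruct_states reconstruct_state;
};

/* Mirrors the shape of the new-side hunk above: one enum read replaces
 * testing and clearing PREXOR/BIODRAIN/POSTXOR bits in three masks. */
static void handle_write_completion(struct toy_stripe *sh)
{
	int prexor = 0;

	if (sh->reconstruct_state == reconstruct_state_prexor_drain_result)
		prexor = 1;	/* rmw write: prexor'd blocks are skipped */
	if (sh->reconstruct_state == reconstruct_state_drain_result ||
	    sh->reconstruct_state == reconstruct_state_prexor_drain_result) {
		sh->reconstruct_state = reconstruct_state_idle;
		printf("schedule writeback, prexor=%d\n", prexor);
	}
}

int main(void)
{
	struct toy_stripe sh = { reconstruct_state_prexor_drain_result };

	handle_write_completion(&sh);
	return 0;
}

Because only the async completion path installs a _result value, the handler no longer has to clear PREXOR, BIODRAIN and POSTXOR bits in lockstep across three masks.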
2940 | static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) | 2759 | static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) |
@@ -3042,8 +2861,7 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) | |||
3042 | * might need to be failed | 2861 | * might need to be failed |
3043 | */ | 2862 | */ |
3044 | if (s.failed > 2 && s.to_read+s.to_write+s.written) | 2863 | if (s.failed > 2 && s.to_read+s.to_write+s.written) |
3045 | handle_requests_to_failed_array(conf, sh, &s, disks, | 2864 | handle_failed_stripe(conf, sh, &s, disks, &return_bi); |
3046 | &return_bi); | ||
3047 | if (s.failed > 2 && s.syncing) { | 2865 | if (s.failed > 2 && s.syncing) { |
3048 | md_done_sync(conf->mddev, STRIPE_SECTORS,0); | 2866 | md_done_sync(conf->mddev, STRIPE_SECTORS,0); |
3049 | clear_bit(STRIPE_SYNCING, &sh->state); | 2867 | clear_bit(STRIPE_SYNCING, &sh->state); |
@@ -3068,7 +2886,7 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) | |||
3068 | ( r6s.q_failed || ((test_bit(R5_Insync, &qdev->flags) | 2886 | ( r6s.q_failed || ((test_bit(R5_Insync, &qdev->flags) |
3069 | && !test_bit(R5_LOCKED, &qdev->flags) | 2887 | && !test_bit(R5_LOCKED, &qdev->flags) |
3070 | && test_bit(R5_UPTODATE, &qdev->flags))))) | 2888 | && test_bit(R5_UPTODATE, &qdev->flags))))) |
3071 | handle_completed_write_requests(conf, sh, disks, &return_bi); | 2889 | handle_stripe_clean_event(conf, sh, disks, &return_bi); |
3072 | 2890 | ||
3073 | /* Now we might consider reading some blocks, either to check/generate | 2891 | /* Now we might consider reading some blocks, either to check/generate |
3074 | * parity, or to satisfy requests | 2892 | * parity, or to satisfy requests |
@@ -3076,11 +2894,11 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) | |||
3076 | */ | 2894 | */ |
3077 | if (s.to_read || s.non_overwrite || (s.to_write && s.failed) || | 2895 | if (s.to_read || s.non_overwrite || (s.to_write && s.failed) || |
3078 | (s.syncing && (s.uptodate < disks)) || s.expanding) | 2896 | (s.syncing && (s.uptodate < disks)) || s.expanding) |
3079 | handle_issuing_new_read_requests6(sh, &s, &r6s, disks); | 2897 | handle_stripe_fill6(sh, &s, &r6s, disks); |
3080 | 2898 | ||
3081 | /* now to consider writing and what else, if anything should be read */ | 2899 | /* now to consider writing and what else, if anything should be read */ |
3082 | if (s.to_write) | 2900 | if (s.to_write) |
3083 | handle_issuing_new_write_requests6(conf, sh, &s, &r6s, disks); | 2901 | handle_stripe_dirtying6(conf, sh, &s, &r6s, disks); |
3084 | 2902 | ||
3085 | /* maybe we need to check and possibly fix the parity for this stripe | 2903 | /* maybe we need to check and possibly fix the parity for this stripe |
3086 | * Any reads will already have been scheduled, so we just see if enough | 2904 | * Any reads will already have been scheduled, so we just see if enough |
@@ -3136,7 +2954,7 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) | |||
3136 | } | 2954 | } |
3137 | 2955 | ||
3138 | if (s.expanding && s.locked == 0 && | 2956 | if (s.expanding && s.locked == 0 && |
3139 | !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) | 2957 | !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) |
3140 | handle_stripe_expansion(conf, sh, &r6s); | 2958 | handle_stripe_expansion(conf, sh, &r6s); |
3141 | 2959 | ||
3142 | unlock: | 2960 | unlock: |
@@ -3146,68 +2964,9 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) | |||
3146 | if (unlikely(blocked_rdev)) | 2964 | if (unlikely(blocked_rdev)) |
3147 | md_wait_for_blocked_rdev(blocked_rdev, conf->mddev); | 2965 | md_wait_for_blocked_rdev(blocked_rdev, conf->mddev); |
3148 | 2966 | ||
3149 | return_io(return_bi); | 2967 | ops_run_io(sh, &s); |
3150 | |||
3151 | for (i=disks; i-- ;) { | ||
3152 | int rw; | ||
3153 | struct bio *bi; | ||
3154 | mdk_rdev_t *rdev; | ||
3155 | if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) | ||
3156 | rw = WRITE; | ||
3157 | else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags)) | ||
3158 | rw = READ; | ||
3159 | else | ||
3160 | continue; | ||
3161 | |||
3162 | set_bit(STRIPE_IO_STARTED, &sh->state); | ||
3163 | |||
3164 | bi = &sh->dev[i].req; | ||
3165 | |||
3166 | bi->bi_rw = rw; | ||
3167 | if (rw == WRITE) | ||
3168 | bi->bi_end_io = raid5_end_write_request; | ||
3169 | else | ||
3170 | bi->bi_end_io = raid5_end_read_request; | ||
3171 | |||
3172 | rcu_read_lock(); | ||
3173 | rdev = rcu_dereference(conf->disks[i].rdev); | ||
3174 | if (rdev && test_bit(Faulty, &rdev->flags)) | ||
3175 | rdev = NULL; | ||
3176 | if (rdev) | ||
3177 | atomic_inc(&rdev->nr_pending); | ||
3178 | rcu_read_unlock(); | ||
3179 | 2968 | ||
3180 | if (rdev) { | 2969 | return_io(return_bi); |
3181 | if (s.syncing || s.expanding || s.expanded) | ||
3182 | md_sync_acct(rdev->bdev, STRIPE_SECTORS); | ||
3183 | |||
3184 | bi->bi_bdev = rdev->bdev; | ||
3185 | pr_debug("for %llu schedule op %ld on disc %d\n", | ||
3186 | (unsigned long long)sh->sector, bi->bi_rw, i); | ||
3187 | atomic_inc(&sh->count); | ||
3188 | bi->bi_sector = sh->sector + rdev->data_offset; | ||
3189 | bi->bi_flags = 1 << BIO_UPTODATE; | ||
3190 | bi->bi_vcnt = 1; | ||
3191 | bi->bi_max_vecs = 1; | ||
3192 | bi->bi_idx = 0; | ||
3193 | bi->bi_io_vec = &sh->dev[i].vec; | ||
3194 | bi->bi_io_vec[0].bv_len = STRIPE_SIZE; | ||
3195 | bi->bi_io_vec[0].bv_offset = 0; | ||
3196 | bi->bi_size = STRIPE_SIZE; | ||
3197 | bi->bi_next = NULL; | ||
3198 | if (rw == WRITE && | ||
3199 | test_bit(R5_ReWrite, &sh->dev[i].flags)) | ||
3200 | atomic_add(STRIPE_SECTORS, &rdev->corrected_errors); | ||
3201 | generic_make_request(bi); | ||
3202 | } else { | ||
3203 | if (rw == WRITE) | ||
3204 | set_bit(STRIPE_DEGRADED, &sh->state); | ||
3205 | pr_debug("skip op %ld on disc %d for sector %llu\n", | ||
3206 | bi->bi_rw, i, (unsigned long long)sh->sector); | ||
3207 | clear_bit(R5_LOCKED, &sh->dev[i].flags); | ||
3208 | set_bit(STRIPE_HANDLE, &sh->state); | ||
3209 | } | ||
3210 | } | ||
3211 | } | 2970 | } |
3212 | 2971 | ||
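The long per-device loop deleted above is the bio-submission code handle_stripe6() used to carry inline; both the RAID-5 and RAID-6 handlers now finish with the shared ops_run_io(). A toy model of the dispatch that loop performed, with simplified flags standing in for the r5dev/rdev plumbing (hypothetical names, not the kernel helper):

#include <stdio.h>

#define R5_Wantwrite (1u << 0)
#define R5_Wantread  (1u << 1)

struct toy_dev {
	unsigned int flags;	/* stand-in for dev->flags bit ops */
	int have_rdev;		/* stand-in for a usable, non-Faulty rdev */
};

static void toy_ops_run_io(struct toy_dev *dev, int disks)
{
	for (int i = disks; i--; ) {
		const char *op;

		if (dev[i].flags & R5_Wantwrite)
			op = "WRITE";
		else if (dev[i].flags & R5_Wantread)
			op = "READ";
		else
			continue;
		dev[i].flags &= ~(R5_Wantwrite | R5_Wantread);
		if (dev[i].have_rdev)
			/* kernel: point the embedded bio at the member
			 * disk and call generic_make_request() */
			printf("disc %d: submit %s\n", i, op);
		else
			/* kernel: degrade writes, clear R5_LOCKED and
			 * re-queue the stripe via STRIPE_HANDLE */
			printf("disc %d: skip %s\n", i, op);
	}
}

int main(void)
{
	struct toy_dev dev[3] = {
		{ R5_Wantwrite, 1 },
		{ 0, 1 },
		{ R5_Wantread, 0 },
	};

	toy_ops_run_io(dev, 3);
	return 0;
}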
3213 | static void handle_stripe(struct stripe_head *sh, struct page *tmp_page) | 2972 | static void handle_stripe(struct stripe_head *sh, struct page *tmp_page) |
@@ -3695,9 +3454,7 @@ static int make_request(struct request_queue *q, struct bio * bi) | |||
3695 | if ( rw == WRITE ) | 3454 | if ( rw == WRITE ) |
3696 | md_write_end(mddev); | 3455 | md_write_end(mddev); |
3697 | 3456 | ||
3698 | bi->bi_end_io(bi, | 3457 | bio_endio(bi, 0); |
3699 | test_bit(BIO_UPTODATE, &bi->bi_flags) | ||
3700 | ? 0 : -EIO); | ||
3701 | } | 3458 | } |
3702 | return 0; | 3459 | return 0; |
3703 | } | 3460 | } |
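Here the open-coded completion, which re-derived the error from BIO_UPTODATE before calling ->bi_end_io() by hand, collapses into bio_endio(bi, 0); retry_aligned_read() below gets the same substitution. A small stand-alone model of what the helper does with the flag, assuming simplified structs rather than the real block-layer types:

#include <stdio.h>

#define BIO_UPTODATE 0
#define EIO 5

struct toy_bio {
	unsigned long bi_flags;
	void (*bi_end_io)(struct toy_bio *bi, int error);
};

/* What bio_endio() does with the flag (simplified: the real helper
 * also clears BIO_UPTODATE when an error is passed in). */
static void toy_bio_endio(struct toy_bio *bi, int error)
{
	if (!error && !(bi->bi_flags & (1UL << BIO_UPTODATE)))
		error = -EIO;	/* the test each caller used to open-code */
	if (bi->bi_end_io)
		bi->bi_end_io(bi, error);
}

static void done(struct toy_bio *bi, int error)
{
	(void)bi;
	printf("completed: %d\n", error);
}

int main(void)
{
	struct toy_bio bi = { .bi_flags = 0, .bi_end_io = done };

	toy_bio_endio(&bi, 0);	/* BIO_UPTODATE never set: reports -EIO */
	return 0;
}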
@@ -4000,12 +3757,8 @@ static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio) | |||
4000 | spin_lock_irq(&conf->device_lock); | 3757 | spin_lock_irq(&conf->device_lock); |
4001 | remaining = --raid_bio->bi_phys_segments; | 3758 | remaining = --raid_bio->bi_phys_segments; |
4002 | spin_unlock_irq(&conf->device_lock); | 3759 | spin_unlock_irq(&conf->device_lock); |
4003 | if (remaining == 0) { | 3760 | if (remaining == 0) |
4004 | 3761 | bio_endio(raid_bio, 0); | |
4005 | raid_bio->bi_end_io(raid_bio, | ||
4006 | test_bit(BIO_UPTODATE, &raid_bio->bi_flags) | ||
4007 | ? 0 : -EIO); | ||
4008 | } | ||
4009 | if (atomic_dec_and_test(&conf->active_aligned_reads)) | 3762 | if (atomic_dec_and_test(&conf->active_aligned_reads)) |
4010 | wake_up(&conf->wait_for_stripe); | 3763 | wake_up(&conf->wait_for_stripe); |
4011 | return handled; | 3764 | return handled; |
@@ -4092,6 +3845,8 @@ raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len) | |||
4092 | { | 3845 | { |
4093 | raid5_conf_t *conf = mddev_to_conf(mddev); | 3846 | raid5_conf_t *conf = mddev_to_conf(mddev); |
4094 | unsigned long new; | 3847 | unsigned long new; |
3848 | int err; | ||
3849 | |||
4095 | if (len >= PAGE_SIZE) | 3850 | if (len >= PAGE_SIZE) |
4096 | return -EINVAL; | 3851 | return -EINVAL; |
4097 | if (!conf) | 3852 | if (!conf) |
@@ -4107,7 +3862,9 @@ raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len) | |||
4107 | else | 3862 | else |
4108 | break; | 3863 | break; |
4109 | } | 3864 | } |
4110 | md_allow_write(mddev); | 3865 | err = md_allow_write(mddev); |
3866 | if (err) | ||
3867 | return err; | ||
4111 | while (new > conf->max_nr_stripes) { | 3868 | while (new > conf->max_nr_stripes) { |
4112 | if (grow_one_stripe(conf)) | 3869 | if (grow_one_stripe(conf)) |
4113 | conf->max_nr_stripes++; | 3870 | conf->max_nr_stripes++; |
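md_allow_write() changes from void to int in this series (see the md.h hunk below), so the stripe-cache store method must propagate its error rather than pressing on. A toy sketch of the propagation pattern; the -EAGAIN value here is illustrative, standing in for whatever -errno md_allow_write() actually reports when it cannot proceed:

#include <stdio.h>

#define EAGAIN 11	/* illustrative errno choice, an assumption */

/* Stand-in for the new int-returning md_allow_write(). */
static int toy_md_allow_write(int metadata_busy)
{
	return metadata_busy ? -EAGAIN : 0;
}

/* Stand-in for raid5_store_stripe_cache_size(): bail out instead of
 * resizing the cache when the write cannot be allowed. */
static int toy_store_stripe_cache_size(int metadata_busy)
{
	int err = toy_md_allow_write(metadata_busy);

	if (err)
		return err;	/* propagate, as the new hunk does */
	/* ... grow or shrink the stripe cache here ... */
	return 0;
}

int main(void)
{
	printf("busy: %d, idle: %d\n",
	       toy_store_stripe_cache_size(1),
	       toy_store_stripe_cache_size(0));
	return 0;
}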
@@ -4607,35 +4364,41 @@ abort: | |||
4607 | static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) | 4364 | static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) |
4608 | { | 4365 | { |
4609 | raid5_conf_t *conf = mddev->private; | 4366 | raid5_conf_t *conf = mddev->private; |
4610 | int found = 0; | 4367 | int err = -EEXIST; |
4611 | int disk; | 4368 | int disk; |
4612 | struct disk_info *p; | 4369 | struct disk_info *p; |
4370 | int first = 0; | ||
4371 | int last = conf->raid_disks - 1; | ||
4613 | 4372 | ||
4614 | if (mddev->degraded > conf->max_degraded) | 4373 | if (mddev->degraded > conf->max_degraded) |
4615 | /* no point adding a device */ | 4374 | /* no point adding a device */ |
4616 | return 0; | 4375 | return -EINVAL; |
4376 | |||
4377 | if (rdev->raid_disk >= 0) | ||
4378 | first = last = rdev->raid_disk; | ||
4617 | 4379 | ||
4618 | /* | 4380 | /* |
4619 | * find the disk ... but prefer rdev->saved_raid_disk | 4381 | * find the disk ... but prefer rdev->saved_raid_disk |
4620 | * if possible. | 4382 | * if possible. |
4621 | */ | 4383 | */ |
4622 | if (rdev->saved_raid_disk >= 0 && | 4384 | if (rdev->saved_raid_disk >= 0 && |
4385 | rdev->saved_raid_disk >= first && | ||
4623 | conf->disks[rdev->saved_raid_disk].rdev == NULL) | 4386 | conf->disks[rdev->saved_raid_disk].rdev == NULL) |
4624 | disk = rdev->saved_raid_disk; | 4387 | disk = rdev->saved_raid_disk; |
4625 | else | 4388 | else |
4626 | disk = 0; | 4389 | disk = first; |
4627 | for ( ; disk < conf->raid_disks; disk++) | 4390 | for ( ; disk <= last ; disk++) |
4628 | if ((p=conf->disks + disk)->rdev == NULL) { | 4391 | if ((p=conf->disks + disk)->rdev == NULL) { |
4629 | clear_bit(In_sync, &rdev->flags); | 4392 | clear_bit(In_sync, &rdev->flags); |
4630 | rdev->raid_disk = disk; | 4393 | rdev->raid_disk = disk; |
4631 | found = 1; | 4394 | err = 0; |
4632 | if (rdev->saved_raid_disk != disk) | 4395 | if (rdev->saved_raid_disk != disk) |
4633 | conf->fullsync = 1; | 4396 | conf->fullsync = 1; |
4634 | rcu_assign_pointer(p->rdev, rdev); | 4397 | rcu_assign_pointer(p->rdev, rdev); |
4635 | break; | 4398 | break; |
4636 | } | 4399 | } |
4637 | print_raid5_conf(conf); | 4400 | print_raid5_conf(conf); |
4638 | return found; | 4401 | return err; |
4639 | } | 4402 | } |
4640 | 4403 | ||
4641 | static int raid5_resize(mddev_t *mddev, sector_t sectors) | 4404 | static int raid5_resize(mddev_t *mddev, sector_t sectors) |
@@ -4736,7 +4499,7 @@ static int raid5_start_reshape(mddev_t *mddev) | |||
4736 | rdev_for_each(rdev, rtmp, mddev) | 4499 | rdev_for_each(rdev, rtmp, mddev) |
4737 | if (rdev->raid_disk < 0 && | 4500 | if (rdev->raid_disk < 0 && |
4738 | !test_bit(Faulty, &rdev->flags)) { | 4501 | !test_bit(Faulty, &rdev->flags)) { |
4739 | if (raid5_add_disk(mddev, rdev)) { | 4502 | if (raid5_add_disk(mddev, rdev) == 0) { |
4740 | char nm[20]; | 4503 | char nm[20]; |
4741 | set_bit(In_sync, &rdev->flags); | 4504 | set_bit(In_sync, &rdev->flags); |
4742 | added_devices++; | 4505 | added_devices++; |
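raid5_add_disk() moves from a found/not-found return to the 0-or-negative-errno convention: -EINVAL when the array is already too degraded, -EEXIST when no slot in the first..last window is free, and the raid5_start_reshape() caller is updated to test for == 0. A compact model of the new slot-selection contract, omitting the saved_raid_disk preference for brevity (toy types, not the kernel's):

#include <stdio.h>

#define EEXIST 17
#define EINVAL 22

/* slots[i] != 0 stands in for conf->disks[i].rdev being occupied;
 * want_slot >= 0 models a preset rdev->raid_disk pinning first = last. */
static int toy_add_disk(int *slots, int nslots, int degraded,
			int max_degraded, int want_slot)
{
	int first = 0, last = nslots - 1;

	if (degraded > max_degraded)
		return -EINVAL;		/* no point adding a device */
	if (want_slot >= 0)
		first = last = want_slot;
	for (int disk = first; disk <= last; disk++)
		if (!slots[disk]) {
			slots[disk] = 1;
			return 0;
		}
	return -EEXIST;
}

int main(void)
{
	int slots[4] = { 1, 0, 1, 1 };

	printf("free scan: %d\n", toy_add_disk(slots, 4, 0, 1, -1)); /* 0 */
	printf("pinned:    %d\n", toy_add_disk(slots, 4, 0, 1, 0));  /* -EEXIST */
	return 0;
}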
diff --git a/include/linux/raid/bitmap.h b/include/linux/raid/bitmap.h index 78bfdea24a8e..e98900671ca9 100644 --- a/include/linux/raid/bitmap.h +++ b/include/linux/raid/bitmap.h | |||
@@ -221,6 +221,7 @@ struct bitmap { | |||
221 | unsigned long syncchunk; | 221 | unsigned long syncchunk; |
222 | 222 | ||
223 | __u64 events_cleared; | 223 | __u64 events_cleared; |
224 | int need_sync; | ||
224 | 225 | ||
225 | /* bitmap spinlock */ | 226 | /* bitmap spinlock */ |
226 | spinlock_t lock; | 227 | spinlock_t lock; |
diff --git a/include/linux/raid/md.h b/include/linux/raid/md.h index b7386ae9d288..dc0e3fcb9f28 100644 --- a/include/linux/raid/md.h +++ b/include/linux/raid/md.h | |||
@@ -95,7 +95,7 @@ extern int sync_page_io(struct block_device *bdev, sector_t sector, int size, | |||
95 | struct page *page, int rw); | 95 | struct page *page, int rw); |
96 | extern void md_do_sync(mddev_t *mddev); | 96 | extern void md_do_sync(mddev_t *mddev); |
97 | extern void md_new_event(mddev_t *mddev); | 97 | extern void md_new_event(mddev_t *mddev); |
98 | extern void md_allow_write(mddev_t *mddev); | 98 | extern int md_allow_write(mddev_t *mddev); |
99 | extern void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev); | 99 | extern void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev); |
100 | 100 | ||
101 | #endif /* CONFIG_MD */ | 101 | #endif /* CONFIG_MD */ |
diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h index 3dea9f545c8f..df30c4395875 100644 --- a/include/linux/raid/md_k.h +++ b/include/linux/raid/md_k.h | |||
@@ -87,6 +87,9 @@ struct mdk_rdev_s | |||
87 | #define Blocked 8 /* An error occurred on an externally | 87 | #define Blocked 8 /* An error occurred on an externally |
88 | * managed array, don't allow writes | 88 | * managed array, don't allow writes |
89 | * until it is cleared */ | 89 | * until it is cleared */ |
90 | #define StateChanged 9 /* Faulty or Blocked has changed during | ||
91 | * interrupt, so it needs to be | ||
92 | * notified by the thread */ | ||
90 | wait_queue_head_t blocked_wait; | 93 | wait_queue_head_t blocked_wait; |
91 | 94 | ||
92 | int desc_nr; /* descriptor index in the superblock */ | 95 | int desc_nr; /* descriptor index in the superblock */ |
@@ -188,6 +191,7 @@ struct mddev_s | |||
188 | * NEEDED: we might need to start a resync/recover | 191 | * NEEDED: we might need to start a resync/recover |
189 | * RUNNING: a thread is running, or about to be started | 192 | * RUNNING: a thread is running, or about to be started |
190 | * SYNC: actually doing a resync, not a recovery | 193 | * SYNC: actually doing a resync, not a recovery |
194 | * RECOVER: doing recovery, or need to try it. | ||
191 | * INTR: resync needs to be aborted for some reason | 195 | * INTR: resync needs to be aborted for some reason |
192 | * DONE: thread is done and is waiting to be reaped | 196 | * DONE: thread is done and is waiting to be reaped |
193 | * REQUEST: user-space has requested a sync (used with SYNC) | 197 | * REQUEST: user-space has requested a sync (used with SYNC) |
@@ -198,6 +202,7 @@ struct mddev_s | |||
198 | */ | 202 | */ |
199 | #define MD_RECOVERY_RUNNING 0 | 203 | #define MD_RECOVERY_RUNNING 0 |
200 | #define MD_RECOVERY_SYNC 1 | 204 | #define MD_RECOVERY_SYNC 1 |
205 | #define MD_RECOVERY_RECOVER 2 | ||
201 | #define MD_RECOVERY_INTR 3 | 206 | #define MD_RECOVERY_INTR 3 |
202 | #define MD_RECOVERY_DONE 4 | 207 | #define MD_RECOVERY_DONE 4 |
203 | #define MD_RECOVERY_NEEDED 5 | 208 | #define MD_RECOVERY_NEEDED 5 |
@@ -227,6 +232,8 @@ struct mddev_s | |||
227 | atomic_t recovery_active; /* blocks scheduled, but not written */ | 232 | atomic_t recovery_active; /* blocks scheduled, but not written */ |
228 | wait_queue_head_t recovery_wait; | 233 | wait_queue_head_t recovery_wait; |
229 | sector_t recovery_cp; | 234 | sector_t recovery_cp; |
235 | sector_t resync_min; /* user requested sync | ||
236 | * starts here */ | ||
230 | sector_t resync_max; /* resync should pause | 237 | sector_t resync_max; /* resync should pause |
231 | * when it gets here */ | 238 | * when it gets here */ |
232 | 239 | ||
diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h index f0827d31ae6f..3b2672792457 100644 --- a/include/linux/raid/raid5.h +++ b/include/linux/raid/raid5.h | |||
@@ -158,6 +158,43 @@ | |||
158 | * the compute block completes. | 158 | * the compute block completes. |
159 | */ | 159 | */ |
160 | 160 | ||
161 | /* | ||
162 | * Operations state - intermediate states that are visible outside of sh->lock | ||
163 | * In general _idle indicates nothing is running, _run indicates a data | ||
164 | * processing operation is active, and _result means the data processing result | ||
165 | * is stable and can be acted upon. For simple operations like biofill and | ||
166 | * compute that only have an _idle and _run state they are indicated with | ||
167 | * sh->state flags (STRIPE_BIOFILL_RUN and STRIPE_COMPUTE_RUN) | ||
168 | */ | ||
169 | /** | ||
170 | * enum check_states - handles syncing / repairing a stripe | ||
171 | * @check_state_idle - check operations are quiesced | ||
172 | * @check_state_run - check operation is running | ||
173 | * @check_state_check_result - set outside lock when check result is valid | ||
174 | * @check_state_compute_run - check failed and we are repairing | ||
175 | * @check_state_compute_result - set outside lock when compute result is valid | ||
176 | */ | ||
177 | enum check_states { | ||
178 | check_state_idle = 0, | ||
179 | check_state_run, /* parity check */ | ||
180 | check_state_check_result, | ||
181 | check_state_compute_run, /* parity repair */ | ||
182 | check_state_compute_result, | ||
183 | }; | ||
184 | |||
185 | /** | ||
186 | * enum reconstruct_states - handles writing or expanding a stripe | ||
187 | */ | ||
188 | enum reconstruct_states { | ||
189 | reconstruct_state_idle = 0, | ||
190 | reconstruct_state_prexor_drain_run, /* prexor-write */ | ||
191 | reconstruct_state_drain_run, /* write */ | ||
192 | reconstruct_state_run, /* expand */ | ||
193 | reconstruct_state_prexor_drain_result, | ||
194 | reconstruct_state_drain_result, | ||
195 | reconstruct_state_result, | ||
196 | }; | ||
197 | |||
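The comment block above introduces the three-phase convention (_idle, _run, _result) that replaces the old STRIPE_OP_* request/ack/complete masks. A self-contained illustration of how each _run state pairs with its _result state at completion time; in the kernel these transitions are driven from the raid5_run_ops() callbacks, not a helper like this:

#include <stdio.h>

enum reconstruct_states {
	reconstruct_state_idle = 0,
	reconstruct_state_prexor_drain_run,	/* prexor-write */
	reconstruct_state_drain_run,		/* write */
	reconstruct_state_run,			/* expand */
	reconstruct_state_prexor_drain_result,
	reconstruct_state_drain_result,
	reconstruct_state_result,
};

/* Toy completion hook: map each _run state to its _result state, which
 * is the value handle_stripe() later acts on and resets to _idle. */
static enum reconstruct_states on_complete(enum reconstruct_states s)
{
	switch (s) {
	case reconstruct_state_prexor_drain_run:
		return reconstruct_state_prexor_drain_result;
	case reconstruct_state_drain_run:
		return reconstruct_state_drain_result;
	case reconstruct_state_run:
		return reconstruct_state_result;
	default:
		return s;	/* no async reconstruct in flight */
	}
}

int main(void)
{
	enum reconstruct_states s = reconstruct_state_drain_run;

	s = on_complete(s);	/* ordinary write: drain_run -> drain_result */
	printf("%d\n", s == reconstruct_state_drain_result);
	return 0;
}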
161 | struct stripe_head { | 198 | struct stripe_head { |
162 | struct hlist_node hash; | 199 | struct hlist_node hash; |
163 | struct list_head lru; /* inactive_list or handle_list */ | 200 | struct list_head lru; /* inactive_list or handle_list */ |
@@ -169,19 +206,13 @@ struct stripe_head { | |||
169 | spinlock_t lock; | 206 | spinlock_t lock; |
170 | int bm_seq; /* sequence number for bitmap flushes */ | 207 | int bm_seq; /* sequence number for bitmap flushes */ |
171 | int disks; /* disks in stripe */ | 208 | int disks; /* disks in stripe */ |
209 | enum check_states check_state; | ||
210 | enum reconstruct_states reconstruct_state; | ||
172 | /* stripe_operations | 211 | /* stripe_operations |
173 | * @pending - pending ops flags (set for request->issue->complete) | ||
174 | * @ack - submitted ops flags (set for issue->complete) | ||
175 | * @complete - completed ops flags (set for complete) | ||
176 | * @target - STRIPE_OP_COMPUTE_BLK target | 212 | * @target - STRIPE_OP_COMPUTE_BLK target |
177 | * @count - raid5_runs_ops is set to run when this is non-zero | ||
178 | */ | 213 | */ |
179 | struct stripe_operations { | 214 | struct stripe_operations { |
180 | unsigned long pending; | ||
181 | unsigned long ack; | ||
182 | unsigned long complete; | ||
183 | int target; | 215 | int target; |
184 | int count; | ||
185 | u32 zero_sum_result; | 216 | u32 zero_sum_result; |
186 | } ops; | 217 | } ops; |
187 | struct r5dev { | 218 | struct r5dev { |
@@ -202,6 +233,7 @@ struct stripe_head_state { | |||
202 | int locked, uptodate, to_read, to_write, failed, written; | 233 | int locked, uptodate, to_read, to_write, failed, written; |
203 | int to_fill, compute, req_compute, non_overwrite; | 234 | int to_fill, compute, req_compute, non_overwrite; |
204 | int failed_num; | 235 | int failed_num; |
236 | unsigned long ops_request; | ||
205 | }; | 237 | }; |
206 | 238 | ||
207 | /* r6_state - extra state data only relevant to r6 */ | 239 | /* r6_state - extra state data only relevant to r6 */ |
@@ -228,9 +260,7 @@ struct r6_state { | |||
228 | #define R5_Wantfill 12 /* dev->toread contains a bio that needs | 260 | #define R5_Wantfill 12 /* dev->toread contains a bio that needs |
229 | * filling | 261 | * filling |
230 | */ | 262 | */ |
231 | #define R5_Wantprexor 13 /* distinguish blocks ready for rmw from | 263 | #define R5_Wantdrain 13 /* dev->towrite needs to be drained */ |
232 | * other "towrites" | ||
233 | */ | ||
234 | /* | 264 | /* |
235 | * Write method | 265 | * Write method |
236 | */ | 266 | */ |
@@ -254,8 +284,10 @@ struct r6_state { | |||
254 | #define STRIPE_EXPAND_READY 11 | 284 | #define STRIPE_EXPAND_READY 11 |
255 | #define STRIPE_IO_STARTED 12 /* do not count towards 'bypass_count' */ | 285 | #define STRIPE_IO_STARTED 12 /* do not count towards 'bypass_count' */ |
256 | #define STRIPE_FULL_WRITE 13 /* all blocks are set to be overwritten */ | 286 | #define STRIPE_FULL_WRITE 13 /* all blocks are set to be overwritten */ |
287 | #define STRIPE_BIOFILL_RUN 14 | ||
288 | #define STRIPE_COMPUTE_RUN 15 | ||
257 | /* | 289 | /* |
258 | * Operations flags (in issue order) | 290 | * Operation request flags |
259 | */ | 291 | */ |
260 | #define STRIPE_OP_BIOFILL 0 | 292 | #define STRIPE_OP_BIOFILL 0 |
261 | #define STRIPE_OP_COMPUTE_BLK 1 | 293 | #define STRIPE_OP_COMPUTE_BLK 1 |
@@ -263,14 +295,6 @@ struct r6_state { | |||
263 | #define STRIPE_OP_BIODRAIN 3 | 295 | #define STRIPE_OP_BIODRAIN 3 |
264 | #define STRIPE_OP_POSTXOR 4 | 296 | #define STRIPE_OP_POSTXOR 4 |
265 | #define STRIPE_OP_CHECK 5 | 297 | #define STRIPE_OP_CHECK 5 |
266 | #define STRIPE_OP_IO 6 | ||
267 | |||
268 | /* modifiers to the base operations | ||
269 | * STRIPE_OP_MOD_REPAIR_PD - compute the parity block and write it back | ||
270 | * STRIPE_OP_MOD_DMA_CHECK - parity is not corrupted by the check | ||
271 | */ | ||
272 | #define STRIPE_OP_MOD_REPAIR_PD 7 | ||
273 | #define STRIPE_OP_MOD_DMA_CHECK 8 | ||
274 | 298 | ||
275 | /* | 299 | /* |
276 | * Plugging: | 300 | * Plugging: |