Diffstat (limited to 'drivers/md')

 drivers/md/bitmap.c             | 183
 drivers/md/dm-exception-store.c |   9
 drivers/md/dm-raid1.c           |  12
 drivers/md/linear.c             | 100
 drivers/md/md.c                 | 227
 drivers/md/multipath.c          |   5
 drivers/md/raid0.c              |   5
 drivers/md/raid1.c              | 234
 drivers/md/raid10.c             |  46
 drivers/md/raid5.c              | 138
 drivers/md/raid6main.c          | 138

 11 files changed, 856 insertions(+), 241 deletions(-)
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 41df4cda66e2..2fba2bbe72d8 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -270,19 +270,20 @@ static struct page *read_sb_page(mddev_t *mddev, long offset, unsigned long inde
 
 	if (!page)
 		return ERR_PTR(-ENOMEM);
-	do {
-		ITERATE_RDEV(mddev, rdev, tmp)
-			if (rdev->in_sync && !rdev->faulty)
-				goto found;
-		return ERR_PTR(-EIO);
 
- found:
+	ITERATE_RDEV(mddev, rdev, tmp) {
+		if (! rdev->in_sync || rdev->faulty)
+			continue;
+
 		target = (rdev->sb_offset << 1) + offset + index * (PAGE_SIZE/512);
 
-	} while (!sync_page_io(rdev->bdev, target, PAGE_SIZE, page, READ));
+		if (sync_page_io(rdev->bdev, target, PAGE_SIZE, page, READ)) {
+			page->index = index;
+			return page;
+		}
+	}
+	return ERR_PTR(-EIO);
 
-	page->index = index;
-	return page;
 }
 
 static int write_sb_page(mddev_t *mddev, long offset, struct page *page, int wait)
@@ -437,6 +438,7 @@ void bitmap_print_sb(struct bitmap *bitmap)
 	printk(KERN_DEBUG " daemon sleep: %ds\n", le32_to_cpu(sb->daemon_sleep));
 	printk(KERN_DEBUG " sync size: %llu KB\n",
 		(unsigned long long)le64_to_cpu(sb->sync_size)/2);
+	printk(KERN_DEBUG "max write behind: %d\n", le32_to_cpu(sb->write_behind));
 	kunmap(bitmap->sb_page);
 }
 
@@ -445,7 +447,7 @@ static int bitmap_read_sb(struct bitmap *bitmap)
 {
 	char *reason = NULL;
 	bitmap_super_t *sb;
-	unsigned long chunksize, daemon_sleep;
+	unsigned long chunksize, daemon_sleep, write_behind;
 	unsigned long bytes_read;
 	unsigned long long events;
 	int err = -EINVAL;
@@ -474,6 +476,7 @@ static int bitmap_read_sb(struct bitmap *bitmap)
 
 	chunksize = le32_to_cpu(sb->chunksize);
 	daemon_sleep = le32_to_cpu(sb->daemon_sleep);
+	write_behind = le32_to_cpu(sb->write_behind);
 
 	/* verify that the bitmap-specific fields are valid */
 	if (sb->magic != cpu_to_le32(BITMAP_MAGIC))
@@ -485,7 +488,9 @@ static int bitmap_read_sb(struct bitmap *bitmap)
 	else if ((1 << ffz(~chunksize)) != chunksize)
 		reason = "bitmap chunksize not a power of 2";
 	else if (daemon_sleep < 1 || daemon_sleep > 15)
-		reason = "daemon sleep period out of range";
+		reason = "daemon sleep period out of range (1-15s)";
+	else if (write_behind > COUNTER_MAX)
+		reason = "write-behind limit out of range (0 - 16383)";
 	if (reason) {
 		printk(KERN_INFO "%s: invalid bitmap file superblock: %s\n",
 			bmname(bitmap), reason);
@@ -518,8 +523,12 @@ success:
 	/* assign fields using values from superblock */
 	bitmap->chunksize = chunksize;
 	bitmap->daemon_sleep = daemon_sleep;
+	bitmap->daemon_lastrun = jiffies;
+	bitmap->max_write_behind = write_behind;
 	bitmap->flags |= sb->state;
 	bitmap->events_cleared = le64_to_cpu(sb->events_cleared);
+	if (sb->state & BITMAP_STALE)
+		bitmap->events_cleared = bitmap->mddev->events;
 	err = 0;
 out:
 	kunmap(bitmap->sb_page);
@@ -617,7 +626,7 @@ static void bitmap_file_unmap(struct bitmap *bitmap)
 		page_cache_release(sb_page);
 }
 
-static void bitmap_stop_daemons(struct bitmap *bitmap);
+static void bitmap_stop_daemon(struct bitmap *bitmap);
 
 /* dequeue the next item in a page list -- don't call from irq context */
 static struct page_list *dequeue_page(struct bitmap *bitmap)
@@ -659,7 +668,7 @@ static void bitmap_file_put(struct bitmap *bitmap)
 	bitmap->file = NULL;
 	spin_unlock_irqrestore(&bitmap->lock, flags);
 
-	bitmap_stop_daemons(bitmap);
+	bitmap_stop_daemon(bitmap);
 
 	drain_write_queues(bitmap);
 
@@ -818,7 +827,7 @@ int bitmap_unplug(struct bitmap *bitmap)
 	return 0;
 }
 
-static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset);
+static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int needed);
 /* * bitmap_init_from_disk -- called at bitmap_create time to initialize
  * the in-memory bitmap from the on-disk bitmap -- also, sets up the
  * memory mapping of the bitmap file
@@ -826,8 +835,11 @@ static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset);
  * if there's no bitmap file, or if the bitmap file had been
  * previously kicked from the array, we mark all the bits as
  * 1's in order to cause a full resync.
+ *
+ * We ignore all bits for sectors that end earlier than 'start'.
+ * This is used when reading an out-of-date bitmap...
  */
-static int bitmap_init_from_disk(struct bitmap *bitmap)
+static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
 {
 	unsigned long i, chunks, index, oldindex, bit;
 	struct page *page = NULL, *oldpage = NULL;
@@ -914,7 +926,7 @@ static int bitmap_init_from_disk(struct bitmap *bitmap)
 			 * whole page and write it out
 			 */
 			memset(page_address(page) + offset, 0xff,
-				PAGE_SIZE - offset);
+			       PAGE_SIZE - offset);
 			ret = write_page(bitmap, page, 1);
 			if (ret) {
 				kunmap(page);
@@ -928,8 +940,11 @@ static int bitmap_init_from_disk(struct bitmap *bitmap)
 		}
 		if (test_bit(bit, page_address(page))) {
 			/* if the disk bit is set, set the memory bit */
-			bitmap_set_memory_bits(bitmap, i << CHUNK_BLOCK_SHIFT(bitmap));
+			bitmap_set_memory_bits(bitmap, i << CHUNK_BLOCK_SHIFT(bitmap),
+					       ((i+1) << (CHUNK_BLOCK_SHIFT(bitmap)) >= start)
+				);
 			bit_cnt++;
+			set_page_attr(bitmap, page, BITMAP_PAGE_CLEAN);
 		}
 	}
 
@@ -1141,6 +1156,9 @@ static void bitmap_writeback_daemon(mddev_t *mddev)
 		err = -EINTR;
 		goto out;
 	}
+	if (bitmap == NULL)
+		/* about to be stopped. */
+		return;
 
 	PRINTK("%s: bitmap writeback daemon woke up...\n", bmname(bitmap));
 	/* wait on bitmap page writebacks */
@@ -1170,21 +1188,12 @@ static void bitmap_writeback_daemon(mddev_t *mddev)
 	}
 }
 
-static int bitmap_start_daemon(struct bitmap *bitmap, mdk_thread_t **ptr,
-				void (*func)(mddev_t *), char *name)
+static mdk_thread_t *bitmap_start_daemon(struct bitmap *bitmap,
+			void (*func)(mddev_t *), char *name)
 {
 	mdk_thread_t *daemon;
-	unsigned long flags;
 	char namebuf[32];
 
-	spin_lock_irqsave(&bitmap->lock, flags);
-	*ptr = NULL;
-
-	if (!bitmap->file) /* no need for daemon if there's no backing file */
-		goto out_unlock;
-
-	spin_unlock_irqrestore(&bitmap->lock, flags);
-
 #ifdef INJECT_FATAL_FAULT_2
 	daemon = NULL;
 #else
@@ -1194,47 +1203,32 @@ static int bitmap_start_daemon(struct bitmap *bitmap, mdk_thread_t **ptr,
 	if (!daemon) {
 		printk(KERN_ERR "%s: failed to start bitmap daemon\n",
 			bmname(bitmap));
-		return -ECHILD;
+		return ERR_PTR(-ECHILD);
 	}
 
-	spin_lock_irqsave(&bitmap->lock, flags);
-	*ptr = daemon;
-
 	md_wakeup_thread(daemon); /* start it running */
 
 	PRINTK("%s: %s daemon (pid %d) started...\n",
 		bmname(bitmap), name, daemon->tsk->pid);
-out_unlock:
-	spin_unlock_irqrestore(&bitmap->lock, flags);
-	return 0;
-}
 
-static int bitmap_start_daemons(struct bitmap *bitmap)
-{
-	int err = bitmap_start_daemon(bitmap, &bitmap->writeback_daemon,
-			bitmap_writeback_daemon, "bitmap_wb");
-	return err;
+	return daemon;
 }
 
-static void bitmap_stop_daemon(struct bitmap *bitmap, mdk_thread_t **ptr)
+static void bitmap_stop_daemon(struct bitmap *bitmap)
 {
-	mdk_thread_t *daemon;
-	unsigned long flags;
-
-	spin_lock_irqsave(&bitmap->lock, flags);
-	daemon = *ptr;
-	*ptr = NULL;
-	spin_unlock_irqrestore(&bitmap->lock, flags);
-	if (daemon)
-		md_unregister_thread(daemon); /* destroy the thread */
-}
+	/* the daemon can't stop itself... it'll just exit instead... */
+	if (bitmap->writeback_daemon && ! IS_ERR(bitmap->writeback_daemon) &&
+	    current->pid != bitmap->writeback_daemon->tsk->pid) {
+		mdk_thread_t *daemon;
+		unsigned long flags;
 
-static void bitmap_stop_daemons(struct bitmap *bitmap)
-{
-	/* the daemons can't stop themselves... they'll just exit instead... */
-	if (bitmap->writeback_daemon &&
-	    current->pid != bitmap->writeback_daemon->tsk->pid)
-		bitmap_stop_daemon(bitmap, &bitmap->writeback_daemon);
+		spin_lock_irqsave(&bitmap->lock, flags);
+		daemon = bitmap->writeback_daemon;
+		bitmap->writeback_daemon = NULL;
+		spin_unlock_irqrestore(&bitmap->lock, flags);
+		if (daemon && ! IS_ERR(daemon))
+			md_unregister_thread(daemon); /* destroy the thread */
+	}
 }
 
 static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap,
@@ -1274,9 +1268,16 @@ static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap,
 	}
 }
 
-int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors)
+int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors, int behind)
 {
 	if (!bitmap) return 0;
+
+	if (behind) {
+		atomic_inc(&bitmap->behind_writes);
+		PRINTK(KERN_DEBUG "inc write-behind count %d/%d\n",
+		       atomic_read(&bitmap->behind_writes), bitmap->max_write_behind);
+	}
+
 	while (sectors) {
 		int blocks;
 		bitmap_counter_t *bmc;
@@ -1311,9 +1312,15 @@ int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sect
 }
 
 void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors,
-		     int success)
+		     int success, int behind)
 {
 	if (!bitmap) return;
+	if (behind) {
+		atomic_dec(&bitmap->behind_writes);
+		PRINTK(KERN_DEBUG "dec write-behind count %d/%d\n",
+		       atomic_read(&bitmap->behind_writes), bitmap->max_write_behind);
+	}
+
 	while (sectors) {
 		int blocks;
 		unsigned long flags;
@@ -1424,7 +1431,7 @@ void bitmap_close_sync(struct bitmap *bitmap)
 	}
 }
 
-static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset)
+static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int needed)
 {
 	/* For each chunk covered by any of these sectors, set the
 	 * counter to 1 and set resync_needed. They should all
@@ -1441,7 +1448,7 @@ static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset)
 	}
 	if (! *bmc) {
 		struct page *page;
-		*bmc = 1 | NEEDED_MASK;
+		*bmc = 1 | (needed?NEEDED_MASK:0);
 		bitmap_count_page(bitmap, offset, 1);
 		page = filemap_get_page(bitmap, offset >> CHUNK_BLOCK_SHIFT(bitmap));
 		set_page_attr(bitmap, page, BITMAP_PAGE_CLEAN);
@@ -1476,17 +1483,14 @@ void bitmap_flush(mddev_t *mddev)
 /*
  * free memory that was allocated
  */
-void bitmap_destroy(mddev_t *mddev)
+static void bitmap_free(struct bitmap *bitmap)
 {
 	unsigned long k, pages;
 	struct bitmap_page *bp;
-	struct bitmap *bitmap = mddev->bitmap;
 
 	if (!bitmap) /* there was no bitmap */
 		return;
 
-	mddev->bitmap = NULL; /* disconnect from the md device */
-
 	/* release the bitmap file and kill the daemon */
 	bitmap_file_put(bitmap);
 
@@ -1504,6 +1508,17 @@ void bitmap_destroy(mddev_t *mddev)
 	kfree(bp);
 	kfree(bitmap);
 }
+void bitmap_destroy(mddev_t *mddev)
+{
+	struct bitmap *bitmap = mddev->bitmap;
+
+	if (!bitmap) /* there was no bitmap */
+		return;
+
+	mddev->bitmap = NULL; /* disconnect from the md device */
+
+	bitmap_free(bitmap);
+}
 
 /*
  * initialize the bitmap structure
@@ -1517,6 +1532,7 @@ int bitmap_create(mddev_t *mddev)
 	unsigned long pages;
 	struct file *file = mddev->bitmap_file;
 	int err;
+	sector_t start;
 
 	BUG_ON(sizeof(bitmap_super_t) != 256);
 
@@ -1533,15 +1549,15 @@ int bitmap_create(mddev_t *mddev)
 
 	spin_lock_init(&bitmap->lock);
 	bitmap->mddev = mddev;
-	mddev->bitmap = bitmap;
 
 	spin_lock_init(&bitmap->write_lock);
 	INIT_LIST_HEAD(&bitmap->complete_pages);
 	init_waitqueue_head(&bitmap->write_wait);
 	bitmap->write_pool = mempool_create(WRITE_POOL_SIZE, write_pool_alloc,
 				write_pool_free, NULL);
+	err = -ENOMEM;
 	if (!bitmap->write_pool)
-		return -ENOMEM;
+		goto error;
 
 	bitmap->file = file;
 	bitmap->offset = mddev->bitmap_offset;
@@ -1549,7 +1565,7 @@ int bitmap_create(mddev_t *mddev)
 	/* read superblock from bitmap file (this sets bitmap->chunksize) */
 	err = bitmap_read_sb(bitmap);
 	if (err)
-		return err;
+		goto error;
 
 	bitmap->chunkshift = find_first_bit(&bitmap->chunksize,
 					sizeof(bitmap->chunksize));
@@ -1573,27 +1589,44 @@ int bitmap_create(mddev_t *mddev)
 #else
 	bitmap->bp = kmalloc(pages * sizeof(*bitmap->bp), GFP_KERNEL);
 #endif
+	err = -ENOMEM;
 	if (!bitmap->bp)
-		return -ENOMEM;
+		goto error;
 	memset(bitmap->bp, 0, pages * sizeof(*bitmap->bp));
 
 	bitmap->flags |= BITMAP_ACTIVE;
 
 	/* now that we have some pages available, initialize the in-memory
 	 * bitmap from the on-disk bitmap */
-	err = bitmap_init_from_disk(bitmap);
+	start = 0;
+	if (mddev->degraded == 0
+	    || bitmap->events_cleared == mddev->events)
+		/* no need to keep dirty bits to optimise a re-add of a missing device */
+		start = mddev->recovery_cp;
+	err = bitmap_init_from_disk(bitmap, start);
 
 	if (err)
-		return err;
+		goto error;
 
 	printk(KERN_INFO "created bitmap (%lu pages) for device %s\n",
 		pages, bmname(bitmap));
 
-	/* kick off the bitmap daemons */
-	err = bitmap_start_daemons(bitmap);
-	if (err)
-		return err;
+	mddev->bitmap = bitmap;
+
+	if (file)
+		/* kick off the bitmap writeback daemon */
+		bitmap->writeback_daemon =
+			bitmap_start_daemon(bitmap,
+					    bitmap_writeback_daemon,
+					    "bitmap_wb");
+
+	if (IS_ERR(bitmap->writeback_daemon))
+		return PTR_ERR(bitmap->writeback_daemon);
 	return bitmap_update_sb(bitmap);
+
+ error:
+	bitmap_free(bitmap);
+	return err;
 }
 
 /* the bitmap API -- for raid personalities */
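
The write-behind accounting added to bitmap_startwrite()/bitmap_endwrite() above is just an outstanding-I/O counter: increment when a behind-write is issued, decrement when it completes. The throttle itself (blocking once behind_writes reaches max_write_behind, and draining before the data can be considered safe) lives with the caller. A minimal user-space C sketch of that pattern, with hypothetical names standing in for bitmap->behind_writes and bitmap->max_write_behind:

    #include <pthread.h>
    #include <stdio.h>

    /* Hypothetical throttle mirroring the behind_writes counter: the issuer
     * blocks once too many behind-writes are outstanding, and each completion
     * wakes it -- the same drain the raid1 personality relies on. */
    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t drained = PTHREAD_COND_INITIALIZER;
    static int behind_writes;
    static const int max_write_behind = 256;

    static void start_behind_write(void)
    {
            pthread_mutex_lock(&lock);
            while (behind_writes >= max_write_behind)
                    pthread_cond_wait(&drained, &lock);
            behind_writes++;                /* bitmap_startwrite(..., behind=1) */
            pthread_mutex_unlock(&lock);
    }

    static void end_behind_write(void)
    {
            pthread_mutex_lock(&lock);
            behind_writes--;                /* bitmap_endwrite(..., behind=1) */
            pthread_cond_signal(&drained);
            pthread_mutex_unlock(&lock);
    }

    int main(void)
    {
            start_behind_write();
            end_behind_write();
            printf("outstanding behind writes: %d\n", behind_writes);
            return 0;
    }

The kernel code uses atomic_t and a wait queue rather than a mutex and condition variable, but the shape of the accounting is the same.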
diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c
index 17212b4201a1..cc07bbebbb16 100644
--- a/drivers/md/dm-exception-store.c
+++ b/drivers/md/dm-exception-store.c
@@ -568,12 +568,9 @@ int dm_create_persistent(struct exception_store *store, uint32_t chunk_size)
 
       bad:
 	dm_io_put(sectors_to_pages(chunk_size));
-	if (ps) {
-		if (ps->area)
-			free_area(ps);
-
-		kfree(ps);
-	}
+	if (ps && ps->area)
+		free_area(ps);
+	kfree(ps);
 	return r;
 }
 
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index b08df8b9b2ca..863282513753 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -375,16 +375,18 @@ static void rh_inc(struct region_hash *rh, region_t region)
 
 	read_lock(&rh->hash_lock);
 	reg = __rh_find(rh, region);
+
+	atomic_inc(&reg->pending);
+
+	spin_lock_irq(&rh->region_lock);
 	if (reg->state == RH_CLEAN) {
 		rh->log->type->mark_region(rh->log, reg->key);
 
-		spin_lock_irq(&rh->region_lock);
 		reg->state = RH_DIRTY;
 		list_del_init(&reg->list);	/* take off the clean list */
-		spin_unlock_irq(&rh->region_lock);
 	}
+	spin_unlock_irq(&rh->region_lock);
 
-	atomic_inc(&reg->pending);
 	read_unlock(&rh->hash_lock);
 }
 
@@ -408,6 +410,10 @@ static void rh_dec(struct region_hash *rh, region_t region)
 
 	if (atomic_dec_and_test(&reg->pending)) {
 		spin_lock_irqsave(&rh->region_lock, flags);
+		if (atomic_read(&reg->pending)) { /* check race */
+			spin_unlock_irqrestore(&rh->region_lock, flags);
+			return;
+		}
 		if (reg->state == RH_RECOVERING) {
 			list_add_tail(&reg->list, &rh->quiesced_regions);
 		} else {
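
The four lines added to rh_dec() are a standard idiom: atomic_dec_and_test() can race with a concurrent rh_inc(), so after the count apparently drops to zero the counter is re-read under region_lock, and the region is left alone if it was re-referenced in the window. A self-contained sketch of the same idiom, using hypothetical names and C11 atomics with a pthread mutex standing in for the spinlock:

    #include <pthread.h>
    #include <stdatomic.h>

    struct region {
            atomic_int pending;             /* like reg->pending */
            pthread_mutex_t lock;           /* like rh->region_lock */
            int quiesced;
    };

    static void region_get(struct region *reg)
    {
            atomic_fetch_add(&reg->pending, 1);
    }

    static void region_put(struct region *reg)
    {
            /* atomic_fetch_sub() returning 1 == atomic_dec_and_test() succeeding */
            if (atomic_fetch_sub(&reg->pending, 1) == 1) {
                    pthread_mutex_lock(&reg->lock);
                    if (atomic_load(&reg->pending)) {       /* check race */
                            /* re-referenced between our decrement and
                             * taking the lock: do nothing */
                            pthread_mutex_unlock(&reg->lock);
                            return;
                    }
                    reg->quiesced = 1;      /* really idle: safe to retire */
                    pthread_mutex_unlock(&reg->lock);
            }
    }

    int main(void)
    {
            struct region r = { .lock = PTHREAD_MUTEX_INITIALIZER };
            region_get(&r);
            region_put(&r);
            return 0;
    }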
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 8d740013d74d..bb279fad2fd2 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -38,7 +38,8 @@ static inline dev_info_t *which_dev(mddev_t *mddev, sector_t sector)
 	/*
 	 * sector_div(a,b) returns the remainer and sets a to a/b
 	 */
-	(void)sector_div(block, conf->smallest->size);
+	block >>= conf->preshift;
+	(void)sector_div(block, conf->hash_spacing);
 	hash = conf->hash_table[block];
 
 	while ((sector>>1) >= (hash->size + hash->offset))
@@ -47,7 +48,7 @@ static inline dev_info_t *which_dev(mddev_t *mddev, sector_t sector)
 }
 
 /**
- *	linear_mergeable_bvec -- tell bio layer if a two requests can be merged
+ *	linear_mergeable_bvec -- tell bio layer if two requests can be merged
  *	@q: request queue
  *	@bio: the buffer head that's been built up so far
  *	@biovec: the request that could be merged to it.
@@ -116,7 +117,7 @@ static int linear_run (mddev_t *mddev)
 	dev_info_t **table;
 	mdk_rdev_t *rdev;
 	int i, nb_zone, cnt;
-	sector_t start;
+	sector_t min_spacing;
 	sector_t curr_offset;
 	struct list_head *tmp;
 
@@ -127,11 +128,6 @@ static int linear_run (mddev_t *mddev)
 	memset(conf, 0, sizeof(*conf) + mddev->raid_disks*sizeof(dev_info_t));
 	mddev->private = conf;
 
-	/*
-	 * Find the smallest device.
-	 */
-
-	conf->smallest = NULL;
 	cnt = 0;
 	mddev->array_size = 0;
 
@@ -159,8 +155,6 @@ static int linear_run (mddev_t *mddev)
 		disk->size = rdev->size;
 		mddev->array_size += rdev->size;
 
-		if (!conf->smallest || (disk->size < conf->smallest->size))
-			conf->smallest = disk;
 		cnt++;
 	}
 	if (cnt != mddev->raid_disks) {
@@ -168,6 +162,36 @@ static int linear_run (mddev_t *mddev)
 		goto out;
 	}
 
+	min_spacing = mddev->array_size;
+	sector_div(min_spacing, PAGE_SIZE/sizeof(struct dev_info *));
+
+	/* min_spacing is the minimum spacing that will fit the hash
+	 * table in one PAGE. This may be much smaller than needed.
+	 * We find the smallest non-terminal set of consecutive devices
+	 * that is larger than min_spacing as use the size of that as
+	 * the actual spacing
+	 */
+	conf->hash_spacing = mddev->array_size;
+	for (i=0; i < cnt-1 ; i++) {
+		sector_t sz = 0;
+		int j;
+		for (j=i; i<cnt-1 && sz < min_spacing ; j++)
+			sz += conf->disks[j].size;
+		if (sz >= min_spacing && sz < conf->hash_spacing)
+			conf->hash_spacing = sz;
+	}
+
+	/* hash_spacing may be too large for sector_div to work with,
+	 * so we might need to pre-shift
+	 */
+	conf->preshift = 0;
+	if (sizeof(sector_t) > sizeof(u32)) {
+		sector_t space = conf->hash_spacing;
+		while (space > (sector_t)(~(u32)0)) {
+			space >>= 1;
+			conf->preshift++;
+		}
+	}
 	/*
 	 * This code was restructured to work around a gcc-2.95.3 internal
 	 * compiler error.  Alter it with care.
@@ -177,39 +201,52 @@ static int linear_run (mddev_t *mddev)
 		unsigned round;
 		unsigned long base;
 
-		sz = mddev->array_size;
-		base = conf->smallest->size;
+		sz = mddev->array_size >> conf->preshift;
+		sz += 1; /* force round-up */
+		base = conf->hash_spacing >> conf->preshift;
 		round = sector_div(sz, base);
-		nb_zone = conf->nr_zones = sz + (round ? 1 : 0);
+		nb_zone = sz + (round ? 1 : 0);
 	}
-
-	conf->hash_table = kmalloc (sizeof (dev_info_t*) * nb_zone,
+	BUG_ON(nb_zone > PAGE_SIZE / sizeof(struct dev_info *));
+
+	conf->hash_table = kmalloc (sizeof (struct dev_info *) * nb_zone,
 					GFP_KERNEL);
 	if (!conf->hash_table)
 		goto out;
 
 	/*
 	 * Here we generate the linear hash table
+	 * First calculate the device offsets.
 	 */
+	conf->disks[0].offset = 0;
+	for (i=1; i<mddev->raid_disks; i++)
+		conf->disks[i].offset =
+			conf->disks[i-1].offset +
+			conf->disks[i-1].size;
+
 	table = conf->hash_table;
-	start = 0;
 	curr_offset = 0;
-	for (i = 0; i < cnt; i++) {
-		dev_info_t *disk = conf->disks + i;
+	i = 0;
+	for (curr_offset = 0;
+	     curr_offset < mddev->array_size;
+	     curr_offset += conf->hash_spacing) {
 
-		disk->offset = curr_offset;
-		curr_offset += disk->size;
+		while (i < mddev->raid_disks-1 &&
+		       curr_offset >= conf->disks[i+1].offset)
+			i++;
 
-		/* 'curr_offset' is the end of this disk
-		 * 'start' is the start of table
+		*table ++ = conf->disks + i;
+	}
+
+	if (conf->preshift) {
+		conf->hash_spacing >>= conf->preshift;
+		/* round hash_spacing up so that when we divide by it,
+		 * we err on the side of "too-low", which is safest.
 		 */
-		while (start < curr_offset) {
-			*table++ = disk;
-			start += conf->smallest->size;
-		}
+		conf->hash_spacing++;
 	}
 
-	if (table-conf->hash_table != nb_zone)
-		BUG();
+
+	BUG_ON(table - conf->hash_table > nb_zone);
 
 	blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec);
 	mddev->queue->unplug_fn = linear_unplug;
@@ -238,6 +275,11 @@ static int linear_make_request (request_queue_t *q, struct bio *bio)
 	dev_info_t *tmp_dev;
 	sector_t block;
 
+	if (unlikely(bio_barrier(bio))) {
+		bio_endio(bio, bio->bi_size, -EOPNOTSUPP);
+		return 0;
+	}
+
 	if (bio_data_dir(bio)==WRITE) {
 		disk_stat_inc(mddev->gendisk, writes);
 		disk_stat_add(mddev->gendisk, write_sectors, bio_sectors(bio));
@@ -294,7 +336,7 @@ static void linear_status (struct seq_file *seq, mddev_t *mddev)
 	sector_t s = 0;
 
 	seq_printf(seq, " ");
-	for (j = 0; j < conf->nr_zones; j++)
+	for (j = 0; j < mddev->raid_disks; j++)
 	{
 		char b[BDEVNAME_SIZE];
 		s += conf->smallest_size;
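
With the table built this way, the lookup in which_dev() is a shift plus a single division: preshift keeps the divisor inside 32 bits so sector_div() stays cheap, and because hash_spacing is rounded up after shifting, the quotient can only land at or before the correct zone, so the short forward walk that follows is always safe. A simplified user-space sketch with hand-picked, hypothetical numbers (plain 64-bit division standing in for sector_div()):

    #include <stdint.h>
    #include <stdio.h>

    struct dev_info {
            uint64_t offset, size;          /* in 1K blocks, like rdev->size */
    };

    /* two components; hash_table/hash_spacing/preshift chosen by hand */
    static struct dev_info disks[2] = { { 0, 1000 }, { 1000, 3000 } };
    static struct dev_info *hash_table[] =
            { &disks[0], &disks[1], &disks[1], &disks[1] };
    static const unsigned preshift = 0;
    static const uint64_t hash_spacing = 1001;  /* rounded up: lookups err low */

    static struct dev_info *which_dev(uint64_t sector)
    {
            uint64_t block = sector >> 1;
            struct dev_info *hash;

            block >>= preshift;
            hash = hash_table[block / hash_spacing];  /* sector_div() in the kernel */

            /* rounding the spacing up means we may land one zone early */
            while ((sector >> 1) >= hash->size + hash->offset)
                    hash++;
            return hash;
    }

    int main(void)
    {
            printf("sector 4000 -> device at offset %llu\n",
                   (unsigned long long)which_dev(4000)->offset);
            return 0;
    }

The forward walk works because hash_table entries point into the contiguous disks[] array, exactly as in the driver.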
diff --git a/drivers/md/md.c b/drivers/md/md.c index 20ca80b7dc20..2897df90df44 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c | |||
@@ -34,6 +34,7 @@ | |||
34 | 34 | ||
35 | #include <linux/module.h> | 35 | #include <linux/module.h> |
36 | #include <linux/config.h> | 36 | #include <linux/config.h> |
37 | #include <linux/kthread.h> | ||
37 | #include <linux/linkage.h> | 38 | #include <linux/linkage.h> |
38 | #include <linux/raid/md.h> | 39 | #include <linux/raid/md.h> |
39 | #include <linux/raid/bitmap.h> | 40 | #include <linux/raid/bitmap.h> |
@@ -73,7 +74,7 @@ static DEFINE_SPINLOCK(pers_lock); | |||
73 | * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit' | 74 | * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit' |
74 | * is 1000 KB/sec, so the extra system load does not show up that much. | 75 | * is 1000 KB/sec, so the extra system load does not show up that much. |
75 | * Increase it if you want to have more _guaranteed_ speed. Note that | 76 | * Increase it if you want to have more _guaranteed_ speed. Note that |
76 | * the RAID driver will use the maximum available bandwith if the IO | 77 | * the RAID driver will use the maximum available bandwidth if the IO |
77 | * subsystem is idle. There is also an 'absolute maximum' reconstruction | 78 | * subsystem is idle. There is also an 'absolute maximum' reconstruction |
78 | * speed limit - in case reconstruction slows down your system despite | 79 | * speed limit - in case reconstruction slows down your system despite |
79 | * idle IO detection. | 80 | * idle IO detection. |
@@ -393,7 +394,7 @@ int sync_page_io(struct block_device *bdev, sector_t sector, int size, | |||
393 | return ret; | 394 | return ret; |
394 | } | 395 | } |
395 | 396 | ||
396 | static int read_disk_sb(mdk_rdev_t * rdev) | 397 | static int read_disk_sb(mdk_rdev_t * rdev, int size) |
397 | { | 398 | { |
398 | char b[BDEVNAME_SIZE]; | 399 | char b[BDEVNAME_SIZE]; |
399 | if (!rdev->sb_page) { | 400 | if (!rdev->sb_page) { |
@@ -404,7 +405,7 @@ static int read_disk_sb(mdk_rdev_t * rdev) | |||
404 | return 0; | 405 | return 0; |
405 | 406 | ||
406 | 407 | ||
407 | if (!sync_page_io(rdev->bdev, rdev->sb_offset<<1, MD_SB_BYTES, rdev->sb_page, READ)) | 408 | if (!sync_page_io(rdev->bdev, rdev->sb_offset<<1, size, rdev->sb_page, READ)) |
408 | goto fail; | 409 | goto fail; |
409 | rdev->sb_loaded = 1; | 410 | rdev->sb_loaded = 1; |
410 | return 0; | 411 | return 0; |
@@ -531,7 +532,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version | |||
531 | sb_offset = calc_dev_sboffset(rdev->bdev); | 532 | sb_offset = calc_dev_sboffset(rdev->bdev); |
532 | rdev->sb_offset = sb_offset; | 533 | rdev->sb_offset = sb_offset; |
533 | 534 | ||
534 | ret = read_disk_sb(rdev); | 535 | ret = read_disk_sb(rdev, MD_SB_BYTES); |
535 | if (ret) return ret; | 536 | if (ret) return ret; |
536 | 537 | ||
537 | ret = -EINVAL; | 538 | ret = -EINVAL; |
@@ -564,6 +565,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version | |||
564 | 565 | ||
565 | rdev->preferred_minor = sb->md_minor; | 566 | rdev->preferred_minor = sb->md_minor; |
566 | rdev->data_offset = 0; | 567 | rdev->data_offset = 0; |
568 | rdev->sb_size = MD_SB_BYTES; | ||
567 | 569 | ||
568 | if (sb->level == LEVEL_MULTIPATH) | 570 | if (sb->level == LEVEL_MULTIPATH) |
569 | rdev->desc_nr = -1; | 571 | rdev->desc_nr = -1; |
@@ -623,6 +625,7 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) | |||
623 | mddev->size = sb->size; | 625 | mddev->size = sb->size; |
624 | mddev->events = md_event(sb); | 626 | mddev->events = md_event(sb); |
625 | mddev->bitmap_offset = 0; | 627 | mddev->bitmap_offset = 0; |
628 | mddev->default_bitmap_offset = MD_SB_BYTES >> 9; | ||
626 | 629 | ||
627 | if (sb->state & (1<<MD_SB_CLEAN)) | 630 | if (sb->state & (1<<MD_SB_CLEAN)) |
628 | mddev->recovery_cp = MaxSector; | 631 | mddev->recovery_cp = MaxSector; |
@@ -643,12 +646,12 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) | |||
643 | 646 | ||
644 | if (sb->state & (1<<MD_SB_BITMAP_PRESENT) && | 647 | if (sb->state & (1<<MD_SB_BITMAP_PRESENT) && |
645 | mddev->bitmap_file == NULL) { | 648 | mddev->bitmap_file == NULL) { |
646 | if (mddev->level != 1) { | 649 | if (mddev->level != 1 && mddev->level != 5 && mddev->level != 6) { |
647 | /* FIXME use a better test */ | 650 | /* FIXME use a better test */ |
648 | printk(KERN_WARNING "md: bitmaps only support for raid1\n"); | 651 | printk(KERN_WARNING "md: bitmaps only support for raid1\n"); |
649 | return -EINVAL; | 652 | return -EINVAL; |
650 | } | 653 | } |
651 | mddev->bitmap_offset = (MD_SB_BYTES >> 9); | 654 | mddev->bitmap_offset = mddev->default_bitmap_offset; |
652 | } | 655 | } |
653 | 656 | ||
654 | } else if (mddev->pers == NULL) { | 657 | } else if (mddev->pers == NULL) { |
@@ -669,6 +672,7 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) | |||
669 | 672 | ||
670 | if (mddev->level != LEVEL_MULTIPATH) { | 673 | if (mddev->level != LEVEL_MULTIPATH) { |
671 | rdev->faulty = 0; | 674 | rdev->faulty = 0; |
675 | rdev->flags = 0; | ||
672 | desc = sb->disks + rdev->desc_nr; | 676 | desc = sb->disks + rdev->desc_nr; |
673 | 677 | ||
674 | if (desc->state & (1<<MD_DISK_FAULTY)) | 678 | if (desc->state & (1<<MD_DISK_FAULTY)) |
@@ -678,6 +682,8 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) | |||
678 | rdev->in_sync = 1; | 682 | rdev->in_sync = 1; |
679 | rdev->raid_disk = desc->raid_disk; | 683 | rdev->raid_disk = desc->raid_disk; |
680 | } | 684 | } |
685 | if (desc->state & (1<<MD_DISK_WRITEMOSTLY)) | ||
686 | set_bit(WriteMostly, &rdev->flags); | ||
681 | } else /* MULTIPATH are always insync */ | 687 | } else /* MULTIPATH are always insync */ |
682 | rdev->in_sync = 1; | 688 | rdev->in_sync = 1; |
683 | return 0; | 689 | return 0; |
@@ -706,6 +712,8 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) | |||
706 | int i; | 712 | int i; |
707 | int active=0, working=0,failed=0,spare=0,nr_disks=0; | 713 | int active=0, working=0,failed=0,spare=0,nr_disks=0; |
708 | 714 | ||
715 | rdev->sb_size = MD_SB_BYTES; | ||
716 | |||
709 | sb = (mdp_super_t*)page_address(rdev->sb_page); | 717 | sb = (mdp_super_t*)page_address(rdev->sb_page); |
710 | 718 | ||
711 | memset(sb, 0, sizeof(*sb)); | 719 | memset(sb, 0, sizeof(*sb)); |
@@ -776,6 +784,8 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) | |||
776 | spare++; | 784 | spare++; |
777 | working++; | 785 | working++; |
778 | } | 786 | } |
787 | if (test_bit(WriteMostly, &rdev2->flags)) | ||
788 | d->state |= (1<<MD_DISK_WRITEMOSTLY); | ||
779 | } | 789 | } |
780 | 790 | ||
781 | /* now set the "removed" and "faulty" bits on any missing devices */ | 791 | /* now set the "removed" and "faulty" bits on any missing devices */ |
@@ -831,6 +841,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) | |||
831 | int ret; | 841 | int ret; |
832 | sector_t sb_offset; | 842 | sector_t sb_offset; |
833 | char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; | 843 | char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; |
844 | int bmask; | ||
834 | 845 | ||
835 | /* | 846 | /* |
836 | * Calculate the position of the superblock. | 847 | * Calculate the position of the superblock. |
@@ -859,7 +870,10 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) | |||
859 | } | 870 | } |
860 | rdev->sb_offset = sb_offset; | 871 | rdev->sb_offset = sb_offset; |
861 | 872 | ||
862 | ret = read_disk_sb(rdev); | 873 | /* superblock is rarely larger than 1K, but it can be larger, |
874 | * and it is safe to read 4k, so we do that | ||
875 | */ | ||
876 | ret = read_disk_sb(rdev, 4096); | ||
863 | if (ret) return ret; | 877 | if (ret) return ret; |
864 | 878 | ||
865 | 879 | ||
@@ -869,7 +883,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) | |||
869 | sb->major_version != cpu_to_le32(1) || | 883 | sb->major_version != cpu_to_le32(1) || |
870 | le32_to_cpu(sb->max_dev) > (4096-256)/2 || | 884 | le32_to_cpu(sb->max_dev) > (4096-256)/2 || |
871 | le64_to_cpu(sb->super_offset) != (rdev->sb_offset<<1) || | 885 | le64_to_cpu(sb->super_offset) != (rdev->sb_offset<<1) || |
872 | sb->feature_map != 0) | 886 | (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0) |
873 | return -EINVAL; | 887 | return -EINVAL; |
874 | 888 | ||
875 | if (calc_sb_1_csum(sb) != sb->sb_csum) { | 889 | if (calc_sb_1_csum(sb) != sb->sb_csum) { |
@@ -885,6 +899,11 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) | |||
885 | rdev->preferred_minor = 0xffff; | 899 | rdev->preferred_minor = 0xffff; |
886 | rdev->data_offset = le64_to_cpu(sb->data_offset); | 900 | rdev->data_offset = le64_to_cpu(sb->data_offset); |
887 | 901 | ||
902 | rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256; | ||
903 | bmask = queue_hardsect_size(rdev->bdev->bd_disk->queue)-1; | ||
904 | if (rdev->sb_size & bmask) | ||
905 | rdev-> sb_size = (rdev->sb_size | bmask)+1; | ||
906 | |||
888 | if (refdev == 0) | 907 | if (refdev == 0) |
889 | return 1; | 908 | return 1; |
890 | else { | 909 | else { |
@@ -939,13 +958,15 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) | |||
939 | mddev->size = le64_to_cpu(sb->size)/2; | 958 | mddev->size = le64_to_cpu(sb->size)/2; |
940 | mddev->events = le64_to_cpu(sb->events); | 959 | mddev->events = le64_to_cpu(sb->events); |
941 | mddev->bitmap_offset = 0; | 960 | mddev->bitmap_offset = 0; |
961 | mddev->default_bitmap_offset = 0; | ||
962 | mddev->default_bitmap_offset = 1024; | ||
942 | 963 | ||
943 | mddev->recovery_cp = le64_to_cpu(sb->resync_offset); | 964 | mddev->recovery_cp = le64_to_cpu(sb->resync_offset); |
944 | memcpy(mddev->uuid, sb->set_uuid, 16); | 965 | memcpy(mddev->uuid, sb->set_uuid, 16); |
945 | 966 | ||
946 | mddev->max_disks = (4096-256)/2; | 967 | mddev->max_disks = (4096-256)/2; |
947 | 968 | ||
948 | if ((le32_to_cpu(sb->feature_map) & 1) && | 969 | if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) && |
949 | mddev->bitmap_file == NULL ) { | 970 | mddev->bitmap_file == NULL ) { |
950 | if (mddev->level != 1) { | 971 | if (mddev->level != 1) { |
951 | printk(KERN_WARNING "md: bitmaps only supported for raid1\n"); | 972 | printk(KERN_WARNING "md: bitmaps only supported for raid1\n"); |
@@ -986,6 +1007,9 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) | |||
986 | rdev->raid_disk = role; | 1007 | rdev->raid_disk = role; |
987 | break; | 1008 | break; |
988 | } | 1009 | } |
1010 | rdev->flags = 0; | ||
1011 | if (sb->devflags & WriteMostly1) | ||
1012 | set_bit(WriteMostly, &rdev->flags); | ||
989 | } else /* MULTIPATH are always insync */ | 1013 | } else /* MULTIPATH are always insync */ |
990 | rdev->in_sync = 1; | 1014 | rdev->in_sync = 1; |
991 | 1015 | ||
@@ -1017,7 +1041,7 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) | |||
1017 | 1041 | ||
1018 | if (mddev->bitmap && mddev->bitmap_file == NULL) { | 1042 | if (mddev->bitmap && mddev->bitmap_file == NULL) { |
1019 | sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset); | 1043 | sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset); |
1020 | sb->feature_map = cpu_to_le32(1); | 1044 | sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET); |
1021 | } | 1045 | } |
1022 | 1046 | ||
1023 | max_dev = 0; | 1047 | max_dev = 0; |
@@ -1363,7 +1387,7 @@ repeat: | |||
1363 | dprintk("%s ", bdevname(rdev->bdev,b)); | 1387 | dprintk("%s ", bdevname(rdev->bdev,b)); |
1364 | if (!rdev->faulty) { | 1388 | if (!rdev->faulty) { |
1365 | md_super_write(mddev,rdev, | 1389 | md_super_write(mddev,rdev, |
1366 | rdev->sb_offset<<1, MD_SB_BYTES, | 1390 | rdev->sb_offset<<1, rdev->sb_size, |
1367 | rdev->sb_page); | 1391 | rdev->sb_page); |
1368 | dprintk(KERN_INFO "(write) %s's sb offset: %llu\n", | 1392 | dprintk(KERN_INFO "(write) %s's sb offset: %llu\n", |
1369 | bdevname(rdev->bdev,b), | 1393 | bdevname(rdev->bdev,b), |
@@ -2073,6 +2097,8 @@ static int get_array_info(mddev_t * mddev, void __user * arg) | |||
2073 | info.state = 0; | 2097 | info.state = 0; |
2074 | if (mddev->in_sync) | 2098 | if (mddev->in_sync) |
2075 | info.state = (1<<MD_SB_CLEAN); | 2099 | info.state = (1<<MD_SB_CLEAN); |
2100 | if (mddev->bitmap && mddev->bitmap_offset) | ||
2101 | info.state = (1<<MD_SB_BITMAP_PRESENT); | ||
2076 | info.active_disks = active; | 2102 | info.active_disks = active; |
2077 | info.working_disks = working; | 2103 | info.working_disks = working; |
2078 | info.failed_disks = failed; | 2104 | info.failed_disks = failed; |
@@ -2087,7 +2113,7 @@ static int get_array_info(mddev_t * mddev, void __user * arg) | |||
2087 | return 0; | 2113 | return 0; |
2088 | } | 2114 | } |
2089 | 2115 | ||
2090 | static int get_bitmap_file(mddev_t * mddev, void * arg) | 2116 | static int get_bitmap_file(mddev_t * mddev, void __user * arg) |
2091 | { | 2117 | { |
2092 | mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */ | 2118 | mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */ |
2093 | char *ptr, *buf = NULL; | 2119 | char *ptr, *buf = NULL; |
@@ -2146,6 +2172,8 @@ static int get_disk_info(mddev_t * mddev, void __user * arg) | |||
2146 | info.state |= (1<<MD_DISK_ACTIVE); | 2172 | info.state |= (1<<MD_DISK_ACTIVE); |
2147 | info.state |= (1<<MD_DISK_SYNC); | 2173 | info.state |= (1<<MD_DISK_SYNC); |
2148 | } | 2174 | } |
2175 | if (test_bit(WriteMostly, &rdev->flags)) | ||
2176 | info.state |= (1<<MD_DISK_WRITEMOSTLY); | ||
2149 | } else { | 2177 | } else { |
2150 | info.major = info.minor = 0; | 2178 | info.major = info.minor = 0; |
2151 | info.raid_disk = -1; | 2179 | info.raid_disk = -1; |
@@ -2210,8 +2238,11 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) | |||
2210 | mdname(mddev)); | 2238 | mdname(mddev)); |
2211 | return -EINVAL; | 2239 | return -EINVAL; |
2212 | } | 2240 | } |
2213 | rdev = md_import_device(dev, mddev->major_version, | 2241 | if (mddev->persistent) |
2214 | mddev->minor_version); | 2242 | rdev = md_import_device(dev, mddev->major_version, |
2243 | mddev->minor_version); | ||
2244 | else | ||
2245 | rdev = md_import_device(dev, -1, -1); | ||
2215 | if (IS_ERR(rdev)) { | 2246 | if (IS_ERR(rdev)) { |
2216 | printk(KERN_WARNING | 2247 | printk(KERN_WARNING |
2217 | "md: md_import_device returned %ld\n", | 2248 | "md: md_import_device returned %ld\n", |
@@ -2231,6 +2262,9 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) | |||
2231 | rdev->saved_raid_disk = rdev->raid_disk; | 2262 | rdev->saved_raid_disk = rdev->raid_disk; |
2232 | 2263 | ||
2233 | rdev->in_sync = 0; /* just to be sure */ | 2264 | rdev->in_sync = 0; /* just to be sure */ |
2265 | if (info->state & (1<<MD_DISK_WRITEMOSTLY)) | ||
2266 | set_bit(WriteMostly, &rdev->flags); | ||
2267 | |||
2234 | rdev->raid_disk = -1; | 2268 | rdev->raid_disk = -1; |
2235 | err = bind_rdev_to_array(rdev, mddev); | 2269 | err = bind_rdev_to_array(rdev, mddev); |
2236 | if (err) | 2270 | if (err) |
@@ -2271,6 +2305,9 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) | |||
2271 | else | 2305 | else |
2272 | rdev->in_sync = 0; | 2306 | rdev->in_sync = 0; |
2273 | 2307 | ||
2308 | if (info->state & (1<<MD_DISK_WRITEMOSTLY)) | ||
2309 | set_bit(WriteMostly, &rdev->flags); | ||
2310 | |||
2274 | err = bind_rdev_to_array(rdev, mddev); | 2311 | err = bind_rdev_to_array(rdev, mddev); |
2275 | if (err) { | 2312 | if (err) { |
2276 | export_rdev(rdev); | 2313 | export_rdev(rdev); |
@@ -2430,25 +2467,51 @@ static int set_bitmap_file(mddev_t *mddev, int fd) | |||
2430 | { | 2467 | { |
2431 | int err; | 2468 | int err; |
2432 | 2469 | ||
2433 | if (mddev->pers) | 2470 | if (mddev->pers) { |
2434 | return -EBUSY; | 2471 | if (!mddev->pers->quiesce) |
2472 | return -EBUSY; | ||
2473 | if (mddev->recovery || mddev->sync_thread) | ||
2474 | return -EBUSY; | ||
2475 | /* we should be able to change the bitmap.. */ | ||
2476 | } | ||
2435 | 2477 | ||
2436 | mddev->bitmap_file = fget(fd); | ||
2437 | 2478 | ||
2438 | if (mddev->bitmap_file == NULL) { | 2479 | if (fd >= 0) { |
2439 | printk(KERN_ERR "%s: error: failed to get bitmap file\n", | 2480 | if (mddev->bitmap) |
2440 | mdname(mddev)); | 2481 | return -EEXIST; /* cannot add when bitmap is present */ |
2441 | return -EBADF; | 2482 | mddev->bitmap_file = fget(fd); |
2442 | } | ||
2443 | 2483 | ||
2444 | err = deny_bitmap_write_access(mddev->bitmap_file); | 2484 | if (mddev->bitmap_file == NULL) { |
2445 | if (err) { | 2485 | printk(KERN_ERR "%s: error: failed to get bitmap file\n", |
2446 | printk(KERN_ERR "%s: error: bitmap file is already in use\n", | 2486 | mdname(mddev)); |
2447 | mdname(mddev)); | 2487 | return -EBADF; |
2448 | fput(mddev->bitmap_file); | 2488 | } |
2449 | mddev->bitmap_file = NULL; | 2489 | |
2450 | } else | 2490 | err = deny_bitmap_write_access(mddev->bitmap_file); |
2491 | if (err) { | ||
2492 | printk(KERN_ERR "%s: error: bitmap file is already in use\n", | ||
2493 | mdname(mddev)); | ||
2494 | fput(mddev->bitmap_file); | ||
2495 | mddev->bitmap_file = NULL; | ||
2496 | return err; | ||
2497 | } | ||
2451 | mddev->bitmap_offset = 0; /* file overrides offset */ | 2498 | mddev->bitmap_offset = 0; /* file overrides offset */ |
2499 | } else if (mddev->bitmap == NULL) | ||
2500 | return -ENOENT; /* cannot remove what isn't there */ | ||
2501 | err = 0; | ||
2502 | if (mddev->pers) { | ||
2503 | mddev->pers->quiesce(mddev, 1); | ||
2504 | if (fd >= 0) | ||
2505 | err = bitmap_create(mddev); | ||
2506 | if (fd < 0 || err) | ||
2507 | bitmap_destroy(mddev); | ||
2508 | mddev->pers->quiesce(mddev, 0); | ||
2509 | } else if (fd < 0) { | ||
2510 | if (mddev->bitmap_file) | ||
2511 | fput(mddev->bitmap_file); | ||
2512 | mddev->bitmap_file = NULL; | ||
2513 | } | ||
2514 | |||
2452 | return err; | 2515 | return err; |
2453 | } | 2516 | } |
2454 | 2517 | ||
@@ -2528,6 +2591,11 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info) | |||
2528 | { | 2591 | { |
2529 | int rv = 0; | 2592 | int rv = 0; |
2530 | int cnt = 0; | 2593 | int cnt = 0; |
2594 | int state = 0; | ||
2595 | |||
2596 | /* calculate expected state,ignoring low bits */ | ||
2597 | if (mddev->bitmap && mddev->bitmap_offset) | ||
2598 | state |= (1 << MD_SB_BITMAP_PRESENT); | ||
2531 | 2599 | ||
2532 | if (mddev->major_version != info->major_version || | 2600 | if (mddev->major_version != info->major_version || |
2533 | mddev->minor_version != info->minor_version || | 2601 | mddev->minor_version != info->minor_version || |
@@ -2536,12 +2604,16 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info) | |||
2536 | mddev->level != info->level || | 2604 | mddev->level != info->level || |
2537 | /* mddev->layout != info->layout || */ | 2605 | /* mddev->layout != info->layout || */ |
2538 | !mddev->persistent != info->not_persistent|| | 2606 | !mddev->persistent != info->not_persistent|| |
2539 | mddev->chunk_size != info->chunk_size ) | 2607 | mddev->chunk_size != info->chunk_size || |
2608 | /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */ | ||
2609 | ((state^info->state) & 0xfffffe00) | ||
2610 | ) | ||
2540 | return -EINVAL; | 2611 | return -EINVAL; |
2541 | /* Check there is only one change */ | 2612 | /* Check there is only one change */ |
2542 | if (mddev->size != info->size) cnt++; | 2613 | if (mddev->size != info->size) cnt++; |
2543 | if (mddev->raid_disks != info->raid_disks) cnt++; | 2614 | if (mddev->raid_disks != info->raid_disks) cnt++; |
2544 | if (mddev->layout != info->layout) cnt++; | 2615 | if (mddev->layout != info->layout) cnt++; |
2616 | if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) cnt++; | ||
2545 | if (cnt == 0) return 0; | 2617 | if (cnt == 0) return 0; |
2546 | if (cnt > 1) return -EINVAL; | 2618 | if (cnt > 1) return -EINVAL; |
2547 | 2619 | ||
@@ -2620,6 +2692,35 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info) | |||
2620 | } | 2692 | } |
2621 | } | 2693 | } |
2622 | } | 2694 | } |
2695 | if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) { | ||
2696 | if (mddev->pers->quiesce == NULL) | ||
2697 | return -EINVAL; | ||
2698 | if (mddev->recovery || mddev->sync_thread) | ||
2699 | return -EBUSY; | ||
2700 | if (info->state & (1<<MD_SB_BITMAP_PRESENT)) { | ||
2701 | /* add the bitmap */ | ||
2702 | if (mddev->bitmap) | ||
2703 | return -EEXIST; | ||
2704 | if (mddev->default_bitmap_offset == 0) | ||
2705 | return -EINVAL; | ||
2706 | mddev->bitmap_offset = mddev->default_bitmap_offset; | ||
2707 | mddev->pers->quiesce(mddev, 1); | ||
2708 | rv = bitmap_create(mddev); | ||
2709 | if (rv) | ||
2710 | bitmap_destroy(mddev); | ||
2711 | mddev->pers->quiesce(mddev, 0); | ||
2712 | } else { | ||
2713 | /* remove the bitmap */ | ||
2714 | if (!mddev->bitmap) | ||
2715 | return -ENOENT; | ||
2716 | if (mddev->bitmap->file) | ||
2717 | return -EINVAL; | ||
2718 | mddev->pers->quiesce(mddev, 1); | ||
2719 | bitmap_destroy(mddev); | ||
2720 | mddev->pers->quiesce(mddev, 0); | ||
2721 | mddev->bitmap_offset = 0; | ||
2722 | } | ||
2723 | } | ||
2623 | md_update_sb(mddev); | 2724 | md_update_sb(mddev); |
2624 | return rv; | 2725 | return rv; |
2625 | } | 2726 | } |
@@ -2781,7 +2882,7 @@ static int md_ioctl(struct inode *inode, struct file *file, | |||
2781 | goto done_unlock; | 2882 | goto done_unlock; |
2782 | 2883 | ||
2783 | case GET_BITMAP_FILE: | 2884 | case GET_BITMAP_FILE: |
2784 | err = get_bitmap_file(mddev, (void *)arg); | 2885 | err = get_bitmap_file(mddev, argp); |
2785 | goto done_unlock; | 2886 | goto done_unlock; |
2786 | 2887 | ||
2787 | case GET_DISK_INFO: | 2888 | case GET_DISK_INFO: |
@@ -2950,18 +3051,6 @@ static int md_thread(void * arg) | |||
2950 | { | 3051 | { |
2951 | mdk_thread_t *thread = arg; | 3052 | mdk_thread_t *thread = arg; |
2952 | 3053 | ||
2953 | lock_kernel(); | ||
2954 | |||
2955 | /* | ||
2956 | * Detach thread | ||
2957 | */ | ||
2958 | |||
2959 | daemonize(thread->name, mdname(thread->mddev)); | ||
2960 | |||
2961 | current->exit_signal = SIGCHLD; | ||
2962 | allow_signal(SIGKILL); | ||
2963 | thread->tsk = current; | ||
2964 | |||
2965 | /* | 3054 | /* |
2966 | * md_thread is a 'system-thread', it's priority should be very | 3055 | * md_thread is a 'system-thread', it's priority should be very |
2967 | * high. We avoid resource deadlocks individually in each | 3056 | * high. We avoid resource deadlocks individually in each |
@@ -2973,14 +3062,14 @@ static int md_thread(void * arg) | |||
2973 | * bdflush, otherwise bdflush will deadlock if there are too | 3062 | * bdflush, otherwise bdflush will deadlock if there are too |
2974 | * many dirty RAID5 blocks. | 3063 | * many dirty RAID5 blocks. |
2975 | */ | 3064 | */ |
2976 | unlock_kernel(); | ||
2977 | 3065 | ||
2978 | complete(thread->event); | 3066 | complete(thread->event); |
2979 | while (thread->run) { | 3067 | while (!kthread_should_stop()) { |
2980 | void (*run)(mddev_t *); | 3068 | void (*run)(mddev_t *); |
2981 | 3069 | ||
2982 | wait_event_interruptible_timeout(thread->wqueue, | 3070 | wait_event_interruptible_timeout(thread->wqueue, |
2983 | test_bit(THREAD_WAKEUP, &thread->flags), | 3071 | test_bit(THREAD_WAKEUP, &thread->flags) |
3072 | || kthread_should_stop(), | ||
2984 | thread->timeout); | 3073 | thread->timeout); |
2985 | try_to_freeze(); | 3074 | try_to_freeze(); |
2986 | 3075 | ||
@@ -2989,11 +3078,8 @@ static int md_thread(void * arg) | |||
2989 | run = thread->run; | 3078 | run = thread->run; |
2990 | if (run) | 3079 | if (run) |
2991 | run(thread->mddev); | 3080 | run(thread->mddev); |
2992 | |||
2993 | if (signal_pending(current)) | ||
2994 | flush_signals(current); | ||
2995 | } | 3081 | } |
2996 | complete(thread->event); | 3082 | |
2997 | return 0; | 3083 | return 0; |
2998 | } | 3084 | } |
2999 | 3085 | ||
@@ -3010,11 +3096,9 @@ mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev, | |||
3010 | const char *name) | 3096 | const char *name) |
3011 | { | 3097 | { |
3012 | mdk_thread_t *thread; | 3098 | mdk_thread_t *thread; |
3013 | int ret; | ||
3014 | struct completion event; | 3099 | struct completion event; |
3015 | 3100 | ||
3016 | thread = (mdk_thread_t *) kmalloc | 3101 | thread = kmalloc(sizeof(mdk_thread_t), GFP_KERNEL); |
3017 | (sizeof(mdk_thread_t), GFP_KERNEL); | ||
3018 | if (!thread) | 3102 | if (!thread) |
3019 | return NULL; | 3103 | return NULL; |
3020 | 3104 | ||
@@ -3027,8 +3111,8 @@ mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev, | |||
3027 | thread->mddev = mddev; | 3111 | thread->mddev = mddev; |
3028 | thread->name = name; | 3112 | thread->name = name; |
3029 | thread->timeout = MAX_SCHEDULE_TIMEOUT; | 3113 | thread->timeout = MAX_SCHEDULE_TIMEOUT; |
3030 | ret = kernel_thread(md_thread, thread, 0); | 3114 | thread->tsk = kthread_run(md_thread, thread, mdname(thread->mddev)); |
3031 | if (ret < 0) { | 3115 | if (IS_ERR(thread->tsk)) { |
3032 | kfree(thread); | 3116 | kfree(thread); |
3033 | return NULL; | 3117 | return NULL; |
3034 | } | 3118 | } |
@@ -3038,21 +3122,9 @@ mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev, | |||
3038 | 3122 | ||
3039 | void md_unregister_thread(mdk_thread_t *thread) | 3123 | void md_unregister_thread(mdk_thread_t *thread) |
3040 | { | 3124 | { |
3041 | struct completion event; | ||
3042 | |||
3043 | init_completion(&event); | ||
3044 | |||
3045 | thread->event = &event; | ||
3046 | |||
3047 | /* As soon as ->run is set to NULL, the task could disappear, | ||
3048 | * so we need to hold tasklist_lock until we have sent the signal | ||
3049 | */ | ||
3050 | dprintk("interrupting MD-thread pid %d\n", thread->tsk->pid); | 3125 | dprintk("interrupting MD-thread pid %d\n", thread->tsk->pid); |
3051 | read_lock(&tasklist_lock); | 3126 | |
3052 | thread->run = NULL; | 3127 | kthread_stop(thread->tsk); |
3053 | send_sig(SIGKILL, thread->tsk, 1); | ||
3054 | read_unlock(&tasklist_lock); | ||
3055 | wait_for_completion(&event); | ||
3056 | kfree(thread); | 3128 | kfree(thread); |
3057 | } | 3129 | } |
3058 | 3130 | ||
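These hunks convert md from the old kernel_thread()/daemonize()/SIGKILL scheme to the kthread API: kthread_run() creates and starts the worker in one step, kthread_should_stop() replaces the hand-rolled thread->run test (and must also appear in the wait condition so a stop request wakes the thread), and kthread_stop() subsumes the completion-and-signal dance md_unregister_thread() used to perform. A stripped-down sketch of the same pattern, with hypothetical names (my_ctx, WAKEUP, do_work are stand-ins, not md symbols):

    #include <linux/kthread.h>
    #include <linux/wait.h>

    /* Hypothetical worker following the shape of md_thread() above. */
    static int worker(void *arg)
    {
        struct my_ctx *ctx = arg;           /* assumed context structure */

        while (!kthread_should_stop()) {
            wait_event_interruptible_timeout(ctx->wq,
                    test_bit(WAKEUP, &ctx->flags)
                    || kthread_should_stop(),
                    ctx->timeout);
            if (test_and_clear_bit(WAKEUP, &ctx->flags))
                do_work(ctx);
        }
        return 0;                           /* collected by kthread_stop() */
    }

    /* creation: tsk = kthread_run(worker, ctx, "md_example");
     * teardown: kthread_stop(tsk);  -- no signals, no completions */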
@@ -3259,10 +3331,13 @@ static int md_seq_show(struct seq_file *seq, void *v) | |||
3259 | char b[BDEVNAME_SIZE]; | 3331 | char b[BDEVNAME_SIZE]; |
3260 | seq_printf(seq, " %s[%d]", | 3332 | seq_printf(seq, " %s[%d]", |
3261 | bdevname(rdev->bdev,b), rdev->desc_nr); | 3333 | bdevname(rdev->bdev,b), rdev->desc_nr); |
3334 | if (test_bit(WriteMostly, &rdev->flags)) | ||
3335 | seq_printf(seq, "(W)"); | ||
3262 | if (rdev->faulty) { | 3336 | if (rdev->faulty) { |
3263 | seq_printf(seq, "(F)"); | 3337 | seq_printf(seq, "(F)"); |
3264 | continue; | 3338 | continue; |
3265 | } | 3339 | } else if (rdev->raid_disk < 0) |
3340 | seq_printf(seq, "(S)"); /* spare */ | ||
3266 | size += rdev->size; | 3341 | size += rdev->size; |
3267 | } | 3342 | } |
3268 | 3343 | ||
@@ -3274,6 +3349,15 @@ static int md_seq_show(struct seq_file *seq, void *v) | |||
3274 | seq_printf(seq, "\n %llu blocks", | 3349 | seq_printf(seq, "\n %llu blocks", |
3275 | (unsigned long long)size); | 3350 | (unsigned long long)size); |
3276 | } | 3351 | } |
3352 | if (mddev->persistent) { | ||
3353 | if (mddev->major_version != 0 || | ||
3354 | mddev->minor_version != 90) { | ||
3355 | seq_printf(seq," super %d.%d", | ||
3356 | mddev->major_version, | ||
3357 | mddev->minor_version); | ||
3358 | } | ||
3359 | } else | ||
3360 | seq_printf(seq, " super non-persistent"); | ||
3277 | 3361 | ||
3278 | if (mddev->pers) { | 3362 | if (mddev->pers) { |
3279 | mddev->pers->status (seq, mddev); | 3363 | mddev->pers->status (seq, mddev); |
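With these md_seq_show() additions, each member in /proc/mdstat can carry a (W) write-mostly marker or an (S) spare marker, and persistent arrays whose superblock is not the default 0.90 report their version. An illustrative line (composed for this note, not captured output) might read:

    md0 : active raid1 sdc1[2](S) sdb1[1](W) sda1[0]
          104320 blocks super 1.2 [2/2] [UU]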
@@ -3416,7 +3500,6 @@ void md_done_sync(mddev_t *mddev, int blocks, int ok) | |||
3416 | */ | 3500 | */ |
3417 | void md_write_start(mddev_t *mddev, struct bio *bi) | 3501 | void md_write_start(mddev_t *mddev, struct bio *bi) |
3418 | { | 3502 | { |
3419 | DEFINE_WAIT(w); | ||
3420 | if (bio_data_dir(bi) != WRITE) | 3503 | if (bio_data_dir(bi) != WRITE) |
3421 | return; | 3504 | return; |
3422 | 3505 | ||
@@ -3533,7 +3616,7 @@ static void md_do_sync(mddev_t *mddev) | |||
3533 | printk(KERN_INFO "md: syncing RAID array %s\n", mdname(mddev)); | 3616 | printk(KERN_INFO "md: syncing RAID array %s\n", mdname(mddev)); |
3534 | printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed:" | 3617 | printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed:" |
3535 | " %d KB/sec/disc.\n", sysctl_speed_limit_min); | 3618 | " %d KB/sec/disc.\n", sysctl_speed_limit_min); |
3536 | printk(KERN_INFO "md: using maximum available idle IO bandwith " | 3619 | printk(KERN_INFO "md: using maximum available idle IO bandwidth " |
3537 | "(but not more than %d KB/sec) for reconstruction.\n", | 3620 | "(but not more than %d KB/sec) for reconstruction.\n", |
3538 | sysctl_speed_limit_max); | 3621 | sysctl_speed_limit_max); |
3539 | 3622 | ||
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index 2d2ca7fa0265..286342375fb7 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c | |||
@@ -169,6 +169,11 @@ static int multipath_make_request (request_queue_t *q, struct bio * bio) | |||
169 | struct multipath_bh * mp_bh; | 169 | struct multipath_bh * mp_bh; |
170 | struct multipath_info *multipath; | 170 | struct multipath_info *multipath; |
171 | 171 | ||
172 | if (unlikely(bio_barrier(bio))) { | ||
173 | bio_endio(bio, bio->bi_size, -EOPNOTSUPP); | ||
174 | return 0; | ||
175 | } | ||
176 | |||
172 | mp_bh = mempool_alloc(conf->pool, GFP_NOIO); | 177 | mp_bh = mempool_alloc(conf->pool, GFP_NOIO); |
173 | 178 | ||
174 | mp_bh->master_bio = bio; | 179 | mp_bh->master_bio = bio; |
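multipath, raid0, raid1, raid10 and raid5 all gain the same guard at the top of their make_request functions: BIO_RW_BARRIER ordering cannot be honoured across multiple member devices, so barrier bios are failed immediately with -EOPNOTSUPP and the caller falls back to ordinary writes plus flushes. The shared shape, consolidated as a sketch:

    /* Common prologue added to every make_request in this patch:
     * refuse barriers up front rather than let them reach the
     * members half-ordered. */
    if (unlikely(bio_barrier(bio))) {
        bio_endio(bio, bio->bi_size, -EOPNOTSUPP);
        return 0;   /* request fully handled (by failing it) */
    }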
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 2120710172c5..f6757259ce7f 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c | |||
@@ -404,6 +404,11 @@ static int raid0_make_request (request_queue_t *q, struct bio *bio) | |||
404 | unsigned long chunk; | 404 | unsigned long chunk; |
405 | sector_t block, rsect; | 405 | sector_t block, rsect; |
406 | 406 | ||
407 | if (unlikely(bio_barrier(bio))) { | ||
408 | bio_endio(bio, bio->bi_size, -EOPNOTSUPP); | ||
409 | return 0; | ||
410 | } | ||
411 | |||
407 | if (bio_data_dir(bio)==WRITE) { | 412 | if (bio_data_dir(bio)==WRITE) { |
408 | disk_stat_inc(mddev->gendisk, writes); | 413 | disk_stat_inc(mddev->gendisk, writes); |
409 | disk_stat_add(mddev->gendisk, write_sectors, bio_sectors(bio)); | 414 | disk_stat_add(mddev->gendisk, write_sectors, bio_sectors(bio)); |
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 51d9645ed09c..a93ca478142a 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c | |||
@@ -222,8 +222,17 @@ static void raid_end_bio_io(r1bio_t *r1_bio) | |||
222 | { | 222 | { |
223 | struct bio *bio = r1_bio->master_bio; | 223 | struct bio *bio = r1_bio->master_bio; |
224 | 224 | ||
225 | bio_endio(bio, bio->bi_size, | 225 | /* if nobody has done the final endio yet, do it now */ |
226 | test_bit(R1BIO_Uptodate, &r1_bio->state) ? 0 : -EIO); | 226 | if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) { |
227 | PRINTK(KERN_DEBUG "raid1: sync end %s on sectors %llu-%llu\n", | ||
228 | (bio_data_dir(bio) == WRITE) ? "write" : "read", | ||
229 | (unsigned long long) bio->bi_sector, | ||
230 | (unsigned long long) bio->bi_sector + | ||
231 | (bio->bi_size >> 9) - 1); | ||
232 | |||
233 | bio_endio(bio, bio->bi_size, | ||
234 | test_bit(R1BIO_Uptodate, &r1_bio->state) ? 0 : -EIO); | ||
235 | } | ||
227 | free_r1bio(r1_bio); | 236 | free_r1bio(r1_bio); |
228 | } | 237 | } |
229 | 238 | ||
@@ -292,7 +301,7 @@ static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int | |||
292 | { | 301 | { |
293 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); | 302 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); |
294 | r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); | 303 | r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); |
295 | int mirror; | 304 | int mirror, behind; |
296 | conf_t *conf = mddev_to_conf(r1_bio->mddev); | 305 | conf_t *conf = mddev_to_conf(r1_bio->mddev); |
297 | 306 | ||
298 | if (bio->bi_size) | 307 | if (bio->bi_size) |
@@ -323,16 +332,46 @@ static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int | |||
323 | 332 | ||
324 | update_head_pos(mirror, r1_bio); | 333 | update_head_pos(mirror, r1_bio); |
325 | 334 | ||
335 | behind = test_bit(R1BIO_BehindIO, &r1_bio->state); | ||
336 | if (behind) { | ||
337 | if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags)) | ||
338 | atomic_dec(&r1_bio->behind_remaining); | ||
339 | |||
340 | /* In behind mode, we ACK the master bio once the I/O has safely | ||
341 | * reached all non-writemostly disks. Setting the Returned bit | ||
342 | * ensures that this gets done only once -- we don't ever want to | ||
343 | * return -EIO here, instead we'll wait */ | ||
344 | |||
345 | if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) && | ||
346 | test_bit(R1BIO_Uptodate, &r1_bio->state)) { | ||
347 | /* Maybe we can return now */ | ||
348 | if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) { | ||
349 | struct bio *mbio = r1_bio->master_bio; | ||
350 | PRINTK(KERN_DEBUG "raid1: behind end write sectors %llu-%llu\n", | ||
351 | (unsigned long long) mbio->bi_sector, | ||
352 | (unsigned long long) mbio->bi_sector + | ||
353 | (mbio->bi_size >> 9) - 1); | ||
354 | bio_endio(mbio, mbio->bi_size, 0); | ||
355 | } | ||
356 | } | ||
357 | } | ||
326 | /* | 358 | /* |
327 | * | 359 | * |
328 | * Let's see if all mirrored write operations have finished | 360 | * Let's see if all mirrored write operations have finished |
329 | * already. | 361 | * already. |
330 | */ | 362 | */ |
331 | if (atomic_dec_and_test(&r1_bio->remaining)) { | 363 | if (atomic_dec_and_test(&r1_bio->remaining)) { |
364 | if (test_bit(R1BIO_BehindIO, &r1_bio->state)) { | ||
365 | /* free extra copy of the data pages */ | ||
366 | int i = bio->bi_vcnt; | ||
367 | while (i--) | ||
368 | __free_page(bio->bi_io_vec[i].bv_page); | ||
369 | } | ||
332 | /* clear the bitmap if all writes complete successfully */ | 370 | /* clear the bitmap if all writes complete successfully */ |
333 | bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector, | 371 | bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector, |
334 | r1_bio->sectors, | 372 | r1_bio->sectors, |
335 | !test_bit(R1BIO_Degraded, &r1_bio->state)); | 373 | !test_bit(R1BIO_Degraded, &r1_bio->state), |
374 | behind); | ||
336 | md_write_end(r1_bio->mddev); | 375 | md_write_end(r1_bio->mddev); |
337 | raid_end_bio_io(r1_bio); | 376 | raid_end_bio_io(r1_bio); |
338 | } | 377 | } |
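The early-acknowledgement rule above is subtle: the master bio may be completed as soon as every write still outstanding is a behind-write to a write-mostly device and at least one copy is known good, and R1BIO_Returned guarantees the endio fires exactly once between this path and raid_end_bio_io(). Restated as a hypothetical helper (behind_can_ack is not an md symbol):

    /* Illustrative restatement of the test above: all writes not yet
     * finished (remaining, minus the one completing now) must be
     * behind-writes, and the data must already be safe somewhere. */
    static int behind_can_ack(r1bio_t *r1_bio)
    {
        return atomic_read(&r1_bio->behind_remaining) >=
                atomic_read(&r1_bio->remaining) - 1 &&
            test_bit(R1BIO_Uptodate, &r1_bio->state);
    }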
@@ -360,13 +399,14 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio) | |||
360 | { | 399 | { |
361 | const unsigned long this_sector = r1_bio->sector; | 400 | const unsigned long this_sector = r1_bio->sector; |
362 | int new_disk = conf->last_used, disk = new_disk; | 401 | int new_disk = conf->last_used, disk = new_disk; |
402 | int wonly_disk = -1; | ||
363 | const int sectors = r1_bio->sectors; | 403 | const int sectors = r1_bio->sectors; |
364 | sector_t new_distance, current_distance; | 404 | sector_t new_distance, current_distance; |
365 | mdk_rdev_t *new_rdev, *rdev; | 405 | mdk_rdev_t *rdev; |
366 | 406 | ||
367 | rcu_read_lock(); | 407 | rcu_read_lock(); |
368 | /* | 408 | /* |
369 | * Check if it if we can balance. We can balance on the whole | 409 | * Check if we can balance. We can balance on the whole |
370 | * device if no resync is going on, or below the resync window. | 410 | * device if no resync is going on, or below the resync window. |
371 | * We take the first readable disk when above the resync window. | 411 | * We take the first readable disk when above the resync window. |
372 | */ | 412 | */ |
@@ -376,11 +416,16 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio) | |||
376 | /* Choose the first operational device, for consistency */ | 416 | /* Choose the first operational device, for consistency */ |
377 | new_disk = 0; | 417 | new_disk = 0; |
378 | 418 | ||
379 | while ((new_rdev=conf->mirrors[new_disk].rdev) == NULL || | 419 | for (rdev = conf->mirrors[new_disk].rdev; |
380 | !new_rdev->in_sync) { | 420 | !rdev || !rdev->in_sync |
381 | new_disk++; | 421 | || test_bit(WriteMostly, &rdev->flags); |
382 | if (new_disk == conf->raid_disks) { | 422 | rdev = conf->mirrors[++new_disk].rdev) { |
383 | new_disk = -1; | 423 | |
424 | if (rdev && rdev->in_sync) | ||
425 | wonly_disk = new_disk; | ||
426 | |||
427 | if (new_disk == conf->raid_disks - 1) { | ||
428 | new_disk = wonly_disk; | ||
384 | break; | 429 | break; |
385 | } | 430 | } |
386 | } | 431 | } |
@@ -389,16 +434,26 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio) | |||
389 | 434 | ||
390 | 435 | ||
391 | /* make sure the disk is operational */ | 436 | /* make sure the disk is operational */ |
392 | while ((new_rdev=conf->mirrors[new_disk].rdev) == NULL || | 437 | for (rdev = conf->mirrors[new_disk].rdev; |
393 | !new_rdev->in_sync) { | 438 | !rdev || !rdev->in_sync || |
439 | test_bit(WriteMostly, &rdev->flags); | ||
440 | rdev = conf->mirrors[new_disk].rdev) { | ||
441 | |||
442 | if (rdev && rdev->in_sync) | ||
443 | wonly_disk = new_disk; | ||
444 | |||
394 | if (new_disk <= 0) | 445 | if (new_disk <= 0) |
395 | new_disk = conf->raid_disks; | 446 | new_disk = conf->raid_disks; |
396 | new_disk--; | 447 | new_disk--; |
397 | if (new_disk == disk) { | 448 | if (new_disk == disk) { |
398 | new_disk = -1; | 449 | new_disk = wonly_disk; |
399 | goto rb_out; | 450 | break; |
400 | } | 451 | } |
401 | } | 452 | } |
453 | |||
454 | if (new_disk < 0) | ||
455 | goto rb_out; | ||
456 | |||
402 | disk = new_disk; | 457 | disk = new_disk; |
403 | /* now disk == new_disk == starting point for search */ | 458 | /* now disk == new_disk == starting point for search */ |
404 | 459 | ||
@@ -419,37 +474,41 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio) | |||
419 | disk = conf->raid_disks; | 474 | disk = conf->raid_disks; |
420 | disk--; | 475 | disk--; |
421 | 476 | ||
422 | if ((rdev=conf->mirrors[disk].rdev) == NULL || | 477 | rdev = conf->mirrors[disk].rdev; |
423 | !rdev->in_sync) | 478 | |
479 | if (!rdev || | ||
480 | !rdev->in_sync || | ||
481 | test_bit(WriteMostly, &rdev->flags)) | ||
424 | continue; | 482 | continue; |
425 | 483 | ||
426 | if (!atomic_read(&rdev->nr_pending)) { | 484 | if (!atomic_read(&rdev->nr_pending)) { |
427 | new_disk = disk; | 485 | new_disk = disk; |
428 | new_rdev = rdev; | ||
429 | break; | 486 | break; |
430 | } | 487 | } |
431 | new_distance = abs(this_sector - conf->mirrors[disk].head_position); | 488 | new_distance = abs(this_sector - conf->mirrors[disk].head_position); |
432 | if (new_distance < current_distance) { | 489 | if (new_distance < current_distance) { |
433 | current_distance = new_distance; | 490 | current_distance = new_distance; |
434 | new_disk = disk; | 491 | new_disk = disk; |
435 | new_rdev = rdev; | ||
436 | } | 492 | } |
437 | } while (disk != conf->last_used); | 493 | } while (disk != conf->last_used); |
438 | 494 | ||
439 | rb_out: | 495 | rb_out: |
440 | 496 | ||
441 | 497 | ||
442 | if (new_disk >= 0) { | 498 | if (new_disk >= 0) { |
443 | conf->next_seq_sect = this_sector + sectors; | 499 | rdev = conf->mirrors[new_disk].rdev; |
444 | conf->last_used = new_disk; | 500 | if (!rdev) |
445 | atomic_inc(&new_rdev->nr_pending); | 501 | goto retry; |
446 | if (!new_rdev->in_sync) { | 502 | atomic_inc(&rdev->nr_pending); |
503 | if (!rdev->in_sync) { | ||
447 | /* cannot risk returning a device that failed | 504 | /* cannot risk returning a device that failed |
448 | * before we inc'ed nr_pending | 505 | * before we inc'ed nr_pending |
449 | */ | 506 | */ |
450 | atomic_dec(&new_rdev->nr_pending); | 507 | atomic_dec(&rdev->nr_pending); |
451 | goto retry; | 508 | goto retry; |
452 | } | 509 | } |
510 | conf->next_seq_sect = this_sector + sectors; | ||
511 | conf->last_used = new_disk; | ||
453 | } | 512 | } |
454 | rcu_read_unlock(); | 513 | rcu_read_unlock(); |
455 | 514 | ||
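All three search loops in read_balance() now apply the same preference: a device is a read candidate only if it exists, is in_sync, and is not marked write-mostly; in-sync write-mostly devices are remembered in wonly_disk and used only when nothing better survives the scan. As a hypothetical predicate (preferred_read_target is not an md symbol):

    /* Illustrative only: the eligibility rule threaded through the
     * loops above.  Write-mostly members are a fallback, never a
     * first choice for reads. */
    static int preferred_read_target(mdk_rdev_t *rdev)
    {
        return rdev && rdev->in_sync &&
            !test_bit(WriteMostly, &rdev->flags);
    }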
@@ -542,6 +601,39 @@ static void device_barrier(conf_t *conf, sector_t sect) | |||
542 | spin_unlock_irq(&conf->resync_lock); | 601 | spin_unlock_irq(&conf->resync_lock); |
543 | } | 602 | } |
544 | 603 | ||
604 | /* duplicate the data pages for behind I/O */ | ||
605 | static struct page **alloc_behind_pages(struct bio *bio) | ||
606 | { | ||
607 | int i; | ||
608 | struct bio_vec *bvec; | ||
609 | struct page **pages = kmalloc(bio->bi_vcnt * sizeof(struct page *), | ||
610 | GFP_NOIO); | ||
611 | if (unlikely(!pages)) | ||
612 | goto do_sync_io; | ||
613 | |||
614 | memset(pages, 0, bio->bi_vcnt * sizeof(struct page *)); | ||
615 | |||
616 | bio_for_each_segment(bvec, bio, i) { | ||
617 | pages[i] = alloc_page(GFP_NOIO); | ||
618 | if (unlikely(!pages[i])) | ||
619 | goto do_sync_io; | ||
620 | memcpy(kmap(pages[i]) + bvec->bv_offset, | ||
621 | kmap(bvec->bv_page) + bvec->bv_offset, bvec->bv_len); | ||
622 | kunmap(pages[i]); | ||
623 | kunmap(bvec->bv_page); | ||
624 | } | ||
625 | |||
626 | return pages; | ||
627 | |||
628 | do_sync_io: | ||
629 | if (pages) | ||
630 | for (i = 0; i < bio->bi_vcnt && pages[i]; i++) | ||
631 | __free_page(pages[i]); | ||
632 | kfree(pages); | ||
633 | PRINTK("%dB behind alloc failed, doing sync I/O\n", bio->bi_size); | ||
634 | return NULL; | ||
635 | } | ||
636 | |||
545 | static int make_request(request_queue_t *q, struct bio * bio) | 637 | static int make_request(request_queue_t *q, struct bio * bio) |
546 | { | 638 | { |
547 | mddev_t *mddev = q->queuedata; | 639 | mddev_t *mddev = q->queuedata; |
@@ -554,7 +646,12 @@ static int make_request(request_queue_t *q, struct bio * bio) | |||
554 | struct bitmap *bitmap = mddev->bitmap; | 646 | struct bitmap *bitmap = mddev->bitmap; |
555 | unsigned long flags; | 647 | unsigned long flags; |
556 | struct bio_list bl; | 648 | struct bio_list bl; |
649 | struct page **behind_pages = NULL; | ||
557 | 650 | ||
651 | if (unlikely(bio_barrier(bio))) { | ||
652 | bio_endio(bio, bio->bi_size, -EOPNOTSUPP); | ||
653 | return 0; | ||
654 | } | ||
558 | 655 | ||
559 | /* | 656 | /* |
560 | * Register the new request and wait if the reconstruction | 657 | * Register the new request and wait if the reconstruction |
@@ -589,8 +686,6 @@ static int make_request(request_queue_t *q, struct bio * bio) | |||
589 | r1_bio->mddev = mddev; | 686 | r1_bio->mddev = mddev; |
590 | r1_bio->sector = bio->bi_sector; | 687 | r1_bio->sector = bio->bi_sector; |
591 | 688 | ||
592 | r1_bio->state = 0; | ||
593 | |||
594 | if (bio_data_dir(bio) == READ) { | 689 | if (bio_data_dir(bio) == READ) { |
595 | /* | 690 | /* |
596 | * read balancing logic: | 691 | * read balancing logic: |
@@ -651,13 +746,22 @@ static int make_request(request_queue_t *q, struct bio * bio) | |||
651 | } | 746 | } |
652 | rcu_read_unlock(); | 747 | rcu_read_unlock(); |
653 | 748 | ||
749 | BUG_ON(targets == 0); /* we never fail the last device */ | ||
750 | |||
654 | if (targets < conf->raid_disks) { | 751 | if (targets < conf->raid_disks) { |
655 | /* array is degraded, we will not clear the bitmap | 752 | /* array is degraded, we will not clear the bitmap |
656 | * on I/O completion (see raid1_end_write_request) */ | 753 | * on I/O completion (see raid1_end_write_request) */ |
657 | set_bit(R1BIO_Degraded, &r1_bio->state); | 754 | set_bit(R1BIO_Degraded, &r1_bio->state); |
658 | } | 755 | } |
659 | 756 | ||
757 | /* do behind I/O ? */ | ||
758 | if (bitmap && | ||
759 | atomic_read(&bitmap->behind_writes) < bitmap->max_write_behind && | ||
760 | (behind_pages = alloc_behind_pages(bio)) != NULL) | ||
761 | set_bit(R1BIO_BehindIO, &r1_bio->state); | ||
762 | |||
660 | atomic_set(&r1_bio->remaining, 0); | 763 | atomic_set(&r1_bio->remaining, 0); |
764 | atomic_set(&r1_bio->behind_remaining, 0); | ||
661 | 765 | ||
662 | bio_list_init(&bl); | 766 | bio_list_init(&bl); |
663 | for (i = 0; i < disks; i++) { | 767 | for (i = 0; i < disks; i++) { |
@@ -674,12 +778,31 @@ static int make_request(request_queue_t *q, struct bio * bio) | |||
674 | mbio->bi_rw = WRITE; | 778 | mbio->bi_rw = WRITE; |
675 | mbio->bi_private = r1_bio; | 779 | mbio->bi_private = r1_bio; |
676 | 780 | ||
781 | if (behind_pages) { | ||
782 | struct bio_vec *bvec; | ||
783 | int j; | ||
784 | |||
785 | /* Yes, I really want the '__' version so that | ||
786 | * we clear any unused pointer in the io_vec, rather | ||
787 | * than leave them unchanged. This is important | ||
788 | * because when we come to free the pages, we won't | ||
789 | * know the original bi_idx, so we just free | ||
790 | * them all | ||
791 | */ | ||
792 | __bio_for_each_segment(bvec, mbio, j, 0) | ||
793 | bvec->bv_page = behind_pages[j]; | ||
794 | if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags)) | ||
795 | atomic_inc(&r1_bio->behind_remaining); | ||
796 | } | ||
797 | |||
677 | atomic_inc(&r1_bio->remaining); | 798 | atomic_inc(&r1_bio->remaining); |
678 | 799 | ||
679 | bio_list_add(&bl, mbio); | 800 | bio_list_add(&bl, mbio); |
680 | } | 801 | } |
802 | kfree(behind_pages); /* the behind pages are attached to the bios now */ | ||
681 | 803 | ||
682 | bitmap_startwrite(bitmap, bio->bi_sector, r1_bio->sectors); | 804 | bitmap_startwrite(bitmap, bio->bi_sector, r1_bio->sectors, |
805 | test_bit(R1BIO_BehindIO, &r1_bio->state)); | ||
683 | spin_lock_irqsave(&conf->device_lock, flags); | 806 | spin_lock_irqsave(&conf->device_lock, flags); |
684 | bio_list_merge(&conf->pending_bio_list, &bl); | 807 | bio_list_merge(&conf->pending_bio_list, &bl); |
685 | bio_list_init(&bl); | 808 | bio_list_init(&bl); |
@@ -1105,6 +1228,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i | |||
1105 | sector_t max_sector, nr_sectors; | 1228 | sector_t max_sector, nr_sectors; |
1106 | int disk; | 1229 | int disk; |
1107 | int i; | 1230 | int i; |
1231 | int wonly; | ||
1108 | int write_targets = 0; | 1232 | int write_targets = 0; |
1109 | int sync_blocks; | 1233 | int sync_blocks; |
1110 | int still_degraded = 0; | 1234 | int still_degraded = 0; |
@@ -1160,14 +1284,21 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i | |||
1160 | */ | 1284 | */ |
1161 | disk = conf->last_used; | 1285 | disk = conf->last_used; |
1162 | /* make sure disk is operational */ | 1286 | /* make sure disk is operational */ |
1163 | 1287 | wonly = disk; | |
1164 | while (conf->mirrors[disk].rdev == NULL || | 1288 | while (conf->mirrors[disk].rdev == NULL || |
1165 | !conf->mirrors[disk].rdev->in_sync) { | 1289 | !conf->mirrors[disk].rdev->in_sync || |
1290 | test_bit(WriteMostly, &conf->mirrors[disk].rdev->flags) | ||
1291 | ) { | ||
1292 | if (conf->mirrors[disk].rdev && | ||
1293 | conf->mirrors[disk].rdev->in_sync) | ||
1294 | wonly = disk; | ||
1166 | if (disk <= 0) | 1295 | if (disk <= 0) |
1167 | disk = conf->raid_disks; | 1296 | disk = conf->raid_disks; |
1168 | disk--; | 1297 | disk--; |
1169 | if (disk == conf->last_used) | 1298 | if (disk == conf->last_used) { |
1299 | disk = wonly; | ||
1170 | break; | 1300 | break; |
1301 | } | ||
1171 | } | 1302 | } |
1172 | conf->last_used = disk; | 1303 | conf->last_used = disk; |
1173 | atomic_inc(&conf->mirrors[disk].rdev->nr_pending); | 1304 | atomic_inc(&conf->mirrors[disk].rdev->nr_pending); |
@@ -1439,6 +1570,17 @@ out: | |||
1439 | static int stop(mddev_t *mddev) | 1570 | static int stop(mddev_t *mddev) |
1440 | { | 1571 | { |
1441 | conf_t *conf = mddev_to_conf(mddev); | 1572 | conf_t *conf = mddev_to_conf(mddev); |
1573 | struct bitmap *bitmap = mddev->bitmap; | ||
1574 | int behind_wait = 0; | ||
1575 | |||
1576 | /* wait for behind writes to complete */ | ||
1577 | while (bitmap && atomic_read(&bitmap->behind_writes) > 0) { | ||
1578 | behind_wait++; | ||
1579 | printk(KERN_INFO "raid1: behind writes in progress on device %s, waiting to stop (%d)\n", mdname(mddev), behind_wait); | ||
1580 | set_current_state(TASK_UNINTERRUPTIBLE); | ||
1581 | schedule_timeout(HZ); /* wait a second */ | ||
1582 | /* need to kick something here to make sure I/O goes? */ | ||
1583 | } | ||
1442 | 1584 | ||
1443 | md_unregister_thread(mddev->thread); | 1585 | md_unregister_thread(mddev->thread); |
1444 | mddev->thread = NULL; | 1586 | mddev->thread = NULL; |
@@ -1561,6 +1703,35 @@ static int raid1_reshape(mddev_t *mddev, int raid_disks) | |||
1561 | return 0; | 1703 | return 0; |
1562 | } | 1704 | } |
1563 | 1705 | ||
1706 | static void raid1_quiesce(mddev_t *mddev, int state) | ||
1707 | { | ||
1708 | conf_t *conf = mddev_to_conf(mddev); | ||
1709 | |||
1710 | switch(state) { | ||
1711 | case 1: | ||
1712 | spin_lock_irq(&conf->resync_lock); | ||
1713 | conf->barrier++; | ||
1714 | wait_event_lock_irq(conf->wait_idle, !conf->nr_pending, | ||
1715 | conf->resync_lock, raid1_unplug(mddev->queue)); | ||
1716 | spin_unlock_irq(&conf->resync_lock); | ||
1717 | break; | ||
1718 | case 0: | ||
1719 | spin_lock_irq(&conf->resync_lock); | ||
1720 | conf->barrier--; | ||
1721 | spin_unlock_irq(&conf->resync_lock); | ||
1722 | wake_up(&conf->wait_resume); | ||
1723 | wake_up(&conf->wait_idle); | ||
1724 | break; | ||
1725 | } | ||
1726 | if (mddev->thread) { | ||
1727 | if (mddev->bitmap) | ||
1728 | mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ; | ||
1729 | else | ||
1730 | mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT; | ||
1731 | md_wakeup_thread(mddev->thread); | ||
1732 | } | ||
1733 | } | ||
1734 | |||
1564 | 1735 | ||
1565 | static mdk_personality_t raid1_personality = | 1736 | static mdk_personality_t raid1_personality = |
1566 | { | 1737 | { |
@@ -1577,6 +1748,7 @@ static mdk_personality_t raid1_personality = | |||
1577 | .sync_request = sync_request, | 1748 | .sync_request = sync_request, |
1578 | .resize = raid1_resize, | 1749 | .resize = raid1_resize, |
1579 | .reshape = raid1_reshape, | 1750 | .reshape = raid1_reshape, |
1751 | .quiesce = raid1_quiesce, | ||
1580 | }; | 1752 | }; |
1581 | 1753 | ||
1582 | static int __init raid_init(void) | 1754 | static int __init raid_init(void) |
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 62ebb1bc72be..5bd1e9ec899d 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c | |||
@@ -538,7 +538,8 @@ static int read_balance(conf_t *conf, r10bio_t *r10_bio) | |||
538 | } | 538 | } |
539 | 539 | ||
540 | 540 | ||
541 | current_distance = abs(this_sector - conf->mirrors[disk].head_position); | 541 | current_distance = abs(r10_bio->devs[slot].addr - |
542 | conf->mirrors[disk].head_position); | ||
542 | 543 | ||
543 | /* Find the disk whose head is closest */ | 544 | /* Find the disk whose head is closest */ |
544 | 545 | ||
@@ -668,6 +669,11 @@ static int make_request(request_queue_t *q, struct bio * bio) | |||
668 | int i; | 669 | int i; |
669 | int chunk_sects = conf->chunk_mask + 1; | 670 | int chunk_sects = conf->chunk_mask + 1; |
670 | 671 | ||
672 | if (unlikely(bio_barrier(bio))) { | ||
673 | bio_endio(bio, bio->bi_size, -EOPNOTSUPP); | ||
674 | return 0; | ||
675 | } | ||
676 | |||
671 | /* If this request crosses a chunk boundary, we need to | 677 | /* If this request crosses a chunk boundary, we need to |
672 | * split it. This will only happen for 1 PAGE (or less) requests. | 678 | * split it. This will only happen for 1 PAGE (or less) requests. |
673 | */ | 679 | */ |
@@ -900,6 +906,27 @@ static void close_sync(conf_t *conf) | |||
900 | conf->r10buf_pool = NULL; | 906 | conf->r10buf_pool = NULL; |
901 | } | 907 | } |
902 | 908 | ||
909 | /* check if there are enough drives for | ||
910 | * every block to appear on at least one device | ||
911 | */ | ||
912 | static int enough(conf_t *conf) | ||
913 | { | ||
914 | int first = 0; | ||
915 | |||
916 | do { | ||
917 | int n = conf->copies; | ||
918 | int cnt = 0; | ||
919 | while (n--) { | ||
920 | if (conf->mirrors[first].rdev) | ||
921 | cnt++; | ||
922 | first = (first+1) % conf->raid_disks; | ||
923 | } | ||
924 | if (cnt == 0) | ||
925 | return 0; | ||
926 | } while (first != 0); | ||
927 | return 1; | ||
928 | } | ||
929 | |||
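enough() walks the mirror slots in chunks of conf->copies, wrapping around until it returns to slot 0; if any chunk contains no working device, some block has lost every copy and the array cannot serve all data. A small userspace model of the same loop, assuming the near layout the loop implies:

    #include <stdio.h>

    /* Userspace model of enough(): same loop, with present[] standing
     * in for conf->mirrors[].rdev being non-NULL. */
    static int enough(const int *present, int raid_disks, int copies)
    {
        int first = 0;
        do {
            int n = copies, cnt = 0;
            while (n--) {
                cnt += present[first];
                first = (first + 1) % raid_disks;
            }
            if (cnt == 0)
                return 0;
        } while (first != 0);
        return 1;
    }

    int main(void)
    {
        int ok[4]  = { 1, 0, 0, 1 };    /* one survivor per copy pair */
        int bad[4] = { 0, 0, 1, 1 };    /* both copies of one block lost */

        printf("%d %d\n", enough(ok, 4, 2), enough(bad, 4, 2)); /* 1 0 */
        return 0;
    }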
903 | static int raid10_spare_active(mddev_t *mddev) | 930 | static int raid10_spare_active(mddev_t *mddev) |
904 | { | 931 | { |
905 | int i; | 932 | int i; |
@@ -938,6 +965,8 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) | |||
938 | * very different from resync | 965 | * very different from resync |
939 | */ | 966 | */ |
940 | return 0; | 967 | return 0; |
968 | if (!enough(conf)) | ||
969 | return 0; | ||
941 | 970 | ||
942 | for (mirror=0; mirror < mddev->raid_disks; mirror++) | 971 | for (mirror=0; mirror < mddev->raid_disks; mirror++) |
943 | if ( !(p=conf->mirrors+mirror)->rdev) { | 972 | if ( !(p=conf->mirrors+mirror)->rdev) { |
@@ -1445,7 +1474,13 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i | |||
1445 | } | 1474 | } |
1446 | } | 1475 | } |
1447 | if (j == conf->copies) { | 1476 | if (j == conf->copies) { |
1448 | BUG(); | 1477 | /* Cannot recover, so abort the recovery */ |
1478 | put_buf(r10_bio); | ||
1479 | r10_bio = rb2; | ||
1480 | if (!test_and_set_bit(MD_RECOVERY_ERR, &mddev->recovery)) | ||
1481 | printk(KERN_INFO "raid10: %s: insufficient working devices for recovery.\n", | ||
1482 | mdname(mddev)); | ||
1483 | break; | ||
1449 | } | 1484 | } |
1450 | } | 1485 | } |
1451 | if (biolist == NULL) { | 1486 | if (biolist == NULL) { |
@@ -1678,9 +1713,10 @@ static int run(mddev_t *mddev) | |||
1678 | init_waitqueue_head(&conf->wait_idle); | 1713 | init_waitqueue_head(&conf->wait_idle); |
1679 | init_waitqueue_head(&conf->wait_resume); | 1714 | init_waitqueue_head(&conf->wait_resume); |
1680 | 1715 | ||
1681 | if (!conf->working_disks) { | 1716 | /* need to check that every block has at least one working mirror */ |
1682 | printk(KERN_ERR "raid10: no operational mirrors for %s\n", | 1717 | if (!enough(conf)) { |
1683 | mdname(mddev)); | 1718 | printk(KERN_ERR "raid10: not enough operational mirrors for %s\n", |
1719 | mdname(mddev)); | ||
1684 | goto out_free_conf; | 1720 | goto out_free_conf; |
1685 | } | 1721 | } |
1686 | 1722 | ||
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 43f231a467d5..4683ca24c046 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c | |||
@@ -24,6 +24,8 @@ | |||
24 | #include <linux/bitops.h> | 24 | #include <linux/bitops.h> |
25 | #include <asm/atomic.h> | 25 | #include <asm/atomic.h> |
26 | 26 | ||
27 | #include <linux/raid/bitmap.h> | ||
28 | |||
27 | /* | 29 | /* |
28 | * Stripe cache | 30 | * Stripe cache |
29 | */ | 31 | */ |
@@ -79,8 +81,13 @@ static inline void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh) | |||
79 | if (test_bit(STRIPE_HANDLE, &sh->state)) { | 81 | if (test_bit(STRIPE_HANDLE, &sh->state)) { |
80 | if (test_bit(STRIPE_DELAYED, &sh->state)) | 82 | if (test_bit(STRIPE_DELAYED, &sh->state)) |
81 | list_add_tail(&sh->lru, &conf->delayed_list); | 83 | list_add_tail(&sh->lru, &conf->delayed_list); |
82 | else | 84 | else if (test_bit(STRIPE_BIT_DELAY, &sh->state) && |
85 | conf->seq_write == sh->bm_seq) | ||
86 | list_add_tail(&sh->lru, &conf->bitmap_list); | ||
87 | else { | ||
88 | clear_bit(STRIPE_BIT_DELAY, &sh->state); | ||
83 | list_add_tail(&sh->lru, &conf->handle_list); | 89 | list_add_tail(&sh->lru, &conf->handle_list); |
90 | } | ||
84 | md_wakeup_thread(conf->mddev->thread); | 91 | md_wakeup_thread(conf->mddev->thread); |
85 | } else { | 92 | } else { |
86 | if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { | 93 | if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { |
@@ -244,6 +251,9 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector | |||
244 | spin_lock_irq(&conf->device_lock); | 251 | spin_lock_irq(&conf->device_lock); |
245 | 252 | ||
246 | do { | 253 | do { |
254 | wait_event_lock_irq(conf->wait_for_stripe, | ||
255 | conf->quiesce == 0, | ||
256 | conf->device_lock, /* nothing */); | ||
247 | sh = __find_stripe(conf, sector); | 257 | sh = __find_stripe(conf, sector); |
248 | if (!sh) { | 258 | if (!sh) { |
249 | if (!conf->inactive_blocked) | 259 | if (!conf->inactive_blocked) |
@@ -803,6 +813,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in | |||
803 | { | 813 | { |
804 | struct bio **bip; | 814 | struct bio **bip; |
805 | raid5_conf_t *conf = sh->raid_conf; | 815 | raid5_conf_t *conf = sh->raid_conf; |
816 | int firstwrite=0; | ||
806 | 817 | ||
807 | PRINTK("adding bh b#%llu to stripe s#%llu\n", | 818 | PRINTK("adding bh b#%llu to stripe s#%llu\n", |
808 | (unsigned long long)bi->bi_sector, | 819 | (unsigned long long)bi->bi_sector, |
@@ -811,9 +822,11 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in | |||
811 | 822 | ||
812 | spin_lock(&sh->lock); | 823 | spin_lock(&sh->lock); |
813 | spin_lock_irq(&conf->device_lock); | 824 | spin_lock_irq(&conf->device_lock); |
814 | if (forwrite) | 825 | if (forwrite) { |
815 | bip = &sh->dev[dd_idx].towrite; | 826 | bip = &sh->dev[dd_idx].towrite; |
816 | else | 827 | if (*bip == NULL && sh->dev[dd_idx].written == NULL) |
828 | firstwrite = 1; | ||
829 | } else | ||
817 | bip = &sh->dev[dd_idx].toread; | 830 | bip = &sh->dev[dd_idx].toread; |
818 | while (*bip && (*bip)->bi_sector < bi->bi_sector) { | 831 | while (*bip && (*bip)->bi_sector < bi->bi_sector) { |
819 | if ((*bip)->bi_sector + ((*bip)->bi_size >> 9) > bi->bi_sector) | 832 | if ((*bip)->bi_sector + ((*bip)->bi_size >> 9) > bi->bi_sector) |
@@ -836,6 +849,13 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in | |||
836 | (unsigned long long)bi->bi_sector, | 849 | (unsigned long long)bi->bi_sector, |
837 | (unsigned long long)sh->sector, dd_idx); | 850 | (unsigned long long)sh->sector, dd_idx); |
838 | 851 | ||
852 | if (conf->mddev->bitmap && firstwrite) { | ||
853 | sh->bm_seq = conf->seq_write; | ||
854 | bitmap_startwrite(conf->mddev->bitmap, sh->sector, | ||
855 | STRIPE_SECTORS, 0); | ||
856 | set_bit(STRIPE_BIT_DELAY, &sh->state); | ||
857 | } | ||
858 | |||
839 | if (forwrite) { | 859 | if (forwrite) { |
840 | /* check if page is covered */ | 860 | /* check if page is covered */ |
841 | sector_t sector = sh->dev[dd_idx].sector; | 861 | sector_t sector = sh->dev[dd_idx].sector; |
@@ -958,12 +978,13 @@ static void handle_stripe(struct stripe_head *sh) | |||
958 | * need to be failed | 978 | * need to be failed |
959 | */ | 979 | */ |
960 | if (failed > 1 && to_read+to_write+written) { | 980 | if (failed > 1 && to_read+to_write+written) { |
961 | spin_lock_irq(&conf->device_lock); | ||
962 | for (i=disks; i--; ) { | 981 | for (i=disks; i--; ) { |
982 | int bitmap_end = 0; | ||
983 | spin_lock_irq(&conf->device_lock); | ||
963 | /* fail all writes first */ | 984 | /* fail all writes first */ |
964 | bi = sh->dev[i].towrite; | 985 | bi = sh->dev[i].towrite; |
965 | sh->dev[i].towrite = NULL; | 986 | sh->dev[i].towrite = NULL; |
966 | if (bi) to_write--; | 987 | if (bi) { to_write--; bitmap_end = 1; } |
967 | 988 | ||
968 | if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) | 989 | if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) |
969 | wake_up(&conf->wait_for_overlap); | 990 | wake_up(&conf->wait_for_overlap); |
@@ -981,6 +1002,7 @@ static void handle_stripe(struct stripe_head *sh) | |||
981 | /* and fail all 'written' */ | 1002 | /* and fail all 'written' */ |
982 | bi = sh->dev[i].written; | 1003 | bi = sh->dev[i].written; |
983 | sh->dev[i].written = NULL; | 1004 | sh->dev[i].written = NULL; |
1005 | if (bi) bitmap_end = 1; | ||
984 | while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS) { | 1006 | while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS) { |
985 | struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector); | 1007 | struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector); |
986 | clear_bit(BIO_UPTODATE, &bi->bi_flags); | 1008 | clear_bit(BIO_UPTODATE, &bi->bi_flags); |
@@ -1009,8 +1031,11 @@ static void handle_stripe(struct stripe_head *sh) | |||
1009 | bi = nextbi; | 1031 | bi = nextbi; |
1010 | } | 1032 | } |
1011 | } | 1033 | } |
1034 | spin_unlock_irq(&conf->device_lock); | ||
1035 | if (bitmap_end) | ||
1036 | bitmap_endwrite(conf->mddev->bitmap, sh->sector, | ||
1037 | STRIPE_SECTORS, 0, 0); | ||
1012 | } | 1038 | } |
1013 | spin_unlock_irq(&conf->device_lock); | ||
1014 | } | 1039 | } |
1015 | if (failed > 1 && syncing) { | 1040 | if (failed > 1 && syncing) { |
1016 | md_done_sync(conf->mddev, STRIPE_SECTORS,0); | 1041 | md_done_sync(conf->mddev, STRIPE_SECTORS,0); |
@@ -1038,6 +1063,7 @@ static void handle_stripe(struct stripe_head *sh) | |||
1038 | test_bit(R5_UPTODATE, &dev->flags) ) { | 1063 | test_bit(R5_UPTODATE, &dev->flags) ) { |
1039 | /* We can return any write requests */ | 1064 | /* We can return any write requests */ |
1040 | struct bio *wbi, *wbi2; | 1065 | struct bio *wbi, *wbi2; |
1066 | int bitmap_end = 0; | ||
1041 | PRINTK("Return write for disc %d\n", i); | 1067 | PRINTK("Return write for disc %d\n", i); |
1042 | spin_lock_irq(&conf->device_lock); | 1068 | spin_lock_irq(&conf->device_lock); |
1043 | wbi = dev->written; | 1069 | wbi = dev->written; |
@@ -1051,7 +1077,13 @@ static void handle_stripe(struct stripe_head *sh) | |||
1051 | } | 1077 | } |
1052 | wbi = wbi2; | 1078 | wbi = wbi2; |
1053 | } | 1079 | } |
1080 | if (dev->towrite == NULL) | ||
1081 | bitmap_end = 1; | ||
1054 | spin_unlock_irq(&conf->device_lock); | 1082 | spin_unlock_irq(&conf->device_lock); |
1083 | if (bitmap_end) | ||
1084 | bitmap_endwrite(conf->mddev->bitmap, sh->sector, | ||
1085 | STRIPE_SECTORS, | ||
1086 | !test_bit(STRIPE_DEGRADED, &sh->state), 0); | ||
1055 | } | 1087 | } |
1056 | } | 1088 | } |
1057 | } | 1089 | } |
@@ -1175,7 +1207,8 @@ static void handle_stripe(struct stripe_head *sh) | |||
1175 | } | 1207 | } |
1176 | } | 1208 | } |
1177 | /* now if nothing is locked, and if we have enough data, we can start a write request */ | 1209 | /* now if nothing is locked, and if we have enough data, we can start a write request */ |
1178 | if (locked == 0 && (rcw == 0 ||rmw == 0)) { | 1210 | if (locked == 0 && (rcw == 0 ||rmw == 0) && |
1211 | !test_bit(STRIPE_BIT_DELAY, &sh->state)) { | ||
1179 | PRINTK("Computing parity...\n"); | 1212 | PRINTK("Computing parity...\n"); |
1180 | compute_parity(sh, rcw==0 ? RECONSTRUCT_WRITE : READ_MODIFY_WRITE); | 1213 | compute_parity(sh, rcw==0 ? RECONSTRUCT_WRITE : READ_MODIFY_WRITE); |
1181 | /* now every locked buffer is ready to be written */ | 1214 | /* now every locked buffer is ready to be written */ |
@@ -1231,6 +1264,7 @@ static void handle_stripe(struct stripe_head *sh) | |||
1231 | dev = &sh->dev[failed_num]; | 1264 | dev = &sh->dev[failed_num]; |
1232 | set_bit(R5_LOCKED, &dev->flags); | 1265 | set_bit(R5_LOCKED, &dev->flags); |
1233 | set_bit(R5_Wantwrite, &dev->flags); | 1266 | set_bit(R5_Wantwrite, &dev->flags); |
1267 | clear_bit(STRIPE_DEGRADED, &sh->state); | ||
1234 | locked++; | 1268 | locked++; |
1235 | set_bit(STRIPE_INSYNC, &sh->state); | 1269 | set_bit(STRIPE_INSYNC, &sh->state); |
1236 | set_bit(R5_Syncio, &dev->flags); | 1270 | set_bit(R5_Syncio, &dev->flags); |
@@ -1298,6 +1332,8 @@ static void handle_stripe(struct stripe_head *sh) | |||
1298 | bi->bi_next = NULL; | 1332 | bi->bi_next = NULL; |
1299 | generic_make_request(bi); | 1333 | generic_make_request(bi); |
1300 | } else { | 1334 | } else { |
1335 | if (rw == 1) | ||
1336 | set_bit(STRIPE_DEGRADED, &sh->state); | ||
1301 | PRINTK("skip op %ld on disc %d for sector %llu\n", | 1337 | PRINTK("skip op %ld on disc %d for sector %llu\n", |
1302 | bi->bi_rw, i, (unsigned long long)sh->sector); | 1338 | bi->bi_rw, i, (unsigned long long)sh->sector); |
1303 | clear_bit(R5_LOCKED, &sh->dev[i].flags); | 1339 | clear_bit(R5_LOCKED, &sh->dev[i].flags); |
@@ -1322,6 +1358,20 @@ static inline void raid5_activate_delayed(raid5_conf_t *conf) | |||
1322 | } | 1358 | } |
1323 | } | 1359 | } |
1324 | 1360 | ||
1361 | static inline void activate_bit_delay(raid5_conf_t *conf) | ||
1362 | { | ||
1363 | /* device_lock is held */ | ||
1364 | struct list_head head; | ||
1365 | list_add(&head, &conf->bitmap_list); | ||
1366 | list_del_init(&conf->bitmap_list); | ||
1367 | while (!list_empty(&head)) { | ||
1368 | struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru); | ||
1369 | list_del_init(&sh->lru); | ||
1370 | atomic_inc(&sh->count); | ||
1371 | __release_stripe(conf, sh); | ||
1372 | } | ||
1373 | } | ||
1374 | |||
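activate_bit_delay() is the release half of raid5's new write-intent ordering: add_stripe_bio() stamps each first-write stripe with conf->seq_write and parks it via STRIPE_BIT_DELAY, and only after raid5d() has flushed the bitmap and advanced seq_write are those stripes returned to the handle list. The sequence, summarized as comments since each step appears in the hunks of this patch:

    /* Write ordering for bitmap-tracked stripes (illustrative summary):
     *
     *  1. add_stripe_bio():   bitmap_startwrite(); sh->bm_seq = seq_write;
     *                         set STRIPE_BIT_DELAY
     *  2. __release_stripe(): while seq_write == bm_seq, park the stripe
     *                         on conf->bitmap_list, not handle_list
     *  3. unplug:             conf->seq_flush++
     *  4. raid5d():           bitmap_unplug(); seq_write = seq_flush;
     *                         activate_bit_delay() -- stripes may write
     */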
1325 | static void unplug_slaves(mddev_t *mddev) | 1375 | static void unplug_slaves(mddev_t *mddev) |
1326 | { | 1376 | { |
1327 | raid5_conf_t *conf = mddev_to_conf(mddev); | 1377 | raid5_conf_t *conf = mddev_to_conf(mddev); |
@@ -1354,8 +1404,10 @@ static void raid5_unplug_device(request_queue_t *q) | |||
1354 | 1404 | ||
1355 | spin_lock_irqsave(&conf->device_lock, flags); | 1405 | spin_lock_irqsave(&conf->device_lock, flags); |
1356 | 1406 | ||
1357 | if (blk_remove_plug(q)) | 1407 | if (blk_remove_plug(q)) { |
1408 | conf->seq_flush++; | ||
1358 | raid5_activate_delayed(conf); | 1409 | raid5_activate_delayed(conf); |
1410 | } | ||
1359 | md_wakeup_thread(mddev->thread); | 1411 | md_wakeup_thread(mddev->thread); |
1360 | 1412 | ||
1361 | spin_unlock_irqrestore(&conf->device_lock, flags); | 1413 | spin_unlock_irqrestore(&conf->device_lock, flags); |
@@ -1411,6 +1463,11 @@ static int make_request (request_queue_t *q, struct bio * bi) | |||
1411 | sector_t logical_sector, last_sector; | 1463 | sector_t logical_sector, last_sector; |
1412 | struct stripe_head *sh; | 1464 | struct stripe_head *sh; |
1413 | 1465 | ||
1466 | if (unlikely(bio_barrier(bi))) { | ||
1467 | bio_endio(bi, bi->bi_size, -EOPNOTSUPP); | ||
1468 | return 0; | ||
1469 | } | ||
1470 | |||
1414 | md_write_start(mddev, bi); | 1471 | md_write_start(mddev, bi); |
1415 | 1472 | ||
1416 | if (bio_data_dir(bi)==WRITE) { | 1473 | if (bio_data_dir(bi)==WRITE) { |
@@ -1488,10 +1545,20 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i | |||
1488 | sector_t first_sector; | 1545 | sector_t first_sector; |
1489 | int raid_disks = conf->raid_disks; | 1546 | int raid_disks = conf->raid_disks; |
1490 | int data_disks = raid_disks-1; | 1547 | int data_disks = raid_disks-1; |
1548 | sector_t max_sector = mddev->size << 1; | ||
1549 | int sync_blocks; | ||
1491 | 1550 | ||
1492 | if (sector_nr >= mddev->size <<1) { | 1551 | if (sector_nr >= max_sector) { |
1493 | /* just being told to finish up .. nothing much to do */ | 1552 | /* just being told to finish up .. nothing much to do */ |
1494 | unplug_slaves(mddev); | 1553 | unplug_slaves(mddev); |
1554 | |||
1555 | if (mddev->curr_resync < max_sector) /* aborted */ | ||
1556 | bitmap_end_sync(mddev->bitmap, mddev->curr_resync, | ||
1557 | &sync_blocks, 1); | ||
1558 | else /* compelted sync */ | ||
1559 | conf->fullsync = 0; | ||
1560 | bitmap_close_sync(mddev->bitmap); | ||
1561 | |||
1495 | return 0; | 1562 | return 0; |
1496 | } | 1563 | } |
1497 | /* if there is 1 or more failed drives and we are trying | 1564 | /* if there is 1 or more failed drives and we are trying |
@@ -1503,6 +1570,13 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i | |||
1503 | *skipped = 1; | 1570 | *skipped = 1; |
1504 | return rv; | 1571 | return rv; |
1505 | } | 1572 | } |
1573 | if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) && | ||
1574 | !conf->fullsync && sync_blocks >= STRIPE_SECTORS) { | ||
1575 | /* we can skip this block, and probably more */ | ||
1576 | sync_blocks /= STRIPE_SECTORS; | ||
1577 | *skipped = 1; | ||
1578 | return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */ | ||
1579 | } | ||
1506 | 1580 | ||
1507 | x = sector_nr; | 1581 | x = sector_nr; |
1508 | chunk_offset = sector_div(x, sectors_per_chunk); | 1582 | chunk_offset = sector_div(x, sectors_per_chunk); |
@@ -1520,6 +1594,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i | |||
1520 | set_current_state(TASK_UNINTERRUPTIBLE); | 1594 | set_current_state(TASK_UNINTERRUPTIBLE); |
1521 | schedule_timeout(1); | 1595 | schedule_timeout(1); |
1522 | } | 1596 | } |
1597 | bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 0); | ||
1523 | spin_lock(&sh->lock); | 1598 | spin_lock(&sh->lock); |
1524 | set_bit(STRIPE_SYNCING, &sh->state); | 1599 | set_bit(STRIPE_SYNCING, &sh->state); |
1525 | clear_bit(STRIPE_INSYNC, &sh->state); | 1600 | clear_bit(STRIPE_INSYNC, &sh->state); |
@@ -1553,6 +1628,13 @@ static void raid5d (mddev_t *mddev) | |||
1553 | while (1) { | 1628 | while (1) { |
1554 | struct list_head *first; | 1629 | struct list_head *first; |
1555 | 1630 | ||
1631 | if (conf->seq_flush - conf->seq_write > 0) { | ||
1632 | int seq = conf->seq_flush; | ||
1633 | bitmap_unplug(mddev->bitmap); | ||
1634 | conf->seq_write = seq; | ||
1635 | activate_bit_delay(conf); | ||
1636 | } | ||
1637 | |||
1556 | if (list_empty(&conf->handle_list) && | 1638 | if (list_empty(&conf->handle_list) && |
1557 | atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD && | 1639 | atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD && |
1558 | !blk_queue_plugged(mddev->queue) && | 1640 | !blk_queue_plugged(mddev->queue) && |
@@ -1586,7 +1668,7 @@ static void raid5d (mddev_t *mddev) | |||
1586 | PRINTK("--- raid5d inactive\n"); | 1668 | PRINTK("--- raid5d inactive\n"); |
1587 | } | 1669 | } |
1588 | 1670 | ||
1589 | static int run (mddev_t *mddev) | 1671 | static int run(mddev_t *mddev) |
1590 | { | 1672 | { |
1591 | raid5_conf_t *conf; | 1673 | raid5_conf_t *conf; |
1592 | int raid_disk, memory; | 1674 | int raid_disk, memory; |
@@ -1616,6 +1698,7 @@ static int run (mddev_t *mddev) | |||
1616 | init_waitqueue_head(&conf->wait_for_overlap); | 1698 | init_waitqueue_head(&conf->wait_for_overlap); |
1617 | INIT_LIST_HEAD(&conf->handle_list); | 1699 | INIT_LIST_HEAD(&conf->handle_list); |
1618 | INIT_LIST_HEAD(&conf->delayed_list); | 1700 | INIT_LIST_HEAD(&conf->delayed_list); |
1701 | INIT_LIST_HEAD(&conf->bitmap_list); | ||
1619 | INIT_LIST_HEAD(&conf->inactive_list); | 1702 | INIT_LIST_HEAD(&conf->inactive_list); |
1620 | atomic_set(&conf->active_stripes, 0); | 1703 | atomic_set(&conf->active_stripes, 0); |
1621 | atomic_set(&conf->preread_active_stripes, 0); | 1704 | atomic_set(&conf->preread_active_stripes, 0); |
@@ -1727,6 +1810,9 @@ memory = conf->max_nr_stripes * (sizeof(struct stripe_head) + | |||
1727 | 1810 | ||
1728 | /* Ok, everything is just fine now */ | 1811 | /* Ok, everything is just fine now */ |
1729 | 1812 | ||
1813 | if (mddev->bitmap) | ||
1814 | mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ; | ||
1815 | |||
1730 | mddev->queue->unplug_fn = raid5_unplug_device; | 1816 | mddev->queue->unplug_fn = raid5_unplug_device; |
1731 | mddev->queue->issue_flush_fn = raid5_issue_flush; | 1817 | mddev->queue->issue_flush_fn = raid5_issue_flush; |
1732 | 1818 | ||
@@ -1907,6 +1993,8 @@ static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) | |||
1907 | rdev->in_sync = 0; | 1993 | rdev->in_sync = 0; |
1908 | rdev->raid_disk = disk; | 1994 | rdev->raid_disk = disk; |
1909 | found = 1; | 1995 | found = 1; |
1996 | if (rdev->saved_raid_disk != disk) | ||
1997 | conf->fullsync = 1; | ||
1910 | p->rdev = rdev; | 1998 | p->rdev = rdev; |
1911 | break; | 1999 | break; |
1912 | } | 2000 | } |
@@ -1936,6 +2024,35 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors) | |||
1936 | return 0; | 2024 | return 0; |
1937 | } | 2025 | } |
1938 | 2026 | ||
2027 | static void raid5_quiesce(mddev_t *mddev, int state) | ||
2028 | { | ||
2029 | raid5_conf_t *conf = mddev_to_conf(mddev); | ||
2030 | |||
2031 | switch(state) { | ||
2032 | case 1: /* stop all writes */ | ||
2033 | spin_lock_irq(&conf->device_lock); | ||
2034 | conf->quiesce = 1; | ||
2035 | wait_event_lock_irq(conf->wait_for_stripe, | ||
2036 | atomic_read(&conf->active_stripes) == 0, | ||
2037 | conf->device_lock, /* nothing */); | ||
2038 | spin_unlock_irq(&conf->device_lock); | ||
2039 | break; | ||
2040 | |||
2041 | case 0: /* re-enable writes */ | ||
2042 | spin_lock_irq(&conf->device_lock); | ||
2043 | conf->quiesce = 0; | ||
2044 | wake_up(&conf->wait_for_stripe); | ||
2045 | spin_unlock_irq(&conf->device_lock); | ||
2046 | break; | ||
2047 | } | ||
2048 | if (mddev->thread) { | ||
2049 | if (mddev->bitmap) | ||
2050 | mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ; | ||
2051 | else | ||
2052 | mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT; | ||
2053 | md_wakeup_thread(mddev->thread); | ||
2054 | } | ||
2055 | } | ||
1939 | static mdk_personality_t raid5_personality= | 2056 | static mdk_personality_t raid5_personality= |
1940 | { | 2057 | { |
1941 | .name = "raid5", | 2058 | .name = "raid5", |
@@ -1950,6 +2067,7 @@ static mdk_personality_t raid5_personality= | |||
1950 | .spare_active = raid5_spare_active, | 2067 | .spare_active = raid5_spare_active, |
1951 | .sync_request = sync_request, | 2068 | .sync_request = sync_request, |
1952 | .resize = raid5_resize, | 2069 | .resize = raid5_resize, |
2070 | .quiesce = raid5_quiesce, | ||
1953 | }; | 2071 | }; |
1954 | 2072 | ||
1955 | static int __init raid5_init (void) | 2073 | static int __init raid5_init (void) |
diff --git a/drivers/md/raid6main.c b/drivers/md/raid6main.c index 495dee1d1e83..267eb1430c83 100644 --- a/drivers/md/raid6main.c +++ b/drivers/md/raid6main.c | |||
@@ -29,6 +29,8 @@ | |||
29 | #include <asm/atomic.h> | 29 | #include <asm/atomic.h> |
30 | #include "raid6.h" | 30 | #include "raid6.h" |
31 | 31 | ||
32 | #include <linux/raid/bitmap.h> | ||
33 | |||
32 | /* | 34 | /* |
33 | * Stripe cache | 35 | * Stripe cache |
34 | */ | 36 | */ |
@@ -98,8 +100,13 @@ static inline void __release_stripe(raid6_conf_t *conf, struct stripe_head *sh) | |||
98 | if (test_bit(STRIPE_HANDLE, &sh->state)) { | 100 | if (test_bit(STRIPE_HANDLE, &sh->state)) { |
99 | if (test_bit(STRIPE_DELAYED, &sh->state)) | 101 | if (test_bit(STRIPE_DELAYED, &sh->state)) |
100 | list_add_tail(&sh->lru, &conf->delayed_list); | 102 | list_add_tail(&sh->lru, &conf->delayed_list); |
101 | else | 103 | else if (test_bit(STRIPE_BIT_DELAY, &sh->state) && |
104 | conf->seq_write == sh->bm_seq) | ||
105 | list_add_tail(&sh->lru, &conf->bitmap_list); | ||
106 | else { | ||
107 | clear_bit(STRIPE_BIT_DELAY, &sh->state); | ||
102 | list_add_tail(&sh->lru, &conf->handle_list); | 108 | list_add_tail(&sh->lru, &conf->handle_list); |
109 | } | ||
103 | md_wakeup_thread(conf->mddev->thread); | 110 | md_wakeup_thread(conf->mddev->thread); |
104 | } else { | 111 | } else { |
105 | if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { | 112 | if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { |
@@ -262,6 +269,9 @@ static struct stripe_head *get_active_stripe(raid6_conf_t *conf, sector_t sector | |||
262 | spin_lock_irq(&conf->device_lock); | 269 | spin_lock_irq(&conf->device_lock); |
263 | 270 | ||
264 | do { | 271 | do { |
272 | wait_event_lock_irq(conf->wait_for_stripe, | ||
273 | conf->quiesce == 0, | ||
274 | conf->device_lock, /* nothing */); | ||
265 | sh = __find_stripe(conf, sector); | 275 | sh = __find_stripe(conf, sector); |
266 | if (!sh) { | 276 | if (!sh) { |
267 | if (!conf->inactive_blocked) | 277 | if (!conf->inactive_blocked) |
@@ -906,6 +916,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in | |||
906 | { | 916 | { |
907 | struct bio **bip; | 917 | struct bio **bip; |
908 | raid6_conf_t *conf = sh->raid_conf; | 918 | raid6_conf_t *conf = sh->raid_conf; |
919 | int firstwrite=0; | ||
909 | 920 | ||
910 | PRINTK("adding bh b#%llu to stripe s#%llu\n", | 921 | PRINTK("adding bh b#%llu to stripe s#%llu\n", |
911 | (unsigned long long)bi->bi_sector, | 922 | (unsigned long long)bi->bi_sector, |
@@ -914,9 +925,11 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in | |||
914 | 925 | ||
915 | spin_lock(&sh->lock); | 926 | spin_lock(&sh->lock); |
916 | spin_lock_irq(&conf->device_lock); | 927 | spin_lock_irq(&conf->device_lock); |
917 | if (forwrite) | 928 | if (forwrite) { |
918 | bip = &sh->dev[dd_idx].towrite; | 929 | bip = &sh->dev[dd_idx].towrite; |
919 | else | 930 | if (*bip == NULL && sh->dev[dd_idx].written == NULL) |
931 | firstwrite = 1; | ||
932 | } else | ||
920 | bip = &sh->dev[dd_idx].toread; | 933 | bip = &sh->dev[dd_idx].toread; |
921 | while (*bip && (*bip)->bi_sector < bi->bi_sector) { | 934 | while (*bip && (*bip)->bi_sector < bi->bi_sector) { |
922 | if ((*bip)->bi_sector + ((*bip)->bi_size >> 9) > bi->bi_sector) | 935 | if ((*bip)->bi_sector + ((*bip)->bi_size >> 9) > bi->bi_sector) |
@@ -939,6 +952,13 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in | |||
939 | (unsigned long long)bi->bi_sector, | 952 | (unsigned long long)bi->bi_sector, |
940 | (unsigned long long)sh->sector, dd_idx); | 953 | (unsigned long long)sh->sector, dd_idx); |
941 | 954 | ||
955 | if (conf->mddev->bitmap && firstwrite) { | ||
956 | sh->bm_seq = conf->seq_write; | ||
957 | bitmap_startwrite(conf->mddev->bitmap, sh->sector, | ||
958 | STRIPE_SECTORS, 0); | ||
959 | set_bit(STRIPE_BIT_DELAY, &sh->state); | ||
960 | } | ||
961 | |||
942 | if (forwrite) { | 962 | if (forwrite) { |
943 | /* check if page is covered */ | 963 | /* check if page is covered */ |
944 | sector_t sector = sh->dev[dd_idx].sector; | 964 | sector_t sector = sh->dev[dd_idx].sector; |
@@ -1066,12 +1086,13 @@ static void handle_stripe(struct stripe_head *sh) | |||
1066 | * need to be failed | 1086 | * need to be failed |
1067 | */ | 1087 | */ |
1068 | if (failed > 2 && to_read+to_write+written) { | 1088 | if (failed > 2 && to_read+to_write+written) { |
1069 | spin_lock_irq(&conf->device_lock); | ||
1070 | for (i=disks; i--; ) { | 1089 | for (i=disks; i--; ) { |
1090 | int bitmap_end = 0; | ||
1091 | spin_lock_irq(&conf->device_lock); | ||
1071 | /* fail all writes first */ | 1092 | /* fail all writes first */ |
1072 | bi = sh->dev[i].towrite; | 1093 | bi = sh->dev[i].towrite; |
1073 | sh->dev[i].towrite = NULL; | 1094 | sh->dev[i].towrite = NULL; |
1074 | if (bi) to_write--; | 1095 | if (bi) { to_write--; bitmap_end = 1; } |
1075 | 1096 | ||
1076 | if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) | 1097 | if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) |
1077 | wake_up(&conf->wait_for_overlap); | 1098 | wake_up(&conf->wait_for_overlap); |
@@ -1089,6 +1110,7 @@ static void handle_stripe(struct stripe_head *sh) | |||
1089 | /* and fail all 'written' */ | 1110 | /* and fail all 'written' */ |
1090 | bi = sh->dev[i].written; | 1111 | bi = sh->dev[i].written; |
1091 | sh->dev[i].written = NULL; | 1112 | sh->dev[i].written = NULL; |
1113 | if (bi) bitmap_end = 1; | ||
1092 | while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS) { | 1114 | while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS) { |
1093 | struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector); | 1115 | struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector); |
1094 | clear_bit(BIO_UPTODATE, &bi->bi_flags); | 1116 | clear_bit(BIO_UPTODATE, &bi->bi_flags); |
@@ -1117,8 +1139,11 @@ static void handle_stripe(struct stripe_head *sh) | |||
1117 | bi = nextbi; | 1139 | bi = nextbi; |
1118 | } | 1140 | } |
1119 | } | 1141 | } |
1142 | spin_unlock_irq(&conf->device_lock); | ||
1143 | if (bitmap_end) | ||
1144 | bitmap_endwrite(conf->mddev->bitmap, sh->sector, | ||
1145 | STRIPE_SECTORS, 0, 0); | ||
1120 | } | 1146 | } |
1121 | spin_unlock_irq(&conf->device_lock); | ||
1122 | } | 1147 | } |
1123 | if (failed > 2 && syncing) { | 1148 | if (failed > 2 && syncing) { |
1124 | md_done_sync(conf->mddev, STRIPE_SECTORS,0); | 1149 | md_done_sync(conf->mddev, STRIPE_SECTORS,0); |
@@ -1155,6 +1180,7 @@ static void handle_stripe(struct stripe_head *sh) | |||
1155 | if (!test_bit(R5_LOCKED, &dev->flags) && | 1180 | if (!test_bit(R5_LOCKED, &dev->flags) && |
1156 | test_bit(R5_UPTODATE, &dev->flags) ) { | 1181 | test_bit(R5_UPTODATE, &dev->flags) ) { |
1157 | /* We can return any write requests */ | 1182 | /* We can return any write requests */ |
1183 | int bitmap_end = 0; | ||
1158 | struct bio *wbi, *wbi2; | 1184 | struct bio *wbi, *wbi2; |
1159 | PRINTK("Return write for stripe %llu disc %d\n", | 1185 | PRINTK("Return write for stripe %llu disc %d\n", |
1160 | (unsigned long long)sh->sector, i); | 1186 | (unsigned long long)sh->sector, i); |
@@ -1170,7 +1196,13 @@ static void handle_stripe(struct stripe_head *sh) | |||
1170 | } | 1196 | } |
1171 | wbi = wbi2; | 1197 | wbi = wbi2; |
1172 | } | 1198 | } |
1199 | if (dev->towrite == NULL) | ||
1200 | bitmap_end = 1; | ||
1173 | spin_unlock_irq(&conf->device_lock); | 1201 | spin_unlock_irq(&conf->device_lock); |
1202 | if (bitmap_end) | ||
1203 | bitmap_endwrite(conf->mddev->bitmap, sh->sector, | ||
1204 | STRIPE_SECTORS, | ||
1205 | !test_bit(STRIPE_DEGRADED, &sh->state), 0); | ||
1174 | } | 1206 | } |
1175 | } | 1207 | } |
1176 | } | 1208 | } |
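In the completed-write path just above, the endwrite is deferred until the device's write queue has drained (dev->towrite == NULL after the bios are returned), and the success flag passed to bitmap_endwrite() is !test_bit(STRIPE_DEGRADED, &sh->state): a stripe that had to skip a failed disk reports failure, leaving the bitmap bit set so resync revisits the range. The decision in miniature (illustrative names, not the kernel's):

    /* Illustrative: clear the dirty bit only for fully redundant writes. */
    static void end_write(int *dirty, int degraded)
    {
            if (!degraded)
                    *dirty = 0;     /* every copy written, safe to clear */
            /* else leave it set: resync must rebuild the skipped disk */
    }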
@@ -1285,7 +1317,8 @@ static void handle_stripe(struct stripe_head *sh) | |||
1285 | } | 1317 | } |
1286 | } | 1318 | } |
1287 | /* now if nothing is locked, and if we have enough data, we can start a write request */ | 1319 | /* now if nothing is locked, and if we have enough data, we can start a write request */ |
1288 | if (locked == 0 && rcw == 0) { | 1320 | if (locked == 0 && rcw == 0 && |
1321 | !test_bit(STRIPE_BIT_DELAY, &sh->state)) { | ||
1289 | if ( must_compute > 0 ) { | 1322 | if ( must_compute > 0 ) { |
1290 | /* We have failed blocks and need to compute them */ | 1323 | /* We have failed blocks and need to compute them */ |
1291 | switch ( failed ) { | 1324 | switch ( failed ) { |
@@ -1388,6 +1421,7 @@ static void handle_stripe(struct stripe_head *sh) | |||
1388 | bdev = &sh->dev[failed_num[1]]; | 1421 | bdev = &sh->dev[failed_num[1]]; |
1389 | locked += !test_bit(R5_LOCKED, &bdev->flags); | 1422 | locked += !test_bit(R5_LOCKED, &bdev->flags); |
1390 | set_bit(R5_LOCKED, &bdev->flags); | 1423 | set_bit(R5_LOCKED, &bdev->flags); |
1424 | clear_bit(STRIPE_DEGRADED, &sh->state); | ||
1391 | set_bit(R5_Wantwrite, &bdev->flags); | 1425 | set_bit(R5_Wantwrite, &bdev->flags); |
1392 | 1426 | ||
1393 | set_bit(STRIPE_INSYNC, &sh->state); | 1427 | set_bit(STRIPE_INSYNC, &sh->state); |
@@ -1457,6 +1491,8 @@ static void handle_stripe(struct stripe_head *sh) | |||
1457 | bi->bi_next = NULL; | 1491 | bi->bi_next = NULL; |
1458 | generic_make_request(bi); | 1492 | generic_make_request(bi); |
1459 | } else { | 1493 | } else { |
1494 | if (rw == 1) | ||
1495 | set_bit(STRIPE_DEGRADED, &sh->state); | ||
1460 | PRINTK("skip op %ld on disc %d for sector %llu\n", | 1496 | PRINTK("skip op %ld on disc %d for sector %llu\n", |
1461 | bi->bi_rw, i, (unsigned long long)sh->sector); | 1497 | bi->bi_rw, i, (unsigned long long)sh->sector); |
1462 | clear_bit(R5_LOCKED, &sh->dev[i].flags); | 1498 | clear_bit(R5_LOCKED, &sh->dev[i].flags); |
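These two hunks define the STRIPE_DEGRADED life cycle the endwrite logic above consumes: the flag is set whenever a write (rw == 1) is skipped because the target device is gone, and cleared in the recovery path once the failed device's block is about to be rewritten. As bit operations (sketch; 'state' stands in for sh->state):

    /* Illustrative flag handling only. */
    enum { STRIPE_DEGRADED_F = 1UL << 0 };

    static void note_skipped_write(unsigned long *state)
    { *state |= STRIPE_DEGRADED_F; }    /* write bypassed a dead disk */

    static void note_recovery_rewrite(unsigned long *state)
    { *state &= ~STRIPE_DEGRADED_F; }   /* disk block being rebuilt */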
@@ -1481,6 +1517,20 @@ static inline void raid6_activate_delayed(raid6_conf_t *conf) | |||
1481 | } | 1517 | } |
1482 | } | 1518 | } |
1483 | 1519 | ||
1520 | static inline void activate_bit_delay(raid6_conf_t *conf) | ||
1521 | { | ||
1522 | /* device_lock is held */ | ||
1523 | struct list_head head; | ||
1524 | list_add(&head, &conf->bitmap_list); | ||
1525 | list_del_init(&conf->bitmap_list); | ||
1526 | while (!list_empty(&head)) { | ||
1527 | struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru); | ||
1528 | list_del_init(&sh->lru); | ||
1529 | atomic_inc(&sh->count); | ||
1530 | __release_stripe(conf, sh); | ||
1531 | } | ||
1532 | } | ||
1533 | |||
1484 | static void unplug_slaves(mddev_t *mddev) | 1534 | static void unplug_slaves(mddev_t *mddev) |
1485 | { | 1535 | { |
1486 | raid6_conf_t *conf = mddev_to_conf(mddev); | 1536 | raid6_conf_t *conf = mddev_to_conf(mddev); |
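activate_bit_delay() drains conf->bitmap_list with the classic O(1) splice: list_add() hooks a stack-local head into the ring right after the list head, list_del_init() then unlinks the list head itself, and every delayed stripe becomes reachable only through the local head. The idiom in freestanding form (assuming a kernel-style circular doubly linked list; names illustrative):

    struct list { struct list *prev, *next; };

    static void list_add_after(struct list *n, struct list *where)
    {
            n->next = where->next;
            n->prev = where;
            where->next->prev = n;
            where->next = n;
    }

    static void list_del_init(struct list *n)
    {
            n->prev->next = n->next;
            n->next->prev = n->prev;
            n->next = n->prev = n;      /* leaves 'n' an empty list */
    }

    /* splice all members of 'src' onto local 'head' in O(1) */
    static void splice_all(struct list *src, struct list *head)
    {
            list_add_after(head, src);
            list_del_init(src);
    }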
@@ -1513,8 +1563,10 @@ static void raid6_unplug_device(request_queue_t *q) | |||
1513 | 1563 | ||
1514 | spin_lock_irqsave(&conf->device_lock, flags); | 1564 | spin_lock_irqsave(&conf->device_lock, flags); |
1515 | 1565 | ||
1516 | if (blk_remove_plug(q)) | 1566 | if (blk_remove_plug(q)) { |
1567 | conf->seq_flush++; | ||
1517 | raid6_activate_delayed(conf); | 1568 | raid6_activate_delayed(conf); |
1569 | } | ||
1518 | md_wakeup_thread(mddev->thread); | 1570 | md_wakeup_thread(mddev->thread); |
1519 | 1571 | ||
1520 | spin_unlock_irqrestore(&conf->device_lock, flags); | 1572 | spin_unlock_irqrestore(&conf->device_lock, flags); |
@@ -1570,6 +1622,11 @@ static int make_request (request_queue_t *q, struct bio * bi) | |||
1570 | sector_t logical_sector, last_sector; | 1622 | sector_t logical_sector, last_sector; |
1571 | struct stripe_head *sh; | 1623 | struct stripe_head *sh; |
1572 | 1624 | ||
1625 | if (unlikely(bio_barrier(bi))) { | ||
1626 | bio_endio(bi, bi->bi_size, -EOPNOTSUPP); | ||
1627 | return 0; | ||
1628 | } | ||
1629 | |||
1573 | md_write_start(mddev, bi); | 1630 | md_write_start(mddev, bi); |
1574 | 1631 | ||
1575 | if (bio_data_dir(bi)==WRITE) { | 1632 | if (bio_data_dir(bi)==WRITE) { |
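make_request() now refuses barrier bios outright: raid6 cannot preserve a barrier's ordering guarantee across member devices, so the bio is completed immediately with -EOPNOTSUPP, and callers that see that error typically retry without the barrier flag. The early-reject shape, as in the hunk, with fuller comments:

    if (unlikely(bio_barrier(bi))) {
            /* complete the whole bio (bi->bi_size bytes) with an error */
            bio_endio(bi, bi->bi_size, -EOPNOTSUPP);
            return 0;               /* request fully consumed */
    }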
@@ -1647,10 +1704,20 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i | |||
1647 | sector_t first_sector; | 1704 | sector_t first_sector; |
1648 | int raid_disks = conf->raid_disks; | 1705 | int raid_disks = conf->raid_disks; |
1649 | int data_disks = raid_disks - 2; | 1706 | int data_disks = raid_disks - 2; |
1707 | sector_t max_sector = mddev->size << 1; | ||
1708 | int sync_blocks; | ||
1650 | 1709 | ||
1651 | if (sector_nr >= mddev->size <<1) { | 1710 | if (sector_nr >= max_sector) { |
1652 | /* just being told to finish up .. nothing much to do */ | 1711 | /* just being told to finish up .. nothing much to do */ |
1653 | unplug_slaves(mddev); | 1712 | unplug_slaves(mddev); |
1713 | |||
1714 | if (mddev->curr_resync < max_sector) /* aborted */ | ||
1715 | bitmap_end_sync(mddev->bitmap, mddev->curr_resync, | ||
1716 | &sync_blocks, 1); | ||
1717 | else /* completed sync */ | ||
1718 | conf->fullsync = 0; | ||
1719 | bitmap_close_sync(mddev->bitmap); | ||
1720 | |||
1654 | return 0; | 1721 | return 0; |
1655 | } | 1722 | } |
1656 | /* if there are 2 or more failed drives and we are trying | 1723 | /* if there are 2 or more failed drives and we are trying |
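The finish-up branch now reconciles the bitmap when resync stops: if md aborted early (mddev->curr_resync still short of max_sector) the in-flight sync window is given back with bitmap_end_sync() so those bits stay dirty, while a completed pass clears conf->fullsync; either way bitmap_close_sync() finalises the bitmap. Restated with fuller comments:

    if (sector_nr >= max_sector) {          /* no more blocks to sync */
            unplug_slaves(mddev);
            if (mddev->curr_resync < max_sector)
                    /* aborted: give back the unfinished window */
                    bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
                                    &sync_blocks, 1);
            else
                    /* completed: future resyncs can trust the bitmap */
                    conf->fullsync = 0;
            bitmap_close_sync(mddev->bitmap);
            return 0;
    }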
@@ -1662,6 +1729,13 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i | |||
1662 | *skipped = 1; | 1729 | *skipped = 1; |
1663 | return rv; | 1730 | return rv; |
1664 | } | 1731 | } |
1732 | if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) && | ||
1733 | !conf->fullsync && sync_blocks >= STRIPE_SECTORS) { | ||
1734 | /* we can skip this block, and probably more */ | ||
1735 | sync_blocks /= STRIPE_SECTORS; | ||
1736 | *skipped = 1; | ||
1737 | return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */ | ||
1738 | } | ||
1665 | 1739 | ||
1666 | x = sector_nr; | 1740 | x = sector_nr; |
1667 | chunk_offset = sector_div(x, sectors_per_chunk); | 1741 | chunk_offset = sector_div(x, sectors_per_chunk); |
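The skip path keeps the resync cursor stripe-aligned: sync_blocks comes back from bitmap_start_sync() in sectors, and dividing then re-multiplying by STRIPE_SECTORS rounds it down to whole stripes before it is returned as the distance skipped. For example, with 4 KiB stripe pages and 512-byte sectors STRIPE_SECTORS is 8, so a clean run of 53 sectors advances the cursor by 48:

    /* keep things rounded to whole stripes */
    sync_blocks /= STRIPE_SECTORS;          /* 53 sectors -> 6 stripes  */
    return sync_blocks * STRIPE_SECTORS;    /* 6 * 8 = 48 sectors moved */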
@@ -1679,6 +1753,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i | |||
1679 | set_current_state(TASK_UNINTERRUPTIBLE); | 1753 | set_current_state(TASK_UNINTERRUPTIBLE); |
1680 | schedule_timeout(1); | 1754 | schedule_timeout(1); |
1681 | } | 1755 | } |
1756 | bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 0); | ||
1682 | spin_lock(&sh->lock); | 1757 | spin_lock(&sh->lock); |
1683 | set_bit(STRIPE_SYNCING, &sh->state); | 1758 | set_bit(STRIPE_SYNCING, &sh->state); |
1684 | clear_bit(STRIPE_INSYNC, &sh->state); | 1759 | clear_bit(STRIPE_INSYNC, &sh->state); |
@@ -1712,6 +1787,13 @@ static void raid6d (mddev_t *mddev) | |||
1712 | while (1) { | 1787 | while (1) { |
1713 | struct list_head *first; | 1788 | struct list_head *first; |
1714 | 1789 | ||
1790 | if (conf->seq_flush - conf->seq_write > 0) { | ||
1791 | int seq = conf->seq_flush; | ||
1792 | bitmap_unplug(mddev->bitmap); | ||
1793 | conf->seq_write = seq; | ||
1794 | activate_bit_delay(conf); | ||
1795 | } | ||
1796 | |||
1715 | if (list_empty(&conf->handle_list) && | 1797 | if (list_empty(&conf->handle_list) && |
1716 | atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD && | 1798 | atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD && |
1717 | !blk_queue_plugged(mddev->queue) && | 1799 | !blk_queue_plugged(mddev->queue) && |
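raid6d's new preamble closes the loop on STRIPE_BIT_DELAY: each unplug bumps conf->seq_flush (see the raid6_unplug_device() hunk above), and before the daemon releases any delayed stripe it writes the bitmap out and only then advances conf->seq_write, so a stripe's dirty bit is durable on disk before its data write can be issued. The ordering, with fuller comments:

    if (conf->seq_flush - conf->seq_write > 0) {    /* unplugs since last flush */
            int seq = conf->seq_flush;      /* sample first; may advance again */
            bitmap_unplug(mddev->bitmap);   /* bitmap pages hit stable storage */
            conf->seq_write = seq;          /* all delays up to 'seq' satisfied */
            activate_bit_delay(conf);       /* now safe to issue the writes */
    }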
@@ -1745,7 +1827,7 @@ static void raid6d (mddev_t *mddev) | |||
1745 | PRINTK("--- raid6d inactive\n"); | 1827 | PRINTK("--- raid6d inactive\n"); |
1746 | } | 1828 | } |
1747 | 1829 | ||
1748 | static int run (mddev_t *mddev) | 1830 | static int run(mddev_t *mddev) |
1749 | { | 1831 | { |
1750 | raid6_conf_t *conf; | 1832 | raid6_conf_t *conf; |
1751 | int raid_disk, memory; | 1833 | int raid_disk, memory; |
@@ -1775,6 +1857,7 @@ static int run (mddev_t *mddev) | |||
1775 | init_waitqueue_head(&conf->wait_for_overlap); | 1857 | init_waitqueue_head(&conf->wait_for_overlap); |
1776 | INIT_LIST_HEAD(&conf->handle_list); | 1858 | INIT_LIST_HEAD(&conf->handle_list); |
1777 | INIT_LIST_HEAD(&conf->delayed_list); | 1859 | INIT_LIST_HEAD(&conf->delayed_list); |
1860 | INIT_LIST_HEAD(&conf->bitmap_list); | ||
1778 | INIT_LIST_HEAD(&conf->inactive_list); | 1861 | INIT_LIST_HEAD(&conf->inactive_list); |
1779 | atomic_set(&conf->active_stripes, 0); | 1862 | atomic_set(&conf->active_stripes, 0); |
1780 | atomic_set(&conf->preread_active_stripes, 0); | 1863 | atomic_set(&conf->preread_active_stripes, 0); |
@@ -1894,6 +1977,9 @@ static int run (mddev_t *mddev) | |||
1894 | /* Ok, everything is just fine now */ | 1977 | /* Ok, everything is just fine now */ |
1895 | mddev->array_size = mddev->size * (mddev->raid_disks - 2); | 1978 | mddev->array_size = mddev->size * (mddev->raid_disks - 2); |
1896 | 1979 | ||
1980 | if (mddev->bitmap) | ||
1981 | mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ; | ||
1982 | |||
1897 | mddev->queue->unplug_fn = raid6_unplug_device; | 1983 | mddev->queue->unplug_fn = raid6_unplug_device; |
1898 | mddev->queue->issue_flush_fn = raid6_issue_flush; | 1984 | mddev->queue->issue_flush_fn = raid6_issue_flush; |
1899 | return 0; | 1985 | return 0; |
@@ -2071,6 +2157,8 @@ static int raid6_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) | |||
2071 | rdev->in_sync = 0; | 2157 | rdev->in_sync = 0; |
2072 | rdev->raid_disk = disk; | 2158 | rdev->raid_disk = disk; |
2073 | found = 1; | 2159 | found = 1; |
2160 | if (rdev->saved_raid_disk != disk) | ||
2161 | conf->fullsync = 1; | ||
2074 | p->rdev = rdev; | 2162 | p->rdev = rdev; |
2075 | break; | 2163 | break; |
2076 | } | 2164 | } |
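raid6_add_disk() now distinguishes a re-added device from a fresh one: rdev->saved_raid_disk records the slot the device held according to its superblock, and if it comes back in any other slot the bitmap's dirty bits say nothing useful about it, so conf->fullsync forces a complete resync. The decision as a sketch (needs_fullsync is an illustrative name, not a kernel helper):

    /* Illustrative: can the write-intent bitmap drive recovery? */
    static int needs_fullsync(int saved_raid_disk, int new_disk)
    {
            /* bitmap bits describe writes missed in the old slot only */
            return saved_raid_disk != new_disk;
    }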
@@ -2100,6 +2188,35 @@ static int raid6_resize(mddev_t *mddev, sector_t sectors) | |||
2100 | return 0; | 2188 | return 0; |
2101 | } | 2189 | } |
2102 | 2190 | ||
2191 | static void raid6_quiesce(mddev_t *mddev, int state) | ||
2192 | { | ||
2193 | raid6_conf_t *conf = mddev_to_conf(mddev); | ||
2194 | |||
2195 | switch(state) { | ||
2196 | case 1: /* stop all writes */ | ||
2197 | spin_lock_irq(&conf->device_lock); | ||
2198 | conf->quiesce = 1; | ||
2199 | wait_event_lock_irq(conf->wait_for_stripe, | ||
2200 | atomic_read(&conf->active_stripes) == 0, | ||
2201 | conf->device_lock, /* nothing */); | ||
2202 | spin_unlock_irq(&conf->device_lock); | ||
2203 | break; | ||
2204 | |||
2205 | case 0: /* re-enable writes */ | ||
2206 | spin_lock_irq(&conf->device_lock); | ||
2207 | conf->quiesce = 0; | ||
2208 | wake_up(&conf->wait_for_stripe); | ||
2209 | spin_unlock_irq(&conf->device_lock); | ||
2210 | break; | ||
2211 | } | ||
2212 | if (mddev->thread) { | ||
2213 | if (mddev->bitmap) | ||
2214 | mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ; | ||
2215 | else | ||
2216 | mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT; | ||
2217 | md_wakeup_thread(mddev->thread); | ||
2218 | } | ||
2219 | } | ||
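raid6_quiesce() above gives md core a fence to hold while it attaches or detaches a bitmap: state 1 sets conf->quiesce and sleeps under device_lock (wait_event_lock_irq drops and retakes the lock around each wait) until every active stripe has drained; state 0 lifts the gate and wakes the waiters; in both cases the daemon timeout is then retuned, to the bitmap's daemon_sleep when one is present, else back to MAX_SCHEDULE_TIMEOUT. A compressed userspace analogue of the gate (illustrative pthread model, not the kernel API):

    #include <pthread.h>

    static pthread_mutex_t lk = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t  cv = PTHREAD_COND_INITIALIZER;
    static int active, quiesce;     /* model conf->active_stripes/quiesce */

    static void quiesce_set(int state)
    {
            pthread_mutex_lock(&lk);
            quiesce = state;
            if (state)              /* 1: stop writes, wait for drain */
                    while (active != 0)
                            pthread_cond_wait(&cv, &lk);  /* sleeps unlocked */
            else                    /* 0: re-enable and wake waiters */
                    pthread_cond_broadcast(&cv);
            pthread_mutex_unlock(&lk);
    }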
2103 | static mdk_personality_t raid6_personality= | 2220 | static mdk_personality_t raid6_personality= |
2104 | { | 2221 | { |
2105 | .name = "raid6", | 2222 | .name = "raid6", |
@@ -2114,6 +2231,7 @@ static mdk_personality_t raid6_personality= | |||
2114 | .spare_active = raid6_spare_active, | 2231 | .spare_active = raid6_spare_active, |
2115 | .sync_request = sync_request, | 2232 | .sync_request = sync_request, |
2116 | .resize = raid6_resize, | 2233 | .resize = raid6_resize, |
2234 | .quiesce = raid6_quiesce, | ||
2117 | }; | 2235 | }; |
2118 | 2236 | ||
2119 | static int __init raid6_init (void) | 2237 | static int __init raid6_init (void) |