Diffstat (limited to 'drivers/md')
 -rw-r--r--  drivers/md/bitmap.c              | 183
 -rw-r--r--  drivers/md/dm-exception-store.c  |   9
 -rw-r--r--  drivers/md/dm-raid1.c            |  12
 -rw-r--r--  drivers/md/linear.c              | 100
 -rw-r--r--  drivers/md/md.c                  | 227
 -rw-r--r--  drivers/md/multipath.c           |   5
 -rw-r--r--  drivers/md/raid0.c               |   5
 -rw-r--r--  drivers/md/raid1.c               | 234
 -rw-r--r--  drivers/md/raid10.c              |  46
 -rw-r--r--  drivers/md/raid5.c               | 138
 -rw-r--r--  drivers/md/raid6main.c           | 138
11 files changed, 856 insertions(+), 241 deletions(-)
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 41df4cda66e2..2fba2bbe72d8 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -270,19 +270,20 @@ static struct page *read_sb_page(mddev_t *mddev, long offset, unsigned long inde
 
         if (!page)
                 return ERR_PTR(-ENOMEM);
-        do {
-                ITERATE_RDEV(mddev, rdev, tmp)
-                        if (rdev->in_sync && !rdev->faulty)
-                                goto found;
-                return ERR_PTR(-EIO);
 
-found:
+        ITERATE_RDEV(mddev, rdev, tmp) {
+                if (! rdev->in_sync || rdev->faulty)
+                        continue;
+
                 target = (rdev->sb_offset << 1) + offset + index * (PAGE_SIZE/512);
 
-        } while (!sync_page_io(rdev->bdev, target, PAGE_SIZE, page, READ));
+                if (sync_page_io(rdev->bdev, target, PAGE_SIZE, page, READ)) {
+                        page->index = index;
+                        return page;
+                }
+        }
+        return ERR_PTR(-EIO);
 
-        page->index = index;
-        return page;
 }
 
 static int write_sb_page(mddev_t *mddev, long offset, struct page *page, int wait)
@@ -437,6 +438,7 @@ void bitmap_print_sb(struct bitmap *bitmap)
         printk(KERN_DEBUG " daemon sleep: %ds\n", le32_to_cpu(sb->daemon_sleep));
         printk(KERN_DEBUG " sync size: %llu KB\n",
                         (unsigned long long)le64_to_cpu(sb->sync_size)/2);
+        printk(KERN_DEBUG "max write behind: %d\n", le32_to_cpu(sb->write_behind));
         kunmap(bitmap->sb_page);
 }
 
@@ -445,7 +447,7 @@ static int bitmap_read_sb(struct bitmap *bitmap)
 {
         char *reason = NULL;
         bitmap_super_t *sb;
-        unsigned long chunksize, daemon_sleep;
+        unsigned long chunksize, daemon_sleep, write_behind;
         unsigned long bytes_read;
         unsigned long long events;
         int err = -EINVAL;
@@ -474,6 +476,7 @@ static int bitmap_read_sb(struct bitmap *bitmap)
 
         chunksize = le32_to_cpu(sb->chunksize);
         daemon_sleep = le32_to_cpu(sb->daemon_sleep);
+        write_behind = le32_to_cpu(sb->write_behind);
 
         /* verify that the bitmap-specific fields are valid */
         if (sb->magic != cpu_to_le32(BITMAP_MAGIC))
@@ -485,7 +488,9 @@ static int bitmap_read_sb(struct bitmap *bitmap)
         else if ((1 << ffz(~chunksize)) != chunksize)
                 reason = "bitmap chunksize not a power of 2";
         else if (daemon_sleep < 1 || daemon_sleep > 15)
-                reason = "daemon sleep period out of range";
+                reason = "daemon sleep period out of range (1-15s)";
+        else if (write_behind > COUNTER_MAX)
+                reason = "write-behind limit out of range (0 - 16383)";
         if (reason) {
                 printk(KERN_INFO "%s: invalid bitmap file superblock: %s\n",
                         bmname(bitmap), reason);
@@ -518,8 +523,12 @@ success:
         /* assign fields using values from superblock */
         bitmap->chunksize = chunksize;
         bitmap->daemon_sleep = daemon_sleep;
+        bitmap->daemon_lastrun = jiffies;
+        bitmap->max_write_behind = write_behind;
         bitmap->flags |= sb->state;
         bitmap->events_cleared = le64_to_cpu(sb->events_cleared);
+        if (sb->state & BITMAP_STALE)
+                bitmap->events_cleared = bitmap->mddev->events;
         err = 0;
 out:
         kunmap(bitmap->sb_page);
@@ -617,7 +626,7 @@ static void bitmap_file_unmap(struct bitmap *bitmap)
                 page_cache_release(sb_page);
 }
 
-static void bitmap_stop_daemons(struct bitmap *bitmap);
+static void bitmap_stop_daemon(struct bitmap *bitmap);
 
 /* dequeue the next item in a page list -- don't call from irq context */
 static struct page_list *dequeue_page(struct bitmap *bitmap)
@@ -659,7 +668,7 @@ static void bitmap_file_put(struct bitmap *bitmap)
         bitmap->file = NULL;
         spin_unlock_irqrestore(&bitmap->lock, flags);
 
-        bitmap_stop_daemons(bitmap);
+        bitmap_stop_daemon(bitmap);
 
         drain_write_queues(bitmap);
 
@@ -818,7 +827,7 @@ int bitmap_unplug(struct bitmap *bitmap)
         return 0;
 }
 
-static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset);
+static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int needed);
 /* * bitmap_init_from_disk -- called at bitmap_create time to initialize
  * the in-memory bitmap from the on-disk bitmap -- also, sets up the
  * memory mapping of the bitmap file
@@ -826,8 +835,11 @@ static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset);
  * if there's no bitmap file, or if the bitmap file had been
  * previously kicked from the array, we mark all the bits as
  * 1's in order to cause a full resync.
+ *
+ * We ignore all bits for sectors that end earlier than 'start'.
+ * This is used when reading an out-of-date bitmap...
  */
-static int bitmap_init_from_disk(struct bitmap *bitmap)
+static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
 {
         unsigned long i, chunks, index, oldindex, bit;
         struct page *page = NULL, *oldpage = NULL;
@@ -914,7 +926,7 @@ static int bitmap_init_from_disk(struct bitmap *bitmap)
                          * whole page and write it out
                          */
                         memset(page_address(page) + offset, 0xff,
-                               PAGE_SIZE - offset);
+                                PAGE_SIZE - offset);
                         ret = write_page(bitmap, page, 1);
                         if (ret) {
                                 kunmap(page);
@@ -928,8 +940,11 @@ static int bitmap_init_from_disk(struct bitmap *bitmap)
                 }
                 if (test_bit(bit, page_address(page))) {
                         /* if the disk bit is set, set the memory bit */
-                        bitmap_set_memory_bits(bitmap, i << CHUNK_BLOCK_SHIFT(bitmap));
+                        bitmap_set_memory_bits(bitmap, i << CHUNK_BLOCK_SHIFT(bitmap),
+                                               ((i+1) << (CHUNK_BLOCK_SHIFT(bitmap)) >= start)
+                                );
                         bit_cnt++;
+                        set_page_attr(bitmap, page, BITMAP_PAGE_CLEAN);
                 }
         }
 
@@ -1141,6 +1156,9 @@ static void bitmap_writeback_daemon(mddev_t *mddev)
                 err = -EINTR;
                 goto out;
         }
+        if (bitmap == NULL)
+                /* about to be stopped. */
+                return;
 
         PRINTK("%s: bitmap writeback daemon woke up...\n", bmname(bitmap));
         /* wait on bitmap page writebacks */
@@ -1170,21 +1188,12 @@ static void bitmap_writeback_daemon(mddev_t *mddev)
         }
 }
 
-static int bitmap_start_daemon(struct bitmap *bitmap, mdk_thread_t **ptr,
-                                void (*func)(mddev_t *), char *name)
+static mdk_thread_t *bitmap_start_daemon(struct bitmap *bitmap,
+                                void (*func)(mddev_t *), char *name)
 {
         mdk_thread_t *daemon;
-        unsigned long flags;
         char namebuf[32];
 
-        spin_lock_irqsave(&bitmap->lock, flags);
-        *ptr = NULL;
-
-        if (!bitmap->file) /* no need for daemon if there's no backing file */
-                goto out_unlock;
-
-        spin_unlock_irqrestore(&bitmap->lock, flags);
-
 #ifdef INJECT_FATAL_FAULT_2
         daemon = NULL;
 #else
@@ -1194,47 +1203,32 @@ static int bitmap_start_daemon(struct bitmap *bitmap, mdk_thread_t **ptr,
         if (!daemon) {
                 printk(KERN_ERR "%s: failed to start bitmap daemon\n",
                         bmname(bitmap));
-                return -ECHILD;
+                return ERR_PTR(-ECHILD);
         }
 
-        spin_lock_irqsave(&bitmap->lock, flags);
-        *ptr = daemon;
-
         md_wakeup_thread(daemon); /* start it running */
 
         PRINTK("%s: %s daemon (pid %d) started...\n",
                 bmname(bitmap), name, daemon->tsk->pid);
-out_unlock:
-        spin_unlock_irqrestore(&bitmap->lock, flags);
-        return 0;
-}
 
-static int bitmap_start_daemons(struct bitmap *bitmap)
-{
-        int err = bitmap_start_daemon(bitmap, &bitmap->writeback_daemon,
-                                        bitmap_writeback_daemon, "bitmap_wb");
-        return err;
+        return daemon;
 }
 
-static void bitmap_stop_daemon(struct bitmap *bitmap, mdk_thread_t **ptr)
+static void bitmap_stop_daemon(struct bitmap *bitmap)
 {
-        mdk_thread_t *daemon;
-        unsigned long flags;
-
-        spin_lock_irqsave(&bitmap->lock, flags);
-        daemon = *ptr;
-        *ptr = NULL;
-        spin_unlock_irqrestore(&bitmap->lock, flags);
-        if (daemon)
-                md_unregister_thread(daemon); /* destroy the thread */
-}
+        /* the daemon can't stop itself... it'll just exit instead... */
+        if (bitmap->writeback_daemon && ! IS_ERR(bitmap->writeback_daemon) &&
+            current->pid != bitmap->writeback_daemon->tsk->pid) {
+                mdk_thread_t *daemon;
+                unsigned long flags;
 
-static void bitmap_stop_daemons(struct bitmap *bitmap)
-{
-        /* the daemons can't stop themselves... they'll just exit instead... */
-        if (bitmap->writeback_daemon &&
-            current->pid != bitmap->writeback_daemon->tsk->pid)
-                bitmap_stop_daemon(bitmap, &bitmap->writeback_daemon);
+                spin_lock_irqsave(&bitmap->lock, flags);
+                daemon = bitmap->writeback_daemon;
+                bitmap->writeback_daemon = NULL;
+                spin_unlock_irqrestore(&bitmap->lock, flags);
+                if (daemon && ! IS_ERR(daemon))
+                        md_unregister_thread(daemon); /* destroy the thread */
+        }
 }
 
 static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap,
@@ -1274,9 +1268,16 @@ static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap,
         }
 }
 
-int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors)
+int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors, int behind)
 {
         if (!bitmap) return 0;
+
+        if (behind) {
+                atomic_inc(&bitmap->behind_writes);
+                PRINTK(KERN_DEBUG "inc write-behind count %d/%d\n",
+                       atomic_read(&bitmap->behind_writes), bitmap->max_write_behind);
+        }
+
         while (sectors) {
                 int blocks;
                 bitmap_counter_t *bmc;
@@ -1311,9 +1312,15 @@ int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sect
 }
 
 void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors,
-                     int success)
+                     int success, int behind)
 {
         if (!bitmap) return;
+        if (behind) {
+                atomic_dec(&bitmap->behind_writes);
+                PRINTK(KERN_DEBUG "dec write-behind count %d/%d\n",
+                       atomic_read(&bitmap->behind_writes), bitmap->max_write_behind);
+        }
+
         while (sectors) {
                 int blocks;
                 unsigned long flags;
@@ -1424,7 +1431,7 @@ void bitmap_close_sync(struct bitmap *bitmap)
         }
 }
 
-static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset)
+static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int needed)
 {
         /* For each chunk covered by any of these sectors, set the
          * counter to 1 and set resync_needed. They should all
@@ -1441,7 +1448,7 @@ static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset)
         }
         if (! *bmc) {
                 struct page *page;
-                *bmc = 1 | NEEDED_MASK;
+                *bmc = 1 | (needed?NEEDED_MASK:0);
                 bitmap_count_page(bitmap, offset, 1);
                 page = filemap_get_page(bitmap, offset >> CHUNK_BLOCK_SHIFT(bitmap));
                 set_page_attr(bitmap, page, BITMAP_PAGE_CLEAN);
@@ -1476,17 +1483,14 @@ void bitmap_flush(mddev_t *mddev)
 /*
  * free memory that was allocated
  */
-void bitmap_destroy(mddev_t *mddev)
+static void bitmap_free(struct bitmap *bitmap)
 {
         unsigned long k, pages;
         struct bitmap_page *bp;
-        struct bitmap *bitmap = mddev->bitmap;
 
         if (!bitmap) /* there was no bitmap */
                 return;
 
-        mddev->bitmap = NULL; /* disconnect from the md device */
-
         /* release the bitmap file and kill the daemon */
         bitmap_file_put(bitmap);
 
@@ -1504,6 +1508,17 @@ void bitmap_destroy(mddev_t *mddev)
         kfree(bp);
         kfree(bitmap);
 }
+void bitmap_destroy(mddev_t *mddev)
+{
+        struct bitmap *bitmap = mddev->bitmap;
+
+        if (!bitmap) /* there was no bitmap */
+                return;
+
+        mddev->bitmap = NULL; /* disconnect from the md device */
+
+        bitmap_free(bitmap);
+}
 
 /*
  * initialize the bitmap structure
@@ -1517,6 +1532,7 @@ int bitmap_create(mddev_t *mddev)
         unsigned long pages;
         struct file *file = mddev->bitmap_file;
         int err;
+        sector_t start;
 
         BUG_ON(sizeof(bitmap_super_t) != 256);
 
@@ -1533,15 +1549,15 @@ int bitmap_create(mddev_t *mddev)
 
         spin_lock_init(&bitmap->lock);
         bitmap->mddev = mddev;
-        mddev->bitmap = bitmap;
 
         spin_lock_init(&bitmap->write_lock);
         INIT_LIST_HEAD(&bitmap->complete_pages);
         init_waitqueue_head(&bitmap->write_wait);
         bitmap->write_pool = mempool_create(WRITE_POOL_SIZE, write_pool_alloc,
                                                 write_pool_free, NULL);
+        err = -ENOMEM;
         if (!bitmap->write_pool)
-                return -ENOMEM;
+                goto error;
 
         bitmap->file = file;
         bitmap->offset = mddev->bitmap_offset;
@@ -1549,7 +1565,7 @@
         /* read superblock from bitmap file (this sets bitmap->chunksize) */
         err = bitmap_read_sb(bitmap);
         if (err)
-                return err;
+                goto error;
 
         bitmap->chunkshift = find_first_bit(&bitmap->chunksize,
                                                 sizeof(bitmap->chunksize));
@@ -1573,27 +1589,44 @@ int bitmap_create(mddev_t *mddev)
 #else
         bitmap->bp = kmalloc(pages * sizeof(*bitmap->bp), GFP_KERNEL);
 #endif
+        err = -ENOMEM;
         if (!bitmap->bp)
-                return -ENOMEM;
+                goto error;
         memset(bitmap->bp, 0, pages * sizeof(*bitmap->bp));
 
         bitmap->flags |= BITMAP_ACTIVE;
 
         /* now that we have some pages available, initialize the in-memory
          * bitmap from the on-disk bitmap */
-        err = bitmap_init_from_disk(bitmap);
+        start = 0;
+        if (mddev->degraded == 0
+            || bitmap->events_cleared == mddev->events)
+                /* no need to keep dirty bits to optimise a re-add of a missing device */
+                start = mddev->recovery_cp;
+        err = bitmap_init_from_disk(bitmap, start);
 
         if (err)
-                return err;
+                goto error;
 
         printk(KERN_INFO "created bitmap (%lu pages) for device %s\n",
                 pages, bmname(bitmap));
 
-        /* kick off the bitmap daemons */
-        err = bitmap_start_daemons(bitmap);
-        if (err)
-                return err;
+        mddev->bitmap = bitmap;
+
+        if (file)
+                /* kick off the bitmap writeback daemon */
+                bitmap->writeback_daemon =
+                        bitmap_start_daemon(bitmap,
+                                            bitmap_writeback_daemon,
+                                            "bitmap_wb");
+
+        if (IS_ERR(bitmap->writeback_daemon))
+                return PTR_ERR(bitmap->writeback_daemon);
         return bitmap_update_sb(bitmap);
+
+error:
+        bitmap_free(bitmap);
+        return err;
 }
 
 /* the bitmap API -- for raid personalities */
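The new `behind' argument threaded through bitmap_startwrite()/bitmap_endwrite() is plain reference counting around asynchronous ("write-behind") writes: raid1 may report a write complete before a write-mostly mirror has acknowledged it, as long as no more than max_write_behind such writes are in flight. A stand-alone sketch of that counter discipline (user-space C11; the names are illustrative, not the bitmap.c API):

#include <stdatomic.h>
#include <stdbool.h>

struct wb_state {
        atomic_int behind_writes;       /* writes still pending on slow mirrors */
        int max_write_behind;           /* 0..16383, from the bitmap superblock */
};

/* May this write complete to the caller before the write-mostly
 * device acknowledges it? */
static bool wb_start_write(struct wb_state *s, bool want_behind)
{
        if (want_behind &&
            atomic_load(&s->behind_writes) < s->max_write_behind) {
                atomic_fetch_add(&s->behind_writes, 1);
                return true;            /* go behind; signal completion early */
        }
        return false;                   /* fall back to a synchronous write */
}

static void wb_end_write(struct wb_state *s, bool was_behind)
{
        if (was_behind)
                atomic_fetch_sub(&s->behind_writes, 1);
}

As in the kernel, the check and the increment are not one atomic step; the limit is a soft cap bounding how much unacknowledged data a slow mirror can accumulate, not a hard admission gate.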
diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c
index 17212b4201a1..cc07bbebbb16 100644
--- a/drivers/md/dm-exception-store.c
+++ b/drivers/md/dm-exception-store.c
@@ -568,12 +568,9 @@ int dm_create_persistent(struct exception_store *store, uint32_t chunk_size)
 
 bad:
         dm_io_put(sectors_to_pages(chunk_size));
-        if (ps) {
-                if (ps->area)
-                        free_area(ps);
-
-                kfree(ps);
-        }
+        if (ps && ps->area)
+                free_area(ps);
+        kfree(ps);
         return r;
 }
 
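The error-path rewrite above leans on the fact that kfree(), like user-space free(), accepts NULL and does nothing, so only the resource with a real precondition (free_area() wants a live ps->area) keeps its guard. The same idiom in plain C:

#include <stdlib.h>

struct store { char *area; };   /* stand-in for the pstore in the diff */

static void store_teardown(struct store *ps)
{
        if (ps && ps->area)     /* only the inner resource needs a check */
                free(ps->area);
        free(ps);               /* free(NULL) is defined to be a no-op */
}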
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index b08df8b9b2ca..863282513753 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -375,16 +375,18 @@ static void rh_inc(struct region_hash *rh, region_t region)
 
         read_lock(&rh->hash_lock);
         reg = __rh_find(rh, region);
+
+        atomic_inc(&reg->pending);
+
+        spin_lock_irq(&rh->region_lock);
         if (reg->state == RH_CLEAN) {
                 rh->log->type->mark_region(rh->log, reg->key);
 
-                spin_lock_irq(&rh->region_lock);
                 reg->state = RH_DIRTY;
                 list_del_init(&reg->list); /* take off the clean list */
-                spin_unlock_irq(&rh->region_lock);
         }
+        spin_unlock_irq(&rh->region_lock);
 
-        atomic_inc(&reg->pending);
         read_unlock(&rh->hash_lock);
 }
 
@@ -408,6 +410,10 @@ static void rh_dec(struct region_hash *rh, region_t region)
 
         if (atomic_dec_and_test(&reg->pending)) {
                 spin_lock_irqsave(&rh->region_lock, flags);
+                if (atomic_read(&reg->pending)) { /* check race */
+                        spin_unlock_irqrestore(&rh->region_lock, flags);
+                        return;
+                }
                 if (reg->state == RH_RECOVERING) {
                         list_add_tail(&reg->list, &rh->quiesced_regions);
                 } else {
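rh_inc() now bumps `pending' before taking region_lock, so rh_dec() can see the count reach zero while another CPU is in the middle of reviving the region; the added "check race" re-read under the lock closes that window. A minimal pthreads sketch of the dec-then-recheck pattern (illustrative types, not the dm-raid1 ones):

#include <stdatomic.h>
#include <pthread.h>

struct region {
        atomic_int pending;
        pthread_mutex_t lock;
        /* ... list linkage and state ... */
};

static void region_put(struct region *reg)
{
        if (atomic_fetch_sub(&reg->pending, 1) != 1)
                return;                         /* not the last reference */

        pthread_mutex_lock(&reg->lock);
        if (atomic_load(&reg->pending)) {       /* raced with a new reference? */
                pthread_mutex_unlock(&reg->lock);
                return;                         /* someone revived the region */
        }
        /* genuinely idle: safe to move it to the clean/quiesced list here */
        pthread_mutex_unlock(&reg->lock);
}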
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 8d740013d74d..bb279fad2fd2 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -38,7 +38,8 @@ static inline dev_info_t *which_dev(mddev_t *mddev, sector_t sector)
         /*
          * sector_div(a,b) returns the remainer and sets a to a/b
          */
-        (void)sector_div(block, conf->smallest->size);
+        block >>= conf->preshift;
+        (void)sector_div(block, conf->hash_spacing);
         hash = conf->hash_table[block];
 
         while ((sector>>1) >= (hash->size + hash->offset))
@@ -47,7 +48,7 @@ static inline dev_info_t *which_dev(mddev_t *mddev, sector_t sector)
 }
 
 /**
- * linear_mergeable_bvec -- tell bio layer if a two requests can be merged
+ * linear_mergeable_bvec -- tell bio layer if two requests can be merged
  * @q: request queue
  * @bio: the buffer head that's been built up so far
  * @biovec: the request that could be merged to it.
@@ -116,7 +117,7 @@ static int linear_run (mddev_t *mddev)
         dev_info_t **table;
         mdk_rdev_t *rdev;
         int i, nb_zone, cnt;
-        sector_t start;
+        sector_t min_spacing;
         sector_t curr_offset;
         struct list_head *tmp;
 
@@ -127,11 +128,6 @@ static int linear_run (mddev_t *mddev)
         memset(conf, 0, sizeof(*conf) + mddev->raid_disks*sizeof(dev_info_t));
         mddev->private = conf;
 
-        /*
-         * Find the smallest device.
-         */
-
-        conf->smallest = NULL;
         cnt = 0;
         mddev->array_size = 0;
 
@@ -159,8 +155,6 @@ static int linear_run (mddev_t *mddev)
                 disk->size = rdev->size;
                 mddev->array_size += rdev->size;
 
-                if (!conf->smallest || (disk->size < conf->smallest->size))
-                        conf->smallest = disk;
                 cnt++;
         }
         if (cnt != mddev->raid_disks) {
@@ -168,6 +162,36 @@ static int linear_run (mddev_t *mddev)
                 goto out;
         }
 
+        min_spacing = mddev->array_size;
+        sector_div(min_spacing, PAGE_SIZE/sizeof(struct dev_info *));
+
+        /* min_spacing is the minimum spacing that will fit the hash
+         * table in one PAGE. This may be much smaller than needed.
+         * We find the smallest non-terminal set of consecutive devices
+         * that is larger than min_spacing as use the size of that as
+         * the actual spacing
+         */
+        conf->hash_spacing = mddev->array_size;
+        for (i=0; i < cnt-1 ; i++) {
+                sector_t sz = 0;
+                int j;
+                for (j=i; i<cnt-1 && sz < min_spacing ; j++)
+                        sz += conf->disks[j].size;
+                if (sz >= min_spacing && sz < conf->hash_spacing)
+                        conf->hash_spacing = sz;
+        }
+
+        /* hash_spacing may be too large for sector_div to work with,
+         * so we might need to pre-shift
+         */
+        conf->preshift = 0;
+        if (sizeof(sector_t) > sizeof(u32)) {
+                sector_t space = conf->hash_spacing;
+                while (space > (sector_t)(~(u32)0)) {
+                        space >>= 1;
+                        conf->preshift++;
+                }
+        }
         /*
          * This code was restructured to work around a gcc-2.95.3 internal
          * compiler error. Alter it with care.
@@ -177,39 +201,52 @@ static int linear_run (mddev_t *mddev)
                 unsigned round;
                 unsigned long base;
 
-                sz = mddev->array_size;
-                base = conf->smallest->size;
+                sz = mddev->array_size >> conf->preshift;
+                sz += 1; /* force round-up */
+                base = conf->hash_spacing >> conf->preshift;
                 round = sector_div(sz, base);
-                nb_zone = conf->nr_zones = sz + (round ? 1 : 0);
+                nb_zone = sz + (round ? 1 : 0);
         }
-
-        conf->hash_table = kmalloc (sizeof (dev_info_t*) * nb_zone,
+        BUG_ON(nb_zone > PAGE_SIZE / sizeof(struct dev_info *));
+
+        conf->hash_table = kmalloc (sizeof (struct dev_info *) * nb_zone,
                                         GFP_KERNEL);
         if (!conf->hash_table)
                 goto out;
 
         /*
          * Here we generate the linear hash table
+         * First calculate the device offsets.
          */
+        conf->disks[0].offset = 0;
+        for (i=1; i<mddev->raid_disks; i++)
+                conf->disks[i].offset =
+                        conf->disks[i-1].offset +
+                        conf->disks[i-1].size;
+
         table = conf->hash_table;
-        start = 0;
         curr_offset = 0;
-        for (i = 0; i < cnt; i++) {
-                dev_info_t *disk = conf->disks + i;
+        i = 0;
+        for (curr_offset = 0;
+             curr_offset < mddev->array_size;
+             curr_offset += conf->hash_spacing) {
 
-                disk->offset = curr_offset;
-                curr_offset += disk->size;
+                while (i < mddev->raid_disks-1 &&
+                       curr_offset >= conf->disks[i+1].offset)
+                        i++;
 
-                /* 'curr_offset' is the end of this disk
-                 * 'start' is the start of table
+                *table ++ = conf->disks + i;
+        }
+
+        if (conf->preshift) {
+                conf->hash_spacing >>= conf->preshift;
+                /* round hash_spacing up so that when we divide by it,
+                 * we err on the side of "too-low", which is safest.
                  */
-                while (start < curr_offset) {
-                        *table++ = disk;
-                        start += conf->smallest->size;
-                }
+                conf->hash_spacing++;
         }
-        if (table-conf->hash_table != nb_zone)
-                BUG();
+
+        BUG_ON(table - conf->hash_table > nb_zone);
 
         blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec);
         mddev->queue->unplug_fn = linear_unplug;
@@ -238,6 +275,11 @@ static int linear_make_request (request_queue_t *q, struct bio *bio)
         dev_info_t *tmp_dev;
         sector_t block;
 
+        if (unlikely(bio_barrier(bio))) {
+                bio_endio(bio, bio->bi_size, -EOPNOTSUPP);
+                return 0;
+        }
+
         if (bio_data_dir(bio)==WRITE) {
                 disk_stat_inc(mddev->gendisk, writes);
                 disk_stat_add(mddev->gendisk, write_sectors, bio_sectors(bio));
@@ -294,7 +336,7 @@ static void linear_status (struct seq_file *seq, mddev_t *mddev)
         sector_t s = 0;
 
         seq_printf(seq, " ");
-        for (j = 0; j < conf->nr_zones; j++)
+        for (j = 0; j < mddev->raid_disks; j++)
         {
                 char b[BDEVNAME_SIZE];
                 s += conf->smallest_size;
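With the smallest-device spacing gone, the lookup in which_dev() becomes: pre-shift the 1K block number so sector_div()'s 32-bit divisor is enough, divide by hash_spacing to pick a slot, then walk forward to the device that actually covers the sector. A user-space sketch, with uint64_t standing in for sector_t and the hash entries pointing into a contiguous disks[] array as in the patch:

#include <stdint.h>

struct dev_info { uint64_t offset, size; };     /* both in 1K blocks */

struct linear_conf {
        struct dev_info **hash_table;   /* one entry per hash_spacing */
        uint64_t hash_spacing;          /* already >> preshift, rounded up */
        unsigned preshift;              /* keeps the divisor within 32 bits */
};

static struct dev_info *sketch_which_dev(struct linear_conf *conf,
                                         uint64_t sector)
{
        uint64_t block = sector >> 1;   /* 512-byte sectors -> 1K blocks */
        struct dev_info *hash;

        block >>= conf->preshift;
        hash = conf->hash_table[block / conf->hash_spacing];

        while ((sector >> 1) >= hash->size + hash->offset)
                hash++;                 /* at most a short forward walk */
        return hash;
}

Rounding hash_spacing up after the shift makes the division err low, so the walk only ever has to move forward.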
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 20ca80b7dc20..2897df90df44 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -34,6 +34,7 @@
 
 #include <linux/module.h>
 #include <linux/config.h>
+#include <linux/kthread.h>
 #include <linux/linkage.h>
 #include <linux/raid/md.h>
 #include <linux/raid/bitmap.h>
@@ -73,7 +74,7 @@ static DEFINE_SPINLOCK(pers_lock);
  * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
  * is 1000 KB/sec, so the extra system load does not show up that much.
  * Increase it if you want to have more _guaranteed_ speed. Note that
- * the RAID driver will use the maximum available bandwith if the IO
+ * the RAID driver will use the maximum available bandwidth if the IO
  * subsystem is idle. There is also an 'absolute maximum' reconstruction
  * speed limit - in case reconstruction slows down your system despite
  * idle IO detection.
@@ -393,7 +394,7 @@ int sync_page_io(struct block_device *bdev, sector_t sector, int size,
         return ret;
 }
 
-static int read_disk_sb(mdk_rdev_t * rdev)
+static int read_disk_sb(mdk_rdev_t * rdev, int size)
 {
         char b[BDEVNAME_SIZE];
         if (!rdev->sb_page) {
@@ -404,7 +405,7 @@ static int read_disk_sb(mdk_rdev_t * rdev)
                 return 0;
 
 
-        if (!sync_page_io(rdev->bdev, rdev->sb_offset<<1, MD_SB_BYTES, rdev->sb_page, READ))
+        if (!sync_page_io(rdev->bdev, rdev->sb_offset<<1, size, rdev->sb_page, READ))
                 goto fail;
         rdev->sb_loaded = 1;
         return 0;
@@ -531,7 +532,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version
         sb_offset = calc_dev_sboffset(rdev->bdev);
         rdev->sb_offset = sb_offset;
 
-        ret = read_disk_sb(rdev);
+        ret = read_disk_sb(rdev, MD_SB_BYTES);
         if (ret) return ret;
 
         ret = -EINVAL;
@@ -564,6 +565,7 @@
 
         rdev->preferred_minor = sb->md_minor;
         rdev->data_offset = 0;
+        rdev->sb_size = MD_SB_BYTES;
 
         if (sb->level == LEVEL_MULTIPATH)
                 rdev->desc_nr = -1;
@@ -623,6 +625,7 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
                 mddev->size = sb->size;
                 mddev->events = md_event(sb);
                 mddev->bitmap_offset = 0;
+                mddev->default_bitmap_offset = MD_SB_BYTES >> 9;
 
                 if (sb->state & (1<<MD_SB_CLEAN))
                         mddev->recovery_cp = MaxSector;
@@ -643,12 +646,12 @@
 
                 if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
                     mddev->bitmap_file == NULL) {
-                        if (mddev->level != 1) {
+                        if (mddev->level != 1 && mddev->level != 5 && mddev->level != 6) {
                                 /* FIXME use a better test */
                                 printk(KERN_WARNING "md: bitmaps only support for raid1\n");
                                 return -EINVAL;
                         }
-                        mddev->bitmap_offset = (MD_SB_BYTES >> 9);
+                        mddev->bitmap_offset = mddev->default_bitmap_offset;
                 }
 
         } else if (mddev->pers == NULL) {
@@ -669,6 +672,7 @@
 
         if (mddev->level != LEVEL_MULTIPATH) {
                 rdev->faulty = 0;
+                rdev->flags = 0;
                 desc = sb->disks + rdev->desc_nr;
 
                 if (desc->state & (1<<MD_DISK_FAULTY))
@@ -678,6 +682,8 @@
                         rdev->in_sync = 1;
                         rdev->raid_disk = desc->raid_disk;
                 }
+                if (desc->state & (1<<MD_DISK_WRITEMOSTLY))
+                        set_bit(WriteMostly, &rdev->flags);
         } else /* MULTIPATH are always insync */
                 rdev->in_sync = 1;
         return 0;
@@ -706,6 +712,8 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
         int i;
         int active=0, working=0,failed=0,spare=0,nr_disks=0;
 
+        rdev->sb_size = MD_SB_BYTES;
+
         sb = (mdp_super_t*)page_address(rdev->sb_page);
 
         memset(sb, 0, sizeof(*sb));
@@ -776,6 +784,8 @@
                         spare++;
                         working++;
                 }
+                if (test_bit(WriteMostly, &rdev2->flags))
+                        d->state |= (1<<MD_DISK_WRITEMOSTLY);
         }
 
         /* now set the "removed" and "faulty" bits on any missing devices */
@@ -831,6 +841,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
         int ret;
         sector_t sb_offset;
         char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
+        int bmask;
 
         /*
          * Calculate the position of the superblock.
@@ -859,7 +870,10 @@
         }
         rdev->sb_offset = sb_offset;
 
-        ret = read_disk_sb(rdev);
+        /* superblock is rarely larger than 1K, but it can be larger,
+         * and it is safe to read 4k, so we do that
+         */
+        ret = read_disk_sb(rdev, 4096);
         if (ret) return ret;
 
 
@@ -869,7 +883,7 @@
             sb->major_version != cpu_to_le32(1) ||
             le32_to_cpu(sb->max_dev) > (4096-256)/2 ||
             le64_to_cpu(sb->super_offset) != (rdev->sb_offset<<1) ||
-            sb->feature_map != 0)
+            (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0)
                 return -EINVAL;
 
         if (calc_sb_1_csum(sb) != sb->sb_csum) {
@@ -885,6 +899,11 @@
         rdev->preferred_minor = 0xffff;
         rdev->data_offset = le64_to_cpu(sb->data_offset);
 
+        rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256;
+        bmask = queue_hardsect_size(rdev->bdev->bd_disk->queue)-1;
+        if (rdev->sb_size & bmask)
+                rdev-> sb_size = (rdev->sb_size | bmask)+1;
+
         if (refdev == 0)
                 return 1;
         else {
@@ -939,13 +958,15 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
                 mddev->size = le64_to_cpu(sb->size)/2;
                 mddev->events = le64_to_cpu(sb->events);
                 mddev->bitmap_offset = 0;
+                mddev->default_bitmap_offset = 0;
+                mddev->default_bitmap_offset = 1024;
 
                 mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
                 memcpy(mddev->uuid, sb->set_uuid, 16);
 
                 mddev->max_disks = (4096-256)/2;
 
-                if ((le32_to_cpu(sb->feature_map) & 1) &&
+                if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) &&
                     mddev->bitmap_file == NULL ) {
                         if (mddev->level != 1) {
                                 printk(KERN_WARNING "md: bitmaps only supported for raid1\n");
@@ -986,6 +1007,9 @@
                         rdev->raid_disk = role;
                         break;
                 }
+                rdev->flags = 0;
+                if (sb->devflags & WriteMostly1)
+                        set_bit(WriteMostly, &rdev->flags);
         } else /* MULTIPATH are always insync */
                 rdev->in_sync = 1;
 
@@ -1017,7 +1041,7 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
 
         if (mddev->bitmap && mddev->bitmap_file == NULL) {
                 sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset);
-                sb->feature_map = cpu_to_le32(1);
+                sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET);
         }
 
         max_dev = 0;
@@ -1363,7 +1387,7 @@ repeat:
                 dprintk("%s ", bdevname(rdev->bdev,b));
                 if (!rdev->faulty) {
                         md_super_write(mddev,rdev,
-                                       rdev->sb_offset<<1, MD_SB_BYTES,
+                                       rdev->sb_offset<<1, rdev->sb_size,
                                        rdev->sb_page);
                         dprintk(KERN_INFO "(write) %s's sb offset: %llu\n",
                                 bdevname(rdev->bdev,b),
@@ -2073,6 +2097,8 @@ static int get_array_info(mddev_t * mddev, void __user * arg)
         info.state = 0;
         if (mddev->in_sync)
                 info.state = (1<<MD_SB_CLEAN);
+        if (mddev->bitmap && mddev->bitmap_offset)
+                info.state = (1<<MD_SB_BITMAP_PRESENT);
         info.active_disks = active;
         info.working_disks = working;
         info.failed_disks = failed;
@@ -2087,7 +2113,7 @@ static int get_array_info(mddev_t * mddev, void __user * arg)
         return 0;
 }
 
-static int get_bitmap_file(mddev_t * mddev, void * arg)
+static int get_bitmap_file(mddev_t * mddev, void __user * arg)
 {
         mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */
         char *ptr, *buf = NULL;
@@ -2146,6 +2172,8 @@ static int get_disk_info(mddev_t * mddev, void __user * arg)
                         info.state |= (1<<MD_DISK_ACTIVE);
                         info.state |= (1<<MD_DISK_SYNC);
                 }
+                if (test_bit(WriteMostly, &rdev->flags))
+                        info.state |= (1<<MD_DISK_WRITEMOSTLY);
         } else {
                 info.major = info.minor = 0;
                 info.raid_disk = -1;
@@ -2210,8 +2238,11 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
                                 mdname(mddev));
                         return -EINVAL;
                 }
-                rdev = md_import_device(dev, mddev->major_version,
-                                        mddev->minor_version);
+                if (mddev->persistent)
+                        rdev = md_import_device(dev, mddev->major_version,
+                                                mddev->minor_version);
+                else
+                        rdev = md_import_device(dev, -1, -1);
                 if (IS_ERR(rdev)) {
                         printk(KERN_WARNING
                                 "md: md_import_device returned %ld\n",
@@ -2231,6 +2262,9 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
                         rdev->saved_raid_disk = rdev->raid_disk;
 
                 rdev->in_sync = 0; /* just to be sure */
+                if (info->state & (1<<MD_DISK_WRITEMOSTLY))
+                        set_bit(WriteMostly, &rdev->flags);
+
                 rdev->raid_disk = -1;
                 err = bind_rdev_to_array(rdev, mddev);
                 if (err)
@@ -2271,6 +2305,9 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
                 else
                         rdev->in_sync = 0;
 
+                if (info->state & (1<<MD_DISK_WRITEMOSTLY))
+                        set_bit(WriteMostly, &rdev->flags);
+
                 err = bind_rdev_to_array(rdev, mddev);
                 if (err) {
                         export_rdev(rdev);
@@ -2430,25 +2467,51 @@ static int set_bitmap_file(mddev_t *mddev, int fd)
 {
         int err;
 
-        if (mddev->pers)
-                return -EBUSY;
+        if (mddev->pers) {
+                if (!mddev->pers->quiesce)
+                        return -EBUSY;
+                if (mddev->recovery || mddev->sync_thread)
+                        return -EBUSY;
+                /* we should be able to change the bitmap.. */
+        }
 
-        mddev->bitmap_file = fget(fd);
 
-        if (mddev->bitmap_file == NULL) {
-                printk(KERN_ERR "%s: error: failed to get bitmap file\n",
-                       mdname(mddev));
-                return -EBADF;
-        }
+        if (fd >= 0) {
+                if (mddev->bitmap)
+                        return -EEXIST; /* cannot add when bitmap is present */
+                mddev->bitmap_file = fget(fd);
 
-        err = deny_bitmap_write_access(mddev->bitmap_file);
-        if (err) {
-                printk(KERN_ERR "%s: error: bitmap file is already in use\n",
-                       mdname(mddev));
-                fput(mddev->bitmap_file);
-                mddev->bitmap_file = NULL;
-        } else
+                if (mddev->bitmap_file == NULL) {
+                        printk(KERN_ERR "%s: error: failed to get bitmap file\n",
+                               mdname(mddev));
+                        return -EBADF;
+                }
+
+                err = deny_bitmap_write_access(mddev->bitmap_file);
+                if (err) {
+                        printk(KERN_ERR "%s: error: bitmap file is already in use\n",
+                               mdname(mddev));
+                        fput(mddev->bitmap_file);
+                        mddev->bitmap_file = NULL;
+                        return err;
+                }
                 mddev->bitmap_offset = 0; /* file overrides offset */
+        } else if (mddev->bitmap == NULL)
+                return -ENOENT; /* cannot remove what isn't there */
+        err = 0;
+        if (mddev->pers) {
+                mddev->pers->quiesce(mddev, 1);
+                if (fd >= 0)
+                        err = bitmap_create(mddev);
+                if (fd < 0 || err)
+                        bitmap_destroy(mddev);
+                mddev->pers->quiesce(mddev, 0);
+        } else if (fd < 0) {
+                if (mddev->bitmap_file)
+                        fput(mddev->bitmap_file);
+                mddev->bitmap_file = NULL;
+        }
+
         return err;
 }
 
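Adding or removing a bitmap on a live array now follows a fixed protocol: quiesce writes, create or destroy the bitmap, resume. The control flow reduces to the sketch below (stub types and hooks; in md the real ones are mddev->pers->quiesce(), bitmap_create() and bitmap_destroy()):

#include <stdio.h>

struct mddev { int bitmap; int quiesced; };

static void quiesce(struct mddev *m, int on) { m->quiesced = on; }
static int  bitmap_create(struct mddev *m)   { m->bitmap = 1; return 0; }
static void bitmap_destroy(struct mddev *m)  { m->bitmap = 0; }

/* fd >= 0 adds a file-backed bitmap, fd < 0 removes the current one,
 * mirroring the new set_bitmap_file() flow on a running array. */
static int set_bitmap(struct mddev *m, int fd)
{
        int err = 0;

        quiesce(m, 1);                  /* block new writes, drain old ones */
        if (fd >= 0)
                err = bitmap_create(m);
        if (fd < 0 || err)
                bitmap_destroy(m);      /* remove, or undo a failed create */
        quiesce(m, 0);                  /* resume normal I/O */
        return err;
}

int main(void)
{
        struct mddev m = { 0, 0 };
        printf("add: %d bitmap=%d\n", set_bitmap(&m, 3), m.bitmap);
        printf("del: %d bitmap=%d\n", set_bitmap(&m, -1), m.bitmap);
        return 0;
}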
| @@ -2528,6 +2591,11 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info) | |||
| 2528 | { | 2591 | { |
| 2529 | int rv = 0; | 2592 | int rv = 0; |
| 2530 | int cnt = 0; | 2593 | int cnt = 0; |
| 2594 | int state = 0; | ||
| 2595 | |||
| 2596 | /* calculate expected state,ignoring low bits */ | ||
| 2597 | if (mddev->bitmap && mddev->bitmap_offset) | ||
| 2598 | state |= (1 << MD_SB_BITMAP_PRESENT); | ||
| 2531 | 2599 | ||
| 2532 | if (mddev->major_version != info->major_version || | 2600 | if (mddev->major_version != info->major_version || |
| 2533 | mddev->minor_version != info->minor_version || | 2601 | mddev->minor_version != info->minor_version || |
| @@ -2536,12 +2604,16 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info) | |||
| 2536 | mddev->level != info->level || | 2604 | mddev->level != info->level || |
| 2537 | /* mddev->layout != info->layout || */ | 2605 | /* mddev->layout != info->layout || */ |
| 2538 | !mddev->persistent != info->not_persistent|| | 2606 | !mddev->persistent != info->not_persistent|| |
| 2539 | mddev->chunk_size != info->chunk_size ) | 2607 | mddev->chunk_size != info->chunk_size || |
| 2608 | /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */ | ||
| 2609 | ((state^info->state) & 0xfffffe00) | ||
| 2610 | ) | ||
| 2540 | return -EINVAL; | 2611 | return -EINVAL; |
| 2541 | /* Check there is only one change */ | 2612 | /* Check there is only one change */ |
| 2542 | if (mddev->size != info->size) cnt++; | 2613 | if (mddev->size != info->size) cnt++; |
| 2543 | if (mddev->raid_disks != info->raid_disks) cnt++; | 2614 | if (mddev->raid_disks != info->raid_disks) cnt++; |
| 2544 | if (mddev->layout != info->layout) cnt++; | 2615 | if (mddev->layout != info->layout) cnt++; |
| 2616 | if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) cnt++; | ||
| 2545 | if (cnt == 0) return 0; | 2617 | if (cnt == 0) return 0; |
| 2546 | if (cnt > 1) return -EINVAL; | 2618 | if (cnt > 1) return -EINVAL; |
| 2547 | 2619 | ||
| @@ -2620,6 +2692,35 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info) | |||
| 2620 | } | 2692 | } |
| 2621 | } | 2693 | } |
| 2622 | } | 2694 | } |
| 2695 | if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) { | ||
| 2696 | if (mddev->pers->quiesce == NULL) | ||
| 2697 | return -EINVAL; | ||
| 2698 | if (mddev->recovery || mddev->sync_thread) | ||
| 2699 | return -EBUSY; | ||
| 2700 | if (info->state & (1<<MD_SB_BITMAP_PRESENT)) { | ||
| 2701 | /* add the bitmap */ | ||
| 2702 | if (mddev->bitmap) | ||
| 2703 | return -EEXIST; | ||
| 2704 | if (mddev->default_bitmap_offset == 0) | ||
| 2705 | return -EINVAL; | ||
| 2706 | mddev->bitmap_offset = mddev->default_bitmap_offset; | ||
| 2707 | mddev->pers->quiesce(mddev, 1); | ||
| 2708 | rv = bitmap_create(mddev); | ||
| 2709 | if (rv) | ||
| 2710 | bitmap_destroy(mddev); | ||
| 2711 | mddev->pers->quiesce(mddev, 0); | ||
| 2712 | } else { | ||
| 2713 | /* remove the bitmap */ | ||
| 2714 | if (!mddev->bitmap) | ||
| 2715 | return -ENOENT; | ||
| 2716 | if (mddev->bitmap->file) | ||
| 2717 | return -EINVAL; | ||
| 2718 | mddev->pers->quiesce(mddev, 1); | ||
| 2719 | bitmap_destroy(mddev); | ||
| 2720 | mddev->pers->quiesce(mddev, 0); | ||
| 2721 | mddev->bitmap_offset = 0; | ||
| 2722 | } | ||
| 2723 | } | ||
| 2623 | md_update_sb(mddev); | 2724 | md_update_sb(mddev); |
| 2624 | return rv; | 2725 | return rv; |
| 2625 | } | 2726 | } |
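A hedged userspace counterpart to the hunk above, requesting an internal bitmap by flipping exactly one bit of array state; GET_ARRAY_INFO/SET_ARRAY_INFO and MD_SB_BITMAP_PRESENT are assumed from this kernel's md_u.h and md_p.h:

        /* sketch: ask md to create an internal bitmap at the default
         * offset.  Exactly one field may change per SET_ARRAY_INFO call,
         * otherwise the kernel returns -EINVAL. */
        #include <sys/ioctl.h>
        #include <linux/raid/md_u.h>
        #include <linux/raid/md_p.h>

        int md_add_internal_bitmap(int md_fd)
        {
                mdu_array_info_t info;

                if (ioctl(md_fd, GET_ARRAY_INFO, &info) < 0)
                        return -1;
                info.state |= (1 << MD_SB_BITMAP_PRESENT);
                return ioctl(md_fd, SET_ARRAY_INFO, &info);
        }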
| @@ -2781,7 +2882,7 @@ static int md_ioctl(struct inode *inode, struct file *file, | |||
| 2781 | goto done_unlock; | 2882 | goto done_unlock; |
| 2782 | 2883 | ||
| 2783 | case GET_BITMAP_FILE: | 2884 | case GET_BITMAP_FILE: |
| 2784 | err = get_bitmap_file(mddev, (void *)arg); | 2885 | err = get_bitmap_file(mddev, argp); |
| 2785 | goto done_unlock; | 2886 | goto done_unlock; |
| 2786 | 2887 | ||
| 2787 | case GET_DISK_INFO: | 2888 | case GET_DISK_INFO: |
| @@ -2950,18 +3051,6 @@ static int md_thread(void * arg) | |||
| 2950 | { | 3051 | { |
| 2951 | mdk_thread_t *thread = arg; | 3052 | mdk_thread_t *thread = arg; |
| 2952 | 3053 | ||
| 2953 | lock_kernel(); | ||
| 2954 | |||
| 2955 | /* | ||
| 2956 | * Detach thread | ||
| 2957 | */ | ||
| 2958 | |||
| 2959 | daemonize(thread->name, mdname(thread->mddev)); | ||
| 2960 | |||
| 2961 | current->exit_signal = SIGCHLD; | ||
| 2962 | allow_signal(SIGKILL); | ||
| 2963 | thread->tsk = current; | ||
| 2964 | |||
| 2965 | /* | 3054 | /* |
| 2966 | * md_thread is a 'system-thread', its priority should be very | 3055 |
| 2967 | * high. We avoid resource deadlocks individually in each | 3056 | * high. We avoid resource deadlocks individually in each |
| @@ -2973,14 +3062,14 @@ static int md_thread(void * arg) | |||
| 2973 | * bdflush, otherwise bdflush will deadlock if there are too | 3062 | * bdflush, otherwise bdflush will deadlock if there are too |
| 2974 | * many dirty RAID5 blocks. | 3063 | * many dirty RAID5 blocks. |
| 2975 | */ | 3064 | */ |
| 2976 | unlock_kernel(); | ||
| 2977 | 3065 | ||
| 2978 | complete(thread->event); | 3066 | complete(thread->event); |
| 2979 | while (thread->run) { | 3067 | while (!kthread_should_stop()) { |
| 2980 | void (*run)(mddev_t *); | 3068 | void (*run)(mddev_t *); |
| 2981 | 3069 | ||
| 2982 | wait_event_interruptible_timeout(thread->wqueue, | 3070 | wait_event_interruptible_timeout(thread->wqueue, |
| 2983 | test_bit(THREAD_WAKEUP, &thread->flags), | 3071 | test_bit(THREAD_WAKEUP, &thread->flags) |
| 3072 | || kthread_should_stop(), | ||
| 2984 | thread->timeout); | 3073 | thread->timeout); |
| 2985 | try_to_freeze(); | 3074 | try_to_freeze(); |
| 2986 | 3075 | ||
| @@ -2989,11 +3078,8 @@ static int md_thread(void * arg) | |||
| 2989 | run = thread->run; | 3078 | run = thread->run; |
| 2990 | if (run) | 3079 | if (run) |
| 2991 | run(thread->mddev); | 3080 | run(thread->mddev); |
| 2992 | |||
| 2993 | if (signal_pending(current)) | ||
| 2994 | flush_signals(current); | ||
| 2995 | } | 3081 | } |
| 2996 | complete(thread->event); | 3082 | |
| 2997 | return 0; | 3083 | return 0; |
| 2998 | } | 3084 | } |
| 2999 | 3085 | ||
| @@ -3010,11 +3096,9 @@ mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev, | |||
| 3010 | const char *name) | 3096 | const char *name) |
| 3011 | { | 3097 | { |
| 3012 | mdk_thread_t *thread; | 3098 | mdk_thread_t *thread; |
| 3013 | int ret; | ||
| 3014 | struct completion event; | 3099 | struct completion event; |
| 3015 | 3100 | ||
| 3016 | thread = (mdk_thread_t *) kmalloc | 3101 | thread = kmalloc(sizeof(mdk_thread_t), GFP_KERNEL); |
| 3017 | (sizeof(mdk_thread_t), GFP_KERNEL); | ||
| 3018 | if (!thread) | 3102 | if (!thread) |
| 3019 | return NULL; | 3103 | return NULL; |
| 3020 | 3104 | ||
| @@ -3027,8 +3111,8 @@ mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev, | |||
| 3027 | thread->mddev = mddev; | 3111 | thread->mddev = mddev; |
| 3028 | thread->name = name; | 3112 | thread->name = name; |
| 3029 | thread->timeout = MAX_SCHEDULE_TIMEOUT; | 3113 | thread->timeout = MAX_SCHEDULE_TIMEOUT; |
| 3030 | ret = kernel_thread(md_thread, thread, 0); | 3114 | thread->tsk = kthread_run(md_thread, thread, mdname(thread->mddev)); |
| 3031 | if (ret < 0) { | 3115 | if (IS_ERR(thread->tsk)) { |
| 3032 | kfree(thread); | 3116 | kfree(thread); |
| 3033 | return NULL; | 3117 | return NULL; |
| 3034 | } | 3118 | } |
| @@ -3038,21 +3122,9 @@ mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev, | |||
| 3038 | 3122 | ||
| 3039 | void md_unregister_thread(mdk_thread_t *thread) | 3123 | void md_unregister_thread(mdk_thread_t *thread) |
| 3040 | { | 3124 | { |
| 3041 | struct completion event; | ||
| 3042 | |||
| 3043 | init_completion(&event); | ||
| 3044 | |||
| 3045 | thread->event = &event; | ||
| 3046 | |||
| 3047 | /* As soon as ->run is set to NULL, the task could disappear, | ||
| 3048 | * so we need to hold tasklist_lock until we have sent the signal | ||
| 3049 | */ | ||
| 3050 | dprintk("interrupting MD-thread pid %d\n", thread->tsk->pid); | 3125 | dprintk("interrupting MD-thread pid %d\n", thread->tsk->pid); |
| 3051 | read_lock(&tasklist_lock); | 3126 | |
| 3052 | thread->run = NULL; | 3127 | kthread_stop(thread->tsk); |
| 3053 | send_sig(SIGKILL, thread->tsk, 1); | ||
| 3054 | read_unlock(&tasklist_lock); | ||
| 3055 | wait_for_completion(&event); | ||
| 3056 | kfree(thread); | 3128 | kfree(thread); |
| 3057 | } | 3129 | } |
| 3058 | 3130 | ||
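The two hunks above finish md's move from hand-rolled kernel_thread()/daemonize()/SIGKILL plumbing to the kthread API: the loop polls kthread_should_stop() instead of a nulled ->run pointer, and teardown becomes a single kthread_stop(). A minimal sketch of the same lifecycle; the worker and its payload are hypothetical, the API calls are not:

        #include <linux/kthread.h>
        #include <linux/sched.h>
        #include <linux/err.h>

        static int my_worker(void *data)
        {
                while (!kthread_should_stop()) {
                        /* one unit of work, then doze until woken */
                        set_current_state(TASK_INTERRUPTIBLE);
                        schedule_timeout(HZ);
                }
                return 0;               /* collected by kthread_stop() */
        }

        static struct task_struct *my_start(void)
        {
                struct task_struct *tsk;

                tsk = kthread_run(my_worker, NULL, "my_worker");
                return IS_ERR(tsk) ? NULL : tsk;    /* kthread_run can fail */
        }

        static void my_stop(struct task_struct *tsk)
        {
                kthread_stop(tsk);      /* wakes the worker, waits for exit */
        }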
| @@ -3259,10 +3331,13 @@ static int md_seq_show(struct seq_file *seq, void *v) | |||
| 3259 | char b[BDEVNAME_SIZE]; | 3331 | char b[BDEVNAME_SIZE]; |
| 3260 | seq_printf(seq, " %s[%d]", | 3332 | seq_printf(seq, " %s[%d]", |
| 3261 | bdevname(rdev->bdev,b), rdev->desc_nr); | 3333 | bdevname(rdev->bdev,b), rdev->desc_nr); |
| 3334 | if (test_bit(WriteMostly, &rdev->flags)) | ||
| 3335 | seq_printf(seq, "(W)"); | ||
| 3262 | if (rdev->faulty) { | 3336 | if (rdev->faulty) { |
| 3263 | seq_printf(seq, "(F)"); | 3337 | seq_printf(seq, "(F)"); |
| 3264 | continue; | 3338 | continue; |
| 3265 | } | 3339 | } else if (rdev->raid_disk < 0) |
| 3340 | seq_printf(seq, "(S)"); /* spare */ | ||
| 3266 | size += rdev->size; | 3341 | size += rdev->size; |
| 3267 | } | 3342 | } |
| 3268 | 3343 | ||
| @@ -3274,6 +3349,15 @@ static int md_seq_show(struct seq_file *seq, void *v) | |||
| 3274 | seq_printf(seq, "\n %llu blocks", | 3349 | seq_printf(seq, "\n %llu blocks", |
| 3275 | (unsigned long long)size); | 3350 | (unsigned long long)size); |
| 3276 | } | 3351 | } |
| 3352 | if (mddev->persistent) { | ||
| 3353 | if (mddev->major_version != 0 || | ||
| 3354 | mddev->minor_version != 90) { | ||
| 3355 | seq_printf(seq," super %d.%d", | ||
| 3356 | mddev->major_version, | ||
| 3357 | mddev->minor_version); | ||
| 3358 | } | ||
| 3359 | } else | ||
| 3360 | seq_printf(seq, " super non-persistent"); | ||
| 3277 | 3361 | ||
| 3278 | if (mddev->pers) { | 3362 | if (mddev->pers) { |
| 3279 | mddev->pers->status (seq, mddev); | 3363 | mddev->pers->status (seq, mddev); |
| @@ -3416,7 +3500,6 @@ void md_done_sync(mddev_t *mddev, int blocks, int ok) | |||
| 3416 | */ | 3500 | */ |
| 3417 | void md_write_start(mddev_t *mddev, struct bio *bi) | 3501 | void md_write_start(mddev_t *mddev, struct bio *bi) |
| 3418 | { | 3502 | { |
| 3419 | DEFINE_WAIT(w); | ||
| 3420 | if (bio_data_dir(bi) != WRITE) | 3503 | if (bio_data_dir(bi) != WRITE) |
| 3421 | return; | 3504 | return; |
| 3422 | 3505 | ||
| @@ -3533,7 +3616,7 @@ static void md_do_sync(mddev_t *mddev) | |||
| 3533 | printk(KERN_INFO "md: syncing RAID array %s\n", mdname(mddev)); | 3616 | printk(KERN_INFO "md: syncing RAID array %s\n", mdname(mddev)); |
| 3534 | printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed:" | 3617 | printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed:" |
| 3535 | " %d KB/sec/disc.\n", sysctl_speed_limit_min); | 3618 | " %d KB/sec/disc.\n", sysctl_speed_limit_min); |
| 3536 | printk(KERN_INFO "md: using maximum available idle IO bandwith " | 3619 | printk(KERN_INFO "md: using maximum available idle IO bandwidth " |
| 3537 | "(but not more than %d KB/sec) for reconstruction.\n", | 3620 | "(but not more than %d KB/sec) for reconstruction.\n", |
| 3538 | sysctl_speed_limit_max); | 3621 | sysctl_speed_limit_max); |
| 3539 | 3622 | ||
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index 2d2ca7fa0265..286342375fb7 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c | |||
| @@ -169,6 +169,11 @@ static int multipath_make_request (request_queue_t *q, struct bio * bio) | |||
| 169 | struct multipath_bh * mp_bh; | 169 | struct multipath_bh * mp_bh; |
| 170 | struct multipath_info *multipath; | 170 | struct multipath_info *multipath; |
| 171 | 171 | ||
| 172 | if (unlikely(bio_barrier(bio))) { | ||
| 173 | bio_endio(bio, bio->bi_size, -EOPNOTSUPP); | ||
| 174 | return 0; | ||
| 175 | } | ||
| 176 | |||
| 172 | mp_bh = mempool_alloc(conf->pool, GFP_NOIO); | 177 | mp_bh = mempool_alloc(conf->pool, GFP_NOIO); |
| 173 | 178 | ||
| 174 | mp_bh->master_bio = bio; | 179 | mp_bh->master_bio = bio; |
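The same five-line guard recurs in raid0, raid1, raid10 and raid5 below: none of these personalities can order writes across member disks, so barrier bios are failed up front with -EOPNOTSUPP and the caller retries without the flag. In isolation the pattern looks like this sketch (my_make_request is hypothetical; the bio helpers match this kernel):

        #include <linux/bio.h>
        #include <linux/blkdev.h>

        static int my_make_request(request_queue_t *q, struct bio *bio)
        {
                if (unlikely(bio_barrier(bio))) {
                        /* refuse ordered bios we cannot honour */
                        bio_endio(bio, bio->bi_size, -EOPNOTSUPP);
                        return 0;       /* bio fully handled (failed) */
                }
                /* ... normal mapping of the bio would continue here ... */
                return 0;
        }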
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 2120710172c5..f6757259ce7f 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c | |||
| @@ -404,6 +404,11 @@ static int raid0_make_request (request_queue_t *q, struct bio *bio) | |||
| 404 | unsigned long chunk; | 404 | unsigned long chunk; |
| 405 | sector_t block, rsect; | 405 | sector_t block, rsect; |
| 406 | 406 | ||
| 407 | if (unlikely(bio_barrier(bio))) { | ||
| 408 | bio_endio(bio, bio->bi_size, -EOPNOTSUPP); | ||
| 409 | return 0; | ||
| 410 | } | ||
| 411 | |||
| 407 | if (bio_data_dir(bio)==WRITE) { | 412 | if (bio_data_dir(bio)==WRITE) { |
| 408 | disk_stat_inc(mddev->gendisk, writes); | 413 | disk_stat_inc(mddev->gendisk, writes); |
| 409 | disk_stat_add(mddev->gendisk, write_sectors, bio_sectors(bio)); | 414 | disk_stat_add(mddev->gendisk, write_sectors, bio_sectors(bio)); |
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 51d9645ed09c..a93ca478142a 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c | |||
| @@ -222,8 +222,17 @@ static void raid_end_bio_io(r1bio_t *r1_bio) | |||
| 222 | { | 222 | { |
| 223 | struct bio *bio = r1_bio->master_bio; | 223 | struct bio *bio = r1_bio->master_bio; |
| 224 | 224 | ||
| 225 | bio_endio(bio, bio->bi_size, | 225 | /* if nobody has done the final endio yet, do it now */ |
| 226 | test_bit(R1BIO_Uptodate, &r1_bio->state) ? 0 : -EIO); | 226 | if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) { |
| 227 | PRINTK(KERN_DEBUG "raid1: sync end %s on sectors %llu-%llu\n", | ||
| 228 | (bio_data_dir(bio) == WRITE) ? "write" : "read", | ||
| 229 | (unsigned long long) bio->bi_sector, | ||
| 230 | (unsigned long long) bio->bi_sector + | ||
| 231 | (bio->bi_size >> 9) - 1); | ||
| 232 | |||
| 233 | bio_endio(bio, bio->bi_size, | ||
| 234 | test_bit(R1BIO_Uptodate, &r1_bio->state) ? 0 : -EIO); | ||
| 235 | } | ||
| 227 | free_r1bio(r1_bio); | 236 | free_r1bio(r1_bio); |
| 228 | } | 237 | } |
| 229 | 238 | ||
| @@ -292,7 +301,7 @@ static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int | |||
| 292 | { | 301 | { |
| 293 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); | 302 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); |
| 294 | r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); | 303 | r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); |
| 295 | int mirror; | 304 | int mirror, behind; |
| 296 | conf_t *conf = mddev_to_conf(r1_bio->mddev); | 305 | conf_t *conf = mddev_to_conf(r1_bio->mddev); |
| 297 | 306 | ||
| 298 | if (bio->bi_size) | 307 | if (bio->bi_size) |
| @@ -323,16 +332,46 @@ static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int | |||
| 323 | 332 | ||
| 324 | update_head_pos(mirror, r1_bio); | 333 | update_head_pos(mirror, r1_bio); |
| 325 | 334 | ||
| 335 | behind = test_bit(R1BIO_BehindIO, &r1_bio->state); | ||
| 336 | if (behind) { | ||
| 337 | if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags)) | ||
| 338 | atomic_dec(&r1_bio->behind_remaining); | ||
| 339 | |||
| 340 | /* In behind mode, we ACK the master bio once the I/O has safely | ||
| 341 | * reached all non-writemostly disks. Setting the Returned bit | ||
| 342 | * ensures that this gets done only once -- we don't ever want to | ||
| 343 | * return -EIO here, instead we'll wait */ | ||
| 344 | |||
| 345 | if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) && | ||
| 346 | test_bit(R1BIO_Uptodate, &r1_bio->state)) { | ||
| 347 | /* Maybe we can return now */ | ||
| 348 | if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) { | ||
| 349 | struct bio *mbio = r1_bio->master_bio; | ||
| 350 | PRINTK(KERN_DEBUG "raid1: behind end write sectors %llu-%llu\n", | ||
| 351 | (unsigned long long) mbio->bi_sector, | ||
| 352 | (unsigned long long) mbio->bi_sector + | ||
| 353 | (mbio->bi_size >> 9) - 1); | ||
| 354 | bio_endio(mbio, mbio->bi_size, 0); | ||
| 355 | } | ||
| 356 | } | ||
| 357 | } | ||
| 326 | /* | 358 | /* |
| 327 | * | 359 | * |
| 328 | * Let's see if all mirrored write operations have finished | 360 | * Let's see if all mirrored write operations have finished |
| 329 | * already. | 361 | * already. |
| 330 | */ | 362 | */ |
| 331 | if (atomic_dec_and_test(&r1_bio->remaining)) { | 363 | if (atomic_dec_and_test(&r1_bio->remaining)) { |
| 364 | if (test_bit(R1BIO_BehindIO, &r1_bio->state)) { | ||
| 365 | /* free extra copy of the data pages */ | ||
| 366 | int i = bio->bi_vcnt; | ||
| 367 | while (i--) | ||
| 368 | __free_page(bio->bi_io_vec[i].bv_page); | ||
| 369 | } | ||
| 332 | /* clear the bitmap if all writes complete successfully */ | 370 | /* clear the bitmap if all writes complete successfully */ |
| 333 | bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector, | 371 | bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector, |
| 334 | r1_bio->sectors, | 372 | r1_bio->sectors, |
| 335 | !test_bit(R1BIO_Degraded, &r1_bio->state)); | 373 | !test_bit(R1BIO_Degraded, &r1_bio->state), |
| 374 | behind); | ||
| 336 | md_write_end(r1_bio->mddev); | 375 | md_write_end(r1_bio->mddev); |
| 337 | raid_end_bio_io(r1_bio); | 376 | raid_end_bio_io(r1_bio); |
| 338 | } | 377 | } |
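Worked through for three mirrors with one marked write-mostly: remaining starts at 3 and behind_remaining at 1. Completion on the write-mostly disk only drops behind_remaining; when the second ordinary mirror finishes, remaining is still 2 at the time of the test, so behind_remaining (1) >= remaining - 1 (1) holds and, provided the data is up to date somewhere, the master bio is acknowledged while the write-mostly copy is still in flight.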
| @@ -360,13 +399,14 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio) | |||
| 360 | { | 399 | { |
| 361 | const unsigned long this_sector = r1_bio->sector; | 400 | const unsigned long this_sector = r1_bio->sector; |
| 362 | int new_disk = conf->last_used, disk = new_disk; | 401 | int new_disk = conf->last_used, disk = new_disk; |
| 402 | int wonly_disk = -1; | ||
| 363 | const int sectors = r1_bio->sectors; | 403 | const int sectors = r1_bio->sectors; |
| 364 | sector_t new_distance, current_distance; | 404 | sector_t new_distance, current_distance; |
| 365 | mdk_rdev_t *new_rdev, *rdev; | 405 | mdk_rdev_t *rdev; |
| 366 | 406 | ||
| 367 | rcu_read_lock(); | 407 | rcu_read_lock(); |
| 368 | /* | 408 | /* |
| 369 | * Check if it if we can balance. We can balance on the whole | 409 | * Check if we can balance. We can balance on the whole |
| 370 | * device if no resync is going on, or below the resync window. | 410 | * device if no resync is going on, or below the resync window. |
| 371 | * We take the first readable disk when above the resync window. | 411 | * We take the first readable disk when above the resync window. |
| 372 | */ | 412 | */ |
| @@ -376,11 +416,16 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio) | |||
| 376 | /* Choose the first operational device, for consistency */ | 416 |
| 377 | new_disk = 0; | 417 | new_disk = 0; |
| 378 | 418 | ||
| 379 | while ((new_rdev=conf->mirrors[new_disk].rdev) == NULL || | 419 | for (rdev = conf->mirrors[new_disk].rdev; |
| 380 | !new_rdev->in_sync) { | 420 | !rdev || !rdev->in_sync |
| 381 | new_disk++; | 421 | || test_bit(WriteMostly, &rdev->flags); |
| 382 | if (new_disk == conf->raid_disks) { | 422 | rdev = conf->mirrors[++new_disk].rdev) { |
| 383 | new_disk = -1; | 423 | |
| 424 | if (rdev && rdev->in_sync) | ||
| 425 | wonly_disk = new_disk; | ||
| 426 | |||
| 427 | if (new_disk == conf->raid_disks - 1) { | ||
| 428 | new_disk = wonly_disk; | ||
| 384 | break; | 429 | break; |
| 385 | } | 430 | } |
| 386 | } | 431 | } |
| @@ -389,16 +434,26 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio) | |||
| 389 | 434 | ||
| 390 | 435 | ||
| 391 | /* make sure the disk is operational */ | 436 | /* make sure the disk is operational */ |
| 392 | while ((new_rdev=conf->mirrors[new_disk].rdev) == NULL || | 437 | for (rdev = conf->mirrors[new_disk].rdev; |
| 393 | !new_rdev->in_sync) { | 438 | !rdev || !rdev->in_sync || |
| 439 | test_bit(WriteMostly, &rdev->flags); | ||
| 440 | rdev = conf->mirrors[new_disk].rdev) { | ||
| 441 | |||
| 442 | if (rdev && rdev->in_sync) | ||
| 443 | wonly_disk = new_disk; | ||
| 444 | |||
| 394 | if (new_disk <= 0) | 445 | if (new_disk <= 0) |
| 395 | new_disk = conf->raid_disks; | 446 | new_disk = conf->raid_disks; |
| 396 | new_disk--; | 447 | new_disk--; |
| 397 | if (new_disk == disk) { | 448 | if (new_disk == disk) { |
| 398 | new_disk = -1; | 449 | new_disk = wonly_disk; |
| 399 | goto rb_out; | 450 | break; |
| 400 | } | 451 | } |
| 401 | } | 452 | } |
| 453 | |||
| 454 | if (new_disk < 0) | ||
| 455 | goto rb_out; | ||
| 456 | |||
| 402 | disk = new_disk; | 457 | disk = new_disk; |
| 403 | /* now disk == new_disk == starting point for search */ | 458 | /* now disk == new_disk == starting point for search */ |
| 404 | 459 | ||
| @@ -419,37 +474,41 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio) | |||
| 419 | disk = conf->raid_disks; | 474 | disk = conf->raid_disks; |
| 420 | disk--; | 475 | disk--; |
| 421 | 476 | ||
| 422 | if ((rdev=conf->mirrors[disk].rdev) == NULL || | 477 | rdev = conf->mirrors[disk].rdev; |
| 423 | !rdev->in_sync) | 478 | |
| 479 | if (!rdev || | ||
| 480 | !rdev->in_sync || | ||
| 481 | test_bit(WriteMostly, &rdev->flags)) | ||
| 424 | continue; | 482 | continue; |
| 425 | 483 | ||
| 426 | if (!atomic_read(&rdev->nr_pending)) { | 484 | if (!atomic_read(&rdev->nr_pending)) { |
| 427 | new_disk = disk; | 485 | new_disk = disk; |
| 428 | new_rdev = rdev; | ||
| 429 | break; | 486 | break; |
| 430 | } | 487 | } |
| 431 | new_distance = abs(this_sector - conf->mirrors[disk].head_position); | 488 | new_distance = abs(this_sector - conf->mirrors[disk].head_position); |
| 432 | if (new_distance < current_distance) { | 489 | if (new_distance < current_distance) { |
| 433 | current_distance = new_distance; | 490 | current_distance = new_distance; |
| 434 | new_disk = disk; | 491 | new_disk = disk; |
| 435 | new_rdev = rdev; | ||
| 436 | } | 492 | } |
| 437 | } while (disk != conf->last_used); | 493 | } while (disk != conf->last_used); |
| 438 | 494 | ||
| 439 | rb_out: | 495 | rb_out: |
| 440 | 496 | ||
| 441 | 497 | ||
| 442 | if (new_disk >= 0) { | 498 | if (new_disk >= 0) { |
| 443 | conf->next_seq_sect = this_sector + sectors; | 499 | rdev = conf->mirrors[new_disk].rdev; |
| 444 | conf->last_used = new_disk; | 500 | if (!rdev) |
| 445 | atomic_inc(&new_rdev->nr_pending); | 501 | goto retry; |
| 446 | if (!new_rdev->in_sync) { | 502 | atomic_inc(&rdev->nr_pending); |
| 503 | if (!rdev->in_sync) { | ||
| 447 | /* cannot risk returning a device that failed | 504 | /* cannot risk returning a device that failed |
| 448 | * before we inc'ed nr_pending | 505 | * before we inc'ed nr_pending |
| 449 | */ | 506 | */ |
| 450 | atomic_dec(&new_rdev->nr_pending); | 507 | atomic_dec(&rdev->nr_pending); |
| 451 | goto retry; | 508 | goto retry; |
| 452 | } | 509 | } |
| 510 | conf->next_seq_sect = this_sector + sectors; | ||
| 511 | conf->last_used = new_disk; | ||
| 453 | } | 512 | } |
| 454 | rcu_read_unlock(); | 513 | rcu_read_unlock(); |
| 455 | 514 | ||
| @@ -542,6 +601,39 @@ static void device_barrier(conf_t *conf, sector_t sect) | |||
| 542 | spin_unlock_irq(&conf->resync_lock); | 601 | spin_unlock_irq(&conf->resync_lock); |
| 543 | } | 602 | } |
| 544 | 603 | ||
| 604 | /* duplicate the data pages for behind I/O */ | ||
| 605 | static struct page **alloc_behind_pages(struct bio *bio) | ||
| 606 | { | ||
| 607 | int i; | ||
| 608 | struct bio_vec *bvec; | ||
| 609 | struct page **pages = kmalloc(bio->bi_vcnt * sizeof(struct page *), | ||
| 610 | GFP_NOIO); | ||
| 611 | if (unlikely(!pages)) | ||
| 612 | goto do_sync_io; | ||
| 613 | |||
| 614 | memset(pages, 0, bio->bi_vcnt * sizeof(struct page *)); | ||
| 615 | |||
| 616 | bio_for_each_segment(bvec, bio, i) { | ||
| 617 | pages[i] = alloc_page(GFP_NOIO); | ||
| 618 | if (unlikely(!pages[i])) | ||
| 619 | goto do_sync_io; | ||
| 620 | memcpy(kmap(pages[i]) + bvec->bv_offset, | ||
| 621 | kmap(bvec->bv_page) + bvec->bv_offset, bvec->bv_len); | ||
| 622 | kunmap(pages[i]); | ||
| 623 | kunmap(bvec->bv_page); | ||
| 624 | } | ||
| 625 | |||
| 626 | return pages; | ||
| 627 | |||
| 628 | do_sync_io: | ||
| 629 | if (pages) | ||
| 630 | for (i = 0; i < bio->bi_vcnt && pages[i]; i++) | ||
| 631 | __free_page(pages[i]); | ||
| 632 | kfree(pages); | ||
| 633 | PRINTK("%dB behind alloc failed, doing sync I/O\n", bio->bi_size); | ||
| 634 | return NULL; | ||
| 635 | } | ||
| 636 | |||
| 545 | static int make_request(request_queue_t *q, struct bio * bio) | 637 | static int make_request(request_queue_t *q, struct bio * bio) |
| 546 | { | 638 | { |
| 547 | mddev_t *mddev = q->queuedata; | 639 | mddev_t *mddev = q->queuedata; |
| @@ -554,7 +646,12 @@ static int make_request(request_queue_t *q, struct bio * bio) | |||
| 554 | struct bitmap *bitmap = mddev->bitmap; | 646 | struct bitmap *bitmap = mddev->bitmap; |
| 555 | unsigned long flags; | 647 | unsigned long flags; |
| 556 | struct bio_list bl; | 648 | struct bio_list bl; |
| 649 | struct page **behind_pages = NULL; | ||
| 557 | 650 | ||
| 651 | if (unlikely(bio_barrier(bio))) { | ||
| 652 | bio_endio(bio, bio->bi_size, -EOPNOTSUPP); | ||
| 653 | return 0; | ||
| 654 | } | ||
| 558 | 655 | ||
| 559 | /* | 656 | /* |
| 560 | * Register the new request and wait if the reconstruction | 657 | * Register the new request and wait if the reconstruction |
| @@ -589,8 +686,6 @@ static int make_request(request_queue_t *q, struct bio * bio) | |||
| 589 | r1_bio->mddev = mddev; | 686 | r1_bio->mddev = mddev; |
| 590 | r1_bio->sector = bio->bi_sector; | 687 | r1_bio->sector = bio->bi_sector; |
| 591 | 688 | ||
| 592 | r1_bio->state = 0; | ||
| 593 | |||
| 594 | if (bio_data_dir(bio) == READ) { | 689 | if (bio_data_dir(bio) == READ) { |
| 595 | /* | 690 | /* |
| 596 | * read balancing logic: | 691 | * read balancing logic: |
| @@ -651,13 +746,22 @@ static int make_request(request_queue_t *q, struct bio * bio) | |||
| 651 | } | 746 | } |
| 652 | rcu_read_unlock(); | 747 | rcu_read_unlock(); |
| 653 | 748 | ||
| 749 | BUG_ON(targets == 0); /* we never fail the last device */ | ||
| 750 | |||
| 654 | if (targets < conf->raid_disks) { | 751 | if (targets < conf->raid_disks) { |
| 655 | /* array is degraded, we will not clear the bitmap | 752 | /* array is degraded, we will not clear the bitmap |
| 656 | * on I/O completion (see raid1_end_write_request) */ | 753 | * on I/O completion (see raid1_end_write_request) */ |
| 657 | set_bit(R1BIO_Degraded, &r1_bio->state); | 754 | set_bit(R1BIO_Degraded, &r1_bio->state); |
| 658 | } | 755 | } |
| 659 | 756 | ||
| 757 | /* do behind I/O ? */ | ||
| 758 | if (bitmap && | ||
| 759 | atomic_read(&bitmap->behind_writes) < bitmap->max_write_behind && | ||
| 760 | (behind_pages = alloc_behind_pages(bio)) != NULL) | ||
| 761 | set_bit(R1BIO_BehindIO, &r1_bio->state); | ||
| 762 | |||
| 660 | atomic_set(&r1_bio->remaining, 0); | 763 | atomic_set(&r1_bio->remaining, 0); |
| 764 | atomic_set(&r1_bio->behind_remaining, 0); | ||
| 661 | 765 | ||
| 662 | bio_list_init(&bl); | 766 | bio_list_init(&bl); |
| 663 | for (i = 0; i < disks; i++) { | 767 | for (i = 0; i < disks; i++) { |
| @@ -674,12 +778,31 @@ static int make_request(request_queue_t *q, struct bio * bio) | |||
| 674 | mbio->bi_rw = WRITE; | 778 | mbio->bi_rw = WRITE; |
| 675 | mbio->bi_private = r1_bio; | 779 | mbio->bi_private = r1_bio; |
| 676 | 780 | ||
| 781 | if (behind_pages) { | ||
| 782 | struct bio_vec *bvec; | ||
| 783 | int j; | ||
| 784 | |||
| 785 | /* Yes, I really want the '__' version so that | ||
| 786 | * we clear any unused pointer in the io_vec, rather | ||
| 787 | * than leave them unchanged. This is important | ||
| 788 | * because when we come to free the pages, we won't | ||
| 789 | * know the original bi_idx, so we just free | ||
| 790 | * them all | ||
| 791 | */ | ||
| 792 | __bio_for_each_segment(bvec, mbio, j, 0) | ||
| 793 | bvec->bv_page = behind_pages[j]; | ||
| 794 | if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags)) | ||
| 795 | atomic_inc(&r1_bio->behind_remaining); | ||
| 796 | } | ||
| 797 | |||
| 677 | atomic_inc(&r1_bio->remaining); | 798 | atomic_inc(&r1_bio->remaining); |
| 678 | 799 | ||
| 679 | bio_list_add(&bl, mbio); | 800 | bio_list_add(&bl, mbio); |
| 680 | } | 801 | } |
| 802 | kfree(behind_pages); /* the behind pages are attached to the bios now */ | ||
| 681 | 803 | ||
| 682 | bitmap_startwrite(bitmap, bio->bi_sector, r1_bio->sectors); | 804 | bitmap_startwrite(bitmap, bio->bi_sector, r1_bio->sectors, |
| 805 | test_bit(R1BIO_BehindIO, &r1_bio->state)); | ||
| 683 | spin_lock_irqsave(&conf->device_lock, flags); | 806 | spin_lock_irqsave(&conf->device_lock, flags); |
| 684 | bio_list_merge(&conf->pending_bio_list, &bl); | 807 | bio_list_merge(&conf->pending_bio_list, &bl); |
| 685 | bio_list_init(&bl); | 808 | bio_list_init(&bl); |
| @@ -1105,6 +1228,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i | |||
| 1105 | sector_t max_sector, nr_sectors; | 1228 | sector_t max_sector, nr_sectors; |
| 1106 | int disk; | 1229 | int disk; |
| 1107 | int i; | 1230 | int i; |
| 1231 | int wonly; | ||
| 1108 | int write_targets = 0; | 1232 | int write_targets = 0; |
| 1109 | int sync_blocks; | 1233 | int sync_blocks; |
| 1110 | int still_degraded = 0; | 1234 | int still_degraded = 0; |
| @@ -1160,14 +1284,21 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i | |||
| 1160 | */ | 1284 | */ |
| 1161 | disk = conf->last_used; | 1285 | disk = conf->last_used; |
| 1162 | /* make sure disk is operational */ | 1286 | /* make sure disk is operational */ |
| 1163 | 1287 | wonly = disk; | |
| 1164 | while (conf->mirrors[disk].rdev == NULL || | 1288 | while (conf->mirrors[disk].rdev == NULL || |
| 1165 | !conf->mirrors[disk].rdev->in_sync) { | 1289 | !conf->mirrors[disk].rdev->in_sync || |
| 1290 | test_bit(WriteMostly, &conf->mirrors[disk].rdev->flags) | ||
| 1291 | ) { | ||
| 1292 | if (conf->mirrors[disk].rdev && | ||
| 1293 | conf->mirrors[disk].rdev->in_sync) | ||
| 1294 | wonly = disk; | ||
| 1166 | if (disk <= 0) | 1295 | if (disk <= 0) |
| 1167 | disk = conf->raid_disks; | 1296 | disk = conf->raid_disks; |
| 1168 | disk--; | 1297 | disk--; |
| 1169 | if (disk == conf->last_used) | 1298 | if (disk == conf->last_used) { |
| 1299 | disk = wonly; | ||
| 1170 | break; | 1300 | break; |
| 1301 | } | ||
| 1171 | } | 1302 | } |
| 1172 | conf->last_used = disk; | 1303 | conf->last_used = disk; |
| 1173 | atomic_inc(&conf->mirrors[disk].rdev->nr_pending); | 1304 | atomic_inc(&conf->mirrors[disk].rdev->nr_pending); |
| @@ -1439,6 +1570,17 @@ out: | |||
| 1439 | static int stop(mddev_t *mddev) | 1570 | static int stop(mddev_t *mddev) |
| 1440 | { | 1571 | { |
| 1441 | conf_t *conf = mddev_to_conf(mddev); | 1572 | conf_t *conf = mddev_to_conf(mddev); |
| 1573 | struct bitmap *bitmap = mddev->bitmap; | ||
| 1574 | int behind_wait = 0; | ||
| 1575 | |||
| 1576 | /* wait for behind writes to complete */ | ||
| 1577 | while (bitmap && atomic_read(&bitmap->behind_writes) > 0) { | ||
| 1578 | behind_wait++; | ||
| 1579 | printk(KERN_INFO "raid1: behind writes in progress on device %s, waiting to stop (%d)\n", mdname(mddev), behind_wait); | ||
| 1580 | set_current_state(TASK_UNINTERRUPTIBLE); | ||
| 1581 | schedule_timeout(HZ); /* wait a second */ | ||
| 1582 | /* need to kick something here to make sure I/O goes? */ | ||
| 1583 | } | ||
| 1442 | 1584 | ||
| 1443 | md_unregister_thread(mddev->thread); | 1585 | md_unregister_thread(mddev->thread); |
| 1444 | mddev->thread = NULL; | 1586 | mddev->thread = NULL; |
| @@ -1561,6 +1703,35 @@ static int raid1_reshape(mddev_t *mddev, int raid_disks) | |||
| 1561 | return 0; | 1703 | return 0; |
| 1562 | } | 1704 | } |
| 1563 | 1705 | ||
| 1706 | static void raid1_quiesce(mddev_t *mddev, int state) | ||
| 1707 | { | ||
| 1708 | conf_t *conf = mddev_to_conf(mddev); | ||
| 1709 | |||
| 1710 | switch(state) { | ||
| 1711 | case 1: | ||
| 1712 | spin_lock_irq(&conf->resync_lock); | ||
| 1713 | conf->barrier++; | ||
| 1714 | wait_event_lock_irq(conf->wait_idle, !conf->nr_pending, | ||
| 1715 | conf->resync_lock, raid1_unplug(mddev->queue)); | ||
| 1716 | spin_unlock_irq(&conf->resync_lock); | ||
| 1717 | break; | ||
| 1718 | case 0: | ||
| 1719 | spin_lock_irq(&conf->resync_lock); | ||
| 1720 | conf->barrier--; | ||
| 1721 | spin_unlock_irq(&conf->resync_lock); | ||
| 1722 | wake_up(&conf->wait_resume); | ||
| 1723 | wake_up(&conf->wait_idle); | ||
| 1724 | break; | ||
| 1725 | } | ||
| 1726 | if (mddev->thread) { | ||
| 1727 | if (mddev->bitmap) | ||
| 1728 | mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ; | ||
| 1729 | else | ||
| 1730 | mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT; | ||
| 1731 | md_wakeup_thread(mddev->thread); | ||
| 1732 | } | ||
| 1733 | } | ||
| 1734 | |||
| 1564 | 1735 | ||
| 1565 | static mdk_personality_t raid1_personality = | 1736 | static mdk_personality_t raid1_personality = |
| 1566 | { | 1737 | { |
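raid1_quiesce reuses the resync barrier machinery: raising conf->barrier parks new requests, and the wait on conf->wait_idle does not return until nr_pending drains (unplugging the queue while it waits); dropping the barrier wakes both wait queues. The timeout retune at the end keeps raid1d on the bitmap daemon's schedule whenever a bitmap is attached.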
| @@ -1577,6 +1748,7 @@ static mdk_personality_t raid1_personality = | |||
| 1577 | .sync_request = sync_request, | 1748 | .sync_request = sync_request, |
| 1578 | .resize = raid1_resize, | 1749 | .resize = raid1_resize, |
| 1579 | .reshape = raid1_reshape, | 1750 | .reshape = raid1_reshape, |
| 1751 | .quiesce = raid1_quiesce, | ||
| 1580 | }; | 1752 | }; |
| 1581 | 1753 | ||
| 1582 | static int __init raid_init(void) | 1754 | static int __init raid_init(void) |
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 62ebb1bc72be..5bd1e9ec899d 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c | |||
| @@ -538,7 +538,8 @@ static int read_balance(conf_t *conf, r10bio_t *r10_bio) | |||
| 538 | } | 538 | } |
| 539 | 539 | ||
| 540 | 540 | ||
| 541 | current_distance = abs(this_sector - conf->mirrors[disk].head_position); | 541 | current_distance = abs(r10_bio->devs[slot].addr - |
| 542 | conf->mirrors[disk].head_position); | ||
| 542 | 543 | ||
| 543 | /* Find the disk whose head is closest */ | 544 | /* Find the disk whose head is closest */ |
| 544 | 545 | ||
| @@ -668,6 +669,11 @@ static int make_request(request_queue_t *q, struct bio * bio) | |||
| 668 | int i; | 669 | int i; |
| 669 | int chunk_sects = conf->chunk_mask + 1; | 670 | int chunk_sects = conf->chunk_mask + 1; |
| 670 | 671 | ||
| 672 | if (unlikely(bio_barrier(bio))) { | ||
| 673 | bio_endio(bio, bio->bi_size, -EOPNOTSUPP); | ||
| 674 | return 0; | ||
| 675 | } | ||
| 676 | |||
| 671 | /* If this request crosses a chunk boundary, we need to | 677 | /* If this request crosses a chunk boundary, we need to |
| 672 | * split it. This will only happen for 1 PAGE (or less) requests. | 678 | * split it. This will only happen for 1 PAGE (or less) requests. |
| 673 | */ | 679 | */ |
| @@ -900,6 +906,27 @@ static void close_sync(conf_t *conf) | |||
| 900 | conf->r10buf_pool = NULL; | 906 | conf->r10buf_pool = NULL; |
| 901 | } | 907 | } |
| 902 | 908 | ||
| 909 | /* check if there are enough drives for | ||
| 910 | * every block to appear on at least one drive | ||
| 911 | */ | ||
| 912 | static int enough(conf_t *conf) | ||
| 913 | { | ||
| 914 | int first = 0; | ||
| 915 | |||
| 916 | do { | ||
| 917 | int n = conf->copies; | ||
| 918 | int cnt = 0; | ||
| 919 | while (n--) { | ||
| 920 | if (conf->mirrors[first].rdev) | ||
| 921 | cnt++; | ||
| 922 | first = (first+1) % conf->raid_disks; | ||
| 923 | } | ||
| 924 | if (cnt == 0) | ||
| 925 | return 0; | ||
| 926 | } while (first != 0); | ||
| 927 | return 1; | ||
| 928 | } | ||
| 929 | |||
| 903 | static int raid10_spare_active(mddev_t *mddev) | 930 | static int raid10_spare_active(mddev_t *mddev) |
| 904 | { | 931 | { |
| 905 | int i; | 932 | int i; |
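enough() slides a window of `copies` consecutive raid slots around the ring; with four drives and two copies the windows are {0,1} and {2,3}, so losing both members of one pair means some block has no surviving copy. A standalone restatement with plain arrays, handy for testing the window walk in isolation (the types are hypothetical, the logic mirrors the hunk above):

        static int enough_sketch(const int *present, int raid_disks, int copies)
        {
                int first = 0;

                do {
                        int n = copies, cnt = 0;

                        while (n--) {
                                if (present[first])
                                        cnt++;
                                first = (first + 1) % raid_disks;
                        }
                        if (cnt == 0)
                                return 0;       /* a window with no live disk */
                } while (first != 0);
                return 1;
        }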
| @@ -938,6 +965,8 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) | |||
| 938 | * very different from resync | 965 | * very different from resync |
| 939 | */ | 966 | */ |
| 940 | return 0; | 967 | return 0; |
| 968 | if (!enough(conf)) | ||
| 969 | return 0; | ||
| 941 | 970 | ||
| 942 | for (mirror=0; mirror < mddev->raid_disks; mirror++) | 971 | for (mirror=0; mirror < mddev->raid_disks; mirror++) |
| 943 | if ( !(p=conf->mirrors+mirror)->rdev) { | 972 | if ( !(p=conf->mirrors+mirror)->rdev) { |
| @@ -1445,7 +1474,13 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i | |||
| 1445 | } | 1474 | } |
| 1446 | } | 1475 | } |
| 1447 | if (j == conf->copies) { | 1476 | if (j == conf->copies) { |
| 1448 | BUG(); | 1477 | /* Cannot recover, so abort the recovery */ |
| 1478 | put_buf(r10_bio); | ||
| 1479 | r10_bio = rb2; | ||
| 1480 | if (!test_and_set_bit(MD_RECOVERY_ERR, &mddev->recovery)) | ||
| 1481 | printk(KERN_INFO "raid10: %s: insufficient working devices for recovery.\n", | ||
| 1482 | mdname(mddev)); | ||
| 1483 | break; | ||
| 1449 | } | 1484 | } |
| 1450 | } | 1485 | } |
| 1451 | if (biolist == NULL) { | 1486 | if (biolist == NULL) { |
| @@ -1678,9 +1713,10 @@ static int run(mddev_t *mddev) | |||
| 1678 | init_waitqueue_head(&conf->wait_idle); | 1713 | init_waitqueue_head(&conf->wait_idle); |
| 1679 | init_waitqueue_head(&conf->wait_resume); | 1714 | init_waitqueue_head(&conf->wait_resume); |
| 1680 | 1715 | ||
| 1681 | if (!conf->working_disks) { | 1716 | /* need to check that every block has at least one working mirror */ |
| 1682 | printk(KERN_ERR "raid10: no operational mirrors for %s\n", | 1717 | if (!enough(conf)) { |
| 1683 | mdname(mddev)); | 1718 | printk(KERN_ERR "raid10: not enough operational mirrors for %s\n", |
| 1719 | mdname(mddev)); | ||
| 1684 | goto out_free_conf; | 1720 | goto out_free_conf; |
| 1685 | } | 1721 | } |
| 1686 | 1722 | ||
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 43f231a467d5..4683ca24c046 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c | |||
| @@ -24,6 +24,8 @@ | |||
| 24 | #include <linux/bitops.h> | 24 | #include <linux/bitops.h> |
| 25 | #include <asm/atomic.h> | 25 | #include <asm/atomic.h> |
| 26 | 26 | ||
| 27 | #include <linux/raid/bitmap.h> | ||
| 28 | |||
| 27 | /* | 29 | /* |
| 28 | * Stripe cache | 30 | * Stripe cache |
| 29 | */ | 31 | */ |
| @@ -79,8 +81,13 @@ static inline void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh) | |||
| 79 | if (test_bit(STRIPE_HANDLE, &sh->state)) { | 81 | if (test_bit(STRIPE_HANDLE, &sh->state)) { |
| 80 | if (test_bit(STRIPE_DELAYED, &sh->state)) | 82 | if (test_bit(STRIPE_DELAYED, &sh->state)) |
| 81 | list_add_tail(&sh->lru, &conf->delayed_list); | 83 | list_add_tail(&sh->lru, &conf->delayed_list); |
| 82 | else | 84 | else if (test_bit(STRIPE_BIT_DELAY, &sh->state) && |
| 85 | conf->seq_write == sh->bm_seq) | ||
| 86 | list_add_tail(&sh->lru, &conf->bitmap_list); | ||
| 87 | else { | ||
| 88 | clear_bit(STRIPE_BIT_DELAY, &sh->state); | ||
| 83 | list_add_tail(&sh->lru, &conf->handle_list); | 89 | list_add_tail(&sh->lru, &conf->handle_list); |
| 90 | } | ||
| 84 | md_wakeup_thread(conf->mddev->thread); | 91 | md_wakeup_thread(conf->mddev->thread); |
| 85 | } else { | 92 | } else { |
| 86 | if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { | 93 | if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { |
| @@ -244,6 +251,9 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector | |||
| 244 | spin_lock_irq(&conf->device_lock); | 251 | spin_lock_irq(&conf->device_lock); |
| 245 | 252 | ||
| 246 | do { | 253 | do { |
| 254 | wait_event_lock_irq(conf->wait_for_stripe, | ||
| 255 | conf->quiesce == 0, | ||
| 256 | conf->device_lock, /* nothing */); | ||
| 247 | sh = __find_stripe(conf, sector); | 257 | sh = __find_stripe(conf, sector); |
| 248 | if (!sh) { | 258 | if (!sh) { |
| 249 | if (!conf->inactive_blocked) | 259 | if (!conf->inactive_blocked) |
| @@ -803,6 +813,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in | |||
| 803 | { | 813 | { |
| 804 | struct bio **bip; | 814 | struct bio **bip; |
| 805 | raid5_conf_t *conf = sh->raid_conf; | 815 | raid5_conf_t *conf = sh->raid_conf; |
| 816 | int firstwrite=0; | ||
| 806 | 817 | ||
| 807 | PRINTK("adding bh b#%llu to stripe s#%llu\n", | 818 | PRINTK("adding bh b#%llu to stripe s#%llu\n", |
| 808 | (unsigned long long)bi->bi_sector, | 819 | (unsigned long long)bi->bi_sector, |
| @@ -811,9 +822,11 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in | |||
| 811 | 822 | ||
| 812 | spin_lock(&sh->lock); | 823 | spin_lock(&sh->lock); |
| 813 | spin_lock_irq(&conf->device_lock); | 824 | spin_lock_irq(&conf->device_lock); |
| 814 | if (forwrite) | 825 | if (forwrite) { |
| 815 | bip = &sh->dev[dd_idx].towrite; | 826 | bip = &sh->dev[dd_idx].towrite; |
| 816 | else | 827 | if (*bip == NULL && sh->dev[dd_idx].written == NULL) |
| 828 | firstwrite = 1; | ||
| 829 | } else | ||
| 817 | bip = &sh->dev[dd_idx].toread; | 830 | bip = &sh->dev[dd_idx].toread; |
| 818 | while (*bip && (*bip)->bi_sector < bi->bi_sector) { | 831 | while (*bip && (*bip)->bi_sector < bi->bi_sector) { |
| 819 | if ((*bip)->bi_sector + ((*bip)->bi_size >> 9) > bi->bi_sector) | 832 | if ((*bip)->bi_sector + ((*bip)->bi_size >> 9) > bi->bi_sector) |
| @@ -836,6 +849,13 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in | |||
| 836 | (unsigned long long)bi->bi_sector, | 849 | (unsigned long long)bi->bi_sector, |
| 837 | (unsigned long long)sh->sector, dd_idx); | 850 | (unsigned long long)sh->sector, dd_idx); |
| 838 | 851 | ||
| 852 | if (conf->mddev->bitmap && firstwrite) { | ||
| 853 | sh->bm_seq = conf->seq_write; | ||
| 854 | bitmap_startwrite(conf->mddev->bitmap, sh->sector, | ||
| 855 | STRIPE_SECTORS, 0); | ||
| 856 | set_bit(STRIPE_BIT_DELAY, &sh->state); | ||
| 857 | } | ||
| 858 | |||
| 839 | if (forwrite) { | 859 | if (forwrite) { |
| 840 | /* check if page is covered */ | 860 | /* check if page is covered */ |
| 841 | sector_t sector = sh->dev[dd_idx].sector; | 861 | sector_t sector = sh->dev[dd_idx].sector; |
| @@ -958,12 +978,13 @@ static void handle_stripe(struct stripe_head *sh) | |||
| 958 | * need to be failed | 978 | * need to be failed |
| 959 | */ | 979 | */ |
| 960 | if (failed > 1 && to_read+to_write+written) { | 980 | if (failed > 1 && to_read+to_write+written) { |
| 961 | spin_lock_irq(&conf->device_lock); | ||
| 962 | for (i=disks; i--; ) { | 981 | for (i=disks; i--; ) { |
| 982 | int bitmap_end = 0; | ||
| 983 | spin_lock_irq(&conf->device_lock); | ||
| 963 | /* fail all writes first */ | 984 | /* fail all writes first */ |
| 964 | bi = sh->dev[i].towrite; | 985 | bi = sh->dev[i].towrite; |
| 965 | sh->dev[i].towrite = NULL; | 986 | sh->dev[i].towrite = NULL; |
| 966 | if (bi) to_write--; | 987 | if (bi) { to_write--; bitmap_end = 1; } |
| 967 | 988 | ||
| 968 | if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) | 989 | if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) |
| 969 | wake_up(&conf->wait_for_overlap); | 990 | wake_up(&conf->wait_for_overlap); |
| @@ -981,6 +1002,7 @@ static void handle_stripe(struct stripe_head *sh) | |||
| 981 | /* and fail all 'written' */ | 1002 | /* and fail all 'written' */ |
| 982 | bi = sh->dev[i].written; | 1003 | bi = sh->dev[i].written; |
| 983 | sh->dev[i].written = NULL; | 1004 | sh->dev[i].written = NULL; |
| 1005 | if (bi) bitmap_end = 1; | ||
| 984 | while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS) { | 1006 | while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS) { |
| 985 | struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector); | 1007 | struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector); |
| 986 | clear_bit(BIO_UPTODATE, &bi->bi_flags); | 1008 | clear_bit(BIO_UPTODATE, &bi->bi_flags); |
| @@ -1009,8 +1031,11 @@ static void handle_stripe(struct stripe_head *sh) | |||
| 1009 | bi = nextbi; | 1031 | bi = nextbi; |
| 1010 | } | 1032 | } |
| 1011 | } | 1033 | } |
| 1034 | spin_unlock_irq(&conf->device_lock); | ||
| 1035 | if (bitmap_end) | ||
| 1036 | bitmap_endwrite(conf->mddev->bitmap, sh->sector, | ||
| 1037 | STRIPE_SECTORS, 0, 0); | ||
| 1012 | } | 1038 | } |
| 1013 | spin_unlock_irq(&conf->device_lock); | ||
| 1014 | } | 1039 | } |
| 1015 | if (failed > 1 && syncing) { | 1040 | if (failed > 1 && syncing) { |
| 1016 | md_done_sync(conf->mddev, STRIPE_SECTORS,0); | 1041 | md_done_sync(conf->mddev, STRIPE_SECTORS,0); |
| @@ -1038,6 +1063,7 @@ static void handle_stripe(struct stripe_head *sh) | |||
| 1038 | test_bit(R5_UPTODATE, &dev->flags) ) { | 1063 | test_bit(R5_UPTODATE, &dev->flags) ) { |
| 1039 | /* We can return any write requests */ | 1064 | /* We can return any write requests */ |
| 1040 | struct bio *wbi, *wbi2; | 1065 | struct bio *wbi, *wbi2; |
| 1066 | int bitmap_end = 0; | ||
| 1041 | PRINTK("Return write for disc %d\n", i); | 1067 | PRINTK("Return write for disc %d\n", i); |
| 1042 | spin_lock_irq(&conf->device_lock); | 1068 | spin_lock_irq(&conf->device_lock); |
| 1043 | wbi = dev->written; | 1069 | wbi = dev->written; |
| @@ -1051,7 +1077,13 @@ static void handle_stripe(struct stripe_head *sh) | |||
| 1051 | } | 1077 | } |
| 1052 | wbi = wbi2; | 1078 | wbi = wbi2; |
| 1053 | } | 1079 | } |
| 1080 | if (dev->towrite == NULL) | ||
| 1081 | bitmap_end = 1; | ||
| 1054 | spin_unlock_irq(&conf->device_lock); | 1082 | spin_unlock_irq(&conf->device_lock); |
| 1083 | if (bitmap_end) | ||
| 1084 | bitmap_endwrite(conf->mddev->bitmap, sh->sector, | ||
| 1085 | STRIPE_SECTORS, | ||
| 1086 | !test_bit(STRIPE_DEGRADED, &sh->state), 0); | ||
| 1055 | } | 1087 | } |
| 1056 | } | 1088 | } |
| 1057 | } | 1089 | } |
| @@ -1175,7 +1207,8 @@ static void handle_stripe(struct stripe_head *sh) | |||
| 1175 | } | 1207 | } |
| 1176 | } | 1208 | } |
| 1177 | /* now if nothing is locked, and if we have enough data, we can start a write request */ | 1209 | /* now if nothing is locked, and if we have enough data, we can start a write request */ |
| 1178 | if (locked == 0 && (rcw == 0 ||rmw == 0)) { | 1210 | if (locked == 0 && (rcw == 0 ||rmw == 0) && |
| 1211 | !test_bit(STRIPE_BIT_DELAY, &sh->state)) { | ||
| 1179 | PRINTK("Computing parity...\n"); | 1212 | PRINTK("Computing parity...\n"); |
| 1180 | compute_parity(sh, rcw==0 ? RECONSTRUCT_WRITE : READ_MODIFY_WRITE); | 1213 | compute_parity(sh, rcw==0 ? RECONSTRUCT_WRITE : READ_MODIFY_WRITE); |
| 1181 | /* now every locked buffer is ready to be written */ | 1214 | /* now every locked buffer is ready to be written */ |
| @@ -1231,6 +1264,7 @@ static void handle_stripe(struct stripe_head *sh) | |||
| 1231 | dev = &sh->dev[failed_num]; | 1264 | dev = &sh->dev[failed_num]; |
| 1232 | set_bit(R5_LOCKED, &dev->flags); | 1265 | set_bit(R5_LOCKED, &dev->flags); |
| 1233 | set_bit(R5_Wantwrite, &dev->flags); | 1266 | set_bit(R5_Wantwrite, &dev->flags); |
| 1267 | clear_bit(STRIPE_DEGRADED, &sh->state); | ||
| 1234 | locked++; | 1268 | locked++; |
| 1235 | set_bit(STRIPE_INSYNC, &sh->state); | 1269 | set_bit(STRIPE_INSYNC, &sh->state); |
| 1236 | set_bit(R5_Syncio, &dev->flags); | 1270 | set_bit(R5_Syncio, &dev->flags); |
| @@ -1298,6 +1332,8 @@ static void handle_stripe(struct stripe_head *sh) | |||
| 1298 | bi->bi_next = NULL; | 1332 | bi->bi_next = NULL; |
| 1299 | generic_make_request(bi); | 1333 | generic_make_request(bi); |
| 1300 | } else { | 1334 | } else { |
| 1335 | if (rw == 1) | ||
| 1336 | set_bit(STRIPE_DEGRADED, &sh->state); | ||
| 1301 | PRINTK("skip op %ld on disc %d for sector %llu\n", | 1337 | PRINTK("skip op %ld on disc %d for sector %llu\n", |
| 1302 | bi->bi_rw, i, (unsigned long long)sh->sector); | 1338 | bi->bi_rw, i, (unsigned long long)sh->sector); |
| 1303 | clear_bit(R5_LOCKED, &sh->dev[i].flags); | 1339 | clear_bit(R5_LOCKED, &sh->dev[i].flags); |
| @@ -1322,6 +1358,20 @@ static inline void raid5_activate_delayed(raid5_conf_t *conf) | |||
| 1322 | } | 1358 | } |
| 1323 | } | 1359 | } |
| 1324 | 1360 | ||
| 1361 | static inline void activate_bit_delay(raid5_conf_t *conf) | ||
| 1362 | { | ||
| 1363 | /* device_lock is held */ | ||
| 1364 | struct list_head head; | ||
| 1365 | list_add(&head, &conf->bitmap_list); | ||
| 1366 | list_del_init(&conf->bitmap_list); | ||
| 1367 | while (!list_empty(&head)) { | ||
| 1368 | struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru); | ||
| 1369 | list_del_init(&sh->lru); | ||
| 1370 | atomic_inc(&sh->count); | ||
| 1371 | __release_stripe(conf, sh); | ||
| 1372 | } | ||
| 1373 | } | ||
| 1374 | |||
| 1325 | static void unplug_slaves(mddev_t *mddev) | 1375 | static void unplug_slaves(mddev_t *mddev) |
| 1326 | { | 1376 | { |
| 1327 | raid5_conf_t *conf = mddev_to_conf(mddev); | 1377 | raid5_conf_t *conf = mddev_to_conf(mddev); |
| @@ -1354,8 +1404,10 @@ static void raid5_unplug_device(request_queue_t *q) | |||
| 1354 | 1404 | ||
| 1355 | spin_lock_irqsave(&conf->device_lock, flags); | 1405 | spin_lock_irqsave(&conf->device_lock, flags); |
| 1356 | 1406 | ||
| 1357 | if (blk_remove_plug(q)) | 1407 | if (blk_remove_plug(q)) { |
| 1408 | conf->seq_flush++; | ||
| 1358 | raid5_activate_delayed(conf); | 1409 | raid5_activate_delayed(conf); |
| 1410 | } | ||
| 1359 | md_wakeup_thread(mddev->thread); | 1411 | md_wakeup_thread(mddev->thread); |
| 1360 | 1412 | ||
| 1361 | spin_unlock_irqrestore(&conf->device_lock, flags); | 1413 | spin_unlock_irqrestore(&conf->device_lock, flags); |
| @@ -1411,6 +1463,11 @@ static int make_request (request_queue_t *q, struct bio * bi) | |||
| 1411 | sector_t logical_sector, last_sector; | 1463 | sector_t logical_sector, last_sector; |
| 1412 | struct stripe_head *sh; | 1464 | struct stripe_head *sh; |
| 1413 | 1465 | ||
| 1466 | if (unlikely(bio_barrier(bi))) { | ||
| 1467 | bio_endio(bi, bi->bi_size, -EOPNOTSUPP); | ||
| 1468 | return 0; | ||
| 1469 | } | ||
| 1470 | |||
| 1414 | md_write_start(mddev, bi); | 1471 | md_write_start(mddev, bi); |
| 1415 | 1472 | ||
| 1416 | if (bio_data_dir(bi)==WRITE) { | 1473 | if (bio_data_dir(bi)==WRITE) { |
| @@ -1488,10 +1545,20 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i | |||
| 1488 | sector_t first_sector; | 1545 | sector_t first_sector; |
| 1489 | int raid_disks = conf->raid_disks; | 1546 | int raid_disks = conf->raid_disks; |
| 1490 | int data_disks = raid_disks-1; | 1547 | int data_disks = raid_disks-1; |
| 1548 | sector_t max_sector = mddev->size << 1; | ||
| 1549 | int sync_blocks; | ||
| 1491 | 1550 | ||
| 1492 | if (sector_nr >= mddev->size <<1) { | 1551 | if (sector_nr >= max_sector) { |
| 1493 | /* just being told to finish up .. nothing much to do */ | 1552 | /* just being told to finish up .. nothing much to do */ |
| 1494 | unplug_slaves(mddev); | 1553 | unplug_slaves(mddev); |
| 1554 | |||
| 1555 | if (mddev->curr_resync < max_sector) /* aborted */ | ||
| 1556 | bitmap_end_sync(mddev->bitmap, mddev->curr_resync, | ||
| 1557 | &sync_blocks, 1); | ||
| 1558 | else /* completed sync */ | ||
| 1559 | conf->fullsync = 0; | ||
| 1560 | bitmap_close_sync(mddev->bitmap); | ||
| 1561 | |||
| 1495 | return 0; | 1562 | return 0; |
| 1496 | } | 1563 | } |
| 1497 | /* if there is 1 or more failed drives and we are trying | 1564 | /* if there is 1 or more failed drives and we are trying |
| @@ -1503,6 +1570,13 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i | |||
| 1503 | *skipped = 1; | 1570 | *skipped = 1; |
| 1504 | return rv; | 1571 | return rv; |
| 1505 | } | 1572 | } |
| 1573 | if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) && | ||
| 1574 | !conf->fullsync && sync_blocks >= STRIPE_SECTORS) { | ||
| 1575 | /* we can skip this block, and probably more */ | ||
| 1576 | sync_blocks /= STRIPE_SECTORS; | ||
| 1577 | *skipped = 1; | ||
| 1578 | return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */ | ||
| 1579 | } | ||
| 1506 | 1580 | ||
| 1507 | x = sector_nr; | 1581 | x = sector_nr; |
| 1508 | chunk_offset = sector_div(x, sectors_per_chunk); | 1582 | chunk_offset = sector_div(x, sectors_per_chunk); |
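For example, with STRIPE_SECTORS at 8 and the bitmap reporting 1003 clean sectors ahead, the divide-then-multiply rounds down to 125 whole stripes and 1000 sectors are skipped, keeping sector_nr stripe-aligned for the next pass.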
| @@ -1520,6 +1594,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i | |||
| 1520 | set_current_state(TASK_UNINTERRUPTIBLE); | 1594 | set_current_state(TASK_UNINTERRUPTIBLE); |
| 1521 | schedule_timeout(1); | 1595 | schedule_timeout(1); |
| 1522 | } | 1596 | } |
| 1597 | bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 0); | ||
| 1523 | spin_lock(&sh->lock); | 1598 | spin_lock(&sh->lock); |
| 1524 | set_bit(STRIPE_SYNCING, &sh->state); | 1599 | set_bit(STRIPE_SYNCING, &sh->state); |
| 1525 | clear_bit(STRIPE_INSYNC, &sh->state); | 1600 | clear_bit(STRIPE_INSYNC, &sh->state); |
| @@ -1553,6 +1628,13 @@ static void raid5d (mddev_t *mddev) | |||
| 1553 | while (1) { | 1628 | while (1) { |
| 1554 | struct list_head *first; | 1629 | struct list_head *first; |
| 1555 | 1630 | ||
| 1631 | if (conf->seq_flush - conf->seq_write > 0) { | ||
| 1632 | int seq = conf->seq_flush; | ||
| 1633 | bitmap_unplug(mddev->bitmap); | ||
| 1634 | conf->seq_write = seq; | ||
| 1635 | activate_bit_delay(conf); | ||
| 1636 | } | ||
| 1637 | |||
| 1556 | if (list_empty(&conf->handle_list) && | 1638 | if (list_empty(&conf->handle_list) && |
| 1557 | atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD && | 1639 | atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD && |
| 1558 | !blk_queue_plugged(mddev->queue) && | 1640 | !blk_queue_plugged(mddev->queue) && |
| @@ -1586,7 +1668,7 @@ static void raid5d (mddev_t *mddev) | |||
| 1586 | PRINTK("--- raid5d inactive\n"); | 1668 | PRINTK("--- raid5d inactive\n"); |
| 1587 | } | 1669 | } |
| 1588 | 1670 | ||
| 1589 | static int run (mddev_t *mddev) | 1671 | static int run(mddev_t *mddev) |
| 1590 | { | 1672 | { |
| 1591 | raid5_conf_t *conf; | 1673 | raid5_conf_t *conf; |
| 1592 | int raid_disk, memory; | 1674 | int raid_disk, memory; |
| @@ -1616,6 +1698,7 @@ static int run (mddev_t *mddev) | |||
| 1616 | init_waitqueue_head(&conf->wait_for_overlap); | 1698 | init_waitqueue_head(&conf->wait_for_overlap); |
| 1617 | INIT_LIST_HEAD(&conf->handle_list); | 1699 | INIT_LIST_HEAD(&conf->handle_list); |
| 1618 | INIT_LIST_HEAD(&conf->delayed_list); | 1700 | INIT_LIST_HEAD(&conf->delayed_list); |
| 1701 | INIT_LIST_HEAD(&conf->bitmap_list); | ||
| 1619 | INIT_LIST_HEAD(&conf->inactive_list); | 1702 | INIT_LIST_HEAD(&conf->inactive_list); |
| 1620 | atomic_set(&conf->active_stripes, 0); | 1703 | atomic_set(&conf->active_stripes, 0); |
| 1621 | atomic_set(&conf->preread_active_stripes, 0); | 1704 | atomic_set(&conf->preread_active_stripes, 0); |
| @@ -1727,6 +1810,9 @@ memory = conf->max_nr_stripes * (sizeof(struct stripe_head) + | |||
| 1727 | 1810 | ||
| 1728 | /* Ok, everything is just fine now */ | 1811 | /* Ok, everything is just fine now */ |
| 1729 | 1812 | ||
| 1813 | if (mddev->bitmap) | ||
| 1814 | mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ; | ||
| 1815 | |||
| 1730 | mddev->queue->unplug_fn = raid5_unplug_device; | 1816 | mddev->queue->unplug_fn = raid5_unplug_device; |
| 1731 | mddev->queue->issue_flush_fn = raid5_issue_flush; | 1817 | mddev->queue->issue_flush_fn = raid5_issue_flush; |
| 1732 | 1818 | ||
| @@ -1907,6 +1993,8 @@ static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) | |||
| 1907 | rdev->in_sync = 0; | 1993 | rdev->in_sync = 0; |
| 1908 | rdev->raid_disk = disk; | 1994 | rdev->raid_disk = disk; |
| 1909 | found = 1; | 1995 | found = 1; |
| 1996 | if (rdev->saved_raid_disk != disk) | ||
| 1997 | conf->fullsync = 1; | ||
| 1910 | p->rdev = rdev; | 1998 | p->rdev = rdev; |
| 1911 | break; | 1999 | break; |
| 1912 | } | 2000 | } |
| @@ -1936,6 +2024,35 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors) | |||
| 1936 | return 0; | 2024 | return 0; |
| 1937 | } | 2025 | } |
| 1938 | 2026 | ||
| 2027 | static void raid5_quiesce(mddev_t *mddev, int state) | ||
| 2028 | { | ||
| 2029 | raid5_conf_t *conf = mddev_to_conf(mddev); | ||
| 2030 | |||
| 2031 | switch(state) { | ||
| 2032 | case 1: /* stop all writes */ | ||
| 2033 | spin_lock_irq(&conf->device_lock); | ||
| 2034 | conf->quiesce = 1; | ||
| 2035 | wait_event_lock_irq(conf->wait_for_stripe, | ||
| 2036 | atomic_read(&conf->active_stripes) == 0, | ||
| 2037 | conf->device_lock, /* nothing */); | ||
| 2038 | spin_unlock_irq(&conf->device_lock); | ||
| 2039 | break; | ||
| 2040 | |||
| 2041 | case 0: /* re-enable writes */ | ||
| 2042 | spin_lock_irq(&conf->device_lock); | ||
| 2043 | conf->quiesce = 0; | ||
| 2044 | wake_up(&conf->wait_for_stripe); | ||
| 2045 | spin_unlock_irq(&conf->device_lock); | ||
| 2046 | break; | ||
| 2047 | } | ||
| 2048 | if (mddev->thread) { | ||
| 2049 | if (mddev->bitmap) | ||
| 2050 | mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ; | ||
| 2051 | else | ||
| 2052 | mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT; | ||
| 2053 | md_wakeup_thread(mddev->thread); | ||
| 2054 | } | ||
| 2055 | } | ||
| 1939 | static mdk_personality_t raid5_personality= | 2056 | static mdk_personality_t raid5_personality= |
| 1940 | { | 2057 | { |
| 1941 | .name = "raid5", | 2058 | .name = "raid5", |
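raid5_quiesce pairs with the wait_event_lock_irq added to get_active_stripe earlier in this diff: case 1 raises conf->quiesce under device_lock and waits for active_stripes to drain, so new callers block at the top of their lookup loop; case 0 clears the flag and wakes conf->wait_for_stripe. Both paths then retune raid5d's wakeup interval to the bitmap daemon period, exactly as the raid1 version does.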
| @@ -1950,6 +2067,7 @@ static mdk_personality_t raid5_personality= | |||
| 1950 | .spare_active = raid5_spare_active, | 2067 | .spare_active = raid5_spare_active, |
| 1951 | .sync_request = sync_request, | 2068 | .sync_request = sync_request, |
| 1952 | .resize = raid5_resize, | 2069 | .resize = raid5_resize, |
| 2070 | .quiesce = raid5_quiesce, | ||
| 1953 | }; | 2071 | }; |
| 1954 | 2072 | ||
| 1955 | static int __init raid5_init (void) | 2073 | static int __init raid5_init (void) |
diff --git a/drivers/md/raid6main.c b/drivers/md/raid6main.c index 495dee1d1e83..267eb1430c83 100644 --- a/drivers/md/raid6main.c +++ b/drivers/md/raid6main.c | |||
| @@ -29,6 +29,8 @@ | |||
| 29 | #include <asm/atomic.h> | 29 | #include <asm/atomic.h> |
| 30 | #include "raid6.h" | 30 | #include "raid6.h" |
| 31 | 31 | ||
| 32 | #include <linux/raid/bitmap.h> | ||
| 33 | |||
| 32 | /* | 34 | /* |
| 33 | * Stripe cache | 35 | * Stripe cache |
| 34 | */ | 36 | */ |
| @@ -98,8 +100,13 @@ static inline void __release_stripe(raid6_conf_t *conf, struct stripe_head *sh) | |||
| 98 | if (test_bit(STRIPE_HANDLE, &sh->state)) { | 100 | if (test_bit(STRIPE_HANDLE, &sh->state)) { |
| 99 | if (test_bit(STRIPE_DELAYED, &sh->state)) | 101 | if (test_bit(STRIPE_DELAYED, &sh->state)) |
| 100 | list_add_tail(&sh->lru, &conf->delayed_list); | 102 | list_add_tail(&sh->lru, &conf->delayed_list); |
| 101 | else | 103 | else if (test_bit(STRIPE_BIT_DELAY, &sh->state) && |
| 104 | conf->seq_write == sh->bm_seq) | ||
| 105 | list_add_tail(&sh->lru, &conf->bitmap_list); | ||
| 106 | else { | ||
| 107 | clear_bit(STRIPE_BIT_DELAY, &sh->state); | ||
| 102 | list_add_tail(&sh->lru, &conf->handle_list); | 108 | list_add_tail(&sh->lru, &conf->handle_list); |
| 109 | } | ||
| 103 | md_wakeup_thread(conf->mddev->thread); | 110 | md_wakeup_thread(conf->mddev->thread); |
| 104 | } else { | 111 | } else { |
| 105 | if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { | 112 | if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { |
| @@ -262,6 +269,9 @@ static struct stripe_head *get_active_stripe(raid6_conf_t *conf, sector_t sector | |||
| 262 | spin_lock_irq(&conf->device_lock); | 269 | spin_lock_irq(&conf->device_lock); |
| 263 | 270 | ||
| 264 | do { | 271 | do { |
| 272 | wait_event_lock_irq(conf->wait_for_stripe, | ||
| 273 | conf->quiesce == 0, | ||
| 274 | conf->device_lock, /* nothing */); | ||
| 265 | sh = __find_stripe(conf, sector); | 275 | sh = __find_stripe(conf, sector); |
| 266 | if (!sh) { | 276 | if (!sh) { |
| 267 | if (!conf->inactive_blocked) | 277 | if (!conf->inactive_blocked) |
| @@ -906,6 +916,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in | |||
| 906 | { | 916 | { |
| 907 | struct bio **bip; | 917 | struct bio **bip; |
| 908 | raid6_conf_t *conf = sh->raid_conf; | 918 | raid6_conf_t *conf = sh->raid_conf; |
| 919 | int firstwrite=0; | ||
| 909 | 920 | ||
| 910 | PRINTK("adding bh b#%llu to stripe s#%llu\n", | 921 | PRINTK("adding bh b#%llu to stripe s#%llu\n", |
| 911 | (unsigned long long)bi->bi_sector, | 922 | (unsigned long long)bi->bi_sector, |
| @@ -914,9 +925,11 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in | |||
| 914 | 925 | ||
| 915 | spin_lock(&sh->lock); | 926 | spin_lock(&sh->lock); |
| 916 | spin_lock_irq(&conf->device_lock); | 927 | spin_lock_irq(&conf->device_lock); |
| 917 | if (forwrite) | 928 | if (forwrite) { |
| 918 | bip = &sh->dev[dd_idx].towrite; | 929 | bip = &sh->dev[dd_idx].towrite; |
| 919 | else | 930 | if (*bip == NULL && sh->dev[dd_idx].written == NULL) |
| 931 | firstwrite = 1; | ||
| 932 | } else | ||
| 920 | bip = &sh->dev[dd_idx].toread; | 933 | bip = &sh->dev[dd_idx].toread; |
| 921 | while (*bip && (*bip)->bi_sector < bi->bi_sector) { | 934 | while (*bip && (*bip)->bi_sector < bi->bi_sector) { |
| 922 | if ((*bip)->bi_sector + ((*bip)->bi_size >> 9) > bi->bi_sector) | 935 | if ((*bip)->bi_sector + ((*bip)->bi_size >> 9) > bi->bi_sector) |
| @@ -939,6 +952,13 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in | |||
| 939 | (unsigned long long)bi->bi_sector, | 952 | (unsigned long long)bi->bi_sector, |
| 940 | (unsigned long long)sh->sector, dd_idx); | 953 | (unsigned long long)sh->sector, dd_idx); |
| 941 | 954 | ||
| 955 | if (conf->mddev->bitmap && firstwrite) { | ||
| 956 | sh->bm_seq = conf->seq_write; | ||
| 957 | bitmap_startwrite(conf->mddev->bitmap, sh->sector, | ||
| 958 | STRIPE_SECTORS, 0); | ||
| 959 | set_bit(STRIPE_BIT_DELAY, &sh->state); | ||
| 960 | } | ||
| 961 | |||
| 942 | if (forwrite) { | 962 | if (forwrite) { |
| 943 | /* check if page is covered */ | 963 | /* check if page is covered */ |
| 944 | sector_t sector = sh->dev[dd_idx].sector; | 964 | sector_t sector = sh->dev[dd_idx].sector; |
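
These two hunks are the producer side of the intent log: a device slot that goes from write-idle to write-busy dirties the bitmap bit covering its chunk, and STRIPE_BIT_DELAY holds the data writes back until that bit is flushed (see __release_stripe above). Condensed:

    if (forwrite) {
            bip = &sh->dev[dd_idx].towrite;
            if (*bip == NULL && sh->dev[dd_idx].written == NULL)
                    firstwrite = 1;        /* slot had no write queued or in flight */
    }
    /* ... the bio is linked into the slot's list here ... */
    if (conf->mddev->bitmap && firstwrite) {
            sh->bm_seq = conf->seq_write;  /* tag with the current flush epoch */
            bitmap_startwrite(conf->mddev->bitmap, sh->sector,
                              STRIPE_SECTORS, 0);
            set_bit(STRIPE_BIT_DELAY, &sh->state);
    }

Each bitmap_startwrite() is balanced by a bitmap_endwrite() in the handle_stripe() hunks below, once that slot's writes have retired.
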
| @@ -1066,12 +1086,13 @@ static void handle_stripe(struct stripe_head *sh) | |||
| 1066 | * need to be failed | 1086 | * need to be failed |
| 1067 | */ | 1087 | */ |
| 1068 | if (failed > 2 && to_read+to_write+written) { | 1088 | if (failed > 2 && to_read+to_write+written) { |
| 1069 | spin_lock_irq(&conf->device_lock); | ||
| 1070 | for (i=disks; i--; ) { | 1089 | for (i=disks; i--; ) { |
| 1090 | int bitmap_end = 0; | ||
| 1091 | spin_lock_irq(&conf->device_lock); | ||
| 1071 | /* fail all writes first */ | 1092 | /* fail all writes first */ |
| 1072 | bi = sh->dev[i].towrite; | 1093 | bi = sh->dev[i].towrite; |
| 1073 | sh->dev[i].towrite = NULL; | 1094 | sh->dev[i].towrite = NULL; |
| 1074 | if (bi) to_write--; | 1095 | if (bi) { to_write--; bitmap_end = 1; } |
| 1075 | 1096 | ||
| 1076 | if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) | 1097 | if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) |
| 1077 | wake_up(&conf->wait_for_overlap); | 1098 | wake_up(&conf->wait_for_overlap); |
| @@ -1089,6 +1110,7 @@ static void handle_stripe(struct stripe_head *sh) | |||
| 1089 | /* and fail all 'written' */ | 1110 | /* and fail all 'written' */ |
| 1090 | bi = sh->dev[i].written; | 1111 | bi = sh->dev[i].written; |
| 1091 | sh->dev[i].written = NULL; | 1112 | sh->dev[i].written = NULL; |
| 1113 | if (bi) bitmap_end = 1; | ||
| 1092 | while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS) { | 1114 | while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS) { |
| 1093 | struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector); | 1115 | struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector); |
| 1094 | clear_bit(BIO_UPTODATE, &bi->bi_flags); | 1116 | clear_bit(BIO_UPTODATE, &bi->bi_flags); |
| @@ -1117,8 +1139,11 @@ static void handle_stripe(struct stripe_head *sh) | |||
| 1117 | bi = nextbi; | 1139 | bi = nextbi; |
| 1118 | } | 1140 | } |
| 1119 | } | 1141 | } |
| 1142 | spin_unlock_irq(&conf->device_lock); | ||
| 1143 | if (bitmap_end) | ||
| 1144 | bitmap_endwrite(conf->mddev->bitmap, sh->sector, | ||
| 1145 | STRIPE_SECTORS, 0, 0); | ||
| 1120 | } | 1146 | } |
| 1121 | spin_unlock_irq(&conf->device_lock); | ||
| 1122 | } | 1147 | } |
| 1123 | if (failed > 2 && syncing) { | 1148 | if (failed > 2 && syncing) { |
| 1124 | md_done_sync(conf->mddev, STRIPE_SECTORS,0); | 1149 | md_done_sync(conf->mddev, STRIPE_SECTORS,0); |
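
The lock scope change here is deliberate: conf->device_lock is now taken and dropped once per device, so that bitmap_endwrite() runs without the spinlock held. The resulting shape, condensed:

    for (i = disks; i--; ) {
            int bitmap_end = 0;

            spin_lock_irq(&conf->device_lock);
            /* ... detach and fail this slot's towrite/written bios,
             * setting bitmap_end if the slot had writes outstanding ... */
            spin_unlock_irq(&conf->device_lock);

            if (bitmap_end)   /* balance the earlier bitmap_startwrite() */
                    bitmap_endwrite(conf->mddev->bitmap, sh->sector,
                                    STRIPE_SECTORS, 0, 0);
    }

success is passed as 0 on this failure path, so the chunk's bit stays dirty and the region is revisited by a later resync.
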
| @@ -1155,6 +1180,7 @@ static void handle_stripe(struct stripe_head *sh) | |||
| 1155 | if (!test_bit(R5_LOCKED, &dev->flags) && | 1180 | if (!test_bit(R5_LOCKED, &dev->flags) && |
| 1156 | test_bit(R5_UPTODATE, &dev->flags) ) { | 1181 | test_bit(R5_UPTODATE, &dev->flags) ) { |
| 1157 | /* We can return any write requests */ | 1182 | /* We can return any write requests */ |
| 1183 | int bitmap_end = 0; | ||
| 1158 | struct bio *wbi, *wbi2; | 1184 | struct bio *wbi, *wbi2; |
| 1159 | PRINTK("Return write for stripe %llu disc %d\n", | 1185 | PRINTK("Return write for stripe %llu disc %d\n", |
| 1160 | (unsigned long long)sh->sector, i); | 1186 | (unsigned long long)sh->sector, i); |
| @@ -1170,7 +1196,13 @@ static void handle_stripe(struct stripe_head *sh) | |||
| 1170 | } | 1196 | } |
| 1171 | wbi = wbi2; | 1197 | wbi = wbi2; |
| 1172 | } | 1198 | } |
| 1199 | if (dev->towrite == NULL) | ||
| 1200 | bitmap_end = 1; | ||
| 1173 | spin_unlock_irq(&conf->device_lock); | 1201 | spin_unlock_irq(&conf->device_lock); |
| 1202 | if (bitmap_end) | ||
| 1203 | bitmap_endwrite(conf->mddev->bitmap, sh->sector, | ||
| 1204 | STRIPE_SECTORS, | ||
| 1205 | !test_bit(STRIPE_DEGRADED, &sh->state), 0); | ||
| 1174 | } | 1206 | } |
| 1175 | } | 1207 | } |
| 1176 | } | 1208 | } |
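
On the completion path the success argument is computed rather than hard-coded (the final 0 appears to be the write-behind flag, added elsewhere in this series):

    bitmap_endwrite(conf->mddev->bitmap, sh->sector, STRIPE_SECTORS,
                    !test_bit(STRIPE_DEGRADED, &sh->state), /* may clear bit */
                    0);

STRIPE_DEGRADED is set by a later hunk when a write had to be skipped on a failed disk (the rw == 1 branch in the generic_make_request path below) and cleared when recovery rewrites the failed block, so the bitmap bit only clears once the stripe is fully written on all members.
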
| @@ -1285,7 +1317,8 @@ static void handle_stripe(struct stripe_head *sh) | |||
| 1285 | } | 1317 | } |
| 1286 | } | 1318 | } |
| 1287 | /* now if nothing is locked, and if we have enough data, we can start a write request */ | 1319 | /* now if nothing is locked, and if we have enough data, we can start a write request */ |
| 1288 | if (locked == 0 && rcw == 0) { | 1320 | if (locked == 0 && rcw == 0 && |
| 1321 | !test_bit(STRIPE_BIT_DELAY, &sh->state)) { | ||
| 1289 | if ( must_compute > 0 ) { | 1322 | if ( must_compute > 0 ) { |
| 1290 | /* We have failed blocks and need to compute them */ | 1323 | /* We have failed blocks and need to compute them */ |
| 1291 | switch ( failed ) { | 1324 | switch ( failed ) { |
| @@ -1388,6 +1421,7 @@ static void handle_stripe(struct stripe_head *sh) | |||
| 1388 | bdev = &sh->dev[failed_num[1]]; | 1421 | bdev = &sh->dev[failed_num[1]]; |
| 1389 | locked += !test_bit(R5_LOCKED, &bdev->flags); | 1422 | locked += !test_bit(R5_LOCKED, &bdev->flags); |
| 1390 | set_bit(R5_LOCKED, &bdev->flags); | 1423 | set_bit(R5_LOCKED, &bdev->flags); |
| 1424 | clear_bit(STRIPE_DEGRADED, &sh->state); | ||
| 1391 | set_bit(R5_Wantwrite, &bdev->flags); | 1425 | set_bit(R5_Wantwrite, &bdev->flags); |
| 1392 | 1426 | ||
| 1393 | set_bit(STRIPE_INSYNC, &sh->state); | 1427 | set_bit(STRIPE_INSYNC, &sh->state); |
| @@ -1457,6 +1491,8 @@ static void handle_stripe(struct stripe_head *sh) | |||
| 1457 | bi->bi_next = NULL; | 1491 | bi->bi_next = NULL; |
| 1458 | generic_make_request(bi); | 1492 | generic_make_request(bi); |
| 1459 | } else { | 1493 | } else { |
| 1494 | if (rw == 1) | ||
| 1495 | set_bit(STRIPE_DEGRADED, &sh->state); | ||
| 1460 | PRINTK("skip op %ld on disc %d for sector %llu\n", | 1496 | PRINTK("skip op %ld on disc %d for sector %llu\n", |
| 1461 | bi->bi_rw, i, (unsigned long long)sh->sector); | 1497 | bi->bi_rw, i, (unsigned long long)sh->sector); |
| 1462 | clear_bit(R5_LOCKED, &sh->dev[i].flags); | 1498 | clear_bit(R5_LOCKED, &sh->dev[i].flags); |
| @@ -1481,6 +1517,20 @@ static inline void raid6_activate_delayed(raid6_conf_t *conf) | |||
| 1481 | } | 1517 | } |
| 1482 | } | 1518 | } |
| 1483 | 1519 | ||
| 1520 | static inline void activate_bit_delay(raid6_conf_t *conf) | ||
| 1521 | { | ||
| 1522 | /* device_lock is held */ | ||
| 1523 | struct list_head head; | ||
| 1524 | list_add(&head, &conf->bitmap_list); | ||
| 1525 | list_del_init(&conf->bitmap_list); | ||
| 1526 | while (!list_empty(&head)) { | ||
| 1527 | struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru); | ||
| 1528 | list_del_init(&sh->lru); | ||
| 1529 | atomic_inc(&sh->count); | ||
| 1530 | __release_stripe(conf, sh); | ||
| 1531 | } | ||
| 1532 | } | ||
| 1533 | |||
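
The two list calls at the top of activate_bit_delay() are an open-coded splice: "head" takes over the entire bitmap_list and the original is left empty. An equivalent formulation using the stock helper:

    struct list_head head;

    INIT_LIST_HEAD(&head);
    list_splice_init(&conf->bitmap_list, &head);  /* same effect */

Each stripe then gets a temporary reference and is pushed back through __release_stripe(); since seq_write has advanced by the time this runs, the STRIPE_BIT_DELAY test no longer parks it and it lands on handle_list.
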
| 1484 | static void unplug_slaves(mddev_t *mddev) | 1534 | static void unplug_slaves(mddev_t *mddev) |
| 1485 | { | 1535 | { |
| 1486 | raid6_conf_t *conf = mddev_to_conf(mddev); | 1536 | raid6_conf_t *conf = mddev_to_conf(mddev); |
| @@ -1513,8 +1563,10 @@ static void raid6_unplug_device(request_queue_t *q) | |||
| 1513 | 1563 | ||
| 1514 | spin_lock_irqsave(&conf->device_lock, flags); | 1564 | spin_lock_irqsave(&conf->device_lock, flags); |
| 1515 | 1565 | ||
| 1516 | if (blk_remove_plug(q)) | 1566 | if (blk_remove_plug(q)) { |
| 1567 | conf->seq_flush++; | ||
| 1517 | raid6_activate_delayed(conf); | 1568 | raid6_activate_delayed(conf); |
| 1569 | } | ||
| 1518 | md_wakeup_thread(mddev->thread); | 1570 | md_wakeup_thread(mddev->thread); |
| 1519 | 1571 | ||
| 1520 | spin_unlock_irqrestore(&conf->device_lock, flags); | 1572 | spin_unlock_irqrestore(&conf->device_lock, flags); |
| @@ -1570,6 +1622,11 @@ static int make_request (request_queue_t *q, struct bio * bi) | |||
| 1570 | sector_t logical_sector, last_sector; | 1622 | sector_t logical_sector, last_sector; |
| 1571 | struct stripe_head *sh; | 1623 | struct stripe_head *sh; |
| 1572 | 1624 | ||
| 1625 | if (unlikely(bio_barrier(bi))) { | ||
| 1626 | bio_endio(bi, bi->bi_size, -EOPNOTSUPP); | ||
| 1627 | return 0; | ||
| 1628 | } | ||
| 1629 | |||
| 1573 | md_write_start(mddev, bi); | 1630 | md_write_start(mddev, bi); |
| 1574 | 1631 | ||
| 1575 | if (bio_data_dir(bi)==WRITE) { | 1632 | if (bio_data_dir(bi)==WRITE) { |
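
Barrier bios cannot be honoured across striped members, so they are rejected up front; the submitter is expected to see -EOPNOTSUPP and retry without the barrier flag:

    if (unlikely(bio_barrier(bi))) {
            bio_endio(bi, bi->bi_size, -EOPNOTSUPP); /* complete immediately */
            return 0;                                /* bio fully handled */
    }
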
| @@ -1647,10 +1704,20 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i | |||
| 1647 | sector_t first_sector; | 1704 | sector_t first_sector; |
| 1648 | int raid_disks = conf->raid_disks; | 1705 | int raid_disks = conf->raid_disks; |
| 1649 | int data_disks = raid_disks - 2; | 1706 | int data_disks = raid_disks - 2; |
| 1707 | sector_t max_sector = mddev->size << 1; | ||
| 1708 | int sync_blocks; | ||
| 1650 | 1709 | ||
| 1651 | if (sector_nr >= mddev->size <<1) { | 1710 | if (sector_nr >= max_sector) { |
| 1652 | /* just being told to finish up .. nothing much to do */ | 1711 | /* just being told to finish up .. nothing much to do */ |
| 1653 | unplug_slaves(mddev); | 1712 | unplug_slaves(mddev); |
| 1713 | |||
| 1714 | if (mddev->curr_resync < max_sector) /* aborted */ | ||
| 1715 | bitmap_end_sync(mddev->bitmap, mddev->curr_resync, | ||
| 1716 | &sync_blocks, 1); | ||
| 1717 | else /* completed sync */ | ||
| 1718 | conf->fullsync = 0; | ||
| 1719 | bitmap_close_sync(mddev->bitmap); | ||
| 1720 | |||
| 1654 | return 0; | 1721 | return 0; |
| 1655 | } | 1722 | } |
| 1656 | /* if there are 2 or more failed drives and we are trying | 1723 | /* if there are 2 or more failed drives and we are trying |
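
End-of-resync handling, condensed from the hunk above: an aborted resync must leave the not-yet-synced bits set, while a completed one clears the way for future bitmap-guided syncs:

    if (mddev->curr_resync < max_sector)            /* aborted part-way */
            bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
                            &sync_blocks, 1);       /* keep remaining bits */
    else                                            /* completed sync */
            conf->fullsync = 0;                     /* bitmap usable next time */
    bitmap_close_sync(mddev->bitmap);               /* flush sync bookkeeping */
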
| @@ -1662,6 +1729,13 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i | |||
| 1662 | *skipped = 1; | 1729 | *skipped = 1; |
| 1663 | return rv; | 1730 | return rv; |
| 1664 | } | 1731 | } |
| 1732 | if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) && | ||
| 1733 | !conf->fullsync && sync_blocks >= STRIPE_SECTORS) { | ||
| 1734 | /* we can skip this block, and probably more */ | ||
| 1735 | sync_blocks /= STRIPE_SECTORS; | ||
| 1736 | *skipped = 1; | ||
| 1737 | return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */ | ||
| 1738 | } | ||
| 1665 | 1739 | ||
| 1666 | x = sector_nr; | 1740 | x = sector_nr; |
| 1667 | chunk_offset = sector_div(x, sectors_per_chunk); | 1741 | chunk_offset = sector_div(x, sectors_per_chunk); |
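
The skip is rounded down to whole stripes so sync_request() stays stripe-aligned. A worked example, assuming 4 KiB pages (so STRIPE_SECTORS == 8):

    /* bitmap says the next sync_blocks == 1003 sectors are clean */
    sync_blocks /= STRIPE_SECTORS;        /* 1003 / 8 = 125 whole stripes */
    *skipped = 1;
    return sync_blocks * STRIPE_SECTORS;  /* 125 * 8 = 1000 sectors skipped;
                                           * the 3-sector tail is re-examined
                                           * on the next call */
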
| @@ -1679,6 +1753,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i | |||
| 1679 | set_current_state(TASK_UNINTERRUPTIBLE); | 1753 | set_current_state(TASK_UNINTERRUPTIBLE); |
| 1680 | schedule_timeout(1); | 1754 | schedule_timeout(1); |
| 1681 | } | 1755 | } |
| 1756 | bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 0); | ||
| 1682 | spin_lock(&sh->lock); | 1757 | spin_lock(&sh->lock); |
| 1683 | set_bit(STRIPE_SYNCING, &sh->state); | 1758 | set_bit(STRIPE_SYNCING, &sh->state); |
| 1684 | clear_bit(STRIPE_INSYNC, &sh->state); | 1759 | clear_bit(STRIPE_INSYNC, &sh->state); |
| @@ -1712,6 +1787,13 @@ static void raid6d (mddev_t *mddev) | |||
| 1712 | while (1) { | 1787 | while (1) { |
| 1713 | struct list_head *first; | 1788 | struct list_head *first; |
| 1714 | 1789 | ||
| 1790 | if (conf->seq_flush - conf->seq_write > 0) { | ||
| 1791 | int seq = conf->seq_flush; | ||
| 1792 | bitmap_unplug(mddev->bitmap); | ||
| 1793 | conf->seq_write = seq; | ||
| 1794 | activate_bit_delay(conf); | ||
| 1795 | } | ||
| 1796 | |||
| 1715 | if (list_empty(&conf->handle_list) && | 1797 | if (list_empty(&conf->handle_list) && |
| 1716 | atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD && | 1798 | atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD && |
| 1717 | !blk_queue_plugged(mddev->queue) && | 1799 | !blk_queue_plugged(mddev->queue) && |
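
This is where the two sequence counters meet. raid6_unplug_device() bumps seq_flush whenever the queue is unplugged; the daemon notices the gap, flushes the bitmap, and only then releases the parked stripes:

    /* unplug side (raid6_unplug_device):
     *     conf->seq_flush++;                        note that a flush is owed
     * daemon side (raid6d):
     *     if (conf->seq_flush - conf->seq_write > 0) {
     *             int seq = conf->seq_flush;
     *             bitmap_unplug(mddev->bitmap);     wait for bits to hit disk
     *             conf->seq_write = seq;            publish the new epoch
     *             activate_bit_delay(conf);         release parked stripes
     *     }
     */

bitmap_unplug() returns only after the dirtied bitmap pages have been written, and seq_write advances after that, so data writes for a parked stripe start strictly after their intent bit is durable, which is the ordering the whole scheme depends on.
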
| @@ -1745,7 +1827,7 @@ static void raid6d (mddev_t *mddev) | |||
| 1745 | PRINTK("--- raid6d inactive\n"); | 1827 | PRINTK("--- raid6d inactive\n"); |
| 1746 | } | 1828 | } |
| 1747 | 1829 | ||
| 1748 | static int run (mddev_t *mddev) | 1830 | static int run(mddev_t *mddev) |
| 1749 | { | 1831 | { |
| 1750 | raid6_conf_t *conf; | 1832 | raid6_conf_t *conf; |
| 1751 | int raid_disk, memory; | 1833 | int raid_disk, memory; |
| @@ -1775,6 +1857,7 @@ static int run (mddev_t *mddev) | |||
| 1775 | init_waitqueue_head(&conf->wait_for_overlap); | 1857 | init_waitqueue_head(&conf->wait_for_overlap); |
| 1776 | INIT_LIST_HEAD(&conf->handle_list); | 1858 | INIT_LIST_HEAD(&conf->handle_list); |
| 1777 | INIT_LIST_HEAD(&conf->delayed_list); | 1859 | INIT_LIST_HEAD(&conf->delayed_list); |
| 1860 | INIT_LIST_HEAD(&conf->bitmap_list); | ||
| 1778 | INIT_LIST_HEAD(&conf->inactive_list); | 1861 | INIT_LIST_HEAD(&conf->inactive_list); |
| 1779 | atomic_set(&conf->active_stripes, 0); | 1862 | atomic_set(&conf->active_stripes, 0); |
| 1780 | atomic_set(&conf->preread_active_stripes, 0); | 1863 | atomic_set(&conf->preread_active_stripes, 0); |
| @@ -1894,6 +1977,9 @@ static int run (mddev_t *mddev) | |||
| 1894 | /* Ok, everything is just fine now */ | 1977 | /* Ok, everything is just fine now */ |
| 1895 | mddev->array_size = mddev->size * (mddev->raid_disks - 2); | 1978 | mddev->array_size = mddev->size * (mddev->raid_disks - 2); |
| 1896 | 1979 | ||
| 1980 | if (mddev->bitmap) | ||
| 1981 | mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ; | ||
| 1982 | |||
| 1897 | mddev->queue->unplug_fn = raid6_unplug_device; | 1983 | mddev->queue->unplug_fn = raid6_unplug_device; |
| 1898 | mddev->queue->issue_flush_fn = raid6_issue_flush; | 1984 | mddev->queue->issue_flush_fn = raid6_issue_flush; |
| 1899 | return 0; | 1985 | return 0; |
| @@ -2071,6 +2157,8 @@ static int raid6_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) | |||
| 2071 | rdev->in_sync = 0; | 2157 | rdev->in_sync = 0; |
| 2072 | rdev->raid_disk = disk; | 2158 | rdev->raid_disk = disk; |
| 2073 | found = 1; | 2159 | found = 1; |
| 2160 | if (rdev->saved_raid_disk != disk) | ||
| 2161 | conf->fullsync = 1; | ||
| 2074 | p->rdev = rdev; | 2162 | p->rdev = rdev; |
| 2075 | break; | 2163 | break; |
| 2076 | } | 2164 | } |
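
Why the slot comparison matters: the bitmap only describes which chunks changed while this member was absent from this slot, so a device that comes back anywhere else cannot be trusted to a partial resync:

    if (rdev->saved_raid_disk != disk)
            conf->fullsync = 1;   /* different slot: bitmap is no help,
                                   * force a complete resync */

When the slot does match, the next sync_request() pass consults the bitmap (see the bitmap_start_sync() hunk above) and skips chunks that were never written while the disk was missing.
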
| @@ -2100,6 +2188,35 @@ static int raid6_resize(mddev_t *mddev, sector_t sectors) | |||
| 2100 | return 0; | 2188 | return 0; |
| 2101 | } | 2189 | } |
| 2102 | 2190 | ||
| 2191 | static void raid6_quiesce(mddev_t *mddev, int state) | ||
| 2192 | { | ||
| 2193 | raid6_conf_t *conf = mddev_to_conf(mddev); | ||
| 2194 | |||
| 2195 | switch(state) { | ||
| 2196 | case 1: /* stop all writes */ | ||
| 2197 | spin_lock_irq(&conf->device_lock); | ||
| 2198 | conf->quiesce = 1; | ||
| 2199 | wait_event_lock_irq(conf->wait_for_stripe, | ||
| 2200 | atomic_read(&conf->active_stripes) == 0, | ||
| 2201 | conf->device_lock, /* nothing */); | ||
| 2202 | spin_unlock_irq(&conf->device_lock); | ||
| 2203 | break; | ||
| 2204 | |||
| 2205 | case 0: /* re-enable writes */ | ||
| 2206 | spin_lock_irq(&conf->device_lock); | ||
| 2207 | conf->quiesce = 0; | ||
| 2208 | wake_up(&conf->wait_for_stripe); | ||
| 2209 | spin_unlock_irq(&conf->device_lock); | ||
| 2210 | break; | ||
| 2211 | } | ||
| 2212 | if (mddev->thread) { | ||
| 2213 | if (mddev->bitmap) | ||
| 2214 | mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ; | ||
| 2215 | else | ||
| 2216 | mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT; | ||
| 2217 | md_wakeup_thread(mddev->thread); | ||
| 2218 | } | ||
| 2219 | } | ||
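
How the two sides of the quiesce handshake pair up:

    /* quiesce side (raid6_quiesce(mddev, 1)):
     *     conf->quiesce = 1;                      block new stripes
     *     wait until active_stripes == 0          drain existing ones
     * stripe side (get_active_stripe):
     *     wait until conf->quiesce == 0           before taking a stripe
     * Both waits sleep on conf->wait_for_stripe with device_lock dropped,
     * so the final __release_stripe() wake_up() unblocks the quiescer and
     * raid6_quiesce(mddev, 0) unblocks stalled writers.
     */

The thread timeout is re-derived at the end because quiesce brackets bitmap attach and detach, which changes whether the daemon must wake periodically at all.
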
| 2103 | static mdk_personality_t raid6_personality= | 2220 | static mdk_personality_t raid6_personality= |
| 2104 | { | 2221 | { |
| 2105 | .name = "raid6", | 2222 | .name = "raid6", |
| @@ -2114,6 +2231,7 @@ static mdk_personality_t raid6_personality= | |||
| 2114 | .spare_active = raid6_spare_active, | 2231 | .spare_active = raid6_spare_active, |
| 2115 | .sync_request = sync_request, | 2232 | .sync_request = sync_request, |
| 2116 | .resize = raid6_resize, | 2233 | .resize = raid6_resize, |
| 2234 | .quiesce = raid6_quiesce, | ||
| 2117 | }; | 2235 | }; |
| 2118 | 2236 | ||
| 2119 | static int __init raid6_init (void) | 2237 | static int __init raid6_init (void) |
