Diffstat (limited to 'drivers/md')
-rw-r--r--  drivers/md/bitmap.c               183
-rw-r--r--  drivers/md/dm-exception-store.c     9
-rw-r--r--  drivers/md/dm-raid1.c              12
-rw-r--r--  drivers/md/linear.c               100
-rw-r--r--  drivers/md/md.c                   227
-rw-r--r--  drivers/md/multipath.c              5
-rw-r--r--  drivers/md/raid0.c                  5
-rw-r--r--  drivers/md/raid1.c                234
-rw-r--r--  drivers/md/raid10.c                46
-rw-r--r--  drivers/md/raid5.c                138
-rw-r--r--  drivers/md/raid6main.c            138
11 files changed, 856 insertions, 241 deletions
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 41df4cda66e2..2fba2bbe72d8 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -270,19 +270,20 @@ static struct page *read_sb_page(mddev_t *mddev, long offset, unsigned long inde
270 270
271 if (!page) 271 if (!page)
272 return ERR_PTR(-ENOMEM); 272 return ERR_PTR(-ENOMEM);
273 do {
274 ITERATE_RDEV(mddev, rdev, tmp)
275 if (rdev->in_sync && !rdev->faulty)
276 goto found;
277 return ERR_PTR(-EIO);
278 273
279 found: 274 ITERATE_RDEV(mddev, rdev, tmp) {
275 if (! rdev->in_sync || rdev->faulty)
276 continue;
277
280 target = (rdev->sb_offset << 1) + offset + index * (PAGE_SIZE/512); 278 target = (rdev->sb_offset << 1) + offset + index * (PAGE_SIZE/512);
281 279
282 } while (!sync_page_io(rdev->bdev, target, PAGE_SIZE, page, READ)); 280 if (sync_page_io(rdev->bdev, target, PAGE_SIZE, page, READ)) {
281 page->index = index;
282 return page;
283 }
284 }
285 return ERR_PTR(-EIO);
283 286
284 page->index = index;
285 return page;
286} 287}
287 288
288static int write_sb_page(mddev_t *mddev, long offset, struct page *page, int wait) 289static int write_sb_page(mddev_t *mddev, long offset, struct page *page, int wait)
@@ -437,6 +438,7 @@ void bitmap_print_sb(struct bitmap *bitmap)
437 printk(KERN_DEBUG " daemon sleep: %ds\n", le32_to_cpu(sb->daemon_sleep)); 438 printk(KERN_DEBUG " daemon sleep: %ds\n", le32_to_cpu(sb->daemon_sleep));
438 printk(KERN_DEBUG " sync size: %llu KB\n", 439 printk(KERN_DEBUG " sync size: %llu KB\n",
439 (unsigned long long)le64_to_cpu(sb->sync_size)/2); 440 (unsigned long long)le64_to_cpu(sb->sync_size)/2);
441 printk(KERN_DEBUG "max write behind: %d\n", le32_to_cpu(sb->write_behind));
440 kunmap(bitmap->sb_page); 442 kunmap(bitmap->sb_page);
441} 443}
442 444
@@ -445,7 +447,7 @@ static int bitmap_read_sb(struct bitmap *bitmap)
445{ 447{
446 char *reason = NULL; 448 char *reason = NULL;
447 bitmap_super_t *sb; 449 bitmap_super_t *sb;
448 unsigned long chunksize, daemon_sleep; 450 unsigned long chunksize, daemon_sleep, write_behind;
449 unsigned long bytes_read; 451 unsigned long bytes_read;
450 unsigned long long events; 452 unsigned long long events;
451 int err = -EINVAL; 453 int err = -EINVAL;
@@ -474,6 +476,7 @@ static int bitmap_read_sb(struct bitmap *bitmap)
474 476
475 chunksize = le32_to_cpu(sb->chunksize); 477 chunksize = le32_to_cpu(sb->chunksize);
476 daemon_sleep = le32_to_cpu(sb->daemon_sleep); 478 daemon_sleep = le32_to_cpu(sb->daemon_sleep);
479 write_behind = le32_to_cpu(sb->write_behind);
477 480
478 /* verify that the bitmap-specific fields are valid */ 481 /* verify that the bitmap-specific fields are valid */
479 if (sb->magic != cpu_to_le32(BITMAP_MAGIC)) 482 if (sb->magic != cpu_to_le32(BITMAP_MAGIC))
@@ -485,7 +488,9 @@ static int bitmap_read_sb(struct bitmap *bitmap)
485 else if ((1 << ffz(~chunksize)) != chunksize) 488 else if ((1 << ffz(~chunksize)) != chunksize)
486 reason = "bitmap chunksize not a power of 2"; 489 reason = "bitmap chunksize not a power of 2";
487 else if (daemon_sleep < 1 || daemon_sleep > 15) 490 else if (daemon_sleep < 1 || daemon_sleep > 15)
488 reason = "daemon sleep period out of range"; 491 reason = "daemon sleep period out of range (1-15s)";
492 else if (write_behind > COUNTER_MAX)
493 reason = "write-behind limit out of range (0 - 16383)";
489 if (reason) { 494 if (reason) {
490 printk(KERN_INFO "%s: invalid bitmap file superblock: %s\n", 495 printk(KERN_INFO "%s: invalid bitmap file superblock: %s\n",
491 bmname(bitmap), reason); 496 bmname(bitmap), reason);
@@ -518,8 +523,12 @@ success:
518 /* assign fields using values from superblock */ 523 /* assign fields using values from superblock */
519 bitmap->chunksize = chunksize; 524 bitmap->chunksize = chunksize;
520 bitmap->daemon_sleep = daemon_sleep; 525 bitmap->daemon_sleep = daemon_sleep;
526 bitmap->daemon_lastrun = jiffies;
527 bitmap->max_write_behind = write_behind;
521 bitmap->flags |= sb->state; 528 bitmap->flags |= sb->state;
522 bitmap->events_cleared = le64_to_cpu(sb->events_cleared); 529 bitmap->events_cleared = le64_to_cpu(sb->events_cleared);
530 if (sb->state & BITMAP_STALE)
531 bitmap->events_cleared = bitmap->mddev->events;
523 err = 0; 532 err = 0;
524out: 533out:
525 kunmap(bitmap->sb_page); 534 kunmap(bitmap->sb_page);
@@ -617,7 +626,7 @@ static void bitmap_file_unmap(struct bitmap *bitmap)
617 page_cache_release(sb_page); 626 page_cache_release(sb_page);
618} 627}
619 628
620static void bitmap_stop_daemons(struct bitmap *bitmap); 629static void bitmap_stop_daemon(struct bitmap *bitmap);
621 630
622/* dequeue the next item in a page list -- don't call from irq context */ 631/* dequeue the next item in a page list -- don't call from irq context */
623static struct page_list *dequeue_page(struct bitmap *bitmap) 632static struct page_list *dequeue_page(struct bitmap *bitmap)
@@ -659,7 +668,7 @@ static void bitmap_file_put(struct bitmap *bitmap)
659 bitmap->file = NULL; 668 bitmap->file = NULL;
660 spin_unlock_irqrestore(&bitmap->lock, flags); 669 spin_unlock_irqrestore(&bitmap->lock, flags);
661 670
662 bitmap_stop_daemons(bitmap); 671 bitmap_stop_daemon(bitmap);
663 672
664 drain_write_queues(bitmap); 673 drain_write_queues(bitmap);
665 674
@@ -818,7 +827,7 @@ int bitmap_unplug(struct bitmap *bitmap)
818 return 0; 827 return 0;
819} 828}
820 829
821static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset); 830static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int needed);
822/* * bitmap_init_from_disk -- called at bitmap_create time to initialize 831/* * bitmap_init_from_disk -- called at bitmap_create time to initialize
823 * the in-memory bitmap from the on-disk bitmap -- also, sets up the 832 * the in-memory bitmap from the on-disk bitmap -- also, sets up the
824 * memory mapping of the bitmap file 833 * memory mapping of the bitmap file
@@ -826,8 +835,11 @@ static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset);
826 * if there's no bitmap file, or if the bitmap file had been 835 * if there's no bitmap file, or if the bitmap file had been
827 * previously kicked from the array, we mark all the bits as 836 * previously kicked from the array, we mark all the bits as
828 * 1's in order to cause a full resync. 837 * 1's in order to cause a full resync.
838 *
839 * We ignore all bits for sectors that end earlier than 'start'.
840 * This is used when reading an out-of-date bitmap...
829 */ 841 */
830static int bitmap_init_from_disk(struct bitmap *bitmap) 842static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
831{ 843{
832 unsigned long i, chunks, index, oldindex, bit; 844 unsigned long i, chunks, index, oldindex, bit;
833 struct page *page = NULL, *oldpage = NULL; 845 struct page *page = NULL, *oldpage = NULL;
@@ -914,7 +926,7 @@ static int bitmap_init_from_disk(struct bitmap *bitmap)
914 * whole page and write it out 926 * whole page and write it out
915 */ 927 */
916 memset(page_address(page) + offset, 0xff, 928 memset(page_address(page) + offset, 0xff,
917 PAGE_SIZE - offset); 929 PAGE_SIZE - offset);
918 ret = write_page(bitmap, page, 1); 930 ret = write_page(bitmap, page, 1);
919 if (ret) { 931 if (ret) {
920 kunmap(page); 932 kunmap(page);
@@ -928,8 +940,11 @@ static int bitmap_init_from_disk(struct bitmap *bitmap)
928 } 940 }
929 if (test_bit(bit, page_address(page))) { 941 if (test_bit(bit, page_address(page))) {
930 /* if the disk bit is set, set the memory bit */ 942 /* if the disk bit is set, set the memory bit */
931 bitmap_set_memory_bits(bitmap, i << CHUNK_BLOCK_SHIFT(bitmap)); 943 bitmap_set_memory_bits(bitmap, i << CHUNK_BLOCK_SHIFT(bitmap),
944 ((i+1) << (CHUNK_BLOCK_SHIFT(bitmap)) >= start)
945 );
932 bit_cnt++; 946 bit_cnt++;
947 set_page_attr(bitmap, page, BITMAP_PAGE_CLEAN);
933 } 948 }
934 } 949 }
935 950
@@ -1141,6 +1156,9 @@ static void bitmap_writeback_daemon(mddev_t *mddev)
1141 err = -EINTR; 1156 err = -EINTR;
1142 goto out; 1157 goto out;
1143 } 1158 }
1159 if (bitmap == NULL)
1160 /* about to be stopped. */
1161 return;
1144 1162
1145 PRINTK("%s: bitmap writeback daemon woke up...\n", bmname(bitmap)); 1163 PRINTK("%s: bitmap writeback daemon woke up...\n", bmname(bitmap));
1146 /* wait on bitmap page writebacks */ 1164 /* wait on bitmap page writebacks */
@@ -1170,21 +1188,12 @@ static void bitmap_writeback_daemon(mddev_t *mddev)
1170 } 1188 }
1171} 1189}
1172 1190
1173static int bitmap_start_daemon(struct bitmap *bitmap, mdk_thread_t **ptr, 1191static mdk_thread_t *bitmap_start_daemon(struct bitmap *bitmap,
1174 void (*func)(mddev_t *), char *name) 1192 void (*func)(mddev_t *), char *name)
1175{ 1193{
1176 mdk_thread_t *daemon; 1194 mdk_thread_t *daemon;
1177 unsigned long flags;
1178 char namebuf[32]; 1195 char namebuf[32];
1179 1196
1180 spin_lock_irqsave(&bitmap->lock, flags);
1181 *ptr = NULL;
1182
1183 if (!bitmap->file) /* no need for daemon if there's no backing file */
1184 goto out_unlock;
1185
1186 spin_unlock_irqrestore(&bitmap->lock, flags);
1187
1188#ifdef INJECT_FATAL_FAULT_2 1197#ifdef INJECT_FATAL_FAULT_2
1189 daemon = NULL; 1198 daemon = NULL;
1190#else 1199#else
@@ -1194,47 +1203,32 @@ static int bitmap_start_daemon(struct bitmap *bitmap, mdk_thread_t **ptr,
1194 if (!daemon) { 1203 if (!daemon) {
1195 printk(KERN_ERR "%s: failed to start bitmap daemon\n", 1204 printk(KERN_ERR "%s: failed to start bitmap daemon\n",
1196 bmname(bitmap)); 1205 bmname(bitmap));
1197 return -ECHILD; 1206 return ERR_PTR(-ECHILD);
1198 } 1207 }
1199 1208
1200 spin_lock_irqsave(&bitmap->lock, flags);
1201 *ptr = daemon;
1202
1203 md_wakeup_thread(daemon); /* start it running */ 1209 md_wakeup_thread(daemon); /* start it running */
1204 1210
1205 PRINTK("%s: %s daemon (pid %d) started...\n", 1211 PRINTK("%s: %s daemon (pid %d) started...\n",
1206 bmname(bitmap), name, daemon->tsk->pid); 1212 bmname(bitmap), name, daemon->tsk->pid);
1207out_unlock:
1208 spin_unlock_irqrestore(&bitmap->lock, flags);
1209 return 0;
1210}
1211 1213
1212static int bitmap_start_daemons(struct bitmap *bitmap) 1214 return daemon;
1213{
1214 int err = bitmap_start_daemon(bitmap, &bitmap->writeback_daemon,
1215 bitmap_writeback_daemon, "bitmap_wb");
1216 return err;
1217} 1215}
1218 1216
1219static void bitmap_stop_daemon(struct bitmap *bitmap, mdk_thread_t **ptr) 1217static void bitmap_stop_daemon(struct bitmap *bitmap)
1220{ 1218{
1221 mdk_thread_t *daemon; 1219 /* the daemon can't stop itself... it'll just exit instead... */
1222 unsigned long flags; 1220 if (bitmap->writeback_daemon && ! IS_ERR(bitmap->writeback_daemon) &&
1223 1221 current->pid != bitmap->writeback_daemon->tsk->pid) {
1224 spin_lock_irqsave(&bitmap->lock, flags); 1222 mdk_thread_t *daemon;
1225 daemon = *ptr; 1223 unsigned long flags;
1226 *ptr = NULL;
1227 spin_unlock_irqrestore(&bitmap->lock, flags);
1228 if (daemon)
1229 md_unregister_thread(daemon); /* destroy the thread */
1230}
1231 1224
1232static void bitmap_stop_daemons(struct bitmap *bitmap) 1225 spin_lock_irqsave(&bitmap->lock, flags);
1233{ 1226 daemon = bitmap->writeback_daemon;
1234 /* the daemons can't stop themselves... they'll just exit instead... */ 1227 bitmap->writeback_daemon = NULL;
1235 if (bitmap->writeback_daemon && 1228 spin_unlock_irqrestore(&bitmap->lock, flags);
1236 current->pid != bitmap->writeback_daemon->tsk->pid) 1229 if (daemon && ! IS_ERR(daemon))
1237 bitmap_stop_daemon(bitmap, &bitmap->writeback_daemon); 1230 md_unregister_thread(daemon); /* destroy the thread */
1231 }
1238} 1232}
1239 1233
1240static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap, 1234static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap,
@@ -1274,9 +1268,16 @@ static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap,
1274 } 1268 }
1275} 1269}
1276 1270
1277int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors) 1271int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors, int behind)
1278{ 1272{
1279 if (!bitmap) return 0; 1273 if (!bitmap) return 0;
1274
1275 if (behind) {
1276 atomic_inc(&bitmap->behind_writes);
1277 PRINTK(KERN_DEBUG "inc write-behind count %d/%d\n",
1278 atomic_read(&bitmap->behind_writes), bitmap->max_write_behind);
1279 }
1280
1280 while (sectors) { 1281 while (sectors) {
1281 int blocks; 1282 int blocks;
1282 bitmap_counter_t *bmc; 1283 bitmap_counter_t *bmc;
@@ -1311,9 +1312,15 @@ int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sect
1311} 1312}
1312 1313
1313void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors, 1314void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors,
1314 int success) 1315 int success, int behind)
1315{ 1316{
1316 if (!bitmap) return; 1317 if (!bitmap) return;
1318 if (behind) {
1319 atomic_dec(&bitmap->behind_writes);
1320 PRINTK(KERN_DEBUG "dec write-behind count %d/%d\n",
1321 atomic_read(&bitmap->behind_writes), bitmap->max_write_behind);
1322 }
1323
1317 while (sectors) { 1324 while (sectors) {
1318 int blocks; 1325 int blocks;
1319 unsigned long flags; 1326 unsigned long flags;
@@ -1424,7 +1431,7 @@ void bitmap_close_sync(struct bitmap *bitmap)
1424 } 1431 }
1425} 1432}
1426 1433
1427static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset) 1434static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int needed)
1428{ 1435{
1429 /* For each chunk covered by any of these sectors, set the 1436 /* For each chunk covered by any of these sectors, set the
1430 * counter to 1 and set resync_needed. They should all 1437 * counter to 1 and set resync_needed. They should all
@@ -1441,7 +1448,7 @@ static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset)
1441 } 1448 }
1442 if (! *bmc) { 1449 if (! *bmc) {
1443 struct page *page; 1450 struct page *page;
1444 *bmc = 1 | NEEDED_MASK; 1451 *bmc = 1 | (needed?NEEDED_MASK:0);
1445 bitmap_count_page(bitmap, offset, 1); 1452 bitmap_count_page(bitmap, offset, 1);
1446 page = filemap_get_page(bitmap, offset >> CHUNK_BLOCK_SHIFT(bitmap)); 1453 page = filemap_get_page(bitmap, offset >> CHUNK_BLOCK_SHIFT(bitmap));
1447 set_page_attr(bitmap, page, BITMAP_PAGE_CLEAN); 1454 set_page_attr(bitmap, page, BITMAP_PAGE_CLEAN);
@@ -1476,17 +1483,14 @@ void bitmap_flush(mddev_t *mddev)
1476/* 1483/*
1477 * free memory that was allocated 1484 * free memory that was allocated
1478 */ 1485 */
1479void bitmap_destroy(mddev_t *mddev) 1486static void bitmap_free(struct bitmap *bitmap)
1480{ 1487{
1481 unsigned long k, pages; 1488 unsigned long k, pages;
1482 struct bitmap_page *bp; 1489 struct bitmap_page *bp;
1483 struct bitmap *bitmap = mddev->bitmap;
1484 1490
1485 if (!bitmap) /* there was no bitmap */ 1491 if (!bitmap) /* there was no bitmap */
1486 return; 1492 return;
1487 1493
1488 mddev->bitmap = NULL; /* disconnect from the md device */
1489
1490 /* release the bitmap file and kill the daemon */ 1494 /* release the bitmap file and kill the daemon */
1491 bitmap_file_put(bitmap); 1495 bitmap_file_put(bitmap);
1492 1496
@@ -1504,6 +1508,17 @@ void bitmap_destroy(mddev_t *mddev)
1504 kfree(bp); 1508 kfree(bp);
1505 kfree(bitmap); 1509 kfree(bitmap);
1506} 1510}
1511void bitmap_destroy(mddev_t *mddev)
1512{
1513 struct bitmap *bitmap = mddev->bitmap;
1514
1515 if (!bitmap) /* there was no bitmap */
1516 return;
1517
1518 mddev->bitmap = NULL; /* disconnect from the md device */
1519
1520 bitmap_free(bitmap);
1521}
1507 1522
1508/* 1523/*
1509 * initialize the bitmap structure 1524 * initialize the bitmap structure
@@ -1517,6 +1532,7 @@ int bitmap_create(mddev_t *mddev)
1517 unsigned long pages; 1532 unsigned long pages;
1518 struct file *file = mddev->bitmap_file; 1533 struct file *file = mddev->bitmap_file;
1519 int err; 1534 int err;
1535 sector_t start;
1520 1536
1521 BUG_ON(sizeof(bitmap_super_t) != 256); 1537 BUG_ON(sizeof(bitmap_super_t) != 256);
1522 1538
@@ -1533,15 +1549,15 @@ int bitmap_create(mddev_t *mddev)
1533 1549
1534 spin_lock_init(&bitmap->lock); 1550 spin_lock_init(&bitmap->lock);
1535 bitmap->mddev = mddev; 1551 bitmap->mddev = mddev;
1536 mddev->bitmap = bitmap;
1537 1552
1538 spin_lock_init(&bitmap->write_lock); 1553 spin_lock_init(&bitmap->write_lock);
1539 INIT_LIST_HEAD(&bitmap->complete_pages); 1554 INIT_LIST_HEAD(&bitmap->complete_pages);
1540 init_waitqueue_head(&bitmap->write_wait); 1555 init_waitqueue_head(&bitmap->write_wait);
1541 bitmap->write_pool = mempool_create(WRITE_POOL_SIZE, write_pool_alloc, 1556 bitmap->write_pool = mempool_create(WRITE_POOL_SIZE, write_pool_alloc,
1542 write_pool_free, NULL); 1557 write_pool_free, NULL);
1558 err = -ENOMEM;
1543 if (!bitmap->write_pool) 1559 if (!bitmap->write_pool)
1544 return -ENOMEM; 1560 goto error;
1545 1561
1546 bitmap->file = file; 1562 bitmap->file = file;
1547 bitmap->offset = mddev->bitmap_offset; 1563 bitmap->offset = mddev->bitmap_offset;
@@ -1549,7 +1565,7 @@ int bitmap_create(mddev_t *mddev)
1549 /* read superblock from bitmap file (this sets bitmap->chunksize) */ 1565 /* read superblock from bitmap file (this sets bitmap->chunksize) */
1550 err = bitmap_read_sb(bitmap); 1566 err = bitmap_read_sb(bitmap);
1551 if (err) 1567 if (err)
1552 return err; 1568 goto error;
1553 1569
1554 bitmap->chunkshift = find_first_bit(&bitmap->chunksize, 1570 bitmap->chunkshift = find_first_bit(&bitmap->chunksize,
1555 sizeof(bitmap->chunksize)); 1571 sizeof(bitmap->chunksize));
@@ -1573,27 +1589,44 @@ int bitmap_create(mddev_t *mddev)
1573#else 1589#else
1574 bitmap->bp = kmalloc(pages * sizeof(*bitmap->bp), GFP_KERNEL); 1590 bitmap->bp = kmalloc(pages * sizeof(*bitmap->bp), GFP_KERNEL);
1575#endif 1591#endif
1592 err = -ENOMEM;
1576 if (!bitmap->bp) 1593 if (!bitmap->bp)
1577 return -ENOMEM; 1594 goto error;
1578 memset(bitmap->bp, 0, pages * sizeof(*bitmap->bp)); 1595 memset(bitmap->bp, 0, pages * sizeof(*bitmap->bp));
1579 1596
1580 bitmap->flags |= BITMAP_ACTIVE; 1597 bitmap->flags |= BITMAP_ACTIVE;
1581 1598
1582 /* now that we have some pages available, initialize the in-memory 1599 /* now that we have some pages available, initialize the in-memory
1583 * bitmap from the on-disk bitmap */ 1600 * bitmap from the on-disk bitmap */
1584 err = bitmap_init_from_disk(bitmap); 1601 start = 0;
1602 if (mddev->degraded == 0
1603 || bitmap->events_cleared == mddev->events)
1604 /* no need to keep dirty bits to optimise a re-add of a missing device */
1605 start = mddev->recovery_cp;
1606 err = bitmap_init_from_disk(bitmap, start);
1585 1607
1586 if (err) 1608 if (err)
1587 return err; 1609 goto error;
1588 1610
1589 printk(KERN_INFO "created bitmap (%lu pages) for device %s\n", 1611 printk(KERN_INFO "created bitmap (%lu pages) for device %s\n",
1590 pages, bmname(bitmap)); 1612 pages, bmname(bitmap));
1591 1613
1592 /* kick off the bitmap daemons */ 1614 mddev->bitmap = bitmap;
1593 err = bitmap_start_daemons(bitmap); 1615
1594 if (err) 1616 if (file)
1595 return err; 1617 /* kick off the bitmap writeback daemon */
1618 bitmap->writeback_daemon =
1619 bitmap_start_daemon(bitmap,
1620 bitmap_writeback_daemon,
1621 "bitmap_wb");
1622
1623 if (IS_ERR(bitmap->writeback_daemon))
1624 return PTR_ERR(bitmap->writeback_daemon);
1596 return bitmap_update_sb(bitmap); 1625 return bitmap_update_sb(bitmap);
1626
1627 error:
1628 bitmap_free(bitmap);
1629 return err;
1597} 1630}
1598 1631
1599/* the bitmap API -- for raid personalities */ 1632/* the bitmap API -- for raid personalities */
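Note on the new write accounting above: bitmap_startwrite() and bitmap_endwrite() now take a 'behind' flag so write-behind requests are counted in bitmap->behind_writes while they are in flight. A minimal sketch of how a raid personality is expected to drive these calls, using only the signatures changed in this hunk (the caller name and the surrounding I/O submission are hypothetical):

	static void mirrored_write_sketch(struct bitmap *bitmap, sector_t sector,
					  unsigned long nr_sectors, int behind)
	{
		/* mark the affected chunks dirty; with behind != 0 this also
		 * increments bitmap->behind_writes */
		bitmap_startwrite(bitmap, sector, nr_sectors, behind);

		/* ... submit the write to all mirror devices here ... */

		/* once every copy has completed: success lets the chunk be
		 * cleared later, and behind != 0 drops behind_writes again */
		bitmap_endwrite(bitmap, sector, nr_sectors, 1 /* success */, behind);
	}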
diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c
index 17212b4201a1..cc07bbebbb16 100644
--- a/drivers/md/dm-exception-store.c
+++ b/drivers/md/dm-exception-store.c
@@ -568,12 +568,9 @@ int dm_create_persistent(struct exception_store *store, uint32_t chunk_size)
568 568
569 bad: 569 bad:
570 dm_io_put(sectors_to_pages(chunk_size)); 570 dm_io_put(sectors_to_pages(chunk_size));
571 if (ps) { 571 if (ps && ps->area)
572 if (ps->area) 572 free_area(ps);
573 free_area(ps); 573 kfree(ps);
574
575 kfree(ps);
576 }
577 return r; 574 return r;
578} 575}
579 576
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index b08df8b9b2ca..863282513753 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -375,16 +375,18 @@ static void rh_inc(struct region_hash *rh, region_t region)
375 375
376 read_lock(&rh->hash_lock); 376 read_lock(&rh->hash_lock);
377 reg = __rh_find(rh, region); 377 reg = __rh_find(rh, region);
378
379 atomic_inc(&reg->pending);
380
381 spin_lock_irq(&rh->region_lock);
378 if (reg->state == RH_CLEAN) { 382 if (reg->state == RH_CLEAN) {
379 rh->log->type->mark_region(rh->log, reg->key); 383 rh->log->type->mark_region(rh->log, reg->key);
380 384
381 spin_lock_irq(&rh->region_lock);
382 reg->state = RH_DIRTY; 385 reg->state = RH_DIRTY;
383 list_del_init(&reg->list); /* take off the clean list */ 386 list_del_init(&reg->list); /* take off the clean list */
384 spin_unlock_irq(&rh->region_lock);
385 } 387 }
388 spin_unlock_irq(&rh->region_lock);
386 389
387 atomic_inc(&reg->pending);
388 read_unlock(&rh->hash_lock); 390 read_unlock(&rh->hash_lock);
389} 391}
390 392
@@ -408,6 +410,10 @@ static void rh_dec(struct region_hash *rh, region_t region)
408 410
409 if (atomic_dec_and_test(&reg->pending)) { 411 if (atomic_dec_and_test(&reg->pending)) {
410 spin_lock_irqsave(&rh->region_lock, flags); 412 spin_lock_irqsave(&rh->region_lock, flags);
413 if (atomic_read(&reg->pending)) { /* check race */
414 spin_unlock_irqrestore(&rh->region_lock, flags);
415 return;
416 }
411 if (reg->state == RH_RECOVERING) { 417 if (reg->state == RH_RECOVERING) {
412 list_add_tail(&reg->list, &rh->quiesced_regions); 418 list_add_tail(&reg->list, &rh->quiesced_regions);
413 } else { 419 } else {
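Why rh_dec() now re-reads reg->pending under region_lock: atomic_dec_and_test() can observe zero just as another CPU's rh_inc() (which, after this change, bumps the count before taking the lock) makes the region busy again. An illustrative interleaving, not driver code:

	/*
	 *   CPU A: rh_dec()                        CPU B: rh_inc()
	 *   atomic_dec_and_test() -> true
	 *                                           atomic_inc(&reg->pending)
	 *   spin_lock_irqsave(&rh->region_lock)
	 *   atomic_read(&reg->pending) != 0
	 *     -> unlock and return; the region must
	 *        not be moved to the clean/quiesced
	 *        lists while B's I/O is outstanding
	 */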
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 8d740013d74d..bb279fad2fd2 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -38,7 +38,8 @@ static inline dev_info_t *which_dev(mddev_t *mddev, sector_t sector)
38 /* 38 /*
39 * sector_div(a,b) returns the remainer and sets a to a/b 39 * sector_div(a,b) returns the remainer and sets a to a/b
40 */ 40 */
41 (void)sector_div(block, conf->smallest->size); 41 block >>= conf->preshift;
42 (void)sector_div(block, conf->hash_spacing);
42 hash = conf->hash_table[block]; 43 hash = conf->hash_table[block];
43 44
44 while ((sector>>1) >= (hash->size + hash->offset)) 45 while ((sector>>1) >= (hash->size + hash->offset))
@@ -47,7 +48,7 @@ static inline dev_info_t *which_dev(mddev_t *mddev, sector_t sector)
47} 48}
48 49
49/** 50/**
50 * linear_mergeable_bvec -- tell bio layer if a two requests can be merged 51 * linear_mergeable_bvec -- tell bio layer if two requests can be merged
51 * @q: request queue 52 * @q: request queue
52 * @bio: the buffer head that's been built up so far 53 * @bio: the buffer head that's been built up so far
53 * @biovec: the request that could be merged to it. 54 * @biovec: the request that could be merged to it.
@@ -116,7 +117,7 @@ static int linear_run (mddev_t *mddev)
116 dev_info_t **table; 117 dev_info_t **table;
117 mdk_rdev_t *rdev; 118 mdk_rdev_t *rdev;
118 int i, nb_zone, cnt; 119 int i, nb_zone, cnt;
119 sector_t start; 120 sector_t min_spacing;
120 sector_t curr_offset; 121 sector_t curr_offset;
121 struct list_head *tmp; 122 struct list_head *tmp;
122 123
@@ -127,11 +128,6 @@ static int linear_run (mddev_t *mddev)
127 memset(conf, 0, sizeof(*conf) + mddev->raid_disks*sizeof(dev_info_t)); 128 memset(conf, 0, sizeof(*conf) + mddev->raid_disks*sizeof(dev_info_t));
128 mddev->private = conf; 129 mddev->private = conf;
129 130
130 /*
131 * Find the smallest device.
132 */
133
134 conf->smallest = NULL;
135 cnt = 0; 131 cnt = 0;
136 mddev->array_size = 0; 132 mddev->array_size = 0;
137 133
@@ -159,8 +155,6 @@ static int linear_run (mddev_t *mddev)
159 disk->size = rdev->size; 155 disk->size = rdev->size;
160 mddev->array_size += rdev->size; 156 mddev->array_size += rdev->size;
161 157
162 if (!conf->smallest || (disk->size < conf->smallest->size))
163 conf->smallest = disk;
164 cnt++; 158 cnt++;
165 } 159 }
166 if (cnt != mddev->raid_disks) { 160 if (cnt != mddev->raid_disks) {
@@ -168,6 +162,36 @@ static int linear_run (mddev_t *mddev)
168 goto out; 162 goto out;
169 } 163 }
170 164
165 min_spacing = mddev->array_size;
166 sector_div(min_spacing, PAGE_SIZE/sizeof(struct dev_info *));
167
168 /* min_spacing is the minimum spacing that will fit the hash
169 * table in one PAGE. This may be much smaller than needed.
170 * We find the smallest non-terminal set of consecutive devices
171 * that is larger than min_spacing as use the size of that as
172 * the actual spacing
173 */
174 conf->hash_spacing = mddev->array_size;
175 for (i=0; i < cnt-1 ; i++) {
176 sector_t sz = 0;
177 int j;
178 for (j=i; i<cnt-1 && sz < min_spacing ; j++)
179 sz += conf->disks[j].size;
180 if (sz >= min_spacing && sz < conf->hash_spacing)
181 conf->hash_spacing = sz;
182 }
183
184 /* hash_spacing may be too large for sector_div to work with,
185 * so we might need to pre-shift
186 */
187 conf->preshift = 0;
188 if (sizeof(sector_t) > sizeof(u32)) {
189 sector_t space = conf->hash_spacing;
190 while (space > (sector_t)(~(u32)0)) {
191 space >>= 1;
192 conf->preshift++;
193 }
194 }
171 /* 195 /*
172 * This code was restructured to work around a gcc-2.95.3 internal 196 * This code was restructured to work around a gcc-2.95.3 internal
173 * compiler error. Alter it with care. 197 * compiler error. Alter it with care.
@@ -177,39 +201,52 @@ static int linear_run (mddev_t *mddev)
177 unsigned round; 201 unsigned round;
178 unsigned long base; 202 unsigned long base;
179 203
180 sz = mddev->array_size; 204 sz = mddev->array_size >> conf->preshift;
181 base = conf->smallest->size; 205 sz += 1; /* force round-up */
206 base = conf->hash_spacing >> conf->preshift;
182 round = sector_div(sz, base); 207 round = sector_div(sz, base);
183 nb_zone = conf->nr_zones = sz + (round ? 1 : 0); 208 nb_zone = sz + (round ? 1 : 0);
184 } 209 }
185 210 BUG_ON(nb_zone > PAGE_SIZE / sizeof(struct dev_info *));
186 conf->hash_table = kmalloc (sizeof (dev_info_t*) * nb_zone, 211
212 conf->hash_table = kmalloc (sizeof (struct dev_info *) * nb_zone,
187 GFP_KERNEL); 213 GFP_KERNEL);
188 if (!conf->hash_table) 214 if (!conf->hash_table)
189 goto out; 215 goto out;
190 216
191 /* 217 /*
192 * Here we generate the linear hash table 218 * Here we generate the linear hash table
219 * First calculate the device offsets.
193 */ 220 */
221 conf->disks[0].offset = 0;
222 for (i=1; i<mddev->raid_disks; i++)
223 conf->disks[i].offset =
224 conf->disks[i-1].offset +
225 conf->disks[i-1].size;
226
194 table = conf->hash_table; 227 table = conf->hash_table;
195 start = 0;
196 curr_offset = 0; 228 curr_offset = 0;
197 for (i = 0; i < cnt; i++) { 229 i = 0;
198 dev_info_t *disk = conf->disks + i; 230 for (curr_offset = 0;
231 curr_offset < mddev->array_size;
232 curr_offset += conf->hash_spacing) {
199 233
200 disk->offset = curr_offset; 234 while (i < mddev->raid_disks-1 &&
201 curr_offset += disk->size; 235 curr_offset >= conf->disks[i+1].offset)
236 i++;
202 237
203 /* 'curr_offset' is the end of this disk 238 *table ++ = conf->disks + i;
204 * 'start' is the start of table 239 }
240
241 if (conf->preshift) {
242 conf->hash_spacing >>= conf->preshift;
243 /* round hash_spacing up so that when we divide by it,
244 * we err on the side of "too-low", which is safest.
205 */ 245 */
206 while (start < curr_offset) { 246 conf->hash_spacing++;
207 *table++ = disk;
208 start += conf->smallest->size;
209 }
210 } 247 }
211 if (table-conf->hash_table != nb_zone) 248
212 BUG(); 249 BUG_ON(table - conf->hash_table > nb_zone);
213 250
214 blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec); 251 blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec);
215 mddev->queue->unplug_fn = linear_unplug; 252 mddev->queue->unplug_fn = linear_unplug;
@@ -238,6 +275,11 @@ static int linear_make_request (request_queue_t *q, struct bio *bio)
238 dev_info_t *tmp_dev; 275 dev_info_t *tmp_dev;
239 sector_t block; 276 sector_t block;
240 277
278 if (unlikely(bio_barrier(bio))) {
279 bio_endio(bio, bio->bi_size, -EOPNOTSUPP);
280 return 0;
281 }
282
241 if (bio_data_dir(bio)==WRITE) { 283 if (bio_data_dir(bio)==WRITE) {
242 disk_stat_inc(mddev->gendisk, writes); 284 disk_stat_inc(mddev->gendisk, writes);
243 disk_stat_add(mddev->gendisk, write_sectors, bio_sectors(bio)); 285 disk_stat_add(mddev->gendisk, write_sectors, bio_sectors(bio));
@@ -294,7 +336,7 @@ static void linear_status (struct seq_file *seq, mddev_t *mddev)
294 sector_t s = 0; 336 sector_t s = 0;
295 337
296 seq_printf(seq, " "); 338 seq_printf(seq, " ");
297 for (j = 0; j < conf->nr_zones; j++) 339 for (j = 0; j < mddev->raid_disks; j++)
298 { 340 {
299 char b[BDEVNAME_SIZE]; 341 char b[BDEVNAME_SIZE];
300 s += conf->smallest_size; 342 s += conf->smallest_size;
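How the reworked lookup in which_dev() fits together after this change (a simplified sketch assembled from the hunks above, not a drop-in replacement): the hash table now holds one entry per hash_spacing-sized span of the array, and preshift keeps the divisor small enough for sector_div() to handle.

	static dev_info_t *which_dev_sketch(linear_conf_t *conf, sector_t sector)
	{
		sector_t block = sector >> 1;	/* 1K blocks, as in the original */
		dev_info_t *hash;

		block >>= conf->preshift;	/* keep the divisor within 32 bits */
		(void)sector_div(block, conf->hash_spacing);
		hash = conf->hash_table[block];	/* lands at or before the target disk */

		while ((sector >> 1) >= (hash->size + hash->offset))
			hash++;			/* walk forward to the owning device */
		return hash;
	}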
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 20ca80b7dc20..2897df90df44 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -34,6 +34,7 @@
34 34
35#include <linux/module.h> 35#include <linux/module.h>
36#include <linux/config.h> 36#include <linux/config.h>
37#include <linux/kthread.h>
37#include <linux/linkage.h> 38#include <linux/linkage.h>
38#include <linux/raid/md.h> 39#include <linux/raid/md.h>
39#include <linux/raid/bitmap.h> 40#include <linux/raid/bitmap.h>
@@ -73,7 +74,7 @@ static DEFINE_SPINLOCK(pers_lock);
73 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit' 74 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
74 * is 1000 KB/sec, so the extra system load does not show up that much. 75 * is 1000 KB/sec, so the extra system load does not show up that much.
75 * Increase it if you want to have more _guaranteed_ speed. Note that 76 * Increase it if you want to have more _guaranteed_ speed. Note that
76 * the RAID driver will use the maximum available bandwith if the IO 77 * the RAID driver will use the maximum available bandwidth if the IO
77 * subsystem is idle. There is also an 'absolute maximum' reconstruction 78 * subsystem is idle. There is also an 'absolute maximum' reconstruction
78 * speed limit - in case reconstruction slows down your system despite 79 * speed limit - in case reconstruction slows down your system despite
79 * idle IO detection. 80 * idle IO detection.
@@ -393,7 +394,7 @@ int sync_page_io(struct block_device *bdev, sector_t sector, int size,
393 return ret; 394 return ret;
394} 395}
395 396
396static int read_disk_sb(mdk_rdev_t * rdev) 397static int read_disk_sb(mdk_rdev_t * rdev, int size)
397{ 398{
398 char b[BDEVNAME_SIZE]; 399 char b[BDEVNAME_SIZE];
399 if (!rdev->sb_page) { 400 if (!rdev->sb_page) {
@@ -404,7 +405,7 @@ static int read_disk_sb(mdk_rdev_t * rdev)
404 return 0; 405 return 0;
405 406
406 407
407 if (!sync_page_io(rdev->bdev, rdev->sb_offset<<1, MD_SB_BYTES, rdev->sb_page, READ)) 408 if (!sync_page_io(rdev->bdev, rdev->sb_offset<<1, size, rdev->sb_page, READ))
408 goto fail; 409 goto fail;
409 rdev->sb_loaded = 1; 410 rdev->sb_loaded = 1;
410 return 0; 411 return 0;
@@ -531,7 +532,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version
531 sb_offset = calc_dev_sboffset(rdev->bdev); 532 sb_offset = calc_dev_sboffset(rdev->bdev);
532 rdev->sb_offset = sb_offset; 533 rdev->sb_offset = sb_offset;
533 534
534 ret = read_disk_sb(rdev); 535 ret = read_disk_sb(rdev, MD_SB_BYTES);
535 if (ret) return ret; 536 if (ret) return ret;
536 537
537 ret = -EINVAL; 538 ret = -EINVAL;
@@ -564,6 +565,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version
564 565
565 rdev->preferred_minor = sb->md_minor; 566 rdev->preferred_minor = sb->md_minor;
566 rdev->data_offset = 0; 567 rdev->data_offset = 0;
568 rdev->sb_size = MD_SB_BYTES;
567 569
568 if (sb->level == LEVEL_MULTIPATH) 570 if (sb->level == LEVEL_MULTIPATH)
569 rdev->desc_nr = -1; 571 rdev->desc_nr = -1;
@@ -623,6 +625,7 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
623 mddev->size = sb->size; 625 mddev->size = sb->size;
624 mddev->events = md_event(sb); 626 mddev->events = md_event(sb);
625 mddev->bitmap_offset = 0; 627 mddev->bitmap_offset = 0;
628 mddev->default_bitmap_offset = MD_SB_BYTES >> 9;
626 629
627 if (sb->state & (1<<MD_SB_CLEAN)) 630 if (sb->state & (1<<MD_SB_CLEAN))
628 mddev->recovery_cp = MaxSector; 631 mddev->recovery_cp = MaxSector;
@@ -643,12 +646,12 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
643 646
644 if (sb->state & (1<<MD_SB_BITMAP_PRESENT) && 647 if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
645 mddev->bitmap_file == NULL) { 648 mddev->bitmap_file == NULL) {
646 if (mddev->level != 1) { 649 if (mddev->level != 1 && mddev->level != 5 && mddev->level != 6) {
647 /* FIXME use a better test */ 650 /* FIXME use a better test */
648 printk(KERN_WARNING "md: bitmaps only support for raid1\n"); 651 printk(KERN_WARNING "md: bitmaps only support for raid1\n");
649 return -EINVAL; 652 return -EINVAL;
650 } 653 }
651 mddev->bitmap_offset = (MD_SB_BYTES >> 9); 654 mddev->bitmap_offset = mddev->default_bitmap_offset;
652 } 655 }
653 656
654 } else if (mddev->pers == NULL) { 657 } else if (mddev->pers == NULL) {
@@ -669,6 +672,7 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
669 672
670 if (mddev->level != LEVEL_MULTIPATH) { 673 if (mddev->level != LEVEL_MULTIPATH) {
671 rdev->faulty = 0; 674 rdev->faulty = 0;
675 rdev->flags = 0;
672 desc = sb->disks + rdev->desc_nr; 676 desc = sb->disks + rdev->desc_nr;
673 677
674 if (desc->state & (1<<MD_DISK_FAULTY)) 678 if (desc->state & (1<<MD_DISK_FAULTY))
@@ -678,6 +682,8 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
678 rdev->in_sync = 1; 682 rdev->in_sync = 1;
679 rdev->raid_disk = desc->raid_disk; 683 rdev->raid_disk = desc->raid_disk;
680 } 684 }
685 if (desc->state & (1<<MD_DISK_WRITEMOSTLY))
686 set_bit(WriteMostly, &rdev->flags);
681 } else /* MULTIPATH are always insync */ 687 } else /* MULTIPATH are always insync */
682 rdev->in_sync = 1; 688 rdev->in_sync = 1;
683 return 0; 689 return 0;
@@ -706,6 +712,8 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
706 int i; 712 int i;
707 int active=0, working=0,failed=0,spare=0,nr_disks=0; 713 int active=0, working=0,failed=0,spare=0,nr_disks=0;
708 714
715 rdev->sb_size = MD_SB_BYTES;
716
709 sb = (mdp_super_t*)page_address(rdev->sb_page); 717 sb = (mdp_super_t*)page_address(rdev->sb_page);
710 718
711 memset(sb, 0, sizeof(*sb)); 719 memset(sb, 0, sizeof(*sb));
@@ -776,6 +784,8 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
776 spare++; 784 spare++;
777 working++; 785 working++;
778 } 786 }
787 if (test_bit(WriteMostly, &rdev2->flags))
788 d->state |= (1<<MD_DISK_WRITEMOSTLY);
779 } 789 }
780 790
781 /* now set the "removed" and "faulty" bits on any missing devices */ 791 /* now set the "removed" and "faulty" bits on any missing devices */
@@ -831,6 +841,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
831 int ret; 841 int ret;
832 sector_t sb_offset; 842 sector_t sb_offset;
833 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 843 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
844 int bmask;
834 845
835 /* 846 /*
836 * Calculate the position of the superblock. 847 * Calculate the position of the superblock.
@@ -859,7 +870,10 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
859 } 870 }
860 rdev->sb_offset = sb_offset; 871 rdev->sb_offset = sb_offset;
861 872
862 ret = read_disk_sb(rdev); 873 /* superblock is rarely larger than 1K, but it can be larger,
874 * and it is safe to read 4k, so we do that
875 */
876 ret = read_disk_sb(rdev, 4096);
863 if (ret) return ret; 877 if (ret) return ret;
864 878
865 879
@@ -869,7 +883,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
869 sb->major_version != cpu_to_le32(1) || 883 sb->major_version != cpu_to_le32(1) ||
870 le32_to_cpu(sb->max_dev) > (4096-256)/2 || 884 le32_to_cpu(sb->max_dev) > (4096-256)/2 ||
871 le64_to_cpu(sb->super_offset) != (rdev->sb_offset<<1) || 885 le64_to_cpu(sb->super_offset) != (rdev->sb_offset<<1) ||
872 sb->feature_map != 0) 886 (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0)
873 return -EINVAL; 887 return -EINVAL;
874 888
875 if (calc_sb_1_csum(sb) != sb->sb_csum) { 889 if (calc_sb_1_csum(sb) != sb->sb_csum) {
@@ -885,6 +899,11 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
885 rdev->preferred_minor = 0xffff; 899 rdev->preferred_minor = 0xffff;
886 rdev->data_offset = le64_to_cpu(sb->data_offset); 900 rdev->data_offset = le64_to_cpu(sb->data_offset);
887 901
902 rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256;
903 bmask = queue_hardsect_size(rdev->bdev->bd_disk->queue)-1;
904 if (rdev->sb_size & bmask)
905 rdev-> sb_size = (rdev->sb_size | bmask)+1;
906
888 if (refdev == 0) 907 if (refdev == 0)
889 return 1; 908 return 1;
890 else { 909 else {
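A worked example of the sb_size rounding added above (numbers are illustrative, not from the patch): with max_dev = 384 the raw superblock size is 384*2 + 256 = 1024 bytes; on a device with 4096-byte hardware sectors, bmask = 4095 and 1024 & 4095 is non-zero, so sb_size becomes (1024 | 4095) + 1 = 4096. The superblock write is thus padded up to a whole hardware sector and never spans a partial one.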
@@ -939,13 +958,15 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
939 mddev->size = le64_to_cpu(sb->size)/2; 958 mddev->size = le64_to_cpu(sb->size)/2;
940 mddev->events = le64_to_cpu(sb->events); 959 mddev->events = le64_to_cpu(sb->events);
941 mddev->bitmap_offset = 0; 960 mddev->bitmap_offset = 0;
961 mddev->default_bitmap_offset = 0;
962 mddev->default_bitmap_offset = 1024;
942 963
943 mddev->recovery_cp = le64_to_cpu(sb->resync_offset); 964 mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
944 memcpy(mddev->uuid, sb->set_uuid, 16); 965 memcpy(mddev->uuid, sb->set_uuid, 16);
945 966
946 mddev->max_disks = (4096-256)/2; 967 mddev->max_disks = (4096-256)/2;
947 968
948 if ((le32_to_cpu(sb->feature_map) & 1) && 969 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) &&
949 mddev->bitmap_file == NULL ) { 970 mddev->bitmap_file == NULL ) {
950 if (mddev->level != 1) { 971 if (mddev->level != 1) {
951 printk(KERN_WARNING "md: bitmaps only supported for raid1\n"); 972 printk(KERN_WARNING "md: bitmaps only supported for raid1\n");
@@ -986,6 +1007,9 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
986 rdev->raid_disk = role; 1007 rdev->raid_disk = role;
987 break; 1008 break;
988 } 1009 }
1010 rdev->flags = 0;
1011 if (sb->devflags & WriteMostly1)
1012 set_bit(WriteMostly, &rdev->flags);
989 } else /* MULTIPATH are always insync */ 1013 } else /* MULTIPATH are always insync */
990 rdev->in_sync = 1; 1014 rdev->in_sync = 1;
991 1015
@@ -1017,7 +1041,7 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
1017 1041
1018 if (mddev->bitmap && mddev->bitmap_file == NULL) { 1042 if (mddev->bitmap && mddev->bitmap_file == NULL) {
1019 sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset); 1043 sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset);
1020 sb->feature_map = cpu_to_le32(1); 1044 sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET);
1021 } 1045 }
1022 1046
1023 max_dev = 0; 1047 max_dev = 0;
@@ -1363,7 +1387,7 @@ repeat:
1363 dprintk("%s ", bdevname(rdev->bdev,b)); 1387 dprintk("%s ", bdevname(rdev->bdev,b));
1364 if (!rdev->faulty) { 1388 if (!rdev->faulty) {
1365 md_super_write(mddev,rdev, 1389 md_super_write(mddev,rdev,
1366 rdev->sb_offset<<1, MD_SB_BYTES, 1390 rdev->sb_offset<<1, rdev->sb_size,
1367 rdev->sb_page); 1391 rdev->sb_page);
1368 dprintk(KERN_INFO "(write) %s's sb offset: %llu\n", 1392 dprintk(KERN_INFO "(write) %s's sb offset: %llu\n",
1369 bdevname(rdev->bdev,b), 1393 bdevname(rdev->bdev,b),
@@ -2073,6 +2097,8 @@ static int get_array_info(mddev_t * mddev, void __user * arg)
2073 info.state = 0; 2097 info.state = 0;
2074 if (mddev->in_sync) 2098 if (mddev->in_sync)
2075 info.state = (1<<MD_SB_CLEAN); 2099 info.state = (1<<MD_SB_CLEAN);
2100 if (mddev->bitmap && mddev->bitmap_offset)
2101 info.state = (1<<MD_SB_BITMAP_PRESENT);
2076 info.active_disks = active; 2102 info.active_disks = active;
2077 info.working_disks = working; 2103 info.working_disks = working;
2078 info.failed_disks = failed; 2104 info.failed_disks = failed;
@@ -2087,7 +2113,7 @@ static int get_array_info(mddev_t * mddev, void __user * arg)
2087 return 0; 2113 return 0;
2088} 2114}
2089 2115
2090static int get_bitmap_file(mddev_t * mddev, void * arg) 2116static int get_bitmap_file(mddev_t * mddev, void __user * arg)
2091{ 2117{
2092 mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */ 2118 mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */
2093 char *ptr, *buf = NULL; 2119 char *ptr, *buf = NULL;
@@ -2146,6 +2172,8 @@ static int get_disk_info(mddev_t * mddev, void __user * arg)
2146 info.state |= (1<<MD_DISK_ACTIVE); 2172 info.state |= (1<<MD_DISK_ACTIVE);
2147 info.state |= (1<<MD_DISK_SYNC); 2173 info.state |= (1<<MD_DISK_SYNC);
2148 } 2174 }
2175 if (test_bit(WriteMostly, &rdev->flags))
2176 info.state |= (1<<MD_DISK_WRITEMOSTLY);
2149 } else { 2177 } else {
2150 info.major = info.minor = 0; 2178 info.major = info.minor = 0;
2151 info.raid_disk = -1; 2179 info.raid_disk = -1;
@@ -2210,8 +2238,11 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
2210 mdname(mddev)); 2238 mdname(mddev));
2211 return -EINVAL; 2239 return -EINVAL;
2212 } 2240 }
2213 rdev = md_import_device(dev, mddev->major_version, 2241 if (mddev->persistent)
2214 mddev->minor_version); 2242 rdev = md_import_device(dev, mddev->major_version,
2243 mddev->minor_version);
2244 else
2245 rdev = md_import_device(dev, -1, -1);
2215 if (IS_ERR(rdev)) { 2246 if (IS_ERR(rdev)) {
2216 printk(KERN_WARNING 2247 printk(KERN_WARNING
2217 "md: md_import_device returned %ld\n", 2248 "md: md_import_device returned %ld\n",
@@ -2231,6 +2262,9 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
2231 rdev->saved_raid_disk = rdev->raid_disk; 2262 rdev->saved_raid_disk = rdev->raid_disk;
2232 2263
2233 rdev->in_sync = 0; /* just to be sure */ 2264 rdev->in_sync = 0; /* just to be sure */
2265 if (info->state & (1<<MD_DISK_WRITEMOSTLY))
2266 set_bit(WriteMostly, &rdev->flags);
2267
2234 rdev->raid_disk = -1; 2268 rdev->raid_disk = -1;
2235 err = bind_rdev_to_array(rdev, mddev); 2269 err = bind_rdev_to_array(rdev, mddev);
2236 if (err) 2270 if (err)
@@ -2271,6 +2305,9 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
2271 else 2305 else
2272 rdev->in_sync = 0; 2306 rdev->in_sync = 0;
2273 2307
2308 if (info->state & (1<<MD_DISK_WRITEMOSTLY))
2309 set_bit(WriteMostly, &rdev->flags);
2310
2274 err = bind_rdev_to_array(rdev, mddev); 2311 err = bind_rdev_to_array(rdev, mddev);
2275 if (err) { 2312 if (err) {
2276 export_rdev(rdev); 2313 export_rdev(rdev);
@@ -2430,25 +2467,51 @@ static int set_bitmap_file(mddev_t *mddev, int fd)
2430{ 2467{
2431 int err; 2468 int err;
2432 2469
2433 if (mddev->pers) 2470 if (mddev->pers) {
2434 return -EBUSY; 2471 if (!mddev->pers->quiesce)
2472 return -EBUSY;
2473 if (mddev->recovery || mddev->sync_thread)
2474 return -EBUSY;
2475 /* we should be able to change the bitmap.. */
2476 }
2435 2477
2436 mddev->bitmap_file = fget(fd);
2437 2478
2438 if (mddev->bitmap_file == NULL) { 2479 if (fd >= 0) {
2439 printk(KERN_ERR "%s: error: failed to get bitmap file\n", 2480 if (mddev->bitmap)
2440 mdname(mddev)); 2481 return -EEXIST; /* cannot add when bitmap is present */
2441 return -EBADF; 2482 mddev->bitmap_file = fget(fd);
2442 }
2443 2483
2444 err = deny_bitmap_write_access(mddev->bitmap_file); 2484 if (mddev->bitmap_file == NULL) {
2445 if (err) { 2485 printk(KERN_ERR "%s: error: failed to get bitmap file\n",
2446 printk(KERN_ERR "%s: error: bitmap file is already in use\n", 2486 mdname(mddev));
2447 mdname(mddev)); 2487 return -EBADF;
2448 fput(mddev->bitmap_file); 2488 }
2449 mddev->bitmap_file = NULL; 2489
2450 } else 2490 err = deny_bitmap_write_access(mddev->bitmap_file);
2491 if (err) {
2492 printk(KERN_ERR "%s: error: bitmap file is already in use\n",
2493 mdname(mddev));
2494 fput(mddev->bitmap_file);
2495 mddev->bitmap_file = NULL;
2496 return err;
2497 }
2451 mddev->bitmap_offset = 0; /* file overrides offset */ 2498 mddev->bitmap_offset = 0; /* file overrides offset */
2499 } else if (mddev->bitmap == NULL)
2500 return -ENOENT; /* cannot remove what isn't there */
2501 err = 0;
2502 if (mddev->pers) {
2503 mddev->pers->quiesce(mddev, 1);
2504 if (fd >= 0)
2505 err = bitmap_create(mddev);
2506 if (fd < 0 || err)
2507 bitmap_destroy(mddev);
2508 mddev->pers->quiesce(mddev, 0);
2509 } else if (fd < 0) {
2510 if (mddev->bitmap_file)
2511 fput(mddev->bitmap_file);
2512 mddev->bitmap_file = NULL;
2513 }
2514
2452 return err; 2515 return err;
2453} 2516}
2454 2517
@@ -2528,6 +2591,11 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info)
2528{ 2591{
2529 int rv = 0; 2592 int rv = 0;
2530 int cnt = 0; 2593 int cnt = 0;
2594 int state = 0;
2595
2596 /* calculate expected state,ignoring low bits */
2597 if (mddev->bitmap && mddev->bitmap_offset)
2598 state |= (1 << MD_SB_BITMAP_PRESENT);
2531 2599
2532 if (mddev->major_version != info->major_version || 2600 if (mddev->major_version != info->major_version ||
2533 mddev->minor_version != info->minor_version || 2601 mddev->minor_version != info->minor_version ||
@@ -2536,12 +2604,16 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info)
2536 mddev->level != info->level || 2604 mddev->level != info->level ||
2537/* mddev->layout != info->layout || */ 2605/* mddev->layout != info->layout || */
2538 !mddev->persistent != info->not_persistent|| 2606 !mddev->persistent != info->not_persistent||
2539 mddev->chunk_size != info->chunk_size ) 2607 mddev->chunk_size != info->chunk_size ||
2608 /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */
2609 ((state^info->state) & 0xfffffe00)
2610 )
2540 return -EINVAL; 2611 return -EINVAL;
2541 /* Check there is only one change */ 2612 /* Check there is only one change */
2542 if (mddev->size != info->size) cnt++; 2613 if (mddev->size != info->size) cnt++;
2543 if (mddev->raid_disks != info->raid_disks) cnt++; 2614 if (mddev->raid_disks != info->raid_disks) cnt++;
2544 if (mddev->layout != info->layout) cnt++; 2615 if (mddev->layout != info->layout) cnt++;
2616 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) cnt++;
2545 if (cnt == 0) return 0; 2617 if (cnt == 0) return 0;
2546 if (cnt > 1) return -EINVAL; 2618 if (cnt > 1) return -EINVAL;
2547 2619
@@ -2620,6 +2692,35 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info)
2620 } 2692 }
2621 } 2693 }
2622 } 2694 }
2695 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) {
2696 if (mddev->pers->quiesce == NULL)
2697 return -EINVAL;
2698 if (mddev->recovery || mddev->sync_thread)
2699 return -EBUSY;
2700 if (info->state & (1<<MD_SB_BITMAP_PRESENT)) {
2701 /* add the bitmap */
2702 if (mddev->bitmap)
2703 return -EEXIST;
2704 if (mddev->default_bitmap_offset == 0)
2705 return -EINVAL;
2706 mddev->bitmap_offset = mddev->default_bitmap_offset;
2707 mddev->pers->quiesce(mddev, 1);
2708 rv = bitmap_create(mddev);
2709 if (rv)
2710 bitmap_destroy(mddev);
2711 mddev->pers->quiesce(mddev, 0);
2712 } else {
2713 /* remove the bitmap */
2714 if (!mddev->bitmap)
2715 return -ENOENT;
2716 if (mddev->bitmap->file)
2717 return -EINVAL;
2718 mddev->pers->quiesce(mddev, 1);
2719 bitmap_destroy(mddev);
2720 mddev->pers->quiesce(mddev, 0);
2721 mddev->bitmap_offset = 0;
2722 }
2723 }
2623 md_update_sb(mddev); 2724 md_update_sb(mddev);
2624 return rv; 2725 return rv;
2625} 2726}
@@ -2781,7 +2882,7 @@ static int md_ioctl(struct inode *inode, struct file *file,
2781 goto done_unlock; 2882 goto done_unlock;
2782 2883
2783 case GET_BITMAP_FILE: 2884 case GET_BITMAP_FILE:
2784 err = get_bitmap_file(mddev, (void *)arg); 2885 err = get_bitmap_file(mddev, argp);
2785 goto done_unlock; 2886 goto done_unlock;
2786 2887
2787 case GET_DISK_INFO: 2888 case GET_DISK_INFO:
@@ -2950,18 +3051,6 @@ static int md_thread(void * arg)
2950{ 3051{
2951 mdk_thread_t *thread = arg; 3052 mdk_thread_t *thread = arg;
2952 3053
2953 lock_kernel();
2954
2955 /*
2956 * Detach thread
2957 */
2958
2959 daemonize(thread->name, mdname(thread->mddev));
2960
2961 current->exit_signal = SIGCHLD;
2962 allow_signal(SIGKILL);
2963 thread->tsk = current;
2964
2965 /* 3054 /*
2966 * md_thread is a 'system-thread', it's priority should be very 3055 * md_thread is a 'system-thread', it's priority should be very
2967 * high. We avoid resource deadlocks individually in each 3056 * high. We avoid resource deadlocks individually in each
@@ -2973,14 +3062,14 @@ static int md_thread(void * arg)
2973 * bdflush, otherwise bdflush will deadlock if there are too 3062 * bdflush, otherwise bdflush will deadlock if there are too
2974 * many dirty RAID5 blocks. 3063 * many dirty RAID5 blocks.
2975 */ 3064 */
2976 unlock_kernel();
2977 3065
2978 complete(thread->event); 3066 complete(thread->event);
2979 while (thread->run) { 3067 while (!kthread_should_stop()) {
2980 void (*run)(mddev_t *); 3068 void (*run)(mddev_t *);
2981 3069
2982 wait_event_interruptible_timeout(thread->wqueue, 3070 wait_event_interruptible_timeout(thread->wqueue,
2983 test_bit(THREAD_WAKEUP, &thread->flags), 3071 test_bit(THREAD_WAKEUP, &thread->flags)
3072 || kthread_should_stop(),
2984 thread->timeout); 3073 thread->timeout);
2985 try_to_freeze(); 3074 try_to_freeze();
2986 3075
@@ -2989,11 +3078,8 @@ static int md_thread(void * arg)
2989 run = thread->run; 3078 run = thread->run;
2990 if (run) 3079 if (run)
2991 run(thread->mddev); 3080 run(thread->mddev);
2992
2993 if (signal_pending(current))
2994 flush_signals(current);
2995 } 3081 }
2996 complete(thread->event); 3082
2997 return 0; 3083 return 0;
2998} 3084}
2999 3085
@@ -3010,11 +3096,9 @@ mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev,
3010 const char *name) 3096 const char *name)
3011{ 3097{
3012 mdk_thread_t *thread; 3098 mdk_thread_t *thread;
3013 int ret;
3014 struct completion event; 3099 struct completion event;
3015 3100
3016 thread = (mdk_thread_t *) kmalloc 3101 thread = kmalloc(sizeof(mdk_thread_t), GFP_KERNEL);
3017 (sizeof(mdk_thread_t), GFP_KERNEL);
3018 if (!thread) 3102 if (!thread)
3019 return NULL; 3103 return NULL;
3020 3104
@@ -3027,8 +3111,8 @@ mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev,
3027 thread->mddev = mddev; 3111 thread->mddev = mddev;
3028 thread->name = name; 3112 thread->name = name;
3029 thread->timeout = MAX_SCHEDULE_TIMEOUT; 3113 thread->timeout = MAX_SCHEDULE_TIMEOUT;
3030 ret = kernel_thread(md_thread, thread, 0); 3114 thread->tsk = kthread_run(md_thread, thread, mdname(thread->mddev));
3031 if (ret < 0) { 3115 if (IS_ERR(thread->tsk)) {
3032 kfree(thread); 3116 kfree(thread);
3033 return NULL; 3117 return NULL;
3034 } 3118 }
@@ -3038,21 +3122,9 @@ mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev,
3038 3122
3039void md_unregister_thread(mdk_thread_t *thread) 3123void md_unregister_thread(mdk_thread_t *thread)
3040{ 3124{
3041 struct completion event;
3042
3043 init_completion(&event);
3044
3045 thread->event = &event;
3046
3047 /* As soon as ->run is set to NULL, the task could disappear,
3048 * so we need to hold tasklist_lock until we have sent the signal
3049 */
3050 dprintk("interrupting MD-thread pid %d\n", thread->tsk->pid); 3125 dprintk("interrupting MD-thread pid %d\n", thread->tsk->pid);
3051 read_lock(&tasklist_lock); 3126
3052 thread->run = NULL; 3127 kthread_stop(thread->tsk);
3053 send_sig(SIGKILL, thread->tsk, 1);
3054 read_unlock(&tasklist_lock);
3055 wait_for_completion(&event);
3056 kfree(thread); 3128 kfree(thread);
3057} 3129}
3058 3130
@@ -3259,10 +3331,13 @@ static int md_seq_show(struct seq_file *seq, void *v)
3259 char b[BDEVNAME_SIZE]; 3331 char b[BDEVNAME_SIZE];
3260 seq_printf(seq, " %s[%d]", 3332 seq_printf(seq, " %s[%d]",
3261 bdevname(rdev->bdev,b), rdev->desc_nr); 3333 bdevname(rdev->bdev,b), rdev->desc_nr);
3334 if (test_bit(WriteMostly, &rdev->flags))
3335 seq_printf(seq, "(W)");
3262 if (rdev->faulty) { 3336 if (rdev->faulty) {
3263 seq_printf(seq, "(F)"); 3337 seq_printf(seq, "(F)");
3264 continue; 3338 continue;
3265 } 3339 } else if (rdev->raid_disk < 0)
3340 seq_printf(seq, "(S)"); /* spare */
3266 size += rdev->size; 3341 size += rdev->size;
3267 } 3342 }
3268 3343
@@ -3274,6 +3349,15 @@ static int md_seq_show(struct seq_file *seq, void *v)
3274 seq_printf(seq, "\n %llu blocks", 3349 seq_printf(seq, "\n %llu blocks",
3275 (unsigned long long)size); 3350 (unsigned long long)size);
3276 } 3351 }
3352 if (mddev->persistent) {
3353 if (mddev->major_version != 0 ||
3354 mddev->minor_version != 90) {
3355 seq_printf(seq," super %d.%d",
3356 mddev->major_version,
3357 mddev->minor_version);
3358 }
3359 } else
3360 seq_printf(seq, " super non-persistent");
3277 3361
3278 if (mddev->pers) { 3362 if (mddev->pers) {
3279 mddev->pers->status (seq, mddev); 3363 mddev->pers->status (seq, mddev);
@@ -3416,7 +3500,6 @@ void md_done_sync(mddev_t *mddev, int blocks, int ok)
3416 */ 3500 */
3417void md_write_start(mddev_t *mddev, struct bio *bi) 3501void md_write_start(mddev_t *mddev, struct bio *bi)
3418{ 3502{
3419 DEFINE_WAIT(w);
3420 if (bio_data_dir(bi) != WRITE) 3503 if (bio_data_dir(bi) != WRITE)
3421 return; 3504 return;
3422 3505
@@ -3533,7 +3616,7 @@ static void md_do_sync(mddev_t *mddev)
3533 printk(KERN_INFO "md: syncing RAID array %s\n", mdname(mddev)); 3616 printk(KERN_INFO "md: syncing RAID array %s\n", mdname(mddev));
3534 printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed:" 3617 printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed:"
3535 " %d KB/sec/disc.\n", sysctl_speed_limit_min); 3618 " %d KB/sec/disc.\n", sysctl_speed_limit_min);
3536 printk(KERN_INFO "md: using maximum available idle IO bandwith " 3619 printk(KERN_INFO "md: using maximum available idle IO bandwidth "
3537 "(but not more than %d KB/sec) for reconstruction.\n", 3620 "(but not more than %d KB/sec) for reconstruction.\n",
3538 sysctl_speed_limit_max); 3621 sysctl_speed_limit_max);
3539 3622
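The thread rework above replaces the old kernel_thread()/daemonize()/SIGKILL scheme with the kthread API: md_register_thread() now creates the task with kthread_run(), md_unregister_thread() stops it with kthread_stop(), and the worker loop exits when kthread_should_stop() becomes true. Condensed shape of the resulting loop (assembled from the hunks and their unchanged context; error paths omitted):

	static int md_thread_sketch(void *arg)
	{
		mdk_thread_t *thread = arg;

		complete(thread->event);
		while (!kthread_should_stop()) {
			void (*run)(mddev_t *);

			/* sleep until woken, asked to stop, or the timeout expires */
			wait_event_interruptible_timeout(thread->wqueue,
				test_bit(THREAD_WAKEUP, &thread->flags)
					|| kthread_should_stop(),
				thread->timeout);
			try_to_freeze();

			clear_bit(THREAD_WAKEUP, &thread->flags);
			run = thread->run;
			if (run)
				run(thread->mddev);
		}
		return 0;
	}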
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index 2d2ca7fa0265..286342375fb7 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -169,6 +169,11 @@ static int multipath_make_request (request_queue_t *q, struct bio * bio)
169 struct multipath_bh * mp_bh; 169 struct multipath_bh * mp_bh;
170 struct multipath_info *multipath; 170 struct multipath_info *multipath;
171 171
172 if (unlikely(bio_barrier(bio))) {
173 bio_endio(bio, bio->bi_size, -EOPNOTSUPP);
174 return 0;
175 }
176
172 mp_bh = mempool_alloc(conf->pool, GFP_NOIO); 177 mp_bh = mempool_alloc(conf->pool, GFP_NOIO);
173 178
174 mp_bh->master_bio = bio; 179 mp_bh->master_bio = bio;
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 2120710172c5..f6757259ce7f 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -404,6 +404,11 @@ static int raid0_make_request (request_queue_t *q, struct bio *bio)
404 unsigned long chunk; 404 unsigned long chunk;
405 sector_t block, rsect; 405 sector_t block, rsect;
406 406
407 if (unlikely(bio_barrier(bio))) {
408 bio_endio(bio, bio->bi_size, -EOPNOTSUPP);
409 return 0;
410 }
411
407 if (bio_data_dir(bio)==WRITE) { 412 if (bio_data_dir(bio)==WRITE) {
408 disk_stat_inc(mddev->gendisk, writes); 413 disk_stat_inc(mddev->gendisk, writes);
409 disk_stat_add(mddev->gendisk, write_sectors, bio_sectors(bio)); 414 disk_stat_add(mddev->gendisk, write_sectors, bio_sectors(bio));
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 51d9645ed09c..a93ca478142a 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -222,8 +222,17 @@ static void raid_end_bio_io(r1bio_t *r1_bio)
222{ 222{
223 struct bio *bio = r1_bio->master_bio; 223 struct bio *bio = r1_bio->master_bio;
224 224
225 bio_endio(bio, bio->bi_size, 225 /* if nobody has done the final endio yet, do it now */
226 test_bit(R1BIO_Uptodate, &r1_bio->state) ? 0 : -EIO); 226 if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
227 PRINTK(KERN_DEBUG "raid1: sync end %s on sectors %llu-%llu\n",
228 (bio_data_dir(bio) == WRITE) ? "write" : "read",
229 (unsigned long long) bio->bi_sector,
230 (unsigned long long) bio->bi_sector +
231 (bio->bi_size >> 9) - 1);
232
233 bio_endio(bio, bio->bi_size,
234 test_bit(R1BIO_Uptodate, &r1_bio->state) ? 0 : -EIO);
235 }
227 free_r1bio(r1_bio); 236 free_r1bio(r1_bio);
228} 237}
229 238
@@ -292,7 +301,7 @@ static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int
292{ 301{
293 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 302 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
294 r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); 303 r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private);
295 int mirror; 304 int mirror, behind;
296 conf_t *conf = mddev_to_conf(r1_bio->mddev); 305 conf_t *conf = mddev_to_conf(r1_bio->mddev);
297 306
298 if (bio->bi_size) 307 if (bio->bi_size)
@@ -323,16 +332,46 @@ static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int
323 332
324 update_head_pos(mirror, r1_bio); 333 update_head_pos(mirror, r1_bio);
325 334
335 behind = test_bit(R1BIO_BehindIO, &r1_bio->state);
336 if (behind) {
337 if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags))
338 atomic_dec(&r1_bio->behind_remaining);
339
340 /* In behind mode, we ACK the master bio once the I/O has safely
341 * reached all non-writemostly disks. Setting the Returned bit
342 * ensures that this gets done only once -- we don't ever want to
343 * return -EIO here, instead we'll wait */
344
345 if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) &&
346 test_bit(R1BIO_Uptodate, &r1_bio->state)) {
347 /* Maybe we can return now */
348 if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
349 struct bio *mbio = r1_bio->master_bio;
350 PRINTK(KERN_DEBUG "raid1: behind end write sectors %llu-%llu\n",
351 (unsigned long long) mbio->bi_sector,
352 (unsigned long long) mbio->bi_sector +
353 (mbio->bi_size >> 9) - 1);
354 bio_endio(mbio, mbio->bi_size, 0);
355 }
356 }
357 }
326 /* 358 /*
327 * 359 *
328 * Let's see if all mirrored write operations have finished 360 * Let's see if all mirrored write operations have finished
329 * already. 361 * already.
330 */ 362 */
331 if (atomic_dec_and_test(&r1_bio->remaining)) { 363 if (atomic_dec_and_test(&r1_bio->remaining)) {
364 if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
365 /* free extra copy of the data pages */
366 int i = bio->bi_vcnt;
367 while (i--)
368 __free_page(bio->bi_io_vec[i].bv_page);
369 }
332 /* clear the bitmap if all writes complete successfully */ 370 /* clear the bitmap if all writes complete successfully */
333 bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector, 371 bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector,
334 r1_bio->sectors, 372 r1_bio->sectors,
335 !test_bit(R1BIO_Degraded, &r1_bio->state)); 373 !test_bit(R1BIO_Degraded, &r1_bio->state),
374 behind);
336 md_write_end(r1_bio->mddev); 375 md_write_end(r1_bio->mddev);
337 raid_end_bio_io(r1_bio); 376 raid_end_bio_io(r1_bio);
338 } 377 }
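
The counter juggling above is easier to see in isolation. Below is a minimal userspace sketch of the same acknowledgement rule (struct and field names are invented, and the R1BIO_Uptodate check is omitted): the master request is completed, exactly once, as soon as every write still outstanding targets a write-mostly device, because the data has then reached all other mirrors.

#include <stdio.h>

/* Hypothetical stand-in for the relevant r1bio_t counters. */
struct wb_req {
        int remaining;          /* writes still outstanding, incl. the one finishing */
        int behind_remaining;   /* outstanding writes to write-mostly devices        */
        int returned;           /* master request already acknowledged?              */
};

static void complete_one_write(struct wb_req *r, int write_mostly)
{
        if (write_mostly)
                r->behind_remaining--;

        /* Everything still pending after this write is a behind
         * (write-mostly) write, so the data has reached all other
         * mirrors: acknowledge the master request, exactly once. */
        if (r->behind_remaining >= r->remaining - 1 && !r->returned) {
                r->returned = 1;
                printf("ACK master request\n");
        }
        r->remaining--;
}

int main(void)
{
        /* three mirrors: two normal, one marked write-mostly */
        struct wb_req r = { .remaining = 3, .behind_remaining = 1 };

        complete_one_write(&r, 0);      /* first normal mirror done          */
        complete_one_write(&r, 0);      /* second normal mirror done -> ACK  */
        complete_one_write(&r, 1);      /* write-mostly mirror trails behind */
        return 0;
}
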
@@ -360,13 +399,14 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
360{ 399{
361 const unsigned long this_sector = r1_bio->sector; 400 const unsigned long this_sector = r1_bio->sector;
362 int new_disk = conf->last_used, disk = new_disk; 401 int new_disk = conf->last_used, disk = new_disk;
402 int wonly_disk = -1;
363 const int sectors = r1_bio->sectors; 403 const int sectors = r1_bio->sectors;
364 sector_t new_distance, current_distance; 404 sector_t new_distance, current_distance;
365 mdk_rdev_t *new_rdev, *rdev; 405 mdk_rdev_t *rdev;
366 406
367 rcu_read_lock(); 407 rcu_read_lock();
368 /* 408 /*
369 * Check if it if we can balance. We can balance on the whole 409 * Check if we can balance. We can balance on the whole
370 * device if no resync is going on, or below the resync window. 410 * device if no resync is going on, or below the resync window.
371 * We take the first readable disk when above the resync window. 411 * We take the first readable disk when above the resync window.
372 */ 412 */
@@ -376,11 +416,16 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
376 /* Choose the first operational device, for consistency */ 416 /* Choose the first operational device, for consistency */
377 new_disk = 0; 417 new_disk = 0;
378 418
379 while ((new_rdev=conf->mirrors[new_disk].rdev) == NULL || 419 for (rdev = conf->mirrors[new_disk].rdev;
380 !new_rdev->in_sync) { 420 !rdev || !rdev->in_sync
381 new_disk++; 421 || test_bit(WriteMostly, &rdev->flags);
382 if (new_disk == conf->raid_disks) { 422 rdev = conf->mirrors[++new_disk].rdev) {
383 new_disk = -1; 423
424 if (rdev && rdev->in_sync)
425 wonly_disk = new_disk;
426
427 if (new_disk == conf->raid_disks - 1) {
428 new_disk = wonly_disk;
384 break; 429 break;
385 } 430 }
386 } 431 }
@@ -389,16 +434,26 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
389 434
390 435
391 /* make sure the disk is operational */ 436 /* make sure the disk is operational */
392 while ((new_rdev=conf->mirrors[new_disk].rdev) == NULL || 437 for (rdev = conf->mirrors[new_disk].rdev;
393 !new_rdev->in_sync) { 438 !rdev || !rdev->in_sync ||
439 test_bit(WriteMostly, &rdev->flags);
440 rdev = conf->mirrors[new_disk].rdev) {
441
442 if (rdev && rdev->in_sync)
443 wonly_disk = new_disk;
444
394 if (new_disk <= 0) 445 if (new_disk <= 0)
395 new_disk = conf->raid_disks; 446 new_disk = conf->raid_disks;
396 new_disk--; 447 new_disk--;
397 if (new_disk == disk) { 448 if (new_disk == disk) {
398 new_disk = -1; 449 new_disk = wonly_disk;
399 goto rb_out; 450 break;
400 } 451 }
401 } 452 }
453
454 if (new_disk < 0)
455 goto rb_out;
456
402 disk = new_disk; 457 disk = new_disk;
403 /* now disk == new_disk == starting point for search */ 458 /* now disk == new_disk == starting point for search */
404 459
@@ -419,37 +474,41 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
419 disk = conf->raid_disks; 474 disk = conf->raid_disks;
420 disk--; 475 disk--;
421 476
422 if ((rdev=conf->mirrors[disk].rdev) == NULL || 477 rdev = conf->mirrors[disk].rdev;
423 !rdev->in_sync) 478
479 if (!rdev ||
480 !rdev->in_sync ||
481 test_bit(WriteMostly, &rdev->flags))
424 continue; 482 continue;
425 483
426 if (!atomic_read(&rdev->nr_pending)) { 484 if (!atomic_read(&rdev->nr_pending)) {
427 new_disk = disk; 485 new_disk = disk;
428 new_rdev = rdev;
429 break; 486 break;
430 } 487 }
431 new_distance = abs(this_sector - conf->mirrors[disk].head_position); 488 new_distance = abs(this_sector - conf->mirrors[disk].head_position);
432 if (new_distance < current_distance) { 489 if (new_distance < current_distance) {
433 current_distance = new_distance; 490 current_distance = new_distance;
434 new_disk = disk; 491 new_disk = disk;
435 new_rdev = rdev;
436 } 492 }
437 } while (disk != conf->last_used); 493 } while (disk != conf->last_used);
438 494
439rb_out: 495 rb_out:
440 496
441 497
442 if (new_disk >= 0) { 498 if (new_disk >= 0) {
443 conf->next_seq_sect = this_sector + sectors; 499 rdev = conf->mirrors[new_disk].rdev;
444 conf->last_used = new_disk; 500 if (!rdev)
445 atomic_inc(&new_rdev->nr_pending); 501 goto retry;
446 if (!new_rdev->in_sync) { 502 atomic_inc(&rdev->nr_pending);
503 if (!rdev->in_sync) {
447 /* cannot risk returning a device that failed 504 /* cannot risk returning a device that failed
448 * before we inc'ed nr_pending 505 * before we inc'ed nr_pending
449 */ 506 */
450 atomic_dec(&new_rdev->nr_pending); 507 atomic_dec(&rdev->nr_pending);
451 goto retry; 508 goto retry;
452 } 509 }
510 conf->next_seq_sect = this_sector + sectors;
511 conf->last_used = new_disk;
453 } 512 }
454 rcu_read_unlock(); 513 rcu_read_unlock();
455 514
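
Stripped of the rdev locking and of the distance/resync-window balancing, the rule the rewritten loops follow amounts to the sketch below (field names are illustrative): prefer any in-sync mirror that is not marked write-mostly, but remember one write-mostly candidate so a read can still be served when nothing better exists.

#include <stdio.h>

/* Illustrative stand-in for conf->mirrors[i].rdev state. */
struct mirror {
        int present;            /* rdev != NULL     */
        int in_sync;            /* rdev->in_sync    */
        int write_mostly;       /* WriteMostly flag */
};

static int pick_read_disk(const struct mirror *m, int n)
{
        int wonly = -1;

        for (int i = 0; i < n; i++) {
                if (!m[i].present || !m[i].in_sync)
                        continue;
                if (!m[i].write_mostly)
                        return i;       /* best case: a normal in-sync mirror */
                wonly = i;              /* remember a write-mostly fallback   */
        }
        return wonly;                   /* -1 if nothing is readable at all   */
}

int main(void)
{
        struct mirror m[3] = {
                { .present = 1, .in_sync = 1, .write_mostly = 1 },
                { .present = 0 },
                { .present = 1, .in_sync = 1, .write_mostly = 1 },
        };

        /* only write-mostly mirrors remain, so fall back to mirror 0 */
        printf("read from mirror %d\n", pick_read_disk(m, 3));
        return 0;
}
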
@@ -542,6 +601,39 @@ static void device_barrier(conf_t *conf, sector_t sect)
542 spin_unlock_irq(&conf->resync_lock); 601 spin_unlock_irq(&conf->resync_lock);
543} 602}
544 603
604/* duplicate the data pages for behind I/O */
605static struct page **alloc_behind_pages(struct bio *bio)
606{
607 int i;
608 struct bio_vec *bvec;
609 struct page **pages = kmalloc(bio->bi_vcnt * sizeof(struct page *),
610 GFP_NOIO);
611 if (unlikely(!pages))
612 goto do_sync_io;
613
614 memset(pages, 0, bio->bi_vcnt * sizeof(struct page *));
615
616 bio_for_each_segment(bvec, bio, i) {
617 pages[i] = alloc_page(GFP_NOIO);
618 if (unlikely(!pages[i]))
619 goto do_sync_io;
620 memcpy(kmap(pages[i]) + bvec->bv_offset,
621 kmap(bvec->bv_page) + bvec->bv_offset, bvec->bv_len);
622 kunmap(pages[i]);
623 kunmap(bvec->bv_page);
624 }
625
626 return pages;
627
628do_sync_io:
629 if (pages)
630 for (i = 0; i < bio->bi_vcnt && pages[i]; i++)
631 __free_page(pages[i]);
632 kfree(pages);
633 PRINTK("%dB behind alloc failed, doing sync I/O\n", bio->bi_size);
634 return NULL;
635}
636
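
A rough userspace analogue of alloc_behind_pages(), with invented types, for what the function is doing: take a private copy of every data segment so the submitter can be acknowledged before the slow mirrors finish, and if any allocation fails, release the copies and fall back to ordinary synchronous completion.

#include <stdlib.h>
#include <string.h>

/* One data segment of a request (stand-in for a struct bio_vec). */
struct segment {
        const void *data;
        size_t len;
};

static void **copy_segments(const struct segment *seg, int nseg)
{
        void **copy = calloc(nseg, sizeof(*copy));
        int i;

        if (!copy)
                return NULL;

        for (i = 0; i < nseg; i++) {
                copy[i] = malloc(seg[i].len);
                if (!copy[i])
                        goto do_sync_io;        /* give up on behind I/O */
                memcpy(copy[i], seg[i].data, seg[i].len);
        }
        return copy;

do_sync_io:
        /* free whatever was copied and fall back to synchronous I/O */
        for (i = 0; i < nseg && copy[i]; i++)
                free(copy[i]);
        free(copy);
        return NULL;
}

int main(void)
{
        char buf[512] = "payload";
        struct segment seg[1] = { { buf, sizeof(buf) } };
        void **behind = copy_segments(seg, 1);  /* NULL means stay synchronous */

        if (behind) {
                free(behind[0]);
                free(behind);
        }
        return 0;
}
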
545static int make_request(request_queue_t *q, struct bio * bio) 637static int make_request(request_queue_t *q, struct bio * bio)
546{ 638{
547 mddev_t *mddev = q->queuedata; 639 mddev_t *mddev = q->queuedata;
@@ -554,7 +646,12 @@ static int make_request(request_queue_t *q, struct bio * bio)
554 struct bitmap *bitmap = mddev->bitmap; 646 struct bitmap *bitmap = mddev->bitmap;
555 unsigned long flags; 647 unsigned long flags;
556 struct bio_list bl; 648 struct bio_list bl;
649 struct page **behind_pages = NULL;
557 650
651 if (unlikely(bio_barrier(bio))) {
652 bio_endio(bio, bio->bi_size, -EOPNOTSUPP);
653 return 0;
654 }
558 655
559 /* 656 /*
560 * Register the new request and wait if the reconstruction 657 * Register the new request and wait if the reconstruction
@@ -589,8 +686,6 @@ static int make_request(request_queue_t *q, struct bio * bio)
589 r1_bio->mddev = mddev; 686 r1_bio->mddev = mddev;
590 r1_bio->sector = bio->bi_sector; 687 r1_bio->sector = bio->bi_sector;
591 688
592 r1_bio->state = 0;
593
594 if (bio_data_dir(bio) == READ) { 689 if (bio_data_dir(bio) == READ) {
595 /* 690 /*
596 * read balancing logic: 691 * read balancing logic:
@@ -651,13 +746,22 @@ static int make_request(request_queue_t *q, struct bio * bio)
651 } 746 }
652 rcu_read_unlock(); 747 rcu_read_unlock();
653 748
749 BUG_ON(targets == 0); /* we never fail the last device */
750
654 if (targets < conf->raid_disks) { 751 if (targets < conf->raid_disks) {
655 /* array is degraded, we will not clear the bitmap 752 /* array is degraded, we will not clear the bitmap
656 * on I/O completion (see raid1_end_write_request) */ 753 * on I/O completion (see raid1_end_write_request) */
657 set_bit(R1BIO_Degraded, &r1_bio->state); 754 set_bit(R1BIO_Degraded, &r1_bio->state);
658 } 755 }
659 756
757 /* do behind I/O ? */
758 if (bitmap &&
759 atomic_read(&bitmap->behind_writes) < bitmap->max_write_behind &&
760 (behind_pages = alloc_behind_pages(bio)) != NULL)
761 set_bit(R1BIO_BehindIO, &r1_bio->state);
762
660 atomic_set(&r1_bio->remaining, 0); 763 atomic_set(&r1_bio->remaining, 0);
764 atomic_set(&r1_bio->behind_remaining, 0);
661 765
662 bio_list_init(&bl); 766 bio_list_init(&bl);
663 for (i = 0; i < disks; i++) { 767 for (i = 0; i < disks; i++) {
@@ -674,12 +778,31 @@ static int make_request(request_queue_t *q, struct bio * bio)
674 mbio->bi_rw = WRITE; 778 mbio->bi_rw = WRITE;
675 mbio->bi_private = r1_bio; 779 mbio->bi_private = r1_bio;
676 780
781 if (behind_pages) {
782 struct bio_vec *bvec;
783 int j;
784
785 /* Yes, I really want the '__' version so that
786 * we clear any unused pointer in the io_vec, rather
787 * than leave them unchanged. This is important
788 * because when we come to free the pages, we won't
789 * know the original bi_idx, so we just free
790 * them all
791 */
792 __bio_for_each_segment(bvec, mbio, j, 0)
793 bvec->bv_page = behind_pages[j];
794 if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags))
795 atomic_inc(&r1_bio->behind_remaining);
796 }
797
677 atomic_inc(&r1_bio->remaining); 798 atomic_inc(&r1_bio->remaining);
678 799
679 bio_list_add(&bl, mbio); 800 bio_list_add(&bl, mbio);
680 } 801 }
802 kfree(behind_pages); /* the behind pages are attached to the bios now */
681 803
682 bitmap_startwrite(bitmap, bio->bi_sector, r1_bio->sectors); 804 bitmap_startwrite(bitmap, bio->bi_sector, r1_bio->sectors,
805 test_bit(R1BIO_BehindIO, &r1_bio->state));
683 spin_lock_irqsave(&conf->device_lock, flags); 806 spin_lock_irqsave(&conf->device_lock, flags);
684 bio_list_merge(&conf->pending_bio_list, &bl); 807 bio_list_merge(&conf->pending_bio_list, &bl);
685 bio_list_init(&bl); 808 bio_list_init(&bl);
@@ -1105,6 +1228,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1105 sector_t max_sector, nr_sectors; 1228 sector_t max_sector, nr_sectors;
1106 int disk; 1229 int disk;
1107 int i; 1230 int i;
1231 int wonly;
1108 int write_targets = 0; 1232 int write_targets = 0;
1109 int sync_blocks; 1233 int sync_blocks;
1110 int still_degraded = 0; 1234 int still_degraded = 0;
@@ -1160,14 +1284,21 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1160 */ 1284 */
1161 disk = conf->last_used; 1285 disk = conf->last_used;
1162 /* make sure disk is operational */ 1286 /* make sure disk is operational */
1163 1287 wonly = disk;
1164 while (conf->mirrors[disk].rdev == NULL || 1288 while (conf->mirrors[disk].rdev == NULL ||
1165 !conf->mirrors[disk].rdev->in_sync) { 1289 !conf->mirrors[disk].rdev->in_sync ||
1290 test_bit(WriteMostly, &conf->mirrors[disk].rdev->flags)
1291 ) {
1292 if (conf->mirrors[disk].rdev &&
1293 conf->mirrors[disk].rdev->in_sync)
1294 wonly = disk;
1166 if (disk <= 0) 1295 if (disk <= 0)
1167 disk = conf->raid_disks; 1296 disk = conf->raid_disks;
1168 disk--; 1297 disk--;
1169 if (disk == conf->last_used) 1298 if (disk == conf->last_used) {
1299 disk = wonly;
1170 break; 1300 break;
1301 }
1171 } 1302 }
1172 conf->last_used = disk; 1303 conf->last_used = disk;
1173 atomic_inc(&conf->mirrors[disk].rdev->nr_pending); 1304 atomic_inc(&conf->mirrors[disk].rdev->nr_pending);
@@ -1439,6 +1570,17 @@ out:
1439static int stop(mddev_t *mddev) 1570static int stop(mddev_t *mddev)
1440{ 1571{
1441 conf_t *conf = mddev_to_conf(mddev); 1572 conf_t *conf = mddev_to_conf(mddev);
1573 struct bitmap *bitmap = mddev->bitmap;
1574 int behind_wait = 0;
1575
1576 /* wait for behind writes to complete */
1577 while (bitmap && atomic_read(&bitmap->behind_writes) > 0) {
1578 behind_wait++;
1579 printk(KERN_INFO "raid1: behind writes in progress on device %s, waiting to stop (%d)\n", mdname(mddev), behind_wait);
1580 set_current_state(TASK_UNINTERRUPTIBLE);
1581 schedule_timeout(HZ); /* wait a second */
1582 /* need to kick something here to make sure I/O goes? */
1583 }
1442 1584
1443 md_unregister_thread(mddev->thread); 1585 md_unregister_thread(mddev->thread);
1444 mddev->thread = NULL; 1586 mddev->thread = NULL;
@@ -1561,6 +1703,35 @@ static int raid1_reshape(mddev_t *mddev, int raid_disks)
1561 return 0; 1703 return 0;
1562} 1704}
1563 1705
1706static void raid1_quiesce(mddev_t *mddev, int state)
1707{
1708 conf_t *conf = mddev_to_conf(mddev);
1709
1710 switch(state) {
1711 case 1:
1712 spin_lock_irq(&conf->resync_lock);
1713 conf->barrier++;
1714 wait_event_lock_irq(conf->wait_idle, !conf->nr_pending,
1715 conf->resync_lock, raid1_unplug(mddev->queue));
1716 spin_unlock_irq(&conf->resync_lock);
1717 break;
1718 case 0:
1719 spin_lock_irq(&conf->resync_lock);
1720 conf->barrier--;
1721 spin_unlock_irq(&conf->resync_lock);
1722 wake_up(&conf->wait_resume);
1723 wake_up(&conf->wait_idle);
1724 break;
1725 }
1726 if (mddev->thread) {
1727 if (mddev->bitmap)
1728 mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ;
1729 else
1730 mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT;
1731 md_wakeup_thread(mddev->thread);
1732 }
1733}
1734
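
raid1_quiesce() is a small barrier protocol; the pthread sketch below (illustrative names, not the md wait-queue API) models the same handshake: state 1 raises a barrier and drains in-flight requests, state 0 drops it and wakes whatever was held back.

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  wait_idle   = PTHREAD_COND_INITIALIZER;  /* quiescer waits here */
static pthread_cond_t  wait_resume = PTHREAD_COND_INITIALIZER;  /* requests wait here  */
static int barrier;     /* raised while the array is quiesced */
static int nr_pending;  /* requests currently in flight       */

static void request_start(void)
{
        pthread_mutex_lock(&lock);
        while (barrier)                         /* held back while quiesced */
                pthread_cond_wait(&wait_resume, &lock);
        nr_pending++;
        pthread_mutex_unlock(&lock);
}

static void request_end(void)
{
        pthread_mutex_lock(&lock);
        if (--nr_pending == 0)
                pthread_cond_signal(&wait_idle);        /* array just went idle */
        pthread_mutex_unlock(&lock);
}

static void quiesce(int state)
{
        pthread_mutex_lock(&lock);
        if (state == 1) {
                barrier++;
                while (nr_pending)              /* drain in-flight requests */
                        pthread_cond_wait(&wait_idle, &lock);
        } else {
                barrier--;
                pthread_cond_broadcast(&wait_resume);
        }
        pthread_mutex_unlock(&lock);
}

int main(void)
{
        request_start();
        request_end();
        quiesce(1);             /* safe point: no I/O in flight */
        puts("array quiesced");
        quiesce(0);             /* requests may proceed again   */
        return 0;
}
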
1564 1735
1565static mdk_personality_t raid1_personality = 1736static mdk_personality_t raid1_personality =
1566{ 1737{
@@ -1577,6 +1748,7 @@ static mdk_personality_t raid1_personality =
1577 .sync_request = sync_request, 1748 .sync_request = sync_request,
1578 .resize = raid1_resize, 1749 .resize = raid1_resize,
1579 .reshape = raid1_reshape, 1750 .reshape = raid1_reshape,
1751 .quiesce = raid1_quiesce,
1580}; 1752};
1581 1753
1582static int __init raid_init(void) 1754static int __init raid_init(void)
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 62ebb1bc72be..5bd1e9ec899d 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -538,7 +538,8 @@ static int read_balance(conf_t *conf, r10bio_t *r10_bio)
538 } 538 }
539 539
540 540
541 current_distance = abs(this_sector - conf->mirrors[disk].head_position); 541 current_distance = abs(r10_bio->devs[slot].addr -
542 conf->mirrors[disk].head_position);
542 543
543 /* Find the disk whose head is closest */ 544 /* Find the disk whose head is closest */
544 545
@@ -668,6 +669,11 @@ static int make_request(request_queue_t *q, struct bio * bio)
668 int i; 669 int i;
669 int chunk_sects = conf->chunk_mask + 1; 670 int chunk_sects = conf->chunk_mask + 1;
670 671
672 if (unlikely(bio_barrier(bio))) {
673 bio_endio(bio, bio->bi_size, -EOPNOTSUPP);
674 return 0;
675 }
676
671 /* If this request crosses a chunk boundary, we need to 677 /* If this request crosses a chunk boundary, we need to
672 * split it. This will only happen for 1 PAGE (or less) requests. 678 * split it. This will only happen for 1 PAGE (or less) requests.
673 */ 679 */
@@ -900,6 +906,27 @@ static void close_sync(conf_t *conf)
900 conf->r10buf_pool = NULL; 906 conf->r10buf_pool = NULL;
901} 907}
902 908
909/* check if there are enough drives for
910 * every block to appear on at least one drive
911 */
912static int enough(conf_t *conf)
913{
914 int first = 0;
915
916 do {
917 int n = conf->copies;
918 int cnt = 0;
919 while (n--) {
920 if (conf->mirrors[first].rdev)
921 cnt++;
922 first = (first+1) % conf->raid_disks;
923 }
924 if (cnt == 0)
925 return 0;
926 } while (first != 0);
927 return 1;
928}
929
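
The walk enough() performs can be modelled directly. In the standalone sketch below (array layout and names are assumptions), the devices are stepped through in groups of `copies` consecutive slots, wrapping until the walk returns to slot 0, and each group must still contain at least one working device.

#include <stdio.h>

/* present[i] is non-zero if slot i still has a working device. */
static int enough(const int *present, int raid_disks, int copies)
{
        int first = 0;

        do {
                int n = copies, cnt = 0;

                while (n--) {
                        if (present[first])
                                cnt++;
                        first = (first + 1) % raid_disks;
                }
                if (cnt == 0)
                        return 0;       /* some block has lost every copy */
        } while (first != 0);
        return 1;
}

int main(void)
{
        int ok[4]   = { 1, 0, 1, 0 };   /* each group of 2 keeps one disk   */
        int lost[4] = { 1, 1, 0, 0 };   /* one group lost both of its disks */

        printf("%d %d\n", enough(ok, 4, 2), enough(lost, 4, 2));  /* "1 0" */
        return 0;
}
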
903static int raid10_spare_active(mddev_t *mddev) 930static int raid10_spare_active(mddev_t *mddev)
904{ 931{
905 int i; 932 int i;
@@ -938,6 +965,8 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
938 * very different from resync 965 * very different from resync
939 */ 966 */
940 return 0; 967 return 0;
968 if (!enough(conf))
969 return 0;
941 970
942 for (mirror=0; mirror < mddev->raid_disks; mirror++) 971 for (mirror=0; mirror < mddev->raid_disks; mirror++)
943 if ( !(p=conf->mirrors+mirror)->rdev) { 972 if ( !(p=conf->mirrors+mirror)->rdev) {
@@ -1445,7 +1474,13 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1445 } 1474 }
1446 } 1475 }
1447 if (j == conf->copies) { 1476 if (j == conf->copies) {
1448 BUG(); 1477 /* Cannot recover, so abort the recovery */
1478 put_buf(r10_bio);
1479 r10_bio = rb2;
1480 if (!test_and_set_bit(MD_RECOVERY_ERR, &mddev->recovery))
1481 printk(KERN_INFO "raid10: %s: insufficient working devices for recovery.\n",
1482 mdname(mddev));
1483 break;
1449 } 1484 }
1450 } 1485 }
1451 if (biolist == NULL) { 1486 if (biolist == NULL) {
@@ -1678,9 +1713,10 @@ static int run(mddev_t *mddev)
1678 init_waitqueue_head(&conf->wait_idle); 1713 init_waitqueue_head(&conf->wait_idle);
1679 init_waitqueue_head(&conf->wait_resume); 1714 init_waitqueue_head(&conf->wait_resume);
1680 1715
1681 if (!conf->working_disks) { 1716 /* need to check that every block has at least one working mirror */
1682 printk(KERN_ERR "raid10: no operational mirrors for %s\n", 1717 if (!enough(conf)) {
1683 mdname(mddev)); 1718 printk(KERN_ERR "raid10: not enough operational mirrors for %s\n",
1719 mdname(mddev));
1684 goto out_free_conf; 1720 goto out_free_conf;
1685 } 1721 }
1686 1722
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 43f231a467d5..4683ca24c046 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -24,6 +24,8 @@
24#include <linux/bitops.h> 24#include <linux/bitops.h>
25#include <asm/atomic.h> 25#include <asm/atomic.h>
26 26
27#include <linux/raid/bitmap.h>
28
27/* 29/*
28 * Stripe cache 30 * Stripe cache
29 */ 31 */
@@ -79,8 +81,13 @@ static inline void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
79 if (test_bit(STRIPE_HANDLE, &sh->state)) { 81 if (test_bit(STRIPE_HANDLE, &sh->state)) {
80 if (test_bit(STRIPE_DELAYED, &sh->state)) 82 if (test_bit(STRIPE_DELAYED, &sh->state))
81 list_add_tail(&sh->lru, &conf->delayed_list); 83 list_add_tail(&sh->lru, &conf->delayed_list);
82 else 84 else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
85 conf->seq_write == sh->bm_seq)
86 list_add_tail(&sh->lru, &conf->bitmap_list);
87 else {
88 clear_bit(STRIPE_BIT_DELAY, &sh->state);
83 list_add_tail(&sh->lru, &conf->handle_list); 89 list_add_tail(&sh->lru, &conf->handle_list);
90 }
84 md_wakeup_thread(conf->mddev->thread); 91 md_wakeup_thread(conf->mddev->thread);
85 } else { 92 } else {
86 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { 93 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
@@ -244,6 +251,9 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector
244 spin_lock_irq(&conf->device_lock); 251 spin_lock_irq(&conf->device_lock);
245 252
246 do { 253 do {
254 wait_event_lock_irq(conf->wait_for_stripe,
255 conf->quiesce == 0,
256 conf->device_lock, /* nothing */);
247 sh = __find_stripe(conf, sector); 257 sh = __find_stripe(conf, sector);
248 if (!sh) { 258 if (!sh) {
249 if (!conf->inactive_blocked) 259 if (!conf->inactive_blocked)
@@ -803,6 +813,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
803{ 813{
804 struct bio **bip; 814 struct bio **bip;
805 raid5_conf_t *conf = sh->raid_conf; 815 raid5_conf_t *conf = sh->raid_conf;
816 int firstwrite=0;
806 817
807 PRINTK("adding bh b#%llu to stripe s#%llu\n", 818 PRINTK("adding bh b#%llu to stripe s#%llu\n",
808 (unsigned long long)bi->bi_sector, 819 (unsigned long long)bi->bi_sector,
@@ -811,9 +822,11 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
811 822
812 spin_lock(&sh->lock); 823 spin_lock(&sh->lock);
813 spin_lock_irq(&conf->device_lock); 824 spin_lock_irq(&conf->device_lock);
814 if (forwrite) 825 if (forwrite) {
815 bip = &sh->dev[dd_idx].towrite; 826 bip = &sh->dev[dd_idx].towrite;
816 else 827 if (*bip == NULL && sh->dev[dd_idx].written == NULL)
828 firstwrite = 1;
829 } else
817 bip = &sh->dev[dd_idx].toread; 830 bip = &sh->dev[dd_idx].toread;
818 while (*bip && (*bip)->bi_sector < bi->bi_sector) { 831 while (*bip && (*bip)->bi_sector < bi->bi_sector) {
819 if ((*bip)->bi_sector + ((*bip)->bi_size >> 9) > bi->bi_sector) 832 if ((*bip)->bi_sector + ((*bip)->bi_size >> 9) > bi->bi_sector)
@@ -836,6 +849,13 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
836 (unsigned long long)bi->bi_sector, 849 (unsigned long long)bi->bi_sector,
837 (unsigned long long)sh->sector, dd_idx); 850 (unsigned long long)sh->sector, dd_idx);
838 851
852 if (conf->mddev->bitmap && firstwrite) {
853 sh->bm_seq = conf->seq_write;
854 bitmap_startwrite(conf->mddev->bitmap, sh->sector,
855 STRIPE_SECTORS, 0);
856 set_bit(STRIPE_BIT_DELAY, &sh->state);
857 }
858
839 if (forwrite) { 859 if (forwrite) {
840 /* check if page is covered */ 860 /* check if page is covered */
841 sector_t sector = sh->dev[dd_idx].sector; 861 sector_t sector = sh->dev[dd_idx].sector;
@@ -958,12 +978,13 @@ static void handle_stripe(struct stripe_head *sh)
958 * need to be failed 978 * need to be failed
959 */ 979 */
960 if (failed > 1 && to_read+to_write+written) { 980 if (failed > 1 && to_read+to_write+written) {
961 spin_lock_irq(&conf->device_lock);
962 for (i=disks; i--; ) { 981 for (i=disks; i--; ) {
982 int bitmap_end = 0;
983 spin_lock_irq(&conf->device_lock);
963 /* fail all writes first */ 984 /* fail all writes first */
964 bi = sh->dev[i].towrite; 985 bi = sh->dev[i].towrite;
965 sh->dev[i].towrite = NULL; 986 sh->dev[i].towrite = NULL;
966 if (bi) to_write--; 987 if (bi) { to_write--; bitmap_end = 1; }
967 988
968 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 989 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
969 wake_up(&conf->wait_for_overlap); 990 wake_up(&conf->wait_for_overlap);
@@ -981,6 +1002,7 @@ static void handle_stripe(struct stripe_head *sh)
981 /* and fail all 'written' */ 1002 /* and fail all 'written' */
982 bi = sh->dev[i].written; 1003 bi = sh->dev[i].written;
983 sh->dev[i].written = NULL; 1004 sh->dev[i].written = NULL;
1005 if (bi) bitmap_end = 1;
984 while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS) { 1006 while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS) {
985 struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector); 1007 struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
986 clear_bit(BIO_UPTODATE, &bi->bi_flags); 1008 clear_bit(BIO_UPTODATE, &bi->bi_flags);
@@ -1009,8 +1031,11 @@ static void handle_stripe(struct stripe_head *sh)
1009 bi = nextbi; 1031 bi = nextbi;
1010 } 1032 }
1011 } 1033 }
1034 spin_unlock_irq(&conf->device_lock);
1035 if (bitmap_end)
1036 bitmap_endwrite(conf->mddev->bitmap, sh->sector,
1037 STRIPE_SECTORS, 0, 0);
1012 } 1038 }
1013 spin_unlock_irq(&conf->device_lock);
1014 } 1039 }
1015 if (failed > 1 && syncing) { 1040 if (failed > 1 && syncing) {
1016 md_done_sync(conf->mddev, STRIPE_SECTORS,0); 1041 md_done_sync(conf->mddev, STRIPE_SECTORS,0);
@@ -1038,6 +1063,7 @@ static void handle_stripe(struct stripe_head *sh)
1038 test_bit(R5_UPTODATE, &dev->flags) ) { 1063 test_bit(R5_UPTODATE, &dev->flags) ) {
1039 /* We can return any write requests */ 1064 /* We can return any write requests */
1040 struct bio *wbi, *wbi2; 1065 struct bio *wbi, *wbi2;
1066 int bitmap_end = 0;
1041 PRINTK("Return write for disc %d\n", i); 1067 PRINTK("Return write for disc %d\n", i);
1042 spin_lock_irq(&conf->device_lock); 1068 spin_lock_irq(&conf->device_lock);
1043 wbi = dev->written; 1069 wbi = dev->written;
@@ -1051,7 +1077,13 @@ static void handle_stripe(struct stripe_head *sh)
1051 } 1077 }
1052 wbi = wbi2; 1078 wbi = wbi2;
1053 } 1079 }
1080 if (dev->towrite == NULL)
1081 bitmap_end = 1;
1054 spin_unlock_irq(&conf->device_lock); 1082 spin_unlock_irq(&conf->device_lock);
1083 if (bitmap_end)
1084 bitmap_endwrite(conf->mddev->bitmap, sh->sector,
1085 STRIPE_SECTORS,
1086 !test_bit(STRIPE_DEGRADED, &sh->state), 0);
1055 } 1087 }
1056 } 1088 }
1057 } 1089 }
@@ -1175,7 +1207,8 @@ static void handle_stripe(struct stripe_head *sh)
1175 } 1207 }
1176 } 1208 }
1177 /* now if nothing is locked, and if we have enough data, we can start a write request */ 1209 /* now if nothing is locked, and if we have enough data, we can start a write request */
1178 if (locked == 0 && (rcw == 0 ||rmw == 0)) { 1210 if (locked == 0 && (rcw == 0 ||rmw == 0) &&
1211 !test_bit(STRIPE_BIT_DELAY, &sh->state)) {
1179 PRINTK("Computing parity...\n"); 1212 PRINTK("Computing parity...\n");
1180 compute_parity(sh, rcw==0 ? RECONSTRUCT_WRITE : READ_MODIFY_WRITE); 1213 compute_parity(sh, rcw==0 ? RECONSTRUCT_WRITE : READ_MODIFY_WRITE);
1181 /* now every locked buffer is ready to be written */ 1214 /* now every locked buffer is ready to be written */
@@ -1231,6 +1264,7 @@ static void handle_stripe(struct stripe_head *sh)
1231 dev = &sh->dev[failed_num]; 1264 dev = &sh->dev[failed_num];
1232 set_bit(R5_LOCKED, &dev->flags); 1265 set_bit(R5_LOCKED, &dev->flags);
1233 set_bit(R5_Wantwrite, &dev->flags); 1266 set_bit(R5_Wantwrite, &dev->flags);
1267 clear_bit(STRIPE_DEGRADED, &sh->state);
1234 locked++; 1268 locked++;
1235 set_bit(STRIPE_INSYNC, &sh->state); 1269 set_bit(STRIPE_INSYNC, &sh->state);
1236 set_bit(R5_Syncio, &dev->flags); 1270 set_bit(R5_Syncio, &dev->flags);
@@ -1298,6 +1332,8 @@ static void handle_stripe(struct stripe_head *sh)
1298 bi->bi_next = NULL; 1332 bi->bi_next = NULL;
1299 generic_make_request(bi); 1333 generic_make_request(bi);
1300 } else { 1334 } else {
1335 if (rw == 1)
1336 set_bit(STRIPE_DEGRADED, &sh->state);
1301 PRINTK("skip op %ld on disc %d for sector %llu\n", 1337 PRINTK("skip op %ld on disc %d for sector %llu\n",
1302 bi->bi_rw, i, (unsigned long long)sh->sector); 1338 bi->bi_rw, i, (unsigned long long)sh->sector);
1303 clear_bit(R5_LOCKED, &sh->dev[i].flags); 1339 clear_bit(R5_LOCKED, &sh->dev[i].flags);
@@ -1322,6 +1358,20 @@ static inline void raid5_activate_delayed(raid5_conf_t *conf)
1322 } 1358 }
1323} 1359}
1324 1360
1361static inline void activate_bit_delay(raid5_conf_t *conf)
1362{
1363 /* device_lock is held */
1364 struct list_head head;
1365 list_add(&head, &conf->bitmap_list);
1366 list_del_init(&conf->bitmap_list);
1367 while (!list_empty(&head)) {
1368 struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru);
1369 list_del_init(&sh->lru);
1370 atomic_inc(&sh->count);
1371 __release_stripe(conf, sh);
1372 }
1373}
1374
1325static void unplug_slaves(mddev_t *mddev) 1375static void unplug_slaves(mddev_t *mddev)
1326{ 1376{
1327 raid5_conf_t *conf = mddev_to_conf(mddev); 1377 raid5_conf_t *conf = mddev_to_conf(mddev);
@@ -1354,8 +1404,10 @@ static void raid5_unplug_device(request_queue_t *q)
1354 1404
1355 spin_lock_irqsave(&conf->device_lock, flags); 1405 spin_lock_irqsave(&conf->device_lock, flags);
1356 1406
1357 if (blk_remove_plug(q)) 1407 if (blk_remove_plug(q)) {
1408 conf->seq_flush++;
1358 raid5_activate_delayed(conf); 1409 raid5_activate_delayed(conf);
1410 }
1359 md_wakeup_thread(mddev->thread); 1411 md_wakeup_thread(mddev->thread);
1360 1412
1361 spin_unlock_irqrestore(&conf->device_lock, flags); 1413 spin_unlock_irqrestore(&conf->device_lock, flags);
@@ -1411,6 +1463,11 @@ static int make_request (request_queue_t *q, struct bio * bi)
1411 sector_t logical_sector, last_sector; 1463 sector_t logical_sector, last_sector;
1412 struct stripe_head *sh; 1464 struct stripe_head *sh;
1413 1465
1466 if (unlikely(bio_barrier(bi))) {
1467 bio_endio(bi, bi->bi_size, -EOPNOTSUPP);
1468 return 0;
1469 }
1470
1414 md_write_start(mddev, bi); 1471 md_write_start(mddev, bi);
1415 1472
1416 if (bio_data_dir(bi)==WRITE) { 1473 if (bio_data_dir(bi)==WRITE) {
@@ -1488,10 +1545,20 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1488 sector_t first_sector; 1545 sector_t first_sector;
1489 int raid_disks = conf->raid_disks; 1546 int raid_disks = conf->raid_disks;
1490 int data_disks = raid_disks-1; 1547 int data_disks = raid_disks-1;
1548 sector_t max_sector = mddev->size << 1;
1549 int sync_blocks;
1491 1550
1492 if (sector_nr >= mddev->size <<1) { 1551 if (sector_nr >= max_sector) {
1493 /* just being told to finish up .. nothing much to do */ 1552 /* just being told to finish up .. nothing much to do */
1494 unplug_slaves(mddev); 1553 unplug_slaves(mddev);
1554
1555 if (mddev->curr_resync < max_sector) /* aborted */
1556 bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
1557 &sync_blocks, 1);
1558 else /* completed sync */
1559 conf->fullsync = 0;
1560 bitmap_close_sync(mddev->bitmap);
1561
1495 return 0; 1562 return 0;
1496 } 1563 }
1497 /* if there is 1 or more failed drives and we are trying 1564 /* if there is 1 or more failed drives and we are trying
@@ -1503,6 +1570,13 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1503 *skipped = 1; 1570 *skipped = 1;
1504 return rv; 1571 return rv;
1505 } 1572 }
1573 if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
1574 !conf->fullsync && sync_blocks >= STRIPE_SECTORS) {
1575 /* we can skip this block, and probably more */
1576 sync_blocks /= STRIPE_SECTORS;
1577 *skipped = 1;
1578 return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */
1579 }
1506 1580
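
The skip is deliberately rounded down to whole stripes; a tiny worked example of that arithmetic (STRIPE_SECTORS is 8 with 4 KiB pages):

#include <stdio.h>

#define STRIPE_SECTORS 8        /* one 4 KiB stripe page, in 512-byte sectors */

int main(void)
{
        int sync_blocks = 53;   /* sectors the bitmap reports as already in sync */
        int skip = (sync_blocks / STRIPE_SECTORS) * STRIPE_SECTORS;

        printf("skip %d of %d clean sectors\n", skip, sync_blocks);  /* 48 of 53 */
        return 0;
}
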
1507 x = sector_nr; 1581 x = sector_nr;
1508 chunk_offset = sector_div(x, sectors_per_chunk); 1582 chunk_offset = sector_div(x, sectors_per_chunk);
@@ -1520,6 +1594,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1520 set_current_state(TASK_UNINTERRUPTIBLE); 1594 set_current_state(TASK_UNINTERRUPTIBLE);
1521 schedule_timeout(1); 1595 schedule_timeout(1);
1522 } 1596 }
1597 bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 0);
1523 spin_lock(&sh->lock); 1598 spin_lock(&sh->lock);
1524 set_bit(STRIPE_SYNCING, &sh->state); 1599 set_bit(STRIPE_SYNCING, &sh->state);
1525 clear_bit(STRIPE_INSYNC, &sh->state); 1600 clear_bit(STRIPE_INSYNC, &sh->state);
@@ -1553,6 +1628,13 @@ static void raid5d (mddev_t *mddev)
1553 while (1) { 1628 while (1) {
1554 struct list_head *first; 1629 struct list_head *first;
1555 1630
1631 if (conf->seq_flush - conf->seq_write > 0) {
1632 int seq = conf->seq_flush;
1633 bitmap_unplug(mddev->bitmap);
1634 conf->seq_write = seq;
1635 activate_bit_delay(conf);
1636 }
1637
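
The seq_flush/seq_write pair is an ordering gate: a stripe whose bitmap bit is still only in memory is parked with the sequence number current at submit time, and released for real I/O only after the daemon has flushed the bitmap past that number. A userspace model of that gate, with invented names:

#include <stdio.h>

static unsigned seq_flush;      /* bumped each time the queue is unplugged          */
static unsigned seq_write;      /* sequence up to which the bitmap has been flushed */

struct stripe {
        unsigned bm_seq;
        const char *name;
};

static void submit_stripe(struct stripe *sh)
{
        sh->bm_seq = seq_write;         /* its bitmap bit is only in memory so far */
        printf("%s parked (bm_seq %u)\n", sh->name, sh->bm_seq);
}

static void unplug(void)
{
        seq_flush++;                    /* ask the daemon for a bitmap flush */
}

static void daemon_pass(struct stripe *parked, int n)
{
        if (seq_flush > seq_write) {
                /* bitmap_unplug(): dirty bitmap pages reach stable storage here */
                seq_write = seq_flush;
                for (int i = 0; i < n; i++)
                        if (parked[i].bm_seq < seq_write)
                                printf("%s released for I/O\n", parked[i].name);
        }
}

int main(void)
{
        struct stripe sh = { .name = "stripe A" };

        submit_stripe(&sh);     /* parked: bitmap bit not yet on disk          */
        daemon_pass(&sh, 1);    /* no flush requested, stripe stays parked     */
        unplug();
        daemon_pass(&sh, 1);    /* bitmap flushed first, then the data may go  */
        return 0;
}
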
1556 if (list_empty(&conf->handle_list) && 1638 if (list_empty(&conf->handle_list) &&
1557 atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD && 1639 atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD &&
1558 !blk_queue_plugged(mddev->queue) && 1640 !blk_queue_plugged(mddev->queue) &&
@@ -1586,7 +1668,7 @@ static void raid5d (mddev_t *mddev)
1586 PRINTK("--- raid5d inactive\n"); 1668 PRINTK("--- raid5d inactive\n");
1587} 1669}
1588 1670
1589static int run (mddev_t *mddev) 1671static int run(mddev_t *mddev)
1590{ 1672{
1591 raid5_conf_t *conf; 1673 raid5_conf_t *conf;
1592 int raid_disk, memory; 1674 int raid_disk, memory;
@@ -1616,6 +1698,7 @@ static int run (mddev_t *mddev)
1616 init_waitqueue_head(&conf->wait_for_overlap); 1698 init_waitqueue_head(&conf->wait_for_overlap);
1617 INIT_LIST_HEAD(&conf->handle_list); 1699 INIT_LIST_HEAD(&conf->handle_list);
1618 INIT_LIST_HEAD(&conf->delayed_list); 1700 INIT_LIST_HEAD(&conf->delayed_list);
1701 INIT_LIST_HEAD(&conf->bitmap_list);
1619 INIT_LIST_HEAD(&conf->inactive_list); 1702 INIT_LIST_HEAD(&conf->inactive_list);
1620 atomic_set(&conf->active_stripes, 0); 1703 atomic_set(&conf->active_stripes, 0);
1621 atomic_set(&conf->preread_active_stripes, 0); 1704 atomic_set(&conf->preread_active_stripes, 0);
@@ -1727,6 +1810,9 @@ memory = conf->max_nr_stripes * (sizeof(struct stripe_head) +
1727 1810
1728 /* Ok, everything is just fine now */ 1811 /* Ok, everything is just fine now */
1729 1812
1813 if (mddev->bitmap)
1814 mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ;
1815
1730 mddev->queue->unplug_fn = raid5_unplug_device; 1816 mddev->queue->unplug_fn = raid5_unplug_device;
1731 mddev->queue->issue_flush_fn = raid5_issue_flush; 1817 mddev->queue->issue_flush_fn = raid5_issue_flush;
1732 1818
@@ -1907,6 +1993,8 @@ static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
1907 rdev->in_sync = 0; 1993 rdev->in_sync = 0;
1908 rdev->raid_disk = disk; 1994 rdev->raid_disk = disk;
1909 found = 1; 1995 found = 1;
1996 if (rdev->saved_raid_disk != disk)
1997 conf->fullsync = 1;
1910 p->rdev = rdev; 1998 p->rdev = rdev;
1911 break; 1999 break;
1912 } 2000 }
@@ -1936,6 +2024,35 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors)
1936 return 0; 2024 return 0;
1937} 2025}
1938 2026
2027static void raid5_quiesce(mddev_t *mddev, int state)
2028{
2029 raid5_conf_t *conf = mddev_to_conf(mddev);
2030
2031 switch(state) {
2032 case 1: /* stop all writes */
2033 spin_lock_irq(&conf->device_lock);
2034 conf->quiesce = 1;
2035 wait_event_lock_irq(conf->wait_for_stripe,
2036 atomic_read(&conf->active_stripes) == 0,
2037 conf->device_lock, /* nothing */);
2038 spin_unlock_irq(&conf->device_lock);
2039 break;
2040
2041 case 0: /* re-enable writes */
2042 spin_lock_irq(&conf->device_lock);
2043 conf->quiesce = 0;
2044 wake_up(&conf->wait_for_stripe);
2045 spin_unlock_irq(&conf->device_lock);
2046 break;
2047 }
2048 if (mddev->thread) {
2049 if (mddev->bitmap)
2050 mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ;
2051 else
2052 mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT;
2053 md_wakeup_thread(mddev->thread);
2054 }
2055}
1939static mdk_personality_t raid5_personality= 2056static mdk_personality_t raid5_personality=
1940{ 2057{
1941 .name = "raid5", 2058 .name = "raid5",
@@ -1950,6 +2067,7 @@ static mdk_personality_t raid5_personality=
1950 .spare_active = raid5_spare_active, 2067 .spare_active = raid5_spare_active,
1951 .sync_request = sync_request, 2068 .sync_request = sync_request,
1952 .resize = raid5_resize, 2069 .resize = raid5_resize,
2070 .quiesce = raid5_quiesce,
1953}; 2071};
1954 2072
1955static int __init raid5_init (void) 2073static int __init raid5_init (void)
diff --git a/drivers/md/raid6main.c b/drivers/md/raid6main.c
index 495dee1d1e83..267eb1430c83 100644
--- a/drivers/md/raid6main.c
+++ b/drivers/md/raid6main.c
@@ -29,6 +29,8 @@
29#include <asm/atomic.h> 29#include <asm/atomic.h>
30#include "raid6.h" 30#include "raid6.h"
31 31
32#include <linux/raid/bitmap.h>
33
32/* 34/*
33 * Stripe cache 35 * Stripe cache
34 */ 36 */
@@ -98,8 +100,13 @@ static inline void __release_stripe(raid6_conf_t *conf, struct stripe_head *sh)
98 if (test_bit(STRIPE_HANDLE, &sh->state)) { 100 if (test_bit(STRIPE_HANDLE, &sh->state)) {
99 if (test_bit(STRIPE_DELAYED, &sh->state)) 101 if (test_bit(STRIPE_DELAYED, &sh->state))
100 list_add_tail(&sh->lru, &conf->delayed_list); 102 list_add_tail(&sh->lru, &conf->delayed_list);
101 else 103 else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
104 conf->seq_write == sh->bm_seq)
105 list_add_tail(&sh->lru, &conf->bitmap_list);
106 else {
107 clear_bit(STRIPE_BIT_DELAY, &sh->state);
102 list_add_tail(&sh->lru, &conf->handle_list); 108 list_add_tail(&sh->lru, &conf->handle_list);
109 }
103 md_wakeup_thread(conf->mddev->thread); 110 md_wakeup_thread(conf->mddev->thread);
104 } else { 111 } else {
105 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { 112 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
@@ -262,6 +269,9 @@ static struct stripe_head *get_active_stripe(raid6_conf_t *conf, sector_t sector
262 spin_lock_irq(&conf->device_lock); 269 spin_lock_irq(&conf->device_lock);
263 270
264 do { 271 do {
272 wait_event_lock_irq(conf->wait_for_stripe,
273 conf->quiesce == 0,
274 conf->device_lock, /* nothing */);
265 sh = __find_stripe(conf, sector); 275 sh = __find_stripe(conf, sector);
266 if (!sh) { 276 if (!sh) {
267 if (!conf->inactive_blocked) 277 if (!conf->inactive_blocked)
@@ -906,6 +916,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
906{ 916{
907 struct bio **bip; 917 struct bio **bip;
908 raid6_conf_t *conf = sh->raid_conf; 918 raid6_conf_t *conf = sh->raid_conf;
919 int firstwrite=0;
909 920
910 PRINTK("adding bh b#%llu to stripe s#%llu\n", 921 PRINTK("adding bh b#%llu to stripe s#%llu\n",
911 (unsigned long long)bi->bi_sector, 922 (unsigned long long)bi->bi_sector,
@@ -914,9 +925,11 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
914 925
915 spin_lock(&sh->lock); 926 spin_lock(&sh->lock);
916 spin_lock_irq(&conf->device_lock); 927 spin_lock_irq(&conf->device_lock);
917 if (forwrite) 928 if (forwrite) {
918 bip = &sh->dev[dd_idx].towrite; 929 bip = &sh->dev[dd_idx].towrite;
919 else 930 if (*bip == NULL && sh->dev[dd_idx].written == NULL)
931 firstwrite = 1;
932 } else
920 bip = &sh->dev[dd_idx].toread; 933 bip = &sh->dev[dd_idx].toread;
921 while (*bip && (*bip)->bi_sector < bi->bi_sector) { 934 while (*bip && (*bip)->bi_sector < bi->bi_sector) {
922 if ((*bip)->bi_sector + ((*bip)->bi_size >> 9) > bi->bi_sector) 935 if ((*bip)->bi_sector + ((*bip)->bi_size >> 9) > bi->bi_sector)
@@ -939,6 +952,13 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
939 (unsigned long long)bi->bi_sector, 952 (unsigned long long)bi->bi_sector,
940 (unsigned long long)sh->sector, dd_idx); 953 (unsigned long long)sh->sector, dd_idx);
941 954
955 if (conf->mddev->bitmap && firstwrite) {
956 sh->bm_seq = conf->seq_write;
957 bitmap_startwrite(conf->mddev->bitmap, sh->sector,
958 STRIPE_SECTORS, 0);
959 set_bit(STRIPE_BIT_DELAY, &sh->state);
960 }
961
942 if (forwrite) { 962 if (forwrite) {
943 /* check if page is covered */ 963 /* check if page is covered */
944 sector_t sector = sh->dev[dd_idx].sector; 964 sector_t sector = sh->dev[dd_idx].sector;
@@ -1066,12 +1086,13 @@ static void handle_stripe(struct stripe_head *sh)
1066 * need to be failed 1086 * need to be failed
1067 */ 1087 */
1068 if (failed > 2 && to_read+to_write+written) { 1088 if (failed > 2 && to_read+to_write+written) {
1069 spin_lock_irq(&conf->device_lock);
1070 for (i=disks; i--; ) { 1089 for (i=disks; i--; ) {
1090 int bitmap_end = 0;
1091 spin_lock_irq(&conf->device_lock);
1071 /* fail all writes first */ 1092 /* fail all writes first */
1072 bi = sh->dev[i].towrite; 1093 bi = sh->dev[i].towrite;
1073 sh->dev[i].towrite = NULL; 1094 sh->dev[i].towrite = NULL;
1074 if (bi) to_write--; 1095 if (bi) { to_write--; bitmap_end = 1; }
1075 1096
1076 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 1097 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
1077 wake_up(&conf->wait_for_overlap); 1098 wake_up(&conf->wait_for_overlap);
@@ -1089,6 +1110,7 @@ static void handle_stripe(struct stripe_head *sh)
1089 /* and fail all 'written' */ 1110 /* and fail all 'written' */
1090 bi = sh->dev[i].written; 1111 bi = sh->dev[i].written;
1091 sh->dev[i].written = NULL; 1112 sh->dev[i].written = NULL;
1113 if (bi) bitmap_end = 1;
1092 while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS) { 1114 while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS) {
1093 struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector); 1115 struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
1094 clear_bit(BIO_UPTODATE, &bi->bi_flags); 1116 clear_bit(BIO_UPTODATE, &bi->bi_flags);
@@ -1117,8 +1139,11 @@ static void handle_stripe(struct stripe_head *sh)
1117 bi = nextbi; 1139 bi = nextbi;
1118 } 1140 }
1119 } 1141 }
1142 spin_unlock_irq(&conf->device_lock);
1143 if (bitmap_end)
1144 bitmap_endwrite(conf->mddev->bitmap, sh->sector,
1145 STRIPE_SECTORS, 0, 0);
1120 } 1146 }
1121 spin_unlock_irq(&conf->device_lock);
1122 } 1147 }
1123 if (failed > 2 && syncing) { 1148 if (failed > 2 && syncing) {
1124 md_done_sync(conf->mddev, STRIPE_SECTORS,0); 1149 md_done_sync(conf->mddev, STRIPE_SECTORS,0);
@@ -1155,6 +1180,7 @@ static void handle_stripe(struct stripe_head *sh)
1155 if (!test_bit(R5_LOCKED, &dev->flags) && 1180 if (!test_bit(R5_LOCKED, &dev->flags) &&
1156 test_bit(R5_UPTODATE, &dev->flags) ) { 1181 test_bit(R5_UPTODATE, &dev->flags) ) {
1157 /* We can return any write requests */ 1182 /* We can return any write requests */
1183 int bitmap_end = 0;
1158 struct bio *wbi, *wbi2; 1184 struct bio *wbi, *wbi2;
1159 PRINTK("Return write for stripe %llu disc %d\n", 1185 PRINTK("Return write for stripe %llu disc %d\n",
1160 (unsigned long long)sh->sector, i); 1186 (unsigned long long)sh->sector, i);
@@ -1170,7 +1196,13 @@ static void handle_stripe(struct stripe_head *sh)
1170 } 1196 }
1171 wbi = wbi2; 1197 wbi = wbi2;
1172 } 1198 }
1199 if (dev->towrite == NULL)
1200 bitmap_end = 1;
1173 spin_unlock_irq(&conf->device_lock); 1201 spin_unlock_irq(&conf->device_lock);
1202 if (bitmap_end)
1203 bitmap_endwrite(conf->mddev->bitmap, sh->sector,
1204 STRIPE_SECTORS,
1205 !test_bit(STRIPE_DEGRADED, &sh->state), 0);
1174 } 1206 }
1175 } 1207 }
1176 } 1208 }
@@ -1285,7 +1317,8 @@ static void handle_stripe(struct stripe_head *sh)
1285 } 1317 }
1286 } 1318 }
1287 /* now if nothing is locked, and if we have enough data, we can start a write request */ 1319 /* now if nothing is locked, and if we have enough data, we can start a write request */
1288 if (locked == 0 && rcw == 0) { 1320 if (locked == 0 && rcw == 0 &&
1321 !test_bit(STRIPE_BIT_DELAY, &sh->state)) {
1289 if ( must_compute > 0 ) { 1322 if ( must_compute > 0 ) {
1290 /* We have failed blocks and need to compute them */ 1323 /* We have failed blocks and need to compute them */
1291 switch ( failed ) { 1324 switch ( failed ) {
@@ -1388,6 +1421,7 @@ static void handle_stripe(struct stripe_head *sh)
1388 bdev = &sh->dev[failed_num[1]]; 1421 bdev = &sh->dev[failed_num[1]];
1389 locked += !test_bit(R5_LOCKED, &bdev->flags); 1422 locked += !test_bit(R5_LOCKED, &bdev->flags);
1390 set_bit(R5_LOCKED, &bdev->flags); 1423 set_bit(R5_LOCKED, &bdev->flags);
1424 clear_bit(STRIPE_DEGRADED, &sh->state);
1391 set_bit(R5_Wantwrite, &bdev->flags); 1425 set_bit(R5_Wantwrite, &bdev->flags);
1392 1426
1393 set_bit(STRIPE_INSYNC, &sh->state); 1427 set_bit(STRIPE_INSYNC, &sh->state);
@@ -1457,6 +1491,8 @@ static void handle_stripe(struct stripe_head *sh)
1457 bi->bi_next = NULL; 1491 bi->bi_next = NULL;
1458 generic_make_request(bi); 1492 generic_make_request(bi);
1459 } else { 1493 } else {
1494 if (rw == 1)
1495 set_bit(STRIPE_DEGRADED, &sh->state);
1460 PRINTK("skip op %ld on disc %d for sector %llu\n", 1496 PRINTK("skip op %ld on disc %d for sector %llu\n",
1461 bi->bi_rw, i, (unsigned long long)sh->sector); 1497 bi->bi_rw, i, (unsigned long long)sh->sector);
1462 clear_bit(R5_LOCKED, &sh->dev[i].flags); 1498 clear_bit(R5_LOCKED, &sh->dev[i].flags);
@@ -1481,6 +1517,20 @@ static inline void raid6_activate_delayed(raid6_conf_t *conf)
1481 } 1517 }
1482} 1518}
1483 1519
1520static inline void activate_bit_delay(raid6_conf_t *conf)
1521{
1522 /* device_lock is held */
1523 struct list_head head;
1524 list_add(&head, &conf->bitmap_list);
1525 list_del_init(&conf->bitmap_list);
1526 while (!list_empty(&head)) {
1527 struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru);
1528 list_del_init(&sh->lru);
1529 atomic_inc(&sh->count);
1530 __release_stripe(conf, sh);
1531 }
1532}
1533
1484static void unplug_slaves(mddev_t *mddev) 1534static void unplug_slaves(mddev_t *mddev)
1485{ 1535{
1486 raid6_conf_t *conf = mddev_to_conf(mddev); 1536 raid6_conf_t *conf = mddev_to_conf(mddev);
@@ -1513,8 +1563,10 @@ static void raid6_unplug_device(request_queue_t *q)
1513 1563
1514 spin_lock_irqsave(&conf->device_lock, flags); 1564 spin_lock_irqsave(&conf->device_lock, flags);
1515 1565
1516 if (blk_remove_plug(q)) 1566 if (blk_remove_plug(q)) {
1567 conf->seq_flush++;
1517 raid6_activate_delayed(conf); 1568 raid6_activate_delayed(conf);
1569 }
1518 md_wakeup_thread(mddev->thread); 1570 md_wakeup_thread(mddev->thread);
1519 1571
1520 spin_unlock_irqrestore(&conf->device_lock, flags); 1572 spin_unlock_irqrestore(&conf->device_lock, flags);
@@ -1570,6 +1622,11 @@ static int make_request (request_queue_t *q, struct bio * bi)
1570 sector_t logical_sector, last_sector; 1622 sector_t logical_sector, last_sector;
1571 struct stripe_head *sh; 1623 struct stripe_head *sh;
1572 1624
1625 if (unlikely(bio_barrier(bi))) {
1626 bio_endio(bi, bi->bi_size, -EOPNOTSUPP);
1627 return 0;
1628 }
1629
1573 md_write_start(mddev, bi); 1630 md_write_start(mddev, bi);
1574 1631
1575 if (bio_data_dir(bi)==WRITE) { 1632 if (bio_data_dir(bi)==WRITE) {
@@ -1647,10 +1704,20 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1647 sector_t first_sector; 1704 sector_t first_sector;
1648 int raid_disks = conf->raid_disks; 1705 int raid_disks = conf->raid_disks;
1649 int data_disks = raid_disks - 2; 1706 int data_disks = raid_disks - 2;
1707 sector_t max_sector = mddev->size << 1;
1708 int sync_blocks;
1650 1709
1651 if (sector_nr >= mddev->size <<1) { 1710 if (sector_nr >= max_sector) {
1652 /* just being told to finish up .. nothing much to do */ 1711 /* just being told to finish up .. nothing much to do */
1653 unplug_slaves(mddev); 1712 unplug_slaves(mddev);
1713
1714 if (mddev->curr_resync < max_sector) /* aborted */
1715 bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
1716 &sync_blocks, 1);
1717 else /* completed sync */
1718 conf->fullsync = 0;
1719 bitmap_close_sync(mddev->bitmap);
1720
1654 return 0; 1721 return 0;
1655 } 1722 }
1656 /* if there are 2 or more failed drives and we are trying 1723 /* if there are 2 or more failed drives and we are trying
@@ -1662,6 +1729,13 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1662 *skipped = 1; 1729 *skipped = 1;
1663 return rv; 1730 return rv;
1664 } 1731 }
1732 if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
1733 !conf->fullsync && sync_blocks >= STRIPE_SECTORS) {
1734 /* we can skip this block, and probably more */
1735 sync_blocks /= STRIPE_SECTORS;
1736 *skipped = 1;
1737 return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */
1738 }
1665 1739
1666 x = sector_nr; 1740 x = sector_nr;
1667 chunk_offset = sector_div(x, sectors_per_chunk); 1741 chunk_offset = sector_div(x, sectors_per_chunk);
@@ -1679,6 +1753,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1679 set_current_state(TASK_UNINTERRUPTIBLE); 1753 set_current_state(TASK_UNINTERRUPTIBLE);
1680 schedule_timeout(1); 1754 schedule_timeout(1);
1681 } 1755 }
1756 bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 0);
1682 spin_lock(&sh->lock); 1757 spin_lock(&sh->lock);
1683 set_bit(STRIPE_SYNCING, &sh->state); 1758 set_bit(STRIPE_SYNCING, &sh->state);
1684 clear_bit(STRIPE_INSYNC, &sh->state); 1759 clear_bit(STRIPE_INSYNC, &sh->state);
@@ -1712,6 +1787,13 @@ static void raid6d (mddev_t *mddev)
1712 while (1) { 1787 while (1) {
1713 struct list_head *first; 1788 struct list_head *first;
1714 1789
1790 if (conf->seq_flush - conf->seq_write > 0) {
1791 int seq = conf->seq_flush;
1792 bitmap_unplug(mddev->bitmap);
1793 conf->seq_write = seq;
1794 activate_bit_delay(conf);
1795 }
1796
1715 if (list_empty(&conf->handle_list) && 1797 if (list_empty(&conf->handle_list) &&
1716 atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD && 1798 atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD &&
1717 !blk_queue_plugged(mddev->queue) && 1799 !blk_queue_plugged(mddev->queue) &&
@@ -1745,7 +1827,7 @@ static void raid6d (mddev_t *mddev)
1745 PRINTK("--- raid6d inactive\n"); 1827 PRINTK("--- raid6d inactive\n");
1746} 1828}
1747 1829
1748static int run (mddev_t *mddev) 1830static int run(mddev_t *mddev)
1749{ 1831{
1750 raid6_conf_t *conf; 1832 raid6_conf_t *conf;
1751 int raid_disk, memory; 1833 int raid_disk, memory;
@@ -1775,6 +1857,7 @@ static int run (mddev_t *mddev)
1775 init_waitqueue_head(&conf->wait_for_overlap); 1857 init_waitqueue_head(&conf->wait_for_overlap);
1776 INIT_LIST_HEAD(&conf->handle_list); 1858 INIT_LIST_HEAD(&conf->handle_list);
1777 INIT_LIST_HEAD(&conf->delayed_list); 1859 INIT_LIST_HEAD(&conf->delayed_list);
1860 INIT_LIST_HEAD(&conf->bitmap_list);
1778 INIT_LIST_HEAD(&conf->inactive_list); 1861 INIT_LIST_HEAD(&conf->inactive_list);
1779 atomic_set(&conf->active_stripes, 0); 1862 atomic_set(&conf->active_stripes, 0);
1780 atomic_set(&conf->preread_active_stripes, 0); 1863 atomic_set(&conf->preread_active_stripes, 0);
@@ -1894,6 +1977,9 @@ static int run (mddev_t *mddev)
1894 /* Ok, everything is just fine now */ 1977 /* Ok, everything is just fine now */
1895 mddev->array_size = mddev->size * (mddev->raid_disks - 2); 1978 mddev->array_size = mddev->size * (mddev->raid_disks - 2);
1896 1979
1980 if (mddev->bitmap)
1981 mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ;
1982
1897 mddev->queue->unplug_fn = raid6_unplug_device; 1983 mddev->queue->unplug_fn = raid6_unplug_device;
1898 mddev->queue->issue_flush_fn = raid6_issue_flush; 1984 mddev->queue->issue_flush_fn = raid6_issue_flush;
1899 return 0; 1985 return 0;
@@ -2071,6 +2157,8 @@ static int raid6_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
2071 rdev->in_sync = 0; 2157 rdev->in_sync = 0;
2072 rdev->raid_disk = disk; 2158 rdev->raid_disk = disk;
2073 found = 1; 2159 found = 1;
2160 if (rdev->saved_raid_disk != disk)
2161 conf->fullsync = 1;
2074 p->rdev = rdev; 2162 p->rdev = rdev;
2075 break; 2163 break;
2076 } 2164 }
@@ -2100,6 +2188,35 @@ static int raid6_resize(mddev_t *mddev, sector_t sectors)
2100 return 0; 2188 return 0;
2101} 2189}
2102 2190
2191static void raid6_quiesce(mddev_t *mddev, int state)
2192{
2193 raid6_conf_t *conf = mddev_to_conf(mddev);
2194
2195 switch(state) {
2196 case 1: /* stop all writes */
2197 spin_lock_irq(&conf->device_lock);
2198 conf->quiesce = 1;
2199 wait_event_lock_irq(conf->wait_for_stripe,
2200 atomic_read(&conf->active_stripes) == 0,
2201 conf->device_lock, /* nothing */);
2202 spin_unlock_irq(&conf->device_lock);
2203 break;
2204
2205 case 0: /* re-enable writes */
2206 spin_lock_irq(&conf->device_lock);
2207 conf->quiesce = 0;
2208 wake_up(&conf->wait_for_stripe);
2209 spin_unlock_irq(&conf->device_lock);
2210 break;
2211 }
2212 if (mddev->thread) {
2213 if (mddev->bitmap)
2214 mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ;
2215 else
2216 mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT;
2217 md_wakeup_thread(mddev->thread);
2218 }
2219}
2103static mdk_personality_t raid6_personality= 2220static mdk_personality_t raid6_personality=
2104{ 2221{
2105 .name = "raid6", 2222 .name = "raid6",
@@ -2114,6 +2231,7 @@ static mdk_personality_t raid6_personality=
2114 .spare_active = raid6_spare_active, 2231 .spare_active = raid6_spare_active,
2115 .sync_request = sync_request, 2232 .sync_request = sync_request,
2116 .resize = raid6_resize, 2233 .resize = raid6_resize,
2234 .quiesce = raid6_quiesce,
2117}; 2235};
2118 2236
2119static int __init raid6_init (void) 2237static int __init raid6_init (void)