author		Linus Torvalds <torvalds@linux-foundation.org>	2012-03-22 15:29:50 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2012-03-22 15:29:50 -0400
commit		267d7b23dd62f6ec55e0fba777e456495c308fc7 (patch)
tree		5c9fe0f07d5b87029b9c07eb003596c05d161a8f /drivers/md
parent		28f23d1f3b6a6078312b6e9585e583cc7326fe22 (diff)
parent		ecb178bb2b154a40cfae9fa4c42e62ccfa81ac6b (diff)
Merge tag 'md-3.4' of git://neil.brown.name/md

Pull md updates for 3.4 from Neil Brown:
 "Mostly tidying up code in preparation for some bigger changes next
  time. A few bug fixes tagged for -stable.

  Main functionality change is that some RAID10 arrays can now grow to
  use extra space that may have been made available on the individual
  devices."

Fixed up trivial conflicts with the k[un]map_atomic() cleanups in
drivers/md/bitmap.c.

* tag 'md-3.4' of git://neil.brown.name/md: (22 commits)
  md: Add judgement bb->unacked_exist in function md_ack_all_badblocks().
  md: fix clearing of the 'changed' flags for the bad blocks list.
  md/bitmap: discard CHUNK_BLOCK_SHIFT macro
  md/bitmap: remove unnecessary indirection when allocating.
  md/bitmap: remove some pointless locking.
  md/bitmap: change a 'goto' to a normal 'if' construct.
  md/bitmap: move printing of bitmap status to bitmap.c
  md/bitmap: remove some unused noise from bitmap.h
  md/raid10 - support resizing some RAID10 arrays.
  md/raid1: handle merge_bvec_fn in member devices.
  md/raid10: handle merge_bvec_fn in member devices.
  md: add proper merge_bvec handling to RAID0 and Linear.
  md: tidy up rdev_for_each usage.
  md/raid1,raid10: avoid deadlock during resync/recovery.
  md/bitmap: ensure to load bitmap when creating via sysfs.
  md: don't set md arrays to readonly on shutdown.
  md: allow re-add to failed arrays.
  md/raid5: use atomic_dec_return() instead of atomic_dec() and atomic_read().
  md: Use existed macros instead of numbers
  md/raid5: removed unused 'added_devices' variable.
  ...
Diffstat (limited to 'drivers/md')
-rw-r--r--	drivers/md/bitmap.c	152
-rw-r--r--	drivers/md/bitmap.h	22
-rw-r--r--	drivers/md/dm-raid.c	16
-rw-r--r--	drivers/md/faulty.c	2
-rw-r--r--	drivers/md/linear.c	32
-rw-r--r--	drivers/md/md.c	140
-rw-r--r--	drivers/md/md.h	13
-rw-r--r--	drivers/md/multipath.c	2
-rw-r--r--	drivers/md/raid0.c	164
-rw-r--r--	drivers/md/raid0.h	11
-rw-r--r--	drivers/md/raid1.c	98
-rw-r--r--	drivers/md/raid10.c	187
-rw-r--r--	drivers/md/raid5.c	25
13 files changed, 491 insertions(+), 373 deletions(-)
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 045e086144ad..3d0dfa7a89a2 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -26,6 +26,7 @@
 #include <linux/file.h>
 #include <linux/mount.h>
 #include <linux/buffer_head.h>
+#include <linux/seq_file.h>
 #include "md.h"
 #include "bitmap.h"
 
@@ -35,31 +36,6 @@ static inline char *bmname(struct bitmap *bitmap)
 }
 
 /*
- * just a placeholder - calls kmalloc for bitmap pages
- */
-static unsigned char *bitmap_alloc_page(struct bitmap *bitmap)
-{
-	unsigned char *page;
-
-	page = kzalloc(PAGE_SIZE, GFP_NOIO);
-	if (!page)
-		printk("%s: bitmap_alloc_page FAILED\n", bmname(bitmap));
-	else
-		pr_debug("%s: bitmap_alloc_page: allocated page at %p\n",
-			 bmname(bitmap), page);
-	return page;
-}
-
-/*
- * for now just a placeholder -- just calls kfree for bitmap pages
- */
-static void bitmap_free_page(struct bitmap *bitmap, unsigned char *page)
-{
-	pr_debug("%s: bitmap_free_page: free page %p\n", bmname(bitmap), page);
-	kfree(page);
-}
-
-/*
  * check a page and, if necessary, allocate it (or hijack it if the alloc fails)
  *
  * 1) check to see if this page is allocated, if it's not then try to alloc
@@ -96,7 +72,7 @@ __acquires(bitmap->lock)
 	/* this page has not been allocated yet */
 
 	spin_unlock_irq(&bitmap->lock);
-	mappage = bitmap_alloc_page(bitmap);
+	mappage = kzalloc(PAGE_SIZE, GFP_NOIO);
 	spin_lock_irq(&bitmap->lock);
 
 	if (mappage == NULL) {
@@ -109,7 +85,7 @@ __acquires(bitmap->lock)
 	} else if (bitmap->bp[page].map ||
 		   bitmap->bp[page].hijacked) {
 		/* somebody beat us to getting the page */
-		bitmap_free_page(bitmap, mappage);
+		kfree(mappage);
 		return 0;
 	} else {
 
@@ -141,7 +117,7 @@ static void bitmap_checkfree(struct bitmap *bitmap, unsigned long page)
 		ptr = bitmap->bp[page].map;
 		bitmap->bp[page].map = NULL;
 		bitmap->missing_pages++;
-		bitmap_free_page(bitmap, ptr);
+		kfree(ptr);
 	}
 }
 
@@ -171,7 +147,7 @@ static struct page *read_sb_page(struct mddev *mddev, loff_t offset,
 		did_alloc = 1;
 	}
 
-	list_for_each_entry(rdev, &mddev->disks, same_set) {
+	rdev_for_each(rdev, mddev) {
 		if (! test_bit(In_sync, &rdev->flags)
 		    || test_bit(Faulty, &rdev->flags))
 			continue;
@@ -445,18 +421,13 @@ out:
 void bitmap_update_sb(struct bitmap *bitmap)
 {
 	bitmap_super_t *sb;
-	unsigned long flags;
 
 	if (!bitmap || !bitmap->mddev) /* no bitmap for this array */
 		return;
 	if (bitmap->mddev->bitmap_info.external)
 		return;
-	spin_lock_irqsave(&bitmap->lock, flags);
-	if (!bitmap->sb_page) { /* no superblock */
-		spin_unlock_irqrestore(&bitmap->lock, flags);
+	if (!bitmap->sb_page) /* no superblock */
 		return;
-	}
-	spin_unlock_irqrestore(&bitmap->lock, flags);
 	sb = kmap_atomic(bitmap->sb_page);
 	sb->events = cpu_to_le64(bitmap->mddev->events);
 	if (bitmap->mddev->events < bitmap->events_cleared)
@@ -632,26 +603,28 @@ static int bitmap_read_sb(struct bitmap *bitmap)
 	/* keep the array size field of the bitmap superblock up to date */
 	sb->sync_size = cpu_to_le64(bitmap->mddev->resync_max_sectors);
 
-	if (!bitmap->mddev->persistent)
-		goto success;
-
-	/*
-	 * if we have a persistent array superblock, compare the
-	 * bitmap's UUID and event counter to the mddev's
-	 */
-	if (memcmp(sb->uuid, bitmap->mddev->uuid, 16)) {
-		printk(KERN_INFO "%s: bitmap superblock UUID mismatch\n",
-		       bmname(bitmap));
-		goto out;
-	}
-	events = le64_to_cpu(sb->events);
-	if (events < bitmap->mddev->events) {
-		printk(KERN_INFO "%s: bitmap file is out of date (%llu < %llu) "
-		       "-- forcing full recovery\n", bmname(bitmap), events,
-		       (unsigned long long) bitmap->mddev->events);
-		sb->state |= cpu_to_le32(BITMAP_STALE);
+	if (bitmap->mddev->persistent) {
+		/*
+		 * We have a persistent array superblock, so compare the
+		 * bitmap's UUID and event counter to the mddev's
+		 */
+		if (memcmp(sb->uuid, bitmap->mddev->uuid, 16)) {
+			printk(KERN_INFO
+			       "%s: bitmap superblock UUID mismatch\n",
+			       bmname(bitmap));
+			goto out;
+		}
+		events = le64_to_cpu(sb->events);
+		if (events < bitmap->mddev->events) {
+			printk(KERN_INFO
+			       "%s: bitmap file is out of date (%llu < %llu) "
+			       "-- forcing full recovery\n",
			       bmname(bitmap), events,
+			       (unsigned long long) bitmap->mddev->events);
+			sb->state |= cpu_to_le32(BITMAP_STALE);
+		}
 	}
-success:
+
 	/* assign fields using values from superblock */
 	bitmap->mddev->bitmap_info.chunksize = chunksize;
 	bitmap->mddev->bitmap_info.daemon_sleep = daemon_sleep;
@@ -680,15 +653,10 @@ static int bitmap_mask_state(struct bitmap *bitmap, enum bitmap_state bits,
 			     enum bitmap_mask_op op)
 {
 	bitmap_super_t *sb;
-	unsigned long flags;
 	int old;
 
-	spin_lock_irqsave(&bitmap->lock, flags);
-	if (!bitmap->sb_page) { /* can't set the state */
-		spin_unlock_irqrestore(&bitmap->lock, flags);
+	if (!bitmap->sb_page) /* can't set the state */
 		return 0;
-	}
-	spin_unlock_irqrestore(&bitmap->lock, flags);
 	sb = kmap_atomic(bitmap->sb_page);
 	old = le32_to_cpu(sb->state) & bits;
 	switch (op) {
@@ -870,7 +838,7 @@ static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block)
 	unsigned long bit;
 	struct page *page;
 	void *kaddr;
-	unsigned long chunk = block >> CHUNK_BLOCK_SHIFT(bitmap);
+	unsigned long chunk = block >> bitmap->chunkshift;
 
 	if (!bitmap->filemap)
 		return;
@@ -1069,10 +1037,10 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
 			kunmap_atomic(paddr);
 			if (b) {
 				/* if the disk bit is set, set the memory bit */
-				int needed = ((sector_t)(i+1) << (CHUNK_BLOCK_SHIFT(bitmap))
+				int needed = ((sector_t)(i+1) << bitmap->chunkshift
 					      >= start);
 				bitmap_set_memory_bits(bitmap,
-						       (sector_t)i << CHUNK_BLOCK_SHIFT(bitmap),
+						       (sector_t)i << bitmap->chunkshift,
 						       needed);
 				bit_cnt++;
 			}
@@ -1116,7 +1084,7 @@ void bitmap_write_all(struct bitmap *bitmap)
 
 static void bitmap_count_page(struct bitmap *bitmap, sector_t offset, int inc)
 {
-	sector_t chunk = offset >> CHUNK_BLOCK_SHIFT(bitmap);
+	sector_t chunk = offset >> bitmap->chunkshift;
 	unsigned long page = chunk >> PAGE_COUNTER_SHIFT;
 	bitmap->bp[page].count += inc;
 	bitmap_checkfree(bitmap, page);
@@ -1222,7 +1190,7 @@ void bitmap_daemon_work(struct mddev *mddev)
 			bitmap->allclean = 0;
 		}
 		bmc = bitmap_get_counter(bitmap,
-					 (sector_t)j << CHUNK_BLOCK_SHIFT(bitmap),
+					 (sector_t)j << bitmap->chunkshift,
 					 &blocks, 0);
 		if (!bmc)
 			j |= PAGE_COUNTER_MASK;
@@ -1231,7 +1199,7 @@ void bitmap_daemon_work(struct mddev *mddev)
 			/* we can clear the bit */
 			*bmc = 0;
 			bitmap_count_page(bitmap,
-					  (sector_t)j << CHUNK_BLOCK_SHIFT(bitmap),
+					  (sector_t)j << bitmap->chunkshift,
 					  -1);
 
 			/* clear the bit */
@@ -1285,7 +1253,7 @@ __acquires(bitmap->lock)
 	 * The lock must have been taken with interrupts enabled.
 	 * If !create, we don't release the lock.
 	 */
-	sector_t chunk = offset >> CHUNK_BLOCK_SHIFT(bitmap);
+	sector_t chunk = offset >> bitmap->chunkshift;
 	unsigned long page = chunk >> PAGE_COUNTER_SHIFT;
 	unsigned long pageoff = (chunk & PAGE_COUNTER_MASK) << COUNTER_BYTE_SHIFT;
 	sector_t csize;
@@ -1295,10 +1263,10 @@ __acquires(bitmap->lock)
 
 	if (bitmap->bp[page].hijacked ||
 	    bitmap->bp[page].map == NULL)
-		csize = ((sector_t)1) << (CHUNK_BLOCK_SHIFT(bitmap) +
+		csize = ((sector_t)1) << (bitmap->chunkshift +
 					  PAGE_COUNTER_SHIFT - 1);
 	else
-		csize = ((sector_t)1) << (CHUNK_BLOCK_SHIFT(bitmap));
+		csize = ((sector_t)1) << bitmap->chunkshift;
 	*blocks = csize - (offset & (csize - 1));
 
 	if (err < 0)
@@ -1424,7 +1392,7 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto
 			set_page_attr(bitmap,
 				      filemap_get_page(
 					      bitmap,
-					      offset >> CHUNK_BLOCK_SHIFT(bitmap)),
+					      offset >> bitmap->chunkshift),
 				      BITMAP_PAGE_PENDING);
 			bitmap->allclean = 0;
 		}
@@ -1512,7 +1480,7 @@ void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, i
 	else {
 		if (*bmc <= 2) {
 			set_page_attr(bitmap,
-				      filemap_get_page(bitmap, offset >> CHUNK_BLOCK_SHIFT(bitmap)),
+				      filemap_get_page(bitmap, offset >> bitmap->chunkshift),
 				      BITMAP_PAGE_PENDING);
 			bitmap->allclean = 0;
 		}
@@ -1559,7 +1527,7 @@ void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector)
 
 	bitmap->mddev->curr_resync_completed = sector;
 	set_bit(MD_CHANGE_CLEAN, &bitmap->mddev->flags);
-	sector &= ~((1ULL << CHUNK_BLOCK_SHIFT(bitmap)) - 1);
+	sector &= ~((1ULL << bitmap->chunkshift) - 1);
 	s = 0;
 	while (s < sector && s < bitmap->mddev->resync_max_sectors) {
 		bitmap_end_sync(bitmap, s, &blocks, 0);
@@ -1589,7 +1557,7 @@ static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int n
 		struct page *page;
 		*bmc = 2 | (needed ? NEEDED_MASK : 0);
 		bitmap_count_page(bitmap, offset, 1);
-		page = filemap_get_page(bitmap, offset >> CHUNK_BLOCK_SHIFT(bitmap));
+		page = filemap_get_page(bitmap, offset >> bitmap->chunkshift);
 		set_page_attr(bitmap, page, BITMAP_PAGE_PENDING);
 		bitmap->allclean = 0;
 	}
@@ -1602,7 +1570,7 @@ void bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long e)
 	unsigned long chunk;
 
 	for (chunk = s; chunk <= e; chunk++) {
-		sector_t sec = (sector_t)chunk << CHUNK_BLOCK_SHIFT(bitmap);
+		sector_t sec = (sector_t)chunk << bitmap->chunkshift;
 		bitmap_set_memory_bits(bitmap, sec, 1);
 		spin_lock_irq(&bitmap->lock);
 		bitmap_file_set_bit(bitmap, sec);
@@ -1759,11 +1727,12 @@ int bitmap_create(struct mddev *mddev)
 		goto error;
 
 	bitmap->daemon_lastrun = jiffies;
-	bitmap->chunkshift = ffz(~mddev->bitmap_info.chunksize);
+	bitmap->chunkshift = (ffz(~mddev->bitmap_info.chunksize)
+			      - BITMAP_BLOCK_SHIFT);
 
 	/* now that chunksize and chunkshift are set, we can use these macros */
-	chunks = (blocks + CHUNK_BLOCK_RATIO(bitmap) - 1) >>
-			CHUNK_BLOCK_SHIFT(bitmap);
+	chunks = (blocks + bitmap->chunkshift - 1) >>
+			bitmap->chunkshift;
 	pages = (chunks + PAGE_COUNTER_RATIO - 1) / PAGE_COUNTER_RATIO;
 
 	BUG_ON(!pages);
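
A note on the arithmetic the CHUNK_BLOCK_SHIFT removal changes: bitmap->chunkshift used to be log2 of the chunk size in bytes, and every user subtracted BITMAP_BLOCK_SHIFT to get a shift in 512-byte blocks. After this pull the subtraction happens once, at creation, so call sites shift by bitmap->chunkshift directly. A minimal userspace sketch of the relationship (values invented; __builtin_ctzl stands in for the kernel's ffz(~x) idiom):

#include <stdio.h>

#define BITMAP_BLOCK_SHIFT 9			/* 512-byte bitmap blocks */

int main(void)
{
	unsigned long chunksize = 65536;	/* 64KB chunk, a power of two */
	/* ffz(~chunksize) == index of the lowest set bit == log2(chunksize) */
	unsigned long chunkshift = __builtin_ctzl(chunksize) - BITMAP_BLOCK_SHIFT;

	/* chunksize = 2^(chunkshift + 9): each bit covers 128 blocks here */
	printf("chunkshift=%lu blocks/chunk=%lu\n", chunkshift, 1UL << chunkshift);
	return 0;
}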
@@ -1836,6 +1805,33 @@ out:
 }
 EXPORT_SYMBOL_GPL(bitmap_load);
 
+void bitmap_status(struct seq_file *seq, struct bitmap *bitmap)
+{
+	unsigned long chunk_kb;
+	unsigned long flags;
+
+	if (!bitmap)
+		return;
+
+	spin_lock_irqsave(&bitmap->lock, flags);
+	chunk_kb = bitmap->mddev->bitmap_info.chunksize >> 10;
+	seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], "
+		   "%lu%s chunk",
+		   bitmap->pages - bitmap->missing_pages,
+		   bitmap->pages,
+		   (bitmap->pages - bitmap->missing_pages)
+		   << (PAGE_SHIFT - 10),
+		   chunk_kb ? chunk_kb : bitmap->mddev->bitmap_info.chunksize,
+		   chunk_kb ? "KB" : "B");
+	if (bitmap->file) {
+		seq_printf(seq, ", file: ");
+		seq_path(seq, &bitmap->file->f_path, " \t\n");
+	}
+
+	seq_printf(seq, "\n");
+	spin_unlock_irqrestore(&bitmap->lock, flags);
+}
+
 static ssize_t
 location_show(struct mddev *mddev, char *page)
 {
@@ -1904,6 +1900,8 @@ location_store(struct mddev *mddev, const char *buf, size_t len)
 		if (mddev->pers) {
 			mddev->pers->quiesce(mddev, 1);
 			rv = bitmap_create(mddev);
+			if (!rv)
+				rv = bitmap_load(mddev);
 			if (rv) {
 				bitmap_destroy(mddev);
 				mddev->bitmap_info.offset = 0;
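
The two added lines implement "md/bitmap: ensure to load bitmap when creating via sysfs": bitmap_create() only builds the in-memory structures, and the stored bits must then be read in with bitmap_load(), with bitmap_destroy() unwinding either failure. A compressed sketch of the pattern (the wrapper name is hypothetical):

/* Hypothetical wrapper showing the create-then-load-or-destroy pattern. */
static int attach_and_load_bitmap(struct mddev *mddev)
{
	int rv = bitmap_create(mddev);		/* allocate in-memory bitmap */

	if (!rv)
		rv = bitmap_load(mddev);	/* pull bits in from file/superblock */
	if (rv)
		bitmap_destroy(mddev);		/* unwind whichever step failed */
	return rv;
}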
diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h
index a15436dd9b3e..55ca5aec84e4 100644
--- a/drivers/md/bitmap.h
+++ b/drivers/md/bitmap.h
@@ -13,8 +13,6 @@
 #define BITMAP_MAJOR_HI 4
 #define BITMAP_MAJOR_HOSTENDIAN 3
 
-#define BITMAP_MINOR 39
-
 /*
  * in-memory bitmap:
  *
@@ -101,21 +99,10 @@ typedef __u16 bitmap_counter_t;
 /* same, except a mask value for more efficient bitops */
 #define PAGE_COUNTER_MASK  (PAGE_COUNTER_RATIO - 1)
 
-#define BITMAP_BLOCK_SIZE 512
 #define BITMAP_BLOCK_SHIFT 9
 
 /* how many blocks per chunk? (this is variable) */
 #define CHUNK_BLOCK_RATIO(bitmap) ((bitmap)->mddev->bitmap_info.chunksize >> BITMAP_BLOCK_SHIFT)
-#define CHUNK_BLOCK_SHIFT(bitmap) ((bitmap)->chunkshift - BITMAP_BLOCK_SHIFT)
-#define CHUNK_BLOCK_MASK(bitmap) (CHUNK_BLOCK_RATIO(bitmap) - 1)
-
-/* when hijacked, the counters and bits represent even larger "chunks" */
-/* there will be 1024 chunks represented by each counter in the page pointers */
-#define PAGEPTR_BLOCK_RATIO(bitmap) \
-			(CHUNK_BLOCK_RATIO(bitmap) << PAGE_COUNTER_SHIFT >> 1)
-#define PAGEPTR_BLOCK_SHIFT(bitmap) \
-			(CHUNK_BLOCK_SHIFT(bitmap) + PAGE_COUNTER_SHIFT - 1)
-#define PAGEPTR_BLOCK_MASK(bitmap) (PAGEPTR_BLOCK_RATIO(bitmap) - 1)
 
 #endif
 
@@ -181,12 +168,6 @@ struct bitmap_page {
 	unsigned int count:31;
 };
 
-/* keep track of bitmap file pages that have pending writes on them */
-struct page_list {
-	struct list_head list;
-	struct page *page;
-};
-
 /* the main bitmap structure - one per mddev */
 struct bitmap {
 	struct bitmap_page *bp;
@@ -196,7 +177,7 @@ struct bitmap {
 	struct mddev *mddev; /* the md device that the bitmap is for */
 
 	/* bitmap chunksize -- how much data does each bit represent? */
-	unsigned long chunkshift; /* chunksize = 2^chunkshift (for bitops) */
+	unsigned long chunkshift; /* chunksize = 2^(chunkshift+9) (for bitops) */
 	unsigned long chunks; /* total number of data chunks for the array */
 
 	__u64 events_cleared;
@@ -245,6 +226,7 @@ void bitmap_destroy(struct mddev *mddev);
 
 void bitmap_print_sb(struct bitmap *bitmap);
 void bitmap_update_sb(struct bitmap *bitmap);
+void bitmap_status(struct seq_file *seq, struct bitmap *bitmap);
 
 int bitmap_setallbits(struct bitmap *bitmap);
 void bitmap_write_all(struct bitmap *bitmap);
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 787022c18187..c5a875d7b882 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -615,14 +615,14 @@ static int read_disk_sb(struct md_rdev *rdev, int size)
 
 static void super_sync(struct mddev *mddev, struct md_rdev *rdev)
 {
-	struct md_rdev *r, *t;
+	struct md_rdev *r;
 	uint64_t failed_devices;
 	struct dm_raid_superblock *sb;
 
 	sb = page_address(rdev->sb_page);
 	failed_devices = le64_to_cpu(sb->failed_devices);
 
-	rdev_for_each(r, t, mddev)
+	rdev_for_each(r, mddev)
 		if ((r->raid_disk >= 0) && test_bit(Faulty, &r->flags))
 			failed_devices |= (1ULL << r->raid_disk);
 
@@ -707,7 +707,7 @@ static int super_init_validation(struct mddev *mddev, struct md_rdev *rdev)
 	struct dm_raid_superblock *sb;
 	uint32_t new_devs = 0;
 	uint32_t rebuilds = 0;
-	struct md_rdev *r, *t;
+	struct md_rdev *r;
 	struct dm_raid_superblock *sb2;
 
 	sb = page_address(rdev->sb_page);
@@ -750,7 +750,7 @@ static int super_init_validation(struct mddev *mddev, struct md_rdev *rdev)
 		 * case the In_sync bit will /not/ be set and
 		 * recovery_cp must be MaxSector.
 		 */
-		rdev_for_each(r, t, mddev) {
+		rdev_for_each(r, mddev) {
 			if (!test_bit(In_sync, &r->flags)) {
 				DMINFO("Device %d specified for rebuild: "
 				       "Clearing superblock", r->raid_disk);
@@ -782,7 +782,7 @@ static int super_init_validation(struct mddev *mddev, struct md_rdev *rdev)
 	 * Now we set the Faulty bit for those devices that are
 	 * recorded in the superblock as failed.
 	 */
-	rdev_for_each(r, t, mddev) {
+	rdev_for_each(r, mddev) {
 		if (!r->sb_page)
 			continue;
 		sb2 = page_address(r->sb_page);
@@ -855,11 +855,11 @@ static int super_validate(struct mddev *mddev, struct md_rdev *rdev)
 static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
 {
 	int ret;
-	struct md_rdev *rdev, *freshest, *tmp;
+	struct md_rdev *rdev, *freshest;
 	struct mddev *mddev = &rs->md;
 
 	freshest = NULL;
-	rdev_for_each(rdev, tmp, mddev) {
+	rdev_for_each(rdev, mddev) {
 		if (!rdev->meta_bdev)
 			continue;
 
@@ -888,7 +888,7 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
 	if (super_validate(mddev, freshest))
 		return -EINVAL;
 
-	rdev_for_each(rdev, tmp, mddev)
+	rdev_for_each(rdev, mddev)
 		if ((rdev != freshest) && super_validate(mddev, rdev))
 			return -EINVAL;
 
diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c
index feb2c3c7bb44..45135f69509c 100644
--- a/drivers/md/faulty.c
+++ b/drivers/md/faulty.c
@@ -315,7 +315,7 @@ static int run(struct mddev *mddev)
 	}
 	conf->nfaults = 0;
 
-	list_for_each_entry(rdev, &mddev->disks, same_set)
+	rdev_for_each(rdev, mddev)
 		conf->rdev = rdev;
 
 	md_set_array_sectors(mddev, faulty_size(mddev, 0, 0));
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 627456542fb3..b0fcc7d02adb 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -68,10 +68,19 @@ static int linear_mergeable_bvec(struct request_queue *q,
 	struct dev_info *dev0;
 	unsigned long maxsectors, bio_sectors = bvm->bi_size >> 9;
 	sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
+	int maxbytes = biovec->bv_len;
+	struct request_queue *subq;
 
 	rcu_read_lock();
 	dev0 = which_dev(mddev, sector);
 	maxsectors = dev0->end_sector - sector;
+	subq = bdev_get_queue(dev0->rdev->bdev);
+	if (subq->merge_bvec_fn) {
+		bvm->bi_bdev = dev0->rdev->bdev;
+		bvm->bi_sector -= dev0->end_sector - dev0->rdev->sectors;
+		maxbytes = min(maxbytes, subq->merge_bvec_fn(subq, bvm,
+							     biovec));
+	}
 	rcu_read_unlock();
 
 	if (maxsectors < bio_sectors)
@@ -80,12 +89,12 @@ static int linear_mergeable_bvec(struct request_queue *q,
 	maxsectors -= bio_sectors;
 
 	if (maxsectors <= (PAGE_SIZE >> 9 ) && bio_sectors == 0)
-		return biovec->bv_len;
-	/* The bytes available at this offset could be really big,
-	 * so we cap at 2^31 to avoid overflow */
-	if (maxsectors > (1 << (31-9)))
-		return 1<<31;
-	return maxsectors << 9;
+		return maxbytes;
+
+	if (maxsectors > (maxbytes >> 9))
+		return maxbytes;
+	else
+		return maxsectors << 9;
 }
 
 static int linear_congested(void *data, int bits)
@@ -138,7 +147,7 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks)
 	cnt = 0;
 	conf->array_sectors = 0;
 
-	list_for_each_entry(rdev, &mddev->disks, same_set) {
+	rdev_for_each(rdev, mddev) {
 		int j = rdev->raid_disk;
 		struct dev_info *disk = conf->disks + j;
 		sector_t sectors;
@@ -158,15 +167,6 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks)
 
 		disk_stack_limits(mddev->gendisk, rdev->bdev,
 				  rdev->data_offset << 9);
-		/* as we don't honour merge_bvec_fn, we must never risk
-		 * violating it, so limit max_segments to 1 lying within
-		 * a single page.
-		 */
-		if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
-			blk_queue_max_segments(mddev->queue, 1);
-			blk_queue_segment_boundary(mddev->queue,
-						   PAGE_CACHE_SIZE - 1);
-		}
 
 		conf->array_sectors += rdev->sectors;
 		cnt++;
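
The pattern shared by this linear change and the raid0/raid1/raid10 commits in this pull: instead of crippling the whole array queue with a one-segment limit whenever a member has a merge_bvec_fn, the stacked driver forwards the merge question to the member after remapping the bio coordinates into the member's view. A generic sketch of that contract (my_map_to_member() is a hypothetical helper standing in for which_dev()/map_sector(); not code from the patch):

static int stacked_mergeable_bvec(struct request_queue *q,
				  struct bvec_merge_data *bvm,
				  struct bio_vec *biovec)
{
	int max = biovec->bv_len;	/* default: the whole vec may be added */
	struct block_device *member = my_map_to_member(q->queuedata, bvm);
	struct request_queue *subq = bdev_get_queue(member);

	if (subq->merge_bvec_fn) {
		bvm->bi_bdev = member;	/* rewrite bvm into the member's view */
		/* bvm->bi_sector must be remapped too, exactly as the patch does */
		max = min(max, subq->merge_bvec_fn(subq, bvm, biovec));
	}
	return max;			/* bytes of biovec the bio may accept */
}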
diff --git a/drivers/md/md.c b/drivers/md/md.c
index ce88755baf4a..b572e1e386ce 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -439,7 +439,7 @@ static void submit_flushes(struct work_struct *ws)
 	INIT_WORK(&mddev->flush_work, md_submit_flush_data);
 	atomic_set(&mddev->flush_pending, 1);
 	rcu_read_lock();
-	list_for_each_entry_rcu(rdev, &mddev->disks, same_set)
+	rdev_for_each_rcu(rdev, mddev)
 		if (rdev->raid_disk >= 0 &&
 		    !test_bit(Faulty, &rdev->flags)) {
 			/* Take two references, one is dropped
@@ -749,7 +749,7 @@ static struct md_rdev * find_rdev_nr(struct mddev *mddev, int nr)
 {
 	struct md_rdev *rdev;
 
-	list_for_each_entry(rdev, &mddev->disks, same_set)
+	rdev_for_each(rdev, mddev)
 		if (rdev->desc_nr == nr)
 			return rdev;
 
@@ -760,7 +760,7 @@ static struct md_rdev * find_rdev(struct mddev * mddev, dev_t dev)
 {
 	struct md_rdev *rdev;
 
-	list_for_each_entry(rdev, &mddev->disks, same_set)
+	rdev_for_each(rdev, mddev)
 		if (rdev->bdev->bd_dev == dev)
 			return rdev;
 
@@ -1342,7 +1342,7 @@ static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev)
 		sb->state |= (1<<MD_SB_BITMAP_PRESENT);
 
 	sb->disks[0].state = (1<<MD_DISK_REMOVED);
-	list_for_each_entry(rdev2, &mddev->disks, same_set) {
+	rdev_for_each(rdev2, mddev) {
 		mdp_disk_t *d;
 		int desc_nr;
 		int is_active = test_bit(In_sync, &rdev2->flags);
@@ -1805,18 +1805,18 @@ retry:
 					| BB_LEN(internal_bb));
 				*bbp++ = cpu_to_le64(store_bb);
 			}
+			bb->changed = 0;
 			if (read_seqretry(&bb->lock, seq))
 				goto retry;
 
 			bb->sector = (rdev->sb_start +
 				      (int)le32_to_cpu(sb->bblog_offset));
 			bb->size = le16_to_cpu(sb->bblog_size);
-			bb->changed = 0;
 		}
 	}
 
 	max_dev = 0;
-	list_for_each_entry(rdev2, &mddev->disks, same_set)
+	rdev_for_each(rdev2, mddev)
 		if (rdev2->desc_nr+1 > max_dev)
 			max_dev = rdev2->desc_nr+1;
 
@@ -1833,7 +1833,7 @@ retry:
 	for (i=0; i<max_dev;i++)
 		sb->dev_roles[i] = cpu_to_le16(0xfffe);
 
-	list_for_each_entry(rdev2, &mddev->disks, same_set) {
+	rdev_for_each(rdev2, mddev) {
 		i = rdev2->desc_nr;
 		if (test_bit(Faulty, &rdev2->flags))
 			sb->dev_roles[i] = cpu_to_le16(0xfffe);
@@ -1948,7 +1948,7 @@ int md_integrity_register(struct mddev *mddev)
 		return 0; /* nothing to do */
 	if (!mddev->gendisk || blk_get_integrity(mddev->gendisk))
 		return 0; /* shouldn't register, or already is */
-	list_for_each_entry(rdev, &mddev->disks, same_set) {
+	rdev_for_each(rdev, mddev) {
 		/* skip spares and non-functional disks */
 		if (test_bit(Faulty, &rdev->flags))
 			continue;
@@ -2175,7 +2175,7 @@ static void export_array(struct mddev *mddev)
 {
 	struct md_rdev *rdev, *tmp;
 
-	rdev_for_each(rdev, tmp, mddev) {
+	rdev_for_each_safe(rdev, tmp, mddev) {
 		if (!rdev->mddev) {
 			MD_BUG();
 			continue;
@@ -2307,11 +2307,11 @@ static void md_print_devices(void)
 			bitmap_print_sb(mddev->bitmap);
 		else
 			printk("%s: ", mdname(mddev));
-		list_for_each_entry(rdev, &mddev->disks, same_set)
+		rdev_for_each(rdev, mddev)
 			printk("<%s>", bdevname(rdev->bdev,b));
 		printk("\n");
 
-		list_for_each_entry(rdev, &mddev->disks, same_set)
+		rdev_for_each(rdev, mddev)
 			print_rdev(rdev, mddev->major_version);
 	}
 	printk("md: **********************************\n");
@@ -2328,7 +2328,7 @@ static void sync_sbs(struct mddev * mddev, int nospares)
 	 * with the rest of the array)
 	 */
 	struct md_rdev *rdev;
-	list_for_each_entry(rdev, &mddev->disks, same_set) {
+	rdev_for_each(rdev, mddev) {
 		if (rdev->sb_events == mddev->events ||
 		    (nospares &&
 		     rdev->raid_disk < 0 &&
@@ -2351,7 +2351,7 @@ static void md_update_sb(struct mddev * mddev, int force_change)
 
 repeat:
 	/* First make sure individual recovery_offsets are correct */
-	list_for_each_entry(rdev, &mddev->disks, same_set) {
+	rdev_for_each(rdev, mddev) {
 		if (rdev->raid_disk >= 0 &&
 		    mddev->delta_disks >= 0 &&
 		    !test_bit(In_sync, &rdev->flags) &&
@@ -2364,8 +2364,9 @@ repeat:
 		clear_bit(MD_CHANGE_DEVS, &mddev->flags);
 		if (!mddev->external) {
 			clear_bit(MD_CHANGE_PENDING, &mddev->flags);
			rdev_for_each(rdev, mddev) {
 				if (rdev->badblocks.changed) {
+					rdev->badblocks.changed = 0;
 					md_ack_all_badblocks(&rdev->badblocks);
 					md_error(mddev, rdev);
 				}
@@ -2430,7 +2431,7 @@ repeat:
 		mddev->events --;
 	}
 
-	list_for_each_entry(rdev, &mddev->disks, same_set) {
+	rdev_for_each(rdev, mddev) {
 		if (rdev->badblocks.changed)
 			any_badblocks_changed++;
 		if (test_bit(Faulty, &rdev->flags))
@@ -2444,7 +2445,7 @@ repeat:
 		 mdname(mddev), mddev->in_sync);
 
 	bitmap_update_sb(mddev->bitmap);
-	list_for_each_entry(rdev, &mddev->disks, same_set) {
+	rdev_for_each(rdev, mddev) {
 		char b[BDEVNAME_SIZE];
 
 		if (rdev->sb_loaded != 1)
@@ -2493,7 +2494,7 @@ repeat:
 	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
 		sysfs_notify(&mddev->kobj, NULL, "sync_completed");
 
-	list_for_each_entry(rdev, &mddev->disks, same_set) {
+	rdev_for_each(rdev, mddev) {
 		if (test_and_clear_bit(FaultRecorded, &rdev->flags))
 			clear_bit(Blocked, &rdev->flags);
 
@@ -2896,7 +2897,7 @@ rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len)
 			struct md_rdev *rdev2;
 
 			mddev_lock(mddev);
-			list_for_each_entry(rdev2, &mddev->disks, same_set)
+			rdev_for_each(rdev2, mddev)
 				if (rdev->bdev == rdev2->bdev &&
 				    rdev != rdev2 &&
 				    overlaps(rdev->data_offset, rdev->sectors,
@@ -3193,7 +3194,7 @@ static void analyze_sbs(struct mddev * mddev)
 	char b[BDEVNAME_SIZE];
 
 	freshest = NULL;
-	rdev_for_each(rdev, tmp, mddev)
+	rdev_for_each_safe(rdev, tmp, mddev)
 		switch (super_types[mddev->major_version].
 			load_super(rdev, freshest, mddev->minor_version)) {
 		case 1:
@@ -3214,7 +3215,7 @@ static void analyze_sbs(struct mddev * mddev)
 	validate_super(mddev, freshest);
 
 	i = 0;
-	rdev_for_each(rdev, tmp, mddev) {
+	rdev_for_each_safe(rdev, tmp, mddev) {
 		if (mddev->max_disks &&
 		    (rdev->desc_nr >= mddev->max_disks ||
 		     i > mddev->max_disks)) {
@@ -3403,7 +3404,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
 		return -EINVAL;
 	}
 
-	list_for_each_entry(rdev, &mddev->disks, same_set)
+	rdev_for_each(rdev, mddev)
 		rdev->new_raid_disk = rdev->raid_disk;
 
 	/* ->takeover must set new_* and/or delta_disks
@@ -3456,7 +3457,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
 		mddev->safemode = 0;
 	}
 
-	list_for_each_entry(rdev, &mddev->disks, same_set) {
+	rdev_for_each(rdev, mddev) {
 		if (rdev->raid_disk < 0)
 			continue;
 		if (rdev->new_raid_disk >= mddev->raid_disks)
@@ -3465,7 +3466,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
 			continue;
 		sysfs_unlink_rdev(mddev, rdev);
 	}
-	list_for_each_entry(rdev, &mddev->disks, same_set) {
+	rdev_for_each(rdev, mddev) {
 		if (rdev->raid_disk < 0)
 			continue;
 		if (rdev->new_raid_disk == rdev->raid_disk)
@@ -4796,7 +4797,7 @@ int md_run(struct mddev *mddev)
 	 * the only valid external interface is through the md
 	 * device.
 	 */
-	list_for_each_entry(rdev, &mddev->disks, same_set) {
+	rdev_for_each(rdev, mddev) {
 		if (test_bit(Faulty, &rdev->flags))
 			continue;
 		sync_blockdev(rdev->bdev);
@@ -4867,8 +4868,8 @@ int md_run(struct mddev *mddev)
 		struct md_rdev *rdev2;
 		int warned = 0;
 
-		list_for_each_entry(rdev, &mddev->disks, same_set)
-			list_for_each_entry(rdev2, &mddev->disks, same_set) {
+		rdev_for_each(rdev, mddev)
+			rdev_for_each(rdev2, mddev) {
 				if (rdev < rdev2 &&
 				    rdev->bdev->bd_contains ==
 				    rdev2->bdev->bd_contains) {
@@ -4945,7 +4946,7 @@ int md_run(struct mddev *mddev)
 	mddev->in_sync = 1;
 	smp_wmb();
 	mddev->ready = 1;
-	list_for_each_entry(rdev, &mddev->disks, same_set)
+	rdev_for_each(rdev, mddev)
 		if (rdev->raid_disk >= 0)
 			if (sysfs_link_rdev(mddev, rdev))
 				/* failure here is OK */;
@@ -5073,6 +5074,7 @@ static void md_clean(struct mddev *mddev)
 	mddev->changed = 0;
 	mddev->degraded = 0;
 	mddev->safemode = 0;
+	mddev->merge_check_needed = 0;
 	mddev->bitmap_info.offset = 0;
 	mddev->bitmap_info.default_offset = 0;
 	mddev->bitmap_info.chunksize = 0;
@@ -5175,7 +5177,7 @@ static int do_md_stop(struct mddev * mddev, int mode, int is_open)
 		/* tell userspace to handle 'inactive' */
 		sysfs_notify_dirent_safe(mddev->sysfs_state);
 
-		list_for_each_entry(rdev, &mddev->disks, same_set)
+		rdev_for_each(rdev, mddev)
 			if (rdev->raid_disk >= 0)
 				sysfs_unlink_rdev(mddev, rdev);
 
@@ -5226,7 +5228,7 @@ static void autorun_array(struct mddev *mddev)
 
 	printk(KERN_INFO "md: running: ");
 
-	list_for_each_entry(rdev, &mddev->disks, same_set) {
+	rdev_for_each(rdev, mddev) {
 		char b[BDEVNAME_SIZE];
 		printk("<%s>", bdevname(rdev->bdev,b));
 	}
@@ -5356,7 +5358,7 @@ static int get_array_info(struct mddev * mddev, void __user * arg)
 	struct md_rdev *rdev;
 
 	nr=working=insync=failed=spare=0;
-	list_for_each_entry(rdev, &mddev->disks, same_set) {
+	rdev_for_each(rdev, mddev) {
 		nr++;
 		if (test_bit(Faulty, &rdev->flags))
 			failed++;
@@ -5923,7 +5925,7 @@ static int update_size(struct mddev *mddev, sector_t num_sectors)
 		 * grow, and re-add.
 		 */
 		return -EBUSY;
-	list_for_each_entry(rdev, &mddev->disks, same_set) {
+	rdev_for_each(rdev, mddev) {
 		sector_t avail = rdev->sectors;
 
 		if (fit && (num_sectors == 0 || num_sectors > avail))
@@ -6724,7 +6726,6 @@ static int md_seq_show(struct seq_file *seq, void *v)
 	struct mddev *mddev = v;
 	sector_t sectors;
 	struct md_rdev *rdev;
-	struct bitmap *bitmap;
 
 	if (v == (void*)1) {
 		struct md_personality *pers;
@@ -6758,7 +6759,7 @@ static int md_seq_show(struct seq_file *seq, void *v)
 		}
 
 		sectors = 0;
-		list_for_each_entry(rdev, &mddev->disks, same_set) {
+		rdev_for_each(rdev, mddev) {
 			char b[BDEVNAME_SIZE];
 			seq_printf(seq, " %s[%d]",
 				   bdevname(rdev->bdev,b), rdev->desc_nr);
@@ -6812,27 +6813,7 @@ static int md_seq_show(struct seq_file *seq, void *v)
 		} else
 			seq_printf(seq, "\n       ");
 
-		if ((bitmap = mddev->bitmap)) {
-			unsigned long chunk_kb;
-			unsigned long flags;
-			spin_lock_irqsave(&bitmap->lock, flags);
-			chunk_kb = mddev->bitmap_info.chunksize >> 10;
-			seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], "
-				   "%lu%s chunk",
-				   bitmap->pages - bitmap->missing_pages,
-				   bitmap->pages,
-				   (bitmap->pages - bitmap->missing_pages)
-				   << (PAGE_SHIFT - 10),
-				   chunk_kb ? chunk_kb : mddev->bitmap_info.chunksize,
-				   chunk_kb ? "KB" : "B");
-			if (bitmap->file) {
-				seq_printf(seq, ", file: ");
-				seq_path(seq, &bitmap->file->f_path, " \t\n");
-			}
-
-			seq_printf(seq, "\n");
-			spin_unlock_irqrestore(&bitmap->lock, flags);
-		}
+		bitmap_status(seq, mddev->bitmap);
 
 		seq_printf(seq, "\n");
 	}
@@ -7170,7 +7151,7 @@ void md_do_sync(struct mddev *mddev)
 		max_sectors = mddev->dev_sectors;
 		j = MaxSector;
 		rcu_read_lock();
-		list_for_each_entry_rcu(rdev, &mddev->disks, same_set)
+		rdev_for_each_rcu(rdev, mddev)
 			if (rdev->raid_disk >= 0 &&
 			    !test_bit(Faulty, &rdev->flags) &&
 			    !test_bit(In_sync, &rdev->flags) &&
@@ -7342,7 +7323,7 @@ void md_do_sync(struct mddev *mddev)
 		if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
 			mddev->curr_resync = MaxSector;
 		rcu_read_lock();
-		list_for_each_entry_rcu(rdev, &mddev->disks, same_set)
+		rdev_for_each_rcu(rdev, mddev)
 			if (rdev->raid_disk >= 0 &&
 			    mddev->delta_disks >= 0 &&
 			    !test_bit(Faulty, &rdev->flags) &&
@@ -7388,7 +7369,7 @@ static int remove_and_add_spares(struct mddev *mddev)
 
 	mddev->curr_resync_completed = 0;
 
-	list_for_each_entry(rdev, &mddev->disks, same_set)
+	rdev_for_each(rdev, mddev)
 		if (rdev->raid_disk >= 0 &&
 		    !test_bit(Blocked, &rdev->flags) &&
 		    (test_bit(Faulty, &rdev->flags) ||
@@ -7406,7 +7387,7 @@ static int remove_and_add_spares(struct mddev *mddev)
 					"degraded");
 
 
-	list_for_each_entry(rdev, &mddev->disks, same_set) {
+	rdev_for_each(rdev, mddev) {
 		if (rdev->raid_disk >= 0 &&
 		    !test_bit(In_sync, &rdev->flags) &&
 		    !test_bit(Faulty, &rdev->flags))
@@ -7451,7 +7432,7 @@ static void reap_sync_thread(struct mddev *mddev)
 	 * do the superblock for an incrementally recovered device
 	 * written out.
 	 */
-	list_for_each_entry(rdev, &mddev->disks, same_set)
+	rdev_for_each(rdev, mddev)
 		if (!mddev->degraded ||
 		    test_bit(In_sync, &rdev->flags))
 			rdev->saved_raid_disk = -1;
@@ -7529,7 +7510,7 @@ void md_check_recovery(struct mddev *mddev)
 			 * failed devices.
 			 */
 			struct md_rdev *rdev;
-			list_for_each_entry(rdev, &mddev->disks, same_set)
+			rdev_for_each(rdev, mddev)
 				if (rdev->raid_disk >= 0 &&
 				    !test_bit(Blocked, &rdev->flags) &&
 				    test_bit(Faulty, &rdev->flags) &&
@@ -8040,7 +8021,7 @@ void md_ack_all_badblocks(struct badblocks *bb)
 		return;
 	write_seqlock_irq(&bb->lock);
 
-	if (bb->changed == 0) {
+	if (bb->changed == 0 && bb->unacked_exist) {
 		u64 *p = bb->page;
 		int i;
 		for (i = 0; i < bb->count ; i++) {
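
Two commits in this pull touch this path: bb->changed is now cleared before the superblock write (see the md_update_sb hunks above, so a concurrent setter is not lost), and md_ack_all_badblocks() only walks the table when unacknowledged entries actually exist. For orientation, a small userspace restatement of the bad-block entry packing; the exact masks are my assumption restated from md.h of this era, not part of the patch:

#include <stdio.h>
#include <stdint.h>
typedef uint64_t u64;

/* Assumed layout: 55-bit start sector, 9-bit (length - 1), top bit = acked. */
#define BB_OFFSET(x)	(((x) & 0x7FFFFFFFFFFFFE00ULL) >> 9)
#define BB_LEN(x)	(((x) & 0x00000000000001FFULL) + 1)
#define BB_ACK(x)	(!!((x) & 0x8000000000000000ULL))
#define BB_MAKE(a, l, ack) (((a) << 9) | ((l) - 1) | ((u64)(!!(ack)) << 63))

int main(void)
{
	u64 e = BB_MAKE(4096, 8, 0);	/* 8 bad sectors at 4096, unacknowledged */

	/* what the loop in md_ack_all_badblocks() does to each entry */
	e = BB_MAKE(BB_OFFSET(e), BB_LEN(e), 1);
	printf("start=%llu len=%llu ack=%d\n",
	       (unsigned long long)BB_OFFSET(e),
	       (unsigned long long)BB_LEN(e), BB_ACK(e));
	return 0;
}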
@@ -8157,30 +8138,23 @@ static int md_notify_reboot(struct notifier_block *this,
 	struct mddev *mddev;
 	int need_delay = 0;
 
-	if ((code == SYS_DOWN) || (code == SYS_HALT) || (code == SYS_POWER_OFF)) {
-
-		printk(KERN_INFO "md: stopping all md devices.\n");
-
-		for_each_mddev(mddev, tmp) {
-			if (mddev_trylock(mddev)) {
-				/* Force a switch to readonly even array
-				 * appears to still be in use.  Hence
-				 * the '100'.
-				 */
-				md_set_readonly(mddev, 100);
-				mddev_unlock(mddev);
-			}
-			need_delay = 1;
+	for_each_mddev(mddev, tmp) {
+		if (mddev_trylock(mddev)) {
+			__md_stop_writes(mddev);
+			mddev->safemode = 2;
+			mddev_unlock(mddev);
 		}
-		/*
-		 * certain more exotic SCSI devices are known to be
-		 * volatile wrt too early system reboots. While the
-		 * right place to handle this issue is the given
-		 * driver, we do want to have a safe RAID driver ...
-		 */
-		if (need_delay)
-			mdelay(1000*1);
+		need_delay = 1;
 	}
+	/*
+	 * certain more exotic SCSI devices are known to be
+	 * volatile wrt too early system reboots. While the
+	 * right place to handle this issue is the given
+	 * driver, we do want to have a safe RAID driver ...
+	 */
+	if (need_delay)
+		mdelay(1000*1);
+
 	return NOTIFY_DONE;
 }
 
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 44c63dfeeb2b..1c2063ccf48e 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -128,6 +128,10 @@ struct md_rdev {
 enum flag_bits {
 	Faulty,			/* device is known to have a fault */
 	In_sync,		/* device is in_sync with rest of array */
+	Unmerged,		/* device is being added to array and should
+				 * be considered for bvec_merge_fn but not
+				 * yet for actual IO
+				 */
 	WriteMostly,		/* Avoid reading if at all possible */
 	AutoDetected,		/* added by auto-detect */
 	Blocked,		/* An error occurred but has not yet
@@ -345,6 +349,10 @@ struct mddev {
 	int				degraded;	/* whether md should consider
 							 * adding a spare
 							 */
+	int				merge_check_needed; /* at least one
+							     * member device
+							     * has a
+							     * merge_bvec_fn */
 
 	atomic_t			recovery_active; /* blocks scheduled, but not written */
 	wait_queue_head_t		recovery_wait;
@@ -519,7 +527,10 @@ static inline void sysfs_unlink_rdev(struct mddev *mddev, struct md_rdev *rdev)
 /*
  * iterates through the 'same array disks' ringlist
  */
-#define rdev_for_each(rdev, tmp, mddev)				\
+#define rdev_for_each(rdev, mddev)				\
+	list_for_each_entry(rdev, &((mddev)->disks), same_set)
+
+#define rdev_for_each_safe(rdev, tmp, mddev)			\
 	list_for_each_entry_safe(rdev, tmp, &((mddev)->disks), same_set)
 
 #define rdev_for_each_rcu(rdev, mddev)				\
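
This macro change is what drives the bulk of the mechanical edits in this pull ("md: tidy up rdev_for_each usage"). A small usage sketch, with illustrative callee names:

struct md_rdev *rdev, *tmp;

/* Before: even read-only walks needed a scratch pointer.
 *
 *	rdev_for_each(rdev, tmp, mddev)
 *		inspect(rdev);
 */

/* After: plain traversal takes just the cursor ... */
rdev_for_each(rdev, mddev)
	inspect(rdev);			/* inspect() is hypothetical */

/* ... and only loops that may unlink the current entry keep the
 * list_for_each_entry_safe() form, now spelled rdev_for_each_safe().
 */
rdev_for_each_safe(rdev, tmp, mddev)
	if (test_bit(Faulty, &rdev->flags))
		remove_member(rdev);	/* remove_member() is hypothetical */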
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index a222f516660e..9339e67fcc79 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -428,7 +428,7 @@ static int multipath_run (struct mddev *mddev)
 	}
 
 	working_disks = 0;
-	list_for_each_entry(rdev, &mddev->disks, same_set) {
+	rdev_for_each(rdev, mddev) {
 		disk_idx = rdev->raid_disk;
 		if (disk_idx < 0 ||
 		    disk_idx >= mddev->raid_disks)
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 7294bd115e34..6f31f5596e01 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -91,7 +91,7 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
 
 	if (!conf)
 		return -ENOMEM;
-	list_for_each_entry(rdev1, &mddev->disks, same_set) {
+	rdev_for_each(rdev1, mddev) {
 		pr_debug("md/raid0:%s: looking at %s\n",
 			 mdname(mddev),
 			 bdevname(rdev1->bdev, b));
@@ -102,7 +102,7 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
 		sector_div(sectors, mddev->chunk_sectors);
 		rdev1->sectors = sectors * mddev->chunk_sectors;
 
-		list_for_each_entry(rdev2, &mddev->disks, same_set) {
+		rdev_for_each(rdev2, mddev) {
 			pr_debug("md/raid0:%s: comparing %s(%llu)"
 				 " with %s(%llu)\n",
 				 mdname(mddev),
@@ -157,7 +157,7 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
 	smallest = NULL;
 	dev = conf->devlist;
 	err = -EINVAL;
-	list_for_each_entry(rdev1, &mddev->disks, same_set) {
+	rdev_for_each(rdev1, mddev) {
 		int j = rdev1->raid_disk;
 
 		if (mddev->level == 10) {
@@ -188,16 +188,10 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
 
 		disk_stack_limits(mddev->gendisk, rdev1->bdev,
 				  rdev1->data_offset << 9);
-		/* as we don't honour merge_bvec_fn, we must never risk
-		 * violating it, so limit ->max_segments to 1, lying within
-		 * a single page.
-		 */
 
-		if (rdev1->bdev->bd_disk->queue->merge_bvec_fn) {
-			blk_queue_max_segments(mddev->queue, 1);
-			blk_queue_segment_boundary(mddev->queue,
-						   PAGE_CACHE_SIZE - 1);
-		}
+		if (rdev1->bdev->bd_disk->queue->merge_bvec_fn)
+			conf->has_merge_bvec = 1;
+
 		if (!smallest || (rdev1->sectors < smallest->sectors))
 			smallest = rdev1;
 		cnt++;
@@ -290,8 +284,64 @@ abort:
290 return err; 284 return err;
291} 285}
292 286
287/* Find the zone which holds a particular offset
288 * Update *sectorp to be an offset in that zone
289 */
290static struct strip_zone *find_zone(struct r0conf *conf,
291 sector_t *sectorp)
292{
293 int i;
294 struct strip_zone *z = conf->strip_zone;
295 sector_t sector = *sectorp;
296
297 for (i = 0; i < conf->nr_strip_zones; i++)
298 if (sector < z[i].zone_end) {
299 if (i)
300 *sectorp = sector - z[i-1].zone_end;
301 return z + i;
302 }
303 BUG();
304}
305
306/*
307 * remaps the bio to the target device. we separate two flows.
308 * power 2 flow and a general flow for the sake of perfromance
309*/
310static struct md_rdev *map_sector(struct mddev *mddev, struct strip_zone *zone,
311 sector_t sector, sector_t *sector_offset)
312{
313 unsigned int sect_in_chunk;
314 sector_t chunk;
315 struct r0conf *conf = mddev->private;
316 int raid_disks = conf->strip_zone[0].nb_dev;
317 unsigned int chunk_sects = mddev->chunk_sectors;
318
319 if (is_power_of_2(chunk_sects)) {
320 int chunksect_bits = ffz(~chunk_sects);
321 /* find the sector offset inside the chunk */
322 sect_in_chunk = sector & (chunk_sects - 1);
323 sector >>= chunksect_bits;
324 /* chunk in zone */
325 chunk = *sector_offset;
326 /* quotient is the chunk in the real device */
327 sector_div(chunk, zone->nb_dev << chunksect_bits);
328 } else {
329 sect_in_chunk = sector_div(sector, chunk_sects);
330 chunk = *sector_offset;
331 sector_div(chunk, chunk_sects * zone->nb_dev);
332 }
333 /*
334 * position the bio over the real device
335 * real sector = chunk in device + start of zone
336 * + the position in the chunk
337 */
338 *sector_offset = (chunk * chunk_sects) + sect_in_chunk;
339 return conf->devlist[(zone - conf->strip_zone)*raid_disks
340 + sector_div(sector, zone->nb_dev)];
341}
342
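map_sector() then converts the pair (absolute sector, zone-relative offset) into a member device and a device-relative sector. When chunk_sects is a power of two, ffz(~chunk_sects) is its log2 and the divisions reduce to shifts and masks. A runnable userspace model of that fast path (layout values are illustrative):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* illustrative layout, not from the patch */
	uint64_t chunk_sects = 128;	/* power of two: 64 KiB chunks */
	uint64_t nb_dev = 4;		/* devices in this zone */
	uint64_t sector = 1000;		/* absolute array sector */
	uint64_t zone_off = 1000;	/* zone-relative offset (zone 0 here) */

	int bits = __builtin_ctzll(chunk_sects);  /* == ffz(~chunk_sects) */
	uint64_t in_chunk = sector & (chunk_sects - 1);
	uint64_t dev = (sector >> bits) % nb_dev; /* round-robin striping */
	uint64_t chunk = (zone_off >> bits) / nb_dev;
						  /* whole chunks already laid
						   * down on each device */

	/* device-relative sector = those whole chunks, plus the
	 * position inside the current chunk */
	printf("dev %llu, sector %llu\n",
	       (unsigned long long)dev,
	       (unsigned long long)(chunk * chunk_sects + in_chunk));
	return 0;
}

The general flow is the same arithmetic with sector_div() in place of the shift and mask.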
293/** 343/**
294 * raid0_mergeable_bvec -- tell bio layer if a two requests can be merged 344 * raid0_mergeable_bvec -- tell bio layer if two requests can be merged
295 * @q: request queue 345 * @q: request queue
296 * @bvm: properties of new bio 346 * @bvm: properties of new bio
297 * @biovec: the request that could be merged to it. 347 * @biovec: the request that could be merged to it.
@@ -303,10 +353,15 @@ static int raid0_mergeable_bvec(struct request_queue *q,
303 struct bio_vec *biovec) 353 struct bio_vec *biovec)
304{ 354{
305 struct mddev *mddev = q->queuedata; 355 struct mddev *mddev = q->queuedata;
356 struct r0conf *conf = mddev->private;
306 sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); 357 sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
358 sector_t sector_offset = sector;
307 int max; 359 int max;
308 unsigned int chunk_sectors = mddev->chunk_sectors; 360 unsigned int chunk_sectors = mddev->chunk_sectors;
309 unsigned int bio_sectors = bvm->bi_size >> 9; 361 unsigned int bio_sectors = bvm->bi_size >> 9;
362 struct strip_zone *zone;
363 struct md_rdev *rdev;
364 struct request_queue *subq;
310 365
311 if (is_power_of_2(chunk_sectors)) 366 if (is_power_of_2(chunk_sectors))
312 max = (chunk_sectors - ((sector & (chunk_sectors-1)) 367 max = (chunk_sectors - ((sector & (chunk_sectors-1))
@@ -314,10 +369,27 @@ static int raid0_mergeable_bvec(struct request_queue *q,
314 else 369 else
315 max = (chunk_sectors - (sector_div(sector, chunk_sectors) 370 max = (chunk_sectors - (sector_div(sector, chunk_sectors)
316 + bio_sectors)) << 9; 371 + bio_sectors)) << 9;
317 if (max < 0) max = 0; /* bio_add cannot handle a negative return */ 372 if (max < 0)
373 max = 0; /* bio_add cannot handle a negative return */
318 if (max <= biovec->bv_len && bio_sectors == 0) 374 if (max <= biovec->bv_len && bio_sectors == 0)
319 return biovec->bv_len; 375 return biovec->bv_len;
320 else 376 if (max < biovec->bv_len)
377 /* too small already, no need to check further */
378 return max;
379 if (!conf->has_merge_bvec)
380 return max;
381
382 /* May need to check subordinate device */
383 sector = sector_offset;
384 zone = find_zone(mddev->private, &sector_offset);
385 rdev = map_sector(mddev, zone, sector, &sector_offset);
386 subq = bdev_get_queue(rdev->bdev);
387 if (subq->merge_bvec_fn) {
388 bvm->bi_bdev = rdev->bdev;
389 bvm->bi_sector = sector_offset + zone->dev_start +
390 rdev->data_offset;
391 return min(max, subq->merge_bvec_fn(subq, bvm, biovec));
392 } else
321 return max; 393 return max;
322} 394}
323 395
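The tail added to raid0_mergeable_bvec() has a simple shape: a raid0 bio maps to exactly one member, so at most one subordinate queue needs consulting, and the result is the minimum of the array-level chunk limit and that member's own limit. The translation step, restated with fuller comments (same names as the patch):

/* sector_offset still holds the original absolute array sector */
sector = sector_offset;
zone = find_zone(mddev->private, &sector_offset);
				/* sector_offset: now zone-relative */
rdev = map_sector(mddev, zone, sector, &sector_offset);
				/* sector_offset: now chunk-relative */
subq = bdev_get_queue(rdev->bdev);
if (subq->merge_bvec_fn) {
	/* the member expects sectors relative to its own data area */
	bvm->bi_bdev = rdev->bdev;
	bvm->bi_sector = sector_offset + zone->dev_start +
			 rdev->data_offset;
	max = min(max, subq->merge_bvec_fn(subq, bvm, biovec));
}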
@@ -329,7 +401,7 @@ static sector_t raid0_size(struct mddev *mddev, sector_t sectors, int raid_disks
329 WARN_ONCE(sectors || raid_disks, 401 WARN_ONCE(sectors || raid_disks,
330 "%s does not support generic reshape\n", __func__); 402 "%s does not support generic reshape\n", __func__);
331 403
332 list_for_each_entry(rdev, &mddev->disks, same_set) 404 rdev_for_each(rdev, mddev)
333 array_sectors += rdev->sectors; 405 array_sectors += rdev->sectors;
334 406
335 return array_sectors; 407 return array_sectors;
@@ -397,62 +469,6 @@ static int raid0_stop(struct mddev *mddev)
397 return 0; 469 return 0;
398} 470}
399 471
400/* Find the zone which holds a particular offset
401 * Update *sectorp to be an offset in that zone
402 */
403static struct strip_zone *find_zone(struct r0conf *conf,
404 sector_t *sectorp)
405{
406 int i;
407 struct strip_zone *z = conf->strip_zone;
408 sector_t sector = *sectorp;
409
410 for (i = 0; i < conf->nr_strip_zones; i++)
411 if (sector < z[i].zone_end) {
412 if (i)
413 *sectorp = sector - z[i-1].zone_end;
414 return z + i;
415 }
416 BUG();
417}
418
419/*
420 * remaps the bio to the target device. we separate two flows:
421 * a power-of-2 flow and a general flow, for the sake of performance.
422 */
423static struct md_rdev *map_sector(struct mddev *mddev, struct strip_zone *zone,
424 sector_t sector, sector_t *sector_offset)
425{
426 unsigned int sect_in_chunk;
427 sector_t chunk;
428 struct r0conf *conf = mddev->private;
429 int raid_disks = conf->strip_zone[0].nb_dev;
430 unsigned int chunk_sects = mddev->chunk_sectors;
431
432 if (is_power_of_2(chunk_sects)) {
433 int chunksect_bits = ffz(~chunk_sects);
434 /* find the sector offset inside the chunk */
435 sect_in_chunk = sector & (chunk_sects - 1);
436 sector >>= chunksect_bits;
437 /* chunk in zone */
438 chunk = *sector_offset;
439 /* quotient is the chunk in the real device */
440 sector_div(chunk, zone->nb_dev << chunksect_bits);
441 } else {
442 sect_in_chunk = sector_div(sector, chunk_sects);
443 chunk = *sector_offset;
444 sector_div(chunk, chunk_sects * zone->nb_dev);
445 }
446 /*
447 * position the bio over the real device
448 * real sector = chunk in device + start of zone
449 * + the position in the chunk
450 */
451 *sector_offset = (chunk * chunk_sects) + sect_in_chunk;
452 return conf->devlist[(zone - conf->strip_zone)*raid_disks
453 + sector_div(sector, zone->nb_dev)];
454}
455
456/* 472/*
457 * Is the IO distributed over 1 or more chunks? 473 * Is the IO distributed over 1 or more chunks?
458 */ 474 */
@@ -505,7 +521,7 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio)
505 } 521 }
506 522
507 sector_offset = bio->bi_sector; 523 sector_offset = bio->bi_sector;
508 zone = find_zone(mddev->private, &sector_offset); 524 zone = find_zone(mddev->private, &sector_offset);
509 tmp_dev = map_sector(mddev, zone, bio->bi_sector, 525 tmp_dev = map_sector(mddev, zone, bio->bi_sector,
510 &sector_offset); 526 &sector_offset);
511 bio->bi_bdev = tmp_dev->bdev; 527 bio->bi_bdev = tmp_dev->bdev;
@@ -543,7 +559,7 @@ static void *raid0_takeover_raid45(struct mddev *mddev)
543 return ERR_PTR(-EINVAL); 559 return ERR_PTR(-EINVAL);
544 } 560 }
545 561
546 list_for_each_entry(rdev, &mddev->disks, same_set) { 562 rdev_for_each(rdev, mddev) {
547 /* check slot number for a disk */ 563 /* check slot number for a disk */
548 if (rdev->raid_disk == mddev->raid_disks-1) { 564 if (rdev->raid_disk == mddev->raid_disks-1) {
549 printk(KERN_ERR "md/raid0:%s: raid5 must have missing parity disk!\n", 565 printk(KERN_ERR "md/raid0:%s: raid5 must have missing parity disk!\n",
diff --git a/drivers/md/raid0.h b/drivers/md/raid0.h
index 0884bba8df4c..05539d9c97f0 100644
--- a/drivers/md/raid0.h
+++ b/drivers/md/raid0.h
@@ -4,13 +4,16 @@
4struct strip_zone { 4struct strip_zone {
5 sector_t zone_end; /* Start of the next zone (in sectors) */ 5 sector_t zone_end; /* Start of the next zone (in sectors) */
6 sector_t dev_start; /* Zone offset in real dev (in sectors) */ 6 sector_t dev_start; /* Zone offset in real dev (in sectors) */
7 int nb_dev; /* # of devices attached to the zone */ 7 int nb_dev; /* # of devices attached to the zone */
8}; 8};
9 9
10struct r0conf { 10struct r0conf {
11 struct strip_zone *strip_zone; 11 struct strip_zone *strip_zone;
12 struct md_rdev **devlist; /* lists of rdevs, pointed to by strip_zone->dev */ 12 struct md_rdev **devlist; /* lists of rdevs, pointed to
13 int nr_strip_zones; 13 * by strip_zone->dev */
14 int nr_strip_zones;
15 int has_merge_bvec; /* at least one member has
16 * a merge_bvec_fn */
14}; 17};
15 18
16#endif 19#endif
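A detail the header comment doesn't spell out: devlist is a flattened two-dimensional array, one row of rdev pointers per zone, which is why map_sector() indexes it as (zone - conf->strip_zone) * raid_disks + device. A small model of that layout (hypothetical helper names):

#include <stdlib.h>

struct md_rdev;			/* opaque for this sketch */

/* model of r0conf::devlist: nr_zones rows of raid_disks columns,
 * flattened into a single allocation */
static struct md_rdev **alloc_devlist(int nr_zones, int raid_disks)
{
	return calloc((size_t)nr_zones * raid_disks,
		      sizeof(struct md_rdev *));
}

/* matches map_sector()'s indexing: row = zone, column = device */
static struct md_rdev *zone_dev(struct md_rdev **devlist, int raid_disks,
				int zone, int dev)
{
	return devlist[zone * raid_disks + dev];
}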
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index a0b225eb4ac4..4a40a200d769 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -523,6 +523,7 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
523 rdev = rcu_dereference(conf->mirrors[disk].rdev); 523 rdev = rcu_dereference(conf->mirrors[disk].rdev);
524 if (r1_bio->bios[disk] == IO_BLOCKED 524 if (r1_bio->bios[disk] == IO_BLOCKED
525 || rdev == NULL 525 || rdev == NULL
526 || test_bit(Unmerged, &rdev->flags)
526 || test_bit(Faulty, &rdev->flags)) 527 || test_bit(Faulty, &rdev->flags))
527 continue; 528 continue;
528 if (!test_bit(In_sync, &rdev->flags) && 529 if (!test_bit(In_sync, &rdev->flags) &&
@@ -614,6 +615,39 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
614 return best_disk; 615 return best_disk;
615} 616}
616 617
618static int raid1_mergeable_bvec(struct request_queue *q,
619 struct bvec_merge_data *bvm,
620 struct bio_vec *biovec)
621{
622 struct mddev *mddev = q->queuedata;
623 struct r1conf *conf = mddev->private;
624 sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
625 int max = biovec->bv_len;
626
627 if (mddev->merge_check_needed) {
628 int disk;
629 rcu_read_lock();
630 for (disk = 0; disk < conf->raid_disks * 2; disk++) {
631 struct md_rdev *rdev = rcu_dereference(
632 conf->mirrors[disk].rdev);
633 if (rdev && !test_bit(Faulty, &rdev->flags)) {
634 struct request_queue *q =
635 bdev_get_queue(rdev->bdev);
636 if (q->merge_bvec_fn) {
637 bvm->bi_sector = sector +
638 rdev->data_offset;
639 bvm->bi_bdev = rdev->bdev;
640 max = min(max, q->merge_bvec_fn(
641 q, bvm, biovec));
642 }
643 }
644 }
645 rcu_read_unlock();
646 }
647 return max;
648
649}
650
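One subtlety in raid1_mergeable_bvec() above: the loop runs to conf->raid_disks * 2. In this era's raid1, mirrors[] has twice as many slots as the array has disks, the upper half holding devices being rebuilt as replacements; writes go to a replacement as well as to the member it will supersede, so both halves must be consulted. A sketch of that slot convention (a hedged reading of the r1conf layout, not patch text):

/* slot d:                    the active member, or NULL
 * slot d + conf->raid_disks: the replacement being built for it */
struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev);
struct md_rdev *repl = rcu_dereference(
			conf->mirrors[d + conf->raid_disks].rdev);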
617int md_raid1_congested(struct mddev *mddev, int bits) 651int md_raid1_congested(struct mddev *mddev, int bits)
618{ 652{
619 struct r1conf *conf = mddev->private; 653 struct r1conf *conf = mddev->private;
@@ -737,9 +771,22 @@ static void wait_barrier(struct r1conf *conf)
737 spin_lock_irq(&conf->resync_lock); 771 spin_lock_irq(&conf->resync_lock);
738 if (conf->barrier) { 772 if (conf->barrier) {
739 conf->nr_waiting++; 773 conf->nr_waiting++;
740 wait_event_lock_irq(conf->wait_barrier, !conf->barrier, 774 /* Wait for the barrier to drop.
775 * However if there are already pending
776 * requests (preventing the barrier from
777 * rising completely), and the
778 * pre-process bio queue isn't empty,
779 * then don't wait, as we need to empty
780 * that queue to get the nr_pending
781 * count down.
782 */
783 wait_event_lock_irq(conf->wait_barrier,
784 !conf->barrier ||
785 (conf->nr_pending &&
786 current->bio_list &&
787 !bio_list_empty(current->bio_list)),
741 conf->resync_lock, 788 conf->resync_lock,
742 ); 789 );
743 conf->nr_waiting--; 790 conf->nr_waiting--;
744 } 791 }
745 conf->nr_pending++; 792 conf->nr_pending++;
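The widened wait condition is the substance of the "avoid deadlock during resync/recovery" fix; raid10 receives the identical change further down. A thread inside make_request() may already have bios parked on current->bio_list; those bios count toward nr_pending, so the barrier can never rise fully while we sleep, and sleeping here would deadlock. The escape clause, factored into a predicate with the reasoning attached (a hypothetical helper, not in the patch):

/* hypothetical helper: when may a new request proceed even though
 * the resync barrier has been raised? */
static bool barrier_would_deadlock(struct r1conf *conf)
{
	return conf->nr_pending > 0 &&		/* barrier cannot rise yet */
	       current->bio_list &&		/* we are inside make_request */
	       !bio_list_empty(current->bio_list);
						/* and bios parked by this very
						 * thread are among the pending
						 * ones; only we can submit
						 * them, so we must not sleep */
}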
@@ -1002,7 +1049,8 @@ read_again:
1002 break; 1049 break;
1003 } 1050 }
1004 r1_bio->bios[i] = NULL; 1051 r1_bio->bios[i] = NULL;
1005 if (!rdev || test_bit(Faulty, &rdev->flags)) { 1052 if (!rdev || test_bit(Faulty, &rdev->flags)
1053 || test_bit(Unmerged, &rdev->flags)) {
1006 if (i < conf->raid_disks) 1054 if (i < conf->raid_disks)
1007 set_bit(R1BIO_Degraded, &r1_bio->state); 1055 set_bit(R1BIO_Degraded, &r1_bio->state);
1008 continue; 1056 continue;
@@ -1322,6 +1370,7 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
1322 struct mirror_info *p; 1370 struct mirror_info *p;
1323 int first = 0; 1371 int first = 0;
1324 int last = conf->raid_disks - 1; 1372 int last = conf->raid_disks - 1;
1373 struct request_queue *q = bdev_get_queue(rdev->bdev);
1325 1374
1326 if (mddev->recovery_disabled == conf->recovery_disabled) 1375 if (mddev->recovery_disabled == conf->recovery_disabled)
1327 return -EBUSY; 1376 return -EBUSY;
@@ -1329,23 +1378,17 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
1329 if (rdev->raid_disk >= 0) 1378 if (rdev->raid_disk >= 0)
1330 first = last = rdev->raid_disk; 1379 first = last = rdev->raid_disk;
1331 1380
1381 if (q->merge_bvec_fn) {
1382 set_bit(Unmerged, &rdev->flags);
1383 mddev->merge_check_needed = 1;
1384 }
1385
1332 for (mirror = first; mirror <= last; mirror++) { 1386 for (mirror = first; mirror <= last; mirror++) {
1333 p = conf->mirrors+mirror; 1387 p = conf->mirrors+mirror;
1334 if (!p->rdev) { 1388 if (!p->rdev) {
1335 1389
1336 disk_stack_limits(mddev->gendisk, rdev->bdev, 1390 disk_stack_limits(mddev->gendisk, rdev->bdev,
1337 rdev->data_offset << 9); 1391 rdev->data_offset << 9);
1338 /* as we don't honour merge_bvec_fn, we must
1339 * never risk violating it, so limit
1340 * ->max_segments to one lying with a single
1341 * page, as a one page request is never in
1342 * violation.
1343 */
1344 if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
1345 blk_queue_max_segments(mddev->queue, 1);
1346 blk_queue_segment_boundary(mddev->queue,
1347 PAGE_CACHE_SIZE - 1);
1348 }
1349 1392
1350 p->head_position = 0; 1393 p->head_position = 0;
1351 rdev->raid_disk = mirror; 1394 rdev->raid_disk = mirror;
@@ -1370,6 +1413,19 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
1370 break; 1413 break;
1371 } 1414 }
1372 } 1415 }
1416 if (err == 0 && test_bit(Unmerged, &rdev->flags)) {
1417 /* Some requests might not have seen this new
1418 * merge_bvec_fn. We must wait for them to complete
1419 * before merging the device fully.
1420 * First we make sure any code which has tested
1421 * our function has submitted the request, then
1422 * we wait for all outstanding requests to complete.
1423 */
1424 synchronize_sched();
1425 raise_barrier(conf);
1426 lower_barrier(conf);
1427 clear_bit(Unmerged, &rdev->flags);
1428 }
1373 md_integrity_add_rdev(rdev, mddev); 1429 md_integrity_add_rdev(rdev, mddev);
1374 print_conf(conf); 1430 print_conf(conf);
1375 return err; 1431 return err;
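The Unmerged choreography here recurs in raid10_add_disk() below, so it is worth a condensed timeline: the rdev is published with Unmerged set, which every I/O path now treats like Faulty; synchronize_sched() waits out any submitter that sampled the flags before the update; the barrier pair drains requests already in flight; only then does the device start taking I/O. A commented sketch of the sequence (not literal patch text):

set_bit(Unmerged, &rdev->flags);	/* 1. visible, but read_balance and
					 *    the write paths skip it */
mddev->merge_check_needed = 1;		/*    mergeable_bvec now consults
					 *    member queues */
/* ... slot the device into conf->mirrors[] ... */
synchronize_sched();			/* 2. anyone who tested the merge
					 *    function has finished
					 *    submitting */
raise_barrier(conf);			/* 3. wait for all outstanding
					 *    requests to complete */
lower_barrier(conf);
clear_bit(Unmerged, &rdev->flags);	/* 4. safe: every future bio was
					 *    sized with merge_bvec_fn
					 *    taken into account */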
@@ -2491,7 +2547,7 @@ static struct r1conf *setup_conf(struct mddev *mddev)
2491 2547
2492 err = -EINVAL; 2548 err = -EINVAL;
2493 spin_lock_init(&conf->device_lock); 2549 spin_lock_init(&conf->device_lock);
2494 list_for_each_entry(rdev, &mddev->disks, same_set) { 2550 rdev_for_each(rdev, mddev) {
2495 int disk_idx = rdev->raid_disk; 2551 int disk_idx = rdev->raid_disk;
2496 if (disk_idx >= mddev->raid_disks 2552 if (disk_idx >= mddev->raid_disks
2497 || disk_idx < 0) 2553 || disk_idx < 0)
@@ -2609,20 +2665,11 @@ static int run(struct mddev *mddev)
2609 if (IS_ERR(conf)) 2665 if (IS_ERR(conf))
2610 return PTR_ERR(conf); 2666 return PTR_ERR(conf);
2611 2667
2612 list_for_each_entry(rdev, &mddev->disks, same_set) { 2668 rdev_for_each(rdev, mddev) {
2613 if (!mddev->gendisk) 2669 if (!mddev->gendisk)
2614 continue; 2670 continue;
2615 disk_stack_limits(mddev->gendisk, rdev->bdev, 2671 disk_stack_limits(mddev->gendisk, rdev->bdev,
2616 rdev->data_offset << 9); 2672 rdev->data_offset << 9);
2617 /* as we don't honour merge_bvec_fn, we must never risk
2618 * violating it, so limit ->max_segments to 1 lying within
2619 * a single page, as a one page request is never in violation.
2620 */
2621 if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
2622 blk_queue_max_segments(mddev->queue, 1);
2623 blk_queue_segment_boundary(mddev->queue,
2624 PAGE_CACHE_SIZE - 1);
2625 }
2626 } 2673 }
2627 2674
2628 mddev->degraded = 0; 2675 mddev->degraded = 0;
@@ -2656,6 +2703,7 @@ static int run(struct mddev *mddev)
2656 if (mddev->queue) { 2703 if (mddev->queue) {
2657 mddev->queue->backing_dev_info.congested_fn = raid1_congested; 2704 mddev->queue->backing_dev_info.congested_fn = raid1_congested;
2658 mddev->queue->backing_dev_info.congested_data = mddev; 2705 mddev->queue->backing_dev_info.congested_data = mddev;
2706 blk_queue_merge_bvec(mddev->queue, raid1_mergeable_bvec);
2659 } 2707 }
2660 return md_integrity_register(mddev); 2708 return md_integrity_register(mddev);
2661} 2709}
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 58c44d6453a0..3540316886f2 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -586,25 +586,68 @@ static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev)
586 * @biovec: the request that could be merged to it. 586 * @biovec: the request that could be merged to it.
587 * 587 *
588 * Return amount of bytes we can accept at this offset 588 * Return amount of bytes we can accept at this offset
589 * If near_copies == raid_disks, there are no striping issues, 589 * This requires checking for end-of-chunk if near_copies != raid_disks,
590 * but in that case, the function isn't called at all. 590 * and for subordinate merge_bvec_fns if merge_check_needed.
591 */ 591 */
592static int raid10_mergeable_bvec(struct request_queue *q, 592static int raid10_mergeable_bvec(struct request_queue *q,
593 struct bvec_merge_data *bvm, 593 struct bvec_merge_data *bvm,
594 struct bio_vec *biovec) 594 struct bio_vec *biovec)
595{ 595{
596 struct mddev *mddev = q->queuedata; 596 struct mddev *mddev = q->queuedata;
597 struct r10conf *conf = mddev->private;
597 sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); 598 sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
598 int max; 599 int max;
599 unsigned int chunk_sectors = mddev->chunk_sectors; 600 unsigned int chunk_sectors = mddev->chunk_sectors;
600 unsigned int bio_sectors = bvm->bi_size >> 9; 601 unsigned int bio_sectors = bvm->bi_size >> 9;
601 602
602 max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9; 603 if (conf->near_copies < conf->raid_disks) {
603 if (max < 0) max = 0; /* bio_add cannot handle a negative return */ 604 max = (chunk_sectors - ((sector & (chunk_sectors - 1))
604 if (max <= biovec->bv_len && bio_sectors == 0) 605 + bio_sectors)) << 9;
605 return biovec->bv_len; 606 if (max < 0)
606 else 607 /* bio_add cannot handle a negative return */
607 return max; 608 max = 0;
609 if (max <= biovec->bv_len && bio_sectors == 0)
610 return biovec->bv_len;
611 } else
612 max = biovec->bv_len;
613
614 if (mddev->merge_check_needed) {
615 struct r10bio r10_bio;
616 int s;
617 r10_bio.sector = sector;
618 raid10_find_phys(conf, &r10_bio);
619 rcu_read_lock();
620 for (s = 0; s < conf->copies; s++) {
621 int disk = r10_bio.devs[s].devnum;
622 struct md_rdev *rdev = rcu_dereference(
623 conf->mirrors[disk].rdev);
624 if (rdev && !test_bit(Faulty, &rdev->flags)) {
625 struct request_queue *q =
626 bdev_get_queue(rdev->bdev);
627 if (q->merge_bvec_fn) {
628 bvm->bi_sector = r10_bio.devs[s].addr
629 + rdev->data_offset;
630 bvm->bi_bdev = rdev->bdev;
631 max = min(max, q->merge_bvec_fn(
632 q, bvm, biovec));
633 }
634 }
635 rdev = rcu_dereference(conf->mirrors[disk].replacement);
636 if (rdev && !test_bit(Faulty, &rdev->flags)) {
637 struct request_queue *q =
638 bdev_get_queue(rdev->bdev);
639 if (q->merge_bvec_fn) {
640 bvm->bi_sector = r10_bio.devs[s].addr
641 + rdev->data_offset;
642 bvm->bi_bdev = rdev->bdev;
643 max = min(max, q->merge_bvec_fn(
644 q, bvm, biovec));
645 }
646 }
647 }
648 rcu_read_unlock();
649 }
650 return max;
608} 651}
609 652
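raid10_mergeable_bvec() repeats the same block twice per copy, once for the member and once for its replacement, since writes are issued to both. The duplication could be folded into a helper along these lines (a hypothetical refactoring, not part of the patch):

/* hypothetical helper: clamp 'max' by one member's merge_bvec_fn
 * after translating the bvm into that member's coordinates */
static int limit_by_rdev(struct md_rdev *rdev, sector_t addr,
			 struct bvec_merge_data *bvm,
			 struct bio_vec *biovec, int max)
{
	struct request_queue *q;

	if (!rdev || test_bit(Faulty, &rdev->flags))
		return max;
	q = bdev_get_queue(rdev->bdev);
	if (q->merge_bvec_fn) {
		bvm->bi_sector = addr + rdev->data_offset;
		bvm->bi_bdev = rdev->bdev;
		max = min(max, q->merge_bvec_fn(q, bvm, biovec));
	}
	return max;
}

Each copy would then call it for conf->mirrors[disk].rdev and again for conf->mirrors[disk].replacement.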
610/* 653/*
@@ -668,11 +711,12 @@ retry:
668 disk = r10_bio->devs[slot].devnum; 711 disk = r10_bio->devs[slot].devnum;
669 rdev = rcu_dereference(conf->mirrors[disk].replacement); 712 rdev = rcu_dereference(conf->mirrors[disk].replacement);
670 if (rdev == NULL || test_bit(Faulty, &rdev->flags) || 713 if (rdev == NULL || test_bit(Faulty, &rdev->flags) ||
714 test_bit(Unmerged, &rdev->flags) ||
671 r10_bio->devs[slot].addr + sectors > rdev->recovery_offset) 715 r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
672 rdev = rcu_dereference(conf->mirrors[disk].rdev); 716 rdev = rcu_dereference(conf->mirrors[disk].rdev);
673 if (rdev == NULL) 717 if (rdev == NULL ||
674 continue; 718 test_bit(Faulty, &rdev->flags) ||
675 if (test_bit(Faulty, &rdev->flags)) 719 test_bit(Unmerged, &rdev->flags))
676 continue; 720 continue;
677 if (!test_bit(In_sync, &rdev->flags) && 721 if (!test_bit(In_sync, &rdev->flags) &&
678 r10_bio->devs[slot].addr + sectors > rdev->recovery_offset) 722 r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
@@ -863,9 +907,22 @@ static void wait_barrier(struct r10conf *conf)
863 spin_lock_irq(&conf->resync_lock); 907 spin_lock_irq(&conf->resync_lock);
864 if (conf->barrier) { 908 if (conf->barrier) {
865 conf->nr_waiting++; 909 conf->nr_waiting++;
866 wait_event_lock_irq(conf->wait_barrier, !conf->barrier, 910 /* Wait for the barrier to drop.
911 * However if there are already pending
912 * requests (preventing the barrier from
913 * rising completely), and the
914 * pre-process bio queue isn't empty,
915 * then don't wait, as we need to empty
916 * that queue to get the nr_pending
917 * count down.
918 */
919 wait_event_lock_irq(conf->wait_barrier,
920 !conf->barrier ||
921 (conf->nr_pending &&
922 current->bio_list &&
923 !bio_list_empty(current->bio_list)),
867 conf->resync_lock, 924 conf->resync_lock,
868 ); 925 );
869 conf->nr_waiting--; 926 conf->nr_waiting--;
870 } 927 }
871 conf->nr_pending++; 928 conf->nr_pending++;
@@ -1121,12 +1178,14 @@ retry_write:
1121 blocked_rdev = rrdev; 1178 blocked_rdev = rrdev;
1122 break; 1179 break;
1123 } 1180 }
1124 if (rrdev && test_bit(Faulty, &rrdev->flags)) 1181 if (rrdev && (test_bit(Faulty, &rrdev->flags)
1182 || test_bit(Unmerged, &rrdev->flags)))
1125 rrdev = NULL; 1183 rrdev = NULL;
1126 1184
1127 r10_bio->devs[i].bio = NULL; 1185 r10_bio->devs[i].bio = NULL;
1128 r10_bio->devs[i].repl_bio = NULL; 1186 r10_bio->devs[i].repl_bio = NULL;
1129 if (!rdev || test_bit(Faulty, &rdev->flags)) { 1187 if (!rdev || test_bit(Faulty, &rdev->flags) ||
1188 test_bit(Unmerged, &rdev->flags)) {
1130 set_bit(R10BIO_Degraded, &r10_bio->state); 1189 set_bit(R10BIO_Degraded, &r10_bio->state);
1131 continue; 1190 continue;
1132 } 1191 }
@@ -1477,18 +1536,24 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
1477 int mirror; 1536 int mirror;
1478 int first = 0; 1537 int first = 0;
1479 int last = conf->raid_disks - 1; 1538 int last = conf->raid_disks - 1;
1539 struct request_queue *q = bdev_get_queue(rdev->bdev);
1480 1540
1481 if (mddev->recovery_cp < MaxSector) 1541 if (mddev->recovery_cp < MaxSector)
1482 /* only hot-add to in-sync arrays, as recovery is 1542 /* only hot-add to in-sync arrays, as recovery is
1483 * very different from resync 1543 * very different from resync
1484 */ 1544 */
1485 return -EBUSY; 1545 return -EBUSY;
1486 if (!enough(conf, -1)) 1546 if (rdev->saved_raid_disk < 0 && !enough(conf, -1))
1487 return -EINVAL; 1547 return -EINVAL;
1488 1548
1489 if (rdev->raid_disk >= 0) 1549 if (rdev->raid_disk >= 0)
1490 first = last = rdev->raid_disk; 1550 first = last = rdev->raid_disk;
1491 1551
1552 if (q->merge_bvec_fn) {
1553 set_bit(Unmerged, &rdev->flags);
1554 mddev->merge_check_needed = 1;
1555 }
1556
1492 if (rdev->saved_raid_disk >= first && 1557 if (rdev->saved_raid_disk >= first &&
1493 conf->mirrors[rdev->saved_raid_disk].rdev == NULL) 1558 conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
1494 mirror = rdev->saved_raid_disk; 1559 mirror = rdev->saved_raid_disk;
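The relaxed check above is the raid10 half of "md: allow re-add to failed arrays" (raid5_add_disk gets the same treatment at the end of this diff). In sketch form, with the two cases separated (a commented restatement, not new code):

/* a brand-new spare is useless on an array that has already lost
 * too many members: there is nothing consistent to rebuild from */
if (rdev->saved_raid_disk < 0 && !enough(conf, -1))
	return -EINVAL;
/* but a returning member (saved_raid_disk >= 0) may hold the very
 * data the array is missing, so re-adding it is allowed even when
 * enough() currently says the array is dead */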
@@ -1508,11 +1573,6 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
1508 err = 0; 1573 err = 0;
1509 disk_stack_limits(mddev->gendisk, rdev->bdev, 1574 disk_stack_limits(mddev->gendisk, rdev->bdev,
1510 rdev->data_offset << 9); 1575 rdev->data_offset << 9);
1511 if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
1512 blk_queue_max_segments(mddev->queue, 1);
1513 blk_queue_segment_boundary(mddev->queue,
1514 PAGE_CACHE_SIZE - 1);
1515 }
1516 conf->fullsync = 1; 1576 conf->fullsync = 1;
1517 rcu_assign_pointer(p->replacement, rdev); 1577 rcu_assign_pointer(p->replacement, rdev);
1518 break; 1578 break;
@@ -1520,17 +1580,6 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
1520 1580
1521 disk_stack_limits(mddev->gendisk, rdev->bdev, 1581 disk_stack_limits(mddev->gendisk, rdev->bdev,
1522 rdev->data_offset << 9); 1582 rdev->data_offset << 9);
1523 /* as we don't honour merge_bvec_fn, we must
1524 * never risk violating it, so limit
1525 * ->max_segments to one lying with a single
1526 * page, as a one page request is never in
1527 * violation.
1528 */
1529 if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
1530 blk_queue_max_segments(mddev->queue, 1);
1531 blk_queue_segment_boundary(mddev->queue,
1532 PAGE_CACHE_SIZE - 1);
1533 }
1534 1583
1535 p->head_position = 0; 1584 p->head_position = 0;
1536 p->recovery_disabled = mddev->recovery_disabled - 1; 1585 p->recovery_disabled = mddev->recovery_disabled - 1;
@@ -1541,7 +1590,19 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
1541 rcu_assign_pointer(p->rdev, rdev); 1590 rcu_assign_pointer(p->rdev, rdev);
1542 break; 1591 break;
1543 } 1592 }
1544 1593 if (err == 0 && test_bit(Unmerged, &rdev->flags)) {
1594 /* Some requests might not have seen this new
1595 * merge_bvec_fn. We must wait for them to complete
1596 * before merging the device fully.
1597 * First we make sure any code which has tested
1598 * our function has submitted the request, then
1599 * we wait for all outstanding requests to complete.
1600 */
1601 synchronize_sched();
1602 raise_barrier(conf, 0);
1603 lower_barrier(conf);
1604 clear_bit(Unmerged, &rdev->flags);
1605 }
1545 md_integrity_add_rdev(rdev, mddev); 1606 md_integrity_add_rdev(rdev, mddev);
1546 print_conf(conf); 1607 print_conf(conf);
1547 return err; 1608 return err;
@@ -1682,10 +1743,8 @@ static void end_sync_write(struct bio *bio, int error)
1682 d = find_bio_disk(conf, r10_bio, bio, &slot, &repl); 1743 d = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
1683 if (repl) 1744 if (repl)
1684 rdev = conf->mirrors[d].replacement; 1745 rdev = conf->mirrors[d].replacement;
1685 if (!rdev) { 1746 else
1686 smp_mb();
1687 rdev = conf->mirrors[d].rdev; 1747 rdev = conf->mirrors[d].rdev;
1688 }
1689 1748
1690 if (!uptodate) { 1749 if (!uptodate) {
1691 if (repl) 1750 if (repl)
@@ -2087,6 +2146,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
2087 d = r10_bio->devs[sl].devnum; 2146 d = r10_bio->devs[sl].devnum;
2088 rdev = rcu_dereference(conf->mirrors[d].rdev); 2147 rdev = rcu_dereference(conf->mirrors[d].rdev);
2089 if (rdev && 2148 if (rdev &&
2149 !test_bit(Unmerged, &rdev->flags) &&
2090 test_bit(In_sync, &rdev->flags) && 2150 test_bit(In_sync, &rdev->flags) &&
2091 is_badblock(rdev, r10_bio->devs[sl].addr + sect, s, 2151 is_badblock(rdev, r10_bio->devs[sl].addr + sect, s,
2092 &first_bad, &bad_sectors) == 0) { 2152 &first_bad, &bad_sectors) == 0) {
@@ -2140,6 +2200,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
2140 d = r10_bio->devs[sl].devnum; 2200 d = r10_bio->devs[sl].devnum;
2141 rdev = rcu_dereference(conf->mirrors[d].rdev); 2201 rdev = rcu_dereference(conf->mirrors[d].rdev);
2142 if (!rdev || 2202 if (!rdev ||
2203 test_bit(Unmerged, &rdev->flags) ||
2143 !test_bit(In_sync, &rdev->flags)) 2204 !test_bit(In_sync, &rdev->flags))
2144 continue; 2205 continue;
2145 2206
@@ -3242,7 +3303,7 @@ static int run(struct mddev *mddev)
3242 blk_queue_io_opt(mddev->queue, chunk_size * 3303 blk_queue_io_opt(mddev->queue, chunk_size *
3243 (conf->raid_disks / conf->near_copies)); 3304 (conf->raid_disks / conf->near_copies));
3244 3305
3245 list_for_each_entry(rdev, &mddev->disks, same_set) { 3306 rdev_for_each(rdev, mddev) {
3246 3307
3247 disk_idx = rdev->raid_disk; 3308 disk_idx = rdev->raid_disk;
3248 if (disk_idx >= conf->raid_disks 3309 if (disk_idx >= conf->raid_disks
@@ -3262,15 +3323,6 @@ static int run(struct mddev *mddev)
3262 3323
3263 disk_stack_limits(mddev->gendisk, rdev->bdev, 3324 disk_stack_limits(mddev->gendisk, rdev->bdev,
3264 rdev->data_offset << 9); 3325 rdev->data_offset << 9);
3265 /* as we don't honour merge_bvec_fn, we must never risk
3266 * violating it, so limit max_segments to 1 lying
3267 * within a single page.
3268 */
3269 if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
3270 blk_queue_max_segments(mddev->queue, 1);
3271 blk_queue_segment_boundary(mddev->queue,
3272 PAGE_CACHE_SIZE - 1);
3273 }
3274 3326
3275 disk->head_position = 0; 3327 disk->head_position = 0;
3276 } 3328 }
@@ -3334,8 +3386,7 @@ static int run(struct mddev *mddev)
3334 mddev->queue->backing_dev_info.ra_pages = 2 * stripe; 3386 mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
3335 } 3387 }
3336 3388
3337 if (conf->near_copies < conf->raid_disks) 3389 blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec);
3338 blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec);
3339 3390
3340 if (md_integrity_register(mddev)) 3391 if (md_integrity_register(mddev))
3341 goto out_free_conf; 3392 goto out_free_conf;
@@ -3385,6 +3436,43 @@ static void raid10_quiesce(struct mddev *mddev, int state)
3385 } 3436 }
3386} 3437}
3387 3438
3439static int raid10_resize(struct mddev *mddev, sector_t sectors)
3440{
3441 /* Resize of 'far' arrays is not supported.
3442 * For 'near' and 'offset' arrays we can set the
3443 * number of sectors used to be an appropriate multiple
3444 * of the chunk size.
3445 * For 'offset', this is far_copies*chunksize.
3446 * For 'near' the multiplier is the LCM of
3447 * near_copies and raid_disks.
3448 * So if far_copies > 1 && !far_offset, fail.
3449 * Else find LCM(raid_disks, near_copies)*far_copies and
3450 * multiply by chunk_size. Then round to this number.
3451 * This is mostly done by raid10_size()
3452 */
3453 struct r10conf *conf = mddev->private;
3454 sector_t oldsize, size;
3455
3456 if (conf->far_copies > 1 && !conf->far_offset)
3457 return -EINVAL;
3458
3459 oldsize = raid10_size(mddev, 0, 0);
3460 size = raid10_size(mddev, sectors, 0);
3461 md_set_array_sectors(mddev, size);
3462 if (mddev->array_sectors > size)
3463 return -EINVAL;
3464 set_capacity(mddev->gendisk, mddev->array_sectors);
3465 revalidate_disk(mddev->gendisk);
3466 if (sectors > mddev->dev_sectors &&
3467 mddev->recovery_cp > oldsize) {
3468 mddev->recovery_cp = oldsize;
3469 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3470 }
3471 mddev->dev_sectors = sectors;
3472 mddev->resync_max_sectors = size;
3473 return 0;
3474}
3475
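The rounding rule in the comment can be checked with a few lines of userspace arithmetic; the figures below are illustrative, and the authoritative computation stays in raid10_size():

#include <stdint.h>
#include <stdio.h>

static uint64_t gcd(uint64_t a, uint64_t b)
{
	while (b) {
		uint64_t t = a % b;
		a = b;
		b = t;
	}
	return a;
}

int main(void)
{
	/* illustrative 'near' layout, not from the patch */
	uint64_t raid_disks = 4, near_copies = 2, far_copies = 1;
	uint64_t chunk_sectors = 1024;	/* 512 KiB chunks */
	uint64_t wanted = 1953525168;	/* requested size, in sectors */

	/* the comment's rule: the usable size must be a multiple of
	 * LCM(raid_disks, near_copies) * far_copies * chunk size */
	uint64_t lcm = raid_disks / gcd(raid_disks, near_copies) * near_copies;
	uint64_t step = lcm * far_copies * chunk_sectors;

	printf("step %llu sectors, rounded size %llu\n",
	       (unsigned long long)step,
	       (unsigned long long)(wanted / step * step));
	return 0;
}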
3388static void *raid10_takeover_raid0(struct mddev *mddev) 3476static void *raid10_takeover_raid0(struct mddev *mddev)
3389{ 3477{
3390 struct md_rdev *rdev; 3478 struct md_rdev *rdev;
@@ -3408,7 +3496,7 @@ static void *raid10_takeover_raid0(struct mddev *mddev)
3408 3496
3409 conf = setup_conf(mddev); 3497 conf = setup_conf(mddev);
3410 if (!IS_ERR(conf)) { 3498 if (!IS_ERR(conf)) {
3411 list_for_each_entry(rdev, &mddev->disks, same_set) 3499 rdev_for_each(rdev, mddev)
3412 if (rdev->raid_disk >= 0) 3500 if (rdev->raid_disk >= 0)
3413 rdev->new_raid_disk = rdev->raid_disk * 2; 3501 rdev->new_raid_disk = rdev->raid_disk * 2;
3414 conf->barrier = 1; 3502 conf->barrier = 1;
@@ -3454,6 +3542,7 @@ static struct md_personality raid10_personality =
3454 .sync_request = sync_request, 3542 .sync_request = sync_request,
3455 .quiesce = raid10_quiesce, 3543 .quiesce = raid10_quiesce,
3456 .size = raid10_size, 3544 .size = raid10_size,
3545 .resize = raid10_resize,
3457 .takeover = raid10_takeover, 3546 .takeover = raid10_takeover,
3458}; 3547};
3459 3548
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 360f2b98f62b..23ac880bba9a 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -208,11 +208,10 @@ static void __release_stripe(struct r5conf *conf, struct stripe_head *sh)
208 md_wakeup_thread(conf->mddev->thread); 208 md_wakeup_thread(conf->mddev->thread);
209 } else { 209 } else {
210 BUG_ON(stripe_operations_active(sh)); 210 BUG_ON(stripe_operations_active(sh));
211 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { 211 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
212 atomic_dec(&conf->preread_active_stripes); 212 if (atomic_dec_return(&conf->preread_active_stripes)
213 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) 213 < IO_THRESHOLD)
214 md_wakeup_thread(conf->mddev->thread); 214 md_wakeup_thread(conf->mddev->thread);
215 }
216 atomic_dec(&conf->active_stripes); 215 atomic_dec(&conf->active_stripes);
217 if (!test_bit(STRIPE_EXPANDING, &sh->state)) { 216 if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
218 list_add_tail(&sh->lru, &conf->inactive_list); 217 list_add_tail(&sh->lru, &conf->inactive_list);
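The change in this hunk is more than tidying: with separate atomic_dec() and atomic_read(), another CPU can slip its own decrement in between, so no CPU is guaranteed to observe the value its own decrement produced, and the IO_THRESHOLD comparison becomes unreliable. atomic_dec_return() makes the decrement and the observation a single indivisible step, in one atomic operation instead of two:

/* before: two atomics, and the read may see other CPUs' updates
 *
 *   CPU0: atomic_dec(&n);            n: 5 -> 4
 *   CPU1: atomic_dec(&n);            n: 4 -> 3
 *   CPU0: atomic_read(&n) == 3       CPU0 never observes its own 4
 *
 * after: each CPU sees exactly the result of its own decrement */
if (atomic_dec_return(&conf->preread_active_stripes) < IO_THRESHOLD)
	md_wakeup_thread(conf->mddev->thread);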
@@ -4843,7 +4842,7 @@ static struct r5conf *setup_conf(struct mddev *mddev)
4843 4842
4844 pr_debug("raid456: run(%s) called.\n", mdname(mddev)); 4843 pr_debug("raid456: run(%s) called.\n", mdname(mddev));
4845 4844
4846 list_for_each_entry(rdev, &mddev->disks, same_set) { 4845 rdev_for_each(rdev, mddev) {
4847 raid_disk = rdev->raid_disk; 4846 raid_disk = rdev->raid_disk;
4848 if (raid_disk >= max_disks 4847 if (raid_disk >= max_disks
4849 || raid_disk < 0) 4848 || raid_disk < 0)
@@ -5178,7 +5177,7 @@ static int run(struct mddev *mddev)
5178 blk_queue_io_opt(mddev->queue, chunk_size * 5177 blk_queue_io_opt(mddev->queue, chunk_size *
5179 (conf->raid_disks - conf->max_degraded)); 5178 (conf->raid_disks - conf->max_degraded));
5180 5179
5181 list_for_each_entry(rdev, &mddev->disks, same_set) 5180 rdev_for_each(rdev, mddev)
5182 disk_stack_limits(mddev->gendisk, rdev->bdev, 5181 disk_stack_limits(mddev->gendisk, rdev->bdev,
5183 rdev->data_offset << 9); 5182 rdev->data_offset << 9);
5184 } 5183 }
@@ -5362,7 +5361,7 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
5362 if (mddev->recovery_disabled == conf->recovery_disabled) 5361 if (mddev->recovery_disabled == conf->recovery_disabled)
5363 return -EBUSY; 5362 return -EBUSY;
5364 5363
5365 if (has_failed(conf)) 5364 if (rdev->saved_raid_disk < 0 && has_failed(conf))
5366 /* no point adding a device */ 5365 /* no point adding a device */
5367 return -EINVAL; 5366 return -EINVAL;
5368 5367
@@ -5501,7 +5500,7 @@ static int raid5_start_reshape(struct mddev *mddev)
5501 if (!check_stripe_cache(mddev)) 5500 if (!check_stripe_cache(mddev))
5502 return -ENOSPC; 5501 return -ENOSPC;
5503 5502
5504 list_for_each_entry(rdev, &mddev->disks, same_set) 5503 rdev_for_each(rdev, mddev)
5505 if (!test_bit(In_sync, &rdev->flags) 5504 if (!test_bit(In_sync, &rdev->flags)
5506 && !test_bit(Faulty, &rdev->flags)) 5505 && !test_bit(Faulty, &rdev->flags))
5507 spares++; 5506 spares++;
@@ -5547,16 +5546,14 @@ static int raid5_start_reshape(struct mddev *mddev)
5547 * such devices during the reshape and confusion could result. 5546 * such devices during the reshape and confusion could result.
5548 */ 5547 */
5549 if (mddev->delta_disks >= 0) { 5548 if (mddev->delta_disks >= 0) {
5550 int added_devices = 0; 5549 rdev_for_each(rdev, mddev)
5551 list_for_each_entry(rdev, &mddev->disks, same_set)
5552 if (rdev->raid_disk < 0 && 5550 if (rdev->raid_disk < 0 &&
5553 !test_bit(Faulty, &rdev->flags)) { 5551 !test_bit(Faulty, &rdev->flags)) {
5554 if (raid5_add_disk(mddev, rdev) == 0) { 5552 if (raid5_add_disk(mddev, rdev) == 0) {
5555 if (rdev->raid_disk 5553 if (rdev->raid_disk
5556 >= conf->previous_raid_disks) { 5554 >= conf->previous_raid_disks)
5557 set_bit(In_sync, &rdev->flags); 5555 set_bit(In_sync, &rdev->flags);
5558 added_devices++; 5556 else
5559 } else
5560 rdev->recovery_offset = 0; 5557 rdev->recovery_offset = 0;
5561 5558
5562 if (sysfs_link_rdev(mddev, rdev)) 5559 if (sysfs_link_rdev(mddev, rdev))
@@ -5566,7 +5563,6 @@ static int raid5_start_reshape(struct mddev *mddev)
5566 && !test_bit(Faulty, &rdev->flags)) { 5563 && !test_bit(Faulty, &rdev->flags)) {
5567 /* This is a spare that was manually added */ 5564 /* This is a spare that was manually added */
5568 set_bit(In_sync, &rdev->flags); 5565 set_bit(In_sync, &rdev->flags);
5569 added_devices++;
5570 } 5566 }
5571 5567
5572 /* When a reshape changes the number of devices, 5568 /* When a reshape changes the number of devices,
@@ -5592,6 +5588,7 @@ static int raid5_start_reshape(struct mddev *mddev)
5592 spin_lock_irq(&conf->device_lock); 5588 spin_lock_irq(&conf->device_lock);
5593 mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks; 5589 mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks;
5594 conf->reshape_progress = MaxSector; 5590 conf->reshape_progress = MaxSector;
5591 mddev->reshape_position = MaxSector;
5595 spin_unlock_irq(&conf->device_lock); 5592 spin_unlock_irq(&conf->device_lock);
5596 return -EAGAIN; 5593 return -EAGAIN;
5597 } 5594 }