about summary refs log tree commit diff stats
path: root/drivers/md
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/md')
-rw-r--r-- drivers/md/bitmap.c 152
-rw-r--r-- drivers/md/bitmap.h 22
-rw-r--r-- drivers/md/dm-raid.c 16
-rw-r--r-- drivers/md/faulty.c 2
-rw-r--r-- drivers/md/linear.c 32
-rw-r--r-- drivers/md/md.c 140
-rw-r--r-- drivers/md/md.h 13
-rw-r--r-- drivers/md/multipath.c 2
-rw-r--r-- drivers/md/raid0.c 164
-rw-r--r-- drivers/md/raid0.h 11
-rw-r--r-- drivers/md/raid1.c 98
-rw-r--r-- drivers/md/raid10.c 187
-rw-r--r-- drivers/md/raid5.c 25
13 files changed, 491 insertions, 373 deletions
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 045e086144ad..3d0dfa7a89a2 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -26,6 +26,7 @@
26#include <linux/file.h> 26#include <linux/file.h>
27#include <linux/mount.h> 27#include <linux/mount.h>
28#include <linux/buffer_head.h> 28#include <linux/buffer_head.h>
29#include <linux/seq_file.h>
29#include "md.h" 30#include "md.h"
30#include "bitmap.h" 31#include "bitmap.h"
31 32
@@ -35,31 +36,6 @@ static inline char *bmname(struct bitmap *bitmap)
35} 36}
36 37
37/* 38/*
38 * just a placeholder - calls kmalloc for bitmap pages
39 */
40static unsigned char *bitmap_alloc_page(struct bitmap *bitmap)
41{
42 unsigned char *page;
43
44 page = kzalloc(PAGE_SIZE, GFP_NOIO);
45 if (!page)
46 printk("%s: bitmap_alloc_page FAILED\n", bmname(bitmap));
47 else
48 pr_debug("%s: bitmap_alloc_page: allocated page at %p\n",
49 bmname(bitmap), page);
50 return page;
51}
52
53/*
54 * for now just a placeholder -- just calls kfree for bitmap pages
55 */
56static void bitmap_free_page(struct bitmap *bitmap, unsigned char *page)
57{
58 pr_debug("%s: bitmap_free_page: free page %p\n", bmname(bitmap), page);
59 kfree(page);
60}
61
62/*
63 * check a page and, if necessary, allocate it (or hijack it if the alloc fails) 39 * check a page and, if necessary, allocate it (or hijack it if the alloc fails)
64 * 40 *
65 * 1) check to see if this page is allocated, if it's not then try to alloc 41 * 1) check to see if this page is allocated, if it's not then try to alloc
@@ -96,7 +72,7 @@ __acquires(bitmap->lock)
96 /* this page has not been allocated yet */ 72 /* this page has not been allocated yet */
97 73
98 spin_unlock_irq(&bitmap->lock); 74 spin_unlock_irq(&bitmap->lock);
99 mappage = bitmap_alloc_page(bitmap); 75 mappage = kzalloc(PAGE_SIZE, GFP_NOIO);
100 spin_lock_irq(&bitmap->lock); 76 spin_lock_irq(&bitmap->lock);
101 77
102 if (mappage == NULL) { 78 if (mappage == NULL) {
@@ -109,7 +85,7 @@ __acquires(bitmap->lock)
109 } else if (bitmap->bp[page].map || 85 } else if (bitmap->bp[page].map ||
110 bitmap->bp[page].hijacked) { 86 bitmap->bp[page].hijacked) {
111 /* somebody beat us to getting the page */ 87 /* somebody beat us to getting the page */
112 bitmap_free_page(bitmap, mappage); 88 kfree(mappage);
113 return 0; 89 return 0;
114 } else { 90 } else {
115 91
@@ -141,7 +117,7 @@ static void bitmap_checkfree(struct bitmap *bitmap, unsigned long page)
141 ptr = bitmap->bp[page].map; 117 ptr = bitmap->bp[page].map;
142 bitmap->bp[page].map = NULL; 118 bitmap->bp[page].map = NULL;
143 bitmap->missing_pages++; 119 bitmap->missing_pages++;
144 bitmap_free_page(bitmap, ptr); 120 kfree(ptr);
145 } 121 }
146} 122}
147 123
@@ -171,7 +147,7 @@ static struct page *read_sb_page(struct mddev *mddev, loff_t offset,
171 did_alloc = 1; 147 did_alloc = 1;
172 } 148 }
173 149
174 list_for_each_entry(rdev, &mddev->disks, same_set) { 150 rdev_for_each(rdev, mddev) {
175 if (! test_bit(In_sync, &rdev->flags) 151 if (! test_bit(In_sync, &rdev->flags)
176 || test_bit(Faulty, &rdev->flags)) 152 || test_bit(Faulty, &rdev->flags))
177 continue; 153 continue;
@@ -445,18 +421,13 @@ out:
445void bitmap_update_sb(struct bitmap *bitmap) 421void bitmap_update_sb(struct bitmap *bitmap)
446{ 422{
447 bitmap_super_t *sb; 423 bitmap_super_t *sb;
448 unsigned long flags;
449 424
450 if (!bitmap || !bitmap->mddev) /* no bitmap for this array */ 425 if (!bitmap || !bitmap->mddev) /* no bitmap for this array */
451 return; 426 return;
452 if (bitmap->mddev->bitmap_info.external) 427 if (bitmap->mddev->bitmap_info.external)
453 return; 428 return;
454 spin_lock_irqsave(&bitmap->lock, flags); 429 if (!bitmap->sb_page) /* no superblock */
455 if (!bitmap->sb_page) { /* no superblock */
456 spin_unlock_irqrestore(&bitmap->lock, flags);
457 return; 430 return;
458 }
459 spin_unlock_irqrestore(&bitmap->lock, flags);
460 sb = kmap_atomic(bitmap->sb_page); 431 sb = kmap_atomic(bitmap->sb_page);
461 sb->events = cpu_to_le64(bitmap->mddev->events); 432 sb->events = cpu_to_le64(bitmap->mddev->events);
462 if (bitmap->mddev->events < bitmap->events_cleared) 433 if (bitmap->mddev->events < bitmap->events_cleared)
@@ -632,26 +603,28 @@ static int bitmap_read_sb(struct bitmap *bitmap)
632 /* keep the array size field of the bitmap superblock up to date */ 603 /* keep the array size field of the bitmap superblock up to date */
633 sb->sync_size = cpu_to_le64(bitmap->mddev->resync_max_sectors); 604 sb->sync_size = cpu_to_le64(bitmap->mddev->resync_max_sectors);
634 605
635 if (!bitmap->mddev->persistent) 606 if (bitmap->mddev->persistent) {
636 goto success; 607 /*
637 608 * We have a persistent array superblock, so compare the
638 /* 609 * bitmap's UUID and event counter to the mddev's
639 * if we have a persistent array superblock, compare the 610 */
640 * bitmap's UUID and event counter to the mddev's 611 if (memcmp(sb->uuid, bitmap->mddev->uuid, 16)) {
641 */ 612 printk(KERN_INFO
642 if (memcmp(sb->uuid, bitmap->mddev->uuid, 16)) { 613 "%s: bitmap superblock UUID mismatch\n",
643 printk(KERN_INFO "%s: bitmap superblock UUID mismatch\n", 614 bmname(bitmap));
644 bmname(bitmap)); 615 goto out;
645 goto out; 616 }
646 } 617 events = le64_to_cpu(sb->events);
647 events = le64_to_cpu(sb->events); 618 if (events < bitmap->mddev->events) {
648 if (events < bitmap->mddev->events) { 619 printk(KERN_INFO
649 printk(KERN_INFO "%s: bitmap file is out of date (%llu < %llu) " 620 "%s: bitmap file is out of date (%llu < %llu) "
650 "-- forcing full recovery\n", bmname(bitmap), events, 621 "-- forcing full recovery\n",
651 (unsigned long long) bitmap->mddev->events); 622 bmname(bitmap), events,
652 sb->state |= cpu_to_le32(BITMAP_STALE); 623 (unsigned long long) bitmap->mddev->events);
624 sb->state |= cpu_to_le32(BITMAP_STALE);
625 }
653 } 626 }
654success: 627
655 /* assign fields using values from superblock */ 628 /* assign fields using values from superblock */
656 bitmap->mddev->bitmap_info.chunksize = chunksize; 629 bitmap->mddev->bitmap_info.chunksize = chunksize;
657 bitmap->mddev->bitmap_info.daemon_sleep = daemon_sleep; 630 bitmap->mddev->bitmap_info.daemon_sleep = daemon_sleep;
@@ -680,15 +653,10 @@ static int bitmap_mask_state(struct bitmap *bitmap, enum bitmap_state bits,
680 enum bitmap_mask_op op) 653 enum bitmap_mask_op op)
681{ 654{
682 bitmap_super_t *sb; 655 bitmap_super_t *sb;
683 unsigned long flags;
684 int old; 656 int old;
685 657
686 spin_lock_irqsave(&bitmap->lock, flags); 658 if (!bitmap->sb_page) /* can't set the state */
687 if (!bitmap->sb_page) { /* can't set the state */
688 spin_unlock_irqrestore(&bitmap->lock, flags);
689 return 0; 659 return 0;
690 }
691 spin_unlock_irqrestore(&bitmap->lock, flags);
692 sb = kmap_atomic(bitmap->sb_page); 660 sb = kmap_atomic(bitmap->sb_page);
693 old = le32_to_cpu(sb->state) & bits; 661 old = le32_to_cpu(sb->state) & bits;
694 switch (op) { 662 switch (op) {
@@ -870,7 +838,7 @@ static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block)
870 unsigned long bit; 838 unsigned long bit;
871 struct page *page; 839 struct page *page;
872 void *kaddr; 840 void *kaddr;
873 unsigned long chunk = block >> CHUNK_BLOCK_SHIFT(bitmap); 841 unsigned long chunk = block >> bitmap->chunkshift;
874 842
875 if (!bitmap->filemap) 843 if (!bitmap->filemap)
876 return; 844 return;
@@ -1069,10 +1037,10 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
1069 kunmap_atomic(paddr); 1037 kunmap_atomic(paddr);
1070 if (b) { 1038 if (b) {
1071 /* if the disk bit is set, set the memory bit */ 1039 /* if the disk bit is set, set the memory bit */
1072 int needed = ((sector_t)(i+1) << (CHUNK_BLOCK_SHIFT(bitmap)) 1040 int needed = ((sector_t)(i+1) << bitmap->chunkshift
1073 >= start); 1041 >= start);
1074 bitmap_set_memory_bits(bitmap, 1042 bitmap_set_memory_bits(bitmap,
1075 (sector_t)i << CHUNK_BLOCK_SHIFT(bitmap), 1043 (sector_t)i << bitmap->chunkshift,
1076 needed); 1044 needed);
1077 bit_cnt++; 1045 bit_cnt++;
1078 } 1046 }
@@ -1116,7 +1084,7 @@ void bitmap_write_all(struct bitmap *bitmap)
1116 1084
1117static void bitmap_count_page(struct bitmap *bitmap, sector_t offset, int inc) 1085static void bitmap_count_page(struct bitmap *bitmap, sector_t offset, int inc)
1118{ 1086{
1119 sector_t chunk = offset >> CHUNK_BLOCK_SHIFT(bitmap); 1087 sector_t chunk = offset >> bitmap->chunkshift;
1120 unsigned long page = chunk >> PAGE_COUNTER_SHIFT; 1088 unsigned long page = chunk >> PAGE_COUNTER_SHIFT;
1121 bitmap->bp[page].count += inc; 1089 bitmap->bp[page].count += inc;
1122 bitmap_checkfree(bitmap, page); 1090 bitmap_checkfree(bitmap, page);
@@ -1222,7 +1190,7 @@ void bitmap_daemon_work(struct mddev *mddev)
1222 bitmap->allclean = 0; 1190 bitmap->allclean = 0;
1223 } 1191 }
1224 bmc = bitmap_get_counter(bitmap, 1192 bmc = bitmap_get_counter(bitmap,
1225 (sector_t)j << CHUNK_BLOCK_SHIFT(bitmap), 1193 (sector_t)j << bitmap->chunkshift,
1226 &blocks, 0); 1194 &blocks, 0);
1227 if (!bmc) 1195 if (!bmc)
1228 j |= PAGE_COUNTER_MASK; 1196 j |= PAGE_COUNTER_MASK;
@@ -1231,7 +1199,7 @@ void bitmap_daemon_work(struct mddev *mddev)
1231 /* we can clear the bit */ 1199 /* we can clear the bit */
1232 *bmc = 0; 1200 *bmc = 0;
1233 bitmap_count_page(bitmap, 1201 bitmap_count_page(bitmap,
1234 (sector_t)j << CHUNK_BLOCK_SHIFT(bitmap), 1202 (sector_t)j << bitmap->chunkshift,
1235 -1); 1203 -1);
1236 1204
1237 /* clear the bit */ 1205 /* clear the bit */
@@ -1285,7 +1253,7 @@ __acquires(bitmap->lock)
1285 * The lock must have been taken with interrupts enabled. 1253 * The lock must have been taken with interrupts enabled.
1286 * If !create, we don't release the lock. 1254 * If !create, we don't release the lock.
1287 */ 1255 */
1288 sector_t chunk = offset >> CHUNK_BLOCK_SHIFT(bitmap); 1256 sector_t chunk = offset >> bitmap->chunkshift;
1289 unsigned long page = chunk >> PAGE_COUNTER_SHIFT; 1257 unsigned long page = chunk >> PAGE_COUNTER_SHIFT;
1290 unsigned long pageoff = (chunk & PAGE_COUNTER_MASK) << COUNTER_BYTE_SHIFT; 1258 unsigned long pageoff = (chunk & PAGE_COUNTER_MASK) << COUNTER_BYTE_SHIFT;
1291 sector_t csize; 1259 sector_t csize;
@@ -1295,10 +1263,10 @@ __acquires(bitmap->lock)
1295 1263
1296 if (bitmap->bp[page].hijacked || 1264 if (bitmap->bp[page].hijacked ||
1297 bitmap->bp[page].map == NULL) 1265 bitmap->bp[page].map == NULL)
1298 csize = ((sector_t)1) << (CHUNK_BLOCK_SHIFT(bitmap) + 1266 csize = ((sector_t)1) << (bitmap->chunkshift +
1299 PAGE_COUNTER_SHIFT - 1); 1267 PAGE_COUNTER_SHIFT - 1);
1300 else 1268 else
1301 csize = ((sector_t)1) << (CHUNK_BLOCK_SHIFT(bitmap)); 1269 csize = ((sector_t)1) << bitmap->chunkshift;
1302 *blocks = csize - (offset & (csize - 1)); 1270 *blocks = csize - (offset & (csize - 1));
1303 1271
1304 if (err < 0) 1272 if (err < 0)
@@ -1424,7 +1392,7 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto
1424 set_page_attr(bitmap, 1392 set_page_attr(bitmap,
1425 filemap_get_page( 1393 filemap_get_page(
1426 bitmap, 1394 bitmap,
1427 offset >> CHUNK_BLOCK_SHIFT(bitmap)), 1395 offset >> bitmap->chunkshift),
1428 BITMAP_PAGE_PENDING); 1396 BITMAP_PAGE_PENDING);
1429 bitmap->allclean = 0; 1397 bitmap->allclean = 0;
1430 } 1398 }
@@ -1512,7 +1480,7 @@ void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, i
1512 else { 1480 else {
1513 if (*bmc <= 2) { 1481 if (*bmc <= 2) {
1514 set_page_attr(bitmap, 1482 set_page_attr(bitmap,
1515 filemap_get_page(bitmap, offset >> CHUNK_BLOCK_SHIFT(bitmap)), 1483 filemap_get_page(bitmap, offset >> bitmap->chunkshift),
1516 BITMAP_PAGE_PENDING); 1484 BITMAP_PAGE_PENDING);
1517 bitmap->allclean = 0; 1485 bitmap->allclean = 0;
1518 } 1486 }
@@ -1559,7 +1527,7 @@ void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector)
1559 1527
1560 bitmap->mddev->curr_resync_completed = sector; 1528 bitmap->mddev->curr_resync_completed = sector;
1561 set_bit(MD_CHANGE_CLEAN, &bitmap->mddev->flags); 1529 set_bit(MD_CHANGE_CLEAN, &bitmap->mddev->flags);
1562 sector &= ~((1ULL << CHUNK_BLOCK_SHIFT(bitmap)) - 1); 1530 sector &= ~((1ULL << bitmap->chunkshift) - 1);
1563 s = 0; 1531 s = 0;
1564 while (s < sector && s < bitmap->mddev->resync_max_sectors) { 1532 while (s < sector && s < bitmap->mddev->resync_max_sectors) {
1565 bitmap_end_sync(bitmap, s, &blocks, 0); 1533 bitmap_end_sync(bitmap, s, &blocks, 0);
@@ -1589,7 +1557,7 @@ static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int n
1589 struct page *page; 1557 struct page *page;
1590 *bmc = 2 | (needed ? NEEDED_MASK : 0); 1558 *bmc = 2 | (needed ? NEEDED_MASK : 0);
1591 bitmap_count_page(bitmap, offset, 1); 1559 bitmap_count_page(bitmap, offset, 1);
1592 page = filemap_get_page(bitmap, offset >> CHUNK_BLOCK_SHIFT(bitmap)); 1560 page = filemap_get_page(bitmap, offset >> bitmap->chunkshift);
1593 set_page_attr(bitmap, page, BITMAP_PAGE_PENDING); 1561 set_page_attr(bitmap, page, BITMAP_PAGE_PENDING);
1594 bitmap->allclean = 0; 1562 bitmap->allclean = 0;
1595 } 1563 }
@@ -1602,7 +1570,7 @@ void bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long e)
1602 unsigned long chunk; 1570 unsigned long chunk;
1603 1571
1604 for (chunk = s; chunk <= e; chunk++) { 1572 for (chunk = s; chunk <= e; chunk++) {
1605 sector_t sec = (sector_t)chunk << CHUNK_BLOCK_SHIFT(bitmap); 1573 sector_t sec = (sector_t)chunk << bitmap->chunkshift;
1606 bitmap_set_memory_bits(bitmap, sec, 1); 1574 bitmap_set_memory_bits(bitmap, sec, 1);
1607 spin_lock_irq(&bitmap->lock); 1575 spin_lock_irq(&bitmap->lock);
1608 bitmap_file_set_bit(bitmap, sec); 1576 bitmap_file_set_bit(bitmap, sec);
@@ -1759,11 +1727,12 @@ int bitmap_create(struct mddev *mddev)
1759 goto error; 1727 goto error;
1760 1728
1761 bitmap->daemon_lastrun = jiffies; 1729 bitmap->daemon_lastrun = jiffies;
1762 bitmap->chunkshift = ffz(~mddev->bitmap_info.chunksize); 1730 bitmap->chunkshift = (ffz(~mddev->bitmap_info.chunksize)
1731 - BITMAP_BLOCK_SHIFT);
1763 1732
1764 /* now that chunksize and chunkshift are set, we can use these macros */ 1733 /* chunkshift is set: round blocks up by (sectors per chunk - 1), i.e. (1 << chunkshift) - 1 */
1765 chunks = (blocks + CHUNK_BLOCK_RATIO(bitmap) - 1) >> 1734 chunks = (blocks + ((sector_t)1 << bitmap->chunkshift) - 1) >>
1766 CHUNK_BLOCK_SHIFT(bitmap); 1735 bitmap->chunkshift;
1767 pages = (chunks + PAGE_COUNTER_RATIO - 1) / PAGE_COUNTER_RATIO; 1736 pages = (chunks + PAGE_COUNTER_RATIO - 1) / PAGE_COUNTER_RATIO;
1768 1737
1769 BUG_ON(!pages); 1738 BUG_ON(!pages);
@@ -1836,6 +1805,33 @@ out:
1836} 1805}
1837EXPORT_SYMBOL_GPL(bitmap_load); 1806EXPORT_SYMBOL_GPL(bitmap_load);
1838 1807
1808void bitmap_status(struct seq_file *seq, struct bitmap *bitmap)
1809{
1810 unsigned long chunk_kb;
1811 unsigned long flags;
1812
1813 if (!bitmap)
1814 return;
1815
1816 spin_lock_irqsave(&bitmap->lock, flags);
1817 chunk_kb = bitmap->mddev->bitmap_info.chunksize >> 10;
1818 seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], "
1819 "%lu%s chunk",
1820 bitmap->pages - bitmap->missing_pages,
1821 bitmap->pages,
1822 (bitmap->pages - bitmap->missing_pages)
1823 << (PAGE_SHIFT - 10),
1824 chunk_kb ? chunk_kb : bitmap->mddev->bitmap_info.chunksize,
1825 chunk_kb ? "KB" : "B");
1826 if (bitmap->file) {
1827 seq_printf(seq, ", file: ");
1828 seq_path(seq, &bitmap->file->f_path, " \t\n");
1829 }
1830
1831 seq_printf(seq, "\n");
1832 spin_unlock_irqrestore(&bitmap->lock, flags);
1833}
1834
1839static ssize_t 1835static ssize_t
1840location_show(struct mddev *mddev, char *page) 1836location_show(struct mddev *mddev, char *page)
1841{ 1837{
@@ -1904,6 +1900,8 @@ location_store(struct mddev *mddev, const char *buf, size_t len)
1904 if (mddev->pers) { 1900 if (mddev->pers) {
1905 mddev->pers->quiesce(mddev, 1); 1901 mddev->pers->quiesce(mddev, 1);
1906 rv = bitmap_create(mddev); 1902 rv = bitmap_create(mddev);
1903 if (!rv)
1904 rv = bitmap_load(mddev);
1907 if (rv) { 1905 if (rv) {
1908 bitmap_destroy(mddev); 1906 bitmap_destroy(mddev);
1909 mddev->bitmap_info.offset = 0; 1907 mddev->bitmap_info.offset = 0;
diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h
index a15436dd9b3e..55ca5aec84e4 100644
--- a/drivers/md/bitmap.h
+++ b/drivers/md/bitmap.h
@@ -13,8 +13,6 @@
13#define BITMAP_MAJOR_HI 4 13#define BITMAP_MAJOR_HI 4
14#define BITMAP_MAJOR_HOSTENDIAN 3 14#define BITMAP_MAJOR_HOSTENDIAN 3
15 15
16#define BITMAP_MINOR 39
17
18/* 16/*
19 * in-memory bitmap: 17 * in-memory bitmap:
20 * 18 *
@@ -101,21 +99,10 @@ typedef __u16 bitmap_counter_t;
101/* same, except a mask value for more efficient bitops */ 99/* same, except a mask value for more efficient bitops */
102#define PAGE_COUNTER_MASK (PAGE_COUNTER_RATIO - 1) 100#define PAGE_COUNTER_MASK (PAGE_COUNTER_RATIO - 1)
103 101
104#define BITMAP_BLOCK_SIZE 512
105#define BITMAP_BLOCK_SHIFT 9 102#define BITMAP_BLOCK_SHIFT 9
106 103
107/* how many blocks per chunk? (this is variable) */ 104/* how many blocks per chunk? (this is variable) */
108#define CHUNK_BLOCK_RATIO(bitmap) ((bitmap)->mddev->bitmap_info.chunksize >> BITMAP_BLOCK_SHIFT) 105#define CHUNK_BLOCK_RATIO(bitmap) ((bitmap)->mddev->bitmap_info.chunksize >> BITMAP_BLOCK_SHIFT)
109#define CHUNK_BLOCK_SHIFT(bitmap) ((bitmap)->chunkshift - BITMAP_BLOCK_SHIFT)
110#define CHUNK_BLOCK_MASK(bitmap) (CHUNK_BLOCK_RATIO(bitmap) - 1)
111
112/* when hijacked, the counters and bits represent even larger "chunks" */
113/* there will be 1024 chunks represented by each counter in the page pointers */
114#define PAGEPTR_BLOCK_RATIO(bitmap) \
115 (CHUNK_BLOCK_RATIO(bitmap) << PAGE_COUNTER_SHIFT >> 1)
116#define PAGEPTR_BLOCK_SHIFT(bitmap) \
117 (CHUNK_BLOCK_SHIFT(bitmap) + PAGE_COUNTER_SHIFT - 1)
118#define PAGEPTR_BLOCK_MASK(bitmap) (PAGEPTR_BLOCK_RATIO(bitmap) - 1)
119 106
120#endif 107#endif
121 108
@@ -181,12 +168,6 @@ struct bitmap_page {
181 unsigned int count:31; 168 unsigned int count:31;
182}; 169};
183 170
184/* keep track of bitmap file pages that have pending writes on them */
185struct page_list {
186 struct list_head list;
187 struct page *page;
188};
189
190/* the main bitmap structure - one per mddev */ 171/* the main bitmap structure - one per mddev */
191struct bitmap { 172struct bitmap {
192 struct bitmap_page *bp; 173 struct bitmap_page *bp;
@@ -196,7 +177,7 @@ struct bitmap {
196 struct mddev *mddev; /* the md device that the bitmap is for */ 177 struct mddev *mddev; /* the md device that the bitmap is for */
197 178
198 /* bitmap chunksize -- how much data does each bit represent? */ 179 /* bitmap chunksize -- how much data does each bit represent? */
199 unsigned long chunkshift; /* chunksize = 2^chunkshift (for bitops) */ 180 unsigned long chunkshift; /* chunksize = 2^(chunkshift+9) (for bitops) */
200 unsigned long chunks; /* total number of data chunks for the array */ 181 unsigned long chunks; /* total number of data chunks for the array */
201 182
202 __u64 events_cleared; 183 __u64 events_cleared;
@@ -245,6 +226,7 @@ void bitmap_destroy(struct mddev *mddev);
245 226
246void bitmap_print_sb(struct bitmap *bitmap); 227void bitmap_print_sb(struct bitmap *bitmap);
247void bitmap_update_sb(struct bitmap *bitmap); 228void bitmap_update_sb(struct bitmap *bitmap);
229void bitmap_status(struct seq_file *seq, struct bitmap *bitmap);
248 230
249int bitmap_setallbits(struct bitmap *bitmap); 231int bitmap_setallbits(struct bitmap *bitmap);
250void bitmap_write_all(struct bitmap *bitmap); 232void bitmap_write_all(struct bitmap *bitmap);
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 787022c18187..c5a875d7b882 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -615,14 +615,14 @@ static int read_disk_sb(struct md_rdev *rdev, int size)
615 615
616static void super_sync(struct mddev *mddev, struct md_rdev *rdev) 616static void super_sync(struct mddev *mddev, struct md_rdev *rdev)
617{ 617{
618 struct md_rdev *r, *t; 618 struct md_rdev *r;
619 uint64_t failed_devices; 619 uint64_t failed_devices;
620 struct dm_raid_superblock *sb; 620 struct dm_raid_superblock *sb;
621 621
622 sb = page_address(rdev->sb_page); 622 sb = page_address(rdev->sb_page);
623 failed_devices = le64_to_cpu(sb->failed_devices); 623 failed_devices = le64_to_cpu(sb->failed_devices);
624 624
625 rdev_for_each(r, t, mddev) 625 rdev_for_each(r, mddev)
626 if ((r->raid_disk >= 0) && test_bit(Faulty, &r->flags)) 626 if ((r->raid_disk >= 0) && test_bit(Faulty, &r->flags))
627 failed_devices |= (1ULL << r->raid_disk); 627 failed_devices |= (1ULL << r->raid_disk);
628 628
@@ -707,7 +707,7 @@ static int super_init_validation(struct mddev *mddev, struct md_rdev *rdev)
707 struct dm_raid_superblock *sb; 707 struct dm_raid_superblock *sb;
708 uint32_t new_devs = 0; 708 uint32_t new_devs = 0;
709 uint32_t rebuilds = 0; 709 uint32_t rebuilds = 0;
710 struct md_rdev *r, *t; 710 struct md_rdev *r;
711 struct dm_raid_superblock *sb2; 711 struct dm_raid_superblock *sb2;
712 712
713 sb = page_address(rdev->sb_page); 713 sb = page_address(rdev->sb_page);
@@ -750,7 +750,7 @@ static int super_init_validation(struct mddev *mddev, struct md_rdev *rdev)
750 * case the In_sync bit will /not/ be set and 750 * case the In_sync bit will /not/ be set and
751 * recovery_cp must be MaxSector. 751 * recovery_cp must be MaxSector.
752 */ 752 */
753 rdev_for_each(r, t, mddev) { 753 rdev_for_each(r, mddev) {
754 if (!test_bit(In_sync, &r->flags)) { 754 if (!test_bit(In_sync, &r->flags)) {
755 DMINFO("Device %d specified for rebuild: " 755 DMINFO("Device %d specified for rebuild: "
756 "Clearing superblock", r->raid_disk); 756 "Clearing superblock", r->raid_disk);
@@ -782,7 +782,7 @@ static int super_init_validation(struct mddev *mddev, struct md_rdev *rdev)
782 * Now we set the Faulty bit for those devices that are 782 * Now we set the Faulty bit for those devices that are
783 * recorded in the superblock as failed. 783 * recorded in the superblock as failed.
784 */ 784 */
785 rdev_for_each(r, t, mddev) { 785 rdev_for_each(r, mddev) {
786 if (!r->sb_page) 786 if (!r->sb_page)
787 continue; 787 continue;
788 sb2 = page_address(r->sb_page); 788 sb2 = page_address(r->sb_page);
@@ -855,11 +855,11 @@ static int super_validate(struct mddev *mddev, struct md_rdev *rdev)
855static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs) 855static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
856{ 856{
857 int ret; 857 int ret;
858 struct md_rdev *rdev, *freshest, *tmp; 858 struct md_rdev *rdev, *freshest;
859 struct mddev *mddev = &rs->md; 859 struct mddev *mddev = &rs->md;
860 860
861 freshest = NULL; 861 freshest = NULL;
862 rdev_for_each(rdev, tmp, mddev) { 862 rdev_for_each(rdev, mddev) {
863 if (!rdev->meta_bdev) 863 if (!rdev->meta_bdev)
864 continue; 864 continue;
865 865
@@ -888,7 +888,7 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
888 if (super_validate(mddev, freshest)) 888 if (super_validate(mddev, freshest))
889 return -EINVAL; 889 return -EINVAL;
890 890
891 rdev_for_each(rdev, tmp, mddev) 891 rdev_for_each(rdev, mddev)
892 if ((rdev != freshest) && super_validate(mddev, rdev)) 892 if ((rdev != freshest) && super_validate(mddev, rdev))
893 return -EINVAL; 893 return -EINVAL;
894 894
diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c
index feb2c3c7bb44..45135f69509c 100644
--- a/drivers/md/faulty.c
+++ b/drivers/md/faulty.c
@@ -315,7 +315,7 @@ static int run(struct mddev *mddev)
315 } 315 }
316 conf->nfaults = 0; 316 conf->nfaults = 0;
317 317
318 list_for_each_entry(rdev, &mddev->disks, same_set) 318 rdev_for_each(rdev, mddev)
319 conf->rdev = rdev; 319 conf->rdev = rdev;
320 320
321 md_set_array_sectors(mddev, faulty_size(mddev, 0, 0)); 321 md_set_array_sectors(mddev, faulty_size(mddev, 0, 0));
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 627456542fb3..b0fcc7d02adb 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -68,10 +68,19 @@ static int linear_mergeable_bvec(struct request_queue *q,
68 struct dev_info *dev0; 68 struct dev_info *dev0;
69 unsigned long maxsectors, bio_sectors = bvm->bi_size >> 9; 69 unsigned long maxsectors, bio_sectors = bvm->bi_size >> 9;
70 sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); 70 sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
71 int maxbytes = biovec->bv_len;
72 struct request_queue *subq;
71 73
72 rcu_read_lock(); 74 rcu_read_lock();
73 dev0 = which_dev(mddev, sector); 75 dev0 = which_dev(mddev, sector);
74 maxsectors = dev0->end_sector - sector; 76 maxsectors = dev0->end_sector - sector;
77 subq = bdev_get_queue(dev0->rdev->bdev);
78 if (subq->merge_bvec_fn) {
79 bvm->bi_bdev = dev0->rdev->bdev;
80 bvm->bi_sector -= dev0->end_sector - dev0->rdev->sectors;
81 maxbytes = min(maxbytes, subq->merge_bvec_fn(subq, bvm,
82 biovec));
83 }
75 rcu_read_unlock(); 84 rcu_read_unlock();
76 85
77 if (maxsectors < bio_sectors) 86 if (maxsectors < bio_sectors)
@@ -80,12 +89,12 @@ static int linear_mergeable_bvec(struct request_queue *q,
80 maxsectors -= bio_sectors; 89 maxsectors -= bio_sectors;
81 90
82 if (maxsectors <= (PAGE_SIZE >> 9 ) && bio_sectors == 0) 91 if (maxsectors <= (PAGE_SIZE >> 9 ) && bio_sectors == 0)
83 return biovec->bv_len; 92 return maxbytes;
84 /* The bytes available at this offset could be really big, 93
85 * so we cap at 2^31 to avoid overflow */ 94 if (maxsectors > (maxbytes >> 9))
86 if (maxsectors > (1 << (31-9))) 95 return maxbytes;
87 return 1<<31; 96 else
88 return maxsectors << 9; 97 return maxsectors << 9;
89} 98}
90 99
91static int linear_congested(void *data, int bits) 100static int linear_congested(void *data, int bits)
@@ -138,7 +147,7 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks)
138 cnt = 0; 147 cnt = 0;
139 conf->array_sectors = 0; 148 conf->array_sectors = 0;
140 149
141 list_for_each_entry(rdev, &mddev->disks, same_set) { 150 rdev_for_each(rdev, mddev) {
142 int j = rdev->raid_disk; 151 int j = rdev->raid_disk;
143 struct dev_info *disk = conf->disks + j; 152 struct dev_info *disk = conf->disks + j;
144 sector_t sectors; 153 sector_t sectors;
@@ -158,15 +167,6 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks)
158 167
159 disk_stack_limits(mddev->gendisk, rdev->bdev, 168 disk_stack_limits(mddev->gendisk, rdev->bdev,
160 rdev->data_offset << 9); 169 rdev->data_offset << 9);
161 /* as we don't honour merge_bvec_fn, we must never risk
162 * violating it, so limit max_segments to 1 lying within
163 * a single page.
164 */
165 if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
166 blk_queue_max_segments(mddev->queue, 1);
167 blk_queue_segment_boundary(mddev->queue,
168 PAGE_CACHE_SIZE - 1);
169 }
170 170
171 conf->array_sectors += rdev->sectors; 171 conf->array_sectors += rdev->sectors;
172 cnt++; 172 cnt++;
diff --git a/drivers/md/md.c b/drivers/md/md.c
index ce88755baf4a..b572e1e386ce 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -439,7 +439,7 @@ static void submit_flushes(struct work_struct *ws)
439 INIT_WORK(&mddev->flush_work, md_submit_flush_data); 439 INIT_WORK(&mddev->flush_work, md_submit_flush_data);
440 atomic_set(&mddev->flush_pending, 1); 440 atomic_set(&mddev->flush_pending, 1);
441 rcu_read_lock(); 441 rcu_read_lock();
442 list_for_each_entry_rcu(rdev, &mddev->disks, same_set) 442 rdev_for_each_rcu(rdev, mddev)
443 if (rdev->raid_disk >= 0 && 443 if (rdev->raid_disk >= 0 &&
444 !test_bit(Faulty, &rdev->flags)) { 444 !test_bit(Faulty, &rdev->flags)) {
445 /* Take two references, one is dropped 445 /* Take two references, one is dropped
@@ -749,7 +749,7 @@ static struct md_rdev * find_rdev_nr(struct mddev *mddev, int nr)
749{ 749{
750 struct md_rdev *rdev; 750 struct md_rdev *rdev;
751 751
752 list_for_each_entry(rdev, &mddev->disks, same_set) 752 rdev_for_each(rdev, mddev)
753 if (rdev->desc_nr == nr) 753 if (rdev->desc_nr == nr)
754 return rdev; 754 return rdev;
755 755
@@ -760,7 +760,7 @@ static struct md_rdev * find_rdev(struct mddev * mddev, dev_t dev)
760{ 760{
761 struct md_rdev *rdev; 761 struct md_rdev *rdev;
762 762
763 list_for_each_entry(rdev, &mddev->disks, same_set) 763 rdev_for_each(rdev, mddev)
764 if (rdev->bdev->bd_dev == dev) 764 if (rdev->bdev->bd_dev == dev)
765 return rdev; 765 return rdev;
766 766
@@ -1342,7 +1342,7 @@ static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev)
1342 sb->state |= (1<<MD_SB_BITMAP_PRESENT); 1342 sb->state |= (1<<MD_SB_BITMAP_PRESENT);
1343 1343
1344 sb->disks[0].state = (1<<MD_DISK_REMOVED); 1344 sb->disks[0].state = (1<<MD_DISK_REMOVED);
1345 list_for_each_entry(rdev2, &mddev->disks, same_set) { 1345 rdev_for_each(rdev2, mddev) {
1346 mdp_disk_t *d; 1346 mdp_disk_t *d;
1347 int desc_nr; 1347 int desc_nr;
1348 int is_active = test_bit(In_sync, &rdev2->flags); 1348 int is_active = test_bit(In_sync, &rdev2->flags);
@@ -1805,18 +1805,18 @@ retry:
1805 | BB_LEN(internal_bb)); 1805 | BB_LEN(internal_bb));
1806 *bbp++ = cpu_to_le64(store_bb); 1806 *bbp++ = cpu_to_le64(store_bb);
1807 } 1807 }
1808 bb->changed = 0;
1808 if (read_seqretry(&bb->lock, seq)) 1809 if (read_seqretry(&bb->lock, seq))
1809 goto retry; 1810 goto retry;
1810 1811
1811 bb->sector = (rdev->sb_start + 1812 bb->sector = (rdev->sb_start +
1812 (int)le32_to_cpu(sb->bblog_offset)); 1813 (int)le32_to_cpu(sb->bblog_offset));
1813 bb->size = le16_to_cpu(sb->bblog_size); 1814 bb->size = le16_to_cpu(sb->bblog_size);
1814 bb->changed = 0;
1815 } 1815 }
1816 } 1816 }
1817 1817
1818 max_dev = 0; 1818 max_dev = 0;
1819 list_for_each_entry(rdev2, &mddev->disks, same_set) 1819 rdev_for_each(rdev2, mddev)
1820 if (rdev2->desc_nr+1 > max_dev) 1820 if (rdev2->desc_nr+1 > max_dev)
1821 max_dev = rdev2->desc_nr+1; 1821 max_dev = rdev2->desc_nr+1;
1822 1822
@@ -1833,7 +1833,7 @@ retry:
1833 for (i=0; i<max_dev;i++) 1833 for (i=0; i<max_dev;i++)
1834 sb->dev_roles[i] = cpu_to_le16(0xfffe); 1834 sb->dev_roles[i] = cpu_to_le16(0xfffe);
1835 1835
1836 list_for_each_entry(rdev2, &mddev->disks, same_set) { 1836 rdev_for_each(rdev2, mddev) {
1837 i = rdev2->desc_nr; 1837 i = rdev2->desc_nr;
1838 if (test_bit(Faulty, &rdev2->flags)) 1838 if (test_bit(Faulty, &rdev2->flags))
1839 sb->dev_roles[i] = cpu_to_le16(0xfffe); 1839 sb->dev_roles[i] = cpu_to_le16(0xfffe);
@@ -1948,7 +1948,7 @@ int md_integrity_register(struct mddev *mddev)
1948 return 0; /* nothing to do */ 1948 return 0; /* nothing to do */
1949 if (!mddev->gendisk || blk_get_integrity(mddev->gendisk)) 1949 if (!mddev->gendisk || blk_get_integrity(mddev->gendisk))
1950 return 0; /* shouldn't register, or already is */ 1950 return 0; /* shouldn't register, or already is */
1951 list_for_each_entry(rdev, &mddev->disks, same_set) { 1951 rdev_for_each(rdev, mddev) {
1952 /* skip spares and non-functional disks */ 1952 /* skip spares and non-functional disks */
1953 if (test_bit(Faulty, &rdev->flags)) 1953 if (test_bit(Faulty, &rdev->flags))
1954 continue; 1954 continue;
@@ -2175,7 +2175,7 @@ static void export_array(struct mddev *mddev)
2175{ 2175{
2176 struct md_rdev *rdev, *tmp; 2176 struct md_rdev *rdev, *tmp;
2177 2177
2178 rdev_for_each(rdev, tmp, mddev) { 2178 rdev_for_each_safe(rdev, tmp, mddev) {
2179 if (!rdev->mddev) { 2179 if (!rdev->mddev) {
2180 MD_BUG(); 2180 MD_BUG();
2181 continue; 2181 continue;
@@ -2307,11 +2307,11 @@ static void md_print_devices(void)
2307 bitmap_print_sb(mddev->bitmap); 2307 bitmap_print_sb(mddev->bitmap);
2308 else 2308 else
2309 printk("%s: ", mdname(mddev)); 2309 printk("%s: ", mdname(mddev));
2310 list_for_each_entry(rdev, &mddev->disks, same_set) 2310 rdev_for_each(rdev, mddev)
2311 printk("<%s>", bdevname(rdev->bdev,b)); 2311 printk("<%s>", bdevname(rdev->bdev,b));
2312 printk("\n"); 2312 printk("\n");
2313 2313
2314 list_for_each_entry(rdev, &mddev->disks, same_set) 2314 rdev_for_each(rdev, mddev)
2315 print_rdev(rdev, mddev->major_version); 2315 print_rdev(rdev, mddev->major_version);
2316 } 2316 }
2317 printk("md: **********************************\n"); 2317 printk("md: **********************************\n");
@@ -2328,7 +2328,7 @@ static void sync_sbs(struct mddev * mddev, int nospares)
2328 * with the rest of the array) 2328 * with the rest of the array)
2329 */ 2329 */
2330 struct md_rdev *rdev; 2330 struct md_rdev *rdev;
2331 list_for_each_entry(rdev, &mddev->disks, same_set) { 2331 rdev_for_each(rdev, mddev) {
2332 if (rdev->sb_events == mddev->events || 2332 if (rdev->sb_events == mddev->events ||
2333 (nospares && 2333 (nospares &&
2334 rdev->raid_disk < 0 && 2334 rdev->raid_disk < 0 &&
@@ -2351,7 +2351,7 @@ static void md_update_sb(struct mddev * mddev, int force_change)
2351 2351
2352repeat: 2352repeat:
2353 /* First make sure individual recovery_offsets are correct */ 2353 /* First make sure individual recovery_offsets are correct */
2354 list_for_each_entry(rdev, &mddev->disks, same_set) { 2354 rdev_for_each(rdev, mddev) {
2355 if (rdev->raid_disk >= 0 && 2355 if (rdev->raid_disk >= 0 &&
2356 mddev->delta_disks >= 0 && 2356 mddev->delta_disks >= 0 &&
2357 !test_bit(In_sync, &rdev->flags) && 2357 !test_bit(In_sync, &rdev->flags) &&
@@ -2364,8 +2364,9 @@ repeat:
2364 clear_bit(MD_CHANGE_DEVS, &mddev->flags); 2364 clear_bit(MD_CHANGE_DEVS, &mddev->flags);
2365 if (!mddev->external) { 2365 if (!mddev->external) {
2366 clear_bit(MD_CHANGE_PENDING, &mddev->flags); 2366 clear_bit(MD_CHANGE_PENDING, &mddev->flags);
2367 list_for_each_entry(rdev, &mddev->disks, same_set) { 2367 rdev_for_each(rdev, mddev) {
2368 if (rdev->badblocks.changed) { 2368 if (rdev->badblocks.changed) {
2369 rdev->badblocks.changed = 0;
2369 md_ack_all_badblocks(&rdev->badblocks); 2370 md_ack_all_badblocks(&rdev->badblocks);
2370 md_error(mddev, rdev); 2371 md_error(mddev, rdev);
2371 } 2372 }
@@ -2430,7 +2431,7 @@ repeat:
2430 mddev->events --; 2431 mddev->events --;
2431 } 2432 }
2432 2433
2433 list_for_each_entry(rdev, &mddev->disks, same_set) { 2434 rdev_for_each(rdev, mddev) {
2434 if (rdev->badblocks.changed) 2435 if (rdev->badblocks.changed)
2435 any_badblocks_changed++; 2436 any_badblocks_changed++;
2436 if (test_bit(Faulty, &rdev->flags)) 2437 if (test_bit(Faulty, &rdev->flags))
@@ -2444,7 +2445,7 @@ repeat:
2444 mdname(mddev), mddev->in_sync); 2445 mdname(mddev), mddev->in_sync);
2445 2446
2446 bitmap_update_sb(mddev->bitmap); 2447 bitmap_update_sb(mddev->bitmap);
2447 list_for_each_entry(rdev, &mddev->disks, same_set) { 2448 rdev_for_each(rdev, mddev) {
2448 char b[BDEVNAME_SIZE]; 2449 char b[BDEVNAME_SIZE];
2449 2450
2450 if (rdev->sb_loaded != 1) 2451 if (rdev->sb_loaded != 1)
@@ -2493,7 +2494,7 @@ repeat:
2493 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 2494 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
2494 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 2495 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
2495 2496
2496 list_for_each_entry(rdev, &mddev->disks, same_set) { 2497 rdev_for_each(rdev, mddev) {
2497 if (test_and_clear_bit(FaultRecorded, &rdev->flags)) 2498 if (test_and_clear_bit(FaultRecorded, &rdev->flags))
2498 clear_bit(Blocked, &rdev->flags); 2499 clear_bit(Blocked, &rdev->flags);
2499 2500
@@ -2896,7 +2897,7 @@ rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len)
2896 struct md_rdev *rdev2; 2897 struct md_rdev *rdev2;
2897 2898
2898 mddev_lock(mddev); 2899 mddev_lock(mddev);
2899 list_for_each_entry(rdev2, &mddev->disks, same_set) 2900 rdev_for_each(rdev2, mddev)
2900 if (rdev->bdev == rdev2->bdev && 2901 if (rdev->bdev == rdev2->bdev &&
2901 rdev != rdev2 && 2902 rdev != rdev2 &&
2902 overlaps(rdev->data_offset, rdev->sectors, 2903 overlaps(rdev->data_offset, rdev->sectors,
@@ -3193,7 +3194,7 @@ static void analyze_sbs(struct mddev * mddev)
3193 char b[BDEVNAME_SIZE]; 3194 char b[BDEVNAME_SIZE];
3194 3195
3195 freshest = NULL; 3196 freshest = NULL;
3196 rdev_for_each(rdev, tmp, mddev) 3197 rdev_for_each_safe(rdev, tmp, mddev)
3197 switch (super_types[mddev->major_version]. 3198 switch (super_types[mddev->major_version].
3198 load_super(rdev, freshest, mddev->minor_version)) { 3199 load_super(rdev, freshest, mddev->minor_version)) {
3199 case 1: 3200 case 1:
@@ -3214,7 +3215,7 @@ static void analyze_sbs(struct mddev * mddev)
3214 validate_super(mddev, freshest); 3215 validate_super(mddev, freshest);
3215 3216
3216 i = 0; 3217 i = 0;
3217 rdev_for_each(rdev, tmp, mddev) { 3218 rdev_for_each_safe(rdev, tmp, mddev) {
3218 if (mddev->max_disks && 3219 if (mddev->max_disks &&
3219 (rdev->desc_nr >= mddev->max_disks || 3220 (rdev->desc_nr >= mddev->max_disks ||
3220 i > mddev->max_disks)) { 3221 i > mddev->max_disks)) {
@@ -3403,7 +3404,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
3403 return -EINVAL; 3404 return -EINVAL;
3404 } 3405 }
3405 3406
3406 list_for_each_entry(rdev, &mddev->disks, same_set) 3407 rdev_for_each(rdev, mddev)
3407 rdev->new_raid_disk = rdev->raid_disk; 3408 rdev->new_raid_disk = rdev->raid_disk;
3408 3409
3409 /* ->takeover must set new_* and/or delta_disks 3410 /* ->takeover must set new_* and/or delta_disks
@@ -3456,7 +3457,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
3456 mddev->safemode = 0; 3457 mddev->safemode = 0;
3457 } 3458 }
3458 3459
3459 list_for_each_entry(rdev, &mddev->disks, same_set) { 3460 rdev_for_each(rdev, mddev) {
3460 if (rdev->raid_disk < 0) 3461 if (rdev->raid_disk < 0)
3461 continue; 3462 continue;
3462 if (rdev->new_raid_disk >= mddev->raid_disks) 3463 if (rdev->new_raid_disk >= mddev->raid_disks)
@@ -3465,7 +3466,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
3465 continue; 3466 continue;
3466 sysfs_unlink_rdev(mddev, rdev); 3467 sysfs_unlink_rdev(mddev, rdev);
3467 } 3468 }
3468 list_for_each_entry(rdev, &mddev->disks, same_set) { 3469 rdev_for_each(rdev, mddev) {
3469 if (rdev->raid_disk < 0) 3470 if (rdev->raid_disk < 0)
3470 continue; 3471 continue;
3471 if (rdev->new_raid_disk == rdev->raid_disk) 3472 if (rdev->new_raid_disk == rdev->raid_disk)
@@ -4796,7 +4797,7 @@ int md_run(struct mddev *mddev)
4796 * the only valid external interface is through the md 4797 * the only valid external interface is through the md
4797 * device. 4798 * device.
4798 */ 4799 */
4799 list_for_each_entry(rdev, &mddev->disks, same_set) { 4800 rdev_for_each(rdev, mddev) {
4800 if (test_bit(Faulty, &rdev->flags)) 4801 if (test_bit(Faulty, &rdev->flags))
4801 continue; 4802 continue;
4802 sync_blockdev(rdev->bdev); 4803 sync_blockdev(rdev->bdev);
@@ -4867,8 +4868,8 @@ int md_run(struct mddev *mddev)
4867 struct md_rdev *rdev2; 4868 struct md_rdev *rdev2;
4868 int warned = 0; 4869 int warned = 0;
4869 4870
4870 list_for_each_entry(rdev, &mddev->disks, same_set) 4871 rdev_for_each(rdev, mddev)
4871 list_for_each_entry(rdev2, &mddev->disks, same_set) { 4872 rdev_for_each(rdev2, mddev) {
4872 if (rdev < rdev2 && 4873 if (rdev < rdev2 &&
4873 rdev->bdev->bd_contains == 4874 rdev->bdev->bd_contains ==
4874 rdev2->bdev->bd_contains) { 4875 rdev2->bdev->bd_contains) {
@@ -4945,7 +4946,7 @@ int md_run(struct mddev *mddev)
4945 mddev->in_sync = 1; 4946 mddev->in_sync = 1;
4946 smp_wmb(); 4947 smp_wmb();
4947 mddev->ready = 1; 4948 mddev->ready = 1;
4948 list_for_each_entry(rdev, &mddev->disks, same_set) 4949 rdev_for_each(rdev, mddev)
4949 if (rdev->raid_disk >= 0) 4950 if (rdev->raid_disk >= 0)
4950 if (sysfs_link_rdev(mddev, rdev)) 4951 if (sysfs_link_rdev(mddev, rdev))
4951 /* failure here is OK */; 4952 /* failure here is OK */;
@@ -5073,6 +5074,7 @@ static void md_clean(struct mddev *mddev)
5073 mddev->changed = 0; 5074 mddev->changed = 0;
5074 mddev->degraded = 0; 5075 mddev->degraded = 0;
5075 mddev->safemode = 0; 5076 mddev->safemode = 0;
5077 mddev->merge_check_needed = 0;
5076 mddev->bitmap_info.offset = 0; 5078 mddev->bitmap_info.offset = 0;
5077 mddev->bitmap_info.default_offset = 0; 5079 mddev->bitmap_info.default_offset = 0;
5078 mddev->bitmap_info.chunksize = 0; 5080 mddev->bitmap_info.chunksize = 0;
@@ -5175,7 +5177,7 @@ static int do_md_stop(struct mddev * mddev, int mode, int is_open)
5175 /* tell userspace to handle 'inactive' */ 5177 /* tell userspace to handle 'inactive' */
5176 sysfs_notify_dirent_safe(mddev->sysfs_state); 5178 sysfs_notify_dirent_safe(mddev->sysfs_state);
5177 5179
5178 list_for_each_entry(rdev, &mddev->disks, same_set) 5180 rdev_for_each(rdev, mddev)
5179 if (rdev->raid_disk >= 0) 5181 if (rdev->raid_disk >= 0)
5180 sysfs_unlink_rdev(mddev, rdev); 5182 sysfs_unlink_rdev(mddev, rdev);
5181 5183
@@ -5226,7 +5228,7 @@ static void autorun_array(struct mddev *mddev)
5226 5228
5227 printk(KERN_INFO "md: running: "); 5229 printk(KERN_INFO "md: running: ");
5228 5230
5229 list_for_each_entry(rdev, &mddev->disks, same_set) { 5231 rdev_for_each(rdev, mddev) {
5230 char b[BDEVNAME_SIZE]; 5232 char b[BDEVNAME_SIZE];
5231 printk("<%s>", bdevname(rdev->bdev,b)); 5233 printk("<%s>", bdevname(rdev->bdev,b));
5232 } 5234 }
@@ -5356,7 +5358,7 @@ static int get_array_info(struct mddev * mddev, void __user * arg)
5356 struct md_rdev *rdev; 5358 struct md_rdev *rdev;
5357 5359
5358 nr=working=insync=failed=spare=0; 5360 nr=working=insync=failed=spare=0;
5359 list_for_each_entry(rdev, &mddev->disks, same_set) { 5361 rdev_for_each(rdev, mddev) {
5360 nr++; 5362 nr++;
5361 if (test_bit(Faulty, &rdev->flags)) 5363 if (test_bit(Faulty, &rdev->flags))
5362 failed++; 5364 failed++;
@@ -5923,7 +5925,7 @@ static int update_size(struct mddev *mddev, sector_t num_sectors)
5923 * grow, and re-add. 5925 * grow, and re-add.
5924 */ 5926 */
5925 return -EBUSY; 5927 return -EBUSY;
5926 list_for_each_entry(rdev, &mddev->disks, same_set) { 5928 rdev_for_each(rdev, mddev) {
5927 sector_t avail = rdev->sectors; 5929 sector_t avail = rdev->sectors;
5928 5930
5929 if (fit && (num_sectors == 0 || num_sectors > avail)) 5931 if (fit && (num_sectors == 0 || num_sectors > avail))
@@ -6724,7 +6726,6 @@ static int md_seq_show(struct seq_file *seq, void *v)
6724 struct mddev *mddev = v; 6726 struct mddev *mddev = v;
6725 sector_t sectors; 6727 sector_t sectors;
6726 struct md_rdev *rdev; 6728 struct md_rdev *rdev;
6727 struct bitmap *bitmap;
6728 6729
6729 if (v == (void*)1) { 6730 if (v == (void*)1) {
6730 struct md_personality *pers; 6731 struct md_personality *pers;
@@ -6758,7 +6759,7 @@ static int md_seq_show(struct seq_file *seq, void *v)
6758 } 6759 }
6759 6760
6760 sectors = 0; 6761 sectors = 0;
6761 list_for_each_entry(rdev, &mddev->disks, same_set) { 6762 rdev_for_each(rdev, mddev) {
6762 char b[BDEVNAME_SIZE]; 6763 char b[BDEVNAME_SIZE];
6763 seq_printf(seq, " %s[%d]", 6764 seq_printf(seq, " %s[%d]",
6764 bdevname(rdev->bdev,b), rdev->desc_nr); 6765 bdevname(rdev->bdev,b), rdev->desc_nr);
@@ -6812,27 +6813,7 @@ static int md_seq_show(struct seq_file *seq, void *v)
6812 } else 6813 } else
6813 seq_printf(seq, "\n "); 6814 seq_printf(seq, "\n ");
6814 6815
6815 if ((bitmap = mddev->bitmap)) { 6816 bitmap_status(seq, mddev->bitmap);
6816 unsigned long chunk_kb;
6817 unsigned long flags;
6818 spin_lock_irqsave(&bitmap->lock, flags);
6819 chunk_kb = mddev->bitmap_info.chunksize >> 10;
6820 seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], "
6821 "%lu%s chunk",
6822 bitmap->pages - bitmap->missing_pages,
6823 bitmap->pages,
6824 (bitmap->pages - bitmap->missing_pages)
6825 << (PAGE_SHIFT - 10),
6826 chunk_kb ? chunk_kb : mddev->bitmap_info.chunksize,
6827 chunk_kb ? "KB" : "B");
6828 if (bitmap->file) {
6829 seq_printf(seq, ", file: ");
6830 seq_path(seq, &bitmap->file->f_path, " \t\n");
6831 }
6832
6833 seq_printf(seq, "\n");
6834 spin_unlock_irqrestore(&bitmap->lock, flags);
6835 }
6836 6817
6837 seq_printf(seq, "\n"); 6818 seq_printf(seq, "\n");
6838 } 6819 }
@@ -7170,7 +7151,7 @@ void md_do_sync(struct mddev *mddev)
7170 max_sectors = mddev->dev_sectors; 7151 max_sectors = mddev->dev_sectors;
7171 j = MaxSector; 7152 j = MaxSector;
7172 rcu_read_lock(); 7153 rcu_read_lock();
7173 list_for_each_entry_rcu(rdev, &mddev->disks, same_set) 7154 rdev_for_each_rcu(rdev, mddev)
7174 if (rdev->raid_disk >= 0 && 7155 if (rdev->raid_disk >= 0 &&
7175 !test_bit(Faulty, &rdev->flags) && 7156 !test_bit(Faulty, &rdev->flags) &&
7176 !test_bit(In_sync, &rdev->flags) && 7157 !test_bit(In_sync, &rdev->flags) &&
@@ -7342,7 +7323,7 @@ void md_do_sync(struct mddev *mddev)
7342 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 7323 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
7343 mddev->curr_resync = MaxSector; 7324 mddev->curr_resync = MaxSector;
7344 rcu_read_lock(); 7325 rcu_read_lock();
7345 list_for_each_entry_rcu(rdev, &mddev->disks, same_set) 7326 rdev_for_each_rcu(rdev, mddev)
7346 if (rdev->raid_disk >= 0 && 7327 if (rdev->raid_disk >= 0 &&
7347 mddev->delta_disks >= 0 && 7328 mddev->delta_disks >= 0 &&
7348 !test_bit(Faulty, &rdev->flags) && 7329 !test_bit(Faulty, &rdev->flags) &&
@@ -7388,7 +7369,7 @@ static int remove_and_add_spares(struct mddev *mddev)
7388 7369
7389 mddev->curr_resync_completed = 0; 7370 mddev->curr_resync_completed = 0;
7390 7371
7391 list_for_each_entry(rdev, &mddev->disks, same_set) 7372 rdev_for_each(rdev, mddev)
7392 if (rdev->raid_disk >= 0 && 7373 if (rdev->raid_disk >= 0 &&
7393 !test_bit(Blocked, &rdev->flags) && 7374 !test_bit(Blocked, &rdev->flags) &&
7394 (test_bit(Faulty, &rdev->flags) || 7375 (test_bit(Faulty, &rdev->flags) ||
@@ -7406,7 +7387,7 @@ static int remove_and_add_spares(struct mddev *mddev)
7406 "degraded"); 7387 "degraded");
7407 7388
7408 7389
7409 list_for_each_entry(rdev, &mddev->disks, same_set) { 7390 rdev_for_each(rdev, mddev) {
7410 if (rdev->raid_disk >= 0 && 7391 if (rdev->raid_disk >= 0 &&
7411 !test_bit(In_sync, &rdev->flags) && 7392 !test_bit(In_sync, &rdev->flags) &&
7412 !test_bit(Faulty, &rdev->flags)) 7393 !test_bit(Faulty, &rdev->flags))
@@ -7451,7 +7432,7 @@ static void reap_sync_thread(struct mddev *mddev)
7451 * do the superblock for an incrementally recovered device 7432 * do the superblock for an incrementally recovered device
7452 * written out. 7433 * written out.
7453 */ 7434 */
7454 list_for_each_entry(rdev, &mddev->disks, same_set) 7435 rdev_for_each(rdev, mddev)
7455 if (!mddev->degraded || 7436 if (!mddev->degraded ||
7456 test_bit(In_sync, &rdev->flags)) 7437 test_bit(In_sync, &rdev->flags))
7457 rdev->saved_raid_disk = -1; 7438 rdev->saved_raid_disk = -1;
@@ -7529,7 +7510,7 @@ void md_check_recovery(struct mddev *mddev)
7529 * failed devices. 7510 * failed devices.
7530 */ 7511 */
7531 struct md_rdev *rdev; 7512 struct md_rdev *rdev;
7532 list_for_each_entry(rdev, &mddev->disks, same_set) 7513 rdev_for_each(rdev, mddev)
7533 if (rdev->raid_disk >= 0 && 7514 if (rdev->raid_disk >= 0 &&
7534 !test_bit(Blocked, &rdev->flags) && 7515 !test_bit(Blocked, &rdev->flags) &&
7535 test_bit(Faulty, &rdev->flags) && 7516 test_bit(Faulty, &rdev->flags) &&
@@ -8040,7 +8021,7 @@ void md_ack_all_badblocks(struct badblocks *bb)
8040 return; 8021 return;
8041 write_seqlock_irq(&bb->lock); 8022 write_seqlock_irq(&bb->lock);
8042 8023
8043 if (bb->changed == 0) { 8024 if (bb->changed == 0 && bb->unacked_exist) {
8044 u64 *p = bb->page; 8025 u64 *p = bb->page;
8045 int i; 8026 int i;
8046 for (i = 0; i < bb->count ; i++) { 8027 for (i = 0; i < bb->count ; i++) {
@@ -8157,30 +8138,23 @@ static int md_notify_reboot(struct notifier_block *this,
8157 struct mddev *mddev; 8138 struct mddev *mddev;
8158 int need_delay = 0; 8139 int need_delay = 0;
8159 8140
8160 if ((code == SYS_DOWN) || (code == SYS_HALT) || (code == SYS_POWER_OFF)) { 8141 for_each_mddev(mddev, tmp) {
8161 8142 if (mddev_trylock(mddev)) {
8162 printk(KERN_INFO "md: stopping all md devices.\n"); 8143 __md_stop_writes(mddev);
8163 8144 mddev->safemode = 2;
8164 for_each_mddev(mddev, tmp) { 8145 mddev_unlock(mddev);
8165 if (mddev_trylock(mddev)) {
8166 /* Force a switch to readonly even array
8167 * appears to still be in use. Hence
8168 * the '100'.
8169 */
8170 md_set_readonly(mddev, 100);
8171 mddev_unlock(mddev);
8172 }
8173 need_delay = 1;
8174 } 8146 }
8175 /* 8147 need_delay = 1;
8176 * certain more exotic SCSI devices are known to be
8177 * volatile wrt too early system reboots. While the
8178 * right place to handle this issue is the given
8179 * driver, we do want to have a safe RAID driver ...
8180 */
8181 if (need_delay)
8182 mdelay(1000*1);
8183 } 8148 }
8149 /*
8150 * certain more exotic SCSI devices are known to be
8151 * volatile wrt too early system reboots. While the
8152 * right place to handle this issue is the given
8153 * driver, we do want to have a safe RAID driver ...
8154 */
8155 if (need_delay)
8156 mdelay(1000*1);
8157
8184 return NOTIFY_DONE; 8158 return NOTIFY_DONE;
8185} 8159}
8186 8160
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 44c63dfeeb2b..1c2063ccf48e 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -128,6 +128,10 @@ struct md_rdev {
128enum flag_bits { 128enum flag_bits {
129 Faulty, /* device is known to have a fault */ 129 Faulty, /* device is known to have a fault */
130 In_sync, /* device is in_sync with rest of array */ 130 In_sync, /* device is in_sync with rest of array */
131 Unmerged, /* device is being added to array and should
132 * be considered for merge_bvec_fn but not
133 * yet for actual IO
134 */
131 WriteMostly, /* Avoid reading if at all possible */ 135 WriteMostly, /* Avoid reading if at all possible */
132 AutoDetected, /* added by auto-detect */ 136 AutoDetected, /* added by auto-detect */
133 Blocked, /* An error occurred but has not yet 137 Blocked, /* An error occurred but has not yet
@@ -345,6 +349,10 @@ struct mddev {
345 int degraded; /* whether md should consider 349 int degraded; /* whether md should consider
346 * adding a spare 350 * adding a spare
347 */ 351 */
352 int merge_check_needed; /* at least one
353 * member device
354 * has a
355 * merge_bvec_fn */
348 356
349 atomic_t recovery_active; /* blocks scheduled, but not written */ 357 atomic_t recovery_active; /* blocks scheduled, but not written */
350 wait_queue_head_t recovery_wait; 358 wait_queue_head_t recovery_wait;
@@ -519,7 +527,10 @@ static inline void sysfs_unlink_rdev(struct mddev *mddev, struct md_rdev *rdev)
519/* 527/*
520 * iterates through the 'same array disks' ringlist 528 * iterates through the 'same array disks' ringlist
521 */ 529 */
522#define rdev_for_each(rdev, tmp, mddev) \ 530#define rdev_for_each(rdev, mddev) \
531 list_for_each_entry(rdev, &((mddev)->disks), same_set)
532
533#define rdev_for_each_safe(rdev, tmp, mddev) \
523 list_for_each_entry_safe(rdev, tmp, &((mddev)->disks), same_set) 534 list_for_each_entry_safe(rdev, tmp, &((mddev)->disks), same_set)
524 535
525#define rdev_for_each_rcu(rdev, mddev) \ 536#define rdev_for_each_rcu(rdev, mddev) \
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index a222f516660e..9339e67fcc79 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -428,7 +428,7 @@ static int multipath_run (struct mddev *mddev)
428 } 428 }
429 429
430 working_disks = 0; 430 working_disks = 0;
431 list_for_each_entry(rdev, &mddev->disks, same_set) { 431 rdev_for_each(rdev, mddev) {
432 disk_idx = rdev->raid_disk; 432 disk_idx = rdev->raid_disk;
433 if (disk_idx < 0 || 433 if (disk_idx < 0 ||
434 disk_idx >= mddev->raid_disks) 434 disk_idx >= mddev->raid_disks)
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 7294bd115e34..6f31f5596e01 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -91,7 +91,7 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
91 91
92 if (!conf) 92 if (!conf)
93 return -ENOMEM; 93 return -ENOMEM;
94 list_for_each_entry(rdev1, &mddev->disks, same_set) { 94 rdev_for_each(rdev1, mddev) {
95 pr_debug("md/raid0:%s: looking at %s\n", 95 pr_debug("md/raid0:%s: looking at %s\n",
96 mdname(mddev), 96 mdname(mddev),
97 bdevname(rdev1->bdev, b)); 97 bdevname(rdev1->bdev, b));
@@ -102,7 +102,7 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
102 sector_div(sectors, mddev->chunk_sectors); 102 sector_div(sectors, mddev->chunk_sectors);
103 rdev1->sectors = sectors * mddev->chunk_sectors; 103 rdev1->sectors = sectors * mddev->chunk_sectors;
104 104
105 list_for_each_entry(rdev2, &mddev->disks, same_set) { 105 rdev_for_each(rdev2, mddev) {
106 pr_debug("md/raid0:%s: comparing %s(%llu)" 106 pr_debug("md/raid0:%s: comparing %s(%llu)"
107 " with %s(%llu)\n", 107 " with %s(%llu)\n",
108 mdname(mddev), 108 mdname(mddev),
@@ -157,7 +157,7 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
157 smallest = NULL; 157 smallest = NULL;
158 dev = conf->devlist; 158 dev = conf->devlist;
159 err = -EINVAL; 159 err = -EINVAL;
160 list_for_each_entry(rdev1, &mddev->disks, same_set) { 160 rdev_for_each(rdev1, mddev) {
161 int j = rdev1->raid_disk; 161 int j = rdev1->raid_disk;
162 162
163 if (mddev->level == 10) { 163 if (mddev->level == 10) {
@@ -188,16 +188,10 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
188 188
189 disk_stack_limits(mddev->gendisk, rdev1->bdev, 189 disk_stack_limits(mddev->gendisk, rdev1->bdev,
190 rdev1->data_offset << 9); 190 rdev1->data_offset << 9);
191 /* as we don't honour merge_bvec_fn, we must never risk
192 * violating it, so limit ->max_segments to 1, lying within
193 * a single page.
194 */
195 191
196 if (rdev1->bdev->bd_disk->queue->merge_bvec_fn) { 192 if (rdev1->bdev->bd_disk->queue->merge_bvec_fn)
197 blk_queue_max_segments(mddev->queue, 1); 193 conf->has_merge_bvec = 1;
198 blk_queue_segment_boundary(mddev->queue, 194
199 PAGE_CACHE_SIZE - 1);
200 }
201 if (!smallest || (rdev1->sectors < smallest->sectors)) 195 if (!smallest || (rdev1->sectors < smallest->sectors))
202 smallest = rdev1; 196 smallest = rdev1;
203 cnt++; 197 cnt++;
@@ -290,8 +284,64 @@ abort:
290 return err; 284 return err;
291} 285}
292 286
287/* Find the zone which holds a particular offset
288 * Update *sectorp to be an offset in that zone
289 */
290static struct strip_zone *find_zone(struct r0conf *conf,
291 sector_t *sectorp)
292{
293 int i;
294 struct strip_zone *z = conf->strip_zone;
295 sector_t sector = *sectorp;
296
297 for (i = 0; i < conf->nr_strip_zones; i++)
298 if (sector < z[i].zone_end) {
299 if (i)
300 *sectorp = sector - z[i-1].zone_end;
301 return z + i;
302 }
303 BUG();
304}
305
306/*
307 * remaps the bio to the target device. we separate two flows:
308 * a power-of-2 chunk-size flow and a general flow, for the sake of performance
309*/
310static struct md_rdev *map_sector(struct mddev *mddev, struct strip_zone *zone,
311 sector_t sector, sector_t *sector_offset)
312{
313 unsigned int sect_in_chunk;
314 sector_t chunk;
315 struct r0conf *conf = mddev->private;
316 int raid_disks = conf->strip_zone[0].nb_dev;
317 unsigned int chunk_sects = mddev->chunk_sectors;
318
319 if (is_power_of_2(chunk_sects)) {
320 int chunksect_bits = ffz(~chunk_sects);
321 /* find the sector offset inside the chunk */
322 sect_in_chunk = sector & (chunk_sects - 1);
323 sector >>= chunksect_bits;
324 /* chunk in zone */
325 chunk = *sector_offset;
326 /* quotient is the chunk in real device*/
327 sector_div(chunk, zone->nb_dev << chunksect_bits);
328 } else{
329 sect_in_chunk = sector_div(sector, chunk_sects);
330 chunk = *sector_offset;
331 sector_div(chunk, chunk_sects * zone->nb_dev);
332 }
333 /*
334 * position the bio over the real device
335 * real sector = chunk in device + starting of zone
336 * + the position in the chunk
337 */
338 *sector_offset = (chunk * chunk_sects) + sect_in_chunk;
339 return conf->devlist[(zone - conf->strip_zone)*raid_disks
340 + sector_div(sector, zone->nb_dev)];
341}
342
293/** 343/**
294 * raid0_mergeable_bvec -- tell bio layer if a two requests can be merged 344 * raid0_mergeable_bvec -- tell bio layer if two requests can be merged
295 * @q: request queue 345 * @q: request queue
296 * @bvm: properties of new bio 346 * @bvm: properties of new bio
297 * @biovec: the request that could be merged to it. 347 * @biovec: the request that could be merged to it.
@@ -303,10 +353,15 @@ static int raid0_mergeable_bvec(struct request_queue *q,
303 struct bio_vec *biovec) 353 struct bio_vec *biovec)
304{ 354{
305 struct mddev *mddev = q->queuedata; 355 struct mddev *mddev = q->queuedata;
356 struct r0conf *conf = mddev->private;
306 sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); 357 sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
358 sector_t sector_offset = sector;
307 int max; 359 int max;
308 unsigned int chunk_sectors = mddev->chunk_sectors; 360 unsigned int chunk_sectors = mddev->chunk_sectors;
309 unsigned int bio_sectors = bvm->bi_size >> 9; 361 unsigned int bio_sectors = bvm->bi_size >> 9;
362 struct strip_zone *zone;
363 struct md_rdev *rdev;
364 struct request_queue *subq;
310 365
311 if (is_power_of_2(chunk_sectors)) 366 if (is_power_of_2(chunk_sectors))
312 max = (chunk_sectors - ((sector & (chunk_sectors-1)) 367 max = (chunk_sectors - ((sector & (chunk_sectors-1))
@@ -314,10 +369,27 @@ static int raid0_mergeable_bvec(struct request_queue *q,
314 else 369 else
315 max = (chunk_sectors - (sector_div(sector, chunk_sectors) 370 max = (chunk_sectors - (sector_div(sector, chunk_sectors)
316 + bio_sectors)) << 9; 371 + bio_sectors)) << 9;
317 if (max < 0) max = 0; /* bio_add cannot handle a negative return */ 372 if (max < 0)
373 max = 0; /* bio_add cannot handle a negative return */
318 if (max <= biovec->bv_len && bio_sectors == 0) 374 if (max <= biovec->bv_len && bio_sectors == 0)
319 return biovec->bv_len; 375 return biovec->bv_len;
320 else 376 if (max < biovec->bv_len)
377 /* too small already, no need to check further */
378 return max;
379 if (!conf->has_merge_bvec)
380 return max;
381
382 /* May need to check subordinate device */
383 sector = sector_offset;
384 zone = find_zone(mddev->private, &sector_offset);
385 rdev = map_sector(mddev, zone, sector, &sector_offset);
386 subq = bdev_get_queue(rdev->bdev);
387 if (subq->merge_bvec_fn) {
388 bvm->bi_bdev = rdev->bdev;
389 bvm->bi_sector = sector_offset + zone->dev_start +
390 rdev->data_offset;
391 return min(max, subq->merge_bvec_fn(subq, bvm, biovec));
392 } else
321 return max; 393 return max;
322} 394}
323 395
@@ -329,7 +401,7 @@ static sector_t raid0_size(struct mddev *mddev, sector_t sectors, int raid_disks
329 WARN_ONCE(sectors || raid_disks, 401 WARN_ONCE(sectors || raid_disks,
330 "%s does not support generic reshape\n", __func__); 402 "%s does not support generic reshape\n", __func__);
331 403
332 list_for_each_entry(rdev, &mddev->disks, same_set) 404 rdev_for_each(rdev, mddev)
333 array_sectors += rdev->sectors; 405 array_sectors += rdev->sectors;
334 406
335 return array_sectors; 407 return array_sectors;
@@ -397,62 +469,6 @@ static int raid0_stop(struct mddev *mddev)
397 return 0; 469 return 0;
398} 470}
399 471
400/* Find the zone which holds a particular offset
401 * Update *sectorp to be an offset in that zone
402 */
403static struct strip_zone *find_zone(struct r0conf *conf,
404 sector_t *sectorp)
405{
406 int i;
407 struct strip_zone *z = conf->strip_zone;
408 sector_t sector = *sectorp;
409
410 for (i = 0; i < conf->nr_strip_zones; i++)
411 if (sector < z[i].zone_end) {
412 if (i)
413 *sectorp = sector - z[i-1].zone_end;
414 return z + i;
415 }
416 BUG();
417}
418
419/*
420 * remaps the bio to the target device. we separate two flows:
421 * a power-of-2 chunk-size flow and a general flow, for the sake of performance
422*/
423static struct md_rdev *map_sector(struct mddev *mddev, struct strip_zone *zone,
424 sector_t sector, sector_t *sector_offset)
425{
426 unsigned int sect_in_chunk;
427 sector_t chunk;
428 struct r0conf *conf = mddev->private;
429 int raid_disks = conf->strip_zone[0].nb_dev;
430 unsigned int chunk_sects = mddev->chunk_sectors;
431
432 if (is_power_of_2(chunk_sects)) {
433 int chunksect_bits = ffz(~chunk_sects);
434 /* find the sector offset inside the chunk */
435 sect_in_chunk = sector & (chunk_sects - 1);
436 sector >>= chunksect_bits;
437 /* chunk in zone */
438 chunk = *sector_offset;
439 /* quotient is the chunk in real device*/
440 sector_div(chunk, zone->nb_dev << chunksect_bits);
441 } else{
442 sect_in_chunk = sector_div(sector, chunk_sects);
443 chunk = *sector_offset;
444 sector_div(chunk, chunk_sects * zone->nb_dev);
445 }
446 /*
447 * position the bio over the real device
448 * real sector = chunk in device + starting of zone
449 * + the position in the chunk
450 */
451 *sector_offset = (chunk * chunk_sects) + sect_in_chunk;
452 return conf->devlist[(zone - conf->strip_zone)*raid_disks
453 + sector_div(sector, zone->nb_dev)];
454}
455
456/* 472/*
457 * Is io distributed over 1 or more chunks ? 473 * Is io distributed over 1 or more chunks ?
458*/ 474*/
@@ -505,7 +521,7 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio)
505 } 521 }
506 522
507 sector_offset = bio->bi_sector; 523 sector_offset = bio->bi_sector;
508 zone = find_zone(mddev->private, &sector_offset); 524 zone = find_zone(mddev->private, &sector_offset);
509 tmp_dev = map_sector(mddev, zone, bio->bi_sector, 525 tmp_dev = map_sector(mddev, zone, bio->bi_sector,
510 &sector_offset); 526 &sector_offset);
511 bio->bi_bdev = tmp_dev->bdev; 527 bio->bi_bdev = tmp_dev->bdev;
@@ -543,7 +559,7 @@ static void *raid0_takeover_raid45(struct mddev *mddev)
543 return ERR_PTR(-EINVAL); 559 return ERR_PTR(-EINVAL);
544 } 560 }
545 561
546 list_for_each_entry(rdev, &mddev->disks, same_set) { 562 rdev_for_each(rdev, mddev) {
547 /* check slot number for a disk */ 563 /* check slot number for a disk */
548 if (rdev->raid_disk == mddev->raid_disks-1) { 564 if (rdev->raid_disk == mddev->raid_disks-1) {
549 printk(KERN_ERR "md/raid0:%s: raid5 must have missing parity disk!\n", 565 printk(KERN_ERR "md/raid0:%s: raid5 must have missing parity disk!\n",
diff --git a/drivers/md/raid0.h b/drivers/md/raid0.h
index 0884bba8df4c..05539d9c97f0 100644
--- a/drivers/md/raid0.h
+++ b/drivers/md/raid0.h
@@ -4,13 +4,16 @@
4struct strip_zone { 4struct strip_zone {
5 sector_t zone_end; /* Start of the next zone (in sectors) */ 5 sector_t zone_end; /* Start of the next zone (in sectors) */
6 sector_t dev_start; /* Zone offset in real dev (in sectors) */ 6 sector_t dev_start; /* Zone offset in real dev (in sectors) */
7 int nb_dev; /* # of devices attached to the zone */ 7 int nb_dev; /* # of devices attached to the zone */
8}; 8};
9 9
10struct r0conf { 10struct r0conf {
11 struct strip_zone *strip_zone; 11 struct strip_zone *strip_zone;
12 struct md_rdev **devlist; /* lists of rdevs, pointed to by strip_zone->dev */ 12 struct md_rdev **devlist; /* lists of rdevs, pointed to
13 int nr_strip_zones; 13 * by strip_zone->dev */
14 int nr_strip_zones;
15 int has_merge_bvec; /* at least one member has
16 * a merge_bvec_fn */
14}; 17};
15 18
16#endif 19#endif
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index a0b225eb4ac4..4a40a200d769 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -523,6 +523,7 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
523 rdev = rcu_dereference(conf->mirrors[disk].rdev); 523 rdev = rcu_dereference(conf->mirrors[disk].rdev);
524 if (r1_bio->bios[disk] == IO_BLOCKED 524 if (r1_bio->bios[disk] == IO_BLOCKED
525 || rdev == NULL 525 || rdev == NULL
526 || test_bit(Unmerged, &rdev->flags)
526 || test_bit(Faulty, &rdev->flags)) 527 || test_bit(Faulty, &rdev->flags))
527 continue; 528 continue;
528 if (!test_bit(In_sync, &rdev->flags) && 529 if (!test_bit(In_sync, &rdev->flags) &&
@@ -614,6 +615,39 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
614 return best_disk; 615 return best_disk;
615} 616}
616 617
618static int raid1_mergeable_bvec(struct request_queue *q,
619 struct bvec_merge_data *bvm,
620 struct bio_vec *biovec)
621{
622 struct mddev *mddev = q->queuedata;
623 struct r1conf *conf = mddev->private;
624 sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
625 int max = biovec->bv_len;
626
627 if (mddev->merge_check_needed) {
628 int disk;
629 rcu_read_lock();
630 for (disk = 0; disk < conf->raid_disks * 2; disk++) {
631 struct md_rdev *rdev = rcu_dereference(
632 conf->mirrors[disk].rdev);
633 if (rdev && !test_bit(Faulty, &rdev->flags)) {
634 struct request_queue *q =
635 bdev_get_queue(rdev->bdev);
636 if (q->merge_bvec_fn) {
637 bvm->bi_sector = sector +
638 rdev->data_offset;
639 bvm->bi_bdev = rdev->bdev;
640 max = min(max, q->merge_bvec_fn(
641 q, bvm, biovec));
642 }
643 }
644 }
645 rcu_read_unlock();
646 }
647 return max;
648
649}
650
617int md_raid1_congested(struct mddev *mddev, int bits) 651int md_raid1_congested(struct mddev *mddev, int bits)
618{ 652{
619 struct r1conf *conf = mddev->private; 653 struct r1conf *conf = mddev->private;
@@ -737,9 +771,22 @@ static void wait_barrier(struct r1conf *conf)
737 spin_lock_irq(&conf->resync_lock); 771 spin_lock_irq(&conf->resync_lock);
738 if (conf->barrier) { 772 if (conf->barrier) {
739 conf->nr_waiting++; 773 conf->nr_waiting++;
740 wait_event_lock_irq(conf->wait_barrier, !conf->barrier, 774 /* Wait for the barrier to drop.
775 * However if there are already pending
776 * requests (preventing the barrier from
777 * rising completely), and the
778 * pre-process bio queue isn't empty,
779 * then don't wait, as we need to empty
780 * that queue to get the nr_pending
781 * count down.
782 */
783 wait_event_lock_irq(conf->wait_barrier,
784 !conf->barrier ||
785 (conf->nr_pending &&
786 current->bio_list &&
787 !bio_list_empty(current->bio_list)),
741 conf->resync_lock, 788 conf->resync_lock,
742 ); 789 );
743 conf->nr_waiting--; 790 conf->nr_waiting--;
744 } 791 }
745 conf->nr_pending++; 792 conf->nr_pending++;
@@ -1002,7 +1049,8 @@ read_again:
1002 break; 1049 break;
1003 } 1050 }
1004 r1_bio->bios[i] = NULL; 1051 r1_bio->bios[i] = NULL;
1005 if (!rdev || test_bit(Faulty, &rdev->flags)) { 1052 if (!rdev || test_bit(Faulty, &rdev->flags)
1053 || test_bit(Unmerged, &rdev->flags)) {
1006 if (i < conf->raid_disks) 1054 if (i < conf->raid_disks)
1007 set_bit(R1BIO_Degraded, &r1_bio->state); 1055 set_bit(R1BIO_Degraded, &r1_bio->state);
1008 continue; 1056 continue;
@@ -1322,6 +1370,7 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
1322 struct mirror_info *p; 1370 struct mirror_info *p;
1323 int first = 0; 1371 int first = 0;
1324 int last = conf->raid_disks - 1; 1372 int last = conf->raid_disks - 1;
1373 struct request_queue *q = bdev_get_queue(rdev->bdev);
1325 1374
1326 if (mddev->recovery_disabled == conf->recovery_disabled) 1375 if (mddev->recovery_disabled == conf->recovery_disabled)
1327 return -EBUSY; 1376 return -EBUSY;
@@ -1329,23 +1378,17 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
1329 if (rdev->raid_disk >= 0) 1378 if (rdev->raid_disk >= 0)
1330 first = last = rdev->raid_disk; 1379 first = last = rdev->raid_disk;
1331 1380
1381 if (q->merge_bvec_fn) {
1382 set_bit(Unmerged, &rdev->flags);
1383 mddev->merge_check_needed = 1;
1384 }
1385
1332 for (mirror = first; mirror <= last; mirror++) { 1386 for (mirror = first; mirror <= last; mirror++) {
1333 p = conf->mirrors+mirror; 1387 p = conf->mirrors+mirror;
1334 if (!p->rdev) { 1388 if (!p->rdev) {
1335 1389
1336 disk_stack_limits(mddev->gendisk, rdev->bdev, 1390 disk_stack_limits(mddev->gendisk, rdev->bdev,
1337 rdev->data_offset << 9); 1391 rdev->data_offset << 9);
1338 /* as we don't honour merge_bvec_fn, we must
1339 * never risk violating it, so limit
1340 * ->max_segments to one lying with a single
1341 * page, as a one page request is never in
1342 * violation.
1343 */
1344 if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
1345 blk_queue_max_segments(mddev->queue, 1);
1346 blk_queue_segment_boundary(mddev->queue,
1347 PAGE_CACHE_SIZE - 1);
1348 }
1349 1392
1350 p->head_position = 0; 1393 p->head_position = 0;
1351 rdev->raid_disk = mirror; 1394 rdev->raid_disk = mirror;
@@ -1370,6 +1413,19 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
1370 break; 1413 break;
1371 } 1414 }
1372 } 1415 }
1416 if (err == 0 && test_bit(Unmerged, &rdev->flags)) {
1417 /* Some requests might not have seen this new
1418 * merge_bvec_fn. We must wait for them to complete
1419 * before merging the device fully.
1420 * First we make sure any code which has tested
1421 * our function has submitted the request, then
1422 * we wait for all outstanding requests to complete.
1423 */
1424 synchronize_sched();
1425 raise_barrier(conf);
1426 lower_barrier(conf);
1427 clear_bit(Unmerged, &rdev->flags);
1428 }
1373 md_integrity_add_rdev(rdev, mddev); 1429 md_integrity_add_rdev(rdev, mddev);
1374 print_conf(conf); 1430 print_conf(conf);
1375 return err; 1431 return err;
@@ -2491,7 +2547,7 @@ static struct r1conf *setup_conf(struct mddev *mddev)
2491 2547
2492 err = -EINVAL; 2548 err = -EINVAL;
2493 spin_lock_init(&conf->device_lock); 2549 spin_lock_init(&conf->device_lock);
2494 list_for_each_entry(rdev, &mddev->disks, same_set) { 2550 rdev_for_each(rdev, mddev) {
2495 int disk_idx = rdev->raid_disk; 2551 int disk_idx = rdev->raid_disk;
2496 if (disk_idx >= mddev->raid_disks 2552 if (disk_idx >= mddev->raid_disks
2497 || disk_idx < 0) 2553 || disk_idx < 0)
@@ -2609,20 +2665,11 @@ static int run(struct mddev *mddev)
2609 if (IS_ERR(conf)) 2665 if (IS_ERR(conf))
2610 return PTR_ERR(conf); 2666 return PTR_ERR(conf);
2611 2667
2612 list_for_each_entry(rdev, &mddev->disks, same_set) { 2668 rdev_for_each(rdev, mddev) {
2613 if (!mddev->gendisk) 2669 if (!mddev->gendisk)
2614 continue; 2670 continue;
2615 disk_stack_limits(mddev->gendisk, rdev->bdev, 2671 disk_stack_limits(mddev->gendisk, rdev->bdev,
2616 rdev->data_offset << 9); 2672 rdev->data_offset << 9);
2617 /* as we don't honour merge_bvec_fn, we must never risk
2618 * violating it, so limit ->max_segments to 1 lying within
2619 * a single page, as a one page request is never in violation.
2620 */
2621 if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
2622 blk_queue_max_segments(mddev->queue, 1);
2623 blk_queue_segment_boundary(mddev->queue,
2624 PAGE_CACHE_SIZE - 1);
2625 }
2626 } 2673 }
2627 2674
2628 mddev->degraded = 0; 2675 mddev->degraded = 0;
@@ -2656,6 +2703,7 @@ static int run(struct mddev *mddev)
2656 if (mddev->queue) { 2703 if (mddev->queue) {
2657 mddev->queue->backing_dev_info.congested_fn = raid1_congested; 2704 mddev->queue->backing_dev_info.congested_fn = raid1_congested;
2658 mddev->queue->backing_dev_info.congested_data = mddev; 2705 mddev->queue->backing_dev_info.congested_data = mddev;
2706 blk_queue_merge_bvec(mddev->queue, raid1_mergeable_bvec);
2659 } 2707 }
2660 return md_integrity_register(mddev); 2708 return md_integrity_register(mddev);
2661} 2709}
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 58c44d6453a0..3540316886f2 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -586,25 +586,68 @@ static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev)
586 * @biovec: the request that could be merged to it. 586 * @biovec: the request that could be merged to it.
587 * 587 *
588 * Return amount of bytes we can accept at this offset 588 * Return amount of bytes we can accept at this offset
589 * If near_copies == raid_disk, there are no striping issues, 589 * This requires checking for end-of-chunk if near_copies != raid_disks,
590 * but in that case, the function isn't called at all. 590 * and for subordinate merge_bvec_fns if merge_check_needed.
591 */ 591 */
592static int raid10_mergeable_bvec(struct request_queue *q, 592static int raid10_mergeable_bvec(struct request_queue *q,
593 struct bvec_merge_data *bvm, 593 struct bvec_merge_data *bvm,
594 struct bio_vec *biovec) 594 struct bio_vec *biovec)
595{ 595{
596 struct mddev *mddev = q->queuedata; 596 struct mddev *mddev = q->queuedata;
597 struct r10conf *conf = mddev->private;
597 sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); 598 sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
598 int max; 599 int max;
599 unsigned int chunk_sectors = mddev->chunk_sectors; 600 unsigned int chunk_sectors = mddev->chunk_sectors;
600 unsigned int bio_sectors = bvm->bi_size >> 9; 601 unsigned int bio_sectors = bvm->bi_size >> 9;
601 602
602 max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9; 603 if (conf->near_copies < conf->raid_disks) {
603 if (max < 0) max = 0; /* bio_add cannot handle a negative return */ 604 max = (chunk_sectors - ((sector & (chunk_sectors - 1))
604 if (max <= biovec->bv_len && bio_sectors == 0) 605 + bio_sectors)) << 9;
605 return biovec->bv_len; 606 if (max < 0)
606 else 607 /* bio_add cannot handle a negative return */
607 return max; 608 max = 0;
609 if (max <= biovec->bv_len && bio_sectors == 0)
610 return biovec->bv_len;
611 } else
612 max = biovec->bv_len;
613
614 if (mddev->merge_check_needed) {
615 struct r10bio r10_bio;
616 int s;
617 r10_bio.sector = sector;
618 raid10_find_phys(conf, &r10_bio);
619 rcu_read_lock();
620 for (s = 0; s < conf->copies; s++) {
621 int disk = r10_bio.devs[s].devnum;
622 struct md_rdev *rdev = rcu_dereference(
623 conf->mirrors[disk].rdev);
624 if (rdev && !test_bit(Faulty, &rdev->flags)) {
625 struct request_queue *q =
626 bdev_get_queue(rdev->bdev);
627 if (q->merge_bvec_fn) {
628 bvm->bi_sector = r10_bio.devs[s].addr
629 + rdev->data_offset;
630 bvm->bi_bdev = rdev->bdev;
631 max = min(max, q->merge_bvec_fn(
632 q, bvm, biovec));
633 }
634 }
635 rdev = rcu_dereference(conf->mirrors[disk].replacement);
636 if (rdev && !test_bit(Faulty, &rdev->flags)) {
637 struct request_queue *q =
638 bdev_get_queue(rdev->bdev);
639 if (q->merge_bvec_fn) {
640 bvm->bi_sector = r10_bio.devs[s].addr
641 + rdev->data_offset;
642 bvm->bi_bdev = rdev->bdev;
643 max = min(max, q->merge_bvec_fn(
644 q, bvm, biovec));
645 }
646 }
647 }
648 rcu_read_unlock();
649 }
650 return max;
608} 651}
609 652
610/* 653/*
@@ -668,11 +711,12 @@ retry:
668 disk = r10_bio->devs[slot].devnum; 711 disk = r10_bio->devs[slot].devnum;
669 rdev = rcu_dereference(conf->mirrors[disk].replacement); 712 rdev = rcu_dereference(conf->mirrors[disk].replacement);
670 if (rdev == NULL || test_bit(Faulty, &rdev->flags) || 713 if (rdev == NULL || test_bit(Faulty, &rdev->flags) ||
714 test_bit(Unmerged, &rdev->flags) ||
671 r10_bio->devs[slot].addr + sectors > rdev->recovery_offset) 715 r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
672 rdev = rcu_dereference(conf->mirrors[disk].rdev); 716 rdev = rcu_dereference(conf->mirrors[disk].rdev);
673 if (rdev == NULL) 717 if (rdev == NULL ||
674 continue; 718 test_bit(Faulty, &rdev->flags) ||
675 if (test_bit(Faulty, &rdev->flags)) 719 test_bit(Unmerged, &rdev->flags))
676 continue; 720 continue;
677 if (!test_bit(In_sync, &rdev->flags) && 721 if (!test_bit(In_sync, &rdev->flags) &&
678 r10_bio->devs[slot].addr + sectors > rdev->recovery_offset) 722 r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
@@ -863,9 +907,22 @@ static void wait_barrier(struct r10conf *conf)
863 spin_lock_irq(&conf->resync_lock); 907 spin_lock_irq(&conf->resync_lock);
864 if (conf->barrier) { 908 if (conf->barrier) {
865 conf->nr_waiting++; 909 conf->nr_waiting++;
866 wait_event_lock_irq(conf->wait_barrier, !conf->barrier, 910 /* Wait for the barrier to drop.
911 * However if there are already pending
912 * requests (preventing the barrier from
913 * rising completely), and the
914 * pre-process bio queue isn't empty,
915 * then don't wait, as we need to empty
916 * that queue to get the nr_pending
917 * count down.
918 */
919 wait_event_lock_irq(conf->wait_barrier,
920 !conf->barrier ||
921 (conf->nr_pending &&
922 current->bio_list &&
923 !bio_list_empty(current->bio_list)),
867 conf->resync_lock, 924 conf->resync_lock,
868 ); 925 );
869 conf->nr_waiting--; 926 conf->nr_waiting--;
870 } 927 }
871 conf->nr_pending++; 928 conf->nr_pending++;
@@ -1121,12 +1178,14 @@ retry_write:
1121 blocked_rdev = rrdev; 1178 blocked_rdev = rrdev;
1122 break; 1179 break;
1123 } 1180 }
1124 if (rrdev && test_bit(Faulty, &rrdev->flags)) 1181 if (rrdev && (test_bit(Faulty, &rrdev->flags)
1182 || test_bit(Unmerged, &rrdev->flags)))
1125 rrdev = NULL; 1183 rrdev = NULL;
1126 1184
1127 r10_bio->devs[i].bio = NULL; 1185 r10_bio->devs[i].bio = NULL;
1128 r10_bio->devs[i].repl_bio = NULL; 1186 r10_bio->devs[i].repl_bio = NULL;
1129 if (!rdev || test_bit(Faulty, &rdev->flags)) { 1187 if (!rdev || test_bit(Faulty, &rdev->flags) ||
1188 test_bit(Unmerged, &rdev->flags)) {
1130 set_bit(R10BIO_Degraded, &r10_bio->state); 1189 set_bit(R10BIO_Degraded, &r10_bio->state);
1131 continue; 1190 continue;
1132 } 1191 }
@@ -1477,18 +1536,24 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
1477 int mirror; 1536 int mirror;
1478 int first = 0; 1537 int first = 0;
1479 int last = conf->raid_disks - 1; 1538 int last = conf->raid_disks - 1;
1539 struct request_queue *q = bdev_get_queue(rdev->bdev);
1480 1540
1481 if (mddev->recovery_cp < MaxSector) 1541 if (mddev->recovery_cp < MaxSector)
1482 /* only hot-add to in-sync arrays, as recovery is 1542 /* only hot-add to in-sync arrays, as recovery is
1483 * very different from resync 1543 * very different from resync
1484 */ 1544 */
1485 return -EBUSY; 1545 return -EBUSY;
1486 if (!enough(conf, -1)) 1546 if (rdev->saved_raid_disk < 0 && !enough(conf, -1))
1487 return -EINVAL; 1547 return -EINVAL;
1488 1548
1489 if (rdev->raid_disk >= 0) 1549 if (rdev->raid_disk >= 0)
1490 first = last = rdev->raid_disk; 1550 first = last = rdev->raid_disk;
1491 1551
1552 if (q->merge_bvec_fn) {
1553 set_bit(Unmerged, &rdev->flags);
1554 mddev->merge_check_needed = 1;
1555 }
1556
1492 if (rdev->saved_raid_disk >= first && 1557 if (rdev->saved_raid_disk >= first &&
1493 conf->mirrors[rdev->saved_raid_disk].rdev == NULL) 1558 conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
1494 mirror = rdev->saved_raid_disk; 1559 mirror = rdev->saved_raid_disk;
@@ -1508,11 +1573,6 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
1508 err = 0; 1573 err = 0;
1509 disk_stack_limits(mddev->gendisk, rdev->bdev, 1574 disk_stack_limits(mddev->gendisk, rdev->bdev,
1510 rdev->data_offset << 9); 1575 rdev->data_offset << 9);
1511 if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
1512 blk_queue_max_segments(mddev->queue, 1);
1513 blk_queue_segment_boundary(mddev->queue,
1514 PAGE_CACHE_SIZE - 1);
1515 }
1516 conf->fullsync = 1; 1576 conf->fullsync = 1;
1517 rcu_assign_pointer(p->replacement, rdev); 1577 rcu_assign_pointer(p->replacement, rdev);
1518 break; 1578 break;
@@ -1520,17 +1580,6 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
1520 1580
1521 disk_stack_limits(mddev->gendisk, rdev->bdev, 1581 disk_stack_limits(mddev->gendisk, rdev->bdev,
1522 rdev->data_offset << 9); 1582 rdev->data_offset << 9);
1523 /* as we don't honour merge_bvec_fn, we must
1524 * never risk violating it, so limit
1525 * ->max_segments to one lying with a single
1526 * page, as a one page request is never in
1527 * violation.
1528 */
1529 if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
1530 blk_queue_max_segments(mddev->queue, 1);
1531 blk_queue_segment_boundary(mddev->queue,
1532 PAGE_CACHE_SIZE - 1);
1533 }
1534 1583
1535 p->head_position = 0; 1584 p->head_position = 0;
1536 p->recovery_disabled = mddev->recovery_disabled - 1; 1585 p->recovery_disabled = mddev->recovery_disabled - 1;
@@ -1541,7 +1590,19 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
1541 rcu_assign_pointer(p->rdev, rdev); 1590 rcu_assign_pointer(p->rdev, rdev);
1542 break; 1591 break;
1543 } 1592 }
1544 1593 if (err == 0 && test_bit(Unmerged, &rdev->flags)) {
1594 /* Some requests might not have seen this new
1595 * merge_bvec_fn. We must wait for them to complete
1596 * before merging the device fully.
1597 * First we make sure any code which has tested
1598 * our function has submitted the request, then
1599 * we wait for all outstanding requests to complete.
1600 */
1601 synchronize_sched();
1602 raise_barrier(conf, 0);
1603 lower_barrier(conf);
1604 clear_bit(Unmerged, &rdev->flags);
1605 }
1545 md_integrity_add_rdev(rdev, mddev); 1606 md_integrity_add_rdev(rdev, mddev);
1546 print_conf(conf); 1607 print_conf(conf);
1547 return err; 1608 return err;
@@ -1682,10 +1743,8 @@ static void end_sync_write(struct bio *bio, int error)
1682 d = find_bio_disk(conf, r10_bio, bio, &slot, &repl); 1743 d = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
1683 if (repl) 1744 if (repl)
1684 rdev = conf->mirrors[d].replacement; 1745 rdev = conf->mirrors[d].replacement;
1685 if (!rdev) { 1746 else
1686 smp_mb();
1687 rdev = conf->mirrors[d].rdev; 1747 rdev = conf->mirrors[d].rdev;
1688 }
1689 1748
1690 if (!uptodate) { 1749 if (!uptodate) {
1691 if (repl) 1750 if (repl)
@@ -2087,6 +2146,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
2087 d = r10_bio->devs[sl].devnum; 2146 d = r10_bio->devs[sl].devnum;
2088 rdev = rcu_dereference(conf->mirrors[d].rdev); 2147 rdev = rcu_dereference(conf->mirrors[d].rdev);
2089 if (rdev && 2148 if (rdev &&
2149 !test_bit(Unmerged, &rdev->flags) &&
2090 test_bit(In_sync, &rdev->flags) && 2150 test_bit(In_sync, &rdev->flags) &&
2091 is_badblock(rdev, r10_bio->devs[sl].addr + sect, s, 2151 is_badblock(rdev, r10_bio->devs[sl].addr + sect, s,
2092 &first_bad, &bad_sectors) == 0) { 2152 &first_bad, &bad_sectors) == 0) {
@@ -2140,6 +2200,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
2140 d = r10_bio->devs[sl].devnum; 2200 d = r10_bio->devs[sl].devnum;
2141 rdev = rcu_dereference(conf->mirrors[d].rdev); 2201 rdev = rcu_dereference(conf->mirrors[d].rdev);
2142 if (!rdev || 2202 if (!rdev ||
2203 test_bit(Unmerged, &rdev->flags) ||
2143 !test_bit(In_sync, &rdev->flags)) 2204 !test_bit(In_sync, &rdev->flags))
2144 continue; 2205 continue;
2145 2206
@@ -3242,7 +3303,7 @@ static int run(struct mddev *mddev)
3242 blk_queue_io_opt(mddev->queue, chunk_size * 3303 blk_queue_io_opt(mddev->queue, chunk_size *
3243 (conf->raid_disks / conf->near_copies)); 3304 (conf->raid_disks / conf->near_copies));
3244 3305
3245 list_for_each_entry(rdev, &mddev->disks, same_set) { 3306 rdev_for_each(rdev, mddev) {
3246 3307
3247 disk_idx = rdev->raid_disk; 3308 disk_idx = rdev->raid_disk;
3248 if (disk_idx >= conf->raid_disks 3309 if (disk_idx >= conf->raid_disks
@@ -3262,15 +3323,6 @@ static int run(struct mddev *mddev)
3262 3323
3263 disk_stack_limits(mddev->gendisk, rdev->bdev, 3324 disk_stack_limits(mddev->gendisk, rdev->bdev,
3264 rdev->data_offset << 9); 3325 rdev->data_offset << 9);
3265 /* as we don't honour merge_bvec_fn, we must never risk
3266 * violating it, so limit max_segments to 1 lying
3267 * within a single page.
3268 */
3269 if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
3270 blk_queue_max_segments(mddev->queue, 1);
3271 blk_queue_segment_boundary(mddev->queue,
3272 PAGE_CACHE_SIZE - 1);
3273 }
3274 3326
3275 disk->head_position = 0; 3327 disk->head_position = 0;
3276 } 3328 }
@@ -3334,8 +3386,7 @@ static int run(struct mddev *mddev)
3334 mddev->queue->backing_dev_info.ra_pages = 2* stripe; 3386 mddev->queue->backing_dev_info.ra_pages = 2* stripe;
3335 } 3387 }
3336 3388
3337 if (conf->near_copies < conf->raid_disks) 3389 blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec);
3338 blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec);
3339 3390
3340 if (md_integrity_register(mddev)) 3391 if (md_integrity_register(mddev))
3341 goto out_free_conf; 3392 goto out_free_conf;
@@ -3385,6 +3436,43 @@ static void raid10_quiesce(struct mddev *mddev, int state)
3385 } 3436 }
3386} 3437}
3387 3438
3439static int raid10_resize(struct mddev *mddev, sector_t sectors)
3440{
3441 /* Resize of 'far' arrays is not supported.
3442 * For 'near' and 'offset' arrays we can set the
3443 * number of sectors used to be an appropriate multiple
3444 * of the chunk size.
3445 * For 'offset', this is far_copies*chunksize.
3446 * For 'near' the multiplier is the LCM of
3447 * near_copies and raid_disks.
3448 * So if far_copies > 1 && !far_offset, fail.
3449 * Else find LCM(raid_disks, near_copy)*far_copies and
3450 * multiply by chunk_size. Then round to this number.
3451 * This is mostly done by raid10_size()
3452 */
3453 struct r10conf *conf = mddev->private;
3454 sector_t oldsize, size;
3455
3456 if (conf->far_copies > 1 && !conf->far_offset)
3457 return -EINVAL;
3458
3459 oldsize = raid10_size(mddev, 0, 0);
3460 size = raid10_size(mddev, sectors, 0);
3461 md_set_array_sectors(mddev, size);
3462 if (mddev->array_sectors > size)
3463 return -EINVAL;
3464 set_capacity(mddev->gendisk, mddev->array_sectors);
3465 revalidate_disk(mddev->gendisk);
3466 if (sectors > mddev->dev_sectors &&
3467 mddev->recovery_cp > oldsize) {
3468 mddev->recovery_cp = oldsize;
3469 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3470 }
3471 mddev->dev_sectors = sectors;
3472 mddev->resync_max_sectors = size;
3473 return 0;
3474}
3475
3388static void *raid10_takeover_raid0(struct mddev *mddev) 3476static void *raid10_takeover_raid0(struct mddev *mddev)
3389{ 3477{
3390 struct md_rdev *rdev; 3478 struct md_rdev *rdev;
@@ -3408,7 +3496,7 @@ static void *raid10_takeover_raid0(struct mddev *mddev)
3408 3496
3409 conf = setup_conf(mddev); 3497 conf = setup_conf(mddev);
3410 if (!IS_ERR(conf)) { 3498 if (!IS_ERR(conf)) {
3411 list_for_each_entry(rdev, &mddev->disks, same_set) 3499 rdev_for_each(rdev, mddev)
3412 if (rdev->raid_disk >= 0) 3500 if (rdev->raid_disk >= 0)
3413 rdev->new_raid_disk = rdev->raid_disk * 2; 3501 rdev->new_raid_disk = rdev->raid_disk * 2;
3414 conf->barrier = 1; 3502 conf->barrier = 1;
@@ -3454,6 +3542,7 @@ static struct md_personality raid10_personality =
3454 .sync_request = sync_request, 3542 .sync_request = sync_request,
3455 .quiesce = raid10_quiesce, 3543 .quiesce = raid10_quiesce,
3456 .size = raid10_size, 3544 .size = raid10_size,
3545 .resize = raid10_resize,
3457 .takeover = raid10_takeover, 3546 .takeover = raid10_takeover,
3458}; 3547};
3459 3548
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 360f2b98f62b..23ac880bba9a 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -208,11 +208,10 @@ static void __release_stripe(struct r5conf *conf, struct stripe_head *sh)
208 md_wakeup_thread(conf->mddev->thread); 208 md_wakeup_thread(conf->mddev->thread);
209 } else { 209 } else {
210 BUG_ON(stripe_operations_active(sh)); 210 BUG_ON(stripe_operations_active(sh));
211 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { 211 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
212 atomic_dec(&conf->preread_active_stripes); 212 if (atomic_dec_return(&conf->preread_active_stripes)
213 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) 213 < IO_THRESHOLD)
214 md_wakeup_thread(conf->mddev->thread); 214 md_wakeup_thread(conf->mddev->thread);
215 }
216 atomic_dec(&conf->active_stripes); 215 atomic_dec(&conf->active_stripes);
217 if (!test_bit(STRIPE_EXPANDING, &sh->state)) { 216 if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
218 list_add_tail(&sh->lru, &conf->inactive_list); 217 list_add_tail(&sh->lru, &conf->inactive_list);
@@ -4843,7 +4842,7 @@ static struct r5conf *setup_conf(struct mddev *mddev)
4843 4842
4844 pr_debug("raid456: run(%s) called.\n", mdname(mddev)); 4843 pr_debug("raid456: run(%s) called.\n", mdname(mddev));
4845 4844
4846 list_for_each_entry(rdev, &mddev->disks, same_set) { 4845 rdev_for_each(rdev, mddev) {
4847 raid_disk = rdev->raid_disk; 4846 raid_disk = rdev->raid_disk;
4848 if (raid_disk >= max_disks 4847 if (raid_disk >= max_disks
4849 || raid_disk < 0) 4848 || raid_disk < 0)
@@ -5178,7 +5177,7 @@ static int run(struct mddev *mddev)
5178 blk_queue_io_opt(mddev->queue, chunk_size * 5177 blk_queue_io_opt(mddev->queue, chunk_size *
5179 (conf->raid_disks - conf->max_degraded)); 5178 (conf->raid_disks - conf->max_degraded));
5180 5179
5181 list_for_each_entry(rdev, &mddev->disks, same_set) 5180 rdev_for_each(rdev, mddev)
5182 disk_stack_limits(mddev->gendisk, rdev->bdev, 5181 disk_stack_limits(mddev->gendisk, rdev->bdev,
5183 rdev->data_offset << 9); 5182 rdev->data_offset << 9);
5184 } 5183 }
@@ -5362,7 +5361,7 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
5362 if (mddev->recovery_disabled == conf->recovery_disabled) 5361 if (mddev->recovery_disabled == conf->recovery_disabled)
5363 return -EBUSY; 5362 return -EBUSY;
5364 5363
5365 if (has_failed(conf)) 5364 if (rdev->saved_raid_disk < 0 && has_failed(conf))
5366 /* no point adding a device */ 5365 /* no point adding a device */
5367 return -EINVAL; 5366 return -EINVAL;
5368 5367
@@ -5501,7 +5500,7 @@ static int raid5_start_reshape(struct mddev *mddev)
5501 if (!check_stripe_cache(mddev)) 5500 if (!check_stripe_cache(mddev))
5502 return -ENOSPC; 5501 return -ENOSPC;
5503 5502
5504 list_for_each_entry(rdev, &mddev->disks, same_set) 5503 rdev_for_each(rdev, mddev)
5505 if (!test_bit(In_sync, &rdev->flags) 5504 if (!test_bit(In_sync, &rdev->flags)
5506 && !test_bit(Faulty, &rdev->flags)) 5505 && !test_bit(Faulty, &rdev->flags))
5507 spares++; 5506 spares++;
@@ -5547,16 +5546,14 @@ static int raid5_start_reshape(struct mddev *mddev)
5547 * such devices during the reshape and confusion could result. 5546 * such devices during the reshape and confusion could result.
5548 */ 5547 */
5549 if (mddev->delta_disks >= 0) { 5548 if (mddev->delta_disks >= 0) {
5550 int added_devices = 0; 5549 rdev_for_each(rdev, mddev)
5551 list_for_each_entry(rdev, &mddev->disks, same_set)
5552 if (rdev->raid_disk < 0 && 5550 if (rdev->raid_disk < 0 &&
5553 !test_bit(Faulty, &rdev->flags)) { 5551 !test_bit(Faulty, &rdev->flags)) {
5554 if (raid5_add_disk(mddev, rdev) == 0) { 5552 if (raid5_add_disk(mddev, rdev) == 0) {
5555 if (rdev->raid_disk 5553 if (rdev->raid_disk
5556 >= conf->previous_raid_disks) { 5554 >= conf->previous_raid_disks)
5557 set_bit(In_sync, &rdev->flags); 5555 set_bit(In_sync, &rdev->flags);
5558 added_devices++; 5556 else
5559 } else
5560 rdev->recovery_offset = 0; 5557 rdev->recovery_offset = 0;
5561 5558
5562 if (sysfs_link_rdev(mddev, rdev)) 5559 if (sysfs_link_rdev(mddev, rdev))
@@ -5566,7 +5563,6 @@ static int raid5_start_reshape(struct mddev *mddev)
5566 && !test_bit(Faulty, &rdev->flags)) { 5563 && !test_bit(Faulty, &rdev->flags)) {
5567 /* This is a spare that was manually added */ 5564 /* This is a spare that was manually added */
5568 set_bit(In_sync, &rdev->flags); 5565 set_bit(In_sync, &rdev->flags);
5569 added_devices++;
5570 } 5566 }
5571 5567
5572 /* When a reshape changes the number of devices, 5568 /* When a reshape changes the number of devices,
@@ -5592,6 +5588,7 @@ static int raid5_start_reshape(struct mddev *mddev)
5592 spin_lock_irq(&conf->device_lock); 5588 spin_lock_irq(&conf->device_lock);
5593 mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks; 5589 mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks;
5594 conf->reshape_progress = MaxSector; 5590 conf->reshape_progress = MaxSector;
5591 mddev->reshape_position = MaxSector;
5595 spin_unlock_irq(&conf->device_lock); 5592 spin_unlock_irq(&conf->device_lock);
5596 return -EAGAIN; 5593 return -EAGAIN;
5597 } 5594 }