Diffstat (limited to 'drivers/md')
-rw-r--r--  drivers/md/Kconfig               31
-rw-r--r--  drivers/md/Makefile              16
-rw-r--r--  drivers/md/bitmap.c              49
-rw-r--r--  drivers/md/bitmap.h             288
-rw-r--r--  drivers/md/faulty.c              19
-rw-r--r--  drivers/md/linear.c              25
-rw-r--r--  drivers/md/linear.h              29
-rw-r--r--  drivers/md/md.c                 615
-rw-r--r--  drivers/md/md.h                 436
-rw-r--r--  drivers/md/mktables.c            14
-rw-r--r--  drivers/md/multipath.c           17
-rw-r--r--  drivers/md/multipath.h           40
-rw-r--r--  drivers/md/raid0.c               66
-rw-r--r--  drivers/md/raid0.h               28
-rw-r--r--  drivers/md/raid1.c               35
-rw-r--r--  drivers/md/raid1.h              132
-rw-r--r--  drivers/md/raid10.c              42
-rw-r--r--  drivers/md/raid10.h             121
-rw-r--r--  drivers/md/raid5.c             1494
-rw-r--r--  drivers/md/raid5.h              474
-rw-r--r--  drivers/md/raid6.h              130
-rw-r--r--  drivers/md/raid6algos.c          21
-rw-r--r--  drivers/md/raid6altivec.uc        4
-rw-r--r--  drivers/md/raid6int.uc            4
-rw-r--r--  drivers/md/raid6mmx.c             4
-rw-r--r--  drivers/md/raid6recov.c          13
-rw-r--r--  drivers/md/raid6sse1.c            4
-rw-r--r--  drivers/md/raid6sse2.c            4
-rw-r--r--  drivers/md/raid6test/Makefile     2
-rw-r--r--  drivers/md/raid6test/test.c       2
-rw-r--r--  drivers/md/raid6x86.h             2
31 files changed, 3324 insertions, 837 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 2281b5098e95..36e0675be9f7 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -121,6 +121,7 @@ config MD_RAID10
 config MD_RAID456
 	tristate "RAID-4/RAID-5/RAID-6 mode"
 	depends on BLK_DEV_MD
+	select MD_RAID6_PQ
 	select ASYNC_MEMCPY
 	select ASYNC_XOR
 	---help---
@@ -151,34 +152,8 @@ config MD_RAID456
 
 	  If unsure, say Y.
 
-config MD_RAID5_RESHAPE
-	bool "Support adding drives to a raid-5 array"
-	depends on MD_RAID456
-	default y
-	---help---
-	  A RAID-5 set can be expanded by adding extra drives. This
-	  requires "restriping" the array which means (almost) every
-	  block must be written to a different place.
-
-	  This option allows such restriping to be done while the array
-	  is online.
-
-	  You will need mdadm version 2.4.1 or later to use this
-	  feature safely. During the early stage of reshape there is
-	  a critical section where live data is being over-written. A
-	  crash during this time needs extra care for recovery. The
-	  newer mdadm takes a copy of the data in the critical section
-	  and will restore it, if necessary, after a crash.
-
-	  The mdadm usage is e.g.
-	       mdadm --grow /dev/md1 --raid-disks=6
-	  to grow '/dev/md1' to having 6 disks.
-
-	  Note: The array can only be expanded, not contracted.
-	  There should be enough spares already present to make the new
-	  array workable.
-
-	  If unsure, say Y.
+config MD_RAID6_PQ
+	tristate
 
 config MD_MULTIPATH
 	tristate "Multipath I/O support"
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 72880b7e28d9..45cc5951d928 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -2,20 +2,21 @@
 # Makefile for the kernel software RAID and LVM drivers.
 #
 
-dm-mod-objs := dm.o dm-table.o dm-target.o dm-linear.o dm-stripe.o \
+dm-mod-y += dm.o dm-table.o dm-target.o dm-linear.o dm-stripe.o \
 		dm-ioctl.o dm-io.o dm-kcopyd.o dm-sysfs.o
-dm-multipath-objs := dm-path-selector.o dm-mpath.o
-dm-snapshot-objs := dm-snap.o dm-exception-store.o dm-snap-transient.o \
+dm-multipath-y += dm-path-selector.o dm-mpath.o
+dm-snapshot-y += dm-snap.o dm-exception-store.o dm-snap-transient.o \
 		dm-snap-persistent.o
-dm-mirror-objs := dm-raid1.o
-md-mod-objs := md.o bitmap.o
-raid456-objs := raid5.o raid6algos.o raid6recov.o raid6tables.o \
+dm-mirror-y += dm-raid1.o
+md-mod-y += md.o bitmap.o
+raid456-y += raid5.o
+raid6_pq-y += raid6algos.o raid6recov.o raid6tables.o \
 		raid6int1.o raid6int2.o raid6int4.o \
 		raid6int8.o raid6int16.o raid6int32.o \
 		raid6altivec1.o raid6altivec2.o raid6altivec4.o \
 		raid6altivec8.o \
 		raid6mmx.o raid6sse1.o raid6sse2.o
-hostprogs-y := mktables
+hostprogs-y += mktables
 
 # Note: link order is important. All raid personalities
 # and must come before md.o, as they each initialise
@@ -26,6 +27,7 @@ obj-$(CONFIG_MD_LINEAR) += linear.o
 obj-$(CONFIG_MD_RAID0) += raid0.o
 obj-$(CONFIG_MD_RAID1) += raid1.o
 obj-$(CONFIG_MD_RAID10) += raid10.o
+obj-$(CONFIG_MD_RAID6_PQ) += raid6_pq.o
 obj-$(CONFIG_MD_RAID456) += raid456.o
 obj-$(CONFIG_MD_MULTIPATH) += multipath.o
 obj-$(CONFIG_MD_FAULTY) += faulty.o
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 719943763391..f8a9f7ab2cb8 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -16,6 +16,7 @@
  * wait if count gets too high, wake when it drops to half.
  */
 
+#include <linux/blkdev.h>
 #include <linux/module.h>
 #include <linux/errno.h>
 #include <linux/slab.h>
@@ -26,8 +27,8 @@
 #include <linux/file.h>
 #include <linux/mount.h>
 #include <linux/buffer_head.h>
-#include <linux/raid/md.h>
-#include <linux/raid/bitmap.h>
+#include "md.h"
+#include "bitmap.h"
 
 /* debug macros */
 
@@ -111,9 +112,10 @@ static int bitmap_checkpage(struct bitmap *bitmap, unsigned long page, int creat
 	unsigned char *mappage;
 
 	if (page >= bitmap->pages) {
-		printk(KERN_ALERT
-			"%s: invalid bitmap page request: %lu (> %lu)\n",
-			bmname(bitmap), page, bitmap->pages-1);
+		/* This can happen if bitmap_start_sync goes beyond
+		 * End-of-device while looking for a whole page.
+		 * It is harmless.
+		 */
 		return -EINVAL;
 	}
 
@@ -265,7 +267,6 @@ static mdk_rdev_t *next_active_rdev(mdk_rdev_t *rdev, mddev_t *mddev)
 	list_for_each_continue_rcu(pos, &mddev->disks) {
 		rdev = list_entry(pos, mdk_rdev_t, same_set);
 		if (rdev->raid_disk >= 0 &&
-		    test_bit(In_sync, &rdev->flags) &&
 		    !test_bit(Faulty, &rdev->flags)) {
 			/* this is a usable devices */
 			atomic_inc(&rdev->nr_pending);
@@ -297,7 +298,7 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
 		    + size/512 > 0)
 			/* bitmap runs in to metadata */
 			goto bad_alignment;
-		if (rdev->data_offset + mddev->size*2
+		if (rdev->data_offset + mddev->dev_sectors
 		    > rdev->sb_start + bitmap->offset)
 			/* data runs in to bitmap */
 			goto bad_alignment;
@@ -570,7 +571,7 @@ static int bitmap_read_sb(struct bitmap *bitmap)
 	else if (le32_to_cpu(sb->version) < BITMAP_MAJOR_LO ||
 		 le32_to_cpu(sb->version) > BITMAP_MAJOR_HI)
 		reason = "unrecognized superblock version";
-	else if (chunksize < PAGE_SIZE)
+	else if (chunksize < 512)
 		reason = "bitmap chunksize too small";
 	else if ((1 << ffz(~chunksize)) != chunksize)
 		reason = "bitmap chunksize not a power of 2";
@@ -1306,6 +1307,9 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto
 		PRINTK(KERN_DEBUG "dec write-behind count %d/%d\n",
 		  atomic_read(&bitmap->behind_writes), bitmap->max_write_behind);
 	}
+	if (bitmap->mddev->degraded)
+		/* Never clear bits or update events_cleared when degraded */
+		success = 0;
 
 	while (sectors) {
 		int blocks;
@@ -1345,8 +1349,8 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto
 	}
 }
 
-int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks,
-		      int degraded)
+static int __bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks,
+			       int degraded)
 {
 	bitmap_counter_t *bmc;
 	int rv;
@@ -1374,6 +1378,29 @@ int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks,
 	return rv;
 }
 
+int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks,
+		      int degraded)
+{
+	/* bitmap_start_sync must always report on multiples of whole
+	 * pages, otherwise resync (which is very PAGE_SIZE based) will
+	 * get confused.
+	 * So call __bitmap_start_sync repeatedly (if needed) until
+	 * At least PAGE_SIZE>>9 blocks are covered.
+	 * Return the 'or' of the result.
+	 */
+	int rv = 0;
+	int blocks1;
+
+	*blocks = 0;
+	while (*blocks < (PAGE_SIZE>>9)) {
+		rv |= __bitmap_start_sync(bitmap, offset,
+					  &blocks1, degraded);
+		offset += blocks1;
+		*blocks += blocks1;
+	}
+	return rv;
+}
+
 void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, int *blocks, int aborted)
 {
 	bitmap_counter_t *bmc;
@@ -1443,6 +1470,8 @@ void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector)
 	wait_event(bitmap->mddev->recovery_wait,
 		   atomic_read(&bitmap->mddev->recovery_active) == 0);
 
+	bitmap->mddev->curr_resync_completed = bitmap->mddev->curr_resync;
+	set_bit(MD_CHANGE_CLEAN, &bitmap->mddev->flags);
 	sector &= ~((1ULL << CHUNK_BLOCK_SHIFT(bitmap)) - 1);
 	s = 0;
 	while (s < sector && s < bitmap->mddev->resync_max_sectors) {
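
Aside (not part of the patch): the new bitmap_start_sync() wrapper above only reports in multiples of whole pages, because resync works in PAGE_SIZE units. A minimal user-space C sketch of that accumulation loop, with a stub standing in for __bitmap_start_sync() and a hypothetical 2-sector extent size:

#include <stdio.h>

#define PAGE_SIZE 4096			/* assume 4 KiB pages */

/* stand-in for __bitmap_start_sync(): report a 2-sector extent each call */
static int stub_start_sync(unsigned long long offset, int *blocks)
{
	*blocks = 2;
	return offset < 4;		/* pretend only the first two extents need syncing */
}

int main(void)
{
	unsigned long long offset = 0;
	int blocks = 0, blocks1, rv = 0;

	while (blocks < (PAGE_SIZE >> 9)) {	/* loop until >= 8 sectors covered */
		rv |= stub_start_sync(offset, &blocks1);
		offset += blocks1;
		blocks += blocks1;
	}
	printf("covered %d sectors, need-sync=%d\n", blocks, rv);	/* 8, 1 */
	return 0;
}
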
diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h
new file mode 100644
index 000000000000..e98900671ca9
--- /dev/null
+++ b/drivers/md/bitmap.h
@@ -0,0 +1,288 @@
+/*
+ * bitmap.h: Copyright (C) Peter T. Breuer (ptb@ot.uc3m.es) 2003
+ *
+ * additions: Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.
+ */
+#ifndef BITMAP_H
+#define BITMAP_H 1
+
+#define BITMAP_MAJOR_LO 3
+/* version 4 insists the bitmap is in little-endian order
+ * with version 3, it is host-endian which is non-portable
+ */
+#define BITMAP_MAJOR_HI 4
+#define BITMAP_MAJOR_HOSTENDIAN 3
+
+#define BITMAP_MINOR 39
+
+/*
+ * in-memory bitmap:
+ *
+ * Use 16 bit block counters to track pending writes to each "chunk".
+ * The 2 high order bits are special-purpose, the first is a flag indicating
+ * whether a resync is needed.  The second is a flag indicating whether a
+ * resync is active.
+ * This means that the counter is actually 14 bits:
+ *
+ * +--------+--------+------------------------------------------------+
+ * | resync | resync |               counter                          |
+ * | needed | active |                                                |
+ * |  (0-1) |  (0-1) |              (0-16383)                         |
+ * +--------+--------+------------------------------------------------+
+ *
+ * The "resync needed" bit is set when:
+ *    a '1' bit is read from storage at startup.
+ *    a write request fails on some drives
+ *    a resync is aborted on a chunk with 'resync active' set
+ * It is cleared (and resync-active set) when a resync starts across all drives
+ * of the chunk.
+ *
+ *
+ * The "resync active" bit is set when:
+ *    a resync is started on all drives, and resync_needed is set.
+ *       resync_needed will be cleared (as long as resync_active wasn't already set).
+ * It is cleared when a resync completes.
+ *
+ * The counter counts pending write requests, plus the on-disk bit.
+ * When the counter is '1' and the resync bits are clear, the on-disk
+ * bit can be cleared aswell, thus setting the counter to 0.
+ * When we set a bit, or in the counter (to start a write), if the fields is
+ * 0, we first set the disk bit and set the counter to 1.
+ *
+ * If the counter is 0, the on-disk bit is clear and the stipe is clean
+ * Anything that dirties the stipe pushes the counter to 2 (at least)
+ * and sets the on-disk bit (lazily).
+ * If a periodic sweep find the counter at 2, it is decremented to 1.
+ * If the sweep find the counter at 1, the on-disk bit is cleared and the
+ * counter goes to zero.
+ *
+ * Also, we'll hijack the "map" pointer itself and use it as two 16 bit block
+ * counters as a fallback when "page" memory cannot be allocated:
+ *
+ * Normal case (page memory allocated):
+ *
+ *     page pointer (32-bit)
+ *
+ *     [ ] ------+
+ *               |
+ *               +-------> [   ][   ]..[   ] (4096 byte page == 2048 counters)
+ *                          c1   c2    c2048
+ *
+ * Hijacked case (page memory allocation failed):
+ *
+ *     hijacked page pointer (32-bit)
+ *
+ *     [              ][              ] (no page memory allocated)
+ *       counter #1 (16-bit) counter #2 (16-bit)
+ *
+ */
+
+#ifdef __KERNEL__
+
+#define PAGE_BITS (PAGE_SIZE << 3)
+#define PAGE_BIT_SHIFT (PAGE_SHIFT + 3)
+
+typedef __u16 bitmap_counter_t;
+#define COUNTER_BITS 16
+#define COUNTER_BIT_SHIFT 4
+#define COUNTER_BYTE_RATIO (COUNTER_BITS / 8)
+#define COUNTER_BYTE_SHIFT (COUNTER_BIT_SHIFT - 3)
+
+#define NEEDED_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 1)))
+#define RESYNC_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 2)))
+#define COUNTER_MAX ((bitmap_counter_t) RESYNC_MASK - 1)
+#define NEEDED(x) (((bitmap_counter_t) x) & NEEDED_MASK)
+#define RESYNC(x) (((bitmap_counter_t) x) & RESYNC_MASK)
+#define COUNTER(x) (((bitmap_counter_t) x) & COUNTER_MAX)
+
+/* how many counters per page? */
+#define PAGE_COUNTER_RATIO (PAGE_BITS / COUNTER_BITS)
+/* same, except a shift value for more efficient bitops */
+#define PAGE_COUNTER_SHIFT (PAGE_BIT_SHIFT - COUNTER_BIT_SHIFT)
+/* same, except a mask value for more efficient bitops */
+#define PAGE_COUNTER_MASK (PAGE_COUNTER_RATIO - 1)
+
+#define BITMAP_BLOCK_SIZE 512
+#define BITMAP_BLOCK_SHIFT 9
+
+/* how many blocks per chunk? (this is variable) */
+#define CHUNK_BLOCK_RATIO(bitmap) ((bitmap)->chunksize >> BITMAP_BLOCK_SHIFT)
+#define CHUNK_BLOCK_SHIFT(bitmap) ((bitmap)->chunkshift - BITMAP_BLOCK_SHIFT)
+#define CHUNK_BLOCK_MASK(bitmap) (CHUNK_BLOCK_RATIO(bitmap) - 1)
+
+/* when hijacked, the counters and bits represent even larger "chunks" */
+/* there will be 1024 chunks represented by each counter in the page pointers */
+#define PAGEPTR_BLOCK_RATIO(bitmap) \
+			(CHUNK_BLOCK_RATIO(bitmap) << PAGE_COUNTER_SHIFT >> 1)
+#define PAGEPTR_BLOCK_SHIFT(bitmap) \
+			(CHUNK_BLOCK_SHIFT(bitmap) + PAGE_COUNTER_SHIFT - 1)
+#define PAGEPTR_BLOCK_MASK(bitmap) (PAGEPTR_BLOCK_RATIO(bitmap) - 1)
+
+/*
+ * on-disk bitmap:
+ *
+ * Use one bit per "chunk" (block set). We do the disk I/O on the bitmap
+ * file a page at a time. There's a superblock at the start of the file.
+ */
+
+/* map chunks (bits) to file pages - offset by the size of the superblock */
+#define CHUNK_BIT_OFFSET(chunk) ((chunk) + (sizeof(bitmap_super_t) << 3))
+
+#endif
+
+/*
+ * bitmap structures:
+ */
+
+#define BITMAP_MAGIC 0x6d746962
+
+/* use these for bitmap->flags and bitmap->sb->state bit-fields */
+enum bitmap_state {
+	BITMAP_STALE = 0x002,  /* the bitmap file is out of date or had -EIO */
+	BITMAP_WRITE_ERROR = 0x004, /* A write error has occurred */
+	BITMAP_HOSTENDIAN = 0x8000,
+};
+
+/* the superblock at the front of the bitmap file -- little endian */
+typedef struct bitmap_super_s {
+	__le32 magic;        /*  0  BITMAP_MAGIC */
+	__le32 version;      /*  4  the bitmap major for now, could change... */
+	__u8  uuid[16];      /*  8  128 bit uuid - must match md device uuid */
+	__le64 events;       /* 24  event counter for the bitmap (1)*/
+	__le64 events_cleared;/*32  event counter when last bit cleared (2) */
+	__le64 sync_size;    /* 40  the size of the md device's sync range(3) */
+	__le32 state;        /* 48  bitmap state information */
+	__le32 chunksize;    /* 52  the bitmap chunk size in bytes */
+	__le32 daemon_sleep; /* 56  seconds between disk flushes */
+	__le32 write_behind; /* 60  number of outstanding write-behind writes */
+
+	__u8  pad[256 - 64]; /* set to zero */
+} bitmap_super_t;
+
+/* notes:
+ * (1) This event counter is updated before the eventcounter in the md superblock
+ *     When a bitmap is loaded, it is only accepted if this event counter is equal
+ *     to, or one greater than, the event counter in the superblock.
+ * (2) This event counter is updated when the other one is *if*and*only*if* the
+ *     array is not degraded. As bits are not cleared when the array is degraded,
+ *     this represents the last time that any bits were cleared.
+ *     If a device is being added that has an event count with this value or
+ *     higher, it is accepted as conforming to the bitmap.
+ * (3)This is the number of sectors represented by the bitmap, and is the range that
+ *    resync happens across. For raid1 and raid5/6 it is the size of individual
+ *    devices. For raid10 it is the size of the array.
+ */
+
+#ifdef __KERNEL__
+
+/* the in-memory bitmap is represented by bitmap_pages */
+struct bitmap_page {
+	/*
+	 * map points to the actual memory page
+	 */
+	char *map;
+	/*
+	 * in emergencies (when map cannot be alloced), hijack the map
+	 * pointer and use it as two counters itself
+	 */
+	unsigned int hijacked:1;
+	/*
+	 * count of dirty bits on the page
+	 */
+	unsigned int count:31;
+};
+
+/* keep track of bitmap file pages that have pending writes on them */
+struct page_list {
+	struct list_head list;
+	struct page *page;
+};
+
+/* the main bitmap structure - one per mddev */
+struct bitmap {
+	struct bitmap_page *bp;
+	unsigned long pages; /* total number of pages in the bitmap */
+	unsigned long missing_pages; /* number of pages not yet allocated */
+
+	mddev_t *mddev; /* the md device that the bitmap is for */
+
+	int counter_bits; /* how many bits per block counter */
+
+	/* bitmap chunksize -- how much data does each bit represent? */
+	unsigned long chunksize;
+	unsigned long chunkshift; /* chunksize = 2^chunkshift (for bitops) */
+	unsigned long chunks; /* total number of data chunks for the array */
+
+	/* We hold a count on the chunk currently being synced, and drop
+	 * it when the last block is started.  If the resync is aborted
+	 * midway, we need to be able to drop that count, so we remember
+	 * the counted chunk..
+	 */
+	unsigned long syncchunk;
+
+	__u64 events_cleared;
+	int need_sync;
+
+	/* bitmap spinlock */
+	spinlock_t lock;
+
+	long offset; /* offset from superblock if file is NULL */
+	struct file *file; /* backing disk file */
+	struct page *sb_page; /* cached copy of the bitmap file superblock */
+	struct page **filemap; /* list of cache pages for the file */
+	unsigned long *filemap_attr; /* attributes associated w/ filemap pages */
+	unsigned long file_pages; /* number of pages in the file */
+	int last_page_size; /* bytes in the last page */
+
+	unsigned long flags;
+
+	int allclean;
+
+	unsigned long max_write_behind; /* write-behind mode */
+	atomic_t behind_writes;
+
+	/*
+	 * the bitmap daemon - periodically wakes up and sweeps the bitmap
+	 * file, cleaning up bits and flushing out pages to disk as necessary
+	 */
+	unsigned long daemon_lastrun; /* jiffies of last run */
+	unsigned long daemon_sleep; /* how many seconds between updates? */
+	unsigned long last_end_sync; /* when we lasted called end_sync to
+				      * update bitmap with resync progress */
+
+	atomic_t pending_writes; /* pending writes to the bitmap file */
+	wait_queue_head_t write_wait;
+	wait_queue_head_t overflow_wait;
+
+};
+
+/* the bitmap API */
+
+/* these are used only by md/bitmap */
+int  bitmap_create(mddev_t *mddev);
+void bitmap_flush(mddev_t *mddev);
+void bitmap_destroy(mddev_t *mddev);
+
+void bitmap_print_sb(struct bitmap *bitmap);
+void bitmap_update_sb(struct bitmap *bitmap);
+
+int  bitmap_setallbits(struct bitmap *bitmap);
+void bitmap_write_all(struct bitmap *bitmap);
+
+void bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long e);
+
+/* these are exported */
+int bitmap_startwrite(struct bitmap *bitmap, sector_t offset,
+			unsigned long sectors, int behind);
+void bitmap_endwrite(struct bitmap *bitmap, sector_t offset,
+			unsigned long sectors, int success, int behind);
+int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks, int degraded);
+void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, int *blocks, int aborted);
+void bitmap_close_sync(struct bitmap *bitmap);
+void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector);
+
+void bitmap_unplug(struct bitmap *bitmap);
+void bitmap_daemon_work(struct bitmap *bitmap);
+#endif
+
+#endif
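
Aside (not part of the patch): the 16-bit counter layout documented in bitmap.h can be decomposed with the NEEDED/RESYNC/COUNTER macros defined above. A small user-space sketch with a made-up counter value:

#include <stdio.h>

typedef unsigned short bitmap_counter_t;	/* __u16 in the kernel */
#define COUNTER_BITS 16
#define NEEDED_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 1)))
#define RESYNC_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 2)))
#define COUNTER_MAX ((bitmap_counter_t) RESYNC_MASK - 1)
#define NEEDED(x) (((bitmap_counter_t) x) & NEEDED_MASK)
#define RESYNC(x) (((bitmap_counter_t) x) & RESYNC_MASK)
#define COUNTER(x) (((bitmap_counter_t) x) & COUNTER_MAX)

int main(void)
{
	/* resync-needed set, resync-active clear, 3 pending writes + the on-disk bit */
	bitmap_counter_t c = NEEDED_MASK | 4;

	printf("needed=%d active=%d count=%u\n",
	       NEEDED(c) != 0, RESYNC(c) != 0, (unsigned)COUNTER(c));	/* 1 0 4 */
	return 0;
}
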
diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c
index 86d9adf90e79..8695809b24b0 100644
--- a/drivers/md/faulty.c
+++ b/drivers/md/faulty.c
@@ -62,7 +62,10 @@
 #define ModeShift 5
 
 #define MaxFault 50
-#include <linux/raid/md.h>
+#include <linux/blkdev.h>
+#include <linux/raid/md_u.h>
+#include "md.h"
+#include <linux/seq_file.h>
 
 
 static void faulty_fail(struct bio *bio, int error)
@@ -280,6 +283,17 @@ static int reconfig(mddev_t *mddev, int layout, int chunk_size)
 	return 0;
 }
 
+static sector_t faulty_size(mddev_t *mddev, sector_t sectors, int raid_disks)
+{
+	WARN_ONCE(raid_disks,
+		  "%s does not support generic reshape\n", __func__);
+
+	if (sectors == 0)
+		return mddev->dev_sectors;
+
+	return sectors;
+}
+
 static int run(mddev_t *mddev)
 {
 	mdk_rdev_t *rdev;
@@ -298,7 +312,7 @@ static int run(mddev_t *mddev)
 	list_for_each_entry(rdev, &mddev->disks, same_set)
 		conf->rdev = rdev;
 
-	mddev->array_sectors = mddev->size * 2;
+	md_set_array_sectors(mddev, faulty_size(mddev, 0, 0));
 	mddev->private = conf;
 
 	reconfig(mddev, mddev->layout, -1);
@@ -325,6 +339,7 @@ static struct mdk_personality faulty_personality =
 	.stop = stop,
 	.status = status,
 	.reconfig = reconfig,
+	.size = faulty_size,
 };
 
 static int __init raid_init(void)
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 09658b218474..7a36e38393a1 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -16,7 +16,11 @@
    Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */
 
-#include <linux/raid/linear.h>
+#include <linux/blkdev.h>
+#include <linux/raid/md_u.h>
+#include <linux/seq_file.h>
+#include "md.h"
+#include "linear.h"
 
 /*
  * find which device holds a particular offset
@@ -97,6 +101,16 @@ static int linear_congested(void *data, int bits)
 	return ret;
 }
 
+static sector_t linear_size(mddev_t *mddev, sector_t sectors, int raid_disks)
+{
+	linear_conf_t *conf = mddev_to_conf(mddev);
+
+	WARN_ONCE(sectors || raid_disks,
+		  "%s does not support generic reshape\n", __func__);
+
+	return conf->array_sectors;
+}
+
 static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
 {
 	linear_conf_t *conf;
@@ -135,8 +149,8 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
 		    mddev->queue->max_sectors > (PAGE_SIZE>>9))
 			blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
 
-		disk->num_sectors = rdev->size * 2;
-		conf->array_sectors += rdev->size * 2;
+		disk->num_sectors = rdev->sectors;
+		conf->array_sectors += rdev->sectors;
 
 		cnt++;
 	}
@@ -249,7 +263,7 @@ static int linear_run (mddev_t *mddev)
 	if (!conf)
 		return 1;
 	mddev->private = conf;
-	mddev->array_sectors = conf->array_sectors;
+	md_set_array_sectors(mddev, linear_size(mddev, 0, 0));
 
 	blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec);
 	mddev->queue->unplug_fn = linear_unplug;
@@ -283,7 +297,7 @@ static int linear_add(mddev_t *mddev, mdk_rdev_t *rdev)
 	newconf->prev = mddev_to_conf(mddev);
 	mddev->private = newconf;
 	mddev->raid_disks++;
-	mddev->array_sectors = newconf->array_sectors;
+	md_set_array_sectors(mddev, linear_size(mddev, 0, 0));
 	set_capacity(mddev->gendisk, mddev->array_sectors);
 	return 0;
 }
@@ -381,6 +395,7 @@ static struct mdk_personality linear_personality =
 	.stop = linear_stop,
 	.status = linear_status,
 	.hot_add_disk = linear_add,
+	.size = linear_size,
 };
 
 static int __init linear_init (void)
diff --git a/drivers/md/linear.h b/drivers/md/linear.h
new file mode 100644
index 000000000000..bf8179587f95
--- /dev/null
+++ b/drivers/md/linear.h
@@ -0,0 +1,29 @@
+#ifndef _LINEAR_H
+#define _LINEAR_H
+
+struct dev_info {
+	mdk_rdev_t *rdev;
+	sector_t num_sectors;
+	sector_t start_sector;
+};
+
+typedef struct dev_info dev_info_t;
+
+struct linear_private_data
+{
+	struct linear_private_data *prev; /* earlier version */
+	dev_info_t **hash_table;
+	sector_t spacing;
+	sector_t array_sectors;
+	int sector_shift; /* shift before dividing
+			   * by spacing
+			   */
+	dev_info_t disks[0];
+};
+
+
+typedef struct linear_private_data linear_conf_t;
+
+#define mddev_to_conf(mddev) ((linear_conf_t *) mddev->private)
+
+#endif
diff --git a/drivers/md/md.c b/drivers/md/md.c
index a307f87eb90e..ed5727c089a9 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -33,9 +33,9 @@
 */
 
 #include <linux/kthread.h>
-#include <linux/raid/md.h>
-#include <linux/raid/bitmap.h>
+#include <linux/blkdev.h>
 #include <linux/sysctl.h>
+#include <linux/seq_file.h>
 #include <linux/buffer_head.h> /* for invalidate_bdev */
 #include <linux/poll.h>
 #include <linux/ctype.h>
@@ -45,11 +45,10 @@
 #include <linux/reboot.h>
 #include <linux/file.h>
 #include <linux/delay.h>
-
-#define MAJOR_NR MD_MAJOR
-
-/* 63 partitions with the alternate major number (mdp) */
-#define MdpMinorShift 6
+#include <linux/raid/md_p.h>
+#include <linux/raid/md_u.h>
+#include "md.h"
+#include "bitmap.h"
 
 #define DEBUG 0
 #define dprintk(x...) ((void)(DEBUG && printk(x)))
@@ -202,12 +201,68 @@ static DEFINE_SPINLOCK(all_mddevs_lock);
 	)
 
 
-static int md_fail_request(struct request_queue *q, struct bio *bio)
+/* Rather than calling directly into the personality make_request function,
+ * IO requests come here first so that we can check if the device is
+ * being suspended pending a reconfiguration.
+ * We hold a refcount over the call to ->make_request. By the time that
+ * call has finished, the bio has been linked into some internal structure
+ * and so is visible to ->quiesce(), so we don't need the refcount any more.
+ */
+static int md_make_request(struct request_queue *q, struct bio *bio)
 {
-	bio_io_error(bio);
-	return 0;
+	mddev_t *mddev = q->queuedata;
+	int rv;
+	if (mddev == NULL || mddev->pers == NULL) {
+		bio_io_error(bio);
+		return 0;
+	}
+	rcu_read_lock();
+	if (mddev->suspended) {
+		DEFINE_WAIT(__wait);
+		for (;;) {
+			prepare_to_wait(&mddev->sb_wait, &__wait,
+					TASK_UNINTERRUPTIBLE);
+			if (!mddev->suspended)
+				break;
+			rcu_read_unlock();
+			schedule();
+			rcu_read_lock();
+		}
+		finish_wait(&mddev->sb_wait, &__wait);
+	}
+	atomic_inc(&mddev->active_io);
+	rcu_read_unlock();
+	rv = mddev->pers->make_request(q, bio);
+	if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended)
+		wake_up(&mddev->sb_wait);
+
+	return rv;
+}
+
+static void mddev_suspend(mddev_t *mddev)
+{
+	BUG_ON(mddev->suspended);
+	mddev->suspended = 1;
+	synchronize_rcu();
+	wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0);
+	mddev->pers->quiesce(mddev, 1);
+	md_unregister_thread(mddev->thread);
+	mddev->thread = NULL;
+	/* we now know that no code is executing in the personality module,
+	 * except possibly the tail end of a ->bi_end_io function, but that
+	 * is certain to complete before the module has a chance to get
+	 * unloaded
+	 */
+}
+
+static void mddev_resume(mddev_t *mddev)
+{
+	mddev->suspended = 0;
+	wake_up(&mddev->sb_wait);
+	mddev->pers->quiesce(mddev, 0);
 }
 
+
 static inline mddev_t *mddev_get(mddev_t *mddev)
 {
 	atomic_inc(&mddev->active);
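
Aside (not part of the patch): the comment above describes the suspend handshake that the new level_store() later relies on: md_make_request() holds an active_io reference around ->make_request(), and mddev_suspend() sets ->suspended, runs synchronize_rcu(), then waits for active_io to drain before calling ->quiesce(). A single-threaded user-space model of that counting, with plain ints standing in for the kernel's atomics, RCU and wait queues (the names below are illustrative only):

#include <stdio.h>

static int suspended;	/* models mddev->suspended */
static int active_io;	/* models mddev->active_io */

static void make_request(void)
{
	if (suspended) {		/* kernel: sleep on sb_wait instead */
		printf("request held until resume\n");
		return;
	}
	active_io++;			/* pin the array across ->make_request */
	/* ... personality ->make_request(q, bio) would run here ... */
	if (--active_io == 0 && suspended)
		printf("last request done, waking suspender\n");
}

static void suspend(void)
{
	suspended = 1;			/* new requests now wait */
	/* kernel: synchronize_rcu(), then wait_event() until active_io == 0 */
	while (active_io)
		;			/* already drained in this single-threaded model */
	/* safe to call ->quiesce(mddev, 1) and swap the personality here */
}

int main(void)
{
	make_request();			/* completes, active_io back to 0 */
	suspend();
	make_request();			/* held while suspended */
	suspended = 0;			/* mddev_resume() */
	make_request();
	return 0;
}
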
@@ -310,6 +365,7 @@ static mddev_t * mddev_find(dev_t unit)
 	init_timer(&new->safemode_timer);
 	atomic_set(&new->active, 1);
 	atomic_set(&new->openers, 0);
+	atomic_set(&new->active_io, 0);
 	spin_lock_init(&new->write_lock);
 	init_waitqueue_head(&new->sb_wait);
 	init_waitqueue_head(&new->recovery_wait);
@@ -326,6 +382,11 @@ static inline int mddev_lock(mddev_t * mddev)
 	return mutex_lock_interruptible(&mddev->reconfig_mutex);
 }
 
+static inline int mddev_is_locked(mddev_t *mddev)
+{
+	return mutex_is_locked(&mddev->reconfig_mutex);
+}
+
 static inline int mddev_trylock(mddev_t * mddev)
 {
 	return mutex_trylock(&mddev->reconfig_mutex);
@@ -409,7 +470,7 @@ static void free_disk_sb(mdk_rdev_t * rdev)
 		rdev->sb_loaded = 0;
 		rdev->sb_page = NULL;
 		rdev->sb_start = 0;
-		rdev->size = 0;
+		rdev->sectors = 0;
 	}
 }
 
@@ -775,9 +836,9 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version
 	else
 		ret = 0;
 	}
-	rdev->size = calc_num_sectors(rdev, sb->chunk_size) / 2;
+	rdev->sectors = calc_num_sectors(rdev, sb->chunk_size);
 
-	if (rdev->size < sb->size && sb->level > 1)
+	if (rdev->sectors < sb->size * 2 && sb->level > 1)
 		/* "this cannot possibly happen" ... */
 		ret = -EINVAL;
 
@@ -812,7 +873,7 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 		mddev->clevel[0] = 0;
 		mddev->layout = sb->layout;
 		mddev->raid_disks = sb->raid_disks;
-		mddev->size = sb->size;
+		mddev->dev_sectors = sb->size * 2;
 		mddev->events = ev1;
 		mddev->bitmap_offset = 0;
 		mddev->default_bitmap_offset = MD_SB_BYTES >> 9;
@@ -926,7 +987,7 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
 
 	sb->ctime = mddev->ctime;
 	sb->level = mddev->level;
-	sb->size = mddev->size;
+	sb->size = mddev->dev_sectors / 2;
 	sb->raid_disks = mddev->raid_disks;
 	sb->md_minor = mddev->md_minor;
 	sb->not_persistent = 0;
@@ -1024,7 +1085,7 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
 static unsigned long long
 super_90_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
 {
-	if (num_sectors && num_sectors < rdev->mddev->size * 2)
+	if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
 		return 0; /* component must fit device */
 	if (rdev->mddev->bitmap_offset)
 		return 0; /* can't move bitmap */
@@ -1180,16 +1241,17 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
 		ret = 0;
 	}
 	if (minor_version)
-		rdev->size = ((rdev->bdev->bd_inode->i_size>>9) - le64_to_cpu(sb->data_offset)) / 2;
+		rdev->sectors = (rdev->bdev->bd_inode->i_size >> 9) -
+			le64_to_cpu(sb->data_offset);
 	else
-		rdev->size = rdev->sb_start / 2;
-	if (rdev->size < le64_to_cpu(sb->data_size)/2)
+		rdev->sectors = rdev->sb_start;
+	if (rdev->sectors < le64_to_cpu(sb->data_size))
 		return -EINVAL;
-	rdev->size = le64_to_cpu(sb->data_size)/2;
+	rdev->sectors = le64_to_cpu(sb->data_size);
 	if (le32_to_cpu(sb->chunksize))
-		rdev->size &= ~((sector_t)le32_to_cpu(sb->chunksize)/2 - 1);
+		rdev->sectors &= ~((sector_t)le32_to_cpu(sb->chunksize) - 1);
 
-	if (le64_to_cpu(sb->size) > rdev->size*2)
+	if (le64_to_cpu(sb->size) > rdev->sectors)
 		return -EINVAL;
 	return ret;
 }
@@ -1216,7 +1278,7 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 		mddev->clevel[0] = 0;
 		mddev->layout = le32_to_cpu(sb->layout);
 		mddev->raid_disks = le32_to_cpu(sb->raid_disks);
-		mddev->size = le64_to_cpu(sb->size)/2;
+		mddev->dev_sectors = le64_to_cpu(sb->size);
 		mddev->events = ev1;
 		mddev->bitmap_offset = 0;
 		mddev->default_bitmap_offset = 1024 >> 9;
@@ -1312,7 +1374,7 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
 	sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors));
 
 	sb->raid_disks = cpu_to_le32(mddev->raid_disks);
-	sb->size = cpu_to_le64(mddev->size<<1);
+	sb->size = cpu_to_le64(mddev->dev_sectors);
 
 	if (mddev->bitmap && mddev->bitmap_file == NULL) {
 		sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset);
@@ -1320,10 +1382,15 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
 	}
 
 	if (rdev->raid_disk >= 0 &&
-	    !test_bit(In_sync, &rdev->flags) &&
-	    rdev->recovery_offset > 0) {
-		sb->feature_map |= cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
-		sb->recovery_offset = cpu_to_le64(rdev->recovery_offset);
+	    !test_bit(In_sync, &rdev->flags)) {
+		if (mddev->curr_resync_completed > rdev->recovery_offset)
+			rdev->recovery_offset = mddev->curr_resync_completed;
+		if (rdev->recovery_offset > 0) {
+			sb->feature_map |=
+				cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
+			sb->recovery_offset =
+				cpu_to_le64(rdev->recovery_offset);
+		}
 	}
 
 	if (mddev->reshape_position != MaxSector) {
@@ -1365,7 +1432,7 @@ super_1_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
 {
 	struct mdp_superblock_1 *sb;
 	sector_t max_sectors;
-	if (num_sectors && num_sectors < rdev->mddev->size * 2)
+	if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
 		return 0; /* component must fit device */
 	if (rdev->sb_start < rdev->data_offset) {
 		/* minor versions 1 and 2; superblock before data */
@@ -1381,7 +1448,7 @@ super_1_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
 		sector_t sb_start;
 		sb_start = (rdev->bdev->bd_inode->i_size >> 9) - 8*2;
 		sb_start &= ~(sector_t)(4*2 - 1);
-		max_sectors = rdev->size * 2 + sb_start - rdev->sb_start;
+		max_sectors = rdev->sectors + sb_start - rdev->sb_start;
 		if (!num_sectors || num_sectors > max_sectors)
 			num_sectors = max_sectors;
 		rdev->sb_start = sb_start;
@@ -1433,6 +1500,38 @@ static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
 
 static LIST_HEAD(pending_raid_disks);
 
+static void md_integrity_check(mdk_rdev_t *rdev, mddev_t *mddev)
+{
+	struct mdk_personality *pers = mddev->pers;
+	struct gendisk *disk = mddev->gendisk;
+	struct blk_integrity *bi_rdev = bdev_get_integrity(rdev->bdev);
+	struct blk_integrity *bi_mddev = blk_get_integrity(disk);
+
+	/* Data integrity passthrough not supported on RAID 4, 5 and 6 */
+	if (pers && pers->level >= 4 && pers->level <= 6)
+		return;
+
+	/* If rdev is integrity capable, register profile for mddev */
+	if (!bi_mddev && bi_rdev) {
+		if (blk_integrity_register(disk, bi_rdev))
+			printk(KERN_ERR "%s: %s Could not register integrity!\n",
+			       __func__, disk->disk_name);
+		else
+			printk(KERN_NOTICE "Enabling data integrity on %s\n",
+			       disk->disk_name);
+		return;
+	}
+
+	/* Check that mddev and rdev have matching profiles */
+	if (blk_integrity_compare(disk, rdev->bdev->bd_disk) < 0) {
+		printk(KERN_ERR "%s: %s/%s integrity mismatch!\n", __func__,
+		       disk->disk_name, rdev->bdev->bd_disk->disk_name);
+		printk(KERN_NOTICE "Disabling data integrity on %s\n",
+		       disk->disk_name);
+		blk_integrity_unregister(disk);
+	}
+}
+
 static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
 {
 	char b[BDEVNAME_SIZE];
@@ -1449,8 +1548,9 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
 	if (find_rdev(mddev, rdev->bdev->bd_dev))
 		return -EEXIST;
 
-	/* make sure rdev->size exceeds mddev->size */
-	if (rdev->size && (mddev->size == 0 || rdev->size < mddev->size)) {
+	/* make sure rdev->sectors exceeds mddev->dev_sectors */
+	if (rdev->sectors && (mddev->dev_sectors == 0 ||
+			      rdev->sectors < mddev->dev_sectors)) {
 		if (mddev->pers) {
 			/* Cannot change size, so fail
 			 * If mddev->level <= 0, then we don't care
@@ -1459,7 +1559,7 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
 			if (mddev->level > 0)
 				return -ENOSPC;
 		} else
-			mddev->size = rdev->size;
+			mddev->dev_sectors = rdev->sectors;
 	}
 
 	/* Verify rdev->desc_nr is unique.
@@ -1503,6 +1603,8 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
 
 	/* May as well allow recovery to be retried once */
 	mddev->recovery_disabled = 0;
+
+	md_integrity_check(rdev, mddev);
 	return 0;
 
  fail:
@@ -1713,8 +1815,8 @@ static void print_sb_1(struct mdp_superblock_1 *sb)
 static void print_rdev(mdk_rdev_t *rdev, int major_version)
 {
 	char b[BDEVNAME_SIZE];
-	printk(KERN_INFO "md: rdev %s, SZ:%08llu F:%d S:%d DN:%u\n",
-		bdevname(rdev->bdev,b), (unsigned long long)rdev->size,
+	printk(KERN_INFO "md: rdev %s, Sect:%08llu F:%d S:%d DN:%u\n",
+		bdevname(rdev->bdev, b), (unsigned long long)rdev->sectors,
 		test_bit(Faulty, &rdev->flags), test_bit(In_sync, &rdev->flags),
 		rdev->desc_nr);
 	if (rdev->sb_loaded) {
@@ -2153,7 +2255,7 @@ offset_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 		return -EINVAL;
 	if (rdev->mddev->pers && rdev->raid_disk >= 0)
 		return -EBUSY;
-	if (rdev->size && rdev->mddev->external)
+	if (rdev->sectors && rdev->mddev->external)
 		/* Must set offset before size, so overlap checks
 		 * can be sane */
 		return -EBUSY;
@@ -2167,7 +2269,7 @@ __ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store);
 static ssize_t
 rdev_size_show(mdk_rdev_t *rdev, char *page)
 {
-	return sprintf(page, "%llu\n", (unsigned long long)rdev->size);
+	return sprintf(page, "%llu\n", (unsigned long long)rdev->sectors / 2);
 }
 
 static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2)
@@ -2180,34 +2282,52 @@ static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2)
 	return 1;
 }
 
+static int strict_blocks_to_sectors(const char *buf, sector_t *sectors)
+{
+	unsigned long long blocks;
+	sector_t new;
+
+	if (strict_strtoull(buf, 10, &blocks) < 0)
+		return -EINVAL;
+
+	if (blocks & 1ULL << (8 * sizeof(blocks) - 1))
+		return -EINVAL; /* sector conversion overflow */
+
+	new = blocks * 2;
+	if (new != blocks * 2)
+		return -EINVAL; /* unsigned long long to sector_t overflow */
+
+	*sectors = new;
+	return 0;
+}
+
 static ssize_t
 rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 {
-	unsigned long long size;
-	unsigned long long oldsize = rdev->size;
 	mddev_t *my_mddev = rdev->mddev;
+	sector_t oldsectors = rdev->sectors;
+	sector_t sectors;
 
-	if (strict_strtoull(buf, 10, &size) < 0)
+	if (strict_blocks_to_sectors(buf, &sectors) < 0)
 		return -EINVAL;
 	if (my_mddev->pers && rdev->raid_disk >= 0) {
 		if (my_mddev->persistent) {
-			size = super_types[my_mddev->major_version].
-				rdev_size_change(rdev, size * 2);
-			if (!size)
+			sectors = super_types[my_mddev->major_version].
+				rdev_size_change(rdev, sectors);
+			if (!sectors)
 				return -EBUSY;
-		} else if (!size) {
-			size = (rdev->bdev->bd_inode->i_size >> 10);
-			size -= rdev->data_offset/2;
-		}
+		} else if (!sectors)
+			sectors = (rdev->bdev->bd_inode->i_size >> 9) -
+				rdev->data_offset;
 	}
-	if (size < my_mddev->size)
+	if (sectors < my_mddev->dev_sectors)
 		return -EINVAL; /* component must fit device */
 
-	rdev->size = size;
-	if (size > oldsize && my_mddev->external) {
+	rdev->sectors = sectors;
+	if (sectors > oldsectors && my_mddev->external) {
 		/* need to check that all other rdevs with the same ->bdev
 		 * do not overlap. We need to unlock the mddev to avoid
-		 * a deadlock. We have already changed rdev->size, and if
+		 * a deadlock. We have already changed rdev->sectors, and if
 		 * we have to change it back, we will have the lock again.
 		 */
 		mddev_t *mddev;
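
Aside (not part of the patch): strict_blocks_to_sectors() above reads a size in 1 KiB blocks (the unit md's sysfs size files use) and doubles it into 512-byte sectors, refusing values whose doubling would overflow. A user-space sketch of the same arithmetic, using strtoull in place of the kernel's strict_strtoull and assuming a 64-bit sector_t:

#include <stdio.h>
#include <stdlib.h>
#include <errno.h>

typedef unsigned long long sector_t;	/* assume 64-bit sectors */

static int blocks_to_sectors(const char *buf, sector_t *sectors)
{
	char *end;
	unsigned long long blocks;

	errno = 0;
	blocks = strtoull(buf, &end, 10);
	if (errno || end == buf)
		return -EINVAL;
	if (blocks & 1ULL << (8 * sizeof(blocks) - 1))
		return -EINVAL;		/* doubling would overflow 64 bits */

	*sectors = blocks * 2;		/* 1 KiB blocks -> 512 B sectors */
	return 0;
}

int main(void)
{
	sector_t s;

	if (blocks_to_sectors("1048576", &s) == 0)	/* 1 GiB expressed in KiB blocks */
		printf("%llu sectors\n", s);		/* prints 2097152 */
	return 0;
}
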
@@ -2223,9 +2343,9 @@ rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2223 if (test_bit(AllReserved, &rdev2->flags) || 2343 if (test_bit(AllReserved, &rdev2->flags) ||
2224 (rdev->bdev == rdev2->bdev && 2344 (rdev->bdev == rdev2->bdev &&
2225 rdev != rdev2 && 2345 rdev != rdev2 &&
2226 overlaps(rdev->data_offset, rdev->size * 2, 2346 overlaps(rdev->data_offset, rdev->sectors,
2227 rdev2->data_offset, 2347 rdev2->data_offset,
2228 rdev2->size * 2))) { 2348 rdev2->sectors))) {
2229 overlap = 1; 2349 overlap = 1;
2230 break; 2350 break;
2231 } 2351 }
@@ -2239,11 +2359,11 @@ rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2239 if (overlap) { 2359 if (overlap) {
2240 /* Someone else could have slipped in a size 2360 /* Someone else could have slipped in a size
2241 * change here, but doing so is just silly. 2361 * change here, but doing so is just silly.
2242 * We put oldsize back because we *know* it is 2362 * We put oldsectors back because we *know* it is
2243 * safe, and trust userspace not to race with 2363 * safe, and trust userspace not to race with
2244 * itself 2364 * itself
2245 */ 2365 */
2246 rdev->size = oldsize; 2366 rdev->sectors = oldsectors;
2247 return -EBUSY; 2367 return -EBUSY;
2248 } 2368 }
2249 } 2369 }
@@ -2547,18 +2667,101 @@ level_show(mddev_t *mddev, char *page)
2547static ssize_t 2667static ssize_t
2548level_store(mddev_t *mddev, const char *buf, size_t len) 2668level_store(mddev_t *mddev, const char *buf, size_t len)
2549{ 2669{
2670 char level[16];
2550 ssize_t rv = len; 2671 ssize_t rv = len;
2551 if (mddev->pers) 2672 struct mdk_personality *pers;
2673 void *priv;
2674
2675 if (mddev->pers == NULL) {
2676 if (len == 0)
2677 return 0;
2678 if (len >= sizeof(mddev->clevel))
2679 return -ENOSPC;
2680 strncpy(mddev->clevel, buf, len);
2681 if (mddev->clevel[len-1] == '\n')
2682 len--;
2683 mddev->clevel[len] = 0;
2684 mddev->level = LEVEL_NONE;
2685 return rv;
2686 }
2687
2688 /* request to change the personality. Need to ensure:
2689 * - array is not engaged in resync/recovery/reshape
2690 * - old personality can be suspended
2691 * - new personality will access other array.
2692 */
2693
2694 if (mddev->sync_thread || mddev->reshape_position != MaxSector)
2552 return -EBUSY; 2695 return -EBUSY;
2553 if (len == 0) 2696
2554 return 0; 2697 if (!mddev->pers->quiesce) {
2555 if (len >= sizeof(mddev->clevel)) 2698 printk(KERN_WARNING "md: %s: %s does not support online personality change\n",
2556 return -ENOSPC; 2699 mdname(mddev), mddev->pers->name);
2557 strncpy(mddev->clevel, buf, len); 2700 return -EINVAL;
2558 if (mddev->clevel[len-1] == '\n') 2701 }
2702
2703 /* Now find the new personality */
2704 if (len == 0 || len >= sizeof(level))
2705 return -EINVAL;
2706 strncpy(level, buf, len);
2707 if (level[len-1] == '\n')
2559 len--; 2708 len--;
2560 mddev->clevel[len] = 0; 2709 level[len] = 0;
2561 mddev->level = LEVEL_NONE; 2710
2711 request_module("md-%s", level);
2712 spin_lock(&pers_lock);
2713 pers = find_pers(LEVEL_NONE, level);
2714 if (!pers || !try_module_get(pers->owner)) {
2715 spin_unlock(&pers_lock);
2716 printk(KERN_WARNING "md: personality %s not loaded\n", level);
2717 return -EINVAL;
2718 }
2719 spin_unlock(&pers_lock);
2720
2721 if (pers == mddev->pers) {
2722 /* Nothing to do! */
2723 module_put(pers->owner);
2724 return rv;
2725 }
2726 if (!pers->takeover) {
2727 module_put(pers->owner);
2728 printk(KERN_WARNING "md: %s: %s does not support personality takeover\n",
2729 mdname(mddev), level);
2730 return -EINVAL;
2731 }
2732
2733 /* ->takeover must set new_* and/or delta_disks
2734 * if it succeeds, and may set them when it fails.
2735 */
2736 priv = pers->takeover(mddev);
2737 if (IS_ERR(priv)) {
2738 mddev->new_level = mddev->level;
2739 mddev->new_layout = mddev->layout;
2740 mddev->new_chunk = mddev->chunk_size;
2741 mddev->raid_disks -= mddev->delta_disks;
2742 mddev->delta_disks = 0;
2743 module_put(pers->owner);
2744 printk(KERN_WARNING "md: %s: %s would not accept array\n",
2745 mdname(mddev), level);
2746 return PTR_ERR(priv);
2747 }
2748
2749 /* Looks like we have a winner */
2750 mddev_suspend(mddev);
2751 mddev->pers->stop(mddev);
2752 module_put(mddev->pers->owner);
2753 mddev->pers = pers;
2754 mddev->private = priv;
2755 strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
2756 mddev->level = mddev->new_level;
2757 mddev->layout = mddev->new_layout;
2758 mddev->chunk_size = mddev->new_chunk;
2759 mddev->delta_disks = 0;
2760 pers->run(mddev);
2761 mddev_resume(mddev);
2762 set_bit(MD_CHANGE_DEVS, &mddev->flags);
2763 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
2764 md_wakeup_thread(mddev->thread);
2562 return rv; 2765 return rv;
2563} 2766}
2564 2767
@@ -2586,12 +2789,18 @@ layout_store(mddev_t *mddev, const char *buf, size_t len)
2586 if (!*buf || (*e && *e != '\n')) 2789 if (!*buf || (*e && *e != '\n'))
2587 return -EINVAL; 2790 return -EINVAL;
2588 2791
2589 if (mddev->pers) 2792 if (mddev->pers) {
2590 return -EBUSY; 2793 int err;
2591 if (mddev->reshape_position != MaxSector) 2794 if (mddev->pers->reconfig == NULL)
2795 return -EBUSY;
2796 err = mddev->pers->reconfig(mddev, n, -1);
2797 if (err)
2798 return err;
2799 } else {
2592 mddev->new_layout = n; 2800 mddev->new_layout = n;
2593 else 2801 if (mddev->reshape_position == MaxSector)
2594 mddev->layout = n; 2802 mddev->layout = n;
2803 }
2595 return len; 2804 return len;
2596} 2805}
2597static struct md_sysfs_entry md_layout = 2806static struct md_sysfs_entry md_layout =
@@ -2648,19 +2857,24 @@ chunk_size_show(mddev_t *mddev, char *page)
2648static ssize_t 2857static ssize_t
2649chunk_size_store(mddev_t *mddev, const char *buf, size_t len) 2858chunk_size_store(mddev_t *mddev, const char *buf, size_t len)
2650{ 2859{
2651 /* can only set chunk_size if array is not yet active */
2652 char *e; 2860 char *e;
2653 unsigned long n = simple_strtoul(buf, &e, 10); 2861 unsigned long n = simple_strtoul(buf, &e, 10);
2654 2862
2655 if (!*buf || (*e && *e != '\n')) 2863 if (!*buf || (*e && *e != '\n'))
2656 return -EINVAL; 2864 return -EINVAL;
2657 2865
2658 if (mddev->pers) 2866 if (mddev->pers) {
2659 return -EBUSY; 2867 int err;
2660 else if (mddev->reshape_position != MaxSector) 2868 if (mddev->pers->reconfig == NULL)
2869 return -EBUSY;
2870 err = mddev->pers->reconfig(mddev, -1, n);
2871 if (err)
2872 return err;
2873 } else {
2661 mddev->new_chunk = n; 2874 mddev->new_chunk = n;
2662 else 2875 if (mddev->reshape_position == MaxSector)
2663 mddev->chunk_size = n; 2876 mddev->chunk_size = n;
2877 }
2664 return len; 2878 return len;
2665} 2879}
2666static struct md_sysfs_entry md_chunk_size = 2880static struct md_sysfs_entry md_chunk_size =
@@ -2669,6 +2883,8 @@ __ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store);
2669static ssize_t 2883static ssize_t
2670resync_start_show(mddev_t *mddev, char *page) 2884resync_start_show(mddev_t *mddev, char *page)
2671{ 2885{
2886 if (mddev->recovery_cp == MaxSector)
2887 return sprintf(page, "none\n");
2672 return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp); 2888 return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp);
2673} 2889}
2674 2890
@@ -2766,7 +2982,7 @@ array_state_show(mddev_t *mddev, char *page)
2766 else { 2982 else {
2767 if (list_empty(&mddev->disks) && 2983 if (list_empty(&mddev->disks) &&
2768 mddev->raid_disks == 0 && 2984 mddev->raid_disks == 0 &&
2769 mddev->size == 0) 2985 mddev->dev_sectors == 0)
2770 st = clear; 2986 st = clear;
2771 else 2987 else
2772 st = inactive; 2988 st = inactive;
@@ -2973,7 +3189,8 @@ __ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store);
2973static ssize_t 3189static ssize_t
2974size_show(mddev_t *mddev, char *page) 3190size_show(mddev_t *mddev, char *page)
2975{ 3191{
2976 return sprintf(page, "%llu\n", (unsigned long long)mddev->size); 3192 return sprintf(page, "%llu\n",
3193 (unsigned long long)mddev->dev_sectors / 2);
2977} 3194}
2978 3195
2979static int update_size(mddev_t *mddev, sector_t num_sectors); 3196static int update_size(mddev_t *mddev, sector_t num_sectors);
@@ -2985,20 +3202,18 @@ size_store(mddev_t *mddev, const char *buf, size_t len)
2985 * not increase it (except from 0). 3202 * not increase it (except from 0).
2986 * If array is active, we can try an on-line resize 3203 * If array is active, we can try an on-line resize
2987 */ 3204 */
2988 char *e; 3205 sector_t sectors;
2989 int err = 0; 3206 int err = strict_blocks_to_sectors(buf, &sectors);
2990 unsigned long long size = simple_strtoull(buf, &e, 10);
2991 if (!*buf || *buf == '\n' ||
2992 (*e && *e != '\n'))
2993 return -EINVAL;
2994 3207
3208 if (err < 0)
3209 return err;
2995 if (mddev->pers) { 3210 if (mddev->pers) {
2996 err = update_size(mddev, size * 2); 3211 err = update_size(mddev, sectors);
2997 md_update_sb(mddev, 1); 3212 md_update_sb(mddev, 1);
2998 } else { 3213 } else {
2999 if (mddev->size == 0 || 3214 if (mddev->dev_sectors == 0 ||
3000 mddev->size > size) 3215 mddev->dev_sectors > sectors)
3001 mddev->size = size; 3216 mddev->dev_sectors = sectors;
3002 else 3217 else
3003 err = -ENOSPC; 3218 err = -ENOSPC;
3004 } 3219 }
@@ -3251,6 +3466,8 @@ static ssize_t
3251sync_speed_show(mddev_t *mddev, char *page) 3466sync_speed_show(mddev_t *mddev, char *page)
3252{ 3467{
3253 unsigned long resync, dt, db; 3468 unsigned long resync, dt, db;
3469 if (mddev->curr_resync == 0)
3470 return sprintf(page, "none\n");
3254 resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active); 3471 resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active);
3255 dt = (jiffies - mddev->resync_mark) / HZ; 3472 dt = (jiffies - mddev->resync_mark) / HZ;
3256 if (!dt) dt++; 3473 if (!dt) dt++;
@@ -3263,15 +3480,15 @@ static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed);
3263static ssize_t 3480static ssize_t
3264sync_completed_show(mddev_t *mddev, char *page) 3481sync_completed_show(mddev_t *mddev, char *page)
3265{ 3482{
3266 unsigned long max_blocks, resync; 3483 unsigned long max_sectors, resync;
3267 3484
3268 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 3485 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
3269 max_blocks = mddev->resync_max_sectors; 3486 max_sectors = mddev->resync_max_sectors;
3270 else 3487 else
3271 max_blocks = mddev->size << 1; 3488 max_sectors = mddev->dev_sectors;
3272 3489
3273 resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active)); 3490 resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active));
3274 return sprintf(page, "%lu / %lu\n", resync, max_blocks); 3491 return sprintf(page, "%lu / %lu\n", resync, max_sectors);
3275} 3492}
3276 3493
3277static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed); 3494static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed);
@@ -3431,6 +3648,57 @@ static struct md_sysfs_entry md_reshape_position =
3431__ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show, 3648__ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show,
3432 reshape_position_store); 3649 reshape_position_store);
3433 3650
3651static ssize_t
3652array_size_show(mddev_t *mddev, char *page)
3653{
3654 if (mddev->external_size)
3655 return sprintf(page, "%llu\n",
3656 (unsigned long long)mddev->array_sectors/2);
3657 else
3658 return sprintf(page, "default\n");
3659}
3660
3661static ssize_t
3662array_size_store(mddev_t *mddev, const char *buf, size_t len)
3663{
3664 sector_t sectors;
3665
3666 if (strncmp(buf, "default", 7) == 0) {
3667 if (mddev->pers)
3668 sectors = mddev->pers->size(mddev, 0, 0);
3669 else
3670 sectors = mddev->array_sectors;
3671
3672 mddev->external_size = 0;
3673 } else {
3674 if (strict_blocks_to_sectors(buf, &sectors) < 0)
3675 return -EINVAL;
3676 if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors)
3677 return -EINVAL;
3678
3679 mddev->external_size = 1;
3680 }
3681
3682 mddev->array_sectors = sectors;
3683 set_capacity(mddev->gendisk, mddev->array_sectors);
3684 if (mddev->pers) {
3685 struct block_device *bdev = bdget_disk(mddev->gendisk, 0);
3686
3687 if (bdev) {
3688 mutex_lock(&bdev->bd_inode->i_mutex);
3689 i_size_write(bdev->bd_inode,
3690 (loff_t)mddev->array_sectors << 9);
3691 mutex_unlock(&bdev->bd_inode->i_mutex);
3692 bdput(bdev);
3693 }
3694 }
3695
3696 return len;
3697}
3698
3699static struct md_sysfs_entry md_array_size =
3700__ATTR(array_size, S_IRUGO|S_IWUSR, array_size_show,
3701 array_size_store);
3434 3702
3435static struct attribute *md_default_attrs[] = { 3703static struct attribute *md_default_attrs[] = {
3436 &md_level.attr, 3704 &md_level.attr,
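The new array_size attribute accepts either the literal string "default", which reverts to whatever the personality's ->size() method reports and clears external_size, or a number of 1 KiB blocks, which pins array_sectors and sets external_size so that the md_set_array_sectors() helper introduced later in this patch will not overwrite it. As a worked example of the units: writing 1048576 requests a 1 GiB array, stored as 1048576 * 2 = 2097152 sectors in mddev->array_sectors, and shown as 1048576 again by array_size_show().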
@@ -3444,6 +3712,7 @@ static struct attribute *md_default_attrs[] = {
3444 &md_safe_delay.attr, 3712 &md_safe_delay.attr,
3445 &md_array_state.attr, 3713 &md_array_state.attr,
3446 &md_reshape_position.attr, 3714 &md_reshape_position.attr,
3715 &md_array_size.attr,
3447 NULL, 3716 NULL,
3448}; 3717};
3449 3718
@@ -3602,10 +3871,12 @@ static int md_alloc(dev_t dev, char *name)
3602 mddev_put(mddev); 3871 mddev_put(mddev);
3603 return -ENOMEM; 3872 return -ENOMEM;
3604 } 3873 }
3874 mddev->queue->queuedata = mddev;
3875
3605 /* Can be unlocked because the queue is new: no concurrency */ 3876 /* Can be unlocked because the queue is new: no concurrency */
3606 queue_flag_set_unlocked(QUEUE_FLAG_CLUSTER, mddev->queue); 3877 queue_flag_set_unlocked(QUEUE_FLAG_CLUSTER, mddev->queue);
3607 3878
3608 blk_queue_make_request(mddev->queue, md_fail_request); 3879 blk_queue_make_request(mddev->queue, md_make_request);
3609 3880
3610 disk = alloc_disk(1 << shift); 3881 disk = alloc_disk(1 << shift);
3611 if (!disk) { 3882 if (!disk) {
@@ -3731,13 +4002,13 @@ static int do_md_run(mddev_t * mddev)
3731 list_for_each_entry(rdev, &mddev->disks, same_set) { 4002 list_for_each_entry(rdev, &mddev->disks, same_set) {
3732 if (test_bit(Faulty, &rdev->flags)) 4003 if (test_bit(Faulty, &rdev->flags))
3733 continue; 4004 continue;
3734 if (rdev->size < chunk_size / 1024) { 4005 if (rdev->sectors < chunk_size / 512) {
3735 printk(KERN_WARNING 4006 printk(KERN_WARNING
3736 "md: Dev %s smaller than chunk_size:" 4007 "md: Dev %s smaller than chunk_size:"
3737 " %lluk < %dk\n", 4008 " %llu < %d\n",
3738 bdevname(rdev->bdev,b), 4009 bdevname(rdev->bdev,b),
3739 (unsigned long long)rdev->size, 4010 (unsigned long long)rdev->sectors,
3740 chunk_size / 1024); 4011 chunk_size / 512);
3741 return -EINVAL; 4012 return -EINVAL;
3742 } 4013 }
3743 } 4014 }
@@ -3761,11 +4032,11 @@ static int do_md_run(mddev_t * mddev)
3761 4032
3762 /* perform some consistency tests on the device. 4033 /* perform some consistency tests on the device.
3763 * We don't want the data to overlap the metadata, 4034 * We don't want the data to overlap the metadata,
3764 * Internal Bitmap issues has handled elsewhere. 4035 * Internal Bitmap issues have been handled elsewhere.
3765 */ 4036 */
3766 if (rdev->data_offset < rdev->sb_start) { 4037 if (rdev->data_offset < rdev->sb_start) {
3767 if (mddev->size && 4038 if (mddev->dev_sectors &&
3768 rdev->data_offset + mddev->size*2 4039 rdev->data_offset + mddev->dev_sectors
3769 > rdev->sb_start) { 4040 > rdev->sb_start) {
3770 printk("md: %s: data overlaps metadata\n", 4041 printk("md: %s: data overlaps metadata\n",
3771 mdname(mddev)); 4042 mdname(mddev));
@@ -3801,9 +4072,16 @@ static int do_md_run(mddev_t * mddev)
3801 } 4072 }
3802 mddev->pers = pers; 4073 mddev->pers = pers;
3803 spin_unlock(&pers_lock); 4074 spin_unlock(&pers_lock);
3804 mddev->level = pers->level; 4075 if (mddev->level != pers->level) {
4076 mddev->level = pers->level;
4077 mddev->new_level = pers->level;
4078 }
3805 strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); 4079 strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
3806 4080
4081 if (pers->level >= 4 && pers->level <= 6)
4082 /* Cannot support integrity (yet) */
4083 blk_integrity_unregister(mddev->gendisk);
4084
3807 if (mddev->reshape_position != MaxSector && 4085 if (mddev->reshape_position != MaxSector &&
3808 pers->start_reshape == NULL) { 4086 pers->start_reshape == NULL) {
3809 /* This personality cannot handle reshaping... */ 4087 /* This personality cannot handle reshaping... */
@@ -3843,7 +4121,9 @@ static int do_md_run(mddev_t * mddev)
3843 } 4121 }
3844 4122
3845 mddev->recovery = 0; 4123 mddev->recovery = 0;
3846 mddev->resync_max_sectors = mddev->size << 1; /* may be over-ridden by personality */ 4124 /* may be over-ridden by personality */
4125 mddev->resync_max_sectors = mddev->dev_sectors;
4126
3847 mddev->barriers_work = 1; 4127 mddev->barriers_work = 1;
3848 mddev->ok_start_degraded = start_dirty_degraded; 4128 mddev->ok_start_degraded = start_dirty_degraded;
3849 4129
@@ -3853,7 +4133,17 @@ static int do_md_run(mddev_t * mddev)
3853 err = mddev->pers->run(mddev); 4133 err = mddev->pers->run(mddev);
3854 if (err) 4134 if (err)
3855 printk(KERN_ERR "md: pers->run() failed ...\n"); 4135 printk(KERN_ERR "md: pers->run() failed ...\n");
3856 else if (mddev->pers->sync_request) { 4136 else if (mddev->pers->size(mddev, 0, 0) < mddev->array_sectors) {
4137 WARN_ONCE(!mddev->external_size, "%s: default size too small,"
4138 " but 'external_size' not in effect?\n", __func__);
4139 printk(KERN_ERR
4140 "md: invalid array_size %llu > default size %llu\n",
4141 (unsigned long long)mddev->array_sectors / 2,
4142 (unsigned long long)mddev->pers->size(mddev, 0, 0) / 2);
4143 err = -EINVAL;
4144 mddev->pers->stop(mddev);
4145 }
4146 if (err == 0 && mddev->pers->sync_request) {
3857 err = bitmap_create(mddev); 4147 err = bitmap_create(mddev);
3858 if (err) { 4148 if (err) {
3859 printk(KERN_ERR "%s: failed to create bitmap (%d)\n", 4149 printk(KERN_ERR "%s: failed to create bitmap (%d)\n",
@@ -3899,16 +4189,6 @@ static int do_md_run(mddev_t * mddev)
3899 4189
3900 set_capacity(disk, mddev->array_sectors); 4190 set_capacity(disk, mddev->array_sectors);
3901 4191
3902 /* If we call blk_queue_make_request here, it will
3903 * re-initialise max_sectors etc which may have been
3904 * refined inside -> run. So just set the bits we need to set.
3905 * Most initialisation happended when we called
3906 * blk_queue_make_request(..., md_fail_request)
3907 * earlier.
3908 */
3909 mddev->queue->queuedata = mddev;
3910 mddev->queue->make_request_fn = mddev->pers->make_request;
3911
3912 /* If there is a partially-recovered drive we need to 4192 /* If there is a partially-recovered drive we need to
3913 * start recovery here. If we leave it to md_check_recovery, 4193 * start recovery here. If we leave it to md_check_recovery,
3914 * it will remove the drives and not do the right thing 4194 * it will remove the drives and not do the right thing
@@ -4038,7 +4318,7 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
4038 md_super_wait(mddev); 4318 md_super_wait(mddev);
4039 if (mddev->ro) 4319 if (mddev->ro)
4040 set_disk_ro(disk, 0); 4320 set_disk_ro(disk, 0);
4041 blk_queue_make_request(mddev->queue, md_fail_request); 4321
4042 mddev->pers->stop(mddev); 4322 mddev->pers->stop(mddev);
4043 mddev->queue->merge_bvec_fn = NULL; 4323 mddev->queue->merge_bvec_fn = NULL;
4044 mddev->queue->unplug_fn = NULL; 4324 mddev->queue->unplug_fn = NULL;
@@ -4095,7 +4375,8 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
4095 export_array(mddev); 4375 export_array(mddev);
4096 4376
4097 mddev->array_sectors = 0; 4377 mddev->array_sectors = 0;
4098 mddev->size = 0; 4378 mddev->external_size = 0;
4379 mddev->dev_sectors = 0;
4099 mddev->raid_disks = 0; 4380 mddev->raid_disks = 0;
4100 mddev->recovery_cp = 0; 4381 mddev->recovery_cp = 0;
4101 mddev->resync_min = 0; 4382 mddev->resync_min = 0;
@@ -4135,6 +4416,7 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
4135 printk(KERN_INFO "md: %s switched to read-only mode.\n", 4416 printk(KERN_INFO "md: %s switched to read-only mode.\n",
4136 mdname(mddev)); 4417 mdname(mddev));
4137 err = 0; 4418 err = 0;
4419 blk_integrity_unregister(disk);
4138 md_new_event(mddev); 4420 md_new_event(mddev);
4139 sysfs_notify_dirent(mddev->sysfs_state); 4421 sysfs_notify_dirent(mddev->sysfs_state);
4140out: 4422out:
@@ -4300,8 +4582,8 @@ static int get_array_info(mddev_t * mddev, void __user * arg)
4300 info.patch_version = MD_PATCHLEVEL_VERSION; 4582 info.patch_version = MD_PATCHLEVEL_VERSION;
4301 info.ctime = mddev->ctime; 4583 info.ctime = mddev->ctime;
4302 info.level = mddev->level; 4584 info.level = mddev->level;
4303 info.size = mddev->size; 4585 info.size = mddev->dev_sectors / 2;
4304 if (info.size != mddev->size) /* overflow */ 4586 if (info.size != mddev->dev_sectors / 2) /* overflow */
4305 info.size = -1; 4587 info.size = -1;
4306 info.nr_disks = nr; 4588 info.nr_disks = nr;
4307 info.raid_disks = mddev->raid_disks; 4589 info.raid_disks = mddev->raid_disks;
@@ -4480,6 +4762,8 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
4480 clear_bit(In_sync, &rdev->flags); /* just to be sure */ 4762 clear_bit(In_sync, &rdev->flags); /* just to be sure */
4481 if (info->state & (1<<MD_DISK_WRITEMOSTLY)) 4763 if (info->state & (1<<MD_DISK_WRITEMOSTLY))
4482 set_bit(WriteMostly, &rdev->flags); 4764 set_bit(WriteMostly, &rdev->flags);
4765 else
4766 clear_bit(WriteMostly, &rdev->flags);
4483 4767
4484 rdev->raid_disk = -1; 4768 rdev->raid_disk = -1;
4485 err = bind_rdev_to_array(rdev, mddev); 4769 err = bind_rdev_to_array(rdev, mddev);
@@ -4543,7 +4827,7 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
4543 rdev->sb_start = rdev->bdev->bd_inode->i_size / 512; 4827 rdev->sb_start = rdev->bdev->bd_inode->i_size / 512;
4544 } else 4828 } else
4545 rdev->sb_start = calc_dev_sboffset(rdev->bdev); 4829 rdev->sb_start = calc_dev_sboffset(rdev->bdev);
4546 rdev->size = calc_num_sectors(rdev, mddev->chunk_size) / 2; 4830 rdev->sectors = calc_num_sectors(rdev, mddev->chunk_size);
4547 4831
4548 err = bind_rdev_to_array(rdev, mddev); 4832 err = bind_rdev_to_array(rdev, mddev);
4549 if (err) { 4833 if (err) {
@@ -4613,7 +4897,7 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev)
4613 else 4897 else
4614 rdev->sb_start = rdev->bdev->bd_inode->i_size / 512; 4898 rdev->sb_start = rdev->bdev->bd_inode->i_size / 512;
4615 4899
4616 rdev->size = calc_num_sectors(rdev, mddev->chunk_size) / 2; 4900 rdev->sectors = calc_num_sectors(rdev, mddev->chunk_size);
4617 4901
4618 if (test_bit(Faulty, &rdev->flags)) { 4902 if (test_bit(Faulty, &rdev->flags)) {
4619 printk(KERN_WARNING 4903 printk(KERN_WARNING
@@ -4749,7 +5033,7 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
4749 5033
4750 mddev->level = info->level; 5034 mddev->level = info->level;
4751 mddev->clevel[0] = 0; 5035 mddev->clevel[0] = 0;
4752 mddev->size = info->size; 5036 mddev->dev_sectors = 2 * (sector_t)info->size;
4753 mddev->raid_disks = info->raid_disks; 5037 mddev->raid_disks = info->raid_disks;
4754 /* don't set md_minor, it is determined by which /dev/md* was 5038 /* don't set md_minor, it is determined by which /dev/md* was
4755 * opened 5039 * opened
@@ -4788,6 +5072,17 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
4788 return 0; 5072 return 0;
4789} 5073}
4790 5074
5075void md_set_array_sectors(mddev_t *mddev, sector_t array_sectors)
5076{
5077 WARN(!mddev_is_locked(mddev), "%s: unlocked mddev!\n", __func__);
5078
5079 if (mddev->external_size)
5080 return;
5081
5082 mddev->array_sectors = array_sectors;
5083}
5084EXPORT_SYMBOL(md_set_array_sectors);
5085
4791static int update_size(mddev_t *mddev, sector_t num_sectors) 5086static int update_size(mddev_t *mddev, sector_t num_sectors)
4792{ 5087{
4793 mdk_rdev_t *rdev; 5088 mdk_rdev_t *rdev;
@@ -4814,8 +5109,7 @@ static int update_size(mddev_t *mddev, sector_t num_sectors)
4814 */ 5109 */
4815 return -EBUSY; 5110 return -EBUSY;
4816 list_for_each_entry(rdev, &mddev->disks, same_set) { 5111 list_for_each_entry(rdev, &mddev->disks, same_set) {
4817 sector_t avail; 5112 sector_t avail = rdev->sectors;
4818 avail = rdev->size * 2;
4819 5113
4820 if (fit && (num_sectors == 0 || num_sectors > avail)) 5114 if (fit && (num_sectors == 0 || num_sectors > avail))
4821 num_sectors = avail; 5115 num_sectors = avail;
@@ -4887,12 +5181,18 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info)
4887 ) 5181 )
4888 return -EINVAL; 5182 return -EINVAL;
4889 /* Check there is only one change */ 5183 /* Check there is only one change */
4890 if (info->size >= 0 && mddev->size != info->size) cnt++; 5184 if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
4891 if (mddev->raid_disks != info->raid_disks) cnt++; 5185 cnt++;
4892 if (mddev->layout != info->layout) cnt++; 5186 if (mddev->raid_disks != info->raid_disks)
4893 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) cnt++; 5187 cnt++;
4894 if (cnt == 0) return 0; 5188 if (mddev->layout != info->layout)
4895 if (cnt > 1) return -EINVAL; 5189 cnt++;
5190 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT))
5191 cnt++;
5192 if (cnt == 0)
5193 return 0;
5194 if (cnt > 1)
5195 return -EINVAL;
4896 5196
4897 if (mddev->layout != info->layout) { 5197 if (mddev->layout != info->layout) {
4898 /* Change layout 5198 /* Change layout
@@ -4904,7 +5204,7 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info)
4904 else 5204 else
4905 return mddev->pers->reconfig(mddev, info->layout, -1); 5205 return mddev->pers->reconfig(mddev, info->layout, -1);
4906 } 5206 }
4907 if (info->size >= 0 && mddev->size != info->size) 5207 if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
4908 rv = update_size(mddev, (sector_t)info->size * 2); 5208 rv = update_size(mddev, (sector_t)info->size * 2);
4909 5209
4910 if (mddev->raid_disks != info->raid_disks) 5210 if (mddev->raid_disks != info->raid_disks)
@@ -5331,6 +5631,8 @@ mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev,
5331 5631
5332void md_unregister_thread(mdk_thread_t *thread) 5632void md_unregister_thread(mdk_thread_t *thread)
5333{ 5633{
5634 if (!thread)
5635 return;
5334 dprintk("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk)); 5636 dprintk("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk));
5335 5637
5336 kthread_stop(thread->tsk); 5638 kthread_stop(thread->tsk);
@@ -5404,7 +5706,7 @@ static void status_resync(struct seq_file *seq, mddev_t * mddev)
5404 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 5706 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
5405 max_blocks = mddev->resync_max_sectors >> 1; 5707 max_blocks = mddev->resync_max_sectors >> 1;
5406 else 5708 else
5407 max_blocks = mddev->size; 5709 max_blocks = mddev->dev_sectors / 2;
5408 5710
5409 /* 5711 /*
5410 * Should not happen. 5712 * Should not happen.
@@ -5537,7 +5839,7 @@ struct mdstat_info {
5537static int md_seq_show(struct seq_file *seq, void *v) 5839static int md_seq_show(struct seq_file *seq, void *v)
5538{ 5840{
5539 mddev_t *mddev = v; 5841 mddev_t *mddev = v;
5540 sector_t size; 5842 sector_t sectors;
5541 mdk_rdev_t *rdev; 5843 mdk_rdev_t *rdev;
5542 struct mdstat_info *mi = seq->private; 5844 struct mdstat_info *mi = seq->private;
5543 struct bitmap *bitmap; 5845 struct bitmap *bitmap;
@@ -5573,7 +5875,7 @@ static int md_seq_show(struct seq_file *seq, void *v)
5573 seq_printf(seq, " %s", mddev->pers->name); 5875 seq_printf(seq, " %s", mddev->pers->name);
5574 } 5876 }
5575 5877
5576 size = 0; 5878 sectors = 0;
5577 list_for_each_entry(rdev, &mddev->disks, same_set) { 5879 list_for_each_entry(rdev, &mddev->disks, same_set) {
5578 char b[BDEVNAME_SIZE]; 5880 char b[BDEVNAME_SIZE];
5579 seq_printf(seq, " %s[%d]", 5881 seq_printf(seq, " %s[%d]",
@@ -5585,7 +5887,7 @@ static int md_seq_show(struct seq_file *seq, void *v)
5585 continue; 5887 continue;
5586 } else if (rdev->raid_disk < 0) 5888 } else if (rdev->raid_disk < 0)
5587 seq_printf(seq, "(S)"); /* spare */ 5889 seq_printf(seq, "(S)"); /* spare */
5588 size += rdev->size; 5890 sectors += rdev->sectors;
5589 } 5891 }
5590 5892
5591 if (!list_empty(&mddev->disks)) { 5893 if (!list_empty(&mddev->disks)) {
@@ -5595,7 +5897,7 @@ static int md_seq_show(struct seq_file *seq, void *v)
5595 mddev->array_sectors / 2); 5897 mddev->array_sectors / 2);
5596 else 5898 else
5597 seq_printf(seq, "\n %llu blocks", 5899 seq_printf(seq, "\n %llu blocks",
5598 (unsigned long long)size); 5900 (unsigned long long)sectors / 2);
5599 } 5901 }
5600 if (mddev->persistent) { 5902 if (mddev->persistent) {
5601 if (mddev->major_version != 0 || 5903 if (mddev->major_version != 0 ||
@@ -5722,19 +6024,19 @@ int unregister_md_personality(struct mdk_personality *p)
5722 return 0; 6024 return 0;
5723} 6025}
5724 6026
5725static int is_mddev_idle(mddev_t *mddev) 6027static int is_mddev_idle(mddev_t *mddev, int init)
5726{ 6028{
5727 mdk_rdev_t * rdev; 6029 mdk_rdev_t * rdev;
5728 int idle; 6030 int idle;
5729 long curr_events; 6031 int curr_events;
5730 6032
5731 idle = 1; 6033 idle = 1;
5732 rcu_read_lock(); 6034 rcu_read_lock();
5733 rdev_for_each_rcu(rdev, mddev) { 6035 rdev_for_each_rcu(rdev, mddev) {
5734 struct gendisk *disk = rdev->bdev->bd_contains->bd_disk; 6036 struct gendisk *disk = rdev->bdev->bd_contains->bd_disk;
5735 curr_events = part_stat_read(&disk->part0, sectors[0]) + 6037 curr_events = (int)part_stat_read(&disk->part0, sectors[0]) +
5736 part_stat_read(&disk->part0, sectors[1]) - 6038 (int)part_stat_read(&disk->part0, sectors[1]) -
5737 atomic_read(&disk->sync_io); 6039 atomic_read(&disk->sync_io);
5738 /* sync IO will cause sync_io to increase before the disk_stats 6040 /* sync IO will cause sync_io to increase before the disk_stats
5739 * as sync_io is counted when a request starts, and 6041 * as sync_io is counted when a request starts, and
5740 * disk_stats is counted when it completes. 6042 * disk_stats is counted when it completes.
@@ -5757,7 +6059,7 @@ static int is_mddev_idle(mddev_t *mddev)
5757 * always make curr_events less than last_events. 6059 * always make curr_events less than last_events.
5758 * 6060 *
5759 */ 6061 */
5760 if (curr_events - rdev->last_events > 4096) { 6062 if (init || curr_events - rdev->last_events > 64) {
5761 rdev->last_events = curr_events; 6063 rdev->last_events = curr_events;
5762 idle = 0; 6064 idle = 0;
5763 } 6065 }
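Two things change in is_mddev_idle(): the event counters are deliberately sampled as int, and the idle threshold drops from 4096 to 64 sectors. The truncation is fine because only the difference between two samples of an ever-growing counter matters, and that difference stays correct across wraparound as long as fewer than 2^31 sectors pass between samples. A standalone illustration of the pattern, written with unsigned arithmetic so the wraparound is well defined in plain C:

	#include <stdio.h>

	/* The difference of two truncated samples of a growing 64-bit counter
	 * is still the true delta, provided the counter grew by < 2^31. */
	static int sample_delta(unsigned long long now, unsigned long long then)
	{
		return (int)((unsigned int)now - (unsigned int)then);
	}

	int main(void)
	{
		unsigned long long last = 0xfffffff0ULL;  /* just below 32-bit wrap */
		unsigned long long curr = 0x100000040ULL; /* 80 events later */

		printf("delta = %d\n", sample_delta(curr, last)); /* prints 80 */
		return 0;
	}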
@@ -5980,10 +6282,10 @@ void md_do_sync(mddev_t *mddev)
5980 j = mddev->recovery_cp; 6282 j = mddev->recovery_cp;
5981 6283
5982 } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 6284 } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
5983 max_sectors = mddev->size << 1; 6285 max_sectors = mddev->dev_sectors;
5984 else { 6286 else {
5985 /* recovery follows the physical size of devices */ 6287 /* recovery follows the physical size of devices */
5986 max_sectors = mddev->size << 1; 6288 max_sectors = mddev->dev_sectors;
5987 j = MaxSector; 6289 j = MaxSector;
5988 list_for_each_entry(rdev, &mddev->disks, same_set) 6290 list_for_each_entry(rdev, &mddev->disks, same_set)
5989 if (rdev->raid_disk >= 0 && 6291 if (rdev->raid_disk >= 0 &&
@@ -6000,7 +6302,7 @@ void md_do_sync(mddev_t *mddev)
6000 "(but not more than %d KB/sec) for %s.\n", 6302 "(but not more than %d KB/sec) for %s.\n",
6001 speed_max(mddev), desc); 6303 speed_max(mddev), desc);
6002 6304
6003 is_mddev_idle(mddev); /* this also initializes IO event counters */ 6305 is_mddev_idle(mddev, 1); /* this initializes IO event counters */
6004 6306
6005 io_sectors = 0; 6307 io_sectors = 0;
6006 for (m = 0; m < SYNC_MARKS; m++) { 6308 for (m = 0; m < SYNC_MARKS; m++) {
@@ -6040,6 +6342,18 @@ void md_do_sync(mddev_t *mddev)
6040 } 6342 }
6041 if (kthread_should_stop()) 6343 if (kthread_should_stop())
6042 goto interrupted; 6344 goto interrupted;
6345
6346 if (mddev->curr_resync > mddev->curr_resync_completed &&
6347 (mddev->curr_resync - mddev->curr_resync_completed)
6348 > (max_sectors >> 4)) {
6349 /* time to update curr_resync_completed */
6350 blk_unplug(mddev->queue);
6351 wait_event(mddev->recovery_wait,
6352 atomic_read(&mddev->recovery_active) == 0);
6353 mddev->curr_resync_completed =
6354 mddev->curr_resync;
6355 set_bit(MD_CHANGE_CLEAN, &mddev->flags);
6356 }
6043 sectors = mddev->pers->sync_request(mddev, j, &skipped, 6357 sectors = mddev->pers->sync_request(mddev, j, &skipped,
6044 currspeed < speed_min(mddev)); 6358 currspeed < speed_min(mddev));
6045 if (sectors == 0) { 6359 if (sectors == 0) {
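The new block above advances curr_resync_completed only after all in-flight resync requests have drained, and only once the sync position has moved more than max_sectors >> 4 past the previous checkpoint, i.e. roughly every 1/16th of the pass. For example, resyncing a 1 TiB component device (2147483648 sectors) records a checkpoint about every 134217728 sectors (64 GiB), so the extra MD_CHANGE_CLEAN superblock updates are limited to around sixteen per pass.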
@@ -6102,7 +6416,7 @@ void md_do_sync(mddev_t *mddev)
6102 6416
6103 if (currspeed > speed_min(mddev)) { 6417 if (currspeed > speed_min(mddev)) {
6104 if ((currspeed > speed_max(mddev)) || 6418 if ((currspeed > speed_max(mddev)) ||
6105 !is_mddev_idle(mddev)) { 6419 !is_mddev_idle(mddev, 0)) {
6106 msleep(500); 6420 msleep(500);
6107 goto repeat; 6421 goto repeat;
6108 } 6422 }
@@ -6173,6 +6487,8 @@ static int remove_and_add_spares(mddev_t *mddev)
6173 mdk_rdev_t *rdev; 6487 mdk_rdev_t *rdev;
6174 int spares = 0; 6488 int spares = 0;
6175 6489
6490 mddev->curr_resync_completed = 0;
6491
6176 list_for_each_entry(rdev, &mddev->disks, same_set) 6492 list_for_each_entry(rdev, &mddev->disks, same_set)
6177 if (rdev->raid_disk >= 0 && 6493 if (rdev->raid_disk >= 0 &&
6178 !test_bit(Blocked, &rdev->flags) && 6494 !test_bit(Blocked, &rdev->flags) &&
@@ -6327,6 +6643,9 @@ void md_check_recovery(mddev_t *mddev)
6327 sysfs_notify(&mddev->kobj, NULL, 6643 sysfs_notify(&mddev->kobj, NULL,
6328 "degraded"); 6644 "degraded");
6329 } 6645 }
6646 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
6647 mddev->pers->finish_reshape)
6648 mddev->pers->finish_reshape(mddev);
6330 md_update_sb(mddev, 1); 6649 md_update_sb(mddev, 1);
6331 6650
6332 /* if array is no-longer degraded, then any saved_raid_disk 6651 /* if array is no-longer degraded, then any saved_raid_disk
@@ -6470,13 +6789,13 @@ static void md_geninit(void)
6470 6789
6471static int __init md_init(void) 6790static int __init md_init(void)
6472{ 6791{
6473 if (register_blkdev(MAJOR_NR, "md")) 6792 if (register_blkdev(MD_MAJOR, "md"))
6474 return -1; 6793 return -1;
6475 if ((mdp_major=register_blkdev(0, "mdp"))<=0) { 6794 if ((mdp_major=register_blkdev(0, "mdp"))<=0) {
6476 unregister_blkdev(MAJOR_NR, "md"); 6795 unregister_blkdev(MD_MAJOR, "md");
6477 return -1; 6796 return -1;
6478 } 6797 }
6479 blk_register_region(MKDEV(MAJOR_NR, 0), 1UL<<MINORBITS, THIS_MODULE, 6798 blk_register_region(MKDEV(MD_MAJOR, 0), 1UL<<MINORBITS, THIS_MODULE,
6480 md_probe, NULL, NULL); 6799 md_probe, NULL, NULL);
6481 blk_register_region(MKDEV(mdp_major, 0), 1UL<<MINORBITS, THIS_MODULE, 6800 blk_register_region(MKDEV(mdp_major, 0), 1UL<<MINORBITS, THIS_MODULE,
6482 md_probe, NULL, NULL); 6801 md_probe, NULL, NULL);
@@ -6562,10 +6881,10 @@ static __exit void md_exit(void)
6562 mddev_t *mddev; 6881 mddev_t *mddev;
6563 struct list_head *tmp; 6882 struct list_head *tmp;
6564 6883
6565 blk_unregister_region(MKDEV(MAJOR_NR,0), 1U << MINORBITS); 6884 blk_unregister_region(MKDEV(MD_MAJOR,0), 1U << MINORBITS);
6566 blk_unregister_region(MKDEV(mdp_major,0), 1U << MINORBITS); 6885 blk_unregister_region(MKDEV(mdp_major,0), 1U << MINORBITS);
6567 6886
6568 unregister_blkdev(MAJOR_NR,"md"); 6887 unregister_blkdev(MD_MAJOR,"md");
6569 unregister_blkdev(mdp_major, "mdp"); 6888 unregister_blkdev(mdp_major, "mdp");
6570 unregister_reboot_notifier(&md_notifier); 6889 unregister_reboot_notifier(&md_notifier);
6571 unregister_sysctl_table(raid_table_header); 6890 unregister_sysctl_table(raid_table_header);
diff --git a/drivers/md/md.h b/drivers/md/md.h
new file mode 100644
index 000000000000..e9b7f54c24d6
--- /dev/null
+++ b/drivers/md/md.h
@@ -0,0 +1,436 @@
1/*
2 md_k.h : kernel internal structure of the Linux MD driver
3 Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman
4
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 2, or (at your option)
8 any later version.
9
10 You should have received a copy of the GNU General Public License
11 (for example /usr/src/linux/COPYING); if not, write to the Free
12 Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
13*/
14
15#ifndef _MD_K_H
16#define _MD_K_H
17
18#ifdef CONFIG_BLOCK
19
20#define MaxSector (~(sector_t)0)
21
22typedef struct mddev_s mddev_t;
23typedef struct mdk_rdev_s mdk_rdev_t;
24
25/*
26 * options passed in raidrun:
27 */
28
29/* Currently this must fit in an 'int' */
30#define MAX_CHUNK_SIZE (1<<30)
31
32/*
33 * MD's 'extended' device
34 */
35struct mdk_rdev_s
36{
37 struct list_head same_set; /* RAID devices within the same set */
38
39 sector_t sectors; /* Device size (in 512bytes sectors) */
40 mddev_t *mddev; /* RAID array if running */
41 int last_events; /* IO event timestamp */
42
43 struct block_device *bdev; /* block device handle */
44
45 struct page *sb_page;
46 int sb_loaded;
47 __u64 sb_events;
48 sector_t data_offset; /* start of data in array */
49 sector_t sb_start; /* offset of the super block (in 512byte sectors) */
50 int sb_size; /* bytes in the superblock */
51 int preferred_minor; /* autorun support */
52
53 struct kobject kobj;
54
55 /* A device can be in one of three states based on two flags:
56 * Not working: faulty==1 in_sync==0
57 * Fully working: faulty==0 in_sync==1
58 * Working, but not
59 * in sync with array
60 * faulty==0 in_sync==0
61 *
62 * It can never have faulty==1, in_sync==1
63 * This reduces the burden of testing multiple flags in many cases
64 */
65
66 unsigned long flags;
67#define Faulty 1 /* device is known to have a fault */
68#define In_sync 2 /* device is in_sync with rest of array */
69#define WriteMostly 4 /* Avoid reading if at all possible */
70#define BarriersNotsupp 5 /* BIO_RW_BARRIER is not supported */
71#define AllReserved 6 /* If whole device is reserved for
72 * one array */
73#define AutoDetected 7 /* added by auto-detect */
 74#define Blocked 8 /* An error occurred on an externally
75 * managed array, don't allow writes
76 * until it is cleared */
77#define StateChanged 9 /* Faulty or Blocked has changed during
78 * interrupt, so it needs to be
79 * notified by the thread */
80 wait_queue_head_t blocked_wait;
81
82 int desc_nr; /* descriptor index in the superblock */
83 int raid_disk; /* role of device in array */
84 int saved_raid_disk; /* role that device used to have in the
85 * array and could again if we did a partial
86 * resync from the bitmap
87 */
88 sector_t recovery_offset;/* If this device has been partially
89 * recovered, this is where we were
90 * up to.
91 */
92
93 atomic_t nr_pending; /* number of pending requests.
94 * only maintained for arrays that
95 * support hot removal
96 */
97 atomic_t read_errors; /* number of consecutive read errors that
98 * we have tried to ignore.
99 */
100 atomic_t corrected_errors; /* number of corrected read errors,
101 * for reporting to userspace and storing
102 * in superblock.
103 */
104 struct work_struct del_work; /* used for delayed sysfs removal */
105
106 struct sysfs_dirent *sysfs_state; /* handle for 'state'
107 * sysfs entry */
108};
109
110struct mddev_s
111{
112 void *private;
113 struct mdk_personality *pers;
114 dev_t unit;
115 int md_minor;
116 struct list_head disks;
117 unsigned long flags;
118#define MD_CHANGE_DEVS 0 /* Some device status has changed */
119#define MD_CHANGE_CLEAN 1 /* transition to or from 'clean' */
120#define MD_CHANGE_PENDING 2 /* superblock update in progress */
121
122 int suspended;
123 atomic_t active_io;
124 int ro;
125
126 struct gendisk *gendisk;
127
128 struct kobject kobj;
129 int hold_active;
130#define UNTIL_IOCTL 1
131#define UNTIL_STOP 2
132
133 /* Superblock information */
134 int major_version,
135 minor_version,
136 patch_version;
137 int persistent;
138 int external; /* metadata is
139 * managed externally */
140 char metadata_type[17]; /* externally set*/
141 int chunk_size;
142 time_t ctime, utime;
143 int level, layout;
144 char clevel[16];
145 int raid_disks;
146 int max_disks;
147 sector_t dev_sectors; /* used size of
148 * component devices */
149 sector_t array_sectors; /* exported array size */
150 int external_size; /* size managed
151 * externally */
152 __u64 events;
153
154 char uuid[16];
155
156 /* If the array is being reshaped, we need to record the
157 * new shape and an indication of where we are up to.
158 * This is written to the superblock.
159 * If reshape_position is MaxSector, then no reshape is happening (yet).
160 */
161 sector_t reshape_position;
162 int delta_disks, new_level, new_layout, new_chunk;
163
164 struct mdk_thread_s *thread; /* management thread */
165 struct mdk_thread_s *sync_thread; /* doing resync or reconstruct */
166 sector_t curr_resync; /* last block scheduled */
167 /* As resync requests can complete out of order, we cannot easily track
168 * how much resync has been completed. So we occasionally pause until
169 * everything completes, then set curr_resync_completed to curr_resync.
170 * As such it may be well behind the real resync mark, but it is a value
171 * we are certain of.
172 */
173 sector_t curr_resync_completed;
174 unsigned long resync_mark; /* a recent timestamp */
175 sector_t resync_mark_cnt;/* blocks written at resync_mark */
176 sector_t curr_mark_cnt; /* blocks scheduled now */
177
178 sector_t resync_max_sectors; /* may be set by personality */
179
180 sector_t resync_mismatches; /* count of sectors where
181 * parity/replica mismatch found
182 */
183
184 /* allow user-space to request suspension of IO to regions of the array */
185 sector_t suspend_lo;
186 sector_t suspend_hi;
187 /* if zero, use the system-wide default */
188 int sync_speed_min;
189 int sync_speed_max;
190
191 /* resync even though the same disks are shared among md-devices */
192 int parallel_resync;
193
194 int ok_start_degraded;
195 /* recovery/resync flags
196 * NEEDED: we might need to start a resync/recover
197 * RUNNING: a thread is running, or about to be started
198 * SYNC: actually doing a resync, not a recovery
199 * RECOVER: doing recovery, or need to try it.
200 * INTR: resync needs to be aborted for some reason
201 * DONE: thread is done and is waiting to be reaped
202 * REQUEST: user-space has requested a sync (used with SYNC)
 203 * CHECK: user-space request for check-only, no repair
204 * RESHAPE: A reshape is happening
205 *
206 * If neither SYNC or RESHAPE are set, then it is a recovery.
207 */
208#define MD_RECOVERY_RUNNING 0
209#define MD_RECOVERY_SYNC 1
210#define MD_RECOVERY_RECOVER 2
211#define MD_RECOVERY_INTR 3
212#define MD_RECOVERY_DONE 4
213#define MD_RECOVERY_NEEDED 5
214#define MD_RECOVERY_REQUESTED 6
215#define MD_RECOVERY_CHECK 7
216#define MD_RECOVERY_RESHAPE 8
217#define MD_RECOVERY_FROZEN 9
218
219 unsigned long recovery;
220 int recovery_disabled; /* if we detect that recovery
221 * will always fail, set this
222 * so we don't loop trying */
223
224 int in_sync; /* know to not need resync */
225 struct mutex reconfig_mutex;
226 atomic_t active; /* general refcount */
227 atomic_t openers; /* number of active opens */
228
229 int changed; /* true if we might need to reread partition info */
230 int degraded; /* whether md should consider
231 * adding a spare
232 */
233 int barriers_work; /* initialised to true, cleared as soon
234 * as a barrier request to slave
235 * fails. Only supported
236 */
237 struct bio *biolist; /* bios that need to be retried
238 * because BIO_RW_BARRIER is not supported
239 */
240
241 atomic_t recovery_active; /* blocks scheduled, but not written */
242 wait_queue_head_t recovery_wait;
243 sector_t recovery_cp;
244 sector_t resync_min; /* user requested sync
245 * starts here */
246 sector_t resync_max; /* resync should pause
247 * when it gets here */
248
249 struct sysfs_dirent *sysfs_state; /* handle for 'array_state'
250 * file in sysfs.
251 */
252 struct sysfs_dirent *sysfs_action; /* handle for 'sync_action' */
253
254 struct work_struct del_work; /* used for delayed sysfs removal */
255
256 spinlock_t write_lock;
257 wait_queue_head_t sb_wait; /* for waiting on superblock updates */
258 atomic_t pending_writes; /* number of active superblock writes */
259
260 unsigned int safemode; /* if set, update "clean" superblock
261 * when no writes pending.
262 */
263 unsigned int safemode_delay;
264 struct timer_list safemode_timer;
265 atomic_t writes_pending;
266 struct request_queue *queue; /* for plugging ... */
267
268 atomic_t write_behind; /* outstanding async IO */
269 unsigned int max_write_behind; /* 0 = sync */
270
271 struct bitmap *bitmap; /* the bitmap for the device */
272 struct file *bitmap_file; /* the bitmap file */
273 long bitmap_offset; /* offset from superblock of
274 * start of bitmap. May be
275 * negative, but not '0'
276 */
277 long default_bitmap_offset; /* this is the offset to use when
278 * hot-adding a bitmap. It should
279 * eventually be settable by sysfs.
280 */
281
282 struct list_head all_mddevs;
283};
284
285
286static inline void rdev_dec_pending(mdk_rdev_t *rdev, mddev_t *mddev)
287{
288 int faulty = test_bit(Faulty, &rdev->flags);
289 if (atomic_dec_and_test(&rdev->nr_pending) && faulty)
290 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
291}
292
293static inline void md_sync_acct(struct block_device *bdev, unsigned long nr_sectors)
294{
295 atomic_add(nr_sectors, &bdev->bd_contains->bd_disk->sync_io);
296}
297
298struct mdk_personality
299{
300 char *name;
301 int level;
302 struct list_head list;
303 struct module *owner;
304 int (*make_request)(struct request_queue *q, struct bio *bio);
305 int (*run)(mddev_t *mddev);
306 int (*stop)(mddev_t *mddev);
307 void (*status)(struct seq_file *seq, mddev_t *mddev);
308 /* error_handler must set ->faulty and clear ->in_sync
309 * if appropriate, and should abort recovery if needed
310 */
311 void (*error_handler)(mddev_t *mddev, mdk_rdev_t *rdev);
312 int (*hot_add_disk) (mddev_t *mddev, mdk_rdev_t *rdev);
313 int (*hot_remove_disk) (mddev_t *mddev, int number);
314 int (*spare_active) (mddev_t *mddev);
315 sector_t (*sync_request)(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster);
316 int (*resize) (mddev_t *mddev, sector_t sectors);
317 sector_t (*size) (mddev_t *mddev, sector_t sectors, int raid_disks);
318 int (*check_reshape) (mddev_t *mddev);
319 int (*start_reshape) (mddev_t *mddev);
320 void (*finish_reshape) (mddev_t *mddev);
321 int (*reconfig) (mddev_t *mddev, int layout, int chunk_size);
322 /* quiesce moves between quiescence states
323 * 0 - fully active
324 * 1 - no new requests allowed
325 * others - reserved
326 */
327 void (*quiesce) (mddev_t *mddev, int state);
328 /* takeover is used to transition an array from one
329 * personality to another. The new personality must be able
330 * to handle the data in the current layout.
331 * e.g. 2drive raid1 -> 2drive raid5
332 * ndrive raid5 -> degraded n+1drive raid6 with special layout
333 * If the takeover succeeds, a new 'private' structure is returned.
334 * This needs to be installed and then ->run used to activate the
335 * array.
336 */
337 void *(*takeover) (mddev_t *mddev);
338};
339
340
341struct md_sysfs_entry {
342 struct attribute attr;
343 ssize_t (*show)(mddev_t *, char *);
344 ssize_t (*store)(mddev_t *, const char *, size_t);
345};
346
347
348static inline char * mdname (mddev_t * mddev)
349{
350 return mddev->gendisk ? mddev->gendisk->disk_name : "mdX";
351}
352
353/*
354 * iterates through some rdev ringlist. It's safe to remove the
 355 * current 'rdev'. Don't touch 'tmp' though.
356 */
357#define rdev_for_each_list(rdev, tmp, head) \
358 list_for_each_entry_safe(rdev, tmp, head, same_set)
359
360/*
361 * iterates through the 'same array disks' ringlist
362 */
363#define rdev_for_each(rdev, tmp, mddev) \
364 list_for_each_entry_safe(rdev, tmp, &((mddev)->disks), same_set)
365
366#define rdev_for_each_rcu(rdev, mddev) \
367 list_for_each_entry_rcu(rdev, &((mddev)->disks), same_set)
368
369typedef struct mdk_thread_s {
370 void (*run) (mddev_t *mddev);
371 mddev_t *mddev;
372 wait_queue_head_t wqueue;
373 unsigned long flags;
374 struct task_struct *tsk;
375 unsigned long timeout;
376} mdk_thread_t;
377
378#define THREAD_WAKEUP 0
379
380#define __wait_event_lock_irq(wq, condition, lock, cmd) \
381do { \
382 wait_queue_t __wait; \
383 init_waitqueue_entry(&__wait, current); \
384 \
385 add_wait_queue(&wq, &__wait); \
386 for (;;) { \
387 set_current_state(TASK_UNINTERRUPTIBLE); \
388 if (condition) \
389 break; \
390 spin_unlock_irq(&lock); \
391 cmd; \
392 schedule(); \
393 spin_lock_irq(&lock); \
394 } \
395 current->state = TASK_RUNNING; \
396 remove_wait_queue(&wq, &__wait); \
397} while (0)
398
399#define wait_event_lock_irq(wq, condition, lock, cmd) \
400do { \
401 if (condition) \
402 break; \
403 __wait_event_lock_irq(wq, condition, lock, cmd); \
404} while (0)
405
406static inline void safe_put_page(struct page *p)
407{
408 if (p) put_page(p);
409}
410
411#endif /* CONFIG_BLOCK */
412#endif
413
414
415extern int register_md_personality(struct mdk_personality *p);
416extern int unregister_md_personality(struct mdk_personality *p);
417extern mdk_thread_t * md_register_thread(void (*run) (mddev_t *mddev),
418 mddev_t *mddev, const char *name);
419extern void md_unregister_thread(mdk_thread_t *thread);
420extern void md_wakeup_thread(mdk_thread_t *thread);
421extern void md_check_recovery(mddev_t *mddev);
422extern void md_write_start(mddev_t *mddev, struct bio *bi);
423extern void md_write_end(mddev_t *mddev);
424extern void md_done_sync(mddev_t *mddev, int blocks, int ok);
425extern void md_error(mddev_t *mddev, mdk_rdev_t *rdev);
426
427extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
428 sector_t sector, int size, struct page *page);
429extern void md_super_wait(mddev_t *mddev);
430extern int sync_page_io(struct block_device *bdev, sector_t sector, int size,
431 struct page *page, int rw);
432extern void md_do_sync(mddev_t *mddev);
433extern void md_new_event(mddev_t *mddev);
434extern int md_allow_write(mddev_t *mddev);
435extern void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev);
436extern void md_set_array_sectors(mddev_t *mddev, sector_t array_sectors);
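Taken together, the new ->size() method in mdk_personality and md_set_array_sectors() replace the old pattern of personalities assigning mddev->array_sectors directly: the personality reports how large the array would be for a given component size and disk count, and md adopts that value only when userspace has not pinned an explicit size through the array_size attribute. A hedged sketch of the shape a personality now takes (the names below are invented; the raid0, raid1 and multipath hunks further down follow the same pattern):

	/* Hypothetical personality using the new size contract. */
	static sector_t example_size(mddev_t *mddev, sector_t sectors, int raid_disks)
	{
		if (!sectors)			/* 0 means "current geometry" */
			sectors = mddev->dev_sectors;
		if (!raid_disks)
			raid_disks = mddev->raid_disks;
		return sectors * (raid_disks - 1);	/* e.g. one disk's worth of parity */
	}

	static int example_run(mddev_t *mddev)
	{
		/* ... allocate conf, validate component devices ... */
		md_set_array_sectors(mddev, example_size(mddev, 0, 0));
		/* honoured only while mddev->external_size is 0 */
		return 0;
	}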
diff --git a/drivers/md/mktables.c b/drivers/md/mktables.c
index b61d5767aae7..3b1500843bba 100644
--- a/drivers/md/mktables.c
+++ b/drivers/md/mktables.c
@@ -59,7 +59,7 @@ int main(int argc, char *argv[])
59 uint8_t v; 59 uint8_t v;
60 uint8_t exptbl[256], invtbl[256]; 60 uint8_t exptbl[256], invtbl[256];
61 61
62 printf("#include \"raid6.h\"\n"); 62 printf("#include <linux/raid/pq.h>\n");
63 63
64 /* Compute multiplication table */ 64 /* Compute multiplication table */
65 printf("\nconst u8 __attribute__((aligned(256)))\n" 65 printf("\nconst u8 __attribute__((aligned(256)))\n"
@@ -76,6 +76,9 @@ int main(int argc, char *argv[])
76 printf("\t},\n"); 76 printf("\t},\n");
77 } 77 }
78 printf("};\n"); 78 printf("};\n");
79 printf("#ifdef __KERNEL__\n");
80 printf("EXPORT_SYMBOL(raid6_gfmul);\n");
81 printf("#endif\n");
79 82
80 /* Compute power-of-2 table (exponent) */ 83 /* Compute power-of-2 table (exponent) */
81 v = 1; 84 v = 1;
@@ -92,6 +95,9 @@ int main(int argc, char *argv[])
92 } 95 }
93 } 96 }
94 printf("};\n"); 97 printf("};\n");
98 printf("#ifdef __KERNEL__\n");
99 printf("EXPORT_SYMBOL(raid6_gfexp);\n");
100 printf("#endif\n");
95 101
96 /* Compute inverse table x^-1 == x^254 */ 102 /* Compute inverse table x^-1 == x^254 */
97 printf("\nconst u8 __attribute__((aligned(256)))\n" 103 printf("\nconst u8 __attribute__((aligned(256)))\n"
@@ -104,6 +110,9 @@ int main(int argc, char *argv[])
104 } 110 }
105 } 111 }
106 printf("};\n"); 112 printf("};\n");
113 printf("#ifdef __KERNEL__\n");
114 printf("EXPORT_SYMBOL(raid6_gfinv);\n");
115 printf("#endif\n");
107 116
108 /* Compute inv(2^x + 1) (exponent-xor-inverse) table */ 117 /* Compute inv(2^x + 1) (exponent-xor-inverse) table */
109 printf("\nconst u8 __attribute__((aligned(256)))\n" 118 printf("\nconst u8 __attribute__((aligned(256)))\n"
@@ -115,6 +124,9 @@ int main(int argc, char *argv[])
115 (j == 7) ? '\n' : ' '); 124 (j == 7) ? '\n' : ' ');
116 } 125 }
117 printf("};\n"); 126 printf("};\n");
127 printf("#ifdef __KERNEL__\n");
128 printf("EXPORT_SYMBOL(raid6_gfexi);\n");
129 printf("#endif\n");
118 130
119 return 0; 131 return 0;
120} 132}
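With the printf() lines added above, the table generator now follows every table it emits with an export guarded by __KERNEL__, so the raid6 GF tables become available to modules. The emitted output ends up looking roughly like this (row contents elided):

	const u8 __attribute__((aligned(256)))
	raid6_gfmul[256][256] =
	{
		{
			0x00, 0x00, /* ... one 256-entry row per multiplier ... */
		},
		/* ... */
	};
	#ifdef __KERNEL__
	EXPORT_SYMBOL(raid6_gfmul);
	#endif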
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index f6d08f241671..41ced0cbe823 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -19,7 +19,11 @@
19 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 19 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 */ 20 */
21 21
22#include <linux/raid/multipath.h> 22#include <linux/blkdev.h>
23#include <linux/raid/md_u.h>
24#include <linux/seq_file.h>
25#include "md.h"
26#include "multipath.h"
23 27
24#define MAX_WORK_PER_DISK 128 28#define MAX_WORK_PER_DISK 128
25 29
@@ -402,6 +406,14 @@ static void multipathd (mddev_t *mddev)
402 spin_unlock_irqrestore(&conf->device_lock, flags); 406 spin_unlock_irqrestore(&conf->device_lock, flags);
403} 407}
404 408
409static sector_t multipath_size(mddev_t *mddev, sector_t sectors, int raid_disks)
410{
411 WARN_ONCE(sectors || raid_disks,
412 "%s does not support generic reshape\n", __func__);
413
414 return mddev->dev_sectors;
415}
416
405static int multipath_run (mddev_t *mddev) 417static int multipath_run (mddev_t *mddev)
406{ 418{
407 multipath_conf_t *conf; 419 multipath_conf_t *conf;
@@ -498,7 +510,7 @@ static int multipath_run (mddev_t *mddev)
498 /* 510 /*
499 * Ok, everything is just fine now 511 * Ok, everything is just fine now
500 */ 512 */
501 mddev->array_sectors = mddev->size * 2; 513 md_set_array_sectors(mddev, multipath_size(mddev, 0, 0));
502 514
503 mddev->queue->unplug_fn = multipath_unplug; 515 mddev->queue->unplug_fn = multipath_unplug;
504 mddev->queue->backing_dev_info.congested_fn = multipath_congested; 516 mddev->queue->backing_dev_info.congested_fn = multipath_congested;
@@ -543,6 +555,7 @@ static struct mdk_personality multipath_personality =
543 .error_handler = multipath_error, 555 .error_handler = multipath_error,
544 .hot_add_disk = multipath_add_disk, 556 .hot_add_disk = multipath_add_disk,
545 .hot_remove_disk= multipath_remove_disk, 557 .hot_remove_disk= multipath_remove_disk,
558 .size = multipath_size,
546}; 559};
547 560
548static int __init multipath_init (void) 561static int __init multipath_init (void)
diff --git a/drivers/md/multipath.h b/drivers/md/multipath.h
new file mode 100644
index 000000000000..6fa70b400cda
--- /dev/null
+++ b/drivers/md/multipath.h
@@ -0,0 +1,40 @@
1#ifndef _MULTIPATH_H
2#define _MULTIPATH_H
3
4struct multipath_info {
5 mdk_rdev_t *rdev;
6};
7
8struct multipath_private_data {
9 mddev_t *mddev;
10 struct multipath_info *multipaths;
11 int raid_disks;
12 int working_disks;
13 spinlock_t device_lock;
14 struct list_head retry_list;
15
16 mempool_t *pool;
17};
18
19typedef struct multipath_private_data multipath_conf_t;
20
21/*
22 * this is the only point in the RAID code where we violate
23 * C type safety. mddev->private is an 'opaque' pointer.
24 */
25#define mddev_to_conf(mddev) ((multipath_conf_t *) mddev->private)
26
27/*
28 * this is our 'private' 'collective' MULTIPATH buffer head.
29 * it contains information about what kind of IO operations were started
30 * for this MULTIPATH operation, and about their status:
31 */
32
33struct multipath_bh {
34 mddev_t *mddev;
35 struct bio *master_bio;
36 struct bio bio;
37 int path;
38 struct list_head retry_list;
39};
40#endif
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index c605ba805586..c08d7559be55 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -18,7 +18,10 @@
18 Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 18 Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19*/ 19*/
20 20
21#include <linux/raid/raid0.h> 21#include <linux/blkdev.h>
22#include <linux/seq_file.h>
23#include "md.h"
24#include "raid0.h"
22 25
23static void raid0_unplug(struct request_queue *q) 26static void raid0_unplug(struct request_queue *q)
24{ 27{
@@ -73,16 +76,15 @@ static int create_strip_zones (mddev_t *mddev)
73 list_for_each_entry(rdev2, &mddev->disks, same_set) { 76 list_for_each_entry(rdev2, &mddev->disks, same_set) {
74 printk(KERN_INFO "raid0: comparing %s(%llu)", 77 printk(KERN_INFO "raid0: comparing %s(%llu)",
75 bdevname(rdev1->bdev,b), 78 bdevname(rdev1->bdev,b),
76 (unsigned long long)rdev1->size); 79 (unsigned long long)rdev1->sectors);
77 printk(KERN_INFO " with %s(%llu)\n", 80 printk(KERN_INFO " with %s(%llu)\n",
78 bdevname(rdev2->bdev,b), 81 bdevname(rdev2->bdev,b),
79 (unsigned long long)rdev2->size); 82 (unsigned long long)rdev2->sectors);
80 if (rdev2 == rdev1) { 83 if (rdev2 == rdev1) {
81 printk(KERN_INFO "raid0: END\n"); 84 printk(KERN_INFO "raid0: END\n");
82 break; 85 break;
83 } 86 }
84 if (rdev2->size == rdev1->size) 87 if (rdev2->sectors == rdev1->sectors) {
85 {
86 /* 88 /*
87 * Not unique, don't count it as a new 89 * Not unique, don't count it as a new
88 * group 90 * group
@@ -145,7 +147,7 @@ static int create_strip_zones (mddev_t *mddev)
145 mddev->queue->max_sectors > (PAGE_SIZE>>9)) 147 mddev->queue->max_sectors > (PAGE_SIZE>>9))
146 blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); 148 blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
147 149
148 if (!smallest || (rdev1->size <smallest->size)) 150 if (!smallest || (rdev1->sectors < smallest->sectors))
149 smallest = rdev1; 151 smallest = rdev1;
150 cnt++; 152 cnt++;
151 } 153 }
@@ -155,10 +157,10 @@ static int create_strip_zones (mddev_t *mddev)
155 goto abort; 157 goto abort;
156 } 158 }
157 zone->nb_dev = cnt; 159 zone->nb_dev = cnt;
158 zone->sectors = smallest->size * cnt * 2; 160 zone->sectors = smallest->sectors * cnt;
159 zone->zone_start = 0; 161 zone->zone_start = 0;
160 162
161 current_start = smallest->size * 2; 163 current_start = smallest->sectors;
162 curr_zone_start = zone->sectors; 164 curr_zone_start = zone->sectors;
163 165
164 /* now do the other zones */ 166 /* now do the other zones */
@@ -177,29 +179,29 @@ static int create_strip_zones (mddev_t *mddev)
177 rdev = conf->strip_zone[0].dev[j]; 179 rdev = conf->strip_zone[0].dev[j];
178 printk(KERN_INFO "raid0: checking %s ...", 180 printk(KERN_INFO "raid0: checking %s ...",
179 bdevname(rdev->bdev, b)); 181 bdevname(rdev->bdev, b));
180 if (rdev->size > current_start / 2) { 182 if (rdev->sectors <= current_start) {
181 printk(KERN_INFO " contained as device %d\n",
182 c);
183 zone->dev[c] = rdev;
184 c++;
185 if (!smallest || (rdev->size <smallest->size)) {
186 smallest = rdev;
187 printk(KERN_INFO " (%llu) is smallest!.\n",
188 (unsigned long long)rdev->size);
189 }
190 } else
191 printk(KERN_INFO " nope.\n"); 183 printk(KERN_INFO " nope.\n");
184 continue;
185 }
186 printk(KERN_INFO " contained as device %d\n", c);
187 zone->dev[c] = rdev;
188 c++;
189 if (!smallest || rdev->sectors < smallest->sectors) {
190 smallest = rdev;
191 printk(KERN_INFO " (%llu) is smallest!.\n",
192 (unsigned long long)rdev->sectors);
193 }
192 } 194 }
193 195
194 zone->nb_dev = c; 196 zone->nb_dev = c;
195 zone->sectors = (smallest->size * 2 - current_start) * c; 197 zone->sectors = (smallest->sectors - current_start) * c;
196 printk(KERN_INFO "raid0: zone->nb_dev: %d, sectors: %llu\n", 198 printk(KERN_INFO "raid0: zone->nb_dev: %d, sectors: %llu\n",
197 zone->nb_dev, (unsigned long long)zone->sectors); 199 zone->nb_dev, (unsigned long long)zone->sectors);
198 200
199 zone->zone_start = curr_zone_start; 201 zone->zone_start = curr_zone_start;
200 curr_zone_start += zone->sectors; 202 curr_zone_start += zone->sectors;
201 203
202 current_start = smallest->size * 2; 204 current_start = smallest->sectors;
203 printk(KERN_INFO "raid0: current zone start: %llu\n", 205 printk(KERN_INFO "raid0: current zone start: %llu\n",
204 (unsigned long long)current_start); 206 (unsigned long long)current_start);
205 } 207 }
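The zone accounting in create_strip_zones() now works directly in sectors (rdev->sectors) rather than KiB (rdev->size), which removes the scattered *2 and /2 conversions. A small worked example of what the loop computes, assuming three members of 1000, 1000 and 800 sectors: zone 0 uses all three devices and covers the first 800 sectors of each, so zone->sectors = 800 * 3 = 2400 and current_start becomes 800; for zone 1 the 800-sector device is skipped (rdev->sectors <= current_start), leaving two devices and zone->sectors = (1000 - 800) * 2 = 400, for a total curr_zone_start of 2800 sectors.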
@@ -261,12 +263,25 @@ static int raid0_mergeable_bvec(struct request_queue *q,
261 return max; 263 return max;
262} 264}
263 265
266static sector_t raid0_size(mddev_t *mddev, sector_t sectors, int raid_disks)
267{
268 sector_t array_sectors = 0;
269 mdk_rdev_t *rdev;
270
271 WARN_ONCE(sectors || raid_disks,
272 "%s does not support generic reshape\n", __func__);
273
274 list_for_each_entry(rdev, &mddev->disks, same_set)
275 array_sectors += rdev->sectors;
276
277 return array_sectors;
278}
279
264static int raid0_run (mddev_t *mddev) 280static int raid0_run (mddev_t *mddev)
265{ 281{
266 unsigned cur=0, i=0, nb_zone; 282 unsigned cur=0, i=0, nb_zone;
267 s64 sectors; 283 s64 sectors;
268 raid0_conf_t *conf; 284 raid0_conf_t *conf;
269 mdk_rdev_t *rdev;
270 285
271 if (mddev->chunk_size == 0) { 286 if (mddev->chunk_size == 0) {
272 printk(KERN_ERR "md/raid0: non-zero chunk size required.\n"); 287 printk(KERN_ERR "md/raid0: non-zero chunk size required.\n");
@@ -291,16 +306,14 @@ static int raid0_run (mddev_t *mddev)
291 goto out_free_conf; 306 goto out_free_conf;
292 307
293 /* calculate array device size */ 308 /* calculate array device size */
294 mddev->array_sectors = 0; 309 md_set_array_sectors(mddev, raid0_size(mddev, 0, 0));
295 list_for_each_entry(rdev, &mddev->disks, same_set)
296 mddev->array_sectors += rdev->size * 2;
297 310
298 printk(KERN_INFO "raid0 : md_size is %llu sectors.\n", 311 printk(KERN_INFO "raid0 : md_size is %llu sectors.\n",
299 (unsigned long long)mddev->array_sectors); 312 (unsigned long long)mddev->array_sectors);
300 printk(KERN_INFO "raid0 : conf->spacing is %llu sectors.\n", 313 printk(KERN_INFO "raid0 : conf->spacing is %llu sectors.\n",
301 (unsigned long long)conf->spacing); 314 (unsigned long long)conf->spacing);
302 { 315 {
303 sector_t s = mddev->array_sectors; 316 sector_t s = raid0_size(mddev, 0, 0);
304 sector_t space = conf->spacing; 317 sector_t space = conf->spacing;
305 int round; 318 int round;
306 conf->sector_shift = 0; 319 conf->sector_shift = 0;
@@ -509,6 +522,7 @@ static struct mdk_personality raid0_personality=
509 .run = raid0_run, 522 .run = raid0_run,
510 .stop = raid0_stop, 523 .stop = raid0_stop,
511 .status = raid0_status, 524 .status = raid0_status,
525 .size = raid0_size,
512}; 526};
513 527
514static int __init raid0_init (void) 528static int __init raid0_init (void)
diff --git a/drivers/md/raid0.h b/drivers/md/raid0.h
new file mode 100644
index 000000000000..824b12eb1d4f
--- /dev/null
+++ b/drivers/md/raid0.h
@@ -0,0 +1,28 @@
1#ifndef _RAID0_H
2#define _RAID0_H
3
4struct strip_zone
5{
6 sector_t zone_start; /* Zone offset in md_dev (in sectors) */
7 sector_t dev_start; /* Zone offset in real dev (in sectors) */
8 sector_t sectors; /* Zone size in sectors */
9 int nb_dev; /* # of devices attached to the zone */
10 mdk_rdev_t **dev; /* Devices attached to the zone */
11};
12
13struct raid0_private_data
14{
15 struct strip_zone **hash_table; /* Table of indexes into strip_zone */
16 struct strip_zone *strip_zone;
17 mdk_rdev_t **devlist; /* lists of rdevs, pointed to by strip_zone->dev */
18 int nr_strip_zones;
19
20 sector_t spacing;
21 int sector_shift; /* shift this before divide by spacing */
22};
23
24typedef struct raid0_private_data raid0_conf_t;
25
26#define mddev_to_conf(mddev) ((raid0_conf_t *) mddev->private)
27
28#endif
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index e2466425d9ca..b4f4badc0068 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -31,10 +31,13 @@
31 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 31 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
32 */ 32 */
33 33
34#include "dm-bio-list.h"
35#include <linux/delay.h> 34#include <linux/delay.h>
36#include <linux/raid/raid1.h> 35#include <linux/blkdev.h>
37#include <linux/raid/bitmap.h> 36#include <linux/seq_file.h>
37#include "md.h"
38#include "dm-bio-list.h"
39#include "raid1.h"
40#include "bitmap.h"
38 41
39#define DEBUG 0 42#define DEBUG 0
40#if DEBUG 43#if DEBUG
@@ -1723,7 +1726,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1723 return 0; 1726 return 0;
1724 } 1727 }
1725 1728
1726 max_sector = mddev->size << 1; 1729 max_sector = mddev->dev_sectors;
1727 if (sector_nr >= max_sector) { 1730 if (sector_nr >= max_sector) {
1728 /* If we aborted, we need to abort the 1731 /* If we aborted, we need to abort the
1729 * sync on the 'current' bitmap chunk (there will 1732 * sync on the 'current' bitmap chunk (there will
@@ -1919,6 +1922,14 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1919 return nr_sectors; 1922 return nr_sectors;
1920} 1923}
1921 1924
1925static sector_t raid1_size(mddev_t *mddev, sector_t sectors, int raid_disks)
1926{
1927 if (sectors)
1928 return sectors;
1929
1930 return mddev->dev_sectors;
1931}
1932
1922static int run(mddev_t *mddev) 1933static int run(mddev_t *mddev)
1923{ 1934{
1924 conf_t *conf; 1935 conf_t *conf;
@@ -2048,7 +2059,7 @@ static int run(mddev_t *mddev)
2048 /* 2059 /*
2049 * Ok, everything is just fine now 2060 * Ok, everything is just fine now
2050 */ 2061 */
2051 mddev->array_sectors = mddev->size * 2; 2062 md_set_array_sectors(mddev, raid1_size(mddev, 0, 0));
2052 2063
2053 mddev->queue->unplug_fn = raid1_unplug; 2064 mddev->queue->unplug_fn = raid1_unplug;
2054 mddev->queue->backing_dev_info.congested_fn = raid1_congested; 2065 mddev->queue->backing_dev_info.congested_fn = raid1_congested;
@@ -2089,6 +2100,9 @@ static int stop(mddev_t *mddev)
2089 /* need to kick something here to make sure I/O goes? */ 2100 /* need to kick something here to make sure I/O goes? */
2090 } 2101 }
2091 2102
2103 raise_barrier(conf);
2104 lower_barrier(conf);
2105
2092 md_unregister_thread(mddev->thread); 2106 md_unregister_thread(mddev->thread);
2093 mddev->thread = NULL; 2107 mddev->thread = NULL;
2094 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ 2108 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
@@ -2110,15 +2124,17 @@ static int raid1_resize(mddev_t *mddev, sector_t sectors)
2110 * any io in the removed space completes, but it hardly seems 2124 * any io in the removed space completes, but it hardly seems
2111 * worth it. 2125 * worth it.
2112 */ 2126 */
2113 mddev->array_sectors = sectors; 2127 md_set_array_sectors(mddev, raid1_size(mddev, sectors, 0));
2128 if (mddev->array_sectors > raid1_size(mddev, sectors, 0))
2129 return -EINVAL;
2114 set_capacity(mddev->gendisk, mddev->array_sectors); 2130 set_capacity(mddev->gendisk, mddev->array_sectors);
2115 mddev->changed = 1; 2131 mddev->changed = 1;
2116 if (mddev->array_sectors / 2 > mddev->size && 2132 if (sectors > mddev->dev_sectors &&
2117 mddev->recovery_cp == MaxSector) { 2133 mddev->recovery_cp == MaxSector) {
2118 mddev->recovery_cp = mddev->size << 1; 2134 mddev->recovery_cp = mddev->dev_sectors;
2119 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 2135 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
2120 } 2136 }
2121 mddev->size = mddev->array_sectors / 2; 2137 mddev->dev_sectors = sectors;
2122 mddev->resync_max_sectors = sectors; 2138 mddev->resync_max_sectors = sectors;
2123 return 0; 2139 return 0;
2124} 2140}
@@ -2264,6 +2280,7 @@ static struct mdk_personality raid1_personality =
2264 .spare_active = raid1_spare_active, 2280 .spare_active = raid1_spare_active,
2265 .sync_request = sync_request, 2281 .sync_request = sync_request,
2266 .resize = raid1_resize, 2282 .resize = raid1_resize,
2283 .size = raid1_size,
2267 .check_reshape = raid1_reshape, 2284 .check_reshape = raid1_reshape,
2268 .quiesce = raid1_quiesce, 2285 .quiesce = raid1_quiesce,
2269}; 2286};
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
new file mode 100644
index 000000000000..1620eea3d57c
--- /dev/null
+++ b/drivers/md/raid1.h
@@ -0,0 +1,132 @@
1#ifndef _RAID1_H
2#define _RAID1_H
3
4typedef struct mirror_info mirror_info_t;
5
6struct mirror_info {
7 mdk_rdev_t *rdev;
8 sector_t head_position;
9};
10
11/*
12 * memory pools need a pointer to the mddev, so they can force an unplug
13 * when memory is tight, and a count of the number of drives that the
14 * pool was allocated for, so they know how much to allocate and free.
 15 * mddev->raid_disks cannot be used, as it can change while a pool is active.
16 * These two datums are stored in a kmalloced struct.
17 */
18
19struct pool_info {
20 mddev_t *mddev;
21 int raid_disks;
22};
23
24
25typedef struct r1bio_s r1bio_t;
26
27struct r1_private_data_s {
28 mddev_t *mddev;
29 mirror_info_t *mirrors;
30 int raid_disks;
31 int last_used;
32 sector_t next_seq_sect;
33 spinlock_t device_lock;
34
35 struct list_head retry_list;
36 /* queue pending writes and submit them on unplug */
37 struct bio_list pending_bio_list;
38 /* queue of writes that have been unplugged */
39 struct bio_list flushing_bio_list;
40
41 /* for use when syncing mirrors: */
42
43 spinlock_t resync_lock;
44 int nr_pending;
45 int nr_waiting;
46 int nr_queued;
47 int barrier;
48 sector_t next_resync;
49 int fullsync; /* set to 1 if a full sync is needed,
50 * (fresh device added).
51 * Cleared when a sync completes.
52 */
53
54 wait_queue_head_t wait_barrier;
55
56 struct pool_info *poolinfo;
57
58 struct page *tmppage;
59
60 mempool_t *r1bio_pool;
61 mempool_t *r1buf_pool;
62};
63
64typedef struct r1_private_data_s conf_t;
65
66/*
67 * this is the only point in the RAID code where we violate
68 * C type safety. mddev->private is an 'opaque' pointer.
69 */
70#define mddev_to_conf(mddev) ((conf_t *) mddev->private)
71
72/*
73 * this is our 'private' RAID1 bio.
74 *
75 * it contains information about what kind of IO operations were started
76 * for this RAID1 operation, and about their status:
77 */
78
79struct r1bio_s {
80 atomic_t remaining; /* 'have we finished' count,
81 * used from IRQ handlers
82 */
83 atomic_t behind_remaining; /* number of write-behind ios remaining
84 * in this BehindIO request
85 */
86 sector_t sector;
87 int sectors;
88 unsigned long state;
89 mddev_t *mddev;
90 /*
91 * original bio going to /dev/mdx
92 */
93 struct bio *master_bio;
94 /*
95 * if the IO is in READ direction, then this is where we read
96 */
97 int read_disk;
98
99 struct list_head retry_list;
100 struct bitmap_update *bitmap_update;
101 /*
102 * if the IO is in WRITE direction, then multiple bios are used.
103 * We choose the number when they are allocated.
104 */
105 struct bio *bios[0];
106 /* DO NOT PUT ANY NEW FIELDS HERE - bios array is contiguously alloced*/
107};
108
109/* when we get a read error on a read-only array, we redirect to another
110 * device without failing the first device, or trying to over-write to
111 * correct the read error. To keep track of bad blocks on a per-bio
112 * level, we store IO_BLOCKED in the appropriate 'bios' pointer
113 */
114#define IO_BLOCKED ((struct bio*)1)
115
116/* bits for r1bio.state */
117#define R1BIO_Uptodate 0
118#define R1BIO_IsSync 1
119#define R1BIO_Degraded 2
120#define R1BIO_BehindIO 3
121#define R1BIO_Barrier 4
122#define R1BIO_BarrierRetry 5
123/* For write-behind requests, we call bi_end_io when
 124 * the last non-write-behind device completes, provided
 125 * any write was successful. Otherwise we call it when
 126 * any write-behind write succeeds, or we call it
 127 * with failure when the last write completes (and all failed).
128 * Record that bi_end_io was called with this flag...
129 */
130#define R1BIO_Returned 6
131
132#endif
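
Annotation: the pool_info comment and the "DO NOT PUT ANY NEW FIELDS HERE" warning both stem from the same allocation trick: an r1bio and its trailing bios[] array are allocated as one block, sized for the raid_disks snapshotted in pool_info (mddev->raid_disks may change while the pool lives). A simplified userspace analogue, assuming made-up type and helper names:

	#include <stdlib.h>

	struct bio;				/* opaque, as in the kernel */

	typedef struct { int raid_disks; } pool_info_sketch_t;

	typedef struct {
		unsigned long state;		/* stand-in for the fixed r1bio fields */
		struct bio *bios[];		/* must stay last: one slot per mirror */
	} r1bio_sketch_t;

	/* Size the object so bios[] has one entry per mirror the pool was
	 * created for; the kernel computes the equivalent size over its
	 * trailing bios[] array. */
	static r1bio_sketch_t *r1bio_alloc_sketch(const pool_info_sketch_t *pi)
	{
		size_t size = sizeof(r1bio_sketch_t)
			      + (size_t)pi->raid_disks * sizeof(struct bio *);
		return calloc(1, size);
	}

	int main(void)
	{
		pool_info_sketch_t pi = { .raid_disks = 4 };
		r1bio_sketch_t *r1 = r1bio_alloc_sketch(&pi);
		/* r1->bios[0..3] are valid slots; a single free() releases all. */
		free(r1);
		return 0;
	}
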
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 7301631abe04..e293d92641ac 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -18,10 +18,13 @@
18 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 18 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 */ 19 */
20 20
21#include "dm-bio-list.h"
22#include <linux/delay.h> 21#include <linux/delay.h>
23#include <linux/raid/raid10.h> 22#include <linux/blkdev.h>
24#include <linux/raid/bitmap.h> 23#include <linux/seq_file.h>
24#include "md.h"
25#include "dm-bio-list.h"
26#include "raid10.h"
27#include "bitmap.h"
25 28
26/* 29/*
27 * RAID10 provides a combination of RAID0 and RAID1 functionality. 30 * RAID10 provides a combination of RAID0 and RAID1 functionality.
@@ -1695,7 +1698,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1695 return 0; 1698 return 0;
1696 1699
1697 skipped: 1700 skipped:
1698 max_sector = mddev->size << 1; 1701 max_sector = mddev->dev_sectors;
1699 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 1702 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
1700 max_sector = mddev->resync_max_sectors; 1703 max_sector = mddev->resync_max_sectors;
1701 if (sector_nr >= max_sector) { 1704 if (sector_nr >= max_sector) {
@@ -2020,6 +2023,25 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
2020 goto skipped; 2023 goto skipped;
2021} 2024}
2022 2025
2026static sector_t
2027raid10_size(mddev_t *mddev, sector_t sectors, int raid_disks)
2028{
2029 sector_t size;
2030 conf_t *conf = mddev_to_conf(mddev);
2031
2032 if (!raid_disks)
2033 raid_disks = mddev->raid_disks;
2034 if (!sectors)
2035 sectors = mddev->dev_sectors;
2036
2037 size = sectors >> conf->chunk_shift;
2038 sector_div(size, conf->far_copies);
2039 size = size * raid_disks;
2040 sector_div(size, conf->near_copies);
2041
2042 return size << conf->chunk_shift;
2043}
2044
2023static int run(mddev_t *mddev) 2045static int run(mddev_t *mddev)
2024{ 2046{
2025 conf_t *conf; 2047 conf_t *conf;
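
Annotation: raid10_size() above turns per-device capacity into exported array capacity by rounding dev_sectors down to whole chunks, dividing by far_copies, multiplying by the number of disks, dividing by near_copies, and converting back to sectors. A worked example with hypothetical numbers (4 disks, near_copies = 2, far_copies = 1, 64 KiB chunks, 1,000,000 sectors per device):

	#include <stdio.h>

	int main(void)
	{
		unsigned long long dev_sectors = 1000000;	/* per device, hypothetical */
		int raid_disks = 4, near_copies = 2, far_copies = 1;
		int chunk_shift = 7;				/* 128 sectors = 64 KiB chunks */

		unsigned long long size = dev_sectors >> chunk_shift;	/* 7812 whole chunks */
		size /= far_copies;					/* 7812 */
		size *= raid_disks;					/* 31248 */
		size /= near_copies;					/* 15624 */
		printf("array size = %llu sectors\n", size << chunk_shift);
		/* prints 1999872: two copies of every block, so about half the raw space */
		return 0;
	}
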
@@ -2076,7 +2098,7 @@ static int run(mddev_t *mddev)
2076 conf->far_offset = fo; 2098 conf->far_offset = fo;
2077 conf->chunk_mask = (sector_t)(mddev->chunk_size>>9)-1; 2099 conf->chunk_mask = (sector_t)(mddev->chunk_size>>9)-1;
2078 conf->chunk_shift = ffz(~mddev->chunk_size) - 9; 2100 conf->chunk_shift = ffz(~mddev->chunk_size) - 9;
2079 size = mddev->size >> (conf->chunk_shift-1); 2101 size = mddev->dev_sectors >> conf->chunk_shift;
2080 sector_div(size, fc); 2102 sector_div(size, fc);
2081 size = size * conf->raid_disks; 2103 size = size * conf->raid_disks;
2082 sector_div(size, nc); 2104 sector_div(size, nc);
@@ -2089,7 +2111,7 @@ static int run(mddev_t *mddev)
2089 */ 2111 */
2090 stride += conf->raid_disks - 1; 2112 stride += conf->raid_disks - 1;
2091 sector_div(stride, conf->raid_disks); 2113 sector_div(stride, conf->raid_disks);
2092 mddev->size = stride << (conf->chunk_shift-1); 2114 mddev->dev_sectors = stride << conf->chunk_shift;
2093 2115
2094 if (fo) 2116 if (fo)
2095 stride = 1; 2117 stride = 1;
@@ -2171,8 +2193,8 @@ static int run(mddev_t *mddev)
2171 /* 2193 /*
2172 * Ok, everything is just fine now 2194 * Ok, everything is just fine now
2173 */ 2195 */
2174 mddev->array_sectors = size << conf->chunk_shift; 2196 md_set_array_sectors(mddev, raid10_size(mddev, 0, 0));
2175 mddev->resync_max_sectors = size << conf->chunk_shift; 2197 mddev->resync_max_sectors = raid10_size(mddev, 0, 0);
2176 2198
2177 mddev->queue->unplug_fn = raid10_unplug; 2199 mddev->queue->unplug_fn = raid10_unplug;
2178 mddev->queue->backing_dev_info.congested_fn = raid10_congested; 2200 mddev->queue->backing_dev_info.congested_fn = raid10_congested;
@@ -2208,6 +2230,9 @@ static int stop(mddev_t *mddev)
2208{ 2230{
2209 conf_t *conf = mddev_to_conf(mddev); 2231 conf_t *conf = mddev_to_conf(mddev);
2210 2232
2233 raise_barrier(conf, 0);
2234 lower_barrier(conf);
2235
2211 md_unregister_thread(mddev->thread); 2236 md_unregister_thread(mddev->thread);
2212 mddev->thread = NULL; 2237 mddev->thread = NULL;
2213 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ 2238 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
@@ -2255,6 +2280,7 @@ static struct mdk_personality raid10_personality =
2255 .spare_active = raid10_spare_active, 2280 .spare_active = raid10_spare_active,
2256 .sync_request = sync_request, 2281 .sync_request = sync_request,
2257 .quiesce = raid10_quiesce, 2282 .quiesce = raid10_quiesce,
2283 .size = raid10_size,
2258}; 2284};
2259 2285
2260static int __init raid_init(void) 2286static int __init raid_init(void)
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
new file mode 100644
index 000000000000..244dbe507a54
--- /dev/null
+++ b/drivers/md/raid10.h
@@ -0,0 +1,121 @@
1#ifndef _RAID10_H
2#define _RAID10_H
3
4typedef struct mirror_info mirror_info_t;
5
6struct mirror_info {
7 mdk_rdev_t *rdev;
8 sector_t head_position;
9};
10
11typedef struct r10bio_s r10bio_t;
12
13struct r10_private_data_s {
14 mddev_t *mddev;
15 mirror_info_t *mirrors;
16 int raid_disks;
17 spinlock_t device_lock;
18
19 /* geometry */
 20 int near_copies; /* number of copies laid out raid0 style */
 21 int far_copies; /* number of copies laid out
22 * at large strides across drives
23 */
24 int far_offset; /* far_copies are offset by 1 stripe
25 * instead of many
26 */
27 int copies; /* near_copies * far_copies.
28 * must be <= raid_disks
29 */
30 sector_t stride; /* distance between far copies.
31 * This is size / far_copies unless
32 * far_offset, in which case it is
33 * 1 stripe.
34 */
35
36 int chunk_shift; /* shift from chunks to sectors */
37 sector_t chunk_mask;
38
39 struct list_head retry_list;
40 /* queue pending writes and submit them on unplug */
41 struct bio_list pending_bio_list;
42
43
44 spinlock_t resync_lock;
45 int nr_pending;
46 int nr_waiting;
47 int nr_queued;
48 int barrier;
49 sector_t next_resync;
50 int fullsync; /* set to 1 if a full sync is needed,
51 * (fresh device added).
52 * Cleared when a sync completes.
53 */
54
55 wait_queue_head_t wait_barrier;
56
57 mempool_t *r10bio_pool;
58 mempool_t *r10buf_pool;
59 struct page *tmppage;
60};
61
62typedef struct r10_private_data_s conf_t;
63
64/*
65 * this is the only point in the RAID code where we violate
66 * C type safety. mddev->private is an 'opaque' pointer.
67 */
68#define mddev_to_conf(mddev) ((conf_t *) mddev->private)
69
70/*
71 * this is our 'private' RAID10 bio.
72 *
73 * it contains information about what kind of IO operations were started
74 * for this RAID10 operation, and about their status:
75 */
76
77struct r10bio_s {
78 atomic_t remaining; /* 'have we finished' count,
79 * used from IRQ handlers
80 */
81 sector_t sector; /* virtual sector number */
82 int sectors;
83 unsigned long state;
84 mddev_t *mddev;
85 /*
86 * original bio going to /dev/mdx
87 */
88 struct bio *master_bio;
89 /*
90 * if the IO is in READ direction, then this is where we read
91 */
92 int read_slot;
93
94 struct list_head retry_list;
95 /*
96 * if the IO is in WRITE direction, then multiple bios are used,
97 * one for each copy.
98 * When resyncing we also use one for each copy.
99 * When reconstructing, we use 2 bios, one for read, one for write.
100 * We choose the number when they are allocated.
101 */
102 struct {
103 struct bio *bio;
104 sector_t addr;
105 int devnum;
106 } devs[0];
107};
108
109/* when we get a read error on a read-only array, we redirect to another
110 * device without failing the first device, or trying to over-write to
111 * correct the read error. To keep track of bad blocks on a per-bio
112 * level, we store IO_BLOCKED in the appropriate 'bios' pointer
113 */
114#define IO_BLOCKED ((struct bio*)1)
115
116/* bits for r10bio.state */
117#define R10BIO_Uptodate 0
118#define R10BIO_IsSync 1
119#define R10BIO_IsRecover 2
120#define R10BIO_Degraded 3
121#endif
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index a5ba080d303b..3bbc6d647044 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -43,11 +43,14 @@
43 * miss any bits. 43 * miss any bits.
44 */ 44 */
45 45
46#include <linux/blkdev.h>
46#include <linux/kthread.h> 47#include <linux/kthread.h>
47#include "raid6.h" 48#include <linux/raid/pq.h>
48
49#include <linux/raid/bitmap.h>
50#include <linux/async_tx.h> 49#include <linux/async_tx.h>
50#include <linux/seq_file.h>
51#include "md.h"
52#include "raid5.h"
53#include "bitmap.h"
51 54
52/* 55/*
53 * Stripe cache 56 * Stripe cache
@@ -91,11 +94,6 @@
91 94
92#define printk_rl(args...) ((void) (printk_ratelimit() && printk(args))) 95#define printk_rl(args...) ((void) (printk_ratelimit() && printk(args)))
93 96
94#if !RAID6_USE_EMPTY_ZERO_PAGE
95/* In .bss so it's zeroed */
96const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256)));
97#endif
98
99/* 97/*
100 * We maintain a biased count of active stripes in the bottom 16 bits of 98 * We maintain a biased count of active stripes in the bottom 16 bits of
101 * bi_phys_segments, and a count of processed stripes in the upper 16 bits 99 * bi_phys_segments, and a count of processed stripes in the upper 16 bits
@@ -130,12 +128,42 @@ static inline void raid5_set_bi_hw_segments(struct bio *bio, unsigned int cnt)
130 bio->bi_phys_segments = raid5_bi_phys_segments(bio) || (cnt << 16); 128 bio->bi_phys_segments = raid5_bi_phys_segments(bio) || (cnt << 16);
131} 129}
132 130
131/* Find first data disk in a raid6 stripe */
132static inline int raid6_d0(struct stripe_head *sh)
133{
134 if (sh->ddf_layout)
 135 /* ddf always starts from the first device */
136 return 0;
137 /* md starts just after Q block */
138 if (sh->qd_idx == sh->disks - 1)
139 return 0;
140 else
141 return sh->qd_idx + 1;
142}
133static inline int raid6_next_disk(int disk, int raid_disks) 143static inline int raid6_next_disk(int disk, int raid_disks)
134{ 144{
135 disk++; 145 disk++;
136 return (disk < raid_disks) ? disk : 0; 146 return (disk < raid_disks) ? disk : 0;
137} 147}
138 148
 149/* When walking through the disks in a raid6 stripe, starting at raid6_d0,
 150 * we need to map each disk to a 'slot', where the data disks are slots
 151 * 0 .. raid_disks-3, the parity disk is raid_disks-2 and the Q disk
 152 * is raid_disks-1. This helper does that mapping.
153 */
154static int raid6_idx_to_slot(int idx, struct stripe_head *sh,
155 int *count, int syndrome_disks)
156{
157 int slot;
158
159 if (idx == sh->pd_idx)
160 return syndrome_disks;
161 if (idx == sh->qd_idx)
162 return syndrome_disks + 1;
163 slot = (*count)++;
164 return slot;
165}
166
139static void return_io(struct bio *return_bi) 167static void return_io(struct bio *return_bi)
140{ 168{
141 struct bio *bi = return_bi; 169 struct bio *bi = return_bi;
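
Annotation: raid6_d0() and raid6_idx_to_slot() together translate physical device indices into the fixed slot order the syndrome code expects (data in slots 0 .. syndrome_disks-1, P at syndrome_disks, Q at syndrome_disks+1). A standalone walk-through for a hypothetical 6-device md-layout stripe with pd_idx = 2 and qd_idx = 3:

	#include <stdio.h>

	int main(void)
	{
		int disks = 6, pd_idx = 2, qd_idx = 3;	/* hypothetical rotated stripe */
		int syndrome_disks = disks - 2;		/* md layout: P and Q not counted */
		int d0 = (qd_idx == disks - 1) ? 0 : qd_idx + 1;	/* raid6_d0() */
		int i = d0, count = 0;

		do {					/* same walk as compute_parity6() */
			int slot;
			if (i == pd_idx)
				slot = syndrome_disks;		/* P -> slot 4 */
			else if (i == qd_idx)
				slot = syndrome_disks + 1;	/* Q -> slot 5 */
			else
				slot = count++;			/* data fills 0..3 in walk order */
			printf("device %d -> slot %d\n", i, slot);
			i = (i + 1 < disks) ? i + 1 : 0;	/* raid6_next_disk() */
		} while (i != d0);
		/* prints: 4->0, 5->1, 0->2, 1->3, 2->4 (P), 3->5 (Q) */
		return 0;
	}
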
@@ -193,6 +221,7 @@ static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
193 } 221 }
194 } 222 }
195} 223}
224
196static void release_stripe(struct stripe_head *sh) 225static void release_stripe(struct stripe_head *sh)
197{ 226{
198 raid5_conf_t *conf = sh->raid_conf; 227 raid5_conf_t *conf = sh->raid_conf;
@@ -270,9 +299,11 @@ static int grow_buffers(struct stripe_head *sh, int num)
270 return 0; 299 return 0;
271} 300}
272 301
273static void raid5_build_block(struct stripe_head *sh, int i); 302static void raid5_build_block(struct stripe_head *sh, int i, int previous);
303static void stripe_set_idx(sector_t stripe, raid5_conf_t *conf, int previous,
304 struct stripe_head *sh);
274 305
275static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx, int disks) 306static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
276{ 307{
277 raid5_conf_t *conf = sh->raid_conf; 308 raid5_conf_t *conf = sh->raid_conf;
278 int i; 309 int i;
@@ -287,11 +318,12 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx, int
287 318
288 remove_hash(sh); 319 remove_hash(sh);
289 320
321 sh->generation = conf->generation - previous;
322 sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks;
290 sh->sector = sector; 323 sh->sector = sector;
291 sh->pd_idx = pd_idx; 324 stripe_set_idx(sector, conf, previous, sh);
292 sh->state = 0; 325 sh->state = 0;
293 326
294 sh->disks = disks;
295 327
296 for (i = sh->disks; i--; ) { 328 for (i = sh->disks; i--; ) {
297 struct r5dev *dev = &sh->dev[i]; 329 struct r5dev *dev = &sh->dev[i];
@@ -305,12 +337,13 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx, int
305 BUG(); 337 BUG();
306 } 338 }
307 dev->flags = 0; 339 dev->flags = 0;
308 raid5_build_block(sh, i); 340 raid5_build_block(sh, i, previous);
309 } 341 }
310 insert_hash(conf, sh); 342 insert_hash(conf, sh);
311} 343}
312 344
313static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector, int disks) 345static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector,
346 short generation)
314{ 347{
315 struct stripe_head *sh; 348 struct stripe_head *sh;
316 struct hlist_node *hn; 349 struct hlist_node *hn;
@@ -318,7 +351,7 @@ static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector, in
318 CHECK_DEVLOCK(); 351 CHECK_DEVLOCK();
319 pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector); 352 pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector);
320 hlist_for_each_entry(sh, hn, stripe_hash(conf, sector), hash) 353 hlist_for_each_entry(sh, hn, stripe_hash(conf, sector), hash)
321 if (sh->sector == sector && sh->disks == disks) 354 if (sh->sector == sector && sh->generation == generation)
322 return sh; 355 return sh;
323 pr_debug("__stripe %llu not in cache\n", (unsigned long long)sector); 356 pr_debug("__stripe %llu not in cache\n", (unsigned long long)sector);
324 return NULL; 357 return NULL;
@@ -327,8 +360,9 @@ static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector, in
327static void unplug_slaves(mddev_t *mddev); 360static void unplug_slaves(mddev_t *mddev);
328static void raid5_unplug_device(struct request_queue *q); 361static void raid5_unplug_device(struct request_queue *q);
329 362
330static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector, int disks, 363static struct stripe_head *
331 int pd_idx, int noblock) 364get_active_stripe(raid5_conf_t *conf, sector_t sector,
365 int previous, int noblock)
332{ 366{
333 struct stripe_head *sh; 367 struct stripe_head *sh;
334 368
@@ -340,7 +374,7 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector
340 wait_event_lock_irq(conf->wait_for_stripe, 374 wait_event_lock_irq(conf->wait_for_stripe,
341 conf->quiesce == 0, 375 conf->quiesce == 0,
342 conf->device_lock, /* nothing */); 376 conf->device_lock, /* nothing */);
343 sh = __find_stripe(conf, sector, disks); 377 sh = __find_stripe(conf, sector, conf->generation - previous);
344 if (!sh) { 378 if (!sh) {
345 if (!conf->inactive_blocked) 379 if (!conf->inactive_blocked)
346 sh = get_free_stripe(conf); 380 sh = get_free_stripe(conf);
@@ -358,10 +392,11 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector
358 ); 392 );
359 conf->inactive_blocked = 0; 393 conf->inactive_blocked = 0;
360 } else 394 } else
361 init_stripe(sh, sector, pd_idx, disks); 395 init_stripe(sh, sector, previous);
362 } else { 396 } else {
363 if (atomic_read(&sh->count)) { 397 if (atomic_read(&sh->count)) {
364 BUG_ON(!list_empty(&sh->lru)); 398 BUG_ON(!list_empty(&sh->lru)
399 && !test_bit(STRIPE_EXPANDING, &sh->state));
365 } else { 400 } else {
366 if (!test_bit(STRIPE_HANDLE, &sh->state)) 401 if (!test_bit(STRIPE_HANDLE, &sh->state))
367 atomic_inc(&conf->active_stripes); 402 atomic_inc(&conf->active_stripes);
@@ -895,8 +930,10 @@ static int grow_stripes(raid5_conf_t *conf, int num)
895 struct kmem_cache *sc; 930 struct kmem_cache *sc;
896 int devs = conf->raid_disks; 931 int devs = conf->raid_disks;
897 932
898 sprintf(conf->cache_name[0], "raid5-%s", mdname(conf->mddev)); 933 sprintf(conf->cache_name[0],
899 sprintf(conf->cache_name[1], "raid5-%s-alt", mdname(conf->mddev)); 934 "raid%d-%s", conf->level, mdname(conf->mddev));
935 sprintf(conf->cache_name[1],
936 "raid%d-%s-alt", conf->level, mdname(conf->mddev));
900 conf->active_name = 0; 937 conf->active_name = 0;
901 sc = kmem_cache_create(conf->cache_name[conf->active_name], 938 sc = kmem_cache_create(conf->cache_name[conf->active_name],
902 sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev), 939 sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev),
@@ -911,7 +948,6 @@ static int grow_stripes(raid5_conf_t *conf, int num)
911 return 0; 948 return 0;
912} 949}
913 950
914#ifdef CONFIG_MD_RAID5_RESHAPE
915static int resize_stripes(raid5_conf_t *conf, int newsize) 951static int resize_stripes(raid5_conf_t *conf, int newsize)
916{ 952{
917 /* Make all the stripes able to hold 'newsize' devices. 953 /* Make all the stripes able to hold 'newsize' devices.
@@ -1036,7 +1072,6 @@ static int resize_stripes(raid5_conf_t *conf, int newsize)
1036 conf->pool_size = newsize; 1072 conf->pool_size = newsize;
1037 return err; 1073 return err;
1038} 1074}
1039#endif
1040 1075
1041static int drop_one_stripe(raid5_conf_t *conf) 1076static int drop_one_stripe(raid5_conf_t *conf)
1042{ 1077{
@@ -1066,7 +1101,7 @@ static void shrink_stripes(raid5_conf_t *conf)
1066 1101
1067static void raid5_end_read_request(struct bio * bi, int error) 1102static void raid5_end_read_request(struct bio * bi, int error)
1068{ 1103{
1069 struct stripe_head *sh = bi->bi_private; 1104 struct stripe_head *sh = bi->bi_private;
1070 raid5_conf_t *conf = sh->raid_conf; 1105 raid5_conf_t *conf = sh->raid_conf;
1071 int disks = sh->disks, i; 1106 int disks = sh->disks, i;
1072 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); 1107 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
@@ -1148,7 +1183,7 @@ static void raid5_end_read_request(struct bio * bi, int error)
1148 1183
1149static void raid5_end_write_request(struct bio *bi, int error) 1184static void raid5_end_write_request(struct bio *bi, int error)
1150{ 1185{
1151 struct stripe_head *sh = bi->bi_private; 1186 struct stripe_head *sh = bi->bi_private;
1152 raid5_conf_t *conf = sh->raid_conf; 1187 raid5_conf_t *conf = sh->raid_conf;
1153 int disks = sh->disks, i; 1188 int disks = sh->disks, i;
1154 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); 1189 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
@@ -1176,9 +1211,9 @@ static void raid5_end_write_request(struct bio *bi, int error)
1176} 1211}
1177 1212
1178 1213
1179static sector_t compute_blocknr(struct stripe_head *sh, int i); 1214static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous);
1180 1215
1181static void raid5_build_block(struct stripe_head *sh, int i) 1216static void raid5_build_block(struct stripe_head *sh, int i, int previous)
1182{ 1217{
1183 struct r5dev *dev = &sh->dev[i]; 1218 struct r5dev *dev = &sh->dev[i];
1184 1219
@@ -1194,7 +1229,7 @@ static void raid5_build_block(struct stripe_head *sh, int i)
1194 dev->req.bi_private = sh; 1229 dev->req.bi_private = sh;
1195 1230
1196 dev->flags = 0; 1231 dev->flags = 0;
1197 dev->sector = compute_blocknr(sh, i); 1232 dev->sector = compute_blocknr(sh, i, previous);
1198} 1233}
1199 1234
1200static void error(mddev_t *mddev, mdk_rdev_t *rdev) 1235static void error(mddev_t *mddev, mdk_rdev_t *rdev)
@@ -1227,15 +1262,23 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
1227 * Input: a 'big' sector number, 1262 * Input: a 'big' sector number,
1228 * Output: index of the data and parity disk, and the sector # in them. 1263 * Output: index of the data and parity disk, and the sector # in them.
1229 */ 1264 */
1230static sector_t raid5_compute_sector(sector_t r_sector, unsigned int raid_disks, 1265static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector,
1231 unsigned int data_disks, unsigned int * dd_idx, 1266 int previous, int *dd_idx,
1232 unsigned int * pd_idx, raid5_conf_t *conf) 1267 struct stripe_head *sh)
1233{ 1268{
1234 long stripe; 1269 long stripe;
1235 unsigned long chunk_number; 1270 unsigned long chunk_number;
1236 unsigned int chunk_offset; 1271 unsigned int chunk_offset;
1272 int pd_idx, qd_idx;
1273 int ddf_layout = 0;
1237 sector_t new_sector; 1274 sector_t new_sector;
1238 int sectors_per_chunk = conf->chunk_size >> 9; 1275 int algorithm = previous ? conf->prev_algo
1276 : conf->algorithm;
1277 int sectors_per_chunk = previous ? (conf->prev_chunk >> 9)
1278 : (conf->chunk_size >> 9);
1279 int raid_disks = previous ? conf->previous_raid_disks
1280 : conf->raid_disks;
1281 int data_disks = raid_disks - conf->max_degraded;
1239 1282
1240 /* First compute the information on this sector */ 1283 /* First compute the information on this sector */
1241 1284
@@ -1259,68 +1302,170 @@ static sector_t raid5_compute_sector(sector_t r_sector, unsigned int raid_disks,
1259 /* 1302 /*
1260 * Select the parity disk based on the user selected algorithm. 1303 * Select the parity disk based on the user selected algorithm.
1261 */ 1304 */
1305 pd_idx = qd_idx = ~0;
1262 switch(conf->level) { 1306 switch(conf->level) {
1263 case 4: 1307 case 4:
1264 *pd_idx = data_disks; 1308 pd_idx = data_disks;
1265 break; 1309 break;
1266 case 5: 1310 case 5:
1267 switch (conf->algorithm) { 1311 switch (algorithm) {
1268 case ALGORITHM_LEFT_ASYMMETRIC: 1312 case ALGORITHM_LEFT_ASYMMETRIC:
1269 *pd_idx = data_disks - stripe % raid_disks; 1313 pd_idx = data_disks - stripe % raid_disks;
1270 if (*dd_idx >= *pd_idx) 1314 if (*dd_idx >= pd_idx)
1271 (*dd_idx)++; 1315 (*dd_idx)++;
1272 break; 1316 break;
1273 case ALGORITHM_RIGHT_ASYMMETRIC: 1317 case ALGORITHM_RIGHT_ASYMMETRIC:
1274 *pd_idx = stripe % raid_disks; 1318 pd_idx = stripe % raid_disks;
1275 if (*dd_idx >= *pd_idx) 1319 if (*dd_idx >= pd_idx)
1276 (*dd_idx)++; 1320 (*dd_idx)++;
1277 break; 1321 break;
1278 case ALGORITHM_LEFT_SYMMETRIC: 1322 case ALGORITHM_LEFT_SYMMETRIC:
1279 *pd_idx = data_disks - stripe % raid_disks; 1323 pd_idx = data_disks - stripe % raid_disks;
1280 *dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks; 1324 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
1281 break; 1325 break;
1282 case ALGORITHM_RIGHT_SYMMETRIC: 1326 case ALGORITHM_RIGHT_SYMMETRIC:
1283 *pd_idx = stripe % raid_disks; 1327 pd_idx = stripe % raid_disks;
1284 *dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks; 1328 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
1329 break;
1330 case ALGORITHM_PARITY_0:
1331 pd_idx = 0;
1332 (*dd_idx)++;
1333 break;
1334 case ALGORITHM_PARITY_N:
1335 pd_idx = data_disks;
1285 break; 1336 break;
1286 default: 1337 default:
1287 printk(KERN_ERR "raid5: unsupported algorithm %d\n", 1338 printk(KERN_ERR "raid5: unsupported algorithm %d\n",
1288 conf->algorithm); 1339 algorithm);
1340 BUG();
1289 } 1341 }
1290 break; 1342 break;
1291 case 6: 1343 case 6:
1292 1344
1293 /**** FIX THIS ****/ 1345 switch (algorithm) {
1294 switch (conf->algorithm) {
1295 case ALGORITHM_LEFT_ASYMMETRIC: 1346 case ALGORITHM_LEFT_ASYMMETRIC:
1296 *pd_idx = raid_disks - 1 - (stripe % raid_disks); 1347 pd_idx = raid_disks - 1 - (stripe % raid_disks);
1297 if (*pd_idx == raid_disks-1) 1348 qd_idx = pd_idx + 1;
1298 (*dd_idx)++; /* Q D D D P */ 1349 if (pd_idx == raid_disks-1) {
1299 else if (*dd_idx >= *pd_idx) 1350 (*dd_idx)++; /* Q D D D P */
1351 qd_idx = 0;
1352 } else if (*dd_idx >= pd_idx)
1300 (*dd_idx) += 2; /* D D P Q D */ 1353 (*dd_idx) += 2; /* D D P Q D */
1301 break; 1354 break;
1302 case ALGORITHM_RIGHT_ASYMMETRIC: 1355 case ALGORITHM_RIGHT_ASYMMETRIC:
1303 *pd_idx = stripe % raid_disks; 1356 pd_idx = stripe % raid_disks;
1304 if (*pd_idx == raid_disks-1) 1357 qd_idx = pd_idx + 1;
1305 (*dd_idx)++; /* Q D D D P */ 1358 if (pd_idx == raid_disks-1) {
1306 else if (*dd_idx >= *pd_idx) 1359 (*dd_idx)++; /* Q D D D P */
1360 qd_idx = 0;
1361 } else if (*dd_idx >= pd_idx)
1307 (*dd_idx) += 2; /* D D P Q D */ 1362 (*dd_idx) += 2; /* D D P Q D */
1308 break; 1363 break;
1309 case ALGORITHM_LEFT_SYMMETRIC: 1364 case ALGORITHM_LEFT_SYMMETRIC:
1310 *pd_idx = raid_disks - 1 - (stripe % raid_disks); 1365 pd_idx = raid_disks - 1 - (stripe % raid_disks);
1311 *dd_idx = (*pd_idx + 2 + *dd_idx) % raid_disks; 1366 qd_idx = (pd_idx + 1) % raid_disks;
1367 *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks;
1312 break; 1368 break;
1313 case ALGORITHM_RIGHT_SYMMETRIC: 1369 case ALGORITHM_RIGHT_SYMMETRIC:
1314 *pd_idx = stripe % raid_disks; 1370 pd_idx = stripe % raid_disks;
1315 *dd_idx = (*pd_idx + 2 + *dd_idx) % raid_disks; 1371 qd_idx = (pd_idx + 1) % raid_disks;
1372 *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks;
1373 break;
1374
1375 case ALGORITHM_PARITY_0:
1376 pd_idx = 0;
1377 qd_idx = 1;
1378 (*dd_idx) += 2;
1379 break;
1380 case ALGORITHM_PARITY_N:
1381 pd_idx = data_disks;
1382 qd_idx = data_disks + 1;
1316 break; 1383 break;
1384
1385 case ALGORITHM_ROTATING_ZERO_RESTART:
 1386 /* Exactly the same as RIGHT_ASYMMETRIC, but the order
 1387 * of blocks for computing Q is different.
1388 */
1389 pd_idx = stripe % raid_disks;
1390 qd_idx = pd_idx + 1;
1391 if (pd_idx == raid_disks-1) {
1392 (*dd_idx)++; /* Q D D D P */
1393 qd_idx = 0;
1394 } else if (*dd_idx >= pd_idx)
1395 (*dd_idx) += 2; /* D D P Q D */
1396 ddf_layout = 1;
1397 break;
1398
1399 case ALGORITHM_ROTATING_N_RESTART:
 1400 /* Same as left_asymmetric, but the first stripe is
1401 * D D D P Q rather than
1402 * Q D D D P
1403 */
1404 pd_idx = raid_disks - 1 - ((stripe + 1) % raid_disks);
1405 qd_idx = pd_idx + 1;
1406 if (pd_idx == raid_disks-1) {
1407 (*dd_idx)++; /* Q D D D P */
1408 qd_idx = 0;
1409 } else if (*dd_idx >= pd_idx)
1410 (*dd_idx) += 2; /* D D P Q D */
1411 ddf_layout = 1;
1412 break;
1413
1414 case ALGORITHM_ROTATING_N_CONTINUE:
1415 /* Same as left_symmetric but Q is before P */
1416 pd_idx = raid_disks - 1 - (stripe % raid_disks);
1417 qd_idx = (pd_idx + raid_disks - 1) % raid_disks;
1418 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
1419 ddf_layout = 1;
1420 break;
1421
1422 case ALGORITHM_LEFT_ASYMMETRIC_6:
1423 /* RAID5 left_asymmetric, with Q on last device */
1424 pd_idx = data_disks - stripe % (raid_disks-1);
1425 if (*dd_idx >= pd_idx)
1426 (*dd_idx)++;
1427 qd_idx = raid_disks - 1;
1428 break;
1429
1430 case ALGORITHM_RIGHT_ASYMMETRIC_6:
1431 pd_idx = stripe % (raid_disks-1);
1432 if (*dd_idx >= pd_idx)
1433 (*dd_idx)++;
1434 qd_idx = raid_disks - 1;
1435 break;
1436
1437 case ALGORITHM_LEFT_SYMMETRIC_6:
1438 pd_idx = data_disks - stripe % (raid_disks-1);
1439 *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1);
1440 qd_idx = raid_disks - 1;
1441 break;
1442
1443 case ALGORITHM_RIGHT_SYMMETRIC_6:
1444 pd_idx = stripe % (raid_disks-1);
1445 *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1);
1446 qd_idx = raid_disks - 1;
1447 break;
1448
1449 case ALGORITHM_PARITY_0_6:
1450 pd_idx = 0;
1451 (*dd_idx)++;
1452 qd_idx = raid_disks - 1;
1453 break;
1454
1455
1317 default: 1456 default:
1318 printk(KERN_CRIT "raid6: unsupported algorithm %d\n", 1457 printk(KERN_CRIT "raid6: unsupported algorithm %d\n",
1319 conf->algorithm); 1458 algorithm);
1459 BUG();
1320 } 1460 }
1321 break; 1461 break;
1322 } 1462 }
1323 1463
1464 if (sh) {
1465 sh->pd_idx = pd_idx;
1466 sh->qd_idx = qd_idx;
1467 sh->ddf_layout = ddf_layout;
1468 }
1324 /* 1469 /*
1325 * Finally, compute the new sector number 1470 * Finally, compute the new sector number
1326 */ 1471 */
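
Annotation: the switch above decides, per stripe, which device holds P (and Q for level 6) and how the data index shifts around it. The standalone sketch below reproduces only the level-5 ALGORITHM_LEFT_SYMMETRIC branch for a hypothetical 4-disk array, printing where parity and the first data chunk land for the first few stripes:

	#include <stdio.h>

	int main(void)
	{
		int raid_disks = 4, data_disks = raid_disks - 1;

		for (long stripe = 0; stripe < 4; stripe++) {
			int dd_idx = 0;		/* first data chunk of this stripe */
			int pd_idx = data_disks - stripe % raid_disks;
			dd_idx = (pd_idx + 1 + dd_idx) % raid_disks;
			printf("stripe %ld: P on disk %d, data starts on disk %d\n",
			       stripe, pd_idx, dd_idx);
		}
		/* P rotates 3,2,1,0 and the data chunks start just after it. */
		return 0;
	}
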
@@ -1329,17 +1474,21 @@ static sector_t raid5_compute_sector(sector_t r_sector, unsigned int raid_disks,
1329} 1474}
1330 1475
1331 1476
1332static sector_t compute_blocknr(struct stripe_head *sh, int i) 1477static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous)
1333{ 1478{
1334 raid5_conf_t *conf = sh->raid_conf; 1479 raid5_conf_t *conf = sh->raid_conf;
1335 int raid_disks = sh->disks; 1480 int raid_disks = sh->disks;
1336 int data_disks = raid_disks - conf->max_degraded; 1481 int data_disks = raid_disks - conf->max_degraded;
1337 sector_t new_sector = sh->sector, check; 1482 sector_t new_sector = sh->sector, check;
1338 int sectors_per_chunk = conf->chunk_size >> 9; 1483 int sectors_per_chunk = previous ? (conf->prev_chunk >> 9)
1484 : (conf->chunk_size >> 9);
1485 int algorithm = previous ? conf->prev_algo
1486 : conf->algorithm;
1339 sector_t stripe; 1487 sector_t stripe;
1340 int chunk_offset; 1488 int chunk_offset;
1341 int chunk_number, dummy1, dummy2, dd_idx = i; 1489 int chunk_number, dummy1, dd_idx = i;
1342 sector_t r_sector; 1490 sector_t r_sector;
1491 struct stripe_head sh2;
1343 1492
1344 1493
1345 chunk_offset = sector_div(new_sector, sectors_per_chunk); 1494 chunk_offset = sector_div(new_sector, sectors_per_chunk);
@@ -1351,7 +1500,7 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i)
1351 switch(conf->level) { 1500 switch(conf->level) {
1352 case 4: break; 1501 case 4: break;
1353 case 5: 1502 case 5:
1354 switch (conf->algorithm) { 1503 switch (algorithm) {
1355 case ALGORITHM_LEFT_ASYMMETRIC: 1504 case ALGORITHM_LEFT_ASYMMETRIC:
1356 case ALGORITHM_RIGHT_ASYMMETRIC: 1505 case ALGORITHM_RIGHT_ASYMMETRIC:
1357 if (i > sh->pd_idx) 1506 if (i > sh->pd_idx)
@@ -1363,19 +1512,27 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i)
1363 i += raid_disks; 1512 i += raid_disks;
1364 i -= (sh->pd_idx + 1); 1513 i -= (sh->pd_idx + 1);
1365 break; 1514 break;
1515 case ALGORITHM_PARITY_0:
1516 i -= 1;
1517 break;
1518 case ALGORITHM_PARITY_N:
1519 break;
1366 default: 1520 default:
1367 printk(KERN_ERR "raid5: unsupported algorithm %d\n", 1521 printk(KERN_ERR "raid5: unsupported algorithm %d\n",
1368 conf->algorithm); 1522 algorithm);
1523 BUG();
1369 } 1524 }
1370 break; 1525 break;
1371 case 6: 1526 case 6:
1372 if (i == raid6_next_disk(sh->pd_idx, raid_disks)) 1527 if (i == sh->qd_idx)
1373 return 0; /* It is the Q disk */ 1528 return 0; /* It is the Q disk */
1374 switch (conf->algorithm) { 1529 switch (algorithm) {
1375 case ALGORITHM_LEFT_ASYMMETRIC: 1530 case ALGORITHM_LEFT_ASYMMETRIC:
1376 case ALGORITHM_RIGHT_ASYMMETRIC: 1531 case ALGORITHM_RIGHT_ASYMMETRIC:
1377 if (sh->pd_idx == raid_disks-1) 1532 case ALGORITHM_ROTATING_ZERO_RESTART:
1378 i--; /* Q D D D P */ 1533 case ALGORITHM_ROTATING_N_RESTART:
1534 if (sh->pd_idx == raid_disks-1)
1535 i--; /* Q D D D P */
1379 else if (i > sh->pd_idx) 1536 else if (i > sh->pd_idx)
1380 i -= 2; /* D D P Q D */ 1537 i -= 2; /* D D P Q D */
1381 break; 1538 break;
@@ -1390,9 +1547,35 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i)
1390 i -= (sh->pd_idx + 2); 1547 i -= (sh->pd_idx + 2);
1391 } 1548 }
1392 break; 1549 break;
1550 case ALGORITHM_PARITY_0:
1551 i -= 2;
1552 break;
1553 case ALGORITHM_PARITY_N:
1554 break;
1555 case ALGORITHM_ROTATING_N_CONTINUE:
1556 if (sh->pd_idx == 0)
1557 i--; /* P D D D Q */
1558 else if (i > sh->pd_idx)
1559 i -= 2; /* D D Q P D */
1560 break;
1561 case ALGORITHM_LEFT_ASYMMETRIC_6:
1562 case ALGORITHM_RIGHT_ASYMMETRIC_6:
1563 if (i > sh->pd_idx)
1564 i--;
1565 break;
1566 case ALGORITHM_LEFT_SYMMETRIC_6:
1567 case ALGORITHM_RIGHT_SYMMETRIC_6:
1568 if (i < sh->pd_idx)
1569 i += data_disks + 1;
1570 i -= (sh->pd_idx + 1);
1571 break;
1572 case ALGORITHM_PARITY_0_6:
1573 i -= 1;
1574 break;
1393 default: 1575 default:
1394 printk(KERN_CRIT "raid6: unsupported algorithm %d\n", 1576 printk(KERN_CRIT "raid6: unsupported algorithm %d\n",
1395 conf->algorithm); 1577 algorithm);
1578 BUG();
1396 } 1579 }
1397 break; 1580 break;
1398 } 1581 }
@@ -1400,8 +1583,10 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i)
1400 chunk_number = stripe * data_disks + i; 1583 chunk_number = stripe * data_disks + i;
1401 r_sector = (sector_t)chunk_number * sectors_per_chunk + chunk_offset; 1584 r_sector = (sector_t)chunk_number * sectors_per_chunk + chunk_offset;
1402 1585
1403 check = raid5_compute_sector(r_sector, raid_disks, data_disks, &dummy1, &dummy2, conf); 1586 check = raid5_compute_sector(conf, r_sector,
1404 if (check != sh->sector || dummy1 != dd_idx || dummy2 != sh->pd_idx) { 1587 previous, &dummy1, &sh2);
1588 if (check != sh->sector || dummy1 != dd_idx || sh2.pd_idx != sh->pd_idx
1589 || sh2.qd_idx != sh->qd_idx) {
1405 printk(KERN_ERR "compute_blocknr: map not correct\n"); 1590 printk(KERN_ERR "compute_blocknr: map not correct\n");
1406 return 0; 1591 return 0;
1407 } 1592 }
@@ -1468,14 +1653,16 @@ static void copy_data(int frombio, struct bio *bio,
1468 1653
1469static void compute_parity6(struct stripe_head *sh, int method) 1654static void compute_parity6(struct stripe_head *sh, int method)
1470{ 1655{
1471 raid6_conf_t *conf = sh->raid_conf; 1656 raid5_conf_t *conf = sh->raid_conf;
1472 int i, pd_idx = sh->pd_idx, qd_idx, d0_idx, disks = sh->disks, count; 1657 int i, pd_idx, qd_idx, d0_idx, disks = sh->disks, count;
1658 int syndrome_disks = sh->ddf_layout ? disks : (disks - 2);
1473 struct bio *chosen; 1659 struct bio *chosen;
1474 /**** FIX THIS: This could be very bad if disks is close to 256 ****/ 1660 /**** FIX THIS: This could be very bad if disks is close to 256 ****/
1475 void *ptrs[disks]; 1661 void *ptrs[syndrome_disks+2];
1476 1662
1477 qd_idx = raid6_next_disk(pd_idx, disks); 1663 pd_idx = sh->pd_idx;
1478 d0_idx = raid6_next_disk(qd_idx, disks); 1664 qd_idx = sh->qd_idx;
1665 d0_idx = raid6_d0(sh);
1479 1666
1480 pr_debug("compute_parity, stripe %llu, method %d\n", 1667 pr_debug("compute_parity, stripe %llu, method %d\n",
1481 (unsigned long long)sh->sector, method); 1668 (unsigned long long)sh->sector, method);
@@ -1513,24 +1700,29 @@ static void compute_parity6(struct stripe_head *sh, int method)
1513 set_bit(R5_UPTODATE, &sh->dev[i].flags); 1700 set_bit(R5_UPTODATE, &sh->dev[i].flags);
1514 } 1701 }
1515 1702
1516// switch(method) { 1703 /* Note that unlike RAID-5, the ordering of the disks matters greatly.*/
1517// case RECONSTRUCT_WRITE: 1704
1518// case CHECK_PARITY: 1705 for (i = 0; i < disks; i++)
1519// case UPDATE_PARITY: 1706 ptrs[i] = (void *)raid6_empty_zero_page;
1520 /* Note that unlike RAID-5, the ordering of the disks matters greatly. */ 1707
1521 /* FIX: Is this ordering of drives even remotely optimal? */ 1708 count = 0;
1522 count = 0; 1709 i = d0_idx;
1523 i = d0_idx; 1710 do {
1524 do { 1711 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
1525 ptrs[count++] = page_address(sh->dev[i].page); 1712
1526 if (count <= disks-2 && !test_bit(R5_UPTODATE, &sh->dev[i].flags)) 1713 ptrs[slot] = page_address(sh->dev[i].page);
1527 printk("block %d/%d not uptodate on parity calc\n", i,count); 1714 if (slot < syndrome_disks &&
1528 i = raid6_next_disk(i, disks); 1715 !test_bit(R5_UPTODATE, &sh->dev[i].flags)) {
1529 } while ( i != d0_idx ); 1716 printk(KERN_ERR "block %d/%d not uptodate "
1530// break; 1717 "on parity calc\n", i, count);
1531// } 1718 BUG();
1532 1719 }
1533 raid6_call.gen_syndrome(disks, STRIPE_SIZE, ptrs); 1720
1721 i = raid6_next_disk(i, disks);
1722 } while (i != d0_idx);
1723 BUG_ON(count != syndrome_disks);
1724
1725 raid6_call.gen_syndrome(syndrome_disks+2, STRIPE_SIZE, ptrs);
1534 1726
1535 switch(method) { 1727 switch(method) {
1536 case RECONSTRUCT_WRITE: 1728 case RECONSTRUCT_WRITE:
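
Annotation: the pointer table handed to gen_syndrome() must follow the slot order established by raid6_idx_to_slot(): data pages in slots 0 .. syndrome_disks-1, P at syndrome_disks, Q at syndrome_disks+1, and any slot that no device maps to (possible for DDF layouts) keeps pointing at raid6_empty_zero_page, so it contributes nothing to either parity. A toy userspace demonstration of that zero-contribution property for the XOR (P) half only; the Q half behaves the same way since a zero block also multiplies to zero in GF(256):

	#include <stdio.h>

	#define NBYTES 8

	int main(void)
	{
		/* Two real data blocks plus one unused slot left all-zero, the way
		 * a DDF layout leaves gaps pointing at raid6_empty_zero_page. */
		unsigned char d0[NBYTES] = "RAID6..", d1[NBYTES] = "blocks.";
		unsigned char zero[NBYTES] = { 0 }, p[NBYTES] = { 0 };
		const unsigned char *slots[3] = { d0, zero, d1 };

		for (int s = 0; s < 3; s++)
			for (int i = 0; i < NBYTES; i++)
				p[i] ^= slots[s][i];	/* XOR parity, the P component */

		/* The zero slot changes nothing: P equals d0 ^ d1. */
		for (int i = 0; i < NBYTES; i++)
			printf("%02x", p[i]);
		printf("\n");
		return 0;
	}
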
@@ -1552,8 +1744,7 @@ static void compute_block_1(struct stripe_head *sh, int dd_idx, int nozero)
1552{ 1744{
1553 int i, count, disks = sh->disks; 1745 int i, count, disks = sh->disks;
1554 void *ptr[MAX_XOR_BLOCKS], *dest, *p; 1746 void *ptr[MAX_XOR_BLOCKS], *dest, *p;
1555 int pd_idx = sh->pd_idx; 1747 int qd_idx = sh->qd_idx;
1556 int qd_idx = raid6_next_disk(pd_idx, disks);
1557 1748
1558 pr_debug("compute_block_1, stripe %llu, idx %d\n", 1749 pr_debug("compute_block_1, stripe %llu, idx %d\n",
1559 (unsigned long long)sh->sector, dd_idx); 1750 (unsigned long long)sh->sector, dd_idx);
@@ -1589,63 +1780,65 @@ static void compute_block_1(struct stripe_head *sh, int dd_idx, int nozero)
1589static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2) 1780static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2)
1590{ 1781{
1591 int i, count, disks = sh->disks; 1782 int i, count, disks = sh->disks;
1592 int pd_idx = sh->pd_idx; 1783 int syndrome_disks = sh->ddf_layout ? disks : disks-2;
1593 int qd_idx = raid6_next_disk(pd_idx, disks); 1784 int d0_idx = raid6_d0(sh);
1594 int d0_idx = raid6_next_disk(qd_idx, disks); 1785 int faila = -1, failb = -1;
1595 int faila, failb; 1786 /**** FIX THIS: This could be very bad if disks is close to 256 ****/
1787 void *ptrs[syndrome_disks+2];
1596 1788
1597 /* faila and failb are disk numbers relative to d0_idx */ 1789 for (i = 0; i < disks ; i++)
1598 /* pd_idx become disks-2 and qd_idx become disks-1 */ 1790 ptrs[i] = (void *)raid6_empty_zero_page;
1599 faila = (dd_idx1 < d0_idx) ? dd_idx1+(disks-d0_idx) : dd_idx1-d0_idx; 1791 count = 0;
1600 failb = (dd_idx2 < d0_idx) ? dd_idx2+(disks-d0_idx) : dd_idx2-d0_idx; 1792 i = d0_idx;
1793 do {
1794 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
1795
1796 ptrs[slot] = page_address(sh->dev[i].page);
1797
1798 if (i == dd_idx1)
1799 faila = slot;
1800 if (i == dd_idx2)
1801 failb = slot;
1802 i = raid6_next_disk(i, disks);
1803 } while (i != d0_idx);
1804 BUG_ON(count != syndrome_disks);
1601 1805
1602 BUG_ON(faila == failb); 1806 BUG_ON(faila == failb);
1603 if ( failb < faila ) { int tmp = faila; faila = failb; failb = tmp; } 1807 if ( failb < faila ) { int tmp = faila; faila = failb; failb = tmp; }
1604 1808
1605 pr_debug("compute_block_2, stripe %llu, idx %d,%d (%d,%d)\n", 1809 pr_debug("compute_block_2, stripe %llu, idx %d,%d (%d,%d)\n",
1606 (unsigned long long)sh->sector, dd_idx1, dd_idx2, faila, failb); 1810 (unsigned long long)sh->sector, dd_idx1, dd_idx2,
1811 faila, failb);
1607 1812
1608 if ( failb == disks-1 ) { 1813 if (failb == syndrome_disks+1) {
1609 /* Q disk is one of the missing disks */ 1814 /* Q disk is one of the missing disks */
1610 if ( faila == disks-2 ) { 1815 if (faila == syndrome_disks) {
1611 /* Missing P+Q, just recompute */ 1816 /* Missing P+Q, just recompute */
1612 compute_parity6(sh, UPDATE_PARITY); 1817 compute_parity6(sh, UPDATE_PARITY);
1613 return; 1818 return;
1614 } else { 1819 } else {
1615 /* We're missing D+Q; recompute D from P */ 1820 /* We're missing D+Q; recompute D from P */
1616 compute_block_1(sh, (dd_idx1 == qd_idx) ? dd_idx2 : dd_idx1, 0); 1821 compute_block_1(sh, ((dd_idx1 == sh->qd_idx) ?
1822 dd_idx2 : dd_idx1),
1823 0);
1617 compute_parity6(sh, UPDATE_PARITY); /* Is this necessary? */ 1824 compute_parity6(sh, UPDATE_PARITY); /* Is this necessary? */
1618 return; 1825 return;
1619 } 1826 }
1620 } 1827 }
1621 1828
1622 /* We're missing D+P or D+D; build pointer table */ 1829 /* We're missing D+P or D+D; */
1623 { 1830 if (failb == syndrome_disks) {
1624 /**** FIX THIS: This could be very bad if disks is close to 256 ****/ 1831 /* We're missing D+P. */
1625 void *ptrs[disks]; 1832 raid6_datap_recov(syndrome_disks+2, STRIPE_SIZE, faila, ptrs);
1626 1833 } else {
1627 count = 0; 1834 /* We're missing D+D. */
1628 i = d0_idx; 1835 raid6_2data_recov(syndrome_disks+2, STRIPE_SIZE, faila, failb,
1629 do { 1836 ptrs);
1630 ptrs[count++] = page_address(sh->dev[i].page);
1631 i = raid6_next_disk(i, disks);
1632 if (i != dd_idx1 && i != dd_idx2 &&
1633 !test_bit(R5_UPTODATE, &sh->dev[i].flags))
1634 printk("compute_2 with missing block %d/%d\n", count, i);
1635 } while ( i != d0_idx );
1636
1637 if ( failb == disks-2 ) {
1638 /* We're missing D+P. */
1639 raid6_datap_recov(disks, STRIPE_SIZE, faila, ptrs);
1640 } else {
1641 /* We're missing D+D. */
1642 raid6_2data_recov(disks, STRIPE_SIZE, faila, failb, ptrs);
1643 }
1644
1645 /* Both the above update both missing blocks */
1646 set_bit(R5_UPTODATE, &sh->dev[dd_idx1].flags);
1647 set_bit(R5_UPTODATE, &sh->dev[dd_idx2].flags);
1648 } 1837 }
1838
1839 /* Both the above update both missing blocks */
1840 set_bit(R5_UPTODATE, &sh->dev[dd_idx1].flags);
1841 set_bit(R5_UPTODATE, &sh->dev[dd_idx2].flags);
1649} 1842}
1650 1843
1651static void 1844static void
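
Annotation: compute_block_2() above picks a recovery strategy from which two slots failed. The table below is an illustrative standalone restatement of that dispatch (not the patch's code), using the same slot numbering with faila < failb, P at syndrome_disks and Q at syndrome_disks + 1:

	#include <stdio.h>

	static const char *recovery_path(int faila, int failb, int syndrome_disks)
	{
		if (failb == syndrome_disks + 1) {		/* Q is one of the failures */
			if (faila == syndrome_disks)
				return "P+Q missing: recompute both via compute_parity6()";
			return "D+Q missing: rebuild D from P, then recompute Q";
		}
		if (failb == syndrome_disks)
			return "D+P missing: raid6_datap_recov()";
		return "D+D missing: raid6_2data_recov()";
	}

	int main(void)
	{
		int n = 4;	/* syndrome_disks for a hypothetical 6-device stripe */

		printf("%s\n", recovery_path(4, 5, n));	/* P and Q both failed */
		printf("%s\n", recovery_path(1, 5, n));	/* one data disk and Q */
		printf("%s\n", recovery_path(2, 4, n));	/* one data disk and P */
		printf("%s\n", recovery_path(0, 3, n));	/* two data disks */
		return 0;
	}
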
@@ -1800,17 +1993,21 @@ static int page_is_zero(struct page *p)
1800 memcmp(a, a+4, STRIPE_SIZE-4)==0); 1993 memcmp(a, a+4, STRIPE_SIZE-4)==0);
1801} 1994}
1802 1995
1803static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int disks) 1996static void stripe_set_idx(sector_t stripe, raid5_conf_t *conf, int previous,
1997 struct stripe_head *sh)
1804{ 1998{
1805 int sectors_per_chunk = conf->chunk_size >> 9; 1999 int sectors_per_chunk =
1806 int pd_idx, dd_idx; 2000 previous ? (conf->prev_chunk >> 9)
2001 : (conf->chunk_size >> 9);
2002 int dd_idx;
1807 int chunk_offset = sector_div(stripe, sectors_per_chunk); 2003 int chunk_offset = sector_div(stripe, sectors_per_chunk);
2004 int disks = previous ? conf->previous_raid_disks : conf->raid_disks;
1808 2005
1809 raid5_compute_sector(stripe * (disks - conf->max_degraded) 2006 raid5_compute_sector(conf,
2007 stripe * (disks - conf->max_degraded)
1810 *sectors_per_chunk + chunk_offset, 2008 *sectors_per_chunk + chunk_offset,
1811 disks, disks - conf->max_degraded, 2009 previous,
1812 &dd_idx, &pd_idx, conf); 2010 &dd_idx, sh);
1813 return pd_idx;
1814} 2011}
1815 2012
1816static void 2013static void
@@ -2181,7 +2378,7 @@ static void handle_stripe_dirtying6(raid5_conf_t *conf,
2181 struct r6_state *r6s, int disks) 2378 struct r6_state *r6s, int disks)
2182{ 2379{
2183 int rcw = 0, must_compute = 0, pd_idx = sh->pd_idx, i; 2380 int rcw = 0, must_compute = 0, pd_idx = sh->pd_idx, i;
2184 int qd_idx = r6s->qd_idx; 2381 int qd_idx = sh->qd_idx;
2185 for (i = disks; i--; ) { 2382 for (i = disks; i--; ) {
2186 struct r5dev *dev = &sh->dev[i]; 2383 struct r5dev *dev = &sh->dev[i];
2187 /* Would I have to read this buffer for reconstruct_write */ 2384 /* Would I have to read this buffer for reconstruct_write */
@@ -2371,7 +2568,7 @@ static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh,
2371 int update_p = 0, update_q = 0; 2568 int update_p = 0, update_q = 0;
2372 struct r5dev *dev; 2569 struct r5dev *dev;
2373 int pd_idx = sh->pd_idx; 2570 int pd_idx = sh->pd_idx;
2374 int qd_idx = r6s->qd_idx; 2571 int qd_idx = sh->qd_idx;
2375 2572
2376 set_bit(STRIPE_HANDLE, &sh->state); 2573 set_bit(STRIPE_HANDLE, &sh->state);
2377 2574
@@ -2467,17 +2664,14 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh,
2467 struct dma_async_tx_descriptor *tx = NULL; 2664 struct dma_async_tx_descriptor *tx = NULL;
2468 clear_bit(STRIPE_EXPAND_SOURCE, &sh->state); 2665 clear_bit(STRIPE_EXPAND_SOURCE, &sh->state);
2469 for (i = 0; i < sh->disks; i++) 2666 for (i = 0; i < sh->disks; i++)
2470 if (i != sh->pd_idx && (!r6s || i != r6s->qd_idx)) { 2667 if (i != sh->pd_idx && i != sh->qd_idx) {
2471 int dd_idx, pd_idx, j; 2668 int dd_idx, j;
2472 struct stripe_head *sh2; 2669 struct stripe_head *sh2;
2473 2670
2474 sector_t bn = compute_blocknr(sh, i); 2671 sector_t bn = compute_blocknr(sh, i, 1);
2475 sector_t s = raid5_compute_sector(bn, conf->raid_disks, 2672 sector_t s = raid5_compute_sector(conf, bn, 0,
2476 conf->raid_disks - 2673 &dd_idx, NULL);
2477 conf->max_degraded, &dd_idx, 2674 sh2 = get_active_stripe(conf, s, 0, 1);
2478 &pd_idx, conf);
2479 sh2 = get_active_stripe(conf, s, conf->raid_disks,
2480 pd_idx, 1);
2481 if (sh2 == NULL) 2675 if (sh2 == NULL)
2482 /* so far only the early blocks of this stripe 2676 /* so far only the early blocks of this stripe
2483 * have been requested. When later blocks 2677 * have been requested. When later blocks
@@ -2500,8 +2694,7 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh,
2500 set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags); 2694 set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags);
2501 for (j = 0; j < conf->raid_disks; j++) 2695 for (j = 0; j < conf->raid_disks; j++)
2502 if (j != sh2->pd_idx && 2696 if (j != sh2->pd_idx &&
2503 (!r6s || j != raid6_next_disk(sh2->pd_idx, 2697 (!r6s || j != sh2->qd_idx) &&
2504 sh2->disks)) &&
2505 !test_bit(R5_Expanded, &sh2->dev[j].flags)) 2698 !test_bit(R5_Expanded, &sh2->dev[j].flags))
2506 break; 2699 break;
2507 if (j == conf->raid_disks) { 2700 if (j == conf->raid_disks) {
@@ -2750,6 +2943,23 @@ static bool handle_stripe5(struct stripe_head *sh)
2750 2943
2751 /* Finish reconstruct operations initiated by the expansion process */ 2944 /* Finish reconstruct operations initiated by the expansion process */
2752 if (sh->reconstruct_state == reconstruct_state_result) { 2945 if (sh->reconstruct_state == reconstruct_state_result) {
2946 struct stripe_head *sh2
2947 = get_active_stripe(conf, sh->sector, 1, 1);
2948 if (sh2 && test_bit(STRIPE_EXPAND_SOURCE, &sh2->state)) {
 2949 /* sh cannot be written until sh2 has been read,
 2950 * so arrange for sh to be delayed a little
2951 */
2952 set_bit(STRIPE_DELAYED, &sh->state);
2953 set_bit(STRIPE_HANDLE, &sh->state);
2954 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE,
2955 &sh2->state))
2956 atomic_inc(&conf->preread_active_stripes);
2957 release_stripe(sh2);
2958 goto unlock;
2959 }
2960 if (sh2)
2961 release_stripe(sh2);
2962
2753 sh->reconstruct_state = reconstruct_state_idle; 2963 sh->reconstruct_state = reconstruct_state_idle;
2754 clear_bit(STRIPE_EXPANDING, &sh->state); 2964 clear_bit(STRIPE_EXPANDING, &sh->state);
2755 for (i = conf->raid_disks; i--; ) { 2965 for (i = conf->raid_disks; i--; ) {
@@ -2763,8 +2973,7 @@ static bool handle_stripe5(struct stripe_head *sh)
2763 !sh->reconstruct_state) { 2973 !sh->reconstruct_state) {
2764 /* Need to write out all blocks after computing parity */ 2974 /* Need to write out all blocks after computing parity */
2765 sh->disks = conf->raid_disks; 2975 sh->disks = conf->raid_disks;
2766 sh->pd_idx = stripe_to_pdidx(sh->sector, conf, 2976 stripe_set_idx(sh->sector, conf, 0, sh);
2767 conf->raid_disks);
2768 schedule_reconstruction5(sh, &s, 1, 1); 2977 schedule_reconstruction5(sh, &s, 1, 1);
2769 } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) { 2978 } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) {
2770 clear_bit(STRIPE_EXPAND_READY, &sh->state); 2979 clear_bit(STRIPE_EXPAND_READY, &sh->state);
@@ -2796,20 +3005,19 @@ static bool handle_stripe5(struct stripe_head *sh)
2796 3005
2797static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page) 3006static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
2798{ 3007{
2799 raid6_conf_t *conf = sh->raid_conf; 3008 raid5_conf_t *conf = sh->raid_conf;
2800 int disks = sh->disks; 3009 int disks = sh->disks;
2801 struct bio *return_bi = NULL; 3010 struct bio *return_bi = NULL;
2802 int i, pd_idx = sh->pd_idx; 3011 int i, pd_idx = sh->pd_idx, qd_idx = sh->qd_idx;
2803 struct stripe_head_state s; 3012 struct stripe_head_state s;
2804 struct r6_state r6s; 3013 struct r6_state r6s;
2805 struct r5dev *dev, *pdev, *qdev; 3014 struct r5dev *dev, *pdev, *qdev;
2806 mdk_rdev_t *blocked_rdev = NULL; 3015 mdk_rdev_t *blocked_rdev = NULL;
2807 3016
2808 r6s.qd_idx = raid6_next_disk(pd_idx, disks);
2809 pr_debug("handling stripe %llu, state=%#lx cnt=%d, " 3017 pr_debug("handling stripe %llu, state=%#lx cnt=%d, "
2810 "pd_idx=%d, qd_idx=%d\n", 3018 "pd_idx=%d, qd_idx=%d\n",
2811 (unsigned long long)sh->sector, sh->state, 3019 (unsigned long long)sh->sector, sh->state,
2812 atomic_read(&sh->count), pd_idx, r6s.qd_idx); 3020 atomic_read(&sh->count), pd_idx, qd_idx);
2813 memset(&s, 0, sizeof(s)); 3021 memset(&s, 0, sizeof(s));
2814 3022
2815 spin_lock(&sh->lock); 3023 spin_lock(&sh->lock);
@@ -2920,9 +3128,9 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
2920 pdev = &sh->dev[pd_idx]; 3128 pdev = &sh->dev[pd_idx];
2921 r6s.p_failed = (s.failed >= 1 && r6s.failed_num[0] == pd_idx) 3129 r6s.p_failed = (s.failed >= 1 && r6s.failed_num[0] == pd_idx)
2922 || (s.failed >= 2 && r6s.failed_num[1] == pd_idx); 3130 || (s.failed >= 2 && r6s.failed_num[1] == pd_idx);
2923 qdev = &sh->dev[r6s.qd_idx]; 3131 qdev = &sh->dev[qd_idx];
2924 r6s.q_failed = (s.failed >= 1 && r6s.failed_num[0] == r6s.qd_idx) 3132 r6s.q_failed = (s.failed >= 1 && r6s.failed_num[0] == qd_idx)
2925 || (s.failed >= 2 && r6s.failed_num[1] == r6s.qd_idx); 3133 || (s.failed >= 2 && r6s.failed_num[1] == qd_idx);
2926 3134
2927 if ( s.written && 3135 if ( s.written &&
2928 ( r6s.p_failed || ((test_bit(R5_Insync, &pdev->flags) 3136 ( r6s.p_failed || ((test_bit(R5_Insync, &pdev->flags)
@@ -2980,10 +3188,26 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
2980 } 3188 }
2981 3189
2982 if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state)) { 3190 if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state)) {
3191 struct stripe_head *sh2
3192 = get_active_stripe(conf, sh->sector, 1, 1);
3193 if (sh2 && test_bit(STRIPE_EXPAND_SOURCE, &sh2->state)) {
 3194 /* sh cannot be written until sh2 has been read,
 3195 * so arrange for sh to be delayed a little
3196 */
3197 set_bit(STRIPE_DELAYED, &sh->state);
3198 set_bit(STRIPE_HANDLE, &sh->state);
3199 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE,
3200 &sh2->state))
3201 atomic_inc(&conf->preread_active_stripes);
3202 release_stripe(sh2);
3203 goto unlock;
3204 }
3205 if (sh2)
3206 release_stripe(sh2);
3207
2983 /* Need to write out all blocks after computing P&Q */ 3208 /* Need to write out all blocks after computing P&Q */
2984 sh->disks = conf->raid_disks; 3209 sh->disks = conf->raid_disks;
2985 sh->pd_idx = stripe_to_pdidx(sh->sector, conf, 3210 stripe_set_idx(sh->sector, conf, 0, sh);
2986 conf->raid_disks);
2987 compute_parity6(sh, RECONSTRUCT_WRITE); 3211 compute_parity6(sh, RECONSTRUCT_WRITE);
2988 for (i = conf->raid_disks ; i-- ; ) { 3212 for (i = conf->raid_disks ; i-- ; ) {
2989 set_bit(R5_LOCKED, &sh->dev[i].flags); 3213 set_bit(R5_LOCKED, &sh->dev[i].flags);
@@ -3134,6 +3358,8 @@ static int raid5_mergeable_bvec(struct request_queue *q,
3134 if ((bvm->bi_rw & 1) == WRITE) 3358 if ((bvm->bi_rw & 1) == WRITE)
3135 return biovec->bv_len; /* always allow writes to be mergeable */ 3359 return biovec->bv_len; /* always allow writes to be mergeable */
3136 3360
3361 if (mddev->new_chunk < mddev->chunk_size)
3362 chunk_sectors = mddev->new_chunk >> 9;
3137 max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9; 3363 max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9;
3138 if (max < 0) max = 0; 3364 if (max < 0) max = 0;
3139 if (max <= biovec->bv_len && bio_sectors == 0) 3365 if (max <= biovec->bv_len && bio_sectors == 0)
@@ -3149,6 +3375,8 @@ static int in_chunk_boundary(mddev_t *mddev, struct bio *bio)
3149 unsigned int chunk_sectors = mddev->chunk_size >> 9; 3375 unsigned int chunk_sectors = mddev->chunk_size >> 9;
3150 unsigned int bio_sectors = bio->bi_size >> 9; 3376 unsigned int bio_sectors = bio->bi_size >> 9;
3151 3377
3378 if (mddev->new_chunk < mddev->chunk_size)
3379 chunk_sectors = mddev->new_chunk >> 9;
3152 return chunk_sectors >= 3380 return chunk_sectors >=
3153 ((sector & (chunk_sectors - 1)) + bio_sectors); 3381 ((sector & (chunk_sectors - 1)) + bio_sectors);
3154} 3382}
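
Annotation: while a reshape that shrinks the chunk size is in flight, a merge or "aligned read" decision has to be safe for both the old and the new geometry, which is why the two hunks above clamp chunk_sectors to the smaller new_chunk. A userspace illustration of the boundary test with hypothetical numbers (old chunk 128 sectors, new chunk 64, a 96-sector bio starting at sector 128):

	#include <stdio.h>

	int main(void)
	{
		unsigned int old_chunk = 128, new_chunk = 64;	/* sectors, hypothetical */
		unsigned int sector = 128, bio_sectors = 96;
		unsigned int chunk = new_chunk < old_chunk ? new_chunk : old_chunk;

		/* in_chunk_boundary()'s test, with the clamped chunk size */
		if (chunk >= (sector & (chunk - 1)) + bio_sectors)
			printf("bio fits inside one chunk in both geometries\n");
		else
			printf("bio would straddle a new-geometry chunk boundary\n");
		/* With the old 128-sector chunk the bio fits; clamping to 64
		 * correctly rejects it because it crosses a new-chunk boundary. */
		return 0;
	}
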
@@ -3255,9 +3483,7 @@ static int chunk_aligned_read(struct request_queue *q, struct bio * raid_bio)
3255{ 3483{
3256 mddev_t *mddev = q->queuedata; 3484 mddev_t *mddev = q->queuedata;
3257 raid5_conf_t *conf = mddev_to_conf(mddev); 3485 raid5_conf_t *conf = mddev_to_conf(mddev);
3258 const unsigned int raid_disks = conf->raid_disks; 3486 unsigned int dd_idx;
3259 const unsigned int data_disks = raid_disks - conf->max_degraded;
3260 unsigned int dd_idx, pd_idx;
3261 struct bio* align_bi; 3487 struct bio* align_bi;
3262 mdk_rdev_t *rdev; 3488 mdk_rdev_t *rdev;
3263 3489
@@ -3266,7 +3492,7 @@ static int chunk_aligned_read(struct request_queue *q, struct bio * raid_bio)
3266 return 0; 3492 return 0;
3267 } 3493 }
3268 /* 3494 /*
3269 * use bio_clone to make a copy of the bio 3495 * use bio_clone to make a copy of the bio
3270 */ 3496 */
3271 align_bi = bio_clone(raid_bio, GFP_NOIO); 3497 align_bi = bio_clone(raid_bio, GFP_NOIO);
3272 if (!align_bi) 3498 if (!align_bi)
@@ -3280,12 +3506,9 @@ static int chunk_aligned_read(struct request_queue *q, struct bio * raid_bio)
3280 /* 3506 /*
3281 * compute position 3507 * compute position
3282 */ 3508 */
3283 align_bi->bi_sector = raid5_compute_sector(raid_bio->bi_sector, 3509 align_bi->bi_sector = raid5_compute_sector(conf, raid_bio->bi_sector,
3284 raid_disks, 3510 0,
3285 data_disks, 3511 &dd_idx, NULL);
3286 &dd_idx,
3287 &pd_idx,
3288 conf);
3289 3512
3290 rcu_read_lock(); 3513 rcu_read_lock();
3291 rdev = rcu_dereference(conf->disks[dd_idx].rdev); 3514 rdev = rcu_dereference(conf->disks[dd_idx].rdev);
@@ -3377,7 +3600,7 @@ static int make_request(struct request_queue *q, struct bio * bi)
3377{ 3600{
3378 mddev_t *mddev = q->queuedata; 3601 mddev_t *mddev = q->queuedata;
3379 raid5_conf_t *conf = mddev_to_conf(mddev); 3602 raid5_conf_t *conf = mddev_to_conf(mddev);
3380 unsigned int dd_idx, pd_idx; 3603 int dd_idx;
3381 sector_t new_sector; 3604 sector_t new_sector;
3382 sector_t logical_sector, last_sector; 3605 sector_t logical_sector, last_sector;
3383 struct stripe_head *sh; 3606 struct stripe_head *sh;
@@ -3400,7 +3623,7 @@ static int make_request(struct request_queue *q, struct bio * bi)
3400 if (rw == READ && 3623 if (rw == READ &&
3401 mddev->reshape_position == MaxSector && 3624 mddev->reshape_position == MaxSector &&
3402 chunk_aligned_read(q,bi)) 3625 chunk_aligned_read(q,bi))
3403 return 0; 3626 return 0;
3404 3627
3405 logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1); 3628 logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
3406 last_sector = bi->bi_sector + (bi->bi_size>>9); 3629 last_sector = bi->bi_sector + (bi->bi_size>>9);
@@ -3410,26 +3633,31 @@ static int make_request(struct request_queue *q, struct bio * bi)
3410 for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) { 3633 for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
3411 DEFINE_WAIT(w); 3634 DEFINE_WAIT(w);
3412 int disks, data_disks; 3635 int disks, data_disks;
3636 int previous;
3413 3637
3414 retry: 3638 retry:
3639 previous = 0;
3640 disks = conf->raid_disks;
3415 prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); 3641 prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
3416 if (likely(conf->expand_progress == MaxSector)) 3642 if (unlikely(conf->reshape_progress != MaxSector)) {
3417 disks = conf->raid_disks; 3643 /* spinlock is needed as reshape_progress may be
3418 else {
3419 /* spinlock is needed as expand_progress may be
3420 * 64bit on a 32bit platform, and so it might be 3644 * 64bit on a 32bit platform, and so it might be
3421 * possible to see a half-updated value 3645 * possible to see a half-updated value
3422 * Ofcourse expand_progress could change after 3646 * Ofcourse reshape_progress could change after
3423 * the lock is dropped, so once we get a reference 3647 * the lock is dropped, so once we get a reference
3424 * to the stripe that we think it is, we will have 3648 * to the stripe that we think it is, we will have
3425 * to check again. 3649 * to check again.
3426 */ 3650 */
3427 spin_lock_irq(&conf->device_lock); 3651 spin_lock_irq(&conf->device_lock);
3428 disks = conf->raid_disks; 3652 if (mddev->delta_disks < 0
3429 if (logical_sector >= conf->expand_progress) 3653 ? logical_sector < conf->reshape_progress
3654 : logical_sector >= conf->reshape_progress) {
3430 disks = conf->previous_raid_disks; 3655 disks = conf->previous_raid_disks;
3431 else { 3656 previous = 1;
3432 if (logical_sector >= conf->expand_lo) { 3657 } else {
3658 if (mddev->delta_disks < 0
3659 ? logical_sector < conf->reshape_safe
3660 : logical_sector >= conf->reshape_safe) {
3433 spin_unlock_irq(&conf->device_lock); 3661 spin_unlock_irq(&conf->device_lock);
3434 schedule(); 3662 schedule();
3435 goto retry; 3663 goto retry;
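
The rewritten loop above replaces the expand_progress test with a per-stripe geometry decision: the old (previous) disk count is used for sectors the reshape has not reached yet, and the comparison direction flips when the array is shrinking, because a shrinking reshape walks from the end of the array backwards. A sketch of just that decision, with locking and the reshape_safe retry path left out (MaxSector stands in for the md "no reshape" sentinel):

#include <stdint.h>

#define MaxSector (~(uint64_t)0)

/* Returns 1 if logical_sector must still be mapped with the pre-reshape
 * geometry (previous_raid_disks), 0 if the new geometry already applies.
 */
static int use_previous_geometry(uint64_t logical_sector,
                                 uint64_t reshape_progress,
                                 int delta_disks)
{
        if (reshape_progress == MaxSector)
                return 0;                          /* no reshape in flight */
        if (delta_disks < 0)                       /* shrinking: reshape walks backwards */
                return logical_sector < reshape_progress;
        return logical_sector >= reshape_progress; /* growing: not reshaped yet */
}
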
@@ -3439,15 +3667,17 @@ static int make_request(struct request_queue *q, struct bio * bi)
3439 } 3667 }
3440 data_disks = disks - conf->max_degraded; 3668 data_disks = disks - conf->max_degraded;
3441 3669
3442 new_sector = raid5_compute_sector(logical_sector, disks, data_disks, 3670 new_sector = raid5_compute_sector(conf, logical_sector,
3443 &dd_idx, &pd_idx, conf); 3671 previous,
3672 &dd_idx, NULL);
3444 pr_debug("raid5: make_request, sector %llu logical %llu\n", 3673 pr_debug("raid5: make_request, sector %llu logical %llu\n",
3445 (unsigned long long)new_sector, 3674 (unsigned long long)new_sector,
3446 (unsigned long long)logical_sector); 3675 (unsigned long long)logical_sector);
3447 3676
3448 sh = get_active_stripe(conf, new_sector, disks, pd_idx, (bi->bi_rw&RWA_MASK)); 3677 sh = get_active_stripe(conf, new_sector, previous,
3678 (bi->bi_rw&RWA_MASK));
3449 if (sh) { 3679 if (sh) {
3450 if (unlikely(conf->expand_progress != MaxSector)) { 3680 if (unlikely(previous)) {
3451 /* expansion might have moved on while waiting for a 3681 /* expansion might have moved on while waiting for a
3452 * stripe, so we must do the range check again. 3682 * stripe, so we must do the range check again.
3453 * Expansion could still move past after this 3683 * Expansion could still move past after this
@@ -3458,8 +3688,9 @@ static int make_request(struct request_queue *q, struct bio * bi)
3458 */ 3688 */
3459 int must_retry = 0; 3689 int must_retry = 0;
3460 spin_lock_irq(&conf->device_lock); 3690 spin_lock_irq(&conf->device_lock);
3461 if (logical_sector < conf->expand_progress && 3691 if (mddev->delta_disks < 0
3462 disks == conf->previous_raid_disks) 3692 ? logical_sector >= conf->reshape_progress
3693 : logical_sector < conf->reshape_progress)
3463 /* mismatch, need to try again */ 3694 /* mismatch, need to try again */
3464 must_retry = 1; 3695 must_retry = 1;
3465 spin_unlock_irq(&conf->device_lock); 3696 spin_unlock_irq(&conf->device_lock);
@@ -3514,6 +3745,8 @@ static int make_request(struct request_queue *q, struct bio * bi)
3514 return 0; 3745 return 0;
3515} 3746}
3516 3747
3748static sector_t raid5_size(mddev_t *mddev, sector_t sectors, int raid_disks);
3749
3517static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped) 3750static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped)
3518{ 3751{
3519 /* reshaping is quite different to recovery/resync so it is 3752 /* reshaping is quite different to recovery/resync so it is
@@ -3527,61 +3760,118 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
3527 */ 3760 */
3528 raid5_conf_t *conf = (raid5_conf_t *) mddev->private; 3761 raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
3529 struct stripe_head *sh; 3762 struct stripe_head *sh;
3530 int pd_idx;
3531 sector_t first_sector, last_sector; 3763 sector_t first_sector, last_sector;
3532 int raid_disks = conf->previous_raid_disks; 3764 int raid_disks = conf->previous_raid_disks;
3533 int data_disks = raid_disks - conf->max_degraded; 3765 int data_disks = raid_disks - conf->max_degraded;
3534 int new_data_disks = conf->raid_disks - conf->max_degraded; 3766 int new_data_disks = conf->raid_disks - conf->max_degraded;
3535 int i; 3767 int i;
3536 int dd_idx; 3768 int dd_idx;
3537 sector_t writepos, safepos, gap; 3769 sector_t writepos, readpos, safepos;
3538 3770 sector_t stripe_addr;
3539 if (sector_nr == 0 && 3771 int reshape_sectors;
3540 conf->expand_progress != 0) { 3772 struct list_head stripes;
3541 /* restarting in the middle, skip the initial sectors */ 3773
3542 sector_nr = conf->expand_progress; 3774 if (sector_nr == 0) {
3775 /* If restarting in the middle, skip the initial sectors */
3776 if (mddev->delta_disks < 0 &&
3777 conf->reshape_progress < raid5_size(mddev, 0, 0)) {
3778 sector_nr = raid5_size(mddev, 0, 0)
3779 - conf->reshape_progress;
3780 } else if (mddev->delta_disks > 0 &&
3781 conf->reshape_progress > 0)
3782 sector_nr = conf->reshape_progress;
3543 sector_div(sector_nr, new_data_disks); 3783 sector_div(sector_nr, new_data_disks);
3544 *skipped = 1; 3784 if (sector_nr) {
3545 return sector_nr; 3785 *skipped = 1;
3786 return sector_nr;
3787 }
3546 } 3788 }
3547 3789
3790 /* We need to process a full chunk at a time.
3791 * If old and new chunk sizes differ, we need to process the
3792 * largest of these
3793 */
3794 if (mddev->new_chunk > mddev->chunk_size)
3795 reshape_sectors = mddev->new_chunk / 512;
3796 else
3797 reshape_sectors = mddev->chunk_size / 512;
3798
3548 /* we update the metadata when there is more than 3Meg 3799 /* we update the metadata when there is more than 3Meg
3549 * in the block range (that is rather arbitrary, should 3800 * in the block range (that is rather arbitrary, should
3550 * probably be time based) or when the data about to be 3801 * probably be time based) or when the data about to be
3551 * copied would over-write the source of the data at 3802 * copied would over-write the source of the data at
3552 * the front of the range. 3803 * the front of the range.
3553 * i.e. one new_stripe forward from expand_progress new_maps 3804 * i.e. one new_stripe along from reshape_progress new_maps
3554 * to after where expand_lo old_maps to 3805 * to after where reshape_safe old_maps to
3555 */ 3806 */
3556 writepos = conf->expand_progress + 3807 writepos = conf->reshape_progress;
3557 conf->chunk_size/512*(new_data_disks);
3558 sector_div(writepos, new_data_disks); 3808 sector_div(writepos, new_data_disks);
3559 safepos = conf->expand_lo; 3809 readpos = conf->reshape_progress;
3810 sector_div(readpos, data_disks);
3811 safepos = conf->reshape_safe;
3560 sector_div(safepos, data_disks); 3812 sector_div(safepos, data_disks);
3561 gap = conf->expand_progress - conf->expand_lo; 3813 if (mddev->delta_disks < 0) {
3814 writepos -= reshape_sectors;
3815 readpos += reshape_sectors;
3816 safepos += reshape_sectors;
3817 } else {
3818 writepos += reshape_sectors;
3819 readpos -= reshape_sectors;
3820 safepos -= reshape_sectors;
3821 }
3562 3822
3563 if (writepos >= safepos || 3823 /* 'writepos' is the most advanced device address we might write.
3564 gap > (new_data_disks)*3000*2 /*3Meg*/) { 3824 * 'readpos' is the least advanced device address we might read.
3825 * 'safepos' is the least address recorded in the metadata as having
3826 * been reshaped.
3827 * If 'readpos' is behind 'writepos', then there is no way that we can
3828 * ensure safety in the face of a crash - that must be done by userspace
3829 * making a backup of the data. So in that case there is no particular
3830 * rush to update metadata.
3831 * Otherwise if 'safepos' is behind 'writepos', then we really need to
3832 * update the metadata to advance 'safepos' to match 'readpos' so that
3833 * we can be safe in the event of a crash.
3834 * So we insist on updating metadata if safepos is behind writepos and
3835 * readpos is beyond writepos.
3836 * In any case, update the metadata every 10 seconds.
3837 * Maybe that number should be configurable, but I'm not sure it is
3838 * worth it.... maybe it could be a multiple of safemode_delay???
3839 */
3840 if ((mddev->delta_disks < 0
3841 ? (safepos > writepos && readpos < writepos)
3842 : (safepos < writepos && readpos > writepos)) ||
3843 time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) {
3565 /* Cannot proceed until we've updated the superblock... */ 3844 /* Cannot proceed until we've updated the superblock... */
3566 wait_event(conf->wait_for_overlap, 3845 wait_event(conf->wait_for_overlap,
3567 atomic_read(&conf->reshape_stripes)==0); 3846 atomic_read(&conf->reshape_stripes)==0);
3568 mddev->reshape_position = conf->expand_progress; 3847 mddev->reshape_position = conf->reshape_progress;
3848 conf->reshape_checkpoint = jiffies;
3569 set_bit(MD_CHANGE_DEVS, &mddev->flags); 3849 set_bit(MD_CHANGE_DEVS, &mddev->flags);
3570 md_wakeup_thread(mddev->thread); 3850 md_wakeup_thread(mddev->thread);
3571 wait_event(mddev->sb_wait, mddev->flags == 0 || 3851 wait_event(mddev->sb_wait, mddev->flags == 0 ||
3572 kthread_should_stop()); 3852 kthread_should_stop());
3573 spin_lock_irq(&conf->device_lock); 3853 spin_lock_irq(&conf->device_lock);
3574 conf->expand_lo = mddev->reshape_position; 3854 conf->reshape_safe = mddev->reshape_position;
3575 spin_unlock_irq(&conf->device_lock); 3855 spin_unlock_irq(&conf->device_lock);
3576 wake_up(&conf->wait_for_overlap); 3856 wake_up(&conf->wait_for_overlap);
3577 } 3857 }
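
The block above is the new checkpoint policy for reshape_request(): writepos, readpos and safepos are the per-device addresses corresponding to the array addresses reshape_progress and reshape_safe, moved one reshape unit in the direction of travel, and the superblock is rewritten only when writes would outrun what the metadata records as safe (or, per the comment, every ten seconds). A standalone sketch of the arithmetic and the test, assuming 64-bit math so sector_div() can be written as plain division:

#include <stdint.h>

struct reshape_pos {
        uint64_t writepos;  /* most advanced device address we might write */
        uint64_t readpos;   /* least advanced device address we might read */
        uint64_t safepos;   /* least address the metadata records as reshaped */
};

static struct reshape_pos compute_positions(uint64_t reshape_progress,
                                            uint64_t reshape_safe,
                                            unsigned int data_disks,
                                            unsigned int new_data_disks,
                                            unsigned int reshape_sectors,
                                            int delta_disks)
{
        struct reshape_pos p;

        p.writepos = reshape_progress / new_data_disks;
        p.readpos  = reshape_progress / data_disks;
        p.safepos  = reshape_safe / data_disks;

        if (delta_disks < 0) {          /* shrinking: positions move towards 0 */
                p.writepos -= reshape_sectors;
                p.readpos  += reshape_sectors;
                p.safepos  += reshape_sectors;
        } else {                        /* growing */
                p.writepos += reshape_sectors;
                p.readpos  -= reshape_sectors;
                p.safepos  -= reshape_sectors;
        }
        return p;
}

/* Metadata must be updated before proceeding when the writes would get
 * ahead of what is recorded as safe while the reads have not already
 * overtaken the writes (the periodic 10-second term is omitted here).
 */
static int must_checkpoint(const struct reshape_pos *p, int delta_disks)
{
        if (delta_disks < 0)
                return p->safepos > p->writepos && p->readpos < p->writepos;
        return p->safepos < p->writepos && p->readpos > p->writepos;
}
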
3578 3858
3579 for (i=0; i < conf->chunk_size/512; i+= STRIPE_SECTORS) { 3859 if (mddev->delta_disks < 0) {
3860 BUG_ON(conf->reshape_progress == 0);
3861 stripe_addr = writepos;
3862 BUG_ON((mddev->dev_sectors &
3863 ~((sector_t)reshape_sectors - 1))
3864 - reshape_sectors - stripe_addr
3865 != sector_nr);
3866 } else {
3867 BUG_ON(writepos != sector_nr + reshape_sectors);
3868 stripe_addr = sector_nr;
3869 }
3870 INIT_LIST_HEAD(&stripes);
3871 for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) {
3580 int j; 3872 int j;
3581 int skipped = 0; 3873 int skipped = 0;
3582 pd_idx = stripe_to_pdidx(sector_nr+i, conf, conf->raid_disks); 3874 sh = get_active_stripe(conf, stripe_addr+i, 0, 0);
3583 sh = get_active_stripe(conf, sector_nr+i,
3584 conf->raid_disks, pd_idx, 0);
3585 set_bit(STRIPE_EXPANDING, &sh->state); 3875 set_bit(STRIPE_EXPANDING, &sh->state);
3586 atomic_inc(&conf->reshape_stripes); 3876 atomic_inc(&conf->reshape_stripes);
3587 /* If any of this stripe is beyond the end of the old 3877 /* If any of this stripe is beyond the end of the old
@@ -3592,10 +3882,10 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
3592 if (j == sh->pd_idx) 3882 if (j == sh->pd_idx)
3593 continue; 3883 continue;
3594 if (conf->level == 6 && 3884 if (conf->level == 6 &&
3595 j == raid6_next_disk(sh->pd_idx, sh->disks)) 3885 j == sh->qd_idx)
3596 continue; 3886 continue;
3597 s = compute_blocknr(sh, j); 3887 s = compute_blocknr(sh, j, 0);
3598 if (s < mddev->array_sectors) { 3888 if (s < raid5_size(mddev, 0, 0)) {
3599 skipped = 1; 3889 skipped = 1;
3600 continue; 3890 continue;
3601 } 3891 }
@@ -3607,10 +3897,13 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
3607 set_bit(STRIPE_EXPAND_READY, &sh->state); 3897 set_bit(STRIPE_EXPAND_READY, &sh->state);
3608 set_bit(STRIPE_HANDLE, &sh->state); 3898 set_bit(STRIPE_HANDLE, &sh->state);
3609 } 3899 }
3610 release_stripe(sh); 3900 list_add(&sh->lru, &stripes);
3611 } 3901 }
3612 spin_lock_irq(&conf->device_lock); 3902 spin_lock_irq(&conf->device_lock);
3613 conf->expand_progress = (sector_nr + i) * new_data_disks; 3903 if (mddev->delta_disks < 0)
3904 conf->reshape_progress -= reshape_sectors * new_data_disks;
3905 else
3906 conf->reshape_progress += reshape_sectors * new_data_disks;
3614 spin_unlock_irq(&conf->device_lock); 3907 spin_unlock_irq(&conf->device_lock);
3615 /* Ok, those stripe are ready. We can start scheduling 3908 /* Ok, those stripe are ready. We can start scheduling
3616 * reads on the source stripes. 3909 * reads on the source stripes.
@@ -3618,46 +3911,50 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
3618 * block on the destination stripes. 3911 * block on the destination stripes.
3619 */ 3912 */
3620 first_sector = 3913 first_sector =
3621 raid5_compute_sector(sector_nr*(new_data_disks), 3914 raid5_compute_sector(conf, stripe_addr*(new_data_disks),
3622 raid_disks, data_disks, 3915 1, &dd_idx, NULL);
3623 &dd_idx, &pd_idx, conf);
3624 last_sector = 3916 last_sector =
3625 raid5_compute_sector((sector_nr+conf->chunk_size/512) 3917 raid5_compute_sector(conf, ((stripe_addr+conf->chunk_size/512)
3626 *(new_data_disks) -1, 3918 *(new_data_disks) - 1),
3627 raid_disks, data_disks, 3919 1, &dd_idx, NULL);
3628 &dd_idx, &pd_idx, conf); 3920 if (last_sector >= mddev->dev_sectors)
3629 if (last_sector >= (mddev->size<<1)) 3921 last_sector = mddev->dev_sectors - 1;
3630 last_sector = (mddev->size<<1)-1;
3631 while (first_sector <= last_sector) { 3922 while (first_sector <= last_sector) {
3632 pd_idx = stripe_to_pdidx(first_sector, conf, 3923 sh = get_active_stripe(conf, first_sector, 1, 0);
3633 conf->previous_raid_disks);
3634 sh = get_active_stripe(conf, first_sector,
3635 conf->previous_raid_disks, pd_idx, 0);
3636 set_bit(STRIPE_EXPAND_SOURCE, &sh->state); 3924 set_bit(STRIPE_EXPAND_SOURCE, &sh->state);
3637 set_bit(STRIPE_HANDLE, &sh->state); 3925 set_bit(STRIPE_HANDLE, &sh->state);
3638 release_stripe(sh); 3926 release_stripe(sh);
3639 first_sector += STRIPE_SECTORS; 3927 first_sector += STRIPE_SECTORS;
3640 } 3928 }
3929 /* Now that the sources are clearly marked, we can release
3930 * the destination stripes
3931 */
3932 while (!list_empty(&stripes)) {
3933 sh = list_entry(stripes.next, struct stripe_head, lru);
3934 list_del_init(&sh->lru);
3935 release_stripe(sh);
3936 }
3641 /* If this takes us to the resync_max point where we have to pause, 3937 /* If this takes us to the resync_max point where we have to pause,
3642 * then we need to write out the superblock. 3938 * then we need to write out the superblock.
3643 */ 3939 */
3644 sector_nr += conf->chunk_size>>9; 3940 sector_nr += reshape_sectors;
3645 if (sector_nr >= mddev->resync_max) { 3941 if (sector_nr >= mddev->resync_max) {
3646 /* Cannot proceed until we've updated the superblock... */ 3942 /* Cannot proceed until we've updated the superblock... */
3647 wait_event(conf->wait_for_overlap, 3943 wait_event(conf->wait_for_overlap,
3648 atomic_read(&conf->reshape_stripes) == 0); 3944 atomic_read(&conf->reshape_stripes) == 0);
3649 mddev->reshape_position = conf->expand_progress; 3945 mddev->reshape_position = conf->reshape_progress;
3946 conf->reshape_checkpoint = jiffies;
3650 set_bit(MD_CHANGE_DEVS, &mddev->flags); 3947 set_bit(MD_CHANGE_DEVS, &mddev->flags);
3651 md_wakeup_thread(mddev->thread); 3948 md_wakeup_thread(mddev->thread);
3652 wait_event(mddev->sb_wait, 3949 wait_event(mddev->sb_wait,
3653 !test_bit(MD_CHANGE_DEVS, &mddev->flags) 3950 !test_bit(MD_CHANGE_DEVS, &mddev->flags)
3654 || kthread_should_stop()); 3951 || kthread_should_stop());
3655 spin_lock_irq(&conf->device_lock); 3952 spin_lock_irq(&conf->device_lock);
3656 conf->expand_lo = mddev->reshape_position; 3953 conf->reshape_safe = mddev->reshape_position;
3657 spin_unlock_irq(&conf->device_lock); 3954 spin_unlock_irq(&conf->device_lock);
3658 wake_up(&conf->wait_for_overlap); 3955 wake_up(&conf->wait_for_overlap);
3659 } 3956 }
3660 return conf->chunk_size>>9; 3957 return reshape_sectors;
3661} 3958}
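
reshape_request() now works in units of reshape_sectors, the larger of the old and new chunk expressed in sectors, so each pass moves a whole chunk of either geometry; the corresponding amount of array space is reshape_sectors * new_data_disks, which is how far reshape_progress moves per pass (forwards when growing, backwards when shrinking), and reshape_sectors is what the function returns. A small sketch of that bookkeeping, with hypothetical helper names:

#include <stdint.h>

/* One pass covers a full chunk of whichever geometry is larger. */
static unsigned int reshape_unit_sectors(unsigned int chunk_size,
                                         unsigned int new_chunk)
{
        return (new_chunk > chunk_size ? new_chunk : chunk_size) / 512;
}

/* Advance (or, when shrinking, retreat) the array-address progress marker
 * by the amount of array space one pass relocates.
 */
static void advance_reshape_progress(uint64_t *reshape_progress,
                                     unsigned int reshape_sectors,
                                     unsigned int new_data_disks,
                                     int delta_disks)
{
        uint64_t span = (uint64_t)reshape_sectors * new_data_disks;

        if (delta_disks < 0)
                *reshape_progress -= span;
        else
                *reshape_progress += span;
}
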
3662 3959
3663/* FIXME go_faster isn't used */ 3960/* FIXME go_faster isn't used */
@@ -3665,9 +3962,7 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski
3665{ 3962{
3666 raid5_conf_t *conf = (raid5_conf_t *) mddev->private; 3963 raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
3667 struct stripe_head *sh; 3964 struct stripe_head *sh;
3668 int pd_idx; 3965 sector_t max_sector = mddev->dev_sectors;
3669 int raid_disks = conf->raid_disks;
3670 sector_t max_sector = mddev->size << 1;
3671 int sync_blocks; 3966 int sync_blocks;
3672 int still_degraded = 0; 3967 int still_degraded = 0;
3673 int i; 3968 int i;
@@ -3675,6 +3970,7 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski
3675 if (sector_nr >= max_sector) { 3970 if (sector_nr >= max_sector) {
3676 /* just being told to finish up .. nothing much to do */ 3971 /* just being told to finish up .. nothing much to do */
3677 unplug_slaves(mddev); 3972 unplug_slaves(mddev);
3973
3678 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) { 3974 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
3679 end_reshape(conf); 3975 end_reshape(conf);
3680 return 0; 3976 return 0;
@@ -3705,7 +4001,7 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski
3705 */ 4001 */
3706 if (mddev->degraded >= conf->max_degraded && 4002 if (mddev->degraded >= conf->max_degraded &&
3707 test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 4003 test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
3708 sector_t rv = (mddev->size << 1) - sector_nr; 4004 sector_t rv = mddev->dev_sectors - sector_nr;
3709 *skipped = 1; 4005 *skipped = 1;
3710 return rv; 4006 return rv;
3711 } 4007 }
@@ -3721,10 +4017,9 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski
3721 4017
3722 bitmap_cond_end_sync(mddev->bitmap, sector_nr); 4018 bitmap_cond_end_sync(mddev->bitmap, sector_nr);
3723 4019
3724 pd_idx = stripe_to_pdidx(sector_nr, conf, raid_disks); 4020 sh = get_active_stripe(conf, sector_nr, 0, 1);
3725 sh = get_active_stripe(conf, sector_nr, raid_disks, pd_idx, 1);
3726 if (sh == NULL) { 4021 if (sh == NULL) {
3727 sh = get_active_stripe(conf, sector_nr, raid_disks, pd_idx, 0); 4022 sh = get_active_stripe(conf, sector_nr, 0, 0);
3728 /* make sure we don't swamp the stripe cache if someone else 4023 /* make sure we don't swamp the stripe cache if someone else
3729 * is trying to get access 4024 * is trying to get access
3730 */ 4025 */
@@ -3766,19 +4061,15 @@ static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio)
3766 * it will be only one 'dd_idx' and only need one call to raid5_compute_sector. 4061 * it will be only one 'dd_idx' and only need one call to raid5_compute_sector.
3767 */ 4062 */
3768 struct stripe_head *sh; 4063 struct stripe_head *sh;
3769 int dd_idx, pd_idx; 4064 int dd_idx;
3770 sector_t sector, logical_sector, last_sector; 4065 sector_t sector, logical_sector, last_sector;
3771 int scnt = 0; 4066 int scnt = 0;
3772 int remaining; 4067 int remaining;
3773 int handled = 0; 4068 int handled = 0;
3774 4069
3775 logical_sector = raid_bio->bi_sector & ~((sector_t)STRIPE_SECTORS-1); 4070 logical_sector = raid_bio->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
3776 sector = raid5_compute_sector( logical_sector, 4071 sector = raid5_compute_sector(conf, logical_sector,
3777 conf->raid_disks, 4072 0, &dd_idx, NULL);
3778 conf->raid_disks - conf->max_degraded,
3779 &dd_idx,
3780 &pd_idx,
3781 conf);
3782 last_sector = raid_bio->bi_sector + (raid_bio->bi_size>>9); 4073 last_sector = raid_bio->bi_sector + (raid_bio->bi_size>>9);
3783 4074
3784 for (; logical_sector < last_sector; 4075 for (; logical_sector < last_sector;
@@ -3790,7 +4081,7 @@ static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio)
3790 /* already done this stripe */ 4081 /* already done this stripe */
3791 continue; 4082 continue;
3792 4083
3793 sh = get_active_stripe(conf, sector, conf->raid_disks, pd_idx, 1); 4084 sh = get_active_stripe(conf, sector, 0, 1);
3794 4085
3795 if (!sh) { 4086 if (!sh) {
3796 /* failed to get a stripe - must wait */ 4087 /* failed to get a stripe - must wait */
@@ -3992,89 +4283,69 @@ static struct attribute_group raid5_attrs_group = {
3992 .attrs = raid5_attrs, 4283 .attrs = raid5_attrs,
3993}; 4284};
3994 4285
3995static int run(mddev_t *mddev) 4286static sector_t
4287raid5_size(mddev_t *mddev, sector_t sectors, int raid_disks)
4288{
4289 raid5_conf_t *conf = mddev_to_conf(mddev);
4290
4291 if (!sectors)
4292 sectors = mddev->dev_sectors;
4293 if (!raid_disks) {
4294 /* size is defined by the smallest of previous and new size */
4295 if (conf->raid_disks < conf->previous_raid_disks)
4296 raid_disks = conf->raid_disks;
4297 else
4298 raid_disks = conf->previous_raid_disks;
4299 }
4300
4301 sectors &= ~((sector_t)mddev->chunk_size/512 - 1);
4302 sectors &= ~((sector_t)mddev->new_chunk/512 - 1);
4303 return sectors * (raid_disks - conf->max_degraded);
4304}
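
raid5_size() above becomes the single place that computes the exported array size: the per-device size is rounded down to a whole number of chunks in both the old and the new geometry, then multiplied by the number of data disks, defaulting to the smaller of the previous and current disk counts so a half-finished reshape never over-reports. A userspace sketch of the same arithmetic (illustrative names; sizes in 512-byte sectors, chunk sizes in bytes):

#include <stdint.h>

static uint64_t raid5_array_sectors(uint64_t dev_sectors,
                                    int raid_disks, int previous_raid_disks,
                                    int max_degraded,
                                    unsigned int chunk_size,
                                    unsigned int new_chunk)
{
        /* size is defined by the smaller of the previous and new disk counts */
        int disks = raid_disks < previous_raid_disks ? raid_disks
                                                     : previous_raid_disks;

        /* per-device size must be a whole number of chunks in both geometries */
        dev_sectors &= ~((uint64_t)chunk_size / 512 - 1);
        dev_sectors &= ~((uint64_t)new_chunk / 512 - 1);

        return dev_sectors * (disks - max_degraded);
}
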
4305
4306static raid5_conf_t *setup_conf(mddev_t *mddev)
3996{ 4307{
3997 raid5_conf_t *conf; 4308 raid5_conf_t *conf;
3998 int raid_disk, memory; 4309 int raid_disk, memory;
3999 mdk_rdev_t *rdev; 4310 mdk_rdev_t *rdev;
4000 struct disk_info *disk; 4311 struct disk_info *disk;
4001 int working_disks = 0;
4002 4312
4003 if (mddev->level != 5 && mddev->level != 4 && mddev->level != 6) { 4313 if (mddev->new_level != 5
4314 && mddev->new_level != 4
4315 && mddev->new_level != 6) {
4004 printk(KERN_ERR "raid5: %s: raid level not set to 4/5/6 (%d)\n", 4316 printk(KERN_ERR "raid5: %s: raid level not set to 4/5/6 (%d)\n",
4005 mdname(mddev), mddev->level); 4317 mdname(mddev), mddev->new_level);
4006 return -EIO; 4318 return ERR_PTR(-EIO);
4007 } 4319 }
4008 4320 if ((mddev->new_level == 5
4009 if (mddev->chunk_size < PAGE_SIZE) { 4321 && !algorithm_valid_raid5(mddev->new_layout)) ||
4010 printk(KERN_ERR "md/raid5: chunk_size must be at least " 4322 (mddev->new_level == 6
4011 "PAGE_SIZE but %d < %ld\n", 4323 && !algorithm_valid_raid6(mddev->new_layout))) {
4012 mddev->chunk_size, PAGE_SIZE); 4324 printk(KERN_ERR "raid5: %s: layout %d not supported\n",
4013 return -EINVAL; 4325 mdname(mddev), mddev->new_layout);
4326 return ERR_PTR(-EIO);
4014 } 4327 }
4015 4328 if (mddev->new_level == 6 && mddev->raid_disks < 4) {
4016 if (mddev->reshape_position != MaxSector) { 4329 printk(KERN_ERR "raid6: not enough configured devices for %s (%d, minimum 4)\n",
4017 /* Check that we can continue the reshape. 4330 mdname(mddev), mddev->raid_disks);
4018 * Currently only disks can change, it must 4331 return ERR_PTR(-EINVAL);
4019 * increase, and we must be past the point where
4020 * a stripe over-writes itself
4021 */
4022 sector_t here_new, here_old;
4023 int old_disks;
4024 int max_degraded = (mddev->level == 5 ? 1 : 2);
4025
4026 if (mddev->new_level != mddev->level ||
4027 mddev->new_layout != mddev->layout ||
4028 mddev->new_chunk != mddev->chunk_size) {
4029 printk(KERN_ERR "raid5: %s: unsupported reshape "
4030 "required - aborting.\n",
4031 mdname(mddev));
4032 return -EINVAL;
4033 }
4034 if (mddev->delta_disks <= 0) {
4035 printk(KERN_ERR "raid5: %s: unsupported reshape "
4036 "(reduce disks) required - aborting.\n",
4037 mdname(mddev));
4038 return -EINVAL;
4039 }
4040 old_disks = mddev->raid_disks - mddev->delta_disks;
4041 /* reshape_position must be on a new-stripe boundary, and one
4042 * further up in new geometry must map after here in old
4043 * geometry.
4044 */
4045 here_new = mddev->reshape_position;
4046 if (sector_div(here_new, (mddev->chunk_size>>9)*
4047 (mddev->raid_disks - max_degraded))) {
4048 printk(KERN_ERR "raid5: reshape_position not "
4049 "on a stripe boundary\n");
4050 return -EINVAL;
4051 }
4052 /* here_new is the stripe we will write to */
4053 here_old = mddev->reshape_position;
4054 sector_div(here_old, (mddev->chunk_size>>9)*
4055 (old_disks-max_degraded));
4056 /* here_old is the first stripe that we might need to read
4057 * from */
4058 if (here_new >= here_old) {
4059 /* Reading from the same stripe as writing to - bad */
4060 printk(KERN_ERR "raid5: reshape_position too early for "
4061 "auto-recovery - aborting.\n");
4062 return -EINVAL;
4063 }
4064 printk(KERN_INFO "raid5: reshape will continue\n");
4065 /* OK, we should be able to continue; */
4066 } 4332 }
4067 4333
4334 if (!mddev->new_chunk || mddev->new_chunk % PAGE_SIZE) {
4335 printk(KERN_ERR "raid5: invalid chunk size %d for %s\n",
4336 mddev->new_chunk, mdname(mddev));
4337 return ERR_PTR(-EINVAL);
4338 }
4068 4339
4069 mddev->private = kzalloc(sizeof (raid5_conf_t), GFP_KERNEL); 4340 conf = kzalloc(sizeof(raid5_conf_t), GFP_KERNEL);
4070 if ((conf = mddev->private) == NULL) 4341 if (conf == NULL)
4071 goto abort; 4342 goto abort;
4072 if (mddev->reshape_position == MaxSector) { 4343
4073 conf->previous_raid_disks = conf->raid_disks = mddev->raid_disks; 4344 conf->raid_disks = mddev->raid_disks;
4074 } else { 4345 if (mddev->reshape_position == MaxSector)
4075 conf->raid_disks = mddev->raid_disks; 4346 conf->previous_raid_disks = mddev->raid_disks;
4347 else
4076 conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks; 4348 conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks;
4077 }
4078 4349
4079 conf->disks = kzalloc(conf->raid_disks * sizeof(struct disk_info), 4350 conf->disks = kzalloc(conf->raid_disks * sizeof(struct disk_info),
4080 GFP_KERNEL); 4351 GFP_KERNEL);
@@ -4086,13 +4357,12 @@ static int run(mddev_t *mddev)
4086 if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL) 4357 if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL)
4087 goto abort; 4358 goto abort;
4088 4359
4089 if (mddev->level == 6) { 4360 if (mddev->new_level == 6) {
4090 conf->spare_page = alloc_page(GFP_KERNEL); 4361 conf->spare_page = alloc_page(GFP_KERNEL);
4091 if (!conf->spare_page) 4362 if (!conf->spare_page)
4092 goto abort; 4363 goto abort;
4093 } 4364 }
4094 spin_lock_init(&conf->device_lock); 4365 spin_lock_init(&conf->device_lock);
4095 mddev->queue->queue_lock = &conf->device_lock;
4096 init_waitqueue_head(&conf->wait_for_stripe); 4366 init_waitqueue_head(&conf->wait_for_stripe);
4097 init_waitqueue_head(&conf->wait_for_overlap); 4367 init_waitqueue_head(&conf->wait_for_overlap);
4098 INIT_LIST_HEAD(&conf->handle_list); 4368 INIT_LIST_HEAD(&conf->handle_list);
@@ -4121,47 +4391,134 @@ static int run(mddev_t *mddev)
4121 printk(KERN_INFO "raid5: device %s operational as raid" 4391 printk(KERN_INFO "raid5: device %s operational as raid"
4122 " disk %d\n", bdevname(rdev->bdev,b), 4392 " disk %d\n", bdevname(rdev->bdev,b),
4123 raid_disk); 4393 raid_disk);
4124 working_disks++;
4125 } else 4394 } else
4126 /* Cannot rely on bitmap to complete recovery */ 4395 /* Cannot rely on bitmap to complete recovery */
4127 conf->fullsync = 1; 4396 conf->fullsync = 1;
4128 } 4397 }
4129 4398
4130 /* 4399 conf->chunk_size = mddev->new_chunk;
4131 * 0 for a fully functional array, 1 or 2 for a degraded array. 4400 conf->level = mddev->new_level;
4132 */
4133 mddev->degraded = conf->raid_disks - working_disks;
4134 conf->mddev = mddev;
4135 conf->chunk_size = mddev->chunk_size;
4136 conf->level = mddev->level;
4137 if (conf->level == 6) 4401 if (conf->level == 6)
4138 conf->max_degraded = 2; 4402 conf->max_degraded = 2;
4139 else 4403 else
4140 conf->max_degraded = 1; 4404 conf->max_degraded = 1;
4141 conf->algorithm = mddev->layout; 4405 conf->algorithm = mddev->new_layout;
4142 conf->max_nr_stripes = NR_STRIPES; 4406 conf->max_nr_stripes = NR_STRIPES;
4143 conf->expand_progress = mddev->reshape_position; 4407 conf->reshape_progress = mddev->reshape_position;
4144 4408 if (conf->reshape_progress != MaxSector) {
4145 /* device size must be a multiple of chunk size */ 4409 conf->prev_chunk = mddev->chunk_size;
4146 mddev->size &= ~(mddev->chunk_size/1024 -1); 4410 conf->prev_algo = mddev->layout;
4147 mddev->resync_max_sectors = mddev->size << 1; 4411 }
4148 4412
4149 if (conf->level == 6 && conf->raid_disks < 4) { 4413 memory = conf->max_nr_stripes * (sizeof(struct stripe_head) +
4150 printk(KERN_ERR "raid6: not enough configured devices for %s (%d, minimum 4)\n", 4414 conf->raid_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024;
4151 mdname(mddev), conf->raid_disks); 4415 if (grow_stripes(conf, conf->max_nr_stripes)) {
4416 printk(KERN_ERR
4417 "raid5: couldn't allocate %dkB for buffers\n", memory);
4152 goto abort; 4418 goto abort;
4153 } 4419 } else
4154 if (!conf->chunk_size || conf->chunk_size % 4) { 4420 printk(KERN_INFO "raid5: allocated %dkB for %s\n",
4155 printk(KERN_ERR "raid5: invalid chunk size %d for %s\n", 4421 memory, mdname(mddev));
4156 conf->chunk_size, mdname(mddev)); 4422
4423 conf->thread = md_register_thread(raid5d, mddev, "%s_raid5");
4424 if (!conf->thread) {
4425 printk(KERN_ERR
4426 "raid5: couldn't allocate thread for %s\n",
4427 mdname(mddev));
4157 goto abort; 4428 goto abort;
4158 } 4429 }
4159 if (conf->algorithm > ALGORITHM_RIGHT_SYMMETRIC) { 4430
4160 printk(KERN_ERR 4431 return conf;
4161 "raid5: unsupported parity algorithm %d for %s\n", 4432
4162 conf->algorithm, mdname(mddev)); 4433 abort:
4163 goto abort; 4434 if (conf) {
4435 shrink_stripes(conf);
4436 safe_put_page(conf->spare_page);
4437 kfree(conf->disks);
4438 kfree(conf->stripe_hashtbl);
4439 kfree(conf);
4440 return ERR_PTR(-EIO);
4441 } else
4442 return ERR_PTR(-ENOMEM);
4443}
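
setup_conf() returns either a valid raid5_conf_t or an errno encoded with ERR_PTR(), and run() below unpacks it with IS_ERR()/PTR_ERR(); those helpers come from the kernel's <linux/err.h>. A userspace mimic of the convention, only to illustrate the calling pattern run() relies on (the lower-case stand-in helpers and the toy setup() are not kernel code):

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

/* userspace stand-ins for the kernel's ERR_PTR()/PTR_ERR()/IS_ERR() */
static inline void *err_ptr(long err)      { return (void *)err; }
static inline long  ptr_err(const void *p) { return (long)p; }
static inline int   is_err(const void *p)
{
        return (unsigned long)p >= (unsigned long)-4095L;
}

struct conf { int raid_disks; };

/* toy stand-in for setup_conf(): pointer on success, encoded errno on failure */
static struct conf *setup(int level)
{
        struct conf *c;

        if (level != 4 && level != 5 && level != 6)
                return err_ptr(-EIO);
        c = malloc(sizeof(*c));
        if (!c)
                return err_ptr(-ENOMEM);
        c->raid_disks = 4;
        return c;
}

int main(void)
{
        struct conf *c = setup(3);

        if (is_err(c)) {                /* mirrors run()'s IS_ERR(conf) check */
                printf("setup failed: %ld\n", ptr_err(c));
                return 1;
        }
        free(c);
        return 0;
}
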
4444
4445static int run(mddev_t *mddev)
4446{
4447 raid5_conf_t *conf;
4448 int working_disks = 0;
4449 mdk_rdev_t *rdev;
4450
4451 if (mddev->reshape_position != MaxSector) {
4452 /* Check that we can continue the reshape.
4453 * Currently only disks can change, it must
4454 * increase, and we must be past the point where
4455 * a stripe over-writes itself
4456 */
4457 sector_t here_new, here_old;
4458 int old_disks;
4459 int max_degraded = (mddev->level == 6 ? 2 : 1);
4460
4461 if (mddev->new_level != mddev->level) {
4462 printk(KERN_ERR "raid5: %s: unsupported reshape "
4463 "required - aborting.\n",
4464 mdname(mddev));
4465 return -EINVAL;
4466 }
4467 old_disks = mddev->raid_disks - mddev->delta_disks;
4468 /* reshape_position must be on a new-stripe boundary, and one
4469 * further up in new geometry must map after here in old
4470 * geometry.
4471 */
4472 here_new = mddev->reshape_position;
4473 if (sector_div(here_new, (mddev->new_chunk>>9)*
4474 (mddev->raid_disks - max_degraded))) {
4475 printk(KERN_ERR "raid5: reshape_position not "
4476 "on a stripe boundary\n");
4477 return -EINVAL;
4478 }
4479 /* here_new is the stripe we will write to */
4480 here_old = mddev->reshape_position;
4481 sector_div(here_old, (mddev->chunk_size>>9)*
4482 (old_disks-max_degraded));
4483 /* here_old is the first stripe that we might need to read
4484 * from */
4485 if (here_new >= here_old) {
4486 /* Reading from the same stripe as writing to - bad */
4487 printk(KERN_ERR "raid5: reshape_position too early for "
4488 "auto-recovery - aborting.\n");
4489 return -EINVAL;
4490 }
4491 printk(KERN_INFO "raid5: reshape will continue\n");
4492 /* OK, we should be able to continue; */
4493 } else {
4494 BUG_ON(mddev->level != mddev->new_level);
4495 BUG_ON(mddev->layout != mddev->new_layout);
4496 BUG_ON(mddev->chunk_size != mddev->new_chunk);
4497 BUG_ON(mddev->delta_disks != 0);
4164 } 4498 }
4499
4500 if (mddev->private == NULL)
4501 conf = setup_conf(mddev);
4502 else
4503 conf = mddev->private;
4504
4505 if (IS_ERR(conf))
4506 return PTR_ERR(conf);
4507
4508 mddev->thread = conf->thread;
4509 conf->thread = NULL;
4510 mddev->private = conf;
4511
4512 /*
4513 * 0 for a fully functional array, 1 or 2 for a degraded array.
4514 */
4515 list_for_each_entry(rdev, &mddev->disks, same_set)
4516 if (rdev->raid_disk >= 0 &&
4517 test_bit(In_sync, &rdev->flags))
4518 working_disks++;
4519
4520 mddev->degraded = conf->raid_disks - working_disks;
4521
4165 if (mddev->degraded > conf->max_degraded) { 4522 if (mddev->degraded > conf->max_degraded) {
4166 printk(KERN_ERR "raid5: not enough operational devices for %s" 4523 printk(KERN_ERR "raid5: not enough operational devices for %s"
4167 " (%d/%d failed)\n", 4524 " (%d/%d failed)\n",
@@ -4169,6 +4526,10 @@ static int run(mddev_t *mddev)
4169 goto abort; 4526 goto abort;
4170 } 4527 }
4171 4528
4529 /* device size must be a multiple of chunk size */
4530 mddev->dev_sectors &= ~(mddev->chunk_size / 512 - 1);
4531 mddev->resync_max_sectors = mddev->dev_sectors;
4532
4172 if (mddev->degraded > 0 && 4533 if (mddev->degraded > 0 &&
4173 mddev->recovery_cp != MaxSector) { 4534 mddev->recovery_cp != MaxSector) {
4174 if (mddev->ok_start_degraded) 4535 if (mddev->ok_start_degraded)
@@ -4184,43 +4545,22 @@ static int run(mddev_t *mddev)
4184 } 4545 }
4185 } 4546 }
4186 4547
4187 {
4188 mddev->thread = md_register_thread(raid5d, mddev, "%s_raid5");
4189 if (!mddev->thread) {
4190 printk(KERN_ERR
4191 "raid5: couldn't allocate thread for %s\n",
4192 mdname(mddev));
4193 goto abort;
4194 }
4195 }
4196 memory = conf->max_nr_stripes * (sizeof(struct stripe_head) +
4197 conf->raid_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024;
4198 if (grow_stripes(conf, conf->max_nr_stripes)) {
4199 printk(KERN_ERR
4200 "raid5: couldn't allocate %dkB for buffers\n", memory);
4201 shrink_stripes(conf);
4202 md_unregister_thread(mddev->thread);
4203 goto abort;
4204 } else
4205 printk(KERN_INFO "raid5: allocated %dkB for %s\n",
4206 memory, mdname(mddev));
4207
4208 if (mddev->degraded == 0) 4548 if (mddev->degraded == 0)
4209 printk("raid5: raid level %d set %s active with %d out of %d" 4549 printk("raid5: raid level %d set %s active with %d out of %d"
4210 " devices, algorithm %d\n", conf->level, mdname(mddev), 4550 " devices, algorithm %d\n", conf->level, mdname(mddev),
4211 mddev->raid_disks-mddev->degraded, mddev->raid_disks, 4551 mddev->raid_disks-mddev->degraded, mddev->raid_disks,
4212 conf->algorithm); 4552 mddev->new_layout);
4213 else 4553 else
4214 printk(KERN_ALERT "raid5: raid level %d set %s active with %d" 4554 printk(KERN_ALERT "raid5: raid level %d set %s active with %d"
4215 " out of %d devices, algorithm %d\n", conf->level, 4555 " out of %d devices, algorithm %d\n", conf->level,
4216 mdname(mddev), mddev->raid_disks - mddev->degraded, 4556 mdname(mddev), mddev->raid_disks - mddev->degraded,
4217 mddev->raid_disks, conf->algorithm); 4557 mddev->raid_disks, mddev->new_layout);
4218 4558
4219 print_raid5_conf(conf); 4559 print_raid5_conf(conf);
4220 4560
4221 if (conf->expand_progress != MaxSector) { 4561 if (conf->reshape_progress != MaxSector) {
4222 printk("...ok start reshape thread\n"); 4562 printk("...ok start reshape thread\n");
4223 conf->expand_lo = conf->expand_progress; 4563 conf->reshape_safe = conf->reshape_progress;
4224 atomic_set(&conf->reshape_stripes, 0); 4564 atomic_set(&conf->reshape_stripes, 0);
4225 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 4565 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
4226 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 4566 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
@@ -4247,18 +4587,22 @@ static int run(mddev_t *mddev)
4247 "raid5: failed to create sysfs attributes for %s\n", 4587 "raid5: failed to create sysfs attributes for %s\n",
4248 mdname(mddev)); 4588 mdname(mddev));
4249 4589
4590 mddev->queue->queue_lock = &conf->device_lock;
4591
4250 mddev->queue->unplug_fn = raid5_unplug_device; 4592 mddev->queue->unplug_fn = raid5_unplug_device;
4251 mddev->queue->backing_dev_info.congested_data = mddev; 4593 mddev->queue->backing_dev_info.congested_data = mddev;
4252 mddev->queue->backing_dev_info.congested_fn = raid5_congested; 4594 mddev->queue->backing_dev_info.congested_fn = raid5_congested;
4253 4595
4254 mddev->array_sectors = 2 * mddev->size * (conf->previous_raid_disks - 4596 md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));
4255 conf->max_degraded);
4256 4597
4257 blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec); 4598 blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec);
4258 4599
4259 return 0; 4600 return 0;
4260abort: 4601abort:
4602 md_unregister_thread(mddev->thread);
4603 mddev->thread = NULL;
4261 if (conf) { 4604 if (conf) {
4605 shrink_stripes(conf);
4262 print_raid5_conf(conf); 4606 print_raid5_conf(conf);
4263 safe_put_page(conf->spare_page); 4607 safe_put_page(conf->spare_page);
4264 kfree(conf->disks); 4608 kfree(conf->disks);
@@ -4396,6 +4740,10 @@ static int raid5_remove_disk(mddev_t *mddev, int number)
4396 print_raid5_conf(conf); 4740 print_raid5_conf(conf);
4397 rdev = p->rdev; 4741 rdev = p->rdev;
4398 if (rdev) { 4742 if (rdev) {
4743 if (number >= conf->raid_disks &&
4744 conf->reshape_progress == MaxSector)
4745 clear_bit(In_sync, &rdev->flags);
4746
4399 if (test_bit(In_sync, &rdev->flags) || 4747 if (test_bit(In_sync, &rdev->flags) ||
4400 atomic_read(&rdev->nr_pending)) { 4748 atomic_read(&rdev->nr_pending)) {
4401 err = -EBUSY; 4749 err = -EBUSY;
@@ -4405,7 +4753,8 @@ static int raid5_remove_disk(mddev_t *mddev, int number)
4405 * isn't possible. 4753 * isn't possible.
4406 */ 4754 */
4407 if (!test_bit(Faulty, &rdev->flags) && 4755 if (!test_bit(Faulty, &rdev->flags) &&
4408 mddev->degraded <= conf->max_degraded) { 4756 mddev->degraded <= conf->max_degraded &&
4757 number < conf->raid_disks) {
4409 err = -EBUSY; 4758 err = -EBUSY;
4410 goto abort; 4759 goto abort;
4411 } 4760 }
@@ -4472,36 +4821,48 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors)
4472 * any io in the removed space completes, but it hardly seems 4821 * any io in the removed space completes, but it hardly seems
4473 * worth it. 4822 * worth it.
4474 */ 4823 */
4475 raid5_conf_t *conf = mddev_to_conf(mddev);
4476
4477 sectors &= ~((sector_t)mddev->chunk_size/512 - 1); 4824 sectors &= ~((sector_t)mddev->chunk_size/512 - 1);
4478 mddev->array_sectors = sectors * (mddev->raid_disks 4825 md_set_array_sectors(mddev, raid5_size(mddev, sectors,
4479 - conf->max_degraded); 4826 mddev->raid_disks));
4827 if (mddev->array_sectors >
4828 raid5_size(mddev, sectors, mddev->raid_disks))
4829 return -EINVAL;
4480 set_capacity(mddev->gendisk, mddev->array_sectors); 4830 set_capacity(mddev->gendisk, mddev->array_sectors);
4481 mddev->changed = 1; 4831 mddev->changed = 1;
4482 if (sectors/2 > mddev->size && mddev->recovery_cp == MaxSector) { 4832 if (sectors > mddev->dev_sectors && mddev->recovery_cp == MaxSector) {
4483 mddev->recovery_cp = mddev->size << 1; 4833 mddev->recovery_cp = mddev->dev_sectors;
4484 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4834 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4485 } 4835 }
4486 mddev->size = sectors /2; 4836 mddev->dev_sectors = sectors;
4487 mddev->resync_max_sectors = sectors; 4837 mddev->resync_max_sectors = sectors;
4488 return 0; 4838 return 0;
4489} 4839}
4490 4840
4491#ifdef CONFIG_MD_RAID5_RESHAPE
4492static int raid5_check_reshape(mddev_t *mddev) 4841static int raid5_check_reshape(mddev_t *mddev)
4493{ 4842{
4494 raid5_conf_t *conf = mddev_to_conf(mddev); 4843 raid5_conf_t *conf = mddev_to_conf(mddev);
4495 int err;
4496 4844
4497 if (mddev->delta_disks < 0 || 4845 if (mddev->delta_disks == 0 &&
4498 mddev->new_level != mddev->level) 4846 mddev->new_layout == mddev->layout &&
4499 return -EINVAL; /* Cannot shrink array or change level yet */ 4847 mddev->new_chunk == mddev->chunk_size)
4500 if (mddev->delta_disks == 0) 4848 return -EINVAL; /* nothing to do */
4501 return 0; /* nothing to do */
4502 if (mddev->bitmap) 4849 if (mddev->bitmap)
4503 /* Cannot grow a bitmap yet */ 4850 /* Cannot grow a bitmap yet */
4504 return -EBUSY; 4851 return -EBUSY;
4852 if (mddev->degraded > conf->max_degraded)
4853 return -EINVAL;
4854 if (mddev->delta_disks < 0) {
4855 /* We might be able to shrink, but the devices must
4856 * be made bigger first.
4857 * For raid6, 4 is the minimum size.
4858 * Otherwise 2 is the minimum
4859 */
4860 int min = 2;
4861 if (mddev->level == 6)
4862 min = 4;
4863 if (mddev->raid_disks + mddev->delta_disks < min)
4864 return -EINVAL;
4865 }
4505 4866
4506 /* Can only proceed if there are plenty of stripe_heads. 4867 /* Can only proceed if there are plenty of stripe_heads.
4507 * We need a minimum of one full stripe,, and for sensible progress 4868 * We need a minimum of one full stripe,, and for sensible progress
@@ -4514,18 +4875,12 @@ static int raid5_check_reshape(mddev_t *mddev)
4514 if ((mddev->chunk_size / STRIPE_SIZE) * 4 > conf->max_nr_stripes || 4875 if ((mddev->chunk_size / STRIPE_SIZE) * 4 > conf->max_nr_stripes ||
4515 (mddev->new_chunk / STRIPE_SIZE) * 4 > conf->max_nr_stripes) { 4876 (mddev->new_chunk / STRIPE_SIZE) * 4 > conf->max_nr_stripes) {
4516 printk(KERN_WARNING "raid5: reshape: not enough stripes. Needed %lu\n", 4877 printk(KERN_WARNING "raid5: reshape: not enough stripes. Needed %lu\n",
4517 (mddev->chunk_size / STRIPE_SIZE)*4); 4878 (max(mddev->chunk_size, mddev->new_chunk)
4879 / STRIPE_SIZE)*4);
4518 return -ENOSPC; 4880 return -ENOSPC;
4519 } 4881 }
4520 4882
4521 err = resize_stripes(conf, conf->raid_disks + mddev->delta_disks); 4883 return resize_stripes(conf, conf->raid_disks + mddev->delta_disks);
4522 if (err)
4523 return err;
4524
4525 if (mddev->degraded > conf->max_degraded)
4526 return -EINVAL;
4527 /* looks like we might be able to manage this */
4528 return 0;
4529} 4884}
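
raid5_check_reshape() above now admits shrinks and chunk/layout changes as well: it still refuses while a bitmap is attached or the array is degraded beyond max_degraded, a shrink may not go below the level's minimum device count (4 for raid6, 2 otherwise), and the stripe cache must hold at least four full stripes of the larger chunk size. A sketch of those admission checks (STRIPE_SIZE assumed to be the page-sized stripe unit, as in raid5.h):

#include <errno.h>

#define STRIPE_SIZE 4096        /* one page per stripe unit, as in the driver */

static int reshape_admissible(int level, int raid_disks, int delta_disks,
                              int degraded, int max_degraded,
                              unsigned int chunk_size, unsigned int new_chunk,
                              unsigned int max_nr_stripes)
{
        unsigned int big_chunk = new_chunk > chunk_size ? new_chunk : chunk_size;
        int min_disks = (level == 6) ? 4 : 2;

        if (degraded > max_degraded)
                return -EINVAL;         /* too many failed devices to restripe */
        if (delta_disks < 0 && raid_disks + delta_disks < min_disks)
                return -EINVAL;         /* shrink would go below the minimum */
        if ((big_chunk / STRIPE_SIZE) * 4 > max_nr_stripes)
                return -ENOSPC;         /* stripe cache cannot hold 4 full stripes */
        return 0;
}
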
4530 4885
4531static int raid5_start_reshape(mddev_t *mddev) 4886static int raid5_start_reshape(mddev_t *mddev)
@@ -4550,12 +4905,31 @@ static int raid5_start_reshape(mddev_t *mddev)
4550 */ 4905 */
4551 return -EINVAL; 4906 return -EINVAL;
4552 4907
4908 /* Refuse to reduce size of the array. Any reductions in
4909 * array size must be through explicit setting of array_size
4910 * attribute.
4911 */
4912 if (raid5_size(mddev, 0, conf->raid_disks + mddev->delta_disks)
4913 < mddev->array_sectors) {
4914 printk(KERN_ERR "md: %s: array size must be reduced "
4915 "before number of disks\n", mdname(mddev));
4916 return -EINVAL;
4917 }
4918
4553 atomic_set(&conf->reshape_stripes, 0); 4919 atomic_set(&conf->reshape_stripes, 0);
4554 spin_lock_irq(&conf->device_lock); 4920 spin_lock_irq(&conf->device_lock);
4555 conf->previous_raid_disks = conf->raid_disks; 4921 conf->previous_raid_disks = conf->raid_disks;
4556 conf->raid_disks += mddev->delta_disks; 4922 conf->raid_disks += mddev->delta_disks;
4557 conf->expand_progress = 0; 4923 conf->prev_chunk = conf->chunk_size;
4558 conf->expand_lo = 0; 4924 conf->chunk_size = mddev->new_chunk;
4925 conf->prev_algo = conf->algorithm;
4926 conf->algorithm = mddev->new_layout;
4927 if (mddev->delta_disks < 0)
4928 conf->reshape_progress = raid5_size(mddev, 0, 0);
4929 else
4930 conf->reshape_progress = 0;
4931 conf->reshape_safe = conf->reshape_progress;
4932 conf->generation++;
4559 spin_unlock_irq(&conf->device_lock); 4933 spin_unlock_irq(&conf->device_lock);
4560 4934
4561 /* Add some new drives, as many as will fit. 4935 /* Add some new drives, as many as will fit.
@@ -4580,9 +4954,12 @@ static int raid5_start_reshape(mddev_t *mddev)
4580 break; 4954 break;
4581 } 4955 }
4582 4956
4583 spin_lock_irqsave(&conf->device_lock, flags); 4957 if (mddev->delta_disks > 0) {
4584 mddev->degraded = (conf->raid_disks - conf->previous_raid_disks) - added_devices; 4958 spin_lock_irqsave(&conf->device_lock, flags);
4585 spin_unlock_irqrestore(&conf->device_lock, flags); 4959 mddev->degraded = (conf->raid_disks - conf->previous_raid_disks)
4960 - added_devices;
4961 spin_unlock_irqrestore(&conf->device_lock, flags);
4962 }
4586 mddev->raid_disks = conf->raid_disks; 4963 mddev->raid_disks = conf->raid_disks;
4587 mddev->reshape_position = 0; 4964 mddev->reshape_position = 0;
4588 set_bit(MD_CHANGE_DEVS, &mddev->flags); 4965 set_bit(MD_CHANGE_DEVS, &mddev->flags);
@@ -4597,52 +4974,86 @@ static int raid5_start_reshape(mddev_t *mddev)
4597 mddev->recovery = 0; 4974 mddev->recovery = 0;
4598 spin_lock_irq(&conf->device_lock); 4975 spin_lock_irq(&conf->device_lock);
4599 mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks; 4976 mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks;
4600 conf->expand_progress = MaxSector; 4977 conf->reshape_progress = MaxSector;
4601 spin_unlock_irq(&conf->device_lock); 4978 spin_unlock_irq(&conf->device_lock);
4602 return -EAGAIN; 4979 return -EAGAIN;
4603 } 4980 }
4981 conf->reshape_checkpoint = jiffies;
4604 md_wakeup_thread(mddev->sync_thread); 4982 md_wakeup_thread(mddev->sync_thread);
4605 md_new_event(mddev); 4983 md_new_event(mddev);
4606 return 0; 4984 return 0;
4607} 4985}
4608#endif
4609 4986
4987/* This is called from the reshape thread and should make any
4988 * changes needed in 'conf'
4989 */
4610static void end_reshape(raid5_conf_t *conf) 4990static void end_reshape(raid5_conf_t *conf)
4611{ 4991{
4612 struct block_device *bdev;
4613 4992
4614 if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) { 4993 if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) {
4615 conf->mddev->array_sectors = 2 * conf->mddev->size * 4994
4616 (conf->raid_disks - conf->max_degraded);
4617 set_capacity(conf->mddev->gendisk, conf->mddev->array_sectors);
4618 conf->mddev->changed = 1;
4619
4620 bdev = bdget_disk(conf->mddev->gendisk, 0);
4621 if (bdev) {
4622 mutex_lock(&bdev->bd_inode->i_mutex);
4623 i_size_write(bdev->bd_inode,
4624 (loff_t)conf->mddev->array_sectors << 9);
4625 mutex_unlock(&bdev->bd_inode->i_mutex);
4626 bdput(bdev);
4627 }
4628 spin_lock_irq(&conf->device_lock); 4995 spin_lock_irq(&conf->device_lock);
4629 conf->expand_progress = MaxSector; 4996 conf->previous_raid_disks = conf->raid_disks;
4997 conf->reshape_progress = MaxSector;
4630 spin_unlock_irq(&conf->device_lock); 4998 spin_unlock_irq(&conf->device_lock);
4631 conf->mddev->reshape_position = MaxSector; 4999 wake_up(&conf->wait_for_overlap);
4632 5000
4633 /* read-ahead size must cover two whole stripes, which is 5001 /* read-ahead size must cover two whole stripes, which is
4634 * 2 * (datadisks) * chunksize where 'n' is the number of raid devices 5002 * 2 * (datadisks) * chunksize where 'n' is the number of raid devices
4635 */ 5003 */
4636 { 5004 {
4637 int data_disks = conf->previous_raid_disks - conf->max_degraded; 5005 int data_disks = conf->raid_disks - conf->max_degraded;
4638 int stripe = data_disks * 5006 int stripe = data_disks * (conf->chunk_size
4639 (conf->mddev->chunk_size / PAGE_SIZE); 5007 / PAGE_SIZE);
4640 if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe) 5008 if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
4641 conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe; 5009 conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
4642 } 5010 }
4643 } 5011 }
4644} 5012}
4645 5013
5014/* This is called from the raid5d thread with mddev_lock held.
5015 * It makes config changes to the device.
5016 */
5017static void raid5_finish_reshape(mddev_t *mddev)
5018{
5019 struct block_device *bdev;
5020 raid5_conf_t *conf = mddev_to_conf(mddev);
5021
5022 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
5023
5024 if (mddev->delta_disks > 0) {
5025 md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));
5026 set_capacity(mddev->gendisk, mddev->array_sectors);
5027 mddev->changed = 1;
5028
5029 bdev = bdget_disk(mddev->gendisk, 0);
5030 if (bdev) {
5031 mutex_lock(&bdev->bd_inode->i_mutex);
5032 i_size_write(bdev->bd_inode,
5033 (loff_t)mddev->array_sectors << 9);
5034 mutex_unlock(&bdev->bd_inode->i_mutex);
5035 bdput(bdev);
5036 }
5037 } else {
5038 int d;
5039 mddev->degraded = conf->raid_disks;
5040 for (d = 0; d < conf->raid_disks ; d++)
5041 if (conf->disks[d].rdev &&
5042 test_bit(In_sync,
5043 &conf->disks[d].rdev->flags))
5044 mddev->degraded--;
5045 for (d = conf->raid_disks ;
5046 d < conf->raid_disks - mddev->delta_disks;
5047 d++)
5048 raid5_remove_disk(mddev, d);
5049 }
5050 mddev->layout = conf->algorithm;
5051 mddev->chunk_size = conf->chunk_size;
5052 mddev->reshape_position = MaxSector;
5053 mddev->delta_disks = 0;
5054 }
5055}
5056
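raid5_finish_reshape() runs from raid5d with the mddev lock held once the reshape ends: a grow only needs the new array size published, while a shrink has to recount 'degraded' over the new geometry and then detach the surplus devices, slots raid_disks up to raid_disks - delta_disks (delta_disks is negative here), via raid5_remove_disk(). A sketch of the recount, using an illustrative disk_slot structure in place of the driver's disk_info/rdev:

struct disk_slot { int present; int in_sync; };

/* Recount 'degraded' over the post-shrink geometry: every slot without an
 * In_sync device counts.  In the driver, the now-surplus slots at the end
 * of the array are then handed to raid5_remove_disk().
 */
static int recount_degraded(const struct disk_slot *disks, int raid_disks)
{
        int d, degraded = raid_disks;

        for (d = 0; d < raid_disks; d++)
                if (disks[d].present && disks[d].in_sync)
                        degraded--;
        return degraded;
}
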
4646static void raid5_quiesce(mddev_t *mddev, int state) 5057static void raid5_quiesce(mddev_t *mddev, int state)
4647{ 5058{
4648 raid5_conf_t *conf = mddev_to_conf(mddev); 5059 raid5_conf_t *conf = mddev_to_conf(mddev);
@@ -4672,6 +5083,212 @@ static void raid5_quiesce(mddev_t *mddev, int state)
4672 } 5083 }
4673} 5084}
4674 5085
5086
5087static void *raid5_takeover_raid1(mddev_t *mddev)
5088{
5089 int chunksect;
5090
5091 if (mddev->raid_disks != 2 ||
5092 mddev->degraded > 1)
5093 return ERR_PTR(-EINVAL);
5094
5095 /* Should check if there are write-behind devices? */
5096
5097 chunksect = 64*2; /* 64K by default */
5098
5099 /* The array must be an exact multiple of chunksize */
5100 while (chunksect && (mddev->array_sectors & (chunksect-1)))
5101 chunksect >>= 1;
5102
5103 if ((chunksect<<9) < STRIPE_SIZE)
5104 /* array size does not allow a suitable chunk size */
5105 return ERR_PTR(-EINVAL);
5106
5107 mddev->new_level = 5;
5108 mddev->new_layout = ALGORITHM_LEFT_SYMMETRIC;
5109 mddev->new_chunk = chunksect << 9;
5110
5111 return setup_conf(mddev);
5112}
5113
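raid5_takeover_raid1() converts a two-drive raid1 into a two-drive raid5 in place, so the only real decision is the chunk size: start at 64K and halve until it divides the array size exactly, refusing the takeover if that drops below one stripe unit. The same selection as a standalone sketch (sizes in 512-byte sectors; STRIPE_SIZE assumed to be 4096 bytes):

#include <stdint.h>

#define STRIPE_SIZE 4096

/* Returns the chosen chunk size in bytes, or 0 if none is acceptable. */
static unsigned int pick_takeover_chunk(uint64_t array_sectors)
{
        unsigned int chunksect = 64 * 2;        /* 64K, expressed in sectors */

        /* the array must be an exact multiple of the chunk size */
        while (chunksect && (array_sectors & (chunksect - 1)))
                chunksect >>= 1;

        if ((chunksect << 9) < STRIPE_SIZE)
                return 0;               /* array size leaves no usable chunk */
        return chunksect << 9;
}
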
5114static void *raid5_takeover_raid6(mddev_t *mddev)
5115{
5116 int new_layout;
5117
5118 switch (mddev->layout) {
5119 case ALGORITHM_LEFT_ASYMMETRIC_6:
5120 new_layout = ALGORITHM_LEFT_ASYMMETRIC;
5121 break;
5122 case ALGORITHM_RIGHT_ASYMMETRIC_6:
5123 new_layout = ALGORITHM_RIGHT_ASYMMETRIC;
5124 break;
5125 case ALGORITHM_LEFT_SYMMETRIC_6:
5126 new_layout = ALGORITHM_LEFT_SYMMETRIC;
5127 break;
5128 case ALGORITHM_RIGHT_SYMMETRIC_6:
5129 new_layout = ALGORITHM_RIGHT_SYMMETRIC;
5130 break;
5131 case ALGORITHM_PARITY_0_6:
5132 new_layout = ALGORITHM_PARITY_0;
5133 break;
5134 case ALGORITHM_PARITY_N:
5135 new_layout = ALGORITHM_PARITY_N;
5136 break;
5137 default:
5138 return ERR_PTR(-EINVAL);
5139 }
5140 mddev->new_level = 5;
5141 mddev->new_layout = new_layout;
5142 mddev->delta_disks = -1;
5143 mddev->raid_disks -= 1;
5144 return setup_conf(mddev);
5145}
5146
5147
5148static int raid5_reconfig(mddev_t *mddev, int new_layout, int new_chunk)
5149{
5150 /* For a 2-drive array, the layout and chunk size can be changed
5151 * immediately as not restriping is needed.
5152 * For larger arrays we record the new value - after validation
5153 * to be used by a reshape pass.
5154 */
5155 raid5_conf_t *conf = mddev_to_conf(mddev);
5156
5157 if (new_layout >= 0 && !algorithm_valid_raid5(new_layout))
5158 return -EINVAL;
5159 if (new_chunk > 0) {
5160 if (new_chunk & (new_chunk-1))
5161 /* not a power of 2 */
5162 return -EINVAL;
5163 if (new_chunk < PAGE_SIZE)
5164 return -EINVAL;
5165 if (mddev->array_sectors & ((new_chunk>>9)-1))
5166 /* not factor of array size */
5167 return -EINVAL;
5168 }
5169
5170 /* They look valid */
5171
5172 if (mddev->raid_disks == 2) {
5173
5174 if (new_layout >= 0) {
5175 conf->algorithm = new_layout;
5176 mddev->layout = mddev->new_layout = new_layout;
5177 }
5178 if (new_chunk > 0) {
5179 conf->chunk_size = new_chunk;
5180 mddev->chunk_size = mddev->new_chunk = new_chunk;
5181 }
5182 set_bit(MD_CHANGE_DEVS, &mddev->flags);
5183 md_wakeup_thread(mddev->thread);
5184 } else {
5185 if (new_layout >= 0)
5186 mddev->new_layout = new_layout;
5187 if (new_chunk > 0)
5188 mddev->new_chunk = new_chunk;
5189 }
5190 return 0;
5191}
5192
5193static int raid6_reconfig(mddev_t *mddev, int new_layout, int new_chunk)
5194{
5195 if (new_layout >= 0 && !algorithm_valid_raid6(new_layout))
5196 return -EINVAL;
5197 if (new_chunk > 0) {
5198 if (new_chunk & (new_chunk-1))
5199 /* not a power of 2 */
5200 return -EINVAL;
5201 if (new_chunk < PAGE_SIZE)
5202 return -EINVAL;
5203 if (mddev->array_sectors & ((new_chunk>>9)-1))
5204 /* not factor of array size */
5205 return -EINVAL;
5206 }
5207
5208 /* They look valid */
5209
5210 if (new_layout >= 0)
5211 mddev->new_layout = new_layout;
5212 if (new_chunk > 0)
5213 mddev->new_chunk = new_chunk;
5214
5215 return 0;
5216}
5217
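Both reconfig entry points above validate a requested chunk size the same way: it must be a power of two, at least a page, and a divisor of the array size. For a two-drive raid5 the change is applied immediately, since no restriping is needed; otherwise only new_layout/new_chunk are recorded for a later reshape pass. A sketch of the chunk validation alone (PAGE_SIZE assumed 4096 for the illustration):

#include <errno.h>
#include <stdint.h>

#define PAGE_SIZE 4096

static int chunk_change_valid(unsigned int new_chunk, uint64_t array_sectors)
{
        if (new_chunk & (new_chunk - 1))
                return -EINVAL;         /* not a power of two */
        if (new_chunk < PAGE_SIZE)
                return -EINVAL;         /* smaller than one page */
        if (array_sectors & ((new_chunk >> 9) - 1))
                return -EINVAL;         /* does not evenly divide the array */
        return 0;
}
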
5218static void *raid5_takeover(mddev_t *mddev)
5219{
5220 /* raid5 can take over:
5221 * raid0 - if all devices are the same - make it a raid4 layout
5222 * raid1 - if there are two drives. We need to know the chunk size
5223 * raid4 - trivial - just use a raid4 layout.
5224 * raid6 - Providing it is a *_6 layout
5225 *
5226 * For now, just do raid1
5227 */
5228
5229 if (mddev->level == 1)
5230 return raid5_takeover_raid1(mddev);
5231 if (mddev->level == 4) {
5232 mddev->new_layout = ALGORITHM_PARITY_N;
5233 mddev->new_level = 5;
5234 return setup_conf(mddev);
5235 }
5236 if (mddev->level == 6)
5237 return raid5_takeover_raid6(mddev);
5238
5239 return ERR_PTR(-EINVAL);
5240}
5241
5242
5243static struct mdk_personality raid5_personality;
5244
5245static void *raid6_takeover(mddev_t *mddev)
5246{
5247 /* Currently can only take over a raid5. We map the
5248 * personality to an equivalent raid6 personality
5249 * with the Q block at the end.
5250 */
5251 int new_layout;
5252
5253 if (mddev->pers != &raid5_personality)
5254 return ERR_PTR(-EINVAL);
5255 if (mddev->degraded > 1)
5256 return ERR_PTR(-EINVAL);
5257 if (mddev->raid_disks > 253)
5258 return ERR_PTR(-EINVAL);
5259 if (mddev->raid_disks < 3)
5260 return ERR_PTR(-EINVAL);
5261
5262 switch (mddev->layout) {
5263 case ALGORITHM_LEFT_ASYMMETRIC:
5264 new_layout = ALGORITHM_LEFT_ASYMMETRIC_6;
5265 break;
5266 case ALGORITHM_RIGHT_ASYMMETRIC:
5267 new_layout = ALGORITHM_RIGHT_ASYMMETRIC_6;
5268 break;
5269 case ALGORITHM_LEFT_SYMMETRIC:
5270 new_layout = ALGORITHM_LEFT_SYMMETRIC_6;
5271 break;
5272 case ALGORITHM_RIGHT_SYMMETRIC:
5273 new_layout = ALGORITHM_RIGHT_SYMMETRIC_6;
5274 break;
5275 case ALGORITHM_PARITY_0:
5276 new_layout = ALGORITHM_PARITY_0_6;
5277 break;
5278 case ALGORITHM_PARITY_N:
5279 new_layout = ALGORITHM_PARITY_N;
5280 break;
5281 default:
5282 return ERR_PTR(-EINVAL);
5283 }
5284 mddev->new_level = 6;
5285 mddev->new_layout = new_layout;
5286 mddev->delta_disks = 1;
5287 mddev->raid_disks += 1;
5288 return setup_conf(mddev);
5289}
5290
5291
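raid6_takeover() above and raid5_takeover_raid6() earlier are near-inverses: each raid5 layout maps to the matching *_6 raid6 layout, which keeps the raid5 rotation for P and parks Q on the last device, and the device count moves by one in the corresponding direction (subject to the 3..253 disk and degraded limits checked above). A sketch of the forward mapping; the enum values here are purely illustrative, the real ALGORITHM_* constants live in raid5.h:

enum layout {
        LEFT_ASYMMETRIC, RIGHT_ASYMMETRIC, LEFT_SYMMETRIC, RIGHT_SYMMETRIC,
        PARITY_0, PARITY_N,
        LEFT_ASYMMETRIC_6, RIGHT_ASYMMETRIC_6, LEFT_SYMMETRIC_6,
        RIGHT_SYMMETRIC_6, PARITY_0_6,
        LAYOUT_INVALID
};

/* raid5 -> raid6 mapping used on takeover: same P rotation, Q on the
 * last device; PARITY_N already has all parity at the end.
 */
static enum layout raid5_to_raid6_layout(enum layout l)
{
        switch (l) {
        case LEFT_ASYMMETRIC:   return LEFT_ASYMMETRIC_6;
        case RIGHT_ASYMMETRIC:  return RIGHT_ASYMMETRIC_6;
        case LEFT_SYMMETRIC:    return LEFT_SYMMETRIC_6;
        case RIGHT_SYMMETRIC:   return RIGHT_SYMMETRIC_6;
        case PARITY_0:          return PARITY_0_6;
        case PARITY_N:          return PARITY_N;
        default:                return LAYOUT_INVALID;
        }
}
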
4675static struct mdk_personality raid6_personality = 5292static struct mdk_personality raid6_personality =
4676{ 5293{
4677 .name = "raid6", 5294 .name = "raid6",
@@ -4687,11 +5304,13 @@ static struct mdk_personality raid6_personality =
4687 .spare_active = raid5_spare_active, 5304 .spare_active = raid5_spare_active,
4688 .sync_request = sync_request, 5305 .sync_request = sync_request,
4689 .resize = raid5_resize, 5306 .resize = raid5_resize,
4690#ifdef CONFIG_MD_RAID5_RESHAPE 5307 .size = raid5_size,
4691 .check_reshape = raid5_check_reshape, 5308 .check_reshape = raid5_check_reshape,
4692 .start_reshape = raid5_start_reshape, 5309 .start_reshape = raid5_start_reshape,
4693#endif 5310 .finish_reshape = raid5_finish_reshape,
4694 .quiesce = raid5_quiesce, 5311 .quiesce = raid5_quiesce,
5312 .takeover = raid6_takeover,
5313 .reconfig = raid6_reconfig,
4695}; 5314};
4696static struct mdk_personality raid5_personality = 5315static struct mdk_personality raid5_personality =
4697{ 5316{
@@ -4708,11 +5327,13 @@ static struct mdk_personality raid5_personality =
4708 .spare_active = raid5_spare_active, 5327 .spare_active = raid5_spare_active,
4709 .sync_request = sync_request, 5328 .sync_request = sync_request,
4710 .resize = raid5_resize, 5329 .resize = raid5_resize,
4711#ifdef CONFIG_MD_RAID5_RESHAPE 5330 .size = raid5_size,
4712 .check_reshape = raid5_check_reshape, 5331 .check_reshape = raid5_check_reshape,
4713 .start_reshape = raid5_start_reshape, 5332 .start_reshape = raid5_start_reshape,
4714#endif 5333 .finish_reshape = raid5_finish_reshape,
4715 .quiesce = raid5_quiesce, 5334 .quiesce = raid5_quiesce,
5335 .takeover = raid5_takeover,
5336 .reconfig = raid5_reconfig,
4716}; 5337};
4717 5338
4718static struct mdk_personality raid4_personality = 5339static struct mdk_personality raid4_personality =
@@ -4730,20 +5351,15 @@ static struct mdk_personality raid4_personality =
4730 .spare_active = raid5_spare_active, 5351 .spare_active = raid5_spare_active,
4731 .sync_request = sync_request, 5352 .sync_request = sync_request,
4732 .resize = raid5_resize, 5353 .resize = raid5_resize,
4733#ifdef CONFIG_MD_RAID5_RESHAPE 5354 .size = raid5_size,
4734 .check_reshape = raid5_check_reshape, 5355 .check_reshape = raid5_check_reshape,
4735 .start_reshape = raid5_start_reshape, 5356 .start_reshape = raid5_start_reshape,
4736#endif 5357 .finish_reshape = raid5_finish_reshape,
4737 .quiesce = raid5_quiesce, 5358 .quiesce = raid5_quiesce,
4738}; 5359};
4739 5360
4740static int __init raid5_init(void) 5361static int __init raid5_init(void)
4741{ 5362{
4742 int e;
4743
4744 e = raid6_select_algo();
4745 if ( e )
4746 return e;
4747 register_md_personality(&raid6_personality); 5363 register_md_personality(&raid6_personality);
4748 register_md_personality(&raid5_personality); 5364 register_md_personality(&raid5_personality);
4749 register_md_personality(&raid4_personality); 5365 register_md_personality(&raid4_personality);
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
new file mode 100644
index 000000000000..52ba99954dec
--- /dev/null
+++ b/drivers/md/raid5.h
@@ -0,0 +1,474 @@
1#ifndef _RAID5_H
2#define _RAID5_H
3
4#include <linux/raid/xor.h>
5
6/*
7 *
8 * Each stripe contains one buffer per disc. Each buffer can be in
9 * one of a number of states stored in "flags". Changes between
10 * these states happen *almost* exclusively under a per-stripe
11 * spinlock. Some very specific changes can happen in bi_end_io, and
12 * these are not protected by the spin lock.
13 *
14 * The flag bits that are used to represent these states are:
15 * R5_UPTODATE and R5_LOCKED
16 *
17 * State Empty == !UPTODATE, !LOCK
18 * We have no data, and there is no active request
19 * State Want == !UPTODATE, LOCK
20 * A read request is being submitted for this block
21 * State Dirty == UPTODATE, LOCK
22 * Some new data is in this buffer, and it is being written out
23 * State Clean == UPTODATE, !LOCK
24 * We have valid data which is the same as on disc
25 *
26 * The possible state transitions are:
27 *
28 * Empty -> Want - on read or write to get old data for parity calc
29 * Empty -> Dirty - on compute_parity to satisfy write/sync request.(RECONSTRUCT_WRITE)
30 * Empty -> Clean - on compute_block when computing a block for failed drive
31 * Want -> Empty - on failed read
32 * Want -> Clean - on successful completion of read request
33 * Dirty -> Clean - on successful completion of write request
34 * Dirty -> Clean - on failed write
35 * Clean -> Dirty - on compute_parity to satisfy write/sync (RECONSTRUCT or RMW)
36 *
37 * The Want->Empty, Want->Clean, Dirty->Clean transitions
38 * all happen in b_end_io at interrupt time.
39 * Each sets the Uptodate bit before releasing the Lock bit.
40 * This leaves one multi-stage transition:
41 * Want->Dirty->Clean
42 * This is safe because thinking that a Clean buffer is actually dirty
43 * will at worst delay some action, and the stripe will be scheduled
44 * for attention after the transition is complete.
45 *
46 * There is one possibility that is not covered by these states. That
47 * is if one drive has failed and there is a spare being rebuilt. We
48 * can't distinguish between a clean block that has been generated
49 * from parity calculations, and a clean block that has been
50 * successfully written to the spare ( or to parity when resyncing).
51 * To distinguish these states we have a stripe bit STRIPE_INSYNC that
52 * is set whenever a write is scheduled to the spare, or to the parity
53 * disc if there is no spare. A sync request clears this bit, and
54 * when we find it set with no buffers locked, we know the sync is
55 * complete.
56 *
57 * Buffers for the md device that arrive via make_request are attached
58 * to the appropriate stripe in one of two lists linked on b_reqnext.
59 * One list (bh_read) for read requests, one (bh_write) for write.
60 * There should never be more than one buffer on the two lists
61 * together, but that is not guaranteed, so we allow for more.
62 *
63 * If a buffer is on the read list when the associated cache buffer is
64 * Uptodate, the data is copied into the read buffer and its b_end_io
65 * routine is called. This may happen in the end_request routine only
66 * if the buffer has just successfully been read. end_request should
67 * remove the buffers from the list and then set the Uptodate bit on
68 * the buffer. Other threads may do this only if they first check
69 * that the Uptodate bit is set. Once they have checked that they may
70 * take buffers off the read queue.
71 *
72 * When a buffer on the write list is committed for write it is copied
73 * into the cache buffer, which is then marked dirty, and moved onto a
74 * third list, the written list (bh_written). Once both the parity
75 * block and the cached buffer are successfully written, any buffer on
76 * a written list can be returned with b_end_io.
77 *
78 * The write list and read list both act as fifos. The read list is
79 * protected by the device_lock. The write and written lists are
80 * protected by the stripe lock. The device_lock, which can be
81 * claimed while the stripe lock is held, is only for list
82 * manipulations and will only be held for a very short time. It can
83 * be claimed from interrupts.
84 *
85 *
86 * Stripes in the stripe cache can be on one of two lists (or on
87 * neither). The "inactive_list" contains stripes which are not
88 * currently being used for any request. They can freely be reused
89 * for another stripe. The "handle_list" contains stripes that need
90 * to be handled in some way. Both of these are fifo queues. Each
91 * stripe is also (potentially) linked to a hash bucket in the hash
92 * table so that it can be found by sector number. Stripes that are
93 * not hashed must be on the inactive_list, and will normally be at
94 * the front. All stripes start life this way.
95 *
96 * The inactive_list, handle_list and hash bucket lists are all protected by the
97 * device_lock.
98 * - stripes on the inactive_list never have their stripe_lock held.
99 * - stripes have a reference counter. If count==0, they are on a list.
100 * - If a stripe might need handling, STRIPE_HANDLE is set.
101 * - When refcount reaches zero, then if STRIPE_HANDLE it is put on
102 * handle_list else inactive_list
103 *
104 * This, combined with the fact that STRIPE_HANDLE is only ever
105 * cleared while a stripe has a non-zero count means that if the
106 * refcount is 0 and STRIPE_HANDLE is set, then it is on the
107 * handle_list and if refcount is 0 and STRIPE_HANDLE is not set, then
108 * the stripe is on inactive_list.
109 *
110 * The possible transitions are:
111 * activate an unhashed/inactive stripe (get_active_stripe())
112 * lockdev check-hash unlink-stripe cnt++ clean-stripe hash-stripe unlockdev
113 * activate a hashed, possibly active stripe (get_active_stripe())
114 * lockdev check-hash if(!cnt++)unlink-stripe unlockdev
115 * attach a request to an active stripe (add_stripe_bh())
116 * lockdev attach-buffer unlockdev
117 * handle a stripe (handle_stripe())
118 * lockstripe clrSTRIPE_HANDLE ...
119 * (lockdev check-buffers unlockdev) ..
120 * change-state ..
121 * record io/ops needed unlockstripe schedule io/ops
122 * release an active stripe (release_stripe())
123 * lockdev if (!--cnt) { if STRIPE_HANDLE, add to handle_list else add to inactive-list } unlockdev
124 *
125 * The refcount counts each thread that has activated the stripe,
126 * plus raid5d if it is handling it, plus one for each active request
127 * on a cached buffer, and plus one if the stripe is undergoing stripe
128 * operations.
129 *
130 * Stripe operations are performed outside the stripe lock,
131 * the stripe operations are:
132 * -copying data between the stripe cache and user application buffers
133 * -computing blocks to save a disk access, or to recover a missing block
134 * -updating the parity on a write operation (reconstruct write and
135 * read-modify-write)
136 * -checking parity correctness
137 * -running i/o to disk
138 * These operations are carried out by raid5_run_ops which uses the async_tx
139 * api to (optionally) offload operations to dedicated hardware engines.
140 * When requesting an operation handle_stripe sets the pending bit for the
141 * operation and increments the count. raid5_run_ops is then run whenever
142 * the count is non-zero.
143 * There are some critical dependencies between the operations that prevent some
144 * from being requested while another is in flight.
145 * 1/ Parity check operations destroy the in cache version of the parity block,
146 * so we prevent parity dependent operations like writes and compute_blocks
147 * from starting while a check is in progress. Some dma engines can perform
148 * the check without damaging the parity block; in these cases the parity
149 * block is re-marked up to date (assuming the check was successful) and is
150 * not re-read from disk.
151 * 2/ When a write operation is requested we immediately lock the affected
152 * blocks, and mark them as not up to date. This causes new read requests
153 * to be held off, as well as parity checks and compute block operations.
154 * 3/ Once a compute block operation has been requested handle_stripe treats
155 * that block as if it is up to date. raid5_run_ops guarantees that any
156 * operation that is dependent on the compute block result is initiated after
157 * the compute block completes.
158 */
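
The release step at the end of the transition list above reduces to a refcount drop under device_lock; a condensed sketch of that invariant (illustrative, not the full release_stripe()):

static void release_stripe_sketch(raid5_conf_t *conf, struct stripe_head *sh)
{
	unsigned long flags;

	spin_lock_irqsave(&conf->device_lock, flags);
	if (atomic_dec_and_test(&sh->count)) {
		/* count == 0: the stripe must go back on exactly one list */
		if (test_bit(STRIPE_HANDLE, &sh->state))
			list_add_tail(&sh->lru, &conf->handle_list);
		else
			list_add_tail(&sh->lru, &conf->inactive_list);
	}
	spin_unlock_irqrestore(&conf->device_lock, flags);
}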
159
160/*
161 * Operations state - intermediate states that are visible outside of sh->lock
162 * In general _idle indicates nothing is running, _run indicates a data
163 * processing operation is active, and _result means the data processing result
164 * is stable and can be acted upon. For simple operations like biofill and
165 * compute that only have an _idle and _run state they are indicated with
166 * sh->state flags (STRIPE_BIOFILL_RUN and STRIPE_COMPUTE_RUN)
167 */
168/**
169 * enum check_states - handles syncing / repairing a stripe
170 * @check_state_idle - check operations are quiesced
171 * @check_state_run - check operation is running
172 * @check_state_check_result - set outside lock when check result is valid
173 * @check_state_compute_run - check failed and we are repairing
174 * @check_state_compute_result - set outside lock when compute result is valid
175 */
176enum check_states {
177 check_state_idle = 0,
178 check_state_run, /* parity check */
179 check_state_check_result,
180 check_state_compute_run, /* parity repair */
181 check_state_compute_result,
182};
183
184/**
185 * enum reconstruct_states - handles writing or expanding a stripe
186 */
187enum reconstruct_states {
188 reconstruct_state_idle = 0,
189 reconstruct_state_prexor_drain_run, /* prexor-write */
190 reconstruct_state_drain_run, /* write */
191 reconstruct_state_run, /* expand */
192 reconstruct_state_prexor_drain_result,
193 reconstruct_state_drain_result,
194 reconstruct_state_result,
195};
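
A tiny illustrative helper (not part of the patch) showing how code outside sh->lock is meant to read these enums: the _run values mean async work is still in flight, the _result values mean handle_stripe may act on the outcome:

static inline int check_work_in_flight(enum check_states cs)
{
	return cs == check_state_run || cs == check_state_compute_run;
}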
196
197struct stripe_head {
198 struct hlist_node hash;
199 struct list_head lru; /* inactive_list or handle_list */
200 struct raid5_private_data *raid_conf;
201 short generation; /* increments with every
202 * reshape */
203 sector_t sector; /* sector of this row */
204 short pd_idx; /* parity disk index */
205 short qd_idx; /* 'Q' disk index for raid6 */
206 short ddf_layout;/* use DDF ordering to calculate Q */
207 unsigned long state; /* state flags */
208 atomic_t count; /* nr of active thread/requests */
209 spinlock_t lock;
210 int bm_seq; /* sequence number for bitmap flushes */
211 int disks; /* disks in stripe */
212 enum check_states check_state;
213 enum reconstruct_states reconstruct_state;
214 /* stripe_operations
215 * @target - STRIPE_OP_COMPUTE_BLK target
216 */
217 struct stripe_operations {
218 int target;
219 u32 zero_sum_result;
220 } ops;
221 struct r5dev {
222 struct bio req;
223 struct bio_vec vec;
224 struct page *page;
225 struct bio *toread, *read, *towrite, *written;
226 sector_t sector; /* sector of this page */
227 unsigned long flags;
228 } dev[1]; /* allocated with extra space depending on RAID geometry */
229};
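
dev[1] is deliberately a one-element array: each stripe is allocated with room for one struct r5dev per member device. The sizing arithmetic, as a small illustrative helper:

static inline size_t stripe_head_bytes(int devs)
{
	/* one r5dev is already part of struct stripe_head, hence devs - 1 */
	return sizeof(struct stripe_head) + (devs - 1) * sizeof(struct r5dev);
}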
230
231/* stripe_head_state - collects and tracks the dynamic state of a stripe_head
232 * for handle_stripe. It is only valid under spin_lock(sh->lock);
233 */
234struct stripe_head_state {
235 int syncing, expanding, expanded;
236 int locked, uptodate, to_read, to_write, failed, written;
237 int to_fill, compute, req_compute, non_overwrite;
238 int failed_num;
239 unsigned long ops_request;
240};
241
242/* r6_state - extra state data only relevant to r6 */
243struct r6_state {
244 int p_failed, q_failed, failed_num[2];
245};
246
247/* Flags */
248#define R5_UPTODATE 0 /* page contains current data */
249#define R5_LOCKED 1 /* IO has been submitted on "req" */
250#define R5_OVERWRITE 2 /* towrite covers whole page */
251/* and some that are internal to handle_stripe */
252#define R5_Insync 3 /* rdev && rdev->in_sync at start */
253#define R5_Wantread 4 /* want to schedule a read */
254#define R5_Wantwrite 5
255#define R5_Overlap 7 /* There is a pending overlapping request on this block */
256#define R5_ReadError 8 /* seen a read error here recently */
257#define R5_ReWrite 9 /* have tried to over-write the readerror */
258
259#define R5_Expanded 10 /* This block now has post-expand data */
260#define R5_Wantcompute 11 /* compute_block in progress treat as
261 * uptodate
262 */
263#define R5_Wantfill 12 /* dev->toread contains a bio that needs
264 * filling
265 */
266#define R5_Wantdrain 13 /* dev->towrite needs to be drained */
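
The Empty/Want/Dirty/Clean buffer states described in the comment at the top of this header are simply the four combinations of R5_UPTODATE and R5_LOCKED; an illustrative decoder:

static inline const char *r5dev_state_name(unsigned long flags)
{
	int uptodate = test_bit(R5_UPTODATE, &flags);
	int locked = test_bit(R5_LOCKED, &flags);

	if (!uptodate && !locked)
		return "Empty";	/* no data, no request outstanding */
	if (!uptodate && locked)
		return "Want";	/* read has been submitted */
	if (uptodate && locked)
		return "Dirty";	/* new data is being written out */
	return "Clean";		/* data matches what is on disc */
}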
267/*
268 * Write method
269 */
270#define RECONSTRUCT_WRITE 1
271#define READ_MODIFY_WRITE 2
272/* not a write method, but a compute_parity mode */
273#define CHECK_PARITY 3
274/* Additional compute_parity mode -- updates the parity w/o LOCKING */
275#define UPDATE_PARITY 4
276
277/*
278 * Stripe state
279 */
280#define STRIPE_HANDLE 2
281#define STRIPE_SYNCING 3
282#define STRIPE_INSYNC 4
283#define STRIPE_PREREAD_ACTIVE 5
284#define STRIPE_DELAYED 6
285#define STRIPE_DEGRADED 7
286#define STRIPE_BIT_DELAY 8
287#define STRIPE_EXPANDING 9
288#define STRIPE_EXPAND_SOURCE 10
289#define STRIPE_EXPAND_READY 11
290#define STRIPE_IO_STARTED 12 /* do not count towards 'bypass_count' */
291#define STRIPE_FULL_WRITE 13 /* all blocks are set to be overwritten */
292#define STRIPE_BIOFILL_RUN 14
293#define STRIPE_COMPUTE_RUN 15
294/*
295 * Operation request flags
296 */
297#define STRIPE_OP_BIOFILL 0
298#define STRIPE_OP_COMPUTE_BLK 1
299#define STRIPE_OP_PREXOR 2
300#define STRIPE_OP_BIODRAIN 3
301#define STRIPE_OP_POSTXOR 4
302#define STRIPE_OP_CHECK 5
303
304/*
305 * Plugging:
306 *
307 * To improve write throughput, we need to delay the handling of some
308 * stripes until there has been a chance that several write requests
309 * for the one stripe have all been collected.
310 * In particular, any write request that would require pre-reading
311 * is put on a "delayed" queue until there are no stripes currently
312 * in a pre-read phase. Further, if the "delayed" queue is empty when
313 * a stripe is put on it then we "plug" the queue and do not process it
314 * until an unplug call is made. (the unplug_io_fn() is called).
315 *
316 * When preread is initiated on a stripe, we set PREREAD_ACTIVE and add
317 * it to the count of prereading stripes.
318 * When write is initiated, or the stripe refcnt == 0 (just in case) we
319 * clear the PREREAD_ACTIVE flag and decrement the count
320 * Whenever the 'handle' queue is empty and the device is not plugged, we
321 * move any stripes from delayed to handle and clear the DELAYED flag and set
322 * PREREAD_ACTIVE.
323 * In stripe_handle, if we find pre-reading is necessary, we do it if
324 * PREREAD_ACTIVE is set, else we set DELAYED which will send it to the delayed queue.
325 * HANDLE gets cleared if stripe_handle leaves nothing locked.
326 */
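
A condensed sketch of the "unplug" step described above — move every delayed stripe to the handle list, mark it PREREAD_ACTIVE and count it (illustrative; the caller is assumed to hold conf->device_lock):

static void activate_delayed_sketch(raid5_conf_t *conf)
{
	while (!list_empty(&conf->delayed_list)) {
		struct stripe_head *sh =
			list_entry(conf->delayed_list.next,
				   struct stripe_head, lru);

		list_del_init(&sh->lru);
		clear_bit(STRIPE_DELAYED, &sh->state);
		if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
			atomic_inc(&conf->preread_active_stripes);
		list_add_tail(&sh->lru, &conf->handle_list);
	}
}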
327
328
329struct disk_info {
330 mdk_rdev_t *rdev;
331};
332
333struct raid5_private_data {
334 struct hlist_head *stripe_hashtbl;
335 mddev_t *mddev;
336 struct disk_info *spare;
337 int chunk_size, level, algorithm;
338 int max_degraded;
339 int raid_disks;
340 int max_nr_stripes;
341
342 /* reshape_progress is the leading edge of a 'reshape'
343 * It has value MaxSector when no reshape is happening
344 * If delta_disks < 0, it is the last sector we started work on,
345 * else is it the next sector to work on.
346 */
347 sector_t reshape_progress;
348 /* reshape_safe is the trailing edge of a reshape. We know that
349 * before (or after) this address, all reshape has completed.
350 */
351 sector_t reshape_safe;
352 int previous_raid_disks;
353 int prev_chunk, prev_algo;
354 short generation; /* increments with every reshape */
355 unsigned long reshape_checkpoint; /* Time we last updated
356 * metadata */
357
358 struct list_head handle_list; /* stripes needing handling */
359 struct list_head hold_list; /* preread ready stripes */
360 struct list_head delayed_list; /* stripes that have plugged requests */
361 struct list_head bitmap_list; /* stripes delaying awaiting bitmap update */
362 struct bio *retry_read_aligned; /* currently retrying aligned bios */
363 struct bio *retry_read_aligned_list; /* aligned bios retry list */
364 atomic_t preread_active_stripes; /* stripes with scheduled io */
365 atomic_t active_aligned_reads;
366 atomic_t pending_full_writes; /* full write backlog */
367 int bypass_count; /* bypassed prereads */
368 int bypass_threshold; /* preread nice */
369 struct list_head *last_hold; /* detect hold_list promotions */
370
371 atomic_t reshape_stripes; /* stripes with pending writes for reshape */
372 /* unfortunately we need two cache names as we temporarily have
373 * two caches.
374 */
375 int active_name;
376 char cache_name[2][20];
377 struct kmem_cache *slab_cache; /* for allocating stripes */
378
379 int seq_flush, seq_write;
380 int quiesce;
381
382 int fullsync; /* set to 1 if a full sync is needed,
383 * (fresh device added).
384 * Cleared when a sync completes.
385 */
386
387 struct page *spare_page; /* Used when checking P/Q in raid6 */
388
389 /*
390 * Free stripes pool
391 */
392 atomic_t active_stripes;
393 struct list_head inactive_list;
394 wait_queue_head_t wait_for_stripe;
395 wait_queue_head_t wait_for_overlap;
396 int inactive_blocked; /* release of inactive stripes blocked,
397 * waiting for 25% to be free
398 */
399 int pool_size; /* number of disks in stripeheads in pool */
400 spinlock_t device_lock;
401 struct disk_info *disks;
402
403 /* When taking over an array from a different personality, we store
404 * the new thread here until we fully activate the array.
405 */
406 struct mdk_thread_s *thread;
407};
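
reshape_progress and reshape_safe decide whether a given request still has to use the old geometry; a simplified, hedged sketch of that decision (the real make_request path also re-checks under the lock and retries):

static int sector_needs_old_geometry(mddev_t *mddev, raid5_conf_t *conf,
				     sector_t logical_sector)
{
	if (conf->reshape_progress == MaxSector)
		return 0;		/* no reshape in progress */
	if (mddev->delta_disks < 0)
		/* shrinking: reshape runs from the end towards the start */
		return logical_sector < conf->reshape_progress;
	/* growing: reshape runs forwards from the start */
	return logical_sector >= conf->reshape_progress;
}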
408
409typedef struct raid5_private_data raid5_conf_t;
410
411#define mddev_to_conf(mddev) ((raid5_conf_t *) mddev->private)
412
413/*
414 * Our supported algorithms
415 */
416#define ALGORITHM_LEFT_ASYMMETRIC 0 /* Rotating Parity N with Data Restart */
417#define ALGORITHM_RIGHT_ASYMMETRIC 1 /* Rotating Parity 0 with Data Restart */
418#define ALGORITHM_LEFT_SYMMETRIC 2 /* Rotating Parity N with Data Continuation */
419#define ALGORITHM_RIGHT_SYMMETRIC 3 /* Rotating Parity 0 with Data Continuation */
420
421/* Define non-rotating (raid4) algorithms. These allow
422 * conversion of raid4 to raid5.
423 */
424#define ALGORITHM_PARITY_0 4 /* P or P,Q are initial devices */
425#define ALGORITHM_PARITY_N 5 /* P or P,Q are final devices. */
426
427/* DDF RAID6 layouts differ from md/raid6 layouts in two ways.
428 * Firstly, the exact positioning of the parity block is slightly
429 * different between the 'LEFT_*' modes of md and the "_N_*" modes
430 * of DDF.
431 * Secondly, the order of data blocks over which the Q syndrome is computed
432 * is different.
433 * Consequently we have different layouts for DDF/raid6 than md/raid6.
434 * These layouts are from the DDFv1.2 spec.
435 * Interestingly DDFv1.2-Errata-A does not specify N_CONTINUE but
436 * leaves RLQ=3 as 'Vendor Specific'
437 */
438
439#define ALGORITHM_ROTATING_ZERO_RESTART 8 /* DDF PRL=6 RLQ=1 */
440#define ALGORITHM_ROTATING_N_RESTART 9 /* DDF PRL=6 RLQ=2 */
441#define ALGORITHM_ROTATING_N_CONTINUE 10 /* DDF PRL=6 RLQ=3 */
442
443
444/* For every RAID5 algorithm we define a RAID6 algorithm
445 * with exactly the same layout for data and parity, and
446 * with the Q block always on the last device (N-1).
447 * This allows trivial conversion from RAID5 to RAID6
448 */
449#define ALGORITHM_LEFT_ASYMMETRIC_6 16
450#define ALGORITHM_RIGHT_ASYMMETRIC_6 17
451#define ALGORITHM_LEFT_SYMMETRIC_6 18
452#define ALGORITHM_RIGHT_SYMMETRIC_6 19
453#define ALGORITHM_PARITY_0_6 20
454#define ALGORITHM_PARITY_N_6 ALGORITHM_PARITY_N
455
456static inline int algorithm_valid_raid5(int layout)
457{
458 return (layout >= 0) &&
459 (layout <= 5);
460}
461static inline int algorithm_valid_raid6(int layout)
462{
463 return (layout >= 0 && layout <= 5)
464 ||
465 (layout == 8 || layout == 10)
466 ||
467 (layout >= 16 && layout <= 20);
468}
469
470static inline int algorithm_is_DDF(int layout)
471{
472 return layout >= 8 && layout <= 10;
473}
474#endif
diff --git a/drivers/md/raid6.h b/drivers/md/raid6.h
deleted file mode 100644
index 98dcde88470e..000000000000
--- a/drivers/md/raid6.h
+++ /dev/null
@@ -1,130 +0,0 @@
1/* -*- linux-c -*- ------------------------------------------------------- *
2 *
3 * Copyright 2003 H. Peter Anvin - All Rights Reserved
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation, Inc., 53 Temple Place Ste 330,
8 * Bostom MA 02111-1307, USA; either version 2 of the License, or
9 * (at your option) any later version; incorporated herein by reference.
10 *
11 * ----------------------------------------------------------------------- */
12
13#ifndef LINUX_RAID_RAID6_H
14#define LINUX_RAID_RAID6_H
15
16#ifdef __KERNEL__
17
18/* Set to 1 to use kernel-wide empty_zero_page */
19#define RAID6_USE_EMPTY_ZERO_PAGE 0
20
21#include <linux/raid/md.h>
22#include <linux/raid/raid5.h>
23
24typedef raid5_conf_t raid6_conf_t; /* Same configuration */
25
26/* Additional compute_parity mode -- updates the parity w/o LOCKING */
27#define UPDATE_PARITY 4
28
29/* We need a pre-zeroed page... if we don't want to use the kernel-provided
30 one define it here */
31#if RAID6_USE_EMPTY_ZERO_PAGE
32# define raid6_empty_zero_page empty_zero_page
33#else
34extern const char raid6_empty_zero_page[PAGE_SIZE];
35#endif
36
37#else /* ! __KERNEL__ */
38/* Used for testing in user space */
39
40#include <errno.h>
41#include <inttypes.h>
42#include <limits.h>
43#include <stddef.h>
44#include <sys/mman.h>
45#include <sys/types.h>
46
47/* Not standard, but glibc defines it */
48#define BITS_PER_LONG __WORDSIZE
49
50typedef uint8_t u8;
51typedef uint16_t u16;
52typedef uint32_t u32;
53typedef uint64_t u64;
54
55#ifndef PAGE_SIZE
56# define PAGE_SIZE 4096
57#endif
58extern const char raid6_empty_zero_page[PAGE_SIZE];
59
60#define __init
61#define __exit
62#define __attribute_const__ __attribute__((const))
63#define noinline __attribute__((noinline))
64
65#define preempt_enable()
66#define preempt_disable()
67#define cpu_has_feature(x) 1
68#define enable_kernel_altivec()
69#define disable_kernel_altivec()
70
71#endif /* __KERNEL__ */
72
73/* Routine choices */
74struct raid6_calls {
75 void (*gen_syndrome)(int, size_t, void **);
76 int (*valid)(void); /* Returns 1 if this routine set is usable */
77 const char *name; /* Name of this routine set */
78 int prefer; /* Has special performance attribute */
79};
80
81/* Selected algorithm */
82extern struct raid6_calls raid6_call;
83
84/* Algorithm list */
85extern const struct raid6_calls * const raid6_algos[];
86int raid6_select_algo(void);
87
88/* Return values from chk_syndrome */
89#define RAID6_OK 0
90#define RAID6_P_BAD 1
91#define RAID6_Q_BAD 2
92#define RAID6_PQ_BAD 3
93
94/* Galois field tables */
95extern const u8 raid6_gfmul[256][256] __attribute__((aligned(256)));
96extern const u8 raid6_gfexp[256] __attribute__((aligned(256)));
97extern const u8 raid6_gfinv[256] __attribute__((aligned(256)));
98extern const u8 raid6_gfexi[256] __attribute__((aligned(256)));
99
100/* Recovery routines */
101void raid6_2data_recov(int disks, size_t bytes, int faila, int failb, void **ptrs);
102void raid6_datap_recov(int disks, size_t bytes, int faila, void **ptrs);
103void raid6_dual_recov(int disks, size_t bytes, int faila, int failb, void **ptrs);
104
105/* Some definitions to allow code to be compiled for testing in userspace */
106#ifndef __KERNEL__
107
108# define jiffies raid6_jiffies()
109# define printk printf
110# define GFP_KERNEL 0
111# define __get_free_pages(x,y) ((unsigned long)mmap(NULL, PAGE_SIZE << (y), PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, 0, 0))
112# define free_pages(x,y) munmap((void *)(x), (y)*PAGE_SIZE)
113
114static inline void cpu_relax(void)
115{
116 /* Nothing */
117}
118
119#undef HZ
120#define HZ 1000
121static inline uint32_t raid6_jiffies(void)
122{
123 struct timeval tv;
124 gettimeofday(&tv, NULL);
125 return tv.tv_sec*1000 + tv.tv_usec/1000;
126}
127
128#endif /* ! __KERNEL__ */
129
130#endif /* LINUX_RAID_RAID6_H */
diff --git a/drivers/md/raid6algos.c b/drivers/md/raid6algos.c
index 21987e3dbe6c..866215ac7f25 100644
--- a/drivers/md/raid6algos.c
+++ b/drivers/md/raid6algos.c
@@ -5,7 +5,7 @@
5 * This program is free software; you can redistribute it and/or modify 5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by 6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation, Inc., 53 Temple Place Ste 330, 7 * the Free Software Foundation, Inc., 53 Temple Place Ste 330,
8 * Bostom MA 02111-1307, USA; either version 2 of the License, or 8 * Boston MA 02111-1307, USA; either version 2 of the License, or
9 * (at your option) any later version; incorporated herein by reference. 9 * (at your option) any later version; incorporated herein by reference.
10 * 10 *
11 * ----------------------------------------------------------------------- */ 11 * ----------------------------------------------------------------------- */
@@ -16,13 +16,20 @@
16 * Algorithm list and algorithm selection for RAID-6 16 * Algorithm list and algorithm selection for RAID-6
17 */ 17 */
18 18
19#include "raid6.h" 19#include <linux/raid/pq.h>
20#ifndef __KERNEL__ 20#ifndef __KERNEL__
21#include <sys/mman.h> 21#include <sys/mman.h>
22#include <stdio.h> 22#include <stdio.h>
23#else
24#if !RAID6_USE_EMPTY_ZERO_PAGE
25/* In .bss so it's zeroed */
26const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256)));
27EXPORT_SYMBOL(raid6_empty_zero_page);
28#endif
23#endif 29#endif
24 30
25struct raid6_calls raid6_call; 31struct raid6_calls raid6_call;
32EXPORT_SYMBOL_GPL(raid6_call);
26 33
27/* Various routine sets */ 34/* Various routine sets */
28extern const struct raid6_calls raid6_intx1; 35extern const struct raid6_calls raid6_intx1;
@@ -79,6 +86,7 @@ const struct raid6_calls * const raid6_algos[] = {
79#else 86#else
80/* Need more time to be stable in userspace */ 87/* Need more time to be stable in userspace */
81#define RAID6_TIME_JIFFIES_LG2 9 88#define RAID6_TIME_JIFFIES_LG2 9
89#define time_before(x, y) ((x) < (y))
82#endif 90#endif
83 91
84/* Try to pick the best algorithm */ 92/* Try to pick the best algorithm */
@@ -152,3 +160,12 @@ int __init raid6_select_algo(void)
152 160
153 return best ? 0 : -EINVAL; 161 return best ? 0 : -EINVAL;
154} 162}
163
164static void raid6_exit(void)
165{
166 do { } while (0);
167}
168
169subsys_initcall(raid6_select_algo);
170module_exit(raid6_exit);
171MODULE_LICENSE("GPL");
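
Now that the P/Q code is its own module, raid6_select_algo() runs as a subsys_initcall rather than being called from raid5_init(). Roughly sketched (illustrative; the real function also handles page allocation and the 'prefer' hint), it times every usable gen_syndrome() implementation over a fixed number of jiffies and keeps the fastest:

static const struct raid6_calls *pick_fastest(int disks, size_t bytes,
					      void **dptrs)
{
	const struct raid6_calls *const *algo, *best = NULL;
	unsigned long best_perf = 0;

	for (algo = raid6_algos; *algo; algo++) {
		unsigned long perf = 0, j0, j1;

		if ((*algo)->valid && !(*algo)->valid())
			continue;	/* e.g. SSE2 not present on this CPU */

		j0 = jiffies;
		while ((j1 = jiffies) == j0)
			cpu_relax();	/* start on a tick boundary */
		while (time_before(jiffies,
				   j1 + (1 << RAID6_TIME_JIFFIES_LG2))) {
			(*algo)->gen_syndrome(disks, bytes, dptrs);
			perf++;
		}
		if (perf > best_perf) {
			best_perf = perf;
			best = *algo;
		}
	}
	return best;	/* the caller would store this in raid6_call */
}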
diff --git a/drivers/md/raid6altivec.uc b/drivers/md/raid6altivec.uc
index b9afd35b8812..699dfeee4944 100644
--- a/drivers/md/raid6altivec.uc
+++ b/drivers/md/raid6altivec.uc
@@ -5,7 +5,7 @@
5 * This program is free software; you can redistribute it and/or modify 5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by 6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation, Inc., 53 Temple Place Ste 330, 7 * the Free Software Foundation, Inc., 53 Temple Place Ste 330,
8 * Bostom MA 02111-1307, USA; either version 2 of the License, or 8 * Boston MA 02111-1307, USA; either version 2 of the License, or
9 * (at your option) any later version; incorporated herein by reference. 9 * (at your option) any later version; incorporated herein by reference.
10 * 10 *
11 * ----------------------------------------------------------------------- */ 11 * ----------------------------------------------------------------------- */
@@ -22,7 +22,7 @@
22 * bracked this with preempt_disable/enable or in a lock) 22 * bracked this with preempt_disable/enable or in a lock)
23 */ 23 */
24 24
25#include "raid6.h" 25#include <linux/raid/pq.h>
26 26
27#ifdef CONFIG_ALTIVEC 27#ifdef CONFIG_ALTIVEC
28 28
diff --git a/drivers/md/raid6int.uc b/drivers/md/raid6int.uc
index ad004cee0e26..f9bf9cba357f 100644
--- a/drivers/md/raid6int.uc
+++ b/drivers/md/raid6int.uc
@@ -5,7 +5,7 @@
5 * This program is free software; you can redistribute it and/or modify 5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by 6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation, Inc., 53 Temple Place Ste 330, 7 * the Free Software Foundation, Inc., 53 Temple Place Ste 330,
8 * Bostom MA 02111-1307, USA; either version 2 of the License, or 8 * Boston MA 02111-1307, USA; either version 2 of the License, or
9 * (at your option) any later version; incorporated herein by reference. 9 * (at your option) any later version; incorporated herein by reference.
10 * 10 *
11 * ----------------------------------------------------------------------- */ 11 * ----------------------------------------------------------------------- */
@@ -18,7 +18,7 @@
18 * This file is postprocessed using unroll.pl 18 * This file is postprocessed using unroll.pl
19 */ 19 */
20 20
21#include "raid6.h" 21#include <linux/raid/pq.h>
22 22
23/* 23/*
24 * This is the C data type to use 24 * This is the C data type to use
diff --git a/drivers/md/raid6mmx.c b/drivers/md/raid6mmx.c
index d4e4a1bd70ad..e7f6c13132bf 100644
--- a/drivers/md/raid6mmx.c
+++ b/drivers/md/raid6mmx.c
@@ -5,7 +5,7 @@
5 * This program is free software; you can redistribute it and/or modify 5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by 6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation, Inc., 53 Temple Place Ste 330, 7 * the Free Software Foundation, Inc., 53 Temple Place Ste 330,
8 * Bostom MA 02111-1307, USA; either version 2 of the License, or 8 * Boston MA 02111-1307, USA; either version 2 of the License, or
9 * (at your option) any later version; incorporated herein by reference. 9 * (at your option) any later version; incorporated herein by reference.
10 * 10 *
11 * ----------------------------------------------------------------------- */ 11 * ----------------------------------------------------------------------- */
@@ -18,7 +18,7 @@
18 18
19#if defined(__i386__) && !defined(__arch_um__) 19#if defined(__i386__) && !defined(__arch_um__)
20 20
21#include "raid6.h" 21#include <linux/raid/pq.h>
22#include "raid6x86.h" 22#include "raid6x86.h"
23 23
24/* Shared with raid6sse1.c */ 24/* Shared with raid6sse1.c */
diff --git a/drivers/md/raid6recov.c b/drivers/md/raid6recov.c
index a8c4d9451bd9..2609f00e0d61 100644
--- a/drivers/md/raid6recov.c
+++ b/drivers/md/raid6recov.c
@@ -5,7 +5,7 @@
5 * This program is free software; you can redistribute it and/or modify 5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by 6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation, Inc., 53 Temple Place Ste 330, 7 * the Free Software Foundation, Inc., 53 Temple Place Ste 330,
8 * Bostom MA 02111-1307, USA; either version 2 of the License, or 8 * Boston MA 02111-1307, USA; either version 2 of the License, or
9 * (at your option) any later version; incorporated herein by reference. 9 * (at your option) any later version; incorporated herein by reference.
10 * 10 *
11 * ----------------------------------------------------------------------- */ 11 * ----------------------------------------------------------------------- */
@@ -18,7 +18,7 @@
18 * the syndrome.) 18 * the syndrome.)
19 */ 19 */
20 20
21#include "raid6.h" 21#include <linux/raid/pq.h>
22 22
23/* Recover two failed data blocks. */ 23/* Recover two failed data blocks. */
24void raid6_2data_recov(int disks, size_t bytes, int faila, int failb, 24void raid6_2data_recov(int disks, size_t bytes, int faila, int failb,
@@ -63,9 +63,7 @@ void raid6_2data_recov(int disks, size_t bytes, int faila, int failb,
63 p++; q++; 63 p++; q++;
64 } 64 }
65} 65}
66 66EXPORT_SYMBOL_GPL(raid6_2data_recov);
67
68
69 67
70/* Recover failure of one data block plus the P block */ 68/* Recover failure of one data block plus the P block */
71void raid6_datap_recov(int disks, size_t bytes, int faila, void **ptrs) 69void raid6_datap_recov(int disks, size_t bytes, int faila, void **ptrs)
@@ -97,9 +95,10 @@ void raid6_datap_recov(int disks, size_t bytes, int faila, void **ptrs)
97 q++; dq++; 95 q++; dq++;
98 } 96 }
99} 97}
98EXPORT_SYMBOL_GPL(raid6_datap_recov);
100 99
101 100#ifndef __KERNEL__
102#ifndef __KERNEL__ /* Testing only */ 101/* Testing only */
103 102
104/* Recover two failed blocks. */ 103/* Recover two failed blocks. */
105void raid6_dual_recov(int disks, size_t bytes, int faila, int failb, void **ptrs) 104void raid6_dual_recov(int disks, size_t bytes, int faila, int failb, void **ptrs)
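
The recovery entry points exported above take the same pointer array as gen_syndrome(): one buffer per drive, data blocks first, then P, then Q. A hedged usage sketch in the style of the userspace test harness (buffer setup elided; the helper names are hypothetical):

#define NDISKS 16	/* including P and Q, as in raid6test/test.c */

static void recover_two_data_blocks(void *ptrs[NDISKS], int faila, int failb)
{
	/* surviving data plus ptrs[NDISKS-2] (P) and ptrs[NDISKS-1] (Q)
	 * must already be filled in; the two failed blocks are rebuilt
	 * in place from the rest. */
	raid6_2data_recov(NDISKS, PAGE_SIZE, faila, failb, ptrs);
}

static void recover_data_and_p(void *ptrs[NDISKS], int faila)
{
	/* one data block plus P are missing: data comes back from Q,
	 * then P is regenerated. */
	raid6_datap_recov(NDISKS, PAGE_SIZE, faila, ptrs);
}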
diff --git a/drivers/md/raid6sse1.c b/drivers/md/raid6sse1.c
index 0666237276ff..b274dd5eab8f 100644
--- a/drivers/md/raid6sse1.c
+++ b/drivers/md/raid6sse1.c
@@ -5,7 +5,7 @@
5 * This program is free software; you can redistribute it and/or modify 5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by 6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation, Inc., 53 Temple Place Ste 330, 7 * the Free Software Foundation, Inc., 53 Temple Place Ste 330,
8 * Bostom MA 02111-1307, USA; either version 2 of the License, or 8 * Boston MA 02111-1307, USA; either version 2 of the License, or
9 * (at your option) any later version; incorporated herein by reference. 9 * (at your option) any later version; incorporated herein by reference.
10 * 10 *
11 * ----------------------------------------------------------------------- */ 11 * ----------------------------------------------------------------------- */
@@ -23,7 +23,7 @@
23 23
24#if defined(__i386__) && !defined(__arch_um__) 24#if defined(__i386__) && !defined(__arch_um__)
25 25
26#include "raid6.h" 26#include <linux/raid/pq.h>
27#include "raid6x86.h" 27#include "raid6x86.h"
28 28
29/* Defined in raid6mmx.c */ 29/* Defined in raid6mmx.c */
diff --git a/drivers/md/raid6sse2.c b/drivers/md/raid6sse2.c
index b034ad868039..6ed6c6c0389f 100644
--- a/drivers/md/raid6sse2.c
+++ b/drivers/md/raid6sse2.c
@@ -5,7 +5,7 @@
5 * This program is free software; you can redistribute it and/or modify 5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by 6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation, Inc., 53 Temple Place Ste 330, 7 * the Free Software Foundation, Inc., 53 Temple Place Ste 330,
8 * Bostom MA 02111-1307, USA; either version 2 of the License, or 8 * Boston MA 02111-1307, USA; either version 2 of the License, or
9 * (at your option) any later version; incorporated herein by reference. 9 * (at your option) any later version; incorporated herein by reference.
10 * 10 *
11 * ----------------------------------------------------------------------- */ 11 * ----------------------------------------------------------------------- */
@@ -19,7 +19,7 @@
19 19
20#if (defined(__i386__) || defined(__x86_64__)) && !defined(__arch_um__) 20#if (defined(__i386__) || defined(__x86_64__)) && !defined(__arch_um__)
21 21
22#include "raid6.h" 22#include <linux/raid/pq.h>
23#include "raid6x86.h" 23#include "raid6x86.h"
24 24
25static const struct raid6_sse_constants { 25static const struct raid6_sse_constants {
diff --git a/drivers/md/raid6test/Makefile b/drivers/md/raid6test/Makefile
index 78e0396adf2a..58ffdf4f5161 100644
--- a/drivers/md/raid6test/Makefile
+++ b/drivers/md/raid6test/Makefile
@@ -5,7 +5,7 @@
5 5
6CC = gcc 6CC = gcc
7OPTFLAGS = -O2 # Adjust as desired 7OPTFLAGS = -O2 # Adjust as desired
8CFLAGS = -I.. -g $(OPTFLAGS) 8CFLAGS = -I.. -I ../../../include -g $(OPTFLAGS)
9LD = ld 9LD = ld
10PERL = perl 10PERL = perl
11AR = ar 11AR = ar
diff --git a/drivers/md/raid6test/test.c b/drivers/md/raid6test/test.c
index 559cc41b2585..7a930318b17d 100644
--- a/drivers/md/raid6test/test.c
+++ b/drivers/md/raid6test/test.c
@@ -17,7 +17,7 @@
17#include <stdlib.h> 17#include <stdlib.h>
18#include <stdio.h> 18#include <stdio.h>
19#include <string.h> 19#include <string.h>
20#include "raid6.h" 20#include <linux/raid/pq.h>
21 21
22#define NDISKS 16 /* Including P and Q */ 22#define NDISKS 16 /* Including P and Q */
23 23
diff --git a/drivers/md/raid6x86.h b/drivers/md/raid6x86.h
index 99fea7a70ca7..4c22c1568558 100644
--- a/drivers/md/raid6x86.h
+++ b/drivers/md/raid6x86.h
@@ -5,7 +5,7 @@
5 * This program is free software; you can redistribute it and/or modify 5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by 6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation, Inc., 53 Temple Place Ste 330, 7 * the Free Software Foundation, Inc., 53 Temple Place Ste 330,
8 * Bostom MA 02111-1307, USA; either version 2 of the License, or 8 * Boston MA 02111-1307, USA; either version 2 of the License, or
9 * (at your option) any later version; incorporated herein by reference. 9 * (at your option) any later version; incorporated herein by reference.
10 * 10 *
11 * ----------------------------------------------------------------------- */ 11 * ----------------------------------------------------------------------- */