diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2009-04-03 12:08:19 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2009-04-03 12:08:19 -0400 |
commit | 223cdea4c4b5af5181b2da00ac85711d1e0c737c (patch) | |
tree | dfe7226c70ddabbf2e2e63924ba636345278e79c /drivers/md | |
parent | 31e6e2dac575c9d21a6ec56ca52ae89086baa705 (diff) | |
parent | c8f517c444e4f9f55b5b5ca202b8404691a35805 (diff) |
Merge branch 'for-linus' of git://neil.brown.name/md
* 'for-linus' of git://neil.brown.name/md: (53 commits)
md/raid5 revise rules for when to update metadata during reshape
md/raid5: minor code cleanups in make_request.
md: remove CONFIG_MD_RAID_RESHAPE config option.
md/raid5: be more careful about write ordering when reshaping.
md: don't display meaningless values in sysfs files resync_start and sync_speed
md/raid5: allow layout and chunksize to be changed on active array.
md/raid5: reshape using largest of old and new chunk size
md/raid5: prepare for allowing reshape to change layout
md/raid5: prepare for allowing reshape to change chunksize.
md/raid5: clearly differentiate 'before' and 'after' stripes during reshape.
Documentation/md.txt update
md: allow number of drives in raid5 to be reduced
md/raid5: change reshape-progress measurement to cope with reshaping backwards.
md: add explicit method to signal the end of a reshape.
md/raid5: enhance raid5_size to work correctly with negative delta_disks
md/raid5: drop qd_idx from r6_state
md/raid6: move raid6 data processing to raid6_pq.ko
md: raid5 run(): Fix max_degraded for raid level 4.
md: 'array_size' sysfs attribute
md: centralize ->array_sectors modifications
...
Diffstat (limited to 'drivers/md')
31 files changed, 3324 insertions, 837 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 2281b5098e95..36e0675be9f7 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig | |||
@@ -121,6 +121,7 @@ config MD_RAID10 | |||
121 | config MD_RAID456 | 121 | config MD_RAID456 |
122 | tristate "RAID-4/RAID-5/RAID-6 mode" | 122 | tristate "RAID-4/RAID-5/RAID-6 mode" |
123 | depends on BLK_DEV_MD | 123 | depends on BLK_DEV_MD |
124 | select MD_RAID6_PQ | ||
124 | select ASYNC_MEMCPY | 125 | select ASYNC_MEMCPY |
125 | select ASYNC_XOR | 126 | select ASYNC_XOR |
126 | ---help--- | 127 | ---help--- |
@@ -151,34 +152,8 @@ config MD_RAID456 | |||
151 | 152 | ||
152 | If unsure, say Y. | 153 | If unsure, say Y. |
153 | 154 | ||
154 | config MD_RAID5_RESHAPE | 155 | config MD_RAID6_PQ |
155 | bool "Support adding drives to a raid-5 array" | 156 | tristate |
156 | depends on MD_RAID456 | ||
157 | default y | ||
158 | ---help--- | ||
159 | A RAID-5 set can be expanded by adding extra drives. This | ||
160 | requires "restriping" the array which means (almost) every | ||
161 | block must be written to a different place. | ||
162 | |||
163 | This option allows such restriping to be done while the array | ||
164 | is online. | ||
165 | |||
166 | You will need mdadm version 2.4.1 or later to use this | ||
167 | feature safely. During the early stage of reshape there is | ||
168 | a critical section where live data is being over-written. A | ||
169 | crash during this time needs extra care for recovery. The | ||
170 | newer mdadm takes a copy of the data in the critical section | ||
171 | and will restore it, if necessary, after a crash. | ||
172 | |||
173 | The mdadm usage is e.g. | ||
174 | mdadm --grow /dev/md1 --raid-disks=6 | ||
175 | to grow '/dev/md1' to having 6 disks. | ||
176 | |||
177 | Note: The array can only be expanded, not contracted. | ||
178 | There should be enough spares already present to make the new | ||
179 | array workable. | ||
180 | |||
181 | If unsure, say Y. | ||
182 | 157 | ||
183 | config MD_MULTIPATH | 158 | config MD_MULTIPATH |
184 | tristate "Multipath I/O support" | 159 | tristate "Multipath I/O support" |
diff --git a/drivers/md/Makefile b/drivers/md/Makefile index 72880b7e28d9..45cc5951d928 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile | |||
@@ -2,20 +2,21 @@ | |||
2 | # Makefile for the kernel software RAID and LVM drivers. | 2 | # Makefile for the kernel software RAID and LVM drivers. |
3 | # | 3 | # |
4 | 4 | ||
5 | dm-mod-objs := dm.o dm-table.o dm-target.o dm-linear.o dm-stripe.o \ | 5 | dm-mod-y += dm.o dm-table.o dm-target.o dm-linear.o dm-stripe.o \ |
6 | dm-ioctl.o dm-io.o dm-kcopyd.o dm-sysfs.o | 6 | dm-ioctl.o dm-io.o dm-kcopyd.o dm-sysfs.o |
7 | dm-multipath-objs := dm-path-selector.o dm-mpath.o | 7 | dm-multipath-y += dm-path-selector.o dm-mpath.o |
8 | dm-snapshot-objs := dm-snap.o dm-exception-store.o dm-snap-transient.o \ | 8 | dm-snapshot-y += dm-snap.o dm-exception-store.o dm-snap-transient.o \ |
9 | dm-snap-persistent.o | 9 | dm-snap-persistent.o |
10 | dm-mirror-objs := dm-raid1.o | 10 | dm-mirror-y += dm-raid1.o |
11 | md-mod-objs := md.o bitmap.o | 11 | md-mod-y += md.o bitmap.o |
12 | raid456-objs := raid5.o raid6algos.o raid6recov.o raid6tables.o \ | 12 | raid456-y += raid5.o |
13 | raid6_pq-y += raid6algos.o raid6recov.o raid6tables.o \ | ||
13 | raid6int1.o raid6int2.o raid6int4.o \ | 14 | raid6int1.o raid6int2.o raid6int4.o \ |
14 | raid6int8.o raid6int16.o raid6int32.o \ | 15 | raid6int8.o raid6int16.o raid6int32.o \ |
15 | raid6altivec1.o raid6altivec2.o raid6altivec4.o \ | 16 | raid6altivec1.o raid6altivec2.o raid6altivec4.o \ |
16 | raid6altivec8.o \ | 17 | raid6altivec8.o \ |
17 | raid6mmx.o raid6sse1.o raid6sse2.o | 18 | raid6mmx.o raid6sse1.o raid6sse2.o |
18 | hostprogs-y := mktables | 19 | hostprogs-y += mktables |
19 | 20 | ||
20 | # Note: link order is important. All raid personalities | 21 | # Note: link order is important. All raid personalities |
21 | # and must come before md.o, as they each initialise | 22 | # and must come before md.o, as they each initialise |
@@ -26,6 +27,7 @@ obj-$(CONFIG_MD_LINEAR) += linear.o | |||
26 | obj-$(CONFIG_MD_RAID0) += raid0.o | 27 | obj-$(CONFIG_MD_RAID0) += raid0.o |
27 | obj-$(CONFIG_MD_RAID1) += raid1.o | 28 | obj-$(CONFIG_MD_RAID1) += raid1.o |
28 | obj-$(CONFIG_MD_RAID10) += raid10.o | 29 | obj-$(CONFIG_MD_RAID10) += raid10.o |
30 | obj-$(CONFIG_MD_RAID6_PQ) += raid6_pq.o | ||
29 | obj-$(CONFIG_MD_RAID456) += raid456.o | 31 | obj-$(CONFIG_MD_RAID456) += raid456.o |
30 | obj-$(CONFIG_MD_MULTIPATH) += multipath.o | 32 | obj-$(CONFIG_MD_MULTIPATH) += multipath.o |
31 | obj-$(CONFIG_MD_FAULTY) += faulty.o | 33 | obj-$(CONFIG_MD_FAULTY) += faulty.o |
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 719943763391..f8a9f7ab2cb8 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c | |||
@@ -16,6 +16,7 @@ | |||
16 | * wait if count gets too high, wake when it drops to half. | 16 | * wait if count gets too high, wake when it drops to half. |
17 | */ | 17 | */ |
18 | 18 | ||
19 | #include <linux/blkdev.h> | ||
19 | #include <linux/module.h> | 20 | #include <linux/module.h> |
20 | #include <linux/errno.h> | 21 | #include <linux/errno.h> |
21 | #include <linux/slab.h> | 22 | #include <linux/slab.h> |
@@ -26,8 +27,8 @@ | |||
26 | #include <linux/file.h> | 27 | #include <linux/file.h> |
27 | #include <linux/mount.h> | 28 | #include <linux/mount.h> |
28 | #include <linux/buffer_head.h> | 29 | #include <linux/buffer_head.h> |
29 | #include <linux/raid/md.h> | 30 | #include "md.h" |
30 | #include <linux/raid/bitmap.h> | 31 | #include "bitmap.h" |
31 | 32 | ||
32 | /* debug macros */ | 33 | /* debug macros */ |
33 | 34 | ||
@@ -111,9 +112,10 @@ static int bitmap_checkpage(struct bitmap *bitmap, unsigned long page, int creat | |||
111 | unsigned char *mappage; | 112 | unsigned char *mappage; |
112 | 113 | ||
113 | if (page >= bitmap->pages) { | 114 | if (page >= bitmap->pages) { |
114 | printk(KERN_ALERT | 115 | /* This can happen if bitmap_start_sync goes beyond |
115 | "%s: invalid bitmap page request: %lu (> %lu)\n", | 116 | * End-of-device while looking for a whole page. |
116 | bmname(bitmap), page, bitmap->pages-1); | 117 | * It is harmless. |
118 | */ | ||
117 | return -EINVAL; | 119 | return -EINVAL; |
118 | } | 120 | } |
119 | 121 | ||
@@ -265,7 +267,6 @@ static mdk_rdev_t *next_active_rdev(mdk_rdev_t *rdev, mddev_t *mddev) | |||
265 | list_for_each_continue_rcu(pos, &mddev->disks) { | 267 | list_for_each_continue_rcu(pos, &mddev->disks) { |
266 | rdev = list_entry(pos, mdk_rdev_t, same_set); | 268 | rdev = list_entry(pos, mdk_rdev_t, same_set); |
267 | if (rdev->raid_disk >= 0 && | 269 | if (rdev->raid_disk >= 0 && |
268 | test_bit(In_sync, &rdev->flags) && | ||
269 | !test_bit(Faulty, &rdev->flags)) { | 270 | !test_bit(Faulty, &rdev->flags)) { |
270 | /* this is a usable devices */ | 271 | /* this is a usable devices */ |
271 | atomic_inc(&rdev->nr_pending); | 272 | atomic_inc(&rdev->nr_pending); |
@@ -297,7 +298,7 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait) | |||
297 | + size/512 > 0) | 298 | + size/512 > 0) |
298 | /* bitmap runs in to metadata */ | 299 | /* bitmap runs in to metadata */ |
299 | goto bad_alignment; | 300 | goto bad_alignment; |
300 | if (rdev->data_offset + mddev->size*2 | 301 | if (rdev->data_offset + mddev->dev_sectors |
301 | > rdev->sb_start + bitmap->offset) | 302 | > rdev->sb_start + bitmap->offset) |
302 | /* data runs in to bitmap */ | 303 | /* data runs in to bitmap */ |
303 | goto bad_alignment; | 304 | goto bad_alignment; |
@@ -570,7 +571,7 @@ static int bitmap_read_sb(struct bitmap *bitmap) | |||
570 | else if (le32_to_cpu(sb->version) < BITMAP_MAJOR_LO || | 571 | else if (le32_to_cpu(sb->version) < BITMAP_MAJOR_LO || |
571 | le32_to_cpu(sb->version) > BITMAP_MAJOR_HI) | 572 | le32_to_cpu(sb->version) > BITMAP_MAJOR_HI) |
572 | reason = "unrecognized superblock version"; | 573 | reason = "unrecognized superblock version"; |
573 | else if (chunksize < PAGE_SIZE) | 574 | else if (chunksize < 512) |
574 | reason = "bitmap chunksize too small"; | 575 | reason = "bitmap chunksize too small"; |
575 | else if ((1 << ffz(~chunksize)) != chunksize) | 576 | else if ((1 << ffz(~chunksize)) != chunksize) |
576 | reason = "bitmap chunksize not a power of 2"; | 577 | reason = "bitmap chunksize not a power of 2"; |
@@ -1306,6 +1307,9 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto | |||
1306 | PRINTK(KERN_DEBUG "dec write-behind count %d/%d\n", | 1307 | PRINTK(KERN_DEBUG "dec write-behind count %d/%d\n", |
1307 | atomic_read(&bitmap->behind_writes), bitmap->max_write_behind); | 1308 | atomic_read(&bitmap->behind_writes), bitmap->max_write_behind); |
1308 | } | 1309 | } |
1310 | if (bitmap->mddev->degraded) | ||
1311 | /* Never clear bits or update events_cleared when degraded */ | ||
1312 | success = 0; | ||
1309 | 1313 | ||
1310 | while (sectors) { | 1314 | while (sectors) { |
1311 | int blocks; | 1315 | int blocks; |
@@ -1345,8 +1349,8 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto | |||
1345 | } | 1349 | } |
1346 | } | 1350 | } |
1347 | 1351 | ||
1348 | int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks, | 1352 | static int __bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks, |
1349 | int degraded) | 1353 | int degraded) |
1350 | { | 1354 | { |
1351 | bitmap_counter_t *bmc; | 1355 | bitmap_counter_t *bmc; |
1352 | int rv; | 1356 | int rv; |
@@ -1374,6 +1378,29 @@ int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks, | |||
1374 | return rv; | 1378 | return rv; |
1375 | } | 1379 | } |
1376 | 1380 | ||
1381 | int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks, | ||
1382 | int degraded) | ||
1383 | { | ||
1384 | /* bitmap_start_sync must always report on multiples of whole | ||
1385 | * pages, otherwise resync (which is very PAGE_SIZE based) will | ||
1386 | * get confused. | ||
1387 | * So call __bitmap_start_sync repeatedly (if needed) until | ||
1388 | * At least PAGE_SIZE>>9 blocks are covered. | ||
1389 | * Return the 'or' of the result. | ||
1390 | */ | ||
1391 | int rv = 0; | ||
1392 | int blocks1; | ||
1393 | |||
1394 | *blocks = 0; | ||
1395 | while (*blocks < (PAGE_SIZE>>9)) { | ||
1396 | rv |= __bitmap_start_sync(bitmap, offset, | ||
1397 | &blocks1, degraded); | ||
1398 | offset += blocks1; | ||
1399 | *blocks += blocks1; | ||
1400 | } | ||
1401 | return rv; | ||
1402 | } | ||
1403 | |||
1377 | void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, int *blocks, int aborted) | 1404 | void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, int *blocks, int aborted) |
1378 | { | 1405 | { |
1379 | bitmap_counter_t *bmc; | 1406 | bitmap_counter_t *bmc; |
@@ -1443,6 +1470,8 @@ void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector) | |||
1443 | wait_event(bitmap->mddev->recovery_wait, | 1470 | wait_event(bitmap->mddev->recovery_wait, |
1444 | atomic_read(&bitmap->mddev->recovery_active) == 0); | 1471 | atomic_read(&bitmap->mddev->recovery_active) == 0); |
1445 | 1472 | ||
1473 | bitmap->mddev->curr_resync_completed = bitmap->mddev->curr_resync; | ||
1474 | set_bit(MD_CHANGE_CLEAN, &bitmap->mddev->flags); | ||
1446 | sector &= ~((1ULL << CHUNK_BLOCK_SHIFT(bitmap)) - 1); | 1475 | sector &= ~((1ULL << CHUNK_BLOCK_SHIFT(bitmap)) - 1); |
1447 | s = 0; | 1476 | s = 0; |
1448 | while (s < sector && s < bitmap->mddev->resync_max_sectors) { | 1477 | while (s < sector && s < bitmap->mddev->resync_max_sectors) { |
diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h new file mode 100644 index 000000000000..e98900671ca9 --- /dev/null +++ b/drivers/md/bitmap.h | |||
@@ -0,0 +1,288 @@ | |||
1 | /* | ||
2 | * bitmap.h: Copyright (C) Peter T. Breuer (ptb@ot.uc3m.es) 2003 | ||
3 | * | ||
4 | * additions: Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc. | ||
5 | */ | ||
6 | #ifndef BITMAP_H | ||
7 | #define BITMAP_H 1 | ||
8 | |||
9 | #define BITMAP_MAJOR_LO 3 | ||
10 | /* version 4 insists the bitmap is in little-endian order | ||
11 | * with version 3, it is host-endian which is non-portable | ||
12 | */ | ||
13 | #define BITMAP_MAJOR_HI 4 | ||
14 | #define BITMAP_MAJOR_HOSTENDIAN 3 | ||
15 | |||
16 | #define BITMAP_MINOR 39 | ||
17 | |||
18 | /* | ||
19 | * in-memory bitmap: | ||
20 | * | ||
21 | * Use 16 bit block counters to track pending writes to each "chunk". | ||
22 | * The 2 high order bits are special-purpose, the first is a flag indicating | ||
23 | * whether a resync is needed. The second is a flag indicating whether a | ||
24 | * resync is active. | ||
25 | * This means that the counter is actually 14 bits: | ||
26 | * | ||
27 | * +--------+--------+------------------------------------------------+ | ||
28 | * | resync | resync | counter | | ||
29 | * | needed | active | | | ||
30 | * | (0-1) | (0-1) | (0-16383) | | ||
31 | * +--------+--------+------------------------------------------------+ | ||
32 | * | ||
33 | * The "resync needed" bit is set when: | ||
34 | * a '1' bit is read from storage at startup. | ||
35 | * a write request fails on some drives | ||
36 | * a resync is aborted on a chunk with 'resync active' set | ||
37 | * It is cleared (and resync-active set) when a resync starts across all drives | ||
38 | * of the chunk. | ||
39 | * | ||
40 | * | ||
41 | * The "resync active" bit is set when: | ||
42 | * a resync is started on all drives, and resync_needed is set. | ||
43 | * resync_needed will be cleared (as long as resync_active wasn't already set). | ||
44 | * It is cleared when a resync completes. | ||
45 | * | ||
46 | * The counter counts pending write requests, plus the on-disk bit. | ||
47 | * When the counter is '1' and the resync bits are clear, the on-disk | ||
48 | * bit can be cleared as well, thus setting the counter to 0. | ||
49 | * When we set a bit, or in the counter (to start a write), if the field is | ||
50 | * 0, we first set the disk bit and set the counter to 1. | ||
51 | * | ||
52 | * If the counter is 0, the on-disk bit is clear and the stripe is clean | ||
53 | * Anything that dirties the stripe pushes the counter to 2 (at least) | ||
54 | * and sets the on-disk bit (lazily). | ||
55 | * If a periodic sweep finds the counter at 2, it is decremented to 1. | ||
56 | * If the sweep finds the counter at 1, the on-disk bit is cleared and the | ||
57 | * counter goes to zero. | ||
58 | * | ||
59 | * Also, we'll hijack the "map" pointer itself and use it as two 16 bit block | ||
60 | * counters as a fallback when "page" memory cannot be allocated: | ||
61 | * | ||
62 | * Normal case (page memory allocated): | ||
63 | * | ||
64 | * page pointer (32-bit) | ||
65 | * | ||
66 | * [ ] ------+ | ||
67 | * | | ||
68 | * +-------> [ ][ ]..[ ] (4096 byte page == 2048 counters) | ||
69 | * c1 c2 c2048 | ||
70 | * | ||
71 | * Hijacked case (page memory allocation failed): | ||
72 | * | ||
73 | * hijacked page pointer (32-bit) | ||
74 | * | ||
75 | * [ ][ ] (no page memory allocated) | ||
76 | * counter #1 (16-bit) counter #2 (16-bit) | ||
77 | * | ||
78 | */ | ||
79 | |||
80 | #ifdef __KERNEL__ | ||
81 | |||
82 | #define PAGE_BITS (PAGE_SIZE << 3) | ||
83 | #define PAGE_BIT_SHIFT (PAGE_SHIFT + 3) | ||
84 | |||
85 | typedef __u16 bitmap_counter_t; | ||
86 | #define COUNTER_BITS 16 | ||
87 | #define COUNTER_BIT_SHIFT 4 | ||
88 | #define COUNTER_BYTE_RATIO (COUNTER_BITS / 8) | ||
89 | #define COUNTER_BYTE_SHIFT (COUNTER_BIT_SHIFT - 3) | ||
90 | |||
91 | #define NEEDED_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 1))) | ||
92 | #define RESYNC_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 2))) | ||
93 | #define COUNTER_MAX ((bitmap_counter_t) RESYNC_MASK - 1) | ||
94 | #define NEEDED(x) (((bitmap_counter_t) x) & NEEDED_MASK) | ||
95 | #define RESYNC(x) (((bitmap_counter_t) x) & RESYNC_MASK) | ||
96 | #define COUNTER(x) (((bitmap_counter_t) x) & COUNTER_MAX) | ||
97 | |||
98 | /* how many counters per page? */ | ||
99 | #define PAGE_COUNTER_RATIO (PAGE_BITS / COUNTER_BITS) | ||
100 | /* same, except a shift value for more efficient bitops */ | ||
101 | #define PAGE_COUNTER_SHIFT (PAGE_BIT_SHIFT - COUNTER_BIT_SHIFT) | ||
102 | /* same, except a mask value for more efficient bitops */ | ||
103 | #define PAGE_COUNTER_MASK (PAGE_COUNTER_RATIO - 1) | ||
104 | |||
105 | #define BITMAP_BLOCK_SIZE 512 | ||
106 | #define BITMAP_BLOCK_SHIFT 9 | ||
107 | |||
108 | /* how many blocks per chunk? (this is variable) */ | ||
109 | #define CHUNK_BLOCK_RATIO(bitmap) ((bitmap)->chunksize >> BITMAP_BLOCK_SHIFT) | ||
110 | #define CHUNK_BLOCK_SHIFT(bitmap) ((bitmap)->chunkshift - BITMAP_BLOCK_SHIFT) | ||
111 | #define CHUNK_BLOCK_MASK(bitmap) (CHUNK_BLOCK_RATIO(bitmap) - 1) | ||
112 | |||
113 | /* when hijacked, the counters and bits represent even larger "chunks" */ | ||
114 | /* there will be 1024 chunks represented by each counter in the page pointers */ | ||
115 | #define PAGEPTR_BLOCK_RATIO(bitmap) \ | ||
116 | (CHUNK_BLOCK_RATIO(bitmap) << PAGE_COUNTER_SHIFT >> 1) | ||
117 | #define PAGEPTR_BLOCK_SHIFT(bitmap) \ | ||
118 | (CHUNK_BLOCK_SHIFT(bitmap) + PAGE_COUNTER_SHIFT - 1) | ||
119 | #define PAGEPTR_BLOCK_MASK(bitmap) (PAGEPTR_BLOCK_RATIO(bitmap) - 1) | ||
120 | |||
121 | /* | ||
122 | * on-disk bitmap: | ||
123 | * | ||
124 | * Use one bit per "chunk" (block set). We do the disk I/O on the bitmap | ||
125 | * file a page at a time. There's a superblock at the start of the file. | ||
126 | */ | ||
127 | |||
128 | /* map chunks (bits) to file pages - offset by the size of the superblock */ | ||
129 | #define CHUNK_BIT_OFFSET(chunk) ((chunk) + (sizeof(bitmap_super_t) << 3)) | ||
130 | |||
131 | #endif | ||
132 | |||
133 | /* | ||
134 | * bitmap structures: | ||
135 | */ | ||
136 | |||
137 | #define BITMAP_MAGIC 0x6d746962 | ||
138 | |||
139 | /* use these for bitmap->flags and bitmap->sb->state bit-fields */ | ||
140 | enum bitmap_state { | ||
141 | BITMAP_STALE = 0x002, /* the bitmap file is out of date or had -EIO */ | ||
142 | BITMAP_WRITE_ERROR = 0x004, /* A write error has occurred */ | ||
143 | BITMAP_HOSTENDIAN = 0x8000, | ||
144 | }; | ||
145 | |||
146 | /* the superblock at the front of the bitmap file -- little endian */ | ||
147 | typedef struct bitmap_super_s { | ||
148 | __le32 magic; /* 0 BITMAP_MAGIC */ | ||
149 | __le32 version; /* 4 the bitmap major for now, could change... */ | ||
150 | __u8 uuid[16]; /* 8 128 bit uuid - must match md device uuid */ | ||
151 | __le64 events; /* 24 event counter for the bitmap (1)*/ | ||
152 | __le64 events_cleared;/*32 event counter when last bit cleared (2) */ | ||
153 | __le64 sync_size; /* 40 the size of the md device's sync range(3) */ | ||
154 | __le32 state; /* 48 bitmap state information */ | ||
155 | __le32 chunksize; /* 52 the bitmap chunk size in bytes */ | ||
156 | __le32 daemon_sleep; /* 56 seconds between disk flushes */ | ||
157 | __le32 write_behind; /* 60 number of outstanding write-behind writes */ | ||
158 | |||
159 | __u8 pad[256 - 64]; /* set to zero */ | ||
160 | } bitmap_super_t; | ||
161 | |||
162 | /* notes: | ||
163 | * (1) This event counter is updated before the eventcounter in the md superblock | ||
164 | * When a bitmap is loaded, it is only accepted if this event counter is equal | ||
165 | * to, or one greater than, the event counter in the superblock. | ||
166 | * (2) This event counter is updated when the other one is *if*and*only*if* the | ||
167 | * array is not degraded. As bits are not cleared when the array is degraded, | ||
168 | * this represents the last time that any bits were cleared. | ||
169 | * If a device is being added that has an event count with this value or | ||
170 | * higher, it is accepted as conforming to the bitmap. | ||
171 | * (3)This is the number of sectors represented by the bitmap, and is the range that | ||
172 | * resync happens across. For raid1 and raid5/6 it is the size of individual | ||
173 | * devices. For raid10 it is the size of the array. | ||
174 | */ | ||
175 | |||
176 | #ifdef __KERNEL__ | ||
177 | |||
178 | /* the in-memory bitmap is represented by bitmap_pages */ | ||
179 | struct bitmap_page { | ||
180 | /* | ||
181 | * map points to the actual memory page | ||
182 | */ | ||
183 | char *map; | ||
184 | /* | ||
185 | * in emergencies (when map cannot be alloced), hijack the map | ||
186 | * pointer and use it as two counters itself | ||
187 | */ | ||
188 | unsigned int hijacked:1; | ||
189 | /* | ||
190 | * count of dirty bits on the page | ||
191 | */ | ||
192 | unsigned int count:31; | ||
193 | }; | ||
194 | |||
195 | /* keep track of bitmap file pages that have pending writes on them */ | ||
196 | struct page_list { | ||
197 | struct list_head list; | ||
198 | struct page *page; | ||
199 | }; | ||
200 | |||
201 | /* the main bitmap structure - one per mddev */ | ||
202 | struct bitmap { | ||
203 | struct bitmap_page *bp; | ||
204 | unsigned long pages; /* total number of pages in the bitmap */ | ||
205 | unsigned long missing_pages; /* number of pages not yet allocated */ | ||
206 | |||
207 | mddev_t *mddev; /* the md device that the bitmap is for */ | ||
208 | |||
209 | int counter_bits; /* how many bits per block counter */ | ||
210 | |||
211 | /* bitmap chunksize -- how much data does each bit represent? */ | ||
212 | unsigned long chunksize; | ||
213 | unsigned long chunkshift; /* chunksize = 2^chunkshift (for bitops) */ | ||
214 | unsigned long chunks; /* total number of data chunks for the array */ | ||
215 | |||
216 | /* We hold a count on the chunk currently being synced, and drop | ||
217 | * it when the last block is started. If the resync is aborted | ||
218 | * midway, we need to be able to drop that count, so we remember | ||
219 | * the counted chunk.. | ||
220 | */ | ||
221 | unsigned long syncchunk; | ||
222 | |||
223 | __u64 events_cleared; | ||
224 | int need_sync; | ||
225 | |||
226 | /* bitmap spinlock */ | ||
227 | spinlock_t lock; | ||
228 | |||
229 | long offset; /* offset from superblock if file is NULL */ | ||
230 | struct file *file; /* backing disk file */ | ||
231 | struct page *sb_page; /* cached copy of the bitmap file superblock */ | ||
232 | struct page **filemap; /* list of cache pages for the file */ | ||
233 | unsigned long *filemap_attr; /* attributes associated w/ filemap pages */ | ||
234 | unsigned long file_pages; /* number of pages in the file */ | ||
235 | int last_page_size; /* bytes in the last page */ | ||
236 | |||
237 | unsigned long flags; | ||
238 | |||
239 | int allclean; | ||
240 | |||
241 | unsigned long max_write_behind; /* write-behind mode */ | ||
242 | atomic_t behind_writes; | ||
243 | |||
244 | /* | ||
245 | * the bitmap daemon - periodically wakes up and sweeps the bitmap | ||
246 | * file, cleaning up bits and flushing out pages to disk as necessary | ||
247 | */ | ||
248 | unsigned long daemon_lastrun; /* jiffies of last run */ | ||
249 | unsigned long daemon_sleep; /* how many seconds between updates? */ | ||
250 | unsigned long last_end_sync; /* when we last called end_sync to | ||
251 | * update bitmap with resync progress */ | ||
252 | |||
253 | atomic_t pending_writes; /* pending writes to the bitmap file */ | ||
254 | wait_queue_head_t write_wait; | ||
255 | wait_queue_head_t overflow_wait; | ||
256 | |||
257 | }; | ||
258 | |||
259 | /* the bitmap API */ | ||
260 | |||
261 | /* these are used only by md/bitmap */ | ||
262 | int bitmap_create(mddev_t *mddev); | ||
263 | void bitmap_flush(mddev_t *mddev); | ||
264 | void bitmap_destroy(mddev_t *mddev); | ||
265 | |||
266 | void bitmap_print_sb(struct bitmap *bitmap); | ||
267 | void bitmap_update_sb(struct bitmap *bitmap); | ||
268 | |||
269 | int bitmap_setallbits(struct bitmap *bitmap); | ||
270 | void bitmap_write_all(struct bitmap *bitmap); | ||
271 | |||
272 | void bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long e); | ||
273 | |||
274 | /* these are exported */ | ||
275 | int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, | ||
276 | unsigned long sectors, int behind); | ||
277 | void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, | ||
278 | unsigned long sectors, int success, int behind); | ||
279 | int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks, int degraded); | ||
280 | void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, int *blocks, int aborted); | ||
281 | void bitmap_close_sync(struct bitmap *bitmap); | ||
282 | void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector); | ||
283 | |||
284 | void bitmap_unplug(struct bitmap *bitmap); | ||
285 | void bitmap_daemon_work(struct bitmap *bitmap); | ||
286 | #endif | ||
287 | |||
288 | #endif | ||
diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c index 86d9adf90e79..8695809b24b0 100644 --- a/drivers/md/faulty.c +++ b/drivers/md/faulty.c | |||
@@ -62,7 +62,10 @@ | |||
62 | #define ModeShift 5 | 62 | #define ModeShift 5 |
63 | 63 | ||
64 | #define MaxFault 50 | 64 | #define MaxFault 50 |
65 | #include <linux/raid/md.h> | 65 | #include <linux/blkdev.h> |
66 | #include <linux/raid/md_u.h> | ||
67 | #include "md.h" | ||
68 | #include <linux/seq_file.h> | ||
66 | 69 | ||
67 | 70 | ||
68 | static void faulty_fail(struct bio *bio, int error) | 71 | static void faulty_fail(struct bio *bio, int error) |
@@ -280,6 +283,17 @@ static int reconfig(mddev_t *mddev, int layout, int chunk_size) | |||
280 | return 0; | 283 | return 0; |
281 | } | 284 | } |
282 | 285 | ||
286 | static sector_t faulty_size(mddev_t *mddev, sector_t sectors, int raid_disks) | ||
287 | { | ||
288 | WARN_ONCE(raid_disks, | ||
289 | "%s does not support generic reshape\n", __func__); | ||
290 | |||
291 | if (sectors == 0) | ||
292 | return mddev->dev_sectors; | ||
293 | |||
294 | return sectors; | ||
295 | } | ||
296 | |||
283 | static int run(mddev_t *mddev) | 297 | static int run(mddev_t *mddev) |
284 | { | 298 | { |
285 | mdk_rdev_t *rdev; | 299 | mdk_rdev_t *rdev; |
@@ -298,7 +312,7 @@ static int run(mddev_t *mddev) | |||
298 | list_for_each_entry(rdev, &mddev->disks, same_set) | 312 | list_for_each_entry(rdev, &mddev->disks, same_set) |
299 | conf->rdev = rdev; | 313 | conf->rdev = rdev; |
300 | 314 | ||
301 | mddev->array_sectors = mddev->size * 2; | 315 | md_set_array_sectors(mddev, faulty_size(mddev, 0, 0)); |
302 | mddev->private = conf; | 316 | mddev->private = conf; |
303 | 317 | ||
304 | reconfig(mddev, mddev->layout, -1); | 318 | reconfig(mddev, mddev->layout, -1); |
@@ -325,6 +339,7 @@ static struct mdk_personality faulty_personality = | |||
325 | .stop = stop, | 339 | .stop = stop, |
326 | .status = status, | 340 | .status = status, |
327 | .reconfig = reconfig, | 341 | .reconfig = reconfig, |
342 | .size = faulty_size, | ||
328 | }; | 343 | }; |
329 | 344 | ||
330 | static int __init raid_init(void) | 345 | static int __init raid_init(void) |
diff --git a/drivers/md/linear.c b/drivers/md/linear.c index 09658b218474..7a36e38393a1 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c | |||
@@ -16,7 +16,11 @@ | |||
16 | Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | 16 | Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. |
17 | */ | 17 | */ |
18 | 18 | ||
19 | #include <linux/raid/linear.h> | 19 | #include <linux/blkdev.h> |
20 | #include <linux/raid/md_u.h> | ||
21 | #include <linux/seq_file.h> | ||
22 | #include "md.h" | ||
23 | #include "linear.h" | ||
20 | 24 | ||
21 | /* | 25 | /* |
22 | * find which device holds a particular offset | 26 | * find which device holds a particular offset |
@@ -97,6 +101,16 @@ static int linear_congested(void *data, int bits) | |||
97 | return ret; | 101 | return ret; |
98 | } | 102 | } |
99 | 103 | ||
104 | static sector_t linear_size(mddev_t *mddev, sector_t sectors, int raid_disks) | ||
105 | { | ||
106 | linear_conf_t *conf = mddev_to_conf(mddev); | ||
107 | |||
108 | WARN_ONCE(sectors || raid_disks, | ||
109 | "%s does not support generic reshape\n", __func__); | ||
110 | |||
111 | return conf->array_sectors; | ||
112 | } | ||
113 | |||
100 | static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks) | 114 | static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks) |
101 | { | 115 | { |
102 | linear_conf_t *conf; | 116 | linear_conf_t *conf; |
@@ -135,8 +149,8 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks) | |||
135 | mddev->queue->max_sectors > (PAGE_SIZE>>9)) | 149 | mddev->queue->max_sectors > (PAGE_SIZE>>9)) |
136 | blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); | 150 | blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); |
137 | 151 | ||
138 | disk->num_sectors = rdev->size * 2; | 152 | disk->num_sectors = rdev->sectors; |
139 | conf->array_sectors += rdev->size * 2; | 153 | conf->array_sectors += rdev->sectors; |
140 | 154 | ||
141 | cnt++; | 155 | cnt++; |
142 | } | 156 | } |
@@ -249,7 +263,7 @@ static int linear_run (mddev_t *mddev) | |||
249 | if (!conf) | 263 | if (!conf) |
250 | return 1; | 264 | return 1; |
251 | mddev->private = conf; | 265 | mddev->private = conf; |
252 | mddev->array_sectors = conf->array_sectors; | 266 | md_set_array_sectors(mddev, linear_size(mddev, 0, 0)); |
253 | 267 | ||
254 | blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec); | 268 | blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec); |
255 | mddev->queue->unplug_fn = linear_unplug; | 269 | mddev->queue->unplug_fn = linear_unplug; |
@@ -283,7 +297,7 @@ static int linear_add(mddev_t *mddev, mdk_rdev_t *rdev) | |||
283 | newconf->prev = mddev_to_conf(mddev); | 297 | newconf->prev = mddev_to_conf(mddev); |
284 | mddev->private = newconf; | 298 | mddev->private = newconf; |
285 | mddev->raid_disks++; | 299 | mddev->raid_disks++; |
286 | mddev->array_sectors = newconf->array_sectors; | 300 | md_set_array_sectors(mddev, linear_size(mddev, 0, 0)); |
287 | set_capacity(mddev->gendisk, mddev->array_sectors); | 301 | set_capacity(mddev->gendisk, mddev->array_sectors); |
288 | return 0; | 302 | return 0; |
289 | } | 303 | } |
@@ -381,6 +395,7 @@ static struct mdk_personality linear_personality = | |||
381 | .stop = linear_stop, | 395 | .stop = linear_stop, |
382 | .status = linear_status, | 396 | .status = linear_status, |
383 | .hot_add_disk = linear_add, | 397 | .hot_add_disk = linear_add, |
398 | .size = linear_size, | ||
384 | }; | 399 | }; |
385 | 400 | ||
386 | static int __init linear_init (void) | 401 | static int __init linear_init (void) |
diff --git a/drivers/md/linear.h b/drivers/md/linear.h new file mode 100644 index 000000000000..bf8179587f95 --- /dev/null +++ b/drivers/md/linear.h | |||
@@ -0,0 +1,29 @@ | |||
1 | #ifndef _LINEAR_H | ||
2 | #define _LINEAR_H | ||
3 | |||
4 | struct dev_info { | ||
5 | mdk_rdev_t *rdev; | ||
6 | sector_t num_sectors; | ||
7 | sector_t start_sector; | ||
8 | }; | ||
9 | |||
10 | typedef struct dev_info dev_info_t; | ||
11 | |||
12 | struct linear_private_data | ||
13 | { | ||
14 | struct linear_private_data *prev; /* earlier version */ | ||
15 | dev_info_t **hash_table; | ||
16 | sector_t spacing; | ||
17 | sector_t array_sectors; | ||
18 | int sector_shift; /* shift before dividing | ||
19 | * by spacing | ||
20 | */ | ||
21 | dev_info_t disks[0]; | ||
22 | }; | ||
23 | |||
24 | |||
25 | typedef struct linear_private_data linear_conf_t; | ||
26 | |||
27 | #define mddev_to_conf(mddev) ((linear_conf_t *) mddev->private) | ||
28 | |||
29 | #endif | ||
diff --git a/drivers/md/md.c b/drivers/md/md.c index a307f87eb90e..ed5727c089a9 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c | |||
@@ -33,9 +33,9 @@ | |||
33 | */ | 33 | */ |
34 | 34 | ||
35 | #include <linux/kthread.h> | 35 | #include <linux/kthread.h> |
36 | #include <linux/raid/md.h> | 36 | #include <linux/blkdev.h> |
37 | #include <linux/raid/bitmap.h> | ||
38 | #include <linux/sysctl.h> | 37 | #include <linux/sysctl.h> |
38 | #include <linux/seq_file.h> | ||
39 | #include <linux/buffer_head.h> /* for invalidate_bdev */ | 39 | #include <linux/buffer_head.h> /* for invalidate_bdev */ |
40 | #include <linux/poll.h> | 40 | #include <linux/poll.h> |
41 | #include <linux/ctype.h> | 41 | #include <linux/ctype.h> |
@@ -45,11 +45,10 @@ | |||
45 | #include <linux/reboot.h> | 45 | #include <linux/reboot.h> |
46 | #include <linux/file.h> | 46 | #include <linux/file.h> |
47 | #include <linux/delay.h> | 47 | #include <linux/delay.h> |
48 | 48 | #include <linux/raid/md_p.h> | |
49 | #define MAJOR_NR MD_MAJOR | 49 | #include <linux/raid/md_u.h> |
50 | 50 | #include "md.h" | |
51 | /* 63 partitions with the alternate major number (mdp) */ | 51 | #include "bitmap.h" |
52 | #define MdpMinorShift 6 | ||
53 | 52 | ||
54 | #define DEBUG 0 | 53 | #define DEBUG 0 |
55 | #define dprintk(x...) ((void)(DEBUG && printk(x))) | 54 | #define dprintk(x...) ((void)(DEBUG && printk(x))) |
@@ -202,12 +201,68 @@ static DEFINE_SPINLOCK(all_mddevs_lock); | |||
202 | ) | 201 | ) |
203 | 202 | ||
204 | 203 | ||
205 | static int md_fail_request(struct request_queue *q, struct bio *bio) | 204 | /* Rather than calling directly into the personality make_request function, |
205 | * IO requests come here first so that we can check if the device is | ||
206 | * being suspended pending a reconfiguration. | ||
207 | * We hold a refcount over the call to ->make_request. By the time that | ||
208 | * call has finished, the bio has been linked into some internal structure | ||
209 | * and so is visible to ->quiesce(), so we don't need the refcount any more. | ||
210 | */ | ||
211 | static int md_make_request(struct request_queue *q, struct bio *bio) | ||
206 | { | 212 | { |
207 | bio_io_error(bio); | 213 | mddev_t *mddev = q->queuedata; |
208 | return 0; | 214 | int rv; |
215 | if (mddev == NULL || mddev->pers == NULL) { | ||
216 | bio_io_error(bio); | ||
217 | return 0; | ||
218 | } | ||
219 | rcu_read_lock(); | ||
220 | if (mddev->suspended) { | ||
221 | DEFINE_WAIT(__wait); | ||
222 | for (;;) { | ||
223 | prepare_to_wait(&mddev->sb_wait, &__wait, | ||
224 | TASK_UNINTERRUPTIBLE); | ||
225 | if (!mddev->suspended) | ||
226 | break; | ||
227 | rcu_read_unlock(); | ||
228 | schedule(); | ||
229 | rcu_read_lock(); | ||
230 | } | ||
231 | finish_wait(&mddev->sb_wait, &__wait); | ||
232 | } | ||
233 | atomic_inc(&mddev->active_io); | ||
234 | rcu_read_unlock(); | ||
235 | rv = mddev->pers->make_request(q, bio); | ||
236 | if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended) | ||
237 | wake_up(&mddev->sb_wait); | ||
238 | |||
239 | return rv; | ||
240 | } | ||
241 | |||
242 | static void mddev_suspend(mddev_t *mddev) | ||
243 | { | ||
244 | BUG_ON(mddev->suspended); | ||
245 | mddev->suspended = 1; | ||
246 | synchronize_rcu(); | ||
247 | wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0); | ||
248 | mddev->pers->quiesce(mddev, 1); | ||
249 | md_unregister_thread(mddev->thread); | ||
250 | mddev->thread = NULL; | ||
251 | /* we now know that no code is executing in the personality module, | ||
252 | * except possibly the tail end of a ->bi_end_io function, but that | ||
253 | * is certain to complete before the module has a chance to get | ||
254 | * unloaded | ||
255 | */ | ||
256 | } | ||
257 | |||
258 | static void mddev_resume(mddev_t *mddev) | ||
259 | { | ||
260 | mddev->suspended = 0; | ||
261 | wake_up(&mddev->sb_wait); | ||
262 | mddev->pers->quiesce(mddev, 0); | ||
209 | } | 263 | } |
210 | 264 | ||
265 | |||
211 | static inline mddev_t *mddev_get(mddev_t *mddev) | 266 | static inline mddev_t *mddev_get(mddev_t *mddev) |
212 | { | 267 | { |
213 | atomic_inc(&mddev->active); | 268 | atomic_inc(&mddev->active); |
@@ -310,6 +365,7 @@ static mddev_t * mddev_find(dev_t unit) | |||
310 | init_timer(&new->safemode_timer); | 365 | init_timer(&new->safemode_timer); |
311 | atomic_set(&new->active, 1); | 366 | atomic_set(&new->active, 1); |
312 | atomic_set(&new->openers, 0); | 367 | atomic_set(&new->openers, 0); |
368 | atomic_set(&new->active_io, 0); | ||
313 | spin_lock_init(&new->write_lock); | 369 | spin_lock_init(&new->write_lock); |
314 | init_waitqueue_head(&new->sb_wait); | 370 | init_waitqueue_head(&new->sb_wait); |
315 | init_waitqueue_head(&new->recovery_wait); | 371 | init_waitqueue_head(&new->recovery_wait); |
@@ -326,6 +382,11 @@ static inline int mddev_lock(mddev_t * mddev) | |||
326 | return mutex_lock_interruptible(&mddev->reconfig_mutex); | 382 | return mutex_lock_interruptible(&mddev->reconfig_mutex); |
327 | } | 383 | } |
328 | 384 | ||
385 | static inline int mddev_is_locked(mddev_t *mddev) | ||
386 | { | ||
387 | return mutex_is_locked(&mddev->reconfig_mutex); | ||
388 | } | ||
389 | |||
329 | static inline int mddev_trylock(mddev_t * mddev) | 390 | static inline int mddev_trylock(mddev_t * mddev) |
330 | { | 391 | { |
331 | return mutex_trylock(&mddev->reconfig_mutex); | 392 | return mutex_trylock(&mddev->reconfig_mutex); |
@@ -409,7 +470,7 @@ static void free_disk_sb(mdk_rdev_t * rdev) | |||
409 | rdev->sb_loaded = 0; | 470 | rdev->sb_loaded = 0; |
410 | rdev->sb_page = NULL; | 471 | rdev->sb_page = NULL; |
411 | rdev->sb_start = 0; | 472 | rdev->sb_start = 0; |
412 | rdev->size = 0; | 473 | rdev->sectors = 0; |
413 | } | 474 | } |
414 | } | 475 | } |
415 | 476 | ||
@@ -775,9 +836,9 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version | |||
775 | else | 836 | else |
776 | ret = 0; | 837 | ret = 0; |
777 | } | 838 | } |
778 | rdev->size = calc_num_sectors(rdev, sb->chunk_size) / 2; | 839 | rdev->sectors = calc_num_sectors(rdev, sb->chunk_size); |
779 | 840 | ||
780 | if (rdev->size < sb->size && sb->level > 1) | 841 | if (rdev->sectors < sb->size * 2 && sb->level > 1) |
781 | /* "this cannot possibly happen" ... */ | 842 | /* "this cannot possibly happen" ... */ |
782 | ret = -EINVAL; | 843 | ret = -EINVAL; |
783 | 844 | ||
@@ -812,7 +873,7 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) | |||
812 | mddev->clevel[0] = 0; | 873 | mddev->clevel[0] = 0; |
813 | mddev->layout = sb->layout; | 874 | mddev->layout = sb->layout; |
814 | mddev->raid_disks = sb->raid_disks; | 875 | mddev->raid_disks = sb->raid_disks; |
815 | mddev->size = sb->size; | 876 | mddev->dev_sectors = sb->size * 2; |
816 | mddev->events = ev1; | 877 | mddev->events = ev1; |
817 | mddev->bitmap_offset = 0; | 878 | mddev->bitmap_offset = 0; |
818 | mddev->default_bitmap_offset = MD_SB_BYTES >> 9; | 879 | mddev->default_bitmap_offset = MD_SB_BYTES >> 9; |
@@ -926,7 +987,7 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) | |||
926 | 987 | ||
927 | sb->ctime = mddev->ctime; | 988 | sb->ctime = mddev->ctime; |
928 | sb->level = mddev->level; | 989 | sb->level = mddev->level; |
929 | sb->size = mddev->size; | 990 | sb->size = mddev->dev_sectors / 2; |
930 | sb->raid_disks = mddev->raid_disks; | 991 | sb->raid_disks = mddev->raid_disks; |
931 | sb->md_minor = mddev->md_minor; | 992 | sb->md_minor = mddev->md_minor; |
932 | sb->not_persistent = 0; | 993 | sb->not_persistent = 0; |
@@ -1024,7 +1085,7 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) | |||
1024 | static unsigned long long | 1085 | static unsigned long long |
1025 | super_90_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors) | 1086 | super_90_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors) |
1026 | { | 1087 | { |
1027 | if (num_sectors && num_sectors < rdev->mddev->size * 2) | 1088 | if (num_sectors && num_sectors < rdev->mddev->dev_sectors) |
1028 | return 0; /* component must fit device */ | 1089 | return 0; /* component must fit device */ |
1029 | if (rdev->mddev->bitmap_offset) | 1090 | if (rdev->mddev->bitmap_offset) |
1030 | return 0; /* can't move bitmap */ | 1091 | return 0; /* can't move bitmap */ |
@@ -1180,16 +1241,17 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) | |||
1180 | ret = 0; | 1241 | ret = 0; |
1181 | } | 1242 | } |
1182 | if (minor_version) | 1243 | if (minor_version) |
1183 | rdev->size = ((rdev->bdev->bd_inode->i_size>>9) - le64_to_cpu(sb->data_offset)) / 2; | 1244 | rdev->sectors = (rdev->bdev->bd_inode->i_size >> 9) - |
1245 | le64_to_cpu(sb->data_offset); | ||
1184 | else | 1246 | else |
1185 | rdev->size = rdev->sb_start / 2; | 1247 | rdev->sectors = rdev->sb_start; |
1186 | if (rdev->size < le64_to_cpu(sb->data_size)/2) | 1248 | if (rdev->sectors < le64_to_cpu(sb->data_size)) |
1187 | return -EINVAL; | 1249 | return -EINVAL; |
1188 | rdev->size = le64_to_cpu(sb->data_size)/2; | 1250 | rdev->sectors = le64_to_cpu(sb->data_size); |
1189 | if (le32_to_cpu(sb->chunksize)) | 1251 | if (le32_to_cpu(sb->chunksize)) |
1190 | rdev->size &= ~((sector_t)le32_to_cpu(sb->chunksize)/2 - 1); | 1252 | rdev->sectors &= ~((sector_t)le32_to_cpu(sb->chunksize) - 1); |
1191 | 1253 | ||
1192 | if (le64_to_cpu(sb->size) > rdev->size*2) | 1254 | if (le64_to_cpu(sb->size) > rdev->sectors) |
1193 | return -EINVAL; | 1255 | return -EINVAL; |
1194 | return ret; | 1256 | return ret; |
1195 | } | 1257 | } |
@@ -1216,7 +1278,7 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) | |||
1216 | mddev->clevel[0] = 0; | 1278 | mddev->clevel[0] = 0; |
1217 | mddev->layout = le32_to_cpu(sb->layout); | 1279 | mddev->layout = le32_to_cpu(sb->layout); |
1218 | mddev->raid_disks = le32_to_cpu(sb->raid_disks); | 1280 | mddev->raid_disks = le32_to_cpu(sb->raid_disks); |
1219 | mddev->size = le64_to_cpu(sb->size)/2; | 1281 | mddev->dev_sectors = le64_to_cpu(sb->size); |
1220 | mddev->events = ev1; | 1282 | mddev->events = ev1; |
1221 | mddev->bitmap_offset = 0; | 1283 | mddev->bitmap_offset = 0; |
1222 | mddev->default_bitmap_offset = 1024 >> 9; | 1284 | mddev->default_bitmap_offset = 1024 >> 9; |
@@ -1312,7 +1374,7 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) | |||
1312 | sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors)); | 1374 | sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors)); |
1313 | 1375 | ||
1314 | sb->raid_disks = cpu_to_le32(mddev->raid_disks); | 1376 | sb->raid_disks = cpu_to_le32(mddev->raid_disks); |
1315 | sb->size = cpu_to_le64(mddev->size<<1); | 1377 | sb->size = cpu_to_le64(mddev->dev_sectors); |
1316 | 1378 | ||
1317 | if (mddev->bitmap && mddev->bitmap_file == NULL) { | 1379 | if (mddev->bitmap && mddev->bitmap_file == NULL) { |
1318 | sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset); | 1380 | sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset); |
@@ -1320,10 +1382,15 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) | |||
1320 | } | 1382 | } |
1321 | 1383 | ||
1322 | if (rdev->raid_disk >= 0 && | 1384 | if (rdev->raid_disk >= 0 && |
1323 | !test_bit(In_sync, &rdev->flags) && | 1385 | !test_bit(In_sync, &rdev->flags)) { |
1324 | rdev->recovery_offset > 0) { | 1386 | if (mddev->curr_resync_completed > rdev->recovery_offset) |
1325 | sb->feature_map |= cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET); | 1387 | rdev->recovery_offset = mddev->curr_resync_completed; |
1326 | sb->recovery_offset = cpu_to_le64(rdev->recovery_offset); | 1388 | if (rdev->recovery_offset > 0) { |
1389 | sb->feature_map |= | ||
1390 | cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET); | ||
1391 | sb->recovery_offset = | ||
1392 | cpu_to_le64(rdev->recovery_offset); | ||
1393 | } | ||
1327 | } | 1394 | } |
1328 | 1395 | ||
1329 | if (mddev->reshape_position != MaxSector) { | 1396 | if (mddev->reshape_position != MaxSector) { |
@@ -1365,7 +1432,7 @@ super_1_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors) | |||
1365 | { | 1432 | { |
1366 | struct mdp_superblock_1 *sb; | 1433 | struct mdp_superblock_1 *sb; |
1367 | sector_t max_sectors; | 1434 | sector_t max_sectors; |
1368 | if (num_sectors && num_sectors < rdev->mddev->size * 2) | 1435 | if (num_sectors && num_sectors < rdev->mddev->dev_sectors) |
1369 | return 0; /* component must fit device */ | 1436 | return 0; /* component must fit device */ |
1370 | if (rdev->sb_start < rdev->data_offset) { | 1437 | if (rdev->sb_start < rdev->data_offset) { |
1371 | /* minor versions 1 and 2; superblock before data */ | 1438 | /* minor versions 1 and 2; superblock before data */ |
@@ -1381,7 +1448,7 @@ super_1_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors) | |||
1381 | sector_t sb_start; | 1448 | sector_t sb_start; |
1382 | sb_start = (rdev->bdev->bd_inode->i_size >> 9) - 8*2; | 1449 | sb_start = (rdev->bdev->bd_inode->i_size >> 9) - 8*2; |
1383 | sb_start &= ~(sector_t)(4*2 - 1); | 1450 | sb_start &= ~(sector_t)(4*2 - 1); |
1384 | max_sectors = rdev->size * 2 + sb_start - rdev->sb_start; | 1451 | max_sectors = rdev->sectors + sb_start - rdev->sb_start; |
1385 | if (!num_sectors || num_sectors > max_sectors) | 1452 | if (!num_sectors || num_sectors > max_sectors) |
1386 | num_sectors = max_sectors; | 1453 | num_sectors = max_sectors; |
1387 | rdev->sb_start = sb_start; | 1454 | rdev->sb_start = sb_start; |
@@ -1433,6 +1500,38 @@ static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2) | |||
1433 | 1500 | ||
1434 | static LIST_HEAD(pending_raid_disks); | 1501 | static LIST_HEAD(pending_raid_disks); |
1435 | 1502 | ||
1503 | static void md_integrity_check(mdk_rdev_t *rdev, mddev_t *mddev) | ||
1504 | { | ||
1505 | struct mdk_personality *pers = mddev->pers; | ||
1506 | struct gendisk *disk = mddev->gendisk; | ||
1507 | struct blk_integrity *bi_rdev = bdev_get_integrity(rdev->bdev); | ||
1508 | struct blk_integrity *bi_mddev = blk_get_integrity(disk); | ||
1509 | |||
1510 | /* Data integrity passthrough not supported on RAID 4, 5 and 6 */ | ||
1511 | if (pers && pers->level >= 4 && pers->level <= 6) | ||
1512 | return; | ||
1513 | |||
1514 | /* If rdev is integrity capable, register profile for mddev */ | ||
1515 | if (!bi_mddev && bi_rdev) { | ||
1516 | if (blk_integrity_register(disk, bi_rdev)) | ||
1517 | printk(KERN_ERR "%s: %s Could not register integrity!\n", | ||
1518 | __func__, disk->disk_name); | ||
1519 | else | ||
1520 | printk(KERN_NOTICE "Enabling data integrity on %s\n", | ||
1521 | disk->disk_name); | ||
1522 | return; | ||
1523 | } | ||
1524 | |||
1525 | /* Check that mddev and rdev have matching profiles */ | ||
1526 | if (blk_integrity_compare(disk, rdev->bdev->bd_disk) < 0) { | ||
1527 | printk(KERN_ERR "%s: %s/%s integrity mismatch!\n", __func__, | ||
1528 | disk->disk_name, rdev->bdev->bd_disk->disk_name); | ||
1529 | printk(KERN_NOTICE "Disabling data integrity on %s\n", | ||
1530 | disk->disk_name); | ||
1531 | blk_integrity_unregister(disk); | ||
1532 | } | ||
1533 | } | ||
1534 | |||
1436 | static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) | 1535 | static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) |
1437 | { | 1536 | { |
1438 | char b[BDEVNAME_SIZE]; | 1537 | char b[BDEVNAME_SIZE]; |
@@ -1449,8 +1548,9 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) | |||
1449 | if (find_rdev(mddev, rdev->bdev->bd_dev)) | 1548 | if (find_rdev(mddev, rdev->bdev->bd_dev)) |
1450 | return -EEXIST; | 1549 | return -EEXIST; |
1451 | 1550 | ||
1452 | /* make sure rdev->size exceeds mddev->size */ | 1551 | /* make sure rdev->sectors exceeds mddev->dev_sectors */ |
1453 | if (rdev->size && (mddev->size == 0 || rdev->size < mddev->size)) { | 1552 | if (rdev->sectors && (mddev->dev_sectors == 0 || |
1553 | rdev->sectors < mddev->dev_sectors)) { | ||
1454 | if (mddev->pers) { | 1554 | if (mddev->pers) { |
1455 | /* Cannot change size, so fail | 1555 | /* Cannot change size, so fail |
1456 | * If mddev->level <= 0, then we don't care | 1556 | * If mddev->level <= 0, then we don't care |
@@ -1459,7 +1559,7 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) | |||
1459 | if (mddev->level > 0) | 1559 | if (mddev->level > 0) |
1460 | return -ENOSPC; | 1560 | return -ENOSPC; |
1461 | } else | 1561 | } else |
1462 | mddev->size = rdev->size; | 1562 | mddev->dev_sectors = rdev->sectors; |
1463 | } | 1563 | } |
1464 | 1564 | ||
1465 | /* Verify rdev->desc_nr is unique. | 1565 | /* Verify rdev->desc_nr is unique. |
@@ -1503,6 +1603,8 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) | |||
1503 | 1603 | ||
1504 | /* May as well allow recovery to be retried once */ | 1604 | /* May as well allow recovery to be retried once */ |
1505 | mddev->recovery_disabled = 0; | 1605 | mddev->recovery_disabled = 0; |
1606 | |||
1607 | md_integrity_check(rdev, mddev); | ||
1506 | return 0; | 1608 | return 0; |
1507 | 1609 | ||
1508 | fail: | 1610 | fail: |
@@ -1713,8 +1815,8 @@ static void print_sb_1(struct mdp_superblock_1 *sb) | |||
1713 | static void print_rdev(mdk_rdev_t *rdev, int major_version) | 1815 | static void print_rdev(mdk_rdev_t *rdev, int major_version) |
1714 | { | 1816 | { |
1715 | char b[BDEVNAME_SIZE]; | 1817 | char b[BDEVNAME_SIZE]; |
1716 | printk(KERN_INFO "md: rdev %s, SZ:%08llu F:%d S:%d DN:%u\n", | 1818 | printk(KERN_INFO "md: rdev %s, Sect:%08llu F:%d S:%d DN:%u\n", |
1717 | bdevname(rdev->bdev,b), (unsigned long long)rdev->size, | 1819 | bdevname(rdev->bdev, b), (unsigned long long)rdev->sectors, |
1718 | test_bit(Faulty, &rdev->flags), test_bit(In_sync, &rdev->flags), | 1820 | test_bit(Faulty, &rdev->flags), test_bit(In_sync, &rdev->flags), |
1719 | rdev->desc_nr); | 1821 | rdev->desc_nr); |
1720 | if (rdev->sb_loaded) { | 1822 | if (rdev->sb_loaded) { |
@@ -2153,7 +2255,7 @@ offset_store(mdk_rdev_t *rdev, const char *buf, size_t len) | |||
2153 | return -EINVAL; | 2255 | return -EINVAL; |
2154 | if (rdev->mddev->pers && rdev->raid_disk >= 0) | 2256 | if (rdev->mddev->pers && rdev->raid_disk >= 0) |
2155 | return -EBUSY; | 2257 | return -EBUSY; |
2156 | if (rdev->size && rdev->mddev->external) | 2258 | if (rdev->sectors && rdev->mddev->external) |
2157 | /* Must set offset before size, so overlap checks | 2259 | /* Must set offset before size, so overlap checks |
2158 | * can be sane */ | 2260 | * can be sane */ |
2159 | return -EBUSY; | 2261 | return -EBUSY; |
@@ -2167,7 +2269,7 @@ __ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store); | |||
2167 | static ssize_t | 2269 | static ssize_t |
2168 | rdev_size_show(mdk_rdev_t *rdev, char *page) | 2270 | rdev_size_show(mdk_rdev_t *rdev, char *page) |
2169 | { | 2271 | { |
2170 | return sprintf(page, "%llu\n", (unsigned long long)rdev->size); | 2272 | return sprintf(page, "%llu\n", (unsigned long long)rdev->sectors / 2); |
2171 | } | 2273 | } |
2172 | 2274 | ||
2173 | static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2) | 2275 | static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2) |
@@ -2180,34 +2282,52 @@ static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2) | |||
2180 | return 1; | 2282 | return 1; |
2181 | } | 2283 | } |
2182 | 2284 | ||
2285 | static int strict_blocks_to_sectors(const char *buf, sector_t *sectors) | ||
2286 | { | ||
2287 | unsigned long long blocks; | ||
2288 | sector_t new; | ||
2289 | |||
2290 | if (strict_strtoull(buf, 10, &blocks) < 0) | ||
2291 | return -EINVAL; | ||
2292 | |||
2293 | if (blocks & 1ULL << (8 * sizeof(blocks) - 1)) | ||
2294 | return -EINVAL; /* sector conversion overflow */ | ||
2295 | |||
2296 | new = blocks * 2; | ||
2297 | if (new != blocks * 2) | ||
2298 | return -EINVAL; /* unsigned long long to sector_t overflow */ | ||
2299 | |||
2300 | *sectors = new; | ||
2301 | return 0; | ||
2302 | } | ||
2303 | |||
2183 | static ssize_t | 2304 | static ssize_t |
2184 | rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len) | 2305 | rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len) |
2185 | { | 2306 | { |
2186 | unsigned long long size; | ||
2187 | unsigned long long oldsize = rdev->size; | ||
2188 | mddev_t *my_mddev = rdev->mddev; | 2307 | mddev_t *my_mddev = rdev->mddev; |
2308 | sector_t oldsectors = rdev->sectors; | ||
2309 | sector_t sectors; | ||
2189 | 2310 | ||
2190 | if (strict_strtoull(buf, 10, &size) < 0) | 2311 | if (strict_blocks_to_sectors(buf, &sectors) < 0) |
2191 | return -EINVAL; | 2312 | return -EINVAL; |
2192 | if (my_mddev->pers && rdev->raid_disk >= 0) { | 2313 | if (my_mddev->pers && rdev->raid_disk >= 0) { |
2193 | if (my_mddev->persistent) { | 2314 | if (my_mddev->persistent) { |
2194 | size = super_types[my_mddev->major_version]. | 2315 | sectors = super_types[my_mddev->major_version]. |
2195 | rdev_size_change(rdev, size * 2); | 2316 | rdev_size_change(rdev, sectors); |
2196 | if (!size) | 2317 | if (!sectors) |
2197 | return -EBUSY; | 2318 | return -EBUSY; |
2198 | } else if (!size) { | 2319 | } else if (!sectors) |
2199 | size = (rdev->bdev->bd_inode->i_size >> 10); | 2320 | sectors = (rdev->bdev->bd_inode->i_size >> 9) - |
2200 | size -= rdev->data_offset/2; | 2321 | rdev->data_offset; |
2201 | } | ||
2202 | } | 2322 | } |
2203 | if (size < my_mddev->size) | 2323 | if (sectors < my_mddev->dev_sectors) |
2204 | return -EINVAL; /* component must fit device */ | 2324 | return -EINVAL; /* component must fit device */ |
2205 | 2325 | ||
2206 | rdev->size = size; | 2326 | rdev->sectors = sectors; |
2207 | if (size > oldsize && my_mddev->external) { | 2327 | if (sectors > oldsectors && my_mddev->external) { |
2208 | /* need to check that all other rdevs with the same ->bdev | 2328 | /* need to check that all other rdevs with the same ->bdev |
2209 | * do not overlap. We need to unlock the mddev to avoid | 2329 | * do not overlap. We need to unlock the mddev to avoid |
2210 | * a deadlock. We have already changed rdev->size, and if | 2330 | * a deadlock. We have already changed rdev->sectors, and if |
2211 | * we have to change it back, we will have the lock again. | 2331 | * we have to change it back, we will have the lock again. |
2212 | */ | 2332 | */ |
2213 | mddev_t *mddev; | 2333 | mddev_t *mddev; |
@@ -2223,9 +2343,9 @@ rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len) | |||
2223 | if (test_bit(AllReserved, &rdev2->flags) || | 2343 | if (test_bit(AllReserved, &rdev2->flags) || |
2224 | (rdev->bdev == rdev2->bdev && | 2344 | (rdev->bdev == rdev2->bdev && |
2225 | rdev != rdev2 && | 2345 | rdev != rdev2 && |
2226 | overlaps(rdev->data_offset, rdev->size * 2, | 2346 | overlaps(rdev->data_offset, rdev->sectors, |
2227 | rdev2->data_offset, | 2347 | rdev2->data_offset, |
2228 | rdev2->size * 2))) { | 2348 | rdev2->sectors))) { |
2229 | overlap = 1; | 2349 | overlap = 1; |
2230 | break; | 2350 | break; |
2231 | } | 2351 | } |
@@ -2239,11 +2359,11 @@ rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len) | |||
2239 | if (overlap) { | 2359 | if (overlap) { |
2240 | /* Someone else could have slipped in a size | 2360 | /* Someone else could have slipped in a size |
2241 | * change here, but doing so is just silly. | 2361 | * change here, but doing so is just silly. |
2242 | * We put oldsize back because we *know* it is | 2362 | * We put oldsectors back because we *know* it is |
2243 | * safe, and trust userspace not to race with | 2363 | * safe, and trust userspace not to race with |
2244 | * itself | 2364 | * itself |
2245 | */ | 2365 | */ |
2246 | rdev->size = oldsize; | 2366 | rdev->sectors = oldsectors; |
2247 | return -EBUSY; | 2367 | return -EBUSY; |
2248 | } | 2368 | } |
2249 | } | 2369 | } |
@@ -2547,18 +2667,101 @@ level_show(mddev_t *mddev, char *page) | |||
2547 | static ssize_t | 2667 | static ssize_t |
2548 | level_store(mddev_t *mddev, const char *buf, size_t len) | 2668 | level_store(mddev_t *mddev, const char *buf, size_t len) |
2549 | { | 2669 | { |
2670 | char level[16]; | ||
2550 | ssize_t rv = len; | 2671 | ssize_t rv = len; |
2551 | if (mddev->pers) | 2672 | struct mdk_personality *pers; |
2673 | void *priv; | ||
2674 | |||
2675 | if (mddev->pers == NULL) { | ||
2676 | if (len == 0) | ||
2677 | return 0; | ||
2678 | if (len >= sizeof(mddev->clevel)) | ||
2679 | return -ENOSPC; | ||
2680 | strncpy(mddev->clevel, buf, len); | ||
2681 | if (mddev->clevel[len-1] == '\n') | ||
2682 | len--; | ||
2683 | mddev->clevel[len] = 0; | ||
2684 | mddev->level = LEVEL_NONE; | ||
2685 | return rv; | ||
2686 | } | ||
2687 | |||
2688 | /* request to change the personality. Need to ensure: | ||
2689 | * - array is not engaged in resync/recovery/reshape | ||
2690 | * - old personality can be suspended | ||
2691 | * - new personality will access other array. | ||
2692 | */ | ||
2693 | |||
2694 | if (mddev->sync_thread || mddev->reshape_position != MaxSector) | ||
2552 | return -EBUSY; | 2695 | return -EBUSY; |
2553 | if (len == 0) | 2696 | |
2554 | return 0; | 2697 | if (!mddev->pers->quiesce) { |
2555 | if (len >= sizeof(mddev->clevel)) | 2698 | printk(KERN_WARNING "md: %s: %s does not support online personality change\n", |
2556 | return -ENOSPC; | 2699 | mdname(mddev), mddev->pers->name); |
2557 | strncpy(mddev->clevel, buf, len); | 2700 | return -EINVAL; |
2558 | if (mddev->clevel[len-1] == '\n') | 2701 | } |
2702 | |||
2703 | /* Now find the new personality */ | ||
2704 | if (len == 0 || len >= sizeof(level)) | ||
2705 | return -EINVAL; | ||
2706 | strncpy(level, buf, len); | ||
2707 | if (level[len-1] == '\n') | ||
2559 | len--; | 2708 | len--; |
2560 | mddev->clevel[len] = 0; | 2709 | level[len] = 0; |
2561 | mddev->level = LEVEL_NONE; | 2710 | |
2711 | request_module("md-%s", level); | ||
2712 | spin_lock(&pers_lock); | ||
2713 | pers = find_pers(LEVEL_NONE, level); | ||
2714 | if (!pers || !try_module_get(pers->owner)) { | ||
2715 | spin_unlock(&pers_lock); | ||
2716 | printk(KERN_WARNING "md: personality %s not loaded\n", level); | ||
2717 | return -EINVAL; | ||
2718 | } | ||
2719 | spin_unlock(&pers_lock); | ||
2720 | |||
2721 | if (pers == mddev->pers) { | ||
2722 | /* Nothing to do! */ | ||
2723 | module_put(pers->owner); | ||
2724 | return rv; | ||
2725 | } | ||
2726 | if (!pers->takeover) { | ||
2727 | module_put(pers->owner); | ||
2728 | printk(KERN_WARNING "md: %s: %s does not support personality takeover\n", | ||
2729 | mdname(mddev), level); | ||
2730 | return -EINVAL; | ||
2731 | } | ||
2732 | |||
2733 | /* ->takeover must set new_* and/or delta_disks | ||
2734 | * if it succeeds, and may set them when it fails. | ||
2735 | */ | ||
2736 | priv = pers->takeover(mddev); | ||
2737 | if (IS_ERR(priv)) { | ||
2738 | mddev->new_level = mddev->level; | ||
2739 | mddev->new_layout = mddev->layout; | ||
2740 | mddev->new_chunk = mddev->chunk_size; | ||
2741 | mddev->raid_disks -= mddev->delta_disks; | ||
2742 | mddev->delta_disks = 0; | ||
2743 | module_put(pers->owner); | ||
2744 | printk(KERN_WARNING "md: %s: %s would not accept array\n", | ||
2745 | mdname(mddev), level); | ||
2746 | return PTR_ERR(priv); | ||
2747 | } | ||
2748 | |||
2749 | /* Looks like we have a winner */ | ||
2750 | mddev_suspend(mddev); | ||
2751 | mddev->pers->stop(mddev); | ||
2752 | module_put(mddev->pers->owner); | ||
2753 | mddev->pers = pers; | ||
2754 | mddev->private = priv; | ||
2755 | strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); | ||
2756 | mddev->level = mddev->new_level; | ||
2757 | mddev->layout = mddev->new_layout; | ||
2758 | mddev->chunk_size = mddev->new_chunk; | ||
2759 | mddev->delta_disks = 0; | ||
2760 | pers->run(mddev); | ||
2761 | mddev_resume(mddev); | ||
2762 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | ||
2763 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | ||
2764 | md_wakeup_thread(mddev->thread); | ||
2562 | return rv; | 2765 | return rv; |
2563 | } | 2766 | } |
2564 | 2767 | ||
@@ -2586,12 +2789,18 @@ layout_store(mddev_t *mddev, const char *buf, size_t len) | |||
2586 | if (!*buf || (*e && *e != '\n')) | 2789 | if (!*buf || (*e && *e != '\n')) |
2587 | return -EINVAL; | 2790 | return -EINVAL; |
2588 | 2791 | ||
2589 | if (mddev->pers) | 2792 | if (mddev->pers) { |
2590 | return -EBUSY; | 2793 | int err; |
2591 | if (mddev->reshape_position != MaxSector) | 2794 | if (mddev->pers->reconfig == NULL) |
2795 | return -EBUSY; | ||
2796 | err = mddev->pers->reconfig(mddev, n, -1); | ||
2797 | if (err) | ||
2798 | return err; | ||
2799 | } else { | ||
2592 | mddev->new_layout = n; | 2800 | mddev->new_layout = n; |
2593 | else | 2801 | if (mddev->reshape_position == MaxSector) |
2594 | mddev->layout = n; | 2802 | mddev->layout = n; |
2803 | } | ||
2595 | return len; | 2804 | return len; |
2596 | } | 2805 | } |
2597 | static struct md_sysfs_entry md_layout = | 2806 | static struct md_sysfs_entry md_layout = |
@@ -2648,19 +2857,24 @@ chunk_size_show(mddev_t *mddev, char *page) | |||
2648 | static ssize_t | 2857 | static ssize_t |
2649 | chunk_size_store(mddev_t *mddev, const char *buf, size_t len) | 2858 | chunk_size_store(mddev_t *mddev, const char *buf, size_t len) |
2650 | { | 2859 | { |
2651 | /* can only set chunk_size if array is not yet active */ | ||
2652 | char *e; | 2860 | char *e; |
2653 | unsigned long n = simple_strtoul(buf, &e, 10); | 2861 | unsigned long n = simple_strtoul(buf, &e, 10); |
2654 | 2862 | ||
2655 | if (!*buf || (*e && *e != '\n')) | 2863 | if (!*buf || (*e && *e != '\n')) |
2656 | return -EINVAL; | 2864 | return -EINVAL; |
2657 | 2865 | ||
2658 | if (mddev->pers) | 2866 | if (mddev->pers) { |
2659 | return -EBUSY; | 2867 | int err; |
2660 | else if (mddev->reshape_position != MaxSector) | 2868 | if (mddev->pers->reconfig == NULL) |
2869 | return -EBUSY; | ||
2870 | err = mddev->pers->reconfig(mddev, -1, n); | ||
2871 | if (err) | ||
2872 | return err; | ||
2873 | } else { | ||
2661 | mddev->new_chunk = n; | 2874 | mddev->new_chunk = n; |
2662 | else | 2875 | if (mddev->reshape_position == MaxSector) |
2663 | mddev->chunk_size = n; | 2876 | mddev->chunk_size = n; |
2877 | } | ||
2664 | return len; | 2878 | return len; |
2665 | } | 2879 | } |
2666 | static struct md_sysfs_entry md_chunk_size = | 2880 | static struct md_sysfs_entry md_chunk_size = |
@@ -2669,6 +2883,8 @@ __ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store); | |||
2669 | static ssize_t | 2883 | static ssize_t |
2670 | resync_start_show(mddev_t *mddev, char *page) | 2884 | resync_start_show(mddev_t *mddev, char *page) |
2671 | { | 2885 | { |
2886 | if (mddev->recovery_cp == MaxSector) | ||
2887 | return sprintf(page, "none\n"); | ||
2672 | return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp); | 2888 | return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp); |
2673 | } | 2889 | } |
2674 | 2890 | ||
@@ -2766,7 +2982,7 @@ array_state_show(mddev_t *mddev, char *page) | |||
2766 | else { | 2982 | else { |
2767 | if (list_empty(&mddev->disks) && | 2983 | if (list_empty(&mddev->disks) && |
2768 | mddev->raid_disks == 0 && | 2984 | mddev->raid_disks == 0 && |
2769 | mddev->size == 0) | 2985 | mddev->dev_sectors == 0) |
2770 | st = clear; | 2986 | st = clear; |
2771 | else | 2987 | else |
2772 | st = inactive; | 2988 | st = inactive; |
@@ -2973,7 +3189,8 @@ __ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store); | |||
2973 | static ssize_t | 3189 | static ssize_t |
2974 | size_show(mddev_t *mddev, char *page) | 3190 | size_show(mddev_t *mddev, char *page) |
2975 | { | 3191 | { |
2976 | return sprintf(page, "%llu\n", (unsigned long long)mddev->size); | 3192 | return sprintf(page, "%llu\n", |
3193 | (unsigned long long)mddev->dev_sectors / 2); | ||
2977 | } | 3194 | } |
2978 | 3195 | ||
2979 | static int update_size(mddev_t *mddev, sector_t num_sectors); | 3196 | static int update_size(mddev_t *mddev, sector_t num_sectors); |
@@ -2985,20 +3202,18 @@ size_store(mddev_t *mddev, const char *buf, size_t len) | |||
2985 | * not increase it (except from 0). | 3202 | * not increase it (except from 0). |
2986 | * If array is active, we can try an on-line resize | 3203 | * If array is active, we can try an on-line resize |
2987 | */ | 3204 | */ |
2988 | char *e; | 3205 | sector_t sectors; |
2989 | int err = 0; | 3206 | int err = strict_blocks_to_sectors(buf, &sectors); |
2990 | unsigned long long size = simple_strtoull(buf, &e, 10); | ||
2991 | if (!*buf || *buf == '\n' || | ||
2992 | (*e && *e != '\n')) | ||
2993 | return -EINVAL; | ||
2994 | 3207 | ||
3208 | if (err < 0) | ||
3209 | return err; | ||
2995 | if (mddev->pers) { | 3210 | if (mddev->pers) { |
2996 | err = update_size(mddev, size * 2); | 3211 | err = update_size(mddev, sectors); |
2997 | md_update_sb(mddev, 1); | 3212 | md_update_sb(mddev, 1); |
2998 | } else { | 3213 | } else { |
2999 | if (mddev->size == 0 || | 3214 | if (mddev->dev_sectors == 0 || |
3000 | mddev->size > size) | 3215 | mddev->dev_sectors > sectors) |
3001 | mddev->size = size; | 3216 | mddev->dev_sectors = sectors; |
3002 | else | 3217 | else |
3003 | err = -ENOSPC; | 3218 | err = -ENOSPC; |
3004 | } | 3219 | } |
@@ -3251,6 +3466,8 @@ static ssize_t | |||
3251 | sync_speed_show(mddev_t *mddev, char *page) | 3466 | sync_speed_show(mddev_t *mddev, char *page) |
3252 | { | 3467 | { |
3253 | unsigned long resync, dt, db; | 3468 | unsigned long resync, dt, db; |
3469 | if (mddev->curr_resync == 0) | ||
3470 | return sprintf(page, "none\n"); | ||
3254 | resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active); | 3471 | resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active); |
3255 | dt = (jiffies - mddev->resync_mark) / HZ; | 3472 | dt = (jiffies - mddev->resync_mark) / HZ; |
3256 | if (!dt) dt++; | 3473 | if (!dt) dt++; |
@@ -3263,15 +3480,15 @@ static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed); | |||
3263 | static ssize_t | 3480 | static ssize_t |
3264 | sync_completed_show(mddev_t *mddev, char *page) | 3481 | sync_completed_show(mddev_t *mddev, char *page) |
3265 | { | 3482 | { |
3266 | unsigned long max_blocks, resync; | 3483 | unsigned long max_sectors, resync; |
3267 | 3484 | ||
3268 | if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) | 3485 | if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) |
3269 | max_blocks = mddev->resync_max_sectors; | 3486 | max_sectors = mddev->resync_max_sectors; |
3270 | else | 3487 | else |
3271 | max_blocks = mddev->size << 1; | 3488 | max_sectors = mddev->dev_sectors; |
3272 | 3489 | ||
3273 | resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active)); | 3490 | resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active)); |
3274 | return sprintf(page, "%lu / %lu\n", resync, max_blocks); | 3491 | return sprintf(page, "%lu / %lu\n", resync, max_sectors); |
3275 | } | 3492 | } |
3276 | 3493 | ||
3277 | static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed); | 3494 | static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed); |
@@ -3431,6 +3648,57 @@ static struct md_sysfs_entry md_reshape_position = | |||
3431 | __ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show, | 3648 | __ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show, |
3432 | reshape_position_store); | 3649 | reshape_position_store); |
3433 | 3650 | ||
3651 | static ssize_t | ||
3652 | array_size_show(mddev_t *mddev, char *page) | ||
3653 | { | ||
3654 | if (mddev->external_size) | ||
3655 | return sprintf(page, "%llu\n", | ||
3656 | (unsigned long long)mddev->array_sectors/2); | ||
3657 | else | ||
3658 | return sprintf(page, "default\n"); | ||
3659 | } | ||
3660 | |||
3661 | static ssize_t | ||
3662 | array_size_store(mddev_t *mddev, const char *buf, size_t len) | ||
3663 | { | ||
3664 | sector_t sectors; | ||
3665 | |||
3666 | if (strncmp(buf, "default", 7) == 0) { | ||
3667 | if (mddev->pers) | ||
3668 | sectors = mddev->pers->size(mddev, 0, 0); | ||
3669 | else | ||
3670 | sectors = mddev->array_sectors; | ||
3671 | |||
3672 | mddev->external_size = 0; | ||
3673 | } else { | ||
3674 | if (strict_blocks_to_sectors(buf, &sectors) < 0) | ||
3675 | return -EINVAL; | ||
3676 | if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors) | ||
3677 | return -EINVAL; | ||
3678 | |||
3679 | mddev->external_size = 1; | ||
3680 | } | ||
3681 | |||
3682 | mddev->array_sectors = sectors; | ||
3683 | set_capacity(mddev->gendisk, mddev->array_sectors); | ||
3684 | if (mddev->pers) { | ||
3685 | struct block_device *bdev = bdget_disk(mddev->gendisk, 0); | ||
3686 | |||
3687 | if (bdev) { | ||
3688 | mutex_lock(&bdev->bd_inode->i_mutex); | ||
3689 | i_size_write(bdev->bd_inode, | ||
3690 | (loff_t)mddev->array_sectors << 9); | ||
3691 | mutex_unlock(&bdev->bd_inode->i_mutex); | ||
3692 | bdput(bdev); | ||
3693 | } | ||
3694 | } | ||
3695 | |||
3696 | return len; | ||
3697 | } | ||
3698 | |||
3699 | static struct md_sysfs_entry md_array_size = | ||
3700 | __ATTR(array_size, S_IRUGO|S_IWUSR, array_size_show, | ||
3701 | array_size_store); | ||
3434 | 3702 | ||
3435 | static struct attribute *md_default_attrs[] = { | 3703 | static struct attribute *md_default_attrs[] = { |
3436 | &md_level.attr, | 3704 | &md_level.attr, |
@@ -3444,6 +3712,7 @@ static struct attribute *md_default_attrs[] = { | |||
3444 | &md_safe_delay.attr, | 3712 | &md_safe_delay.attr, |
3445 | &md_array_state.attr, | 3713 | &md_array_state.attr, |
3446 | &md_reshape_position.attr, | 3714 | &md_reshape_position.attr, |
3715 | &md_array_size.attr, | ||
3447 | NULL, | 3716 | NULL, |
3448 | }; | 3717 | }; |
3449 | 3718 | ||
@@ -3602,10 +3871,12 @@ static int md_alloc(dev_t dev, char *name) | |||
3602 | mddev_put(mddev); | 3871 | mddev_put(mddev); |
3603 | return -ENOMEM; | 3872 | return -ENOMEM; |
3604 | } | 3873 | } |
3874 | mddev->queue->queuedata = mddev; | ||
3875 | |||
3605 | /* Can be unlocked because the queue is new: no concurrency */ | 3876 | /* Can be unlocked because the queue is new: no concurrency */ |
3606 | queue_flag_set_unlocked(QUEUE_FLAG_CLUSTER, mddev->queue); | 3877 | queue_flag_set_unlocked(QUEUE_FLAG_CLUSTER, mddev->queue); |
3607 | 3878 | ||
3608 | blk_queue_make_request(mddev->queue, md_fail_request); | 3879 | blk_queue_make_request(mddev->queue, md_make_request); |
3609 | 3880 | ||
3610 | disk = alloc_disk(1 << shift); | 3881 | disk = alloc_disk(1 << shift); |
3611 | if (!disk) { | 3882 | if (!disk) { |
@@ -3731,13 +4002,13 @@ static int do_md_run(mddev_t * mddev) | |||
3731 | list_for_each_entry(rdev, &mddev->disks, same_set) { | 4002 | list_for_each_entry(rdev, &mddev->disks, same_set) { |
3732 | if (test_bit(Faulty, &rdev->flags)) | 4003 | if (test_bit(Faulty, &rdev->flags)) |
3733 | continue; | 4004 | continue; |
3734 | if (rdev->size < chunk_size / 1024) { | 4005 | if (rdev->sectors < chunk_size / 512) { |
3735 | printk(KERN_WARNING | 4006 | printk(KERN_WARNING |
3736 | "md: Dev %s smaller than chunk_size:" | 4007 | "md: Dev %s smaller than chunk_size:" |
3737 | " %lluk < %dk\n", | 4008 | " %llu < %d\n", |
3738 | bdevname(rdev->bdev,b), | 4009 | bdevname(rdev->bdev,b), |
3739 | (unsigned long long)rdev->size, | 4010 | (unsigned long long)rdev->sectors, |
3740 | chunk_size / 1024); | 4011 | chunk_size / 512); |
3741 | return -EINVAL; | 4012 | return -EINVAL; |
3742 | } | 4013 | } |
3743 | } | 4014 | } |
@@ -3761,11 +4032,11 @@ static int do_md_run(mddev_t * mddev) | |||
3761 | 4032 | ||
3762 | /* perform some consistency tests on the device. | 4033 | /* perform some consistency tests on the device. |
3763 | * We don't want the data to overlap the metadata, | 4034 | * We don't want the data to overlap the metadata, |
3764 | * Internal Bitmap issues has handled elsewhere. | 4035 | * Internal Bitmap issues have been handled elsewhere. |
3765 | */ | 4036 | */ |
3766 | if (rdev->data_offset < rdev->sb_start) { | 4037 | if (rdev->data_offset < rdev->sb_start) { |
3767 | if (mddev->size && | 4038 | if (mddev->dev_sectors && |
3768 | rdev->data_offset + mddev->size*2 | 4039 | rdev->data_offset + mddev->dev_sectors |
3769 | > rdev->sb_start) { | 4040 | > rdev->sb_start) { |
3770 | printk("md: %s: data overlaps metadata\n", | 4041 | printk("md: %s: data overlaps metadata\n", |
3771 | mdname(mddev)); | 4042 | mdname(mddev)); |
@@ -3801,9 +4072,16 @@ static int do_md_run(mddev_t * mddev) | |||
3801 | } | 4072 | } |
3802 | mddev->pers = pers; | 4073 | mddev->pers = pers; |
3803 | spin_unlock(&pers_lock); | 4074 | spin_unlock(&pers_lock); |
3804 | mddev->level = pers->level; | 4075 | if (mddev->level != pers->level) { |
4076 | mddev->level = pers->level; | ||
4077 | mddev->new_level = pers->level; | ||
4078 | } | ||
3805 | strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); | 4079 | strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); |
3806 | 4080 | ||
4081 | if (pers->level >= 4 && pers->level <= 6) | ||
4082 | /* Cannot support integrity (yet) */ | ||
4083 | blk_integrity_unregister(mddev->gendisk); | ||
4084 | |||
3807 | if (mddev->reshape_position != MaxSector && | 4085 | if (mddev->reshape_position != MaxSector && |
3808 | pers->start_reshape == NULL) { | 4086 | pers->start_reshape == NULL) { |
3809 | /* This personality cannot handle reshaping... */ | 4087 | /* This personality cannot handle reshaping... */ |
@@ -3843,7 +4121,9 @@ static int do_md_run(mddev_t * mddev) | |||
3843 | } | 4121 | } |
3844 | 4122 | ||
3845 | mddev->recovery = 0; | 4123 | mddev->recovery = 0; |
3846 | mddev->resync_max_sectors = mddev->size << 1; /* may be over-ridden by personality */ | 4124 | /* may be over-ridden by personality */ |
4125 | mddev->resync_max_sectors = mddev->dev_sectors; | ||
4126 | |||
3847 | mddev->barriers_work = 1; | 4127 | mddev->barriers_work = 1; |
3848 | mddev->ok_start_degraded = start_dirty_degraded; | 4128 | mddev->ok_start_degraded = start_dirty_degraded; |
3849 | 4129 | ||
@@ -3853,7 +4133,17 @@ static int do_md_run(mddev_t * mddev) | |||
3853 | err = mddev->pers->run(mddev); | 4133 | err = mddev->pers->run(mddev); |
3854 | if (err) | 4134 | if (err) |
3855 | printk(KERN_ERR "md: pers->run() failed ...\n"); | 4135 | printk(KERN_ERR "md: pers->run() failed ...\n"); |
3856 | else if (mddev->pers->sync_request) { | 4136 | else if (mddev->pers->size(mddev, 0, 0) < mddev->array_sectors) { |
4137 | WARN_ONCE(!mddev->external_size, "%s: default size too small," | ||
4138 | " but 'external_size' not in effect?\n", __func__); | ||
4139 | printk(KERN_ERR | ||
4140 | "md: invalid array_size %llu > default size %llu\n", | ||
4141 | (unsigned long long)mddev->array_sectors / 2, | ||
4142 | (unsigned long long)mddev->pers->size(mddev, 0, 0) / 2); | ||
4143 | err = -EINVAL; | ||
4144 | mddev->pers->stop(mddev); | ||
4145 | } | ||
4146 | if (err == 0 && mddev->pers->sync_request) { | ||
3857 | err = bitmap_create(mddev); | 4147 | err = bitmap_create(mddev); |
3858 | if (err) { | 4148 | if (err) { |
3859 | printk(KERN_ERR "%s: failed to create bitmap (%d)\n", | 4149 | printk(KERN_ERR "%s: failed to create bitmap (%d)\n", |
@@ -3899,16 +4189,6 @@ static int do_md_run(mddev_t * mddev) | |||
3899 | 4189 | ||
3900 | set_capacity(disk, mddev->array_sectors); | 4190 | set_capacity(disk, mddev->array_sectors); |
3901 | 4191 | ||
3902 | /* If we call blk_queue_make_request here, it will | ||
3903 | * re-initialise max_sectors etc which may have been | ||
3904 | * refined inside -> run. So just set the bits we need to set. | ||
3905 | * Most initialisation happended when we called | ||
3906 | * blk_queue_make_request(..., md_fail_request) | ||
3907 | * earlier. | ||
3908 | */ | ||
3909 | mddev->queue->queuedata = mddev; | ||
3910 | mddev->queue->make_request_fn = mddev->pers->make_request; | ||
3911 | |||
3912 | /* If there is a partially-recovered drive we need to | 4192 | /* If there is a partially-recovered drive we need to |
3913 | * start recovery here. If we leave it to md_check_recovery, | 4193 | * start recovery here. If we leave it to md_check_recovery, |
3914 | * it will remove the drives and not do the right thing | 4194 | * it will remove the drives and not do the right thing |
@@ -4038,7 +4318,7 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) | |||
4038 | md_super_wait(mddev); | 4318 | md_super_wait(mddev); |
4039 | if (mddev->ro) | 4319 | if (mddev->ro) |
4040 | set_disk_ro(disk, 0); | 4320 | set_disk_ro(disk, 0); |
4041 | blk_queue_make_request(mddev->queue, md_fail_request); | 4321 | |
4042 | mddev->pers->stop(mddev); | 4322 | mddev->pers->stop(mddev); |
4043 | mddev->queue->merge_bvec_fn = NULL; | 4323 | mddev->queue->merge_bvec_fn = NULL; |
4044 | mddev->queue->unplug_fn = NULL; | 4324 | mddev->queue->unplug_fn = NULL; |
@@ -4095,7 +4375,8 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) | |||
4095 | export_array(mddev); | 4375 | export_array(mddev); |
4096 | 4376 | ||
4097 | mddev->array_sectors = 0; | 4377 | mddev->array_sectors = 0; |
4098 | mddev->size = 0; | 4378 | mddev->external_size = 0; |
4379 | mddev->dev_sectors = 0; | ||
4099 | mddev->raid_disks = 0; | 4380 | mddev->raid_disks = 0; |
4100 | mddev->recovery_cp = 0; | 4381 | mddev->recovery_cp = 0; |
4101 | mddev->resync_min = 0; | 4382 | mddev->resync_min = 0; |
@@ -4135,6 +4416,7 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) | |||
4135 | printk(KERN_INFO "md: %s switched to read-only mode.\n", | 4416 | printk(KERN_INFO "md: %s switched to read-only mode.\n", |
4136 | mdname(mddev)); | 4417 | mdname(mddev)); |
4137 | err = 0; | 4418 | err = 0; |
4419 | blk_integrity_unregister(disk); | ||
4138 | md_new_event(mddev); | 4420 | md_new_event(mddev); |
4139 | sysfs_notify_dirent(mddev->sysfs_state); | 4421 | sysfs_notify_dirent(mddev->sysfs_state); |
4140 | out: | 4422 | out: |
@@ -4300,8 +4582,8 @@ static int get_array_info(mddev_t * mddev, void __user * arg) | |||
4300 | info.patch_version = MD_PATCHLEVEL_VERSION; | 4582 | info.patch_version = MD_PATCHLEVEL_VERSION; |
4301 | info.ctime = mddev->ctime; | 4583 | info.ctime = mddev->ctime; |
4302 | info.level = mddev->level; | 4584 | info.level = mddev->level; |
4303 | info.size = mddev->size; | 4585 | info.size = mddev->dev_sectors / 2; |
4304 | if (info.size != mddev->size) /* overflow */ | 4586 | if (info.size != mddev->dev_sectors / 2) /* overflow */ |
4305 | info.size = -1; | 4587 | info.size = -1; |
4306 | info.nr_disks = nr; | 4588 | info.nr_disks = nr; |
4307 | info.raid_disks = mddev->raid_disks; | 4589 | info.raid_disks = mddev->raid_disks; |
@@ -4480,6 +4762,8 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) | |||
4480 | clear_bit(In_sync, &rdev->flags); /* just to be sure */ | 4762 | clear_bit(In_sync, &rdev->flags); /* just to be sure */ |
4481 | if (info->state & (1<<MD_DISK_WRITEMOSTLY)) | 4763 | if (info->state & (1<<MD_DISK_WRITEMOSTLY)) |
4482 | set_bit(WriteMostly, &rdev->flags); | 4764 | set_bit(WriteMostly, &rdev->flags); |
4765 | else | ||
4766 | clear_bit(WriteMostly, &rdev->flags); | ||
4483 | 4767 | ||
4484 | rdev->raid_disk = -1; | 4768 | rdev->raid_disk = -1; |
4485 | err = bind_rdev_to_array(rdev, mddev); | 4769 | err = bind_rdev_to_array(rdev, mddev); |
@@ -4543,7 +4827,7 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) | |||
4543 | rdev->sb_start = rdev->bdev->bd_inode->i_size / 512; | 4827 | rdev->sb_start = rdev->bdev->bd_inode->i_size / 512; |
4544 | } else | 4828 | } else |
4545 | rdev->sb_start = calc_dev_sboffset(rdev->bdev); | 4829 | rdev->sb_start = calc_dev_sboffset(rdev->bdev); |
4546 | rdev->size = calc_num_sectors(rdev, mddev->chunk_size) / 2; | 4830 | rdev->sectors = calc_num_sectors(rdev, mddev->chunk_size); |
4547 | 4831 | ||
4548 | err = bind_rdev_to_array(rdev, mddev); | 4832 | err = bind_rdev_to_array(rdev, mddev); |
4549 | if (err) { | 4833 | if (err) { |
@@ -4613,7 +4897,7 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev) | |||
4613 | else | 4897 | else |
4614 | rdev->sb_start = rdev->bdev->bd_inode->i_size / 512; | 4898 | rdev->sb_start = rdev->bdev->bd_inode->i_size / 512; |
4615 | 4899 | ||
4616 | rdev->size = calc_num_sectors(rdev, mddev->chunk_size) / 2; | 4900 | rdev->sectors = calc_num_sectors(rdev, mddev->chunk_size); |
4617 | 4901 | ||
4618 | if (test_bit(Faulty, &rdev->flags)) { | 4902 | if (test_bit(Faulty, &rdev->flags)) { |
4619 | printk(KERN_WARNING | 4903 | printk(KERN_WARNING |
@@ -4749,7 +5033,7 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info) | |||
4749 | 5033 | ||
4750 | mddev->level = info->level; | 5034 | mddev->level = info->level; |
4751 | mddev->clevel[0] = 0; | 5035 | mddev->clevel[0] = 0; |
4752 | mddev->size = info->size; | 5036 | mddev->dev_sectors = 2 * (sector_t)info->size; |
4753 | mddev->raid_disks = info->raid_disks; | 5037 | mddev->raid_disks = info->raid_disks; |
4754 | /* don't set md_minor, it is determined by which /dev/md* was | 5038 | /* don't set md_minor, it is determined by which /dev/md* was |
4755 | * openned | 5039 | * openned |
@@ -4788,6 +5072,17 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info) | |||
4788 | return 0; | 5072 | return 0; |
4789 | } | 5073 | } |
4790 | 5074 | ||
5075 | void md_set_array_sectors(mddev_t *mddev, sector_t array_sectors) | ||
5076 | { | ||
5077 | WARN(!mddev_is_locked(mddev), "%s: unlocked mddev!\n", __func__); | ||
5078 | |||
5079 | if (mddev->external_size) | ||
5080 | return; | ||
5081 | |||
5082 | mddev->array_sectors = array_sectors; | ||
5083 | } | ||
5084 | EXPORT_SYMBOL(md_set_array_sectors); | ||
5085 | |||
4791 | static int update_size(mddev_t *mddev, sector_t num_sectors) | 5086 | static int update_size(mddev_t *mddev, sector_t num_sectors) |
4792 | { | 5087 | { |
4793 | mdk_rdev_t *rdev; | 5088 | mdk_rdev_t *rdev; |
@@ -4814,8 +5109,7 @@ static int update_size(mddev_t *mddev, sector_t num_sectors) | |||
4814 | */ | 5109 | */ |
4815 | return -EBUSY; | 5110 | return -EBUSY; |
4816 | list_for_each_entry(rdev, &mddev->disks, same_set) { | 5111 | list_for_each_entry(rdev, &mddev->disks, same_set) { |
4817 | sector_t avail; | 5112 | sector_t avail = rdev->sectors; |
4818 | avail = rdev->size * 2; | ||
4819 | 5113 | ||
4820 | if (fit && (num_sectors == 0 || num_sectors > avail)) | 5114 | if (fit && (num_sectors == 0 || num_sectors > avail)) |
4821 | num_sectors = avail; | 5115 | num_sectors = avail; |
@@ -4887,12 +5181,18 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info) | |||
4887 | ) | 5181 | ) |
4888 | return -EINVAL; | 5182 | return -EINVAL; |
4889 | /* Check there is only one change */ | 5183 | /* Check there is only one change */ |
4890 | if (info->size >= 0 && mddev->size != info->size) cnt++; | 5184 | if (info->size >= 0 && mddev->dev_sectors / 2 != info->size) |
4891 | if (mddev->raid_disks != info->raid_disks) cnt++; | 5185 | cnt++; |
4892 | if (mddev->layout != info->layout) cnt++; | 5186 | if (mddev->raid_disks != info->raid_disks) |
4893 | if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) cnt++; | 5187 | cnt++; |
4894 | if (cnt == 0) return 0; | 5188 | if (mddev->layout != info->layout) |
4895 | if (cnt > 1) return -EINVAL; | 5189 | cnt++; |
5190 | if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) | ||
5191 | cnt++; | ||
5192 | if (cnt == 0) | ||
5193 | return 0; | ||
5194 | if (cnt > 1) | ||
5195 | return -EINVAL; | ||
4896 | 5196 | ||
4897 | if (mddev->layout != info->layout) { | 5197 | if (mddev->layout != info->layout) { |
4898 | /* Change layout | 5198 | /* Change layout |
@@ -4904,7 +5204,7 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info) | |||
4904 | else | 5204 | else |
4905 | return mddev->pers->reconfig(mddev, info->layout, -1); | 5205 | return mddev->pers->reconfig(mddev, info->layout, -1); |
4906 | } | 5206 | } |
4907 | if (info->size >= 0 && mddev->size != info->size) | 5207 | if (info->size >= 0 && mddev->dev_sectors / 2 != info->size) |
4908 | rv = update_size(mddev, (sector_t)info->size * 2); | 5208 | rv = update_size(mddev, (sector_t)info->size * 2); |
4909 | 5209 | ||
4910 | if (mddev->raid_disks != info->raid_disks) | 5210 | if (mddev->raid_disks != info->raid_disks) |
@@ -5331,6 +5631,8 @@ mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev, | |||
5331 | 5631 | ||
5332 | void md_unregister_thread(mdk_thread_t *thread) | 5632 | void md_unregister_thread(mdk_thread_t *thread) |
5333 | { | 5633 | { |
5634 | if (!thread) | ||
5635 | return; | ||
5334 | dprintk("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk)); | 5636 | dprintk("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk)); |
5335 | 5637 | ||
5336 | kthread_stop(thread->tsk); | 5638 | kthread_stop(thread->tsk); |
@@ -5404,7 +5706,7 @@ static void status_resync(struct seq_file *seq, mddev_t * mddev) | |||
5404 | if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) | 5706 | if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) |
5405 | max_blocks = mddev->resync_max_sectors >> 1; | 5707 | max_blocks = mddev->resync_max_sectors >> 1; |
5406 | else | 5708 | else |
5407 | max_blocks = mddev->size; | 5709 | max_blocks = mddev->dev_sectors / 2; |
5408 | 5710 | ||
5409 | /* | 5711 | /* |
5410 | * Should not happen. | 5712 | * Should not happen. |
@@ -5537,7 +5839,7 @@ struct mdstat_info { | |||
5537 | static int md_seq_show(struct seq_file *seq, void *v) | 5839 | static int md_seq_show(struct seq_file *seq, void *v) |
5538 | { | 5840 | { |
5539 | mddev_t *mddev = v; | 5841 | mddev_t *mddev = v; |
5540 | sector_t size; | 5842 | sector_t sectors; |
5541 | mdk_rdev_t *rdev; | 5843 | mdk_rdev_t *rdev; |
5542 | struct mdstat_info *mi = seq->private; | 5844 | struct mdstat_info *mi = seq->private; |
5543 | struct bitmap *bitmap; | 5845 | struct bitmap *bitmap; |
@@ -5573,7 +5875,7 @@ static int md_seq_show(struct seq_file *seq, void *v) | |||
5573 | seq_printf(seq, " %s", mddev->pers->name); | 5875 | seq_printf(seq, " %s", mddev->pers->name); |
5574 | } | 5876 | } |
5575 | 5877 | ||
5576 | size = 0; | 5878 | sectors = 0; |
5577 | list_for_each_entry(rdev, &mddev->disks, same_set) { | 5879 | list_for_each_entry(rdev, &mddev->disks, same_set) { |
5578 | char b[BDEVNAME_SIZE]; | 5880 | char b[BDEVNAME_SIZE]; |
5579 | seq_printf(seq, " %s[%d]", | 5881 | seq_printf(seq, " %s[%d]", |
@@ -5585,7 +5887,7 @@ static int md_seq_show(struct seq_file *seq, void *v) | |||
5585 | continue; | 5887 | continue; |
5586 | } else if (rdev->raid_disk < 0) | 5888 | } else if (rdev->raid_disk < 0) |
5587 | seq_printf(seq, "(S)"); /* spare */ | 5889 | seq_printf(seq, "(S)"); /* spare */ |
5588 | size += rdev->size; | 5890 | sectors += rdev->sectors; |
5589 | } | 5891 | } |
5590 | 5892 | ||
5591 | if (!list_empty(&mddev->disks)) { | 5893 | if (!list_empty(&mddev->disks)) { |
@@ -5595,7 +5897,7 @@ static int md_seq_show(struct seq_file *seq, void *v) | |||
5595 | mddev->array_sectors / 2); | 5897 | mddev->array_sectors / 2); |
5596 | else | 5898 | else |
5597 | seq_printf(seq, "\n %llu blocks", | 5899 | seq_printf(seq, "\n %llu blocks", |
5598 | (unsigned long long)size); | 5900 | (unsigned long long)sectors / 2); |
5599 | } | 5901 | } |
5600 | if (mddev->persistent) { | 5902 | if (mddev->persistent) { |
5601 | if (mddev->major_version != 0 || | 5903 | if (mddev->major_version != 0 || |
@@ -5722,19 +6024,19 @@ int unregister_md_personality(struct mdk_personality *p) | |||
5722 | return 0; | 6024 | return 0; |
5723 | } | 6025 | } |
5724 | 6026 | ||
5725 | static int is_mddev_idle(mddev_t *mddev) | 6027 | static int is_mddev_idle(mddev_t *mddev, int init) |
5726 | { | 6028 | { |
5727 | mdk_rdev_t * rdev; | 6029 | mdk_rdev_t * rdev; |
5728 | int idle; | 6030 | int idle; |
5729 | long curr_events; | 6031 | int curr_events; |
5730 | 6032 | ||
5731 | idle = 1; | 6033 | idle = 1; |
5732 | rcu_read_lock(); | 6034 | rcu_read_lock(); |
5733 | rdev_for_each_rcu(rdev, mddev) { | 6035 | rdev_for_each_rcu(rdev, mddev) { |
5734 | struct gendisk *disk = rdev->bdev->bd_contains->bd_disk; | 6036 | struct gendisk *disk = rdev->bdev->bd_contains->bd_disk; |
5735 | curr_events = part_stat_read(&disk->part0, sectors[0]) + | 6037 | curr_events = (int)part_stat_read(&disk->part0, sectors[0]) + |
5736 | part_stat_read(&disk->part0, sectors[1]) - | 6038 | (int)part_stat_read(&disk->part0, sectors[1]) - |
5737 | atomic_read(&disk->sync_io); | 6039 | atomic_read(&disk->sync_io); |
5738 | /* sync IO will cause sync_io to increase before the disk_stats | 6040 | /* sync IO will cause sync_io to increase before the disk_stats |
5739 | * as sync_io is counted when a request starts, and | 6041 | * as sync_io is counted when a request starts, and |
5740 | * disk_stats is counted when it completes. | 6042 | * disk_stats is counted when it completes. |
@@ -5757,7 +6059,7 @@ static int is_mddev_idle(mddev_t *mddev) | |||
5757 | * always make curr_events less than last_events. | 6059 | * always make curr_events less than last_events. |
5758 | * | 6060 | * |
5759 | */ | 6061 | */ |
5760 | if (curr_events - rdev->last_events > 4096) { | 6062 | if (init || curr_events - rdev->last_events > 64) { |
5761 | rdev->last_events = curr_events; | 6063 | rdev->last_events = curr_events; |
5762 | idle = 0; | 6064 | idle = 0; |
5763 | } | 6065 | } |
@@ -5980,10 +6282,10 @@ void md_do_sync(mddev_t *mddev) | |||
5980 | j = mddev->recovery_cp; | 6282 | j = mddev->recovery_cp; |
5981 | 6283 | ||
5982 | } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) | 6284 | } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) |
5983 | max_sectors = mddev->size << 1; | 6285 | max_sectors = mddev->dev_sectors; |
5984 | else { | 6286 | else { |
5985 | /* recovery follows the physical size of devices */ | 6287 | /* recovery follows the physical size of devices */ |
5986 | max_sectors = mddev->size << 1; | 6288 | max_sectors = mddev->dev_sectors; |
5987 | j = MaxSector; | 6289 | j = MaxSector; |
5988 | list_for_each_entry(rdev, &mddev->disks, same_set) | 6290 | list_for_each_entry(rdev, &mddev->disks, same_set) |
5989 | if (rdev->raid_disk >= 0 && | 6291 | if (rdev->raid_disk >= 0 && |
@@ -6000,7 +6302,7 @@ void md_do_sync(mddev_t *mddev) | |||
6000 | "(but not more than %d KB/sec) for %s.\n", | 6302 | "(but not more than %d KB/sec) for %s.\n", |
6001 | speed_max(mddev), desc); | 6303 | speed_max(mddev), desc); |
6002 | 6304 | ||
6003 | is_mddev_idle(mddev); /* this also initializes IO event counters */ | 6305 | is_mddev_idle(mddev, 1); /* this initializes IO event counters */ |
6004 | 6306 | ||
6005 | io_sectors = 0; | 6307 | io_sectors = 0; |
6006 | for (m = 0; m < SYNC_MARKS; m++) { | 6308 | for (m = 0; m < SYNC_MARKS; m++) { |
@@ -6040,6 +6342,18 @@ void md_do_sync(mddev_t *mddev) | |||
6040 | } | 6342 | } |
6041 | if (kthread_should_stop()) | 6343 | if (kthread_should_stop()) |
6042 | goto interrupted; | 6344 | goto interrupted; |
6345 | |||
6346 | if (mddev->curr_resync > mddev->curr_resync_completed && | ||
6347 | (mddev->curr_resync - mddev->curr_resync_completed) | ||
6348 | > (max_sectors >> 4)) { | ||
6349 | /* time to update curr_resync_completed */ | ||
6350 | blk_unplug(mddev->queue); | ||
6351 | wait_event(mddev->recovery_wait, | ||
6352 | atomic_read(&mddev->recovery_active) == 0); | ||
6353 | mddev->curr_resync_completed = | ||
6354 | mddev->curr_resync; | ||
6355 | set_bit(MD_CHANGE_CLEAN, &mddev->flags); | ||
6356 | } | ||
6043 | sectors = mddev->pers->sync_request(mddev, j, &skipped, | 6357 | sectors = mddev->pers->sync_request(mddev, j, &skipped, |
6044 | currspeed < speed_min(mddev)); | 6358 | currspeed < speed_min(mddev)); |
6045 | if (sectors == 0) { | 6359 | if (sectors == 0) { |
@@ -6102,7 +6416,7 @@ void md_do_sync(mddev_t *mddev) | |||
6102 | 6416 | ||
6103 | if (currspeed > speed_min(mddev)) { | 6417 | if (currspeed > speed_min(mddev)) { |
6104 | if ((currspeed > speed_max(mddev)) || | 6418 | if ((currspeed > speed_max(mddev)) || |
6105 | !is_mddev_idle(mddev)) { | 6419 | !is_mddev_idle(mddev, 0)) { |
6106 | msleep(500); | 6420 | msleep(500); |
6107 | goto repeat; | 6421 | goto repeat; |
6108 | } | 6422 | } |
@@ -6173,6 +6487,8 @@ static int remove_and_add_spares(mddev_t *mddev) | |||
6173 | mdk_rdev_t *rdev; | 6487 | mdk_rdev_t *rdev; |
6174 | int spares = 0; | 6488 | int spares = 0; |
6175 | 6489 | ||
6490 | mddev->curr_resync_completed = 0; | ||
6491 | |||
6176 | list_for_each_entry(rdev, &mddev->disks, same_set) | 6492 | list_for_each_entry(rdev, &mddev->disks, same_set) |
6177 | if (rdev->raid_disk >= 0 && | 6493 | if (rdev->raid_disk >= 0 && |
6178 | !test_bit(Blocked, &rdev->flags) && | 6494 | !test_bit(Blocked, &rdev->flags) && |
@@ -6327,6 +6643,9 @@ void md_check_recovery(mddev_t *mddev) | |||
6327 | sysfs_notify(&mddev->kobj, NULL, | 6643 | sysfs_notify(&mddev->kobj, NULL, |
6328 | "degraded"); | 6644 | "degraded"); |
6329 | } | 6645 | } |
6646 | if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && | ||
6647 | mddev->pers->finish_reshape) | ||
6648 | mddev->pers->finish_reshape(mddev); | ||
6330 | md_update_sb(mddev, 1); | 6649 | md_update_sb(mddev, 1); |
6331 | 6650 | ||
6332 | /* if array is no-longer degraded, then any saved_raid_disk | 6651 | /* if array is no-longer degraded, then any saved_raid_disk |
@@ -6470,13 +6789,13 @@ static void md_geninit(void) | |||
6470 | 6789 | ||
6471 | static int __init md_init(void) | 6790 | static int __init md_init(void) |
6472 | { | 6791 | { |
6473 | if (register_blkdev(MAJOR_NR, "md")) | 6792 | if (register_blkdev(MD_MAJOR, "md")) |
6474 | return -1; | 6793 | return -1; |
6475 | if ((mdp_major=register_blkdev(0, "mdp"))<=0) { | 6794 | if ((mdp_major=register_blkdev(0, "mdp"))<=0) { |
6476 | unregister_blkdev(MAJOR_NR, "md"); | 6795 | unregister_blkdev(MD_MAJOR, "md"); |
6477 | return -1; | 6796 | return -1; |
6478 | } | 6797 | } |
6479 | blk_register_region(MKDEV(MAJOR_NR, 0), 1UL<<MINORBITS, THIS_MODULE, | 6798 | blk_register_region(MKDEV(MD_MAJOR, 0), 1UL<<MINORBITS, THIS_MODULE, |
6480 | md_probe, NULL, NULL); | 6799 | md_probe, NULL, NULL); |
6481 | blk_register_region(MKDEV(mdp_major, 0), 1UL<<MINORBITS, THIS_MODULE, | 6800 | blk_register_region(MKDEV(mdp_major, 0), 1UL<<MINORBITS, THIS_MODULE, |
6482 | md_probe, NULL, NULL); | 6801 | md_probe, NULL, NULL); |
@@ -6562,10 +6881,10 @@ static __exit void md_exit(void) | |||
6562 | mddev_t *mddev; | 6881 | mddev_t *mddev; |
6563 | struct list_head *tmp; | 6882 | struct list_head *tmp; |
6564 | 6883 | ||
6565 | blk_unregister_region(MKDEV(MAJOR_NR,0), 1U << MINORBITS); | 6884 | blk_unregister_region(MKDEV(MD_MAJOR,0), 1U << MINORBITS); |
6566 | blk_unregister_region(MKDEV(mdp_major,0), 1U << MINORBITS); | 6885 | blk_unregister_region(MKDEV(mdp_major,0), 1U << MINORBITS); |
6567 | 6886 | ||
6568 | unregister_blkdev(MAJOR_NR,"md"); | 6887 | unregister_blkdev(MD_MAJOR,"md"); |
6569 | unregister_blkdev(mdp_major, "mdp"); | 6888 | unregister_blkdev(mdp_major, "mdp"); |
6570 | unregister_reboot_notifier(&md_notifier); | 6889 | unregister_reboot_notifier(&md_notifier); |
6571 | unregister_sysctl_table(raid_table_header); | 6890 | unregister_sysctl_table(raid_table_header); |
diff --git a/drivers/md/md.h b/drivers/md/md.h new file mode 100644 index 000000000000..e9b7f54c24d6 --- /dev/null +++ b/drivers/md/md.h | |||
@@ -0,0 +1,436 @@ | |||
1 | /* | ||
2 | md_k.h : kernel internal structure of the Linux MD driver | ||
3 | Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman | ||
4 | |||
5 | This program is free software; you can redistribute it and/or modify | ||
6 | it under the terms of the GNU General Public License as published by | ||
7 | the Free Software Foundation; either version 2, or (at your option) | ||
8 | any later version. | ||
9 | |||
10 | You should have received a copy of the GNU General Public License | ||
11 | (for example /usr/src/linux/COPYING); if not, write to the Free | ||
12 | Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | ||
13 | */ | ||
14 | |||
15 | #ifndef _MD_K_H | ||
16 | #define _MD_K_H | ||
17 | |||
18 | #ifdef CONFIG_BLOCK | ||
19 | |||
20 | #define MaxSector (~(sector_t)0) | ||
21 | |||
22 | typedef struct mddev_s mddev_t; | ||
23 | typedef struct mdk_rdev_s mdk_rdev_t; | ||
24 | |||
25 | /* | ||
26 | * options passed in raidrun: | ||
27 | */ | ||
28 | |||
29 | /* Currently this must fit in an 'int' */ | ||
30 | #define MAX_CHUNK_SIZE (1<<30) | ||
31 | |||
32 | /* | ||
33 | * MD's 'extended' device | ||
34 | */ | ||
35 | struct mdk_rdev_s | ||
36 | { | ||
37 | struct list_head same_set; /* RAID devices within the same set */ | ||
38 | |||
39 | sector_t sectors; /* Device size (in 512bytes sectors) */ | ||
40 | mddev_t *mddev; /* RAID array if running */ | ||
41 | int last_events; /* IO event timestamp */ | ||
42 | |||
43 | struct block_device *bdev; /* block device handle */ | ||
44 | |||
45 | struct page *sb_page; | ||
46 | int sb_loaded; | ||
47 | __u64 sb_events; | ||
48 | sector_t data_offset; /* start of data in array */ | ||
49 | sector_t sb_start; /* offset of the super block (in 512byte sectors) */ | ||
50 | int sb_size; /* bytes in the superblock */ | ||
51 | int preferred_minor; /* autorun support */ | ||
52 | |||
53 | struct kobject kobj; | ||
54 | |||
55 | /* A device can be in one of three states based on two flags: | ||
56 | * Not working: faulty==1 in_sync==0 | ||
57 | * Fully working: faulty==0 in_sync==1 | ||
58 | * Working, but not | ||
59 | * in sync with array | ||
60 | * faulty==0 in_sync==0 | ||
61 | * | ||
62 | * It can never have faulty==1, in_sync==1 | ||
63 | * This reduces the burden of testing multiple flags in many cases | ||
64 | */ | ||
65 | |||
66 | unsigned long flags; | ||
67 | #define Faulty 1 /* device is known to have a fault */ | ||
68 | #define In_sync 2 /* device is in_sync with rest of array */ | ||
69 | #define WriteMostly 4 /* Avoid reading if at all possible */ | ||
70 | #define BarriersNotsupp 5 /* BIO_RW_BARRIER is not supported */ | ||
71 | #define AllReserved 6 /* If whole device is reserved for | ||
72 | * one array */ | ||
73 | #define AutoDetected 7 /* added by auto-detect */ | ||
74 | #define Blocked 8 /* An error occurred on an externally | ||
75 | * managed array, don't allow writes | ||
76 | * until it is cleared */ | ||
77 | #define StateChanged 9 /* Faulty or Blocked has changed during | ||
78 | * interrupt, so it needs to be | ||
79 | * notified by the thread */ | ||
80 | wait_queue_head_t blocked_wait; | ||
81 | |||
82 | int desc_nr; /* descriptor index in the superblock */ | ||
83 | int raid_disk; /* role of device in array */ | ||
84 | int saved_raid_disk; /* role that device used to have in the | ||
85 | * array and could again if we did a partial | ||
86 | * resync from the bitmap | ||
87 | */ | ||
88 | sector_t recovery_offset;/* If this device has been partially | ||
89 | * recovered, this is where we were | ||
90 | * up to. | ||
91 | */ | ||
92 | |||
93 | atomic_t nr_pending; /* number of pending requests. | ||
94 | * only maintained for arrays that | ||
95 | * support hot removal | ||
96 | */ | ||
97 | atomic_t read_errors; /* number of consecutive read errors that | ||
98 | * we have tried to ignore. | ||
99 | */ | ||
100 | atomic_t corrected_errors; /* number of corrected read errors, | ||
101 | * for reporting to userspace and storing | ||
102 | * in superblock. | ||
103 | */ | ||
104 | struct work_struct del_work; /* used for delayed sysfs removal */ | ||
105 | |||
106 | struct sysfs_dirent *sysfs_state; /* handle for 'state' | ||
107 | * sysfs entry */ | ||
108 | }; | ||
109 | |||
110 | struct mddev_s | ||
111 | { | ||
112 | void *private; | ||
113 | struct mdk_personality *pers; | ||
114 | dev_t unit; | ||
115 | int md_minor; | ||
116 | struct list_head disks; | ||
117 | unsigned long flags; | ||
118 | #define MD_CHANGE_DEVS 0 /* Some device status has changed */ | ||
119 | #define MD_CHANGE_CLEAN 1 /* transition to or from 'clean' */ | ||
120 | #define MD_CHANGE_PENDING 2 /* superblock update in progress */ | ||
121 | |||
122 | int suspended; | ||
123 | atomic_t active_io; | ||
124 | int ro; | ||
125 | |||
126 | struct gendisk *gendisk; | ||
127 | |||
128 | struct kobject kobj; | ||
129 | int hold_active; | ||
130 | #define UNTIL_IOCTL 1 | ||
131 | #define UNTIL_STOP 2 | ||
132 | |||
133 | /* Superblock information */ | ||
134 | int major_version, | ||
135 | minor_version, | ||
136 | patch_version; | ||
137 | int persistent; | ||
138 | int external; /* metadata is | ||
139 | * managed externally */ | ||
140 | char metadata_type[17]; /* externally set*/ | ||
141 | int chunk_size; | ||
142 | time_t ctime, utime; | ||
143 | int level, layout; | ||
144 | char clevel[16]; | ||
145 | int raid_disks; | ||
146 | int max_disks; | ||
147 | sector_t dev_sectors; /* used size of | ||
148 | * component devices */ | ||
149 | sector_t array_sectors; /* exported array size */ | ||
150 | int external_size; /* size managed | ||
151 | * externally */ | ||
152 | __u64 events; | ||
153 | |||
154 | char uuid[16]; | ||
155 | |||
156 | /* If the array is being reshaped, we need to record the | ||
157 | * new shape and an indication of where we are up to. | ||
158 | * This is written to the superblock. | ||
159 | * If reshape_position is MaxSector, then no reshape is happening (yet). | ||
160 | */ | ||
161 | sector_t reshape_position; | ||
162 | int delta_disks, new_level, new_layout, new_chunk; | ||
163 | |||
164 | struct mdk_thread_s *thread; /* management thread */ | ||
165 | struct mdk_thread_s *sync_thread; /* doing resync or reconstruct */ | ||
166 | sector_t curr_resync; /* last block scheduled */ | ||
167 | /* As resync requests can complete out of order, we cannot easily track | ||
168 | * how much resync has been completed. So we occasionally pause until | ||
169 | * everything completes, then set curr_resync_completed to curr_resync. | ||
170 | * As such it may be well behind the real resync mark, but it is a value | ||
171 | * we are certain of. | ||
172 | */ | ||
173 | sector_t curr_resync_completed; | ||
174 | unsigned long resync_mark; /* a recent timestamp */ | ||
175 | sector_t resync_mark_cnt;/* blocks written at resync_mark */ | ||
176 | sector_t curr_mark_cnt; /* blocks scheduled now */ | ||
177 | |||
178 | sector_t resync_max_sectors; /* may be set by personality */ | ||
179 | |||
180 | sector_t resync_mismatches; /* count of sectors where | ||
181 | * parity/replica mismatch found | ||
182 | */ | ||
183 | |||
184 | /* allow user-space to request suspension of IO to regions of the array */ | ||
185 | sector_t suspend_lo; | ||
186 | sector_t suspend_hi; | ||
187 | /* if zero, use the system-wide default */ | ||
188 | int sync_speed_min; | ||
189 | int sync_speed_max; | ||
190 | |||
191 | /* resync even though the same disks are shared among md-devices */ | ||
192 | int parallel_resync; | ||
193 | |||
194 | int ok_start_degraded; | ||
195 | /* recovery/resync flags | ||
196 | * NEEDED: we might need to start a resync/recover | ||
197 | * RUNNING: a thread is running, or about to be started | ||
198 | * SYNC: actually doing a resync, not a recovery | ||
199 | * RECOVER: doing recovery, or need to try it. | ||
200 | * INTR: resync needs to be aborted for some reason | ||
201 | * DONE: thread is done and is waiting to be reaped | ||
202 | * REQUEST: user-space has requested a sync (used with SYNC) | ||
203 | * CHECK: user-space request for check-only, no repair | ||
204 | * RESHAPE: A reshape is happening | ||
205 | * | ||
206 | * If neither SYNC nor RESHAPE is set, then it is a recovery. | ||
207 | */ | ||
208 | #define MD_RECOVERY_RUNNING 0 | ||
209 | #define MD_RECOVERY_SYNC 1 | ||
210 | #define MD_RECOVERY_RECOVER 2 | ||
211 | #define MD_RECOVERY_INTR 3 | ||
212 | #define MD_RECOVERY_DONE 4 | ||
213 | #define MD_RECOVERY_NEEDED 5 | ||
214 | #define MD_RECOVERY_REQUESTED 6 | ||
215 | #define MD_RECOVERY_CHECK 7 | ||
216 | #define MD_RECOVERY_RESHAPE 8 | ||
217 | #define MD_RECOVERY_FROZEN 9 | ||
218 | |||
219 | unsigned long recovery; | ||
220 | int recovery_disabled; /* if we detect that recovery | ||
221 | * will always fail, set this | ||
222 | * so we don't loop trying */ | ||
223 | |||
224 | int in_sync; /* known to not need resync */ | ||
225 | struct mutex reconfig_mutex; | ||
226 | atomic_t active; /* general refcount */ | ||
227 | atomic_t openers; /* number of active opens */ | ||
228 | |||
229 | int changed; /* true if we might need to reread partition info */ | ||
230 | int degraded; /* whether md should consider | ||
231 | * adding a spare | ||
232 | */ | ||
233 | int barriers_work; /* initialised to true, cleared as soon | ||
234 | * as a barrier request to slave | ||
235 | * fails. Only supported | ||
236 | */ | ||
237 | struct bio *biolist; /* bios that need to be retried | ||
238 | * because BIO_RW_BARRIER is not supported | ||
239 | */ | ||
240 | |||
241 | atomic_t recovery_active; /* blocks scheduled, but not written */ | ||
242 | wait_queue_head_t recovery_wait; | ||
243 | sector_t recovery_cp; | ||
244 | sector_t resync_min; /* user requested sync | ||
245 | * starts here */ | ||
246 | sector_t resync_max; /* resync should pause | ||
247 | * when it gets here */ | ||
248 | |||
249 | struct sysfs_dirent *sysfs_state; /* handle for 'array_state' | ||
250 | * file in sysfs. | ||
251 | */ | ||
252 | struct sysfs_dirent *sysfs_action; /* handle for 'sync_action' */ | ||
253 | |||
254 | struct work_struct del_work; /* used for delayed sysfs removal */ | ||
255 | |||
256 | spinlock_t write_lock; | ||
257 | wait_queue_head_t sb_wait; /* for waiting on superblock updates */ | ||
258 | atomic_t pending_writes; /* number of active superblock writes */ | ||
259 | |||
260 | unsigned int safemode; /* if set, update "clean" superblock | ||
261 | * when no writes pending. | ||
262 | */ | ||
263 | unsigned int safemode_delay; | ||
264 | struct timer_list safemode_timer; | ||
265 | atomic_t writes_pending; | ||
266 | struct request_queue *queue; /* for plugging ... */ | ||
267 | |||
268 | atomic_t write_behind; /* outstanding async IO */ | ||
269 | unsigned int max_write_behind; /* 0 = sync */ | ||
270 | |||
271 | struct bitmap *bitmap; /* the bitmap for the device */ | ||
272 | struct file *bitmap_file; /* the bitmap file */ | ||
273 | long bitmap_offset; /* offset from superblock of | ||
274 | * start of bitmap. May be | ||
275 | * negative, but not '0' | ||
276 | */ | ||
277 | long default_bitmap_offset; /* this is the offset to use when | ||
278 | * hot-adding a bitmap. It should | ||
279 | * eventually be settable by sysfs. | ||
280 | */ | ||
281 | |||
282 | struct list_head all_mddevs; | ||
283 | }; | ||
284 | |||
285 | |||
286 | static inline void rdev_dec_pending(mdk_rdev_t *rdev, mddev_t *mddev) | ||
287 | { | ||
288 | int faulty = test_bit(Faulty, &rdev->flags); | ||
289 | if (atomic_dec_and_test(&rdev->nr_pending) && faulty) | ||
290 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | ||
291 | } | ||
292 | |||
293 | static inline void md_sync_acct(struct block_device *bdev, unsigned long nr_sectors) | ||
294 | { | ||
295 | atomic_add(nr_sectors, &bdev->bd_contains->bd_disk->sync_io); | ||
296 | } | ||
297 | |||
298 | struct mdk_personality | ||
299 | { | ||
300 | char *name; | ||
301 | int level; | ||
302 | struct list_head list; | ||
303 | struct module *owner; | ||
304 | int (*make_request)(struct request_queue *q, struct bio *bio); | ||
305 | int (*run)(mddev_t *mddev); | ||
306 | int (*stop)(mddev_t *mddev); | ||
307 | void (*status)(struct seq_file *seq, mddev_t *mddev); | ||
308 | /* error_handler must set ->faulty and clear ->in_sync | ||
309 | * if appropriate, and should abort recovery if needed | ||
310 | */ | ||
311 | void (*error_handler)(mddev_t *mddev, mdk_rdev_t *rdev); | ||
312 | int (*hot_add_disk) (mddev_t *mddev, mdk_rdev_t *rdev); | ||
313 | int (*hot_remove_disk) (mddev_t *mddev, int number); | ||
314 | int (*spare_active) (mddev_t *mddev); | ||
315 | sector_t (*sync_request)(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster); | ||
316 | int (*resize) (mddev_t *mddev, sector_t sectors); | ||
317 | sector_t (*size) (mddev_t *mddev, sector_t sectors, int raid_disks); | ||
318 | int (*check_reshape) (mddev_t *mddev); | ||
319 | int (*start_reshape) (mddev_t *mddev); | ||
320 | void (*finish_reshape) (mddev_t *mddev); | ||
321 | int (*reconfig) (mddev_t *mddev, int layout, int chunk_size); | ||
322 | /* quiesce moves between quiescence states | ||
323 | * 0 - fully active | ||
324 | * 1 - no new requests allowed | ||
325 | * others - reserved | ||
326 | */ | ||
327 | void (*quiesce) (mddev_t *mddev, int state); | ||
328 | /* takeover is used to transition an array from one | ||
329 | * personality to another. The new personality must be able | ||
330 | * to handle the data in the current layout. | ||
331 | * e.g. 2drive raid1 -> 2drive raid5 | ||
332 | * ndrive raid5 -> degraded n+1drive raid6 with special layout | ||
333 | * If the takeover succeeds, a new 'private' structure is returned. | ||
334 | * This needs to be installed and then ->run used to activate the | ||
335 | * array. | ||
336 | */ | ||
337 | void *(*takeover) (mddev_t *mddev); | ||
338 | }; | ||
339 | |||
340 | |||
341 | struct md_sysfs_entry { | ||
342 | struct attribute attr; | ||
343 | ssize_t (*show)(mddev_t *, char *); | ||
344 | ssize_t (*store)(mddev_t *, const char *, size_t); | ||
345 | }; | ||
346 | |||
347 | |||
348 | static inline char * mdname (mddev_t * mddev) | ||
349 | { | ||
350 | return mddev->gendisk ? mddev->gendisk->disk_name : "mdX"; | ||
351 | } | ||
352 | |||
353 | /* | ||
354 | * iterates through some rdev ringlist. It's safe to remove the | ||
355 | * current 'rdev'. Don't touch 'tmp' though. | ||
356 | */ | ||
357 | #define rdev_for_each_list(rdev, tmp, head) \ | ||
358 | list_for_each_entry_safe(rdev, tmp, head, same_set) | ||
359 | |||
360 | /* | ||
361 | * iterates through the 'same array disks' ringlist | ||
362 | */ | ||
363 | #define rdev_for_each(rdev, tmp, mddev) \ | ||
364 | list_for_each_entry_safe(rdev, tmp, &((mddev)->disks), same_set) | ||
365 | |||
366 | #define rdev_for_each_rcu(rdev, mddev) \ | ||
367 | list_for_each_entry_rcu(rdev, &((mddev)->disks), same_set) | ||
368 | |||
369 | typedef struct mdk_thread_s { | ||
370 | void (*run) (mddev_t *mddev); | ||
371 | mddev_t *mddev; | ||
372 | wait_queue_head_t wqueue; | ||
373 | unsigned long flags; | ||
374 | struct task_struct *tsk; | ||
375 | unsigned long timeout; | ||
376 | } mdk_thread_t; | ||
377 | |||
378 | #define THREAD_WAKEUP 0 | ||
379 | |||
380 | #define __wait_event_lock_irq(wq, condition, lock, cmd) \ | ||
381 | do { \ | ||
382 | wait_queue_t __wait; \ | ||
383 | init_waitqueue_entry(&__wait, current); \ | ||
384 | \ | ||
385 | add_wait_queue(&wq, &__wait); \ | ||
386 | for (;;) { \ | ||
387 | set_current_state(TASK_UNINTERRUPTIBLE); \ | ||
388 | if (condition) \ | ||
389 | break; \ | ||
390 | spin_unlock_irq(&lock); \ | ||
391 | cmd; \ | ||
392 | schedule(); \ | ||
393 | spin_lock_irq(&lock); \ | ||
394 | } \ | ||
395 | current->state = TASK_RUNNING; \ | ||
396 | remove_wait_queue(&wq, &__wait); \ | ||
397 | } while (0) | ||
398 | |||
399 | #define wait_event_lock_irq(wq, condition, lock, cmd) \ | ||
400 | do { \ | ||
401 | if (condition) \ | ||
402 | break; \ | ||
403 | __wait_event_lock_irq(wq, condition, lock, cmd); \ | ||
404 | } while (0) | ||
405 | |||
406 | static inline void safe_put_page(struct page *p) | ||
407 | { | ||
408 | if (p) put_page(p); | ||
409 | } | ||
410 | |||
411 | #endif /* CONFIG_BLOCK */ | ||
412 | #endif | ||
413 | |||
414 | |||
415 | extern int register_md_personality(struct mdk_personality *p); | ||
416 | extern int unregister_md_personality(struct mdk_personality *p); | ||
417 | extern mdk_thread_t * md_register_thread(void (*run) (mddev_t *mddev), | ||
418 | mddev_t *mddev, const char *name); | ||
419 | extern void md_unregister_thread(mdk_thread_t *thread); | ||
420 | extern void md_wakeup_thread(mdk_thread_t *thread); | ||
421 | extern void md_check_recovery(mddev_t *mddev); | ||
422 | extern void md_write_start(mddev_t *mddev, struct bio *bi); | ||
423 | extern void md_write_end(mddev_t *mddev); | ||
424 | extern void md_done_sync(mddev_t *mddev, int blocks, int ok); | ||
425 | extern void md_error(mddev_t *mddev, mdk_rdev_t *rdev); | ||
426 | |||
427 | extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, | ||
428 | sector_t sector, int size, struct page *page); | ||
429 | extern void md_super_wait(mddev_t *mddev); | ||
430 | extern int sync_page_io(struct block_device *bdev, sector_t sector, int size, | ||
431 | struct page *page, int rw); | ||
432 | extern void md_do_sync(mddev_t *mddev); | ||
433 | extern void md_new_event(mddev_t *mddev); | ||
434 | extern int md_allow_write(mddev_t *mddev); | ||
435 | extern void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev); | ||
436 | extern void md_set_array_sectors(mddev_t *mddev, sector_t array_sectors); | ||
diff --git a/drivers/md/mktables.c b/drivers/md/mktables.c index b61d5767aae7..3b1500843bba 100644 --- a/drivers/md/mktables.c +++ b/drivers/md/mktables.c | |||
@@ -59,7 +59,7 @@ int main(int argc, char *argv[]) | |||
59 | uint8_t v; | 59 | uint8_t v; |
60 | uint8_t exptbl[256], invtbl[256]; | 60 | uint8_t exptbl[256], invtbl[256]; |
61 | 61 | ||
62 | printf("#include \"raid6.h\"\n"); | 62 | printf("#include <linux/raid/pq.h>\n"); |
63 | 63 | ||
64 | /* Compute multiplication table */ | 64 | /* Compute multiplication table */ |
65 | printf("\nconst u8 __attribute__((aligned(256)))\n" | 65 | printf("\nconst u8 __attribute__((aligned(256)))\n" |
@@ -76,6 +76,9 @@ int main(int argc, char *argv[]) | |||
76 | printf("\t},\n"); | 76 | printf("\t},\n"); |
77 | } | 77 | } |
78 | printf("};\n"); | 78 | printf("};\n"); |
79 | printf("#ifdef __KERNEL__\n"); | ||
80 | printf("EXPORT_SYMBOL(raid6_gfmul);\n"); | ||
81 | printf("#endif\n"); | ||
79 | 82 | ||
80 | /* Compute power-of-2 table (exponent) */ | 83 | /* Compute power-of-2 table (exponent) */ |
81 | v = 1; | 84 | v = 1; |
@@ -92,6 +95,9 @@ int main(int argc, char *argv[]) | |||
92 | } | 95 | } |
93 | } | 96 | } |
94 | printf("};\n"); | 97 | printf("};\n"); |
98 | printf("#ifdef __KERNEL__\n"); | ||
99 | printf("EXPORT_SYMBOL(raid6_gfexp);\n"); | ||
100 | printf("#endif\n"); | ||
95 | 101 | ||
96 | /* Compute inverse table x^-1 == x^254 */ | 102 | /* Compute inverse table x^-1 == x^254 */ |
97 | printf("\nconst u8 __attribute__((aligned(256)))\n" | 103 | printf("\nconst u8 __attribute__((aligned(256)))\n" |
@@ -104,6 +110,9 @@ int main(int argc, char *argv[]) | |||
104 | } | 110 | } |
105 | } | 111 | } |
106 | printf("};\n"); | 112 | printf("};\n"); |
113 | printf("#ifdef __KERNEL__\n"); | ||
114 | printf("EXPORT_SYMBOL(raid6_gfinv);\n"); | ||
115 | printf("#endif\n"); | ||
107 | 116 | ||
108 | /* Compute inv(2^x + 1) (exponent-xor-inverse) table */ | 117 | /* Compute inv(2^x + 1) (exponent-xor-inverse) table */ |
109 | printf("\nconst u8 __attribute__((aligned(256)))\n" | 118 | printf("\nconst u8 __attribute__((aligned(256)))\n" |
@@ -115,6 +124,9 @@ int main(int argc, char *argv[]) | |||
115 | (j == 7) ? '\n' : ' '); | 124 | (j == 7) ? '\n' : ' '); |
116 | } | 125 | } |
117 | printf("};\n"); | 126 | printf("};\n"); |
127 | printf("#ifdef __KERNEL__\n"); | ||
128 | printf("EXPORT_SYMBOL(raid6_gfexi);\n"); | ||
129 | printf("#endif\n"); | ||
118 | 130 | ||
119 | return 0; | 131 | return 0; |
120 | } | 132 | } |
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index f6d08f241671..41ced0cbe823 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c | |||
@@ -19,7 +19,11 @@ | |||
19 | * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | 19 | * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. |
20 | */ | 20 | */ |
21 | 21 | ||
22 | #include <linux/raid/multipath.h> | 22 | #include <linux/blkdev.h> |
23 | #include <linux/raid/md_u.h> | ||
24 | #include <linux/seq_file.h> | ||
25 | #include "md.h" | ||
26 | #include "multipath.h" | ||
23 | 27 | ||
24 | #define MAX_WORK_PER_DISK 128 | 28 | #define MAX_WORK_PER_DISK 128 |
25 | 29 | ||
@@ -402,6 +406,14 @@ static void multipathd (mddev_t *mddev) | |||
402 | spin_unlock_irqrestore(&conf->device_lock, flags); | 406 | spin_unlock_irqrestore(&conf->device_lock, flags); |
403 | } | 407 | } |
404 | 408 | ||
409 | static sector_t multipath_size(mddev_t *mddev, sector_t sectors, int raid_disks) | ||
410 | { | ||
411 | WARN_ONCE(sectors || raid_disks, | ||
412 | "%s does not support generic reshape\n", __func__); | ||
413 | |||
414 | return mddev->dev_sectors; | ||
415 | } | ||
416 | |||
405 | static int multipath_run (mddev_t *mddev) | 417 | static int multipath_run (mddev_t *mddev) |
406 | { | 418 | { |
407 | multipath_conf_t *conf; | 419 | multipath_conf_t *conf; |
@@ -498,7 +510,7 @@ static int multipath_run (mddev_t *mddev) | |||
498 | /* | 510 | /* |
499 | * Ok, everything is just fine now | 511 | * Ok, everything is just fine now |
500 | */ | 512 | */ |
501 | mddev->array_sectors = mddev->size * 2; | 513 | md_set_array_sectors(mddev, multipath_size(mddev, 0, 0)); |
502 | 514 | ||
503 | mddev->queue->unplug_fn = multipath_unplug; | 515 | mddev->queue->unplug_fn = multipath_unplug; |
504 | mddev->queue->backing_dev_info.congested_fn = multipath_congested; | 516 | mddev->queue->backing_dev_info.congested_fn = multipath_congested; |
@@ -543,6 +555,7 @@ static struct mdk_personality multipath_personality = | |||
543 | .error_handler = multipath_error, | 555 | .error_handler = multipath_error, |
544 | .hot_add_disk = multipath_add_disk, | 556 | .hot_add_disk = multipath_add_disk, |
545 | .hot_remove_disk= multipath_remove_disk, | 557 | .hot_remove_disk= multipath_remove_disk, |
558 | .size = multipath_size, | ||
546 | }; | 559 | }; |
547 | 560 | ||
548 | static int __init multipath_init (void) | 561 | static int __init multipath_init (void) |
diff --git a/drivers/md/multipath.h b/drivers/md/multipath.h new file mode 100644 index 000000000000..6fa70b400cda --- /dev/null +++ b/drivers/md/multipath.h | |||
@@ -0,0 +1,40 @@ | |||
1 | #ifndef _MULTIPATH_H | ||
2 | #define _MULTIPATH_H | ||
3 | |||
4 | struct multipath_info { | ||
5 | mdk_rdev_t *rdev; | ||
6 | }; | ||
7 | |||
8 | struct multipath_private_data { | ||
9 | mddev_t *mddev; | ||
10 | struct multipath_info *multipaths; | ||
11 | int raid_disks; | ||
12 | int working_disks; | ||
13 | spinlock_t device_lock; | ||
14 | struct list_head retry_list; | ||
15 | |||
16 | mempool_t *pool; | ||
17 | }; | ||
18 | |||
19 | typedef struct multipath_private_data multipath_conf_t; | ||
20 | |||
21 | /* | ||
22 | * this is the only point in the RAID code where we violate | ||
23 | * C type safety. mddev->private is an 'opaque' pointer. | ||
24 | */ | ||
25 | #define mddev_to_conf(mddev) ((multipath_conf_t *) mddev->private) | ||
26 | |||
27 | /* | ||
28 | * this is our 'private' 'collective' MULTIPATH buffer head. | ||
29 | * it contains information about what kind of IO operations were started | ||
30 | * for this MULTIPATH operation, and about their status: | ||
31 | */ | ||
32 | |||
33 | struct multipath_bh { | ||
34 | mddev_t *mddev; | ||
35 | struct bio *master_bio; | ||
36 | struct bio bio; | ||
37 | int path; | ||
38 | struct list_head retry_list; | ||
39 | }; | ||
40 | #endif | ||
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index c605ba805586..c08d7559be55 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c | |||
@@ -18,7 +18,10 @@ | |||
18 | Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | 18 | Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. |
19 | */ | 19 | */ |
20 | 20 | ||
21 | #include <linux/raid/raid0.h> | 21 | #include <linux/blkdev.h> |
22 | #include <linux/seq_file.h> | ||
23 | #include "md.h" | ||
24 | #include "raid0.h" | ||
22 | 25 | ||
23 | static void raid0_unplug(struct request_queue *q) | 26 | static void raid0_unplug(struct request_queue *q) |
24 | { | 27 | { |
@@ -73,16 +76,15 @@ static int create_strip_zones (mddev_t *mddev) | |||
73 | list_for_each_entry(rdev2, &mddev->disks, same_set) { | 76 | list_for_each_entry(rdev2, &mddev->disks, same_set) { |
74 | printk(KERN_INFO "raid0: comparing %s(%llu)", | 77 | printk(KERN_INFO "raid0: comparing %s(%llu)", |
75 | bdevname(rdev1->bdev,b), | 78 | bdevname(rdev1->bdev,b), |
76 | (unsigned long long)rdev1->size); | 79 | (unsigned long long)rdev1->sectors); |
77 | printk(KERN_INFO " with %s(%llu)\n", | 80 | printk(KERN_INFO " with %s(%llu)\n", |
78 | bdevname(rdev2->bdev,b), | 81 | bdevname(rdev2->bdev,b), |
79 | (unsigned long long)rdev2->size); | 82 | (unsigned long long)rdev2->sectors); |
80 | if (rdev2 == rdev1) { | 83 | if (rdev2 == rdev1) { |
81 | printk(KERN_INFO "raid0: END\n"); | 84 | printk(KERN_INFO "raid0: END\n"); |
82 | break; | 85 | break; |
83 | } | 86 | } |
84 | if (rdev2->size == rdev1->size) | 87 | if (rdev2->sectors == rdev1->sectors) { |
85 | { | ||
86 | /* | 88 | /* |
87 | * Not unique, don't count it as a new | 89 | * Not unique, don't count it as a new |
88 | * group | 90 | * group |
@@ -145,7 +147,7 @@ static int create_strip_zones (mddev_t *mddev) | |||
145 | mddev->queue->max_sectors > (PAGE_SIZE>>9)) | 147 | mddev->queue->max_sectors > (PAGE_SIZE>>9)) |
146 | blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); | 148 | blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); |
147 | 149 | ||
148 | if (!smallest || (rdev1->size <smallest->size)) | 150 | if (!smallest || (rdev1->sectors < smallest->sectors)) |
149 | smallest = rdev1; | 151 | smallest = rdev1; |
150 | cnt++; | 152 | cnt++; |
151 | } | 153 | } |
@@ -155,10 +157,10 @@ static int create_strip_zones (mddev_t *mddev) | |||
155 | goto abort; | 157 | goto abort; |
156 | } | 158 | } |
157 | zone->nb_dev = cnt; | 159 | zone->nb_dev = cnt; |
158 | zone->sectors = smallest->size * cnt * 2; | 160 | zone->sectors = smallest->sectors * cnt; |
159 | zone->zone_start = 0; | 161 | zone->zone_start = 0; |
160 | 162 | ||
161 | current_start = smallest->size * 2; | 163 | current_start = smallest->sectors; |
162 | curr_zone_start = zone->sectors; | 164 | curr_zone_start = zone->sectors; |
163 | 165 | ||
164 | /* now do the other zones */ | 166 | /* now do the other zones */ |
@@ -177,29 +179,29 @@ static int create_strip_zones (mddev_t *mddev) | |||
177 | rdev = conf->strip_zone[0].dev[j]; | 179 | rdev = conf->strip_zone[0].dev[j]; |
178 | printk(KERN_INFO "raid0: checking %s ...", | 180 | printk(KERN_INFO "raid0: checking %s ...", |
179 | bdevname(rdev->bdev, b)); | 181 | bdevname(rdev->bdev, b)); |
180 | if (rdev->size > current_start / 2) { | 182 | if (rdev->sectors <= current_start) { |
181 | printk(KERN_INFO " contained as device %d\n", | ||
182 | c); | ||
183 | zone->dev[c] = rdev; | ||
184 | c++; | ||
185 | if (!smallest || (rdev->size <smallest->size)) { | ||
186 | smallest = rdev; | ||
187 | printk(KERN_INFO " (%llu) is smallest!.\n", | ||
188 | (unsigned long long)rdev->size); | ||
189 | } | ||
190 | } else | ||
191 | printk(KERN_INFO " nope.\n"); | 183 | printk(KERN_INFO " nope.\n"); |
184 | continue; | ||
185 | } | ||
186 | printk(KERN_INFO " contained as device %d\n", c); | ||
187 | zone->dev[c] = rdev; | ||
188 | c++; | ||
189 | if (!smallest || rdev->sectors < smallest->sectors) { | ||
190 | smallest = rdev; | ||
191 | printk(KERN_INFO " (%llu) is smallest!.\n", | ||
192 | (unsigned long long)rdev->sectors); | ||
193 | } | ||
192 | } | 194 | } |
193 | 195 | ||
194 | zone->nb_dev = c; | 196 | zone->nb_dev = c; |
195 | zone->sectors = (smallest->size * 2 - current_start) * c; | 197 | zone->sectors = (smallest->sectors - current_start) * c; |
196 | printk(KERN_INFO "raid0: zone->nb_dev: %d, sectors: %llu\n", | 198 | printk(KERN_INFO "raid0: zone->nb_dev: %d, sectors: %llu\n", |
197 | zone->nb_dev, (unsigned long long)zone->sectors); | 199 | zone->nb_dev, (unsigned long long)zone->sectors); |
198 | 200 | ||
199 | zone->zone_start = curr_zone_start; | 201 | zone->zone_start = curr_zone_start; |
200 | curr_zone_start += zone->sectors; | 202 | curr_zone_start += zone->sectors; |
201 | 203 | ||
202 | current_start = smallest->size * 2; | 204 | current_start = smallest->sectors; |
203 | printk(KERN_INFO "raid0: current zone start: %llu\n", | 205 | printk(KERN_INFO "raid0: current zone start: %llu\n", |
204 | (unsigned long long)current_start); | 206 | (unsigned long long)current_start); |
205 | } | 207 | } |
@@ -261,12 +263,25 @@ static int raid0_mergeable_bvec(struct request_queue *q, | |||
261 | return max; | 263 | return max; |
262 | } | 264 | } |
263 | 265 | ||
266 | static sector_t raid0_size(mddev_t *mddev, sector_t sectors, int raid_disks) | ||
267 | { | ||
268 | sector_t array_sectors = 0; | ||
269 | mdk_rdev_t *rdev; | ||
270 | |||
271 | WARN_ONCE(sectors || raid_disks, | ||
272 | "%s does not support generic reshape\n", __func__); | ||
273 | |||
274 | list_for_each_entry(rdev, &mddev->disks, same_set) | ||
275 | array_sectors += rdev->sectors; | ||
276 | |||
277 | return array_sectors; | ||
278 | } | ||
279 | |||
264 | static int raid0_run (mddev_t *mddev) | 280 | static int raid0_run (mddev_t *mddev) |
265 | { | 281 | { |
266 | unsigned cur=0, i=0, nb_zone; | 282 | unsigned cur=0, i=0, nb_zone; |
267 | s64 sectors; | 283 | s64 sectors; |
268 | raid0_conf_t *conf; | 284 | raid0_conf_t *conf; |
269 | mdk_rdev_t *rdev; | ||
270 | 285 | ||
271 | if (mddev->chunk_size == 0) { | 286 | if (mddev->chunk_size == 0) { |
272 | printk(KERN_ERR "md/raid0: non-zero chunk size required.\n"); | 287 | printk(KERN_ERR "md/raid0: non-zero chunk size required.\n"); |
@@ -291,16 +306,14 @@ static int raid0_run (mddev_t *mddev) | |||
291 | goto out_free_conf; | 306 | goto out_free_conf; |
292 | 307 | ||
293 | /* calculate array device size */ | 308 | /* calculate array device size */ |
294 | mddev->array_sectors = 0; | 309 | md_set_array_sectors(mddev, raid0_size(mddev, 0, 0)); |
295 | list_for_each_entry(rdev, &mddev->disks, same_set) | ||
296 | mddev->array_sectors += rdev->size * 2; | ||
297 | 310 | ||
298 | printk(KERN_INFO "raid0 : md_size is %llu sectors.\n", | 311 | printk(KERN_INFO "raid0 : md_size is %llu sectors.\n", |
299 | (unsigned long long)mddev->array_sectors); | 312 | (unsigned long long)mddev->array_sectors); |
300 | printk(KERN_INFO "raid0 : conf->spacing is %llu sectors.\n", | 313 | printk(KERN_INFO "raid0 : conf->spacing is %llu sectors.\n", |
301 | (unsigned long long)conf->spacing); | 314 | (unsigned long long)conf->spacing); |
302 | { | 315 | { |
303 | sector_t s = mddev->array_sectors; | 316 | sector_t s = raid0_size(mddev, 0, 0); |
304 | sector_t space = conf->spacing; | 317 | sector_t space = conf->spacing; |
305 | int round; | 318 | int round; |
306 | conf->sector_shift = 0; | 319 | conf->sector_shift = 0; |
@@ -509,6 +522,7 @@ static struct mdk_personality raid0_personality= | |||
509 | .run = raid0_run, | 522 | .run = raid0_run, |
510 | .stop = raid0_stop, | 523 | .stop = raid0_stop, |
511 | .status = raid0_status, | 524 | .status = raid0_status, |
525 | .size = raid0_size, | ||
512 | }; | 526 | }; |
513 | 527 | ||
514 | static int __init raid0_init (void) | 528 | static int __init raid0_init (void) |
diff --git a/drivers/md/raid0.h b/drivers/md/raid0.h new file mode 100644 index 000000000000..824b12eb1d4f --- /dev/null +++ b/drivers/md/raid0.h | |||
@@ -0,0 +1,28 @@ | |||
1 | #ifndef _RAID0_H | ||
2 | #define _RAID0_H | ||
3 | |||
4 | struct strip_zone | ||
5 | { | ||
6 | sector_t zone_start; /* Zone offset in md_dev (in sectors) */ | ||
7 | sector_t dev_start; /* Zone offset in real dev (in sectors) */ | ||
8 | sector_t sectors; /* Zone size in sectors */ | ||
9 | int nb_dev; /* # of devices attached to the zone */ | ||
10 | mdk_rdev_t **dev; /* Devices attached to the zone */ | ||
11 | }; | ||
12 | |||
13 | struct raid0_private_data | ||
14 | { | ||
15 | struct strip_zone **hash_table; /* Table of indexes into strip_zone */ | ||
16 | struct strip_zone *strip_zone; | ||
17 | mdk_rdev_t **devlist; /* lists of rdevs, pointed to by strip_zone->dev */ | ||
18 | int nr_strip_zones; | ||
19 | |||
20 | sector_t spacing; | ||
21 | int sector_shift; /* shift this before divide by spacing */ | ||
22 | }; | ||
23 | |||
24 | typedef struct raid0_private_data raid0_conf_t; | ||
25 | |||
26 | #define mddev_to_conf(mddev) ((raid0_conf_t *) mddev->private) | ||
27 | |||
28 | #endif | ||
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index e2466425d9ca..b4f4badc0068 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c | |||
@@ -31,10 +31,13 @@ | |||
31 | * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | 31 | * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. |
32 | */ | 32 | */ |
33 | 33 | ||
34 | #include "dm-bio-list.h" | ||
35 | #include <linux/delay.h> | 34 | #include <linux/delay.h> |
36 | #include <linux/raid/raid1.h> | 35 | #include <linux/blkdev.h> |
37 | #include <linux/raid/bitmap.h> | 36 | #include <linux/seq_file.h> |
37 | #include "md.h" | ||
38 | #include "dm-bio-list.h" | ||
39 | #include "raid1.h" | ||
40 | #include "bitmap.h" | ||
38 | 41 | ||
39 | #define DEBUG 0 | 42 | #define DEBUG 0 |
40 | #if DEBUG | 43 | #if DEBUG |
@@ -1723,7 +1726,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i | |||
1723 | return 0; | 1726 | return 0; |
1724 | } | 1727 | } |
1725 | 1728 | ||
1726 | max_sector = mddev->size << 1; | 1729 | max_sector = mddev->dev_sectors; |
1727 | if (sector_nr >= max_sector) { | 1730 | if (sector_nr >= max_sector) { |
1728 | /* If we aborted, we need to abort the | 1731 | /* If we aborted, we need to abort the |
1729 | * sync on the 'current' bitmap chunk (there will | 1732 | * sync on the 'current' bitmap chunk (there will |
@@ -1919,6 +1922,14 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i | |||
1919 | return nr_sectors; | 1922 | return nr_sectors; |
1920 | } | 1923 | } |
1921 | 1924 | ||
1925 | static sector_t raid1_size(mddev_t *mddev, sector_t sectors, int raid_disks) | ||
1926 | { | ||
1927 | if (sectors) | ||
1928 | return sectors; | ||
1929 | |||
1930 | return mddev->dev_sectors; | ||
1931 | } | ||
1932 | |||
1922 | static int run(mddev_t *mddev) | 1933 | static int run(mddev_t *mddev) |
1923 | { | 1934 | { |
1924 | conf_t *conf; | 1935 | conf_t *conf; |
@@ -2048,7 +2059,7 @@ static int run(mddev_t *mddev) | |||
2048 | /* | 2059 | /* |
2049 | * Ok, everything is just fine now | 2060 | * Ok, everything is just fine now |
2050 | */ | 2061 | */ |
2051 | mddev->array_sectors = mddev->size * 2; | 2062 | md_set_array_sectors(mddev, raid1_size(mddev, 0, 0)); |
2052 | 2063 | ||
2053 | mddev->queue->unplug_fn = raid1_unplug; | 2064 | mddev->queue->unplug_fn = raid1_unplug; |
2054 | mddev->queue->backing_dev_info.congested_fn = raid1_congested; | 2065 | mddev->queue->backing_dev_info.congested_fn = raid1_congested; |
@@ -2089,6 +2100,9 @@ static int stop(mddev_t *mddev) | |||
2089 | /* need to kick something here to make sure I/O goes? */ | 2100 | /* need to kick something here to make sure I/O goes? */ |
2090 | } | 2101 | } |
2091 | 2102 | ||
2103 | raise_barrier(conf); | ||
2104 | lower_barrier(conf); | ||
2105 | |||
2092 | md_unregister_thread(mddev->thread); | 2106 | md_unregister_thread(mddev->thread); |
2093 | mddev->thread = NULL; | 2107 | mddev->thread = NULL; |
2094 | blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ | 2108 | blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ |
@@ -2110,15 +2124,17 @@ static int raid1_resize(mddev_t *mddev, sector_t sectors) | |||
2110 | * any io in the removed space completes, but it hardly seems | 2124 | * any io in the removed space completes, but it hardly seems |
2111 | * worth it. | 2125 | * worth it. |
2112 | */ | 2126 | */ |
2113 | mddev->array_sectors = sectors; | 2127 | md_set_array_sectors(mddev, raid1_size(mddev, sectors, 0)); |
2128 | if (mddev->array_sectors > raid1_size(mddev, sectors, 0)) | ||
2129 | return -EINVAL; | ||
2114 | set_capacity(mddev->gendisk, mddev->array_sectors); | 2130 | set_capacity(mddev->gendisk, mddev->array_sectors); |
2115 | mddev->changed = 1; | 2131 | mddev->changed = 1; |
2116 | if (mddev->array_sectors / 2 > mddev->size && | 2132 | if (sectors > mddev->dev_sectors && |
2117 | mddev->recovery_cp == MaxSector) { | 2133 | mddev->recovery_cp == MaxSector) { |
2118 | mddev->recovery_cp = mddev->size << 1; | 2134 | mddev->recovery_cp = mddev->dev_sectors; |
2119 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | 2135 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); |
2120 | } | 2136 | } |
2121 | mddev->size = mddev->array_sectors / 2; | 2137 | mddev->dev_sectors = sectors; |
2122 | mddev->resync_max_sectors = sectors; | 2138 | mddev->resync_max_sectors = sectors; |
2123 | return 0; | 2139 | return 0; |
2124 | } | 2140 | } |
@@ -2264,6 +2280,7 @@ static struct mdk_personality raid1_personality = | |||
2264 | .spare_active = raid1_spare_active, | 2280 | .spare_active = raid1_spare_active, |
2265 | .sync_request = sync_request, | 2281 | .sync_request = sync_request, |
2266 | .resize = raid1_resize, | 2282 | .resize = raid1_resize, |
2283 | .size = raid1_size, | ||
2267 | .check_reshape = raid1_reshape, | 2284 | .check_reshape = raid1_reshape, |
2268 | .quiesce = raid1_quiesce, | 2285 | .quiesce = raid1_quiesce, |
2269 | }; | 2286 | }; |
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h new file mode 100644 index 000000000000..1620eea3d57c --- /dev/null +++ b/drivers/md/raid1.h | |||
@@ -0,0 +1,132 @@ | |||
1 | #ifndef _RAID1_H | ||
2 | #define _RAID1_H | ||
3 | |||
4 | typedef struct mirror_info mirror_info_t; | ||
5 | |||
6 | struct mirror_info { | ||
7 | mdk_rdev_t *rdev; | ||
8 | sector_t head_position; | ||
9 | }; | ||
10 | |||
11 | /* | ||
12 | * memory pools need a pointer to the mddev, so they can force an unplug | ||
13 | * when memory is tight, and a count of the number of drives that the | ||
14 | * pool was allocated for, so they know how much to allocate and free. | ||
15 | * mddev->raid_disks cannot be used, as it can change while a pool is active. | ||
16 | * These two values are stored in a kmalloced struct. | ||
17 | */ | ||
18 | |||
19 | struct pool_info { | ||
20 | mddev_t *mddev; | ||
21 | int raid_disks; | ||
22 | }; | ||
23 | |||
24 | |||
25 | typedef struct r1bio_s r1bio_t; | ||
26 | |||
27 | struct r1_private_data_s { | ||
28 | mddev_t *mddev; | ||
29 | mirror_info_t *mirrors; | ||
30 | int raid_disks; | ||
31 | int last_used; | ||
32 | sector_t next_seq_sect; | ||
33 | spinlock_t device_lock; | ||
34 | |||
35 | struct list_head retry_list; | ||
36 | /* queue pending writes and submit them on unplug */ | ||
37 | struct bio_list pending_bio_list; | ||
38 | /* queue of writes that have been unplugged */ | ||
39 | struct bio_list flushing_bio_list; | ||
40 | |||
41 | /* for use when syncing mirrors: */ | ||
42 | |||
43 | spinlock_t resync_lock; | ||
44 | int nr_pending; | ||
45 | int nr_waiting; | ||
46 | int nr_queued; | ||
47 | int barrier; | ||
48 | sector_t next_resync; | ||
49 | int fullsync; /* set to 1 if a full sync is needed, | ||
50 | * (fresh device added). | ||
51 | * Cleared when a sync completes. | ||
52 | */ | ||
53 | |||
54 | wait_queue_head_t wait_barrier; | ||
55 | |||
56 | struct pool_info *poolinfo; | ||
57 | |||
58 | struct page *tmppage; | ||
59 | |||
60 | mempool_t *r1bio_pool; | ||
61 | mempool_t *r1buf_pool; | ||
62 | }; | ||
63 | |||
64 | typedef struct r1_private_data_s conf_t; | ||
65 | |||
66 | /* | ||
67 | * this is the only point in the RAID code where we violate | ||
68 | * C type safety. mddev->private is an 'opaque' pointer. | ||
69 | */ | ||
70 | #define mddev_to_conf(mddev) ((conf_t *) mddev->private) | ||
71 | |||
72 | /* | ||
73 | * this is our 'private' RAID1 bio. | ||
74 | * | ||
75 | * it contains information about what kind of IO operations were started | ||
76 | * for this RAID1 operation, and about their status: | ||
77 | */ | ||
78 | |||
79 | struct r1bio_s { | ||
80 | atomic_t remaining; /* 'have we finished' count, | ||
81 | * used from IRQ handlers | ||
82 | */ | ||
83 | atomic_t behind_remaining; /* number of write-behind ios remaining | ||
84 | * in this BehindIO request | ||
85 | */ | ||
86 | sector_t sector; | ||
87 | int sectors; | ||
88 | unsigned long state; | ||
89 | mddev_t *mddev; | ||
90 | /* | ||
91 | * original bio going to /dev/mdx | ||
92 | */ | ||
93 | struct bio *master_bio; | ||
94 | /* | ||
95 | * if the IO is in READ direction, then this is where we read | ||
96 | */ | ||
97 | int read_disk; | ||
98 | |||
99 | struct list_head retry_list; | ||
100 | struct bitmap_update *bitmap_update; | ||
101 | /* | ||
102 | * if the IO is in WRITE direction, then multiple bios are used. | ||
103 | * We choose the number when they are allocated. | ||
104 | */ | ||
105 | struct bio *bios[0]; | ||
106 | /* DO NOT PUT ANY NEW FIELDS HERE - bios array is contiguously alloced*/ | ||
107 | }; | ||
108 | |||
109 | /* when we get a read error on a read-only array, we redirect to another | ||
110 | * device without failing the first device, or trying to over-write to | ||
111 | * correct the read error. To keep track of bad blocks on a per-bio | ||
112 | * level, we store IO_BLOCKED in the appropriate 'bios' pointer | ||
113 | */ | ||
114 | #define IO_BLOCKED ((struct bio*)1) | ||
115 | |||
116 | /* bits for r1bio.state */ | ||
117 | #define R1BIO_Uptodate 0 | ||
118 | #define R1BIO_IsSync 1 | ||
119 | #define R1BIO_Degraded 2 | ||
120 | #define R1BIO_BehindIO 3 | ||
121 | #define R1BIO_Barrier 4 | ||
122 | #define R1BIO_BarrierRetry 5 | ||
123 | /* For write-behind requests, we call bi_end_io when | ||
124 | * the last non-write-behind device completes, providing | ||
125 | * any write was successful. Otherwise we call when | ||
126 | * any write-behind write succeeds, otherwise we call | ||
127 | * with failure when last write completes (and all failed). | ||
128 | * Record that bi_end_io was called with this flag... | ||
129 | */ | ||
130 | #define R1BIO_Returned 6 | ||
131 | |||
132 | #endif | ||
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 7301631abe04..e293d92641ac 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c | |||
@@ -18,10 +18,13 @@ | |||
18 | * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | 18 | * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. |
19 | */ | 19 | */ |
20 | 20 | ||
21 | #include "dm-bio-list.h" | ||
22 | #include <linux/delay.h> | 21 | #include <linux/delay.h> |
23 | #include <linux/raid/raid10.h> | 22 | #include <linux/blkdev.h> |
24 | #include <linux/raid/bitmap.h> | 23 | #include <linux/seq_file.h> |
24 | #include "md.h" | ||
25 | #include "dm-bio-list.h" | ||
26 | #include "raid10.h" | ||
27 | #include "bitmap.h" | ||
25 | 28 | ||
26 | /* | 29 | /* |
27 | * RAID10 provides a combination of RAID0 and RAID1 functionality. | 30 | * RAID10 provides a combination of RAID0 and RAID1 functionality. |
@@ -1695,7 +1698,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i | |||
1695 | return 0; | 1698 | return 0; |
1696 | 1699 | ||
1697 | skipped: | 1700 | skipped: |
1698 | max_sector = mddev->size << 1; | 1701 | max_sector = mddev->dev_sectors; |
1699 | if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) | 1702 | if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) |
1700 | max_sector = mddev->resync_max_sectors; | 1703 | max_sector = mddev->resync_max_sectors; |
1701 | if (sector_nr >= max_sector) { | 1704 | if (sector_nr >= max_sector) { |
@@ -2020,6 +2023,25 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i | |||
2020 | goto skipped; | 2023 | goto skipped; |
2021 | } | 2024 | } |
2022 | 2025 | ||
2026 | static sector_t | ||
2027 | raid10_size(mddev_t *mddev, sector_t sectors, int raid_disks) | ||
2028 | { | ||
2029 | sector_t size; | ||
2030 | conf_t *conf = mddev_to_conf(mddev); | ||
2031 | |||
2032 | if (!raid_disks) | ||
2033 | raid_disks = mddev->raid_disks; | ||
2034 | if (!sectors) | ||
2035 | sectors = mddev->dev_sectors; | ||
2036 | |||
2037 | size = sectors >> conf->chunk_shift; | ||
2038 | sector_div(size, conf->far_copies); | ||
2039 | size = size * raid_disks; | ||
2040 | sector_div(size, conf->near_copies); | ||
2041 | |||
2042 | return size << conf->chunk_shift; | ||
2043 | } | ||
2044 | |||
2023 | static int run(mddev_t *mddev) | 2045 | static int run(mddev_t *mddev) |
2024 | { | 2046 | { |
2025 | conf_t *conf; | 2047 | conf_t *conf; |
@@ -2076,7 +2098,7 @@ static int run(mddev_t *mddev) | |||
2076 | conf->far_offset = fo; | 2098 | conf->far_offset = fo; |
2077 | conf->chunk_mask = (sector_t)(mddev->chunk_size>>9)-1; | 2099 | conf->chunk_mask = (sector_t)(mddev->chunk_size>>9)-1; |
2078 | conf->chunk_shift = ffz(~mddev->chunk_size) - 9; | 2100 | conf->chunk_shift = ffz(~mddev->chunk_size) - 9; |
2079 | size = mddev->size >> (conf->chunk_shift-1); | 2101 | size = mddev->dev_sectors >> conf->chunk_shift; |
2080 | sector_div(size, fc); | 2102 | sector_div(size, fc); |
2081 | size = size * conf->raid_disks; | 2103 | size = size * conf->raid_disks; |
2082 | sector_div(size, nc); | 2104 | sector_div(size, nc); |
@@ -2089,7 +2111,7 @@ static int run(mddev_t *mddev) | |||
2089 | */ | 2111 | */ |
2090 | stride += conf->raid_disks - 1; | 2112 | stride += conf->raid_disks - 1; |
2091 | sector_div(stride, conf->raid_disks); | 2113 | sector_div(stride, conf->raid_disks); |
2092 | mddev->size = stride << (conf->chunk_shift-1); | 2114 | mddev->dev_sectors = stride << conf->chunk_shift; |
2093 | 2115 | ||
2094 | if (fo) | 2116 | if (fo) |
2095 | stride = 1; | 2117 | stride = 1; |
@@ -2171,8 +2193,8 @@ static int run(mddev_t *mddev) | |||
2171 | /* | 2193 | /* |
2172 | * Ok, everything is just fine now | 2194 | * Ok, everything is just fine now |
2173 | */ | 2195 | */ |
2174 | mddev->array_sectors = size << conf->chunk_shift; | 2196 | md_set_array_sectors(mddev, raid10_size(mddev, 0, 0)); |
2175 | mddev->resync_max_sectors = size << conf->chunk_shift; | 2197 | mddev->resync_max_sectors = raid10_size(mddev, 0, 0); |
2176 | 2198 | ||
2177 | mddev->queue->unplug_fn = raid10_unplug; | 2199 | mddev->queue->unplug_fn = raid10_unplug; |
2178 | mddev->queue->backing_dev_info.congested_fn = raid10_congested; | 2200 | mddev->queue->backing_dev_info.congested_fn = raid10_congested; |
@@ -2208,6 +2230,9 @@ static int stop(mddev_t *mddev) | |||
2208 | { | 2230 | { |
2209 | conf_t *conf = mddev_to_conf(mddev); | 2231 | conf_t *conf = mddev_to_conf(mddev); |
2210 | 2232 | ||
2233 | raise_barrier(conf, 0); | ||
2234 | lower_barrier(conf); | ||
2235 | |||
2211 | md_unregister_thread(mddev->thread); | 2236 | md_unregister_thread(mddev->thread); |
2212 | mddev->thread = NULL; | 2237 | mddev->thread = NULL; |
2213 | blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ | 2238 | blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ |
@@ -2255,6 +2280,7 @@ static struct mdk_personality raid10_personality = | |||
2255 | .spare_active = raid10_spare_active, | 2280 | .spare_active = raid10_spare_active, |
2256 | .sync_request = sync_request, | 2281 | .sync_request = sync_request, |
2257 | .quiesce = raid10_quiesce, | 2282 | .quiesce = raid10_quiesce, |
2283 | .size = raid10_size, | ||
2258 | }; | 2284 | }; |
2259 | 2285 | ||
2260 | static int __init raid_init(void) | 2286 | static int __init raid_init(void) |
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h new file mode 100644 index 000000000000..244dbe507a54 --- /dev/null +++ b/drivers/md/raid10.h | |||
@@ -0,0 +1,121 @@ | |||
1 | #ifndef _RAID10_H | ||
2 | #define _RAID10_H | ||
3 | |||
4 | typedef struct mirror_info mirror_info_t; | ||
5 | |||
6 | struct mirror_info { | ||
7 | mdk_rdev_t *rdev; | ||
8 | sector_t head_position; | ||
9 | }; | ||
10 | |||
11 | typedef struct r10bio_s r10bio_t; | ||
12 | |||
13 | struct r10_private_data_s { | ||
14 | mddev_t *mddev; | ||
15 | mirror_info_t *mirrors; | ||
16 | int raid_disks; | ||
17 | spinlock_t device_lock; | ||
18 | |||
19 | /* geometry */ | ||
20 | int near_copies; /* number of copies laid out raid0 style */ | ||
21 | int far_copies; /* number of copies laid out | ||
22 | * at large strides across drives | ||
23 | */ | ||
24 | int far_offset; /* far_copies are offset by 1 stripe | ||
25 | * instead of many | ||
26 | */ | ||
27 | int copies; /* near_copies * far_copies. | ||
28 | * must be <= raid_disks | ||
29 | */ | ||
30 | sector_t stride; /* distance between far copies. | ||
31 | * This is size / far_copies unless | ||
32 | * far_offset, in which case it is | ||
33 | * 1 stripe. | ||
34 | */ | ||
35 | |||
36 | int chunk_shift; /* shift from chunks to sectors */ | ||
37 | sector_t chunk_mask; | ||
38 | |||
39 | struct list_head retry_list; | ||
40 | /* queue pending writes and submit them on unplug */ | ||
41 | struct bio_list pending_bio_list; | ||
42 | |||
43 | |||
44 | spinlock_t resync_lock; | ||
45 | int nr_pending; | ||
46 | int nr_waiting; | ||
47 | int nr_queued; | ||
48 | int barrier; | ||
49 | sector_t next_resync; | ||
50 | int fullsync; /* set to 1 if a full sync is needed, | ||
51 | * (fresh device added). | ||
52 | * Cleared when a sync completes. | ||
53 | */ | ||
54 | |||
55 | wait_queue_head_t wait_barrier; | ||
56 | |||
57 | mempool_t *r10bio_pool; | ||
58 | mempool_t *r10buf_pool; | ||
59 | struct page *tmppage; | ||
60 | }; | ||
61 | |||
62 | typedef struct r10_private_data_s conf_t; | ||
63 | |||
64 | /* | ||
65 | * this is the only point in the RAID code where we violate | ||
66 | * C type safety. mddev->private is an 'opaque' pointer. | ||
67 | */ | ||
68 | #define mddev_to_conf(mddev) ((conf_t *) mddev->private) | ||
69 | |||
70 | /* | ||
71 | * this is our 'private' RAID10 bio. | ||
72 | * | ||
73 | * it contains information about what kind of IO operations were started | ||
74 | * for this RAID10 operation, and about their status: | ||
75 | */ | ||
76 | |||
77 | struct r10bio_s { | ||
78 | atomic_t remaining; /* 'have we finished' count, | ||
79 | * used from IRQ handlers | ||
80 | */ | ||
81 | sector_t sector; /* virtual sector number */ | ||
82 | int sectors; | ||
83 | unsigned long state; | ||
84 | mddev_t *mddev; | ||
85 | /* | ||
86 | * original bio going to /dev/mdx | ||
87 | */ | ||
88 | struct bio *master_bio; | ||
89 | /* | ||
90 | * if the IO is in READ direction, then this is where we read | ||
91 | */ | ||
92 | int read_slot; | ||
93 | |||
94 | struct list_head retry_list; | ||
95 | /* | ||
96 | * if the IO is in WRITE direction, then multiple bios are used, | ||
97 | * one for each copy. | ||
98 | * When resyncing we also use one for each copy. | ||
99 | * When reconstructing, we use 2 bios, one for read, one for write. | ||
100 | * We choose the number when they are allocated. | ||
101 | */ | ||
102 | struct { | ||
103 | struct bio *bio; | ||
104 | sector_t addr; | ||
105 | int devnum; | ||
106 | } devs[0]; | ||
107 | }; | ||
108 | |||
109 | /* when we get a read error on a read-only array, we redirect to another | ||
110 | * device without failing the first device, or trying to over-write to | ||
111 | * correct the read error. To keep track of bad blocks on a per-bio | ||
112 | * level, we store IO_BLOCKED in the appropriate 'devs[].bio' pointer | ||
113 | */ | ||
114 | #define IO_BLOCKED ((struct bio*)1) | ||
115 | |||
116 | /* bits for r10bio.state */ | ||
117 | #define R10BIO_Uptodate 0 | ||
118 | #define R10BIO_IsSync 1 | ||
119 | #define R10BIO_IsRecover 2 | ||
120 | #define R10BIO_Degraded 3 | ||
121 | #endif | ||
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index a5ba080d303b..3bbc6d647044 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c | |||
@@ -43,11 +43,14 @@ | |||
43 | * miss any bits. | 43 | * miss any bits. |
44 | */ | 44 | */ |
45 | 45 | ||
46 | #include <linux/blkdev.h> | ||
46 | #include <linux/kthread.h> | 47 | #include <linux/kthread.h> |
47 | #include "raid6.h" | 48 | #include <linux/raid/pq.h> |
48 | |||
49 | #include <linux/raid/bitmap.h> | ||
50 | #include <linux/async_tx.h> | 49 | #include <linux/async_tx.h> |
50 | #include <linux/seq_file.h> | ||
51 | #include "md.h" | ||
52 | #include "raid5.h" | ||
53 | #include "bitmap.h" | ||
51 | 54 | ||
52 | /* | 55 | /* |
53 | * Stripe cache | 56 | * Stripe cache |
@@ -91,11 +94,6 @@ | |||
91 | 94 | ||
92 | #define printk_rl(args...) ((void) (printk_ratelimit() && printk(args))) | 95 | #define printk_rl(args...) ((void) (printk_ratelimit() && printk(args))) |
93 | 96 | ||
94 | #if !RAID6_USE_EMPTY_ZERO_PAGE | ||
95 | /* In .bss so it's zeroed */ | ||
96 | const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256))); | ||
97 | #endif | ||
98 | |||
99 | /* | 97 | /* |
100 | * We maintain a biased count of active stripes in the bottom 16 bits of | 98 | * We maintain a biased count of active stripes in the bottom 16 bits of |
101 | * bi_phys_segments, and a count of processed stripes in the upper 16 bits | 99 | * bi_phys_segments, and a count of processed stripes in the upper 16 bits |
@@ -130,12 +128,42 @@ static inline void raid5_set_bi_hw_segments(struct bio *bio, unsigned int cnt) | |||
130 | bio->bi_phys_segments = raid5_bi_phys_segments(bio) || (cnt << 16); | 128 | bio->bi_phys_segments = raid5_bi_phys_segments(bio) || (cnt << 16); |
131 | } | 129 | } |
132 | 130 | ||
131 | /* Find first data disk in a raid6 stripe */ | ||
132 | static inline int raid6_d0(struct stripe_head *sh) | ||
133 | { | ||
134 | if (sh->ddf_layout) | ||
135 | /* ddf always starts from first device */ | ||
136 | return 0; | ||
137 | /* md starts just after Q block */ | ||
138 | if (sh->qd_idx == sh->disks - 1) | ||
139 | return 0; | ||
140 | else | ||
141 | return sh->qd_idx + 1; | ||
142 | } | ||
133 | static inline int raid6_next_disk(int disk, int raid_disks) | 143 | static inline int raid6_next_disk(int disk, int raid_disks) |
134 | { | 144 | { |
135 | disk++; | 145 | disk++; |
136 | return (disk < raid_disks) ? disk : 0; | 146 | return (disk < raid_disks) ? disk : 0; |
137 | } | 147 | } |
138 | 148 | ||
149 | /* When walking through the disks in a raid6, starting at raid6_d0, | ||
150 | * we need to map each disk to a 'slot', where the data disks are slot | ||
151 | * 0 .. raid_disks-3, the parity disk is raid_disks-2 and the Q disk | ||
152 | * is raid_disks-1. This helper does that mapping. | ||
153 | */ | ||
154 | static int raid6_idx_to_slot(int idx, struct stripe_head *sh, | ||
155 | int *count, int syndrome_disks) | ||
156 | { | ||
157 | int slot; | ||
158 | |||
159 | if (idx == sh->pd_idx) | ||
160 | return syndrome_disks; | ||
161 | if (idx == sh->qd_idx) | ||
162 | return syndrome_disks + 1; | ||
163 | slot = (*count)++; | ||
164 | return slot; | ||
165 | } | ||
166 | |||
139 | static void return_io(struct bio *return_bi) | 167 | static void return_io(struct bio *return_bi) |
140 | { | 168 | { |
141 | struct bio *bi = return_bi; | 169 | struct bio *bi = return_bi; |
@@ -193,6 +221,7 @@ static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh) | |||
193 | } | 221 | } |
194 | } | 222 | } |
195 | } | 223 | } |
224 | |||
196 | static void release_stripe(struct stripe_head *sh) | 225 | static void release_stripe(struct stripe_head *sh) |
197 | { | 226 | { |
198 | raid5_conf_t *conf = sh->raid_conf; | 227 | raid5_conf_t *conf = sh->raid_conf; |
@@ -270,9 +299,11 @@ static int grow_buffers(struct stripe_head *sh, int num) | |||
270 | return 0; | 299 | return 0; |
271 | } | 300 | } |
272 | 301 | ||
273 | static void raid5_build_block(struct stripe_head *sh, int i); | 302 | static void raid5_build_block(struct stripe_head *sh, int i, int previous); |
303 | static void stripe_set_idx(sector_t stripe, raid5_conf_t *conf, int previous, | ||
304 | struct stripe_head *sh); | ||
274 | 305 | ||
275 | static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx, int disks) | 306 | static void init_stripe(struct stripe_head *sh, sector_t sector, int previous) |
276 | { | 307 | { |
277 | raid5_conf_t *conf = sh->raid_conf; | 308 | raid5_conf_t *conf = sh->raid_conf; |
278 | int i; | 309 | int i; |
@@ -287,11 +318,12 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx, int | |||
287 | 318 | ||
288 | remove_hash(sh); | 319 | remove_hash(sh); |
289 | 320 | ||
321 | sh->generation = conf->generation - previous; | ||
322 | sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks; | ||
290 | sh->sector = sector; | 323 | sh->sector = sector; |
291 | sh->pd_idx = pd_idx; | 324 | stripe_set_idx(sector, conf, previous, sh); |
292 | sh->state = 0; | 325 | sh->state = 0; |
293 | 326 | ||
294 | sh->disks = disks; | ||
295 | 327 | ||
296 | for (i = sh->disks; i--; ) { | 328 | for (i = sh->disks; i--; ) { |
297 | struct r5dev *dev = &sh->dev[i]; | 329 | struct r5dev *dev = &sh->dev[i]; |
@@ -305,12 +337,13 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx, int | |||
305 | BUG(); | 337 | BUG(); |
306 | } | 338 | } |
307 | dev->flags = 0; | 339 | dev->flags = 0; |
308 | raid5_build_block(sh, i); | 340 | raid5_build_block(sh, i, previous); |
309 | } | 341 | } |
310 | insert_hash(conf, sh); | 342 | insert_hash(conf, sh); |
311 | } | 343 | } |
312 | 344 | ||
313 | static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector, int disks) | 345 | static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector, |
346 | short generation) | ||
314 | { | 347 | { |
315 | struct stripe_head *sh; | 348 | struct stripe_head *sh; |
316 | struct hlist_node *hn; | 349 | struct hlist_node *hn; |
@@ -318,7 +351,7 @@ static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector, in | |||
318 | CHECK_DEVLOCK(); | 351 | CHECK_DEVLOCK(); |
319 | pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector); | 352 | pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector); |
320 | hlist_for_each_entry(sh, hn, stripe_hash(conf, sector), hash) | 353 | hlist_for_each_entry(sh, hn, stripe_hash(conf, sector), hash) |
321 | if (sh->sector == sector && sh->disks == disks) | 354 | if (sh->sector == sector && sh->generation == generation) |
322 | return sh; | 355 | return sh; |
323 | pr_debug("__stripe %llu not in cache\n", (unsigned long long)sector); | 356 | pr_debug("__stripe %llu not in cache\n", (unsigned long long)sector); |
324 | return NULL; | 357 | return NULL; |
@@ -327,8 +360,9 @@ static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector, in | |||
327 | static void unplug_slaves(mddev_t *mddev); | 360 | static void unplug_slaves(mddev_t *mddev); |
328 | static void raid5_unplug_device(struct request_queue *q); | 361 | static void raid5_unplug_device(struct request_queue *q); |
329 | 362 | ||
330 | static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector, int disks, | 363 | static struct stripe_head * |
331 | int pd_idx, int noblock) | 364 | get_active_stripe(raid5_conf_t *conf, sector_t sector, |
365 | int previous, int noblock) | ||
332 | { | 366 | { |
333 | struct stripe_head *sh; | 367 | struct stripe_head *sh; |
334 | 368 | ||
@@ -340,7 +374,7 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector | |||
340 | wait_event_lock_irq(conf->wait_for_stripe, | 374 | wait_event_lock_irq(conf->wait_for_stripe, |
341 | conf->quiesce == 0, | 375 | conf->quiesce == 0, |
342 | conf->device_lock, /* nothing */); | 376 | conf->device_lock, /* nothing */); |
343 | sh = __find_stripe(conf, sector, disks); | 377 | sh = __find_stripe(conf, sector, conf->generation - previous); |
344 | if (!sh) { | 378 | if (!sh) { |
345 | if (!conf->inactive_blocked) | 379 | if (!conf->inactive_blocked) |
346 | sh = get_free_stripe(conf); | 380 | sh = get_free_stripe(conf); |
@@ -358,10 +392,11 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector | |||
358 | ); | 392 | ); |
359 | conf->inactive_blocked = 0; | 393 | conf->inactive_blocked = 0; |
360 | } else | 394 | } else |
361 | init_stripe(sh, sector, pd_idx, disks); | 395 | init_stripe(sh, sector, previous); |
362 | } else { | 396 | } else { |
363 | if (atomic_read(&sh->count)) { | 397 | if (atomic_read(&sh->count)) { |
364 | BUG_ON(!list_empty(&sh->lru)); | 398 | BUG_ON(!list_empty(&sh->lru) |
399 | && !test_bit(STRIPE_EXPANDING, &sh->state)); | ||
365 | } else { | 400 | } else { |
366 | if (!test_bit(STRIPE_HANDLE, &sh->state)) | 401 | if (!test_bit(STRIPE_HANDLE, &sh->state)) |
367 | atomic_inc(&conf->active_stripes); | 402 | atomic_inc(&conf->active_stripes); |
@@ -895,8 +930,10 @@ static int grow_stripes(raid5_conf_t *conf, int num) | |||
895 | struct kmem_cache *sc; | 930 | struct kmem_cache *sc; |
896 | int devs = conf->raid_disks; | 931 | int devs = conf->raid_disks; |
897 | 932 | ||
898 | sprintf(conf->cache_name[0], "raid5-%s", mdname(conf->mddev)); | 933 | sprintf(conf->cache_name[0], |
899 | sprintf(conf->cache_name[1], "raid5-%s-alt", mdname(conf->mddev)); | 934 | "raid%d-%s", conf->level, mdname(conf->mddev)); |
935 | sprintf(conf->cache_name[1], | ||
936 | "raid%d-%s-alt", conf->level, mdname(conf->mddev)); | ||
900 | conf->active_name = 0; | 937 | conf->active_name = 0; |
901 | sc = kmem_cache_create(conf->cache_name[conf->active_name], | 938 | sc = kmem_cache_create(conf->cache_name[conf->active_name], |
902 | sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev), | 939 | sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev), |
@@ -911,7 +948,6 @@ static int grow_stripes(raid5_conf_t *conf, int num) | |||
911 | return 0; | 948 | return 0; |
912 | } | 949 | } |
913 | 950 | ||
914 | #ifdef CONFIG_MD_RAID5_RESHAPE | ||
915 | static int resize_stripes(raid5_conf_t *conf, int newsize) | 951 | static int resize_stripes(raid5_conf_t *conf, int newsize) |
916 | { | 952 | { |
917 | /* Make all the stripes able to hold 'newsize' devices. | 953 | /* Make all the stripes able to hold 'newsize' devices. |
@@ -1036,7 +1072,6 @@ static int resize_stripes(raid5_conf_t *conf, int newsize) | |||
1036 | conf->pool_size = newsize; | 1072 | conf->pool_size = newsize; |
1037 | return err; | 1073 | return err; |
1038 | } | 1074 | } |
1039 | #endif | ||
1040 | 1075 | ||
1041 | static int drop_one_stripe(raid5_conf_t *conf) | 1076 | static int drop_one_stripe(raid5_conf_t *conf) |
1042 | { | 1077 | { |
@@ -1066,7 +1101,7 @@ static void shrink_stripes(raid5_conf_t *conf) | |||
1066 | 1101 | ||
1067 | static void raid5_end_read_request(struct bio * bi, int error) | 1102 | static void raid5_end_read_request(struct bio * bi, int error) |
1068 | { | 1103 | { |
1069 | struct stripe_head *sh = bi->bi_private; | 1104 | struct stripe_head *sh = bi->bi_private; |
1070 | raid5_conf_t *conf = sh->raid_conf; | 1105 | raid5_conf_t *conf = sh->raid_conf; |
1071 | int disks = sh->disks, i; | 1106 | int disks = sh->disks, i; |
1072 | int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); | 1107 | int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); |
@@ -1148,7 +1183,7 @@ static void raid5_end_read_request(struct bio * bi, int error) | |||
1148 | 1183 | ||
1149 | static void raid5_end_write_request(struct bio *bi, int error) | 1184 | static void raid5_end_write_request(struct bio *bi, int error) |
1150 | { | 1185 | { |
1151 | struct stripe_head *sh = bi->bi_private; | 1186 | struct stripe_head *sh = bi->bi_private; |
1152 | raid5_conf_t *conf = sh->raid_conf; | 1187 | raid5_conf_t *conf = sh->raid_conf; |
1153 | int disks = sh->disks, i; | 1188 | int disks = sh->disks, i; |
1154 | int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); | 1189 | int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); |
@@ -1176,9 +1211,9 @@ static void raid5_end_write_request(struct bio *bi, int error) | |||
1176 | } | 1211 | } |
1177 | 1212 | ||
1178 | 1213 | ||
1179 | static sector_t compute_blocknr(struct stripe_head *sh, int i); | 1214 | static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous); |
1180 | 1215 | ||
1181 | static void raid5_build_block(struct stripe_head *sh, int i) | 1216 | static void raid5_build_block(struct stripe_head *sh, int i, int previous) |
1182 | { | 1217 | { |
1183 | struct r5dev *dev = &sh->dev[i]; | 1218 | struct r5dev *dev = &sh->dev[i]; |
1184 | 1219 | ||
@@ -1194,7 +1229,7 @@ static void raid5_build_block(struct stripe_head *sh, int i) | |||
1194 | dev->req.bi_private = sh; | 1229 | dev->req.bi_private = sh; |
1195 | 1230 | ||
1196 | dev->flags = 0; | 1231 | dev->flags = 0; |
1197 | dev->sector = compute_blocknr(sh, i); | 1232 | dev->sector = compute_blocknr(sh, i, previous); |
1198 | } | 1233 | } |
1199 | 1234 | ||
1200 | static void error(mddev_t *mddev, mdk_rdev_t *rdev) | 1235 | static void error(mddev_t *mddev, mdk_rdev_t *rdev) |
@@ -1227,15 +1262,23 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) | |||
1227 | * Input: a 'big' sector number, | 1262 | * Input: a 'big' sector number, |
1228 | * Output: index of the data and parity disk, and the sector # in them. | 1263 | * Output: index of the data and parity disk, and the sector # in them. |
1229 | */ | 1264 | */ |
1230 | static sector_t raid5_compute_sector(sector_t r_sector, unsigned int raid_disks, | 1265 | static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector, |
1231 | unsigned int data_disks, unsigned int * dd_idx, | 1266 | int previous, int *dd_idx, |
1232 | unsigned int * pd_idx, raid5_conf_t *conf) | 1267 | struct stripe_head *sh) |
1233 | { | 1268 | { |
1234 | long stripe; | 1269 | long stripe; |
1235 | unsigned long chunk_number; | 1270 | unsigned long chunk_number; |
1236 | unsigned int chunk_offset; | 1271 | unsigned int chunk_offset; |
1272 | int pd_idx, qd_idx; | ||
1273 | int ddf_layout = 0; | ||
1237 | sector_t new_sector; | 1274 | sector_t new_sector; |
1238 | int sectors_per_chunk = conf->chunk_size >> 9; | 1275 | int algorithm = previous ? conf->prev_algo |
1276 | : conf->algorithm; | ||
1277 | int sectors_per_chunk = previous ? (conf->prev_chunk >> 9) | ||
1278 | : (conf->chunk_size >> 9); | ||
1279 | int raid_disks = previous ? conf->previous_raid_disks | ||
1280 | : conf->raid_disks; | ||
1281 | int data_disks = raid_disks - conf->max_degraded; | ||
1239 | 1282 | ||
1240 | /* First compute the information on this sector */ | 1283 | /* First compute the information on this sector */ |
1241 | 1284 | ||
@@ -1259,68 +1302,170 @@ static sector_t raid5_compute_sector(sector_t r_sector, unsigned int raid_disks, | |||
1259 | /* | 1302 | /* |
1260 | * Select the parity disk based on the user selected algorithm. | 1303 | * Select the parity disk based on the user selected algorithm. |
1261 | */ | 1304 | */ |
1305 | pd_idx = qd_idx = ~0; | ||
1262 | switch(conf->level) { | 1306 | switch(conf->level) { |
1263 | case 4: | 1307 | case 4: |
1264 | *pd_idx = data_disks; | 1308 | pd_idx = data_disks; |
1265 | break; | 1309 | break; |
1266 | case 5: | 1310 | case 5: |
1267 | switch (conf->algorithm) { | 1311 | switch (algorithm) { |
1268 | case ALGORITHM_LEFT_ASYMMETRIC: | 1312 | case ALGORITHM_LEFT_ASYMMETRIC: |
1269 | *pd_idx = data_disks - stripe % raid_disks; | 1313 | pd_idx = data_disks - stripe % raid_disks; |
1270 | if (*dd_idx >= *pd_idx) | 1314 | if (*dd_idx >= pd_idx) |
1271 | (*dd_idx)++; | 1315 | (*dd_idx)++; |
1272 | break; | 1316 | break; |
1273 | case ALGORITHM_RIGHT_ASYMMETRIC: | 1317 | case ALGORITHM_RIGHT_ASYMMETRIC: |
1274 | *pd_idx = stripe % raid_disks; | 1318 | pd_idx = stripe % raid_disks; |
1275 | if (*dd_idx >= *pd_idx) | 1319 | if (*dd_idx >= pd_idx) |
1276 | (*dd_idx)++; | 1320 | (*dd_idx)++; |
1277 | break; | 1321 | break; |
1278 | case ALGORITHM_LEFT_SYMMETRIC: | 1322 | case ALGORITHM_LEFT_SYMMETRIC: |
1279 | *pd_idx = data_disks - stripe % raid_disks; | 1323 | pd_idx = data_disks - stripe % raid_disks; |
1280 | *dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks; | 1324 | *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; |
1281 | break; | 1325 | break; |
1282 | case ALGORITHM_RIGHT_SYMMETRIC: | 1326 | case ALGORITHM_RIGHT_SYMMETRIC: |
1283 | *pd_idx = stripe % raid_disks; | 1327 | pd_idx = stripe % raid_disks; |
1284 | *dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks; | 1328 | *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; |
1329 | break; | ||
1330 | case ALGORITHM_PARITY_0: | ||
1331 | pd_idx = 0; | ||
1332 | (*dd_idx)++; | ||
1333 | break; | ||
1334 | case ALGORITHM_PARITY_N: | ||
1335 | pd_idx = data_disks; | ||
1285 | break; | 1336 | break; |
1286 | default: | 1337 | default: |
1287 | printk(KERN_ERR "raid5: unsupported algorithm %d\n", | 1338 | printk(KERN_ERR "raid5: unsupported algorithm %d\n", |
1288 | conf->algorithm); | 1339 | algorithm); |
1340 | BUG(); | ||
1289 | } | 1341 | } |
1290 | break; | 1342 | break; |
1291 | case 6: | 1343 | case 6: |
1292 | 1344 | ||
1293 | /**** FIX THIS ****/ | 1345 | switch (algorithm) { |
1294 | switch (conf->algorithm) { | ||
1295 | case ALGORITHM_LEFT_ASYMMETRIC: | 1346 | case ALGORITHM_LEFT_ASYMMETRIC: |
1296 | *pd_idx = raid_disks - 1 - (stripe % raid_disks); | 1347 | pd_idx = raid_disks - 1 - (stripe % raid_disks); |
1297 | if (*pd_idx == raid_disks-1) | 1348 | qd_idx = pd_idx + 1; |
1298 | (*dd_idx)++; /* Q D D D P */ | 1349 | if (pd_idx == raid_disks-1) { |
1299 | else if (*dd_idx >= *pd_idx) | 1350 | (*dd_idx)++; /* Q D D D P */ |
1351 | qd_idx = 0; | ||
1352 | } else if (*dd_idx >= pd_idx) | ||
1300 | (*dd_idx) += 2; /* D D P Q D */ | 1353 | (*dd_idx) += 2; /* D D P Q D */ |
1301 | break; | 1354 | break; |
1302 | case ALGORITHM_RIGHT_ASYMMETRIC: | 1355 | case ALGORITHM_RIGHT_ASYMMETRIC: |
1303 | *pd_idx = stripe % raid_disks; | 1356 | pd_idx = stripe % raid_disks; |
1304 | if (*pd_idx == raid_disks-1) | 1357 | qd_idx = pd_idx + 1; |
1305 | (*dd_idx)++; /* Q D D D P */ | 1358 | if (pd_idx == raid_disks-1) { |
1306 | else if (*dd_idx >= *pd_idx) | 1359 | (*dd_idx)++; /* Q D D D P */ |
1360 | qd_idx = 0; | ||
1361 | } else if (*dd_idx >= pd_idx) | ||
1307 | (*dd_idx) += 2; /* D D P Q D */ | 1362 | (*dd_idx) += 2; /* D D P Q D */ |
1308 | break; | 1363 | break; |
1309 | case ALGORITHM_LEFT_SYMMETRIC: | 1364 | case ALGORITHM_LEFT_SYMMETRIC: |
1310 | *pd_idx = raid_disks - 1 - (stripe % raid_disks); | 1365 | pd_idx = raid_disks - 1 - (stripe % raid_disks); |
1311 | *dd_idx = (*pd_idx + 2 + *dd_idx) % raid_disks; | 1366 | qd_idx = (pd_idx + 1) % raid_disks; |
1367 | *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks; | ||
1312 | break; | 1368 | break; |
1313 | case ALGORITHM_RIGHT_SYMMETRIC: | 1369 | case ALGORITHM_RIGHT_SYMMETRIC: |
1314 | *pd_idx = stripe % raid_disks; | 1370 | pd_idx = stripe % raid_disks; |
1315 | *dd_idx = (*pd_idx + 2 + *dd_idx) % raid_disks; | 1371 | qd_idx = (pd_idx + 1) % raid_disks; |
1372 | *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks; | ||
1373 | break; | ||
1374 | |||
1375 | case ALGORITHM_PARITY_0: | ||
1376 | pd_idx = 0; | ||
1377 | qd_idx = 1; | ||
1378 | (*dd_idx) += 2; | ||
1379 | break; | ||
1380 | case ALGORITHM_PARITY_N: | ||
1381 | pd_idx = data_disks; | ||
1382 | qd_idx = data_disks + 1; | ||
1316 | break; | 1383 | break; |
1384 | |||
1385 | case ALGORITHM_ROTATING_ZERO_RESTART: | ||
1386 | /* Exactly the same as RIGHT_ASYMMETRIC, but order | ||
1387 | * of blocks for computing Q is different. | ||
1388 | */ | ||
1389 | pd_idx = stripe % raid_disks; | ||
1390 | qd_idx = pd_idx + 1; | ||
1391 | if (pd_idx == raid_disks-1) { | ||
1392 | (*dd_idx)++; /* Q D D D P */ | ||
1393 | qd_idx = 0; | ||
1394 | } else if (*dd_idx >= pd_idx) | ||
1395 | (*dd_idx) += 2; /* D D P Q D */ | ||
1396 | ddf_layout = 1; | ||
1397 | break; | ||
1398 | |||
1399 | case ALGORITHM_ROTATING_N_RESTART: | ||
1400 | /* Same as left_asymmetric, but first stripe is | ||
1401 | * D D D P Q rather than | ||
1402 | * Q D D D P | ||
1403 | */ | ||
1404 | pd_idx = raid_disks - 1 - ((stripe + 1) % raid_disks); | ||
1405 | qd_idx = pd_idx + 1; | ||
1406 | if (pd_idx == raid_disks-1) { | ||
1407 | (*dd_idx)++; /* Q D D D P */ | ||
1408 | qd_idx = 0; | ||
1409 | } else if (*dd_idx >= pd_idx) | ||
1410 | (*dd_idx) += 2; /* D D P Q D */ | ||
1411 | ddf_layout = 1; | ||
1412 | break; | ||
1413 | |||
1414 | case ALGORITHM_ROTATING_N_CONTINUE: | ||
1415 | /* Same as left_symmetric but Q is before P */ | ||
1416 | pd_idx = raid_disks - 1 - (stripe % raid_disks); | ||
1417 | qd_idx = (pd_idx + raid_disks - 1) % raid_disks; | ||
1418 | *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; | ||
1419 | ddf_layout = 1; | ||
1420 | break; | ||
1421 | |||
1422 | case ALGORITHM_LEFT_ASYMMETRIC_6: | ||
1423 | /* RAID5 left_asymmetric, with Q on last device */ | ||
1424 | pd_idx = data_disks - stripe % (raid_disks-1); | ||
1425 | if (*dd_idx >= pd_idx) | ||
1426 | (*dd_idx)++; | ||
1427 | qd_idx = raid_disks - 1; | ||
1428 | break; | ||
1429 | |||
1430 | case ALGORITHM_RIGHT_ASYMMETRIC_6: | ||
1431 | pd_idx = stripe % (raid_disks-1); | ||
1432 | if (*dd_idx >= pd_idx) | ||
1433 | (*dd_idx)++; | ||
1434 | qd_idx = raid_disks - 1; | ||
1435 | break; | ||
1436 | |||
1437 | case ALGORITHM_LEFT_SYMMETRIC_6: | ||
1438 | pd_idx = data_disks - stripe % (raid_disks-1); | ||
1439 | *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1); | ||
1440 | qd_idx = raid_disks - 1; | ||
1441 | break; | ||
1442 | |||
1443 | case ALGORITHM_RIGHT_SYMMETRIC_6: | ||
1444 | pd_idx = stripe % (raid_disks-1); | ||
1445 | *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1); | ||
1446 | qd_idx = raid_disks - 1; | ||
1447 | break; | ||
1448 | |||
1449 | case ALGORITHM_PARITY_0_6: | ||
1450 | pd_idx = 0; | ||
1451 | (*dd_idx)++; | ||
1452 | qd_idx = raid_disks - 1; | ||
1453 | break; | ||
1454 | |||
1455 | |||
1317 | default: | 1456 | default: |
1318 | printk(KERN_CRIT "raid6: unsupported algorithm %d\n", | 1457 | printk(KERN_CRIT "raid6: unsupported algorithm %d\n", |
1319 | conf->algorithm); | 1458 | algorithm); |
1459 | BUG(); | ||
1320 | } | 1460 | } |
1321 | break; | 1461 | break; |
1322 | } | 1462 | } |
1323 | 1463 | ||
1464 | if (sh) { | ||
1465 | sh->pd_idx = pd_idx; | ||
1466 | sh->qd_idx = qd_idx; | ||
1467 | sh->ddf_layout = ddf_layout; | ||
1468 | } | ||
1324 | /* | 1469 | /* |
1325 | * Finally, compute the new sector number | 1470 | * Finally, compute the new sector number |
1326 | */ | 1471 | */ |
@@ -1329,17 +1474,21 @@ static sector_t raid5_compute_sector(sector_t r_sector, unsigned int raid_disks, | |||
1329 | } | 1474 | } |
1330 | 1475 | ||
1331 | 1476 | ||
1332 | static sector_t compute_blocknr(struct stripe_head *sh, int i) | 1477 | static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous) |
1333 | { | 1478 | { |
1334 | raid5_conf_t *conf = sh->raid_conf; | 1479 | raid5_conf_t *conf = sh->raid_conf; |
1335 | int raid_disks = sh->disks; | 1480 | int raid_disks = sh->disks; |
1336 | int data_disks = raid_disks - conf->max_degraded; | 1481 | int data_disks = raid_disks - conf->max_degraded; |
1337 | sector_t new_sector = sh->sector, check; | 1482 | sector_t new_sector = sh->sector, check; |
1338 | int sectors_per_chunk = conf->chunk_size >> 9; | 1483 | int sectors_per_chunk = previous ? (conf->prev_chunk >> 9) |
1484 | : (conf->chunk_size >> 9); | ||
1485 | int algorithm = previous ? conf->prev_algo | ||
1486 | : conf->algorithm; | ||
1339 | sector_t stripe; | 1487 | sector_t stripe; |
1340 | int chunk_offset; | 1488 | int chunk_offset; |
1341 | int chunk_number, dummy1, dummy2, dd_idx = i; | 1489 | int chunk_number, dummy1, dd_idx = i; |
1342 | sector_t r_sector; | 1490 | sector_t r_sector; |
1491 | struct stripe_head sh2; | ||
1343 | 1492 | ||
1344 | 1493 | ||
1345 | chunk_offset = sector_div(new_sector, sectors_per_chunk); | 1494 | chunk_offset = sector_div(new_sector, sectors_per_chunk); |
@@ -1351,7 +1500,7 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i) | |||
1351 | switch(conf->level) { | 1500 | switch(conf->level) { |
1352 | case 4: break; | 1501 | case 4: break; |
1353 | case 5: | 1502 | case 5: |
1354 | switch (conf->algorithm) { | 1503 | switch (algorithm) { |
1355 | case ALGORITHM_LEFT_ASYMMETRIC: | 1504 | case ALGORITHM_LEFT_ASYMMETRIC: |
1356 | case ALGORITHM_RIGHT_ASYMMETRIC: | 1505 | case ALGORITHM_RIGHT_ASYMMETRIC: |
1357 | if (i > sh->pd_idx) | 1506 | if (i > sh->pd_idx) |
@@ -1363,19 +1512,27 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i) | |||
1363 | i += raid_disks; | 1512 | i += raid_disks; |
1364 | i -= (sh->pd_idx + 1); | 1513 | i -= (sh->pd_idx + 1); |
1365 | break; | 1514 | break; |
1515 | case ALGORITHM_PARITY_0: | ||
1516 | i -= 1; | ||
1517 | break; | ||
1518 | case ALGORITHM_PARITY_N: | ||
1519 | break; | ||
1366 | default: | 1520 | default: |
1367 | printk(KERN_ERR "raid5: unsupported algorithm %d\n", | 1521 | printk(KERN_ERR "raid5: unsupported algorithm %d\n", |
1368 | conf->algorithm); | 1522 | algorithm); |
1523 | BUG(); | ||
1369 | } | 1524 | } |
1370 | break; | 1525 | break; |
1371 | case 6: | 1526 | case 6: |
1372 | if (i == raid6_next_disk(sh->pd_idx, raid_disks)) | 1527 | if (i == sh->qd_idx) |
1373 | return 0; /* It is the Q disk */ | 1528 | return 0; /* It is the Q disk */ |
1374 | switch (conf->algorithm) { | 1529 | switch (algorithm) { |
1375 | case ALGORITHM_LEFT_ASYMMETRIC: | 1530 | case ALGORITHM_LEFT_ASYMMETRIC: |
1376 | case ALGORITHM_RIGHT_ASYMMETRIC: | 1531 | case ALGORITHM_RIGHT_ASYMMETRIC: |
1377 | if (sh->pd_idx == raid_disks-1) | 1532 | case ALGORITHM_ROTATING_ZERO_RESTART: |
1378 | i--; /* Q D D D P */ | 1533 | case ALGORITHM_ROTATING_N_RESTART: |
1534 | if (sh->pd_idx == raid_disks-1) | ||
1535 | i--; /* Q D D D P */ | ||
1379 | else if (i > sh->pd_idx) | 1536 | else if (i > sh->pd_idx) |
1380 | i -= 2; /* D D P Q D */ | 1537 | i -= 2; /* D D P Q D */ |
1381 | break; | 1538 | break; |
@@ -1390,9 +1547,35 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i) | |||
1390 | i -= (sh->pd_idx + 2); | 1547 | i -= (sh->pd_idx + 2); |
1391 | } | 1548 | } |
1392 | break; | 1549 | break; |
1550 | case ALGORITHM_PARITY_0: | ||
1551 | i -= 2; | ||
1552 | break; | ||
1553 | case ALGORITHM_PARITY_N: | ||
1554 | break; | ||
1555 | case ALGORITHM_ROTATING_N_CONTINUE: | ||
1556 | if (sh->pd_idx == 0) | ||
1557 | i--; /* P D D D Q */ | ||
1558 | else if (i > sh->pd_idx) | ||
1559 | i -= 2; /* D D Q P D */ | ||
1560 | break; | ||
1561 | case ALGORITHM_LEFT_ASYMMETRIC_6: | ||
1562 | case ALGORITHM_RIGHT_ASYMMETRIC_6: | ||
1563 | if (i > sh->pd_idx) | ||
1564 | i--; | ||
1565 | break; | ||
1566 | case ALGORITHM_LEFT_SYMMETRIC_6: | ||
1567 | case ALGORITHM_RIGHT_SYMMETRIC_6: | ||
1568 | if (i < sh->pd_idx) | ||
1569 | i += data_disks + 1; | ||
1570 | i -= (sh->pd_idx + 1); | ||
1571 | break; | ||
1572 | case ALGORITHM_PARITY_0_6: | ||
1573 | i -= 1; | ||
1574 | break; | ||
1393 | default: | 1575 | default: |
1394 | printk(KERN_CRIT "raid6: unsupported algorithm %d\n", | 1576 | printk(KERN_CRIT "raid6: unsupported algorithm %d\n", |
1395 | conf->algorithm); | 1577 | algorithm); |
1578 | BUG(); | ||
1396 | } | 1579 | } |
1397 | break; | 1580 | break; |
1398 | } | 1581 | } |
@@ -1400,8 +1583,10 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i) | |||
1400 | chunk_number = stripe * data_disks + i; | 1583 | chunk_number = stripe * data_disks + i; |
1401 | r_sector = (sector_t)chunk_number * sectors_per_chunk + chunk_offset; | 1584 | r_sector = (sector_t)chunk_number * sectors_per_chunk + chunk_offset; |
1402 | 1585 | ||
1403 | check = raid5_compute_sector(r_sector, raid_disks, data_disks, &dummy1, &dummy2, conf); | 1586 | check = raid5_compute_sector(conf, r_sector, |
1404 | if (check != sh->sector || dummy1 != dd_idx || dummy2 != sh->pd_idx) { | 1587 | previous, &dummy1, &sh2); |
1588 | if (check != sh->sector || dummy1 != dd_idx || sh2.pd_idx != sh->pd_idx | ||
1589 | || sh2.qd_idx != sh->qd_idx) { | ||
1405 | printk(KERN_ERR "compute_blocknr: map not correct\n"); | 1590 | printk(KERN_ERR "compute_blocknr: map not correct\n"); |
1406 | return 0; | 1591 | return 0; |
1407 | } | 1592 | } |
@@ -1468,14 +1653,16 @@ static void copy_data(int frombio, struct bio *bio, | |||
1468 | 1653 | ||
1469 | static void compute_parity6(struct stripe_head *sh, int method) | 1654 | static void compute_parity6(struct stripe_head *sh, int method) |
1470 | { | 1655 | { |
1471 | raid6_conf_t *conf = sh->raid_conf; | 1656 | raid5_conf_t *conf = sh->raid_conf; |
1472 | int i, pd_idx = sh->pd_idx, qd_idx, d0_idx, disks = sh->disks, count; | 1657 | int i, pd_idx, qd_idx, d0_idx, disks = sh->disks, count; |
1658 | int syndrome_disks = sh->ddf_layout ? disks : (disks - 2); | ||
1473 | struct bio *chosen; | 1659 | struct bio *chosen; |
1474 | /**** FIX THIS: This could be very bad if disks is close to 256 ****/ | 1660 | /**** FIX THIS: This could be very bad if disks is close to 256 ****/ |
1475 | void *ptrs[disks]; | 1661 | void *ptrs[syndrome_disks+2]; |
1476 | 1662 | ||
1477 | qd_idx = raid6_next_disk(pd_idx, disks); | 1663 | pd_idx = sh->pd_idx; |
1478 | d0_idx = raid6_next_disk(qd_idx, disks); | 1664 | qd_idx = sh->qd_idx; |
1665 | d0_idx = raid6_d0(sh); | ||
1479 | 1666 | ||
1480 | pr_debug("compute_parity, stripe %llu, method %d\n", | 1667 | pr_debug("compute_parity, stripe %llu, method %d\n", |
1481 | (unsigned long long)sh->sector, method); | 1668 | (unsigned long long)sh->sector, method); |
@@ -1513,24 +1700,29 @@ static void compute_parity6(struct stripe_head *sh, int method) | |||
1513 | set_bit(R5_UPTODATE, &sh->dev[i].flags); | 1700 | set_bit(R5_UPTODATE, &sh->dev[i].flags); |
1514 | } | 1701 | } |
1515 | 1702 | ||
1516 | // switch(method) { | 1703 | /* Note that unlike RAID-5, the ordering of the disks matters greatly.*/ |
1517 | // case RECONSTRUCT_WRITE: | 1704 | |
1518 | // case CHECK_PARITY: | 1705 | for (i = 0; i < disks; i++) |
1519 | // case UPDATE_PARITY: | 1706 | ptrs[i] = (void *)raid6_empty_zero_page; |
1520 | /* Note that unlike RAID-5, the ordering of the disks matters greatly. */ | 1707 | |
1521 | /* FIX: Is this ordering of drives even remotely optimal? */ | 1708 | count = 0; |
1522 | count = 0; | 1709 | i = d0_idx; |
1523 | i = d0_idx; | 1710 | do { |
1524 | do { | 1711 | int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); |
1525 | ptrs[count++] = page_address(sh->dev[i].page); | 1712 | |
1526 | if (count <= disks-2 && !test_bit(R5_UPTODATE, &sh->dev[i].flags)) | 1713 | ptrs[slot] = page_address(sh->dev[i].page); |
1527 | printk("block %d/%d not uptodate on parity calc\n", i,count); | 1714 | if (slot < syndrome_disks && |
1528 | i = raid6_next_disk(i, disks); | 1715 | !test_bit(R5_UPTODATE, &sh->dev[i].flags)) { |
1529 | } while ( i != d0_idx ); | 1716 | printk(KERN_ERR "block %d/%d not uptodate " |
1530 | // break; | 1717 | "on parity calc\n", i, count); |
1531 | // } | 1718 | BUG(); |
1532 | 1719 | } | |
1533 | raid6_call.gen_syndrome(disks, STRIPE_SIZE, ptrs); | 1720 | |
1721 | i = raid6_next_disk(i, disks); | ||
1722 | } while (i != d0_idx); | ||
1723 | BUG_ON(count != syndrome_disks); | ||
1724 | |||
1725 | raid6_call.gen_syndrome(syndrome_disks+2, STRIPE_SIZE, ptrs); | ||
1534 | 1726 | ||
1535 | switch(method) { | 1727 | switch(method) { |
1536 | case RECONSTRUCT_WRITE: | 1728 | case RECONSTRUCT_WRITE: |
@@ -1552,8 +1744,7 @@ static void compute_block_1(struct stripe_head *sh, int dd_idx, int nozero) | |||
1552 | { | 1744 | { |
1553 | int i, count, disks = sh->disks; | 1745 | int i, count, disks = sh->disks; |
1554 | void *ptr[MAX_XOR_BLOCKS], *dest, *p; | 1746 | void *ptr[MAX_XOR_BLOCKS], *dest, *p; |
1555 | int pd_idx = sh->pd_idx; | 1747 | int qd_idx = sh->qd_idx; |
1556 | int qd_idx = raid6_next_disk(pd_idx, disks); | ||
1557 | 1748 | ||
1558 | pr_debug("compute_block_1, stripe %llu, idx %d\n", | 1749 | pr_debug("compute_block_1, stripe %llu, idx %d\n", |
1559 | (unsigned long long)sh->sector, dd_idx); | 1750 | (unsigned long long)sh->sector, dd_idx); |
@@ -1589,63 +1780,65 @@ static void compute_block_1(struct stripe_head *sh, int dd_idx, int nozero) | |||
1589 | static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2) | 1780 | static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2) |
1590 | { | 1781 | { |
1591 | int i, count, disks = sh->disks; | 1782 | int i, count, disks = sh->disks; |
1592 | int pd_idx = sh->pd_idx; | 1783 | int syndrome_disks = sh->ddf_layout ? disks : disks-2; |
1593 | int qd_idx = raid6_next_disk(pd_idx, disks); | 1784 | int d0_idx = raid6_d0(sh); |
1594 | int d0_idx = raid6_next_disk(qd_idx, disks); | 1785 | int faila = -1, failb = -1; |
1595 | int faila, failb; | 1786 | /**** FIX THIS: This could be very bad if disks is close to 256 ****/ |
1787 | void *ptrs[syndrome_disks+2]; | ||
1596 | 1788 | ||
1597 | /* faila and failb are disk numbers relative to d0_idx */ | 1789 | for (i = 0; i < disks ; i++) |
1598 | /* pd_idx become disks-2 and qd_idx become disks-1 */ | 1790 | ptrs[i] = (void *)raid6_empty_zero_page; |
1599 | faila = (dd_idx1 < d0_idx) ? dd_idx1+(disks-d0_idx) : dd_idx1-d0_idx; | 1791 | count = 0; |
1600 | failb = (dd_idx2 < d0_idx) ? dd_idx2+(disks-d0_idx) : dd_idx2-d0_idx; | 1792 | i = d0_idx; |
1793 | do { | ||
1794 | int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); | ||
1795 | |||
1796 | ptrs[slot] = page_address(sh->dev[i].page); | ||
1797 | |||
1798 | if (i == dd_idx1) | ||
1799 | faila = slot; | ||
1800 | if (i == dd_idx2) | ||
1801 | failb = slot; | ||
1802 | i = raid6_next_disk(i, disks); | ||
1803 | } while (i != d0_idx); | ||
1804 | BUG_ON(count != syndrome_disks); | ||
1601 | 1805 | ||
1602 | BUG_ON(faila == failb); | 1806 | BUG_ON(faila == failb); |
1603 | if ( failb < faila ) { int tmp = faila; faila = failb; failb = tmp; } | 1807 | if ( failb < faila ) { int tmp = faila; faila = failb; failb = tmp; } |
1604 | 1808 | ||
1605 | pr_debug("compute_block_2, stripe %llu, idx %d,%d (%d,%d)\n", | 1809 | pr_debug("compute_block_2, stripe %llu, idx %d,%d (%d,%d)\n", |
1606 | (unsigned long long)sh->sector, dd_idx1, dd_idx2, faila, failb); | 1810 | (unsigned long long)sh->sector, dd_idx1, dd_idx2, |
1811 | faila, failb); | ||
1607 | 1812 | ||
1608 | if ( failb == disks-1 ) { | 1813 | if (failb == syndrome_disks+1) { |
1609 | /* Q disk is one of the missing disks */ | 1814 | /* Q disk is one of the missing disks */ |
1610 | if ( faila == disks-2 ) { | 1815 | if (faila == syndrome_disks) { |
1611 | /* Missing P+Q, just recompute */ | 1816 | /* Missing P+Q, just recompute */ |
1612 | compute_parity6(sh, UPDATE_PARITY); | 1817 | compute_parity6(sh, UPDATE_PARITY); |
1613 | return; | 1818 | return; |
1614 | } else { | 1819 | } else { |
1615 | /* We're missing D+Q; recompute D from P */ | 1820 | /* We're missing D+Q; recompute D from P */ |
1616 | compute_block_1(sh, (dd_idx1 == qd_idx) ? dd_idx2 : dd_idx1, 0); | 1821 | compute_block_1(sh, ((dd_idx1 == sh->qd_idx) ? |
1822 | dd_idx2 : dd_idx1), | ||
1823 | 0); | ||
1617 | compute_parity6(sh, UPDATE_PARITY); /* Is this necessary? */ | 1824 | compute_parity6(sh, UPDATE_PARITY); /* Is this necessary? */ |
1618 | return; | 1825 | return; |
1619 | } | 1826 | } |
1620 | } | 1827 | } |
1621 | 1828 | ||
1622 | /* We're missing D+P or D+D; build pointer table */ | 1829 | /* We're missing D+P or D+D; */ |
1623 | { | 1830 | if (failb == syndrome_disks) { |
1624 | /**** FIX THIS: This could be very bad if disks is close to 256 ****/ | 1831 | /* We're missing D+P. */ |
1625 | void *ptrs[disks]; | 1832 | raid6_datap_recov(syndrome_disks+2, STRIPE_SIZE, faila, ptrs); |
1626 | 1833 | } else { | |
1627 | count = 0; | 1834 | /* We're missing D+D. */ |
1628 | i = d0_idx; | 1835 | raid6_2data_recov(syndrome_disks+2, STRIPE_SIZE, faila, failb, |
1629 | do { | 1836 | ptrs); |
1630 | ptrs[count++] = page_address(sh->dev[i].page); | ||
1631 | i = raid6_next_disk(i, disks); | ||
1632 | if (i != dd_idx1 && i != dd_idx2 && | ||
1633 | !test_bit(R5_UPTODATE, &sh->dev[i].flags)) | ||
1634 | printk("compute_2 with missing block %d/%d\n", count, i); | ||
1635 | } while ( i != d0_idx ); | ||
1636 | |||
1637 | if ( failb == disks-2 ) { | ||
1638 | /* We're missing D+P. */ | ||
1639 | raid6_datap_recov(disks, STRIPE_SIZE, faila, ptrs); | ||
1640 | } else { | ||
1641 | /* We're missing D+D. */ | ||
1642 | raid6_2data_recov(disks, STRIPE_SIZE, faila, failb, ptrs); | ||
1643 | } | ||
1644 | |||
1645 | /* Both the above update both missing blocks */ | ||
1646 | set_bit(R5_UPTODATE, &sh->dev[dd_idx1].flags); | ||
1647 | set_bit(R5_UPTODATE, &sh->dev[dd_idx2].flags); | ||
1648 | } | 1837 | } |
1838 | |||
1839 | /* Both the above update both missing blocks */ | ||
1840 | set_bit(R5_UPTODATE, &sh->dev[dd_idx1].flags); | ||
1841 | set_bit(R5_UPTODATE, &sh->dev[dd_idx2].flags); | ||
1649 | } | 1842 | } |
1650 | 1843 | ||
1651 | static void | 1844 | static void |
@@ -1800,17 +1993,21 @@ static int page_is_zero(struct page *p) | |||
1800 | memcmp(a, a+4, STRIPE_SIZE-4)==0); | 1993 | memcmp(a, a+4, STRIPE_SIZE-4)==0); |
1801 | } | 1994 | } |
1802 | 1995 | ||
1803 | static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int disks) | 1996 | static void stripe_set_idx(sector_t stripe, raid5_conf_t *conf, int previous, |
1997 | struct stripe_head *sh) | ||
1804 | { | 1998 | { |
1805 | int sectors_per_chunk = conf->chunk_size >> 9; | 1999 | int sectors_per_chunk = |
1806 | int pd_idx, dd_idx; | 2000 | previous ? (conf->prev_chunk >> 9) |
2001 | : (conf->chunk_size >> 9); | ||
2002 | int dd_idx; | ||
1807 | int chunk_offset = sector_div(stripe, sectors_per_chunk); | 2003 | int chunk_offset = sector_div(stripe, sectors_per_chunk); |
2004 | int disks = previous ? conf->previous_raid_disks : conf->raid_disks; | ||
1808 | 2005 | ||
1809 | raid5_compute_sector(stripe * (disks - conf->max_degraded) | 2006 | raid5_compute_sector(conf, |
2007 | stripe * (disks - conf->max_degraded) | ||
1810 | *sectors_per_chunk + chunk_offset, | 2008 | *sectors_per_chunk + chunk_offset, |
1811 | disks, disks - conf->max_degraded, | 2009 | previous, |
1812 | &dd_idx, &pd_idx, conf); | 2010 | &dd_idx, sh); |
1813 | return pd_idx; | ||
1814 | } | 2011 | } |
1815 | 2012 | ||
1816 | static void | 2013 | static void |
@@ -2181,7 +2378,7 @@ static void handle_stripe_dirtying6(raid5_conf_t *conf, | |||
2181 | struct r6_state *r6s, int disks) | 2378 | struct r6_state *r6s, int disks) |
2182 | { | 2379 | { |
2183 | int rcw = 0, must_compute = 0, pd_idx = sh->pd_idx, i; | 2380 | int rcw = 0, must_compute = 0, pd_idx = sh->pd_idx, i; |
2184 | int qd_idx = r6s->qd_idx; | 2381 | int qd_idx = sh->qd_idx; |
2185 | for (i = disks; i--; ) { | 2382 | for (i = disks; i--; ) { |
2186 | struct r5dev *dev = &sh->dev[i]; | 2383 | struct r5dev *dev = &sh->dev[i]; |
2187 | /* Would I have to read this buffer for reconstruct_write */ | 2384 | /* Would I have to read this buffer for reconstruct_write */ |
@@ -2371,7 +2568,7 @@ static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh, | |||
2371 | int update_p = 0, update_q = 0; | 2568 | int update_p = 0, update_q = 0; |
2372 | struct r5dev *dev; | 2569 | struct r5dev *dev; |
2373 | int pd_idx = sh->pd_idx; | 2570 | int pd_idx = sh->pd_idx; |
2374 | int qd_idx = r6s->qd_idx; | 2571 | int qd_idx = sh->qd_idx; |
2375 | 2572 | ||
2376 | set_bit(STRIPE_HANDLE, &sh->state); | 2573 | set_bit(STRIPE_HANDLE, &sh->state); |
2377 | 2574 | ||
@@ -2467,17 +2664,14 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh, | |||
2467 | struct dma_async_tx_descriptor *tx = NULL; | 2664 | struct dma_async_tx_descriptor *tx = NULL; |
2468 | clear_bit(STRIPE_EXPAND_SOURCE, &sh->state); | 2665 | clear_bit(STRIPE_EXPAND_SOURCE, &sh->state); |
2469 | for (i = 0; i < sh->disks; i++) | 2666 | for (i = 0; i < sh->disks; i++) |
2470 | if (i != sh->pd_idx && (!r6s || i != r6s->qd_idx)) { | 2667 | if (i != sh->pd_idx && i != sh->qd_idx) { |
2471 | int dd_idx, pd_idx, j; | 2668 | int dd_idx, j; |
2472 | struct stripe_head *sh2; | 2669 | struct stripe_head *sh2; |
2473 | 2670 | ||
2474 | sector_t bn = compute_blocknr(sh, i); | 2671 | sector_t bn = compute_blocknr(sh, i, 1); |
2475 | sector_t s = raid5_compute_sector(bn, conf->raid_disks, | 2672 | sector_t s = raid5_compute_sector(conf, bn, 0, |
2476 | conf->raid_disks - | 2673 | &dd_idx, NULL); |
2477 | conf->max_degraded, &dd_idx, | 2674 | sh2 = get_active_stripe(conf, s, 0, 1); |
2478 | &pd_idx, conf); | ||
2479 | sh2 = get_active_stripe(conf, s, conf->raid_disks, | ||
2480 | pd_idx, 1); | ||
2481 | if (sh2 == NULL) | 2675 | if (sh2 == NULL) |
2482 | /* so far only the early blocks of this stripe | 2676 | /* so far only the early blocks of this stripe |
2483 | * have been requested. When later blocks | 2677 | * have been requested. When later blocks |
@@ -2500,8 +2694,7 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh, | |||
2500 | set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags); | 2694 | set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags); |
2501 | for (j = 0; j < conf->raid_disks; j++) | 2695 | for (j = 0; j < conf->raid_disks; j++) |
2502 | if (j != sh2->pd_idx && | 2696 | if (j != sh2->pd_idx && |
2503 | (!r6s || j != raid6_next_disk(sh2->pd_idx, | 2697 | (!r6s || j != sh2->qd_idx) && |
2504 | sh2->disks)) && | ||
2505 | !test_bit(R5_Expanded, &sh2->dev[j].flags)) | 2698 | !test_bit(R5_Expanded, &sh2->dev[j].flags)) |
2506 | break; | 2699 | break; |
2507 | if (j == conf->raid_disks) { | 2700 | if (j == conf->raid_disks) { |
@@ -2750,6 +2943,23 @@ static bool handle_stripe5(struct stripe_head *sh) | |||
2750 | 2943 | ||
2751 | /* Finish reconstruct operations initiated by the expansion process */ | 2944 | /* Finish reconstruct operations initiated by the expansion process */ |
2752 | if (sh->reconstruct_state == reconstruct_state_result) { | 2945 | if (sh->reconstruct_state == reconstruct_state_result) { |
2946 | struct stripe_head *sh2 | ||
2947 | = get_active_stripe(conf, sh->sector, 1, 1); | ||
2948 | if (sh2 && test_bit(STRIPE_EXPAND_SOURCE, &sh2->state)) { | ||
2949 | /* sh cannot be written until sh2 has been read. | ||
2950 | * so arrange for sh to be delayed a little | ||
2951 | */ | ||
2952 | set_bit(STRIPE_DELAYED, &sh->state); | ||
2953 | set_bit(STRIPE_HANDLE, &sh->state); | ||
2954 | if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, | ||
2955 | &sh2->state)) | ||
2956 | atomic_inc(&conf->preread_active_stripes); | ||
2957 | release_stripe(sh2); | ||
2958 | goto unlock; | ||
2959 | } | ||
2960 | if (sh2) | ||
2961 | release_stripe(sh2); | ||
2962 | |||
2753 | sh->reconstruct_state = reconstruct_state_idle; | 2963 | sh->reconstruct_state = reconstruct_state_idle; |
2754 | clear_bit(STRIPE_EXPANDING, &sh->state); | 2964 | clear_bit(STRIPE_EXPANDING, &sh->state); |
2755 | for (i = conf->raid_disks; i--; ) { | 2965 | for (i = conf->raid_disks; i--; ) { |
@@ -2763,8 +2973,7 @@ static bool handle_stripe5(struct stripe_head *sh) | |||
2763 | !sh->reconstruct_state) { | 2973 | !sh->reconstruct_state) { |
2764 | /* Need to write out all blocks after computing parity */ | 2974 | /* Need to write out all blocks after computing parity */ |
2765 | sh->disks = conf->raid_disks; | 2975 | sh->disks = conf->raid_disks; |
2766 | sh->pd_idx = stripe_to_pdidx(sh->sector, conf, | 2976 | stripe_set_idx(sh->sector, conf, 0, sh); |
2767 | conf->raid_disks); | ||
2768 | schedule_reconstruction5(sh, &s, 1, 1); | 2977 | schedule_reconstruction5(sh, &s, 1, 1); |
2769 | } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) { | 2978 | } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) { |
2770 | clear_bit(STRIPE_EXPAND_READY, &sh->state); | 2979 | clear_bit(STRIPE_EXPAND_READY, &sh->state); |
@@ -2796,20 +3005,19 @@ static bool handle_stripe5(struct stripe_head *sh) | |||
2796 | 3005 | ||
2797 | static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page) | 3006 | static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page) |
2798 | { | 3007 | { |
2799 | raid6_conf_t *conf = sh->raid_conf; | 3008 | raid5_conf_t *conf = sh->raid_conf; |
2800 | int disks = sh->disks; | 3009 | int disks = sh->disks; |
2801 | struct bio *return_bi = NULL; | 3010 | struct bio *return_bi = NULL; |
2802 | int i, pd_idx = sh->pd_idx; | 3011 | int i, pd_idx = sh->pd_idx, qd_idx = sh->qd_idx; |
2803 | struct stripe_head_state s; | 3012 | struct stripe_head_state s; |
2804 | struct r6_state r6s; | 3013 | struct r6_state r6s; |
2805 | struct r5dev *dev, *pdev, *qdev; | 3014 | struct r5dev *dev, *pdev, *qdev; |
2806 | mdk_rdev_t *blocked_rdev = NULL; | 3015 | mdk_rdev_t *blocked_rdev = NULL; |
2807 | 3016 | ||
2808 | r6s.qd_idx = raid6_next_disk(pd_idx, disks); | ||
2809 | pr_debug("handling stripe %llu, state=%#lx cnt=%d, " | 3017 | pr_debug("handling stripe %llu, state=%#lx cnt=%d, " |
2810 | "pd_idx=%d, qd_idx=%d\n", | 3018 | "pd_idx=%d, qd_idx=%d\n", |
2811 | (unsigned long long)sh->sector, sh->state, | 3019 | (unsigned long long)sh->sector, sh->state, |
2812 | atomic_read(&sh->count), pd_idx, r6s.qd_idx); | 3020 | atomic_read(&sh->count), pd_idx, qd_idx); |
2813 | memset(&s, 0, sizeof(s)); | 3021 | memset(&s, 0, sizeof(s)); |
2814 | 3022 | ||
2815 | spin_lock(&sh->lock); | 3023 | spin_lock(&sh->lock); |
@@ -2920,9 +3128,9 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page) | |||
2920 | pdev = &sh->dev[pd_idx]; | 3128 | pdev = &sh->dev[pd_idx]; |
2921 | r6s.p_failed = (s.failed >= 1 && r6s.failed_num[0] == pd_idx) | 3129 | r6s.p_failed = (s.failed >= 1 && r6s.failed_num[0] == pd_idx) |
2922 | || (s.failed >= 2 && r6s.failed_num[1] == pd_idx); | 3130 | || (s.failed >= 2 && r6s.failed_num[1] == pd_idx); |
2923 | qdev = &sh->dev[r6s.qd_idx]; | 3131 | qdev = &sh->dev[qd_idx]; |
2924 | r6s.q_failed = (s.failed >= 1 && r6s.failed_num[0] == r6s.qd_idx) | 3132 | r6s.q_failed = (s.failed >= 1 && r6s.failed_num[0] == qd_idx) |
2925 | || (s.failed >= 2 && r6s.failed_num[1] == r6s.qd_idx); | 3133 | || (s.failed >= 2 && r6s.failed_num[1] == qd_idx); |
2926 | 3134 | ||
2927 | if ( s.written && | 3135 | if ( s.written && |
2928 | ( r6s.p_failed || ((test_bit(R5_Insync, &pdev->flags) | 3136 | ( r6s.p_failed || ((test_bit(R5_Insync, &pdev->flags) |
@@ -2980,10 +3188,26 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page) | |||
2980 | } | 3188 | } |
2981 | 3189 | ||
2982 | if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state)) { | 3190 | if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state)) { |
3191 | struct stripe_head *sh2 | ||
3192 | = get_active_stripe(conf, sh->sector, 1, 1); | ||
3193 | if (sh2 && test_bit(STRIPE_EXPAND_SOURCE, &sh2->state)) { | ||
3194 | /* sh cannot be written until sh2 has been read. | ||
3195 | * so arrange for sh to be delayed a little | ||
3196 | */ | ||
3197 | set_bit(STRIPE_DELAYED, &sh->state); | ||
3198 | set_bit(STRIPE_HANDLE, &sh->state); | ||
3199 | if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, | ||
3200 | &sh2->state)) | ||
3201 | atomic_inc(&conf->preread_active_stripes); | ||
3202 | release_stripe(sh2); | ||
3203 | goto unlock; | ||
3204 | } | ||
3205 | if (sh2) | ||
3206 | release_stripe(sh2); | ||
3207 | |||
2983 | /* Need to write out all blocks after computing P&Q */ | 3208 | /* Need to write out all blocks after computing P&Q */ |
2984 | sh->disks = conf->raid_disks; | 3209 | sh->disks = conf->raid_disks; |
2985 | sh->pd_idx = stripe_to_pdidx(sh->sector, conf, | 3210 | stripe_set_idx(sh->sector, conf, 0, sh); |
2986 | conf->raid_disks); | ||
2987 | compute_parity6(sh, RECONSTRUCT_WRITE); | 3211 | compute_parity6(sh, RECONSTRUCT_WRITE); |
2988 | for (i = conf->raid_disks ; i-- ; ) { | 3212 | for (i = conf->raid_disks ; i-- ; ) { |
2989 | set_bit(R5_LOCKED, &sh->dev[i].flags); | 3213 | set_bit(R5_LOCKED, &sh->dev[i].flags); |
@@ -3134,6 +3358,8 @@ static int raid5_mergeable_bvec(struct request_queue *q, | |||
3134 | if ((bvm->bi_rw & 1) == WRITE) | 3358 | if ((bvm->bi_rw & 1) == WRITE) |
3135 | return biovec->bv_len; /* always allow writes to be mergeable */ | 3359 | return biovec->bv_len; /* always allow writes to be mergeable */ |
3136 | 3360 | ||
3361 | if (mddev->new_chunk < mddev->chunk_size) | ||
3362 | chunk_sectors = mddev->new_chunk >> 9; | ||
3137 | max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9; | 3363 | max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9; |
3138 | if (max < 0) max = 0; | 3364 | if (max < 0) max = 0; |
3139 | if (max <= biovec->bv_len && bio_sectors == 0) | 3365 | if (max <= biovec->bv_len && bio_sectors == 0) |
@@ -3149,6 +3375,8 @@ static int in_chunk_boundary(mddev_t *mddev, struct bio *bio) | |||
3149 | unsigned int chunk_sectors = mddev->chunk_size >> 9; | 3375 | unsigned int chunk_sectors = mddev->chunk_size >> 9; |
3150 | unsigned int bio_sectors = bio->bi_size >> 9; | 3376 | unsigned int bio_sectors = bio->bi_size >> 9; |
3151 | 3377 | ||
3378 | if (mddev->new_chunk < mddev->chunk_size) | ||
3379 | chunk_sectors = mddev->new_chunk >> 9; | ||
3152 | return chunk_sectors >= | 3380 | return chunk_sectors >= |
3153 | ((sector & (chunk_sectors - 1)) + bio_sectors); | 3381 | ((sector & (chunk_sectors - 1)) + bio_sectors); |
3154 | } | 3382 | } |
@@ -3255,9 +3483,7 @@ static int chunk_aligned_read(struct request_queue *q, struct bio * raid_bio) | |||
3255 | { | 3483 | { |
3256 | mddev_t *mddev = q->queuedata; | 3484 | mddev_t *mddev = q->queuedata; |
3257 | raid5_conf_t *conf = mddev_to_conf(mddev); | 3485 | raid5_conf_t *conf = mddev_to_conf(mddev); |
3258 | const unsigned int raid_disks = conf->raid_disks; | 3486 | unsigned int dd_idx; |
3259 | const unsigned int data_disks = raid_disks - conf->max_degraded; | ||
3260 | unsigned int dd_idx, pd_idx; | ||
3261 | struct bio* align_bi; | 3487 | struct bio* align_bi; |
3262 | mdk_rdev_t *rdev; | 3488 | mdk_rdev_t *rdev; |
3263 | 3489 | ||
@@ -3266,7 +3492,7 @@ static int chunk_aligned_read(struct request_queue *q, struct bio * raid_bio) | |||
3266 | return 0; | 3492 | return 0; |
3267 | } | 3493 | } |
3268 | /* | 3494 | /* |
3269 | * use bio_clone to make a copy of the bio | 3495 | * use bio_clone to make a copy of the bio |
3270 | */ | 3496 | */ |
3271 | align_bi = bio_clone(raid_bio, GFP_NOIO); | 3497 | align_bi = bio_clone(raid_bio, GFP_NOIO); |
3272 | if (!align_bi) | 3498 | if (!align_bi) |
@@ -3280,12 +3506,9 @@ static int chunk_aligned_read(struct request_queue *q, struct bio * raid_bio) | |||
3280 | /* | 3506 | /* |
3281 | * compute position | 3507 | * compute position |
3282 | */ | 3508 | */ |
3283 | align_bi->bi_sector = raid5_compute_sector(raid_bio->bi_sector, | 3509 | align_bi->bi_sector = raid5_compute_sector(conf, raid_bio->bi_sector, |
3284 | raid_disks, | 3510 | 0, |
3285 | data_disks, | 3511 | &dd_idx, NULL); |
3286 | &dd_idx, | ||
3287 | &pd_idx, | ||
3288 | conf); | ||
3289 | 3512 | ||
3290 | rcu_read_lock(); | 3513 | rcu_read_lock(); |
3291 | rdev = rcu_dereference(conf->disks[dd_idx].rdev); | 3514 | rdev = rcu_dereference(conf->disks[dd_idx].rdev); |
@@ -3377,7 +3600,7 @@ static int make_request(struct request_queue *q, struct bio * bi) | |||
3377 | { | 3600 | { |
3378 | mddev_t *mddev = q->queuedata; | 3601 | mddev_t *mddev = q->queuedata; |
3379 | raid5_conf_t *conf = mddev_to_conf(mddev); | 3602 | raid5_conf_t *conf = mddev_to_conf(mddev); |
3380 | unsigned int dd_idx, pd_idx; | 3603 | int dd_idx; |
3381 | sector_t new_sector; | 3604 | sector_t new_sector; |
3382 | sector_t logical_sector, last_sector; | 3605 | sector_t logical_sector, last_sector; |
3383 | struct stripe_head *sh; | 3606 | struct stripe_head *sh; |
@@ -3400,7 +3623,7 @@ static int make_request(struct request_queue *q, struct bio * bi) | |||
3400 | if (rw == READ && | 3623 | if (rw == READ && |
3401 | mddev->reshape_position == MaxSector && | 3624 | mddev->reshape_position == MaxSector && |
3402 | chunk_aligned_read(q,bi)) | 3625 | chunk_aligned_read(q,bi)) |
3403 | return 0; | 3626 | return 0; |
3404 | 3627 | ||
3405 | logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1); | 3628 | logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1); |
3406 | last_sector = bi->bi_sector + (bi->bi_size>>9); | 3629 | last_sector = bi->bi_sector + (bi->bi_size>>9); |
@@ -3410,26 +3633,31 @@ static int make_request(struct request_queue *q, struct bio * bi) | |||
3410 | for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) { | 3633 | for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) { |
3411 | DEFINE_WAIT(w); | 3634 | DEFINE_WAIT(w); |
3412 | int disks, data_disks; | 3635 | int disks, data_disks; |
3636 | int previous; | ||
3413 | 3637 | ||
3414 | retry: | 3638 | retry: |
3639 | previous = 0; | ||
3640 | disks = conf->raid_disks; | ||
3415 | prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); | 3641 | prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); |
3416 | if (likely(conf->expand_progress == MaxSector)) | 3642 | if (unlikely(conf->reshape_progress != MaxSector)) { |
3417 | disks = conf->raid_disks; | 3643 | /* spinlock is needed as reshape_progress may be |
3418 | else { | ||
3419 | /* spinlock is needed as expand_progress may be | ||
3420 | * 64bit on a 32bit platform, and so it might be | 3644 | * 64bit on a 32bit platform, and so it might be |
3421 | * possible to see a half-updated value | 3645 | * possible to see a half-updated value |
3422 | * Ofcourse expand_progress could change after | 3646 | * Ofcourse reshape_progress could change after |
3423 | * the lock is dropped, so once we get a reference | 3647 | * the lock is dropped, so once we get a reference |
3424 | * to the stripe that we think it is, we will have | 3648 | * to the stripe that we think it is, we will have |
3425 | * to check again. | 3649 | * to check again. |
3426 | */ | 3650 | */ |
3427 | spin_lock_irq(&conf->device_lock); | 3651 | spin_lock_irq(&conf->device_lock); |
3428 | disks = conf->raid_disks; | 3652 | if (mddev->delta_disks < 0 |
3429 | if (logical_sector >= conf->expand_progress) | 3653 | ? logical_sector < conf->reshape_progress |
3654 | : logical_sector >= conf->reshape_progress) { | ||
3430 | disks = conf->previous_raid_disks; | 3655 | disks = conf->previous_raid_disks; |
3431 | else { | 3656 | previous = 1; |
3432 | if (logical_sector >= conf->expand_lo) { | 3657 | } else { |
3658 | if (mddev->delta_disks < 0 | ||
3659 | ? logical_sector < conf->reshape_safe | ||
3660 | : logical_sector >= conf->reshape_safe) { | ||
3433 | spin_unlock_irq(&conf->device_lock); | 3661 | spin_unlock_irq(&conf->device_lock); |
3434 | schedule(); | 3662 | schedule(); |
3435 | goto retry; | 3663 | goto retry; |
@@ -3439,15 +3667,17 @@ static int make_request(struct request_queue *q, struct bio * bi) | |||
3439 | } | 3667 | } |
3440 | data_disks = disks - conf->max_degraded; | 3668 | data_disks = disks - conf->max_degraded; |
3441 | 3669 | ||
3442 | new_sector = raid5_compute_sector(logical_sector, disks, data_disks, | 3670 | new_sector = raid5_compute_sector(conf, logical_sector, |
3443 | &dd_idx, &pd_idx, conf); | 3671 | previous, |
3672 | &dd_idx, NULL); | ||
3444 | pr_debug("raid5: make_request, sector %llu logical %llu\n", | 3673 | pr_debug("raid5: make_request, sector %llu logical %llu\n", |
3445 | (unsigned long long)new_sector, | 3674 | (unsigned long long)new_sector, |
3446 | (unsigned long long)logical_sector); | 3675 | (unsigned long long)logical_sector); |
3447 | 3676 | ||
3448 | sh = get_active_stripe(conf, new_sector, disks, pd_idx, (bi->bi_rw&RWA_MASK)); | 3677 | sh = get_active_stripe(conf, new_sector, previous, |
3678 | (bi->bi_rw&RWA_MASK)); | ||
3449 | if (sh) { | 3679 | if (sh) { |
3450 | if (unlikely(conf->expand_progress != MaxSector)) { | 3680 | if (unlikely(previous)) { |
3451 | /* expansion might have moved on while waiting for a | 3681 | /* expansion might have moved on while waiting for a |
3452 | * stripe, so we must do the range check again. | 3682 | * stripe, so we must do the range check again. |
3453 | * Expansion could still move past after this | 3683 | * Expansion could still move past after this |
@@ -3458,8 +3688,9 @@ static int make_request(struct request_queue *q, struct bio * bi) | |||
3458 | */ | 3688 | */ |
3459 | int must_retry = 0; | 3689 | int must_retry = 0; |
3460 | spin_lock_irq(&conf->device_lock); | 3690 | spin_lock_irq(&conf->device_lock); |
3461 | if (logical_sector < conf->expand_progress && | 3691 | if (mddev->delta_disks < 0 |
3462 | disks == conf->previous_raid_disks) | 3692 | ? logical_sector >= conf->reshape_progress |
3693 | : logical_sector < conf->reshape_progress) | ||
3463 | /* mismatch, need to try again */ | 3694 | /* mismatch, need to try again */ |
3464 | must_retry = 1; | 3695 | must_retry = 1; |
3465 | spin_unlock_irq(&conf->device_lock); | 3696 | spin_unlock_irq(&conf->device_lock); |
@@ -3514,6 +3745,8 @@ static int make_request(struct request_queue *q, struct bio * bi) | |||
3514 | return 0; | 3745 | return 0; |
3515 | } | 3746 | } |
3516 | 3747 | ||
3748 | static sector_t raid5_size(mddev_t *mddev, sector_t sectors, int raid_disks); | ||
3749 | |||
3517 | static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped) | 3750 | static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped) |
3518 | { | 3751 | { |
3519 | /* reshaping is quite different to recovery/resync so it is | 3752 | /* reshaping is quite different to recovery/resync so it is |
@@ -3527,61 +3760,118 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped | |||
3527 | */ | 3760 | */ |
3528 | raid5_conf_t *conf = (raid5_conf_t *) mddev->private; | 3761 | raid5_conf_t *conf = (raid5_conf_t *) mddev->private; |
3529 | struct stripe_head *sh; | 3762 | struct stripe_head *sh; |
3530 | int pd_idx; | ||
3531 | sector_t first_sector, last_sector; | 3763 | sector_t first_sector, last_sector; |
3532 | int raid_disks = conf->previous_raid_disks; | 3764 | int raid_disks = conf->previous_raid_disks; |
3533 | int data_disks = raid_disks - conf->max_degraded; | 3765 | int data_disks = raid_disks - conf->max_degraded; |
3534 | int new_data_disks = conf->raid_disks - conf->max_degraded; | 3766 | int new_data_disks = conf->raid_disks - conf->max_degraded; |
3535 | int i; | 3767 | int i; |
3536 | int dd_idx; | 3768 | int dd_idx; |
3537 | sector_t writepos, safepos, gap; | 3769 | sector_t writepos, readpos, safepos; |
3538 | 3770 | sector_t stripe_addr; | |
3539 | if (sector_nr == 0 && | 3771 | int reshape_sectors; |
3540 | conf->expand_progress != 0) { | 3772 | struct list_head stripes; |
3541 | /* restarting in the middle, skip the initial sectors */ | 3773 | |
3542 | sector_nr = conf->expand_progress; | 3774 | if (sector_nr == 0) { |
3775 | /* If restarting in the middle, skip the initial sectors */ | ||
3776 | if (mddev->delta_disks < 0 && | ||
3777 | conf->reshape_progress < raid5_size(mddev, 0, 0)) { | ||
3778 | sector_nr = raid5_size(mddev, 0, 0) | ||
3779 | - conf->reshape_progress; | ||
3780 | } else if (mddev->delta_disks > 0 && | ||
3781 | conf->reshape_progress > 0) | ||
3782 | sector_nr = conf->reshape_progress; | ||
3543 | sector_div(sector_nr, new_data_disks); | 3783 | sector_div(sector_nr, new_data_disks); |
3544 | *skipped = 1; | 3784 | if (sector_nr) { |
3545 | return sector_nr; | 3785 | *skipped = 1; |
3786 | return sector_nr; | ||
3787 | } | ||
3546 | } | 3788 | } |
3547 | 3789 | ||
3790 | /* We need to process a full chunk at a time. | ||
3791 | * If old and new chunk sizes differ, we need to process the | ||
3792 | * largest of these | ||
3793 | */ | ||
3794 | if (mddev->new_chunk > mddev->chunk_size) | ||
3795 | reshape_sectors = mddev->new_chunk / 512; | ||
3796 | else | ||
3797 | reshape_sectors = mddev->chunk_size / 512; | ||
3798 | |||
3548 | /* we update the metadata when there is more than 3Meg | 3799 | /* we update the metadata when there is more than 3Meg |
3549 | * in the block range (that is rather arbitrary, should | 3800 | * in the block range (that is rather arbitrary, should |
3550 | * probably be time based) or when the data about to be | 3801 | * probably be time based) or when the data about to be |
3551 | * copied would over-write the source of the data at | 3802 | * copied would over-write the source of the data at |
3552 | * the front of the range. | 3803 | * the front of the range. |
3553 | * i.e. one new_stripe forward from expand_progress new_maps | 3804 | * i.e. one new_stripe along from reshape_progress new_maps |
3554 | * to after where expand_lo old_maps to | 3805 | * to after where reshape_safe old_maps to |
3555 | */ | 3806 | */ |
3556 | writepos = conf->expand_progress + | 3807 | writepos = conf->reshape_progress; |
3557 | conf->chunk_size/512*(new_data_disks); | ||
3558 | sector_div(writepos, new_data_disks); | 3808 | sector_div(writepos, new_data_disks); |
3559 | safepos = conf->expand_lo; | 3809 | readpos = conf->reshape_progress; |
3810 | sector_div(readpos, data_disks); | ||
3811 | safepos = conf->reshape_safe; | ||
3560 | sector_div(safepos, data_disks); | 3812 | sector_div(safepos, data_disks); |
3561 | gap = conf->expand_progress - conf->expand_lo; | 3813 | if (mddev->delta_disks < 0) { |
3814 | writepos -= reshape_sectors; | ||
3815 | readpos += reshape_sectors; | ||
3816 | safepos += reshape_sectors; | ||
3817 | } else { | ||
3818 | writepos += reshape_sectors; | ||
3819 | readpos -= reshape_sectors; | ||
3820 | safepos -= reshape_sectors; | ||
3821 | } | ||
3562 | 3822 | ||
3563 | if (writepos >= safepos || | 3823 | /* 'writepos' is the most advanced device address we might write. |
3564 | gap > (new_data_disks)*3000*2 /*3Meg*/) { | 3824 | * 'readpos' is the least advanced device address we might read. |
3825 | * 'safepos' is the least address recorded in the metadata as having | ||
3826 | * been reshaped. | ||
3827 | * If 'readpos' is behind 'writepos', then there is no way that we can | ||
3828 | * ensure safety in the face of a crash - that must be done by userspace | ||
3829 | * making a backup of the data. So in that case there is no particular | ||
3830 | * rush to update metadata. | ||
3831 | * Otherwise if 'safepos' is behind 'writepos', then we really need to | ||
3832 | * update the metadata to advance 'safepos' to match 'readpos' so that | ||
3833 | * we can be safe in the event of a crash. | ||
3834 | * So we insist on updating metadata if safepos is behind writepos and | ||
3835 | * readpos is beyond writepos. | ||
3836 | * In any case, update the metadata every 10 seconds. | ||
3837 | * Maybe that number should be configurable, but I'm not sure it is | ||
3838 | * worth it.... maybe it could be a multiple of safemode_delay??? | ||
3839 | */ | ||
3840 | if ((mddev->delta_disks < 0 | ||
3841 | ? (safepos > writepos && readpos < writepos) | ||
3842 | : (safepos < writepos && readpos > writepos)) || | ||
3843 | time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) { | ||
3565 | /* Cannot proceed until we've updated the superblock... */ | 3844 | /* Cannot proceed until we've updated the superblock... */ |
3566 | wait_event(conf->wait_for_overlap, | 3845 | wait_event(conf->wait_for_overlap, |
3567 | atomic_read(&conf->reshape_stripes)==0); | 3846 | atomic_read(&conf->reshape_stripes)==0); |
3568 | mddev->reshape_position = conf->expand_progress; | 3847 | mddev->reshape_position = conf->reshape_progress; |
3848 | conf->reshape_checkpoint = jiffies; | ||
3569 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | 3849 | set_bit(MD_CHANGE_DEVS, &mddev->flags); |
3570 | md_wakeup_thread(mddev->thread); | 3850 | md_wakeup_thread(mddev->thread); |
3571 | wait_event(mddev->sb_wait, mddev->flags == 0 || | 3851 | wait_event(mddev->sb_wait, mddev->flags == 0 || |
3572 | kthread_should_stop()); | 3852 | kthread_should_stop()); |
3573 | spin_lock_irq(&conf->device_lock); | 3853 | spin_lock_irq(&conf->device_lock); |
3574 | conf->expand_lo = mddev->reshape_position; | 3854 | conf->reshape_safe = mddev->reshape_position; |
3575 | spin_unlock_irq(&conf->device_lock); | 3855 | spin_unlock_irq(&conf->device_lock); |
3576 | wake_up(&conf->wait_for_overlap); | 3856 | wake_up(&conf->wait_for_overlap); |
3577 | } | 3857 | } |
3578 | 3858 | ||
3579 | for (i=0; i < conf->chunk_size/512; i+= STRIPE_SECTORS) { | 3859 | if (mddev->delta_disks < 0) { |
3860 | BUG_ON(conf->reshape_progress == 0); | ||
3861 | stripe_addr = writepos; | ||
3862 | BUG_ON((mddev->dev_sectors & | ||
3863 | ~((sector_t)reshape_sectors - 1)) | ||
3864 | - reshape_sectors - stripe_addr | ||
3865 | != sector_nr); | ||
3866 | } else { | ||
3867 | BUG_ON(writepos != sector_nr + reshape_sectors); | ||
3868 | stripe_addr = sector_nr; | ||
3869 | } | ||
3870 | INIT_LIST_HEAD(&stripes); | ||
3871 | for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) { | ||
3580 | int j; | 3872 | int j; |
3581 | int skipped = 0; | 3873 | int skipped = 0; |
3582 | pd_idx = stripe_to_pdidx(sector_nr+i, conf, conf->raid_disks); | 3874 | sh = get_active_stripe(conf, stripe_addr+i, 0, 0); |
3583 | sh = get_active_stripe(conf, sector_nr+i, | ||
3584 | conf->raid_disks, pd_idx, 0); | ||
3585 | set_bit(STRIPE_EXPANDING, &sh->state); | 3875 | set_bit(STRIPE_EXPANDING, &sh->state); |
3586 | atomic_inc(&conf->reshape_stripes); | 3876 | atomic_inc(&conf->reshape_stripes); |
3587 | /* If any of this stripe is beyond the end of the old | 3877 | /* If any of this stripe is beyond the end of the old |
@@ -3592,10 +3882,10 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped | |||
3592 | if (j == sh->pd_idx) | 3882 | if (j == sh->pd_idx) |
3593 | continue; | 3883 | continue; |
3594 | if (conf->level == 6 && | 3884 | if (conf->level == 6 && |
3595 | j == raid6_next_disk(sh->pd_idx, sh->disks)) | 3885 | j == sh->qd_idx) |
3596 | continue; | 3886 | continue; |
3597 | s = compute_blocknr(sh, j); | 3887 | s = compute_blocknr(sh, j, 0); |
3598 | if (s < mddev->array_sectors) { | 3888 | if (s < raid5_size(mddev, 0, 0)) { |
3599 | skipped = 1; | 3889 | skipped = 1; |
3600 | continue; | 3890 | continue; |
3601 | } | 3891 | } |
@@ -3607,10 +3897,13 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped | |||
3607 | set_bit(STRIPE_EXPAND_READY, &sh->state); | 3897 | set_bit(STRIPE_EXPAND_READY, &sh->state); |
3608 | set_bit(STRIPE_HANDLE, &sh->state); | 3898 | set_bit(STRIPE_HANDLE, &sh->state); |
3609 | } | 3899 | } |
3610 | release_stripe(sh); | 3900 | list_add(&sh->lru, &stripes); |
3611 | } | 3901 | } |
3612 | spin_lock_irq(&conf->device_lock); | 3902 | spin_lock_irq(&conf->device_lock); |
3613 | conf->expand_progress = (sector_nr + i) * new_data_disks; | 3903 | if (mddev->delta_disks < 0) |
3904 | conf->reshape_progress -= reshape_sectors * new_data_disks; | ||
3905 | else | ||
3906 | conf->reshape_progress += reshape_sectors * new_data_disks; | ||
3614 | spin_unlock_irq(&conf->device_lock); | 3907 | spin_unlock_irq(&conf->device_lock); |
3615 | /* Ok, those stripe are ready. We can start scheduling | 3908 | /* Ok, those stripe are ready. We can start scheduling |
3616 | * reads on the source stripes. | 3909 | * reads on the source stripes. |
@@ -3618,46 +3911,50 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped | |||
3618 | * block on the destination stripes. | 3911 | * block on the destination stripes. |
3619 | */ | 3912 | */ |
3620 | first_sector = | 3913 | first_sector = |
3621 | raid5_compute_sector(sector_nr*(new_data_disks), | 3914 | raid5_compute_sector(conf, stripe_addr*(new_data_disks), |
3622 | raid_disks, data_disks, | 3915 | 1, &dd_idx, NULL); |
3623 | &dd_idx, &pd_idx, conf); | ||
3624 | last_sector = | 3916 | last_sector = |
3625 | raid5_compute_sector((sector_nr+conf->chunk_size/512) | 3917 | raid5_compute_sector(conf, ((stripe_addr+conf->chunk_size/512) |
3626 | *(new_data_disks) -1, | 3918 | *(new_data_disks) - 1), |
3627 | raid_disks, data_disks, | 3919 | 1, &dd_idx, NULL); |
3628 | &dd_idx, &pd_idx, conf); | 3920 | if (last_sector >= mddev->dev_sectors) |
3629 | if (last_sector >= (mddev->size<<1)) | 3921 | last_sector = mddev->dev_sectors - 1; |
3630 | last_sector = (mddev->size<<1)-1; | ||
3631 | while (first_sector <= last_sector) { | 3922 | while (first_sector <= last_sector) { |
3632 | pd_idx = stripe_to_pdidx(first_sector, conf, | 3923 | sh = get_active_stripe(conf, first_sector, 1, 0); |
3633 | conf->previous_raid_disks); | ||
3634 | sh = get_active_stripe(conf, first_sector, | ||
3635 | conf->previous_raid_disks, pd_idx, 0); | ||
3636 | set_bit(STRIPE_EXPAND_SOURCE, &sh->state); | 3924 | set_bit(STRIPE_EXPAND_SOURCE, &sh->state); |
3637 | set_bit(STRIPE_HANDLE, &sh->state); | 3925 | set_bit(STRIPE_HANDLE, &sh->state); |
3638 | release_stripe(sh); | 3926 | release_stripe(sh); |
3639 | first_sector += STRIPE_SECTORS; | 3927 | first_sector += STRIPE_SECTORS; |
3640 | } | 3928 | } |
3929 | /* Now that the sources are clearly marked, we can release | ||
3930 | * the destination stripes | ||
3931 | */ | ||
3932 | while (!list_empty(&stripes)) { | ||
3933 | sh = list_entry(stripes.next, struct stripe_head, lru); | ||
3934 | list_del_init(&sh->lru); | ||
3935 | release_stripe(sh); | ||
3936 | } | ||
3641 | /* If this takes us to the resync_max point where we have to pause, | 3937 | /* If this takes us to the resync_max point where we have to pause, |
3642 | * then we need to write out the superblock. | 3938 | * then we need to write out the superblock. |
3643 | */ | 3939 | */ |
3644 | sector_nr += conf->chunk_size>>9; | 3940 | sector_nr += reshape_sectors; |
3645 | if (sector_nr >= mddev->resync_max) { | 3941 | if (sector_nr >= mddev->resync_max) { |
3646 | /* Cannot proceed until we've updated the superblock... */ | 3942 | /* Cannot proceed until we've updated the superblock... */ |
3647 | wait_event(conf->wait_for_overlap, | 3943 | wait_event(conf->wait_for_overlap, |
3648 | atomic_read(&conf->reshape_stripes) == 0); | 3944 | atomic_read(&conf->reshape_stripes) == 0); |
3649 | mddev->reshape_position = conf->expand_progress; | 3945 | mddev->reshape_position = conf->reshape_progress; |
3946 | conf->reshape_checkpoint = jiffies; | ||
3650 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | 3947 | set_bit(MD_CHANGE_DEVS, &mddev->flags); |
3651 | md_wakeup_thread(mddev->thread); | 3948 | md_wakeup_thread(mddev->thread); |
3652 | wait_event(mddev->sb_wait, | 3949 | wait_event(mddev->sb_wait, |
3653 | !test_bit(MD_CHANGE_DEVS, &mddev->flags) | 3950 | !test_bit(MD_CHANGE_DEVS, &mddev->flags) |
3654 | || kthread_should_stop()); | 3951 | || kthread_should_stop()); |
3655 | spin_lock_irq(&conf->device_lock); | 3952 | spin_lock_irq(&conf->device_lock); |
3656 | conf->expand_lo = mddev->reshape_position; | 3953 | conf->reshape_safe = mddev->reshape_position; |
3657 | spin_unlock_irq(&conf->device_lock); | 3954 | spin_unlock_irq(&conf->device_lock); |
3658 | wake_up(&conf->wait_for_overlap); | 3955 | wake_up(&conf->wait_for_overlap); |
3659 | } | 3956 | } |
3660 | return conf->chunk_size>>9; | 3957 | return reshape_sectors; |
3661 | } | 3958 | } |
3662 | 3959 | ||
3663 | /* FIXME go_faster isn't used */ | 3960 | /* FIXME go_faster isn't used */ |
@@ -3665,9 +3962,7 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski | |||
3665 | { | 3962 | { |
3666 | raid5_conf_t *conf = (raid5_conf_t *) mddev->private; | 3963 | raid5_conf_t *conf = (raid5_conf_t *) mddev->private; |
3667 | struct stripe_head *sh; | 3964 | struct stripe_head *sh; |
3668 | int pd_idx; | 3965 | sector_t max_sector = mddev->dev_sectors; |
3669 | int raid_disks = conf->raid_disks; | ||
3670 | sector_t max_sector = mddev->size << 1; | ||
3671 | int sync_blocks; | 3966 | int sync_blocks; |
3672 | int still_degraded = 0; | 3967 | int still_degraded = 0; |
3673 | int i; | 3968 | int i; |
@@ -3675,6 +3970,7 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski | |||
3675 | if (sector_nr >= max_sector) { | 3970 | if (sector_nr >= max_sector) { |
3676 | /* just being told to finish up .. nothing much to do */ | 3971 | /* just being told to finish up .. nothing much to do */ |
3677 | unplug_slaves(mddev); | 3972 | unplug_slaves(mddev); |
3973 | |||
3678 | if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) { | 3974 | if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) { |
3679 | end_reshape(conf); | 3975 | end_reshape(conf); |
3680 | return 0; | 3976 | return 0; |
@@ -3705,7 +4001,7 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski | |||
3705 | */ | 4001 | */ |
3706 | if (mddev->degraded >= conf->max_degraded && | 4002 | if (mddev->degraded >= conf->max_degraded && |
3707 | test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { | 4003 | test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { |
3708 | sector_t rv = (mddev->size << 1) - sector_nr; | 4004 | sector_t rv = mddev->dev_sectors - sector_nr; |
3709 | *skipped = 1; | 4005 | *skipped = 1; |
3710 | return rv; | 4006 | return rv; |
3711 | } | 4007 | } |
@@ -3721,10 +4017,9 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski | |||
3721 | 4017 | ||
3722 | bitmap_cond_end_sync(mddev->bitmap, sector_nr); | 4018 | bitmap_cond_end_sync(mddev->bitmap, sector_nr); |
3723 | 4019 | ||
3724 | pd_idx = stripe_to_pdidx(sector_nr, conf, raid_disks); | 4020 | sh = get_active_stripe(conf, sector_nr, 0, 1); |
3725 | sh = get_active_stripe(conf, sector_nr, raid_disks, pd_idx, 1); | ||
3726 | if (sh == NULL) { | 4021 | if (sh == NULL) { |
3727 | sh = get_active_stripe(conf, sector_nr, raid_disks, pd_idx, 0); | 4022 | sh = get_active_stripe(conf, sector_nr, 0, 0); |
3728 | /* make sure we don't swamp the stripe cache if someone else | 4023 | /* make sure we don't swamp the stripe cache if someone else |
3729 | * is trying to get access | 4024 | * is trying to get access |
3730 | */ | 4025 | */ |
@@ -3766,19 +4061,15 @@ static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio) | |||
3766 | * it will be only one 'dd_idx' and only need one call to raid5_compute_sector. | 4061 | * it will be only one 'dd_idx' and only need one call to raid5_compute_sector. |
3767 | */ | 4062 | */ |
3768 | struct stripe_head *sh; | 4063 | struct stripe_head *sh; |
3769 | int dd_idx, pd_idx; | 4064 | int dd_idx; |
3770 | sector_t sector, logical_sector, last_sector; | 4065 | sector_t sector, logical_sector, last_sector; |
3771 | int scnt = 0; | 4066 | int scnt = 0; |
3772 | int remaining; | 4067 | int remaining; |
3773 | int handled = 0; | 4068 | int handled = 0; |
3774 | 4069 | ||
3775 | logical_sector = raid_bio->bi_sector & ~((sector_t)STRIPE_SECTORS-1); | 4070 | logical_sector = raid_bio->bi_sector & ~((sector_t)STRIPE_SECTORS-1); |
3776 | sector = raid5_compute_sector( logical_sector, | 4071 | sector = raid5_compute_sector(conf, logical_sector, |
3777 | conf->raid_disks, | 4072 | 0, &dd_idx, NULL); |
3778 | conf->raid_disks - conf->max_degraded, | ||
3779 | &dd_idx, | ||
3780 | &pd_idx, | ||
3781 | conf); | ||
3782 | last_sector = raid_bio->bi_sector + (raid_bio->bi_size>>9); | 4073 | last_sector = raid_bio->bi_sector + (raid_bio->bi_size>>9); |
3783 | 4074 | ||
3784 | for (; logical_sector < last_sector; | 4075 | for (; logical_sector < last_sector; |
@@ -3790,7 +4081,7 @@ static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio) | |||
3790 | /* already done this stripe */ | 4081 | /* already done this stripe */ |
3791 | continue; | 4082 | continue; |
3792 | 4083 | ||
3793 | sh = get_active_stripe(conf, sector, conf->raid_disks, pd_idx, 1); | 4084 | sh = get_active_stripe(conf, sector, 0, 1); |
3794 | 4085 | ||
3795 | if (!sh) { | 4086 | if (!sh) { |
3796 | /* failed to get a stripe - must wait */ | 4087 | /* failed to get a stripe - must wait */ |
@@ -3992,89 +4283,69 @@ static struct attribute_group raid5_attrs_group = { | |||
3992 | .attrs = raid5_attrs, | 4283 | .attrs = raid5_attrs, |
3993 | }; | 4284 | }; |
3994 | 4285 | ||
3995 | static int run(mddev_t *mddev) | 4286 | static sector_t |
4287 | raid5_size(mddev_t *mddev, sector_t sectors, int raid_disks) | ||
4288 | { | ||
4289 | raid5_conf_t *conf = mddev_to_conf(mddev); | ||
4290 | |||
4291 | if (!sectors) | ||
4292 | sectors = mddev->dev_sectors; | ||
4293 | if (!raid_disks) { | ||
4294 | /* size is defined by the smallest of previous and new size */ | ||
4295 | if (conf->raid_disks < conf->previous_raid_disks) | ||
4296 | raid_disks = conf->raid_disks; | ||
4297 | else | ||
4298 | raid_disks = conf->previous_raid_disks; | ||
4299 | } | ||
4300 | |||
4301 | sectors &= ~((sector_t)mddev->chunk_size/512 - 1); | ||
4302 | sectors &= ~((sector_t)mddev->new_chunk/512 - 1); | ||
4303 | return sectors * (raid_disks - conf->max_degraded); | ||
4304 | } | ||
4305 | |||
4306 | static raid5_conf_t *setup_conf(mddev_t *mddev) | ||
3996 | { | 4307 | { |
3997 | raid5_conf_t *conf; | 4308 | raid5_conf_t *conf; |
3998 | int raid_disk, memory; | 4309 | int raid_disk, memory; |
3999 | mdk_rdev_t *rdev; | 4310 | mdk_rdev_t *rdev; |
4000 | struct disk_info *disk; | 4311 | struct disk_info *disk; |
4001 | int working_disks = 0; | ||
4002 | 4312 | ||
4003 | if (mddev->level != 5 && mddev->level != 4 && mddev->level != 6) { | 4313 | if (mddev->new_level != 5 |
4314 | && mddev->new_level != 4 | ||
4315 | && mddev->new_level != 6) { | ||
4004 | printk(KERN_ERR "raid5: %s: raid level not set to 4/5/6 (%d)\n", | 4316 | printk(KERN_ERR "raid5: %s: raid level not set to 4/5/6 (%d)\n", |
4005 | mdname(mddev), mddev->level); | 4317 | mdname(mddev), mddev->new_level); |
4006 | return -EIO; | 4318 | return ERR_PTR(-EIO); |
4007 | } | 4319 | } |
4008 | 4320 | if ((mddev->new_level == 5 | |
4009 | if (mddev->chunk_size < PAGE_SIZE) { | 4321 | && !algorithm_valid_raid5(mddev->new_layout)) || |
4010 | printk(KERN_ERR "md/raid5: chunk_size must be at least " | 4322 | (mddev->new_level == 6 |
4011 | "PAGE_SIZE but %d < %ld\n", | 4323 | && !algorithm_valid_raid6(mddev->new_layout))) { |
4012 | mddev->chunk_size, PAGE_SIZE); | 4324 | printk(KERN_ERR "raid5: %s: layout %d not supported\n", |
4013 | return -EINVAL; | 4325 | mdname(mddev), mddev->new_layout); |
4326 | return ERR_PTR(-EIO); | ||
4014 | } | 4327 | } |
4015 | 4328 | if (mddev->new_level == 6 && mddev->raid_disks < 4) { | |
4016 | if (mddev->reshape_position != MaxSector) { | 4329 | printk(KERN_ERR "raid6: not enough configured devices for %s (%d, minimum 4)\n", |
4017 | /* Check that we can continue the reshape. | 4330 | mdname(mddev), mddev->raid_disks); |
4018 | * Currently only disks can change, it must | 4331 | return ERR_PTR(-EINVAL); |
4019 | * increase, and we must be past the point where | ||
4020 | * a stripe over-writes itself | ||
4021 | */ | ||
4022 | sector_t here_new, here_old; | ||
4023 | int old_disks; | ||
4024 | int max_degraded = (mddev->level == 5 ? 1 : 2); | ||
4025 | |||
4026 | if (mddev->new_level != mddev->level || | ||
4027 | mddev->new_layout != mddev->layout || | ||
4028 | mddev->new_chunk != mddev->chunk_size) { | ||
4029 | printk(KERN_ERR "raid5: %s: unsupported reshape " | ||
4030 | "required - aborting.\n", | ||
4031 | mdname(mddev)); | ||
4032 | return -EINVAL; | ||
4033 | } | ||
4034 | if (mddev->delta_disks <= 0) { | ||
4035 | printk(KERN_ERR "raid5: %s: unsupported reshape " | ||
4036 | "(reduce disks) required - aborting.\n", | ||
4037 | mdname(mddev)); | ||
4038 | return -EINVAL; | ||
4039 | } | ||
4040 | old_disks = mddev->raid_disks - mddev->delta_disks; | ||
4041 | /* reshape_position must be on a new-stripe boundary, and one | ||
4042 | * further up in new geometry must map after here in old | ||
4043 | * geometry. | ||
4044 | */ | ||
4045 | here_new = mddev->reshape_position; | ||
4046 | if (sector_div(here_new, (mddev->chunk_size>>9)* | ||
4047 | (mddev->raid_disks - max_degraded))) { | ||
4048 | printk(KERN_ERR "raid5: reshape_position not " | ||
4049 | "on a stripe boundary\n"); | ||
4050 | return -EINVAL; | ||
4051 | } | ||
4052 | /* here_new is the stripe we will write to */ | ||
4053 | here_old = mddev->reshape_position; | ||
4054 | sector_div(here_old, (mddev->chunk_size>>9)* | ||
4055 | (old_disks-max_degraded)); | ||
4056 | /* here_old is the first stripe that we might need to read | ||
4057 | * from */ | ||
4058 | if (here_new >= here_old) { | ||
4059 | /* Reading from the same stripe as writing to - bad */ | ||
4060 | printk(KERN_ERR "raid5: reshape_position too early for " | ||
4061 | "auto-recovery - aborting.\n"); | ||
4062 | return -EINVAL; | ||
4063 | } | ||
4064 | printk(KERN_INFO "raid5: reshape will continue\n"); | ||
4065 | /* OK, we should be able to continue; */ | ||
4066 | } | 4332 | } |
4067 | 4333 | ||
4334 | if (!mddev->new_chunk || mddev->new_chunk % PAGE_SIZE) { | ||
4335 | printk(KERN_ERR "raid5: invalid chunk size %d for %s\n", | ||
4336 | mddev->new_chunk, mdname(mddev)); | ||
4337 | return ERR_PTR(-EINVAL); | ||
4338 | } | ||
4068 | 4339 | ||
4069 | mddev->private = kzalloc(sizeof (raid5_conf_t), GFP_KERNEL); | 4340 | conf = kzalloc(sizeof(raid5_conf_t), GFP_KERNEL); |
4070 | if ((conf = mddev->private) == NULL) | 4341 | if (conf == NULL) |
4071 | goto abort; | 4342 | goto abort; |
4072 | if (mddev->reshape_position == MaxSector) { | 4343 | |
4073 | conf->previous_raid_disks = conf->raid_disks = mddev->raid_disks; | 4344 | conf->raid_disks = mddev->raid_disks; |
4074 | } else { | 4345 | if (mddev->reshape_position == MaxSector) |
4075 | conf->raid_disks = mddev->raid_disks; | 4346 | conf->previous_raid_disks = mddev->raid_disks; |
4347 | else | ||
4076 | conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks; | 4348 | conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks; |
4077 | } | ||
4078 | 4349 | ||
4079 | conf->disks = kzalloc(conf->raid_disks * sizeof(struct disk_info), | 4350 | conf->disks = kzalloc(conf->raid_disks * sizeof(struct disk_info), |
4080 | GFP_KERNEL); | 4351 | GFP_KERNEL); |
@@ -4086,13 +4357,12 @@ static int run(mddev_t *mddev) | |||
4086 | if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL) | 4357 | if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL) |
4087 | goto abort; | 4358 | goto abort; |
4088 | 4359 | ||
4089 | if (mddev->level == 6) { | 4360 | if (mddev->new_level == 6) { |
4090 | conf->spare_page = alloc_page(GFP_KERNEL); | 4361 | conf->spare_page = alloc_page(GFP_KERNEL); |
4091 | if (!conf->spare_page) | 4362 | if (!conf->spare_page) |
4092 | goto abort; | 4363 | goto abort; |
4093 | } | 4364 | } |
4094 | spin_lock_init(&conf->device_lock); | 4365 | spin_lock_init(&conf->device_lock); |
4095 | mddev->queue->queue_lock = &conf->device_lock; | ||
4096 | init_waitqueue_head(&conf->wait_for_stripe); | 4366 | init_waitqueue_head(&conf->wait_for_stripe); |
4097 | init_waitqueue_head(&conf->wait_for_overlap); | 4367 | init_waitqueue_head(&conf->wait_for_overlap); |
4098 | INIT_LIST_HEAD(&conf->handle_list); | 4368 | INIT_LIST_HEAD(&conf->handle_list); |
@@ -4121,47 +4391,134 @@ static int run(mddev_t *mddev) | |||
4121 | printk(KERN_INFO "raid5: device %s operational as raid" | 4391 | printk(KERN_INFO "raid5: device %s operational as raid" |
4122 | " disk %d\n", bdevname(rdev->bdev,b), | 4392 | " disk %d\n", bdevname(rdev->bdev,b), |
4123 | raid_disk); | 4393 | raid_disk); |
4124 | working_disks++; | ||
4125 | } else | 4394 | } else |
4126 | /* Cannot rely on bitmap to complete recovery */ | 4395 | /* Cannot rely on bitmap to complete recovery */ |
4127 | conf->fullsync = 1; | 4396 | conf->fullsync = 1; |
4128 | } | 4397 | } |
4129 | 4398 | ||
4130 | /* | 4399 | conf->chunk_size = mddev->new_chunk; |
4131 | * 0 for a fully functional array, 1 or 2 for a degraded array. | 4400 | conf->level = mddev->new_level; |
4132 | */ | ||
4133 | mddev->degraded = conf->raid_disks - working_disks; | ||
4134 | conf->mddev = mddev; | ||
4135 | conf->chunk_size = mddev->chunk_size; | ||
4136 | conf->level = mddev->level; | ||
4137 | if (conf->level == 6) | 4401 | if (conf->level == 6) |
4138 | conf->max_degraded = 2; | 4402 | conf->max_degraded = 2; |
4139 | else | 4403 | else |
4140 | conf->max_degraded = 1; | 4404 | conf->max_degraded = 1; |
4141 | conf->algorithm = mddev->layout; | 4405 | conf->algorithm = mddev->new_layout; |
4142 | conf->max_nr_stripes = NR_STRIPES; | 4406 | conf->max_nr_stripes = NR_STRIPES; |
4143 | conf->expand_progress = mddev->reshape_position; | 4407 | conf->reshape_progress = mddev->reshape_position; |
4144 | 4408 | if (conf->reshape_progress != MaxSector) { | |
4145 | /* device size must be a multiple of chunk size */ | 4409 | conf->prev_chunk = mddev->chunk_size; |
4146 | mddev->size &= ~(mddev->chunk_size/1024 -1); | 4410 | conf->prev_algo = mddev->layout; |
4147 | mddev->resync_max_sectors = mddev->size << 1; | 4411 | } |
4148 | 4412 | ||
4149 | if (conf->level == 6 && conf->raid_disks < 4) { | 4413 | memory = conf->max_nr_stripes * (sizeof(struct stripe_head) + |
4150 | printk(KERN_ERR "raid6: not enough configured devices for %s (%d, minimum 4)\n", | 4414 | conf->raid_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; |
4151 | mdname(mddev), conf->raid_disks); | 4415 | if (grow_stripes(conf, conf->max_nr_stripes)) { |
4416 | printk(KERN_ERR | ||
4417 | "raid5: couldn't allocate %dkB for buffers\n", memory); | ||
4152 | goto abort; | 4418 | goto abort; |
4153 | } | 4419 | } else |
4154 | if (!conf->chunk_size || conf->chunk_size % 4) { | 4420 | printk(KERN_INFO "raid5: allocated %dkB for %s\n", |
4155 | printk(KERN_ERR "raid5: invalid chunk size %d for %s\n", | 4421 | memory, mdname(mddev)); |
4156 | conf->chunk_size, mdname(mddev)); | 4422 | |
4423 | conf->thread = md_register_thread(raid5d, mddev, "%s_raid5"); | ||
4424 | if (!conf->thread) { | ||
4425 | printk(KERN_ERR | ||
4426 | "raid5: couldn't allocate thread for %s\n", | ||
4427 | mdname(mddev)); | ||
4157 | goto abort; | 4428 | goto abort; |
4158 | } | 4429 | } |
4159 | if (conf->algorithm > ALGORITHM_RIGHT_SYMMETRIC) { | 4430 | |
4160 | printk(KERN_ERR | 4431 | return conf; |
4161 | "raid5: unsupported parity algorithm %d for %s\n", | 4432 | |
4162 | conf->algorithm, mdname(mddev)); | 4433 | abort: |
4163 | goto abort; | 4434 | if (conf) { |
4435 | shrink_stripes(conf); | ||
4436 | safe_put_page(conf->spare_page); | ||
4437 | kfree(conf->disks); | ||
4438 | kfree(conf->stripe_hashtbl); | ||
4439 | kfree(conf); | ||
4440 | return ERR_PTR(-EIO); | ||
4441 | } else | ||
4442 | return ERR_PTR(-ENOMEM); | ||
4443 | } | ||
4444 | |||
4445 | static int run(mddev_t *mddev) | ||
4446 | { | ||
4447 | raid5_conf_t *conf; | ||
4448 | int working_disks = 0; | ||
4449 | mdk_rdev_t *rdev; | ||
4450 | |||
4451 | if (mddev->reshape_position != MaxSector) { | ||
4452 | /* Check that we can continue the reshape. | ||
4453 | * Currently only disks can change, it must | ||
4454 | * increase, and we must be past the point where | ||
4455 | * a stripe over-writes itself | ||
4456 | */ | ||
4457 | sector_t here_new, here_old; | ||
4458 | int old_disks; | ||
4459 | int max_degraded = (mddev->level == 6 ? 2 : 1); | ||
4460 | |||
4461 | if (mddev->new_level != mddev->level) { | ||
4462 | printk(KERN_ERR "raid5: %s: unsupported reshape " | ||
4463 | "required - aborting.\n", | ||
4464 | mdname(mddev)); | ||
4465 | return -EINVAL; | ||
4466 | } | ||
4467 | old_disks = mddev->raid_disks - mddev->delta_disks; | ||
4468 | /* reshape_position must be on a new-stripe boundary, and one | ||
4469 | * further up in new geometry must map after here in old | ||
4470 | * geometry. | ||
4471 | */ | ||
4472 | here_new = mddev->reshape_position; | ||
4473 | if (sector_div(here_new, (mddev->new_chunk>>9)* | ||
4474 | (mddev->raid_disks - max_degraded))) { | ||
4475 | printk(KERN_ERR "raid5: reshape_position not " | ||
4476 | "on a stripe boundary\n"); | ||
4477 | return -EINVAL; | ||
4478 | } | ||
4479 | /* here_new is the stripe we will write to */ | ||
4480 | here_old = mddev->reshape_position; | ||
4481 | sector_div(here_old, (mddev->chunk_size>>9)* | ||
4482 | (old_disks-max_degraded)); | ||
4483 | /* here_old is the first stripe that we might need to read | ||
4484 | * from */ | ||
4485 | if (here_new >= here_old) { | ||
4486 | /* Reading from the same stripe as writing to - bad */ | ||
4487 | printk(KERN_ERR "raid5: reshape_position too early for " | ||
4488 | "auto-recovery - aborting.\n"); | ||
4489 | return -EINVAL; | ||
4490 | } | ||
4491 | printk(KERN_INFO "raid5: reshape will continue\n"); | ||
4492 | /* OK, we should be able to continue; */ | ||
4493 | } else { | ||
4494 | BUG_ON(mddev->level != mddev->new_level); | ||
4495 | BUG_ON(mddev->layout != mddev->new_layout); | ||
4496 | BUG_ON(mddev->chunk_size != mddev->new_chunk); | ||
4497 | BUG_ON(mddev->delta_disks != 0); | ||
4164 | } | 4498 | } |
4499 | |||
4500 | if (mddev->private == NULL) | ||
4501 | conf = setup_conf(mddev); | ||
4502 | else | ||
4503 | conf = mddev->private; | ||
4504 | |||
4505 | if (IS_ERR(conf)) | ||
4506 | return PTR_ERR(conf); | ||
4507 | |||
4508 | mddev->thread = conf->thread; | ||
4509 | conf->thread = NULL; | ||
4510 | mddev->private = conf; | ||
4511 | |||
4512 | /* | ||
4513 | * 0 for a fully functional array, 1 or 2 for a degraded array. | ||
4514 | */ | ||
4515 | list_for_each_entry(rdev, &mddev->disks, same_set) | ||
4516 | if (rdev->raid_disk >= 0 && | ||
4517 | test_bit(In_sync, &rdev->flags)) | ||
4518 | working_disks++; | ||
4519 | |||
4520 | mddev->degraded = conf->raid_disks - working_disks; | ||
4521 | |||
4165 | if (mddev->degraded > conf->max_degraded) { | 4522 | if (mddev->degraded > conf->max_degraded) { |
4166 | printk(KERN_ERR "raid5: not enough operational devices for %s" | 4523 | printk(KERN_ERR "raid5: not enough operational devices for %s" |
4167 | " (%d/%d failed)\n", | 4524 | " (%d/%d failed)\n", |
@@ -4169,6 +4526,10 @@ static int run(mddev_t *mddev) | |||
4169 | goto abort; | 4526 | goto abort; |
4170 | } | 4527 | } |
4171 | 4528 | ||
4529 | /* device size must be a multiple of chunk size */ | ||
4530 | mddev->dev_sectors &= ~(mddev->chunk_size / 512 - 1); | ||
4531 | mddev->resync_max_sectors = mddev->dev_sectors; | ||
4532 | |||
4172 | if (mddev->degraded > 0 && | 4533 | if (mddev->degraded > 0 && |
4173 | mddev->recovery_cp != MaxSector) { | 4534 | mddev->recovery_cp != MaxSector) { |
4174 | if (mddev->ok_start_degraded) | 4535 | if (mddev->ok_start_degraded) |
@@ -4184,43 +4545,22 @@ static int run(mddev_t *mddev) | |||
4184 | } | 4545 | } |
4185 | } | 4546 | } |
4186 | 4547 | ||
4187 | { | ||
4188 | mddev->thread = md_register_thread(raid5d, mddev, "%s_raid5"); | ||
4189 | if (!mddev->thread) { | ||
4190 | printk(KERN_ERR | ||
4191 | "raid5: couldn't allocate thread for %s\n", | ||
4192 | mdname(mddev)); | ||
4193 | goto abort; | ||
4194 | } | ||
4195 | } | ||
4196 | memory = conf->max_nr_stripes * (sizeof(struct stripe_head) + | ||
4197 | conf->raid_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; | ||
4198 | if (grow_stripes(conf, conf->max_nr_stripes)) { | ||
4199 | printk(KERN_ERR | ||
4200 | "raid5: couldn't allocate %dkB for buffers\n", memory); | ||
4201 | shrink_stripes(conf); | ||
4202 | md_unregister_thread(mddev->thread); | ||
4203 | goto abort; | ||
4204 | } else | ||
4205 | printk(KERN_INFO "raid5: allocated %dkB for %s\n", | ||
4206 | memory, mdname(mddev)); | ||
4207 | |||
4208 | if (mddev->degraded == 0) | 4548 | if (mddev->degraded == 0) |
4209 | printk("raid5: raid level %d set %s active with %d out of %d" | 4549 | printk("raid5: raid level %d set %s active with %d out of %d" |
4210 | " devices, algorithm %d\n", conf->level, mdname(mddev), | 4550 | " devices, algorithm %d\n", conf->level, mdname(mddev), |
4211 | mddev->raid_disks-mddev->degraded, mddev->raid_disks, | 4551 | mddev->raid_disks-mddev->degraded, mddev->raid_disks, |
4212 | conf->algorithm); | 4552 | mddev->new_layout); |
4213 | else | 4553 | else |
4214 | printk(KERN_ALERT "raid5: raid level %d set %s active with %d" | 4554 | printk(KERN_ALERT "raid5: raid level %d set %s active with %d" |
4215 | " out of %d devices, algorithm %d\n", conf->level, | 4555 | " out of %d devices, algorithm %d\n", conf->level, |
4216 | mdname(mddev), mddev->raid_disks - mddev->degraded, | 4556 | mdname(mddev), mddev->raid_disks - mddev->degraded, |
4217 | mddev->raid_disks, conf->algorithm); | 4557 | mddev->raid_disks, mddev->new_layout); |
4218 | 4558 | ||
4219 | print_raid5_conf(conf); | 4559 | print_raid5_conf(conf); |
4220 | 4560 | ||
4221 | if (conf->expand_progress != MaxSector) { | 4561 | if (conf->reshape_progress != MaxSector) { |
4222 | printk("...ok start reshape thread\n"); | 4562 | printk("...ok start reshape thread\n"); |
4223 | conf->expand_lo = conf->expand_progress; | 4563 | conf->reshape_safe = conf->reshape_progress; |
4224 | atomic_set(&conf->reshape_stripes, 0); | 4564 | atomic_set(&conf->reshape_stripes, 0); |
4225 | clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); | 4565 | clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); |
4226 | clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); | 4566 | clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); |
@@ -4247,18 +4587,22 @@ static int run(mddev_t *mddev) | |||
4247 | "raid5: failed to create sysfs attributes for %s\n", | 4587 | "raid5: failed to create sysfs attributes for %s\n", |
4248 | mdname(mddev)); | 4588 | mdname(mddev)); |
4249 | 4589 | ||
4590 | mddev->queue->queue_lock = &conf->device_lock; | ||
4591 | |||
4250 | mddev->queue->unplug_fn = raid5_unplug_device; | 4592 | mddev->queue->unplug_fn = raid5_unplug_device; |
4251 | mddev->queue->backing_dev_info.congested_data = mddev; | 4593 | mddev->queue->backing_dev_info.congested_data = mddev; |
4252 | mddev->queue->backing_dev_info.congested_fn = raid5_congested; | 4594 | mddev->queue->backing_dev_info.congested_fn = raid5_congested; |
4253 | 4595 | ||
4254 | mddev->array_sectors = 2 * mddev->size * (conf->previous_raid_disks - | 4596 | md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); |
4255 | conf->max_degraded); | ||
4256 | 4597 | ||
4257 | blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec); | 4598 | blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec); |
4258 | 4599 | ||
4259 | return 0; | 4600 | return 0; |
4260 | abort: | 4601 | abort: |
4602 | md_unregister_thread(mddev->thread); | ||
4603 | mddev->thread = NULL; | ||
4261 | if (conf) { | 4604 | if (conf) { |
4605 | shrink_stripes(conf); | ||
4262 | print_raid5_conf(conf); | 4606 | print_raid5_conf(conf); |
4263 | safe_put_page(conf->spare_page); | 4607 | safe_put_page(conf->spare_page); |
4264 | kfree(conf->disks); | 4608 | kfree(conf->disks); |
@@ -4396,6 +4740,10 @@ static int raid5_remove_disk(mddev_t *mddev, int number) | |||
4396 | print_raid5_conf(conf); | 4740 | print_raid5_conf(conf); |
4397 | rdev = p->rdev; | 4741 | rdev = p->rdev; |
4398 | if (rdev) { | 4742 | if (rdev) { |
4743 | if (number >= conf->raid_disks && | ||
4744 | conf->reshape_progress == MaxSector) | ||
4745 | clear_bit(In_sync, &rdev->flags); | ||
4746 | |||
4399 | if (test_bit(In_sync, &rdev->flags) || | 4747 | if (test_bit(In_sync, &rdev->flags) || |
4400 | atomic_read(&rdev->nr_pending)) { | 4748 | atomic_read(&rdev->nr_pending)) { |
4401 | err = -EBUSY; | 4749 | err = -EBUSY; |
@@ -4405,7 +4753,8 @@ static int raid5_remove_disk(mddev_t *mddev, int number) | |||
4405 | * isn't possible. | 4753 | * isn't possible. |
4406 | */ | 4754 | */ |
4407 | if (!test_bit(Faulty, &rdev->flags) && | 4755 | if (!test_bit(Faulty, &rdev->flags) && |
4408 | mddev->degraded <= conf->max_degraded) { | 4756 | mddev->degraded <= conf->max_degraded && |
4757 | number < conf->raid_disks) { | ||
4409 | err = -EBUSY; | 4758 | err = -EBUSY; |
4410 | goto abort; | 4759 | goto abort; |
4411 | } | 4760 | } |
@@ -4472,36 +4821,48 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors) | |||
4472 | * any io in the removed space completes, but it hardly seems | 4821 | * any io in the removed space completes, but it hardly seems |
4473 | * worth it. | 4822 | * worth it. |
4474 | */ | 4823 | */ |
4475 | raid5_conf_t *conf = mddev_to_conf(mddev); | ||
4476 | |||
4477 | sectors &= ~((sector_t)mddev->chunk_size/512 - 1); | 4824 | sectors &= ~((sector_t)mddev->chunk_size/512 - 1); |
4478 | mddev->array_sectors = sectors * (mddev->raid_disks | 4825 | md_set_array_sectors(mddev, raid5_size(mddev, sectors, |
4479 | - conf->max_degraded); | 4826 | mddev->raid_disks)); |
4827 | if (mddev->array_sectors > | ||
4828 | raid5_size(mddev, sectors, mddev->raid_disks)) | ||
4829 | return -EINVAL; | ||
4480 | set_capacity(mddev->gendisk, mddev->array_sectors); | 4830 | set_capacity(mddev->gendisk, mddev->array_sectors); |
4481 | mddev->changed = 1; | 4831 | mddev->changed = 1; |
4482 | if (sectors/2 > mddev->size && mddev->recovery_cp == MaxSector) { | 4832 | if (sectors > mddev->dev_sectors && mddev->recovery_cp == MaxSector) { |
4483 | mddev->recovery_cp = mddev->size << 1; | 4833 | mddev->recovery_cp = mddev->dev_sectors; |
4484 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | 4834 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); |
4485 | } | 4835 | } |
4486 | mddev->size = sectors /2; | 4836 | mddev->dev_sectors = sectors; |
4487 | mddev->resync_max_sectors = sectors; | 4837 | mddev->resync_max_sectors = sectors; |
4488 | return 0; | 4838 | return 0; |
4489 | } | 4839 | } |
4490 | 4840 | ||
4491 | #ifdef CONFIG_MD_RAID5_RESHAPE | ||
4492 | static int raid5_check_reshape(mddev_t *mddev) | 4841 | static int raid5_check_reshape(mddev_t *mddev) |
4493 | { | 4842 | { |
4494 | raid5_conf_t *conf = mddev_to_conf(mddev); | 4843 | raid5_conf_t *conf = mddev_to_conf(mddev); |
4495 | int err; | ||
4496 | 4844 | ||
4497 | if (mddev->delta_disks < 0 || | 4845 | if (mddev->delta_disks == 0 && |
4498 | mddev->new_level != mddev->level) | 4846 | mddev->new_layout == mddev->layout && |
4499 | return -EINVAL; /* Cannot shrink array or change level yet */ | 4847 | mddev->new_chunk == mddev->chunk_size) |
4500 | if (mddev->delta_disks == 0) | 4848 | return -EINVAL; /* nothing to do */ |
4501 | return 0; /* nothing to do */ | ||
4502 | if (mddev->bitmap) | 4849 | if (mddev->bitmap) |
4503 | /* Cannot grow a bitmap yet */ | 4850 | /* Cannot grow a bitmap yet */ |
4504 | return -EBUSY; | 4851 | return -EBUSY; |
4852 | if (mddev->degraded > conf->max_degraded) | ||
4853 | return -EINVAL; | ||
4854 | if (mddev->delta_disks < 0) { | ||
4855 | /* We might be able to shrink, but the devices must | ||
4856 | * be made bigger first. | ||
4857 | * For raid6, 4 is the minimum size. | ||
4858 | * Otherwise 2 is the minimum | ||
4859 | */ | ||
4860 | int min = 2; | ||
4861 | if (mddev->level == 6) | ||
4862 | min = 4; | ||
4863 | if (mddev->raid_disks + mddev->delta_disks < min) | ||
4864 | return -EINVAL; | ||
4865 | } | ||
4505 | 4866 | ||
4506 | /* Can only proceed if there are plenty of stripe_heads. | 4867 | /* Can only proceed if there are plenty of stripe_heads. |
4507 | * We need a minimum of one full stripe,, and for sensible progress | 4868 | * We need a minimum of one full stripe,, and for sensible progress |
@@ -4514,18 +4875,12 @@ static int raid5_check_reshape(mddev_t *mddev) | |||
4514 | if ((mddev->chunk_size / STRIPE_SIZE) * 4 > conf->max_nr_stripes || | 4875 | if ((mddev->chunk_size / STRIPE_SIZE) * 4 > conf->max_nr_stripes || |
4515 | (mddev->new_chunk / STRIPE_SIZE) * 4 > conf->max_nr_stripes) { | 4876 | (mddev->new_chunk / STRIPE_SIZE) * 4 > conf->max_nr_stripes) { |
4516 | printk(KERN_WARNING "raid5: reshape: not enough stripes. Needed %lu\n", | 4877 | printk(KERN_WARNING "raid5: reshape: not enough stripes. Needed %lu\n", |
4517 | (mddev->chunk_size / STRIPE_SIZE)*4); | 4878 | (max(mddev->chunk_size, mddev->new_chunk) |
4879 | / STRIPE_SIZE)*4); | ||
4518 | return -ENOSPC; | 4880 | return -ENOSPC; |
4519 | } | 4881 | } |
4520 | 4882 | ||
4521 | err = resize_stripes(conf, conf->raid_disks + mddev->delta_disks); | 4883 | return resize_stripes(conf, conf->raid_disks + mddev->delta_disks); |
4522 | if (err) | ||
4523 | return err; | ||
4524 | |||
4525 | if (mddev->degraded > conf->max_degraded) | ||
4526 | return -EINVAL; | ||
4527 | /* looks like we might be able to manage this */ | ||
4528 | return 0; | ||
4529 | } | 4884 | } |
4530 | 4885 | ||
4531 | static int raid5_start_reshape(mddev_t *mddev) | 4886 | static int raid5_start_reshape(mddev_t *mddev) |
@@ -4550,12 +4905,31 @@ static int raid5_start_reshape(mddev_t *mddev) | |||
4550 | */ | 4905 | */ |
4551 | return -EINVAL; | 4906 | return -EINVAL; |
4552 | 4907 | ||
4908 | /* Refuse to reduce size of the array. Any reductions in | ||
4909 | * array size must be through explicit setting of array_size | ||
4910 | * attribute. | ||
4911 | */ | ||
4912 | if (raid5_size(mddev, 0, conf->raid_disks + mddev->delta_disks) | ||
4913 | < mddev->array_sectors) { | ||
4914 | printk(KERN_ERR "md: %s: array size must be reduced " | ||
4915 | "before number of disks\n", mdname(mddev)); | ||
4916 | return -EINVAL; | ||
4917 | } | ||
4918 | |||
4553 | atomic_set(&conf->reshape_stripes, 0); | 4919 | atomic_set(&conf->reshape_stripes, 0); |
4554 | spin_lock_irq(&conf->device_lock); | 4920 | spin_lock_irq(&conf->device_lock); |
4555 | conf->previous_raid_disks = conf->raid_disks; | 4921 | conf->previous_raid_disks = conf->raid_disks; |
4556 | conf->raid_disks += mddev->delta_disks; | 4922 | conf->raid_disks += mddev->delta_disks; |
4557 | conf->expand_progress = 0; | 4923 | conf->prev_chunk = conf->chunk_size; |
4558 | conf->expand_lo = 0; | 4924 | conf->chunk_size = mddev->new_chunk; |
4925 | conf->prev_algo = conf->algorithm; | ||
4926 | conf->algorithm = mddev->new_layout; | ||
4927 | if (mddev->delta_disks < 0) | ||
4928 | conf->reshape_progress = raid5_size(mddev, 0, 0); | ||
4929 | else | ||
4930 | conf->reshape_progress = 0; | ||
4931 | conf->reshape_safe = conf->reshape_progress; | ||
4932 | conf->generation++; | ||
4559 | spin_unlock_irq(&conf->device_lock); | 4933 | spin_unlock_irq(&conf->device_lock); |
4560 | 4934 | ||
4561 | /* Add some new drives, as many as will fit. | 4935 | /* Add some new drives, as many as will fit. |
@@ -4580,9 +4954,12 @@ static int raid5_start_reshape(mddev_t *mddev) | |||
4580 | break; | 4954 | break; |
4581 | } | 4955 | } |
4582 | 4956 | ||
4583 | spin_lock_irqsave(&conf->device_lock, flags); | 4957 | if (mddev->delta_disks > 0) { |
4584 | mddev->degraded = (conf->raid_disks - conf->previous_raid_disks) - added_devices; | 4958 | spin_lock_irqsave(&conf->device_lock, flags); |
4585 | spin_unlock_irqrestore(&conf->device_lock, flags); | 4959 | mddev->degraded = (conf->raid_disks - conf->previous_raid_disks) |
4960 | - added_devices; | ||
4961 | spin_unlock_irqrestore(&conf->device_lock, flags); | ||
4962 | } | ||
4586 | mddev->raid_disks = conf->raid_disks; | 4963 | mddev->raid_disks = conf->raid_disks; |
4587 | mddev->reshape_position = 0; | 4964 | mddev->reshape_position = 0; |
4588 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | 4965 | set_bit(MD_CHANGE_DEVS, &mddev->flags); |
@@ -4597,52 +4974,86 @@ static int raid5_start_reshape(mddev_t *mddev) | |||
4597 | mddev->recovery = 0; | 4974 | mddev->recovery = 0; |
4598 | spin_lock_irq(&conf->device_lock); | 4975 | spin_lock_irq(&conf->device_lock); |
4599 | mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks; | 4976 | mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks; |
4600 | conf->expand_progress = MaxSector; | 4977 | conf->reshape_progress = MaxSector; |
4601 | spin_unlock_irq(&conf->device_lock); | 4978 | spin_unlock_irq(&conf->device_lock); |
4602 | return -EAGAIN; | 4979 | return -EAGAIN; |
4603 | } | 4980 | } |
4981 | conf->reshape_checkpoint = jiffies; | ||
4604 | md_wakeup_thread(mddev->sync_thread); | 4982 | md_wakeup_thread(mddev->sync_thread); |
4605 | md_new_event(mddev); | 4983 | md_new_event(mddev); |
4606 | return 0; | 4984 | return 0; |
4607 | } | 4985 | } |
4608 | #endif | ||
4609 | 4986 | ||
4987 | /* This is called from the reshape thread and should make any | ||
4988 | * changes needed in 'conf' | ||
4989 | */ | ||
4610 | static void end_reshape(raid5_conf_t *conf) | 4990 | static void end_reshape(raid5_conf_t *conf) |
4611 | { | 4991 | { |
4612 | struct block_device *bdev; | ||
4613 | 4992 | ||
4614 | if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) { | 4993 | if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) { |
4615 | conf->mddev->array_sectors = 2 * conf->mddev->size * | 4994 | |
4616 | (conf->raid_disks - conf->max_degraded); | ||
4617 | set_capacity(conf->mddev->gendisk, conf->mddev->array_sectors); | ||
4618 | conf->mddev->changed = 1; | ||
4619 | |||
4620 | bdev = bdget_disk(conf->mddev->gendisk, 0); | ||
4621 | if (bdev) { | ||
4622 | mutex_lock(&bdev->bd_inode->i_mutex); | ||
4623 | i_size_write(bdev->bd_inode, | ||
4624 | (loff_t)conf->mddev->array_sectors << 9); | ||
4625 | mutex_unlock(&bdev->bd_inode->i_mutex); | ||
4626 | bdput(bdev); | ||
4627 | } | ||
4628 | spin_lock_irq(&conf->device_lock); | 4995 | spin_lock_irq(&conf->device_lock); |
4629 | conf->expand_progress = MaxSector; | 4996 | conf->previous_raid_disks = conf->raid_disks; |
4997 | conf->reshape_progress = MaxSector; | ||
4630 | spin_unlock_irq(&conf->device_lock); | 4998 | spin_unlock_irq(&conf->device_lock); |
4631 | conf->mddev->reshape_position = MaxSector; | 4999 | wake_up(&conf->wait_for_overlap); |
4632 | 5000 | ||
4633 | /* read-ahead size must cover two whole stripes, which is | 5001 | /* read-ahead size must cover two whole stripes, which is |
4634 | * 2 * (datadisks) * chunksize where 'n' is the number of raid devices | 5002 | * 2 * (datadisks) * chunksize where 'n' is the number of raid devices |
4635 | */ | 5003 | */ |
4636 | { | 5004 | { |
4637 | int data_disks = conf->previous_raid_disks - conf->max_degraded; | 5005 | int data_disks = conf->raid_disks - conf->max_degraded; |
4638 | int stripe = data_disks * | 5006 | int stripe = data_disks * (conf->chunk_size |
4639 | (conf->mddev->chunk_size / PAGE_SIZE); | 5007 | / PAGE_SIZE); |
4640 | if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe) | 5008 | if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe) |
4641 | conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe; | 5009 | conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe; |
4642 | } | 5010 | } |
4643 | } | 5011 | } |
4644 | } | 5012 | } |
4645 | 5013 | ||
5014 | /* This is called from the raid5d thread with mddev_lock held. | ||
5015 | * It makes config changes to the device. | ||
5016 | */ | ||
5017 | static void raid5_finish_reshape(mddev_t *mddev) | ||
5018 | { | ||
5019 | struct block_device *bdev; | ||
5020 | raid5_conf_t *conf = mddev_to_conf(mddev); | ||
5021 | |||
5022 | if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { | ||
5023 | |||
5024 | if (mddev->delta_disks > 0) { | ||
5025 | md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); | ||
5026 | set_capacity(mddev->gendisk, mddev->array_sectors); | ||
5027 | mddev->changed = 1; | ||
5028 | |||
5029 | bdev = bdget_disk(mddev->gendisk, 0); | ||
5030 | if (bdev) { | ||
5031 | mutex_lock(&bdev->bd_inode->i_mutex); | ||
5032 | i_size_write(bdev->bd_inode, | ||
5033 | (loff_t)mddev->array_sectors << 9); | ||
5034 | mutex_unlock(&bdev->bd_inode->i_mutex); | ||
5035 | bdput(bdev); | ||
5036 | } | ||
5037 | } else { | ||
5038 | int d; | ||
5039 | mddev->degraded = conf->raid_disks; | ||
5040 | for (d = 0; d < conf->raid_disks ; d++) | ||
5041 | if (conf->disks[d].rdev && | ||
5042 | test_bit(In_sync, | ||
5043 | &conf->disks[d].rdev->flags)) | ||
5044 | mddev->degraded--; | ||
5045 | for (d = conf->raid_disks ; | ||
5046 | d < conf->raid_disks - mddev->delta_disks; | ||
5047 | d++) | ||
5048 | raid5_remove_disk(mddev, d); | ||
5049 | } | ||
5050 | mddev->layout = conf->algorithm; | ||
5051 | mddev->chunk_size = conf->chunk_size; | ||
5052 | mddev->reshape_position = MaxSector; | ||
5053 | mddev->delta_disks = 0; | ||
5054 | } | ||
5055 | } | ||
5056 | |||
4646 | static void raid5_quiesce(mddev_t *mddev, int state) | 5057 | static void raid5_quiesce(mddev_t *mddev, int state) |
4647 | { | 5058 | { |
4648 | raid5_conf_t *conf = mddev_to_conf(mddev); | 5059 | raid5_conf_t *conf = mddev_to_conf(mddev); |
@@ -4672,6 +5083,212 @@ static void raid5_quiesce(mddev_t *mddev, int state) | |||
4672 | } | 5083 | } |
4673 | } | 5084 | } |
4674 | 5085 | ||
5086 | |||
5087 | static void *raid5_takeover_raid1(mddev_t *mddev) | ||
5088 | { | ||
5089 | int chunksect; | ||
5090 | |||
5091 | if (mddev->raid_disks != 2 || | ||
5092 | mddev->degraded > 1) | ||
5093 | return ERR_PTR(-EINVAL); | ||
5094 | |||
5095 | /* Should check if there are write-behind devices? */ | ||
5096 | |||
5097 | chunksect = 64*2; /* 64K by default */ | ||
5098 | |||
5099 | /* The array must be an exact multiple of chunksize */ | ||
5100 | while (chunksect && (mddev->array_sectors & (chunksect-1))) | ||
5101 | chunksect >>= 1; | ||
5102 | |||
5103 | if ((chunksect<<9) < STRIPE_SIZE) | ||
5104 | /* array size does not allow a suitable chunk size */ | ||
5105 | return ERR_PTR(-EINVAL); | ||
5106 | |||
5107 | mddev->new_level = 5; | ||
5108 | mddev->new_layout = ALGORITHM_LEFT_SYMMETRIC; | ||
5109 | mddev->new_chunk = chunksect << 9; | ||
5110 | |||
5111 | return setup_conf(mddev); | ||
5112 | } | ||
5113 | |||
5114 | static void *raid5_takeover_raid6(mddev_t *mddev) | ||
5115 | { | ||
5116 | int new_layout; | ||
5117 | |||
5118 | switch (mddev->layout) { | ||
5119 | case ALGORITHM_LEFT_ASYMMETRIC_6: | ||
5120 | new_layout = ALGORITHM_LEFT_ASYMMETRIC; | ||
5121 | break; | ||
5122 | case ALGORITHM_RIGHT_ASYMMETRIC_6: | ||
5123 | new_layout = ALGORITHM_RIGHT_ASYMMETRIC; | ||
5124 | break; | ||
5125 | case ALGORITHM_LEFT_SYMMETRIC_6: | ||
5126 | new_layout = ALGORITHM_LEFT_SYMMETRIC; | ||
5127 | break; | ||
5128 | case ALGORITHM_RIGHT_SYMMETRIC_6: | ||
5129 | new_layout = ALGORITHM_RIGHT_SYMMETRIC; | ||
5130 | break; | ||
5131 | case ALGORITHM_PARITY_0_6: | ||
5132 | new_layout = ALGORITHM_PARITY_0; | ||
5133 | break; | ||
5134 | case ALGORITHM_PARITY_N: | ||
5135 | new_layout = ALGORITHM_PARITY_N; | ||
5136 | break; | ||
5137 | default: | ||
5138 | return ERR_PTR(-EINVAL); | ||
5139 | } | ||
5140 | mddev->new_level = 5; | ||
5141 | mddev->new_layout = new_layout; | ||
5142 | mddev->delta_disks = -1; | ||
5143 | mddev->raid_disks -= 1; | ||
5144 | return setup_conf(mddev); | ||
5145 | } | ||
5146 | |||
5147 | |||
5148 | static int raid5_reconfig(mddev_t *mddev, int new_layout, int new_chunk) | ||
5149 | { | ||
5150 | /* For a 2-drive array, the layout and chunk size can be changed | ||
5151 | * immediately as no restriping is needed. | |
5152 | * For larger arrays we record the new value - after validation | ||
5153 | * to be used by a reshape pass. | ||
5154 | */ | ||
5155 | raid5_conf_t *conf = mddev_to_conf(mddev); | ||
5156 | |||
5157 | if (new_layout >= 0 && !algorithm_valid_raid5(new_layout)) | ||
5158 | return -EINVAL; | ||
5159 | if (new_chunk > 0) { | ||
5160 | if (new_chunk & (new_chunk-1)) | ||
5161 | /* not a power of 2 */ | ||
5162 | return -EINVAL; | ||
5163 | if (new_chunk < PAGE_SIZE) | ||
5164 | return -EINVAL; | ||
5165 | if (mddev->array_sectors & ((new_chunk>>9)-1)) | ||
5166 | /* not factor of array size */ | ||
5167 | return -EINVAL; | ||
5168 | } | ||
5169 | |||
5170 | /* They look valid */ | ||
5171 | |||
5172 | if (mddev->raid_disks == 2) { | ||
5173 | |||
5174 | if (new_layout >= 0) { | ||
5175 | conf->algorithm = new_layout; | ||
5176 | mddev->layout = mddev->new_layout = new_layout; | ||
5177 | } | ||
5178 | if (new_chunk > 0) { | ||
5179 | conf->chunk_size = new_chunk; | ||
5180 | mddev->chunk_size = mddev->new_chunk = new_chunk; | ||
5181 | } | ||
5182 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | ||
5183 | md_wakeup_thread(mddev->thread); | ||
5184 | } else { | ||
5185 | if (new_layout >= 0) | ||
5186 | mddev->new_layout = new_layout; | ||
5187 | if (new_chunk > 0) | ||
5188 | mddev->new_chunk = new_chunk; | ||
5189 | } | ||
5190 | return 0; | ||
5191 | } | ||
5192 | |||
5193 | static int raid6_reconfig(mddev_t *mddev, int new_layout, int new_chunk) | ||
5194 | { | ||
5195 | if (new_layout >= 0 && !algorithm_valid_raid6(new_layout)) | ||
5196 | return -EINVAL; | ||
5197 | if (new_chunk > 0) { | ||
5198 | if (new_chunk & (new_chunk-1)) | ||
5199 | /* not a power of 2 */ | ||
5200 | return -EINVAL; | ||
5201 | if (new_chunk < PAGE_SIZE) | ||
5202 | return -EINVAL; | ||
5203 | if (mddev->array_sectors & ((new_chunk>>9)-1)) | ||
5204 | /* not factor of array size */ | ||
5205 | return -EINVAL; | ||
5206 | } | ||
5207 | |||
5208 | /* They look valid */ | ||
5209 | |||
5210 | if (new_layout >= 0) | ||
5211 | mddev->new_layout = new_layout; | ||
5212 | if (new_chunk > 0) | ||
5213 | mddev->new_chunk = new_chunk; | ||
5214 | |||
5215 | return 0; | ||
5216 | } | ||
5217 | |||
5218 | static void *raid5_takeover(mddev_t *mddev) | ||
5219 | { | ||
5220 | /* raid5 can take over: | ||
5221 | * raid0 - if all devices are the same - make it a raid4 layout | ||
5222 | * raid1 - if there are two drives. We need to know the chunk size | ||
5223 | * raid4 - trivial - just use a raid4 layout. | ||
5224 | * raid6 - Providing it is a *_6 layout | ||
5225 | * | ||
5226 | * For now, just do raid1 | ||
5227 | */ | ||
5228 | |||
5229 | if (mddev->level == 1) | ||
5230 | return raid5_takeover_raid1(mddev); | ||
5231 | if (mddev->level == 4) { | ||
5232 | mddev->new_layout = ALGORITHM_PARITY_N; | ||
5233 | mddev->new_level = 5; | ||
5234 | return setup_conf(mddev); | ||
5235 | } | ||
5236 | if (mddev->level == 6) | ||
5237 | return raid5_takeover_raid6(mddev); | ||
5238 | |||
5239 | return ERR_PTR(-EINVAL); | ||
5240 | } | ||
5241 | |||
5242 | |||
5243 | static struct mdk_personality raid5_personality; | ||
5244 | |||
5245 | static void *raid6_takeover(mddev_t *mddev) | ||
5246 | { | ||
5247 | /* Currently can only take over a raid5. We map the | ||
5248 | * personality to an equivalent raid6 personality | ||
5249 | * with the Q block at the end. | ||
5250 | */ | ||
5251 | int new_layout; | ||
5252 | |||
5253 | if (mddev->pers != &raid5_personality) | ||
5254 | return ERR_PTR(-EINVAL); | ||
5255 | if (mddev->degraded > 1) | ||
5256 | return ERR_PTR(-EINVAL); | ||
5257 | if (mddev->raid_disks > 253) | ||
5258 | return ERR_PTR(-EINVAL); | ||
5259 | if (mddev->raid_disks < 3) | ||
5260 | return ERR_PTR(-EINVAL); | ||
5261 | |||
5262 | switch (mddev->layout) { | ||
5263 | case ALGORITHM_LEFT_ASYMMETRIC: | ||
5264 | new_layout = ALGORITHM_LEFT_ASYMMETRIC_6; | ||
5265 | break; | ||
5266 | case ALGORITHM_RIGHT_ASYMMETRIC: | ||
5267 | new_layout = ALGORITHM_RIGHT_ASYMMETRIC_6; | ||
5268 | break; | ||
5269 | case ALGORITHM_LEFT_SYMMETRIC: | ||
5270 | new_layout = ALGORITHM_LEFT_SYMMETRIC_6; | ||
5271 | break; | ||
5272 | case ALGORITHM_RIGHT_SYMMETRIC: | ||
5273 | new_layout = ALGORITHM_RIGHT_SYMMETRIC_6; | ||
5274 | break; | ||
5275 | case ALGORITHM_PARITY_0: | ||
5276 | new_layout = ALGORITHM_PARITY_0_6; | ||
5277 | break; | ||
5278 | case ALGORITHM_PARITY_N: | ||
5279 | new_layout = ALGORITHM_PARITY_N; | ||
5280 | break; | ||
5281 | default: | ||
5282 | return ERR_PTR(-EINVAL); | ||
5283 | } | ||
5284 | mddev->new_level = 6; | ||
5285 | mddev->new_layout = new_layout; | ||
5286 | mddev->delta_disks = 1; | ||
5287 | mddev->raid_disks += 1; | ||
5288 | return setup_conf(mddev); | ||
5289 | } | ||
5290 | |||
5291 | |||
4675 | static struct mdk_personality raid6_personality = | 5292 | static struct mdk_personality raid6_personality = |
4676 | { | 5293 | { |
4677 | .name = "raid6", | 5294 | .name = "raid6", |
@@ -4687,11 +5304,13 @@ static struct mdk_personality raid6_personality = | |||
4687 | .spare_active = raid5_spare_active, | 5304 | .spare_active = raid5_spare_active, |
4688 | .sync_request = sync_request, | 5305 | .sync_request = sync_request, |
4689 | .resize = raid5_resize, | 5306 | .resize = raid5_resize, |
4690 | #ifdef CONFIG_MD_RAID5_RESHAPE | 5307 | .size = raid5_size, |
4691 | .check_reshape = raid5_check_reshape, | 5308 | .check_reshape = raid5_check_reshape, |
4692 | .start_reshape = raid5_start_reshape, | 5309 | .start_reshape = raid5_start_reshape, |
4693 | #endif | 5310 | .finish_reshape = raid5_finish_reshape, |
4694 | .quiesce = raid5_quiesce, | 5311 | .quiesce = raid5_quiesce, |
5312 | .takeover = raid6_takeover, | ||
5313 | .reconfig = raid6_reconfig, | ||
4695 | }; | 5314 | }; |
4696 | static struct mdk_personality raid5_personality = | 5315 | static struct mdk_personality raid5_personality = |
4697 | { | 5316 | { |
@@ -4708,11 +5327,13 @@ static struct mdk_personality raid5_personality = | |||
4708 | .spare_active = raid5_spare_active, | 5327 | .spare_active = raid5_spare_active, |
4709 | .sync_request = sync_request, | 5328 | .sync_request = sync_request, |
4710 | .resize = raid5_resize, | 5329 | .resize = raid5_resize, |
4711 | #ifdef CONFIG_MD_RAID5_RESHAPE | 5330 | .size = raid5_size, |
4712 | .check_reshape = raid5_check_reshape, | 5331 | .check_reshape = raid5_check_reshape, |
4713 | .start_reshape = raid5_start_reshape, | 5332 | .start_reshape = raid5_start_reshape, |
4714 | #endif | 5333 | .finish_reshape = raid5_finish_reshape, |
4715 | .quiesce = raid5_quiesce, | 5334 | .quiesce = raid5_quiesce, |
5335 | .takeover = raid5_takeover, | ||
5336 | .reconfig = raid5_reconfig, | ||
4716 | }; | 5337 | }; |
4717 | 5338 | ||
4718 | static struct mdk_personality raid4_personality = | 5339 | static struct mdk_personality raid4_personality = |
@@ -4730,20 +5351,15 @@ static struct mdk_personality raid4_personality = | |||
4730 | .spare_active = raid5_spare_active, | 5351 | .spare_active = raid5_spare_active, |
4731 | .sync_request = sync_request, | 5352 | .sync_request = sync_request, |
4732 | .resize = raid5_resize, | 5353 | .resize = raid5_resize, |
4733 | #ifdef CONFIG_MD_RAID5_RESHAPE | 5354 | .size = raid5_size, |
4734 | .check_reshape = raid5_check_reshape, | 5355 | .check_reshape = raid5_check_reshape, |
4735 | .start_reshape = raid5_start_reshape, | 5356 | .start_reshape = raid5_start_reshape, |
4736 | #endif | 5357 | .finish_reshape = raid5_finish_reshape, |
4737 | .quiesce = raid5_quiesce, | 5358 | .quiesce = raid5_quiesce, |
4738 | }; | 5359 | }; |
4739 | 5360 | ||
4740 | static int __init raid5_init(void) | 5361 | static int __init raid5_init(void) |
4741 | { | 5362 | { |
4742 | int e; | ||
4743 | |||
4744 | e = raid6_select_algo(); | ||
4745 | if ( e ) | ||
4746 | return e; | ||
4747 | register_md_personality(&raid6_personality); | 5363 | register_md_personality(&raid6_personality); |
4748 | register_md_personality(&raid5_personality); | 5364 | register_md_personality(&raid5_personality); |
4749 | register_md_personality(&raid4_personality); | 5365 | register_md_personality(&raid4_personality); |
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h new file mode 100644 index 000000000000..52ba99954dec --- /dev/null +++ b/drivers/md/raid5.h | |||
@@ -0,0 +1,474 @@ | |||
1 | #ifndef _RAID5_H | ||
2 | #define _RAID5_H | ||
3 | |||
4 | #include <linux/raid/xor.h> | ||
5 | |||
6 | /* | ||
7 | * | ||
8 | * Each stripe contains one buffer per disc. Each buffer can be in | ||
9 | * one of a number of states stored in "flags". Changes between | ||
10 | * these states happen *almost* exclusively under a per-stripe | ||
11 | * spinlock. Some very specific changes can happen in bi_end_io, and | ||
12 | * these are not protected by the spin lock. | ||
13 | * | ||
14 | * The flag bits that are used to represent these states are: | ||
15 | * R5_UPTODATE and R5_LOCKED | ||
16 | * | ||
17 | * State Empty == !UPTODATE, !LOCK | ||
18 | * We have no data, and there is no active request | ||
19 | * State Want == !UPTODATE, LOCK | ||
20 | * A read request is being submitted for this block | ||
21 | * State Dirty == UPTODATE, LOCK | ||
22 | * Some new data is in this buffer, and it is being written out | ||
23 | * State Clean == UPTODATE, !LOCK | ||
24 | * We have valid data which is the same as on disc | ||
25 | * | ||
26 | * The possible state transitions are: | ||
27 | * | ||
28 | * Empty -> Want - on read or write to get old data for parity calc | ||
29 | * Empty -> Dirty - on compute_parity to satisfy write/sync request.(RECONSTRUCT_WRITE) | ||
30 | * Empty -> Clean - on compute_block when computing a block for failed drive | ||
31 | * Want -> Empty - on failed read | ||
32 | * Want -> Clean - on successful completion of read request | ||
33 | * Dirty -> Clean - on successful completion of write request | ||
34 | * Dirty -> Clean - on failed write | ||
35 | * Clean -> Dirty - on compute_parity to satisfy write/sync (RECONSTRUCT or RMW) | ||
36 | * | ||
37 | * The Want->Empty, Want->Clean, Dirty->Clean, transitions | ||
38 | * all happen in b_end_io at interrupt time. | ||
39 | * Each sets the Uptodate bit before releasing the Lock bit. | ||
40 | * This leaves one multi-stage transition: | ||
41 | * Want->Dirty->Clean | ||
42 | * This is safe because thinking that a Clean buffer is actually dirty | ||
43 | * will at worst delay some action, and the stripe will be scheduled | ||
44 | * for attention after the transition is complete. | ||
45 | * | ||
46 | * There is one possibility that is not covered by these states. That | ||
47 | * is if one drive has failed and there is a spare being rebuilt. We | ||
48 | * can't distinguish between a clean block that has been generated | ||
49 | * from parity calculations, and a clean block that has been | ||
50 | * successfully written to the spare ( or to parity when resyncing). | ||
51 | * To distinguish these states we have a stripe bit STRIPE_INSYNC that | |
52 | * is set whenever a write is scheduled to the spare, or to the parity | ||
53 | * disc if there is no spare. A sync request clears this bit, and | ||
54 | * when we find it set with no buffers locked, we know the sync is | ||
55 | * complete. | ||
56 | * | ||
57 | * Buffers for the md device that arrive via make_request are attached | ||
58 | * to the appropriate stripe in one of two lists linked on b_reqnext. | ||
59 | * One list (bh_read) for read requests, one (bh_write) for write. | ||
60 | * There should never be more than one buffer on the two lists | ||
61 | * together, but we are not guaranteed of that so we allow for more. | ||
62 | * | ||
63 | * If a buffer is on the read list when the associated cache buffer is | ||
64 | * Uptodate, the data is copied into the read buffer and its b_end_io | |
65 | * routine is called. This may happen in the end_request routine only | ||
66 | * if the buffer has just successfully been read. end_request should | ||
67 | * remove the buffers from the list and then set the Uptodate bit on | ||
68 | * the buffer. Other threads may do this only if they first check | ||
69 | * that the Uptodate bit is set. Once they have checked that they may | ||
70 | * take buffers off the read queue. | ||
71 | * | ||
72 | * When a buffer on the write list is committed for write it is copied | ||
73 | * into the cache buffer, which is then marked dirty, and moved onto a | ||
74 | * third list, the written list (bh_written). Once both the parity | ||
75 | * block and the cached buffer are successfully written, any buffer on | ||
76 | * a written list can be returned with b_end_io. | ||
77 | * | ||
78 | * The write list and read list both act as fifos. The read list is | ||
79 | * protected by the device_lock. The write and written lists are | ||
80 | * protected by the stripe lock. The device_lock, which can be | ||
81 | * claimed while the stripe lock is held, is only for list | |
82 | * manipulations and will only be held for a very short time. It can | ||
83 | * be claimed from interrupts. | ||
84 | * | ||
85 | * | ||
86 | * Stripes in the stripe cache can be on one of two lists (or on | ||
87 | * neither). The "inactive_list" contains stripes which are not | ||
88 | * currently being used for any request. They can freely be reused | ||
89 | * for another stripe. The "handle_list" contains stripes that need | ||
90 | * to be handled in some way. Both of these are fifo queues. Each | ||
91 | * stripe is also (potentially) linked to a hash bucket in the hash | ||
92 | * table so that it can be found by sector number. Stripes that are | ||
93 | * not hashed must be on the inactive_list, and will normally be at | ||
94 | * the front. All stripes start life this way. | ||
95 | * | ||
96 | * The inactive_list, handle_list and hash bucket lists are all protected by the | ||
97 | * device_lock. | ||
98 | * - stripes on the inactive_list never have their stripe_lock held. | ||
99 | * - stripes have a reference counter. If count==0, they are on a list. | ||
100 | * - If a stripe might need handling, STRIPE_HANDLE is set. | ||
101 | * - When refcount reaches zero, then if STRIPE_HANDLE it is put on | ||
102 | * handle_list else inactive_list | ||
103 | * | ||
104 | * This, combined with the fact that STRIPE_HANDLE is only ever | ||
105 | * cleared while a stripe has a non-zero count means that if the | ||
106 | * refcount is 0 and STRIPE_HANDLE is set, then it is on the | ||
107 | * handle_list and if refcount is 0 and STRIPE_HANDLE is not set, then | |
108 | * the stripe is on inactive_list. | ||
109 | * | ||
110 | * The possible transitions are: | ||
111 | * activate an unhashed/inactive stripe (get_active_stripe()) | ||
112 | * lockdev check-hash unlink-stripe cnt++ clean-stripe hash-stripe unlockdev | ||
113 | * activate a hashed, possibly active stripe (get_active_stripe()) | ||
114 | * lockdev check-hash if(!cnt++)unlink-stripe unlockdev | ||
115 | * attach a request to an active stripe (add_stripe_bh()) | ||
116 | * lockdev attach-buffer unlockdev | ||
117 | * handle a stripe (handle_stripe()) | ||
118 | * lockstripe clrSTRIPE_HANDLE ... | ||
119 | * (lockdev check-buffers unlockdev) .. | ||
120 | * change-state .. | ||
121 | * record io/ops needed unlockstripe schedule io/ops | ||
122 | * release an active stripe (release_stripe()) | ||
123 | * lockdev if (!--cnt) { if STRIPE_HANDLE, add to handle_list else add to inactive-list } unlockdev | ||
124 | * | ||
125 | * The refcount counts each thread that has activated the stripe, | |
126 | * plus raid5d if it is handling it, plus one for each active request | ||
127 | * on a cached buffer, and plus one if the stripe is undergoing stripe | ||
128 | * operations. | ||
129 | * | ||
130 | * Stripe operations are performed outside the stripe lock, | ||
131 | * the stripe operations are: | ||
132 | * -copying data between the stripe cache and user application buffers | ||
133 | * -computing blocks to save a disk access, or to recover a missing block | ||
134 | * -updating the parity on a write operation (reconstruct write and | ||
135 | * read-modify-write) | ||
136 | * -checking parity correctness | ||
137 | * -running i/o to disk | ||
138 | * These operations are carried out by raid5_run_ops which uses the async_tx | ||
139 | * api to (optionally) offload operations to dedicated hardware engines. | ||
140 | * When requesting an operation handle_stripe sets the pending bit for the | ||
141 | * operation and increments the count. raid5_run_ops is then run whenever | ||
142 | * the count is non-zero. | ||
143 | * There are some critical dependencies between the operations that prevent some | ||
144 | * from being requested while another is in flight. | ||
145 | * 1/ Parity check operations destroy the in cache version of the parity block, | ||
146 | * so we prevent parity dependent operations like writes and compute_blocks | ||
147 | * from starting while a check is in progress. Some dma engines can perform | ||
148 | * the check without damaging the parity block, in these cases the parity | ||
149 | * block is re-marked up to date (assuming the check was successful) and is | ||
150 | * not re-read from disk. | ||
151 | * 2/ When a write operation is requested we immediately lock the affected | ||
152 | * blocks, and mark them as not up to date. This causes new read requests | ||
153 | * to be held off, as well as parity checks and compute block operations. | ||
154 | * 3/ Once a compute block operation has been requested handle_stripe treats | ||
155 | * that block as if it is up to date. raid5_run_ops guarantees that any | |
156 | * operation that is dependent on the compute block result is initiated after | ||
157 | * the compute block completes. | ||
158 | */ | ||
159 | |||
160 | /* | ||
161 | * Operations state - intermediate states that are visible outside of sh->lock | ||
162 | * In general _idle indicates nothing is running, _run indicates a data | ||
163 | * processing operation is active, and _result means the data processing result | ||
164 | * is stable and can be acted upon. For simple operations like biofill and | ||
165 | * compute that only have an _idle and _run state they are indicated with | ||
166 | * sh->state flags (STRIPE_BIOFILL_RUN and STRIPE_COMPUTE_RUN) | ||
167 | */ | ||
168 | /** | ||
169 | * enum check_states - handles syncing / repairing a stripe | ||
170 | * @check_state_idle - check operations are quiesced | ||
171 | * @check_state_run - check operation is running | ||
172 | * @check_state_check_result - set outside lock when check result is valid | |
173 | * @check_state_compute_run - check failed and we are repairing | ||
174 | * @check_state_compute_result - set outside lock when compute result is valid | ||
175 | */ | ||
176 | enum check_states { | ||
177 | check_state_idle = 0, | ||
178 | check_state_run, /* parity check */ | ||
179 | check_state_check_result, | ||
180 | check_state_compute_run, /* parity repair */ | ||
181 | check_state_compute_result, | ||
182 | }; | ||
183 | |||
184 | /** | ||
185 | * enum reconstruct_states - handles writing or expanding a stripe | ||
186 | */ | ||
187 | enum reconstruct_states { | ||
188 | reconstruct_state_idle = 0, | ||
189 | reconstruct_state_prexor_drain_run, /* prexor-write */ | ||
190 | reconstruct_state_drain_run, /* write */ | ||
191 | reconstruct_state_run, /* expand */ | ||
192 | reconstruct_state_prexor_drain_result, | ||
193 | reconstruct_state_drain_result, | ||
194 | reconstruct_state_result, | ||
195 | }; | ||
196 | |||
197 | struct stripe_head { | ||
198 | struct hlist_node hash; | ||
199 | struct list_head lru; /* inactive_list or handle_list */ | ||
200 | struct raid5_private_data *raid_conf; | ||
201 | short generation; /* increments with every | ||
202 | * reshape */ | ||
203 | sector_t sector; /* sector of this row */ | ||
204 | short pd_idx; /* parity disk index */ | ||
205 | short qd_idx; /* 'Q' disk index for raid6 */ | ||
206 | short ddf_layout;/* use DDF ordering to calculate Q */ | ||
207 | unsigned long state; /* state flags */ | ||
208 | atomic_t count; /* nr of active thread/requests */ | ||
209 | spinlock_t lock; | ||
210 | int bm_seq; /* sequence number for bitmap flushes */ | ||
211 | int disks; /* disks in stripe */ | ||
212 | enum check_states check_state; | ||
213 | enum reconstruct_states reconstruct_state; | ||
214 | /* stripe_operations | ||
215 | * @target - STRIPE_OP_COMPUTE_BLK target | ||
216 | */ | ||
217 | struct stripe_operations { | ||
218 | int target; | ||
219 | u32 zero_sum_result; | ||
220 | } ops; | ||
221 | struct r5dev { | ||
222 | struct bio req; | ||
223 | struct bio_vec vec; | ||
224 | struct page *page; | ||
225 | struct bio *toread, *read, *towrite, *written; | ||
226 | sector_t sector; /* sector of this page */ | ||
227 | unsigned long flags; | ||
228 | } dev[1]; /* allocated with extra space depending of RAID geometry */ | ||
229 | }; | ||
230 | |||
231 | /* stripe_head_state - collects and tracks the dynamic state of a stripe_head | ||
232 | * for handle_stripe. It is only valid under spin_lock(sh->lock); | ||
233 | */ | ||
234 | struct stripe_head_state { | ||
235 | int syncing, expanding, expanded; | ||
236 | int locked, uptodate, to_read, to_write, failed, written; | ||
237 | int to_fill, compute, req_compute, non_overwrite; | ||
238 | int failed_num; | ||
239 | unsigned long ops_request; | ||
240 | }; | ||
241 | |||
242 | /* r6_state - extra state data only relevant to r6 */ | ||
243 | struct r6_state { | ||
244 | int p_failed, q_failed, failed_num[2]; | ||
245 | }; | ||
246 | |||
247 | /* Flags */ | ||
248 | #define R5_UPTODATE 0 /* page contains current data */ | ||
249 | #define R5_LOCKED 1 /* IO has been submitted on "req" */ | ||
250 | #define R5_OVERWRITE 2 /* towrite covers whole page */ | ||
251 | /* and some that are internal to handle_stripe */ | ||
252 | #define R5_Insync 3 /* rdev && rdev->in_sync at start */ | ||
253 | #define R5_Wantread 4 /* want to schedule a read */ | ||
254 | #define R5_Wantwrite 5 | ||
255 | #define R5_Overlap 7 /* There is a pending overlapping request on this block */ | ||
256 | #define R5_ReadError 8 /* seen a read error here recently */ | ||
257 | #define R5_ReWrite 9 /* have tried to over-write the readerror */ | ||
258 | |||
259 | #define R5_Expanded 10 /* This block now has post-expand data */ | ||
260 | #define R5_Wantcompute 11 /* compute_block in progress treat as | ||
261 | * uptodate | ||
262 | */ | ||
263 | #define R5_Wantfill 12 /* dev->toread contains a bio that needs | ||
264 | * filling | ||
265 | */ | ||
266 | #define R5_Wantdrain 13 /* dev->towrite needs to be drained */ | ||
267 | /* | ||
268 | * Write method | ||
269 | */ | ||
270 | #define RECONSTRUCT_WRITE 1 | ||
271 | #define READ_MODIFY_WRITE 2 | ||
272 | /* not a write method, but a compute_parity mode */ | ||
273 | #define CHECK_PARITY 3 | ||
274 | /* Additional compute_parity mode -- updates the parity w/o LOCKING */ | ||
275 | #define UPDATE_PARITY 4 | ||
276 | |||
277 | /* | ||
278 | * Stripe state | ||
279 | */ | ||
280 | #define STRIPE_HANDLE 2 | ||
281 | #define STRIPE_SYNCING 3 | ||
282 | #define STRIPE_INSYNC 4 | ||
283 | #define STRIPE_PREREAD_ACTIVE 5 | ||
284 | #define STRIPE_DELAYED 6 | ||
285 | #define STRIPE_DEGRADED 7 | ||
286 | #define STRIPE_BIT_DELAY 8 | ||
287 | #define STRIPE_EXPANDING 9 | ||
288 | #define STRIPE_EXPAND_SOURCE 10 | ||
289 | #define STRIPE_EXPAND_READY 11 | ||
290 | #define STRIPE_IO_STARTED 12 /* do not count towards 'bypass_count' */ | ||
291 | #define STRIPE_FULL_WRITE 13 /* all blocks are set to be overwritten */ | ||
292 | #define STRIPE_BIOFILL_RUN 14 | ||
293 | #define STRIPE_COMPUTE_RUN 15 | ||
294 | /* | ||
295 | * Operation request flags | ||
296 | */ | ||
297 | #define STRIPE_OP_BIOFILL 0 | ||
298 | #define STRIPE_OP_COMPUTE_BLK 1 | ||
299 | #define STRIPE_OP_PREXOR 2 | ||
300 | #define STRIPE_OP_BIODRAIN 3 | ||
301 | #define STRIPE_OP_POSTXOR 4 | ||
302 | #define STRIPE_OP_CHECK 5 | ||
303 | |||
304 | /* | ||
305 | * Plugging: | ||
306 | * | ||
307 | * To improve write throughput, we need to delay the handling of some | ||
308 | * stripes until there has been a chance that several write requests | ||
309 | * for the one stripe have all been collected. | ||
310 | * In particular, any write request that would require pre-reading | ||
311 | * is put on a "delayed" queue until there are no stripes currently | ||
312 | * in a pre-read phase. Further, if the "delayed" queue is empty when | ||
313 | * a stripe is put on it then we "plug" the queue and do not process it | ||
314 | * until an unplug call is made. (the unplug_io_fn() is called). | ||
315 | * | ||
316 | * When preread is initiated on a stripe, we set PREREAD_ACTIVE and add | ||
317 | * it to the count of prereading stripes. | ||
318 | * When write is initiated, or the stripe refcnt == 0 (just in case) we | ||
319 | * clear the PREREAD_ACTIVE flag and decrement the count | ||
320 | * Whenever the 'handle' queue is empty and the device is not plugged, we | ||
321 | * move any stripes from delayed to handle and clear the DELAYED flag and set | |
322 | * PREREAD_ACTIVE. | ||
323 | * In stripe_handle, if we find pre-reading is necessary, we do it if | ||
324 | * PREREAD_ACTIVE is set, else we set DELAYED which will send it to the delayed queue. | ||
325 | * HANDLE gets cleared if stripe_handle leave nothing locked. | ||
326 | */ | ||
327 | |||
328 | |||
329 | struct disk_info { | ||
330 | mdk_rdev_t *rdev; | ||
331 | }; | ||
332 | |||
333 | struct raid5_private_data { | ||
334 | struct hlist_head *stripe_hashtbl; | ||
335 | mddev_t *mddev; | ||
336 | struct disk_info *spare; | ||
337 | int chunk_size, level, algorithm; | ||
338 | int max_degraded; | ||
339 | int raid_disks; | ||
340 | int max_nr_stripes; | ||
341 | |||
342 | /* reshape_progress is the leading edge of a 'reshape' | ||
343 | * It has value MaxSector when no reshape is happening | ||
344 | * If delta_disks < 0, it is the last sector we started work on, | ||
345 | * else it is the next sector to work on. | |
346 | */ | ||
347 | sector_t reshape_progress; | ||
348 | /* reshape_safe is the trailing edge of a reshape. We know that | ||
349 | * before (or after) this address, all reshape has completed. | ||
350 | */ | ||
351 | sector_t reshape_safe; | ||
352 | int previous_raid_disks; | ||
353 | int prev_chunk, prev_algo; | ||
354 | short generation; /* increments with every reshape */ | ||
355 | unsigned long reshape_checkpoint; /* Time we last updated | ||
356 | * metadata */ | ||
357 | |||
358 | struct list_head handle_list; /* stripes needing handling */ | ||
359 | struct list_head hold_list; /* preread ready stripes */ | ||
360 | struct list_head delayed_list; /* stripes that have plugged requests */ | ||
361 | struct list_head bitmap_list; /* stripes delaying awaiting bitmap update */ | ||
362 | struct bio *retry_read_aligned; /* currently retrying aligned bios */ | ||
363 | struct bio *retry_read_aligned_list; /* aligned bios retry list */ | ||
364 | atomic_t preread_active_stripes; /* stripes with scheduled io */ | ||
365 | atomic_t active_aligned_reads; | ||
366 | atomic_t pending_full_writes; /* full write backlog */ | ||
367 | int bypass_count; /* bypassed prereads */ | ||
368 | int bypass_threshold; /* preread nice */ | ||
369 | struct list_head *last_hold; /* detect hold_list promotions */ | ||
370 | |||
371 | atomic_t reshape_stripes; /* stripes with pending writes for reshape */ | ||
372 | /* unfortunately we need two cache names as we temporarily have | ||
373 | * two caches. | ||
374 | */ | ||
375 | int active_name; | ||
376 | char cache_name[2][20]; | ||
377 | struct kmem_cache *slab_cache; /* for allocating stripes */ | ||
378 | |||
379 | int seq_flush, seq_write; | ||
380 | int quiesce; | ||
381 | |||
382 | int fullsync; /* set to 1 if a full sync is needed, | ||
383 | * (fresh device added). | ||
384 | * Cleared when a sync completes. | ||
385 | */ | ||
386 | |||
387 | struct page *spare_page; /* Used when checking P/Q in raid6 */ | ||
388 | |||
389 | /* | ||
390 | * Free stripes pool | ||
391 | */ | ||
392 | atomic_t active_stripes; | ||
393 | struct list_head inactive_list; | ||
394 | wait_queue_head_t wait_for_stripe; | ||
395 | wait_queue_head_t wait_for_overlap; | ||
396 | int inactive_blocked; /* release of inactive stripes blocked, | ||
397 | * waiting for 25% to be free | ||
398 | */ | ||
399 | int pool_size; /* number of disks in stripeheads in pool */ | ||
400 | spinlock_t device_lock; | ||
401 | struct disk_info *disks; | ||
402 | |||
403 | /* When taking over an array from a different personality, we store | ||
404 | * the new thread here until we fully activate the array. | ||
405 | */ | ||
406 | struct mdk_thread_s *thread; | ||
407 | }; | ||
408 | |||
409 | typedef struct raid5_private_data raid5_conf_t; | ||
410 | |||
411 | #define mddev_to_conf(mddev) ((raid5_conf_t *) mddev->private) | ||
412 | |||
413 | /* | ||
414 | * Our supported algorithms | ||
415 | */ | ||
416 | #define ALGORITHM_LEFT_ASYMMETRIC 0 /* Rotating Parity N with Data Restart */ | ||
417 | #define ALGORITHM_RIGHT_ASYMMETRIC 1 /* Rotating Parity 0 with Data Restart */ | ||
418 | #define ALGORITHM_LEFT_SYMMETRIC 2 /* Rotating Parity N with Data Continuation */ | ||
419 | #define ALGORITHM_RIGHT_SYMMETRIC 3 /* Rotating Parity 0 with Data Continuation */ | ||
420 | |||
421 | /* Define non-rotating (raid4) algorithms. These allow | ||
422 | * conversion of raid4 to raid5. | ||
423 | */ | ||
424 | #define ALGORITHM_PARITY_0 4 /* P or P,Q are initial devices */ | ||
425 | #define ALGORITHM_PARITY_N 5 /* P or P,Q are final devices. */ | ||
426 | |||
427 | /* DDF RAID6 layouts differ from md/raid6 layouts in two ways. | ||
428 | * Firstly, the exact positioning of the parity block is slightly | ||
429 | * different between the 'LEFT_*' modes of md and the "_N_*" modes | ||
430 | * of DDF. | ||
431 | * Secondly, the order of datablocks over which the Q syndrome is computed | |
432 | * is different. | ||
433 | * Consequently we have different layouts for DDF/raid6 than md/raid6. | ||
434 | * These layouts are from the DDFv1.2 spec. | ||
435 | * Interestingly DDFv1.2-Errata-A does not specify N_CONTINUE but | ||
436 | * leaves RLQ=3 as 'Vendor Specific' | ||
437 | */ | ||
438 | |||
439 | #define ALGORITHM_ROTATING_ZERO_RESTART 8 /* DDF PRL=6 RLQ=1 */ | ||
440 | #define ALGORITHM_ROTATING_N_RESTART 9 /* DDF PRL=6 RLQ=2 */ | ||
441 | #define ALGORITHM_ROTATING_N_CONTINUE 10 /*DDF PRL=6 RLQ=3 */ | ||
442 | |||
443 | |||
444 | /* For every RAID5 algorithm we define a RAID6 algorithm | ||
445 | * with exactly the same layout for data and parity, and | ||
446 | * with the Q block always on the last device (N-1). | ||
447 | * This allows trivial conversion from RAID5 to RAID6 | ||
448 | */ | ||
449 | #define ALGORITHM_LEFT_ASYMMETRIC_6 16 | ||
450 | #define ALGORITHM_RIGHT_ASYMMETRIC_6 17 | ||
451 | #define ALGORITHM_LEFT_SYMMETRIC_6 18 | ||
452 | #define ALGORITHM_RIGHT_SYMMETRIC_6 19 | ||
453 | #define ALGORITHM_PARITY_0_6 20 | ||
454 | #define ALGORITHM_PARITY_N_6 ALGORITHM_PARITY_N | ||
455 | |||
456 | static inline int algorithm_valid_raid5(int layout) | ||
457 | { | ||
458 | return (layout >= 0) && | ||
459 | (layout <= 5); | ||
460 | } | ||
461 | static inline int algorithm_valid_raid6(int layout) | ||
462 | { | ||
463 | return (layout >= 0 && layout <= 5) | ||
464 | || | ||
465 | (layout == 8 || layout == 10) | ||
466 | || | ||
467 | (layout >= 16 && layout <= 20); | ||
468 | } | ||
469 | |||
470 | static inline int algorithm_is_DDF(int layout) | ||
471 | { | ||
472 | return layout >= 8 && layout <= 10; | ||
473 | } | ||
474 | #endif | ||
diff --git a/drivers/md/raid6.h b/drivers/md/raid6.h deleted file mode 100644 index 98dcde88470e..000000000000 --- a/drivers/md/raid6.h +++ /dev/null | |||
@@ -1,130 +0,0 @@ | |||
1 | /* -*- linux-c -*- ------------------------------------------------------- * | ||
2 | * | ||
3 | * Copyright 2003 H. Peter Anvin - All Rights Reserved | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or modify | ||
6 | * it under the terms of the GNU General Public License as published by | ||
7 | * the Free Software Foundation, Inc., 53 Temple Place Ste 330, | ||
8 | * Bostom MA 02111-1307, USA; either version 2 of the License, or | ||
9 | * (at your option) any later version; incorporated herein by reference. | ||
10 | * | ||
11 | * ----------------------------------------------------------------------- */ | ||
12 | |||
13 | #ifndef LINUX_RAID_RAID6_H | ||
14 | #define LINUX_RAID_RAID6_H | ||
15 | |||
16 | #ifdef __KERNEL__ | ||
17 | |||
18 | /* Set to 1 to use kernel-wide empty_zero_page */ | ||
19 | #define RAID6_USE_EMPTY_ZERO_PAGE 0 | ||
20 | |||
21 | #include <linux/raid/md.h> | ||
22 | #include <linux/raid/raid5.h> | ||
23 | |||
24 | typedef raid5_conf_t raid6_conf_t; /* Same configuration */ | ||
25 | |||
26 | /* Additional compute_parity mode -- updates the parity w/o LOCKING */ | ||
27 | #define UPDATE_PARITY 4 | ||
28 | |||
29 | /* We need a pre-zeroed page... if we don't want to use the kernel-provided | ||
30 | one define it here */ | ||
31 | #if RAID6_USE_EMPTY_ZERO_PAGE | ||
32 | # define raid6_empty_zero_page empty_zero_page | ||
33 | #else | ||
34 | extern const char raid6_empty_zero_page[PAGE_SIZE]; | ||
35 | #endif | ||
36 | |||
37 | #else /* ! __KERNEL__ */ | ||
38 | /* Used for testing in user space */ | ||
39 | |||
40 | #include <errno.h> | ||
41 | #include <inttypes.h> | ||
42 | #include <limits.h> | ||
43 | #include <stddef.h> | ||
44 | #include <sys/mman.h> | ||
45 | #include <sys/types.h> | ||
46 | |||
47 | /* Not standard, but glibc defines it */ | ||
48 | #define BITS_PER_LONG __WORDSIZE | ||
49 | |||
50 | typedef uint8_t u8; | ||
51 | typedef uint16_t u16; | ||
52 | typedef uint32_t u32; | ||
53 | typedef uint64_t u64; | ||
54 | |||
55 | #ifndef PAGE_SIZE | ||
56 | # define PAGE_SIZE 4096 | ||
57 | #endif | ||
58 | extern const char raid6_empty_zero_page[PAGE_SIZE]; | ||
59 | |||
60 | #define __init | ||
61 | #define __exit | ||
62 | #define __attribute_const__ __attribute__((const)) | ||
63 | #define noinline __attribute__((noinline)) | ||
64 | |||
65 | #define preempt_enable() | ||
66 | #define preempt_disable() | ||
67 | #define cpu_has_feature(x) 1 | ||
68 | #define enable_kernel_altivec() | ||
69 | #define disable_kernel_altivec() | ||
70 | |||
71 | #endif /* __KERNEL__ */ | ||
72 | |||
73 | /* Routine choices: one selectable set of RAID-6 routines (raid6_algos[] holds pointers to these) */ | ||
74 | struct raid6_calls { | ||
75 | void (*gen_syndrome)(int, size_t, void **); /* Syndrome generator; arguments presumably (disks, bytes, ptrs) as in the recovery routines - TODO confirm */ | ||
76 | int (*valid)(void); /* Returns 1 if this routine set is usable; may be NULL - TODO confirm against raid6_select_algo() */ | ||
77 | const char *name; /* Name of this routine set */ | ||
78 | int prefer; /* Has special performance attribute */ | ||
79 | }; | ||
80 | |||
81 | /* Selected algorithm */ | ||
82 | extern struct raid6_calls raid6_call; | ||
83 | |||
84 | /* Algorithm list */ | ||
85 | extern const struct raid6_calls * const raid6_algos[]; | ||
86 | int raid6_select_algo(void); | ||
87 | |||
88 | /* Return values from chk_syndrome */ | ||
89 | #define RAID6_OK 0 | ||
90 | #define RAID6_P_BAD 1 | ||
91 | #define RAID6_Q_BAD 2 | ||
92 | #define RAID6_PQ_BAD 3 | ||
93 | |||
94 | /* Galois field tables */ | ||
95 | extern const u8 raid6_gfmul[256][256] __attribute__((aligned(256))); | ||
96 | extern const u8 raid6_gfexp[256] __attribute__((aligned(256))); | ||
97 | extern const u8 raid6_gfinv[256] __attribute__((aligned(256))); | ||
98 | extern const u8 raid6_gfexi[256] __attribute__((aligned(256))); | ||
99 | |||
100 | /* Recovery routines */ | ||
101 | void raid6_2data_recov(int disks, size_t bytes, int faila, int failb, void **ptrs); | ||
102 | void raid6_datap_recov(int disks, size_t bytes, int faila, void **ptrs); | ||
103 | void raid6_dual_recov(int disks, size_t bytes, int faila, int failb, void **ptrs); | ||
104 | |||
105 | /* Some definitions to allow code to be compiled for testing in userspace */ | ||
106 | #ifndef __KERNEL__ | ||
107 | |||
108 | # define jiffies raid6_jiffies() | ||
109 | # define printk printf | ||
110 | # define GFP_KERNEL 0 | ||
111 | # define __get_free_pages(x,y) ((unsigned long)mmap(NULL, PAGE_SIZE << (y), PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, 0, 0)) | ||
112 | # define free_pages(x,y) munmap((void *)(x), (y)*PAGE_SIZE) | ||
113 | |||
/* Userspace stand-in for cpu_relax(): deliberately a no-op. */
static inline void cpu_relax(void)
{
	return;
}
118 | |||
119 | #undef HZ | ||
120 | #define HZ 1000 | ||
/*
 * Userspace replacement for the kernel tick counter: the current
 * gettimeofday() time in milliseconds (matching HZ == 1000), truncated
 * to 32 bits.
 *
 * The arithmetic is carried out in uint32_t so that the wrap-around is
 * well-defined; multiplying tv_sec directly is a signed multiplication
 * that overflows where time_t/long is 32 bits wide.  The result is
 * unchanged modulo 2^32.
 *
 * NOTE(review): gettimeofday() is declared in <sys/time.h>, which is not
 * among the headers included in this file's userspace branch - confirm it
 * is pulled in by the including source file.
 */
static inline uint32_t raid6_jiffies(void)
{
	struct timeval tv;

	gettimeofday(&tv, NULL);
	return (uint32_t)tv.tv_sec * 1000u + (uint32_t)(tv.tv_usec / 1000);
}
127 | |||
128 | #endif /* ! __KERNEL__ */ | ||
129 | |||
130 | #endif /* LINUX_RAID_RAID6_H */ | ||
diff --git a/drivers/md/raid6algos.c b/drivers/md/raid6algos.c index 21987e3dbe6c..866215ac7f25 100644 --- a/drivers/md/raid6algos.c +++ b/drivers/md/raid6algos.c | |||
@@ -5,7 +5,7 @@ | |||
5 | * This program is free software; you can redistribute it and/or modify | 5 | * This program is free software; you can redistribute it and/or modify |
6 | * it under the terms of the GNU General Public License as published by | 6 | * it under the terms of the GNU General Public License as published by |
7 | * the Free Software Foundation, Inc., 53 Temple Place Ste 330, | 7 | * the Free Software Foundation, Inc., 53 Temple Place Ste 330, |
8 | * Bostom MA 02111-1307, USA; either version 2 of the License, or | 8 | * Boston MA 02111-1307, USA; either version 2 of the License, or |
9 | * (at your option) any later version; incorporated herein by reference. | 9 | * (at your option) any later version; incorporated herein by reference. |
10 | * | 10 | * |
11 | * ----------------------------------------------------------------------- */ | 11 | * ----------------------------------------------------------------------- */ |
@@ -16,13 +16,20 @@ | |||
16 | * Algorithm list and algorithm selection for RAID-6 | 16 | * Algorithm list and algorithm selection for RAID-6 |
17 | */ | 17 | */ |
18 | 18 | ||
19 | #include "raid6.h" | 19 | #include <linux/raid/pq.h> |
20 | #ifndef __KERNEL__ | 20 | #ifndef __KERNEL__ |
21 | #include <sys/mman.h> | 21 | #include <sys/mman.h> |
22 | #include <stdio.h> | 22 | #include <stdio.h> |
23 | #else | ||
24 | #if !RAID6_USE_EMPTY_ZERO_PAGE | ||
25 | /* In .bss so it's zeroed */ | ||
26 | const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256))); | ||
27 | EXPORT_SYMBOL(raid6_empty_zero_page); | ||
28 | #endif | ||
23 | #endif | 29 | #endif |
24 | 30 | ||
25 | struct raid6_calls raid6_call; | 31 | struct raid6_calls raid6_call; |
32 | EXPORT_SYMBOL_GPL(raid6_call); | ||
26 | 33 | ||
27 | /* Various routine sets */ | 34 | /* Various routine sets */ |
28 | extern const struct raid6_calls raid6_intx1; | 35 | extern const struct raid6_calls raid6_intx1; |
@@ -79,6 +86,7 @@ const struct raid6_calls * const raid6_algos[] = { | |||
79 | #else | 86 | #else |
80 | /* Need more time to be stable in userspace */ | 87 | /* Need more time to be stable in userspace */ |
81 | #define RAID6_TIME_JIFFIES_LG2 9 | 88 | #define RAID6_TIME_JIFFIES_LG2 9 |
89 | #define time_before(x, y) ((x) < (y)) | ||
82 | #endif | 90 | #endif |
83 | 91 | ||
84 | /* Try to pick the best algorithm */ | 92 | /* Try to pick the best algorithm */ |
@@ -152,3 +160,12 @@ int __init raid6_select_algo(void) | |||
152 | 160 | ||
153 | return best ? 0 : -EINVAL; | 161 | return best ? 0 : -EINVAL; |
154 | } | 162 | } |
163 | |||
/* Exit hook registered via module_exit(); intentionally does nothing. */
static void raid6_exit(void)
{
}
168 | |||
169 | subsys_initcall(raid6_select_algo); | ||
170 | module_exit(raid6_exit); | ||
171 | MODULE_LICENSE("GPL"); | ||
diff --git a/drivers/md/raid6altivec.uc b/drivers/md/raid6altivec.uc index b9afd35b8812..699dfeee4944 100644 --- a/drivers/md/raid6altivec.uc +++ b/drivers/md/raid6altivec.uc | |||
@@ -5,7 +5,7 @@ | |||
5 | * This program is free software; you can redistribute it and/or modify | 5 | * This program is free software; you can redistribute it and/or modify |
6 | * it under the terms of the GNU General Public License as published by | 6 | * it under the terms of the GNU General Public License as published by |
7 | * the Free Software Foundation, Inc., 53 Temple Place Ste 330, | 7 | * the Free Software Foundation, Inc., 53 Temple Place Ste 330, |
8 | * Bostom MA 02111-1307, USA; either version 2 of the License, or | 8 | * Boston MA 02111-1307, USA; either version 2 of the License, or |
9 | * (at your option) any later version; incorporated herein by reference. | 9 | * (at your option) any later version; incorporated herein by reference. |
10 | * | 10 | * |
11 | * ----------------------------------------------------------------------- */ | 11 | * ----------------------------------------------------------------------- */ |
@@ -22,7 +22,7 @@ | |||
22 | * bracked this with preempt_disable/enable or in a lock) | 22 | * bracked this with preempt_disable/enable or in a lock) |
23 | */ | 23 | */ |
24 | 24 | ||
25 | #include "raid6.h" | 25 | #include <linux/raid/pq.h> |
26 | 26 | ||
27 | #ifdef CONFIG_ALTIVEC | 27 | #ifdef CONFIG_ALTIVEC |
28 | 28 | ||
diff --git a/drivers/md/raid6int.uc b/drivers/md/raid6int.uc index ad004cee0e26..f9bf9cba357f 100644 --- a/drivers/md/raid6int.uc +++ b/drivers/md/raid6int.uc | |||
@@ -5,7 +5,7 @@ | |||
5 | * This program is free software; you can redistribute it and/or modify | 5 | * This program is free software; you can redistribute it and/or modify |
6 | * it under the terms of the GNU General Public License as published by | 6 | * it under the terms of the GNU General Public License as published by |
7 | * the Free Software Foundation, Inc., 53 Temple Place Ste 330, | 7 | * the Free Software Foundation, Inc., 53 Temple Place Ste 330, |
8 | * Bostom MA 02111-1307, USA; either version 2 of the License, or | 8 | * Boston MA 02111-1307, USA; either version 2 of the License, or |
9 | * (at your option) any later version; incorporated herein by reference. | 9 | * (at your option) any later version; incorporated herein by reference. |
10 | * | 10 | * |
11 | * ----------------------------------------------------------------------- */ | 11 | * ----------------------------------------------------------------------- */ |
@@ -18,7 +18,7 @@ | |||
18 | * This file is postprocessed using unroll.pl | 18 | * This file is postprocessed using unroll.pl |
19 | */ | 19 | */ |
20 | 20 | ||
21 | #include "raid6.h" | 21 | #include <linux/raid/pq.h> |
22 | 22 | ||
23 | /* | 23 | /* |
24 | * This is the C data type to use | 24 | * This is the C data type to use |
diff --git a/drivers/md/raid6mmx.c b/drivers/md/raid6mmx.c index d4e4a1bd70ad..e7f6c13132bf 100644 --- a/drivers/md/raid6mmx.c +++ b/drivers/md/raid6mmx.c | |||
@@ -5,7 +5,7 @@ | |||
5 | * This program is free software; you can redistribute it and/or modify | 5 | * This program is free software; you can redistribute it and/or modify |
6 | * it under the terms of the GNU General Public License as published by | 6 | * it under the terms of the GNU General Public License as published by |
7 | * the Free Software Foundation, Inc., 53 Temple Place Ste 330, | 7 | * the Free Software Foundation, Inc., 53 Temple Place Ste 330, |
8 | * Bostom MA 02111-1307, USA; either version 2 of the License, or | 8 | * Boston MA 02111-1307, USA; either version 2 of the License, or |
9 | * (at your option) any later version; incorporated herein by reference. | 9 | * (at your option) any later version; incorporated herein by reference. |
10 | * | 10 | * |
11 | * ----------------------------------------------------------------------- */ | 11 | * ----------------------------------------------------------------------- */ |
@@ -18,7 +18,7 @@ | |||
18 | 18 | ||
19 | #if defined(__i386__) && !defined(__arch_um__) | 19 | #if defined(__i386__) && !defined(__arch_um__) |
20 | 20 | ||
21 | #include "raid6.h" | 21 | #include <linux/raid/pq.h> |
22 | #include "raid6x86.h" | 22 | #include "raid6x86.h" |
23 | 23 | ||
24 | /* Shared with raid6sse1.c */ | 24 | /* Shared with raid6sse1.c */ |
diff --git a/drivers/md/raid6recov.c b/drivers/md/raid6recov.c index a8c4d9451bd9..2609f00e0d61 100644 --- a/drivers/md/raid6recov.c +++ b/drivers/md/raid6recov.c | |||
@@ -5,7 +5,7 @@ | |||
5 | * This program is free software; you can redistribute it and/or modify | 5 | * This program is free software; you can redistribute it and/or modify |
6 | * it under the terms of the GNU General Public License as published by | 6 | * it under the terms of the GNU General Public License as published by |
7 | * the Free Software Foundation, Inc., 53 Temple Place Ste 330, | 7 | * the Free Software Foundation, Inc., 53 Temple Place Ste 330, |
8 | * Bostom MA 02111-1307, USA; either version 2 of the License, or | 8 | * Boston MA 02111-1307, USA; either version 2 of the License, or |
9 | * (at your option) any later version; incorporated herein by reference. | 9 | * (at your option) any later version; incorporated herein by reference. |
10 | * | 10 | * |
11 | * ----------------------------------------------------------------------- */ | 11 | * ----------------------------------------------------------------------- */ |
@@ -18,7 +18,7 @@ | |||
18 | * the syndrome.) | 18 | * the syndrome.) |
19 | */ | 19 | */ |
20 | 20 | ||
21 | #include "raid6.h" | 21 | #include <linux/raid/pq.h> |
22 | 22 | ||
23 | /* Recover two failed data blocks. */ | 23 | /* Recover two failed data blocks. */ |
24 | void raid6_2data_recov(int disks, size_t bytes, int faila, int failb, | 24 | void raid6_2data_recov(int disks, size_t bytes, int faila, int failb, |
@@ -63,9 +63,7 @@ void raid6_2data_recov(int disks, size_t bytes, int faila, int failb, | |||
63 | p++; q++; | 63 | p++; q++; |
64 | } | 64 | } |
65 | } | 65 | } |
66 | 66 | EXPORT_SYMBOL_GPL(raid6_2data_recov); | |
67 | |||
68 | |||
69 | 67 | ||
70 | /* Recover failure of one data block plus the P block */ | 68 | /* Recover failure of one data block plus the P block */ |
71 | void raid6_datap_recov(int disks, size_t bytes, int faila, void **ptrs) | 69 | void raid6_datap_recov(int disks, size_t bytes, int faila, void **ptrs) |
@@ -97,9 +95,10 @@ void raid6_datap_recov(int disks, size_t bytes, int faila, void **ptrs) | |||
97 | q++; dq++; | 95 | q++; dq++; |
98 | } | 96 | } |
99 | } | 97 | } |
98 | EXPORT_SYMBOL_GPL(raid6_datap_recov); | ||
100 | 99 | ||
101 | 100 | #ifndef __KERNEL__ | |
102 | #ifndef __KERNEL__ /* Testing only */ | 101 | /* Testing only */ |
103 | 102 | ||
104 | /* Recover two failed blocks. */ | 103 | /* Recover two failed blocks. */ |
105 | void raid6_dual_recov(int disks, size_t bytes, int faila, int failb, void **ptrs) | 104 | void raid6_dual_recov(int disks, size_t bytes, int faila, int failb, void **ptrs) |
diff --git a/drivers/md/raid6sse1.c b/drivers/md/raid6sse1.c index 0666237276ff..b274dd5eab8f 100644 --- a/drivers/md/raid6sse1.c +++ b/drivers/md/raid6sse1.c | |||
@@ -5,7 +5,7 @@ | |||
5 | * This program is free software; you can redistribute it and/or modify | 5 | * This program is free software; you can redistribute it and/or modify |
6 | * it under the terms of the GNU General Public License as published by | 6 | * it under the terms of the GNU General Public License as published by |
7 | * the Free Software Foundation, Inc., 53 Temple Place Ste 330, | 7 | * the Free Software Foundation, Inc., 53 Temple Place Ste 330, |
8 | * Bostom MA 02111-1307, USA; either version 2 of the License, or | 8 | * Boston MA 02111-1307, USA; either version 2 of the License, or |
9 | * (at your option) any later version; incorporated herein by reference. | 9 | * (at your option) any later version; incorporated herein by reference. |
10 | * | 10 | * |
11 | * ----------------------------------------------------------------------- */ | 11 | * ----------------------------------------------------------------------- */ |
@@ -23,7 +23,7 @@ | |||
23 | 23 | ||
24 | #if defined(__i386__) && !defined(__arch_um__) | 24 | #if defined(__i386__) && !defined(__arch_um__) |
25 | 25 | ||
26 | #include "raid6.h" | 26 | #include <linux/raid/pq.h> |
27 | #include "raid6x86.h" | 27 | #include "raid6x86.h" |
28 | 28 | ||
29 | /* Defined in raid6mmx.c */ | 29 | /* Defined in raid6mmx.c */ |
diff --git a/drivers/md/raid6sse2.c b/drivers/md/raid6sse2.c index b034ad868039..6ed6c6c0389f 100644 --- a/drivers/md/raid6sse2.c +++ b/drivers/md/raid6sse2.c | |||
@@ -5,7 +5,7 @@ | |||
5 | * This program is free software; you can redistribute it and/or modify | 5 | * This program is free software; you can redistribute it and/or modify |
6 | * it under the terms of the GNU General Public License as published by | 6 | * it under the terms of the GNU General Public License as published by |
7 | * the Free Software Foundation, Inc., 53 Temple Place Ste 330, | 7 | * the Free Software Foundation, Inc., 53 Temple Place Ste 330, |
8 | * Bostom MA 02111-1307, USA; either version 2 of the License, or | 8 | * Boston MA 02111-1307, USA; either version 2 of the License, or |
9 | * (at your option) any later version; incorporated herein by reference. | 9 | * (at your option) any later version; incorporated herein by reference. |
10 | * | 10 | * |
11 | * ----------------------------------------------------------------------- */ | 11 | * ----------------------------------------------------------------------- */ |
@@ -19,7 +19,7 @@ | |||
19 | 19 | ||
20 | #if (defined(__i386__) || defined(__x86_64__)) && !defined(__arch_um__) | 20 | #if (defined(__i386__) || defined(__x86_64__)) && !defined(__arch_um__) |
21 | 21 | ||
22 | #include "raid6.h" | 22 | #include <linux/raid/pq.h> |
23 | #include "raid6x86.h" | 23 | #include "raid6x86.h" |
24 | 24 | ||
25 | static const struct raid6_sse_constants { | 25 | static const struct raid6_sse_constants { |
diff --git a/drivers/md/raid6test/Makefile b/drivers/md/raid6test/Makefile index 78e0396adf2a..58ffdf4f5161 100644 --- a/drivers/md/raid6test/Makefile +++ b/drivers/md/raid6test/Makefile | |||
@@ -5,7 +5,7 @@ | |||
5 | 5 | ||
6 | CC = gcc | 6 | CC = gcc |
7 | OPTFLAGS = -O2 # Adjust as desired | 7 | OPTFLAGS = -O2 # Adjust as desired |
8 | CFLAGS = -I.. -g $(OPTFLAGS) | 8 | CFLAGS = -I.. -I ../../../include -g $(OPTFLAGS) |
9 | LD = ld | 9 | LD = ld |
10 | PERL = perl | 10 | PERL = perl |
11 | AR = ar | 11 | AR = ar |
diff --git a/drivers/md/raid6test/test.c b/drivers/md/raid6test/test.c index 559cc41b2585..7a930318b17d 100644 --- a/drivers/md/raid6test/test.c +++ b/drivers/md/raid6test/test.c | |||
@@ -17,7 +17,7 @@ | |||
17 | #include <stdlib.h> | 17 | #include <stdlib.h> |
18 | #include <stdio.h> | 18 | #include <stdio.h> |
19 | #include <string.h> | 19 | #include <string.h> |
20 | #include "raid6.h" | 20 | #include <linux/raid/pq.h> |
21 | 21 | ||
22 | #define NDISKS 16 /* Including P and Q */ | 22 | #define NDISKS 16 /* Including P and Q */ |
23 | 23 | ||
diff --git a/drivers/md/raid6x86.h b/drivers/md/raid6x86.h index 99fea7a70ca7..4c22c1568558 100644 --- a/drivers/md/raid6x86.h +++ b/drivers/md/raid6x86.h | |||
@@ -5,7 +5,7 @@ | |||
5 | * This program is free software; you can redistribute it and/or modify | 5 | * This program is free software; you can redistribute it and/or modify |
6 | * it under the terms of the GNU General Public License as published by | 6 | * it under the terms of the GNU General Public License as published by |
7 | * the Free Software Foundation, Inc., 53 Temple Place Ste 330, | 7 | * the Free Software Foundation, Inc., 53 Temple Place Ste 330, |
8 | * Bostom MA 02111-1307, USA; either version 2 of the License, or | 8 | * Boston MA 02111-1307, USA; either version 2 of the License, or |
9 | * (at your option) any later version; incorporated herein by reference. | 9 | * (at your option) any later version; incorporated herein by reference. |
10 | * | 10 | * |
11 | * ----------------------------------------------------------------------- */ | 11 | * ----------------------------------------------------------------------- */ |