author     Linus Torvalds <torvalds@linux-foundation.org>   2008-07-21 13:29:12 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>   2008-07-21 13:29:12 -0400
commit     8a392625b665c676a77c62f8608d10ff430bcb83 (patch)
tree       4000a65d61baed73200e47f91dea5263ed16edd0
parent     519f0141f1c42e2b8b59c7dea005cbf6095358e8 (diff)
parent     4b80991c6cb9efa607bc4fd6f3ecdf5511c31bb0 (diff)
Merge branch 'for-linus' of git://neil.brown.name/md
* 'for-linus' of git://neil.brown.name/md: (52 commits)
  md: Protect access to mddev->disks list using RCU
  md: only count actual openers as access which prevent a 'stop'
  md: linear: Make array_size sector-based and rename it to array_sectors.
  md: Make mddev->array_size sector-based.
  md: Make super_type->rdev_size_change() take sector-based sizes.
  md: Fix check for overlapping devices.
  md: Tidy up rdev_size_store a bit:
  md: Remove some unused macros.
  md: Turn rdev->sb_offset into a sector-based quantity.
  md: Make calc_dev_sboffset() return a sector count.
  md: Replace calc_dev_size() by calc_num_sectors().
  md: Make update_size() take the number of sectors.
  md: Better control of when do_md_stop is allowed to stop the array.
  md: get_disk_info(): Don't convert between signed and unsigned and back.
  md: Simplify restart_array().
  md: alloc_disk_sb(): Return proper error value.
  md: Simplify sb_equal().
  md: Simplify uuid_equal().
  md: sb_equal(): Fix misleading printk.
  md: Fix a typo in the comment to cmd_match().
  ...
-rw-r--r--  Documentation/md.txt         |  30
-rw-r--r--  drivers/md/bitmap.c          |  54
-rw-r--r--  drivers/md/faulty.c          |   2
-rw-r--r--  drivers/md/linear.c          |  20
-rw-r--r--  drivers/md/md.c              | 615
-rw-r--r--  drivers/md/multipath.c       |  17
-rw-r--r--  drivers/md/raid0.c           |   8
-rw-r--r--  drivers/md/raid1.c           |  30
-rw-r--r--  drivers/md/raid10.c          |  22
-rw-r--r--  drivers/md/raid5.c           | 745
-rw-r--r--  include/linux/raid/bitmap.h  |   1
-rw-r--r--  include/linux/raid/linear.h  |   2
-rw-r--r--  include/linux/raid/md.h      |   2
-rw-r--r--  include/linux/raid/md_k.h    |  17
-rw-r--r--  include/linux/raid/md_p.h    |   3
-rw-r--r--  include/linux/raid/raid5.h   |  64
16 files changed, 842 insertions(+), 790 deletions(-)
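
Much of this series replaces fields counted in 1K blocks (mddev->size, mddev->array_size, rdev->sb_offset) with fields counted in 512-byte sectors (mddev->array_sectors, rdev->sb_start), which is why the hunks below are full of "* 2", "/ 2" and "<< 1" conversions. A stand-alone C illustration of the two unit conventions; the numbers are arbitrary and only the field names mirror the kernel:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t size_kb = 1465138496ULL;     /* e.g. mddev->size, in 1K blocks */
	uint64_t array_sectors = size_kb * 2; /* new convention: 512-byte sectors */

	/* old code: set_capacity(disk, mddev->array_size << 1);
	 * new code: set_capacity(disk, mddev->array_sectors);
	 * both hand the same number of 512-byte sectors to the block layer. */
	printf("KiB:     %llu\n", (unsigned long long)size_kb);
	printf("sectors: %llu\n", (unsigned long long)array_sectors);
	printf("bytes:   %llu\n", (unsigned long long)(array_sectors * 512));
	return 0;
}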
diff --git a/Documentation/md.txt b/Documentation/md.txt
index a8b430627473..1da9d1b1793f 100644
--- a/Documentation/md.txt
+++ b/Documentation/md.txt
@@ -236,6 +236,11 @@ All md devices contain:
      writing the word for the desired state, however some states
      cannot be explicitly set, and some transitions are not allowed.
 
+     Select/poll works on this file. All changes except between
+     active_idle and active (which can be frequent and are not
+     very interesting) are notified. active->active_idle is
+     reported if the metadata is externally managed.
+
    clear
      No devices, no size, no level
      Writing is equivalent to STOP_ARRAY ioctl
@@ -292,6 +297,10 @@ Each directory contains:
      writemostly - device will only be subject to read
 		requests if there are no other options.
 		This applies only to raid1 arrays.
+     blocked     - device has failed, metadata is "external",
+		and the failure hasn't been acknowledged yet.
+		Writes that would write to this device if
+		it were not faulty are blocked.
      spare - device is working, but not a full member.
 		This includes spares that are in the process
 		of being recovered to
@@ -301,6 +310,12 @@ Each directory contains:
 	Writing "remove" removes the device from the array.
 	Writing "writemostly" sets the writemostly flag.
 	Writing "-writemostly" clears the writemostly flag.
+	Writing "blocked" sets the "blocked" flag.
+	Writing "-blocked" clear the "blocked" flag and allows writes
+	to complete.
+
+	This file responds to select/poll. Any change to 'faulty'
+	or 'blocked' causes an event.
 
    errors
 	An approximate count of read errors that have been detected on
@@ -332,7 +347,7 @@ Each directory contains:
 	for storage of data. This will normally be the same as the
 	component_size. This can be written while assembling an
 	array. If a value less than the current component_size is
-	written, component_size will be reduced to this value.
+	written, it will be rejected.
 
 
 An active md device will also contain and entry for each active device
@@ -381,6 +396,19 @@ also have
 	'check' and 'repair' will start the appropriate process
 	providing the current state is 'idle'.
 
+	This file responds to select/poll. Any important change in the value
+	triggers a poll event. Sometimes the value will briefly be
+	"recover" if a recovery seems to be needed, but cannot be
+	achieved. In that case, the transition to "recover" isn't
+	notified, but the transition away is.
+
+   degraded
+	This contains a count of the number of devices by which the
+	arrays is degraded. So an optimal array with show '0'. A
+	single failed/missing drive will show '1', etc.
+	This file responds to select/poll, any increase or decrease
+	in the count of missing devices will trigger an event.
+
    mismatch_count
 	When performing 'check' and 'repair', and possibly when
 	performing 'resync', md will count the number of errors that are
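
The new "responds to select/poll" wording above follows the usual sysfs notification pattern: read the attribute once to arm the notification, poll for POLLPRI|POLLERR, then seek back to offset 0 and re-read when the event fires. A minimal userspace sketch; the md0 path is only an example and error handling is kept to a bare minimum:

#include <fcntl.h>
#include <poll.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[64];
	int fd = open("/sys/block/md0/md/array_state", O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	for (;;) {
		/* reading (from offset 0) both reports the current value
		 * and re-arms the sysfs notification */
		ssize_t n = pread(fd, buf, sizeof(buf) - 1, 0);
		if (n < 0)
			break;
		buf[n] = '\0';
		printf("array_state: %s", buf);

		struct pollfd pfd = { .fd = fd, .events = POLLPRI | POLLERR };
		if (poll(&pfd, 1, -1) < 0)   /* sysfs_notify() wakes us here */
			break;
	}
	close(fd);
	return 0;
}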
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index b26927ce889c..621a272a2c74 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -225,7 +225,7 @@ static struct page *read_sb_page(mddev_t *mddev, long offset, unsigned long inde
 		    || test_bit(Faulty, &rdev->flags))
 			continue;
 
-		target = (rdev->sb_offset << 1) + offset + index * (PAGE_SIZE/512);
+		target = rdev->sb_start + offset + index * (PAGE_SIZE/512);
 
 		if (sync_page_io(rdev->bdev, target, PAGE_SIZE, page, READ)) {
 			page->index = index;
@@ -241,10 +241,10 @@ static struct page *read_sb_page(mddev_t *mddev, long offset, unsigned long inde
 static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
 {
 	mdk_rdev_t *rdev;
-	struct list_head *tmp;
 	mddev_t *mddev = bitmap->mddev;
 
-	rdev_for_each(rdev, tmp, mddev)
+	rcu_read_lock();
+	rdev_for_each_rcu(rdev, mddev)
 		if (test_bit(In_sync, &rdev->flags)
 		    && !test_bit(Faulty, &rdev->flags)) {
 			int size = PAGE_SIZE;
@@ -260,32 +260,37 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
 				    + (long)(page->index * (PAGE_SIZE/512))
 				    + size/512 > 0)
 					/* bitmap runs in to metadata */
-					return -EINVAL;
+					goto bad_alignment;
 				if (rdev->data_offset + mddev->size*2
-				    > rdev->sb_offset*2 + bitmap->offset)
+				    > rdev->sb_start + bitmap->offset)
 					/* data runs in to bitmap */
-					return -EINVAL;
-			} else if (rdev->sb_offset*2 < rdev->data_offset) {
+					goto bad_alignment;
+			} else if (rdev->sb_start < rdev->data_offset) {
 				/* METADATA BITMAP DATA */
-				if (rdev->sb_offset*2
+				if (rdev->sb_start
 				    + bitmap->offset
 				    + page->index*(PAGE_SIZE/512) + size/512
 				    > rdev->data_offset)
 					/* bitmap runs in to data */
-					return -EINVAL;
+					goto bad_alignment;
 			} else {
 				/* DATA METADATA BITMAP - no problems */
 			}
 			md_super_write(mddev, rdev,
-				       (rdev->sb_offset<<1) + bitmap->offset
+				       rdev->sb_start + bitmap->offset
 				       + page->index * (PAGE_SIZE/512),
 				       size,
 				       page);
 		}
+	rcu_read_unlock();
 
 	if (wait)
 		md_super_wait(mddev);
 	return 0;
+
+ bad_alignment:
+	rcu_read_unlock();
+	return -EINVAL;
 }
 
 static void bitmap_file_kick(struct bitmap *bitmap);
@@ -454,8 +459,11 @@ void bitmap_update_sb(struct bitmap *bitmap)
 	spin_unlock_irqrestore(&bitmap->lock, flags);
 	sb = (bitmap_super_t *)kmap_atomic(bitmap->sb_page, KM_USER0);
 	sb->events = cpu_to_le64(bitmap->mddev->events);
-	if (!bitmap->mddev->degraded)
-		sb->events_cleared = cpu_to_le64(bitmap->mddev->events);
+	if (bitmap->mddev->events < bitmap->events_cleared) {
+		/* rocking back to read-only */
+		bitmap->events_cleared = bitmap->mddev->events;
+		sb->events_cleared = cpu_to_le64(bitmap->events_cleared);
+	}
 	kunmap_atomic(sb, KM_USER0);
 	write_page(bitmap, bitmap->sb_page, 1);
 }
@@ -1085,9 +1093,19 @@ void bitmap_daemon_work(struct bitmap *bitmap)
 			} else
 				spin_unlock_irqrestore(&bitmap->lock, flags);
 			lastpage = page;
-/*
-			printk("bitmap clean at page %lu\n", j);
-*/
+
+			/* We are possibly going to clear some bits, so make
+			 * sure that events_cleared is up-to-date.
+			 */
+			if (bitmap->need_sync) {
+				bitmap_super_t *sb;
+				bitmap->need_sync = 0;
+				sb = kmap_atomic(bitmap->sb_page, KM_USER0);
+				sb->events_cleared =
+					cpu_to_le64(bitmap->events_cleared);
+				kunmap_atomic(sb, KM_USER0);
+				write_page(bitmap, bitmap->sb_page, 1);
+			}
 			spin_lock_irqsave(&bitmap->lock, flags);
 			clear_page_attr(bitmap, page, BITMAP_PAGE_CLEAN);
 		}
@@ -1257,6 +1275,12 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto
 		return;
 	}
 
+	if (success &&
+	    bitmap->events_cleared < bitmap->mddev->events) {
+		bitmap->events_cleared = bitmap->mddev->events;
+		bitmap->need_sync = 1;
+	}
+
 	if (!success && ! (*bmc & NEEDED_MASK))
 		*bmc |= NEEDED_MASK;
 
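
The events_cleared/need_sync pair introduced above encodes an ordering rule: bitmap bits may only be cleared once the superblock's events_cleared field has been brought up to date on disk, so a crash cannot leave bits cleared for events the metadata never recorded. A toy userspace model of that handshake; the field names follow the kernel but the program is purely illustrative:

#include <stdio.h>

struct bm {
	unsigned long long mddev_events;      /* array event counter */
	unsigned long long events_cleared;    /* in-memory value */
	unsigned long long sb_events_cleared; /* value already on disk */
	int need_sync;
};

/* bitmap_endwrite(): a successful write lets events_cleared advance,
 * but only marks the superblock as needing an update. */
static void endwrite_success(struct bm *b)
{
	if (b->events_cleared < b->mddev_events) {
		b->events_cleared = b->mddev_events;
		b->need_sync = 1;
	}
}

/* bitmap_daemon_work(): flush events_cleared to the superblock
 * before any bit is actually cleared. */
static void daemon_before_clearing_bits(struct bm *b)
{
	if (b->need_sync) {
		b->need_sync = 0;
		b->sb_events_cleared = b->events_cleared; /* write_page() */
	}
	/* ... only now is clearing BITMAP_PAGE_CLEAN pages safe ... */
}

int main(void)
{
	struct bm b = { .mddev_events = 42, .events_cleared = 40 };
	endwrite_success(&b);
	daemon_before_clearing_bits(&b);
	printf("events_cleared=%llu, on disk=%llu\n",
	       b.events_cleared, b.sb_events_cleared);
	return 0;
}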
diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c
index d107ddceefcd..268547dbfbd3 100644
--- a/drivers/md/faulty.c
+++ b/drivers/md/faulty.c
@@ -297,7 +297,7 @@ static int run(mddev_t *mddev)
 	rdev_for_each(rdev, tmp, mddev)
 		conf->rdev = rdev;
 
-	mddev->array_size = mddev->size;
+	mddev->array_sectors = mddev->size * 2;
 	mddev->private = conf;
 
 	reconfig(mddev, mddev->layout, -1);
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 6a866d7c8ae5..b1eebf88c209 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -122,13 +122,13 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
 		return NULL;
 
 	cnt = 0;
-	conf->array_size = 0;
+	conf->array_sectors = 0;
 
 	rdev_for_each(rdev, tmp, mddev) {
 		int j = rdev->raid_disk;
 		dev_info_t *disk = conf->disks + j;
 
-		if (j < 0 || j > raid_disks || disk->rdev) {
+		if (j < 0 || j >= raid_disks || disk->rdev) {
 			printk("linear: disk numbering problem. Aborting!\n");
 			goto out;
 		}
@@ -146,7 +146,7 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
 			blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
 
 		disk->size = rdev->size;
-		conf->array_size += rdev->size;
+		conf->array_sectors += rdev->size * 2;
 
 		cnt++;
 	}
@@ -155,7 +155,7 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
 		goto out;
 	}
 
-	min_spacing = conf->array_size;
+	min_spacing = conf->array_sectors / 2;
 	sector_div(min_spacing, PAGE_SIZE/sizeof(struct dev_info *));
 
 	/* min_spacing is the minimum spacing that will fit the hash
@@ -164,7 +164,7 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
 	 * that is larger than min_spacing as use the size of that as
 	 * the actual spacing
 	 */
-	conf->hash_spacing = conf->array_size;
+	conf->hash_spacing = conf->array_sectors / 2;
 	for (i=0; i < cnt-1 ; i++) {
 		sector_t sz = 0;
 		int j;
@@ -194,7 +194,7 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
 		unsigned round;
 		unsigned long base;
 
-		sz = conf->array_size >> conf->preshift;
+		sz = conf->array_sectors >> (conf->preshift + 1);
 		sz += 1; /* force round-up */
 		base = conf->hash_spacing >> conf->preshift;
 		round = sector_div(sz, base);
@@ -221,7 +221,7 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
 	curr_offset = 0;
 	i = 0;
 	for (curr_offset = 0;
-	     curr_offset < conf->array_size;
+	     curr_offset < conf->array_sectors / 2;
 	     curr_offset += conf->hash_spacing) {
 
 		while (i < raid_disks-1 &&
@@ -258,7 +258,7 @@ static int linear_run (mddev_t *mddev)
 	if (!conf)
 		return 1;
 	mddev->private = conf;
-	mddev->array_size = conf->array_size;
+	mddev->array_sectors = conf->array_sectors;
 
 	blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec);
 	mddev->queue->unplug_fn = linear_unplug;
@@ -292,8 +292,8 @@ static int linear_add(mddev_t *mddev, mdk_rdev_t *rdev)
 	newconf->prev = mddev_to_conf(mddev);
 	mddev->private = newconf;
 	mddev->raid_disks++;
-	mddev->array_size = newconf->array_size;
-	set_capacity(mddev->gendisk, mddev->array_size << 1);
+	mddev->array_sectors = newconf->array_sectors;
+	set_capacity(mddev->gendisk, mddev->array_sectors);
 	return 0;
 }
 
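
linear.c maps a sector to a member device through a small hash table whose granularity is hash_spacing (the preshift keeps the later sector_div within 32 bits). The table-building code is not part of this hunk, so the following is only a rough userspace sketch of the lookup idea, with made-up device sizes and a fixed spacing:

#include <stdio.h>

#define NDEV 3

/* end_sector[i] = first sector past device i (cumulative sizes) */
static const unsigned long long end_sector[NDEV] = { 1000, 1600, 4000 };

/* hash[k] = index of the first device that may contain sector k*spacing;
 * a lookup then only has to walk forward a step or two. */
static int hash[64];
static const unsigned long long spacing = 500;

static void build_hash(void)
{
	int i = 0;
	for (unsigned long long k = 0; k * spacing < end_sector[NDEV - 1]; k++) {
		while (end_sector[i] <= k * spacing)
			i++;
		hash[k] = i;
	}
}

static int which_dev(unsigned long long sector)
{
	int i = hash[sector / spacing];
	while (end_sector[i] <= sector)
		i++;
	return i;
}

int main(void)
{
	build_hash();
	printf("sector 1500 -> device %d\n", which_dev(1500)); /* device 1 */
	printf("sector 2500 -> device %d\n", which_dev(2500)); /* device 2 */
	return 0;
}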
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 2580ac1b9b0f..c2ff77ccec50 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -169,7 +169,6 @@ void md_new_event(mddev_t *mddev)
169{ 169{
170 atomic_inc(&md_event_count); 170 atomic_inc(&md_event_count);
171 wake_up(&md_event_waiters); 171 wake_up(&md_event_waiters);
172 sysfs_notify(&mddev->kobj, NULL, "sync_action");
173} 172}
174EXPORT_SYMBOL_GPL(md_new_event); 173EXPORT_SYMBOL_GPL(md_new_event);
175 174
@@ -274,10 +273,12 @@ static mddev_t * mddev_find(dev_t unit)
274 INIT_LIST_HEAD(&new->all_mddevs); 273 INIT_LIST_HEAD(&new->all_mddevs);
275 init_timer(&new->safemode_timer); 274 init_timer(&new->safemode_timer);
276 atomic_set(&new->active, 1); 275 atomic_set(&new->active, 1);
276 atomic_set(&new->openers, 0);
277 spin_lock_init(&new->write_lock); 277 spin_lock_init(&new->write_lock);
278 init_waitqueue_head(&new->sb_wait); 278 init_waitqueue_head(&new->sb_wait);
279 init_waitqueue_head(&new->recovery_wait); 279 init_waitqueue_head(&new->recovery_wait);
280 new->reshape_position = MaxSector; 280 new->reshape_position = MaxSector;
281 new->resync_min = 0;
281 new->resync_max = MaxSector; 282 new->resync_max = MaxSector;
282 new->level = LEVEL_NONE; 283 new->level = LEVEL_NONE;
283 284
@@ -347,21 +348,20 @@ static struct mdk_personality *find_pers(int level, char *clevel)
347 return NULL; 348 return NULL;
348} 349}
349 350
351/* return the offset of the super block in 512byte sectors */
350static inline sector_t calc_dev_sboffset(struct block_device *bdev) 352static inline sector_t calc_dev_sboffset(struct block_device *bdev)
351{ 353{
352 sector_t size = bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; 354 sector_t num_sectors = bdev->bd_inode->i_size / 512;
353 return MD_NEW_SIZE_BLOCKS(size); 355 return MD_NEW_SIZE_SECTORS(num_sectors);
354} 356}
355 357
356static sector_t calc_dev_size(mdk_rdev_t *rdev, unsigned chunk_size) 358static sector_t calc_num_sectors(mdk_rdev_t *rdev, unsigned chunk_size)
357{ 359{
358 sector_t size; 360 sector_t num_sectors = rdev->sb_start;
359
360 size = rdev->sb_offset;
361 361
362 if (chunk_size) 362 if (chunk_size)
363 size &= ~((sector_t)chunk_size/1024 - 1); 363 num_sectors &= ~((sector_t)chunk_size/512 - 1);
364 return size; 364 return num_sectors;
365} 365}
366 366
367static int alloc_disk_sb(mdk_rdev_t * rdev) 367static int alloc_disk_sb(mdk_rdev_t * rdev)
@@ -372,7 +372,7 @@ static int alloc_disk_sb(mdk_rdev_t * rdev)
372 rdev->sb_page = alloc_page(GFP_KERNEL); 372 rdev->sb_page = alloc_page(GFP_KERNEL);
373 if (!rdev->sb_page) { 373 if (!rdev->sb_page) {
374 printk(KERN_ALERT "md: out of memory.\n"); 374 printk(KERN_ALERT "md: out of memory.\n");
375 return -EINVAL; 375 return -ENOMEM;
376 } 376 }
377 377
378 return 0; 378 return 0;
@@ -384,7 +384,7 @@ static void free_disk_sb(mdk_rdev_t * rdev)
384 put_page(rdev->sb_page); 384 put_page(rdev->sb_page);
385 rdev->sb_loaded = 0; 385 rdev->sb_loaded = 0;
386 rdev->sb_page = NULL; 386 rdev->sb_page = NULL;
387 rdev->sb_offset = 0; 387 rdev->sb_start = 0;
388 rdev->size = 0; 388 rdev->size = 0;
389 } 389 }
390} 390}
@@ -530,7 +530,7 @@ static int read_disk_sb(mdk_rdev_t * rdev, int size)
530 return 0; 530 return 0;
531 531
532 532
533 if (!sync_page_io(rdev->bdev, rdev->sb_offset<<1, size, rdev->sb_page, READ)) 533 if (!sync_page_io(rdev->bdev, rdev->sb_start, size, rdev->sb_page, READ))
534 goto fail; 534 goto fail;
535 rdev->sb_loaded = 1; 535 rdev->sb_loaded = 1;
536 return 0; 536 return 0;
@@ -543,17 +543,12 @@ fail:
543 543
544static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2) 544static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2)
545{ 545{
546 if ( (sb1->set_uuid0 == sb2->set_uuid0) && 546 return sb1->set_uuid0 == sb2->set_uuid0 &&
547 (sb1->set_uuid1 == sb2->set_uuid1) && 547 sb1->set_uuid1 == sb2->set_uuid1 &&
548 (sb1->set_uuid2 == sb2->set_uuid2) && 548 sb1->set_uuid2 == sb2->set_uuid2 &&
549 (sb1->set_uuid3 == sb2->set_uuid3)) 549 sb1->set_uuid3 == sb2->set_uuid3;
550
551 return 1;
552
553 return 0;
554} 550}
555 551
556
557static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2) 552static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
558{ 553{
559 int ret; 554 int ret;
@@ -564,7 +559,7 @@ static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
564 559
565 if (!tmp1 || !tmp2) { 560 if (!tmp1 || !tmp2) {
566 ret = 0; 561 ret = 0;
567 printk(KERN_INFO "md.c: sb1 is not equal to sb2!\n"); 562 printk(KERN_INFO "md.c sb_equal(): failed to allocate memory!\n");
568 goto abort; 563 goto abort;
569 } 564 }
570 565
@@ -577,11 +572,7 @@ static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
577 tmp1->nr_disks = 0; 572 tmp1->nr_disks = 0;
578 tmp2->nr_disks = 0; 573 tmp2->nr_disks = 0;
579 574
580 if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4)) 575 ret = (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4) == 0);
581 ret = 0;
582 else
583 ret = 1;
584
585abort: 576abort:
586 kfree(tmp1); 577 kfree(tmp1);
587 kfree(tmp2); 578 kfree(tmp2);
@@ -658,11 +649,14 @@ static unsigned int calc_sb_csum(mdp_super_t * sb)
658 */ 649 */
659 650
660struct super_type { 651struct super_type {
661 char *name; 652 char *name;
662 struct module *owner; 653 struct module *owner;
663 int (*load_super)(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version); 654 int (*load_super)(mdk_rdev_t *rdev, mdk_rdev_t *refdev,
664 int (*validate_super)(mddev_t *mddev, mdk_rdev_t *rdev); 655 int minor_version);
665 void (*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev); 656 int (*validate_super)(mddev_t *mddev, mdk_rdev_t *rdev);
657 void (*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev);
658 unsigned long long (*rdev_size_change)(mdk_rdev_t *rdev,
659 sector_t num_sectors);
666}; 660};
667 661
668/* 662/*
@@ -673,16 +667,14 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version
673 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 667 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
674 mdp_super_t *sb; 668 mdp_super_t *sb;
675 int ret; 669 int ret;
676 sector_t sb_offset;
677 670
678 /* 671 /*
679 * Calculate the position of the superblock, 672 * Calculate the position of the superblock (512byte sectors),
680 * it's at the end of the disk. 673 * it's at the end of the disk.
681 * 674 *
682 * It also happens to be a multiple of 4Kb. 675 * It also happens to be a multiple of 4Kb.
683 */ 676 */
684 sb_offset = calc_dev_sboffset(rdev->bdev); 677 rdev->sb_start = calc_dev_sboffset(rdev->bdev);
685 rdev->sb_offset = sb_offset;
686 678
687 ret = read_disk_sb(rdev, MD_SB_BYTES); 679 ret = read_disk_sb(rdev, MD_SB_BYTES);
688 if (ret) return ret; 680 if (ret) return ret;
@@ -759,7 +751,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version
759 else 751 else
760 ret = 0; 752 ret = 0;
761 } 753 }
762 rdev->size = calc_dev_size(rdev, sb->chunk_size); 754 rdev->size = calc_num_sectors(rdev, sb->chunk_size) / 2;
763 755
764 if (rdev->size < sb->size && sb->level > 1) 756 if (rdev->size < sb->size && sb->level > 1)
765 /* "this cannot possibly happen" ... */ 757 /* "this cannot possibly happen" ... */
@@ -1004,6 +996,26 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
1004} 996}
1005 997
1006/* 998/*
999 * rdev_size_change for 0.90.0
1000 */
1001static unsigned long long
1002super_90_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
1003{
1004 if (num_sectors && num_sectors < rdev->mddev->size * 2)
1005 return 0; /* component must fit device */
1006 if (rdev->mddev->bitmap_offset)
1007 return 0; /* can't move bitmap */
1008 rdev->sb_start = calc_dev_sboffset(rdev->bdev);
1009 if (!num_sectors || num_sectors > rdev->sb_start)
1010 num_sectors = rdev->sb_start;
1011 md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
1012 rdev->sb_page);
1013 md_super_wait(rdev->mddev);
1014 return num_sectors / 2; /* kB for sysfs */
1015}
1016
1017
1018/*
1007 * version 1 superblock 1019 * version 1 superblock
1008 */ 1020 */
1009 1021
@@ -1034,12 +1046,12 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
1034{ 1046{
1035 struct mdp_superblock_1 *sb; 1047 struct mdp_superblock_1 *sb;
1036 int ret; 1048 int ret;
1037 sector_t sb_offset; 1049 sector_t sb_start;
1038 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 1050 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
1039 int bmask; 1051 int bmask;
1040 1052
1041 /* 1053 /*
1042 * Calculate the position of the superblock. 1054 * Calculate the position of the superblock in 512byte sectors.
1043 * It is always aligned to a 4K boundary and 1055 * It is always aligned to a 4K boundary and
1044 * depeding on minor_version, it can be: 1056 * depeding on minor_version, it can be:
1045 * 0: At least 8K, but less than 12K, from end of device 1057 * 0: At least 8K, but less than 12K, from end of device
@@ -1048,22 +1060,20 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
1048 */ 1060 */
1049 switch(minor_version) { 1061 switch(minor_version) {
1050 case 0: 1062 case 0:
1051 sb_offset = rdev->bdev->bd_inode->i_size >> 9; 1063 sb_start = rdev->bdev->bd_inode->i_size >> 9;
1052 sb_offset -= 8*2; 1064 sb_start -= 8*2;
1053 sb_offset &= ~(sector_t)(4*2-1); 1065 sb_start &= ~(sector_t)(4*2-1);
1054 /* convert from sectors to K */
1055 sb_offset /= 2;
1056 break; 1066 break;
1057 case 1: 1067 case 1:
1058 sb_offset = 0; 1068 sb_start = 0;
1059 break; 1069 break;
1060 case 2: 1070 case 2:
1061 sb_offset = 4; 1071 sb_start = 8;
1062 break; 1072 break;
1063 default: 1073 default:
1064 return -EINVAL; 1074 return -EINVAL;
1065 } 1075 }
1066 rdev->sb_offset = sb_offset; 1076 rdev->sb_start = sb_start;
1067 1077
1068 /* superblock is rarely larger than 1K, but it can be larger, 1078 /* superblock is rarely larger than 1K, but it can be larger,
1069 * and it is safe to read 4k, so we do that 1079 * and it is safe to read 4k, so we do that
@@ -1077,7 +1087,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
1077 if (sb->magic != cpu_to_le32(MD_SB_MAGIC) || 1087 if (sb->magic != cpu_to_le32(MD_SB_MAGIC) ||
1078 sb->major_version != cpu_to_le32(1) || 1088 sb->major_version != cpu_to_le32(1) ||
1079 le32_to_cpu(sb->max_dev) > (4096-256)/2 || 1089 le32_to_cpu(sb->max_dev) > (4096-256)/2 ||
1080 le64_to_cpu(sb->super_offset) != (rdev->sb_offset<<1) || 1090 le64_to_cpu(sb->super_offset) != rdev->sb_start ||
1081 (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0) 1091 (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0)
1082 return -EINVAL; 1092 return -EINVAL;
1083 1093
@@ -1113,7 +1123,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
1113 rdev->sb_size = (rdev->sb_size | bmask) + 1; 1123 rdev->sb_size = (rdev->sb_size | bmask) + 1;
1114 1124
1115 if (minor_version 1125 if (minor_version
1116 && rdev->data_offset < sb_offset + (rdev->sb_size/512)) 1126 && rdev->data_offset < sb_start + (rdev->sb_size/512))
1117 return -EINVAL; 1127 return -EINVAL;
1118 1128
1119 if (sb->level == cpu_to_le32(LEVEL_MULTIPATH)) 1129 if (sb->level == cpu_to_le32(LEVEL_MULTIPATH))
@@ -1149,7 +1159,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
1149 if (minor_version) 1159 if (minor_version)
1150 rdev->size = ((rdev->bdev->bd_inode->i_size>>9) - le64_to_cpu(sb->data_offset)) / 2; 1160 rdev->size = ((rdev->bdev->bd_inode->i_size>>9) - le64_to_cpu(sb->data_offset)) / 2;
1151 else 1161 else
1152 rdev->size = rdev->sb_offset; 1162 rdev->size = rdev->sb_start / 2;
1153 if (rdev->size < le64_to_cpu(sb->data_size)/2) 1163 if (rdev->size < le64_to_cpu(sb->data_size)/2)
1154 return -EINVAL; 1164 return -EINVAL;
1155 rdev->size = le64_to_cpu(sb->data_size)/2; 1165 rdev->size = le64_to_cpu(sb->data_size)/2;
@@ -1328,35 +1338,74 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
1328 sb->sb_csum = calc_sb_1_csum(sb); 1338 sb->sb_csum = calc_sb_1_csum(sb);
1329} 1339}
1330 1340
1341static unsigned long long
1342super_1_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
1343{
1344 struct mdp_superblock_1 *sb;
1345 sector_t max_sectors;
1346 if (num_sectors && num_sectors < rdev->mddev->size * 2)
1347 return 0; /* component must fit device */
1348 if (rdev->sb_start < rdev->data_offset) {
1349 /* minor versions 1 and 2; superblock before data */
1350 max_sectors = rdev->bdev->bd_inode->i_size >> 9;
1351 max_sectors -= rdev->data_offset;
1352 if (!num_sectors || num_sectors > max_sectors)
1353 num_sectors = max_sectors;
1354 } else if (rdev->mddev->bitmap_offset) {
1355 /* minor version 0 with bitmap we can't move */
1356 return 0;
1357 } else {
1358 /* minor version 0; superblock after data */
1359 sector_t sb_start;
1360 sb_start = (rdev->bdev->bd_inode->i_size >> 9) - 8*2;
1361 sb_start &= ~(sector_t)(4*2 - 1);
1362 max_sectors = rdev->size * 2 + sb_start - rdev->sb_start;
1363 if (!num_sectors || num_sectors > max_sectors)
1364 num_sectors = max_sectors;
1365 rdev->sb_start = sb_start;
1366 }
1367 sb = (struct mdp_superblock_1 *) page_address(rdev->sb_page);
1368 sb->data_size = cpu_to_le64(num_sectors);
1369 sb->super_offset = rdev->sb_start;
1370 sb->sb_csum = calc_sb_1_csum(sb);
1371 md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
1372 rdev->sb_page);
1373 md_super_wait(rdev->mddev);
1374 return num_sectors / 2; /* kB for sysfs */
1375}
1331 1376
1332static struct super_type super_types[] = { 1377static struct super_type super_types[] = {
1333 [0] = { 1378 [0] = {
1334 .name = "0.90.0", 1379 .name = "0.90.0",
1335 .owner = THIS_MODULE, 1380 .owner = THIS_MODULE,
1336 .load_super = super_90_load, 1381 .load_super = super_90_load,
1337 .validate_super = super_90_validate, 1382 .validate_super = super_90_validate,
1338 .sync_super = super_90_sync, 1383 .sync_super = super_90_sync,
1384 .rdev_size_change = super_90_rdev_size_change,
1339 }, 1385 },
1340 [1] = { 1386 [1] = {
1341 .name = "md-1", 1387 .name = "md-1",
1342 .owner = THIS_MODULE, 1388 .owner = THIS_MODULE,
1343 .load_super = super_1_load, 1389 .load_super = super_1_load,
1344 .validate_super = super_1_validate, 1390 .validate_super = super_1_validate,
1345 .sync_super = super_1_sync, 1391 .sync_super = super_1_sync,
1392 .rdev_size_change = super_1_rdev_size_change,
1346 }, 1393 },
1347}; 1394};
1348 1395
1349static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2) 1396static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
1350{ 1397{
1351 struct list_head *tmp, *tmp2;
1352 mdk_rdev_t *rdev, *rdev2; 1398 mdk_rdev_t *rdev, *rdev2;
1353 1399
1354 rdev_for_each(rdev, tmp, mddev1) 1400 rcu_read_lock();
1355 rdev_for_each(rdev2, tmp2, mddev2) 1401 rdev_for_each_rcu(rdev, mddev1)
1402 rdev_for_each_rcu(rdev2, mddev2)
1356 if (rdev->bdev->bd_contains == 1403 if (rdev->bdev->bd_contains ==
1357 rdev2->bdev->bd_contains) 1404 rdev2->bdev->bd_contains) {
1405 rcu_read_unlock();
1358 return 1; 1406 return 1;
1359 1407 }
1408 rcu_read_unlock();
1360 return 0; 1409 return 0;
1361} 1410}
1362 1411
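
super_90_rdev_size_change() and super_1_rdev_size_change() above recompute where the superblock sits after a component device grows, reusing the "8K to 12K from the end, rounded down to a 4K boundary" placement that the v1.0 load path uses. A small stand-alone check of that arithmetic; the device size is an example, and MD_NEW_SIZE_SECTORS (the 0.90 equivalent) is not reproduced here:

#include <stdio.h>
#include <stdint.h>

/* v1.0 metadata: same formula as super_1_load() / super_1_rdev_size_change() */
static uint64_t v1_0_sb_start(uint64_t dev_sectors)
{
	uint64_t sb_start = dev_sectors - 8 * 2;   /* back off 8K (16 sectors) */
	sb_start &= ~(uint64_t)(4 * 2 - 1);        /* round down to 4K (8 sectors) */
	return sb_start;
}

int main(void)
{
	uint64_t dev_sectors = 1953525168ULL;      /* example ~1TB disk */
	uint64_t sb = v1_0_sb_start(dev_sectors);

	printf("sb_start = %llu (%.1f KiB from end of device)\n",
	       (unsigned long long)sb,
	       (dev_sectors - sb) / 2.0);
	return 0;
}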
@@ -1423,7 +1472,7 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
1423 kobject_del(&rdev->kobj); 1472 kobject_del(&rdev->kobj);
1424 goto fail; 1473 goto fail;
1425 } 1474 }
1426 list_add(&rdev->same_set, &mddev->disks); 1475 list_add_rcu(&rdev->same_set, &mddev->disks);
1427 bd_claim_by_disk(rdev->bdev, rdev->bdev->bd_holder, mddev->gendisk); 1476 bd_claim_by_disk(rdev->bdev, rdev->bdev->bd_holder, mddev->gendisk);
1428 return 0; 1477 return 0;
1429 1478
@@ -1448,14 +1497,16 @@ static void unbind_rdev_from_array(mdk_rdev_t * rdev)
1448 return; 1497 return;
1449 } 1498 }
1450 bd_release_from_disk(rdev->bdev, rdev->mddev->gendisk); 1499 bd_release_from_disk(rdev->bdev, rdev->mddev->gendisk);
1451 list_del_init(&rdev->same_set); 1500 list_del_rcu(&rdev->same_set);
1452 printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b)); 1501 printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b));
1453 rdev->mddev = NULL; 1502 rdev->mddev = NULL;
1454 sysfs_remove_link(&rdev->kobj, "block"); 1503 sysfs_remove_link(&rdev->kobj, "block");
1455 1504
1456 /* We need to delay this, otherwise we can deadlock when 1505 /* We need to delay this, otherwise we can deadlock when
1457 * writing to 'remove' to "dev/state" 1506 * writing to 'remove' to "dev/state". We also need
1507 * to delay it due to rcu usage.
1458 */ 1508 */
1509 synchronize_rcu();
1459 INIT_WORK(&rdev->del_work, md_delayed_delete); 1510 INIT_WORK(&rdev->del_work, md_delayed_delete);
1460 kobject_get(&rdev->kobj); 1511 kobject_get(&rdev->kobj);
1461 schedule_work(&rdev->del_work); 1512 schedule_work(&rdev->del_work);
@@ -1511,7 +1562,6 @@ static void export_rdev(mdk_rdev_t * rdev)
1511 if (rdev->mddev) 1562 if (rdev->mddev)
1512 MD_BUG(); 1563 MD_BUG();
1513 free_disk_sb(rdev); 1564 free_disk_sb(rdev);
1514 list_del_init(&rdev->same_set);
1515#ifndef MODULE 1565#ifndef MODULE
1516 if (test_bit(AutoDetected, &rdev->flags)) 1566 if (test_bit(AutoDetected, &rdev->flags))
1517 md_autodetect_dev(rdev->bdev->bd_dev); 1567 md_autodetect_dev(rdev->bdev->bd_dev);
@@ -1758,11 +1808,11 @@ repeat:
1758 dprintk("%s ", bdevname(rdev->bdev,b)); 1808 dprintk("%s ", bdevname(rdev->bdev,b));
1759 if (!test_bit(Faulty, &rdev->flags)) { 1809 if (!test_bit(Faulty, &rdev->flags)) {
1760 md_super_write(mddev,rdev, 1810 md_super_write(mddev,rdev,
1761 rdev->sb_offset<<1, rdev->sb_size, 1811 rdev->sb_start, rdev->sb_size,
1762 rdev->sb_page); 1812 rdev->sb_page);
1763 dprintk(KERN_INFO "(write) %s's sb offset: %llu\n", 1813 dprintk(KERN_INFO "(write) %s's sb offset: %llu\n",
1764 bdevname(rdev->bdev,b), 1814 bdevname(rdev->bdev,b),
1765 (unsigned long long)rdev->sb_offset); 1815 (unsigned long long)rdev->sb_start);
1766 rdev->sb_events = mddev->events; 1816 rdev->sb_events = mddev->events;
1767 1817
1768 } else 1818 } else
@@ -1787,7 +1837,7 @@ repeat:
1787 1837
1788} 1838}
1789 1839
1790/* words written to sysfs files may, or my not, be \n terminated. 1840/* words written to sysfs files may, or may not, be \n terminated.
1791 * We want to accept with case. For this we use cmd_match. 1841 * We want to accept with case. For this we use cmd_match.
1792 */ 1842 */
1793static int cmd_match(const char *cmd, const char *str) 1843static int cmd_match(const char *cmd, const char *str)
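
cmd_match(), mentioned in the comment above, exists so that sysfs command writes compare equal whether or not the buffer ends in a newline. A matcher with those semantics can be as small as the sketch below; the authoritative version lives in drivers/md/md.c:

#include <stdio.h>

/* Return 1 if cmd equals str, ignoring one trailing '\n' on cmd. */
static int cmd_match(const char *cmd, const char *str)
{
	while (*cmd && *str && *cmd == *str) {
		cmd++;
		str++;
	}
	if (*cmd == '\n')
		cmd++;
	return !*cmd && !*str;
}

int main(void)
{
	printf("%d %d %d\n",
	       cmd_match("faulty\n", "faulty"),  /* 1 */
	       cmd_match("faulty", "faulty"),    /* 1 */
	       cmd_match("fault", "faulty"));    /* 0 */
	return 0;
}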
@@ -1886,6 +1936,8 @@ state_store(mdk_rdev_t *rdev, const char *buf, size_t len)
1886 1936
1887 err = 0; 1937 err = 0;
1888 } 1938 }
1939 if (!err)
1940 sysfs_notify(&rdev->kobj, NULL, "state");
1889 return err ? err : len; 1941 return err ? err : len;
1890} 1942}
1891static struct rdev_sysfs_entry rdev_state = 1943static struct rdev_sysfs_entry rdev_state =
@@ -1931,7 +1983,7 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
1931 slot = -1; 1983 slot = -1;
1932 else if (e==buf || (*e && *e!= '\n')) 1984 else if (e==buf || (*e && *e!= '\n'))
1933 return -EINVAL; 1985 return -EINVAL;
1934 if (rdev->mddev->pers) { 1986 if (rdev->mddev->pers && slot == -1) {
1935 /* Setting 'slot' on an active array requires also 1987 /* Setting 'slot' on an active array requires also
1936 * updating the 'rd%d' link, and communicating 1988 * updating the 'rd%d' link, and communicating
1937 * with the personality with ->hot_*_disk. 1989 * with the personality with ->hot_*_disk.
@@ -1939,8 +1991,6 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
1939 * failed/spare devices. This normally happens automatically, 1991 * failed/spare devices. This normally happens automatically,
1940 * but not when the metadata is externally managed. 1992 * but not when the metadata is externally managed.
1941 */ 1993 */
1942 if (slot != -1)
1943 return -EBUSY;
1944 if (rdev->raid_disk == -1) 1994 if (rdev->raid_disk == -1)
1945 return -EEXIST; 1995 return -EEXIST;
1946 /* personality does all needed checks */ 1996 /* personality does all needed checks */
@@ -1954,6 +2004,43 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
1954 sysfs_remove_link(&rdev->mddev->kobj, nm); 2004 sysfs_remove_link(&rdev->mddev->kobj, nm);
1955 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); 2005 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
1956 md_wakeup_thread(rdev->mddev->thread); 2006 md_wakeup_thread(rdev->mddev->thread);
2007 } else if (rdev->mddev->pers) {
2008 mdk_rdev_t *rdev2;
2009 struct list_head *tmp;
2010 /* Activating a spare .. or possibly reactivating
2011 * if we every get bitmaps working here.
2012 */
2013
2014 if (rdev->raid_disk != -1)
2015 return -EBUSY;
2016
2017 if (rdev->mddev->pers->hot_add_disk == NULL)
2018 return -EINVAL;
2019
2020 rdev_for_each(rdev2, tmp, rdev->mddev)
2021 if (rdev2->raid_disk == slot)
2022 return -EEXIST;
2023
2024 rdev->raid_disk = slot;
2025 if (test_bit(In_sync, &rdev->flags))
2026 rdev->saved_raid_disk = slot;
2027 else
2028 rdev->saved_raid_disk = -1;
2029 err = rdev->mddev->pers->
2030 hot_add_disk(rdev->mddev, rdev);
2031 if (err) {
2032 rdev->raid_disk = -1;
2033 return err;
2034 } else
2035 sysfs_notify(&rdev->kobj, NULL, "state");
2036 sprintf(nm, "rd%d", rdev->raid_disk);
2037 if (sysfs_create_link(&rdev->mddev->kobj, &rdev->kobj, nm))
2038 printk(KERN_WARNING
2039 "md: cannot register "
2040 "%s for %s\n",
2041 nm, mdname(rdev->mddev));
2042
2043 /* don't wakeup anyone, leave that to userspace. */
1957 } else { 2044 } else {
1958 if (slot >= rdev->mddev->raid_disks) 2045 if (slot >= rdev->mddev->raid_disks)
1959 return -ENOSPC; 2046 return -ENOSPC;
@@ -1962,6 +2049,7 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
1962 clear_bit(Faulty, &rdev->flags); 2049 clear_bit(Faulty, &rdev->flags);
1963 clear_bit(WriteMostly, &rdev->flags); 2050 clear_bit(WriteMostly, &rdev->flags);
1964 set_bit(In_sync, &rdev->flags); 2051 set_bit(In_sync, &rdev->flags);
2052 sysfs_notify(&rdev->kobj, NULL, "state");
1965 } 2053 }
1966 return len; 2054 return len;
1967} 2055}
@@ -1983,7 +2071,7 @@ offset_store(mdk_rdev_t *rdev, const char *buf, size_t len)
1983 unsigned long long offset = simple_strtoull(buf, &e, 10); 2071 unsigned long long offset = simple_strtoull(buf, &e, 10);
1984 if (e==buf || (*e && *e != '\n')) 2072 if (e==buf || (*e && *e != '\n'))
1985 return -EINVAL; 2073 return -EINVAL;
1986 if (rdev->mddev->pers) 2074 if (rdev->mddev->pers && rdev->raid_disk >= 0)
1987 return -EBUSY; 2075 return -EBUSY;
1988 if (rdev->size && rdev->mddev->external) 2076 if (rdev->size && rdev->mddev->external)
1989 /* Must set offset before size, so overlap checks 2077 /* Must set offset before size, so overlap checks
@@ -2015,17 +2103,30 @@ static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2)
2015static ssize_t 2103static ssize_t
2016rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len) 2104rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2017{ 2105{
2018 char *e; 2106 unsigned long long size;
2019 unsigned long long size = simple_strtoull(buf, &e, 10);
2020 unsigned long long oldsize = rdev->size; 2107 unsigned long long oldsize = rdev->size;
2021 mddev_t *my_mddev = rdev->mddev; 2108 mddev_t *my_mddev = rdev->mddev;
2022 2109
2023 if (e==buf || (*e && *e != '\n')) 2110 if (strict_strtoull(buf, 10, &size) < 0)
2024 return -EINVAL; 2111 return -EINVAL;
2025 if (my_mddev->pers) 2112 if (size < my_mddev->size)
2026 return -EBUSY; 2113 return -EINVAL;
2114 if (my_mddev->pers && rdev->raid_disk >= 0) {
2115 if (my_mddev->persistent) {
2116 size = super_types[my_mddev->major_version].
2117 rdev_size_change(rdev, size * 2);
2118 if (!size)
2119 return -EBUSY;
2120 } else if (!size) {
2121 size = (rdev->bdev->bd_inode->i_size >> 10);
2122 size -= rdev->data_offset/2;
2123 }
2124 if (size < my_mddev->size)
2125 return -EINVAL; /* component must fit device */
2126 }
2127
2027 rdev->size = size; 2128 rdev->size = size;
2028 if (size > oldsize && rdev->mddev->external) { 2129 if (size > oldsize && my_mddev->external) {
2029 /* need to check that all other rdevs with the same ->bdev 2130 /* need to check that all other rdevs with the same ->bdev
2030 * do not overlap. We need to unlock the mddev to avoid 2131 * do not overlap. We need to unlock the mddev to avoid
2031 * a deadlock. We have already changed rdev->size, and if 2132 * a deadlock. We have already changed rdev->size, and if
@@ -2044,8 +2145,9 @@ rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2044 if (test_bit(AllReserved, &rdev2->flags) || 2145 if (test_bit(AllReserved, &rdev2->flags) ||
2045 (rdev->bdev == rdev2->bdev && 2146 (rdev->bdev == rdev2->bdev &&
2046 rdev != rdev2 && 2147 rdev != rdev2 &&
2047 overlaps(rdev->data_offset, rdev->size, 2148 overlaps(rdev->data_offset, rdev->size * 2,
2048 rdev2->data_offset, rdev2->size))) { 2149 rdev2->data_offset,
2150 rdev2->size * 2))) {
2049 overlap = 1; 2151 overlap = 1;
2050 break; 2152 break;
2051 } 2153 }
@@ -2067,8 +2169,6 @@ rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2067 return -EBUSY; 2169 return -EBUSY;
2068 } 2170 }
2069 } 2171 }
2070 if (size < my_mddev->size || my_mddev->size == 0)
2071 my_mddev->size = size;
2072 return len; 2172 return len;
2073} 2173}
2074 2174
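
The overlap check above now passes lengths in sectors (rdev->size * 2) instead of KiB. The overlaps() helper itself is outside this hunk; the usual half-open interval test it would perform looks like this sketch:

#include <stdio.h>
#include <stdint.h>

typedef uint64_t sector_t;

/* Ranges [s1, s1+l1) and [s2, s2+l2) overlap iff each starts
 * before the other ends. */
static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2)
{
	return s1 < s2 + l2 && s2 < s1 + l1;
}

int main(void)
{
	/* two rdevs on the same underlying bdev, offsets/lengths in sectors */
	printf("%d\n", overlaps(0, 1024, 1024, 1024)); /* 0: back to back */
	printf("%d\n", overlaps(0, 2048, 1024, 1024)); /* 1: second starts inside first */
	return 0;
}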
@@ -2512,7 +2612,7 @@ __ATTR(resync_start, S_IRUGO|S_IWUSR, resync_start_show, resync_start_store);
2512 * When written, doesn't tear down array, but just stops it 2612 * When written, doesn't tear down array, but just stops it
2513 * suspended (not supported yet) 2613 * suspended (not supported yet)
2514 * All IO requests will block. The array can be reconfigured. 2614 * All IO requests will block. The array can be reconfigured.
2515 * Writing this, if accepted, will block until array is quiessent 2615 * Writing this, if accepted, will block until array is quiescent
2516 * readonly 2616 * readonly
2517 * no resync can happen. no superblocks get written. 2617 * no resync can happen. no superblocks get written.
2518 * write requests fail 2618 * write requests fail
@@ -2585,7 +2685,7 @@ array_state_show(mddev_t *mddev, char *page)
2585 return sprintf(page, "%s\n", array_states[st]); 2685 return sprintf(page, "%s\n", array_states[st]);
2586} 2686}
2587 2687
2588static int do_md_stop(mddev_t * mddev, int ro); 2688static int do_md_stop(mddev_t * mddev, int ro, int is_open);
2589static int do_md_run(mddev_t * mddev); 2689static int do_md_run(mddev_t * mddev);
2590static int restart_array(mddev_t *mddev); 2690static int restart_array(mddev_t *mddev);
2591 2691
@@ -2599,16 +2699,16 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len)
2599 break; 2699 break;
2600 case clear: 2700 case clear:
2601 /* stopping an active array */ 2701 /* stopping an active array */
2602 if (atomic_read(&mddev->active) > 1) 2702 if (atomic_read(&mddev->openers) > 0)
2603 return -EBUSY; 2703 return -EBUSY;
2604 err = do_md_stop(mddev, 0); 2704 err = do_md_stop(mddev, 0, 0);
2605 break; 2705 break;
2606 case inactive: 2706 case inactive:
2607 /* stopping an active array */ 2707 /* stopping an active array */
2608 if (mddev->pers) { 2708 if (mddev->pers) {
2609 if (atomic_read(&mddev->active) > 1) 2709 if (atomic_read(&mddev->openers) > 0)
2610 return -EBUSY; 2710 return -EBUSY;
2611 err = do_md_stop(mddev, 2); 2711 err = do_md_stop(mddev, 2, 0);
2612 } else 2712 } else
2613 err = 0; /* already inactive */ 2713 err = 0; /* already inactive */
2614 break; 2714 break;
@@ -2616,7 +2716,7 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len)
2616 break; /* not supported yet */ 2716 break; /* not supported yet */
2617 case readonly: 2717 case readonly:
2618 if (mddev->pers) 2718 if (mddev->pers)
2619 err = do_md_stop(mddev, 1); 2719 err = do_md_stop(mddev, 1, 0);
2620 else { 2720 else {
2621 mddev->ro = 1; 2721 mddev->ro = 1;
2622 set_disk_ro(mddev->gendisk, 1); 2722 set_disk_ro(mddev->gendisk, 1);
@@ -2626,7 +2726,7 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len)
2626 case read_auto: 2726 case read_auto:
2627 if (mddev->pers) { 2727 if (mddev->pers) {
2628 if (mddev->ro != 1) 2728 if (mddev->ro != 1)
2629 err = do_md_stop(mddev, 1); 2729 err = do_md_stop(mddev, 1, 0);
2630 else 2730 else
2631 err = restart_array(mddev); 2731 err = restart_array(mddev);
2632 if (err == 0) { 2732 if (err == 0) {
@@ -2681,8 +2781,10 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len)
2681 } 2781 }
2682 if (err) 2782 if (err)
2683 return err; 2783 return err;
2684 else 2784 else {
2785 sysfs_notify(&mddev->kobj, NULL, "array_state");
2685 return len; 2786 return len;
2787 }
2686} 2788}
2687static struct md_sysfs_entry md_array_state = 2789static struct md_sysfs_entry md_array_state =
2688__ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store); 2790__ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store);
@@ -2785,7 +2887,7 @@ size_show(mddev_t *mddev, char *page)
2785 return sprintf(page, "%llu\n", (unsigned long long)mddev->size); 2887 return sprintf(page, "%llu\n", (unsigned long long)mddev->size);
2786} 2888}
2787 2889
2788static int update_size(mddev_t *mddev, unsigned long size); 2890static int update_size(mddev_t *mddev, sector_t num_sectors);
2789 2891
2790static ssize_t 2892static ssize_t
2791size_store(mddev_t *mddev, const char *buf, size_t len) 2893size_store(mddev_t *mddev, const char *buf, size_t len)
@@ -2802,7 +2904,7 @@ size_store(mddev_t *mddev, const char *buf, size_t len)
2802 return -EINVAL; 2904 return -EINVAL;
2803 2905
2804 if (mddev->pers) { 2906 if (mddev->pers) {
2805 err = update_size(mddev, size); 2907 err = update_size(mddev, size * 2);
2806 md_update_sb(mddev, 1); 2908 md_update_sb(mddev, 1);
2807 } else { 2909 } else {
2808 if (mddev->size == 0 || 2910 if (mddev->size == 0 ||
@@ -2899,7 +3001,7 @@ action_show(mddev_t *mddev, char *page)
2899 type = "check"; 3001 type = "check";
2900 else 3002 else
2901 type = "repair"; 3003 type = "repair";
2902 } else 3004 } else if (test_bit(MD_RECOVERY_RECOVER, &mddev->recovery))
2903 type = "recover"; 3005 type = "recover";
2904 } 3006 }
2905 return sprintf(page, "%s\n", type); 3007 return sprintf(page, "%s\n", type);
@@ -2921,15 +3023,19 @@ action_store(mddev_t *mddev, const char *page, size_t len)
2921 } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 3023 } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
2922 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) 3024 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
2923 return -EBUSY; 3025 return -EBUSY;
2924 else if (cmd_match(page, "resync") || cmd_match(page, "recover")) 3026 else if (cmd_match(page, "resync"))
3027 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3028 else if (cmd_match(page, "recover")) {
3029 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
2925 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3030 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
2926 else if (cmd_match(page, "reshape")) { 3031 } else if (cmd_match(page, "reshape")) {
2927 int err; 3032 int err;
2928 if (mddev->pers->start_reshape == NULL) 3033 if (mddev->pers->start_reshape == NULL)
2929 return -EINVAL; 3034 return -EINVAL;
2930 err = mddev->pers->start_reshape(mddev); 3035 err = mddev->pers->start_reshape(mddev);
2931 if (err) 3036 if (err)
2932 return err; 3037 return err;
3038 sysfs_notify(&mddev->kobj, NULL, "degraded");
2933 } else { 3039 } else {
2934 if (cmd_match(page, "check")) 3040 if (cmd_match(page, "check"))
2935 set_bit(MD_RECOVERY_CHECK, &mddev->recovery); 3041 set_bit(MD_RECOVERY_CHECK, &mddev->recovery);
@@ -2940,6 +3046,7 @@ action_store(mddev_t *mddev, const char *page, size_t len)
2940 } 3046 }
2941 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3047 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
2942 md_wakeup_thread(mddev->thread); 3048 md_wakeup_thread(mddev->thread);
3049 sysfs_notify(&mddev->kobj, NULL, "sync_action");
2943 return len; 3050 return len;
2944} 3051}
2945 3052
@@ -3049,11 +3156,11 @@ static ssize_t
3049sync_speed_show(mddev_t *mddev, char *page) 3156sync_speed_show(mddev_t *mddev, char *page)
3050{ 3157{
3051 unsigned long resync, dt, db; 3158 unsigned long resync, dt, db;
3052 resync = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active)); 3159 resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active);
3053 dt = ((jiffies - mddev->resync_mark) / HZ); 3160 dt = (jiffies - mddev->resync_mark) / HZ;
3054 if (!dt) dt++; 3161 if (!dt) dt++;
3055 db = resync - (mddev->resync_mark_cnt); 3162 db = resync - mddev->resync_mark_cnt;
3056 return sprintf(page, "%ld\n", db/dt/2); /* K/sec */ 3163 return sprintf(page, "%lu\n", db/dt/2); /* K/sec */
3057} 3164}
3058 3165
3059static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed); 3166static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed);
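
sync_speed_show() above derives the recent resync rate: the mark-counter delta is in 512-byte sectors, the elapsed time is in seconds, and the final "/ 2" turns sectors per second into the K/sec the sysfs file reports. The same arithmetic as a stand-alone program with example numbers:

#include <stdio.h>

int main(void)
{
	unsigned long curr_mark_cnt = 2000000;  /* sectors at the latest mark */
	unsigned long resync_mark_cnt = 800000; /* sectors at the previous mark */
	unsigned long recovery_active = 0;      /* sectors submitted, not yet done */
	unsigned long dt = 10;                  /* seconds between the marks */

	unsigned long resync = curr_mark_cnt - recovery_active;
	if (!dt)
		dt++;
	unsigned long db = resync - resync_mark_cnt;

	printf("%lu K/sec\n", db / dt / 2);     /* (2000000-800000)/10/2 = 60000 */
	return 0;
}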
@@ -3075,6 +3182,36 @@ sync_completed_show(mddev_t *mddev, char *page)
3075static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed); 3182static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed);
3076 3183
3077static ssize_t 3184static ssize_t
3185min_sync_show(mddev_t *mddev, char *page)
3186{
3187 return sprintf(page, "%llu\n",
3188 (unsigned long long)mddev->resync_min);
3189}
3190static ssize_t
3191min_sync_store(mddev_t *mddev, const char *buf, size_t len)
3192{
3193 unsigned long long min;
3194 if (strict_strtoull(buf, 10, &min))
3195 return -EINVAL;
3196 if (min > mddev->resync_max)
3197 return -EINVAL;
3198 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
3199 return -EBUSY;
3200
3201 /* Must be a multiple of chunk_size */
3202 if (mddev->chunk_size) {
3203 if (min & (sector_t)((mddev->chunk_size>>9)-1))
3204 return -EINVAL;
3205 }
3206 mddev->resync_min = min;
3207
3208 return len;
3209}
3210
3211static struct md_sysfs_entry md_min_sync =
3212__ATTR(sync_min, S_IRUGO|S_IWUSR, min_sync_show, min_sync_store);
3213
3214static ssize_t
3078max_sync_show(mddev_t *mddev, char *page) 3215max_sync_show(mddev_t *mddev, char *page)
3079{ 3216{
3080 if (mddev->resync_max == MaxSector) 3217 if (mddev->resync_max == MaxSector)
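
min_sync_store() above rejects values that are not a multiple of the chunk size; because chunk_size is a power of two, the test reduces to masking with (chunk_size >> 9) - 1 once the value is expressed in sectors. A quick demonstration of that mask with an arbitrary 64K chunk:

#include <stdio.h>

int main(void)
{
	unsigned long chunk_size = 64 * 1024;          /* bytes, power of two */
	unsigned long chunk_sectors = chunk_size >> 9; /* 128 sectors */

	unsigned long long good = 3 * 128;             /* multiple of a chunk */
	unsigned long long bad  = 3 * 128 + 5;

	printf("%llu aligned: %s\n", good,
	       (good & (chunk_sectors - 1)) ? "no" : "yes"); /* yes */
	printf("%llu aligned: %s\n", bad,
	       (bad & (chunk_sectors - 1)) ? "no" : "yes");  /* no */
	return 0;
}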
@@ -3089,9 +3226,10 @@ max_sync_store(mddev_t *mddev, const char *buf, size_t len)
3089 if (strncmp(buf, "max", 3) == 0) 3226 if (strncmp(buf, "max", 3) == 0)
3090 mddev->resync_max = MaxSector; 3227 mddev->resync_max = MaxSector;
3091 else { 3228 else {
3092 char *ep; 3229 unsigned long long max;
3093 unsigned long long max = simple_strtoull(buf, &ep, 10); 3230 if (strict_strtoull(buf, 10, &max))
3094 if (ep == buf || (*ep != 0 && *ep != '\n')) 3231 return -EINVAL;
3232 if (max < mddev->resync_min)
3095 return -EINVAL; 3233 return -EINVAL;
3096 if (max < mddev->resync_max && 3234 if (max < mddev->resync_max &&
3097 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 3235 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
@@ -3222,6 +3360,7 @@ static struct attribute *md_redundancy_attrs[] = {
3222 &md_sync_speed.attr, 3360 &md_sync_speed.attr,
3223 &md_sync_force_parallel.attr, 3361 &md_sync_force_parallel.attr,
3224 &md_sync_completed.attr, 3362 &md_sync_completed.attr,
3363 &md_min_sync.attr,
3225 &md_max_sync.attr, 3364 &md_max_sync.attr,
3226 &md_suspend_lo.attr, 3365 &md_suspend_lo.attr,
3227 &md_suspend_hi.attr, 3366 &md_suspend_hi.attr,
@@ -3326,9 +3465,9 @@ static struct kobject *md_probe(dev_t dev, int *part, void *data)
3326 disk->queue = mddev->queue; 3465 disk->queue = mddev->queue;
3327 add_disk(disk); 3466 add_disk(disk);
3328 mddev->gendisk = disk; 3467 mddev->gendisk = disk;
3329 mutex_unlock(&disks_mutex);
3330 error = kobject_init_and_add(&mddev->kobj, &md_ktype, &disk->dev.kobj, 3468 error = kobject_init_and_add(&mddev->kobj, &md_ktype, &disk->dev.kobj,
3331 "%s", "md"); 3469 "%s", "md");
3470 mutex_unlock(&disks_mutex);
3332 if (error) 3471 if (error)
3333 printk(KERN_WARNING "md: cannot register %s/md - name in use\n", 3472 printk(KERN_WARNING "md: cannot register %s/md - name in use\n",
3334 disk->disk_name); 3473 disk->disk_name);
@@ -3341,7 +3480,11 @@ static void md_safemode_timeout(unsigned long data)
3341{ 3480{
3342 mddev_t *mddev = (mddev_t *) data; 3481 mddev_t *mddev = (mddev_t *) data;
3343 3482
3344 mddev->safemode = 1; 3483 if (!atomic_read(&mddev->writes_pending)) {
3484 mddev->safemode = 1;
3485 if (mddev->external)
3486 sysfs_notify(&mddev->kobj, NULL, "array_state");
3487 }
3345 md_wakeup_thread(mddev->thread); 3488 md_wakeup_thread(mddev->thread);
3346} 3489}
3347 3490
@@ -3432,22 +3575,23 @@ static int do_md_run(mddev_t * mddev)
3432 * We don't want the data to overlap the metadata, 3575 * We don't want the data to overlap the metadata,
3433 * Internal Bitmap issues has handled elsewhere. 3576 * Internal Bitmap issues has handled elsewhere.
3434 */ 3577 */
3435 if (rdev->data_offset < rdev->sb_offset) { 3578 if (rdev->data_offset < rdev->sb_start) {
3436 if (mddev->size && 3579 if (mddev->size &&
3437 rdev->data_offset + mddev->size*2 3580 rdev->data_offset + mddev->size*2
3438 > rdev->sb_offset*2) { 3581 > rdev->sb_start) {
3439 printk("md: %s: data overlaps metadata\n", 3582 printk("md: %s: data overlaps metadata\n",
3440 mdname(mddev)); 3583 mdname(mddev));
3441 return -EINVAL; 3584 return -EINVAL;
3442 } 3585 }
3443 } else { 3586 } else {
3444 if (rdev->sb_offset*2 + rdev->sb_size/512 3587 if (rdev->sb_start + rdev->sb_size/512
3445 > rdev->data_offset) { 3588 > rdev->data_offset) {
3446 printk("md: %s: metadata overlaps data\n", 3589 printk("md: %s: metadata overlaps data\n",
3447 mdname(mddev)); 3590 mdname(mddev));
3448 return -EINVAL; 3591 return -EINVAL;
3449 } 3592 }
3450 } 3593 }
3594 sysfs_notify(&rdev->kobj, NULL, "state");
3451 } 3595 }
3452 3596
3453 md_probe(mddev->unit, NULL, NULL); 3597 md_probe(mddev->unit, NULL, NULL);
@@ -3519,7 +3663,9 @@ static int do_md_run(mddev_t * mddev)
3519 mddev->ro = 2; /* read-only, but switch on first write */ 3663 mddev->ro = 2; /* read-only, but switch on first write */
3520 3664
3521 err = mddev->pers->run(mddev); 3665 err = mddev->pers->run(mddev);
3522 if (!err && mddev->pers->sync_request) { 3666 if (err)
3667 printk(KERN_ERR "md: pers->run() failed ...\n");
3668 else if (mddev->pers->sync_request) {
3523 err = bitmap_create(mddev); 3669 err = bitmap_create(mddev);
3524 if (err) { 3670 if (err) {
3525 printk(KERN_ERR "%s: failed to create bitmap (%d)\n", 3671 printk(KERN_ERR "%s: failed to create bitmap (%d)\n",
@@ -3528,7 +3674,6 @@ static int do_md_run(mddev_t * mddev)
3528 } 3674 }
3529 } 3675 }
3530 if (err) { 3676 if (err) {
3531 printk(KERN_ERR "md: pers->run() failed ...\n");
3532 module_put(mddev->pers->owner); 3677 module_put(mddev->pers->owner);
3533 mddev->pers = NULL; 3678 mddev->pers = NULL;
3534 bitmap_destroy(mddev); 3679 bitmap_destroy(mddev);
@@ -3563,7 +3708,7 @@ static int do_md_run(mddev_t * mddev)
3563 if (mddev->flags) 3708 if (mddev->flags)
3564 md_update_sb(mddev, 0); 3709 md_update_sb(mddev, 0);
3565 3710
3566 set_capacity(disk, mddev->array_size<<1); 3711 set_capacity(disk, mddev->array_sectors);
3567 3712
3568 /* If we call blk_queue_make_request here, it will 3713 /* If we call blk_queue_make_request here, it will
3569 * re-initialise max_sectors etc which may have been 3714 * re-initialise max_sectors etc which may have been
@@ -3608,6 +3753,9 @@ static int do_md_run(mddev_t * mddev)
3608 3753
3609 mddev->changed = 1; 3754 mddev->changed = 1;
3610 md_new_event(mddev); 3755 md_new_event(mddev);
3756 sysfs_notify(&mddev->kobj, NULL, "array_state");
3757 sysfs_notify(&mddev->kobj, NULL, "sync_action");
3758 sysfs_notify(&mddev->kobj, NULL, "degraded");
3611 kobject_uevent(&mddev->gendisk->dev.kobj, KOBJ_CHANGE); 3759 kobject_uevent(&mddev->gendisk->dev.kobj, KOBJ_CHANGE);
3612 return 0; 3760 return 0;
3613} 3761}
@@ -3615,38 +3763,25 @@ static int do_md_run(mddev_t * mddev)
3615static int restart_array(mddev_t *mddev) 3763static int restart_array(mddev_t *mddev)
3616{ 3764{
3617 struct gendisk *disk = mddev->gendisk; 3765 struct gendisk *disk = mddev->gendisk;
3618 int err;
3619 3766
3620 /* 3767 /* Complain if it has no devices */
3621 * Complain if it has no devices
3622 */
3623 err = -ENXIO;
3624 if (list_empty(&mddev->disks)) 3768 if (list_empty(&mddev->disks))
3625 goto out; 3769 return -ENXIO;
3626 3770 if (!mddev->pers)
3627 if (mddev->pers) { 3771 return -EINVAL;
3628 err = -EBUSY; 3772 if (!mddev->ro)
3629 if (!mddev->ro) 3773 return -EBUSY;
3630 goto out; 3774 mddev->safemode = 0;
3631 3775 mddev->ro = 0;
3632 mddev->safemode = 0; 3776 set_disk_ro(disk, 0);
3633 mddev->ro = 0; 3777 printk(KERN_INFO "md: %s switched to read-write mode.\n",
3634 set_disk_ro(disk, 0); 3778 mdname(mddev));
3635 3779 /* Kick recovery or resync if necessary */
3636 printk(KERN_INFO "md: %s switched to read-write mode.\n", 3780 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3637 mdname(mddev)); 3781 md_wakeup_thread(mddev->thread);
3638 /* 3782 md_wakeup_thread(mddev->sync_thread);
3639 * Kick recovery or resync if necessary 3783 sysfs_notify(&mddev->kobj, NULL, "array_state");
3640 */ 3784 return 0;
3641 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3642 md_wakeup_thread(mddev->thread);
3643 md_wakeup_thread(mddev->sync_thread);
3644 err = 0;
3645 } else
3646 err = -EINVAL;
3647
3648out:
3649 return err;
3650} 3785}
3651 3786
3652/* similar to deny_write_access, but accounts for our holding a reference 3787/* similar to deny_write_access, but accounts for our holding a reference
@@ -3680,16 +3815,17 @@ static void restore_bitmap_write_access(struct file *file)
3680 * 1 - switch to readonly 3815 * 1 - switch to readonly
3681 * 2 - stop but do not disassemble array 3816 * 2 - stop but do not disassemble array
3682 */ 3817 */
3683static int do_md_stop(mddev_t * mddev, int mode) 3818static int do_md_stop(mddev_t * mddev, int mode, int is_open)
3684{ 3819{
3685 int err = 0; 3820 int err = 0;
3686 struct gendisk *disk = mddev->gendisk; 3821 struct gendisk *disk = mddev->gendisk;
3687 3822
3823 if (atomic_read(&mddev->openers) > is_open) {
3824 printk("md: %s still in use.\n",mdname(mddev));
3825 return -EBUSY;
3826 }
3827
3688 if (mddev->pers) { 3828 if (mddev->pers) {
3689 if (atomic_read(&mddev->active)>2) {
3690 printk("md: %s still in use.\n",mdname(mddev));
3691 return -EBUSY;
3692 }
3693 3829
3694 if (mddev->sync_thread) { 3830 if (mddev->sync_thread) {
3695 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 3831 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
@@ -3773,10 +3909,11 @@ static int do_md_stop(mddev_t * mddev, int mode)
3773 3909
3774 export_array(mddev); 3910 export_array(mddev);
3775 3911
3776 mddev->array_size = 0; 3912 mddev->array_sectors = 0;
3777 mddev->size = 0; 3913 mddev->size = 0;
3778 mddev->raid_disks = 0; 3914 mddev->raid_disks = 0;
3779 mddev->recovery_cp = 0; 3915 mddev->recovery_cp = 0;
3916 mddev->resync_min = 0;
3780 mddev->resync_max = MaxSector; 3917 mddev->resync_max = MaxSector;
3781 mddev->reshape_position = MaxSector; 3918 mddev->reshape_position = MaxSector;
3782 mddev->external = 0; 3919 mddev->external = 0;
@@ -3811,6 +3948,7 @@ static int do_md_stop(mddev_t * mddev, int mode)
3811 mdname(mddev)); 3948 mdname(mddev));
3812 err = 0; 3949 err = 0;
3813 md_new_event(mddev); 3950 md_new_event(mddev);
3951 sysfs_notify(&mddev->kobj, NULL, "array_state");
3814out: 3952out:
3815 return err; 3953 return err;
3816} 3954}
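do_md_stop() grows an is_open argument, and the "still in use" test moves from the general mddev->active refcount to a dedicated mddev->openers count (maintained in md_open()/md_release() further down). The effect, roughly, is that a stop is refused while anyone other than the caller holds the block device open, instead of guessing from reference counts. A sketch of the check and of the callers as they appear in this diff (is_open says how many of the current openers the caller itself accounts for):

	/* Sketch, not the literal kernel code. */
	static int may_stop(mddev_t *mddev, int is_open)
	{
		if (atomic_read(&mddev->openers) > is_open)
			return -EBUSY;	/* someone else still has /dev/mdX open */
		return 0;
	}

	/* Callers in this patch:
	 *   md_ioctl(STOP_ARRAY)     -> do_md_stop(mddev, 0, 1)  the fd doing the ioctl is one opener
	 *   md_ioctl(STOP_ARRAY_RO)  -> do_md_stop(mddev, 1, 1)
	 *   md_notify_reboot()       -> do_md_stop(mddev, 1, 0)  no fd involved
	 *   autorun_array() cleanup  -> do_md_stop(mddev, 0, 0)
	 */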
@@ -3836,7 +3974,7 @@ static void autorun_array(mddev_t *mddev)
3836 err = do_md_run (mddev); 3974 err = do_md_run (mddev);
3837 if (err) { 3975 if (err) {
3838 printk(KERN_WARNING "md: do_md_run() returned %d\n", err); 3976 printk(KERN_WARNING "md: do_md_run() returned %d\n", err);
3839 do_md_stop (mddev, 0); 3977 do_md_stop (mddev, 0, 0);
3840 } 3978 }
3841} 3979}
3842 3980
@@ -3927,8 +4065,10 @@ static void autorun_devices(int part)
3927 /* on success, candidates will be empty, on error 4065 /* on success, candidates will be empty, on error
3928 * it won't... 4066 * it won't...
3929 */ 4067 */
3930 rdev_for_each_list(rdev, tmp, candidates) 4068 rdev_for_each_list(rdev, tmp, candidates) {
4069 list_del_init(&rdev->same_set);
3931 export_rdev(rdev); 4070 export_rdev(rdev);
4071 }
3932 mddev_put(mddev); 4072 mddev_put(mddev);
3933 } 4073 }
3934 printk(KERN_INFO "md: ... autorun DONE.\n"); 4074 printk(KERN_INFO "md: ... autorun DONE.\n");
@@ -4009,9 +4149,11 @@ static int get_bitmap_file(mddev_t * mddev, void __user * arg)
4009 char *ptr, *buf = NULL; 4149 char *ptr, *buf = NULL;
4010 int err = -ENOMEM; 4150 int err = -ENOMEM;
4011 4151
4012 md_allow_write(mddev); 4152 if (md_allow_write(mddev))
4153 file = kmalloc(sizeof(*file), GFP_NOIO);
4154 else
4155 file = kmalloc(sizeof(*file), GFP_KERNEL);
4013 4156
4014 file = kmalloc(sizeof(*file), GFP_KERNEL);
4015 if (!file) 4157 if (!file)
4016 goto out; 4158 goto out;
4017 4159
@@ -4044,15 +4186,12 @@ out:
4044static int get_disk_info(mddev_t * mddev, void __user * arg) 4186static int get_disk_info(mddev_t * mddev, void __user * arg)
4045{ 4187{
4046 mdu_disk_info_t info; 4188 mdu_disk_info_t info;
4047 unsigned int nr;
4048 mdk_rdev_t *rdev; 4189 mdk_rdev_t *rdev;
4049 4190
4050 if (copy_from_user(&info, arg, sizeof(info))) 4191 if (copy_from_user(&info, arg, sizeof(info)))
4051 return -EFAULT; 4192 return -EFAULT;
4052 4193
4053 nr = info.number; 4194 rdev = find_rdev_nr(mddev, info.number);
4054
4055 rdev = find_rdev_nr(mddev, nr);
4056 if (rdev) { 4195 if (rdev) {
4057 info.major = MAJOR(rdev->bdev->bd_dev); 4196 info.major = MAJOR(rdev->bdev->bd_dev);
4058 info.minor = MINOR(rdev->bdev->bd_dev); 4197 info.minor = MINOR(rdev->bdev->bd_dev);
@@ -4172,8 +4311,12 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
4172 } 4311 }
4173 if (err) 4312 if (err)
4174 export_rdev(rdev); 4313 export_rdev(rdev);
4314 else
4315 sysfs_notify(&rdev->kobj, NULL, "state");
4175 4316
4176 md_update_sb(mddev, 1); 4317 md_update_sb(mddev, 1);
4318 if (mddev->degraded)
4319 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
4177 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4320 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4178 md_wakeup_thread(mddev->thread); 4321 md_wakeup_thread(mddev->thread);
4179 return err; 4322 return err;
@@ -4212,10 +4355,10 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
4212 4355
4213 if (!mddev->persistent) { 4356 if (!mddev->persistent) {
4214 printk(KERN_INFO "md: nonpersistent superblock ...\n"); 4357 printk(KERN_INFO "md: nonpersistent superblock ...\n");
4215 rdev->sb_offset = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; 4358 rdev->sb_start = rdev->bdev->bd_inode->i_size / 512;
4216 } else 4359 } else
4217 rdev->sb_offset = calc_dev_sboffset(rdev->bdev); 4360 rdev->sb_start = calc_dev_sboffset(rdev->bdev);
4218 rdev->size = calc_dev_size(rdev, mddev->chunk_size); 4361 rdev->size = calc_num_sectors(rdev, mddev->chunk_size) / 2;
4219 4362
4220 err = bind_rdev_to_array(rdev, mddev); 4363 err = bind_rdev_to_array(rdev, mddev);
4221 if (err) { 4364 if (err) {
@@ -4232,9 +4375,6 @@ static int hot_remove_disk(mddev_t * mddev, dev_t dev)
4232 char b[BDEVNAME_SIZE]; 4375 char b[BDEVNAME_SIZE];
4233 mdk_rdev_t *rdev; 4376 mdk_rdev_t *rdev;
4234 4377
4235 if (!mddev->pers)
4236 return -ENODEV;
4237
4238 rdev = find_rdev(mddev, dev); 4378 rdev = find_rdev(mddev, dev);
4239 if (!rdev) 4379 if (!rdev)
4240 return -ENXIO; 4380 return -ENXIO;
@@ -4257,7 +4397,6 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev)
4257{ 4397{
4258 char b[BDEVNAME_SIZE]; 4398 char b[BDEVNAME_SIZE];
4259 int err; 4399 int err;
4260 unsigned int size;
4261 mdk_rdev_t *rdev; 4400 mdk_rdev_t *rdev;
4262 4401
4263 if (!mddev->pers) 4402 if (!mddev->pers)
@@ -4285,13 +4424,11 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev)
4285 } 4424 }
4286 4425
4287 if (mddev->persistent) 4426 if (mddev->persistent)
4288 rdev->sb_offset = calc_dev_sboffset(rdev->bdev); 4427 rdev->sb_start = calc_dev_sboffset(rdev->bdev);
4289 else 4428 else
4290 rdev->sb_offset = 4429 rdev->sb_start = rdev->bdev->bd_inode->i_size / 512;
4291 rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
4292 4430
4293 size = calc_dev_size(rdev, mddev->chunk_size); 4431 rdev->size = calc_num_sectors(rdev, mddev->chunk_size) / 2;
4294 rdev->size = size;
4295 4432
4296 if (test_bit(Faulty, &rdev->flags)) { 4433 if (test_bit(Faulty, &rdev->flags)) {
4297 printk(KERN_WARNING 4434 printk(KERN_WARNING
@@ -4476,24 +4613,24 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
4476 return 0; 4613 return 0;
4477} 4614}
4478 4615
4479static int update_size(mddev_t *mddev, unsigned long size) 4616static int update_size(mddev_t *mddev, sector_t num_sectors)
4480{ 4617{
4481 mdk_rdev_t * rdev; 4618 mdk_rdev_t * rdev;
4482 int rv; 4619 int rv;
4483 struct list_head *tmp; 4620 struct list_head *tmp;
4484 int fit = (size == 0); 4621 int fit = (num_sectors == 0);
4485 4622
4486 if (mddev->pers->resize == NULL) 4623 if (mddev->pers->resize == NULL)
4487 return -EINVAL; 4624 return -EINVAL;
4488 /* The "size" is the amount of each device that is used. 4625 /* The "num_sectors" is the number of sectors of each device that
4489 * This can only make sense for arrays with redundancy. 4626 * is used. This can only make sense for arrays with redundancy.
4490 * linear and raid0 always use whatever space is available 4627 * linear and raid0 always use whatever space is available. We can only
4491 * We can only consider changing the size if no resync 4628 * consider changing this number if no resync or reconstruction is
4492 * or reconstruction is happening, and if the new size 4629 * happening, and if the new size is acceptable. It must fit before the
4493 * is acceptable. It must fit before the sb_offset or, 4630 * sb_start or, if that is <data_offset, it must fit before the size
4494 * if that is <data_offset, it must fit before the 4631 * of each device. If num_sectors is zero, we find the largest size
4495 * size of each device. 4632 * that fits.
4496 * If size is zero, we find the largest size that fits. 4633
4497 */ 4634 */
4498 if (mddev->sync_thread) 4635 if (mddev->sync_thread)
4499 return -EBUSY; 4636 return -EBUSY;
@@ -4501,19 +4638,20 @@ static int update_size(mddev_t *mddev, unsigned long size)
4501 sector_t avail; 4638 sector_t avail;
4502 avail = rdev->size * 2; 4639 avail = rdev->size * 2;
4503 4640
4504 if (fit && (size == 0 || size > avail/2)) 4641 if (fit && (num_sectors == 0 || num_sectors > avail))
4505 size = avail/2; 4642 num_sectors = avail;
4506 if (avail < ((sector_t)size << 1)) 4643 if (avail < num_sectors)
4507 return -ENOSPC; 4644 return -ENOSPC;
4508 } 4645 }
4509 rv = mddev->pers->resize(mddev, (sector_t)size *2); 4646 rv = mddev->pers->resize(mddev, num_sectors);
4510 if (!rv) { 4647 if (!rv) {
4511 struct block_device *bdev; 4648 struct block_device *bdev;
4512 4649
4513 bdev = bdget_disk(mddev->gendisk, 0); 4650 bdev = bdget_disk(mddev->gendisk, 0);
4514 if (bdev) { 4651 if (bdev) {
4515 mutex_lock(&bdev->bd_inode->i_mutex); 4652 mutex_lock(&bdev->bd_inode->i_mutex);
4516 i_size_write(bdev->bd_inode, (loff_t)mddev->array_size << 10); 4653 i_size_write(bdev->bd_inode,
4654 (loff_t)mddev->array_sectors << 9);
4517 mutex_unlock(&bdev->bd_inode->i_mutex); 4655 mutex_unlock(&bdev->bd_inode->i_mutex);
4518 bdput(bdev); 4656 bdput(bdev);
4519 } 4657 }
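update_size() now takes a sector count rather than a KiB count, and the cached bdev size is refreshed in bytes computed from sectors rather than from KiB shifted by 10. The unit conversions in play around these hunks (info->size from the ioctl is KiB, rdev->size is still KiB at this point, array_sectors is 512-byte sectors) line up as follows; a worked example, not kernel code:

	/* Unit bookkeeping around update_size():
	 *   ioctl KiB      -> sectors : num_sectors = (sector_t)info->size * 2;
	 *   rdev->size KiB -> sectors : avail       = rdev->size * 2;
	 *   sectors -> bytes for the inode : (loff_t)mddev->array_sectors << 9;
	 *
	 * Example, for an array whose usable size works out to 100 GiB:
	 *   info->size   = 104857600    KiB
	 *   num_sectors  = 209715200    sectors  (info->size * 2)
	 *   i_size       = 107374182400 bytes    (array_sectors << 9)
	 */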
@@ -4588,7 +4726,7 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info)
4588 return mddev->pers->reconfig(mddev, info->layout, -1); 4726 return mddev->pers->reconfig(mddev, info->layout, -1);
4589 } 4727 }
4590 if (info->size >= 0 && mddev->size != info->size) 4728 if (info->size >= 0 && mddev->size != info->size)
4591 rv = update_size(mddev, info->size); 4729 rv = update_size(mddev, (sector_t)info->size * 2);
4592 4730
4593 if (mddev->raid_disks != info->raid_disks) 4731 if (mddev->raid_disks != info->raid_disks)
4594 rv = update_raid_disks(mddev, info->raid_disks); 4732 rv = update_raid_disks(mddev, info->raid_disks);
@@ -4641,6 +4779,12 @@ static int set_disk_faulty(mddev_t *mddev, dev_t dev)
4641 return 0; 4779 return 0;
4642} 4780}
4643 4781
4782/*
4783 * We have a problem here : there is no easy way to give a CHS
4784 * virtual geometry. We currently pretend that we have a 2 heads
4785 * 4 sectors (with a BIG number of cylinders...). This drives
4786 * dosfs just mad... ;-)
4787 */
4644static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo) 4788static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo)
4645{ 4789{
4646 mddev_t *mddev = bdev->bd_disk->private_data; 4790 mddev_t *mddev = bdev->bd_disk->private_data;
@@ -4785,19 +4929,13 @@ static int md_ioctl(struct inode *inode, struct file *file,
4785 goto done_unlock; 4929 goto done_unlock;
4786 4930
4787 case STOP_ARRAY: 4931 case STOP_ARRAY:
4788 err = do_md_stop (mddev, 0); 4932 err = do_md_stop (mddev, 0, 1);
4789 goto done_unlock; 4933 goto done_unlock;
4790 4934
4791 case STOP_ARRAY_RO: 4935 case STOP_ARRAY_RO:
4792 err = do_md_stop (mddev, 1); 4936 err = do_md_stop (mddev, 1, 1);
4793 goto done_unlock; 4937 goto done_unlock;
4794 4938
4795 /*
4796 * We have a problem here : there is no easy way to give a CHS
4797 * virtual geometry. We currently pretend that we have a 2 heads
4798 * 4 sectors (with a BIG number of cylinders...). This drives
4799 * dosfs just mad... ;-)
4800 */
4801 } 4939 }
4802 4940
4803 /* 4941 /*
@@ -4807,13 +4945,12 @@ static int md_ioctl(struct inode *inode, struct file *file,
4807 * here and hit the 'default' below, so only disallow 4945 * here and hit the 'default' below, so only disallow
4808 * 'md' ioctls, and switch to rw mode if started auto-readonly. 4946 * 'md' ioctls, and switch to rw mode if started auto-readonly.
4809 */ 4947 */
4810 if (_IOC_TYPE(cmd) == MD_MAJOR && 4948 if (_IOC_TYPE(cmd) == MD_MAJOR && mddev->ro && mddev->pers) {
4811 mddev->ro && mddev->pers) {
4812 if (mddev->ro == 2) { 4949 if (mddev->ro == 2) {
4813 mddev->ro = 0; 4950 mddev->ro = 0;
4814 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4951 sysfs_notify(&mddev->kobj, NULL, "array_state");
4815 md_wakeup_thread(mddev->thread); 4952 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4816 4953 md_wakeup_thread(mddev->thread);
4817 } else { 4954 } else {
4818 err = -EROFS; 4955 err = -EROFS;
4819 goto abort_unlock; 4956 goto abort_unlock;
@@ -4883,6 +5020,7 @@ static int md_open(struct inode *inode, struct file *file)
4883 5020
4884 err = 0; 5021 err = 0;
4885 mddev_get(mddev); 5022 mddev_get(mddev);
5023 atomic_inc(&mddev->openers);
4886 mddev_unlock(mddev); 5024 mddev_unlock(mddev);
4887 5025
4888 check_disk_change(inode->i_bdev); 5026 check_disk_change(inode->i_bdev);
@@ -4895,6 +5033,7 @@ static int md_release(struct inode *inode, struct file * file)
4895 mddev_t *mddev = inode->i_bdev->bd_disk->private_data; 5033 mddev_t *mddev = inode->i_bdev->bd_disk->private_data;
4896 5034
4897 BUG_ON(!mddev); 5035 BUG_ON(!mddev);
5036 atomic_dec(&mddev->openers);
4898 mddev_put(mddev); 5037 mddev_put(mddev);
4899 5038
4900 return 0; 5039 return 0;
@@ -5029,6 +5168,9 @@ void md_error(mddev_t *mddev, mdk_rdev_t *rdev)
5029 if (!mddev->pers->error_handler) 5168 if (!mddev->pers->error_handler)
5030 return; 5169 return;
5031 mddev->pers->error_handler(mddev,rdev); 5170 mddev->pers->error_handler(mddev,rdev);
5171 if (mddev->degraded)
5172 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
5173 set_bit(StateChanged, &rdev->flags);
5032 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 5174 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
5033 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5175 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5034 md_wakeup_thread(mddev->thread); 5176 md_wakeup_thread(mddev->thread);
@@ -5258,10 +5400,11 @@ static int md_seq_show(struct seq_file *seq, void *v)
5258 if (!list_empty(&mddev->disks)) { 5400 if (!list_empty(&mddev->disks)) {
5259 if (mddev->pers) 5401 if (mddev->pers)
5260 seq_printf(seq, "\n %llu blocks", 5402 seq_printf(seq, "\n %llu blocks",
5261 (unsigned long long)mddev->array_size); 5403 (unsigned long long)
5404 mddev->array_sectors / 2);
5262 else 5405 else
5263 seq_printf(seq, "\n %llu blocks", 5406 seq_printf(seq, "\n %llu blocks",
5264 (unsigned long long)size); 5407 (unsigned long long)size);
5265 } 5408 }
5266 if (mddev->persistent) { 5409 if (mddev->persistent) {
5267 if (mddev->major_version != 0 || 5410 if (mddev->major_version != 0 ||
@@ -5391,12 +5534,12 @@ int unregister_md_personality(struct mdk_personality *p)
5391static int is_mddev_idle(mddev_t *mddev) 5534static int is_mddev_idle(mddev_t *mddev)
5392{ 5535{
5393 mdk_rdev_t * rdev; 5536 mdk_rdev_t * rdev;
5394 struct list_head *tmp;
5395 int idle; 5537 int idle;
5396 long curr_events; 5538 long curr_events;
5397 5539
5398 idle = 1; 5540 idle = 1;
5399 rdev_for_each(rdev, tmp, mddev) { 5541 rcu_read_lock();
5542 rdev_for_each_rcu(rdev, mddev) {
5400 struct gendisk *disk = rdev->bdev->bd_contains->bd_disk; 5543 struct gendisk *disk = rdev->bdev->bd_contains->bd_disk;
5401 curr_events = disk_stat_read(disk, sectors[0]) + 5544 curr_events = disk_stat_read(disk, sectors[0]) +
5402 disk_stat_read(disk, sectors[1]) - 5545 disk_stat_read(disk, sectors[1]) -
@@ -5428,6 +5571,7 @@ static int is_mddev_idle(mddev_t *mddev)
5428 idle = 0; 5571 idle = 0;
5429 } 5572 }
5430 } 5573 }
5574 rcu_read_unlock();
5431 return idle; 5575 return idle;
5432} 5576}
5433 5577
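is_mddev_idle() drops the plain rdev_for_each() walk in favour of an RCU-protected one, in line with the "Protect access to mddev->disks list using RCU" patch in this series. The helper itself is added to md_k.h by that patch rather than in this hunk, so its definition below is an assumption; the usage pattern is what the hunk shows (count_rdevs() is illustrative only):

	/* Presumed definition from include/linux/raid/md_k.h in this series: */
	#define rdev_for_each_rcu(rdev, mddev) \
		list_for_each_entry_rcu(rdev, &((mddev)->disks), same_set)

	/* Reader side, as in is_mddev_idle(): hold rcu_read_lock() across the
	 * traversal and do not sleep inside it. Writers that modify
	 * mddev->disks use the list_*_rcu() variants and wait for a grace
	 * period before freeing an rdev.
	 */
	static int count_rdevs(mddev_t *mddev)
	{
		mdk_rdev_t *rdev;
		int n = 0;

		rcu_read_lock();
		rdev_for_each_rcu(rdev, mddev)
			n++;
		rcu_read_unlock();
		return n;
	}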
@@ -5451,6 +5595,7 @@ void md_done_sync(mddev_t *mddev, int blocks, int ok)
5451 */ 5595 */
5452void md_write_start(mddev_t *mddev, struct bio *bi) 5596void md_write_start(mddev_t *mddev, struct bio *bi)
5453{ 5597{
5598 int did_change = 0;
5454 if (bio_data_dir(bi) != WRITE) 5599 if (bio_data_dir(bi) != WRITE)
5455 return; 5600 return;
5456 5601
@@ -5461,6 +5606,7 @@ void md_write_start(mddev_t *mddev, struct bio *bi)
5461 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5606 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5462 md_wakeup_thread(mddev->thread); 5607 md_wakeup_thread(mddev->thread);
5463 md_wakeup_thread(mddev->sync_thread); 5608 md_wakeup_thread(mddev->sync_thread);
5609 did_change = 1;
5464 } 5610 }
5465 atomic_inc(&mddev->writes_pending); 5611 atomic_inc(&mddev->writes_pending);
5466 if (mddev->safemode == 1) 5612 if (mddev->safemode == 1)
@@ -5471,10 +5617,12 @@ void md_write_start(mddev_t *mddev, struct bio *bi)
5471 mddev->in_sync = 0; 5617 mddev->in_sync = 0;
5472 set_bit(MD_CHANGE_CLEAN, &mddev->flags); 5618 set_bit(MD_CHANGE_CLEAN, &mddev->flags);
5473 md_wakeup_thread(mddev->thread); 5619 md_wakeup_thread(mddev->thread);
5620 did_change = 1;
5474 } 5621 }
5475 spin_unlock_irq(&mddev->write_lock); 5622 spin_unlock_irq(&mddev->write_lock);
5476 sysfs_notify(&mddev->kobj, NULL, "array_state");
5477 } 5623 }
5624 if (did_change)
5625 sysfs_notify(&mddev->kobj, NULL, "array_state");
5478 wait_event(mddev->sb_wait, 5626 wait_event(mddev->sb_wait,
5479 !test_bit(MD_CHANGE_CLEAN, &mddev->flags) && 5627 !test_bit(MD_CHANGE_CLEAN, &mddev->flags) &&
5480 !test_bit(MD_CHANGE_PENDING, &mddev->flags)); 5628 !test_bit(MD_CHANGE_PENDING, &mddev->flags));
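md_write_start() now only records, under write_lock, whether the array state changed, and performs the sysfs_notify() after the spinlock has been released; the likely motivation is that sysfs_notify() takes sysfs locks and may sleep, which is not allowed under spin_lock_irq(). The same shape is used again in md_check_recovery() further down. The pattern in isolation, as a sketch mirroring the hunk above:

	int did_change = 0;

	spin_lock_irq(&mddev->write_lock);
	if (mddev->in_sync) {
		mddev->in_sync = 0;
		set_bit(MD_CHANGE_CLEAN, &mddev->flags);
		md_wakeup_thread(mddev->thread);
		did_change = 1;
	}
	spin_unlock_irq(&mddev->write_lock);

	if (did_change)		/* safe to sleep here */
		sysfs_notify(&mddev->kobj, NULL, "array_state");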
@@ -5495,13 +5643,18 @@ void md_write_end(mddev_t *mddev)
5495 * may proceed without blocking. It is important to call this before 5643 * may proceed without blocking. It is important to call this before
5496 * attempting a GFP_KERNEL allocation while holding the mddev lock. 5644 * attempting a GFP_KERNEL allocation while holding the mddev lock.
5497 * Must be called with mddev_lock held. 5645 * Must be called with mddev_lock held.
5646 *
5647 * In the ->external case MD_CHANGE_CLEAN can not be cleared until mddev->lock
5648 * is dropped, so return -EAGAIN after notifying userspace.
5498 */ 5649 */
5499void md_allow_write(mddev_t *mddev) 5650int md_allow_write(mddev_t *mddev)
5500{ 5651{
5501 if (!mddev->pers) 5652 if (!mddev->pers)
5502 return; 5653 return 0;
5503 if (mddev->ro) 5654 if (mddev->ro)
5504 return; 5655 return 0;
5656 if (!mddev->pers->sync_request)
5657 return 0;
5505 5658
5506 spin_lock_irq(&mddev->write_lock); 5659 spin_lock_irq(&mddev->write_lock);
5507 if (mddev->in_sync) { 5660 if (mddev->in_sync) {
@@ -5512,14 +5665,14 @@ void md_allow_write(mddev_t *mddev)
5512 mddev->safemode = 1; 5665 mddev->safemode = 1;
5513 spin_unlock_irq(&mddev->write_lock); 5666 spin_unlock_irq(&mddev->write_lock);
5514 md_update_sb(mddev, 0); 5667 md_update_sb(mddev, 0);
5515
5516 sysfs_notify(&mddev->kobj, NULL, "array_state"); 5668 sysfs_notify(&mddev->kobj, NULL, "array_state");
5517 /* wait for the dirty state to be recorded in the metadata */
5518 wait_event(mddev->sb_wait,
5519 !test_bit(MD_CHANGE_CLEAN, &mddev->flags) &&
5520 !test_bit(MD_CHANGE_PENDING, &mddev->flags));
5521 } else 5669 } else
5522 spin_unlock_irq(&mddev->write_lock); 5670 spin_unlock_irq(&mddev->write_lock);
5671
5672 if (test_bit(MD_CHANGE_CLEAN, &mddev->flags))
5673 return -EAGAIN;
5674 else
5675 return 0;
5523} 5676}
5524EXPORT_SYMBOL_GPL(md_allow_write); 5677EXPORT_SYMBOL_GPL(md_allow_write);
5525 5678
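md_allow_write() now reports back to the caller: it returns 0 when writes are already allowed (or the personality has no sync_request), and -EAGAIN in the ->external case where it has marked the metadata dirty but cannot wait for MD_CHANGE_CLEAN to clear while the mddev lock is held. Two caller patterns appear in this series; a sketch of both (buf and len are placeholders):

	/* 1) Only an allocation follows, so -EAGAIN merely means "avoid
	 *    allocations that could recurse into writeback on this array",
	 *    as in get_bitmap_file() above.
	 */
	if (md_allow_write(mddev))
		buf = kmalloc(len, GFP_NOIO);
	else
		buf = kmalloc(len, GFP_KERNEL);

	/* 2) The operation cannot continue safely, so the error is
	 *    propagated, as in raid1_reshape() and resize_stripes() below.
	 */
	err = md_allow_write(mddev);
	if (err)
		return err;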
@@ -5625,9 +5778,11 @@ void md_do_sync(mddev_t *mddev)
5625 max_sectors = mddev->resync_max_sectors; 5778 max_sectors = mddev->resync_max_sectors;
5626 mddev->resync_mismatches = 0; 5779 mddev->resync_mismatches = 0;
5627 /* we don't use the checkpoint if there's a bitmap */ 5780 /* we don't use the checkpoint if there's a bitmap */
5628 if (!mddev->bitmap && 5781 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
5629 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 5782 j = mddev->resync_min;
5783 else if (!mddev->bitmap)
5630 j = mddev->recovery_cp; 5784 j = mddev->recovery_cp;
5785
5631 } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 5786 } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
5632 max_sectors = mddev->size << 1; 5787 max_sectors = mddev->size << 1;
5633 else { 5788 else {
@@ -5796,6 +5951,7 @@ void md_do_sync(mddev_t *mddev)
5796 5951
5797 skip: 5952 skip:
5798 mddev->curr_resync = 0; 5953 mddev->curr_resync = 0;
5954 mddev->resync_min = 0;
5799 mddev->resync_max = MaxSector; 5955 mddev->resync_max = MaxSector;
5800 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 5956 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
5801 wake_up(&resync_wait); 5957 wake_up(&resync_wait);
@@ -5845,7 +6001,8 @@ static int remove_and_add_spares(mddev_t *mddev)
5845 if (rdev->raid_disk < 0 6001 if (rdev->raid_disk < 0
5846 && !test_bit(Faulty, &rdev->flags)) { 6002 && !test_bit(Faulty, &rdev->flags)) {
5847 rdev->recovery_offset = 0; 6003 rdev->recovery_offset = 0;
5848 if (mddev->pers->hot_add_disk(mddev,rdev)) { 6004 if (mddev->pers->
6005 hot_add_disk(mddev, rdev) == 0) {
5849 char nm[20]; 6006 char nm[20];
5850 sprintf(nm, "rd%d", rdev->raid_disk); 6007 sprintf(nm, "rd%d", rdev->raid_disk);
5851 if (sysfs_create_link(&mddev->kobj, 6008 if (sysfs_create_link(&mddev->kobj,
@@ -5920,23 +6077,31 @@ void md_check_recovery(mddev_t *mddev)
5920 int spares = 0; 6077 int spares = 0;
5921 6078
5922 if (!mddev->external) { 6079 if (!mddev->external) {
6080 int did_change = 0;
5923 spin_lock_irq(&mddev->write_lock); 6081 spin_lock_irq(&mddev->write_lock);
5924 if (mddev->safemode && 6082 if (mddev->safemode &&
5925 !atomic_read(&mddev->writes_pending) && 6083 !atomic_read(&mddev->writes_pending) &&
5926 !mddev->in_sync && 6084 !mddev->in_sync &&
5927 mddev->recovery_cp == MaxSector) { 6085 mddev->recovery_cp == MaxSector) {
5928 mddev->in_sync = 1; 6086 mddev->in_sync = 1;
6087 did_change = 1;
5929 if (mddev->persistent) 6088 if (mddev->persistent)
5930 set_bit(MD_CHANGE_CLEAN, &mddev->flags); 6089 set_bit(MD_CHANGE_CLEAN, &mddev->flags);
5931 } 6090 }
5932 if (mddev->safemode == 1) 6091 if (mddev->safemode == 1)
5933 mddev->safemode = 0; 6092 mddev->safemode = 0;
5934 spin_unlock_irq(&mddev->write_lock); 6093 spin_unlock_irq(&mddev->write_lock);
6094 if (did_change)
6095 sysfs_notify(&mddev->kobj, NULL, "array_state");
5935 } 6096 }
5936 6097
5937 if (mddev->flags) 6098 if (mddev->flags)
5938 md_update_sb(mddev, 0); 6099 md_update_sb(mddev, 0);
5939 6100
6101 rdev_for_each(rdev, rtmp, mddev)
6102 if (test_and_clear_bit(StateChanged, &rdev->flags))
6103 sysfs_notify(&rdev->kobj, NULL, "state");
6104
5940 6105
5941 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && 6106 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
5942 !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) { 6107 !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
@@ -5951,7 +6116,9 @@ void md_check_recovery(mddev_t *mddev)
5951 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 6116 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
5952 /* success...*/ 6117 /* success...*/
5953 /* activate any spares */ 6118 /* activate any spares */
5954 mddev->pers->spare_active(mddev); 6119 if (mddev->pers->spare_active(mddev))
6120 sysfs_notify(&mddev->kobj, NULL,
6121 "degraded");
5955 } 6122 }
5956 md_update_sb(mddev, 1); 6123 md_update_sb(mddev, 1);
5957 6124
@@ -5965,13 +6132,18 @@ void md_check_recovery(mddev_t *mddev)
5965 mddev->recovery = 0; 6132 mddev->recovery = 0;
5966 /* flag recovery needed just to double check */ 6133 /* flag recovery needed just to double check */
5967 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6134 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6135 sysfs_notify(&mddev->kobj, NULL, "sync_action");
5968 md_new_event(mddev); 6136 md_new_event(mddev);
5969 goto unlock; 6137 goto unlock;
5970 } 6138 }
6139 /* Set RUNNING before clearing NEEDED to avoid
6140 * any transients in the value of "sync_action".
6141 */
6142 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
6143 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5971 /* Clear some bits that don't mean anything, but 6144 /* Clear some bits that don't mean anything, but
5972 * might be left set 6145 * might be left set
5973 */ 6146 */
5974 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5975 clear_bit(MD_RECOVERY_INTR, &mddev->recovery); 6147 clear_bit(MD_RECOVERY_INTR, &mddev->recovery);
5976 clear_bit(MD_RECOVERY_DONE, &mddev->recovery); 6148 clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
5977 6149
@@ -5989,17 +6161,19 @@ void md_check_recovery(mddev_t *mddev)
5989 /* Cannot proceed */ 6161 /* Cannot proceed */
5990 goto unlock; 6162 goto unlock;
5991 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 6163 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
6164 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
5992 } else if ((spares = remove_and_add_spares(mddev))) { 6165 } else if ((spares = remove_and_add_spares(mddev))) {
5993 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 6166 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
5994 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 6167 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
6168 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
5995 } else if (mddev->recovery_cp < MaxSector) { 6169 } else if (mddev->recovery_cp < MaxSector) {
5996 set_bit(MD_RECOVERY_SYNC, &mddev->recovery); 6170 set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
6171 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
5997 } else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 6172 } else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
5998 /* nothing to be done ... */ 6173 /* nothing to be done ... */
5999 goto unlock; 6174 goto unlock;
6000 6175
6001 if (mddev->pers->sync_request) { 6176 if (mddev->pers->sync_request) {
6002 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
6003 if (spares && mddev->bitmap && ! mddev->bitmap->file) { 6177 if (spares && mddev->bitmap && ! mddev->bitmap->file) {
6004 /* We are adding a device or devices to an array 6178 /* We are adding a device or devices to an array
6005 * which has the bitmap stored on all devices. 6179 * which has the bitmap stored on all devices.
@@ -6018,9 +6192,16 @@ void md_check_recovery(mddev_t *mddev)
6018 mddev->recovery = 0; 6192 mddev->recovery = 0;
6019 } else 6193 } else
6020 md_wakeup_thread(mddev->sync_thread); 6194 md_wakeup_thread(mddev->sync_thread);
6195 sysfs_notify(&mddev->kobj, NULL, "sync_action");
6021 md_new_event(mddev); 6196 md_new_event(mddev);
6022 } 6197 }
6023 unlock: 6198 unlock:
6199 if (!mddev->sync_thread) {
6200 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
6201 if (test_and_clear_bit(MD_RECOVERY_RECOVER,
6202 &mddev->recovery))
6203 sysfs_notify(&mddev->kobj, NULL, "sync_action");
6204 }
6024 mddev_unlock(mddev); 6205 mddev_unlock(mddev);
6025 } 6206 }
6026} 6207}
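md_check_recovery() now sets MD_RECOVERY_RUNNING before clearing MD_RECOVERY_NEEDED, where previously RUNNING was only set further down, just before the sync thread was registered. The comment in the hunk gives the reason: the sysfs "sync_action" value is derived from these bits, so clearing NEEDED first opens a window in which a busy array momentarily reads as idle. Roughly, and paraphrasing rather than quoting the attribute code:

	/* How a sync_action reader is affected (paraphrased):
	 *
	 *   if (MD_RECOVERY_RUNNING or MD_RECOVERY_NEEDED is set)
	 *           report resync / recover / check / repair / reshape;
	 *   else
	 *           report "idle";
	 *
	 * Old order: clear NEEDED, decide what to do, set RUNNING
	 *            -> a poller woken in between sees neither bit: "idle".
	 * New order: set RUNNING, then clear NEEDED
	 *            -> one of the two bits stays set, no transient "idle".
	 */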
@@ -6047,7 +6228,7 @@ static int md_notify_reboot(struct notifier_block *this,
6047 6228
6048 for_each_mddev(mddev, tmp) 6229 for_each_mddev(mddev, tmp)
6049 if (mddev_trylock(mddev)) { 6230 if (mddev_trylock(mddev)) {
6050 do_md_stop (mddev, 1); 6231 do_md_stop (mddev, 1, 0);
6051 mddev_unlock(mddev); 6232 mddev_unlock(mddev);
6052 } 6233 }
6053 /* 6234 /*
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index e968116e0de9..c4779ccba1c3 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -281,13 +281,18 @@ static int multipath_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
281{ 281{
282 multipath_conf_t *conf = mddev->private; 282 multipath_conf_t *conf = mddev->private;
283 struct request_queue *q; 283 struct request_queue *q;
284 int found = 0; 284 int err = -EEXIST;
285 int path; 285 int path;
286 struct multipath_info *p; 286 struct multipath_info *p;
287 int first = 0;
288 int last = mddev->raid_disks - 1;
289
290 if (rdev->raid_disk >= 0)
291 first = last = rdev->raid_disk;
287 292
288 print_multipath_conf(conf); 293 print_multipath_conf(conf);
289 294
290 for (path=0; path<mddev->raid_disks; path++) 295 for (path = first; path <= last; path++)
291 if ((p=conf->multipaths+path)->rdev == NULL) { 296 if ((p=conf->multipaths+path)->rdev == NULL) {
292 q = rdev->bdev->bd_disk->queue; 297 q = rdev->bdev->bd_disk->queue;
293 blk_queue_stack_limits(mddev->queue, q); 298 blk_queue_stack_limits(mddev->queue, q);
@@ -307,11 +312,13 @@ static int multipath_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
307 rdev->raid_disk = path; 312 rdev->raid_disk = path;
308 set_bit(In_sync, &rdev->flags); 313 set_bit(In_sync, &rdev->flags);
309 rcu_assign_pointer(p->rdev, rdev); 314 rcu_assign_pointer(p->rdev, rdev);
310 found = 1; 315 err = 0;
316 break;
311 } 317 }
312 318
313 print_multipath_conf(conf); 319 print_multipath_conf(conf);
314 return found; 320
321 return err;
315} 322}
316 323
317static int multipath_remove_disk(mddev_t *mddev, int number) 324static int multipath_remove_disk(mddev_t *mddev, int number)
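multipath_add_disk(), and the raid1/raid10 counterparts below, stop returning a 0/1 "found" flag and instead return 0 on success or a negative errno; they also honour a slot preselected by the caller, narrowing the search when rdev->raid_disk is already set. The matching caller change is in remove_and_add_spares() above, which now tests hot_add_disk() == 0. A condensed sketch of the new convention (loop body trimmed, err/path declarations omitted):

	/* New ->hot_add_disk() convention in this series:
	 *   0        success, rdev->raid_disk points at the chosen slot
	 *   -EEXIST  no free slot in [first, last]
	 *   -EBUSY / -EINVAL  personality-specific refusals (raid10)
	 */
	int first = 0, last = mddev->raid_disks - 1;

	if (rdev->raid_disk >= 0)		/* caller picked the slot */
		first = last = rdev->raid_disk;

	err = -EEXIST;
	for (path = first; path <= last; path++)
		if (conf->multipaths[path].rdev == NULL) {
			/* ...stack queue limits, bind rdev to this slot... */
			rdev->raid_disk = path;
			err = 0;
			break;
		}
	return err;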
@@ -497,7 +504,7 @@ static int multipath_run (mddev_t *mddev)
497 /* 504 /*
498 * Ok, everything is just fine now 505 * Ok, everything is just fine now
499 */ 506 */
500 mddev->array_size = mddev->size; 507 mddev->array_sectors = mddev->size * 2;
501 508
502 mddev->queue->unplug_fn = multipath_unplug; 509 mddev->queue->unplug_fn = multipath_unplug;
503 mddev->queue->backing_dev_info.congested_fn = multipath_congested; 510 mddev->queue->backing_dev_info.congested_fn = multipath_congested;
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index bcbb82594a19..183610635661 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -295,16 +295,16 @@ static int raid0_run (mddev_t *mddev)
295 goto out_free_conf; 295 goto out_free_conf;
296 296
297 /* calculate array device size */ 297 /* calculate array device size */
298 mddev->array_size = 0; 298 mddev->array_sectors = 0;
299 rdev_for_each(rdev, tmp, mddev) 299 rdev_for_each(rdev, tmp, mddev)
300 mddev->array_size += rdev->size; 300 mddev->array_sectors += rdev->size * 2;
301 301
302 printk("raid0 : md_size is %llu blocks.\n", 302 printk("raid0 : md_size is %llu blocks.\n",
303 (unsigned long long)mddev->array_size); 303 (unsigned long long)mddev->array_sectors / 2);
304 printk("raid0 : conf->hash_spacing is %llu blocks.\n", 304 printk("raid0 : conf->hash_spacing is %llu blocks.\n",
305 (unsigned long long)conf->hash_spacing); 305 (unsigned long long)conf->hash_spacing);
306 { 306 {
307 sector_t s = mddev->array_size; 307 sector_t s = mddev->array_sectors / 2;
308 sector_t space = conf->hash_spacing; 308 sector_t space = conf->hash_spacing;
309 int round; 309 int round;
310 conf->preshift = 0; 310 conf->preshift = 0;
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index c610b947218a..03a5ab705c20 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1100,11 +1100,16 @@ static int raid1_spare_active(mddev_t *mddev)
1100static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) 1100static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
1101{ 1101{
1102 conf_t *conf = mddev->private; 1102 conf_t *conf = mddev->private;
1103 int found = 0; 1103 int err = -EEXIST;
1104 int mirror = 0; 1104 int mirror = 0;
1105 mirror_info_t *p; 1105 mirror_info_t *p;
1106 int first = 0;
1107 int last = mddev->raid_disks - 1;
1106 1108
1107 for (mirror=0; mirror < mddev->raid_disks; mirror++) 1109 if (rdev->raid_disk >= 0)
1110 first = last = rdev->raid_disk;
1111
1112 for (mirror = first; mirror <= last; mirror++)
1108 if ( !(p=conf->mirrors+mirror)->rdev) { 1113 if ( !(p=conf->mirrors+mirror)->rdev) {
1109 1114
1110 blk_queue_stack_limits(mddev->queue, 1115 blk_queue_stack_limits(mddev->queue,
@@ -1119,7 +1124,7 @@ static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
1119 1124
1120 p->head_position = 0; 1125 p->head_position = 0;
1121 rdev->raid_disk = mirror; 1126 rdev->raid_disk = mirror;
1122 found = 1; 1127 err = 0;
1123 /* As all devices are equivalent, we don't need a full recovery 1128 /* As all devices are equivalent, we don't need a full recovery
1124 * if this was recently any drive of the array 1129 * if this was recently any drive of the array
1125 */ 1130 */
@@ -1130,7 +1135,7 @@ static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
1130 } 1135 }
1131 1136
1132 print_conf(conf); 1137 print_conf(conf);
1133 return found; 1138 return err;
1134} 1139}
1135 1140
1136static int raid1_remove_disk(mddev_t *mddev, int number) 1141static int raid1_remove_disk(mddev_t *mddev, int number)
@@ -2038,7 +2043,7 @@ static int run(mddev_t *mddev)
2038 /* 2043 /*
2039 * Ok, everything is just fine now 2044 * Ok, everything is just fine now
2040 */ 2045 */
2041 mddev->array_size = mddev->size; 2046 mddev->array_sectors = mddev->size * 2;
2042 2047
2043 mddev->queue->unplug_fn = raid1_unplug; 2048 mddev->queue->unplug_fn = raid1_unplug;
2044 mddev->queue->backing_dev_info.congested_fn = raid1_congested; 2049 mddev->queue->backing_dev_info.congested_fn = raid1_congested;
@@ -2100,14 +2105,15 @@ static int raid1_resize(mddev_t *mddev, sector_t sectors)
2100 * any io in the removed space completes, but it hardly seems 2105 * any io in the removed space completes, but it hardly seems
2101 * worth it. 2106 * worth it.
2102 */ 2107 */
2103 mddev->array_size = sectors>>1; 2108 mddev->array_sectors = sectors;
2104 set_capacity(mddev->gendisk, mddev->array_size << 1); 2109 set_capacity(mddev->gendisk, mddev->array_sectors);
2105 mddev->changed = 1; 2110 mddev->changed = 1;
2106 if (mddev->array_size > mddev->size && mddev->recovery_cp == MaxSector) { 2111 if (mddev->array_sectors / 2 > mddev->size &&
2112 mddev->recovery_cp == MaxSector) {
2107 mddev->recovery_cp = mddev->size << 1; 2113 mddev->recovery_cp = mddev->size << 1;
2108 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 2114 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
2109 } 2115 }
2110 mddev->size = mddev->array_size; 2116 mddev->size = mddev->array_sectors / 2;
2111 mddev->resync_max_sectors = sectors; 2117 mddev->resync_max_sectors = sectors;
2112 return 0; 2118 return 0;
2113} 2119}
@@ -2131,7 +2137,7 @@ static int raid1_reshape(mddev_t *mddev)
2131 conf_t *conf = mddev_to_conf(mddev); 2137 conf_t *conf = mddev_to_conf(mddev);
2132 int cnt, raid_disks; 2138 int cnt, raid_disks;
2133 unsigned long flags; 2139 unsigned long flags;
2134 int d, d2; 2140 int d, d2, err;
2135 2141
2136 /* Cannot change chunk_size, layout, or level */ 2142 /* Cannot change chunk_size, layout, or level */
2137 if (mddev->chunk_size != mddev->new_chunk || 2143 if (mddev->chunk_size != mddev->new_chunk ||
@@ -2143,7 +2149,9 @@ static int raid1_reshape(mddev_t *mddev)
2143 return -EINVAL; 2149 return -EINVAL;
2144 } 2150 }
2145 2151
2146 md_allow_write(mddev); 2152 err = md_allow_write(mddev);
2153 if (err)
2154 return err;
2147 2155
2148 raid_disks = mddev->raid_disks + mddev->delta_disks; 2156 raid_disks = mddev->raid_disks + mddev->delta_disks;
2149 2157
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 22bb2b1b886d..159535d73567 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1114,24 +1114,30 @@ static int raid10_spare_active(mddev_t *mddev)
1114static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) 1114static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
1115{ 1115{
1116 conf_t *conf = mddev->private; 1116 conf_t *conf = mddev->private;
1117 int found = 0; 1117 int err = -EEXIST;
1118 int mirror; 1118 int mirror;
1119 mirror_info_t *p; 1119 mirror_info_t *p;
1120 int first = 0;
1121 int last = mddev->raid_disks - 1;
1120 1122
1121 if (mddev->recovery_cp < MaxSector) 1123 if (mddev->recovery_cp < MaxSector)
1122 /* only hot-add to in-sync arrays, as recovery is 1124 /* only hot-add to in-sync arrays, as recovery is
1123 * very different from resync 1125 * very different from resync
1124 */ 1126 */
1125 return 0; 1127 return -EBUSY;
1126 if (!enough(conf)) 1128 if (!enough(conf))
1127 return 0; 1129 return -EINVAL;
1130
1131 if (rdev->raid_disk)
1132 first = last = rdev->raid_disk;
1128 1133
1129 if (rdev->saved_raid_disk >= 0 && 1134 if (rdev->saved_raid_disk >= 0 &&
1135 rdev->saved_raid_disk >= first &&
1130 conf->mirrors[rdev->saved_raid_disk].rdev == NULL) 1136 conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
1131 mirror = rdev->saved_raid_disk; 1137 mirror = rdev->saved_raid_disk;
1132 else 1138 else
1133 mirror = 0; 1139 mirror = first;
1134 for ( ; mirror < mddev->raid_disks; mirror++) 1140 for ( ; mirror <= last ; mirror++)
1135 if ( !(p=conf->mirrors+mirror)->rdev) { 1141 if ( !(p=conf->mirrors+mirror)->rdev) {
1136 1142
1137 blk_queue_stack_limits(mddev->queue, 1143 blk_queue_stack_limits(mddev->queue,
@@ -1146,7 +1152,7 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
1146 1152
1147 p->head_position = 0; 1153 p->head_position = 0;
1148 rdev->raid_disk = mirror; 1154 rdev->raid_disk = mirror;
1149 found = 1; 1155 err = 0;
1150 if (rdev->saved_raid_disk != mirror) 1156 if (rdev->saved_raid_disk != mirror)
1151 conf->fullsync = 1; 1157 conf->fullsync = 1;
1152 rcu_assign_pointer(p->rdev, rdev); 1158 rcu_assign_pointer(p->rdev, rdev);
@@ -1154,7 +1160,7 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
1154 } 1160 }
1155 1161
1156 print_conf(conf); 1162 print_conf(conf);
1157 return found; 1163 return err;
1158} 1164}
1159 1165
1160static int raid10_remove_disk(mddev_t *mddev, int number) 1166static int raid10_remove_disk(mddev_t *mddev, int number)
@@ -2159,7 +2165,7 @@ static int run(mddev_t *mddev)
2159 /* 2165 /*
2160 * Ok, everything is just fine now 2166 * Ok, everything is just fine now
2161 */ 2167 */
2162 mddev->array_size = size << (conf->chunk_shift-1); 2168 mddev->array_sectors = size << conf->chunk_shift;
2163 mddev->resync_max_sectors = size << conf->chunk_shift; 2169 mddev->resync_max_sectors = size << conf->chunk_shift;
2164 2170
2165 mddev->queue->unplug_fn = raid10_unplug; 2171 mddev->queue->unplug_fn = raid10_unplug;
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 9ce7154845c6..55e7c56045a0 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -115,15 +115,20 @@ static void return_io(struct bio *return_bi)
115 return_bi = bi->bi_next; 115 return_bi = bi->bi_next;
116 bi->bi_next = NULL; 116 bi->bi_next = NULL;
117 bi->bi_size = 0; 117 bi->bi_size = 0;
118 bi->bi_end_io(bi, 118 bio_endio(bi, 0);
119 test_bit(BIO_UPTODATE, &bi->bi_flags)
120 ? 0 : -EIO);
121 bi = return_bi; 119 bi = return_bi;
122 } 120 }
123} 121}
124 122
125static void print_raid5_conf (raid5_conf_t *conf); 123static void print_raid5_conf (raid5_conf_t *conf);
126 124
125static int stripe_operations_active(struct stripe_head *sh)
126{
127 return sh->check_state || sh->reconstruct_state ||
128 test_bit(STRIPE_BIOFILL_RUN, &sh->state) ||
129 test_bit(STRIPE_COMPUTE_RUN, &sh->state);
130}
131
127static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh) 132static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
128{ 133{
129 if (atomic_dec_and_test(&sh->count)) { 134 if (atomic_dec_and_test(&sh->count)) {
@@ -143,7 +148,7 @@ static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
143 } 148 }
144 md_wakeup_thread(conf->mddev->thread); 149 md_wakeup_thread(conf->mddev->thread);
145 } else { 150 } else {
146 BUG_ON(sh->ops.pending); 151 BUG_ON(stripe_operations_active(sh));
147 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { 152 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
148 atomic_dec(&conf->preread_active_stripes); 153 atomic_dec(&conf->preread_active_stripes);
149 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) 154 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
@@ -245,7 +250,7 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx, int
245 250
246 BUG_ON(atomic_read(&sh->count) != 0); 251 BUG_ON(atomic_read(&sh->count) != 0);
247 BUG_ON(test_bit(STRIPE_HANDLE, &sh->state)); 252 BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
248 BUG_ON(sh->ops.pending || sh->ops.ack || sh->ops.complete); 253 BUG_ON(stripe_operations_active(sh));
249 254
250 CHECK_DEVLOCK(); 255 CHECK_DEVLOCK();
251 pr_debug("init_stripe called, stripe %llu\n", 256 pr_debug("init_stripe called, stripe %llu\n",
@@ -346,62 +351,18 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector
346 return sh; 351 return sh;
347} 352}
348 353
349/* test_and_ack_op() ensures that we only dequeue an operation once */
350#define test_and_ack_op(op, pend) \
351do { \
352 if (test_bit(op, &sh->ops.pending) && \
353 !test_bit(op, &sh->ops.complete)) { \
354 if (test_and_set_bit(op, &sh->ops.ack)) \
355 clear_bit(op, &pend); \
356 else \
357 ack++; \
358 } else \
359 clear_bit(op, &pend); \
360} while (0)
361
362/* find new work to run, do not resubmit work that is already
363 * in flight
364 */
365static unsigned long get_stripe_work(struct stripe_head *sh)
366{
367 unsigned long pending;
368 int ack = 0;
369
370 pending = sh->ops.pending;
371
372 test_and_ack_op(STRIPE_OP_BIOFILL, pending);
373 test_and_ack_op(STRIPE_OP_COMPUTE_BLK, pending);
374 test_and_ack_op(STRIPE_OP_PREXOR, pending);
375 test_and_ack_op(STRIPE_OP_BIODRAIN, pending);
376 test_and_ack_op(STRIPE_OP_POSTXOR, pending);
377 test_and_ack_op(STRIPE_OP_CHECK, pending);
378 if (test_and_clear_bit(STRIPE_OP_IO, &sh->ops.pending))
379 ack++;
380
381 sh->ops.count -= ack;
382 if (unlikely(sh->ops.count < 0)) {
383 printk(KERN_ERR "pending: %#lx ops.pending: %#lx ops.ack: %#lx "
384 "ops.complete: %#lx\n", pending, sh->ops.pending,
385 sh->ops.ack, sh->ops.complete);
386 BUG();
387 }
388
389 return pending;
390}
391
392static void 354static void
393raid5_end_read_request(struct bio *bi, int error); 355raid5_end_read_request(struct bio *bi, int error);
394static void 356static void
395raid5_end_write_request(struct bio *bi, int error); 357raid5_end_write_request(struct bio *bi, int error);
396 358
397static void ops_run_io(struct stripe_head *sh) 359static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
398{ 360{
399 raid5_conf_t *conf = sh->raid_conf; 361 raid5_conf_t *conf = sh->raid_conf;
400 int i, disks = sh->disks; 362 int i, disks = sh->disks;
401 363
402 might_sleep(); 364 might_sleep();
403 365
404 set_bit(STRIPE_IO_STARTED, &sh->state);
405 for (i = disks; i--; ) { 366 for (i = disks; i--; ) {
406 int rw; 367 int rw;
407 struct bio *bi; 368 struct bio *bi;
@@ -430,11 +391,11 @@ static void ops_run_io(struct stripe_head *sh)
430 rcu_read_unlock(); 391 rcu_read_unlock();
431 392
432 if (rdev) { 393 if (rdev) {
433 if (test_bit(STRIPE_SYNCING, &sh->state) || 394 if (s->syncing || s->expanding || s->expanded)
434 test_bit(STRIPE_EXPAND_SOURCE, &sh->state) ||
435 test_bit(STRIPE_EXPAND_READY, &sh->state))
436 md_sync_acct(rdev->bdev, STRIPE_SECTORS); 395 md_sync_acct(rdev->bdev, STRIPE_SECTORS);
437 396
397 set_bit(STRIPE_IO_STARTED, &sh->state);
398
438 bi->bi_bdev = rdev->bdev; 399 bi->bi_bdev = rdev->bdev;
439 pr_debug("%s: for %llu schedule op %ld on disc %d\n", 400 pr_debug("%s: for %llu schedule op %ld on disc %d\n",
440 __func__, (unsigned long long)sh->sector, 401 __func__, (unsigned long long)sh->sector,
@@ -528,38 +489,34 @@ static void ops_complete_biofill(void *stripe_head_ref)
528 (unsigned long long)sh->sector); 489 (unsigned long long)sh->sector);
529 490
530 /* clear completed biofills */ 491 /* clear completed biofills */
492 spin_lock_irq(&conf->device_lock);
531 for (i = sh->disks; i--; ) { 493 for (i = sh->disks; i--; ) {
532 struct r5dev *dev = &sh->dev[i]; 494 struct r5dev *dev = &sh->dev[i];
533 495
534 /* acknowledge completion of a biofill operation */ 496 /* acknowledge completion of a biofill operation */
535 /* and check if we need to reply to a read request, 497 /* and check if we need to reply to a read request,
536 * new R5_Wantfill requests are held off until 498 * new R5_Wantfill requests are held off until
537 * !test_bit(STRIPE_OP_BIOFILL, &sh->ops.pending) 499 * !STRIPE_BIOFILL_RUN
538 */ 500 */
539 if (test_and_clear_bit(R5_Wantfill, &dev->flags)) { 501 if (test_and_clear_bit(R5_Wantfill, &dev->flags)) {
540 struct bio *rbi, *rbi2; 502 struct bio *rbi, *rbi2;
541 503
542 /* The access to dev->read is outside of the
543 * spin_lock_irq(&conf->device_lock), but is protected
544 * by the STRIPE_OP_BIOFILL pending bit
545 */
546 BUG_ON(!dev->read); 504 BUG_ON(!dev->read);
547 rbi = dev->read; 505 rbi = dev->read;
548 dev->read = NULL; 506 dev->read = NULL;
549 while (rbi && rbi->bi_sector < 507 while (rbi && rbi->bi_sector <
550 dev->sector + STRIPE_SECTORS) { 508 dev->sector + STRIPE_SECTORS) {
551 rbi2 = r5_next_bio(rbi, dev->sector); 509 rbi2 = r5_next_bio(rbi, dev->sector);
552 spin_lock_irq(&conf->device_lock);
553 if (--rbi->bi_phys_segments == 0) { 510 if (--rbi->bi_phys_segments == 0) {
554 rbi->bi_next = return_bi; 511 rbi->bi_next = return_bi;
555 return_bi = rbi; 512 return_bi = rbi;
556 } 513 }
557 spin_unlock_irq(&conf->device_lock);
558 rbi = rbi2; 514 rbi = rbi2;
559 } 515 }
560 } 516 }
561 } 517 }
562 set_bit(STRIPE_OP_BIOFILL, &sh->ops.complete); 518 spin_unlock_irq(&conf->device_lock);
519 clear_bit(STRIPE_BIOFILL_RUN, &sh->state);
563 520
564 return_io(return_bi); 521 return_io(return_bi);
565 522
@@ -610,13 +567,14 @@ static void ops_complete_compute5(void *stripe_head_ref)
610 set_bit(R5_UPTODATE, &tgt->flags); 567 set_bit(R5_UPTODATE, &tgt->flags);
611 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); 568 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
612 clear_bit(R5_Wantcompute, &tgt->flags); 569 clear_bit(R5_Wantcompute, &tgt->flags);
613 set_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete); 570 clear_bit(STRIPE_COMPUTE_RUN, &sh->state);
571 if (sh->check_state == check_state_compute_run)
572 sh->check_state = check_state_compute_result;
614 set_bit(STRIPE_HANDLE, &sh->state); 573 set_bit(STRIPE_HANDLE, &sh->state);
615 release_stripe(sh); 574 release_stripe(sh);
616} 575}
617 576
618static struct dma_async_tx_descriptor * 577static struct dma_async_tx_descriptor *ops_run_compute5(struct stripe_head *sh)
619ops_run_compute5(struct stripe_head *sh, unsigned long pending)
620{ 578{
621 /* kernel stack size limits the total number of disks */ 579 /* kernel stack size limits the total number of disks */
622 int disks = sh->disks; 580 int disks = sh->disks;
@@ -646,10 +604,6 @@ ops_run_compute5(struct stripe_head *sh, unsigned long pending)
646 ASYNC_TX_XOR_ZERO_DST, NULL, 604 ASYNC_TX_XOR_ZERO_DST, NULL,
647 ops_complete_compute5, sh); 605 ops_complete_compute5, sh);
648 606
649 /* ack now if postxor is not set to be run */
650 if (tx && !test_bit(STRIPE_OP_POSTXOR, &pending))
651 async_tx_ack(tx);
652
653 return tx; 607 return tx;
654} 608}
655 609
@@ -659,8 +613,6 @@ static void ops_complete_prexor(void *stripe_head_ref)
659 613
660 pr_debug("%s: stripe %llu\n", __func__, 614 pr_debug("%s: stripe %llu\n", __func__,
661 (unsigned long long)sh->sector); 615 (unsigned long long)sh->sector);
662
663 set_bit(STRIPE_OP_PREXOR, &sh->ops.complete);
664} 616}
665 617
666static struct dma_async_tx_descriptor * 618static struct dma_async_tx_descriptor *
@@ -680,7 +632,7 @@ ops_run_prexor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
680 for (i = disks; i--; ) { 632 for (i = disks; i--; ) {
681 struct r5dev *dev = &sh->dev[i]; 633 struct r5dev *dev = &sh->dev[i];
682 /* Only process blocks that are known to be uptodate */ 634 /* Only process blocks that are known to be uptodate */
683 if (dev->towrite && test_bit(R5_Wantprexor, &dev->flags)) 635 if (test_bit(R5_Wantdrain, &dev->flags))
684 xor_srcs[count++] = dev->page; 636 xor_srcs[count++] = dev->page;
685 } 637 }
686 638
@@ -692,16 +644,10 @@ ops_run_prexor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
692} 644}
693 645
694static struct dma_async_tx_descriptor * 646static struct dma_async_tx_descriptor *
695ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx, 647ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
696 unsigned long pending)
697{ 648{
698 int disks = sh->disks; 649 int disks = sh->disks;
699 int pd_idx = sh->pd_idx, i; 650 int i;
700
701 /* check if prexor is active which means only process blocks
702 * that are part of a read-modify-write (Wantprexor)
703 */
704 int prexor = test_bit(STRIPE_OP_PREXOR, &pending);
705 651
706 pr_debug("%s: stripe %llu\n", __func__, 652 pr_debug("%s: stripe %llu\n", __func__,
707 (unsigned long long)sh->sector); 653 (unsigned long long)sh->sector);
@@ -709,20 +655,8 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx,
709 for (i = disks; i--; ) { 655 for (i = disks; i--; ) {
710 struct r5dev *dev = &sh->dev[i]; 656 struct r5dev *dev = &sh->dev[i];
711 struct bio *chosen; 657 struct bio *chosen;
712 int towrite;
713
714 towrite = 0;
715 if (prexor) { /* rmw */
716 if (dev->towrite &&
717 test_bit(R5_Wantprexor, &dev->flags))
718 towrite = 1;
719 } else { /* rcw */
720 if (i != pd_idx && dev->towrite &&
721 test_bit(R5_LOCKED, &dev->flags))
722 towrite = 1;
723 }
724 658
725 if (towrite) { 659 if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) {
726 struct bio *wbi; 660 struct bio *wbi;
727 661
728 spin_lock(&sh->lock); 662 spin_lock(&sh->lock);
@@ -747,18 +681,6 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx,
747static void ops_complete_postxor(void *stripe_head_ref) 681static void ops_complete_postxor(void *stripe_head_ref)
748{ 682{
749 struct stripe_head *sh = stripe_head_ref; 683 struct stripe_head *sh = stripe_head_ref;
750
751 pr_debug("%s: stripe %llu\n", __func__,
752 (unsigned long long)sh->sector);
753
754 set_bit(STRIPE_OP_POSTXOR, &sh->ops.complete);
755 set_bit(STRIPE_HANDLE, &sh->state);
756 release_stripe(sh);
757}
758
759static void ops_complete_write(void *stripe_head_ref)
760{
761 struct stripe_head *sh = stripe_head_ref;
762 int disks = sh->disks, i, pd_idx = sh->pd_idx; 684 int disks = sh->disks, i, pd_idx = sh->pd_idx;
763 685
764 pr_debug("%s: stripe %llu\n", __func__, 686 pr_debug("%s: stripe %llu\n", __func__,
@@ -770,16 +692,21 @@ static void ops_complete_write(void *stripe_head_ref)
770 set_bit(R5_UPTODATE, &dev->flags); 692 set_bit(R5_UPTODATE, &dev->flags);
771 } 693 }
772 694
773 set_bit(STRIPE_OP_BIODRAIN, &sh->ops.complete); 695 if (sh->reconstruct_state == reconstruct_state_drain_run)
774 set_bit(STRIPE_OP_POSTXOR, &sh->ops.complete); 696 sh->reconstruct_state = reconstruct_state_drain_result;
697 else if (sh->reconstruct_state == reconstruct_state_prexor_drain_run)
698 sh->reconstruct_state = reconstruct_state_prexor_drain_result;
699 else {
700 BUG_ON(sh->reconstruct_state != reconstruct_state_run);
701 sh->reconstruct_state = reconstruct_state_result;
702 }
775 703
776 set_bit(STRIPE_HANDLE, &sh->state); 704 set_bit(STRIPE_HANDLE, &sh->state);
777 release_stripe(sh); 705 release_stripe(sh);
778} 706}
779 707
780static void 708static void
781ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx, 709ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
782 unsigned long pending)
783{ 710{
784 /* kernel stack size limits the total number of disks */ 711 /* kernel stack size limits the total number of disks */
785 int disks = sh->disks; 712 int disks = sh->disks;
@@ -787,9 +714,8 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx,
787 714
788 int count = 0, pd_idx = sh->pd_idx, i; 715 int count = 0, pd_idx = sh->pd_idx, i;
789 struct page *xor_dest; 716 struct page *xor_dest;
790 int prexor = test_bit(STRIPE_OP_PREXOR, &pending); 717 int prexor = 0;
791 unsigned long flags; 718 unsigned long flags;
792 dma_async_tx_callback callback;
793 719
794 pr_debug("%s: stripe %llu\n", __func__, 720 pr_debug("%s: stripe %llu\n", __func__,
795 (unsigned long long)sh->sector); 721 (unsigned long long)sh->sector);
@@ -797,7 +723,8 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx,
797 /* check if prexor is active which means only process blocks 723 /* check if prexor is active which means only process blocks
798 * that are part of a read-modify-write (written) 724 * that are part of a read-modify-write (written)
799 */ 725 */
800 if (prexor) { 726 if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
727 prexor = 1;
801 xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; 728 xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
802 for (i = disks; i--; ) { 729 for (i = disks; i--; ) {
803 struct r5dev *dev = &sh->dev[i]; 730 struct r5dev *dev = &sh->dev[i];
@@ -813,10 +740,6 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx,
813 } 740 }
814 } 741 }
815 742
816 /* check whether this postxor is part of a write */
817 callback = test_bit(STRIPE_OP_BIODRAIN, &pending) ?
818 ops_complete_write : ops_complete_postxor;
819
820 /* 1/ if we prexor'd then the dest is reused as a source 743 /* 1/ if we prexor'd then the dest is reused as a source
821 * 2/ if we did not prexor then we are redoing the parity 744 * 2/ if we did not prexor then we are redoing the parity
822 * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST 745 * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST
@@ -830,25 +753,20 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx,
830 if (unlikely(count == 1)) { 753 if (unlikely(count == 1)) {
831 flags &= ~(ASYNC_TX_XOR_DROP_DST | ASYNC_TX_XOR_ZERO_DST); 754 flags &= ~(ASYNC_TX_XOR_DROP_DST | ASYNC_TX_XOR_ZERO_DST);
832 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, 755 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE,
833 flags, tx, callback, sh); 756 flags, tx, ops_complete_postxor, sh);
834 } else 757 } else
835 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, 758 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
836 flags, tx, callback, sh); 759 flags, tx, ops_complete_postxor, sh);
837} 760}
838 761
839static void ops_complete_check(void *stripe_head_ref) 762static void ops_complete_check(void *stripe_head_ref)
840{ 763{
841 struct stripe_head *sh = stripe_head_ref; 764 struct stripe_head *sh = stripe_head_ref;
842 int pd_idx = sh->pd_idx;
843 765
844 pr_debug("%s: stripe %llu\n", __func__, 766 pr_debug("%s: stripe %llu\n", __func__,
845 (unsigned long long)sh->sector); 767 (unsigned long long)sh->sector);
846 768
847 if (test_and_clear_bit(STRIPE_OP_MOD_DMA_CHECK, &sh->ops.pending) && 769 sh->check_state = check_state_check_result;
848 sh->ops.zero_sum_result == 0)
849 set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
850
851 set_bit(STRIPE_OP_CHECK, &sh->ops.complete);
852 set_bit(STRIPE_HANDLE, &sh->state); 770 set_bit(STRIPE_HANDLE, &sh->state);
853 release_stripe(sh); 771 release_stripe(sh);
854} 772}
@@ -875,46 +793,42 @@ static void ops_run_check(struct stripe_head *sh)
875 tx = async_xor_zero_sum(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, 793 tx = async_xor_zero_sum(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
876 &sh->ops.zero_sum_result, 0, NULL, NULL, NULL); 794 &sh->ops.zero_sum_result, 0, NULL, NULL, NULL);
877 795
878 if (tx)
879 set_bit(STRIPE_OP_MOD_DMA_CHECK, &sh->ops.pending);
880 else
881 clear_bit(STRIPE_OP_MOD_DMA_CHECK, &sh->ops.pending);
882
883 atomic_inc(&sh->count); 796 atomic_inc(&sh->count);
884 tx = async_trigger_callback(ASYNC_TX_DEP_ACK | ASYNC_TX_ACK, tx, 797 tx = async_trigger_callback(ASYNC_TX_DEP_ACK | ASYNC_TX_ACK, tx,
885 ops_complete_check, sh); 798 ops_complete_check, sh);
886} 799}
887 800
888static void raid5_run_ops(struct stripe_head *sh, unsigned long pending) 801static void raid5_run_ops(struct stripe_head *sh, unsigned long ops_request)
889{ 802{
890 int overlap_clear = 0, i, disks = sh->disks; 803 int overlap_clear = 0, i, disks = sh->disks;
891 struct dma_async_tx_descriptor *tx = NULL; 804 struct dma_async_tx_descriptor *tx = NULL;
892 805
893 if (test_bit(STRIPE_OP_BIOFILL, &pending)) { 806 if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) {
894 ops_run_biofill(sh); 807 ops_run_biofill(sh);
895 overlap_clear++; 808 overlap_clear++;
896 } 809 }
897 810
898 if (test_bit(STRIPE_OP_COMPUTE_BLK, &pending)) 811 if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) {
899 tx = ops_run_compute5(sh, pending); 812 tx = ops_run_compute5(sh);
813 /* terminate the chain if postxor is not set to be run */
814 if (tx && !test_bit(STRIPE_OP_POSTXOR, &ops_request))
815 async_tx_ack(tx);
816 }
900 817
901 if (test_bit(STRIPE_OP_PREXOR, &pending)) 818 if (test_bit(STRIPE_OP_PREXOR, &ops_request))
902 tx = ops_run_prexor(sh, tx); 819 tx = ops_run_prexor(sh, tx);
903 820
904 if (test_bit(STRIPE_OP_BIODRAIN, &pending)) { 821 if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) {
905 tx = ops_run_biodrain(sh, tx, pending); 822 tx = ops_run_biodrain(sh, tx);
906 overlap_clear++; 823 overlap_clear++;
907 } 824 }
908 825
909 if (test_bit(STRIPE_OP_POSTXOR, &pending)) 826 if (test_bit(STRIPE_OP_POSTXOR, &ops_request))
910 ops_run_postxor(sh, tx, pending); 827 ops_run_postxor(sh, tx);
911 828
912 if (test_bit(STRIPE_OP_CHECK, &pending)) 829 if (test_bit(STRIPE_OP_CHECK, &ops_request))
913 ops_run_check(sh); 830 ops_run_check(sh);
914 831
915 if (test_bit(STRIPE_OP_IO, &pending))
916 ops_run_io(sh);
917
918 if (overlap_clear) 832 if (overlap_clear)
919 for (i = disks; i--; ) { 833 for (i = disks; i--; ) {
920 struct r5dev *dev = &sh->dev[i]; 834 struct r5dev *dev = &sh->dev[i];
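The rewritten raid5_run_ops() above is driven by a one-shot ops_request bitmask computed under sh->lock, rather than the old pending/ack/complete bookkeeping. A minimal stand-alone sketch of the same dispatch pattern, with invented OP_* names and a plain integer standing in for the dma_async_tx_descriptor chain (illustrative only, not kernel code), would be:

    #include <stdio.h>

    /* invented bits mirroring the STRIPE_OP_* requests */
    enum { OP_BIOFILL = 1, OP_PREXOR = 2, OP_BIODRAIN = 4, OP_POSTXOR = 8, OP_CHECK = 16 };

    /* each step chains off the token of the previous one, much as the
     * ops_run_*() calls above chain the tx descriptor */
    static int run_step(const char *name, int tok)
    {
            printf("%s runs after token %d\n", name, tok);
            return tok + 1;
    }

    static void run_ops(unsigned long request)
    {
            int tx = 0;

            if (request & OP_BIOFILL)
                    tx = run_step("biofill", tx);
            if (request & OP_PREXOR)
                    tx = run_step("prexor", tx);
            if (request & OP_BIODRAIN)
                    tx = run_step("biodrain", tx);
            if (request & OP_POSTXOR)
                    tx = run_step("postxor", tx);
            if (request & OP_CHECK)
                    tx = run_step("check", tx);
            (void)tx;
    }

    int main(void)
    {
            run_ops(OP_PREXOR | OP_BIODRAIN | OP_POSTXOR);   /* a read-modify-write */
            return 0;
    }

The point of the change is visible in this shape: the caller decides the whole set of operations once, under the lock, and the runner never mutates shared request state.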
@@ -997,14 +911,16 @@ static int resize_stripes(raid5_conf_t *conf, int newsize)
997 struct stripe_head *osh, *nsh; 911 struct stripe_head *osh, *nsh;
998 LIST_HEAD(newstripes); 912 LIST_HEAD(newstripes);
999 struct disk_info *ndisks; 913 struct disk_info *ndisks;
1000 int err = 0; 914 int err;
1001 struct kmem_cache *sc; 915 struct kmem_cache *sc;
1002 int i; 916 int i;
1003 917
1004 if (newsize <= conf->pool_size) 918 if (newsize <= conf->pool_size)
1005 return 0; /* never bother to shrink */ 919 return 0; /* never bother to shrink */
1006 920
1007 md_allow_write(conf->mddev); 921 err = md_allow_write(conf->mddev);
922 if (err)
923 return err;
1008 924
1009 /* Step 1 */ 925 /* Step 1 */
1010 sc = kmem_cache_create(conf->cache_name[1-conf->active_name], 926 sc = kmem_cache_create(conf->cache_name[1-conf->active_name],
@@ -1703,11 +1619,11 @@ static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2)
1703 } 1619 }
1704} 1620}
1705 1621
1706static int 1622static void
1707handle_write_operations5(struct stripe_head *sh, int rcw, int expand) 1623schedule_reconstruction5(struct stripe_head *sh, struct stripe_head_state *s,
1624 int rcw, int expand)
1708{ 1625{
1709 int i, pd_idx = sh->pd_idx, disks = sh->disks; 1626 int i, pd_idx = sh->pd_idx, disks = sh->disks;
1710 int locked = 0;
1711 1627
1712 if (rcw) { 1628 if (rcw) {
1713 /* if we are not expanding this is a proper write request, and 1629 /* if we are not expanding this is a proper write request, and
@@ -1715,53 +1631,48 @@ handle_write_operations5(struct stripe_head *sh, int rcw, int expand)
1715 * stripe cache 1631 * stripe cache
1716 */ 1632 */
1717 if (!expand) { 1633 if (!expand) {
1718 set_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending); 1634 sh->reconstruct_state = reconstruct_state_drain_run;
1719 sh->ops.count++; 1635 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
1720 } 1636 } else
1637 sh->reconstruct_state = reconstruct_state_run;
1721 1638
1722 set_bit(STRIPE_OP_POSTXOR, &sh->ops.pending); 1639 set_bit(STRIPE_OP_POSTXOR, &s->ops_request);
1723 sh->ops.count++;
1724 1640
1725 for (i = disks; i--; ) { 1641 for (i = disks; i--; ) {
1726 struct r5dev *dev = &sh->dev[i]; 1642 struct r5dev *dev = &sh->dev[i];
1727 1643
1728 if (dev->towrite) { 1644 if (dev->towrite) {
1729 set_bit(R5_LOCKED, &dev->flags); 1645 set_bit(R5_LOCKED, &dev->flags);
1646 set_bit(R5_Wantdrain, &dev->flags);
1730 if (!expand) 1647 if (!expand)
1731 clear_bit(R5_UPTODATE, &dev->flags); 1648 clear_bit(R5_UPTODATE, &dev->flags);
1732 locked++; 1649 s->locked++;
1733 } 1650 }
1734 } 1651 }
1735 if (locked + 1 == disks) 1652 if (s->locked + 1 == disks)
1736 if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state)) 1653 if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state))
1737 atomic_inc(&sh->raid_conf->pending_full_writes); 1654 atomic_inc(&sh->raid_conf->pending_full_writes);
1738 } else { 1655 } else {
1739 BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) || 1656 BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) ||
1740 test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags))); 1657 test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags)));
1741 1658
1742 set_bit(STRIPE_OP_PREXOR, &sh->ops.pending); 1659 sh->reconstruct_state = reconstruct_state_prexor_drain_run;
1743 set_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending); 1660 set_bit(STRIPE_OP_PREXOR, &s->ops_request);
1744 set_bit(STRIPE_OP_POSTXOR, &sh->ops.pending); 1661 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
1745 1662 set_bit(STRIPE_OP_POSTXOR, &s->ops_request);
1746 sh->ops.count += 3;
1747 1663
1748 for (i = disks; i--; ) { 1664 for (i = disks; i--; ) {
1749 struct r5dev *dev = &sh->dev[i]; 1665 struct r5dev *dev = &sh->dev[i];
1750 if (i == pd_idx) 1666 if (i == pd_idx)
1751 continue; 1667 continue;
1752 1668
1753 /* For a read-modify write there may be blocks that are
1754 * locked for reading while others are ready to be
1755 * written so we distinguish these blocks by the
1756 * R5_Wantprexor bit
1757 */
1758 if (dev->towrite && 1669 if (dev->towrite &&
1759 (test_bit(R5_UPTODATE, &dev->flags) || 1670 (test_bit(R5_UPTODATE, &dev->flags) ||
1760 test_bit(R5_Wantcompute, &dev->flags))) { 1671 test_bit(R5_Wantcompute, &dev->flags))) {
1761 set_bit(R5_Wantprexor, &dev->flags); 1672 set_bit(R5_Wantdrain, &dev->flags);
1762 set_bit(R5_LOCKED, &dev->flags); 1673 set_bit(R5_LOCKED, &dev->flags);
1763 clear_bit(R5_UPTODATE, &dev->flags); 1674 clear_bit(R5_UPTODATE, &dev->flags);
1764 locked++; 1675 s->locked++;
1765 } 1676 }
1766 } 1677 }
1767 } 1678 }
@@ -1771,13 +1682,11 @@ handle_write_operations5(struct stripe_head *sh, int rcw, int expand)
1771 */ 1682 */
1772 set_bit(R5_LOCKED, &sh->dev[pd_idx].flags); 1683 set_bit(R5_LOCKED, &sh->dev[pd_idx].flags);
1773 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); 1684 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
1774 locked++; 1685 s->locked++;
1775 1686
1776 pr_debug("%s: stripe %llu locked: %d pending: %lx\n", 1687 pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n",
1777 __func__, (unsigned long long)sh->sector, 1688 __func__, (unsigned long long)sh->sector,
1778 locked, sh->ops.pending); 1689 s->locked, s->ops_request);
1779
1780 return locked;
1781} 1690}
1782 1691
1783/* 1692/*
@@ -1876,7 +1785,7 @@ static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int disks)
1876} 1785}
1877 1786
1878static void 1787static void
1879handle_requests_to_failed_array(raid5_conf_t *conf, struct stripe_head *sh, 1788handle_failed_stripe(raid5_conf_t *conf, struct stripe_head *sh,
1880 struct stripe_head_state *s, int disks, 1789 struct stripe_head_state *s, int disks,
1881 struct bio **return_bi) 1790 struct bio **return_bi)
1882{ 1791{
@@ -1967,48 +1876,38 @@ handle_requests_to_failed_array(raid5_conf_t *conf, struct stripe_head *sh,
1967 md_wakeup_thread(conf->mddev->thread); 1876 md_wakeup_thread(conf->mddev->thread);
1968} 1877}
1969 1878
1970/* __handle_issuing_new_read_requests5 - returns 0 if there are no more disks 1879/* fetch_block5 - checks the given member device to see if its data needs
1971 * to process 1880 * to be read or computed to satisfy a request.
1881 *
1882 * Returns 1 when no more member devices need to be checked, otherwise returns
1883 * 0 to tell the loop in handle_stripe_fill5 to continue
1972 */ 1884 */
1973static int __handle_issuing_new_read_requests5(struct stripe_head *sh, 1885static int fetch_block5(struct stripe_head *sh, struct stripe_head_state *s,
1974 struct stripe_head_state *s, int disk_idx, int disks) 1886 int disk_idx, int disks)
1975{ 1887{
1976 struct r5dev *dev = &sh->dev[disk_idx]; 1888 struct r5dev *dev = &sh->dev[disk_idx];
1977 struct r5dev *failed_dev = &sh->dev[s->failed_num]; 1889 struct r5dev *failed_dev = &sh->dev[s->failed_num];
1978 1890
1979 /* don't schedule compute operations or reads on the parity block while
1980 * a check is in flight
1981 */
1982 if ((disk_idx == sh->pd_idx) &&
1983 test_bit(STRIPE_OP_CHECK, &sh->ops.pending))
1984 return ~0;
1985
1986 /* is the data in this block needed, and can we get it? */ 1891 /* is the data in this block needed, and can we get it? */
1987 if (!test_bit(R5_LOCKED, &dev->flags) && 1892 if (!test_bit(R5_LOCKED, &dev->flags) &&
1988 !test_bit(R5_UPTODATE, &dev->flags) && (dev->toread || 1893 !test_bit(R5_UPTODATE, &dev->flags) &&
1989 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || 1894 (dev->toread ||
1990 s->syncing || s->expanding || (s->failed && 1895 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
1991 (failed_dev->toread || (failed_dev->towrite && 1896 s->syncing || s->expanding ||
1992 !test_bit(R5_OVERWRITE, &failed_dev->flags) 1897 (s->failed &&
1993 ))))) { 1898 (failed_dev->toread ||
1994 /* 1/ We would like to get this block, possibly by computing it, 1899 (failed_dev->towrite &&
1995 * but we might not be able to. 1900 !test_bit(R5_OVERWRITE, &failed_dev->flags)))))) {
1996 * 1901 /* We would like to get this block, possibly by computing it,
1997 * 2/ Since parity check operations potentially make the parity 1902 * otherwise read it if the backing disk is insync
1998 * block !uptodate it will need to be refreshed before any
1999 * compute operations on data disks are scheduled.
2000 *
2001 * 3/ We hold off parity block re-reads until check operations
2002 * have quiesced.
2003 */ 1903 */
2004 if ((s->uptodate == disks - 1) && 1904 if ((s->uptodate == disks - 1) &&
2005 (s->failed && disk_idx == s->failed_num) && 1905 (s->failed && disk_idx == s->failed_num)) {
2006 !test_bit(STRIPE_OP_CHECK, &sh->ops.pending)) { 1906 set_bit(STRIPE_COMPUTE_RUN, &sh->state);
2007 set_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending); 1907 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
2008 set_bit(R5_Wantcompute, &dev->flags); 1908 set_bit(R5_Wantcompute, &dev->flags);
2009 sh->ops.target = disk_idx; 1909 sh->ops.target = disk_idx;
2010 s->req_compute = 1; 1910 s->req_compute = 1;
2011 sh->ops.count++;
2012 /* Careful: from this point on 'uptodate' is in the eye 1911 /* Careful: from this point on 'uptodate' is in the eye
2013 * of raid5_run_ops which services 'compute' operations 1912 * of raid5_run_ops which services 'compute' operations
2014 * before writes. R5_Wantcompute flags a block that will 1913 * before writes. R5_Wantcompute flags a block that will
@@ -2016,53 +1915,40 @@ static int __handle_issuing_new_read_requests5(struct stripe_head *sh,
2016 * subsequent operation. 1915 * subsequent operation.
2017 */ 1916 */
2018 s->uptodate++; 1917 s->uptodate++;
2019 return 0; /* uptodate + compute == disks */ 1918 return 1; /* uptodate + compute == disks */
2020 } else if (test_bit(R5_Insync, &dev->flags)) { 1919 } else if (test_bit(R5_Insync, &dev->flags)) {
2021 set_bit(R5_LOCKED, &dev->flags); 1920 set_bit(R5_LOCKED, &dev->flags);
2022 set_bit(R5_Wantread, &dev->flags); 1921 set_bit(R5_Wantread, &dev->flags);
2023 if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
2024 sh->ops.count++;
2025 s->locked++; 1922 s->locked++;
2026 pr_debug("Reading block %d (sync=%d)\n", disk_idx, 1923 pr_debug("Reading block %d (sync=%d)\n", disk_idx,
2027 s->syncing); 1924 s->syncing);
2028 } 1925 }
2029 } 1926 }
2030 1927
2031 return ~0; 1928 return 0;
2032} 1929}
2033 1930
2034static void handle_issuing_new_read_requests5(struct stripe_head *sh, 1931/**
1932 * handle_stripe_fill5 - read or compute data to satisfy pending requests.
1933 */
1934static void handle_stripe_fill5(struct stripe_head *sh,
2035 struct stripe_head_state *s, int disks) 1935 struct stripe_head_state *s, int disks)
2036{ 1936{
2037 int i; 1937 int i;
2038 1938
2039 /* Clear completed compute operations. Parity recovery
2040 * (STRIPE_OP_MOD_REPAIR_PD) implies a write-back which is handled
2041 * later on in this routine
2042 */
2043 if (test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete) &&
2044 !test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) {
2045 clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete);
2046 clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.ack);
2047 clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending);
2048 }
2049
2050 /* look for blocks to read/compute, skip this if a compute 1939 /* look for blocks to read/compute, skip this if a compute
2051 * is already in flight, or if the stripe contents are in the 1940 * is already in flight, or if the stripe contents are in the
2052 * midst of changing due to a write 1941 * midst of changing due to a write
2053 */ 1942 */
2054 if (!test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending) && 1943 if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state &&
2055 !test_bit(STRIPE_OP_PREXOR, &sh->ops.pending) && 1944 !sh->reconstruct_state)
2056 !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) {
2057 for (i = disks; i--; ) 1945 for (i = disks; i--; )
2058 if (__handle_issuing_new_read_requests5( 1946 if (fetch_block5(sh, s, i, disks))
2059 sh, s, i, disks) == 0)
2060 break; 1947 break;
2061 }
2062 set_bit(STRIPE_HANDLE, &sh->state); 1948 set_bit(STRIPE_HANDLE, &sh->state);
2063} 1949}
2064 1950
2065static void handle_issuing_new_read_requests6(struct stripe_head *sh, 1951static void handle_stripe_fill6(struct stripe_head *sh,
2066 struct stripe_head_state *s, struct r6_state *r6s, 1952 struct stripe_head_state *s, struct r6_state *r6s,
2067 int disks) 1953 int disks)
2068{ 1954{
@@ -2121,12 +2007,12 @@ static void handle_issuing_new_read_requests6(struct stripe_head *sh,
2121} 2007}
2122 2008
2123 2009
2124/* handle_completed_write_requests 2010/* handle_stripe_clean_event
2125 * any written block on an uptodate or failed drive can be returned. 2011 * any written block on an uptodate or failed drive can be returned.
2126 * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but 2012 * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but
2127 * never LOCKED, so we don't need to test 'failed' directly. 2013 * never LOCKED, so we don't need to test 'failed' directly.
2128 */ 2014 */
2129static void handle_completed_write_requests(raid5_conf_t *conf, 2015static void handle_stripe_clean_event(raid5_conf_t *conf,
2130 struct stripe_head *sh, int disks, struct bio **return_bi) 2016 struct stripe_head *sh, int disks, struct bio **return_bi)
2131{ 2017{
2132 int i; 2018 int i;
@@ -2171,7 +2057,7 @@ static void handle_completed_write_requests(raid5_conf_t *conf,
2171 md_wakeup_thread(conf->mddev->thread); 2057 md_wakeup_thread(conf->mddev->thread);
2172} 2058}
2173 2059
2174static void handle_issuing_new_write_requests5(raid5_conf_t *conf, 2060static void handle_stripe_dirtying5(raid5_conf_t *conf,
2175 struct stripe_head *sh, struct stripe_head_state *s, int disks) 2061 struct stripe_head *sh, struct stripe_head_state *s, int disks)
2176{ 2062{
2177 int rmw = 0, rcw = 0, i; 2063 int rmw = 0, rcw = 0, i;
@@ -2215,9 +2101,6 @@ static void handle_issuing_new_write_requests5(raid5_conf_t *conf,
2215 "%d for r-m-w\n", i); 2101 "%d for r-m-w\n", i);
2216 set_bit(R5_LOCKED, &dev->flags); 2102 set_bit(R5_LOCKED, &dev->flags);
2217 set_bit(R5_Wantread, &dev->flags); 2103 set_bit(R5_Wantread, &dev->flags);
2218 if (!test_and_set_bit(
2219 STRIPE_OP_IO, &sh->ops.pending))
2220 sh->ops.count++;
2221 s->locked++; 2104 s->locked++;
2222 } else { 2105 } else {
2223 set_bit(STRIPE_DELAYED, &sh->state); 2106 set_bit(STRIPE_DELAYED, &sh->state);
@@ -2241,9 +2124,6 @@ static void handle_issuing_new_write_requests5(raid5_conf_t *conf,
2241 "%d for Reconstruct\n", i); 2124 "%d for Reconstruct\n", i);
2242 set_bit(R5_LOCKED, &dev->flags); 2125 set_bit(R5_LOCKED, &dev->flags);
2243 set_bit(R5_Wantread, &dev->flags); 2126 set_bit(R5_Wantread, &dev->flags);
2244 if (!test_and_set_bit(
2245 STRIPE_OP_IO, &sh->ops.pending))
2246 sh->ops.count++;
2247 s->locked++; 2127 s->locked++;
2248 } else { 2128 } else {
2249 set_bit(STRIPE_DELAYED, &sh->state); 2129 set_bit(STRIPE_DELAYED, &sh->state);
@@ -2261,14 +2141,13 @@ static void handle_issuing_new_write_requests5(raid5_conf_t *conf,
2261 * simultaneously. If this is not the case then new writes need to be 2141 * simultaneously. If this is not the case then new writes need to be
2262 * held off until the compute completes. 2142 * held off until the compute completes.
2263 */ 2143 */
2264 if ((s->req_compute || 2144 if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) &&
2265 !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) && 2145 (s->locked == 0 && (rcw == 0 || rmw == 0) &&
2266 (s->locked == 0 && (rcw == 0 || rmw == 0) && 2146 !test_bit(STRIPE_BIT_DELAY, &sh->state)))
2267 !test_bit(STRIPE_BIT_DELAY, &sh->state))) 2147 schedule_reconstruction5(sh, s, rcw == 0, 0);
2268 s->locked += handle_write_operations5(sh, rcw == 0, 0);
2269} 2148}
2270 2149
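handle_stripe_dirtying5() above decides between a read-modify-write (rmw, the prexor path) and a reconstruct-write (rcw) by counting how many blocks each strategy would still have to read in. A rough, self-contained sketch of that cost comparison (simplified: the real code also consults R5_OVERWRITE, R5_Insync and R5_Wantcompute) might be:

    #include <stdbool.h>

    struct blk { bool towrite, uptodate, locked; };

    /* rmw needs the old contents of every block being written, plus the old
     * parity; rcw needs every data block that is not being written.  Count
     * the blocks we would still have to read for each strategy. */
    static void count_prereads(const struct blk *dev, int disks, int pd_idx,
                               int *rmw, int *rcw)
    {
            int i;

            *rmw = *rcw = 0;
            for (i = 0; i < disks; i++) {
                    bool missing = !dev[i].uptodate && !dev[i].locked;

                    if ((dev[i].towrite || i == pd_idx) && missing)
                            (*rmw)++;
                    if (!dev[i].towrite && i != pd_idx && missing)
                            (*rcw)++;
            }
    }

Whichever count is smaller decides which blocks get preread; once s->locked drops to zero and one of the counts reaches zero, the hunk above calls schedule_reconstruction5(sh, s, rcw == 0, 0), i.e. the plain drain/postxor path when the reconstruct-write reads are all satisfied, and the prexor path otherwise.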
2271static void handle_issuing_new_write_requests6(raid5_conf_t *conf, 2150static void handle_stripe_dirtying6(raid5_conf_t *conf,
2272 struct stripe_head *sh, struct stripe_head_state *s, 2151 struct stripe_head *sh, struct stripe_head_state *s,
2273 struct r6_state *r6s, int disks) 2152 struct r6_state *r6s, int disks)
2274{ 2153{
@@ -2371,92 +2250,86 @@ static void handle_issuing_new_write_requests6(raid5_conf_t *conf,
2371static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh, 2250static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh,
2372 struct stripe_head_state *s, int disks) 2251 struct stripe_head_state *s, int disks)
2373{ 2252{
2374 int canceled_check = 0; 2253 struct r5dev *dev = NULL;
2375 2254
2376 set_bit(STRIPE_HANDLE, &sh->state); 2255 set_bit(STRIPE_HANDLE, &sh->state);
2377 2256
2378 /* complete a check operation */ 2257 switch (sh->check_state) {
2379 if (test_and_clear_bit(STRIPE_OP_CHECK, &sh->ops.complete)) { 2258 case check_state_idle:
2380 clear_bit(STRIPE_OP_CHECK, &sh->ops.ack); 2259 /* start a new check operation if there are no failures */
2381 clear_bit(STRIPE_OP_CHECK, &sh->ops.pending);
2382 if (s->failed == 0) { 2260 if (s->failed == 0) {
2383 if (sh->ops.zero_sum_result == 0)
2384 /* parity is correct (on disc,
2385 * not in buffer any more)
2386 */
2387 set_bit(STRIPE_INSYNC, &sh->state);
2388 else {
2389 conf->mddev->resync_mismatches +=
2390 STRIPE_SECTORS;
2391 if (test_bit(
2392 MD_RECOVERY_CHECK, &conf->mddev->recovery))
2393 /* don't try to repair!! */
2394 set_bit(STRIPE_INSYNC, &sh->state);
2395 else {
2396 set_bit(STRIPE_OP_COMPUTE_BLK,
2397 &sh->ops.pending);
2398 set_bit(STRIPE_OP_MOD_REPAIR_PD,
2399 &sh->ops.pending);
2400 set_bit(R5_Wantcompute,
2401 &sh->dev[sh->pd_idx].flags);
2402 sh->ops.target = sh->pd_idx;
2403 sh->ops.count++;
2404 s->uptodate++;
2405 }
2406 }
2407 } else
2408 canceled_check = 1; /* STRIPE_INSYNC is not set */
2409 }
2410
2411 /* start a new check operation if there are no failures, the stripe is
2412 * not insync, and a repair is not in flight
2413 */
2414 if (s->failed == 0 &&
2415 !test_bit(STRIPE_INSYNC, &sh->state) &&
2416 !test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) {
2417 if (!test_and_set_bit(STRIPE_OP_CHECK, &sh->ops.pending)) {
2418 BUG_ON(s->uptodate != disks); 2261 BUG_ON(s->uptodate != disks);
2262 sh->check_state = check_state_run;
2263 set_bit(STRIPE_OP_CHECK, &s->ops_request);
2419 clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags); 2264 clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags);
2420 sh->ops.count++;
2421 s->uptodate--; 2265 s->uptodate--;
2266 break;
2422 } 2267 }
2423 } 2268 dev = &sh->dev[s->failed_num];
2424 2269 /* fall through */
2425 /* check if we can clear a parity disk reconstruct */ 2270 case check_state_compute_result:
2426 if (test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete) && 2271 sh->check_state = check_state_idle;
2427 test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) { 2272 if (!dev)
2428 2273 dev = &sh->dev[sh->pd_idx];
2429 clear_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending); 2274
2430 clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete); 2275 /* check that a write has not made the stripe insync */
2431 clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.ack); 2276 if (test_bit(STRIPE_INSYNC, &sh->state))
2432 clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending); 2277 break;
2433 }
2434
2435 2278
2436 /* Wait for check parity and compute block operations to complete
2437 * before write-back. If a failure occurred while the check operation
2438 * was in flight we need to cycle this stripe through handle_stripe
2439 * since the parity block may not be uptodate
2440 */
2441 if (!canceled_check && !test_bit(STRIPE_INSYNC, &sh->state) &&
2442 !test_bit(STRIPE_OP_CHECK, &sh->ops.pending) &&
2443 !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) {
2444 struct r5dev *dev;
2445 /* either failed parity check, or recovery is happening */ 2279 /* either failed parity check, or recovery is happening */
2446 if (s->failed == 0)
2447 s->failed_num = sh->pd_idx;
2448 dev = &sh->dev[s->failed_num];
2449 BUG_ON(!test_bit(R5_UPTODATE, &dev->flags)); 2280 BUG_ON(!test_bit(R5_UPTODATE, &dev->flags));
2450 BUG_ON(s->uptodate != disks); 2281 BUG_ON(s->uptodate != disks);
2451 2282
2452 set_bit(R5_LOCKED, &dev->flags); 2283 set_bit(R5_LOCKED, &dev->flags);
2284 s->locked++;
2453 set_bit(R5_Wantwrite, &dev->flags); 2285 set_bit(R5_Wantwrite, &dev->flags);
2454 if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
2455 sh->ops.count++;
2456 2286
2457 clear_bit(STRIPE_DEGRADED, &sh->state); 2287 clear_bit(STRIPE_DEGRADED, &sh->state);
2458 s->locked++;
2459 set_bit(STRIPE_INSYNC, &sh->state); 2288 set_bit(STRIPE_INSYNC, &sh->state);
2289 break;
2290 case check_state_run:
2291 break; /* we will be called again upon completion */
2292 case check_state_check_result:
2293 sh->check_state = check_state_idle;
2294
2295 /* if a failure occurred during the check operation, leave
2296 * STRIPE_INSYNC not set and let the stripe be handled again
2297 */
2298 if (s->failed)
2299 break;
2300
2301 /* handle a successful check operation, if parity is correct
2302 * we are done. Otherwise update the mismatch count and repair
2303 * parity if !MD_RECOVERY_CHECK
2304 */
2305 if (sh->ops.zero_sum_result == 0)
2306 /* parity is correct (on disc,
2307 * not in buffer any more)
2308 */
2309 set_bit(STRIPE_INSYNC, &sh->state);
2310 else {
2311 conf->mddev->resync_mismatches += STRIPE_SECTORS;
2312 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
2313 /* don't try to repair!! */
2314 set_bit(STRIPE_INSYNC, &sh->state);
2315 else {
2316 sh->check_state = check_state_compute_run;
2317 set_bit(STRIPE_COMPUTE_RUN, &sh->state);
2318 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
2319 set_bit(R5_Wantcompute,
2320 &sh->dev[sh->pd_idx].flags);
2321 sh->ops.target = sh->pd_idx;
2322 s->uptodate++;
2323 }
2324 }
2325 break;
2326 case check_state_compute_run:
2327 break;
2328 default:
2329 printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n",
2330 __func__, sh->check_state,
2331 (unsigned long long) sh->sector);
2332 BUG();
2460 } 2333 }
2461} 2334}
2462 2335
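Everything in the check_state machinery above turns on sh->ops.zero_sum_result: async_xor_zero_sum() XORs the parity block together with all of the data blocks, and a result of zero means the parity on disk is already correct. The same test in miniature (a plain-C sketch, not the async_tx implementation):

    #include <stddef.h>

    /* blk[0..nblk-1] are the data blocks plus the parity block; the stripe
     * is in sync iff their byte-wise XOR is zero everywhere. */
    static int stripe_in_sync(const unsigned char **blk, int nblk, size_t len)
    {
            size_t off;
            int d;

            for (off = 0; off < len; off++) {
                    unsigned char x = 0;

                    for (d = 0; d < nblk; d++)
                            x ^= blk[d][off];
                    if (x)
                            return 0;       /* mismatch: count it, maybe repair */
            }
            return 1;
    }

A non-zero result either just bumps resync_mismatches (when MD_RECOVERY_CHECK forbids repair) or moves the stripe to check_state_compute_run so that the parity block is recomputed and written back.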
@@ -2641,15 +2514,14 @@ static void handle_stripe5(struct stripe_head *sh)
2641 struct bio *return_bi = NULL; 2514 struct bio *return_bi = NULL;
2642 struct stripe_head_state s; 2515 struct stripe_head_state s;
2643 struct r5dev *dev; 2516 struct r5dev *dev;
2644 unsigned long pending = 0;
2645 mdk_rdev_t *blocked_rdev = NULL; 2517 mdk_rdev_t *blocked_rdev = NULL;
2646 int prexor; 2518 int prexor;
2647 2519
2648 memset(&s, 0, sizeof(s)); 2520 memset(&s, 0, sizeof(s));
2649 pr_debug("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d " 2521 pr_debug("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d check:%d "
2650 "ops=%lx:%lx:%lx\n", (unsigned long long)sh->sector, sh->state, 2522 "reconstruct:%d\n", (unsigned long long)sh->sector, sh->state,
2651 atomic_read(&sh->count), sh->pd_idx, 2523 atomic_read(&sh->count), sh->pd_idx, sh->check_state,
2652 sh->ops.pending, sh->ops.ack, sh->ops.complete); 2524 sh->reconstruct_state);
2653 2525
2654 spin_lock(&sh->lock); 2526 spin_lock(&sh->lock);
2655 clear_bit(STRIPE_HANDLE, &sh->state); 2527 clear_bit(STRIPE_HANDLE, &sh->state);
@@ -2658,15 +2530,8 @@ static void handle_stripe5(struct stripe_head *sh)
2658 s.syncing = test_bit(STRIPE_SYNCING, &sh->state); 2530 s.syncing = test_bit(STRIPE_SYNCING, &sh->state);
2659 s.expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); 2531 s.expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
2660 s.expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); 2532 s.expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
2661 /* Now to look around and see what can be done */
2662
2663 /* clean-up completed biofill operations */
2664 if (test_bit(STRIPE_OP_BIOFILL, &sh->ops.complete)) {
2665 clear_bit(STRIPE_OP_BIOFILL, &sh->ops.pending);
2666 clear_bit(STRIPE_OP_BIOFILL, &sh->ops.ack);
2667 clear_bit(STRIPE_OP_BIOFILL, &sh->ops.complete);
2668 }
2669 2533
2534 /* Now to look around and see what can be done */
2670 rcu_read_lock(); 2535 rcu_read_lock();
2671 for (i=disks; i--; ) { 2536 for (i=disks; i--; ) {
2672 mdk_rdev_t *rdev; 2537 mdk_rdev_t *rdev;
@@ -2680,10 +2545,10 @@ static void handle_stripe5(struct stripe_head *sh)
2680 /* maybe we can request a biofill operation 2545 /* maybe we can request a biofill operation
2681 * 2546 *
2682 * new wantfill requests are only permitted while 2547 * new wantfill requests are only permitted while
2683 * STRIPE_OP_BIOFILL is clear 2548 * ops_complete_biofill is guaranteed to be inactive
2684 */ 2549 */
2685 if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread && 2550 if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread &&
2686 !test_bit(STRIPE_OP_BIOFILL, &sh->ops.pending)) 2551 !test_bit(STRIPE_BIOFILL_RUN, &sh->state))
2687 set_bit(R5_Wantfill, &dev->flags); 2552 set_bit(R5_Wantfill, &dev->flags);
2688 2553
2689 /* now count some things */ 2554 /* now count some things */
@@ -2727,8 +2592,10 @@ static void handle_stripe5(struct stripe_head *sh)
2727 goto unlock; 2592 goto unlock;
2728 } 2593 }
2729 2594
2730 if (s.to_fill && !test_and_set_bit(STRIPE_OP_BIOFILL, &sh->ops.pending)) 2595 if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) {
2731 sh->ops.count++; 2596 set_bit(STRIPE_OP_BIOFILL, &s.ops_request);
2597 set_bit(STRIPE_BIOFILL_RUN, &sh->state);
2598 }
2732 2599
2733 pr_debug("locked=%d uptodate=%d to_read=%d" 2600 pr_debug("locked=%d uptodate=%d to_read=%d"
2734 " to_write=%d failed=%d failed_num=%d\n", 2601 " to_write=%d failed=%d failed_num=%d\n",
@@ -2738,8 +2605,7 @@ static void handle_stripe5(struct stripe_head *sh)
2738 * need to be failed 2605 * need to be failed
2739 */ 2606 */
2740 if (s.failed > 1 && s.to_read+s.to_write+s.written) 2607 if (s.failed > 1 && s.to_read+s.to_write+s.written)
2741 handle_requests_to_failed_array(conf, sh, &s, disks, 2608 handle_failed_stripe(conf, sh, &s, disks, &return_bi);
2742 &return_bi);
2743 if (s.failed > 1 && s.syncing) { 2609 if (s.failed > 1 && s.syncing) {
2744 md_done_sync(conf->mddev, STRIPE_SECTORS,0); 2610 md_done_sync(conf->mddev, STRIPE_SECTORS,0);
2745 clear_bit(STRIPE_SYNCING, &sh->state); 2611 clear_bit(STRIPE_SYNCING, &sh->state);
@@ -2755,48 +2621,25 @@ static void handle_stripe5(struct stripe_head *sh)
2755 !test_bit(R5_LOCKED, &dev->flags) && 2621 !test_bit(R5_LOCKED, &dev->flags) &&
2756 test_bit(R5_UPTODATE, &dev->flags)) || 2622 test_bit(R5_UPTODATE, &dev->flags)) ||
2757 (s.failed == 1 && s.failed_num == sh->pd_idx))) 2623 (s.failed == 1 && s.failed_num == sh->pd_idx)))
2758 handle_completed_write_requests(conf, sh, disks, &return_bi); 2624 handle_stripe_clean_event(conf, sh, disks, &return_bi);
2759 2625
2760 /* Now we might consider reading some blocks, either to check/generate 2626 /* Now we might consider reading some blocks, either to check/generate
2761 * parity, or to satisfy requests 2627 * parity, or to satisfy requests
2762 * or to load a block that is being partially written. 2628 * or to load a block that is being partially written.
2763 */ 2629 */
2764 if (s.to_read || s.non_overwrite || 2630 if (s.to_read || s.non_overwrite ||
2765 (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding || 2631 (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding)
2766 test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) 2632 handle_stripe_fill5(sh, &s, disks);
2767 handle_issuing_new_read_requests5(sh, &s, disks);
2768 2633
2769 /* Now we check to see if any write operations have recently 2634 /* Now we check to see if any write operations have recently
2770 * completed 2635 * completed
2771 */ 2636 */
2772
2773 /* leave prexor set until postxor is done, allows us to distinguish
2774 * a rmw from a rcw during biodrain
2775 */
2776 prexor = 0; 2637 prexor = 0;
2777 if (test_bit(STRIPE_OP_PREXOR, &sh->ops.complete) && 2638 if (sh->reconstruct_state == reconstruct_state_prexor_drain_result)
2778 test_bit(STRIPE_OP_POSTXOR, &sh->ops.complete)) {
2779
2780 prexor = 1; 2639 prexor = 1;
2781 clear_bit(STRIPE_OP_PREXOR, &sh->ops.complete); 2640 if (sh->reconstruct_state == reconstruct_state_drain_result ||
2782 clear_bit(STRIPE_OP_PREXOR, &sh->ops.ack); 2641 sh->reconstruct_state == reconstruct_state_prexor_drain_result) {
2783 clear_bit(STRIPE_OP_PREXOR, &sh->ops.pending); 2642 sh->reconstruct_state = reconstruct_state_idle;
2784
2785 for (i = disks; i--; )
2786 clear_bit(R5_Wantprexor, &sh->dev[i].flags);
2787 }
2788
2789 /* if only POSTXOR is set then this is an 'expand' postxor */
2790 if (test_bit(STRIPE_OP_BIODRAIN, &sh->ops.complete) &&
2791 test_bit(STRIPE_OP_POSTXOR, &sh->ops.complete)) {
2792
2793 clear_bit(STRIPE_OP_BIODRAIN, &sh->ops.complete);
2794 clear_bit(STRIPE_OP_BIODRAIN, &sh->ops.ack);
2795 clear_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending);
2796
2797 clear_bit(STRIPE_OP_POSTXOR, &sh->ops.complete);
2798 clear_bit(STRIPE_OP_POSTXOR, &sh->ops.ack);
2799 clear_bit(STRIPE_OP_POSTXOR, &sh->ops.pending);
2800 2643
2801 /* All the 'written' buffers and the parity block are ready to 2644 /* All the 'written' buffers and the parity block are ready to
2802 * be written back to disk 2645 * be written back to disk
@@ -2808,9 +2651,6 @@ static void handle_stripe5(struct stripe_head *sh)
2808 (i == sh->pd_idx || dev->written)) { 2651 (i == sh->pd_idx || dev->written)) {
2809 pr_debug("Writing block %d\n", i); 2652 pr_debug("Writing block %d\n", i);
2810 set_bit(R5_Wantwrite, &dev->flags); 2653 set_bit(R5_Wantwrite, &dev->flags);
2811 if (!test_and_set_bit(
2812 STRIPE_OP_IO, &sh->ops.pending))
2813 sh->ops.count++;
2814 if (prexor) 2654 if (prexor)
2815 continue; 2655 continue;
2816 if (!test_bit(R5_Insync, &dev->flags) || 2656 if (!test_bit(R5_Insync, &dev->flags) ||
@@ -2832,20 +2672,18 @@ static void handle_stripe5(struct stripe_head *sh)
2832 * 2/ A 'check' operation is in flight, as it may clobber the parity 2672 * 2/ A 'check' operation is in flight, as it may clobber the parity
2833 * block. 2673 * block.
2834 */ 2674 */
2835 if (s.to_write && !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending) && 2675 if (s.to_write && !sh->reconstruct_state && !sh->check_state)
2836 !test_bit(STRIPE_OP_CHECK, &sh->ops.pending)) 2676 handle_stripe_dirtying5(conf, sh, &s, disks);
2837 handle_issuing_new_write_requests5(conf, sh, &s, disks);
2838 2677
2839 /* maybe we need to check and possibly fix the parity for this stripe 2678 /* maybe we need to check and possibly fix the parity for this stripe
2840 * Any reads will already have been scheduled, so we just see if enough 2679 * Any reads will already have been scheduled, so we just see if enough
2841 * data is available. The parity check is held off while parity 2680 * data is available. The parity check is held off while parity
2842 * dependent operations are in flight. 2681 * dependent operations are in flight.
2843 */ 2682 */
2844 if ((s.syncing && s.locked == 0 && 2683 if (sh->check_state ||
2845 !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending) && 2684 (s.syncing && s.locked == 0 &&
2846 !test_bit(STRIPE_INSYNC, &sh->state)) || 2685 !test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
2847 test_bit(STRIPE_OP_CHECK, &sh->ops.pending) || 2686 !test_bit(STRIPE_INSYNC, &sh->state)))
2848 test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending))
2849 handle_parity_checks5(conf, sh, &s, disks); 2687 handle_parity_checks5(conf, sh, &s, disks);
2850 2688
2851 if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { 2689 if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
@@ -2864,52 +2702,35 @@ static void handle_stripe5(struct stripe_head *sh)
2864 dev = &sh->dev[s.failed_num]; 2702 dev = &sh->dev[s.failed_num];
2865 if (!test_bit(R5_ReWrite, &dev->flags)) { 2703 if (!test_bit(R5_ReWrite, &dev->flags)) {
2866 set_bit(R5_Wantwrite, &dev->flags); 2704 set_bit(R5_Wantwrite, &dev->flags);
2867 if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
2868 sh->ops.count++;
2869 set_bit(R5_ReWrite, &dev->flags); 2705 set_bit(R5_ReWrite, &dev->flags);
2870 set_bit(R5_LOCKED, &dev->flags); 2706 set_bit(R5_LOCKED, &dev->flags);
2871 s.locked++; 2707 s.locked++;
2872 } else { 2708 } else {
2873 /* let's read it back */ 2709 /* let's read it back */
2874 set_bit(R5_Wantread, &dev->flags); 2710 set_bit(R5_Wantread, &dev->flags);
2875 if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
2876 sh->ops.count++;
2877 set_bit(R5_LOCKED, &dev->flags); 2711 set_bit(R5_LOCKED, &dev->flags);
2878 s.locked++; 2712 s.locked++;
2879 } 2713 }
2880 } 2714 }
2881 2715
2882 /* Finish postxor operations initiated by the expansion 2716 /* Finish reconstruct operations initiated by the expansion process */
2883 * process 2717 if (sh->reconstruct_state == reconstruct_state_result) {
2884 */ 2718 sh->reconstruct_state = reconstruct_state_idle;
2885 if (test_bit(STRIPE_OP_POSTXOR, &sh->ops.complete) &&
2886 !test_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending)) {
2887
2888 clear_bit(STRIPE_EXPANDING, &sh->state); 2719 clear_bit(STRIPE_EXPANDING, &sh->state);
2889 2720 for (i = conf->raid_disks; i--; )
2890 clear_bit(STRIPE_OP_POSTXOR, &sh->ops.pending);
2891 clear_bit(STRIPE_OP_POSTXOR, &sh->ops.ack);
2892 clear_bit(STRIPE_OP_POSTXOR, &sh->ops.complete);
2893
2894 for (i = conf->raid_disks; i--; ) {
2895 set_bit(R5_Wantwrite, &sh->dev[i].flags); 2721 set_bit(R5_Wantwrite, &sh->dev[i].flags);
2896 set_bit(R5_LOCKED, &dev->flags); 2722 set_bit(R5_LOCKED, &dev->flags);
2897 s.locked++; 2723 s.locked++;
2898 if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
2899 sh->ops.count++;
2900 }
2901 } 2724 }
2902 2725
2903 if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) && 2726 if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) &&
2904 !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) { 2727 !sh->reconstruct_state) {
2905 /* Need to write out all blocks after computing parity */ 2728 /* Need to write out all blocks after computing parity */
2906 sh->disks = conf->raid_disks; 2729 sh->disks = conf->raid_disks;
2907 sh->pd_idx = stripe_to_pdidx(sh->sector, conf, 2730 sh->pd_idx = stripe_to_pdidx(sh->sector, conf,
2908 conf->raid_disks); 2731 conf->raid_disks);
2909 s.locked += handle_write_operations5(sh, 1, 1); 2732 schedule_reconstruction5(sh, &s, 1, 1);
2910 } else if (s.expanded && 2733 } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) {
2911 s.locked == 0 &&
2912 !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) {
2913 clear_bit(STRIPE_EXPAND_READY, &sh->state); 2734 clear_bit(STRIPE_EXPAND_READY, &sh->state);
2914 atomic_dec(&conf->reshape_stripes); 2735 atomic_dec(&conf->reshape_stripes);
2915 wake_up(&conf->wait_for_overlap); 2736 wake_up(&conf->wait_for_overlap);
@@ -2917,12 +2738,9 @@ static void handle_stripe5(struct stripe_head *sh)
2917 } 2738 }
2918 2739
2919 if (s.expanding && s.locked == 0 && 2740 if (s.expanding && s.locked == 0 &&
2920 !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) 2741 !test_bit(STRIPE_COMPUTE_RUN, &sh->state))
2921 handle_stripe_expansion(conf, sh, NULL); 2742 handle_stripe_expansion(conf, sh, NULL);
2922 2743
2923 if (sh->ops.count)
2924 pending = get_stripe_work(sh);
2925
2926 unlock: 2744 unlock:
2927 spin_unlock(&sh->lock); 2745 spin_unlock(&sh->lock);
2928 2746
@@ -2930,11 +2748,12 @@ static void handle_stripe5(struct stripe_head *sh)
2930 if (unlikely(blocked_rdev)) 2748 if (unlikely(blocked_rdev))
2931 md_wait_for_blocked_rdev(blocked_rdev, conf->mddev); 2749 md_wait_for_blocked_rdev(blocked_rdev, conf->mddev);
2932 2750
2933 if (pending) 2751 if (s.ops_request)
2934 raid5_run_ops(sh, pending); 2752 raid5_run_ops(sh, s.ops_request);
2935 2753
2936 return_io(return_bi); 2754 ops_run_io(sh, &s);
2937 2755
2756 return_io(return_bi);
2938} 2757}
2939 2758
2940static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) 2759static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
@@ -3042,8 +2861,7 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
3042 * might need to be failed 2861 * might need to be failed
3043 */ 2862 */
3044 if (s.failed > 2 && s.to_read+s.to_write+s.written) 2863 if (s.failed > 2 && s.to_read+s.to_write+s.written)
3045 handle_requests_to_failed_array(conf, sh, &s, disks, 2864 handle_failed_stripe(conf, sh, &s, disks, &return_bi);
3046 &return_bi);
3047 if (s.failed > 2 && s.syncing) { 2865 if (s.failed > 2 && s.syncing) {
3048 md_done_sync(conf->mddev, STRIPE_SECTORS,0); 2866 md_done_sync(conf->mddev, STRIPE_SECTORS,0);
3049 clear_bit(STRIPE_SYNCING, &sh->state); 2867 clear_bit(STRIPE_SYNCING, &sh->state);
@@ -3068,7 +2886,7 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
3068 ( r6s.q_failed || ((test_bit(R5_Insync, &qdev->flags) 2886 ( r6s.q_failed || ((test_bit(R5_Insync, &qdev->flags)
3069 && !test_bit(R5_LOCKED, &qdev->flags) 2887 && !test_bit(R5_LOCKED, &qdev->flags)
3070 && test_bit(R5_UPTODATE, &qdev->flags))))) 2888 && test_bit(R5_UPTODATE, &qdev->flags)))))
3071 handle_completed_write_requests(conf, sh, disks, &return_bi); 2889 handle_stripe_clean_event(conf, sh, disks, &return_bi);
3072 2890
3073 /* Now we might consider reading some blocks, either to check/generate 2891 /* Now we might consider reading some blocks, either to check/generate
3074 * parity, or to satisfy requests 2892 * parity, or to satisfy requests
@@ -3076,11 +2894,11 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
3076 */ 2894 */
3077 if (s.to_read || s.non_overwrite || (s.to_write && s.failed) || 2895 if (s.to_read || s.non_overwrite || (s.to_write && s.failed) ||
3078 (s.syncing && (s.uptodate < disks)) || s.expanding) 2896 (s.syncing && (s.uptodate < disks)) || s.expanding)
3079 handle_issuing_new_read_requests6(sh, &s, &r6s, disks); 2897 handle_stripe_fill6(sh, &s, &r6s, disks);
3080 2898
3081 /* now to consider writing and what else, if anything should be read */ 2899 /* now to consider writing and what else, if anything should be read */
3082 if (s.to_write) 2900 if (s.to_write)
3083 handle_issuing_new_write_requests6(conf, sh, &s, &r6s, disks); 2901 handle_stripe_dirtying6(conf, sh, &s, &r6s, disks);
3084 2902
3085 /* maybe we need to check and possibly fix the parity for this stripe 2903 /* maybe we need to check and possibly fix the parity for this stripe
3086 * Any reads will already have been scheduled, so we just see if enough 2904 * Any reads will already have been scheduled, so we just see if enough
@@ -3136,7 +2954,7 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
3136 } 2954 }
3137 2955
3138 if (s.expanding && s.locked == 0 && 2956 if (s.expanding && s.locked == 0 &&
3139 !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) 2957 !test_bit(STRIPE_COMPUTE_RUN, &sh->state))
3140 handle_stripe_expansion(conf, sh, &r6s); 2958 handle_stripe_expansion(conf, sh, &r6s);
3141 2959
3142 unlock: 2960 unlock:
@@ -3146,68 +2964,9 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
3146 if (unlikely(blocked_rdev)) 2964 if (unlikely(blocked_rdev))
3147 md_wait_for_blocked_rdev(blocked_rdev, conf->mddev); 2965 md_wait_for_blocked_rdev(blocked_rdev, conf->mddev);
3148 2966
3149 return_io(return_bi); 2967 ops_run_io(sh, &s);
3150
3151 for (i=disks; i-- ;) {
3152 int rw;
3153 struct bio *bi;
3154 mdk_rdev_t *rdev;
3155 if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags))
3156 rw = WRITE;
3157 else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
3158 rw = READ;
3159 else
3160 continue;
3161
3162 set_bit(STRIPE_IO_STARTED, &sh->state);
3163
3164 bi = &sh->dev[i].req;
3165
3166 bi->bi_rw = rw;
3167 if (rw == WRITE)
3168 bi->bi_end_io = raid5_end_write_request;
3169 else
3170 bi->bi_end_io = raid5_end_read_request;
3171
3172 rcu_read_lock();
3173 rdev = rcu_dereference(conf->disks[i].rdev);
3174 if (rdev && test_bit(Faulty, &rdev->flags))
3175 rdev = NULL;
3176 if (rdev)
3177 atomic_inc(&rdev->nr_pending);
3178 rcu_read_unlock();
3179 2968
3180 if (rdev) { 2969 return_io(return_bi);
3181 if (s.syncing || s.expanding || s.expanded)
3182 md_sync_acct(rdev->bdev, STRIPE_SECTORS);
3183
3184 bi->bi_bdev = rdev->bdev;
3185 pr_debug("for %llu schedule op %ld on disc %d\n",
3186 (unsigned long long)sh->sector, bi->bi_rw, i);
3187 atomic_inc(&sh->count);
3188 bi->bi_sector = sh->sector + rdev->data_offset;
3189 bi->bi_flags = 1 << BIO_UPTODATE;
3190 bi->bi_vcnt = 1;
3191 bi->bi_max_vecs = 1;
3192 bi->bi_idx = 0;
3193 bi->bi_io_vec = &sh->dev[i].vec;
3194 bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
3195 bi->bi_io_vec[0].bv_offset = 0;
3196 bi->bi_size = STRIPE_SIZE;
3197 bi->bi_next = NULL;
3198 if (rw == WRITE &&
3199 test_bit(R5_ReWrite, &sh->dev[i].flags))
3200 atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
3201 generic_make_request(bi);
3202 } else {
3203 if (rw == WRITE)
3204 set_bit(STRIPE_DEGRADED, &sh->state);
3205 pr_debug("skip op %ld on disc %d for sector %llu\n",
3206 bi->bi_rw, i, (unsigned long long)sh->sector);
3207 clear_bit(R5_LOCKED, &sh->dev[i].flags);
3208 set_bit(STRIPE_HANDLE, &sh->state);
3209 }
3210 }
3211} 2970}
3212 2971
3213static void handle_stripe(struct stripe_head *sh, struct page *tmp_page) 2972static void handle_stripe(struct stripe_head *sh, struct page *tmp_page)
@@ -3697,9 +3456,7 @@ static int make_request(struct request_queue *q, struct bio * bi)
3697 if ( rw == WRITE ) 3456 if ( rw == WRITE )
3698 md_write_end(mddev); 3457 md_write_end(mddev);
3699 3458
3700 bi->bi_end_io(bi, 3459 bio_endio(bi, 0);
3701 test_bit(BIO_UPTODATE, &bi->bi_flags)
3702 ? 0 : -EIO);
3703 } 3460 }
3704 return 0; 3461 return 0;
3705} 3462}
@@ -3785,7 +3542,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
3785 j == raid6_next_disk(sh->pd_idx, sh->disks)) 3542 j == raid6_next_disk(sh->pd_idx, sh->disks))
3786 continue; 3543 continue;
3787 s = compute_blocknr(sh, j); 3544 s = compute_blocknr(sh, j);
3788 if (s < (mddev->array_size<<1)) { 3545 if (s < mddev->array_sectors) {
3789 skipped = 1; 3546 skipped = 1;
3790 continue; 3547 continue;
3791 } 3548 }
@@ -4002,12 +3759,8 @@ static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio)
4002 spin_lock_irq(&conf->device_lock); 3759 spin_lock_irq(&conf->device_lock);
4003 remaining = --raid_bio->bi_phys_segments; 3760 remaining = --raid_bio->bi_phys_segments;
4004 spin_unlock_irq(&conf->device_lock); 3761 spin_unlock_irq(&conf->device_lock);
4005 if (remaining == 0) { 3762 if (remaining == 0)
4006 3763 bio_endio(raid_bio, 0);
4007 raid_bio->bi_end_io(raid_bio,
4008 test_bit(BIO_UPTODATE, &raid_bio->bi_flags)
4009 ? 0 : -EIO);
4010 }
4011 if (atomic_dec_and_test(&conf->active_aligned_reads)) 3764 if (atomic_dec_and_test(&conf->active_aligned_reads))
4012 wake_up(&conf->wait_for_stripe); 3765 wake_up(&conf->wait_for_stripe);
4013 return handled; 3766 return handled;
@@ -4094,6 +3847,8 @@ raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len)
4094{ 3847{
4095 raid5_conf_t *conf = mddev_to_conf(mddev); 3848 raid5_conf_t *conf = mddev_to_conf(mddev);
4096 unsigned long new; 3849 unsigned long new;
3850 int err;
3851
4097 if (len >= PAGE_SIZE) 3852 if (len >= PAGE_SIZE)
4098 return -EINVAL; 3853 return -EINVAL;
4099 if (!conf) 3854 if (!conf)
@@ -4109,7 +3864,9 @@ raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len)
4109 else 3864 else
4110 break; 3865 break;
4111 } 3866 }
4112 md_allow_write(mddev); 3867 err = md_allow_write(mddev);
3868 if (err)
3869 return err;
4113 while (new > conf->max_nr_stripes) { 3870 while (new > conf->max_nr_stripes) {
4114 if (grow_one_stripe(conf)) 3871 if (grow_one_stripe(conf))
4115 conf->max_nr_stripes++; 3872 conf->max_nr_stripes++;
@@ -4434,7 +4191,7 @@ static int run(mddev_t *mddev)
4434 mddev->queue->backing_dev_info.congested_data = mddev; 4191 mddev->queue->backing_dev_info.congested_data = mddev;
4435 mddev->queue->backing_dev_info.congested_fn = raid5_congested; 4192 mddev->queue->backing_dev_info.congested_fn = raid5_congested;
4436 4193
4437 mddev->array_size = mddev->size * (conf->previous_raid_disks - 4194 mddev->array_sectors = 2 * mddev->size * (conf->previous_raid_disks -
4438 conf->max_degraded); 4195 conf->max_degraded);
4439 4196
4440 blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec); 4197 blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec);
@@ -4609,35 +4366,41 @@ abort:
4609static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) 4366static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
4610{ 4367{
4611 raid5_conf_t *conf = mddev->private; 4368 raid5_conf_t *conf = mddev->private;
4612 int found = 0; 4369 int err = -EEXIST;
4613 int disk; 4370 int disk;
4614 struct disk_info *p; 4371 struct disk_info *p;
4372 int first = 0;
4373 int last = conf->raid_disks - 1;
4615 4374
4616 if (mddev->degraded > conf->max_degraded) 4375 if (mddev->degraded > conf->max_degraded)
4617 /* no point adding a device */ 4376 /* no point adding a device */
4618 return 0; 4377 return -EINVAL;
4378
4379 if (rdev->raid_disk >= 0)
4380 first = last = rdev->raid_disk;
4619 4381
4620 /* 4382 /*
4621 * find the disk ... but prefer rdev->saved_raid_disk 4383 * find the disk ... but prefer rdev->saved_raid_disk
4622 * if possible. 4384 * if possible.
4623 */ 4385 */
4624 if (rdev->saved_raid_disk >= 0 && 4386 if (rdev->saved_raid_disk >= 0 &&
4387 rdev->saved_raid_disk >= first &&
4625 conf->disks[rdev->saved_raid_disk].rdev == NULL) 4388 conf->disks[rdev->saved_raid_disk].rdev == NULL)
4626 disk = rdev->saved_raid_disk; 4389 disk = rdev->saved_raid_disk;
4627 else 4390 else
4628 disk = 0; 4391 disk = first;
4629 for ( ; disk < conf->raid_disks; disk++) 4392 for ( ; disk <= last ; disk++)
4630 if ((p=conf->disks + disk)->rdev == NULL) { 4393 if ((p=conf->disks + disk)->rdev == NULL) {
4631 clear_bit(In_sync, &rdev->flags); 4394 clear_bit(In_sync, &rdev->flags);
4632 rdev->raid_disk = disk; 4395 rdev->raid_disk = disk;
4633 found = 1; 4396 err = 0;
4634 if (rdev->saved_raid_disk != disk) 4397 if (rdev->saved_raid_disk != disk)
4635 conf->fullsync = 1; 4398 conf->fullsync = 1;
4636 rcu_assign_pointer(p->rdev, rdev); 4399 rcu_assign_pointer(p->rdev, rdev);
4637 break; 4400 break;
4638 } 4401 }
4639 print_raid5_conf(conf); 4402 print_raid5_conf(conf);
4640 return found; 4403 return err;
4641} 4404}
4642 4405
4643static int raid5_resize(mddev_t *mddev, sector_t sectors) 4406static int raid5_resize(mddev_t *mddev, sector_t sectors)
@@ -4652,8 +4415,9 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors)
4652 raid5_conf_t *conf = mddev_to_conf(mddev); 4415 raid5_conf_t *conf = mddev_to_conf(mddev);
4653 4416
4654 sectors &= ~((sector_t)mddev->chunk_size/512 - 1); 4417 sectors &= ~((sector_t)mddev->chunk_size/512 - 1);
4655 mddev->array_size = (sectors * (mddev->raid_disks-conf->max_degraded))>>1; 4418 mddev->array_sectors = sectors * (mddev->raid_disks
4656 set_capacity(mddev->gendisk, mddev->array_size << 1); 4419 - conf->max_degraded);
4420 set_capacity(mddev->gendisk, mddev->array_sectors);
4657 mddev->changed = 1; 4421 mddev->changed = 1;
4658 if (sectors/2 > mddev->size && mddev->recovery_cp == MaxSector) { 4422 if (sectors/2 > mddev->size && mddev->recovery_cp == MaxSector) {
4659 mddev->recovery_cp = mddev->size << 1; 4423 mddev->recovery_cp = mddev->size << 1;
@@ -4738,7 +4502,7 @@ static int raid5_start_reshape(mddev_t *mddev)
4738 rdev_for_each(rdev, rtmp, mddev) 4502 rdev_for_each(rdev, rtmp, mddev)
4739 if (rdev->raid_disk < 0 && 4503 if (rdev->raid_disk < 0 &&
4740 !test_bit(Faulty, &rdev->flags)) { 4504 !test_bit(Faulty, &rdev->flags)) {
4741 if (raid5_add_disk(mddev, rdev)) { 4505 if (raid5_add_disk(mddev, rdev) == 0) {
4742 char nm[20]; 4506 char nm[20];
4743 set_bit(In_sync, &rdev->flags); 4507 set_bit(In_sync, &rdev->flags);
4744 added_devices++; 4508 added_devices++;
@@ -4786,15 +4550,16 @@ static void end_reshape(raid5_conf_t *conf)
4786 struct block_device *bdev; 4550 struct block_device *bdev;
4787 4551
4788 if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) { 4552 if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) {
4789 conf->mddev->array_size = conf->mddev->size * 4553 conf->mddev->array_sectors = 2 * conf->mddev->size *
4790 (conf->raid_disks - conf->max_degraded); 4554 (conf->raid_disks - conf->max_degraded);
4791 set_capacity(conf->mddev->gendisk, conf->mddev->array_size << 1); 4555 set_capacity(conf->mddev->gendisk, conf->mddev->array_sectors);
4792 conf->mddev->changed = 1; 4556 conf->mddev->changed = 1;
4793 4557
4794 bdev = bdget_disk(conf->mddev->gendisk, 0); 4558 bdev = bdget_disk(conf->mddev->gendisk, 0);
4795 if (bdev) { 4559 if (bdev) {
4796 mutex_lock(&bdev->bd_inode->i_mutex); 4560 mutex_lock(&bdev->bd_inode->i_mutex);
4797 i_size_write(bdev->bd_inode, (loff_t)conf->mddev->array_size << 10); 4561 i_size_write(bdev->bd_inode,
4562 (loff_t)conf->mddev->array_sectors << 9);
4798 mutex_unlock(&bdev->bd_inode->i_mutex); 4563 mutex_unlock(&bdev->bd_inode->i_mutex);
4799 bdput(bdev); 4564 bdput(bdev);
4800 } 4565 }
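A note on the unit changes in the array_size to array_sectors conversions above: mddev->size is kept in 1 KiB blocks, while the new mddev->array_sectors is in 512-byte sectors, which accounts for every factor of two in these hunks:

    array_sectors        = size_in_KiB * 2        /* set_capacity() takes sectors, so the old "<< 1" goes away */
    i_size_write() bytes = array_sectors << 9     /* replaces the old "array_size << 10" */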
diff --git a/include/linux/raid/bitmap.h b/include/linux/raid/bitmap.h
index 78bfdea24a8e..e98900671ca9 100644
--- a/include/linux/raid/bitmap.h
+++ b/include/linux/raid/bitmap.h
@@ -221,6 +221,7 @@ struct bitmap {
221 unsigned long syncchunk; 221 unsigned long syncchunk;
222 222
223 __u64 events_cleared; 223 __u64 events_cleared;
224 int need_sync;
224 225
225 /* bitmap spinlock */ 226 /* bitmap spinlock */
226 spinlock_t lock; 227 spinlock_t lock;
diff --git a/include/linux/raid/linear.h b/include/linux/raid/linear.h
index ba15469daf11..7e375111d007 100644
--- a/include/linux/raid/linear.h
+++ b/include/linux/raid/linear.h
@@ -16,7 +16,7 @@ struct linear_private_data
16 struct linear_private_data *prev; /* earlier version */ 16 struct linear_private_data *prev; /* earlier version */
17 dev_info_t **hash_table; 17 dev_info_t **hash_table;
18 sector_t hash_spacing; 18 sector_t hash_spacing;
19 sector_t array_size; 19 sector_t array_sectors;
20 int preshift; /* shift before dividing by hash_spacing */ 20 int preshift; /* shift before dividing by hash_spacing */
21 dev_info_t disks[0]; 21 dev_info_t disks[0];
22}; 22};
diff --git a/include/linux/raid/md.h b/include/linux/raid/md.h
index b7386ae9d288..dc0e3fcb9f28 100644
--- a/include/linux/raid/md.h
+++ b/include/linux/raid/md.h
@@ -95,7 +95,7 @@ extern int sync_page_io(struct block_device *bdev, sector_t sector, int size,
95 struct page *page, int rw); 95 struct page *page, int rw);
96extern void md_do_sync(mddev_t *mddev); 96extern void md_do_sync(mddev_t *mddev);
97extern void md_new_event(mddev_t *mddev); 97extern void md_new_event(mddev_t *mddev);
98extern void md_allow_write(mddev_t *mddev); 98extern int md_allow_write(mddev_t *mddev);
99extern void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev); 99extern void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev);
100 100
101#endif /* CONFIG_MD */ 101#endif /* CONFIG_MD */
diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h
index 3dea9f545c8f..9f2549ac0e2d 100644
--- a/include/linux/raid/md_k.h
+++ b/include/linux/raid/md_k.h
@@ -59,7 +59,7 @@ struct mdk_rdev_s
59 int sb_loaded; 59 int sb_loaded;
60 __u64 sb_events; 60 __u64 sb_events;
61 sector_t data_offset; /* start of data in array */ 61 sector_t data_offset; /* start of data in array */
62 sector_t sb_offset; 62 sector_t sb_start; /* offset of the super block (in 512byte sectors) */
63 int sb_size; /* bytes in the superblock */ 63 int sb_size; /* bytes in the superblock */
64 int preferred_minor; /* autorun support */ 64 int preferred_minor; /* autorun support */
65 65
@@ -87,6 +87,9 @@ struct mdk_rdev_s
87#define Blocked 8 /* An error occured on an externally 87#define Blocked 8 /* An error occured on an externally
88 * managed array, don't allow writes 88 * managed array, don't allow writes
89 * until it is cleared */ 89 * until it is cleared */
90#define StateChanged 9 /* Faulty or Blocked has changed during
91 * interrupt, so it needs to be
92 * notified by the thread */
90 wait_queue_head_t blocked_wait; 93 wait_queue_head_t blocked_wait;
91 94
92 int desc_nr; /* descriptor index in the superblock */ 95 int desc_nr; /* descriptor index in the superblock */
@@ -147,7 +150,7 @@ struct mddev_s
147 int raid_disks; 150 int raid_disks;
148 int max_disks; 151 int max_disks;
149 sector_t size; /* used size of component devices */ 152 sector_t size; /* used size of component devices */
150 sector_t array_size; /* exported array size */ 153 sector_t array_sectors; /* exported array size */
151 __u64 events; 154 __u64 events;
152 155
153 char uuid[16]; 156 char uuid[16];
@@ -188,6 +191,7 @@ struct mddev_s
188 * NEEDED: we might need to start a resync/recover 191 * NEEDED: we might need to start a resync/recover
189 * RUNNING: a thread is running, or about to be started 192 * RUNNING: a thread is running, or about to be started
190 * SYNC: actually doing a resync, not a recovery 193 * SYNC: actually doing a resync, not a recovery
194 * RECOVER: doing recovery, or need to try it.
191 * INTR: resync needs to be aborted for some reason 195 * INTR: resync needs to be aborted for some reason
192 * DONE: thread is done and is waiting to be reaped 196 * DONE: thread is done and is waiting to be reaped
193 * REQUEST: user-space has requested a sync (used with SYNC) 197 * REQUEST: user-space has requested a sync (used with SYNC)
@@ -198,6 +202,7 @@ struct mddev_s
198 */ 202 */
199#define MD_RECOVERY_RUNNING 0 203#define MD_RECOVERY_RUNNING 0
200#define MD_RECOVERY_SYNC 1 204#define MD_RECOVERY_SYNC 1
205#define MD_RECOVERY_RECOVER 2
201#define MD_RECOVERY_INTR 3 206#define MD_RECOVERY_INTR 3
202#define MD_RECOVERY_DONE 4 207#define MD_RECOVERY_DONE 4
203#define MD_RECOVERY_NEEDED 5 208#define MD_RECOVERY_NEEDED 5
@@ -210,7 +215,8 @@ struct mddev_s
210 215
211 int in_sync; /* know to not need resync */ 216 int in_sync; /* know to not need resync */
212 struct mutex reconfig_mutex; 217 struct mutex reconfig_mutex;
213 atomic_t active; 218 atomic_t active; /* general refcount */
219 atomic_t openers; /* number of active opens */
214 220
215 int changed; /* true if we might need to reread partition info */ 221 int changed; /* true if we might need to reread partition info */
216 int degraded; /* whether md should consider 222 int degraded; /* whether md should consider
@@ -227,6 +233,8 @@ struct mddev_s
227 atomic_t recovery_active; /* blocks scheduled, but not written */ 233 atomic_t recovery_active; /* blocks scheduled, but not written */
228 wait_queue_head_t recovery_wait; 234 wait_queue_head_t recovery_wait;
229 sector_t recovery_cp; 235 sector_t recovery_cp;
236 sector_t resync_min; /* user requested sync
237 * starts here */
230 sector_t resync_max; /* resync should pause 238 sector_t resync_max; /* resync should pause
231 * when it gets here */ 239 * when it gets here */
232 240
@@ -331,6 +339,9 @@ static inline char * mdname (mddev_t * mddev)
331#define rdev_for_each(rdev, tmp, mddev) \ 339#define rdev_for_each(rdev, tmp, mddev) \
332 rdev_for_each_list(rdev, tmp, (mddev)->disks) 340 rdev_for_each_list(rdev, tmp, (mddev)->disks)
333 341
342#define rdev_for_each_rcu(rdev, mddev) \
343 list_for_each_entry_rcu(rdev, &((mddev)->disks), same_set)
344
334typedef struct mdk_thread_s { 345typedef struct mdk_thread_s {
335 void (*run) (mddev_t *mddev); 346 void (*run) (mddev_t *mddev);
336 mddev_t *mddev; 347 mddev_t *mddev;
diff --git a/include/linux/raid/md_p.h b/include/linux/raid/md_p.h
index 3f2cd98c508b..8b4de4a41ff1 100644
--- a/include/linux/raid/md_p.h
+++ b/include/linux/raid/md_p.h
@@ -43,14 +43,11 @@
43 */ 43 */
44#define MD_RESERVED_BYTES (64 * 1024) 44#define MD_RESERVED_BYTES (64 * 1024)
45#define MD_RESERVED_SECTORS (MD_RESERVED_BYTES / 512) 45#define MD_RESERVED_SECTORS (MD_RESERVED_BYTES / 512)
46#define MD_RESERVED_BLOCKS (MD_RESERVED_BYTES / BLOCK_SIZE)
47 46
48#define MD_NEW_SIZE_SECTORS(x) ((x & ~(MD_RESERVED_SECTORS - 1)) - MD_RESERVED_SECTORS) 47#define MD_NEW_SIZE_SECTORS(x) ((x & ~(MD_RESERVED_SECTORS - 1)) - MD_RESERVED_SECTORS)
49#define MD_NEW_SIZE_BLOCKS(x) ((x & ~(MD_RESERVED_BLOCKS - 1)) - MD_RESERVED_BLOCKS)
50 48
51#define MD_SB_BYTES 4096 49#define MD_SB_BYTES 4096
52#define MD_SB_WORDS (MD_SB_BYTES / 4) 50#define MD_SB_WORDS (MD_SB_BYTES / 4)
53#define MD_SB_BLOCKS (MD_SB_BYTES / BLOCK_SIZE)
54#define MD_SB_SECTORS (MD_SB_BYTES / 512) 51#define MD_SB_SECTORS (MD_SB_BYTES / 512)
55 52
56/* 53/*
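
With the BLOCK_SIZE-based variants removed, only the sector-based reserved-space constants remain, and MD_NEW_SIZE_SECTORS() is what derives the 0.90-format data area: the device size is rounded down to a 64 KiB boundary and the last 64 KiB is set aside for the superblock. A standalone sketch of that arithmetic follows; the device size is an arbitrary example, not a value from the patch:

#include <stdio.h>

#define MD_RESERVED_BYTES       (64 * 1024)
#define MD_RESERVED_SECTORS     (MD_RESERVED_BYTES / 512)       /* 128 sectors */
#define MD_NEW_SIZE_SECTORS(x) ((x & ~(MD_RESERVED_SECTORS - 1)) - MD_RESERVED_SECTORS)

int main(void)
{
        unsigned long long dev_sectors = 1000000;       /* example component size */

        /* round down to a 64 KiB multiple, then drop the reserved superblock space */
        printf("usable data area: %llu sectors\n",
               (unsigned long long)MD_NEW_SIZE_SECTORS(dev_sectors));
        return 0;
}

For 1,000,000 sectors this prints 999808: the size rounds down to 999,936 (a multiple of 128 sectors, i.e. 64 KiB), and the final 128 sectors are reserved for the superblock.
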
diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h
index f0827d31ae6f..3b2672792457 100644
--- a/include/linux/raid/raid5.h
+++ b/include/linux/raid/raid5.h
@@ -158,6 +158,43 @@
158 * the compute block completes. 158 * the compute block completes.
159 */ 159 */
160 160
161/*
162 * Operations state - intermediate states that are visible outside of sh->lock
163 * In general _idle indicates nothing is running, _run indicates a data
164 * processing operation is active, and _result means the data processing result
165 * is stable and can be acted upon. For simple operations like biofill and
166 * compute that only have an _idle and _run state they are indicated with
167 * sh->state flags (STRIPE_BIOFILL_RUN and STRIPE_COMPUTE_RUN)
168 */
169/**
170 * enum check_states - handles syncing / repairing a stripe
171 * @check_state_idle - check operations are quiesced
172 * @check_state_run - check operation is running
173 * @check_state_check_result - set outside lock when check result is valid
174 * @check_state_compute_run - check failed and we are repairing
175 * @check_state_compute_result - set outside lock when compute result is valid
176 */
177enum check_states {
178 check_state_idle = 0,
179 check_state_run, /* parity check */
180 check_state_check_result,
181 check_state_compute_run, /* parity repair */
182 check_state_compute_result,
183};
184
185/**
186 * enum reconstruct_states - handles writing or expanding a stripe
187 */
188enum reconstruct_states {
189 reconstruct_state_idle = 0,
190 reconstruct_state_prexor_drain_run, /* prexor-write */
191 reconstruct_state_drain_run, /* write */
192 reconstruct_state_run, /* expand */
193 reconstruct_state_prexor_drain_result,
194 reconstruct_state_drain_result,
195 reconstruct_state_result,
196};
197
161struct stripe_head { 198struct stripe_head {
162 struct hlist_node hash; 199 struct hlist_node hash;
163 struct list_head lru; /* inactive_list or handle_list */ 200 struct list_head lru; /* inactive_list or handle_list */
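
The comment block added above describes a three-phase convention: _idle means nothing is in flight, _run means the asynchronous operation is executing, and _result means its outcome is stable and can be acted on, with single-phase operations such as biofill and compute tracked by sh->state bits instead. The following is a standalone userspace model of that progression for the check path; it is not the raid5 handler code, only an illustration of how the check_states values are meant to be traversed:

#include <stdio.h>

/* Simplified model of the _idle -> _run -> _result progression. */
enum check_states_model {
        model_idle = 0,
        model_run,              /* async parity check in flight */
        model_check_result,     /* check finished, result can be read */
        model_compute_run,      /* repair (recompute parity) in flight */
        model_compute_result,   /* repaired parity ready to be written */
};

static enum check_states_model advance(enum check_states_model s, int parity_ok)
{
        switch (s) {
        case model_idle:
                return model_run;                       /* issue the check */
        case model_check_result:
                return parity_ok ? model_idle : model_compute_run;
        case model_compute_result:
                return model_idle;                      /* parity rewritten */
        default:
                return s;       /* _run states advance via completion, not here */
        }
}

int main(void)
{
        enum check_states_model s = model_idle;

        s = advance(s, 0);              /* idle -> run */
        s = model_check_result;         /* completion publishes the result state */
        s = advance(s, 0);              /* mismatch -> repair */
        printf("state after failed check: %d (compute_run)\n", s);
        return 0;
}

This mirrors the header comment's note that the _result values are set outside the lock by the completion path, while the decisions about what to run next are taken elsewhere.
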
@@ -169,19 +206,13 @@ struct stripe_head {
169 spinlock_t lock; 206 spinlock_t lock;
170 int bm_seq; /* sequence number for bitmap flushes */ 207 int bm_seq; /* sequence number for bitmap flushes */
171 int disks; /* disks in stripe */ 208 int disks; /* disks in stripe */
209 enum check_states check_state;
210 enum reconstruct_states reconstruct_state;
172 /* stripe_operations 211 /* stripe_operations
173 * @pending - pending ops flags (set for request->issue->complete)
174 * @ack - submitted ops flags (set for issue->complete)
175 * @complete - completed ops flags (set for complete)
176 * @target - STRIPE_OP_COMPUTE_BLK target 212 * @target - STRIPE_OP_COMPUTE_BLK target
177 * @count - raid5_runs_ops is set to run when this is non-zero
178 */ 213 */
179 struct stripe_operations { 214 struct stripe_operations {
180 unsigned long pending;
181 unsigned long ack;
182 unsigned long complete;
183 int target; 215 int target;
184 int count;
185 u32 zero_sum_result; 216 u32 zero_sum_result;
186 } ops; 217 } ops;
187 struct r5dev { 218 struct r5dev {
@@ -202,6 +233,7 @@ struct stripe_head_state {
202 int locked, uptodate, to_read, to_write, failed, written; 233 int locked, uptodate, to_read, to_write, failed, written;
203 int to_fill, compute, req_compute, non_overwrite; 234 int to_fill, compute, req_compute, non_overwrite;
204 int failed_num; 235 int failed_num;
236 unsigned long ops_request;
205}; 237};
206 238
207/* r6_state - extra state data only relevant to r6 */ 239/* r6_state - extra state data only relevant to r6 */
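
The per-call stripe_head_state now carries an ops_request word in place of the removed pending/ack/complete bookkeeping in struct stripe_operations: the handler collects the STRIPE_OP_* bits it wants executed for this pass, and the ops runner consumes that word. A rough sketch of that flag flow, assuming kernel context; the function names are placeholders rather than the actual raid5 entry points:

static void example_schedule_ops(struct stripe_head_state *s, int want_check)
{
        /* decided while the stripe state is being evaluated under sh->lock */
        set_bit(STRIPE_OP_BIOFILL, &s->ops_request);
        if (want_check)
                set_bit(STRIPE_OP_CHECK, &s->ops_request);
}

static void example_run_ops(struct stripe_head *sh, unsigned long ops_request)
{
        /* the runner only acts on the bits handed to it for this pass */
        if (test_bit(STRIPE_OP_BIOFILL, &ops_request))
                pr_debug("%s: biofill for stripe %llu\n", __func__,
                         (unsigned long long)sh->sector);
        if (test_bit(STRIPE_OP_CHECK, &ops_request))
                pr_debug("%s: parity check for stripe %llu\n", __func__,
                         (unsigned long long)sh->sector);
}
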
@@ -228,9 +260,7 @@ struct r6_state {
228#define R5_Wantfill 12 /* dev->toread contains a bio that needs 260#define R5_Wantfill 12 /* dev->toread contains a bio that needs
229 * filling 261 * filling
230 */ 262 */
231#define R5_Wantprexor 13 /* distinguish blocks ready for rmw from 263#define R5_Wantdrain 13 /* dev->towrite needs to be drained */
232 * other "towrites"
233 */
234/* 264/*
235 * Write method 265 * Write method
236 */ 266 */
@@ -254,8 +284,10 @@ struct r6_state {
254#define STRIPE_EXPAND_READY 11 284#define STRIPE_EXPAND_READY 11
255#define STRIPE_IO_STARTED 12 /* do not count towards 'bypass_count' */ 285#define STRIPE_IO_STARTED 12 /* do not count towards 'bypass_count' */
256#define STRIPE_FULL_WRITE 13 /* all blocks are set to be overwritten */ 286#define STRIPE_FULL_WRITE 13 /* all blocks are set to be overwritten */
287#define STRIPE_BIOFILL_RUN 14
288#define STRIPE_COMPUTE_RUN 15
257/* 289/*
258 * Operations flags (in issue order) 290 * Operation request flags
259 */ 291 */
260#define STRIPE_OP_BIOFILL 0 292#define STRIPE_OP_BIOFILL 0
261#define STRIPE_OP_COMPUTE_BLK 1 293#define STRIPE_OP_COMPUTE_BLK 1
@@ -263,14 +295,6 @@ struct r6_state {
263#define STRIPE_OP_BIODRAIN 3 295#define STRIPE_OP_BIODRAIN 3
264#define STRIPE_OP_POSTXOR 4 296#define STRIPE_OP_POSTXOR 4
265#define STRIPE_OP_CHECK 5 297#define STRIPE_OP_CHECK 5
266#define STRIPE_OP_IO 6
267
268/* modifiers to the base operations
269 * STRIPE_OP_MOD_REPAIR_PD - compute the parity block and write it back
270 * STRIPE_OP_MOD_DMA_CHECK - parity is not corrupted by the check
271 */
272#define STRIPE_OP_MOD_REPAIR_PD 7
273#define STRIPE_OP_MOD_DMA_CHECK 8
274 298
275/* 299/*
276 * Plugging: 300 * Plugging: