diff options
-rw-r--r-- | Documentation/md.txt | 30 | ||||
-rw-r--r-- | drivers/md/bitmap.c | 54 | ||||
-rw-r--r-- | drivers/md/faulty.c | 2 | ||||
-rw-r--r-- | drivers/md/linear.c | 20 | ||||
-rw-r--r-- | drivers/md/md.c | 615 | ||||
-rw-r--r-- | drivers/md/multipath.c | 17 | ||||
-rw-r--r-- | drivers/md/raid0.c | 8 | ||||
-rw-r--r-- | drivers/md/raid1.c | 30 | ||||
-rw-r--r-- | drivers/md/raid10.c | 22 | ||||
-rw-r--r-- | drivers/md/raid5.c | 745 | ||||
-rw-r--r-- | include/linux/raid/bitmap.h | 1 | ||||
-rw-r--r-- | include/linux/raid/linear.h | 2 | ||||
-rw-r--r-- | include/linux/raid/md.h | 2 | ||||
-rw-r--r-- | include/linux/raid/md_k.h | 17 | ||||
-rw-r--r-- | include/linux/raid/md_p.h | 3 | ||||
-rw-r--r-- | include/linux/raid/raid5.h | 64 |
16 files changed, 842 insertions, 790 deletions
diff --git a/Documentation/md.txt b/Documentation/md.txt index a8b430627473..1da9d1b1793f 100644 --- a/Documentation/md.txt +++ b/Documentation/md.txt | |||
@@ -236,6 +236,11 @@ All md devices contain: | |||
236 | writing the word for the desired state, however some states | 236 | writing the word for the desired state, however some states |
237 | cannot be explicitly set, and some transitions are not allowed. | 237 | cannot be explicitly set, and some transitions are not allowed. |
238 | 238 | ||
239 | Select/poll works on this file. All changes except between | ||
240 | active_idle and active (which can be frequent and are not | ||
241 | very interesting) are notified. active->active_idle is | ||
242 | reported if the metadata is externally managed. | ||
243 | |||
239 | clear | 244 | clear |
240 | No devices, no size, no level | 245 | No devices, no size, no level |
241 | Writing is equivalent to STOP_ARRAY ioctl | 246 | Writing is equivalent to STOP_ARRAY ioctl |
@@ -292,6 +297,10 @@ Each directory contains: | |||
292 | writemostly - device will only be subject to read | 297 | writemostly - device will only be subject to read |
293 | requests if there are no other options. | 298 | requests if there are no other options. |
294 | This applies only to raid1 arrays. | 299 | This applies only to raid1 arrays. |
300 | blocked - device has failed, metadata is "external", | ||
301 | and the failure hasn't been acknowledged yet. | ||
302 | Writes that would write to this device if | ||
303 | it were not faulty are blocked. | ||
295 | spare - device is working, but not a full member. | 304 | spare - device is working, but not a full member. |
296 | This includes spares that are in the process | 305 | This includes spares that are in the process |
297 | of being recovered to | 306 | of being recovered to |
@@ -301,6 +310,12 @@ Each directory contains: | |||
301 | Writing "remove" removes the device from the array. | 310 | Writing "remove" removes the device from the array. |
302 | Writing "writemostly" sets the writemostly flag. | 311 | Writing "writemostly" sets the writemostly flag. |
303 | Writing "-writemostly" clears the writemostly flag. | 312 | Writing "-writemostly" clears the writemostly flag. |
313 | Writing "blocked" sets the "blocked" flag. | ||
314 | Writing "-blocked" clears the "blocked" flag and allows writes | ||
315 | to complete. | ||
316 | |||
317 | This file responds to select/poll. Any change to 'faulty' | ||
318 | or 'blocked' causes an event. | ||
304 | 319 | ||
305 | errors | 320 | errors |
306 | An approximate count of read errors that have been detected on | 321 | An approximate count of read errors that have been detected on |
@@ -332,7 +347,7 @@ Each directory contains: | |||
332 | for storage of data. This will normally be the same as the | 347 | for storage of data. This will normally be the same as the |
333 | component_size. This can be written while assembling an | 348 | component_size. This can be written while assembling an |
334 | array. If a value less than the current component_size is | 349 | array. If a value less than the current component_size is |
335 | written, component_size will be reduced to this value. | 350 | written, it will be rejected. |
336 | 351 | ||
337 | 352 | ||
338 | An active md device will also contain an entry for each active device | 353 | An active md device will also contain an entry for each active device |
@@ -381,6 +396,19 @@ also have | |||
381 | 'check' and 'repair' will start the appropriate process | 396 | 'check' and 'repair' will start the appropriate process |
382 | providing the current state is 'idle'. | 397 | providing the current state is 'idle'. |
383 | 398 | ||
399 | This file responds to select/poll. Any important change in the value | ||
400 | triggers a poll event. Sometimes the value will briefly be | ||
401 | "recover" if a recovery seems to be needed, but cannot be | ||
402 | achieved. In that case, the transition to "recover" isn't | ||
403 | notified, but the transition away is. | ||
404 | |||
405 | degraded | ||
406 | This contains a count of the number of devices by which the | ||
407 | array is degraded. So an optimal array will show '0'. A | ||
408 | single failed/missing drive will show '1', etc. | ||
409 | This file responds to select/poll; any increase or decrease | ||
410 | in the count of missing devices will trigger an event. | ||
411 | |||
384 | mismatch_count | 412 | mismatch_count |
385 | When performing 'check' and 'repair', and possibly when | 413 | When performing 'check' and 'repair', and possibly when |
386 | performing 'resync', md will count the number of errors that are | 414 | performing 'resync', md will count the number of errors that are |
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index b26927ce889c..621a272a2c74 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c | |||
@@ -225,7 +225,7 @@ static struct page *read_sb_page(mddev_t *mddev, long offset, unsigned long inde | |||
225 | || test_bit(Faulty, &rdev->flags)) | 225 | || test_bit(Faulty, &rdev->flags)) |
226 | continue; | 226 | continue; |
227 | 227 | ||
228 | target = (rdev->sb_offset << 1) + offset + index * (PAGE_SIZE/512); | 228 | target = rdev->sb_start + offset + index * (PAGE_SIZE/512); |
229 | 229 | ||
230 | if (sync_page_io(rdev->bdev, target, PAGE_SIZE, page, READ)) { | 230 | if (sync_page_io(rdev->bdev, target, PAGE_SIZE, page, READ)) { |
231 | page->index = index; | 231 | page->index = index; |
@@ -241,10 +241,10 @@ static struct page *read_sb_page(mddev_t *mddev, long offset, unsigned long inde | |||
241 | static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait) | 241 | static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait) |
242 | { | 242 | { |
243 | mdk_rdev_t *rdev; | 243 | mdk_rdev_t *rdev; |
244 | struct list_head *tmp; | ||
245 | mddev_t *mddev = bitmap->mddev; | 244 | mddev_t *mddev = bitmap->mddev; |
246 | 245 | ||
247 | rdev_for_each(rdev, tmp, mddev) | 246 | rcu_read_lock(); |
247 | rdev_for_each_rcu(rdev, mddev) | ||
248 | if (test_bit(In_sync, &rdev->flags) | 248 | if (test_bit(In_sync, &rdev->flags) |
249 | && !test_bit(Faulty, &rdev->flags)) { | 249 | && !test_bit(Faulty, &rdev->flags)) { |
250 | int size = PAGE_SIZE; | 250 | int size = PAGE_SIZE; |
@@ -260,32 +260,37 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait) | |||
260 | + (long)(page->index * (PAGE_SIZE/512)) | 260 | + (long)(page->index * (PAGE_SIZE/512)) |
261 | + size/512 > 0) | 261 | + size/512 > 0) |
262 | /* bitmap runs in to metadata */ | 262 | /* bitmap runs in to metadata */ |
263 | return -EINVAL; | 263 | goto bad_alignment; |
264 | if (rdev->data_offset + mddev->size*2 | 264 | if (rdev->data_offset + mddev->size*2 |
265 | > rdev->sb_offset*2 + bitmap->offset) | 265 | > rdev->sb_start + bitmap->offset) |
266 | /* data runs in to bitmap */ | 266 | /* data runs in to bitmap */ |
267 | return -EINVAL; | 267 | goto bad_alignment; |
268 | } else if (rdev->sb_offset*2 < rdev->data_offset) { | 268 | } else if (rdev->sb_start < rdev->data_offset) { |
269 | /* METADATA BITMAP DATA */ | 269 | /* METADATA BITMAP DATA */ |
270 | if (rdev->sb_offset*2 | 270 | if (rdev->sb_start |
271 | + bitmap->offset | 271 | + bitmap->offset |
272 | + page->index*(PAGE_SIZE/512) + size/512 | 272 | + page->index*(PAGE_SIZE/512) + size/512 |
273 | > rdev->data_offset) | 273 | > rdev->data_offset) |
274 | /* bitmap runs in to data */ | 274 | /* bitmap runs in to data */ |
275 | return -EINVAL; | 275 | goto bad_alignment; |
276 | } else { | 276 | } else { |
277 | /* DATA METADATA BITMAP - no problems */ | 277 | /* DATA METADATA BITMAP - no problems */ |
278 | } | 278 | } |
279 | md_super_write(mddev, rdev, | 279 | md_super_write(mddev, rdev, |
280 | (rdev->sb_offset<<1) + bitmap->offset | 280 | rdev->sb_start + bitmap->offset |
281 | + page->index * (PAGE_SIZE/512), | 281 | + page->index * (PAGE_SIZE/512), |
282 | size, | 282 | size, |
283 | page); | 283 | page); |
284 | } | 284 | } |
285 | rcu_read_unlock(); | ||
285 | 286 | ||
286 | if (wait) | 287 | if (wait) |
287 | md_super_wait(mddev); | 288 | md_super_wait(mddev); |
288 | return 0; | 289 | return 0; |
290 | |||
291 | bad_alignment: | ||
292 | rcu_read_unlock(); | ||
293 | return -EINVAL; | ||
289 | } | 294 | } |
290 | 295 | ||
291 | static void bitmap_file_kick(struct bitmap *bitmap); | 296 | static void bitmap_file_kick(struct bitmap *bitmap); |
@@ -454,8 +459,11 @@ void bitmap_update_sb(struct bitmap *bitmap) | |||
454 | spin_unlock_irqrestore(&bitmap->lock, flags); | 459 | spin_unlock_irqrestore(&bitmap->lock, flags); |
455 | sb = (bitmap_super_t *)kmap_atomic(bitmap->sb_page, KM_USER0); | 460 | sb = (bitmap_super_t *)kmap_atomic(bitmap->sb_page, KM_USER0); |
456 | sb->events = cpu_to_le64(bitmap->mddev->events); | 461 | sb->events = cpu_to_le64(bitmap->mddev->events); |
457 | if (!bitmap->mddev->degraded) | 462 | if (bitmap->mddev->events < bitmap->events_cleared) { |
458 | sb->events_cleared = cpu_to_le64(bitmap->mddev->events); | 463 | /* rolling back to read-only */ |
464 | bitmap->events_cleared = bitmap->mddev->events; | ||
465 | sb->events_cleared = cpu_to_le64(bitmap->events_cleared); | ||
466 | } | ||
459 | kunmap_atomic(sb, KM_USER0); | 467 | kunmap_atomic(sb, KM_USER0); |
460 | write_page(bitmap, bitmap->sb_page, 1); | 468 | write_page(bitmap, bitmap->sb_page, 1); |
461 | } | 469 | } |
@@ -1085,9 +1093,19 @@ void bitmap_daemon_work(struct bitmap *bitmap) | |||
1085 | } else | 1093 | } else |
1086 | spin_unlock_irqrestore(&bitmap->lock, flags); | 1094 | spin_unlock_irqrestore(&bitmap->lock, flags); |
1087 | lastpage = page; | 1095 | lastpage = page; |
1088 | /* | 1096 | |
1089 | printk("bitmap clean at page %lu\n", j); | 1097 | /* We are possibly going to clear some bits, so make |
1090 | */ | 1098 | * sure that events_cleared is up-to-date. |
1099 | */ | ||
1100 | if (bitmap->need_sync) { | ||
1101 | bitmap_super_t *sb; | ||
1102 | bitmap->need_sync = 0; | ||
1103 | sb = kmap_atomic(bitmap->sb_page, KM_USER0); | ||
1104 | sb->events_cleared = | ||
1105 | cpu_to_le64(bitmap->events_cleared); | ||
1106 | kunmap_atomic(sb, KM_USER0); | ||
1107 | write_page(bitmap, bitmap->sb_page, 1); | ||
1108 | } | ||
1091 | spin_lock_irqsave(&bitmap->lock, flags); | 1109 | spin_lock_irqsave(&bitmap->lock, flags); |
1092 | clear_page_attr(bitmap, page, BITMAP_PAGE_CLEAN); | 1110 | clear_page_attr(bitmap, page, BITMAP_PAGE_CLEAN); |
1093 | } | 1111 | } |
@@ -1257,6 +1275,12 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto | |||
1257 | return; | 1275 | return; |
1258 | } | 1276 | } |
1259 | 1277 | ||
1278 | if (success && | ||
1279 | bitmap->events_cleared < bitmap->mddev->events) { | ||
1280 | bitmap->events_cleared = bitmap->mddev->events; | ||
1281 | bitmap->need_sync = 1; | ||
1282 | } | ||
1283 | |||
1260 | if (!success && ! (*bmc & NEEDED_MASK)) | 1284 | if (!success && ! (*bmc & NEEDED_MASK)) |
1261 | *bmc |= NEEDED_MASK; | 1285 | *bmc |= NEEDED_MASK; |
1262 | 1286 | ||
diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c index d107ddceefcd..268547dbfbd3 100644 --- a/drivers/md/faulty.c +++ b/drivers/md/faulty.c | |||
@@ -297,7 +297,7 @@ static int run(mddev_t *mddev) | |||
297 | rdev_for_each(rdev, tmp, mddev) | 297 | rdev_for_each(rdev, tmp, mddev) |
298 | conf->rdev = rdev; | 298 | conf->rdev = rdev; |
299 | 299 | ||
300 | mddev->array_size = mddev->size; | 300 | mddev->array_sectors = mddev->size * 2; |
301 | mddev->private = conf; | 301 | mddev->private = conf; |
302 | 302 | ||
303 | reconfig(mddev, mddev->layout, -1); | 303 | reconfig(mddev, mddev->layout, -1); |
diff --git a/drivers/md/linear.c b/drivers/md/linear.c index 6a866d7c8ae5..b1eebf88c209 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c | |||
@@ -122,13 +122,13 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks) | |||
122 | return NULL; | 122 | return NULL; |
123 | 123 | ||
124 | cnt = 0; | 124 | cnt = 0; |
125 | conf->array_size = 0; | 125 | conf->array_sectors = 0; |
126 | 126 | ||
127 | rdev_for_each(rdev, tmp, mddev) { | 127 | rdev_for_each(rdev, tmp, mddev) { |
128 | int j = rdev->raid_disk; | 128 | int j = rdev->raid_disk; |
129 | dev_info_t *disk = conf->disks + j; | 129 | dev_info_t *disk = conf->disks + j; |
130 | 130 | ||
131 | if (j < 0 || j > raid_disks || disk->rdev) { | 131 | if (j < 0 || j >= raid_disks || disk->rdev) { |
132 | printk("linear: disk numbering problem. Aborting!\n"); | 132 | printk("linear: disk numbering problem. Aborting!\n"); |
133 | goto out; | 133 | goto out; |
134 | } | 134 | } |
@@ -146,7 +146,7 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks) | |||
146 | blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); | 146 | blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); |
147 | 147 | ||
148 | disk->size = rdev->size; | 148 | disk->size = rdev->size; |
149 | conf->array_size += rdev->size; | 149 | conf->array_sectors += rdev->size * 2; |
150 | 150 | ||
151 | cnt++; | 151 | cnt++; |
152 | } | 152 | } |
@@ -155,7 +155,7 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks) | |||
155 | goto out; | 155 | goto out; |
156 | } | 156 | } |
157 | 157 | ||
158 | min_spacing = conf->array_size; | 158 | min_spacing = conf->array_sectors / 2; |
159 | sector_div(min_spacing, PAGE_SIZE/sizeof(struct dev_info *)); | 159 | sector_div(min_spacing, PAGE_SIZE/sizeof(struct dev_info *)); |
160 | 160 | ||
161 | /* min_spacing is the minimum spacing that will fit the hash | 161 | /* min_spacing is the minimum spacing that will fit the hash |
@@ -164,7 +164,7 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks) | |||
164 | * that is larger than min_spacing and use the size of that as | 164 | * that is larger than min_spacing and use the size of that as |
165 | * the actual spacing | 165 | * the actual spacing |
166 | */ | 166 | */ |
167 | conf->hash_spacing = conf->array_size; | 167 | conf->hash_spacing = conf->array_sectors / 2; |
168 | for (i=0; i < cnt-1 ; i++) { | 168 | for (i=0; i < cnt-1 ; i++) { |
169 | sector_t sz = 0; | 169 | sector_t sz = 0; |
170 | int j; | 170 | int j; |
@@ -194,7 +194,7 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks) | |||
194 | unsigned round; | 194 | unsigned round; |
195 | unsigned long base; | 195 | unsigned long base; |
196 | 196 | ||
197 | sz = conf->array_size >> conf->preshift; | 197 | sz = conf->array_sectors >> (conf->preshift + 1); |
198 | sz += 1; /* force round-up */ | 198 | sz += 1; /* force round-up */ |
199 | base = conf->hash_spacing >> conf->preshift; | 199 | base = conf->hash_spacing >> conf->preshift; |
200 | round = sector_div(sz, base); | 200 | round = sector_div(sz, base); |
@@ -221,7 +221,7 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks) | |||
221 | curr_offset = 0; | 221 | curr_offset = 0; |
222 | i = 0; | 222 | i = 0; |
223 | for (curr_offset = 0; | 223 | for (curr_offset = 0; |
224 | curr_offset < conf->array_size; | 224 | curr_offset < conf->array_sectors / 2; |
225 | curr_offset += conf->hash_spacing) { | 225 | curr_offset += conf->hash_spacing) { |
226 | 226 | ||
227 | while (i < raid_disks-1 && | 227 | while (i < raid_disks-1 && |
@@ -258,7 +258,7 @@ static int linear_run (mddev_t *mddev) | |||
258 | if (!conf) | 258 | if (!conf) |
259 | return 1; | 259 | return 1; |
260 | mddev->private = conf; | 260 | mddev->private = conf; |
261 | mddev->array_size = conf->array_size; | 261 | mddev->array_sectors = conf->array_sectors; |
262 | 262 | ||
263 | blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec); | 263 | blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec); |
264 | mddev->queue->unplug_fn = linear_unplug; | 264 | mddev->queue->unplug_fn = linear_unplug; |
@@ -292,8 +292,8 @@ static int linear_add(mddev_t *mddev, mdk_rdev_t *rdev) | |||
292 | newconf->prev = mddev_to_conf(mddev); | 292 | newconf->prev = mddev_to_conf(mddev); |
293 | mddev->private = newconf; | 293 | mddev->private = newconf; |
294 | mddev->raid_disks++; | 294 | mddev->raid_disks++; |
295 | mddev->array_size = newconf->array_size; | 295 | mddev->array_sectors = newconf->array_sectors; |
296 | set_capacity(mddev->gendisk, mddev->array_size << 1); | 296 | set_capacity(mddev->gendisk, mddev->array_sectors); |
297 | return 0; | 297 | return 0; |
298 | } | 298 | } |
299 | 299 | ||
diff --git a/drivers/md/md.c b/drivers/md/md.c index 2580ac1b9b0f..c2ff77ccec50 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c | |||
@@ -169,7 +169,6 @@ void md_new_event(mddev_t *mddev) | |||
169 | { | 169 | { |
170 | atomic_inc(&md_event_count); | 170 | atomic_inc(&md_event_count); |
171 | wake_up(&md_event_waiters); | 171 | wake_up(&md_event_waiters); |
172 | sysfs_notify(&mddev->kobj, NULL, "sync_action"); | ||
173 | } | 172 | } |
174 | EXPORT_SYMBOL_GPL(md_new_event); | 173 | EXPORT_SYMBOL_GPL(md_new_event); |
175 | 174 | ||
@@ -274,10 +273,12 @@ static mddev_t * mddev_find(dev_t unit) | |||
274 | INIT_LIST_HEAD(&new->all_mddevs); | 273 | INIT_LIST_HEAD(&new->all_mddevs); |
275 | init_timer(&new->safemode_timer); | 274 | init_timer(&new->safemode_timer); |
276 | atomic_set(&new->active, 1); | 275 | atomic_set(&new->active, 1); |
276 | atomic_set(&new->openers, 0); | ||
277 | spin_lock_init(&new->write_lock); | 277 | spin_lock_init(&new->write_lock); |
278 | init_waitqueue_head(&new->sb_wait); | 278 | init_waitqueue_head(&new->sb_wait); |
279 | init_waitqueue_head(&new->recovery_wait); | 279 | init_waitqueue_head(&new->recovery_wait); |
280 | new->reshape_position = MaxSector; | 280 | new->reshape_position = MaxSector; |
281 | new->resync_min = 0; | ||
281 | new->resync_max = MaxSector; | 282 | new->resync_max = MaxSector; |
282 | new->level = LEVEL_NONE; | 283 | new->level = LEVEL_NONE; |
283 | 284 | ||
@@ -347,21 +348,20 @@ static struct mdk_personality *find_pers(int level, char *clevel) | |||
347 | return NULL; | 348 | return NULL; |
348 | } | 349 | } |
349 | 350 | ||
351 | /* return the offset of the super block in 512byte sectors */ | ||
350 | static inline sector_t calc_dev_sboffset(struct block_device *bdev) | 352 | static inline sector_t calc_dev_sboffset(struct block_device *bdev) |
351 | { | 353 | { |
352 | sector_t size = bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; | 354 | sector_t num_sectors = bdev->bd_inode->i_size / 512; |
353 | return MD_NEW_SIZE_BLOCKS(size); | 355 | return MD_NEW_SIZE_SECTORS(num_sectors); |
354 | } | 356 | } |
355 | 357 | ||
356 | static sector_t calc_dev_size(mdk_rdev_t *rdev, unsigned chunk_size) | 358 | static sector_t calc_num_sectors(mdk_rdev_t *rdev, unsigned chunk_size) |
357 | { | 359 | { |
358 | sector_t size; | 360 | sector_t num_sectors = rdev->sb_start; |
359 | |||
360 | size = rdev->sb_offset; | ||
361 | 361 | ||
362 | if (chunk_size) | 362 | if (chunk_size) |
363 | size &= ~((sector_t)chunk_size/1024 - 1); | 363 | num_sectors &= ~((sector_t)chunk_size/512 - 1); |
364 | return size; | 364 | return num_sectors; |
365 | } | 365 | } |
366 | 366 | ||
367 | static int alloc_disk_sb(mdk_rdev_t * rdev) | 367 | static int alloc_disk_sb(mdk_rdev_t * rdev) |
@@ -372,7 +372,7 @@ static int alloc_disk_sb(mdk_rdev_t * rdev) | |||
372 | rdev->sb_page = alloc_page(GFP_KERNEL); | 372 | rdev->sb_page = alloc_page(GFP_KERNEL); |
373 | if (!rdev->sb_page) { | 373 | if (!rdev->sb_page) { |
374 | printk(KERN_ALERT "md: out of memory.\n"); | 374 | printk(KERN_ALERT "md: out of memory.\n"); |
375 | return -EINVAL; | 375 | return -ENOMEM; |
376 | } | 376 | } |
377 | 377 | ||
378 | return 0; | 378 | return 0; |
@@ -384,7 +384,7 @@ static void free_disk_sb(mdk_rdev_t * rdev) | |||
384 | put_page(rdev->sb_page); | 384 | put_page(rdev->sb_page); |
385 | rdev->sb_loaded = 0; | 385 | rdev->sb_loaded = 0; |
386 | rdev->sb_page = NULL; | 386 | rdev->sb_page = NULL; |
387 | rdev->sb_offset = 0; | 387 | rdev->sb_start = 0; |
388 | rdev->size = 0; | 388 | rdev->size = 0; |
389 | } | 389 | } |
390 | } | 390 | } |
@@ -530,7 +530,7 @@ static int read_disk_sb(mdk_rdev_t * rdev, int size) | |||
530 | return 0; | 530 | return 0; |
531 | 531 | ||
532 | 532 | ||
533 | if (!sync_page_io(rdev->bdev, rdev->sb_offset<<1, size, rdev->sb_page, READ)) | 533 | if (!sync_page_io(rdev->bdev, rdev->sb_start, size, rdev->sb_page, READ)) |
534 | goto fail; | 534 | goto fail; |
535 | rdev->sb_loaded = 1; | 535 | rdev->sb_loaded = 1; |
536 | return 0; | 536 | return 0; |
@@ -543,17 +543,12 @@ fail: | |||
543 | 543 | ||
544 | static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2) | 544 | static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2) |
545 | { | 545 | { |
546 | if ( (sb1->set_uuid0 == sb2->set_uuid0) && | 546 | return sb1->set_uuid0 == sb2->set_uuid0 && |
547 | (sb1->set_uuid1 == sb2->set_uuid1) && | 547 | sb1->set_uuid1 == sb2->set_uuid1 && |
548 | (sb1->set_uuid2 == sb2->set_uuid2) && | 548 | sb1->set_uuid2 == sb2->set_uuid2 && |
549 | (sb1->set_uuid3 == sb2->set_uuid3)) | 549 | sb1->set_uuid3 == sb2->set_uuid3; |
550 | |||
551 | return 1; | ||
552 | |||
553 | return 0; | ||
554 | } | 550 | } |
555 | 551 | ||
556 | |||
557 | static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2) | 552 | static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2) |
558 | { | 553 | { |
559 | int ret; | 554 | int ret; |
@@ -564,7 +559,7 @@ static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2) | |||
564 | 559 | ||
565 | if (!tmp1 || !tmp2) { | 560 | if (!tmp1 || !tmp2) { |
566 | ret = 0; | 561 | ret = 0; |
567 | printk(KERN_INFO "md.c: sb1 is not equal to sb2!\n"); | 562 | printk(KERN_INFO "md.c sb_equal(): failed to allocate memory!\n"); |
568 | goto abort; | 563 | goto abort; |
569 | } | 564 | } |
570 | 565 | ||
@@ -577,11 +572,7 @@ static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2) | |||
577 | tmp1->nr_disks = 0; | 572 | tmp1->nr_disks = 0; |
578 | tmp2->nr_disks = 0; | 573 | tmp2->nr_disks = 0; |
579 | 574 | ||
580 | if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4)) | 575 | ret = (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4) == 0); |
581 | ret = 0; | ||
582 | else | ||
583 | ret = 1; | ||
584 | |||
585 | abort: | 576 | abort: |
586 | kfree(tmp1); | 577 | kfree(tmp1); |
587 | kfree(tmp2); | 578 | kfree(tmp2); |
@@ -658,11 +649,14 @@ static unsigned int calc_sb_csum(mdp_super_t * sb) | |||
658 | */ | 649 | */ |
659 | 650 | ||
660 | struct super_type { | 651 | struct super_type { |
661 | char *name; | 652 | char *name; |
662 | struct module *owner; | 653 | struct module *owner; |
663 | int (*load_super)(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version); | 654 | int (*load_super)(mdk_rdev_t *rdev, mdk_rdev_t *refdev, |
664 | int (*validate_super)(mddev_t *mddev, mdk_rdev_t *rdev); | 655 | int minor_version); |
665 | void (*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev); | 656 | int (*validate_super)(mddev_t *mddev, mdk_rdev_t *rdev); |
657 | void (*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev); | ||
658 | unsigned long long (*rdev_size_change)(mdk_rdev_t *rdev, | ||
659 | sector_t num_sectors); | ||
666 | }; | 660 | }; |
667 | 661 | ||
668 | /* | 662 | /* |
@@ -673,16 +667,14 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version | |||
673 | char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; | 667 | char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; |
674 | mdp_super_t *sb; | 668 | mdp_super_t *sb; |
675 | int ret; | 669 | int ret; |
676 | sector_t sb_offset; | ||
677 | 670 | ||
678 | /* | 671 | /* |
679 | * Calculate the position of the superblock, | 672 | * Calculate the position of the superblock (512byte sectors), |
680 | * it's at the end of the disk. | 673 | * it's at the end of the disk. |
681 | * | 674 | * |
682 | * It also happens to be a multiple of 4Kb. | 675 | * It also happens to be a multiple of 4Kb. |
683 | */ | 676 | */ |
684 | sb_offset = calc_dev_sboffset(rdev->bdev); | 677 | rdev->sb_start = calc_dev_sboffset(rdev->bdev); |
685 | rdev->sb_offset = sb_offset; | ||
686 | 678 | ||
687 | ret = read_disk_sb(rdev, MD_SB_BYTES); | 679 | ret = read_disk_sb(rdev, MD_SB_BYTES); |
688 | if (ret) return ret; | 680 | if (ret) return ret; |
@@ -759,7 +751,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version | |||
759 | else | 751 | else |
760 | ret = 0; | 752 | ret = 0; |
761 | } | 753 | } |
762 | rdev->size = calc_dev_size(rdev, sb->chunk_size); | 754 | rdev->size = calc_num_sectors(rdev, sb->chunk_size) / 2; |
763 | 755 | ||
764 | if (rdev->size < sb->size && sb->level > 1) | 756 | if (rdev->size < sb->size && sb->level > 1) |
765 | /* "this cannot possibly happen" ... */ | 757 | /* "this cannot possibly happen" ... */ |
@@ -1004,6 +996,26 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) | |||
1004 | } | 996 | } |
1005 | 997 | ||
1006 | /* | 998 | /* |
999 | * rdev_size_change for 0.90.0 | ||
1000 | */ | ||
1001 | static unsigned long long | ||
1002 | super_90_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors) | ||
1003 | { | ||
1004 | if (num_sectors && num_sectors < rdev->mddev->size * 2) | ||
1005 | return 0; /* component must fit device */ | ||
1006 | if (rdev->mddev->bitmap_offset) | ||
1007 | return 0; /* can't move bitmap */ | ||
1008 | rdev->sb_start = calc_dev_sboffset(rdev->bdev); | ||
1009 | if (!num_sectors || num_sectors > rdev->sb_start) | ||
1010 | num_sectors = rdev->sb_start; | ||
1011 | md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, | ||
1012 | rdev->sb_page); | ||
1013 | md_super_wait(rdev->mddev); | ||
1014 | return num_sectors / 2; /* kB for sysfs */ | ||
1015 | } | ||
1016 | |||
1017 | |||
1018 | /* | ||
1007 | * version 1 superblock | 1019 | * version 1 superblock |
1008 | */ | 1020 | */ |
1009 | 1021 | ||
@@ -1034,12 +1046,12 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) | |||
1034 | { | 1046 | { |
1035 | struct mdp_superblock_1 *sb; | 1047 | struct mdp_superblock_1 *sb; |
1036 | int ret; | 1048 | int ret; |
1037 | sector_t sb_offset; | 1049 | sector_t sb_start; |
1038 | char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; | 1050 | char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; |
1039 | int bmask; | 1051 | int bmask; |
1040 | 1052 | ||
1041 | /* | 1053 | /* |
1042 | * Calculate the position of the superblock. | 1054 | * Calculate the position of the superblock in 512byte sectors. |
1043 | * It is always aligned to a 4K boundary and | 1055 | * It is always aligned to a 4K boundary and |
1044 | * depending on minor_version, it can be: | 1056 | * depending on minor_version, it can be: |
1045 | * 0: At least 8K, but less than 12K, from end of device | 1057 | * 0: At least 8K, but less than 12K, from end of device |
@@ -1048,22 +1060,20 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) | |||
1048 | */ | 1060 | */ |
1049 | switch(minor_version) { | 1061 | switch(minor_version) { |
1050 | case 0: | 1062 | case 0: |
1051 | sb_offset = rdev->bdev->bd_inode->i_size >> 9; | 1063 | sb_start = rdev->bdev->bd_inode->i_size >> 9; |
1052 | sb_offset -= 8*2; | 1064 | sb_start -= 8*2; |
1053 | sb_offset &= ~(sector_t)(4*2-1); | 1065 | sb_start &= ~(sector_t)(4*2-1); |
1054 | /* convert from sectors to K */ | ||
1055 | sb_offset /= 2; | ||
1056 | break; | 1066 | break; |
1057 | case 1: | 1067 | case 1: |
1058 | sb_offset = 0; | 1068 | sb_start = 0; |
1059 | break; | 1069 | break; |
1060 | case 2: | 1070 | case 2: |
1061 | sb_offset = 4; | 1071 | sb_start = 8; |
1062 | break; | 1072 | break; |
1063 | default: | 1073 | default: |
1064 | return -EINVAL; | 1074 | return -EINVAL; |
1065 | } | 1075 | } |
1066 | rdev->sb_offset = sb_offset; | 1076 | rdev->sb_start = sb_start; |
1067 | 1077 | ||
1068 | /* superblock is rarely larger than 1K, but it can be larger, | 1078 | /* superblock is rarely larger than 1K, but it can be larger, |
1069 | * and it is safe to read 4k, so we do that | 1079 | * and it is safe to read 4k, so we do that |
@@ -1077,7 +1087,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) | |||
1077 | if (sb->magic != cpu_to_le32(MD_SB_MAGIC) || | 1087 | if (sb->magic != cpu_to_le32(MD_SB_MAGIC) || |
1078 | sb->major_version != cpu_to_le32(1) || | 1088 | sb->major_version != cpu_to_le32(1) || |
1079 | le32_to_cpu(sb->max_dev) > (4096-256)/2 || | 1089 | le32_to_cpu(sb->max_dev) > (4096-256)/2 || |
1080 | le64_to_cpu(sb->super_offset) != (rdev->sb_offset<<1) || | 1090 | le64_to_cpu(sb->super_offset) != rdev->sb_start || |
1081 | (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0) | 1091 | (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0) |
1082 | return -EINVAL; | 1092 | return -EINVAL; |
1083 | 1093 | ||
@@ -1113,7 +1123,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) | |||
1113 | rdev->sb_size = (rdev->sb_size | bmask) + 1; | 1123 | rdev->sb_size = (rdev->sb_size | bmask) + 1; |
1114 | 1124 | ||
1115 | if (minor_version | 1125 | if (minor_version |
1116 | && rdev->data_offset < sb_offset + (rdev->sb_size/512)) | 1126 | && rdev->data_offset < sb_start + (rdev->sb_size/512)) |
1117 | return -EINVAL; | 1127 | return -EINVAL; |
1118 | 1128 | ||
1119 | if (sb->level == cpu_to_le32(LEVEL_MULTIPATH)) | 1129 | if (sb->level == cpu_to_le32(LEVEL_MULTIPATH)) |
@@ -1149,7 +1159,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) | |||
1149 | if (minor_version) | 1159 | if (minor_version) |
1150 | rdev->size = ((rdev->bdev->bd_inode->i_size>>9) - le64_to_cpu(sb->data_offset)) / 2; | 1160 | rdev->size = ((rdev->bdev->bd_inode->i_size>>9) - le64_to_cpu(sb->data_offset)) / 2; |
1151 | else | 1161 | else |
1152 | rdev->size = rdev->sb_offset; | 1162 | rdev->size = rdev->sb_start / 2; |
1153 | if (rdev->size < le64_to_cpu(sb->data_size)/2) | 1163 | if (rdev->size < le64_to_cpu(sb->data_size)/2) |
1154 | return -EINVAL; | 1164 | return -EINVAL; |
1155 | rdev->size = le64_to_cpu(sb->data_size)/2; | 1165 | rdev->size = le64_to_cpu(sb->data_size)/2; |
@@ -1328,35 +1338,74 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) | |||
1328 | sb->sb_csum = calc_sb_1_csum(sb); | 1338 | sb->sb_csum = calc_sb_1_csum(sb); |
1329 | } | 1339 | } |
1330 | 1340 | ||
1341 | static unsigned long long | ||
1342 | super_1_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors) | ||
1343 | { | ||
1344 | struct mdp_superblock_1 *sb; | ||
1345 | sector_t max_sectors; | ||
1346 | if (num_sectors && num_sectors < rdev->mddev->size * 2) | ||
1347 | return 0; /* component must fit device */ | ||
1348 | if (rdev->sb_start < rdev->data_offset) { | ||
1349 | /* minor versions 1 and 2; superblock before data */ | ||
1350 | max_sectors = rdev->bdev->bd_inode->i_size >> 9; | ||
1351 | max_sectors -= rdev->data_offset; | ||
1352 | if (!num_sectors || num_sectors > max_sectors) | ||
1353 | num_sectors = max_sectors; | ||
1354 | } else if (rdev->mddev->bitmap_offset) { | ||
1355 | /* minor version 0 with bitmap we can't move */ | ||
1356 | return 0; | ||
1357 | } else { | ||
1358 | /* minor version 0; superblock after data */ | ||
1359 | sector_t sb_start; | ||
1360 | sb_start = (rdev->bdev->bd_inode->i_size >> 9) - 8*2; | ||
1361 | sb_start &= ~(sector_t)(4*2 - 1); | ||
1362 | max_sectors = rdev->size * 2 + sb_start - rdev->sb_start; | ||
1363 | if (!num_sectors || num_sectors > max_sectors) | ||
1364 | num_sectors = max_sectors; | ||
1365 | rdev->sb_start = sb_start; | ||
1366 | } | ||
1367 | sb = (struct mdp_superblock_1 *) page_address(rdev->sb_page); | ||
1368 | sb->data_size = cpu_to_le64(num_sectors); | ||
1369 | sb->super_offset = rdev->sb_start; | ||
1370 | sb->sb_csum = calc_sb_1_csum(sb); | ||
1371 | md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, | ||
1372 | rdev->sb_page); | ||
1373 | md_super_wait(rdev->mddev); | ||
1374 | return num_sectors / 2; /* kB for sysfs */ | ||
1375 | } | ||
1331 | 1376 | ||
1332 | static struct super_type super_types[] = { | 1377 | static struct super_type super_types[] = { |
1333 | [0] = { | 1378 | [0] = { |
1334 | .name = "0.90.0", | 1379 | .name = "0.90.0", |
1335 | .owner = THIS_MODULE, | 1380 | .owner = THIS_MODULE, |
1336 | .load_super = super_90_load, | 1381 | .load_super = super_90_load, |
1337 | .validate_super = super_90_validate, | 1382 | .validate_super = super_90_validate, |
1338 | .sync_super = super_90_sync, | 1383 | .sync_super = super_90_sync, |
1384 | .rdev_size_change = super_90_rdev_size_change, | ||
1339 | }, | 1385 | }, |
1340 | [1] = { | 1386 | [1] = { |
1341 | .name = "md-1", | 1387 | .name = "md-1", |
1342 | .owner = THIS_MODULE, | 1388 | .owner = THIS_MODULE, |
1343 | .load_super = super_1_load, | 1389 | .load_super = super_1_load, |
1344 | .validate_super = super_1_validate, | 1390 | .validate_super = super_1_validate, |
1345 | .sync_super = super_1_sync, | 1391 | .sync_super = super_1_sync, |
1392 | .rdev_size_change = super_1_rdev_size_change, | ||
1346 | }, | 1393 | }, |
1347 | }; | 1394 | }; |
1348 | 1395 | ||
1349 | static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2) | 1396 | static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2) |
1350 | { | 1397 | { |
1351 | struct list_head *tmp, *tmp2; | ||
1352 | mdk_rdev_t *rdev, *rdev2; | 1398 | mdk_rdev_t *rdev, *rdev2; |
1353 | 1399 | ||
1354 | rdev_for_each(rdev, tmp, mddev1) | 1400 | rcu_read_lock(); |
1355 | rdev_for_each(rdev2, tmp2, mddev2) | 1401 | rdev_for_each_rcu(rdev, mddev1) |
1402 | rdev_for_each_rcu(rdev2, mddev2) | ||
1356 | if (rdev->bdev->bd_contains == | 1403 | if (rdev->bdev->bd_contains == |
1357 | rdev2->bdev->bd_contains) | 1404 | rdev2->bdev->bd_contains) { |
1405 | rcu_read_unlock(); | ||
1358 | return 1; | 1406 | return 1; |
1359 | 1407 | } | |
1408 | rcu_read_unlock(); | ||
1360 | return 0; | 1409 | return 0; |
1361 | } | 1410 | } |
1362 | 1411 | ||
@@ -1423,7 +1472,7 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) | |||
1423 | kobject_del(&rdev->kobj); | 1472 | kobject_del(&rdev->kobj); |
1424 | goto fail; | 1473 | goto fail; |
1425 | } | 1474 | } |
1426 | list_add(&rdev->same_set, &mddev->disks); | 1475 | list_add_rcu(&rdev->same_set, &mddev->disks); |
1427 | bd_claim_by_disk(rdev->bdev, rdev->bdev->bd_holder, mddev->gendisk); | 1476 | bd_claim_by_disk(rdev->bdev, rdev->bdev->bd_holder, mddev->gendisk); |
1428 | return 0; | 1477 | return 0; |
1429 | 1478 | ||
@@ -1448,14 +1497,16 @@ static void unbind_rdev_from_array(mdk_rdev_t * rdev) | |||
1448 | return; | 1497 | return; |
1449 | } | 1498 | } |
1450 | bd_release_from_disk(rdev->bdev, rdev->mddev->gendisk); | 1499 | bd_release_from_disk(rdev->bdev, rdev->mddev->gendisk); |
1451 | list_del_init(&rdev->same_set); | 1500 | list_del_rcu(&rdev->same_set); |
1452 | printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b)); | 1501 | printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b)); |
1453 | rdev->mddev = NULL; | 1502 | rdev->mddev = NULL; |
1454 | sysfs_remove_link(&rdev->kobj, "block"); | 1503 | sysfs_remove_link(&rdev->kobj, "block"); |
1455 | 1504 | ||
1456 | /* We need to delay this, otherwise we can deadlock when | 1505 | /* We need to delay this, otherwise we can deadlock when |
1457 | * writing to 'remove' to "dev/state" | 1506 | * writing to 'remove' to "dev/state". We also need |
1507 | * to delay it due to rcu usage. | ||
1458 | */ | 1508 | */ |
1509 | synchronize_rcu(); | ||
1459 | INIT_WORK(&rdev->del_work, md_delayed_delete); | 1510 | INIT_WORK(&rdev->del_work, md_delayed_delete); |
1460 | kobject_get(&rdev->kobj); | 1511 | kobject_get(&rdev->kobj); |
1461 | schedule_work(&rdev->del_work); | 1512 | schedule_work(&rdev->del_work); |
@@ -1511,7 +1562,6 @@ static void export_rdev(mdk_rdev_t * rdev) | |||
1511 | if (rdev->mddev) | 1562 | if (rdev->mddev) |
1512 | MD_BUG(); | 1563 | MD_BUG(); |
1513 | free_disk_sb(rdev); | 1564 | free_disk_sb(rdev); |
1514 | list_del_init(&rdev->same_set); | ||
1515 | #ifndef MODULE | 1565 | #ifndef MODULE |
1516 | if (test_bit(AutoDetected, &rdev->flags)) | 1566 | if (test_bit(AutoDetected, &rdev->flags)) |
1517 | md_autodetect_dev(rdev->bdev->bd_dev); | 1567 | md_autodetect_dev(rdev->bdev->bd_dev); |
@@ -1758,11 +1808,11 @@ repeat: | |||
1758 | dprintk("%s ", bdevname(rdev->bdev,b)); | 1808 | dprintk("%s ", bdevname(rdev->bdev,b)); |
1759 | if (!test_bit(Faulty, &rdev->flags)) { | 1809 | if (!test_bit(Faulty, &rdev->flags)) { |
1760 | md_super_write(mddev,rdev, | 1810 | md_super_write(mddev,rdev, |
1761 | rdev->sb_offset<<1, rdev->sb_size, | 1811 | rdev->sb_start, rdev->sb_size, |
1762 | rdev->sb_page); | 1812 | rdev->sb_page); |
1763 | dprintk(KERN_INFO "(write) %s's sb offset: %llu\n", | 1813 | dprintk(KERN_INFO "(write) %s's sb offset: %llu\n", |
1764 | bdevname(rdev->bdev,b), | 1814 | bdevname(rdev->bdev,b), |
1765 | (unsigned long long)rdev->sb_offset); | 1815 | (unsigned long long)rdev->sb_start); |
1766 | rdev->sb_events = mddev->events; | 1816 | rdev->sb_events = mddev->events; |
1767 | 1817 | ||
1768 | } else | 1818 | } else |
@@ -1787,7 +1837,7 @@ repeat: | |||
1787 | 1837 | ||
1788 | } | 1838 | } |
1789 | 1839 | ||
1790 | /* words written to sysfs files may, or my not, be \n terminated. | 1840 | /* words written to sysfs files may, or may not, be \n terminated. |
1791 | * We want to accept with case. For this we use cmd_match. | 1841 | * We want to accept with case. For this we use cmd_match. |
1792 | */ | 1842 | */ |
1793 | static int cmd_match(const char *cmd, const char *str) | 1843 | static int cmd_match(const char *cmd, const char *str) |
@@ -1886,6 +1936,8 @@ state_store(mdk_rdev_t *rdev, const char *buf, size_t len) | |||
1886 | 1936 | ||
1887 | err = 0; | 1937 | err = 0; |
1888 | } | 1938 | } |
1939 | if (!err) | ||
1940 | sysfs_notify(&rdev->kobj, NULL, "state"); | ||
1889 | return err ? err : len; | 1941 | return err ? err : len; |
1890 | } | 1942 | } |
1891 | static struct rdev_sysfs_entry rdev_state = | 1943 | static struct rdev_sysfs_entry rdev_state = |
@@ -1931,7 +1983,7 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len) | |||
1931 | slot = -1; | 1983 | slot = -1; |
1932 | else if (e==buf || (*e && *e!= '\n')) | 1984 | else if (e==buf || (*e && *e!= '\n')) |
1933 | return -EINVAL; | 1985 | return -EINVAL; |
1934 | if (rdev->mddev->pers) { | 1986 | if (rdev->mddev->pers && slot == -1) { |
1935 | /* Setting 'slot' on an active array requires also | 1987 | /* Setting 'slot' on an active array requires also |
1936 | * updating the 'rd%d' link, and communicating | 1988 | * updating the 'rd%d' link, and communicating |
1937 | * with the personality with ->hot_*_disk. | 1989 | * with the personality with ->hot_*_disk. |
@@ -1939,8 +1991,6 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len) | |||
1939 | * failed/spare devices. This normally happens automatically, | 1991 | * failed/spare devices. This normally happens automatically, |
1940 | * but not when the metadata is externally managed. | 1992 | * but not when the metadata is externally managed. |
1941 | */ | 1993 | */ |
1942 | if (slot != -1) | ||
1943 | return -EBUSY; | ||
1944 | if (rdev->raid_disk == -1) | 1994 | if (rdev->raid_disk == -1) |
1945 | return -EEXIST; | 1995 | return -EEXIST; |
1946 | /* personality does all needed checks */ | 1996 | /* personality does all needed checks */ |
@@ -1954,6 +2004,43 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len) | |||
1954 | sysfs_remove_link(&rdev->mddev->kobj, nm); | 2004 | sysfs_remove_link(&rdev->mddev->kobj, nm); |
1955 | set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); | 2005 | set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); |
1956 | md_wakeup_thread(rdev->mddev->thread); | 2006 | md_wakeup_thread(rdev->mddev->thread); |
2007 | } else if (rdev->mddev->pers) { | ||
2008 | mdk_rdev_t *rdev2; | ||
2009 | struct list_head *tmp; | ||
2010 | /* Activating a spare .. or possibly reactivating | ||
2011 | * if we ever get bitmaps working here. | |
2012 | */ | ||
2013 | |||
2014 | if (rdev->raid_disk != -1) | ||
2015 | return -EBUSY; | ||
2016 | |||
2017 | if (rdev->mddev->pers->hot_add_disk == NULL) | ||
2018 | return -EINVAL; | ||
2019 | |||
2020 | rdev_for_each(rdev2, tmp, rdev->mddev) | ||
2021 | if (rdev2->raid_disk == slot) | ||
2022 | return -EEXIST; | ||
2023 | |||
2024 | rdev->raid_disk = slot; | ||
2025 | if (test_bit(In_sync, &rdev->flags)) | ||
2026 | rdev->saved_raid_disk = slot; | ||
2027 | else | ||
2028 | rdev->saved_raid_disk = -1; | ||
2029 | err = rdev->mddev->pers-> | ||
2030 | hot_add_disk(rdev->mddev, rdev); | ||
2031 | if (err) { | ||
2032 | rdev->raid_disk = -1; | ||
2033 | return err; | ||
2034 | } else | ||
2035 | sysfs_notify(&rdev->kobj, NULL, "state"); | ||
2036 | sprintf(nm, "rd%d", rdev->raid_disk); | ||
2037 | if (sysfs_create_link(&rdev->mddev->kobj, &rdev->kobj, nm)) | ||
2038 | printk(KERN_WARNING | ||
2039 | "md: cannot register " | ||
2040 | "%s for %s\n", | ||
2041 | nm, mdname(rdev->mddev)); | ||
2042 | |||
2043 | /* don't wakeup anyone, leave that to userspace. */ | ||
1957 | } else { | 2044 | } else { |
1958 | if (slot >= rdev->mddev->raid_disks) | 2045 | if (slot >= rdev->mddev->raid_disks) |
1959 | return -ENOSPC; | 2046 | return -ENOSPC; |
@@ -1962,6 +2049,7 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len) | |||
1962 | clear_bit(Faulty, &rdev->flags); | 2049 | clear_bit(Faulty, &rdev->flags); |
1963 | clear_bit(WriteMostly, &rdev->flags); | 2050 | clear_bit(WriteMostly, &rdev->flags); |
1964 | set_bit(In_sync, &rdev->flags); | 2051 | set_bit(In_sync, &rdev->flags); |
2052 | sysfs_notify(&rdev->kobj, NULL, "state"); | ||
1965 | } | 2053 | } |
1966 | return len; | 2054 | return len; |
1967 | } | 2055 | } |
@@ -1983,7 +2071,7 @@ offset_store(mdk_rdev_t *rdev, const char *buf, size_t len) | |||
1983 | unsigned long long offset = simple_strtoull(buf, &e, 10); | 2071 | unsigned long long offset = simple_strtoull(buf, &e, 10); |
1984 | if (e==buf || (*e && *e != '\n')) | 2072 | if (e==buf || (*e && *e != '\n')) |
1985 | return -EINVAL; | 2073 | return -EINVAL; |
1986 | if (rdev->mddev->pers) | 2074 | if (rdev->mddev->pers && rdev->raid_disk >= 0) |
1987 | return -EBUSY; | 2075 | return -EBUSY; |
1988 | if (rdev->size && rdev->mddev->external) | 2076 | if (rdev->size && rdev->mddev->external) |
1989 | /* Must set offset before size, so overlap checks | 2077 | /* Must set offset before size, so overlap checks |
@@ -2015,17 +2103,30 @@ static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2) | |||
2015 | static ssize_t | 2103 | static ssize_t |
2016 | rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len) | 2104 | rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len) |
2017 | { | 2105 | { |
2018 | char *e; | 2106 | unsigned long long size; |
2019 | unsigned long long size = simple_strtoull(buf, &e, 10); | ||
2020 | unsigned long long oldsize = rdev->size; | 2107 | unsigned long long oldsize = rdev->size; |
2021 | mddev_t *my_mddev = rdev->mddev; | 2108 | mddev_t *my_mddev = rdev->mddev; |
2022 | 2109 | ||
2023 | if (e==buf || (*e && *e != '\n')) | 2110 | if (strict_strtoull(buf, 10, &size) < 0) |
2024 | return -EINVAL; | 2111 | return -EINVAL; |
2025 | if (my_mddev->pers) | 2112 | if (size < my_mddev->size) |
2026 | return -EBUSY; | 2113 | return -EINVAL; |
2114 | if (my_mddev->pers && rdev->raid_disk >= 0) { | ||
2115 | if (my_mddev->persistent) { | ||
2116 | size = super_types[my_mddev->major_version]. | ||
2117 | rdev_size_change(rdev, size * 2); | ||
2118 | if (!size) | ||
2119 | return -EBUSY; | ||
2120 | } else if (!size) { | ||
2121 | size = (rdev->bdev->bd_inode->i_size >> 10); | ||
2122 | size -= rdev->data_offset/2; | ||
2123 | } | ||
2124 | if (size < my_mddev->size) | ||
2125 | return -EINVAL; /* component must fit device */ | ||
2126 | } | ||
2127 | |||
2027 | rdev->size = size; | 2128 | rdev->size = size; |
2028 | if (size > oldsize && rdev->mddev->external) { | 2129 | if (size > oldsize && my_mddev->external) { |
2029 | /* need to check that all other rdevs with the same ->bdev | 2130 | /* need to check that all other rdevs with the same ->bdev |
2030 | * do not overlap. We need to unlock the mddev to avoid | 2131 | * do not overlap. We need to unlock the mddev to avoid |
2031 | * a deadlock. We have already changed rdev->size, and if | 2132 | * a deadlock. We have already changed rdev->size, and if |
@@ -2044,8 +2145,9 @@ rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len) | |||
2044 | if (test_bit(AllReserved, &rdev2->flags) || | 2145 | if (test_bit(AllReserved, &rdev2->flags) || |
2045 | (rdev->bdev == rdev2->bdev && | 2146 | (rdev->bdev == rdev2->bdev && |
2046 | rdev != rdev2 && | 2147 | rdev != rdev2 && |
2047 | overlaps(rdev->data_offset, rdev->size, | 2148 | overlaps(rdev->data_offset, rdev->size * 2, |
2048 | rdev2->data_offset, rdev2->size))) { | 2149 | rdev2->data_offset, |
2150 | rdev2->size * 2))) { | ||
2049 | overlap = 1; | 2151 | overlap = 1; |
2050 | break; | 2152 | break; |
2051 | } | 2153 | } |
@@ -2067,8 +2169,6 @@ rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len) | |||
2067 | return -EBUSY; | 2169 | return -EBUSY; |
2068 | } | 2170 | } |
2069 | } | 2171 | } |
2070 | if (size < my_mddev->size || my_mddev->size == 0) | ||
2071 | my_mddev->size = size; | ||
2072 | return len; | 2172 | return len; |
2073 | } | 2173 | } |
2074 | 2174 | ||
@@ -2512,7 +2612,7 @@ __ATTR(resync_start, S_IRUGO|S_IWUSR, resync_start_show, resync_start_store); | |||
2512 | * When written, doesn't tear down array, but just stops it | 2612 | * When written, doesn't tear down array, but just stops it |
2513 | * suspended (not supported yet) | 2613 | * suspended (not supported yet) |
2514 | * All IO requests will block. The array can be reconfigured. | 2614 | * All IO requests will block. The array can be reconfigured. |
2515 | * Writing this, if accepted, will block until array is quiessent | 2615 | * Writing this, if accepted, will block until array is quiescent |
2516 | * readonly | 2616 | * readonly |
2517 | * no resync can happen. no superblocks get written. | 2617 | * no resync can happen. no superblocks get written. |
2518 | * write requests fail | 2618 | * write requests fail |
@@ -2585,7 +2685,7 @@ array_state_show(mddev_t *mddev, char *page) | |||
2585 | return sprintf(page, "%s\n", array_states[st]); | 2685 | return sprintf(page, "%s\n", array_states[st]); |
2586 | } | 2686 | } |
2587 | 2687 | ||
2588 | static int do_md_stop(mddev_t * mddev, int ro); | 2688 | static int do_md_stop(mddev_t * mddev, int ro, int is_open); |
2589 | static int do_md_run(mddev_t * mddev); | 2689 | static int do_md_run(mddev_t * mddev); |
2590 | static int restart_array(mddev_t *mddev); | 2690 | static int restart_array(mddev_t *mddev); |
2591 | 2691 | ||
@@ -2599,16 +2699,16 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len) | |||
2599 | break; | 2699 | break; |
2600 | case clear: | 2700 | case clear: |
2601 | /* stopping an active array */ | 2701 | /* stopping an active array */ |
2602 | if (atomic_read(&mddev->active) > 1) | 2702 | if (atomic_read(&mddev->openers) > 0) |
2603 | return -EBUSY; | 2703 | return -EBUSY; |
2604 | err = do_md_stop(mddev, 0); | 2704 | err = do_md_stop(mddev, 0, 0); |
2605 | break; | 2705 | break; |
2606 | case inactive: | 2706 | case inactive: |
2607 | /* stopping an active array */ | 2707 | /* stopping an active array */ |
2608 | if (mddev->pers) { | 2708 | if (mddev->pers) { |
2609 | if (atomic_read(&mddev->active) > 1) | 2709 | if (atomic_read(&mddev->openers) > 0) |
2610 | return -EBUSY; | 2710 | return -EBUSY; |
2611 | err = do_md_stop(mddev, 2); | 2711 | err = do_md_stop(mddev, 2, 0); |
2612 | } else | 2712 | } else |
2613 | err = 0; /* already inactive */ | 2713 | err = 0; /* already inactive */ |
2614 | break; | 2714 | break; |
@@ -2616,7 +2716,7 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len) | |||
2616 | break; /* not supported yet */ | 2716 | break; /* not supported yet */ |
2617 | case readonly: | 2717 | case readonly: |
2618 | if (mddev->pers) | 2718 | if (mddev->pers) |
2619 | err = do_md_stop(mddev, 1); | 2719 | err = do_md_stop(mddev, 1, 0); |
2620 | else { | 2720 | else { |
2621 | mddev->ro = 1; | 2721 | mddev->ro = 1; |
2622 | set_disk_ro(mddev->gendisk, 1); | 2722 | set_disk_ro(mddev->gendisk, 1); |
@@ -2626,7 +2726,7 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len) | |||
2626 | case read_auto: | 2726 | case read_auto: |
2627 | if (mddev->pers) { | 2727 | if (mddev->pers) { |
2628 | if (mddev->ro != 1) | 2728 | if (mddev->ro != 1) |
2629 | err = do_md_stop(mddev, 1); | 2729 | err = do_md_stop(mddev, 1, 0); |
2630 | else | 2730 | else |
2631 | err = restart_array(mddev); | 2731 | err = restart_array(mddev); |
2632 | if (err == 0) { | 2732 | if (err == 0) { |
@@ -2681,8 +2781,10 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len) | |||
2681 | } | 2781 | } |
2682 | if (err) | 2782 | if (err) |
2683 | return err; | 2783 | return err; |
2684 | else | 2784 | else { |
2785 | sysfs_notify(&mddev->kobj, NULL, "array_state"); | ||
2685 | return len; | 2786 | return len; |
2787 | } | ||
2686 | } | 2788 | } |
2687 | static struct md_sysfs_entry md_array_state = | 2789 | static struct md_sysfs_entry md_array_state = |
2688 | __ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store); | 2790 | __ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store); |
@@ -2785,7 +2887,7 @@ size_show(mddev_t *mddev, char *page) | |||
2785 | return sprintf(page, "%llu\n", (unsigned long long)mddev->size); | 2887 | return sprintf(page, "%llu\n", (unsigned long long)mddev->size); |
2786 | } | 2888 | } |
2787 | 2889 | ||
2788 | static int update_size(mddev_t *mddev, unsigned long size); | 2890 | static int update_size(mddev_t *mddev, sector_t num_sectors); |
2789 | 2891 | ||
2790 | static ssize_t | 2892 | static ssize_t |
2791 | size_store(mddev_t *mddev, const char *buf, size_t len) | 2893 | size_store(mddev_t *mddev, const char *buf, size_t len) |
@@ -2802,7 +2904,7 @@ size_store(mddev_t *mddev, const char *buf, size_t len) | |||
2802 | return -EINVAL; | 2904 | return -EINVAL; |
2803 | 2905 | ||
2804 | if (mddev->pers) { | 2906 | if (mddev->pers) { |
2805 | err = update_size(mddev, size); | 2907 | err = update_size(mddev, size * 2); |
2806 | md_update_sb(mddev, 1); | 2908 | md_update_sb(mddev, 1); |
2807 | } else { | 2909 | } else { |
2808 | if (mddev->size == 0 || | 2910 | if (mddev->size == 0 || |
@@ -2899,7 +3001,7 @@ action_show(mddev_t *mddev, char *page) | |||
2899 | type = "check"; | 3001 | type = "check"; |
2900 | else | 3002 | else |
2901 | type = "repair"; | 3003 | type = "repair"; |
2902 | } else | 3004 | } else if (test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) |
2903 | type = "recover"; | 3005 | type = "recover"; |
2904 | } | 3006 | } |
2905 | return sprintf(page, "%s\n", type); | 3007 | return sprintf(page, "%s\n", type); |
@@ -2921,15 +3023,19 @@ action_store(mddev_t *mddev, const char *page, size_t len) | |||
2921 | } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || | 3023 | } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || |
2922 | test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) | 3024 | test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) |
2923 | return -EBUSY; | 3025 | return -EBUSY; |
2924 | else if (cmd_match(page, "resync") || cmd_match(page, "recover")) | 3026 | else if (cmd_match(page, "resync")) |
3027 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | ||
3028 | else if (cmd_match(page, "recover")) { | ||
3029 | set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); | ||
2925 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | 3030 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); |
2926 | else if (cmd_match(page, "reshape")) { | 3031 | } else if (cmd_match(page, "reshape")) { |
2927 | int err; | 3032 | int err; |
2928 | if (mddev->pers->start_reshape == NULL) | 3033 | if (mddev->pers->start_reshape == NULL) |
2929 | return -EINVAL; | 3034 | return -EINVAL; |
2930 | err = mddev->pers->start_reshape(mddev); | 3035 | err = mddev->pers->start_reshape(mddev); |
2931 | if (err) | 3036 | if (err) |
2932 | return err; | 3037 | return err; |
3038 | sysfs_notify(&mddev->kobj, NULL, "degraded"); | ||
2933 | } else { | 3039 | } else { |
2934 | if (cmd_match(page, "check")) | 3040 | if (cmd_match(page, "check")) |
2935 | set_bit(MD_RECOVERY_CHECK, &mddev->recovery); | 3041 | set_bit(MD_RECOVERY_CHECK, &mddev->recovery); |
@@ -2940,6 +3046,7 @@ action_store(mddev_t *mddev, const char *page, size_t len) | |||
2940 | } | 3046 | } |
2941 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | 3047 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); |
2942 | md_wakeup_thread(mddev->thread); | 3048 | md_wakeup_thread(mddev->thread); |
3049 | sysfs_notify(&mddev->kobj, NULL, "sync_action"); | ||
2943 | return len; | 3050 | return len; |
2944 | } | 3051 | } |
2945 | 3052 | ||
@@ -3049,11 +3156,11 @@ static ssize_t | |||
3049 | sync_speed_show(mddev_t *mddev, char *page) | 3156 | sync_speed_show(mddev_t *mddev, char *page) |
3050 | { | 3157 | { |
3051 | unsigned long resync, dt, db; | 3158 | unsigned long resync, dt, db; |
3052 | resync = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active)); | 3159 | resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active); |
3053 | dt = ((jiffies - mddev->resync_mark) / HZ); | 3160 | dt = (jiffies - mddev->resync_mark) / HZ; |
3054 | if (!dt) dt++; | 3161 | if (!dt) dt++; |
3055 | db = resync - (mddev->resync_mark_cnt); | 3162 | db = resync - mddev->resync_mark_cnt; |
3056 | return sprintf(page, "%ld\n", db/dt/2); /* K/sec */ | 3163 | return sprintf(page, "%lu\n", db/dt/2); /* K/sec */ |
3057 | } | 3164 | } |
3058 | 3165 | ||
3059 | static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed); | 3166 | static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed); |
@@ -3075,6 +3182,36 @@ sync_completed_show(mddev_t *mddev, char *page) | |||
3075 | static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed); | 3182 | static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed); |
3076 | 3183 | ||
3077 | static ssize_t | 3184 | static ssize_t |
3185 | min_sync_show(mddev_t *mddev, char *page) | ||
3186 | { | ||
3187 | return sprintf(page, "%llu\n", | ||
3188 | (unsigned long long)mddev->resync_min); | ||
3189 | } | ||
3190 | static ssize_t | ||
3191 | min_sync_store(mddev_t *mddev, const char *buf, size_t len) | ||
3192 | { | ||
3193 | unsigned long long min; | ||
3194 | if (strict_strtoull(buf, 10, &min)) | ||
3195 | return -EINVAL; | ||
3196 | if (min > mddev->resync_max) | ||
3197 | return -EINVAL; | ||
3198 | if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) | ||
3199 | return -EBUSY; | ||
3200 | |||
3201 | /* Must be a multiple of chunk_size */ | ||
3202 | if (mddev->chunk_size) { | ||
3203 | if (min & (sector_t)((mddev->chunk_size>>9)-1)) | ||
3204 | return -EINVAL; | ||
3205 | } | ||
3206 | mddev->resync_min = min; | ||
3207 | |||
3208 | return len; | ||
3209 | } | ||
3210 | |||
3211 | static struct md_sysfs_entry md_min_sync = | ||
3212 | __ATTR(sync_min, S_IRUGO|S_IWUSR, min_sync_show, min_sync_store); | ||
3213 | |||
3214 | static ssize_t | ||
3078 | max_sync_show(mddev_t *mddev, char *page) | 3215 | max_sync_show(mddev_t *mddev, char *page) |
3079 | { | 3216 | { |
3080 | if (mddev->resync_max == MaxSector) | 3217 | if (mddev->resync_max == MaxSector) |
@@ -3089,9 +3226,10 @@ max_sync_store(mddev_t *mddev, const char *buf, size_t len) | |||
3089 | if (strncmp(buf, "max", 3) == 0) | 3226 | if (strncmp(buf, "max", 3) == 0) |
3090 | mddev->resync_max = MaxSector; | 3227 | mddev->resync_max = MaxSector; |
3091 | else { | 3228 | else { |
3092 | char *ep; | 3229 | unsigned long long max; |
3093 | unsigned long long max = simple_strtoull(buf, &ep, 10); | 3230 | if (strict_strtoull(buf, 10, &max)) |
3094 | if (ep == buf || (*ep != 0 && *ep != '\n')) | 3231 | return -EINVAL; |
3232 | if (max < mddev->resync_min) | ||
3095 | return -EINVAL; | 3233 | return -EINVAL; |
3096 | if (max < mddev->resync_max && | 3234 | if (max < mddev->resync_max && |
3097 | test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) | 3235 | test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) |
@@ -3222,6 +3360,7 @@ static struct attribute *md_redundancy_attrs[] = { | |||
3222 | &md_sync_speed.attr, | 3360 | &md_sync_speed.attr, |
3223 | &md_sync_force_parallel.attr, | 3361 | &md_sync_force_parallel.attr, |
3224 | &md_sync_completed.attr, | 3362 | &md_sync_completed.attr, |
3363 | &md_min_sync.attr, | ||
3225 | &md_max_sync.attr, | 3364 | &md_max_sync.attr, |
3226 | &md_suspend_lo.attr, | 3365 | &md_suspend_lo.attr, |
3227 | &md_suspend_hi.attr, | 3366 | &md_suspend_hi.attr, |
@@ -3326,9 +3465,9 @@ static struct kobject *md_probe(dev_t dev, int *part, void *data) | |||
3326 | disk->queue = mddev->queue; | 3465 | disk->queue = mddev->queue; |
3327 | add_disk(disk); | 3466 | add_disk(disk); |
3328 | mddev->gendisk = disk; | 3467 | mddev->gendisk = disk; |
3329 | mutex_unlock(&disks_mutex); | ||
3330 | error = kobject_init_and_add(&mddev->kobj, &md_ktype, &disk->dev.kobj, | 3468 | error = kobject_init_and_add(&mddev->kobj, &md_ktype, &disk->dev.kobj, |
3331 | "%s", "md"); | 3469 | "%s", "md"); |
3470 | mutex_unlock(&disks_mutex); | ||
3332 | if (error) | 3471 | if (error) |
3333 | printk(KERN_WARNING "md: cannot register %s/md - name in use\n", | 3472 | printk(KERN_WARNING "md: cannot register %s/md - name in use\n", |
3334 | disk->disk_name); | 3473 | disk->disk_name); |
@@ -3341,7 +3480,11 @@ static void md_safemode_timeout(unsigned long data) | |||
3341 | { | 3480 | { |
3342 | mddev_t *mddev = (mddev_t *) data; | 3481 | mddev_t *mddev = (mddev_t *) data; |
3343 | 3482 | ||
3344 | mddev->safemode = 1; | 3483 | if (!atomic_read(&mddev->writes_pending)) { |
3484 | mddev->safemode = 1; | ||
3485 | if (mddev->external) | ||
3486 | sysfs_notify(&mddev->kobj, NULL, "array_state"); | ||
3487 | } | ||
3345 | md_wakeup_thread(mddev->thread); | 3488 | md_wakeup_thread(mddev->thread); |
3346 | } | 3489 | } |
3347 | 3490 | ||
@@ -3432,22 +3575,23 @@ static int do_md_run(mddev_t * mddev) | |||
3432 | * We don't want the data to overlap the metadata, | 3575 | * We don't want the data to overlap the metadata, |
3433 | * Internal Bitmap issues has handled elsewhere. | 3576 | * Internal Bitmap issues has handled elsewhere. |
3434 | */ | 3577 | */ |
3435 | if (rdev->data_offset < rdev->sb_offset) { | 3578 | if (rdev->data_offset < rdev->sb_start) { |
3436 | if (mddev->size && | 3579 | if (mddev->size && |
3437 | rdev->data_offset + mddev->size*2 | 3580 | rdev->data_offset + mddev->size*2 |
3438 | > rdev->sb_offset*2) { | 3581 | > rdev->sb_start) { |
3439 | printk("md: %s: data overlaps metadata\n", | 3582 | printk("md: %s: data overlaps metadata\n", |
3440 | mdname(mddev)); | 3583 | mdname(mddev)); |
3441 | return -EINVAL; | 3584 | return -EINVAL; |
3442 | } | 3585 | } |
3443 | } else { | 3586 | } else { |
3444 | if (rdev->sb_offset*2 + rdev->sb_size/512 | 3587 | if (rdev->sb_start + rdev->sb_size/512 |
3445 | > rdev->data_offset) { | 3588 | > rdev->data_offset) { |
3446 | printk("md: %s: metadata overlaps data\n", | 3589 | printk("md: %s: metadata overlaps data\n", |
3447 | mdname(mddev)); | 3590 | mdname(mddev)); |
3448 | return -EINVAL; | 3591 | return -EINVAL; |
3449 | } | 3592 | } |
3450 | } | 3593 | } |
3594 | sysfs_notify(&rdev->kobj, NULL, "state"); | ||
3451 | } | 3595 | } |
3452 | 3596 | ||
3453 | md_probe(mddev->unit, NULL, NULL); | 3597 | md_probe(mddev->unit, NULL, NULL); |
@@ -3519,7 +3663,9 @@ static int do_md_run(mddev_t * mddev) | |||
3519 | mddev->ro = 2; /* read-only, but switch on first write */ | 3663 | mddev->ro = 2; /* read-only, but switch on first write */ |
3520 | 3664 | ||
3521 | err = mddev->pers->run(mddev); | 3665 | err = mddev->pers->run(mddev); |
3522 | if (!err && mddev->pers->sync_request) { | 3666 | if (err) |
3667 | printk(KERN_ERR "md: pers->run() failed ...\n"); | ||
3668 | else if (mddev->pers->sync_request) { | ||
3523 | err = bitmap_create(mddev); | 3669 | err = bitmap_create(mddev); |
3524 | if (err) { | 3670 | if (err) { |
3525 | printk(KERN_ERR "%s: failed to create bitmap (%d)\n", | 3671 | printk(KERN_ERR "%s: failed to create bitmap (%d)\n", |
@@ -3528,7 +3674,6 @@ static int do_md_run(mddev_t * mddev) | |||
3528 | } | 3674 | } |
3529 | } | 3675 | } |
3530 | if (err) { | 3676 | if (err) { |
3531 | printk(KERN_ERR "md: pers->run() failed ...\n"); | ||
3532 | module_put(mddev->pers->owner); | 3677 | module_put(mddev->pers->owner); |
3533 | mddev->pers = NULL; | 3678 | mddev->pers = NULL; |
3534 | bitmap_destroy(mddev); | 3679 | bitmap_destroy(mddev); |
@@ -3563,7 +3708,7 @@ static int do_md_run(mddev_t * mddev) | |||
3563 | if (mddev->flags) | 3708 | if (mddev->flags) |
3564 | md_update_sb(mddev, 0); | 3709 | md_update_sb(mddev, 0); |
3565 | 3710 | ||
3566 | set_capacity(disk, mddev->array_size<<1); | 3711 | set_capacity(disk, mddev->array_sectors); |
3567 | 3712 | ||
3568 | /* If we call blk_queue_make_request here, it will | 3713 | /* If we call blk_queue_make_request here, it will |
3569 | * re-initialise max_sectors etc which may have been | 3714 | * re-initialise max_sectors etc which may have been |
@@ -3608,6 +3753,9 @@ static int do_md_run(mddev_t * mddev) | |||
3608 | 3753 | ||
3609 | mddev->changed = 1; | 3754 | mddev->changed = 1; |
3610 | md_new_event(mddev); | 3755 | md_new_event(mddev); |
3756 | sysfs_notify(&mddev->kobj, NULL, "array_state"); | ||
3757 | sysfs_notify(&mddev->kobj, NULL, "sync_action"); | ||
3758 | sysfs_notify(&mddev->kobj, NULL, "degraded"); | ||
3611 | kobject_uevent(&mddev->gendisk->dev.kobj, KOBJ_CHANGE); | 3759 | kobject_uevent(&mddev->gendisk->dev.kobj, KOBJ_CHANGE); |
3612 | return 0; | 3760 | return 0; |
3613 | } | 3761 | } |
@@ -3615,38 +3763,25 @@ static int do_md_run(mddev_t * mddev) | |||
3615 | static int restart_array(mddev_t *mddev) | 3763 | static int restart_array(mddev_t *mddev) |
3616 | { | 3764 | { |
3617 | struct gendisk *disk = mddev->gendisk; | 3765 | struct gendisk *disk = mddev->gendisk; |
3618 | int err; | ||
3619 | 3766 | ||
3620 | /* | 3767 | /* Complain if it has no devices */ |
3621 | * Complain if it has no devices | ||
3622 | */ | ||
3623 | err = -ENXIO; | ||
3624 | if (list_empty(&mddev->disks)) | 3768 | if (list_empty(&mddev->disks)) |
3625 | goto out; | 3769 | return -ENXIO; |
3626 | 3770 | if (!mddev->pers) | |
3627 | if (mddev->pers) { | 3771 | return -EINVAL; |
3628 | err = -EBUSY; | 3772 | if (!mddev->ro) |
3629 | if (!mddev->ro) | 3773 | return -EBUSY; |
3630 | goto out; | 3774 | mddev->safemode = 0; |
3631 | 3775 | mddev->ro = 0; | |
3632 | mddev->safemode = 0; | 3776 | set_disk_ro(disk, 0); |
3633 | mddev->ro = 0; | 3777 | printk(KERN_INFO "md: %s switched to read-write mode.\n", |
3634 | set_disk_ro(disk, 0); | 3778 | mdname(mddev)); |
3635 | 3779 | /* Kick recovery or resync if necessary */ | |
3636 | printk(KERN_INFO "md: %s switched to read-write mode.\n", | 3780 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); |
3637 | mdname(mddev)); | 3781 | md_wakeup_thread(mddev->thread); |
3638 | /* | 3782 | md_wakeup_thread(mddev->sync_thread); |
3639 | * Kick recovery or resync if necessary | 3783 | sysfs_notify(&mddev->kobj, NULL, "array_state"); |
3640 | */ | 3784 | return 0; |
3641 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | ||
3642 | md_wakeup_thread(mddev->thread); | ||
3643 | md_wakeup_thread(mddev->sync_thread); | ||
3644 | err = 0; | ||
3645 | } else | ||
3646 | err = -EINVAL; | ||
3647 | |||
3648 | out: | ||
3649 | return err; | ||
3650 | } | 3785 | } |
3651 | 3786 | ||
3652 | /* similar to deny_write_access, but accounts for our holding a reference | 3787 | /* similar to deny_write_access, but accounts for our holding a reference |
@@ -3680,16 +3815,17 @@ static void restore_bitmap_write_access(struct file *file) | |||
3680 | * 1 - switch to readonly | 3815 | * 1 - switch to readonly |
3681 | * 2 - stop but do not disassemble array | 3816 | * 2 - stop but do not disassemble array |
3682 | */ | 3817 | */ |
3683 | static int do_md_stop(mddev_t * mddev, int mode) | 3818 | static int do_md_stop(mddev_t * mddev, int mode, int is_open) |
3684 | { | 3819 | { |
3685 | int err = 0; | 3820 | int err = 0; |
3686 | struct gendisk *disk = mddev->gendisk; | 3821 | struct gendisk *disk = mddev->gendisk; |
3687 | 3822 | ||
3823 | if (atomic_read(&mddev->openers) > is_open) { | ||
3824 | printk("md: %s still in use.\n",mdname(mddev)); | ||
3825 | return -EBUSY; | ||
3826 | } | ||
3827 | |||
3688 | if (mddev->pers) { | 3828 | if (mddev->pers) { |
3689 | if (atomic_read(&mddev->active)>2) { | ||
3690 | printk("md: %s still in use.\n",mdname(mddev)); | ||
3691 | return -EBUSY; | ||
3692 | } | ||
3693 | 3829 | ||
3694 | if (mddev->sync_thread) { | 3830 | if (mddev->sync_thread) { |
3695 | set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); | 3831 | set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); |
@@ -3773,10 +3909,11 @@ static int do_md_stop(mddev_t * mddev, int mode) | |||
3773 | 3909 | ||
3774 | export_array(mddev); | 3910 | export_array(mddev); |
3775 | 3911 | ||
3776 | mddev->array_size = 0; | 3912 | mddev->array_sectors = 0; |
3777 | mddev->size = 0; | 3913 | mddev->size = 0; |
3778 | mddev->raid_disks = 0; | 3914 | mddev->raid_disks = 0; |
3779 | mddev->recovery_cp = 0; | 3915 | mddev->recovery_cp = 0; |
3916 | mddev->resync_min = 0; | ||
3780 | mddev->resync_max = MaxSector; | 3917 | mddev->resync_max = MaxSector; |
3781 | mddev->reshape_position = MaxSector; | 3918 | mddev->reshape_position = MaxSector; |
3782 | mddev->external = 0; | 3919 | mddev->external = 0; |
@@ -3811,6 +3948,7 @@ static int do_md_stop(mddev_t * mddev, int mode) | |||
3811 | mdname(mddev)); | 3948 | mdname(mddev)); |
3812 | err = 0; | 3949 | err = 0; |
3813 | md_new_event(mddev); | 3950 | md_new_event(mddev); |
3951 | sysfs_notify(&mddev->kobj, NULL, "array_state"); | ||
3814 | out: | 3952 | out: |
3815 | return err; | 3953 | return err; |
3816 | } | 3954 | } |
@@ -3836,7 +3974,7 @@ static void autorun_array(mddev_t *mddev) | |||
3836 | err = do_md_run (mddev); | 3974 | err = do_md_run (mddev); |
3837 | if (err) { | 3975 | if (err) { |
3838 | printk(KERN_WARNING "md: do_md_run() returned %d\n", err); | 3976 | printk(KERN_WARNING "md: do_md_run() returned %d\n", err); |
3839 | do_md_stop (mddev, 0); | 3977 | do_md_stop (mddev, 0, 0); |
3840 | } | 3978 | } |
3841 | } | 3979 | } |
3842 | 3980 | ||
@@ -3927,8 +4065,10 @@ static void autorun_devices(int part) | |||
3927 | /* on success, candidates will be empty, on error | 4065 | /* on success, candidates will be empty, on error |
3928 | * it won't... | 4066 | * it won't... |
3929 | */ | 4067 | */ |
3930 | rdev_for_each_list(rdev, tmp, candidates) | 4068 | rdev_for_each_list(rdev, tmp, candidates) { |
4069 | list_del_init(&rdev->same_set); | ||
3931 | export_rdev(rdev); | 4070 | export_rdev(rdev); |
4071 | } | ||
3932 | mddev_put(mddev); | 4072 | mddev_put(mddev); |
3933 | } | 4073 | } |
3934 | printk(KERN_INFO "md: ... autorun DONE.\n"); | 4074 | printk(KERN_INFO "md: ... autorun DONE.\n"); |
@@ -4009,9 +4149,11 @@ static int get_bitmap_file(mddev_t * mddev, void __user * arg) | |||
4009 | char *ptr, *buf = NULL; | 4149 | char *ptr, *buf = NULL; |
4010 | int err = -ENOMEM; | 4150 | int err = -ENOMEM; |
4011 | 4151 | ||
4012 | md_allow_write(mddev); | 4152 | if (md_allow_write(mddev)) |
4153 | file = kmalloc(sizeof(*file), GFP_NOIO); | ||
4154 | else | ||
4155 | file = kmalloc(sizeof(*file), GFP_KERNEL); | ||
4013 | 4156 | ||
4014 | file = kmalloc(sizeof(*file), GFP_KERNEL); | ||
4015 | if (!file) | 4157 | if (!file) |
4016 | goto out; | 4158 | goto out; |
4017 | 4159 | ||
@@ -4044,15 +4186,12 @@ out: | |||
4044 | static int get_disk_info(mddev_t * mddev, void __user * arg) | 4186 | static int get_disk_info(mddev_t * mddev, void __user * arg) |
4045 | { | 4187 | { |
4046 | mdu_disk_info_t info; | 4188 | mdu_disk_info_t info; |
4047 | unsigned int nr; | ||
4048 | mdk_rdev_t *rdev; | 4189 | mdk_rdev_t *rdev; |
4049 | 4190 | ||
4050 | if (copy_from_user(&info, arg, sizeof(info))) | 4191 | if (copy_from_user(&info, arg, sizeof(info))) |
4051 | return -EFAULT; | 4192 | return -EFAULT; |
4052 | 4193 | ||
4053 | nr = info.number; | 4194 | rdev = find_rdev_nr(mddev, info.number); |
4054 | |||
4055 | rdev = find_rdev_nr(mddev, nr); | ||
4056 | if (rdev) { | 4195 | if (rdev) { |
4057 | info.major = MAJOR(rdev->bdev->bd_dev); | 4196 | info.major = MAJOR(rdev->bdev->bd_dev); |
4058 | info.minor = MINOR(rdev->bdev->bd_dev); | 4197 | info.minor = MINOR(rdev->bdev->bd_dev); |
@@ -4172,8 +4311,12 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) | |||
4172 | } | 4311 | } |
4173 | if (err) | 4312 | if (err) |
4174 | export_rdev(rdev); | 4313 | export_rdev(rdev); |
4314 | else | ||
4315 | sysfs_notify(&rdev->kobj, NULL, "state"); | ||
4175 | 4316 | ||
4176 | md_update_sb(mddev, 1); | 4317 | md_update_sb(mddev, 1); |
4318 | if (mddev->degraded) | ||
4319 | set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); | ||
4177 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | 4320 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); |
4178 | md_wakeup_thread(mddev->thread); | 4321 | md_wakeup_thread(mddev->thread); |
4179 | return err; | 4322 | return err; |
@@ -4212,10 +4355,10 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) | |||
4212 | 4355 | ||
4213 | if (!mddev->persistent) { | 4356 | if (!mddev->persistent) { |
4214 | printk(KERN_INFO "md: nonpersistent superblock ...\n"); | 4357 | printk(KERN_INFO "md: nonpersistent superblock ...\n"); |
4215 | rdev->sb_offset = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; | 4358 | rdev->sb_start = rdev->bdev->bd_inode->i_size / 512; |
4216 | } else | 4359 | } else |
4217 | rdev->sb_offset = calc_dev_sboffset(rdev->bdev); | 4360 | rdev->sb_start = calc_dev_sboffset(rdev->bdev); |
4218 | rdev->size = calc_dev_size(rdev, mddev->chunk_size); | 4361 | rdev->size = calc_num_sectors(rdev, mddev->chunk_size) / 2; |
4219 | 4362 | ||
4220 | err = bind_rdev_to_array(rdev, mddev); | 4363 | err = bind_rdev_to_array(rdev, mddev); |
4221 | if (err) { | 4364 | if (err) { |
@@ -4232,9 +4375,6 @@ static int hot_remove_disk(mddev_t * mddev, dev_t dev) | |||
4232 | char b[BDEVNAME_SIZE]; | 4375 | char b[BDEVNAME_SIZE]; |
4233 | mdk_rdev_t *rdev; | 4376 | mdk_rdev_t *rdev; |
4234 | 4377 | ||
4235 | if (!mddev->pers) | ||
4236 | return -ENODEV; | ||
4237 | |||
4238 | rdev = find_rdev(mddev, dev); | 4378 | rdev = find_rdev(mddev, dev); |
4239 | if (!rdev) | 4379 | if (!rdev) |
4240 | return -ENXIO; | 4380 | return -ENXIO; |
@@ -4257,7 +4397,6 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev) | |||
4257 | { | 4397 | { |
4258 | char b[BDEVNAME_SIZE]; | 4398 | char b[BDEVNAME_SIZE]; |
4259 | int err; | 4399 | int err; |
4260 | unsigned int size; | ||
4261 | mdk_rdev_t *rdev; | 4400 | mdk_rdev_t *rdev; |
4262 | 4401 | ||
4263 | if (!mddev->pers) | 4402 | if (!mddev->pers) |
@@ -4285,13 +4424,11 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev) | |||
4285 | } | 4424 | } |
4286 | 4425 | ||
4287 | if (mddev->persistent) | 4426 | if (mddev->persistent) |
4288 | rdev->sb_offset = calc_dev_sboffset(rdev->bdev); | 4427 | rdev->sb_start = calc_dev_sboffset(rdev->bdev); |
4289 | else | 4428 | else |
4290 | rdev->sb_offset = | 4429 | rdev->sb_start = rdev->bdev->bd_inode->i_size / 512; |
4291 | rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; | ||
4292 | 4430 | ||
4293 | size = calc_dev_size(rdev, mddev->chunk_size); | 4431 | rdev->size = calc_num_sectors(rdev, mddev->chunk_size) / 2; |
4294 | rdev->size = size; | ||
4295 | 4432 | ||
4296 | if (test_bit(Faulty, &rdev->flags)) { | 4433 | if (test_bit(Faulty, &rdev->flags)) { |
4297 | printk(KERN_WARNING | 4434 | printk(KERN_WARNING |
@@ -4476,24 +4613,24 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info) | |||
4476 | return 0; | 4613 | return 0; |
4477 | } | 4614 | } |
4478 | 4615 | ||
4479 | static int update_size(mddev_t *mddev, unsigned long size) | 4616 | static int update_size(mddev_t *mddev, sector_t num_sectors) |
4480 | { | 4617 | { |
4481 | mdk_rdev_t * rdev; | 4618 | mdk_rdev_t * rdev; |
4482 | int rv; | 4619 | int rv; |
4483 | struct list_head *tmp; | 4620 | struct list_head *tmp; |
4484 | int fit = (size == 0); | 4621 | int fit = (num_sectors == 0); |
4485 | 4622 | ||
4486 | if (mddev->pers->resize == NULL) | 4623 | if (mddev->pers->resize == NULL) |
4487 | return -EINVAL; | 4624 | return -EINVAL; |
4488 | /* The "size" is the amount of each device that is used. | 4625 | /* The "num_sectors" is the number of sectors of each device that |
4489 | * This can only make sense for arrays with redundancy. | 4626 | * is used. This can only make sense for arrays with redundancy. |
4490 | * linear and raid0 always use whatever space is available | 4627 | * linear and raid0 always use whatever space is available. We can only |
4491 | * We can only consider changing the size if no resync | 4628 | * consider changing this number if no resync or reconstruction is |
4492 | * or reconstruction is happening, and if the new size | 4629 | * happening, and if the new size is acceptable. It must fit before the |
4493 | * is acceptable. It must fit before the sb_offset or, | 4630 | * sb_start or, if that is <data_offset, it must fit before the size |
4494 | * if that is <data_offset, it must fit before the | 4631 | * of each device. If num_sectors is zero, we find the largest size |
4495 | * size of each device. | 4632 | * that fits. |
4496 | * If size is zero, we find the largest size that fits. | 4633 | |
4497 | */ | 4634 | */ |
4498 | if (mddev->sync_thread) | 4635 | if (mddev->sync_thread) |
4499 | return -EBUSY; | 4636 | return -EBUSY; |
@@ -4501,19 +4638,20 @@ static int update_size(mddev_t *mddev, unsigned long size) | |||
4501 | sector_t avail; | 4638 | sector_t avail; |
4502 | avail = rdev->size * 2; | 4639 | avail = rdev->size * 2; |
4503 | 4640 | ||
4504 | if (fit && (size == 0 || size > avail/2)) | 4641 | if (fit && (num_sectors == 0 || num_sectors > avail)) |
4505 | size = avail/2; | 4642 | num_sectors = avail; |
4506 | if (avail < ((sector_t)size << 1)) | 4643 | if (avail < num_sectors) |
4507 | return -ENOSPC; | 4644 | return -ENOSPC; |
4508 | } | 4645 | } |
4509 | rv = mddev->pers->resize(mddev, (sector_t)size *2); | 4646 | rv = mddev->pers->resize(mddev, num_sectors); |
4510 | if (!rv) { | 4647 | if (!rv) { |
4511 | struct block_device *bdev; | 4648 | struct block_device *bdev; |
4512 | 4649 | ||
4513 | bdev = bdget_disk(mddev->gendisk, 0); | 4650 | bdev = bdget_disk(mddev->gendisk, 0); |
4514 | if (bdev) { | 4651 | if (bdev) { |
4515 | mutex_lock(&bdev->bd_inode->i_mutex); | 4652 | mutex_lock(&bdev->bd_inode->i_mutex); |
4516 | i_size_write(bdev->bd_inode, (loff_t)mddev->array_size << 10); | 4653 | i_size_write(bdev->bd_inode, |
4654 | (loff_t)mddev->array_sectors << 9); | ||
4517 | mutex_unlock(&bdev->bd_inode->i_mutex); | 4655 | mutex_unlock(&bdev->bd_inode->i_mutex); |
4518 | bdput(bdev); | 4656 | bdput(bdev); |
4519 | } | 4657 | } |
@@ -4588,7 +4726,7 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info) | |||
4588 | return mddev->pers->reconfig(mddev, info->layout, -1); | 4726 | return mddev->pers->reconfig(mddev, info->layout, -1); |
4589 | } | 4727 | } |
4590 | if (info->size >= 0 && mddev->size != info->size) | 4728 | if (info->size >= 0 && mddev->size != info->size) |
4591 | rv = update_size(mddev, info->size); | 4729 | rv = update_size(mddev, (sector_t)info->size * 2); |
4592 | 4730 | ||
4593 | if (mddev->raid_disks != info->raid_disks) | 4731 | if (mddev->raid_disks != info->raid_disks) |
4594 | rv = update_raid_disks(mddev, info->raid_disks); | 4732 | rv = update_raid_disks(mddev, info->raid_disks); |
@@ -4641,6 +4779,12 @@ static int set_disk_faulty(mddev_t *mddev, dev_t dev) | |||
4641 | return 0; | 4779 | return 0; |
4642 | } | 4780 | } |
4643 | 4781 | ||
4782 | /* | ||
4783 | * We have a problem here : there is no easy way to give a CHS | ||
4784 | * virtual geometry. We currently pretend that we have a 2 heads | ||
4785 | * 4 sectors (with a BIG number of cylinders...). This drives | ||
4786 | * dosfs just mad... ;-) | ||
4787 | */ | ||
4644 | static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo) | 4788 | static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo) |
4645 | { | 4789 | { |
4646 | mddev_t *mddev = bdev->bd_disk->private_data; | 4790 | mddev_t *mddev = bdev->bd_disk->private_data; |
@@ -4785,19 +4929,13 @@ static int md_ioctl(struct inode *inode, struct file *file, | |||
4785 | goto done_unlock; | 4929 | goto done_unlock; |
4786 | 4930 | ||
4787 | case STOP_ARRAY: | 4931 | case STOP_ARRAY: |
4788 | err = do_md_stop (mddev, 0); | 4932 | err = do_md_stop (mddev, 0, 1); |
4789 | goto done_unlock; | 4933 | goto done_unlock; |
4790 | 4934 | ||
4791 | case STOP_ARRAY_RO: | 4935 | case STOP_ARRAY_RO: |
4792 | err = do_md_stop (mddev, 1); | 4936 | err = do_md_stop (mddev, 1, 1); |
4793 | goto done_unlock; | 4937 | goto done_unlock; |
4794 | 4938 | ||
4795 | /* | ||
4796 | * We have a problem here : there is no easy way to give a CHS | ||
4797 | * virtual geometry. We currently pretend that we have a 2 heads | ||
4798 | * 4 sectors (with a BIG number of cylinders...). This drives | ||
4799 | * dosfs just mad... ;-) | ||
4800 | */ | ||
4801 | } | 4939 | } |
4802 | 4940 | ||
4803 | /* | 4941 | /* |
@@ -4807,13 +4945,12 @@ static int md_ioctl(struct inode *inode, struct file *file, | |||
4807 | * here and hit the 'default' below, so only disallow | 4945 | * here and hit the 'default' below, so only disallow |
4808 | * 'md' ioctls, and switch to rw mode if started auto-readonly. | 4946 | * 'md' ioctls, and switch to rw mode if started auto-readonly. |
4809 | */ | 4947 | */ |
4810 | if (_IOC_TYPE(cmd) == MD_MAJOR && | 4948 | if (_IOC_TYPE(cmd) == MD_MAJOR && mddev->ro && mddev->pers) { |
4811 | mddev->ro && mddev->pers) { | ||
4812 | if (mddev->ro == 2) { | 4949 | if (mddev->ro == 2) { |
4813 | mddev->ro = 0; | 4950 | mddev->ro = 0; |
4814 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | 4951 | sysfs_notify(&mddev->kobj, NULL, "array_state"); |
4815 | md_wakeup_thread(mddev->thread); | 4952 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); |
4816 | 4953 | md_wakeup_thread(mddev->thread); | |
4817 | } else { | 4954 | } else { |
4818 | err = -EROFS; | 4955 | err = -EROFS; |
4819 | goto abort_unlock; | 4956 | goto abort_unlock; |
@@ -4883,6 +5020,7 @@ static int md_open(struct inode *inode, struct file *file) | |||
4883 | 5020 | ||
4884 | err = 0; | 5021 | err = 0; |
4885 | mddev_get(mddev); | 5022 | mddev_get(mddev); |
5023 | atomic_inc(&mddev->openers); | ||
4886 | mddev_unlock(mddev); | 5024 | mddev_unlock(mddev); |
4887 | 5025 | ||
4888 | check_disk_change(inode->i_bdev); | 5026 | check_disk_change(inode->i_bdev); |
@@ -4895,6 +5033,7 @@ static int md_release(struct inode *inode, struct file * file) | |||
4895 | mddev_t *mddev = inode->i_bdev->bd_disk->private_data; | 5033 | mddev_t *mddev = inode->i_bdev->bd_disk->private_data; |
4896 | 5034 | ||
4897 | BUG_ON(!mddev); | 5035 | BUG_ON(!mddev); |
5036 | atomic_dec(&mddev->openers); | ||
4898 | mddev_put(mddev); | 5037 | mddev_put(mddev); |
4899 | 5038 | ||
4900 | return 0; | 5039 | return 0; |
@@ -5029,6 +5168,9 @@ void md_error(mddev_t *mddev, mdk_rdev_t *rdev) | |||
5029 | if (!mddev->pers->error_handler) | 5168 | if (!mddev->pers->error_handler) |
5030 | return; | 5169 | return; |
5031 | mddev->pers->error_handler(mddev,rdev); | 5170 | mddev->pers->error_handler(mddev,rdev); |
5171 | if (mddev->degraded) | ||
5172 | set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); | ||
5173 | set_bit(StateChanged, &rdev->flags); | ||
5032 | set_bit(MD_RECOVERY_INTR, &mddev->recovery); | 5174 | set_bit(MD_RECOVERY_INTR, &mddev->recovery); |
5033 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | 5175 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); |
5034 | md_wakeup_thread(mddev->thread); | 5176 | md_wakeup_thread(mddev->thread); |
@@ -5258,10 +5400,11 @@ static int md_seq_show(struct seq_file *seq, void *v) | |||
5258 | if (!list_empty(&mddev->disks)) { | 5400 | if (!list_empty(&mddev->disks)) { |
5259 | if (mddev->pers) | 5401 | if (mddev->pers) |
5260 | seq_printf(seq, "\n %llu blocks", | 5402 | seq_printf(seq, "\n %llu blocks", |
5261 | (unsigned long long)mddev->array_size); | 5403 | (unsigned long long) |
5404 | mddev->array_sectors / 2); | ||
5262 | else | 5405 | else |
5263 | seq_printf(seq, "\n %llu blocks", | 5406 | seq_printf(seq, "\n %llu blocks", |
5264 | (unsigned long long)size); | 5407 | (unsigned long long)size); |
5265 | } | 5408 | } |
5266 | if (mddev->persistent) { | 5409 | if (mddev->persistent) { |
5267 | if (mddev->major_version != 0 || | 5410 | if (mddev->major_version != 0 || |
@@ -5391,12 +5534,12 @@ int unregister_md_personality(struct mdk_personality *p) | |||
5391 | static int is_mddev_idle(mddev_t *mddev) | 5534 | static int is_mddev_idle(mddev_t *mddev) |
5392 | { | 5535 | { |
5393 | mdk_rdev_t * rdev; | 5536 | mdk_rdev_t * rdev; |
5394 | struct list_head *tmp; | ||
5395 | int idle; | 5537 | int idle; |
5396 | long curr_events; | 5538 | long curr_events; |
5397 | 5539 | ||
5398 | idle = 1; | 5540 | idle = 1; |
5399 | rdev_for_each(rdev, tmp, mddev) { | 5541 | rcu_read_lock(); |
5542 | rdev_for_each_rcu(rdev, mddev) { | ||
5400 | struct gendisk *disk = rdev->bdev->bd_contains->bd_disk; | 5543 | struct gendisk *disk = rdev->bdev->bd_contains->bd_disk; |
5401 | curr_events = disk_stat_read(disk, sectors[0]) + | 5544 | curr_events = disk_stat_read(disk, sectors[0]) + |
5402 | disk_stat_read(disk, sectors[1]) - | 5545 | disk_stat_read(disk, sectors[1]) - |
@@ -5428,6 +5571,7 @@ static int is_mddev_idle(mddev_t *mddev) | |||
5428 | idle = 0; | 5571 | idle = 0; |
5429 | } | 5572 | } |
5430 | } | 5573 | } |
5574 | rcu_read_unlock(); | ||
5431 | return idle; | 5575 | return idle; |
5432 | } | 5576 | } |
5433 | 5577 | ||
@@ -5451,6 +5595,7 @@ void md_done_sync(mddev_t *mddev, int blocks, int ok) | |||
5451 | */ | 5595 | */ |
5452 | void md_write_start(mddev_t *mddev, struct bio *bi) | 5596 | void md_write_start(mddev_t *mddev, struct bio *bi) |
5453 | { | 5597 | { |
5598 | int did_change = 0; | ||
5454 | if (bio_data_dir(bi) != WRITE) | 5599 | if (bio_data_dir(bi) != WRITE) |
5455 | return; | 5600 | return; |
5456 | 5601 | ||
@@ -5461,6 +5606,7 @@ void md_write_start(mddev_t *mddev, struct bio *bi) | |||
5461 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | 5606 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); |
5462 | md_wakeup_thread(mddev->thread); | 5607 | md_wakeup_thread(mddev->thread); |
5463 | md_wakeup_thread(mddev->sync_thread); | 5608 | md_wakeup_thread(mddev->sync_thread); |
5609 | did_change = 1; | ||
5464 | } | 5610 | } |
5465 | atomic_inc(&mddev->writes_pending); | 5611 | atomic_inc(&mddev->writes_pending); |
5466 | if (mddev->safemode == 1) | 5612 | if (mddev->safemode == 1) |
@@ -5471,10 +5617,12 @@ void md_write_start(mddev_t *mddev, struct bio *bi) | |||
5471 | mddev->in_sync = 0; | 5617 | mddev->in_sync = 0; |
5472 | set_bit(MD_CHANGE_CLEAN, &mddev->flags); | 5618 | set_bit(MD_CHANGE_CLEAN, &mddev->flags); |
5473 | md_wakeup_thread(mddev->thread); | 5619 | md_wakeup_thread(mddev->thread); |
5620 | did_change = 1; | ||
5474 | } | 5621 | } |
5475 | spin_unlock_irq(&mddev->write_lock); | 5622 | spin_unlock_irq(&mddev->write_lock); |
5476 | sysfs_notify(&mddev->kobj, NULL, "array_state"); | ||
5477 | } | 5623 | } |
5624 | if (did_change) | ||
5625 | sysfs_notify(&mddev->kobj, NULL, "array_state"); | ||
5478 | wait_event(mddev->sb_wait, | 5626 | wait_event(mddev->sb_wait, |
5479 | !test_bit(MD_CHANGE_CLEAN, &mddev->flags) && | 5627 | !test_bit(MD_CHANGE_CLEAN, &mddev->flags) && |
5480 | !test_bit(MD_CHANGE_PENDING, &mddev->flags)); | 5628 | !test_bit(MD_CHANGE_PENDING, &mddev->flags)); |
@@ -5495,13 +5643,18 @@ void md_write_end(mddev_t *mddev) | |||
5495 | * may proceed without blocking. It is important to call this before | 5643 | * may proceed without blocking. It is important to call this before |
5496 | * attempting a GFP_KERNEL allocation while holding the mddev lock. | 5644 | * attempting a GFP_KERNEL allocation while holding the mddev lock. |
5497 | * Must be called with mddev_lock held. | 5645 | * Must be called with mddev_lock held. |
5646 | * | ||
5647 | * In the ->external case MD_CHANGE_CLEAN can not be cleared until mddev->lock | ||
5648 | * is dropped, so return -EAGAIN after notifying userspace. | ||
5498 | */ | 5649 | */ |
5499 | void md_allow_write(mddev_t *mddev) | 5650 | int md_allow_write(mddev_t *mddev) |
5500 | { | 5651 | { |
5501 | if (!mddev->pers) | 5652 | if (!mddev->pers) |
5502 | return; | 5653 | return 0; |
5503 | if (mddev->ro) | 5654 | if (mddev->ro) |
5504 | return; | 5655 | return 0; |
5656 | if (!mddev->pers->sync_request) | ||
5657 | return 0; | ||
5505 | 5658 | ||
5506 | spin_lock_irq(&mddev->write_lock); | 5659 | spin_lock_irq(&mddev->write_lock); |
5507 | if (mddev->in_sync) { | 5660 | if (mddev->in_sync) { |
@@ -5512,14 +5665,14 @@ void md_allow_write(mddev_t *mddev) | |||
5512 | mddev->safemode = 1; | 5665 | mddev->safemode = 1; |
5513 | spin_unlock_irq(&mddev->write_lock); | 5666 | spin_unlock_irq(&mddev->write_lock); |
5514 | md_update_sb(mddev, 0); | 5667 | md_update_sb(mddev, 0); |
5515 | |||
5516 | sysfs_notify(&mddev->kobj, NULL, "array_state"); | 5668 | sysfs_notify(&mddev->kobj, NULL, "array_state"); |
5517 | /* wait for the dirty state to be recorded in the metadata */ | ||
5518 | wait_event(mddev->sb_wait, | ||
5519 | !test_bit(MD_CHANGE_CLEAN, &mddev->flags) && | ||
5520 | !test_bit(MD_CHANGE_PENDING, &mddev->flags)); | ||
5521 | } else | 5669 | } else |
5522 | spin_unlock_irq(&mddev->write_lock); | 5670 | spin_unlock_irq(&mddev->write_lock); |
5671 | |||
5672 | if (test_bit(MD_CHANGE_CLEAN, &mddev->flags)) | ||
5673 | return -EAGAIN; | ||
5674 | else | ||
5675 | return 0; | ||
5523 | } | 5676 | } |
5524 | EXPORT_SYMBOL_GPL(md_allow_write); | 5677 | EXPORT_SYMBOL_GPL(md_allow_write); |
5525 | 5678 | ||
@@ -5625,9 +5778,11 @@ void md_do_sync(mddev_t *mddev) | |||
5625 | max_sectors = mddev->resync_max_sectors; | 5778 | max_sectors = mddev->resync_max_sectors; |
5626 | mddev->resync_mismatches = 0; | 5779 | mddev->resync_mismatches = 0; |
5627 | /* we don't use the checkpoint if there's a bitmap */ | 5780 | /* we don't use the checkpoint if there's a bitmap */ |
5628 | if (!mddev->bitmap && | 5781 | if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) |
5629 | !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) | 5782 | j = mddev->resync_min; |
5783 | else if (!mddev->bitmap) | ||
5630 | j = mddev->recovery_cp; | 5784 | j = mddev->recovery_cp; |
5785 | |||
5631 | } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) | 5786 | } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) |
5632 | max_sectors = mddev->size << 1; | 5787 | max_sectors = mddev->size << 1; |
5633 | else { | 5788 | else { |
@@ -5796,6 +5951,7 @@ void md_do_sync(mddev_t *mddev) | |||
5796 | 5951 | ||
5797 | skip: | 5952 | skip: |
5798 | mddev->curr_resync = 0; | 5953 | mddev->curr_resync = 0; |
5954 | mddev->resync_min = 0; | ||
5799 | mddev->resync_max = MaxSector; | 5955 | mddev->resync_max = MaxSector; |
5800 | sysfs_notify(&mddev->kobj, NULL, "sync_completed"); | 5956 | sysfs_notify(&mddev->kobj, NULL, "sync_completed"); |
5801 | wake_up(&resync_wait); | 5957 | wake_up(&resync_wait); |
@@ -5845,7 +6001,8 @@ static int remove_and_add_spares(mddev_t *mddev) | |||
5845 | if (rdev->raid_disk < 0 | 6001 | if (rdev->raid_disk < 0 |
5846 | && !test_bit(Faulty, &rdev->flags)) { | 6002 | && !test_bit(Faulty, &rdev->flags)) { |
5847 | rdev->recovery_offset = 0; | 6003 | rdev->recovery_offset = 0; |
5848 | if (mddev->pers->hot_add_disk(mddev,rdev)) { | 6004 | if (mddev->pers-> |
6005 | hot_add_disk(mddev, rdev) == 0) { | ||
5849 | char nm[20]; | 6006 | char nm[20]; |
5850 | sprintf(nm, "rd%d", rdev->raid_disk); | 6007 | sprintf(nm, "rd%d", rdev->raid_disk); |
5851 | if (sysfs_create_link(&mddev->kobj, | 6008 | if (sysfs_create_link(&mddev->kobj, |
@@ -5920,23 +6077,31 @@ void md_check_recovery(mddev_t *mddev) | |||
5920 | int spares = 0; | 6077 | int spares = 0; |
5921 | 6078 | ||
5922 | if (!mddev->external) { | 6079 | if (!mddev->external) { |
6080 | int did_change = 0; | ||
5923 | spin_lock_irq(&mddev->write_lock); | 6081 | spin_lock_irq(&mddev->write_lock); |
5924 | if (mddev->safemode && | 6082 | if (mddev->safemode && |
5925 | !atomic_read(&mddev->writes_pending) && | 6083 | !atomic_read(&mddev->writes_pending) && |
5926 | !mddev->in_sync && | 6084 | !mddev->in_sync && |
5927 | mddev->recovery_cp == MaxSector) { | 6085 | mddev->recovery_cp == MaxSector) { |
5928 | mddev->in_sync = 1; | 6086 | mddev->in_sync = 1; |
6087 | did_change = 1; | ||
5929 | if (mddev->persistent) | 6088 | if (mddev->persistent) |
5930 | set_bit(MD_CHANGE_CLEAN, &mddev->flags); | 6089 | set_bit(MD_CHANGE_CLEAN, &mddev->flags); |
5931 | } | 6090 | } |
5932 | if (mddev->safemode == 1) | 6091 | if (mddev->safemode == 1) |
5933 | mddev->safemode = 0; | 6092 | mddev->safemode = 0; |
5934 | spin_unlock_irq(&mddev->write_lock); | 6093 | spin_unlock_irq(&mddev->write_lock); |
6094 | if (did_change) | ||
6095 | sysfs_notify(&mddev->kobj, NULL, "array_state"); | ||
5935 | } | 6096 | } |
5936 | 6097 | ||
5937 | if (mddev->flags) | 6098 | if (mddev->flags) |
5938 | md_update_sb(mddev, 0); | 6099 | md_update_sb(mddev, 0); |
5939 | 6100 | ||
6101 | rdev_for_each(rdev, rtmp, mddev) | ||
6102 | if (test_and_clear_bit(StateChanged, &rdev->flags)) | ||
6103 | sysfs_notify(&rdev->kobj, NULL, "state"); | ||
6104 | |||
5940 | 6105 | ||
5941 | if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && | 6106 | if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && |
5942 | !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) { | 6107 | !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) { |
@@ -5951,7 +6116,9 @@ void md_check_recovery(mddev_t *mddev) | |||
5951 | if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { | 6116 | if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { |
5952 | /* success...*/ | 6117 | /* success...*/ |
5953 | /* activate any spares */ | 6118 | /* activate any spares */ |
5954 | mddev->pers->spare_active(mddev); | 6119 | if (mddev->pers->spare_active(mddev)) |
6120 | sysfs_notify(&mddev->kobj, NULL, | ||
6121 | "degraded"); | ||
5955 | } | 6122 | } |
5956 | md_update_sb(mddev, 1); | 6123 | md_update_sb(mddev, 1); |
5957 | 6124 | ||
@@ -5965,13 +6132,18 @@ void md_check_recovery(mddev_t *mddev) | |||
5965 | mddev->recovery = 0; | 6132 | mddev->recovery = 0; |
5966 | /* flag recovery needed just to double check */ | 6133 | /* flag recovery needed just to double check */ |
5967 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | 6134 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); |
6135 | sysfs_notify(&mddev->kobj, NULL, "sync_action"); | ||
5968 | md_new_event(mddev); | 6136 | md_new_event(mddev); |
5969 | goto unlock; | 6137 | goto unlock; |
5970 | } | 6138 | } |
6139 | /* Set RUNNING before clearing NEEDED to avoid | ||
6140 | * any transients in the value of "sync_action". | ||
6141 | */ | ||
6142 | set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); | ||
6143 | clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | ||
5971 | /* Clear some bits that don't mean anything, but | 6144 | /* Clear some bits that don't mean anything, but |
5972 | * might be left set | 6145 | * might be left set |
5973 | */ | 6146 | */ |
5974 | clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | ||
5975 | clear_bit(MD_RECOVERY_INTR, &mddev->recovery); | 6147 | clear_bit(MD_RECOVERY_INTR, &mddev->recovery); |
5976 | clear_bit(MD_RECOVERY_DONE, &mddev->recovery); | 6148 | clear_bit(MD_RECOVERY_DONE, &mddev->recovery); |
5977 | 6149 | ||
@@ -5989,17 +6161,19 @@ void md_check_recovery(mddev_t *mddev) | |||
5989 | /* Cannot proceed */ | 6161 | /* Cannot proceed */ |
5990 | goto unlock; | 6162 | goto unlock; |
5991 | set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); | 6163 | set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); |
6164 | clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); | ||
5992 | } else if ((spares = remove_and_add_spares(mddev))) { | 6165 | } else if ((spares = remove_and_add_spares(mddev))) { |
5993 | clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); | 6166 | clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); |
5994 | clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); | 6167 | clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); |
6168 | set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); | ||
5995 | } else if (mddev->recovery_cp < MaxSector) { | 6169 | } else if (mddev->recovery_cp < MaxSector) { |
5996 | set_bit(MD_RECOVERY_SYNC, &mddev->recovery); | 6170 | set_bit(MD_RECOVERY_SYNC, &mddev->recovery); |
6171 | clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); | ||
5997 | } else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) | 6172 | } else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) |
5998 | /* nothing to be done ... */ | 6173 | /* nothing to be done ... */ |
5999 | goto unlock; | 6174 | goto unlock; |
6000 | 6175 | ||
6001 | if (mddev->pers->sync_request) { | 6176 | if (mddev->pers->sync_request) { |
6002 | set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); | ||
6003 | if (spares && mddev->bitmap && ! mddev->bitmap->file) { | 6177 | if (spares && mddev->bitmap && ! mddev->bitmap->file) { |
6004 | /* We are adding a device or devices to an array | 6178 | /* We are adding a device or devices to an array |
6005 | * which has the bitmap stored on all devices. | 6179 | * which has the bitmap stored on all devices. |
@@ -6018,9 +6192,16 @@ void md_check_recovery(mddev_t *mddev) | |||
6018 | mddev->recovery = 0; | 6192 | mddev->recovery = 0; |
6019 | } else | 6193 | } else |
6020 | md_wakeup_thread(mddev->sync_thread); | 6194 | md_wakeup_thread(mddev->sync_thread); |
6195 | sysfs_notify(&mddev->kobj, NULL, "sync_action"); | ||
6021 | md_new_event(mddev); | 6196 | md_new_event(mddev); |
6022 | } | 6197 | } |
6023 | unlock: | 6198 | unlock: |
6199 | if (!mddev->sync_thread) { | ||
6200 | clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); | ||
6201 | if (test_and_clear_bit(MD_RECOVERY_RECOVER, | ||
6202 | &mddev->recovery)) | ||
6203 | sysfs_notify(&mddev->kobj, NULL, "sync_action"); | ||
6204 | } | ||
6024 | mddev_unlock(mddev); | 6205 | mddev_unlock(mddev); |
6025 | } | 6206 | } |
6026 | } | 6207 | } |
@@ -6047,7 +6228,7 @@ static int md_notify_reboot(struct notifier_block *this, | |||
6047 | 6228 | ||
6048 | for_each_mddev(mddev, tmp) | 6229 | for_each_mddev(mddev, tmp) |
6049 | if (mddev_trylock(mddev)) { | 6230 | if (mddev_trylock(mddev)) { |
6050 | do_md_stop (mddev, 1); | 6231 | do_md_stop (mddev, 1, 0); |
6051 | mddev_unlock(mddev); | 6232 | mddev_unlock(mddev); |
6052 | } | 6233 | } |
6053 | /* | 6234 | /* |
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index e968116e0de9..c4779ccba1c3 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c | |||
@@ -281,13 +281,18 @@ static int multipath_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) | |||
281 | { | 281 | { |
282 | multipath_conf_t *conf = mddev->private; | 282 | multipath_conf_t *conf = mddev->private; |
283 | struct request_queue *q; | 283 | struct request_queue *q; |
284 | int found = 0; | 284 | int err = -EEXIST; |
285 | int path; | 285 | int path; |
286 | struct multipath_info *p; | 286 | struct multipath_info *p; |
287 | int first = 0; | ||
288 | int last = mddev->raid_disks - 1; | ||
289 | |||
290 | if (rdev->raid_disk >= 0) | ||
291 | first = last = rdev->raid_disk; | ||
287 | 292 | ||
288 | print_multipath_conf(conf); | 293 | print_multipath_conf(conf); |
289 | 294 | ||
290 | for (path=0; path<mddev->raid_disks; path++) | 295 | for (path = first; path <= last; path++) |
291 | if ((p=conf->multipaths+path)->rdev == NULL) { | 296 | if ((p=conf->multipaths+path)->rdev == NULL) { |
292 | q = rdev->bdev->bd_disk->queue; | 297 | q = rdev->bdev->bd_disk->queue; |
293 | blk_queue_stack_limits(mddev->queue, q); | 298 | blk_queue_stack_limits(mddev->queue, q); |
@@ -307,11 +312,13 @@ static int multipath_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) | |||
307 | rdev->raid_disk = path; | 312 | rdev->raid_disk = path; |
308 | set_bit(In_sync, &rdev->flags); | 313 | set_bit(In_sync, &rdev->flags); |
309 | rcu_assign_pointer(p->rdev, rdev); | 314 | rcu_assign_pointer(p->rdev, rdev); |
310 | found = 1; | 315 | err = 0; |
316 | break; | ||
311 | } | 317 | } |
312 | 318 | ||
313 | print_multipath_conf(conf); | 319 | print_multipath_conf(conf); |
314 | return found; | 320 | |
321 | return err; | ||
315 | } | 322 | } |
316 | 323 | ||
317 | static int multipath_remove_disk(mddev_t *mddev, int number) | 324 | static int multipath_remove_disk(mddev_t *mddev, int number) |
@@ -497,7 +504,7 @@ static int multipath_run (mddev_t *mddev) | |||
497 | /* | 504 | /* |
498 | * Ok, everything is just fine now | 505 | * Ok, everything is just fine now |
499 | */ | 506 | */ |
500 | mddev->array_size = mddev->size; | 507 | mddev->array_sectors = mddev->size * 2; |
501 | 508 | ||
502 | mddev->queue->unplug_fn = multipath_unplug; | 509 | mddev->queue->unplug_fn = multipath_unplug; |
503 | mddev->queue->backing_dev_info.congested_fn = multipath_congested; | 510 | mddev->queue->backing_dev_info.congested_fn = multipath_congested; |
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index bcbb82594a19..183610635661 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c | |||
@@ -295,16 +295,16 @@ static int raid0_run (mddev_t *mddev) | |||
295 | goto out_free_conf; | 295 | goto out_free_conf; |
296 | 296 | ||
297 | /* calculate array device size */ | 297 | /* calculate array device size */ |
298 | mddev->array_size = 0; | 298 | mddev->array_sectors = 0; |
299 | rdev_for_each(rdev, tmp, mddev) | 299 | rdev_for_each(rdev, tmp, mddev) |
300 | mddev->array_size += rdev->size; | 300 | mddev->array_sectors += rdev->size * 2; |
301 | 301 | ||
302 | printk("raid0 : md_size is %llu blocks.\n", | 302 | printk("raid0 : md_size is %llu blocks.\n", |
303 | (unsigned long long)mddev->array_size); | 303 | (unsigned long long)mddev->array_sectors / 2); |
304 | printk("raid0 : conf->hash_spacing is %llu blocks.\n", | 304 | printk("raid0 : conf->hash_spacing is %llu blocks.\n", |
305 | (unsigned long long)conf->hash_spacing); | 305 | (unsigned long long)conf->hash_spacing); |
306 | { | 306 | { |
307 | sector_t s = mddev->array_size; | 307 | sector_t s = mddev->array_sectors / 2; |
308 | sector_t space = conf->hash_spacing; | 308 | sector_t space = conf->hash_spacing; |
309 | int round; | 309 | int round; |
310 | conf->preshift = 0; | 310 | conf->preshift = 0; |
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index c610b947218a..03a5ab705c20 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c | |||
@@ -1100,11 +1100,16 @@ static int raid1_spare_active(mddev_t *mddev) | |||
1100 | static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) | 1100 | static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) |
1101 | { | 1101 | { |
1102 | conf_t *conf = mddev->private; | 1102 | conf_t *conf = mddev->private; |
1103 | int found = 0; | 1103 | int err = -EEXIST; |
1104 | int mirror = 0; | 1104 | int mirror = 0; |
1105 | mirror_info_t *p; | 1105 | mirror_info_t *p; |
1106 | int first = 0; | ||
1107 | int last = mddev->raid_disks - 1; | ||
1106 | 1108 | ||
1107 | for (mirror=0; mirror < mddev->raid_disks; mirror++) | 1109 | if (rdev->raid_disk >= 0) |
1110 | first = last = rdev->raid_disk; | ||
1111 | |||
1112 | for (mirror = first; mirror <= last; mirror++) | ||
1108 | if ( !(p=conf->mirrors+mirror)->rdev) { | 1113 | if ( !(p=conf->mirrors+mirror)->rdev) { |
1109 | 1114 | ||
1110 | blk_queue_stack_limits(mddev->queue, | 1115 | blk_queue_stack_limits(mddev->queue, |
@@ -1119,7 +1124,7 @@ static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) | |||
1119 | 1124 | ||
1120 | p->head_position = 0; | 1125 | p->head_position = 0; |
1121 | rdev->raid_disk = mirror; | 1126 | rdev->raid_disk = mirror; |
1122 | found = 1; | 1127 | err = 0; |
1123 | /* As all devices are equivalent, we don't need a full recovery | 1128 | /* As all devices are equivalent, we don't need a full recovery |
1124 | * if this was recently any drive of the array | 1129 | * if this was recently any drive of the array |
1125 | */ | 1130 | */ |
@@ -1130,7 +1135,7 @@ static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) | |||
1130 | } | 1135 | } |
1131 | 1136 | ||
1132 | print_conf(conf); | 1137 | print_conf(conf); |
1133 | return found; | 1138 | return err; |
1134 | } | 1139 | } |
1135 | 1140 | ||
1136 | static int raid1_remove_disk(mddev_t *mddev, int number) | 1141 | static int raid1_remove_disk(mddev_t *mddev, int number) |
@@ -2038,7 +2043,7 @@ static int run(mddev_t *mddev) | |||
2038 | /* | 2043 | /* |
2039 | * Ok, everything is just fine now | 2044 | * Ok, everything is just fine now |
2040 | */ | 2045 | */ |
2041 | mddev->array_size = mddev->size; | 2046 | mddev->array_sectors = mddev->size * 2; |
2042 | 2047 | ||
2043 | mddev->queue->unplug_fn = raid1_unplug; | 2048 | mddev->queue->unplug_fn = raid1_unplug; |
2044 | mddev->queue->backing_dev_info.congested_fn = raid1_congested; | 2049 | mddev->queue->backing_dev_info.congested_fn = raid1_congested; |
@@ -2100,14 +2105,15 @@ static int raid1_resize(mddev_t *mddev, sector_t sectors) | |||
2100 | * any io in the removed space completes, but it hardly seems | 2105 | * any io in the removed space completes, but it hardly seems |
2101 | * worth it. | 2106 | * worth it. |
2102 | */ | 2107 | */ |
2103 | mddev->array_size = sectors>>1; | 2108 | mddev->array_sectors = sectors; |
2104 | set_capacity(mddev->gendisk, mddev->array_size << 1); | 2109 | set_capacity(mddev->gendisk, mddev->array_sectors); |
2105 | mddev->changed = 1; | 2110 | mddev->changed = 1; |
2106 | if (mddev->array_size > mddev->size && mddev->recovery_cp == MaxSector) { | 2111 | if (mddev->array_sectors / 2 > mddev->size && |
2112 | mddev->recovery_cp == MaxSector) { | ||
2107 | mddev->recovery_cp = mddev->size << 1; | 2113 | mddev->recovery_cp = mddev->size << 1; |
2108 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | 2114 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); |
2109 | } | 2115 | } |
2110 | mddev->size = mddev->array_size; | 2116 | mddev->size = mddev->array_sectors / 2; |
2111 | mddev->resync_max_sectors = sectors; | 2117 | mddev->resync_max_sectors = sectors; |
2112 | return 0; | 2118 | return 0; |
2113 | } | 2119 | } |
@@ -2131,7 +2137,7 @@ static int raid1_reshape(mddev_t *mddev) | |||
2131 | conf_t *conf = mddev_to_conf(mddev); | 2137 | conf_t *conf = mddev_to_conf(mddev); |
2132 | int cnt, raid_disks; | 2138 | int cnt, raid_disks; |
2133 | unsigned long flags; | 2139 | unsigned long flags; |
2134 | int d, d2; | 2140 | int d, d2, err; |
2135 | 2141 | ||
2136 | /* Cannot change chunk_size, layout, or level */ | 2142 | /* Cannot change chunk_size, layout, or level */ |
2137 | if (mddev->chunk_size != mddev->new_chunk || | 2143 | if (mddev->chunk_size != mddev->new_chunk || |
@@ -2143,7 +2149,9 @@ static int raid1_reshape(mddev_t *mddev) | |||
2143 | return -EINVAL; | 2149 | return -EINVAL; |
2144 | } | 2150 | } |
2145 | 2151 | ||
2146 | md_allow_write(mddev); | 2152 | err = md_allow_write(mddev); |
2153 | if (err) | ||
2154 | return err; | ||
2147 | 2155 | ||
2148 | raid_disks = mddev->raid_disks + mddev->delta_disks; | 2156 | raid_disks = mddev->raid_disks + mddev->delta_disks; |
2149 | 2157 | ||
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 22bb2b1b886d..159535d73567 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c | |||
@@ -1114,24 +1114,30 @@ static int raid10_spare_active(mddev_t *mddev) | |||
1114 | static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) | 1114 | static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) |
1115 | { | 1115 | { |
1116 | conf_t *conf = mddev->private; | 1116 | conf_t *conf = mddev->private; |
1117 | int found = 0; | 1117 | int err = -EEXIST; |
1118 | int mirror; | 1118 | int mirror; |
1119 | mirror_info_t *p; | 1119 | mirror_info_t *p; |
1120 | int first = 0; | ||
1121 | int last = mddev->raid_disks - 1; | ||
1120 | 1122 | ||
1121 | if (mddev->recovery_cp < MaxSector) | 1123 | if (mddev->recovery_cp < MaxSector) |
1122 | /* only hot-add to in-sync arrays, as recovery is | 1124 | /* only hot-add to in-sync arrays, as recovery is |
1123 | * very different from resync | 1125 | * very different from resync |
1124 | */ | 1126 | */ |
1125 | return 0; | 1127 | return -EBUSY; |
1126 | if (!enough(conf)) | 1128 | if (!enough(conf)) |
1127 | return 0; | 1129 | return -EINVAL; |
1130 | |||
1131 | if (rdev->raid_disk) | ||
1132 | first = last = rdev->raid_disk; | ||
1128 | 1133 | ||
1129 | if (rdev->saved_raid_disk >= 0 && | 1134 | if (rdev->saved_raid_disk >= 0 && |
1135 | rdev->saved_raid_disk >= first && | ||
1130 | conf->mirrors[rdev->saved_raid_disk].rdev == NULL) | 1136 | conf->mirrors[rdev->saved_raid_disk].rdev == NULL) |
1131 | mirror = rdev->saved_raid_disk; | 1137 | mirror = rdev->saved_raid_disk; |
1132 | else | 1138 | else |
1133 | mirror = 0; | 1139 | mirror = first; |
1134 | for ( ; mirror < mddev->raid_disks; mirror++) | 1140 | for ( ; mirror <= last ; mirror++) |
1135 | if ( !(p=conf->mirrors+mirror)->rdev) { | 1141 | if ( !(p=conf->mirrors+mirror)->rdev) { |
1136 | 1142 | ||
1137 | blk_queue_stack_limits(mddev->queue, | 1143 | blk_queue_stack_limits(mddev->queue, |
@@ -1146,7 +1152,7 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) | |||
1146 | 1152 | ||
1147 | p->head_position = 0; | 1153 | p->head_position = 0; |
1148 | rdev->raid_disk = mirror; | 1154 | rdev->raid_disk = mirror; |
1149 | found = 1; | 1155 | err = 0; |
1150 | if (rdev->saved_raid_disk != mirror) | 1156 | if (rdev->saved_raid_disk != mirror) |
1151 | conf->fullsync = 1; | 1157 | conf->fullsync = 1; |
1152 | rcu_assign_pointer(p->rdev, rdev); | 1158 | rcu_assign_pointer(p->rdev, rdev); |
@@ -1154,7 +1160,7 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) | |||
1154 | } | 1160 | } |
1155 | 1161 | ||
1156 | print_conf(conf); | 1162 | print_conf(conf); |
1157 | return found; | 1163 | return err; |
1158 | } | 1164 | } |
1159 | 1165 | ||
1160 | static int raid10_remove_disk(mddev_t *mddev, int number) | 1166 | static int raid10_remove_disk(mddev_t *mddev, int number) |
@@ -2159,7 +2165,7 @@ static int run(mddev_t *mddev) | |||
2159 | /* | 2165 | /* |
2160 | * Ok, everything is just fine now | 2166 | * Ok, everything is just fine now |
2161 | */ | 2167 | */ |
2162 | mddev->array_size = size << (conf->chunk_shift-1); | 2168 | mddev->array_sectors = size << conf->chunk_shift; |
2163 | mddev->resync_max_sectors = size << conf->chunk_shift; | 2169 | mddev->resync_max_sectors = size << conf->chunk_shift; |
2164 | 2170 | ||
2165 | mddev->queue->unplug_fn = raid10_unplug; | 2171 | mddev->queue->unplug_fn = raid10_unplug; |
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 9ce7154845c6..55e7c56045a0 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c | |||
@@ -115,15 +115,20 @@ static void return_io(struct bio *return_bi) | |||
115 | return_bi = bi->bi_next; | 115 | return_bi = bi->bi_next; |
116 | bi->bi_next = NULL; | 116 | bi->bi_next = NULL; |
117 | bi->bi_size = 0; | 117 | bi->bi_size = 0; |
118 | bi->bi_end_io(bi, | 118 | bio_endio(bi, 0); |
119 | test_bit(BIO_UPTODATE, &bi->bi_flags) | ||
120 | ? 0 : -EIO); | ||
121 | bi = return_bi; | 119 | bi = return_bi; |
122 | } | 120 | } |
123 | } | 121 | } |
124 | 122 | ||
125 | static void print_raid5_conf (raid5_conf_t *conf); | 123 | static void print_raid5_conf (raid5_conf_t *conf); |
126 | 124 | ||
125 | static int stripe_operations_active(struct stripe_head *sh) | ||
126 | { | ||
127 | return sh->check_state || sh->reconstruct_state || | ||
128 | test_bit(STRIPE_BIOFILL_RUN, &sh->state) || | ||
129 | test_bit(STRIPE_COMPUTE_RUN, &sh->state); | ||
130 | } | ||
131 | |||
127 | static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh) | 132 | static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh) |
128 | { | 133 | { |
129 | if (atomic_dec_and_test(&sh->count)) { | 134 | if (atomic_dec_and_test(&sh->count)) { |
@@ -143,7 +148,7 @@ static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh) | |||
143 | } | 148 | } |
144 | md_wakeup_thread(conf->mddev->thread); | 149 | md_wakeup_thread(conf->mddev->thread); |
145 | } else { | 150 | } else { |
146 | BUG_ON(sh->ops.pending); | 151 | BUG_ON(stripe_operations_active(sh)); |
147 | if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { | 152 | if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { |
148 | atomic_dec(&conf->preread_active_stripes); | 153 | atomic_dec(&conf->preread_active_stripes); |
149 | if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) | 154 | if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) |
@@ -245,7 +250,7 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx, int | |||
245 | 250 | ||
246 | BUG_ON(atomic_read(&sh->count) != 0); | 251 | BUG_ON(atomic_read(&sh->count) != 0); |
247 | BUG_ON(test_bit(STRIPE_HANDLE, &sh->state)); | 252 | BUG_ON(test_bit(STRIPE_HANDLE, &sh->state)); |
248 | BUG_ON(sh->ops.pending || sh->ops.ack || sh->ops.complete); | 253 | BUG_ON(stripe_operations_active(sh)); |
249 | 254 | ||
250 | CHECK_DEVLOCK(); | 255 | CHECK_DEVLOCK(); |
251 | pr_debug("init_stripe called, stripe %llu\n", | 256 | pr_debug("init_stripe called, stripe %llu\n", |
@@ -346,62 +351,18 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector | |||
346 | return sh; | 351 | return sh; |
347 | } | 352 | } |
348 | 353 | ||
349 | /* test_and_ack_op() ensures that we only dequeue an operation once */ | ||
350 | #define test_and_ack_op(op, pend) \ | ||
351 | do { \ | ||
352 | if (test_bit(op, &sh->ops.pending) && \ | ||
353 | !test_bit(op, &sh->ops.complete)) { \ | ||
354 | if (test_and_set_bit(op, &sh->ops.ack)) \ | ||
355 | clear_bit(op, &pend); \ | ||
356 | else \ | ||
357 | ack++; \ | ||
358 | } else \ | ||
359 | clear_bit(op, &pend); \ | ||
360 | } while (0) | ||
361 | |||
362 | /* find new work to run, do not resubmit work that is already | ||
363 | * in flight | ||
364 | */ | ||
365 | static unsigned long get_stripe_work(struct stripe_head *sh) | ||
366 | { | ||
367 | unsigned long pending; | ||
368 | int ack = 0; | ||
369 | |||
370 | pending = sh->ops.pending; | ||
371 | |||
372 | test_and_ack_op(STRIPE_OP_BIOFILL, pending); | ||
373 | test_and_ack_op(STRIPE_OP_COMPUTE_BLK, pending); | ||
374 | test_and_ack_op(STRIPE_OP_PREXOR, pending); | ||
375 | test_and_ack_op(STRIPE_OP_BIODRAIN, pending); | ||
376 | test_and_ack_op(STRIPE_OP_POSTXOR, pending); | ||
377 | test_and_ack_op(STRIPE_OP_CHECK, pending); | ||
378 | if (test_and_clear_bit(STRIPE_OP_IO, &sh->ops.pending)) | ||
379 | ack++; | ||
380 | |||
381 | sh->ops.count -= ack; | ||
382 | if (unlikely(sh->ops.count < 0)) { | ||
383 | printk(KERN_ERR "pending: %#lx ops.pending: %#lx ops.ack: %#lx " | ||
384 | "ops.complete: %#lx\n", pending, sh->ops.pending, | ||
385 | sh->ops.ack, sh->ops.complete); | ||
386 | BUG(); | ||
387 | } | ||
388 | |||
389 | return pending; | ||
390 | } | ||
391 | |||
392 | static void | 354 | static void |
393 | raid5_end_read_request(struct bio *bi, int error); | 355 | raid5_end_read_request(struct bio *bi, int error); |
394 | static void | 356 | static void |
395 | raid5_end_write_request(struct bio *bi, int error); | 357 | raid5_end_write_request(struct bio *bi, int error); |
396 | 358 | ||
397 | static void ops_run_io(struct stripe_head *sh) | 359 | static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) |
398 | { | 360 | { |
399 | raid5_conf_t *conf = sh->raid_conf; | 361 | raid5_conf_t *conf = sh->raid_conf; |
400 | int i, disks = sh->disks; | 362 | int i, disks = sh->disks; |
401 | 363 | ||
402 | might_sleep(); | 364 | might_sleep(); |
403 | 365 | ||
404 | set_bit(STRIPE_IO_STARTED, &sh->state); | ||
405 | for (i = disks; i--; ) { | 366 | for (i = disks; i--; ) { |
406 | int rw; | 367 | int rw; |
407 | struct bio *bi; | 368 | struct bio *bi; |
@@ -430,11 +391,11 @@ static void ops_run_io(struct stripe_head *sh) | |||
430 | rcu_read_unlock(); | 391 | rcu_read_unlock(); |
431 | 392 | ||
432 | if (rdev) { | 393 | if (rdev) { |
433 | if (test_bit(STRIPE_SYNCING, &sh->state) || | 394 | if (s->syncing || s->expanding || s->expanded) |
434 | test_bit(STRIPE_EXPAND_SOURCE, &sh->state) || | ||
435 | test_bit(STRIPE_EXPAND_READY, &sh->state)) | ||
436 | md_sync_acct(rdev->bdev, STRIPE_SECTORS); | 395 | md_sync_acct(rdev->bdev, STRIPE_SECTORS); |
437 | 396 | ||
397 | set_bit(STRIPE_IO_STARTED, &sh->state); | ||
398 | |||
438 | bi->bi_bdev = rdev->bdev; | 399 | bi->bi_bdev = rdev->bdev; |
439 | pr_debug("%s: for %llu schedule op %ld on disc %d\n", | 400 | pr_debug("%s: for %llu schedule op %ld on disc %d\n", |
440 | __func__, (unsigned long long)sh->sector, | 401 | __func__, (unsigned long long)sh->sector, |
@@ -528,38 +489,34 @@ static void ops_complete_biofill(void *stripe_head_ref) | |||
528 | (unsigned long long)sh->sector); | 489 | (unsigned long long)sh->sector); |
529 | 490 | ||
530 | /* clear completed biofills */ | 491 | /* clear completed biofills */ |
492 | spin_lock_irq(&conf->device_lock); | ||
531 | for (i = sh->disks; i--; ) { | 493 | for (i = sh->disks; i--; ) { |
532 | struct r5dev *dev = &sh->dev[i]; | 494 | struct r5dev *dev = &sh->dev[i]; |
533 | 495 | ||
534 | /* acknowledge completion of a biofill operation */ | 496 | /* acknowledge completion of a biofill operation */ |
535 | /* and check if we need to reply to a read request, | 497 | /* and check if we need to reply to a read request, |
536 | * new R5_Wantfill requests are held off until | 498 | * new R5_Wantfill requests are held off until |
537 | * !test_bit(STRIPE_OP_BIOFILL, &sh->ops.pending) | 499 | * !STRIPE_BIOFILL_RUN |
538 | */ | 500 | */ |
539 | if (test_and_clear_bit(R5_Wantfill, &dev->flags)) { | 501 | if (test_and_clear_bit(R5_Wantfill, &dev->flags)) { |
540 | struct bio *rbi, *rbi2; | 502 | struct bio *rbi, *rbi2; |
541 | 503 | ||
542 | /* The access to dev->read is outside of the | ||
543 | * spin_lock_irq(&conf->device_lock), but is protected | ||
544 | * by the STRIPE_OP_BIOFILL pending bit | ||
545 | */ | ||
546 | BUG_ON(!dev->read); | 504 | BUG_ON(!dev->read); |
547 | rbi = dev->read; | 505 | rbi = dev->read; |
548 | dev->read = NULL; | 506 | dev->read = NULL; |
549 | while (rbi && rbi->bi_sector < | 507 | while (rbi && rbi->bi_sector < |
550 | dev->sector + STRIPE_SECTORS) { | 508 | dev->sector + STRIPE_SECTORS) { |
551 | rbi2 = r5_next_bio(rbi, dev->sector); | 509 | rbi2 = r5_next_bio(rbi, dev->sector); |
552 | spin_lock_irq(&conf->device_lock); | ||
553 | if (--rbi->bi_phys_segments == 0) { | 510 | if (--rbi->bi_phys_segments == 0) { |
554 | rbi->bi_next = return_bi; | 511 | rbi->bi_next = return_bi; |
555 | return_bi = rbi; | 512 | return_bi = rbi; |
556 | } | 513 | } |
557 | spin_unlock_irq(&conf->device_lock); | ||
558 | rbi = rbi2; | 514 | rbi = rbi2; |
559 | } | 515 | } |
560 | } | 516 | } |
561 | } | 517 | } |
562 | set_bit(STRIPE_OP_BIOFILL, &sh->ops.complete); | 518 | spin_unlock_irq(&conf->device_lock); |
519 | clear_bit(STRIPE_BIOFILL_RUN, &sh->state); | ||
563 | 520 | ||
564 | return_io(return_bi); | 521 | return_io(return_bi); |
565 | 522 | ||
@@ -610,13 +567,14 @@ static void ops_complete_compute5(void *stripe_head_ref) | |||
610 | set_bit(R5_UPTODATE, &tgt->flags); | 567 | set_bit(R5_UPTODATE, &tgt->flags); |
611 | BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); | 568 | BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); |
612 | clear_bit(R5_Wantcompute, &tgt->flags); | 569 | clear_bit(R5_Wantcompute, &tgt->flags); |
613 | set_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete); | 570 | clear_bit(STRIPE_COMPUTE_RUN, &sh->state); |
571 | if (sh->check_state == check_state_compute_run) | ||
572 | sh->check_state = check_state_compute_result; | ||
614 | set_bit(STRIPE_HANDLE, &sh->state); | 573 | set_bit(STRIPE_HANDLE, &sh->state); |
615 | release_stripe(sh); | 574 | release_stripe(sh); |
616 | } | 575 | } |
617 | 576 | ||
618 | static struct dma_async_tx_descriptor * | 577 | static struct dma_async_tx_descriptor *ops_run_compute5(struct stripe_head *sh) |
619 | ops_run_compute5(struct stripe_head *sh, unsigned long pending) | ||
620 | { | 578 | { |
621 | /* kernel stack size limits the total number of disks */ | 579 | /* kernel stack size limits the total number of disks */ |
622 | int disks = sh->disks; | 580 | int disks = sh->disks; |
@@ -646,10 +604,6 @@ ops_run_compute5(struct stripe_head *sh, unsigned long pending) | |||
646 | ASYNC_TX_XOR_ZERO_DST, NULL, | 604 | ASYNC_TX_XOR_ZERO_DST, NULL, |
647 | ops_complete_compute5, sh); | 605 | ops_complete_compute5, sh); |
648 | 606 | ||
649 | /* ack now if postxor is not set to be run */ | ||
650 | if (tx && !test_bit(STRIPE_OP_POSTXOR, &pending)) | ||
651 | async_tx_ack(tx); | ||
652 | |||
653 | return tx; | 607 | return tx; |
654 | } | 608 | } |
655 | 609 | ||
@@ -659,8 +613,6 @@ static void ops_complete_prexor(void *stripe_head_ref) | |||
659 | 613 | ||
660 | pr_debug("%s: stripe %llu\n", __func__, | 614 | pr_debug("%s: stripe %llu\n", __func__, |
661 | (unsigned long long)sh->sector); | 615 | (unsigned long long)sh->sector); |
662 | |||
663 | set_bit(STRIPE_OP_PREXOR, &sh->ops.complete); | ||
664 | } | 616 | } |
665 | 617 | ||
666 | static struct dma_async_tx_descriptor * | 618 | static struct dma_async_tx_descriptor * |
@@ -680,7 +632,7 @@ ops_run_prexor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) | |||
680 | for (i = disks; i--; ) { | 632 | for (i = disks; i--; ) { |
681 | struct r5dev *dev = &sh->dev[i]; | 633 | struct r5dev *dev = &sh->dev[i]; |
682 | /* Only process blocks that are known to be uptodate */ | 634 | /* Only process blocks that are known to be uptodate */ |
683 | if (dev->towrite && test_bit(R5_Wantprexor, &dev->flags)) | 635 | if (test_bit(R5_Wantdrain, &dev->flags)) |
684 | xor_srcs[count++] = dev->page; | 636 | xor_srcs[count++] = dev->page; |
685 | } | 637 | } |
686 | 638 | ||
@@ -692,16 +644,10 @@ ops_run_prexor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) | |||
692 | } | 644 | } |
693 | 645 | ||
694 | static struct dma_async_tx_descriptor * | 646 | static struct dma_async_tx_descriptor * |
695 | ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx, | 647 | ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) |
696 | unsigned long pending) | ||
697 | { | 648 | { |
698 | int disks = sh->disks; | 649 | int disks = sh->disks; |
699 | int pd_idx = sh->pd_idx, i; | 650 | int i; |
700 | |||
701 | /* check if prexor is active which means only process blocks | ||
702 | * that are part of a read-modify-write (Wantprexor) | ||
703 | */ | ||
704 | int prexor = test_bit(STRIPE_OP_PREXOR, &pending); | ||
705 | 651 | ||
706 | pr_debug("%s: stripe %llu\n", __func__, | 652 | pr_debug("%s: stripe %llu\n", __func__, |
707 | (unsigned long long)sh->sector); | 653 | (unsigned long long)sh->sector); |
@@ -709,20 +655,8 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx, | |||
709 | for (i = disks; i--; ) { | 655 | for (i = disks; i--; ) { |
710 | struct r5dev *dev = &sh->dev[i]; | 656 | struct r5dev *dev = &sh->dev[i]; |
711 | struct bio *chosen; | 657 | struct bio *chosen; |
712 | int towrite; | ||
713 | |||
714 | towrite = 0; | ||
715 | if (prexor) { /* rmw */ | ||
716 | if (dev->towrite && | ||
717 | test_bit(R5_Wantprexor, &dev->flags)) | ||
718 | towrite = 1; | ||
719 | } else { /* rcw */ | ||
720 | if (i != pd_idx && dev->towrite && | ||
721 | test_bit(R5_LOCKED, &dev->flags)) | ||
722 | towrite = 1; | ||
723 | } | ||
724 | 658 | ||
725 | if (towrite) { | 659 | if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) { |
726 | struct bio *wbi; | 660 | struct bio *wbi; |
727 | 661 | ||
728 | spin_lock(&sh->lock); | 662 | spin_lock(&sh->lock); |
@@ -747,18 +681,6 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx, | |||
747 | static void ops_complete_postxor(void *stripe_head_ref) | 681 | static void ops_complete_postxor(void *stripe_head_ref) |
748 | { | 682 | { |
749 | struct stripe_head *sh = stripe_head_ref; | 683 | struct stripe_head *sh = stripe_head_ref; |
750 | |||
751 | pr_debug("%s: stripe %llu\n", __func__, | ||
752 | (unsigned long long)sh->sector); | ||
753 | |||
754 | set_bit(STRIPE_OP_POSTXOR, &sh->ops.complete); | ||
755 | set_bit(STRIPE_HANDLE, &sh->state); | ||
756 | release_stripe(sh); | ||
757 | } | ||
758 | |||
759 | static void ops_complete_write(void *stripe_head_ref) | ||
760 | { | ||
761 | struct stripe_head *sh = stripe_head_ref; | ||
762 | int disks = sh->disks, i, pd_idx = sh->pd_idx; | 684 | int disks = sh->disks, i, pd_idx = sh->pd_idx; |
763 | 685 | ||
764 | pr_debug("%s: stripe %llu\n", __func__, | 686 | pr_debug("%s: stripe %llu\n", __func__, |
@@ -770,16 +692,21 @@ static void ops_complete_write(void *stripe_head_ref) | |||
770 | set_bit(R5_UPTODATE, &dev->flags); | 692 | set_bit(R5_UPTODATE, &dev->flags); |
771 | } | 693 | } |
772 | 694 | ||
773 | set_bit(STRIPE_OP_BIODRAIN, &sh->ops.complete); | 695 | if (sh->reconstruct_state == reconstruct_state_drain_run) |
774 | set_bit(STRIPE_OP_POSTXOR, &sh->ops.complete); | 696 | sh->reconstruct_state = reconstruct_state_drain_result; |
697 | else if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) | ||
698 | sh->reconstruct_state = reconstruct_state_prexor_drain_result; | ||
699 | else { | ||
700 | BUG_ON(sh->reconstruct_state != reconstruct_state_run); | ||
701 | sh->reconstruct_state = reconstruct_state_result; | ||
702 | } | ||
775 | 703 | ||
776 | set_bit(STRIPE_HANDLE, &sh->state); | 704 | set_bit(STRIPE_HANDLE, &sh->state); |
777 | release_stripe(sh); | 705 | release_stripe(sh); |
778 | } | 706 | } |
779 | 707 | ||
780 | static void | 708 | static void |
781 | ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx, | 709 | ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) |
782 | unsigned long pending) | ||
783 | { | 710 | { |
784 | /* kernel stack size limits the total number of disks */ | 711 | /* kernel stack size limits the total number of disks */ |
785 | int disks = sh->disks; | 712 | int disks = sh->disks; |
@@ -787,9 +714,8 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx, | |||
787 | 714 | ||
788 | int count = 0, pd_idx = sh->pd_idx, i; | 715 | int count = 0, pd_idx = sh->pd_idx, i; |
789 | struct page *xor_dest; | 716 | struct page *xor_dest; |
790 | int prexor = test_bit(STRIPE_OP_PREXOR, &pending); | 717 | int prexor = 0; |
791 | unsigned long flags; | 718 | unsigned long flags; |
792 | dma_async_tx_callback callback; | ||
793 | 719 | ||
794 | pr_debug("%s: stripe %llu\n", __func__, | 720 | pr_debug("%s: stripe %llu\n", __func__, |
795 | (unsigned long long)sh->sector); | 721 | (unsigned long long)sh->sector); |
@@ -797,7 +723,8 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx, | |||
797 | /* check if prexor is active which means only process blocks | 723 | /* check if prexor is active which means only process blocks |
798 | * that are part of a read-modify-write (written) | 724 | * that are part of a read-modify-write (written) |
799 | */ | 725 | */ |
800 | if (prexor) { | 726 | if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) { |
727 | prexor = 1; | ||
801 | xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; | 728 | xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; |
802 | for (i = disks; i--; ) { | 729 | for (i = disks; i--; ) { |
803 | struct r5dev *dev = &sh->dev[i]; | 730 | struct r5dev *dev = &sh->dev[i]; |
@@ -813,10 +740,6 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx, | |||
813 | } | 740 | } |
814 | } | 741 | } |
815 | 742 | ||
816 | /* check whether this postxor is part of a write */ | ||
817 | callback = test_bit(STRIPE_OP_BIODRAIN, &pending) ? | ||
818 | ops_complete_write : ops_complete_postxor; | ||
819 | |||
820 | /* 1/ if we prexor'd then the dest is reused as a source | 743 | /* 1/ if we prexor'd then the dest is reused as a source |
821 | * 2/ if we did not prexor then we are redoing the parity | 744 | * 2/ if we did not prexor then we are redoing the parity |
822 | * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST | 745 | * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST |
@@ -830,25 +753,20 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx, | |||
830 | if (unlikely(count == 1)) { | 753 | if (unlikely(count == 1)) { |
831 | flags &= ~(ASYNC_TX_XOR_DROP_DST | ASYNC_TX_XOR_ZERO_DST); | 754 | flags &= ~(ASYNC_TX_XOR_DROP_DST | ASYNC_TX_XOR_ZERO_DST); |
832 | tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, | 755 | tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, |
833 | flags, tx, callback, sh); | 756 | flags, tx, ops_complete_postxor, sh); |
834 | } else | 757 | } else |
835 | tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, | 758 | tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, |
836 | flags, tx, callback, sh); | 759 | flags, tx, ops_complete_postxor, sh); |
837 | } | 760 | } |
838 | 761 | ||
839 | static void ops_complete_check(void *stripe_head_ref) | 762 | static void ops_complete_check(void *stripe_head_ref) |
840 | { | 763 | { |
841 | struct stripe_head *sh = stripe_head_ref; | 764 | struct stripe_head *sh = stripe_head_ref; |
842 | int pd_idx = sh->pd_idx; | ||
843 | 765 | ||
844 | pr_debug("%s: stripe %llu\n", __func__, | 766 | pr_debug("%s: stripe %llu\n", __func__, |
845 | (unsigned long long)sh->sector); | 767 | (unsigned long long)sh->sector); |
846 | 768 | ||
847 | if (test_and_clear_bit(STRIPE_OP_MOD_DMA_CHECK, &sh->ops.pending) && | 769 | sh->check_state = check_state_check_result; |
848 | sh->ops.zero_sum_result == 0) | ||
849 | set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); | ||
850 | |||
851 | set_bit(STRIPE_OP_CHECK, &sh->ops.complete); | ||
852 | set_bit(STRIPE_HANDLE, &sh->state); | 770 | set_bit(STRIPE_HANDLE, &sh->state); |
853 | release_stripe(sh); | 771 | release_stripe(sh); |
854 | } | 772 | } |
@@ -875,46 +793,42 @@ static void ops_run_check(struct stripe_head *sh) | |||
875 | tx = async_xor_zero_sum(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, | 793 | tx = async_xor_zero_sum(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, |
876 | &sh->ops.zero_sum_result, 0, NULL, NULL, NULL); | 794 | &sh->ops.zero_sum_result, 0, NULL, NULL, NULL); |
877 | 795 | ||
878 | if (tx) | ||
879 | set_bit(STRIPE_OP_MOD_DMA_CHECK, &sh->ops.pending); | ||
880 | else | ||
881 | clear_bit(STRIPE_OP_MOD_DMA_CHECK, &sh->ops.pending); | ||
882 | |||
883 | atomic_inc(&sh->count); | 796 | atomic_inc(&sh->count); |
884 | tx = async_trigger_callback(ASYNC_TX_DEP_ACK | ASYNC_TX_ACK, tx, | 797 | tx = async_trigger_callback(ASYNC_TX_DEP_ACK | ASYNC_TX_ACK, tx, |
885 | ops_complete_check, sh); | 798 | ops_complete_check, sh); |
886 | } | 799 | } |
887 | 800 | ||
888 | static void raid5_run_ops(struct stripe_head *sh, unsigned long pending) | 801 | static void raid5_run_ops(struct stripe_head *sh, unsigned long ops_request) |
889 | { | 802 | { |
890 | int overlap_clear = 0, i, disks = sh->disks; | 803 | int overlap_clear = 0, i, disks = sh->disks; |
891 | struct dma_async_tx_descriptor *tx = NULL; | 804 | struct dma_async_tx_descriptor *tx = NULL; |
892 | 805 | ||
893 | if (test_bit(STRIPE_OP_BIOFILL, &pending)) { | 806 | if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) { |
894 | ops_run_biofill(sh); | 807 | ops_run_biofill(sh); |
895 | overlap_clear++; | 808 | overlap_clear++; |
896 | } | 809 | } |
897 | 810 | ||
898 | if (test_bit(STRIPE_OP_COMPUTE_BLK, &pending)) | 811 | if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) { |
899 | tx = ops_run_compute5(sh, pending); | 812 | tx = ops_run_compute5(sh); |
813 | /* terminate the chain if postxor is not set to be run */ | ||
814 | if (tx && !test_bit(STRIPE_OP_POSTXOR, &ops_request)) | ||
815 | async_tx_ack(tx); | ||
816 | } | ||
900 | 817 | ||
901 | if (test_bit(STRIPE_OP_PREXOR, &pending)) | 818 | if (test_bit(STRIPE_OP_PREXOR, &ops_request)) |
902 | tx = ops_run_prexor(sh, tx); | 819 | tx = ops_run_prexor(sh, tx); |
903 | 820 | ||
904 | if (test_bit(STRIPE_OP_BIODRAIN, &pending)) { | 821 | if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) { |
905 | tx = ops_run_biodrain(sh, tx, pending); | 822 | tx = ops_run_biodrain(sh, tx); |
906 | overlap_clear++; | 823 | overlap_clear++; |
907 | } | 824 | } |
908 | 825 | ||
909 | if (test_bit(STRIPE_OP_POSTXOR, &pending)) | 826 | if (test_bit(STRIPE_OP_POSTXOR, &ops_request)) |
910 | ops_run_postxor(sh, tx, pending); | 827 | ops_run_postxor(sh, tx); |
911 | 828 | ||
912 | if (test_bit(STRIPE_OP_CHECK, &pending)) | 829 | if (test_bit(STRIPE_OP_CHECK, &ops_request)) |
913 | ops_run_check(sh); | 830 | ops_run_check(sh); |
914 | 831 | ||
915 | if (test_bit(STRIPE_OP_IO, &pending)) | ||
916 | ops_run_io(sh); | ||
917 | |||
918 | if (overlap_clear) | 832 | if (overlap_clear) |
919 | for (i = disks; i--; ) { | 833 | for (i = disks; i--; ) { |
920 | struct r5dev *dev = &sh->dev[i]; | 834 | struct r5dev *dev = &sh->dev[i]; |
@@ -997,14 +911,16 @@ static int resize_stripes(raid5_conf_t *conf, int newsize) | |||
997 | struct stripe_head *osh, *nsh; | 911 | struct stripe_head *osh, *nsh; |
998 | LIST_HEAD(newstripes); | 912 | LIST_HEAD(newstripes); |
999 | struct disk_info *ndisks; | 913 | struct disk_info *ndisks; |
1000 | int err = 0; | 914 | int err; |
1001 | struct kmem_cache *sc; | 915 | struct kmem_cache *sc; |
1002 | int i; | 916 | int i; |
1003 | 917 | ||
1004 | if (newsize <= conf->pool_size) | 918 | if (newsize <= conf->pool_size) |
1005 | return 0; /* never bother to shrink */ | 919 | return 0; /* never bother to shrink */ |
1006 | 920 | ||
1007 | md_allow_write(conf->mddev); | 921 | err = md_allow_write(conf->mddev); |
922 | if (err) | ||
923 | return err; | ||
1008 | 924 | ||
1009 | /* Step 1 */ | 925 | /* Step 1 */ |
1010 | sc = kmem_cache_create(conf->cache_name[1-conf->active_name], | 926 | sc = kmem_cache_create(conf->cache_name[1-conf->active_name], |
@@ -1703,11 +1619,11 @@ static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2) | |||
1703 | } | 1619 | } |
1704 | } | 1620 | } |
1705 | 1621 | ||
1706 | static int | 1622 | static void |
1707 | handle_write_operations5(struct stripe_head *sh, int rcw, int expand) | 1623 | schedule_reconstruction5(struct stripe_head *sh, struct stripe_head_state *s, |
1624 | int rcw, int expand) | ||
1708 | { | 1625 | { |
1709 | int i, pd_idx = sh->pd_idx, disks = sh->disks; | 1626 | int i, pd_idx = sh->pd_idx, disks = sh->disks; |
1710 | int locked = 0; | ||
1711 | 1627 | ||
1712 | if (rcw) { | 1628 | if (rcw) { |
1713 | /* if we are not expanding this is a proper write request, and | 1629 | /* if we are not expanding this is a proper write request, and |
@@ -1715,53 +1631,48 @@ handle_write_operations5(struct stripe_head *sh, int rcw, int expand) | |||
1715 | * stripe cache | 1631 | * stripe cache |
1716 | */ | 1632 | */ |
1717 | if (!expand) { | 1633 | if (!expand) { |
1718 | set_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending); | 1634 | sh->reconstruct_state = reconstruct_state_drain_run; |
1719 | sh->ops.count++; | 1635 | set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); |
1720 | } | 1636 | } else |
1637 | sh->reconstruct_state = reconstruct_state_run; | ||
1721 | 1638 | ||
1722 | set_bit(STRIPE_OP_POSTXOR, &sh->ops.pending); | 1639 | set_bit(STRIPE_OP_POSTXOR, &s->ops_request); |
1723 | sh->ops.count++; | ||
1724 | 1640 | ||
1725 | for (i = disks; i--; ) { | 1641 | for (i = disks; i--; ) { |
1726 | struct r5dev *dev = &sh->dev[i]; | 1642 | struct r5dev *dev = &sh->dev[i]; |
1727 | 1643 | ||
1728 | if (dev->towrite) { | 1644 | if (dev->towrite) { |
1729 | set_bit(R5_LOCKED, &dev->flags); | 1645 | set_bit(R5_LOCKED, &dev->flags); |
1646 | set_bit(R5_Wantdrain, &dev->flags); | ||
1730 | if (!expand) | 1647 | if (!expand) |
1731 | clear_bit(R5_UPTODATE, &dev->flags); | 1648 | clear_bit(R5_UPTODATE, &dev->flags); |
1732 | locked++; | 1649 | s->locked++; |
1733 | } | 1650 | } |
1734 | } | 1651 | } |
1735 | if (locked + 1 == disks) | 1652 | if (s->locked + 1 == disks) |
1736 | if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state)) | 1653 | if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state)) |
1737 | atomic_inc(&sh->raid_conf->pending_full_writes); | 1654 | atomic_inc(&sh->raid_conf->pending_full_writes); |
1738 | } else { | 1655 | } else { |
1739 | BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) || | 1656 | BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) || |
1740 | test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags))); | 1657 | test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags))); |
1741 | 1658 | ||
1742 | set_bit(STRIPE_OP_PREXOR, &sh->ops.pending); | 1659 | sh->reconstruct_state = reconstruct_state_prexor_drain_run; |
1743 | set_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending); | 1660 | set_bit(STRIPE_OP_PREXOR, &s->ops_request); |
1744 | set_bit(STRIPE_OP_POSTXOR, &sh->ops.pending); | 1661 | set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); |
1745 | 1662 | set_bit(STRIPE_OP_POSTXOR, &s->ops_request); | |
1746 | sh->ops.count += 3; | ||
1747 | 1663 | ||
1748 | for (i = disks; i--; ) { | 1664 | for (i = disks; i--; ) { |
1749 | struct r5dev *dev = &sh->dev[i]; | 1665 | struct r5dev *dev = &sh->dev[i]; |
1750 | if (i == pd_idx) | 1666 | if (i == pd_idx) |
1751 | continue; | 1667 | continue; |
1752 | 1668 | ||
1753 | /* For a read-modify write there may be blocks that are | ||
1754 | * locked for reading while others are ready to be | ||
1755 | * written so we distinguish these blocks by the | ||
1756 | * R5_Wantprexor bit | ||
1757 | */ | ||
1758 | if (dev->towrite && | 1669 | if (dev->towrite && |
1759 | (test_bit(R5_UPTODATE, &dev->flags) || | 1670 | (test_bit(R5_UPTODATE, &dev->flags) || |
1760 | test_bit(R5_Wantcompute, &dev->flags))) { | 1671 | test_bit(R5_Wantcompute, &dev->flags))) { |
1761 | set_bit(R5_Wantprexor, &dev->flags); | 1672 | set_bit(R5_Wantdrain, &dev->flags); |
1762 | set_bit(R5_LOCKED, &dev->flags); | 1673 | set_bit(R5_LOCKED, &dev->flags); |
1763 | clear_bit(R5_UPTODATE, &dev->flags); | 1674 | clear_bit(R5_UPTODATE, &dev->flags); |
1764 | locked++; | 1675 | s->locked++; |
1765 | } | 1676 | } |
1766 | } | 1677 | } |
1767 | } | 1678 | } |
@@ -1771,13 +1682,11 @@ handle_write_operations5(struct stripe_head *sh, int rcw, int expand) | |||
1771 | */ | 1682 | */ |
1772 | set_bit(R5_LOCKED, &sh->dev[pd_idx].flags); | 1683 | set_bit(R5_LOCKED, &sh->dev[pd_idx].flags); |
1773 | clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); | 1684 | clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); |
1774 | locked++; | 1685 | s->locked++; |
1775 | 1686 | ||
1776 | pr_debug("%s: stripe %llu locked: %d pending: %lx\n", | 1687 | pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n", |
1777 | __func__, (unsigned long long)sh->sector, | 1688 | __func__, (unsigned long long)sh->sector, |
1778 | locked, sh->ops.pending); | 1689 | s->locked, s->ops_request); |
1779 | |||
1780 | return locked; | ||
1781 | } | 1690 | } |
1782 | 1691 | ||
1783 | /* | 1692 | /* |
@@ -1876,7 +1785,7 @@ static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int disks) | |||
1876 | } | 1785 | } |
1877 | 1786 | ||
1878 | static void | 1787 | static void |
1879 | handle_requests_to_failed_array(raid5_conf_t *conf, struct stripe_head *sh, | 1788 | handle_failed_stripe(raid5_conf_t *conf, struct stripe_head *sh, |
1880 | struct stripe_head_state *s, int disks, | 1789 | struct stripe_head_state *s, int disks, |
1881 | struct bio **return_bi) | 1790 | struct bio **return_bi) |
1882 | { | 1791 | { |
@@ -1967,48 +1876,38 @@ handle_requests_to_failed_array(raid5_conf_t *conf, struct stripe_head *sh, | |||
1967 | md_wakeup_thread(conf->mddev->thread); | 1876 | md_wakeup_thread(conf->mddev->thread); |
1968 | } | 1877 | } |
1969 | 1878 | ||
1970 | /* __handle_issuing_new_read_requests5 - returns 0 if there are no more disks | 1879 | /* fetch_block5 - checks the given member device to see if its data needs |
1971 | * to process | 1880 | * to be read or computed to satisfy a request. |
1881 | * | ||
1882 | * Returns 1 when no more member devices need to be checked, otherwise returns | ||
1883 | * 0 to tell the loop in handle_stripe_fill5 to continue | ||
1972 | */ | 1884 | */ |
1973 | static int __handle_issuing_new_read_requests5(struct stripe_head *sh, | 1885 | static int fetch_block5(struct stripe_head *sh, struct stripe_head_state *s, |
1974 | struct stripe_head_state *s, int disk_idx, int disks) | 1886 | int disk_idx, int disks) |
1975 | { | 1887 | { |
1976 | struct r5dev *dev = &sh->dev[disk_idx]; | 1888 | struct r5dev *dev = &sh->dev[disk_idx]; |
1977 | struct r5dev *failed_dev = &sh->dev[s->failed_num]; | 1889 | struct r5dev *failed_dev = &sh->dev[s->failed_num]; |
1978 | 1890 | ||
1979 | /* don't schedule compute operations or reads on the parity block while | ||
1980 | * a check is in flight | ||
1981 | */ | ||
1982 | if ((disk_idx == sh->pd_idx) && | ||
1983 | test_bit(STRIPE_OP_CHECK, &sh->ops.pending)) | ||
1984 | return ~0; | ||
1985 | |||
1986 | /* is the data in this block needed, and can we get it? */ | 1891 | /* is the data in this block needed, and can we get it? */ |
1987 | if (!test_bit(R5_LOCKED, &dev->flags) && | 1892 | if (!test_bit(R5_LOCKED, &dev->flags) && |
1988 | !test_bit(R5_UPTODATE, &dev->flags) && (dev->toread || | 1893 | !test_bit(R5_UPTODATE, &dev->flags) && |
1989 | (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || | 1894 | (dev->toread || |
1990 | s->syncing || s->expanding || (s->failed && | 1895 | (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || |
1991 | (failed_dev->toread || (failed_dev->towrite && | 1896 | s->syncing || s->expanding || |
1992 | !test_bit(R5_OVERWRITE, &failed_dev->flags) | 1897 | (s->failed && |
1993 | ))))) { | 1898 | (failed_dev->toread || |
1994 | /* 1/ We would like to get this block, possibly by computing it, | 1899 | (failed_dev->towrite && |
1995 | * but we might not be able to. | 1900 | !test_bit(R5_OVERWRITE, &failed_dev->flags)))))) { |
1996 | * | 1901 | /* We would like to get this block, possibly by computing it, |
1997 | * 2/ Since parity check operations potentially make the parity | 1902 | * otherwise read it if the backing disk is insync |
1998 | * block !uptodate it will need to be refreshed before any | ||
1999 | * compute operations on data disks are scheduled. | ||
2000 | * | ||
2001 | * 3/ We hold off parity block re-reads until check operations | ||
2002 | * have quiesced. | ||
2003 | */ | 1903 | */ |
2004 | if ((s->uptodate == disks - 1) && | 1904 | if ((s->uptodate == disks - 1) && |
2005 | (s->failed && disk_idx == s->failed_num) && | 1905 | (s->failed && disk_idx == s->failed_num)) { |
2006 | !test_bit(STRIPE_OP_CHECK, &sh->ops.pending)) { | 1906 | set_bit(STRIPE_COMPUTE_RUN, &sh->state); |
2007 | set_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending); | 1907 | set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); |
2008 | set_bit(R5_Wantcompute, &dev->flags); | 1908 | set_bit(R5_Wantcompute, &dev->flags); |
2009 | sh->ops.target = disk_idx; | 1909 | sh->ops.target = disk_idx; |
2010 | s->req_compute = 1; | 1910 | s->req_compute = 1; |
2011 | sh->ops.count++; | ||
2012 | /* Careful: from this point on 'uptodate' is in the eye | 1911 | /* Careful: from this point on 'uptodate' is in the eye |
2013 | * of raid5_run_ops which services 'compute' operations | 1912 | * of raid5_run_ops which services 'compute' operations |
2014 | * before writes. R5_Wantcompute flags a block that will | 1913 | * before writes. R5_Wantcompute flags a block that will |
@@ -2016,53 +1915,40 @@ static int __handle_issuing_new_read_requests5(struct stripe_head *sh, | |||
2016 | * subsequent operation. | 1915 | * subsequent operation. |
2017 | */ | 1916 | */ |
2018 | s->uptodate++; | 1917 | s->uptodate++; |
2019 | return 0; /* uptodate + compute == disks */ | 1918 | return 1; /* uptodate + compute == disks */ |
2020 | } else if (test_bit(R5_Insync, &dev->flags)) { | 1919 | } else if (test_bit(R5_Insync, &dev->flags)) { |
2021 | set_bit(R5_LOCKED, &dev->flags); | 1920 | set_bit(R5_LOCKED, &dev->flags); |
2022 | set_bit(R5_Wantread, &dev->flags); | 1921 | set_bit(R5_Wantread, &dev->flags); |
2023 | if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending)) | ||
2024 | sh->ops.count++; | ||
2025 | s->locked++; | 1922 | s->locked++; |
2026 | pr_debug("Reading block %d (sync=%d)\n", disk_idx, | 1923 | pr_debug("Reading block %d (sync=%d)\n", disk_idx, |
2027 | s->syncing); | 1924 | s->syncing); |
2028 | } | 1925 | } |
2029 | } | 1926 | } |
2030 | 1927 | ||
2031 | return ~0; | 1928 | return 0; |
2032 | } | 1929 | } |
2033 | 1930 | ||
2034 | static void handle_issuing_new_read_requests5(struct stripe_head *sh, | 1931 | /** |
1932 | * handle_stripe_fill5 - read or compute data to satisfy pending requests. | ||
1933 | */ | ||
1934 | static void handle_stripe_fill5(struct stripe_head *sh, | ||
2035 | struct stripe_head_state *s, int disks) | 1935 | struct stripe_head_state *s, int disks) |
2036 | { | 1936 | { |
2037 | int i; | 1937 | int i; |
2038 | 1938 | ||
2039 | /* Clear completed compute operations. Parity recovery | ||
2040 | * (STRIPE_OP_MOD_REPAIR_PD) implies a write-back which is handled | ||
2041 | * later on in this routine | ||
2042 | */ | ||
2043 | if (test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete) && | ||
2044 | !test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) { | ||
2045 | clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete); | ||
2046 | clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.ack); | ||
2047 | clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending); | ||
2048 | } | ||
2049 | |||
2050 | /* look for blocks to read/compute, skip this if a compute | 1939 | /* look for blocks to read/compute, skip this if a compute |
2051 | * is already in flight, or if the stripe contents are in the | 1940 | * is already in flight, or if the stripe contents are in the |
2052 | * midst of changing due to a write | 1941 | * midst of changing due to a write |
2053 | */ | 1942 | */ |
2054 | if (!test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending) && | 1943 | if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state && |
2055 | !test_bit(STRIPE_OP_PREXOR, &sh->ops.pending) && | 1944 | !sh->reconstruct_state) |
2056 | !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) { | ||
2057 | for (i = disks; i--; ) | 1945 | for (i = disks; i--; ) |
2058 | if (__handle_issuing_new_read_requests5( | 1946 | if (fetch_block5(sh, s, i, disks)) |
2059 | sh, s, i, disks) == 0) | ||
2060 | break; | 1947 | break; |
2061 | } | ||
2062 | set_bit(STRIPE_HANDLE, &sh->state); | 1948 | set_bit(STRIPE_HANDLE, &sh->state); |
2063 | } | 1949 | } |
2064 | 1950 | ||
2065 | static void handle_issuing_new_read_requests6(struct stripe_head *sh, | 1951 | static void handle_stripe_fill6(struct stripe_head *sh, |
2066 | struct stripe_head_state *s, struct r6_state *r6s, | 1952 | struct stripe_head_state *s, struct r6_state *r6s, |
2067 | int disks) | 1953 | int disks) |
2068 | { | 1954 | { |
@@ -2121,12 +2007,12 @@ static void handle_issuing_new_read_requests6(struct stripe_head *sh, | |||
2121 | } | 2007 | } |
2122 | 2008 | ||
2123 | 2009 | ||
2124 | /* handle_completed_write_requests | 2010 | /* handle_stripe_clean_event |
2125 | * any written block on an uptodate or failed drive can be returned. | 2011 | * any written block on an uptodate or failed drive can be returned. |
2126 | * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but | 2012 | * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but |
2127 | * never LOCKED, so we don't need to test 'failed' directly. | 2013 | * never LOCKED, so we don't need to test 'failed' directly. |
2128 | */ | 2014 | */ |
2129 | static void handle_completed_write_requests(raid5_conf_t *conf, | 2015 | static void handle_stripe_clean_event(raid5_conf_t *conf, |
2130 | struct stripe_head *sh, int disks, struct bio **return_bi) | 2016 | struct stripe_head *sh, int disks, struct bio **return_bi) |
2131 | { | 2017 | { |
2132 | int i; | 2018 | int i; |
@@ -2171,7 +2057,7 @@ static void handle_completed_write_requests(raid5_conf_t *conf, | |||
2171 | md_wakeup_thread(conf->mddev->thread); | 2057 | md_wakeup_thread(conf->mddev->thread); |
2172 | } | 2058 | } |
2173 | 2059 | ||
2174 | static void handle_issuing_new_write_requests5(raid5_conf_t *conf, | 2060 | static void handle_stripe_dirtying5(raid5_conf_t *conf, |
2175 | struct stripe_head *sh, struct stripe_head_state *s, int disks) | 2061 | struct stripe_head *sh, struct stripe_head_state *s, int disks) |
2176 | { | 2062 | { |
2177 | int rmw = 0, rcw = 0, i; | 2063 | int rmw = 0, rcw = 0, i; |
@@ -2215,9 +2101,6 @@ static void handle_issuing_new_write_requests5(raid5_conf_t *conf, | |||
2215 | "%d for r-m-w\n", i); | 2101 | "%d for r-m-w\n", i); |
2216 | set_bit(R5_LOCKED, &dev->flags); | 2102 | set_bit(R5_LOCKED, &dev->flags); |
2217 | set_bit(R5_Wantread, &dev->flags); | 2103 | set_bit(R5_Wantread, &dev->flags); |
2218 | if (!test_and_set_bit( | ||
2219 | STRIPE_OP_IO, &sh->ops.pending)) | ||
2220 | sh->ops.count++; | ||
2221 | s->locked++; | 2104 | s->locked++; |
2222 | } else { | 2105 | } else { |
2223 | set_bit(STRIPE_DELAYED, &sh->state); | 2106 | set_bit(STRIPE_DELAYED, &sh->state); |
@@ -2241,9 +2124,6 @@ static void handle_issuing_new_write_requests5(raid5_conf_t *conf, | |||
2241 | "%d for Reconstruct\n", i); | 2124 | "%d for Reconstruct\n", i); |
2242 | set_bit(R5_LOCKED, &dev->flags); | 2125 | set_bit(R5_LOCKED, &dev->flags); |
2243 | set_bit(R5_Wantread, &dev->flags); | 2126 | set_bit(R5_Wantread, &dev->flags); |
2244 | if (!test_and_set_bit( | ||
2245 | STRIPE_OP_IO, &sh->ops.pending)) | ||
2246 | sh->ops.count++; | ||
2247 | s->locked++; | 2127 | s->locked++; |
2248 | } else { | 2128 | } else { |
2249 | set_bit(STRIPE_DELAYED, &sh->state); | 2129 | set_bit(STRIPE_DELAYED, &sh->state); |
@@ -2261,14 +2141,13 @@ static void handle_issuing_new_write_requests5(raid5_conf_t *conf, | |||
2261 | * simultaneously. If this is not the case then new writes need to be | 2141 | * simultaneously. If this is not the case then new writes need to be |
2262 | * held off until the compute completes. | 2142 | * held off until the compute completes. |
2263 | */ | 2143 | */ |
2264 | if ((s->req_compute || | 2144 | if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) && |
2265 | !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) && | 2145 | (s->locked == 0 && (rcw == 0 || rmw == 0) && |
2266 | (s->locked == 0 && (rcw == 0 || rmw == 0) && | 2146 | !test_bit(STRIPE_BIT_DELAY, &sh->state))) |
2267 | !test_bit(STRIPE_BIT_DELAY, &sh->state))) | 2147 | schedule_reconstruction5(sh, s, rcw == 0, 0); |
2268 | s->locked += handle_write_operations5(sh, rcw == 0, 0); | ||
2269 | } | 2148 | } |
2270 | 2149 | ||
2271 | static void handle_issuing_new_write_requests6(raid5_conf_t *conf, | 2150 | static void handle_stripe_dirtying6(raid5_conf_t *conf, |
2272 | struct stripe_head *sh, struct stripe_head_state *s, | 2151 | struct stripe_head *sh, struct stripe_head_state *s, |
2273 | struct r6_state *r6s, int disks) | 2152 | struct r6_state *r6s, int disks) |
2274 | { | 2153 | { |
@@ -2371,92 +2250,86 @@ static void handle_issuing_new_write_requests6(raid5_conf_t *conf, | |||
2371 | static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh, | 2250 | static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh, |
2372 | struct stripe_head_state *s, int disks) | 2251 | struct stripe_head_state *s, int disks) |
2373 | { | 2252 | { |
2374 | int canceled_check = 0; | 2253 | struct r5dev *dev = NULL; |
2375 | 2254 | ||
2376 | set_bit(STRIPE_HANDLE, &sh->state); | 2255 | set_bit(STRIPE_HANDLE, &sh->state); |
2377 | 2256 | ||
2378 | /* complete a check operation */ | 2257 | switch (sh->check_state) { |
2379 | if (test_and_clear_bit(STRIPE_OP_CHECK, &sh->ops.complete)) { | 2258 | case check_state_idle: |
2380 | clear_bit(STRIPE_OP_CHECK, &sh->ops.ack); | 2259 | /* start a new check operation if there are no failures */ |
2381 | clear_bit(STRIPE_OP_CHECK, &sh->ops.pending); | ||
2382 | if (s->failed == 0) { | 2260 | if (s->failed == 0) { |
2383 | if (sh->ops.zero_sum_result == 0) | ||
2384 | /* parity is correct (on disc, | ||
2385 | * not in buffer any more) | ||
2386 | */ | ||
2387 | set_bit(STRIPE_INSYNC, &sh->state); | ||
2388 | else { | ||
2389 | conf->mddev->resync_mismatches += | ||
2390 | STRIPE_SECTORS; | ||
2391 | if (test_bit( | ||
2392 | MD_RECOVERY_CHECK, &conf->mddev->recovery)) | ||
2393 | /* don't try to repair!! */ | ||
2394 | set_bit(STRIPE_INSYNC, &sh->state); | ||
2395 | else { | ||
2396 | set_bit(STRIPE_OP_COMPUTE_BLK, | ||
2397 | &sh->ops.pending); | ||
2398 | set_bit(STRIPE_OP_MOD_REPAIR_PD, | ||
2399 | &sh->ops.pending); | ||
2400 | set_bit(R5_Wantcompute, | ||
2401 | &sh->dev[sh->pd_idx].flags); | ||
2402 | sh->ops.target = sh->pd_idx; | ||
2403 | sh->ops.count++; | ||
2404 | s->uptodate++; | ||
2405 | } | ||
2406 | } | ||
2407 | } else | ||
2408 | canceled_check = 1; /* STRIPE_INSYNC is not set */ | ||
2409 | } | ||
2410 | |||
2411 | /* start a new check operation if there are no failures, the stripe is | ||
2412 | * not insync, and a repair is not in flight | ||
2413 | */ | ||
2414 | if (s->failed == 0 && | ||
2415 | !test_bit(STRIPE_INSYNC, &sh->state) && | ||
2416 | !test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) { | ||
2417 | if (!test_and_set_bit(STRIPE_OP_CHECK, &sh->ops.pending)) { | ||
2418 | BUG_ON(s->uptodate != disks); | 2261 | BUG_ON(s->uptodate != disks); |
2262 | sh->check_state = check_state_run; | ||
2263 | set_bit(STRIPE_OP_CHECK, &s->ops_request); | ||
2419 | clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags); | 2264 | clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags); |
2420 | sh->ops.count++; | ||
2421 | s->uptodate--; | 2265 | s->uptodate--; |
2266 | break; | ||
2422 | } | 2267 | } |
2423 | } | 2268 | dev = &sh->dev[s->failed_num]; |
2424 | 2269 | /* fall through */ | |
2425 | /* check if we can clear a parity disk reconstruct */ | 2270 | case check_state_compute_result: |
2426 | if (test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete) && | 2271 | sh->check_state = check_state_idle; |
2427 | test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) { | 2272 | if (!dev) |
2428 | 2273 | dev = &sh->dev[sh->pd_idx]; | |
2429 | clear_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending); | 2274 | |
2430 | clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete); | 2275 | /* check that a write has not made the stripe insync */ |
2431 | clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.ack); | 2276 | if (test_bit(STRIPE_INSYNC, &sh->state)) |
2432 | clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending); | 2277 | break; |
2433 | } | ||
2434 | |||
2435 | 2278 | ||
2436 | /* Wait for check parity and compute block operations to complete | ||
2437 | * before write-back. If a failure occurred while the check operation | ||
2438 | * was in flight we need to cycle this stripe through handle_stripe | ||
2439 | * since the parity block may not be uptodate | ||
2440 | */ | ||
2441 | if (!canceled_check && !test_bit(STRIPE_INSYNC, &sh->state) && | ||
2442 | !test_bit(STRIPE_OP_CHECK, &sh->ops.pending) && | ||
2443 | !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) { | ||
2444 | struct r5dev *dev; | ||
2445 | /* either failed parity check, or recovery is happening */ | 2279 | /* either failed parity check, or recovery is happening */ |
2446 | if (s->failed == 0) | ||
2447 | s->failed_num = sh->pd_idx; | ||
2448 | dev = &sh->dev[s->failed_num]; | ||
2449 | BUG_ON(!test_bit(R5_UPTODATE, &dev->flags)); | 2280 | BUG_ON(!test_bit(R5_UPTODATE, &dev->flags)); |
2450 | BUG_ON(s->uptodate != disks); | 2281 | BUG_ON(s->uptodate != disks); |
2451 | 2282 | ||
2452 | set_bit(R5_LOCKED, &dev->flags); | 2283 | set_bit(R5_LOCKED, &dev->flags); |
2284 | s->locked++; | ||
2453 | set_bit(R5_Wantwrite, &dev->flags); | 2285 | set_bit(R5_Wantwrite, &dev->flags); |
2454 | if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending)) | ||
2455 | sh->ops.count++; | ||
2456 | 2286 | ||
2457 | clear_bit(STRIPE_DEGRADED, &sh->state); | 2287 | clear_bit(STRIPE_DEGRADED, &sh->state); |
2458 | s->locked++; | ||
2459 | set_bit(STRIPE_INSYNC, &sh->state); | 2288 | set_bit(STRIPE_INSYNC, &sh->state); |
2289 | break; | ||
2290 | case check_state_run: | ||
2291 | break; /* we will be called again upon completion */ | ||
2292 | case check_state_check_result: | ||
2293 | sh->check_state = check_state_idle; | ||
2294 | |||
2295 | /* if a failure occurred during the check operation, leave | ||
2296 | * STRIPE_INSYNC not set and let the stripe be handled again | ||
2297 | */ | ||
2298 | if (s->failed) | ||
2299 | break; | ||
2300 | |||
2301 | /* handle a successful check operation, if parity is correct | ||
2302 | * we are done. Otherwise update the mismatch count and repair | ||
2303 | * parity if !MD_RECOVERY_CHECK | ||
2304 | */ | ||
2305 | if (sh->ops.zero_sum_result == 0) | ||
2306 | /* parity is correct (on disc, | ||
2307 | * not in buffer any more) | ||
2308 | */ | ||
2309 | set_bit(STRIPE_INSYNC, &sh->state); | ||
2310 | else { | ||
2311 | conf->mddev->resync_mismatches += STRIPE_SECTORS; | ||
2312 | if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) | ||
2313 | /* don't try to repair!! */ | ||
2314 | set_bit(STRIPE_INSYNC, &sh->state); | ||
2315 | else { | ||
2316 | sh->check_state = check_state_compute_run; | ||
2317 | set_bit(STRIPE_COMPUTE_RUN, &sh->state); | ||
2318 | set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); | ||
2319 | set_bit(R5_Wantcompute, | ||
2320 | &sh->dev[sh->pd_idx].flags); | ||
2321 | sh->ops.target = sh->pd_idx; | ||
2322 | s->uptodate++; | ||
2323 | } | ||
2324 | } | ||
2325 | break; | ||
2326 | case check_state_compute_run: | ||
2327 | break; | ||
2328 | default: | ||
2329 | printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n", | ||
2330 | __func__, sh->check_state, | ||
2331 | (unsigned long long) sh->sector); | ||
2332 | BUG(); | ||
2460 | } | 2333 | } |
2461 | } | 2334 | } |
2462 | 2335 | ||
@@ -2641,15 +2514,14 @@ static void handle_stripe5(struct stripe_head *sh) | |||
2641 | struct bio *return_bi = NULL; | 2514 | struct bio *return_bi = NULL; |
2642 | struct stripe_head_state s; | 2515 | struct stripe_head_state s; |
2643 | struct r5dev *dev; | 2516 | struct r5dev *dev; |
2644 | unsigned long pending = 0; | ||
2645 | mdk_rdev_t *blocked_rdev = NULL; | 2517 | mdk_rdev_t *blocked_rdev = NULL; |
2646 | int prexor; | 2518 | int prexor; |
2647 | 2519 | ||
2648 | memset(&s, 0, sizeof(s)); | 2520 | memset(&s, 0, sizeof(s)); |
2649 | pr_debug("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d " | 2521 | pr_debug("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d check:%d " |
2650 | "ops=%lx:%lx:%lx\n", (unsigned long long)sh->sector, sh->state, | 2522 | "reconstruct:%d\n", (unsigned long long)sh->sector, sh->state, |
2651 | atomic_read(&sh->count), sh->pd_idx, | 2523 | atomic_read(&sh->count), sh->pd_idx, sh->check_state, |
2652 | sh->ops.pending, sh->ops.ack, sh->ops.complete); | 2524 | sh->reconstruct_state); |
2653 | 2525 | ||
2654 | spin_lock(&sh->lock); | 2526 | spin_lock(&sh->lock); |
2655 | clear_bit(STRIPE_HANDLE, &sh->state); | 2527 | clear_bit(STRIPE_HANDLE, &sh->state); |
@@ -2658,15 +2530,8 @@ static void handle_stripe5(struct stripe_head *sh) | |||
2658 | s.syncing = test_bit(STRIPE_SYNCING, &sh->state); | 2530 | s.syncing = test_bit(STRIPE_SYNCING, &sh->state); |
2659 | s.expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); | 2531 | s.expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); |
2660 | s.expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); | 2532 | s.expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); |
2661 | /* Now to look around and see what can be done */ | ||
2662 | |||
2663 | /* clean-up completed biofill operations */ | ||
2664 | if (test_bit(STRIPE_OP_BIOFILL, &sh->ops.complete)) { | ||
2665 | clear_bit(STRIPE_OP_BIOFILL, &sh->ops.pending); | ||
2666 | clear_bit(STRIPE_OP_BIOFILL, &sh->ops.ack); | ||
2667 | clear_bit(STRIPE_OP_BIOFILL, &sh->ops.complete); | ||
2668 | } | ||
2669 | 2533 | ||
2534 | /* Now to look around and see what can be done */ | ||
2670 | rcu_read_lock(); | 2535 | rcu_read_lock(); |
2671 | for (i=disks; i--; ) { | 2536 | for (i=disks; i--; ) { |
2672 | mdk_rdev_t *rdev; | 2537 | mdk_rdev_t *rdev; |
@@ -2680,10 +2545,10 @@ static void handle_stripe5(struct stripe_head *sh) | |||
2680 | /* maybe we can request a biofill operation | 2545 | /* maybe we can request a biofill operation |
2681 | * | 2546 | * |
2682 | * new wantfill requests are only permitted while | 2547 | * new wantfill requests are only permitted while |
2683 | * STRIPE_OP_BIOFILL is clear | 2548 | * ops_complete_biofill is guaranteed to be inactive |
2684 | */ | 2549 | */ |
2685 | if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread && | 2550 | if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread && |
2686 | !test_bit(STRIPE_OP_BIOFILL, &sh->ops.pending)) | 2551 | !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) |
2687 | set_bit(R5_Wantfill, &dev->flags); | 2552 | set_bit(R5_Wantfill, &dev->flags); |
2688 | 2553 | ||
2689 | /* now count some things */ | 2554 | /* now count some things */ |
@@ -2727,8 +2592,10 @@ static void handle_stripe5(struct stripe_head *sh) | |||
2727 | goto unlock; | 2592 | goto unlock; |
2728 | } | 2593 | } |
2729 | 2594 | ||
2730 | if (s.to_fill && !test_and_set_bit(STRIPE_OP_BIOFILL, &sh->ops.pending)) | 2595 | if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) { |
2731 | sh->ops.count++; | 2596 | set_bit(STRIPE_OP_BIOFILL, &s.ops_request); |
2597 | set_bit(STRIPE_BIOFILL_RUN, &sh->state); | ||
2598 | } | ||
2732 | 2599 | ||
2733 | pr_debug("locked=%d uptodate=%d to_read=%d" | 2600 | pr_debug("locked=%d uptodate=%d to_read=%d" |
2734 | " to_write=%d failed=%d failed_num=%d\n", | 2601 | " to_write=%d failed=%d failed_num=%d\n", |
@@ -2738,8 +2605,7 @@ static void handle_stripe5(struct stripe_head *sh) | |||
2738 | * need to be failed | 2605 | * need to be failed |
2739 | */ | 2606 | */ |
2740 | if (s.failed > 1 && s.to_read+s.to_write+s.written) | 2607 | if (s.failed > 1 && s.to_read+s.to_write+s.written) |
2741 | handle_requests_to_failed_array(conf, sh, &s, disks, | 2608 | handle_failed_stripe(conf, sh, &s, disks, &return_bi); |
2742 | &return_bi); | ||
2743 | if (s.failed > 1 && s.syncing) { | 2609 | if (s.failed > 1 && s.syncing) { |
2744 | md_done_sync(conf->mddev, STRIPE_SECTORS,0); | 2610 | md_done_sync(conf->mddev, STRIPE_SECTORS,0); |
2745 | clear_bit(STRIPE_SYNCING, &sh->state); | 2611 | clear_bit(STRIPE_SYNCING, &sh->state); |
@@ -2755,48 +2621,25 @@ static void handle_stripe5(struct stripe_head *sh) | |||
2755 | !test_bit(R5_LOCKED, &dev->flags) && | 2621 | !test_bit(R5_LOCKED, &dev->flags) && |
2756 | test_bit(R5_UPTODATE, &dev->flags)) || | 2622 | test_bit(R5_UPTODATE, &dev->flags)) || |
2757 | (s.failed == 1 && s.failed_num == sh->pd_idx))) | 2623 | (s.failed == 1 && s.failed_num == sh->pd_idx))) |
2758 | handle_completed_write_requests(conf, sh, disks, &return_bi); | 2624 | handle_stripe_clean_event(conf, sh, disks, &return_bi); |
2759 | 2625 | ||
2760 | /* Now we might consider reading some blocks, either to check/generate | 2626 | /* Now we might consider reading some blocks, either to check/generate |
2761 | * parity, or to satisfy requests | 2627 | * parity, or to satisfy requests |
2762 | * or to load a block that is being partially written. | 2628 | * or to load a block that is being partially written. |
2763 | */ | 2629 | */ |
2764 | if (s.to_read || s.non_overwrite || | 2630 | if (s.to_read || s.non_overwrite || |
2765 | (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding || | 2631 | (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding) |
2766 | test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) | 2632 | handle_stripe_fill5(sh, &s, disks); |
2767 | handle_issuing_new_read_requests5(sh, &s, disks); | ||
2768 | 2633 | ||
2769 | /* Now we check to see if any write operations have recently | 2634 | /* Now we check to see if any write operations have recently |
2770 | * completed | 2635 | * completed |
2771 | */ | 2636 | */ |
2772 | |||
2773 | /* leave prexor set until postxor is done, allows us to distinguish | ||
2774 | * a rmw from a rcw during biodrain | ||
2775 | */ | ||
2776 | prexor = 0; | 2637 | prexor = 0; |
2777 | if (test_bit(STRIPE_OP_PREXOR, &sh->ops.complete) && | 2638 | if (sh->reconstruct_state == reconstruct_state_prexor_drain_result) |
2778 | test_bit(STRIPE_OP_POSTXOR, &sh->ops.complete)) { | ||
2779 | |||
2780 | prexor = 1; | 2639 | prexor = 1; |
2781 | clear_bit(STRIPE_OP_PREXOR, &sh->ops.complete); | 2640 | if (sh->reconstruct_state == reconstruct_state_drain_result || |
2782 | clear_bit(STRIPE_OP_PREXOR, &sh->ops.ack); | 2641 | sh->reconstruct_state == reconstruct_state_prexor_drain_result) { |
2783 | clear_bit(STRIPE_OP_PREXOR, &sh->ops.pending); | 2642 | sh->reconstruct_state = reconstruct_state_idle; |
2784 | |||
2785 | for (i = disks; i--; ) | ||
2786 | clear_bit(R5_Wantprexor, &sh->dev[i].flags); | ||
2787 | } | ||
2788 | |||
2789 | /* if only POSTXOR is set then this is an 'expand' postxor */ | ||
2790 | if (test_bit(STRIPE_OP_BIODRAIN, &sh->ops.complete) && | ||
2791 | test_bit(STRIPE_OP_POSTXOR, &sh->ops.complete)) { | ||
2792 | |||
2793 | clear_bit(STRIPE_OP_BIODRAIN, &sh->ops.complete); | ||
2794 | clear_bit(STRIPE_OP_BIODRAIN, &sh->ops.ack); | ||
2795 | clear_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending); | ||
2796 | |||
2797 | clear_bit(STRIPE_OP_POSTXOR, &sh->ops.complete); | ||
2798 | clear_bit(STRIPE_OP_POSTXOR, &sh->ops.ack); | ||
2799 | clear_bit(STRIPE_OP_POSTXOR, &sh->ops.pending); | ||
2800 | 2643 | ||
2801 | /* All the 'written' buffers and the parity block are ready to | 2644 | /* All the 'written' buffers and the parity block are ready to |
2802 | * be written back to disk | 2645 | * be written back to disk |
@@ -2808,9 +2651,6 @@ static void handle_stripe5(struct stripe_head *sh) | |||
2808 | (i == sh->pd_idx || dev->written)) { | 2651 | (i == sh->pd_idx || dev->written)) { |
2809 | pr_debug("Writing block %d\n", i); | 2652 | pr_debug("Writing block %d\n", i); |
2810 | set_bit(R5_Wantwrite, &dev->flags); | 2653 | set_bit(R5_Wantwrite, &dev->flags); |
2811 | if (!test_and_set_bit( | ||
2812 | STRIPE_OP_IO, &sh->ops.pending)) | ||
2813 | sh->ops.count++; | ||
2814 | if (prexor) | 2654 | if (prexor) |
2815 | continue; | 2655 | continue; |
2816 | if (!test_bit(R5_Insync, &dev->flags) || | 2656 | if (!test_bit(R5_Insync, &dev->flags) || |
@@ -2832,20 +2672,18 @@ static void handle_stripe5(struct stripe_head *sh) | |||
2832 | * 2/ A 'check' operation is in flight, as it may clobber the parity | 2672 | * 2/ A 'check' operation is in flight, as it may clobber the parity |
2833 | * block. | 2673 | * block. |
2834 | */ | 2674 | */ |
2835 | if (s.to_write && !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending) && | 2675 | if (s.to_write && !sh->reconstruct_state && !sh->check_state) |
2836 | !test_bit(STRIPE_OP_CHECK, &sh->ops.pending)) | 2676 | handle_stripe_dirtying5(conf, sh, &s, disks); |
2837 | handle_issuing_new_write_requests5(conf, sh, &s, disks); | ||
2838 | 2677 | ||
2839 | /* maybe we need to check and possibly fix the parity for this stripe | 2678 | /* maybe we need to check and possibly fix the parity for this stripe |
2840 | * Any reads will already have been scheduled, so we just see if enough | 2679 | * Any reads will already have been scheduled, so we just see if enough |
2841 | * data is available. The parity check is held off while parity | 2680 | * data is available. The parity check is held off while parity |
2842 | * dependent operations are in flight. | 2681 | * dependent operations are in flight. |
2843 | */ | 2682 | */ |
2844 | if ((s.syncing && s.locked == 0 && | 2683 | if (sh->check_state || |
2845 | !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending) && | 2684 | (s.syncing && s.locked == 0 && |
2846 | !test_bit(STRIPE_INSYNC, &sh->state)) || | 2685 | !test_bit(STRIPE_COMPUTE_RUN, &sh->state) && |
2847 | test_bit(STRIPE_OP_CHECK, &sh->ops.pending) || | 2686 | !test_bit(STRIPE_INSYNC, &sh->state))) |
2848 | test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) | ||
2849 | handle_parity_checks5(conf, sh, &s, disks); | 2687 | handle_parity_checks5(conf, sh, &s, disks); |
2850 | 2688 | ||
2851 | if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { | 2689 | if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { |
@@ -2864,52 +2702,35 @@ static void handle_stripe5(struct stripe_head *sh) | |||
2864 | dev = &sh->dev[s.failed_num]; | 2702 | dev = &sh->dev[s.failed_num]; |
2865 | if (!test_bit(R5_ReWrite, &dev->flags)) { | 2703 | if (!test_bit(R5_ReWrite, &dev->flags)) { |
2866 | set_bit(R5_Wantwrite, &dev->flags); | 2704 | set_bit(R5_Wantwrite, &dev->flags); |
2867 | if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending)) | ||
2868 | sh->ops.count++; | ||
2869 | set_bit(R5_ReWrite, &dev->flags); | 2705 | set_bit(R5_ReWrite, &dev->flags); |
2870 | set_bit(R5_LOCKED, &dev->flags); | 2706 | set_bit(R5_LOCKED, &dev->flags); |
2871 | s.locked++; | 2707 | s.locked++; |
2872 | } else { | 2708 | } else { |
2873 | /* let's read it back */ | 2709 | /* let's read it back */ |
2874 | set_bit(R5_Wantread, &dev->flags); | 2710 | set_bit(R5_Wantread, &dev->flags); |
2875 | if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending)) | ||
2876 | sh->ops.count++; | ||
2877 | set_bit(R5_LOCKED, &dev->flags); | 2711 | set_bit(R5_LOCKED, &dev->flags); |
2878 | s.locked++; | 2712 | s.locked++; |
2879 | } | 2713 | } |
2880 | } | 2714 | } |
2881 | 2715 | ||
2882 | /* Finish postxor operations initiated by the expansion | 2716 | /* Finish reconstruct operations initiated by the expansion process */ |
2883 | * process | 2717 | if (sh->reconstruct_state == reconstruct_state_result) { |
2884 | */ | 2718 | sh->reconstruct_state = reconstruct_state_idle; |
2885 | if (test_bit(STRIPE_OP_POSTXOR, &sh->ops.complete) && | ||
2886 | !test_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending)) { | ||
2887 | |||
2888 | clear_bit(STRIPE_EXPANDING, &sh->state); | 2719 | clear_bit(STRIPE_EXPANDING, &sh->state); |
2889 | 2720 | for (i = conf->raid_disks; i--; ) | |
2890 | clear_bit(STRIPE_OP_POSTXOR, &sh->ops.pending); | ||
2891 | clear_bit(STRIPE_OP_POSTXOR, &sh->ops.ack); | ||
2892 | clear_bit(STRIPE_OP_POSTXOR, &sh->ops.complete); | ||
2893 | |||
2894 | for (i = conf->raid_disks; i--; ) { | ||
2895 | set_bit(R5_Wantwrite, &sh->dev[i].flags); | 2721 | set_bit(R5_Wantwrite, &sh->dev[i].flags); |
2896 | set_bit(R5_LOCKED, &dev->flags); | 2722 | set_bit(R5_LOCKED, &dev->flags); |
2897 | s.locked++; | 2723 | s.locked++; |
2898 | if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending)) | ||
2899 | sh->ops.count++; | ||
2900 | } | ||
2901 | } | 2724 | } |
2902 | 2725 | ||
2903 | if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) && | 2726 | if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) && |
2904 | !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) { | 2727 | !sh->reconstruct_state) { |
2905 | /* Need to write out all blocks after computing parity */ | 2728 | /* Need to write out all blocks after computing parity */ |
2906 | sh->disks = conf->raid_disks; | 2729 | sh->disks = conf->raid_disks; |
2907 | sh->pd_idx = stripe_to_pdidx(sh->sector, conf, | 2730 | sh->pd_idx = stripe_to_pdidx(sh->sector, conf, |
2908 | conf->raid_disks); | 2731 | conf->raid_disks); |
2909 | s.locked += handle_write_operations5(sh, 1, 1); | 2732 | schedule_reconstruction5(sh, &s, 1, 1); |
2910 | } else if (s.expanded && | 2733 | } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) { |
2911 | s.locked == 0 && | ||
2912 | !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) { | ||
2913 | clear_bit(STRIPE_EXPAND_READY, &sh->state); | 2734 | clear_bit(STRIPE_EXPAND_READY, &sh->state); |
2914 | atomic_dec(&conf->reshape_stripes); | 2735 | atomic_dec(&conf->reshape_stripes); |
2915 | wake_up(&conf->wait_for_overlap); | 2736 | wake_up(&conf->wait_for_overlap); |
@@ -2917,12 +2738,9 @@ static void handle_stripe5(struct stripe_head *sh) | |||
2917 | } | 2738 | } |
2918 | 2739 | ||
2919 | if (s.expanding && s.locked == 0 && | 2740 | if (s.expanding && s.locked == 0 && |
2920 | !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) | 2741 | !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) |
2921 | handle_stripe_expansion(conf, sh, NULL); | 2742 | handle_stripe_expansion(conf, sh, NULL); |
2922 | 2743 | ||
2923 | if (sh->ops.count) | ||
2924 | pending = get_stripe_work(sh); | ||
2925 | |||
2926 | unlock: | 2744 | unlock: |
2927 | spin_unlock(&sh->lock); | 2745 | spin_unlock(&sh->lock); |
2928 | 2746 | ||
@@ -2930,11 +2748,12 @@ static void handle_stripe5(struct stripe_head *sh) | |||
2930 | if (unlikely(blocked_rdev)) | 2748 | if (unlikely(blocked_rdev)) |
2931 | md_wait_for_blocked_rdev(blocked_rdev, conf->mddev); | 2749 | md_wait_for_blocked_rdev(blocked_rdev, conf->mddev); |
2932 | 2750 | ||
2933 | if (pending) | 2751 | if (s.ops_request) |
2934 | raid5_run_ops(sh, pending); | 2752 | raid5_run_ops(sh, s.ops_request); |
2935 | 2753 | ||
2936 | return_io(return_bi); | 2754 | ops_run_io(sh, &s); |
2937 | 2755 | ||
2756 | return_io(return_bi); | ||
2938 | } | 2757 | } |
2939 | 2758 | ||
2940 | static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) | 2759 | static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) |
@@ -3042,8 +2861,7 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) | |||
3042 | * might need to be failed | 2861 | * might need to be failed |
3043 | */ | 2862 | */ |
3044 | if (s.failed > 2 && s.to_read+s.to_write+s.written) | 2863 | if (s.failed > 2 && s.to_read+s.to_write+s.written) |
3045 | handle_requests_to_failed_array(conf, sh, &s, disks, | 2864 | handle_failed_stripe(conf, sh, &s, disks, &return_bi); |
3046 | &return_bi); | ||
3047 | if (s.failed > 2 && s.syncing) { | 2865 | if (s.failed > 2 && s.syncing) { |
3048 | md_done_sync(conf->mddev, STRIPE_SECTORS,0); | 2866 | md_done_sync(conf->mddev, STRIPE_SECTORS,0); |
3049 | clear_bit(STRIPE_SYNCING, &sh->state); | 2867 | clear_bit(STRIPE_SYNCING, &sh->state); |
@@ -3068,7 +2886,7 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) | |||
3068 | ( r6s.q_failed || ((test_bit(R5_Insync, &qdev->flags) | 2886 | ( r6s.q_failed || ((test_bit(R5_Insync, &qdev->flags) |
3069 | && !test_bit(R5_LOCKED, &qdev->flags) | 2887 | && !test_bit(R5_LOCKED, &qdev->flags) |
3070 | && test_bit(R5_UPTODATE, &qdev->flags))))) | 2888 | && test_bit(R5_UPTODATE, &qdev->flags))))) |
3071 | handle_completed_write_requests(conf, sh, disks, &return_bi); | 2889 | handle_stripe_clean_event(conf, sh, disks, &return_bi); |
3072 | 2890 | ||
3073 | /* Now we might consider reading some blocks, either to check/generate | 2891 | /* Now we might consider reading some blocks, either to check/generate |
3074 | * parity, or to satisfy requests | 2892 | * parity, or to satisfy requests |
@@ -3076,11 +2894,11 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) | |||
3076 | */ | 2894 | */ |
3077 | if (s.to_read || s.non_overwrite || (s.to_write && s.failed) || | 2895 | if (s.to_read || s.non_overwrite || (s.to_write && s.failed) || |
3078 | (s.syncing && (s.uptodate < disks)) || s.expanding) | 2896 | (s.syncing && (s.uptodate < disks)) || s.expanding) |
3079 | handle_issuing_new_read_requests6(sh, &s, &r6s, disks); | 2897 | handle_stripe_fill6(sh, &s, &r6s, disks); |
3080 | 2898 | ||
3081 | /* now to consider writing and what else, if anything should be read */ | 2899 | /* now to consider writing and what else, if anything should be read */ |
3082 | if (s.to_write) | 2900 | if (s.to_write) |
3083 | handle_issuing_new_write_requests6(conf, sh, &s, &r6s, disks); | 2901 | handle_stripe_dirtying6(conf, sh, &s, &r6s, disks); |
3084 | 2902 | ||
3085 | /* maybe we need to check and possibly fix the parity for this stripe | 2903 | /* maybe we need to check and possibly fix the parity for this stripe |
3086 | * Any reads will already have been scheduled, so we just see if enough | 2904 | * Any reads will already have been scheduled, so we just see if enough |
@@ -3136,7 +2954,7 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) | |||
3136 | } | 2954 | } |
3137 | 2955 | ||
3138 | if (s.expanding && s.locked == 0 && | 2956 | if (s.expanding && s.locked == 0 && |
3139 | !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) | 2957 | !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) |
3140 | handle_stripe_expansion(conf, sh, &r6s); | 2958 | handle_stripe_expansion(conf, sh, &r6s); |
3141 | 2959 | ||
3142 | unlock: | 2960 | unlock: |
@@ -3146,68 +2964,9 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) | |||
3146 | if (unlikely(blocked_rdev)) | 2964 | if (unlikely(blocked_rdev)) |
3147 | md_wait_for_blocked_rdev(blocked_rdev, conf->mddev); | 2965 | md_wait_for_blocked_rdev(blocked_rdev, conf->mddev); |
3148 | 2966 | ||
3149 | return_io(return_bi); | 2967 | ops_run_io(sh, &s); |
3150 | |||
3151 | for (i=disks; i-- ;) { | ||
3152 | int rw; | ||
3153 | struct bio *bi; | ||
3154 | mdk_rdev_t *rdev; | ||
3155 | if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) | ||
3156 | rw = WRITE; | ||
3157 | else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags)) | ||
3158 | rw = READ; | ||
3159 | else | ||
3160 | continue; | ||
3161 | |||
3162 | set_bit(STRIPE_IO_STARTED, &sh->state); | ||
3163 | |||
3164 | bi = &sh->dev[i].req; | ||
3165 | |||
3166 | bi->bi_rw = rw; | ||
3167 | if (rw == WRITE) | ||
3168 | bi->bi_end_io = raid5_end_write_request; | ||
3169 | else | ||
3170 | bi->bi_end_io = raid5_end_read_request; | ||
3171 | |||
3172 | rcu_read_lock(); | ||
3173 | rdev = rcu_dereference(conf->disks[i].rdev); | ||
3174 | if (rdev && test_bit(Faulty, &rdev->flags)) | ||
3175 | rdev = NULL; | ||
3176 | if (rdev) | ||
3177 | atomic_inc(&rdev->nr_pending); | ||
3178 | rcu_read_unlock(); | ||
3179 | 2968 | ||
3180 | if (rdev) { | 2969 | return_io(return_bi); |
3181 | if (s.syncing || s.expanding || s.expanded) | ||
3182 | md_sync_acct(rdev->bdev, STRIPE_SECTORS); | ||
3183 | |||
3184 | bi->bi_bdev = rdev->bdev; | ||
3185 | pr_debug("for %llu schedule op %ld on disc %d\n", | ||
3186 | (unsigned long long)sh->sector, bi->bi_rw, i); | ||
3187 | atomic_inc(&sh->count); | ||
3188 | bi->bi_sector = sh->sector + rdev->data_offset; | ||
3189 | bi->bi_flags = 1 << BIO_UPTODATE; | ||
3190 | bi->bi_vcnt = 1; | ||
3191 | bi->bi_max_vecs = 1; | ||
3192 | bi->bi_idx = 0; | ||
3193 | bi->bi_io_vec = &sh->dev[i].vec; | ||
3194 | bi->bi_io_vec[0].bv_len = STRIPE_SIZE; | ||
3195 | bi->bi_io_vec[0].bv_offset = 0; | ||
3196 | bi->bi_size = STRIPE_SIZE; | ||
3197 | bi->bi_next = NULL; | ||
3198 | if (rw == WRITE && | ||
3199 | test_bit(R5_ReWrite, &sh->dev[i].flags)) | ||
3200 | atomic_add(STRIPE_SECTORS, &rdev->corrected_errors); | ||
3201 | generic_make_request(bi); | ||
3202 | } else { | ||
3203 | if (rw == WRITE) | ||
3204 | set_bit(STRIPE_DEGRADED, &sh->state); | ||
3205 | pr_debug("skip op %ld on disc %d for sector %llu\n", | ||
3206 | bi->bi_rw, i, (unsigned long long)sh->sector); | ||
3207 | clear_bit(R5_LOCKED, &sh->dev[i].flags); | ||
3208 | set_bit(STRIPE_HANDLE, &sh->state); | ||
3209 | } | ||
3210 | } | ||
3211 | } | 2970 | } |
3212 | 2971 | ||
3213 | static void handle_stripe(struct stripe_head *sh, struct page *tmp_page) | 2972 | static void handle_stripe(struct stripe_head *sh, struct page *tmp_page) |
@@ -3697,9 +3456,7 @@ static int make_request(struct request_queue *q, struct bio * bi) | |||
3697 | if ( rw == WRITE ) | 3456 | if ( rw == WRITE ) |
3698 | md_write_end(mddev); | 3457 | md_write_end(mddev); |
3699 | 3458 | ||
3700 | bi->bi_end_io(bi, | 3459 | bio_endio(bi, 0); |
3701 | test_bit(BIO_UPTODATE, &bi->bi_flags) | ||
3702 | ? 0 : -EIO); | ||
3703 | } | 3460 | } |
3704 | return 0; | 3461 | return 0; |
3705 | } | 3462 | } |
@@ -3785,7 +3542,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped | |||
3785 | j == raid6_next_disk(sh->pd_idx, sh->disks)) | 3542 | j == raid6_next_disk(sh->pd_idx, sh->disks)) |
3786 | continue; | 3543 | continue; |
3787 | s = compute_blocknr(sh, j); | 3544 | s = compute_blocknr(sh, j); |
3788 | if (s < (mddev->array_size<<1)) { | 3545 | if (s < mddev->array_sectors) { |
3789 | skipped = 1; | 3546 | skipped = 1; |
3790 | continue; | 3547 | continue; |
3791 | } | 3548 | } |
@@ -4002,12 +3759,8 @@ static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio) | |||
4002 | spin_lock_irq(&conf->device_lock); | 3759 | spin_lock_irq(&conf->device_lock); |
4003 | remaining = --raid_bio->bi_phys_segments; | 3760 | remaining = --raid_bio->bi_phys_segments; |
4004 | spin_unlock_irq(&conf->device_lock); | 3761 | spin_unlock_irq(&conf->device_lock); |
4005 | if (remaining == 0) { | 3762 | if (remaining == 0) |
4006 | 3763 | bio_endio(raid_bio, 0); | |
4007 | raid_bio->bi_end_io(raid_bio, | ||
4008 | test_bit(BIO_UPTODATE, &raid_bio->bi_flags) | ||
4009 | ? 0 : -EIO); | ||
4010 | } | ||
4011 | if (atomic_dec_and_test(&conf->active_aligned_reads)) | 3764 | if (atomic_dec_and_test(&conf->active_aligned_reads)) |
4012 | wake_up(&conf->wait_for_stripe); | 3765 | wake_up(&conf->wait_for_stripe); |
4013 | return handled; | 3766 | return handled; |
@@ -4094,6 +3847,8 @@ raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len) | |||
4094 | { | 3847 | { |
4095 | raid5_conf_t *conf = mddev_to_conf(mddev); | 3848 | raid5_conf_t *conf = mddev_to_conf(mddev); |
4096 | unsigned long new; | 3849 | unsigned long new; |
3850 | int err; | ||
3851 | |||
4097 | if (len >= PAGE_SIZE) | 3852 | if (len >= PAGE_SIZE) |
4098 | return -EINVAL; | 3853 | return -EINVAL; |
4099 | if (!conf) | 3854 | if (!conf) |
@@ -4109,7 +3864,9 @@ raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len) | |||
4109 | else | 3864 | else |
4110 | break; | 3865 | break; |
4111 | } | 3866 | } |
4112 | md_allow_write(mddev); | 3867 | err = md_allow_write(mddev); |
3868 | if (err) | ||
3869 | return err; | ||
4113 | while (new > conf->max_nr_stripes) { | 3870 | while (new > conf->max_nr_stripes) { |
4114 | if (grow_one_stripe(conf)) | 3871 | if (grow_one_stripe(conf)) |
4115 | conf->max_nr_stripes++; | 3872 | conf->max_nr_stripes++; |
@@ -4434,7 +4191,7 @@ static int run(mddev_t *mddev) | |||
4434 | mddev->queue->backing_dev_info.congested_data = mddev; | 4191 | mddev->queue->backing_dev_info.congested_data = mddev; |
4435 | mddev->queue->backing_dev_info.congested_fn = raid5_congested; | 4192 | mddev->queue->backing_dev_info.congested_fn = raid5_congested; |
4436 | 4193 | ||
4437 | mddev->array_size = mddev->size * (conf->previous_raid_disks - | 4194 | mddev->array_sectors = 2 * mddev->size * (conf->previous_raid_disks - |
4438 | conf->max_degraded); | 4195 | conf->max_degraded); |
4439 | 4196 | ||
4440 | blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec); | 4197 | blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec); |
@@ -4609,35 +4366,41 @@ abort: | |||
4609 | static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) | 4366 | static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) |
4610 | { | 4367 | { |
4611 | raid5_conf_t *conf = mddev->private; | 4368 | raid5_conf_t *conf = mddev->private; |
4612 | int found = 0; | 4369 | int err = -EEXIST; |
4613 | int disk; | 4370 | int disk; |
4614 | struct disk_info *p; | 4371 | struct disk_info *p; |
4372 | int first = 0; | ||
4373 | int last = conf->raid_disks - 1; | ||
4615 | 4374 | ||
4616 | if (mddev->degraded > conf->max_degraded) | 4375 | if (mddev->degraded > conf->max_degraded) |
4617 | /* no point adding a device */ | 4376 | /* no point adding a device */ |
4618 | return 0; | 4377 | return -EINVAL; |
4378 | |||
4379 | if (rdev->raid_disk >= 0) | ||
4380 | first = last = rdev->raid_disk; | ||
4619 | 4381 | ||
4620 | /* | 4382 | /* |
4621 | * find the disk ... but prefer rdev->saved_raid_disk | 4383 | * find the disk ... but prefer rdev->saved_raid_disk |
4622 | * if possible. | 4384 | * if possible. |
4623 | */ | 4385 | */ |
4624 | if (rdev->saved_raid_disk >= 0 && | 4386 | if (rdev->saved_raid_disk >= 0 && |
4387 | rdev->saved_raid_disk >= first && | ||
4625 | conf->disks[rdev->saved_raid_disk].rdev == NULL) | 4388 | conf->disks[rdev->saved_raid_disk].rdev == NULL) |
4626 | disk = rdev->saved_raid_disk; | 4389 | disk = rdev->saved_raid_disk; |
4627 | else | 4390 | else |
4628 | disk = 0; | 4391 | disk = first; |
4629 | for ( ; disk < conf->raid_disks; disk++) | 4392 | for ( ; disk <= last ; disk++) |
4630 | if ((p=conf->disks + disk)->rdev == NULL) { | 4393 | if ((p=conf->disks + disk)->rdev == NULL) { |
4631 | clear_bit(In_sync, &rdev->flags); | 4394 | clear_bit(In_sync, &rdev->flags); |
4632 | rdev->raid_disk = disk; | 4395 | rdev->raid_disk = disk; |
4633 | found = 1; | 4396 | err = 0; |
4634 | if (rdev->saved_raid_disk != disk) | 4397 | if (rdev->saved_raid_disk != disk) |
4635 | conf->fullsync = 1; | 4398 | conf->fullsync = 1; |
4636 | rcu_assign_pointer(p->rdev, rdev); | 4399 | rcu_assign_pointer(p->rdev, rdev); |
4637 | break; | 4400 | break; |
4638 | } | 4401 | } |
4639 | print_raid5_conf(conf); | 4402 | print_raid5_conf(conf); |
4640 | return found; | 4403 | return err; |
4641 | } | 4404 | } |
4642 | 4405 | ||
4643 | static int raid5_resize(mddev_t *mddev, sector_t sectors) | 4406 | static int raid5_resize(mddev_t *mddev, sector_t sectors) |
@@ -4652,8 +4415,9 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors) | |||
4652 | raid5_conf_t *conf = mddev_to_conf(mddev); | 4415 | raid5_conf_t *conf = mddev_to_conf(mddev); |
4653 | 4416 | ||
4654 | sectors &= ~((sector_t)mddev->chunk_size/512 - 1); | 4417 | sectors &= ~((sector_t)mddev->chunk_size/512 - 1); |
4655 | mddev->array_size = (sectors * (mddev->raid_disks-conf->max_degraded))>>1; | 4418 | mddev->array_sectors = sectors * (mddev->raid_disks |
4656 | set_capacity(mddev->gendisk, mddev->array_size << 1); | 4419 | - conf->max_degraded); |
4420 | set_capacity(mddev->gendisk, mddev->array_sectors); | ||
4657 | mddev->changed = 1; | 4421 | mddev->changed = 1; |
4658 | if (sectors/2 > mddev->size && mddev->recovery_cp == MaxSector) { | 4422 | if (sectors/2 > mddev->size && mddev->recovery_cp == MaxSector) { |
4659 | mddev->recovery_cp = mddev->size << 1; | 4423 | mddev->recovery_cp = mddev->size << 1; |
@@ -4738,7 +4502,7 @@ static int raid5_start_reshape(mddev_t *mddev) | |||
4738 | rdev_for_each(rdev, rtmp, mddev) | 4502 | rdev_for_each(rdev, rtmp, mddev) |
4739 | if (rdev->raid_disk < 0 && | 4503 | if (rdev->raid_disk < 0 && |
4740 | !test_bit(Faulty, &rdev->flags)) { | 4504 | !test_bit(Faulty, &rdev->flags)) { |
4741 | if (raid5_add_disk(mddev, rdev)) { | 4505 | if (raid5_add_disk(mddev, rdev) == 0) { |
4742 | char nm[20]; | 4506 | char nm[20]; |
4743 | set_bit(In_sync, &rdev->flags); | 4507 | set_bit(In_sync, &rdev->flags); |
4744 | added_devices++; | 4508 | added_devices++; |
@@ -4786,15 +4550,16 @@ static void end_reshape(raid5_conf_t *conf) | |||
4786 | struct block_device *bdev; | 4550 | struct block_device *bdev; |
4787 | 4551 | ||
4788 | if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) { | 4552 | if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) { |
4789 | conf->mddev->array_size = conf->mddev->size * | 4553 | conf->mddev->array_sectors = 2 * conf->mddev->size * |
4790 | (conf->raid_disks - conf->max_degraded); | 4554 | (conf->raid_disks - conf->max_degraded); |
4791 | set_capacity(conf->mddev->gendisk, conf->mddev->array_size << 1); | 4555 | set_capacity(conf->mddev->gendisk, conf->mddev->array_sectors); |
4792 | conf->mddev->changed = 1; | 4556 | conf->mddev->changed = 1; |
4793 | 4557 | ||
4794 | bdev = bdget_disk(conf->mddev->gendisk, 0); | 4558 | bdev = bdget_disk(conf->mddev->gendisk, 0); |
4795 | if (bdev) { | 4559 | if (bdev) { |
4796 | mutex_lock(&bdev->bd_inode->i_mutex); | 4560 | mutex_lock(&bdev->bd_inode->i_mutex); |
4797 | i_size_write(bdev->bd_inode, (loff_t)conf->mddev->array_size << 10); | 4561 | i_size_write(bdev->bd_inode, |
4562 | (loff_t)conf->mddev->array_sectors << 9); | ||
4798 | mutex_unlock(&bdev->bd_inode->i_mutex); | 4563 | mutex_unlock(&bdev->bd_inode->i_mutex); |
4799 | bdput(bdev); | 4564 | bdput(bdev); |
4800 | } | 4565 | } |
diff --git a/include/linux/raid/bitmap.h b/include/linux/raid/bitmap.h index 78bfdea24a8e..e98900671ca9 100644 --- a/include/linux/raid/bitmap.h +++ b/include/linux/raid/bitmap.h | |||
@@ -221,6 +221,7 @@ struct bitmap { | |||
221 | unsigned long syncchunk; | 221 | unsigned long syncchunk; |
222 | 222 | ||
223 | __u64 events_cleared; | 223 | __u64 events_cleared; |
224 | int need_sync; | ||
224 | 225 | ||
225 | /* bitmap spinlock */ | 226 | /* bitmap spinlock */ |
226 | spinlock_t lock; | 227 | spinlock_t lock; |
diff --git a/include/linux/raid/linear.h b/include/linux/raid/linear.h index ba15469daf11..7e375111d007 100644 --- a/include/linux/raid/linear.h +++ b/include/linux/raid/linear.h | |||
@@ -16,7 +16,7 @@ struct linear_private_data | |||
16 | struct linear_private_data *prev; /* earlier version */ | 16 | struct linear_private_data *prev; /* earlier version */ |
17 | dev_info_t **hash_table; | 17 | dev_info_t **hash_table; |
18 | sector_t hash_spacing; | 18 | sector_t hash_spacing; |
19 | sector_t array_size; | 19 | sector_t array_sectors; |
20 | int preshift; /* shift before dividing by hash_spacing */ | 20 | int preshift; /* shift before dividing by hash_spacing */ |
21 | dev_info_t disks[0]; | 21 | dev_info_t disks[0]; |
22 | }; | 22 | }; |
diff --git a/include/linux/raid/md.h b/include/linux/raid/md.h index b7386ae9d288..dc0e3fcb9f28 100644 --- a/include/linux/raid/md.h +++ b/include/linux/raid/md.h | |||
@@ -95,7 +95,7 @@ extern int sync_page_io(struct block_device *bdev, sector_t sector, int size, | |||
95 | struct page *page, int rw); | 95 | struct page *page, int rw); |
96 | extern void md_do_sync(mddev_t *mddev); | 96 | extern void md_do_sync(mddev_t *mddev); |
97 | extern void md_new_event(mddev_t *mddev); | 97 | extern void md_new_event(mddev_t *mddev); |
98 | extern void md_allow_write(mddev_t *mddev); | 98 | extern int md_allow_write(mddev_t *mddev); |
99 | extern void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev); | 99 | extern void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev); |
100 | 100 | ||
101 | #endif /* CONFIG_MD */ | 101 | #endif /* CONFIG_MD */ |
diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h index 3dea9f545c8f..9f2549ac0e2d 100644 --- a/include/linux/raid/md_k.h +++ b/include/linux/raid/md_k.h | |||
@@ -59,7 +59,7 @@ struct mdk_rdev_s | |||
59 | int sb_loaded; | 59 | int sb_loaded; |
60 | __u64 sb_events; | 60 | __u64 sb_events; |
61 | sector_t data_offset; /* start of data in array */ | 61 | sector_t data_offset; /* start of data in array */ |
62 | sector_t sb_offset; | 62 | sector_t sb_start; /* offset of the super block (in 512byte sectors) */ |
63 | int sb_size; /* bytes in the superblock */ | 63 | int sb_size; /* bytes in the superblock */ |
64 | int preferred_minor; /* autorun support */ | 64 | int preferred_minor; /* autorun support */ |
65 | 65 | ||
@@ -87,6 +87,9 @@ struct mdk_rdev_s | |||
87 | #define Blocked 8 /* An error occured on an externally | 87 | #define Blocked 8 /* An error occured on an externally |
88 | * managed array, don't allow writes | 88 | * managed array, don't allow writes |
89 | * until it is cleared */ | 89 | * until it is cleared */ |
90 | #define StateChanged 9 /* Faulty or Blocked has changed during | ||
91 | * interrupt, so it needs to be | ||
92 | * notified by the thread */ | ||
90 | wait_queue_head_t blocked_wait; | 93 | wait_queue_head_t blocked_wait; |
91 | 94 | ||
92 | int desc_nr; /* descriptor index in the superblock */ | 95 | int desc_nr; /* descriptor index in the superblock */ |
@@ -147,7 +150,7 @@ struct mddev_s | |||
147 | int raid_disks; | 150 | int raid_disks; |
148 | int max_disks; | 151 | int max_disks; |
149 | sector_t size; /* used size of component devices */ | 152 | sector_t size; /* used size of component devices */ |
150 | sector_t array_size; /* exported array size */ | 153 | sector_t array_sectors; /* exported array size */ |
151 | __u64 events; | 154 | __u64 events; |
152 | 155 | ||
153 | char uuid[16]; | 156 | char uuid[16]; |
@@ -188,6 +191,7 @@ struct mddev_s | |||
188 | * NEEDED: we might need to start a resync/recover | 191 | * NEEDED: we might need to start a resync/recover |
189 | * RUNNING: a thread is running, or about to be started | 192 | * RUNNING: a thread is running, or about to be started |
190 | * SYNC: actually doing a resync, not a recovery | 193 | * SYNC: actually doing a resync, not a recovery |
194 | * RECOVER: doing recovery, or need to try it. | ||
191 | * INTR: resync needs to be aborted for some reason | 195 | * INTR: resync needs to be aborted for some reason |
192 | * DONE: thread is done and is waiting to be reaped | 196 | * DONE: thread is done and is waiting to be reaped |
193 | * REQUEST: user-space has requested a sync (used with SYNC) | 197 | * REQUEST: user-space has requested a sync (used with SYNC) |
@@ -198,6 +202,7 @@ struct mddev_s | |||
198 | */ | 202 | */ |
199 | #define MD_RECOVERY_RUNNING 0 | 203 | #define MD_RECOVERY_RUNNING 0 |
200 | #define MD_RECOVERY_SYNC 1 | 204 | #define MD_RECOVERY_SYNC 1 |
205 | #define MD_RECOVERY_RECOVER 2 | ||
201 | #define MD_RECOVERY_INTR 3 | 206 | #define MD_RECOVERY_INTR 3 |
202 | #define MD_RECOVERY_DONE 4 | 207 | #define MD_RECOVERY_DONE 4 |
203 | #define MD_RECOVERY_NEEDED 5 | 208 | #define MD_RECOVERY_NEEDED 5 |
@@ -210,7 +215,8 @@ struct mddev_s | |||
210 | 215 | ||
211 | int in_sync; /* known not to need resync */ | 216 | int in_sync; /* known not to need resync */ |
212 | struct mutex reconfig_mutex; | 217 | struct mutex reconfig_mutex; |
213 | atomic_t active; | 218 | atomic_t active; /* general refcount */ |
219 | atomic_t openers; /* number of active opens */ | ||
214 | 220 | ||
215 | int changed; /* true if we might need to reread partition info */ | 221 | int changed; /* true if we might need to reread partition info */ |
216 | int degraded; /* whether md should consider | 222 | int degraded; /* whether md should consider |
@@ -227,6 +233,8 @@ struct mddev_s | |||
227 | atomic_t recovery_active; /* blocks scheduled, but not written */ | 233 | atomic_t recovery_active; /* blocks scheduled, but not written */ |
228 | wait_queue_head_t recovery_wait; | 234 | wait_queue_head_t recovery_wait; |
229 | sector_t recovery_cp; | 235 | sector_t recovery_cp; |
236 | sector_t resync_min; /* user requested sync | ||
237 | * starts here */ | ||
230 | sector_t resync_max; /* resync should pause | 238 | sector_t resync_max; /* resync should pause |
231 | * when it gets here */ | 239 | * when it gets here */ |
232 | 240 | ||
@@ -331,6 +339,9 @@ static inline char * mdname (mddev_t * mddev) | |||
331 | #define rdev_for_each(rdev, tmp, mddev) \ | 339 | #define rdev_for_each(rdev, tmp, mddev) \ |
332 | rdev_for_each_list(rdev, tmp, (mddev)->disks) | 340 | rdev_for_each_list(rdev, tmp, (mddev)->disks) |
333 | 341 | ||
342 | #define rdev_for_each_rcu(rdev, mddev) \ | ||
343 | list_for_each_entry_rcu(rdev, &((mddev)->disks), same_set) | ||
344 | |||
334 | typedef struct mdk_thread_s { | 345 | typedef struct mdk_thread_s { |
335 | void (*run) (mddev_t *mddev); | 346 | void (*run) (mddev_t *mddev); |
336 | mddev_t *mddev; | 347 | mddev_t *mddev; |
diff --git a/include/linux/raid/md_p.h b/include/linux/raid/md_p.h index 3f2cd98c508b..8b4de4a41ff1 100644 --- a/include/linux/raid/md_p.h +++ b/include/linux/raid/md_p.h | |||
@@ -43,14 +43,11 @@ | |||
43 | */ | 43 | */ |
44 | #define MD_RESERVED_BYTES (64 * 1024) | 44 | #define MD_RESERVED_BYTES (64 * 1024) |
45 | #define MD_RESERVED_SECTORS (MD_RESERVED_BYTES / 512) | 45 | #define MD_RESERVED_SECTORS (MD_RESERVED_BYTES / 512) |
46 | #define MD_RESERVED_BLOCKS (MD_RESERVED_BYTES / BLOCK_SIZE) | ||
47 | 46 | ||
48 | #define MD_NEW_SIZE_SECTORS(x) ((x & ~(MD_RESERVED_SECTORS - 1)) - MD_RESERVED_SECTORS) | 47 | #define MD_NEW_SIZE_SECTORS(x) ((x & ~(MD_RESERVED_SECTORS - 1)) - MD_RESERVED_SECTORS) |
49 | #define MD_NEW_SIZE_BLOCKS(x) ((x & ~(MD_RESERVED_BLOCKS - 1)) - MD_RESERVED_BLOCKS) | ||
50 | 48 | ||
51 | #define MD_SB_BYTES 4096 | 49 | #define MD_SB_BYTES 4096 |
52 | #define MD_SB_WORDS (MD_SB_BYTES / 4) | 50 | #define MD_SB_WORDS (MD_SB_BYTES / 4) |
53 | #define MD_SB_BLOCKS (MD_SB_BYTES / BLOCK_SIZE) | ||
54 | #define MD_SB_SECTORS (MD_SB_BYTES / 512) | 51 | #define MD_SB_SECTORS (MD_SB_BYTES / 512) |
55 | 52 | ||
56 | /* | 53 | /* |
diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h index f0827d31ae6f..3b2672792457 100644 --- a/include/linux/raid/raid5.h +++ b/include/linux/raid/raid5.h | |||
@@ -158,6 +158,43 @@ | |||
158 | * the compute block completes. | 158 | * the compute block completes. |
159 | */ | 159 | */ |
160 | 160 | ||
161 | /* | ||
162 | * Operations state - intermediate states that are visible outside of sh->lock | ||
163 | * In general _idle indicates nothing is running, _run indicates a data | ||
164 | * processing operation is active, and _result means the data processing result | ||
165 | * is stable and can be acted upon. For simple operations like biofill and | ||
166 | * compute that only have an _idle and _run state they are indicated with | ||
167 | * sh->state flags (STRIPE_BIOFILL_RUN and STRIPE_COMPUTE_RUN) | ||
168 | */ | ||
169 | /** | ||
170 | * enum check_states - handles syncing / repairing a stripe | ||
171 | * @check_state_idle - check operations are quiesced | ||
172 | * @check_state_run - check operation is running | ||
173 | * @check_state_check_result - set outside lock when check result is valid | ||
174 | * @check_state_compute_run - check failed and we are repairing | ||
175 | * @check_state_compute_result - set outside lock when compute result is valid | ||
176 | */ | ||
177 | enum check_states { | ||
178 | check_state_idle = 0, | ||
179 | check_state_run, /* parity check */ | ||
180 | check_state_check_result, | ||
181 | check_state_compute_run, /* parity repair */ | ||
182 | check_state_compute_result, | ||
183 | }; | ||
184 | |||
185 | /** | ||
186 | * enum reconstruct_states - handles writing or expanding a stripe | ||
187 | */ | ||
188 | enum reconstruct_states { | ||
189 | reconstruct_state_idle = 0, | ||
190 | reconstruct_state_prexor_drain_run, /* prexor-write */ | ||
191 | reconstruct_state_drain_run, /* write */ | ||
192 | reconstruct_state_run, /* expand */ | ||
193 | reconstruct_state_prexor_drain_result, | ||
194 | reconstruct_state_drain_result, | ||
195 | reconstruct_state_result, | ||
196 | }; | ||
197 | |||
161 | struct stripe_head { | 198 | struct stripe_head { |
162 | struct hlist_node hash; | 199 | struct hlist_node hash; |
163 | struct list_head lru; /* inactive_list or handle_list */ | 200 | struct list_head lru; /* inactive_list or handle_list */ |
@@ -169,19 +206,13 @@ struct stripe_head { | |||
169 | spinlock_t lock; | 206 | spinlock_t lock; |
170 | int bm_seq; /* sequence number for bitmap flushes */ | 207 | int bm_seq; /* sequence number for bitmap flushes */ |
171 | int disks; /* disks in stripe */ | 208 | int disks; /* disks in stripe */ |
209 | enum check_states check_state; | ||
210 | enum reconstruct_states reconstruct_state; | ||
172 | /* stripe_operations | 211 | /* stripe_operations |
173 | * @pending - pending ops flags (set for request->issue->complete) | ||
174 | * @ack - submitted ops flags (set for issue->complete) | ||
175 | * @complete - completed ops flags (set for complete) | ||
176 | * @target - STRIPE_OP_COMPUTE_BLK target | 212 | * @target - STRIPE_OP_COMPUTE_BLK target |
177 | * @count - raid5_runs_ops is set to run when this is non-zero | ||
178 | */ | 213 | */ |
179 | struct stripe_operations { | 214 | struct stripe_operations { |
180 | unsigned long pending; | ||
181 | unsigned long ack; | ||
182 | unsigned long complete; | ||
183 | int target; | 215 | int target; |
184 | int count; | ||
185 | u32 zero_sum_result; | 216 | u32 zero_sum_result; |
186 | } ops; | 217 | } ops; |
187 | struct r5dev { | 218 | struct r5dev { |
@@ -202,6 +233,7 @@ struct stripe_head_state { | |||
202 | int locked, uptodate, to_read, to_write, failed, written; | 233 | int locked, uptodate, to_read, to_write, failed, written; |
203 | int to_fill, compute, req_compute, non_overwrite; | 234 | int to_fill, compute, req_compute, non_overwrite; |
204 | int failed_num; | 235 | int failed_num; |
236 | unsigned long ops_request; | ||
205 | }; | 237 | }; |
206 | 238 | ||
207 | /* r6_state - extra state data only relevant to r6 */ | 239 | /* r6_state - extra state data only relevant to r6 */ |
@@ -228,9 +260,7 @@ struct r6_state { | |||
228 | #define R5_Wantfill 12 /* dev->toread contains a bio that needs | 260 | #define R5_Wantfill 12 /* dev->toread contains a bio that needs |
229 | * filling | 261 | * filling |
230 | */ | 262 | */ |
231 | #define R5_Wantprexor 13 /* distinguish blocks ready for rmw from | 263 | #define R5_Wantdrain 13 /* dev->towrite needs to be drained */ |
232 | * other "towrites" | ||
233 | */ | ||
234 | /* | 264 | /* |
235 | * Write method | 265 | * Write method |
236 | */ | 266 | */ |
@@ -254,8 +284,10 @@ struct r6_state { | |||
254 | #define STRIPE_EXPAND_READY 11 | 284 | #define STRIPE_EXPAND_READY 11 |
255 | #define STRIPE_IO_STARTED 12 /* do not count towards 'bypass_count' */ | 285 | #define STRIPE_IO_STARTED 12 /* do not count towards 'bypass_count' */ |
256 | #define STRIPE_FULL_WRITE 13 /* all blocks are set to be overwritten */ | 286 | #define STRIPE_FULL_WRITE 13 /* all blocks are set to be overwritten */ |
287 | #define STRIPE_BIOFILL_RUN 14 | ||
288 | #define STRIPE_COMPUTE_RUN 15 | ||
257 | /* | 289 | /* |
258 | * Operations flags (in issue order) | 290 | * Operation request flags |
259 | */ | 291 | */ |
260 | #define STRIPE_OP_BIOFILL 0 | 292 | #define STRIPE_OP_BIOFILL 0 |
261 | #define STRIPE_OP_COMPUTE_BLK 1 | 293 | #define STRIPE_OP_COMPUTE_BLK 1 |
@@ -263,14 +295,6 @@ struct r6_state { | |||
263 | #define STRIPE_OP_BIODRAIN 3 | 295 | #define STRIPE_OP_BIODRAIN 3 |
264 | #define STRIPE_OP_POSTXOR 4 | 296 | #define STRIPE_OP_POSTXOR 4 |
265 | #define STRIPE_OP_CHECK 5 | 297 | #define STRIPE_OP_CHECK 5 |
266 | #define STRIPE_OP_IO 6 | ||
267 | |||
268 | /* modifiers to the base operations | ||
269 | * STRIPE_OP_MOD_REPAIR_PD - compute the parity block and write it back | ||
270 | * STRIPE_OP_MOD_DMA_CHECK - parity is not corrupted by the check | ||
271 | */ | ||
272 | #define STRIPE_OP_MOD_REPAIR_PD 7 | ||
273 | #define STRIPE_OP_MOD_DMA_CHECK 8 | ||
274 | 298 | ||
275 | /* | 299 | /* |
276 | * Plugging: | 300 | * Plugging: |