-rw-r--r--  Documentation/md.txt          30
-rw-r--r--  drivers/md/bitmap.c           54
-rw-r--r--  drivers/md/faulty.c            2
-rw-r--r--  drivers/md/linear.c           20
-rw-r--r--  drivers/md/md.c              615
-rw-r--r--  drivers/md/multipath.c        17
-rw-r--r--  drivers/md/raid0.c             8
-rw-r--r--  drivers/md/raid1.c            30
-rw-r--r--  drivers/md/raid10.c           22
-rw-r--r--  drivers/md/raid5.c           745
-rw-r--r--  include/linux/raid/bitmap.h    1
-rw-r--r--  include/linux/raid/linear.h    2
-rw-r--r--  include/linux/raid/md.h        2
-rw-r--r--  include/linux/raid/md_k.h     17
-rw-r--r--  include/linux/raid/md_p.h      3
-rw-r--r--  include/linux/raid/raid5.h    64
16 files changed, 842 insertions, 790 deletions
diff --git a/Documentation/md.txt b/Documentation/md.txt
index a8b430627473..1da9d1b1793f 100644
--- a/Documentation/md.txt
+++ b/Documentation/md.txt
@@ -236,6 +236,11 @@ All md devices contain:
      writing the word for the desired state, however some states
      cannot be explicitly set, and some transitions are not allowed.
 
+     Select/poll works on this file.  All changes except between
+     active_idle and active (which can be frequent and are not
+     very interesting) are notified.  active->active_idle is
+     reported if the metadata is externally managed.
+
      clear
          No devices, no size, no level
          Writing is equivalent to STOP_ARRAY ioctl
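The select/poll behaviour documented above follows the usual sysfs pattern: the kernel calls sysfs_notify() on the attribute, poll() then reports POLLPRI|POLLERR, and the watcher must re-read the file from offset 0 to see the new value. A minimal userspace sketch, assuming an array at /sys/block/md0 (the path and error handling are illustrative, not part of this patch):

    /* Poll md array_state for changes -- illustrative sketch. */
    #include <fcntl.h>
    #include <poll.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
        char buf[64];
        int fd = open("/sys/block/md0/md/array_state", O_RDONLY);
        if (fd < 0)
            return 1;
        for (;;) {
            struct pollfd pfd = { .fd = fd, .events = POLLPRI | POLLERR };
            ssize_t n = pread(fd, buf, sizeof(buf) - 1, 0); /* consume current value */
            if (n > 0) {
                buf[n] = '\0';
                printf("array_state: %s", buf);
            }
            if (poll(&pfd, 1, -1) < 0) /* blocks until sysfs_notify() fires */
                break;
        }
        close(fd);
        return 0;
    }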
@@ -292,6 +297,10 @@ Each directory contains:
       writemostly - device will only be subject to read
          requests if there are no other options.
          This applies only to raid1 arrays.
+      blocked  - device has failed, metadata is "external",
+         and the failure hasn't been acknowledged yet.
+         Writes that would write to this device if
+         it were not faulty are blocked.
       spare    - device is working, but not a full member.
          This includes spares that are in the process
          of being recovered to
@@ -301,6 +310,12 @@ Each directory contains:
      Writing "remove" removes the device from the array.
      Writing "writemostly" sets the writemostly flag.
      Writing "-writemostly" clears the writemostly flag.
+     Writing "blocked" sets the "blocked" flag.
+     Writing "-blocked" clears the "blocked" flag and allows writes
+     to complete.
+
+     This file responds to select/poll.  Any change to 'faulty'
+     or 'blocked' causes an event.
 
    errors
      An approximate count of read errors that have been detected on
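For externally managed metadata the intended flow is: the handler polls the state file, sees "blocked" after a failure, records the failure in its own metadata, and only then clears the flag so the queued writes can complete. A sketch of the acknowledgement step (the device path is an assumption for illustration):

    /* Acknowledge a failure: clear 'blocked' so pending writes resume. */
    #include <fcntl.h>
    #include <unistd.h>

    static int ack_failure(const char *state_path)
    {
        /* e.g. "/sys/block/md0/md/dev-sda1/state" (assumed layout) */
        int fd = open(state_path, O_WRONLY);
        ssize_t n;
        if (fd < 0)
            return -1;
        n = write(fd, "-blocked", 8);
        close(fd);
        return n == 8 ? 0 : -1;
    }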
@@ -332,7 +347,7 @@ Each directory contains:
      for storage of data.  This will normally be the same as the
      component_size.  This can be written while assembling an
      array.  If a value less than the current component_size is
-     written, component_size will be reduced to this value.
+     written, it will be rejected.
 
 
 An active md device will also contain and entry for each active device
@@ -381,6 +396,19 @@ also have
      'check' and 'repair' will start the appropriate process
      providing the current state is 'idle'.
 
+     This file responds to select/poll.  Any important change in the value
+     triggers a poll event.  Sometimes the value will briefly be
+     "recover" if a recovery seems to be needed, but cannot be
+     achieved.  In that case, the transition to "recover" isn't
+     notified, but the transition away is.
+
+   degraded
+     This contains a count of the number of devices by which the
+     array is degraded.  So an optimal array will show '0'.  A
+     single failed/missing drive will show '1', etc.
+     This file responds to select/poll, any increase or decrease
+     in the count of missing devices will trigger an event.
+
    mismatch_count
      When performing 'check' and 'repair', and possibly when
      performing 'resync', md will count the number of errors that are
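Since 'degraded' holds a plain decimal count, a monitor can combine it with the poll pattern shown earlier; the only subtlety is re-reading from offset 0 on every event. A hedged helper sketch:

    /* Read the current 'degraded' count; fd is open on the sysfs file. */
    #include <stdlib.h>
    #include <unistd.h>

    static long read_degraded(int fd)
    {
        char buf[16];
        ssize_t n = pread(fd, buf, sizeof(buf) - 1, 0); /* always from offset 0 */
        if (n <= 0)
            return -1;
        buf[n] = '\0';
        return strtol(buf, NULL, 10); /* 0 = optimal, 1 = one device missing */
    }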
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index b26927ce889c..621a272a2c74 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -225,7 +225,7 @@ static struct page *read_sb_page(mddev_t *mddev, long offset, unsigned long inde
 		    || test_bit(Faulty, &rdev->flags))
 			continue;
 
-		target = (rdev->sb_offset << 1) + offset + index * (PAGE_SIZE/512);
+		target = rdev->sb_start + offset + index * (PAGE_SIZE/512);
 
 		if (sync_page_io(rdev->bdev, target, PAGE_SIZE, page, READ)) {
 			page->index = index;
@@ -241,10 +241,10 @@ static struct page *read_sb_page(mddev_t *mddev, long offset, unsigned long inde
 static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
 {
 	mdk_rdev_t *rdev;
-	struct list_head *tmp;
 	mddev_t *mddev = bitmap->mddev;
 
-	rdev_for_each(rdev, tmp, mddev)
+	rcu_read_lock();
+	rdev_for_each_rcu(rdev, mddev)
 		if (test_bit(In_sync, &rdev->flags)
 		    && !test_bit(Faulty, &rdev->flags)) {
 			int size = PAGE_SIZE;
@@ -260,32 +260,37 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
 				    + (long)(page->index * (PAGE_SIZE/512))
 				    + size/512 > 0)
 					/* bitmap runs in to metadata */
-					return -EINVAL;
+					goto bad_alignment;
 				if (rdev->data_offset + mddev->size*2
-				    > rdev->sb_offset*2 + bitmap->offset)
+				    > rdev->sb_start + bitmap->offset)
 					/* data runs in to bitmap */
-					return -EINVAL;
-			} else if (rdev->sb_offset*2 < rdev->data_offset) {
+					goto bad_alignment;
+			} else if (rdev->sb_start < rdev->data_offset) {
 				/* METADATA BITMAP DATA */
-				if (rdev->sb_offset*2
+				if (rdev->sb_start
 				    + bitmap->offset
 				    + page->index*(PAGE_SIZE/512) + size/512
 				    > rdev->data_offset)
 					/* bitmap runs in to data */
-					return -EINVAL;
+					goto bad_alignment;
 			} else {
 				/* DATA METADATA BITMAP - no problems */
 			}
 			md_super_write(mddev, rdev,
-				       (rdev->sb_offset<<1) + bitmap->offset
+				       rdev->sb_start + bitmap->offset
 				       + page->index * (PAGE_SIZE/512),
 				       size,
 				       page);
 		}
+	rcu_read_unlock();
 
 	if (wait)
 		md_super_wait(mddev);
 	return 0;
+
+ bad_alignment:
+	rcu_read_unlock();
+	return -EINVAL;
 }
 
 static void bitmap_file_kick(struct bitmap *bitmap);
@@ -454,8 +459,11 @@ void bitmap_update_sb(struct bitmap *bitmap)
 	spin_unlock_irqrestore(&bitmap->lock, flags);
 	sb = (bitmap_super_t *)kmap_atomic(bitmap->sb_page, KM_USER0);
 	sb->events = cpu_to_le64(bitmap->mddev->events);
-	if (!bitmap->mddev->degraded)
-		sb->events_cleared = cpu_to_le64(bitmap->mddev->events);
+	if (bitmap->mddev->events < bitmap->events_cleared) {
+		/* rocking back to read-only */
+		bitmap->events_cleared = bitmap->mddev->events;
+		sb->events_cleared = cpu_to_le64(bitmap->events_cleared);
+	}
 	kunmap_atomic(sb, KM_USER0);
 	write_page(bitmap, bitmap->sb_page, 1);
 }
@@ -1085,9 +1093,19 @@ void bitmap_daemon_work(struct bitmap *bitmap)
 			} else
 				spin_unlock_irqrestore(&bitmap->lock, flags);
 			lastpage = page;
-/*
-			printk("bitmap clean at page %lu\n", j);
-*/
+
+			/* We are possibly going to clear some bits, so make
+			 * sure that events_cleared is up-to-date.
+			 */
+			if (bitmap->need_sync) {
+				bitmap_super_t *sb;
+				bitmap->need_sync = 0;
+				sb = kmap_atomic(bitmap->sb_page, KM_USER0);
+				sb->events_cleared =
+					cpu_to_le64(bitmap->events_cleared);
+				kunmap_atomic(sb, KM_USER0);
+				write_page(bitmap, bitmap->sb_page, 1);
+			}
 			spin_lock_irqsave(&bitmap->lock, flags);
 			clear_page_attr(bitmap, page, BITMAP_PAGE_CLEAN);
 		}
@@ -1257,6 +1275,12 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto
 			return;
 		}
 
+		if (success &&
+		    bitmap->events_cleared < bitmap->mddev->events) {
+			bitmap->events_cleared = bitmap->mddev->events;
+			bitmap->need_sync = 1;
+		}
+
 		if (!success && ! (*bmc & NEEDED_MASK))
 			*bmc |= NEEDED_MASK;
 
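Taken together, the bitmap.c hunks above form an ordering handshake: bitmap_endwrite() records the current event count in events_cleared and raises need_sync, and bitmap_daemon_work() flushes events_cleared to the on-disk superblock before clearing any bits, so a bit can never disappear from disk while the superblock still advertises an older events_cleared. A simplified single-threaded model of that ordering (field names follow the patch; the real code holds bitmap->lock and performs page I/O):

    /* Simplified model of the events_cleared / need_sync handshake. */
    struct bitmap_model {
        unsigned long long events;         /* mirrors mddev->events */
        unsigned long long events_cleared; /* newest event we may clear under */
        int need_sync;                     /* on-disk copy is stale */
    };

    static void endwrite(struct bitmap_model *b, int success)
    {
        if (success && b->events_cleared < b->events) {
            b->events_cleared = b->events;
            b->need_sync = 1;  /* must reach disk before bits are cleared */
        }
    }

    static void daemon_clean(struct bitmap_model *b)
    {
        if (b->need_sync) {
            b->need_sync = 0;
            /* ... write superblock carrying b->events_cleared ... */
        }
        /* only now is it safe to clear bits */
    }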
diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c
index d107ddceefcd..268547dbfbd3 100644
--- a/drivers/md/faulty.c
+++ b/drivers/md/faulty.c
@@ -297,7 +297,7 @@ static int run(mddev_t *mddev)
 	rdev_for_each(rdev, tmp, mddev)
 		conf->rdev = rdev;
 
-	mddev->array_size = mddev->size;
+	mddev->array_sectors = mddev->size * 2;
 	mddev->private = conf;
 
 	reconfig(mddev, mddev->layout, -1);
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 6a866d7c8ae5..b1eebf88c209 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -122,13 +122,13 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
 		return NULL;
 
 	cnt = 0;
-	conf->array_size = 0;
+	conf->array_sectors = 0;
 
 	rdev_for_each(rdev, tmp, mddev) {
 		int j = rdev->raid_disk;
 		dev_info_t *disk = conf->disks + j;
 
-		if (j < 0 || j > raid_disks || disk->rdev) {
+		if (j < 0 || j >= raid_disks || disk->rdev) {
 			printk("linear: disk numbering problem. Aborting!\n");
 			goto out;
 		}
@@ -146,7 +146,7 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
 			blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
 
 		disk->size = rdev->size;
-		conf->array_size += rdev->size;
+		conf->array_sectors += rdev->size * 2;
 
 		cnt++;
 	}
@@ -155,7 +155,7 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
 		goto out;
 	}
 
-	min_spacing = conf->array_size;
+	min_spacing = conf->array_sectors / 2;
 	sector_div(min_spacing, PAGE_SIZE/sizeof(struct dev_info *));
 
 	/* min_spacing is the minimum spacing that will fit the hash
@@ -164,7 +164,7 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
 	 * that is larger than min_spacing as use the size of that as
 	 * the actual spacing
 	 */
-	conf->hash_spacing = conf->array_size;
+	conf->hash_spacing = conf->array_sectors / 2;
 	for (i=0; i < cnt-1 ; i++) {
 		sector_t sz = 0;
 		int j;
@@ -194,7 +194,7 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
 		unsigned round;
 		unsigned long base;
 
-		sz = conf->array_size >> conf->preshift;
+		sz = conf->array_sectors >> (conf->preshift + 1);
 		sz += 1; /* force round-up */
 		base = conf->hash_spacing >> conf->preshift;
 		round = sector_div(sz, base);
@@ -221,7 +221,7 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
 	curr_offset = 0;
 	i = 0;
 	for (curr_offset = 0;
-	     curr_offset < conf->array_size;
+	     curr_offset < conf->array_sectors / 2;
 	     curr_offset += conf->hash_spacing) {
 
 		while (i < raid_disks-1 &&
@@ -258,7 +258,7 @@ static int linear_run (mddev_t *mddev)
 	if (!conf)
 		return 1;
 	mddev->private = conf;
-	mddev->array_size = conf->array_size;
+	mddev->array_sectors = conf->array_sectors;
 
 	blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec);
 	mddev->queue->unplug_fn = linear_unplug;
@@ -292,8 +292,8 @@ static int linear_add(mddev_t *mddev, mdk_rdev_t *rdev)
 	newconf->prev = mddev_to_conf(mddev);
 	mddev->private = newconf;
 	mddev->raid_disks++;
-	mddev->array_size = newconf->array_size;
-	set_capacity(mddev->gendisk, mddev->array_size << 1);
+	mddev->array_sectors = newconf->array_sectors;
+	set_capacity(mddev->gendisk, mddev->array_sectors);
 	return 0;
 }
 
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 2580ac1b9b0f..c2ff77ccec50 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -169,7 +169,6 @@ void md_new_event(mddev_t *mddev)
 {
 	atomic_inc(&md_event_count);
 	wake_up(&md_event_waiters);
-	sysfs_notify(&mddev->kobj, NULL, "sync_action");
 }
 EXPORT_SYMBOL_GPL(md_new_event);
 
@@ -274,10 +273,12 @@ static mddev_t * mddev_find(dev_t unit)
 		INIT_LIST_HEAD(&new->all_mddevs);
 		init_timer(&new->safemode_timer);
 		atomic_set(&new->active, 1);
+		atomic_set(&new->openers, 0);
 		spin_lock_init(&new->write_lock);
 		init_waitqueue_head(&new->sb_wait);
 		init_waitqueue_head(&new->recovery_wait);
 		new->reshape_position = MaxSector;
+		new->resync_min = 0;
 		new->resync_max = MaxSector;
 		new->level = LEVEL_NONE;
 
@@ -347,21 +348,20 @@ static struct mdk_personality *find_pers(int level, char *clevel)
 	return NULL;
 }
 
+/* return the offset of the super block in 512byte sectors */
 static inline sector_t calc_dev_sboffset(struct block_device *bdev)
 {
-	sector_t size = bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
-	return MD_NEW_SIZE_BLOCKS(size);
+	sector_t num_sectors = bdev->bd_inode->i_size / 512;
+	return MD_NEW_SIZE_SECTORS(num_sectors);
 }
 
-static sector_t calc_dev_size(mdk_rdev_t *rdev, unsigned chunk_size)
+static sector_t calc_num_sectors(mdk_rdev_t *rdev, unsigned chunk_size)
 {
-	sector_t size;
-
-	size = rdev->sb_offset;
+	sector_t num_sectors = rdev->sb_start;
 
 	if (chunk_size)
-		size &= ~((sector_t)chunk_size/1024 - 1);
-	return size;
+		num_sectors &= ~((sector_t)chunk_size/512 - 1);
+	return num_sectors;
 }
 
 static int alloc_disk_sb(mdk_rdev_t * rdev)
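calc_num_sectors() keeps its old behaviour — round the usable size down to a whole number of chunks — but now in 512-byte sectors, hence the divisor change from 1024 to 512. The mask works because chunk_size is a power of two. A standalone worked example with assumed values:

    /* Round a size in 512-byte sectors down to a chunk boundary. */
    #include <stdio.h>

    int main(void)
    {
        unsigned long long num_sectors = 1000005; /* assumed device size */
        unsigned chunk_size = 65536;              /* 64KB chunks */

        /* chunk_size/512 = 128 sectors per chunk; clearing the low
         * bits rounds down to a multiple of the chunk size. */
        num_sectors &= ~(unsigned long long)(chunk_size / 512 - 1);
        printf("%llu\n", num_sectors); /* 999936 = 7812 * 128 */
        return 0;
    }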
@@ -372,7 +372,7 @@ static int alloc_disk_sb(mdk_rdev_t * rdev)
 	rdev->sb_page = alloc_page(GFP_KERNEL);
 	if (!rdev->sb_page) {
 		printk(KERN_ALERT "md: out of memory.\n");
-		return -EINVAL;
+		return -ENOMEM;
 	}
 
 	return 0;
@@ -384,7 +384,7 @@ static void free_disk_sb(mdk_rdev_t * rdev)
 		put_page(rdev->sb_page);
 		rdev->sb_loaded = 0;
 		rdev->sb_page = NULL;
-		rdev->sb_offset = 0;
+		rdev->sb_start = 0;
 		rdev->size = 0;
 	}
 }
@@ -530,7 +530,7 @@ static int read_disk_sb(mdk_rdev_t * rdev, int size)
 		return 0;
 
 
-	if (!sync_page_io(rdev->bdev, rdev->sb_offset<<1, size, rdev->sb_page, READ))
+	if (!sync_page_io(rdev->bdev, rdev->sb_start, size, rdev->sb_page, READ))
 		goto fail;
 	rdev->sb_loaded = 1;
 	return 0;
@@ -543,17 +543,12 @@ fail:
 
 static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2)
 {
-	if (	(sb1->set_uuid0 == sb2->set_uuid0) &&
-		(sb1->set_uuid1 == sb2->set_uuid1) &&
-		(sb1->set_uuid2 == sb2->set_uuid2) &&
-		(sb1->set_uuid3 == sb2->set_uuid3))
-
-		return 1;
-
-	return 0;
+	return	sb1->set_uuid0 == sb2->set_uuid0 &&
+		sb1->set_uuid1 == sb2->set_uuid1 &&
+		sb1->set_uuid2 == sb2->set_uuid2 &&
+		sb1->set_uuid3 == sb2->set_uuid3;
 }
 
-
 static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
 {
 	int ret;
@@ -564,7 +559,7 @@ static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
 
 	if (!tmp1 || !tmp2) {
 		ret = 0;
-		printk(KERN_INFO "md.c: sb1 is not equal to sb2!\n");
+		printk(KERN_INFO "md.c sb_equal(): failed to allocate memory!\n");
 		goto abort;
 	}
 
@@ -577,11 +572,7 @@ static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
 	tmp1->nr_disks = 0;
 	tmp2->nr_disks = 0;
 
-	if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4))
-		ret = 0;
-	else
-		ret = 1;
-
+	ret = (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4) == 0);
 abort:
 	kfree(tmp1);
 	kfree(tmp2);
@@ -658,11 +649,14 @@ static unsigned int calc_sb_csum(mdp_super_t * sb)
  */
 
 struct super_type  {
 	char		    *name;
 	struct module	    *owner;
-	int		    (*load_super)(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version);
-	int		    (*validate_super)(mddev_t *mddev, mdk_rdev_t *rdev);
-	void		    (*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev);
+	int		    (*load_super)(mdk_rdev_t *rdev, mdk_rdev_t *refdev,
+					  int minor_version);
+	int		    (*validate_super)(mddev_t *mddev, mdk_rdev_t *rdev);
+	void		    (*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev);
+	unsigned long long  (*rdev_size_change)(mdk_rdev_t *rdev,
+						sector_t num_sectors);
 };
 
 /*
@@ -673,16 +667,14 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version
 	char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
 	mdp_super_t *sb;
 	int ret;
-	sector_t sb_offset;
 
 	/*
-	 * Calculate the position of the superblock,
+	 * Calculate the position of the superblock (512byte sectors),
 	 * it's at the end of the disk.
 	 *
 	 * It also happens to be a multiple of 4Kb.
 	 */
-	sb_offset = calc_dev_sboffset(rdev->bdev);
-	rdev->sb_offset = sb_offset;
+	rdev->sb_start = calc_dev_sboffset(rdev->bdev);
 
 	ret = read_disk_sb(rdev, MD_SB_BYTES);
 	if (ret) return ret;
@@ -759,7 +751,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version
 	else
 		ret = 0;
 	}
-	rdev->size = calc_dev_size(rdev, sb->chunk_size);
+	rdev->size = calc_num_sectors(rdev, sb->chunk_size) / 2;
 
 	if (rdev->size < sb->size && sb->level > 1)
 		/* "this cannot possibly happen" ... */
@@ -1004,6 +996,26 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
 }
 
 /*
+ * rdev_size_change for 0.90.0
+ */
+static unsigned long long
+super_90_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
+{
+	if (num_sectors && num_sectors < rdev->mddev->size * 2)
+		return 0; /* component must fit device */
+	if (rdev->mddev->bitmap_offset)
+		return 0; /* can't move bitmap */
+	rdev->sb_start = calc_dev_sboffset(rdev->bdev);
+	if (!num_sectors || num_sectors > rdev->sb_start)
+		num_sectors = rdev->sb_start;
+	md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
+		       rdev->sb_page);
+	md_super_wait(rdev->mddev);
+	return num_sectors / 2; /* kB for sysfs */
+}
+
+
+/*
  * version 1 superblock
  */
@@ -1034,12 +1046,12 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
 {
 	struct mdp_superblock_1 *sb;
 	int ret;
-	sector_t sb_offset;
+	sector_t sb_start;
 	char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
 	int bmask;
 
 	/*
-	 * Calculate the position of the superblock.
+	 * Calculate the position of the superblock in 512byte sectors.
 	 * It is always aligned to a 4K boundary and
 	 * depeding on minor_version, it can be:
 	 * 0: At least 8K, but less than 12K, from end of device
@@ -1048,22 +1060,20 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
 	 */
 	switch(minor_version) {
 	case 0:
-		sb_offset = rdev->bdev->bd_inode->i_size >> 9;
-		sb_offset -= 8*2;
-		sb_offset &= ~(sector_t)(4*2-1);
-		/* convert from sectors to K */
-		sb_offset /= 2;
+		sb_start = rdev->bdev->bd_inode->i_size >> 9;
+		sb_start -= 8*2;
+		sb_start &= ~(sector_t)(4*2-1);
 		break;
 	case 1:
-		sb_offset = 0;
+		sb_start = 0;
 		break;
 	case 2:
-		sb_offset = 4;
+		sb_start = 8;
 		break;
 	default:
 		return -EINVAL;
 	}
-	rdev->sb_offset = sb_offset;
+	rdev->sb_start = sb_start;
 
 	/* superblock is rarely larger than 1K, but it can be larger,
 	 * and it is safe to read 4k, so we do that
@@ -1077,7 +1087,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
 	if (sb->magic != cpu_to_le32(MD_SB_MAGIC) ||
 	    sb->major_version != cpu_to_le32(1) ||
 	    le32_to_cpu(sb->max_dev) > (4096-256)/2 ||
-	    le64_to_cpu(sb->super_offset) != (rdev->sb_offset<<1) ||
+	    le64_to_cpu(sb->super_offset) != rdev->sb_start ||
 	    (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0)
 		return -EINVAL;
 
@@ -1113,7 +1123,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
 	rdev->sb_size = (rdev->sb_size | bmask) + 1;
 
 	if (minor_version
-	    && rdev->data_offset < sb_offset + (rdev->sb_size/512))
+	    && rdev->data_offset < sb_start + (rdev->sb_size/512))
 		return -EINVAL;
 
 	if (sb->level == cpu_to_le32(LEVEL_MULTIPATH))
@@ -1149,7 +1159,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
 	if (minor_version)
 		rdev->size = ((rdev->bdev->bd_inode->i_size>>9) - le64_to_cpu(sb->data_offset)) / 2;
 	else
-		rdev->size = rdev->sb_offset;
+		rdev->size = rdev->sb_start / 2;
 	if (rdev->size < le64_to_cpu(sb->data_size)/2)
 		return -EINVAL;
 	rdev->size = le64_to_cpu(sb->data_size)/2;
@@ -1328,35 +1338,74 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
 	sb->sb_csum = calc_sb_1_csum(sb);
 }
 
+static unsigned long long
+super_1_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
+{
+	struct mdp_superblock_1 *sb;
+	sector_t max_sectors;
+	if (num_sectors && num_sectors < rdev->mddev->size * 2)
+		return 0; /* component must fit device */
+	if (rdev->sb_start < rdev->data_offset) {
+		/* minor versions 1 and 2; superblock before data */
+		max_sectors = rdev->bdev->bd_inode->i_size >> 9;
+		max_sectors -= rdev->data_offset;
+		if (!num_sectors || num_sectors > max_sectors)
+			num_sectors = max_sectors;
+	} else if (rdev->mddev->bitmap_offset) {
+		/* minor version 0 with bitmap we can't move */
+		return 0;
+	} else {
+		/* minor version 0; superblock after data */
+		sector_t sb_start;
+		sb_start = (rdev->bdev->bd_inode->i_size >> 9) - 8*2;
+		sb_start &= ~(sector_t)(4*2 - 1);
+		max_sectors = rdev->size * 2 + sb_start - rdev->sb_start;
+		if (!num_sectors || num_sectors > max_sectors)
+			num_sectors = max_sectors;
+		rdev->sb_start = sb_start;
+	}
+	sb = (struct mdp_superblock_1 *) page_address(rdev->sb_page);
+	sb->data_size = cpu_to_le64(num_sectors);
+	sb->super_offset = rdev->sb_start;
+	sb->sb_csum = calc_sb_1_csum(sb);
+	md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
+		       rdev->sb_page);
+	md_super_wait(rdev->mddev);
+	return num_sectors / 2; /* kB for sysfs */
+}
 
 static struct super_type super_types[] = {
 	[0] = {
 		.name	= "0.90.0",
 		.owner	= THIS_MODULE,
 		.load_super	    = super_90_load,
 		.validate_super	    = super_90_validate,
 		.sync_super	    = super_90_sync,
+		.rdev_size_change   = super_90_rdev_size_change,
 	},
 	[1] = {
 		.name	= "md-1",
 		.owner	= THIS_MODULE,
 		.load_super	    = super_1_load,
 		.validate_super	    = super_1_validate,
 		.sync_super	    = super_1_sync,
+		.rdev_size_change   = super_1_rdev_size_change,
 	},
 };
 
 static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
 {
-	struct list_head *tmp, *tmp2;
 	mdk_rdev_t *rdev, *rdev2;
 
-	rdev_for_each(rdev, tmp, mddev1)
-		rdev_for_each(rdev2, tmp2, mddev2)
+	rcu_read_lock();
+	rdev_for_each_rcu(rdev, mddev1)
+		rdev_for_each_rcu(rdev2, mddev2)
 			if (rdev->bdev->bd_contains ==
-			    rdev2->bdev->bd_contains)
+			    rdev2->bdev->bd_contains) {
+				rcu_read_unlock();
 				return 1;
-
+			}
+	rcu_read_unlock();
 	return 0;
 }
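This hunk, like write_sb_page() earlier, replaces the plain rdev_for_each() walk with rdev_for_each_rcu() so the disks list can be traversed without holding a lock; that is also why unbind_rdev_from_array() below gains a synchronize_rcu() before the rdev is handed to the delayed-delete worker. The general shape of such a reader, sketched with the generic RCU list API (a sketch of the pattern, not a copy of the patch):

    /* RCU-protected list walk in the style this patch adopts.  The rules
     * illustrated: no sleeping between rcu_read_lock()/unlock(), and the
     * writer must defer freeing via synchronize_rcu(). */
    rcu_read_lock();
    list_for_each_entry_rcu(rdev, &mddev->disks, same_set) {
        if (test_bit(Faulty, &rdev->flags))
            continue;
        /* ... read-only use of rdev; must not sleep ... */
    }
    rcu_read_unlock();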
@@ -1423,7 +1472,7 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
 		kobject_del(&rdev->kobj);
 		goto fail;
 	}
-	list_add(&rdev->same_set, &mddev->disks);
+	list_add_rcu(&rdev->same_set, &mddev->disks);
 	bd_claim_by_disk(rdev->bdev, rdev->bdev->bd_holder, mddev->gendisk);
 	return 0;
 
@@ -1448,14 +1497,16 @@ static void unbind_rdev_from_array(mdk_rdev_t * rdev)
 		return;
 	}
 	bd_release_from_disk(rdev->bdev, rdev->mddev->gendisk);
-	list_del_init(&rdev->same_set);
+	list_del_rcu(&rdev->same_set);
 	printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b));
 	rdev->mddev = NULL;
 	sysfs_remove_link(&rdev->kobj, "block");
 
 	/* We need to delay this, otherwise we can deadlock when
-	 * writing to 'remove' to "dev/state"
+	 * writing to 'remove' to "dev/state". We also need
+	 * to delay it due to rcu usage.
 	 */
+	synchronize_rcu();
 	INIT_WORK(&rdev->del_work, md_delayed_delete);
 	kobject_get(&rdev->kobj);
 	schedule_work(&rdev->del_work);
@@ -1511,7 +1562,6 @@ static void export_rdev(mdk_rdev_t * rdev)
 	if (rdev->mddev)
 		MD_BUG();
 	free_disk_sb(rdev);
-	list_del_init(&rdev->same_set);
 #ifndef MODULE
 	if (test_bit(AutoDetected, &rdev->flags))
 		md_autodetect_dev(rdev->bdev->bd_dev);
@@ -1758,11 +1808,11 @@ repeat:
 		dprintk("%s ", bdevname(rdev->bdev,b));
 		if (!test_bit(Faulty, &rdev->flags)) {
 			md_super_write(mddev,rdev,
-				       rdev->sb_offset<<1, rdev->sb_size,
+				       rdev->sb_start, rdev->sb_size,
 				       rdev->sb_page);
 			dprintk(KERN_INFO "(write) %s's sb offset: %llu\n",
 				bdevname(rdev->bdev,b),
-				(unsigned long long)rdev->sb_offset);
+				(unsigned long long)rdev->sb_start);
 			rdev->sb_events = mddev->events;
 
 		} else
@@ -1787,7 +1837,7 @@ repeat:
 
 }
 
-/* words written to sysfs files may, or my not, be \n terminated.
+/* words written to sysfs files may, or may not, be \n terminated.
  * We want to accept with case. For this we use cmd_match.
  */
 static int cmd_match(const char *cmd, const char *str)
@@ -1886,6 +1936,8 @@ state_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 
 		err = 0;
 	}
+	if (!err)
+		sysfs_notify(&rdev->kobj, NULL, "state");
 	return err ? err : len;
 }
 static struct rdev_sysfs_entry rdev_state =
@@ -1931,7 +1983,7 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 		slot = -1;
 	else if (e==buf || (*e && *e!= '\n'))
 		return -EINVAL;
-	if (rdev->mddev->pers) {
+	if (rdev->mddev->pers && slot == -1) {
 		/* Setting 'slot' on an active array requires also
 		 * updating the 'rd%d' link, and communicating
 		 * with the personality with ->hot_*_disk.
@@ -1939,8 +1991,6 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 		 * failed/spare devices.  This normally happens automatically,
 		 * but not when the metadata is externally managed.
 		 */
-		if (slot != -1)
-			return -EBUSY;
 		if (rdev->raid_disk == -1)
 			return -EEXIST;
 		/* personality does all needed checks */
@@ -1954,6 +2004,43 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 		sysfs_remove_link(&rdev->mddev->kobj, nm);
 		set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
 		md_wakeup_thread(rdev->mddev->thread);
+	} else if (rdev->mddev->pers) {
+		mdk_rdev_t *rdev2;
+		struct list_head *tmp;
+		/* Activating a spare .. or possibly reactivating
+		 * if we ever get bitmaps working here.
+		 */
+
+		if (rdev->raid_disk != -1)
+			return -EBUSY;
+
+		if (rdev->mddev->pers->hot_add_disk == NULL)
+			return -EINVAL;
+
+		rdev_for_each(rdev2, tmp, rdev->mddev)
+			if (rdev2->raid_disk == slot)
+				return -EEXIST;
+
+		rdev->raid_disk = slot;
+		if (test_bit(In_sync, &rdev->flags))
+			rdev->saved_raid_disk = slot;
+		else
+			rdev->saved_raid_disk = -1;
+		err = rdev->mddev->pers->
+			hot_add_disk(rdev->mddev, rdev);
+		if (err) {
+			rdev->raid_disk = -1;
+			return err;
+		} else
+			sysfs_notify(&rdev->kobj, NULL, "state");
+		sprintf(nm, "rd%d", rdev->raid_disk);
+		if (sysfs_create_link(&rdev->mddev->kobj, &rdev->kobj, nm))
+			printk(KERN_WARNING
+			       "md: cannot register "
+			       "%s for %s\n",
+			       nm, mdname(rdev->mddev));
+
+		/* don't wakeup anyone, leave that to userspace. */
 	} else {
 		if (slot >= rdev->mddev->raid_disks)
 			return -ENOSPC;
@@ -1962,6 +2049,7 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 		clear_bit(Faulty, &rdev->flags);
 		clear_bit(WriteMostly, &rdev->flags);
 		set_bit(In_sync, &rdev->flags);
+		sysfs_notify(&rdev->kobj, NULL, "state");
 	}
 	return len;
 }
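With the unconditional -EBUSY gone, an external metadata handler can activate a spare on a running array by writing a slot number: the kernel calls the personality's hot_add_disk() and creates the rd%d link, but deliberately wakes no thread, leaving the recovery trigger to userspace. A sketch of that write (paths assumed for illustration):

    /* Place the spare dev-sdc into slot 2 of a running array (sketch). */
    #include <fcntl.h>
    #include <string.h>
    #include <unistd.h>

    static int write_attr(const char *path, const char *val)
    {
        int fd = open(path, O_WRONLY);
        ssize_t n;
        if (fd < 0)
            return -1;
        n = write(fd, val, strlen(val));
        close(fd);
        return n == (ssize_t)strlen(val) ? 0 : -1;
    }

    int main(void)
    {
        return write_attr("/sys/block/md0/md/dev-sdc/slot", "2");
    }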
@@ -1983,7 +2071,7 @@ offset_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 	unsigned long long offset = simple_strtoull(buf, &e, 10);
 	if (e==buf || (*e && *e != '\n'))
 		return -EINVAL;
-	if (rdev->mddev->pers)
+	if (rdev->mddev->pers && rdev->raid_disk >= 0)
 		return -EBUSY;
 	if (rdev->size && rdev->mddev->external)
 		/* Must set offset before size, so overlap checks
@@ -2015,17 +2103,30 @@ static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2)
 static ssize_t
 rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 {
-	char *e;
-	unsigned long long size = simple_strtoull(buf, &e, 10);
+	unsigned long long size;
 	unsigned long long oldsize = rdev->size;
 	mddev_t *my_mddev = rdev->mddev;
 
-	if (e==buf || (*e && *e != '\n'))
+	if (strict_strtoull(buf, 10, &size) < 0)
 		return -EINVAL;
-	if (my_mddev->pers)
-		return -EBUSY;
+	if (size < my_mddev->size)
+		return -EINVAL;
+	if (my_mddev->pers && rdev->raid_disk >= 0) {
+		if (my_mddev->persistent) {
+			size = super_types[my_mddev->major_version].
+				rdev_size_change(rdev, size * 2);
+			if (!size)
+				return -EBUSY;
+		} else if (!size) {
+			size = (rdev->bdev->bd_inode->i_size >> 10);
+			size -= rdev->data_offset/2;
+		}
+		if (size < my_mddev->size)
+			return -EINVAL; /* component must fit device */
+	}
+
 	rdev->size = size;
-	if (size > oldsize && rdev->mddev->external) {
+	if (size > oldsize && my_mddev->external) {
 		/* need to check that all other rdevs with the same ->bdev
 		 * do not overlap.  We need to unlock the mddev to avoid
 		 * a deadlock.  We have already changed rdev->size, and if
@@ -2044,8 +2145,9 @@ rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 			if (test_bit(AllReserved, &rdev2->flags) ||
 			    (rdev->bdev == rdev2->bdev &&
 			     rdev != rdev2 &&
-			     overlaps(rdev->data_offset, rdev->size,
-				      rdev2->data_offset, rdev2->size))) {
+			     overlaps(rdev->data_offset, rdev->size * 2,
+				      rdev2->data_offset,
+				      rdev2->size * 2))) {
 				overlap = 1;
 				break;
 			}
@@ -2067,8 +2169,6 @@ rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 			return -EBUSY;
 		}
 	}
-	if (size < my_mddev->size || my_mddev->size == 0)
-		my_mddev->size = size;
 	return len;
 }
 
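The overlap test now receives lengths in sectors (rdev->size is in KB, hence the "* 2"). overlaps(s1, l1, s2, l2) is true exactly when the half-open intervals [s1, s1+l1) and [s2, s2+l2) intersect, i.e. each start lies before the other end. A standalone illustration with assumed numbers:

    /* Interval-overlap check as used by rdev_size_store(); units: sectors. */
    #include <stdio.h>

    typedef unsigned long long sector_t;

    static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2)
    {
        /* [s1,s1+l1) and [s2,s2+l2) intersect iff each starts
         * before the other ends */
        return s1 < s2 + l2 && s2 < s1 + l1;
    }

    int main(void)
    {
        printf("%d\n", overlaps(0, 4096, 4096, 4096)); /* adjacent: 0 */
        printf("%d\n", overlaps(0, 4096, 4000, 4096)); /* overlap:  1 */
        return 0;
    }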
@@ -2512,7 +2612,7 @@ __ATTR(resync_start, S_IRUGO|S_IWUSR, resync_start_show, resync_start_store);
  *     When written, doesn't tear down array, but just stops it
  * suspended (not supported yet)
  *     All IO requests will block. The array can be reconfigured.
- *     Writing this, if accepted, will block until array is quiessent
+ *     Writing this, if accepted, will block until array is quiescent
  * readonly
  *     no resync can happen.  no superblocks get written.
  *     write requests fail
@@ -2585,7 +2685,7 @@ array_state_show(mddev_t *mddev, char *page)
 	return sprintf(page, "%s\n", array_states[st]);
 }
 
-static int do_md_stop(mddev_t * mddev, int ro);
+static int do_md_stop(mddev_t * mddev, int ro, int is_open);
 static int do_md_run(mddev_t * mddev);
 static int restart_array(mddev_t *mddev);
 
@@ -2599,16 +2699,16 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len)
 		break;
 	case clear:
 		/* stopping an active array */
-		if (atomic_read(&mddev->active) > 1)
+		if (atomic_read(&mddev->openers) > 0)
 			return -EBUSY;
-		err = do_md_stop(mddev, 0);
+		err = do_md_stop(mddev, 0, 0);
 		break;
 	case inactive:
 		/* stopping an active array */
 		if (mddev->pers) {
-			if (atomic_read(&mddev->active) > 1)
+			if (atomic_read(&mddev->openers) > 0)
 				return -EBUSY;
-			err = do_md_stop(mddev, 2);
+			err = do_md_stop(mddev, 2, 0);
 		} else
 			err = 0; /* already inactive */
 		break;
@@ -2616,7 +2716,7 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len)
 		break; /* not supported yet */
 	case readonly:
 		if (mddev->pers)
-			err = do_md_stop(mddev, 1);
+			err = do_md_stop(mddev, 1, 0);
 		else {
 			mddev->ro = 1;
 			set_disk_ro(mddev->gendisk, 1);
@@ -2626,7 +2726,7 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len)
 	case read_auto:
 		if (mddev->pers) {
 			if (mddev->ro != 1)
-				err = do_md_stop(mddev, 1);
+				err = do_md_stop(mddev, 1, 0);
 			else
 				err = restart_array(mddev);
 			if (err == 0) {
@@ -2681,8 +2781,10 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len)
 	}
 	if (err)
 		return err;
-	else
+	else {
+		sysfs_notify(&mddev->kobj, NULL, "array_state");
 		return len;
+	}
 }
 static struct md_sysfs_entry md_array_state =
 __ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store);
@@ -2785,7 +2887,7 @@ size_show(mddev_t *mddev, char *page)
 	return sprintf(page, "%llu\n", (unsigned long long)mddev->size);
 }
 
-static int update_size(mddev_t *mddev, unsigned long size);
+static int update_size(mddev_t *mddev, sector_t num_sectors);
 
 static ssize_t
 size_store(mddev_t *mddev, const char *buf, size_t len)
@@ -2802,7 +2904,7 @@ size_store(mddev_t *mddev, const char *buf, size_t len)
 		return -EINVAL;
 
 	if (mddev->pers) {
-		err = update_size(mddev, size);
+		err = update_size(mddev, size * 2);
 		md_update_sb(mddev, 1);
 	} else {
 		if (mddev->size == 0 ||
@@ -2899,7 +3001,7 @@ action_show(mddev_t *mddev, char *page)
 			type = "check";
 		else
 			type = "repair";
-	} else
+	} else if (test_bit(MD_RECOVERY_RECOVER, &mddev->recovery))
 		type = "recover";
 	}
 	return sprintf(page, "%s\n", type);
@@ -2921,15 +3023,19 @@ action_store(mddev_t *mddev, const char *page, size_t len)
 	} else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
 		   test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
 		return -EBUSY;
-	else if (cmd_match(page, "resync") || cmd_match(page, "recover"))
+	else if (cmd_match(page, "resync"))
+		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+	else if (cmd_match(page, "recover")) {
+		set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
 		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
-	else if (cmd_match(page, "reshape")) {
+	} else if (cmd_match(page, "reshape")) {
 		int err;
 		if (mddev->pers->start_reshape == NULL)
 			return -EINVAL;
 		err = mddev->pers->start_reshape(mddev);
 		if (err)
 			return err;
+		sysfs_notify(&mddev->kobj, NULL, "degraded");
 	} else {
 		if (cmd_match(page, "check"))
 			set_bit(MD_RECOVERY_CHECK, &mddev->recovery);
@@ -2940,6 +3046,7 @@ action_store(mddev_t *mddev, const char *page, size_t len)
 	}
 	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 	md_wakeup_thread(mddev->thread);
+	sysfs_notify(&mddev->kobj, NULL, "sync_action");
 	return len;
 }
 
@@ -3049,11 +3156,11 @@ static ssize_t
 sync_speed_show(mddev_t *mddev, char *page)
 {
 	unsigned long resync, dt, db;
-	resync = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active));
-	dt = ((jiffies - mddev->resync_mark) / HZ);
+	resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active);
+	dt = (jiffies - mddev->resync_mark) / HZ;
 	if (!dt) dt++;
-	db = resync - (mddev->resync_mark_cnt);
-	return sprintf(page, "%ld\n", db/dt/2); /* K/sec */
+	db = resync - mddev->resync_mark_cnt;
+	return sprintf(page, "%lu\n", db/dt/2); /* K/sec */
 }
 
 static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed);
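sync_speed is measured over the window since the last rate mark: db is the number of sectors completed in dt seconds, and the final /2 converts sectors to KB. A worked example with assumed mark values:

    /* Re-run the sync_speed_show() arithmetic with sample numbers. */
    #include <stdio.h>

    int main(void)
    {
        unsigned long curr_mark_cnt = 2097152; /* sectors done now (assumed) */
        unsigned long resync_mark_cnt = 49152; /* sectors done at last mark */
        unsigned long recovery_active = 1024;  /* sectors still in flight */
        unsigned long dt = 10;                 /* seconds since last mark */

        unsigned long resync = curr_mark_cnt - recovery_active;
        unsigned long db = resync - resync_mark_cnt;
        /* (2097152 - 1024 - 49152) sectors / 10 s / 2 -> 102348 (integer math) */
        printf("%lu K/sec\n", db / dt / 2);
        return 0;
    }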
| @@ -3075,6 +3182,36 @@ sync_completed_show(mddev_t *mddev, char *page) | |||
| 3075 | static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed); | 3182 | static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed); |
| 3076 | 3183 | ||
| 3077 | static ssize_t | 3184 | static ssize_t |
| 3185 | min_sync_show(mddev_t *mddev, char *page) | ||
| 3186 | { | ||
| 3187 | return sprintf(page, "%llu\n", | ||
| 3188 | (unsigned long long)mddev->resync_min); | ||
| 3189 | } | ||
| 3190 | static ssize_t | ||
| 3191 | min_sync_store(mddev_t *mddev, const char *buf, size_t len) | ||
| 3192 | { | ||
| 3193 | unsigned long long min; | ||
| 3194 | if (strict_strtoull(buf, 10, &min)) | ||
| 3195 | return -EINVAL; | ||
| 3196 | if (min > mddev->resync_max) | ||
| 3197 | return -EINVAL; | ||
| 3198 | if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) | ||
| 3199 | return -EBUSY; | ||
| 3200 | |||
| 3201 | /* Must be a multiple of chunk_size */ | ||
| 3202 | if (mddev->chunk_size) { | ||
| 3203 | if (min & (sector_t)((mddev->chunk_size>>9)-1)) | ||
| 3204 | return -EINVAL; | ||
| 3205 | } | ||
| 3206 | mddev->resync_min = min; | ||
| 3207 | |||
| 3208 | return len; | ||
| 3209 | } | ||
| 3210 | |||
| 3211 | static struct md_sysfs_entry md_min_sync = | ||
| 3212 | __ATTR(sync_min, S_IRUGO|S_IWUSR, min_sync_show, min_sync_store); | ||
| 3213 | |||
| 3214 | static ssize_t | ||
| 3078 | max_sync_show(mddev_t *mddev, char *page) | 3215 | max_sync_show(mddev_t *mddev, char *page) |
| 3079 | { | 3216 | { |
| 3080 | if (mddev->resync_max == MaxSector) | 3217 | if (mddev->resync_max == MaxSector) |
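The new sync_min handler rejects values that are not a multiple of the chunk size using the usual power-of-two trick: chunk_size is in bytes, chunk_size>>9 is sectors, and masking with (sectors - 1) isolates the remainder. The same test as a stand-alone sketch (illustrative, not kernel API):

    /* For a power-of-two chunk size, x & (chunk_sectors - 1) is
     * non-zero exactly when x is not a multiple of chunk_sectors.
     */
    static int is_chunk_aligned(unsigned long long sectors,
                                unsigned int chunk_size)
    {
            unsigned long long chunk_sectors = chunk_size >> 9;

            return (sectors & (chunk_sectors - 1)) == 0;
    }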
| @@ -3089,9 +3226,10 @@ max_sync_store(mddev_t *mddev, const char *buf, size_t len) | |||
| 3089 | if (strncmp(buf, "max", 3) == 0) | 3226 | if (strncmp(buf, "max", 3) == 0) |
| 3090 | mddev->resync_max = MaxSector; | 3227 | mddev->resync_max = MaxSector; |
| 3091 | else { | 3228 | else { |
| 3092 | char *ep; | 3229 | unsigned long long max; |
| 3093 | unsigned long long max = simple_strtoull(buf, &ep, 10); | 3230 | if (strict_strtoull(buf, 10, &max)) |
| 3094 | if (ep == buf || (*ep != 0 && *ep != '\n')) | 3231 | return -EINVAL; |
| 3232 | if (max < mddev->resync_min) | ||
| 3095 | return -EINVAL; | 3233 | return -EINVAL; |
| 3096 | if (max < mddev->resync_max && | 3234 | if (max < mddev->resync_max && |
| 3097 | test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) | 3235 | test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) |
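Both parsers now use strict_strtoull(), which fails outright on empty input or trailing junk, instead of simple_strtoull() plus hand-rolled end-pointer checks. An illustrative userspace stand-in for the contract it enforces (not the kernel implementation):

    #include <errno.h>
    #include <stdlib.h>

    /* Succeed only if the whole buffer, allowing one trailing
     * newline, is a decimal number.
     */
    static int strict_parse_ull(const char *s, unsigned long long *res)
    {
            char *end;

            errno = 0;
            *res = strtoull(s, &end, 10);
            if (errno || end == s)
                    return -EINVAL;         /* overflow or no digits */
            if (*end == '\n')
                    end++;
            if (*end != '\0')
                    return -EINVAL;         /* trailing junk like "12x" */
            return 0;
    }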
| @@ -3222,6 +3360,7 @@ static struct attribute *md_redundancy_attrs[] = { | |||
| 3222 | &md_sync_speed.attr, | 3360 | &md_sync_speed.attr, |
| 3223 | &md_sync_force_parallel.attr, | 3361 | &md_sync_force_parallel.attr, |
| 3224 | &md_sync_completed.attr, | 3362 | &md_sync_completed.attr, |
| 3363 | &md_min_sync.attr, | ||
| 3225 | &md_max_sync.attr, | 3364 | &md_max_sync.attr, |
| 3226 | &md_suspend_lo.attr, | 3365 | &md_suspend_lo.attr, |
| 3227 | &md_suspend_hi.attr, | 3366 | &md_suspend_hi.attr, |
| @@ -3326,9 +3465,9 @@ static struct kobject *md_probe(dev_t dev, int *part, void *data) | |||
| 3326 | disk->queue = mddev->queue; | 3465 | disk->queue = mddev->queue; |
| 3327 | add_disk(disk); | 3466 | add_disk(disk); |
| 3328 | mddev->gendisk = disk; | 3467 | mddev->gendisk = disk; |
| 3329 | mutex_unlock(&disks_mutex); | ||
| 3330 | error = kobject_init_and_add(&mddev->kobj, &md_ktype, &disk->dev.kobj, | 3468 | error = kobject_init_and_add(&mddev->kobj, &md_ktype, &disk->dev.kobj, |
| 3331 | "%s", "md"); | 3469 | "%s", "md"); |
| 3470 | mutex_unlock(&disks_mutex); | ||
| 3332 | if (error) | 3471 | if (error) |
| 3333 | printk(KERN_WARNING "md: cannot register %s/md - name in use\n", | 3472 | printk(KERN_WARNING "md: cannot register %s/md - name in use\n", |
| 3334 | disk->disk_name); | 3473 | disk->disk_name); |
| @@ -3341,7 +3480,11 @@ static void md_safemode_timeout(unsigned long data) | |||
| 3341 | { | 3480 | { |
| 3342 | mddev_t *mddev = (mddev_t *) data; | 3481 | mddev_t *mddev = (mddev_t *) data; |
| 3343 | 3482 | ||
| 3344 | mddev->safemode = 1; | 3483 | if (!atomic_read(&mddev->writes_pending)) { |
| 3484 | mddev->safemode = 1; | ||
| 3485 | if (mddev->external) | ||
| 3486 | sysfs_notify(&mddev->kobj, NULL, "array_state"); | ||
| 3487 | } | ||
| 3345 | md_wakeup_thread(mddev->thread); | 3488 | md_wakeup_thread(mddev->thread); |
| 3346 | } | 3489 | } |
| 3347 | 3490 | ||
| @@ -3432,22 +3575,23 @@ static int do_md_run(mddev_t * mddev) | |||
| 3432 | * We don't want the data to overlap the metadata. | 3575 | * We don't want the data to overlap the metadata. |
| 3433 | * Internal bitmap issues are handled elsewhere. | 3576 | * Internal bitmap issues are handled elsewhere. |
| 3434 | */ | 3577 | */ |
| 3435 | if (rdev->data_offset < rdev->sb_offset) { | 3578 | if (rdev->data_offset < rdev->sb_start) { |
| 3436 | if (mddev->size && | 3579 | if (mddev->size && |
| 3437 | rdev->data_offset + mddev->size*2 | 3580 | rdev->data_offset + mddev->size*2 |
| 3438 | > rdev->sb_offset*2) { | 3581 | > rdev->sb_start) { |
| 3439 | printk("md: %s: data overlaps metadata\n", | 3582 | printk("md: %s: data overlaps metadata\n", |
| 3440 | mdname(mddev)); | 3583 | mdname(mddev)); |
| 3441 | return -EINVAL; | 3584 | return -EINVAL; |
| 3442 | } | 3585 | } |
| 3443 | } else { | 3586 | } else { |
| 3444 | if (rdev->sb_offset*2 + rdev->sb_size/512 | 3587 | if (rdev->sb_start + rdev->sb_size/512 |
| 3445 | > rdev->data_offset) { | 3588 | > rdev->data_offset) { |
| 3446 | printk("md: %s: metadata overlaps data\n", | 3589 | printk("md: %s: metadata overlaps data\n", |
| 3447 | mdname(mddev)); | 3590 | mdname(mddev)); |
| 3448 | return -EINVAL; | 3591 | return -EINVAL; |
| 3449 | } | 3592 | } |
| 3450 | } | 3593 | } |
| 3594 | sysfs_notify(&rdev->kobj, NULL, "state"); | ||
| 3451 | } | 3595 | } |
| 3452 | 3596 | ||
| 3453 | md_probe(mddev->unit, NULL, NULL); | 3597 | md_probe(mddev->unit, NULL, NULL); |
| @@ -3519,7 +3663,9 @@ static int do_md_run(mddev_t * mddev) | |||
| 3519 | mddev->ro = 2; /* read-only, but switch on first write */ | 3663 | mddev->ro = 2; /* read-only, but switch on first write */ |
| 3520 | 3664 | ||
| 3521 | err = mddev->pers->run(mddev); | 3665 | err = mddev->pers->run(mddev); |
| 3522 | if (!err && mddev->pers->sync_request) { | 3666 | if (err) |
| 3667 | printk(KERN_ERR "md: pers->run() failed ...\n"); | ||
| 3668 | else if (mddev->pers->sync_request) { | ||
| 3523 | err = bitmap_create(mddev); | 3669 | err = bitmap_create(mddev); |
| 3524 | if (err) { | 3670 | if (err) { |
| 3525 | printk(KERN_ERR "%s: failed to create bitmap (%d)\n", | 3671 | printk(KERN_ERR "%s: failed to create bitmap (%d)\n", |
| @@ -3528,7 +3674,6 @@ static int do_md_run(mddev_t * mddev) | |||
| 3528 | } | 3674 | } |
| 3529 | } | 3675 | } |
| 3530 | if (err) { | 3676 | if (err) { |
| 3531 | printk(KERN_ERR "md: pers->run() failed ...\n"); | ||
| 3532 | module_put(mddev->pers->owner); | 3677 | module_put(mddev->pers->owner); |
| 3533 | mddev->pers = NULL; | 3678 | mddev->pers = NULL; |
| 3534 | bitmap_destroy(mddev); | 3679 | bitmap_destroy(mddev); |
| @@ -3563,7 +3708,7 @@ static int do_md_run(mddev_t * mddev) | |||
| 3563 | if (mddev->flags) | 3708 | if (mddev->flags) |
| 3564 | md_update_sb(mddev, 0); | 3709 | md_update_sb(mddev, 0); |
| 3565 | 3710 | ||
| 3566 | set_capacity(disk, mddev->array_size<<1); | 3711 | set_capacity(disk, mddev->array_sectors); |
| 3567 | 3712 | ||
| 3568 | /* If we call blk_queue_make_request here, it will | 3713 | /* If we call blk_queue_make_request here, it will |
| 3569 | * re-initialise max_sectors etc which may have been | 3714 | * re-initialise max_sectors etc which may have been |
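array_size (KiB) becomes array_sectors (512-byte sectors) throughout this patch, so set_capacity(), which takes sectors, no longer needs the <<1 shift. The conversions as two illustrative helpers (a sketch, not kernel API):

    /* 1 sector = 512 bytes, so 1 KiB = 2 sectors. */
    static inline unsigned long long kib_to_sectors(unsigned long long kib)
    {
            return kib * 2;         /* old array_size -> new array_sectors */
    }

    static inline unsigned long long sectors_to_kib(unsigned long long sectors)
    {
            return sectors / 2;     /* e.g. the "%llu blocks" output below */
    }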
| @@ -3608,6 +3753,9 @@ static int do_md_run(mddev_t * mddev) | |||
| 3608 | 3753 | ||
| 3609 | mddev->changed = 1; | 3754 | mddev->changed = 1; |
| 3610 | md_new_event(mddev); | 3755 | md_new_event(mddev); |
| 3756 | sysfs_notify(&mddev->kobj, NULL, "array_state"); | ||
| 3757 | sysfs_notify(&mddev->kobj, NULL, "sync_action"); | ||
| 3758 | sysfs_notify(&mddev->kobj, NULL, "degraded"); | ||
| 3611 | kobject_uevent(&mddev->gendisk->dev.kobj, KOBJ_CHANGE); | 3759 | kobject_uevent(&mddev->gendisk->dev.kobj, KOBJ_CHANGE); |
| 3612 | return 0; | 3760 | return 0; |
| 3613 | } | 3761 | } |
| @@ -3615,38 +3763,25 @@ static int do_md_run(mddev_t * mddev) | |||
| 3615 | static int restart_array(mddev_t *mddev) | 3763 | static int restart_array(mddev_t *mddev) |
| 3616 | { | 3764 | { |
| 3617 | struct gendisk *disk = mddev->gendisk; | 3765 | struct gendisk *disk = mddev->gendisk; |
| 3618 | int err; | ||
| 3619 | 3766 | ||
| 3620 | /* | 3767 | /* Complain if it has no devices */ |
| 3621 | * Complain if it has no devices | ||
| 3622 | */ | ||
| 3623 | err = -ENXIO; | ||
| 3624 | if (list_empty(&mddev->disks)) | 3768 | if (list_empty(&mddev->disks)) |
| 3625 | goto out; | 3769 | return -ENXIO; |
| 3626 | 3770 | if (!mddev->pers) | |
| 3627 | if (mddev->pers) { | 3771 | return -EINVAL; |
| 3628 | err = -EBUSY; | 3772 | if (!mddev->ro) |
| 3629 | if (!mddev->ro) | 3773 | return -EBUSY; |
| 3630 | goto out; | 3774 | mddev->safemode = 0; |
| 3631 | 3775 | mddev->ro = 0; | |
| 3632 | mddev->safemode = 0; | 3776 | set_disk_ro(disk, 0); |
| 3633 | mddev->ro = 0; | 3777 | printk(KERN_INFO "md: %s switched to read-write mode.\n", |
| 3634 | set_disk_ro(disk, 0); | 3778 | mdname(mddev)); |
| 3635 | 3779 | /* Kick recovery or resync if necessary */ | |
| 3636 | printk(KERN_INFO "md: %s switched to read-write mode.\n", | 3780 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); |
| 3637 | mdname(mddev)); | 3781 | md_wakeup_thread(mddev->thread); |
| 3638 | /* | 3782 | md_wakeup_thread(mddev->sync_thread); |
| 3639 | * Kick recovery or resync if necessary | 3783 | sysfs_notify(&mddev->kobj, NULL, "array_state"); |
| 3640 | */ | 3784 | return 0; |
| 3641 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | ||
| 3642 | md_wakeup_thread(mddev->thread); | ||
| 3643 | md_wakeup_thread(mddev->sync_thread); | ||
| 3644 | err = 0; | ||
| 3645 | } else | ||
| 3646 | err = -EINVAL; | ||
| 3647 | |||
| 3648 | out: | ||
| 3649 | return err; | ||
| 3650 | } | 3785 | } |
| 3651 | 3786 | ||
| 3652 | /* similar to deny_write_access, but accounts for our holding a reference | 3787 | /* similar to deny_write_access, but accounts for our holding a reference |
| @@ -3680,16 +3815,17 @@ static void restore_bitmap_write_access(struct file *file) | |||
| 3680 | * 1 - switch to readonly | 3815 | * 1 - switch to readonly |
| 3681 | * 2 - stop but do not disassemble array | 3816 | * 2 - stop but do not disassemble array |
| 3682 | */ | 3817 | */ |
| 3683 | static int do_md_stop(mddev_t * mddev, int mode) | 3818 | static int do_md_stop(mddev_t * mddev, int mode, int is_open) |
| 3684 | { | 3819 | { |
| 3685 | int err = 0; | 3820 | int err = 0; |
| 3686 | struct gendisk *disk = mddev->gendisk; | 3821 | struct gendisk *disk = mddev->gendisk; |
| 3687 | 3822 | ||
| 3823 | if (atomic_read(&mddev->openers) > is_open) { | ||
| 3824 | printk("md: %s still in use.\n",mdname(mddev)); | ||
| 3825 | return -EBUSY; | ||
| 3826 | } | ||
| 3827 | |||
| 3688 | if (mddev->pers) { | 3828 | if (mddev->pers) { |
| 3689 | if (atomic_read(&mddev->active)>2) { | ||
| 3690 | printk("md: %s still in use.\n",mdname(mddev)); | ||
| 3691 | return -EBUSY; | ||
| 3692 | } | ||
| 3693 | 3829 | ||
| 3694 | if (mddev->sync_thread) { | 3830 | if (mddev->sync_thread) { |
| 3695 | set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); | 3831 | set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); |
| @@ -3773,10 +3909,11 @@ static int do_md_stop(mddev_t * mddev, int mode) | |||
| 3773 | 3909 | ||
| 3774 | export_array(mddev); | 3910 | export_array(mddev); |
| 3775 | 3911 | ||
| 3776 | mddev->array_size = 0; | 3912 | mddev->array_sectors = 0; |
| 3777 | mddev->size = 0; | 3913 | mddev->size = 0; |
| 3778 | mddev->raid_disks = 0; | 3914 | mddev->raid_disks = 0; |
| 3779 | mddev->recovery_cp = 0; | 3915 | mddev->recovery_cp = 0; |
| 3916 | mddev->resync_min = 0; | ||
| 3780 | mddev->resync_max = MaxSector; | 3917 | mddev->resync_max = MaxSector; |
| 3781 | mddev->reshape_position = MaxSector; | 3918 | mddev->reshape_position = MaxSector; |
| 3782 | mddev->external = 0; | 3919 | mddev->external = 0; |
| @@ -3811,6 +3948,7 @@ static int do_md_stop(mddev_t * mddev, int mode) | |||
| 3811 | mdname(mddev)); | 3948 | mdname(mddev)); |
| 3812 | err = 0; | 3949 | err = 0; |
| 3813 | md_new_event(mddev); | 3950 | md_new_event(mddev); |
| 3951 | sysfs_notify(&mddev->kobj, NULL, "array_state"); | ||
| 3814 | out: | 3952 | out: |
| 3815 | return err; | 3953 | return err; |
| 3816 | } | 3954 | } |
| @@ -3836,7 +3974,7 @@ static void autorun_array(mddev_t *mddev) | |||
| 3836 | err = do_md_run (mddev); | 3974 | err = do_md_run (mddev); |
| 3837 | if (err) { | 3975 | if (err) { |
| 3838 | printk(KERN_WARNING "md: do_md_run() returned %d\n", err); | 3976 | printk(KERN_WARNING "md: do_md_run() returned %d\n", err); |
| 3839 | do_md_stop (mddev, 0); | 3977 | do_md_stop (mddev, 0, 0); |
| 3840 | } | 3978 | } |
| 3841 | } | 3979 | } |
| 3842 | 3980 | ||
| @@ -3927,8 +4065,10 @@ static void autorun_devices(int part) | |||
| 3927 | /* on success, candidates will be empty, on error | 4065 | /* on success, candidates will be empty, on error |
| 3928 | * it won't... | 4066 | * it won't... |
| 3929 | */ | 4067 | */ |
| 3930 | rdev_for_each_list(rdev, tmp, candidates) | 4068 | rdev_for_each_list(rdev, tmp, candidates) { |
| 4069 | list_del_init(&rdev->same_set); | ||
| 3931 | export_rdev(rdev); | 4070 | export_rdev(rdev); |
| 4071 | } | ||
| 3932 | mddev_put(mddev); | 4072 | mddev_put(mddev); |
| 3933 | } | 4073 | } |
| 3934 | printk(KERN_INFO "md: ... autorun DONE.\n"); | 4074 | printk(KERN_INFO "md: ... autorun DONE.\n"); |
| @@ -4009,9 +4149,11 @@ static int get_bitmap_file(mddev_t * mddev, void __user * arg) | |||
| 4009 | char *ptr, *buf = NULL; | 4149 | char *ptr, *buf = NULL; |
| 4010 | int err = -ENOMEM; | 4150 | int err = -ENOMEM; |
| 4011 | 4151 | ||
| 4012 | md_allow_write(mddev); | 4152 | if (md_allow_write(mddev)) |
| 4153 | file = kmalloc(sizeof(*file), GFP_NOIO); | ||
| 4154 | else | ||
| 4155 | file = kmalloc(sizeof(*file), GFP_KERNEL); | ||
| 4013 | 4156 | ||
| 4014 | file = kmalloc(sizeof(*file), GFP_KERNEL); | ||
| 4015 | if (!file) | 4157 | if (!file) |
| 4016 | goto out; | 4158 | goto out; |
| 4017 | 4159 | ||
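The reasoning behind the two allocation flags: when md_allow_write() reports that the array may still be flagged clean (the external-metadata -EAGAIN case introduced further down), a GFP_KERNEL allocation could block on writeback to this very array and deadlock, so the code falls back to GFP_NOIO. The same lines, annotated:

    if (md_allow_write(mddev))
            /* writes may stall until userspace updates the metadata:
             * reclaim must not issue I/O on our behalf */
            file = kmalloc(sizeof(*file), GFP_NOIO);
    else
            /* writes are guaranteed to proceed; normal reclaim is fine */
            file = kmalloc(sizeof(*file), GFP_KERNEL);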
| @@ -4044,15 +4186,12 @@ out: | |||
| 4044 | static int get_disk_info(mddev_t * mddev, void __user * arg) | 4186 | static int get_disk_info(mddev_t * mddev, void __user * arg) |
| 4045 | { | 4187 | { |
| 4046 | mdu_disk_info_t info; | 4188 | mdu_disk_info_t info; |
| 4047 | unsigned int nr; | ||
| 4048 | mdk_rdev_t *rdev; | 4189 | mdk_rdev_t *rdev; |
| 4049 | 4190 | ||
| 4050 | if (copy_from_user(&info, arg, sizeof(info))) | 4191 | if (copy_from_user(&info, arg, sizeof(info))) |
| 4051 | return -EFAULT; | 4192 | return -EFAULT; |
| 4052 | 4193 | ||
| 4053 | nr = info.number; | 4194 | rdev = find_rdev_nr(mddev, info.number); |
| 4054 | |||
| 4055 | rdev = find_rdev_nr(mddev, nr); | ||
| 4056 | if (rdev) { | 4195 | if (rdev) { |
| 4057 | info.major = MAJOR(rdev->bdev->bd_dev); | 4196 | info.major = MAJOR(rdev->bdev->bd_dev); |
| 4058 | info.minor = MINOR(rdev->bdev->bd_dev); | 4197 | info.minor = MINOR(rdev->bdev->bd_dev); |
| @@ -4172,8 +4311,12 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) | |||
| 4172 | } | 4311 | } |
| 4173 | if (err) | 4312 | if (err) |
| 4174 | export_rdev(rdev); | 4313 | export_rdev(rdev); |
| 4314 | else | ||
| 4315 | sysfs_notify(&rdev->kobj, NULL, "state"); | ||
| 4175 | 4316 | ||
| 4176 | md_update_sb(mddev, 1); | 4317 | md_update_sb(mddev, 1); |
| 4318 | if (mddev->degraded) | ||
| 4319 | set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); | ||
| 4177 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | 4320 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); |
| 4178 | md_wakeup_thread(mddev->thread); | 4321 | md_wakeup_thread(mddev->thread); |
| 4179 | return err; | 4322 | return err; |
| @@ -4212,10 +4355,10 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) | |||
| 4212 | 4355 | ||
| 4213 | if (!mddev->persistent) { | 4356 | if (!mddev->persistent) { |
| 4214 | printk(KERN_INFO "md: nonpersistent superblock ...\n"); | 4357 | printk(KERN_INFO "md: nonpersistent superblock ...\n"); |
| 4215 | rdev->sb_offset = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; | 4358 | rdev->sb_start = rdev->bdev->bd_inode->i_size / 512; |
| 4216 | } else | 4359 | } else |
| 4217 | rdev->sb_offset = calc_dev_sboffset(rdev->bdev); | 4360 | rdev->sb_start = calc_dev_sboffset(rdev->bdev); |
| 4218 | rdev->size = calc_dev_size(rdev, mddev->chunk_size); | 4361 | rdev->size = calc_num_sectors(rdev, mddev->chunk_size) / 2; |
| 4219 | 4362 | ||
| 4220 | err = bind_rdev_to_array(rdev, mddev); | 4363 | err = bind_rdev_to_array(rdev, mddev); |
| 4221 | if (err) { | 4364 | if (err) { |
| @@ -4232,9 +4375,6 @@ static int hot_remove_disk(mddev_t * mddev, dev_t dev) | |||
| 4232 | char b[BDEVNAME_SIZE]; | 4375 | char b[BDEVNAME_SIZE]; |
| 4233 | mdk_rdev_t *rdev; | 4376 | mdk_rdev_t *rdev; |
| 4234 | 4377 | ||
| 4235 | if (!mddev->pers) | ||
| 4236 | return -ENODEV; | ||
| 4237 | |||
| 4238 | rdev = find_rdev(mddev, dev); | 4378 | rdev = find_rdev(mddev, dev); |
| 4239 | if (!rdev) | 4379 | if (!rdev) |
| 4240 | return -ENXIO; | 4380 | return -ENXIO; |
| @@ -4257,7 +4397,6 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev) | |||
| 4257 | { | 4397 | { |
| 4258 | char b[BDEVNAME_SIZE]; | 4398 | char b[BDEVNAME_SIZE]; |
| 4259 | int err; | 4399 | int err; |
| 4260 | unsigned int size; | ||
| 4261 | mdk_rdev_t *rdev; | 4400 | mdk_rdev_t *rdev; |
| 4262 | 4401 | ||
| 4263 | if (!mddev->pers) | 4402 | if (!mddev->pers) |
| @@ -4285,13 +4424,11 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev) | |||
| 4285 | } | 4424 | } |
| 4286 | 4425 | ||
| 4287 | if (mddev->persistent) | 4426 | if (mddev->persistent) |
| 4288 | rdev->sb_offset = calc_dev_sboffset(rdev->bdev); | 4427 | rdev->sb_start = calc_dev_sboffset(rdev->bdev); |
| 4289 | else | 4428 | else |
| 4290 | rdev->sb_offset = | 4429 | rdev->sb_start = rdev->bdev->bd_inode->i_size / 512; |
| 4291 | rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; | ||
| 4292 | 4430 | ||
| 4293 | size = calc_dev_size(rdev, mddev->chunk_size); | 4431 | rdev->size = calc_num_sectors(rdev, mddev->chunk_size) / 2; |
| 4294 | rdev->size = size; | ||
| 4295 | 4432 | ||
| 4296 | if (test_bit(Faulty, &rdev->flags)) { | 4433 | if (test_bit(Faulty, &rdev->flags)) { |
| 4297 | printk(KERN_WARNING | 4434 | printk(KERN_WARNING |
| @@ -4476,24 +4613,24 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info) | |||
| 4476 | return 0; | 4613 | return 0; |
| 4477 | } | 4614 | } |
| 4478 | 4615 | ||
| 4479 | static int update_size(mddev_t *mddev, unsigned long size) | 4616 | static int update_size(mddev_t *mddev, sector_t num_sectors) |
| 4480 | { | 4617 | { |
| 4481 | mdk_rdev_t * rdev; | 4618 | mdk_rdev_t * rdev; |
| 4482 | int rv; | 4619 | int rv; |
| 4483 | struct list_head *tmp; | 4620 | struct list_head *tmp; |
| 4484 | int fit = (size == 0); | 4621 | int fit = (num_sectors == 0); |
| 4485 | 4622 | ||
| 4486 | if (mddev->pers->resize == NULL) | 4623 | if (mddev->pers->resize == NULL) |
| 4487 | return -EINVAL; | 4624 | return -EINVAL; |
| 4488 | /* The "size" is the amount of each device that is used. | 4625 | /* The "num_sectors" is the number of sectors of each device that |
| 4489 | * This can only make sense for arrays with redundancy. | 4626 | * is used. This can only make sense for arrays with redundancy. |
| 4490 | * linear and raid0 always use whatever space is available | 4627 | * linear and raid0 always use whatever space is available. We can only |
| 4491 | * We can only consider changing the size if no resync | 4628 | * consider changing this number if no resync or reconstruction is |
| 4492 | * or reconstruction is happening, and if the new size | 4629 | * happening, and if the new size is acceptable. It must fit before the |
| 4493 | * is acceptable. It must fit before the sb_offset or, | 4630 | * sb_start or, if that is <data_offset, it must fit before the size |
| 4494 | * if that is <data_offset, it must fit before the | 4631 | * of each device. If num_sectors is zero, we find the largest size |
| 4495 | * size of each device. | 4632 | * that fits. |
| 4496 | * If size is zero, we find the largest size that fits. | 4633 | * that fits. |
| 4497 | */ | 4634 | */ |
| 4498 | if (mddev->sync_thread) | 4635 | if (mddev->sync_thread) |
| 4499 | return -EBUSY; | 4636 | return -EBUSY; |
| @@ -4501,19 +4638,20 @@ static int update_size(mddev_t *mddev, unsigned long size) | |||
| 4501 | sector_t avail; | 4638 | sector_t avail; |
| 4502 | avail = rdev->size * 2; | 4639 | avail = rdev->size * 2; |
| 4503 | 4640 | ||
| 4504 | if (fit && (size == 0 || size > avail/2)) | 4641 | if (fit && (num_sectors == 0 || num_sectors > avail)) |
| 4505 | size = avail/2; | 4642 | num_sectors = avail; |
| 4506 | if (avail < ((sector_t)size << 1)) | 4643 | if (avail < num_sectors) |
| 4507 | return -ENOSPC; | 4644 | return -ENOSPC; |
| 4508 | } | 4645 | } |
| 4509 | rv = mddev->pers->resize(mddev, (sector_t)size *2); | 4646 | rv = mddev->pers->resize(mddev, num_sectors); |
| 4510 | if (!rv) { | 4647 | if (!rv) { |
| 4511 | struct block_device *bdev; | 4648 | struct block_device *bdev; |
| 4512 | 4649 | ||
| 4513 | bdev = bdget_disk(mddev->gendisk, 0); | 4650 | bdev = bdget_disk(mddev->gendisk, 0); |
| 4514 | if (bdev) { | 4651 | if (bdev) { |
| 4515 | mutex_lock(&bdev->bd_inode->i_mutex); | 4652 | mutex_lock(&bdev->bd_inode->i_mutex); |
| 4516 | i_size_write(bdev->bd_inode, (loff_t)mddev->array_size << 10); | 4653 | i_size_write(bdev->bd_inode, |
| 4654 | (loff_t)mddev->array_sectors << 9); | ||
| 4517 | mutex_unlock(&bdev->bd_inode->i_mutex); | 4655 | mutex_unlock(&bdev->bd_inode->i_mutex); |
| 4518 | bdput(bdev); | 4656 | bdput(bdev); |
| 4519 | } | 4657 | } |
| @@ -4588,7 +4726,7 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info) | |||
| 4588 | return mddev->pers->reconfig(mddev, info->layout, -1); | 4726 | return mddev->pers->reconfig(mddev, info->layout, -1); |
| 4589 | } | 4727 | } |
| 4590 | if (info->size >= 0 && mddev->size != info->size) | 4728 | if (info->size >= 0 && mddev->size != info->size) |
| 4591 | rv = update_size(mddev, info->size); | 4729 | rv = update_size(mddev, (sector_t)info->size * 2); |
| 4592 | 4730 | ||
| 4593 | if (mddev->raid_disks != info->raid_disks) | 4731 | if (mddev->raid_disks != info->raid_disks) |
| 4594 | rv = update_raid_disks(mddev, info->raid_disks); | 4732 | rv = update_raid_disks(mddev, info->raid_disks); |
| @@ -4641,6 +4779,12 @@ static int set_disk_faulty(mddev_t *mddev, dev_t dev) | |||
| 4641 | return 0; | 4779 | return 0; |
| 4642 | } | 4780 | } |
| 4643 | 4781 | ||
| 4782 | /* | ||
| 4783 | * We have a problem here : there is no easy way to give a CHS | ||
| 4784 | * virtual geometry. We currently pretend that we have a 2 heads | ||
| 4785 | * 4 sectors (with a BIG number of cylinders...). This drives | ||
| 4786 | * dosfs just mad... ;-) | ||
| 4787 | */ | ||
| 4644 | static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo) | 4788 | static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo) |
| 4645 | { | 4789 | { |
| 4646 | mddev_t *mddev = bdev->bd_disk->private_data; | 4790 | mddev_t *mddev = bdev->bd_disk->private_data; |
| @@ -4785,19 +4929,13 @@ static int md_ioctl(struct inode *inode, struct file *file, | |||
| 4785 | goto done_unlock; | 4929 | goto done_unlock; |
| 4786 | 4930 | ||
| 4787 | case STOP_ARRAY: | 4931 | case STOP_ARRAY: |
| 4788 | err = do_md_stop (mddev, 0); | 4932 | err = do_md_stop (mddev, 0, 1); |
| 4789 | goto done_unlock; | 4933 | goto done_unlock; |
| 4790 | 4934 | ||
| 4791 | case STOP_ARRAY_RO: | 4935 | case STOP_ARRAY_RO: |
| 4792 | err = do_md_stop (mddev, 1); | 4936 | err = do_md_stop (mddev, 1, 1); |
| 4793 | goto done_unlock; | 4937 | goto done_unlock; |
| 4794 | 4938 | ||
| 4795 | /* | ||
| 4796 | * We have a problem here : there is no easy way to give a CHS | ||
| 4797 | * virtual geometry. We currently pretend that we have a 2 heads | ||
| 4798 | * 4 sectors (with a BIG number of cylinders...). This drives | ||
| 4799 | * dosfs just mad... ;-) | ||
| 4800 | */ | ||
| 4801 | } | 4939 | } |
| 4802 | 4940 | ||
| 4803 | /* | 4941 | /* |
| @@ -4807,13 +4945,12 @@ static int md_ioctl(struct inode *inode, struct file *file, | |||
| 4807 | * here and hit the 'default' below, so only disallow | 4945 | * here and hit the 'default' below, so only disallow |
| 4808 | * 'md' ioctls, and switch to rw mode if started auto-readonly. | 4946 | * 'md' ioctls, and switch to rw mode if started auto-readonly. |
| 4809 | */ | 4947 | */ |
| 4810 | if (_IOC_TYPE(cmd) == MD_MAJOR && | 4948 | if (_IOC_TYPE(cmd) == MD_MAJOR && mddev->ro && mddev->pers) { |
| 4811 | mddev->ro && mddev->pers) { | ||
| 4812 | if (mddev->ro == 2) { | 4949 | if (mddev->ro == 2) { |
| 4813 | mddev->ro = 0; | 4950 | mddev->ro = 0; |
| 4814 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | 4951 | sysfs_notify(&mddev->kobj, NULL, "array_state"); |
| 4815 | md_wakeup_thread(mddev->thread); | 4952 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); |
| 4816 | 4953 | md_wakeup_thread(mddev->thread); | |
| 4817 | } else { | 4954 | } else { |
| 4818 | err = -EROFS; | 4955 | err = -EROFS; |
| 4819 | goto abort_unlock; | 4956 | goto abort_unlock; |
| @@ -4883,6 +5020,7 @@ static int md_open(struct inode *inode, struct file *file) | |||
| 4883 | 5020 | ||
| 4884 | err = 0; | 5021 | err = 0; |
| 4885 | mddev_get(mddev); | 5022 | mddev_get(mddev); |
| 5023 | atomic_inc(&mddev->openers); | ||
| 4886 | mddev_unlock(mddev); | 5024 | mddev_unlock(mddev); |
| 4887 | 5025 | ||
| 4888 | check_disk_change(inode->i_bdev); | 5026 | check_disk_change(inode->i_bdev); |
| @@ -4895,6 +5033,7 @@ static int md_release(struct inode *inode, struct file * file) | |||
| 4895 | mddev_t *mddev = inode->i_bdev->bd_disk->private_data; | 5033 | mddev_t *mddev = inode->i_bdev->bd_disk->private_data; |
| 4896 | 5034 | ||
| 4897 | BUG_ON(!mddev); | 5035 | BUG_ON(!mddev); |
| 5036 | atomic_dec(&mddev->openers); | ||
| 4898 | mddev_put(mddev); | 5037 | mddev_put(mddev); |
| 4899 | 5038 | ||
| 4900 | return 0; | 5039 | return 0; |
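The new ->openers counter tracks open file handles only, separate from the mddev->active lifetime refcount, so do_md_stop() can ask whether anyone other than the caller has the device open. The test it performs, sketched (is_open is 1 when the stop request arrives through an open handle, i.e. an ioctl, and 0 from paths like the reboot notifier):

    static int array_busy(mddev_t *mddev, int is_open)
    {
            /* more openers than the requester itself? */
            return atomic_read(&mddev->openers) > is_open;
    }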
| @@ -5029,6 +5168,9 @@ void md_error(mddev_t *mddev, mdk_rdev_t *rdev) | |||
| 5029 | if (!mddev->pers->error_handler) | 5168 | if (!mddev->pers->error_handler) |
| 5030 | return; | 5169 | return; |
| 5031 | mddev->pers->error_handler(mddev,rdev); | 5170 | mddev->pers->error_handler(mddev,rdev); |
| 5171 | if (mddev->degraded) | ||
| 5172 | set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); | ||
| 5173 | set_bit(StateChanged, &rdev->flags); | ||
| 5032 | set_bit(MD_RECOVERY_INTR, &mddev->recovery); | 5174 | set_bit(MD_RECOVERY_INTR, &mddev->recovery); |
| 5033 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | 5175 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); |
| 5034 | md_wakeup_thread(mddev->thread); | 5176 | md_wakeup_thread(mddev->thread); |
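md_error() may run in contexts where poking sysfs is not safe, so it only records StateChanged on the rdev; md_check_recovery() later clears the bit and fires the notification (see its hunk below). The deferral pattern in isolation:

    /* error path (possibly atomic context): flag and wake the thread */
    set_bit(StateChanged, &rdev->flags);
    md_wakeup_thread(mddev->thread);

    /* md thread, later, in process context: */
    if (test_and_clear_bit(StateChanged, &rdev->flags))
            sysfs_notify(&rdev->kobj, NULL, "state");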
| @@ -5258,10 +5400,11 @@ static int md_seq_show(struct seq_file *seq, void *v) | |||
| 5258 | if (!list_empty(&mddev->disks)) { | 5400 | if (!list_empty(&mddev->disks)) { |
| 5259 | if (mddev->pers) | 5401 | if (mddev->pers) |
| 5260 | seq_printf(seq, "\n %llu blocks", | 5402 | seq_printf(seq, "\n %llu blocks", |
| 5261 | (unsigned long long)mddev->array_size); | 5403 | (unsigned long long) |
| 5404 | mddev->array_sectors / 2); | ||
| 5262 | else | 5405 | else |
| 5263 | seq_printf(seq, "\n %llu blocks", | 5406 | seq_printf(seq, "\n %llu blocks", |
| 5264 | (unsigned long long)size); | 5407 | (unsigned long long)size); |
| 5265 | } | 5408 | } |
| 5266 | if (mddev->persistent) { | 5409 | if (mddev->persistent) { |
| 5267 | if (mddev->major_version != 0 || | 5410 | if (mddev->major_version != 0 || |
| @@ -5391,12 +5534,12 @@ int unregister_md_personality(struct mdk_personality *p) | |||
| 5391 | static int is_mddev_idle(mddev_t *mddev) | 5534 | static int is_mddev_idle(mddev_t *mddev) |
| 5392 | { | 5535 | { |
| 5393 | mdk_rdev_t * rdev; | 5536 | mdk_rdev_t * rdev; |
| 5394 | struct list_head *tmp; | ||
| 5395 | int idle; | 5537 | int idle; |
| 5396 | long curr_events; | 5538 | long curr_events; |
| 5397 | 5539 | ||
| 5398 | idle = 1; | 5540 | idle = 1; |
| 5399 | rdev_for_each(rdev, tmp, mddev) { | 5541 | rcu_read_lock(); |
| 5542 | rdev_for_each_rcu(rdev, mddev) { | ||
| 5400 | struct gendisk *disk = rdev->bdev->bd_contains->bd_disk; | 5543 | struct gendisk *disk = rdev->bdev->bd_contains->bd_disk; |
| 5401 | curr_events = disk_stat_read(disk, sectors[0]) + | 5544 | curr_events = disk_stat_read(disk, sectors[0]) + |
| 5402 | disk_stat_read(disk, sectors[1]) - | 5545 | disk_stat_read(disk, sectors[1]) - |
| @@ -5428,6 +5571,7 @@ static int is_mddev_idle(mddev_t *mddev) | |||
| 5428 | idle = 0; | 5571 | idle = 0; |
| 5429 | } | 5572 | } |
| 5430 | } | 5573 | } |
| 5574 | rcu_read_unlock(); | ||
| 5431 | return idle; | 5575 | return idle; |
| 5432 | } | 5576 | } |
| 5433 | 5577 | ||
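Switching to rdev_for_each_rcu() under rcu_read_lock() lets is_mddev_idle() walk the disk list without holding the mddev lock; presumably the macro expands to list_for_each_entry_rcu() over mddev->disks. The general shape of such a walk:

    /* Readers take no lock, only a read-side critical section;
     * removal paths must defer freeing via synchronize_rcu() or
     * call_rcu().  The body must not sleep.
     */
    rcu_read_lock();
    list_for_each_entry_rcu(rdev, &mddev->disks, same_set) {
            /* read-only inspection of *rdev */
    }
    rcu_read_unlock();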
| @@ -5451,6 +5595,7 @@ void md_done_sync(mddev_t *mddev, int blocks, int ok) | |||
| 5451 | */ | 5595 | */ |
| 5452 | void md_write_start(mddev_t *mddev, struct bio *bi) | 5596 | void md_write_start(mddev_t *mddev, struct bio *bi) |
| 5453 | { | 5597 | { |
| 5598 | int did_change = 0; | ||
| 5454 | if (bio_data_dir(bi) != WRITE) | 5599 | if (bio_data_dir(bi) != WRITE) |
| 5455 | return; | 5600 | return; |
| 5456 | 5601 | ||
| @@ -5461,6 +5606,7 @@ void md_write_start(mddev_t *mddev, struct bio *bi) | |||
| 5461 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | 5606 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); |
| 5462 | md_wakeup_thread(mddev->thread); | 5607 | md_wakeup_thread(mddev->thread); |
| 5463 | md_wakeup_thread(mddev->sync_thread); | 5608 | md_wakeup_thread(mddev->sync_thread); |
| 5609 | did_change = 1; | ||
| 5464 | } | 5610 | } |
| 5465 | atomic_inc(&mddev->writes_pending); | 5611 | atomic_inc(&mddev->writes_pending); |
| 5466 | if (mddev->safemode == 1) | 5612 | if (mddev->safemode == 1) |
| @@ -5471,10 +5617,12 @@ void md_write_start(mddev_t *mddev, struct bio *bi) | |||
| 5471 | mddev->in_sync = 0; | 5617 | mddev->in_sync = 0; |
| 5472 | set_bit(MD_CHANGE_CLEAN, &mddev->flags); | 5618 | set_bit(MD_CHANGE_CLEAN, &mddev->flags); |
| 5473 | md_wakeup_thread(mddev->thread); | 5619 | md_wakeup_thread(mddev->thread); |
| 5620 | did_change = 1; | ||
| 5474 | } | 5621 | } |
| 5475 | spin_unlock_irq(&mddev->write_lock); | 5622 | spin_unlock_irq(&mddev->write_lock); |
| 5476 | sysfs_notify(&mddev->kobj, NULL, "array_state"); | ||
| 5477 | } | 5623 | } |
| 5624 | if (did_change) | ||
| 5625 | sysfs_notify(&mddev->kobj, NULL, "array_state"); | ||
| 5478 | wait_event(mddev->sb_wait, | 5626 | wait_event(mddev->sb_wait, |
| 5479 | !test_bit(MD_CHANGE_CLEAN, &mddev->flags) && | 5627 | !test_bit(MD_CHANGE_CLEAN, &mddev->flags) && |
| 5480 | !test_bit(MD_CHANGE_PENDING, &mddev->flags)); | 5628 | !test_bit(MD_CHANGE_PENDING, &mddev->flags)); |
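sysfs_notify() may sleep, so it cannot be issued under spin_lock_irq(&mddev->write_lock); the new did_change flag records the decision inside the lock and acts on it after the unlock. The idiom, with hypothetical condition_changed()/update_state() stand-ins:

    int did_change = 0;

    spin_lock_irq(&mddev->write_lock);
    if (condition_changed(mddev)) {         /* decide under the lock */
            update_state(mddev);            /* cheap, lock-held work */
            did_change = 1;
    }
    spin_unlock_irq(&mddev->write_lock);
    if (did_change)
            sysfs_notify(&mddev->kobj, NULL, "array_state"); /* may sleep */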
| @@ -5495,13 +5643,18 @@ void md_write_end(mddev_t *mddev) | |||
| 5495 | * may proceed without blocking. It is important to call this before | 5643 | * may proceed without blocking. It is important to call this before |
| 5496 | * attempting a GFP_KERNEL allocation while holding the mddev lock. | 5644 | * attempting a GFP_KERNEL allocation while holding the mddev lock. |
| 5497 | * Must be called with mddev_lock held. | 5645 | * Must be called with mddev_lock held. |
| 5646 | * | ||
| 5647 | * In the ->external case MD_CHANGE_CLEAN can not be cleared until mddev->lock | ||
| 5648 | * is dropped, so return -EAGAIN after notifying userspace. | ||
| 5498 | */ | 5649 | */ |
| 5499 | void md_allow_write(mddev_t *mddev) | 5650 | int md_allow_write(mddev_t *mddev) |
| 5500 | { | 5651 | { |
| 5501 | if (!mddev->pers) | 5652 | if (!mddev->pers) |
| 5502 | return; | 5653 | return 0; |
| 5503 | if (mddev->ro) | 5654 | if (mddev->ro) |
| 5504 | return; | 5655 | return 0; |
| 5656 | if (!mddev->pers->sync_request) | ||
| 5657 | return 0; | ||
| 5505 | 5658 | ||
| 5506 | spin_lock_irq(&mddev->write_lock); | 5659 | spin_lock_irq(&mddev->write_lock); |
| 5507 | if (mddev->in_sync) { | 5660 | if (mddev->in_sync) { |
| @@ -5512,14 +5665,14 @@ void md_allow_write(mddev_t *mddev) | |||
| 5512 | mddev->safemode = 1; | 5665 | mddev->safemode = 1; |
| 5513 | spin_unlock_irq(&mddev->write_lock); | 5666 | spin_unlock_irq(&mddev->write_lock); |
| 5514 | md_update_sb(mddev, 0); | 5667 | md_update_sb(mddev, 0); |
| 5515 | |||
| 5516 | sysfs_notify(&mddev->kobj, NULL, "array_state"); | 5668 | sysfs_notify(&mddev->kobj, NULL, "array_state"); |
| 5517 | /* wait for the dirty state to be recorded in the metadata */ | ||
| 5518 | wait_event(mddev->sb_wait, | ||
| 5519 | !test_bit(MD_CHANGE_CLEAN, &mddev->flags) && | ||
| 5520 | !test_bit(MD_CHANGE_PENDING, &mddev->flags)); | ||
| 5521 | } else | 5669 | } else |
| 5522 | spin_unlock_irq(&mddev->write_lock); | 5670 | spin_unlock_irq(&mddev->write_lock); |
| 5671 | |||
| 5672 | if (test_bit(MD_CHANGE_CLEAN, &mddev->flags)) | ||
| 5673 | return -EAGAIN; | ||
| 5674 | else | ||
| 5675 | return 0; | ||
| 5523 | } | 5676 | } |
| 5524 | EXPORT_SYMBOL_GPL(md_allow_write); | 5677 | EXPORT_SYMBOL_GPL(md_allow_write); |
| 5525 | 5678 | ||
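md_allow_write() now returns an error so callers can refuse to proceed while the metadata may still say "clean"; raid1_reshape() in this same patch shows the expected pattern:

    err = md_allow_write(mddev);
    if (err)
            return err;     /* -EAGAIN: userspace metadata update pending */

    /* now safe to do GFP_KERNEL allocations under the mddev lock */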
| @@ -5625,9 +5778,11 @@ void md_do_sync(mddev_t *mddev) | |||
| 5625 | max_sectors = mddev->resync_max_sectors; | 5778 | max_sectors = mddev->resync_max_sectors; |
| 5626 | mddev->resync_mismatches = 0; | 5779 | mddev->resync_mismatches = 0; |
| 5627 | /* we don't use the checkpoint if there's a bitmap */ | 5780 | /* we don't use the checkpoint if there's a bitmap */ |
| 5628 | if (!mddev->bitmap && | 5781 | if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) |
| 5629 | !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) | 5782 | j = mddev->resync_min; |
| 5783 | else if (!mddev->bitmap) | ||
| 5630 | j = mddev->recovery_cp; | 5784 | j = mddev->recovery_cp; |
| 5785 | |||
| 5631 | } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) | 5786 | } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) |
| 5632 | max_sectors = mddev->size << 1; | 5787 | max_sectors = mddev->size << 1; |
| 5633 | else { | 5788 | else { |
| @@ -5796,6 +5951,7 @@ void md_do_sync(mddev_t *mddev) | |||
| 5796 | 5951 | ||
| 5797 | skip: | 5952 | skip: |
| 5798 | mddev->curr_resync = 0; | 5953 | mddev->curr_resync = 0; |
| 5954 | mddev->resync_min = 0; | ||
| 5799 | mddev->resync_max = MaxSector; | 5955 | mddev->resync_max = MaxSector; |
| 5800 | sysfs_notify(&mddev->kobj, NULL, "sync_completed"); | 5956 | sysfs_notify(&mddev->kobj, NULL, "sync_completed"); |
| 5801 | wake_up(&resync_wait); | 5957 | wake_up(&resync_wait); |
| @@ -5845,7 +6001,8 @@ static int remove_and_add_spares(mddev_t *mddev) | |||
| 5845 | if (rdev->raid_disk < 0 | 6001 | if (rdev->raid_disk < 0 |
| 5846 | && !test_bit(Faulty, &rdev->flags)) { | 6002 | && !test_bit(Faulty, &rdev->flags)) { |
| 5847 | rdev->recovery_offset = 0; | 6003 | rdev->recovery_offset = 0; |
| 5848 | if (mddev->pers->hot_add_disk(mddev,rdev)) { | 6004 | if (mddev->pers-> |
| 6005 | hot_add_disk(mddev, rdev) == 0) { | ||
| 5849 | char nm[20]; | 6006 | char nm[20]; |
| 5850 | sprintf(nm, "rd%d", rdev->raid_disk); | 6007 | sprintf(nm, "rd%d", rdev->raid_disk); |
| 5851 | if (sysfs_create_link(&mddev->kobj, | 6008 | if (sysfs_create_link(&mddev->kobj, |
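The "== 0" test reflects the conversion of ->hot_add_disk() from a 1-on-success boolean to the kernel's usual 0/-errno convention (see the multipath, raid1 and raid10 hunks below). The calling pattern after the change:

    int err = mddev->pers->hot_add_disk(mddev, rdev);

    if (err == 0)
            spares++;       /* slot found; recovery can be scheduled */
    /* otherwise err < 0: -EEXIST (no free slot), -EBUSY, -EINVAL, ... */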
| @@ -5920,23 +6077,31 @@ void md_check_recovery(mddev_t *mddev) | |||
| 5920 | int spares = 0; | 6077 | int spares = 0; |
| 5921 | 6078 | ||
| 5922 | if (!mddev->external) { | 6079 | if (!mddev->external) { |
| 6080 | int did_change = 0; | ||
| 5923 | spin_lock_irq(&mddev->write_lock); | 6081 | spin_lock_irq(&mddev->write_lock); |
| 5924 | if (mddev->safemode && | 6082 | if (mddev->safemode && |
| 5925 | !atomic_read(&mddev->writes_pending) && | 6083 | !atomic_read(&mddev->writes_pending) && |
| 5926 | !mddev->in_sync && | 6084 | !mddev->in_sync && |
| 5927 | mddev->recovery_cp == MaxSector) { | 6085 | mddev->recovery_cp == MaxSector) { |
| 5928 | mddev->in_sync = 1; | 6086 | mddev->in_sync = 1; |
| 6087 | did_change = 1; | ||
| 5929 | if (mddev->persistent) | 6088 | if (mddev->persistent) |
| 5930 | set_bit(MD_CHANGE_CLEAN, &mddev->flags); | 6089 | set_bit(MD_CHANGE_CLEAN, &mddev->flags); |
| 5931 | } | 6090 | } |
| 5932 | if (mddev->safemode == 1) | 6091 | if (mddev->safemode == 1) |
| 5933 | mddev->safemode = 0; | 6092 | mddev->safemode = 0; |
| 5934 | spin_unlock_irq(&mddev->write_lock); | 6093 | spin_unlock_irq(&mddev->write_lock); |
| 6094 | if (did_change) | ||
| 6095 | sysfs_notify(&mddev->kobj, NULL, "array_state"); | ||
| 5935 | } | 6096 | } |
| 5936 | 6097 | ||
| 5937 | if (mddev->flags) | 6098 | if (mddev->flags) |
| 5938 | md_update_sb(mddev, 0); | 6099 | md_update_sb(mddev, 0); |
| 5939 | 6100 | ||
| 6101 | rdev_for_each(rdev, rtmp, mddev) | ||
| 6102 | if (test_and_clear_bit(StateChanged, &rdev->flags)) | ||
| 6103 | sysfs_notify(&rdev->kobj, NULL, "state"); | ||
| 6104 | |||
| 5940 | 6105 | ||
| 5941 | if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && | 6106 | if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && |
| 5942 | !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) { | 6107 | !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) { |
| @@ -5951,7 +6116,9 @@ void md_check_recovery(mddev_t *mddev) | |||
| 5951 | if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { | 6116 | if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { |
| 5952 | /* success...*/ | 6117 | /* success...*/ |
| 5953 | /* activate any spares */ | 6118 | /* activate any spares */ |
| 5954 | mddev->pers->spare_active(mddev); | 6119 | if (mddev->pers->spare_active(mddev)) |
| 6120 | sysfs_notify(&mddev->kobj, NULL, | ||
| 6121 | "degraded"); | ||
| 5955 | } | 6122 | } |
| 5956 | md_update_sb(mddev, 1); | 6123 | md_update_sb(mddev, 1); |
| 5957 | 6124 | ||
| @@ -5965,13 +6132,18 @@ void md_check_recovery(mddev_t *mddev) | |||
| 5965 | mddev->recovery = 0; | 6132 | mddev->recovery = 0; |
| 5966 | /* flag recovery needed just to double check */ | 6133 | /* flag recovery needed just to double check */ |
| 5967 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | 6134 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); |
| 6135 | sysfs_notify(&mddev->kobj, NULL, "sync_action"); | ||
| 5968 | md_new_event(mddev); | 6136 | md_new_event(mddev); |
| 5969 | goto unlock; | 6137 | goto unlock; |
| 5970 | } | 6138 | } |
| 6139 | /* Set RUNNING before clearing NEEDED to avoid | ||
| 6140 | * any transients in the value of "sync_action". | ||
| 6141 | */ | ||
| 6142 | set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); | ||
| 6143 | clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | ||
| 5971 | /* Clear some bits that don't mean anything, but | 6144 | /* Clear some bits that don't mean anything, but |
| 5972 | * might be left set | 6145 | * might be left set |
| 5973 | */ | 6146 | */ |
| 5974 | clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | ||
| 5975 | clear_bit(MD_RECOVERY_INTR, &mddev->recovery); | 6147 | clear_bit(MD_RECOVERY_INTR, &mddev->recovery); |
| 5976 | clear_bit(MD_RECOVERY_DONE, &mddev->recovery); | 6148 | clear_bit(MD_RECOVERY_DONE, &mddev->recovery); |
| 5977 | 6149 | ||
| @@ -5989,17 +6161,19 @@ void md_check_recovery(mddev_t *mddev) | |||
| 5989 | /* Cannot proceed */ | 6161 | /* Cannot proceed */ |
| 5990 | goto unlock; | 6162 | goto unlock; |
| 5991 | set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); | 6163 | set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); |
| 6164 | clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); | ||
| 5992 | } else if ((spares = remove_and_add_spares(mddev))) { | 6165 | } else if ((spares = remove_and_add_spares(mddev))) { |
| 5993 | clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); | 6166 | clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); |
| 5994 | clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); | 6167 | clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); |
| 6168 | set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); | ||
| 5995 | } else if (mddev->recovery_cp < MaxSector) { | 6169 | } else if (mddev->recovery_cp < MaxSector) { |
| 5996 | set_bit(MD_RECOVERY_SYNC, &mddev->recovery); | 6170 | set_bit(MD_RECOVERY_SYNC, &mddev->recovery); |
| 6171 | clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); | ||
| 5997 | } else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) | 6172 | } else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) |
| 5998 | /* nothing to be done ... */ | 6173 | /* nothing to be done ... */ |
| 5999 | goto unlock; | 6174 | goto unlock; |
| 6000 | 6175 | ||
| 6001 | if (mddev->pers->sync_request) { | 6176 | if (mddev->pers->sync_request) { |
| 6002 | set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); | ||
| 6003 | if (spares && mddev->bitmap && ! mddev->bitmap->file) { | 6177 | if (spares && mddev->bitmap && ! mddev->bitmap->file) { |
| 6004 | /* We are adding a device or devices to an array | 6178 | /* We are adding a device or devices to an array |
| 6005 | * which has the bitmap stored on all devices. | 6179 | * which has the bitmap stored on all devices. |
| @@ -6018,9 +6192,16 @@ void md_check_recovery(mddev_t *mddev) | |||
| 6018 | mddev->recovery = 0; | 6192 | mddev->recovery = 0; |
| 6019 | } else | 6193 | } else |
| 6020 | md_wakeup_thread(mddev->sync_thread); | 6194 | md_wakeup_thread(mddev->sync_thread); |
| 6195 | sysfs_notify(&mddev->kobj, NULL, "sync_action"); | ||
| 6021 | md_new_event(mddev); | 6196 | md_new_event(mddev); |
| 6022 | } | 6197 | } |
| 6023 | unlock: | 6198 | unlock: |
| 6199 | if (!mddev->sync_thread) { | ||
| 6200 | clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); | ||
| 6201 | if (test_and_clear_bit(MD_RECOVERY_RECOVER, | ||
| 6202 | &mddev->recovery)) | ||
| 6203 | sysfs_notify(&mddev->kobj, NULL, "sync_action"); | ||
| 6204 | } | ||
| 6024 | mddev_unlock(mddev); | 6205 | mddev_unlock(mddev); |
| 6025 | } | 6206 | } |
| 6026 | } | 6207 | } |
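Setting RUNNING before clearing NEEDED matters because action_show() presumably reports "idle" when no recovery bits are set; this ordering keeps at least one bit visible, so a poller of "sync_action" never sees a transient "idle" just as work starts. The new unlock path undoes it when no sync thread was started. In outline:

    set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);     /* first */
    clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);    /* then */

    /* ...and at "unlock:", if nothing actually started: */
    if (!mddev->sync_thread)
            clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);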
| @@ -6047,7 +6228,7 @@ static int md_notify_reboot(struct notifier_block *this, | |||
| 6047 | 6228 | ||
| 6048 | for_each_mddev(mddev, tmp) | 6229 | for_each_mddev(mddev, tmp) |
| 6049 | if (mddev_trylock(mddev)) { | 6230 | if (mddev_trylock(mddev)) { |
| 6050 | do_md_stop (mddev, 1); | 6231 | do_md_stop (mddev, 1, 0); |
| 6051 | mddev_unlock(mddev); | 6232 | mddev_unlock(mddev); |
| 6052 | } | 6233 | } |
| 6053 | /* | 6234 | /* |
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index e968116e0de9..c4779ccba1c3 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c | |||
| @@ -281,13 +281,18 @@ static int multipath_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) | |||
| 281 | { | 281 | { |
| 282 | multipath_conf_t *conf = mddev->private; | 282 | multipath_conf_t *conf = mddev->private; |
| 283 | struct request_queue *q; | 283 | struct request_queue *q; |
| 284 | int found = 0; | 284 | int err = -EEXIST; |
| 285 | int path; | 285 | int path; |
| 286 | struct multipath_info *p; | 286 | struct multipath_info *p; |
| 287 | int first = 0; | ||
| 288 | int last = mddev->raid_disks - 1; | ||
| 289 | |||
| 290 | if (rdev->raid_disk >= 0) | ||
| 291 | first = last = rdev->raid_disk; | ||
| 287 | 292 | ||
| 288 | print_multipath_conf(conf); | 293 | print_multipath_conf(conf); |
| 289 | 294 | ||
| 290 | for (path=0; path<mddev->raid_disks; path++) | 295 | for (path = first; path <= last; path++) |
| 291 | if ((p=conf->multipaths+path)->rdev == NULL) { | 296 | if ((p=conf->multipaths+path)->rdev == NULL) { |
| 292 | q = rdev->bdev->bd_disk->queue; | 297 | q = rdev->bdev->bd_disk->queue; |
| 293 | blk_queue_stack_limits(mddev->queue, q); | 298 | blk_queue_stack_limits(mddev->queue, q); |
| @@ -307,11 +312,13 @@ static int multipath_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) | |||
| 307 | rdev->raid_disk = path; | 312 | rdev->raid_disk = path; |
| 308 | set_bit(In_sync, &rdev->flags); | 313 | set_bit(In_sync, &rdev->flags); |
| 309 | rcu_assign_pointer(p->rdev, rdev); | 314 | rcu_assign_pointer(p->rdev, rdev); |
| 310 | found = 1; | 315 | err = 0; |
| 316 | break; | ||
| 311 | } | 317 | } |
| 312 | 318 | ||
| 313 | print_multipath_conf(conf); | 319 | print_multipath_conf(conf); |
| 314 | return found; | 320 | |
| 321 | return err; | ||
| 315 | } | 322 | } |
| 316 | 323 | ||
| 317 | static int multipath_remove_disk(mddev_t *mddev, int number) | 324 | static int multipath_remove_disk(mddev_t *mddev, int number) |
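multipath, raid1 and raid10 all gain the same slot-pinning preamble: a pre-set rdev->raid_disk (presumably placed there via the per-device "slot" attribute) collapses the search range to that one slot, while an unset value (-1) keeps the full scan. The shared shape:

    int first = 0, last = mddev->raid_disks - 1;

    if (rdev->raid_disk >= 0)               /* caller chose a slot */
            first = last = rdev->raid_disk;

    for (path = first; path <= last; path++)
            /* try conf->...[path]; break on success */;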
| @@ -497,7 +504,7 @@ static int multipath_run (mddev_t *mddev) | |||
| 497 | /* | 504 | /* |
| 498 | * Ok, everything is just fine now | 505 | * Ok, everything is just fine now |
| 499 | */ | 506 | */ |
| 500 | mddev->array_size = mddev->size; | 507 | mddev->array_sectors = mddev->size * 2; |
| 501 | 508 | ||
| 502 | mddev->queue->unplug_fn = multipath_unplug; | 509 | mddev->queue->unplug_fn = multipath_unplug; |
| 503 | mddev->queue->backing_dev_info.congested_fn = multipath_congested; | 510 | mddev->queue->backing_dev_info.congested_fn = multipath_congested; |
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index bcbb82594a19..183610635661 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c | |||
| @@ -295,16 +295,16 @@ static int raid0_run (mddev_t *mddev) | |||
| 295 | goto out_free_conf; | 295 | goto out_free_conf; |
| 296 | 296 | ||
| 297 | /* calculate array device size */ | 297 | /* calculate array device size */ |
| 298 | mddev->array_size = 0; | 298 | mddev->array_sectors = 0; |
| 299 | rdev_for_each(rdev, tmp, mddev) | 299 | rdev_for_each(rdev, tmp, mddev) |
| 300 | mddev->array_size += rdev->size; | 300 | mddev->array_sectors += rdev->size * 2; |
| 301 | 301 | ||
| 302 | printk("raid0 : md_size is %llu blocks.\n", | 302 | printk("raid0 : md_size is %llu blocks.\n", |
| 303 | (unsigned long long)mddev->array_size); | 303 | (unsigned long long)mddev->array_sectors / 2); |
| 304 | printk("raid0 : conf->hash_spacing is %llu blocks.\n", | 304 | printk("raid0 : conf->hash_spacing is %llu blocks.\n", |
| 305 | (unsigned long long)conf->hash_spacing); | 305 | (unsigned long long)conf->hash_spacing); |
| 306 | { | 306 | { |
| 307 | sector_t s = mddev->array_size; | 307 | sector_t s = mddev->array_sectors / 2; |
| 308 | sector_t space = conf->hash_spacing; | 308 | sector_t space = conf->hash_spacing; |
| 309 | int round; | 309 | int round; |
| 310 | conf->preshift = 0; | 310 | conf->preshift = 0; |
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index c610b947218a..03a5ab705c20 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c | |||
| @@ -1100,11 +1100,16 @@ static int raid1_spare_active(mddev_t *mddev) | |||
| 1100 | static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) | 1100 | static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) |
| 1101 | { | 1101 | { |
| 1102 | conf_t *conf = mddev->private; | 1102 | conf_t *conf = mddev->private; |
| 1103 | int found = 0; | 1103 | int err = -EEXIST; |
| 1104 | int mirror = 0; | 1104 | int mirror = 0; |
| 1105 | mirror_info_t *p; | 1105 | mirror_info_t *p; |
| 1106 | int first = 0; | ||
| 1107 | int last = mddev->raid_disks - 1; | ||
| 1106 | 1108 | ||
| 1107 | for (mirror=0; mirror < mddev->raid_disks; mirror++) | 1109 | if (rdev->raid_disk >= 0) |
| 1110 | first = last = rdev->raid_disk; | ||
| 1111 | |||
| 1112 | for (mirror = first; mirror <= last; mirror++) | ||
| 1108 | if ( !(p=conf->mirrors+mirror)->rdev) { | 1113 | if ( !(p=conf->mirrors+mirror)->rdev) { |
| 1109 | 1114 | ||
| 1110 | blk_queue_stack_limits(mddev->queue, | 1115 | blk_queue_stack_limits(mddev->queue, |
| @@ -1119,7 +1124,7 @@ static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) | |||
| 1119 | 1124 | ||
| 1120 | p->head_position = 0; | 1125 | p->head_position = 0; |
| 1121 | rdev->raid_disk = mirror; | 1126 | rdev->raid_disk = mirror; |
| 1122 | found = 1; | 1127 | err = 0; |
| 1123 | /* As all devices are equivalent, we don't need a full recovery | 1128 | /* As all devices are equivalent, we don't need a full recovery |
| 1124 | * if this was recently any drive of the array | 1129 | * if this was recently any drive of the array |
| 1125 | */ | 1130 | */ |
| @@ -1130,7 +1135,7 @@ static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) | |||
| 1130 | } | 1135 | } |
| 1131 | 1136 | ||
| 1132 | print_conf(conf); | 1137 | print_conf(conf); |
| 1133 | return found; | 1138 | return err; |
| 1134 | } | 1139 | } |
| 1135 | 1140 | ||
| 1136 | static int raid1_remove_disk(mddev_t *mddev, int number) | 1141 | static int raid1_remove_disk(mddev_t *mddev, int number) |
| @@ -2038,7 +2043,7 @@ static int run(mddev_t *mddev) | |||
| 2038 | /* | 2043 | /* |
| 2039 | * Ok, everything is just fine now | 2044 | * Ok, everything is just fine now |
| 2040 | */ | 2045 | */ |
| 2041 | mddev->array_size = mddev->size; | 2046 | mddev->array_sectors = mddev->size * 2; |
| 2042 | 2047 | ||
| 2043 | mddev->queue->unplug_fn = raid1_unplug; | 2048 | mddev->queue->unplug_fn = raid1_unplug; |
| 2044 | mddev->queue->backing_dev_info.congested_fn = raid1_congested; | 2049 | mddev->queue->backing_dev_info.congested_fn = raid1_congested; |
| @@ -2100,14 +2105,15 @@ static int raid1_resize(mddev_t *mddev, sector_t sectors) | |||
| 2100 | * any io in the removed space completes, but it hardly seems | 2105 | * any io in the removed space completes, but it hardly seems |
| 2101 | * worth it. | 2106 | * worth it. |
| 2102 | */ | 2107 | */ |
| 2103 | mddev->array_size = sectors>>1; | 2108 | mddev->array_sectors = sectors; |
| 2104 | set_capacity(mddev->gendisk, mddev->array_size << 1); | 2109 | set_capacity(mddev->gendisk, mddev->array_sectors); |
| 2105 | mddev->changed = 1; | 2110 | mddev->changed = 1; |
| 2106 | if (mddev->array_size > mddev->size && mddev->recovery_cp == MaxSector) { | 2111 | if (mddev->array_sectors / 2 > mddev->size && |
| 2112 | mddev->recovery_cp == MaxSector) { | ||
| 2107 | mddev->recovery_cp = mddev->size << 1; | 2113 | mddev->recovery_cp = mddev->size << 1; |
| 2108 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | 2114 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); |
| 2109 | } | 2115 | } |
| 2110 | mddev->size = mddev->array_size; | 2116 | mddev->size = mddev->array_sectors / 2; |
| 2111 | mddev->resync_max_sectors = sectors; | 2117 | mddev->resync_max_sectors = sectors; |
| 2112 | return 0; | 2118 | return 0; |
| 2113 | } | 2119 | } |
| @@ -2131,7 +2137,7 @@ static int raid1_reshape(mddev_t *mddev) | |||
| 2131 | conf_t *conf = mddev_to_conf(mddev); | 2137 | conf_t *conf = mddev_to_conf(mddev); |
| 2132 | int cnt, raid_disks; | 2138 | int cnt, raid_disks; |
| 2133 | unsigned long flags; | 2139 | unsigned long flags; |
| 2134 | int d, d2; | 2140 | int d, d2, err; |
| 2135 | 2141 | ||
| 2136 | /* Cannot change chunk_size, layout, or level */ | 2142 | /* Cannot change chunk_size, layout, or level */ |
| 2137 | if (mddev->chunk_size != mddev->new_chunk || | 2143 | if (mddev->chunk_size != mddev->new_chunk || |
| @@ -2143,7 +2149,9 @@ static int raid1_reshape(mddev_t *mddev) | |||
| 2143 | return -EINVAL; | 2149 | return -EINVAL; |
| 2144 | } | 2150 | } |
| 2145 | 2151 | ||
| 2146 | md_allow_write(mddev); | 2152 | err = md_allow_write(mddev); |
| 2153 | if (err) | ||
| 2154 | return err; | ||
| 2147 | 2155 | ||
| 2148 | raid_disks = mddev->raid_disks + mddev->delta_disks; | 2156 | raid_disks = mddev->raid_disks + mddev->delta_disks; |
| 2149 | 2157 | ||
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 22bb2b1b886d..159535d73567 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c | |||
| @@ -1114,24 +1114,30 @@ static int raid10_spare_active(mddev_t *mddev) | |||
| 1114 | static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) | 1114 | static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) |
| 1115 | { | 1115 | { |
| 1116 | conf_t *conf = mddev->private; | 1116 | conf_t *conf = mddev->private; |
| 1117 | int found = 0; | 1117 | int err = -EEXIST; |
| 1118 | int mirror; | 1118 | int mirror; |
| 1119 | mirror_info_t *p; | 1119 | mirror_info_t *p; |
| 1120 | int first = 0; | ||
| 1121 | int last = mddev->raid_disks - 1; | ||
| 1120 | 1122 | ||
| 1121 | if (mddev->recovery_cp < MaxSector) | 1123 | if (mddev->recovery_cp < MaxSector) |
| 1122 | /* only hot-add to in-sync arrays, as recovery is | 1124 | /* only hot-add to in-sync arrays, as recovery is |
| 1123 | * very different from resync | 1125 | * very different from resync |
| 1124 | */ | 1126 | */ |
| 1125 | return 0; | 1127 | return -EBUSY; |
| 1126 | if (!enough(conf)) | 1128 | if (!enough(conf)) |
| 1127 | return 0; | 1129 | return -EINVAL; |
| 1130 | |||
| 1131 | if (rdev->raid_disk >= 0) | ||
| 1132 | first = last = rdev->raid_disk; | ||
| 1128 | 1133 | ||
| 1129 | if (rdev->saved_raid_disk >= 0 && | 1134 | if (rdev->saved_raid_disk >= 0 && |
| 1135 | rdev->saved_raid_disk >= first && | ||
| 1130 | conf->mirrors[rdev->saved_raid_disk].rdev == NULL) | 1136 | conf->mirrors[rdev->saved_raid_disk].rdev == NULL) |
| 1131 | mirror = rdev->saved_raid_disk; | 1137 | mirror = rdev->saved_raid_disk; |
| 1132 | else | 1138 | else |
| 1133 | mirror = 0; | 1139 | mirror = first; |
| 1134 | for ( ; mirror < mddev->raid_disks; mirror++) | 1140 | for ( ; mirror <= last ; mirror++) |
| 1135 | if ( !(p=conf->mirrors+mirror)->rdev) { | 1141 | if ( !(p=conf->mirrors+mirror)->rdev) { |
| 1136 | 1142 | ||
| 1137 | blk_queue_stack_limits(mddev->queue, | 1143 | blk_queue_stack_limits(mddev->queue, |
| @@ -1146,7 +1152,7 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) | |||
| 1146 | 1152 | ||
| 1147 | p->head_position = 0; | 1153 | p->head_position = 0; |
| 1148 | rdev->raid_disk = mirror; | 1154 | rdev->raid_disk = mirror; |
| 1149 | found = 1; | 1155 | err = 0; |
| 1150 | if (rdev->saved_raid_disk != mirror) | 1156 | if (rdev->saved_raid_disk != mirror) |
| 1151 | conf->fullsync = 1; | 1157 | conf->fullsync = 1; |
| 1152 | rcu_assign_pointer(p->rdev, rdev); | 1158 | rcu_assign_pointer(p->rdev, rdev); |
| @@ -1154,7 +1160,7 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) | |||
| 1154 | } | 1160 | } |
| 1155 | 1161 | ||
| 1156 | print_conf(conf); | 1162 | print_conf(conf); |
| 1157 | return found; | 1163 | return err; |
| 1158 | } | 1164 | } |
| 1159 | 1165 | ||
| 1160 | static int raid10_remove_disk(mddev_t *mddev, int number) | 1166 | static int raid10_remove_disk(mddev_t *mddev, int number) |
| @@ -2159,7 +2165,7 @@ static int run(mddev_t *mddev) | |||
| 2159 | /* | 2165 | /* |
| 2160 | * Ok, everything is just fine now | 2166 | * Ok, everything is just fine now |
| 2161 | */ | 2167 | */ |
| 2162 | mddev->array_size = size << (conf->chunk_shift-1); | 2168 | mddev->array_sectors = size << conf->chunk_shift; |
| 2163 | mddev->resync_max_sectors = size << conf->chunk_shift; | 2169 | mddev->resync_max_sectors = size << conf->chunk_shift; |
| 2164 | 2170 | ||
| 2165 | mddev->queue->unplug_fn = raid10_unplug; | 2171 | mddev->queue->unplug_fn = raid10_unplug; |
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 9ce7154845c6..55e7c56045a0 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c | |||
| @@ -115,15 +115,20 @@ static void return_io(struct bio *return_bi) | |||
| 115 | return_bi = bi->bi_next; | 115 | return_bi = bi->bi_next; |
| 116 | bi->bi_next = NULL; | 116 | bi->bi_next = NULL; |
| 117 | bi->bi_size = 0; | 117 | bi->bi_size = 0; |
| 118 | bi->bi_end_io(bi, | 118 | bio_endio(bi, 0); |
| 119 | test_bit(BIO_UPTODATE, &bi->bi_flags) | ||
| 120 | ? 0 : -EIO); | ||
| 121 | bi = return_bi; | 119 | bi = return_bi; |
| 122 | } | 120 | } |
| 123 | } | 121 | } |
| 124 | 122 | ||
| 125 | static void print_raid5_conf (raid5_conf_t *conf); | 123 | static void print_raid5_conf (raid5_conf_t *conf); |
| 126 | 124 | ||
| 125 | static int stripe_operations_active(struct stripe_head *sh) | ||
| 126 | { | ||
| 127 | return sh->check_state || sh->reconstruct_state || | ||
| 128 | test_bit(STRIPE_BIOFILL_RUN, &sh->state) || | ||
| 129 | test_bit(STRIPE_COMPUTE_RUN, &sh->state); | ||
| 130 | } | ||
| 131 | |||
| 127 | static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh) | 132 | static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh) |
| 128 | { | 133 | { |
| 129 | if (atomic_dec_and_test(&sh->count)) { | 134 | if (atomic_dec_and_test(&sh->count)) { |
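With the ops.pending/ops.ack/ops.complete bookkeeping gone (the machinery is removed wholesale further down), "does this stripe have asynchronous work in flight?" now has exactly one definition, and the quiescence assertions go through it:

    /* Before: three bitmasks to inspect in every assertion. */
    BUG_ON(sh->ops.pending || sh->ops.ack || sh->ops.complete);

    /* After: one predicate over the explicit state fields. */
    BUG_ON(stripe_operations_active(sh));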
| @@ -143,7 +148,7 @@ static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh) | |||
| 143 | } | 148 | } |
| 144 | md_wakeup_thread(conf->mddev->thread); | 149 | md_wakeup_thread(conf->mddev->thread); |
| 145 | } else { | 150 | } else { |
| 146 | BUG_ON(sh->ops.pending); | 151 | BUG_ON(stripe_operations_active(sh)); |
| 147 | if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { | 152 | if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { |
| 148 | atomic_dec(&conf->preread_active_stripes); | 153 | atomic_dec(&conf->preread_active_stripes); |
| 149 | if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) | 154 | if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) |
| @@ -245,7 +250,7 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx, int | |||
| 245 | 250 | ||
| 246 | BUG_ON(atomic_read(&sh->count) != 0); | 251 | BUG_ON(atomic_read(&sh->count) != 0); |
| 247 | BUG_ON(test_bit(STRIPE_HANDLE, &sh->state)); | 252 | BUG_ON(test_bit(STRIPE_HANDLE, &sh->state)); |
| 248 | BUG_ON(sh->ops.pending || sh->ops.ack || sh->ops.complete); | 253 | BUG_ON(stripe_operations_active(sh)); |
| 249 | 254 | ||
| 250 | CHECK_DEVLOCK(); | 255 | CHECK_DEVLOCK(); |
| 251 | pr_debug("init_stripe called, stripe %llu\n", | 256 | pr_debug("init_stripe called, stripe %llu\n", |
| @@ -346,62 +351,18 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector | |||
| 346 | return sh; | 351 | return sh; |
| 347 | } | 352 | } |
| 348 | 353 | ||
| 349 | /* test_and_ack_op() ensures that we only dequeue an operation once */ | ||
| 350 | #define test_and_ack_op(op, pend) \ | ||
| 351 | do { \ | ||
| 352 | if (test_bit(op, &sh->ops.pending) && \ | ||
| 353 | !test_bit(op, &sh->ops.complete)) { \ | ||
| 354 | if (test_and_set_bit(op, &sh->ops.ack)) \ | ||
| 355 | clear_bit(op, &pend); \ | ||
| 356 | else \ | ||
| 357 | ack++; \ | ||
| 358 | } else \ | ||
| 359 | clear_bit(op, &pend); \ | ||
| 360 | } while (0) | ||
| 361 | |||
| 362 | /* find new work to run, do not resubmit work that is already | ||
| 363 | * in flight | ||
| 364 | */ | ||
| 365 | static unsigned long get_stripe_work(struct stripe_head *sh) | ||
| 366 | { | ||
| 367 | unsigned long pending; | ||
| 368 | int ack = 0; | ||
| 369 | |||
| 370 | pending = sh->ops.pending; | ||
| 371 | |||
| 372 | test_and_ack_op(STRIPE_OP_BIOFILL, pending); | ||
| 373 | test_and_ack_op(STRIPE_OP_COMPUTE_BLK, pending); | ||
| 374 | test_and_ack_op(STRIPE_OP_PREXOR, pending); | ||
| 375 | test_and_ack_op(STRIPE_OP_BIODRAIN, pending); | ||
| 376 | test_and_ack_op(STRIPE_OP_POSTXOR, pending); | ||
| 377 | test_and_ack_op(STRIPE_OP_CHECK, pending); | ||
| 378 | if (test_and_clear_bit(STRIPE_OP_IO, &sh->ops.pending)) | ||
| 379 | ack++; | ||
| 380 | |||
| 381 | sh->ops.count -= ack; | ||
| 382 | if (unlikely(sh->ops.count < 0)) { | ||
| 383 | printk(KERN_ERR "pending: %#lx ops.pending: %#lx ops.ack: %#lx " | ||
| 384 | "ops.complete: %#lx\n", pending, sh->ops.pending, | ||
| 385 | sh->ops.ack, sh->ops.complete); | ||
| 386 | BUG(); | ||
| 387 | } | ||
| 388 | |||
| 389 | return pending; | ||
| 390 | } | ||
| 391 | |||
| 392 | static void | 354 | static void |
| 393 | raid5_end_read_request(struct bio *bi, int error); | 355 | raid5_end_read_request(struct bio *bi, int error); |
| 394 | static void | 356 | static void |
| 395 | raid5_end_write_request(struct bio *bi, int error); | 357 | raid5_end_write_request(struct bio *bi, int error); |
| 396 | 358 | ||
| 397 | static void ops_run_io(struct stripe_head *sh) | 359 | static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) |
| 398 | { | 360 | { |
| 399 | raid5_conf_t *conf = sh->raid_conf; | 361 | raid5_conf_t *conf = sh->raid_conf; |
| 400 | int i, disks = sh->disks; | 362 | int i, disks = sh->disks; |
| 401 | 363 | ||
| 402 | might_sleep(); | 364 | might_sleep(); |
| 403 | 365 | ||
| 404 | set_bit(STRIPE_IO_STARTED, &sh->state); | ||
| 405 | for (i = disks; i--; ) { | 366 | for (i = disks; i--; ) { |
| 406 | int rw; | 367 | int rw; |
| 407 | struct bio *bi; | 368 | struct bio *bi; |
| @@ -430,11 +391,11 @@ static void ops_run_io(struct stripe_head *sh) | |||
| 430 | rcu_read_unlock(); | 391 | rcu_read_unlock(); |
| 431 | 392 | ||
| 432 | if (rdev) { | 393 | if (rdev) { |
| 433 | if (test_bit(STRIPE_SYNCING, &sh->state) || | 394 | if (s->syncing || s->expanding || s->expanded) |
| 434 | test_bit(STRIPE_EXPAND_SOURCE, &sh->state) || | ||
| 435 | test_bit(STRIPE_EXPAND_READY, &sh->state)) | ||
| 436 | md_sync_acct(rdev->bdev, STRIPE_SECTORS); | 395 | md_sync_acct(rdev->bdev, STRIPE_SECTORS); |
| 437 | 396 | ||
| 397 | set_bit(STRIPE_IO_STARTED, &sh->state); | ||
| 398 | |||
| 438 | bi->bi_bdev = rdev->bdev; | 399 | bi->bi_bdev = rdev->bdev; |
| 439 | pr_debug("%s: for %llu schedule op %ld on disc %d\n", | 400 | pr_debug("%s: for %llu schedule op %ld on disc %d\n", |
| 440 | __func__, (unsigned long long)sh->sector, | 401 | __func__, (unsigned long long)sh->sector, |
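
The hunk above deletes the pending/ack/complete dispatch protocol outright. What test_and_ack_op() used to guarantee, in miniature: an op queued in ops.pending had to be dispatched exactly once, so a dispatcher seeing the bit already acked had to back off. With a per-pass ops_request mask there is exactly one dispatcher per handle_stripe() pass and the whole ack layer disappears. A sketch of the removed logic (simplified, non-atomic stand-ins for the bit operations):

#include <stdbool.h>

struct old_ops { unsigned long pending, ack, complete; };

static bool old_should_dispatch(struct old_ops *o, int op)
{
	unsigned long bit = 1UL << op;

	if ((o->pending & bit) && !(o->complete & bit)) {
		if (o->ack & bit)
			return false;	/* already taken on an earlier pass */
		o->ack |= bit;
		return true;		/* this caller dispatches it */
	}
	return false;			/* nothing queued, or already done */
}

ops_run_io() also stops being a queued op: it now receives the stripe_head_state snapshot and tests s->syncing/s->expanding/s->expanded directly, and STRIPE_IO_STARTED moves inside the rdev check so the flag is only set when a request will really be issued.
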
| @@ -528,38 +489,34 @@ static void ops_complete_biofill(void *stripe_head_ref) | |||
| 528 | (unsigned long long)sh->sector); | 489 | (unsigned long long)sh->sector); |
| 529 | 490 | ||
| 530 | /* clear completed biofills */ | 491 | /* clear completed biofills */ |
| 492 | spin_lock_irq(&conf->device_lock); | ||
| 531 | for (i = sh->disks; i--; ) { | 493 | for (i = sh->disks; i--; ) { |
| 532 | struct r5dev *dev = &sh->dev[i]; | 494 | struct r5dev *dev = &sh->dev[i]; |
| 533 | 495 | ||
| 534 | /* acknowledge completion of a biofill operation */ | 496 | /* acknowledge completion of a biofill operation */ |
| 535 | /* and check if we need to reply to a read request, | 497 | /* and check if we need to reply to a read request, |
| 536 | * new R5_Wantfill requests are held off until | 498 | * new R5_Wantfill requests are held off until |
| 537 | * !test_bit(STRIPE_OP_BIOFILL, &sh->ops.pending) | 499 | * !STRIPE_BIOFILL_RUN |
| 538 | */ | 500 | */ |
| 539 | if (test_and_clear_bit(R5_Wantfill, &dev->flags)) { | 501 | if (test_and_clear_bit(R5_Wantfill, &dev->flags)) { |
| 540 | struct bio *rbi, *rbi2; | 502 | struct bio *rbi, *rbi2; |
| 541 | 503 | ||
| 542 | /* The access to dev->read is outside of the | ||
| 543 | * spin_lock_irq(&conf->device_lock), but is protected | ||
| 544 | * by the STRIPE_OP_BIOFILL pending bit | ||
| 545 | */ | ||
| 546 | BUG_ON(!dev->read); | 504 | BUG_ON(!dev->read); |
| 547 | rbi = dev->read; | 505 | rbi = dev->read; |
| 548 | dev->read = NULL; | 506 | dev->read = NULL; |
| 549 | while (rbi && rbi->bi_sector < | 507 | while (rbi && rbi->bi_sector < |
| 550 | dev->sector + STRIPE_SECTORS) { | 508 | dev->sector + STRIPE_SECTORS) { |
| 551 | rbi2 = r5_next_bio(rbi, dev->sector); | 509 | rbi2 = r5_next_bio(rbi, dev->sector); |
| 552 | spin_lock_irq(&conf->device_lock); | ||
| 553 | if (--rbi->bi_phys_segments == 0) { | 510 | if (--rbi->bi_phys_segments == 0) { |
| 554 | rbi->bi_next = return_bi; | 511 | rbi->bi_next = return_bi; |
| 555 | return_bi = rbi; | 512 | return_bi = rbi; |
| 556 | } | 513 | } |
| 557 | spin_unlock_irq(&conf->device_lock); | ||
| 558 | rbi = rbi2; | 514 | rbi = rbi2; |
| 559 | } | 515 | } |
| 560 | } | 516 | } |
| 561 | } | 517 | } |
| 562 | set_bit(STRIPE_OP_BIOFILL, &sh->ops.complete); | 518 | spin_unlock_irq(&conf->device_lock); |
| 519 | clear_bit(STRIPE_BIOFILL_RUN, &sh->state); | ||
| 563 | 520 | ||
| 564 | return_io(return_bi); | 521 | return_io(return_bi); |
| 565 | 522 | ||
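
Two changes land in ops_complete_biofill() above: completion is signalled by clearing the STRIPE_BIOFILL_RUN state bit rather than setting ops.complete, and device_lock is now taken once around the whole walk instead of per bio, which is what lets the removed comment about dev->read being accessed "outside the lock" go away. The locking pattern, modelled in userspace (a pthread mutex stands in for the kernel spinlock; types are illustrative):

#include <pthread.h>
#include <stddef.h>

static pthread_mutex_t device_lock = PTHREAD_MUTEX_INITIALIZER;

struct bio_model { struct bio_model *next; int segments; };

static struct bio_model *complete_reads(struct bio_model *head,
					struct bio_model *return_list)
{
	pthread_mutex_lock(&device_lock);	/* one lock for the whole batch */
	while (head) {
		struct bio_model *next = head->next;
		if (--head->segments == 0) {	/* last reference: hand it back */
			head->next = return_list;
			return_list = head;
		}
		head = next;
	}
	pthread_mutex_unlock(&device_lock);
	return return_list;			/* completed outside the lock */
}
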
| @@ -610,13 +567,14 @@ static void ops_complete_compute5(void *stripe_head_ref) | |||
| 610 | set_bit(R5_UPTODATE, &tgt->flags); | 567 | set_bit(R5_UPTODATE, &tgt->flags); |
| 611 | BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); | 568 | BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); |
| 612 | clear_bit(R5_Wantcompute, &tgt->flags); | 569 | clear_bit(R5_Wantcompute, &tgt->flags); |
| 613 | set_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete); | 570 | clear_bit(STRIPE_COMPUTE_RUN, &sh->state); |
| 571 | if (sh->check_state == check_state_compute_run) | ||
| 572 | sh->check_state = check_state_compute_result; | ||
| 614 | set_bit(STRIPE_HANDLE, &sh->state); | 573 | set_bit(STRIPE_HANDLE, &sh->state); |
| 615 | release_stripe(sh); | 574 | release_stripe(sh); |
| 616 | } | 575 | } |
| 617 | 576 | ||
| 618 | static struct dma_async_tx_descriptor * | 577 | static struct dma_async_tx_descriptor *ops_run_compute5(struct stripe_head *sh) |
| 619 | ops_run_compute5(struct stripe_head *sh, unsigned long pending) | ||
| 620 | { | 578 | { |
| 621 | /* kernel stack size limits the total number of disks */ | 579 | /* kernel stack size limits the total number of disks */ |
| 622 | int disks = sh->disks; | 580 | int disks = sh->disks; |
| @@ -646,10 +604,6 @@ ops_run_compute5(struct stripe_head *sh, unsigned long pending) | |||
| 646 | ASYNC_TX_XOR_ZERO_DST, NULL, | 604 | ASYNC_TX_XOR_ZERO_DST, NULL, |
| 647 | ops_complete_compute5, sh); | 605 | ops_complete_compute5, sh); |
| 648 | 606 | ||
| 649 | /* ack now if postxor is not set to be run */ | ||
| 650 | if (tx && !test_bit(STRIPE_OP_POSTXOR, &pending)) | ||
| 651 | async_tx_ack(tx); | ||
| 652 | |||
| 653 | return tx; | 607 | return tx; |
| 654 | } | 608 | } |
| 655 | 609 | ||
| @@ -659,8 +613,6 @@ static void ops_complete_prexor(void *stripe_head_ref) | |||
| 659 | 613 | ||
| 660 | pr_debug("%s: stripe %llu\n", __func__, | 614 | pr_debug("%s: stripe %llu\n", __func__, |
| 661 | (unsigned long long)sh->sector); | 615 | (unsigned long long)sh->sector); |
| 662 | |||
| 663 | set_bit(STRIPE_OP_PREXOR, &sh->ops.complete); | ||
| 664 | } | 616 | } |
| 665 | 617 | ||
| 666 | static struct dma_async_tx_descriptor * | 618 | static struct dma_async_tx_descriptor * |
| @@ -680,7 +632,7 @@ ops_run_prexor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) | |||
| 680 | for (i = disks; i--; ) { | 632 | for (i = disks; i--; ) { |
| 681 | struct r5dev *dev = &sh->dev[i]; | 633 | struct r5dev *dev = &sh->dev[i]; |
| 682 | /* Only process blocks that are known to be uptodate */ | 634 | /* Only process blocks that are known to be uptodate */ |
| 683 | if (dev->towrite && test_bit(R5_Wantprexor, &dev->flags)) | 635 | if (test_bit(R5_Wantdrain, &dev->flags)) |
| 684 | xor_srcs[count++] = dev->page; | 636 | xor_srcs[count++] = dev->page; |
| 685 | } | 637 | } |
| 686 | 638 | ||
| @@ -692,16 +644,10 @@ ops_run_prexor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) | |||
| 692 | } | 644 | } |
| 693 | 645 | ||
| 694 | static struct dma_async_tx_descriptor * | 646 | static struct dma_async_tx_descriptor * |
| 695 | ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx, | 647 | ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) |
| 696 | unsigned long pending) | ||
| 697 | { | 648 | { |
| 698 | int disks = sh->disks; | 649 | int disks = sh->disks; |
| 699 | int pd_idx = sh->pd_idx, i; | 650 | int i; |
| 700 | |||
| 701 | /* check if prexor is active which means only process blocks | ||
| 702 | * that are part of a read-modify-write (Wantprexor) | ||
| 703 | */ | ||
| 704 | int prexor = test_bit(STRIPE_OP_PREXOR, &pending); | ||
| 705 | 651 | ||
| 706 | pr_debug("%s: stripe %llu\n", __func__, | 652 | pr_debug("%s: stripe %llu\n", __func__, |
| 707 | (unsigned long long)sh->sector); | 653 | (unsigned long long)sh->sector); |
| @@ -709,20 +655,8 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx, | |||
| 709 | for (i = disks; i--; ) { | 655 | for (i = disks; i--; ) { |
| 710 | struct r5dev *dev = &sh->dev[i]; | 656 | struct r5dev *dev = &sh->dev[i]; |
| 711 | struct bio *chosen; | 657 | struct bio *chosen; |
| 712 | int towrite; | ||
| 713 | |||
| 714 | towrite = 0; | ||
| 715 | if (prexor) { /* rmw */ | ||
| 716 | if (dev->towrite && | ||
| 717 | test_bit(R5_Wantprexor, &dev->flags)) | ||
| 718 | towrite = 1; | ||
| 719 | } else { /* rcw */ | ||
| 720 | if (i != pd_idx && dev->towrite && | ||
| 721 | test_bit(R5_LOCKED, &dev->flags)) | ||
| 722 | towrite = 1; | ||
| 723 | } | ||
| 724 | 658 | ||
| 725 | if (towrite) { | 659 | if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) { |
| 726 | struct bio *wbi; | 660 | struct bio *wbi; |
| 727 | 661 | ||
| 728 | spin_lock(&sh->lock); | 662 | spin_lock(&sh->lock); |
| @@ -747,18 +681,6 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx, | |||
| 747 | static void ops_complete_postxor(void *stripe_head_ref) | 681 | static void ops_complete_postxor(void *stripe_head_ref) |
| 748 | { | 682 | { |
| 749 | struct stripe_head *sh = stripe_head_ref; | 683 | struct stripe_head *sh = stripe_head_ref; |
| 750 | |||
| 751 | pr_debug("%s: stripe %llu\n", __func__, | ||
| 752 | (unsigned long long)sh->sector); | ||
| 753 | |||
| 754 | set_bit(STRIPE_OP_POSTXOR, &sh->ops.complete); | ||
| 755 | set_bit(STRIPE_HANDLE, &sh->state); | ||
| 756 | release_stripe(sh); | ||
| 757 | } | ||
| 758 | |||
| 759 | static void ops_complete_write(void *stripe_head_ref) | ||
| 760 | { | ||
| 761 | struct stripe_head *sh = stripe_head_ref; | ||
| 762 | int disks = sh->disks, i, pd_idx = sh->pd_idx; | 684 | int disks = sh->disks, i, pd_idx = sh->pd_idx; |
| 763 | 685 | ||
| 764 | pr_debug("%s: stripe %llu\n", __func__, | 686 | pr_debug("%s: stripe %llu\n", __func__, |
| @@ -770,16 +692,21 @@ static void ops_complete_write(void *stripe_head_ref) | |||
| 770 | set_bit(R5_UPTODATE, &dev->flags); | 692 | set_bit(R5_UPTODATE, &dev->flags); |
| 771 | } | 693 | } |
| 772 | 694 | ||
| 773 | set_bit(STRIPE_OP_BIODRAIN, &sh->ops.complete); | 695 | if (sh->reconstruct_state == reconstruct_state_drain_run) |
| 774 | set_bit(STRIPE_OP_POSTXOR, &sh->ops.complete); | 696 | sh->reconstruct_state = reconstruct_state_drain_result; |
| 697 | else if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) | ||
| 698 | sh->reconstruct_state = reconstruct_state_prexor_drain_result; | ||
| 699 | else { | ||
| 700 | BUG_ON(sh->reconstruct_state != reconstruct_state_run); | ||
| 701 | sh->reconstruct_state = reconstruct_state_result; | ||
| 702 | } | ||
| 775 | 703 | ||
| 776 | set_bit(STRIPE_HANDLE, &sh->state); | 704 | set_bit(STRIPE_HANDLE, &sh->state); |
| 777 | release_stripe(sh); | 705 | release_stripe(sh); |
| 778 | } | 706 | } |
| 779 | 707 | ||
| 780 | static void | 708 | static void |
| 781 | ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx, | 709 | ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) |
| 782 | unsigned long pending) | ||
| 783 | { | 710 | { |
| 784 | /* kernel stack size limits the total number of disks */ | 711 | /* kernel stack size limits the total number of disks */ |
| 785 | int disks = sh->disks; | 712 | int disks = sh->disks; |
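
ops_complete_write() is folded into ops_complete_postxor() above; instead of setting two completion bits, the single callback advances whichever reconstruct run was active to its *_result state. The mapping, written out (enum names follow the patch; the function is an illustrative model — the kernel does this inline and BUG()s on the default case):

enum reconstruct_states {
	reconstruct_state_idle,
	reconstruct_state_run,
	reconstruct_state_result,
	reconstruct_state_drain_run,
	reconstruct_state_drain_result,
	reconstruct_state_prexor_drain_run,
	reconstruct_state_prexor_drain_result,
};

static enum reconstruct_states postxor_done(enum reconstruct_states st)
{
	switch (st) {
	case reconstruct_state_drain_run:	/* normal write */
		return reconstruct_state_drain_result;
	case reconstruct_state_prexor_drain_run: /* read-modify-write */
		return reconstruct_state_prexor_drain_result;
	case reconstruct_state_run:		/* expand / parity-only */
		return reconstruct_state_result;
	default:
		return st;	/* kernel: BUG() -- completion with no run active */
	}
}
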
| @@ -787,9 +714,8 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx, | |||
| 787 | 714 | ||
| 788 | int count = 0, pd_idx = sh->pd_idx, i; | 715 | int count = 0, pd_idx = sh->pd_idx, i; |
| 789 | struct page *xor_dest; | 716 | struct page *xor_dest; |
| 790 | int prexor = test_bit(STRIPE_OP_PREXOR, &pending); | 717 | int prexor = 0; |
| 791 | unsigned long flags; | 718 | unsigned long flags; |
| 792 | dma_async_tx_callback callback; | ||
| 793 | 719 | ||
| 794 | pr_debug("%s: stripe %llu\n", __func__, | 720 | pr_debug("%s: stripe %llu\n", __func__, |
| 795 | (unsigned long long)sh->sector); | 721 | (unsigned long long)sh->sector); |
| @@ -797,7 +723,8 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx, | |||
| 797 | /* check if prexor is active which means only process blocks | 723 | /* check if prexor is active which means only process blocks |
| 798 | * that are part of a read-modify-write (written) | 724 | * that are part of a read-modify-write (written) |
| 799 | */ | 725 | */ |
| 800 | if (prexor) { | 726 | if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) { |
| 727 | prexor = 1; | ||
| 801 | xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; | 728 | xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; |
| 802 | for (i = disks; i--; ) { | 729 | for (i = disks; i--; ) { |
| 803 | struct r5dev *dev = &sh->dev[i]; | 730 | struct r5dev *dev = &sh->dev[i]; |
| @@ -813,10 +740,6 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx, | |||
| 813 | } | 740 | } |
| 814 | } | 741 | } |
| 815 | 742 | ||
| 816 | /* check whether this postxor is part of a write */ | ||
| 817 | callback = test_bit(STRIPE_OP_BIODRAIN, &pending) ? | ||
| 818 | ops_complete_write : ops_complete_postxor; | ||
| 819 | |||
| 820 | /* 1/ if we prexor'd then the dest is reused as a source | 743 | /* 1/ if we prexor'd then the dest is reused as a source |
| 821 | * 2/ if we did not prexor then we are redoing the parity | 744 | * 2/ if we did not prexor then we are redoing the parity |
| 822 | * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST | 745 | * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST |
| @@ -830,25 +753,20 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx, | |||
| 830 | if (unlikely(count == 1)) { | 753 | if (unlikely(count == 1)) { |
| 831 | flags &= ~(ASYNC_TX_XOR_DROP_DST | ASYNC_TX_XOR_ZERO_DST); | 754 | flags &= ~(ASYNC_TX_XOR_DROP_DST | ASYNC_TX_XOR_ZERO_DST); |
| 832 | tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, | 755 | tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, |
| 833 | flags, tx, callback, sh); | 756 | flags, tx, ops_complete_postxor, sh); |
| 834 | } else | 757 | } else |
| 835 | tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, | 758 | tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, |
| 836 | flags, tx, callback, sh); | 759 | flags, tx, ops_complete_postxor, sh); |
| 837 | } | 760 | } |
| 838 | 761 | ||
| 839 | static void ops_complete_check(void *stripe_head_ref) | 762 | static void ops_complete_check(void *stripe_head_ref) |
| 840 | { | 763 | { |
| 841 | struct stripe_head *sh = stripe_head_ref; | 764 | struct stripe_head *sh = stripe_head_ref; |
| 842 | int pd_idx = sh->pd_idx; | ||
| 843 | 765 | ||
| 844 | pr_debug("%s: stripe %llu\n", __func__, | 766 | pr_debug("%s: stripe %llu\n", __func__, |
| 845 | (unsigned long long)sh->sector); | 767 | (unsigned long long)sh->sector); |
| 846 | 768 | ||
| 847 | if (test_and_clear_bit(STRIPE_OP_MOD_DMA_CHECK, &sh->ops.pending) && | 769 | sh->check_state = check_state_check_result; |
| 848 | sh->ops.zero_sum_result == 0) | ||
| 849 | set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); | ||
| 850 | |||
| 851 | set_bit(STRIPE_OP_CHECK, &sh->ops.complete); | ||
| 852 | set_bit(STRIPE_HANDLE, &sh->state); | 770 | set_bit(STRIPE_HANDLE, &sh->state); |
| 853 | release_stripe(sh); | 771 | release_stripe(sh); |
| 854 | } | 772 | } |
| @@ -875,46 +793,42 @@ static void ops_run_check(struct stripe_head *sh) | |||
| 875 | tx = async_xor_zero_sum(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, | 793 | tx = async_xor_zero_sum(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, |
| 876 | &sh->ops.zero_sum_result, 0, NULL, NULL, NULL); | 794 | &sh->ops.zero_sum_result, 0, NULL, NULL, NULL); |
| 877 | 795 | ||
| 878 | if (tx) | ||
| 879 | set_bit(STRIPE_OP_MOD_DMA_CHECK, &sh->ops.pending); | ||
| 880 | else | ||
| 881 | clear_bit(STRIPE_OP_MOD_DMA_CHECK, &sh->ops.pending); | ||
| 882 | |||
| 883 | atomic_inc(&sh->count); | 796 | atomic_inc(&sh->count); |
| 884 | tx = async_trigger_callback(ASYNC_TX_DEP_ACK | ASYNC_TX_ACK, tx, | 797 | tx = async_trigger_callback(ASYNC_TX_DEP_ACK | ASYNC_TX_ACK, tx, |
| 885 | ops_complete_check, sh); | 798 | ops_complete_check, sh); |
| 886 | } | 799 | } |
| 887 | 800 | ||
| 888 | static void raid5_run_ops(struct stripe_head *sh, unsigned long pending) | 801 | static void raid5_run_ops(struct stripe_head *sh, unsigned long ops_request) |
| 889 | { | 802 | { |
| 890 | int overlap_clear = 0, i, disks = sh->disks; | 803 | int overlap_clear = 0, i, disks = sh->disks; |
| 891 | struct dma_async_tx_descriptor *tx = NULL; | 804 | struct dma_async_tx_descriptor *tx = NULL; |
| 892 | 805 | ||
| 893 | if (test_bit(STRIPE_OP_BIOFILL, &pending)) { | 806 | if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) { |
| 894 | ops_run_biofill(sh); | 807 | ops_run_biofill(sh); |
| 895 | overlap_clear++; | 808 | overlap_clear++; |
| 896 | } | 809 | } |
| 897 | 810 | ||
| 898 | if (test_bit(STRIPE_OP_COMPUTE_BLK, &pending)) | 811 | if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) { |
| 899 | tx = ops_run_compute5(sh, pending); | 812 | tx = ops_run_compute5(sh); |
| 813 | /* terminate the chain if postxor is not set to be run */ | ||
| 814 | if (tx && !test_bit(STRIPE_OP_POSTXOR, &ops_request)) | ||
| 815 | async_tx_ack(tx); | ||
| 816 | } | ||
| 900 | 817 | ||
| 901 | if (test_bit(STRIPE_OP_PREXOR, &pending)) | 818 | if (test_bit(STRIPE_OP_PREXOR, &ops_request)) |
| 902 | tx = ops_run_prexor(sh, tx); | 819 | tx = ops_run_prexor(sh, tx); |
| 903 | 820 | ||
| 904 | if (test_bit(STRIPE_OP_BIODRAIN, &pending)) { | 821 | if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) { |
| 905 | tx = ops_run_biodrain(sh, tx, pending); | 822 | tx = ops_run_biodrain(sh, tx); |
| 906 | overlap_clear++; | 823 | overlap_clear++; |
| 907 | } | 824 | } |
| 908 | 825 | ||
| 909 | if (test_bit(STRIPE_OP_POSTXOR, &pending)) | 826 | if (test_bit(STRIPE_OP_POSTXOR, &ops_request)) |
| 910 | ops_run_postxor(sh, tx, pending); | 827 | ops_run_postxor(sh, tx); |
| 911 | 828 | ||
| 912 | if (test_bit(STRIPE_OP_CHECK, &pending)) | 829 | if (test_bit(STRIPE_OP_CHECK, &ops_request)) |
| 913 | ops_run_check(sh); | 830 | ops_run_check(sh); |
| 914 | 831 | ||
| 915 | if (test_bit(STRIPE_OP_IO, &pending)) | ||
| 916 | ops_run_io(sh); | ||
| 917 | |||
| 918 | if (overlap_clear) | 832 | if (overlap_clear) |
| 919 | for (i = disks; i--; ) { | 833 | for (i = disks; i--; ) { |
| 920 | struct r5dev *dev = &sh->dev[i]; | 834 | struct r5dev *dev = &sh->dev[i]; |
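
raid5_run_ops() now consumes a caller-built ops_request mask instead of the stripe-resident pending mask, and the async_tx_ack() of a compute descriptor moves here from ops_run_compute5(), where the dispatcher can see whether a postxor is also requested to chain off it. A runnable model of the dispatch order (flag names follow the patch; everything else is a stand-in):

#include <stdio.h>

enum {
	STRIPE_OP_BIOFILL,
	STRIPE_OP_COMPUTE_BLK,
	STRIPE_OP_PREXOR,
	STRIPE_OP_BIODRAIN,
	STRIPE_OP_POSTXOR,
	STRIPE_OP_CHECK,
};

static void run_ops_model(unsigned long ops_request)
{
	static const char *names[] = {
		"biofill", "compute", "prexor", "biodrain", "postxor", "check",
	};
	/* fixed order, as in the patch: a compute result may feed the
	 * prexor/biodrain/postxor chain that follows it */
	for (int op = STRIPE_OP_BIOFILL; op <= STRIPE_OP_CHECK; op++)
		if (ops_request & (1UL << op))
			printf("running %s\n", names[op]);
}

int main(void)
{
	/* e.g. a read-modify-write pass requests prexor+biodrain+postxor */
	run_ops_model((1UL << STRIPE_OP_PREXOR) |
		      (1UL << STRIPE_OP_BIODRAIN) |
		      (1UL << STRIPE_OP_POSTXOR));
	return 0;
}

Note also that STRIPE_OP_IO is gone from the dispatcher: I/O submission is no longer an op at all, it happens on every handle_stripe() pass.
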
| @@ -997,14 +911,16 @@ static int resize_stripes(raid5_conf_t *conf, int newsize) | |||
| 997 | struct stripe_head *osh, *nsh; | 911 | struct stripe_head *osh, *nsh; |
| 998 | LIST_HEAD(newstripes); | 912 | LIST_HEAD(newstripes); |
| 999 | struct disk_info *ndisks; | 913 | struct disk_info *ndisks; |
| 1000 | int err = 0; | 914 | int err; |
| 1001 | struct kmem_cache *sc; | 915 | struct kmem_cache *sc; |
| 1002 | int i; | 916 | int i; |
| 1003 | 917 | ||
| 1004 | if (newsize <= conf->pool_size) | 918 | if (newsize <= conf->pool_size) |
| 1005 | return 0; /* never bother to shrink */ | 919 | return 0; /* never bother to shrink */ |
| 1006 | 920 | ||
| 1007 | md_allow_write(conf->mddev); | 921 | err = md_allow_write(conf->mddev); |
| 922 | if (err) | ||
| 923 | return err; | ||
| 1008 | 924 | ||
| 1009 | /* Step 1 */ | 925 | /* Step 1 */ |
| 1010 | sc = kmem_cache_create(conf->cache_name[1-conf->active_name], | 926 | sc = kmem_cache_create(conf->cache_name[1-conf->active_name], |
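
resize_stripes() stops ignoring md_allow_write(): the helper can now fail (plausibly when the array metadata is managed externally and the transition cannot be made synchronously — the exact errno is not shown in this hunk), so the error must be propagated before any allocation work starts. The calling pattern, sketched with a hypothetical stand-in:

static int md_allow_write_model(void)
{
	return 0;	/* hypothetical: 0, or a negative errno on failure */
}

static int resize_model(int newsize, int pool_size)
{
	int err;

	if (newsize <= pool_size)
		return 0;		/* never bother to shrink */

	err = md_allow_write_model();
	if (err)
		return err;		/* previously this was silently dropped */

	/* ... Step 1..3 of the actual resize follow here ... */
	return 0;
}
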
| @@ -1703,11 +1619,11 @@ static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2) | |||
| 1703 | } | 1619 | } |
| 1704 | } | 1620 | } |
| 1705 | 1621 | ||
| 1706 | static int | 1622 | static void |
| 1707 | handle_write_operations5(struct stripe_head *sh, int rcw, int expand) | 1623 | schedule_reconstruction5(struct stripe_head *sh, struct stripe_head_state *s, |
| 1624 | int rcw, int expand) | ||
| 1708 | { | 1625 | { |
| 1709 | int i, pd_idx = sh->pd_idx, disks = sh->disks; | 1626 | int i, pd_idx = sh->pd_idx, disks = sh->disks; |
| 1710 | int locked = 0; | ||
| 1711 | 1627 | ||
| 1712 | if (rcw) { | 1628 | if (rcw) { |
| 1713 | /* if we are not expanding this is a proper write request, and | 1629 | /* if we are not expanding this is a proper write request, and |
| @@ -1715,53 +1631,48 @@ handle_write_operations5(struct stripe_head *sh, int rcw, int expand) | |||
| 1715 | * stripe cache | 1631 | * stripe cache |
| 1716 | */ | 1632 | */ |
| 1717 | if (!expand) { | 1633 | if (!expand) { |
| 1718 | set_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending); | 1634 | sh->reconstruct_state = reconstruct_state_drain_run; |
| 1719 | sh->ops.count++; | 1635 | set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); |
| 1720 | } | 1636 | } else |
| 1637 | sh->reconstruct_state = reconstruct_state_run; | ||
| 1721 | 1638 | ||
| 1722 | set_bit(STRIPE_OP_POSTXOR, &sh->ops.pending); | 1639 | set_bit(STRIPE_OP_POSTXOR, &s->ops_request); |
| 1723 | sh->ops.count++; | ||
| 1724 | 1640 | ||
| 1725 | for (i = disks; i--; ) { | 1641 | for (i = disks; i--; ) { |
| 1726 | struct r5dev *dev = &sh->dev[i]; | 1642 | struct r5dev *dev = &sh->dev[i]; |
| 1727 | 1643 | ||
| 1728 | if (dev->towrite) { | 1644 | if (dev->towrite) { |
| 1729 | set_bit(R5_LOCKED, &dev->flags); | 1645 | set_bit(R5_LOCKED, &dev->flags); |
| 1646 | set_bit(R5_Wantdrain, &dev->flags); | ||
| 1730 | if (!expand) | 1647 | if (!expand) |
| 1731 | clear_bit(R5_UPTODATE, &dev->flags); | 1648 | clear_bit(R5_UPTODATE, &dev->flags); |
| 1732 | locked++; | 1649 | s->locked++; |
| 1733 | } | 1650 | } |
| 1734 | } | 1651 | } |
| 1735 | if (locked + 1 == disks) | 1652 | if (s->locked + 1 == disks) |
| 1736 | if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state)) | 1653 | if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state)) |
| 1737 | atomic_inc(&sh->raid_conf->pending_full_writes); | 1654 | atomic_inc(&sh->raid_conf->pending_full_writes); |
| 1738 | } else { | 1655 | } else { |
| 1739 | BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) || | 1656 | BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) || |
| 1740 | test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags))); | 1657 | test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags))); |
| 1741 | 1658 | ||
| 1742 | set_bit(STRIPE_OP_PREXOR, &sh->ops.pending); | 1659 | sh->reconstruct_state = reconstruct_state_prexor_drain_run; |
| 1743 | set_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending); | 1660 | set_bit(STRIPE_OP_PREXOR, &s->ops_request); |
| 1744 | set_bit(STRIPE_OP_POSTXOR, &sh->ops.pending); | 1661 | set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); |
| 1745 | 1662 | set_bit(STRIPE_OP_POSTXOR, &s->ops_request); | |
| 1746 | sh->ops.count += 3; | ||
| 1747 | 1663 | ||
| 1748 | for (i = disks; i--; ) { | 1664 | for (i = disks; i--; ) { |
| 1749 | struct r5dev *dev = &sh->dev[i]; | 1665 | struct r5dev *dev = &sh->dev[i]; |
| 1750 | if (i == pd_idx) | 1666 | if (i == pd_idx) |
| 1751 | continue; | 1667 | continue; |
| 1752 | 1668 | ||
| 1753 | /* For a read-modify write there may be blocks that are | ||
| 1754 | * locked for reading while others are ready to be | ||
| 1755 | * written so we distinguish these blocks by the | ||
| 1756 | * R5_Wantprexor bit | ||
| 1757 | */ | ||
| 1758 | if (dev->towrite && | 1669 | if (dev->towrite && |
| 1759 | (test_bit(R5_UPTODATE, &dev->flags) || | 1670 | (test_bit(R5_UPTODATE, &dev->flags) || |
| 1760 | test_bit(R5_Wantcompute, &dev->flags))) { | 1671 | test_bit(R5_Wantcompute, &dev->flags))) { |
| 1761 | set_bit(R5_Wantprexor, &dev->flags); | 1672 | set_bit(R5_Wantdrain, &dev->flags); |
| 1762 | set_bit(R5_LOCKED, &dev->flags); | 1673 | set_bit(R5_LOCKED, &dev->flags); |
| 1763 | clear_bit(R5_UPTODATE, &dev->flags); | 1674 | clear_bit(R5_UPTODATE, &dev->flags); |
| 1764 | locked++; | 1675 | s->locked++; |
| 1765 | } | 1676 | } |
| 1766 | } | 1677 | } |
| 1767 | } | 1678 | } |
| @@ -1771,13 +1682,11 @@ handle_write_operations5(struct stripe_head *sh, int rcw, int expand) | |||
| 1771 | */ | 1682 | */ |
| 1772 | set_bit(R5_LOCKED, &sh->dev[pd_idx].flags); | 1683 | set_bit(R5_LOCKED, &sh->dev[pd_idx].flags); |
| 1773 | clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); | 1684 | clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); |
| 1774 | locked++; | 1685 | s->locked++; |
| 1775 | 1686 | ||
| 1776 | pr_debug("%s: stripe %llu locked: %d pending: %lx\n", | 1687 | pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n", |
| 1777 | __func__, (unsigned long long)sh->sector, | 1688 | __func__, (unsigned long long)sh->sector, |
| 1778 | locked, sh->ops.pending); | 1689 | s->locked, s->ops_request); |
| 1779 | |||
| 1780 | return locked; | ||
| 1781 | } | 1690 | } |
| 1782 | 1691 | ||
| 1783 | /* | 1692 | /* |
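
handle_write_operations5() becomes schedule_reconstruction5() above: rather than returning a count of newly locked blocks, it records the chosen run in sh->reconstruct_state, sets the ops to request in s->ops_request, and bumps s->locked in place. The schedules it can pick, modelled compactly (enum and bit values are illustrative; the per-device R5_Wantdrain/R5_LOCKED marking is elided):

struct sched_model { unsigned long ops_request; int locked; };

enum { RS_IDLE, RS_RUN, RS_DRAIN_RUN, RS_PREXOR_DRAIN_RUN };
enum { OP_PREXOR, OP_BIODRAIN, OP_POSTXOR };

static void schedule_reconstruction_model(int *reconstruct_state,
					  struct sched_model *s,
					  int rcw, int expand)
{
	if (rcw) {		/* reconstruct-write (or expand) */
		*reconstruct_state = expand ? RS_RUN : RS_DRAIN_RUN;
		if (!expand)
			s->ops_request |= 1UL << OP_BIODRAIN;
		s->ops_request |= 1UL << OP_POSTXOR;
	} else {		/* read-modify-write */
		*reconstruct_state = RS_PREXOR_DRAIN_RUN;
		s->ops_request |= (1UL << OP_PREXOR) |
				  (1UL << OP_BIODRAIN) |
				  (1UL << OP_POSTXOR);
	}
	s->locked++;		/* the parity block is always locked */
}

The R5_Wantprexor flag also dies in this hunk: both the rmw and rcw paths now mark drainable blocks with a single R5_Wantdrain bit, and ops_run_biodrain() simply test-and-clears it.
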
| @@ -1876,7 +1785,7 @@ static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int disks) | |||
| 1876 | } | 1785 | } |
| 1877 | 1786 | ||
| 1878 | static void | 1787 | static void |
| 1879 | handle_requests_to_failed_array(raid5_conf_t *conf, struct stripe_head *sh, | 1788 | handle_failed_stripe(raid5_conf_t *conf, struct stripe_head *sh, |
| 1880 | struct stripe_head_state *s, int disks, | 1789 | struct stripe_head_state *s, int disks, |
| 1881 | struct bio **return_bi) | 1790 | struct bio **return_bi) |
| 1882 | { | 1791 | { |
| @@ -1967,48 +1876,38 @@ handle_requests_to_failed_array(raid5_conf_t *conf, struct stripe_head *sh, | |||
| 1967 | md_wakeup_thread(conf->mddev->thread); | 1876 | md_wakeup_thread(conf->mddev->thread); |
| 1968 | } | 1877 | } |
| 1969 | 1878 | ||
| 1970 | /* __handle_issuing_new_read_requests5 - returns 0 if there are no more disks | 1879 | /* fetch_block5 - checks the given member device to see if its data needs |
| 1971 | * to process | 1880 | * to be read or computed to satisfy a request. |
| 1881 | * | ||
| 1882 | * Returns 1 when no more member devices need to be checked, otherwise returns | ||
| 1883 | * 0 to tell the loop in handle_stripe_fill5 to continue | ||
| 1972 | */ | 1884 | */ |
| 1973 | static int __handle_issuing_new_read_requests5(struct stripe_head *sh, | 1885 | static int fetch_block5(struct stripe_head *sh, struct stripe_head_state *s, |
| 1974 | struct stripe_head_state *s, int disk_idx, int disks) | 1886 | int disk_idx, int disks) |
| 1975 | { | 1887 | { |
| 1976 | struct r5dev *dev = &sh->dev[disk_idx]; | 1888 | struct r5dev *dev = &sh->dev[disk_idx]; |
| 1977 | struct r5dev *failed_dev = &sh->dev[s->failed_num]; | 1889 | struct r5dev *failed_dev = &sh->dev[s->failed_num]; |
| 1978 | 1890 | ||
| 1979 | /* don't schedule compute operations or reads on the parity block while | ||
| 1980 | * a check is in flight | ||
| 1981 | */ | ||
| 1982 | if ((disk_idx == sh->pd_idx) && | ||
| 1983 | test_bit(STRIPE_OP_CHECK, &sh->ops.pending)) | ||
| 1984 | return ~0; | ||
| 1985 | |||
| 1986 | /* is the data in this block needed, and can we get it? */ | 1891 | /* is the data in this block needed, and can we get it? */ |
| 1987 | if (!test_bit(R5_LOCKED, &dev->flags) && | 1892 | if (!test_bit(R5_LOCKED, &dev->flags) && |
| 1988 | !test_bit(R5_UPTODATE, &dev->flags) && (dev->toread || | 1893 | !test_bit(R5_UPTODATE, &dev->flags) && |
| 1989 | (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || | 1894 | (dev->toread || |
| 1990 | s->syncing || s->expanding || (s->failed && | 1895 | (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || |
| 1991 | (failed_dev->toread || (failed_dev->towrite && | 1896 | s->syncing || s->expanding || |
| 1992 | !test_bit(R5_OVERWRITE, &failed_dev->flags) | 1897 | (s->failed && |
| 1993 | ))))) { | 1898 | (failed_dev->toread || |
| 1994 | /* 1/ We would like to get this block, possibly by computing it, | 1899 | (failed_dev->towrite && |
| 1995 | * but we might not be able to. | 1900 | !test_bit(R5_OVERWRITE, &failed_dev->flags)))))) { |
| 1996 | * | 1901 | /* We would like to get this block, possibly by computing it, |
| 1997 | * 2/ Since parity check operations potentially make the parity | 1902 | * otherwise read it if the backing disk is insync |
| 1998 | * block !uptodate it will need to be refreshed before any | ||
| 1999 | * compute operations on data disks are scheduled. | ||
| 2000 | * | ||
| 2001 | * 3/ We hold off parity block re-reads until check operations | ||
| 2002 | * have quiesced. | ||
| 2003 | */ | 1903 | */ |
| 2004 | if ((s->uptodate == disks - 1) && | 1904 | if ((s->uptodate == disks - 1) && |
| 2005 | (s->failed && disk_idx == s->failed_num) && | 1905 | (s->failed && disk_idx == s->failed_num)) { |
| 2006 | !test_bit(STRIPE_OP_CHECK, &sh->ops.pending)) { | 1906 | set_bit(STRIPE_COMPUTE_RUN, &sh->state); |
| 2007 | set_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending); | 1907 | set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); |
| 2008 | set_bit(R5_Wantcompute, &dev->flags); | 1908 | set_bit(R5_Wantcompute, &dev->flags); |
| 2009 | sh->ops.target = disk_idx; | 1909 | sh->ops.target = disk_idx; |
| 2010 | s->req_compute = 1; | 1910 | s->req_compute = 1; |
| 2011 | sh->ops.count++; | ||
| 2012 | /* Careful: from this point on 'uptodate' is in the eye | 1911 | /* Careful: from this point on 'uptodate' is in the eye |
| 2013 | * of raid5_run_ops which services 'compute' operations | 1912 | * of raid5_run_ops which services 'compute' operations |
| 2014 | * before writes. R5_Wantcompute flags a block that will | 1913 | * before writes. R5_Wantcompute flags a block that will |
| @@ -2016,53 +1915,40 @@ static int __handle_issuing_new_read_requests5(struct stripe_head *sh, | |||
| 2016 | * subsequent operation. | 1915 | * subsequent operation. |
| 2017 | */ | 1916 | */ |
| 2018 | s->uptodate++; | 1917 | s->uptodate++; |
| 2019 | return 0; /* uptodate + compute == disks */ | 1918 | return 1; /* uptodate + compute == disks */ |
| 2020 | } else if (test_bit(R5_Insync, &dev->flags)) { | 1919 | } else if (test_bit(R5_Insync, &dev->flags)) { |
| 2021 | set_bit(R5_LOCKED, &dev->flags); | 1920 | set_bit(R5_LOCKED, &dev->flags); |
| 2022 | set_bit(R5_Wantread, &dev->flags); | 1921 | set_bit(R5_Wantread, &dev->flags); |
| 2023 | if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending)) | ||
| 2024 | sh->ops.count++; | ||
| 2025 | s->locked++; | 1922 | s->locked++; |
| 2026 | pr_debug("Reading block %d (sync=%d)\n", disk_idx, | 1923 | pr_debug("Reading block %d (sync=%d)\n", disk_idx, |
| 2027 | s->syncing); | 1924 | s->syncing); |
| 2028 | } | 1925 | } |
| 2029 | } | 1926 | } |
| 2030 | 1927 | ||
| 2031 | return ~0; | 1928 | return 0; |
| 2032 | } | 1929 | } |
| 2033 | 1930 | ||
| 2034 | static void handle_issuing_new_read_requests5(struct stripe_head *sh, | 1931 | /** |
| 1932 | * handle_stripe_fill5 - read or compute data to satisfy pending requests. | ||
| 1933 | */ | ||
| 1934 | static void handle_stripe_fill5(struct stripe_head *sh, | ||
| 2035 | struct stripe_head_state *s, int disks) | 1935 | struct stripe_head_state *s, int disks) |
| 2036 | { | 1936 | { |
| 2037 | int i; | 1937 | int i; |
| 2038 | 1938 | ||
| 2039 | /* Clear completed compute operations. Parity recovery | ||
| 2040 | * (STRIPE_OP_MOD_REPAIR_PD) implies a write-back which is handled | ||
| 2041 | * later on in this routine | ||
| 2042 | */ | ||
| 2043 | if (test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete) && | ||
| 2044 | !test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) { | ||
| 2045 | clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete); | ||
| 2046 | clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.ack); | ||
| 2047 | clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending); | ||
| 2048 | } | ||
| 2049 | |||
| 2050 | /* look for blocks to read/compute, skip this if a compute | 1939 | /* look for blocks to read/compute, skip this if a compute |
| 2051 | * is already in flight, or if the stripe contents are in the | 1940 | * is already in flight, or if the stripe contents are in the |
| 2052 | * midst of changing due to a write | 1941 | * midst of changing due to a write |
| 2053 | */ | 1942 | */ |
| 2054 | if (!test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending) && | 1943 | if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state && |
| 2055 | !test_bit(STRIPE_OP_PREXOR, &sh->ops.pending) && | 1944 | !sh->reconstruct_state) |
| 2056 | !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) { | ||
| 2057 | for (i = disks; i--; ) | 1945 | for (i = disks; i--; ) |
| 2058 | if (__handle_issuing_new_read_requests5( | 1946 | if (fetch_block5(sh, s, i, disks)) |
| 2059 | sh, s, i, disks) == 0) | ||
| 2060 | break; | 1947 | break; |
| 2061 | } | ||
| 2062 | set_bit(STRIPE_HANDLE, &sh->state); | 1948 | set_bit(STRIPE_HANDLE, &sh->state); |
| 2063 | } | 1949 | } |
| 2064 | 1950 | ||
| 2065 | static void handle_issuing_new_read_requests6(struct stripe_head *sh, | 1951 | static void handle_stripe_fill6(struct stripe_head *sh, |
| 2066 | struct stripe_head_state *s, struct r6_state *r6s, | 1952 | struct stripe_head_state *s, struct r6_state *r6s, |
| 2067 | int disks) | 1953 | int disks) |
| 2068 | { | 1954 | { |
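
The renames above come with a real interface change: fetch_block5() returns 1 once it has scheduled a compute that will bring the stripe to full redundancy (uptodate + compute == disks), telling the loop in handle_stripe_fill5() to stop scanning, and 0 otherwise — the old ~0/0 convention had the opposite sense. The loop contract, reduced to its arithmetic (the failed-device and R5_Insync conditions are elided; illustrative only):

static int fetch_block_model(int disk_idx, int *uptodate, int disks)
{
	if (*uptodate == disks - 1) {	/* exactly one block missing */
		(*uptodate)++;		/* compute scheduled: uptodate + compute == disks */
		return 1;		/* nothing more to fetch */
	}
	/* otherwise a read would be scheduled for disk_idx */
	return 0;			/* keep scanning the other devices */
}

static void stripe_fill_model(int disks, int uptodate)
{
	for (int i = disks; i--; )
		if (fetch_block_model(i, &uptodate, disks))
			break;
}

The parity-block special cases disappear because a compute and a check can no longer be in flight at once: handle_stripe_fill5() bails out entirely while check_state or reconstruct_state is active.
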
| @@ -2121,12 +2007,12 @@ static void handle_issuing_new_read_requests6(struct stripe_head *sh, | |||
| 2121 | } | 2007 | } |
| 2122 | 2008 | ||
| 2123 | 2009 | ||
| 2124 | /* handle_completed_write_requests | 2010 | /* handle_stripe_clean_event |
| 2125 | * any written block on an uptodate or failed drive can be returned. | 2011 | * any written block on an uptodate or failed drive can be returned. |
| 2126 | * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but | 2012 | * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but |
| 2127 | * never LOCKED, so we don't need to test 'failed' directly. | 2013 | * never LOCKED, so we don't need to test 'failed' directly. |
| 2128 | */ | 2014 | */ |
| 2129 | static void handle_completed_write_requests(raid5_conf_t *conf, | 2015 | static void handle_stripe_clean_event(raid5_conf_t *conf, |
| 2130 | struct stripe_head *sh, int disks, struct bio **return_bi) | 2016 | struct stripe_head *sh, int disks, struct bio **return_bi) |
| 2131 | { | 2017 | { |
| 2132 | int i; | 2018 | int i; |
| @@ -2171,7 +2057,7 @@ static void handle_completed_write_requests(raid5_conf_t *conf, | |||
| 2171 | md_wakeup_thread(conf->mddev->thread); | 2057 | md_wakeup_thread(conf->mddev->thread); |
| 2172 | } | 2058 | } |
| 2173 | 2059 | ||
| 2174 | static void handle_issuing_new_write_requests5(raid5_conf_t *conf, | 2060 | static void handle_stripe_dirtying5(raid5_conf_t *conf, |
| 2175 | struct stripe_head *sh, struct stripe_head_state *s, int disks) | 2061 | struct stripe_head *sh, struct stripe_head_state *s, int disks) |
| 2176 | { | 2062 | { |
| 2177 | int rmw = 0, rcw = 0, i; | 2063 | int rmw = 0, rcw = 0, i; |
| @@ -2215,9 +2101,6 @@ static void handle_issuing_new_write_requests5(raid5_conf_t *conf, | |||
| 2215 | "%d for r-m-w\n", i); | 2101 | "%d for r-m-w\n", i); |
| 2216 | set_bit(R5_LOCKED, &dev->flags); | 2102 | set_bit(R5_LOCKED, &dev->flags); |
| 2217 | set_bit(R5_Wantread, &dev->flags); | 2103 | set_bit(R5_Wantread, &dev->flags); |
| 2218 | if (!test_and_set_bit( | ||
| 2219 | STRIPE_OP_IO, &sh->ops.pending)) | ||
| 2220 | sh->ops.count++; | ||
| 2221 | s->locked++; | 2104 | s->locked++; |
| 2222 | } else { | 2105 | } else { |
| 2223 | set_bit(STRIPE_DELAYED, &sh->state); | 2106 | set_bit(STRIPE_DELAYED, &sh->state); |
| @@ -2241,9 +2124,6 @@ static void handle_issuing_new_write_requests5(raid5_conf_t *conf, | |||
| 2241 | "%d for Reconstruct\n", i); | 2124 | "%d for Reconstruct\n", i); |
| 2242 | set_bit(R5_LOCKED, &dev->flags); | 2125 | set_bit(R5_LOCKED, &dev->flags); |
| 2243 | set_bit(R5_Wantread, &dev->flags); | 2126 | set_bit(R5_Wantread, &dev->flags); |
| 2244 | if (!test_and_set_bit( | ||
| 2245 | STRIPE_OP_IO, &sh->ops.pending)) | ||
| 2246 | sh->ops.count++; | ||
| 2247 | s->locked++; | 2127 | s->locked++; |
| 2248 | } else { | 2128 | } else { |
| 2249 | set_bit(STRIPE_DELAYED, &sh->state); | 2129 | set_bit(STRIPE_DELAYED, &sh->state); |
| @@ -2261,14 +2141,13 @@ static void handle_issuing_new_write_requests5(raid5_conf_t *conf, | |||
| 2261 | * simultaneously. If this is not the case then new writes need to be | 2141 | * simultaneously. If this is not the case then new writes need to be |
| 2262 | * held off until the compute completes. | 2142 | * held off until the compute completes. |
| 2263 | */ | 2143 | */ |
| 2264 | if ((s->req_compute || | 2144 | if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) && |
| 2265 | !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) && | 2145 | (s->locked == 0 && (rcw == 0 || rmw == 0) && |
| 2266 | (s->locked == 0 && (rcw == 0 || rmw == 0) && | 2146 | !test_bit(STRIPE_BIT_DELAY, &sh->state))) |
| 2267 | !test_bit(STRIPE_BIT_DELAY, &sh->state))) | 2147 | schedule_reconstruction5(sh, s, rcw == 0, 0); |
| 2268 | s->locked += handle_write_operations5(sh, rcw == 0, 0); | ||
| 2269 | } | 2148 | } |
| 2270 | 2149 | ||
| 2271 | static void handle_issuing_new_write_requests6(raid5_conf_t *conf, | 2150 | static void handle_stripe_dirtying6(raid5_conf_t *conf, |
| 2272 | struct stripe_head *sh, struct stripe_head_state *s, | 2151 | struct stripe_head *sh, struct stripe_head_state *s, |
| 2273 | struct r6_state *r6s, int disks) | 2152 | struct r6_state *r6s, int disks) |
| 2274 | { | 2153 | { |
| @@ -2371,92 +2250,86 @@ static void handle_issuing_new_write_requests6(raid5_conf_t *conf, | |||
| 2371 | static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh, | 2250 | static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh, |
| 2372 | struct stripe_head_state *s, int disks) | 2251 | struct stripe_head_state *s, int disks) |
| 2373 | { | 2252 | { |
| 2374 | int canceled_check = 0; | 2253 | struct r5dev *dev = NULL; |
| 2375 | 2254 | ||
| 2376 | set_bit(STRIPE_HANDLE, &sh->state); | 2255 | set_bit(STRIPE_HANDLE, &sh->state); |
| 2377 | 2256 | ||
| 2378 | /* complete a check operation */ | 2257 | switch (sh->check_state) { |
| 2379 | if (test_and_clear_bit(STRIPE_OP_CHECK, &sh->ops.complete)) { | 2258 | case check_state_idle: |
| 2380 | clear_bit(STRIPE_OP_CHECK, &sh->ops.ack); | 2259 | /* start a new check operation if there are no failures */ |
| 2381 | clear_bit(STRIPE_OP_CHECK, &sh->ops.pending); | ||
| 2382 | if (s->failed == 0) { | 2260 | if (s->failed == 0) { |
| 2383 | if (sh->ops.zero_sum_result == 0) | ||
| 2384 | /* parity is correct (on disc, | ||
| 2385 | * not in buffer any more) | ||
| 2386 | */ | ||
| 2387 | set_bit(STRIPE_INSYNC, &sh->state); | ||
| 2388 | else { | ||
| 2389 | conf->mddev->resync_mismatches += | ||
| 2390 | STRIPE_SECTORS; | ||
| 2391 | if (test_bit( | ||
| 2392 | MD_RECOVERY_CHECK, &conf->mddev->recovery)) | ||
| 2393 | /* don't try to repair!! */ | ||
| 2394 | set_bit(STRIPE_INSYNC, &sh->state); | ||
| 2395 | else { | ||
| 2396 | set_bit(STRIPE_OP_COMPUTE_BLK, | ||
| 2397 | &sh->ops.pending); | ||
| 2398 | set_bit(STRIPE_OP_MOD_REPAIR_PD, | ||
| 2399 | &sh->ops.pending); | ||
| 2400 | set_bit(R5_Wantcompute, | ||
| 2401 | &sh->dev[sh->pd_idx].flags); | ||
| 2402 | sh->ops.target = sh->pd_idx; | ||
| 2403 | sh->ops.count++; | ||
| 2404 | s->uptodate++; | ||
| 2405 | } | ||
| 2406 | } | ||
| 2407 | } else | ||
| 2408 | canceled_check = 1; /* STRIPE_INSYNC is not set */ | ||
| 2409 | } | ||
| 2410 | |||
| 2411 | /* start a new check operation if there are no failures, the stripe is | ||
| 2412 | * not insync, and a repair is not in flight | ||
| 2413 | */ | ||
| 2414 | if (s->failed == 0 && | ||
| 2415 | !test_bit(STRIPE_INSYNC, &sh->state) && | ||
| 2416 | !test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) { | ||
| 2417 | if (!test_and_set_bit(STRIPE_OP_CHECK, &sh->ops.pending)) { | ||
| 2418 | BUG_ON(s->uptodate != disks); | 2261 | BUG_ON(s->uptodate != disks); |
| 2262 | sh->check_state = check_state_run; | ||
| 2263 | set_bit(STRIPE_OP_CHECK, &s->ops_request); | ||
| 2419 | clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags); | 2264 | clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags); |
| 2420 | sh->ops.count++; | ||
| 2421 | s->uptodate--; | 2265 | s->uptodate--; |
| 2266 | break; | ||
| 2422 | } | 2267 | } |
| 2423 | } | 2268 | dev = &sh->dev[s->failed_num]; |
| 2424 | 2269 | /* fall through */ | |
| 2425 | /* check if we can clear a parity disk reconstruct */ | 2270 | case check_state_compute_result: |
| 2426 | if (test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete) && | 2271 | sh->check_state = check_state_idle; |
| 2427 | test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) { | 2272 | if (!dev) |
| 2428 | 2273 | dev = &sh->dev[sh->pd_idx]; | |
| 2429 | clear_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending); | 2274 | |
| 2430 | clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete); | 2275 | /* check that a write has not made the stripe insync */ |
| 2431 | clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.ack); | 2276 | if (test_bit(STRIPE_INSYNC, &sh->state)) |
| 2432 | clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending); | 2277 | break; |
| 2433 | } | ||
| 2434 | |||
| 2435 | 2278 | ||
| 2436 | /* Wait for check parity and compute block operations to complete | ||
| 2437 | * before write-back. If a failure occurred while the check operation | ||
| 2438 | * was in flight we need to cycle this stripe through handle_stripe | ||
| 2439 | * since the parity block may not be uptodate | ||
| 2440 | */ | ||
| 2441 | if (!canceled_check && !test_bit(STRIPE_INSYNC, &sh->state) && | ||
| 2442 | !test_bit(STRIPE_OP_CHECK, &sh->ops.pending) && | ||
| 2443 | !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) { | ||
| 2444 | struct r5dev *dev; | ||
| 2445 | /* either failed parity check, or recovery is happening */ | 2279 | /* either failed parity check, or recovery is happening */ |
| 2446 | if (s->failed == 0) | ||
| 2447 | s->failed_num = sh->pd_idx; | ||
| 2448 | dev = &sh->dev[s->failed_num]; | ||
| 2449 | BUG_ON(!test_bit(R5_UPTODATE, &dev->flags)); | 2280 | BUG_ON(!test_bit(R5_UPTODATE, &dev->flags)); |
| 2450 | BUG_ON(s->uptodate != disks); | 2281 | BUG_ON(s->uptodate != disks); |
| 2451 | 2282 | ||
| 2452 | set_bit(R5_LOCKED, &dev->flags); | 2283 | set_bit(R5_LOCKED, &dev->flags); |
| 2284 | s->locked++; | ||
| 2453 | set_bit(R5_Wantwrite, &dev->flags); | 2285 | set_bit(R5_Wantwrite, &dev->flags); |
| 2454 | if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending)) | ||
| 2455 | sh->ops.count++; | ||
| 2456 | 2286 | ||
| 2457 | clear_bit(STRIPE_DEGRADED, &sh->state); | 2287 | clear_bit(STRIPE_DEGRADED, &sh->state); |
| 2458 | s->locked++; | ||
| 2459 | set_bit(STRIPE_INSYNC, &sh->state); | 2288 | set_bit(STRIPE_INSYNC, &sh->state); |
| 2289 | break; | ||
| 2290 | case check_state_run: | ||
| 2291 | break; /* we will be called again upon completion */ | ||
| 2292 | case check_state_check_result: | ||
| 2293 | sh->check_state = check_state_idle; | ||
| 2294 | |||
| 2295 | /* if a failure occurred during the check operation, leave | ||
| 2296 | * STRIPE_INSYNC not set and let the stripe be handled again | ||
| 2297 | */ | ||
| 2298 | if (s->failed) | ||
| 2299 | break; | ||
| 2300 | |||
| 2301 | /* handle a successful check operation, if parity is correct | ||
| 2302 | * we are done. Otherwise update the mismatch count and repair | ||
| 2303 | * parity if !MD_RECOVERY_CHECK | ||
| 2304 | */ | ||
| 2305 | if (sh->ops.zero_sum_result == 0) | ||
| 2306 | /* parity is correct (on disc, | ||
| 2307 | * not in buffer any more) | ||
| 2308 | */ | ||
| 2309 | set_bit(STRIPE_INSYNC, &sh->state); | ||
| 2310 | else { | ||
| 2311 | conf->mddev->resync_mismatches += STRIPE_SECTORS; | ||
| 2312 | if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) | ||
| 2313 | /* don't try to repair!! */ | ||
| 2314 | set_bit(STRIPE_INSYNC, &sh->state); | ||
| 2315 | else { | ||
| 2316 | sh->check_state = check_state_compute_run; | ||
| 2317 | set_bit(STRIPE_COMPUTE_RUN, &sh->state); | ||
| 2318 | set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); | ||
| 2319 | set_bit(R5_Wantcompute, | ||
| 2320 | &sh->dev[sh->pd_idx].flags); | ||
| 2321 | sh->ops.target = sh->pd_idx; | ||
| 2322 | s->uptodate++; | ||
| 2323 | } | ||
| 2324 | } | ||
| 2325 | break; | ||
| 2326 | case check_state_compute_run: | ||
| 2327 | break; | ||
| 2328 | default: | ||
| 2329 | printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n", | ||
| 2330 | __func__, sh->check_state, | ||
| 2331 | (unsigned long long) sh->sector); | ||
| 2332 | BUG(); | ||
| 2460 | } | 2333 | } |
| 2461 | } | 2334 | } |
| 2462 | 2335 | ||
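
handle_parity_checks5() is rewritten above as an explicit five-state machine. Simplified to its transitions (the failed-device fall-through, mismatch accounting, and write-back bookkeeping are omitted; repair_allowed models !MD_RECOVERY_CHECK):

enum check_states {
	check_state_idle,
	check_state_run,		/* xor-zero-sum in flight */
	check_state_check_result,
	check_state_compute_run,	/* parity repair in flight */
	check_state_compute_result,
};

static enum check_states check_step(enum check_states st,
				    int zero_sum_ok, int repair_allowed)
{
	switch (st) {
	case check_state_idle:
		return check_state_run;		 /* request STRIPE_OP_CHECK */
	case check_state_run:
		return check_state_check_result; /* set by ops_complete_check() */
	case check_state_check_result:
		if (zero_sum_ok || !repair_allowed)
			return check_state_idle; /* stripe marked INSYNC */
		return check_state_compute_run;	 /* recompute the parity block */
	case check_state_compute_run:
		return check_state_compute_result; /* set by ops_complete_compute5() */
	case check_state_compute_result:
		return check_state_idle;	 /* write repaired parity, INSYNC */
	}
	return st;				 /* kernel: BUG() on unknown state */
}

The win over the old bitmask version is visible in the diff itself: each arm of the switch is a single step, and the "wait for in-flight work" cases are literal breaks instead of compound bit tests.
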
| @@ -2641,15 +2514,14 @@ static void handle_stripe5(struct stripe_head *sh) | |||
| 2641 | struct bio *return_bi = NULL; | 2514 | struct bio *return_bi = NULL; |
| 2642 | struct stripe_head_state s; | 2515 | struct stripe_head_state s; |
| 2643 | struct r5dev *dev; | 2516 | struct r5dev *dev; |
| 2644 | unsigned long pending = 0; | ||
| 2645 | mdk_rdev_t *blocked_rdev = NULL; | 2517 | mdk_rdev_t *blocked_rdev = NULL; |
| 2646 | int prexor; | 2518 | int prexor; |
| 2647 | 2519 | ||
| 2648 | memset(&s, 0, sizeof(s)); | 2520 | memset(&s, 0, sizeof(s)); |
| 2649 | pr_debug("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d " | 2521 | pr_debug("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d check:%d " |
| 2650 | "ops=%lx:%lx:%lx\n", (unsigned long long)sh->sector, sh->state, | 2522 | "reconstruct:%d\n", (unsigned long long)sh->sector, sh->state, |
| 2651 | atomic_read(&sh->count), sh->pd_idx, | 2523 | atomic_read(&sh->count), sh->pd_idx, sh->check_state, |
| 2652 | sh->ops.pending, sh->ops.ack, sh->ops.complete); | 2524 | sh->reconstruct_state); |
| 2653 | 2525 | ||
| 2654 | spin_lock(&sh->lock); | 2526 | spin_lock(&sh->lock); |
| 2655 | clear_bit(STRIPE_HANDLE, &sh->state); | 2527 | clear_bit(STRIPE_HANDLE, &sh->state); |
| @@ -2658,15 +2530,8 @@ static void handle_stripe5(struct stripe_head *sh) | |||
| 2658 | s.syncing = test_bit(STRIPE_SYNCING, &sh->state); | 2530 | s.syncing = test_bit(STRIPE_SYNCING, &sh->state); |
| 2659 | s.expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); | 2531 | s.expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); |
| 2660 | s.expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); | 2532 | s.expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); |
| 2661 | /* Now to look around and see what can be done */ | ||
| 2662 | |||
| 2663 | /* clean-up completed biofill operations */ | ||
| 2664 | if (test_bit(STRIPE_OP_BIOFILL, &sh->ops.complete)) { | ||
| 2665 | clear_bit(STRIPE_OP_BIOFILL, &sh->ops.pending); | ||
| 2666 | clear_bit(STRIPE_OP_BIOFILL, &sh->ops.ack); | ||
| 2667 | clear_bit(STRIPE_OP_BIOFILL, &sh->ops.complete); | ||
| 2668 | } | ||
| 2669 | 2533 | ||
| 2534 | /* Now to look around and see what can be done */ | ||
| 2670 | rcu_read_lock(); | 2535 | rcu_read_lock(); |
| 2671 | for (i=disks; i--; ) { | 2536 | for (i=disks; i--; ) { |
| 2672 | mdk_rdev_t *rdev; | 2537 | mdk_rdev_t *rdev; |
| @@ -2680,10 +2545,10 @@ static void handle_stripe5(struct stripe_head *sh) | |||
| 2680 | /* maybe we can request a biofill operation | 2545 | /* maybe we can request a biofill operation |
| 2681 | * | 2546 | * |
| 2682 | * new wantfill requests are only permitted while | 2547 | * new wantfill requests are only permitted while |
| 2683 | * STRIPE_OP_BIOFILL is clear | 2548 | * ops_complete_biofill is guaranteed to be inactive |
| 2684 | */ | 2549 | */ |
| 2685 | if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread && | 2550 | if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread && |
| 2686 | !test_bit(STRIPE_OP_BIOFILL, &sh->ops.pending)) | 2551 | !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) |
| 2687 | set_bit(R5_Wantfill, &dev->flags); | 2552 | set_bit(R5_Wantfill, &dev->flags); |
| 2688 | 2553 | ||
| 2689 | /* now count some things */ | 2554 | /* now count some things */ |
| @@ -2727,8 +2592,10 @@ static void handle_stripe5(struct stripe_head *sh) | |||
| 2727 | goto unlock; | 2592 | goto unlock; |
| 2728 | } | 2593 | } |
| 2729 | 2594 | ||
| 2730 | if (s.to_fill && !test_and_set_bit(STRIPE_OP_BIOFILL, &sh->ops.pending)) | 2595 | if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) { |
| 2731 | sh->ops.count++; | 2596 | set_bit(STRIPE_OP_BIOFILL, &s.ops_request); |
| 2597 | set_bit(STRIPE_BIOFILL_RUN, &sh->state); | ||
| 2598 | } | ||
| 2732 | 2599 | ||
| 2733 | pr_debug("locked=%d uptodate=%d to_read=%d" | 2600 | pr_debug("locked=%d uptodate=%d to_read=%d" |
| 2734 | " to_write=%d failed=%d failed_num=%d\n", | 2601 | " to_write=%d failed=%d failed_num=%d\n", |
| @@ -2738,8 +2605,7 @@ static void handle_stripe5(struct stripe_head *sh) | |||
| 2738 | * need to be failed | 2605 | * need to be failed |
| 2739 | */ | 2606 | */ |
| 2740 | if (s.failed > 1 && s.to_read+s.to_write+s.written) | 2607 | if (s.failed > 1 && s.to_read+s.to_write+s.written) |
| 2741 | handle_requests_to_failed_array(conf, sh, &s, disks, | 2608 | handle_failed_stripe(conf, sh, &s, disks, &return_bi); |
| 2742 | &return_bi); | ||
| 2743 | if (s.failed > 1 && s.syncing) { | 2609 | if (s.failed > 1 && s.syncing) { |
| 2744 | md_done_sync(conf->mddev, STRIPE_SECTORS,0); | 2610 | md_done_sync(conf->mddev, STRIPE_SECTORS,0); |
| 2745 | clear_bit(STRIPE_SYNCING, &sh->state); | 2611 | clear_bit(STRIPE_SYNCING, &sh->state); |
| @@ -2755,48 +2621,25 @@ static void handle_stripe5(struct stripe_head *sh) | |||
| 2755 | !test_bit(R5_LOCKED, &dev->flags) && | 2621 | !test_bit(R5_LOCKED, &dev->flags) && |
| 2756 | test_bit(R5_UPTODATE, &dev->flags)) || | 2622 | test_bit(R5_UPTODATE, &dev->flags)) || |
| 2757 | (s.failed == 1 && s.failed_num == sh->pd_idx))) | 2623 | (s.failed == 1 && s.failed_num == sh->pd_idx))) |
| 2758 | handle_completed_write_requests(conf, sh, disks, &return_bi); | 2624 | handle_stripe_clean_event(conf, sh, disks, &return_bi); |
| 2759 | 2625 | ||
| 2760 | /* Now we might consider reading some blocks, either to check/generate | 2626 | /* Now we might consider reading some blocks, either to check/generate |
| 2761 | * parity, or to satisfy requests | 2627 | * parity, or to satisfy requests |
| 2762 | * or to load a block that is being partially written. | 2628 | * or to load a block that is being partially written. |
| 2763 | */ | 2629 | */ |
| 2764 | if (s.to_read || s.non_overwrite || | 2630 | if (s.to_read || s.non_overwrite || |
| 2765 | (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding || | 2631 | (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding) |
| 2766 | test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) | 2632 | handle_stripe_fill5(sh, &s, disks); |
| 2767 | handle_issuing_new_read_requests5(sh, &s, disks); | ||
| 2768 | 2633 | ||
| 2769 | /* Now we check to see if any write operations have recently | 2634 | /* Now we check to see if any write operations have recently |
| 2770 | * completed | 2635 | * completed |
| 2771 | */ | 2636 | */ |
| 2772 | |||
| 2773 | /* leave prexor set until postxor is done, allows us to distinguish | ||
| 2774 | * a rmw from a rcw during biodrain | ||
| 2775 | */ | ||
| 2776 | prexor = 0; | 2637 | prexor = 0; |
| 2777 | if (test_bit(STRIPE_OP_PREXOR, &sh->ops.complete) && | 2638 | if (sh->reconstruct_state == reconstruct_state_prexor_drain_result) |
| 2778 | test_bit(STRIPE_OP_POSTXOR, &sh->ops.complete)) { | ||
| 2779 | |||
| 2780 | prexor = 1; | 2639 | prexor = 1; |
| 2781 | clear_bit(STRIPE_OP_PREXOR, &sh->ops.complete); | 2640 | if (sh->reconstruct_state == reconstruct_state_drain_result || |
| 2782 | clear_bit(STRIPE_OP_PREXOR, &sh->ops.ack); | 2641 | sh->reconstruct_state == reconstruct_state_prexor_drain_result) { |
| 2783 | clear_bit(STRIPE_OP_PREXOR, &sh->ops.pending); | 2642 | sh->reconstruct_state = reconstruct_state_idle; |
| 2784 | |||
| 2785 | for (i = disks; i--; ) | ||
| 2786 | clear_bit(R5_Wantprexor, &sh->dev[i].flags); | ||
| 2787 | } | ||
| 2788 | |||
| 2789 | /* if only POSTXOR is set then this is an 'expand' postxor */ | ||
| 2790 | if (test_bit(STRIPE_OP_BIODRAIN, &sh->ops.complete) && | ||
| 2791 | test_bit(STRIPE_OP_POSTXOR, &sh->ops.complete)) { | ||
| 2792 | |||
| 2793 | clear_bit(STRIPE_OP_BIODRAIN, &sh->ops.complete); | ||
| 2794 | clear_bit(STRIPE_OP_BIODRAIN, &sh->ops.ack); | ||
| 2795 | clear_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending); | ||
| 2796 | |||
| 2797 | clear_bit(STRIPE_OP_POSTXOR, &sh->ops.complete); | ||
| 2798 | clear_bit(STRIPE_OP_POSTXOR, &sh->ops.ack); | ||
| 2799 | clear_bit(STRIPE_OP_POSTXOR, &sh->ops.pending); | ||
| 2800 | 2643 | ||
| 2801 | /* All the 'written' buffers and the parity block are ready to | 2644 | /* All the 'written' buffers and the parity block are ready to |
| 2802 | * be written back to disk | 2645 | * be written back to disk |
| @@ -2808,9 +2651,6 @@ static void handle_stripe5(struct stripe_head *sh) | |||
| 2808 | (i == sh->pd_idx || dev->written)) { | 2651 | (i == sh->pd_idx || dev->written)) { |
| 2809 | pr_debug("Writing block %d\n", i); | 2652 | pr_debug("Writing block %d\n", i); |
| 2810 | set_bit(R5_Wantwrite, &dev->flags); | 2653 | set_bit(R5_Wantwrite, &dev->flags); |
| 2811 | if (!test_and_set_bit( | ||
| 2812 | STRIPE_OP_IO, &sh->ops.pending)) | ||
| 2813 | sh->ops.count++; | ||
| 2814 | if (prexor) | 2654 | if (prexor) |
| 2815 | continue; | 2655 | continue; |
| 2816 | if (!test_bit(R5_Insync, &dev->flags) || | 2656 | if (!test_bit(R5_Insync, &dev->flags) || |
| @@ -2832,20 +2672,18 @@ static void handle_stripe5(struct stripe_head *sh) | |||
| 2832 | * 2/ A 'check' operation is in flight, as it may clobber the parity | 2672 | * 2/ A 'check' operation is in flight, as it may clobber the parity |
| 2833 | * block. | 2673 | * block. |
| 2834 | */ | 2674 | */ |
| 2835 | if (s.to_write && !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending) && | 2675 | if (s.to_write && !sh->reconstruct_state && !sh->check_state) |
| 2836 | !test_bit(STRIPE_OP_CHECK, &sh->ops.pending)) | 2676 | handle_stripe_dirtying5(conf, sh, &s, disks); |
| 2837 | handle_issuing_new_write_requests5(conf, sh, &s, disks); | ||
| 2838 | 2677 | ||
| 2839 | /* maybe we need to check and possibly fix the parity for this stripe | 2678 | /* maybe we need to check and possibly fix the parity for this stripe |
| 2840 | * Any reads will already have been scheduled, so we just see if enough | 2679 | * Any reads will already have been scheduled, so we just see if enough |
| 2841 | * data is available. The parity check is held off while parity | 2680 | * data is available. The parity check is held off while parity |
| 2842 | * dependent operations are in flight. | 2681 | * dependent operations are in flight. |
| 2843 | */ | 2682 | */ |
| 2844 | if ((s.syncing && s.locked == 0 && | 2683 | if (sh->check_state || |
| 2845 | !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending) && | 2684 | (s.syncing && s.locked == 0 && |
| 2846 | !test_bit(STRIPE_INSYNC, &sh->state)) || | 2685 | !test_bit(STRIPE_COMPUTE_RUN, &sh->state) && |
| 2847 | test_bit(STRIPE_OP_CHECK, &sh->ops.pending) || | 2686 | !test_bit(STRIPE_INSYNC, &sh->state))) |
| 2848 | test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) | ||
| 2849 | handle_parity_checks5(conf, sh, &s, disks); | 2687 | handle_parity_checks5(conf, sh, &s, disks); |
| 2850 | 2688 | ||
| 2851 | if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { | 2689 | if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { |
| @@ -2864,52 +2702,35 @@ static void handle_stripe5(struct stripe_head *sh) | |||
| 2864 | dev = &sh->dev[s.failed_num]; | 2702 | dev = &sh->dev[s.failed_num]; |
| 2865 | if (!test_bit(R5_ReWrite, &dev->flags)) { | 2703 | if (!test_bit(R5_ReWrite, &dev->flags)) { |
| 2866 | set_bit(R5_Wantwrite, &dev->flags); | 2704 | set_bit(R5_Wantwrite, &dev->flags); |
| 2867 | if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending)) | ||
| 2868 | sh->ops.count++; | ||
| 2869 | set_bit(R5_ReWrite, &dev->flags); | 2705 | set_bit(R5_ReWrite, &dev->flags); |
| 2870 | set_bit(R5_LOCKED, &dev->flags); | 2706 | set_bit(R5_LOCKED, &dev->flags); |
| 2871 | s.locked++; | 2707 | s.locked++; |
| 2872 | } else { | 2708 | } else { |
| 2873 | /* let's read it back */ | 2709 | /* let's read it back */ |
| 2874 | set_bit(R5_Wantread, &dev->flags); | 2710 | set_bit(R5_Wantread, &dev->flags); |
| 2875 | if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending)) | ||
| 2876 | sh->ops.count++; | ||
| 2877 | set_bit(R5_LOCKED, &dev->flags); | 2711 | set_bit(R5_LOCKED, &dev->flags); |
| 2878 | s.locked++; | 2712 | s.locked++; |
| 2879 | } | 2713 | } |
| 2880 | } | 2714 | } |
| 2881 | 2715 | ||
| 2882 | /* Finish postxor operations initiated by the expansion | 2716 | /* Finish reconstruct operations initiated by the expansion process */ |
| 2883 | * process | 2717 | if (sh->reconstruct_state == reconstruct_state_result) { |
| 2884 | */ | 2718 | sh->reconstruct_state = reconstruct_state_idle; |
| 2885 | if (test_bit(STRIPE_OP_POSTXOR, &sh->ops.complete) && | ||
| 2886 | !test_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending)) { | ||
| 2887 | |||
| 2888 | clear_bit(STRIPE_EXPANDING, &sh->state); | 2719 | clear_bit(STRIPE_EXPANDING, &sh->state); |
| 2889 | 2720 | for (i = conf->raid_disks; i--; ) | |
| 2890 | clear_bit(STRIPE_OP_POSTXOR, &sh->ops.pending); | ||
| 2891 | clear_bit(STRIPE_OP_POSTXOR, &sh->ops.ack); | ||
| 2892 | clear_bit(STRIPE_OP_POSTXOR, &sh->ops.complete); | ||
| 2893 | |||
| 2894 | for (i = conf->raid_disks; i--; ) { | ||
| 2895 | set_bit(R5_Wantwrite, &sh->dev[i].flags); | 2721 | set_bit(R5_Wantwrite, &sh->dev[i].flags); |
| 2896 | set_bit(R5_LOCKED, &dev->flags); | 2722 | set_bit(R5_LOCKED, &dev->flags); |
| 2897 | s.locked++; | 2723 | s.locked++; |
| 2898 | if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending)) | ||
| 2899 | sh->ops.count++; | ||
| 2900 | } | ||
| 2901 | } | 2724 | } |
| 2902 | 2725 | ||
| 2903 | if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) && | 2726 | if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) && |
| 2904 | !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) { | 2727 | !sh->reconstruct_state) { |
| 2905 | /* Need to write out all blocks after computing parity */ | 2728 | /* Need to write out all blocks after computing parity */ |
| 2906 | sh->disks = conf->raid_disks; | 2729 | sh->disks = conf->raid_disks; |
| 2907 | sh->pd_idx = stripe_to_pdidx(sh->sector, conf, | 2730 | sh->pd_idx = stripe_to_pdidx(sh->sector, conf, |
| 2908 | conf->raid_disks); | 2731 | conf->raid_disks); |
| 2909 | s.locked += handle_write_operations5(sh, 1, 1); | 2732 | schedule_reconstruction5(sh, &s, 1, 1); |
| 2910 | } else if (s.expanded && | 2733 | } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) { |
| 2911 | s.locked == 0 && | ||
| 2912 | !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) { | ||
| 2913 | clear_bit(STRIPE_EXPAND_READY, &sh->state); | 2734 | clear_bit(STRIPE_EXPAND_READY, &sh->state); |
| 2914 | atomic_dec(&conf->reshape_stripes); | 2735 | atomic_dec(&conf->reshape_stripes); |
| 2915 | wake_up(&conf->wait_for_overlap); | 2736 | wake_up(&conf->wait_for_overlap); |
| @@ -2917,12 +2738,9 @@ static void handle_stripe5(struct stripe_head *sh) | |||
| 2917 | } | 2738 | } |
| 2918 | 2739 | ||
| 2919 | if (s.expanding && s.locked == 0 && | 2740 | if (s.expanding && s.locked == 0 && |
| 2920 | !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) | 2741 | !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) |
| 2921 | handle_stripe_expansion(conf, sh, NULL); | 2742 | handle_stripe_expansion(conf, sh, NULL); |
| 2922 | 2743 | ||
| 2923 | if (sh->ops.count) | ||
| 2924 | pending = get_stripe_work(sh); | ||
| 2925 | |||
| 2926 | unlock: | 2744 | unlock: |
| 2927 | spin_unlock(&sh->lock); | 2745 | spin_unlock(&sh->lock); |
| 2928 | 2746 | ||
| @@ -2930,11 +2748,12 @@ static void handle_stripe5(struct stripe_head *sh) | |||
| 2930 | if (unlikely(blocked_rdev)) | 2748 | if (unlikely(blocked_rdev)) |
| 2931 | md_wait_for_blocked_rdev(blocked_rdev, conf->mddev); | 2749 | md_wait_for_blocked_rdev(blocked_rdev, conf->mddev); |
| 2932 | 2750 | ||
| 2933 | if (pending) | 2751 | if (s.ops_request) |
| 2934 | raid5_run_ops(sh, pending); | 2752 | raid5_run_ops(sh, s.ops_request); |
| 2935 | 2753 | ||
| 2936 | return_io(return_bi); | 2754 | ops_run_io(sh, &s); |
| 2937 | 2755 | ||
| 2756 | return_io(return_bi); | ||
| 2938 | } | 2757 | } |
| 2939 | 2758 | ||
| 2940 | static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) | 2759 | static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) |
| @@ -3042,8 +2861,7 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) | |||
| 3042 | * might need to be failed | 2861 | * might need to be failed |
| 3043 | */ | 2862 | */ |
| 3044 | if (s.failed > 2 && s.to_read+s.to_write+s.written) | 2863 | if (s.failed > 2 && s.to_read+s.to_write+s.written) |
| 3045 | handle_requests_to_failed_array(conf, sh, &s, disks, | 2864 | handle_failed_stripe(conf, sh, &s, disks, &return_bi); |
| 3046 | &return_bi); | ||
| 3047 | if (s.failed > 2 && s.syncing) { | 2865 | if (s.failed > 2 && s.syncing) { |
| 3048 | md_done_sync(conf->mddev, STRIPE_SECTORS,0); | 2866 | md_done_sync(conf->mddev, STRIPE_SECTORS,0); |
| 3049 | clear_bit(STRIPE_SYNCING, &sh->state); | 2867 | clear_bit(STRIPE_SYNCING, &sh->state); |
| @@ -3068,7 +2886,7 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) | |||
| 3068 | ( r6s.q_failed || ((test_bit(R5_Insync, &qdev->flags) | 2886 | ( r6s.q_failed || ((test_bit(R5_Insync, &qdev->flags) |
| 3069 | && !test_bit(R5_LOCKED, &qdev->flags) | 2887 | && !test_bit(R5_LOCKED, &qdev->flags) |
| 3070 | && test_bit(R5_UPTODATE, &qdev->flags))))) | 2888 | && test_bit(R5_UPTODATE, &qdev->flags))))) |
| 3071 | handle_completed_write_requests(conf, sh, disks, &return_bi); | 2889 | handle_stripe_clean_event(conf, sh, disks, &return_bi); |
| 3072 | 2890 | ||
| 3073 | /* Now we might consider reading some blocks, either to check/generate | 2891 | /* Now we might consider reading some blocks, either to check/generate |
| 3074 | * parity, or to satisfy requests | 2892 | * parity, or to satisfy requests |
| @@ -3076,11 +2894,11 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) | |||
| 3076 | */ | 2894 | */ |
| 3077 | if (s.to_read || s.non_overwrite || (s.to_write && s.failed) || | 2895 | if (s.to_read || s.non_overwrite || (s.to_write && s.failed) || |
| 3078 | (s.syncing && (s.uptodate < disks)) || s.expanding) | 2896 | (s.syncing && (s.uptodate < disks)) || s.expanding) |
| 3079 | handle_issuing_new_read_requests6(sh, &s, &r6s, disks); | 2897 | handle_stripe_fill6(sh, &s, &r6s, disks); |
| 3080 | 2898 | ||
| 3081 | /* now to consider writing and what else, if anything should be read */ | 2899 | /* now to consider writing and what else, if anything should be read */ |
| 3082 | if (s.to_write) | 2900 | if (s.to_write) |
| 3083 | handle_issuing_new_write_requests6(conf, sh, &s, &r6s, disks); | 2901 | handle_stripe_dirtying6(conf, sh, &s, &r6s, disks); |
| 3084 | 2902 | ||
| 3085 | /* maybe we need to check and possibly fix the parity for this stripe | 2903 | /* maybe we need to check and possibly fix the parity for this stripe |
| 3086 | * Any reads will already have been scheduled, so we just see if enough | 2904 | * Any reads will already have been scheduled, so we just see if enough |
| @@ -3136,7 +2954,7 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) | |||
| 3136 | } | 2954 | } |
| 3137 | 2955 | ||
| 3138 | if (s.expanding && s.locked == 0 && | 2956 | if (s.expanding && s.locked == 0 && |
| 3139 | !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) | 2957 | !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) |
| 3140 | handle_stripe_expansion(conf, sh, &r6s); | 2958 | handle_stripe_expansion(conf, sh, &r6s); |
| 3141 | 2959 | ||
| 3142 | unlock: | 2960 | unlock: |
| @@ -3146,68 +2964,9 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) | |||
| 3146 | if (unlikely(blocked_rdev)) | 2964 | if (unlikely(blocked_rdev)) |
| 3147 | md_wait_for_blocked_rdev(blocked_rdev, conf->mddev); | 2965 | md_wait_for_blocked_rdev(blocked_rdev, conf->mddev); |
| 3148 | 2966 | ||
| 3149 | return_io(return_bi); | 2967 | ops_run_io(sh, &s); |
| 3150 | |||
| 3151 | for (i=disks; i-- ;) { | ||
| 3152 | int rw; | ||
| 3153 | struct bio *bi; | ||
| 3154 | mdk_rdev_t *rdev; | ||
| 3155 | if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) | ||
| 3156 | rw = WRITE; | ||
| 3157 | else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags)) | ||
| 3158 | rw = READ; | ||
| 3159 | else | ||
| 3160 | continue; | ||
| 3161 | |||
| 3162 | set_bit(STRIPE_IO_STARTED, &sh->state); | ||
| 3163 | |||
| 3164 | bi = &sh->dev[i].req; | ||
| 3165 | |||
| 3166 | bi->bi_rw = rw; | ||
| 3167 | if (rw == WRITE) | ||
| 3168 | bi->bi_end_io = raid5_end_write_request; | ||
| 3169 | else | ||
| 3170 | bi->bi_end_io = raid5_end_read_request; | ||
| 3171 | |||
| 3172 | rcu_read_lock(); | ||
| 3173 | rdev = rcu_dereference(conf->disks[i].rdev); | ||
| 3174 | if (rdev && test_bit(Faulty, &rdev->flags)) | ||
| 3175 | rdev = NULL; | ||
| 3176 | if (rdev) | ||
| 3177 | atomic_inc(&rdev->nr_pending); | ||
| 3178 | rcu_read_unlock(); | ||
| 3179 | 2968 | ||
| 3180 | if (rdev) { | 2969 | return_io(return_bi); |
| 3181 | if (s.syncing || s.expanding || s.expanded) | ||
| 3182 | md_sync_acct(rdev->bdev, STRIPE_SECTORS); | ||
| 3183 | |||
| 3184 | bi->bi_bdev = rdev->bdev; | ||
| 3185 | pr_debug("for %llu schedule op %ld on disc %d\n", | ||
| 3186 | (unsigned long long)sh->sector, bi->bi_rw, i); | ||
| 3187 | atomic_inc(&sh->count); | ||
| 3188 | bi->bi_sector = sh->sector + rdev->data_offset; | ||
| 3189 | bi->bi_flags = 1 << BIO_UPTODATE; | ||
| 3190 | bi->bi_vcnt = 1; | ||
| 3191 | bi->bi_max_vecs = 1; | ||
| 3192 | bi->bi_idx = 0; | ||
| 3193 | bi->bi_io_vec = &sh->dev[i].vec; | ||
| 3194 | bi->bi_io_vec[0].bv_len = STRIPE_SIZE; | ||
| 3195 | bi->bi_io_vec[0].bv_offset = 0; | ||
| 3196 | bi->bi_size = STRIPE_SIZE; | ||
| 3197 | bi->bi_next = NULL; | ||
| 3198 | if (rw == WRITE && | ||
| 3199 | test_bit(R5_ReWrite, &sh->dev[i].flags)) | ||
| 3200 | atomic_add(STRIPE_SECTORS, &rdev->corrected_errors); | ||
| 3201 | generic_make_request(bi); | ||
| 3202 | } else { | ||
| 3203 | if (rw == WRITE) | ||
| 3204 | set_bit(STRIPE_DEGRADED, &sh->state); | ||
| 3205 | pr_debug("skip op %ld on disc %d for sector %llu\n", | ||
| 3206 | bi->bi_rw, i, (unsigned long long)sh->sector); | ||
| 3207 | clear_bit(R5_LOCKED, &sh->dev[i].flags); | ||
| 3208 | set_bit(STRIPE_HANDLE, &sh->state); | ||
| 3209 | } | ||
| 3210 | } | ||
| 3211 | } | 2970 | } |
| 3212 | 2971 | ||
| 3213 | static void handle_stripe(struct stripe_head *sh, struct page *tmp_page) | 2972 | static void handle_stripe(struct stripe_head *sh, struct page *tmp_page) |
| @@ -3697,9 +3456,7 @@ static int make_request(struct request_queue *q, struct bio * bi) | |||
| 3697 | if ( rw == WRITE ) | 3456 | if ( rw == WRITE ) |
| 3698 | md_write_end(mddev); | 3457 | md_write_end(mddev); |
| 3699 | 3458 | ||
| 3700 | bi->bi_end_io(bi, | 3459 | bio_endio(bi, 0); |
| 3701 | test_bit(BIO_UPTODATE, &bi->bi_flags) | ||
| 3702 | ? 0 : -EIO); | ||
| 3703 | } | 3460 | } |
| 3704 | return 0; | 3461 | return 0; |
| 3705 | } | 3462 | } |
| @@ -3785,7 +3542,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped | |||
| 3785 | j == raid6_next_disk(sh->pd_idx, sh->disks)) | 3542 | j == raid6_next_disk(sh->pd_idx, sh->disks)) |
| 3786 | continue; | 3543 | continue; |
| 3787 | s = compute_blocknr(sh, j); | 3544 | s = compute_blocknr(sh, j); |
| 3788 | if (s < (mddev->array_size<<1)) { | 3545 | if (s < mddev->array_sectors) { |
| 3789 | skipped = 1; | 3546 | skipped = 1; |
| 3790 | continue; | 3547 | continue; |
| 3791 | } | 3548 | } |
| @@ -4002,12 +3759,8 @@ static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio) | |||
| 4002 | spin_lock_irq(&conf->device_lock); | 3759 | spin_lock_irq(&conf->device_lock); |
| 4003 | remaining = --raid_bio->bi_phys_segments; | 3760 | remaining = --raid_bio->bi_phys_segments; |
| 4004 | spin_unlock_irq(&conf->device_lock); | 3761 | spin_unlock_irq(&conf->device_lock); |
| 4005 | if (remaining == 0) { | 3762 | if (remaining == 0) |
| 4006 | 3763 | bio_endio(raid_bio, 0); | |
| 4007 | raid_bio->bi_end_io(raid_bio, | ||
| 4008 | test_bit(BIO_UPTODATE, &raid_bio->bi_flags) | ||
| 4009 | ? 0 : -EIO); | ||
| 4010 | } | ||
| 4011 | if (atomic_dec_and_test(&conf->active_aligned_reads)) | 3764 | if (atomic_dec_and_test(&conf->active_aligned_reads)) |
| 4012 | wake_up(&conf->wait_for_stripe); | 3765 | wake_up(&conf->wait_for_stripe); |
| 4013 | return handled; | 3766 | return handled; |
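This hunk, like the make_request() one above, replaces an open-coded completion — calling bi->bi_end_io() with 0 or -EIO depending on BIO_UPTODATE — with a plain bio_endio(bi, 0). Passing 0 is safe because bio_endio() of this era derives the error from BIO_UPTODATE itself. A sketch close to the contemporary fs/bio.c logic (simplified here, for illustration only):

void bio_endio(struct bio *bio, int error)
{
	if (error)
		clear_bit(BIO_UPTODATE, &bio->bi_flags);
	else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
		error = -EIO;	/* a failed I/O already cleared the flag */

	if (bio->bi_end_io)
		bio->bi_end_io(bio, error);
}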
| @@ -4094,6 +3847,8 @@ raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len) | |||
| 4094 | { | 3847 | { |
| 4095 | raid5_conf_t *conf = mddev_to_conf(mddev); | 3848 | raid5_conf_t *conf = mddev_to_conf(mddev); |
| 4096 | unsigned long new; | 3849 | unsigned long new; |
| 3850 | int err; | ||
| 3851 | |||
| 4097 | if (len >= PAGE_SIZE) | 3852 | if (len >= PAGE_SIZE) |
| 4098 | return -EINVAL; | 3853 | return -EINVAL; |
| 4099 | if (!conf) | 3854 | if (!conf) |
| @@ -4109,7 +3864,9 @@ raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len) | |||
| 4109 | else | 3864 | else |
| 4110 | break; | 3865 | break; |
| 4111 | } | 3866 | } |
| 4112 | md_allow_write(mddev); | 3867 | err = md_allow_write(mddev); |
| 3868 | if (err) | ||
| 3869 | return err; | ||
| 4113 | while (new > conf->max_nr_stripes) { | 3870 | while (new > conf->max_nr_stripes) { |
| 4114 | if (grow_one_stripe(conf)) | 3871 | if (grow_one_stripe(conf)) |
| 4115 | conf->max_nr_stripes++; | 3872 | conf->max_nr_stripes++; |
| @@ -4434,7 +4191,7 @@ static int run(mddev_t *mddev) | |||
| 4434 | mddev->queue->backing_dev_info.congested_data = mddev; | 4191 | mddev->queue->backing_dev_info.congested_data = mddev; |
| 4435 | mddev->queue->backing_dev_info.congested_fn = raid5_congested; | 4192 | mddev->queue->backing_dev_info.congested_fn = raid5_congested; |
| 4436 | 4193 | ||
| 4437 | mddev->array_size = mddev->size * (conf->previous_raid_disks - | 4194 | mddev->array_sectors = 2 * mddev->size * (conf->previous_raid_disks - |
| 4438 | conf->max_degraded); | 4195 | conf->max_degraded); |
| 4439 | 4196 | ||
| 4440 | blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec); | 4197 | blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec); |
| @@ -4609,35 +4366,41 @@ abort: | |||
| 4609 | static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) | 4366 | static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) |
| 4610 | { | 4367 | { |
| 4611 | raid5_conf_t *conf = mddev->private; | 4368 | raid5_conf_t *conf = mddev->private; |
| 4612 | int found = 0; | 4369 | int err = -EEXIST; |
| 4613 | int disk; | 4370 | int disk; |
| 4614 | struct disk_info *p; | 4371 | struct disk_info *p; |
| 4372 | int first = 0; | ||
| 4373 | int last = conf->raid_disks - 1; | ||
| 4615 | 4374 | ||
| 4616 | if (mddev->degraded > conf->max_degraded) | 4375 | if (mddev->degraded > conf->max_degraded) |
| 4617 | /* no point adding a device */ | 4376 | /* no point adding a device */ |
| 4618 | return 0; | 4377 | return -EINVAL; |
| 4378 | |||
| 4379 | if (rdev->raid_disk >= 0) | ||
| 4380 | first = last = rdev->raid_disk; | ||
| 4619 | 4381 | ||
| 4620 | /* | 4382 | /* |
| 4621 | * find the disk ... but prefer rdev->saved_raid_disk | 4383 | * find the disk ... but prefer rdev->saved_raid_disk |
| 4622 | * if possible. | 4384 | * if possible. |
| 4623 | */ | 4385 | */ |
| 4624 | if (rdev->saved_raid_disk >= 0 && | 4386 | if (rdev->saved_raid_disk >= 0 && |
| 4387 | rdev->saved_raid_disk >= first && | ||
| 4625 | conf->disks[rdev->saved_raid_disk].rdev == NULL) | 4388 | conf->disks[rdev->saved_raid_disk].rdev == NULL) |
| 4626 | disk = rdev->saved_raid_disk; | 4389 | disk = rdev->saved_raid_disk; |
| 4627 | else | 4390 | else |
| 4628 | disk = 0; | 4391 | disk = first; |
| 4629 | for ( ; disk < conf->raid_disks; disk++) | 4392 | for ( ; disk <= last ; disk++) |
| 4630 | if ((p=conf->disks + disk)->rdev == NULL) { | 4393 | if ((p=conf->disks + disk)->rdev == NULL) { |
| 4631 | clear_bit(In_sync, &rdev->flags); | 4394 | clear_bit(In_sync, &rdev->flags); |
| 4632 | rdev->raid_disk = disk; | 4395 | rdev->raid_disk = disk; |
| 4633 | found = 1; | 4396 | err = 0; |
| 4634 | if (rdev->saved_raid_disk != disk) | 4397 | if (rdev->saved_raid_disk != disk) |
| 4635 | conf->fullsync = 1; | 4398 | conf->fullsync = 1; |
| 4636 | rcu_assign_pointer(p->rdev, rdev); | 4399 | rcu_assign_pointer(p->rdev, rdev); |
| 4637 | break; | 4400 | break; |
| 4638 | } | 4401 | } |
| 4639 | print_raid5_conf(conf); | 4402 | print_raid5_conf(conf); |
| 4640 | return found; | 4403 | return err; |
| 4641 | } | 4404 | } |
| 4642 | 4405 | ||
| 4643 | static int raid5_resize(mddev_t *mddev, sector_t sectors) | 4406 | static int raid5_resize(mddev_t *mddev, sector_t sectors) |
| @@ -4652,8 +4415,9 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors) | |||
| 4652 | raid5_conf_t *conf = mddev_to_conf(mddev); | 4415 | raid5_conf_t *conf = mddev_to_conf(mddev); |
| 4653 | 4416 | ||
| 4654 | sectors &= ~((sector_t)mddev->chunk_size/512 - 1); | 4417 | sectors &= ~((sector_t)mddev->chunk_size/512 - 1); |
| 4655 | mddev->array_size = (sectors * (mddev->raid_disks-conf->max_degraded))>>1; | 4418 | mddev->array_sectors = sectors * (mddev->raid_disks |
| 4656 | set_capacity(mddev->gendisk, mddev->array_size << 1); | 4419 | - conf->max_degraded); |
| 4420 | set_capacity(mddev->gendisk, mddev->array_sectors); | ||
| 4657 | mddev->changed = 1; | 4421 | mddev->changed = 1; |
| 4658 | if (sectors/2 > mddev->size && mddev->recovery_cp == MaxSector) { | 4422 | if (sectors/2 > mddev->size && mddev->recovery_cp == MaxSector) { |
| 4659 | mddev->recovery_cp = mddev->size << 1; | 4423 | mddev->recovery_cp = mddev->size << 1; |
| @@ -4738,7 +4502,7 @@ static int raid5_start_reshape(mddev_t *mddev) | |||
| 4738 | rdev_for_each(rdev, rtmp, mddev) | 4502 | rdev_for_each(rdev, rtmp, mddev) |
| 4739 | if (rdev->raid_disk < 0 && | 4503 | if (rdev->raid_disk < 0 && |
| 4740 | !test_bit(Faulty, &rdev->flags)) { | 4504 | !test_bit(Faulty, &rdev->flags)) { |
| 4741 | if (raid5_add_disk(mddev, rdev)) { | 4505 | if (raid5_add_disk(mddev, rdev) == 0) { |
| 4742 | char nm[20]; | 4506 | char nm[20]; |
| 4743 | set_bit(In_sync, &rdev->flags); | 4507 | set_bit(In_sync, &rdev->flags); |
| 4744 | added_devices++; | 4508 | added_devices++; |
| @@ -4786,15 +4550,16 @@ static void end_reshape(raid5_conf_t *conf) | |||
| 4786 | struct block_device *bdev; | 4550 | struct block_device *bdev; |
| 4787 | 4551 | ||
| 4788 | if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) { | 4552 | if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) { |
| 4789 | conf->mddev->array_size = conf->mddev->size * | 4553 | conf->mddev->array_sectors = 2 * conf->mddev->size * |
| 4790 | (conf->raid_disks - conf->max_degraded); | 4554 | (conf->raid_disks - conf->max_degraded); |
| 4791 | set_capacity(conf->mddev->gendisk, conf->mddev->array_size << 1); | 4555 | set_capacity(conf->mddev->gendisk, conf->mddev->array_sectors); |
| 4792 | conf->mddev->changed = 1; | 4556 | conf->mddev->changed = 1; |
| 4793 | 4557 | ||
| 4794 | bdev = bdget_disk(conf->mddev->gendisk, 0); | 4558 | bdev = bdget_disk(conf->mddev->gendisk, 0); |
| 4795 | if (bdev) { | 4559 | if (bdev) { |
| 4796 | mutex_lock(&bdev->bd_inode->i_mutex); | 4560 | mutex_lock(&bdev->bd_inode->i_mutex); |
| 4797 | i_size_write(bdev->bd_inode, (loff_t)conf->mddev->array_size << 10); | 4561 | i_size_write(bdev->bd_inode, |
| 4562 | (loff_t)conf->mddev->array_sectors << 9); | ||
| 4798 | mutex_unlock(&bdev->bd_inode->i_mutex); | 4563 | mutex_unlock(&bdev->bd_inode->i_mutex); |
| 4799 | bdput(bdev); | 4564 | bdput(bdev); |
| 4800 | } | 4565 | } |
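A note on units for the array_size -> array_sectors conversions in this file: array_size counted KiB while array_sectors counts 512-byte sectors, which is why the shift factors change the way they do. Illustrative arithmetic (the variable names below are for exposition, not from the patch):

sector_t array_sectors = array_size << 1;      /* KiB -> 512-byte sectors */
loff_t bytes_old = (loff_t)array_size << 10;   /* KiB -> bytes */
loff_t bytes_new = (loff_t)array_sectors << 9; /* sectors -> bytes; same value */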
diff --git a/include/linux/raid/bitmap.h b/include/linux/raid/bitmap.h index 78bfdea24a8e..e98900671ca9 100644 --- a/include/linux/raid/bitmap.h +++ b/include/linux/raid/bitmap.h | |||
| @@ -221,6 +221,7 @@ struct bitmap { | |||
| 221 | unsigned long syncchunk; | 221 | unsigned long syncchunk; |
| 222 | 222 | ||
| 223 | __u64 events_cleared; | 223 | __u64 events_cleared; |
| 224 | int need_sync; | ||
| 224 | 225 | ||
| 225 | /* bitmap spinlock */ | 226 | /* bitmap spinlock */ |
| 226 | spinlock_t lock; | 227 | spinlock_t lock; |
diff --git a/include/linux/raid/linear.h b/include/linux/raid/linear.h index ba15469daf11..7e375111d007 100644 --- a/include/linux/raid/linear.h +++ b/include/linux/raid/linear.h | |||
| @@ -16,7 +16,7 @@ struct linear_private_data | |||
| 16 | struct linear_private_data *prev; /* earlier version */ | 16 | struct linear_private_data *prev; /* earlier version */ |
| 17 | dev_info_t **hash_table; | 17 | dev_info_t **hash_table; |
| 18 | sector_t hash_spacing; | 18 | sector_t hash_spacing; |
| 19 | sector_t array_size; | 19 | sector_t array_sectors; |
| 20 | int preshift; /* shift before dividing by hash_spacing */ | 20 | int preshift; /* shift before dividing by hash_spacing */ |
| 21 | dev_info_t disks[0]; | 21 | dev_info_t disks[0]; |
| 22 | }; | 22 | }; |
diff --git a/include/linux/raid/md.h b/include/linux/raid/md.h index b7386ae9d288..dc0e3fcb9f28 100644 --- a/include/linux/raid/md.h +++ b/include/linux/raid/md.h | |||
| @@ -95,7 +95,7 @@ extern int sync_page_io(struct block_device *bdev, sector_t sector, int size, | |||
| 95 | struct page *page, int rw); | 95 | struct page *page, int rw); |
| 96 | extern void md_do_sync(mddev_t *mddev); | 96 | extern void md_do_sync(mddev_t *mddev); |
| 97 | extern void md_new_event(mddev_t *mddev); | 97 | extern void md_new_event(mddev_t *mddev); |
| 98 | extern void md_allow_write(mddev_t *mddev); | 98 | extern int md_allow_write(mddev_t *mddev); |
| 99 | extern void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev); | 99 | extern void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev); |
| 100 | 100 | ||
| 101 | #endif /* CONFIG_MD */ | 101 | #endif /* CONFIG_MD */ |
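md_allow_write() now reports failure instead of returning void, so a caller about to dirty the array must propagate the error rather than write while the metadata update is still pending, as the stripe_cache_size hunk above now does. Minimal caller pattern (a sketch mirroring that hunk):

	int err = md_allow_write(mddev);
	if (err)
		return err;	/* e.g. -EAGAIN: metadata not yet marked dirty */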
diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h index 3dea9f545c8f..9f2549ac0e2d 100644 --- a/include/linux/raid/md_k.h +++ b/include/linux/raid/md_k.h | |||
| @@ -59,7 +59,7 @@ struct mdk_rdev_s | |||
| 59 | int sb_loaded; | 59 | int sb_loaded; |
| 60 | __u64 sb_events; | 60 | __u64 sb_events; |
| 61 | sector_t data_offset; /* start of data in array */ | 61 | sector_t data_offset; /* start of data in array */ |
| 62 | sector_t sb_offset; | 62 | sector_t sb_start; /* offset of the super block (in 512byte sectors) */ |
| 63 | int sb_size; /* bytes in the superblock */ | 63 | int sb_size; /* bytes in the superblock */ |
| 64 | int preferred_minor; /* autorun support */ | 64 | int preferred_minor; /* autorun support */ |
| 65 | 65 | ||
| @@ -87,6 +87,9 @@ struct mdk_rdev_s | |||
| 87 | #define Blocked 8 /* An error occurred on an externally | 87 | #define Blocked 8 /* An error occurred on an externally |
| 88 | * managed array, don't allow writes | 88 | * managed array, don't allow writes |
| 89 | * until it is cleared */ | 89 | * until it is cleared */ |
| 90 | #define StateChanged 9 /* Faulty or Blocked has changed during an | ||
| 91 | * interrupt, so the md thread needs to | ||
| 92 | * notify sysfs of the change */ | ||
| 90 | wait_queue_head_t blocked_wait; | 93 | wait_queue_head_t blocked_wait; |
| 91 | 94 | ||
| 92 | int desc_nr; /* descriptor index in the superblock */ | 95 | int desc_nr; /* descriptor index in the superblock */ |
| @@ -147,7 +150,7 @@ struct mddev_s | |||
| 147 | int raid_disks; | 150 | int raid_disks; |
| 148 | int max_disks; | 151 | int max_disks; |
| 149 | sector_t size; /* used size of component devices */ | 152 | sector_t size; /* used size of component devices */ |
| 150 | sector_t array_size; /* exported array size */ | 153 | sector_t array_sectors; /* exported array size */ |
| 151 | __u64 events; | 154 | __u64 events; |
| 152 | 155 | ||
| 153 | char uuid[16]; | 156 | char uuid[16]; |
| @@ -188,6 +191,7 @@ struct mddev_s | |||
| 188 | * NEEDED: we might need to start a resync/recover | 191 | * NEEDED: we might need to start a resync/recover |
| 189 | * RUNNING: a thread is running, or about to be started | 192 | * RUNNING: a thread is running, or about to be started |
| 190 | * SYNC: actually doing a resync, not a recovery | 193 | * SYNC: actually doing a resync, not a recovery |
| 194 | * RECOVER: doing recovery, or need to try it. | ||
| 191 | * INTR: resync needs to be aborted for some reason | 195 | * INTR: resync needs to be aborted for some reason |
| 192 | * DONE: thread is done and is waiting to be reaped | 196 | * DONE: thread is done and is waiting to be reaped |
| 193 | * REQUEST: user-space has requested a sync (used with SYNC) | 197 | * REQUEST: user-space has requested a sync (used with SYNC) |
| @@ -198,6 +202,7 @@ struct mddev_s | |||
| 198 | */ | 202 | */ |
| 199 | #define MD_RECOVERY_RUNNING 0 | 203 | #define MD_RECOVERY_RUNNING 0 |
| 200 | #define MD_RECOVERY_SYNC 1 | 204 | #define MD_RECOVERY_SYNC 1 |
| 205 | #define MD_RECOVERY_RECOVER 2 | ||
| 201 | #define MD_RECOVERY_INTR 3 | 206 | #define MD_RECOVERY_INTR 3 |
| 202 | #define MD_RECOVERY_DONE 4 | 207 | #define MD_RECOVERY_DONE 4 |
| 203 | #define MD_RECOVERY_NEEDED 5 | 208 | #define MD_RECOVERY_NEEDED 5 |
| @@ -210,7 +215,8 @@ struct mddev_s | |||
| 210 | 215 | ||
| 211 | int in_sync; /* know to not need resync */ | 216 | int in_sync; /* know to not need resync */ |
| 212 | struct mutex reconfig_mutex; | 217 | struct mutex reconfig_mutex; |
| 213 | atomic_t active; | 218 | atomic_t active; /* general refcount */ |
| 219 | atomic_t openers; /* number of active opens */ | ||
| 214 | 220 | ||
| 215 | int changed; /* true if we might need to reread partition info */ | 221 | int changed; /* true if we might need to reread partition info */ |
| 216 | int degraded; /* whether md should consider | 222 | int degraded; /* whether md should consider |
| @@ -227,6 +233,8 @@ struct mddev_s | |||
| 227 | atomic_t recovery_active; /* blocks scheduled, but not written */ | 233 | atomic_t recovery_active; /* blocks scheduled, but not written */ |
| 228 | wait_queue_head_t recovery_wait; | 234 | wait_queue_head_t recovery_wait; |
| 229 | sector_t recovery_cp; | 235 | sector_t recovery_cp; |
| 236 | sector_t resync_min; /* user requested sync | ||
| 237 | * starts here */ | ||
| 230 | sector_t resync_max; /* resync should pause | 238 | sector_t resync_max; /* resync should pause |
| 231 | * when it gets here */ | 239 | * when it gets here */ |
| 232 | 240 | ||
| @@ -331,6 +339,9 @@ static inline char * mdname (mddev_t * mddev) | |||
| 331 | #define rdev_for_each(rdev, tmp, mddev) \ | 339 | #define rdev_for_each(rdev, tmp, mddev) \ |
| 332 | rdev_for_each_list(rdev, tmp, (mddev)->disks) | 340 | rdev_for_each_list(rdev, tmp, (mddev)->disks) |
| 333 | 341 | ||
| 342 | #define rdev_for_each_rcu(rdev, mddev) \ | ||
| 343 | list_for_each_entry_rcu(rdev, &((mddev)->disks), same_set) | ||
| 344 | |||
| 334 | typedef struct mdk_thread_s { | 345 | typedef struct mdk_thread_s { |
| 335 | void (*run) (mddev_t *mddev); | 346 | void (*run) (mddev_t *mddev); |
| 336 | mddev_t *mddev; | 347 | mddev_t *mddev; |
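The new rdev_for_each_rcu() pairs with rdev_for_each() above but walks the disk list under RCU instead of reconfig_mutex. A hypothetical caller sketch (illustrative only; the body must not sleep, and a reference such as nr_pending must be taken before using an rdev outside the read-side section):

	mdk_rdev_t *rdev;

	rcu_read_lock();
	rdev_for_each_rcu(rdev, mddev) {
		if (test_bit(Faulty, &rdev->flags))
			continue;
		/* inspect rdev; atomic_inc(&rdev->nr_pending) before real use */
	}
	rcu_read_unlock();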
diff --git a/include/linux/raid/md_p.h b/include/linux/raid/md_p.h index 3f2cd98c508b..8b4de4a41ff1 100644 --- a/include/linux/raid/md_p.h +++ b/include/linux/raid/md_p.h | |||
| @@ -43,14 +43,11 @@ | |||
| 43 | */ | 43 | */ |
| 44 | #define MD_RESERVED_BYTES (64 * 1024) | 44 | #define MD_RESERVED_BYTES (64 * 1024) |
| 45 | #define MD_RESERVED_SECTORS (MD_RESERVED_BYTES / 512) | 45 | #define MD_RESERVED_SECTORS (MD_RESERVED_BYTES / 512) |
| 46 | #define MD_RESERVED_BLOCKS (MD_RESERVED_BYTES / BLOCK_SIZE) | ||
| 47 | 46 | ||
| 48 | #define MD_NEW_SIZE_SECTORS(x) ((x & ~(MD_RESERVED_SECTORS - 1)) - MD_RESERVED_SECTORS) | 47 | #define MD_NEW_SIZE_SECTORS(x) ((x & ~(MD_RESERVED_SECTORS - 1)) - MD_RESERVED_SECTORS) |
| 49 | #define MD_NEW_SIZE_BLOCKS(x) ((x & ~(MD_RESERVED_BLOCKS - 1)) - MD_RESERVED_BLOCKS) | ||
| 50 | 48 | ||
| 51 | #define MD_SB_BYTES 4096 | 49 | #define MD_SB_BYTES 4096 |
| 52 | #define MD_SB_WORDS (MD_SB_BYTES / 4) | 50 | #define MD_SB_WORDS (MD_SB_BYTES / 4) |
| 53 | #define MD_SB_BLOCKS (MD_SB_BYTES / BLOCK_SIZE) | ||
| 54 | #define MD_SB_SECTORS (MD_SB_BYTES / 512) | 51 | #define MD_SB_SECTORS (MD_SB_BYTES / 512) |
| 55 | 52 | ||
| 56 | /* | 53 | /* |
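With the BLOCK_SIZE-based macros gone, the v0.90 superblock location is expressed purely in 512-byte sectors, matching the sb_offset -> sb_start rename above. A worked example, assuming a component device of 1000000 sectors:

	sector_t dev_sectors = 1000000;
	sector_t sb_start = MD_NEW_SIZE_SECTORS(dev_sectors);
	/* MD_RESERVED_SECTORS is 128 (64 KiB / 512), so:
	 * (1000000 & ~127) - 128 = 999936 - 128 = 999808.
	 * The superblock occupies MD_SB_SECTORS starting at sector 999808,
	 * and the usable data area ends there.
	 */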
diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h index f0827d31ae6f..3b2672792457 100644 --- a/include/linux/raid/raid5.h +++ b/include/linux/raid/raid5.h | |||
| @@ -158,6 +158,43 @@ | |||
| 158 | * the compute block completes. | 158 | * the compute block completes. |
| 159 | */ | 159 | */ |
| 160 | 160 | ||
| 161 | /* | ||
| 162 | * Operations state - intermediate states that are visible outside of sh->lock | ||
| 163 | * In general _idle indicates nothing is running, _run indicates a data | ||
| 164 | * processing operation is active, and _result means the data processing result | ||
| 165 | * is stable and can be acted upon. For simple operations like biofill and | ||
| 166 | * compute that have only an _idle and a _run state, the state is indicated | ||
| 167 | * with sh->state flags (STRIPE_BIOFILL_RUN and STRIPE_COMPUTE_RUN). | ||
| 168 | */ | ||
| 169 | /** | ||
| 170 | * enum check_states - handles syncing / repairing a stripe | ||
| 171 | * @check_state_idle - check operations are quiesced | ||
| 172 | * @check_state_run - check operation is running | ||
| 173 | * @check_state_result - set outside lock when check result is valid | ||
| 174 | * @check_state_compute_run - check failed and we are repairing | ||
| 175 | * @check_state_compute_result - set outside lock when compute result is valid | ||
| 176 | */ | ||
| 177 | enum check_states { | ||
| 178 | check_state_idle = 0, | ||
| 179 | check_state_run, /* parity check */ | ||
| 180 | check_state_check_result, | ||
| 181 | check_state_compute_run, /* parity repair */ | ||
| 182 | check_state_compute_result, | ||
| 183 | }; | ||
| 184 | |||
| 185 | /** | ||
| 186 | * enum reconstruct_states - handles writing or expanding a stripe | ||
| 187 | */ | ||
| 188 | enum reconstruct_states { | ||
| 189 | reconstruct_state_idle = 0, | ||
| 190 | reconstruct_state_prexor_drain_run, /* prexor-write */ | ||
| 191 | reconstruct_state_drain_run, /* write */ | ||
| 192 | reconstruct_state_run, /* expand */ | ||
| 193 | reconstruct_state_prexor_drain_result, | ||
| 194 | reconstruct_state_drain_result, | ||
| 195 | reconstruct_state_result, | ||
| 196 | }; | ||
| 197 | |||
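A sketch of how these states flow for an ordinary write, pieced together from the handle_stripe5() hunks above (the bare assignments stand in for schedule_reconstruction5(), the async completion callback, and the next handle_stripe5() pass; they are illustrative, not kernel code):

	sh->reconstruct_state = reconstruct_state_drain_run;	/* work scheduled */
	/* ... async biodrain + postxor complete ... */
	sh->reconstruct_state = reconstruct_state_drain_result;
	/* ... next handle_stripe5() pass, under sh->lock ... */
	if (sh->reconstruct_state == reconstruct_state_drain_result) {
		sh->reconstruct_state = reconstruct_state_idle;
		/* writes for the drained buffers and parity are issued now */
	}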
| 161 | struct stripe_head { | 198 | struct stripe_head { |
| 162 | struct hlist_node hash; | 199 | struct hlist_node hash; |
| 163 | struct list_head lru; /* inactive_list or handle_list */ | 200 | struct list_head lru; /* inactive_list or handle_list */ |
| @@ -169,19 +206,13 @@ struct stripe_head { | |||
| 169 | spinlock_t lock; | 206 | spinlock_t lock; |
| 170 | int bm_seq; /* sequence number for bitmap flushes */ | 207 | int bm_seq; /* sequence number for bitmap flushes */ |
| 171 | int disks; /* disks in stripe */ | 208 | int disks; /* disks in stripe */ |
| 209 | enum check_states check_state; | ||
| 210 | enum reconstruct_states reconstruct_state; | ||
| 172 | /* stripe_operations | 211 | /* stripe_operations |
| 173 | * @pending - pending ops flags (set for request->issue->complete) | ||
| 174 | * @ack - submitted ops flags (set for issue->complete) | ||
| 175 | * @complete - completed ops flags (set for complete) | ||
| 176 | * @target - STRIPE_OP_COMPUTE_BLK target | 212 | * @target - STRIPE_OP_COMPUTE_BLK target |
| 177 | * @count - raid5_runs_ops is set to run when this is non-zero | ||
| 178 | */ | 213 | */ |
| 179 | struct stripe_operations { | 214 | struct stripe_operations { |
| 180 | unsigned long pending; | ||
| 181 | unsigned long ack; | ||
| 182 | unsigned long complete; | ||
| 183 | int target; | 215 | int target; |
| 184 | int count; | ||
| 185 | u32 zero_sum_result; | 216 | u32 zero_sum_result; |
| 186 | } ops; | 217 | } ops; |
| 187 | struct r5dev { | 218 | struct r5dev { |
| @@ -202,6 +233,7 @@ struct stripe_head_state { | |||
| 202 | int locked, uptodate, to_read, to_write, failed, written; | 233 | int locked, uptodate, to_read, to_write, failed, written; |
| 203 | int to_fill, compute, req_compute, non_overwrite; | 234 | int to_fill, compute, req_compute, non_overwrite; |
| 204 | int failed_num; | 235 | int failed_num; |
| 236 | unsigned long ops_request; | ||
| 205 | }; | 237 | }; |
| 206 | 238 | ||
| 207 | /* r6_state - extra state data only relevant to r6 */ | 239 | /* r6_state - extra state data only relevant to r6 */ |
| @@ -228,9 +260,7 @@ struct r6_state { | |||
| 228 | #define R5_Wantfill 12 /* dev->toread contains a bio that needs | 260 | #define R5_Wantfill 12 /* dev->toread contains a bio that needs |
| 229 | * filling | 261 | * filling |
| 230 | */ | 262 | */ |
| 231 | #define R5_Wantprexor 13 /* distinguish blocks ready for rmw from | 263 | #define R5_Wantdrain 13 /* dev->towrite needs to be drained */ |
| 232 | * other "towrites" | ||
| 233 | */ | ||
| 234 | /* | 264 | /* |
| 235 | * Write method | 265 | * Write method |
| 236 | */ | 266 | */ |
| @@ -254,8 +284,10 @@ struct r6_state { | |||
| 254 | #define STRIPE_EXPAND_READY 11 | 284 | #define STRIPE_EXPAND_READY 11 |
| 255 | #define STRIPE_IO_STARTED 12 /* do not count towards 'bypass_count' */ | 285 | #define STRIPE_IO_STARTED 12 /* do not count towards 'bypass_count' */ |
| 256 | #define STRIPE_FULL_WRITE 13 /* all blocks are set to be overwritten */ | 286 | #define STRIPE_FULL_WRITE 13 /* all blocks are set to be overwritten */ |
| 287 | #define STRIPE_BIOFILL_RUN 14 | ||
| 288 | #define STRIPE_COMPUTE_RUN 15 | ||
| 257 | /* | 289 | /* |
| 258 | * Operations flags (in issue order) | 290 | * Operation request flags |
| 259 | */ | 291 | */ |
| 260 | #define STRIPE_OP_BIOFILL 0 | 292 | #define STRIPE_OP_BIOFILL 0 |
| 261 | #define STRIPE_OP_COMPUTE_BLK 1 | 293 | #define STRIPE_OP_COMPUTE_BLK 1 |
| @@ -263,14 +295,6 @@ struct r6_state { | |||
| 263 | #define STRIPE_OP_BIODRAIN 3 | 295 | #define STRIPE_OP_BIODRAIN 3 |
| 264 | #define STRIPE_OP_POSTXOR 4 | 296 | #define STRIPE_OP_POSTXOR 4 |
| 265 | #define STRIPE_OP_CHECK 5 | 297 | #define STRIPE_OP_CHECK 5 |
| 266 | #define STRIPE_OP_IO 6 | ||
| 267 | |||
| 268 | /* modifiers to the base operations | ||
| 269 | * STRIPE_OP_MOD_REPAIR_PD - compute the parity block and write it back | ||
| 270 | * STRIPE_OP_MOD_DMA_CHECK - parity is not corrupted by the check | ||
| 271 | */ | ||
| 272 | #define STRIPE_OP_MOD_REPAIR_PD 7 | ||
| 273 | #define STRIPE_OP_MOD_DMA_CHECK 8 | ||
| 274 | 298 | ||
| 275 | /* | 299 | /* |
| 276 | * Plugging: | 300 | * Plugging: |
