diff options

author:    NeilBrown <neilb@suse.de>  2011-07-25 21:34:20 -0400
committer: NeilBrown <neilb@suse.de>  2011-07-25 21:34:20 -0400
commit:    c4c1663be46b2ab94e59d3e0c583a8f6b188ff0c (patch)
tree:      2e0b6b51c0a55c5f9edad2c832a66c9949ef496d
parent:    cbe47ec559c33a68b5ee002051b848d1531a8adb (diff)
md/raid5: replace sh->lock with an 'active' flag.
sh->lock is now mainly used to ensure that two threads aren't running
in the locked part of handle_stripe[56] at the same time.
That can more neatly be achieved with an 'active' flag which we set
while running handle_stripe. If we find the flag is set, we simply
requeue the stripe for later by setting STRIPE_HANDLE.
For safety we take ->device_lock while examining the state of the
stripe and creating a summary in 'stripe_head_state / r6_state'.
This possibly isn't needed but as shared fields like ->toread,
->towrite are checked it is safer for now at least.
We leave the label after the old 'unlock' called "unlock" because it
will disappear in a few patches, so renaming seems pointless.
This leaves the stripe 'locked' for longer as we clear STRIPE_ACTIVE
later, but that is not a problem.
Signed-off-by: NeilBrown <neilb@suse.de>
Reviewed-by: Namhyung Kim <namhyung@gmail.com>
-rw-r--r--  drivers/md/raid5.c | 26
-rw-r--r--  drivers/md/raid5.h | 35
2 files changed, 29 insertions(+), 32 deletions(-)
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 9985138f4c04..f8275b5a6fbe 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c | |||
@@ -1020,14 +1020,12 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) | |||
1020 | if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) { | 1020 | if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) { |
1021 | struct bio *wbi; | 1021 | struct bio *wbi; |
1022 | 1022 | ||
1023 | spin_lock(&sh->lock); | ||
1024 | spin_lock_irq(&sh->raid_conf->device_lock); | 1023 | spin_lock_irq(&sh->raid_conf->device_lock); |
1025 | chosen = dev->towrite; | 1024 | chosen = dev->towrite; |
1026 | dev->towrite = NULL; | 1025 | dev->towrite = NULL; |
1027 | BUG_ON(dev->written); | 1026 | BUG_ON(dev->written); |
1028 | wbi = dev->written = chosen; | 1027 | wbi = dev->written = chosen; |
1029 | spin_unlock_irq(&sh->raid_conf->device_lock); | 1028 | spin_unlock_irq(&sh->raid_conf->device_lock); |
1030 | spin_unlock(&sh->lock); | ||
1031 | 1029 | ||
1032 | while (wbi && wbi->bi_sector < | 1030 | while (wbi && wbi->bi_sector < |
1033 | dev->sector + STRIPE_SECTORS) { | 1031 | dev->sector + STRIPE_SECTORS) { |
@@ -1322,7 +1320,6 @@ static int grow_one_stripe(raid5_conf_t *conf) | |||
1322 | return 0; | 1320 | return 0; |
1323 | 1321 | ||
1324 | sh->raid_conf = conf; | 1322 | sh->raid_conf = conf; |
1325 | spin_lock_init(&sh->lock); | ||
1326 | #ifdef CONFIG_MULTICORE_RAID456 | 1323 | #ifdef CONFIG_MULTICORE_RAID456 |
1327 | init_waitqueue_head(&sh->ops.wait_for_ops); | 1324 | init_waitqueue_head(&sh->ops.wait_for_ops); |
1328 | #endif | 1325 | #endif |
@@ -1442,7 +1439,6 @@ static int resize_stripes(raid5_conf_t *conf, int newsize) | |||
1442 | break; | 1439 | break; |
1443 | 1440 | ||
1444 | nsh->raid_conf = conf; | 1441 | nsh->raid_conf = conf; |
1445 | spin_lock_init(&nsh->lock); | ||
1446 | #ifdef CONFIG_MULTICORE_RAID456 | 1442 | #ifdef CONFIG_MULTICORE_RAID456 |
1447 | init_waitqueue_head(&nsh->ops.wait_for_ops); | 1443 | init_waitqueue_head(&nsh->ops.wait_for_ops); |
1448 | #endif | 1444 | #endif |
@@ -2148,7 +2144,6 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in | |||
2148 | (unsigned long long)sh->sector); | 2144 | (unsigned long long)sh->sector); |
2149 | 2145 | ||
2150 | 2146 | ||
2151 | spin_lock(&sh->lock); | ||
2152 | spin_lock_irq(&conf->device_lock); | 2147 | spin_lock_irq(&conf->device_lock); |
2153 | if (forwrite) { | 2148 | if (forwrite) { |
2154 | bip = &sh->dev[dd_idx].towrite; | 2149 | bip = &sh->dev[dd_idx].towrite; |
@@ -2184,7 +2179,6 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in | |||
2184 | set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags); | 2179 | set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags); |
2185 | } | 2180 | } |
2186 | spin_unlock_irq(&conf->device_lock); | 2181 | spin_unlock_irq(&conf->device_lock); |
2187 | spin_unlock(&sh->lock); | ||
2188 | 2182 | ||
2189 | pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n", | 2183 | pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n", |
2190 | (unsigned long long)(*bip)->bi_sector, | 2184 | (unsigned long long)(*bip)->bi_sector, |
@@ -2201,7 +2195,6 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in | |||
2201 | overlap: | 2195 | overlap: |
2202 | set_bit(R5_Overlap, &sh->dev[dd_idx].flags); | 2196 | set_bit(R5_Overlap, &sh->dev[dd_idx].flags); |
2203 | spin_unlock_irq(&conf->device_lock); | 2197 | spin_unlock_irq(&conf->device_lock); |
2204 | spin_unlock(&sh->lock); | ||
2205 | return 0; | 2198 | return 0; |
2206 | } | 2199 | } |
2207 | 2200 | ||
@@ -3023,12 +3016,10 @@ static void handle_stripe5(struct stripe_head *sh) | |||
3023 | atomic_read(&sh->count), sh->pd_idx, sh->check_state, | 3016 | atomic_read(&sh->count), sh->pd_idx, sh->check_state, |
3024 | sh->reconstruct_state); | 3017 | sh->reconstruct_state); |
3025 | 3018 | ||
3026 | spin_lock(&sh->lock); | ||
3027 | if (test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) { | 3019 | if (test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) { |
3028 | set_bit(STRIPE_SYNCING, &sh->state); | 3020 | set_bit(STRIPE_SYNCING, &sh->state); |
3029 | clear_bit(STRIPE_INSYNC, &sh->state); | 3021 | clear_bit(STRIPE_INSYNC, &sh->state); |
3030 | } | 3022 | } |
3031 | clear_bit(STRIPE_HANDLE, &sh->state); | ||
3032 | clear_bit(STRIPE_DELAYED, &sh->state); | 3023 | clear_bit(STRIPE_DELAYED, &sh->state); |
3033 | 3024 | ||
3034 | s.syncing = test_bit(STRIPE_SYNCING, &sh->state); | 3025 | s.syncing = test_bit(STRIPE_SYNCING, &sh->state); |
@@ -3037,6 +3028,7 @@ static void handle_stripe5(struct stripe_head *sh) | |||
3037 | 3028 | ||
3038 | /* Now to look around and see what can be done */ | 3029 | /* Now to look around and see what can be done */ |
3039 | rcu_read_lock(); | 3030 | rcu_read_lock(); |
3031 | spin_lock_irq(&conf->device_lock); | ||
3040 | for (i=disks; i--; ) { | 3032 | for (i=disks; i--; ) { |
3041 | mdk_rdev_t *rdev; | 3033 | mdk_rdev_t *rdev; |
3042 | 3034 | ||
@@ -3099,6 +3091,7 @@ static void handle_stripe5(struct stripe_head *sh) | |||
3099 | s.failed_num = i; | 3091 | s.failed_num = i; |
3100 | } | 3092 | } |
3101 | } | 3093 | } |
3094 | spin_unlock_irq(&conf->device_lock); | ||
3102 | rcu_read_unlock(); | 3095 | rcu_read_unlock(); |
3103 | 3096 | ||
3104 | if (unlikely(blocked_rdev)) { | 3097 | if (unlikely(blocked_rdev)) { |
@@ -3275,7 +3268,6 @@ static void handle_stripe5(struct stripe_head *sh) | |||
3275 | handle_stripe_expansion(conf, sh, NULL); | 3268 | handle_stripe_expansion(conf, sh, NULL); |
3276 | 3269 | ||
3277 | unlock: | 3270 | unlock: |
3278 | spin_unlock(&sh->lock); | ||
3279 | 3271 | ||
3280 | /* wait for this device to become unblocked */ | 3272 | /* wait for this device to become unblocked */ |
3281 | if (unlikely(blocked_rdev)) | 3273 | if (unlikely(blocked_rdev)) |
@@ -3318,12 +3310,10 @@ static void handle_stripe6(struct stripe_head *sh) | |||
3318 | sh->check_state, sh->reconstruct_state); | 3310 | sh->check_state, sh->reconstruct_state); |
3319 | memset(&s, 0, sizeof(s)); | 3311 | memset(&s, 0, sizeof(s)); |
3320 | 3312 | ||
3321 | spin_lock(&sh->lock); | ||
3322 | if (test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) { | 3313 | if (test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) { |
3323 | set_bit(STRIPE_SYNCING, &sh->state); | 3314 | set_bit(STRIPE_SYNCING, &sh->state); |
3324 | clear_bit(STRIPE_INSYNC, &sh->state); | 3315 | clear_bit(STRIPE_INSYNC, &sh->state); |
3325 | } | 3316 | } |
3326 | clear_bit(STRIPE_HANDLE, &sh->state); | ||
3327 | clear_bit(STRIPE_DELAYED, &sh->state); | 3317 | clear_bit(STRIPE_DELAYED, &sh->state); |
3328 | 3318 | ||
3329 | s.syncing = test_bit(STRIPE_SYNCING, &sh->state); | 3319 | s.syncing = test_bit(STRIPE_SYNCING, &sh->state); |
@@ -3332,6 +3322,7 @@ static void handle_stripe6(struct stripe_head *sh) | |||
3332 | /* Now to look around and see what can be done */ | 3322 | /* Now to look around and see what can be done */ |
3333 | 3323 | ||
3334 | rcu_read_lock(); | 3324 | rcu_read_lock(); |
3325 | spin_lock_irq(&conf->device_lock); | ||
3335 | for (i=disks; i--; ) { | 3326 | for (i=disks; i--; ) { |
3336 | mdk_rdev_t *rdev; | 3327 | mdk_rdev_t *rdev; |
3337 | dev = &sh->dev[i]; | 3328 | dev = &sh->dev[i]; |
@@ -3395,6 +3386,7 @@ static void handle_stripe6(struct stripe_head *sh) | |||
3395 | s.failed++; | 3386 | s.failed++; |
3396 | } | 3387 | } |
3397 | } | 3388 | } |
3389 | spin_unlock_irq(&conf->device_lock); | ||
3398 | rcu_read_unlock(); | 3390 | rcu_read_unlock(); |
3399 | 3391 | ||
3400 | if (unlikely(blocked_rdev)) { | 3392 | if (unlikely(blocked_rdev)) { |
@@ -3580,7 +3572,6 @@ static void handle_stripe6(struct stripe_head *sh) | |||
3580 | handle_stripe_expansion(conf, sh, &r6s); | 3572 | handle_stripe_expansion(conf, sh, &r6s); |
3581 | 3573 | ||
3582 | unlock: | 3574 | unlock: |
3583 | spin_unlock(&sh->lock); | ||
3584 | 3575 | ||
3585 | /* wait for this device to become unblocked */ | 3576 | /* wait for this device to become unblocked */ |
3586 | if (unlikely(blocked_rdev)) | 3577 | if (unlikely(blocked_rdev)) |
@@ -3608,10 +3599,19 @@ static void handle_stripe6(struct stripe_head *sh) | |||
3608 | 3599 | ||
3609 | static void handle_stripe(struct stripe_head *sh) | 3600 | static void handle_stripe(struct stripe_head *sh) |
3610 | { | 3601 | { |
3602 | clear_bit(STRIPE_HANDLE, &sh->state); | ||
3603 | if (test_and_set_bit(STRIPE_ACTIVE, &sh->state)) { | ||
3604 | /* already being handled, ensure it gets handled | ||
3605 | * again when current action finishes */ | ||
3606 | set_bit(STRIPE_HANDLE, &sh->state); | ||
3607 | return; | ||
3608 | } | ||
3609 | |||
3611 | if (sh->raid_conf->level == 6) | 3610 | if (sh->raid_conf->level == 6) |
3612 | handle_stripe6(sh); | 3611 | handle_stripe6(sh); |
3613 | else | 3612 | else |
3614 | handle_stripe5(sh); | 3613 | handle_stripe5(sh); |
3614 | clear_bit(STRIPE_ACTIVE, &sh->state); | ||
3615 | } | 3615 | } |
3616 | 3616 | ||
3617 | static void raid5_activate_delayed(raid5_conf_t *conf) | 3617 | static void raid5_activate_delayed(raid5_conf_t *conf) |
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index a33001137bf8..bb246d9e0547 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h | |||
@@ -6,11 +6,11 @@ | |||
6 | 6 | ||
7 | /* | 7 | /* |
8 | * | 8 | * |
9 | * Each stripe contains one buffer per disc. Each buffer can be in | 9 | * Each stripe contains one buffer per device. Each buffer can be in |
10 | * one of a number of states stored in "flags". Changes between | 10 | * one of a number of states stored in "flags". Changes between |
11 | * these states happen *almost* exclusively under a per-stripe | 11 | * these states happen *almost* exclusively under the protection of the |
12 | * spinlock. Some very specific changes can happen in bi_end_io, and | 12 | * STRIPE_ACTIVE flag. Some very specific changes can happen in bi_end_io, and |
13 | * these are not protected by the spin lock. | 13 | * these are not protected by STRIPE_ACTIVE. |
14 | * | 14 | * |
15 | * The flag bits that are used to represent these states are: | 15 | * The flag bits that are used to represent these states are: |
16 | * R5_UPTODATE and R5_LOCKED | 16 | * R5_UPTODATE and R5_LOCKED |
@@ -76,12 +76,10 @@ | |||
76 | * block and the cached buffer are successfully written, any buffer on | 76 | * block and the cached buffer are successfully written, any buffer on |
77 | * a written list can be returned with b_end_io. | 77 | * a written list can be returned with b_end_io. |
78 | * | 78 | * |
79 | * The write list and read list both act as fifos. The read list is | 79 | * The write list and read list both act as fifos. The read list, |
80 | * protected by the device_lock. The write and written lists are | 80 | * write list and written list are protected by the device_lock. |
81 | * protected by the stripe lock. The device_lock, which can be | 81 | * The device_lock is only for list manipulations and will only be |
82 | * claimed while the stipe lock is held, is only for list | 82 | * held for a very short time. It can be claimed from interrupts. |
83 | * manipulations and will only be held for a very short time. It can | ||
84 | * be claimed from interrupts. | ||
85 | * | 83 | * |
86 | * | 84 | * |
87 | * Stripes in the stripe cache can be on one of two lists (or on | 85 | * Stripes in the stripe cache can be on one of two lists (or on |
@@ -96,7 +94,6 @@ | |||
96 | * | 94 | * |
97 | * The inactive_list, handle_list and hash bucket lists are all protected by the | 95 | * The inactive_list, handle_list and hash bucket lists are all protected by the |
98 | * device_lock. | 96 | * device_lock. |
99 | * - stripes on the inactive_list never have their stripe_lock held. | ||
100 | * - stripes have a reference counter. If count==0, they are on a list. | 97 | * - stripes have a reference counter. If count==0, they are on a list. |
101 | * - If a stripe might need handling, STRIPE_HANDLE is set. | 98 | * - If a stripe might need handling, STRIPE_HANDLE is set. |
102 | * - When refcount reaches zero, then if STRIPE_HANDLE it is put on | 99 | * - When refcount reaches zero, then if STRIPE_HANDLE it is put on |
@@ -116,10 +113,10 @@ | |||
116 | * attach a request to an active stripe (add_stripe_bh()) | 113 | * attach a request to an active stripe (add_stripe_bh()) |
117 | * lockdev attach-buffer unlockdev | 114 | * lockdev attach-buffer unlockdev |
118 | * handle a stripe (handle_stripe()) | 115 | * handle a stripe (handle_stripe()) |
119 | * lockstripe clrSTRIPE_HANDLE ... | 116 | * setSTRIPE_ACTIVE, clrSTRIPE_HANDLE ... |
120 | * (lockdev check-buffers unlockdev) .. | 117 | * (lockdev check-buffers unlockdev) .. |
121 | * change-state .. | 118 | * change-state .. |
122 | * record io/ops needed unlockstripe schedule io/ops | 119 | * record io/ops needed clearSTRIPE_ACTIVE schedule io/ops |
123 | * release an active stripe (release_stripe()) | 120 | * release an active stripe (release_stripe()) |
124 | * lockdev if (!--cnt) { if STRIPE_HANDLE, add to handle_list else add to inactive-list } unlockdev | 121 | * lockdev if (!--cnt) { if STRIPE_HANDLE, add to handle_list else add to inactive-list } unlockdev |
125 | * | 122 | * |
@@ -128,8 +125,7 @@ | |||
128 | * on a cached buffer, and plus one if the stripe is undergoing stripe | 125 | * on a cached buffer, and plus one if the stripe is undergoing stripe |
129 | * operations. | 126 | * operations. |
130 | * | 127 | * |
131 | * Stripe operations are performed outside the stripe lock, | 128 | * The stripe operations are: |
132 | * the stripe operations are: | ||
133 | * -copying data between the stripe cache and user application buffers | 129 | * -copying data between the stripe cache and user application buffers |
134 | * -computing blocks to save a disk access, or to recover a missing block | 130 | * -computing blocks to save a disk access, or to recover a missing block |
135 | * -updating the parity on a write operation (reconstruct write and | 131 | * -updating the parity on a write operation (reconstruct write and |
@@ -159,7 +155,8 @@ | |||
159 | */ | 155 | */ |
160 | 156 | ||
161 | /* | 157 | /* |
162 | * Operations state - intermediate states that are visible outside of sh->lock | 158 | * Operations state - intermediate states that are visible outside of |
159 | * STRIPE_ACTIVE. | ||
163 | * In general _idle indicates nothing is running, _run indicates a data | 160 | * In general _idle indicates nothing is running, _run indicates a data |
164 | * processing operation is active, and _result means the data processing result | 161 | * processing operation is active, and _result means the data processing result |
165 | * is stable and can be acted upon. For simple operations like biofill and | 162 | * is stable and can be acted upon. For simple operations like biofill and |
@@ -209,7 +206,6 @@ struct stripe_head { | |||
209 | short ddf_layout;/* use DDF ordering to calculate Q */ | 206 | short ddf_layout;/* use DDF ordering to calculate Q */ |
210 | unsigned long state; /* state flags */ | 207 | unsigned long state; /* state flags */ |
211 | atomic_t count; /* nr of active thread/requests */ | 208 | atomic_t count; /* nr of active thread/requests */ |
212 | spinlock_t lock; | ||
213 | int bm_seq; /* sequence number for bitmap flushes */ | 209 | int bm_seq; /* sequence number for bitmap flushes */ |
214 | int disks; /* disks in stripe */ | 210 | int disks; /* disks in stripe */ |
215 | enum check_states check_state; | 211 | enum check_states check_state; |
@@ -240,7 +236,7 @@ struct stripe_head { | |||
240 | }; | 236 | }; |
241 | 237 | ||
242 | /* stripe_head_state - collects and tracks the dynamic state of a stripe_head | 238 | /* stripe_head_state - collects and tracks the dynamic state of a stripe_head |
243 | * for handle_stripe. It is only valid under spin_lock(sh->lock); | 239 | * for handle_stripe. |
244 | */ | 240 | */ |
245 | struct stripe_head_state { | 241 | struct stripe_head_state { |
246 | int syncing, expanding, expanded; | 242 | int syncing, expanding, expanded; |
@@ -290,6 +286,7 @@ struct r6_state { | |||
290 | * Stripe state | 286 | * Stripe state |
291 | */ | 287 | */ |
292 | enum { | 288 | enum { |
289 | STRIPE_ACTIVE, | ||
293 | STRIPE_HANDLE, | 290 | STRIPE_HANDLE, |
294 | STRIPE_SYNC_REQUESTED, | 291 | STRIPE_SYNC_REQUESTED, |
295 | STRIPE_SYNCING, | 292 | STRIPE_SYNCING, |
@@ -339,7 +336,7 @@ enum { | |||
339 | * PREREAD_ACTIVE. | 336 | * PREREAD_ACTIVE. |
340 | * In stripe_handle, if we find pre-reading is necessary, we do it if | 337 | * In stripe_handle, if we find pre-reading is necessary, we do it if |
341 | * PREREAD_ACTIVE is set, else we set DELAYED which will send it to the delayed queue. | 338 | * PREREAD_ACTIVE is set, else we set DELAYED which will send it to the delayed queue. |
342 | * HANDLE gets cleared if stripe_handle leave nothing locked. | 339 | * HANDLE gets cleared if stripe_handle leaves nothing locked. |
343 | */ | 340 | */ |
344 | 341 | ||
345 | 342 | ||