aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/md
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2013-09-10 16:03:41 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2013-09-10 16:03:41 -0400
commit4d7696f1b05f4aeb586c74868fe3da2731daca4b (patch)
treedd6cf4d41df2c0a1f52a85a3f8b8af5a9ebdeb5d /drivers/md
parentb05430fc9341fea7a6228a3611c850a476809596 (diff)
parentbfc90cb0936f5b972706625f38f72c7cb726c20a (diff)
Merge tag 'md/3.12' of git://neil.brown.name/md
Pull md update from Neil Brown: "Headline item is multithreading for RAID5 so that more IO/sec can be supported on fast (SSD) devices. Also TILE-Gx SIMD support for RAID6 calculations and an assortment of bug fixes" * tag 'md/3.12' of git://neil.brown.name/md: raid5: only wakeup necessary threads md/raid5: flush out all pending requests before proceeding with reshape. md/raid5: use seqcount to protect access to shape in make_request. raid5: sysfs entry to control worker thread number raid5: offload stripe handle to workqueue raid5: fix stripe release order raid5: make release_stripe lockless md: avoid deadlock when dirty buffers during md_stop. md: Don't test all of mddev->flags at once. md: Fix apparent cut-and-paste error in super_90_validate raid6/test: replace echo -e with printf RAID: add tilegx SIMD implementation of raid6 md: fix safe_mode buglet. md: don't call md_allow_write in get_bitmap_file.
Diffstat (limited to 'drivers/md')
-rw-r--r--drivers/md/md.c54
-rw-r--r--drivers/md/md.h8
-rw-r--r--drivers/md/raid5.c362
-rw-r--r--drivers/md/raid5.h22
4 files changed, 406 insertions, 40 deletions
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 9f13e13506ef..adf4d7e1d5e1 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1180,7 +1180,7 @@ static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev)
1180 mddev->bitmap_info.offset = 1180 mddev->bitmap_info.offset =
1181 mddev->bitmap_info.default_offset; 1181 mddev->bitmap_info.default_offset;
1182 mddev->bitmap_info.space = 1182 mddev->bitmap_info.space =
1183 mddev->bitmap_info.space; 1183 mddev->bitmap_info.default_space;
1184 } 1184 }
1185 1185
1186 } else if (mddev->pers == NULL) { 1186 } else if (mddev->pers == NULL) {
@@ -3429,7 +3429,7 @@ safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len)
3429 mddev->safemode_delay = (msec*HZ)/1000; 3429 mddev->safemode_delay = (msec*HZ)/1000;
3430 if (mddev->safemode_delay == 0) 3430 if (mddev->safemode_delay == 0)
3431 mddev->safemode_delay = 1; 3431 mddev->safemode_delay = 1;
3432 if (mddev->safemode_delay < old_delay) 3432 if (mddev->safemode_delay < old_delay || old_delay == 0)
3433 md_safemode_timeout((unsigned long)mddev); 3433 md_safemode_timeout((unsigned long)mddev);
3434 } 3434 }
3435 return len; 3435 return len;
@@ -5144,7 +5144,7 @@ int md_run(struct mddev *mddev)
5144 5144
5145 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5145 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5146 5146
5147 if (mddev->flags) 5147 if (mddev->flags & MD_UPDATE_SB_FLAGS)
5148 md_update_sb(mddev, 0); 5148 md_update_sb(mddev, 0);
5149 5149
5150 md_new_event(mddev); 5150 md_new_event(mddev);
@@ -5289,7 +5289,7 @@ static void __md_stop_writes(struct mddev *mddev)
5289 md_super_wait(mddev); 5289 md_super_wait(mddev);
5290 5290
5291 if (mddev->ro == 0 && 5291 if (mddev->ro == 0 &&
5292 (!mddev->in_sync || mddev->flags)) { 5292 (!mddev->in_sync || (mddev->flags & MD_UPDATE_SB_FLAGS))) {
5293 /* mark array as shutdown cleanly */ 5293 /* mark array as shutdown cleanly */
5294 mddev->in_sync = 1; 5294 mddev->in_sync = 1;
5295 md_update_sb(mddev, 1); 5295 md_update_sb(mddev, 1);
@@ -5337,8 +5337,14 @@ static int md_set_readonly(struct mddev *mddev, struct block_device *bdev)
5337 err = -EBUSY; 5337 err = -EBUSY;
5338 goto out; 5338 goto out;
5339 } 5339 }
5340 if (bdev) 5340 if (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags)) {
5341 sync_blockdev(bdev); 5341 /* Someone opened the device since we flushed it
5342 * so page cache could be dirty and it is too late
5343 * to flush. So abort
5344 */
5345 mutex_unlock(&mddev->open_mutex);
5346 return -EBUSY;
5347 }
5342 if (mddev->pers) { 5348 if (mddev->pers) {
5343 __md_stop_writes(mddev); 5349 __md_stop_writes(mddev);
5344 5350
@@ -5373,14 +5379,14 @@ static int do_md_stop(struct mddev * mddev, int mode,
5373 mutex_unlock(&mddev->open_mutex); 5379 mutex_unlock(&mddev->open_mutex);
5374 return -EBUSY; 5380 return -EBUSY;
5375 } 5381 }
5376 if (bdev) 5382 if (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags)) {
5377 /* It is possible IO was issued on some other 5383 /* Someone opened the device since we flushed it
5378 * open file which was closed before we took ->open_mutex. 5384 * so page cache could be dirty and it is too late
5379 * As that was not the last close __blkdev_put will not 5385 * to flush. So abort
5380 * have called sync_blockdev, so we must.
5381 */ 5386 */
5382 sync_blockdev(bdev); 5387 mutex_unlock(&mddev->open_mutex);
5383 5388 return -EBUSY;
5389 }
5384 if (mddev->pers) { 5390 if (mddev->pers) {
5385 if (mddev->ro) 5391 if (mddev->ro)
5386 set_disk_ro(disk, 0); 5392 set_disk_ro(disk, 0);
@@ -5628,10 +5634,7 @@ static int get_bitmap_file(struct mddev * mddev, void __user * arg)
5628 char *ptr, *buf = NULL; 5634 char *ptr, *buf = NULL;
5629 int err = -ENOMEM; 5635 int err = -ENOMEM;
5630 5636
5631 if (md_allow_write(mddev)) 5637 file = kmalloc(sizeof(*file), GFP_NOIO);
5632 file = kmalloc(sizeof(*file), GFP_NOIO);
5633 else
5634 file = kmalloc(sizeof(*file), GFP_KERNEL);
5635 5638
5636 if (!file) 5639 if (!file)
5637 goto out; 5640 goto out;
@@ -6420,6 +6423,20 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
6420 !test_bit(MD_RECOVERY_NEEDED, 6423 !test_bit(MD_RECOVERY_NEEDED,
6421 &mddev->flags), 6424 &mddev->flags),
6422 msecs_to_jiffies(5000)); 6425 msecs_to_jiffies(5000));
6426 if (cmd == STOP_ARRAY || cmd == STOP_ARRAY_RO) {
6427 /* Need to flush page cache, and ensure no-one else opens
6428 * and writes
6429 */
6430 mutex_lock(&mddev->open_mutex);
6431 if (atomic_read(&mddev->openers) > 1) {
6432 mutex_unlock(&mddev->open_mutex);
6433 err = -EBUSY;
6434 goto abort;
6435 }
6436 set_bit(MD_STILL_CLOSED, &mddev->flags);
6437 mutex_unlock(&mddev->open_mutex);
6438 sync_blockdev(bdev);
6439 }
6423 err = mddev_lock(mddev); 6440 err = mddev_lock(mddev);
6424 if (err) { 6441 if (err) {
6425 printk(KERN_INFO 6442 printk(KERN_INFO
@@ -6673,6 +6690,7 @@ static int md_open(struct block_device *bdev, fmode_t mode)
6673 6690
6674 err = 0; 6691 err = 0;
6675 atomic_inc(&mddev->openers); 6692 atomic_inc(&mddev->openers);
6693 clear_bit(MD_STILL_CLOSED, &mddev->flags);
6676 mutex_unlock(&mddev->open_mutex); 6694 mutex_unlock(&mddev->open_mutex);
6677 6695
6678 check_disk_change(bdev); 6696 check_disk_change(bdev);
@@ -7817,7 +7835,7 @@ void md_check_recovery(struct mddev *mddev)
7817 sysfs_notify_dirent_safe(mddev->sysfs_state); 7835 sysfs_notify_dirent_safe(mddev->sysfs_state);
7818 } 7836 }
7819 7837
7820 if (mddev->flags) 7838 if (mddev->flags & MD_UPDATE_SB_FLAGS)
7821 md_update_sb(mddev, 0); 7839 md_update_sb(mddev, 0);
7822 7840
7823 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && 7841 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 20f02c0b5f2d..608050c43f17 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -204,12 +204,16 @@ struct mddev {
204 struct md_personality *pers; 204 struct md_personality *pers;
205 dev_t unit; 205 dev_t unit;
206 int md_minor; 206 int md_minor;
207 struct list_head disks; 207 struct list_head disks;
208 unsigned long flags; 208 unsigned long flags;
209#define MD_CHANGE_DEVS 0 /* Some device status has changed */ 209#define MD_CHANGE_DEVS 0 /* Some device status has changed */
210#define MD_CHANGE_CLEAN 1 /* transition to or from 'clean' */ 210#define MD_CHANGE_CLEAN 1 /* transition to or from 'clean' */
211#define MD_CHANGE_PENDING 2 /* switch from 'clean' to 'active' in progress */ 211#define MD_CHANGE_PENDING 2 /* switch from 'clean' to 'active' in progress */
212#define MD_UPDATE_SB_FLAGS (1 | 2 | 4) /* If these are set, md_update_sb needed */
212#define MD_ARRAY_FIRST_USE 3 /* First use of array, needs initialization */ 213#define MD_ARRAY_FIRST_USE 3 /* First use of array, needs initialization */
214#define MD_STILL_CLOSED 4 /* If set, then array has not been opened since
215 * md_ioctl checked on it.
216 */
213 217
214 int suspended; 218 int suspended;
215 atomic_t active_io; 219 atomic_t active_io;
@@ -218,7 +222,7 @@ struct mddev {
218 * are happening, so run/ 222 * are happening, so run/
219 * takeover/stop are not safe 223 * takeover/stop are not safe
220 */ 224 */
221 int ready; /* See when safe to pass 225 int ready; /* See when safe to pass
222 * IO requests down */ 226 * IO requests down */
223 struct gendisk *gendisk; 227 struct gendisk *gendisk;
224 228
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 78ea44336e75..7ff4f252ca1a 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -53,6 +53,7 @@
53#include <linux/cpu.h> 53#include <linux/cpu.h>
54#include <linux/slab.h> 54#include <linux/slab.h>
55#include <linux/ratelimit.h> 55#include <linux/ratelimit.h>
56#include <linux/nodemask.h>
56#include <trace/events/block.h> 57#include <trace/events/block.h>
57 58
58#include "md.h" 59#include "md.h"
@@ -60,6 +61,10 @@
60#include "raid0.h" 61#include "raid0.h"
61#include "bitmap.h" 62#include "bitmap.h"
62 63
64#define cpu_to_group(cpu) cpu_to_node(cpu)
65#define ANY_GROUP NUMA_NO_NODE
66
67static struct workqueue_struct *raid5_wq;
63/* 68/*
64 * Stripe cache 69 * Stripe cache
65 */ 70 */
@@ -72,6 +77,7 @@
72#define BYPASS_THRESHOLD 1 77#define BYPASS_THRESHOLD 1
73#define NR_HASH (PAGE_SIZE / sizeof(struct hlist_head)) 78#define NR_HASH (PAGE_SIZE / sizeof(struct hlist_head))
74#define HASH_MASK (NR_HASH - 1) 79#define HASH_MASK (NR_HASH - 1)
80#define MAX_STRIPE_BATCH 8
75 81
76static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect) 82static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect)
77{ 83{
@@ -200,6 +206,49 @@ static int stripe_operations_active(struct stripe_head *sh)
200 test_bit(STRIPE_COMPUTE_RUN, &sh->state); 206 test_bit(STRIPE_COMPUTE_RUN, &sh->state);
201} 207}
202 208
209static void raid5_wakeup_stripe_thread(struct stripe_head *sh)
210{
211 struct r5conf *conf = sh->raid_conf;
212 struct r5worker_group *group;
213 int thread_cnt;
214 int i, cpu = sh->cpu;
215
216 if (!cpu_online(cpu)) {
217 cpu = cpumask_any(cpu_online_mask);
218 sh->cpu = cpu;
219 }
220
221 if (list_empty(&sh->lru)) {
222 struct r5worker_group *group;
223 group = conf->worker_groups + cpu_to_group(cpu);
224 list_add_tail(&sh->lru, &group->handle_list);
225 group->stripes_cnt++;
226 sh->group = group;
227 }
228
229 if (conf->worker_cnt_per_group == 0) {
230 md_wakeup_thread(conf->mddev->thread);
231 return;
232 }
233
234 group = conf->worker_groups + cpu_to_group(sh->cpu);
235
236 group->workers[0].working = true;
237 /* at least one worker should run to avoid race */
238 queue_work_on(sh->cpu, raid5_wq, &group->workers[0].work);
239
240 thread_cnt = group->stripes_cnt / MAX_STRIPE_BATCH - 1;
241 /* wakeup more workers */
242 for (i = 1; i < conf->worker_cnt_per_group && thread_cnt > 0; i++) {
243 if (group->workers[i].working == false) {
244 group->workers[i].working = true;
245 queue_work_on(sh->cpu, raid5_wq,
246 &group->workers[i].work);
247 thread_cnt--;
248 }
249 }
250}
251
203static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh) 252static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh)
204{ 253{
205 BUG_ON(!list_empty(&sh->lru)); 254 BUG_ON(!list_empty(&sh->lru));
@@ -214,7 +263,12 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh)
214 else { 263 else {
215 clear_bit(STRIPE_DELAYED, &sh->state); 264 clear_bit(STRIPE_DELAYED, &sh->state);
216 clear_bit(STRIPE_BIT_DELAY, &sh->state); 265 clear_bit(STRIPE_BIT_DELAY, &sh->state);
217 list_add_tail(&sh->lru, &conf->handle_list); 266 if (conf->worker_cnt_per_group == 0) {
267 list_add_tail(&sh->lru, &conf->handle_list);
268 } else {
269 raid5_wakeup_stripe_thread(sh);
270 return;
271 }
218 } 272 }
219 md_wakeup_thread(conf->mddev->thread); 273 md_wakeup_thread(conf->mddev->thread);
220 } else { 274 } else {
@@ -239,12 +293,62 @@ static void __release_stripe(struct r5conf *conf, struct stripe_head *sh)
239 do_release_stripe(conf, sh); 293 do_release_stripe(conf, sh);
240} 294}
241 295
296static struct llist_node *llist_reverse_order(struct llist_node *head)
297{
298 struct llist_node *new_head = NULL;
299
300 while (head) {
301 struct llist_node *tmp = head;
302 head = head->next;
303 tmp->next = new_head;
304 new_head = tmp;
305 }
306
307 return new_head;
308}
309
310/* should hold conf->device_lock already */
311static int release_stripe_list(struct r5conf *conf)
312{
313 struct stripe_head *sh;
314 int count = 0;
315 struct llist_node *head;
316
317 head = llist_del_all(&conf->released_stripes);
318 head = llist_reverse_order(head);
319 while (head) {
320 sh = llist_entry(head, struct stripe_head, release_list);
321 head = llist_next(head);
322 /* sh could be readded after STRIPE_ON_RELEASE_LIST is cleard */
323 smp_mb();
324 clear_bit(STRIPE_ON_RELEASE_LIST, &sh->state);
325 /*
326 * Don't worry the bit is set here, because if the bit is set
327 * again, the count is always > 1. This is true for
328 * STRIPE_ON_UNPLUG_LIST bit too.
329 */
330 __release_stripe(conf, sh);
331 count++;
332 }
333
334 return count;
335}
336
242static void release_stripe(struct stripe_head *sh) 337static void release_stripe(struct stripe_head *sh)
243{ 338{
244 struct r5conf *conf = sh->raid_conf; 339 struct r5conf *conf = sh->raid_conf;
245 unsigned long flags; 340 unsigned long flags;
341 bool wakeup;
246 342
343 if (test_and_set_bit(STRIPE_ON_RELEASE_LIST, &sh->state))
344 goto slow_path;
345 wakeup = llist_add(&sh->release_list, &conf->released_stripes);
346 if (wakeup)
347 md_wakeup_thread(conf->mddev->thread);
348 return;
349slow_path:
247 local_irq_save(flags); 350 local_irq_save(flags);
351 /* we are ok here if STRIPE_ON_RELEASE_LIST is set or not */
248 if (atomic_dec_and_lock(&sh->count, &conf->device_lock)) { 352 if (atomic_dec_and_lock(&sh->count, &conf->device_lock)) {
249 do_release_stripe(conf, sh); 353 do_release_stripe(conf, sh);
250 spin_unlock(&conf->device_lock); 354 spin_unlock(&conf->device_lock);
@@ -359,6 +463,7 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
359 raid5_build_block(sh, i, previous); 463 raid5_build_block(sh, i, previous);
360 } 464 }
361 insert_hash(conf, sh); 465 insert_hash(conf, sh);
466 sh->cpu = smp_processor_id();
362} 467}
363 468
364static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector, 469static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector,
@@ -491,7 +596,8 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
491 if (atomic_read(&sh->count)) { 596 if (atomic_read(&sh->count)) {
492 BUG_ON(!list_empty(&sh->lru) 597 BUG_ON(!list_empty(&sh->lru)
493 && !test_bit(STRIPE_EXPANDING, &sh->state) 598 && !test_bit(STRIPE_EXPANDING, &sh->state)
494 && !test_bit(STRIPE_ON_UNPLUG_LIST, &sh->state)); 599 && !test_bit(STRIPE_ON_UNPLUG_LIST, &sh->state)
600 && !test_bit(STRIPE_ON_RELEASE_LIST, &sh->state));
495 } else { 601 } else {
496 if (!test_bit(STRIPE_HANDLE, &sh->state)) 602 if (!test_bit(STRIPE_HANDLE, &sh->state))
497 atomic_inc(&conf->active_stripes); 603 atomic_inc(&conf->active_stripes);
@@ -499,6 +605,10 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
499 !test_bit(STRIPE_EXPANDING, &sh->state)) 605 !test_bit(STRIPE_EXPANDING, &sh->state))
500 BUG(); 606 BUG();
501 list_del_init(&sh->lru); 607 list_del_init(&sh->lru);
608 if (sh->group) {
609 sh->group->stripes_cnt--;
610 sh->group = NULL;
611 }
502 } 612 }
503 } 613 }
504 } while (sh == NULL); 614 } while (sh == NULL);
@@ -3779,6 +3889,7 @@ static void raid5_activate_delayed(struct r5conf *conf)
3779 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 3889 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
3780 atomic_inc(&conf->preread_active_stripes); 3890 atomic_inc(&conf->preread_active_stripes);
3781 list_add_tail(&sh->lru, &conf->hold_list); 3891 list_add_tail(&sh->lru, &conf->hold_list);
3892 raid5_wakeup_stripe_thread(sh);
3782 } 3893 }
3783 } 3894 }
3784} 3895}
@@ -4058,18 +4169,35 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)
4058 * head of the hold_list has changed, i.e. the head was promoted to the 4169 * head of the hold_list has changed, i.e. the head was promoted to the
4059 * handle_list. 4170 * handle_list.
4060 */ 4171 */
4061static struct stripe_head *__get_priority_stripe(struct r5conf *conf) 4172static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group)
4062{ 4173{
4063 struct stripe_head *sh; 4174 struct stripe_head *sh = NULL, *tmp;
4175 struct list_head *handle_list = NULL;
4176 struct r5worker_group *wg = NULL;
4177
4178 if (conf->worker_cnt_per_group == 0) {
4179 handle_list = &conf->handle_list;
4180 } else if (group != ANY_GROUP) {
4181 handle_list = &conf->worker_groups[group].handle_list;
4182 wg = &conf->worker_groups[group];
4183 } else {
4184 int i;
4185 for (i = 0; i < conf->group_cnt; i++) {
4186 handle_list = &conf->worker_groups[i].handle_list;
4187 wg = &conf->worker_groups[i];
4188 if (!list_empty(handle_list))
4189 break;
4190 }
4191 }
4064 4192
4065 pr_debug("%s: handle: %s hold: %s full_writes: %d bypass_count: %d\n", 4193 pr_debug("%s: handle: %s hold: %s full_writes: %d bypass_count: %d\n",
4066 __func__, 4194 __func__,
4067 list_empty(&conf->handle_list) ? "empty" : "busy", 4195 list_empty(handle_list) ? "empty" : "busy",
4068 list_empty(&conf->hold_list) ? "empty" : "busy", 4196 list_empty(&conf->hold_list) ? "empty" : "busy",
4069 atomic_read(&conf->pending_full_writes), conf->bypass_count); 4197 atomic_read(&conf->pending_full_writes), conf->bypass_count);
4070 4198
4071 if (!list_empty(&conf->handle_list)) { 4199 if (!list_empty(handle_list)) {
4072 sh = list_entry(conf->handle_list.next, typeof(*sh), lru); 4200 sh = list_entry(handle_list->next, typeof(*sh), lru);
4073 4201
4074 if (list_empty(&conf->hold_list)) 4202 if (list_empty(&conf->hold_list))
4075 conf->bypass_count = 0; 4203 conf->bypass_count = 0;
@@ -4087,14 +4215,32 @@ static struct stripe_head *__get_priority_stripe(struct r5conf *conf)
4087 ((conf->bypass_threshold && 4215 ((conf->bypass_threshold &&
4088 conf->bypass_count > conf->bypass_threshold) || 4216 conf->bypass_count > conf->bypass_threshold) ||
4089 atomic_read(&conf->pending_full_writes) == 0)) { 4217 atomic_read(&conf->pending_full_writes) == 0)) {
4090 sh = list_entry(conf->hold_list.next, 4218
4091 typeof(*sh), lru); 4219 list_for_each_entry(tmp, &conf->hold_list, lru) {
4092 conf->bypass_count -= conf->bypass_threshold; 4220 if (conf->worker_cnt_per_group == 0 ||
4093 if (conf->bypass_count < 0) 4221 group == ANY_GROUP ||
4094 conf->bypass_count = 0; 4222 !cpu_online(tmp->cpu) ||
4095 } else 4223 cpu_to_group(tmp->cpu) == group) {
4224 sh = tmp;
4225 break;
4226 }
4227 }
4228
4229 if (sh) {
4230 conf->bypass_count -= conf->bypass_threshold;
4231 if (conf->bypass_count < 0)
4232 conf->bypass_count = 0;
4233 }
4234 wg = NULL;
4235 }
4236
4237 if (!sh)
4096 return NULL; 4238 return NULL;
4097 4239
4240 if (wg) {
4241 wg->stripes_cnt--;
4242 sh->group = NULL;
4243 }
4098 list_del_init(&sh->lru); 4244 list_del_init(&sh->lru);
4099 atomic_inc(&sh->count); 4245 atomic_inc(&sh->count);
4100 BUG_ON(atomic_read(&sh->count) != 1); 4246 BUG_ON(atomic_read(&sh->count) != 1);
@@ -4127,6 +4273,10 @@ static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule)
4127 */ 4273 */
4128 smp_mb__before_clear_bit(); 4274 smp_mb__before_clear_bit();
4129 clear_bit(STRIPE_ON_UNPLUG_LIST, &sh->state); 4275 clear_bit(STRIPE_ON_UNPLUG_LIST, &sh->state);
4276 /*
4277 * STRIPE_ON_RELEASE_LIST could be set here. In that
4278 * case, the count is always > 1 here
4279 */
4130 __release_stripe(conf, sh); 4280 __release_stripe(conf, sh);
4131 cnt++; 4281 cnt++;
4132 } 4282 }
@@ -4286,8 +4436,10 @@ static void make_request(struct mddev *mddev, struct bio * bi)
4286 for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) { 4436 for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
4287 DEFINE_WAIT(w); 4437 DEFINE_WAIT(w);
4288 int previous; 4438 int previous;
4439 int seq;
4289 4440
4290 retry: 4441 retry:
4442 seq = read_seqcount_begin(&conf->gen_lock);
4291 previous = 0; 4443 previous = 0;
4292 prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); 4444 prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
4293 if (unlikely(conf->reshape_progress != MaxSector)) { 4445 if (unlikely(conf->reshape_progress != MaxSector)) {
@@ -4320,7 +4472,7 @@ static void make_request(struct mddev *mddev, struct bio * bi)
4320 previous, 4472 previous,
4321 &dd_idx, NULL); 4473 &dd_idx, NULL);
4322 pr_debug("raid456: make_request, sector %llu logical %llu\n", 4474 pr_debug("raid456: make_request, sector %llu logical %llu\n",
4323 (unsigned long long)new_sector, 4475 (unsigned long long)new_sector,
4324 (unsigned long long)logical_sector); 4476 (unsigned long long)logical_sector);
4325 4477
4326 sh = get_active_stripe(conf, new_sector, previous, 4478 sh = get_active_stripe(conf, new_sector, previous,
@@ -4349,6 +4501,13 @@ static void make_request(struct mddev *mddev, struct bio * bi)
4349 goto retry; 4501 goto retry;
4350 } 4502 }
4351 } 4503 }
4504 if (read_seqcount_retry(&conf->gen_lock, seq)) {
4505 /* Might have got the wrong stripe_head
4506 * by accident
4507 */
4508 release_stripe(sh);
4509 goto retry;
4510 }
4352 4511
4353 if (rw == WRITE && 4512 if (rw == WRITE &&
4354 logical_sector >= mddev->suspend_lo && 4513 logical_sector >= mddev->suspend_lo &&
@@ -4788,14 +4947,14 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
4788 return handled; 4947 return handled;
4789} 4948}
4790 4949
4791#define MAX_STRIPE_BATCH 8 4950static int handle_active_stripes(struct r5conf *conf, int group,
4792static int handle_active_stripes(struct r5conf *conf) 4951 struct r5worker *worker)
4793{ 4952{
4794 struct stripe_head *batch[MAX_STRIPE_BATCH], *sh; 4953 struct stripe_head *batch[MAX_STRIPE_BATCH], *sh;
4795 int i, batch_size = 0; 4954 int i, batch_size = 0;
4796 4955
4797 while (batch_size < MAX_STRIPE_BATCH && 4956 while (batch_size < MAX_STRIPE_BATCH &&
4798 (sh = __get_priority_stripe(conf)) != NULL) 4957 (sh = __get_priority_stripe(conf, group)) != NULL)
4799 batch[batch_size++] = sh; 4958 batch[batch_size++] = sh;
4800 4959
4801 if (batch_size == 0) 4960 if (batch_size == 0)
@@ -4813,6 +4972,39 @@ static int handle_active_stripes(struct r5conf *conf)
4813 return batch_size; 4972 return batch_size;
4814} 4973}
4815 4974
4975static void raid5_do_work(struct work_struct *work)
4976{
4977 struct r5worker *worker = container_of(work, struct r5worker, work);
4978 struct r5worker_group *group = worker->group;
4979 struct r5conf *conf = group->conf;
4980 int group_id = group - conf->worker_groups;
4981 int handled;
4982 struct blk_plug plug;
4983
4984 pr_debug("+++ raid5worker active\n");
4985
4986 blk_start_plug(&plug);
4987 handled = 0;
4988 spin_lock_irq(&conf->device_lock);
4989 while (1) {
4990 int batch_size, released;
4991
4992 released = release_stripe_list(conf);
4993
4994 batch_size = handle_active_stripes(conf, group_id, worker);
4995 worker->working = false;
4996 if (!batch_size && !released)
4997 break;
4998 handled += batch_size;
4999 }
5000 pr_debug("%d stripes handled\n", handled);
5001
5002 spin_unlock_irq(&conf->device_lock);
5003 blk_finish_plug(&plug);
5004
5005 pr_debug("--- raid5worker inactive\n");
5006}
5007
4816/* 5008/*
4817 * This is our raid5 kernel thread. 5009 * This is our raid5 kernel thread.
4818 * 5010 *
@@ -4836,7 +5028,9 @@ static void raid5d(struct md_thread *thread)
4836 spin_lock_irq(&conf->device_lock); 5028 spin_lock_irq(&conf->device_lock);
4837 while (1) { 5029 while (1) {
4838 struct bio *bio; 5030 struct bio *bio;
4839 int batch_size; 5031 int batch_size, released;
5032
5033 released = release_stripe_list(conf);
4840 5034
4841 if ( 5035 if (
4842 !list_empty(&conf->bitmap_list)) { 5036 !list_empty(&conf->bitmap_list)) {
@@ -4860,8 +5054,8 @@ static void raid5d(struct md_thread *thread)
4860 handled++; 5054 handled++;
4861 } 5055 }
4862 5056
4863 batch_size = handle_active_stripes(conf); 5057 batch_size = handle_active_stripes(conf, ANY_GROUP, NULL);
4864 if (!batch_size) 5058 if (!batch_size && !released)
4865 break; 5059 break;
4866 handled += batch_size; 5060 handled += batch_size;
4867 5061
@@ -4989,10 +5183,70 @@ stripe_cache_active_show(struct mddev *mddev, char *page)
4989static struct md_sysfs_entry 5183static struct md_sysfs_entry
4990raid5_stripecache_active = __ATTR_RO(stripe_cache_active); 5184raid5_stripecache_active = __ATTR_RO(stripe_cache_active);
4991 5185
5186static ssize_t
5187raid5_show_group_thread_cnt(struct mddev *mddev, char *page)
5188{
5189 struct r5conf *conf = mddev->private;
5190 if (conf)
5191 return sprintf(page, "%d\n", conf->worker_cnt_per_group);
5192 else
5193 return 0;
5194}
5195
5196static int alloc_thread_groups(struct r5conf *conf, int cnt);
5197static ssize_t
5198raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len)
5199{
5200 struct r5conf *conf = mddev->private;
5201 unsigned long new;
5202 int err;
5203 struct r5worker_group *old_groups;
5204 int old_group_cnt;
5205
5206 if (len >= PAGE_SIZE)
5207 return -EINVAL;
5208 if (!conf)
5209 return -ENODEV;
5210
5211 if (kstrtoul(page, 10, &new))
5212 return -EINVAL;
5213
5214 if (new == conf->worker_cnt_per_group)
5215 return len;
5216
5217 mddev_suspend(mddev);
5218
5219 old_groups = conf->worker_groups;
5220 old_group_cnt = conf->worker_cnt_per_group;
5221
5222 conf->worker_groups = NULL;
5223 err = alloc_thread_groups(conf, new);
5224 if (err) {
5225 conf->worker_groups = old_groups;
5226 conf->worker_cnt_per_group = old_group_cnt;
5227 } else {
5228 if (old_groups)
5229 kfree(old_groups[0].workers);
5230 kfree(old_groups);
5231 }
5232
5233 mddev_resume(mddev);
5234
5235 if (err)
5236 return err;
5237 return len;
5238}
5239
5240static struct md_sysfs_entry
5241raid5_group_thread_cnt = __ATTR(group_thread_cnt, S_IRUGO | S_IWUSR,
5242 raid5_show_group_thread_cnt,
5243 raid5_store_group_thread_cnt);
5244
4992static struct attribute *raid5_attrs[] = { 5245static struct attribute *raid5_attrs[] = {
4993 &raid5_stripecache_size.attr, 5246 &raid5_stripecache_size.attr,
4994 &raid5_stripecache_active.attr, 5247 &raid5_stripecache_active.attr,
4995 &raid5_preread_bypass_threshold.attr, 5248 &raid5_preread_bypass_threshold.attr,
5249 &raid5_group_thread_cnt.attr,
4996 NULL, 5250 NULL,
4997}; 5251};
4998static struct attribute_group raid5_attrs_group = { 5252static struct attribute_group raid5_attrs_group = {
@@ -5000,6 +5254,54 @@ static struct attribute_group raid5_attrs_group = {
5000 .attrs = raid5_attrs, 5254 .attrs = raid5_attrs,
5001}; 5255};
5002 5256
5257static int alloc_thread_groups(struct r5conf *conf, int cnt)
5258{
5259 int i, j;
5260 ssize_t size;
5261 struct r5worker *workers;
5262
5263 conf->worker_cnt_per_group = cnt;
5264 if (cnt == 0) {
5265 conf->worker_groups = NULL;
5266 return 0;
5267 }
5268 conf->group_cnt = num_possible_nodes();
5269 size = sizeof(struct r5worker) * cnt;
5270 workers = kzalloc(size * conf->group_cnt, GFP_NOIO);
5271 conf->worker_groups = kzalloc(sizeof(struct r5worker_group) *
5272 conf->group_cnt, GFP_NOIO);
5273 if (!conf->worker_groups || !workers) {
5274 kfree(workers);
5275 kfree(conf->worker_groups);
5276 conf->worker_groups = NULL;
5277 return -ENOMEM;
5278 }
5279
5280 for (i = 0; i < conf->group_cnt; i++) {
5281 struct r5worker_group *group;
5282
5283 group = &conf->worker_groups[i];
5284 INIT_LIST_HEAD(&group->handle_list);
5285 group->conf = conf;
5286 group->workers = workers + i * cnt;
5287
5288 for (j = 0; j < cnt; j++) {
5289 group->workers[j].group = group;
5290 INIT_WORK(&group->workers[j].work, raid5_do_work);
5291 }
5292 }
5293
5294 return 0;
5295}
5296
5297static void free_thread_groups(struct r5conf *conf)
5298{
5299 if (conf->worker_groups)
5300 kfree(conf->worker_groups[0].workers);
5301 kfree(conf->worker_groups);
5302 conf->worker_groups = NULL;
5303}
5304
5003static sector_t 5305static sector_t
5004raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks) 5306raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks)
5005{ 5307{
@@ -5040,6 +5342,7 @@ static void raid5_free_percpu(struct r5conf *conf)
5040 5342
5041static void free_conf(struct r5conf *conf) 5343static void free_conf(struct r5conf *conf)
5042{ 5344{
5345 free_thread_groups(conf);
5043 shrink_stripes(conf); 5346 shrink_stripes(conf);
5044 raid5_free_percpu(conf); 5347 raid5_free_percpu(conf);
5045 kfree(conf->disks); 5348 kfree(conf->disks);
@@ -5168,7 +5471,11 @@ static struct r5conf *setup_conf(struct mddev *mddev)
5168 conf = kzalloc(sizeof(struct r5conf), GFP_KERNEL); 5471 conf = kzalloc(sizeof(struct r5conf), GFP_KERNEL);
5169 if (conf == NULL) 5472 if (conf == NULL)
5170 goto abort; 5473 goto abort;
5474 /* Don't enable multi-threading by default*/
5475 if (alloc_thread_groups(conf, 0))
5476 goto abort;
5171 spin_lock_init(&conf->device_lock); 5477 spin_lock_init(&conf->device_lock);
5478 seqcount_init(&conf->gen_lock);
5172 init_waitqueue_head(&conf->wait_for_stripe); 5479 init_waitqueue_head(&conf->wait_for_stripe);
5173 init_waitqueue_head(&conf->wait_for_overlap); 5480 init_waitqueue_head(&conf->wait_for_overlap);
5174 INIT_LIST_HEAD(&conf->handle_list); 5481 INIT_LIST_HEAD(&conf->handle_list);
@@ -5176,6 +5483,7 @@ static struct r5conf *setup_conf(struct mddev *mddev)
5176 INIT_LIST_HEAD(&conf->delayed_list); 5483 INIT_LIST_HEAD(&conf->delayed_list);
5177 INIT_LIST_HEAD(&conf->bitmap_list); 5484 INIT_LIST_HEAD(&conf->bitmap_list);
5178 INIT_LIST_HEAD(&conf->inactive_list); 5485 INIT_LIST_HEAD(&conf->inactive_list);
5486 init_llist_head(&conf->released_stripes);
5179 atomic_set(&conf->active_stripes, 0); 5487 atomic_set(&conf->active_stripes, 0);
5180 atomic_set(&conf->preread_active_stripes, 0); 5488 atomic_set(&conf->preread_active_stripes, 0);
5181 atomic_set(&conf->active_aligned_reads, 0); 5489 atomic_set(&conf->active_aligned_reads, 0);
@@ -5980,6 +6288,7 @@ static int raid5_start_reshape(struct mddev *mddev)
5980 6288
5981 atomic_set(&conf->reshape_stripes, 0); 6289 atomic_set(&conf->reshape_stripes, 0);
5982 spin_lock_irq(&conf->device_lock); 6290 spin_lock_irq(&conf->device_lock);
6291 write_seqcount_begin(&conf->gen_lock);
5983 conf->previous_raid_disks = conf->raid_disks; 6292 conf->previous_raid_disks = conf->raid_disks;
5984 conf->raid_disks += mddev->delta_disks; 6293 conf->raid_disks += mddev->delta_disks;
5985 conf->prev_chunk_sectors = conf->chunk_sectors; 6294 conf->prev_chunk_sectors = conf->chunk_sectors;
@@ -5996,8 +6305,16 @@ static int raid5_start_reshape(struct mddev *mddev)
5996 else 6305 else
5997 conf->reshape_progress = 0; 6306 conf->reshape_progress = 0;
5998 conf->reshape_safe = conf->reshape_progress; 6307 conf->reshape_safe = conf->reshape_progress;
6308 write_seqcount_end(&conf->gen_lock);
5999 spin_unlock_irq(&conf->device_lock); 6309 spin_unlock_irq(&conf->device_lock);
6000 6310
6311 /* Now make sure any requests that proceeded on the assumption
6312 * the reshape wasn't running - like Discard or Read - have
6313 * completed.
6314 */
6315 mddev_suspend(mddev);
6316 mddev_resume(mddev);
6317
6001 /* Add some new drives, as many as will fit. 6318 /* Add some new drives, as many as will fit.
6002 * We know there are enough to make the newly sized array work. 6319 * We know there are enough to make the newly sized array work.
6003 * Don't add devices if we are reducing the number of 6320 * Don't add devices if we are reducing the number of
@@ -6472,6 +6789,10 @@ static struct md_personality raid4_personality =
6472 6789
6473static int __init raid5_init(void) 6790static int __init raid5_init(void)
6474{ 6791{
6792 raid5_wq = alloc_workqueue("raid5wq",
6793 WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE|WQ_SYSFS, 0);
6794 if (!raid5_wq)
6795 return -ENOMEM;
6475 register_md_personality(&raid6_personality); 6796 register_md_personality(&raid6_personality);
6476 register_md_personality(&raid5_personality); 6797 register_md_personality(&raid5_personality);
6477 register_md_personality(&raid4_personality); 6798 register_md_personality(&raid4_personality);
@@ -6483,6 +6804,7 @@ static void raid5_exit(void)
6483 unregister_md_personality(&raid6_personality); 6804 unregister_md_personality(&raid6_personality);
6484 unregister_md_personality(&raid5_personality); 6805 unregister_md_personality(&raid5_personality);
6485 unregister_md_personality(&raid4_personality); 6806 unregister_md_personality(&raid4_personality);
6807 destroy_workqueue(raid5_wq);
6486} 6808}
6487 6809
6488module_init(raid5_init); 6810module_init(raid5_init);
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 70c49329ca9a..2113ffa82c7a 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -197,6 +197,7 @@ enum reconstruct_states {
197struct stripe_head { 197struct stripe_head {
198 struct hlist_node hash; 198 struct hlist_node hash;
199 struct list_head lru; /* inactive_list or handle_list */ 199 struct list_head lru; /* inactive_list or handle_list */
200 struct llist_node release_list;
200 struct r5conf *raid_conf; 201 struct r5conf *raid_conf;
201 short generation; /* increments with every 202 short generation; /* increments with every
202 * reshape */ 203 * reshape */
@@ -211,6 +212,8 @@ struct stripe_head {
211 enum check_states check_state; 212 enum check_states check_state;
212 enum reconstruct_states reconstruct_state; 213 enum reconstruct_states reconstruct_state;
213 spinlock_t stripe_lock; 214 spinlock_t stripe_lock;
215 int cpu;
216 struct r5worker_group *group;
214 /** 217 /**
215 * struct stripe_operations 218 * struct stripe_operations
216 * @target - STRIPE_OP_COMPUTE_BLK target 219 * @target - STRIPE_OP_COMPUTE_BLK target
@@ -321,6 +324,7 @@ enum {
321 STRIPE_OPS_REQ_PENDING, 324 STRIPE_OPS_REQ_PENDING,
322 STRIPE_ON_UNPLUG_LIST, 325 STRIPE_ON_UNPLUG_LIST,
323 STRIPE_DISCARD, 326 STRIPE_DISCARD,
327 STRIPE_ON_RELEASE_LIST,
324}; 328};
325 329
326/* 330/*
@@ -363,6 +367,19 @@ struct disk_info {
363 struct md_rdev *rdev, *replacement; 367 struct md_rdev *rdev, *replacement;
364}; 368};
365 369
370struct r5worker {
371 struct work_struct work;
372 struct r5worker_group *group;
373 bool working;
374};
375
376struct r5worker_group {
377 struct list_head handle_list;
378 struct r5conf *conf;
379 struct r5worker *workers;
380 int stripes_cnt;
381};
382
366struct r5conf { 383struct r5conf {
367 struct hlist_head *stripe_hashtbl; 384 struct hlist_head *stripe_hashtbl;
368 struct mddev *mddev; 385 struct mddev *mddev;
@@ -386,6 +403,7 @@ struct r5conf {
386 int prev_chunk_sectors; 403 int prev_chunk_sectors;
387 int prev_algo; 404 int prev_algo;
388 short generation; /* increments with every reshape */ 405 short generation; /* increments with every reshape */
406 seqcount_t gen_lock; /* lock against generation changes */
389 unsigned long reshape_checkpoint; /* Time we last updated 407 unsigned long reshape_checkpoint; /* Time we last updated
390 * metadata */ 408 * metadata */
391 long long min_offset_diff; /* minimum difference between 409 long long min_offset_diff; /* minimum difference between
@@ -445,6 +463,7 @@ struct r5conf {
445 */ 463 */
446 atomic_t active_stripes; 464 atomic_t active_stripes;
447 struct list_head inactive_list; 465 struct list_head inactive_list;
466 struct llist_head released_stripes;
448 wait_queue_head_t wait_for_stripe; 467 wait_queue_head_t wait_for_stripe;
449 wait_queue_head_t wait_for_overlap; 468 wait_queue_head_t wait_for_overlap;
450 int inactive_blocked; /* release of inactive stripes blocked, 469 int inactive_blocked; /* release of inactive stripes blocked,
@@ -458,6 +477,9 @@ struct r5conf {
458 * the new thread here until we fully activate the array. 477 * the new thread here until we fully activate the array.
459 */ 478 */
460 struct md_thread *thread; 479 struct md_thread *thread;
480 struct r5worker_group *worker_groups;
481 int group_cnt;
482 int worker_cnt_per_group;
461}; 483};
462 484
463/* 485/*