author     Linus Torvalds <torvalds@linux-foundation.org>   2017-11-14 19:07:26 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>   2017-11-14 19:07:26 -0500
commit     47f521ba18190e4bfbb65ead3977af5756884427 (patch)
tree       54d6039d71149d8596b66a1d41cfd9eb7f334601
parent     b91593fa8531a7396551dd9c0a0c51e9b9b97ca9 (diff)
parent     0868b99c214a3d55486c700de7c3f770b7243e7c (diff)
Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/shli/md
Pull MD update from Shaohua Li:
 "This update mostly includes bug fixes:
   - md-cluster now supports raid10 from Guoqing
   - raid5 PPL fixes from Artur
   - badblock regression fix from Bo
   - suspend hang related fixes from Neil
   - raid5 reshape fixes from Neil
   - raid1 freeze deadlock fix from Nate
   - memleak fixes from Zdenek
   - bitmap related fixes from Me and Tao
   - other fixes and cleanups"

* 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/shli/md: (33 commits)
  md: free unused memory after bitmap resize
  md: release allocated bitset sync_set
  md/bitmap: clear BITMAP_WRITE_ERROR bit before writing it to sb
  md: be cautious about using ->curr_resync_completed for ->recovery_offset
  badblocks: fix wrong return value in badblocks_set if badblocks are disabled
  md: don't check MD_SB_CHANGE_CLEAN in md_allow_write
  md-cluster: update document for raid10
  md: remove redundant variable q
  raid1: remove obsolete code in raid1_write_request
  md-cluster: Use a small window for raid10 resync
  md-cluster: Suspend writes in RAID10 if within range
  md-cluster/raid10: set "do_balance = 0" if area is resyncing
  md: use lockdep_assert_held
  raid1: prevent freeze_array/wait_all_barriers deadlock
  md: use TASK_IDLE instead of blocking signals
  md: remove special meaning of ->quiesce(.., 2)
  md: allow metadata update while suspending.
  md: use mddev_suspend/resume instead of ->quiesce()
  md: move suspend_hi/lo handling into core md code
  md: don't call bitmap_create() while array is quiesced.
  ...
-rw-r--r--  Documentation/md/md-cluster.txt                                      3
-rw-r--r--  MAINTAINERS                                                          7
-rw-r--r--  block/badblocks.c                                                    2
-rw-r--r--  drivers/md/Kconfig                                                   5
-rw-r--r--  drivers/md/Makefile                                                  5
-rw-r--r--  drivers/md/dm-raid.c                                                12
-rw-r--r--  drivers/md/md-bitmap.c (renamed from drivers/md/bitmap.c)          27
-rw-r--r--  drivers/md/md-bitmap.h (renamed from drivers/md/bitmap.h)           0
-rw-r--r--  drivers/md/md-cluster.c                                             12
-rw-r--r--  drivers/md/md-faulty.c (renamed from drivers/md/faulty.c)           0
-rw-r--r--  drivers/md/md-linear.c (renamed from drivers/md/linear.c)           2
-rw-r--r--  drivers/md/md-linear.h (renamed from drivers/md/linear.h)           0
-rw-r--r--  drivers/md/md-multipath.c (renamed from drivers/md/multipath.c)     4
-rw-r--r--  drivers/md/md-multipath.h (renamed from drivers/md/multipath.h)     0
-rw-r--r--  drivers/md/md.c                                                    147
-rw-r--r--  drivers/md/md.h                                                     20
-rw-r--r--  drivers/md/raid0.c                                                   2
-rw-r--r--  drivers/md/raid1.c                                                  78
-rw-r--r--  drivers/md/raid10.c                                                169
-rw-r--r--  drivers/md/raid10.h                                                  6
-rw-r--r--  drivers/md/raid5-cache.c                                            44
-rw-r--r--  drivers/md/raid5-log.h                                               2
-rw-r--r--  drivers/md/raid5-ppl.c                                               6
-rw-r--r--  drivers/md/raid5.c                                                  79
24 files changed, 409 insertions, 223 deletions
diff --git a/Documentation/md/md-cluster.txt b/Documentation/md/md-cluster.txt
index 82ee51604e9a..e1055f105cf5 100644
--- a/Documentation/md/md-cluster.txt
+++ b/Documentation/md/md-cluster.txt
@@ -1,4 +1,5 @@
-The cluster MD is a shared-device RAID for a cluster.
+The cluster MD is a shared-device RAID for a cluster, it supports
+two levels: raid1 and raid10 (limited support).
 
 
 1. On-disk format
diff --git a/MAINTAINERS b/MAINTAINERS
index ba3d8c197d92..8604cf64a169 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4103,6 +4103,8 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm.git
 T:	quilt http://people.redhat.com/agk/patches/linux/editing/
 S:	Maintained
 F:	Documentation/device-mapper/
+F:	drivers/md/Makefile
+F:	drivers/md/Kconfig
 F:	drivers/md/dm*
 F:	drivers/md/persistent-data/
 F:	include/linux/device-mapper.h
@@ -12487,7 +12489,10 @@ M: Shaohua Li <shli@kernel.org>
 L:	linux-raid@vger.kernel.org
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/shli/md.git
 S:	Supported
-F:	drivers/md/
+F:	drivers/md/Makefile
+F:	drivers/md/Kconfig
+F:	drivers/md/md*
+F:	drivers/md/raid*
 F:	include/linux/raid/
 F:	include/uapi/linux/raid/
 
diff --git a/block/badblocks.c b/block/badblocks.c
index 43c71166e1e2..91f7bcf979d3 100644
--- a/block/badblocks.c
+++ b/block/badblocks.c
@@ -178,7 +178,7 @@ int badblocks_set(struct badblocks *bb, sector_t s, int sectors,
 
 	if (bb->shift < 0)
 		/* badblocks are disabled */
-		return 0;
+		return 1;
 
 	if (bb->shift) {
 		/* round the start down, and the end up */
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 4a249ee86364..83b9362be09c 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -178,7 +178,7 @@ config MD_FAULTY
 
 
 config MD_CLUSTER
-	tristate "Cluster Support for MD (EXPERIMENTAL)"
+	tristate "Cluster Support for MD"
 	depends on BLK_DEV_MD
 	depends on DLM
 	default n
@@ -188,7 +188,8 @@ config MD_CLUSTER
 	  nodes in the cluster can access the MD devices simultaneously.
 
 	  This brings the redundancy (and uptime) of RAID levels across the
-	  nodes of the cluster.
+	  nodes of the cluster. Currently, it can work with raid1 and raid10
+	  (limited support).
 
 	  If unsure, say N.
 
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index e94b6f9be941..f701bb211783 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -19,9 +19,12 @@ dm-cache-y += dm-cache-target.o dm-cache-metadata.o dm-cache-policy.o \
 dm-cache-smq-y	+= dm-cache-policy-smq.o
 dm-era-y	+= dm-era-target.o
 dm-verity-y	+= dm-verity-target.o
-md-mod-y	+= md.o bitmap.o
+md-mod-y	+= md.o md-bitmap.o
 raid456-y	+= raid5.o raid5-cache.o raid5-ppl.o
 dm-zoned-y	+= dm-zoned-target.o dm-zoned-metadata.o dm-zoned-reclaim.o
+linear-y	+= md-linear.o
+multipath-y	+= md-multipath.o
+faulty-y	+= md-faulty.o
 
 # Note: link order is important.  All raid personalities
 # and must come before md.o, as they each initialise
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index a25eebd98996..366c625b9591 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -12,7 +12,7 @@
12#include "raid1.h" 12#include "raid1.h"
13#include "raid5.h" 13#include "raid5.h"
14#include "raid10.h" 14#include "raid10.h"
15#include "bitmap.h" 15#include "md-bitmap.h"
16 16
17#include <linux/device-mapper.h> 17#include <linux/device-mapper.h>
18 18
@@ -3630,8 +3630,11 @@ static void raid_postsuspend(struct dm_target *ti)
 {
 	struct raid_set *rs = ti->private;
 
-	if (!test_and_set_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags))
+	if (!test_and_set_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags)) {
+		mddev_lock_nointr(&rs->md);
 		mddev_suspend(&rs->md);
+		mddev_unlock(&rs->md);
+	}
 
 	rs->md.ro = 1;
 }
@@ -3888,8 +3891,11 @@ static void raid_resume(struct dm_target *ti)
3888 if (!(rs->ctr_flags & RESUME_STAY_FROZEN_FLAGS)) 3891 if (!(rs->ctr_flags & RESUME_STAY_FROZEN_FLAGS))
3889 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 3892 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
3890 3893
3891 if (test_and_clear_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags)) 3894 if (test_and_clear_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags)) {
3895 mddev_lock_nointr(mddev);
3892 mddev_resume(mddev); 3896 mddev_resume(mddev);
3897 mddev_unlock(mddev);
3898 }
3893} 3899}
3894 3900
3895static struct target_type raid_target = { 3901static struct target_type raid_target = {
diff --git a/drivers/md/bitmap.c b/drivers/md/md-bitmap.c
index 4d8ed74efadf..239c7bb3929b 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/md-bitmap.c
@@ -29,7 +29,7 @@
29#include <linux/seq_file.h> 29#include <linux/seq_file.h>
30#include <trace/events/block.h> 30#include <trace/events/block.h>
31#include "md.h" 31#include "md.h"
32#include "bitmap.h" 32#include "md-bitmap.h"
33 33
34static inline char *bmname(struct bitmap *bitmap) 34static inline char *bmname(struct bitmap *bitmap)
35{ 35{
@@ -459,7 +459,11 @@ void bitmap_update_sb(struct bitmap *bitmap)
 	/* rocking back to read-only */
 	bitmap->events_cleared = bitmap->mddev->events;
 	sb->events_cleared = cpu_to_le64(bitmap->events_cleared);
-	sb->state = cpu_to_le32(bitmap->flags);
+	/*
+	 * clear BITMAP_WRITE_ERROR bit to protect against the case that
+	 * a bitmap write error occurred but the later writes succeeded.
+	 */
+	sb->state = cpu_to_le32(bitmap->flags & ~BIT(BITMAP_WRITE_ERROR));
 	/* Just in case these have been changed via sysfs: */
 	sb->daemon_sleep = cpu_to_le32(bitmap->mddev->bitmap_info.daemon_sleep/HZ);
 	sb->write_behind = cpu_to_le32(bitmap->mddev->bitmap_info.max_write_behind);
@@ -625,7 +629,7 @@ re_read:
 		err = read_sb_page(bitmap->mddev,
 				   offset,
 				   sb_page,
-				   0, PAGE_SIZE);
+				   0, sizeof(bitmap_super_t));
 	}
 	if (err)
 		return err;
@@ -1816,6 +1820,12 @@ struct bitmap *bitmap_create(struct mddev *mddev, int slot)
 
 	BUG_ON(file && mddev->bitmap_info.offset);
 
+	if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
+		pr_notice("md/raid:%s: array with journal cannot have bitmap\n",
+			  mdname(mddev));
+		return ERR_PTR(-EBUSY);
+	}
+
 	bitmap = kzalloc(sizeof(*bitmap), GFP_KERNEL);
 	if (!bitmap)
 		return ERR_PTR(-ENOMEM);
@@ -2123,7 +2133,7 @@ int bitmap_resize(struct bitmap *bitmap, sector_t blocks,
 	if (store.sb_page && bitmap->storage.sb_page)
 		memcpy(page_address(store.sb_page),
 		       page_address(bitmap->storage.sb_page),
-		       PAGE_SIZE);
+		       sizeof(bitmap_super_t));
 	bitmap_file_unmap(&bitmap->storage);
 	bitmap->storage = store;
 
@@ -2152,6 +2162,7 @@ int bitmap_resize(struct bitmap *bitmap, sector_t blocks,
 		for (k = 0; k < page; k++) {
 			kfree(new_bp[k].map);
 		}
+		kfree(new_bp);
 
 		/* restore some fields from old_counts */
 		bitmap->counts.bp = old_counts.bp;
@@ -2202,6 +2213,14 @@ int bitmap_resize(struct bitmap *bitmap, sector_t blocks,
 		block += old_blocks;
 	}
 
+	if (bitmap->counts.bp != old_counts.bp) {
+		unsigned long k;
+		for (k = 0; k < old_counts.pages; k++)
+			if (!old_counts.bp[k].hijacked)
+				kfree(old_counts.bp[k].map);
+		kfree(old_counts.bp);
+	}
+
 	if (!init) {
 		int i;
 		while (block < (chunks << chunkshift)) {
diff --git a/drivers/md/bitmap.h b/drivers/md/md-bitmap.h
index 5df35ca90f58..5df35ca90f58 100644
--- a/drivers/md/bitmap.h
+++ b/drivers/md/md-bitmap.h
diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index 03082e17c65c..79bfbc840385 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -15,7 +15,7 @@
15#include <linux/sched.h> 15#include <linux/sched.h>
16#include <linux/raid/md_p.h> 16#include <linux/raid/md_p.h>
17#include "md.h" 17#include "md.h"
18#include "bitmap.h" 18#include "md-bitmap.h"
19#include "md-cluster.h" 19#include "md-cluster.h"
20 20
21#define LVB_SIZE 64 21#define LVB_SIZE 64
@@ -442,10 +442,11 @@ static void __remove_suspend_info(struct md_cluster_info *cinfo, int slot)
 static void remove_suspend_info(struct mddev *mddev, int slot)
 {
 	struct md_cluster_info *cinfo = mddev->cluster_info;
+	mddev->pers->quiesce(mddev, 1);
 	spin_lock_irq(&cinfo->suspend_lock);
 	__remove_suspend_info(cinfo, slot);
 	spin_unlock_irq(&cinfo->suspend_lock);
-	mddev->pers->quiesce(mddev, 2);
+	mddev->pers->quiesce(mddev, 0);
 }
 
 
@@ -492,13 +493,12 @@ static void process_suspend_info(struct mddev *mddev,
492 s->lo = lo; 493 s->lo = lo;
493 s->hi = hi; 494 s->hi = hi;
494 mddev->pers->quiesce(mddev, 1); 495 mddev->pers->quiesce(mddev, 1);
495 mddev->pers->quiesce(mddev, 0);
496 spin_lock_irq(&cinfo->suspend_lock); 496 spin_lock_irq(&cinfo->suspend_lock);
497 /* Remove existing entry (if exists) before adding */ 497 /* Remove existing entry (if exists) before adding */
498 __remove_suspend_info(cinfo, slot); 498 __remove_suspend_info(cinfo, slot);
499 list_add(&s->list, &cinfo->suspend_list); 499 list_add(&s->list, &cinfo->suspend_list);
500 spin_unlock_irq(&cinfo->suspend_lock); 500 spin_unlock_irq(&cinfo->suspend_lock);
501 mddev->pers->quiesce(mddev, 2); 501 mddev->pers->quiesce(mddev, 0);
502} 502}
503 503
504static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg) 504static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg)
@@ -1094,7 +1094,7 @@ static void metadata_update_cancel(struct mddev *mddev)
 /*
  * return 0 if all the bitmaps have the same sync_size
  */
-int cluster_check_sync_size(struct mddev *mddev)
+static int cluster_check_sync_size(struct mddev *mddev)
 {
 	int i, rv;
 	bitmap_super_t *sb;
@@ -1478,7 +1478,7 @@ static struct md_cluster_operations cluster_ops = {
 
 static int __init cluster_init(void)
 {
-	pr_warn("md-cluster: EXPERIMENTAL. Use with caution\n");
+	pr_warn("md-cluster: support raid1 and raid10 (limited support)\n");
 	pr_info("Registering Cluster MD functions\n");
 	register_md_cluster_operations(&cluster_ops, THIS_MODULE);
 	return 0;
diff --git a/drivers/md/faulty.c b/drivers/md/md-faulty.c
index 38264b38420f..38264b38420f 100644
--- a/drivers/md/faulty.c
+++ b/drivers/md/md-faulty.c
diff --git a/drivers/md/linear.c b/drivers/md/md-linear.c
index c464fb48039a..773fc70dced7 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/md-linear.c
@@ -23,7 +23,7 @@
23#include <linux/slab.h> 23#include <linux/slab.h>
24#include <trace/events/block.h> 24#include <trace/events/block.h>
25#include "md.h" 25#include "md.h"
26#include "linear.h" 26#include "md-linear.h"
27 27
28/* 28/*
29 * find which device holds a particular offset 29 * find which device holds a particular offset
diff --git a/drivers/md/linear.h b/drivers/md/md-linear.h
index 8381d651d4ed..8381d651d4ed 100644
--- a/drivers/md/linear.h
+++ b/drivers/md/md-linear.h
diff --git a/drivers/md/multipath.c b/drivers/md/md-multipath.c
index b68e0666b9b0..e40065bdbfc8 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/md-multipath.c
@@ -25,7 +25,7 @@
25#include <linux/seq_file.h> 25#include <linux/seq_file.h>
26#include <linux/slab.h> 26#include <linux/slab.h>
27#include "md.h" 27#include "md.h"
28#include "multipath.h" 28#include "md-multipath.h"
29 29
30#define MAX_WORK_PER_DISK 128 30#define MAX_WORK_PER_DISK 128
31 31
@@ -243,7 +243,6 @@ static void print_multipath_conf (struct mpconf *conf)
 static int multipath_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 {
 	struct mpconf *conf = mddev->private;
-	struct request_queue *q;
 	int err = -EEXIST;
 	int path;
 	struct multipath_info *p;
@@ -257,7 +256,6 @@ static int multipath_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 
 	for (path = first; path <= last; path++)
 		if ((p=conf->multipaths+path)->rdev == NULL) {
-			q = rdev->bdev->bd_disk->queue;
 			disk_stack_limits(mddev->gendisk, rdev->bdev,
 					  rdev->data_offset << 9);
 
diff --git a/drivers/md/multipath.h b/drivers/md/md-multipath.h
index 0adb941f485a..0adb941f485a 100644
--- a/drivers/md/multipath.h
+++ b/drivers/md/md-multipath.h
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 447ddcbc9566..09c3af3dcdca 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -69,7 +69,7 @@
 
 #include <trace/events/block.h>
 #include "md.h"
-#include "bitmap.h"
+#include "md-bitmap.h"
 #include "md-cluster.h"
 
 #ifndef MODULE
@@ -266,16 +266,31 @@ static DEFINE_SPINLOCK(all_mddevs_lock);
  * call has finished, the bio has been linked into some internal structure
  * and so is visible to ->quiesce(), so we don't need the refcount any more.
  */
+static bool is_suspended(struct mddev *mddev, struct bio *bio)
+{
+	if (mddev->suspended)
+		return true;
+	if (bio_data_dir(bio) != WRITE)
+		return false;
+	if (mddev->suspend_lo >= mddev->suspend_hi)
+		return false;
+	if (bio->bi_iter.bi_sector >= mddev->suspend_hi)
+		return false;
+	if (bio_end_sector(bio) < mddev->suspend_lo)
+		return false;
+	return true;
+}
+
 void md_handle_request(struct mddev *mddev, struct bio *bio)
 {
 check_suspended:
 	rcu_read_lock();
-	if (mddev->suspended) {
+	if (is_suspended(mddev, bio)) {
 		DEFINE_WAIT(__wait);
 		for (;;) {
 			prepare_to_wait(&mddev->sb_wait, &__wait,
 					TASK_UNINTERRUPTIBLE);
-			if (!mddev->suspended)
+			if (!is_suspended(mddev, bio))
 				break;
 			rcu_read_unlock();
 			schedule();
@@ -344,12 +359,17 @@ static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio)
 void mddev_suspend(struct mddev *mddev)
 {
 	WARN_ON_ONCE(mddev->thread && current == mddev->thread->tsk);
+	lockdep_assert_held(&mddev->reconfig_mutex);
 	if (mddev->suspended++)
 		return;
 	synchronize_rcu();
 	wake_up(&mddev->sb_wait);
+	set_bit(MD_ALLOW_SB_UPDATE, &mddev->flags);
+	smp_mb__after_atomic();
 	wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0);
 	mddev->pers->quiesce(mddev, 1);
+	clear_bit_unlock(MD_ALLOW_SB_UPDATE, &mddev->flags);
+	wait_event(mddev->sb_wait, !test_bit(MD_UPDATING_SB, &mddev->flags));
 
 	del_timer_sync(&mddev->safemode_timer);
 }
@@ -357,6 +377,7 @@ EXPORT_SYMBOL_GPL(mddev_suspend);
 
 void mddev_resume(struct mddev *mddev)
 {
+	lockdep_assert_held(&mddev->reconfig_mutex);
 	if (--mddev->suspended)
 		return;
 	wake_up(&mddev->sb_wait);
@@ -663,6 +684,7 @@ void mddev_unlock(struct mddev *mddev)
 	 */
 	spin_lock(&pers_lock);
 	md_wakeup_thread(mddev->thread);
+	wake_up(&mddev->sb_wait);
 	spin_unlock(&pers_lock);
 }
 EXPORT_SYMBOL_GPL(mddev_unlock);
@@ -2313,7 +2335,7 @@ static void export_array(struct mddev *mddev)
 
 static bool set_in_sync(struct mddev *mddev)
 {
-	WARN_ON_ONCE(NR_CPUS != 1 && !spin_is_locked(&mddev->lock));
+	lockdep_assert_held(&mddev->lock);
 	if (!mddev->in_sync) {
 		mddev->sync_checkers++;
 		spin_unlock(&mddev->lock);
@@ -2432,10 +2454,18 @@ repeat:
 		}
 	}
 
-	/* First make sure individual recovery_offsets are correct */
+	/*
+	 * First make sure individual recovery_offsets are correct
+	 * curr_resync_completed can only be used during recovery.
+	 * During reshape/resync it might use array-addresses rather
+	 * that device addresses.
+	 */
 	rdev_for_each(rdev, mddev) {
 		if (rdev->raid_disk >= 0 &&
 		    mddev->delta_disks >= 0 &&
+		    test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
+		    test_bit(MD_RECOVERY_RECOVER, &mddev->recovery) &&
+		    !test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
 		    !test_bit(Journal, &rdev->flags) &&
 		    !test_bit(In_sync, &rdev->flags) &&
 		    mddev->curr_resync_completed > rdev->recovery_offset)
@@ -4824,7 +4854,7 @@ suspend_lo_show(struct mddev *mddev, char *page)
 static ssize_t
 suspend_lo_store(struct mddev *mddev, const char *buf, size_t len)
 {
-	unsigned long long old, new;
+	unsigned long long new;
 	int err;
 
 	err = kstrtoull(buf, 10, &new);
@@ -4840,16 +4870,10 @@ suspend_lo_store(struct mddev *mddev, const char *buf, size_t len)
4840 if (mddev->pers == NULL || 4870 if (mddev->pers == NULL ||
4841 mddev->pers->quiesce == NULL) 4871 mddev->pers->quiesce == NULL)
4842 goto unlock; 4872 goto unlock;
4843 old = mddev->suspend_lo; 4873 mddev_suspend(mddev);
4844 mddev->suspend_lo = new; 4874 mddev->suspend_lo = new;
4845 if (new >= old) 4875 mddev_resume(mddev);
4846 /* Shrinking suspended region */ 4876
4847 mddev->pers->quiesce(mddev, 2);
4848 else {
4849 /* Expanding suspended region - need to wait */
4850 mddev->pers->quiesce(mddev, 1);
4851 mddev->pers->quiesce(mddev, 0);
4852 }
4853 err = 0; 4877 err = 0;
4854unlock: 4878unlock:
4855 mddev_unlock(mddev); 4879 mddev_unlock(mddev);
@@ -4867,7 +4891,7 @@ suspend_hi_show(struct mddev *mddev, char *page)
 static ssize_t
 suspend_hi_store(struct mddev *mddev, const char *buf, size_t len)
 {
-	unsigned long long old, new;
+	unsigned long long new;
 	int err;
 
 	err = kstrtoull(buf, 10, &new);
@@ -4880,19 +4904,13 @@ suspend_hi_store(struct mddev *mddev, const char *buf, size_t len)
 	if (err)
 		return err;
 	err = -EINVAL;
-	if (mddev->pers == NULL ||
-	    mddev->pers->quiesce == NULL)
+	if (mddev->pers == NULL)
 		goto unlock;
-	old = mddev->suspend_hi;
+
+	mddev_suspend(mddev);
 	mddev->suspend_hi = new;
-	if (new <= old)
-		/* Shrinking suspended region */
-		mddev->pers->quiesce(mddev, 2);
-	else {
-		/* Expanding suspended region - need to wait */
-		mddev->pers->quiesce(mddev, 1);
-		mddev->pers->quiesce(mddev, 0);
-	}
+	mddev_resume(mddev);
+
 	err = 0;
 unlock:
 	mddev_unlock(mddev);
@@ -5834,8 +5852,14 @@ void md_stop(struct mddev *mddev)
 	 * This is called from dm-raid
 	 */
 	__md_stop(mddev);
-	if (mddev->bio_set)
+	if (mddev->bio_set) {
 		bioset_free(mddev->bio_set);
+		mddev->bio_set = NULL;
+	}
+	if (mddev->sync_set) {
+		bioset_free(mddev->sync_set);
+		mddev->sync_set = NULL;
+	}
 }
 
 EXPORT_SYMBOL_GPL(md_stop);
@@ -6362,7 +6386,7 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info)
 				break;
 			}
 		}
-		if (has_journal) {
+		if (has_journal || mddev->bitmap) {
 			export_rdev(rdev);
 			return -EBUSY;
 		}
@@ -6618,22 +6642,26 @@ static int set_bitmap_file(struct mddev *mddev, int fd)
 		return -ENOENT; /* cannot remove what isn't there */
 	err = 0;
 	if (mddev->pers) {
-		mddev->pers->quiesce(mddev, 1);
 		if (fd >= 0) {
 			struct bitmap *bitmap;
 
 			bitmap = bitmap_create(mddev, -1);
+			mddev_suspend(mddev);
 			if (!IS_ERR(bitmap)) {
 				mddev->bitmap = bitmap;
 				err = bitmap_load(mddev);
 			} else
 				err = PTR_ERR(bitmap);
-		}
-		if (fd < 0 || err) {
+			if (err) {
+				bitmap_destroy(mddev);
+				fd = -1;
+			}
+			mddev_resume(mddev);
+		} else if (fd < 0) {
+			mddev_suspend(mddev);
 			bitmap_destroy(mddev);
-			fd = -1; /* make sure to put the file */
+			mddev_resume(mddev);
 		}
-		mddev->pers->quiesce(mddev, 0);
 	}
 	if (fd < 0) {
 		struct file *f = mddev->bitmap_info.file;
@@ -6735,7 +6763,7 @@ static int set_array_info(struct mddev *mddev, mdu_array_info_t *info)
 
 void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors)
 {
-	WARN(!mddev_is_locked(mddev), "%s: unlocked mddev!\n", __func__);
+	lockdep_assert_held(&mddev->reconfig_mutex);
 
 	if (mddev->external_size)
 		return;
@@ -6917,8 +6945,8 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
6917 mddev->bitmap_info.default_offset; 6945 mddev->bitmap_info.default_offset;
6918 mddev->bitmap_info.space = 6946 mddev->bitmap_info.space =
6919 mddev->bitmap_info.default_space; 6947 mddev->bitmap_info.default_space;
6920 mddev->pers->quiesce(mddev, 1);
6921 bitmap = bitmap_create(mddev, -1); 6948 bitmap = bitmap_create(mddev, -1);
6949 mddev_suspend(mddev);
6922 if (!IS_ERR(bitmap)) { 6950 if (!IS_ERR(bitmap)) {
6923 mddev->bitmap = bitmap; 6951 mddev->bitmap = bitmap;
6924 rv = bitmap_load(mddev); 6952 rv = bitmap_load(mddev);
@@ -6926,7 +6954,7 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
 				rv = PTR_ERR(bitmap);
 			if (rv)
 				bitmap_destroy(mddev);
-			mddev->pers->quiesce(mddev, 0);
+			mddev_resume(mddev);
 		} else {
 			/* remove the bitmap */
 			if (!mddev->bitmap) {
@@ -6949,9 +6977,9 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
6949 mddev->bitmap_info.nodes = 0; 6977 mddev->bitmap_info.nodes = 0;
6950 md_cluster_ops->leave(mddev); 6978 md_cluster_ops->leave(mddev);
6951 } 6979 }
6952 mddev->pers->quiesce(mddev, 1); 6980 mddev_suspend(mddev);
6953 bitmap_destroy(mddev); 6981 bitmap_destroy(mddev);
6954 mddev->pers->quiesce(mddev, 0); 6982 mddev_resume(mddev);
6955 mddev->bitmap_info.offset = 0; 6983 mddev->bitmap_info.offset = 0;
6956 } 6984 }
6957 } 6985 }
@@ -7468,8 +7496,8 @@ void md_wakeup_thread(struct md_thread *thread)
 {
 	if (thread) {
 		pr_debug("md: waking up MD thread %s.\n", thread->tsk->comm);
-		if (!test_and_set_bit(THREAD_WAKEUP, &thread->flags))
-			wake_up(&thread->wqueue);
+		set_bit(THREAD_WAKEUP, &thread->flags);
+		wake_up(&thread->wqueue);
 	}
 }
 EXPORT_SYMBOL(md_wakeup_thread);
@@ -8039,7 +8067,8 @@ bool md_write_start(struct mddev *mddev, struct bio *bi)
 	if (did_change)
 		sysfs_notify_dirent_safe(mddev->sysfs_state);
 	wait_event(mddev->sb_wait,
-		   !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) && !mddev->suspended);
+		   !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) ||
+		   mddev->suspended);
 	if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
 		percpu_ref_put(&mddev->writes_pending);
 		return false;
@@ -8110,7 +8139,6 @@ void md_allow_write(struct mddev *mddev)
 		sysfs_notify_dirent_safe(mddev->sysfs_state);
 		/* wait for the dirty state to be recorded in the metadata */
 		wait_event(mddev->sb_wait,
-			   !test_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags) &&
 			   !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
 	} else
 		spin_unlock(&mddev->lock);
@@ -8477,16 +8505,19 @@ void md_do_sync(struct md_thread *thread)
 	} else {
 		if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
 			mddev->curr_resync = MaxSector;
-		rcu_read_lock();
-		rdev_for_each_rcu(rdev, mddev)
-			if (rdev->raid_disk >= 0 &&
-			    mddev->delta_disks >= 0 &&
-			    !test_bit(Journal, &rdev->flags) &&
-			    !test_bit(Faulty, &rdev->flags) &&
-			    !test_bit(In_sync, &rdev->flags) &&
-			    rdev->recovery_offset < mddev->curr_resync)
-				rdev->recovery_offset = mddev->curr_resync;
-		rcu_read_unlock();
+		if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
+		    test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) {
+			rcu_read_lock();
+			rdev_for_each_rcu(rdev, mddev)
+				if (rdev->raid_disk >= 0 &&
+				    mddev->delta_disks >= 0 &&
+				    !test_bit(Journal, &rdev->flags) &&
+				    !test_bit(Faulty, &rdev->flags) &&
+				    !test_bit(In_sync, &rdev->flags) &&
+				    rdev->recovery_offset < mddev->curr_resync)
+					rdev->recovery_offset = mddev->curr_resync;
+			rcu_read_unlock();
+		}
 	}
 	}
  skip:
@@ -8813,6 +8844,16 @@ void md_check_recovery(struct mddev *mddev)
 	unlock:
 		wake_up(&mddev->sb_wait);
 		mddev_unlock(mddev);
+	} else if (test_bit(MD_ALLOW_SB_UPDATE, &mddev->flags) && mddev->sb_flags) {
+		/* Write superblock - thread that called mddev_suspend()
+		 * holds reconfig_mutex for us.
+		 */
+		set_bit(MD_UPDATING_SB, &mddev->flags);
+		smp_mb__after_atomic();
+		if (test_bit(MD_ALLOW_SB_UPDATE, &mddev->flags))
+			md_update_sb(mddev, 0);
+		clear_bit_unlock(MD_UPDATING_SB, &mddev->flags);
+		wake_up(&mddev->sb_wait);
 	}
 }
 EXPORT_SYMBOL(md_check_recovery);
diff --git a/drivers/md/md.h b/drivers/md/md.h
index d8287d3cd1bf..7d6bcf0eba0c 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -237,6 +237,12 @@ enum mddev_flags {
 	 */
 	MD_HAS_PPL,		/* The raid array has PPL feature set */
 	MD_HAS_MULTIPLE_PPLS,	/* The raid array has multiple PPLs feature set */
+	MD_ALLOW_SB_UPDATE,	/* md_check_recovery is allowed to update
+				 * the metadata without taking reconfig_mutex.
+				 */
+	MD_UPDATING_SB,		/* md_check_recovery is updating the metadata
+				 * without explicitly holding reconfig_mutex.
+				 */
 };
 
 enum mddev_sb_flags {
@@ -494,11 +500,6 @@ static inline void mddev_lock_nointr(struct mddev *mddev)
494 mutex_lock(&mddev->reconfig_mutex); 500 mutex_lock(&mddev->reconfig_mutex);
495} 501}
496 502
497static inline int mddev_is_locked(struct mddev *mddev)
498{
499 return mutex_is_locked(&mddev->reconfig_mutex);
500}
501
502static inline int mddev_trylock(struct mddev *mddev) 503static inline int mddev_trylock(struct mddev *mddev)
503{ 504{
504 return mutex_trylock(&mddev->reconfig_mutex); 505 return mutex_trylock(&mddev->reconfig_mutex);
@@ -538,12 +539,11 @@ struct md_personality
 	int (*check_reshape) (struct mddev *mddev);
 	int (*start_reshape) (struct mddev *mddev);
 	void (*finish_reshape) (struct mddev *mddev);
-	/* quiesce moves between quiescence states
-	 * 0 - fully active
-	 * 1 - no new requests allowed
-	 * others - reserved
+	/* quiesce suspends or resumes internal processing.
+	 * 1 - stop new actions and wait for action io to complete
+	 * 0 - return to normal behaviour
 	 */
-	void (*quiesce) (struct mddev *mddev, int state);
+	void (*quiesce) (struct mddev *mddev, int quiesce);
 	/* takeover is used to transition an array from one
 	 * personality to another. The new personality must be able
 	 * to handle the data in the current layout.
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 5a00fc118470..5ecba9eef441 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -768,7 +768,7 @@ static void *raid0_takeover(struct mddev *mddev)
 	return ERR_PTR(-EINVAL);
 }
 
-static void raid0_quiesce(struct mddev *mddev, int state)
+static void raid0_quiesce(struct mddev *mddev, int quiesce)
 {
 }
 
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index f3f3e40dc9d8..cc9d337a1ed3 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -37,13 +37,12 @@
37#include <linux/module.h> 37#include <linux/module.h>
38#include <linux/seq_file.h> 38#include <linux/seq_file.h>
39#include <linux/ratelimit.h> 39#include <linux/ratelimit.h>
40#include <linux/sched/signal.h>
41 40
42#include <trace/events/block.h> 41#include <trace/events/block.h>
43 42
44#include "md.h" 43#include "md.h"
45#include "raid1.h" 44#include "raid1.h"
46#include "bitmap.h" 45#include "md-bitmap.h"
47 46
48#define UNSUPPORTED_MDDEV_FLAGS \ 47#define UNSUPPORTED_MDDEV_FLAGS \
49 ((1L << MD_HAS_JOURNAL) | \ 48 ((1L << MD_HAS_JOURNAL) | \
@@ -990,14 +989,6 @@ static void wait_barrier(struct r1conf *conf, sector_t sector_nr)
 	_wait_barrier(conf, idx);
 }
 
-static void wait_all_barriers(struct r1conf *conf)
-{
-	int idx;
-
-	for (idx = 0; idx < BARRIER_BUCKETS_NR; idx++)
-		_wait_barrier(conf, idx);
-}
-
 static void _allow_barrier(struct r1conf *conf, int idx)
 {
 	atomic_dec(&conf->nr_pending[idx]);
@@ -1011,14 +1002,6 @@ static void allow_barrier(struct r1conf *conf, sector_t sector_nr)
 	_allow_barrier(conf, idx);
 }
 
-static void allow_all_barriers(struct r1conf *conf)
-{
-	int idx;
-
-	for (idx = 0; idx < BARRIER_BUCKETS_NR; idx++)
-		_allow_barrier(conf, idx);
-}
-
 /* conf->resync_lock should be held */
 static int get_unqueued_pending(struct r1conf *conf)
 {
@@ -1303,42 +1286,28 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
 	int first_clone;
 	int max_sectors;
 
-	/*
-	 * Register the new request and wait if the reconstruction
-	 * thread has put up a bar for new requests.
-	 * Continue immediately if no resync is active currently.
-	 */
-
-
-	if ((bio_end_sector(bio) > mddev->suspend_lo &&
-	    bio->bi_iter.bi_sector < mddev->suspend_hi) ||
-	    (mddev_is_clustered(mddev) &&
-	     md_cluster_ops->area_resyncing(mddev, WRITE,
-		     bio->bi_iter.bi_sector, bio_end_sector(bio)))) {
+	if (mddev_is_clustered(mddev) &&
+	     md_cluster_ops->area_resyncing(mddev, WRITE,
+		     bio->bi_iter.bi_sector, bio_end_sector(bio))) {
 
-		/*
-		 * As the suspend_* range is controlled by userspace, we want
-		 * an interruptible wait.
-		 */
 		DEFINE_WAIT(w);
 		for (;;) {
-			sigset_t full, old;
 			prepare_to_wait(&conf->wait_barrier,
-					&w, TASK_INTERRUPTIBLE);
-			if (bio_end_sector(bio) <= mddev->suspend_lo ||
-			    bio->bi_iter.bi_sector >= mddev->suspend_hi ||
-			    (mddev_is_clustered(mddev) &&
-			     !md_cluster_ops->area_resyncing(mddev, WRITE,
-				     bio->bi_iter.bi_sector,
-				     bio_end_sector(bio))))
+					&w, TASK_IDLE);
+			if (!md_cluster_ops->area_resyncing(mddev, WRITE,
+							bio->bi_iter.bi_sector,
+							bio_end_sector(bio)))
 				break;
-			sigfillset(&full);
-			sigprocmask(SIG_BLOCK, &full, &old);
 			schedule();
-			sigprocmask(SIG_SETMASK, &old, NULL);
 		}
 		finish_wait(&conf->wait_barrier, &w);
 	}
+
+	/*
+	 * Register the new request and wait if the reconstruction
+	 * thread has put up a bar for new requests.
+	 * Continue immediately if no resync is active currently.
+	 */
 	wait_barrier(conf, bio->bi_iter.bi_sector);
 
 	r1_bio = alloc_r1bio(mddev, bio);
@@ -1654,8 +1623,12 @@ static void print_conf(struct r1conf *conf)
 
 static void close_sync(struct r1conf *conf)
 {
-	wait_all_barriers(conf);
-	allow_all_barriers(conf);
+	int idx;
+
+	for (idx = 0; idx < BARRIER_BUCKETS_NR; idx++) {
+		_wait_barrier(conf, idx);
+		_allow_barrier(conf, idx);
+	}
 
 	mempool_destroy(conf->r1buf_pool);
 	conf->r1buf_pool = NULL;
@@ -3277,21 +3250,14 @@ static int raid1_reshape(struct mddev *mddev)
 	return 0;
 }
 
-static void raid1_quiesce(struct mddev *mddev, int state)
+static void raid1_quiesce(struct mddev *mddev, int quiesce)
 {
 	struct r1conf *conf = mddev->private;
 
-	switch(state) {
-	case 2: /* wake for suspend */
-		wake_up(&conf->wait_barrier);
-		break;
-	case 1:
+	if (quiesce)
 		freeze_array(conf, 0);
-		break;
-	case 0:
+	else
 		unfreeze_array(conf);
-		break;
-	}
 }
 
 static void *raid1_takeover(struct mddev *mddev)
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 374df5796649..b9edbc747a95 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -29,7 +29,7 @@
29#include "md.h" 29#include "md.h"
30#include "raid10.h" 30#include "raid10.h"
31#include "raid0.h" 31#include "raid0.h"
32#include "bitmap.h" 32#include "md-bitmap.h"
33 33
34/* 34/*
35 * RAID10 provides a combination of RAID0 and RAID1 functionality. 35 * RAID10 provides a combination of RAID0 and RAID1 functionality.
@@ -136,10 +136,13 @@ static void r10bio_pool_free(void *r10_bio, void *data)
 	kfree(r10_bio);
 }
 
+#define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9)
 /* amount of memory to reserve for resync requests */
 #define RESYNC_WINDOW (1024*1024)
 /* maximum number of concurrent requests, memory permitting */
 #define RESYNC_DEPTH (32*1024*1024/RESYNC_BLOCK_SIZE)
+#define CLUSTER_RESYNC_WINDOW (16 * RESYNC_WINDOW)
+#define CLUSTER_RESYNC_WINDOW_SECTORS (CLUSTER_RESYNC_WINDOW >> 9)
 
 /*
  * When performing a resync, we need to read and compare, so
@@ -383,12 +386,11 @@ static void raid10_end_read_request(struct bio *bio)
 {
 	int uptodate = !bio->bi_status;
 	struct r10bio *r10_bio = bio->bi_private;
-	int slot, dev;
+	int slot;
 	struct md_rdev *rdev;
 	struct r10conf *conf = r10_bio->mddev->private;
 
 	slot = r10_bio->read_slot;
-	dev = r10_bio->devs[slot].devnum;
 	rdev = r10_bio->devs[slot].rdev;
 	/*
 	 * this branch is our 'one mirror IO has finished' event handler:
@@ -748,7 +750,6 @@ static struct md_rdev *read_balance(struct r10conf *conf,
 
 	raid10_find_phys(conf, r10_bio);
 	rcu_read_lock();
-	sectors = r10_bio->sectors;
 	best_slot = -1;
 	best_rdev = NULL;
 	best_dist = MaxSector;
@@ -761,8 +762,11 @@ static struct md_rdev *read_balance(struct r10conf *conf,
 	 * the resync window. We take the first readable disk when
 	 * above the resync window.
 	 */
-	if (conf->mddev->recovery_cp < MaxSector
-	    && (this_sector + sectors >= conf->next_resync))
+	if ((conf->mddev->recovery_cp < MaxSector
+	     && (this_sector + sectors >= conf->next_resync)) ||
+	    (mddev_is_clustered(conf->mddev) &&
+	     md_cluster_ops->area_resyncing(conf->mddev, READ, this_sector,
+					    this_sector + sectors)))
 		do_balance = 0;
 
 	for (slot = 0; slot < conf->copies ; slot++) {
@@ -1293,6 +1297,22 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
 	sector_t sectors;
 	int max_sectors;
 
+	if ((mddev_is_clustered(mddev) &&
+	     md_cluster_ops->area_resyncing(mddev, WRITE,
+					    bio->bi_iter.bi_sector,
+					    bio_end_sector(bio)))) {
+		DEFINE_WAIT(w);
+		for (;;) {
+			prepare_to_wait(&conf->wait_barrier,
+					&w, TASK_IDLE);
+			if (!md_cluster_ops->area_resyncing(mddev, WRITE,
+				 bio->bi_iter.bi_sector, bio_end_sector(bio)))
+				break;
+			schedule();
+		}
+		finish_wait(&conf->wait_barrier, &w);
+	}
+
 	/*
 	 * Register the new request and wait if the reconstruction
 	 * thread has put up a bar for new requests.
@@ -2575,7 +2595,6 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
 	struct bio *bio;
 	struct r10conf *conf = mddev->private;
 	struct md_rdev *rdev = r10_bio->devs[slot].rdev;
-	sector_t bio_last_sector;
 
 	/* we got a read error. Maybe the drive is bad.  Maybe just
 	 * the block and we can fix it.
@@ -2586,7 +2605,6 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
 	 * frozen.
 	 */
 	bio = r10_bio->devs[slot].bio;
-	bio_last_sector = r10_bio->devs[slot].addr + rdev->data_offset + r10_bio->sectors;
 	bio_put(bio);
 	r10_bio->devs[slot].bio = NULL;
 
@@ -2826,6 +2844,43 @@ static struct r10bio *raid10_alloc_init_r10buf(struct r10conf *conf)
 }
 
 /*
+ * Set cluster_sync_high since we need other nodes to add the
+ * range [cluster_sync_low, cluster_sync_high] to suspend list.
+ */
+static void raid10_set_cluster_sync_high(struct r10conf *conf)
+{
+	sector_t window_size;
+	int extra_chunk, chunks;
+
+	/*
+	 * First, here we define "stripe" as a unit which across
+	 * all member devices one time, so we get chunks by use
+	 * raid_disks / near_copies. Otherwise, if near_copies is
+	 * close to raid_disks, then resync window could increases
+	 * linearly with the increase of raid_disks, which means
+	 * we will suspend a really large IO window while it is not
+	 * necessary. If raid_disks is not divisible by near_copies,
+	 * an extra chunk is needed to ensure the whole "stripe" is
+	 * covered.
+	 */
+
+	chunks = conf->geo.raid_disks / conf->geo.near_copies;
+	if (conf->geo.raid_disks % conf->geo.near_copies == 0)
+		extra_chunk = 0;
+	else
+		extra_chunk = 1;
+	window_size = (chunks + extra_chunk) * conf->mddev->chunk_sectors;
+
+	/*
+	 * At least use a 32M window to align with raid1's resync window
+	 */
+	window_size = (CLUSTER_RESYNC_WINDOW_SECTORS > window_size) ?
+			CLUSTER_RESYNC_WINDOW_SECTORS : window_size;
+
+	conf->cluster_sync_high = conf->cluster_sync_low + window_size;
+}
+
+/*
  * perform a "sync" on one "block"
  *
  * We need to make sure that no normal I/O request - particularly write
@@ -2897,6 +2952,9 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 	    test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
 		max_sector = mddev->resync_max_sectors;
 	if (sector_nr >= max_sector) {
+		conf->cluster_sync_low = 0;
+		conf->cluster_sync_high = 0;
+
 		/* If we aborted, we need to abort the
 		 * sync on the 'current' bitmap chucks (there can
 		 * be several when recovering multiple devices).
@@ -3251,7 +3309,17 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 		/* resync. Schedule a read for every block at this virt offset */
 		int count = 0;
 
-		bitmap_cond_end_sync(mddev->bitmap, sector_nr, 0);
+		/*
+		 * Since curr_resync_completed could probably not update in
+		 * time, and we will set cluster_sync_low based on it.
+		 * Let's check against "sector_nr + 2 * RESYNC_SECTORS" for
+		 * safety reason, which ensures curr_resync_completed is
+		 * updated in bitmap_cond_end_sync.
+		 */
+		bitmap_cond_end_sync(mddev->bitmap, sector_nr,
+				     mddev_is_clustered(mddev) &&
+				     (sector_nr + 2 * RESYNC_SECTORS >
+				      conf->cluster_sync_high));
 
 		if (!bitmap_start_sync(mddev->bitmap, sector_nr,
 				       &sync_blocks, mddev->degraded) &&
@@ -3385,6 +3453,52 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 	} while (++page_idx < RESYNC_PAGES);
 	r10_bio->sectors = nr_sectors;
 
+	if (mddev_is_clustered(mddev) &&
+	    test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
+		/* It is resync not recovery */
+		if (conf->cluster_sync_high < sector_nr + nr_sectors) {
+			conf->cluster_sync_low = mddev->curr_resync_completed;
+			raid10_set_cluster_sync_high(conf);
+			/* Send resync message */
+			md_cluster_ops->resync_info_update(mddev,
+						conf->cluster_sync_low,
+						conf->cluster_sync_high);
+		}
+	} else if (mddev_is_clustered(mddev)) {
+		/* This is recovery not resync */
+		sector_t sect_va1, sect_va2;
+		bool broadcast_msg = false;
+
+		for (i = 0; i < conf->geo.raid_disks; i++) {
+			/*
+			 * sector_nr is a device address for recovery, so we
+			 * need translate it to array address before compare
+			 * with cluster_sync_high.
+			 */
+			sect_va1 = raid10_find_virt(conf, sector_nr, i);
+
+			if (conf->cluster_sync_high < sect_va1 + nr_sectors) {
+				broadcast_msg = true;
+				/*
+				 * curr_resync_completed is similar as
+				 * sector_nr, so make the translation too.
+				 */
+				sect_va2 = raid10_find_virt(conf,
+					mddev->curr_resync_completed, i);
+
+				if (conf->cluster_sync_low == 0 ||
+				    conf->cluster_sync_low > sect_va2)
+					conf->cluster_sync_low = sect_va2;
+			}
+		}
+		if (broadcast_msg) {
+			raid10_set_cluster_sync_high(conf);
+			md_cluster_ops->resync_info_update(mddev,
+						conf->cluster_sync_low,
+						conf->cluster_sync_high);
+		}
+	}
+
 	while (biolist) {
 		bio = biolist;
 		biolist = biolist->bi_next;
@@ -3644,6 +3758,18 @@ static int raid10_run(struct mddev *mddev)
 	if (!conf)
 		goto out;
 
+	if (mddev_is_clustered(conf->mddev)) {
+		int fc, fo;
+
+		fc = (mddev->layout >> 8) & 255;
+		fo = mddev->layout & (1<<16);
+		if (fc > 1 || fo > 0) {
+			pr_err("only near layout is supported by clustered"
+				" raid10\n");
+			goto out;
+		}
+	}
+
 	mddev->thread = conf->thread;
 	conf->thread = NULL;
 
@@ -3832,18 +3958,14 @@ static void raid10_free(struct mddev *mddev, void *priv)
 	kfree(conf);
 }
 
-static void raid10_quiesce(struct mddev *mddev, int state)
+static void raid10_quiesce(struct mddev *mddev, int quiesce)
 {
 	struct r10conf *conf = mddev->private;
 
-	switch(state) {
-	case 1:
+	if (quiesce)
 		raise_barrier(conf, 0);
-		break;
-	case 0:
+	else
 		lower_barrier(conf);
-		break;
-	}
 }
 
 static int raid10_resize(struct mddev *mddev, sector_t sectors)
@@ -4578,15 +4700,18 @@ static int handle_reshape_read_error(struct mddev *mddev,
 	/* Use sync reads to get the blocks from somewhere else */
 	int sectors = r10_bio->sectors;
 	struct r10conf *conf = mddev->private;
-	struct {
-		struct r10bio r10_bio;
-		struct r10dev devs[conf->copies];
-	} on_stack;
-	struct r10bio *r10b = &on_stack.r10_bio;
+	struct r10bio *r10b;
 	int slot = 0;
 	int idx = 0;
 	struct page **pages;
 
+	r10b = kmalloc(sizeof(*r10b) +
+	       sizeof(struct r10dev) * conf->copies, GFP_NOIO);
+	if (!r10b) {
+		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+		return -ENOMEM;
+	}
+
 	/* reshape IOs share pages from .devs[0].bio */
 	pages = get_resync_pages(r10_bio->devs[0].bio)->pages;
 
@@ -4635,11 +4760,13 @@ static int handle_reshape_read_error(struct mddev *mddev,
 				/* couldn't read this block, must give up */
 				set_bit(MD_RECOVERY_INTR,
 					&mddev->recovery);
+				kfree(r10b);
 				return -EIO;
 			}
 		sectors -= s;
 		idx++;
 	}
+	kfree(r10b);
 	return 0;
 }
 
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index abceccab6671..db2ac22ac1b4 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -89,6 +89,12 @@ struct r10conf {
 	 * the new thread here until we fully activate the array.
 	 */
 	struct md_thread *thread;
+
+	/*
+	 * Keep track of cluster resync window to send to other nodes.
+	 */
+	sector_t cluster_sync_low;
+	sector_t cluster_sync_high;
 };
 
 /*
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 0b7406ac8ce1..f1c86d938502 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -23,7 +23,7 @@
23#include <linux/types.h> 23#include <linux/types.h>
24#include "md.h" 24#include "md.h"
25#include "raid5.h" 25#include "raid5.h"
26#include "bitmap.h" 26#include "md-bitmap.h"
27#include "raid5-log.h" 27#include "raid5-log.h"
28 28
29/* 29/*
@@ -539,7 +539,7 @@ static void r5l_log_run_stripes(struct r5l_log *log)
 {
 	struct r5l_io_unit *io, *next;
 
-	assert_spin_locked(&log->io_list_lock);
+	lockdep_assert_held(&log->io_list_lock);
 
 	list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) {
 		/* don't change list order */
@@ -555,7 +555,7 @@ static void r5l_move_to_end_ios(struct r5l_log *log)
 {
 	struct r5l_io_unit *io, *next;
 
-	assert_spin_locked(&log->io_list_lock);
+	lockdep_assert_held(&log->io_list_lock);
 
 	list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) {
 		/* don't change list order */
@@ -693,6 +693,8 @@ static void r5c_disable_writeback_async(struct work_struct *work)
 	struct r5l_log *log = container_of(work, struct r5l_log,
 					   disable_writeback_work);
 	struct mddev *mddev = log->rdev->mddev;
+	struct r5conf *conf = mddev->private;
+	int locked = 0;
 
 	if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
 		return;
@@ -701,11 +703,15 @@ static void r5c_disable_writeback_async(struct work_struct *work)
 
 	/* wait superblock change before suspend */
 	wait_event(mddev->sb_wait,
-		   !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
-
-	mddev_suspend(mddev);
-	log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
-	mddev_resume(mddev);
+		   conf->log == NULL ||
+		   (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) &&
+		    (locked = mddev_trylock(mddev))));
+	if (locked) {
+		mddev_suspend(mddev);
+		log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
+		mddev_resume(mddev);
+		mddev_unlock(mddev);
+	}
 }
 
 static void r5l_submit_current_io(struct r5l_log *log)
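The reworked wait above folds a trylock of the reconfiguration mutex into the wait condition, so the async worker either sees the log torn down or proceeds only once it already holds the lock, and can no longer deadlock against a suspending caller. A rough userspace analog using pthreads (hypothetical names, purely illustrative):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t wait_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  sb_wait = PTHREAD_COND_INITIALIZER;
static pthread_mutex_t reconfig_mutex = PTHREAD_MUTEX_INITIALIZER;

static int log_present = 1;		/* stands in for conf->log != NULL   */
static int sb_change_pending = 1;	/* stands in for MD_SB_CHANGE_PENDING */

static void *disable_writeback_worker(void *arg)
{
	int locked = 0;

	(void)arg;
	pthread_mutex_lock(&wait_lock);
	/* Wait until the log is gone, or until no superblock change is
	 * pending *and* the reconfig mutex can be taken without blocking. */
	while (log_present &&
	       (sb_change_pending ||
		!(locked = (pthread_mutex_trylock(&reconfig_mutex) == 0))))
		pthread_cond_wait(&sb_wait, &wait_lock);
	pthread_mutex_unlock(&wait_lock);

	if (locked) {
		puts("worker: got the lock, switching to write-through");
		pthread_mutex_unlock(&reconfig_mutex);
	} else {
		puts("worker: log already torn down, nothing to do");
	}
	return NULL;
}

int main(void)
{
	pthread_t worker;

	pthread_create(&worker, NULL, disable_writeback_worker, NULL);

	/* Clear the pending flag and wake the worker, much as the md core
	 * kicks mddev->sb_wait after writing the superblock. */
	pthread_mutex_lock(&wait_lock);
	sb_change_pending = 0;
	pthread_cond_broadcast(&sb_wait);
	pthread_mutex_unlock(&wait_lock);

	pthread_join(worker, NULL);
	return 0;
}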
@@ -1194,7 +1200,7 @@ static void r5l_run_no_mem_stripe(struct r5l_log *log)
 {
 	struct stripe_head *sh;
 
-	assert_spin_locked(&log->io_list_lock);
+	lockdep_assert_held(&log->io_list_lock);
 
 	if (!list_empty(&log->no_mem_stripes)) {
 		sh = list_first_entry(&log->no_mem_stripes,
@@ -1210,7 +1216,7 @@ static bool r5l_complete_finished_ios(struct r5l_log *log)
 	struct r5l_io_unit *io, *next;
 	bool found = false;
 
-	assert_spin_locked(&log->io_list_lock);
+	lockdep_assert_held(&log->io_list_lock);
 
 	list_for_each_entry_safe(io, next, &log->finished_ios, log_sibling) {
 		/* don't change list order */
@@ -1382,7 +1388,7 @@ static void r5c_flush_stripe(struct r5conf *conf, struct stripe_head *sh)
 	 * raid5_release_stripe() while holding conf->device_lock
 	 */
 	BUG_ON(test_bit(STRIPE_ON_RELEASE_LIST, &sh->state));
-	assert_spin_locked(&conf->device_lock);
+	lockdep_assert_held(&conf->device_lock);
 
 	list_del_init(&sh->lru);
 	atomic_inc(&sh->count);
@@ -1409,7 +1415,7 @@ void r5c_flush_cache(struct r5conf *conf, int num)
 	int count;
 	struct stripe_head *sh, *next;
 
-	assert_spin_locked(&conf->device_lock);
+	lockdep_assert_held(&conf->device_lock);
 	if (!conf->log)
 		return;
 
@@ -1583,21 +1589,21 @@ void r5l_wake_reclaim(struct r5l_log *log, sector_t space)
 	md_wakeup_thread(log->reclaim_thread);
 }
 
-void r5l_quiesce(struct r5l_log *log, int state)
+void r5l_quiesce(struct r5l_log *log, int quiesce)
 {
 	struct mddev *mddev;
-	if (!log || state == 2)
+	if (!log)
 		return;
-	if (state == 0)
-		kthread_unpark(log->reclaim_thread->tsk);
-	else if (state == 1) {
+
+	if (quiesce) {
 		/* make sure r5l_write_super_and_discard_space exits */
 		mddev = log->rdev->mddev;
 		wake_up(&mddev->sb_wait);
 		kthread_park(log->reclaim_thread->tsk);
 		r5l_wake_reclaim(log, MaxSector);
 		r5l_do_reclaim(log);
-	}
+	} else
+		kthread_unpark(log->reclaim_thread->tsk);
 }
 
 bool r5l_log_disk_error(struct r5conf *conf)
@@ -3165,6 +3171,8 @@ void r5l_exit_log(struct r5conf *conf)
 	conf->log = NULL;
 	synchronize_rcu();
 
+	/* Ensure disable_writeback_work wakes up and exits */
+	wake_up(&conf->mddev->sb_wait);
 	flush_work(&log->disable_writeback_work);
 	md_unregister_thread(&log->reclaim_thread);
 	mempool_destroy(log->meta_pool);
diff --git a/drivers/md/raid5-log.h b/drivers/md/raid5-log.h
index 7f9ad5f7cda0..284578b0a349 100644
--- a/drivers/md/raid5-log.h
+++ b/drivers/md/raid5-log.h
@@ -9,7 +9,7 @@ extern void r5l_write_stripe_run(struct r5l_log *log);
 extern void r5l_flush_stripe_to_raid(struct r5l_log *log);
 extern void r5l_stripe_write_finished(struct stripe_head *sh);
 extern int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio);
-extern void r5l_quiesce(struct r5l_log *log, int state);
+extern void r5l_quiesce(struct r5l_log *log, int quiesce);
 extern bool r5l_log_disk_error(struct r5conf *conf);
 extern bool r5c_is_writeback(struct r5l_log *log);
 extern int
diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c
index cd026c88f7ef..628c0bf7b9fd 100644
--- a/drivers/md/raid5-ppl.c
+++ b/drivers/md/raid5-ppl.c
@@ -758,7 +758,8 @@ static int ppl_recover_entry(struct ppl_log *log, struct ppl_header_entry *e,
 			 (unsigned long long)sector);
 
 		rdev = conf->disks[dd_idx].rdev;
-		if (!rdev) {
+		if (!rdev || (!test_bit(In_sync, &rdev->flags) &&
+			      sector >= rdev->recovery_offset)) {
 			pr_debug("%s:%*s data member disk %d missing\n",
 				 __func__, indent, "", dd_idx);
 			update_parity = false;
@@ -1296,8 +1297,7 @@ int ppl_init_log(struct r5conf *conf)
 
 	if (ret) {
 		goto err;
-	} else if (!mddev->pers &&
-		   mddev->recovery_cp == 0 && !mddev->degraded &&
+	} else if (!mddev->pers && mddev->recovery_cp == 0 &&
 		   ppl_conf->recovered_entries > 0 &&
 		   ppl_conf->mismatch_count == 0) {
 		/*
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 7d9a50eed9db..31dc25e2871a 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -55,7 +55,6 @@
 #include <linux/ratelimit.h>
 #include <linux/nodemask.h>
 #include <linux/flex_array.h>
-#include <linux/sched/signal.h>
 
 #include <trace/events/block.h>
 #include <linux/list_sort.h>
@@ -63,7 +62,7 @@
 #include "md.h"
 #include "raid5.h"
 #include "raid0.h"
-#include "bitmap.h"
+#include "md-bitmap.h"
 #include "raid5-log.h"
 
 #define UNSUPPORTED_MDDEV_FLAGS	(1L << MD_FAILFAST_SUPPORTED)
@@ -1818,8 +1817,11 @@ static void ops_complete_reconstruct(void *stripe_head_ref)
 		struct r5dev *dev = &sh->dev[i];
 
 		if (dev->written || i == pd_idx || i == qd_idx) {
-			if (!discard && !test_bit(R5_SkipCopy, &dev->flags))
+			if (!discard && !test_bit(R5_SkipCopy, &dev->flags)) {
 				set_bit(R5_UPTODATE, &dev->flags);
+				if (test_bit(STRIPE_EXPAND_READY, &sh->state))
+					set_bit(R5_Expanded, &dev->flags);
+			}
 			if (fua)
 				set_bit(R5_WantFUA, &dev->flags);
 			if (sync)
@@ -5682,28 +5684,6 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
 				goto retry;
 			}
 
-			if (rw == WRITE &&
-			    logical_sector >= mddev->suspend_lo &&
-			    logical_sector < mddev->suspend_hi) {
-				raid5_release_stripe(sh);
-				/* As the suspend_* range is controlled by
-				 * userspace, we want an interruptible
-				 * wait.
-				 */
-				prepare_to_wait(&conf->wait_for_overlap,
-						&w, TASK_INTERRUPTIBLE);
-				if (logical_sector >= mddev->suspend_lo &&
-				    logical_sector < mddev->suspend_hi) {
-					sigset_t full, old;
-					sigfillset(&full);
-					sigprocmask(SIG_BLOCK, &full, &old);
-					schedule();
-					sigprocmask(SIG_SETMASK, &old, NULL);
-					do_prepare = true;
-				}
-				goto retry;
-			}
-
 			if (test_bit(STRIPE_EXPANDING, &sh->state) ||
 			    !add_stripe_bio(sh, bi, dd_idx, rw, previous)) {
 				/* Stripe is busy expanding or
@@ -5758,6 +5738,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
 	 */
 	struct r5conf *conf = mddev->private;
 	struct stripe_head *sh;
+	struct md_rdev *rdev;
 	sector_t first_sector, last_sector;
 	int raid_disks = conf->previous_raid_disks;
 	int data_disks = raid_disks - conf->max_degraded;
@@ -5880,6 +5861,15 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
 			return 0;
 		mddev->reshape_position = conf->reshape_progress;
 		mddev->curr_resync_completed = sector_nr;
+		if (!mddev->reshape_backwards)
+			/* Can update recovery_offset */
+			rdev_for_each(rdev, mddev)
+				if (rdev->raid_disk >= 0 &&
+				    !test_bit(Journal, &rdev->flags) &&
+				    !test_bit(In_sync, &rdev->flags) &&
+				    rdev->recovery_offset < sector_nr)
+					rdev->recovery_offset = sector_nr;
+
 		conf->reshape_checkpoint = jiffies;
 		set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
 		md_wakeup_thread(mddev->thread);
@@ -5978,6 +5968,14 @@ finish:
 		goto ret;
 	mddev->reshape_position = conf->reshape_progress;
 	mddev->curr_resync_completed = sector_nr;
+	if (!mddev->reshape_backwards)
+		/* Can update recovery_offset */
+		rdev_for_each(rdev, mddev)
+			if (rdev->raid_disk >= 0 &&
+			    !test_bit(Journal, &rdev->flags) &&
+			    !test_bit(In_sync, &rdev->flags) &&
+			    rdev->recovery_offset < sector_nr)
+				rdev->recovery_offset = sector_nr;
 	conf->reshape_checkpoint = jiffies;
 	set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
 	md_wakeup_thread(mddev->thread);
@@ -7156,6 +7154,13 @@ static int raid5_run(struct mddev *mddev)
 			min_offset_diff = diff;
 	}
 
+	if ((test_bit(MD_HAS_JOURNAL, &mddev->flags) || journal_dev) &&
+	    (mddev->bitmap_info.offset || mddev->bitmap_info.file)) {
+		pr_notice("md/raid:%s: array cannot have both journal and bitmap\n",
+			  mdname(mddev));
+		return -EINVAL;
+	}
+
 	if (mddev->reshape_position != MaxSector) {
 		/* Check that we can continue the reshape.
 		 * Difficulties arise if the stripe we would write to
@@ -7958,6 +7963,7 @@ static void end_reshape(struct r5conf *conf)
 {
 
 	if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) {
+		struct md_rdev *rdev;
 
 		spin_lock_irq(&conf->device_lock);
 		conf->previous_raid_disks = conf->raid_disks;
@@ -7965,6 +7971,11 @@ static void end_reshape(struct r5conf *conf)
 		smp_wmb();
 		conf->reshape_progress = MaxSector;
 		conf->mddev->reshape_position = MaxSector;
+		rdev_for_each(rdev, conf->mddev)
+			if (rdev->raid_disk >= 0 &&
+			    !test_bit(Journal, &rdev->flags) &&
+			    !test_bit(In_sync, &rdev->flags))
+				rdev->recovery_offset = MaxSector;
 		spin_unlock_irq(&conf->device_lock);
 		wake_up(&conf->wait_for_overlap);
 
@@ -8020,16 +8031,12 @@ static void raid5_finish_reshape(struct mddev *mddev)
 	}
 }
 
-static void raid5_quiesce(struct mddev *mddev, int state)
+static void raid5_quiesce(struct mddev *mddev, int quiesce)
 {
 	struct r5conf *conf = mddev->private;
 
-	switch(state) {
-	case 2: /* resume for a suspend */
-		wake_up(&conf->wait_for_overlap);
-		break;
-
-	case 1: /* stop all writes */
+	if (quiesce) {
+		/* stop all writes */
 		lock_all_device_hash_locks_irq(conf);
 		/* '2' tells resync/reshape to pause so that all
 		 * active stripes can drain
@@ -8045,17 +8052,15 @@ static void raid5_quiesce(struct mddev *mddev, int state)
 		unlock_all_device_hash_locks_irq(conf);
 		/* allow reshape to continue */
 		wake_up(&conf->wait_for_overlap);
-		break;
-
-	case 0: /* re-enable writes */
+	} else {
+		/* re-enable writes */
 		lock_all_device_hash_locks_irq(conf);
 		conf->quiesce = 0;
 		wake_up(&conf->wait_for_quiescent);
 		wake_up(&conf->wait_for_overlap);
 		unlock_all_device_hash_locks_irq(conf);
-		break;
 	}
-	r5l_quiesce(conf->log, state);
+	r5l_quiesce(conf->log, quiesce);
 }
 
 static void *raid45_takeover_raid0(struct mddev *mddev, int level)