From bb8bf15bd6f7c3432ce9ad631f2f59c49c1e1853 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Thu, 2 Jun 2016 23:32:04 -0400 Subject: md-cluster: fix deadlock issue when add disk to an recoverying array Add a disk to an array which is performing recovery is a little complicated, we need to do both reap the sync thread and perform add disk for the case, then it caused deadlock as follows. linux44:~ # ps aux|grep md|grep D root 1822 0.0 0.0 0 0 ? D 16:50 0:00 [md127_resync] root 1848 0.0 0.0 19860 952 pts/0 D+ 16:50 0:00 mdadm --manage /dev/md127 --re-add /dev/vdb linux44:~ # cat /proc/1848/stack [] kthread_stop+0x6e/0x120 [] md_unregister_thread+0x40/0x80 [md_mod] [] md_reap_sync_thread+0x15/0x150 [md_mod] [] action_store+0x260/0x270 [md_mod] [] md_attr_store+0xb4/0x100 [md_mod] [] sysfs_write_file+0xbe/0x140 [] vfs_write+0xb8/0x1e0 [] SyS_write+0x48/0xa0 [] system_call_fastpath+0x16/0x1b [<00007f068ea1ed30>] 0x7f068ea1ed30 linux44:~ # cat /proc/1822/stack [] md_do_sync+0x846/0xf40 [md_mod] [] md_thread+0x16d/0x180 [md_mod] [] kthread+0xb4/0xc0 [] ret_from_fork+0x58/0x90 Task1848 Task1822 md_attr_store (held reconfig_mutex by call mddev_lock()) action_store md_reap_sync_thread md_unregister_thread kthread_stop md_wakeup_thread(mddev->thread); wait_event(mddev->sb_wait, !test_bit(MD_CHANGE_PENDING)) md_check_recovery is triggered by wakeup mddev->thread, but it can't clear MD_CHANGE_PENDING flag since it can't get lock which was held by md_attr_store already. To solve the deadlock problem, we move "->resync_finish()" from md_do_sync to md_reap_sync_thread (after md_update_sb), also MD_HELD_RESYNC_LOCK is introduced since it is possible that node can't get resync lock in md_do_sync. Then we do not need to wait for MD_CHANGE_PENDING is cleared or not since metadata should be updated after md_update_sb, so just call resync_finish if MD_HELD_RESYNC_LOCK is set. We also unified the code after skip label, since set PENDING for non-clustered case should be harmless. Reviewed-by: NeilBrown Signed-off-by: Guoqing Jiang Signed-off-by: Shaohua Li --- drivers/md/md.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'drivers/md/md.h') diff --git a/drivers/md/md.h b/drivers/md/md.h index b5c4be73e6e4..03b19aad4921 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -204,6 +204,9 @@ struct mddev { #define MD_RELOAD_SB 7 /* Reload the superblock because another node * updated it. */ +#define MD_CLUSTER_RESYNC_LOCKED 8 /* cluster raid only, which means node + * already took resync lock, need to + * release the lock */ int suspended; atomic_t active_io; -- cgit v1.2.2 From d787be4092e27728cb4c012bee9762098ef3c662 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 2 Jun 2016 16:19:53 +1000 Subject: md: reduce the number of synchronize_rcu() calls when multiple devices fail. Every time a device is removed with ->hot_remove_disk() a synchronize_rcu() call is made which can delay several milliseconds in some case. If lots of devices fail at once - as could happen with a large RAID10 where one set of devices are removed all at once - these delays can add up to be very inconcenient. As failure is not reversible we can check for that first, setting a separate flag if it is found, and then all synchronize_rcu() once for all the flagged devices. Then ->hot_remove_disk() function can skip the synchronize_rcu() step if the flag is set. fix build error(Shaohua) Signed-off-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/md.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'drivers/md/md.h') diff --git a/drivers/md/md.h b/drivers/md/md.h index 03b19aad4921..dc65ca65b26e 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -163,6 +163,11 @@ enum flag_bits { * than other devices in the array */ ClusterRemove, + RemoveSynchronized, /* synchronize_rcu() was called after + * this device was known to be faulty, + * so it is safe to remove without + * another synchronize_rcu() call. + */ }; static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors, -- cgit v1.2.2 From 0e3ef49eda5bae3aa75aa8c0276411bf0f27e03a Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 17 Jun 2016 17:33:10 +0200 Subject: md: use seconds granularity for error logging The md code stores the exact time of the last error in the last_read_error variable using a timespec structure. It only ever uses the seconds portion of that though, so we can use a scalar for it. There won't be an overflow in 2038 here, because it already used monotonic time and 32-bit is enough for that, but I've decided to use time64_t for consistency in the conversion. Signed-off-by: Arnd Bergmann Signed-off-by: Shaohua Li --- drivers/md/md.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/md/md.h') diff --git a/drivers/md/md.h b/drivers/md/md.h index dc65ca65b26e..fd56cfd8c368 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -99,7 +99,7 @@ struct md_rdev { atomic_t read_errors; /* number of consecutive read errors that * we have tried to ignore. */ - struct timespec last_read_error; /* monotonic time since our + time64_t last_read_error; /* monotonic time since our * last read error */ atomic_t corrected_errors; /* number of corrected read errors, -- cgit v1.2.2