diff options
author | Lars Ellenberg <lars.ellenberg@linbit.com> | 2010-10-16 06:13:47 -0400 |
---|---|---|
committer | Philipp Reisner <philipp.reisner@linbit.com> | 2010-10-22 09:46:11 -0400 |
commit | 82f59cc6353889b426cf13b6596d5a3d100fa09e (patch) | |
tree | 6d5a678516334f0a37a56a509b84322a0352719b /drivers/block/drbd/drbd_int.h | |
parent | 3beec1d446fba335f07787636920892dd3b2c658 (diff) |
drbd: fix potential deadlock on detach
If we have contention in drbd_al_begin_io (heavy random IO),
an administrative request to detach the disk may deadlock
for similar reasons as the recently fixed deadlock if detaching
because of IO-error.
The approach taken here is to either go through the intermediate
cleanup state D_FAILED, or first lock out application io,
rather than going directly to D_DISKLESS.
We need an additional state bit (WAS_IO_ERROR) to distinguish
the -> D_FAILED because of IO-error from other failures.
Sanitize D_ATTACHING -> D_FAILED to D_ATTACHING -> D_DISKLESS.
If only attaching, ldev may be missing still, but would be referenced
from within the after_state_ch for -> D_FAILED, potentially
dereferencing a NULL pointer.
Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
Diffstat (limited to 'drivers/block/drbd/drbd_int.h')
-rw-r--r-- | drivers/block/drbd/drbd_int.h | 20 |
1 files changed, 15 insertions, 5 deletions
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index e0e0bf6f16a1..03c15e317c37 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h | |||
@@ -852,7 +852,8 @@ enum { | |||
852 | BITMAP_IO, /* suspend application io; | 852 | BITMAP_IO, /* suspend application io; |
853 | once no more io in flight, start bitmap io */ | 853 | once no more io in flight, start bitmap io */ |
854 | BITMAP_IO_QUEUED, /* Started bitmap IO */ | 854 | BITMAP_IO_QUEUED, /* Started bitmap IO */ |
855 | GO_DISKLESS, /* Disk failed, local_cnt reached zero, we are going diskless */ | 855 | GO_DISKLESS, /* Disk is being detached, on io-error or admin request. */ |
856 | WAS_IO_ERROR, /* Local disk failed returned IO error */ | ||
856 | RESYNC_AFTER_NEG, /* Resync after online grow after the attach&negotiate finished. */ | 857 | RESYNC_AFTER_NEG, /* Resync after online grow after the attach&negotiate finished. */ |
857 | NET_CONGESTED, /* The data socket is congested */ | 858 | NET_CONGESTED, /* The data socket is congested */ |
858 | 859 | ||
@@ -1281,6 +1282,7 @@ extern int drbd_bmio_set_n_write(struct drbd_conf *mdev); | |||
1281 | extern int drbd_bmio_clear_n_write(struct drbd_conf *mdev); | 1282 | extern int drbd_bmio_clear_n_write(struct drbd_conf *mdev); |
1282 | extern int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *), char *why); | 1283 | extern int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *), char *why); |
1283 | extern void drbd_go_diskless(struct drbd_conf *mdev); | 1284 | extern void drbd_go_diskless(struct drbd_conf *mdev); |
1285 | extern void drbd_ldev_destroy(struct drbd_conf *mdev); | ||
1284 | 1286 | ||
1285 | 1287 | ||
1286 | /* Meta data layout | 1288 | /* Meta data layout |
@@ -1798,17 +1800,17 @@ static inline void __drbd_chk_io_error_(struct drbd_conf *mdev, int forcedetach, | |||
1798 | case EP_PASS_ON: | 1800 | case EP_PASS_ON: |
1799 | if (!forcedetach) { | 1801 | if (!forcedetach) { |
1800 | if (__ratelimit(&drbd_ratelimit_state)) | 1802 | if (__ratelimit(&drbd_ratelimit_state)) |
1801 | dev_err(DEV, "Local IO failed in %s." | 1803 | dev_err(DEV, "Local IO failed in %s.\n", where); |
1802 | "Passing error on...\n", where); | ||
1803 | break; | 1804 | break; |
1804 | } | 1805 | } |
1805 | /* NOTE fall through to detach case if forcedetach set */ | 1806 | /* NOTE fall through to detach case if forcedetach set */ |
1806 | case EP_DETACH: | 1807 | case EP_DETACH: |
1807 | case EP_CALL_HELPER: | 1808 | case EP_CALL_HELPER: |
1809 | set_bit(WAS_IO_ERROR, &mdev->flags); | ||
1808 | if (mdev->state.disk > D_FAILED) { | 1810 | if (mdev->state.disk > D_FAILED) { |
1809 | _drbd_set_state(_NS(mdev, disk, D_FAILED), CS_HARD, NULL); | 1811 | _drbd_set_state(_NS(mdev, disk, D_FAILED), CS_HARD, NULL); |
1810 | dev_err(DEV, "Local IO failed in %s." | 1812 | dev_err(DEV, |
1811 | "Detaching...\n", where); | 1813 | "Local IO failed in %s. Detaching...\n", where); |
1812 | } | 1814 | } |
1813 | break; | 1815 | break; |
1814 | } | 1816 | } |
@@ -2127,7 +2129,11 @@ static inline void put_ldev(struct drbd_conf *mdev) | |||
2127 | __release(local); | 2129 | __release(local); |
2128 | D_ASSERT(i >= 0); | 2130 | D_ASSERT(i >= 0); |
2129 | if (i == 0) { | 2131 | if (i == 0) { |
2132 | if (mdev->state.disk == D_DISKLESS) | ||
2133 | /* even internal references gone, safe to destroy */ | ||
2134 | drbd_ldev_destroy(mdev); | ||
2130 | if (mdev->state.disk == D_FAILED) | 2135 | if (mdev->state.disk == D_FAILED) |
2136 | /* all application IO references gone. */ | ||
2131 | drbd_go_diskless(mdev); | 2137 | drbd_go_diskless(mdev); |
2132 | wake_up(&mdev->misc_wait); | 2138 | wake_up(&mdev->misc_wait); |
2133 | } | 2139 | } |
@@ -2138,6 +2144,10 @@ static inline int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_stat | |||
2138 | { | 2144 | { |
2139 | int io_allowed; | 2145 | int io_allowed; |
2140 | 2146 | ||
2147 | /* never get a reference while D_DISKLESS */ | ||
2148 | if (mdev->state.disk == D_DISKLESS) | ||
2149 | return 0; | ||
2150 | |||
2141 | atomic_inc(&mdev->local_cnt); | 2151 | atomic_inc(&mdev->local_cnt); |
2142 | io_allowed = (mdev->state.disk >= mins); | 2152 | io_allowed = (mdev->state.disk >= mins); |
2143 | if (!io_allowed) | 2153 | if (!io_allowed) |