aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/block
diff options
context:
space:
mode:
author: Philipp Reisner <philipp.reisner@linbit.com> 2014-11-10 11:21:11 -0500
committer: Jens Axboe <axboe@fb.com> 2014-11-10 11:27:35 -0500
commit: a88215312c5ed74697973f6c9f0fce718bcf18ad (patch)
tree: 831cef10aa7728cff1fe109ba5f6ee8dad4e3225 /drivers/block
parent: f221f4bcc5f40e2967e4596ef167bdbc987c8e9d (diff)
drbd: fix race between role change and handshake
Symptoms: If DRBD was "cleanly shut down" (all in sync, both Secondary before disconnect, identical data generation uuids), and then one side was promoted *during* the next connection handshake, the role change could confuse the handshake. The Primary would get stuck in WFBitmapS; the Secondary would log "unexpected cstate (Connected) in receive_bitmap" and get stuck in WFBitmapT. Fix: The test in is_valid_soft_transition was wrong. The previous condition failed to demand a cstate change in one clause. The new test works because the disallowed actions (promote/attach) do not touch the cstate. In order to avoid deadlocks, give up the state_mutex while waiting for the transient state to go away. Conflicts: drbd/drbd_state.c drbd/drbd_state.h drbd/drbd_wrappers.h Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com> Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com> Signed-off-by: Jens Axboe <axboe@fb.com>
Diffstat (limited to 'drivers/block')
-rw-r--r-- drivers/block/drbd/drbd_nl.c | 2
-rw-r--r-- drivers/block/drbd/drbd_state.c | 41
-rw-r--r-- drivers/block/drbd/drbd_state.h | 5
3 files changed, 40 insertions, 8 deletions
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index 4782d074c8cd..74df8cfad414 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -588,7 +588,7 @@ drbd_set_role(struct drbd_device *const device, enum drbd_role new_role, int for
588 val.i = 0; val.role = new_role; 588 val.i = 0; val.role = new_role;
589 589
590 while (try++ < max_tries) { 590 while (try++ < max_tries) {
591 rv = _drbd_request_state(device, mask, val, CS_WAIT_COMPLETE); 591 rv = _drbd_request_state_holding_state_mutex(device, mask, val, CS_WAIT_COMPLETE);
592 592
593 /* in case we first succeeded to outdate, 593 /* in case we first succeeded to outdate,
594 * but now suddenly could establish a connection */ 594 * but now suddenly could establish a connection */
diff --git a/drivers/block/drbd/drbd_state.c b/drivers/block/drbd/drbd_state.c
index 84b11f887d73..4529d9282cef 100644
--- a/drivers/block/drbd/drbd_state.c
+++ b/drivers/block/drbd/drbd_state.c
@@ -215,6 +215,18 @@ static bool no_peer_wf_report_params(struct drbd_connection *connection)
215 return rv; 215 return rv;
216} 216}
217 217
218static void wake_up_all_devices(struct drbd_connection *connection)
219{
220 struct drbd_peer_device *peer_device;
221 int vnr;
222
223 rcu_read_lock();
224 idr_for_each_entry(&connection->peer_devices, peer_device, vnr)
225 wake_up(&peer_device->device->state_wait);
226 rcu_read_unlock();
227
228}
229
218 230
219/** 231/**
220 * cl_wide_st_chg() - true if the state change is a cluster wide one 232 * cl_wide_st_chg() - true if the state change is a cluster wide one
@@ -410,6 +422,22 @@ _drbd_request_state(struct drbd_device *device, union drbd_state mask,
410 return rv; 422 return rv;
411} 423}
412 424
425enum drbd_state_rv
426_drbd_request_state_holding_state_mutex(struct drbd_device *device, union drbd_state mask,
427 union drbd_state val, enum chg_state_flags f)
428{
429 enum drbd_state_rv rv;
430
431 BUG_ON(f & CS_SERIALIZE);
432
433 wait_event_cmd(device->state_wait,
434 (rv = drbd_req_state(device, mask, val, f)) != SS_IN_TRANSIENT_STATE,
435 mutex_unlock(device->state_mutex),
436 mutex_lock(device->state_mutex));
437
438 return rv;
439}
440
413static void print_st(struct drbd_device *device, const char *name, union drbd_state ns) 441static void print_st(struct drbd_device *device, const char *name, union drbd_state ns)
414{ 442{
415 drbd_err(device, " %s = { cs:%s ro:%s/%s ds:%s/%s %c%c%c%c%c%c }\n", 443 drbd_err(device, " %s = { cs:%s ro:%s/%s ds:%s/%s %c%c%c%c%c%c }\n",
@@ -629,14 +657,11 @@ is_valid_soft_transition(union drbd_state os, union drbd_state ns, struct drbd_c
629 if (ns.conn == C_DISCONNECTING && os.conn == C_UNCONNECTED) 657 if (ns.conn == C_DISCONNECTING && os.conn == C_UNCONNECTED)
630 rv = SS_IN_TRANSIENT_STATE; 658 rv = SS_IN_TRANSIENT_STATE;
631 659
632 /* if (ns.conn == os.conn && ns.conn == C_WF_REPORT_PARAMS)
633 rv = SS_IN_TRANSIENT_STATE; */
634
635 /* While establishing a connection only allow cstate to change. 660 /* While establishing a connection only allow cstate to change.
636 Delay/refuse role changes, detach attach etc... */ 661 Delay/refuse role changes, detach attach etc... (they do not touch cstate) */
637 if (test_bit(STATE_SENT, &connection->flags) && 662 if (test_bit(STATE_SENT, &connection->flags) &&
638 !(os.conn == C_WF_REPORT_PARAMS || 663 !((ns.conn == C_WF_REPORT_PARAMS && os.conn == C_WF_CONNECTION) ||
639 (ns.conn == C_WF_REPORT_PARAMS && os.conn == C_WF_CONNECTION))) 664 (ns.conn >= C_CONNECTED && os.conn == C_WF_REPORT_PARAMS)))
640 rv = SS_IN_TRANSIENT_STATE; 665 rv = SS_IN_TRANSIENT_STATE;
641 666
642 if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED) 667 if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED)
@@ -1032,8 +1057,10 @@ __drbd_set_state(struct drbd_device *device, union drbd_state ns,
1032 1057
1033 /* Wake up role changes, that were delayed because of connection establishing */ 1058 /* Wake up role changes, that were delayed because of connection establishing */
1034 if (os.conn == C_WF_REPORT_PARAMS && ns.conn != C_WF_REPORT_PARAMS && 1059 if (os.conn == C_WF_REPORT_PARAMS && ns.conn != C_WF_REPORT_PARAMS &&
1035 no_peer_wf_report_params(connection)) 1060 no_peer_wf_report_params(connection)) {
1036 clear_bit(STATE_SENT, &connection->flags); 1061 clear_bit(STATE_SENT, &connection->flags);
1062 wake_up_all_devices(connection);
1063 }
1037 1064
1038 wake_up(&device->misc_wait); 1065 wake_up(&device->misc_wait);
1039 wake_up(&device->state_wait); 1066 wake_up(&device->state_wait);
diff --git a/drivers/block/drbd/drbd_state.h b/drivers/block/drbd/drbd_state.h
index cc41605ba21c..7f53c40823cd 100644
--- a/drivers/block/drbd/drbd_state.h
+++ b/drivers/block/drbd/drbd_state.h
@@ -117,6 +117,11 @@ extern enum drbd_state_rv _drbd_request_state(struct drbd_device *,
117 union drbd_state, 117 union drbd_state,
118 union drbd_state, 118 union drbd_state,
119 enum chg_state_flags); 119 enum chg_state_flags);
120
121extern enum drbd_state_rv
122_drbd_request_state_holding_state_mutex(struct drbd_device *, union drbd_state,
123 union drbd_state, enum chg_state_flags);
124
120extern enum drbd_state_rv __drbd_set_state(struct drbd_device *, union drbd_state, 125extern enum drbd_state_rv __drbd_set_state(struct drbd_device *, union drbd_state,
121 enum chg_state_flags, 126 enum chg_state_flags,
122 struct completion *done); 127 struct completion *done);