diff options
author | David Teigland <teigland@redhat.com> | 2012-06-25 14:48:05 -0400 |
---|---|---|
committer | David Teigland <teigland@redhat.com> | 2012-07-16 15:24:43 -0400 |
commit | 96006ea6d4eea73466e90ef353bf34e507724e77 (patch) | |
tree | ba6d5498e805d0042fbbe1a8e0326d8c0d16bceb /fs/dlm | |
parent | c503a62103c46d56447f56306b52be6f844689ba (diff) |
dlm: fix missing dir remove
I don't know exactly how, but in some cases, a dir
record is not removed, or a new one is created when
it shouldn't be. The result is that the dir node
lookup returns a master node where the rsb does not
exist. In this case, the master node will repeatedly
return -EBADR for requests, and the lock requests will
be stuck.
Until all possible ways for this to happen can be
eliminated, a simple and effective way to recover from
this situation is for the supposed master node to send
a standard remove message to the dir node when it
receives a request for a resource it has no rsb for.
Signed-off-by: David Teigland <teigland@redhat.com>
Diffstat (limited to 'fs/dlm')
-rw-r--r-- | fs/dlm/lock.c | 70 |
1 files changed, 68 insertions, 2 deletions
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index 04e3f15aa0cc..b56950758188 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c | |||
@@ -4000,12 +4000,70 @@ static int validate_message(struct dlm_lkb *lkb, struct dlm_message *ms) | |||
4000 | return error; | 4000 | return error; |
4001 | } | 4001 | } |
4002 | 4002 | ||
4003 | static void send_repeat_remove(struct dlm_ls *ls, char *ms_name, int len) | ||
4004 | { | ||
4005 | char name[DLM_RESNAME_MAXLEN + 1]; | ||
4006 | struct dlm_message *ms; | ||
4007 | struct dlm_mhandle *mh; | ||
4008 | struct dlm_rsb *r; | ||
4009 | uint32_t hash, b; | ||
4010 | int rv, dir_nodeid; | ||
4011 | |||
4012 | memset(name, 0, sizeof(name)); | ||
4013 | memcpy(name, ms_name, len); | ||
4014 | |||
4015 | hash = jhash(name, len, 0); | ||
4016 | b = hash & (ls->ls_rsbtbl_size - 1); | ||
4017 | |||
4018 | dir_nodeid = dlm_hash2nodeid(ls, hash); | ||
4019 | |||
4020 | log_error(ls, "send_repeat_remove dir %d %s", dir_nodeid, name); | ||
4021 | |||
4022 | spin_lock(&ls->ls_rsbtbl[b].lock); | ||
4023 | rv = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].keep, name, len, &r); | ||
4024 | if (!rv) { | ||
4025 | spin_unlock(&ls->ls_rsbtbl[b].lock); | ||
4026 | log_error(ls, "repeat_remove on keep %s", name); | ||
4027 | return; | ||
4028 | } | ||
4029 | |||
4030 | rv = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].toss, name, len, &r); | ||
4031 | if (!rv) { | ||
4032 | spin_unlock(&ls->ls_rsbtbl[b].lock); | ||
4033 | log_error(ls, "repeat_remove on toss %s", name); | ||
4034 | return; | ||
4035 | } | ||
4036 | |||
4037 | /* use ls->remove_name2 to avoid conflict with shrink? */ | ||
4038 | |||
4039 | spin_lock(&ls->ls_remove_spin); | ||
4040 | ls->ls_remove_len = len; | ||
4041 | memcpy(ls->ls_remove_name, name, DLM_RESNAME_MAXLEN); | ||
4042 | spin_unlock(&ls->ls_remove_spin); | ||
4043 | spin_unlock(&ls->ls_rsbtbl[b].lock); | ||
4044 | |||
4045 | rv = _create_message(ls, sizeof(struct dlm_message) + len, | ||
4046 | dir_nodeid, DLM_MSG_REMOVE, &ms, &mh); | ||
4047 | if (rv) | ||
4048 | return; | ||
4049 | |||
4050 | memcpy(ms->m_extra, name, len); | ||
4051 | ms->m_hash = hash; | ||
4052 | |||
4053 | send_message(mh, ms); | ||
4054 | |||
4055 | spin_lock(&ls->ls_remove_spin); | ||
4056 | ls->ls_remove_len = 0; | ||
4057 | memset(ls->ls_remove_name, 0, DLM_RESNAME_MAXLEN); | ||
4058 | spin_unlock(&ls->ls_remove_spin); | ||
4059 | } | ||
4060 | |||
4003 | static int receive_request(struct dlm_ls *ls, struct dlm_message *ms) | 4061 | static int receive_request(struct dlm_ls *ls, struct dlm_message *ms) |
4004 | { | 4062 | { |
4005 | struct dlm_lkb *lkb; | 4063 | struct dlm_lkb *lkb; |
4006 | struct dlm_rsb *r; | 4064 | struct dlm_rsb *r; |
4007 | int from_nodeid; | 4065 | int from_nodeid; |
4008 | int error, namelen; | 4066 | int error, namelen = 0; |
4009 | 4067 | ||
4010 | from_nodeid = ms->m_header.h_nodeid; | 4068 | from_nodeid = ms->m_header.h_nodeid; |
4011 | 4069 | ||
@@ -4073,13 +4131,21 @@ static int receive_request(struct dlm_ls *ls, struct dlm_message *ms) | |||
4073 | delayed in being sent/arriving/being processed on the dir node. | 4131 | delayed in being sent/arriving/being processed on the dir node. |
4074 | Another node would repeatedly lookup up the master, and the dir | 4132 | Another node would repeatedly lookup up the master, and the dir |
4075 | node would continue returning our nodeid until our send_remove | 4133 | node would continue returning our nodeid until our send_remove |
4076 | took effect. */ | 4134 | took effect. |
4135 | |||
4136 | We send another remove message in case our previous send_remove | ||
4137 | was lost/ignored/missed somehow. */ | ||
4077 | 4138 | ||
4078 | if (error != -ENOTBLK) { | 4139 | if (error != -ENOTBLK) { |
4079 | log_limit(ls, "receive_request %x from %d %d", | 4140 | log_limit(ls, "receive_request %x from %d %d", |
4080 | ms->m_lkid, from_nodeid, error); | 4141 | ms->m_lkid, from_nodeid, error); |
4081 | } | 4142 | } |
4082 | 4143 | ||
4144 | if (namelen && error == -EBADR) { | ||
4145 | send_repeat_remove(ls, ms->m_extra, namelen); | ||
4146 | msleep(1000); | ||
4147 | } | ||
4148 | |||
4083 | setup_stub_lkb(ls, ms); | 4149 | setup_stub_lkb(ls, ms); |
4084 | send_request_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error); | 4150 | send_request_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error); |
4085 | return error; | 4151 | return error; |