aboutsummaryrefslogtreecommitdiffstats
path: root/fs/dlm
diff options
context:
space:
mode:
authorDavid Teigland <teigland@redhat.com>2012-06-25 14:48:05 -0400
committerDavid Teigland <teigland@redhat.com>2012-07-16 15:24:43 -0400
commit96006ea6d4eea73466e90ef353bf34e507724e77 (patch)
treeba6d5498e805d0042fbbe1a8e0326d8c0d16bceb /fs/dlm
parentc503a62103c46d56447f56306b52be6f844689ba (diff)
dlm: fix missing dir remove
I don't know exactly how, but in some cases, a dir record is not removed, or a new one is created when it shouldn't be. The result is that the dir node lookup returns a master node where the rsb does not exist. In this case, the master node will repeatedly return -EBADR for requests, and the lock requests will be stuck. Until all possible ways for this to happen can be eliminated, a simple and effective way to recover from this situation is for the supposed master node to send a standard remove message to the dir node when it receives a request for a resource it has no rsb for. Signed-off-by: David Teigland <teigland@redhat.com>
Diffstat (limited to 'fs/dlm')
-rw-r--r--fs/dlm/lock.c70
1 files changed, 68 insertions, 2 deletions
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 04e3f15aa0cc..b56950758188 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -4000,12 +4000,70 @@ static int validate_message(struct dlm_lkb *lkb, struct dlm_message *ms)
4000 return error; 4000 return error;
4001} 4001}
4002 4002
4003static void send_repeat_remove(struct dlm_ls *ls, char *ms_name, int len)
4004{
4005 char name[DLM_RESNAME_MAXLEN + 1];
4006 struct dlm_message *ms;
4007 struct dlm_mhandle *mh;
4008 struct dlm_rsb *r;
4009 uint32_t hash, b;
4010 int rv, dir_nodeid;
4011
4012 memset(name, 0, sizeof(name));
4013 memcpy(name, ms_name, len);
4014
4015 hash = jhash(name, len, 0);
4016 b = hash & (ls->ls_rsbtbl_size - 1);
4017
4018 dir_nodeid = dlm_hash2nodeid(ls, hash);
4019
4020 log_error(ls, "send_repeat_remove dir %d %s", dir_nodeid, name);
4021
4022 spin_lock(&ls->ls_rsbtbl[b].lock);
4023 rv = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].keep, name, len, &r);
4024 if (!rv) {
4025 spin_unlock(&ls->ls_rsbtbl[b].lock);
4026 log_error(ls, "repeat_remove on keep %s", name);
4027 return;
4028 }
4029
4030 rv = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].toss, name, len, &r);
4031 if (!rv) {
4032 spin_unlock(&ls->ls_rsbtbl[b].lock);
4033 log_error(ls, "repeat_remove on toss %s", name);
4034 return;
4035 }
4036
4037 /* use ls->remove_name2 to avoid conflict with shrink? */
4038
4039 spin_lock(&ls->ls_remove_spin);
4040 ls->ls_remove_len = len;
4041 memcpy(ls->ls_remove_name, name, DLM_RESNAME_MAXLEN);
4042 spin_unlock(&ls->ls_remove_spin);
4043 spin_unlock(&ls->ls_rsbtbl[b].lock);
4044
4045 rv = _create_message(ls, sizeof(struct dlm_message) + len,
4046 dir_nodeid, DLM_MSG_REMOVE, &ms, &mh);
4047 if (rv)
4048 return;
4049
4050 memcpy(ms->m_extra, name, len);
4051 ms->m_hash = hash;
4052
4053 send_message(mh, ms);
4054
4055 spin_lock(&ls->ls_remove_spin);
4056 ls->ls_remove_len = 0;
4057 memset(ls->ls_remove_name, 0, DLM_RESNAME_MAXLEN);
4058 spin_unlock(&ls->ls_remove_spin);
4059}
4060
4003static int receive_request(struct dlm_ls *ls, struct dlm_message *ms) 4061static int receive_request(struct dlm_ls *ls, struct dlm_message *ms)
4004{ 4062{
4005 struct dlm_lkb *lkb; 4063 struct dlm_lkb *lkb;
4006 struct dlm_rsb *r; 4064 struct dlm_rsb *r;
4007 int from_nodeid; 4065 int from_nodeid;
4008 int error, namelen; 4066 int error, namelen = 0;
4009 4067
4010 from_nodeid = ms->m_header.h_nodeid; 4068 from_nodeid = ms->m_header.h_nodeid;
4011 4069
@@ -4073,13 +4131,21 @@ static int receive_request(struct dlm_ls *ls, struct dlm_message *ms)
4073 delayed in being sent/arriving/being processed on the dir node. 4131 delayed in being sent/arriving/being processed on the dir node.
4074 Another node would repeatedly lookup up the master, and the dir 4132 Another node would repeatedly lookup up the master, and the dir
4075 node would continue returning our nodeid until our send_remove 4133 node would continue returning our nodeid until our send_remove
4076 took effect. */ 4134 took effect.
4135
4136 We send another remove message in case our previous send_remove
4137 was lost/ignored/missed somehow. */
4077 4138
4078 if (error != -ENOTBLK) { 4139 if (error != -ENOTBLK) {
4079 log_limit(ls, "receive_request %x from %d %d", 4140 log_limit(ls, "receive_request %x from %d %d",
4080 ms->m_lkid, from_nodeid, error); 4141 ms->m_lkid, from_nodeid, error);
4081 } 4142 }
4082 4143
4144 if (namelen && error == -EBADR) {
4145 send_repeat_remove(ls, ms->m_extra, namelen);
4146 msleep(1000);
4147 }
4148
4083 setup_stub_lkb(ls, ms); 4149 setup_stub_lkb(ls, ms);
4084 send_request_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error); 4150 send_request_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
4085 return error; 4151 return error;