aboutsummaryrefslogtreecommitdiffstats
path: root/fs/dlm
diff options
context:
space:
mode:
Diffstat (limited to 'fs/dlm')
-rw-r--r--fs/dlm/Kconfig21
-rw-r--r--fs/dlm/Makefile19
-rw-r--r--fs/dlm/ast.c173
-rw-r--r--fs/dlm/ast.h26
-rw-r--r--fs/dlm/config.c789
-rw-r--r--fs/dlm/config.h42
-rw-r--r--fs/dlm/debug_fs.c387
-rw-r--r--fs/dlm/dir.c423
-rw-r--r--fs/dlm/dir.h30
-rw-r--r--fs/dlm/dlm_internal.h543
-rw-r--r--fs/dlm/lock.c3871
-rw-r--r--fs/dlm/lock.h62
-rw-r--r--fs/dlm/lockspace.c717
-rw-r--r--fs/dlm/lockspace.h25
-rw-r--r--fs/dlm/lowcomms.c1238
-rw-r--r--fs/dlm/lowcomms.h26
-rw-r--r--fs/dlm/lvb_table.h18
-rw-r--r--fs/dlm/main.c97
-rw-r--r--fs/dlm/member.c327
-rw-r--r--fs/dlm/member.h24
-rw-r--r--fs/dlm/memory.c116
-rw-r--r--fs/dlm/memory.h29
-rw-r--r--fs/dlm/midcomms.c140
-rw-r--r--fs/dlm/midcomms.h21
-rw-r--r--fs/dlm/rcom.c472
-rw-r--r--fs/dlm/rcom.h24
-rw-r--r--fs/dlm/recover.c765
-rw-r--r--fs/dlm/recover.h34
-rw-r--r--fs/dlm/recoverd.c290
-rw-r--r--fs/dlm/recoverd.h24
-rw-r--r--fs/dlm/requestqueue.c184
-rw-r--r--fs/dlm/requestqueue.h22
-rw-r--r--fs/dlm/user.c788
-rw-r--r--fs/dlm/user.h16
-rw-r--r--fs/dlm/util.c161
-rw-r--r--fs/dlm/util.h22
36 files changed, 11966 insertions, 0 deletions
diff --git a/fs/dlm/Kconfig b/fs/dlm/Kconfig
new file mode 100644
index 000000000000..490f85b3fa59
--- /dev/null
+++ b/fs/dlm/Kconfig
@@ -0,0 +1,21 @@
1menu "Distributed Lock Manager"
2 depends on INET && EXPERIMENTAL
3
4config DLM
5 tristate "Distributed Lock Manager (DLM)"
6 depends on IPV6 || IPV6=n
7 depends on IP_SCTP
8 select CONFIGFS_FS
9 help
10 A general purpose distributed lock manager for kernel or userspace
11 applications.
12
13config DLM_DEBUG
14 bool "DLM debugging"
15 depends on DLM
16 help
17 Under the debugfs mount point, the name of each lockspace will
18 appear as a file in the "dlm" directory. The output is the
19 list of resource and locks the local node knows about.
20
21endmenu
diff --git a/fs/dlm/Makefile b/fs/dlm/Makefile
new file mode 100644
index 000000000000..1832e0297f7d
--- /dev/null
+++ b/fs/dlm/Makefile
@@ -0,0 +1,19 @@
1obj-$(CONFIG_DLM) += dlm.o
2dlm-y := ast.o \
3 config.o \
4 dir.o \
5 lock.o \
6 lockspace.o \
7 lowcomms.o \
8 main.o \
9 member.o \
10 memory.o \
11 midcomms.o \
12 rcom.o \
13 recover.o \
14 recoverd.o \
15 requestqueue.o \
16 user.o \
17 util.o
18dlm-$(CONFIG_DLM_DEBUG) += debug_fs.o
19
diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c
new file mode 100644
index 000000000000..f91d39cb1e0b
--- /dev/null
+++ b/fs/dlm/ast.c
@@ -0,0 +1,173 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#include "dlm_internal.h"
15#include "lock.h"
16#include "user.h"
17
18#define WAKE_ASTS 0
19
20static struct list_head ast_queue;
21static spinlock_t ast_queue_lock;
22static struct task_struct * astd_task;
23static unsigned long astd_wakeflags;
24static struct mutex astd_running;
25
26
27void dlm_del_ast(struct dlm_lkb *lkb)
28{
29 spin_lock(&ast_queue_lock);
30 if (lkb->lkb_ast_type & (AST_COMP | AST_BAST))
31 list_del(&lkb->lkb_astqueue);
32 spin_unlock(&ast_queue_lock);
33}
34
35void dlm_add_ast(struct dlm_lkb *lkb, int type)
36{
37 if (lkb->lkb_flags & DLM_IFL_USER) {
38 dlm_user_add_ast(lkb, type);
39 return;
40 }
41 DLM_ASSERT(lkb->lkb_astaddr != DLM_FAKE_USER_AST, dlm_print_lkb(lkb););
42
43 spin_lock(&ast_queue_lock);
44 if (!(lkb->lkb_ast_type & (AST_COMP | AST_BAST))) {
45 kref_get(&lkb->lkb_ref);
46 list_add_tail(&lkb->lkb_astqueue, &ast_queue);
47 }
48 lkb->lkb_ast_type |= type;
49 spin_unlock(&ast_queue_lock);
50
51 set_bit(WAKE_ASTS, &astd_wakeflags);
52 wake_up_process(astd_task);
53}
54
55static void process_asts(void)
56{
57 struct dlm_ls *ls = NULL;
58 struct dlm_rsb *r = NULL;
59 struct dlm_lkb *lkb;
60 void (*cast) (long param);
61 void (*bast) (long param, int mode);
62 int type = 0, found, bmode;
63
64 for (;;) {
65 found = 0;
66 spin_lock(&ast_queue_lock);
67 list_for_each_entry(lkb, &ast_queue, lkb_astqueue) {
68 r = lkb->lkb_resource;
69 ls = r->res_ls;
70
71 if (dlm_locking_stopped(ls))
72 continue;
73
74 list_del(&lkb->lkb_astqueue);
75 type = lkb->lkb_ast_type;
76 lkb->lkb_ast_type = 0;
77 found = 1;
78 break;
79 }
80 spin_unlock(&ast_queue_lock);
81
82 if (!found)
83 break;
84
85 cast = lkb->lkb_astaddr;
86 bast = lkb->lkb_bastaddr;
87 bmode = lkb->lkb_bastmode;
88
89 if ((type & AST_COMP) && cast)
90 cast(lkb->lkb_astparam);
91
92 /* FIXME: Is it safe to look at lkb_grmode here
93 without doing a lock_rsb() ?
94 Look at other checks in v1 to avoid basts. */
95
96 if ((type & AST_BAST) && bast)
97 if (!dlm_modes_compat(lkb->lkb_grmode, bmode))
98 bast(lkb->lkb_astparam, bmode);
99
100 /* this removes the reference added by dlm_add_ast
101 and may result in the lkb being freed */
102 dlm_put_lkb(lkb);
103
104 schedule();
105 }
106}
107
108static inline int no_asts(void)
109{
110 int ret;
111
112 spin_lock(&ast_queue_lock);
113 ret = list_empty(&ast_queue);
114 spin_unlock(&ast_queue_lock);
115 return ret;
116}
117
118static int dlm_astd(void *data)
119{
120 while (!kthread_should_stop()) {
121 set_current_state(TASK_INTERRUPTIBLE);
122 if (!test_bit(WAKE_ASTS, &astd_wakeflags))
123 schedule();
124 set_current_state(TASK_RUNNING);
125
126 mutex_lock(&astd_running);
127 if (test_and_clear_bit(WAKE_ASTS, &astd_wakeflags))
128 process_asts();
129 mutex_unlock(&astd_running);
130 }
131 return 0;
132}
133
134void dlm_astd_wake(void)
135{
136 if (!no_asts()) {
137 set_bit(WAKE_ASTS, &astd_wakeflags);
138 wake_up_process(astd_task);
139 }
140}
141
142int dlm_astd_start(void)
143{
144 struct task_struct *p;
145 int error = 0;
146
147 INIT_LIST_HEAD(&ast_queue);
148 spin_lock_init(&ast_queue_lock);
149 mutex_init(&astd_running);
150
151 p = kthread_run(dlm_astd, NULL, "dlm_astd");
152 if (IS_ERR(p))
153 error = PTR_ERR(p);
154 else
155 astd_task = p;
156 return error;
157}
158
159void dlm_astd_stop(void)
160{
161 kthread_stop(astd_task);
162}
163
164void dlm_astd_suspend(void)
165{
166 mutex_lock(&astd_running);
167}
168
169void dlm_astd_resume(void)
170{
171 mutex_unlock(&astd_running);
172}
173
diff --git a/fs/dlm/ast.h b/fs/dlm/ast.h
new file mode 100644
index 000000000000..6ee276c74c52
--- /dev/null
+++ b/fs/dlm/ast.h
@@ -0,0 +1,26 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13#ifndef __ASTD_DOT_H__
14#define __ASTD_DOT_H__
15
16void dlm_add_ast(struct dlm_lkb *lkb, int type);
17void dlm_del_ast(struct dlm_lkb *lkb);
18
19void dlm_astd_wake(void);
20int dlm_astd_start(void);
21void dlm_astd_stop(void);
22void dlm_astd_suspend(void);
23void dlm_astd_resume(void);
24
25#endif
26
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
new file mode 100644
index 000000000000..88553054bbfa
--- /dev/null
+++ b/fs/dlm/config.c
@@ -0,0 +1,789 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#include <linux/kernel.h>
15#include <linux/module.h>
16#include <linux/configfs.h>
17#include <net/sock.h>
18
19#include "config.h"
20#include "lowcomms.h"
21
22/*
23 * /config/dlm/<cluster>/spaces/<space>/nodes/<node>/nodeid
24 * /config/dlm/<cluster>/spaces/<space>/nodes/<node>/weight
25 * /config/dlm/<cluster>/comms/<comm>/nodeid
26 * /config/dlm/<cluster>/comms/<comm>/local
27 * /config/dlm/<cluster>/comms/<comm>/addr
28 * The <cluster> level is useless, but I haven't figured out how to avoid it.
29 */
30
31static struct config_group *space_list;
32static struct config_group *comm_list;
33static struct comm *local_comm;
34
35struct clusters;
36struct cluster;
37struct spaces;
38struct space;
39struct comms;
40struct comm;
41struct nodes;
42struct node;
43
44static struct config_group *make_cluster(struct config_group *, const char *);
45static void drop_cluster(struct config_group *, struct config_item *);
46static void release_cluster(struct config_item *);
47static struct config_group *make_space(struct config_group *, const char *);
48static void drop_space(struct config_group *, struct config_item *);
49static void release_space(struct config_item *);
50static struct config_item *make_comm(struct config_group *, const char *);
51static void drop_comm(struct config_group *, struct config_item *);
52static void release_comm(struct config_item *);
53static struct config_item *make_node(struct config_group *, const char *);
54static void drop_node(struct config_group *, struct config_item *);
55static void release_node(struct config_item *);
56
57static ssize_t show_comm(struct config_item *i, struct configfs_attribute *a,
58 char *buf);
59static ssize_t store_comm(struct config_item *i, struct configfs_attribute *a,
60 const char *buf, size_t len);
61static ssize_t show_node(struct config_item *i, struct configfs_attribute *a,
62 char *buf);
63static ssize_t store_node(struct config_item *i, struct configfs_attribute *a,
64 const char *buf, size_t len);
65
66static ssize_t comm_nodeid_read(struct comm *cm, char *buf);
67static ssize_t comm_nodeid_write(struct comm *cm, const char *buf, size_t len);
68static ssize_t comm_local_read(struct comm *cm, char *buf);
69static ssize_t comm_local_write(struct comm *cm, const char *buf, size_t len);
70static ssize_t comm_addr_write(struct comm *cm, const char *buf, size_t len);
71static ssize_t node_nodeid_read(struct node *nd, char *buf);
72static ssize_t node_nodeid_write(struct node *nd, const char *buf, size_t len);
73static ssize_t node_weight_read(struct node *nd, char *buf);
74static ssize_t node_weight_write(struct node *nd, const char *buf, size_t len);
75
76enum {
77 COMM_ATTR_NODEID = 0,
78 COMM_ATTR_LOCAL,
79 COMM_ATTR_ADDR,
80};
81
82struct comm_attribute {
83 struct configfs_attribute attr;
84 ssize_t (*show)(struct comm *, char *);
85 ssize_t (*store)(struct comm *, const char *, size_t);
86};
87
88static struct comm_attribute comm_attr_nodeid = {
89 .attr = { .ca_owner = THIS_MODULE,
90 .ca_name = "nodeid",
91 .ca_mode = S_IRUGO | S_IWUSR },
92 .show = comm_nodeid_read,
93 .store = comm_nodeid_write,
94};
95
96static struct comm_attribute comm_attr_local = {
97 .attr = { .ca_owner = THIS_MODULE,
98 .ca_name = "local",
99 .ca_mode = S_IRUGO | S_IWUSR },
100 .show = comm_local_read,
101 .store = comm_local_write,
102};
103
104static struct comm_attribute comm_attr_addr = {
105 .attr = { .ca_owner = THIS_MODULE,
106 .ca_name = "addr",
107 .ca_mode = S_IRUGO | S_IWUSR },
108 .store = comm_addr_write,
109};
110
111static struct configfs_attribute *comm_attrs[] = {
112 [COMM_ATTR_NODEID] = &comm_attr_nodeid.attr,
113 [COMM_ATTR_LOCAL] = &comm_attr_local.attr,
114 [COMM_ATTR_ADDR] = &comm_attr_addr.attr,
115 NULL,
116};
117
118enum {
119 NODE_ATTR_NODEID = 0,
120 NODE_ATTR_WEIGHT,
121};
122
123struct node_attribute {
124 struct configfs_attribute attr;
125 ssize_t (*show)(struct node *, char *);
126 ssize_t (*store)(struct node *, const char *, size_t);
127};
128
129static struct node_attribute node_attr_nodeid = {
130 .attr = { .ca_owner = THIS_MODULE,
131 .ca_name = "nodeid",
132 .ca_mode = S_IRUGO | S_IWUSR },
133 .show = node_nodeid_read,
134 .store = node_nodeid_write,
135};
136
137static struct node_attribute node_attr_weight = {
138 .attr = { .ca_owner = THIS_MODULE,
139 .ca_name = "weight",
140 .ca_mode = S_IRUGO | S_IWUSR },
141 .show = node_weight_read,
142 .store = node_weight_write,
143};
144
145static struct configfs_attribute *node_attrs[] = {
146 [NODE_ATTR_NODEID] = &node_attr_nodeid.attr,
147 [NODE_ATTR_WEIGHT] = &node_attr_weight.attr,
148 NULL,
149};
150
151struct clusters {
152 struct configfs_subsystem subsys;
153};
154
155struct cluster {
156 struct config_group group;
157};
158
159struct spaces {
160 struct config_group ss_group;
161};
162
163struct space {
164 struct config_group group;
165 struct list_head members;
166 struct mutex members_lock;
167 int members_count;
168};
169
170struct comms {
171 struct config_group cs_group;
172};
173
174struct comm {
175 struct config_item item;
176 int nodeid;
177 int local;
178 int addr_count;
179 struct sockaddr_storage *addr[DLM_MAX_ADDR_COUNT];
180};
181
182struct nodes {
183 struct config_group ns_group;
184};
185
186struct node {
187 struct config_item item;
188 struct list_head list; /* space->members */
189 int nodeid;
190 int weight;
191};
192
193static struct configfs_group_operations clusters_ops = {
194 .make_group = make_cluster,
195 .drop_item = drop_cluster,
196};
197
198static struct configfs_item_operations cluster_ops = {
199 .release = release_cluster,
200};
201
202static struct configfs_group_operations spaces_ops = {
203 .make_group = make_space,
204 .drop_item = drop_space,
205};
206
207static struct configfs_item_operations space_ops = {
208 .release = release_space,
209};
210
211static struct configfs_group_operations comms_ops = {
212 .make_item = make_comm,
213 .drop_item = drop_comm,
214};
215
216static struct configfs_item_operations comm_ops = {
217 .release = release_comm,
218 .show_attribute = show_comm,
219 .store_attribute = store_comm,
220};
221
222static struct configfs_group_operations nodes_ops = {
223 .make_item = make_node,
224 .drop_item = drop_node,
225};
226
227static struct configfs_item_operations node_ops = {
228 .release = release_node,
229 .show_attribute = show_node,
230 .store_attribute = store_node,
231};
232
233static struct config_item_type clusters_type = {
234 .ct_group_ops = &clusters_ops,
235 .ct_owner = THIS_MODULE,
236};
237
238static struct config_item_type cluster_type = {
239 .ct_item_ops = &cluster_ops,
240 .ct_owner = THIS_MODULE,
241};
242
243static struct config_item_type spaces_type = {
244 .ct_group_ops = &spaces_ops,
245 .ct_owner = THIS_MODULE,
246};
247
248static struct config_item_type space_type = {
249 .ct_item_ops = &space_ops,
250 .ct_owner = THIS_MODULE,
251};
252
253static struct config_item_type comms_type = {
254 .ct_group_ops = &comms_ops,
255 .ct_owner = THIS_MODULE,
256};
257
258static struct config_item_type comm_type = {
259 .ct_item_ops = &comm_ops,
260 .ct_attrs = comm_attrs,
261 .ct_owner = THIS_MODULE,
262};
263
264static struct config_item_type nodes_type = {
265 .ct_group_ops = &nodes_ops,
266 .ct_owner = THIS_MODULE,
267};
268
269static struct config_item_type node_type = {
270 .ct_item_ops = &node_ops,
271 .ct_attrs = node_attrs,
272 .ct_owner = THIS_MODULE,
273};
274
275static struct cluster *to_cluster(struct config_item *i)
276{
277 return i ? container_of(to_config_group(i), struct cluster, group):NULL;
278}
279
280static struct space *to_space(struct config_item *i)
281{
282 return i ? container_of(to_config_group(i), struct space, group) : NULL;
283}
284
285static struct comm *to_comm(struct config_item *i)
286{
287 return i ? container_of(i, struct comm, item) : NULL;
288}
289
290static struct node *to_node(struct config_item *i)
291{
292 return i ? container_of(i, struct node, item) : NULL;
293}
294
295static struct config_group *make_cluster(struct config_group *g,
296 const char *name)
297{
298 struct cluster *cl = NULL;
299 struct spaces *sps = NULL;
300 struct comms *cms = NULL;
301 void *gps = NULL;
302
303 cl = kzalloc(sizeof(struct cluster), GFP_KERNEL);
304 gps = kcalloc(3, sizeof(struct config_group *), GFP_KERNEL);
305 sps = kzalloc(sizeof(struct spaces), GFP_KERNEL);
306 cms = kzalloc(sizeof(struct comms), GFP_KERNEL);
307
308 if (!cl || !gps || !sps || !cms)
309 goto fail;
310
311 config_group_init_type_name(&cl->group, name, &cluster_type);
312 config_group_init_type_name(&sps->ss_group, "spaces", &spaces_type);
313 config_group_init_type_name(&cms->cs_group, "comms", &comms_type);
314
315 cl->group.default_groups = gps;
316 cl->group.default_groups[0] = &sps->ss_group;
317 cl->group.default_groups[1] = &cms->cs_group;
318 cl->group.default_groups[2] = NULL;
319
320 space_list = &sps->ss_group;
321 comm_list = &cms->cs_group;
322 return &cl->group;
323
324 fail:
325 kfree(cl);
326 kfree(gps);
327 kfree(sps);
328 kfree(cms);
329 return NULL;
330}
331
332static void drop_cluster(struct config_group *g, struct config_item *i)
333{
334 struct cluster *cl = to_cluster(i);
335 struct config_item *tmp;
336 int j;
337
338 for (j = 0; cl->group.default_groups[j]; j++) {
339 tmp = &cl->group.default_groups[j]->cg_item;
340 cl->group.default_groups[j] = NULL;
341 config_item_put(tmp);
342 }
343
344 space_list = NULL;
345 comm_list = NULL;
346
347 config_item_put(i);
348}
349
350static void release_cluster(struct config_item *i)
351{
352 struct cluster *cl = to_cluster(i);
353 kfree(cl->group.default_groups);
354 kfree(cl);
355}
356
357static struct config_group *make_space(struct config_group *g, const char *name)
358{
359 struct space *sp = NULL;
360 struct nodes *nds = NULL;
361 void *gps = NULL;
362
363 sp = kzalloc(sizeof(struct space), GFP_KERNEL);
364 gps = kcalloc(2, sizeof(struct config_group *), GFP_KERNEL);
365 nds = kzalloc(sizeof(struct nodes), GFP_KERNEL);
366
367 if (!sp || !gps || !nds)
368 goto fail;
369
370 config_group_init_type_name(&sp->group, name, &space_type);
371 config_group_init_type_name(&nds->ns_group, "nodes", &nodes_type);
372
373 sp->group.default_groups = gps;
374 sp->group.default_groups[0] = &nds->ns_group;
375 sp->group.default_groups[1] = NULL;
376
377 INIT_LIST_HEAD(&sp->members);
378 mutex_init(&sp->members_lock);
379 sp->members_count = 0;
380 return &sp->group;
381
382 fail:
383 kfree(sp);
384 kfree(gps);
385 kfree(nds);
386 return NULL;
387}
388
389static void drop_space(struct config_group *g, struct config_item *i)
390{
391 struct space *sp = to_space(i);
392 struct config_item *tmp;
393 int j;
394
395 /* assert list_empty(&sp->members) */
396
397 for (j = 0; sp->group.default_groups[j]; j++) {
398 tmp = &sp->group.default_groups[j]->cg_item;
399 sp->group.default_groups[j] = NULL;
400 config_item_put(tmp);
401 }
402
403 config_item_put(i);
404}
405
406static void release_space(struct config_item *i)
407{
408 struct space *sp = to_space(i);
409 kfree(sp->group.default_groups);
410 kfree(sp);
411}
412
413static struct config_item *make_comm(struct config_group *g, const char *name)
414{
415 struct comm *cm;
416
417 cm = kzalloc(sizeof(struct comm), GFP_KERNEL);
418 if (!cm)
419 return NULL;
420
421 config_item_init_type_name(&cm->item, name, &comm_type);
422 cm->nodeid = -1;
423 cm->local = 0;
424 cm->addr_count = 0;
425 return &cm->item;
426}
427
428static void drop_comm(struct config_group *g, struct config_item *i)
429{
430 struct comm *cm = to_comm(i);
431 if (local_comm == cm)
432 local_comm = NULL;
433 dlm_lowcomms_close(cm->nodeid);
434 while (cm->addr_count--)
435 kfree(cm->addr[cm->addr_count]);
436 config_item_put(i);
437}
438
439static void release_comm(struct config_item *i)
440{
441 struct comm *cm = to_comm(i);
442 kfree(cm);
443}
444
445static struct config_item *make_node(struct config_group *g, const char *name)
446{
447 struct space *sp = to_space(g->cg_item.ci_parent);
448 struct node *nd;
449
450 nd = kzalloc(sizeof(struct node), GFP_KERNEL);
451 if (!nd)
452 return NULL;
453
454 config_item_init_type_name(&nd->item, name, &node_type);
455 nd->nodeid = -1;
456 nd->weight = 1; /* default weight of 1 if none is set */
457
458 mutex_lock(&sp->members_lock);
459 list_add(&nd->list, &sp->members);
460 sp->members_count++;
461 mutex_unlock(&sp->members_lock);
462
463 return &nd->item;
464}
465
466static void drop_node(struct config_group *g, struct config_item *i)
467{
468 struct space *sp = to_space(g->cg_item.ci_parent);
469 struct node *nd = to_node(i);
470
471 mutex_lock(&sp->members_lock);
472 list_del(&nd->list);
473 sp->members_count--;
474 mutex_unlock(&sp->members_lock);
475
476 config_item_put(i);
477}
478
479static void release_node(struct config_item *i)
480{
481 struct node *nd = to_node(i);
482 kfree(nd);
483}
484
485static struct clusters clusters_root = {
486 .subsys = {
487 .su_group = {
488 .cg_item = {
489 .ci_namebuf = "dlm",
490 .ci_type = &clusters_type,
491 },
492 },
493 },
494};
495
496int dlm_config_init(void)
497{
498 config_group_init(&clusters_root.subsys.su_group);
499 init_MUTEX(&clusters_root.subsys.su_sem);
500 return configfs_register_subsystem(&clusters_root.subsys);
501}
502
503void dlm_config_exit(void)
504{
505 configfs_unregister_subsystem(&clusters_root.subsys);
506}
507
508/*
509 * Functions for user space to read/write attributes
510 */
511
512static ssize_t show_comm(struct config_item *i, struct configfs_attribute *a,
513 char *buf)
514{
515 struct comm *cm = to_comm(i);
516 struct comm_attribute *cma =
517 container_of(a, struct comm_attribute, attr);
518 return cma->show ? cma->show(cm, buf) : 0;
519}
520
521static ssize_t store_comm(struct config_item *i, struct configfs_attribute *a,
522 const char *buf, size_t len)
523{
524 struct comm *cm = to_comm(i);
525 struct comm_attribute *cma =
526 container_of(a, struct comm_attribute, attr);
527 return cma->store ? cma->store(cm, buf, len) : -EINVAL;
528}
529
530static ssize_t comm_nodeid_read(struct comm *cm, char *buf)
531{
532 return sprintf(buf, "%d\n", cm->nodeid);
533}
534
535static ssize_t comm_nodeid_write(struct comm *cm, const char *buf, size_t len)
536{
537 cm->nodeid = simple_strtol(buf, NULL, 0);
538 return len;
539}
540
541static ssize_t comm_local_read(struct comm *cm, char *buf)
542{
543 return sprintf(buf, "%d\n", cm->local);
544}
545
546static ssize_t comm_local_write(struct comm *cm, const char *buf, size_t len)
547{
548 cm->local= simple_strtol(buf, NULL, 0);
549 if (cm->local && !local_comm)
550 local_comm = cm;
551 return len;
552}
553
554static ssize_t comm_addr_write(struct comm *cm, const char *buf, size_t len)
555{
556 struct sockaddr_storage *addr;
557
558 if (len != sizeof(struct sockaddr_storage))
559 return -EINVAL;
560
561 if (cm->addr_count >= DLM_MAX_ADDR_COUNT)
562 return -ENOSPC;
563
564 addr = kzalloc(sizeof(*addr), GFP_KERNEL);
565 if (!addr)
566 return -ENOMEM;
567
568 memcpy(addr, buf, len);
569 cm->addr[cm->addr_count++] = addr;
570 return len;
571}
572
573static ssize_t show_node(struct config_item *i, struct configfs_attribute *a,
574 char *buf)
575{
576 struct node *nd = to_node(i);
577 struct node_attribute *nda =
578 container_of(a, struct node_attribute, attr);
579 return nda->show ? nda->show(nd, buf) : 0;
580}
581
582static ssize_t store_node(struct config_item *i, struct configfs_attribute *a,
583 const char *buf, size_t len)
584{
585 struct node *nd = to_node(i);
586 struct node_attribute *nda =
587 container_of(a, struct node_attribute, attr);
588 return nda->store ? nda->store(nd, buf, len) : -EINVAL;
589}
590
591static ssize_t node_nodeid_read(struct node *nd, char *buf)
592{
593 return sprintf(buf, "%d\n", nd->nodeid);
594}
595
596static ssize_t node_nodeid_write(struct node *nd, const char *buf, size_t len)
597{
598 nd->nodeid = simple_strtol(buf, NULL, 0);
599 return len;
600}
601
602static ssize_t node_weight_read(struct node *nd, char *buf)
603{
604 return sprintf(buf, "%d\n", nd->weight);
605}
606
607static ssize_t node_weight_write(struct node *nd, const char *buf, size_t len)
608{
609 nd->weight = simple_strtol(buf, NULL, 0);
610 return len;
611}
612
613/*
614 * Functions for the dlm to get the info that's been configured
615 */
616
617static struct space *get_space(char *name)
618{
619 if (!space_list)
620 return NULL;
621 return to_space(config_group_find_obj(space_list, name));
622}
623
624static void put_space(struct space *sp)
625{
626 config_item_put(&sp->group.cg_item);
627}
628
629static struct comm *get_comm(int nodeid, struct sockaddr_storage *addr)
630{
631 struct config_item *i;
632 struct comm *cm = NULL;
633 int found = 0;
634
635 if (!comm_list)
636 return NULL;
637
638 down(&clusters_root.subsys.su_sem);
639
640 list_for_each_entry(i, &comm_list->cg_children, ci_entry) {
641 cm = to_comm(i);
642
643 if (nodeid) {
644 if (cm->nodeid != nodeid)
645 continue;
646 found = 1;
647 break;
648 } else {
649 if (!cm->addr_count ||
650 memcmp(cm->addr[0], addr, sizeof(*addr)))
651 continue;
652 found = 1;
653 break;
654 }
655 }
656 up(&clusters_root.subsys.su_sem);
657
658 if (found)
659 config_item_get(i);
660 else
661 cm = NULL;
662 return cm;
663}
664
665static void put_comm(struct comm *cm)
666{
667 config_item_put(&cm->item);
668}
669
670/* caller must free mem */
671int dlm_nodeid_list(char *lsname, int **ids_out)
672{
673 struct space *sp;
674 struct node *nd;
675 int i = 0, rv = 0;
676 int *ids;
677
678 sp = get_space(lsname);
679 if (!sp)
680 return -EEXIST;
681
682 mutex_lock(&sp->members_lock);
683 if (!sp->members_count) {
684 rv = 0;
685 goto out;
686 }
687
688 ids = kcalloc(sp->members_count, sizeof(int), GFP_KERNEL);
689 if (!ids) {
690 rv = -ENOMEM;
691 goto out;
692 }
693
694 rv = sp->members_count;
695 list_for_each_entry(nd, &sp->members, list)
696 ids[i++] = nd->nodeid;
697
698 if (rv != i)
699 printk("bad nodeid count %d %d\n", rv, i);
700
701 *ids_out = ids;
702 out:
703 mutex_unlock(&sp->members_lock);
704 put_space(sp);
705 return rv;
706}
707
708int dlm_node_weight(char *lsname, int nodeid)
709{
710 struct space *sp;
711 struct node *nd;
712 int w = -EEXIST;
713
714 sp = get_space(lsname);
715 if (!sp)
716 goto out;
717
718 mutex_lock(&sp->members_lock);
719 list_for_each_entry(nd, &sp->members, list) {
720 if (nd->nodeid != nodeid)
721 continue;
722 w = nd->weight;
723 break;
724 }
725 mutex_unlock(&sp->members_lock);
726 put_space(sp);
727 out:
728 return w;
729}
730
731int dlm_nodeid_to_addr(int nodeid, struct sockaddr_storage *addr)
732{
733 struct comm *cm = get_comm(nodeid, NULL);
734 if (!cm)
735 return -EEXIST;
736 if (!cm->addr_count)
737 return -ENOENT;
738 memcpy(addr, cm->addr[0], sizeof(*addr));
739 put_comm(cm);
740 return 0;
741}
742
743int dlm_addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid)
744{
745 struct comm *cm = get_comm(0, addr);
746 if (!cm)
747 return -EEXIST;
748 *nodeid = cm->nodeid;
749 put_comm(cm);
750 return 0;
751}
752
753int dlm_our_nodeid(void)
754{
755 return local_comm ? local_comm->nodeid : 0;
756}
757
758/* num 0 is first addr, num 1 is second addr */
759int dlm_our_addr(struct sockaddr_storage *addr, int num)
760{
761 if (!local_comm)
762 return -1;
763 if (num + 1 > local_comm->addr_count)
764 return -1;
765 memcpy(addr, local_comm->addr[num], sizeof(*addr));
766 return 0;
767}
768
769/* Config file defaults */
770#define DEFAULT_TCP_PORT 21064
771#define DEFAULT_BUFFER_SIZE 4096
772#define DEFAULT_RSBTBL_SIZE 256
773#define DEFAULT_LKBTBL_SIZE 1024
774#define DEFAULT_DIRTBL_SIZE 512
775#define DEFAULT_RECOVER_TIMER 5
776#define DEFAULT_TOSS_SECS 10
777#define DEFAULT_SCAN_SECS 5
778
779struct dlm_config_info dlm_config = {
780 .tcp_port = DEFAULT_TCP_PORT,
781 .buffer_size = DEFAULT_BUFFER_SIZE,
782 .rsbtbl_size = DEFAULT_RSBTBL_SIZE,
783 .lkbtbl_size = DEFAULT_LKBTBL_SIZE,
784 .dirtbl_size = DEFAULT_DIRTBL_SIZE,
785 .recover_timer = DEFAULT_RECOVER_TIMER,
786 .toss_secs = DEFAULT_TOSS_SECS,
787 .scan_secs = DEFAULT_SCAN_SECS
788};
789
diff --git a/fs/dlm/config.h b/fs/dlm/config.h
new file mode 100644
index 000000000000..9da7839958a9
--- /dev/null
+++ b/fs/dlm/config.h
@@ -0,0 +1,42 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#ifndef __CONFIG_DOT_H__
15#define __CONFIG_DOT_H__
16
17#define DLM_MAX_ADDR_COUNT 3
18
19struct dlm_config_info {
20 int tcp_port;
21 int buffer_size;
22 int rsbtbl_size;
23 int lkbtbl_size;
24 int dirtbl_size;
25 int recover_timer;
26 int toss_secs;
27 int scan_secs;
28};
29
30extern struct dlm_config_info dlm_config;
31
32int dlm_config_init(void);
33void dlm_config_exit(void);
34int dlm_node_weight(char *lsname, int nodeid);
35int dlm_nodeid_list(char *lsname, int **ids_out);
36int dlm_nodeid_to_addr(int nodeid, struct sockaddr_storage *addr);
37int dlm_addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid);
38int dlm_our_nodeid(void);
39int dlm_our_addr(struct sockaddr_storage *addr, int num);
40
41#endif /* __CONFIG_DOT_H__ */
42
diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c
new file mode 100644
index 000000000000..ca94a837a5bb
--- /dev/null
+++ b/fs/dlm/debug_fs.c
@@ -0,0 +1,387 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13#include <linux/pagemap.h>
14#include <linux/seq_file.h>
15#include <linux/module.h>
16#include <linux/ctype.h>
17#include <linux/debugfs.h>
18
19#include "dlm_internal.h"
20
21#define DLM_DEBUG_BUF_LEN 4096
22static char debug_buf[DLM_DEBUG_BUF_LEN];
23static struct mutex debug_buf_lock;
24
25static struct dentry *dlm_root;
26
27struct rsb_iter {
28 int entry;
29 struct dlm_ls *ls;
30 struct list_head *next;
31 struct dlm_rsb *rsb;
32};
33
34/*
35 * dump all rsb's in the lockspace hash table
36 */
37
38static char *print_lockmode(int mode)
39{
40 switch (mode) {
41 case DLM_LOCK_IV:
42 return "--";
43 case DLM_LOCK_NL:
44 return "NL";
45 case DLM_LOCK_CR:
46 return "CR";
47 case DLM_LOCK_CW:
48 return "CW";
49 case DLM_LOCK_PR:
50 return "PR";
51 case DLM_LOCK_PW:
52 return "PW";
53 case DLM_LOCK_EX:
54 return "EX";
55 default:
56 return "??";
57 }
58}
59
60static void print_lock(struct seq_file *s, struct dlm_lkb *lkb,
61 struct dlm_rsb *res)
62{
63 seq_printf(s, "%08x %s", lkb->lkb_id, print_lockmode(lkb->lkb_grmode));
64
65 if (lkb->lkb_status == DLM_LKSTS_CONVERT
66 || lkb->lkb_status == DLM_LKSTS_WAITING)
67 seq_printf(s, " (%s)", print_lockmode(lkb->lkb_rqmode));
68
69 if (lkb->lkb_nodeid) {
70 if (lkb->lkb_nodeid != res->res_nodeid)
71 seq_printf(s, " Remote: %3d %08x", lkb->lkb_nodeid,
72 lkb->lkb_remid);
73 else
74 seq_printf(s, " Master: %08x", lkb->lkb_remid);
75 }
76
77 if (lkb->lkb_wait_type)
78 seq_printf(s, " wait_type: %d", lkb->lkb_wait_type);
79
80 seq_printf(s, "\n");
81}
82
83static int print_resource(struct dlm_rsb *res, struct seq_file *s)
84{
85 struct dlm_lkb *lkb;
86 int i, lvblen = res->res_ls->ls_lvblen, recover_list, root_list;
87
88 seq_printf(s, "\nResource %p Name (len=%d) \"", res, res->res_length);
89 for (i = 0; i < res->res_length; i++) {
90 if (isprint(res->res_name[i]))
91 seq_printf(s, "%c", res->res_name[i]);
92 else
93 seq_printf(s, "%c", '.');
94 }
95 if (res->res_nodeid > 0)
96 seq_printf(s, "\" \nLocal Copy, Master is node %d\n",
97 res->res_nodeid);
98 else if (res->res_nodeid == 0)
99 seq_printf(s, "\" \nMaster Copy\n");
100 else if (res->res_nodeid == -1)
101 seq_printf(s, "\" \nLooking up master (lkid %x)\n",
102 res->res_first_lkid);
103 else
104 seq_printf(s, "\" \nInvalid master %d\n", res->res_nodeid);
105
106 /* Print the LVB: */
107 if (res->res_lvbptr) {
108 seq_printf(s, "LVB: ");
109 for (i = 0; i < lvblen; i++) {
110 if (i == lvblen / 2)
111 seq_printf(s, "\n ");
112 seq_printf(s, "%02x ",
113 (unsigned char) res->res_lvbptr[i]);
114 }
115 if (rsb_flag(res, RSB_VALNOTVALID))
116 seq_printf(s, " (INVALID)");
117 seq_printf(s, "\n");
118 }
119
120 root_list = !list_empty(&res->res_root_list);
121 recover_list = !list_empty(&res->res_recover_list);
122
123 if (root_list || recover_list) {
124 seq_printf(s, "Recovery: root %d recover %d flags %lx "
125 "count %d\n", root_list, recover_list,
126 res->res_flags, res->res_recover_locks_count);
127 }
128
129 /* Print the locks attached to this resource */
130 seq_printf(s, "Granted Queue\n");
131 list_for_each_entry(lkb, &res->res_grantqueue, lkb_statequeue)
132 print_lock(s, lkb, res);
133
134 seq_printf(s, "Conversion Queue\n");
135 list_for_each_entry(lkb, &res->res_convertqueue, lkb_statequeue)
136 print_lock(s, lkb, res);
137
138 seq_printf(s, "Waiting Queue\n");
139 list_for_each_entry(lkb, &res->res_waitqueue, lkb_statequeue)
140 print_lock(s, lkb, res);
141
142 if (list_empty(&res->res_lookup))
143 goto out;
144
145 seq_printf(s, "Lookup Queue\n");
146 list_for_each_entry(lkb, &res->res_lookup, lkb_rsb_lookup) {
147 seq_printf(s, "%08x %s", lkb->lkb_id,
148 print_lockmode(lkb->lkb_rqmode));
149 if (lkb->lkb_wait_type)
150 seq_printf(s, " wait_type: %d", lkb->lkb_wait_type);
151 seq_printf(s, "\n");
152 }
153 out:
154 return 0;
155}
156
157static int rsb_iter_next(struct rsb_iter *ri)
158{
159 struct dlm_ls *ls = ri->ls;
160 int i;
161
162 if (!ri->next) {
163 top:
164 /* Find the next non-empty hash bucket */
165 for (i = ri->entry; i < ls->ls_rsbtbl_size; i++) {
166 read_lock(&ls->ls_rsbtbl[i].lock);
167 if (!list_empty(&ls->ls_rsbtbl[i].list)) {
168 ri->next = ls->ls_rsbtbl[i].list.next;
169 read_unlock(&ls->ls_rsbtbl[i].lock);
170 break;
171 }
172 read_unlock(&ls->ls_rsbtbl[i].lock);
173 }
174 ri->entry = i;
175
176 if (ri->entry >= ls->ls_rsbtbl_size)
177 return 1;
178 } else {
179 i = ri->entry;
180 read_lock(&ls->ls_rsbtbl[i].lock);
181 ri->next = ri->next->next;
182 if (ri->next->next == ls->ls_rsbtbl[i].list.next) {
183 /* End of list - move to next bucket */
184 ri->next = NULL;
185 ri->entry++;
186 read_unlock(&ls->ls_rsbtbl[i].lock);
187 goto top;
188 }
189 read_unlock(&ls->ls_rsbtbl[i].lock);
190 }
191 ri->rsb = list_entry(ri->next, struct dlm_rsb, res_hashchain);
192
193 return 0;
194}
195
196static void rsb_iter_free(struct rsb_iter *ri)
197{
198 kfree(ri);
199}
200
201static struct rsb_iter *rsb_iter_init(struct dlm_ls *ls)
202{
203 struct rsb_iter *ri;
204
205 ri = kmalloc(sizeof *ri, GFP_KERNEL);
206 if (!ri)
207 return NULL;
208
209 ri->ls = ls;
210 ri->entry = 0;
211 ri->next = NULL;
212
213 if (rsb_iter_next(ri)) {
214 rsb_iter_free(ri);
215 return NULL;
216 }
217
218 return ri;
219}
220
221static void *rsb_seq_start(struct seq_file *file, loff_t *pos)
222{
223 struct rsb_iter *ri;
224 loff_t n = *pos;
225
226 ri = rsb_iter_init(file->private);
227 if (!ri)
228 return NULL;
229
230 while (n--) {
231 if (rsb_iter_next(ri)) {
232 rsb_iter_free(ri);
233 return NULL;
234 }
235 }
236
237 return ri;
238}
239
240static void *rsb_seq_next(struct seq_file *file, void *iter_ptr, loff_t *pos)
241{
242 struct rsb_iter *ri = iter_ptr;
243
244 (*pos)++;
245
246 if (rsb_iter_next(ri)) {
247 rsb_iter_free(ri);
248 return NULL;
249 }
250
251 return ri;
252}
253
254static void rsb_seq_stop(struct seq_file *file, void *iter_ptr)
255{
256 /* nothing for now */
257}
258
259static int rsb_seq_show(struct seq_file *file, void *iter_ptr)
260{
261 struct rsb_iter *ri = iter_ptr;
262
263 print_resource(ri->rsb, file);
264
265 return 0;
266}
267
268static struct seq_operations rsb_seq_ops = {
269 .start = rsb_seq_start,
270 .next = rsb_seq_next,
271 .stop = rsb_seq_stop,
272 .show = rsb_seq_show,
273};
274
275static int rsb_open(struct inode *inode, struct file *file)
276{
277 struct seq_file *seq;
278 int ret;
279
280 ret = seq_open(file, &rsb_seq_ops);
281 if (ret)
282 return ret;
283
284 seq = file->private_data;
285 seq->private = inode->i_private;
286
287 return 0;
288}
289
290static struct file_operations rsb_fops = {
291 .owner = THIS_MODULE,
292 .open = rsb_open,
293 .read = seq_read,
294 .llseek = seq_lseek,
295 .release = seq_release
296};
297
298/*
299 * dump lkb's on the ls_waiters list
300 */
301
302static int waiters_open(struct inode *inode, struct file *file)
303{
304 file->private_data = inode->i_private;
305 return 0;
306}
307
308static ssize_t waiters_read(struct file *file, char __user *userbuf,
309 size_t count, loff_t *ppos)
310{
311 struct dlm_ls *ls = file->private_data;
312 struct dlm_lkb *lkb;
313 size_t len = DLM_DEBUG_BUF_LEN, pos = 0, ret, rv;
314
315 mutex_lock(&debug_buf_lock);
316 mutex_lock(&ls->ls_waiters_mutex);
317 memset(debug_buf, 0, sizeof(debug_buf));
318
319 list_for_each_entry(lkb, &ls->ls_waiters, lkb_wait_reply) {
320 ret = snprintf(debug_buf + pos, len - pos, "%x %d %d %s\n",
321 lkb->lkb_id, lkb->lkb_wait_type,
322 lkb->lkb_nodeid, lkb->lkb_resource->res_name);
323 if (ret >= len - pos)
324 break;
325 pos += ret;
326 }
327 mutex_unlock(&ls->ls_waiters_mutex);
328
329 rv = simple_read_from_buffer(userbuf, count, ppos, debug_buf, pos);
330 mutex_unlock(&debug_buf_lock);
331 return rv;
332}
333
334static struct file_operations waiters_fops = {
335 .owner = THIS_MODULE,
336 .open = waiters_open,
337 .read = waiters_read
338};
339
340int dlm_create_debug_file(struct dlm_ls *ls)
341{
342 char name[DLM_LOCKSPACE_LEN+8];
343
344 ls->ls_debug_rsb_dentry = debugfs_create_file(ls->ls_name,
345 S_IFREG | S_IRUGO,
346 dlm_root,
347 ls,
348 &rsb_fops);
349 if (!ls->ls_debug_rsb_dentry)
350 return -ENOMEM;
351
352 memset(name, 0, sizeof(name));
353 snprintf(name, DLM_LOCKSPACE_LEN+8, "%s_waiters", ls->ls_name);
354
355 ls->ls_debug_waiters_dentry = debugfs_create_file(name,
356 S_IFREG | S_IRUGO,
357 dlm_root,
358 ls,
359 &waiters_fops);
360 if (!ls->ls_debug_waiters_dentry) {
361 debugfs_remove(ls->ls_debug_rsb_dentry);
362 return -ENOMEM;
363 }
364
365 return 0;
366}
367
368void dlm_delete_debug_file(struct dlm_ls *ls)
369{
370 if (ls->ls_debug_rsb_dentry)
371 debugfs_remove(ls->ls_debug_rsb_dentry);
372 if (ls->ls_debug_waiters_dentry)
373 debugfs_remove(ls->ls_debug_waiters_dentry);
374}
375
376int dlm_register_debugfs(void)
377{
378 mutex_init(&debug_buf_lock);
379 dlm_root = debugfs_create_dir("dlm", NULL);
380 return dlm_root ? 0 : -ENOMEM;
381}
382
383void dlm_unregister_debugfs(void)
384{
385 debugfs_remove(dlm_root);
386}
387
diff --git a/fs/dlm/dir.c b/fs/dlm/dir.c
new file mode 100644
index 000000000000..46754553fdcc
--- /dev/null
+++ b/fs/dlm/dir.c
@@ -0,0 +1,423 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#include "dlm_internal.h"
15#include "lockspace.h"
16#include "member.h"
17#include "lowcomms.h"
18#include "rcom.h"
19#include "config.h"
20#include "memory.h"
21#include "recover.h"
22#include "util.h"
23#include "lock.h"
24#include "dir.h"
25
26
27static void put_free_de(struct dlm_ls *ls, struct dlm_direntry *de)
28{
29 spin_lock(&ls->ls_recover_list_lock);
30 list_add(&de->list, &ls->ls_recover_list);
31 spin_unlock(&ls->ls_recover_list_lock);
32}
33
34static struct dlm_direntry *get_free_de(struct dlm_ls *ls, int len)
35{
36 int found = 0;
37 struct dlm_direntry *de;
38
39 spin_lock(&ls->ls_recover_list_lock);
40 list_for_each_entry(de, &ls->ls_recover_list, list) {
41 if (de->length == len) {
42 list_del(&de->list);
43 de->master_nodeid = 0;
44 memset(de->name, 0, len);
45 found = 1;
46 break;
47 }
48 }
49 spin_unlock(&ls->ls_recover_list_lock);
50
51 if (!found)
52 de = allocate_direntry(ls, len);
53 return de;
54}
55
56void dlm_clear_free_entries(struct dlm_ls *ls)
57{
58 struct dlm_direntry *de;
59
60 spin_lock(&ls->ls_recover_list_lock);
61 while (!list_empty(&ls->ls_recover_list)) {
62 de = list_entry(ls->ls_recover_list.next, struct dlm_direntry,
63 list);
64 list_del(&de->list);
65 free_direntry(de);
66 }
67 spin_unlock(&ls->ls_recover_list_lock);
68}
69
70/*
71 * We use the upper 16 bits of the hash value to select the directory node.
72 * Low bits are used for distribution of rsb's among hash buckets on each node.
73 *
74 * To give the exact range wanted (0 to num_nodes-1), we apply a modulus of
75 * num_nodes to the hash value. This value in the desired range is used as an
76 * offset into the sorted list of nodeid's to give the particular nodeid.
77 */
78
79int dlm_hash2nodeid(struct dlm_ls *ls, uint32_t hash)
80{
81 struct list_head *tmp;
82 struct dlm_member *memb = NULL;
83 uint32_t node, n = 0;
84 int nodeid;
85
86 if (ls->ls_num_nodes == 1) {
87 nodeid = dlm_our_nodeid();
88 goto out;
89 }
90
91 if (ls->ls_node_array) {
92 node = (hash >> 16) % ls->ls_total_weight;
93 nodeid = ls->ls_node_array[node];
94 goto out;
95 }
96
97 /* make_member_array() failed to kmalloc ls_node_array... */
98
99 node = (hash >> 16) % ls->ls_num_nodes;
100
101 list_for_each(tmp, &ls->ls_nodes) {
102 if (n++ != node)
103 continue;
104 memb = list_entry(tmp, struct dlm_member, list);
105 break;
106 }
107
108 DLM_ASSERT(memb , printk("num_nodes=%u n=%u node=%u\n",
109 ls->ls_num_nodes, n, node););
110 nodeid = memb->nodeid;
111 out:
112 return nodeid;
113}
114
115int dlm_dir_nodeid(struct dlm_rsb *r)
116{
117 return dlm_hash2nodeid(r->res_ls, r->res_hash);
118}
119
120static inline uint32_t dir_hash(struct dlm_ls *ls, char *name, int len)
121{
122 uint32_t val;
123
124 val = jhash(name, len, 0);
125 val &= (ls->ls_dirtbl_size - 1);
126
127 return val;
128}
129
130static void add_entry_to_hash(struct dlm_ls *ls, struct dlm_direntry *de)
131{
132 uint32_t bucket;
133
134 bucket = dir_hash(ls, de->name, de->length);
135 list_add_tail(&de->list, &ls->ls_dirtbl[bucket].list);
136}
137
138static struct dlm_direntry *search_bucket(struct dlm_ls *ls, char *name,
139 int namelen, uint32_t bucket)
140{
141 struct dlm_direntry *de;
142
143 list_for_each_entry(de, &ls->ls_dirtbl[bucket].list, list) {
144 if (de->length == namelen && !memcmp(name, de->name, namelen))
145 goto out;
146 }
147 de = NULL;
148 out:
149 return de;
150}
151
152void dlm_dir_remove_entry(struct dlm_ls *ls, int nodeid, char *name, int namelen)
153{
154 struct dlm_direntry *de;
155 uint32_t bucket;
156
157 bucket = dir_hash(ls, name, namelen);
158
159 write_lock(&ls->ls_dirtbl[bucket].lock);
160
161 de = search_bucket(ls, name, namelen, bucket);
162
163 if (!de) {
164 log_error(ls, "remove fr %u none", nodeid);
165 goto out;
166 }
167
168 if (de->master_nodeid != nodeid) {
169 log_error(ls, "remove fr %u ID %u", nodeid, de->master_nodeid);
170 goto out;
171 }
172
173 list_del(&de->list);
174 free_direntry(de);
175 out:
176 write_unlock(&ls->ls_dirtbl[bucket].lock);
177}
178
179void dlm_dir_clear(struct dlm_ls *ls)
180{
181 struct list_head *head;
182 struct dlm_direntry *de;
183 int i;
184
185 DLM_ASSERT(list_empty(&ls->ls_recover_list), );
186
187 for (i = 0; i < ls->ls_dirtbl_size; i++) {
188 write_lock(&ls->ls_dirtbl[i].lock);
189 head = &ls->ls_dirtbl[i].list;
190 while (!list_empty(head)) {
191 de = list_entry(head->next, struct dlm_direntry, list);
192 list_del(&de->list);
193 put_free_de(ls, de);
194 }
195 write_unlock(&ls->ls_dirtbl[i].lock);
196 }
197}
198
199int dlm_recover_directory(struct dlm_ls *ls)
200{
201 struct dlm_member *memb;
202 struct dlm_direntry *de;
203 char *b, *last_name = NULL;
204 int error = -ENOMEM, last_len, count = 0;
205 uint16_t namelen;
206
207 log_debug(ls, "dlm_recover_directory");
208
209 if (dlm_no_directory(ls))
210 goto out_status;
211
212 dlm_dir_clear(ls);
213
214 last_name = kmalloc(DLM_RESNAME_MAXLEN, GFP_KERNEL);
215 if (!last_name)
216 goto out;
217
218 list_for_each_entry(memb, &ls->ls_nodes, list) {
219 memset(last_name, 0, DLM_RESNAME_MAXLEN);
220 last_len = 0;
221
222 for (;;) {
223 error = dlm_recovery_stopped(ls);
224 if (error)
225 goto out_free;
226
227 error = dlm_rcom_names(ls, memb->nodeid,
228 last_name, last_len);
229 if (error)
230 goto out_free;
231
232 schedule();
233
234 /*
235 * pick namelen/name pairs out of received buffer
236 */
237
238 b = ls->ls_recover_buf + sizeof(struct dlm_rcom);
239
240 for (;;) {
241 memcpy(&namelen, b, sizeof(uint16_t));
242 namelen = be16_to_cpu(namelen);
243 b += sizeof(uint16_t);
244
245 /* namelen of 0xFFFFF marks end of names for
246 this node; namelen of 0 marks end of the
247 buffer */
248
249 if (namelen == 0xFFFF)
250 goto done;
251 if (!namelen)
252 break;
253
254 error = -ENOMEM;
255 de = get_free_de(ls, namelen);
256 if (!de)
257 goto out_free;
258
259 de->master_nodeid = memb->nodeid;
260 de->length = namelen;
261 last_len = namelen;
262 memcpy(de->name, b, namelen);
263 memcpy(last_name, b, namelen);
264 b += namelen;
265
266 add_entry_to_hash(ls, de);
267 count++;
268 }
269 }
270 done:
271 ;
272 }
273
274 out_status:
275 error = 0;
276 dlm_set_recover_status(ls, DLM_RS_DIR);
277 log_debug(ls, "dlm_recover_directory %d entries", count);
278 out_free:
279 kfree(last_name);
280 out:
281 dlm_clear_free_entries(ls);
282 return error;
283}
284
285static int get_entry(struct dlm_ls *ls, int nodeid, char *name,
286 int namelen, int *r_nodeid)
287{
288 struct dlm_direntry *de, *tmp;
289 uint32_t bucket;
290
291 bucket = dir_hash(ls, name, namelen);
292
293 write_lock(&ls->ls_dirtbl[bucket].lock);
294 de = search_bucket(ls, name, namelen, bucket);
295 if (de) {
296 *r_nodeid = de->master_nodeid;
297 write_unlock(&ls->ls_dirtbl[bucket].lock);
298 if (*r_nodeid == nodeid)
299 return -EEXIST;
300 return 0;
301 }
302
303 write_unlock(&ls->ls_dirtbl[bucket].lock);
304
305 de = allocate_direntry(ls, namelen);
306 if (!de)
307 return -ENOMEM;
308
309 de->master_nodeid = nodeid;
310 de->length = namelen;
311 memcpy(de->name, name, namelen);
312
313 write_lock(&ls->ls_dirtbl[bucket].lock);
314 tmp = search_bucket(ls, name, namelen, bucket);
315 if (tmp) {
316 free_direntry(de);
317 de = tmp;
318 } else {
319 list_add_tail(&de->list, &ls->ls_dirtbl[bucket].list);
320 }
321 *r_nodeid = de->master_nodeid;
322 write_unlock(&ls->ls_dirtbl[bucket].lock);
323 return 0;
324}
325
326int dlm_dir_lookup(struct dlm_ls *ls, int nodeid, char *name, int namelen,
327 int *r_nodeid)
328{
329 return get_entry(ls, nodeid, name, namelen, r_nodeid);
330}
331
332/* Copy the names of master rsb's into the buffer provided.
333 Only select names whose dir node is the given nodeid. */
334
335void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen,
336 char *outbuf, int outlen, int nodeid)
337{
338 struct list_head *list;
339 struct dlm_rsb *start_r = NULL, *r = NULL;
340 int offset = 0, start_namelen, error, dir_nodeid;
341 char *start_name;
342 uint16_t be_namelen;
343
344 /*
345 * Find the rsb where we left off (or start again)
346 */
347
348 start_namelen = inlen;
349 start_name = inbuf;
350
351 if (start_namelen > 1) {
352 /*
353 * We could also use a find_rsb_root() function here that
354 * searched the ls_root_list.
355 */
356 error = dlm_find_rsb(ls, start_name, start_namelen, R_MASTER,
357 &start_r);
358 DLM_ASSERT(!error && start_r,
359 printk("error %d\n", error););
360 DLM_ASSERT(!list_empty(&start_r->res_root_list),
361 dlm_print_rsb(start_r););
362 dlm_put_rsb(start_r);
363 }
364
365 /*
366 * Send rsb names for rsb's we're master of and whose directory node
367 * matches the requesting node.
368 */
369
370 down_read(&ls->ls_root_sem);
371 if (start_r)
372 list = start_r->res_root_list.next;
373 else
374 list = ls->ls_root_list.next;
375
376 for (offset = 0; list != &ls->ls_root_list; list = list->next) {
377 r = list_entry(list, struct dlm_rsb, res_root_list);
378 if (r->res_nodeid)
379 continue;
380
381 dir_nodeid = dlm_dir_nodeid(r);
382 if (dir_nodeid != nodeid)
383 continue;
384
385 /*
386 * The block ends when we can't fit the following in the
387 * remaining buffer space:
388 * namelen (uint16_t) +
389 * name (r->res_length) +
390 * end-of-block record 0x0000 (uint16_t)
391 */
392
393 if (offset + sizeof(uint16_t)*2 + r->res_length > outlen) {
394 /* Write end-of-block record */
395 be_namelen = 0;
396 memcpy(outbuf + offset, &be_namelen, sizeof(uint16_t));
397 offset += sizeof(uint16_t);
398 goto out;
399 }
400
401 be_namelen = cpu_to_be16(r->res_length);
402 memcpy(outbuf + offset, &be_namelen, sizeof(uint16_t));
403 offset += sizeof(uint16_t);
404 memcpy(outbuf + offset, r->res_name, r->res_length);
405 offset += r->res_length;
406 }
407
408 /*
409 * If we've reached the end of the list (and there's room) write a
410 * terminating record.
411 */
412
413 if ((list == &ls->ls_root_list) &&
414 (offset + sizeof(uint16_t) <= outlen)) {
415 be_namelen = 0xFFFF;
416 memcpy(outbuf + offset, &be_namelen, sizeof(uint16_t));
417 offset += sizeof(uint16_t);
418 }
419
420 out:
421 up_read(&ls->ls_root_sem);
422}
423
diff --git a/fs/dlm/dir.h b/fs/dlm/dir.h
new file mode 100644
index 000000000000..0b0eb1267b6e
--- /dev/null
+++ b/fs/dlm/dir.h
@@ -0,0 +1,30 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#ifndef __DIR_DOT_H__
15#define __DIR_DOT_H__
16
17
18int dlm_dir_nodeid(struct dlm_rsb *rsb);
19int dlm_hash2nodeid(struct dlm_ls *ls, uint32_t hash);
20void dlm_dir_remove_entry(struct dlm_ls *ls, int nodeid, char *name, int len);
21void dlm_dir_clear(struct dlm_ls *ls);
22void dlm_clear_free_entries(struct dlm_ls *ls);
23int dlm_recover_directory(struct dlm_ls *ls);
24int dlm_dir_lookup(struct dlm_ls *ls, int nodeid, char *name, int namelen,
25 int *r_nodeid);
26void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen,
27 char *outbuf, int outlen, int nodeid);
28
29#endif /* __DIR_DOT_H__ */
30
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
new file mode 100644
index 000000000000..1e5cd67e1b7a
--- /dev/null
+++ b/fs/dlm/dlm_internal.h
@@ -0,0 +1,543 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#ifndef __DLM_INTERNAL_DOT_H__
15#define __DLM_INTERNAL_DOT_H__
16
17/*
18 * This is the main header file to be included in each DLM source file.
19 */
20
21#include <linux/module.h>
22#include <linux/slab.h>
23#include <linux/sched.h>
24#include <linux/types.h>
25#include <linux/ctype.h>
26#include <linux/spinlock.h>
27#include <linux/vmalloc.h>
28#include <linux/list.h>
29#include <linux/errno.h>
30#include <linux/random.h>
31#include <linux/delay.h>
32#include <linux/socket.h>
33#include <linux/kthread.h>
34#include <linux/kobject.h>
35#include <linux/kref.h>
36#include <linux/kernel.h>
37#include <linux/jhash.h>
38#include <linux/miscdevice.h>
39#include <linux/mutex.h>
40#include <asm/semaphore.h>
41#include <asm/uaccess.h>
42
43#include <linux/dlm.h>
44
45#define DLM_LOCKSPACE_LEN 64
46
47/* Size of the temp buffer midcomms allocates on the stack.
48 We try to make this large enough so most messages fit.
49 FIXME: should sctp make this unnecessary? */
50
51#define DLM_INBUF_LEN 148
52
53struct dlm_ls;
54struct dlm_lkb;
55struct dlm_rsb;
56struct dlm_member;
57struct dlm_lkbtable;
58struct dlm_rsbtable;
59struct dlm_dirtable;
60struct dlm_direntry;
61struct dlm_recover;
62struct dlm_header;
63struct dlm_message;
64struct dlm_rcom;
65struct dlm_mhandle;
66
67#define log_print(fmt, args...) \
68 printk(KERN_ERR "dlm: "fmt"\n" , ##args)
69#define log_error(ls, fmt, args...) \
70 printk(KERN_ERR "dlm: %s: " fmt "\n", (ls)->ls_name , ##args)
71
72#define DLM_LOG_DEBUG
73#ifdef DLM_LOG_DEBUG
74#define log_debug(ls, fmt, args...) log_error(ls, fmt, ##args)
75#else
76#define log_debug(ls, fmt, args...)
77#endif
78
79#define DLM_ASSERT(x, do) \
80{ \
81 if (!(x)) \
82 { \
83 printk(KERN_ERR "\nDLM: Assertion failed on line %d of file %s\n" \
84 "DLM: assertion: \"%s\"\n" \
85 "DLM: time = %lu\n", \
86 __LINE__, __FILE__, #x, jiffies); \
87 {do} \
88 printk("\n"); \
89 BUG(); \
90 panic("DLM: Record message above and reboot.\n"); \
91 } \
92}
93
94#define DLM_FAKE_USER_AST ERR_PTR(-EINVAL)
95
96
97struct dlm_direntry {
98 struct list_head list;
99 uint32_t master_nodeid;
100 uint16_t length;
101 char name[1];
102};
103
104struct dlm_dirtable {
105 struct list_head list;
106 rwlock_t lock;
107};
108
109struct dlm_rsbtable {
110 struct list_head list;
111 struct list_head toss;
112 rwlock_t lock;
113};
114
115struct dlm_lkbtable {
116 struct list_head list;
117 rwlock_t lock;
118 uint16_t counter;
119};
120
121/*
122 * Lockspace member (per node in a ls)
123 */
124
125struct dlm_member {
126 struct list_head list;
127 int nodeid;
128 int weight;
129};
130
131/*
132 * Save and manage recovery state for a lockspace.
133 */
134
135struct dlm_recover {
136 struct list_head list;
137 int *nodeids;
138 int node_count;
139 uint64_t seq;
140};
141
142/*
143 * Pass input args to second stage locking function.
144 */
145
146struct dlm_args {
147 uint32_t flags;
148 void *astaddr;
149 long astparam;
150 void *bastaddr;
151 int mode;
152 struct dlm_lksb *lksb;
153};
154
155
156/*
157 * Lock block
158 *
159 * A lock can be one of three types:
160 *
161 * local copy lock is mastered locally
162 * (lkb_nodeid is zero and DLM_LKF_MSTCPY is not set)
163 * process copy lock is mastered on a remote node
164 * (lkb_nodeid is non-zero and DLM_LKF_MSTCPY is not set)
165 * master copy master node's copy of a lock owned by remote node
166 * (lkb_nodeid is non-zero and DLM_LKF_MSTCPY is set)
167 *
168 * lkb_exflags: a copy of the most recent flags arg provided to dlm_lock or
169 * dlm_unlock. The dlm does not modify these or use any private flags in
170 * this field; it only contains DLM_LKF_ flags from dlm.h. These flags
171 * are sent as-is to the remote master when the lock is remote.
172 *
173 * lkb_flags: internal dlm flags (DLM_IFL_ prefix) from dlm_internal.h.
174 * Some internal flags are shared between the master and process nodes;
175 * these shared flags are kept in the lower two bytes. One of these
176 * flags set on the master copy will be propagated to the process copy
177 * and v.v. Other internal flags are private to the master or process
178 * node (e.g. DLM_IFL_MSTCPY). These are kept in the high two bytes.
179 *
180 * lkb_sbflags: status block flags. These flags are copied directly into
181 * the caller's lksb.sb_flags prior to the dlm_lock/dlm_unlock completion
182 * ast. All defined in dlm.h with DLM_SBF_ prefix.
183 *
184 * lkb_status: the lock status indicates which rsb queue the lock is
185 * on, grant, convert, or wait. DLM_LKSTS_ WAITING/GRANTED/CONVERT
186 *
187 * lkb_wait_type: the dlm message type (DLM_MSG_ prefix) for which a
188 * reply is needed. Only set when the lkb is on the lockspace waiters
189 * list awaiting a reply from a remote node.
190 *
191 * lkb_nodeid: when the lkb is a local copy, nodeid is 0; when the lkb
192 * is a master copy, nodeid specifies the remote lock holder, when the
193 * lkb is a process copy, the nodeid specifies the lock master.
194 */
195
196/* lkb_ast_type */
197
198#define AST_COMP 1
199#define AST_BAST 2
200
201/* lkb_status */
202
203#define DLM_LKSTS_WAITING 1
204#define DLM_LKSTS_GRANTED 2
205#define DLM_LKSTS_CONVERT 3
206
207/* lkb_flags */
208
209#define DLM_IFL_MSTCPY 0x00010000
210#define DLM_IFL_RESEND 0x00020000
211#define DLM_IFL_DEAD 0x00040000
212#define DLM_IFL_USER 0x00000001
213#define DLM_IFL_ORPHAN 0x00000002
214
215struct dlm_lkb {
216 struct dlm_rsb *lkb_resource; /* the rsb */
217 struct kref lkb_ref;
218 int lkb_nodeid; /* copied from rsb */
219 int lkb_ownpid; /* pid of lock owner */
220 uint32_t lkb_id; /* our lock ID */
221 uint32_t lkb_remid; /* lock ID on remote partner */
222 uint32_t lkb_exflags; /* external flags from caller */
223 uint32_t lkb_sbflags; /* lksb flags */
224 uint32_t lkb_flags; /* internal flags */
225 uint32_t lkb_lvbseq; /* lvb sequence number */
226
227 int8_t lkb_status; /* granted, waiting, convert */
228 int8_t lkb_rqmode; /* requested lock mode */
229 int8_t lkb_grmode; /* granted lock mode */
230 int8_t lkb_bastmode; /* requested mode */
231 int8_t lkb_highbast; /* highest mode bast sent for */
232
233 int8_t lkb_wait_type; /* type of reply waiting for */
234 int8_t lkb_ast_type; /* type of ast queued for */
235
236 struct list_head lkb_idtbl_list; /* lockspace lkbtbl */
237 struct list_head lkb_statequeue; /* rsb g/c/w list */
238 struct list_head lkb_rsb_lookup; /* waiting for rsb lookup */
239 struct list_head lkb_wait_reply; /* waiting for remote reply */
240 struct list_head lkb_astqueue; /* need ast to be sent */
241 struct list_head lkb_ownqueue; /* list of locks for a process */
242
243 char *lkb_lvbptr;
244 struct dlm_lksb *lkb_lksb; /* caller's status block */
245 void *lkb_astaddr; /* caller's ast function */
246 void *lkb_bastaddr; /* caller's bast function */
247 long lkb_astparam; /* caller's ast arg */
248};
249
250
251struct dlm_rsb {
252 struct dlm_ls *res_ls; /* the lockspace */
253 struct kref res_ref;
254 struct mutex res_mutex;
255 unsigned long res_flags;
256 int res_length; /* length of rsb name */
257 int res_nodeid;
258 uint32_t res_lvbseq;
259 uint32_t res_hash;
260 uint32_t res_bucket; /* rsbtbl */
261 unsigned long res_toss_time;
262 uint32_t res_first_lkid;
263 struct list_head res_lookup; /* lkbs waiting on first */
264 struct list_head res_hashchain; /* rsbtbl */
265 struct list_head res_grantqueue;
266 struct list_head res_convertqueue;
267 struct list_head res_waitqueue;
268
269 struct list_head res_root_list; /* used for recovery */
270 struct list_head res_recover_list; /* used for recovery */
271 int res_recover_locks_count;
272
273 char *res_lvbptr;
274 char res_name[1];
275};
276
277/* find_rsb() flags */
278
279#define R_MASTER 1 /* only return rsb if it's a master */
280#define R_CREATE 2 /* create/add rsb if not found */
281
282/* rsb_flags */
283
284enum rsb_flags {
285 RSB_MASTER_UNCERTAIN,
286 RSB_VALNOTVALID,
287 RSB_VALNOTVALID_PREV,
288 RSB_NEW_MASTER,
289 RSB_NEW_MASTER2,
290 RSB_RECOVER_CONVERT,
291 RSB_LOCKS_PURGED,
292};
293
294static inline void rsb_set_flag(struct dlm_rsb *r, enum rsb_flags flag)
295{
296 __set_bit(flag, &r->res_flags);
297}
298
299static inline void rsb_clear_flag(struct dlm_rsb *r, enum rsb_flags flag)
300{
301 __clear_bit(flag, &r->res_flags);
302}
303
304static inline int rsb_flag(struct dlm_rsb *r, enum rsb_flags flag)
305{
306 return test_bit(flag, &r->res_flags);
307}
308
309
310/* dlm_header is first element of all structs sent between nodes */
311
312#define DLM_HEADER_MAJOR 0x00020000
313#define DLM_HEADER_MINOR 0x00000001
314
315#define DLM_MSG 1
316#define DLM_RCOM 2
317
318struct dlm_header {
319 uint32_t h_version;
320 uint32_t h_lockspace;
321 uint32_t h_nodeid; /* nodeid of sender */
322 uint16_t h_length;
323 uint8_t h_cmd; /* DLM_MSG, DLM_RCOM */
324 uint8_t h_pad;
325};
326
327
328#define DLM_MSG_REQUEST 1
329#define DLM_MSG_CONVERT 2
330#define DLM_MSG_UNLOCK 3
331#define DLM_MSG_CANCEL 4
332#define DLM_MSG_REQUEST_REPLY 5
333#define DLM_MSG_CONVERT_REPLY 6
334#define DLM_MSG_UNLOCK_REPLY 7
335#define DLM_MSG_CANCEL_REPLY 8
336#define DLM_MSG_GRANT 9
337#define DLM_MSG_BAST 10
338#define DLM_MSG_LOOKUP 11
339#define DLM_MSG_REMOVE 12
340#define DLM_MSG_LOOKUP_REPLY 13
341
342struct dlm_message {
343 struct dlm_header m_header;
344 uint32_t m_type; /* DLM_MSG_ */
345 uint32_t m_nodeid;
346 uint32_t m_pid;
347 uint32_t m_lkid; /* lkid on sender */
348 uint32_t m_remid; /* lkid on receiver */
349 uint32_t m_parent_lkid;
350 uint32_t m_parent_remid;
351 uint32_t m_exflags;
352 uint32_t m_sbflags;
353 uint32_t m_flags;
354 uint32_t m_lvbseq;
355 uint32_t m_hash;
356 int m_status;
357 int m_grmode;
358 int m_rqmode;
359 int m_bastmode;
360 int m_asts;
361 int m_result; /* 0 or -EXXX */
362 char m_extra[0]; /* name or lvb */
363};
364
365
366#define DLM_RS_NODES 0x00000001
367#define DLM_RS_NODES_ALL 0x00000002
368#define DLM_RS_DIR 0x00000004
369#define DLM_RS_DIR_ALL 0x00000008
370#define DLM_RS_LOCKS 0x00000010
371#define DLM_RS_LOCKS_ALL 0x00000020
372#define DLM_RS_DONE 0x00000040
373#define DLM_RS_DONE_ALL 0x00000080
374
375#define DLM_RCOM_STATUS 1
376#define DLM_RCOM_NAMES 2
377#define DLM_RCOM_LOOKUP 3
378#define DLM_RCOM_LOCK 4
379#define DLM_RCOM_STATUS_REPLY 5
380#define DLM_RCOM_NAMES_REPLY 6
381#define DLM_RCOM_LOOKUP_REPLY 7
382#define DLM_RCOM_LOCK_REPLY 8
383
384struct dlm_rcom {
385 struct dlm_header rc_header;
386 uint32_t rc_type; /* DLM_RCOM_ */
387 int rc_result; /* multi-purpose */
388 uint64_t rc_id; /* match reply with request */
389 char rc_buf[0];
390};
391
392struct rcom_config {
393 uint32_t rf_lvblen;
394 uint32_t rf_lsflags;
395 uint64_t rf_unused;
396};
397
398struct rcom_lock {
399 uint32_t rl_ownpid;
400 uint32_t rl_lkid;
401 uint32_t rl_remid;
402 uint32_t rl_parent_lkid;
403 uint32_t rl_parent_remid;
404 uint32_t rl_exflags;
405 uint32_t rl_flags;
406 uint32_t rl_lvbseq;
407 int rl_result;
408 int8_t rl_rqmode;
409 int8_t rl_grmode;
410 int8_t rl_status;
411 int8_t rl_asts;
412 uint16_t rl_wait_type;
413 uint16_t rl_namelen;
414 char rl_name[DLM_RESNAME_MAXLEN];
415 char rl_lvb[0];
416};
417
418struct dlm_ls {
419 struct list_head ls_list; /* list of lockspaces */
420 dlm_lockspace_t *ls_local_handle;
421 uint32_t ls_global_id; /* global unique lockspace ID */
422 uint32_t ls_exflags;
423 int ls_lvblen;
424 int ls_count; /* reference count */
425 unsigned long ls_flags; /* LSFL_ */
426 struct kobject ls_kobj;
427
428 struct dlm_rsbtable *ls_rsbtbl;
429 uint32_t ls_rsbtbl_size;
430
431 struct dlm_lkbtable *ls_lkbtbl;
432 uint32_t ls_lkbtbl_size;
433
434 struct dlm_dirtable *ls_dirtbl;
435 uint32_t ls_dirtbl_size;
436
437 struct mutex ls_waiters_mutex;
438 struct list_head ls_waiters; /* lkbs needing a reply */
439
440 struct list_head ls_nodes; /* current nodes in ls */
441 struct list_head ls_nodes_gone; /* dead node list, recovery */
442 int ls_num_nodes; /* number of nodes in ls */
443 int ls_low_nodeid;
444 int ls_total_weight;
445 int *ls_node_array;
446
447 struct dlm_rsb ls_stub_rsb; /* for returning errors */
448 struct dlm_lkb ls_stub_lkb; /* for returning errors */
449 struct dlm_message ls_stub_ms; /* for faking a reply */
450
451 struct dentry *ls_debug_rsb_dentry; /* debugfs */
452 struct dentry *ls_debug_waiters_dentry; /* debugfs */
453
454 wait_queue_head_t ls_uevent_wait; /* user part of join/leave */
455 int ls_uevent_result;
456
457 struct miscdevice ls_device;
458
459 /* recovery related */
460
461 struct timer_list ls_timer;
462 struct task_struct *ls_recoverd_task;
463 struct mutex ls_recoverd_active;
464 spinlock_t ls_recover_lock;
465 uint32_t ls_recover_status; /* DLM_RS_ */
466 uint64_t ls_recover_seq;
467 struct dlm_recover *ls_recover_args;
468 struct rw_semaphore ls_in_recovery; /* block local requests */
469 struct list_head ls_requestqueue;/* queue remote requests */
470 struct mutex ls_requestqueue_mutex;
471 char *ls_recover_buf;
472 int ls_recover_nodeid; /* for debugging */
473 uint64_t ls_rcom_seq;
474 struct list_head ls_recover_list;
475 spinlock_t ls_recover_list_lock;
476 int ls_recover_list_count;
477 wait_queue_head_t ls_wait_general;
478 struct mutex ls_clear_proc_locks;
479
480 struct list_head ls_root_list; /* root resources */
481 struct rw_semaphore ls_root_sem; /* protect root_list */
482
483 int ls_namelen;
484 char ls_name[1];
485};
486
487#define LSFL_WORK 0
488#define LSFL_RUNNING 1
489#define LSFL_RECOVERY_STOP 2
490#define LSFL_RCOM_READY 3
491#define LSFL_UEVENT_WAIT 4
492
493/* much of this is just saving user space pointers associated with the
494 lock that we pass back to the user lib with an ast */
495
496struct dlm_user_args {
497 struct dlm_user_proc *proc; /* each process that opens the lockspace
498 device has private data
499 (dlm_user_proc) on the struct file,
500 the process's locks point back to it*/
501 struct dlm_lksb lksb;
502 int old_mode;
503 int update_user_lvb;
504 struct dlm_lksb __user *user_lksb;
505 void __user *castparam;
506 void __user *castaddr;
507 void __user *bastparam;
508 void __user *bastaddr;
509};
510
511#define DLM_PROC_FLAGS_CLOSING 1
512#define DLM_PROC_FLAGS_COMPAT 2
513
514/* locks list is kept so we can remove all a process's locks when it
515 exits (or orphan those that are persistent) */
516
517struct dlm_user_proc {
518 dlm_lockspace_t *lockspace;
519 unsigned long flags; /* DLM_PROC_FLAGS */
520 struct list_head asts;
521 spinlock_t asts_spin;
522 struct list_head locks;
523 spinlock_t locks_spin;
524 wait_queue_head_t wait;
525};
526
527static inline int dlm_locking_stopped(struct dlm_ls *ls)
528{
529 return !test_bit(LSFL_RUNNING, &ls->ls_flags);
530}
531
532static inline int dlm_recovery_stopped(struct dlm_ls *ls)
533{
534 return test_bit(LSFL_RECOVERY_STOP, &ls->ls_flags);
535}
536
537static inline int dlm_no_directory(struct dlm_ls *ls)
538{
539 return (ls->ls_exflags & DLM_LSFL_NODIR) ? 1 : 0;
540}
541
542#endif /* __DLM_INTERNAL_DOT_H__ */
543
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
new file mode 100644
index 000000000000..3f2befa4797b
--- /dev/null
+++ b/fs/dlm/lock.c
@@ -0,0 +1,3871 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13/* Central locking logic has four stages:
14
15 dlm_lock()
16 dlm_unlock()
17
18 request_lock(ls, lkb)
19 convert_lock(ls, lkb)
20 unlock_lock(ls, lkb)
21 cancel_lock(ls, lkb)
22
23 _request_lock(r, lkb)
24 _convert_lock(r, lkb)
25 _unlock_lock(r, lkb)
26 _cancel_lock(r, lkb)
27
28 do_request(r, lkb)
29 do_convert(r, lkb)
30 do_unlock(r, lkb)
31 do_cancel(r, lkb)
32
33 Stage 1 (lock, unlock) is mainly about checking input args and
34 splitting into one of the four main operations:
35
36 dlm_lock = request_lock
37 dlm_lock+CONVERT = convert_lock
38 dlm_unlock = unlock_lock
39 dlm_unlock+CANCEL = cancel_lock
40
41 Stage 2, xxxx_lock(), just finds and locks the relevant rsb which is
42 provided to the next stage.
43
44 Stage 3, _xxxx_lock(), determines if the operation is local or remote.
45 When remote, it calls send_xxxx(), when local it calls do_xxxx().
46
47 Stage 4, do_xxxx(), is the guts of the operation. It manipulates the
48 given rsb and lkb and queues callbacks.
49
50 For remote operations, send_xxxx() results in the corresponding do_xxxx()
51 function being executed on the remote node. The connecting send/receive
52 calls on local (L) and remote (R) nodes:
53
54 L: send_xxxx() -> R: receive_xxxx()
55 R: do_xxxx()
56 L: receive_xxxx_reply() <- R: send_xxxx_reply()
57*/
58#include <linux/types.h>
59#include "dlm_internal.h"
60#include <linux/dlm_device.h>
61#include "memory.h"
62#include "lowcomms.h"
63#include "requestqueue.h"
64#include "util.h"
65#include "dir.h"
66#include "member.h"
67#include "lockspace.h"
68#include "ast.h"
69#include "lock.h"
70#include "rcom.h"
71#include "recover.h"
72#include "lvb_table.h"
73#include "user.h"
74#include "config.h"
75
76static int send_request(struct dlm_rsb *r, struct dlm_lkb *lkb);
77static int send_convert(struct dlm_rsb *r, struct dlm_lkb *lkb);
78static int send_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb);
79static int send_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb);
80static int send_grant(struct dlm_rsb *r, struct dlm_lkb *lkb);
81static int send_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int mode);
82static int send_lookup(struct dlm_rsb *r, struct dlm_lkb *lkb);
83static int send_remove(struct dlm_rsb *r);
84static int _request_lock(struct dlm_rsb *r, struct dlm_lkb *lkb);
85static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
86 struct dlm_message *ms);
87static int receive_extralen(struct dlm_message *ms);
88
89/*
90 * Lock compatibilty matrix - thanks Steve
91 * UN = Unlocked state. Not really a state, used as a flag
92 * PD = Padding. Used to make the matrix a nice power of two in size
93 * Other states are the same as the VMS DLM.
94 * Usage: matrix[grmode+1][rqmode+1] (although m[rq+1][gr+1] is the same)
95 */
96
97static const int __dlm_compat_matrix[8][8] = {
98 /* UN NL CR CW PR PW EX PD */
99 {1, 1, 1, 1, 1, 1, 1, 0}, /* UN */
100 {1, 1, 1, 1, 1, 1, 1, 0}, /* NL */
101 {1, 1, 1, 1, 1, 1, 0, 0}, /* CR */
102 {1, 1, 1, 1, 0, 0, 0, 0}, /* CW */
103 {1, 1, 1, 0, 1, 0, 0, 0}, /* PR */
104 {1, 1, 1, 0, 0, 0, 0, 0}, /* PW */
105 {1, 1, 0, 0, 0, 0, 0, 0}, /* EX */
106 {0, 0, 0, 0, 0, 0, 0, 0} /* PD */
107};
108
109/*
110 * This defines the direction of transfer of LVB data.
111 * Granted mode is the row; requested mode is the column.
112 * Usage: matrix[grmode+1][rqmode+1]
113 * 1 = LVB is returned to the caller
114 * 0 = LVB is written to the resource
115 * -1 = nothing happens to the LVB
116 */
117
118const int dlm_lvb_operations[8][8] = {
119 /* UN NL CR CW PR PW EX PD*/
120 { -1, 1, 1, 1, 1, 1, 1, -1 }, /* UN */
121 { -1, 1, 1, 1, 1, 1, 1, 0 }, /* NL */
122 { -1, -1, 1, 1, 1, 1, 1, 0 }, /* CR */
123 { -1, -1, -1, 1, 1, 1, 1, 0 }, /* CW */
124 { -1, -1, -1, -1, 1, 1, 1, 0 }, /* PR */
125 { -1, 0, 0, 0, 0, 0, 1, 0 }, /* PW */
126 { -1, 0, 0, 0, 0, 0, 0, 0 }, /* EX */
127 { -1, 0, 0, 0, 0, 0, 0, 0 } /* PD */
128};
129
130#define modes_compat(gr, rq) \
131 __dlm_compat_matrix[(gr)->lkb_grmode + 1][(rq)->lkb_rqmode + 1]
132
133int dlm_modes_compat(int mode1, int mode2)
134{
135 return __dlm_compat_matrix[mode1 + 1][mode2 + 1];
136}
137
138/*
139 * Compatibility matrix for conversions with QUECVT set.
140 * Granted mode is the row; requested mode is the column.
141 * Usage: matrix[grmode+1][rqmode+1]
142 */
143
144static const int __quecvt_compat_matrix[8][8] = {
145 /* UN NL CR CW PR PW EX PD */
146 {0, 0, 0, 0, 0, 0, 0, 0}, /* UN */
147 {0, 0, 1, 1, 1, 1, 1, 0}, /* NL */
148 {0, 0, 0, 1, 1, 1, 1, 0}, /* CR */
149 {0, 0, 0, 0, 1, 1, 1, 0}, /* CW */
150 {0, 0, 0, 1, 0, 1, 1, 0}, /* PR */
151 {0, 0, 0, 0, 0, 0, 1, 0}, /* PW */
152 {0, 0, 0, 0, 0, 0, 0, 0}, /* EX */
153 {0, 0, 0, 0, 0, 0, 0, 0} /* PD */
154};
155
156void dlm_print_lkb(struct dlm_lkb *lkb)
157{
158 printk(KERN_ERR "lkb: nodeid %d id %x remid %x exflags %x flags %x\n"
159 " status %d rqmode %d grmode %d wait_type %d ast_type %d\n",
160 lkb->lkb_nodeid, lkb->lkb_id, lkb->lkb_remid, lkb->lkb_exflags,
161 lkb->lkb_flags, lkb->lkb_status, lkb->lkb_rqmode,
162 lkb->lkb_grmode, lkb->lkb_wait_type, lkb->lkb_ast_type);
163}
164
165void dlm_print_rsb(struct dlm_rsb *r)
166{
167 printk(KERN_ERR "rsb: nodeid %d flags %lx first %x rlc %d name %s\n",
168 r->res_nodeid, r->res_flags, r->res_first_lkid,
169 r->res_recover_locks_count, r->res_name);
170}
171
172void dlm_dump_rsb(struct dlm_rsb *r)
173{
174 struct dlm_lkb *lkb;
175
176 dlm_print_rsb(r);
177
178 printk(KERN_ERR "rsb: root_list empty %d recover_list empty %d\n",
179 list_empty(&r->res_root_list), list_empty(&r->res_recover_list));
180 printk(KERN_ERR "rsb lookup list\n");
181 list_for_each_entry(lkb, &r->res_lookup, lkb_rsb_lookup)
182 dlm_print_lkb(lkb);
183 printk(KERN_ERR "rsb grant queue:\n");
184 list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue)
185 dlm_print_lkb(lkb);
186 printk(KERN_ERR "rsb convert queue:\n");
187 list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue)
188 dlm_print_lkb(lkb);
189 printk(KERN_ERR "rsb wait queue:\n");
190 list_for_each_entry(lkb, &r->res_waitqueue, lkb_statequeue)
191 dlm_print_lkb(lkb);
192}
193
194/* Threads cannot use the lockspace while it's being recovered */
195
196static inline void lock_recovery(struct dlm_ls *ls)
197{
198 down_read(&ls->ls_in_recovery);
199}
200
201static inline void unlock_recovery(struct dlm_ls *ls)
202{
203 up_read(&ls->ls_in_recovery);
204}
205
206static inline int lock_recovery_try(struct dlm_ls *ls)
207{
208 return down_read_trylock(&ls->ls_in_recovery);
209}
210
211static inline int can_be_queued(struct dlm_lkb *lkb)
212{
213 return !(lkb->lkb_exflags & DLM_LKF_NOQUEUE);
214}
215
216static inline int force_blocking_asts(struct dlm_lkb *lkb)
217{
218 return (lkb->lkb_exflags & DLM_LKF_NOQUEUEBAST);
219}
220
221static inline int is_demoted(struct dlm_lkb *lkb)
222{
223 return (lkb->lkb_sbflags & DLM_SBF_DEMOTED);
224}
225
226static inline int is_remote(struct dlm_rsb *r)
227{
228 DLM_ASSERT(r->res_nodeid >= 0, dlm_print_rsb(r););
229 return !!r->res_nodeid;
230}
231
232static inline int is_process_copy(struct dlm_lkb *lkb)
233{
234 return (lkb->lkb_nodeid && !(lkb->lkb_flags & DLM_IFL_MSTCPY));
235}
236
237static inline int is_master_copy(struct dlm_lkb *lkb)
238{
239 if (lkb->lkb_flags & DLM_IFL_MSTCPY)
240 DLM_ASSERT(lkb->lkb_nodeid, dlm_print_lkb(lkb););
241 return (lkb->lkb_flags & DLM_IFL_MSTCPY) ? 1 : 0;
242}
243
244static inline int middle_conversion(struct dlm_lkb *lkb)
245{
246 if ((lkb->lkb_grmode==DLM_LOCK_PR && lkb->lkb_rqmode==DLM_LOCK_CW) ||
247 (lkb->lkb_rqmode==DLM_LOCK_PR && lkb->lkb_grmode==DLM_LOCK_CW))
248 return 1;
249 return 0;
250}
251
252static inline int down_conversion(struct dlm_lkb *lkb)
253{
254 return (!middle_conversion(lkb) && lkb->lkb_rqmode < lkb->lkb_grmode);
255}
256
257static void queue_cast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
258{
259 if (is_master_copy(lkb))
260 return;
261
262 DLM_ASSERT(lkb->lkb_lksb, dlm_print_lkb(lkb););
263
264 lkb->lkb_lksb->sb_status = rv;
265 lkb->lkb_lksb->sb_flags = lkb->lkb_sbflags;
266
267 dlm_add_ast(lkb, AST_COMP);
268}
269
270static void queue_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rqmode)
271{
272 if (is_master_copy(lkb))
273 send_bast(r, lkb, rqmode);
274 else {
275 lkb->lkb_bastmode = rqmode;
276 dlm_add_ast(lkb, AST_BAST);
277 }
278}
279
280/*
281 * Basic operations on rsb's and lkb's
282 */
283
284static struct dlm_rsb *create_rsb(struct dlm_ls *ls, char *name, int len)
285{
286 struct dlm_rsb *r;
287
288 r = allocate_rsb(ls, len);
289 if (!r)
290 return NULL;
291
292 r->res_ls = ls;
293 r->res_length = len;
294 memcpy(r->res_name, name, len);
295 mutex_init(&r->res_mutex);
296
297 INIT_LIST_HEAD(&r->res_lookup);
298 INIT_LIST_HEAD(&r->res_grantqueue);
299 INIT_LIST_HEAD(&r->res_convertqueue);
300 INIT_LIST_HEAD(&r->res_waitqueue);
301 INIT_LIST_HEAD(&r->res_root_list);
302 INIT_LIST_HEAD(&r->res_recover_list);
303
304 return r;
305}
306
307static int search_rsb_list(struct list_head *head, char *name, int len,
308 unsigned int flags, struct dlm_rsb **r_ret)
309{
310 struct dlm_rsb *r;
311 int error = 0;
312
313 list_for_each_entry(r, head, res_hashchain) {
314 if (len == r->res_length && !memcmp(name, r->res_name, len))
315 goto found;
316 }
317 return -EBADR;
318
319 found:
320 if (r->res_nodeid && (flags & R_MASTER))
321 error = -ENOTBLK;
322 *r_ret = r;
323 return error;
324}
325
326static int _search_rsb(struct dlm_ls *ls, char *name, int len, int b,
327 unsigned int flags, struct dlm_rsb **r_ret)
328{
329 struct dlm_rsb *r;
330 int error;
331
332 error = search_rsb_list(&ls->ls_rsbtbl[b].list, name, len, flags, &r);
333 if (!error) {
334 kref_get(&r->res_ref);
335 goto out;
336 }
337 error = search_rsb_list(&ls->ls_rsbtbl[b].toss, name, len, flags, &r);
338 if (error)
339 goto out;
340
341 list_move(&r->res_hashchain, &ls->ls_rsbtbl[b].list);
342
343 if (dlm_no_directory(ls))
344 goto out;
345
346 if (r->res_nodeid == -1) {
347 rsb_clear_flag(r, RSB_MASTER_UNCERTAIN);
348 r->res_first_lkid = 0;
349 } else if (r->res_nodeid > 0) {
350 rsb_set_flag(r, RSB_MASTER_UNCERTAIN);
351 r->res_first_lkid = 0;
352 } else {
353 DLM_ASSERT(r->res_nodeid == 0, dlm_print_rsb(r););
354 DLM_ASSERT(!rsb_flag(r, RSB_MASTER_UNCERTAIN),);
355 }
356 out:
357 *r_ret = r;
358 return error;
359}
360
361static int search_rsb(struct dlm_ls *ls, char *name, int len, int b,
362 unsigned int flags, struct dlm_rsb **r_ret)
363{
364 int error;
365 write_lock(&ls->ls_rsbtbl[b].lock);
366 error = _search_rsb(ls, name, len, b, flags, r_ret);
367 write_unlock(&ls->ls_rsbtbl[b].lock);
368 return error;
369}
370
371/*
372 * Find rsb in rsbtbl and potentially create/add one
373 *
374 * Delaying the release of rsb's has a similar benefit to applications keeping
375 * NL locks on an rsb, but without the guarantee that the cached master value
376 * will still be valid when the rsb is reused. Apps aren't always smart enough
377 * to keep NL locks on an rsb that they may lock again shortly; this can lead
378 * to excessive master lookups and removals if we don't delay the release.
379 *
380 * Searching for an rsb means looking through both the normal list and toss
381 * list. When found on the toss list the rsb is moved to the normal list with
382 * ref count of 1; when found on normal list the ref count is incremented.
383 */
384
385static int find_rsb(struct dlm_ls *ls, char *name, int namelen,
386 unsigned int flags, struct dlm_rsb **r_ret)
387{
388 struct dlm_rsb *r, *tmp;
389 uint32_t hash, bucket;
390 int error = 0;
391
392 if (dlm_no_directory(ls))
393 flags |= R_CREATE;
394
395 hash = jhash(name, namelen, 0);
396 bucket = hash & (ls->ls_rsbtbl_size - 1);
397
398 error = search_rsb(ls, name, namelen, bucket, flags, &r);
399 if (!error)
400 goto out;
401
402 if (error == -EBADR && !(flags & R_CREATE))
403 goto out;
404
405 /* the rsb was found but wasn't a master copy */
406 if (error == -ENOTBLK)
407 goto out;
408
409 error = -ENOMEM;
410 r = create_rsb(ls, name, namelen);
411 if (!r)
412 goto out;
413
414 r->res_hash = hash;
415 r->res_bucket = bucket;
416 r->res_nodeid = -1;
417 kref_init(&r->res_ref);
418
419 /* With no directory, the master can be set immediately */
420 if (dlm_no_directory(ls)) {
421 int nodeid = dlm_dir_nodeid(r);
422 if (nodeid == dlm_our_nodeid())
423 nodeid = 0;
424 r->res_nodeid = nodeid;
425 }
426
427 write_lock(&ls->ls_rsbtbl[bucket].lock);
428 error = _search_rsb(ls, name, namelen, bucket, 0, &tmp);
429 if (!error) {
430 write_unlock(&ls->ls_rsbtbl[bucket].lock);
431 free_rsb(r);
432 r = tmp;
433 goto out;
434 }
435 list_add(&r->res_hashchain, &ls->ls_rsbtbl[bucket].list);
436 write_unlock(&ls->ls_rsbtbl[bucket].lock);
437 error = 0;
438 out:
439 *r_ret = r;
440 return error;
441}
442
443int dlm_find_rsb(struct dlm_ls *ls, char *name, int namelen,
444 unsigned int flags, struct dlm_rsb **r_ret)
445{
446 return find_rsb(ls, name, namelen, flags, r_ret);
447}
448
449/* This is only called to add a reference when the code already holds
450 a valid reference to the rsb, so there's no need for locking. */
451
452static inline void hold_rsb(struct dlm_rsb *r)
453{
454 kref_get(&r->res_ref);
455}
456
457void dlm_hold_rsb(struct dlm_rsb *r)
458{
459 hold_rsb(r);
460}
461
462static void toss_rsb(struct kref *kref)
463{
464 struct dlm_rsb *r = container_of(kref, struct dlm_rsb, res_ref);
465 struct dlm_ls *ls = r->res_ls;
466
467 DLM_ASSERT(list_empty(&r->res_root_list), dlm_print_rsb(r););
468 kref_init(&r->res_ref);
469 list_move(&r->res_hashchain, &ls->ls_rsbtbl[r->res_bucket].toss);
470 r->res_toss_time = jiffies;
471 if (r->res_lvbptr) {
472 free_lvb(r->res_lvbptr);
473 r->res_lvbptr = NULL;
474 }
475}
476
477/* When all references to the rsb are gone it's transfered to
478 the tossed list for later disposal. */
479
480static void put_rsb(struct dlm_rsb *r)
481{
482 struct dlm_ls *ls = r->res_ls;
483 uint32_t bucket = r->res_bucket;
484
485 write_lock(&ls->ls_rsbtbl[bucket].lock);
486 kref_put(&r->res_ref, toss_rsb);
487 write_unlock(&ls->ls_rsbtbl[bucket].lock);
488}
489
490void dlm_put_rsb(struct dlm_rsb *r)
491{
492 put_rsb(r);
493}
494
495/* See comment for unhold_lkb */
496
497static void unhold_rsb(struct dlm_rsb *r)
498{
499 int rv;
500 rv = kref_put(&r->res_ref, toss_rsb);
501 DLM_ASSERT(!rv, dlm_dump_rsb(r););
502}
503
504static void kill_rsb(struct kref *kref)
505{
506 struct dlm_rsb *r = container_of(kref, struct dlm_rsb, res_ref);
507
508 /* All work is done after the return from kref_put() so we
509 can release the write_lock before the remove and free. */
510
511 DLM_ASSERT(list_empty(&r->res_lookup), dlm_dump_rsb(r););
512 DLM_ASSERT(list_empty(&r->res_grantqueue), dlm_dump_rsb(r););
513 DLM_ASSERT(list_empty(&r->res_convertqueue), dlm_dump_rsb(r););
514 DLM_ASSERT(list_empty(&r->res_waitqueue), dlm_dump_rsb(r););
515 DLM_ASSERT(list_empty(&r->res_root_list), dlm_dump_rsb(r););
516 DLM_ASSERT(list_empty(&r->res_recover_list), dlm_dump_rsb(r););
517}
518
519/* Attaching/detaching lkb's from rsb's is for rsb reference counting.
520 The rsb must exist as long as any lkb's for it do. */
521
522static void attach_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb)
523{
524 hold_rsb(r);
525 lkb->lkb_resource = r;
526}
527
528static void detach_lkb(struct dlm_lkb *lkb)
529{
530 if (lkb->lkb_resource) {
531 put_rsb(lkb->lkb_resource);
532 lkb->lkb_resource = NULL;
533 }
534}
535
536static int create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret)
537{
538 struct dlm_lkb *lkb, *tmp;
539 uint32_t lkid = 0;
540 uint16_t bucket;
541
542 lkb = allocate_lkb(ls);
543 if (!lkb)
544 return -ENOMEM;
545
546 lkb->lkb_nodeid = -1;
547 lkb->lkb_grmode = DLM_LOCK_IV;
548 kref_init(&lkb->lkb_ref);
549 INIT_LIST_HEAD(&lkb->lkb_ownqueue);
550
551 get_random_bytes(&bucket, sizeof(bucket));
552 bucket &= (ls->ls_lkbtbl_size - 1);
553
554 write_lock(&ls->ls_lkbtbl[bucket].lock);
555
556 /* counter can roll over so we must verify lkid is not in use */
557
558 while (lkid == 0) {
559 lkid = bucket | (ls->ls_lkbtbl[bucket].counter++ << 16);
560
561 list_for_each_entry(tmp, &ls->ls_lkbtbl[bucket].list,
562 lkb_idtbl_list) {
563 if (tmp->lkb_id != lkid)
564 continue;
565 lkid = 0;
566 break;
567 }
568 }
569
570 lkb->lkb_id = lkid;
571 list_add(&lkb->lkb_idtbl_list, &ls->ls_lkbtbl[bucket].list);
572 write_unlock(&ls->ls_lkbtbl[bucket].lock);
573
574 *lkb_ret = lkb;
575 return 0;
576}
577
578static struct dlm_lkb *__find_lkb(struct dlm_ls *ls, uint32_t lkid)
579{
580 uint16_t bucket = lkid & 0xFFFF;
581 struct dlm_lkb *lkb;
582
583 list_for_each_entry(lkb, &ls->ls_lkbtbl[bucket].list, lkb_idtbl_list) {
584 if (lkb->lkb_id == lkid)
585 return lkb;
586 }
587 return NULL;
588}
589
590static int find_lkb(struct dlm_ls *ls, uint32_t lkid, struct dlm_lkb **lkb_ret)
591{
592 struct dlm_lkb *lkb;
593 uint16_t bucket = lkid & 0xFFFF;
594
595 if (bucket >= ls->ls_lkbtbl_size)
596 return -EBADSLT;
597
598 read_lock(&ls->ls_lkbtbl[bucket].lock);
599 lkb = __find_lkb(ls, lkid);
600 if (lkb)
601 kref_get(&lkb->lkb_ref);
602 read_unlock(&ls->ls_lkbtbl[bucket].lock);
603
604 *lkb_ret = lkb;
605 return lkb ? 0 : -ENOENT;
606}
607
608static void kill_lkb(struct kref *kref)
609{
610 struct dlm_lkb *lkb = container_of(kref, struct dlm_lkb, lkb_ref);
611
612 /* All work is done after the return from kref_put() so we
613 can release the write_lock before the detach_lkb */
614
615 DLM_ASSERT(!lkb->lkb_status, dlm_print_lkb(lkb););
616}
617
618/* __put_lkb() is used when an lkb may not have an rsb attached to
619 it so we need to provide the lockspace explicitly */
620
621static int __put_lkb(struct dlm_ls *ls, struct dlm_lkb *lkb)
622{
623 uint16_t bucket = lkb->lkb_id & 0xFFFF;
624
625 write_lock(&ls->ls_lkbtbl[bucket].lock);
626 if (kref_put(&lkb->lkb_ref, kill_lkb)) {
627 list_del(&lkb->lkb_idtbl_list);
628 write_unlock(&ls->ls_lkbtbl[bucket].lock);
629
630 detach_lkb(lkb);
631
632 /* for local/process lkbs, lvbptr points to caller's lksb */
633 if (lkb->lkb_lvbptr && is_master_copy(lkb))
634 free_lvb(lkb->lkb_lvbptr);
635 free_lkb(lkb);
636 return 1;
637 } else {
638 write_unlock(&ls->ls_lkbtbl[bucket].lock);
639 return 0;
640 }
641}
642
643int dlm_put_lkb(struct dlm_lkb *lkb)
644{
645 struct dlm_ls *ls;
646
647 DLM_ASSERT(lkb->lkb_resource, dlm_print_lkb(lkb););
648 DLM_ASSERT(lkb->lkb_resource->res_ls, dlm_print_lkb(lkb););
649
650 ls = lkb->lkb_resource->res_ls;
651 return __put_lkb(ls, lkb);
652}
653
654/* This is only called to add a reference when the code already holds
655 a valid reference to the lkb, so there's no need for locking. */
656
657static inline void hold_lkb(struct dlm_lkb *lkb)
658{
659 kref_get(&lkb->lkb_ref);
660}
661
662/* This is called when we need to remove a reference and are certain
663 it's not the last ref. e.g. del_lkb is always called between a
664 find_lkb/put_lkb and is always the inverse of a previous add_lkb.
665 put_lkb would work fine, but would involve unnecessary locking */
666
667static inline void unhold_lkb(struct dlm_lkb *lkb)
668{
669 int rv;
670 rv = kref_put(&lkb->lkb_ref, kill_lkb);
671 DLM_ASSERT(!rv, dlm_print_lkb(lkb););
672}
673
674static void lkb_add_ordered(struct list_head *new, struct list_head *head,
675 int mode)
676{
677 struct dlm_lkb *lkb = NULL;
678
679 list_for_each_entry(lkb, head, lkb_statequeue)
680 if (lkb->lkb_rqmode < mode)
681 break;
682
683 if (!lkb)
684 list_add_tail(new, head);
685 else
686 __list_add(new, lkb->lkb_statequeue.prev, &lkb->lkb_statequeue);
687}
688
689/* add/remove lkb to rsb's grant/convert/wait queue */
690
691static void add_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb, int status)
692{
693 kref_get(&lkb->lkb_ref);
694
695 DLM_ASSERT(!lkb->lkb_status, dlm_print_lkb(lkb););
696
697 lkb->lkb_status = status;
698
699 switch (status) {
700 case DLM_LKSTS_WAITING:
701 if (lkb->lkb_exflags & DLM_LKF_HEADQUE)
702 list_add(&lkb->lkb_statequeue, &r->res_waitqueue);
703 else
704 list_add_tail(&lkb->lkb_statequeue, &r->res_waitqueue);
705 break;
706 case DLM_LKSTS_GRANTED:
707 /* convention says granted locks kept in order of grmode */
708 lkb_add_ordered(&lkb->lkb_statequeue, &r->res_grantqueue,
709 lkb->lkb_grmode);
710 break;
711 case DLM_LKSTS_CONVERT:
712 if (lkb->lkb_exflags & DLM_LKF_HEADQUE)
713 list_add(&lkb->lkb_statequeue, &r->res_convertqueue);
714 else
715 list_add_tail(&lkb->lkb_statequeue,
716 &r->res_convertqueue);
717 break;
718 default:
719 DLM_ASSERT(0, dlm_print_lkb(lkb); printk("sts=%d\n", status););
720 }
721}
722
723static void del_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb)
724{
725 lkb->lkb_status = 0;
726 list_del(&lkb->lkb_statequeue);
727 unhold_lkb(lkb);
728}
729
730static void move_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb, int sts)
731{
732 hold_lkb(lkb);
733 del_lkb(r, lkb);
734 add_lkb(r, lkb, sts);
735 unhold_lkb(lkb);
736}
737
738/* add/remove lkb from global waiters list of lkb's waiting for
739 a reply from a remote node */
740
741static void add_to_waiters(struct dlm_lkb *lkb, int mstype)
742{
743 struct dlm_ls *ls = lkb->lkb_resource->res_ls;
744
745 mutex_lock(&ls->ls_waiters_mutex);
746 if (lkb->lkb_wait_type) {
747 log_print("add_to_waiters error %d", lkb->lkb_wait_type);
748 goto out;
749 }
750 lkb->lkb_wait_type = mstype;
751 kref_get(&lkb->lkb_ref);
752 list_add(&lkb->lkb_wait_reply, &ls->ls_waiters);
753 out:
754 mutex_unlock(&ls->ls_waiters_mutex);
755}
756
757static int _remove_from_waiters(struct dlm_lkb *lkb)
758{
759 int error = 0;
760
761 if (!lkb->lkb_wait_type) {
762 log_print("remove_from_waiters error");
763 error = -EINVAL;
764 goto out;
765 }
766 lkb->lkb_wait_type = 0;
767 list_del(&lkb->lkb_wait_reply);
768 unhold_lkb(lkb);
769 out:
770 return error;
771}
772
773static int remove_from_waiters(struct dlm_lkb *lkb)
774{
775 struct dlm_ls *ls = lkb->lkb_resource->res_ls;
776 int error;
777
778 mutex_lock(&ls->ls_waiters_mutex);
779 error = _remove_from_waiters(lkb);
780 mutex_unlock(&ls->ls_waiters_mutex);
781 return error;
782}
783
784static void dir_remove(struct dlm_rsb *r)
785{
786 int to_nodeid;
787
788 if (dlm_no_directory(r->res_ls))
789 return;
790
791 to_nodeid = dlm_dir_nodeid(r);
792 if (to_nodeid != dlm_our_nodeid())
793 send_remove(r);
794 else
795 dlm_dir_remove_entry(r->res_ls, to_nodeid,
796 r->res_name, r->res_length);
797}
798
799/* FIXME: shouldn't this be able to exit as soon as one non-due rsb is
800 found since they are in order of newest to oldest? */
801
802static int shrink_bucket(struct dlm_ls *ls, int b)
803{
804 struct dlm_rsb *r;
805 int count = 0, found;
806
807 for (;;) {
808 found = 0;
809 write_lock(&ls->ls_rsbtbl[b].lock);
810 list_for_each_entry_reverse(r, &ls->ls_rsbtbl[b].toss,
811 res_hashchain) {
812 if (!time_after_eq(jiffies, r->res_toss_time +
813 dlm_config.toss_secs * HZ))
814 continue;
815 found = 1;
816 break;
817 }
818
819 if (!found) {
820 write_unlock(&ls->ls_rsbtbl[b].lock);
821 break;
822 }
823
824 if (kref_put(&r->res_ref, kill_rsb)) {
825 list_del(&r->res_hashchain);
826 write_unlock(&ls->ls_rsbtbl[b].lock);
827
828 if (is_master(r))
829 dir_remove(r);
830 free_rsb(r);
831 count++;
832 } else {
833 write_unlock(&ls->ls_rsbtbl[b].lock);
834 log_error(ls, "tossed rsb in use %s", r->res_name);
835 }
836 }
837
838 return count;
839}
840
841void dlm_scan_rsbs(struct dlm_ls *ls)
842{
843 int i;
844
845 if (dlm_locking_stopped(ls))
846 return;
847
848 for (i = 0; i < ls->ls_rsbtbl_size; i++) {
849 shrink_bucket(ls, i);
850 cond_resched();
851 }
852}
853
854/* lkb is master or local copy */
855
856static void set_lvb_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
857{
858 int b, len = r->res_ls->ls_lvblen;
859
860 /* b=1 lvb returned to caller
861 b=0 lvb written to rsb or invalidated
862 b=-1 do nothing */
863
864 b = dlm_lvb_operations[lkb->lkb_grmode + 1][lkb->lkb_rqmode + 1];
865
866 if (b == 1) {
867 if (!lkb->lkb_lvbptr)
868 return;
869
870 if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
871 return;
872
873 if (!r->res_lvbptr)
874 return;
875
876 memcpy(lkb->lkb_lvbptr, r->res_lvbptr, len);
877 lkb->lkb_lvbseq = r->res_lvbseq;
878
879 } else if (b == 0) {
880 if (lkb->lkb_exflags & DLM_LKF_IVVALBLK) {
881 rsb_set_flag(r, RSB_VALNOTVALID);
882 return;
883 }
884
885 if (!lkb->lkb_lvbptr)
886 return;
887
888 if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
889 return;
890
891 if (!r->res_lvbptr)
892 r->res_lvbptr = allocate_lvb(r->res_ls);
893
894 if (!r->res_lvbptr)
895 return;
896
897 memcpy(r->res_lvbptr, lkb->lkb_lvbptr, len);
898 r->res_lvbseq++;
899 lkb->lkb_lvbseq = r->res_lvbseq;
900 rsb_clear_flag(r, RSB_VALNOTVALID);
901 }
902
903 if (rsb_flag(r, RSB_VALNOTVALID))
904 lkb->lkb_sbflags |= DLM_SBF_VALNOTVALID;
905}
906
907static void set_lvb_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb)
908{
909 if (lkb->lkb_grmode < DLM_LOCK_PW)
910 return;
911
912 if (lkb->lkb_exflags & DLM_LKF_IVVALBLK) {
913 rsb_set_flag(r, RSB_VALNOTVALID);
914 return;
915 }
916
917 if (!lkb->lkb_lvbptr)
918 return;
919
920 if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
921 return;
922
923 if (!r->res_lvbptr)
924 r->res_lvbptr = allocate_lvb(r->res_ls);
925
926 if (!r->res_lvbptr)
927 return;
928
929 memcpy(r->res_lvbptr, lkb->lkb_lvbptr, r->res_ls->ls_lvblen);
930 r->res_lvbseq++;
931 rsb_clear_flag(r, RSB_VALNOTVALID);
932}
933
934/* lkb is process copy (pc) */
935
936static void set_lvb_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb,
937 struct dlm_message *ms)
938{
939 int b;
940
941 if (!lkb->lkb_lvbptr)
942 return;
943
944 if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
945 return;
946
947 b = dlm_lvb_operations[lkb->lkb_grmode + 1][lkb->lkb_rqmode + 1];
948 if (b == 1) {
949 int len = receive_extralen(ms);
950 memcpy(lkb->lkb_lvbptr, ms->m_extra, len);
951 lkb->lkb_lvbseq = ms->m_lvbseq;
952 }
953}
954
955/* Manipulate lkb's on rsb's convert/granted/waiting queues
956 remove_lock -- used for unlock, removes lkb from granted
957 revert_lock -- used for cancel, moves lkb from convert to granted
958 grant_lock -- used for request and convert, adds lkb to granted or
959 moves lkb from convert or waiting to granted
960
961 Each of these is used for master or local copy lkb's. There is
962 also a _pc() variation used to make the corresponding change on
963 a process copy (pc) lkb. */
964
965static void _remove_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
966{
967 del_lkb(r, lkb);
968 lkb->lkb_grmode = DLM_LOCK_IV;
969 /* this unhold undoes the original ref from create_lkb()
970 so this leads to the lkb being freed */
971 unhold_lkb(lkb);
972}
973
974static void remove_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
975{
976 set_lvb_unlock(r, lkb);
977 _remove_lock(r, lkb);
978}
979
980static void remove_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb)
981{
982 _remove_lock(r, lkb);
983}
984
985static void revert_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
986{
987 lkb->lkb_rqmode = DLM_LOCK_IV;
988
989 switch (lkb->lkb_status) {
990 case DLM_LKSTS_GRANTED:
991 break;
992 case DLM_LKSTS_CONVERT:
993 move_lkb(r, lkb, DLM_LKSTS_GRANTED);
994 break;
995 case DLM_LKSTS_WAITING:
996 del_lkb(r, lkb);
997 lkb->lkb_grmode = DLM_LOCK_IV;
998 /* this unhold undoes the original ref from create_lkb()
999 so this leads to the lkb being freed */
1000 unhold_lkb(lkb);
1001 break;
1002 default:
1003 log_print("invalid status for revert %d", lkb->lkb_status);
1004 }
1005}
1006
1007static void revert_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb)
1008{
1009 revert_lock(r, lkb);
1010}
1011
1012static void _grant_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
1013{
1014 if (lkb->lkb_grmode != lkb->lkb_rqmode) {
1015 lkb->lkb_grmode = lkb->lkb_rqmode;
1016 if (lkb->lkb_status)
1017 move_lkb(r, lkb, DLM_LKSTS_GRANTED);
1018 else
1019 add_lkb(r, lkb, DLM_LKSTS_GRANTED);
1020 }
1021
1022 lkb->lkb_rqmode = DLM_LOCK_IV;
1023}
1024
1025static void grant_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
1026{
1027 set_lvb_lock(r, lkb);
1028 _grant_lock(r, lkb);
1029 lkb->lkb_highbast = 0;
1030}
1031
1032static void grant_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb,
1033 struct dlm_message *ms)
1034{
1035 set_lvb_lock_pc(r, lkb, ms);
1036 _grant_lock(r, lkb);
1037}
1038
1039/* called by grant_pending_locks() which means an async grant message must
1040 be sent to the requesting node in addition to granting the lock if the
1041 lkb belongs to a remote node. */
1042
1043static void grant_lock_pending(struct dlm_rsb *r, struct dlm_lkb *lkb)
1044{
1045 grant_lock(r, lkb);
1046 if (is_master_copy(lkb))
1047 send_grant(r, lkb);
1048 else
1049 queue_cast(r, lkb, 0);
1050}
1051
1052static inline int first_in_list(struct dlm_lkb *lkb, struct list_head *head)
1053{
1054 struct dlm_lkb *first = list_entry(head->next, struct dlm_lkb,
1055 lkb_statequeue);
1056 if (lkb->lkb_id == first->lkb_id)
1057 return 1;
1058
1059 return 0;
1060}
1061
1062/* Check if the given lkb conflicts with another lkb on the queue. */
1063
1064static int queue_conflict(struct list_head *head, struct dlm_lkb *lkb)
1065{
1066 struct dlm_lkb *this;
1067
1068 list_for_each_entry(this, head, lkb_statequeue) {
1069 if (this == lkb)
1070 continue;
1071 if (!modes_compat(this, lkb))
1072 return 1;
1073 }
1074 return 0;
1075}
1076
1077/*
1078 * "A conversion deadlock arises with a pair of lock requests in the converting
1079 * queue for one resource. The granted mode of each lock blocks the requested
1080 * mode of the other lock."
1081 *
1082 * Part 2: if the granted mode of lkb is preventing the first lkb in the
1083 * convert queue from being granted, then demote lkb (set grmode to NL).
1084 * This second form requires that we check for conv-deadlk even when
1085 * now == 0 in _can_be_granted().
1086 *
1087 * Example:
1088 * Granted Queue: empty
1089 * Convert Queue: NL->EX (first lock)
1090 * PR->EX (second lock)
1091 *
1092 * The first lock can't be granted because of the granted mode of the second
1093 * lock and the second lock can't be granted because it's not first in the
1094 * list. We demote the granted mode of the second lock (the lkb passed to this
1095 * function).
1096 *
1097 * After the resolution, the "grant pending" function needs to go back and try
1098 * to grant locks on the convert queue again since the first lock can now be
1099 * granted.
1100 */
1101
1102static int conversion_deadlock_detect(struct dlm_rsb *rsb, struct dlm_lkb *lkb)
1103{
1104 struct dlm_lkb *this, *first = NULL, *self = NULL;
1105
1106 list_for_each_entry(this, &rsb->res_convertqueue, lkb_statequeue) {
1107 if (!first)
1108 first = this;
1109 if (this == lkb) {
1110 self = lkb;
1111 continue;
1112 }
1113
1114 if (!modes_compat(this, lkb) && !modes_compat(lkb, this))
1115 return 1;
1116 }
1117
1118 /* if lkb is on the convert queue and is preventing the first
1119 from being granted, then there's deadlock and we demote lkb.
1120 multiple converting locks may need to do this before the first
1121 converting lock can be granted. */
1122
1123 if (self && self != first) {
1124 if (!modes_compat(lkb, first) &&
1125 !queue_conflict(&rsb->res_grantqueue, first))
1126 return 1;
1127 }
1128
1129 return 0;
1130}
1131
1132/*
1133 * Return 1 if the lock can be granted, 0 otherwise.
1134 * Also detect and resolve conversion deadlocks.
1135 *
1136 * lkb is the lock to be granted
1137 *
1138 * now is 1 if the function is being called in the context of the
1139 * immediate request, it is 0 if called later, after the lock has been
1140 * queued.
1141 *
1142 * References are from chapter 6 of "VAXcluster Principles" by Roy Davis
1143 */
1144
1145static int _can_be_granted(struct dlm_rsb *r, struct dlm_lkb *lkb, int now)
1146{
1147 int8_t conv = (lkb->lkb_grmode != DLM_LOCK_IV);
1148
1149 /*
1150 * 6-10: Version 5.4 introduced an option to address the phenomenon of
1151 * a new request for a NL mode lock being blocked.
1152 *
1153 * 6-11: If the optional EXPEDITE flag is used with the new NL mode
1154 * request, then it would be granted. In essence, the use of this flag
1155 * tells the Lock Manager to expedite theis request by not considering
1156 * what may be in the CONVERTING or WAITING queues... As of this
1157 * writing, the EXPEDITE flag can be used only with new requests for NL
1158 * mode locks. This flag is not valid for conversion requests.
1159 *
1160 * A shortcut. Earlier checks return an error if EXPEDITE is used in a
1161 * conversion or used with a non-NL requested mode. We also know an
1162 * EXPEDITE request is always granted immediately, so now must always
1163 * be 1. The full condition to grant an expedite request: (now &&
1164 * !conv && lkb->rqmode == DLM_LOCK_NL && (flags & EXPEDITE)) can
1165 * therefore be shortened to just checking the flag.
1166 */
1167
1168 if (lkb->lkb_exflags & DLM_LKF_EXPEDITE)
1169 return 1;
1170
1171 /*
1172 * A shortcut. Without this, !queue_conflict(grantqueue, lkb) would be
1173 * added to the remaining conditions.
1174 */
1175
1176 if (queue_conflict(&r->res_grantqueue, lkb))
1177 goto out;
1178
1179 /*
1180 * 6-3: By default, a conversion request is immediately granted if the
1181 * requested mode is compatible with the modes of all other granted
1182 * locks
1183 */
1184
1185 if (queue_conflict(&r->res_convertqueue, lkb))
1186 goto out;
1187
1188 /*
1189 * 6-5: But the default algorithm for deciding whether to grant or
1190 * queue conversion requests does not by itself guarantee that such
1191 * requests are serviced on a "first come first serve" basis. This, in
1192 * turn, can lead to a phenomenon known as "indefinate postponement".
1193 *
1194 * 6-7: This issue is dealt with by using the optional QUECVT flag with
1195 * the system service employed to request a lock conversion. This flag
1196 * forces certain conversion requests to be queued, even if they are
1197 * compatible with the granted modes of other locks on the same
1198 * resource. Thus, the use of this flag results in conversion requests
1199 * being ordered on a "first come first servce" basis.
1200 *
1201 * DCT: This condition is all about new conversions being able to occur
1202 * "in place" while the lock remains on the granted queue (assuming
1203 * nothing else conflicts.) IOW if QUECVT isn't set, a conversion
1204 * doesn't _have_ to go onto the convert queue where it's processed in
1205 * order. The "now" variable is necessary to distinguish converts
1206 * being received and processed for the first time now, because once a
1207 * convert is moved to the conversion queue the condition below applies
1208 * requiring fifo granting.
1209 */
1210
1211 if (now && conv && !(lkb->lkb_exflags & DLM_LKF_QUECVT))
1212 return 1;
1213
1214 /*
1215 * The NOORDER flag is set to avoid the standard vms rules on grant
1216 * order.
1217 */
1218
1219 if (lkb->lkb_exflags & DLM_LKF_NOORDER)
1220 return 1;
1221
1222 /*
1223 * 6-3: Once in that queue [CONVERTING], a conversion request cannot be
1224 * granted until all other conversion requests ahead of it are granted
1225 * and/or canceled.
1226 */
1227
1228 if (!now && conv && first_in_list(lkb, &r->res_convertqueue))
1229 return 1;
1230
1231 /*
1232 * 6-4: By default, a new request is immediately granted only if all
1233 * three of the following conditions are satisfied when the request is
1234 * issued:
1235 * - The queue of ungranted conversion requests for the resource is
1236 * empty.
1237 * - The queue of ungranted new requests for the resource is empty.
1238 * - The mode of the new request is compatible with the most
1239 * restrictive mode of all granted locks on the resource.
1240 */
1241
1242 if (now && !conv && list_empty(&r->res_convertqueue) &&
1243 list_empty(&r->res_waitqueue))
1244 return 1;
1245
1246 /*
1247 * 6-4: Once a lock request is in the queue of ungranted new requests,
1248 * it cannot be granted until the queue of ungranted conversion
1249 * requests is empty, all ungranted new requests ahead of it are
1250 * granted and/or canceled, and it is compatible with the granted mode
1251 * of the most restrictive lock granted on the resource.
1252 */
1253
1254 if (!now && !conv && list_empty(&r->res_convertqueue) &&
1255 first_in_list(lkb, &r->res_waitqueue))
1256 return 1;
1257
1258 out:
1259 /*
1260 * The following, enabled by CONVDEADLK, departs from VMS.
1261 */
1262
1263 if (conv && (lkb->lkb_exflags & DLM_LKF_CONVDEADLK) &&
1264 conversion_deadlock_detect(r, lkb)) {
1265 lkb->lkb_grmode = DLM_LOCK_NL;
1266 lkb->lkb_sbflags |= DLM_SBF_DEMOTED;
1267 }
1268
1269 return 0;
1270}
1271
1272/*
1273 * The ALTPR and ALTCW flags aren't traditional lock manager flags, but are a
1274 * simple way to provide a big optimization to applications that can use them.
1275 */
1276
1277static int can_be_granted(struct dlm_rsb *r, struct dlm_lkb *lkb, int now)
1278{
1279 uint32_t flags = lkb->lkb_exflags;
1280 int rv;
1281 int8_t alt = 0, rqmode = lkb->lkb_rqmode;
1282
1283 rv = _can_be_granted(r, lkb, now);
1284 if (rv)
1285 goto out;
1286
1287 if (lkb->lkb_sbflags & DLM_SBF_DEMOTED)
1288 goto out;
1289
1290 if (rqmode != DLM_LOCK_PR && flags & DLM_LKF_ALTPR)
1291 alt = DLM_LOCK_PR;
1292 else if (rqmode != DLM_LOCK_CW && flags & DLM_LKF_ALTCW)
1293 alt = DLM_LOCK_CW;
1294
1295 if (alt) {
1296 lkb->lkb_rqmode = alt;
1297 rv = _can_be_granted(r, lkb, now);
1298 if (rv)
1299 lkb->lkb_sbflags |= DLM_SBF_ALTMODE;
1300 else
1301 lkb->lkb_rqmode = rqmode;
1302 }
1303 out:
1304 return rv;
1305}
1306
1307static int grant_pending_convert(struct dlm_rsb *r, int high)
1308{
1309 struct dlm_lkb *lkb, *s;
1310 int hi, demoted, quit, grant_restart, demote_restart;
1311
1312 quit = 0;
1313 restart:
1314 grant_restart = 0;
1315 demote_restart = 0;
1316 hi = DLM_LOCK_IV;
1317
1318 list_for_each_entry_safe(lkb, s, &r->res_convertqueue, lkb_statequeue) {
1319 demoted = is_demoted(lkb);
1320 if (can_be_granted(r, lkb, 0)) {
1321 grant_lock_pending(r, lkb);
1322 grant_restart = 1;
1323 } else {
1324 hi = max_t(int, lkb->lkb_rqmode, hi);
1325 if (!demoted && is_demoted(lkb))
1326 demote_restart = 1;
1327 }
1328 }
1329
1330 if (grant_restart)
1331 goto restart;
1332 if (demote_restart && !quit) {
1333 quit = 1;
1334 goto restart;
1335 }
1336
1337 return max_t(int, high, hi);
1338}
1339
1340static int grant_pending_wait(struct dlm_rsb *r, int high)
1341{
1342 struct dlm_lkb *lkb, *s;
1343
1344 list_for_each_entry_safe(lkb, s, &r->res_waitqueue, lkb_statequeue) {
1345 if (can_be_granted(r, lkb, 0))
1346 grant_lock_pending(r, lkb);
1347 else
1348 high = max_t(int, lkb->lkb_rqmode, high);
1349 }
1350
1351 return high;
1352}
1353
1354static void grant_pending_locks(struct dlm_rsb *r)
1355{
1356 struct dlm_lkb *lkb, *s;
1357 int high = DLM_LOCK_IV;
1358
1359 DLM_ASSERT(is_master(r), dlm_dump_rsb(r););
1360
1361 high = grant_pending_convert(r, high);
1362 high = grant_pending_wait(r, high);
1363
1364 if (high == DLM_LOCK_IV)
1365 return;
1366
1367 /*
1368 * If there are locks left on the wait/convert queue then send blocking
1369 * ASTs to granted locks based on the largest requested mode (high)
1370 * found above. FIXME: highbast < high comparison not valid for PR/CW.
1371 */
1372
1373 list_for_each_entry_safe(lkb, s, &r->res_grantqueue, lkb_statequeue) {
1374 if (lkb->lkb_bastaddr && (lkb->lkb_highbast < high) &&
1375 !__dlm_compat_matrix[lkb->lkb_grmode+1][high+1]) {
1376 queue_bast(r, lkb, high);
1377 lkb->lkb_highbast = high;
1378 }
1379 }
1380}
1381
1382static void send_bast_queue(struct dlm_rsb *r, struct list_head *head,
1383 struct dlm_lkb *lkb)
1384{
1385 struct dlm_lkb *gr;
1386
1387 list_for_each_entry(gr, head, lkb_statequeue) {
1388 if (gr->lkb_bastaddr &&
1389 gr->lkb_highbast < lkb->lkb_rqmode &&
1390 !modes_compat(gr, lkb)) {
1391 queue_bast(r, gr, lkb->lkb_rqmode);
1392 gr->lkb_highbast = lkb->lkb_rqmode;
1393 }
1394 }
1395}
1396
1397static void send_blocking_asts(struct dlm_rsb *r, struct dlm_lkb *lkb)
1398{
1399 send_bast_queue(r, &r->res_grantqueue, lkb);
1400}
1401
1402static void send_blocking_asts_all(struct dlm_rsb *r, struct dlm_lkb *lkb)
1403{
1404 send_bast_queue(r, &r->res_grantqueue, lkb);
1405 send_bast_queue(r, &r->res_convertqueue, lkb);
1406}
1407
1408/* set_master(r, lkb) -- set the master nodeid of a resource
1409
1410 The purpose of this function is to set the nodeid field in the given
1411 lkb using the nodeid field in the given rsb. If the rsb's nodeid is
1412 known, it can just be copied to the lkb and the function will return
1413 0. If the rsb's nodeid is _not_ known, it needs to be looked up
1414 before it can be copied to the lkb.
1415
1416 When the rsb nodeid is being looked up remotely, the initial lkb
1417 causing the lookup is kept on the ls_waiters list waiting for the
1418 lookup reply. Other lkb's waiting for the same rsb lookup are kept
1419 on the rsb's res_lookup list until the master is verified.
1420
1421 Return values:
1422 0: nodeid is set in rsb/lkb and the caller should go ahead and use it
1423 1: the rsb master is not available and the lkb has been placed on
1424 a wait queue
1425*/
1426
1427static int set_master(struct dlm_rsb *r, struct dlm_lkb *lkb)
1428{
1429 struct dlm_ls *ls = r->res_ls;
1430 int error, dir_nodeid, ret_nodeid, our_nodeid = dlm_our_nodeid();
1431
1432 if (rsb_flag(r, RSB_MASTER_UNCERTAIN)) {
1433 rsb_clear_flag(r, RSB_MASTER_UNCERTAIN);
1434 r->res_first_lkid = lkb->lkb_id;
1435 lkb->lkb_nodeid = r->res_nodeid;
1436 return 0;
1437 }
1438
1439 if (r->res_first_lkid && r->res_first_lkid != lkb->lkb_id) {
1440 list_add_tail(&lkb->lkb_rsb_lookup, &r->res_lookup);
1441 return 1;
1442 }
1443
1444 if (r->res_nodeid == 0) {
1445 lkb->lkb_nodeid = 0;
1446 return 0;
1447 }
1448
1449 if (r->res_nodeid > 0) {
1450 lkb->lkb_nodeid = r->res_nodeid;
1451 return 0;
1452 }
1453
1454 DLM_ASSERT(r->res_nodeid == -1, dlm_dump_rsb(r););
1455
1456 dir_nodeid = dlm_dir_nodeid(r);
1457
1458 if (dir_nodeid != our_nodeid) {
1459 r->res_first_lkid = lkb->lkb_id;
1460 send_lookup(r, lkb);
1461 return 1;
1462 }
1463
1464 for (;;) {
1465 /* It's possible for dlm_scand to remove an old rsb for
1466 this same resource from the toss list, us to create
1467 a new one, look up the master locally, and find it
1468 already exists just before dlm_scand does the
1469 dir_remove() on the previous rsb. */
1470
1471 error = dlm_dir_lookup(ls, our_nodeid, r->res_name,
1472 r->res_length, &ret_nodeid);
1473 if (!error)
1474 break;
1475 log_debug(ls, "dir_lookup error %d %s", error, r->res_name);
1476 schedule();
1477 }
1478
1479 if (ret_nodeid == our_nodeid) {
1480 r->res_first_lkid = 0;
1481 r->res_nodeid = 0;
1482 lkb->lkb_nodeid = 0;
1483 } else {
1484 r->res_first_lkid = lkb->lkb_id;
1485 r->res_nodeid = ret_nodeid;
1486 lkb->lkb_nodeid = ret_nodeid;
1487 }
1488 return 0;
1489}
1490
1491static void process_lookup_list(struct dlm_rsb *r)
1492{
1493 struct dlm_lkb *lkb, *safe;
1494
1495 list_for_each_entry_safe(lkb, safe, &r->res_lookup, lkb_rsb_lookup) {
1496 list_del(&lkb->lkb_rsb_lookup);
1497 _request_lock(r, lkb);
1498 schedule();
1499 }
1500}
1501
1502/* confirm_master -- confirm (or deny) an rsb's master nodeid */
1503
1504static void confirm_master(struct dlm_rsb *r, int error)
1505{
1506 struct dlm_lkb *lkb;
1507
1508 if (!r->res_first_lkid)
1509 return;
1510
1511 switch (error) {
1512 case 0:
1513 case -EINPROGRESS:
1514 r->res_first_lkid = 0;
1515 process_lookup_list(r);
1516 break;
1517
1518 case -EAGAIN:
1519 /* the remote master didn't queue our NOQUEUE request;
1520 make a waiting lkb the first_lkid */
1521
1522 r->res_first_lkid = 0;
1523
1524 if (!list_empty(&r->res_lookup)) {
1525 lkb = list_entry(r->res_lookup.next, struct dlm_lkb,
1526 lkb_rsb_lookup);
1527 list_del(&lkb->lkb_rsb_lookup);
1528 r->res_first_lkid = lkb->lkb_id;
1529 _request_lock(r, lkb);
1530 } else
1531 r->res_nodeid = -1;
1532 break;
1533
1534 default:
1535 log_error(r->res_ls, "confirm_master unknown error %d", error);
1536 }
1537}
1538
1539static int set_lock_args(int mode, struct dlm_lksb *lksb, uint32_t flags,
1540 int namelen, uint32_t parent_lkid, void *ast,
1541 void *astarg, void *bast, struct dlm_args *args)
1542{
1543 int rv = -EINVAL;
1544
1545 /* check for invalid arg usage */
1546
1547 if (mode < 0 || mode > DLM_LOCK_EX)
1548 goto out;
1549
1550 if (!(flags & DLM_LKF_CONVERT) && (namelen > DLM_RESNAME_MAXLEN))
1551 goto out;
1552
1553 if (flags & DLM_LKF_CANCEL)
1554 goto out;
1555
1556 if (flags & DLM_LKF_QUECVT && !(flags & DLM_LKF_CONVERT))
1557 goto out;
1558
1559 if (flags & DLM_LKF_CONVDEADLK && !(flags & DLM_LKF_CONVERT))
1560 goto out;
1561
1562 if (flags & DLM_LKF_CONVDEADLK && flags & DLM_LKF_NOQUEUE)
1563 goto out;
1564
1565 if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_CONVERT)
1566 goto out;
1567
1568 if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_QUECVT)
1569 goto out;
1570
1571 if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_NOQUEUE)
1572 goto out;
1573
1574 if (flags & DLM_LKF_EXPEDITE && mode != DLM_LOCK_NL)
1575 goto out;
1576
1577 if (!ast || !lksb)
1578 goto out;
1579
1580 if (flags & DLM_LKF_VALBLK && !lksb->sb_lvbptr)
1581 goto out;
1582
1583 /* parent/child locks not yet supported */
1584 if (parent_lkid)
1585 goto out;
1586
1587 if (flags & DLM_LKF_CONVERT && !lksb->sb_lkid)
1588 goto out;
1589
1590 /* these args will be copied to the lkb in validate_lock_args,
1591 it cannot be done now because when converting locks, fields in
1592 an active lkb cannot be modified before locking the rsb */
1593
1594 args->flags = flags;
1595 args->astaddr = ast;
1596 args->astparam = (long) astarg;
1597 args->bastaddr = bast;
1598 args->mode = mode;
1599 args->lksb = lksb;
1600 rv = 0;
1601 out:
1602 return rv;
1603}
1604
1605static int set_unlock_args(uint32_t flags, void *astarg, struct dlm_args *args)
1606{
1607 if (flags & ~(DLM_LKF_CANCEL | DLM_LKF_VALBLK | DLM_LKF_IVVALBLK |
1608 DLM_LKF_FORCEUNLOCK))
1609 return -EINVAL;
1610
1611 args->flags = flags;
1612 args->astparam = (long) astarg;
1613 return 0;
1614}
1615
1616static int validate_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
1617 struct dlm_args *args)
1618{
1619 int rv = -EINVAL;
1620
1621 if (args->flags & DLM_LKF_CONVERT) {
1622 if (lkb->lkb_flags & DLM_IFL_MSTCPY)
1623 goto out;
1624
1625 if (args->flags & DLM_LKF_QUECVT &&
1626 !__quecvt_compat_matrix[lkb->lkb_grmode+1][args->mode+1])
1627 goto out;
1628
1629 rv = -EBUSY;
1630 if (lkb->lkb_status != DLM_LKSTS_GRANTED)
1631 goto out;
1632
1633 if (lkb->lkb_wait_type)
1634 goto out;
1635 }
1636
1637 lkb->lkb_exflags = args->flags;
1638 lkb->lkb_sbflags = 0;
1639 lkb->lkb_astaddr = args->astaddr;
1640 lkb->lkb_astparam = args->astparam;
1641 lkb->lkb_bastaddr = args->bastaddr;
1642 lkb->lkb_rqmode = args->mode;
1643 lkb->lkb_lksb = args->lksb;
1644 lkb->lkb_lvbptr = args->lksb->sb_lvbptr;
1645 lkb->lkb_ownpid = (int) current->pid;
1646 rv = 0;
1647 out:
1648 return rv;
1649}
1650
1651static int validate_unlock_args(struct dlm_lkb *lkb, struct dlm_args *args)
1652{
1653 int rv = -EINVAL;
1654
1655 if (lkb->lkb_flags & DLM_IFL_MSTCPY)
1656 goto out;
1657
1658 if (args->flags & DLM_LKF_FORCEUNLOCK)
1659 goto out_ok;
1660
1661 if (args->flags & DLM_LKF_CANCEL &&
1662 lkb->lkb_status == DLM_LKSTS_GRANTED)
1663 goto out;
1664
1665 if (!(args->flags & DLM_LKF_CANCEL) &&
1666 lkb->lkb_status != DLM_LKSTS_GRANTED)
1667 goto out;
1668
1669 rv = -EBUSY;
1670 if (lkb->lkb_wait_type)
1671 goto out;
1672
1673 out_ok:
1674 lkb->lkb_exflags = args->flags;
1675 lkb->lkb_sbflags = 0;
1676 lkb->lkb_astparam = args->astparam;
1677
1678 rv = 0;
1679 out:
1680 return rv;
1681}
1682
1683/*
1684 * Four stage 4 varieties:
1685 * do_request(), do_convert(), do_unlock(), do_cancel()
1686 * These are called on the master node for the given lock and
1687 * from the central locking logic.
1688 */
1689
1690static int do_request(struct dlm_rsb *r, struct dlm_lkb *lkb)
1691{
1692 int error = 0;
1693
1694 if (can_be_granted(r, lkb, 1)) {
1695 grant_lock(r, lkb);
1696 queue_cast(r, lkb, 0);
1697 goto out;
1698 }
1699
1700 if (can_be_queued(lkb)) {
1701 error = -EINPROGRESS;
1702 add_lkb(r, lkb, DLM_LKSTS_WAITING);
1703 send_blocking_asts(r, lkb);
1704 goto out;
1705 }
1706
1707 error = -EAGAIN;
1708 if (force_blocking_asts(lkb))
1709 send_blocking_asts_all(r, lkb);
1710 queue_cast(r, lkb, -EAGAIN);
1711
1712 out:
1713 return error;
1714}
1715
1716static int do_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
1717{
1718 int error = 0;
1719
1720 /* changing an existing lock may allow others to be granted */
1721
1722 if (can_be_granted(r, lkb, 1)) {
1723 grant_lock(r, lkb);
1724 queue_cast(r, lkb, 0);
1725 grant_pending_locks(r);
1726 goto out;
1727 }
1728
1729 if (can_be_queued(lkb)) {
1730 if (is_demoted(lkb))
1731 grant_pending_locks(r);
1732 error = -EINPROGRESS;
1733 del_lkb(r, lkb);
1734 add_lkb(r, lkb, DLM_LKSTS_CONVERT);
1735 send_blocking_asts(r, lkb);
1736 goto out;
1737 }
1738
1739 error = -EAGAIN;
1740 if (force_blocking_asts(lkb))
1741 send_blocking_asts_all(r, lkb);
1742 queue_cast(r, lkb, -EAGAIN);
1743
1744 out:
1745 return error;
1746}
1747
1748static int do_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb)
1749{
1750 remove_lock(r, lkb);
1751 queue_cast(r, lkb, -DLM_EUNLOCK);
1752 grant_pending_locks(r);
1753 return -DLM_EUNLOCK;
1754}
1755
1756/* FIXME: if revert_lock() finds that the lkb is granted, we should
1757 skip the queue_cast(ECANCEL). It indicates that the request/convert
1758 completed (and queued a normal ast) just before the cancel; we don't
1759 want to clobber the sb_result for the normal ast with ECANCEL. */
1760
1761static int do_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb)
1762{
1763 revert_lock(r, lkb);
1764 queue_cast(r, lkb, -DLM_ECANCEL);
1765 grant_pending_locks(r);
1766 return -DLM_ECANCEL;
1767}
1768
1769/*
1770 * Four stage 3 varieties:
1771 * _request_lock(), _convert_lock(), _unlock_lock(), _cancel_lock()
1772 */
1773
1774/* add a new lkb to a possibly new rsb, called by requesting process */
1775
1776static int _request_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
1777{
1778 int error;
1779
1780 /* set_master: sets lkb nodeid from r */
1781
1782 error = set_master(r, lkb);
1783 if (error < 0)
1784 goto out;
1785 if (error) {
1786 error = 0;
1787 goto out;
1788 }
1789
1790 if (is_remote(r))
1791 /* receive_request() calls do_request() on remote node */
1792 error = send_request(r, lkb);
1793 else
1794 error = do_request(r, lkb);
1795 out:
1796 return error;
1797}
1798
1799/* change some property of an existing lkb, e.g. mode */
1800
1801static int _convert_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
1802{
1803 int error;
1804
1805 if (is_remote(r))
1806 /* receive_convert() calls do_convert() on remote node */
1807 error = send_convert(r, lkb);
1808 else
1809 error = do_convert(r, lkb);
1810
1811 return error;
1812}
1813
1814/* remove an existing lkb from the granted queue */
1815
1816static int _unlock_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
1817{
1818 int error;
1819
1820 if (is_remote(r))
1821 /* receive_unlock() calls do_unlock() on remote node */
1822 error = send_unlock(r, lkb);
1823 else
1824 error = do_unlock(r, lkb);
1825
1826 return error;
1827}
1828
1829/* remove an existing lkb from the convert or wait queue */
1830
1831static int _cancel_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
1832{
1833 int error;
1834
1835 if (is_remote(r))
1836 /* receive_cancel() calls do_cancel() on remote node */
1837 error = send_cancel(r, lkb);
1838 else
1839 error = do_cancel(r, lkb);
1840
1841 return error;
1842}
1843
1844/*
1845 * Four stage 2 varieties:
1846 * request_lock(), convert_lock(), unlock_lock(), cancel_lock()
1847 */
1848
1849static int request_lock(struct dlm_ls *ls, struct dlm_lkb *lkb, char *name,
1850 int len, struct dlm_args *args)
1851{
1852 struct dlm_rsb *r;
1853 int error;
1854
1855 error = validate_lock_args(ls, lkb, args);
1856 if (error)
1857 goto out;
1858
1859 error = find_rsb(ls, name, len, R_CREATE, &r);
1860 if (error)
1861 goto out;
1862
1863 lock_rsb(r);
1864
1865 attach_lkb(r, lkb);
1866 lkb->lkb_lksb->sb_lkid = lkb->lkb_id;
1867
1868 error = _request_lock(r, lkb);
1869
1870 unlock_rsb(r);
1871 put_rsb(r);
1872
1873 out:
1874 return error;
1875}
1876
1877static int convert_lock(struct dlm_ls *ls, struct dlm_lkb *lkb,
1878 struct dlm_args *args)
1879{
1880 struct dlm_rsb *r;
1881 int error;
1882
1883 r = lkb->lkb_resource;
1884
1885 hold_rsb(r);
1886 lock_rsb(r);
1887
1888 error = validate_lock_args(ls, lkb, args);
1889 if (error)
1890 goto out;
1891
1892 error = _convert_lock(r, lkb);
1893 out:
1894 unlock_rsb(r);
1895 put_rsb(r);
1896 return error;
1897}
1898
1899static int unlock_lock(struct dlm_ls *ls, struct dlm_lkb *lkb,
1900 struct dlm_args *args)
1901{
1902 struct dlm_rsb *r;
1903 int error;
1904
1905 r = lkb->lkb_resource;
1906
1907 hold_rsb(r);
1908 lock_rsb(r);
1909
1910 error = validate_unlock_args(lkb, args);
1911 if (error)
1912 goto out;
1913
1914 error = _unlock_lock(r, lkb);
1915 out:
1916 unlock_rsb(r);
1917 put_rsb(r);
1918 return error;
1919}
1920
1921static int cancel_lock(struct dlm_ls *ls, struct dlm_lkb *lkb,
1922 struct dlm_args *args)
1923{
1924 struct dlm_rsb *r;
1925 int error;
1926
1927 r = lkb->lkb_resource;
1928
1929 hold_rsb(r);
1930 lock_rsb(r);
1931
1932 error = validate_unlock_args(lkb, args);
1933 if (error)
1934 goto out;
1935
1936 error = _cancel_lock(r, lkb);
1937 out:
1938 unlock_rsb(r);
1939 put_rsb(r);
1940 return error;
1941}
1942
1943/*
1944 * Two stage 1 varieties: dlm_lock() and dlm_unlock()
1945 */
1946
1947int dlm_lock(dlm_lockspace_t *lockspace,
1948 int mode,
1949 struct dlm_lksb *lksb,
1950 uint32_t flags,
1951 void *name,
1952 unsigned int namelen,
1953 uint32_t parent_lkid,
1954 void (*ast) (void *astarg),
1955 void *astarg,
1956 void (*bast) (void *astarg, int mode))
1957{
1958 struct dlm_ls *ls;
1959 struct dlm_lkb *lkb;
1960 struct dlm_args args;
1961 int error, convert = flags & DLM_LKF_CONVERT;
1962
1963 ls = dlm_find_lockspace_local(lockspace);
1964 if (!ls)
1965 return -EINVAL;
1966
1967 lock_recovery(ls);
1968
1969 if (convert)
1970 error = find_lkb(ls, lksb->sb_lkid, &lkb);
1971 else
1972 error = create_lkb(ls, &lkb);
1973
1974 if (error)
1975 goto out;
1976
1977 error = set_lock_args(mode, lksb, flags, namelen, parent_lkid, ast,
1978 astarg, bast, &args);
1979 if (error)
1980 goto out_put;
1981
1982 if (convert)
1983 error = convert_lock(ls, lkb, &args);
1984 else
1985 error = request_lock(ls, lkb, name, namelen, &args);
1986
1987 if (error == -EINPROGRESS)
1988 error = 0;
1989 out_put:
1990 if (convert || error)
1991 __put_lkb(ls, lkb);
1992 if (error == -EAGAIN)
1993 error = 0;
1994 out:
1995 unlock_recovery(ls);
1996 dlm_put_lockspace(ls);
1997 return error;
1998}
1999
2000int dlm_unlock(dlm_lockspace_t *lockspace,
2001 uint32_t lkid,
2002 uint32_t flags,
2003 struct dlm_lksb *lksb,
2004 void *astarg)
2005{
2006 struct dlm_ls *ls;
2007 struct dlm_lkb *lkb;
2008 struct dlm_args args;
2009 int error;
2010
2011 ls = dlm_find_lockspace_local(lockspace);
2012 if (!ls)
2013 return -EINVAL;
2014
2015 lock_recovery(ls);
2016
2017 error = find_lkb(ls, lkid, &lkb);
2018 if (error)
2019 goto out;
2020
2021 error = set_unlock_args(flags, astarg, &args);
2022 if (error)
2023 goto out_put;
2024
2025 if (flags & DLM_LKF_CANCEL)
2026 error = cancel_lock(ls, lkb, &args);
2027 else
2028 error = unlock_lock(ls, lkb, &args);
2029
2030 if (error == -DLM_EUNLOCK || error == -DLM_ECANCEL)
2031 error = 0;
2032 out_put:
2033 dlm_put_lkb(lkb);
2034 out:
2035 unlock_recovery(ls);
2036 dlm_put_lockspace(ls);
2037 return error;
2038}
2039
2040/*
2041 * send/receive routines for remote operations and replies
2042 *
2043 * send_args
2044 * send_common
2045 * send_request receive_request
2046 * send_convert receive_convert
2047 * send_unlock receive_unlock
2048 * send_cancel receive_cancel
2049 * send_grant receive_grant
2050 * send_bast receive_bast
2051 * send_lookup receive_lookup
2052 * send_remove receive_remove
2053 *
2054 * send_common_reply
2055 * receive_request_reply send_request_reply
2056 * receive_convert_reply send_convert_reply
2057 * receive_unlock_reply send_unlock_reply
2058 * receive_cancel_reply send_cancel_reply
2059 * receive_lookup_reply send_lookup_reply
2060 */
2061
2062static int create_message(struct dlm_rsb *r, struct dlm_lkb *lkb,
2063 int to_nodeid, int mstype,
2064 struct dlm_message **ms_ret,
2065 struct dlm_mhandle **mh_ret)
2066{
2067 struct dlm_message *ms;
2068 struct dlm_mhandle *mh;
2069 char *mb;
2070 int mb_len = sizeof(struct dlm_message);
2071
2072 switch (mstype) {
2073 case DLM_MSG_REQUEST:
2074 case DLM_MSG_LOOKUP:
2075 case DLM_MSG_REMOVE:
2076 mb_len += r->res_length;
2077 break;
2078 case DLM_MSG_CONVERT:
2079 case DLM_MSG_UNLOCK:
2080 case DLM_MSG_REQUEST_REPLY:
2081 case DLM_MSG_CONVERT_REPLY:
2082 case DLM_MSG_GRANT:
2083 if (lkb && lkb->lkb_lvbptr)
2084 mb_len += r->res_ls->ls_lvblen;
2085 break;
2086 }
2087
2088 /* get_buffer gives us a message handle (mh) that we need to
2089 pass into lowcomms_commit and a message buffer (mb) that we
2090 write our data into */
2091
2092 mh = dlm_lowcomms_get_buffer(to_nodeid, mb_len, GFP_KERNEL, &mb);
2093 if (!mh)
2094 return -ENOBUFS;
2095
2096 memset(mb, 0, mb_len);
2097
2098 ms = (struct dlm_message *) mb;
2099
2100 ms->m_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
2101 ms->m_header.h_lockspace = r->res_ls->ls_global_id;
2102 ms->m_header.h_nodeid = dlm_our_nodeid();
2103 ms->m_header.h_length = mb_len;
2104 ms->m_header.h_cmd = DLM_MSG;
2105
2106 ms->m_type = mstype;
2107
2108 *mh_ret = mh;
2109 *ms_ret = ms;
2110 return 0;
2111}
2112
2113/* further lowcomms enhancements or alternate implementations may make
2114 the return value from this function useful at some point */
2115
2116static int send_message(struct dlm_mhandle *mh, struct dlm_message *ms)
2117{
2118 dlm_message_out(ms);
2119 dlm_lowcomms_commit_buffer(mh);
2120 return 0;
2121}
2122
2123static void send_args(struct dlm_rsb *r, struct dlm_lkb *lkb,
2124 struct dlm_message *ms)
2125{
2126 ms->m_nodeid = lkb->lkb_nodeid;
2127 ms->m_pid = lkb->lkb_ownpid;
2128 ms->m_lkid = lkb->lkb_id;
2129 ms->m_remid = lkb->lkb_remid;
2130 ms->m_exflags = lkb->lkb_exflags;
2131 ms->m_sbflags = lkb->lkb_sbflags;
2132 ms->m_flags = lkb->lkb_flags;
2133 ms->m_lvbseq = lkb->lkb_lvbseq;
2134 ms->m_status = lkb->lkb_status;
2135 ms->m_grmode = lkb->lkb_grmode;
2136 ms->m_rqmode = lkb->lkb_rqmode;
2137 ms->m_hash = r->res_hash;
2138
2139 /* m_result and m_bastmode are set from function args,
2140 not from lkb fields */
2141
2142 if (lkb->lkb_bastaddr)
2143 ms->m_asts |= AST_BAST;
2144 if (lkb->lkb_astaddr)
2145 ms->m_asts |= AST_COMP;
2146
2147 if (ms->m_type == DLM_MSG_REQUEST || ms->m_type == DLM_MSG_LOOKUP)
2148 memcpy(ms->m_extra, r->res_name, r->res_length);
2149
2150 else if (lkb->lkb_lvbptr)
2151 memcpy(ms->m_extra, lkb->lkb_lvbptr, r->res_ls->ls_lvblen);
2152
2153}
2154
2155static int send_common(struct dlm_rsb *r, struct dlm_lkb *lkb, int mstype)
2156{
2157 struct dlm_message *ms;
2158 struct dlm_mhandle *mh;
2159 int to_nodeid, error;
2160
2161 add_to_waiters(lkb, mstype);
2162
2163 to_nodeid = r->res_nodeid;
2164
2165 error = create_message(r, lkb, to_nodeid, mstype, &ms, &mh);
2166 if (error)
2167 goto fail;
2168
2169 send_args(r, lkb, ms);
2170
2171 error = send_message(mh, ms);
2172 if (error)
2173 goto fail;
2174 return 0;
2175
2176 fail:
2177 remove_from_waiters(lkb);
2178 return error;
2179}
2180
2181static int send_request(struct dlm_rsb *r, struct dlm_lkb *lkb)
2182{
2183 return send_common(r, lkb, DLM_MSG_REQUEST);
2184}
2185
2186static int send_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
2187{
2188 int error;
2189
2190 error = send_common(r, lkb, DLM_MSG_CONVERT);
2191
2192 /* down conversions go without a reply from the master */
2193 if (!error && down_conversion(lkb)) {
2194 remove_from_waiters(lkb);
2195 r->res_ls->ls_stub_ms.m_result = 0;
2196 r->res_ls->ls_stub_ms.m_flags = lkb->lkb_flags;
2197 __receive_convert_reply(r, lkb, &r->res_ls->ls_stub_ms);
2198 }
2199
2200 return error;
2201}
2202
2203/* FIXME: if this lkb is the only lock we hold on the rsb, then set
2204 MASTER_UNCERTAIN to force the next request on the rsb to confirm
2205 that the master is still correct. */
2206
2207static int send_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb)
2208{
2209 return send_common(r, lkb, DLM_MSG_UNLOCK);
2210}
2211
2212static int send_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb)
2213{
2214 return send_common(r, lkb, DLM_MSG_CANCEL);
2215}
2216
2217static int send_grant(struct dlm_rsb *r, struct dlm_lkb *lkb)
2218{
2219 struct dlm_message *ms;
2220 struct dlm_mhandle *mh;
2221 int to_nodeid, error;
2222
2223 to_nodeid = lkb->lkb_nodeid;
2224
2225 error = create_message(r, lkb, to_nodeid, DLM_MSG_GRANT, &ms, &mh);
2226 if (error)
2227 goto out;
2228
2229 send_args(r, lkb, ms);
2230
2231 ms->m_result = 0;
2232
2233 error = send_message(mh, ms);
2234 out:
2235 return error;
2236}
2237
2238static int send_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int mode)
2239{
2240 struct dlm_message *ms;
2241 struct dlm_mhandle *mh;
2242 int to_nodeid, error;
2243
2244 to_nodeid = lkb->lkb_nodeid;
2245
2246 error = create_message(r, NULL, to_nodeid, DLM_MSG_BAST, &ms, &mh);
2247 if (error)
2248 goto out;
2249
2250 send_args(r, lkb, ms);
2251
2252 ms->m_bastmode = mode;
2253
2254 error = send_message(mh, ms);
2255 out:
2256 return error;
2257}
2258
2259static int send_lookup(struct dlm_rsb *r, struct dlm_lkb *lkb)
2260{
2261 struct dlm_message *ms;
2262 struct dlm_mhandle *mh;
2263 int to_nodeid, error;
2264
2265 add_to_waiters(lkb, DLM_MSG_LOOKUP);
2266
2267 to_nodeid = dlm_dir_nodeid(r);
2268
2269 error = create_message(r, NULL, to_nodeid, DLM_MSG_LOOKUP, &ms, &mh);
2270 if (error)
2271 goto fail;
2272
2273 send_args(r, lkb, ms);
2274
2275 error = send_message(mh, ms);
2276 if (error)
2277 goto fail;
2278 return 0;
2279
2280 fail:
2281 remove_from_waiters(lkb);
2282 return error;
2283}
2284
2285static int send_remove(struct dlm_rsb *r)
2286{
2287 struct dlm_message *ms;
2288 struct dlm_mhandle *mh;
2289 int to_nodeid, error;
2290
2291 to_nodeid = dlm_dir_nodeid(r);
2292
2293 error = create_message(r, NULL, to_nodeid, DLM_MSG_REMOVE, &ms, &mh);
2294 if (error)
2295 goto out;
2296
2297 memcpy(ms->m_extra, r->res_name, r->res_length);
2298 ms->m_hash = r->res_hash;
2299
2300 error = send_message(mh, ms);
2301 out:
2302 return error;
2303}
2304
2305static int send_common_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
2306 int mstype, int rv)
2307{
2308 struct dlm_message *ms;
2309 struct dlm_mhandle *mh;
2310 int to_nodeid, error;
2311
2312 to_nodeid = lkb->lkb_nodeid;
2313
2314 error = create_message(r, lkb, to_nodeid, mstype, &ms, &mh);
2315 if (error)
2316 goto out;
2317
2318 send_args(r, lkb, ms);
2319
2320 ms->m_result = rv;
2321
2322 error = send_message(mh, ms);
2323 out:
2324 return error;
2325}
2326
2327static int send_request_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
2328{
2329 return send_common_reply(r, lkb, DLM_MSG_REQUEST_REPLY, rv);
2330}
2331
2332static int send_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
2333{
2334 return send_common_reply(r, lkb, DLM_MSG_CONVERT_REPLY, rv);
2335}
2336
2337static int send_unlock_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
2338{
2339 return send_common_reply(r, lkb, DLM_MSG_UNLOCK_REPLY, rv);
2340}
2341
2342static int send_cancel_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
2343{
2344 return send_common_reply(r, lkb, DLM_MSG_CANCEL_REPLY, rv);
2345}
2346
2347static int send_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms_in,
2348 int ret_nodeid, int rv)
2349{
2350 struct dlm_rsb *r = &ls->ls_stub_rsb;
2351 struct dlm_message *ms;
2352 struct dlm_mhandle *mh;
2353 int error, nodeid = ms_in->m_header.h_nodeid;
2354
2355 error = create_message(r, NULL, nodeid, DLM_MSG_LOOKUP_REPLY, &ms, &mh);
2356 if (error)
2357 goto out;
2358
2359 ms->m_lkid = ms_in->m_lkid;
2360 ms->m_result = rv;
2361 ms->m_nodeid = ret_nodeid;
2362
2363 error = send_message(mh, ms);
2364 out:
2365 return error;
2366}
2367
2368/* which args we save from a received message depends heavily on the type
2369 of message, unlike the send side where we can safely send everything about
2370 the lkb for any type of message */
2371
2372static void receive_flags(struct dlm_lkb *lkb, struct dlm_message *ms)
2373{
2374 lkb->lkb_exflags = ms->m_exflags;
2375 lkb->lkb_flags = (lkb->lkb_flags & 0xFFFF0000) |
2376 (ms->m_flags & 0x0000FFFF);
2377}
2378
2379static void receive_flags_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
2380{
2381 lkb->lkb_sbflags = ms->m_sbflags;
2382 lkb->lkb_flags = (lkb->lkb_flags & 0xFFFF0000) |
2383 (ms->m_flags & 0x0000FFFF);
2384}
2385
2386static int receive_extralen(struct dlm_message *ms)
2387{
2388 return (ms->m_header.h_length - sizeof(struct dlm_message));
2389}
2390
2391static int receive_lvb(struct dlm_ls *ls, struct dlm_lkb *lkb,
2392 struct dlm_message *ms)
2393{
2394 int len;
2395
2396 if (lkb->lkb_exflags & DLM_LKF_VALBLK) {
2397 if (!lkb->lkb_lvbptr)
2398 lkb->lkb_lvbptr = allocate_lvb(ls);
2399 if (!lkb->lkb_lvbptr)
2400 return -ENOMEM;
2401 len = receive_extralen(ms);
2402 memcpy(lkb->lkb_lvbptr, ms->m_extra, len);
2403 }
2404 return 0;
2405}
2406
2407static int receive_request_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
2408 struct dlm_message *ms)
2409{
2410 lkb->lkb_nodeid = ms->m_header.h_nodeid;
2411 lkb->lkb_ownpid = ms->m_pid;
2412 lkb->lkb_remid = ms->m_lkid;
2413 lkb->lkb_grmode = DLM_LOCK_IV;
2414 lkb->lkb_rqmode = ms->m_rqmode;
2415 lkb->lkb_bastaddr = (void *) (long) (ms->m_asts & AST_BAST);
2416 lkb->lkb_astaddr = (void *) (long) (ms->m_asts & AST_COMP);
2417
2418 DLM_ASSERT(is_master_copy(lkb), dlm_print_lkb(lkb););
2419
2420 if (receive_lvb(ls, lkb, ms))
2421 return -ENOMEM;
2422
2423 return 0;
2424}
2425
2426static int receive_convert_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
2427 struct dlm_message *ms)
2428{
2429 if (lkb->lkb_nodeid != ms->m_header.h_nodeid) {
2430 log_error(ls, "convert_args nodeid %d %d lkid %x %x",
2431 lkb->lkb_nodeid, ms->m_header.h_nodeid,
2432 lkb->lkb_id, lkb->lkb_remid);
2433 return -EINVAL;
2434 }
2435
2436 if (!is_master_copy(lkb))
2437 return -EINVAL;
2438
2439 if (lkb->lkb_status != DLM_LKSTS_GRANTED)
2440 return -EBUSY;
2441
2442 if (receive_lvb(ls, lkb, ms))
2443 return -ENOMEM;
2444
2445 lkb->lkb_rqmode = ms->m_rqmode;
2446 lkb->lkb_lvbseq = ms->m_lvbseq;
2447
2448 return 0;
2449}
2450
2451static int receive_unlock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
2452 struct dlm_message *ms)
2453{
2454 if (!is_master_copy(lkb))
2455 return -EINVAL;
2456 if (receive_lvb(ls, lkb, ms))
2457 return -ENOMEM;
2458 return 0;
2459}
2460
2461/* We fill in the stub-lkb fields with the info that send_xxxx_reply()
2462 uses to send a reply and that the remote end uses to process the reply. */
2463
2464static void setup_stub_lkb(struct dlm_ls *ls, struct dlm_message *ms)
2465{
2466 struct dlm_lkb *lkb = &ls->ls_stub_lkb;
2467 lkb->lkb_nodeid = ms->m_header.h_nodeid;
2468 lkb->lkb_remid = ms->m_lkid;
2469}
2470
2471static void receive_request(struct dlm_ls *ls, struct dlm_message *ms)
2472{
2473 struct dlm_lkb *lkb;
2474 struct dlm_rsb *r;
2475 int error, namelen;
2476
2477 error = create_lkb(ls, &lkb);
2478 if (error)
2479 goto fail;
2480
2481 receive_flags(lkb, ms);
2482 lkb->lkb_flags |= DLM_IFL_MSTCPY;
2483 error = receive_request_args(ls, lkb, ms);
2484 if (error) {
2485 __put_lkb(ls, lkb);
2486 goto fail;
2487 }
2488
2489 namelen = receive_extralen(ms);
2490
2491 error = find_rsb(ls, ms->m_extra, namelen, R_MASTER, &r);
2492 if (error) {
2493 __put_lkb(ls, lkb);
2494 goto fail;
2495 }
2496
2497 lock_rsb(r);
2498
2499 attach_lkb(r, lkb);
2500 error = do_request(r, lkb);
2501 send_request_reply(r, lkb, error);
2502
2503 unlock_rsb(r);
2504 put_rsb(r);
2505
2506 if (error == -EINPROGRESS)
2507 error = 0;
2508 if (error)
2509 dlm_put_lkb(lkb);
2510 return;
2511
2512 fail:
2513 setup_stub_lkb(ls, ms);
2514 send_request_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
2515}
2516
2517static void receive_convert(struct dlm_ls *ls, struct dlm_message *ms)
2518{
2519 struct dlm_lkb *lkb;
2520 struct dlm_rsb *r;
2521 int error, reply = 1;
2522
2523 error = find_lkb(ls, ms->m_remid, &lkb);
2524 if (error)
2525 goto fail;
2526
2527 r = lkb->lkb_resource;
2528
2529 hold_rsb(r);
2530 lock_rsb(r);
2531
2532 receive_flags(lkb, ms);
2533 error = receive_convert_args(ls, lkb, ms);
2534 if (error)
2535 goto out;
2536 reply = !down_conversion(lkb);
2537
2538 error = do_convert(r, lkb);
2539 out:
2540 if (reply)
2541 send_convert_reply(r, lkb, error);
2542
2543 unlock_rsb(r);
2544 put_rsb(r);
2545 dlm_put_lkb(lkb);
2546 return;
2547
2548 fail:
2549 setup_stub_lkb(ls, ms);
2550 send_convert_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
2551}
2552
2553static void receive_unlock(struct dlm_ls *ls, struct dlm_message *ms)
2554{
2555 struct dlm_lkb *lkb;
2556 struct dlm_rsb *r;
2557 int error;
2558
2559 error = find_lkb(ls, ms->m_remid, &lkb);
2560 if (error)
2561 goto fail;
2562
2563 r = lkb->lkb_resource;
2564
2565 hold_rsb(r);
2566 lock_rsb(r);
2567
2568 receive_flags(lkb, ms);
2569 error = receive_unlock_args(ls, lkb, ms);
2570 if (error)
2571 goto out;
2572
2573 error = do_unlock(r, lkb);
2574 out:
2575 send_unlock_reply(r, lkb, error);
2576
2577 unlock_rsb(r);
2578 put_rsb(r);
2579 dlm_put_lkb(lkb);
2580 return;
2581
2582 fail:
2583 setup_stub_lkb(ls, ms);
2584 send_unlock_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
2585}
2586
2587static void receive_cancel(struct dlm_ls *ls, struct dlm_message *ms)
2588{
2589 struct dlm_lkb *lkb;
2590 struct dlm_rsb *r;
2591 int error;
2592
2593 error = find_lkb(ls, ms->m_remid, &lkb);
2594 if (error)
2595 goto fail;
2596
2597 receive_flags(lkb, ms);
2598
2599 r = lkb->lkb_resource;
2600
2601 hold_rsb(r);
2602 lock_rsb(r);
2603
2604 error = do_cancel(r, lkb);
2605 send_cancel_reply(r, lkb, error);
2606
2607 unlock_rsb(r);
2608 put_rsb(r);
2609 dlm_put_lkb(lkb);
2610 return;
2611
2612 fail:
2613 setup_stub_lkb(ls, ms);
2614 send_cancel_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
2615}
2616
2617static void receive_grant(struct dlm_ls *ls, struct dlm_message *ms)
2618{
2619 struct dlm_lkb *lkb;
2620 struct dlm_rsb *r;
2621 int error;
2622
2623 error = find_lkb(ls, ms->m_remid, &lkb);
2624 if (error) {
2625 log_error(ls, "receive_grant no lkb");
2626 return;
2627 }
2628 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
2629
2630 r = lkb->lkb_resource;
2631
2632 hold_rsb(r);
2633 lock_rsb(r);
2634
2635 receive_flags_reply(lkb, ms);
2636 grant_lock_pc(r, lkb, ms);
2637 queue_cast(r, lkb, 0);
2638
2639 unlock_rsb(r);
2640 put_rsb(r);
2641 dlm_put_lkb(lkb);
2642}
2643
2644static void receive_bast(struct dlm_ls *ls, struct dlm_message *ms)
2645{
2646 struct dlm_lkb *lkb;
2647 struct dlm_rsb *r;
2648 int error;
2649
2650 error = find_lkb(ls, ms->m_remid, &lkb);
2651 if (error) {
2652 log_error(ls, "receive_bast no lkb");
2653 return;
2654 }
2655 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
2656
2657 r = lkb->lkb_resource;
2658
2659 hold_rsb(r);
2660 lock_rsb(r);
2661
2662 queue_bast(r, lkb, ms->m_bastmode);
2663
2664 unlock_rsb(r);
2665 put_rsb(r);
2666 dlm_put_lkb(lkb);
2667}
2668
2669static void receive_lookup(struct dlm_ls *ls, struct dlm_message *ms)
2670{
2671 int len, error, ret_nodeid, dir_nodeid, from_nodeid, our_nodeid;
2672
2673 from_nodeid = ms->m_header.h_nodeid;
2674 our_nodeid = dlm_our_nodeid();
2675
2676 len = receive_extralen(ms);
2677
2678 dir_nodeid = dlm_hash2nodeid(ls, ms->m_hash);
2679 if (dir_nodeid != our_nodeid) {
2680 log_error(ls, "lookup dir_nodeid %d from %d",
2681 dir_nodeid, from_nodeid);
2682 error = -EINVAL;
2683 ret_nodeid = -1;
2684 goto out;
2685 }
2686
2687 error = dlm_dir_lookup(ls, from_nodeid, ms->m_extra, len, &ret_nodeid);
2688
2689 /* Optimization: we're master so treat lookup as a request */
2690 if (!error && ret_nodeid == our_nodeid) {
2691 receive_request(ls, ms);
2692 return;
2693 }
2694 out:
2695 send_lookup_reply(ls, ms, ret_nodeid, error);
2696}
2697
2698static void receive_remove(struct dlm_ls *ls, struct dlm_message *ms)
2699{
2700 int len, dir_nodeid, from_nodeid;
2701
2702 from_nodeid = ms->m_header.h_nodeid;
2703
2704 len = receive_extralen(ms);
2705
2706 dir_nodeid = dlm_hash2nodeid(ls, ms->m_hash);
2707 if (dir_nodeid != dlm_our_nodeid()) {
2708 log_error(ls, "remove dir entry dir_nodeid %d from %d",
2709 dir_nodeid, from_nodeid);
2710 return;
2711 }
2712
2713 dlm_dir_remove_entry(ls, from_nodeid, ms->m_extra, len);
2714}
2715
2716static void receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms)
2717{
2718 struct dlm_lkb *lkb;
2719 struct dlm_rsb *r;
2720 int error, mstype;
2721
2722 error = find_lkb(ls, ms->m_remid, &lkb);
2723 if (error) {
2724 log_error(ls, "receive_request_reply no lkb");
2725 return;
2726 }
2727 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
2728
2729 mstype = lkb->lkb_wait_type;
2730 error = remove_from_waiters(lkb);
2731 if (error) {
2732 log_error(ls, "receive_request_reply not on waiters");
2733 goto out;
2734 }
2735
2736 /* this is the value returned from do_request() on the master */
2737 error = ms->m_result;
2738
2739 r = lkb->lkb_resource;
2740 hold_rsb(r);
2741 lock_rsb(r);
2742
2743 /* Optimization: the dir node was also the master, so it took our
2744 lookup as a request and sent request reply instead of lookup reply */
2745 if (mstype == DLM_MSG_LOOKUP) {
2746 r->res_nodeid = ms->m_header.h_nodeid;
2747 lkb->lkb_nodeid = r->res_nodeid;
2748 }
2749
2750 switch (error) {
2751 case -EAGAIN:
2752 /* request would block (be queued) on remote master;
2753 the unhold undoes the original ref from create_lkb()
2754 so it leads to the lkb being freed */
2755 queue_cast(r, lkb, -EAGAIN);
2756 confirm_master(r, -EAGAIN);
2757 unhold_lkb(lkb);
2758 break;
2759
2760 case -EINPROGRESS:
2761 case 0:
2762 /* request was queued or granted on remote master */
2763 receive_flags_reply(lkb, ms);
2764 lkb->lkb_remid = ms->m_lkid;
2765 if (error)
2766 add_lkb(r, lkb, DLM_LKSTS_WAITING);
2767 else {
2768 grant_lock_pc(r, lkb, ms);
2769 queue_cast(r, lkb, 0);
2770 }
2771 confirm_master(r, error);
2772 break;
2773
2774 case -EBADR:
2775 case -ENOTBLK:
2776 /* find_rsb failed to find rsb or rsb wasn't master */
2777 r->res_nodeid = -1;
2778 lkb->lkb_nodeid = -1;
2779 _request_lock(r, lkb);
2780 break;
2781
2782 default:
2783 log_error(ls, "receive_request_reply error %d", error);
2784 }
2785
2786 unlock_rsb(r);
2787 put_rsb(r);
2788 out:
2789 dlm_put_lkb(lkb);
2790}
2791
2792static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
2793 struct dlm_message *ms)
2794{
2795 int error = ms->m_result;
2796
2797 /* this is the value returned from do_convert() on the master */
2798
2799 switch (error) {
2800 case -EAGAIN:
2801 /* convert would block (be queued) on remote master */
2802 queue_cast(r, lkb, -EAGAIN);
2803 break;
2804
2805 case -EINPROGRESS:
2806 /* convert was queued on remote master */
2807 del_lkb(r, lkb);
2808 add_lkb(r, lkb, DLM_LKSTS_CONVERT);
2809 break;
2810
2811 case 0:
2812 /* convert was granted on remote master */
2813 receive_flags_reply(lkb, ms);
2814 grant_lock_pc(r, lkb, ms);
2815 queue_cast(r, lkb, 0);
2816 break;
2817
2818 default:
2819 log_error(r->res_ls, "receive_convert_reply error %d", error);
2820 }
2821}
2822
2823static void _receive_convert_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
2824{
2825 struct dlm_rsb *r = lkb->lkb_resource;
2826
2827 hold_rsb(r);
2828 lock_rsb(r);
2829
2830 __receive_convert_reply(r, lkb, ms);
2831
2832 unlock_rsb(r);
2833 put_rsb(r);
2834}
2835
2836static void receive_convert_reply(struct dlm_ls *ls, struct dlm_message *ms)
2837{
2838 struct dlm_lkb *lkb;
2839 int error;
2840
2841 error = find_lkb(ls, ms->m_remid, &lkb);
2842 if (error) {
2843 log_error(ls, "receive_convert_reply no lkb");
2844 return;
2845 }
2846 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
2847
2848 error = remove_from_waiters(lkb);
2849 if (error) {
2850 log_error(ls, "receive_convert_reply not on waiters");
2851 goto out;
2852 }
2853
2854 _receive_convert_reply(lkb, ms);
2855 out:
2856 dlm_put_lkb(lkb);
2857}
2858
2859static void _receive_unlock_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
2860{
2861 struct dlm_rsb *r = lkb->lkb_resource;
2862 int error = ms->m_result;
2863
2864 hold_rsb(r);
2865 lock_rsb(r);
2866
2867 /* this is the value returned from do_unlock() on the master */
2868
2869 switch (error) {
2870 case -DLM_EUNLOCK:
2871 receive_flags_reply(lkb, ms);
2872 remove_lock_pc(r, lkb);
2873 queue_cast(r, lkb, -DLM_EUNLOCK);
2874 break;
2875 default:
2876 log_error(r->res_ls, "receive_unlock_reply error %d", error);
2877 }
2878
2879 unlock_rsb(r);
2880 put_rsb(r);
2881}
2882
2883static void receive_unlock_reply(struct dlm_ls *ls, struct dlm_message *ms)
2884{
2885 struct dlm_lkb *lkb;
2886 int error;
2887
2888 error = find_lkb(ls, ms->m_remid, &lkb);
2889 if (error) {
2890 log_error(ls, "receive_unlock_reply no lkb");
2891 return;
2892 }
2893 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
2894
2895 error = remove_from_waiters(lkb);
2896 if (error) {
2897 log_error(ls, "receive_unlock_reply not on waiters");
2898 goto out;
2899 }
2900
2901 _receive_unlock_reply(lkb, ms);
2902 out:
2903 dlm_put_lkb(lkb);
2904}
2905
2906static void _receive_cancel_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
2907{
2908 struct dlm_rsb *r = lkb->lkb_resource;
2909 int error = ms->m_result;
2910
2911 hold_rsb(r);
2912 lock_rsb(r);
2913
2914 /* this is the value returned from do_cancel() on the master */
2915
2916 switch (error) {
2917 case -DLM_ECANCEL:
2918 receive_flags_reply(lkb, ms);
2919 revert_lock_pc(r, lkb);
2920 queue_cast(r, lkb, -DLM_ECANCEL);
2921 break;
2922 default:
2923 log_error(r->res_ls, "receive_cancel_reply error %d", error);
2924 }
2925
2926 unlock_rsb(r);
2927 put_rsb(r);
2928}
2929
2930static void receive_cancel_reply(struct dlm_ls *ls, struct dlm_message *ms)
2931{
2932 struct dlm_lkb *lkb;
2933 int error;
2934
2935 error = find_lkb(ls, ms->m_remid, &lkb);
2936 if (error) {
2937 log_error(ls, "receive_cancel_reply no lkb");
2938 return;
2939 }
2940 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
2941
2942 error = remove_from_waiters(lkb);
2943 if (error) {
2944 log_error(ls, "receive_cancel_reply not on waiters");
2945 goto out;
2946 }
2947
2948 _receive_cancel_reply(lkb, ms);
2949 out:
2950 dlm_put_lkb(lkb);
2951}
2952
2953static void receive_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms)
2954{
2955 struct dlm_lkb *lkb;
2956 struct dlm_rsb *r;
2957 int error, ret_nodeid;
2958
2959 error = find_lkb(ls, ms->m_lkid, &lkb);
2960 if (error) {
2961 log_error(ls, "receive_lookup_reply no lkb");
2962 return;
2963 }
2964
2965 error = remove_from_waiters(lkb);
2966 if (error) {
2967 log_error(ls, "receive_lookup_reply not on waiters");
2968 goto out;
2969 }
2970
2971 /* this is the value returned by dlm_dir_lookup on dir node
2972 FIXME: will a non-zero error ever be returned? */
2973 error = ms->m_result;
2974
2975 r = lkb->lkb_resource;
2976 hold_rsb(r);
2977 lock_rsb(r);
2978
2979 ret_nodeid = ms->m_nodeid;
2980 if (ret_nodeid == dlm_our_nodeid()) {
2981 r->res_nodeid = 0;
2982 ret_nodeid = 0;
2983 r->res_first_lkid = 0;
2984 } else {
2985 /* set_master() will copy res_nodeid to lkb_nodeid */
2986 r->res_nodeid = ret_nodeid;
2987 }
2988
2989 _request_lock(r, lkb);
2990
2991 if (!ret_nodeid)
2992 process_lookup_list(r);
2993
2994 unlock_rsb(r);
2995 put_rsb(r);
2996 out:
2997 dlm_put_lkb(lkb);
2998}
2999
3000int dlm_receive_message(struct dlm_header *hd, int nodeid, int recovery)
3001{
3002 struct dlm_message *ms = (struct dlm_message *) hd;
3003 struct dlm_ls *ls;
3004 int error;
3005
3006 if (!recovery)
3007 dlm_message_in(ms);
3008
3009 ls = dlm_find_lockspace_global(hd->h_lockspace);
3010 if (!ls) {
3011 log_print("drop message %d from %d for unknown lockspace %d",
3012 ms->m_type, nodeid, hd->h_lockspace);
3013 return -EINVAL;
3014 }
3015
3016 /* recovery may have just ended leaving a bunch of backed-up requests
3017 in the requestqueue; wait while dlm_recoverd clears them */
3018
3019 if (!recovery)
3020 dlm_wait_requestqueue(ls);
3021
3022 /* recovery may have just started while there were a bunch of
3023 in-flight requests -- save them in requestqueue to be processed
3024 after recovery. we can't let dlm_recvd block on the recovery
3025 lock. if dlm_recoverd is calling this function to clear the
3026 requestqueue, it needs to be interrupted (-EINTR) if another
3027 recovery operation is starting. */
3028
3029 while (1) {
3030 if (dlm_locking_stopped(ls)) {
3031 if (!recovery)
3032 dlm_add_requestqueue(ls, nodeid, hd);
3033 error = -EINTR;
3034 goto out;
3035 }
3036
3037 if (lock_recovery_try(ls))
3038 break;
3039 schedule();
3040 }
3041
3042 switch (ms->m_type) {
3043
3044 /* messages sent to a master node */
3045
3046 case DLM_MSG_REQUEST:
3047 receive_request(ls, ms);
3048 break;
3049
3050 case DLM_MSG_CONVERT:
3051 receive_convert(ls, ms);
3052 break;
3053
3054 case DLM_MSG_UNLOCK:
3055 receive_unlock(ls, ms);
3056 break;
3057
3058 case DLM_MSG_CANCEL:
3059 receive_cancel(ls, ms);
3060 break;
3061
3062 /* messages sent from a master node (replies to above) */
3063
3064 case DLM_MSG_REQUEST_REPLY:
3065 receive_request_reply(ls, ms);
3066 break;
3067
3068 case DLM_MSG_CONVERT_REPLY:
3069 receive_convert_reply(ls, ms);
3070 break;
3071
3072 case DLM_MSG_UNLOCK_REPLY:
3073 receive_unlock_reply(ls, ms);
3074 break;
3075
3076 case DLM_MSG_CANCEL_REPLY:
3077 receive_cancel_reply(ls, ms);
3078 break;
3079
3080 /* messages sent from a master node (only two types of async msg) */
3081
3082 case DLM_MSG_GRANT:
3083 receive_grant(ls, ms);
3084 break;
3085
3086 case DLM_MSG_BAST:
3087 receive_bast(ls, ms);
3088 break;
3089
3090 /* messages sent to a dir node */
3091
3092 case DLM_MSG_LOOKUP:
3093 receive_lookup(ls, ms);
3094 break;
3095
3096 case DLM_MSG_REMOVE:
3097 receive_remove(ls, ms);
3098 break;
3099
3100 /* messages sent from a dir node (remove has no reply) */
3101
3102 case DLM_MSG_LOOKUP_REPLY:
3103 receive_lookup_reply(ls, ms);
3104 break;
3105
3106 default:
3107 log_error(ls, "unknown message type %d", ms->m_type);
3108 }
3109
3110 unlock_recovery(ls);
3111 out:
3112 dlm_put_lockspace(ls);
3113 dlm_astd_wake();
3114 return 0;
3115}
3116
3117
3118/*
3119 * Recovery related
3120 */
3121
3122static void recover_convert_waiter(struct dlm_ls *ls, struct dlm_lkb *lkb)
3123{
3124 if (middle_conversion(lkb)) {
3125 hold_lkb(lkb);
3126 ls->ls_stub_ms.m_result = -EINPROGRESS;
3127 _remove_from_waiters(lkb);
3128 _receive_convert_reply(lkb, &ls->ls_stub_ms);
3129
3130 /* Same special case as in receive_rcom_lock_args() */
3131 lkb->lkb_grmode = DLM_LOCK_IV;
3132 rsb_set_flag(lkb->lkb_resource, RSB_RECOVER_CONVERT);
3133 unhold_lkb(lkb);
3134
3135 } else if (lkb->lkb_rqmode >= lkb->lkb_grmode) {
3136 lkb->lkb_flags |= DLM_IFL_RESEND;
3137 }
3138
3139 /* lkb->lkb_rqmode < lkb->lkb_grmode shouldn't happen since down
3140 conversions are async; there's no reply from the remote master */
3141}
3142
3143/* A waiting lkb needs recovery if the master node has failed, or
3144 the master node is changing (only when no directory is used) */
3145
3146static int waiter_needs_recovery(struct dlm_ls *ls, struct dlm_lkb *lkb)
3147{
3148 if (dlm_is_removed(ls, lkb->lkb_nodeid))
3149 return 1;
3150
3151 if (!dlm_no_directory(ls))
3152 return 0;
3153
3154 if (dlm_dir_nodeid(lkb->lkb_resource) != lkb->lkb_nodeid)
3155 return 1;
3156
3157 return 0;
3158}
3159
3160/* Recovery for locks that are waiting for replies from nodes that are now
3161 gone. We can just complete unlocks and cancels by faking a reply from the
3162 dead node. Requests and up-conversions we flag to be resent after
3163 recovery. Down-conversions can just be completed with a fake reply like
3164 unlocks. Conversions between PR and CW need special attention. */
3165
3166void dlm_recover_waiters_pre(struct dlm_ls *ls)
3167{
3168 struct dlm_lkb *lkb, *safe;
3169
3170 mutex_lock(&ls->ls_waiters_mutex);
3171
3172 list_for_each_entry_safe(lkb, safe, &ls->ls_waiters, lkb_wait_reply) {
3173 log_debug(ls, "pre recover waiter lkid %x type %d flags %x",
3174 lkb->lkb_id, lkb->lkb_wait_type, lkb->lkb_flags);
3175
3176 /* all outstanding lookups, regardless of destination will be
3177 resent after recovery is done */
3178
3179 if (lkb->lkb_wait_type == DLM_MSG_LOOKUP) {
3180 lkb->lkb_flags |= DLM_IFL_RESEND;
3181 continue;
3182 }
3183
3184 if (!waiter_needs_recovery(ls, lkb))
3185 continue;
3186
3187 switch (lkb->lkb_wait_type) {
3188
3189 case DLM_MSG_REQUEST:
3190 lkb->lkb_flags |= DLM_IFL_RESEND;
3191 break;
3192
3193 case DLM_MSG_CONVERT:
3194 recover_convert_waiter(ls, lkb);
3195 break;
3196
3197 case DLM_MSG_UNLOCK:
3198 hold_lkb(lkb);
3199 ls->ls_stub_ms.m_result = -DLM_EUNLOCK;
3200 _remove_from_waiters(lkb);
3201 _receive_unlock_reply(lkb, &ls->ls_stub_ms);
3202 dlm_put_lkb(lkb);
3203 break;
3204
3205 case DLM_MSG_CANCEL:
3206 hold_lkb(lkb);
3207 ls->ls_stub_ms.m_result = -DLM_ECANCEL;
3208 _remove_from_waiters(lkb);
3209 _receive_cancel_reply(lkb, &ls->ls_stub_ms);
3210 dlm_put_lkb(lkb);
3211 break;
3212
3213 default:
3214 log_error(ls, "invalid lkb wait_type %d",
3215 lkb->lkb_wait_type);
3216 }
3217 schedule();
3218 }
3219 mutex_unlock(&ls->ls_waiters_mutex);
3220}
3221
3222static int remove_resend_waiter(struct dlm_ls *ls, struct dlm_lkb **lkb_ret)
3223{
3224 struct dlm_lkb *lkb;
3225 int rv = 0;
3226
3227 mutex_lock(&ls->ls_waiters_mutex);
3228 list_for_each_entry(lkb, &ls->ls_waiters, lkb_wait_reply) {
3229 if (lkb->lkb_flags & DLM_IFL_RESEND) {
3230 rv = lkb->lkb_wait_type;
3231 _remove_from_waiters(lkb);
3232 lkb->lkb_flags &= ~DLM_IFL_RESEND;
3233 break;
3234 }
3235 }
3236 mutex_unlock(&ls->ls_waiters_mutex);
3237
3238 if (!rv)
3239 lkb = NULL;
3240 *lkb_ret = lkb;
3241 return rv;
3242}
3243
3244/* Deal with lookups and lkb's marked RESEND from _pre. We may now be the
3245 master or dir-node for r. Processing the lkb may result in it being placed
3246 back on waiters. */
3247
3248int dlm_recover_waiters_post(struct dlm_ls *ls)
3249{
3250 struct dlm_lkb *lkb;
3251 struct dlm_rsb *r;
3252 int error = 0, mstype;
3253
3254 while (1) {
3255 if (dlm_locking_stopped(ls)) {
3256 log_debug(ls, "recover_waiters_post aborted");
3257 error = -EINTR;
3258 break;
3259 }
3260
3261 mstype = remove_resend_waiter(ls, &lkb);
3262 if (!mstype)
3263 break;
3264
3265 r = lkb->lkb_resource;
3266
3267 log_debug(ls, "recover_waiters_post %x type %d flags %x %s",
3268 lkb->lkb_id, mstype, lkb->lkb_flags, r->res_name);
3269
3270 switch (mstype) {
3271
3272 case DLM_MSG_LOOKUP:
3273 hold_rsb(r);
3274 lock_rsb(r);
3275 _request_lock(r, lkb);
3276 if (is_master(r))
3277 confirm_master(r, 0);
3278 unlock_rsb(r);
3279 put_rsb(r);
3280 break;
3281
3282 case DLM_MSG_REQUEST:
3283 hold_rsb(r);
3284 lock_rsb(r);
3285 _request_lock(r, lkb);
3286 if (is_master(r))
3287 confirm_master(r, 0);
3288 unlock_rsb(r);
3289 put_rsb(r);
3290 break;
3291
3292 case DLM_MSG_CONVERT:
3293 hold_rsb(r);
3294 lock_rsb(r);
3295 _convert_lock(r, lkb);
3296 unlock_rsb(r);
3297 put_rsb(r);
3298 break;
3299
3300 default:
3301 log_error(ls, "recover_waiters_post type %d", mstype);
3302 }
3303 }
3304
3305 return error;
3306}
3307
3308static void purge_queue(struct dlm_rsb *r, struct list_head *queue,
3309 int (*test)(struct dlm_ls *ls, struct dlm_lkb *lkb))
3310{
3311 struct dlm_ls *ls = r->res_ls;
3312 struct dlm_lkb *lkb, *safe;
3313
3314 list_for_each_entry_safe(lkb, safe, queue, lkb_statequeue) {
3315 if (test(ls, lkb)) {
3316 rsb_set_flag(r, RSB_LOCKS_PURGED);
3317 del_lkb(r, lkb);
3318 /* this put should free the lkb */
3319 if (!dlm_put_lkb(lkb))
3320 log_error(ls, "purged lkb not released");
3321 }
3322 }
3323}
3324
3325static int purge_dead_test(struct dlm_ls *ls, struct dlm_lkb *lkb)
3326{
3327 return (is_master_copy(lkb) && dlm_is_removed(ls, lkb->lkb_nodeid));
3328}
3329
3330static int purge_mstcpy_test(struct dlm_ls *ls, struct dlm_lkb *lkb)
3331{
3332 return is_master_copy(lkb);
3333}
3334
3335static void purge_dead_locks(struct dlm_rsb *r)
3336{
3337 purge_queue(r, &r->res_grantqueue, &purge_dead_test);
3338 purge_queue(r, &r->res_convertqueue, &purge_dead_test);
3339 purge_queue(r, &r->res_waitqueue, &purge_dead_test);
3340}
3341
3342void dlm_purge_mstcpy_locks(struct dlm_rsb *r)
3343{
3344 purge_queue(r, &r->res_grantqueue, &purge_mstcpy_test);
3345 purge_queue(r, &r->res_convertqueue, &purge_mstcpy_test);
3346 purge_queue(r, &r->res_waitqueue, &purge_mstcpy_test);
3347}
3348
3349/* Get rid of locks held by nodes that are gone. */
3350
3351int dlm_purge_locks(struct dlm_ls *ls)
3352{
3353 struct dlm_rsb *r;
3354
3355 log_debug(ls, "dlm_purge_locks");
3356
3357 down_write(&ls->ls_root_sem);
3358 list_for_each_entry(r, &ls->ls_root_list, res_root_list) {
3359 hold_rsb(r);
3360 lock_rsb(r);
3361 if (is_master(r))
3362 purge_dead_locks(r);
3363 unlock_rsb(r);
3364 unhold_rsb(r);
3365
3366 schedule();
3367 }
3368 up_write(&ls->ls_root_sem);
3369
3370 return 0;
3371}
3372
3373static struct dlm_rsb *find_purged_rsb(struct dlm_ls *ls, int bucket)
3374{
3375 struct dlm_rsb *r, *r_ret = NULL;
3376
3377 read_lock(&ls->ls_rsbtbl[bucket].lock);
3378 list_for_each_entry(r, &ls->ls_rsbtbl[bucket].list, res_hashchain) {
3379 if (!rsb_flag(r, RSB_LOCKS_PURGED))
3380 continue;
3381 hold_rsb(r);
3382 rsb_clear_flag(r, RSB_LOCKS_PURGED);
3383 r_ret = r;
3384 break;
3385 }
3386 read_unlock(&ls->ls_rsbtbl[bucket].lock);
3387 return r_ret;
3388}
3389
3390void dlm_grant_after_purge(struct dlm_ls *ls)
3391{
3392 struct dlm_rsb *r;
3393 int bucket = 0;
3394
3395 while (1) {
3396 r = find_purged_rsb(ls, bucket);
3397 if (!r) {
3398 if (bucket == ls->ls_rsbtbl_size - 1)
3399 break;
3400 bucket++;
3401 continue;
3402 }
3403 lock_rsb(r);
3404 if (is_master(r)) {
3405 grant_pending_locks(r);
3406 confirm_master(r, 0);
3407 }
3408 unlock_rsb(r);
3409 put_rsb(r);
3410 schedule();
3411 }
3412}
3413
3414static struct dlm_lkb *search_remid_list(struct list_head *head, int nodeid,
3415 uint32_t remid)
3416{
3417 struct dlm_lkb *lkb;
3418
3419 list_for_each_entry(lkb, head, lkb_statequeue) {
3420 if (lkb->lkb_nodeid == nodeid && lkb->lkb_remid == remid)
3421 return lkb;
3422 }
3423 return NULL;
3424}
3425
3426static struct dlm_lkb *search_remid(struct dlm_rsb *r, int nodeid,
3427 uint32_t remid)
3428{
3429 struct dlm_lkb *lkb;
3430
3431 lkb = search_remid_list(&r->res_grantqueue, nodeid, remid);
3432 if (lkb)
3433 return lkb;
3434 lkb = search_remid_list(&r->res_convertqueue, nodeid, remid);
3435 if (lkb)
3436 return lkb;
3437 lkb = search_remid_list(&r->res_waitqueue, nodeid, remid);
3438 if (lkb)
3439 return lkb;
3440 return NULL;
3441}
3442
3443static int receive_rcom_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
3444 struct dlm_rsb *r, struct dlm_rcom *rc)
3445{
3446 struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf;
3447 int lvblen;
3448
3449 lkb->lkb_nodeid = rc->rc_header.h_nodeid;
3450 lkb->lkb_ownpid = rl->rl_ownpid;
3451 lkb->lkb_remid = rl->rl_lkid;
3452 lkb->lkb_exflags = rl->rl_exflags;
3453 lkb->lkb_flags = rl->rl_flags & 0x0000FFFF;
3454 lkb->lkb_flags |= DLM_IFL_MSTCPY;
3455 lkb->lkb_lvbseq = rl->rl_lvbseq;
3456 lkb->lkb_rqmode = rl->rl_rqmode;
3457 lkb->lkb_grmode = rl->rl_grmode;
3458 /* don't set lkb_status because add_lkb wants to itself */
3459
3460 lkb->lkb_bastaddr = (void *) (long) (rl->rl_asts & AST_BAST);
3461 lkb->lkb_astaddr = (void *) (long) (rl->rl_asts & AST_COMP);
3462
3463 if (lkb->lkb_exflags & DLM_LKF_VALBLK) {
3464 lkb->lkb_lvbptr = allocate_lvb(ls);
3465 if (!lkb->lkb_lvbptr)
3466 return -ENOMEM;
3467 lvblen = rc->rc_header.h_length - sizeof(struct dlm_rcom) -
3468 sizeof(struct rcom_lock);
3469 memcpy(lkb->lkb_lvbptr, rl->rl_lvb, lvblen);
3470 }
3471
3472 /* Conversions between PR and CW (middle modes) need special handling.
3473 The real granted mode of these converting locks cannot be determined
3474 until all locks have been rebuilt on the rsb (recover_conversion) */
3475
3476 if (rl->rl_wait_type == DLM_MSG_CONVERT && middle_conversion(lkb)) {
3477 rl->rl_status = DLM_LKSTS_CONVERT;
3478 lkb->lkb_grmode = DLM_LOCK_IV;
3479 rsb_set_flag(r, RSB_RECOVER_CONVERT);
3480 }
3481
3482 return 0;
3483}
3484
3485/* This lkb may have been recovered in a previous aborted recovery so we need
3486 to check if the rsb already has an lkb with the given remote nodeid/lkid.
3487 If so we just send back a standard reply. If not, we create a new lkb with
3488 the given values and send back our lkid. We send back our lkid by sending
3489 back the rcom_lock struct we got but with the remid field filled in. */
3490
3491int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc)
3492{
3493 struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf;
3494 struct dlm_rsb *r;
3495 struct dlm_lkb *lkb;
3496 int error;
3497
3498 if (rl->rl_parent_lkid) {
3499 error = -EOPNOTSUPP;
3500 goto out;
3501 }
3502
3503 error = find_rsb(ls, rl->rl_name, rl->rl_namelen, R_MASTER, &r);
3504 if (error)
3505 goto out;
3506
3507 lock_rsb(r);
3508
3509 lkb = search_remid(r, rc->rc_header.h_nodeid, rl->rl_lkid);
3510 if (lkb) {
3511 error = -EEXIST;
3512 goto out_remid;
3513 }
3514
3515 error = create_lkb(ls, &lkb);
3516 if (error)
3517 goto out_unlock;
3518
3519 error = receive_rcom_lock_args(ls, lkb, r, rc);
3520 if (error) {
3521 __put_lkb(ls, lkb);
3522 goto out_unlock;
3523 }
3524
3525 attach_lkb(r, lkb);
3526 add_lkb(r, lkb, rl->rl_status);
3527 error = 0;
3528
3529 out_remid:
3530 /* this is the new value returned to the lock holder for
3531 saving in its process-copy lkb */
3532 rl->rl_remid = lkb->lkb_id;
3533
3534 out_unlock:
3535 unlock_rsb(r);
3536 put_rsb(r);
3537 out:
3538 if (error)
3539 log_print("recover_master_copy %d %x", error, rl->rl_lkid);
3540 rl->rl_result = error;
3541 return error;
3542}
3543
3544int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc)
3545{
3546 struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf;
3547 struct dlm_rsb *r;
3548 struct dlm_lkb *lkb;
3549 int error;
3550
3551 error = find_lkb(ls, rl->rl_lkid, &lkb);
3552 if (error) {
3553 log_error(ls, "recover_process_copy no lkid %x", rl->rl_lkid);
3554 return error;
3555 }
3556
3557 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
3558
3559 error = rl->rl_result;
3560
3561 r = lkb->lkb_resource;
3562 hold_rsb(r);
3563 lock_rsb(r);
3564
3565 switch (error) {
3566 case -EEXIST:
3567 log_debug(ls, "master copy exists %x", lkb->lkb_id);
3568 /* fall through */
3569 case 0:
3570 lkb->lkb_remid = rl->rl_remid;
3571 break;
3572 default:
3573 log_error(ls, "dlm_recover_process_copy unknown error %d %x",
3574 error, lkb->lkb_id);
3575 }
3576
3577 /* an ack for dlm_recover_locks() which waits for replies from
3578 all the locks it sends to new masters */
3579 dlm_recovered_lock(r);
3580
3581 unlock_rsb(r);
3582 put_rsb(r);
3583 dlm_put_lkb(lkb);
3584
3585 return 0;
3586}
3587
3588int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua,
3589 int mode, uint32_t flags, void *name, unsigned int namelen,
3590 uint32_t parent_lkid)
3591{
3592 struct dlm_lkb *lkb;
3593 struct dlm_args args;
3594 int error;
3595
3596 lock_recovery(ls);
3597
3598 error = create_lkb(ls, &lkb);
3599 if (error) {
3600 kfree(ua);
3601 goto out;
3602 }
3603
3604 if (flags & DLM_LKF_VALBLK) {
3605 ua->lksb.sb_lvbptr = kmalloc(DLM_USER_LVB_LEN, GFP_KERNEL);
3606 if (!ua->lksb.sb_lvbptr) {
3607 kfree(ua);
3608 __put_lkb(ls, lkb);
3609 error = -ENOMEM;
3610 goto out;
3611 }
3612 }
3613
3614 /* After ua is attached to lkb it will be freed by free_lkb().
3615 When DLM_IFL_USER is set, the dlm knows that this is a userspace
3616 lock and that lkb_astparam is the dlm_user_args structure. */
3617
3618 error = set_lock_args(mode, &ua->lksb, flags, namelen, parent_lkid,
3619 DLM_FAKE_USER_AST, ua, DLM_FAKE_USER_AST, &args);
3620 lkb->lkb_flags |= DLM_IFL_USER;
3621 ua->old_mode = DLM_LOCK_IV;
3622
3623 if (error) {
3624 __put_lkb(ls, lkb);
3625 goto out;
3626 }
3627
3628 error = request_lock(ls, lkb, name, namelen, &args);
3629
3630 switch (error) {
3631 case 0:
3632 break;
3633 case -EINPROGRESS:
3634 error = 0;
3635 break;
3636 case -EAGAIN:
3637 error = 0;
3638 /* fall through */
3639 default:
3640 __put_lkb(ls, lkb);
3641 goto out;
3642 }
3643
3644 /* add this new lkb to the per-process list of locks */
3645 spin_lock(&ua->proc->locks_spin);
3646 kref_get(&lkb->lkb_ref);
3647 list_add_tail(&lkb->lkb_ownqueue, &ua->proc->locks);
3648 spin_unlock(&ua->proc->locks_spin);
3649 out:
3650 unlock_recovery(ls);
3651 return error;
3652}
3653
3654int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
3655 int mode, uint32_t flags, uint32_t lkid, char *lvb_in)
3656{
3657 struct dlm_lkb *lkb;
3658 struct dlm_args args;
3659 struct dlm_user_args *ua;
3660 int error;
3661
3662 lock_recovery(ls);
3663
3664 error = find_lkb(ls, lkid, &lkb);
3665 if (error)
3666 goto out;
3667
3668 /* user can change the params on its lock when it converts it, or
3669 add an lvb that didn't exist before */
3670
3671 ua = (struct dlm_user_args *)lkb->lkb_astparam;
3672
3673 if (flags & DLM_LKF_VALBLK && !ua->lksb.sb_lvbptr) {
3674 ua->lksb.sb_lvbptr = kmalloc(DLM_USER_LVB_LEN, GFP_KERNEL);
3675 if (!ua->lksb.sb_lvbptr) {
3676 error = -ENOMEM;
3677 goto out_put;
3678 }
3679 }
3680 if (lvb_in && ua->lksb.sb_lvbptr)
3681 memcpy(ua->lksb.sb_lvbptr, lvb_in, DLM_USER_LVB_LEN);
3682
3683 ua->castparam = ua_tmp->castparam;
3684 ua->castaddr = ua_tmp->castaddr;
3685 ua->bastparam = ua_tmp->bastparam;
3686 ua->bastaddr = ua_tmp->bastaddr;
3687 ua->user_lksb = ua_tmp->user_lksb;
3688 ua->old_mode = lkb->lkb_grmode;
3689
3690 error = set_lock_args(mode, &ua->lksb, flags, 0, 0, DLM_FAKE_USER_AST,
3691 ua, DLM_FAKE_USER_AST, &args);
3692 if (error)
3693 goto out_put;
3694
3695 error = convert_lock(ls, lkb, &args);
3696
3697 if (error == -EINPROGRESS || error == -EAGAIN)
3698 error = 0;
3699 out_put:
3700 dlm_put_lkb(lkb);
3701 out:
3702 unlock_recovery(ls);
3703 kfree(ua_tmp);
3704 return error;
3705}
3706
3707int dlm_user_unlock(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
3708 uint32_t flags, uint32_t lkid, char *lvb_in)
3709{
3710 struct dlm_lkb *lkb;
3711 struct dlm_args args;
3712 struct dlm_user_args *ua;
3713 int error;
3714
3715 lock_recovery(ls);
3716
3717 error = find_lkb(ls, lkid, &lkb);
3718 if (error)
3719 goto out;
3720
3721 ua = (struct dlm_user_args *)lkb->lkb_astparam;
3722
3723 if (lvb_in && ua->lksb.sb_lvbptr)
3724 memcpy(ua->lksb.sb_lvbptr, lvb_in, DLM_USER_LVB_LEN);
3725 ua->castparam = ua_tmp->castparam;
3726 ua->user_lksb = ua_tmp->user_lksb;
3727
3728 error = set_unlock_args(flags, ua, &args);
3729 if (error)
3730 goto out_put;
3731
3732 error = unlock_lock(ls, lkb, &args);
3733
3734 if (error == -DLM_EUNLOCK)
3735 error = 0;
3736 if (error)
3737 goto out_put;
3738
3739 spin_lock(&ua->proc->locks_spin);
3740 list_del_init(&lkb->lkb_ownqueue);
3741 spin_unlock(&ua->proc->locks_spin);
3742
3743 /* this removes the reference for the proc->locks list added by
3744 dlm_user_request */
3745 unhold_lkb(lkb);
3746 out_put:
3747 dlm_put_lkb(lkb);
3748 out:
3749 unlock_recovery(ls);
3750 return error;
3751}
3752
3753int dlm_user_cancel(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
3754 uint32_t flags, uint32_t lkid)
3755{
3756 struct dlm_lkb *lkb;
3757 struct dlm_args args;
3758 struct dlm_user_args *ua;
3759 int error;
3760
3761 lock_recovery(ls);
3762
3763 error = find_lkb(ls, lkid, &lkb);
3764 if (error)
3765 goto out;
3766
3767 ua = (struct dlm_user_args *)lkb->lkb_astparam;
3768 ua->castparam = ua_tmp->castparam;
3769 ua->user_lksb = ua_tmp->user_lksb;
3770
3771 error = set_unlock_args(flags, ua, &args);
3772 if (error)
3773 goto out_put;
3774
3775 error = cancel_lock(ls, lkb, &args);
3776
3777 if (error == -DLM_ECANCEL)
3778 error = 0;
3779 if (error)
3780 goto out_put;
3781
3782 /* this lkb was removed from the WAITING queue */
3783 if (lkb->lkb_grmode == DLM_LOCK_IV) {
3784 spin_lock(&ua->proc->locks_spin);
3785 list_del_init(&lkb->lkb_ownqueue);
3786 spin_unlock(&ua->proc->locks_spin);
3787 unhold_lkb(lkb);
3788 }
3789 out_put:
3790 dlm_put_lkb(lkb);
3791 out:
3792 unlock_recovery(ls);
3793 return error;
3794}
3795
3796static int orphan_proc_lock(struct dlm_ls *ls, struct dlm_lkb *lkb)
3797{
3798 struct dlm_user_args *ua = (struct dlm_user_args *)lkb->lkb_astparam;
3799
3800 if (ua->lksb.sb_lvbptr)
3801 kfree(ua->lksb.sb_lvbptr);
3802 kfree(ua);
3803 lkb->lkb_astparam = (long)NULL;
3804
3805 /* TODO: propogate to master if needed */
3806 return 0;
3807}
3808
3809/* The force flag allows the unlock to go ahead even if the lkb isn't granted.
3810 Regardless of what rsb queue the lock is on, it's removed and freed. */
3811
3812static int unlock_proc_lock(struct dlm_ls *ls, struct dlm_lkb *lkb)
3813{
3814 struct dlm_user_args *ua = (struct dlm_user_args *)lkb->lkb_astparam;
3815 struct dlm_args args;
3816 int error;
3817
3818 /* FIXME: we need to handle the case where the lkb is in limbo
3819 while the rsb is being looked up, currently we assert in
3820 _unlock_lock/is_remote because rsb nodeid is -1. */
3821
3822 set_unlock_args(DLM_LKF_FORCEUNLOCK, ua, &args);
3823
3824 error = unlock_lock(ls, lkb, &args);
3825 if (error == -DLM_EUNLOCK)
3826 error = 0;
3827 return error;
3828}
3829
3830/* The ls_clear_proc_locks mutex protects against dlm_user_add_asts() which
3831 1) references lkb->ua which we free here and 2) adds lkbs to proc->asts,
3832 which we clear here. */
3833
3834/* proc CLOSING flag is set so no more device_reads should look at proc->asts
3835 list, and no more device_writes should add lkb's to proc->locks list; so we
3836 shouldn't need to take asts_spin or locks_spin here. this assumes that
3837 device reads/writes/closes are serialized -- FIXME: we may need to serialize
3838 them ourself. */
3839
3840void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc)
3841{
3842 struct dlm_lkb *lkb, *safe;
3843
3844 lock_recovery(ls);
3845 mutex_lock(&ls->ls_clear_proc_locks);
3846
3847 list_for_each_entry_safe(lkb, safe, &proc->locks, lkb_ownqueue) {
3848 if (lkb->lkb_ast_type) {
3849 list_del(&lkb->lkb_astqueue);
3850 unhold_lkb(lkb);
3851 }
3852
3853 list_del_init(&lkb->lkb_ownqueue);
3854
3855 if (lkb->lkb_exflags & DLM_LKF_PERSISTENT) {
3856 lkb->lkb_flags |= DLM_IFL_ORPHAN;
3857 orphan_proc_lock(ls, lkb);
3858 } else {
3859 lkb->lkb_flags |= DLM_IFL_DEAD;
3860 unlock_proc_lock(ls, lkb);
3861 }
3862
3863 /* this removes the reference for the proc->locks list
3864 added by dlm_user_request, it may result in the lkb
3865 being freed */
3866
3867 dlm_put_lkb(lkb);
3868 }
3869 mutex_unlock(&ls->ls_clear_proc_locks);
3870 unlock_recovery(ls);
3871}
diff --git a/fs/dlm/lock.h b/fs/dlm/lock.h
new file mode 100644
index 000000000000..0843a3073ec3
--- /dev/null
+++ b/fs/dlm/lock.h
@@ -0,0 +1,62 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13#ifndef __LOCK_DOT_H__
14#define __LOCK_DOT_H__
15
16void dlm_print_rsb(struct dlm_rsb *r);
17void dlm_dump_rsb(struct dlm_rsb *r);
18void dlm_print_lkb(struct dlm_lkb *lkb);
19int dlm_receive_message(struct dlm_header *hd, int nodeid, int recovery);
20int dlm_modes_compat(int mode1, int mode2);
21int dlm_find_rsb(struct dlm_ls *ls, char *name, int namelen,
22 unsigned int flags, struct dlm_rsb **r_ret);
23void dlm_put_rsb(struct dlm_rsb *r);
24void dlm_hold_rsb(struct dlm_rsb *r);
25int dlm_put_lkb(struct dlm_lkb *lkb);
26void dlm_scan_rsbs(struct dlm_ls *ls);
27
28int dlm_purge_locks(struct dlm_ls *ls);
29void dlm_purge_mstcpy_locks(struct dlm_rsb *r);
30void dlm_grant_after_purge(struct dlm_ls *ls);
31int dlm_recover_waiters_post(struct dlm_ls *ls);
32void dlm_recover_waiters_pre(struct dlm_ls *ls);
33int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc);
34int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc);
35
36int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, int mode,
37 uint32_t flags, void *name, unsigned int namelen, uint32_t parent_lkid);
38int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
39 int mode, uint32_t flags, uint32_t lkid, char *lvb_in);
40int dlm_user_unlock(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
41 uint32_t flags, uint32_t lkid, char *lvb_in);
42int dlm_user_cancel(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
43 uint32_t flags, uint32_t lkid);
44void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc);
45
46static inline int is_master(struct dlm_rsb *r)
47{
48 return !r->res_nodeid;
49}
50
51static inline void lock_rsb(struct dlm_rsb *r)
52{
53 mutex_lock(&r->res_mutex);
54}
55
56static inline void unlock_rsb(struct dlm_rsb *r)
57{
58 mutex_unlock(&r->res_mutex);
59}
60
61#endif
62
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
new file mode 100644
index 000000000000..109333c8ecb9
--- /dev/null
+++ b/fs/dlm/lockspace.c
@@ -0,0 +1,717 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#include "dlm_internal.h"
15#include "lockspace.h"
16#include "member.h"
17#include "recoverd.h"
18#include "ast.h"
19#include "dir.h"
20#include "lowcomms.h"
21#include "config.h"
22#include "memory.h"
23#include "lock.h"
24#include "recover.h"
25
26#ifdef CONFIG_DLM_DEBUG
27int dlm_create_debug_file(struct dlm_ls *ls);
28void dlm_delete_debug_file(struct dlm_ls *ls);
29#else
30static inline int dlm_create_debug_file(struct dlm_ls *ls) { return 0; }
31static inline void dlm_delete_debug_file(struct dlm_ls *ls) { }
32#endif
33
34static int ls_count;
35static struct mutex ls_lock;
36static struct list_head lslist;
37static spinlock_t lslist_lock;
38static struct task_struct * scand_task;
39
40
41static ssize_t dlm_control_store(struct dlm_ls *ls, const char *buf, size_t len)
42{
43 ssize_t ret = len;
44 int n = simple_strtol(buf, NULL, 0);
45
46 switch (n) {
47 case 0:
48 dlm_ls_stop(ls);
49 break;
50 case 1:
51 dlm_ls_start(ls);
52 break;
53 default:
54 ret = -EINVAL;
55 }
56 return ret;
57}
58
59static ssize_t dlm_event_store(struct dlm_ls *ls, const char *buf, size_t len)
60{
61 ls->ls_uevent_result = simple_strtol(buf, NULL, 0);
62 set_bit(LSFL_UEVENT_WAIT, &ls->ls_flags);
63 wake_up(&ls->ls_uevent_wait);
64 return len;
65}
66
67static ssize_t dlm_id_show(struct dlm_ls *ls, char *buf)
68{
69 return snprintf(buf, PAGE_SIZE, "%u\n", ls->ls_global_id);
70}
71
72static ssize_t dlm_id_store(struct dlm_ls *ls, const char *buf, size_t len)
73{
74 ls->ls_global_id = simple_strtoul(buf, NULL, 0);
75 return len;
76}
77
78static ssize_t dlm_recover_status_show(struct dlm_ls *ls, char *buf)
79{
80 uint32_t status = dlm_recover_status(ls);
81 return snprintf(buf, PAGE_SIZE, "%x\n", status);
82}
83
84static ssize_t dlm_recover_nodeid_show(struct dlm_ls *ls, char *buf)
85{
86 return snprintf(buf, PAGE_SIZE, "%d\n", ls->ls_recover_nodeid);
87}
88
89struct dlm_attr {
90 struct attribute attr;
91 ssize_t (*show)(struct dlm_ls *, char *);
92 ssize_t (*store)(struct dlm_ls *, const char *, size_t);
93};
94
95static struct dlm_attr dlm_attr_control = {
96 .attr = {.name = "control", .mode = S_IWUSR},
97 .store = dlm_control_store
98};
99
100static struct dlm_attr dlm_attr_event = {
101 .attr = {.name = "event_done", .mode = S_IWUSR},
102 .store = dlm_event_store
103};
104
105static struct dlm_attr dlm_attr_id = {
106 .attr = {.name = "id", .mode = S_IRUGO | S_IWUSR},
107 .show = dlm_id_show,
108 .store = dlm_id_store
109};
110
111static struct dlm_attr dlm_attr_recover_status = {
112 .attr = {.name = "recover_status", .mode = S_IRUGO},
113 .show = dlm_recover_status_show
114};
115
116static struct dlm_attr dlm_attr_recover_nodeid = {
117 .attr = {.name = "recover_nodeid", .mode = S_IRUGO},
118 .show = dlm_recover_nodeid_show
119};
120
121static struct attribute *dlm_attrs[] = {
122 &dlm_attr_control.attr,
123 &dlm_attr_event.attr,
124 &dlm_attr_id.attr,
125 &dlm_attr_recover_status.attr,
126 &dlm_attr_recover_nodeid.attr,
127 NULL,
128};
129
130static ssize_t dlm_attr_show(struct kobject *kobj, struct attribute *attr,
131 char *buf)
132{
133 struct dlm_ls *ls = container_of(kobj, struct dlm_ls, ls_kobj);
134 struct dlm_attr *a = container_of(attr, struct dlm_attr, attr);
135 return a->show ? a->show(ls, buf) : 0;
136}
137
138static ssize_t dlm_attr_store(struct kobject *kobj, struct attribute *attr,
139 const char *buf, size_t len)
140{
141 struct dlm_ls *ls = container_of(kobj, struct dlm_ls, ls_kobj);
142 struct dlm_attr *a = container_of(attr, struct dlm_attr, attr);
143 return a->store ? a->store(ls, buf, len) : len;
144}
145
146static struct sysfs_ops dlm_attr_ops = {
147 .show = dlm_attr_show,
148 .store = dlm_attr_store,
149};
150
151static struct kobj_type dlm_ktype = {
152 .default_attrs = dlm_attrs,
153 .sysfs_ops = &dlm_attr_ops,
154};
155
156static struct kset dlm_kset = {
157 .subsys = &kernel_subsys,
158 .kobj = {.name = "dlm",},
159 .ktype = &dlm_ktype,
160};
161
162static int kobject_setup(struct dlm_ls *ls)
163{
164 char lsname[DLM_LOCKSPACE_LEN];
165 int error;
166
167 memset(lsname, 0, DLM_LOCKSPACE_LEN);
168 snprintf(lsname, DLM_LOCKSPACE_LEN, "%s", ls->ls_name);
169
170 error = kobject_set_name(&ls->ls_kobj, "%s", lsname);
171 if (error)
172 return error;
173
174 ls->ls_kobj.kset = &dlm_kset;
175 ls->ls_kobj.ktype = &dlm_ktype;
176 return 0;
177}
178
179static int do_uevent(struct dlm_ls *ls, int in)
180{
181 int error;
182
183 if (in)
184 kobject_uevent(&ls->ls_kobj, KOBJ_ONLINE);
185 else
186 kobject_uevent(&ls->ls_kobj, KOBJ_OFFLINE);
187
188 error = wait_event_interruptible(ls->ls_uevent_wait,
189 test_and_clear_bit(LSFL_UEVENT_WAIT, &ls->ls_flags));
190 if (error)
191 goto out;
192
193 error = ls->ls_uevent_result;
194 out:
195 return error;
196}
197
198
199int dlm_lockspace_init(void)
200{
201 int error;
202
203 ls_count = 0;
204 mutex_init(&ls_lock);
205 INIT_LIST_HEAD(&lslist);
206 spin_lock_init(&lslist_lock);
207
208 error = kset_register(&dlm_kset);
209 if (error)
210 printk("dlm_lockspace_init: cannot register kset %d\n", error);
211 return error;
212}
213
214void dlm_lockspace_exit(void)
215{
216 kset_unregister(&dlm_kset);
217}
218
219static int dlm_scand(void *data)
220{
221 struct dlm_ls *ls;
222
223 while (!kthread_should_stop()) {
224 list_for_each_entry(ls, &lslist, ls_list)
225 dlm_scan_rsbs(ls);
226 schedule_timeout_interruptible(dlm_config.scan_secs * HZ);
227 }
228 return 0;
229}
230
231static int dlm_scand_start(void)
232{
233 struct task_struct *p;
234 int error = 0;
235
236 p = kthread_run(dlm_scand, NULL, "dlm_scand");
237 if (IS_ERR(p))
238 error = PTR_ERR(p);
239 else
240 scand_task = p;
241 return error;
242}
243
244static void dlm_scand_stop(void)
245{
246 kthread_stop(scand_task);
247}
248
249static struct dlm_ls *dlm_find_lockspace_name(char *name, int namelen)
250{
251 struct dlm_ls *ls;
252
253 spin_lock(&lslist_lock);
254
255 list_for_each_entry(ls, &lslist, ls_list) {
256 if (ls->ls_namelen == namelen &&
257 memcmp(ls->ls_name, name, namelen) == 0)
258 goto out;
259 }
260 ls = NULL;
261 out:
262 spin_unlock(&lslist_lock);
263 return ls;
264}
265
266struct dlm_ls *dlm_find_lockspace_global(uint32_t id)
267{
268 struct dlm_ls *ls;
269
270 spin_lock(&lslist_lock);
271
272 list_for_each_entry(ls, &lslist, ls_list) {
273 if (ls->ls_global_id == id) {
274 ls->ls_count++;
275 goto out;
276 }
277 }
278 ls = NULL;
279 out:
280 spin_unlock(&lslist_lock);
281 return ls;
282}
283
284struct dlm_ls *dlm_find_lockspace_local(dlm_lockspace_t *lockspace)
285{
286 struct dlm_ls *ls;
287
288 spin_lock(&lslist_lock);
289 list_for_each_entry(ls, &lslist, ls_list) {
290 if (ls->ls_local_handle == lockspace) {
291 ls->ls_count++;
292 goto out;
293 }
294 }
295 ls = NULL;
296 out:
297 spin_unlock(&lslist_lock);
298 return ls;
299}
300
301struct dlm_ls *dlm_find_lockspace_device(int minor)
302{
303 struct dlm_ls *ls;
304
305 spin_lock(&lslist_lock);
306 list_for_each_entry(ls, &lslist, ls_list) {
307 if (ls->ls_device.minor == minor) {
308 ls->ls_count++;
309 goto out;
310 }
311 }
312 ls = NULL;
313 out:
314 spin_unlock(&lslist_lock);
315 return ls;
316}
317
318void dlm_put_lockspace(struct dlm_ls *ls)
319{
320 spin_lock(&lslist_lock);
321 ls->ls_count--;
322 spin_unlock(&lslist_lock);
323}
324
325static void remove_lockspace(struct dlm_ls *ls)
326{
327 for (;;) {
328 spin_lock(&lslist_lock);
329 if (ls->ls_count == 0) {
330 list_del(&ls->ls_list);
331 spin_unlock(&lslist_lock);
332 return;
333 }
334 spin_unlock(&lslist_lock);
335 ssleep(1);
336 }
337}
338
339static int threads_start(void)
340{
341 int error;
342
343 /* Thread which process lock requests for all lockspace's */
344 error = dlm_astd_start();
345 if (error) {
346 log_print("cannot start dlm_astd thread %d", error);
347 goto fail;
348 }
349
350 error = dlm_scand_start();
351 if (error) {
352 log_print("cannot start dlm_scand thread %d", error);
353 goto astd_fail;
354 }
355
356 /* Thread for sending/receiving messages for all lockspace's */
357 error = dlm_lowcomms_start();
358 if (error) {
359 log_print("cannot start dlm lowcomms %d", error);
360 goto scand_fail;
361 }
362
363 return 0;
364
365 scand_fail:
366 dlm_scand_stop();
367 astd_fail:
368 dlm_astd_stop();
369 fail:
370 return error;
371}
372
373static void threads_stop(void)
374{
375 dlm_scand_stop();
376 dlm_lowcomms_stop();
377 dlm_astd_stop();
378}
379
380static int new_lockspace(char *name, int namelen, void **lockspace,
381 uint32_t flags, int lvblen)
382{
383 struct dlm_ls *ls;
384 int i, size, error = -ENOMEM;
385
386 if (namelen > DLM_LOCKSPACE_LEN)
387 return -EINVAL;
388
389 if (!lvblen || (lvblen % 8))
390 return -EINVAL;
391
392 if (!try_module_get(THIS_MODULE))
393 return -EINVAL;
394
395 ls = dlm_find_lockspace_name(name, namelen);
396 if (ls) {
397 *lockspace = ls;
398 module_put(THIS_MODULE);
399 return -EEXIST;
400 }
401
402 ls = kzalloc(sizeof(struct dlm_ls) + namelen, GFP_KERNEL);
403 if (!ls)
404 goto out;
405 memcpy(ls->ls_name, name, namelen);
406 ls->ls_namelen = namelen;
407 ls->ls_exflags = flags;
408 ls->ls_lvblen = lvblen;
409 ls->ls_count = 0;
410 ls->ls_flags = 0;
411
412 size = dlm_config.rsbtbl_size;
413 ls->ls_rsbtbl_size = size;
414
415 ls->ls_rsbtbl = kmalloc(sizeof(struct dlm_rsbtable) * size, GFP_KERNEL);
416 if (!ls->ls_rsbtbl)
417 goto out_lsfree;
418 for (i = 0; i < size; i++) {
419 INIT_LIST_HEAD(&ls->ls_rsbtbl[i].list);
420 INIT_LIST_HEAD(&ls->ls_rsbtbl[i].toss);
421 rwlock_init(&ls->ls_rsbtbl[i].lock);
422 }
423
424 size = dlm_config.lkbtbl_size;
425 ls->ls_lkbtbl_size = size;
426
427 ls->ls_lkbtbl = kmalloc(sizeof(struct dlm_lkbtable) * size, GFP_KERNEL);
428 if (!ls->ls_lkbtbl)
429 goto out_rsbfree;
430 for (i = 0; i < size; i++) {
431 INIT_LIST_HEAD(&ls->ls_lkbtbl[i].list);
432 rwlock_init(&ls->ls_lkbtbl[i].lock);
433 ls->ls_lkbtbl[i].counter = 1;
434 }
435
436 size = dlm_config.dirtbl_size;
437 ls->ls_dirtbl_size = size;
438
439 ls->ls_dirtbl = kmalloc(sizeof(struct dlm_dirtable) * size, GFP_KERNEL);
440 if (!ls->ls_dirtbl)
441 goto out_lkbfree;
442 for (i = 0; i < size; i++) {
443 INIT_LIST_HEAD(&ls->ls_dirtbl[i].list);
444 rwlock_init(&ls->ls_dirtbl[i].lock);
445 }
446
447 INIT_LIST_HEAD(&ls->ls_waiters);
448 mutex_init(&ls->ls_waiters_mutex);
449
450 INIT_LIST_HEAD(&ls->ls_nodes);
451 INIT_LIST_HEAD(&ls->ls_nodes_gone);
452 ls->ls_num_nodes = 0;
453 ls->ls_low_nodeid = 0;
454 ls->ls_total_weight = 0;
455 ls->ls_node_array = NULL;
456
457 memset(&ls->ls_stub_rsb, 0, sizeof(struct dlm_rsb));
458 ls->ls_stub_rsb.res_ls = ls;
459
460 ls->ls_debug_rsb_dentry = NULL;
461 ls->ls_debug_waiters_dentry = NULL;
462
463 init_waitqueue_head(&ls->ls_uevent_wait);
464 ls->ls_uevent_result = 0;
465
466 ls->ls_recoverd_task = NULL;
467 mutex_init(&ls->ls_recoverd_active);
468 spin_lock_init(&ls->ls_recover_lock);
469 ls->ls_recover_status = 0;
470 ls->ls_recover_seq = 0;
471 ls->ls_recover_args = NULL;
472 init_rwsem(&ls->ls_in_recovery);
473 INIT_LIST_HEAD(&ls->ls_requestqueue);
474 mutex_init(&ls->ls_requestqueue_mutex);
475 mutex_init(&ls->ls_clear_proc_locks);
476
477 ls->ls_recover_buf = kmalloc(dlm_config.buffer_size, GFP_KERNEL);
478 if (!ls->ls_recover_buf)
479 goto out_dirfree;
480
481 INIT_LIST_HEAD(&ls->ls_recover_list);
482 spin_lock_init(&ls->ls_recover_list_lock);
483 ls->ls_recover_list_count = 0;
484 ls->ls_local_handle = ls;
485 init_waitqueue_head(&ls->ls_wait_general);
486 INIT_LIST_HEAD(&ls->ls_root_list);
487 init_rwsem(&ls->ls_root_sem);
488
489 down_write(&ls->ls_in_recovery);
490
491 spin_lock(&lslist_lock);
492 list_add(&ls->ls_list, &lslist);
493 spin_unlock(&lslist_lock);
494
495 /* needs to find ls in lslist */
496 error = dlm_recoverd_start(ls);
497 if (error) {
498 log_error(ls, "can't start dlm_recoverd %d", error);
499 goto out_rcomfree;
500 }
501
502 dlm_create_debug_file(ls);
503
504 error = kobject_setup(ls);
505 if (error)
506 goto out_del;
507
508 error = kobject_register(&ls->ls_kobj);
509 if (error)
510 goto out_del;
511
512 error = do_uevent(ls, 1);
513 if (error)
514 goto out_unreg;
515
516 *lockspace = ls;
517 return 0;
518
519 out_unreg:
520 kobject_unregister(&ls->ls_kobj);
521 out_del:
522 dlm_delete_debug_file(ls);
523 dlm_recoverd_stop(ls);
524 out_rcomfree:
525 spin_lock(&lslist_lock);
526 list_del(&ls->ls_list);
527 spin_unlock(&lslist_lock);
528 kfree(ls->ls_recover_buf);
529 out_dirfree:
530 kfree(ls->ls_dirtbl);
531 out_lkbfree:
532 kfree(ls->ls_lkbtbl);
533 out_rsbfree:
534 kfree(ls->ls_rsbtbl);
535 out_lsfree:
536 kfree(ls);
537 out:
538 module_put(THIS_MODULE);
539 return error;
540}
541
542int dlm_new_lockspace(char *name, int namelen, void **lockspace,
543 uint32_t flags, int lvblen)
544{
545 int error = 0;
546
547 mutex_lock(&ls_lock);
548 if (!ls_count)
549 error = threads_start();
550 if (error)
551 goto out;
552
553 error = new_lockspace(name, namelen, lockspace, flags, lvblen);
554 if (!error)
555 ls_count++;
556 out:
557 mutex_unlock(&ls_lock);
558 return error;
559}
560
561/* Return 1 if the lockspace still has active remote locks,
562 * 2 if the lockspace still has active local locks.
563 */
564static int lockspace_busy(struct dlm_ls *ls)
565{
566 int i, lkb_found = 0;
567 struct dlm_lkb *lkb;
568
569 /* NOTE: We check the lockidtbl here rather than the resource table.
570 This is because there may be LKBs queued as ASTs that have been
571 unlinked from their RSBs and are pending deletion once the AST has
572 been delivered */
573
574 for (i = 0; i < ls->ls_lkbtbl_size; i++) {
575 read_lock(&ls->ls_lkbtbl[i].lock);
576 if (!list_empty(&ls->ls_lkbtbl[i].list)) {
577 lkb_found = 1;
578 list_for_each_entry(lkb, &ls->ls_lkbtbl[i].list,
579 lkb_idtbl_list) {
580 if (!lkb->lkb_nodeid) {
581 read_unlock(&ls->ls_lkbtbl[i].lock);
582 return 2;
583 }
584 }
585 }
586 read_unlock(&ls->ls_lkbtbl[i].lock);
587 }
588 return lkb_found;
589}
590
591static int release_lockspace(struct dlm_ls *ls, int force)
592{
593 struct dlm_lkb *lkb;
594 struct dlm_rsb *rsb;
595 struct list_head *head;
596 int i;
597 int busy = lockspace_busy(ls);
598
599 if (busy > force)
600 return -EBUSY;
601
602 if (force < 3)
603 do_uevent(ls, 0);
604
605 dlm_recoverd_stop(ls);
606
607 remove_lockspace(ls);
608
609 dlm_delete_debug_file(ls);
610
611 dlm_astd_suspend();
612
613 kfree(ls->ls_recover_buf);
614
615 /*
616 * Free direntry structs.
617 */
618
619 dlm_dir_clear(ls);
620 kfree(ls->ls_dirtbl);
621
622 /*
623 * Free all lkb's on lkbtbl[] lists.
624 */
625
626 for (i = 0; i < ls->ls_lkbtbl_size; i++) {
627 head = &ls->ls_lkbtbl[i].list;
628 while (!list_empty(head)) {
629 lkb = list_entry(head->next, struct dlm_lkb,
630 lkb_idtbl_list);
631
632 list_del(&lkb->lkb_idtbl_list);
633
634 dlm_del_ast(lkb);
635
636 if (lkb->lkb_lvbptr && lkb->lkb_flags & DLM_IFL_MSTCPY)
637 free_lvb(lkb->lkb_lvbptr);
638
639 free_lkb(lkb);
640 }
641 }
642 dlm_astd_resume();
643
644 kfree(ls->ls_lkbtbl);
645
646 /*
647 * Free all rsb's on rsbtbl[] lists
648 */
649
650 for (i = 0; i < ls->ls_rsbtbl_size; i++) {
651 head = &ls->ls_rsbtbl[i].list;
652 while (!list_empty(head)) {
653 rsb = list_entry(head->next, struct dlm_rsb,
654 res_hashchain);
655
656 list_del(&rsb->res_hashchain);
657 free_rsb(rsb);
658 }
659
660 head = &ls->ls_rsbtbl[i].toss;
661 while (!list_empty(head)) {
662 rsb = list_entry(head->next, struct dlm_rsb,
663 res_hashchain);
664 list_del(&rsb->res_hashchain);
665 free_rsb(rsb);
666 }
667 }
668
669 kfree(ls->ls_rsbtbl);
670
671 /*
672 * Free structures on any other lists
673 */
674
675 kfree(ls->ls_recover_args);
676 dlm_clear_free_entries(ls);
677 dlm_clear_members(ls);
678 dlm_clear_members_gone(ls);
679 kfree(ls->ls_node_array);
680 kobject_unregister(&ls->ls_kobj);
681 kfree(ls);
682
683 mutex_lock(&ls_lock);
684 ls_count--;
685 if (!ls_count)
686 threads_stop();
687 mutex_unlock(&ls_lock);
688
689 module_put(THIS_MODULE);
690 return 0;
691}
692
693/*
694 * Called when a system has released all its locks and is not going to use the
695 * lockspace any longer. We free everything we're managing for this lockspace.
696 * Remaining nodes will go through the recovery process as if we'd died. The
697 * lockspace must continue to function as usual, participating in recoveries,
698 * until this returns.
699 *
700 * Force has 4 possible values:
701 * 0 - don't destroy locksapce if it has any LKBs
702 * 1 - destroy lockspace if it has remote LKBs but not if it has local LKBs
703 * 2 - destroy lockspace regardless of LKBs
704 * 3 - destroy lockspace as part of a forced shutdown
705 */
706
707int dlm_release_lockspace(void *lockspace, int force)
708{
709 struct dlm_ls *ls;
710
711 ls = dlm_find_lockspace_local(lockspace);
712 if (!ls)
713 return -EINVAL;
714 dlm_put_lockspace(ls);
715 return release_lockspace(ls, force);
716}
717
diff --git a/fs/dlm/lockspace.h b/fs/dlm/lockspace.h
new file mode 100644
index 000000000000..891eabbdd021
--- /dev/null
+++ b/fs/dlm/lockspace.h
@@ -0,0 +1,25 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#ifndef __LOCKSPACE_DOT_H__
15#define __LOCKSPACE_DOT_H__
16
17int dlm_lockspace_init(void);
18void dlm_lockspace_exit(void);
19struct dlm_ls *dlm_find_lockspace_global(uint32_t id);
20struct dlm_ls *dlm_find_lockspace_local(void *id);
21struct dlm_ls *dlm_find_lockspace_device(int minor);
22void dlm_put_lockspace(struct dlm_ls *ls);
23
24#endif /* __LOCKSPACE_DOT_H__ */
25
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
new file mode 100644
index 000000000000..23f5ce12080b
--- /dev/null
+++ b/fs/dlm/lowcomms.c
@@ -0,0 +1,1238 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14/*
15 * lowcomms.c
16 *
17 * This is the "low-level" comms layer.
18 *
19 * It is responsible for sending/receiving messages
20 * from other nodes in the cluster.
21 *
22 * Cluster nodes are referred to by their nodeids. nodeids are
23 * simply 32 bit numbers to the locking module - if they need to
24 * be expanded for the cluster infrastructure then that is it's
25 * responsibility. It is this layer's
26 * responsibility to resolve these into IP address or
27 * whatever it needs for inter-node communication.
28 *
29 * The comms level is two kernel threads that deal mainly with
30 * the receiving of messages from other nodes and passing them
31 * up to the mid-level comms layer (which understands the
32 * message format) for execution by the locking core, and
33 * a send thread which does all the setting up of connections
34 * to remote nodes and the sending of data. Threads are not allowed
35 * to send their own data because it may cause them to wait in times
36 * of high load. Also, this way, the sending thread can collect together
37 * messages bound for one node and send them in one block.
38 *
39 * I don't see any problem with the recv thread executing the locking
40 * code on behalf of remote processes as the locking code is
41 * short, efficient and never (well, hardly ever) waits.
42 *
43 */
44
45#include <asm/ioctls.h>
46#include <net/sock.h>
47#include <net/tcp.h>
48#include <net/sctp/user.h>
49#include <linux/pagemap.h>
50#include <linux/socket.h>
51#include <linux/idr.h>
52
53#include "dlm_internal.h"
54#include "lowcomms.h"
55#include "config.h"
56#include "midcomms.h"
57
58static struct sockaddr_storage *dlm_local_addr[DLM_MAX_ADDR_COUNT];
59static int dlm_local_count;
60static int dlm_local_nodeid;
61
62/* One of these per connected node */
63
64#define NI_INIT_PENDING 1
65#define NI_WRITE_PENDING 2
66
67struct nodeinfo {
68 spinlock_t lock;
69 sctp_assoc_t assoc_id;
70 unsigned long flags;
71 struct list_head write_list; /* nodes with pending writes */
72 struct list_head writequeue; /* outgoing writequeue_entries */
73 spinlock_t writequeue_lock;
74 int nodeid;
75};
76
77static DEFINE_IDR(nodeinfo_idr);
78static struct rw_semaphore nodeinfo_lock;
79static int max_nodeid;
80
81struct cbuf {
82 unsigned base;
83 unsigned len;
84 unsigned mask;
85};
86
87/* Just the one of these, now. But this struct keeps
88 the connection-specific variables together */
89
90#define CF_READ_PENDING 1
91
92struct connection {
93 struct socket *sock;
94 unsigned long flags;
95 struct page *rx_page;
96 atomic_t waiting_requests;
97 struct cbuf cb;
98 int eagain_flag;
99};
100
101/* An entry waiting to be sent */
102
103struct writequeue_entry {
104 struct list_head list;
105 struct page *page;
106 int offset;
107 int len;
108 int end;
109 int users;
110 struct nodeinfo *ni;
111};
112
113#define CBUF_ADD(cb, n) do { (cb)->len += n; } while(0)
114#define CBUF_EMPTY(cb) ((cb)->len == 0)
115#define CBUF_MAY_ADD(cb, n) (((cb)->len + (n)) < ((cb)->mask + 1))
116#define CBUF_DATA(cb) (((cb)->base + (cb)->len) & (cb)->mask)
117
118#define CBUF_INIT(cb, size) \
119do { \
120 (cb)->base = (cb)->len = 0; \
121 (cb)->mask = ((size)-1); \
122} while(0)
123
124#define CBUF_EAT(cb, n) \
125do { \
126 (cb)->len -= (n); \
127 (cb)->base += (n); \
128 (cb)->base &= (cb)->mask; \
129} while(0)
130
131
132/* List of nodes which have writes pending */
133static struct list_head write_nodes;
134static spinlock_t write_nodes_lock;
135
136/* Maximum number of incoming messages to process before
137 * doing a schedule()
138 */
139#define MAX_RX_MSG_COUNT 25
140
141/* Manage daemons */
142static struct task_struct *recv_task;
143static struct task_struct *send_task;
144static wait_queue_head_t lowcomms_recv_wait;
145static atomic_t accepting;
146
147/* The SCTP connection */
148static struct connection sctp_con;
149
150
151static int nodeid_to_addr(int nodeid, struct sockaddr *retaddr)
152{
153 struct sockaddr_storage addr;
154 int error;
155
156 if (!dlm_local_count)
157 return -1;
158
159 error = dlm_nodeid_to_addr(nodeid, &addr);
160 if (error)
161 return error;
162
163 if (dlm_local_addr[0]->ss_family == AF_INET) {
164 struct sockaddr_in *in4 = (struct sockaddr_in *) &addr;
165 struct sockaddr_in *ret4 = (struct sockaddr_in *) retaddr;
166 ret4->sin_addr.s_addr = in4->sin_addr.s_addr;
167 } else {
168 struct sockaddr_in6 *in6 = (struct sockaddr_in6 *) &addr;
169 struct sockaddr_in6 *ret6 = (struct sockaddr_in6 *) retaddr;
170 memcpy(&ret6->sin6_addr, &in6->sin6_addr,
171 sizeof(in6->sin6_addr));
172 }
173
174 return 0;
175}
176
177static struct nodeinfo *nodeid2nodeinfo(int nodeid, int alloc)
178{
179 struct nodeinfo *ni;
180 int r;
181 int n;
182
183 down_read(&nodeinfo_lock);
184 ni = idr_find(&nodeinfo_idr, nodeid);
185 up_read(&nodeinfo_lock);
186
187 if (!ni && alloc) {
188 down_write(&nodeinfo_lock);
189
190 ni = idr_find(&nodeinfo_idr, nodeid);
191 if (ni)
192 goto out_up;
193
194 r = idr_pre_get(&nodeinfo_idr, alloc);
195 if (!r)
196 goto out_up;
197
198 ni = kmalloc(sizeof(struct nodeinfo), alloc);
199 if (!ni)
200 goto out_up;
201
202 r = idr_get_new_above(&nodeinfo_idr, ni, nodeid, &n);
203 if (r) {
204 kfree(ni);
205 ni = NULL;
206 goto out_up;
207 }
208 if (n != nodeid) {
209 idr_remove(&nodeinfo_idr, n);
210 kfree(ni);
211 ni = NULL;
212 goto out_up;
213 }
214 memset(ni, 0, sizeof(struct nodeinfo));
215 spin_lock_init(&ni->lock);
216 INIT_LIST_HEAD(&ni->writequeue);
217 spin_lock_init(&ni->writequeue_lock);
218 ni->nodeid = nodeid;
219
220 if (nodeid > max_nodeid)
221 max_nodeid = nodeid;
222 out_up:
223 up_write(&nodeinfo_lock);
224 }
225
226 return ni;
227}
228
229/* Don't call this too often... */
230static struct nodeinfo *assoc2nodeinfo(sctp_assoc_t assoc)
231{
232 int i;
233 struct nodeinfo *ni;
234
235 for (i=1; i<=max_nodeid; i++) {
236 ni = nodeid2nodeinfo(i, 0);
237 if (ni && ni->assoc_id == assoc)
238 return ni;
239 }
240 return NULL;
241}
242
243/* Data or notification available on socket */
244static void lowcomms_data_ready(struct sock *sk, int count_unused)
245{
246 atomic_inc(&sctp_con.waiting_requests);
247 if (test_and_set_bit(CF_READ_PENDING, &sctp_con.flags))
248 return;
249
250 wake_up_interruptible(&lowcomms_recv_wait);
251}
252
253
254/* Add the port number to an IP6 or 4 sockaddr and return the address length.
255 Also padd out the struct with zeros to make comparisons meaningful */
256
257static void make_sockaddr(struct sockaddr_storage *saddr, uint16_t port,
258 int *addr_len)
259{
260 struct sockaddr_in *local4_addr;
261 struct sockaddr_in6 *local6_addr;
262
263 if (!dlm_local_count)
264 return;
265
266 if (!port) {
267 if (dlm_local_addr[0]->ss_family == AF_INET) {
268 local4_addr = (struct sockaddr_in *)dlm_local_addr[0];
269 port = be16_to_cpu(local4_addr->sin_port);
270 } else {
271 local6_addr = (struct sockaddr_in6 *)dlm_local_addr[0];
272 port = be16_to_cpu(local6_addr->sin6_port);
273 }
274 }
275
276 saddr->ss_family = dlm_local_addr[0]->ss_family;
277 if (dlm_local_addr[0]->ss_family == AF_INET) {
278 struct sockaddr_in *in4_addr = (struct sockaddr_in *)saddr;
279 in4_addr->sin_port = cpu_to_be16(port);
280 memset(&in4_addr->sin_zero, 0, sizeof(in4_addr->sin_zero));
281 memset(in4_addr+1, 0, sizeof(struct sockaddr_storage) -
282 sizeof(struct sockaddr_in));
283 *addr_len = sizeof(struct sockaddr_in);
284 } else {
285 struct sockaddr_in6 *in6_addr = (struct sockaddr_in6 *)saddr;
286 in6_addr->sin6_port = cpu_to_be16(port);
287 memset(in6_addr+1, 0, sizeof(struct sockaddr_storage) -
288 sizeof(struct sockaddr_in6));
289 *addr_len = sizeof(struct sockaddr_in6);
290 }
291}
292
293/* Close the connection and tidy up */
294static void close_connection(void)
295{
296 if (sctp_con.sock) {
297 sock_release(sctp_con.sock);
298 sctp_con.sock = NULL;
299 }
300
301 if (sctp_con.rx_page) {
302 __free_page(sctp_con.rx_page);
303 sctp_con.rx_page = NULL;
304 }
305}
306
307/* We only send shutdown messages to nodes that are not part of the cluster */
308static void send_shutdown(sctp_assoc_t associd)
309{
310 static char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
311 struct msghdr outmessage;
312 struct cmsghdr *cmsg;
313 struct sctp_sndrcvinfo *sinfo;
314 int ret;
315
316 outmessage.msg_name = NULL;
317 outmessage.msg_namelen = 0;
318 outmessage.msg_control = outcmsg;
319 outmessage.msg_controllen = sizeof(outcmsg);
320 outmessage.msg_flags = MSG_EOR;
321
322 cmsg = CMSG_FIRSTHDR(&outmessage);
323 cmsg->cmsg_level = IPPROTO_SCTP;
324 cmsg->cmsg_type = SCTP_SNDRCV;
325 cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
326 outmessage.msg_controllen = cmsg->cmsg_len;
327 sinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg);
328 memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo));
329
330 sinfo->sinfo_flags |= MSG_EOF;
331 sinfo->sinfo_assoc_id = associd;
332
333 ret = kernel_sendmsg(sctp_con.sock, &outmessage, NULL, 0, 0);
334
335 if (ret != 0)
336 log_print("send EOF to node failed: %d", ret);
337}
338
339
340/* INIT failed but we don't know which node...
341 restart INIT on all pending nodes */
342static void init_failed(void)
343{
344 int i;
345 struct nodeinfo *ni;
346
347 for (i=1; i<=max_nodeid; i++) {
348 ni = nodeid2nodeinfo(i, 0);
349 if (!ni)
350 continue;
351
352 if (test_and_clear_bit(NI_INIT_PENDING, &ni->flags)) {
353 ni->assoc_id = 0;
354 if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
355 spin_lock_bh(&write_nodes_lock);
356 list_add_tail(&ni->write_list, &write_nodes);
357 spin_unlock_bh(&write_nodes_lock);
358 }
359 }
360 }
361 wake_up_process(send_task);
362}
363
364/* Something happened to an association */
365static void process_sctp_notification(struct msghdr *msg, char *buf)
366{
367 union sctp_notification *sn = (union sctp_notification *)buf;
368
369 if (sn->sn_header.sn_type == SCTP_ASSOC_CHANGE) {
370 switch (sn->sn_assoc_change.sac_state) {
371
372 case SCTP_COMM_UP:
373 case SCTP_RESTART:
374 {
375 /* Check that the new node is in the lockspace */
376 struct sctp_prim prim;
377 mm_segment_t fs;
378 int nodeid;
379 int prim_len, ret;
380 int addr_len;
381 struct nodeinfo *ni;
382
383 /* This seems to happen when we received a connection
384 * too early... or something... anyway, it happens but
385 * we always seem to get a real message too, see
386 * receive_from_sock */
387
388 if ((int)sn->sn_assoc_change.sac_assoc_id <= 0) {
389 log_print("COMM_UP for invalid assoc ID %d",
390 (int)sn->sn_assoc_change.sac_assoc_id);
391 init_failed();
392 return;
393 }
394 memset(&prim, 0, sizeof(struct sctp_prim));
395 prim_len = sizeof(struct sctp_prim);
396 prim.ssp_assoc_id = sn->sn_assoc_change.sac_assoc_id;
397
398 fs = get_fs();
399 set_fs(get_ds());
400 ret = sctp_con.sock->ops->getsockopt(sctp_con.sock,
401 IPPROTO_SCTP, SCTP_PRIMARY_ADDR,
402 (char*)&prim, &prim_len);
403 set_fs(fs);
404 if (ret < 0) {
405 struct nodeinfo *ni;
406
407 log_print("getsockopt/sctp_primary_addr on "
408 "new assoc %d failed : %d",
409 (int)sn->sn_assoc_change.sac_assoc_id, ret);
410
411 /* Retry INIT later */
412 ni = assoc2nodeinfo(sn->sn_assoc_change.sac_assoc_id);
413 if (ni)
414 clear_bit(NI_INIT_PENDING, &ni->flags);
415 return;
416 }
417 make_sockaddr(&prim.ssp_addr, 0, &addr_len);
418 if (dlm_addr_to_nodeid(&prim.ssp_addr, &nodeid)) {
419 log_print("reject connect from unknown addr");
420 send_shutdown(prim.ssp_assoc_id);
421 return;
422 }
423
424 ni = nodeid2nodeinfo(nodeid, GFP_KERNEL);
425 if (!ni)
426 return;
427
428 /* Save the assoc ID */
429 spin_lock(&ni->lock);
430 ni->assoc_id = sn->sn_assoc_change.sac_assoc_id;
431 spin_unlock(&ni->lock);
432
433 log_print("got new/restarted association %d nodeid %d",
434 (int)sn->sn_assoc_change.sac_assoc_id, nodeid);
435
436 /* Send any pending writes */
437 clear_bit(NI_INIT_PENDING, &ni->flags);
438 if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
439 spin_lock_bh(&write_nodes_lock);
440 list_add_tail(&ni->write_list, &write_nodes);
441 spin_unlock_bh(&write_nodes_lock);
442 }
443 wake_up_process(send_task);
444 }
445 break;
446
447 case SCTP_COMM_LOST:
448 case SCTP_SHUTDOWN_COMP:
449 {
450 struct nodeinfo *ni;
451
452 ni = assoc2nodeinfo(sn->sn_assoc_change.sac_assoc_id);
453 if (ni) {
454 spin_lock(&ni->lock);
455 ni->assoc_id = 0;
456 spin_unlock(&ni->lock);
457 }
458 }
459 break;
460
461 /* We don't know which INIT failed, so clear the PENDING flags
462 * on them all. if assoc_id is zero then it will then try
463 * again */
464
465 case SCTP_CANT_STR_ASSOC:
466 {
467 log_print("Can't start SCTP association - retrying");
468 init_failed();
469 }
470 break;
471
472 default:
473 log_print("unexpected SCTP assoc change id=%d state=%d",
474 (int)sn->sn_assoc_change.sac_assoc_id,
475 sn->sn_assoc_change.sac_state);
476 }
477 }
478}
479
480/* Data received from remote end */
481static int receive_from_sock(void)
482{
483 int ret = 0;
484 struct msghdr msg;
485 struct kvec iov[2];
486 unsigned len;
487 int r;
488 struct sctp_sndrcvinfo *sinfo;
489 struct cmsghdr *cmsg;
490 struct nodeinfo *ni;
491
492 /* These two are marginally too big for stack allocation, but this
493 * function is (currently) only called by dlm_recvd so static should be
494 * OK.
495 */
496 static struct sockaddr_storage msgname;
497 static char incmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
498
499 if (sctp_con.sock == NULL)
500 goto out;
501
502 if (sctp_con.rx_page == NULL) {
503 /*
504 * This doesn't need to be atomic, but I think it should
505 * improve performance if it is.
506 */
507 sctp_con.rx_page = alloc_page(GFP_ATOMIC);
508 if (sctp_con.rx_page == NULL)
509 goto out_resched;
510 CBUF_INIT(&sctp_con.cb, PAGE_CACHE_SIZE);
511 }
512
513 memset(&incmsg, 0, sizeof(incmsg));
514 memset(&msgname, 0, sizeof(msgname));
515
516 memset(incmsg, 0, sizeof(incmsg));
517 msg.msg_name = &msgname;
518 msg.msg_namelen = sizeof(msgname);
519 msg.msg_flags = 0;
520 msg.msg_control = incmsg;
521 msg.msg_controllen = sizeof(incmsg);
522
523 /* I don't see why this circular buffer stuff is necessary for SCTP
524 * which is a packet-based protocol, but the whole thing breaks under
525 * load without it! The overhead is minimal (and is in the TCP lowcomms
526 * anyway, of course) so I'll leave it in until I can figure out what's
527 * really happening.
528 */
529
530 /*
531 * iov[0] is the bit of the circular buffer between the current end
532 * point (cb.base + cb.len) and the end of the buffer.
533 */
534 iov[0].iov_len = sctp_con.cb.base - CBUF_DATA(&sctp_con.cb);
535 iov[0].iov_base = page_address(sctp_con.rx_page) +
536 CBUF_DATA(&sctp_con.cb);
537 iov[1].iov_len = 0;
538
539 /*
540 * iov[1] is the bit of the circular buffer between the start of the
541 * buffer and the start of the currently used section (cb.base)
542 */
543 if (CBUF_DATA(&sctp_con.cb) >= sctp_con.cb.base) {
544 iov[0].iov_len = PAGE_CACHE_SIZE - CBUF_DATA(&sctp_con.cb);
545 iov[1].iov_len = sctp_con.cb.base;
546 iov[1].iov_base = page_address(sctp_con.rx_page);
547 msg.msg_iovlen = 2;
548 }
549 len = iov[0].iov_len + iov[1].iov_len;
550
551 r = ret = kernel_recvmsg(sctp_con.sock, &msg, iov, 1, len,
552 MSG_NOSIGNAL | MSG_DONTWAIT);
553 if (ret <= 0)
554 goto out_close;
555
556 msg.msg_control = incmsg;
557 msg.msg_controllen = sizeof(incmsg);
558 cmsg = CMSG_FIRSTHDR(&msg);
559 sinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg);
560
561 if (msg.msg_flags & MSG_NOTIFICATION) {
562 process_sctp_notification(&msg, page_address(sctp_con.rx_page));
563 return 0;
564 }
565
566 /* Is this a new association ? */
567 ni = nodeid2nodeinfo(le32_to_cpu(sinfo->sinfo_ppid), GFP_KERNEL);
568 if (ni) {
569 ni->assoc_id = sinfo->sinfo_assoc_id;
570 if (test_and_clear_bit(NI_INIT_PENDING, &ni->flags)) {
571
572 if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
573 spin_lock_bh(&write_nodes_lock);
574 list_add_tail(&ni->write_list, &write_nodes);
575 spin_unlock_bh(&write_nodes_lock);
576 }
577 wake_up_process(send_task);
578 }
579 }
580
581 /* INIT sends a message with length of 1 - ignore it */
582 if (r == 1)
583 return 0;
584
585 CBUF_ADD(&sctp_con.cb, ret);
586 ret = dlm_process_incoming_buffer(cpu_to_le32(sinfo->sinfo_ppid),
587 page_address(sctp_con.rx_page),
588 sctp_con.cb.base, sctp_con.cb.len,
589 PAGE_CACHE_SIZE);
590 if (ret < 0)
591 goto out_close;
592 CBUF_EAT(&sctp_con.cb, ret);
593
594 out:
595 ret = 0;
596 goto out_ret;
597
598 out_resched:
599 lowcomms_data_ready(sctp_con.sock->sk, 0);
600 ret = 0;
601 schedule();
602 goto out_ret;
603
604 out_close:
605 if (ret != -EAGAIN)
606 log_print("error reading from sctp socket: %d", ret);
607 out_ret:
608 return ret;
609}
610
611/* Bind to an IP address. SCTP allows multiple address so it can do multi-homing */
612static int add_bind_addr(struct sockaddr_storage *addr, int addr_len, int num)
613{
614 mm_segment_t fs;
615 int result = 0;
616
617 fs = get_fs();
618 set_fs(get_ds());
619 if (num == 1)
620 result = sctp_con.sock->ops->bind(sctp_con.sock,
621 (struct sockaddr *) addr, addr_len);
622 else
623 result = sctp_con.sock->ops->setsockopt(sctp_con.sock, SOL_SCTP,
624 SCTP_SOCKOPT_BINDX_ADD, (char *)addr, addr_len);
625 set_fs(fs);
626
627 if (result < 0)
628 log_print("Can't bind to port %d addr number %d",
629 dlm_config.tcp_port, num);
630
631 return result;
632}
633
634static void init_local(void)
635{
636 struct sockaddr_storage sas, *addr;
637 int i;
638
639 dlm_local_nodeid = dlm_our_nodeid();
640
641 for (i = 0; i < DLM_MAX_ADDR_COUNT - 1; i++) {
642 if (dlm_our_addr(&sas, i))
643 break;
644
645 addr = kmalloc(sizeof(*addr), GFP_KERNEL);
646 if (!addr)
647 break;
648 memcpy(addr, &sas, sizeof(*addr));
649 dlm_local_addr[dlm_local_count++] = addr;
650 }
651}
652
653/* Initialise SCTP socket and bind to all interfaces */
654static int init_sock(void)
655{
656 mm_segment_t fs;
657 struct socket *sock = NULL;
658 struct sockaddr_storage localaddr;
659 struct sctp_event_subscribe subscribe;
660 int result = -EINVAL, num = 1, i, addr_len;
661
662 if (!dlm_local_count) {
663 init_local();
664 if (!dlm_local_count) {
665 log_print("no local IP address has been set");
666 goto out;
667 }
668 }
669
670 result = sock_create_kern(dlm_local_addr[0]->ss_family, SOCK_SEQPACKET,
671 IPPROTO_SCTP, &sock);
672 if (result < 0) {
673 log_print("Can't create comms socket, check SCTP is loaded");
674 goto out;
675 }
676
677 /* Listen for events */
678 memset(&subscribe, 0, sizeof(subscribe));
679 subscribe.sctp_data_io_event = 1;
680 subscribe.sctp_association_event = 1;
681 subscribe.sctp_send_failure_event = 1;
682 subscribe.sctp_shutdown_event = 1;
683 subscribe.sctp_partial_delivery_event = 1;
684
685 fs = get_fs();
686 set_fs(get_ds());
687 result = sock->ops->setsockopt(sock, SOL_SCTP, SCTP_EVENTS,
688 (char *)&subscribe, sizeof(subscribe));
689 set_fs(fs);
690
691 if (result < 0) {
692 log_print("Failed to set SCTP_EVENTS on socket: result=%d",
693 result);
694 goto create_delsock;
695 }
696
697 /* Init con struct */
698 sock->sk->sk_user_data = &sctp_con;
699 sctp_con.sock = sock;
700 sctp_con.sock->sk->sk_data_ready = lowcomms_data_ready;
701
702 /* Bind to all interfaces. */
703 for (i = 0; i < dlm_local_count; i++) {
704 memcpy(&localaddr, dlm_local_addr[i], sizeof(localaddr));
705 make_sockaddr(&localaddr, dlm_config.tcp_port, &addr_len);
706
707 result = add_bind_addr(&localaddr, addr_len, num);
708 if (result)
709 goto create_delsock;
710 ++num;
711 }
712
713 result = sock->ops->listen(sock, 5);
714 if (result < 0) {
715 log_print("Can't set socket listening");
716 goto create_delsock;
717 }
718
719 return 0;
720
721 create_delsock:
722 sock_release(sock);
723 sctp_con.sock = NULL;
724 out:
725 return result;
726}
727
728
729static struct writequeue_entry *new_writequeue_entry(int allocation)
730{
731 struct writequeue_entry *entry;
732
733 entry = kmalloc(sizeof(struct writequeue_entry), allocation);
734 if (!entry)
735 return NULL;
736
737 entry->page = alloc_page(allocation);
738 if (!entry->page) {
739 kfree(entry);
740 return NULL;
741 }
742
743 entry->offset = 0;
744 entry->len = 0;
745 entry->end = 0;
746 entry->users = 0;
747
748 return entry;
749}
750
751void *dlm_lowcomms_get_buffer(int nodeid, int len, int allocation, char **ppc)
752{
753 struct writequeue_entry *e;
754 int offset = 0;
755 int users = 0;
756 struct nodeinfo *ni;
757
758 if (!atomic_read(&accepting))
759 return NULL;
760
761 ni = nodeid2nodeinfo(nodeid, allocation);
762 if (!ni)
763 return NULL;
764
765 spin_lock(&ni->writequeue_lock);
766 e = list_entry(ni->writequeue.prev, struct writequeue_entry, list);
767 if (((struct list_head *) e == &ni->writequeue) ||
768 (PAGE_CACHE_SIZE - e->end < len)) {
769 e = NULL;
770 } else {
771 offset = e->end;
772 e->end += len;
773 users = e->users++;
774 }
775 spin_unlock(&ni->writequeue_lock);
776
777 if (e) {
778 got_one:
779 if (users == 0)
780 kmap(e->page);
781 *ppc = page_address(e->page) + offset;
782 return e;
783 }
784
785 e = new_writequeue_entry(allocation);
786 if (e) {
787 spin_lock(&ni->writequeue_lock);
788 offset = e->end;
789 e->end += len;
790 e->ni = ni;
791 users = e->users++;
792 list_add_tail(&e->list, &ni->writequeue);
793 spin_unlock(&ni->writequeue_lock);
794 goto got_one;
795 }
796 return NULL;
797}
798
799void dlm_lowcomms_commit_buffer(void *arg)
800{
801 struct writequeue_entry *e = (struct writequeue_entry *) arg;
802 int users;
803 struct nodeinfo *ni = e->ni;
804
805 if (!atomic_read(&accepting))
806 return;
807
808 spin_lock(&ni->writequeue_lock);
809 users = --e->users;
810 if (users)
811 goto out;
812 e->len = e->end - e->offset;
813 kunmap(e->page);
814 spin_unlock(&ni->writequeue_lock);
815
816 if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
817 spin_lock_bh(&write_nodes_lock);
818 list_add_tail(&ni->write_list, &write_nodes);
819 spin_unlock_bh(&write_nodes_lock);
820 wake_up_process(send_task);
821 }
822 return;
823
824 out:
825 spin_unlock(&ni->writequeue_lock);
826 return;
827}
828
829static void free_entry(struct writequeue_entry *e)
830{
831 __free_page(e->page);
832 kfree(e);
833}
834
835/* Initiate an SCTP association. In theory we could just use sendmsg() on
836 the first IP address and it should work, but this allows us to set up the
837 association before sending any valuable data that we can't afford to lose.
838 It also keeps the send path clean as it can now always use the association ID */
839static void initiate_association(int nodeid)
840{
841 struct sockaddr_storage rem_addr;
842 static char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
843 struct msghdr outmessage;
844 struct cmsghdr *cmsg;
845 struct sctp_sndrcvinfo *sinfo;
846 int ret;
847 int addrlen;
848 char buf[1];
849 struct kvec iov[1];
850 struct nodeinfo *ni;
851
852 log_print("Initiating association with node %d", nodeid);
853
854 ni = nodeid2nodeinfo(nodeid, GFP_KERNEL);
855 if (!ni)
856 return;
857
858 if (nodeid_to_addr(nodeid, (struct sockaddr *)&rem_addr)) {
859 log_print("no address for nodeid %d", nodeid);
860 return;
861 }
862
863 make_sockaddr(&rem_addr, dlm_config.tcp_port, &addrlen);
864
865 outmessage.msg_name = &rem_addr;
866 outmessage.msg_namelen = addrlen;
867 outmessage.msg_control = outcmsg;
868 outmessage.msg_controllen = sizeof(outcmsg);
869 outmessage.msg_flags = MSG_EOR;
870
871 iov[0].iov_base = buf;
872 iov[0].iov_len = 1;
873
874 /* Real INIT messages seem to cause trouble. Just send a 1 byte message
875 we can afford to lose */
876 cmsg = CMSG_FIRSTHDR(&outmessage);
877 cmsg->cmsg_level = IPPROTO_SCTP;
878 cmsg->cmsg_type = SCTP_SNDRCV;
879 cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
880 sinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg);
881 memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo));
882 sinfo->sinfo_ppid = cpu_to_le32(dlm_local_nodeid);
883
884 outmessage.msg_controllen = cmsg->cmsg_len;
885 ret = kernel_sendmsg(sctp_con.sock, &outmessage, iov, 1, 1);
886 if (ret < 0) {
887 log_print("send INIT to node failed: %d", ret);
888 /* Try again later */
889 clear_bit(NI_INIT_PENDING, &ni->flags);
890 }
891}
892
893/* Send a message */
894static int send_to_sock(struct nodeinfo *ni)
895{
896 int ret = 0;
897 struct writequeue_entry *e;
898 int len, offset;
899 struct msghdr outmsg;
900 static char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
901 struct cmsghdr *cmsg;
902 struct sctp_sndrcvinfo *sinfo;
903 struct kvec iov;
904
905 /* See if we need to init an association before we start
906 sending precious messages */
907 spin_lock(&ni->lock);
908 if (!ni->assoc_id && !test_and_set_bit(NI_INIT_PENDING, &ni->flags)) {
909 spin_unlock(&ni->lock);
910 initiate_association(ni->nodeid);
911 return 0;
912 }
913 spin_unlock(&ni->lock);
914
915 outmsg.msg_name = NULL; /* We use assoc_id */
916 outmsg.msg_namelen = 0;
917 outmsg.msg_control = outcmsg;
918 outmsg.msg_controllen = sizeof(outcmsg);
919 outmsg.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL | MSG_EOR;
920
921 cmsg = CMSG_FIRSTHDR(&outmsg);
922 cmsg->cmsg_level = IPPROTO_SCTP;
923 cmsg->cmsg_type = SCTP_SNDRCV;
924 cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
925 sinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg);
926 memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo));
927 sinfo->sinfo_ppid = cpu_to_le32(dlm_local_nodeid);
928 sinfo->sinfo_assoc_id = ni->assoc_id;
929 outmsg.msg_controllen = cmsg->cmsg_len;
930
931 spin_lock(&ni->writequeue_lock);
932 for (;;) {
933 if (list_empty(&ni->writequeue))
934 break;
935 e = list_entry(ni->writequeue.next, struct writequeue_entry,
936 list);
937 len = e->len;
938 offset = e->offset;
939 BUG_ON(len == 0 && e->users == 0);
940 spin_unlock(&ni->writequeue_lock);
941 kmap(e->page);
942
943 ret = 0;
944 if (len) {
945 iov.iov_base = page_address(e->page)+offset;
946 iov.iov_len = len;
947
948 ret = kernel_sendmsg(sctp_con.sock, &outmsg, &iov, 1,
949 len);
950 if (ret == -EAGAIN) {
951 sctp_con.eagain_flag = 1;
952 goto out;
953 } else if (ret < 0)
954 goto send_error;
955 } else {
956 /* Don't starve people filling buffers */
957 schedule();
958 }
959
960 spin_lock(&ni->writequeue_lock);
961 e->offset += ret;
962 e->len -= ret;
963
964 if (e->len == 0 && e->users == 0) {
965 list_del(&e->list);
966 free_entry(e);
967 continue;
968 }
969 }
970 spin_unlock(&ni->writequeue_lock);
971 out:
972 return ret;
973
974 send_error:
975 log_print("Error sending to node %d %d", ni->nodeid, ret);
976 spin_lock(&ni->lock);
977 if (!test_and_set_bit(NI_INIT_PENDING, &ni->flags)) {
978 ni->assoc_id = 0;
979 spin_unlock(&ni->lock);
980 initiate_association(ni->nodeid);
981 } else
982 spin_unlock(&ni->lock);
983
984 return ret;
985}
986
987/* Try to send any messages that are pending */
988static void process_output_queue(void)
989{
990 struct list_head *list;
991 struct list_head *temp;
992
993 spin_lock_bh(&write_nodes_lock);
994 list_for_each_safe(list, temp, &write_nodes) {
995 struct nodeinfo *ni =
996 list_entry(list, struct nodeinfo, write_list);
997 clear_bit(NI_WRITE_PENDING, &ni->flags);
998 list_del(&ni->write_list);
999
1000 spin_unlock_bh(&write_nodes_lock);
1001
1002 send_to_sock(ni);
1003 spin_lock_bh(&write_nodes_lock);
1004 }
1005 spin_unlock_bh(&write_nodes_lock);
1006}
1007
1008/* Called after we've had -EAGAIN and been woken up */
1009static void refill_write_queue(void)
1010{
1011 int i;
1012
1013 for (i=1; i<=max_nodeid; i++) {
1014 struct nodeinfo *ni = nodeid2nodeinfo(i, 0);
1015
1016 if (ni) {
1017 if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
1018 spin_lock_bh(&write_nodes_lock);
1019 list_add_tail(&ni->write_list, &write_nodes);
1020 spin_unlock_bh(&write_nodes_lock);
1021 }
1022 }
1023 }
1024}
1025
1026static void clean_one_writequeue(struct nodeinfo *ni)
1027{
1028 struct list_head *list;
1029 struct list_head *temp;
1030
1031 spin_lock(&ni->writequeue_lock);
1032 list_for_each_safe(list, temp, &ni->writequeue) {
1033 struct writequeue_entry *e =
1034 list_entry(list, struct writequeue_entry, list);
1035 list_del(&e->list);
1036 free_entry(e);
1037 }
1038 spin_unlock(&ni->writequeue_lock);
1039}
1040
1041static void clean_writequeues(void)
1042{
1043 int i;
1044
1045 for (i=1; i<=max_nodeid; i++) {
1046 struct nodeinfo *ni = nodeid2nodeinfo(i, 0);
1047 if (ni)
1048 clean_one_writequeue(ni);
1049 }
1050}
1051
1052
1053static void dealloc_nodeinfo(void)
1054{
1055 int i;
1056
1057 for (i=1; i<=max_nodeid; i++) {
1058 struct nodeinfo *ni = nodeid2nodeinfo(i, 0);
1059 if (ni) {
1060 idr_remove(&nodeinfo_idr, i);
1061 kfree(ni);
1062 }
1063 }
1064}
1065
1066int dlm_lowcomms_close(int nodeid)
1067{
1068 struct nodeinfo *ni;
1069
1070 ni = nodeid2nodeinfo(nodeid, 0);
1071 if (!ni)
1072 return -1;
1073
1074 spin_lock(&ni->lock);
1075 if (ni->assoc_id) {
1076 ni->assoc_id = 0;
1077 /* Don't send shutdown here, sctp will just queue it
1078 till the node comes back up! */
1079 }
1080 spin_unlock(&ni->lock);
1081
1082 clean_one_writequeue(ni);
1083 clear_bit(NI_INIT_PENDING, &ni->flags);
1084 return 0;
1085}
1086
1087static int write_list_empty(void)
1088{
1089 int status;
1090
1091 spin_lock_bh(&write_nodes_lock);
1092 status = list_empty(&write_nodes);
1093 spin_unlock_bh(&write_nodes_lock);
1094
1095 return status;
1096}
1097
1098static int dlm_recvd(void *data)
1099{
1100 DECLARE_WAITQUEUE(wait, current);
1101
1102 while (!kthread_should_stop()) {
1103 int count = 0;
1104
1105 set_current_state(TASK_INTERRUPTIBLE);
1106 add_wait_queue(&lowcomms_recv_wait, &wait);
1107 if (!test_bit(CF_READ_PENDING, &sctp_con.flags))
1108 schedule();
1109 remove_wait_queue(&lowcomms_recv_wait, &wait);
1110 set_current_state(TASK_RUNNING);
1111
1112 if (test_and_clear_bit(CF_READ_PENDING, &sctp_con.flags)) {
1113 int ret;
1114
1115 do {
1116 ret = receive_from_sock();
1117
1118 /* Don't starve out everyone else */
1119 if (++count >= MAX_RX_MSG_COUNT) {
1120 schedule();
1121 count = 0;
1122 }
1123 } while (!kthread_should_stop() && ret >=0);
1124 }
1125 schedule();
1126 }
1127
1128 return 0;
1129}
1130
1131static int dlm_sendd(void *data)
1132{
1133 DECLARE_WAITQUEUE(wait, current);
1134
1135 add_wait_queue(sctp_con.sock->sk->sk_sleep, &wait);
1136
1137 while (!kthread_should_stop()) {
1138 set_current_state(TASK_INTERRUPTIBLE);
1139 if (write_list_empty())
1140 schedule();
1141 set_current_state(TASK_RUNNING);
1142
1143 if (sctp_con.eagain_flag) {
1144 sctp_con.eagain_flag = 0;
1145 refill_write_queue();
1146 }
1147 process_output_queue();
1148 }
1149
1150 remove_wait_queue(sctp_con.sock->sk->sk_sleep, &wait);
1151
1152 return 0;
1153}
1154
1155static void daemons_stop(void)
1156{
1157 kthread_stop(recv_task);
1158 kthread_stop(send_task);
1159}
1160
1161static int daemons_start(void)
1162{
1163 struct task_struct *p;
1164 int error;
1165
1166 p = kthread_run(dlm_recvd, NULL, "dlm_recvd");
1167 error = IS_ERR(p);
1168 if (error) {
1169 log_print("can't start dlm_recvd %d", error);
1170 return error;
1171 }
1172 recv_task = p;
1173
1174 p = kthread_run(dlm_sendd, NULL, "dlm_sendd");
1175 error = IS_ERR(p);
1176 if (error) {
1177 log_print("can't start dlm_sendd %d", error);
1178 kthread_stop(recv_task);
1179 return error;
1180 }
1181 send_task = p;
1182
1183 return 0;
1184}
1185
1186/*
1187 * This is quite likely to sleep...
1188 */
1189int dlm_lowcomms_start(void)
1190{
1191 int error;
1192
1193 error = init_sock();
1194 if (error)
1195 goto fail_sock;
1196 error = daemons_start();
1197 if (error)
1198 goto fail_sock;
1199 atomic_set(&accepting, 1);
1200 return 0;
1201
1202 fail_sock:
1203 close_connection();
1204 return error;
1205}
1206
1207/* Set all the activity flags to prevent any socket activity. */
1208
1209void dlm_lowcomms_stop(void)
1210{
1211 atomic_set(&accepting, 0);
1212 sctp_con.flags = 0x7;
1213 daemons_stop();
1214 clean_writequeues();
1215 close_connection();
1216 dealloc_nodeinfo();
1217 max_nodeid = 0;
1218}
1219
1220int dlm_lowcomms_init(void)
1221{
1222 init_waitqueue_head(&lowcomms_recv_wait);
1223 spin_lock_init(&write_nodes_lock);
1224 INIT_LIST_HEAD(&write_nodes);
1225 init_rwsem(&nodeinfo_lock);
1226 return 0;
1227}
1228
1229void dlm_lowcomms_exit(void)
1230{
1231 int i;
1232
1233 for (i = 0; i < dlm_local_count; i++)
1234 kfree(dlm_local_addr[i]);
1235 dlm_local_count = 0;
1236 dlm_local_nodeid = 0;
1237}
1238
diff --git a/fs/dlm/lowcomms.h b/fs/dlm/lowcomms.h
new file mode 100644
index 000000000000..6c04bb09cfa8
--- /dev/null
+++ b/fs/dlm/lowcomms.h
@@ -0,0 +1,26 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#ifndef __LOWCOMMS_DOT_H__
15#define __LOWCOMMS_DOT_H__
16
17int dlm_lowcomms_init(void);
18void dlm_lowcomms_exit(void);
19int dlm_lowcomms_start(void);
20void dlm_lowcomms_stop(void);
21int dlm_lowcomms_close(int nodeid);
22void *dlm_lowcomms_get_buffer(int nodeid, int len, int allocation, char **ppc);
23void dlm_lowcomms_commit_buffer(void *mh);
24
25#endif /* __LOWCOMMS_DOT_H__ */
26
diff --git a/fs/dlm/lvb_table.h b/fs/dlm/lvb_table.h
new file mode 100644
index 000000000000..cc3e92f3feef
--- /dev/null
+++ b/fs/dlm/lvb_table.h
@@ -0,0 +1,18 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13#ifndef __LVB_TABLE_DOT_H__
14#define __LVB_TABLE_DOT_H__
15
16extern const int dlm_lvb_operations[8][8];
17
18#endif
diff --git a/fs/dlm/main.c b/fs/dlm/main.c
new file mode 100644
index 000000000000..a8da8dc36b2e
--- /dev/null
+++ b/fs/dlm/main.c
@@ -0,0 +1,97 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#include "dlm_internal.h"
15#include "lockspace.h"
16#include "lock.h"
17#include "user.h"
18#include "memory.h"
19#include "lowcomms.h"
20#include "config.h"
21
22#ifdef CONFIG_DLM_DEBUG
23int dlm_register_debugfs(void);
24void dlm_unregister_debugfs(void);
25#else
26static inline int dlm_register_debugfs(void) { return 0; }
27static inline void dlm_unregister_debugfs(void) { }
28#endif
29
30static int __init init_dlm(void)
31{
32 int error;
33
34 error = dlm_memory_init();
35 if (error)
36 goto out;
37
38 error = dlm_lockspace_init();
39 if (error)
40 goto out_mem;
41
42 error = dlm_config_init();
43 if (error)
44 goto out_lockspace;
45
46 error = dlm_register_debugfs();
47 if (error)
48 goto out_config;
49
50 error = dlm_lowcomms_init();
51 if (error)
52 goto out_debug;
53
54 error = dlm_user_init();
55 if (error)
56 goto out_lowcomms;
57
58 printk("DLM (built %s %s) installed\n", __DATE__, __TIME__);
59
60 return 0;
61
62 out_lowcomms:
63 dlm_lowcomms_exit();
64 out_debug:
65 dlm_unregister_debugfs();
66 out_config:
67 dlm_config_exit();
68 out_lockspace:
69 dlm_lockspace_exit();
70 out_mem:
71 dlm_memory_exit();
72 out:
73 return error;
74}
75
76static void __exit exit_dlm(void)
77{
78 dlm_user_exit();
79 dlm_lowcomms_exit();
80 dlm_config_exit();
81 dlm_memory_exit();
82 dlm_lockspace_exit();
83 dlm_unregister_debugfs();
84}
85
86module_init(init_dlm);
87module_exit(exit_dlm);
88
89MODULE_DESCRIPTION("Distributed Lock Manager");
90MODULE_AUTHOR("Red Hat, Inc.");
91MODULE_LICENSE("GPL");
92
93EXPORT_SYMBOL_GPL(dlm_new_lockspace);
94EXPORT_SYMBOL_GPL(dlm_release_lockspace);
95EXPORT_SYMBOL_GPL(dlm_lock);
96EXPORT_SYMBOL_GPL(dlm_unlock);
97
diff --git a/fs/dlm/member.c b/fs/dlm/member.c
new file mode 100644
index 000000000000..a3f7de7f3a8f
--- /dev/null
+++ b/fs/dlm/member.c
@@ -0,0 +1,327 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13#include "dlm_internal.h"
14#include "lockspace.h"
15#include "member.h"
16#include "recoverd.h"
17#include "recover.h"
18#include "rcom.h"
19#include "config.h"
20
21/*
22 * Following called by dlm_recoverd thread
23 */
24
25static void add_ordered_member(struct dlm_ls *ls, struct dlm_member *new)
26{
27 struct dlm_member *memb = NULL;
28 struct list_head *tmp;
29 struct list_head *newlist = &new->list;
30 struct list_head *head = &ls->ls_nodes;
31
32 list_for_each(tmp, head) {
33 memb = list_entry(tmp, struct dlm_member, list);
34 if (new->nodeid < memb->nodeid)
35 break;
36 }
37
38 if (!memb)
39 list_add_tail(newlist, head);
40 else {
41 /* FIXME: can use list macro here */
42 newlist->prev = tmp->prev;
43 newlist->next = tmp;
44 tmp->prev->next = newlist;
45 tmp->prev = newlist;
46 }
47}
48
49static int dlm_add_member(struct dlm_ls *ls, int nodeid)
50{
51 struct dlm_member *memb;
52 int w;
53
54 memb = kzalloc(sizeof(struct dlm_member), GFP_KERNEL);
55 if (!memb)
56 return -ENOMEM;
57
58 w = dlm_node_weight(ls->ls_name, nodeid);
59 if (w < 0)
60 return w;
61
62 memb->nodeid = nodeid;
63 memb->weight = w;
64 add_ordered_member(ls, memb);
65 ls->ls_num_nodes++;
66 return 0;
67}
68
69static void dlm_remove_member(struct dlm_ls *ls, struct dlm_member *memb)
70{
71 list_move(&memb->list, &ls->ls_nodes_gone);
72 ls->ls_num_nodes--;
73}
74
75static int dlm_is_member(struct dlm_ls *ls, int nodeid)
76{
77 struct dlm_member *memb;
78
79 list_for_each_entry(memb, &ls->ls_nodes, list) {
80 if (memb->nodeid == nodeid)
81 return 1;
82 }
83 return 0;
84}
85
86int dlm_is_removed(struct dlm_ls *ls, int nodeid)
87{
88 struct dlm_member *memb;
89
90 list_for_each_entry(memb, &ls->ls_nodes_gone, list) {
91 if (memb->nodeid == nodeid)
92 return 1;
93 }
94 return 0;
95}
96
97static void clear_memb_list(struct list_head *head)
98{
99 struct dlm_member *memb;
100
101 while (!list_empty(head)) {
102 memb = list_entry(head->next, struct dlm_member, list);
103 list_del(&memb->list);
104 kfree(memb);
105 }
106}
107
108void dlm_clear_members(struct dlm_ls *ls)
109{
110 clear_memb_list(&ls->ls_nodes);
111 ls->ls_num_nodes = 0;
112}
113
114void dlm_clear_members_gone(struct dlm_ls *ls)
115{
116 clear_memb_list(&ls->ls_nodes_gone);
117}
118
119static void make_member_array(struct dlm_ls *ls)
120{
121 struct dlm_member *memb;
122 int i, w, x = 0, total = 0, all_zero = 0, *array;
123
124 kfree(ls->ls_node_array);
125 ls->ls_node_array = NULL;
126
127 list_for_each_entry(memb, &ls->ls_nodes, list) {
128 if (memb->weight)
129 total += memb->weight;
130 }
131
132 /* all nodes revert to weight of 1 if all have weight 0 */
133
134 if (!total) {
135 total = ls->ls_num_nodes;
136 all_zero = 1;
137 }
138
139 ls->ls_total_weight = total;
140
141 array = kmalloc(sizeof(int) * total, GFP_KERNEL);
142 if (!array)
143 return;
144
145 list_for_each_entry(memb, &ls->ls_nodes, list) {
146 if (!all_zero && !memb->weight)
147 continue;
148
149 if (all_zero)
150 w = 1;
151 else
152 w = memb->weight;
153
154 DLM_ASSERT(x < total, printk("total %d x %d\n", total, x););
155
156 for (i = 0; i < w; i++)
157 array[x++] = memb->nodeid;
158 }
159
160 ls->ls_node_array = array;
161}
162
163/* send a status request to all members just to establish comms connections */
164
165static int ping_members(struct dlm_ls *ls)
166{
167 struct dlm_member *memb;
168 int error = 0;
169
170 list_for_each_entry(memb, &ls->ls_nodes, list) {
171 error = dlm_recovery_stopped(ls);
172 if (error)
173 break;
174 error = dlm_rcom_status(ls, memb->nodeid);
175 if (error)
176 break;
177 }
178 if (error)
179 log_debug(ls, "ping_members aborted %d last nodeid %d",
180 error, ls->ls_recover_nodeid);
181 return error;
182}
183
184int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
185{
186 struct dlm_member *memb, *safe;
187 int i, error, found, pos = 0, neg = 0, low = -1;
188
189 /* move departed members from ls_nodes to ls_nodes_gone */
190
191 list_for_each_entry_safe(memb, safe, &ls->ls_nodes, list) {
192 found = 0;
193 for (i = 0; i < rv->node_count; i++) {
194 if (memb->nodeid == rv->nodeids[i]) {
195 found = 1;
196 break;
197 }
198 }
199
200 if (!found) {
201 neg++;
202 dlm_remove_member(ls, memb);
203 log_debug(ls, "remove member %d", memb->nodeid);
204 }
205 }
206
207 /* add new members to ls_nodes */
208
209 for (i = 0; i < rv->node_count; i++) {
210 if (dlm_is_member(ls, rv->nodeids[i]))
211 continue;
212 dlm_add_member(ls, rv->nodeids[i]);
213 pos++;
214 log_debug(ls, "add member %d", rv->nodeids[i]);
215 }
216
217 list_for_each_entry(memb, &ls->ls_nodes, list) {
218 if (low == -1 || memb->nodeid < low)
219 low = memb->nodeid;
220 }
221 ls->ls_low_nodeid = low;
222
223 make_member_array(ls);
224 dlm_set_recover_status(ls, DLM_RS_NODES);
225 *neg_out = neg;
226
227 error = ping_members(ls);
228 if (error)
229 goto out;
230
231 error = dlm_recover_members_wait(ls);
232 out:
233 log_debug(ls, "total members %d error %d", ls->ls_num_nodes, error);
234 return error;
235}
236
237/*
238 * Following called from lockspace.c
239 */
240
241int dlm_ls_stop(struct dlm_ls *ls)
242{
243 int new;
244
245 /*
246 * A stop cancels any recovery that's in progress (see RECOVERY_STOP,
247 * dlm_recovery_stopped()) and prevents any new locks from being
248 * processed (see RUNNING, dlm_locking_stopped()).
249 */
250
251 spin_lock(&ls->ls_recover_lock);
252 set_bit(LSFL_RECOVERY_STOP, &ls->ls_flags);
253 new = test_and_clear_bit(LSFL_RUNNING, &ls->ls_flags);
254 ls->ls_recover_seq++;
255 spin_unlock(&ls->ls_recover_lock);
256
257 /*
258 * This in_recovery lock does two things:
259 *
260 * 1) Keeps this function from returning until all threads are out
261 * of locking routines and locking is truely stopped.
262 * 2) Keeps any new requests from being processed until it's unlocked
263 * when recovery is complete.
264 */
265
266 if (new)
267 down_write(&ls->ls_in_recovery);
268
269 /*
270 * The recoverd suspend/resume makes sure that dlm_recoverd (if
271 * running) has noticed the clearing of RUNNING above and quit
272 * processing the previous recovery. This will be true for all nodes
273 * before any nodes start the new recovery.
274 */
275
276 dlm_recoverd_suspend(ls);
277 ls->ls_recover_status = 0;
278 dlm_recoverd_resume(ls);
279 return 0;
280}
281
282int dlm_ls_start(struct dlm_ls *ls)
283{
284 struct dlm_recover *rv = NULL, *rv_old;
285 int *ids = NULL;
286 int error, count;
287
288 rv = kzalloc(sizeof(struct dlm_recover), GFP_KERNEL);
289 if (!rv)
290 return -ENOMEM;
291
292 error = count = dlm_nodeid_list(ls->ls_name, &ids);
293 if (error <= 0)
294 goto fail;
295
296 spin_lock(&ls->ls_recover_lock);
297
298 /* the lockspace needs to be stopped before it can be started */
299
300 if (!dlm_locking_stopped(ls)) {
301 spin_unlock(&ls->ls_recover_lock);
302 log_error(ls, "start ignored: lockspace running");
303 error = -EINVAL;
304 goto fail;
305 }
306
307 rv->nodeids = ids;
308 rv->node_count = count;
309 rv->seq = ++ls->ls_recover_seq;
310 rv_old = ls->ls_recover_args;
311 ls->ls_recover_args = rv;
312 spin_unlock(&ls->ls_recover_lock);
313
314 if (rv_old) {
315 kfree(rv_old->nodeids);
316 kfree(rv_old);
317 }
318
319 dlm_recoverd_kick(ls);
320 return 0;
321
322 fail:
323 kfree(rv);
324 kfree(ids);
325 return error;
326}
327
diff --git a/fs/dlm/member.h b/fs/dlm/member.h
new file mode 100644
index 000000000000..927c08c19214
--- /dev/null
+++ b/fs/dlm/member.h
@@ -0,0 +1,24 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13#ifndef __MEMBER_DOT_H__
14#define __MEMBER_DOT_H__
15
16int dlm_ls_stop(struct dlm_ls *ls);
17int dlm_ls_start(struct dlm_ls *ls);
18void dlm_clear_members(struct dlm_ls *ls);
19void dlm_clear_members_gone(struct dlm_ls *ls);
20int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv,int *neg_out);
21int dlm_is_removed(struct dlm_ls *ls, int nodeid);
22
23#endif /* __MEMBER_DOT_H__ */
24
diff --git a/fs/dlm/memory.c b/fs/dlm/memory.c
new file mode 100644
index 000000000000..989b608fd836
--- /dev/null
+++ b/fs/dlm/memory.c
@@ -0,0 +1,116 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#include "dlm_internal.h"
15#include "config.h"
16#include "memory.h"
17
18static kmem_cache_t *lkb_cache;
19
20
21int dlm_memory_init(void)
22{
23 int ret = 0;
24
25 lkb_cache = kmem_cache_create("dlm_lkb", sizeof(struct dlm_lkb),
26 __alignof__(struct dlm_lkb), 0, NULL, NULL);
27 if (!lkb_cache)
28 ret = -ENOMEM;
29 return ret;
30}
31
32void dlm_memory_exit(void)
33{
34 if (lkb_cache)
35 kmem_cache_destroy(lkb_cache);
36}
37
38char *allocate_lvb(struct dlm_ls *ls)
39{
40 char *p;
41
42 p = kmalloc(ls->ls_lvblen, GFP_KERNEL);
43 if (p)
44 memset(p, 0, ls->ls_lvblen);
45 return p;
46}
47
48void free_lvb(char *p)
49{
50 kfree(p);
51}
52
53/* FIXME: have some minimal space built-in to rsb for the name and
54 kmalloc a separate name if needed, like dentries are done */
55
56struct dlm_rsb *allocate_rsb(struct dlm_ls *ls, int namelen)
57{
58 struct dlm_rsb *r;
59
60 DLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,);
61
62 r = kmalloc(sizeof(*r) + namelen, GFP_KERNEL);
63 if (r)
64 memset(r, 0, sizeof(*r) + namelen);
65 return r;
66}
67
68void free_rsb(struct dlm_rsb *r)
69{
70 if (r->res_lvbptr)
71 free_lvb(r->res_lvbptr);
72 kfree(r);
73}
74
75struct dlm_lkb *allocate_lkb(struct dlm_ls *ls)
76{
77 struct dlm_lkb *lkb;
78
79 lkb = kmem_cache_alloc(lkb_cache, GFP_KERNEL);
80 if (lkb)
81 memset(lkb, 0, sizeof(*lkb));
82 return lkb;
83}
84
85void free_lkb(struct dlm_lkb *lkb)
86{
87 if (lkb->lkb_flags & DLM_IFL_USER) {
88 struct dlm_user_args *ua;
89 ua = (struct dlm_user_args *)lkb->lkb_astparam;
90 if (ua) {
91 if (ua->lksb.sb_lvbptr)
92 kfree(ua->lksb.sb_lvbptr);
93 kfree(ua);
94 }
95 }
96 kmem_cache_free(lkb_cache, lkb);
97}
98
99struct dlm_direntry *allocate_direntry(struct dlm_ls *ls, int namelen)
100{
101 struct dlm_direntry *de;
102
103 DLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,
104 printk("namelen = %d\n", namelen););
105
106 de = kmalloc(sizeof(*de) + namelen, GFP_KERNEL);
107 if (de)
108 memset(de, 0, sizeof(*de) + namelen);
109 return de;
110}
111
112void free_direntry(struct dlm_direntry *de)
113{
114 kfree(de);
115}
116
diff --git a/fs/dlm/memory.h b/fs/dlm/memory.h
new file mode 100644
index 000000000000..6ead158ccc5c
--- /dev/null
+++ b/fs/dlm/memory.h
@@ -0,0 +1,29 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#ifndef __MEMORY_DOT_H__
15#define __MEMORY_DOT_H__
16
17int dlm_memory_init(void);
18void dlm_memory_exit(void);
19struct dlm_rsb *allocate_rsb(struct dlm_ls *ls, int namelen);
20void free_rsb(struct dlm_rsb *r);
21struct dlm_lkb *allocate_lkb(struct dlm_ls *ls);
22void free_lkb(struct dlm_lkb *l);
23struct dlm_direntry *allocate_direntry(struct dlm_ls *ls, int namelen);
24void free_direntry(struct dlm_direntry *de);
25char *allocate_lvb(struct dlm_ls *ls);
26void free_lvb(char *l);
27
28#endif /* __MEMORY_DOT_H__ */
29
diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c
new file mode 100644
index 000000000000..c9b1c3d535f4
--- /dev/null
+++ b/fs/dlm/midcomms.c
@@ -0,0 +1,140 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14/*
15 * midcomms.c
16 *
17 * This is the appallingly named "mid-level" comms layer.
18 *
19 * Its purpose is to take packets from the "real" comms layer,
20 * split them up into packets and pass them to the interested
21 * part of the locking mechanism.
22 *
23 * It also takes messages from the locking layer, formats them
24 * into packets and sends them to the comms layer.
25 */
26
27#include "dlm_internal.h"
28#include "lowcomms.h"
29#include "config.h"
30#include "rcom.h"
31#include "lock.h"
32#include "midcomms.h"
33
34
35static void copy_from_cb(void *dst, const void *base, unsigned offset,
36 unsigned len, unsigned limit)
37{
38 unsigned copy = len;
39
40 if ((copy + offset) > limit)
41 copy = limit - offset;
42 memcpy(dst, base + offset, copy);
43 len -= copy;
44 if (len)
45 memcpy(dst + copy, base, len);
46}
47
48/*
49 * Called from the low-level comms layer to process a buffer of
50 * commands.
51 *
52 * Only complete messages are processed here, any "spare" bytes from
53 * the end of a buffer are saved and tacked onto the front of the next
54 * message that comes in. I doubt this will happen very often but we
55 * need to be able to cope with it and I don't want the task to be waiting
56 * for packets to come in when there is useful work to be done.
57 */
58
59int dlm_process_incoming_buffer(int nodeid, const void *base,
60 unsigned offset, unsigned len, unsigned limit)
61{
62 unsigned char __tmp[DLM_INBUF_LEN];
63 struct dlm_header *msg = (struct dlm_header *) __tmp;
64 int ret = 0;
65 int err = 0;
66 uint16_t msglen;
67 uint32_t lockspace;
68
69 while (len > sizeof(struct dlm_header)) {
70
71 /* Copy just the header to check the total length. The
72 message may wrap around the end of the buffer back to the
73 start, so we need to use a temp buffer and copy_from_cb. */
74
75 copy_from_cb(msg, base, offset, sizeof(struct dlm_header),
76 limit);
77
78 msglen = le16_to_cpu(msg->h_length);
79 lockspace = msg->h_lockspace;
80
81 err = -EINVAL;
82 if (msglen < sizeof(struct dlm_header))
83 break;
84 err = -E2BIG;
85 if (msglen > dlm_config.buffer_size) {
86 log_print("message size %d from %d too big, buf len %d",
87 msglen, nodeid, len);
88 break;
89 }
90 err = 0;
91
92 /* If only part of the full message is contained in this
93 buffer, then do nothing and wait for lowcomms to call
94 us again later with more data. We return 0 meaning
95 we've consumed none of the input buffer. */
96
97 if (msglen > len)
98 break;
99
100 /* Allocate a larger temp buffer if the full message won't fit
101 in the buffer on the stack (which should work for most
102 ordinary messages). */
103
104 if (msglen > sizeof(__tmp) &&
105 msg == (struct dlm_header *) __tmp) {
106 msg = kmalloc(dlm_config.buffer_size, GFP_KERNEL);
107 if (msg == NULL)
108 return ret;
109 }
110
111 copy_from_cb(msg, base, offset, msglen, limit);
112
113 BUG_ON(lockspace != msg->h_lockspace);
114
115 ret += msglen;
116 offset += msglen;
117 offset &= (limit - 1);
118 len -= msglen;
119
120 switch (msg->h_cmd) {
121 case DLM_MSG:
122 dlm_receive_message(msg, nodeid, 0);
123 break;
124
125 case DLM_RCOM:
126 dlm_receive_rcom(msg, nodeid);
127 break;
128
129 default:
130 log_print("unknown msg type %x from %u: %u %u %u %u",
131 msg->h_cmd, nodeid, msglen, len, offset, ret);
132 }
133 }
134
135 if (msg != (struct dlm_header *) __tmp)
136 kfree(msg);
137
138 return err ? err : ret;
139}
140
diff --git a/fs/dlm/midcomms.h b/fs/dlm/midcomms.h
new file mode 100644
index 000000000000..95852a5f111d
--- /dev/null
+++ b/fs/dlm/midcomms.h
@@ -0,0 +1,21 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#ifndef __MIDCOMMS_DOT_H__
15#define __MIDCOMMS_DOT_H__
16
17int dlm_process_incoming_buffer(int nodeid, const void *base, unsigned offset,
18 unsigned len, unsigned limit);
19
20#endif /* __MIDCOMMS_DOT_H__ */
21
diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c
new file mode 100644
index 000000000000..518239a8b1e9
--- /dev/null
+++ b/fs/dlm/rcom.c
@@ -0,0 +1,472 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#include "dlm_internal.h"
15#include "lockspace.h"
16#include "member.h"
17#include "lowcomms.h"
18#include "midcomms.h"
19#include "rcom.h"
20#include "recover.h"
21#include "dir.h"
22#include "config.h"
23#include "memory.h"
24#include "lock.h"
25#include "util.h"
26
27
28static int rcom_response(struct dlm_ls *ls)
29{
30 return test_bit(LSFL_RCOM_READY, &ls->ls_flags);
31}
32
33static int create_rcom(struct dlm_ls *ls, int to_nodeid, int type, int len,
34 struct dlm_rcom **rc_ret, struct dlm_mhandle **mh_ret)
35{
36 struct dlm_rcom *rc;
37 struct dlm_mhandle *mh;
38 char *mb;
39 int mb_len = sizeof(struct dlm_rcom) + len;
40
41 mh = dlm_lowcomms_get_buffer(to_nodeid, mb_len, GFP_KERNEL, &mb);
42 if (!mh) {
43 log_print("create_rcom to %d type %d len %d ENOBUFS",
44 to_nodeid, type, len);
45 return -ENOBUFS;
46 }
47 memset(mb, 0, mb_len);
48
49 rc = (struct dlm_rcom *) mb;
50
51 rc->rc_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
52 rc->rc_header.h_lockspace = ls->ls_global_id;
53 rc->rc_header.h_nodeid = dlm_our_nodeid();
54 rc->rc_header.h_length = mb_len;
55 rc->rc_header.h_cmd = DLM_RCOM;
56
57 rc->rc_type = type;
58
59 *mh_ret = mh;
60 *rc_ret = rc;
61 return 0;
62}
63
64static void send_rcom(struct dlm_ls *ls, struct dlm_mhandle *mh,
65 struct dlm_rcom *rc)
66{
67 dlm_rcom_out(rc);
68 dlm_lowcomms_commit_buffer(mh);
69}
70
71/* When replying to a status request, a node also sends back its
72 configuration values. The requesting node then checks that the remote
73 node is configured the same way as itself. */
74
75static void make_config(struct dlm_ls *ls, struct rcom_config *rf)
76{
77 rf->rf_lvblen = ls->ls_lvblen;
78 rf->rf_lsflags = ls->ls_exflags;
79}
80
81static int check_config(struct dlm_ls *ls, struct rcom_config *rf, int nodeid)
82{
83 if (rf->rf_lvblen != ls->ls_lvblen ||
84 rf->rf_lsflags != ls->ls_exflags) {
85 log_error(ls, "config mismatch: %d,%x nodeid %d: %d,%x",
86 ls->ls_lvblen, ls->ls_exflags,
87 nodeid, rf->rf_lvblen, rf->rf_lsflags);
88 return -EINVAL;
89 }
90 return 0;
91}
92
93int dlm_rcom_status(struct dlm_ls *ls, int nodeid)
94{
95 struct dlm_rcom *rc;
96 struct dlm_mhandle *mh;
97 int error = 0;
98
99 memset(ls->ls_recover_buf, 0, dlm_config.buffer_size);
100 ls->ls_recover_nodeid = nodeid;
101
102 if (nodeid == dlm_our_nodeid()) {
103 rc = (struct dlm_rcom *) ls->ls_recover_buf;
104 rc->rc_result = dlm_recover_status(ls);
105 goto out;
106 }
107
108 error = create_rcom(ls, nodeid, DLM_RCOM_STATUS, 0, &rc, &mh);
109 if (error)
110 goto out;
111 rc->rc_id = ++ls->ls_rcom_seq;
112
113 send_rcom(ls, mh, rc);
114
115 error = dlm_wait_function(ls, &rcom_response);
116 clear_bit(LSFL_RCOM_READY, &ls->ls_flags);
117 if (error)
118 goto out;
119
120 rc = (struct dlm_rcom *) ls->ls_recover_buf;
121
122 if (rc->rc_result == -ESRCH) {
123 /* we pretend the remote lockspace exists with 0 status */
124 log_debug(ls, "remote node %d not ready", nodeid);
125 rc->rc_result = 0;
126 } else
127 error = check_config(ls, (struct rcom_config *) rc->rc_buf,
128 nodeid);
129 /* the caller looks at rc_result for the remote recovery status */
130 out:
131 return error;
132}
133
134static void receive_rcom_status(struct dlm_ls *ls, struct dlm_rcom *rc_in)
135{
136 struct dlm_rcom *rc;
137 struct dlm_mhandle *mh;
138 int error, nodeid = rc_in->rc_header.h_nodeid;
139
140 error = create_rcom(ls, nodeid, DLM_RCOM_STATUS_REPLY,
141 sizeof(struct rcom_config), &rc, &mh);
142 if (error)
143 return;
144 rc->rc_id = rc_in->rc_id;
145 rc->rc_result = dlm_recover_status(ls);
146 make_config(ls, (struct rcom_config *) rc->rc_buf);
147
148 send_rcom(ls, mh, rc);
149}
150
151static void receive_sync_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
152{
153 if (rc_in->rc_id != ls->ls_rcom_seq) {
154 log_debug(ls, "reject old reply %d got %llx wanted %llx",
155 rc_in->rc_type, rc_in->rc_id, ls->ls_rcom_seq);
156 return;
157 }
158 memcpy(ls->ls_recover_buf, rc_in, rc_in->rc_header.h_length);
159 set_bit(LSFL_RCOM_READY, &ls->ls_flags);
160 wake_up(&ls->ls_wait_general);
161}
162
163static void receive_rcom_status_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
164{
165 receive_sync_reply(ls, rc_in);
166}
167
168int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name, int last_len)
169{
170 struct dlm_rcom *rc;
171 struct dlm_mhandle *mh;
172 int error = 0, len = sizeof(struct dlm_rcom);
173
174 memset(ls->ls_recover_buf, 0, dlm_config.buffer_size);
175 ls->ls_recover_nodeid = nodeid;
176
177 if (nodeid == dlm_our_nodeid()) {
178 dlm_copy_master_names(ls, last_name, last_len,
179 ls->ls_recover_buf + len,
180 dlm_config.buffer_size - len, nodeid);
181 goto out;
182 }
183
184 error = create_rcom(ls, nodeid, DLM_RCOM_NAMES, last_len, &rc, &mh);
185 if (error)
186 goto out;
187 memcpy(rc->rc_buf, last_name, last_len);
188 rc->rc_id = ++ls->ls_rcom_seq;
189
190 send_rcom(ls, mh, rc);
191
192 error = dlm_wait_function(ls, &rcom_response);
193 clear_bit(LSFL_RCOM_READY, &ls->ls_flags);
194 out:
195 return error;
196}
197
198static void receive_rcom_names(struct dlm_ls *ls, struct dlm_rcom *rc_in)
199{
200 struct dlm_rcom *rc;
201 struct dlm_mhandle *mh;
202 int error, inlen, outlen;
203 int nodeid = rc_in->rc_header.h_nodeid;
204 uint32_t status = dlm_recover_status(ls);
205
206 /*
207 * We can't run dlm_dir_rebuild_send (which uses ls_nodes) while
208 * dlm_recoverd is running ls_nodes_reconfig (which changes ls_nodes).
209 * It could only happen in rare cases where we get a late NAMES
210 * message from a previous instance of recovery.
211 */
212
213 if (!(status & DLM_RS_NODES)) {
214 log_debug(ls, "ignoring RCOM_NAMES from %u", nodeid);
215 return;
216 }
217
218 nodeid = rc_in->rc_header.h_nodeid;
219 inlen = rc_in->rc_header.h_length - sizeof(struct dlm_rcom);
220 outlen = dlm_config.buffer_size - sizeof(struct dlm_rcom);
221
222 error = create_rcom(ls, nodeid, DLM_RCOM_NAMES_REPLY, outlen, &rc, &mh);
223 if (error)
224 return;
225 rc->rc_id = rc_in->rc_id;
226
227 dlm_copy_master_names(ls, rc_in->rc_buf, inlen, rc->rc_buf, outlen,
228 nodeid);
229 send_rcom(ls, mh, rc);
230}
231
232static void receive_rcom_names_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
233{
234 receive_sync_reply(ls, rc_in);
235}
236
237int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid)
238{
239 struct dlm_rcom *rc;
240 struct dlm_mhandle *mh;
241 struct dlm_ls *ls = r->res_ls;
242 int error;
243
244 error = create_rcom(ls, dir_nodeid, DLM_RCOM_LOOKUP, r->res_length,
245 &rc, &mh);
246 if (error)
247 goto out;
248 memcpy(rc->rc_buf, r->res_name, r->res_length);
249 rc->rc_id = (unsigned long) r;
250
251 send_rcom(ls, mh, rc);
252 out:
253 return error;
254}
255
256static void receive_rcom_lookup(struct dlm_ls *ls, struct dlm_rcom *rc_in)
257{
258 struct dlm_rcom *rc;
259 struct dlm_mhandle *mh;
260 int error, ret_nodeid, nodeid = rc_in->rc_header.h_nodeid;
261 int len = rc_in->rc_header.h_length - sizeof(struct dlm_rcom);
262
263 error = create_rcom(ls, nodeid, DLM_RCOM_LOOKUP_REPLY, 0, &rc, &mh);
264 if (error)
265 return;
266
267 error = dlm_dir_lookup(ls, nodeid, rc_in->rc_buf, len, &ret_nodeid);
268 if (error)
269 ret_nodeid = error;
270 rc->rc_result = ret_nodeid;
271 rc->rc_id = rc_in->rc_id;
272
273 send_rcom(ls, mh, rc);
274}
275
276static void receive_rcom_lookup_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
277{
278 dlm_recover_master_reply(ls, rc_in);
279}
280
281static void pack_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb,
282 struct rcom_lock *rl)
283{
284 memset(rl, 0, sizeof(*rl));
285
286 rl->rl_ownpid = lkb->lkb_ownpid;
287 rl->rl_lkid = lkb->lkb_id;
288 rl->rl_exflags = lkb->lkb_exflags;
289 rl->rl_flags = lkb->lkb_flags;
290 rl->rl_lvbseq = lkb->lkb_lvbseq;
291 rl->rl_rqmode = lkb->lkb_rqmode;
292 rl->rl_grmode = lkb->lkb_grmode;
293 rl->rl_status = lkb->lkb_status;
294 rl->rl_wait_type = lkb->lkb_wait_type;
295
296 if (lkb->lkb_bastaddr)
297 rl->rl_asts |= AST_BAST;
298 if (lkb->lkb_astaddr)
299 rl->rl_asts |= AST_COMP;
300
301 rl->rl_namelen = r->res_length;
302 memcpy(rl->rl_name, r->res_name, r->res_length);
303
304 /* FIXME: might we have an lvb without DLM_LKF_VALBLK set ?
305 If so, receive_rcom_lock_args() won't take this copy. */
306
307 if (lkb->lkb_lvbptr)
308 memcpy(rl->rl_lvb, lkb->lkb_lvbptr, r->res_ls->ls_lvblen);
309}
310
311int dlm_send_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
312{
313 struct dlm_ls *ls = r->res_ls;
314 struct dlm_rcom *rc;
315 struct dlm_mhandle *mh;
316 struct rcom_lock *rl;
317 int error, len = sizeof(struct rcom_lock);
318
319 if (lkb->lkb_lvbptr)
320 len += ls->ls_lvblen;
321
322 error = create_rcom(ls, r->res_nodeid, DLM_RCOM_LOCK, len, &rc, &mh);
323 if (error)
324 goto out;
325
326 rl = (struct rcom_lock *) rc->rc_buf;
327 pack_rcom_lock(r, lkb, rl);
328 rc->rc_id = (unsigned long) r;
329
330 send_rcom(ls, mh, rc);
331 out:
332 return error;
333}
334
335static void receive_rcom_lock(struct dlm_ls *ls, struct dlm_rcom *rc_in)
336{
337 struct dlm_rcom *rc;
338 struct dlm_mhandle *mh;
339 int error, nodeid = rc_in->rc_header.h_nodeid;
340
341 dlm_recover_master_copy(ls, rc_in);
342
343 error = create_rcom(ls, nodeid, DLM_RCOM_LOCK_REPLY,
344 sizeof(struct rcom_lock), &rc, &mh);
345 if (error)
346 return;
347
348 /* We send back the same rcom_lock struct we received, but
349 dlm_recover_master_copy() has filled in rl_remid and rl_result */
350
351 memcpy(rc->rc_buf, rc_in->rc_buf, sizeof(struct rcom_lock));
352 rc->rc_id = rc_in->rc_id;
353
354 send_rcom(ls, mh, rc);
355}
356
357static void receive_rcom_lock_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
358{
359 uint32_t status = dlm_recover_status(ls);
360
361 if (!(status & DLM_RS_DIR)) {
362 log_debug(ls, "ignoring RCOM_LOCK_REPLY from %u",
363 rc_in->rc_header.h_nodeid);
364 return;
365 }
366
367 dlm_recover_process_copy(ls, rc_in);
368}
369
370static int send_ls_not_ready(int nodeid, struct dlm_rcom *rc_in)
371{
372 struct dlm_rcom *rc;
373 struct dlm_mhandle *mh;
374 char *mb;
375 int mb_len = sizeof(struct dlm_rcom);
376
377 mh = dlm_lowcomms_get_buffer(nodeid, mb_len, GFP_KERNEL, &mb);
378 if (!mh)
379 return -ENOBUFS;
380 memset(mb, 0, mb_len);
381
382 rc = (struct dlm_rcom *) mb;
383
384 rc->rc_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
385 rc->rc_header.h_lockspace = rc_in->rc_header.h_lockspace;
386 rc->rc_header.h_nodeid = dlm_our_nodeid();
387 rc->rc_header.h_length = mb_len;
388 rc->rc_header.h_cmd = DLM_RCOM;
389
390 rc->rc_type = DLM_RCOM_STATUS_REPLY;
391 rc->rc_id = rc_in->rc_id;
392 rc->rc_result = -ESRCH;
393
394 dlm_rcom_out(rc);
395 dlm_lowcomms_commit_buffer(mh);
396
397 return 0;
398}
399
400/* Called by dlm_recvd; corresponds to dlm_receive_message() but special
401 recovery-only comms are sent through here. */
402
403void dlm_receive_rcom(struct dlm_header *hd, int nodeid)
404{
405 struct dlm_rcom *rc = (struct dlm_rcom *) hd;
406 struct dlm_ls *ls;
407
408 dlm_rcom_in(rc);
409
410 /* If the lockspace doesn't exist then still send a status message
411 back; it's possible that it just doesn't have its global_id yet. */
412
413 ls = dlm_find_lockspace_global(hd->h_lockspace);
414 if (!ls) {
415 log_print("lockspace %x from %d not found",
416 hd->h_lockspace, nodeid);
417 send_ls_not_ready(nodeid, rc);
418 return;
419 }
420
421 if (dlm_recovery_stopped(ls) && (rc->rc_type != DLM_RCOM_STATUS)) {
422 log_error(ls, "ignoring recovery message %x from %d",
423 rc->rc_type, nodeid);
424 goto out;
425 }
426
427 if (nodeid != rc->rc_header.h_nodeid) {
428 log_error(ls, "bad rcom nodeid %d from %d",
429 rc->rc_header.h_nodeid, nodeid);
430 goto out;
431 }
432
433 switch (rc->rc_type) {
434 case DLM_RCOM_STATUS:
435 receive_rcom_status(ls, rc);
436 break;
437
438 case DLM_RCOM_NAMES:
439 receive_rcom_names(ls, rc);
440 break;
441
442 case DLM_RCOM_LOOKUP:
443 receive_rcom_lookup(ls, rc);
444 break;
445
446 case DLM_RCOM_LOCK:
447 receive_rcom_lock(ls, rc);
448 break;
449
450 case DLM_RCOM_STATUS_REPLY:
451 receive_rcom_status_reply(ls, rc);
452 break;
453
454 case DLM_RCOM_NAMES_REPLY:
455 receive_rcom_names_reply(ls, rc);
456 break;
457
458 case DLM_RCOM_LOOKUP_REPLY:
459 receive_rcom_lookup_reply(ls, rc);
460 break;
461
462 case DLM_RCOM_LOCK_REPLY:
463 receive_rcom_lock_reply(ls, rc);
464 break;
465
466 default:
467 DLM_ASSERT(0, printk("rc_type=%x\n", rc->rc_type););
468 }
469 out:
470 dlm_put_lockspace(ls);
471}
472
diff --git a/fs/dlm/rcom.h b/fs/dlm/rcom.h
new file mode 100644
index 000000000000..d7984321ff41
--- /dev/null
+++ b/fs/dlm/rcom.h
@@ -0,0 +1,24 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#ifndef __RCOM_DOT_H__
15#define __RCOM_DOT_H__
16
17int dlm_rcom_status(struct dlm_ls *ls, int nodeid);
18int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name,int last_len);
19int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid);
20int dlm_send_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb);
21void dlm_receive_rcom(struct dlm_header *hd, int nodeid);
22
23#endif
24
diff --git a/fs/dlm/recover.c b/fs/dlm/recover.c
new file mode 100644
index 000000000000..a5e6d184872e
--- /dev/null
+++ b/fs/dlm/recover.c
@@ -0,0 +1,765 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#include "dlm_internal.h"
15#include "lockspace.h"
16#include "dir.h"
17#include "config.h"
18#include "ast.h"
19#include "memory.h"
20#include "rcom.h"
21#include "lock.h"
22#include "lowcomms.h"
23#include "member.h"
24#include "recover.h"
25
26
27/*
28 * Recovery waiting routines: these functions wait for a particular reply from
29 * a remote node, or for the remote node to report a certain status. They need
30 * to abort if the lockspace is stopped indicating a node has failed (perhaps
31 * the one being waited for).
32 */
33
34/*
35 * Wait until given function returns non-zero or lockspace is stopped
36 * (LS_RECOVERY_STOP set due to failure of a node in ls_nodes). When another
37 * function thinks it could have completed the waited-on task, they should wake
38 * up ls_wait_general to get an immediate response rather than waiting for the
39 * timer to detect the result. A timer wakes us up periodically while waiting
40 * to see if we should abort due to a node failure. This should only be called
41 * by the dlm_recoverd thread.
42 */
43
44static void dlm_wait_timer_fn(unsigned long data)
45{
46 struct dlm_ls *ls = (struct dlm_ls *) data;
47 mod_timer(&ls->ls_timer, jiffies + (dlm_config.recover_timer * HZ));
48 wake_up(&ls->ls_wait_general);
49}
50
51int dlm_wait_function(struct dlm_ls *ls, int (*testfn) (struct dlm_ls *ls))
52{
53 int error = 0;
54
55 init_timer(&ls->ls_timer);
56 ls->ls_timer.function = dlm_wait_timer_fn;
57 ls->ls_timer.data = (long) ls;
58 ls->ls_timer.expires = jiffies + (dlm_config.recover_timer * HZ);
59 add_timer(&ls->ls_timer);
60
61 wait_event(ls->ls_wait_general, testfn(ls) || dlm_recovery_stopped(ls));
62 del_timer_sync(&ls->ls_timer);
63
64 if (dlm_recovery_stopped(ls)) {
65 log_debug(ls, "dlm_wait_function aborted");
66 error = -EINTR;
67 }
68 return error;
69}
70
71/*
72 * An efficient way for all nodes to wait for all others to have a certain
73 * status. The node with the lowest nodeid polls all the others for their
74 * status (wait_status_all) and all the others poll the node with the low id
75 * for its accumulated result (wait_status_low). When all nodes have set
76 * status flag X, then status flag X_ALL will be set on the low nodeid.
77 */
78
79uint32_t dlm_recover_status(struct dlm_ls *ls)
80{
81 uint32_t status;
82 spin_lock(&ls->ls_recover_lock);
83 status = ls->ls_recover_status;
84 spin_unlock(&ls->ls_recover_lock);
85 return status;
86}
87
88void dlm_set_recover_status(struct dlm_ls *ls, uint32_t status)
89{
90 spin_lock(&ls->ls_recover_lock);
91 ls->ls_recover_status |= status;
92 spin_unlock(&ls->ls_recover_lock);
93}
94
95static int wait_status_all(struct dlm_ls *ls, uint32_t wait_status)
96{
97 struct dlm_rcom *rc = (struct dlm_rcom *) ls->ls_recover_buf;
98 struct dlm_member *memb;
99 int error = 0, delay;
100
101 list_for_each_entry(memb, &ls->ls_nodes, list) {
102 delay = 0;
103 for (;;) {
104 if (dlm_recovery_stopped(ls)) {
105 error = -EINTR;
106 goto out;
107 }
108
109 error = dlm_rcom_status(ls, memb->nodeid);
110 if (error)
111 goto out;
112
113 if (rc->rc_result & wait_status)
114 break;
115 if (delay < 1000)
116 delay += 20;
117 msleep(delay);
118 }
119 }
120 out:
121 return error;
122}
123
124static int wait_status_low(struct dlm_ls *ls, uint32_t wait_status)
125{
126 struct dlm_rcom *rc = (struct dlm_rcom *) ls->ls_recover_buf;
127 int error = 0, delay = 0, nodeid = ls->ls_low_nodeid;
128
129 for (;;) {
130 if (dlm_recovery_stopped(ls)) {
131 error = -EINTR;
132 goto out;
133 }
134
135 error = dlm_rcom_status(ls, nodeid);
136 if (error)
137 break;
138
139 if (rc->rc_result & wait_status)
140 break;
141 if (delay < 1000)
142 delay += 20;
143 msleep(delay);
144 }
145 out:
146 return error;
147}
148
149static int wait_status(struct dlm_ls *ls, uint32_t status)
150{
151 uint32_t status_all = status << 1;
152 int error;
153
154 if (ls->ls_low_nodeid == dlm_our_nodeid()) {
155 error = wait_status_all(ls, status);
156 if (!error)
157 dlm_set_recover_status(ls, status_all);
158 } else
159 error = wait_status_low(ls, status_all);
160
161 return error;
162}
163
164int dlm_recover_members_wait(struct dlm_ls *ls)
165{
166 return wait_status(ls, DLM_RS_NODES);
167}
168
169int dlm_recover_directory_wait(struct dlm_ls *ls)
170{
171 return wait_status(ls, DLM_RS_DIR);
172}
173
174int dlm_recover_locks_wait(struct dlm_ls *ls)
175{
176 return wait_status(ls, DLM_RS_LOCKS);
177}
178
179int dlm_recover_done_wait(struct dlm_ls *ls)
180{
181 return wait_status(ls, DLM_RS_DONE);
182}
183
184/*
185 * The recover_list contains all the rsb's for which we've requested the new
186 * master nodeid. As replies are returned from the resource directories the
187 * rsb's are removed from the list. When the list is empty we're done.
188 *
189 * The recover_list is later similarly used for all rsb's for which we've sent
190 * new lkb's and need to receive new corresponding lkid's.
191 *
192 * We use the address of the rsb struct as a simple local identifier for the
193 * rsb so we can match an rcom reply with the rsb it was sent for.
194 */
195
196static int recover_list_empty(struct dlm_ls *ls)
197{
198 int empty;
199
200 spin_lock(&ls->ls_recover_list_lock);
201 empty = list_empty(&ls->ls_recover_list);
202 spin_unlock(&ls->ls_recover_list_lock);
203
204 return empty;
205}
206
207static void recover_list_add(struct dlm_rsb *r)
208{
209 struct dlm_ls *ls = r->res_ls;
210
211 spin_lock(&ls->ls_recover_list_lock);
212 if (list_empty(&r->res_recover_list)) {
213 list_add_tail(&r->res_recover_list, &ls->ls_recover_list);
214 ls->ls_recover_list_count++;
215 dlm_hold_rsb(r);
216 }
217 spin_unlock(&ls->ls_recover_list_lock);
218}
219
220static void recover_list_del(struct dlm_rsb *r)
221{
222 struct dlm_ls *ls = r->res_ls;
223
224 spin_lock(&ls->ls_recover_list_lock);
225 list_del_init(&r->res_recover_list);
226 ls->ls_recover_list_count--;
227 spin_unlock(&ls->ls_recover_list_lock);
228
229 dlm_put_rsb(r);
230}
231
232static struct dlm_rsb *recover_list_find(struct dlm_ls *ls, uint64_t id)
233{
234 struct dlm_rsb *r = NULL;
235
236 spin_lock(&ls->ls_recover_list_lock);
237
238 list_for_each_entry(r, &ls->ls_recover_list, res_recover_list) {
239 if (id == (unsigned long) r)
240 goto out;
241 }
242 r = NULL;
243 out:
244 spin_unlock(&ls->ls_recover_list_lock);
245 return r;
246}
247
248static void recover_list_clear(struct dlm_ls *ls)
249{
250 struct dlm_rsb *r, *s;
251
252 spin_lock(&ls->ls_recover_list_lock);
253 list_for_each_entry_safe(r, s, &ls->ls_recover_list, res_recover_list) {
254 list_del_init(&r->res_recover_list);
255 dlm_put_rsb(r);
256 ls->ls_recover_list_count--;
257 }
258
259 if (ls->ls_recover_list_count != 0) {
260 log_error(ls, "warning: recover_list_count %d",
261 ls->ls_recover_list_count);
262 ls->ls_recover_list_count = 0;
263 }
264 spin_unlock(&ls->ls_recover_list_lock);
265}
266
267
268/* Master recovery: find new master node for rsb's that were
269 mastered on nodes that have been removed.
270
271 dlm_recover_masters
272 recover_master
273 dlm_send_rcom_lookup -> receive_rcom_lookup
274 dlm_dir_lookup
275 receive_rcom_lookup_reply <-
276 dlm_recover_master_reply
277 set_new_master
278 set_master_lkbs
279 set_lock_master
280*/
281
282/*
283 * Set the lock master for all LKBs in a lock queue
284 * If we are the new master of the rsb, we may have received new
285 * MSTCPY locks from other nodes already which we need to ignore
286 * when setting the new nodeid.
287 */
288
289static void set_lock_master(struct list_head *queue, int nodeid)
290{
291 struct dlm_lkb *lkb;
292
293 list_for_each_entry(lkb, queue, lkb_statequeue)
294 if (!(lkb->lkb_flags & DLM_IFL_MSTCPY))
295 lkb->lkb_nodeid = nodeid;
296}
297
298static void set_master_lkbs(struct dlm_rsb *r)
299{
300 set_lock_master(&r->res_grantqueue, r->res_nodeid);
301 set_lock_master(&r->res_convertqueue, r->res_nodeid);
302 set_lock_master(&r->res_waitqueue, r->res_nodeid);
303}
304
305/*
306 * Propogate the new master nodeid to locks
307 * The NEW_MASTER flag tells dlm_recover_locks() which rsb's to consider.
308 * The NEW_MASTER2 flag tells recover_lvb() and set_locks_purged() which
309 * rsb's to consider.
310 */
311
312static void set_new_master(struct dlm_rsb *r, int nodeid)
313{
314 lock_rsb(r);
315 r->res_nodeid = nodeid;
316 set_master_lkbs(r);
317 rsb_set_flag(r, RSB_NEW_MASTER);
318 rsb_set_flag(r, RSB_NEW_MASTER2);
319 unlock_rsb(r);
320}
321
322/*
323 * We do async lookups on rsb's that need new masters. The rsb's
324 * waiting for a lookup reply are kept on the recover_list.
325 */
326
327static int recover_master(struct dlm_rsb *r)
328{
329 struct dlm_ls *ls = r->res_ls;
330 int error, dir_nodeid, ret_nodeid, our_nodeid = dlm_our_nodeid();
331
332 dir_nodeid = dlm_dir_nodeid(r);
333
334 if (dir_nodeid == our_nodeid) {
335 error = dlm_dir_lookup(ls, our_nodeid, r->res_name,
336 r->res_length, &ret_nodeid);
337 if (error)
338 log_error(ls, "recover dir lookup error %d", error);
339
340 if (ret_nodeid == our_nodeid)
341 ret_nodeid = 0;
342 set_new_master(r, ret_nodeid);
343 } else {
344 recover_list_add(r);
345 error = dlm_send_rcom_lookup(r, dir_nodeid);
346 }
347
348 return error;
349}
350
351/*
352 * When not using a directory, most resource names will hash to a new static
353 * master nodeid and the resource will need to be remastered.
354 */
355
356static int recover_master_static(struct dlm_rsb *r)
357{
358 int master = dlm_dir_nodeid(r);
359
360 if (master == dlm_our_nodeid())
361 master = 0;
362
363 if (r->res_nodeid != master) {
364 if (is_master(r))
365 dlm_purge_mstcpy_locks(r);
366 set_new_master(r, master);
367 return 1;
368 }
369 return 0;
370}
371
372/*
373 * Go through local root resources and for each rsb which has a master which
374 * has departed, get the new master nodeid from the directory. The dir will
375 * assign mastery to the first node to look up the new master. That means
376 * we'll discover in this lookup if we're the new master of any rsb's.
377 *
378 * We fire off all the dir lookup requests individually and asynchronously to
379 * the correct dir node.
380 */
381
382int dlm_recover_masters(struct dlm_ls *ls)
383{
384 struct dlm_rsb *r;
385 int error = 0, count = 0;
386
387 log_debug(ls, "dlm_recover_masters");
388
389 down_read(&ls->ls_root_sem);
390 list_for_each_entry(r, &ls->ls_root_list, res_root_list) {
391 if (dlm_recovery_stopped(ls)) {
392 up_read(&ls->ls_root_sem);
393 error = -EINTR;
394 goto out;
395 }
396
397 if (dlm_no_directory(ls))
398 count += recover_master_static(r);
399 else if (!is_master(r) && dlm_is_removed(ls, r->res_nodeid)) {
400 recover_master(r);
401 count++;
402 }
403
404 schedule();
405 }
406 up_read(&ls->ls_root_sem);
407
408 log_debug(ls, "dlm_recover_masters %d resources", count);
409
410 error = dlm_wait_function(ls, &recover_list_empty);
411 out:
412 if (error)
413 recover_list_clear(ls);
414 return error;
415}
416
417int dlm_recover_master_reply(struct dlm_ls *ls, struct dlm_rcom *rc)
418{
419 struct dlm_rsb *r;
420 int nodeid;
421
422 r = recover_list_find(ls, rc->rc_id);
423 if (!r) {
424 log_error(ls, "dlm_recover_master_reply no id %llx",
425 (unsigned long long)rc->rc_id);
426 goto out;
427 }
428
429 nodeid = rc->rc_result;
430 if (nodeid == dlm_our_nodeid())
431 nodeid = 0;
432
433 set_new_master(r, nodeid);
434 recover_list_del(r);
435
436 if (recover_list_empty(ls))
437 wake_up(&ls->ls_wait_general);
438 out:
439 return 0;
440}
441
442
443/* Lock recovery: rebuild the process-copy locks we hold on a
444 remastered rsb on the new rsb master.
445
446 dlm_recover_locks
447 recover_locks
448 recover_locks_queue
449 dlm_send_rcom_lock -> receive_rcom_lock
450 dlm_recover_master_copy
451 receive_rcom_lock_reply <-
452 dlm_recover_process_copy
453*/
454
455
456/*
457 * keep a count of the number of lkb's we send to the new master; when we get
458 * an equal number of replies then recovery for the rsb is done
459 */
460
461static int recover_locks_queue(struct dlm_rsb *r, struct list_head *head)
462{
463 struct dlm_lkb *lkb;
464 int error = 0;
465
466 list_for_each_entry(lkb, head, lkb_statequeue) {
467 error = dlm_send_rcom_lock(r, lkb);
468 if (error)
469 break;
470 r->res_recover_locks_count++;
471 }
472
473 return error;
474}
475
476static int recover_locks(struct dlm_rsb *r)
477{
478 int error = 0;
479
480 lock_rsb(r);
481
482 DLM_ASSERT(!r->res_recover_locks_count, dlm_dump_rsb(r););
483
484 error = recover_locks_queue(r, &r->res_grantqueue);
485 if (error)
486 goto out;
487 error = recover_locks_queue(r, &r->res_convertqueue);
488 if (error)
489 goto out;
490 error = recover_locks_queue(r, &r->res_waitqueue);
491 if (error)
492 goto out;
493
494 if (r->res_recover_locks_count)
495 recover_list_add(r);
496 else
497 rsb_clear_flag(r, RSB_NEW_MASTER);
498 out:
499 unlock_rsb(r);
500 return error;
501}
502
503int dlm_recover_locks(struct dlm_ls *ls)
504{
505 struct dlm_rsb *r;
506 int error, count = 0;
507
508 log_debug(ls, "dlm_recover_locks");
509
510 down_read(&ls->ls_root_sem);
511 list_for_each_entry(r, &ls->ls_root_list, res_root_list) {
512 if (is_master(r)) {
513 rsb_clear_flag(r, RSB_NEW_MASTER);
514 continue;
515 }
516
517 if (!rsb_flag(r, RSB_NEW_MASTER))
518 continue;
519
520 if (dlm_recovery_stopped(ls)) {
521 error = -EINTR;
522 up_read(&ls->ls_root_sem);
523 goto out;
524 }
525
526 error = recover_locks(r);
527 if (error) {
528 up_read(&ls->ls_root_sem);
529 goto out;
530 }
531
532 count += r->res_recover_locks_count;
533 }
534 up_read(&ls->ls_root_sem);
535
536 log_debug(ls, "dlm_recover_locks %d locks", count);
537
538 error = dlm_wait_function(ls, &recover_list_empty);
539 out:
540 if (error)
541 recover_list_clear(ls);
542 else
543 dlm_set_recover_status(ls, DLM_RS_LOCKS);
544 return error;
545}
546
547void dlm_recovered_lock(struct dlm_rsb *r)
548{
549 DLM_ASSERT(rsb_flag(r, RSB_NEW_MASTER), dlm_dump_rsb(r););
550
551 r->res_recover_locks_count--;
552 if (!r->res_recover_locks_count) {
553 rsb_clear_flag(r, RSB_NEW_MASTER);
554 recover_list_del(r);
555 }
556
557 if (recover_list_empty(r->res_ls))
558 wake_up(&r->res_ls->ls_wait_general);
559}
560
561/*
562 * The lvb needs to be recovered on all master rsb's. This includes setting
563 * the VALNOTVALID flag if necessary, and determining the correct lvb contents
564 * based on the lvb's of the locks held on the rsb.
565 *
566 * RSB_VALNOTVALID is set if there are only NL/CR locks on the rsb. If it
567 * was already set prior to recovery, it's not cleared, regardless of locks.
568 *
569 * The LVB contents are only considered for changing when this is a new master
570 * of the rsb (NEW_MASTER2). Then, the rsb's lvb is taken from any lkb with
571 * mode > CR. If no lkb's exist with mode above CR, the lvb contents are taken
572 * from the lkb with the largest lvb sequence number.
573 */
574
575static void recover_lvb(struct dlm_rsb *r)
576{
577 struct dlm_lkb *lkb, *high_lkb = NULL;
578 uint32_t high_seq = 0;
579 int lock_lvb_exists = 0;
580 int big_lock_exists = 0;
581 int lvblen = r->res_ls->ls_lvblen;
582
583 list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) {
584 if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
585 continue;
586
587 lock_lvb_exists = 1;
588
589 if (lkb->lkb_grmode > DLM_LOCK_CR) {
590 big_lock_exists = 1;
591 goto setflag;
592 }
593
594 if (((int)lkb->lkb_lvbseq - (int)high_seq) >= 0) {
595 high_lkb = lkb;
596 high_seq = lkb->lkb_lvbseq;
597 }
598 }
599
600 list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue) {
601 if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
602 continue;
603
604 lock_lvb_exists = 1;
605
606 if (lkb->lkb_grmode > DLM_LOCK_CR) {
607 big_lock_exists = 1;
608 goto setflag;
609 }
610
611 if (((int)lkb->lkb_lvbseq - (int)high_seq) >= 0) {
612 high_lkb = lkb;
613 high_seq = lkb->lkb_lvbseq;
614 }
615 }
616
617 setflag:
618 if (!lock_lvb_exists)
619 goto out;
620
621 if (!big_lock_exists)
622 rsb_set_flag(r, RSB_VALNOTVALID);
623
624 /* don't mess with the lvb unless we're the new master */
625 if (!rsb_flag(r, RSB_NEW_MASTER2))
626 goto out;
627
628 if (!r->res_lvbptr) {
629 r->res_lvbptr = allocate_lvb(r->res_ls);
630 if (!r->res_lvbptr)
631 goto out;
632 }
633
634 if (big_lock_exists) {
635 r->res_lvbseq = lkb->lkb_lvbseq;
636 memcpy(r->res_lvbptr, lkb->lkb_lvbptr, lvblen);
637 } else if (high_lkb) {
638 r->res_lvbseq = high_lkb->lkb_lvbseq;
639 memcpy(r->res_lvbptr, high_lkb->lkb_lvbptr, lvblen);
640 } else {
641 r->res_lvbseq = 0;
642 memset(r->res_lvbptr, 0, lvblen);
643 }
644 out:
645 return;
646}
647
648/* All master rsb's flagged RECOVER_CONVERT need to be looked at. The locks
649 converting PR->CW or CW->PR need to have their lkb_grmode set. */
650
651static void recover_conversion(struct dlm_rsb *r)
652{
653 struct dlm_lkb *lkb;
654 int grmode = -1;
655
656 list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) {
657 if (lkb->lkb_grmode == DLM_LOCK_PR ||
658 lkb->lkb_grmode == DLM_LOCK_CW) {
659 grmode = lkb->lkb_grmode;
660 break;
661 }
662 }
663
664 list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue) {
665 if (lkb->lkb_grmode != DLM_LOCK_IV)
666 continue;
667 if (grmode == -1)
668 lkb->lkb_grmode = lkb->lkb_rqmode;
669 else
670 lkb->lkb_grmode = grmode;
671 }
672}
673
674/* We've become the new master for this rsb and waiting/converting locks may
675 need to be granted in dlm_grant_after_purge() due to locks that may have
676 existed from a removed node. */
677
678static void set_locks_purged(struct dlm_rsb *r)
679{
680 if (!list_empty(&r->res_waitqueue) || !list_empty(&r->res_convertqueue))
681 rsb_set_flag(r, RSB_LOCKS_PURGED);
682}
683
684void dlm_recover_rsbs(struct dlm_ls *ls)
685{
686 struct dlm_rsb *r;
687 int count = 0;
688
689 log_debug(ls, "dlm_recover_rsbs");
690
691 down_read(&ls->ls_root_sem);
692 list_for_each_entry(r, &ls->ls_root_list, res_root_list) {
693 lock_rsb(r);
694 if (is_master(r)) {
695 if (rsb_flag(r, RSB_RECOVER_CONVERT))
696 recover_conversion(r);
697 if (rsb_flag(r, RSB_NEW_MASTER2))
698 set_locks_purged(r);
699 recover_lvb(r);
700 count++;
701 }
702 rsb_clear_flag(r, RSB_RECOVER_CONVERT);
703 rsb_clear_flag(r, RSB_NEW_MASTER2);
704 unlock_rsb(r);
705 }
706 up_read(&ls->ls_root_sem);
707
708 log_debug(ls, "dlm_recover_rsbs %d rsbs", count);
709}
710
711/* Create a single list of all root rsb's to be used during recovery */
712
713int dlm_create_root_list(struct dlm_ls *ls)
714{
715 struct dlm_rsb *r;
716 int i, error = 0;
717
718 down_write(&ls->ls_root_sem);
719 if (!list_empty(&ls->ls_root_list)) {
720 log_error(ls, "root list not empty");
721 error = -EINVAL;
722 goto out;
723 }
724
725 for (i = 0; i < ls->ls_rsbtbl_size; i++) {
726 read_lock(&ls->ls_rsbtbl[i].lock);
727 list_for_each_entry(r, &ls->ls_rsbtbl[i].list, res_hashchain) {
728 list_add(&r->res_root_list, &ls->ls_root_list);
729 dlm_hold_rsb(r);
730 }
731 read_unlock(&ls->ls_rsbtbl[i].lock);
732 }
733 out:
734 up_write(&ls->ls_root_sem);
735 return error;
736}
737
738void dlm_release_root_list(struct dlm_ls *ls)
739{
740 struct dlm_rsb *r, *safe;
741
742 down_write(&ls->ls_root_sem);
743 list_for_each_entry_safe(r, safe, &ls->ls_root_list, res_root_list) {
744 list_del_init(&r->res_root_list);
745 dlm_put_rsb(r);
746 }
747 up_write(&ls->ls_root_sem);
748}
749
750void dlm_clear_toss_list(struct dlm_ls *ls)
751{
752 struct dlm_rsb *r, *safe;
753 int i;
754
755 for (i = 0; i < ls->ls_rsbtbl_size; i++) {
756 write_lock(&ls->ls_rsbtbl[i].lock);
757 list_for_each_entry_safe(r, safe, &ls->ls_rsbtbl[i].toss,
758 res_hashchain) {
759 list_del(&r->res_hashchain);
760 free_rsb(r);
761 }
762 write_unlock(&ls->ls_rsbtbl[i].lock);
763 }
764}
765
diff --git a/fs/dlm/recover.h b/fs/dlm/recover.h
new file mode 100644
index 000000000000..ebd0363f1e08
--- /dev/null
+++ b/fs/dlm/recover.h
@@ -0,0 +1,34 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#ifndef __RECOVER_DOT_H__
15#define __RECOVER_DOT_H__
16
17int dlm_wait_function(struct dlm_ls *ls, int (*testfn) (struct dlm_ls *ls));
18uint32_t dlm_recover_status(struct dlm_ls *ls);
19void dlm_set_recover_status(struct dlm_ls *ls, uint32_t status);
20int dlm_recover_members_wait(struct dlm_ls *ls);
21int dlm_recover_directory_wait(struct dlm_ls *ls);
22int dlm_recover_locks_wait(struct dlm_ls *ls);
23int dlm_recover_done_wait(struct dlm_ls *ls);
24int dlm_recover_masters(struct dlm_ls *ls);
25int dlm_recover_master_reply(struct dlm_ls *ls, struct dlm_rcom *rc);
26int dlm_recover_locks(struct dlm_ls *ls);
27void dlm_recovered_lock(struct dlm_rsb *r);
28int dlm_create_root_list(struct dlm_ls *ls);
29void dlm_release_root_list(struct dlm_ls *ls);
30void dlm_clear_toss_list(struct dlm_ls *ls);
31void dlm_recover_rsbs(struct dlm_ls *ls);
32
33#endif /* __RECOVER_DOT_H__ */
34
diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c
new file mode 100644
index 000000000000..362e3eff4dc9
--- /dev/null
+++ b/fs/dlm/recoverd.c
@@ -0,0 +1,290 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#include "dlm_internal.h"
15#include "lockspace.h"
16#include "member.h"
17#include "dir.h"
18#include "ast.h"
19#include "recover.h"
20#include "lowcomms.h"
21#include "lock.h"
22#include "requestqueue.h"
23#include "recoverd.h"
24
25
26/* If the start for which we're re-enabling locking (seq) has been superseded
27 by a newer stop (ls_recover_seq), we need to leave locking disabled. */
28
29static int enable_locking(struct dlm_ls *ls, uint64_t seq)
30{
31 int error = -EINTR;
32
33 spin_lock(&ls->ls_recover_lock);
34 if (ls->ls_recover_seq == seq) {
35 set_bit(LSFL_RUNNING, &ls->ls_flags);
36 up_write(&ls->ls_in_recovery);
37 error = 0;
38 }
39 spin_unlock(&ls->ls_recover_lock);
40 return error;
41}
42
43static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
44{
45 unsigned long start;
46 int error, neg = 0;
47
48 log_debug(ls, "recover %llx", rv->seq);
49
50 mutex_lock(&ls->ls_recoverd_active);
51
52 /*
53 * Suspending and resuming dlm_astd ensures that no lkb's from this ls
54 * will be processed by dlm_astd during recovery.
55 */
56
57 dlm_astd_suspend();
58 dlm_astd_resume();
59
60 /*
61 * This list of root rsb's will be the basis of most of the recovery
62 * routines.
63 */
64
65 dlm_create_root_list(ls);
66
67 /*
68 * Free all the tossed rsb's so we don't have to recover them.
69 */
70
71 dlm_clear_toss_list(ls);
72
73 /*
74 * Add or remove nodes from the lockspace's ls_nodes list.
75 * Also waits for all nodes to complete dlm_recover_members.
76 */
77
78 error = dlm_recover_members(ls, rv, &neg);
79 if (error) {
80 log_error(ls, "recover_members failed %d", error);
81 goto fail;
82 }
83 start = jiffies;
84
85 /*
86 * Rebuild our own share of the directory by collecting from all other
87 * nodes their master rsb names that hash to us.
88 */
89
90 error = dlm_recover_directory(ls);
91 if (error) {
92 log_error(ls, "recover_directory failed %d", error);
93 goto fail;
94 }
95
96 /*
97 * Purge directory-related requests that are saved in requestqueue.
98 * All dir requests from before recovery are invalid now due to the dir
99 * rebuild and will be resent by the requesting nodes.
100 */
101
102 dlm_purge_requestqueue(ls);
103
104 /*
105 * Wait for all nodes to complete directory rebuild.
106 */
107
108 error = dlm_recover_directory_wait(ls);
109 if (error) {
110 log_error(ls, "recover_directory_wait failed %d", error);
111 goto fail;
112 }
113
114 /*
115 * We may have outstanding operations that are waiting for a reply from
116 * a failed node. Mark these to be resent after recovery. Unlock and
117 * cancel ops can just be completed.
118 */
119
120 dlm_recover_waiters_pre(ls);
121
122 error = dlm_recovery_stopped(ls);
123 if (error)
124 goto fail;
125
126 if (neg || dlm_no_directory(ls)) {
127 /*
128 * Clear lkb's for departed nodes.
129 */
130
131 dlm_purge_locks(ls);
132
133 /*
134 * Get new master nodeid's for rsb's that were mastered on
135 * departed nodes.
136 */
137
138 error = dlm_recover_masters(ls);
139 if (error) {
140 log_error(ls, "recover_masters failed %d", error);
141 goto fail;
142 }
143
144 /*
145 * Send our locks on remastered rsb's to the new masters.
146 */
147
148 error = dlm_recover_locks(ls);
149 if (error) {
150 log_error(ls, "recover_locks failed %d", error);
151 goto fail;
152 }
153
154 error = dlm_recover_locks_wait(ls);
155 if (error) {
156 log_error(ls, "recover_locks_wait failed %d", error);
157 goto fail;
158 }
159
160 /*
161 * Finalize state in master rsb's now that all locks can be
162 * checked. This includes conversion resolution and lvb
163 * settings.
164 */
165
166 dlm_recover_rsbs(ls);
167 }
168
169 dlm_release_root_list(ls);
170
171 dlm_set_recover_status(ls, DLM_RS_DONE);
172 error = dlm_recover_done_wait(ls);
173 if (error) {
174 log_error(ls, "recover_done_wait failed %d", error);
175 goto fail;
176 }
177
178 dlm_clear_members_gone(ls);
179
180 error = enable_locking(ls, rv->seq);
181 if (error) {
182 log_error(ls, "enable_locking failed %d", error);
183 goto fail;
184 }
185
186 error = dlm_process_requestqueue(ls);
187 if (error) {
188 log_error(ls, "process_requestqueue failed %d", error);
189 goto fail;
190 }
191
192 error = dlm_recover_waiters_post(ls);
193 if (error) {
194 log_error(ls, "recover_waiters_post failed %d", error);
195 goto fail;
196 }
197
198 dlm_grant_after_purge(ls);
199
200 dlm_astd_wake();
201
202 log_debug(ls, "recover %llx done: %u ms", rv->seq,
203 jiffies_to_msecs(jiffies - start));
204 mutex_unlock(&ls->ls_recoverd_active);
205
206 return 0;
207
208 fail:
209 dlm_release_root_list(ls);
210 log_debug(ls, "recover %llx error %d", rv->seq, error);
211 mutex_unlock(&ls->ls_recoverd_active);
212 return error;
213}
214
215static void do_ls_recovery(struct dlm_ls *ls)
216{
217 struct dlm_recover *rv = NULL;
218
219 spin_lock(&ls->ls_recover_lock);
220 rv = ls->ls_recover_args;
221 ls->ls_recover_args = NULL;
222 clear_bit(LSFL_RECOVERY_STOP, &ls->ls_flags);
223 spin_unlock(&ls->ls_recover_lock);
224
225 if (rv) {
226 ls_recover(ls, rv);
227 kfree(rv->nodeids);
228 kfree(rv);
229 }
230}
231
232static int dlm_recoverd(void *arg)
233{
234 struct dlm_ls *ls;
235
236 ls = dlm_find_lockspace_local(arg);
237 if (!ls) {
238 log_print("dlm_recoverd: no lockspace %p", arg);
239 return -1;
240 }
241
242 while (!kthread_should_stop()) {
243 set_current_state(TASK_INTERRUPTIBLE);
244 if (!test_bit(LSFL_WORK, &ls->ls_flags))
245 schedule();
246 set_current_state(TASK_RUNNING);
247
248 if (test_and_clear_bit(LSFL_WORK, &ls->ls_flags))
249 do_ls_recovery(ls);
250 }
251
252 dlm_put_lockspace(ls);
253 return 0;
254}
255
256void dlm_recoverd_kick(struct dlm_ls *ls)
257{
258 set_bit(LSFL_WORK, &ls->ls_flags);
259 wake_up_process(ls->ls_recoverd_task);
260}
261
262int dlm_recoverd_start(struct dlm_ls *ls)
263{
264 struct task_struct *p;
265 int error = 0;
266
267 p = kthread_run(dlm_recoverd, ls, "dlm_recoverd");
268 if (IS_ERR(p))
269 error = PTR_ERR(p);
270 else
271 ls->ls_recoverd_task = p;
272 return error;
273}
274
275void dlm_recoverd_stop(struct dlm_ls *ls)
276{
277 kthread_stop(ls->ls_recoverd_task);
278}
279
280void dlm_recoverd_suspend(struct dlm_ls *ls)
281{
282 wake_up(&ls->ls_wait_general);
283 mutex_lock(&ls->ls_recoverd_active);
284}
285
286void dlm_recoverd_resume(struct dlm_ls *ls)
287{
288 mutex_unlock(&ls->ls_recoverd_active);
289}
290
diff --git a/fs/dlm/recoverd.h b/fs/dlm/recoverd.h
new file mode 100644
index 000000000000..866657c5d69d
--- /dev/null
+++ b/fs/dlm/recoverd.h
@@ -0,0 +1,24 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#ifndef __RECOVERD_DOT_H__
15#define __RECOVERD_DOT_H__
16
17void dlm_recoverd_kick(struct dlm_ls *ls);
18void dlm_recoverd_stop(struct dlm_ls *ls);
19int dlm_recoverd_start(struct dlm_ls *ls);
20void dlm_recoverd_suspend(struct dlm_ls *ls);
21void dlm_recoverd_resume(struct dlm_ls *ls);
22
23#endif /* __RECOVERD_DOT_H__ */
24
diff --git a/fs/dlm/requestqueue.c b/fs/dlm/requestqueue.c
new file mode 100644
index 000000000000..7b2b089634a2
--- /dev/null
+++ b/fs/dlm/requestqueue.c
@@ -0,0 +1,184 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13#include "dlm_internal.h"
14#include "member.h"
15#include "lock.h"
16#include "dir.h"
17#include "config.h"
18#include "requestqueue.h"
19
20struct rq_entry {
21 struct list_head list;
22 int nodeid;
23 char request[1];
24};
25
26/*
27 * Requests received while the lockspace is in recovery get added to the
28 * request queue and processed when recovery is complete. This happens when
29 * the lockspace is suspended on some nodes before it is on others, or the
30 * lockspace is enabled on some while still suspended on others.
31 */
32
33void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_header *hd)
34{
35 struct rq_entry *e;
36 int length = hd->h_length;
37
38 if (dlm_is_removed(ls, nodeid))
39 return;
40
41 e = kmalloc(sizeof(struct rq_entry) + length, GFP_KERNEL);
42 if (!e) {
43 log_print("dlm_add_requestqueue: out of memory\n");
44 return;
45 }
46
47 e->nodeid = nodeid;
48 memcpy(e->request, hd, length);
49
50 mutex_lock(&ls->ls_requestqueue_mutex);
51 list_add_tail(&e->list, &ls->ls_requestqueue);
52 mutex_unlock(&ls->ls_requestqueue_mutex);
53}
54
55int dlm_process_requestqueue(struct dlm_ls *ls)
56{
57 struct rq_entry *e;
58 struct dlm_header *hd;
59 int error = 0;
60
61 mutex_lock(&ls->ls_requestqueue_mutex);
62
63 for (;;) {
64 if (list_empty(&ls->ls_requestqueue)) {
65 mutex_unlock(&ls->ls_requestqueue_mutex);
66 error = 0;
67 break;
68 }
69 e = list_entry(ls->ls_requestqueue.next, struct rq_entry, list);
70 mutex_unlock(&ls->ls_requestqueue_mutex);
71
72 hd = (struct dlm_header *) e->request;
73 error = dlm_receive_message(hd, e->nodeid, 1);
74
75 if (error == -EINTR) {
76 /* entry is left on requestqueue */
77 log_debug(ls, "process_requestqueue abort eintr");
78 break;
79 }
80
81 mutex_lock(&ls->ls_requestqueue_mutex);
82 list_del(&e->list);
83 kfree(e);
84
85 if (dlm_locking_stopped(ls)) {
86 log_debug(ls, "process_requestqueue abort running");
87 mutex_unlock(&ls->ls_requestqueue_mutex);
88 error = -EINTR;
89 break;
90 }
91 schedule();
92 }
93
94 return error;
95}
96
97/*
98 * After recovery is done, locking is resumed and dlm_recoverd takes all the
99 * saved requests and processes them as they would have been by dlm_recvd. At
100 * the same time, dlm_recvd will start receiving new requests from remote
101 * nodes. We want to delay dlm_recvd processing new requests until
102 * dlm_recoverd has finished processing the old saved requests.
103 */
104
105void dlm_wait_requestqueue(struct dlm_ls *ls)
106{
107 for (;;) {
108 mutex_lock(&ls->ls_requestqueue_mutex);
109 if (list_empty(&ls->ls_requestqueue))
110 break;
111 if (dlm_locking_stopped(ls))
112 break;
113 mutex_unlock(&ls->ls_requestqueue_mutex);
114 schedule();
115 }
116 mutex_unlock(&ls->ls_requestqueue_mutex);
117}
118
119static int purge_request(struct dlm_ls *ls, struct dlm_message *ms, int nodeid)
120{
121 uint32_t type = ms->m_type;
122
123 if (dlm_is_removed(ls, nodeid))
124 return 1;
125
126 /* directory operations are always purged because the directory is
127 always rebuilt during recovery and the lookups resent */
128
129 if (type == DLM_MSG_REMOVE ||
130 type == DLM_MSG_LOOKUP ||
131 type == DLM_MSG_LOOKUP_REPLY)
132 return 1;
133
134 if (!dlm_no_directory(ls))
135 return 0;
136
137 /* with no directory, the master is likely to change as a part of
138 recovery; requests to/from the defunct master need to be purged */
139
140 switch (type) {
141 case DLM_MSG_REQUEST:
142 case DLM_MSG_CONVERT:
143 case DLM_MSG_UNLOCK:
144 case DLM_MSG_CANCEL:
145 /* we're no longer the master of this resource, the sender
146 will resend to the new master (see waiter_needs_recovery) */
147
148 if (dlm_hash2nodeid(ls, ms->m_hash) != dlm_our_nodeid())
149 return 1;
150 break;
151
152 case DLM_MSG_REQUEST_REPLY:
153 case DLM_MSG_CONVERT_REPLY:
154 case DLM_MSG_UNLOCK_REPLY:
155 case DLM_MSG_CANCEL_REPLY:
156 case DLM_MSG_GRANT:
157 /* this reply is from the former master of the resource,
158 we'll resend to the new master if needed */
159
160 if (dlm_hash2nodeid(ls, ms->m_hash) != nodeid)
161 return 1;
162 break;
163 }
164
165 return 0;
166}
167
168void dlm_purge_requestqueue(struct dlm_ls *ls)
169{
170 struct dlm_message *ms;
171 struct rq_entry *e, *safe;
172
173 mutex_lock(&ls->ls_requestqueue_mutex);
174 list_for_each_entry_safe(e, safe, &ls->ls_requestqueue, list) {
175 ms = (struct dlm_message *) e->request;
176
177 if (purge_request(ls, ms, e->nodeid)) {
178 list_del(&e->list);
179 kfree(e);
180 }
181 }
182 mutex_unlock(&ls->ls_requestqueue_mutex);
183}
184
diff --git a/fs/dlm/requestqueue.h b/fs/dlm/requestqueue.h
new file mode 100644
index 000000000000..349f0d292d95
--- /dev/null
+++ b/fs/dlm/requestqueue.h
@@ -0,0 +1,22 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13#ifndef __REQUESTQUEUE_DOT_H__
14#define __REQUESTQUEUE_DOT_H__
15
16void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_header *hd);
17int dlm_process_requestqueue(struct dlm_ls *ls);
18void dlm_wait_requestqueue(struct dlm_ls *ls);
19void dlm_purge_requestqueue(struct dlm_ls *ls);
20
21#endif
22
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
new file mode 100644
index 000000000000..c37e93e4f2df
--- /dev/null
+++ b/fs/dlm/user.c
@@ -0,0 +1,788 @@
1/*
2 * Copyright (C) 2006 Red Hat, Inc. All rights reserved.
3 *
4 * This copyrighted material is made available to anyone wishing to use,
5 * modify, copy, or redistribute it subject to the terms and conditions
6 * of the GNU General Public License v.2.
7 */
8
9#include <linux/miscdevice.h>
10#include <linux/init.h>
11#include <linux/wait.h>
12#include <linux/module.h>
13#include <linux/file.h>
14#include <linux/fs.h>
15#include <linux/poll.h>
16#include <linux/signal.h>
17#include <linux/spinlock.h>
18#include <linux/dlm.h>
19#include <linux/dlm_device.h>
20
21#include "dlm_internal.h"
22#include "lockspace.h"
23#include "lock.h"
24#include "lvb_table.h"
25
26static const char *name_prefix="dlm";
27static struct miscdevice ctl_device;
28static struct file_operations device_fops;
29
30#ifdef CONFIG_COMPAT
31
32struct dlm_lock_params32 {
33 __u8 mode;
34 __u8 namelen;
35 __u16 flags;
36 __u32 lkid;
37 __u32 parent;
38
39 __u32 castparam;
40 __u32 castaddr;
41 __u32 bastparam;
42 __u32 bastaddr;
43 __u32 lksb;
44
45 char lvb[DLM_USER_LVB_LEN];
46 char name[0];
47};
48
49struct dlm_write_request32 {
50 __u32 version[3];
51 __u8 cmd;
52 __u8 is64bit;
53 __u8 unused[2];
54
55 union {
56 struct dlm_lock_params32 lock;
57 struct dlm_lspace_params lspace;
58 } i;
59};
60
61struct dlm_lksb32 {
62 __u32 sb_status;
63 __u32 sb_lkid;
64 __u8 sb_flags;
65 __u32 sb_lvbptr;
66};
67
68struct dlm_lock_result32 {
69 __u32 length;
70 __u32 user_astaddr;
71 __u32 user_astparam;
72 __u32 user_lksb;
73 struct dlm_lksb32 lksb;
74 __u8 bast_mode;
75 __u8 unused[3];
76 /* Offsets may be zero if no data is present */
77 __u32 lvb_offset;
78};
79
80static void compat_input(struct dlm_write_request *kb,
81 struct dlm_write_request32 *kb32)
82{
83 kb->version[0] = kb32->version[0];
84 kb->version[1] = kb32->version[1];
85 kb->version[2] = kb32->version[2];
86
87 kb->cmd = kb32->cmd;
88 kb->is64bit = kb32->is64bit;
89 if (kb->cmd == DLM_USER_CREATE_LOCKSPACE ||
90 kb->cmd == DLM_USER_REMOVE_LOCKSPACE) {
91 kb->i.lspace.flags = kb32->i.lspace.flags;
92 kb->i.lspace.minor = kb32->i.lspace.minor;
93 strcpy(kb->i.lspace.name, kb32->i.lspace.name);
94 } else {
95 kb->i.lock.mode = kb32->i.lock.mode;
96 kb->i.lock.namelen = kb32->i.lock.namelen;
97 kb->i.lock.flags = kb32->i.lock.flags;
98 kb->i.lock.lkid = kb32->i.lock.lkid;
99 kb->i.lock.parent = kb32->i.lock.parent;
100 kb->i.lock.castparam = (void *)(long)kb32->i.lock.castparam;
101 kb->i.lock.castaddr = (void *)(long)kb32->i.lock.castaddr;
102 kb->i.lock.bastparam = (void *)(long)kb32->i.lock.bastparam;
103 kb->i.lock.bastaddr = (void *)(long)kb32->i.lock.bastaddr;
104 kb->i.lock.lksb = (void *)(long)kb32->i.lock.lksb;
105 memcpy(kb->i.lock.lvb, kb32->i.lock.lvb, DLM_USER_LVB_LEN);
106 memcpy(kb->i.lock.name, kb32->i.lock.name, kb->i.lock.namelen);
107 }
108}
109
110static void compat_output(struct dlm_lock_result *res,
111 struct dlm_lock_result32 *res32)
112{
113 res32->length = res->length - (sizeof(struct dlm_lock_result) -
114 sizeof(struct dlm_lock_result32));
115 res32->user_astaddr = (__u32)(long)res->user_astaddr;
116 res32->user_astparam = (__u32)(long)res->user_astparam;
117 res32->user_lksb = (__u32)(long)res->user_lksb;
118 res32->bast_mode = res->bast_mode;
119
120 res32->lvb_offset = res->lvb_offset;
121 res32->length = res->length;
122
123 res32->lksb.sb_status = res->lksb.sb_status;
124 res32->lksb.sb_flags = res->lksb.sb_flags;
125 res32->lksb.sb_lkid = res->lksb.sb_lkid;
126 res32->lksb.sb_lvbptr = (__u32)(long)res->lksb.sb_lvbptr;
127}
128#endif
129
130
131void dlm_user_add_ast(struct dlm_lkb *lkb, int type)
132{
133 struct dlm_ls *ls;
134 struct dlm_user_args *ua;
135 struct dlm_user_proc *proc;
136 int remove_ownqueue = 0;
137
138 /* dlm_clear_proc_locks() sets ORPHAN/DEAD flag on each
139 lkb before dealing with it. We need to check this
140 flag before taking ls_clear_proc_locks mutex because if
141 it's set, dlm_clear_proc_locks() holds the mutex. */
142
143 if (lkb->lkb_flags & (DLM_IFL_ORPHAN | DLM_IFL_DEAD)) {
144 /* log_print("user_add_ast skip1 %x", lkb->lkb_flags); */
145 return;
146 }
147
148 ls = lkb->lkb_resource->res_ls;
149 mutex_lock(&ls->ls_clear_proc_locks);
150
151 /* If ORPHAN/DEAD flag is set, it means the process is dead so an ast
152 can't be delivered. For ORPHAN's, dlm_clear_proc_locks() freed
153 lkb->ua so we can't try to use it. */
154
155 if (lkb->lkb_flags & (DLM_IFL_ORPHAN | DLM_IFL_DEAD)) {
156 /* log_print("user_add_ast skip2 %x", lkb->lkb_flags); */
157 goto out;
158 }
159
160 DLM_ASSERT(lkb->lkb_astparam, dlm_print_lkb(lkb););
161 ua = (struct dlm_user_args *)lkb->lkb_astparam;
162 proc = ua->proc;
163
164 if (type == AST_BAST && ua->bastaddr == NULL)
165 goto out;
166
167 spin_lock(&proc->asts_spin);
168 if (!(lkb->lkb_ast_type & (AST_COMP | AST_BAST))) {
169 kref_get(&lkb->lkb_ref);
170 list_add_tail(&lkb->lkb_astqueue, &proc->asts);
171 lkb->lkb_ast_type |= type;
172 wake_up_interruptible(&proc->wait);
173 }
174
175 /* noqueue requests that fail may need to be removed from the
176 proc's locks list, there should be a better way of detecting
177 this situation than checking all these things... */
178
179 if (type == AST_COMP && lkb->lkb_grmode == DLM_LOCK_IV &&
180 ua->lksb.sb_status == -EAGAIN && !list_empty(&lkb->lkb_ownqueue))
181 remove_ownqueue = 1;
182
183 /* We want to copy the lvb to userspace when the completion
184 ast is read if the status is 0, the lock has an lvb and
185 lvb_ops says we should. We could probably have set_lvb_lock()
186 set update_user_lvb instead and not need old_mode */
187
188 if ((lkb->lkb_ast_type & AST_COMP) &&
189 (lkb->lkb_lksb->sb_status == 0) &&
190 lkb->lkb_lksb->sb_lvbptr &&
191 dlm_lvb_operations[ua->old_mode + 1][lkb->lkb_grmode + 1])
192 ua->update_user_lvb = 1;
193 else
194 ua->update_user_lvb = 0;
195
196 spin_unlock(&proc->asts_spin);
197
198 if (remove_ownqueue) {
199 spin_lock(&ua->proc->locks_spin);
200 list_del_init(&lkb->lkb_ownqueue);
201 spin_unlock(&ua->proc->locks_spin);
202 dlm_put_lkb(lkb);
203 }
204 out:
205 mutex_unlock(&ls->ls_clear_proc_locks);
206}
207
208static int device_user_lock(struct dlm_user_proc *proc,
209 struct dlm_lock_params *params)
210{
211 struct dlm_ls *ls;
212 struct dlm_user_args *ua;
213 int error = -ENOMEM;
214
215 ls = dlm_find_lockspace_local(proc->lockspace);
216 if (!ls)
217 return -ENOENT;
218
219 if (!params->castaddr || !params->lksb) {
220 error = -EINVAL;
221 goto out;
222 }
223
224 ua = kzalloc(sizeof(struct dlm_user_args), GFP_KERNEL);
225 if (!ua)
226 goto out;
227 ua->proc = proc;
228 ua->user_lksb = params->lksb;
229 ua->castparam = params->castparam;
230 ua->castaddr = params->castaddr;
231 ua->bastparam = params->bastparam;
232 ua->bastaddr = params->bastaddr;
233
234 if (params->flags & DLM_LKF_CONVERT)
235 error = dlm_user_convert(ls, ua,
236 params->mode, params->flags,
237 params->lkid, params->lvb);
238 else {
239 error = dlm_user_request(ls, ua,
240 params->mode, params->flags,
241 params->name, params->namelen,
242 params->parent);
243 if (!error)
244 error = ua->lksb.sb_lkid;
245 }
246 out:
247 dlm_put_lockspace(ls);
248 return error;
249}
250
251static int device_user_unlock(struct dlm_user_proc *proc,
252 struct dlm_lock_params *params)
253{
254 struct dlm_ls *ls;
255 struct dlm_user_args *ua;
256 int error = -ENOMEM;
257
258 ls = dlm_find_lockspace_local(proc->lockspace);
259 if (!ls)
260 return -ENOENT;
261
262 ua = kzalloc(sizeof(struct dlm_user_args), GFP_KERNEL);
263 if (!ua)
264 goto out;
265 ua->proc = proc;
266 ua->user_lksb = params->lksb;
267 ua->castparam = params->castparam;
268 ua->castaddr = params->castaddr;
269
270 if (params->flags & DLM_LKF_CANCEL)
271 error = dlm_user_cancel(ls, ua, params->flags, params->lkid);
272 else
273 error = dlm_user_unlock(ls, ua, params->flags, params->lkid,
274 params->lvb);
275 out:
276 dlm_put_lockspace(ls);
277 return error;
278}
279
280static int device_create_lockspace(struct dlm_lspace_params *params)
281{
282 dlm_lockspace_t *lockspace;
283 struct dlm_ls *ls;
284 int error, len;
285
286 if (!capable(CAP_SYS_ADMIN))
287 return -EPERM;
288
289 error = dlm_new_lockspace(params->name, strlen(params->name),
290 &lockspace, 0, DLM_USER_LVB_LEN);
291 if (error)
292 return error;
293
294 ls = dlm_find_lockspace_local(lockspace);
295 if (!ls)
296 return -ENOENT;
297
298 error = -ENOMEM;
299 len = strlen(params->name) + strlen(name_prefix) + 2;
300 ls->ls_device.name = kzalloc(len, GFP_KERNEL);
301 if (!ls->ls_device.name)
302 goto fail;
303 snprintf((char *)ls->ls_device.name, len, "%s_%s", name_prefix,
304 params->name);
305 ls->ls_device.fops = &device_fops;
306 ls->ls_device.minor = MISC_DYNAMIC_MINOR;
307
308 error = misc_register(&ls->ls_device);
309 if (error) {
310 kfree(ls->ls_device.name);
311 goto fail;
312 }
313
314 error = ls->ls_device.minor;
315 dlm_put_lockspace(ls);
316 return error;
317
318 fail:
319 dlm_put_lockspace(ls);
320 dlm_release_lockspace(lockspace, 0);
321 return error;
322}
323
324static int device_remove_lockspace(struct dlm_lspace_params *params)
325{
326 dlm_lockspace_t *lockspace;
327 struct dlm_ls *ls;
328 int error, force = 0;
329
330 if (!capable(CAP_SYS_ADMIN))
331 return -EPERM;
332
333 ls = dlm_find_lockspace_device(params->minor);
334 if (!ls)
335 return -ENOENT;
336
337 error = misc_deregister(&ls->ls_device);
338 if (error) {
339 dlm_put_lockspace(ls);
340 goto out;
341 }
342 kfree(ls->ls_device.name);
343
344 if (params->flags & DLM_USER_LSFLG_FORCEFREE)
345 force = 2;
346
347 lockspace = ls->ls_local_handle;
348
349 /* dlm_release_lockspace waits for references to go to zero,
350 so all processes will need to close their device for the ls
351 before the release will procede */
352
353 dlm_put_lockspace(ls);
354 error = dlm_release_lockspace(lockspace, force);
355 out:
356 return error;
357}
358
359/* Check the user's version matches ours */
360static int check_version(struct dlm_write_request *req)
361{
362 if (req->version[0] != DLM_DEVICE_VERSION_MAJOR ||
363 (req->version[0] == DLM_DEVICE_VERSION_MAJOR &&
364 req->version[1] > DLM_DEVICE_VERSION_MINOR)) {
365
366 printk(KERN_DEBUG "dlm: process %s (%d) version mismatch "
367 "user (%d.%d.%d) kernel (%d.%d.%d)\n",
368 current->comm,
369 current->pid,
370 req->version[0],
371 req->version[1],
372 req->version[2],
373 DLM_DEVICE_VERSION_MAJOR,
374 DLM_DEVICE_VERSION_MINOR,
375 DLM_DEVICE_VERSION_PATCH);
376 return -EINVAL;
377 }
378 return 0;
379}
380
381/*
382 * device_write
383 *
384 * device_user_lock
385 * dlm_user_request -> request_lock
386 * dlm_user_convert -> convert_lock
387 *
388 * device_user_unlock
389 * dlm_user_unlock -> unlock_lock
390 * dlm_user_cancel -> cancel_lock
391 *
392 * device_create_lockspace
393 * dlm_new_lockspace
394 *
395 * device_remove_lockspace
396 * dlm_release_lockspace
397 */
398
399/* a write to a lockspace device is a lock or unlock request, a write
400 to the control device is to create/remove a lockspace */
401
402static ssize_t device_write(struct file *file, const char __user *buf,
403 size_t count, loff_t *ppos)
404{
405 struct dlm_user_proc *proc = file->private_data;
406 struct dlm_write_request *kbuf;
407 sigset_t tmpsig, allsigs;
408 int error;
409
410#ifdef CONFIG_COMPAT
411 if (count < sizeof(struct dlm_write_request32))
412#else
413 if (count < sizeof(struct dlm_write_request))
414#endif
415 return -EINVAL;
416
417 kbuf = kmalloc(count, GFP_KERNEL);
418 if (!kbuf)
419 return -ENOMEM;
420
421 if (copy_from_user(kbuf, buf, count)) {
422 error = -EFAULT;
423 goto out_free;
424 }
425
426 if (check_version(kbuf)) {
427 error = -EBADE;
428 goto out_free;
429 }
430
431#ifdef CONFIG_COMPAT
432 if (!kbuf->is64bit) {
433 struct dlm_write_request32 *k32buf;
434 k32buf = (struct dlm_write_request32 *)kbuf;
435 kbuf = kmalloc(count + (sizeof(struct dlm_write_request) -
436 sizeof(struct dlm_write_request32)), GFP_KERNEL);
437 if (!kbuf)
438 return -ENOMEM;
439
440 if (proc)
441 set_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags);
442 compat_input(kbuf, k32buf);
443 kfree(k32buf);
444 }
445#endif
446
447 /* do we really need this? can a write happen after a close? */
448 if ((kbuf->cmd == DLM_USER_LOCK || kbuf->cmd == DLM_USER_UNLOCK) &&
449 test_bit(DLM_PROC_FLAGS_CLOSING, &proc->flags))
450 return -EINVAL;
451
452 sigfillset(&allsigs);
453 sigprocmask(SIG_BLOCK, &allsigs, &tmpsig);
454
455 error = -EINVAL;
456
457 switch (kbuf->cmd)
458 {
459 case DLM_USER_LOCK:
460 if (!proc) {
461 log_print("no locking on control device");
462 goto out_sig;
463 }
464 error = device_user_lock(proc, &kbuf->i.lock);
465 break;
466
467 case DLM_USER_UNLOCK:
468 if (!proc) {
469 log_print("no locking on control device");
470 goto out_sig;
471 }
472 error = device_user_unlock(proc, &kbuf->i.lock);
473 break;
474
475 case DLM_USER_CREATE_LOCKSPACE:
476 if (proc) {
477 log_print("create/remove only on control device");
478 goto out_sig;
479 }
480 error = device_create_lockspace(&kbuf->i.lspace);
481 break;
482
483 case DLM_USER_REMOVE_LOCKSPACE:
484 if (proc) {
485 log_print("create/remove only on control device");
486 goto out_sig;
487 }
488 error = device_remove_lockspace(&kbuf->i.lspace);
489 break;
490
491 default:
492 log_print("Unknown command passed to DLM device : %d\n",
493 kbuf->cmd);
494 }
495
496 out_sig:
497 sigprocmask(SIG_SETMASK, &tmpsig, NULL);
498 recalc_sigpending();
499 out_free:
500 kfree(kbuf);
501 return error;
502}
503
504/* Every process that opens the lockspace device has its own "proc" structure
505 hanging off the open file that's used to keep track of locks owned by the
506 process and asts that need to be delivered to the process. */
507
508static int device_open(struct inode *inode, struct file *file)
509{
510 struct dlm_user_proc *proc;
511 struct dlm_ls *ls;
512
513 ls = dlm_find_lockspace_device(iminor(inode));
514 if (!ls)
515 return -ENOENT;
516
517 proc = kzalloc(sizeof(struct dlm_user_proc), GFP_KERNEL);
518 if (!proc) {
519 dlm_put_lockspace(ls);
520 return -ENOMEM;
521 }
522
523 proc->lockspace = ls->ls_local_handle;
524 INIT_LIST_HEAD(&proc->asts);
525 INIT_LIST_HEAD(&proc->locks);
526 spin_lock_init(&proc->asts_spin);
527 spin_lock_init(&proc->locks_spin);
528 init_waitqueue_head(&proc->wait);
529 file->private_data = proc;
530
531 return 0;
532}
533
534static int device_close(struct inode *inode, struct file *file)
535{
536 struct dlm_user_proc *proc = file->private_data;
537 struct dlm_ls *ls;
538 sigset_t tmpsig, allsigs;
539
540 ls = dlm_find_lockspace_local(proc->lockspace);
541 if (!ls)
542 return -ENOENT;
543
544 sigfillset(&allsigs);
545 sigprocmask(SIG_BLOCK, &allsigs, &tmpsig);
546
547 set_bit(DLM_PROC_FLAGS_CLOSING, &proc->flags);
548
549 dlm_clear_proc_locks(ls, proc);
550
551 /* at this point no more lkb's should exist for this lockspace,
552 so there's no chance of dlm_user_add_ast() being called and
553 looking for lkb->ua->proc */
554
555 kfree(proc);
556 file->private_data = NULL;
557
558 dlm_put_lockspace(ls);
559 dlm_put_lockspace(ls); /* for the find in device_open() */
560
561 /* FIXME: AUTOFREE: if this ls is no longer used do
562 device_remove_lockspace() */
563
564 sigprocmask(SIG_SETMASK, &tmpsig, NULL);
565 recalc_sigpending();
566
567 return 0;
568}
569
570static int copy_result_to_user(struct dlm_user_args *ua, int compat, int type,
571 int bmode, char __user *buf, size_t count)
572{
573#ifdef CONFIG_COMPAT
574 struct dlm_lock_result32 result32;
575#endif
576 struct dlm_lock_result result;
577 void *resultptr;
578 int error=0;
579 int len;
580 int struct_len;
581
582 memset(&result, 0, sizeof(struct dlm_lock_result));
583 memcpy(&result.lksb, &ua->lksb, sizeof(struct dlm_lksb));
584 result.user_lksb = ua->user_lksb;
585
586 /* FIXME: dlm1 provides for the user's bastparam/addr to not be updated
587 in a conversion unless the conversion is successful. See code
588 in dlm_user_convert() for updating ua from ua_tmp. OpenVMS, though,
589 notes that a new blocking AST address and parameter are set even if
590 the conversion fails, so maybe we should just do that. */
591
592 if (type == AST_BAST) {
593 result.user_astaddr = ua->bastaddr;
594 result.user_astparam = ua->bastparam;
595 result.bast_mode = bmode;
596 } else {
597 result.user_astaddr = ua->castaddr;
598 result.user_astparam = ua->castparam;
599 }
600
601#ifdef CONFIG_COMPAT
602 if (compat)
603 len = sizeof(struct dlm_lock_result32);
604 else
605#endif
606 len = sizeof(struct dlm_lock_result);
607 struct_len = len;
608
609 /* copy lvb to userspace if there is one, it's been updated, and
610 the user buffer has space for it */
611
612 if (ua->update_user_lvb && ua->lksb.sb_lvbptr &&
613 count >= len + DLM_USER_LVB_LEN) {
614 if (copy_to_user(buf+len, ua->lksb.sb_lvbptr,
615 DLM_USER_LVB_LEN)) {
616 error = -EFAULT;
617 goto out;
618 }
619
620 result.lvb_offset = len;
621 len += DLM_USER_LVB_LEN;
622 }
623
624 result.length = len;
625 resultptr = &result;
626#ifdef CONFIG_COMPAT
627 if (compat) {
628 compat_output(&result, &result32);
629 resultptr = &result32;
630 }
631#endif
632
633 if (copy_to_user(buf, resultptr, struct_len))
634 error = -EFAULT;
635 else
636 error = len;
637 out:
638 return error;
639}
640
641/* a read returns a single ast described in a struct dlm_lock_result */
642
643static ssize_t device_read(struct file *file, char __user *buf, size_t count,
644 loff_t *ppos)
645{
646 struct dlm_user_proc *proc = file->private_data;
647 struct dlm_lkb *lkb;
648 struct dlm_user_args *ua;
649 DECLARE_WAITQUEUE(wait, current);
650 int error, type=0, bmode=0, removed = 0;
651
652#ifdef CONFIG_COMPAT
653 if (count < sizeof(struct dlm_lock_result32))
654#else
655 if (count < sizeof(struct dlm_lock_result))
656#endif
657 return -EINVAL;
658
659 /* do we really need this? can a read happen after a close? */
660 if (test_bit(DLM_PROC_FLAGS_CLOSING, &proc->flags))
661 return -EINVAL;
662
663 spin_lock(&proc->asts_spin);
664 if (list_empty(&proc->asts)) {
665 if (file->f_flags & O_NONBLOCK) {
666 spin_unlock(&proc->asts_spin);
667 return -EAGAIN;
668 }
669
670 add_wait_queue(&proc->wait, &wait);
671
672 repeat:
673 set_current_state(TASK_INTERRUPTIBLE);
674 if (list_empty(&proc->asts) && !signal_pending(current)) {
675 spin_unlock(&proc->asts_spin);
676 schedule();
677 spin_lock(&proc->asts_spin);
678 goto repeat;
679 }
680 set_current_state(TASK_RUNNING);
681 remove_wait_queue(&proc->wait, &wait);
682
683 if (signal_pending(current)) {
684 spin_unlock(&proc->asts_spin);
685 return -ERESTARTSYS;
686 }
687 }
688
689 if (list_empty(&proc->asts)) {
690 spin_unlock(&proc->asts_spin);
691 return -EAGAIN;
692 }
693
694 /* there may be both completion and blocking asts to return for
695 the lkb, don't remove lkb from asts list unless no asts remain */
696
697 lkb = list_entry(proc->asts.next, struct dlm_lkb, lkb_astqueue);
698
699 if (lkb->lkb_ast_type & AST_COMP) {
700 lkb->lkb_ast_type &= ~AST_COMP;
701 type = AST_COMP;
702 } else if (lkb->lkb_ast_type & AST_BAST) {
703 lkb->lkb_ast_type &= ~AST_BAST;
704 type = AST_BAST;
705 bmode = lkb->lkb_bastmode;
706 }
707
708 if (!lkb->lkb_ast_type) {
709 list_del(&lkb->lkb_astqueue);
710 removed = 1;
711 }
712 spin_unlock(&proc->asts_spin);
713
714 ua = (struct dlm_user_args *)lkb->lkb_astparam;
715 error = copy_result_to_user(ua,
716 test_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags),
717 type, bmode, buf, count);
718
719 /* removes reference for the proc->asts lists added by
720 dlm_user_add_ast() and may result in the lkb being freed */
721 if (removed)
722 dlm_put_lkb(lkb);
723
724 return error;
725}
726
727static unsigned int device_poll(struct file *file, poll_table *wait)
728{
729 struct dlm_user_proc *proc = file->private_data;
730
731 poll_wait(file, &proc->wait, wait);
732
733 spin_lock(&proc->asts_spin);
734 if (!list_empty(&proc->asts)) {
735 spin_unlock(&proc->asts_spin);
736 return POLLIN | POLLRDNORM;
737 }
738 spin_unlock(&proc->asts_spin);
739 return 0;
740}
741
742static int ctl_device_open(struct inode *inode, struct file *file)
743{
744 file->private_data = NULL;
745 return 0;
746}
747
748static int ctl_device_close(struct inode *inode, struct file *file)
749{
750 return 0;
751}
752
753static struct file_operations device_fops = {
754 .open = device_open,
755 .release = device_close,
756 .read = device_read,
757 .write = device_write,
758 .poll = device_poll,
759 .owner = THIS_MODULE,
760};
761
762static struct file_operations ctl_device_fops = {
763 .open = ctl_device_open,
764 .release = ctl_device_close,
765 .write = device_write,
766 .owner = THIS_MODULE,
767};
768
769int dlm_user_init(void)
770{
771 int error;
772
773 ctl_device.name = "dlm-control";
774 ctl_device.fops = &ctl_device_fops;
775 ctl_device.minor = MISC_DYNAMIC_MINOR;
776
777 error = misc_register(&ctl_device);
778 if (error)
779 log_print("misc_register failed for control device");
780
781 return error;
782}
783
784void dlm_user_exit(void)
785{
786 misc_deregister(&ctl_device);
787}
788
diff --git a/fs/dlm/user.h b/fs/dlm/user.h
new file mode 100644
index 000000000000..d38e9f3e4151
--- /dev/null
+++ b/fs/dlm/user.h
@@ -0,0 +1,16 @@
1/*
2 * Copyright (C) 2006 Red Hat, Inc. All rights reserved.
3 *
4 * This copyrighted material is made available to anyone wishing to use,
5 * modify, copy, or redistribute it subject to the terms and conditions
6 * of the GNU General Public License v.2.
7 */
8
9#ifndef __USER_DOT_H__
10#define __USER_DOT_H__
11
12void dlm_user_add_ast(struct dlm_lkb *lkb, int type);
13int dlm_user_init(void);
14void dlm_user_exit(void);
15
16#endif
diff --git a/fs/dlm/util.c b/fs/dlm/util.c
new file mode 100644
index 000000000000..767197db9944
--- /dev/null
+++ b/fs/dlm/util.c
@@ -0,0 +1,161 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13#include "dlm_internal.h"
14#include "rcom.h"
15#include "util.h"
16
17static void header_out(struct dlm_header *hd)
18{
19 hd->h_version = cpu_to_le32(hd->h_version);
20 hd->h_lockspace = cpu_to_le32(hd->h_lockspace);
21 hd->h_nodeid = cpu_to_le32(hd->h_nodeid);
22 hd->h_length = cpu_to_le16(hd->h_length);
23}
24
25static void header_in(struct dlm_header *hd)
26{
27 hd->h_version = le32_to_cpu(hd->h_version);
28 hd->h_lockspace = le32_to_cpu(hd->h_lockspace);
29 hd->h_nodeid = le32_to_cpu(hd->h_nodeid);
30 hd->h_length = le16_to_cpu(hd->h_length);
31}
32
33void dlm_message_out(struct dlm_message *ms)
34{
35 struct dlm_header *hd = (struct dlm_header *) ms;
36
37 header_out(hd);
38
39 ms->m_type = cpu_to_le32(ms->m_type);
40 ms->m_nodeid = cpu_to_le32(ms->m_nodeid);
41 ms->m_pid = cpu_to_le32(ms->m_pid);
42 ms->m_lkid = cpu_to_le32(ms->m_lkid);
43 ms->m_remid = cpu_to_le32(ms->m_remid);
44 ms->m_parent_lkid = cpu_to_le32(ms->m_parent_lkid);
45 ms->m_parent_remid = cpu_to_le32(ms->m_parent_remid);
46 ms->m_exflags = cpu_to_le32(ms->m_exflags);
47 ms->m_sbflags = cpu_to_le32(ms->m_sbflags);
48 ms->m_flags = cpu_to_le32(ms->m_flags);
49 ms->m_lvbseq = cpu_to_le32(ms->m_lvbseq);
50 ms->m_hash = cpu_to_le32(ms->m_hash);
51 ms->m_status = cpu_to_le32(ms->m_status);
52 ms->m_grmode = cpu_to_le32(ms->m_grmode);
53 ms->m_rqmode = cpu_to_le32(ms->m_rqmode);
54 ms->m_bastmode = cpu_to_le32(ms->m_bastmode);
55 ms->m_asts = cpu_to_le32(ms->m_asts);
56 ms->m_result = cpu_to_le32(ms->m_result);
57}
58
59void dlm_message_in(struct dlm_message *ms)
60{
61 struct dlm_header *hd = (struct dlm_header *) ms;
62
63 header_in(hd);
64
65 ms->m_type = le32_to_cpu(ms->m_type);
66 ms->m_nodeid = le32_to_cpu(ms->m_nodeid);
67 ms->m_pid = le32_to_cpu(ms->m_pid);
68 ms->m_lkid = le32_to_cpu(ms->m_lkid);
69 ms->m_remid = le32_to_cpu(ms->m_remid);
70 ms->m_parent_lkid = le32_to_cpu(ms->m_parent_lkid);
71 ms->m_parent_remid = le32_to_cpu(ms->m_parent_remid);
72 ms->m_exflags = le32_to_cpu(ms->m_exflags);
73 ms->m_sbflags = le32_to_cpu(ms->m_sbflags);
74 ms->m_flags = le32_to_cpu(ms->m_flags);
75 ms->m_lvbseq = le32_to_cpu(ms->m_lvbseq);
76 ms->m_hash = le32_to_cpu(ms->m_hash);
77 ms->m_status = le32_to_cpu(ms->m_status);
78 ms->m_grmode = le32_to_cpu(ms->m_grmode);
79 ms->m_rqmode = le32_to_cpu(ms->m_rqmode);
80 ms->m_bastmode = le32_to_cpu(ms->m_bastmode);
81 ms->m_asts = le32_to_cpu(ms->m_asts);
82 ms->m_result = le32_to_cpu(ms->m_result);
83}
84
85static void rcom_lock_out(struct rcom_lock *rl)
86{
87 rl->rl_ownpid = cpu_to_le32(rl->rl_ownpid);
88 rl->rl_lkid = cpu_to_le32(rl->rl_lkid);
89 rl->rl_remid = cpu_to_le32(rl->rl_remid);
90 rl->rl_parent_lkid = cpu_to_le32(rl->rl_parent_lkid);
91 rl->rl_parent_remid = cpu_to_le32(rl->rl_parent_remid);
92 rl->rl_exflags = cpu_to_le32(rl->rl_exflags);
93 rl->rl_flags = cpu_to_le32(rl->rl_flags);
94 rl->rl_lvbseq = cpu_to_le32(rl->rl_lvbseq);
95 rl->rl_result = cpu_to_le32(rl->rl_result);
96 rl->rl_wait_type = cpu_to_le16(rl->rl_wait_type);
97 rl->rl_namelen = cpu_to_le16(rl->rl_namelen);
98}
99
100static void rcom_lock_in(struct rcom_lock *rl)
101{
102 rl->rl_ownpid = le32_to_cpu(rl->rl_ownpid);
103 rl->rl_lkid = le32_to_cpu(rl->rl_lkid);
104 rl->rl_remid = le32_to_cpu(rl->rl_remid);
105 rl->rl_parent_lkid = le32_to_cpu(rl->rl_parent_lkid);
106 rl->rl_parent_remid = le32_to_cpu(rl->rl_parent_remid);
107 rl->rl_exflags = le32_to_cpu(rl->rl_exflags);
108 rl->rl_flags = le32_to_cpu(rl->rl_flags);
109 rl->rl_lvbseq = le32_to_cpu(rl->rl_lvbseq);
110 rl->rl_result = le32_to_cpu(rl->rl_result);
111 rl->rl_wait_type = le16_to_cpu(rl->rl_wait_type);
112 rl->rl_namelen = le16_to_cpu(rl->rl_namelen);
113}
114
115static void rcom_config_out(struct rcom_config *rf)
116{
117 rf->rf_lvblen = cpu_to_le32(rf->rf_lvblen);
118 rf->rf_lsflags = cpu_to_le32(rf->rf_lsflags);
119}
120
121static void rcom_config_in(struct rcom_config *rf)
122{
123 rf->rf_lvblen = le32_to_cpu(rf->rf_lvblen);
124 rf->rf_lsflags = le32_to_cpu(rf->rf_lsflags);
125}
126
127void dlm_rcom_out(struct dlm_rcom *rc)
128{
129 struct dlm_header *hd = (struct dlm_header *) rc;
130 int type = rc->rc_type;
131
132 header_out(hd);
133
134 rc->rc_type = cpu_to_le32(rc->rc_type);
135 rc->rc_result = cpu_to_le32(rc->rc_result);
136 rc->rc_id = cpu_to_le64(rc->rc_id);
137
138 if (type == DLM_RCOM_LOCK)
139 rcom_lock_out((struct rcom_lock *) rc->rc_buf);
140
141 else if (type == DLM_RCOM_STATUS_REPLY)
142 rcom_config_out((struct rcom_config *) rc->rc_buf);
143}
144
145void dlm_rcom_in(struct dlm_rcom *rc)
146{
147 struct dlm_header *hd = (struct dlm_header *) rc;
148
149 header_in(hd);
150
151 rc->rc_type = le32_to_cpu(rc->rc_type);
152 rc->rc_result = le32_to_cpu(rc->rc_result);
153 rc->rc_id = le64_to_cpu(rc->rc_id);
154
155 if (rc->rc_type == DLM_RCOM_LOCK)
156 rcom_lock_in((struct rcom_lock *) rc->rc_buf);
157
158 else if (rc->rc_type == DLM_RCOM_STATUS_REPLY)
159 rcom_config_in((struct rcom_config *) rc->rc_buf);
160}
161
diff --git a/fs/dlm/util.h b/fs/dlm/util.h
new file mode 100644
index 000000000000..2b25915161c0
--- /dev/null
+++ b/fs/dlm/util.h
@@ -0,0 +1,22 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13#ifndef __UTIL_DOT_H__
14#define __UTIL_DOT_H__
15
16void dlm_message_out(struct dlm_message *ms);
17void dlm_message_in(struct dlm_message *ms);
18void dlm_rcom_out(struct dlm_rcom *rc);
19void dlm_rcom_in(struct dlm_rcom *rc);
20
21#endif
22