aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--fs/Kconfig1
-rw-r--r--fs/Makefile1
-rw-r--r--fs/dlm/Kconfig30
-rw-r--r--fs/dlm/Makefile21
-rw-r--r--fs/dlm/ast.c167
-rw-r--r--fs/dlm/ast.h26
-rw-r--r--fs/dlm/config.c787
-rw-r--r--fs/dlm/config.h42
-rw-r--r--fs/dlm/debug_fs.c310
-rw-r--r--fs/dlm/device.c1084
-rw-r--r--fs/dlm/dir.c423
-rw-r--r--fs/dlm/dir.h30
-rw-r--r--fs/dlm/dlm_internal.h518
-rw-r--r--fs/dlm/lock.c3610
-rw-r--r--fs/dlm/lock.h50
-rw-r--r--fs/dlm/lockspace.c666
-rw-r--r--fs/dlm/lockspace.h24
-rw-r--r--fs/dlm/lowcomms.c1218
-rw-r--r--fs/dlm/lowcomms.h25
-rw-r--r--fs/dlm/lvb_table.h18
-rw-r--r--fs/dlm/main.c89
-rw-r--r--fs/dlm/member.c314
-rw-r--r--fs/dlm/member.h24
-rw-r--r--fs/dlm/memory.c122
-rw-r--r--fs/dlm/memory.h31
-rw-r--r--fs/dlm/midcomms.c140
-rw-r--r--fs/dlm/midcomms.h21
-rw-r--r--fs/dlm/rcom.c460
-rw-r--r--fs/dlm/rcom.h24
-rw-r--r--fs/dlm/recover.c762
-rw-r--r--fs/dlm/recover.h34
-rw-r--r--fs/dlm/recoverd.c285
-rw-r--r--fs/dlm/recoverd.h24
-rw-r--r--fs/dlm/requestqueue.c184
-rw-r--r--fs/dlm/requestqueue.h22
-rw-r--r--fs/dlm/util.c173
-rw-r--r--fs/dlm/util.h22
-rw-r--r--include/linux/dlm.h312
-rw-r--r--include/linux/dlm_device.h84
39 files changed, 12178 insertions, 0 deletions
diff --git a/fs/Kconfig b/fs/Kconfig
index bec8afa5704d..3a32f3f97e9b 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -1831,6 +1831,7 @@ source "fs/partitions/Kconfig"
1831endmenu 1831endmenu
1832 1832
1833source "fs/nls/Kconfig" 1833source "fs/nls/Kconfig"
1834source "fs/dlm/Kconfig"
1834 1835
1835endmenu 1836endmenu
1836 1837
diff --git a/fs/Makefile b/fs/Makefile
index 0922727732c5..b298f4fdc6f2 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -48,6 +48,7 @@ obj-$(CONFIG_SYSFS) += sysfs/
48obj-y += devpts/ 48obj-y += devpts/
49 49
50obj-$(CONFIG_PROFILING) += dcookies.o 50obj-$(CONFIG_PROFILING) += dcookies.o
51obj-$(CONFIG_DLM) += dlm/
51 52
52# Do not add any filesystems before this line 53# Do not add any filesystems before this line
53obj-$(CONFIG_REISERFS_FS) += reiserfs/ 54obj-$(CONFIG_REISERFS_FS) += reiserfs/
diff --git a/fs/dlm/Kconfig b/fs/dlm/Kconfig
new file mode 100644
index 000000000000..d01f735e6e06
--- /dev/null
+++ b/fs/dlm/Kconfig
@@ -0,0 +1,30 @@
1menu "Distributed Lock Manager"
2 depends on INET && EXPERIMENTAL
3
4config DLM
5 tristate "Distributed Lock Manager (DLM)"
6 depends on SYSFS
7 depends on IPV6 || IPV6=n
8 select IP_SCTP
9 select CONFIGFS_FS
10 help
11 A general purpose distributed lock manager for kernel or userspace
12 applications.
13
14config DLM_DEVICE
15 tristate "DLM device for userspace access"
16 depends on DLM
17 help
18 This module creates a misc device through which the dlm lockspace
19 and locking functions become available to userspace applications
20 (usually through the libdlm library).
21
22config DLM_DEBUG
23 bool "DLM debugging"
24 depends on DLM
25 help
26 Under the debugfs mount point, the name of each lockspace will
27 appear as a file in the "dlm" directory. The output is the
28 list of resource and locks the local node knows about.
29
30endmenu
diff --git a/fs/dlm/Makefile b/fs/dlm/Makefile
new file mode 100644
index 000000000000..1e6232e7d8e5
--- /dev/null
+++ b/fs/dlm/Makefile
@@ -0,0 +1,21 @@
1obj-$(CONFIG_DLM) += dlm.o
2obj-$(CONFIG_DLM_DEVICE) += dlm_device.o
3
4dlm-y := ast.o \
5 config.o \
6 dir.o \
7 lock.o \
8 lockspace.o \
9 lowcomms.o \
10 main.o \
11 member.o \
12 memory.o \
13 midcomms.o \
14 rcom.o \
15 recover.o \
16 recoverd.o \
17 requestqueue.o \
18 util.o
19dlm-$(CONFIG_DLM_DEBUG) += debug_fs.o
20
21dlm_device-y := device.o
diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c
new file mode 100644
index 000000000000..2bd1c5e1a72c
--- /dev/null
+++ b/fs/dlm/ast.c
@@ -0,0 +1,167 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#include "dlm_internal.h"
15#include "lock.h"
16#include "ast.h"
17
18#define WAKE_ASTS 0
19
20static struct list_head ast_queue;
21static spinlock_t ast_queue_lock;
22static struct task_struct * astd_task;
23static unsigned long astd_wakeflags;
24static struct semaphore astd_running;
25
26
27void dlm_del_ast(struct dlm_lkb *lkb)
28{
29 spin_lock(&ast_queue_lock);
30 if (lkb->lkb_ast_type & (AST_COMP | AST_BAST))
31 list_del(&lkb->lkb_astqueue);
32 spin_unlock(&ast_queue_lock);
33}
34
35void dlm_add_ast(struct dlm_lkb *lkb, int type)
36{
37 spin_lock(&ast_queue_lock);
38 if (!(lkb->lkb_ast_type & (AST_COMP | AST_BAST))) {
39 kref_get(&lkb->lkb_ref);
40 list_add_tail(&lkb->lkb_astqueue, &ast_queue);
41 }
42 lkb->lkb_ast_type |= type;
43 spin_unlock(&ast_queue_lock);
44
45 set_bit(WAKE_ASTS, &astd_wakeflags);
46 wake_up_process(astd_task);
47}
48
49static void process_asts(void)
50{
51 struct dlm_ls *ls = NULL;
52 struct dlm_rsb *r = NULL;
53 struct dlm_lkb *lkb;
54 void (*cast) (long param);
55 void (*bast) (long param, int mode);
56 int type = 0, found, bmode;
57
58 for (;;) {
59 found = FALSE;
60 spin_lock(&ast_queue_lock);
61 list_for_each_entry(lkb, &ast_queue, lkb_astqueue) {
62 r = lkb->lkb_resource;
63 ls = r->res_ls;
64
65 if (dlm_locking_stopped(ls))
66 continue;
67
68 list_del(&lkb->lkb_astqueue);
69 type = lkb->lkb_ast_type;
70 lkb->lkb_ast_type = 0;
71 found = TRUE;
72 break;
73 }
74 spin_unlock(&ast_queue_lock);
75
76 if (!found)
77 break;
78
79 cast = lkb->lkb_astaddr;
80 bast = lkb->lkb_bastaddr;
81 bmode = lkb->lkb_bastmode;
82
83 if ((type & AST_COMP) && cast)
84 cast(lkb->lkb_astparam);
85
86 /* FIXME: Is it safe to look at lkb_grmode here
87 without doing a lock_rsb() ?
88 Look at other checks in v1 to avoid basts. */
89
90 if ((type & AST_BAST) && bast)
91 if (!dlm_modes_compat(lkb->lkb_grmode, bmode))
92 bast(lkb->lkb_astparam, bmode);
93
94 /* this removes the reference added by dlm_add_ast
95 and may result in the lkb being freed */
96 dlm_put_lkb(lkb);
97
98 schedule();
99 }
100}
101
102static inline int no_asts(void)
103{
104 int ret;
105
106 spin_lock(&ast_queue_lock);
107 ret = list_empty(&ast_queue);
108 spin_unlock(&ast_queue_lock);
109 return ret;
110}
111
112static int dlm_astd(void *data)
113{
114 while (!kthread_should_stop()) {
115 set_current_state(TASK_INTERRUPTIBLE);
116 if (!test_bit(WAKE_ASTS, &astd_wakeflags))
117 schedule();
118 set_current_state(TASK_RUNNING);
119
120 down(&astd_running);
121 if (test_and_clear_bit(WAKE_ASTS, &astd_wakeflags))
122 process_asts();
123 up(&astd_running);
124 }
125 return 0;
126}
127
128void dlm_astd_wake(void)
129{
130 if (!no_asts()) {
131 set_bit(WAKE_ASTS, &astd_wakeflags);
132 wake_up_process(astd_task);
133 }
134}
135
136int dlm_astd_start(void)
137{
138 struct task_struct *p;
139 int error = 0;
140
141 INIT_LIST_HEAD(&ast_queue);
142 spin_lock_init(&ast_queue_lock);
143 init_MUTEX(&astd_running);
144
145 p = kthread_run(dlm_astd, NULL, "dlm_astd");
146 if (IS_ERR(p))
147 error = PTR_ERR(p);
148 else
149 astd_task = p;
150 return error;
151}
152
153void dlm_astd_stop(void)
154{
155 kthread_stop(astd_task);
156}
157
158void dlm_astd_suspend(void)
159{
160 down(&astd_running);
161}
162
163void dlm_astd_resume(void)
164{
165 up(&astd_running);
166}
167
diff --git a/fs/dlm/ast.h b/fs/dlm/ast.h
new file mode 100644
index 000000000000..6ee276c74c52
--- /dev/null
+++ b/fs/dlm/ast.h
@@ -0,0 +1,26 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13#ifndef __ASTD_DOT_H__
14#define __ASTD_DOT_H__
15
16void dlm_add_ast(struct dlm_lkb *lkb, int type);
17void dlm_del_ast(struct dlm_lkb *lkb);
18
19void dlm_astd_wake(void);
20int dlm_astd_start(void);
21void dlm_astd_stop(void);
22void dlm_astd_suspend(void);
23void dlm_astd_resume(void);
24
25#endif
26
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
new file mode 100644
index 000000000000..024ace9973a8
--- /dev/null
+++ b/fs/dlm/config.c
@@ -0,0 +1,787 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#include <linux/kernel.h>
15#include <linux/module.h>
16#include <linux/configfs.h>
17#include <net/sock.h>
18
19#include "config.h"
20
21/*
22 * /config/dlm/<cluster>/spaces/<space>/nodes/<node>/nodeid
23 * /config/dlm/<cluster>/spaces/<space>/nodes/<node>/weight
24 * /config/dlm/<cluster>/comms/<comm>/nodeid
25 * /config/dlm/<cluster>/comms/<comm>/local
26 * /config/dlm/<cluster>/comms/<comm>/addr
27 * The <cluster> level is useless, but I haven't figured out how to avoid it.
28 */
29
30static struct config_group *space_list;
31static struct config_group *comm_list;
32static struct comm *local_comm;
33
34struct clusters;
35struct cluster;
36struct spaces;
37struct space;
38struct comms;
39struct comm;
40struct nodes;
41struct node;
42
43static struct config_group *make_cluster(struct config_group *, const char *);
44static void drop_cluster(struct config_group *, struct config_item *);
45static void release_cluster(struct config_item *);
46static struct config_group *make_space(struct config_group *, const char *);
47static void drop_space(struct config_group *, struct config_item *);
48static void release_space(struct config_item *);
49static struct config_item *make_comm(struct config_group *, const char *);
50static void drop_comm(struct config_group *, struct config_item *);
51static void release_comm(struct config_item *);
52static struct config_item *make_node(struct config_group *, const char *);
53static void drop_node(struct config_group *, struct config_item *);
54static void release_node(struct config_item *);
55
56static ssize_t show_comm(struct config_item *i, struct configfs_attribute *a,
57 char *buf);
58static ssize_t store_comm(struct config_item *i, struct configfs_attribute *a,
59 const char *buf, size_t len);
60static ssize_t show_node(struct config_item *i, struct configfs_attribute *a,
61 char *buf);
62static ssize_t store_node(struct config_item *i, struct configfs_attribute *a,
63 const char *buf, size_t len);
64
65static ssize_t comm_nodeid_read(struct comm *cm, char *buf);
66static ssize_t comm_nodeid_write(struct comm *cm, const char *buf, size_t len);
67static ssize_t comm_local_read(struct comm *cm, char *buf);
68static ssize_t comm_local_write(struct comm *cm, const char *buf, size_t len);
69static ssize_t comm_addr_write(struct comm *cm, const char *buf, size_t len);
70static ssize_t node_nodeid_read(struct node *nd, char *buf);
71static ssize_t node_nodeid_write(struct node *nd, const char *buf, size_t len);
72static ssize_t node_weight_read(struct node *nd, char *buf);
73static ssize_t node_weight_write(struct node *nd, const char *buf, size_t len);
74
75enum {
76 COMM_ATTR_NODEID = 0,
77 COMM_ATTR_LOCAL,
78 COMM_ATTR_ADDR,
79};
80
81struct comm_attribute {
82 struct configfs_attribute attr;
83 ssize_t (*show)(struct comm *, char *);
84 ssize_t (*store)(struct comm *, const char *, size_t);
85};
86
87static struct comm_attribute comm_attr_nodeid = {
88 .attr = { .ca_owner = THIS_MODULE,
89 .ca_name = "nodeid",
90 .ca_mode = S_IRUGO | S_IWUSR },
91 .show = comm_nodeid_read,
92 .store = comm_nodeid_write,
93};
94
95static struct comm_attribute comm_attr_local = {
96 .attr = { .ca_owner = THIS_MODULE,
97 .ca_name = "local",
98 .ca_mode = S_IRUGO | S_IWUSR },
99 .show = comm_local_read,
100 .store = comm_local_write,
101};
102
103static struct comm_attribute comm_attr_addr = {
104 .attr = { .ca_owner = THIS_MODULE,
105 .ca_name = "addr",
106 .ca_mode = S_IRUGO | S_IWUSR },
107 .store = comm_addr_write,
108};
109
110static struct configfs_attribute *comm_attrs[] = {
111 [COMM_ATTR_NODEID] = &comm_attr_nodeid.attr,
112 [COMM_ATTR_LOCAL] = &comm_attr_local.attr,
113 [COMM_ATTR_ADDR] = &comm_attr_addr.attr,
114 NULL,
115};
116
117enum {
118 NODE_ATTR_NODEID = 0,
119 NODE_ATTR_WEIGHT,
120};
121
122struct node_attribute {
123 struct configfs_attribute attr;
124 ssize_t (*show)(struct node *, char *);
125 ssize_t (*store)(struct node *, const char *, size_t);
126};
127
128static struct node_attribute node_attr_nodeid = {
129 .attr = { .ca_owner = THIS_MODULE,
130 .ca_name = "nodeid",
131 .ca_mode = S_IRUGO | S_IWUSR },
132 .show = node_nodeid_read,
133 .store = node_nodeid_write,
134};
135
136static struct node_attribute node_attr_weight = {
137 .attr = { .ca_owner = THIS_MODULE,
138 .ca_name = "weight",
139 .ca_mode = S_IRUGO | S_IWUSR },
140 .show = node_weight_read,
141 .store = node_weight_write,
142};
143
144static struct configfs_attribute *node_attrs[] = {
145 [NODE_ATTR_NODEID] = &node_attr_nodeid.attr,
146 [NODE_ATTR_WEIGHT] = &node_attr_weight.attr,
147 NULL,
148};
149
150struct clusters {
151 struct configfs_subsystem subsys;
152};
153
154struct cluster {
155 struct config_group group;
156};
157
158struct spaces {
159 struct config_group ss_group;
160};
161
162struct space {
163 struct config_group group;
164 struct list_head members;
165 struct semaphore members_lock;
166 int members_count;
167};
168
169struct comms {
170 struct config_group cs_group;
171};
172
173struct comm {
174 struct config_item item;
175 int nodeid;
176 int local;
177 int addr_count;
178 struct sockaddr_storage *addr[DLM_MAX_ADDR_COUNT];
179};
180
181struct nodes {
182 struct config_group ns_group;
183};
184
185struct node {
186 struct config_item item;
187 struct list_head list; /* space->members */
188 int nodeid;
189 int weight;
190};
191
192static struct configfs_group_operations clusters_ops = {
193 .make_group = make_cluster,
194 .drop_item = drop_cluster,
195};
196
197static struct configfs_item_operations cluster_ops = {
198 .release = release_cluster,
199};
200
201static struct configfs_group_operations spaces_ops = {
202 .make_group = make_space,
203 .drop_item = drop_space,
204};
205
206static struct configfs_item_operations space_ops = {
207 .release = release_space,
208};
209
210static struct configfs_group_operations comms_ops = {
211 .make_item = make_comm,
212 .drop_item = drop_comm,
213};
214
215static struct configfs_item_operations comm_ops = {
216 .release = release_comm,
217 .show_attribute = show_comm,
218 .store_attribute = store_comm,
219};
220
221static struct configfs_group_operations nodes_ops = {
222 .make_item = make_node,
223 .drop_item = drop_node,
224};
225
226static struct configfs_item_operations node_ops = {
227 .release = release_node,
228 .show_attribute = show_node,
229 .store_attribute = store_node,
230};
231
232static struct config_item_type clusters_type = {
233 .ct_group_ops = &clusters_ops,
234 .ct_owner = THIS_MODULE,
235};
236
237static struct config_item_type cluster_type = {
238 .ct_item_ops = &cluster_ops,
239 .ct_owner = THIS_MODULE,
240};
241
242static struct config_item_type spaces_type = {
243 .ct_group_ops = &spaces_ops,
244 .ct_owner = THIS_MODULE,
245};
246
247static struct config_item_type space_type = {
248 .ct_item_ops = &space_ops,
249 .ct_owner = THIS_MODULE,
250};
251
252static struct config_item_type comms_type = {
253 .ct_group_ops = &comms_ops,
254 .ct_owner = THIS_MODULE,
255};
256
257static struct config_item_type comm_type = {
258 .ct_item_ops = &comm_ops,
259 .ct_attrs = comm_attrs,
260 .ct_owner = THIS_MODULE,
261};
262
263static struct config_item_type nodes_type = {
264 .ct_group_ops = &nodes_ops,
265 .ct_owner = THIS_MODULE,
266};
267
268static struct config_item_type node_type = {
269 .ct_item_ops = &node_ops,
270 .ct_attrs = node_attrs,
271 .ct_owner = THIS_MODULE,
272};
273
274static struct cluster *to_cluster(struct config_item *i)
275{
276 return i ? container_of(to_config_group(i), struct cluster, group):NULL;
277}
278
279static struct space *to_space(struct config_item *i)
280{
281 return i ? container_of(to_config_group(i), struct space, group) : NULL;
282}
283
284static struct comm *to_comm(struct config_item *i)
285{
286 return i ? container_of(i, struct comm, item) : NULL;
287}
288
289static struct node *to_node(struct config_item *i)
290{
291 return i ? container_of(i, struct node, item) : NULL;
292}
293
294static struct config_group *make_cluster(struct config_group *g,
295 const char *name)
296{
297 struct cluster *cl = NULL;
298 struct spaces *sps = NULL;
299 struct comms *cms = NULL;
300 void *gps = NULL;
301
302 cl = kzalloc(sizeof(struct cluster), GFP_KERNEL);
303 gps = kcalloc(3, sizeof(struct config_group *), GFP_KERNEL);
304 sps = kzalloc(sizeof(struct spaces), GFP_KERNEL);
305 cms = kzalloc(sizeof(struct comms), GFP_KERNEL);
306
307 if (!cl || !gps || !sps || !cms)
308 goto fail;
309
310 config_group_init_type_name(&cl->group, name, &cluster_type);
311 config_group_init_type_name(&sps->ss_group, "spaces", &spaces_type);
312 config_group_init_type_name(&cms->cs_group, "comms", &comms_type);
313
314 cl->group.default_groups = gps;
315 cl->group.default_groups[0] = &sps->ss_group;
316 cl->group.default_groups[1] = &cms->cs_group;
317 cl->group.default_groups[2] = NULL;
318
319 space_list = &sps->ss_group;
320 comm_list = &cms->cs_group;
321 return &cl->group;
322
323 fail:
324 kfree(cl);
325 kfree(gps);
326 kfree(sps);
327 kfree(cms);
328 return NULL;
329}
330
331static void drop_cluster(struct config_group *g, struct config_item *i)
332{
333 struct cluster *cl = to_cluster(i);
334 struct config_item *tmp;
335 int j;
336
337 for (j = 0; cl->group.default_groups[j]; j++) {
338 tmp = &cl->group.default_groups[j]->cg_item;
339 cl->group.default_groups[j] = NULL;
340 config_item_put(tmp);
341 }
342
343 space_list = NULL;
344 comm_list = NULL;
345
346 config_item_put(i);
347}
348
349static void release_cluster(struct config_item *i)
350{
351 struct cluster *cl = to_cluster(i);
352 kfree(cl->group.default_groups);
353 kfree(cl);
354}
355
356static struct config_group *make_space(struct config_group *g, const char *name)
357{
358 struct space *sp = NULL;
359 struct nodes *nds = NULL;
360 void *gps = NULL;
361
362 sp = kzalloc(sizeof(struct space), GFP_KERNEL);
363 gps = kcalloc(2, sizeof(struct config_group *), GFP_KERNEL);
364 nds = kzalloc(sizeof(struct nodes), GFP_KERNEL);
365
366 if (!sp || !gps || !nds)
367 goto fail;
368
369 config_group_init_type_name(&sp->group, name, &space_type);
370 config_group_init_type_name(&nds->ns_group, "nodes", &nodes_type);
371
372 sp->group.default_groups = gps;
373 sp->group.default_groups[0] = &nds->ns_group;
374 sp->group.default_groups[1] = NULL;
375
376 INIT_LIST_HEAD(&sp->members);
377 init_MUTEX(&sp->members_lock);
378 sp->members_count = 0;
379 return &sp->group;
380
381 fail:
382 kfree(sp);
383 kfree(gps);
384 kfree(nds);
385 return NULL;
386}
387
388static void drop_space(struct config_group *g, struct config_item *i)
389{
390 struct space *sp = to_space(i);
391 struct config_item *tmp;
392 int j;
393
394 /* assert list_empty(&sp->members) */
395
396 for (j = 0; sp->group.default_groups[j]; j++) {
397 tmp = &sp->group.default_groups[j]->cg_item;
398 sp->group.default_groups[j] = NULL;
399 config_item_put(tmp);
400 }
401
402 config_item_put(i);
403}
404
405static void release_space(struct config_item *i)
406{
407 struct space *sp = to_space(i);
408 kfree(sp->group.default_groups);
409 kfree(sp);
410}
411
412static struct config_item *make_comm(struct config_group *g, const char *name)
413{
414 struct comm *cm;
415
416 cm = kzalloc(sizeof(struct comm), GFP_KERNEL);
417 if (!cm)
418 return NULL;
419
420 config_item_init_type_name(&cm->item, name, &comm_type);
421 cm->nodeid = -1;
422 cm->local = 0;
423 cm->addr_count = 0;
424 return &cm->item;
425}
426
427static void drop_comm(struct config_group *g, struct config_item *i)
428{
429 struct comm *cm = to_comm(i);
430 if (local_comm == cm)
431 local_comm = NULL;
432 while (cm->addr_count--)
433 kfree(cm->addr[cm->addr_count]);
434 config_item_put(i);
435}
436
437static void release_comm(struct config_item *i)
438{
439 struct comm *cm = to_comm(i);
440 kfree(cm);
441}
442
443static struct config_item *make_node(struct config_group *g, const char *name)
444{
445 struct space *sp = to_space(g->cg_item.ci_parent);
446 struct node *nd;
447
448 nd = kzalloc(sizeof(struct node), GFP_KERNEL);
449 if (!nd)
450 return NULL;
451
452 config_item_init_type_name(&nd->item, name, &node_type);
453 nd->nodeid = -1;
454 nd->weight = 1; /* default weight of 1 if none is set */
455
456 down(&sp->members_lock);
457 list_add(&nd->list, &sp->members);
458 sp->members_count++;
459 up(&sp->members_lock);
460
461 return &nd->item;
462}
463
464static void drop_node(struct config_group *g, struct config_item *i)
465{
466 struct space *sp = to_space(g->cg_item.ci_parent);
467 struct node *nd = to_node(i);
468
469 down(&sp->members_lock);
470 list_del(&nd->list);
471 sp->members_count--;
472 up(&sp->members_lock);
473
474 config_item_put(i);
475}
476
477static void release_node(struct config_item *i)
478{
479 struct node *nd = to_node(i);
480 kfree(nd);
481}
482
483static struct clusters clusters_root = {
484 .subsys = {
485 .su_group = {
486 .cg_item = {
487 .ci_namebuf = "dlm",
488 .ci_type = &clusters_type,
489 },
490 },
491 },
492};
493
494int dlm_config_init(void)
495{
496 config_group_init(&clusters_root.subsys.su_group);
497 init_MUTEX(&clusters_root.subsys.su_sem);
498 return configfs_register_subsystem(&clusters_root.subsys);
499}
500
501void dlm_config_exit(void)
502{
503 configfs_unregister_subsystem(&clusters_root.subsys);
504}
505
506/*
507 * Functions for user space to read/write attributes
508 */
509
510static ssize_t show_comm(struct config_item *i, struct configfs_attribute *a,
511 char *buf)
512{
513 struct comm *cm = to_comm(i);
514 struct comm_attribute *cma =
515 container_of(a, struct comm_attribute, attr);
516 return cma->show ? cma->show(cm, buf) : 0;
517}
518
519static ssize_t store_comm(struct config_item *i, struct configfs_attribute *a,
520 const char *buf, size_t len)
521{
522 struct comm *cm = to_comm(i);
523 struct comm_attribute *cma =
524 container_of(a, struct comm_attribute, attr);
525 return cma->store ? cma->store(cm, buf, len) : -EINVAL;
526}
527
528static ssize_t comm_nodeid_read(struct comm *cm, char *buf)
529{
530 return sprintf(buf, "%d\n", cm->nodeid);
531}
532
533static ssize_t comm_nodeid_write(struct comm *cm, const char *buf, size_t len)
534{
535 cm->nodeid = simple_strtol(buf, NULL, 0);
536 return len;
537}
538
539static ssize_t comm_local_read(struct comm *cm, char *buf)
540{
541 return sprintf(buf, "%d\n", cm->local);
542}
543
544static ssize_t comm_local_write(struct comm *cm, const char *buf, size_t len)
545{
546 cm->local= simple_strtol(buf, NULL, 0);
547 if (cm->local && !local_comm)
548 local_comm = cm;
549 return len;
550}
551
552static ssize_t comm_addr_write(struct comm *cm, const char *buf, size_t len)
553{
554 struct sockaddr_storage *addr;
555
556 if (len != sizeof(struct sockaddr_storage))
557 return -EINVAL;
558
559 if (cm->addr_count >= DLM_MAX_ADDR_COUNT)
560 return -ENOSPC;
561
562 addr = kzalloc(sizeof(*addr), GFP_KERNEL);
563 if (!addr)
564 return -ENOMEM;
565
566 memcpy(addr, buf, len);
567 cm->addr[cm->addr_count++] = addr;
568 return len;
569}
570
571static ssize_t show_node(struct config_item *i, struct configfs_attribute *a,
572 char *buf)
573{
574 struct node *nd = to_node(i);
575 struct node_attribute *nda =
576 container_of(a, struct node_attribute, attr);
577 return nda->show ? nda->show(nd, buf) : 0;
578}
579
580static ssize_t store_node(struct config_item *i, struct configfs_attribute *a,
581 const char *buf, size_t len)
582{
583 struct node *nd = to_node(i);
584 struct node_attribute *nda =
585 container_of(a, struct node_attribute, attr);
586 return nda->store ? nda->store(nd, buf, len) : -EINVAL;
587}
588
589static ssize_t node_nodeid_read(struct node *nd, char *buf)
590{
591 return sprintf(buf, "%d\n", nd->nodeid);
592}
593
594static ssize_t node_nodeid_write(struct node *nd, const char *buf, size_t len)
595{
596 nd->nodeid = simple_strtol(buf, NULL, 0);
597 return len;
598}
599
600static ssize_t node_weight_read(struct node *nd, char *buf)
601{
602 return sprintf(buf, "%d\n", nd->weight);
603}
604
605static ssize_t node_weight_write(struct node *nd, const char *buf, size_t len)
606{
607 nd->weight = simple_strtol(buf, NULL, 0);
608 return len;
609}
610
611/*
612 * Functions for the dlm to get the info that's been configured
613 */
614
615static struct space *get_space(char *name)
616{
617 if (!space_list)
618 return NULL;
619 return to_space(config_group_find_obj(space_list, name));
620}
621
622static void put_space(struct space *sp)
623{
624 config_item_put(&sp->group.cg_item);
625}
626
627static struct comm *get_comm(int nodeid, struct sockaddr_storage *addr)
628{
629 struct config_item *i;
630 struct comm *cm = NULL;
631 int found = 0;
632
633 if (!comm_list)
634 return NULL;
635
636 down(&clusters_root.subsys.su_sem);
637
638 list_for_each_entry(i, &comm_list->cg_children, ci_entry) {
639 cm = to_comm(i);
640
641 if (nodeid) {
642 if (cm->nodeid != nodeid)
643 continue;
644 found = 1;
645 break;
646 } else {
647 if (!cm->addr_count ||
648 memcmp(cm->addr[0], addr, sizeof(*addr)))
649 continue;
650 found = 1;
651 break;
652 }
653 }
654 up(&clusters_root.subsys.su_sem);
655
656 if (found)
657 config_item_get(i);
658 else
659 cm = NULL;
660 return cm;
661}
662
663static void put_comm(struct comm *cm)
664{
665 config_item_put(&cm->item);
666}
667
668/* caller must free mem */
669int dlm_nodeid_list(char *lsname, int **ids_out)
670{
671 struct space *sp;
672 struct node *nd;
673 int i = 0, rv = 0;
674 int *ids;
675
676 sp = get_space(lsname);
677 if (!sp)
678 return -EEXIST;
679
680 down(&sp->members_lock);
681 if (!sp->members_count) {
682 rv = 0;
683 goto out;
684 }
685
686 ids = kcalloc(sp->members_count, sizeof(int), GFP_KERNEL);
687 if (!ids) {
688 rv = -ENOMEM;
689 goto out;
690 }
691
692 rv = sp->members_count;
693 list_for_each_entry(nd, &sp->members, list)
694 ids[i++] = nd->nodeid;
695
696 if (rv != i)
697 printk("bad nodeid count %d %d\n", rv, i);
698
699 *ids_out = ids;
700 out:
701 up(&sp->members_lock);
702 put_space(sp);
703 return rv;
704}
705
706int dlm_node_weight(char *lsname, int nodeid)
707{
708 struct space *sp;
709 struct node *nd;
710 int w = -EEXIST;
711
712 sp = get_space(lsname);
713 if (!sp)
714 goto out;
715
716 down(&sp->members_lock);
717 list_for_each_entry(nd, &sp->members, list) {
718 if (nd->nodeid != nodeid)
719 continue;
720 w = nd->weight;
721 break;
722 }
723 up(&sp->members_lock);
724 put_space(sp);
725 out:
726 return w;
727}
728
729int dlm_nodeid_to_addr(int nodeid, struct sockaddr_storage *addr)
730{
731 struct comm *cm = get_comm(nodeid, NULL);
732 if (!cm)
733 return -EEXIST;
734 if (!cm->addr_count)
735 return -ENOENT;
736 memcpy(addr, cm->addr[0], sizeof(*addr));
737 put_comm(cm);
738 return 0;
739}
740
741int dlm_addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid)
742{
743 struct comm *cm = get_comm(0, addr);
744 if (!cm)
745 return -EEXIST;
746 *nodeid = cm->nodeid;
747 put_comm(cm);
748 return 0;
749}
750
751int dlm_our_nodeid(void)
752{
753 return local_comm ? local_comm->nodeid : 0;
754}
755
756/* num 0 is first addr, num 1 is second addr */
757int dlm_our_addr(struct sockaddr_storage *addr, int num)
758{
759 if (!local_comm)
760 return -1;
761 if (num + 1 > local_comm->addr_count)
762 return -1;
763 memcpy(addr, local_comm->addr[num], sizeof(*addr));
764 return 0;
765}
766
767/* Config file defaults */
768#define DEFAULT_TCP_PORT 21064
769#define DEFAULT_BUFFER_SIZE 4096
770#define DEFAULT_RSBTBL_SIZE 256
771#define DEFAULT_LKBTBL_SIZE 1024
772#define DEFAULT_DIRTBL_SIZE 512
773#define DEFAULT_RECOVER_TIMER 5
774#define DEFAULT_TOSS_SECS 10
775#define DEFAULT_SCAN_SECS 5
776
777struct dlm_config_info dlm_config = {
778 .tcp_port = DEFAULT_TCP_PORT,
779 .buffer_size = DEFAULT_BUFFER_SIZE,
780 .rsbtbl_size = DEFAULT_RSBTBL_SIZE,
781 .lkbtbl_size = DEFAULT_LKBTBL_SIZE,
782 .dirtbl_size = DEFAULT_DIRTBL_SIZE,
783 .recover_timer = DEFAULT_RECOVER_TIMER,
784 .toss_secs = DEFAULT_TOSS_SECS,
785 .scan_secs = DEFAULT_SCAN_SECS
786};
787
diff --git a/fs/dlm/config.h b/fs/dlm/config.h
new file mode 100644
index 000000000000..9da7839958a9
--- /dev/null
+++ b/fs/dlm/config.h
@@ -0,0 +1,42 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#ifndef __CONFIG_DOT_H__
15#define __CONFIG_DOT_H__
16
17#define DLM_MAX_ADDR_COUNT 3
18
19struct dlm_config_info {
20 int tcp_port;
21 int buffer_size;
22 int rsbtbl_size;
23 int lkbtbl_size;
24 int dirtbl_size;
25 int recover_timer;
26 int toss_secs;
27 int scan_secs;
28};
29
30extern struct dlm_config_info dlm_config;
31
32int dlm_config_init(void);
33void dlm_config_exit(void);
34int dlm_node_weight(char *lsname, int nodeid);
35int dlm_nodeid_list(char *lsname, int **ids_out);
36int dlm_nodeid_to_addr(int nodeid, struct sockaddr_storage *addr);
37int dlm_addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid);
38int dlm_our_nodeid(void);
39int dlm_our_addr(struct sockaddr_storage *addr, int num);
40
41#endif /* __CONFIG_DOT_H__ */
42
diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c
new file mode 100644
index 000000000000..98b49a1ece47
--- /dev/null
+++ b/fs/dlm/debug_fs.c
@@ -0,0 +1,310 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13#include <linux/pagemap.h>
14#include <linux/seq_file.h>
15#include <linux/module.h>
16#include <linux/ctype.h>
17#include <linux/debugfs.h>
18
19#include "dlm_internal.h"
20
21
22static struct dentry *dlm_root;
23
24struct rsb_iter {
25 int entry;
26 struct dlm_ls *ls;
27 struct list_head *next;
28 struct dlm_rsb *rsb;
29};
30
31static char *print_lockmode(int mode)
32{
33 switch (mode) {
34 case DLM_LOCK_IV:
35 return "--";
36 case DLM_LOCK_NL:
37 return "NL";
38 case DLM_LOCK_CR:
39 return "CR";
40 case DLM_LOCK_CW:
41 return "CW";
42 case DLM_LOCK_PR:
43 return "PR";
44 case DLM_LOCK_PW:
45 return "PW";
46 case DLM_LOCK_EX:
47 return "EX";
48 default:
49 return "??";
50 }
51}
52
53static void print_lock(struct seq_file *s, struct dlm_lkb *lkb,
54 struct dlm_rsb *res)
55{
56 seq_printf(s, "%08x %s", lkb->lkb_id, print_lockmode(lkb->lkb_grmode));
57
58 if (lkb->lkb_status == DLM_LKSTS_CONVERT
59 || lkb->lkb_status == DLM_LKSTS_WAITING)
60 seq_printf(s, " (%s)", print_lockmode(lkb->lkb_rqmode));
61
62 if (lkb->lkb_range) {
63 /* FIXME: this warns on Alpha */
64 if (lkb->lkb_status == DLM_LKSTS_CONVERT
65 || lkb->lkb_status == DLM_LKSTS_GRANTED)
66 seq_printf(s, " %" PRIx64 "-%" PRIx64,
67 lkb->lkb_range[GR_RANGE_START],
68 lkb->lkb_range[GR_RANGE_END]);
69 if (lkb->lkb_status == DLM_LKSTS_CONVERT
70 || lkb->lkb_status == DLM_LKSTS_WAITING)
71 seq_printf(s, " (%" PRIx64 "-%" PRIx64 ")",
72 lkb->lkb_range[RQ_RANGE_START],
73 lkb->lkb_range[RQ_RANGE_END]);
74 }
75
76 if (lkb->lkb_nodeid) {
77 if (lkb->lkb_nodeid != res->res_nodeid)
78 seq_printf(s, " Remote: %3d %08x", lkb->lkb_nodeid,
79 lkb->lkb_remid);
80 else
81 seq_printf(s, " Master: %08x", lkb->lkb_remid);
82 }
83
84 if (lkb->lkb_wait_type)
85 seq_printf(s, " wait_type: %d", lkb->lkb_wait_type);
86
87 seq_printf(s, "\n");
88}
89
90static int print_resource(struct dlm_rsb *res, struct seq_file *s)
91{
92 struct dlm_lkb *lkb;
93 int i, lvblen = res->res_ls->ls_lvblen;
94
95 seq_printf(s, "\nResource %p Name (len=%d) \"", res, res->res_length);
96 for (i = 0; i < res->res_length; i++) {
97 if (isprint(res->res_name[i]))
98 seq_printf(s, "%c", res->res_name[i]);
99 else
100 seq_printf(s, "%c", '.');
101 }
102 if (res->res_nodeid > 0)
103 seq_printf(s, "\" \nLocal Copy, Master is node %d\n",
104 res->res_nodeid);
105 else if (res->res_nodeid == 0)
106 seq_printf(s, "\" \nMaster Copy\n");
107 else if (res->res_nodeid == -1)
108 seq_printf(s, "\" \nLooking up master (lkid %x)\n",
109 res->res_first_lkid);
110 else
111 seq_printf(s, "\" \nInvalid master %d\n", res->res_nodeid);
112
113 /* Print the LVB: */
114 if (res->res_lvbptr) {
115 seq_printf(s, "LVB: ");
116 for (i = 0; i < lvblen; i++) {
117 if (i == lvblen / 2)
118 seq_printf(s, "\n ");
119 seq_printf(s, "%02x ",
120 (unsigned char) res->res_lvbptr[i]);
121 }
122 if (rsb_flag(res, RSB_VALNOTVALID))
123 seq_printf(s, " (INVALID)");
124 seq_printf(s, "\n");
125 }
126
127 /* Print the locks attached to this resource */
128 seq_printf(s, "Granted Queue\n");
129 list_for_each_entry(lkb, &res->res_grantqueue, lkb_statequeue)
130 print_lock(s, lkb, res);
131
132 seq_printf(s, "Conversion Queue\n");
133 list_for_each_entry(lkb, &res->res_convertqueue, lkb_statequeue)
134 print_lock(s, lkb, res);
135
136 seq_printf(s, "Waiting Queue\n");
137 list_for_each_entry(lkb, &res->res_waitqueue, lkb_statequeue)
138 print_lock(s, lkb, res);
139
140 return 0;
141}
142
143static int rsb_iter_next(struct rsb_iter *ri)
144{
145 struct dlm_ls *ls = ri->ls;
146 int i;
147
148 if (!ri->next) {
149 top:
150 /* Find the next non-empty hash bucket */
151 for (i = ri->entry; i < ls->ls_rsbtbl_size; i++) {
152 read_lock(&ls->ls_rsbtbl[i].lock);
153 if (!list_empty(&ls->ls_rsbtbl[i].list)) {
154 ri->next = ls->ls_rsbtbl[i].list.next;
155 read_unlock(&ls->ls_rsbtbl[i].lock);
156 break;
157 }
158 read_unlock(&ls->ls_rsbtbl[i].lock);
159 }
160 ri->entry = i;
161
162 if (ri->entry >= ls->ls_rsbtbl_size)
163 return 1;
164 } else {
165 i = ri->entry;
166 read_lock(&ls->ls_rsbtbl[i].lock);
167 ri->next = ri->next->next;
168 if (ri->next->next == ls->ls_rsbtbl[i].list.next) {
169 /* End of list - move to next bucket */
170 ri->next = NULL;
171 ri->entry++;
172 read_unlock(&ls->ls_rsbtbl[i].lock);
173 goto top;
174 }
175 read_unlock(&ls->ls_rsbtbl[i].lock);
176 }
177 ri->rsb = list_entry(ri->next, struct dlm_rsb, res_hashchain);
178
179 return 0;
180}
181
182static void rsb_iter_free(struct rsb_iter *ri)
183{
184 kfree(ri);
185}
186
187static struct rsb_iter *rsb_iter_init(struct dlm_ls *ls)
188{
189 struct rsb_iter *ri;
190
191 ri = kmalloc(sizeof *ri, GFP_KERNEL);
192 if (!ri)
193 return NULL;
194
195 ri->ls = ls;
196 ri->entry = 0;
197 ri->next = NULL;
198
199 if (rsb_iter_next(ri)) {
200 rsb_iter_free(ri);
201 return NULL;
202 }
203
204 return ri;
205}
206
207static void *seq_start(struct seq_file *file, loff_t *pos)
208{
209 struct rsb_iter *ri;
210 loff_t n = *pos;
211
212 ri = rsb_iter_init(file->private);
213 if (!ri)
214 return NULL;
215
216 while (n--) {
217 if (rsb_iter_next(ri)) {
218 rsb_iter_free(ri);
219 return NULL;
220 }
221 }
222
223 return ri;
224}
225
226static void *seq_next(struct seq_file *file, void *iter_ptr, loff_t *pos)
227{
228 struct rsb_iter *ri = iter_ptr;
229
230 (*pos)++;
231
232 if (rsb_iter_next(ri)) {
233 rsb_iter_free(ri);
234 return NULL;
235 }
236
237 return ri;
238}
239
240static void seq_stop(struct seq_file *file, void *iter_ptr)
241{
242 /* nothing for now */
243}
244
245static int seq_show(struct seq_file *file, void *iter_ptr)
246{
247 struct rsb_iter *ri = iter_ptr;
248
249 print_resource(ri->rsb, file);
250
251 return 0;
252}
253
254static struct seq_operations dlm_seq_ops = {
255 .start = seq_start,
256 .next = seq_next,
257 .stop = seq_stop,
258 .show = seq_show,
259};
260
261static int do_open(struct inode *inode, struct file *file)
262{
263 struct seq_file *seq;
264 int ret;
265
266 ret = seq_open(file, &dlm_seq_ops);
267 if (ret)
268 return ret;
269
270 seq = file->private_data;
271 seq->private = inode->u.generic_ip;
272
273 return 0;
274}
275
276static struct file_operations dlm_fops = {
277 .owner = THIS_MODULE,
278 .open = do_open,
279 .read = seq_read,
280 .llseek = seq_lseek,
281 .release = seq_release
282};
283
284int dlm_create_debug_file(struct dlm_ls *ls)
285{
286 ls->ls_debug_dentry = debugfs_create_file(ls->ls_name,
287 S_IFREG | S_IRUGO,
288 dlm_root,
289 ls,
290 &dlm_fops);
291 return ls->ls_debug_dentry ? 0 : -ENOMEM;
292}
293
294void dlm_delete_debug_file(struct dlm_ls *ls)
295{
296 if (ls->ls_debug_dentry)
297 debugfs_remove(ls->ls_debug_dentry);
298}
299
300int dlm_register_debugfs(void)
301{
302 dlm_root = debugfs_create_dir("dlm", NULL);
303 return dlm_root ? 0 : -ENOMEM;
304}
305
306void dlm_unregister_debugfs(void)
307{
308 debugfs_remove(dlm_root);
309}
310
diff --git a/fs/dlm/device.c b/fs/dlm/device.c
new file mode 100644
index 000000000000..a8bf600ed13d
--- /dev/null
+++ b/fs/dlm/device.c
@@ -0,0 +1,1084 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14/*
15 * device.c
16 *
17 * This is the userland interface to the DLM.
18 *
19 * The locking is done via a misc char device (find the
20 * registered minor number in /proc/misc).
21 *
22 * User code should not use this interface directly but
23 * call the library routines in libdlm.a instead.
24 *
25 */
26
27#include <linux/miscdevice.h>
28#include <linux/init.h>
29#include <linux/wait.h>
30#include <linux/module.h>
31#include <linux/file.h>
32#include <linux/fs.h>
33#include <linux/poll.h>
34#include <linux/signal.h>
35#include <linux/spinlock.h>
36#include <linux/idr.h>
37
38#include <linux/dlm.h>
39#include <linux/dlm_device.h>
40
41#include "lvb_table.h"
42
43static struct file_operations _dlm_fops;
44static const char *name_prefix="dlm";
45static struct list_head user_ls_list;
46static struct semaphore user_ls_lock;
47
48/* Lock infos are stored in here indexed by lock ID */
49static DEFINE_IDR(lockinfo_idr);
50static rwlock_t lockinfo_lock;
51
52/* Flags in li_flags */
53#define LI_FLAG_COMPLETE 1
54#define LI_FLAG_FIRSTLOCK 2
55#define LI_FLAG_PERSISTENT 3
56
57/* flags in ls_flags*/
58#define LS_FLAG_DELETED 1
59#define LS_FLAG_AUTOFREE 2
60
61
62#define LOCKINFO_MAGIC 0x53595324
63
64struct lock_info {
65 uint32_t li_magic;
66 uint8_t li_cmd;
67 int8_t li_grmode;
68 int8_t li_rqmode;
69 struct dlm_lksb li_lksb;
70 wait_queue_head_t li_waitq;
71 unsigned long li_flags;
72 void __user *li_castparam;
73 void __user *li_castaddr;
74 void __user *li_bastparam;
75 void __user *li_bastaddr;
76 void __user *li_pend_bastparam;
77 void __user *li_pend_bastaddr;
78 struct list_head li_ownerqueue;
79 struct file_info *li_file;
80 struct dlm_lksb __user *li_user_lksb;
81 struct semaphore li_firstlock;
82};
83
84/* A queued AST no less */
85struct ast_info {
86 struct dlm_lock_result result;
87 struct list_head list;
88 uint32_t lvb_updated;
89 uint32_t progress; /* How much has been read */
90};
91
92/* One of these per userland lockspace */
93struct user_ls {
94 void *ls_lockspace;
95 atomic_t ls_refcnt;
96 long ls_flags;
97
98 /* Passed into misc_register() */
99 struct miscdevice ls_miscinfo;
100 struct list_head ls_list;
101};
102
103/* misc_device info for the control device */
104static struct miscdevice ctl_device;
105
106/*
107 * Stuff we hang off the file struct.
108 * The first two are to cope with unlocking all the
109 * locks help by a process when it dies.
110 */
111struct file_info {
112 struct list_head fi_li_list; /* List of active lock_infos */
113 spinlock_t fi_li_lock;
114 struct list_head fi_ast_list; /* Queue of ASTs to be delivered */
115 spinlock_t fi_ast_lock;
116 wait_queue_head_t fi_wait;
117 struct user_ls *fi_ls;
118 atomic_t fi_refcnt; /* Number of users */
119 unsigned long fi_flags; /* Bit 1 means the device is open */
120};
121
122
123/* get and put ops for file_info.
124 Actually I don't really like "get" and "put", but everyone
125 else seems to use them and I can't think of anything
126 nicer at the moment */
127static void get_file_info(struct file_info *f)
128{
129 atomic_inc(&f->fi_refcnt);
130}
131
132static void put_file_info(struct file_info *f)
133{
134 if (atomic_dec_and_test(&f->fi_refcnt))
135 kfree(f);
136}
137
138static void release_lockinfo(struct lock_info *li)
139{
140 put_file_info(li->li_file);
141
142 write_lock(&lockinfo_lock);
143 idr_remove(&lockinfo_idr, li->li_lksb.sb_lkid);
144 write_unlock(&lockinfo_lock);
145
146 if (li->li_lksb.sb_lvbptr)
147 kfree(li->li_lksb.sb_lvbptr);
148 kfree(li);
149
150 module_put(THIS_MODULE);
151}
152
153static struct lock_info *get_lockinfo(uint32_t lockid)
154{
155 struct lock_info *li;
156
157 read_lock(&lockinfo_lock);
158 li = idr_find(&lockinfo_idr, lockid);
159 read_unlock(&lockinfo_lock);
160
161 return li;
162}
163
164static int add_lockinfo(struct lock_info *li)
165{
166 int n;
167 int r;
168 int ret = -EINVAL;
169
170 write_lock(&lockinfo_lock);
171
172 if (idr_find(&lockinfo_idr, li->li_lksb.sb_lkid))
173 goto out_up;
174
175 ret = -ENOMEM;
176 r = idr_pre_get(&lockinfo_idr, GFP_KERNEL);
177 if (!r)
178 goto out_up;
179
180 r = idr_get_new_above(&lockinfo_idr, li, li->li_lksb.sb_lkid, &n);
181 if (r)
182 goto out_up;
183
184 if (n != li->li_lksb.sb_lkid) {
185 idr_remove(&lockinfo_idr, n);
186 goto out_up;
187 }
188
189 ret = 0;
190
191 out_up:
192 write_unlock(&lockinfo_lock);
193
194 return ret;
195}
196
197
198static struct user_ls *__find_lockspace(int minor)
199{
200 struct user_ls *lsinfo;
201
202 list_for_each_entry(lsinfo, &user_ls_list, ls_list) {
203 if (lsinfo->ls_miscinfo.minor == minor)
204 return lsinfo;
205 }
206 return NULL;
207}
208
209/* Find a lockspace struct given the device minor number */
210static struct user_ls *find_lockspace(int minor)
211{
212 struct user_ls *lsinfo;
213
214 down(&user_ls_lock);
215 lsinfo = __find_lockspace(minor);
216 up(&user_ls_lock);
217
218 return lsinfo;
219}
220
221static void add_lockspace_to_list(struct user_ls *lsinfo)
222{
223 down(&user_ls_lock);
224 list_add(&lsinfo->ls_list, &user_ls_list);
225 up(&user_ls_lock);
226}
227
228/* Register a lockspace with the DLM and create a misc
229 device for userland to access it */
230static int register_lockspace(char *name, struct user_ls **ls, int flags)
231{
232 struct user_ls *newls;
233 int status;
234 int namelen;
235
236 namelen = strlen(name)+strlen(name_prefix)+2;
237
238 newls = kmalloc(sizeof(struct user_ls), GFP_KERNEL);
239 if (!newls)
240 return -ENOMEM;
241 memset(newls, 0, sizeof(struct user_ls));
242
243 newls->ls_miscinfo.name = kmalloc(namelen, GFP_KERNEL);
244 if (!newls->ls_miscinfo.name) {
245 kfree(newls);
246 return -ENOMEM;
247 }
248
249 status = dlm_new_lockspace(name, strlen(name), &newls->ls_lockspace, 0,
250 DLM_USER_LVB_LEN);
251 if (status != 0) {
252 kfree(newls->ls_miscinfo.name);
253 kfree(newls);
254 return status;
255 }
256
257 snprintf((char*)newls->ls_miscinfo.name, namelen, "%s_%s",
258 name_prefix, name);
259
260 newls->ls_miscinfo.fops = &_dlm_fops;
261 newls->ls_miscinfo.minor = MISC_DYNAMIC_MINOR;
262
263 status = misc_register(&newls->ls_miscinfo);
264 if (status) {
265 printk(KERN_ERR "dlm: misc register failed for %s\n", name);
266 dlm_release_lockspace(newls->ls_lockspace, 0);
267 kfree(newls->ls_miscinfo.name);
268 kfree(newls);
269 return status;
270 }
271
272 if (flags & DLM_USER_LSFLG_AUTOFREE)
273 set_bit(LS_FLAG_AUTOFREE, &newls->ls_flags);
274
275 add_lockspace_to_list(newls);
276 *ls = newls;
277 return 0;
278}
279
280/* Called with the user_ls_lock semaphore held */
281static int unregister_lockspace(struct user_ls *lsinfo, int force)
282{
283 int status;
284
285 status = dlm_release_lockspace(lsinfo->ls_lockspace, force);
286 if (status)
287 return status;
288
289 status = misc_deregister(&lsinfo->ls_miscinfo);
290 if (status)
291 return status;
292
293 list_del(&lsinfo->ls_list);
294 set_bit(LS_FLAG_DELETED, &lsinfo->ls_flags);
295 lsinfo->ls_lockspace = NULL;
296 if (atomic_read(&lsinfo->ls_refcnt) == 0) {
297 kfree(lsinfo->ls_miscinfo.name);
298 kfree(lsinfo);
299 }
300
301 return 0;
302}
303
304/* Add it to userland's AST queue */
305static void add_to_astqueue(struct lock_info *li, void *astaddr, void *astparam,
306 int lvb_updated)
307{
308 struct ast_info *ast = kmalloc(sizeof(struct ast_info), GFP_KERNEL);
309 if (!ast)
310 return;
311
312 memset(ast, 0, sizeof(*ast));
313 ast->result.user_astparam = astparam;
314 ast->result.user_astaddr = astaddr;
315 ast->result.user_lksb = li->li_user_lksb;
316 memcpy(&ast->result.lksb, &li->li_lksb, sizeof(struct dlm_lksb));
317 ast->lvb_updated = lvb_updated;
318
319 spin_lock(&li->li_file->fi_ast_lock);
320 list_add_tail(&ast->list, &li->li_file->fi_ast_list);
321 spin_unlock(&li->li_file->fi_ast_lock);
322 wake_up_interruptible(&li->li_file->fi_wait);
323}
324
325static void bast_routine(void *param, int mode)
326{
327 struct lock_info *li = param;
328
329 if (li && li->li_bastaddr)
330 add_to_astqueue(li, li->li_bastaddr, li->li_bastparam, 0);
331}
332
333/*
334 * This is the kernel's AST routine.
335 * All lock, unlock & query operations complete here.
336 * The only syncronous ops are those done during device close.
337 */
338static void ast_routine(void *param)
339{
340 struct lock_info *li = param;
341
342 /* Param may be NULL if a persistent lock is unlocked by someone else */
343 if (!li)
344 return;
345
346 /* If this is a succesful conversion then activate the blocking ast
347 * args from the conversion request */
348 if (!test_bit(LI_FLAG_FIRSTLOCK, &li->li_flags) &&
349 li->li_lksb.sb_status == 0) {
350
351 li->li_bastparam = li->li_pend_bastparam;
352 li->li_bastaddr = li->li_pend_bastaddr;
353 li->li_pend_bastaddr = NULL;
354 }
355
356 /* If it's an async request then post data to the user's AST queue. */
357 if (li->li_castaddr) {
358 int lvb_updated = 0;
359
360 /* See if the lvb has been updated */
361 if (dlm_lvb_operations[li->li_grmode+1][li->li_rqmode+1] == 1)
362 lvb_updated = 1;
363
364 if (li->li_lksb.sb_status == 0)
365 li->li_grmode = li->li_rqmode;
366
367 /* Only queue AST if the device is still open */
368 if (test_bit(1, &li->li_file->fi_flags))
369 add_to_astqueue(li, li->li_castaddr, li->li_castparam,
370 lvb_updated);
371
372 /* If it's a new lock operation that failed, then
373 * remove it from the owner queue and free the
374 * lock_info.
375 */
376 if (test_and_clear_bit(LI_FLAG_FIRSTLOCK, &li->li_flags) &&
377 li->li_lksb.sb_status != 0) {
378
379 /* Wait till dlm_lock() has finished */
380 down(&li->li_firstlock);
381 up(&li->li_firstlock);
382
383 spin_lock(&li->li_file->fi_li_lock);
384 list_del(&li->li_ownerqueue);
385 spin_unlock(&li->li_file->fi_li_lock);
386 release_lockinfo(li);
387 return;
388 }
389 /* Free unlocks & queries */
390 if (li->li_lksb.sb_status == -DLM_EUNLOCK ||
391 li->li_cmd == DLM_USER_QUERY) {
392 release_lockinfo(li);
393 }
394 } else {
395 /* Synchronous request, just wake up the caller */
396 set_bit(LI_FLAG_COMPLETE, &li->li_flags);
397 wake_up_interruptible(&li->li_waitq);
398 }
399}
400
401/*
402 * Wait for the lock op to complete and return the status.
403 */
404static int wait_for_ast(struct lock_info *li)
405{
406 /* Wait for the AST routine to complete */
407 set_task_state(current, TASK_INTERRUPTIBLE);
408 while (!test_bit(LI_FLAG_COMPLETE, &li->li_flags))
409 schedule();
410
411 set_task_state(current, TASK_RUNNING);
412
413 return li->li_lksb.sb_status;
414}
415
416
417/* Open on control device */
418static int dlm_ctl_open(struct inode *inode, struct file *file)
419{
420 file->private_data = NULL;
421 return 0;
422}
423
424/* Close on control device */
425static int dlm_ctl_close(struct inode *inode, struct file *file)
426{
427 return 0;
428}
429
430/* Open on lockspace device */
431static int dlm_open(struct inode *inode, struct file *file)
432{
433 struct file_info *f;
434 struct user_ls *lsinfo;
435
436 lsinfo = find_lockspace(iminor(inode));
437 if (!lsinfo)
438 return -ENOENT;
439
440 f = kmalloc(sizeof(struct file_info), GFP_KERNEL);
441 if (!f)
442 return -ENOMEM;
443
444 atomic_inc(&lsinfo->ls_refcnt);
445 INIT_LIST_HEAD(&f->fi_li_list);
446 INIT_LIST_HEAD(&f->fi_ast_list);
447 spin_lock_init(&f->fi_li_lock);
448 spin_lock_init(&f->fi_ast_lock);
449 init_waitqueue_head(&f->fi_wait);
450 f->fi_ls = lsinfo;
451 f->fi_flags = 0;
452 get_file_info(f);
453 set_bit(1, &f->fi_flags);
454
455 file->private_data = f;
456
457 return 0;
458}
459
460/* Check the user's version matches ours */
461static int check_version(struct dlm_write_request *req)
462{
463 if (req->version[0] != DLM_DEVICE_VERSION_MAJOR ||
464 (req->version[0] == DLM_DEVICE_VERSION_MAJOR &&
465 req->version[1] > DLM_DEVICE_VERSION_MINOR)) {
466
467 printk(KERN_DEBUG "dlm: process %s (%d) version mismatch "
468 "user (%d.%d.%d) kernel (%d.%d.%d)\n",
469 current->comm,
470 current->pid,
471 req->version[0],
472 req->version[1],
473 req->version[2],
474 DLM_DEVICE_VERSION_MAJOR,
475 DLM_DEVICE_VERSION_MINOR,
476 DLM_DEVICE_VERSION_PATCH);
477 return -EINVAL;
478 }
479 return 0;
480}
481
482/* Close on lockspace device */
483static int dlm_close(struct inode *inode, struct file *file)
484{
485 struct file_info *f = file->private_data;
486 struct lock_info li;
487 struct lock_info *old_li, *safe;
488 sigset_t tmpsig;
489 sigset_t allsigs;
490 struct user_ls *lsinfo;
491 DECLARE_WAITQUEUE(wq, current);
492
493 lsinfo = find_lockspace(iminor(inode));
494 if (!lsinfo)
495 return -ENOENT;
496
497 /* Mark this closed so that ASTs will not be delivered any more */
498 clear_bit(1, &f->fi_flags);
499
500 /* Block signals while we are doing this */
501 sigfillset(&allsigs);
502 sigprocmask(SIG_BLOCK, &allsigs, &tmpsig);
503
504 /* We use our own lock_info struct here, so that any
505 * outstanding "real" ASTs will be delivered with the
506 * corresponding "real" params, thus freeing the lock_info
507 * that belongs the lock. This catches the corner case where
508 * a lock is BUSY when we try to unlock it here
509 */
510 memset(&li, 0, sizeof(li));
511 clear_bit(LI_FLAG_COMPLETE, &li.li_flags);
512 init_waitqueue_head(&li.li_waitq);
513 add_wait_queue(&li.li_waitq, &wq);
514
515 /*
516 * Free any outstanding locks, they are on the
517 * list in LIFO order so there should be no problems
518 * about unlocking parents before children.
519 */
520 list_for_each_entry_safe(old_li, safe, &f->fi_li_list, li_ownerqueue) {
521 int status;
522 int flags = 0;
523
524 /* Don't unlock persistent locks, just mark them orphaned */
525 if (test_bit(LI_FLAG_PERSISTENT, &old_li->li_flags)) {
526 list_del(&old_li->li_ownerqueue);
527
528 /* Update master copy */
529 /* TODO: Check locking core updates the local and
530 remote ORPHAN flags */
531 li.li_lksb.sb_lkid = old_li->li_lksb.sb_lkid;
532 status = dlm_lock(f->fi_ls->ls_lockspace,
533 old_li->li_grmode, &li.li_lksb,
534 DLM_LKF_CONVERT|DLM_LKF_ORPHAN,
535 NULL, 0, 0, ast_routine, NULL,
536 NULL, NULL);
537 if (status != 0)
538 printk("dlm: Error orphaning lock %x: %d\n",
539 old_li->li_lksb.sb_lkid, status);
540
541 /* But tidy our references in it */
542 release_lockinfo(old_li);
543 continue;
544 }
545
546 clear_bit(LI_FLAG_COMPLETE, &li.li_flags);
547
548 flags = DLM_LKF_FORCEUNLOCK;
549 if (old_li->li_grmode >= DLM_LOCK_PW)
550 flags |= DLM_LKF_IVVALBLK;
551
552 status = dlm_unlock(f->fi_ls->ls_lockspace,
553 old_li->li_lksb.sb_lkid, flags,
554 &li.li_lksb, &li);
555
556 /* Must wait for it to complete as the next lock could be its
557 * parent */
558 if (status == 0)
559 wait_for_ast(&li);
560
561 /* Unlock suceeded, free the lock_info struct. */
562 if (status == 0)
563 release_lockinfo(old_li);
564 }
565
566 remove_wait_queue(&li.li_waitq, &wq);
567
568 /*
569 * If this is the last reference to the lockspace
570 * then free the struct. If it's an AUTOFREE lockspace
571 * then free the whole thing.
572 */
573 down(&user_ls_lock);
574 if (atomic_dec_and_test(&lsinfo->ls_refcnt)) {
575
576 if (lsinfo->ls_lockspace) {
577 if (test_bit(LS_FLAG_AUTOFREE, &lsinfo->ls_flags)) {
578 unregister_lockspace(lsinfo, 1);
579 }
580 } else {
581 kfree(lsinfo->ls_miscinfo.name);
582 kfree(lsinfo);
583 }
584 }
585 up(&user_ls_lock);
586 put_file_info(f);
587
588 /* Restore signals */
589 sigprocmask(SIG_SETMASK, &tmpsig, NULL);
590 recalc_sigpending();
591
592 return 0;
593}
594
595static int do_user_create_lockspace(struct file_info *fi, uint8_t cmd,
596 struct dlm_lspace_params *kparams)
597{
598 int status;
599 struct user_ls *lsinfo;
600
601 if (!capable(CAP_SYS_ADMIN))
602 return -EPERM;
603
604 status = register_lockspace(kparams->name, &lsinfo, kparams->flags);
605
606 /* If it succeeded then return the minor number */
607 if (status == 0)
608 status = lsinfo->ls_miscinfo.minor;
609
610 return status;
611}
612
613static int do_user_remove_lockspace(struct file_info *fi, uint8_t cmd,
614 struct dlm_lspace_params *kparams)
615{
616 int status;
617 int force = 1;
618 struct user_ls *lsinfo;
619
620 if (!capable(CAP_SYS_ADMIN))
621 return -EPERM;
622
623 down(&user_ls_lock);
624 lsinfo = __find_lockspace(kparams->minor);
625 if (!lsinfo) {
626 up(&user_ls_lock);
627 return -EINVAL;
628 }
629
630 if (kparams->flags & DLM_USER_LSFLG_FORCEFREE)
631 force = 2;
632
633 status = unregister_lockspace(lsinfo, force);
634 up(&user_ls_lock);
635
636 return status;
637}
638
639/* Read call, might block if no ASTs are waiting.
640 * It will only ever return one message at a time, regardless
641 * of how many are pending.
642 */
643static ssize_t dlm_read(struct file *file, char __user *buffer, size_t count,
644 loff_t *ppos)
645{
646 struct file_info *fi = file->private_data;
647 struct ast_info *ast;
648 int data_size;
649 int offset;
650 DECLARE_WAITQUEUE(wait, current);
651
652 if (count < sizeof(struct dlm_lock_result))
653 return -EINVAL;
654
655 spin_lock(&fi->fi_ast_lock);
656 if (list_empty(&fi->fi_ast_list)) {
657
658 /* No waiting ASTs.
659 * Return EOF if the lockspace been deleted.
660 */
661 if (test_bit(LS_FLAG_DELETED, &fi->fi_ls->ls_flags))
662 return 0;
663
664 if (file->f_flags & O_NONBLOCK) {
665 spin_unlock(&fi->fi_ast_lock);
666 return -EAGAIN;
667 }
668
669 add_wait_queue(&fi->fi_wait, &wait);
670
671 repeat:
672 set_current_state(TASK_INTERRUPTIBLE);
673 if (list_empty(&fi->fi_ast_list) &&
674 !signal_pending(current)) {
675
676 spin_unlock(&fi->fi_ast_lock);
677 schedule();
678 spin_lock(&fi->fi_ast_lock);
679 goto repeat;
680 }
681
682 current->state = TASK_RUNNING;
683 remove_wait_queue(&fi->fi_wait, &wait);
684
685 if (signal_pending(current)) {
686 spin_unlock(&fi->fi_ast_lock);
687 return -ERESTARTSYS;
688 }
689 }
690
691 ast = list_entry(fi->fi_ast_list.next, struct ast_info, list);
692 list_del(&ast->list);
693 spin_unlock(&fi->fi_ast_lock);
694
695 /* Work out the size of the returned data */
696 data_size = sizeof(struct dlm_lock_result);
697 if (ast->lvb_updated && ast->result.lksb.sb_lvbptr)
698 data_size += DLM_USER_LVB_LEN;
699
700 offset = sizeof(struct dlm_lock_result);
701
702 /* Room for the extended data ? */
703 if (count >= data_size) {
704
705 if (ast->lvb_updated && ast->result.lksb.sb_lvbptr) {
706 if (copy_to_user(buffer+offset,
707 ast->result.lksb.sb_lvbptr,
708 DLM_USER_LVB_LEN))
709 return -EFAULT;
710 ast->result.lvb_offset = offset;
711 offset += DLM_USER_LVB_LEN;
712 }
713 }
714
715 ast->result.length = data_size;
716 /* Copy the header now it has all the offsets in it */
717 if (copy_to_user(buffer, &ast->result, sizeof(struct dlm_lock_result)))
718 offset = -EFAULT;
719
720 /* If we only returned a header and there's more to come then put it
721 back on the list */
722 if (count < data_size) {
723 spin_lock(&fi->fi_ast_lock);
724 list_add(&ast->list, &fi->fi_ast_list);
725 spin_unlock(&fi->fi_ast_lock);
726 } else
727 kfree(ast);
728 return offset;
729}
730
731static unsigned int dlm_poll(struct file *file, poll_table *wait)
732{
733 struct file_info *fi = file->private_data;
734
735 poll_wait(file, &fi->fi_wait, wait);
736
737 spin_lock(&fi->fi_ast_lock);
738 if (!list_empty(&fi->fi_ast_list)) {
739 spin_unlock(&fi->fi_ast_lock);
740 return POLLIN | POLLRDNORM;
741 }
742
743 spin_unlock(&fi->fi_ast_lock);
744 return 0;
745}
746
747static struct lock_info *allocate_lockinfo(struct file_info *fi, uint8_t cmd,
748 struct dlm_lock_params *kparams)
749{
750 struct lock_info *li;
751
752 if (!try_module_get(THIS_MODULE))
753 return NULL;
754
755 li = kmalloc(sizeof(struct lock_info), GFP_KERNEL);
756 if (li) {
757 li->li_magic = LOCKINFO_MAGIC;
758 li->li_file = fi;
759 li->li_cmd = cmd;
760 li->li_flags = 0;
761 li->li_grmode = -1;
762 li->li_rqmode = -1;
763 li->li_pend_bastparam = NULL;
764 li->li_pend_bastaddr = NULL;
765 li->li_castaddr = NULL;
766 li->li_castparam = NULL;
767 li->li_lksb.sb_lvbptr = NULL;
768 li->li_bastaddr = kparams->bastaddr;
769 li->li_bastparam = kparams->bastparam;
770
771 get_file_info(fi);
772 }
773 return li;
774}
775
776static int do_user_lock(struct file_info *fi, uint8_t cmd,
777 struct dlm_lock_params *kparams)
778{
779 struct lock_info *li;
780 int status;
781
782 /*
783 * Validate things that we need to have correct.
784 */
785 if (!kparams->castaddr)
786 return -EINVAL;
787
788 if (!kparams->lksb)
789 return -EINVAL;
790
791 /* Persistent child locks are not available yet */
792 if ((kparams->flags & DLM_LKF_PERSISTENT) && kparams->parent)
793 return -EINVAL;
794
795 /* For conversions, there should already be a lockinfo struct,
796 unless we are adopting an orphaned persistent lock */
797 if (kparams->flags & DLM_LKF_CONVERT) {
798
799 li = get_lockinfo(kparams->lkid);
800
801 /* If this is a persistent lock we will have to create a
802 lockinfo again */
803 if (!li && DLM_LKF_PERSISTENT) {
804 li = allocate_lockinfo(fi, cmd, kparams);
805
806 li->li_lksb.sb_lkid = kparams->lkid;
807 li->li_castaddr = kparams->castaddr;
808 li->li_castparam = kparams->castparam;
809
810 /* OK, this isn;t exactly a FIRSTLOCK but it is the
811 first time we've used this lockinfo, and if things
812 fail we want rid of it */
813 init_MUTEX_LOCKED(&li->li_firstlock);
814 set_bit(LI_FLAG_FIRSTLOCK, &li->li_flags);
815 add_lockinfo(li);
816
817 /* TODO: do a query to get the current state ?? */
818 }
819 if (!li)
820 return -EINVAL;
821
822 if (li->li_magic != LOCKINFO_MAGIC)
823 return -EINVAL;
824
825 /* For conversions don't overwrite the current blocking AST
826 info so that:
827 a) if a blocking AST fires before the conversion is queued
828 it runs the current handler
829 b) if the conversion is cancelled, the original blocking AST
830 declaration is active
831 The pend_ info is made active when the conversion
832 completes.
833 */
834 li->li_pend_bastaddr = kparams->bastaddr;
835 li->li_pend_bastparam = kparams->bastparam;
836 } else {
837 li = allocate_lockinfo(fi, cmd, kparams);
838 if (!li)
839 return -ENOMEM;
840
841 /* semaphore to allow us to complete our work before
842 the AST routine runs. In fact we only need (and use) this
843 when the initial lock fails */
844 init_MUTEX_LOCKED(&li->li_firstlock);
845 set_bit(LI_FLAG_FIRSTLOCK, &li->li_flags);
846 }
847
848 li->li_user_lksb = kparams->lksb;
849 li->li_castaddr = kparams->castaddr;
850 li->li_castparam = kparams->castparam;
851 li->li_lksb.sb_lkid = kparams->lkid;
852 li->li_rqmode = kparams->mode;
853 if (kparams->flags & DLM_LKF_PERSISTENT)
854 set_bit(LI_FLAG_PERSISTENT, &li->li_flags);
855
856 /* Copy in the value block */
857 if (kparams->flags & DLM_LKF_VALBLK) {
858 if (!li->li_lksb.sb_lvbptr) {
859 li->li_lksb.sb_lvbptr = kmalloc(DLM_USER_LVB_LEN,
860 GFP_KERNEL);
861 if (!li->li_lksb.sb_lvbptr) {
862 status = -ENOMEM;
863 goto out_err;
864 }
865 }
866
867 memcpy(li->li_lksb.sb_lvbptr, kparams->lvb, DLM_USER_LVB_LEN);
868 }
869
870 /* Lock it ... */
871 status = dlm_lock(fi->fi_ls->ls_lockspace,
872 kparams->mode, &li->li_lksb,
873 kparams->flags,
874 kparams->name, kparams->namelen,
875 kparams->parent,
876 ast_routine,
877 li,
878 (li->li_pend_bastaddr || li->li_bastaddr) ?
879 bast_routine : NULL,
880 kparams->range.ra_end ? &kparams->range : NULL);
881 if (status)
882 goto out_err;
883
884 /* If it succeeded (this far) with a new lock then keep track of
885 it on the file's lockinfo list */
886 if (!status && test_bit(LI_FLAG_FIRSTLOCK, &li->li_flags)) {
887
888 spin_lock(&fi->fi_li_lock);
889 list_add(&li->li_ownerqueue, &fi->fi_li_list);
890 spin_unlock(&fi->fi_li_lock);
891 if (add_lockinfo(li))
892 printk(KERN_WARNING "Add lockinfo failed\n");
893
894 up(&li->li_firstlock);
895 }
896
897 /* Return the lockid as the user needs it /now/ */
898 return li->li_lksb.sb_lkid;
899
900 out_err:
901 if (test_bit(LI_FLAG_FIRSTLOCK, &li->li_flags))
902 release_lockinfo(li);
903 return status;
904
905}
906
907static int do_user_unlock(struct file_info *fi, uint8_t cmd,
908 struct dlm_lock_params *kparams)
909{
910 struct lock_info *li;
911 int status;
912 int convert_cancel = 0;
913
914 li = get_lockinfo(kparams->lkid);
915 if (!li) {
916 li = allocate_lockinfo(fi, cmd, kparams);
917 spin_lock(&fi->fi_li_lock);
918 list_add(&li->li_ownerqueue, &fi->fi_li_list);
919 spin_unlock(&fi->fi_li_lock);
920 }
921 if (!li)
922 return -ENOMEM;
923
924 if (li->li_magic != LOCKINFO_MAGIC)
925 return -EINVAL;
926
927 li->li_user_lksb = kparams->lksb;
928 li->li_castparam = kparams->castparam;
929 li->li_cmd = cmd;
930
931 /* Cancelling a conversion doesn't remove the lock...*/
932 if (kparams->flags & DLM_LKF_CANCEL && li->li_grmode != -1)
933 convert_cancel = 1;
934
935 /* dlm_unlock() passes a 0 for castaddr which means don't overwrite
936 the existing li_castaddr as that's the completion routine for
937 unlocks. dlm_unlock_wait() specifies a new AST routine to be
938 executed when the unlock completes. */
939 if (kparams->castaddr)
940 li->li_castaddr = kparams->castaddr;
941
942 /* Use existing lksb & astparams */
943 status = dlm_unlock(fi->fi_ls->ls_lockspace,
944 kparams->lkid,
945 kparams->flags, &li->li_lksb, li);
946
947 if (!status && !convert_cancel) {
948 spin_lock(&fi->fi_li_lock);
949 list_del(&li->li_ownerqueue);
950 spin_unlock(&fi->fi_li_lock);
951 }
952
953 return status;
954}
955
956/* Write call, submit a locking request */
957static ssize_t dlm_write(struct file *file, const char __user *buffer,
958 size_t count, loff_t *ppos)
959{
960 struct file_info *fi = file->private_data;
961 struct dlm_write_request *kparams;
962 sigset_t tmpsig;
963 sigset_t allsigs;
964 int status;
965
966 /* -1 because lock name is optional */
967 if (count < sizeof(struct dlm_write_request)-1)
968 return -EINVAL;
969
970 /* Has the lockspace been deleted */
971 if (fi && test_bit(LS_FLAG_DELETED, &fi->fi_ls->ls_flags))
972 return -ENOENT;
973
974 kparams = kmalloc(count, GFP_KERNEL);
975 if (!kparams)
976 return -ENOMEM;
977
978 status = -EFAULT;
979 /* Get the command info */
980 if (copy_from_user(kparams, buffer, count))
981 goto out_free;
982
983 status = -EBADE;
984 if (check_version(kparams))
985 goto out_free;
986
987 /* Block signals while we are doing this */
988 sigfillset(&allsigs);
989 sigprocmask(SIG_BLOCK, &allsigs, &tmpsig);
990
991 status = -EINVAL;
992 switch (kparams->cmd)
993 {
994 case DLM_USER_LOCK:
995 if (!fi) goto out_sig;
996 status = do_user_lock(fi, kparams->cmd, &kparams->i.lock);
997 break;
998
999 case DLM_USER_UNLOCK:
1000 if (!fi) goto out_sig;
1001 status = do_user_unlock(fi, kparams->cmd, &kparams->i.lock);
1002 break;
1003
1004 case DLM_USER_CREATE_LOCKSPACE:
1005 if (fi) goto out_sig;
1006 status = do_user_create_lockspace(fi, kparams->cmd,
1007 &kparams->i.lspace);
1008 break;
1009
1010 case DLM_USER_REMOVE_LOCKSPACE:
1011 if (fi) goto out_sig;
1012 status = do_user_remove_lockspace(fi, kparams->cmd,
1013 &kparams->i.lspace);
1014 break;
1015 default:
1016 printk("Unknown command passed to DLM device : %d\n",
1017 kparams->cmd);
1018 break;
1019 }
1020
1021 out_sig:
1022 /* Restore signals */
1023 sigprocmask(SIG_SETMASK, &tmpsig, NULL);
1024 recalc_sigpending();
1025
1026 out_free:
1027 kfree(kparams);
1028 if (status == 0)
1029 return count;
1030 else
1031 return status;
1032}
1033
1034static struct file_operations _dlm_fops = {
1035 .open = dlm_open,
1036 .release = dlm_close,
1037 .read = dlm_read,
1038 .write = dlm_write,
1039 .poll = dlm_poll,
1040 .owner = THIS_MODULE,
1041};
1042
1043static struct file_operations _dlm_ctl_fops = {
1044 .open = dlm_ctl_open,
1045 .release = dlm_ctl_close,
1046 .write = dlm_write,
1047 .owner = THIS_MODULE,
1048};
1049
1050/*
1051 * Create control device
1052 */
1053static int __init dlm_device_init(void)
1054{
1055 int r;
1056
1057 INIT_LIST_HEAD(&user_ls_list);
1058 init_MUTEX(&user_ls_lock);
1059 rwlock_init(&lockinfo_lock);
1060
1061 ctl_device.name = "dlm-control";
1062 ctl_device.fops = &_dlm_ctl_fops;
1063 ctl_device.minor = MISC_DYNAMIC_MINOR;
1064
1065 r = misc_register(&ctl_device);
1066 if (r) {
1067 printk(KERN_ERR "dlm: misc_register failed for control dev\n");
1068 return r;
1069 }
1070
1071 return 0;
1072}
1073
1074static void __exit dlm_device_exit(void)
1075{
1076 misc_deregister(&ctl_device);
1077}
1078
1079MODULE_DESCRIPTION("Distributed Lock Manager device interface");
1080MODULE_AUTHOR("Red Hat, Inc.");
1081MODULE_LICENSE("GPL");
1082
1083module_init(dlm_device_init);
1084module_exit(dlm_device_exit);
diff --git a/fs/dlm/dir.c b/fs/dlm/dir.c
new file mode 100644
index 000000000000..0f1dde54bcd2
--- /dev/null
+++ b/fs/dlm/dir.c
@@ -0,0 +1,423 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#include "dlm_internal.h"
15#include "lockspace.h"
16#include "member.h"
17#include "lowcomms.h"
18#include "rcom.h"
19#include "config.h"
20#include "memory.h"
21#include "recover.h"
22#include "util.h"
23#include "lock.h"
24#include "dir.h"
25
26
27static void put_free_de(struct dlm_ls *ls, struct dlm_direntry *de)
28{
29 spin_lock(&ls->ls_recover_list_lock);
30 list_add(&de->list, &ls->ls_recover_list);
31 spin_unlock(&ls->ls_recover_list_lock);
32}
33
34static struct dlm_direntry *get_free_de(struct dlm_ls *ls, int len)
35{
36 int found = FALSE;
37 struct dlm_direntry *de;
38
39 spin_lock(&ls->ls_recover_list_lock);
40 list_for_each_entry(de, &ls->ls_recover_list, list) {
41 if (de->length == len) {
42 list_del(&de->list);
43 de->master_nodeid = 0;
44 memset(de->name, 0, len);
45 found = TRUE;
46 break;
47 }
48 }
49 spin_unlock(&ls->ls_recover_list_lock);
50
51 if (!found)
52 de = allocate_direntry(ls, len);
53 return de;
54}
55
56void dlm_clear_free_entries(struct dlm_ls *ls)
57{
58 struct dlm_direntry *de;
59
60 spin_lock(&ls->ls_recover_list_lock);
61 while (!list_empty(&ls->ls_recover_list)) {
62 de = list_entry(ls->ls_recover_list.next, struct dlm_direntry,
63 list);
64 list_del(&de->list);
65 free_direntry(de);
66 }
67 spin_unlock(&ls->ls_recover_list_lock);
68}
69
70/*
71 * We use the upper 16 bits of the hash value to select the directory node.
72 * Low bits are used for distribution of rsb's among hash buckets on each node.
73 *
74 * To give the exact range wanted (0 to num_nodes-1), we apply a modulus of
75 * num_nodes to the hash value. This value in the desired range is used as an
76 * offset into the sorted list of nodeid's to give the particular nodeid.
77 */
78
79int dlm_hash2nodeid(struct dlm_ls *ls, uint32_t hash)
80{
81 struct list_head *tmp;
82 struct dlm_member *memb = NULL;
83 uint32_t node, n = 0;
84 int nodeid;
85
86 if (ls->ls_num_nodes == 1) {
87 nodeid = dlm_our_nodeid();
88 goto out;
89 }
90
91 if (ls->ls_node_array) {
92 node = (hash >> 16) % ls->ls_total_weight;
93 nodeid = ls->ls_node_array[node];
94 goto out;
95 }
96
97 /* make_member_array() failed to kmalloc ls_node_array... */
98
99 node = (hash >> 16) % ls->ls_num_nodes;
100
101 list_for_each(tmp, &ls->ls_nodes) {
102 if (n++ != node)
103 continue;
104 memb = list_entry(tmp, struct dlm_member, list);
105 break;
106 }
107
108 DLM_ASSERT(memb , printk("num_nodes=%u n=%u node=%u\n",
109 ls->ls_num_nodes, n, node););
110 nodeid = memb->nodeid;
111 out:
112 return nodeid;
113}
114
115int dlm_dir_nodeid(struct dlm_rsb *r)
116{
117 return dlm_hash2nodeid(r->res_ls, r->res_hash);
118}
119
120static inline uint32_t dir_hash(struct dlm_ls *ls, char *name, int len)
121{
122 uint32_t val;
123
124 val = jhash(name, len, 0);
125 val &= (ls->ls_dirtbl_size - 1);
126
127 return val;
128}
129
130static void add_entry_to_hash(struct dlm_ls *ls, struct dlm_direntry *de)
131{
132 uint32_t bucket;
133
134 bucket = dir_hash(ls, de->name, de->length);
135 list_add_tail(&de->list, &ls->ls_dirtbl[bucket].list);
136}
137
138static struct dlm_direntry *search_bucket(struct dlm_ls *ls, char *name,
139 int namelen, uint32_t bucket)
140{
141 struct dlm_direntry *de;
142
143 list_for_each_entry(de, &ls->ls_dirtbl[bucket].list, list) {
144 if (de->length == namelen && !memcmp(name, de->name, namelen))
145 goto out;
146 }
147 de = NULL;
148 out:
149 return de;
150}
151
152void dlm_dir_remove_entry(struct dlm_ls *ls, int nodeid, char *name, int namelen)
153{
154 struct dlm_direntry *de;
155 uint32_t bucket;
156
157 bucket = dir_hash(ls, name, namelen);
158
159 write_lock(&ls->ls_dirtbl[bucket].lock);
160
161 de = search_bucket(ls, name, namelen, bucket);
162
163 if (!de) {
164 log_error(ls, "remove fr %u none", nodeid);
165 goto out;
166 }
167
168 if (de->master_nodeid != nodeid) {
169 log_error(ls, "remove fr %u ID %u", nodeid, de->master_nodeid);
170 goto out;
171 }
172
173 list_del(&de->list);
174 free_direntry(de);
175 out:
176 write_unlock(&ls->ls_dirtbl[bucket].lock);
177}
178
179void dlm_dir_clear(struct dlm_ls *ls)
180{
181 struct list_head *head;
182 struct dlm_direntry *de;
183 int i;
184
185 DLM_ASSERT(list_empty(&ls->ls_recover_list), );
186
187 for (i = 0; i < ls->ls_dirtbl_size; i++) {
188 write_lock(&ls->ls_dirtbl[i].lock);
189 head = &ls->ls_dirtbl[i].list;
190 while (!list_empty(head)) {
191 de = list_entry(head->next, struct dlm_direntry, list);
192 list_del(&de->list);
193 put_free_de(ls, de);
194 }
195 write_unlock(&ls->ls_dirtbl[i].lock);
196 }
197}
198
199int dlm_recover_directory(struct dlm_ls *ls)
200{
201 struct dlm_member *memb;
202 struct dlm_direntry *de;
203 char *b, *last_name = NULL;
204 int error = -ENOMEM, last_len, count = 0;
205 uint16_t namelen;
206
207 log_debug(ls, "dlm_recover_directory");
208
209 if (dlm_no_directory(ls))
210 goto out_status;
211
212 dlm_dir_clear(ls);
213
214 last_name = kmalloc(DLM_RESNAME_MAXLEN, GFP_KERNEL);
215 if (!last_name)
216 goto out;
217
218 list_for_each_entry(memb, &ls->ls_nodes, list) {
219 memset(last_name, 0, DLM_RESNAME_MAXLEN);
220 last_len = 0;
221
222 for (;;) {
223 error = dlm_recovery_stopped(ls);
224 if (error)
225 goto out_free;
226
227 error = dlm_rcom_names(ls, memb->nodeid,
228 last_name, last_len);
229 if (error)
230 goto out_free;
231
232 schedule();
233
234 /*
235 * pick namelen/name pairs out of received buffer
236 */
237
238 b = ls->ls_recover_buf + sizeof(struct dlm_rcom);
239
240 for (;;) {
241 memcpy(&namelen, b, sizeof(uint16_t));
242 namelen = be16_to_cpu(namelen);
243 b += sizeof(uint16_t);
244
245 /* namelen of 0xFFFFF marks end of names for
246 this node; namelen of 0 marks end of the
247 buffer */
248
249 if (namelen == 0xFFFF)
250 goto done;
251 if (!namelen)
252 break;
253
254 error = -ENOMEM;
255 de = get_free_de(ls, namelen);
256 if (!de)
257 goto out_free;
258
259 de->master_nodeid = memb->nodeid;
260 de->length = namelen;
261 last_len = namelen;
262 memcpy(de->name, b, namelen);
263 memcpy(last_name, b, namelen);
264 b += namelen;
265
266 add_entry_to_hash(ls, de);
267 count++;
268 }
269 }
270 done:
271 ;
272 }
273
274 out_status:
275 error = 0;
276 dlm_set_recover_status(ls, DLM_RS_DIR);
277 log_debug(ls, "dlm_recover_directory %d entries", count);
278 out_free:
279 kfree(last_name);
280 out:
281 dlm_clear_free_entries(ls);
282 return error;
283}
284
285static int get_entry(struct dlm_ls *ls, int nodeid, char *name,
286 int namelen, int *r_nodeid)
287{
288 struct dlm_direntry *de, *tmp;
289 uint32_t bucket;
290
291 bucket = dir_hash(ls, name, namelen);
292
293 write_lock(&ls->ls_dirtbl[bucket].lock);
294 de = search_bucket(ls, name, namelen, bucket);
295 if (de) {
296 *r_nodeid = de->master_nodeid;
297 write_unlock(&ls->ls_dirtbl[bucket].lock);
298 if (*r_nodeid == nodeid)
299 return -EEXIST;
300 return 0;
301 }
302
303 write_unlock(&ls->ls_dirtbl[bucket].lock);
304
305 de = allocate_direntry(ls, namelen);
306 if (!de)
307 return -ENOMEM;
308
309 de->master_nodeid = nodeid;
310 de->length = namelen;
311 memcpy(de->name, name, namelen);
312
313 write_lock(&ls->ls_dirtbl[bucket].lock);
314 tmp = search_bucket(ls, name, namelen, bucket);
315 if (tmp) {
316 free_direntry(de);
317 de = tmp;
318 } else {
319 list_add_tail(&de->list, &ls->ls_dirtbl[bucket].list);
320 }
321 *r_nodeid = de->master_nodeid;
322 write_unlock(&ls->ls_dirtbl[bucket].lock);
323 return 0;
324}
325
326int dlm_dir_lookup(struct dlm_ls *ls, int nodeid, char *name, int namelen,
327 int *r_nodeid)
328{
329 return get_entry(ls, nodeid, name, namelen, r_nodeid);
330}
331
332/* Copy the names of master rsb's into the buffer provided.
333 Only select names whose dir node is the given nodeid. */
334
335void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen,
336 char *outbuf, int outlen, int nodeid)
337{
338 struct list_head *list;
339 struct dlm_rsb *start_r = NULL, *r = NULL;
340 int offset = 0, start_namelen, error, dir_nodeid;
341 char *start_name;
342 uint16_t be_namelen;
343
344 /*
345 * Find the rsb where we left off (or start again)
346 */
347
348 start_namelen = inlen;
349 start_name = inbuf;
350
351 if (start_namelen > 1) {
352 /*
353 * We could also use a find_rsb_root() function here that
354 * searched the ls_root_list.
355 */
356 error = dlm_find_rsb(ls, start_name, start_namelen, R_MASTER,
357 &start_r);
358 DLM_ASSERT(!error && start_r,
359 printk("error %d\n", error););
360 DLM_ASSERT(!list_empty(&start_r->res_root_list),
361 dlm_print_rsb(start_r););
362 dlm_put_rsb(start_r);
363 }
364
365 /*
366 * Send rsb names for rsb's we're master of and whose directory node
367 * matches the requesting node.
368 */
369
370 down_read(&ls->ls_root_sem);
371 if (start_r)
372 list = start_r->res_root_list.next;
373 else
374 list = ls->ls_root_list.next;
375
376 for (offset = 0; list != &ls->ls_root_list; list = list->next) {
377 r = list_entry(list, struct dlm_rsb, res_root_list);
378 if (r->res_nodeid)
379 continue;
380
381 dir_nodeid = dlm_dir_nodeid(r);
382 if (dir_nodeid != nodeid)
383 continue;
384
385 /*
386 * The block ends when we can't fit the following in the
387 * remaining buffer space:
388 * namelen (uint16_t) +
389 * name (r->res_length) +
390 * end-of-block record 0x0000 (uint16_t)
391 */
392
393 if (offset + sizeof(uint16_t)*2 + r->res_length > outlen) {
394 /* Write end-of-block record */
395 be_namelen = 0;
396 memcpy(outbuf + offset, &be_namelen, sizeof(uint16_t));
397 offset += sizeof(uint16_t);
398 goto out;
399 }
400
401 be_namelen = cpu_to_be16(r->res_length);
402 memcpy(outbuf + offset, &be_namelen, sizeof(uint16_t));
403 offset += sizeof(uint16_t);
404 memcpy(outbuf + offset, r->res_name, r->res_length);
405 offset += r->res_length;
406 }
407
408 /*
409 * If we've reached the end of the list (and there's room) write a
410 * terminating record.
411 */
412
413 if ((list == &ls->ls_root_list) &&
414 (offset + sizeof(uint16_t) <= outlen)) {
415 be_namelen = 0xFFFF;
416 memcpy(outbuf + offset, &be_namelen, sizeof(uint16_t));
417 offset += sizeof(uint16_t);
418 }
419
420 out:
421 up_read(&ls->ls_root_sem);
422}
423
diff --git a/fs/dlm/dir.h b/fs/dlm/dir.h
new file mode 100644
index 000000000000..0b0eb1267b6e
--- /dev/null
+++ b/fs/dlm/dir.h
@@ -0,0 +1,30 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#ifndef __DIR_DOT_H__
15#define __DIR_DOT_H__
16
17
18int dlm_dir_nodeid(struct dlm_rsb *rsb);
19int dlm_hash2nodeid(struct dlm_ls *ls, uint32_t hash);
20void dlm_dir_remove_entry(struct dlm_ls *ls, int nodeid, char *name, int len);
21void dlm_dir_clear(struct dlm_ls *ls);
22void dlm_clear_free_entries(struct dlm_ls *ls);
23int dlm_recover_directory(struct dlm_ls *ls);
24int dlm_dir_lookup(struct dlm_ls *ls, int nodeid, char *name, int namelen,
25 int *r_nodeid);
26void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen,
27 char *outbuf, int outlen, int nodeid);
28
29#endif /* __DIR_DOT_H__ */
30
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
new file mode 100644
index 000000000000..0020cd07baf7
--- /dev/null
+++ b/fs/dlm/dlm_internal.h
@@ -0,0 +1,518 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#ifndef __DLM_INTERNAL_DOT_H__
15#define __DLM_INTERNAL_DOT_H__
16
17/*
18 * This is the main header file to be included in each DLM source file.
19 */
20
21#include <linux/module.h>
22#include <linux/slab.h>
23#include <linux/sched.h>
24#include <linux/types.h>
25#include <linux/ctype.h>
26#include <linux/spinlock.h>
27#include <linux/vmalloc.h>
28#include <linux/list.h>
29#include <linux/errno.h>
30#include <linux/random.h>
31#include <linux/delay.h>
32#include <linux/socket.h>
33#include <linux/kthread.h>
34#include <linux/kobject.h>
35#include <linux/kref.h>
36#include <linux/kernel.h>
37#include <linux/jhash.h>
38#include <asm/semaphore.h>
39#include <asm/uaccess.h>
40
41#include <linux/dlm.h>
42
43#define DLM_LOCKSPACE_LEN 64
44
45#ifndef TRUE
46#define TRUE 1
47#endif
48
49#ifndef FALSE
50#define FALSE 0
51#endif
52
53#if (BITS_PER_LONG == 64)
54#define PRIx64 "lx"
55#else
56#define PRIx64 "Lx"
57#endif
58
59/* Size of the temp buffer midcomms allocates on the stack.
60 We try to make this large enough so most messages fit.
61 FIXME: should sctp make this unnecessary? */
62
63#define DLM_INBUF_LEN 148
64
65struct dlm_ls;
66struct dlm_lkb;
67struct dlm_rsb;
68struct dlm_member;
69struct dlm_lkbtable;
70struct dlm_rsbtable;
71struct dlm_dirtable;
72struct dlm_direntry;
73struct dlm_recover;
74struct dlm_header;
75struct dlm_message;
76struct dlm_rcom;
77struct dlm_mhandle;
78
79#define log_print(fmt, args...) \
80 printk(KERN_ERR "dlm: "fmt"\n" , ##args)
81#define log_error(ls, fmt, args...) \
82 printk(KERN_ERR "dlm: %s: " fmt "\n", (ls)->ls_name , ##args)
83
84#ifdef DLM_LOG_DEBUG
85#define log_debug(ls, fmt, args...) log_error(ls, fmt, ##args)
86#else
87#define log_debug(ls, fmt, args...)
88#endif
89
90#define DLM_ASSERT(x, do) \
91{ \
92 if (!(x)) \
93 { \
94 printk(KERN_ERR "\nDLM: Assertion failed on line %d of file %s\n" \
95 "DLM: assertion: \"%s\"\n" \
96 "DLM: time = %lu\n", \
97 __LINE__, __FILE__, #x, jiffies); \
98 {do} \
99 printk("\n"); \
100 BUG(); \
101 panic("DLM: Record message above and reboot.\n"); \
102 } \
103}
104
105
106struct dlm_direntry {
107 struct list_head list;
108 uint32_t master_nodeid;
109 uint16_t length;
110 char name[1];
111};
112
113struct dlm_dirtable {
114 struct list_head list;
115 rwlock_t lock;
116};
117
118struct dlm_rsbtable {
119 struct list_head list;
120 struct list_head toss;
121 rwlock_t lock;
122};
123
124struct dlm_lkbtable {
125 struct list_head list;
126 rwlock_t lock;
127 uint16_t counter;
128};
129
130/*
131 * Lockspace member (per node in a ls)
132 */
133
134struct dlm_member {
135 struct list_head list;
136 int nodeid;
137 int weight;
138};
139
140/*
141 * Save and manage recovery state for a lockspace.
142 */
143
144struct dlm_recover {
145 struct list_head list;
146 int *nodeids;
147 int node_count;
148 uint64_t seq;
149};
150
151/*
152 * Pass input args to second stage locking function.
153 */
154
155struct dlm_args {
156 uint32_t flags;
157 void *astaddr;
158 long astparam;
159 void *bastaddr;
160 int mode;
161 struct dlm_lksb *lksb;
162 struct dlm_range *range;
163};
164
165
166/*
167 * Lock block
168 *
169 * A lock can be one of three types:
170 *
171 * local copy lock is mastered locally
172 * (lkb_nodeid is zero and DLM_LKF_MSTCPY is not set)
173 * process copy lock is mastered on a remote node
174 * (lkb_nodeid is non-zero and DLM_LKF_MSTCPY is not set)
175 * master copy master node's copy of a lock owned by remote node
176 * (lkb_nodeid is non-zero and DLM_LKF_MSTCPY is set)
177 *
178 * lkb_exflags: a copy of the most recent flags arg provided to dlm_lock or
179 * dlm_unlock. The dlm does not modify these or use any private flags in
180 * this field; it only contains DLM_LKF_ flags from dlm.h. These flags
181 * are sent as-is to the remote master when the lock is remote.
182 *
183 * lkb_flags: internal dlm flags (DLM_IFL_ prefix) from dlm_internal.h.
184 * Some internal flags are shared between the master and process nodes;
185 * these shared flags are kept in the lower two bytes. One of these
186 * flags set on the master copy will be propagated to the process copy
187 * and v.v. Other internal flags are private to the master or process
188 * node (e.g. DLM_IFL_MSTCPY). These are kept in the high two bytes.
189 *
190 * lkb_sbflags: status block flags. These flags are copied directly into
191 * the caller's lksb.sb_flags prior to the dlm_lock/dlm_unlock completion
192 * ast. All defined in dlm.h with DLM_SBF_ prefix.
193 *
194 * lkb_status: the lock status indicates which rsb queue the lock is
195 * on, grant, convert, or wait. DLM_LKSTS_ WAITING/GRANTED/CONVERT
196 *
197 * lkb_wait_type: the dlm message type (DLM_MSG_ prefix) for which a
198 * reply is needed. Only set when the lkb is on the lockspace waiters
199 * list awaiting a reply from a remote node.
200 *
201 * lkb_nodeid: when the lkb is a local copy, nodeid is 0; when the lkb
202 * is a master copy, nodeid specifies the remote lock holder, when the
203 * lkb is a process copy, the nodeid specifies the lock master.
204 */
205
206/* lkb_ast_type */
207
208#define AST_COMP 1
209#define AST_BAST 2
210
211/* lkb_range[] */
212
213#define GR_RANGE_START 0
214#define GR_RANGE_END 1
215#define RQ_RANGE_START 2
216#define RQ_RANGE_END 3
217
218/* lkb_status */
219
220#define DLM_LKSTS_WAITING 1
221#define DLM_LKSTS_GRANTED 2
222#define DLM_LKSTS_CONVERT 3
223
224/* lkb_flags */
225
226#define DLM_IFL_MSTCPY 0x00010000
227#define DLM_IFL_RESEND 0x00020000
228#define DLM_IFL_RANGE 0x00000001
229
230struct dlm_lkb {
231 struct dlm_rsb *lkb_resource; /* the rsb */
232 struct kref lkb_ref;
233 int lkb_nodeid; /* copied from rsb */
234 int lkb_ownpid; /* pid of lock owner */
235 uint32_t lkb_id; /* our lock ID */
236 uint32_t lkb_remid; /* lock ID on remote partner */
237 uint32_t lkb_exflags; /* external flags from caller */
238 uint32_t lkb_sbflags; /* lksb flags */
239 uint32_t lkb_flags; /* internal flags */
240 uint32_t lkb_lvbseq; /* lvb sequence number */
241
242 int8_t lkb_status; /* granted, waiting, convert */
243 int8_t lkb_rqmode; /* requested lock mode */
244 int8_t lkb_grmode; /* granted lock mode */
245 int8_t lkb_bastmode; /* requested mode */
246 int8_t lkb_highbast; /* highest mode bast sent for */
247
248 int8_t lkb_wait_type; /* type of reply waiting for */
249 int8_t lkb_ast_type; /* type of ast queued for */
250
251 struct list_head lkb_idtbl_list; /* lockspace lkbtbl */
252 struct list_head lkb_statequeue; /* rsb g/c/w list */
253 struct list_head lkb_rsb_lookup; /* waiting for rsb lookup */
254 struct list_head lkb_wait_reply; /* waiting for remote reply */
255 struct list_head lkb_astqueue; /* need ast to be sent */
256
257 uint64_t *lkb_range; /* array of gr/rq ranges */
258 char *lkb_lvbptr;
259 struct dlm_lksb *lkb_lksb; /* caller's status block */
260 void *lkb_astaddr; /* caller's ast function */
261 void *lkb_bastaddr; /* caller's bast function */
262 long lkb_astparam; /* caller's ast arg */
263};
264
265
266struct dlm_rsb {
267 struct dlm_ls *res_ls; /* the lockspace */
268 struct kref res_ref;
269 struct semaphore res_sem;
270 unsigned long res_flags;
271 int res_length; /* length of rsb name */
272 int res_nodeid;
273 uint32_t res_lvbseq;
274 uint32_t res_hash;
275 uint32_t res_bucket; /* rsbtbl */
276 unsigned long res_toss_time;
277 uint32_t res_first_lkid;
278 struct list_head res_lookup; /* lkbs waiting on first */
279 struct list_head res_hashchain; /* rsbtbl */
280 struct list_head res_grantqueue;
281 struct list_head res_convertqueue;
282 struct list_head res_waitqueue;
283
284 struct list_head res_root_list; /* used for recovery */
285 struct list_head res_recover_list; /* used for recovery */
286 int res_recover_locks_count;
287
288 char *res_lvbptr;
289 char res_name[1];
290};
291
292/* find_rsb() flags */
293
294#define R_MASTER 1 /* only return rsb if it's a master */
295#define R_CREATE 2 /* create/add rsb if not found */
296
297/* rsb_flags */
298
299enum rsb_flags {
300 RSB_MASTER_UNCERTAIN,
301 RSB_VALNOTVALID,
302 RSB_VALNOTVALID_PREV,
303 RSB_NEW_MASTER,
304 RSB_NEW_MASTER2,
305 RSB_RECOVER_CONVERT,
306};
307
308static inline void rsb_set_flag(struct dlm_rsb *r, enum rsb_flags flag)
309{
310 __set_bit(flag, &r->res_flags);
311}
312
313static inline void rsb_clear_flag(struct dlm_rsb *r, enum rsb_flags flag)
314{
315 __clear_bit(flag, &r->res_flags);
316}
317
318static inline int rsb_flag(struct dlm_rsb *r, enum rsb_flags flag)
319{
320 return test_bit(flag, &r->res_flags);
321}
322
323
324/* dlm_header is first element of all structs sent between nodes */
325
326#define DLM_HEADER_MAJOR 0x00020000
327#define DLM_HEADER_MINOR 0x00000001
328
329#define DLM_MSG 1
330#define DLM_RCOM 2
331
332struct dlm_header {
333 uint32_t h_version;
334 uint32_t h_lockspace;
335 uint32_t h_nodeid; /* nodeid of sender */
336 uint16_t h_length;
337 uint8_t h_cmd; /* DLM_MSG, DLM_RCOM */
338 uint8_t h_pad;
339};
340
341
342#define DLM_MSG_REQUEST 1
343#define DLM_MSG_CONVERT 2
344#define DLM_MSG_UNLOCK 3
345#define DLM_MSG_CANCEL 4
346#define DLM_MSG_REQUEST_REPLY 5
347#define DLM_MSG_CONVERT_REPLY 6
348#define DLM_MSG_UNLOCK_REPLY 7
349#define DLM_MSG_CANCEL_REPLY 8
350#define DLM_MSG_GRANT 9
351#define DLM_MSG_BAST 10
352#define DLM_MSG_LOOKUP 11
353#define DLM_MSG_REMOVE 12
354#define DLM_MSG_LOOKUP_REPLY 13
355
356struct dlm_message {
357 struct dlm_header m_header;
358 uint32_t m_type; /* DLM_MSG_ */
359 uint32_t m_nodeid;
360 uint32_t m_pid;
361 uint32_t m_lkid; /* lkid on sender */
362 uint32_t m_remid; /* lkid on receiver */
363 uint32_t m_parent_lkid;
364 uint32_t m_parent_remid;
365 uint32_t m_exflags;
366 uint32_t m_sbflags;
367 uint32_t m_flags;
368 uint32_t m_lvbseq;
369 uint32_t m_hash;
370 int m_status;
371 int m_grmode;
372 int m_rqmode;
373 int m_bastmode;
374 int m_asts;
375 int m_result; /* 0 or -EXXX */
376 uint64_t m_range[2];
377 char m_extra[0]; /* name or lvb */
378};
379
380
381#define DLM_RS_NODES 0x00000001
382#define DLM_RS_NODES_ALL 0x00000002
383#define DLM_RS_DIR 0x00000004
384#define DLM_RS_DIR_ALL 0x00000008
385#define DLM_RS_LOCKS 0x00000010
386#define DLM_RS_LOCKS_ALL 0x00000020
387#define DLM_RS_DONE 0x00000040
388#define DLM_RS_DONE_ALL 0x00000080
389
390#define DLM_RCOM_STATUS 1
391#define DLM_RCOM_NAMES 2
392#define DLM_RCOM_LOOKUP 3
393#define DLM_RCOM_LOCK 4
394#define DLM_RCOM_STATUS_REPLY 5
395#define DLM_RCOM_NAMES_REPLY 6
396#define DLM_RCOM_LOOKUP_REPLY 7
397#define DLM_RCOM_LOCK_REPLY 8
398
399struct dlm_rcom {
400 struct dlm_header rc_header;
401 uint32_t rc_type; /* DLM_RCOM_ */
402 int rc_result; /* multi-purpose */
403 uint64_t rc_id; /* match reply with request */
404 char rc_buf[0];
405};
406
407struct rcom_config {
408 uint32_t rf_lvblen;
409 uint32_t rf_lsflags;
410 uint64_t rf_unused;
411};
412
413struct rcom_lock {
414 uint32_t rl_ownpid;
415 uint32_t rl_lkid;
416 uint32_t rl_remid;
417 uint32_t rl_parent_lkid;
418 uint32_t rl_parent_remid;
419 uint32_t rl_exflags;
420 uint32_t rl_flags;
421 uint32_t rl_lvbseq;
422 int rl_result;
423 int8_t rl_rqmode;
424 int8_t rl_grmode;
425 int8_t rl_status;
426 int8_t rl_asts;
427 uint16_t rl_wait_type;
428 uint16_t rl_namelen;
429 uint64_t rl_range[4];
430 char rl_name[DLM_RESNAME_MAXLEN];
431 char rl_lvb[0];
432};
433
434struct dlm_ls {
435 struct list_head ls_list; /* list of lockspaces */
436 uint32_t ls_global_id; /* global unique lockspace ID */
437 uint32_t ls_exflags;
438 int ls_lvblen;
439 int ls_count; /* reference count */
440 unsigned long ls_flags; /* LSFL_ */
441 struct kobject ls_kobj;
442
443 struct dlm_rsbtable *ls_rsbtbl;
444 uint32_t ls_rsbtbl_size;
445
446 struct dlm_lkbtable *ls_lkbtbl;
447 uint32_t ls_lkbtbl_size;
448
449 struct dlm_dirtable *ls_dirtbl;
450 uint32_t ls_dirtbl_size;
451
452 struct semaphore ls_waiters_sem;
453 struct list_head ls_waiters; /* lkbs needing a reply */
454
455 struct list_head ls_nodes; /* current nodes in ls */
456 struct list_head ls_nodes_gone; /* dead node list, recovery */
457 int ls_num_nodes; /* number of nodes in ls */
458 int ls_low_nodeid;
459 int ls_total_weight;
460 int *ls_node_array;
461
462 struct dlm_rsb ls_stub_rsb; /* for returning errors */
463 struct dlm_lkb ls_stub_lkb; /* for returning errors */
464 struct dlm_message ls_stub_ms; /* for faking a reply */
465
466 struct dentry *ls_debug_dentry; /* debugfs */
467
468 wait_queue_head_t ls_uevent_wait; /* user part of join/leave */
469 int ls_uevent_result;
470
471 /* recovery related */
472
473 struct timer_list ls_timer;
474 struct task_struct *ls_recoverd_task;
475 struct semaphore ls_recoverd_active;
476 spinlock_t ls_recover_lock;
477 uint32_t ls_recover_status; /* DLM_RS_ */
478 uint64_t ls_recover_seq;
479 struct dlm_recover *ls_recover_args;
480 struct rw_semaphore ls_in_recovery; /* block local requests */
481 struct list_head ls_requestqueue;/* queue remote requests */
482 struct semaphore ls_requestqueue_lock;
483 char *ls_recover_buf;
484 struct list_head ls_recover_list;
485 spinlock_t ls_recover_list_lock;
486 int ls_recover_list_count;
487 wait_queue_head_t ls_wait_general;
488
489 struct list_head ls_root_list; /* root resources */
490 struct rw_semaphore ls_root_sem; /* protect root_list */
491
492 int ls_namelen;
493 char ls_name[1];
494};
495
496#define LSFL_WORK 0
497#define LSFL_RUNNING 1
498#define LSFL_RECOVERY_STOP 2
499#define LSFL_RCOM_READY 3
500#define LSFL_UEVENT_WAIT 4
501
502static inline int dlm_locking_stopped(struct dlm_ls *ls)
503{
504 return !test_bit(LSFL_RUNNING, &ls->ls_flags);
505}
506
507static inline int dlm_recovery_stopped(struct dlm_ls *ls)
508{
509 return test_bit(LSFL_RECOVERY_STOP, &ls->ls_flags);
510}
511
512static inline int dlm_no_directory(struct dlm_ls *ls)
513{
514 return (ls->ls_exflags & DLM_LSFL_NODIR) ? 1 : 0;
515}
516
517#endif /* __DLM_INTERNAL_DOT_H__ */
518
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
new file mode 100644
index 000000000000..81efb361f95d
--- /dev/null
+++ b/fs/dlm/lock.c
@@ -0,0 +1,3610 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13/* Central locking logic has four stages:
14
15 dlm_lock()
16 dlm_unlock()
17
18 request_lock(ls, lkb)
19 convert_lock(ls, lkb)
20 unlock_lock(ls, lkb)
21 cancel_lock(ls, lkb)
22
23 _request_lock(r, lkb)
24 _convert_lock(r, lkb)
25 _unlock_lock(r, lkb)
26 _cancel_lock(r, lkb)
27
28 do_request(r, lkb)
29 do_convert(r, lkb)
30 do_unlock(r, lkb)
31 do_cancel(r, lkb)
32
33 Stage 1 (lock, unlock) is mainly about checking input args and
34 splitting into one of the four main operations:
35
36 dlm_lock = request_lock
37 dlm_lock+CONVERT = convert_lock
38 dlm_unlock = unlock_lock
39 dlm_unlock+CANCEL = cancel_lock
40
41 Stage 2, xxxx_lock(), just finds and locks the relevant rsb which is
42 provided to the next stage.
43
44 Stage 3, _xxxx_lock(), determines if the operation is local or remote.
45 When remote, it calls send_xxxx(), when local it calls do_xxxx().
46
47 Stage 4, do_xxxx(), is the guts of the operation. It manipulates the
48 given rsb and lkb and queues callbacks.
49
50 For remote operations, send_xxxx() results in the corresponding do_xxxx()
51 function being executed on the remote node. The connecting send/receive
52 calls on local (L) and remote (R) nodes:
53
54 L: send_xxxx() -> R: receive_xxxx()
55 R: do_xxxx()
56 L: receive_xxxx_reply() <- R: send_xxxx_reply()
57*/
58
59#include "dlm_internal.h"
60#include "memory.h"
61#include "lowcomms.h"
62#include "requestqueue.h"
63#include "util.h"
64#include "dir.h"
65#include "member.h"
66#include "lockspace.h"
67#include "ast.h"
68#include "lock.h"
69#include "rcom.h"
70#include "recover.h"
71#include "lvb_table.h"
72#include "config.h"
73
74static int send_request(struct dlm_rsb *r, struct dlm_lkb *lkb);
75static int send_convert(struct dlm_rsb *r, struct dlm_lkb *lkb);
76static int send_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb);
77static int send_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb);
78static int send_grant(struct dlm_rsb *r, struct dlm_lkb *lkb);
79static int send_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int mode);
80static int send_lookup(struct dlm_rsb *r, struct dlm_lkb *lkb);
81static int send_remove(struct dlm_rsb *r);
82static int _request_lock(struct dlm_rsb *r, struct dlm_lkb *lkb);
83static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
84 struct dlm_message *ms);
85static int receive_extralen(struct dlm_message *ms);
86
87/*
88 * Lock compatibilty matrix - thanks Steve
89 * UN = Unlocked state. Not really a state, used as a flag
90 * PD = Padding. Used to make the matrix a nice power of two in size
91 * Other states are the same as the VMS DLM.
92 * Usage: matrix[grmode+1][rqmode+1] (although m[rq+1][gr+1] is the same)
93 */
94
95static const int __dlm_compat_matrix[8][8] = {
96 /* UN NL CR CW PR PW EX PD */
97 {1, 1, 1, 1, 1, 1, 1, 0}, /* UN */
98 {1, 1, 1, 1, 1, 1, 1, 0}, /* NL */
99 {1, 1, 1, 1, 1, 1, 0, 0}, /* CR */
100 {1, 1, 1, 1, 0, 0, 0, 0}, /* CW */
101 {1, 1, 1, 0, 1, 0, 0, 0}, /* PR */
102 {1, 1, 1, 0, 0, 0, 0, 0}, /* PW */
103 {1, 1, 0, 0, 0, 0, 0, 0}, /* EX */
104 {0, 0, 0, 0, 0, 0, 0, 0} /* PD */
105};
106
107/*
108 * This defines the direction of transfer of LVB data.
109 * Granted mode is the row; requested mode is the column.
110 * Usage: matrix[grmode+1][rqmode+1]
111 * 1 = LVB is returned to the caller
112 * 0 = LVB is written to the resource
113 * -1 = nothing happens to the LVB
114 */
115
116const int dlm_lvb_operations[8][8] = {
117 /* UN NL CR CW PR PW EX PD*/
118 { -1, 1, 1, 1, 1, 1, 1, -1 }, /* UN */
119 { -1, 1, 1, 1, 1, 1, 1, 0 }, /* NL */
120 { -1, -1, 1, 1, 1, 1, 1, 0 }, /* CR */
121 { -1, -1, -1, 1, 1, 1, 1, 0 }, /* CW */
122 { -1, -1, -1, -1, 1, 1, 1, 0 }, /* PR */
123 { -1, 0, 0, 0, 0, 0, 1, 0 }, /* PW */
124 { -1, 0, 0, 0, 0, 0, 0, 0 }, /* EX */
125 { -1, 0, 0, 0, 0, 0, 0, 0 } /* PD */
126};
127EXPORT_SYMBOL_GPL(dlm_lvb_operations);
128
129#define modes_compat(gr, rq) \
130 __dlm_compat_matrix[(gr)->lkb_grmode + 1][(rq)->lkb_rqmode + 1]
131
132int dlm_modes_compat(int mode1, int mode2)
133{
134 return __dlm_compat_matrix[mode1 + 1][mode2 + 1];
135}
136
137/*
138 * Compatibility matrix for conversions with QUECVT set.
139 * Granted mode is the row; requested mode is the column.
140 * Usage: matrix[grmode+1][rqmode+1]
141 */
142
143static const int __quecvt_compat_matrix[8][8] = {
144 /* UN NL CR CW PR PW EX PD */
145 {0, 0, 0, 0, 0, 0, 0, 0}, /* UN */
146 {0, 0, 1, 1, 1, 1, 1, 0}, /* NL */
147 {0, 0, 0, 1, 1, 1, 1, 0}, /* CR */
148 {0, 0, 0, 0, 1, 1, 1, 0}, /* CW */
149 {0, 0, 0, 1, 0, 1, 1, 0}, /* PR */
150 {0, 0, 0, 0, 0, 0, 1, 0}, /* PW */
151 {0, 0, 0, 0, 0, 0, 0, 0}, /* EX */
152 {0, 0, 0, 0, 0, 0, 0, 0} /* PD */
153};
154
155static void dlm_print_lkb(struct dlm_lkb *lkb)
156{
157 printk(KERN_ERR "lkb: nodeid %d id %x remid %x exflags %x flags %x\n"
158 " status %d rqmode %d grmode %d wait_type %d ast_type %d\n",
159 lkb->lkb_nodeid, lkb->lkb_id, lkb->lkb_remid, lkb->lkb_exflags,
160 lkb->lkb_flags, lkb->lkb_status, lkb->lkb_rqmode,
161 lkb->lkb_grmode, lkb->lkb_wait_type, lkb->lkb_ast_type);
162}
163
164void dlm_print_rsb(struct dlm_rsb *r)
165{
166 printk(KERN_ERR "rsb: nodeid %d flags %lx first %x rlc %d name %s\n",
167 r->res_nodeid, r->res_flags, r->res_first_lkid,
168 r->res_recover_locks_count, r->res_name);
169}
170
171/* Threads cannot use the lockspace while it's being recovered */
172
173static inline void lock_recovery(struct dlm_ls *ls)
174{
175 down_read(&ls->ls_in_recovery);
176}
177
178static inline void unlock_recovery(struct dlm_ls *ls)
179{
180 up_read(&ls->ls_in_recovery);
181}
182
183static inline int lock_recovery_try(struct dlm_ls *ls)
184{
185 return down_read_trylock(&ls->ls_in_recovery);
186}
187
188static inline int can_be_queued(struct dlm_lkb *lkb)
189{
190 return !(lkb->lkb_exflags & DLM_LKF_NOQUEUE);
191}
192
193static inline int force_blocking_asts(struct dlm_lkb *lkb)
194{
195 return (lkb->lkb_exflags & DLM_LKF_NOQUEUEBAST);
196}
197
198static inline int is_demoted(struct dlm_lkb *lkb)
199{
200 return (lkb->lkb_sbflags & DLM_SBF_DEMOTED);
201}
202
203static inline int is_remote(struct dlm_rsb *r)
204{
205 DLM_ASSERT(r->res_nodeid >= 0, dlm_print_rsb(r););
206 return !!r->res_nodeid;
207}
208
209static inline int is_process_copy(struct dlm_lkb *lkb)
210{
211 return (lkb->lkb_nodeid && !(lkb->lkb_flags & DLM_IFL_MSTCPY));
212}
213
214static inline int is_master_copy(struct dlm_lkb *lkb)
215{
216 if (lkb->lkb_flags & DLM_IFL_MSTCPY)
217 DLM_ASSERT(lkb->lkb_nodeid, dlm_print_lkb(lkb););
218 return (lkb->lkb_flags & DLM_IFL_MSTCPY) ? TRUE : FALSE;
219}
220
221static inline int middle_conversion(struct dlm_lkb *lkb)
222{
223 if ((lkb->lkb_grmode==DLM_LOCK_PR && lkb->lkb_rqmode==DLM_LOCK_CW) ||
224 (lkb->lkb_rqmode==DLM_LOCK_PR && lkb->lkb_grmode==DLM_LOCK_CW))
225 return TRUE;
226 return FALSE;
227}
228
229static inline int down_conversion(struct dlm_lkb *lkb)
230{
231 return (!middle_conversion(lkb) && lkb->lkb_rqmode < lkb->lkb_grmode);
232}
233
234static void queue_cast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
235{
236 if (is_master_copy(lkb))
237 return;
238
239 DLM_ASSERT(lkb->lkb_lksb, dlm_print_lkb(lkb););
240
241 lkb->lkb_lksb->sb_status = rv;
242 lkb->lkb_lksb->sb_flags = lkb->lkb_sbflags;
243
244 dlm_add_ast(lkb, AST_COMP);
245}
246
247static void queue_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rqmode)
248{
249 if (is_master_copy(lkb))
250 send_bast(r, lkb, rqmode);
251 else {
252 lkb->lkb_bastmode = rqmode;
253 dlm_add_ast(lkb, AST_BAST);
254 }
255}
256
257/*
258 * Basic operations on rsb's and lkb's
259 */
260
261static struct dlm_rsb *create_rsb(struct dlm_ls *ls, char *name, int len)
262{
263 struct dlm_rsb *r;
264
265 r = allocate_rsb(ls, len);
266 if (!r)
267 return NULL;
268
269 r->res_ls = ls;
270 r->res_length = len;
271 memcpy(r->res_name, name, len);
272 init_MUTEX(&r->res_sem);
273
274 INIT_LIST_HEAD(&r->res_lookup);
275 INIT_LIST_HEAD(&r->res_grantqueue);
276 INIT_LIST_HEAD(&r->res_convertqueue);
277 INIT_LIST_HEAD(&r->res_waitqueue);
278 INIT_LIST_HEAD(&r->res_root_list);
279 INIT_LIST_HEAD(&r->res_recover_list);
280
281 return r;
282}
283
284static int search_rsb_list(struct list_head *head, char *name, int len,
285 unsigned int flags, struct dlm_rsb **r_ret)
286{
287 struct dlm_rsb *r;
288 int error = 0;
289
290 list_for_each_entry(r, head, res_hashchain) {
291 if (len == r->res_length && !memcmp(name, r->res_name, len))
292 goto found;
293 }
294 return -ENOENT;
295
296 found:
297 if (r->res_nodeid && (flags & R_MASTER))
298 error = -ENOTBLK;
299 *r_ret = r;
300 return error;
301}
302
303static int _search_rsb(struct dlm_ls *ls, char *name, int len, int b,
304 unsigned int flags, struct dlm_rsb **r_ret)
305{
306 struct dlm_rsb *r;
307 int error;
308
309 error = search_rsb_list(&ls->ls_rsbtbl[b].list, name, len, flags, &r);
310 if (!error) {
311 kref_get(&r->res_ref);
312 goto out;
313 }
314 error = search_rsb_list(&ls->ls_rsbtbl[b].toss, name, len, flags, &r);
315 if (error)
316 goto out;
317
318 list_move(&r->res_hashchain, &ls->ls_rsbtbl[b].list);
319
320 if (dlm_no_directory(ls))
321 goto out;
322
323 if (r->res_nodeid == -1) {
324 rsb_clear_flag(r, RSB_MASTER_UNCERTAIN);
325 r->res_first_lkid = 0;
326 } else if (r->res_nodeid > 0) {
327 rsb_set_flag(r, RSB_MASTER_UNCERTAIN);
328 r->res_first_lkid = 0;
329 } else {
330 DLM_ASSERT(r->res_nodeid == 0, dlm_print_rsb(r););
331 DLM_ASSERT(!rsb_flag(r, RSB_MASTER_UNCERTAIN),);
332 }
333 out:
334 *r_ret = r;
335 return error;
336}
337
338static int search_rsb(struct dlm_ls *ls, char *name, int len, int b,
339 unsigned int flags, struct dlm_rsb **r_ret)
340{
341 int error;
342 write_lock(&ls->ls_rsbtbl[b].lock);
343 error = _search_rsb(ls, name, len, b, flags, r_ret);
344 write_unlock(&ls->ls_rsbtbl[b].lock);
345 return error;
346}
347
348/*
349 * Find rsb in rsbtbl and potentially create/add one
350 *
351 * Delaying the release of rsb's has a similar benefit to applications keeping
352 * NL locks on an rsb, but without the guarantee that the cached master value
353 * will still be valid when the rsb is reused. Apps aren't always smart enough
354 * to keep NL locks on an rsb that they may lock again shortly; this can lead
355 * to excessive master lookups and removals if we don't delay the release.
356 *
357 * Searching for an rsb means looking through both the normal list and toss
358 * list. When found on the toss list the rsb is moved to the normal list with
359 * ref count of 1; when found on normal list the ref count is incremented.
360 */
361
362static int find_rsb(struct dlm_ls *ls, char *name, int namelen,
363 unsigned int flags, struct dlm_rsb **r_ret)
364{
365 struct dlm_rsb *r, *tmp;
366 uint32_t hash, bucket;
367 int error = 0;
368
369 if (dlm_no_directory(ls))
370 flags |= R_CREATE;
371
372 hash = jhash(name, namelen, 0);
373 bucket = hash & (ls->ls_rsbtbl_size - 1);
374
375 error = search_rsb(ls, name, namelen, bucket, flags, &r);
376 if (!error)
377 goto out;
378
379 if (error == -ENOENT && !(flags & R_CREATE))
380 goto out;
381
382 /* the rsb was found but wasn't a master copy */
383 if (error == -ENOTBLK)
384 goto out;
385
386 error = -ENOMEM;
387 r = create_rsb(ls, name, namelen);
388 if (!r)
389 goto out;
390
391 r->res_hash = hash;
392 r->res_bucket = bucket;
393 r->res_nodeid = -1;
394 kref_init(&r->res_ref);
395
396 /* With no directory, the master can be set immediately */
397 if (dlm_no_directory(ls)) {
398 int nodeid = dlm_dir_nodeid(r);
399 if (nodeid == dlm_our_nodeid())
400 nodeid = 0;
401 r->res_nodeid = nodeid;
402 }
403
404 write_lock(&ls->ls_rsbtbl[bucket].lock);
405 error = _search_rsb(ls, name, namelen, bucket, 0, &tmp);
406 if (!error) {
407 write_unlock(&ls->ls_rsbtbl[bucket].lock);
408 free_rsb(r);
409 r = tmp;
410 goto out;
411 }
412 list_add(&r->res_hashchain, &ls->ls_rsbtbl[bucket].list);
413 write_unlock(&ls->ls_rsbtbl[bucket].lock);
414 error = 0;
415 out:
416 *r_ret = r;
417 return error;
418}
419
420int dlm_find_rsb(struct dlm_ls *ls, char *name, int namelen,
421 unsigned int flags, struct dlm_rsb **r_ret)
422{
423 return find_rsb(ls, name, namelen, flags, r_ret);
424}
425
426/* This is only called to add a reference when the code already holds
427 a valid reference to the rsb, so there's no need for locking. */
428
429static inline void hold_rsb(struct dlm_rsb *r)
430{
431 kref_get(&r->res_ref);
432}
433
434void dlm_hold_rsb(struct dlm_rsb *r)
435{
436 hold_rsb(r);
437}
438
439static void toss_rsb(struct kref *kref)
440{
441 struct dlm_rsb *r = container_of(kref, struct dlm_rsb, res_ref);
442 struct dlm_ls *ls = r->res_ls;
443
444 DLM_ASSERT(list_empty(&r->res_root_list), dlm_print_rsb(r););
445 kref_init(&r->res_ref);
446 list_move(&r->res_hashchain, &ls->ls_rsbtbl[r->res_bucket].toss);
447 r->res_toss_time = jiffies;
448 if (r->res_lvbptr) {
449 free_lvb(r->res_lvbptr);
450 r->res_lvbptr = NULL;
451 }
452}
453
454/* When all references to the rsb are gone it's transfered to
455 the tossed list for later disposal. */
456
457static void put_rsb(struct dlm_rsb *r)
458{
459 struct dlm_ls *ls = r->res_ls;
460 uint32_t bucket = r->res_bucket;
461
462 write_lock(&ls->ls_rsbtbl[bucket].lock);
463 kref_put(&r->res_ref, toss_rsb);
464 write_unlock(&ls->ls_rsbtbl[bucket].lock);
465}
466
467void dlm_put_rsb(struct dlm_rsb *r)
468{
469 put_rsb(r);
470}
471
472/* See comment for unhold_lkb */
473
474static void unhold_rsb(struct dlm_rsb *r)
475{
476 int rv;
477 rv = kref_put(&r->res_ref, toss_rsb);
478 DLM_ASSERT(!rv, dlm_print_rsb(r););
479}
480
481static void kill_rsb(struct kref *kref)
482{
483 struct dlm_rsb *r = container_of(kref, struct dlm_rsb, res_ref);
484
485 /* All work is done after the return from kref_put() so we
486 can release the write_lock before the remove and free. */
487
488 DLM_ASSERT(list_empty(&r->res_lookup),);
489 DLM_ASSERT(list_empty(&r->res_grantqueue),);
490 DLM_ASSERT(list_empty(&r->res_convertqueue),);
491 DLM_ASSERT(list_empty(&r->res_waitqueue),);
492 DLM_ASSERT(list_empty(&r->res_root_list),);
493 DLM_ASSERT(list_empty(&r->res_recover_list),);
494}
495
496/* Attaching/detaching lkb's from rsb's is for rsb reference counting.
497 The rsb must exist as long as any lkb's for it do. */
498
499static void attach_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb)
500{
501 hold_rsb(r);
502 lkb->lkb_resource = r;
503}
504
505static void detach_lkb(struct dlm_lkb *lkb)
506{
507 if (lkb->lkb_resource) {
508 put_rsb(lkb->lkb_resource);
509 lkb->lkb_resource = NULL;
510 }
511}
512
513static int create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret)
514{
515 struct dlm_lkb *lkb, *tmp;
516 uint32_t lkid = 0;
517 uint16_t bucket;
518
519 lkb = allocate_lkb(ls);
520 if (!lkb)
521 return -ENOMEM;
522
523 lkb->lkb_nodeid = -1;
524 lkb->lkb_grmode = DLM_LOCK_IV;
525 kref_init(&lkb->lkb_ref);
526
527 get_random_bytes(&bucket, sizeof(bucket));
528 bucket &= (ls->ls_lkbtbl_size - 1);
529
530 write_lock(&ls->ls_lkbtbl[bucket].lock);
531
532 /* counter can roll over so we must verify lkid is not in use */
533
534 while (lkid == 0) {
535 lkid = bucket | (ls->ls_lkbtbl[bucket].counter++ << 16);
536
537 list_for_each_entry(tmp, &ls->ls_lkbtbl[bucket].list,
538 lkb_idtbl_list) {
539 if (tmp->lkb_id != lkid)
540 continue;
541 lkid = 0;
542 break;
543 }
544 }
545
546 lkb->lkb_id = lkid;
547 list_add(&lkb->lkb_idtbl_list, &ls->ls_lkbtbl[bucket].list);
548 write_unlock(&ls->ls_lkbtbl[bucket].lock);
549
550 *lkb_ret = lkb;
551 return 0;
552}
553
554static struct dlm_lkb *__find_lkb(struct dlm_ls *ls, uint32_t lkid)
555{
556 uint16_t bucket = lkid & 0xFFFF;
557 struct dlm_lkb *lkb;
558
559 list_for_each_entry(lkb, &ls->ls_lkbtbl[bucket].list, lkb_idtbl_list) {
560 if (lkb->lkb_id == lkid)
561 return lkb;
562 }
563 return NULL;
564}
565
566static int find_lkb(struct dlm_ls *ls, uint32_t lkid, struct dlm_lkb **lkb_ret)
567{
568 struct dlm_lkb *lkb;
569 uint16_t bucket = lkid & 0xFFFF;
570
571 if (bucket >= ls->ls_lkbtbl_size)
572 return -EBADSLT;
573
574 read_lock(&ls->ls_lkbtbl[bucket].lock);
575 lkb = __find_lkb(ls, lkid);
576 if (lkb)
577 kref_get(&lkb->lkb_ref);
578 read_unlock(&ls->ls_lkbtbl[bucket].lock);
579
580 *lkb_ret = lkb;
581 return lkb ? 0 : -ENOENT;
582}
583
584static void kill_lkb(struct kref *kref)
585{
586 struct dlm_lkb *lkb = container_of(kref, struct dlm_lkb, lkb_ref);
587
588 /* All work is done after the return from kref_put() so we
589 can release the write_lock before the detach_lkb */
590
591 DLM_ASSERT(!lkb->lkb_status, dlm_print_lkb(lkb););
592}
593
594static int put_lkb(struct dlm_lkb *lkb)
595{
596 struct dlm_ls *ls = lkb->lkb_resource->res_ls;
597 uint16_t bucket = lkb->lkb_id & 0xFFFF;
598
599 write_lock(&ls->ls_lkbtbl[bucket].lock);
600 if (kref_put(&lkb->lkb_ref, kill_lkb)) {
601 list_del(&lkb->lkb_idtbl_list);
602 write_unlock(&ls->ls_lkbtbl[bucket].lock);
603
604 detach_lkb(lkb);
605
606 /* for local/process lkbs, lvbptr points to caller's lksb */
607 if (lkb->lkb_lvbptr && is_master_copy(lkb))
608 free_lvb(lkb->lkb_lvbptr);
609 if (lkb->lkb_range)
610 free_range(lkb->lkb_range);
611 free_lkb(lkb);
612 return 1;
613 } else {
614 write_unlock(&ls->ls_lkbtbl[bucket].lock);
615 return 0;
616 }
617}
618
619int dlm_put_lkb(struct dlm_lkb *lkb)
620{
621 return put_lkb(lkb);
622}
623
624/* This is only called to add a reference when the code already holds
625 a valid reference to the lkb, so there's no need for locking. */
626
627static inline void hold_lkb(struct dlm_lkb *lkb)
628{
629 kref_get(&lkb->lkb_ref);
630}
631
632/* This is called when we need to remove a reference and are certain
633 it's not the last ref. e.g. del_lkb is always called between a
634 find_lkb/put_lkb and is always the inverse of a previous add_lkb.
635 put_lkb would work fine, but would involve unnecessary locking */
636
637static inline void unhold_lkb(struct dlm_lkb *lkb)
638{
639 int rv;
640 rv = kref_put(&lkb->lkb_ref, kill_lkb);
641 DLM_ASSERT(!rv, dlm_print_lkb(lkb););
642}
643
644static void lkb_add_ordered(struct list_head *new, struct list_head *head,
645 int mode)
646{
647 struct dlm_lkb *lkb = NULL;
648
649 list_for_each_entry(lkb, head, lkb_statequeue)
650 if (lkb->lkb_rqmode < mode)
651 break;
652
653 if (!lkb)
654 list_add_tail(new, head);
655 else
656 __list_add(new, lkb->lkb_statequeue.prev, &lkb->lkb_statequeue);
657}
658
659/* add/remove lkb to rsb's grant/convert/wait queue */
660
661static void add_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb, int status)
662{
663 kref_get(&lkb->lkb_ref);
664
665 DLM_ASSERT(!lkb->lkb_status, dlm_print_lkb(lkb););
666
667 lkb->lkb_status = status;
668
669 switch (status) {
670 case DLM_LKSTS_WAITING:
671 if (lkb->lkb_exflags & DLM_LKF_HEADQUE)
672 list_add(&lkb->lkb_statequeue, &r->res_waitqueue);
673 else
674 list_add_tail(&lkb->lkb_statequeue, &r->res_waitqueue);
675 break;
676 case DLM_LKSTS_GRANTED:
677 /* convention says granted locks kept in order of grmode */
678 lkb_add_ordered(&lkb->lkb_statequeue, &r->res_grantqueue,
679 lkb->lkb_grmode);
680 break;
681 case DLM_LKSTS_CONVERT:
682 if (lkb->lkb_exflags & DLM_LKF_HEADQUE)
683 list_add(&lkb->lkb_statequeue, &r->res_convertqueue);
684 else
685 list_add_tail(&lkb->lkb_statequeue,
686 &r->res_convertqueue);
687 break;
688 default:
689 DLM_ASSERT(0, dlm_print_lkb(lkb); printk("sts=%d\n", status););
690 }
691}
692
693static void del_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb)
694{
695 lkb->lkb_status = 0;
696 list_del(&lkb->lkb_statequeue);
697 unhold_lkb(lkb);
698}
699
700static void move_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb, int sts)
701{
702 hold_lkb(lkb);
703 del_lkb(r, lkb);
704 add_lkb(r, lkb, sts);
705 unhold_lkb(lkb);
706}
707
708/* add/remove lkb from global waiters list of lkb's waiting for
709 a reply from a remote node */
710
711static void add_to_waiters(struct dlm_lkb *lkb, int mstype)
712{
713 struct dlm_ls *ls = lkb->lkb_resource->res_ls;
714
715 down(&ls->ls_waiters_sem);
716 if (lkb->lkb_wait_type) {
717 log_print("add_to_waiters error %d", lkb->lkb_wait_type);
718 goto out;
719 }
720 lkb->lkb_wait_type = mstype;
721 kref_get(&lkb->lkb_ref);
722 list_add(&lkb->lkb_wait_reply, &ls->ls_waiters);
723 out:
724 up(&ls->ls_waiters_sem);
725}
726
727static int _remove_from_waiters(struct dlm_lkb *lkb)
728{
729 int error = 0;
730
731 if (!lkb->lkb_wait_type) {
732 log_print("remove_from_waiters error");
733 error = -EINVAL;
734 goto out;
735 }
736 lkb->lkb_wait_type = 0;
737 list_del(&lkb->lkb_wait_reply);
738 unhold_lkb(lkb);
739 out:
740 return error;
741}
742
743static int remove_from_waiters(struct dlm_lkb *lkb)
744{
745 struct dlm_ls *ls = lkb->lkb_resource->res_ls;
746 int error;
747
748 down(&ls->ls_waiters_sem);
749 error = _remove_from_waiters(lkb);
750 up(&ls->ls_waiters_sem);
751 return error;
752}
753
754static void dir_remove(struct dlm_rsb *r)
755{
756 int to_nodeid;
757
758 if (dlm_no_directory(r->res_ls))
759 return;
760
761 to_nodeid = dlm_dir_nodeid(r);
762 if (to_nodeid != dlm_our_nodeid())
763 send_remove(r);
764 else
765 dlm_dir_remove_entry(r->res_ls, to_nodeid,
766 r->res_name, r->res_length);
767}
768
769/* FIXME: shouldn't this be able to exit as soon as one non-due rsb is
770 found since they are in order of newest to oldest? */
771
772static int shrink_bucket(struct dlm_ls *ls, int b)
773{
774 struct dlm_rsb *r;
775 int count = 0, found;
776
777 for (;;) {
778 found = FALSE;
779 write_lock(&ls->ls_rsbtbl[b].lock);
780 list_for_each_entry_reverse(r, &ls->ls_rsbtbl[b].toss,
781 res_hashchain) {
782 if (!time_after_eq(jiffies, r->res_toss_time +
783 dlm_config.toss_secs * HZ))
784 continue;
785 found = TRUE;
786 break;
787 }
788
789 if (!found) {
790 write_unlock(&ls->ls_rsbtbl[b].lock);
791 break;
792 }
793
794 if (kref_put(&r->res_ref, kill_rsb)) {
795 list_del(&r->res_hashchain);
796 write_unlock(&ls->ls_rsbtbl[b].lock);
797
798 if (is_master(r))
799 dir_remove(r);
800 free_rsb(r);
801 count++;
802 } else {
803 write_unlock(&ls->ls_rsbtbl[b].lock);
804 log_error(ls, "tossed rsb in use %s", r->res_name);
805 }
806 }
807
808 return count;
809}
810
811void dlm_scan_rsbs(struct dlm_ls *ls)
812{
813 int i;
814
815 if (dlm_locking_stopped(ls))
816 return;
817
818 for (i = 0; i < ls->ls_rsbtbl_size; i++) {
819 shrink_bucket(ls, i);
820 cond_resched();
821 }
822}
823
824/* lkb is master or local copy */
825
826static void set_lvb_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
827{
828 int b, len = r->res_ls->ls_lvblen;
829
830 /* b=1 lvb returned to caller
831 b=0 lvb written to rsb or invalidated
832 b=-1 do nothing */
833
834 b = dlm_lvb_operations[lkb->lkb_grmode + 1][lkb->lkb_rqmode + 1];
835
836 if (b == 1) {
837 if (!lkb->lkb_lvbptr)
838 return;
839
840 if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
841 return;
842
843 if (!r->res_lvbptr)
844 return;
845
846 memcpy(lkb->lkb_lvbptr, r->res_lvbptr, len);
847 lkb->lkb_lvbseq = r->res_lvbseq;
848
849 } else if (b == 0) {
850 if (lkb->lkb_exflags & DLM_LKF_IVVALBLK) {
851 rsb_set_flag(r, RSB_VALNOTVALID);
852 return;
853 }
854
855 if (!lkb->lkb_lvbptr)
856 return;
857
858 if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
859 return;
860
861 if (!r->res_lvbptr)
862 r->res_lvbptr = allocate_lvb(r->res_ls);
863
864 if (!r->res_lvbptr)
865 return;
866
867 memcpy(r->res_lvbptr, lkb->lkb_lvbptr, len);
868 r->res_lvbseq++;
869 lkb->lkb_lvbseq = r->res_lvbseq;
870 rsb_clear_flag(r, RSB_VALNOTVALID);
871 }
872
873 if (rsb_flag(r, RSB_VALNOTVALID))
874 lkb->lkb_sbflags |= DLM_SBF_VALNOTVALID;
875}
876
877static void set_lvb_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb)
878{
879 if (lkb->lkb_grmode < DLM_LOCK_PW)
880 return;
881
882 if (lkb->lkb_exflags & DLM_LKF_IVVALBLK) {
883 rsb_set_flag(r, RSB_VALNOTVALID);
884 return;
885 }
886
887 if (!lkb->lkb_lvbptr)
888 return;
889
890 if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
891 return;
892
893 if (!r->res_lvbptr)
894 r->res_lvbptr = allocate_lvb(r->res_ls);
895
896 if (!r->res_lvbptr)
897 return;
898
899 memcpy(r->res_lvbptr, lkb->lkb_lvbptr, r->res_ls->ls_lvblen);
900 r->res_lvbseq++;
901 rsb_clear_flag(r, RSB_VALNOTVALID);
902}
903
904/* lkb is process copy (pc) */
905
906static void set_lvb_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb,
907 struct dlm_message *ms)
908{
909 int b;
910
911 if (!lkb->lkb_lvbptr)
912 return;
913
914 if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
915 return;
916
917 b = dlm_lvb_operations[lkb->lkb_grmode + 1][lkb->lkb_rqmode + 1];
918 if (b == 1) {
919 int len = receive_extralen(ms);
920 memcpy(lkb->lkb_lvbptr, ms->m_extra, len);
921 lkb->lkb_lvbseq = ms->m_lvbseq;
922 }
923}
924
925/* Manipulate lkb's on rsb's convert/granted/waiting queues
926 remove_lock -- used for unlock, removes lkb from granted
927 revert_lock -- used for cancel, moves lkb from convert to granted
928 grant_lock -- used for request and convert, adds lkb to granted or
929 moves lkb from convert or waiting to granted
930
931 Each of these is used for master or local copy lkb's. There is
932 also a _pc() variation used to make the corresponding change on
933 a process copy (pc) lkb. */
934
935static void _remove_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
936{
937 del_lkb(r, lkb);
938 lkb->lkb_grmode = DLM_LOCK_IV;
939 /* this unhold undoes the original ref from create_lkb()
940 so this leads to the lkb being freed */
941 unhold_lkb(lkb);
942}
943
944static void remove_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
945{
946 set_lvb_unlock(r, lkb);
947 _remove_lock(r, lkb);
948}
949
950static void remove_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb)
951{
952 _remove_lock(r, lkb);
953}
954
955static void revert_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
956{
957 lkb->lkb_rqmode = DLM_LOCK_IV;
958
959 switch (lkb->lkb_status) {
960 case DLM_LKSTS_CONVERT:
961 move_lkb(r, lkb, DLM_LKSTS_GRANTED);
962 break;
963 case DLM_LKSTS_WAITING:
964 del_lkb(r, lkb);
965 lkb->lkb_grmode = DLM_LOCK_IV;
966 /* this unhold undoes the original ref from create_lkb()
967 so this leads to the lkb being freed */
968 unhold_lkb(lkb);
969 break;
970 default:
971 log_print("invalid status for revert %d", lkb->lkb_status);
972 }
973}
974
975static void revert_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb)
976{
977 revert_lock(r, lkb);
978}
979
980static void _grant_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
981{
982 if (lkb->lkb_grmode != lkb->lkb_rqmode) {
983 lkb->lkb_grmode = lkb->lkb_rqmode;
984 if (lkb->lkb_status)
985 move_lkb(r, lkb, DLM_LKSTS_GRANTED);
986 else
987 add_lkb(r, lkb, DLM_LKSTS_GRANTED);
988 }
989
990 lkb->lkb_rqmode = DLM_LOCK_IV;
991
992 if (lkb->lkb_range) {
993 lkb->lkb_range[GR_RANGE_START] = lkb->lkb_range[RQ_RANGE_START];
994 lkb->lkb_range[GR_RANGE_END] = lkb->lkb_range[RQ_RANGE_END];
995 }
996}
997
998static void grant_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
999{
1000 set_lvb_lock(r, lkb);
1001 _grant_lock(r, lkb);
1002 lkb->lkb_highbast = 0;
1003}
1004
1005static void grant_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb,
1006 struct dlm_message *ms)
1007{
1008 set_lvb_lock_pc(r, lkb, ms);
1009 _grant_lock(r, lkb);
1010}
1011
1012/* called by grant_pending_locks() which means an async grant message must
1013 be sent to the requesting node in addition to granting the lock if the
1014 lkb belongs to a remote node. */
1015
1016static void grant_lock_pending(struct dlm_rsb *r, struct dlm_lkb *lkb)
1017{
1018 grant_lock(r, lkb);
1019 if (is_master_copy(lkb))
1020 send_grant(r, lkb);
1021 else
1022 queue_cast(r, lkb, 0);
1023}
1024
1025static inline int first_in_list(struct dlm_lkb *lkb, struct list_head *head)
1026{
1027 struct dlm_lkb *first = list_entry(head->next, struct dlm_lkb,
1028 lkb_statequeue);
1029 if (lkb->lkb_id == first->lkb_id)
1030 return TRUE;
1031
1032 return FALSE;
1033}
1034
1035/* Return 1 if the locks' ranges overlap. If the lkb has no range then it is
1036 assumed to cover 0-ffffffff.ffffffff */
1037
1038static inline int ranges_overlap(struct dlm_lkb *lkb1, struct dlm_lkb *lkb2)
1039{
1040 if (!lkb1->lkb_range || !lkb2->lkb_range)
1041 return TRUE;
1042
1043 if (lkb1->lkb_range[RQ_RANGE_END] < lkb2->lkb_range[GR_RANGE_START] ||
1044 lkb1->lkb_range[RQ_RANGE_START] > lkb2->lkb_range[GR_RANGE_END])
1045 return FALSE;
1046
1047 return TRUE;
1048}
1049
1050/* Check if the given lkb conflicts with another lkb on the queue. */
1051
1052static int queue_conflict(struct list_head *head, struct dlm_lkb *lkb)
1053{
1054 struct dlm_lkb *this;
1055
1056 list_for_each_entry(this, head, lkb_statequeue) {
1057 if (this == lkb)
1058 continue;
1059 if (ranges_overlap(lkb, this) && !modes_compat(this, lkb))
1060 return TRUE;
1061 }
1062 return FALSE;
1063}
1064
1065/*
1066 * "A conversion deadlock arises with a pair of lock requests in the converting
1067 * queue for one resource. The granted mode of each lock blocks the requested
1068 * mode of the other lock."
1069 *
1070 * Part 2: if the granted mode of lkb is preventing the first lkb in the
1071 * convert queue from being granted, then demote lkb (set grmode to NL).
1072 * This second form requires that we check for conv-deadlk even when
1073 * now == 0 in _can_be_granted().
1074 *
1075 * Example:
1076 * Granted Queue: empty
1077 * Convert Queue: NL->EX (first lock)
1078 * PR->EX (second lock)
1079 *
1080 * The first lock can't be granted because of the granted mode of the second
1081 * lock and the second lock can't be granted because it's not first in the
1082 * list. We demote the granted mode of the second lock (the lkb passed to this
1083 * function).
1084 *
1085 * After the resolution, the "grant pending" function needs to go back and try
1086 * to grant locks on the convert queue again since the first lock can now be
1087 * granted.
1088 */
1089
1090static int conversion_deadlock_detect(struct dlm_rsb *rsb, struct dlm_lkb *lkb)
1091{
1092 struct dlm_lkb *this, *first = NULL, *self = NULL;
1093
1094 list_for_each_entry(this, &rsb->res_convertqueue, lkb_statequeue) {
1095 if (!first)
1096 first = this;
1097 if (this == lkb) {
1098 self = lkb;
1099 continue;
1100 }
1101
1102 if (!ranges_overlap(lkb, this))
1103 continue;
1104
1105 if (!modes_compat(this, lkb) && !modes_compat(lkb, this))
1106 return TRUE;
1107 }
1108
1109 /* if lkb is on the convert queue and is preventing the first
1110 from being granted, then there's deadlock and we demote lkb.
1111 multiple converting locks may need to do this before the first
1112 converting lock can be granted. */
1113
1114 if (self && self != first) {
1115 if (!modes_compat(lkb, first) &&
1116 !queue_conflict(&rsb->res_grantqueue, first))
1117 return TRUE;
1118 }
1119
1120 return FALSE;
1121}
1122
1123/*
1124 * Return 1 if the lock can be granted, 0 otherwise.
1125 * Also detect and resolve conversion deadlocks.
1126 *
1127 * lkb is the lock to be granted
1128 *
1129 * now is 1 if the function is being called in the context of the
1130 * immediate request, it is 0 if called later, after the lock has been
1131 * queued.
1132 *
1133 * References are from chapter 6 of "VAXcluster Principles" by Roy Davis
1134 */
1135
1136static int _can_be_granted(struct dlm_rsb *r, struct dlm_lkb *lkb, int now)
1137{
1138 int8_t conv = (lkb->lkb_grmode != DLM_LOCK_IV);
1139
1140 /*
1141 * 6-10: Version 5.4 introduced an option to address the phenomenon of
1142 * a new request for a NL mode lock being blocked.
1143 *
1144 * 6-11: If the optional EXPEDITE flag is used with the new NL mode
1145 * request, then it would be granted. In essence, the use of this flag
1146 * tells the Lock Manager to expedite theis request by not considering
1147 * what may be in the CONVERTING or WAITING queues... As of this
1148 * writing, the EXPEDITE flag can be used only with new requests for NL
1149 * mode locks. This flag is not valid for conversion requests.
1150 *
1151 * A shortcut. Earlier checks return an error if EXPEDITE is used in a
1152 * conversion or used with a non-NL requested mode. We also know an
1153 * EXPEDITE request is always granted immediately, so now must always
1154 * be 1. The full condition to grant an expedite request: (now &&
1155 * !conv && lkb->rqmode == DLM_LOCK_NL && (flags & EXPEDITE)) can
1156 * therefore be shortened to just checking the flag.
1157 */
1158
1159 if (lkb->lkb_exflags & DLM_LKF_EXPEDITE)
1160 return TRUE;
1161
1162 /*
1163 * A shortcut. Without this, !queue_conflict(grantqueue, lkb) would be
1164 * added to the remaining conditions.
1165 */
1166
1167 if (queue_conflict(&r->res_grantqueue, lkb))
1168 goto out;
1169
1170 /*
1171 * 6-3: By default, a conversion request is immediately granted if the
1172 * requested mode is compatible with the modes of all other granted
1173 * locks
1174 */
1175
1176 if (queue_conflict(&r->res_convertqueue, lkb))
1177 goto out;
1178
1179 /*
1180 * 6-5: But the default algorithm for deciding whether to grant or
1181 * queue conversion requests does not by itself guarantee that such
1182 * requests are serviced on a "first come first serve" basis. This, in
1183 * turn, can lead to a phenomenon known as "indefinate postponement".
1184 *
1185 * 6-7: This issue is dealt with by using the optional QUECVT flag with
1186 * the system service employed to request a lock conversion. This flag
1187 * forces certain conversion requests to be queued, even if they are
1188 * compatible with the granted modes of other locks on the same
1189 * resource. Thus, the use of this flag results in conversion requests
1190 * being ordered on a "first come first servce" basis.
1191 *
1192 * DCT: This condition is all about new conversions being able to occur
1193 * "in place" while the lock remains on the granted queue (assuming
1194 * nothing else conflicts.) IOW if QUECVT isn't set, a conversion
1195 * doesn't _have_ to go onto the convert queue where it's processed in
1196 * order. The "now" variable is necessary to distinguish converts
1197 * being received and processed for the first time now, because once a
1198 * convert is moved to the conversion queue the condition below applies
1199 * requiring fifo granting.
1200 */
1201
1202 if (now && conv && !(lkb->lkb_exflags & DLM_LKF_QUECVT))
1203 return TRUE;
1204
1205 /*
1206 * When using range locks the NOORDER flag is set to avoid the standard
1207 * vms rules on grant order.
1208 */
1209
1210 if (lkb->lkb_exflags & DLM_LKF_NOORDER)
1211 return TRUE;
1212
1213 /*
1214 * 6-3: Once in that queue [CONVERTING], a conversion request cannot be
1215 * granted until all other conversion requests ahead of it are granted
1216 * and/or canceled.
1217 */
1218
1219 if (!now && conv && first_in_list(lkb, &r->res_convertqueue))
1220 return TRUE;
1221
1222 /*
1223 * 6-4: By default, a new request is immediately granted only if all
1224 * three of the following conditions are satisfied when the request is
1225 * issued:
1226 * - The queue of ungranted conversion requests for the resource is
1227 * empty.
1228 * - The queue of ungranted new requests for the resource is empty.
1229 * - The mode of the new request is compatible with the most
1230 * restrictive mode of all granted locks on the resource.
1231 */
1232
1233 if (now && !conv && list_empty(&r->res_convertqueue) &&
1234 list_empty(&r->res_waitqueue))
1235 return TRUE;
1236
1237 /*
1238 * 6-4: Once a lock request is in the queue of ungranted new requests,
1239 * it cannot be granted until the queue of ungranted conversion
1240 * requests is empty, all ungranted new requests ahead of it are
1241 * granted and/or canceled, and it is compatible with the granted mode
1242 * of the most restrictive lock granted on the resource.
1243 */
1244
1245 if (!now && !conv && list_empty(&r->res_convertqueue) &&
1246 first_in_list(lkb, &r->res_waitqueue))
1247 return TRUE;
1248
1249 out:
1250 /*
1251 * The following, enabled by CONVDEADLK, departs from VMS.
1252 */
1253
1254 if (conv && (lkb->lkb_exflags & DLM_LKF_CONVDEADLK) &&
1255 conversion_deadlock_detect(r, lkb)) {
1256 lkb->lkb_grmode = DLM_LOCK_NL;
1257 lkb->lkb_sbflags |= DLM_SBF_DEMOTED;
1258 }
1259
1260 return FALSE;
1261}
1262
1263/*
1264 * The ALTPR and ALTCW flags aren't traditional lock manager flags, but are a
1265 * simple way to provide a big optimization to applications that can use them.
1266 */
1267
1268static int can_be_granted(struct dlm_rsb *r, struct dlm_lkb *lkb, int now)
1269{
1270 uint32_t flags = lkb->lkb_exflags;
1271 int rv;
1272 int8_t alt = 0, rqmode = lkb->lkb_rqmode;
1273
1274 rv = _can_be_granted(r, lkb, now);
1275 if (rv)
1276 goto out;
1277
1278 if (lkb->lkb_sbflags & DLM_SBF_DEMOTED)
1279 goto out;
1280
1281 if (rqmode != DLM_LOCK_PR && flags & DLM_LKF_ALTPR)
1282 alt = DLM_LOCK_PR;
1283 else if (rqmode != DLM_LOCK_CW && flags & DLM_LKF_ALTCW)
1284 alt = DLM_LOCK_CW;
1285
1286 if (alt) {
1287 lkb->lkb_rqmode = alt;
1288 rv = _can_be_granted(r, lkb, now);
1289 if (rv)
1290 lkb->lkb_sbflags |= DLM_SBF_ALTMODE;
1291 else
1292 lkb->lkb_rqmode = rqmode;
1293 }
1294 out:
1295 return rv;
1296}
1297
1298static int grant_pending_convert(struct dlm_rsb *r, int high)
1299{
1300 struct dlm_lkb *lkb, *s;
1301 int hi, demoted, quit, grant_restart, demote_restart;
1302
1303 quit = 0;
1304 restart:
1305 grant_restart = 0;
1306 demote_restart = 0;
1307 hi = DLM_LOCK_IV;
1308
1309 list_for_each_entry_safe(lkb, s, &r->res_convertqueue, lkb_statequeue) {
1310 demoted = is_demoted(lkb);
1311 if (can_be_granted(r, lkb, FALSE)) {
1312 grant_lock_pending(r, lkb);
1313 grant_restart = 1;
1314 } else {
1315 hi = max_t(int, lkb->lkb_rqmode, hi);
1316 if (!demoted && is_demoted(lkb))
1317 demote_restart = 1;
1318 }
1319 }
1320
1321 if (grant_restart)
1322 goto restart;
1323 if (demote_restart && !quit) {
1324 quit = 1;
1325 goto restart;
1326 }
1327
1328 return max_t(int, high, hi);
1329}
1330
1331static int grant_pending_wait(struct dlm_rsb *r, int high)
1332{
1333 struct dlm_lkb *lkb, *s;
1334
1335 list_for_each_entry_safe(lkb, s, &r->res_waitqueue, lkb_statequeue) {
1336 if (can_be_granted(r, lkb, FALSE))
1337 grant_lock_pending(r, lkb);
1338 else
1339 high = max_t(int, lkb->lkb_rqmode, high);
1340 }
1341
1342 return high;
1343}
1344
1345static void grant_pending_locks(struct dlm_rsb *r)
1346{
1347 struct dlm_lkb *lkb, *s;
1348 int high = DLM_LOCK_IV;
1349
1350 DLM_ASSERT(is_master(r), dlm_print_rsb(r););
1351
1352 high = grant_pending_convert(r, high);
1353 high = grant_pending_wait(r, high);
1354
1355 if (high == DLM_LOCK_IV)
1356 return;
1357
1358 /*
1359 * If there are locks left on the wait/convert queue then send blocking
1360 * ASTs to granted locks based on the largest requested mode (high)
1361 * found above. This can generate spurious blocking ASTs for range
1362 * locks. FIXME: highbast < high comparison not valid for PR/CW.
1363 */
1364
1365 list_for_each_entry_safe(lkb, s, &r->res_grantqueue, lkb_statequeue) {
1366 if (lkb->lkb_bastaddr && (lkb->lkb_highbast < high) &&
1367 !__dlm_compat_matrix[lkb->lkb_grmode+1][high+1]) {
1368 queue_bast(r, lkb, high);
1369 lkb->lkb_highbast = high;
1370 }
1371 }
1372}
1373
1374static void send_bast_queue(struct dlm_rsb *r, struct list_head *head,
1375 struct dlm_lkb *lkb)
1376{
1377 struct dlm_lkb *gr;
1378
1379 list_for_each_entry(gr, head, lkb_statequeue) {
1380 if (gr->lkb_bastaddr &&
1381 gr->lkb_highbast < lkb->lkb_rqmode &&
1382 ranges_overlap(lkb, gr) && !modes_compat(gr, lkb)) {
1383 queue_bast(r, gr, lkb->lkb_rqmode);
1384 gr->lkb_highbast = lkb->lkb_rqmode;
1385 }
1386 }
1387}
1388
1389static void send_blocking_asts(struct dlm_rsb *r, struct dlm_lkb *lkb)
1390{
1391 send_bast_queue(r, &r->res_grantqueue, lkb);
1392}
1393
1394static void send_blocking_asts_all(struct dlm_rsb *r, struct dlm_lkb *lkb)
1395{
1396 send_bast_queue(r, &r->res_grantqueue, lkb);
1397 send_bast_queue(r, &r->res_convertqueue, lkb);
1398}
1399
1400/* set_master(r, lkb) -- set the master nodeid of a resource
1401
1402 The purpose of this function is to set the nodeid field in the given
1403 lkb using the nodeid field in the given rsb. If the rsb's nodeid is
1404 known, it can just be copied to the lkb and the function will return
1405 0. If the rsb's nodeid is _not_ known, it needs to be looked up
1406 before it can be copied to the lkb.
1407
1408 When the rsb nodeid is being looked up remotely, the initial lkb
1409 causing the lookup is kept on the ls_waiters list waiting for the
1410 lookup reply. Other lkb's waiting for the same rsb lookup are kept
1411 on the rsb's res_lookup list until the master is verified.
1412
1413 Return values:
1414 0: nodeid is set in rsb/lkb and the caller should go ahead and use it
1415 1: the rsb master is not available and the lkb has been placed on
1416 a wait queue
1417*/
1418
1419static int set_master(struct dlm_rsb *r, struct dlm_lkb *lkb)
1420{
1421 struct dlm_ls *ls = r->res_ls;
1422 int error, dir_nodeid, ret_nodeid, our_nodeid = dlm_our_nodeid();
1423
1424 if (rsb_flag(r, RSB_MASTER_UNCERTAIN)) {
1425 rsb_clear_flag(r, RSB_MASTER_UNCERTAIN);
1426 r->res_first_lkid = lkb->lkb_id;
1427 lkb->lkb_nodeid = r->res_nodeid;
1428 return 0;
1429 }
1430
1431 if (r->res_first_lkid && r->res_first_lkid != lkb->lkb_id) {
1432 list_add_tail(&lkb->lkb_rsb_lookup, &r->res_lookup);
1433 return 1;
1434 }
1435
1436 if (r->res_nodeid == 0) {
1437 lkb->lkb_nodeid = 0;
1438 return 0;
1439 }
1440
1441 if (r->res_nodeid > 0) {
1442 lkb->lkb_nodeid = r->res_nodeid;
1443 return 0;
1444 }
1445
1446 DLM_ASSERT(r->res_nodeid == -1, dlm_print_rsb(r););
1447
1448 dir_nodeid = dlm_dir_nodeid(r);
1449
1450 if (dir_nodeid != our_nodeid) {
1451 r->res_first_lkid = lkb->lkb_id;
1452 send_lookup(r, lkb);
1453 return 1;
1454 }
1455
1456 for (;;) {
1457 /* It's possible for dlm_scand to remove an old rsb for
1458 this same resource from the toss list, us to create
1459 a new one, look up the master locally, and find it
1460 already exists just before dlm_scand does the
1461 dir_remove() on the previous rsb. */
1462
1463 error = dlm_dir_lookup(ls, our_nodeid, r->res_name,
1464 r->res_length, &ret_nodeid);
1465 if (!error)
1466 break;
1467 log_debug(ls, "dir_lookup error %d %s", error, r->res_name);
1468 schedule();
1469 }
1470
1471 if (ret_nodeid == our_nodeid) {
1472 r->res_first_lkid = 0;
1473 r->res_nodeid = 0;
1474 lkb->lkb_nodeid = 0;
1475 } else {
1476 r->res_first_lkid = lkb->lkb_id;
1477 r->res_nodeid = ret_nodeid;
1478 lkb->lkb_nodeid = ret_nodeid;
1479 }
1480 return 0;
1481}
1482
1483static void process_lookup_list(struct dlm_rsb *r)
1484{
1485 struct dlm_lkb *lkb, *safe;
1486
1487 list_for_each_entry_safe(lkb, safe, &r->res_lookup, lkb_rsb_lookup) {
1488 list_del(&lkb->lkb_rsb_lookup);
1489 _request_lock(r, lkb);
1490 schedule();
1491 }
1492}
1493
1494/* confirm_master -- confirm (or deny) an rsb's master nodeid */
1495
1496static void confirm_master(struct dlm_rsb *r, int error)
1497{
1498 struct dlm_lkb *lkb;
1499
1500 if (!r->res_first_lkid)
1501 return;
1502
1503 switch (error) {
1504 case 0:
1505 case -EINPROGRESS:
1506 r->res_first_lkid = 0;
1507 process_lookup_list(r);
1508 break;
1509
1510 case -EAGAIN:
1511 /* the remote master didn't queue our NOQUEUE request;
1512 make a waiting lkb the first_lkid */
1513
1514 r->res_first_lkid = 0;
1515
1516 if (!list_empty(&r->res_lookup)) {
1517 lkb = list_entry(r->res_lookup.next, struct dlm_lkb,
1518 lkb_rsb_lookup);
1519 list_del(&lkb->lkb_rsb_lookup);
1520 r->res_first_lkid = lkb->lkb_id;
1521 _request_lock(r, lkb);
1522 } else
1523 r->res_nodeid = -1;
1524 break;
1525
1526 default:
1527 log_error(r->res_ls, "confirm_master unknown error %d", error);
1528 }
1529}
1530
1531static int set_lock_args(int mode, struct dlm_lksb *lksb, uint32_t flags,
1532 int namelen, uint32_t parent_lkid, void *ast,
1533 void *astarg, void *bast, struct dlm_range *range,
1534 struct dlm_args *args)
1535{
1536 int rv = -EINVAL;
1537
1538 /* check for invalid arg usage */
1539
1540 if (mode < 0 || mode > DLM_LOCK_EX)
1541 goto out;
1542
1543 if (!(flags & DLM_LKF_CONVERT) && (namelen > DLM_RESNAME_MAXLEN))
1544 goto out;
1545
1546 if (flags & DLM_LKF_CANCEL)
1547 goto out;
1548
1549 if (flags & DLM_LKF_QUECVT && !(flags & DLM_LKF_CONVERT))
1550 goto out;
1551
1552 if (flags & DLM_LKF_CONVDEADLK && !(flags & DLM_LKF_CONVERT))
1553 goto out;
1554
1555 if (flags & DLM_LKF_CONVDEADLK && flags & DLM_LKF_NOQUEUE)
1556 goto out;
1557
1558 if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_CONVERT)
1559 goto out;
1560
1561 if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_QUECVT)
1562 goto out;
1563
1564 if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_NOQUEUE)
1565 goto out;
1566
1567 if (flags & DLM_LKF_EXPEDITE && mode != DLM_LOCK_NL)
1568 goto out;
1569
1570 if (!ast || !lksb)
1571 goto out;
1572
1573 if (flags & DLM_LKF_VALBLK && !lksb->sb_lvbptr)
1574 goto out;
1575
1576 /* parent/child locks not yet supported */
1577 if (parent_lkid)
1578 goto out;
1579
1580 if (flags & DLM_LKF_CONVERT && !lksb->sb_lkid)
1581 goto out;
1582
1583 /* these args will be copied to the lkb in validate_lock_args,
1584 it cannot be done now because when converting locks, fields in
1585 an active lkb cannot be modified before locking the rsb */
1586
1587 args->flags = flags;
1588 args->astaddr = ast;
1589 args->astparam = (long) astarg;
1590 args->bastaddr = bast;
1591 args->mode = mode;
1592 args->lksb = lksb;
1593 args->range = range;
1594 rv = 0;
1595 out:
1596 return rv;
1597}
1598
1599static int set_unlock_args(uint32_t flags, void *astarg, struct dlm_args *args)
1600{
1601 if (flags & ~(DLM_LKF_CANCEL | DLM_LKF_VALBLK | DLM_LKF_IVVALBLK |
1602 DLM_LKF_FORCEUNLOCK))
1603 return -EINVAL;
1604
1605 args->flags = flags;
1606 args->astparam = (long) astarg;
1607 return 0;
1608}
1609
1610static int validate_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
1611 struct dlm_args *args)
1612{
1613 int rv = -EINVAL;
1614
1615 if (args->flags & DLM_LKF_CONVERT) {
1616 if (lkb->lkb_flags & DLM_IFL_MSTCPY)
1617 goto out;
1618
1619 if (args->flags & DLM_LKF_QUECVT &&
1620 !__quecvt_compat_matrix[lkb->lkb_grmode+1][args->mode+1])
1621 goto out;
1622
1623 rv = -EBUSY;
1624 if (lkb->lkb_status != DLM_LKSTS_GRANTED)
1625 goto out;
1626
1627 if (lkb->lkb_wait_type)
1628 goto out;
1629 }
1630
1631 lkb->lkb_exflags = args->flags;
1632 lkb->lkb_sbflags = 0;
1633 lkb->lkb_astaddr = args->astaddr;
1634 lkb->lkb_astparam = args->astparam;
1635 lkb->lkb_bastaddr = args->bastaddr;
1636 lkb->lkb_rqmode = args->mode;
1637 lkb->lkb_lksb = args->lksb;
1638 lkb->lkb_lvbptr = args->lksb->sb_lvbptr;
1639 lkb->lkb_ownpid = (int) current->pid;
1640
1641 rv = 0;
1642 if (!args->range)
1643 goto out;
1644
1645 if (!lkb->lkb_range) {
1646 rv = -ENOMEM;
1647 lkb->lkb_range = allocate_range(ls);
1648 if (!lkb->lkb_range)
1649 goto out;
1650 /* This is needed for conversions that contain ranges
1651 where the original lock didn't but it's harmless for
1652 new locks too. */
1653 lkb->lkb_range[GR_RANGE_START] = 0LL;
1654 lkb->lkb_range[GR_RANGE_END] = 0xffffffffffffffffULL;
1655 }
1656
1657 lkb->lkb_range[RQ_RANGE_START] = args->range->ra_start;
1658 lkb->lkb_range[RQ_RANGE_END] = args->range->ra_end;
1659 lkb->lkb_flags |= DLM_IFL_RANGE;
1660 rv = 0;
1661 out:
1662 return rv;
1663}
1664
1665static int validate_unlock_args(struct dlm_lkb *lkb, struct dlm_args *args)
1666{
1667 int rv = -EINVAL;
1668
1669 if (lkb->lkb_flags & DLM_IFL_MSTCPY)
1670 goto out;
1671
1672 if (args->flags & DLM_LKF_FORCEUNLOCK)
1673 goto out_ok;
1674
1675 if (args->flags & DLM_LKF_CANCEL &&
1676 lkb->lkb_status == DLM_LKSTS_GRANTED)
1677 goto out;
1678
1679 if (!(args->flags & DLM_LKF_CANCEL) &&
1680 lkb->lkb_status != DLM_LKSTS_GRANTED)
1681 goto out;
1682
1683 rv = -EBUSY;
1684 if (lkb->lkb_wait_type)
1685 goto out;
1686
1687 out_ok:
1688 lkb->lkb_exflags = args->flags;
1689 lkb->lkb_sbflags = 0;
1690 lkb->lkb_astparam = args->astparam;
1691
1692 rv = 0;
1693 out:
1694 return rv;
1695}
1696
1697/*
1698 * Four stage 4 varieties:
1699 * do_request(), do_convert(), do_unlock(), do_cancel()
1700 * These are called on the master node for the given lock and
1701 * from the central locking logic.
1702 */
1703
1704static int do_request(struct dlm_rsb *r, struct dlm_lkb *lkb)
1705{
1706 int error = 0;
1707
1708 if (can_be_granted(r, lkb, TRUE)) {
1709 grant_lock(r, lkb);
1710 queue_cast(r, lkb, 0);
1711 goto out;
1712 }
1713
1714 if (can_be_queued(lkb)) {
1715 error = -EINPROGRESS;
1716 add_lkb(r, lkb, DLM_LKSTS_WAITING);
1717 send_blocking_asts(r, lkb);
1718 goto out;
1719 }
1720
1721 error = -EAGAIN;
1722 if (force_blocking_asts(lkb))
1723 send_blocking_asts_all(r, lkb);
1724 queue_cast(r, lkb, -EAGAIN);
1725
1726 out:
1727 return error;
1728}
1729
1730static int do_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
1731{
1732 int error = 0;
1733
1734 /* changing an existing lock may allow others to be granted */
1735
1736 if (can_be_granted(r, lkb, TRUE)) {
1737 grant_lock(r, lkb);
1738 queue_cast(r, lkb, 0);
1739 grant_pending_locks(r);
1740 goto out;
1741 }
1742
1743 if (can_be_queued(lkb)) {
1744 if (is_demoted(lkb))
1745 grant_pending_locks(r);
1746 error = -EINPROGRESS;
1747 del_lkb(r, lkb);
1748 add_lkb(r, lkb, DLM_LKSTS_CONVERT);
1749 send_blocking_asts(r, lkb);
1750 goto out;
1751 }
1752
1753 error = -EAGAIN;
1754 if (force_blocking_asts(lkb))
1755 send_blocking_asts_all(r, lkb);
1756 queue_cast(r, lkb, -EAGAIN);
1757
1758 out:
1759 return error;
1760}
1761
1762static int do_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb)
1763{
1764 remove_lock(r, lkb);
1765 queue_cast(r, lkb, -DLM_EUNLOCK);
1766 grant_pending_locks(r);
1767 return -DLM_EUNLOCK;
1768}
1769
1770static int do_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb)
1771{
1772 revert_lock(r, lkb);
1773 queue_cast(r, lkb, -DLM_ECANCEL);
1774 grant_pending_locks(r);
1775 return -DLM_ECANCEL;
1776}
1777
1778/*
1779 * Four stage 3 varieties:
1780 * _request_lock(), _convert_lock(), _unlock_lock(), _cancel_lock()
1781 */
1782
1783/* add a new lkb to a possibly new rsb, called by requesting process */
1784
1785static int _request_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
1786{
1787 int error;
1788
1789 /* set_master: sets lkb nodeid from r */
1790
1791 error = set_master(r, lkb);
1792 if (error < 0)
1793 goto out;
1794 if (error) {
1795 error = 0;
1796 goto out;
1797 }
1798
1799 if (is_remote(r))
1800 /* receive_request() calls do_request() on remote node */
1801 error = send_request(r, lkb);
1802 else
1803 error = do_request(r, lkb);
1804 out:
1805 return error;
1806}
1807
1808/* change some property of an existing lkb, e.g. mode, range */
1809
1810static int _convert_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
1811{
1812 int error;
1813
1814 if (is_remote(r))
1815 /* receive_convert() calls do_convert() on remote node */
1816 error = send_convert(r, lkb);
1817 else
1818 error = do_convert(r, lkb);
1819
1820 return error;
1821}
1822
1823/* remove an existing lkb from the granted queue */
1824
1825static int _unlock_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
1826{
1827 int error;
1828
1829 if (is_remote(r))
1830 /* receive_unlock() calls do_unlock() on remote node */
1831 error = send_unlock(r, lkb);
1832 else
1833 error = do_unlock(r, lkb);
1834
1835 return error;
1836}
1837
1838/* remove an existing lkb from the convert or wait queue */
1839
1840static int _cancel_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
1841{
1842 int error;
1843
1844 if (is_remote(r))
1845 /* receive_cancel() calls do_cancel() on remote node */
1846 error = send_cancel(r, lkb);
1847 else
1848 error = do_cancel(r, lkb);
1849
1850 return error;
1851}
1852
1853/*
1854 * Four stage 2 varieties:
1855 * request_lock(), convert_lock(), unlock_lock(), cancel_lock()
1856 */
1857
1858static int request_lock(struct dlm_ls *ls, struct dlm_lkb *lkb, char *name,
1859 int len, struct dlm_args *args)
1860{
1861 struct dlm_rsb *r;
1862 int error;
1863
1864 error = validate_lock_args(ls, lkb, args);
1865 if (error)
1866 goto out;
1867
1868 error = find_rsb(ls, name, len, R_CREATE, &r);
1869 if (error)
1870 goto out;
1871
1872 lock_rsb(r);
1873
1874 attach_lkb(r, lkb);
1875 lkb->lkb_lksb->sb_lkid = lkb->lkb_id;
1876
1877 error = _request_lock(r, lkb);
1878
1879 unlock_rsb(r);
1880 put_rsb(r);
1881
1882 out:
1883 return error;
1884}
1885
1886static int convert_lock(struct dlm_ls *ls, struct dlm_lkb *lkb,
1887 struct dlm_args *args)
1888{
1889 struct dlm_rsb *r;
1890 int error;
1891
1892 r = lkb->lkb_resource;
1893
1894 hold_rsb(r);
1895 lock_rsb(r);
1896
1897 error = validate_lock_args(ls, lkb, args);
1898 if (error)
1899 goto out;
1900
1901 error = _convert_lock(r, lkb);
1902 out:
1903 unlock_rsb(r);
1904 put_rsb(r);
1905 return error;
1906}
1907
1908static int unlock_lock(struct dlm_ls *ls, struct dlm_lkb *lkb,
1909 struct dlm_args *args)
1910{
1911 struct dlm_rsb *r;
1912 int error;
1913
1914 r = lkb->lkb_resource;
1915
1916 hold_rsb(r);
1917 lock_rsb(r);
1918
1919 error = validate_unlock_args(lkb, args);
1920 if (error)
1921 goto out;
1922
1923 error = _unlock_lock(r, lkb);
1924 out:
1925 unlock_rsb(r);
1926 put_rsb(r);
1927 return error;
1928}
1929
1930static int cancel_lock(struct dlm_ls *ls, struct dlm_lkb *lkb,
1931 struct dlm_args *args)
1932{
1933 struct dlm_rsb *r;
1934 int error;
1935
1936 r = lkb->lkb_resource;
1937
1938 hold_rsb(r);
1939 lock_rsb(r);
1940
1941 error = validate_unlock_args(lkb, args);
1942 if (error)
1943 goto out;
1944
1945 error = _cancel_lock(r, lkb);
1946 out:
1947 unlock_rsb(r);
1948 put_rsb(r);
1949 return error;
1950}
1951
1952/*
1953 * Two stage 1 varieties: dlm_lock() and dlm_unlock()
1954 */
1955
1956int dlm_lock(dlm_lockspace_t *lockspace,
1957 int mode,
1958 struct dlm_lksb *lksb,
1959 uint32_t flags,
1960 void *name,
1961 unsigned int namelen,
1962 uint32_t parent_lkid,
1963 void (*ast) (void *astarg),
1964 void *astarg,
1965 void (*bast) (void *astarg, int mode),
1966 struct dlm_range *range)
1967{
1968 struct dlm_ls *ls;
1969 struct dlm_lkb *lkb;
1970 struct dlm_args args;
1971 int error, convert = flags & DLM_LKF_CONVERT;
1972
1973 ls = dlm_find_lockspace_local(lockspace);
1974 if (!ls)
1975 return -EINVAL;
1976
1977 lock_recovery(ls);
1978
1979 if (convert)
1980 error = find_lkb(ls, lksb->sb_lkid, &lkb);
1981 else
1982 error = create_lkb(ls, &lkb);
1983
1984 if (error)
1985 goto out;
1986
1987 error = set_lock_args(mode, lksb, flags, namelen, parent_lkid, ast,
1988 astarg, bast, range, &args);
1989 if (error)
1990 goto out_put;
1991
1992 if (convert)
1993 error = convert_lock(ls, lkb, &args);
1994 else
1995 error = request_lock(ls, lkb, name, namelen, &args);
1996
1997 if (error == -EINPROGRESS)
1998 error = 0;
1999 out_put:
2000 if (convert || error)
2001 put_lkb(lkb);
2002 if (error == -EAGAIN)
2003 error = 0;
2004 out:
2005 unlock_recovery(ls);
2006 dlm_put_lockspace(ls);
2007 return error;
2008}
2009
2010int dlm_unlock(dlm_lockspace_t *lockspace,
2011 uint32_t lkid,
2012 uint32_t flags,
2013 struct dlm_lksb *lksb,
2014 void *astarg)
2015{
2016 struct dlm_ls *ls;
2017 struct dlm_lkb *lkb;
2018 struct dlm_args args;
2019 int error;
2020
2021 ls = dlm_find_lockspace_local(lockspace);
2022 if (!ls)
2023 return -EINVAL;
2024
2025 lock_recovery(ls);
2026
2027 error = find_lkb(ls, lkid, &lkb);
2028 if (error)
2029 goto out;
2030
2031 error = set_unlock_args(flags, astarg, &args);
2032 if (error)
2033 goto out_put;
2034
2035 if (flags & DLM_LKF_CANCEL)
2036 error = cancel_lock(ls, lkb, &args);
2037 else
2038 error = unlock_lock(ls, lkb, &args);
2039
2040 if (error == -DLM_EUNLOCK || error == -DLM_ECANCEL)
2041 error = 0;
2042 out_put:
2043 put_lkb(lkb);
2044 out:
2045 unlock_recovery(ls);
2046 dlm_put_lockspace(ls);
2047 return error;
2048}
2049
2050/*
2051 * send/receive routines for remote operations and replies
2052 *
2053 * send_args
2054 * send_common
2055 * send_request receive_request
2056 * send_convert receive_convert
2057 * send_unlock receive_unlock
2058 * send_cancel receive_cancel
2059 * send_grant receive_grant
2060 * send_bast receive_bast
2061 * send_lookup receive_lookup
2062 * send_remove receive_remove
2063 *
2064 * send_common_reply
2065 * receive_request_reply send_request_reply
2066 * receive_convert_reply send_convert_reply
2067 * receive_unlock_reply send_unlock_reply
2068 * receive_cancel_reply send_cancel_reply
2069 * receive_lookup_reply send_lookup_reply
2070 */
2071
2072static int create_message(struct dlm_rsb *r, struct dlm_lkb *lkb,
2073 int to_nodeid, int mstype,
2074 struct dlm_message **ms_ret,
2075 struct dlm_mhandle **mh_ret)
2076{
2077 struct dlm_message *ms;
2078 struct dlm_mhandle *mh;
2079 char *mb;
2080 int mb_len = sizeof(struct dlm_message);
2081
2082 switch (mstype) {
2083 case DLM_MSG_REQUEST:
2084 case DLM_MSG_LOOKUP:
2085 case DLM_MSG_REMOVE:
2086 mb_len += r->res_length;
2087 break;
2088 case DLM_MSG_CONVERT:
2089 case DLM_MSG_UNLOCK:
2090 case DLM_MSG_REQUEST_REPLY:
2091 case DLM_MSG_CONVERT_REPLY:
2092 case DLM_MSG_GRANT:
2093 if (lkb && lkb->lkb_lvbptr)
2094 mb_len += r->res_ls->ls_lvblen;
2095 break;
2096 }
2097
2098 /* get_buffer gives us a message handle (mh) that we need to
2099 pass into lowcomms_commit and a message buffer (mb) that we
2100 write our data into */
2101
2102 mh = dlm_lowcomms_get_buffer(to_nodeid, mb_len, GFP_KERNEL, &mb);
2103 if (!mh)
2104 return -ENOBUFS;
2105
2106 memset(mb, 0, mb_len);
2107
2108 ms = (struct dlm_message *) mb;
2109
2110 ms->m_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
2111 ms->m_header.h_lockspace = r->res_ls->ls_global_id;
2112 ms->m_header.h_nodeid = dlm_our_nodeid();
2113 ms->m_header.h_length = mb_len;
2114 ms->m_header.h_cmd = DLM_MSG;
2115
2116 ms->m_type = mstype;
2117
2118 *mh_ret = mh;
2119 *ms_ret = ms;
2120 return 0;
2121}
2122
2123/* further lowcomms enhancements or alternate implementations may make
2124 the return value from this function useful at some point */
2125
2126static int send_message(struct dlm_mhandle *mh, struct dlm_message *ms)
2127{
2128 dlm_message_out(ms);
2129 dlm_lowcomms_commit_buffer(mh);
2130 return 0;
2131}
2132
2133static void send_args(struct dlm_rsb *r, struct dlm_lkb *lkb,
2134 struct dlm_message *ms)
2135{
2136 ms->m_nodeid = lkb->lkb_nodeid;
2137 ms->m_pid = lkb->lkb_ownpid;
2138 ms->m_lkid = lkb->lkb_id;
2139 ms->m_remid = lkb->lkb_remid;
2140 ms->m_exflags = lkb->lkb_exflags;
2141 ms->m_sbflags = lkb->lkb_sbflags;
2142 ms->m_flags = lkb->lkb_flags;
2143 ms->m_lvbseq = lkb->lkb_lvbseq;
2144 ms->m_status = lkb->lkb_status;
2145 ms->m_grmode = lkb->lkb_grmode;
2146 ms->m_rqmode = lkb->lkb_rqmode;
2147 ms->m_hash = r->res_hash;
2148
2149 /* m_result and m_bastmode are set from function args,
2150 not from lkb fields */
2151
2152 if (lkb->lkb_bastaddr)
2153 ms->m_asts |= AST_BAST;
2154 if (lkb->lkb_astaddr)
2155 ms->m_asts |= AST_COMP;
2156
2157 if (lkb->lkb_range) {
2158 ms->m_range[0] = lkb->lkb_range[RQ_RANGE_START];
2159 ms->m_range[1] = lkb->lkb_range[RQ_RANGE_END];
2160 }
2161
2162 if (ms->m_type == DLM_MSG_REQUEST || ms->m_type == DLM_MSG_LOOKUP)
2163 memcpy(ms->m_extra, r->res_name, r->res_length);
2164
2165 else if (lkb->lkb_lvbptr)
2166 memcpy(ms->m_extra, lkb->lkb_lvbptr, r->res_ls->ls_lvblen);
2167
2168}
2169
2170static int send_common(struct dlm_rsb *r, struct dlm_lkb *lkb, int mstype)
2171{
2172 struct dlm_message *ms;
2173 struct dlm_mhandle *mh;
2174 int to_nodeid, error;
2175
2176 add_to_waiters(lkb, mstype);
2177
2178 to_nodeid = r->res_nodeid;
2179
2180 error = create_message(r, lkb, to_nodeid, mstype, &ms, &mh);
2181 if (error)
2182 goto fail;
2183
2184 send_args(r, lkb, ms);
2185
2186 error = send_message(mh, ms);
2187 if (error)
2188 goto fail;
2189 return 0;
2190
2191 fail:
2192 remove_from_waiters(lkb);
2193 return error;
2194}
2195
2196static int send_request(struct dlm_rsb *r, struct dlm_lkb *lkb)
2197{
2198 return send_common(r, lkb, DLM_MSG_REQUEST);
2199}
2200
2201static int send_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
2202{
2203 int error;
2204
2205 error = send_common(r, lkb, DLM_MSG_CONVERT);
2206
2207 /* down conversions go without a reply from the master */
2208 if (!error && down_conversion(lkb)) {
2209 remove_from_waiters(lkb);
2210 r->res_ls->ls_stub_ms.m_result = 0;
2211 __receive_convert_reply(r, lkb, &r->res_ls->ls_stub_ms);
2212 }
2213
2214 return error;
2215}
2216
2217/* FIXME: if this lkb is the only lock we hold on the rsb, then set
2218 MASTER_UNCERTAIN to force the next request on the rsb to confirm
2219 that the master is still correct. */
2220
2221static int send_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb)
2222{
2223 return send_common(r, lkb, DLM_MSG_UNLOCK);
2224}
2225
2226static int send_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb)
2227{
2228 return send_common(r, lkb, DLM_MSG_CANCEL);
2229}
2230
2231static int send_grant(struct dlm_rsb *r, struct dlm_lkb *lkb)
2232{
2233 struct dlm_message *ms;
2234 struct dlm_mhandle *mh;
2235 int to_nodeid, error;
2236
2237 to_nodeid = lkb->lkb_nodeid;
2238
2239 error = create_message(r, lkb, to_nodeid, DLM_MSG_GRANT, &ms, &mh);
2240 if (error)
2241 goto out;
2242
2243 send_args(r, lkb, ms);
2244
2245 ms->m_result = 0;
2246
2247 error = send_message(mh, ms);
2248 out:
2249 return error;
2250}
2251
2252static int send_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int mode)
2253{
2254 struct dlm_message *ms;
2255 struct dlm_mhandle *mh;
2256 int to_nodeid, error;
2257
2258 to_nodeid = lkb->lkb_nodeid;
2259
2260 error = create_message(r, NULL, to_nodeid, DLM_MSG_BAST, &ms, &mh);
2261 if (error)
2262 goto out;
2263
2264 send_args(r, lkb, ms);
2265
2266 ms->m_bastmode = mode;
2267
2268 error = send_message(mh, ms);
2269 out:
2270 return error;
2271}
2272
2273static int send_lookup(struct dlm_rsb *r, struct dlm_lkb *lkb)
2274{
2275 struct dlm_message *ms;
2276 struct dlm_mhandle *mh;
2277 int to_nodeid, error;
2278
2279 add_to_waiters(lkb, DLM_MSG_LOOKUP);
2280
2281 to_nodeid = dlm_dir_nodeid(r);
2282
2283 error = create_message(r, NULL, to_nodeid, DLM_MSG_LOOKUP, &ms, &mh);
2284 if (error)
2285 goto fail;
2286
2287 send_args(r, lkb, ms);
2288
2289 error = send_message(mh, ms);
2290 if (error)
2291 goto fail;
2292 return 0;
2293
2294 fail:
2295 remove_from_waiters(lkb);
2296 return error;
2297}
2298
2299static int send_remove(struct dlm_rsb *r)
2300{
2301 struct dlm_message *ms;
2302 struct dlm_mhandle *mh;
2303 int to_nodeid, error;
2304
2305 to_nodeid = dlm_dir_nodeid(r);
2306
2307 error = create_message(r, NULL, to_nodeid, DLM_MSG_REMOVE, &ms, &mh);
2308 if (error)
2309 goto out;
2310
2311 memcpy(ms->m_extra, r->res_name, r->res_length);
2312 ms->m_hash = r->res_hash;
2313
2314 error = send_message(mh, ms);
2315 out:
2316 return error;
2317}
2318
2319static int send_common_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
2320 int mstype, int rv)
2321{
2322 struct dlm_message *ms;
2323 struct dlm_mhandle *mh;
2324 int to_nodeid, error;
2325
2326 to_nodeid = lkb->lkb_nodeid;
2327
2328 error = create_message(r, lkb, to_nodeid, mstype, &ms, &mh);
2329 if (error)
2330 goto out;
2331
2332 send_args(r, lkb, ms);
2333
2334 ms->m_result = rv;
2335
2336 error = send_message(mh, ms);
2337 out:
2338 return error;
2339}
2340
2341static int send_request_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
2342{
2343 return send_common_reply(r, lkb, DLM_MSG_REQUEST_REPLY, rv);
2344}
2345
2346static int send_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
2347{
2348 return send_common_reply(r, lkb, DLM_MSG_CONVERT_REPLY, rv);
2349}
2350
2351static int send_unlock_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
2352{
2353 return send_common_reply(r, lkb, DLM_MSG_UNLOCK_REPLY, rv);
2354}
2355
2356static int send_cancel_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
2357{
2358 return send_common_reply(r, lkb, DLM_MSG_CANCEL_REPLY, rv);
2359}
2360
2361static int send_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms_in,
2362 int ret_nodeid, int rv)
2363{
2364 struct dlm_rsb *r = &ls->ls_stub_rsb;
2365 struct dlm_message *ms;
2366 struct dlm_mhandle *mh;
2367 int error, nodeid = ms_in->m_header.h_nodeid;
2368
2369 error = create_message(r, NULL, nodeid, DLM_MSG_LOOKUP_REPLY, &ms, &mh);
2370 if (error)
2371 goto out;
2372
2373 ms->m_lkid = ms_in->m_lkid;
2374 ms->m_result = rv;
2375 ms->m_nodeid = ret_nodeid;
2376
2377 error = send_message(mh, ms);
2378 out:
2379 return error;
2380}
2381
2382/* which args we save from a received message depends heavily on the type
2383 of message, unlike the send side where we can safely send everything about
2384 the lkb for any type of message */
2385
2386static void receive_flags(struct dlm_lkb *lkb, struct dlm_message *ms)
2387{
2388 lkb->lkb_exflags = ms->m_exflags;
2389 lkb->lkb_flags = (lkb->lkb_flags & 0xFFFF0000) |
2390 (ms->m_flags & 0x0000FFFF);
2391}
2392
2393static void receive_flags_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
2394{
2395 lkb->lkb_sbflags = ms->m_sbflags;
2396 lkb->lkb_flags = (lkb->lkb_flags & 0xFFFF0000) |
2397 (ms->m_flags & 0x0000FFFF);
2398}
2399
2400static int receive_extralen(struct dlm_message *ms)
2401{
2402 return (ms->m_header.h_length - sizeof(struct dlm_message));
2403}
2404
2405static int receive_range(struct dlm_ls *ls, struct dlm_lkb *lkb,
2406 struct dlm_message *ms)
2407{
2408 if (lkb->lkb_flags & DLM_IFL_RANGE) {
2409 if (!lkb->lkb_range)
2410 lkb->lkb_range = allocate_range(ls);
2411 if (!lkb->lkb_range)
2412 return -ENOMEM;
2413 lkb->lkb_range[RQ_RANGE_START] = ms->m_range[0];
2414 lkb->lkb_range[RQ_RANGE_END] = ms->m_range[1];
2415 }
2416 return 0;
2417}
2418
2419static int receive_lvb(struct dlm_ls *ls, struct dlm_lkb *lkb,
2420 struct dlm_message *ms)
2421{
2422 int len;
2423
2424 if (lkb->lkb_exflags & DLM_LKF_VALBLK) {
2425 if (!lkb->lkb_lvbptr)
2426 lkb->lkb_lvbptr = allocate_lvb(ls);
2427 if (!lkb->lkb_lvbptr)
2428 return -ENOMEM;
2429 len = receive_extralen(ms);
2430 memcpy(lkb->lkb_lvbptr, ms->m_extra, len);
2431 }
2432 return 0;
2433}
2434
2435static int receive_request_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
2436 struct dlm_message *ms)
2437{
2438 lkb->lkb_nodeid = ms->m_header.h_nodeid;
2439 lkb->lkb_ownpid = ms->m_pid;
2440 lkb->lkb_remid = ms->m_lkid;
2441 lkb->lkb_grmode = DLM_LOCK_IV;
2442 lkb->lkb_rqmode = ms->m_rqmode;
2443 lkb->lkb_bastaddr = (void *) (long) (ms->m_asts & AST_BAST);
2444 lkb->lkb_astaddr = (void *) (long) (ms->m_asts & AST_COMP);
2445
2446 DLM_ASSERT(is_master_copy(lkb), dlm_print_lkb(lkb););
2447
2448 if (receive_range(ls, lkb, ms))
2449 return -ENOMEM;
2450
2451 if (receive_lvb(ls, lkb, ms))
2452 return -ENOMEM;
2453
2454 return 0;
2455}
2456
2457static int receive_convert_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
2458 struct dlm_message *ms)
2459{
2460 if (lkb->lkb_nodeid != ms->m_header.h_nodeid) {
2461 log_error(ls, "convert_args nodeid %d %d lkid %x %x",
2462 lkb->lkb_nodeid, ms->m_header.h_nodeid,
2463 lkb->lkb_id, lkb->lkb_remid);
2464 return -EINVAL;
2465 }
2466
2467 if (!is_master_copy(lkb))
2468 return -EINVAL;
2469
2470 if (lkb->lkb_status != DLM_LKSTS_GRANTED)
2471 return -EBUSY;
2472
2473 if (receive_range(ls, lkb, ms))
2474 return -ENOMEM;
2475 if (lkb->lkb_range) {
2476 lkb->lkb_range[GR_RANGE_START] = 0LL;
2477 lkb->lkb_range[GR_RANGE_END] = 0xffffffffffffffffULL;
2478 }
2479
2480 if (receive_lvb(ls, lkb, ms))
2481 return -ENOMEM;
2482
2483 lkb->lkb_rqmode = ms->m_rqmode;
2484 lkb->lkb_lvbseq = ms->m_lvbseq;
2485
2486 return 0;
2487}
2488
2489static int receive_unlock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
2490 struct dlm_message *ms)
2491{
2492 if (!is_master_copy(lkb))
2493 return -EINVAL;
2494 if (receive_lvb(ls, lkb, ms))
2495 return -ENOMEM;
2496 return 0;
2497}
2498
2499/* We fill in the stub-lkb fields with the info that send_xxxx_reply()
2500 uses to send a reply and that the remote end uses to process the reply. */
2501
2502static void setup_stub_lkb(struct dlm_ls *ls, struct dlm_message *ms)
2503{
2504 struct dlm_lkb *lkb = &ls->ls_stub_lkb;
2505 lkb->lkb_nodeid = ms->m_header.h_nodeid;
2506 lkb->lkb_remid = ms->m_lkid;
2507}
2508
2509static void receive_request(struct dlm_ls *ls, struct dlm_message *ms)
2510{
2511 struct dlm_lkb *lkb;
2512 struct dlm_rsb *r;
2513 int error, namelen;
2514
2515 error = create_lkb(ls, &lkb);
2516 if (error)
2517 goto fail;
2518
2519 receive_flags(lkb, ms);
2520 lkb->lkb_flags |= DLM_IFL_MSTCPY;
2521 error = receive_request_args(ls, lkb, ms);
2522 if (error) {
2523 put_lkb(lkb);
2524 goto fail;
2525 }
2526
2527 namelen = receive_extralen(ms);
2528
2529 error = find_rsb(ls, ms->m_extra, namelen, R_MASTER, &r);
2530 if (error) {
2531 put_lkb(lkb);
2532 goto fail;
2533 }
2534
2535 lock_rsb(r);
2536
2537 attach_lkb(r, lkb);
2538 error = do_request(r, lkb);
2539 send_request_reply(r, lkb, error);
2540
2541 unlock_rsb(r);
2542 put_rsb(r);
2543
2544 if (error == -EINPROGRESS)
2545 error = 0;
2546 if (error)
2547 put_lkb(lkb);
2548 return;
2549
2550 fail:
2551 setup_stub_lkb(ls, ms);
2552 send_request_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
2553}
2554
2555static void receive_convert(struct dlm_ls *ls, struct dlm_message *ms)
2556{
2557 struct dlm_lkb *lkb;
2558 struct dlm_rsb *r;
2559 int error, reply = TRUE;
2560
2561 error = find_lkb(ls, ms->m_remid, &lkb);
2562 if (error)
2563 goto fail;
2564
2565 r = lkb->lkb_resource;
2566
2567 hold_rsb(r);
2568 lock_rsb(r);
2569
2570 receive_flags(lkb, ms);
2571 error = receive_convert_args(ls, lkb, ms);
2572 if (error)
2573 goto out;
2574 reply = !down_conversion(lkb);
2575
2576 error = do_convert(r, lkb);
2577 out:
2578 if (reply)
2579 send_convert_reply(r, lkb, error);
2580
2581 unlock_rsb(r);
2582 put_rsb(r);
2583 put_lkb(lkb);
2584 return;
2585
2586 fail:
2587 setup_stub_lkb(ls, ms);
2588 send_convert_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
2589}
2590
2591static void receive_unlock(struct dlm_ls *ls, struct dlm_message *ms)
2592{
2593 struct dlm_lkb *lkb;
2594 struct dlm_rsb *r;
2595 int error;
2596
2597 error = find_lkb(ls, ms->m_remid, &lkb);
2598 if (error)
2599 goto fail;
2600
2601 r = lkb->lkb_resource;
2602
2603 hold_rsb(r);
2604 lock_rsb(r);
2605
2606 receive_flags(lkb, ms);
2607 error = receive_unlock_args(ls, lkb, ms);
2608 if (error)
2609 goto out;
2610
2611 error = do_unlock(r, lkb);
2612 out:
2613 send_unlock_reply(r, lkb, error);
2614
2615 unlock_rsb(r);
2616 put_rsb(r);
2617 put_lkb(lkb);
2618 return;
2619
2620 fail:
2621 setup_stub_lkb(ls, ms);
2622 send_unlock_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
2623}
2624
2625static void receive_cancel(struct dlm_ls *ls, struct dlm_message *ms)
2626{
2627 struct dlm_lkb *lkb;
2628 struct dlm_rsb *r;
2629 int error;
2630
2631 error = find_lkb(ls, ms->m_remid, &lkb);
2632 if (error)
2633 goto fail;
2634
2635 receive_flags(lkb, ms);
2636
2637 r = lkb->lkb_resource;
2638
2639 hold_rsb(r);
2640 lock_rsb(r);
2641
2642 error = do_cancel(r, lkb);
2643 send_cancel_reply(r, lkb, error);
2644
2645 unlock_rsb(r);
2646 put_rsb(r);
2647 put_lkb(lkb);
2648 return;
2649
2650 fail:
2651 setup_stub_lkb(ls, ms);
2652 send_cancel_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
2653}
2654
2655static void receive_grant(struct dlm_ls *ls, struct dlm_message *ms)
2656{
2657 struct dlm_lkb *lkb;
2658 struct dlm_rsb *r;
2659 int error;
2660
2661 error = find_lkb(ls, ms->m_remid, &lkb);
2662 if (error) {
2663 log_error(ls, "receive_grant no lkb");
2664 return;
2665 }
2666 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
2667
2668 r = lkb->lkb_resource;
2669
2670 hold_rsb(r);
2671 lock_rsb(r);
2672
2673 receive_flags_reply(lkb, ms);
2674 grant_lock_pc(r, lkb, ms);
2675 queue_cast(r, lkb, 0);
2676
2677 unlock_rsb(r);
2678 put_rsb(r);
2679 put_lkb(lkb);
2680}
2681
2682static void receive_bast(struct dlm_ls *ls, struct dlm_message *ms)
2683{
2684 struct dlm_lkb *lkb;
2685 struct dlm_rsb *r;
2686 int error;
2687
2688 error = find_lkb(ls, ms->m_remid, &lkb);
2689 if (error) {
2690 log_error(ls, "receive_bast no lkb");
2691 return;
2692 }
2693 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
2694
2695 r = lkb->lkb_resource;
2696
2697 hold_rsb(r);
2698 lock_rsb(r);
2699
2700 queue_bast(r, lkb, ms->m_bastmode);
2701
2702 unlock_rsb(r);
2703 put_rsb(r);
2704 put_lkb(lkb);
2705}
2706
2707static void receive_lookup(struct dlm_ls *ls, struct dlm_message *ms)
2708{
2709 int len, error, ret_nodeid, dir_nodeid, from_nodeid, our_nodeid;
2710
2711 from_nodeid = ms->m_header.h_nodeid;
2712 our_nodeid = dlm_our_nodeid();
2713
2714 len = receive_extralen(ms);
2715
2716 dir_nodeid = dlm_hash2nodeid(ls, ms->m_hash);
2717 if (dir_nodeid != our_nodeid) {
2718 log_error(ls, "lookup dir_nodeid %d from %d",
2719 dir_nodeid, from_nodeid);
2720 error = -EINVAL;
2721 ret_nodeid = -1;
2722 goto out;
2723 }
2724
2725 error = dlm_dir_lookup(ls, from_nodeid, ms->m_extra, len, &ret_nodeid);
2726
2727 /* Optimization: we're master so treat lookup as a request */
2728 if (!error && ret_nodeid == our_nodeid) {
2729 receive_request(ls, ms);
2730 return;
2731 }
2732 out:
2733 send_lookup_reply(ls, ms, ret_nodeid, error);
2734}
2735
2736static void receive_remove(struct dlm_ls *ls, struct dlm_message *ms)
2737{
2738 int len, dir_nodeid, from_nodeid;
2739
2740 from_nodeid = ms->m_header.h_nodeid;
2741
2742 len = receive_extralen(ms);
2743
2744 dir_nodeid = dlm_hash2nodeid(ls, ms->m_hash);
2745 if (dir_nodeid != dlm_our_nodeid()) {
2746 log_error(ls, "remove dir entry dir_nodeid %d from %d",
2747 dir_nodeid, from_nodeid);
2748 return;
2749 }
2750
2751 dlm_dir_remove_entry(ls, from_nodeid, ms->m_extra, len);
2752}
2753
2754static void receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms)
2755{
2756 struct dlm_lkb *lkb;
2757 struct dlm_rsb *r;
2758 int error, mstype;
2759
2760 error = find_lkb(ls, ms->m_remid, &lkb);
2761 if (error) {
2762 log_error(ls, "receive_request_reply no lkb");
2763 return;
2764 }
2765 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
2766
2767 mstype = lkb->lkb_wait_type;
2768 error = remove_from_waiters(lkb);
2769 if (error) {
2770 log_error(ls, "receive_request_reply not on waiters");
2771 goto out;
2772 }
2773
2774 /* this is the value returned from do_request() on the master */
2775 error = ms->m_result;
2776
2777 r = lkb->lkb_resource;
2778 hold_rsb(r);
2779 lock_rsb(r);
2780
2781 /* Optimization: the dir node was also the master, so it took our
2782 lookup as a request and sent request reply instead of lookup reply */
2783 if (mstype == DLM_MSG_LOOKUP) {
2784 r->res_nodeid = ms->m_header.h_nodeid;
2785 lkb->lkb_nodeid = r->res_nodeid;
2786 }
2787
2788 switch (error) {
2789 case -EAGAIN:
2790 /* request would block (be queued) on remote master;
2791 the unhold undoes the original ref from create_lkb()
2792 so it leads to the lkb being freed */
2793 queue_cast(r, lkb, -EAGAIN);
2794 confirm_master(r, -EAGAIN);
2795 unhold_lkb(lkb);
2796 break;
2797
2798 case -EINPROGRESS:
2799 case 0:
2800 /* request was queued or granted on remote master */
2801 receive_flags_reply(lkb, ms);
2802 lkb->lkb_remid = ms->m_lkid;
2803 if (error)
2804 add_lkb(r, lkb, DLM_LKSTS_WAITING);
2805 else {
2806 grant_lock_pc(r, lkb, ms);
2807 queue_cast(r, lkb, 0);
2808 }
2809 confirm_master(r, error);
2810 break;
2811
2812 case -ENOENT:
2813 case -ENOTBLK:
2814 /* find_rsb failed to find rsb or rsb wasn't master */
2815 r->res_nodeid = -1;
2816 lkb->lkb_nodeid = -1;
2817 _request_lock(r, lkb);
2818 break;
2819
2820 default:
2821 log_error(ls, "receive_request_reply error %d", error);
2822 }
2823
2824 unlock_rsb(r);
2825 put_rsb(r);
2826 out:
2827 put_lkb(lkb);
2828}
2829
2830static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
2831 struct dlm_message *ms)
2832{
2833 int error = ms->m_result;
2834
2835 /* this is the value returned from do_convert() on the master */
2836
2837 switch (error) {
2838 case -EAGAIN:
2839 /* convert would block (be queued) on remote master */
2840 queue_cast(r, lkb, -EAGAIN);
2841 break;
2842
2843 case -EINPROGRESS:
2844 /* convert was queued on remote master */
2845 del_lkb(r, lkb);
2846 add_lkb(r, lkb, DLM_LKSTS_CONVERT);
2847 break;
2848
2849 case 0:
2850 /* convert was granted on remote master */
2851 receive_flags_reply(lkb, ms);
2852 grant_lock_pc(r, lkb, ms);
2853 queue_cast(r, lkb, 0);
2854 break;
2855
2856 default:
2857 log_error(r->res_ls, "receive_convert_reply error %d", error);
2858 }
2859}
2860
2861static void _receive_convert_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
2862{
2863 struct dlm_rsb *r = lkb->lkb_resource;
2864
2865 hold_rsb(r);
2866 lock_rsb(r);
2867
2868 __receive_convert_reply(r, lkb, ms);
2869
2870 unlock_rsb(r);
2871 put_rsb(r);
2872}
2873
2874static void receive_convert_reply(struct dlm_ls *ls, struct dlm_message *ms)
2875{
2876 struct dlm_lkb *lkb;
2877 int error;
2878
2879 error = find_lkb(ls, ms->m_remid, &lkb);
2880 if (error) {
2881 log_error(ls, "receive_convert_reply no lkb");
2882 return;
2883 }
2884 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
2885
2886 error = remove_from_waiters(lkb);
2887 if (error) {
2888 log_error(ls, "receive_convert_reply not on waiters");
2889 goto out;
2890 }
2891
2892 _receive_convert_reply(lkb, ms);
2893 out:
2894 put_lkb(lkb);
2895}
2896
2897static void _receive_unlock_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
2898{
2899 struct dlm_rsb *r = lkb->lkb_resource;
2900 int error = ms->m_result;
2901
2902 hold_rsb(r);
2903 lock_rsb(r);
2904
2905 /* this is the value returned from do_unlock() on the master */
2906
2907 switch (error) {
2908 case -DLM_EUNLOCK:
2909 receive_flags_reply(lkb, ms);
2910 remove_lock_pc(r, lkb);
2911 queue_cast(r, lkb, -DLM_EUNLOCK);
2912 break;
2913 default:
2914 log_error(r->res_ls, "receive_unlock_reply error %d", error);
2915 }
2916
2917 unlock_rsb(r);
2918 put_rsb(r);
2919}
2920
2921static void receive_unlock_reply(struct dlm_ls *ls, struct dlm_message *ms)
2922{
2923 struct dlm_lkb *lkb;
2924 int error;
2925
2926 error = find_lkb(ls, ms->m_remid, &lkb);
2927 if (error) {
2928 log_error(ls, "receive_unlock_reply no lkb");
2929 return;
2930 }
2931 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
2932
2933 error = remove_from_waiters(lkb);
2934 if (error) {
2935 log_error(ls, "receive_unlock_reply not on waiters");
2936 goto out;
2937 }
2938
2939 _receive_unlock_reply(lkb, ms);
2940 out:
2941 put_lkb(lkb);
2942}
2943
2944static void _receive_cancel_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
2945{
2946 struct dlm_rsb *r = lkb->lkb_resource;
2947 int error = ms->m_result;
2948
2949 hold_rsb(r);
2950 lock_rsb(r);
2951
2952 /* this is the value returned from do_cancel() on the master */
2953
2954 switch (error) {
2955 case -DLM_ECANCEL:
2956 receive_flags_reply(lkb, ms);
2957 revert_lock_pc(r, lkb);
2958 queue_cast(r, lkb, -DLM_ECANCEL);
2959 break;
2960 default:
2961 log_error(r->res_ls, "receive_cancel_reply error %d", error);
2962 }
2963
2964 unlock_rsb(r);
2965 put_rsb(r);
2966}
2967
2968static void receive_cancel_reply(struct dlm_ls *ls, struct dlm_message *ms)
2969{
2970 struct dlm_lkb *lkb;
2971 int error;
2972
2973 error = find_lkb(ls, ms->m_remid, &lkb);
2974 if (error) {
2975 log_error(ls, "receive_cancel_reply no lkb");
2976 return;
2977 }
2978 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
2979
2980 error = remove_from_waiters(lkb);
2981 if (error) {
2982 log_error(ls, "receive_cancel_reply not on waiters");
2983 goto out;
2984 }
2985
2986 _receive_cancel_reply(lkb, ms);
2987 out:
2988 put_lkb(lkb);
2989}
2990
2991static void receive_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms)
2992{
2993 struct dlm_lkb *lkb;
2994 struct dlm_rsb *r;
2995 int error, ret_nodeid;
2996
2997 error = find_lkb(ls, ms->m_lkid, &lkb);
2998 if (error) {
2999 log_error(ls, "receive_lookup_reply no lkb");
3000 return;
3001 }
3002
3003 error = remove_from_waiters(lkb);
3004 if (error) {
3005 log_error(ls, "receive_lookup_reply not on waiters");
3006 goto out;
3007 }
3008
3009 /* this is the value returned by dlm_dir_lookup on dir node
3010 FIXME: will a non-zero error ever be returned? */
3011 error = ms->m_result;
3012
3013 r = lkb->lkb_resource;
3014 hold_rsb(r);
3015 lock_rsb(r);
3016
3017 ret_nodeid = ms->m_nodeid;
3018 if (ret_nodeid == dlm_our_nodeid()) {
3019 r->res_nodeid = 0;
3020 ret_nodeid = 0;
3021 r->res_first_lkid = 0;
3022 } else {
3023 /* set_master() will copy res_nodeid to lkb_nodeid */
3024 r->res_nodeid = ret_nodeid;
3025 }
3026
3027 _request_lock(r, lkb);
3028
3029 if (!ret_nodeid)
3030 process_lookup_list(r);
3031
3032 unlock_rsb(r);
3033 put_rsb(r);
3034 out:
3035 put_lkb(lkb);
3036}
3037
3038int dlm_receive_message(struct dlm_header *hd, int nodeid, int recovery)
3039{
3040 struct dlm_message *ms = (struct dlm_message *) hd;
3041 struct dlm_ls *ls;
3042 int error;
3043
3044 if (!recovery)
3045 dlm_message_in(ms);
3046
3047 ls = dlm_find_lockspace_global(hd->h_lockspace);
3048 if (!ls) {
3049 log_print("drop message %d from %d for unknown lockspace %d",
3050 ms->m_type, nodeid, hd->h_lockspace);
3051 return -EINVAL;
3052 }
3053
3054 /* recovery may have just ended leaving a bunch of backed-up requests
3055 in the requestqueue; wait while dlm_recoverd clears them */
3056
3057 if (!recovery)
3058 dlm_wait_requestqueue(ls);
3059
3060 /* recovery may have just started while there were a bunch of
3061 in-flight requests -- save them in requestqueue to be processed
3062 after recovery. we can't let dlm_recvd block on the recovery
3063 lock. if dlm_recoverd is calling this function to clear the
3064 requestqueue, it needs to be interrupted (-EINTR) if another
3065 recovery operation is starting. */
3066
3067 while (1) {
3068 if (dlm_locking_stopped(ls)) {
3069 if (!recovery)
3070 dlm_add_requestqueue(ls, nodeid, hd);
3071 error = -EINTR;
3072 goto out;
3073 }
3074
3075 if (lock_recovery_try(ls))
3076 break;
3077 schedule();
3078 }
3079
3080 switch (ms->m_type) {
3081
3082 /* messages sent to a master node */
3083
3084 case DLM_MSG_REQUEST:
3085 receive_request(ls, ms);
3086 break;
3087
3088 case DLM_MSG_CONVERT:
3089 receive_convert(ls, ms);
3090 break;
3091
3092 case DLM_MSG_UNLOCK:
3093 receive_unlock(ls, ms);
3094 break;
3095
3096 case DLM_MSG_CANCEL:
3097 receive_cancel(ls, ms);
3098 break;
3099
3100 /* messages sent from a master node (replies to above) */
3101
3102 case DLM_MSG_REQUEST_REPLY:
3103 receive_request_reply(ls, ms);
3104 break;
3105
3106 case DLM_MSG_CONVERT_REPLY:
3107 receive_convert_reply(ls, ms);
3108 break;
3109
3110 case DLM_MSG_UNLOCK_REPLY:
3111 receive_unlock_reply(ls, ms);
3112 break;
3113
3114 case DLM_MSG_CANCEL_REPLY:
3115 receive_cancel_reply(ls, ms);
3116 break;
3117
3118 /* messages sent from a master node (only two types of async msg) */
3119
3120 case DLM_MSG_GRANT:
3121 receive_grant(ls, ms);
3122 break;
3123
3124 case DLM_MSG_BAST:
3125 receive_bast(ls, ms);
3126 break;
3127
3128 /* messages sent to a dir node */
3129
3130 case DLM_MSG_LOOKUP:
3131 receive_lookup(ls, ms);
3132 break;
3133
3134 case DLM_MSG_REMOVE:
3135 receive_remove(ls, ms);
3136 break;
3137
3138 /* messages sent from a dir node (remove has no reply) */
3139
3140 case DLM_MSG_LOOKUP_REPLY:
3141 receive_lookup_reply(ls, ms);
3142 break;
3143
3144 default:
3145 log_error(ls, "unknown message type %d", ms->m_type);
3146 }
3147
3148 unlock_recovery(ls);
3149 out:
3150 dlm_put_lockspace(ls);
3151 dlm_astd_wake();
3152 return 0;
3153}
3154
3155
3156/*
3157 * Recovery related
3158 */
3159
3160static void recover_convert_waiter(struct dlm_ls *ls, struct dlm_lkb *lkb)
3161{
3162 if (middle_conversion(lkb)) {
3163 hold_lkb(lkb);
3164 ls->ls_stub_ms.m_result = -EINPROGRESS;
3165 _remove_from_waiters(lkb);
3166 _receive_convert_reply(lkb, &ls->ls_stub_ms);
3167
3168 /* Same special case as in receive_rcom_lock_args() */
3169 lkb->lkb_grmode = DLM_LOCK_IV;
3170 rsb_set_flag(lkb->lkb_resource, RSB_RECOVER_CONVERT);
3171 unhold_lkb(lkb);
3172
3173 } else if (lkb->lkb_rqmode >= lkb->lkb_grmode) {
3174 lkb->lkb_flags |= DLM_IFL_RESEND;
3175 }
3176
3177 /* lkb->lkb_rqmode < lkb->lkb_grmode shouldn't happen since down
3178 conversions are async; there's no reply from the remote master */
3179}
3180
3181/* A waiting lkb needs recovery if the master node has failed, or
3182 the master node is changing (only when no directory is used) */
3183
3184static int waiter_needs_recovery(struct dlm_ls *ls, struct dlm_lkb *lkb)
3185{
3186 if (dlm_is_removed(ls, lkb->lkb_nodeid))
3187 return 1;
3188
3189 if (!dlm_no_directory(ls))
3190 return 0;
3191
3192 if (dlm_dir_nodeid(lkb->lkb_resource) != lkb->lkb_nodeid)
3193 return 1;
3194
3195 return 0;
3196}
3197
3198/* Recovery for locks that are waiting for replies from nodes that are now
3199 gone. We can just complete unlocks and cancels by faking a reply from the
3200 dead node. Requests and up-conversions we flag to be resent after
3201 recovery. Down-conversions can just be completed with a fake reply like
3202 unlocks. Conversions between PR and CW need special attention. */
3203
3204void dlm_recover_waiters_pre(struct dlm_ls *ls)
3205{
3206 struct dlm_lkb *lkb, *safe;
3207
3208 down(&ls->ls_waiters_sem);
3209
3210 list_for_each_entry_safe(lkb, safe, &ls->ls_waiters, lkb_wait_reply) {
3211 log_debug(ls, "pre recover waiter lkid %x type %d flags %x",
3212 lkb->lkb_id, lkb->lkb_wait_type, lkb->lkb_flags);
3213
3214 /* all outstanding lookups, regardless of destination will be
3215 resent after recovery is done */
3216
3217 if (lkb->lkb_wait_type == DLM_MSG_LOOKUP) {
3218 lkb->lkb_flags |= DLM_IFL_RESEND;
3219 continue;
3220 }
3221
3222 if (!waiter_needs_recovery(ls, lkb))
3223 continue;
3224
3225 switch (lkb->lkb_wait_type) {
3226
3227 case DLM_MSG_REQUEST:
3228 lkb->lkb_flags |= DLM_IFL_RESEND;
3229 break;
3230
3231 case DLM_MSG_CONVERT:
3232 recover_convert_waiter(ls, lkb);
3233 break;
3234
3235 case DLM_MSG_UNLOCK:
3236 hold_lkb(lkb);
3237 ls->ls_stub_ms.m_result = -DLM_EUNLOCK;
3238 _remove_from_waiters(lkb);
3239 _receive_unlock_reply(lkb, &ls->ls_stub_ms);
3240 put_lkb(lkb);
3241 break;
3242
3243 case DLM_MSG_CANCEL:
3244 hold_lkb(lkb);
3245 ls->ls_stub_ms.m_result = -DLM_ECANCEL;
3246 _remove_from_waiters(lkb);
3247 _receive_cancel_reply(lkb, &ls->ls_stub_ms);
3248 put_lkb(lkb);
3249 break;
3250
3251 default:
3252 log_error(ls, "invalid lkb wait_type %d",
3253 lkb->lkb_wait_type);
3254 }
3255 }
3256 up(&ls->ls_waiters_sem);
3257}
3258
3259static int remove_resend_waiter(struct dlm_ls *ls, struct dlm_lkb **lkb_ret)
3260{
3261 struct dlm_lkb *lkb;
3262 int rv = 0;
3263
3264 down(&ls->ls_waiters_sem);
3265 list_for_each_entry(lkb, &ls->ls_waiters, lkb_wait_reply) {
3266 if (lkb->lkb_flags & DLM_IFL_RESEND) {
3267 rv = lkb->lkb_wait_type;
3268 _remove_from_waiters(lkb);
3269 lkb->lkb_flags &= ~DLM_IFL_RESEND;
3270 break;
3271 }
3272 }
3273 up(&ls->ls_waiters_sem);
3274
3275 if (!rv)
3276 lkb = NULL;
3277 *lkb_ret = lkb;
3278 return rv;
3279}
3280
3281/* Deal with lookups and lkb's marked RESEND from _pre. We may now be the
3282 master or dir-node for r. Processing the lkb may result in it being placed
3283 back on waiters. */
3284
3285int dlm_recover_waiters_post(struct dlm_ls *ls)
3286{
3287 struct dlm_lkb *lkb;
3288 struct dlm_rsb *r;
3289 int error = 0, mstype;
3290
3291 while (1) {
3292 if (dlm_locking_stopped(ls)) {
3293 log_debug(ls, "recover_waiters_post aborted");
3294 error = -EINTR;
3295 break;
3296 }
3297
3298 mstype = remove_resend_waiter(ls, &lkb);
3299 if (!mstype)
3300 break;
3301
3302 r = lkb->lkb_resource;
3303
3304 log_debug(ls, "recover_waiters_post %x type %d flags %x %s",
3305 lkb->lkb_id, mstype, lkb->lkb_flags, r->res_name);
3306
3307 switch (mstype) {
3308
3309 case DLM_MSG_LOOKUP:
3310 hold_rsb(r);
3311 lock_rsb(r);
3312 _request_lock(r, lkb);
3313 if (is_master(r))
3314 confirm_master(r, 0);
3315 unlock_rsb(r);
3316 put_rsb(r);
3317 break;
3318
3319 case DLM_MSG_REQUEST:
3320 hold_rsb(r);
3321 lock_rsb(r);
3322 _request_lock(r, lkb);
3323 unlock_rsb(r);
3324 put_rsb(r);
3325 break;
3326
3327 case DLM_MSG_CONVERT:
3328 hold_rsb(r);
3329 lock_rsb(r);
3330 _convert_lock(r, lkb);
3331 unlock_rsb(r);
3332 put_rsb(r);
3333 break;
3334
3335 default:
3336 log_error(ls, "recover_waiters_post type %d", mstype);
3337 }
3338 }
3339
3340 return error;
3341}
3342
3343static void purge_queue(struct dlm_rsb *r, struct list_head *queue,
3344 int (*test)(struct dlm_ls *ls, struct dlm_lkb *lkb))
3345{
3346 struct dlm_ls *ls = r->res_ls;
3347 struct dlm_lkb *lkb, *safe;
3348
3349 list_for_each_entry_safe(lkb, safe, queue, lkb_statequeue) {
3350 if (test(ls, lkb)) {
3351 del_lkb(r, lkb);
3352 /* this put should free the lkb */
3353 if (!put_lkb(lkb))
3354 log_error(ls, "purged lkb not released");
3355 }
3356 }
3357}
3358
3359static int purge_dead_test(struct dlm_ls *ls, struct dlm_lkb *lkb)
3360{
3361 return (is_master_copy(lkb) && dlm_is_removed(ls, lkb->lkb_nodeid));
3362}
3363
3364static int purge_mstcpy_test(struct dlm_ls *ls, struct dlm_lkb *lkb)
3365{
3366 return is_master_copy(lkb);
3367}
3368
3369static void purge_dead_locks(struct dlm_rsb *r)
3370{
3371 purge_queue(r, &r->res_grantqueue, &purge_dead_test);
3372 purge_queue(r, &r->res_convertqueue, &purge_dead_test);
3373 purge_queue(r, &r->res_waitqueue, &purge_dead_test);
3374}
3375
3376void dlm_purge_mstcpy_locks(struct dlm_rsb *r)
3377{
3378 purge_queue(r, &r->res_grantqueue, &purge_mstcpy_test);
3379 purge_queue(r, &r->res_convertqueue, &purge_mstcpy_test);
3380 purge_queue(r, &r->res_waitqueue, &purge_mstcpy_test);
3381}
3382
3383/* Get rid of locks held by nodes that are gone. */
3384
3385int dlm_purge_locks(struct dlm_ls *ls)
3386{
3387 struct dlm_rsb *r;
3388
3389 log_debug(ls, "dlm_purge_locks");
3390
3391 down_write(&ls->ls_root_sem);
3392 list_for_each_entry(r, &ls->ls_root_list, res_root_list) {
3393 hold_rsb(r);
3394 lock_rsb(r);
3395 if (is_master(r))
3396 purge_dead_locks(r);
3397 unlock_rsb(r);
3398 unhold_rsb(r);
3399
3400 schedule();
3401 }
3402 up_write(&ls->ls_root_sem);
3403
3404 return 0;
3405}
3406
3407int dlm_grant_after_purge(struct dlm_ls *ls)
3408{
3409 struct dlm_rsb *r;
3410 int i;
3411
3412 for (i = 0; i < ls->ls_rsbtbl_size; i++) {
3413 read_lock(&ls->ls_rsbtbl[i].lock);
3414 list_for_each_entry(r, &ls->ls_rsbtbl[i].list, res_hashchain) {
3415 hold_rsb(r);
3416 lock_rsb(r);
3417 if (is_master(r)) {
3418 grant_pending_locks(r);
3419 confirm_master(r, 0);
3420 }
3421 unlock_rsb(r);
3422 put_rsb(r);
3423 }
3424 read_unlock(&ls->ls_rsbtbl[i].lock);
3425 }
3426
3427 return 0;
3428}
3429
3430static struct dlm_lkb *search_remid_list(struct list_head *head, int nodeid,
3431 uint32_t remid)
3432{
3433 struct dlm_lkb *lkb;
3434
3435 list_for_each_entry(lkb, head, lkb_statequeue) {
3436 if (lkb->lkb_nodeid == nodeid && lkb->lkb_remid == remid)
3437 return lkb;
3438 }
3439 return NULL;
3440}
3441
3442static struct dlm_lkb *search_remid(struct dlm_rsb *r, int nodeid,
3443 uint32_t remid)
3444{
3445 struct dlm_lkb *lkb;
3446
3447 lkb = search_remid_list(&r->res_grantqueue, nodeid, remid);
3448 if (lkb)
3449 return lkb;
3450 lkb = search_remid_list(&r->res_convertqueue, nodeid, remid);
3451 if (lkb)
3452 return lkb;
3453 lkb = search_remid_list(&r->res_waitqueue, nodeid, remid);
3454 if (lkb)
3455 return lkb;
3456 return NULL;
3457}
3458
3459static int receive_rcom_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
3460 struct dlm_rsb *r, struct dlm_rcom *rc)
3461{
3462 struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf;
3463 int lvblen;
3464
3465 lkb->lkb_nodeid = rc->rc_header.h_nodeid;
3466 lkb->lkb_ownpid = rl->rl_ownpid;
3467 lkb->lkb_remid = rl->rl_lkid;
3468 lkb->lkb_exflags = rl->rl_exflags;
3469 lkb->lkb_flags = rl->rl_flags & 0x0000FFFF;
3470 lkb->lkb_flags |= DLM_IFL_MSTCPY;
3471 lkb->lkb_lvbseq = rl->rl_lvbseq;
3472 lkb->lkb_rqmode = rl->rl_rqmode;
3473 lkb->lkb_grmode = rl->rl_grmode;
3474 /* don't set lkb_status because add_lkb wants to itself */
3475
3476 lkb->lkb_bastaddr = (void *) (long) (rl->rl_asts & AST_BAST);
3477 lkb->lkb_astaddr = (void *) (long) (rl->rl_asts & AST_COMP);
3478
3479 if (lkb->lkb_flags & DLM_IFL_RANGE) {
3480 lkb->lkb_range = allocate_range(ls);
3481 if (!lkb->lkb_range)
3482 return -ENOMEM;
3483 memcpy(lkb->lkb_range, rl->rl_range, 4*sizeof(uint64_t));
3484 }
3485
3486 if (lkb->lkb_exflags & DLM_LKF_VALBLK) {
3487 lkb->lkb_lvbptr = allocate_lvb(ls);
3488 if (!lkb->lkb_lvbptr)
3489 return -ENOMEM;
3490 lvblen = rc->rc_header.h_length - sizeof(struct dlm_rcom) -
3491 sizeof(struct rcom_lock);
3492 memcpy(lkb->lkb_lvbptr, rl->rl_lvb, lvblen);
3493 }
3494
3495 /* Conversions between PR and CW (middle modes) need special handling.
3496 The real granted mode of these converting locks cannot be determined
3497 until all locks have been rebuilt on the rsb (recover_conversion) */
3498
3499 if (rl->rl_wait_type == DLM_MSG_CONVERT && middle_conversion(lkb)) {
3500 rl->rl_status = DLM_LKSTS_CONVERT;
3501 lkb->lkb_grmode = DLM_LOCK_IV;
3502 rsb_set_flag(r, RSB_RECOVER_CONVERT);
3503 }
3504
3505 return 0;
3506}
3507
3508/* This lkb may have been recovered in a previous aborted recovery so we need
3509 to check if the rsb already has an lkb with the given remote nodeid/lkid.
3510 If so we just send back a standard reply. If not, we create a new lkb with
3511 the given values and send back our lkid. We send back our lkid by sending
3512 back the rcom_lock struct we got but with the remid field filled in. */
3513
3514int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc)
3515{
3516 struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf;
3517 struct dlm_rsb *r;
3518 struct dlm_lkb *lkb;
3519 int error;
3520
3521 if (rl->rl_parent_lkid) {
3522 error = -EOPNOTSUPP;
3523 goto out;
3524 }
3525
3526 error = find_rsb(ls, rl->rl_name, rl->rl_namelen, R_MASTER, &r);
3527 if (error)
3528 goto out;
3529
3530 lock_rsb(r);
3531
3532 lkb = search_remid(r, rc->rc_header.h_nodeid, rl->rl_lkid);
3533 if (lkb) {
3534 error = -EEXIST;
3535 goto out_remid;
3536 }
3537
3538 error = create_lkb(ls, &lkb);
3539 if (error)
3540 goto out_unlock;
3541
3542 error = receive_rcom_lock_args(ls, lkb, r, rc);
3543 if (error) {
3544 put_lkb(lkb);
3545 goto out_unlock;
3546 }
3547
3548 attach_lkb(r, lkb);
3549 add_lkb(r, lkb, rl->rl_status);
3550 error = 0;
3551
3552 out_remid:
3553 /* this is the new value returned to the lock holder for
3554 saving in its process-copy lkb */
3555 rl->rl_remid = lkb->lkb_id;
3556
3557 out_unlock:
3558 unlock_rsb(r);
3559 put_rsb(r);
3560 out:
3561 if (error)
3562 log_print("recover_master_copy %d %x", error, rl->rl_lkid);
3563 rl->rl_result = error;
3564 return error;
3565}
3566
3567int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc)
3568{
3569 struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf;
3570 struct dlm_rsb *r;
3571 struct dlm_lkb *lkb;
3572 int error;
3573
3574 error = find_lkb(ls, rl->rl_lkid, &lkb);
3575 if (error) {
3576 log_error(ls, "recover_process_copy no lkid %x", rl->rl_lkid);
3577 return error;
3578 }
3579
3580 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
3581
3582 error = rl->rl_result;
3583
3584 r = lkb->lkb_resource;
3585 hold_rsb(r);
3586 lock_rsb(r);
3587
3588 switch (error) {
3589 case -EEXIST:
3590 log_debug(ls, "master copy exists %x", lkb->lkb_id);
3591 /* fall through */
3592 case 0:
3593 lkb->lkb_remid = rl->rl_remid;
3594 break;
3595 default:
3596 log_error(ls, "dlm_recover_process_copy unknown error %d %x",
3597 error, lkb->lkb_id);
3598 }
3599
3600 /* an ack for dlm_recover_locks() which waits for replies from
3601 all the locks it sends to new masters */
3602 dlm_recovered_lock(r);
3603
3604 unlock_rsb(r);
3605 put_rsb(r);
3606 put_lkb(lkb);
3607
3608 return 0;
3609}
3610
diff --git a/fs/dlm/lock.h b/fs/dlm/lock.h
new file mode 100644
index 000000000000..9e6499f773da
--- /dev/null
+++ b/fs/dlm/lock.h
@@ -0,0 +1,50 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13#ifndef __LOCK_DOT_H__
14#define __LOCK_DOT_H__
15
16void dlm_print_rsb(struct dlm_rsb *r);
17int dlm_receive_message(struct dlm_header *hd, int nodeid, int recovery);
18int dlm_modes_compat(int mode1, int mode2);
19int dlm_find_rsb(struct dlm_ls *ls, char *name, int namelen,
20 unsigned int flags, struct dlm_rsb **r_ret);
21void dlm_put_rsb(struct dlm_rsb *r);
22void dlm_hold_rsb(struct dlm_rsb *r);
23int dlm_put_lkb(struct dlm_lkb *lkb);
24void dlm_scan_rsbs(struct dlm_ls *ls);
25
26int dlm_purge_locks(struct dlm_ls *ls);
27void dlm_purge_mstcpy_locks(struct dlm_rsb *r);
28int dlm_grant_after_purge(struct dlm_ls *ls);
29int dlm_recover_waiters_post(struct dlm_ls *ls);
30void dlm_recover_waiters_pre(struct dlm_ls *ls);
31int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc);
32int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc);
33
34static inline int is_master(struct dlm_rsb *r)
35{
36 return !r->res_nodeid;
37}
38
39static inline void lock_rsb(struct dlm_rsb *r)
40{
41 down(&r->res_sem);
42}
43
44static inline void unlock_rsb(struct dlm_rsb *r)
45{
46 up(&r->res_sem);
47}
48
49#endif
50
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
new file mode 100644
index 000000000000..fee4659b6582
--- /dev/null
+++ b/fs/dlm/lockspace.c
@@ -0,0 +1,666 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#include "dlm_internal.h"
15#include "lockspace.h"
16#include "member.h"
17#include "recoverd.h"
18#include "ast.h"
19#include "dir.h"
20#include "lowcomms.h"
21#include "config.h"
22#include "memory.h"
23#include "lock.h"
24
25#ifdef CONFIG_DLM_DEBUG
26int dlm_create_debug_file(struct dlm_ls *ls);
27void dlm_delete_debug_file(struct dlm_ls *ls);
28#else
29static inline int dlm_create_debug_file(struct dlm_ls *ls) { return 0; }
30static inline void dlm_delete_debug_file(struct dlm_ls *ls) { }
31#endif
32
33static int ls_count;
34static struct semaphore ls_lock;
35static struct list_head lslist;
36static spinlock_t lslist_lock;
37static struct task_struct * scand_task;
38
39
40static ssize_t dlm_control_store(struct dlm_ls *ls, const char *buf, size_t len)
41{
42 ssize_t ret = len;
43 int n = simple_strtol(buf, NULL, 0);
44
45 switch (n) {
46 case 0:
47 dlm_ls_stop(ls);
48 break;
49 case 1:
50 dlm_ls_start(ls);
51 break;
52 default:
53 ret = -EINVAL;
54 }
55 return ret;
56}
57
58static ssize_t dlm_event_store(struct dlm_ls *ls, const char *buf, size_t len)
59{
60 ls->ls_uevent_result = simple_strtol(buf, NULL, 0);
61 set_bit(LSFL_UEVENT_WAIT, &ls->ls_flags);
62 wake_up(&ls->ls_uevent_wait);
63 return len;
64}
65
66static ssize_t dlm_id_show(struct dlm_ls *ls, char *buf)
67{
68 return sprintf(buf, "%u\n", ls->ls_global_id);
69}
70
71static ssize_t dlm_id_store(struct dlm_ls *ls, const char *buf, size_t len)
72{
73 ls->ls_global_id = simple_strtoul(buf, NULL, 0);
74 return len;
75}
76
77struct dlm_attr {
78 struct attribute attr;
79 ssize_t (*show)(struct dlm_ls *, char *);
80 ssize_t (*store)(struct dlm_ls *, const char *, size_t);
81};
82
83static struct dlm_attr dlm_attr_control = {
84 .attr = {.name = "control", .mode = S_IWUSR},
85 .store = dlm_control_store
86};
87
88static struct dlm_attr dlm_attr_event = {
89 .attr = {.name = "event_done", .mode = S_IWUSR},
90 .store = dlm_event_store
91};
92
93static struct dlm_attr dlm_attr_id = {
94 .attr = {.name = "id", .mode = S_IRUGO | S_IWUSR},
95 .show = dlm_id_show,
96 .store = dlm_id_store
97};
98
99static struct attribute *dlm_attrs[] = {
100 &dlm_attr_control.attr,
101 &dlm_attr_event.attr,
102 &dlm_attr_id.attr,
103 NULL,
104};
105
106static ssize_t dlm_attr_show(struct kobject *kobj, struct attribute *attr,
107 char *buf)
108{
109 struct dlm_ls *ls = container_of(kobj, struct dlm_ls, ls_kobj);
110 struct dlm_attr *a = container_of(attr, struct dlm_attr, attr);
111 return a->show ? a->show(ls, buf) : 0;
112}
113
114static ssize_t dlm_attr_store(struct kobject *kobj, struct attribute *attr,
115 const char *buf, size_t len)
116{
117 struct dlm_ls *ls = container_of(kobj, struct dlm_ls, ls_kobj);
118 struct dlm_attr *a = container_of(attr, struct dlm_attr, attr);
119 return a->store ? a->store(ls, buf, len) : len;
120}
121
122static struct sysfs_ops dlm_attr_ops = {
123 .show = dlm_attr_show,
124 .store = dlm_attr_store,
125};
126
127static struct kobj_type dlm_ktype = {
128 .default_attrs = dlm_attrs,
129 .sysfs_ops = &dlm_attr_ops,
130};
131
132static struct kset dlm_kset = {
133 .subsys = &kernel_subsys,
134 .kobj = {.name = "dlm",},
135 .ktype = &dlm_ktype,
136};
137
138static int kobject_setup(struct dlm_ls *ls)
139{
140 char lsname[DLM_LOCKSPACE_LEN];
141 int error;
142
143 memset(lsname, 0, DLM_LOCKSPACE_LEN);
144 snprintf(lsname, DLM_LOCKSPACE_LEN, "%s", ls->ls_name);
145
146 error = kobject_set_name(&ls->ls_kobj, "%s", lsname);
147 if (error)
148 return error;
149
150 ls->ls_kobj.kset = &dlm_kset;
151 ls->ls_kobj.ktype = &dlm_ktype;
152 return 0;
153}
154
155static int do_uevent(struct dlm_ls *ls, int in)
156{
157 int error;
158
159 if (in)
160 kobject_uevent(&ls->ls_kobj, KOBJ_ONLINE);
161 else
162 kobject_uevent(&ls->ls_kobj, KOBJ_OFFLINE);
163
164 error = wait_event_interruptible(ls->ls_uevent_wait,
165 test_and_clear_bit(LSFL_UEVENT_WAIT, &ls->ls_flags));
166 if (error)
167 goto out;
168
169 error = ls->ls_uevent_result;
170 out:
171 return error;
172}
173
174
175int dlm_lockspace_init(void)
176{
177 int error;
178
179 ls_count = 0;
180 init_MUTEX(&ls_lock);
181 INIT_LIST_HEAD(&lslist);
182 spin_lock_init(&lslist_lock);
183
184 error = kset_register(&dlm_kset);
185 if (error)
186 printk("dlm_lockspace_init: cannot register kset %d\n", error);
187 return error;
188}
189
190void dlm_lockspace_exit(void)
191{
192 kset_unregister(&dlm_kset);
193}
194
195static int dlm_scand(void *data)
196{
197 struct dlm_ls *ls;
198
199 while (!kthread_should_stop()) {
200 list_for_each_entry(ls, &lslist, ls_list)
201 dlm_scan_rsbs(ls);
202 schedule_timeout_interruptible(dlm_config.scan_secs * HZ);
203 }
204 return 0;
205}
206
207static int dlm_scand_start(void)
208{
209 struct task_struct *p;
210 int error = 0;
211
212 p = kthread_run(dlm_scand, NULL, "dlm_scand");
213 if (IS_ERR(p))
214 error = PTR_ERR(p);
215 else
216 scand_task = p;
217 return error;
218}
219
220static void dlm_scand_stop(void)
221{
222 kthread_stop(scand_task);
223}
224
225static struct dlm_ls *dlm_find_lockspace_name(char *name, int namelen)
226{
227 struct dlm_ls *ls;
228
229 spin_lock(&lslist_lock);
230
231 list_for_each_entry(ls, &lslist, ls_list) {
232 if (ls->ls_namelen == namelen &&
233 memcmp(ls->ls_name, name, namelen) == 0)
234 goto out;
235 }
236 ls = NULL;
237 out:
238 spin_unlock(&lslist_lock);
239 return ls;
240}
241
242struct dlm_ls *dlm_find_lockspace_global(uint32_t id)
243{
244 struct dlm_ls *ls;
245
246 spin_lock(&lslist_lock);
247
248 list_for_each_entry(ls, &lslist, ls_list) {
249 if (ls->ls_global_id == id) {
250 ls->ls_count++;
251 goto out;
252 }
253 }
254 ls = NULL;
255 out:
256 spin_unlock(&lslist_lock);
257 return ls;
258}
259
260struct dlm_ls *dlm_find_lockspace_local(void *id)
261{
262 struct dlm_ls *ls = id;
263
264 spin_lock(&lslist_lock);
265 ls->ls_count++;
266 spin_unlock(&lslist_lock);
267 return ls;
268}
269
270void dlm_put_lockspace(struct dlm_ls *ls)
271{
272 spin_lock(&lslist_lock);
273 ls->ls_count--;
274 spin_unlock(&lslist_lock);
275}
276
277static void remove_lockspace(struct dlm_ls *ls)
278{
279 for (;;) {
280 spin_lock(&lslist_lock);
281 if (ls->ls_count == 0) {
282 list_del(&ls->ls_list);
283 spin_unlock(&lslist_lock);
284 return;
285 }
286 spin_unlock(&lslist_lock);
287 ssleep(1);
288 }
289}
290
291static int threads_start(void)
292{
293 int error;
294
295 /* Thread which process lock requests for all lockspace's */
296 error = dlm_astd_start();
297 if (error) {
298 log_print("cannot start dlm_astd thread %d", error);
299 goto fail;
300 }
301
302 error = dlm_scand_start();
303 if (error) {
304 log_print("cannot start dlm_scand thread %d", error);
305 goto astd_fail;
306 }
307
308 /* Thread for sending/receiving messages for all lockspace's */
309 error = dlm_lowcomms_start();
310 if (error) {
311 log_print("cannot start dlm lowcomms %d", error);
312 goto scand_fail;
313 }
314
315 return 0;
316
317 scand_fail:
318 dlm_scand_stop();
319 astd_fail:
320 dlm_astd_stop();
321 fail:
322 return error;
323}
324
325static void threads_stop(void)
326{
327 dlm_scand_stop();
328 dlm_lowcomms_stop();
329 dlm_astd_stop();
330}
331
332static int new_lockspace(char *name, int namelen, void **lockspace,
333 uint32_t flags, int lvblen)
334{
335 struct dlm_ls *ls;
336 int i, size, error = -ENOMEM;
337
338 if (namelen > DLM_LOCKSPACE_LEN)
339 return -EINVAL;
340
341 if (!lvblen || (lvblen % 8))
342 return -EINVAL;
343
344 if (!try_module_get(THIS_MODULE))
345 return -EINVAL;
346
347 ls = dlm_find_lockspace_name(name, namelen);
348 if (ls) {
349 *lockspace = ls;
350 module_put(THIS_MODULE);
351 return -EEXIST;
352 }
353
354 ls = kmalloc(sizeof(struct dlm_ls) + namelen, GFP_KERNEL);
355 if (!ls)
356 goto out;
357 memset(ls, 0, sizeof(struct dlm_ls) + namelen);
358 memcpy(ls->ls_name, name, namelen);
359 ls->ls_namelen = namelen;
360 ls->ls_exflags = flags;
361 ls->ls_lvblen = lvblen;
362 ls->ls_count = 0;
363 ls->ls_flags = 0;
364
365 size = dlm_config.rsbtbl_size;
366 ls->ls_rsbtbl_size = size;
367
368 ls->ls_rsbtbl = kmalloc(sizeof(struct dlm_rsbtable) * size, GFP_KERNEL);
369 if (!ls->ls_rsbtbl)
370 goto out_lsfree;
371 for (i = 0; i < size; i++) {
372 INIT_LIST_HEAD(&ls->ls_rsbtbl[i].list);
373 INIT_LIST_HEAD(&ls->ls_rsbtbl[i].toss);
374 rwlock_init(&ls->ls_rsbtbl[i].lock);
375 }
376
377 size = dlm_config.lkbtbl_size;
378 ls->ls_lkbtbl_size = size;
379
380 ls->ls_lkbtbl = kmalloc(sizeof(struct dlm_lkbtable) * size, GFP_KERNEL);
381 if (!ls->ls_lkbtbl)
382 goto out_rsbfree;
383 for (i = 0; i < size; i++) {
384 INIT_LIST_HEAD(&ls->ls_lkbtbl[i].list);
385 rwlock_init(&ls->ls_lkbtbl[i].lock);
386 ls->ls_lkbtbl[i].counter = 1;
387 }
388
389 size = dlm_config.dirtbl_size;
390 ls->ls_dirtbl_size = size;
391
392 ls->ls_dirtbl = kmalloc(sizeof(struct dlm_dirtable) * size, GFP_KERNEL);
393 if (!ls->ls_dirtbl)
394 goto out_lkbfree;
395 for (i = 0; i < size; i++) {
396 INIT_LIST_HEAD(&ls->ls_dirtbl[i].list);
397 rwlock_init(&ls->ls_dirtbl[i].lock);
398 }
399
400 INIT_LIST_HEAD(&ls->ls_waiters);
401 init_MUTEX(&ls->ls_waiters_sem);
402
403 INIT_LIST_HEAD(&ls->ls_nodes);
404 INIT_LIST_HEAD(&ls->ls_nodes_gone);
405 ls->ls_num_nodes = 0;
406 ls->ls_low_nodeid = 0;
407 ls->ls_total_weight = 0;
408 ls->ls_node_array = NULL;
409
410 memset(&ls->ls_stub_rsb, 0, sizeof(struct dlm_rsb));
411 ls->ls_stub_rsb.res_ls = ls;
412
413 ls->ls_debug_dentry = NULL;
414
415 init_waitqueue_head(&ls->ls_uevent_wait);
416 ls->ls_uevent_result = 0;
417
418 ls->ls_recoverd_task = NULL;
419 init_MUTEX(&ls->ls_recoverd_active);
420 spin_lock_init(&ls->ls_recover_lock);
421 ls->ls_recover_status = 0;
422 ls->ls_recover_seq = 0;
423 ls->ls_recover_args = NULL;
424 init_rwsem(&ls->ls_in_recovery);
425 INIT_LIST_HEAD(&ls->ls_requestqueue);
426 init_MUTEX(&ls->ls_requestqueue_lock);
427
428 ls->ls_recover_buf = kmalloc(dlm_config.buffer_size, GFP_KERNEL);
429 if (!ls->ls_recover_buf)
430 goto out_dirfree;
431
432 INIT_LIST_HEAD(&ls->ls_recover_list);
433 spin_lock_init(&ls->ls_recover_list_lock);
434 ls->ls_recover_list_count = 0;
435 init_waitqueue_head(&ls->ls_wait_general);
436 INIT_LIST_HEAD(&ls->ls_root_list);
437 init_rwsem(&ls->ls_root_sem);
438
439 down_write(&ls->ls_in_recovery);
440
441 error = dlm_recoverd_start(ls);
442 if (error) {
443 log_error(ls, "can't start dlm_recoverd %d", error);
444 goto out_rcomfree;
445 }
446
447 spin_lock(&lslist_lock);
448 list_add(&ls->ls_list, &lslist);
449 spin_unlock(&lslist_lock);
450
451 dlm_create_debug_file(ls);
452
453 error = kobject_setup(ls);
454 if (error)
455 goto out_del;
456
457 error = kobject_register(&ls->ls_kobj);
458 if (error)
459 goto out_del;
460
461 error = do_uevent(ls, 1);
462 if (error)
463 goto out_unreg;
464
465 *lockspace = ls;
466 return 0;
467
468 out_unreg:
469 kobject_unregister(&ls->ls_kobj);
470 out_del:
471 dlm_delete_debug_file(ls);
472 spin_lock(&lslist_lock);
473 list_del(&ls->ls_list);
474 spin_unlock(&lslist_lock);
475 dlm_recoverd_stop(ls);
476 out_rcomfree:
477 kfree(ls->ls_recover_buf);
478 out_dirfree:
479 kfree(ls->ls_dirtbl);
480 out_lkbfree:
481 kfree(ls->ls_lkbtbl);
482 out_rsbfree:
483 kfree(ls->ls_rsbtbl);
484 out_lsfree:
485 kfree(ls);
486 out:
487 module_put(THIS_MODULE);
488 return error;
489}
490
491int dlm_new_lockspace(char *name, int namelen, void **lockspace,
492 uint32_t flags, int lvblen)
493{
494 int error = 0;
495
496 down(&ls_lock);
497 if (!ls_count)
498 error = threads_start();
499 if (error)
500 goto out;
501
502 error = new_lockspace(name, namelen, lockspace, flags, lvblen);
503 if (!error)
504 ls_count++;
505 out:
506 up(&ls_lock);
507 return error;
508}
509
510/* Return 1 if the lockspace still has active remote locks,
511 * 2 if the lockspace still has active local locks.
512 */
513static int lockspace_busy(struct dlm_ls *ls)
514{
515 int i, lkb_found = 0;
516 struct dlm_lkb *lkb;
517
518 /* NOTE: We check the lockidtbl here rather than the resource table.
519 This is because there may be LKBs queued as ASTs that have been
520 unlinked from their RSBs and are pending deletion once the AST has
521 been delivered */
522
523 for (i = 0; i < ls->ls_lkbtbl_size; i++) {
524 read_lock(&ls->ls_lkbtbl[i].lock);
525 if (!list_empty(&ls->ls_lkbtbl[i].list)) {
526 lkb_found = 1;
527 list_for_each_entry(lkb, &ls->ls_lkbtbl[i].list,
528 lkb_idtbl_list) {
529 if (!lkb->lkb_nodeid) {
530 read_unlock(&ls->ls_lkbtbl[i].lock);
531 return 2;
532 }
533 }
534 }
535 read_unlock(&ls->ls_lkbtbl[i].lock);
536 }
537 return lkb_found;
538}
539
540static int release_lockspace(struct dlm_ls *ls, int force)
541{
542 struct dlm_lkb *lkb;
543 struct dlm_rsb *rsb;
544 struct list_head *head;
545 int i;
546 int busy = lockspace_busy(ls);
547
548 if (busy > force)
549 return -EBUSY;
550
551 if (force < 3)
552 do_uevent(ls, 0);
553
554 dlm_recoverd_stop(ls);
555
556 remove_lockspace(ls);
557
558 dlm_delete_debug_file(ls);
559
560 dlm_astd_suspend();
561
562 kfree(ls->ls_recover_buf);
563
564 /*
565 * Free direntry structs.
566 */
567
568 dlm_dir_clear(ls);
569 kfree(ls->ls_dirtbl);
570
571 /*
572 * Free all lkb's on lkbtbl[] lists.
573 */
574
575 for (i = 0; i < ls->ls_lkbtbl_size; i++) {
576 head = &ls->ls_lkbtbl[i].list;
577 while (!list_empty(head)) {
578 lkb = list_entry(head->next, struct dlm_lkb,
579 lkb_idtbl_list);
580
581 list_del(&lkb->lkb_idtbl_list);
582
583 dlm_del_ast(lkb);
584
585 if (lkb->lkb_lvbptr && lkb->lkb_flags & DLM_IFL_MSTCPY)
586 free_lvb(lkb->lkb_lvbptr);
587
588 free_lkb(lkb);
589 }
590 }
591 dlm_astd_resume();
592
593 kfree(ls->ls_lkbtbl);
594
595 /*
596 * Free all rsb's on rsbtbl[] lists
597 */
598
599 for (i = 0; i < ls->ls_rsbtbl_size; i++) {
600 head = &ls->ls_rsbtbl[i].list;
601 while (!list_empty(head)) {
602 rsb = list_entry(head->next, struct dlm_rsb,
603 res_hashchain);
604
605 list_del(&rsb->res_hashchain);
606 free_rsb(rsb);
607 }
608
609 head = &ls->ls_rsbtbl[i].toss;
610 while (!list_empty(head)) {
611 rsb = list_entry(head->next, struct dlm_rsb,
612 res_hashchain);
613 list_del(&rsb->res_hashchain);
614 free_rsb(rsb);
615 }
616 }
617
618 kfree(ls->ls_rsbtbl);
619
620 /*
621 * Free structures on any other lists
622 */
623
624 kfree(ls->ls_recover_args);
625 dlm_clear_free_entries(ls);
626 dlm_clear_members(ls);
627 dlm_clear_members_gone(ls);
628 kfree(ls->ls_node_array);
629 kobject_unregister(&ls->ls_kobj);
630 kfree(ls);
631
632 down(&ls_lock);
633 ls_count--;
634 if (!ls_count)
635 threads_stop();
636 up(&ls_lock);
637
638 module_put(THIS_MODULE);
639 return 0;
640}
641
642/*
643 * Called when a system has released all its locks and is not going to use the
644 * lockspace any longer. We free everything we're managing for this lockspace.
645 * Remaining nodes will go through the recovery process as if we'd died. The
646 * lockspace must continue to function as usual, participating in recoveries,
647 * until this returns.
648 *
649 * Force has 4 possible values:
650 * 0 - don't destroy locksapce if it has any LKBs
651 * 1 - destroy lockspace if it has remote LKBs but not if it has local LKBs
652 * 2 - destroy lockspace regardless of LKBs
653 * 3 - destroy lockspace as part of a forced shutdown
654 */
655
656int dlm_release_lockspace(void *lockspace, int force)
657{
658 struct dlm_ls *ls;
659
660 ls = dlm_find_lockspace_local(lockspace);
661 if (!ls)
662 return -EINVAL;
663 dlm_put_lockspace(ls);
664 return release_lockspace(ls, force);
665}
666
diff --git a/fs/dlm/lockspace.h b/fs/dlm/lockspace.h
new file mode 100644
index 000000000000..17bd3ba863a9
--- /dev/null
+++ b/fs/dlm/lockspace.h
@@ -0,0 +1,24 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#ifndef __LOCKSPACE_DOT_H__
15#define __LOCKSPACE_DOT_H__
16
17int dlm_lockspace_init(void);
18void dlm_lockspace_exit(void);
19struct dlm_ls *dlm_find_lockspace_global(uint32_t id);
20struct dlm_ls *dlm_find_lockspace_local(void *id);
21void dlm_put_lockspace(struct dlm_ls *ls);
22
23#endif /* __LOCKSPACE_DOT_H__ */
24
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
new file mode 100644
index 000000000000..09b0124f7fc4
--- /dev/null
+++ b/fs/dlm/lowcomms.c
@@ -0,0 +1,1218 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14/*
15 * lowcomms.c
16 *
17 * This is the "low-level" comms layer.
18 *
19 * It is responsible for sending/receiving messages
20 * from other nodes in the cluster.
21 *
22 * Cluster nodes are referred to by their nodeids. nodeids are
23 * simply 32 bit numbers to the locking module - if they need to
24 * be expanded for the cluster infrastructure then that is it's
25 * responsibility. It is this layer's
26 * responsibility to resolve these into IP address or
27 * whatever it needs for inter-node communication.
28 *
29 * The comms level is two kernel threads that deal mainly with
30 * the receiving of messages from other nodes and passing them
31 * up to the mid-level comms layer (which understands the
32 * message format) for execution by the locking core, and
33 * a send thread which does all the setting up of connections
34 * to remote nodes and the sending of data. Threads are not allowed
35 * to send their own data because it may cause them to wait in times
36 * of high load. Also, this way, the sending thread can collect together
37 * messages bound for one node and send them in one block.
38 *
39 * I don't see any problem with the recv thread executing the locking
40 * code on behalf of remote processes as the locking code is
41 * short, efficient and never (well, hardly ever) waits.
42 *
43 */
44
45#include <asm/ioctls.h>
46#include <net/sock.h>
47#include <net/tcp.h>
48#include <net/sctp/user.h>
49#include <linux/pagemap.h>
50#include <linux/socket.h>
51#include <linux/idr.h>
52
53#include "dlm_internal.h"
54#include "lowcomms.h"
55#include "config.h"
56#include "midcomms.h"
57
58static struct sockaddr_storage *local_addr[DLM_MAX_ADDR_COUNT];
59static int local_count;
60static int local_nodeid;
61
62/* One of these per connected node */
63
64#define NI_INIT_PENDING 1
65#define NI_WRITE_PENDING 2
66
67struct nodeinfo {
68 spinlock_t lock;
69 sctp_assoc_t assoc_id;
70 unsigned long flags;
71 struct list_head write_list; /* nodes with pending writes */
72 struct list_head writequeue; /* outgoing writequeue_entries */
73 spinlock_t writequeue_lock;
74 int nodeid;
75};
76
77static DEFINE_IDR(nodeinfo_idr);
78static struct rw_semaphore nodeinfo_lock;
79static int max_nodeid;
80
81struct cbuf {
82 unsigned base;
83 unsigned len;
84 unsigned mask;
85};
86
87/* Just the one of these, now. But this struct keeps
88 the connection-specific variables together */
89
90#define CF_READ_PENDING 1
91
92struct connection {
93 struct socket *sock;
94 unsigned long flags;
95 struct page *rx_page;
96 atomic_t waiting_requests;
97 struct cbuf cb;
98 int eagain_flag;
99};
100
101/* An entry waiting to be sent */
102
103struct writequeue_entry {
104 struct list_head list;
105 struct page *page;
106 int offset;
107 int len;
108 int end;
109 int users;
110 struct nodeinfo *ni;
111};
112
113#define CBUF_ADD(cb, n) do { (cb)->len += n; } while(0)
114#define CBUF_EMPTY(cb) ((cb)->len == 0)
115#define CBUF_MAY_ADD(cb, n) (((cb)->len + (n)) < ((cb)->mask + 1))
116#define CBUF_DATA(cb) (((cb)->base + (cb)->len) & (cb)->mask)
117
118#define CBUF_INIT(cb, size) \
119do { \
120 (cb)->base = (cb)->len = 0; \
121 (cb)->mask = ((size)-1); \
122} while(0)
123
124#define CBUF_EAT(cb, n) \
125do { \
126 (cb)->len -= (n); \
127 (cb)->base += (n); \
128 (cb)->base &= (cb)->mask; \
129} while(0)
130
131
132/* List of nodes which have writes pending */
133static struct list_head write_nodes;
134static spinlock_t write_nodes_lock;
135
136/* Maximum number of incoming messages to process before
137 * doing a schedule()
138 */
139#define MAX_RX_MSG_COUNT 25
140
141/* Manage daemons */
142static struct task_struct *recv_task;
143static struct task_struct *send_task;
144static wait_queue_head_t lowcomms_recv_wait;
145static atomic_t accepting;
146
147/* The SCTP connection */
148static struct connection sctp_con;
149
150
151static int nodeid_to_addr(int nodeid, struct sockaddr *retaddr)
152{
153 struct sockaddr_storage addr;
154 int error;
155
156 if (!local_count)
157 return -1;
158
159 error = dlm_nodeid_to_addr(nodeid, &addr);
160 if (error)
161 return error;
162
163 if (local_addr[0]->ss_family == AF_INET) {
164 struct sockaddr_in *in4 = (struct sockaddr_in *) &addr;
165 struct sockaddr_in *ret4 = (struct sockaddr_in *) retaddr;
166 ret4->sin_addr.s_addr = in4->sin_addr.s_addr;
167 } else {
168 struct sockaddr_in6 *in6 = (struct sockaddr_in6 *) &addr;
169 struct sockaddr_in6 *ret6 = (struct sockaddr_in6 *) retaddr;
170 memcpy(&ret6->sin6_addr, &in6->sin6_addr,
171 sizeof(in6->sin6_addr));
172 }
173
174 return 0;
175}
176
177static struct nodeinfo *nodeid2nodeinfo(int nodeid, int alloc)
178{
179 struct nodeinfo *ni;
180 int r;
181 int n;
182
183 down_read(&nodeinfo_lock);
184 ni = idr_find(&nodeinfo_idr, nodeid);
185 up_read(&nodeinfo_lock);
186
187 if (!ni && alloc) {
188 down_write(&nodeinfo_lock);
189
190 ni = idr_find(&nodeinfo_idr, nodeid);
191 if (ni)
192 goto out_up;
193
194 r = idr_pre_get(&nodeinfo_idr, alloc);
195 if (!r)
196 goto out_up;
197
198 ni = kmalloc(sizeof(struct nodeinfo), alloc);
199 if (!ni)
200 goto out_up;
201
202 r = idr_get_new_above(&nodeinfo_idr, ni, nodeid, &n);
203 if (r) {
204 kfree(ni);
205 ni = NULL;
206 goto out_up;
207 }
208 if (n != nodeid) {
209 idr_remove(&nodeinfo_idr, n);
210 kfree(ni);
211 ni = NULL;
212 goto out_up;
213 }
214 memset(ni, 0, sizeof(struct nodeinfo));
215 spin_lock_init(&ni->lock);
216 INIT_LIST_HEAD(&ni->writequeue);
217 spin_lock_init(&ni->writequeue_lock);
218 ni->nodeid = nodeid;
219
220 if (nodeid > max_nodeid)
221 max_nodeid = nodeid;
222 out_up:
223 up_write(&nodeinfo_lock);
224 }
225
226 return ni;
227}
228
229/* Don't call this too often... */
230static struct nodeinfo *assoc2nodeinfo(sctp_assoc_t assoc)
231{
232 int i;
233 struct nodeinfo *ni;
234
235 for (i=1; i<=max_nodeid; i++) {
236 ni = nodeid2nodeinfo(i, 0);
237 if (ni && ni->assoc_id == assoc)
238 return ni;
239 }
240 return NULL;
241}
242
243/* Data or notification available on socket */
244static void lowcomms_data_ready(struct sock *sk, int count_unused)
245{
246 atomic_inc(&sctp_con.waiting_requests);
247 if (test_and_set_bit(CF_READ_PENDING, &sctp_con.flags))
248 return;
249
250 wake_up_interruptible(&lowcomms_recv_wait);
251}
252
253
254/* Add the port number to an IP6 or 4 sockaddr and return the address length.
255 Also padd out the struct with zeros to make comparisons meaningful */
256
257static void make_sockaddr(struct sockaddr_storage *saddr, uint16_t port,
258 int *addr_len)
259{
260 struct sockaddr_in *local4_addr;
261 struct sockaddr_in6 *local6_addr;
262
263 if (!local_count)
264 return;
265
266 if (!port) {
267 if (local_addr[0]->ss_family == AF_INET) {
268 local4_addr = (struct sockaddr_in *)local_addr[0];
269 port = be16_to_cpu(local4_addr->sin_port);
270 } else {
271 local6_addr = (struct sockaddr_in6 *)local_addr[0];
272 port = be16_to_cpu(local6_addr->sin6_port);
273 }
274 }
275
276 saddr->ss_family = local_addr[0]->ss_family;
277 if (local_addr[0]->ss_family == AF_INET) {
278 struct sockaddr_in *in4_addr = (struct sockaddr_in *)saddr;
279 in4_addr->sin_port = cpu_to_be16(port);
280 memset(&in4_addr->sin_zero, 0, sizeof(in4_addr->sin_zero));
281 memset(in4_addr+1, 0, sizeof(struct sockaddr_storage) -
282 sizeof(struct sockaddr_in));
283 *addr_len = sizeof(struct sockaddr_in);
284 } else {
285 struct sockaddr_in6 *in6_addr = (struct sockaddr_in6 *)saddr;
286 in6_addr->sin6_port = cpu_to_be16(port);
287 memset(in6_addr+1, 0, sizeof(struct sockaddr_storage) -
288 sizeof(struct sockaddr_in6));
289 *addr_len = sizeof(struct sockaddr_in6);
290 }
291}
292
293/* Close the connection and tidy up */
294static void close_connection(void)
295{
296 if (sctp_con.sock) {
297 sock_release(sctp_con.sock);
298 sctp_con.sock = NULL;
299 }
300
301 if (sctp_con.rx_page) {
302 __free_page(sctp_con.rx_page);
303 sctp_con.rx_page = NULL;
304 }
305}
306
307/* We only send shutdown messages to nodes that are not part of the cluster */
308static void send_shutdown(sctp_assoc_t associd)
309{
310 static char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
311 struct msghdr outmessage;
312 struct cmsghdr *cmsg;
313 struct sctp_sndrcvinfo *sinfo;
314 int ret;
315
316 outmessage.msg_name = NULL;
317 outmessage.msg_namelen = 0;
318 outmessage.msg_control = outcmsg;
319 outmessage.msg_controllen = sizeof(outcmsg);
320 outmessage.msg_flags = MSG_EOR;
321
322 cmsg = CMSG_FIRSTHDR(&outmessage);
323 cmsg->cmsg_level = IPPROTO_SCTP;
324 cmsg->cmsg_type = SCTP_SNDRCV;
325 cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
326 outmessage.msg_controllen = cmsg->cmsg_len;
327 sinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg);
328 memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo));
329
330 sinfo->sinfo_flags |= MSG_EOF;
331 sinfo->sinfo_assoc_id = associd;
332
333 ret = kernel_sendmsg(sctp_con.sock, &outmessage, NULL, 0, 0);
334
335 if (ret != 0)
336 log_print("send EOF to node failed: %d", ret);
337}
338
339
340/* INIT failed but we don't know which node...
341 restart INIT on all pending nodes */
342static void init_failed(void)
343{
344 int i;
345 struct nodeinfo *ni;
346
347 for (i=1; i<=max_nodeid; i++) {
348 ni = nodeid2nodeinfo(i, 0);
349 if (!ni)
350 continue;
351
352 if (test_and_clear_bit(NI_INIT_PENDING, &ni->flags)) {
353 ni->assoc_id = 0;
354 if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
355 spin_lock_bh(&write_nodes_lock);
356 list_add_tail(&ni->write_list, &write_nodes);
357 spin_unlock_bh(&write_nodes_lock);
358 }
359 }
360 }
361 wake_up_process(send_task);
362}
363
364/* Something happened to an association */
365static void process_sctp_notification(struct msghdr *msg, char *buf)
366{
367 union sctp_notification *sn = (union sctp_notification *)buf;
368
369 if (sn->sn_header.sn_type == SCTP_ASSOC_CHANGE) {
370 switch (sn->sn_assoc_change.sac_state) {
371
372 case SCTP_COMM_UP:
373 case SCTP_RESTART:
374 {
375 /* Check that the new node is in the lockspace */
376 struct sctp_prim prim;
377 mm_segment_t fs;
378 int nodeid;
379 int prim_len, ret;
380 int addr_len;
381 struct nodeinfo *ni;
382
383 /* This seems to happen when we received a connection
384 * too early... or something... anyway, it happens but
385 * we always seem to get a real message too, see
386 * receive_from_sock */
387
388 if ((int)sn->sn_assoc_change.sac_assoc_id <= 0) {
389 log_print("COMM_UP for invalid assoc ID %d",
390 (int)sn->sn_assoc_change.sac_assoc_id);
391 init_failed();
392 return;
393 }
394 memset(&prim, 0, sizeof(struct sctp_prim));
395 prim_len = sizeof(struct sctp_prim);
396 prim.ssp_assoc_id = sn->sn_assoc_change.sac_assoc_id;
397
398 fs = get_fs();
399 set_fs(get_ds());
400 ret = sctp_con.sock->ops->getsockopt(sctp_con.sock,
401 IPPROTO_SCTP, SCTP_PRIMARY_ADDR,
402 (char*)&prim, &prim_len);
403 set_fs(fs);
404 if (ret < 0) {
405 struct nodeinfo *ni;
406
407 log_print("getsockopt/sctp_primary_addr on "
408 "new assoc %d failed : %d",
409 (int)sn->sn_assoc_change.sac_assoc_id, ret);
410
411 /* Retry INIT later */
412 ni = assoc2nodeinfo(sn->sn_assoc_change.sac_assoc_id);
413 if (ni)
414 clear_bit(NI_INIT_PENDING, &ni->flags);
415 return;
416 }
417 make_sockaddr(&prim.ssp_addr, 0, &addr_len);
418 if (dlm_addr_to_nodeid(&prim.ssp_addr, &nodeid)) {
419 log_print("reject connect from unknown addr");
420 send_shutdown(prim.ssp_assoc_id);
421 return;
422 }
423
424 ni = nodeid2nodeinfo(nodeid, GFP_KERNEL);
425 if (!ni)
426 return;
427
428 /* Save the assoc ID */
429 spin_lock(&ni->lock);
430 ni->assoc_id = sn->sn_assoc_change.sac_assoc_id;
431 spin_unlock(&ni->lock);
432
433 log_print("got new/restarted association %d nodeid %d",
434 (int)sn->sn_assoc_change.sac_assoc_id, nodeid);
435
436 /* Send any pending writes */
437 clear_bit(NI_INIT_PENDING, &ni->flags);
438 if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
439 spin_lock_bh(&write_nodes_lock);
440 list_add_tail(&ni->write_list, &write_nodes);
441 spin_unlock_bh(&write_nodes_lock);
442 }
443 wake_up_process(send_task);
444 }
445 break;
446
447 case SCTP_COMM_LOST:
448 case SCTP_SHUTDOWN_COMP:
449 {
450 struct nodeinfo *ni;
451
452 ni = assoc2nodeinfo(sn->sn_assoc_change.sac_assoc_id);
453 if (ni) {
454 spin_lock(&ni->lock);
455 ni->assoc_id = 0;
456 spin_unlock(&ni->lock);
457 }
458 }
459 break;
460
461 /* We don't know which INIT failed, so clear the PENDING flags
462 * on them all. if assoc_id is zero then it will then try
463 * again */
464
465 case SCTP_CANT_STR_ASSOC:
466 {
467 log_print("Can't start SCTP association - retrying");
468 init_failed();
469 }
470 break;
471
472 default:
473 log_print("unexpected SCTP assoc change id=%d state=%d",
474 (int)sn->sn_assoc_change.sac_assoc_id,
475 sn->sn_assoc_change.sac_state);
476 }
477 }
478}
479
480/* Data received from remote end */
481static int receive_from_sock(void)
482{
483 int ret = 0;
484 struct msghdr msg;
485 struct kvec iov[2];
486 unsigned len;
487 int r;
488 struct sctp_sndrcvinfo *sinfo;
489 struct cmsghdr *cmsg;
490 struct nodeinfo *ni;
491
492 /* These two are marginally too big for stack allocation, but this
493 * function is (currently) only called by dlm_recvd so static should be
494 * OK.
495 */
496 static struct sockaddr_storage msgname;
497 static char incmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
498
499 if (sctp_con.sock == NULL)
500 goto out;
501
502 if (sctp_con.rx_page == NULL) {
503 /*
504 * This doesn't need to be atomic, but I think it should
505 * improve performance if it is.
506 */
507 sctp_con.rx_page = alloc_page(GFP_ATOMIC);
508 if (sctp_con.rx_page == NULL)
509 goto out_resched;
510 CBUF_INIT(&sctp_con.cb, PAGE_CACHE_SIZE);
511 }
512
513 memset(&incmsg, 0, sizeof(incmsg));
514 memset(&msgname, 0, sizeof(msgname));
515
516 memset(incmsg, 0, sizeof(incmsg));
517 msg.msg_name = &msgname;
518 msg.msg_namelen = sizeof(msgname);
519 msg.msg_flags = 0;
520 msg.msg_control = incmsg;
521 msg.msg_controllen = sizeof(incmsg);
522
523 /* I don't see why this circular buffer stuff is necessary for SCTP
524 * which is a packet-based protocol, but the whole thing breaks under
525 * load without it! The overhead is minimal (and is in the TCP lowcomms
526 * anyway, of course) so I'll leave it in until I can figure out what's
527 * really happening.
528 */
529
530 /*
531 * iov[0] is the bit of the circular buffer between the current end
532 * point (cb.base + cb.len) and the end of the buffer.
533 */
534 iov[0].iov_len = sctp_con.cb.base - CBUF_DATA(&sctp_con.cb);
535 iov[0].iov_base = page_address(sctp_con.rx_page) +
536 CBUF_DATA(&sctp_con.cb);
537 iov[1].iov_len = 0;
538
539 /*
540 * iov[1] is the bit of the circular buffer between the start of the
541 * buffer and the start of the currently used section (cb.base)
542 */
543 if (CBUF_DATA(&sctp_con.cb) >= sctp_con.cb.base) {
544 iov[0].iov_len = PAGE_CACHE_SIZE - CBUF_DATA(&sctp_con.cb);
545 iov[1].iov_len = sctp_con.cb.base;
546 iov[1].iov_base = page_address(sctp_con.rx_page);
547 msg.msg_iovlen = 2;
548 }
549 len = iov[0].iov_len + iov[1].iov_len;
550
551 r = ret = kernel_recvmsg(sctp_con.sock, &msg, iov, 1, len,
552 MSG_NOSIGNAL | MSG_DONTWAIT);
553 if (ret <= 0)
554 goto out_close;
555
556 msg.msg_control = incmsg;
557 msg.msg_controllen = sizeof(incmsg);
558 cmsg = CMSG_FIRSTHDR(&msg);
559 sinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg);
560
561 if (msg.msg_flags & MSG_NOTIFICATION) {
562 process_sctp_notification(&msg, page_address(sctp_con.rx_page));
563 return 0;
564 }
565
566 /* Is this a new association ? */
567 ni = nodeid2nodeinfo(le32_to_cpu(sinfo->sinfo_ppid), GFP_KERNEL);
568 if (ni) {
569 ni->assoc_id = sinfo->sinfo_assoc_id;
570 if (test_and_clear_bit(NI_INIT_PENDING, &ni->flags)) {
571
572 if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
573 spin_lock_bh(&write_nodes_lock);
574 list_add_tail(&ni->write_list, &write_nodes);
575 spin_unlock_bh(&write_nodes_lock);
576 }
577 wake_up_process(send_task);
578 }
579 }
580
581 /* INIT sends a message with length of 1 - ignore it */
582 if (r == 1)
583 return 0;
584
585 CBUF_ADD(&sctp_con.cb, ret);
586 ret = dlm_process_incoming_buffer(cpu_to_le32(sinfo->sinfo_ppid),
587 page_address(sctp_con.rx_page),
588 sctp_con.cb.base, sctp_con.cb.len,
589 PAGE_CACHE_SIZE);
590 if (ret < 0)
591 goto out_close;
592 CBUF_EAT(&sctp_con.cb, ret);
593
594 out:
595 ret = 0;
596 goto out_ret;
597
598 out_resched:
599 lowcomms_data_ready(sctp_con.sock->sk, 0);
600 ret = 0;
601 schedule();
602 goto out_ret;
603
604 out_close:
605 if (ret != -EAGAIN)
606 log_print("error reading from sctp socket: %d", ret);
607 out_ret:
608 return ret;
609}
610
611/* Bind to an IP address. SCTP allows multiple address so it can do multi-homing */
612static int add_bind_addr(struct sockaddr_storage *addr, int addr_len, int num)
613{
614 mm_segment_t fs;
615 int result = 0;
616
617 fs = get_fs();
618 set_fs(get_ds());
619 if (num == 1)
620 result = sctp_con.sock->ops->bind(sctp_con.sock,
621 (struct sockaddr *) addr, addr_len);
622 else
623 result = sctp_con.sock->ops->setsockopt(sctp_con.sock, SOL_SCTP,
624 SCTP_SOCKOPT_BINDX_ADD, (char *)addr, addr_len);
625 set_fs(fs);
626
627 if (result < 0)
628 log_print("Can't bind to port %d addr number %d",
629 dlm_config.tcp_port, num);
630
631 return result;
632}
633
634static void init_local(void)
635{
636 struct sockaddr_storage sas, *addr;
637 int i;
638
639 local_nodeid = dlm_our_nodeid();
640
641 for (i = 0; i < DLM_MAX_ADDR_COUNT - 1; i++) {
642 if (dlm_our_addr(&sas, i))
643 break;
644
645 addr = kmalloc(sizeof(*addr), GFP_KERNEL);
646 if (!addr)
647 break;
648 memcpy(addr, &sas, sizeof(*addr));
649 local_addr[local_count++] = addr;
650 }
651}
652
653/* Initialise SCTP socket and bind to all interfaces */
654static int init_sock(void)
655{
656 mm_segment_t fs;
657 struct socket *sock = NULL;
658 struct sockaddr_storage localaddr;
659 struct sctp_event_subscribe subscribe;
660 int result = -EINVAL, num = 1, i, addr_len;
661
662 if (!local_count) {
663 init_local();
664 if (!local_count) {
665 log_print("no local IP address has been set");
666 goto out;
667 }
668 }
669
670 result = sock_create_kern(local_addr[0]->ss_family, SOCK_SEQPACKET,
671 IPPROTO_SCTP, &sock);
672 if (result < 0) {
673 log_print("Can't create comms socket, check SCTP is loaded");
674 goto out;
675 }
676
677 /* Listen for events */
678 memset(&subscribe, 0, sizeof(subscribe));
679 subscribe.sctp_data_io_event = 1;
680 subscribe.sctp_association_event = 1;
681 subscribe.sctp_send_failure_event = 1;
682 subscribe.sctp_shutdown_event = 1;
683 subscribe.sctp_partial_delivery_event = 1;
684
685 fs = get_fs();
686 set_fs(get_ds());
687 result = sock->ops->setsockopt(sock, SOL_SCTP, SCTP_EVENTS,
688 (char *)&subscribe, sizeof(subscribe));
689 set_fs(fs);
690
691 if (result < 0) {
692 log_print("Failed to set SCTP_EVENTS on socket: result=%d",
693 result);
694 goto create_delsock;
695 }
696
697 /* Init con struct */
698 sock->sk->sk_user_data = &sctp_con;
699 sctp_con.sock = sock;
700 sctp_con.sock->sk->sk_data_ready = lowcomms_data_ready;
701
702 /* Bind to all interfaces. */
703 for (i = 0; i < local_count; i++) {
704 memcpy(&localaddr, local_addr[i], sizeof(localaddr));
705 make_sockaddr(&localaddr, dlm_config.tcp_port, &addr_len);
706
707 result = add_bind_addr(&localaddr, addr_len, num);
708 if (result)
709 goto create_delsock;
710 ++num;
711 }
712
713 result = sock->ops->listen(sock, 5);
714 if (result < 0) {
715 log_print("Can't set socket listening");
716 goto create_delsock;
717 }
718
719 return 0;
720
721 create_delsock:
722 sock_release(sock);
723 sctp_con.sock = NULL;
724 out:
725 return result;
726}
727
728
729static struct writequeue_entry *new_writequeue_entry(int allocation)
730{
731 struct writequeue_entry *entry;
732
733 entry = kmalloc(sizeof(struct writequeue_entry), allocation);
734 if (!entry)
735 return NULL;
736
737 entry->page = alloc_page(allocation);
738 if (!entry->page) {
739 kfree(entry);
740 return NULL;
741 }
742
743 entry->offset = 0;
744 entry->len = 0;
745 entry->end = 0;
746 entry->users = 0;
747
748 return entry;
749}
750
751void *dlm_lowcomms_get_buffer(int nodeid, int len, int allocation, char **ppc)
752{
753 struct writequeue_entry *e;
754 int offset = 0;
755 int users = 0;
756 struct nodeinfo *ni;
757
758 if (!atomic_read(&accepting))
759 return NULL;
760
761 ni = nodeid2nodeinfo(nodeid, allocation);
762 if (!ni)
763 return NULL;
764
765 spin_lock(&ni->writequeue_lock);
766 e = list_entry(ni->writequeue.prev, struct writequeue_entry, list);
767 if (((struct list_head *) e == &ni->writequeue) ||
768 (PAGE_CACHE_SIZE - e->end < len)) {
769 e = NULL;
770 } else {
771 offset = e->end;
772 e->end += len;
773 users = e->users++;
774 }
775 spin_unlock(&ni->writequeue_lock);
776
777 if (e) {
778 got_one:
779 if (users == 0)
780 kmap(e->page);
781 *ppc = page_address(e->page) + offset;
782 return e;
783 }
784
785 e = new_writequeue_entry(allocation);
786 if (e) {
787 spin_lock(&ni->writequeue_lock);
788 offset = e->end;
789 e->end += len;
790 e->ni = ni;
791 users = e->users++;
792 list_add_tail(&e->list, &ni->writequeue);
793 spin_unlock(&ni->writequeue_lock);
794 goto got_one;
795 }
796 return NULL;
797}
798
799void dlm_lowcomms_commit_buffer(void *arg)
800{
801 struct writequeue_entry *e = (struct writequeue_entry *) arg;
802 int users;
803 struct nodeinfo *ni = e->ni;
804
805 if (!atomic_read(&accepting))
806 return;
807
808 spin_lock(&ni->writequeue_lock);
809 users = --e->users;
810 if (users)
811 goto out;
812 e->len = e->end - e->offset;
813 kunmap(e->page);
814 spin_unlock(&ni->writequeue_lock);
815
816 if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
817 spin_lock_bh(&write_nodes_lock);
818 list_add_tail(&ni->write_list, &write_nodes);
819 spin_unlock_bh(&write_nodes_lock);
820 wake_up_process(send_task);
821 }
822 return;
823
824 out:
825 spin_unlock(&ni->writequeue_lock);
826 return;
827}
828
829static void free_entry(struct writequeue_entry *e)
830{
831 __free_page(e->page);
832 kfree(e);
833}
834
835/* Initiate an SCTP association. In theory we could just use sendmsg() on
836 the first IP address and it should work, but this allows us to set up the
837 association before sending any valuable data that we can't afford to lose.
838 It also keeps the send path clean as it can now always use the association ID */
839static void initiate_association(int nodeid)
840{
841 struct sockaddr_storage rem_addr;
842 static char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
843 struct msghdr outmessage;
844 struct cmsghdr *cmsg;
845 struct sctp_sndrcvinfo *sinfo;
846 int ret;
847 int addrlen;
848 char buf[1];
849 struct kvec iov[1];
850 struct nodeinfo *ni;
851
852 log_print("Initiating association with node %d", nodeid);
853
854 ni = nodeid2nodeinfo(nodeid, GFP_KERNEL);
855 if (!ni)
856 return;
857
858 if (nodeid_to_addr(nodeid, (struct sockaddr *)&rem_addr)) {
859 log_print("no address for nodeid %d", nodeid);
860 return;
861 }
862
863 make_sockaddr(&rem_addr, dlm_config.tcp_port, &addrlen);
864
865 outmessage.msg_name = &rem_addr;
866 outmessage.msg_namelen = addrlen;
867 outmessage.msg_control = outcmsg;
868 outmessage.msg_controllen = sizeof(outcmsg);
869 outmessage.msg_flags = MSG_EOR;
870
871 iov[0].iov_base = buf;
872 iov[0].iov_len = 1;
873
874 /* Real INIT messages seem to cause trouble. Just send a 1 byte message
875 we can afford to lose */
876 cmsg = CMSG_FIRSTHDR(&outmessage);
877 cmsg->cmsg_level = IPPROTO_SCTP;
878 cmsg->cmsg_type = SCTP_SNDRCV;
879 cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
880 sinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg);
881 memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo));
882 sinfo->sinfo_ppid = cpu_to_le32(local_nodeid);
883
884 outmessage.msg_controllen = cmsg->cmsg_len;
885 ret = kernel_sendmsg(sctp_con.sock, &outmessage, iov, 1, 1);
886 if (ret < 0) {
887 log_print("send INIT to node failed: %d", ret);
888 /* Try again later */
889 clear_bit(NI_INIT_PENDING, &ni->flags);
890 }
891}
892
893/* Send a message */
894static int send_to_sock(struct nodeinfo *ni)
895{
896 int ret = 0;
897 struct writequeue_entry *e;
898 int len, offset;
899 struct msghdr outmsg;
900 static char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
901 struct cmsghdr *cmsg;
902 struct sctp_sndrcvinfo *sinfo;
903 struct kvec iov;
904
905 /* See if we need to init an association before we start
906 sending precious messages */
907 spin_lock(&ni->lock);
908 if (!ni->assoc_id && !test_and_set_bit(NI_INIT_PENDING, &ni->flags)) {
909 spin_unlock(&ni->lock);
910 initiate_association(ni->nodeid);
911 return 0;
912 }
913 spin_unlock(&ni->lock);
914
915 outmsg.msg_name = NULL; /* We use assoc_id */
916 outmsg.msg_namelen = 0;
917 outmsg.msg_control = outcmsg;
918 outmsg.msg_controllen = sizeof(outcmsg);
919 outmsg.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL | MSG_EOR;
920
921 cmsg = CMSG_FIRSTHDR(&outmsg);
922 cmsg->cmsg_level = IPPROTO_SCTP;
923 cmsg->cmsg_type = SCTP_SNDRCV;
924 cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
925 sinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg);
926 memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo));
927 sinfo->sinfo_ppid = cpu_to_le32(local_nodeid);
928 sinfo->sinfo_assoc_id = ni->assoc_id;
929 outmsg.msg_controllen = cmsg->cmsg_len;
930
931 spin_lock(&ni->writequeue_lock);
932 for (;;) {
933 if (list_empty(&ni->writequeue))
934 break;
935 e = list_entry(ni->writequeue.next, struct writequeue_entry,
936 list);
937 kmap(e->page);
938 len = e->len;
939 offset = e->offset;
940 BUG_ON(len == 0 && e->users == 0);
941 spin_unlock(&ni->writequeue_lock);
942
943 ret = 0;
944 if (len) {
945 iov.iov_base = page_address(e->page)+offset;
946 iov.iov_len = len;
947
948 ret = kernel_sendmsg(sctp_con.sock, &outmsg, &iov, 1,
949 len);
950 if (ret == -EAGAIN) {
951 sctp_con.eagain_flag = 1;
952 goto out;
953 } else if (ret < 0)
954 goto send_error;
955 } else {
956 /* Don't starve people filling buffers */
957 schedule();
958 }
959
960 spin_lock(&ni->writequeue_lock);
961 e->offset += ret;
962 e->len -= ret;
963
964 if (e->len == 0 && e->users == 0) {
965 list_del(&e->list);
966 free_entry(e);
967 continue;
968 }
969 }
970 spin_unlock(&ni->writequeue_lock);
971 out:
972 return ret;
973
974 send_error:
975 log_print("Error sending to node %d %d", ni->nodeid, ret);
976 spin_lock(&ni->lock);
977 if (!test_and_set_bit(NI_INIT_PENDING, &ni->flags)) {
978 ni->assoc_id = 0;
979 spin_unlock(&ni->lock);
980 initiate_association(ni->nodeid);
981 } else
982 spin_unlock(&ni->lock);
983
984 return ret;
985}
986
987/* Try to send any messages that are pending */
988static void process_output_queue(void)
989{
990 struct list_head *list;
991 struct list_head *temp;
992
993 spin_lock_bh(&write_nodes_lock);
994 list_for_each_safe(list, temp, &write_nodes) {
995 struct nodeinfo *ni =
996 list_entry(list, struct nodeinfo, write_list);
997 clear_bit(NI_WRITE_PENDING, &ni->flags);
998 list_del(&ni->write_list);
999
1000 spin_unlock_bh(&write_nodes_lock);
1001
1002 send_to_sock(ni);
1003 spin_lock_bh(&write_nodes_lock);
1004 }
1005 spin_unlock_bh(&write_nodes_lock);
1006}
1007
1008/* Called after we've had -EAGAIN and been woken up */
1009static void refill_write_queue(void)
1010{
1011 int i;
1012
1013 for (i=1; i<=max_nodeid; i++) {
1014 struct nodeinfo *ni = nodeid2nodeinfo(i, 0);
1015
1016 if (ni) {
1017 if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
1018 spin_lock_bh(&write_nodes_lock);
1019 list_add_tail(&ni->write_list, &write_nodes);
1020 spin_unlock_bh(&write_nodes_lock);
1021 }
1022 }
1023 }
1024}
1025
1026static void clean_one_writequeue(struct nodeinfo *ni)
1027{
1028 struct list_head *list;
1029 struct list_head *temp;
1030
1031 spin_lock(&ni->writequeue_lock);
1032 list_for_each_safe(list, temp, &ni->writequeue) {
1033 struct writequeue_entry *e =
1034 list_entry(list, struct writequeue_entry, list);
1035 list_del(&e->list);
1036 free_entry(e);
1037 }
1038 spin_unlock(&ni->writequeue_lock);
1039}
1040
1041static void clean_writequeues(void)
1042{
1043 int i;
1044
1045 for (i=1; i<=max_nodeid; i++) {
1046 struct nodeinfo *ni = nodeid2nodeinfo(i, 0);
1047 if (ni)
1048 clean_one_writequeue(ni);
1049 }
1050}
1051
1052
1053static void dealloc_nodeinfo(void)
1054{
1055 int i;
1056
1057 for (i=1; i<=max_nodeid; i++) {
1058 struct nodeinfo *ni = nodeid2nodeinfo(i, 0);
1059 if (ni) {
1060 idr_remove(&nodeinfo_idr, i);
1061 kfree(ni);
1062 }
1063 }
1064}
1065
1066static int write_list_empty(void)
1067{
1068 int status;
1069
1070 spin_lock_bh(&write_nodes_lock);
1071 status = list_empty(&write_nodes);
1072 spin_unlock_bh(&write_nodes_lock);
1073
1074 return status;
1075}
1076
1077static int dlm_recvd(void *data)
1078{
1079 DECLARE_WAITQUEUE(wait, current);
1080
1081 while (!kthread_should_stop()) {
1082 int count = 0;
1083
1084 set_current_state(TASK_INTERRUPTIBLE);
1085 add_wait_queue(&lowcomms_recv_wait, &wait);
1086 if (!test_bit(CF_READ_PENDING, &sctp_con.flags))
1087 schedule();
1088 remove_wait_queue(&lowcomms_recv_wait, &wait);
1089 set_current_state(TASK_RUNNING);
1090
1091 if (test_and_clear_bit(CF_READ_PENDING, &sctp_con.flags)) {
1092 int ret;
1093
1094 do {
1095 ret = receive_from_sock();
1096
1097 /* Don't starve out everyone else */
1098 if (++count >= MAX_RX_MSG_COUNT) {
1099 schedule();
1100 count = 0;
1101 }
1102 } while (!kthread_should_stop() && ret >=0);
1103 }
1104 schedule();
1105 }
1106
1107 return 0;
1108}
1109
1110static int dlm_sendd(void *data)
1111{
1112 DECLARE_WAITQUEUE(wait, current);
1113
1114 add_wait_queue(sctp_con.sock->sk->sk_sleep, &wait);
1115
1116 while (!kthread_should_stop()) {
1117 set_current_state(TASK_INTERRUPTIBLE);
1118 if (write_list_empty())
1119 schedule();
1120 set_current_state(TASK_RUNNING);
1121
1122 if (sctp_con.eagain_flag) {
1123 sctp_con.eagain_flag = 0;
1124 refill_write_queue();
1125 }
1126 process_output_queue();
1127 }
1128
1129 remove_wait_queue(sctp_con.sock->sk->sk_sleep, &wait);
1130
1131 return 0;
1132}
1133
1134static void daemons_stop(void)
1135{
1136 kthread_stop(recv_task);
1137 kthread_stop(send_task);
1138}
1139
1140static int daemons_start(void)
1141{
1142 struct task_struct *p;
1143 int error;
1144
1145 p = kthread_run(dlm_recvd, NULL, "dlm_recvd");
1146 error = IS_ERR(p);
1147 if (error) {
1148 log_print("can't start dlm_recvd %d", error);
1149 return error;
1150 }
1151 recv_task = p;
1152
1153 p = kthread_run(dlm_sendd, NULL, "dlm_sendd");
1154 error = IS_ERR(p);
1155 if (error) {
1156 log_print("can't start dlm_sendd %d", error);
1157 kthread_stop(recv_task);
1158 return error;
1159 }
1160 send_task = p;
1161
1162 return 0;
1163}
1164
1165/*
1166 * This is quite likely to sleep...
1167 */
1168int dlm_lowcomms_start(void)
1169{
1170 int error;
1171
1172 spin_lock_init(&write_nodes_lock);
1173 INIT_LIST_HEAD(&write_nodes);
1174 init_rwsem(&nodeinfo_lock);
1175
1176 error = init_sock();
1177 if (error)
1178 goto fail_sock;
1179 error = daemons_start();
1180 if (error)
1181 goto fail_sock;
1182 atomic_set(&accepting, 1);
1183 return 0;
1184
1185 fail_sock:
1186 close_connection();
1187 return error;
1188}
1189
1190/* Set all the activity flags to prevent any socket activity. */
1191
1192void dlm_lowcomms_stop(void)
1193{
1194 atomic_set(&accepting, 0);
1195 sctp_con.flags = 0x7;
1196 daemons_stop();
1197 clean_writequeues();
1198 close_connection();
1199 dealloc_nodeinfo();
1200 max_nodeid = 0;
1201}
1202
1203int dlm_lowcomms_init(void)
1204{
1205 init_waitqueue_head(&lowcomms_recv_wait);
1206 return 0;
1207}
1208
1209void dlm_lowcomms_exit(void)
1210{
1211 int i;
1212
1213 for (i = 0; i < local_count; i++)
1214 kfree(local_addr[i]);
1215 local_count = 0;
1216 local_nodeid = 0;
1217}
1218
diff --git a/fs/dlm/lowcomms.h b/fs/dlm/lowcomms.h
new file mode 100644
index 000000000000..3af8035ff12f
--- /dev/null
+++ b/fs/dlm/lowcomms.h
@@ -0,0 +1,25 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#ifndef __LOWCOMMS_DOT_H__
15#define __LOWCOMMS_DOT_H__
16
17int dlm_lowcomms_init(void);
18void dlm_lowcomms_exit(void);
19int dlm_lowcomms_start(void);
20void dlm_lowcomms_stop(void);
21void *dlm_lowcomms_get_buffer(int nodeid, int len, int allocation, char **ppc);
22void dlm_lowcomms_commit_buffer(void *mh);
23
24#endif /* __LOWCOMMS_DOT_H__ */
25
diff --git a/fs/dlm/lvb_table.h b/fs/dlm/lvb_table.h
new file mode 100644
index 000000000000..cc3e92f3feef
--- /dev/null
+++ b/fs/dlm/lvb_table.h
@@ -0,0 +1,18 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13#ifndef __LVB_TABLE_DOT_H__
14#define __LVB_TABLE_DOT_H__
15
16extern const int dlm_lvb_operations[8][8];
17
18#endif
diff --git a/fs/dlm/main.c b/fs/dlm/main.c
new file mode 100644
index 000000000000..81bf4cb22033
--- /dev/null
+++ b/fs/dlm/main.c
@@ -0,0 +1,89 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#include "dlm_internal.h"
15#include "lockspace.h"
16#include "lock.h"
17#include "memory.h"
18#include "lowcomms.h"
19#include "config.h"
20
21#ifdef CONFIG_DLM_DEBUG
22int dlm_register_debugfs(void);
23void dlm_unregister_debugfs(void);
24#else
25static inline int dlm_register_debugfs(void) { return 0; }
26static inline void dlm_unregister_debugfs(void) { }
27#endif
28
29static int __init init_dlm(void)
30{
31 int error;
32
33 error = dlm_memory_init();
34 if (error)
35 goto out;
36
37 error = dlm_lockspace_init();
38 if (error)
39 goto out_mem;
40
41 error = dlm_config_init();
42 if (error)
43 goto out_lockspace;
44
45 error = dlm_register_debugfs();
46 if (error)
47 goto out_config;
48
49 error = dlm_lowcomms_init();
50 if (error)
51 goto out_debug;
52
53 printk("DLM (built %s %s) installed\n", __DATE__, __TIME__);
54
55 return 0;
56
57 out_debug:
58 dlm_unregister_debugfs();
59 out_config:
60 dlm_config_exit();
61 out_lockspace:
62 dlm_lockspace_exit();
63 out_mem:
64 dlm_memory_exit();
65 out:
66 return error;
67}
68
69static void __exit exit_dlm(void)
70{
71 dlm_lowcomms_exit();
72 dlm_config_exit();
73 dlm_memory_exit();
74 dlm_lockspace_exit();
75 dlm_unregister_debugfs();
76}
77
78module_init(init_dlm);
79module_exit(exit_dlm);
80
81MODULE_DESCRIPTION("Distributed Lock Manager");
82MODULE_AUTHOR("Red Hat, Inc.");
83MODULE_LICENSE("GPL");
84
85EXPORT_SYMBOL_GPL(dlm_new_lockspace);
86EXPORT_SYMBOL_GPL(dlm_release_lockspace);
87EXPORT_SYMBOL_GPL(dlm_lock);
88EXPORT_SYMBOL_GPL(dlm_unlock);
89
diff --git a/fs/dlm/member.c b/fs/dlm/member.c
new file mode 100644
index 000000000000..439249b62a57
--- /dev/null
+++ b/fs/dlm/member.c
@@ -0,0 +1,314 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13#include "dlm_internal.h"
14#include "lockspace.h"
15#include "member.h"
16#include "recoverd.h"
17#include "recover.h"
18#include "lowcomms.h"
19#include "rcom.h"
20#include "config.h"
21
22/*
23 * Following called by dlm_recoverd thread
24 */
25
26static void add_ordered_member(struct dlm_ls *ls, struct dlm_member *new)
27{
28 struct dlm_member *memb = NULL;
29 struct list_head *tmp;
30 struct list_head *newlist = &new->list;
31 struct list_head *head = &ls->ls_nodes;
32
33 list_for_each(tmp, head) {
34 memb = list_entry(tmp, struct dlm_member, list);
35 if (new->nodeid < memb->nodeid)
36 break;
37 }
38
39 if (!memb)
40 list_add_tail(newlist, head);
41 else {
42 /* FIXME: can use list macro here */
43 newlist->prev = tmp->prev;
44 newlist->next = tmp;
45 tmp->prev->next = newlist;
46 tmp->prev = newlist;
47 }
48}
49
50static int dlm_add_member(struct dlm_ls *ls, int nodeid)
51{
52 struct dlm_member *memb;
53 int w;
54
55 memb = kmalloc(sizeof(struct dlm_member), GFP_KERNEL);
56 if (!memb)
57 return -ENOMEM;
58
59 w = dlm_node_weight(ls->ls_name, nodeid);
60 if (w < 0)
61 return w;
62
63 memb->nodeid = nodeid;
64 memb->weight = w;
65 add_ordered_member(ls, memb);
66 ls->ls_num_nodes++;
67 return 0;
68}
69
70static void dlm_remove_member(struct dlm_ls *ls, struct dlm_member *memb)
71{
72 list_move(&memb->list, &ls->ls_nodes_gone);
73 ls->ls_num_nodes--;
74}
75
76static int dlm_is_member(struct dlm_ls *ls, int nodeid)
77{
78 struct dlm_member *memb;
79
80 list_for_each_entry(memb, &ls->ls_nodes, list) {
81 if (memb->nodeid == nodeid)
82 return TRUE;
83 }
84 return FALSE;
85}
86
87int dlm_is_removed(struct dlm_ls *ls, int nodeid)
88{
89 struct dlm_member *memb;
90
91 list_for_each_entry(memb, &ls->ls_nodes_gone, list) {
92 if (memb->nodeid == nodeid)
93 return TRUE;
94 }
95 return FALSE;
96}
97
98static void clear_memb_list(struct list_head *head)
99{
100 struct dlm_member *memb;
101
102 while (!list_empty(head)) {
103 memb = list_entry(head->next, struct dlm_member, list);
104 list_del(&memb->list);
105 kfree(memb);
106 }
107}
108
109void dlm_clear_members(struct dlm_ls *ls)
110{
111 clear_memb_list(&ls->ls_nodes);
112 ls->ls_num_nodes = 0;
113}
114
115void dlm_clear_members_gone(struct dlm_ls *ls)
116{
117 clear_memb_list(&ls->ls_nodes_gone);
118}
119
120static void make_member_array(struct dlm_ls *ls)
121{
122 struct dlm_member *memb;
123 int i, w, x = 0, total = 0, all_zero = 0, *array;
124
125 kfree(ls->ls_node_array);
126 ls->ls_node_array = NULL;
127
128 list_for_each_entry(memb, &ls->ls_nodes, list) {
129 if (memb->weight)
130 total += memb->weight;
131 }
132
133 /* all nodes revert to weight of 1 if all have weight 0 */
134
135 if (!total) {
136 total = ls->ls_num_nodes;
137 all_zero = 1;
138 }
139
140 ls->ls_total_weight = total;
141
142 array = kmalloc(sizeof(int) * total, GFP_KERNEL);
143 if (!array)
144 return;
145
146 list_for_each_entry(memb, &ls->ls_nodes, list) {
147 if (!all_zero && !memb->weight)
148 continue;
149
150 if (all_zero)
151 w = 1;
152 else
153 w = memb->weight;
154
155 DLM_ASSERT(x < total, printk("total %d x %d\n", total, x););
156
157 for (i = 0; i < w; i++)
158 array[x++] = memb->nodeid;
159 }
160
161 ls->ls_node_array = array;
162}
163
164/* send a status request to all members just to establish comms connections */
165
166static void ping_members(struct dlm_ls *ls)
167{
168 struct dlm_member *memb;
169 list_for_each_entry(memb, &ls->ls_nodes, list)
170 dlm_rcom_status(ls, memb->nodeid);
171}
172
173int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
174{
175 struct dlm_member *memb, *safe;
176 int i, error, found, pos = 0, neg = 0, low = -1;
177
178 /* move departed members from ls_nodes to ls_nodes_gone */
179
180 list_for_each_entry_safe(memb, safe, &ls->ls_nodes, list) {
181 found = FALSE;
182 for (i = 0; i < rv->node_count; i++) {
183 if (memb->nodeid == rv->nodeids[i]) {
184 found = TRUE;
185 break;
186 }
187 }
188
189 if (!found) {
190 neg++;
191 dlm_remove_member(ls, memb);
192 log_debug(ls, "remove member %d", memb->nodeid);
193 }
194 }
195
196 /* add new members to ls_nodes */
197
198 for (i = 0; i < rv->node_count; i++) {
199 if (dlm_is_member(ls, rv->nodeids[i]))
200 continue;
201 dlm_add_member(ls, rv->nodeids[i]);
202 pos++;
203 log_debug(ls, "add member %d", rv->nodeids[i]);
204 }
205
206 list_for_each_entry(memb, &ls->ls_nodes, list) {
207 if (low == -1 || memb->nodeid < low)
208 low = memb->nodeid;
209 }
210 ls->ls_low_nodeid = low;
211
212 make_member_array(ls);
213 dlm_set_recover_status(ls, DLM_RS_NODES);
214 *neg_out = neg;
215
216 ping_members(ls);
217
218 error = dlm_recover_members_wait(ls);
219 log_debug(ls, "total members %d", ls->ls_num_nodes);
220 return error;
221}
222
223/*
224 * Following called from lockspace.c
225 */
226
227int dlm_ls_stop(struct dlm_ls *ls)
228{
229 int new;
230
231 /*
232 * A stop cancels any recovery that's in progress (see RECOVERY_STOP,
233 * dlm_recovery_stopped()) and prevents any new locks from being
234 * processed (see RUNNING, dlm_locking_stopped()).
235 */
236
237 spin_lock(&ls->ls_recover_lock);
238 set_bit(LSFL_RECOVERY_STOP, &ls->ls_flags);
239 new = test_and_clear_bit(LSFL_RUNNING, &ls->ls_flags);
240 ls->ls_recover_seq++;
241 spin_unlock(&ls->ls_recover_lock);
242
243 /*
244 * This in_recovery lock does two things:
245 *
246 * 1) Keeps this function from returning until all threads are out
247 * of locking routines and locking is truely stopped.
248 * 2) Keeps any new requests from being processed until it's unlocked
249 * when recovery is complete.
250 */
251
252 if (new)
253 down_write(&ls->ls_in_recovery);
254
255 /*
256 * The recoverd suspend/resume makes sure that dlm_recoverd (if
257 * running) has noticed the clearing of RUNNING above and quit
258 * processing the previous recovery. This will be true for all nodes
259 * before any nodes start the new recovery.
260 */
261
262 dlm_recoverd_suspend(ls);
263 ls->ls_recover_status = 0;
264 dlm_recoverd_resume(ls);
265 return 0;
266}
267
268int dlm_ls_start(struct dlm_ls *ls)
269{
270 struct dlm_recover *rv = NULL, *rv_old;
271 int *ids = NULL;
272 int error, count;
273
274 rv = kmalloc(sizeof(struct dlm_recover), GFP_KERNEL);
275 if (!rv)
276 return -ENOMEM;
277 memset(rv, 0, sizeof(struct dlm_recover));
278
279 error = count = dlm_nodeid_list(ls->ls_name, &ids);
280 if (error <= 0)
281 goto fail;
282
283 spin_lock(&ls->ls_recover_lock);
284
285 /* the lockspace needs to be stopped before it can be started */
286
287 if (!dlm_locking_stopped(ls)) {
288 spin_unlock(&ls->ls_recover_lock);
289 log_error(ls, "start ignored: lockspace running");
290 error = -EINVAL;
291 goto fail;
292 }
293
294 rv->nodeids = ids;
295 rv->node_count = count;
296 rv->seq = ++ls->ls_recover_seq;
297 rv_old = ls->ls_recover_args;
298 ls->ls_recover_args = rv;
299 spin_unlock(&ls->ls_recover_lock);
300
301 if (rv_old) {
302 kfree(rv_old->nodeids);
303 kfree(rv_old);
304 }
305
306 dlm_recoverd_kick(ls);
307 return 0;
308
309 fail:
310 kfree(rv);
311 kfree(ids);
312 return error;
313}
314
diff --git a/fs/dlm/member.h b/fs/dlm/member.h
new file mode 100644
index 000000000000..927c08c19214
--- /dev/null
+++ b/fs/dlm/member.h
@@ -0,0 +1,24 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13#ifndef __MEMBER_DOT_H__
14#define __MEMBER_DOT_H__
15
16int dlm_ls_stop(struct dlm_ls *ls);
17int dlm_ls_start(struct dlm_ls *ls);
18void dlm_clear_members(struct dlm_ls *ls);
19void dlm_clear_members_gone(struct dlm_ls *ls);
20int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv,int *neg_out);
21int dlm_is_removed(struct dlm_ls *ls, int nodeid);
22
23#endif /* __MEMBER_DOT_H__ */
24
diff --git a/fs/dlm/memory.c b/fs/dlm/memory.c
new file mode 100644
index 000000000000..0b9851d0bdb2
--- /dev/null
+++ b/fs/dlm/memory.c
@@ -0,0 +1,122 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#include "dlm_internal.h"
15#include "config.h"
16#include "memory.h"
17
18static kmem_cache_t *lkb_cache;
19
20
21int dlm_memory_init(void)
22{
23 int ret = 0;
24
25 lkb_cache = kmem_cache_create("dlm_lkb", sizeof(struct dlm_lkb),
26 __alignof__(struct dlm_lkb), 0, NULL, NULL);
27 if (!lkb_cache)
28 ret = -ENOMEM;
29 return ret;
30}
31
32void dlm_memory_exit(void)
33{
34 if (lkb_cache)
35 kmem_cache_destroy(lkb_cache);
36}
37
38char *allocate_lvb(struct dlm_ls *ls)
39{
40 char *p;
41
42 p = kmalloc(ls->ls_lvblen, GFP_KERNEL);
43 if (p)
44 memset(p, 0, ls->ls_lvblen);
45 return p;
46}
47
48void free_lvb(char *p)
49{
50 kfree(p);
51}
52
53uint64_t *allocate_range(struct dlm_ls *ls)
54{
55 int ralen = 4*sizeof(uint64_t);
56 uint64_t *p;
57
58 p = kmalloc(ralen, GFP_KERNEL);
59 if (p)
60 memset(p, 0, ralen);
61 return p;
62}
63
64void free_range(uint64_t *p)
65{
66 kfree(p);
67}
68
69/* FIXME: have some minimal space built-in to rsb for the name and
70 kmalloc a separate name if needed, like dentries are done */
71
72struct dlm_rsb *allocate_rsb(struct dlm_ls *ls, int namelen)
73{
74 struct dlm_rsb *r;
75
76 DLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,);
77
78 r = kmalloc(sizeof(*r) + namelen, GFP_KERNEL);
79 if (r)
80 memset(r, 0, sizeof(*r) + namelen);
81 return r;
82}
83
84void free_rsb(struct dlm_rsb *r)
85{
86 if (r->res_lvbptr)
87 free_lvb(r->res_lvbptr);
88 kfree(r);
89}
90
91struct dlm_lkb *allocate_lkb(struct dlm_ls *ls)
92{
93 struct dlm_lkb *lkb;
94
95 lkb = kmem_cache_alloc(lkb_cache, GFP_KERNEL);
96 if (lkb)
97 memset(lkb, 0, sizeof(*lkb));
98 return lkb;
99}
100
101void free_lkb(struct dlm_lkb *lkb)
102{
103 kmem_cache_free(lkb_cache, lkb);
104}
105
106struct dlm_direntry *allocate_direntry(struct dlm_ls *ls, int namelen)
107{
108 struct dlm_direntry *de;
109
110 DLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,);
111
112 de = kmalloc(sizeof(*de) + namelen, GFP_KERNEL);
113 if (de)
114 memset(de, 0, sizeof(*de) + namelen);
115 return de;
116}
117
118void free_direntry(struct dlm_direntry *de)
119{
120 kfree(de);
121}
122
diff --git a/fs/dlm/memory.h b/fs/dlm/memory.h
new file mode 100644
index 000000000000..7b235132b0b4
--- /dev/null
+++ b/fs/dlm/memory.h
@@ -0,0 +1,31 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#ifndef __MEMORY_DOT_H__
15#define __MEMORY_DOT_H__
16
17int dlm_memory_init(void);
18void dlm_memory_exit(void);
19struct dlm_rsb *allocate_rsb(struct dlm_ls *ls, int namelen);
20void free_rsb(struct dlm_rsb *r);
21struct dlm_lkb *allocate_lkb(struct dlm_ls *ls);
22void free_lkb(struct dlm_lkb *l);
23struct dlm_direntry *allocate_direntry(struct dlm_ls *ls, int namelen);
24void free_direntry(struct dlm_direntry *de);
25char *allocate_lvb(struct dlm_ls *ls);
26void free_lvb(char *l);
27uint64_t *allocate_range(struct dlm_ls *ls);
28void free_range(uint64_t *l);
29
30#endif /* __MEMORY_DOT_H__ */
31
diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c
new file mode 100644
index 000000000000..d96f9bbb407c
--- /dev/null
+++ b/fs/dlm/midcomms.c
@@ -0,0 +1,140 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14/*
15 * midcomms.c
16 *
17 * This is the appallingly named "mid-level" comms layer.
18 *
19 * Its purpose is to take packets from the "real" comms layer,
20 * split them up into packets and pass them to the interested
21 * part of the locking mechanism.
22 *
23 * It also takes messages from the locking layer, formats them
24 * into packets and sends them to the comms layer.
25 */
26
27#include "dlm_internal.h"
28#include "lowcomms.h"
29#include "config.h"
30#include "rcom.h"
31#include "lock.h"
32#include "midcomms.h"
33
34
35static void copy_from_cb(void *dst, const void *base, unsigned offset,
36 unsigned len, unsigned limit)
37{
38 unsigned copy = len;
39
40 if ((copy + offset) > limit)
41 copy = limit - offset;
42 memcpy(dst, base + offset, copy);
43 len -= copy;
44 if (len)
45 memcpy(dst + copy, base, len);
46}
47
48/*
49 * Called from the low-level comms layer to process a buffer of
50 * commands.
51 *
52 * Only complete messages are processed here, any "spare" bytes from
53 * the end of a buffer are saved and tacked onto the front of the next
54 * message that comes in. I doubt this will happen very often but we
55 * need to be able to cope with it and I don't want the task to be waiting
56 * for packets to come in when there is useful work to be done.
57 */
58
59int dlm_process_incoming_buffer(int nodeid, const void *base,
60 unsigned offset, unsigned len, unsigned limit)
61{
62 unsigned char __tmp[DLM_INBUF_LEN];
63 struct dlm_header *msg = (struct dlm_header *) __tmp;
64 int ret = 0;
65 int err = 0;
66 uint16_t msglen;
67 uint32_t lockspace;
68
69 while (len > sizeof(struct dlm_header)) {
70
71 /* Copy just the header to check the total length. The
72 message may wrap around the end of the buffer back to the
73 start, so we need to use a temp buffer and copy_from_cb. */
74
75 copy_from_cb(msg, base, offset, sizeof(struct dlm_header),
76 limit);
77
78 msglen = le16_to_cpu(msg->h_length);
79 lockspace = msg->h_lockspace;
80
81 err = -EINVAL;
82 if (msglen < sizeof(struct dlm_header))
83 break;
84 err = -E2BIG;
85 if (msglen > dlm_config.buffer_size) {
86 log_print("message size %d from %d too big, buf len %d",
87 msglen, nodeid, len);
88 break;
89 }
90 err = 0;
91
92 /* If only part of the full message is contained in this
93 buffer, then do nothing and wait for lowcomms to call
94 us again later with more data. We return 0 meaning
95 we've consumed none of the input buffer. */
96
97 if (msglen > len)
98 break;
99
100 /* Allocate a larger temp buffer if the full message won't fit
101 in the buffer on the stack (which should work for most
102 ordinary messages). */
103
104 if (msglen > sizeof(__tmp) &&
105 msg == (struct dlm_header *) __tmp) {
106 msg = kmalloc(dlm_config.buffer_size, GFP_KERNEL);
107 if (msg == NULL)
108 return ret;
109 }
110
111 copy_from_cb(msg, base, offset, msglen, limit);
112
113 BUG_ON(lockspace != msg->h_lockspace);
114
115 ret += msglen;
116 offset += msglen;
117 offset &= (limit - 1);
118 len -= msglen;
119
120 switch (msg->h_cmd) {
121 case DLM_MSG:
122 dlm_receive_message(msg, nodeid, FALSE);
123 break;
124
125 case DLM_RCOM:
126 dlm_receive_rcom(msg, nodeid);
127 break;
128
129 default:
130 log_print("unknown msg type %x from %u: %u %u %u %u",
131 msg->h_cmd, nodeid, msglen, len, offset, ret);
132 }
133 }
134
135 if (msg != (struct dlm_header *) __tmp)
136 kfree(msg);
137
138 return err ? err : ret;
139}
140
diff --git a/fs/dlm/midcomms.h b/fs/dlm/midcomms.h
new file mode 100644
index 000000000000..95852a5f111d
--- /dev/null
+++ b/fs/dlm/midcomms.h
@@ -0,0 +1,21 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#ifndef __MIDCOMMS_DOT_H__
15#define __MIDCOMMS_DOT_H__
16
17int dlm_process_incoming_buffer(int nodeid, const void *base, unsigned offset,
18 unsigned len, unsigned limit);
19
20#endif /* __MIDCOMMS_DOT_H__ */
21
diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c
new file mode 100644
index 000000000000..4c5c08a8860e
--- /dev/null
+++ b/fs/dlm/rcom.c
@@ -0,0 +1,460 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#include "dlm_internal.h"
15#include "lockspace.h"
16#include "member.h"
17#include "lowcomms.h"
18#include "midcomms.h"
19#include "rcom.h"
20#include "recover.h"
21#include "dir.h"
22#include "config.h"
23#include "memory.h"
24#include "lock.h"
25#include "util.h"
26
27
28static int rcom_response(struct dlm_ls *ls)
29{
30 return test_bit(LSFL_RCOM_READY, &ls->ls_flags);
31}
32
33static int create_rcom(struct dlm_ls *ls, int to_nodeid, int type, int len,
34 struct dlm_rcom **rc_ret, struct dlm_mhandle **mh_ret)
35{
36 struct dlm_rcom *rc;
37 struct dlm_mhandle *mh;
38 char *mb;
39 int mb_len = sizeof(struct dlm_rcom) + len;
40
41 mh = dlm_lowcomms_get_buffer(to_nodeid, mb_len, GFP_KERNEL, &mb);
42 if (!mh) {
43 log_print("create_rcom to %d type %d len %d ENOBUFS",
44 to_nodeid, type, len);
45 return -ENOBUFS;
46 }
47 memset(mb, 0, mb_len);
48
49 rc = (struct dlm_rcom *) mb;
50
51 rc->rc_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
52 rc->rc_header.h_lockspace = ls->ls_global_id;
53 rc->rc_header.h_nodeid = dlm_our_nodeid();
54 rc->rc_header.h_length = mb_len;
55 rc->rc_header.h_cmd = DLM_RCOM;
56
57 rc->rc_type = type;
58
59 *mh_ret = mh;
60 *rc_ret = rc;
61 return 0;
62}
63
64static void send_rcom(struct dlm_ls *ls, struct dlm_mhandle *mh,
65 struct dlm_rcom *rc)
66{
67 dlm_rcom_out(rc);
68 dlm_lowcomms_commit_buffer(mh);
69}
70
71/* When replying to a status request, a node also sends back its
72 configuration values. The requesting node then checks that the remote
73 node is configured the same way as itself. */
74
75static void make_config(struct dlm_ls *ls, struct rcom_config *rf)
76{
77 rf->rf_lvblen = ls->ls_lvblen;
78 rf->rf_lsflags = ls->ls_exflags;
79}
80
81static int check_config(struct dlm_ls *ls, struct rcom_config *rf, int nodeid)
82{
83 if (rf->rf_lvblen != ls->ls_lvblen ||
84 rf->rf_lsflags != ls->ls_exflags) {
85 log_error(ls, "config mismatch: %d,%x nodeid %d: %d,%x",
86 ls->ls_lvblen, ls->ls_exflags,
87 nodeid, rf->rf_lvblen, rf->rf_lsflags);
88 return -EINVAL;
89 }
90 return 0;
91}
92
93int dlm_rcom_status(struct dlm_ls *ls, int nodeid)
94{
95 struct dlm_rcom *rc;
96 struct dlm_mhandle *mh;
97 int error = 0;
98
99 memset(ls->ls_recover_buf, 0, dlm_config.buffer_size);
100
101 if (nodeid == dlm_our_nodeid()) {
102 rc = (struct dlm_rcom *) ls->ls_recover_buf;
103 rc->rc_result = dlm_recover_status(ls);
104 goto out;
105 }
106
107 error = create_rcom(ls, nodeid, DLM_RCOM_STATUS, 0, &rc, &mh);
108 if (error)
109 goto out;
110
111 send_rcom(ls, mh, rc);
112
113 error = dlm_wait_function(ls, &rcom_response);
114 clear_bit(LSFL_RCOM_READY, &ls->ls_flags);
115 if (error)
116 goto out;
117
118 rc = (struct dlm_rcom *) ls->ls_recover_buf;
119
120 if (rc->rc_result == -ESRCH) {
121 /* we pretend the remote lockspace exists with 0 status */
122 log_debug(ls, "remote node %d not ready", nodeid);
123 rc->rc_result = 0;
124 } else
125 error = check_config(ls, (struct rcom_config *) rc->rc_buf,
126 nodeid);
127 /* the caller looks at rc_result for the remote recovery status */
128 out:
129 return error;
130}
131
132static void receive_rcom_status(struct dlm_ls *ls, struct dlm_rcom *rc_in)
133{
134 struct dlm_rcom *rc;
135 struct dlm_mhandle *mh;
136 int error, nodeid = rc_in->rc_header.h_nodeid;
137
138 error = create_rcom(ls, nodeid, DLM_RCOM_STATUS_REPLY,
139 sizeof(struct rcom_config), &rc, &mh);
140 if (error)
141 return;
142 rc->rc_result = dlm_recover_status(ls);
143 make_config(ls, (struct rcom_config *) rc->rc_buf);
144
145 send_rcom(ls, mh, rc);
146}
147
148static void receive_rcom_status_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
149{
150 memcpy(ls->ls_recover_buf, rc_in, rc_in->rc_header.h_length);
151 set_bit(LSFL_RCOM_READY, &ls->ls_flags);
152 wake_up(&ls->ls_wait_general);
153}
154
155int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name, int last_len)
156{
157 struct dlm_rcom *rc;
158 struct dlm_mhandle *mh;
159 int error = 0, len = sizeof(struct dlm_rcom);
160
161 memset(ls->ls_recover_buf, 0, dlm_config.buffer_size);
162
163 if (nodeid == dlm_our_nodeid()) {
164 dlm_copy_master_names(ls, last_name, last_len,
165 ls->ls_recover_buf + len,
166 dlm_config.buffer_size - len, nodeid);
167 goto out;
168 }
169
170 error = create_rcom(ls, nodeid, DLM_RCOM_NAMES, last_len, &rc, &mh);
171 if (error)
172 goto out;
173 memcpy(rc->rc_buf, last_name, last_len);
174
175 send_rcom(ls, mh, rc);
176
177 error = dlm_wait_function(ls, &rcom_response);
178 clear_bit(LSFL_RCOM_READY, &ls->ls_flags);
179 out:
180 return error;
181}
182
183static void receive_rcom_names(struct dlm_ls *ls, struct dlm_rcom *rc_in)
184{
185 struct dlm_rcom *rc;
186 struct dlm_mhandle *mh;
187 int error, inlen, outlen;
188 int nodeid = rc_in->rc_header.h_nodeid;
189 uint32_t status = dlm_recover_status(ls);
190
191 /*
192 * We can't run dlm_dir_rebuild_send (which uses ls_nodes) while
193 * dlm_recoverd is running ls_nodes_reconfig (which changes ls_nodes).
194 * It could only happen in rare cases where we get a late NAMES
195 * message from a previous instance of recovery.
196 */
197
198 if (!(status & DLM_RS_NODES)) {
199 log_debug(ls, "ignoring RCOM_NAMES from %u", nodeid);
200 return;
201 }
202
203 nodeid = rc_in->rc_header.h_nodeid;
204 inlen = rc_in->rc_header.h_length - sizeof(struct dlm_rcom);
205 outlen = dlm_config.buffer_size - sizeof(struct dlm_rcom);
206
207 error = create_rcom(ls, nodeid, DLM_RCOM_NAMES_REPLY, outlen, &rc, &mh);
208 if (error)
209 return;
210
211 dlm_copy_master_names(ls, rc_in->rc_buf, inlen, rc->rc_buf, outlen,
212 nodeid);
213 send_rcom(ls, mh, rc);
214}
215
216static void receive_rcom_names_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
217{
218 memcpy(ls->ls_recover_buf, rc_in, rc_in->rc_header.h_length);
219 set_bit(LSFL_RCOM_READY, &ls->ls_flags);
220 wake_up(&ls->ls_wait_general);
221}
222
223int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid)
224{
225 struct dlm_rcom *rc;
226 struct dlm_mhandle *mh;
227 struct dlm_ls *ls = r->res_ls;
228 int error;
229
230 error = create_rcom(ls, dir_nodeid, DLM_RCOM_LOOKUP, r->res_length,
231 &rc, &mh);
232 if (error)
233 goto out;
234 memcpy(rc->rc_buf, r->res_name, r->res_length);
235 rc->rc_id = (unsigned long) r;
236
237 send_rcom(ls, mh, rc);
238 out:
239 return error;
240}
241
242static void receive_rcom_lookup(struct dlm_ls *ls, struct dlm_rcom *rc_in)
243{
244 struct dlm_rcom *rc;
245 struct dlm_mhandle *mh;
246 int error, ret_nodeid, nodeid = rc_in->rc_header.h_nodeid;
247 int len = rc_in->rc_header.h_length - sizeof(struct dlm_rcom);
248
249 error = create_rcom(ls, nodeid, DLM_RCOM_LOOKUP_REPLY, 0, &rc, &mh);
250 if (error)
251 return;
252
253 error = dlm_dir_lookup(ls, nodeid, rc_in->rc_buf, len, &ret_nodeid);
254 if (error)
255 ret_nodeid = error;
256 rc->rc_result = ret_nodeid;
257 rc->rc_id = rc_in->rc_id;
258
259 send_rcom(ls, mh, rc);
260}
261
262static void receive_rcom_lookup_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
263{
264 dlm_recover_master_reply(ls, rc_in);
265}
266
267static void pack_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb,
268 struct rcom_lock *rl)
269{
270 memset(rl, 0, sizeof(*rl));
271
272 rl->rl_ownpid = lkb->lkb_ownpid;
273 rl->rl_lkid = lkb->lkb_id;
274 rl->rl_exflags = lkb->lkb_exflags;
275 rl->rl_flags = lkb->lkb_flags;
276 rl->rl_lvbseq = lkb->lkb_lvbseq;
277 rl->rl_rqmode = lkb->lkb_rqmode;
278 rl->rl_grmode = lkb->lkb_grmode;
279 rl->rl_status = lkb->lkb_status;
280 rl->rl_wait_type = lkb->lkb_wait_type;
281
282 if (lkb->lkb_bastaddr)
283 rl->rl_asts |= AST_BAST;
284 if (lkb->lkb_astaddr)
285 rl->rl_asts |= AST_COMP;
286
287 if (lkb->lkb_range)
288 memcpy(rl->rl_range, lkb->lkb_range, 4*sizeof(uint64_t));
289
290 rl->rl_namelen = r->res_length;
291 memcpy(rl->rl_name, r->res_name, r->res_length);
292
293 /* FIXME: might we have an lvb without DLM_LKF_VALBLK set ?
294 If so, receive_rcom_lock_args() won't take this copy. */
295
296 if (lkb->lkb_lvbptr)
297 memcpy(rl->rl_lvb, lkb->lkb_lvbptr, r->res_ls->ls_lvblen);
298}
299
300int dlm_send_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
301{
302 struct dlm_ls *ls = r->res_ls;
303 struct dlm_rcom *rc;
304 struct dlm_mhandle *mh;
305 struct rcom_lock *rl;
306 int error, len = sizeof(struct rcom_lock);
307
308 if (lkb->lkb_lvbptr)
309 len += ls->ls_lvblen;
310
311 error = create_rcom(ls, r->res_nodeid, DLM_RCOM_LOCK, len, &rc, &mh);
312 if (error)
313 goto out;
314
315 rl = (struct rcom_lock *) rc->rc_buf;
316 pack_rcom_lock(r, lkb, rl);
317 rc->rc_id = (unsigned long) r;
318
319 send_rcom(ls, mh, rc);
320 out:
321 return error;
322}
323
324static void receive_rcom_lock(struct dlm_ls *ls, struct dlm_rcom *rc_in)
325{
326 struct dlm_rcom *rc;
327 struct dlm_mhandle *mh;
328 int error, nodeid = rc_in->rc_header.h_nodeid;
329
330 dlm_recover_master_copy(ls, rc_in);
331
332 error = create_rcom(ls, nodeid, DLM_RCOM_LOCK_REPLY,
333 sizeof(struct rcom_lock), &rc, &mh);
334 if (error)
335 return;
336
337 /* We send back the same rcom_lock struct we received, but
338 dlm_recover_master_copy() has filled in rl_remid and rl_result */
339
340 memcpy(rc->rc_buf, rc_in->rc_buf, sizeof(struct rcom_lock));
341 rc->rc_id = rc_in->rc_id;
342
343 send_rcom(ls, mh, rc);
344}
345
346static void receive_rcom_lock_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
347{
348 uint32_t status = dlm_recover_status(ls);
349
350 if (!(status & DLM_RS_DIR)) {
351 log_debug(ls, "ignoring RCOM_LOCK_REPLY from %u",
352 rc_in->rc_header.h_nodeid);
353 return;
354 }
355
356 dlm_recover_process_copy(ls, rc_in);
357}
358
359static int send_ls_not_ready(int nodeid, struct dlm_rcom *rc_in)
360{
361 struct dlm_rcom *rc;
362 struct dlm_mhandle *mh;
363 char *mb;
364 int mb_len = sizeof(struct dlm_rcom);
365
366 mh = dlm_lowcomms_get_buffer(nodeid, mb_len, GFP_KERNEL, &mb);
367 if (!mh)
368 return -ENOBUFS;
369 memset(mb, 0, mb_len);
370
371 rc = (struct dlm_rcom *) mb;
372
373 rc->rc_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
374 rc->rc_header.h_lockspace = rc_in->rc_header.h_lockspace;
375 rc->rc_header.h_nodeid = dlm_our_nodeid();
376 rc->rc_header.h_length = mb_len;
377 rc->rc_header.h_cmd = DLM_RCOM;
378
379 rc->rc_type = DLM_RCOM_STATUS_REPLY;
380 rc->rc_result = -ESRCH;
381
382 dlm_rcom_out(rc);
383 dlm_lowcomms_commit_buffer(mh);
384
385 return 0;
386}
387
388/* Called by dlm_recvd; corresponds to dlm_receive_message() but special
389 recovery-only comms are sent through here. */
390
391void dlm_receive_rcom(struct dlm_header *hd, int nodeid)
392{
393 struct dlm_rcom *rc = (struct dlm_rcom *) hd;
394 struct dlm_ls *ls;
395
396 dlm_rcom_in(rc);
397
398 /* If the lockspace doesn't exist then still send a status message
399 back; it's possible that it just doesn't have its global_id yet. */
400
401 ls = dlm_find_lockspace_global(hd->h_lockspace);
402 if (!ls) {
403 log_print("lockspace %x from %d not found",
404 hd->h_lockspace, nodeid);
405 send_ls_not_ready(nodeid, rc);
406 return;
407 }
408
409 if (dlm_recovery_stopped(ls) && (rc->rc_type != DLM_RCOM_STATUS)) {
410 log_error(ls, "ignoring recovery message %x from %d",
411 rc->rc_type, nodeid);
412 goto out;
413 }
414
415 if (nodeid != rc->rc_header.h_nodeid) {
416 log_error(ls, "bad rcom nodeid %d from %d",
417 rc->rc_header.h_nodeid, nodeid);
418 goto out;
419 }
420
421 switch (rc->rc_type) {
422 case DLM_RCOM_STATUS:
423 receive_rcom_status(ls, rc);
424 break;
425
426 case DLM_RCOM_NAMES:
427 receive_rcom_names(ls, rc);
428 break;
429
430 case DLM_RCOM_LOOKUP:
431 receive_rcom_lookup(ls, rc);
432 break;
433
434 case DLM_RCOM_LOCK:
435 receive_rcom_lock(ls, rc);
436 break;
437
438 case DLM_RCOM_STATUS_REPLY:
439 receive_rcom_status_reply(ls, rc);
440 break;
441
442 case DLM_RCOM_NAMES_REPLY:
443 receive_rcom_names_reply(ls, rc);
444 break;
445
446 case DLM_RCOM_LOOKUP_REPLY:
447 receive_rcom_lookup_reply(ls, rc);
448 break;
449
450 case DLM_RCOM_LOCK_REPLY:
451 receive_rcom_lock_reply(ls, rc);
452 break;
453
454 default:
455 DLM_ASSERT(0, printk("rc_type=%x\n", rc->rc_type););
456 }
457 out:
458 dlm_put_lockspace(ls);
459}
460
diff --git a/fs/dlm/rcom.h b/fs/dlm/rcom.h
new file mode 100644
index 000000000000..d7984321ff41
--- /dev/null
+++ b/fs/dlm/rcom.h
@@ -0,0 +1,24 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#ifndef __RCOM_DOT_H__
15#define __RCOM_DOT_H__
16
17int dlm_rcom_status(struct dlm_ls *ls, int nodeid);
18int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name,int last_len);
19int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid);
20int dlm_send_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb);
21void dlm_receive_rcom(struct dlm_header *hd, int nodeid);
22
23#endif
24
diff --git a/fs/dlm/recover.c b/fs/dlm/recover.c
new file mode 100644
index 000000000000..1712c97bc229
--- /dev/null
+++ b/fs/dlm/recover.c
@@ -0,0 +1,762 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#include "dlm_internal.h"
15#include "lockspace.h"
16#include "dir.h"
17#include "config.h"
18#include "ast.h"
19#include "memory.h"
20#include "rcom.h"
21#include "lock.h"
22#include "lowcomms.h"
23#include "member.h"
24#include "recover.h"
25
26
27/*
28 * Recovery waiting routines: these functions wait for a particular reply from
29 * a remote node, or for the remote node to report a certain status. They need
30 * to abort if the lockspace is stopped indicating a node has failed (perhaps
31 * the one being waited for).
32 */
33
34/*
35 * Wait until given function returns non-zero or lockspace is stopped
36 * (LS_RECOVERY_STOP set due to failure of a node in ls_nodes). When another
37 * function thinks it could have completed the waited-on task, they should wake
38 * up ls_wait_general to get an immediate response rather than waiting for the
39 * timer to detect the result. A timer wakes us up periodically while waiting
40 * to see if we should abort due to a node failure. This should only be called
41 * by the dlm_recoverd thread.
42 */
43
44static void dlm_wait_timer_fn(unsigned long data)
45{
46 struct dlm_ls *ls = (struct dlm_ls *) data;
47 mod_timer(&ls->ls_timer, jiffies + (dlm_config.recover_timer * HZ));
48 wake_up(&ls->ls_wait_general);
49}
50
51int dlm_wait_function(struct dlm_ls *ls, int (*testfn) (struct dlm_ls *ls))
52{
53 int error = 0;
54
55 init_timer(&ls->ls_timer);
56 ls->ls_timer.function = dlm_wait_timer_fn;
57 ls->ls_timer.data = (long) ls;
58 ls->ls_timer.expires = jiffies + (dlm_config.recover_timer * HZ);
59 add_timer(&ls->ls_timer);
60
61 wait_event(ls->ls_wait_general, testfn(ls) || dlm_recovery_stopped(ls));
62 del_timer_sync(&ls->ls_timer);
63
64 if (dlm_recovery_stopped(ls)) {
65 log_debug(ls, "dlm_wait_function aborted");
66 error = -EINTR;
67 }
68 return error;
69}
70
71/*
72 * An efficient way for all nodes to wait for all others to have a certain
73 * status. The node with the lowest nodeid polls all the others for their
74 * status (wait_status_all) and all the others poll the node with the low id
75 * for its accumulated result (wait_status_low). When all nodes have set
76 * status flag X, then status flag X_ALL will be set on the low nodeid.
77 */
78
79uint32_t dlm_recover_status(struct dlm_ls *ls)
80{
81 uint32_t status;
82 spin_lock(&ls->ls_recover_lock);
83 status = ls->ls_recover_status;
84 spin_unlock(&ls->ls_recover_lock);
85 return status;
86}
87
88void dlm_set_recover_status(struct dlm_ls *ls, uint32_t status)
89{
90 spin_lock(&ls->ls_recover_lock);
91 ls->ls_recover_status |= status;
92 spin_unlock(&ls->ls_recover_lock);
93}
94
95static int wait_status_all(struct dlm_ls *ls, uint32_t wait_status)
96{
97 struct dlm_rcom *rc = (struct dlm_rcom *) ls->ls_recover_buf;
98 struct dlm_member *memb;
99 int error = 0, delay;
100
101 list_for_each_entry(memb, &ls->ls_nodes, list) {
102 delay = 0;
103 for (;;) {
104 if (dlm_recovery_stopped(ls)) {
105 error = -EINTR;
106 goto out;
107 }
108
109 error = dlm_rcom_status(ls, memb->nodeid);
110 if (error)
111 goto out;
112
113 if (rc->rc_result & wait_status)
114 break;
115 if (delay < 1000)
116 delay += 20;
117 msleep(delay);
118 }
119 }
120 out:
121 return error;
122}
123
124static int wait_status_low(struct dlm_ls *ls, uint32_t wait_status)
125{
126 struct dlm_rcom *rc = (struct dlm_rcom *) ls->ls_recover_buf;
127 int error = 0, delay = 0, nodeid = ls->ls_low_nodeid;
128
129 for (;;) {
130 if (dlm_recovery_stopped(ls)) {
131 error = -EINTR;
132 goto out;
133 }
134
135 error = dlm_rcom_status(ls, nodeid);
136 if (error)
137 break;
138
139 if (rc->rc_result & wait_status)
140 break;
141 if (delay < 1000)
142 delay += 20;
143 msleep(delay);
144 }
145 out:
146 return error;
147}
148
149static int wait_status(struct dlm_ls *ls, uint32_t status)
150{
151 uint32_t status_all = status << 1;
152 int error;
153
154 if (ls->ls_low_nodeid == dlm_our_nodeid()) {
155 error = wait_status_all(ls, status);
156 if (!error)
157 dlm_set_recover_status(ls, status_all);
158 } else
159 error = wait_status_low(ls, status_all);
160
161 return error;
162}
163
164int dlm_recover_members_wait(struct dlm_ls *ls)
165{
166 return wait_status(ls, DLM_RS_NODES);
167}
168
169int dlm_recover_directory_wait(struct dlm_ls *ls)
170{
171 return wait_status(ls, DLM_RS_DIR);
172}
173
174int dlm_recover_locks_wait(struct dlm_ls *ls)
175{
176 return wait_status(ls, DLM_RS_LOCKS);
177}
178
179int dlm_recover_done_wait(struct dlm_ls *ls)
180{
181 return wait_status(ls, DLM_RS_DONE);
182}
183
184/*
185 * The recover_list contains all the rsb's for which we've requested the new
186 * master nodeid. As replies are returned from the resource directories the
187 * rsb's are removed from the list. When the list is empty we're done.
188 *
189 * The recover_list is later similarly used for all rsb's for which we've sent
190 * new lkb's and need to receive new corresponding lkid's.
191 *
192 * We use the address of the rsb struct as a simple local identifier for the
193 * rsb so we can match an rcom reply with the rsb it was sent for.
194 */
195
196static int recover_list_empty(struct dlm_ls *ls)
197{
198 int empty;
199
200 spin_lock(&ls->ls_recover_list_lock);
201 empty = list_empty(&ls->ls_recover_list);
202 spin_unlock(&ls->ls_recover_list_lock);
203
204 return empty;
205}
206
207static void recover_list_add(struct dlm_rsb *r)
208{
209 struct dlm_ls *ls = r->res_ls;
210
211 spin_lock(&ls->ls_recover_list_lock);
212 if (list_empty(&r->res_recover_list)) {
213 list_add_tail(&r->res_recover_list, &ls->ls_recover_list);
214 ls->ls_recover_list_count++;
215 dlm_hold_rsb(r);
216 }
217 spin_unlock(&ls->ls_recover_list_lock);
218}
219
220static void recover_list_del(struct dlm_rsb *r)
221{
222 struct dlm_ls *ls = r->res_ls;
223
224 spin_lock(&ls->ls_recover_list_lock);
225 list_del_init(&r->res_recover_list);
226 ls->ls_recover_list_count--;
227 spin_unlock(&ls->ls_recover_list_lock);
228
229 dlm_put_rsb(r);
230}
231
232static struct dlm_rsb *recover_list_find(struct dlm_ls *ls, uint64_t id)
233{
234 struct dlm_rsb *r = NULL;
235
236 spin_lock(&ls->ls_recover_list_lock);
237
238 list_for_each_entry(r, &ls->ls_recover_list, res_recover_list) {
239 if (id == (unsigned long) r)
240 goto out;
241 }
242 r = NULL;
243 out:
244 spin_unlock(&ls->ls_recover_list_lock);
245 return r;
246}
247
248static void recover_list_clear(struct dlm_ls *ls)
249{
250 struct dlm_rsb *r, *s;
251
252 spin_lock(&ls->ls_recover_list_lock);
253 list_for_each_entry_safe(r, s, &ls->ls_recover_list, res_recover_list) {
254 list_del_init(&r->res_recover_list);
255 dlm_put_rsb(r);
256 ls->ls_recover_list_count--;
257 }
258
259 if (ls->ls_recover_list_count != 0) {
260 log_error(ls, "warning: recover_list_count %d",
261 ls->ls_recover_list_count);
262 ls->ls_recover_list_count = 0;
263 }
264 spin_unlock(&ls->ls_recover_list_lock);
265}
266
267
268/* Master recovery: find new master node for rsb's that were
269 mastered on nodes that have been removed.
270
271 dlm_recover_masters
272 recover_master
273 dlm_send_rcom_lookup -> receive_rcom_lookup
274 dlm_dir_lookup
275 receive_rcom_lookup_reply <-
276 dlm_recover_master_reply
277 set_new_master
278 set_master_lkbs
279 set_lock_master
280*/
281
282/*
283 * Set the lock master for all LKBs in a lock queue
284 * If we are the new master of the rsb, we may have received new
285 * MSTCPY locks from other nodes already which we need to ignore
286 * when setting the new nodeid.
287 */
288
289static void set_lock_master(struct list_head *queue, int nodeid)
290{
291 struct dlm_lkb *lkb;
292
293 list_for_each_entry(lkb, queue, lkb_statequeue)
294 if (!(lkb->lkb_flags & DLM_IFL_MSTCPY))
295 lkb->lkb_nodeid = nodeid;
296}
297
298static void set_master_lkbs(struct dlm_rsb *r)
299{
300 set_lock_master(&r->res_grantqueue, r->res_nodeid);
301 set_lock_master(&r->res_convertqueue, r->res_nodeid);
302 set_lock_master(&r->res_waitqueue, r->res_nodeid);
303}
304
305/*
306 * Propogate the new master nodeid to locks
307 * The NEW_MASTER flag tells dlm_recover_locks() which rsb's to consider.
308 * The NEW_MASTER2 flag tells recover_lvb() which rsb's to consider.
309 */
310
311static void set_new_master(struct dlm_rsb *r, int nodeid)
312{
313 lock_rsb(r);
314 r->res_nodeid = nodeid;
315 set_master_lkbs(r);
316 rsb_set_flag(r, RSB_NEW_MASTER);
317 rsb_set_flag(r, RSB_NEW_MASTER2);
318 unlock_rsb(r);
319}
320
321/*
322 * We do async lookups on rsb's that need new masters. The rsb's
323 * waiting for a lookup reply are kept on the recover_list.
324 */
325
326static int recover_master(struct dlm_rsb *r)
327{
328 struct dlm_ls *ls = r->res_ls;
329 int error, dir_nodeid, ret_nodeid, our_nodeid = dlm_our_nodeid();
330
331 dir_nodeid = dlm_dir_nodeid(r);
332
333 if (dir_nodeid == our_nodeid) {
334 error = dlm_dir_lookup(ls, our_nodeid, r->res_name,
335 r->res_length, &ret_nodeid);
336 if (error)
337 log_error(ls, "recover dir lookup error %d", error);
338
339 if (ret_nodeid == our_nodeid)
340 ret_nodeid = 0;
341 set_new_master(r, ret_nodeid);
342 } else {
343 recover_list_add(r);
344 error = dlm_send_rcom_lookup(r, dir_nodeid);
345 }
346
347 return error;
348}
349
350/*
351 * When not using a directory, most resource names will hash to a new static
352 * master nodeid and the resource will need to be remastered.
353 */
354
355static int recover_master_static(struct dlm_rsb *r)
356{
357 int master = dlm_dir_nodeid(r);
358
359 if (master == dlm_our_nodeid())
360 master = 0;
361
362 if (r->res_nodeid != master) {
363 if (is_master(r))
364 dlm_purge_mstcpy_locks(r);
365 set_new_master(r, master);
366 return 1;
367 }
368 return 0;
369}
370
371/*
372 * Go through local root resources and for each rsb which has a master which
373 * has departed, get the new master nodeid from the directory. The dir will
374 * assign mastery to the first node to look up the new master. That means
375 * we'll discover in this lookup if we're the new master of any rsb's.
376 *
377 * We fire off all the dir lookup requests individually and asynchronously to
378 * the correct dir node.
379 */
380
381int dlm_recover_masters(struct dlm_ls *ls)
382{
383 struct dlm_rsb *r;
384 int error = 0, count = 0;
385
386 log_debug(ls, "dlm_recover_masters");
387
388 down_read(&ls->ls_root_sem);
389 list_for_each_entry(r, &ls->ls_root_list, res_root_list) {
390 if (dlm_recovery_stopped(ls)) {
391 up_read(&ls->ls_root_sem);
392 error = -EINTR;
393 goto out;
394 }
395
396 if (dlm_no_directory(ls))
397 count += recover_master_static(r);
398 else if (!is_master(r) && dlm_is_removed(ls, r->res_nodeid)) {
399 recover_master(r);
400 count++;
401 }
402
403 schedule();
404 }
405 up_read(&ls->ls_root_sem);
406
407 log_debug(ls, "dlm_recover_masters %d resources", count);
408
409 error = dlm_wait_function(ls, &recover_list_empty);
410 out:
411 if (error)
412 recover_list_clear(ls);
413 return error;
414}
415
416int dlm_recover_master_reply(struct dlm_ls *ls, struct dlm_rcom *rc)
417{
418 struct dlm_rsb *r;
419 int nodeid;
420
421 r = recover_list_find(ls, rc->rc_id);
422 if (!r) {
423 log_error(ls, "dlm_recover_master_reply no id %"PRIx64"",
424 rc->rc_id);
425 goto out;
426 }
427
428 nodeid = rc->rc_result;
429 if (nodeid == dlm_our_nodeid())
430 nodeid = 0;
431
432 set_new_master(r, nodeid);
433 recover_list_del(r);
434
435 if (recover_list_empty(ls))
436 wake_up(&ls->ls_wait_general);
437 out:
438 return 0;
439}
440
441
442/* Lock recovery: rebuild the process-copy locks we hold on a
443 remastered rsb on the new rsb master.
444
445 dlm_recover_locks
446 recover_locks
447 recover_locks_queue
448 dlm_send_rcom_lock -> receive_rcom_lock
449 dlm_recover_master_copy
450 receive_rcom_lock_reply <-
451 dlm_recover_process_copy
452*/
453
454
455/*
456 * keep a count of the number of lkb's we send to the new master; when we get
457 * an equal number of replies then recovery for the rsb is done
458 */
459
460static int recover_locks_queue(struct dlm_rsb *r, struct list_head *head)
461{
462 struct dlm_lkb *lkb;
463 int error = 0;
464
465 list_for_each_entry(lkb, head, lkb_statequeue) {
466 error = dlm_send_rcom_lock(r, lkb);
467 if (error)
468 break;
469 r->res_recover_locks_count++;
470 }
471
472 return error;
473}
474
475static int all_queues_empty(struct dlm_rsb *r)
476{
477 if (!list_empty(&r->res_grantqueue) ||
478 !list_empty(&r->res_convertqueue) ||
479 !list_empty(&r->res_waitqueue))
480 return FALSE;
481 return TRUE;
482}
483
484static int recover_locks(struct dlm_rsb *r)
485{
486 int error = 0;
487
488 lock_rsb(r);
489 if (all_queues_empty(r))
490 goto out;
491
492 DLM_ASSERT(!r->res_recover_locks_count, dlm_print_rsb(r););
493
494 error = recover_locks_queue(r, &r->res_grantqueue);
495 if (error)
496 goto out;
497 error = recover_locks_queue(r, &r->res_convertqueue);
498 if (error)
499 goto out;
500 error = recover_locks_queue(r, &r->res_waitqueue);
501 if (error)
502 goto out;
503
504 if (r->res_recover_locks_count)
505 recover_list_add(r);
506 else
507 rsb_clear_flag(r, RSB_NEW_MASTER);
508 out:
509 unlock_rsb(r);
510 return error;
511}
512
513int dlm_recover_locks(struct dlm_ls *ls)
514{
515 struct dlm_rsb *r;
516 int error, count = 0;
517
518 log_debug(ls, "dlm_recover_locks");
519
520 down_read(&ls->ls_root_sem);
521 list_for_each_entry(r, &ls->ls_root_list, res_root_list) {
522 if (is_master(r)) {
523 rsb_clear_flag(r, RSB_NEW_MASTER);
524 continue;
525 }
526
527 if (!rsb_flag(r, RSB_NEW_MASTER))
528 continue;
529
530 if (dlm_recovery_stopped(ls)) {
531 error = -EINTR;
532 up_read(&ls->ls_root_sem);
533 goto out;
534 }
535
536 error = recover_locks(r);
537 if (error) {
538 up_read(&ls->ls_root_sem);
539 goto out;
540 }
541
542 count += r->res_recover_locks_count;
543 }
544 up_read(&ls->ls_root_sem);
545
546 log_debug(ls, "dlm_recover_locks %d locks", count);
547
548 error = dlm_wait_function(ls, &recover_list_empty);
549 out:
550 if (error)
551 recover_list_clear(ls);
552 else
553 dlm_set_recover_status(ls, DLM_RS_LOCKS);
554 return error;
555}
556
557void dlm_recovered_lock(struct dlm_rsb *r)
558{
559 DLM_ASSERT(rsb_flag(r, RSB_NEW_MASTER), dlm_print_rsb(r););
560
561 r->res_recover_locks_count--;
562 if (!r->res_recover_locks_count) {
563 rsb_clear_flag(r, RSB_NEW_MASTER);
564 recover_list_del(r);
565 }
566
567 if (recover_list_empty(r->res_ls))
568 wake_up(&r->res_ls->ls_wait_general);
569}
570
571/*
572 * The lvb needs to be recovered on all master rsb's. This includes setting
573 * the VALNOTVALID flag if necessary, and determining the correct lvb contents
574 * based on the lvb's of the locks held on the rsb.
575 *
576 * RSB_VALNOTVALID is set if there are only NL/CR locks on the rsb. If it
577 * was already set prior to recovery, it's not cleared, regardless of locks.
578 *
579 * The LVB contents are only considered for changing when this is a new master
580 * of the rsb (NEW_MASTER2). Then, the rsb's lvb is taken from any lkb with
581 * mode > CR. If no lkb's exist with mode above CR, the lvb contents are taken
582 * from the lkb with the largest lvb sequence number.
583 */
584
585static void recover_lvb(struct dlm_rsb *r)
586{
587 struct dlm_lkb *lkb, *high_lkb = NULL;
588 uint32_t high_seq = 0;
589 int lock_lvb_exists = FALSE;
590 int big_lock_exists = FALSE;
591 int lvblen = r->res_ls->ls_lvblen;
592
593 list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) {
594 if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
595 continue;
596
597 lock_lvb_exists = TRUE;
598
599 if (lkb->lkb_grmode > DLM_LOCK_CR) {
600 big_lock_exists = TRUE;
601 goto setflag;
602 }
603
604 if (((int)lkb->lkb_lvbseq - (int)high_seq) >= 0) {
605 high_lkb = lkb;
606 high_seq = lkb->lkb_lvbseq;
607 }
608 }
609
610 list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue) {
611 if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
612 continue;
613
614 lock_lvb_exists = TRUE;
615
616 if (lkb->lkb_grmode > DLM_LOCK_CR) {
617 big_lock_exists = TRUE;
618 goto setflag;
619 }
620
621 if (((int)lkb->lkb_lvbseq - (int)high_seq) >= 0) {
622 high_lkb = lkb;
623 high_seq = lkb->lkb_lvbseq;
624 }
625 }
626
627 setflag:
628 if (!lock_lvb_exists)
629 goto out;
630
631 if (!big_lock_exists)
632 rsb_set_flag(r, RSB_VALNOTVALID);
633
634 /* don't mess with the lvb unless we're the new master */
635 if (!rsb_flag(r, RSB_NEW_MASTER2))
636 goto out;
637
638 if (!r->res_lvbptr) {
639 r->res_lvbptr = allocate_lvb(r->res_ls);
640 if (!r->res_lvbptr)
641 goto out;
642 }
643
644 if (big_lock_exists) {
645 r->res_lvbseq = lkb->lkb_lvbseq;
646 memcpy(r->res_lvbptr, lkb->lkb_lvbptr, lvblen);
647 } else if (high_lkb) {
648 r->res_lvbseq = high_lkb->lkb_lvbseq;
649 memcpy(r->res_lvbptr, high_lkb->lkb_lvbptr, lvblen);
650 } else {
651 r->res_lvbseq = 0;
652 memset(r->res_lvbptr, 0, lvblen);
653 }
654 out:
655 return;
656}
657
658/* All master rsb's flagged RECOVER_CONVERT need to be looked at. The locks
659 converting PR->CW or CW->PR need to have their lkb_grmode set. */
660
661static void recover_conversion(struct dlm_rsb *r)
662{
663 struct dlm_lkb *lkb;
664 int grmode = -1;
665
666 list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) {
667 if (lkb->lkb_grmode == DLM_LOCK_PR ||
668 lkb->lkb_grmode == DLM_LOCK_CW) {
669 grmode = lkb->lkb_grmode;
670 break;
671 }
672 }
673
674 list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue) {
675 if (lkb->lkb_grmode != DLM_LOCK_IV)
676 continue;
677 if (grmode == -1)
678 lkb->lkb_grmode = lkb->lkb_rqmode;
679 else
680 lkb->lkb_grmode = grmode;
681 }
682}
683
684void dlm_recover_rsbs(struct dlm_ls *ls)
685{
686 struct dlm_rsb *r;
687 int count = 0;
688
689 log_debug(ls, "dlm_recover_rsbs");
690
691 down_read(&ls->ls_root_sem);
692 list_for_each_entry(r, &ls->ls_root_list, res_root_list) {
693 lock_rsb(r);
694 if (is_master(r)) {
695 if (rsb_flag(r, RSB_RECOVER_CONVERT))
696 recover_conversion(r);
697 recover_lvb(r);
698 count++;
699 }
700 rsb_clear_flag(r, RSB_RECOVER_CONVERT);
701 unlock_rsb(r);
702 }
703 up_read(&ls->ls_root_sem);
704
705 log_debug(ls, "dlm_recover_rsbs %d rsbs", count);
706}
707
708/* Create a single list of all root rsb's to be used during recovery */
709
710int dlm_create_root_list(struct dlm_ls *ls)
711{
712 struct dlm_rsb *r;
713 int i, error = 0;
714
715 down_write(&ls->ls_root_sem);
716 if (!list_empty(&ls->ls_root_list)) {
717 log_error(ls, "root list not empty");
718 error = -EINVAL;
719 goto out;
720 }
721
722 for (i = 0; i < ls->ls_rsbtbl_size; i++) {
723 read_lock(&ls->ls_rsbtbl[i].lock);
724 list_for_each_entry(r, &ls->ls_rsbtbl[i].list, res_hashchain) {
725 list_add(&r->res_root_list, &ls->ls_root_list);
726 dlm_hold_rsb(r);
727 }
728 read_unlock(&ls->ls_rsbtbl[i].lock);
729 }
730 out:
731 up_write(&ls->ls_root_sem);
732 return error;
733}
734
735void dlm_release_root_list(struct dlm_ls *ls)
736{
737 struct dlm_rsb *r, *safe;
738
739 down_write(&ls->ls_root_sem);
740 list_for_each_entry_safe(r, safe, &ls->ls_root_list, res_root_list) {
741 list_del_init(&r->res_root_list);
742 dlm_put_rsb(r);
743 }
744 up_write(&ls->ls_root_sem);
745}
746
747void dlm_clear_toss_list(struct dlm_ls *ls)
748{
749 struct dlm_rsb *r, *safe;
750 int i;
751
752 for (i = 0; i < ls->ls_rsbtbl_size; i++) {
753 write_lock(&ls->ls_rsbtbl[i].lock);
754 list_for_each_entry_safe(r, safe, &ls->ls_rsbtbl[i].toss,
755 res_hashchain) {
756 list_del(&r->res_hashchain);
757 free_rsb(r);
758 }
759 write_unlock(&ls->ls_rsbtbl[i].lock);
760 }
761}
762
diff --git a/fs/dlm/recover.h b/fs/dlm/recover.h
new file mode 100644
index 000000000000..ebd0363f1e08
--- /dev/null
+++ b/fs/dlm/recover.h
@@ -0,0 +1,34 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#ifndef __RECOVER_DOT_H__
15#define __RECOVER_DOT_H__
16
17int dlm_wait_function(struct dlm_ls *ls, int (*testfn) (struct dlm_ls *ls));
18uint32_t dlm_recover_status(struct dlm_ls *ls);
19void dlm_set_recover_status(struct dlm_ls *ls, uint32_t status);
20int dlm_recover_members_wait(struct dlm_ls *ls);
21int dlm_recover_directory_wait(struct dlm_ls *ls);
22int dlm_recover_locks_wait(struct dlm_ls *ls);
23int dlm_recover_done_wait(struct dlm_ls *ls);
24int dlm_recover_masters(struct dlm_ls *ls);
25int dlm_recover_master_reply(struct dlm_ls *ls, struct dlm_rcom *rc);
26int dlm_recover_locks(struct dlm_ls *ls);
27void dlm_recovered_lock(struct dlm_rsb *r);
28int dlm_create_root_list(struct dlm_ls *ls);
29void dlm_release_root_list(struct dlm_ls *ls);
30void dlm_clear_toss_list(struct dlm_ls *ls);
31void dlm_recover_rsbs(struct dlm_ls *ls);
32
33#endif /* __RECOVER_DOT_H__ */
34
diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c
new file mode 100644
index 000000000000..06e4f7cab6e7
--- /dev/null
+++ b/fs/dlm/recoverd.c
@@ -0,0 +1,285 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#include "dlm_internal.h"
15#include "lockspace.h"
16#include "member.h"
17#include "dir.h"
18#include "ast.h"
19#include "recover.h"
20#include "lowcomms.h"
21#include "lock.h"
22#include "requestqueue.h"
23#include "recoverd.h"
24
25
26/* If the start for which we're re-enabling locking (seq) has been superseded
27 by a newer stop (ls_recover_seq), we need to leave locking disabled. */
28
29static int enable_locking(struct dlm_ls *ls, uint64_t seq)
30{
31 int error = -EINTR;
32
33 spin_lock(&ls->ls_recover_lock);
34 if (ls->ls_recover_seq == seq) {
35 set_bit(LSFL_RUNNING, &ls->ls_flags);
36 up_write(&ls->ls_in_recovery);
37 error = 0;
38 }
39 spin_unlock(&ls->ls_recover_lock);
40 return error;
41}
42
43static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
44{
45 unsigned long start;
46 int error, neg = 0;
47
48 log_debug(ls, "recover %"PRIx64"", rv->seq);
49
50 down(&ls->ls_recoverd_active);
51
52 /*
53 * Suspending and resuming dlm_astd ensures that no lkb's from this ls
54 * will be processed by dlm_astd during recovery.
55 */
56
57 dlm_astd_suspend();
58 dlm_astd_resume();
59
60 /*
61 * This list of root rsb's will be the basis of most of the recovery
62 * routines.
63 */
64
65 dlm_create_root_list(ls);
66
67 /*
68 * Free all the tossed rsb's so we don't have to recover them.
69 */
70
71 dlm_clear_toss_list(ls);
72
73 /*
74 * Add or remove nodes from the lockspace's ls_nodes list.
75 * Also waits for all nodes to complete dlm_recover_members.
76 */
77
78 error = dlm_recover_members(ls, rv, &neg);
79 if (error) {
80 log_error(ls, "recover_members failed %d", error);
81 goto fail;
82 }
83 start = jiffies;
84
85 /*
86 * Rebuild our own share of the directory by collecting from all other
87 * nodes their master rsb names that hash to us.
88 */
89
90 error = dlm_recover_directory(ls);
91 if (error) {
92 log_error(ls, "recover_directory failed %d", error);
93 goto fail;
94 }
95
96 /*
97 * Purge directory-related requests that are saved in requestqueue.
98 * All dir requests from before recovery are invalid now due to the dir
99 * rebuild and will be resent by the requesting nodes.
100 */
101
102 dlm_purge_requestqueue(ls);
103
104 /*
105 * Wait for all nodes to complete directory rebuild.
106 */
107
108 error = dlm_recover_directory_wait(ls);
109 if (error) {
110 log_error(ls, "recover_directory_wait failed %d", error);
111 goto fail;
112 }
113
114 /*
115 * We may have outstanding operations that are waiting for a reply from
116 * a failed node. Mark these to be resent after recovery. Unlock and
117 * cancel ops can just be completed.
118 */
119
120 dlm_recover_waiters_pre(ls);
121
122 error = dlm_recovery_stopped(ls);
123 if (error)
124 goto fail;
125
126 if (neg || dlm_no_directory(ls)) {
127 /*
128 * Clear lkb's for departed nodes.
129 */
130
131 dlm_purge_locks(ls);
132
133 /*
134 * Get new master nodeid's for rsb's that were mastered on
135 * departed nodes.
136 */
137
138 error = dlm_recover_masters(ls);
139 if (error) {
140 log_error(ls, "recover_masters failed %d", error);
141 goto fail;
142 }
143
144 /*
145 * Send our locks on remastered rsb's to the new masters.
146 */
147
148 error = dlm_recover_locks(ls);
149 if (error) {
150 log_error(ls, "recover_locks failed %d", error);
151 goto fail;
152 }
153
154 error = dlm_recover_locks_wait(ls);
155 if (error) {
156 log_error(ls, "recover_locks_wait failed %d", error);
157 goto fail;
158 }
159
160 /*
161 * Finalize state in master rsb's now that all locks can be
162 * checked. This includes conversion resolution and lvb
163 * settings.
164 */
165
166 dlm_recover_rsbs(ls);
167 }
168
169 dlm_release_root_list(ls);
170
171 dlm_set_recover_status(ls, DLM_RS_DONE);
172 error = dlm_recover_done_wait(ls);
173 if (error) {
174 log_error(ls, "recover_done_wait failed %d", error);
175 goto fail;
176 }
177
178 dlm_clear_members_gone(ls);
179
180 error = enable_locking(ls, rv->seq);
181 if (error) {
182 log_error(ls, "enable_locking failed %d", error);
183 goto fail;
184 }
185
186 error = dlm_process_requestqueue(ls);
187 if (error) {
188 log_error(ls, "process_requestqueue failed %d", error);
189 goto fail;
190 }
191
192 error = dlm_recover_waiters_post(ls);
193 if (error) {
194 log_error(ls, "recover_waiters_post failed %d", error);
195 goto fail;
196 }
197
198 dlm_grant_after_purge(ls);
199
200 dlm_astd_wake();
201
202 log_debug(ls, "recover %"PRIx64" done: %u ms", rv->seq,
203 jiffies_to_msecs(jiffies - start));
204 up(&ls->ls_recoverd_active);
205
206 return 0;
207
208 fail:
209 dlm_release_root_list(ls);
210 log_debug(ls, "recover %"PRIx64" error %d", rv->seq, error);
211 up(&ls->ls_recoverd_active);
212 return error;
213}
214
215static void do_ls_recovery(struct dlm_ls *ls)
216{
217 struct dlm_recover *rv = NULL;
218
219 spin_lock(&ls->ls_recover_lock);
220 rv = ls->ls_recover_args;
221 ls->ls_recover_args = NULL;
222 clear_bit(LSFL_RECOVERY_STOP, &ls->ls_flags);
223 spin_unlock(&ls->ls_recover_lock);
224
225 if (rv) {
226 ls_recover(ls, rv);
227 kfree(rv->nodeids);
228 kfree(rv);
229 }
230}
231
232static int dlm_recoverd(void *arg)
233{
234 struct dlm_ls *ls;
235
236 ls = dlm_find_lockspace_local(arg);
237
238 while (!kthread_should_stop()) {
239 set_current_state(TASK_INTERRUPTIBLE);
240 if (!test_bit(LSFL_WORK, &ls->ls_flags))
241 schedule();
242 set_current_state(TASK_RUNNING);
243
244 if (test_and_clear_bit(LSFL_WORK, &ls->ls_flags))
245 do_ls_recovery(ls);
246 }
247
248 dlm_put_lockspace(ls);
249 return 0;
250}
251
252void dlm_recoverd_kick(struct dlm_ls *ls)
253{
254 set_bit(LSFL_WORK, &ls->ls_flags);
255 wake_up_process(ls->ls_recoverd_task);
256}
257
258int dlm_recoverd_start(struct dlm_ls *ls)
259{
260 struct task_struct *p;
261 int error = 0;
262
263 p = kthread_run(dlm_recoverd, ls, "dlm_recoverd");
264 if (IS_ERR(p))
265 error = PTR_ERR(p);
266 else
267 ls->ls_recoverd_task = p;
268 return error;
269}
270
271void dlm_recoverd_stop(struct dlm_ls *ls)
272{
273 kthread_stop(ls->ls_recoverd_task);
274}
275
276void dlm_recoverd_suspend(struct dlm_ls *ls)
277{
278 down(&ls->ls_recoverd_active);
279}
280
281void dlm_recoverd_resume(struct dlm_ls *ls)
282{
283 up(&ls->ls_recoverd_active);
284}
285
diff --git a/fs/dlm/recoverd.h b/fs/dlm/recoverd.h
new file mode 100644
index 000000000000..866657c5d69d
--- /dev/null
+++ b/fs/dlm/recoverd.h
@@ -0,0 +1,24 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#ifndef __RECOVERD_DOT_H__
15#define __RECOVERD_DOT_H__
16
17void dlm_recoverd_kick(struct dlm_ls *ls);
18void dlm_recoverd_stop(struct dlm_ls *ls);
19int dlm_recoverd_start(struct dlm_ls *ls);
20void dlm_recoverd_suspend(struct dlm_ls *ls);
21void dlm_recoverd_resume(struct dlm_ls *ls);
22
23#endif /* __RECOVERD_DOT_H__ */
24
diff --git a/fs/dlm/requestqueue.c b/fs/dlm/requestqueue.c
new file mode 100644
index 000000000000..36afe99e4f93
--- /dev/null
+++ b/fs/dlm/requestqueue.c
@@ -0,0 +1,184 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13#include "dlm_internal.h"
14#include "member.h"
15#include "lock.h"
16#include "dir.h"
17#include "config.h"
18#include "requestqueue.h"
19
20struct rq_entry {
21 struct list_head list;
22 int nodeid;
23 char request[1];
24};
25
26/*
27 * Requests received while the lockspace is in recovery get added to the
28 * request queue and processed when recovery is complete. This happens when
29 * the lockspace is suspended on some nodes before it is on others, or the
30 * lockspace is enabled on some while still suspended on others.
31 */
32
33void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_header *hd)
34{
35 struct rq_entry *e;
36 int length = hd->h_length;
37
38 if (dlm_is_removed(ls, nodeid))
39 return;
40
41 e = kmalloc(sizeof(struct rq_entry) + length, GFP_KERNEL);
42 if (!e) {
43 log_print("dlm_add_requestqueue: out of memory\n");
44 return;
45 }
46
47 e->nodeid = nodeid;
48 memcpy(e->request, hd, length);
49
50 down(&ls->ls_requestqueue_lock);
51 list_add_tail(&e->list, &ls->ls_requestqueue);
52 up(&ls->ls_requestqueue_lock);
53}
54
55int dlm_process_requestqueue(struct dlm_ls *ls)
56{
57 struct rq_entry *e;
58 struct dlm_header *hd;
59 int error = 0;
60
61 down(&ls->ls_requestqueue_lock);
62
63 for (;;) {
64 if (list_empty(&ls->ls_requestqueue)) {
65 up(&ls->ls_requestqueue_lock);
66 error = 0;
67 break;
68 }
69 e = list_entry(ls->ls_requestqueue.next, struct rq_entry, list);
70 up(&ls->ls_requestqueue_lock);
71
72 hd = (struct dlm_header *) e->request;
73 error = dlm_receive_message(hd, e->nodeid, TRUE);
74
75 if (error == -EINTR) {
76 /* entry is left on requestqueue */
77 log_debug(ls, "process_requestqueue abort eintr");
78 break;
79 }
80
81 down(&ls->ls_requestqueue_lock);
82 list_del(&e->list);
83 kfree(e);
84
85 if (dlm_locking_stopped(ls)) {
86 log_debug(ls, "process_requestqueue abort running");
87 up(&ls->ls_requestqueue_lock);
88 error = -EINTR;
89 break;
90 }
91 schedule();
92 }
93
94 return error;
95}
96
97/*
98 * After recovery is done, locking is resumed and dlm_recoverd takes all the
99 * saved requests and processes them as they would have been by dlm_recvd. At
100 * the same time, dlm_recvd will start receiving new requests from remote
101 * nodes. We want to delay dlm_recvd processing new requests until
102 * dlm_recoverd has finished processing the old saved requests.
103 */
104
105void dlm_wait_requestqueue(struct dlm_ls *ls)
106{
107 for (;;) {
108 down(&ls->ls_requestqueue_lock);
109 if (list_empty(&ls->ls_requestqueue))
110 break;
111 if (dlm_locking_stopped(ls))
112 break;
113 up(&ls->ls_requestqueue_lock);
114 schedule();
115 }
116 up(&ls->ls_requestqueue_lock);
117}
118
119static int purge_request(struct dlm_ls *ls, struct dlm_message *ms, int nodeid)
120{
121 uint32_t type = ms->m_type;
122
123 if (dlm_is_removed(ls, nodeid))
124 return 1;
125
126 /* directory operations are always purged because the directory is
127 always rebuilt during recovery and the lookups resent */
128
129 if (type == DLM_MSG_REMOVE ||
130 type == DLM_MSG_LOOKUP ||
131 type == DLM_MSG_LOOKUP_REPLY)
132 return 1;
133
134 if (!dlm_no_directory(ls))
135 return 0;
136
137 /* with no directory, the master is likely to change as a part of
138 recovery; requests to/from the defunct master need to be purged */
139
140 switch (type) {
141 case DLM_MSG_REQUEST:
142 case DLM_MSG_CONVERT:
143 case DLM_MSG_UNLOCK:
144 case DLM_MSG_CANCEL:
145 /* we're no longer the master of this resource, the sender
146 will resend to the new master (see waiter_needs_recovery) */
147
148 if (dlm_hash2nodeid(ls, ms->m_hash) != dlm_our_nodeid())
149 return 1;
150 break;
151
152 case DLM_MSG_REQUEST_REPLY:
153 case DLM_MSG_CONVERT_REPLY:
154 case DLM_MSG_UNLOCK_REPLY:
155 case DLM_MSG_CANCEL_REPLY:
156 case DLM_MSG_GRANT:
157 /* this reply is from the former master of the resource,
158 we'll resend to the new master if needed */
159
160 if (dlm_hash2nodeid(ls, ms->m_hash) != nodeid)
161 return 1;
162 break;
163 }
164
165 return 0;
166}
167
168void dlm_purge_requestqueue(struct dlm_ls *ls)
169{
170 struct dlm_message *ms;
171 struct rq_entry *e, *safe;
172
173 down(&ls->ls_requestqueue_lock);
174 list_for_each_entry_safe(e, safe, &ls->ls_requestqueue, list) {
175 ms = (struct dlm_message *) e->request;
176
177 if (purge_request(ls, ms, e->nodeid)) {
178 list_del(&e->list);
179 kfree(e);
180 }
181 }
182 up(&ls->ls_requestqueue_lock);
183}
184
diff --git a/fs/dlm/requestqueue.h b/fs/dlm/requestqueue.h
new file mode 100644
index 000000000000..349f0d292d95
--- /dev/null
+++ b/fs/dlm/requestqueue.h
@@ -0,0 +1,22 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13#ifndef __REQUESTQUEUE_DOT_H__
14#define __REQUESTQUEUE_DOT_H__
15
16void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_header *hd);
17int dlm_process_requestqueue(struct dlm_ls *ls);
18void dlm_wait_requestqueue(struct dlm_ls *ls);
19void dlm_purge_requestqueue(struct dlm_ls *ls);
20
21#endif
22
diff --git a/fs/dlm/util.c b/fs/dlm/util.c
new file mode 100644
index 000000000000..826d122edf55
--- /dev/null
+++ b/fs/dlm/util.c
@@ -0,0 +1,173 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13#include "dlm_internal.h"
14#include "rcom.h"
15#include "util.h"
16
17static void header_out(struct dlm_header *hd)
18{
19 hd->h_version = cpu_to_le32(hd->h_version);
20 hd->h_lockspace = cpu_to_le32(hd->h_lockspace);
21 hd->h_nodeid = cpu_to_le32(hd->h_nodeid);
22 hd->h_length = cpu_to_le16(hd->h_length);
23}
24
25static void header_in(struct dlm_header *hd)
26{
27 hd->h_version = le32_to_cpu(hd->h_version);
28 hd->h_lockspace = le32_to_cpu(hd->h_lockspace);
29 hd->h_nodeid = le32_to_cpu(hd->h_nodeid);
30 hd->h_length = le16_to_cpu(hd->h_length);
31}
32
33void dlm_message_out(struct dlm_message *ms)
34{
35 struct dlm_header *hd = (struct dlm_header *) ms;
36
37 header_out(hd);
38
39 ms->m_type = cpu_to_le32(ms->m_type);
40 ms->m_nodeid = cpu_to_le32(ms->m_nodeid);
41 ms->m_pid = cpu_to_le32(ms->m_pid);
42 ms->m_lkid = cpu_to_le32(ms->m_lkid);
43 ms->m_remid = cpu_to_le32(ms->m_remid);
44 ms->m_parent_lkid = cpu_to_le32(ms->m_parent_lkid);
45 ms->m_parent_remid = cpu_to_le32(ms->m_parent_remid);
46 ms->m_exflags = cpu_to_le32(ms->m_exflags);
47 ms->m_sbflags = cpu_to_le32(ms->m_sbflags);
48 ms->m_flags = cpu_to_le32(ms->m_flags);
49 ms->m_lvbseq = cpu_to_le32(ms->m_lvbseq);
50 ms->m_hash = cpu_to_le32(ms->m_hash);
51 ms->m_status = cpu_to_le32(ms->m_status);
52 ms->m_grmode = cpu_to_le32(ms->m_grmode);
53 ms->m_rqmode = cpu_to_le32(ms->m_rqmode);
54 ms->m_bastmode = cpu_to_le32(ms->m_bastmode);
55 ms->m_asts = cpu_to_le32(ms->m_asts);
56 ms->m_result = cpu_to_le32(ms->m_result);
57 ms->m_range[0] = cpu_to_le64(ms->m_range[0]);
58 ms->m_range[1] = cpu_to_le64(ms->m_range[1]);
59}
60
61void dlm_message_in(struct dlm_message *ms)
62{
63 struct dlm_header *hd = (struct dlm_header *) ms;
64
65 header_in(hd);
66
67 ms->m_type = le32_to_cpu(ms->m_type);
68 ms->m_nodeid = le32_to_cpu(ms->m_nodeid);
69 ms->m_pid = le32_to_cpu(ms->m_pid);
70 ms->m_lkid = le32_to_cpu(ms->m_lkid);
71 ms->m_remid = le32_to_cpu(ms->m_remid);
72 ms->m_parent_lkid = le32_to_cpu(ms->m_parent_lkid);
73 ms->m_parent_remid = le32_to_cpu(ms->m_parent_remid);
74 ms->m_exflags = le32_to_cpu(ms->m_exflags);
75 ms->m_sbflags = le32_to_cpu(ms->m_sbflags);
76 ms->m_flags = le32_to_cpu(ms->m_flags);
77 ms->m_lvbseq = le32_to_cpu(ms->m_lvbseq);
78 ms->m_hash = le32_to_cpu(ms->m_hash);
79 ms->m_status = le32_to_cpu(ms->m_status);
80 ms->m_grmode = le32_to_cpu(ms->m_grmode);
81 ms->m_rqmode = le32_to_cpu(ms->m_rqmode);
82 ms->m_bastmode = le32_to_cpu(ms->m_bastmode);
83 ms->m_asts = le32_to_cpu(ms->m_asts);
84 ms->m_result = le32_to_cpu(ms->m_result);
85 ms->m_range[0] = le64_to_cpu(ms->m_range[0]);
86 ms->m_range[1] = le64_to_cpu(ms->m_range[1]);
87}
88
89static void rcom_lock_out(struct rcom_lock *rl)
90{
91 rl->rl_ownpid = cpu_to_le32(rl->rl_ownpid);
92 rl->rl_lkid = cpu_to_le32(rl->rl_lkid);
93 rl->rl_remid = cpu_to_le32(rl->rl_remid);
94 rl->rl_parent_lkid = cpu_to_le32(rl->rl_parent_lkid);
95 rl->rl_parent_remid = cpu_to_le32(rl->rl_parent_remid);
96 rl->rl_exflags = cpu_to_le32(rl->rl_exflags);
97 rl->rl_flags = cpu_to_le32(rl->rl_flags);
98 rl->rl_lvbseq = cpu_to_le32(rl->rl_lvbseq);
99 rl->rl_result = cpu_to_le32(rl->rl_result);
100 rl->rl_wait_type = cpu_to_le16(rl->rl_wait_type);
101 rl->rl_namelen = cpu_to_le16(rl->rl_namelen);
102 rl->rl_range[0] = cpu_to_le64(rl->rl_range[0]);
103 rl->rl_range[1] = cpu_to_le64(rl->rl_range[1]);
104 rl->rl_range[2] = cpu_to_le64(rl->rl_range[2]);
105 rl->rl_range[3] = cpu_to_le64(rl->rl_range[3]);
106}
107
108static void rcom_lock_in(struct rcom_lock *rl)
109{
110 rl->rl_ownpid = le32_to_cpu(rl->rl_ownpid);
111 rl->rl_lkid = le32_to_cpu(rl->rl_lkid);
112 rl->rl_remid = le32_to_cpu(rl->rl_remid);
113 rl->rl_parent_lkid = le32_to_cpu(rl->rl_parent_lkid);
114 rl->rl_parent_remid = le32_to_cpu(rl->rl_parent_remid);
115 rl->rl_exflags = le32_to_cpu(rl->rl_exflags);
116 rl->rl_flags = le32_to_cpu(rl->rl_flags);
117 rl->rl_lvbseq = le32_to_cpu(rl->rl_lvbseq);
118 rl->rl_result = le32_to_cpu(rl->rl_result);
119 rl->rl_wait_type = le16_to_cpu(rl->rl_wait_type);
120 rl->rl_namelen = le16_to_cpu(rl->rl_namelen);
121 rl->rl_range[0] = le64_to_cpu(rl->rl_range[0]);
122 rl->rl_range[1] = le64_to_cpu(rl->rl_range[1]);
123 rl->rl_range[2] = le64_to_cpu(rl->rl_range[2]);
124 rl->rl_range[3] = le64_to_cpu(rl->rl_range[3]);
125}
126
127static void rcom_config_out(struct rcom_config *rf)
128{
129 rf->rf_lvblen = cpu_to_le32(rf->rf_lvblen);
130 rf->rf_lsflags = cpu_to_le32(rf->rf_lsflags);
131}
132
133static void rcom_config_in(struct rcom_config *rf)
134{
135 rf->rf_lvblen = le32_to_cpu(rf->rf_lvblen);
136 rf->rf_lsflags = le32_to_cpu(rf->rf_lsflags);
137}
138
139void dlm_rcom_out(struct dlm_rcom *rc)
140{
141 struct dlm_header *hd = (struct dlm_header *) rc;
142 int type = rc->rc_type;
143
144 header_out(hd);
145
146 rc->rc_type = cpu_to_le32(rc->rc_type);
147 rc->rc_result = cpu_to_le32(rc->rc_result);
148 rc->rc_id = cpu_to_le64(rc->rc_id);
149
150 if (type == DLM_RCOM_LOCK)
151 rcom_lock_out((struct rcom_lock *) rc->rc_buf);
152
153 else if (type == DLM_RCOM_STATUS_REPLY)
154 rcom_config_out((struct rcom_config *) rc->rc_buf);
155}
156
157void dlm_rcom_in(struct dlm_rcom *rc)
158{
159 struct dlm_header *hd = (struct dlm_header *) rc;
160
161 header_in(hd);
162
163 rc->rc_type = le32_to_cpu(rc->rc_type);
164 rc->rc_result = le32_to_cpu(rc->rc_result);
165 rc->rc_id = le64_to_cpu(rc->rc_id);
166
167 if (rc->rc_type == DLM_RCOM_LOCK)
168 rcom_lock_in((struct rcom_lock *) rc->rc_buf);
169
170 else if (rc->rc_type == DLM_RCOM_STATUS_REPLY)
171 rcom_config_in((struct rcom_config *) rc->rc_buf);
172}
173
diff --git a/fs/dlm/util.h b/fs/dlm/util.h
new file mode 100644
index 000000000000..2b25915161c0
--- /dev/null
+++ b/fs/dlm/util.h
@@ -0,0 +1,22 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13#ifndef __UTIL_DOT_H__
14#define __UTIL_DOT_H__
15
16void dlm_message_out(struct dlm_message *ms);
17void dlm_message_in(struct dlm_message *ms);
18void dlm_rcom_out(struct dlm_rcom *rc);
19void dlm_rcom_in(struct dlm_rcom *rc);
20
21#endif
22
diff --git a/include/linux/dlm.h b/include/linux/dlm.h
new file mode 100644
index 000000000000..dd324ba44d80
--- /dev/null
+++ b/include/linux/dlm.h
@@ -0,0 +1,312 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#ifndef __DLM_DOT_H__
15#define __DLM_DOT_H__
16
17/*
18 * Interface to Distributed Lock Manager (DLM)
19 * routines and structures to use DLM lockspaces
20 */
21
22/*
23 * Lock Modes
24 */
25
26#define DLM_LOCK_IV -1 /* invalid */
27#define DLM_LOCK_NL 0 /* null */
28#define DLM_LOCK_CR 1 /* concurrent read */
29#define DLM_LOCK_CW 2 /* concurrent write */
30#define DLM_LOCK_PR 3 /* protected read */
31#define DLM_LOCK_PW 4 /* protected write */
32#define DLM_LOCK_EX 5 /* exclusive */
33
34/*
35 * Maximum size in bytes of a dlm_lock name
36 */
37
38#define DLM_RESNAME_MAXLEN 64
39
40/*
41 * Flags to dlm_lock
42 *
43 * DLM_LKF_NOQUEUE
44 *
45 * Do not queue the lock request on the wait queue if it cannot be granted
46 * immediately. If the lock cannot be granted because of this flag, DLM will
47 * either return -EAGAIN from the dlm_lock call or will return 0 from
48 * dlm_lock and -EAGAIN in the lock status block when the AST is executed.
49 *
50 * DLM_LKF_CANCEL
51 *
52 * Used to cancel a pending lock request or conversion. A converting lock is
53 * returned to its previously granted mode.
54 *
55 * DLM_LKF_CONVERT
56 *
57 * Indicates a lock conversion request. For conversions the name and namelen
58 * are ignored and the lock ID in the LKSB is used to identify the lock.
59 *
60 * DLM_LKF_VALBLK
61 *
62 * Requests DLM to return the current contents of the lock value block in the
63 * lock status block. When this flag is set in a lock conversion from PW or EX
64 * modes, DLM assigns the value specified in the lock status block to the lock
65 * value block of the lock resource. The LVB is a DLM_LVB_LEN size array
66 * containing application-specific information.
67 *
68 * DLM_LKF_QUECVT
69 *
70 * Force a conversion request to be queued, even if it is compatible with
71 * the granted modes of other locks on the same resource.
72 *
73 * DLM_LKF_IVVALBLK
74 *
75 * Invalidate the lock value block.
76 *
77 * DLM_LKF_CONVDEADLK
78 *
79 * Allows the dlm to resolve conversion deadlocks internally by demoting the
80 * granted mode of a converting lock to NL. The DLM_SBF_DEMOTED flag is
81 * returned for a conversion that's been effected by this.
82 *
83 * DLM_LKF_PERSISTENT
84 *
85 * Only relevant to locks originating in userspace. A persistent lock will not
86 * be removed if the process holding the lock exits.
87 *
88 * DLM_LKF_NODLKWT
89 * DLM_LKF_NODLCKBLK
90 *
91 * net yet implemented
92 *
93 * DLM_LKF_EXPEDITE
94 *
95 * Used only with new requests for NL mode locks. Tells the lock manager
96 * to grant the lock, ignoring other locks in convert and wait queues.
97 *
98 * DLM_LKF_NOQUEUEBAST
99 *
100 * Send blocking AST's before returning -EAGAIN to the caller. It is only
101 * used along with the NOQUEUE flag. Blocking AST's are not sent for failed
102 * NOQUEUE requests otherwise.
103 *
104 * DLM_LKF_HEADQUE
105 *
106 * Add a lock to the head of the convert or wait queue rather than the tail.
107 *
108 * DLM_LKF_NOORDER
109 *
110 * Disregard the standard grant order rules and grant a lock as soon as it
111 * is compatible with other granted locks.
112 *
113 * DLM_LKF_ORPHAN
114 *
115 * not yet implemented
116 *
117 * DLM_LKF_ALTPR
118 *
119 * If the requested mode cannot be granted immediately, try to grant the lock
120 * in PR mode instead. If this alternate mode is granted instead of the
121 * requested mode, DLM_SBF_ALTMODE is returned in the lksb.
122 *
123 * DLM_LKF_ALTCW
124 *
125 * The same as ALTPR, but the alternate mode is CW.
126 *
127 * DLM_LKF_FORCEUNLOCK
128 *
129 * Unlock the lock even if it is converting or waiting or has sublocks.
130 * Only really for use by the userland device.c code.
131 *
132 */
133
134#define DLM_LKF_NOQUEUE 0x00000001
135#define DLM_LKF_CANCEL 0x00000002
136#define DLM_LKF_CONVERT 0x00000004
137#define DLM_LKF_VALBLK 0x00000008
138#define DLM_LKF_QUECVT 0x00000010
139#define DLM_LKF_IVVALBLK 0x00000020
140#define DLM_LKF_CONVDEADLK 0x00000040
141#define DLM_LKF_PERSISTENT 0x00000080
142#define DLM_LKF_NODLCKWT 0x00000100
143#define DLM_LKF_NODLCKBLK 0x00000200
144#define DLM_LKF_EXPEDITE 0x00000400
145#define DLM_LKF_NOQUEUEBAST 0x00000800
146#define DLM_LKF_HEADQUE 0x00001000
147#define DLM_LKF_NOORDER 0x00002000
148#define DLM_LKF_ORPHAN 0x00004000
149#define DLM_LKF_ALTPR 0x00008000
150#define DLM_LKF_ALTCW 0x00010000
151#define DLM_LKF_FORCEUNLOCK 0x00020000
152
153/*
154 * Some return codes that are not in errno.h
155 */
156
157#define DLM_ECANCEL 0x10001
158#define DLM_EUNLOCK 0x10002
159
160typedef void dlm_lockspace_t;
161
162/*
163 * Lock range structure
164 */
165
166struct dlm_range {
167 uint64_t ra_start;
168 uint64_t ra_end;
169};
170
171/*
172 * Lock status block
173 *
174 * Use this structure to specify the contents of the lock value block. For a
175 * conversion request, this structure is used to specify the lock ID of the
176 * lock. DLM writes the status of the lock request and the lock ID assigned
177 * to the request in the lock status block.
178 *
179 * sb_lkid: the returned lock ID. It is set on new (non-conversion) requests.
180 * It is available when dlm_lock returns.
181 *
182 * sb_lvbptr: saves or returns the contents of the lock's LVB according to rules
183 * shown for the DLM_LKF_VALBLK flag.
184 *
185 * sb_flags: DLM_SBF_DEMOTED is returned if in the process of promoting a lock,
186 * it was first demoted to NL to avoid conversion deadlock.
187 * DLM_SBF_VALNOTVALID is returned if the resource's LVB is marked invalid.
188 *
189 * sb_status: the returned status of the lock request set prior to AST
190 * execution. Possible return values:
191 *
192 * 0 if lock request was successful
193 * -EAGAIN if request would block and is flagged DLM_LKF_NOQUEUE
194 * -ENOMEM if there is no memory to process request
195 * -EINVAL if there are invalid parameters
196 * -DLM_EUNLOCK if unlock request was successful
197 * -DLM_ECANCEL if a cancel completed successfully
198 */
199
200#define DLM_SBF_DEMOTED 0x01
201#define DLM_SBF_VALNOTVALID 0x02
202#define DLM_SBF_ALTMODE 0x04
203
204struct dlm_lksb {
205 int sb_status;
206 uint32_t sb_lkid;
207 char sb_flags;
208 char * sb_lvbptr;
209};
210
211
212#ifdef __KERNEL__
213
214#define DLM_LSFL_NODIR 0x00000001
215
216/*
217 * dlm_new_lockspace
218 *
219 * Starts a lockspace with the given name. If the named lockspace exists in
220 * the cluster, the calling node joins it.
221 */
222
223int dlm_new_lockspace(char *name, int namelen, dlm_lockspace_t **lockspace,
224 uint32_t flags, int lvblen);
225
226/*
227 * dlm_release_lockspace
228 *
229 * Stop a lockspace.
230 */
231
232int dlm_release_lockspace(dlm_lockspace_t *lockspace, int force);
233
234/*
235 * dlm_lock
236 *
237 * Make an asyncronous request to acquire or convert a lock on a named
238 * resource.
239 *
240 * lockspace: context for the request
241 * mode: the requested mode of the lock (DLM_LOCK_)
242 * lksb: lock status block for input and async return values
243 * flags: input flags (DLM_LKF_)
244 * name: name of the resource to lock, can be binary
245 * namelen: the length in bytes of the resource name (MAX_RESNAME_LEN)
246 * parent: the lock ID of a parent lock or 0 if none
247 * lockast: function DLM executes when it completes processing the request
248 * astarg: argument passed to lockast and bast functions
249 * bast: function DLM executes when this lock later blocks another request
250 *
251 * Returns:
252 * 0 if request is successfully queued for processing
253 * -EINVAL if any input parameters are invalid
254 * -EAGAIN if request would block and is flagged DLM_LKF_NOQUEUE
255 * -ENOMEM if there is no memory to process request
256 * -ENOTCONN if there is a communication error
257 *
258 * If the call to dlm_lock returns an error then the operation has failed and
259 * the AST routine will not be called. If dlm_lock returns 0 it is still
260 * possible that the lock operation will fail. The AST routine will be called
261 * when the locking is complete and the status is returned in the lksb.
262 *
263 * If the AST routines or parameter are passed to a conversion operation then
264 * they will overwrite those values that were passed to a previous dlm_lock
265 * call.
266 *
267 * AST routines should not block (at least not for long), but may make
268 * any locking calls they please.
269 */
270
271int dlm_lock(dlm_lockspace_t *lockspace,
272 int mode,
273 struct dlm_lksb *lksb,
274 uint32_t flags,
275 void *name,
276 unsigned int namelen,
277 uint32_t parent_lkid,
278 void (*lockast) (void *astarg),
279 void *astarg,
280 void (*bast) (void *astarg, int mode),
281 struct dlm_range *range);
282
283/*
284 * dlm_unlock
285 *
286 * Asynchronously release a lock on a resource. The AST routine is called
287 * when the resource is successfully unlocked.
288 *
289 * lockspace: context for the request
290 * lkid: the lock ID as returned in the lksb
291 * flags: input flags (DLM_LKF_)
292 * lksb: if NULL the lksb parameter passed to last lock request is used
293 * astarg: the arg used with the completion ast for the unlock
294 *
295 * Returns:
296 * 0 if request is successfully queued for processing
297 * -EINVAL if any input parameters are invalid
298 * -ENOTEMPTY if the lock still has sublocks
299 * -EBUSY if the lock is waiting for a remote lock operation
300 * -ENOTCONN if there is a communication error
301 */
302
303int dlm_unlock(dlm_lockspace_t *lockspace,
304 uint32_t lkid,
305 uint32_t flags,
306 struct dlm_lksb *lksb,
307 void *astarg);
308
309#endif /* __KERNEL__ */
310
311#endif /* __DLM_DOT_H__ */
312
diff --git a/include/linux/dlm_device.h b/include/linux/dlm_device.h
new file mode 100644
index 000000000000..5e17d295544b
--- /dev/null
+++ b/include/linux/dlm_device.h
@@ -0,0 +1,84 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14/* This is the device interface for dlm, most users will use a library
15 * interface.
16 */
17
18#define DLM_USER_LVB_LEN 32
19
20/* Version of the device interface */
21#define DLM_DEVICE_VERSION_MAJOR 3
22#define DLM_DEVICE_VERSION_MINOR 0
23#define DLM_DEVICE_VERSION_PATCH 0
24
25/* struct passed to the lock write */
26struct dlm_lock_params {
27 __u8 mode;
28 __u16 flags;
29 __u32 lkid;
30 __u32 parent;
31 struct dlm_range range;
32 __u8 namelen;
33 void __user *castparam;
34 void __user *castaddr;
35 void __user *bastparam;
36 void __user *bastaddr;
37 struct dlm_lksb __user *lksb;
38 char lvb[DLM_USER_LVB_LEN];
39 char name[1];
40};
41
42struct dlm_lspace_params {
43 __u32 flags;
44 __u32 minor;
45 char name[1];
46};
47
48struct dlm_write_request {
49 __u32 version[3];
50 __u8 cmd;
51
52 union {
53 struct dlm_lock_params lock;
54 struct dlm_lspace_params lspace;
55 } i;
56};
57
58/* struct read from the "device" fd,
59 consists mainly of userspace pointers for the library to use */
60struct dlm_lock_result {
61 __u32 length;
62 void __user * user_astaddr;
63 void __user * user_astparam;
64 struct dlm_lksb __user * user_lksb;
65 struct dlm_lksb lksb;
66 __u8 bast_mode;
67 /* Offsets may be zero if no data is present */
68 __u32 lvb_offset;
69};
70
71/* Commands passed to the device */
72#define DLM_USER_LOCK 1
73#define DLM_USER_UNLOCK 2
74#define DLM_USER_QUERY 3
75#define DLM_USER_CREATE_LOCKSPACE 4
76#define DLM_USER_REMOVE_LOCKSPACE 5
77
78/* Arbitrary length restriction */
79#define MAX_LS_NAME_LEN 64
80
81/* Lockspace flags */
82#define DLM_USER_LSFLG_AUTOFREE 1
83#define DLM_USER_LSFLG_FORCEFREE 2
84