aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/Kconfig2
-rw-r--r--fs/Makefile2
-rw-r--r--fs/configfs/item.c2
-rw-r--r--fs/dlm/Kconfig30
-rw-r--r--fs/dlm/Makefile21
-rw-r--r--fs/dlm/ast.c167
-rw-r--r--fs/dlm/ast.h26
-rw-r--r--fs/dlm/config.c787
-rw-r--r--fs/dlm/config.h42
-rw-r--r--fs/dlm/debug_fs.c310
-rw-r--r--fs/dlm/device.c1095
-rw-r--r--fs/dlm/dir.c423
-rw-r--r--fs/dlm/dir.h30
-rw-r--r--fs/dlm/dlm_internal.h505
-rw-r--r--fs/dlm/lock.c3610
-rw-r--r--fs/dlm/lock.h50
-rw-r--r--fs/dlm/lockspace.c665
-rw-r--r--fs/dlm/lockspace.h24
-rw-r--r--fs/dlm/lowcomms.c1218
-rw-r--r--fs/dlm/lowcomms.h25
-rw-r--r--fs/dlm/lvb_table.h18
-rw-r--r--fs/dlm/main.c89
-rw-r--r--fs/dlm/member.c313
-rw-r--r--fs/dlm/member.h24
-rw-r--r--fs/dlm/memory.c122
-rw-r--r--fs/dlm/memory.h31
-rw-r--r--fs/dlm/midcomms.c140
-rw-r--r--fs/dlm/midcomms.h21
-rw-r--r--fs/dlm/rcom.c460
-rw-r--r--fs/dlm/rcom.h24
-rw-r--r--fs/dlm/recover.c762
-rw-r--r--fs/dlm/recover.h34
-rw-r--r--fs/dlm/recoverd.c285
-rw-r--r--fs/dlm/recoverd.h24
-rw-r--r--fs/dlm/requestqueue.c184
-rw-r--r--fs/dlm/requestqueue.h22
-rw-r--r--fs/dlm/util.c173
-rw-r--r--fs/dlm/util.h22
-rw-r--r--fs/gfs2/Kconfig46
-rw-r--r--fs/gfs2/Makefile42
-rw-r--r--fs/gfs2/acl.c312
-rw-r--r--fs/gfs2/acl.h37
-rw-r--r--fs/gfs2/bits.c178
-rw-r--r--fs/gfs2/bits.h28
-rw-r--r--fs/gfs2/bmap.c1089
-rw-r--r--fs/gfs2/bmap.h35
-rw-r--r--fs/gfs2/daemon.c225
-rw-r--r--fs/gfs2/daemon.h20
-rw-r--r--fs/gfs2/dir.c2356
-rw-r--r--fs/gfs2/dir.h51
-rw-r--r--fs/gfs2/eaops.c185
-rw-r--r--fs/gfs2/eaops.h30
-rw-r--r--fs/gfs2/eattr.c1563
-rw-r--r--fs/gfs2/eattr.h88
-rw-r--r--fs/gfs2/format.h21
-rw-r--r--fs/gfs2/gfs2.h60
-rw-r--r--fs/gfs2/glock.c2513
-rw-r--r--fs/gfs2/glock.h143
-rw-r--r--fs/gfs2/glops.c487
-rw-r--r--fs/gfs2/glops.h23
-rw-r--r--fs/gfs2/incore.h702
-rw-r--r--fs/gfs2/inode.c1835
-rw-r--r--fs/gfs2/inode.h81
-rw-r--r--fs/gfs2/lm.c235
-rw-r--r--fs/gfs2/lm.h42
-rw-r--r--fs/gfs2/lm_interface.h295
-rw-r--r--fs/gfs2/locking.c192
-rw-r--r--fs/gfs2/locking/dlm/Makefile3
-rw-r--r--fs/gfs2/locking/dlm/lock.c537
-rw-r--r--fs/gfs2/locking/dlm/lock_dlm.h191
-rw-r--r--fs/gfs2/locking/dlm/main.c62
-rw-r--r--fs/gfs2/locking/dlm/mount.c247
-rw-r--r--fs/gfs2/locking/dlm/plock.c297
-rw-r--r--fs/gfs2/locking/dlm/sysfs.c218
-rw-r--r--fs/gfs2/locking/dlm/thread.c352
-rw-r--r--fs/gfs2/locking/nolock/Makefile3
-rw-r--r--fs/gfs2/locking/nolock/main.c268
-rw-r--r--fs/gfs2/log.c643
-rw-r--r--fs/gfs2/log.h65
-rw-r--r--fs/gfs2/lops.c768
-rw-r--r--fs/gfs2/lops.h96
-rw-r--r--fs/gfs2/lvb.c48
-rw-r--r--fs/gfs2/lvb.h28
-rw-r--r--fs/gfs2/main.c103
-rw-r--r--fs/gfs2/meta_io.c883
-rw-r--r--fs/gfs2/meta_io.h88
-rw-r--r--fs/gfs2/mount.c211
-rw-r--r--fs/gfs2/mount.h15
-rw-r--r--fs/gfs2/ondisk.c527
-rw-r--r--fs/gfs2/ops_address.c642
-rw-r--r--fs/gfs2/ops_address.h17
-rw-r--r--fs/gfs2/ops_dentry.c117
-rw-r--r--fs/gfs2/ops_dentry.h15
-rw-r--r--fs/gfs2/ops_export.c303
-rw-r--r--fs/gfs2/ops_export.h15
-rw-r--r--fs/gfs2/ops_file.c968
-rw-r--r--fs/gfs2/ops_file.h20
-rw-r--r--fs/gfs2/ops_fstype.c882
-rw-r--r--fs/gfs2/ops_fstype.h15
-rw-r--r--fs/gfs2/ops_inode.c1234
-rw-r--r--fs/gfs2/ops_inode.h18
-rw-r--r--fs/gfs2/ops_super.c401
-rw-r--r--fs/gfs2/ops_super.h15
-rw-r--r--fs/gfs2/ops_vm.c196
-rw-r--r--fs/gfs2/ops_vm.h16
-rw-r--r--fs/gfs2/page.c279
-rw-r--r--fs/gfs2/page.h23
-rw-r--r--fs/gfs2/quota.c1293
-rw-r--r--fs/gfs2/quota.h34
-rw-r--r--fs/gfs2/recovery.c570
-rw-r--r--fs/gfs2/recovery.h32
-rw-r--r--fs/gfs2/rgrp.c1364
-rw-r--r--fs/gfs2/rgrp.h62
-rw-r--r--fs/gfs2/super.c945
-rw-r--r--fs/gfs2/super.h55
-rw-r--r--fs/gfs2/sys.c640
-rw-r--r--fs/gfs2/sys.h24
-rw-r--r--fs/gfs2/trans.c198
-rw-r--r--fs/gfs2/trans.h39
-rw-r--r--fs/gfs2/unlinked.c453
-rw-r--r--fs/gfs2/unlinked.h25
-rw-r--r--fs/gfs2/util.c246
-rw-r--r--fs/gfs2/util.h172
123 files changed, 41676 insertions, 1 deletions
diff --git a/fs/Kconfig b/fs/Kconfig
index e9749b0eecd8..14bcce800346 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -323,6 +323,7 @@ config FS_POSIX_ACL
323 default n 323 default n
324 324
325source "fs/xfs/Kconfig" 325source "fs/xfs/Kconfig"
326source "fs/gfs2/Kconfig"
326 327
327config OCFS2_FS 328config OCFS2_FS
328 tristate "OCFS2 file system support (EXPERIMENTAL)" 329 tristate "OCFS2 file system support (EXPERIMENTAL)"
@@ -1828,6 +1829,7 @@ source "fs/partitions/Kconfig"
1828endmenu 1829endmenu
1829 1830
1830source "fs/nls/Kconfig" 1831source "fs/nls/Kconfig"
1832source "fs/dlm/Kconfig"
1831 1833
1832endmenu 1834endmenu
1833 1835
diff --git a/fs/Makefile b/fs/Makefile
index 1db711319c80..b298f4fdc6f2 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -48,6 +48,7 @@ obj-$(CONFIG_SYSFS) += sysfs/
48obj-y += devpts/ 48obj-y += devpts/
49 49
50obj-$(CONFIG_PROFILING) += dcookies.o 50obj-$(CONFIG_PROFILING) += dcookies.o
51obj-$(CONFIG_DLM) += dlm/
51 52
52# Do not add any filesystems before this line 53# Do not add any filesystems before this line
53obj-$(CONFIG_REISERFS_FS) += reiserfs/ 54obj-$(CONFIG_REISERFS_FS) += reiserfs/
@@ -103,3 +104,4 @@ obj-$(CONFIG_HPPFS) += hppfs/
103obj-$(CONFIG_DEBUG_FS) += debugfs/ 104obj-$(CONFIG_DEBUG_FS) += debugfs/
104obj-$(CONFIG_CONFIGFS_FS) += configfs/ 105obj-$(CONFIG_CONFIGFS_FS) += configfs/
105obj-$(CONFIG_OCFS2_FS) += ocfs2/ 106obj-$(CONFIG_OCFS2_FS) += ocfs2/
107obj-$(CONFIG_GFS2_FS) += gfs2/
diff --git a/fs/configfs/item.c b/fs/configfs/item.c
index e07485ac50ad..24421209f854 100644
--- a/fs/configfs/item.c
+++ b/fs/configfs/item.c
@@ -224,4 +224,4 @@ EXPORT_SYMBOL(config_item_init);
224EXPORT_SYMBOL(config_group_init); 224EXPORT_SYMBOL(config_group_init);
225EXPORT_SYMBOL(config_item_get); 225EXPORT_SYMBOL(config_item_get);
226EXPORT_SYMBOL(config_item_put); 226EXPORT_SYMBOL(config_item_put);
227 227EXPORT_SYMBOL(config_group_find_obj);
diff --git a/fs/dlm/Kconfig b/fs/dlm/Kconfig
new file mode 100644
index 000000000000..d01f735e6e06
--- /dev/null
+++ b/fs/dlm/Kconfig
@@ -0,0 +1,30 @@
1menu "Distributed Lock Manager"
2 depends on INET && EXPERIMENTAL
3
4config DLM
5 tristate "Distributed Lock Manager (DLM)"
6 depends on SYSFS
7 depends on IPV6 || IPV6=n
8 select IP_SCTP
9 select CONFIGFS_FS
10 help
11 A general purpose distributed lock manager for kernel or userspace
12 applications.
13
14config DLM_DEVICE
15 tristate "DLM device for userspace access"
16 depends on DLM
17 help
18 This module creates a misc device through which the dlm lockspace
19 and locking functions become available to userspace applications
20 (usually through the libdlm library).
21
22config DLM_DEBUG
23 bool "DLM debugging"
24 depends on DLM
25 help
26 Under the debugfs mount point, the name of each lockspace will
27 appear as a file in the "dlm" directory. The output is the
28 list of resource and locks the local node knows about.
29
30endmenu
diff --git a/fs/dlm/Makefile b/fs/dlm/Makefile
new file mode 100644
index 000000000000..1e6232e7d8e5
--- /dev/null
+++ b/fs/dlm/Makefile
@@ -0,0 +1,21 @@
1obj-$(CONFIG_DLM) += dlm.o
2obj-$(CONFIG_DLM_DEVICE) += dlm_device.o
3
4dlm-y := ast.o \
5 config.o \
6 dir.o \
7 lock.o \
8 lockspace.o \
9 lowcomms.o \
10 main.o \
11 member.o \
12 memory.o \
13 midcomms.o \
14 rcom.o \
15 recover.o \
16 recoverd.o \
17 requestqueue.o \
18 util.o
19dlm-$(CONFIG_DLM_DEBUG) += debug_fs.o
20
21dlm_device-y := device.o
diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c
new file mode 100644
index 000000000000..57bdf09b520a
--- /dev/null
+++ b/fs/dlm/ast.c
@@ -0,0 +1,167 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#include "dlm_internal.h"
15#include "lock.h"
16#include "ast.h"
17
18#define WAKE_ASTS 0
19
20static struct list_head ast_queue;
21static spinlock_t ast_queue_lock;
22static struct task_struct * astd_task;
23static unsigned long astd_wakeflags;
24static struct mutex astd_running;
25
26
27void dlm_del_ast(struct dlm_lkb *lkb)
28{
29 spin_lock(&ast_queue_lock);
30 if (lkb->lkb_ast_type & (AST_COMP | AST_BAST))
31 list_del(&lkb->lkb_astqueue);
32 spin_unlock(&ast_queue_lock);
33}
34
35void dlm_add_ast(struct dlm_lkb *lkb, int type)
36{
37 spin_lock(&ast_queue_lock);
38 if (!(lkb->lkb_ast_type & (AST_COMP | AST_BAST))) {
39 kref_get(&lkb->lkb_ref);
40 list_add_tail(&lkb->lkb_astqueue, &ast_queue);
41 }
42 lkb->lkb_ast_type |= type;
43 spin_unlock(&ast_queue_lock);
44
45 set_bit(WAKE_ASTS, &astd_wakeflags);
46 wake_up_process(astd_task);
47}
48
49static void process_asts(void)
50{
51 struct dlm_ls *ls = NULL;
52 struct dlm_rsb *r = NULL;
53 struct dlm_lkb *lkb;
54 void (*cast) (long param);
55 void (*bast) (long param, int mode);
56 int type = 0, found, bmode;
57
58 for (;;) {
59 found = 0;
60 spin_lock(&ast_queue_lock);
61 list_for_each_entry(lkb, &ast_queue, lkb_astqueue) {
62 r = lkb->lkb_resource;
63 ls = r->res_ls;
64
65 if (dlm_locking_stopped(ls))
66 continue;
67
68 list_del(&lkb->lkb_astqueue);
69 type = lkb->lkb_ast_type;
70 lkb->lkb_ast_type = 0;
71 found = 1;
72 break;
73 }
74 spin_unlock(&ast_queue_lock);
75
76 if (!found)
77 break;
78
79 cast = lkb->lkb_astaddr;
80 bast = lkb->lkb_bastaddr;
81 bmode = lkb->lkb_bastmode;
82
83 if ((type & AST_COMP) && cast)
84 cast(lkb->lkb_astparam);
85
86 /* FIXME: Is it safe to look at lkb_grmode here
87 without doing a lock_rsb() ?
88 Look at other checks in v1 to avoid basts. */
89
90 if ((type & AST_BAST) && bast)
91 if (!dlm_modes_compat(lkb->lkb_grmode, bmode))
92 bast(lkb->lkb_astparam, bmode);
93
94 /* this removes the reference added by dlm_add_ast
95 and may result in the lkb being freed */
96 dlm_put_lkb(lkb);
97
98 schedule();
99 }
100}
101
102static inline int no_asts(void)
103{
104 int ret;
105
106 spin_lock(&ast_queue_lock);
107 ret = list_empty(&ast_queue);
108 spin_unlock(&ast_queue_lock);
109 return ret;
110}
111
112static int dlm_astd(void *data)
113{
114 while (!kthread_should_stop()) {
115 set_current_state(TASK_INTERRUPTIBLE);
116 if (!test_bit(WAKE_ASTS, &astd_wakeflags))
117 schedule();
118 set_current_state(TASK_RUNNING);
119
120 mutex_lock(&astd_running);
121 if (test_and_clear_bit(WAKE_ASTS, &astd_wakeflags))
122 process_asts();
123 mutex_unlock(&astd_running);
124 }
125 return 0;
126}
127
128void dlm_astd_wake(void)
129{
130 if (!no_asts()) {
131 set_bit(WAKE_ASTS, &astd_wakeflags);
132 wake_up_process(astd_task);
133 }
134}
135
136int dlm_astd_start(void)
137{
138 struct task_struct *p;
139 int error = 0;
140
141 INIT_LIST_HEAD(&ast_queue);
142 spin_lock_init(&ast_queue_lock);
143 mutex_init(&astd_running);
144
145 p = kthread_run(dlm_astd, NULL, "dlm_astd");
146 if (IS_ERR(p))
147 error = PTR_ERR(p);
148 else
149 astd_task = p;
150 return error;
151}
152
153void dlm_astd_stop(void)
154{
155 kthread_stop(astd_task);
156}
157
158void dlm_astd_suspend(void)
159{
160 mutex_lock(&astd_running);
161}
162
163void dlm_astd_resume(void)
164{
165 mutex_unlock(&astd_running);
166}
167
diff --git a/fs/dlm/ast.h b/fs/dlm/ast.h
new file mode 100644
index 000000000000..6ee276c74c52
--- /dev/null
+++ b/fs/dlm/ast.h
@@ -0,0 +1,26 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13#ifndef __ASTD_DOT_H__
14#define __ASTD_DOT_H__
15
16void dlm_add_ast(struct dlm_lkb *lkb, int type);
17void dlm_del_ast(struct dlm_lkb *lkb);
18
19void dlm_astd_wake(void);
20int dlm_astd_start(void);
21void dlm_astd_stop(void);
22void dlm_astd_suspend(void);
23void dlm_astd_resume(void);
24
25#endif
26
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
new file mode 100644
index 000000000000..87df9616415e
--- /dev/null
+++ b/fs/dlm/config.c
@@ -0,0 +1,787 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#include <linux/kernel.h>
15#include <linux/module.h>
16#include <linux/configfs.h>
17#include <net/sock.h>
18
19#include "config.h"
20
21/*
22 * /config/dlm/<cluster>/spaces/<space>/nodes/<node>/nodeid
23 * /config/dlm/<cluster>/spaces/<space>/nodes/<node>/weight
24 * /config/dlm/<cluster>/comms/<comm>/nodeid
25 * /config/dlm/<cluster>/comms/<comm>/local
26 * /config/dlm/<cluster>/comms/<comm>/addr
27 * The <cluster> level is useless, but I haven't figured out how to avoid it.
28 */
29
30static struct config_group *space_list;
31static struct config_group *comm_list;
32static struct comm *local_comm;
33
34struct clusters;
35struct cluster;
36struct spaces;
37struct space;
38struct comms;
39struct comm;
40struct nodes;
41struct node;
42
43static struct config_group *make_cluster(struct config_group *, const char *);
44static void drop_cluster(struct config_group *, struct config_item *);
45static void release_cluster(struct config_item *);
46static struct config_group *make_space(struct config_group *, const char *);
47static void drop_space(struct config_group *, struct config_item *);
48static void release_space(struct config_item *);
49static struct config_item *make_comm(struct config_group *, const char *);
50static void drop_comm(struct config_group *, struct config_item *);
51static void release_comm(struct config_item *);
52static struct config_item *make_node(struct config_group *, const char *);
53static void drop_node(struct config_group *, struct config_item *);
54static void release_node(struct config_item *);
55
56static ssize_t show_comm(struct config_item *i, struct configfs_attribute *a,
57 char *buf);
58static ssize_t store_comm(struct config_item *i, struct configfs_attribute *a,
59 const char *buf, size_t len);
60static ssize_t show_node(struct config_item *i, struct configfs_attribute *a,
61 char *buf);
62static ssize_t store_node(struct config_item *i, struct configfs_attribute *a,
63 const char *buf, size_t len);
64
65static ssize_t comm_nodeid_read(struct comm *cm, char *buf);
66static ssize_t comm_nodeid_write(struct comm *cm, const char *buf, size_t len);
67static ssize_t comm_local_read(struct comm *cm, char *buf);
68static ssize_t comm_local_write(struct comm *cm, const char *buf, size_t len);
69static ssize_t comm_addr_write(struct comm *cm, const char *buf, size_t len);
70static ssize_t node_nodeid_read(struct node *nd, char *buf);
71static ssize_t node_nodeid_write(struct node *nd, const char *buf, size_t len);
72static ssize_t node_weight_read(struct node *nd, char *buf);
73static ssize_t node_weight_write(struct node *nd, const char *buf, size_t len);
74
75enum {
76 COMM_ATTR_NODEID = 0,
77 COMM_ATTR_LOCAL,
78 COMM_ATTR_ADDR,
79};
80
81struct comm_attribute {
82 struct configfs_attribute attr;
83 ssize_t (*show)(struct comm *, char *);
84 ssize_t (*store)(struct comm *, const char *, size_t);
85};
86
87static struct comm_attribute comm_attr_nodeid = {
88 .attr = { .ca_owner = THIS_MODULE,
89 .ca_name = "nodeid",
90 .ca_mode = S_IRUGO | S_IWUSR },
91 .show = comm_nodeid_read,
92 .store = comm_nodeid_write,
93};
94
95static struct comm_attribute comm_attr_local = {
96 .attr = { .ca_owner = THIS_MODULE,
97 .ca_name = "local",
98 .ca_mode = S_IRUGO | S_IWUSR },
99 .show = comm_local_read,
100 .store = comm_local_write,
101};
102
103static struct comm_attribute comm_attr_addr = {
104 .attr = { .ca_owner = THIS_MODULE,
105 .ca_name = "addr",
106 .ca_mode = S_IRUGO | S_IWUSR },
107 .store = comm_addr_write,
108};
109
110static struct configfs_attribute *comm_attrs[] = {
111 [COMM_ATTR_NODEID] = &comm_attr_nodeid.attr,
112 [COMM_ATTR_LOCAL] = &comm_attr_local.attr,
113 [COMM_ATTR_ADDR] = &comm_attr_addr.attr,
114 NULL,
115};
116
117enum {
118 NODE_ATTR_NODEID = 0,
119 NODE_ATTR_WEIGHT,
120};
121
122struct node_attribute {
123 struct configfs_attribute attr;
124 ssize_t (*show)(struct node *, char *);
125 ssize_t (*store)(struct node *, const char *, size_t);
126};
127
128static struct node_attribute node_attr_nodeid = {
129 .attr = { .ca_owner = THIS_MODULE,
130 .ca_name = "nodeid",
131 .ca_mode = S_IRUGO | S_IWUSR },
132 .show = node_nodeid_read,
133 .store = node_nodeid_write,
134};
135
136static struct node_attribute node_attr_weight = {
137 .attr = { .ca_owner = THIS_MODULE,
138 .ca_name = "weight",
139 .ca_mode = S_IRUGO | S_IWUSR },
140 .show = node_weight_read,
141 .store = node_weight_write,
142};
143
144static struct configfs_attribute *node_attrs[] = {
145 [NODE_ATTR_NODEID] = &node_attr_nodeid.attr,
146 [NODE_ATTR_WEIGHT] = &node_attr_weight.attr,
147 NULL,
148};
149
150struct clusters {
151 struct configfs_subsystem subsys;
152};
153
154struct cluster {
155 struct config_group group;
156};
157
158struct spaces {
159 struct config_group ss_group;
160};
161
162struct space {
163 struct config_group group;
164 struct list_head members;
165 struct mutex members_lock;
166 int members_count;
167};
168
169struct comms {
170 struct config_group cs_group;
171};
172
173struct comm {
174 struct config_item item;
175 int nodeid;
176 int local;
177 int addr_count;
178 struct sockaddr_storage *addr[DLM_MAX_ADDR_COUNT];
179};
180
181struct nodes {
182 struct config_group ns_group;
183};
184
185struct node {
186 struct config_item item;
187 struct list_head list; /* space->members */
188 int nodeid;
189 int weight;
190};
191
192static struct configfs_group_operations clusters_ops = {
193 .make_group = make_cluster,
194 .drop_item = drop_cluster,
195};
196
197static struct configfs_item_operations cluster_ops = {
198 .release = release_cluster,
199};
200
201static struct configfs_group_operations spaces_ops = {
202 .make_group = make_space,
203 .drop_item = drop_space,
204};
205
206static struct configfs_item_operations space_ops = {
207 .release = release_space,
208};
209
210static struct configfs_group_operations comms_ops = {
211 .make_item = make_comm,
212 .drop_item = drop_comm,
213};
214
215static struct configfs_item_operations comm_ops = {
216 .release = release_comm,
217 .show_attribute = show_comm,
218 .store_attribute = store_comm,
219};
220
221static struct configfs_group_operations nodes_ops = {
222 .make_item = make_node,
223 .drop_item = drop_node,
224};
225
226static struct configfs_item_operations node_ops = {
227 .release = release_node,
228 .show_attribute = show_node,
229 .store_attribute = store_node,
230};
231
232static struct config_item_type clusters_type = {
233 .ct_group_ops = &clusters_ops,
234 .ct_owner = THIS_MODULE,
235};
236
237static struct config_item_type cluster_type = {
238 .ct_item_ops = &cluster_ops,
239 .ct_owner = THIS_MODULE,
240};
241
242static struct config_item_type spaces_type = {
243 .ct_group_ops = &spaces_ops,
244 .ct_owner = THIS_MODULE,
245};
246
247static struct config_item_type space_type = {
248 .ct_item_ops = &space_ops,
249 .ct_owner = THIS_MODULE,
250};
251
252static struct config_item_type comms_type = {
253 .ct_group_ops = &comms_ops,
254 .ct_owner = THIS_MODULE,
255};
256
257static struct config_item_type comm_type = {
258 .ct_item_ops = &comm_ops,
259 .ct_attrs = comm_attrs,
260 .ct_owner = THIS_MODULE,
261};
262
263static struct config_item_type nodes_type = {
264 .ct_group_ops = &nodes_ops,
265 .ct_owner = THIS_MODULE,
266};
267
268static struct config_item_type node_type = {
269 .ct_item_ops = &node_ops,
270 .ct_attrs = node_attrs,
271 .ct_owner = THIS_MODULE,
272};
273
274static struct cluster *to_cluster(struct config_item *i)
275{
276 return i ? container_of(to_config_group(i), struct cluster, group):NULL;
277}
278
279static struct space *to_space(struct config_item *i)
280{
281 return i ? container_of(to_config_group(i), struct space, group) : NULL;
282}
283
284static struct comm *to_comm(struct config_item *i)
285{
286 return i ? container_of(i, struct comm, item) : NULL;
287}
288
289static struct node *to_node(struct config_item *i)
290{
291 return i ? container_of(i, struct node, item) : NULL;
292}
293
294static struct config_group *make_cluster(struct config_group *g,
295 const char *name)
296{
297 struct cluster *cl = NULL;
298 struct spaces *sps = NULL;
299 struct comms *cms = NULL;
300 void *gps = NULL;
301
302 cl = kzalloc(sizeof(struct cluster), GFP_KERNEL);
303 gps = kcalloc(3, sizeof(struct config_group *), GFP_KERNEL);
304 sps = kzalloc(sizeof(struct spaces), GFP_KERNEL);
305 cms = kzalloc(sizeof(struct comms), GFP_KERNEL);
306
307 if (!cl || !gps || !sps || !cms)
308 goto fail;
309
310 config_group_init_type_name(&cl->group, name, &cluster_type);
311 config_group_init_type_name(&sps->ss_group, "spaces", &spaces_type);
312 config_group_init_type_name(&cms->cs_group, "comms", &comms_type);
313
314 cl->group.default_groups = gps;
315 cl->group.default_groups[0] = &sps->ss_group;
316 cl->group.default_groups[1] = &cms->cs_group;
317 cl->group.default_groups[2] = NULL;
318
319 space_list = &sps->ss_group;
320 comm_list = &cms->cs_group;
321 return &cl->group;
322
323 fail:
324 kfree(cl);
325 kfree(gps);
326 kfree(sps);
327 kfree(cms);
328 return NULL;
329}
330
331static void drop_cluster(struct config_group *g, struct config_item *i)
332{
333 struct cluster *cl = to_cluster(i);
334 struct config_item *tmp;
335 int j;
336
337 for (j = 0; cl->group.default_groups[j]; j++) {
338 tmp = &cl->group.default_groups[j]->cg_item;
339 cl->group.default_groups[j] = NULL;
340 config_item_put(tmp);
341 }
342
343 space_list = NULL;
344 comm_list = NULL;
345
346 config_item_put(i);
347}
348
349static void release_cluster(struct config_item *i)
350{
351 struct cluster *cl = to_cluster(i);
352 kfree(cl->group.default_groups);
353 kfree(cl);
354}
355
356static struct config_group *make_space(struct config_group *g, const char *name)
357{
358 struct space *sp = NULL;
359 struct nodes *nds = NULL;
360 void *gps = NULL;
361
362 sp = kzalloc(sizeof(struct space), GFP_KERNEL);
363 gps = kcalloc(2, sizeof(struct config_group *), GFP_KERNEL);
364 nds = kzalloc(sizeof(struct nodes), GFP_KERNEL);
365
366 if (!sp || !gps || !nds)
367 goto fail;
368
369 config_group_init_type_name(&sp->group, name, &space_type);
370 config_group_init_type_name(&nds->ns_group, "nodes", &nodes_type);
371
372 sp->group.default_groups = gps;
373 sp->group.default_groups[0] = &nds->ns_group;
374 sp->group.default_groups[1] = NULL;
375
376 INIT_LIST_HEAD(&sp->members);
377 mutex_init(&sp->members_lock);
378 sp->members_count = 0;
379 return &sp->group;
380
381 fail:
382 kfree(sp);
383 kfree(gps);
384 kfree(nds);
385 return NULL;
386}
387
388static void drop_space(struct config_group *g, struct config_item *i)
389{
390 struct space *sp = to_space(i);
391 struct config_item *tmp;
392 int j;
393
394 /* assert list_empty(&sp->members) */
395
396 for (j = 0; sp->group.default_groups[j]; j++) {
397 tmp = &sp->group.default_groups[j]->cg_item;
398 sp->group.default_groups[j] = NULL;
399 config_item_put(tmp);
400 }
401
402 config_item_put(i);
403}
404
405static void release_space(struct config_item *i)
406{
407 struct space *sp = to_space(i);
408 kfree(sp->group.default_groups);
409 kfree(sp);
410}
411
412static struct config_item *make_comm(struct config_group *g, const char *name)
413{
414 struct comm *cm;
415
416 cm = kzalloc(sizeof(struct comm), GFP_KERNEL);
417 if (!cm)
418 return NULL;
419
420 config_item_init_type_name(&cm->item, name, &comm_type);
421 cm->nodeid = -1;
422 cm->local = 0;
423 cm->addr_count = 0;
424 return &cm->item;
425}
426
427static void drop_comm(struct config_group *g, struct config_item *i)
428{
429 struct comm *cm = to_comm(i);
430 if (local_comm == cm)
431 local_comm = NULL;
432 while (cm->addr_count--)
433 kfree(cm->addr[cm->addr_count]);
434 config_item_put(i);
435}
436
437static void release_comm(struct config_item *i)
438{
439 struct comm *cm = to_comm(i);
440 kfree(cm);
441}
442
443static struct config_item *make_node(struct config_group *g, const char *name)
444{
445 struct space *sp = to_space(g->cg_item.ci_parent);
446 struct node *nd;
447
448 nd = kzalloc(sizeof(struct node), GFP_KERNEL);
449 if (!nd)
450 return NULL;
451
452 config_item_init_type_name(&nd->item, name, &node_type);
453 nd->nodeid = -1;
454 nd->weight = 1; /* default weight of 1 if none is set */
455
456 mutex_lock(&sp->members_lock);
457 list_add(&nd->list, &sp->members);
458 sp->members_count++;
459 mutex_unlock(&sp->members_lock);
460
461 return &nd->item;
462}
463
464static void drop_node(struct config_group *g, struct config_item *i)
465{
466 struct space *sp = to_space(g->cg_item.ci_parent);
467 struct node *nd = to_node(i);
468
469 mutex_lock(&sp->members_lock);
470 list_del(&nd->list);
471 sp->members_count--;
472 mutex_unlock(&sp->members_lock);
473
474 config_item_put(i);
475}
476
477static void release_node(struct config_item *i)
478{
479 struct node *nd = to_node(i);
480 kfree(nd);
481}
482
483static struct clusters clusters_root = {
484 .subsys = {
485 .su_group = {
486 .cg_item = {
487 .ci_namebuf = "dlm",
488 .ci_type = &clusters_type,
489 },
490 },
491 },
492};
493
494int dlm_config_init(void)
495{
496 config_group_init(&clusters_root.subsys.su_group);
497 init_MUTEX(&clusters_root.subsys.su_sem);
498 return configfs_register_subsystem(&clusters_root.subsys);
499}
500
501void dlm_config_exit(void)
502{
503 configfs_unregister_subsystem(&clusters_root.subsys);
504}
505
506/*
507 * Functions for user space to read/write attributes
508 */
509
510static ssize_t show_comm(struct config_item *i, struct configfs_attribute *a,
511 char *buf)
512{
513 struct comm *cm = to_comm(i);
514 struct comm_attribute *cma =
515 container_of(a, struct comm_attribute, attr);
516 return cma->show ? cma->show(cm, buf) : 0;
517}
518
519static ssize_t store_comm(struct config_item *i, struct configfs_attribute *a,
520 const char *buf, size_t len)
521{
522 struct comm *cm = to_comm(i);
523 struct comm_attribute *cma =
524 container_of(a, struct comm_attribute, attr);
525 return cma->store ? cma->store(cm, buf, len) : -EINVAL;
526}
527
528static ssize_t comm_nodeid_read(struct comm *cm, char *buf)
529{
530 return sprintf(buf, "%d\n", cm->nodeid);
531}
532
533static ssize_t comm_nodeid_write(struct comm *cm, const char *buf, size_t len)
534{
535 cm->nodeid = simple_strtol(buf, NULL, 0);
536 return len;
537}
538
539static ssize_t comm_local_read(struct comm *cm, char *buf)
540{
541 return sprintf(buf, "%d\n", cm->local);
542}
543
544static ssize_t comm_local_write(struct comm *cm, const char *buf, size_t len)
545{
546 cm->local= simple_strtol(buf, NULL, 0);
547 if (cm->local && !local_comm)
548 local_comm = cm;
549 return len;
550}
551
552static ssize_t comm_addr_write(struct comm *cm, const char *buf, size_t len)
553{
554 struct sockaddr_storage *addr;
555
556 if (len != sizeof(struct sockaddr_storage))
557 return -EINVAL;
558
559 if (cm->addr_count >= DLM_MAX_ADDR_COUNT)
560 return -ENOSPC;
561
562 addr = kzalloc(sizeof(*addr), GFP_KERNEL);
563 if (!addr)
564 return -ENOMEM;
565
566 memcpy(addr, buf, len);
567 cm->addr[cm->addr_count++] = addr;
568 return len;
569}
570
571static ssize_t show_node(struct config_item *i, struct configfs_attribute *a,
572 char *buf)
573{
574 struct node *nd = to_node(i);
575 struct node_attribute *nda =
576 container_of(a, struct node_attribute, attr);
577 return nda->show ? nda->show(nd, buf) : 0;
578}
579
580static ssize_t store_node(struct config_item *i, struct configfs_attribute *a,
581 const char *buf, size_t len)
582{
583 struct node *nd = to_node(i);
584 struct node_attribute *nda =
585 container_of(a, struct node_attribute, attr);
586 return nda->store ? nda->store(nd, buf, len) : -EINVAL;
587}
588
589static ssize_t node_nodeid_read(struct node *nd, char *buf)
590{
591 return sprintf(buf, "%d\n", nd->nodeid);
592}
593
594static ssize_t node_nodeid_write(struct node *nd, const char *buf, size_t len)
595{
596 nd->nodeid = simple_strtol(buf, NULL, 0);
597 return len;
598}
599
600static ssize_t node_weight_read(struct node *nd, char *buf)
601{
602 return sprintf(buf, "%d\n", nd->weight);
603}
604
605static ssize_t node_weight_write(struct node *nd, const char *buf, size_t len)
606{
607 nd->weight = simple_strtol(buf, NULL, 0);
608 return len;
609}
610
611/*
612 * Functions for the dlm to get the info that's been configured
613 */
614
615static struct space *get_space(char *name)
616{
617 if (!space_list)
618 return NULL;
619 return to_space(config_group_find_obj(space_list, name));
620}
621
622static void put_space(struct space *sp)
623{
624 config_item_put(&sp->group.cg_item);
625}
626
627static struct comm *get_comm(int nodeid, struct sockaddr_storage *addr)
628{
629 struct config_item *i;
630 struct comm *cm = NULL;
631 int found = 0;
632
633 if (!comm_list)
634 return NULL;
635
636 down(&clusters_root.subsys.su_sem);
637
638 list_for_each_entry(i, &comm_list->cg_children, ci_entry) {
639 cm = to_comm(i);
640
641 if (nodeid) {
642 if (cm->nodeid != nodeid)
643 continue;
644 found = 1;
645 break;
646 } else {
647 if (!cm->addr_count ||
648 memcmp(cm->addr[0], addr, sizeof(*addr)))
649 continue;
650 found = 1;
651 break;
652 }
653 }
654 up(&clusters_root.subsys.su_sem);
655
656 if (found)
657 config_item_get(i);
658 else
659 cm = NULL;
660 return cm;
661}
662
663static void put_comm(struct comm *cm)
664{
665 config_item_put(&cm->item);
666}
667
668/* caller must free mem */
669int dlm_nodeid_list(char *lsname, int **ids_out)
670{
671 struct space *sp;
672 struct node *nd;
673 int i = 0, rv = 0;
674 int *ids;
675
676 sp = get_space(lsname);
677 if (!sp)
678 return -EEXIST;
679
680 mutex_lock(&sp->members_lock);
681 if (!sp->members_count) {
682 rv = 0;
683 goto out;
684 }
685
686 ids = kcalloc(sp->members_count, sizeof(int), GFP_KERNEL);
687 if (!ids) {
688 rv = -ENOMEM;
689 goto out;
690 }
691
692 rv = sp->members_count;
693 list_for_each_entry(nd, &sp->members, list)
694 ids[i++] = nd->nodeid;
695
696 if (rv != i)
697 printk("bad nodeid count %d %d\n", rv, i);
698
699 *ids_out = ids;
700 out:
701 mutex_unlock(&sp->members_lock);
702 put_space(sp);
703 return rv;
704}
705
706int dlm_node_weight(char *lsname, int nodeid)
707{
708 struct space *sp;
709 struct node *nd;
710 int w = -EEXIST;
711
712 sp = get_space(lsname);
713 if (!sp)
714 goto out;
715
716 mutex_lock(&sp->members_lock);
717 list_for_each_entry(nd, &sp->members, list) {
718 if (nd->nodeid != nodeid)
719 continue;
720 w = nd->weight;
721 break;
722 }
723 mutex_unlock(&sp->members_lock);
724 put_space(sp);
725 out:
726 return w;
727}
728
729int dlm_nodeid_to_addr(int nodeid, struct sockaddr_storage *addr)
730{
731 struct comm *cm = get_comm(nodeid, NULL);
732 if (!cm)
733 return -EEXIST;
734 if (!cm->addr_count)
735 return -ENOENT;
736 memcpy(addr, cm->addr[0], sizeof(*addr));
737 put_comm(cm);
738 return 0;
739}
740
741int dlm_addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid)
742{
743 struct comm *cm = get_comm(0, addr);
744 if (!cm)
745 return -EEXIST;
746 *nodeid = cm->nodeid;
747 put_comm(cm);
748 return 0;
749}
750
751int dlm_our_nodeid(void)
752{
753 return local_comm ? local_comm->nodeid : 0;
754}
755
756/* num 0 is first addr, num 1 is second addr */
757int dlm_our_addr(struct sockaddr_storage *addr, int num)
758{
759 if (!local_comm)
760 return -1;
761 if (num + 1 > local_comm->addr_count)
762 return -1;
763 memcpy(addr, local_comm->addr[num], sizeof(*addr));
764 return 0;
765}
766
767/* Config file defaults */
768#define DEFAULT_TCP_PORT 21064
769#define DEFAULT_BUFFER_SIZE 4096
770#define DEFAULT_RSBTBL_SIZE 256
771#define DEFAULT_LKBTBL_SIZE 1024
772#define DEFAULT_DIRTBL_SIZE 512
773#define DEFAULT_RECOVER_TIMER 5
774#define DEFAULT_TOSS_SECS 10
775#define DEFAULT_SCAN_SECS 5
776
777struct dlm_config_info dlm_config = {
778 .tcp_port = DEFAULT_TCP_PORT,
779 .buffer_size = DEFAULT_BUFFER_SIZE,
780 .rsbtbl_size = DEFAULT_RSBTBL_SIZE,
781 .lkbtbl_size = DEFAULT_LKBTBL_SIZE,
782 .dirtbl_size = DEFAULT_DIRTBL_SIZE,
783 .recover_timer = DEFAULT_RECOVER_TIMER,
784 .toss_secs = DEFAULT_TOSS_SECS,
785 .scan_secs = DEFAULT_SCAN_SECS
786};
787
diff --git a/fs/dlm/config.h b/fs/dlm/config.h
new file mode 100644
index 000000000000..9da7839958a9
--- /dev/null
+++ b/fs/dlm/config.h
@@ -0,0 +1,42 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#ifndef __CONFIG_DOT_H__
15#define __CONFIG_DOT_H__
16
/* Maximum number of addresses one comm (node) may have */
#define DLM_MAX_ADDR_COUNT 3

/* Runtime configuration; defaults and the single instance live in
   config.c (see DEFAULT_* there). */
struct dlm_config_info {
	int tcp_port;		/* TCP port used for inter-node comms */
	int buffer_size;
	int rsbtbl_size;	/* resource hash table size */
	int lkbtbl_size;	/* lock hash table size */
	int dirtbl_size;	/* directory hash table size */
	int recover_timer;	/* seconds */
	int toss_secs;		/* seconds */
	int scan_secs;		/* seconds */
};

extern struct dlm_config_info dlm_config;

int dlm_config_init(void);
void dlm_config_exit(void);
/* lookup helpers implemented in config.c */
int dlm_node_weight(char *lsname, int nodeid);
int dlm_nodeid_list(char *lsname, int **ids_out);
int dlm_nodeid_to_addr(int nodeid, struct sockaddr_storage *addr);
int dlm_addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid);
int dlm_our_nodeid(void);
int dlm_our_addr(struct sockaddr_storage *addr, int num);
40
41#endif /* __CONFIG_DOT_H__ */
42
diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c
new file mode 100644
index 000000000000..5080bbffd586
--- /dev/null
+++ b/fs/dlm/debug_fs.c
@@ -0,0 +1,310 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13#include <linux/pagemap.h>
14#include <linux/seq_file.h>
15#include <linux/module.h>
16#include <linux/ctype.h>
17#include <linux/debugfs.h>
18
19#include "dlm_internal.h"
20
21
22static struct dentry *dlm_root;
23
/* Iterator over every rsb in a lockspace's hash table, used by the
   seq_file ops below to dump lock state through debugfs. */
struct rsb_iter {
	int entry;		/* current hash bucket index */
	struct dlm_ls *ls;	/* lockspace being dumped */
	struct list_head *next;	/* next list node in current bucket */
	struct dlm_rsb *rsb;	/* rsb to show next */
};
30
31static char *print_lockmode(int mode)
32{
33 switch (mode) {
34 case DLM_LOCK_IV:
35 return "--";
36 case DLM_LOCK_NL:
37 return "NL";
38 case DLM_LOCK_CR:
39 return "CR";
40 case DLM_LOCK_CW:
41 return "CW";
42 case DLM_LOCK_PR:
43 return "PR";
44 case DLM_LOCK_PW:
45 return "PW";
46 case DLM_LOCK_EX:
47 return "EX";
48 default:
49 return "??";
50 }
51}
52
/*
 * Print one lock on a resource: lock id, granted mode, requested mode
 * (when converting/waiting), optional lock range, master/remote ids,
 * and any outstanding wait_type.
 */
static void print_lock(struct seq_file *s, struct dlm_lkb *lkb,
		       struct dlm_rsb *res)
{
	seq_printf(s, "%08x %s", lkb->lkb_id, print_lockmode(lkb->lkb_grmode));

	/* Converting/waiting locks also have a requested mode */
	if (lkb->lkb_status == DLM_LKSTS_CONVERT
	    || lkb->lkb_status == DLM_LKSTS_WAITING)
		seq_printf(s, " (%s)", print_lockmode(lkb->lkb_rqmode));

	if (lkb->lkb_range) {
		/* FIXME: this warns on Alpha */
		if (lkb->lkb_status == DLM_LKSTS_CONVERT
		    || lkb->lkb_status == DLM_LKSTS_GRANTED)
			seq_printf(s, " %llx-%llx",
				   lkb->lkb_range[GR_RANGE_START],
				   lkb->lkb_range[GR_RANGE_END]);
		if (lkb->lkb_status == DLM_LKSTS_CONVERT
		    || lkb->lkb_status == DLM_LKSTS_WAITING)
			seq_printf(s, " (%llx-%llx)",
				   lkb->lkb_range[RQ_RANGE_START],
				   lkb->lkb_range[RQ_RANGE_END]);
	}

	/* Nonzero lkb_nodeid: the lock has a counterpart elsewhere */
	if (lkb->lkb_nodeid) {
		if (lkb->lkb_nodeid != res->res_nodeid)
			seq_printf(s, " Remote: %3d %08x", lkb->lkb_nodeid,
				   lkb->lkb_remid);
		else
			seq_printf(s, " Master: %08x", lkb->lkb_remid);
	}

	if (lkb->lkb_wait_type)
		seq_printf(s, " wait_type: %d", lkb->lkb_wait_type);

	seq_printf(s, "\n");
}
89
/*
 * Dump one resource: name (non-printable bytes as '.'), master state,
 * LVB contents, then every lock on the grant/convert/wait queues.
 * Always returns 0.
 */
static int print_resource(struct dlm_rsb *res, struct seq_file *s)
{
	struct dlm_lkb *lkb;
	int i, lvblen = res->res_ls->ls_lvblen;

	seq_printf(s, "\nResource %p Name (len=%d) \"", res, res->res_length);
	for (i = 0; i < res->res_length; i++) {
		if (isprint(res->res_name[i]))
			seq_printf(s, "%c", res->res_name[i]);
		else
			seq_printf(s, "%c", '.');
	}
	/* res_nodeid: >0 remote master, 0 we are master, -1 lookup in
	   progress, anything else is invalid */
	if (res->res_nodeid > 0)
		seq_printf(s, "\" \nLocal Copy, Master is node %d\n",
			   res->res_nodeid);
	else if (res->res_nodeid == 0)
		seq_printf(s, "\" \nMaster Copy\n");
	else if (res->res_nodeid == -1)
		seq_printf(s, "\" \nLooking up master (lkid %x)\n",
			   res->res_first_lkid);
	else
		seq_printf(s, "\" \nInvalid master %d\n", res->res_nodeid);

	/* Print the LVB: */
	if (res->res_lvbptr) {
		seq_printf(s, "LVB: ");
		for (i = 0; i < lvblen; i++) {
			if (i == lvblen / 2)
				seq_printf(s, "\n     ");
			seq_printf(s, "%02x ",
				   (unsigned char) res->res_lvbptr[i]);
		}
		if (rsb_flag(res, RSB_VALNOTVALID))
			seq_printf(s, " (INVALID)");
		seq_printf(s, "\n");
	}

	/* Print the locks attached to this resource */
	seq_printf(s, "Granted Queue\n");
	list_for_each_entry(lkb, &res->res_grantqueue, lkb_statequeue)
		print_lock(s, lkb, res);

	seq_printf(s, "Conversion Queue\n");
	list_for_each_entry(lkb, &res->res_convertqueue, lkb_statequeue)
		print_lock(s, lkb, res);

	seq_printf(s, "Waiting Queue\n");
	list_for_each_entry(lkb, &res->res_waitqueue, lkb_statequeue)
		print_lock(s, lkb, res);

	return 0;
}
142
/*
 * Advance the iterator to the next rsb, walking the hash table bucket
 * by bucket.  Returns 0 with ri->rsb set, or 1 when the whole table
 * has been consumed.
 *
 * NOTE(review): ri->next keeps pointing into a bucket list after the
 * bucket lock is dropped between calls; confirm what prevents entries
 * from being freed while a dump is in progress.
 */
static int rsb_iter_next(struct rsb_iter *ri)
{
	struct dlm_ls *ls = ri->ls;
	int i;

	if (!ri->next) {
 top:
		/* Find the next non-empty hash bucket */
		for (i = ri->entry; i < ls->ls_rsbtbl_size; i++) {
			read_lock(&ls->ls_rsbtbl[i].lock);
			if (!list_empty(&ls->ls_rsbtbl[i].list)) {
				ri->next = ls->ls_rsbtbl[i].list.next;
				read_unlock(&ls->ls_rsbtbl[i].lock);
				break;
			}
			read_unlock(&ls->ls_rsbtbl[i].lock);
		}
		ri->entry = i;

		if (ri->entry >= ls->ls_rsbtbl_size)
			return 1;
	} else {
		i = ri->entry;
		read_lock(&ls->ls_rsbtbl[i].lock);
		ri->next = ri->next->next;
		/* ri->next->next == list.next means ri->next is the list
		   head itself, i.e. the bucket has been fully consumed */
		if (ri->next->next == ls->ls_rsbtbl[i].list.next) {
			/* End of list - move to next bucket */
			ri->next = NULL;
			ri->entry++;
			read_unlock(&ls->ls_rsbtbl[i].lock);
			goto top;
		}
		read_unlock(&ls->ls_rsbtbl[i].lock);
	}
	ri->rsb = list_entry(ri->next, struct dlm_rsb, res_hashchain);

	return 0;
}
181
/* Release an iterator allocated by rsb_iter_init() */
static void rsb_iter_free(struct rsb_iter *ri)
{
	kfree(ri);
}
186
187static struct rsb_iter *rsb_iter_init(struct dlm_ls *ls)
188{
189 struct rsb_iter *ri;
190
191 ri = kmalloc(sizeof *ri, GFP_KERNEL);
192 if (!ri)
193 return NULL;
194
195 ri->ls = ls;
196 ri->entry = 0;
197 ri->next = NULL;
198
199 if (rsb_iter_next(ri)) {
200 rsb_iter_free(ri);
201 return NULL;
202 }
203
204 return ri;
205}
206
207static void *seq_start(struct seq_file *file, loff_t *pos)
208{
209 struct rsb_iter *ri;
210 loff_t n = *pos;
211
212 ri = rsb_iter_init(file->private);
213 if (!ri)
214 return NULL;
215
216 while (n--) {
217 if (rsb_iter_next(ri)) {
218 rsb_iter_free(ri);
219 return NULL;
220 }
221 }
222
223 return ri;
224}
225
226static void *seq_next(struct seq_file *file, void *iter_ptr, loff_t *pos)
227{
228 struct rsb_iter *ri = iter_ptr;
229
230 (*pos)++;
231
232 if (rsb_iter_next(ri)) {
233 rsb_iter_free(ri);
234 return NULL;
235 }
236
237 return ri;
238}
239
/* seq_file stop: the iterator is freed by seq_next() at end-of-table,
   so there is nothing to clean up here */
static void seq_stop(struct seq_file *file, void *iter_ptr)
{
	/* nothing for now */
}
244
245static int seq_show(struct seq_file *file, void *iter_ptr)
246{
247 struct rsb_iter *ri = iter_ptr;
248
249 print_resource(ri->rsb, file);
250
251 return 0;
252}
253
/* seq_file iterator ops for the per-lockspace debugfs dump */
static struct seq_operations dlm_seq_ops = {
	.start = seq_start,
	.next  = seq_next,
	.stop  = seq_stop,
	.show  = seq_show,
};
260
/* Open on a lockspace debugfs file: attach the seq_file machinery and
   stash the dlm_ls pointer (the data argument given to
   debugfs_create_file, presumably stored in u.generic_ip by debugfs --
   kernel-version dependent, verify) as the seq private data. */
static int do_open(struct inode *inode, struct file *file)
{
	struct seq_file *seq;
	int ret;

	ret = seq_open(file, &dlm_seq_ops);
	if (ret)
		return ret;

	seq = file->private_data;
	seq->private = inode->u.generic_ip;

	return 0;
}
275
/* File ops for the per-lockspace debugfs dump file */
static struct file_operations dlm_fops = {
	.owner   = THIS_MODULE,
	.open    = do_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release
};
283
284int dlm_create_debug_file(struct dlm_ls *ls)
285{
286 ls->ls_debug_dentry = debugfs_create_file(ls->ls_name,
287 S_IFREG | S_IRUGO,
288 dlm_root,
289 ls,
290 &dlm_fops);
291 return ls->ls_debug_dentry ? 0 : -ENOMEM;
292}
293
294void dlm_delete_debug_file(struct dlm_ls *ls)
295{
296 if (ls->ls_debug_dentry)
297 debugfs_remove(ls->ls_debug_dentry);
298}
299
300int dlm_register_debugfs(void)
301{
302 dlm_root = debugfs_create_dir("dlm", NULL);
303 return dlm_root ? 0 : -ENOMEM;
304}
305
/* Remove the top-level "dlm" debugfs directory at module exit */
void dlm_unregister_debugfs(void)
{
	debugfs_remove(dlm_root);
}
310
diff --git a/fs/dlm/device.c b/fs/dlm/device.c
new file mode 100644
index 000000000000..899d4f92a4d7
--- /dev/null
+++ b/fs/dlm/device.c
@@ -0,0 +1,1095 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14/*
15 * device.c
16 *
17 * This is the userland interface to the DLM.
18 *
19 * The locking is done via a misc char device (find the
20 * registered minor number in /proc/misc).
21 *
22 * User code should not use this interface directly but
23 * call the library routines in libdlm.a instead.
24 *
25 */
26
27#include <linux/miscdevice.h>
28#include <linux/init.h>
29#include <linux/wait.h>
30#include <linux/module.h>
31#include <linux/file.h>
32#include <linux/fs.h>
33#include <linux/poll.h>
34#include <linux/signal.h>
35#include <linux/spinlock.h>
36#include <linux/idr.h>
37
38#include <linux/dlm.h>
39#include <linux/dlm_device.h>
40
41#include "lvb_table.h"
42
static struct file_operations _dlm_fops;	/* lockspace device fops, set up below */
static const char *name_prefix="dlm";		/* device names are "dlm_<lockspace>" */
static struct list_head user_ls_list;		/* all registered lockspaces */
static struct mutex user_ls_lock;		/* protects user_ls_list */

/* Lock infos are stored in here indexed by lock ID */
static DEFINE_IDR(lockinfo_idr);
static rwlock_t lockinfo_lock;			/* protects lockinfo_idr */

/* Flags in li_flags */
#define LI_FLAG_COMPLETE 1	/* operation done, AST has run */
#define LI_FLAG_FIRSTLOCK 2	/* initial lock request still in flight */
#define LI_FLAG_PERSISTENT 3	/* orphan rather than unlock on close */
#define LI_FLAG_ONLIST 4	/* linked on fi_li_list */

/* flags in ls_flags*/
#define LS_FLAG_DELETED 1	/* lockspace has been unregistered */
#define LS_FLAG_AUTOFREE 2	/* free lockspace when last user closes */


#define LOCKINFO_MAGIC 0x53595324	/* sanity value stored in li_magic */
64
/* Per-lock state for a userland lock, indexed by lock ID in
   lockinfo_idr and freed by release_lockinfo(). */
struct lock_info {
	uint32_t li_magic;		/* LOCKINFO_MAGIC */
	uint8_t li_cmd;			/* command that created this lock */
	int8_t	li_grmode;		/* granted mode, -1 if none yet */
	int8_t	li_rqmode;		/* requested mode, -1 if none yet */
	struct dlm_lksb li_lksb;	/* kernel-side lock status block */
	wait_queue_head_t li_waitq;	/* woken by ast_routine for sync ops */
	unsigned long li_flags;		/* LI_FLAG_* bits */
	void __user *li_castparam;	/* userland completion AST arg */
	void __user *li_castaddr;	/* userland completion AST function */
	void __user *li_bastparam;	/* userland blocking AST arg */
	void __user *li_bastaddr;	/* userland blocking AST function */
	void __user *li_pend_bastparam;	/* bast arg pending conversion */
	void __user *li_pend_bastaddr;	/* bast fn pending conversion */
	struct list_head li_ownerqueue;	/* on file_info.fi_li_list */
	struct file_info *li_file;	/* owning open file */
	struct dlm_lksb __user *li_user_lksb;	/* caller's lksb to fill in */
	struct semaphore li_firstlock;	/* serializes first lock vs its AST */
};
84
/* A queued AST no less */
struct ast_info {
	struct dlm_lock_result result;	/* payload returned by dlm_read() */
	struct list_head list;		/* on file_info.fi_ast_list */
	uint32_t lvb_updated;		/* nonzero: LVB data follows header */
	uint32_t progress;		/* How much has been read */
};
92
/* One of these per userland lockspace */
struct user_ls {
	void *ls_lockspace;	/* handle from dlm_new_lockspace(), NULL once deleted */
	atomic_t ls_refcnt;	/* number of open files on the device */
	long ls_flags;		/* LS_FLAG_* bits */

	/* Passed into misc_register() */
	struct miscdevice ls_miscinfo;
	struct list_head ls_list;	/* on user_ls_list */
};
103
/* misc_device info for the control device */
static struct miscdevice ctl_device;

/*
 * Stuff we hang off the file struct.
 * The first two are to cope with unlocking all the
 * locks held by a process when it dies.
 */
struct file_info {
	struct list_head fi_li_list;	/* List of active lock_infos */
	spinlock_t fi_li_lock;		/* protects fi_li_list */
	struct list_head fi_ast_list;	/* Queue of ASTs to be delivered */
	spinlock_t fi_ast_lock;		/* protects fi_ast_list */
	wait_queue_head_t fi_wait;	/* readers wait here for ASTs */
	struct user_ls *fi_ls;		/* lockspace this file is open on */
	atomic_t fi_refcnt;		/* Number of users */
	unsigned long fi_flags;		/* Bit 1 means the device is open */
};
122
123
/* get and put ops for file_info.
   Actually I don't really like "get" and "put", but everyone
   else seems to use them and I can't think of anything
   nicer at the moment */
/* Take a reference on a file_info */
static void get_file_info(struct file_info *f)
{
	atomic_inc(&f->fi_refcnt);
}
132
/* Drop a reference on a file_info; frees it when the last user goes */
static void put_file_info(struct file_info *f)
{
	if (atomic_dec_and_test(&f->fi_refcnt))
		kfree(f);
}
138
139static void release_lockinfo(struct lock_info *li)
140{
141 put_file_info(li->li_file);
142
143 write_lock(&lockinfo_lock);
144 idr_remove(&lockinfo_idr, li->li_lksb.sb_lkid);
145 write_unlock(&lockinfo_lock);
146
147 if (li->li_lksb.sb_lvbptr)
148 kfree(li->li_lksb.sb_lvbptr);
149 kfree(li);
150
151 module_put(THIS_MODULE);
152}
153
/* Look up a lock_info by lock id.
   NOTE(review): no reference is taken here -- the returned pointer is
   only safe while nothing can call release_lockinfo() on it; confirm
   what serializes lookups against teardown. */
static struct lock_info *get_lockinfo(uint32_t lockid)
{
	struct lock_info *li;

	read_lock(&lockinfo_lock);
	li = idr_find(&lockinfo_idr, lockid);
	read_unlock(&lockinfo_lock);

	return li;
}
164
165static int add_lockinfo(struct lock_info *li)
166{
167 int n;
168 int r;
169 int ret = -EINVAL;
170
171 write_lock(&lockinfo_lock);
172
173 if (idr_find(&lockinfo_idr, li->li_lksb.sb_lkid))
174 goto out_up;
175
176 ret = -ENOMEM;
177 r = idr_pre_get(&lockinfo_idr, GFP_KERNEL);
178 if (!r)
179 goto out_up;
180
181 r = idr_get_new_above(&lockinfo_idr, li, li->li_lksb.sb_lkid, &n);
182 if (r)
183 goto out_up;
184
185 if (n != li->li_lksb.sb_lkid) {
186 idr_remove(&lockinfo_idr, n);
187 goto out_up;
188 }
189
190 ret = 0;
191
192 out_up:
193 write_unlock(&lockinfo_lock);
194
195 return ret;
196}
197
198
199static struct user_ls *__find_lockspace(int minor)
200{
201 struct user_ls *lsinfo;
202
203 list_for_each_entry(lsinfo, &user_ls_list, ls_list) {
204 if (lsinfo->ls_miscinfo.minor == minor)
205 return lsinfo;
206 }
207 return NULL;
208}
209
210/* Find a lockspace struct given the device minor number */
211static struct user_ls *find_lockspace(int minor)
212{
213 struct user_ls *lsinfo;
214
215 mutex_lock(&user_ls_lock);
216 lsinfo = __find_lockspace(minor);
217 mutex_unlock(&user_ls_lock);
218
219 return lsinfo;
220}
221
/* Link a newly registered lockspace onto the global list */
static void add_lockspace_to_list(struct user_ls *lsinfo)
{
	mutex_lock(&user_ls_lock);
	list_add(&lsinfo->ls_list, &user_ls_list);
	mutex_unlock(&user_ls_lock);
}
228
/* Register a lockspace with the DLM and create a misc
   device for userland to access it */
static int register_lockspace(char *name, struct user_ls **ls, int flags)
{
	struct user_ls *newls;
	int status;
	int namelen;

	/* prefix + '_' + name + NUL */
	namelen = strlen(name)+strlen(name_prefix)+2;

	newls = kzalloc(sizeof(struct user_ls), GFP_KERNEL);
	if (!newls)
		return -ENOMEM;

	newls->ls_miscinfo.name = kzalloc(namelen, GFP_KERNEL);
	if (!newls->ls_miscinfo.name) {
		kfree(newls);
		return -ENOMEM;
	}

	/* Create the kernel lockspace first ... */
	status = dlm_new_lockspace(name, strlen(name), &newls->ls_lockspace, 0,
				   DLM_USER_LVB_LEN);
	if (status != 0) {
		kfree(newls->ls_miscinfo.name);
		kfree(newls);
		return status;
	}

	snprintf((char*)newls->ls_miscinfo.name, namelen, "%s_%s",
		 name_prefix, name);

	newls->ls_miscinfo.fops = &_dlm_fops;
	newls->ls_miscinfo.minor = MISC_DYNAMIC_MINOR;

	/* ... then the misc device userland talks to */
	status = misc_register(&newls->ls_miscinfo);
	if (status) {
		printk(KERN_ERR "dlm: misc register failed for %s\n", name);
		/* Unwind the lockspace created above */
		dlm_release_lockspace(newls->ls_lockspace, 0);
		kfree(newls->ls_miscinfo.name);
		kfree(newls);
		return status;
	}

	if (flags & DLM_USER_LSFLG_AUTOFREE)
		set_bit(LS_FLAG_AUTOFREE, &newls->ls_flags);

	add_lockspace_to_list(newls);
	*ls = newls;
	return 0;
}
279
/* Called with the user_ls_lock mutex held */
static int unregister_lockspace(struct user_ls *lsinfo, int force)
{
	int status;

	status = dlm_release_lockspace(lsinfo->ls_lockspace, force);
	if (status)
		return status;

	status = misc_deregister(&lsinfo->ls_miscinfo);
	if (status)
		return status;

	/* Mark deleted; open files still hold references, so only free
	   the struct here when nobody has the device open. Otherwise
	   the last dlm_close() frees it. */
	list_del(&lsinfo->ls_list);
	set_bit(LS_FLAG_DELETED, &lsinfo->ls_flags);
	lsinfo->ls_lockspace = NULL;
	if (atomic_read(&lsinfo->ls_refcnt) == 0) {
		kfree(lsinfo->ls_miscinfo.name);
		kfree(lsinfo);
	}

	return 0;
}
303
/* Add it to userland's AST queue */
/* NOTE(review): on allocation failure the AST is silently dropped;
   also confirm that this path never runs in a context where
   GFP_KERNEL allocation is not allowed. */
static void add_to_astqueue(struct lock_info *li, void *astaddr, void *astparam,
			    int lvb_updated)
{
	struct ast_info *ast = kzalloc(sizeof(struct ast_info), GFP_KERNEL);
	if (!ast)
		return;

	ast->result.user_astparam = astparam;
	ast->result.user_astaddr = astaddr;
	ast->result.user_lksb = li->li_user_lksb;
	memcpy(&ast->result.lksb, &li->li_lksb, sizeof(struct dlm_lksb));
	ast->lvb_updated = lvb_updated;

	/* Queue it and wake any reader blocked in dlm_read()/dlm_poll() */
	spin_lock(&li->li_file->fi_ast_lock);
	list_add_tail(&ast->list, &li->li_file->fi_ast_list);
	spin_unlock(&li->li_file->fi_ast_lock);
	wake_up_interruptible(&li->li_file->fi_wait);
}
323
324static void bast_routine(void *param, int mode)
325{
326 struct lock_info *li = param;
327
328 if (li && li->li_bastaddr)
329 add_to_astqueue(li, li->li_bastaddr, li->li_bastparam, 0);
330}
331
/*
 * This is the kernel's AST routine.
 * All lock, unlock & query operations complete here.
 * The only synchronous ops are those done during device close.
 */
static void ast_routine(void *param)
{
	struct lock_info *li = param;

	/* Param may be NULL if a persistent lock is unlocked by someone else */
	if (!li)
		return;

	/* If this is a successful conversion then activate the blocking ast
	 * args from the conversion request */
	if (!test_bit(LI_FLAG_FIRSTLOCK, &li->li_flags) &&
	    li->li_lksb.sb_status == 0) {

		li->li_bastparam = li->li_pend_bastparam;
		li->li_bastaddr = li->li_pend_bastaddr;
		li->li_pend_bastaddr = NULL;
	}

	/* If it's an async request then post data to the user's AST queue. */
	if (li->li_castaddr) {
		int lvb_updated = 0;

		/* See if the lvb has been updated */
		if (dlm_lvb_operations[li->li_grmode+1][li->li_rqmode+1] == 1)
			lvb_updated = 1;

		if (li->li_lksb.sb_status == 0)
			li->li_grmode = li->li_rqmode;

		/* Only queue AST if the device is still open */
		if (test_bit(1, &li->li_file->fi_flags))
			add_to_astqueue(li, li->li_castaddr, li->li_castparam,
					lvb_updated);

		/* If it's a new lock operation that failed, then
		 * remove it from the owner queue and free the
		 * lock_info.
		 */
		if (test_and_clear_bit(LI_FLAG_FIRSTLOCK, &li->li_flags) &&
		    li->li_lksb.sb_status != 0) {

			/* Wait till dlm_lock() has finished */
			down(&li->li_firstlock);
			up(&li->li_firstlock);

			spin_lock(&li->li_file->fi_li_lock);
			list_del(&li->li_ownerqueue);
			clear_bit(LI_FLAG_ONLIST, &li->li_flags);
			spin_unlock(&li->li_file->fi_li_lock);
			release_lockinfo(li);
			return;
		}
		/* Free unlocks & queries */
		if (li->li_lksb.sb_status == -DLM_EUNLOCK ||
		    li->li_cmd == DLM_USER_QUERY) {
			release_lockinfo(li);
		}
	} else {
		/* Synchronous request, just wake up the caller */
		set_bit(LI_FLAG_COMPLETE, &li->li_flags);
		wake_up_interruptible(&li->li_waitq);
	}
}
400
401/*
402 * Wait for the lock op to complete and return the status.
403 */
404static int wait_for_ast(struct lock_info *li)
405{
406 /* Wait for the AST routine to complete */
407 set_task_state(current, TASK_INTERRUPTIBLE);
408 while (!test_bit(LI_FLAG_COMPLETE, &li->li_flags))
409 schedule();
410
411 set_task_state(current, TASK_RUNNING);
412
413 return li->li_lksb.sb_status;
414}
415
416
/* Open on control device: no per-file state needed */
static int dlm_ctl_open(struct inode *inode, struct file *file)
{
	file->private_data = NULL;
	return 0;
}
423
/* Close on control device: nothing to tear down */
static int dlm_ctl_close(struct inode *inode, struct file *file)
{
	return 0;
}
429
430/* Open on lockspace device */
431static int dlm_open(struct inode *inode, struct file *file)
432{
433 struct file_info *f;
434 struct user_ls *lsinfo;
435
436 lsinfo = find_lockspace(iminor(inode));
437 if (!lsinfo)
438 return -ENOENT;
439
440 f = kzalloc(sizeof(struct file_info), GFP_KERNEL);
441 if (!f)
442 return -ENOMEM;
443
444 atomic_inc(&lsinfo->ls_refcnt);
445 INIT_LIST_HEAD(&f->fi_li_list);
446 INIT_LIST_HEAD(&f->fi_ast_list);
447 spin_lock_init(&f->fi_li_lock);
448 spin_lock_init(&f->fi_ast_lock);
449 init_waitqueue_head(&f->fi_wait);
450 f->fi_ls = lsinfo;
451 f->fi_flags = 0;
452 get_file_info(f);
453 set_bit(1, &f->fi_flags);
454
455 file->private_data = f;
456
457 return 0;
458}
459
460/* Check the user's version matches ours */
461static int check_version(struct dlm_write_request *req)
462{
463 if (req->version[0] != DLM_DEVICE_VERSION_MAJOR ||
464 (req->version[0] == DLM_DEVICE_VERSION_MAJOR &&
465 req->version[1] > DLM_DEVICE_VERSION_MINOR)) {
466
467 printk(KERN_DEBUG "dlm: process %s (%d) version mismatch "
468 "user (%d.%d.%d) kernel (%d.%d.%d)\n",
469 current->comm,
470 current->pid,
471 req->version[0],
472 req->version[1],
473 req->version[2],
474 DLM_DEVICE_VERSION_MAJOR,
475 DLM_DEVICE_VERSION_MINOR,
476 DLM_DEVICE_VERSION_PATCH);
477 return -EINVAL;
478 }
479 return 0;
480}
481
/* Close on lockspace device */
/* Unlocks or orphans every lock the process still holds, then drops
   the file's lockspace reference, freeing the lockspace if it is the
   last user of a deleted or AUTOFREE lockspace. */
static int dlm_close(struct inode *inode, struct file *file)
{
	struct file_info *f = file->private_data;
	struct lock_info li;
	struct lock_info *old_li, *safe;
	sigset_t tmpsig;
	sigset_t allsigs;
	struct user_ls *lsinfo;
	DECLARE_WAITQUEUE(wq, current);

	lsinfo = find_lockspace(iminor(inode));
	if (!lsinfo)
		return -ENOENT;

	/* Mark this closed so that ASTs will not be delivered any more */
	clear_bit(1, &f->fi_flags);

	/* Block signals while we are doing this */
	sigfillset(&allsigs);
	sigprocmask(SIG_BLOCK, &allsigs, &tmpsig);

	/* We use our own lock_info struct here, so that any
	 * outstanding "real" ASTs will be delivered with the
	 * corresponding "real" params, thus freeing the lock_info
	 * that belongs the lock. This catches the corner case where
	 * a lock is BUSY when we try to unlock it here
	 */
	memset(&li, 0, sizeof(li));
	clear_bit(LI_FLAG_COMPLETE, &li.li_flags);
	init_waitqueue_head(&li.li_waitq);
	add_wait_queue(&li.li_waitq, &wq);

	/*
	 * Free any outstanding locks, they are on the
	 * list in LIFO order so there should be no problems
	 * about unlocking parents before children.
	 */
	list_for_each_entry_safe(old_li, safe, &f->fi_li_list, li_ownerqueue) {
		int status;
		int flags = 0;

		/* Don't unlock persistent locks, just mark them orphaned */
		if (test_bit(LI_FLAG_PERSISTENT, &old_li->li_flags)) {
			list_del(&old_li->li_ownerqueue);

			/* Update master copy */
			/* TODO: Check locking core updates the local and
			   remote ORPHAN flags */
			li.li_lksb.sb_lkid = old_li->li_lksb.sb_lkid;
			status = dlm_lock(f->fi_ls->ls_lockspace,
					  old_li->li_grmode, &li.li_lksb,
					  DLM_LKF_CONVERT|DLM_LKF_ORPHAN,
					  NULL, 0, 0, ast_routine, NULL,
					  NULL, NULL);
			if (status != 0)
				printk("dlm: Error orphaning lock %x: %d\n",
				       old_li->li_lksb.sb_lkid, status);

			/* But tidy our references in it */
			release_lockinfo(old_li);
			continue;
		}

		clear_bit(LI_FLAG_COMPLETE, &li.li_flags);

		/* DLM_LKF_IVVALBLK presumably invalidates the LVB for
		   modes that could have written it -- confirm against
		   the locking core */
		flags = DLM_LKF_FORCEUNLOCK;
		if (old_li->li_grmode >= DLM_LOCK_PW)
			flags |= DLM_LKF_IVVALBLK;

		status = dlm_unlock(f->fi_ls->ls_lockspace,
				    old_li->li_lksb.sb_lkid, flags,
				    &li.li_lksb, &li);

		/* Must wait for it to complete as the next lock could be its
		 * parent */
		if (status == 0)
			wait_for_ast(&li);

		/* Unlock succeeded, free the lock_info struct. */
		if (status == 0)
			release_lockinfo(old_li);
	}

	remove_wait_queue(&li.li_waitq, &wq);

	/*
	 * If this is the last reference to the lockspace
	 * then free the struct. If it's an AUTOFREE lockspace
	 * then free the whole thing.
	 */
	mutex_lock(&user_ls_lock);
	if (atomic_dec_and_test(&lsinfo->ls_refcnt)) {

		if (lsinfo->ls_lockspace) {
			if (test_bit(LS_FLAG_AUTOFREE, &lsinfo->ls_flags)) {
				unregister_lockspace(lsinfo, 1);
			}
		} else {
			/* Lockspace already deleted; we were the last
			   user so free the leftover struct */
			kfree(lsinfo->ls_miscinfo.name);
			kfree(lsinfo);
		}
	}
	mutex_unlock(&user_ls_lock);
	put_file_info(f);

	/* Restore signals */
	sigprocmask(SIG_SETMASK, &tmpsig, NULL);
	recalc_sigpending();

	return 0;
}
594
595static int do_user_create_lockspace(struct file_info *fi, uint8_t cmd,
596 struct dlm_lspace_params *kparams)
597{
598 int status;
599 struct user_ls *lsinfo;
600
601 if (!capable(CAP_SYS_ADMIN))
602 return -EPERM;
603
604 status = register_lockspace(kparams->name, &lsinfo, kparams->flags);
605
606 /* If it succeeded then return the minor number */
607 if (status == 0)
608 status = lsinfo->ls_miscinfo.minor;
609
610 return status;
611}
612
613static int do_user_remove_lockspace(struct file_info *fi, uint8_t cmd,
614 struct dlm_lspace_params *kparams)
615{
616 int status;
617 int force = 1;
618 struct user_ls *lsinfo;
619
620 if (!capable(CAP_SYS_ADMIN))
621 return -EPERM;
622
623 mutex_lock(&user_ls_lock);
624 lsinfo = __find_lockspace(kparams->minor);
625 if (!lsinfo) {
626 mutex_unlock(&user_ls_lock);
627 return -EINVAL;
628 }
629
630 if (kparams->flags & DLM_USER_LSFLG_FORCEFREE)
631 force = 2;
632
633 status = unregister_lockspace(lsinfo, force);
634 mutex_unlock(&user_ls_lock);
635
636 return status;
637}
638
639/* Read call, might block if no ASTs are waiting.
640 * It will only ever return one message at a time, regardless
641 * of how many are pending.
642 */
643static ssize_t dlm_read(struct file *file, char __user *buffer, size_t count,
644 loff_t *ppos)
645{
646 struct file_info *fi = file->private_data;
647 struct ast_info *ast;
648 int data_size;
649 int offset;
650 DECLARE_WAITQUEUE(wait, current);
651
652 if (count < sizeof(struct dlm_lock_result))
653 return -EINVAL;
654
655 spin_lock(&fi->fi_ast_lock);
656 if (list_empty(&fi->fi_ast_list)) {
657
658 /* No waiting ASTs.
659 * Return EOF if the lockspace been deleted.
660 */
661 if (test_bit(LS_FLAG_DELETED, &fi->fi_ls->ls_flags))
662 return 0;
663
664 if (file->f_flags & O_NONBLOCK) {
665 spin_unlock(&fi->fi_ast_lock);
666 return -EAGAIN;
667 }
668
669 add_wait_queue(&fi->fi_wait, &wait);
670
671 repeat:
672 set_current_state(TASK_INTERRUPTIBLE);
673 if (list_empty(&fi->fi_ast_list) &&
674 !signal_pending(current)) {
675
676 spin_unlock(&fi->fi_ast_lock);
677 schedule();
678 spin_lock(&fi->fi_ast_lock);
679 goto repeat;
680 }
681
682 current->state = TASK_RUNNING;
683 remove_wait_queue(&fi->fi_wait, &wait);
684
685 if (signal_pending(current)) {
686 spin_unlock(&fi->fi_ast_lock);
687 return -ERESTARTSYS;
688 }
689 }
690
691 ast = list_entry(fi->fi_ast_list.next, struct ast_info, list);
692 list_del(&ast->list);
693 spin_unlock(&fi->fi_ast_lock);
694
695 /* Work out the size of the returned data */
696 data_size = sizeof(struct dlm_lock_result);
697 if (ast->lvb_updated && ast->result.lksb.sb_lvbptr)
698 data_size += DLM_USER_LVB_LEN;
699
700 offset = sizeof(struct dlm_lock_result);
701
702 /* Room for the extended data ? */
703 if (count >= data_size) {
704
705 if (ast->lvb_updated && ast->result.lksb.sb_lvbptr) {
706 if (copy_to_user(buffer+offset,
707 ast->result.lksb.sb_lvbptr,
708 DLM_USER_LVB_LEN))
709 return -EFAULT;
710 ast->result.lvb_offset = offset;
711 offset += DLM_USER_LVB_LEN;
712 }
713 }
714
715 ast->result.length = data_size;
716 /* Copy the header now it has all the offsets in it */
717 if (copy_to_user(buffer, &ast->result, sizeof(struct dlm_lock_result)))
718 offset = -EFAULT;
719
720 /* If we only returned a header and there's more to come then put it
721 back on the list */
722 if (count < data_size) {
723 spin_lock(&fi->fi_ast_lock);
724 list_add(&ast->list, &fi->fi_ast_list);
725 spin_unlock(&fi->fi_ast_lock);
726 } else
727 kfree(ast);
728 return offset;
729}
730
731static unsigned int dlm_poll(struct file *file, poll_table *wait)
732{
733 struct file_info *fi = file->private_data;
734
735 poll_wait(file, &fi->fi_wait, wait);
736
737 spin_lock(&fi->fi_ast_lock);
738 if (!list_empty(&fi->fi_ast_list)) {
739 spin_unlock(&fi->fi_ast_lock);
740 return POLLIN | POLLRDNORM;
741 }
742
743 spin_unlock(&fi->fi_ast_lock);
744 return 0;
745}
746
747static struct lock_info *allocate_lockinfo(struct file_info *fi, uint8_t cmd,
748 struct dlm_lock_params *kparams)
749{
750 struct lock_info *li;
751
752 if (!try_module_get(THIS_MODULE))
753 return NULL;
754
755 li = kzalloc(sizeof(struct lock_info), GFP_KERNEL);
756 if (li) {
757 li->li_magic = LOCKINFO_MAGIC;
758 li->li_file = fi;
759 li->li_cmd = cmd;
760 li->li_flags = 0;
761 li->li_grmode = -1;
762 li->li_rqmode = -1;
763 li->li_pend_bastparam = NULL;
764 li->li_pend_bastaddr = NULL;
765 li->li_castaddr = NULL;
766 li->li_castparam = NULL;
767 li->li_lksb.sb_lvbptr = NULL;
768 li->li_bastaddr = kparams->bastaddr;
769 li->li_bastparam = kparams->bastparam;
770
771 get_file_info(fi);
772 }
773 return li;
774}
775
/*
 * Handle a userland lock or convert request.  Returns the (positive)
 * lock id on success so the caller has it immediately, or a negative
 * errno.  The completion is delivered later through ast_routine().
 */
static int do_user_lock(struct file_info *fi, uint8_t cmd,
			struct dlm_lock_params *kparams)
{
	struct lock_info *li;
	int status;

	/*
	 * Validate things that we need to have correct.
	 */
	if (!kparams->castaddr)
		return -EINVAL;

	if (!kparams->lksb)
		return -EINVAL;

	/* Persistent child locks are not available yet */
	if ((kparams->flags & DLM_LKF_PERSISTENT) && kparams->parent)
		return -EINVAL;

	/* For conversions, there should already be a lockinfo struct,
	   unless we are adopting an orphaned persistent lock */
	if (kparams->flags & DLM_LKF_CONVERT) {

		li = get_lockinfo(kparams->lkid);

		/* If this is a persistent lock we will have to create a
		   lockinfo again */
		if (!li && (kparams->flags & DLM_LKF_PERSISTENT)) {
			li = allocate_lockinfo(fi, cmd, kparams);
			if (!li)
				return -ENOMEM;

			li->li_lksb.sb_lkid = kparams->lkid;
			li->li_castaddr = kparams->castaddr;
			li->li_castparam = kparams->castparam;

			/* OK, this isn't exactly a FIRSTLOCK but it is the
			   first time we've used this lockinfo, and if things
			   fail we want rid of it */
			init_MUTEX_LOCKED(&li->li_firstlock);
			set_bit(LI_FLAG_FIRSTLOCK, &li->li_flags);
			add_lockinfo(li);

			/* TODO: do a query to get the current state ?? */
		}
		if (!li)
			return -EINVAL;

		/* Guard against a stale or bogus lock id */
		if (li->li_magic != LOCKINFO_MAGIC)
			return -EINVAL;

		/* For conversions don't overwrite the current blocking AST
		   info so that:
		   a) if a blocking AST fires before the conversion is queued
		      it runs the current handler
		   b) if the conversion is cancelled, the original blocking AST
		      declaration is active
		   The pend_ info is made active when the conversion
		   completes.
		 */
		li->li_pend_bastaddr = kparams->bastaddr;
		li->li_pend_bastparam = kparams->bastparam;
	} else {
		li = allocate_lockinfo(fi, cmd, kparams);
		if (!li)
			return -ENOMEM;

		/* semaphore to allow us to complete our work before
		   the AST routine runs. In fact we only need (and use) this
		   when the initial lock fails */
		init_MUTEX_LOCKED(&li->li_firstlock);
		set_bit(LI_FLAG_FIRSTLOCK, &li->li_flags);
	}

	li->li_user_lksb = kparams->lksb;
	li->li_castaddr = kparams->castaddr;
	li->li_castparam = kparams->castparam;
	li->li_lksb.sb_lkid = kparams->lkid;
	li->li_rqmode = kparams->mode;
	if (kparams->flags & DLM_LKF_PERSISTENT)
		set_bit(LI_FLAG_PERSISTENT, &li->li_flags);

	/* Copy in the value block */
	if (kparams->flags & DLM_LKF_VALBLK) {
		if (!li->li_lksb.sb_lvbptr) {
			li->li_lksb.sb_lvbptr = kmalloc(DLM_USER_LVB_LEN,
							GFP_KERNEL);
			if (!li->li_lksb.sb_lvbptr) {
				status = -ENOMEM;
				goto out_err;
			}
		}

		memcpy(li->li_lksb.sb_lvbptr, kparams->lvb, DLM_USER_LVB_LEN);
	}

	/* Lock it ... */
	status = dlm_lock(fi->fi_ls->ls_lockspace,
			  kparams->mode, &li->li_lksb,
			  kparams->flags,
			  kparams->name, kparams->namelen,
			  kparams->parent,
			  ast_routine,
			  li,
			  (li->li_pend_bastaddr || li->li_bastaddr) ?
			  bast_routine : NULL,
			  kparams->range.ra_end ? &kparams->range : NULL);
	if (status)
		goto out_err;

	/* If it succeeded (this far) with a new lock then keep track of
	   it on the file's lockinfo list */
	if (!status && test_bit(LI_FLAG_FIRSTLOCK, &li->li_flags)) {

		spin_lock(&fi->fi_li_lock);
		list_add(&li->li_ownerqueue, &fi->fi_li_list);
		set_bit(LI_FLAG_ONLIST, &li->li_flags);
		spin_unlock(&fi->fi_li_lock);
		if (add_lockinfo(li))
			printk(KERN_WARNING "Add lockinfo failed\n");

		/* Let ast_routine() proceed; it may be waiting on this
		   semaphore to tear down a failed first lock */
		up(&li->li_firstlock);
	}

	/* Return the lockid as the user needs it /now/ */
	return li->li_lksb.sb_lkid;

 out_err:
	if (test_bit(LI_FLAG_FIRSTLOCK, &li->li_flags))
		release_lockinfo(li);
	return status;

}
909
/* Process a DLM_USER_UNLOCK (or cancel) request from userspace.
 *
 * fi:      per-open-file state for this lockspace device
 * cmd:     command byte from the dlm_write_request
 * kparams: kernel copy of the caller's lock parameters
 *
 * Returns 0 on success or a negative errno; completion is delivered
 * asynchronously through the lock's AST machinery.
 */
static int do_user_unlock(struct file_info *fi, uint8_t cmd,
			struct dlm_lock_params *kparams)
{
	struct lock_info *li;
	int status;
	int convert_cancel = 0;

	li = get_lockinfo(kparams->lkid);
	if (!li) {
		/* No lockinfo for this lkid: fabricate one so the unlock
		   completion has somewhere to deliver its AST (presumably
		   for an adopted orphan/persistent lock -- TODO confirm). */
		li = allocate_lockinfo(fi, cmd, kparams);
		if (!li)
			return -ENOMEM;
		spin_lock(&fi->fi_li_lock);
		list_add(&li->li_ownerqueue, &fi->fi_li_list);
		set_bit(LI_FLAG_ONLIST, &li->li_flags);
		spin_unlock(&fi->fi_li_lock);
	}

	/* Guard against a stale or corrupt lookup result. */
	if (li->li_magic != LOCKINFO_MAGIC)
		return -EINVAL;

	li->li_user_lksb = kparams->lksb;
	li->li_castparam = kparams->castparam;
	li->li_cmd = cmd;

	/* Cancelling a conversion doesn't remove the lock...*/
	if (kparams->flags & DLM_LKF_CANCEL && li->li_grmode != -1)
		convert_cancel = 1;

	/* Wait until dlm_lock() has completed: li_firstlock is released
	   by the initial-lock path once the lkb is fully set up; the
	   down/up pair just synchronizes, it holds nothing afterwards. */
	if (!test_bit(LI_FLAG_ONLIST, &li->li_flags)) {
		down(&li->li_firstlock);
		up(&li->li_firstlock);
	}

	/* dlm_unlock() passes a 0 for castaddr which means don't overwrite
	   the existing li_castaddr as that's the completion routine for
	   unlocks. dlm_unlock_wait() specifies a new AST routine to be
	   executed when the unlock completes. */
	if (kparams->castaddr)
		li->li_castaddr = kparams->castaddr;

	/* Use existing lksb & astparams */
	status = dlm_unlock(fi->fi_ls->ls_lockspace,
			     kparams->lkid,
			     kparams->flags, &li->li_lksb, li);

	/* An accepted plain unlock completes asynchronously, so drop the
	   lock from this file's owner list now.  A conversion cancel
	   leaves the (still granted) lock on the list. */
	if (!status && !convert_cancel) {
		spin_lock(&fi->fi_li_lock);
		list_del(&li->li_ownerqueue);
		clear_bit(LI_FLAG_ONLIST, &li->li_flags);
		spin_unlock(&fi->fi_li_lock);
	}

	return status;
}
966
/* Write call, submit a locking request.
 *
 * Entry point for both device flavours: per-lockspace devices have
 * file->private_data set (fi != NULL) and accept lock/unlock; the
 * control device has fi == NULL and accepts only lockspace
 * create/remove.  The cross checks below enforce that split.
 *
 * Returns count on success, a negative errno on failure.  Note that
 * do_user_lock() returns the new (positive) lkid on success, which
 * falls through the "else return status" branch and is handed straight
 * back to userspace in place of count.
 */
static ssize_t dlm_write(struct file *file, const char __user *buffer,
			 size_t count, loff_t *ppos)
{
	struct file_info *fi = file->private_data;
	struct dlm_write_request *kparams;
	sigset_t tmpsig;
	sigset_t allsigs;
	int status;

	/* -1 because lock name is optional */
	if (count < sizeof(struct dlm_write_request)-1)
		return -EINVAL;

	/* Has the lockspace been deleted */
	if (fi && test_bit(LS_FLAG_DELETED, &fi->fi_ls->ls_flags))
		return -ENOENT;

	kparams = kmalloc(count, GFP_KERNEL);
	if (!kparams)
		return -ENOMEM;

	status = -EFAULT;
	/* Get the command info */
	if (copy_from_user(kparams, buffer, count))
		goto out_free;

	status = -EBADE;
	if (check_version(kparams))
		goto out_free;

	/* Block signals while we are doing this so the operation is not
	   interrupted part-way through; original mask restored at out_sig. */
	sigfillset(&allsigs);
	sigprocmask(SIG_BLOCK, &allsigs, &tmpsig);

	status = -EINVAL;
	switch (kparams->cmd)
	{
	case DLM_USER_LOCK:
		if (!fi) goto out_sig;	/* not valid on the control device */
		status = do_user_lock(fi, kparams->cmd, &kparams->i.lock);
		break;

	case DLM_USER_UNLOCK:
		if (!fi) goto out_sig;	/* not valid on the control device */
		status = do_user_unlock(fi, kparams->cmd, &kparams->i.lock);
		break;

	case DLM_USER_CREATE_LOCKSPACE:
		if (fi) goto out_sig;	/* only valid on the control device */
		status = do_user_create_lockspace(fi, kparams->cmd,
						  &kparams->i.lspace);
		break;

	case DLM_USER_REMOVE_LOCKSPACE:
		if (fi) goto out_sig;	/* only valid on the control device */
		status = do_user_remove_lockspace(fi, kparams->cmd,
						  &kparams->i.lspace);
		break;
	default:
		printk("Unknown command passed to DLM device : %d\n",
			kparams->cmd);
		break;
	}

 out_sig:
	/* Restore signals */
	sigprocmask(SIG_SETMASK, &tmpsig, NULL);
	recalc_sigpending();

 out_free:
	kfree(kparams);
	if (status == 0)
		return count;
	else
		return status;
}
1044
/* Operations for the per-lockspace misc devices: full lock/unlock
   (write), AST delivery (read/poll) interface. */
static struct file_operations _dlm_fops = {
      .open    = dlm_open,
      .release = dlm_close,
      .read    = dlm_read,
      .write   = dlm_write,
      .poll    = dlm_poll,
      .owner   = THIS_MODULE,
};
1053
/* Operations for the "dlm-control" device: write-only.  dlm_write()
   distinguishes it by the NULL private_data and restricts it to
   lockspace create/remove commands. */
static struct file_operations _dlm_ctl_fops = {
      .open    = dlm_ctl_open,
      .release = dlm_ctl_close,
      .write   = dlm_write,
      .owner   = THIS_MODULE,
};
1060
1061/*
1062 * Create control device
1063 */
1064static int __init dlm_device_init(void)
1065{
1066 int r;
1067
1068 INIT_LIST_HEAD(&user_ls_list);
1069 mutex_init(&user_ls_lock);
1070 rwlock_init(&lockinfo_lock);
1071
1072 ctl_device.name = "dlm-control";
1073 ctl_device.fops = &_dlm_ctl_fops;
1074 ctl_device.minor = MISC_DYNAMIC_MINOR;
1075
1076 r = misc_register(&ctl_device);
1077 if (r) {
1078 printk(KERN_ERR "dlm: misc_register failed for control dev\n");
1079 return r;
1080 }
1081
1082 return 0;
1083}
1084
/* Module unload: unregister the control device.  Per-lockspace devices
   are presumably torn down via lockspace removal -- not visible here. */
static void __exit dlm_device_exit(void)
{
	misc_deregister(&ctl_device);
}
1089
1090MODULE_DESCRIPTION("Distributed Lock Manager device interface");
1091MODULE_AUTHOR("Red Hat, Inc.");
1092MODULE_LICENSE("GPL");
1093
1094module_init(dlm_device_init);
1095module_exit(dlm_device_exit);
diff --git a/fs/dlm/dir.c b/fs/dlm/dir.c
new file mode 100644
index 000000000000..46754553fdcc
--- /dev/null
+++ b/fs/dlm/dir.c
@@ -0,0 +1,423 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#include "dlm_internal.h"
15#include "lockspace.h"
16#include "member.h"
17#include "lowcomms.h"
18#include "rcom.h"
19#include "config.h"
20#include "memory.h"
21#include "recover.h"
22#include "util.h"
23#include "lock.h"
24#include "dir.h"
25
26
/* Park a directory entry on the lockspace's recovery free list so
   get_free_de() can recycle the allocation during recovery. */
static void put_free_de(struct dlm_ls *ls, struct dlm_direntry *de)
{
	spin_lock(&ls->ls_recover_list_lock);
	list_add(&de->list, &ls->ls_recover_list);
	spin_unlock(&ls->ls_recover_list_lock);
}
33
34static struct dlm_direntry *get_free_de(struct dlm_ls *ls, int len)
35{
36 int found = 0;
37 struct dlm_direntry *de;
38
39 spin_lock(&ls->ls_recover_list_lock);
40 list_for_each_entry(de, &ls->ls_recover_list, list) {
41 if (de->length == len) {
42 list_del(&de->list);
43 de->master_nodeid = 0;
44 memset(de->name, 0, len);
45 found = 1;
46 break;
47 }
48 }
49 spin_unlock(&ls->ls_recover_list_lock);
50
51 if (!found)
52 de = allocate_direntry(ls, len);
53 return de;
54}
55
/* Free everything left on the recovery free list (entries parked there
   by dlm_dir_clear()/put_free_de() and not recycled by get_free_de()). */
void dlm_clear_free_entries(struct dlm_ls *ls)
{
	struct dlm_direntry *de;

	spin_lock(&ls->ls_recover_list_lock);
	while (!list_empty(&ls->ls_recover_list)) {
		de = list_entry(ls->ls_recover_list.next, struct dlm_direntry,
				list);
		list_del(&de->list);
		free_direntry(de);
	}
	spin_unlock(&ls->ls_recover_list_lock);
}
69
70/*
71 * We use the upper 16 bits of the hash value to select the directory node.
72 * Low bits are used for distribution of rsb's among hash buckets on each node.
73 *
74 * To give the exact range wanted (0 to num_nodes-1), we apply a modulus of
75 * num_nodes to the hash value. This value in the desired range is used as an
76 * offset into the sorted list of nodeid's to give the particular nodeid.
77 */
78
79int dlm_hash2nodeid(struct dlm_ls *ls, uint32_t hash)
80{
81 struct list_head *tmp;
82 struct dlm_member *memb = NULL;
83 uint32_t node, n = 0;
84 int nodeid;
85
86 if (ls->ls_num_nodes == 1) {
87 nodeid = dlm_our_nodeid();
88 goto out;
89 }
90
91 if (ls->ls_node_array) {
92 node = (hash >> 16) % ls->ls_total_weight;
93 nodeid = ls->ls_node_array[node];
94 goto out;
95 }
96
97 /* make_member_array() failed to kmalloc ls_node_array... */
98
99 node = (hash >> 16) % ls->ls_num_nodes;
100
101 list_for_each(tmp, &ls->ls_nodes) {
102 if (n++ != node)
103 continue;
104 memb = list_entry(tmp, struct dlm_member, list);
105 break;
106 }
107
108 DLM_ASSERT(memb , printk("num_nodes=%u n=%u node=%u\n",
109 ls->ls_num_nodes, n, node););
110 nodeid = memb->nodeid;
111 out:
112 return nodeid;
113}
114
/* Directory nodeid for an rsb, derived from its precomputed name hash. */
int dlm_dir_nodeid(struct dlm_rsb *r)
{
	return dlm_hash2nodeid(r->res_ls, r->res_hash);
}
119
120static inline uint32_t dir_hash(struct dlm_ls *ls, char *name, int len)
121{
122 uint32_t val;
123
124 val = jhash(name, len, 0);
125 val &= (ls->ls_dirtbl_size - 1);
126
127 return val;
128}
129
130static void add_entry_to_hash(struct dlm_ls *ls, struct dlm_direntry *de)
131{
132 uint32_t bucket;
133
134 bucket = dir_hash(ls, de->name, de->length);
135 list_add_tail(&de->list, &ls->ls_dirtbl[bucket].list);
136}
137
138static struct dlm_direntry *search_bucket(struct dlm_ls *ls, char *name,
139 int namelen, uint32_t bucket)
140{
141 struct dlm_direntry *de;
142
143 list_for_each_entry(de, &ls->ls_dirtbl[bucket].list, list) {
144 if (de->length == namelen && !memcmp(name, de->name, namelen))
145 goto out;
146 }
147 de = NULL;
148 out:
149 return de;
150}
151
/* Remove the directory entry for 'name' on a request from node
 * 'nodeid'.  Logs and ignores the request if no entry exists or the
 * entry records a different master than the requester.
 */
void dlm_dir_remove_entry(struct dlm_ls *ls, int nodeid, char *name, int namelen)
{
	struct dlm_direntry *de;
	uint32_t bucket;

	bucket = dir_hash(ls, name, namelen);

	write_lock(&ls->ls_dirtbl[bucket].lock);

	de = search_bucket(ls, name, namelen, bucket);

	if (!de) {
		log_error(ls, "remove fr %u none", nodeid);
		goto out;
	}

	if (de->master_nodeid != nodeid) {
		log_error(ls, "remove fr %u ID %u", nodeid, de->master_nodeid);
		goto out;
	}

	list_del(&de->list);
	free_direntry(de);
 out:
	write_unlock(&ls->ls_dirtbl[bucket].lock);
}
178
/* Empty every directory hash bucket, parking the entries on the
 * recovery free list for reuse by get_free_de() rather than freeing
 * them.  The assert requires the free list to be empty on entry,
 * i.e. no recovery pass has left entries behind.
 */
void dlm_dir_clear(struct dlm_ls *ls)
{
	struct list_head *head;
	struct dlm_direntry *de;
	int i;

	DLM_ASSERT(list_empty(&ls->ls_recover_list), );

	for (i = 0; i < ls->ls_dirtbl_size; i++) {
		write_lock(&ls->ls_dirtbl[i].lock);
		head = &ls->ls_dirtbl[i].list;
		while (!list_empty(head)) {
			de = list_entry(head->next, struct dlm_direntry, list);
			list_del(&de->list);
			put_free_de(ls, de);
		}
		write_unlock(&ls->ls_dirtbl[i].lock);
	}
}
198
/* Rebuild the resource directory after a membership change.
 *
 * Clears the directory, then asks every current member (repeated
 * DLM_RCOM_NAMES round-trips) for the names of resources it masters
 * whose directory node is us.  Each reply buffer, produced by
 * dlm_copy_master_names() on the remote node, is a sequence of
 * (big-endian uint16 length, name) records.
 *
 * Returns 0 on success (also setting DLM_RS_DIR in the recovery
 * status) or a negative errno.
 */
int dlm_recover_directory(struct dlm_ls *ls)
{
	struct dlm_member *memb;
	struct dlm_direntry *de;
	char *b, *last_name = NULL;
	int error = -ENOMEM, last_len, count = 0;
	uint16_t namelen;

	log_debug(ls, "dlm_recover_directory");

	if (dlm_no_directory(ls))
		goto out_status;

	dlm_dir_clear(ls);

	last_name = kmalloc(DLM_RESNAME_MAXLEN, GFP_KERNEL);
	if (!last_name)
		goto out;

	list_for_each_entry(memb, &ls->ls_nodes, list) {
		memset(last_name, 0, DLM_RESNAME_MAXLEN);
		last_len = 0;

		for (;;) {
			/* Bail out if a newer recovery superseded this one. */
			error = dlm_recovery_stopped(ls);
			if (error)
				goto out_free;

			/* Request the next batch, resuming after the last
			   name received from this node. */
			error = dlm_rcom_names(ls, memb->nodeid,
					       last_name, last_len);
			if (error)
				goto out_free;

			schedule();

			/*
			 * pick namelen/name pairs out of received buffer
			 */

			b = ls->ls_recover_buf + sizeof(struct dlm_rcom);

			for (;;) {
				memcpy(&namelen, b, sizeof(uint16_t));
				namelen = be16_to_cpu(namelen);
				b += sizeof(uint16_t);

				/* namelen of 0xFFFF marks end of names for
				   this node; namelen of 0 marks end of the
				   buffer */

				if (namelen == 0xFFFF)
					goto done;
				if (!namelen)
					break;

				error = -ENOMEM;
				de = get_free_de(ls, namelen);
				if (!de)
					goto out_free;

				de->master_nodeid = memb->nodeid;
				de->length = namelen;
				last_len = namelen;
				memcpy(de->name, b, namelen);
				memcpy(last_name, b, namelen);
				b += namelen;

				add_entry_to_hash(ls, de);
				count++;
			}
		}
         done:
		;
	}

 out_status:
	error = 0;
	dlm_set_recover_status(ls, DLM_RS_DIR);
	log_debug(ls, "dlm_recover_directory %d entries", count);
 out_free:
	kfree(last_name);
 out:
	dlm_clear_free_entries(ls);
	return error;
}
284
/* Look up -- creating if absent -- the directory entry for 'name' and
 * return its master nodeid in *r_nodeid.  'nodeid' is the requesting
 * node, which becomes the recorded master when the entry is created.
 *
 * The bucket lock is dropped across allocate_direntry() (which may
 * sleep), so after reacquiring it we must re-search the bucket in case
 * another thread inserted the same name meanwhile.
 *
 * Returns 0 on success, -EEXIST if the entry already exists with the
 * requester as master, or -ENOMEM.
 */
static int get_entry(struct dlm_ls *ls, int nodeid, char *name,
		     int namelen, int *r_nodeid)
{
	struct dlm_direntry *de, *tmp;
	uint32_t bucket;

	bucket = dir_hash(ls, name, namelen);

	write_lock(&ls->ls_dirtbl[bucket].lock);
	de = search_bucket(ls, name, namelen, bucket);
	if (de) {
		*r_nodeid = de->master_nodeid;
		write_unlock(&ls->ls_dirtbl[bucket].lock);
		if (*r_nodeid == nodeid)
			return -EEXIST;
		return 0;
	}

	write_unlock(&ls->ls_dirtbl[bucket].lock);

	de = allocate_direntry(ls, namelen);
	if (!de)
		return -ENOMEM;

	de->master_nodeid = nodeid;
	de->length = namelen;
	memcpy(de->name, name, namelen);

	write_lock(&ls->ls_dirtbl[bucket].lock);
	tmp = search_bucket(ls, name, namelen, bucket);
	if (tmp) {
		/* Lost the race: use the concurrently inserted entry. */
		free_direntry(de);
		de = tmp;
	} else {
		list_add_tail(&de->list, &ls->ls_dirtbl[bucket].list);
	}
	*r_nodeid = de->master_nodeid;
	write_unlock(&ls->ls_dirtbl[bucket].lock);
	return 0;
}
325
/* Public directory lookup on behalf of node 'nodeid'; see get_entry()
   for semantics and return values. */
int dlm_dir_lookup(struct dlm_ls *ls, int nodeid, char *name, int namelen,
		   int *r_nodeid)
{
	return get_entry(ls, nodeid, name, namelen, r_nodeid);
}
331
/* Copy the names of master rsb's into the buffer provided.
   Only select names whose dir node is the given nodeid.

   inbuf/inlen carry the last name returned by the previous call (inlen
   <= 1 means start from the beginning) so the walk over ls_root_list
   can resume where it left off.  Output records are (big-endian uint16
   length, name) pairs; a zero-length record means "buffer full, ask
   again", 0xFFFF means "no more names".  The receiving side is
   dlm_recover_directory(). */

void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen,
			   char *outbuf, int outlen, int nodeid)
{
	struct list_head *list;
	struct dlm_rsb *start_r = NULL, *r = NULL;
	int offset = 0, start_namelen, error, dir_nodeid;
	char *start_name;
	uint16_t be_namelen;

	/*
	 * Find the rsb where we left off (or start again)
	 */

	start_namelen = inlen;
	start_name = inbuf;

	if (start_namelen > 1) {
		/*
		 * We could also use a find_rsb_root() function here that
		 * searched the ls_root_list.
		 */
		error = dlm_find_rsb(ls, start_name, start_namelen, R_MASTER,
				     &start_r);
		DLM_ASSERT(!error && start_r,
			   printk("error %d\n", error););
		DLM_ASSERT(!list_empty(&start_r->res_root_list),
			   dlm_print_rsb(start_r););
		dlm_put_rsb(start_r);
	}

	/*
	 * Send rsb names for rsb's we're master of and whose directory node
	 * matches the requesting node.
	 */

	down_read(&ls->ls_root_sem);
	if (start_r)
		list = start_r->res_root_list.next;
	else
		list = ls->ls_root_list.next;

	for (offset = 0; list != &ls->ls_root_list; list = list->next) {
		r = list_entry(list, struct dlm_rsb, res_root_list);
		if (r->res_nodeid)
			continue;	/* not mastered locally */

		dir_nodeid = dlm_dir_nodeid(r);
		if (dir_nodeid != nodeid)
			continue;

		/*
		 * The block ends when we can't fit the following in the
		 * remaining buffer space:
		 * namelen (uint16_t) +
		 * name (r->res_length) +
		 * end-of-block record 0x0000 (uint16_t)
		 */

		if (offset + sizeof(uint16_t)*2 + r->res_length > outlen) {
			/* Write end-of-block record */
			be_namelen = 0;
			memcpy(outbuf + offset, &be_namelen, sizeof(uint16_t));
			offset += sizeof(uint16_t);
			goto out;
		}

		be_namelen = cpu_to_be16(r->res_length);
		memcpy(outbuf + offset, &be_namelen, sizeof(uint16_t));
		offset += sizeof(uint16_t);
		memcpy(outbuf + offset, r->res_name, r->res_length);
		offset += r->res_length;
	}

	/*
	 * If we've reached the end of the list (and there's room) write a
	 * terminating record.
	 */

	if ((list == &ls->ls_root_list) &&
	    (offset + sizeof(uint16_t) <= outlen)) {
		be_namelen = 0xFFFF;
		memcpy(outbuf + offset, &be_namelen, sizeof(uint16_t));
		offset += sizeof(uint16_t);
	}

 out:
	up_read(&ls->ls_root_sem);
}
423
diff --git a/fs/dlm/dir.h b/fs/dlm/dir.h
new file mode 100644
index 000000000000..0b0eb1267b6e
--- /dev/null
+++ b/fs/dlm/dir.h
@@ -0,0 +1,30 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
#ifndef __DIR_DOT_H__
#define __DIR_DOT_H__


/* Resource directory: maps a resource name to the nodeid of its
   master.  The directory node responsible for a name is computed from
   the name's hash (dlm_hash2nodeid). */

int dlm_dir_nodeid(struct dlm_rsb *rsb);
int dlm_hash2nodeid(struct dlm_ls *ls, uint32_t hash);
void dlm_dir_remove_entry(struct dlm_ls *ls, int nodeid, char *name, int len);
void dlm_dir_clear(struct dlm_ls *ls);
void dlm_clear_free_entries(struct dlm_ls *ls);
int dlm_recover_directory(struct dlm_ls *ls);
int dlm_dir_lookup(struct dlm_ls *ls, int nodeid, char *name, int namelen,
	int *r_nodeid);
void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen,
	char *outbuf, int outlen, int nodeid);

#endif				/* __DIR_DOT_H__ */
30
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
new file mode 100644
index 000000000000..16f20cfd9197
--- /dev/null
+++ b/fs/dlm/dlm_internal.h
@@ -0,0 +1,505 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#ifndef __DLM_INTERNAL_DOT_H__
15#define __DLM_INTERNAL_DOT_H__
16
17/*
18 * This is the main header file to be included in each DLM source file.
19 */
20
21#include <linux/module.h>
22#include <linux/slab.h>
23#include <linux/sched.h>
24#include <linux/types.h>
25#include <linux/ctype.h>
26#include <linux/spinlock.h>
27#include <linux/vmalloc.h>
28#include <linux/list.h>
29#include <linux/errno.h>
30#include <linux/random.h>
31#include <linux/delay.h>
32#include <linux/socket.h>
33#include <linux/kthread.h>
34#include <linux/kobject.h>
35#include <linux/kref.h>
36#include <linux/kernel.h>
37#include <linux/jhash.h>
38#include <linux/mutex.h>
39#include <asm/semaphore.h>
40#include <asm/uaccess.h>
41
42#include <linux/dlm.h>
43
44#define DLM_LOCKSPACE_LEN 64
45
46/* Size of the temp buffer midcomms allocates on the stack.
47 We try to make this large enough so most messages fit.
48 FIXME: should sctp make this unnecessary? */
49
50#define DLM_INBUF_LEN 148
51
52struct dlm_ls;
53struct dlm_lkb;
54struct dlm_rsb;
55struct dlm_member;
56struct dlm_lkbtable;
57struct dlm_rsbtable;
58struct dlm_dirtable;
59struct dlm_direntry;
60struct dlm_recover;
61struct dlm_header;
62struct dlm_message;
63struct dlm_rcom;
64struct dlm_mhandle;
65
66#define log_print(fmt, args...) \
67 printk(KERN_ERR "dlm: "fmt"\n" , ##args)
68#define log_error(ls, fmt, args...) \
69 printk(KERN_ERR "dlm: %s: " fmt "\n", (ls)->ls_name , ##args)
70
71#ifdef DLM_LOG_DEBUG
72#define log_debug(ls, fmt, args...) log_error(ls, fmt, ##args)
73#else
74#define log_debug(ls, fmt, args...)
75#endif
76
/* Fatal consistency check.  "do" is a statement block (typically a
   printk) executed after the failure banner to dump extra state.  The
   panic() after BUG() is effectively unreachable where BUG() halts,
   but acts as a backstop on configurations where it does not -- TODO
   confirm. */
#define DLM_ASSERT(x, do) \
{ \
  if (!(x)) \
  { \
    printk(KERN_ERR "\nDLM:  Assertion failed on line %d of file %s\n" \
               "DLM:  assertion:  \"%s\"\n" \
               "DLM:  time = %lu\n", \
               __LINE__, __FILE__, #x, jiffies); \
    {do} \
    printk("\n"); \
    BUG(); \
    panic("DLM:  Record message above and reboot.\n"); \
  } \
}
91
92
/* One resource-directory entry: records which node masters the named
   resource.  Lives on a dlm_dirtable bucket list, or on
   ls_recover_list while being recycled during recovery. */
struct dlm_direntry {
	struct list_head	list;
	uint32_t		master_nodeid;
	uint16_t		length;		/* bytes used in name[] */
	char			name[1];	/* old-style flexible array;
						   extra space is allocated
						   past the struct --
						   presumably by
						   allocate_direntry() */
};
99
/* Hash-bucket types: each lockspace holds arrays of these
   (ls_dirtbl, ls_rsbtbl, ls_lkbtbl), one rwlock per bucket. */

struct dlm_dirtable {
	struct list_head	list;
	rwlock_t		lock;
};

struct dlm_rsbtable {
	struct list_head	list;	/* active rsbs */
	struct list_head	toss;	/* unused rsbs pending release */
	rwlock_t		lock;
};

struct dlm_lkbtable {
	struct list_head	list;
	rwlock_t		lock;
	uint16_t		counter;	/* per-bucket counter, presumably
						   for generating lkb ids --
						   TODO confirm */
};
116
117/*
118 * Lockspace member (per node in a ls)
119 */
120
struct dlm_member {
	struct list_head	list;	/* on ls_nodes or ls_nodes_gone */
	int			nodeid;
	int			weight;	/* share of directory load; used to
					   build the weighted ls_node_array
					   (see dlm_hash2nodeid) */
};
126
127/*
128 * Save and manage recovery state for a lockspace.
129 */
130
struct dlm_recover {
	struct list_head	list;
	int			*nodeids;	/* array of node_count nodeids */
	int			node_count;
	uint64_t		seq;		/* recovery sequence number;
						   compare ls_recover_seq */
};
137
138/*
139 * Pass input args to second stage locking function.
140 */
141
142struct dlm_args {
143 uint32_t flags;
144 void *astaddr;
145 long astparam;
146 void *bastaddr;
147 int mode;
148 struct dlm_lksb *lksb;
149 struct dlm_range *range;
150};
151
152
153/*
154 * Lock block
155 *
156 * A lock can be one of three types:
157 *
158 * local copy lock is mastered locally
159 * (lkb_nodeid is zero and DLM_LKF_MSTCPY is not set)
160 * process copy lock is mastered on a remote node
161 * (lkb_nodeid is non-zero and DLM_LKF_MSTCPY is not set)
162 * master copy master node's copy of a lock owned by remote node
163 * (lkb_nodeid is non-zero and DLM_LKF_MSTCPY is set)
164 *
165 * lkb_exflags: a copy of the most recent flags arg provided to dlm_lock or
166 * dlm_unlock. The dlm does not modify these or use any private flags in
167 * this field; it only contains DLM_LKF_ flags from dlm.h. These flags
168 * are sent as-is to the remote master when the lock is remote.
169 *
170 * lkb_flags: internal dlm flags (DLM_IFL_ prefix) from dlm_internal.h.
171 * Some internal flags are shared between the master and process nodes;
172 * these shared flags are kept in the lower two bytes. One of these
173 * flags set on the master copy will be propagated to the process copy
174 * and v.v. Other internal flags are private to the master or process
175 * node (e.g. DLM_IFL_MSTCPY). These are kept in the high two bytes.
176 *
177 * lkb_sbflags: status block flags. These flags are copied directly into
178 * the caller's lksb.sb_flags prior to the dlm_lock/dlm_unlock completion
179 * ast. All defined in dlm.h with DLM_SBF_ prefix.
180 *
181 * lkb_status: the lock status indicates which rsb queue the lock is
182 * on, grant, convert, or wait. DLM_LKSTS_ WAITING/GRANTED/CONVERT
183 *
184 * lkb_wait_type: the dlm message type (DLM_MSG_ prefix) for which a
185 * reply is needed. Only set when the lkb is on the lockspace waiters
186 * list awaiting a reply from a remote node.
187 *
188 * lkb_nodeid: when the lkb is a local copy, nodeid is 0; when the lkb
189 * is a master copy, nodeid specifies the remote lock holder, when the
190 * lkb is a process copy, the nodeid specifies the lock master.
191 */
192
193/* lkb_ast_type */
194
195#define AST_COMP 1
196#define AST_BAST 2
197
198/* lkb_range[] */
199
200#define GR_RANGE_START 0
201#define GR_RANGE_END 1
202#define RQ_RANGE_START 2
203#define RQ_RANGE_END 3
204
205/* lkb_status */
206
207#define DLM_LKSTS_WAITING 1
208#define DLM_LKSTS_GRANTED 2
209#define DLM_LKSTS_CONVERT 3
210
211/* lkb_flags */
212
213#define DLM_IFL_MSTCPY 0x00010000
214#define DLM_IFL_RESEND 0x00020000
215#define DLM_IFL_RANGE 0x00000001
216
217struct dlm_lkb {
218 struct dlm_rsb *lkb_resource; /* the rsb */
219 struct kref lkb_ref;
220 int lkb_nodeid; /* copied from rsb */
221 int lkb_ownpid; /* pid of lock owner */
222 uint32_t lkb_id; /* our lock ID */
223 uint32_t lkb_remid; /* lock ID on remote partner */
224 uint32_t lkb_exflags; /* external flags from caller */
225 uint32_t lkb_sbflags; /* lksb flags */
226 uint32_t lkb_flags; /* internal flags */
227 uint32_t lkb_lvbseq; /* lvb sequence number */
228
229 int8_t lkb_status; /* granted, waiting, convert */
230 int8_t lkb_rqmode; /* requested lock mode */
231 int8_t lkb_grmode; /* granted lock mode */
232 int8_t lkb_bastmode; /* requested mode */
233 int8_t lkb_highbast; /* highest mode bast sent for */
234
235 int8_t lkb_wait_type; /* type of reply waiting for */
236 int8_t lkb_ast_type; /* type of ast queued for */
237
238 struct list_head lkb_idtbl_list; /* lockspace lkbtbl */
239 struct list_head lkb_statequeue; /* rsb g/c/w list */
240 struct list_head lkb_rsb_lookup; /* waiting for rsb lookup */
241 struct list_head lkb_wait_reply; /* waiting for remote reply */
242 struct list_head lkb_astqueue; /* need ast to be sent */
243
244 uint64_t *lkb_range; /* array of gr/rq ranges */
245 char *lkb_lvbptr;
246 struct dlm_lksb *lkb_lksb; /* caller's status block */
247 void *lkb_astaddr; /* caller's ast function */
248 void *lkb_bastaddr; /* caller's bast function */
249 long lkb_astparam; /* caller's ast arg */
250};
251
252
253struct dlm_rsb {
254 struct dlm_ls *res_ls; /* the lockspace */
255 struct kref res_ref;
256 struct mutex res_mutex;
257 unsigned long res_flags;
258 int res_length; /* length of rsb name */
259 int res_nodeid;
260 uint32_t res_lvbseq;
261 uint32_t res_hash;
262 uint32_t res_bucket; /* rsbtbl */
263 unsigned long res_toss_time;
264 uint32_t res_first_lkid;
265 struct list_head res_lookup; /* lkbs waiting on first */
266 struct list_head res_hashchain; /* rsbtbl */
267 struct list_head res_grantqueue;
268 struct list_head res_convertqueue;
269 struct list_head res_waitqueue;
270
271 struct list_head res_root_list; /* used for recovery */
272 struct list_head res_recover_list; /* used for recovery */
273 int res_recover_locks_count;
274
275 char *res_lvbptr;
276 char res_name[1];
277};
278
279/* find_rsb() flags */
280
281#define R_MASTER 1 /* only return rsb if it's a master */
282#define R_CREATE 2 /* create/add rsb if not found */
283
284/* rsb_flags */
285
286enum rsb_flags {
287 RSB_MASTER_UNCERTAIN,
288 RSB_VALNOTVALID,
289 RSB_VALNOTVALID_PREV,
290 RSB_NEW_MASTER,
291 RSB_NEW_MASTER2,
292 RSB_RECOVER_CONVERT,
293};
294
/* res_flags accessors.  These use the non-atomic __set_bit/__clear_bit,
   so callers are expected to serialize access to the rsb -- presumably
   via res_mutex; confirm at call sites. */

static inline void rsb_set_flag(struct dlm_rsb *r, enum rsb_flags flag)
{
	__set_bit(flag, &r->res_flags);
}

static inline void rsb_clear_flag(struct dlm_rsb *r, enum rsb_flags flag)
{
	__clear_bit(flag, &r->res_flags);
}

static inline int rsb_flag(struct dlm_rsb *r, enum rsb_flags flag)
{
	return test_bit(flag, &r->res_flags);
}
309
310
311/* dlm_header is first element of all structs sent between nodes */
312
313#define DLM_HEADER_MAJOR 0x00020000
314#define DLM_HEADER_MINOR 0x00000001
315
316#define DLM_MSG 1
317#define DLM_RCOM 2
318
319struct dlm_header {
320 uint32_t h_version;
321 uint32_t h_lockspace;
322 uint32_t h_nodeid; /* nodeid of sender */
323 uint16_t h_length;
324 uint8_t h_cmd; /* DLM_MSG, DLM_RCOM */
325 uint8_t h_pad;
326};
327
328
329#define DLM_MSG_REQUEST 1
330#define DLM_MSG_CONVERT 2
331#define DLM_MSG_UNLOCK 3
332#define DLM_MSG_CANCEL 4
333#define DLM_MSG_REQUEST_REPLY 5
334#define DLM_MSG_CONVERT_REPLY 6
335#define DLM_MSG_UNLOCK_REPLY 7
336#define DLM_MSG_CANCEL_REPLY 8
337#define DLM_MSG_GRANT 9
338#define DLM_MSG_BAST 10
339#define DLM_MSG_LOOKUP 11
340#define DLM_MSG_REMOVE 12
341#define DLM_MSG_LOOKUP_REPLY 13
342
343struct dlm_message {
344 struct dlm_header m_header;
345 uint32_t m_type; /* DLM_MSG_ */
346 uint32_t m_nodeid;
347 uint32_t m_pid;
348 uint32_t m_lkid; /* lkid on sender */
349 uint32_t m_remid; /* lkid on receiver */
350 uint32_t m_parent_lkid;
351 uint32_t m_parent_remid;
352 uint32_t m_exflags;
353 uint32_t m_sbflags;
354 uint32_t m_flags;
355 uint32_t m_lvbseq;
356 uint32_t m_hash;
357 int m_status;
358 int m_grmode;
359 int m_rqmode;
360 int m_bastmode;
361 int m_asts;
362 int m_result; /* 0 or -EXXX */
363 uint64_t m_range[2];
364 char m_extra[0]; /* name or lvb */
365};
366
367
368#define DLM_RS_NODES 0x00000001
369#define DLM_RS_NODES_ALL 0x00000002
370#define DLM_RS_DIR 0x00000004
371#define DLM_RS_DIR_ALL 0x00000008
372#define DLM_RS_LOCKS 0x00000010
373#define DLM_RS_LOCKS_ALL 0x00000020
374#define DLM_RS_DONE 0x00000040
375#define DLM_RS_DONE_ALL 0x00000080
376
377#define DLM_RCOM_STATUS 1
378#define DLM_RCOM_NAMES 2
379#define DLM_RCOM_LOOKUP 3
380#define DLM_RCOM_LOCK 4
381#define DLM_RCOM_STATUS_REPLY 5
382#define DLM_RCOM_NAMES_REPLY 6
383#define DLM_RCOM_LOOKUP_REPLY 7
384#define DLM_RCOM_LOCK_REPLY 8
385
386struct dlm_rcom {
387 struct dlm_header rc_header;
388 uint32_t rc_type; /* DLM_RCOM_ */
389 int rc_result; /* multi-purpose */
390 uint64_t rc_id; /* match reply with request */
391 char rc_buf[0];
392};
393
394struct rcom_config {
395 uint32_t rf_lvblen;
396 uint32_t rf_lsflags;
397 uint64_t rf_unused;
398};
399
400struct rcom_lock {
401 uint32_t rl_ownpid;
402 uint32_t rl_lkid;
403 uint32_t rl_remid;
404 uint32_t rl_parent_lkid;
405 uint32_t rl_parent_remid;
406 uint32_t rl_exflags;
407 uint32_t rl_flags;
408 uint32_t rl_lvbseq;
409 int rl_result;
410 int8_t rl_rqmode;
411 int8_t rl_grmode;
412 int8_t rl_status;
413 int8_t rl_asts;
414 uint16_t rl_wait_type;
415 uint16_t rl_namelen;
416 uint64_t rl_range[4];
417 char rl_name[DLM_RESNAME_MAXLEN];
418 char rl_lvb[0];
419};
420
421struct dlm_ls {
422 struct list_head ls_list; /* list of lockspaces */
423 uint32_t ls_global_id; /* global unique lockspace ID */
424 uint32_t ls_exflags;
425 int ls_lvblen;
426 int ls_count; /* reference count */
427 unsigned long ls_flags; /* LSFL_ */
428 struct kobject ls_kobj;
429
430 struct dlm_rsbtable *ls_rsbtbl;
431 uint32_t ls_rsbtbl_size;
432
433 struct dlm_lkbtable *ls_lkbtbl;
434 uint32_t ls_lkbtbl_size;
435
436 struct dlm_dirtable *ls_dirtbl;
437 uint32_t ls_dirtbl_size;
438
439 struct mutex ls_waiters_mutex;
440 struct list_head ls_waiters; /* lkbs needing a reply */
441
442 struct list_head ls_nodes; /* current nodes in ls */
443 struct list_head ls_nodes_gone; /* dead node list, recovery */
444 int ls_num_nodes; /* number of nodes in ls */
445 int ls_low_nodeid;
446 int ls_total_weight;
447 int *ls_node_array;
448
449 struct dlm_rsb ls_stub_rsb; /* for returning errors */
450 struct dlm_lkb ls_stub_lkb; /* for returning errors */
451 struct dlm_message ls_stub_ms; /* for faking a reply */
452
453 struct dentry *ls_debug_dentry; /* debugfs */
454
455 wait_queue_head_t ls_uevent_wait; /* user part of join/leave */
456 int ls_uevent_result;
457
458 /* recovery related */
459
460 struct timer_list ls_timer;
461 struct task_struct *ls_recoverd_task;
462 struct mutex ls_recoverd_active;
463 spinlock_t ls_recover_lock;
464 uint32_t ls_recover_status; /* DLM_RS_ */
465 uint64_t ls_recover_seq;
466 struct dlm_recover *ls_recover_args;
467 struct rw_semaphore ls_in_recovery; /* block local requests */
468 struct list_head ls_requestqueue;/* queue remote requests */
469 struct mutex ls_requestqueue_mutex;
470 char *ls_recover_buf;
471 struct list_head ls_recover_list;
472 spinlock_t ls_recover_list_lock;
473 int ls_recover_list_count;
474 wait_queue_head_t ls_wait_general;
475
476 struct list_head ls_root_list; /* root resources */
477 struct rw_semaphore ls_root_sem; /* protect root_list */
478
479 int ls_namelen;
480 char ls_name[1];
481};
482
483#define LSFL_WORK 0
484#define LSFL_RUNNING 1
485#define LSFL_RECOVERY_STOP 2
486#define LSFL_RCOM_READY 3
487#define LSFL_UEVENT_WAIT 4
488
/* True while the lockspace is not processing normal lock requests
   (LSFL_RUNNING clear, e.g. during recovery). */
static inline int dlm_locking_stopped(struct dlm_ls *ls)
{
	return !test_bit(LSFL_RUNNING, &ls->ls_flags);
}

/* True when the current recovery pass has been told to abort
   (LSFL_RECOVERY_STOP set); polled by recovery loops such as
   dlm_recover_directory(). */
static inline int dlm_recovery_stopped(struct dlm_ls *ls)
{
	return test_bit(LSFL_RECOVERY_STOP, &ls->ls_flags);
}

/* True for lockspaces created with DLM_LSFL_NODIR: no resource
   directory is kept, and the directory phase of recovery is skipped
   (see dlm_recover_directory). */
static inline int dlm_no_directory(struct dlm_ls *ls)
{
	return (ls->ls_exflags & DLM_LSFL_NODIR) ? 1 : 0;
}
503
504#endif /* __DLM_INTERNAL_DOT_H__ */
505
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
new file mode 100644
index 000000000000..29d3b95dbb63
--- /dev/null
+++ b/fs/dlm/lock.c
@@ -0,0 +1,3610 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13/* Central locking logic has four stages:
14
15 dlm_lock()
16 dlm_unlock()
17
18 request_lock(ls, lkb)
19 convert_lock(ls, lkb)
20 unlock_lock(ls, lkb)
21 cancel_lock(ls, lkb)
22
23 _request_lock(r, lkb)
24 _convert_lock(r, lkb)
25 _unlock_lock(r, lkb)
26 _cancel_lock(r, lkb)
27
28 do_request(r, lkb)
29 do_convert(r, lkb)
30 do_unlock(r, lkb)
31 do_cancel(r, lkb)
32
33 Stage 1 (lock, unlock) is mainly about checking input args and
34 splitting into one of the four main operations:
35
36 dlm_lock = request_lock
37 dlm_lock+CONVERT = convert_lock
38 dlm_unlock = unlock_lock
39 dlm_unlock+CANCEL = cancel_lock
40
41 Stage 2, xxxx_lock(), just finds and locks the relevant rsb which is
42 provided to the next stage.
43
44 Stage 3, _xxxx_lock(), determines if the operation is local or remote.
45 When remote, it calls send_xxxx(), when local it calls do_xxxx().
46
47 Stage 4, do_xxxx(), is the guts of the operation. It manipulates the
48 given rsb and lkb and queues callbacks.
49
50 For remote operations, send_xxxx() results in the corresponding do_xxxx()
51 function being executed on the remote node. The connecting send/receive
52 calls on local (L) and remote (R) nodes:
53
54 L: send_xxxx() -> R: receive_xxxx()
55 R: do_xxxx()
56 L: receive_xxxx_reply() <- R: send_xxxx_reply()
57*/
58
59#include "dlm_internal.h"
60#include "memory.h"
61#include "lowcomms.h"
62#include "requestqueue.h"
63#include "util.h"
64#include "dir.h"
65#include "member.h"
66#include "lockspace.h"
67#include "ast.h"
68#include "lock.h"
69#include "rcom.h"
70#include "recover.h"
71#include "lvb_table.h"
72#include "config.h"
73
74static int send_request(struct dlm_rsb *r, struct dlm_lkb *lkb);
75static int send_convert(struct dlm_rsb *r, struct dlm_lkb *lkb);
76static int send_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb);
77static int send_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb);
78static int send_grant(struct dlm_rsb *r, struct dlm_lkb *lkb);
79static int send_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int mode);
80static int send_lookup(struct dlm_rsb *r, struct dlm_lkb *lkb);
81static int send_remove(struct dlm_rsb *r);
82static int _request_lock(struct dlm_rsb *r, struct dlm_lkb *lkb);
83static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
84 struct dlm_message *ms);
85static int receive_extralen(struct dlm_message *ms);
86
87/*
 88 * Lock compatibility matrix - thanks Steve
89 * UN = Unlocked state. Not really a state, used as a flag
90 * PD = Padding. Used to make the matrix a nice power of two in size
91 * Other states are the same as the VMS DLM.
92 * Usage: matrix[grmode+1][rqmode+1] (although m[rq+1][gr+1] is the same)
93 */
94
/* 1 = modes are compatible, 0 = modes conflict; indexed [grmode+1][rqmode+1] */
static const int __dlm_compat_matrix[8][8] = {
      /* UN NL  CR  CW  PR  PW  EX  PD */
        {1, 1, 1, 1, 1, 1, 1, 0},       /* UN */
        {1, 1, 1, 1, 1, 1, 1, 0},       /* NL */
        {1, 1, 1, 1, 1, 1, 0, 0},       /* CR */
        {1, 1, 1, 1, 0, 0, 0, 0},       /* CW */
        {1, 1, 1, 0, 1, 0, 0, 0},       /* PR */
        {1, 1, 1, 0, 0, 0, 0, 0},       /* PW */
        {1, 1, 0, 0, 0, 0, 0, 0},       /* EX */
        {0, 0, 0, 0, 0, 0, 0, 0}        /* PD */
};
106
107/*
108 * This defines the direction of transfer of LVB data.
109 * Granted mode is the row; requested mode is the column.
110 * Usage: matrix[grmode+1][rqmode+1]
111 * 1 = LVB is returned to the caller
112 * 0 = LVB is written to the resource
113 * -1 = nothing happens to the LVB
114 */
115
/* LVB transfer direction, indexed [grmode+1][rqmode+1]:
   1 = LVB returned to caller, 0 = LVB written to resource,
   -1 = LVB untouched (see set_lvb_lock/set_lvb_lock_pc). */
const int dlm_lvb_operations[8][8] = {
        /* UN   NL  CR  CW  PR  PW  EX  PD*/
        {  -1,  1,  1,  1,  1,  1,  1, -1 }, /* UN */
        {  -1,  1,  1,  1,  1,  1,  1,  0 }, /* NL */
        {  -1, -1,  1,  1,  1,  1,  1,  0 }, /* CR */
        {  -1, -1, -1,  1,  1,  1,  1,  0 }, /* CW */
        {  -1, -1, -1, -1,  1,  1,  1,  0 }, /* PR */
        {  -1,  0,  0,  0,  0,  0,  1,  0 }, /* PW */
        {  -1,  0,  0,  0,  0,  0,  0,  0 }, /* EX */
        {  -1,  0,  0,  0,  0,  0,  0,  0 }  /* PD */
};
EXPORT_SYMBOL_GPL(dlm_lvb_operations);
128
/* Nonzero if gr's granted mode is compatible with rq's requested mode */
#define modes_compat(gr, rq) \
	__dlm_compat_matrix[(gr)->lkb_grmode + 1][(rq)->lkb_rqmode + 1]
131
/* Compatibility lookup for two raw mode values; the +1 offset maps
   mode value -1 (unlocked/invalid) onto row/column 0 of the matrix. */
int dlm_modes_compat(int mode1, int mode2)
{
	return __dlm_compat_matrix[mode1 + 1][mode2 + 1];
}
136
137/*
138 * Compatibility matrix for conversions with QUECVT set.
139 * Granted mode is the row; requested mode is the column.
140 * Usage: matrix[grmode+1][rqmode+1]
141 */
142
/* Conversions that may still be granted immediately when QUECVT is set;
   indexed [grmode+1][rqmode+1], 1 = grantable. */
static const int __quecvt_compat_matrix[8][8] = {
      /* UN NL  CR  CW  PR  PW  EX  PD */
        {0, 0, 0, 0, 0, 0, 0, 0},       /* UN */
        {0, 0, 1, 1, 1, 1, 1, 0},       /* NL */
        {0, 0, 0, 1, 1, 1, 1, 0},       /* CR */
        {0, 0, 0, 0, 1, 1, 1, 0},       /* CW */
        {0, 0, 0, 1, 0, 1, 1, 0},       /* PR */
        {0, 0, 0, 0, 0, 0, 1, 0},       /* PW */
        {0, 0, 0, 0, 0, 0, 0, 0},       /* EX */
        {0, 0, 0, 0, 0, 0, 0, 0}        /* PD */
};
154
/* Dump an lkb's identifying fields to the kernel log (debug aid,
   used by the DLM_ASSERT call sites). */
static void dlm_print_lkb(struct dlm_lkb *lkb)
{
	printk(KERN_ERR "lkb: nodeid %d id %x remid %x exflags %x flags %x\n"
	       "     status %d rqmode %d grmode %d wait_type %d ast_type %d\n",
	       lkb->lkb_nodeid, lkb->lkb_id, lkb->lkb_remid, lkb->lkb_exflags,
	       lkb->lkb_flags, lkb->lkb_status, lkb->lkb_rqmode,
	       lkb->lkb_grmode, lkb->lkb_wait_type, lkb->lkb_ast_type);
}
163
/* Dump an rsb's identifying fields to the kernel log (debug aid).
   NOTE(review): res_name is printed with %s but is length-delimited
   by res_length -- verify it is always NUL-terminated. */
void dlm_print_rsb(struct dlm_rsb *r)
{
	printk(KERN_ERR "rsb: nodeid %d flags %lx first %x rlc %d name %s\n",
	       r->res_nodeid, r->res_flags, r->res_first_lkid,
	       r->res_recover_locks_count, r->res_name);
}
170
171/* Threads cannot use the lockspace while it's being recovered */
172
/* Take the read side of ls_in_recovery; blocks while recovery holds
   the write side (presumably recoverd -- see ls_in_recovery comment
   in dlm_internal.h: "block local requests"). */
static inline void lock_recovery(struct dlm_ls *ls)
{
	down_read(&ls->ls_in_recovery);
}
177
/* Release the read side taken by lock_recovery() */
static inline void unlock_recovery(struct dlm_ls *ls)
{
	up_read(&ls->ls_in_recovery);
}
182
/* Non-blocking variant of lock_recovery(); returns nonzero on success */
static inline int lock_recovery_try(struct dlm_ls *ls)
{
	return down_read_trylock(&ls->ls_in_recovery);
}
187
188static inline int can_be_queued(struct dlm_lkb *lkb)
189{
190 return !(lkb->lkb_exflags & DLM_LKF_NOQUEUE);
191}
192
/* Nonzero (raw flag bit) if blocking asts should be sent even though
   the request itself won't queue (DLM_LKF_NOQUEUEBAST) */
static inline int force_blocking_asts(struct dlm_lkb *lkb)
{
	return (lkb->lkb_exflags & DLM_LKF_NOQUEUEBAST);
}
197
/* Nonzero (raw flag bit) if the lock was demoted to resolve a
   conversion deadlock (DLM_SBF_DEMOTED in the status block flags) */
static inline int is_demoted(struct dlm_lkb *lkb)
{
	return (lkb->lkb_sbflags & DLM_SBF_DEMOTED);
}
202
/* Nonzero if the rsb is mastered on another node (res_nodeid > 0);
   a negative res_nodeid means "master unknown" and is invalid here. */
static inline int is_remote(struct dlm_rsb *r)
{
	DLM_ASSERT(r->res_nodeid >= 0, dlm_print_rsb(r););
	return !!r->res_nodeid;
}
208
/* A process copy lives on the requesting node while the master is
   remote: remote nodeid set, but not flagged as a master copy. */
static inline int is_process_copy(struct dlm_lkb *lkb)
{
	return (lkb->lkb_nodeid && !(lkb->lkb_flags & DLM_IFL_MSTCPY));
}
213
/* A master copy is the master node's lkb for a lock owned by a remote
   process; such an lkb must always carry the owner's nodeid. */
static inline int is_master_copy(struct dlm_lkb *lkb)
{
	if (lkb->lkb_flags & DLM_IFL_MSTCPY)
		DLM_ASSERT(lkb->lkb_nodeid, dlm_print_lkb(lkb););
	return (lkb->lkb_flags & DLM_IFL_MSTCPY) ? 1 : 0;
}
220
221static inline int middle_conversion(struct dlm_lkb *lkb)
222{
223 if ((lkb->lkb_grmode==DLM_LOCK_PR && lkb->lkb_rqmode==DLM_LOCK_CW) ||
224 (lkb->lkb_rqmode==DLM_LOCK_PR && lkb->lkb_grmode==DLM_LOCK_CW))
225 return 1;
226 return 0;
227}
228
/* True for a conversion to a strictly weaker mode (excluding the
   special PR<->CW middle case) */
static inline int down_conversion(struct dlm_lkb *lkb)
{
	return (!middle_conversion(lkb) && lkb->lkb_rqmode < lkb->lkb_grmode);
}
233
/* Deliver a completion ast with status rv to the lock owner.  For a
   master copy the owner is remote, so nothing is queued here (the
   reply path notifies the owning node instead). */
static void queue_cast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
{
	if (is_master_copy(lkb))
		return;

	DLM_ASSERT(lkb->lkb_lksb, dlm_print_lkb(lkb););

	/* publish status and flags into the caller's lksb before the ast */
	lkb->lkb_lksb->sb_status = rv;
	lkb->lkb_lksb->sb_flags = lkb->lkb_sbflags;

	dlm_add_ast(lkb, AST_COMP);
}
246
247static void queue_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rqmode)
248{
249 if (is_master_copy(lkb))
250 send_bast(r, lkb, rqmode);
251 else {
252 lkb->lkb_bastmode = rqmode;
253 dlm_add_ast(lkb, AST_BAST);
254 }
255}
256
257/*
258 * Basic operations on rsb's and lkb's
259 */
260
/* Allocate and minimally initialize a new rsb for the given resource
   name.  The caller fills in hash/bucket/nodeid and takes the initial
   reference.  Returns NULL on allocation failure. */
static struct dlm_rsb *create_rsb(struct dlm_ls *ls, char *name, int len)
{
	struct dlm_rsb *r;

	r = allocate_rsb(ls, len);
	if (!r)
		return NULL;

	r->res_ls = ls;
	r->res_length = len;
	memcpy(r->res_name, name, len);
	mutex_init(&r->res_mutex);

	INIT_LIST_HEAD(&r->res_lookup);
	INIT_LIST_HEAD(&r->res_grantqueue);
	INIT_LIST_HEAD(&r->res_convertqueue);
	INIT_LIST_HEAD(&r->res_waitqueue);
	INIT_LIST_HEAD(&r->res_root_list);
	INIT_LIST_HEAD(&r->res_recover_list);

	return r;
}
283
284static int search_rsb_list(struct list_head *head, char *name, int len,
285 unsigned int flags, struct dlm_rsb **r_ret)
286{
287 struct dlm_rsb *r;
288 int error = 0;
289
290 list_for_each_entry(r, head, res_hashchain) {
291 if (len == r->res_length && !memcmp(name, r->res_name, len))
292 goto found;
293 }
294 return -ENOENT;
295
296 found:
297 if (r->res_nodeid && (flags & R_MASTER))
298 error = -ENOTBLK;
299 *r_ret = r;
300 return error;
301}
302
/* Search bucket b for an rsb, first on the active list (takes a new
   ref), then on the toss list (resurrects it onto the active list with
   its ref reinitialized by toss_rsb).  Caller holds the bucket lock
   for write, which the toss->active list_move requires. */
static int _search_rsb(struct dlm_ls *ls, char *name, int len, int b,
		       unsigned int flags, struct dlm_rsb **r_ret)
{
	struct dlm_rsb *r;
	int error;

	error = search_rsb_list(&ls->ls_rsbtbl[b].list, name, len, flags, &r);
	if (!error) {
		kref_get(&r->res_ref);
		goto out;
	}
	error = search_rsb_list(&ls->ls_rsbtbl[b].toss, name, len, flags, &r);
	if (error)
		goto out;

	list_move(&r->res_hashchain, &ls->ls_rsbtbl[b].list);

	if (dlm_no_directory(ls))
		goto out;

	/* A resurrected rsb's cached master may be stale:
	   nodeid == -1: master was never known, nothing cached;
	   nodeid > 0:   remote master cached, mark it uncertain so it
	                 is re-verified before use;
	   nodeid == 0:  we are the master, value stays valid. */
	if (r->res_nodeid == -1) {
		rsb_clear_flag(r, RSB_MASTER_UNCERTAIN);
		r->res_first_lkid = 0;
	} else if (r->res_nodeid > 0) {
		rsb_set_flag(r, RSB_MASTER_UNCERTAIN);
		r->res_first_lkid = 0;
	} else {
		DLM_ASSERT(r->res_nodeid == 0, dlm_print_rsb(r););
		DLM_ASSERT(!rsb_flag(r, RSB_MASTER_UNCERTAIN),);
	}
 out:
	*r_ret = r;
	return error;
}
337
/* Locked wrapper around _search_rsb; a write lock is needed even for
   the lookup because a toss-list hit moves the rsb between lists. */
static int search_rsb(struct dlm_ls *ls, char *name, int len, int b,
		      unsigned int flags, struct dlm_rsb **r_ret)
{
	int error;
	write_lock(&ls->ls_rsbtbl[b].lock);
	error = _search_rsb(ls, name, len, b, flags, r_ret);
	write_unlock(&ls->ls_rsbtbl[b].lock);
	return error;
}
347
348/*
349 * Find rsb in rsbtbl and potentially create/add one
350 *
351 * Delaying the release of rsb's has a similar benefit to applications keeping
352 * NL locks on an rsb, but without the guarantee that the cached master value
353 * will still be valid when the rsb is reused. Apps aren't always smart enough
354 * to keep NL locks on an rsb that they may lock again shortly; this can lead
355 * to excessive master lookups and removals if we don't delay the release.
356 *
357 * Searching for an rsb means looking through both the normal list and toss
358 * list. When found on the toss list the rsb is moved to the normal list with
359 * ref count of 1; when found on normal list the ref count is incremented.
360 */
361
/* Look up an rsb by name, optionally creating it (R_CREATE).
   Returns 0 with a referenced rsb in *r_ret, -ENOENT if absent and
   creation wasn't requested, -ENOTBLK if found but not locally
   mastered when R_MASTER was required, or -ENOMEM. */
static int find_rsb(struct dlm_ls *ls, char *name, int namelen,
		    unsigned int flags, struct dlm_rsb **r_ret)
{
	struct dlm_rsb *r, *tmp;
	uint32_t hash, bucket;
	int error = 0;

	/* without a directory an rsb must always exist locally */
	if (dlm_no_directory(ls))
		flags |= R_CREATE;

	hash = jhash(name, namelen, 0);
	bucket = hash & (ls->ls_rsbtbl_size - 1);

	error = search_rsb(ls, name, namelen, bucket, flags, &r);
	if (!error)
		goto out;

	if (error == -ENOENT && !(flags & R_CREATE))
		goto out;

	/* the rsb was found but wasn't a master copy */
	if (error == -ENOTBLK)
		goto out;

	error = -ENOMEM;
	r = create_rsb(ls, name, namelen);
	if (!r)
		goto out;

	r->res_hash = hash;
	r->res_bucket = bucket;
	r->res_nodeid = -1;		/* master not yet known */
	kref_init(&r->res_ref);

	/* With no directory, the master can be set immediately */
	if (dlm_no_directory(ls)) {
		int nodeid = dlm_dir_nodeid(r);
		if (nodeid == dlm_our_nodeid())
			nodeid = 0;	/* 0 means "we are the master" */
		r->res_nodeid = nodeid;
	}

	/* re-check under the lock: another thread may have inserted the
	   same rsb while we were allocating; if so use theirs */
	write_lock(&ls->ls_rsbtbl[bucket].lock);
	error = _search_rsb(ls, name, namelen, bucket, 0, &tmp);
	if (!error) {
		write_unlock(&ls->ls_rsbtbl[bucket].lock);
		free_rsb(r);
		r = tmp;
		goto out;
	}
	list_add(&r->res_hashchain, &ls->ls_rsbtbl[bucket].list);
	write_unlock(&ls->ls_rsbtbl[bucket].lock);
	error = 0;
 out:
	*r_ret = r;
	return error;
}
419
/* Public wrapper for find_rsb() used by other dlm modules */
int dlm_find_rsb(struct dlm_ls *ls, char *name, int namelen,
		 unsigned int flags, struct dlm_rsb **r_ret)
{
	return find_rsb(ls, name, namelen, flags, r_ret);
}
425
426/* This is only called to add a reference when the code already holds
427 a valid reference to the rsb, so there's no need for locking. */
428
/* Add a reference; caller must already hold a valid ref (no locking) */
static inline void hold_rsb(struct dlm_rsb *r)
{
	kref_get(&r->res_ref);
}
433
/* Public wrapper for hold_rsb() */
void dlm_hold_rsb(struct dlm_rsb *r)
{
	hold_rsb(r);
}
438
/* kref release callback: move a now-unreferenced rsb onto the bucket's
   toss list for delayed disposal (see shrink_bucket).  The refcount is
   reinitialized to 1 so a later toss-list hit can reuse the rsb.
   Caller (kref_put site) holds the bucket lock for write. */
static void toss_rsb(struct kref *kref)
{
	struct dlm_rsb *r = container_of(kref, struct dlm_rsb, res_ref);
	struct dlm_ls *ls = r->res_ls;

	DLM_ASSERT(list_empty(&r->res_root_list), dlm_print_rsb(r););
	kref_init(&r->res_ref);
	list_move(&r->res_hashchain, &ls->ls_rsbtbl[r->res_bucket].toss);
	r->res_toss_time = jiffies;
	/* the lvb isn't needed while tossed; free it now */
	if (r->res_lvbptr) {
		free_lvb(r->res_lvbptr);
		r->res_lvbptr = NULL;
	}
}
453
454/* When all references to the rsb are gone it's transferred to
455 the tossed list for later disposal. */
456
/* Drop a reference; the last ref moves the rsb to the toss list via
   toss_rsb() under the bucket write lock. */
static void put_rsb(struct dlm_rsb *r)
{
	struct dlm_ls *ls = r->res_ls;
	uint32_t bucket = r->res_bucket;

	write_lock(&ls->ls_rsbtbl[bucket].lock);
	kref_put(&r->res_ref, toss_rsb);
	write_unlock(&ls->ls_rsbtbl[bucket].lock);
}
466
/* Public wrapper for put_rsb() */
void dlm_put_rsb(struct dlm_rsb *r)
{
	put_rsb(r);
}
471
472/* See comment for unhold_lkb */
473
/* Drop a reference that is known not to be the last one, so no
   locking is needed; asserts if it unexpectedly was the last. */
static void unhold_rsb(struct dlm_rsb *r)
{
	int rv;
	rv = kref_put(&r->res_ref, toss_rsb);
	DLM_ASSERT(!rv, dlm_print_rsb(r););
}
480
/* kref release callback used when finally destroying a tossed rsb:
   deliberately does nothing but sanity-check that all queues are
   empty -- the actual list removal and free happen after kref_put()
   returns so the bucket write lock can be dropped first. */
static void kill_rsb(struct kref *kref)
{
	struct dlm_rsb *r = container_of(kref, struct dlm_rsb, res_ref);

	DLM_ASSERT(list_empty(&r->res_lookup),);
	DLM_ASSERT(list_empty(&r->res_grantqueue),);
	DLM_ASSERT(list_empty(&r->res_convertqueue),);
	DLM_ASSERT(list_empty(&r->res_waitqueue),);
	DLM_ASSERT(list_empty(&r->res_root_list),);
	DLM_ASSERT(list_empty(&r->res_recover_list),);
}
495
496/* Attaching/detaching lkb's from rsb's is for rsb reference counting.
497 The rsb must exist as long as any lkb's for it do. */
498
/* Bind an lkb to its rsb, taking an rsb ref so the rsb outlives the
   lkb (inverse of detach_lkb) */
static void attach_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
	hold_rsb(r);
	lkb->lkb_resource = r;
}
504
505static void detach_lkb(struct dlm_lkb *lkb)
506{
507 if (lkb->lkb_resource) {
508 put_rsb(lkb->lkb_resource);
509 lkb->lkb_resource = NULL;
510 }
511}
512
/* Allocate a new lkb and assign it a unique lock id.  The id encodes
   a randomly chosen hash bucket in its low 16 bits and a per-bucket
   counter in the high 16; the counter can wrap, so the id is checked
   against existing lkbs before use.  Returns 0 or -ENOMEM. */
static int create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret)
{
	struct dlm_lkb *lkb, *tmp;
	uint32_t lkid = 0;
	uint16_t bucket;

	lkb = allocate_lkb(ls);
	if (!lkb)
		return -ENOMEM;

	lkb->lkb_nodeid = -1;		/* master not yet known */
	lkb->lkb_grmode = DLM_LOCK_IV;	/* nothing granted yet */
	kref_init(&lkb->lkb_ref);

	get_random_bytes(&bucket, sizeof(bucket));
	bucket &= (ls->ls_lkbtbl_size - 1);

	write_lock(&ls->ls_lkbtbl[bucket].lock);

	/* counter can roll over so we must verify lkid is not in use */

	while (lkid == 0) {
		lkid = bucket | (ls->ls_lkbtbl[bucket].counter++ << 16);

		list_for_each_entry(tmp, &ls->ls_lkbtbl[bucket].list,
				    lkb_idtbl_list) {
			if (tmp->lkb_id != lkid)
				continue;
			lkid = 0;	/* collision: try the next counter value */
			break;
		}
	}

	lkb->lkb_id = lkid;
	list_add(&lkb->lkb_idtbl_list, &ls->ls_lkbtbl[bucket].list);
	write_unlock(&ls->ls_lkbtbl[bucket].lock);

	*lkb_ret = lkb;
	return 0;
}
553
554static struct dlm_lkb *__find_lkb(struct dlm_ls *ls, uint32_t lkid)
555{
556 uint16_t bucket = lkid & 0xFFFF;
557 struct dlm_lkb *lkb;
558
559 list_for_each_entry(lkb, &ls->ls_lkbtbl[bucket].list, lkb_idtbl_list) {
560 if (lkb->lkb_id == lkid)
561 return lkb;
562 }
563 return NULL;
564}
565
/* Look up an lkb by lock id and take a reference on it.
   Returns 0, -EBADSLT for an out-of-range bucket, or -ENOENT. */
static int find_lkb(struct dlm_ls *ls, uint32_t lkid, struct dlm_lkb **lkb_ret)
{
	struct dlm_lkb *lkb;
	uint16_t bucket = lkid & 0xFFFF;

	if (bucket >= ls->ls_lkbtbl_size)
		return -EBADSLT;

	read_lock(&ls->ls_lkbtbl[bucket].lock);
	lkb = __find_lkb(ls, lkid);
	if (lkb)
		kref_get(&lkb->lkb_ref);
	read_unlock(&ls->ls_lkbtbl[bucket].lock);

	*lkb_ret = lkb;
	return lkb ? 0 : -ENOENT;
}
583
/* kref release callback for the last lkb ref: deliberately only
   sanity-checks -- the list removal, detach and free happen after
   kref_put() returns so the bucket write lock can be dropped first
   (see put_lkb). */
static void kill_lkb(struct kref *kref)
{
	struct dlm_lkb *lkb = container_of(kref, struct dlm_lkb, lkb_ref);

	DLM_ASSERT(!lkb->lkb_status, dlm_print_lkb(lkb););
}
593
/* Drop an lkb reference.  On the last ref: remove it from the id
   table, detach it from its rsb, free owned memory and the lkb
   itself.  Returns 1 if the lkb was freed, 0 otherwise. */
static int put_lkb(struct dlm_lkb *lkb)
{
	struct dlm_ls *ls = lkb->lkb_resource->res_ls;
	uint16_t bucket = lkb->lkb_id & 0xFFFF;

	write_lock(&ls->ls_lkbtbl[bucket].lock);
	if (kref_put(&lkb->lkb_ref, kill_lkb)) {
		list_del(&lkb->lkb_idtbl_list);
		write_unlock(&ls->ls_lkbtbl[bucket].lock);

		detach_lkb(lkb);

		/* for local/process lkbs, lvbptr points to caller's lksb
		   and must not be freed here */
		if (lkb->lkb_lvbptr && is_master_copy(lkb))
			free_lvb(lkb->lkb_lvbptr);
		if (lkb->lkb_range)
			free_range(lkb->lkb_range);
		free_lkb(lkb);
		return 1;
	} else {
		write_unlock(&ls->ls_lkbtbl[bucket].lock);
		return 0;
	}
}
618
/* Public wrapper for put_lkb() */
int dlm_put_lkb(struct dlm_lkb *lkb)
{
	return put_lkb(lkb);
}
623
624/* This is only called to add a reference when the code already holds
625 a valid reference to the lkb, so there's no need for locking. */
626
/* Add a reference; caller must already hold a valid ref (no locking) */
static inline void hold_lkb(struct dlm_lkb *lkb)
{
	kref_get(&lkb->lkb_ref);
}
631
632/* This is called when we need to remove a reference and are certain
633 it's not the last ref. e.g. del_lkb is always called between a
634 find_lkb/put_lkb and is always the inverse of a previous add_lkb.
635 put_lkb would work fine, but would involve unnecessary locking */
636
/* Drop a reference known not to be the last (so no table locking is
   needed); asserts if it unexpectedly was the last. */
static inline void unhold_lkb(struct dlm_lkb *lkb)
{
	int rv;
	rv = kref_put(&lkb->lkb_ref, kill_lkb);
	DLM_ASSERT(!rv, dlm_print_lkb(lkb););
}
643
644static void lkb_add_ordered(struct list_head *new, struct list_head *head,
645 int mode)
646{
647 struct dlm_lkb *lkb = NULL;
648
649 list_for_each_entry(lkb, head, lkb_statequeue)
650 if (lkb->lkb_rqmode < mode)
651 break;
652
653 if (!lkb)
654 list_add_tail(new, head);
655 else
656 __list_add(new, lkb->lkb_statequeue.prev, &lkb->lkb_statequeue);
657}
658
659/* add/remove lkb to rsb's grant/convert/wait queue */
660
/* Put an lkb (with a new queue ref) on one of the rsb's three state
   queues and record its status.  HEADQUE callers jump the wait and
   convert queues; the grant queue is kept ordered by grmode. */
static void add_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb, int status)
{
	kref_get(&lkb->lkb_ref);

	/* must not already be on a queue */
	DLM_ASSERT(!lkb->lkb_status, dlm_print_lkb(lkb););

	lkb->lkb_status = status;

	switch (status) {
	case DLM_LKSTS_WAITING:
		if (lkb->lkb_exflags & DLM_LKF_HEADQUE)
			list_add(&lkb->lkb_statequeue, &r->res_waitqueue);
		else
			list_add_tail(&lkb->lkb_statequeue, &r->res_waitqueue);
		break;
	case DLM_LKSTS_GRANTED:
		/* convention says granted locks kept in order of grmode */
		lkb_add_ordered(&lkb->lkb_statequeue, &r->res_grantqueue,
				lkb->lkb_grmode);
		break;
	case DLM_LKSTS_CONVERT:
		if (lkb->lkb_exflags & DLM_LKF_HEADQUE)
			list_add(&lkb->lkb_statequeue, &r->res_convertqueue);
		else
			list_add_tail(&lkb->lkb_statequeue,
				      &r->res_convertqueue);
		break;
	default:
		DLM_ASSERT(0, dlm_print_lkb(lkb); printk("sts=%d\n", status););
	}
}
692
/* Remove an lkb from its current state queue and drop the queue ref
   (never the last ref; see unhold_lkb comment above). */
static void del_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
	lkb->lkb_status = 0;
	list_del(&lkb->lkb_statequeue);
	unhold_lkb(lkb);
}
699
/* Move an lkb between state queues; the temporary hold keeps the
   refcount from hitting zero between del_lkb and add_lkb. */
static void move_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb, int sts)
{
	hold_lkb(lkb);
	del_lkb(r, lkb);
	add_lkb(r, lkb, sts);
	unhold_lkb(lkb);
}
707
708/* add/remove lkb from global waiters list of lkb's waiting for
709 a reply from a remote node */
710
/* Register the lkb (with a new ref) on the lockspace's waiters list
   as awaiting a remote reply of type mstype; an lkb may only wait for
   one reply at a time, so an existing wait_type is just logged. */
static void add_to_waiters(struct dlm_lkb *lkb, int mstype)
{
	struct dlm_ls *ls = lkb->lkb_resource->res_ls;

	mutex_lock(&ls->ls_waiters_mutex);
	if (lkb->lkb_wait_type) {
		log_print("add_to_waiters error %d", lkb->lkb_wait_type);
		goto out;
	}
	lkb->lkb_wait_type = mstype;
	kref_get(&lkb->lkb_ref);
	list_add(&lkb->lkb_wait_reply, &ls->ls_waiters);
 out:
	mutex_unlock(&ls->ls_waiters_mutex);
}
726
727static int _remove_from_waiters(struct dlm_lkb *lkb)
728{
729 int error = 0;
730
731 if (!lkb->lkb_wait_type) {
732 log_print("remove_from_waiters error");
733 error = -EINVAL;
734 goto out;
735 }
736 lkb->lkb_wait_type = 0;
737 list_del(&lkb->lkb_wait_reply);
738 unhold_lkb(lkb);
739 out:
740 return error;
741}
742
/* Locked wrapper around _remove_from_waiters() */
static int remove_from_waiters(struct dlm_lkb *lkb)
{
	struct dlm_ls *ls = lkb->lkb_resource->res_ls;
	int error;

	mutex_lock(&ls->ls_waiters_mutex);
	error = _remove_from_waiters(lkb);
	mutex_unlock(&ls->ls_waiters_mutex);
	return error;
}
753
/* Remove the resource's entry from the directory: directly when this
   node is the directory node, via a REMOVE message otherwise; no-op
   for directoryless lockspaces. */
static void dir_remove(struct dlm_rsb *r)
{
	int to_nodeid;

	if (dlm_no_directory(r->res_ls))
		return;

	to_nodeid = dlm_dir_nodeid(r);
	if (to_nodeid != dlm_our_nodeid())
		send_remove(r);
	else
		dlm_dir_remove_entry(r->res_ls, to_nodeid,
				     r->res_name, r->res_length);
}
768
769/* FIXME: shouldn't this be able to exit as soon as one non-due rsb is
770 found since they are in order of newest to oldest? */
771
/* Free rsbs from bucket b's toss list whose toss time has exceeded
   toss_secs.  One rsb is handled per pass so the bucket write lock can
   be dropped before the (possibly remote) dir_remove and the free.
   Returns the number of rsbs freed. */
static int shrink_bucket(struct dlm_ls *ls, int b)
{
	struct dlm_rsb *r;
	int count = 0, found;

	for (;;) {
		found = 0;
		write_lock(&ls->ls_rsbtbl[b].lock);
		/* reverse scan: oldest entries are at the tail */
		list_for_each_entry_reverse(r, &ls->ls_rsbtbl[b].toss,
					    res_hashchain) {
			if (!time_after_eq(jiffies, r->res_toss_time +
					   dlm_config.toss_secs * HZ))
				continue;
			found = 1;
			break;
		}

		if (!found) {
			write_unlock(&ls->ls_rsbtbl[b].lock);
			break;
		}

		if (kref_put(&r->res_ref, kill_rsb)) {
			list_del(&r->res_hashchain);
			write_unlock(&ls->ls_rsbtbl[b].lock);

			/* the master is responsible for directory cleanup */
			if (is_master(r))
				dir_remove(r);
			free_rsb(r);
			count++;
		} else {
			/* someone resurrected it concurrently; leave it */
			write_unlock(&ls->ls_rsbtbl[b].lock);
			log_error(ls, "tossed rsb in use %s", r->res_name);
		}
	}

	return count;
}
810
811void dlm_scan_rsbs(struct dlm_ls *ls)
812{
813 int i;
814
815 if (dlm_locking_stopped(ls))
816 return;
817
818 for (i = 0; i < ls->ls_rsbtbl_size; i++) {
819 shrink_bucket(ls, i);
820 cond_resched();
821 }
822}
823
824/* lkb is master or local copy */
825
/* lkb is master or local copy.
 *
 * On grant, move the LVB between lock and resource as directed by
 * dlm_lvb_operations[grmode+1][rqmode+1]:
 *   b=1  lvb returned to caller (rsb -> lkb)
 *   b=0  lvb written to rsb (lkb -> rsb), or invalidated
 *   b=-1 do nothing
 */
static void set_lvb_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
	int b, len = r->res_ls->ls_lvblen;

	b = dlm_lvb_operations[lkb->lkb_grmode + 1][lkb->lkb_rqmode + 1];

	if (b == 1) {
		/* copy only if the caller supplied a buffer, asked for
		   VALBLK, and the rsb actually has an lvb */
		if (!lkb->lkb_lvbptr)
			return;

		if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
			return;

		if (!r->res_lvbptr)
			return;

		memcpy(lkb->lkb_lvbptr, r->res_lvbptr, len);
		lkb->lkb_lvbseq = r->res_lvbseq;

	} else if (b == 0) {
		if (lkb->lkb_exflags & DLM_LKF_IVVALBLK) {
			/* explicit invalidation request */
			rsb_set_flag(r, RSB_VALNOTVALID);
			return;
		}

		if (!lkb->lkb_lvbptr)
			return;

		if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
			return;

		/* allocation failure silently skips the lvb write */
		if (!r->res_lvbptr)
			r->res_lvbptr = allocate_lvb(r->res_ls);

		if (!r->res_lvbptr)
			return;

		memcpy(r->res_lvbptr, lkb->lkb_lvbptr, len);
		r->res_lvbseq++;
		lkb->lkb_lvbseq = r->res_lvbseq;
		rsb_clear_flag(r, RSB_VALNOTVALID);
	}

	/* tell the caller when the lvb contents are not trustworthy */
	if (rsb_flag(r, RSB_VALNOTVALID))
		lkb->lkb_sbflags |= DLM_SBF_VALNOTVALID;
}
876
/* On unlock, write the lock's lvb back to the resource -- but only if
   the departing lock held PW or EX (modes allowed to modify the lvb);
   IVVALBLK instead marks the resource lvb invalid. */
static void set_lvb_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
	if (lkb->lkb_grmode < DLM_LOCK_PW)
		return;

	if (lkb->lkb_exflags & DLM_LKF_IVVALBLK) {
		rsb_set_flag(r, RSB_VALNOTVALID);
		return;
	}

	if (!lkb->lkb_lvbptr)
		return;

	if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
		return;

	/* allocation failure silently skips the lvb write */
	if (!r->res_lvbptr)
		r->res_lvbptr = allocate_lvb(r->res_ls);

	if (!r->res_lvbptr)
		return;

	memcpy(r->res_lvbptr, lkb->lkb_lvbptr, r->res_ls->ls_lvblen);
	r->res_lvbseq++;
	rsb_clear_flag(r, RSB_VALNOTVALID);
}
903
904/* lkb is process copy (pc) */
905
906static void set_lvb_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb,
907 struct dlm_message *ms)
908{
909 int b;
910
911 if (!lkb->lkb_lvbptr)
912 return;
913
914 if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
915 return;
916
917 b = dlm_lvb_operations[lkb->lkb_grmode + 1][lkb->lkb_rqmode + 1];
918 if (b == 1) {
919 int len = receive_extralen(ms);
920 memcpy(lkb->lkb_lvbptr, ms->m_extra, len);
921 lkb->lkb_lvbseq = ms->m_lvbseq;
922 }
923}
924
925/* Manipulate lkb's on rsb's convert/granted/waiting queues
926 remove_lock -- used for unlock, removes lkb from granted
927 revert_lock -- used for cancel, moves lkb from convert to granted
928 grant_lock -- used for request and convert, adds lkb to granted or
929 moves lkb from convert or waiting to granted
930
931 Each of these is used for master or local copy lkb's. There is
932 also a _pc() variation used to make the corresponding change on
933 a process copy (pc) lkb. */
934
/* Take the lkb off its queue and mark it ungranted; used by both the
   master/local and process-copy unlock paths. */
static void _remove_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
	del_lkb(r, lkb);
	lkb->lkb_grmode = DLM_LOCK_IV;
	/* this unhold undoes the original ref from create_lkb()
	   so this leads to the lkb being freed */
	unhold_lkb(lkb);
}
943
/* Unlock on the master/local copy: write the lvb back first, then
   remove the lock (see set_lvb_unlock). */
static void remove_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
	set_lvb_unlock(r, lkb);
	_remove_lock(r, lkb);
}
949
/* Unlock on a process copy: no lvb handling, just remove */
static void remove_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
	_remove_lock(r, lkb);
}
954
/* Cancel: undo a pending operation.  A queued conversion falls back
   to its granted mode; a brand-new waiting request is removed
   entirely (its create_lkb ref is dropped, freeing it). */
static void revert_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
	lkb->lkb_rqmode = DLM_LOCK_IV;

	switch (lkb->lkb_status) {
	case DLM_LKSTS_CONVERT:
		move_lkb(r, lkb, DLM_LKSTS_GRANTED);
		break;
	case DLM_LKSTS_WAITING:
		del_lkb(r, lkb);
		lkb->lkb_grmode = DLM_LOCK_IV;
		/* this unhold undoes the original ref from create_lkb()
		   so this leads to the lkb being freed */
		unhold_lkb(lkb);
		break;
	default:
		log_print("invalid status for revert %d", lkb->lkb_status);
	}
}
974
/* Cancel on a process copy: identical to the master/local path */
static void revert_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
	revert_lock(r, lkb);
}
979
/* Grant the requested mode: update grmode, (re)place the lkb on the
   granted queue, clear the request, and commit the requested range
   into the granted range if ranges are in use. */
static void _grant_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
	if (lkb->lkb_grmode != lkb->lkb_rqmode) {
		lkb->lkb_grmode = lkb->lkb_rqmode;
		/* a zero status means the lkb is not yet on any queue */
		if (lkb->lkb_status)
			move_lkb(r, lkb, DLM_LKSTS_GRANTED);
		else
			add_lkb(r, lkb, DLM_LKSTS_GRANTED);
	}

	lkb->lkb_rqmode = DLM_LOCK_IV;

	if (lkb->lkb_range) {
		lkb->lkb_range[GR_RANGE_START] = lkb->lkb_range[RQ_RANGE_START];
		lkb->lkb_range[GR_RANGE_END] = lkb->lkb_range[RQ_RANGE_END];
	}
}
997
/* Grant on the master/local copy: handle the lvb transfer, grant,
   and reset the highest-bast-mode tracker. */
static void grant_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
	set_lvb_lock(r, lkb);
	_grant_lock(r, lkb);
	lkb->lkb_highbast = 0;
}
1004
/* Grant on a process copy, taking the lvb from the master's reply */
static void grant_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb,
			  struct dlm_message *ms)
{
	set_lvb_lock_pc(r, lkb, ms);
	_grant_lock(r, lkb);
}
1011
1012/* called by grant_pending_locks() which means an async grant message must
1013 be sent to the requesting node in addition to granting the lock if the
1014 lkb belongs to a remote node. */
1015
1016static void grant_lock_pending(struct dlm_rsb *r, struct dlm_lkb *lkb)
1017{
1018 grant_lock(r, lkb);
1019 if (is_master_copy(lkb))
1020 send_grant(r, lkb);
1021 else
1022 queue_cast(r, lkb, 0);
1023}
1024
1025static inline int first_in_list(struct dlm_lkb *lkb, struct list_head *head)
1026{
1027 struct dlm_lkb *first = list_entry(head->next, struct dlm_lkb,
1028 lkb_statequeue);
1029 if (lkb->lkb_id == first->lkb_id)
1030 return 1;
1031
1032 return 0;
1033}
1034
1035/* Return 1 if the locks' ranges overlap. If the lkb has no range then it is
1036 assumed to cover 0-ffffffff.ffffffff */
1037
1038static inline int ranges_overlap(struct dlm_lkb *lkb1, struct dlm_lkb *lkb2)
1039{
1040 if (!lkb1->lkb_range || !lkb2->lkb_range)
1041 return 1;
1042
1043 if (lkb1->lkb_range[RQ_RANGE_END] < lkb2->lkb_range[GR_RANGE_START] ||
1044 lkb1->lkb_range[RQ_RANGE_START] > lkb2->lkb_range[GR_RANGE_END])
1045 return 0;
1046
1047 return 1;
1048}
1049
1050/* Check if the given lkb conflicts with another lkb on the queue. */
1051
/* Return 1 if lkb conflicts with any other lkb on the queue: ranges
   overlap and modes are incompatible (lkb itself is skipped). */
static int queue_conflict(struct list_head *head, struct dlm_lkb *lkb)
{
	struct dlm_lkb *this;

	list_for_each_entry(this, head, lkb_statequeue) {
		if (this == lkb)
			continue;
		if (ranges_overlap(lkb, this) && !modes_compat(this, lkb))
			return 1;
	}
	return 0;
}
1064
1065/*
1066 * "A conversion deadlock arises with a pair of lock requests in the converting
1067 * queue for one resource. The granted mode of each lock blocks the requested
1068 * mode of the other lock."
1069 *
1070 * Part 2: if the granted mode of lkb is preventing the first lkb in the
1071 * convert queue from being granted, then demote lkb (set grmode to NL).
1072 * This second form requires that we check for conv-deadlk even when
1073 * now == 0 in _can_be_granted().
1074 *
1075 * Example:
1076 * Granted Queue: empty
1077 * Convert Queue: NL->EX (first lock)
1078 * PR->EX (second lock)
1079 *
1080 * The first lock can't be granted because of the granted mode of the second
1081 * lock and the second lock can't be granted because it's not first in the
1082 * list. We demote the granted mode of the second lock (the lkb passed to this
1083 * function).
1084 *
1085 * After the resolution, the "grant pending" function needs to go back and try
1086 * to grant locks on the convert queue again since the first lock can now be
1087 * granted.
1088 */
1089
/* Return 1 if lkb is involved in a conversion deadlock on rsb's
   convert queue (see the long comment above for the two forms
   detected: mutual grmode/rqmode blocking, and lkb blocking the first
   converter). */
static int conversion_deadlock_detect(struct dlm_rsb *rsb, struct dlm_lkb *lkb)
{
	struct dlm_lkb *this, *first = NULL, *self = NULL;

	list_for_each_entry(this, &rsb->res_convertqueue, lkb_statequeue) {
		if (!first)
			first = this;
		if (this == lkb) {
			self = lkb;
			continue;
		}

		if (!ranges_overlap(lkb, this))
			continue;

		/* form 1: each lock's granted mode blocks the other's
		   requested mode */
		if (!modes_compat(this, lkb) && !modes_compat(lkb, this))
			return 1;
	}

	/* if lkb is on the convert queue and is preventing the first
	   from being granted, then there's deadlock and we demote lkb.
	   multiple converting locks may need to do this before the first
	   converting lock can be granted. */

	if (self && self != first) {
		if (!modes_compat(lkb, first) &&
		    !queue_conflict(&rsb->res_grantqueue, first))
			return 1;
	}

	return 0;
}
1122
1123/*
1124 * Return 1 if the lock can be granted, 0 otherwise.
1125 * Also detect and resolve conversion deadlocks.
1126 *
1127 * lkb is the lock to be granted
1128 *
1129 * now is 1 if the function is being called in the context of the
1130 * immediate request, it is 0 if called later, after the lock has been
1131 * queued.
1132 *
1133 * References are from chapter 6 of "VAXcluster Principles" by Roy Davis
1134 */
1135
1136static int _can_be_granted(struct dlm_rsb *r, struct dlm_lkb *lkb, int now)
1137{
1138 int8_t conv = (lkb->lkb_grmode != DLM_LOCK_IV);
1139
1140 /*
1141 * 6-10: Version 5.4 introduced an option to address the phenomenon of
1142 * a new request for a NL mode lock being blocked.
1143 *
1144 * 6-11: If the optional EXPEDITE flag is used with the new NL mode
1145 * request, then it would be granted. In essence, the use of this flag
 1146 * tells the Lock Manager to expedite this request by not considering
1147 * what may be in the CONVERTING or WAITING queues... As of this
1148 * writing, the EXPEDITE flag can be used only with new requests for NL
1149 * mode locks. This flag is not valid for conversion requests.
1150 *
1151 * A shortcut. Earlier checks return an error if EXPEDITE is used in a
1152 * conversion or used with a non-NL requested mode. We also know an
1153 * EXPEDITE request is always granted immediately, so now must always
1154 * be 1. The full condition to grant an expedite request: (now &&
1155 * !conv && lkb->rqmode == DLM_LOCK_NL && (flags & EXPEDITE)) can
1156 * therefore be shortened to just checking the flag.
1157 */
1158
1159 if (lkb->lkb_exflags & DLM_LKF_EXPEDITE)
1160 return 1;
1161
1162 /*
1163 * A shortcut. Without this, !queue_conflict(grantqueue, lkb) would be
1164 * added to the remaining conditions.
1165 */
1166
1167 if (queue_conflict(&r->res_grantqueue, lkb))
1168 goto out;
1169
1170 /*
1171 * 6-3: By default, a conversion request is immediately granted if the
1172 * requested mode is compatible with the modes of all other granted
1173 * locks
1174 */
1175
1176 if (queue_conflict(&r->res_convertqueue, lkb))
1177 goto out;
1178
1179 /*
1180 * 6-5: But the default algorithm for deciding whether to grant or
1181 * queue conversion requests does not by itself guarantee that such
1182 * requests are serviced on a "first come first serve" basis. This, in
1183 * turn, can lead to a phenomenon known as "indefinate postponement".
1184 *
1185 * 6-7: This issue is dealt with by using the optional QUECVT flag with
1186 * the system service employed to request a lock conversion. This flag
1187 * forces certain conversion requests to be queued, even if they are
1188 * compatible with the granted modes of other locks on the same
1189 * resource. Thus, the use of this flag results in conversion requests
1190 * being ordered on a "first come first servce" basis.
1191 *
1192 * DCT: This condition is all about new conversions being able to occur
1193 * "in place" while the lock remains on the granted queue (assuming
1194 * nothing else conflicts.) IOW if QUECVT isn't set, a conversion
1195 * doesn't _have_ to go onto the convert queue where it's processed in
1196 * order. The "now" variable is necessary to distinguish converts
1197 * being received and processed for the first time now, because once a
1198 * convert is moved to the conversion queue the condition below applies
1199 * requiring fifo granting.
1200 */
1201
1202 if (now && conv && !(lkb->lkb_exflags & DLM_LKF_QUECVT))
1203 return 1;
1204
1205 /*
1206 * When using range locks the NOORDER flag is set to avoid the standard
1207 * vms rules on grant order.
1208 */
1209
1210 if (lkb->lkb_exflags & DLM_LKF_NOORDER)
1211 return 1;
1212
1213 /*
1214 * 6-3: Once in that queue [CONVERTING], a conversion request cannot be
1215 * granted until all other conversion requests ahead of it are granted
1216 * and/or canceled.
1217 */
1218
1219 if (!now && conv && first_in_list(lkb, &r->res_convertqueue))
1220 return 1;
1221
1222 /*
1223 * 6-4: By default, a new request is immediately granted only if all
1224 * three of the following conditions are satisfied when the request is
1225 * issued:
1226 * - The queue of ungranted conversion requests for the resource is
1227 * empty.
1228 * - The queue of ungranted new requests for the resource is empty.
1229 * - The mode of the new request is compatible with the most
1230 * restrictive mode of all granted locks on the resource.
1231 */
1232
1233 if (now && !conv && list_empty(&r->res_convertqueue) &&
1234 list_empty(&r->res_waitqueue))
1235 return 1;
1236
1237 /*
1238 * 6-4: Once a lock request is in the queue of ungranted new requests,
1239 * it cannot be granted until the queue of ungranted conversion
1240 * requests is empty, all ungranted new requests ahead of it are
1241 * granted and/or canceled, and it is compatible with the granted mode
1242 * of the most restrictive lock granted on the resource.
1243 */
1244
1245 if (!now && !conv && list_empty(&r->res_convertqueue) &&
1246 first_in_list(lkb, &r->res_waitqueue))
1247 return 1;
1248
1249 out:
1250 /*
1251 * The following, enabled by CONVDEADLK, departs from VMS.
1252 */
1253
1254 if (conv && (lkb->lkb_exflags & DLM_LKF_CONVDEADLK) &&
1255 conversion_deadlock_detect(r, lkb)) {
1256 lkb->lkb_grmode = DLM_LOCK_NL;
1257 lkb->lkb_sbflags |= DLM_SBF_DEMOTED;
1258 }
1259
1260 return 0;
1261}
1262
1263/*
1264 * The ALTPR and ALTCW flags aren't traditional lock manager flags, but are a
1265 * simple way to provide a big optimization to applications that can use them.
1266 */
1267
1268static int can_be_granted(struct dlm_rsb *r, struct dlm_lkb *lkb, int now)
1269{
1270 uint32_t flags = lkb->lkb_exflags;
1271 int rv;
1272 int8_t alt = 0, rqmode = lkb->lkb_rqmode;
1273
1274 rv = _can_be_granted(r, lkb, now);
1275 if (rv)
1276 goto out;
1277
1278 if (lkb->lkb_sbflags & DLM_SBF_DEMOTED)
1279 goto out;
1280
1281 if (rqmode != DLM_LOCK_PR && flags & DLM_LKF_ALTPR)
1282 alt = DLM_LOCK_PR;
1283 else if (rqmode != DLM_LOCK_CW && flags & DLM_LKF_ALTCW)
1284 alt = DLM_LOCK_CW;
1285
1286 if (alt) {
1287 lkb->lkb_rqmode = alt;
1288 rv = _can_be_granted(r, lkb, now);
1289 if (rv)
1290 lkb->lkb_sbflags |= DLM_SBF_ALTMODE;
1291 else
1292 lkb->lkb_rqmode = rqmode;
1293 }
1294 out:
1295 return rv;
1296}
1297
/*
 * Grant whatever can now be granted on the convert queue, rescanning until
 * no further progress is made.  Returns the highest requested mode of any
 * lock left ungranted (at least "high"); the caller uses it to decide which
 * granted locks get blocking ASTs.
 */

static int grant_pending_convert(struct dlm_rsb *r, int high)
{
	struct dlm_lkb *lkb, *s;
	int hi, demoted, quit, grant_restart, demote_restart;

	quit = 0;
 restart:
	grant_restart = 0;
	demote_restart = 0;
	hi = DLM_LOCK_IV;

	list_for_each_entry_safe(lkb, s, &r->res_convertqueue, lkb_statequeue) {
		demoted = is_demoted(lkb);
		if (can_be_granted(r, lkb, 0)) {
			grant_lock_pending(r, lkb);
			grant_restart = 1;
		} else {
			hi = max_t(int, lkb->lkb_rqmode, hi);
			/* can_be_granted() may have demoted lkb to NL to
			   resolve a conversion deadlock */
			if (!demoted && is_demoted(lkb))
				demote_restart = 1;
		}
	}

	/* each grant may unblock locks checked earlier in the scan */
	if (grant_restart)
		goto restart;
	/* a fresh demotion may do the same; rescan once more */
	if (demote_restart && !quit) {
		quit = 1;
		goto restart;
	}

	return max_t(int, high, hi);
}
1330
1331static int grant_pending_wait(struct dlm_rsb *r, int high)
1332{
1333 struct dlm_lkb *lkb, *s;
1334
1335 list_for_each_entry_safe(lkb, s, &r->res_waitqueue, lkb_statequeue) {
1336 if (can_be_granted(r, lkb, 0))
1337 grant_lock_pending(r, lkb);
1338 else
1339 high = max_t(int, lkb->lkb_rqmode, high);
1340 }
1341
1342 return high;
1343}
1344
/*
 * Grant all convert/wait queue locks that can now be granted, then send
 * blocking ASTs to granted locks that conflict with the highest requested
 * mode still waiting.  Must run on the resource master.
 */

static void grant_pending_locks(struct dlm_rsb *r)
{
	struct dlm_lkb *lkb, *s;
	int high = DLM_LOCK_IV;

	DLM_ASSERT(is_master(r), dlm_print_rsb(r););

	/* converts are granted before new requests (vms ordering) */
	high = grant_pending_convert(r, high);
	high = grant_pending_wait(r, high);

	/* DLM_LOCK_IV means nothing was left ungranted */
	if (high == DLM_LOCK_IV)
		return;

	/*
	 * If there are locks left on the wait/convert queue then send blocking
	 * ASTs to granted locks based on the largest requested mode (high)
	 * found above. This can generate spurious blocking ASTs for range
	 * locks. FIXME: highbast < high comparison not valid for PR/CW.
	 */

	list_for_each_entry_safe(lkb, s, &r->res_grantqueue, lkb_statequeue) {
		if (lkb->lkb_bastaddr && (lkb->lkb_highbast < high) &&
		    !__dlm_compat_matrix[lkb->lkb_grmode+1][high+1]) {
			queue_bast(r, lkb, high);
			lkb->lkb_highbast = high;
		}
	}
}
1373
1374static void send_bast_queue(struct dlm_rsb *r, struct list_head *head,
1375 struct dlm_lkb *lkb)
1376{
1377 struct dlm_lkb *gr;
1378
1379 list_for_each_entry(gr, head, lkb_statequeue) {
1380 if (gr->lkb_bastaddr &&
1381 gr->lkb_highbast < lkb->lkb_rqmode &&
1382 ranges_overlap(lkb, gr) && !modes_compat(gr, lkb)) {
1383 queue_bast(r, gr, lkb->lkb_rqmode);
1384 gr->lkb_highbast = lkb->lkb_rqmode;
1385 }
1386 }
1387}
1388
/* notify granted-queue holders that lkb is blocked on this resource */

static void send_blocking_asts(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
	send_bast_queue(r, &r->res_grantqueue, lkb);
}
1393
/* as send_blocking_asts(), but also notify converting lock holders;
   used when a NOQUEUE request fails and force_blocking_asts() applies */

static void send_blocking_asts_all(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
	send_bast_queue(r, &r->res_grantqueue, lkb);
	send_bast_queue(r, &r->res_convertqueue, lkb);
}
1399
1400/* set_master(r, lkb) -- set the master nodeid of a resource
1401
1402 The purpose of this function is to set the nodeid field in the given
1403 lkb using the nodeid field in the given rsb. If the rsb's nodeid is
1404 known, it can just be copied to the lkb and the function will return
1405 0. If the rsb's nodeid is _not_ known, it needs to be looked up
1406 before it can be copied to the lkb.
1407
1408 When the rsb nodeid is being looked up remotely, the initial lkb
1409 causing the lookup is kept on the ls_waiters list waiting for the
1410 lookup reply. Other lkb's waiting for the same rsb lookup are kept
1411 on the rsb's res_lookup list until the master is verified.
1412
1413 Return values:
1414 0: nodeid is set in rsb/lkb and the caller should go ahead and use it
1415 1: the rsb master is not available and the lkb has been placed on
1416 a wait queue
1417*/
1418
static int set_master(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
	struct dlm_ls *ls = r->res_ls;
	int error, dir_nodeid, ret_nodeid, our_nodeid = dlm_our_nodeid();

	/* the cached master may be stale after recovery; use this lkb's
	   request to re-verify it (confirm_master() settles the result) */
	if (rsb_flag(r, RSB_MASTER_UNCERTAIN)) {
		rsb_clear_flag(r, RSB_MASTER_UNCERTAIN);
		r->res_first_lkid = lkb->lkb_id;
		lkb->lkb_nodeid = r->res_nodeid;
		return 0;
	}

	/* another lkb's lookup is already in flight; park this one on the
	   rsb until the master is verified */
	if (r->res_first_lkid && r->res_first_lkid != lkb->lkb_id) {
		list_add_tail(&lkb->lkb_rsb_lookup, &r->res_lookup);
		return 1;
	}

	/* we are the master */
	if (r->res_nodeid == 0) {
		lkb->lkb_nodeid = 0;
		return 0;
	}

	/* master already known */
	if (r->res_nodeid > 0) {
		lkb->lkb_nodeid = r->res_nodeid;
		return 0;
	}

	DLM_ASSERT(r->res_nodeid == -1, dlm_print_rsb(r););

	dir_nodeid = dlm_dir_nodeid(r);

	/* master unknown and the directory is remote: ask it and wait */
	if (dir_nodeid != our_nodeid) {
		r->res_first_lkid = lkb->lkb_id;
		send_lookup(r, lkb);
		return 1;
	}

	for (;;) {
		/* It's possible for dlm_scand to remove an old rsb for
		   this same resource from the toss list, us to create
		   a new one, look up the master locally, and find it
		   already exists just before dlm_scand does the
		   dir_remove() on the previous rsb. */

		error = dlm_dir_lookup(ls, our_nodeid, r->res_name,
				       r->res_length, &ret_nodeid);
		if (!error)
			break;
		log_debug(ls, "dir_lookup error %d %s", error, r->res_name);
		/* give dlm_scand a chance to finish, then retry */
		schedule();
	}

	if (ret_nodeid == our_nodeid) {
		r->res_first_lkid = 0;
		r->res_nodeid = 0;
		lkb->lkb_nodeid = 0;
	} else {
		r->res_first_lkid = lkb->lkb_id;
		r->res_nodeid = ret_nodeid;
		lkb->lkb_nodeid = ret_nodeid;
	}
	return 0;
}
1482
1483static void process_lookup_list(struct dlm_rsb *r)
1484{
1485 struct dlm_lkb *lkb, *safe;
1486
1487 list_for_each_entry_safe(lkb, safe, &r->res_lookup, lkb_rsb_lookup) {
1488 list_del(&lkb->lkb_rsb_lookup);
1489 _request_lock(r, lkb);
1490 schedule();
1491 }
1492}
1493
1494/* confirm_master -- confirm (or deny) an rsb's master nodeid */
1495
/* confirm_master -- confirm (or deny) an rsb's master nodeid
 *
 * "error" is the result of the first request sent to the presumed master
 * (the lkb recorded in res_first_lkid).  On success or -EINPROGRESS the
 * master is confirmed and parked lookups are restarted; on -EAGAIN the
 * request was rejected, so another waiter (if any) retries the probe. */

static void confirm_master(struct dlm_rsb *r, int error)
{
	struct dlm_lkb *lkb;

	/* nothing to confirm if no probe was outstanding */
	if (!r->res_first_lkid)
		return;

	switch (error) {
	case 0:
	case -EINPROGRESS:
		r->res_first_lkid = 0;
		process_lookup_list(r);
		break;

	case -EAGAIN:
		/* the remote master didn't queue our NOQUEUE request;
		   make a waiting lkb the first_lkid */

		r->res_first_lkid = 0;

		if (!list_empty(&r->res_lookup)) {
			lkb = list_entry(r->res_lookup.next, struct dlm_lkb,
					 lkb_rsb_lookup);
			list_del(&lkb->lkb_rsb_lookup);
			r->res_first_lkid = lkb->lkb_id;
			_request_lock(r, lkb);
		} else
			/* no waiters left: forget the master entirely */
			r->res_nodeid = -1;
		break;

	default:
		log_error(r->res_ls, "confirm_master unknown error %d", error);
	}
}
1530
1531static int set_lock_args(int mode, struct dlm_lksb *lksb, uint32_t flags,
1532 int namelen, uint32_t parent_lkid, void *ast,
1533 void *astarg, void *bast, struct dlm_range *range,
1534 struct dlm_args *args)
1535{
1536 int rv = -EINVAL;
1537
1538 /* check for invalid arg usage */
1539
1540 if (mode < 0 || mode > DLM_LOCK_EX)
1541 goto out;
1542
1543 if (!(flags & DLM_LKF_CONVERT) && (namelen > DLM_RESNAME_MAXLEN))
1544 goto out;
1545
1546 if (flags & DLM_LKF_CANCEL)
1547 goto out;
1548
1549 if (flags & DLM_LKF_QUECVT && !(flags & DLM_LKF_CONVERT))
1550 goto out;
1551
1552 if (flags & DLM_LKF_CONVDEADLK && !(flags & DLM_LKF_CONVERT))
1553 goto out;
1554
1555 if (flags & DLM_LKF_CONVDEADLK && flags & DLM_LKF_NOQUEUE)
1556 goto out;
1557
1558 if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_CONVERT)
1559 goto out;
1560
1561 if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_QUECVT)
1562 goto out;
1563
1564 if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_NOQUEUE)
1565 goto out;
1566
1567 if (flags & DLM_LKF_EXPEDITE && mode != DLM_LOCK_NL)
1568 goto out;
1569
1570 if (!ast || !lksb)
1571 goto out;
1572
1573 if (flags & DLM_LKF_VALBLK && !lksb->sb_lvbptr)
1574 goto out;
1575
1576 /* parent/child locks not yet supported */
1577 if (parent_lkid)
1578 goto out;
1579
1580 if (flags & DLM_LKF_CONVERT && !lksb->sb_lkid)
1581 goto out;
1582
1583 /* these args will be copied to the lkb in validate_lock_args,
1584 it cannot be done now because when converting locks, fields in
1585 an active lkb cannot be modified before locking the rsb */
1586
1587 args->flags = flags;
1588 args->astaddr = ast;
1589 args->astparam = (long) astarg;
1590 args->bastaddr = bast;
1591 args->mode = mode;
1592 args->lksb = lksb;
1593 args->range = range;
1594 rv = 0;
1595 out:
1596 return rv;
1597}
1598
1599static int set_unlock_args(uint32_t flags, void *astarg, struct dlm_args *args)
1600{
1601 if (flags & ~(DLM_LKF_CANCEL | DLM_LKF_VALBLK | DLM_LKF_IVVALBLK |
1602 DLM_LKF_FORCEUNLOCK))
1603 return -EINVAL;
1604
1605 args->flags = flags;
1606 args->astparam = (long) astarg;
1607 return 0;
1608}
1609
/*
 * Copy the validated dlm_lock() arguments into the lkb, now that the rsb
 * (for conversions) is locked.  Returns 0, -EINVAL for an invalid convert,
 * -EBUSY when the lkb is not in a state that can be converted, or -ENOMEM.
 */

static int validate_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
			      struct dlm_args *args)
{
	int rv = -EINVAL;

	if (args->flags & DLM_LKF_CONVERT) {
		/* master copies are driven by remote nodes, never by a
		   local caller */
		if (lkb->lkb_flags & DLM_IFL_MSTCPY)
			goto out;

		/* QUECVT is only valid for certain grmode->rqmode pairs */
		if (args->flags & DLM_LKF_QUECVT &&
		    !__quecvt_compat_matrix[lkb->lkb_grmode+1][args->mode+1])
			goto out;

		rv = -EBUSY;
		/* only a granted lock can be converted */
		if (lkb->lkb_status != DLM_LKSTS_GRANTED)
			goto out;

		/* an outstanding remote operation must complete first */
		if (lkb->lkb_wait_type)
			goto out;
	}

	lkb->lkb_exflags = args->flags;
	lkb->lkb_sbflags = 0;
	lkb->lkb_astaddr = args->astaddr;
	lkb->lkb_astparam = args->astparam;
	lkb->lkb_bastaddr = args->bastaddr;
	lkb->lkb_rqmode = args->mode;
	lkb->lkb_lksb = args->lksb;
	lkb->lkb_lvbptr = args->lksb->sb_lvbptr;
	lkb->lkb_ownpid = (int) current->pid;

	rv = 0;
	if (!args->range)
		goto out;

	if (!lkb->lkb_range) {
		rv = -ENOMEM;
		lkb->lkb_range = allocate_range(ls);
		if (!lkb->lkb_range)
			goto out;
		/* This is needed for conversions that contain ranges
		   where the original lock didn't but it's harmless for
		   new locks too. */
		lkb->lkb_range[GR_RANGE_START] = 0LL;
		lkb->lkb_range[GR_RANGE_END] = 0xffffffffffffffffULL;
	}

	lkb->lkb_range[RQ_RANGE_START] = args->range->ra_start;
	lkb->lkb_range[RQ_RANGE_END] = args->range->ra_end;
	lkb->lkb_flags |= DLM_IFL_RANGE;
	rv = 0;
 out:
	return rv;
}
1664
1665static int validate_unlock_args(struct dlm_lkb *lkb, struct dlm_args *args)
1666{
1667 int rv = -EINVAL;
1668
1669 if (lkb->lkb_flags & DLM_IFL_MSTCPY)
1670 goto out;
1671
1672 if (args->flags & DLM_LKF_FORCEUNLOCK)
1673 goto out_ok;
1674
1675 if (args->flags & DLM_LKF_CANCEL &&
1676 lkb->lkb_status == DLM_LKSTS_GRANTED)
1677 goto out;
1678
1679 if (!(args->flags & DLM_LKF_CANCEL) &&
1680 lkb->lkb_status != DLM_LKSTS_GRANTED)
1681 goto out;
1682
1683 rv = -EBUSY;
1684 if (lkb->lkb_wait_type)
1685 goto out;
1686
1687 out_ok:
1688 lkb->lkb_exflags = args->flags;
1689 lkb->lkb_sbflags = 0;
1690 lkb->lkb_astparam = args->astparam;
1691
1692 rv = 0;
1693 out:
1694 return rv;
1695}
1696
1697/*
1698 * Four stage 4 varieties:
1699 * do_request(), do_convert(), do_unlock(), do_cancel()
1700 * These are called on the master node for the given lock and
1701 * from the central locking logic.
1702 */
1703
1704static int do_request(struct dlm_rsb *r, struct dlm_lkb *lkb)
1705{
1706 int error = 0;
1707
1708 if (can_be_granted(r, lkb, 1)) {
1709 grant_lock(r, lkb);
1710 queue_cast(r, lkb, 0);
1711 goto out;
1712 }
1713
1714 if (can_be_queued(lkb)) {
1715 error = -EINPROGRESS;
1716 add_lkb(r, lkb, DLM_LKSTS_WAITING);
1717 send_blocking_asts(r, lkb);
1718 goto out;
1719 }
1720
1721 error = -EAGAIN;
1722 if (force_blocking_asts(lkb))
1723 send_blocking_asts_all(r, lkb);
1724 queue_cast(r, lkb, -EAGAIN);
1725
1726 out:
1727 return error;
1728}
1729
1730static int do_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
1731{
1732 int error = 0;
1733
1734 /* changing an existing lock may allow others to be granted */
1735
1736 if (can_be_granted(r, lkb, 1)) {
1737 grant_lock(r, lkb);
1738 queue_cast(r, lkb, 0);
1739 grant_pending_locks(r);
1740 goto out;
1741 }
1742
1743 if (can_be_queued(lkb)) {
1744 if (is_demoted(lkb))
1745 grant_pending_locks(r);
1746 error = -EINPROGRESS;
1747 del_lkb(r, lkb);
1748 add_lkb(r, lkb, DLM_LKSTS_CONVERT);
1749 send_blocking_asts(r, lkb);
1750 goto out;
1751 }
1752
1753 error = -EAGAIN;
1754 if (force_blocking_asts(lkb))
1755 send_blocking_asts_all(r, lkb);
1756 queue_cast(r, lkb, -EAGAIN);
1757
1758 out:
1759 return error;
1760}
1761
/* master-side unlock: drop the lock, complete the caller's request with
   -DLM_EUNLOCK, then grant anything the removal unblocked */

static int do_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
	remove_lock(r, lkb);
	queue_cast(r, lkb, -DLM_EUNLOCK);
	grant_pending_locks(r);
	return -DLM_EUNLOCK;
}
1769
/* master-side cancel: revert the pending request, complete it with
   -DLM_ECANCEL, then grant anything the reversion unblocked */

static int do_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
	revert_lock(r, lkb);
	queue_cast(r, lkb, -DLM_ECANCEL);
	grant_pending_locks(r);
	return -DLM_ECANCEL;
}
1777
1778/*
1779 * Four stage 3 varieties:
1780 * _request_lock(), _convert_lock(), _unlock_lock(), _cancel_lock()
1781 */
1782
1783/* add a new lkb to a possibly new rsb, called by requesting process */
1784
/* add a new lkb to a possibly new rsb, called by requesting process */

static int _request_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
	int error;

	/* set_master: sets lkb nodeid from r; returns >0 when the lkb has
	   been parked waiting for a master lookup, <0 on error */

	error = set_master(r, lkb);
	if (error < 0)
		return error;
	if (error > 0)
		return 0;

	if (is_remote(r))
		/* receive_request() calls do_request() on remote node */
		return send_request(r, lkb);

	return do_request(r, lkb);
}
1807
1808/* change some property of an existing lkb, e.g. mode, range */
1809
/* change some property of an existing lkb, e.g. mode, range */

static int _convert_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
	if (!is_remote(r))
		return do_convert(r, lkb);

	/* receive_convert() calls do_convert() on the remote master */
	return send_convert(r, lkb);
}
1822
1823/* remove an existing lkb from the granted queue */
1824
/* remove an existing lkb from the granted queue */

static int _unlock_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
	if (!is_remote(r))
		return do_unlock(r, lkb);

	/* receive_unlock() calls do_unlock() on the remote master */
	return send_unlock(r, lkb);
}
1837
1838/* remove an existing lkb from the convert or wait queue */
1839
/* remove an existing lkb from the convert or wait queue */

static int _cancel_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
	if (!is_remote(r))
		return do_cancel(r, lkb);

	/* receive_cancel() calls do_cancel() on the remote master */
	return send_cancel(r, lkb);
}
1852
1853/*
1854 * Four stage 2 varieties:
1855 * request_lock(), convert_lock(), unlock_lock(), cancel_lock()
1856 */
1857
1858static int request_lock(struct dlm_ls *ls, struct dlm_lkb *lkb, char *name,
1859 int len, struct dlm_args *args)
1860{
1861 struct dlm_rsb *r;
1862 int error;
1863
1864 error = validate_lock_args(ls, lkb, args);
1865 if (error)
1866 goto out;
1867
1868 error = find_rsb(ls, name, len, R_CREATE, &r);
1869 if (error)
1870 goto out;
1871
1872 lock_rsb(r);
1873
1874 attach_lkb(r, lkb);
1875 lkb->lkb_lksb->sb_lkid = lkb->lkb_id;
1876
1877 error = _request_lock(r, lkb);
1878
1879 unlock_rsb(r);
1880 put_rsb(r);
1881
1882 out:
1883 return error;
1884}
1885
1886static int convert_lock(struct dlm_ls *ls, struct dlm_lkb *lkb,
1887 struct dlm_args *args)
1888{
1889 struct dlm_rsb *r;
1890 int error;
1891
1892 r = lkb->lkb_resource;
1893
1894 hold_rsb(r);
1895 lock_rsb(r);
1896
1897 error = validate_lock_args(ls, lkb, args);
1898 if (error)
1899 goto out;
1900
1901 error = _convert_lock(r, lkb);
1902 out:
1903 unlock_rsb(r);
1904 put_rsb(r);
1905 return error;
1906}
1907
1908static int unlock_lock(struct dlm_ls *ls, struct dlm_lkb *lkb,
1909 struct dlm_args *args)
1910{
1911 struct dlm_rsb *r;
1912 int error;
1913
1914 r = lkb->lkb_resource;
1915
1916 hold_rsb(r);
1917 lock_rsb(r);
1918
1919 error = validate_unlock_args(lkb, args);
1920 if (error)
1921 goto out;
1922
1923 error = _unlock_lock(r, lkb);
1924 out:
1925 unlock_rsb(r);
1926 put_rsb(r);
1927 return error;
1928}
1929
1930static int cancel_lock(struct dlm_ls *ls, struct dlm_lkb *lkb,
1931 struct dlm_args *args)
1932{
1933 struct dlm_rsb *r;
1934 int error;
1935
1936 r = lkb->lkb_resource;
1937
1938 hold_rsb(r);
1939 lock_rsb(r);
1940
1941 error = validate_unlock_args(lkb, args);
1942 if (error)
1943 goto out;
1944
1945 error = _cancel_lock(r, lkb);
1946 out:
1947 unlock_rsb(r);
1948 put_rsb(r);
1949 return error;
1950}
1951
1952/*
1953 * Two stage 1 varieties: dlm_lock() and dlm_unlock()
1954 */
1955
/*
 * dlm_lock -- public entry point for requesting or converting a lock.
 *
 * A conversion (DLM_LKF_CONVERT) looks up the existing lkb by
 * lksb->sb_lkid; a new request creates one.  -EINPROGRESS and -EAGAIN are
 * mapped to 0 here because the final result is delivered asynchronously
 * through the completion AST.  Returns -EINVAL for a bad lockspace or
 * arguments, or another negative errno on failure.
 */

int dlm_lock(dlm_lockspace_t *lockspace,
	     int mode,
	     struct dlm_lksb *lksb,
	     uint32_t flags,
	     void *name,
	     unsigned int namelen,
	     uint32_t parent_lkid,
	     void (*ast) (void *astarg),
	     void *astarg,
	     void (*bast) (void *astarg, int mode),
	     struct dlm_range *range)
{
	struct dlm_ls *ls;
	struct dlm_lkb *lkb;
	struct dlm_args args;
	int error, convert = flags & DLM_LKF_CONVERT;

	ls = dlm_find_lockspace_local(lockspace);
	if (!ls)
		return -EINVAL;

	/* block recovery from running while the request is processed */
	lock_recovery(ls);

	if (convert)
		error = find_lkb(ls, lksb->sb_lkid, &lkb);
	else
		error = create_lkb(ls, &lkb);

	if (error)
		goto out;

	error = set_lock_args(mode, lksb, flags, namelen, parent_lkid, ast,
			      astarg, bast, range, &args);
	if (error)
		goto out_put;

	if (convert)
		error = convert_lock(ls, lkb, &args);
	else
		error = request_lock(ls, lkb, name, namelen, &args);

	if (error == -EINPROGRESS)
		error = 0;
 out_put:
	/* drop the find_lkb() reference for converts; for a failed new
	   request this releases the lkb created above */
	if (convert || error)
		put_lkb(lkb);
	if (error == -EAGAIN)
		error = 0;
 out:
	unlock_recovery(ls);
	dlm_put_lockspace(ls);
	return error;
}
2009
/*
 * dlm_unlock -- public entry point for releasing or canceling a lock.
 *
 * DLM_LKF_CANCEL cancels a pending (not yet granted) request; otherwise
 * the granted lock named by lkid is released.  The internal "success"
 * codes -DLM_EUNLOCK and -DLM_ECANCEL are mapped to 0 here; the final
 * result is delivered through the completion AST.
 */

int dlm_unlock(dlm_lockspace_t *lockspace,
	       uint32_t lkid,
	       uint32_t flags,
	       struct dlm_lksb *lksb,
	       void *astarg)
{
	struct dlm_ls *ls;
	struct dlm_lkb *lkb;
	struct dlm_args args;
	int error;

	ls = dlm_find_lockspace_local(lockspace);
	if (!ls)
		return -EINVAL;

	/* block recovery from running while the request is processed */
	lock_recovery(ls);

	error = find_lkb(ls, lkid, &lkb);
	if (error)
		goto out;

	error = set_unlock_args(flags, astarg, &args);
	if (error)
		goto out_put;

	if (flags & DLM_LKF_CANCEL)
		error = cancel_lock(ls, lkb, &args);
	else
		error = unlock_lock(ls, lkb, &args);

	if (error == -DLM_EUNLOCK || error == -DLM_ECANCEL)
		error = 0;
 out_put:
	/* drop the find_lkb() reference */
	put_lkb(lkb);
 out:
	unlock_recovery(ls);
	dlm_put_lockspace(ls);
	return error;
}
2049
2050/*
2051 * send/receive routines for remote operations and replies
2052 *
2053 * send_args
2054 * send_common
2055 * send_request receive_request
2056 * send_convert receive_convert
2057 * send_unlock receive_unlock
2058 * send_cancel receive_cancel
2059 * send_grant receive_grant
2060 * send_bast receive_bast
2061 * send_lookup receive_lookup
2062 * send_remove receive_remove
2063 *
2064 * send_common_reply
2065 * receive_request_reply send_request_reply
2066 * receive_convert_reply send_convert_reply
2067 * receive_unlock_reply send_unlock_reply
2068 * receive_cancel_reply send_cancel_reply
2069 * receive_lookup_reply send_lookup_reply
2070 */
2071
/*
 * Allocate a lowcomms buffer sized for a message of type "mstype" destined
 * for to_nodeid, and initialize its common header.  Message types that
 * carry a resource name or an LVB get extra space for it.  Returns 0 with
 * *ms_ret/*mh_ret set, or -ENOBUFS.
 */

static int create_message(struct dlm_rsb *r, struct dlm_lkb *lkb,
			  int to_nodeid, int mstype,
			  struct dlm_message **ms_ret,
			  struct dlm_mhandle **mh_ret)
{
	struct dlm_message *ms;
	struct dlm_mhandle *mh;
	char *mb;
	int mb_len = sizeof(struct dlm_message);

	switch (mstype) {
	case DLM_MSG_REQUEST:
	case DLM_MSG_LOOKUP:
	case DLM_MSG_REMOVE:
		/* these carry the resource name in m_extra */
		mb_len += r->res_length;
		break;
	case DLM_MSG_CONVERT:
	case DLM_MSG_UNLOCK:
	case DLM_MSG_REQUEST_REPLY:
	case DLM_MSG_CONVERT_REPLY:
	case DLM_MSG_GRANT:
		/* these may carry the LVB in m_extra */
		if (lkb && lkb->lkb_lvbptr)
			mb_len += r->res_ls->ls_lvblen;
		break;
	}

	/* get_buffer gives us a message handle (mh) that we need to
	   pass into lowcomms_commit and a message buffer (mb) that we
	   write our data into */

	mh = dlm_lowcomms_get_buffer(to_nodeid, mb_len, GFP_KERNEL, &mb);
	if (!mh)
		return -ENOBUFS;

	memset(mb, 0, mb_len);

	ms = (struct dlm_message *) mb;

	ms->m_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
	ms->m_header.h_lockspace = r->res_ls->ls_global_id;
	ms->m_header.h_nodeid = dlm_our_nodeid();
	ms->m_header.h_length = mb_len;
	ms->m_header.h_cmd = DLM_MSG;

	ms->m_type = mstype;

	*mh_ret = mh;
	*ms_ret = ms;
	return 0;
}
2122
2123/* further lowcomms enhancements or alternate implementations may make
2124 the return value from this function useful at some point */
2125
/* further lowcomms enhancements or alternate implementations may make
   the return value from this function useful at some point */

static int send_message(struct dlm_mhandle *mh, struct dlm_message *ms)
{
	/* convert to wire byte order, then hand the buffer to lowcomms */
	dlm_message_out(ms);
	dlm_lowcomms_commit_buffer(mh);
	return 0;
}
2132
/*
 * Fill a message with the lkb's current state.  The send side copies
 * everything; the receive side picks out what is relevant for the
 * message type.
 */

static void send_args(struct dlm_rsb *r, struct dlm_lkb *lkb,
		      struct dlm_message *ms)
{
	ms->m_nodeid   = lkb->lkb_nodeid;
	ms->m_pid      = lkb->lkb_ownpid;
	ms->m_lkid     = lkb->lkb_id;
	ms->m_remid    = lkb->lkb_remid;
	ms->m_exflags  = lkb->lkb_exflags;
	ms->m_sbflags  = lkb->lkb_sbflags;
	ms->m_flags    = lkb->lkb_flags;
	ms->m_lvbseq   = lkb->lkb_lvbseq;
	ms->m_status   = lkb->lkb_status;
	ms->m_grmode   = lkb->lkb_grmode;
	ms->m_rqmode   = lkb->lkb_rqmode;
	ms->m_hash     = r->res_hash;

	/* m_result and m_bastmode are set from function args,
	   not from lkb fields */

	/* tell the receiver which callbacks the owner registered */
	if (lkb->lkb_bastaddr)
		ms->m_asts |= AST_BAST;
	if (lkb->lkb_astaddr)
		ms->m_asts |= AST_COMP;

	if (lkb->lkb_range) {
		ms->m_range[0] = lkb->lkb_range[RQ_RANGE_START];
		ms->m_range[1] = lkb->lkb_range[RQ_RANGE_END];
	}

	/* m_extra carries either the resource name (request/lookup) or
	   the lock value block, depending on the message type */
	if (ms->m_type == DLM_MSG_REQUEST || ms->m_type == DLM_MSG_LOOKUP)
		memcpy(ms->m_extra, r->res_name, r->res_length);

	else if (lkb->lkb_lvbptr)
		memcpy(ms->m_extra, lkb->lkb_lvbptr, r->res_ls->ls_lvblen);

}
2169
2170static int send_common(struct dlm_rsb *r, struct dlm_lkb *lkb, int mstype)
2171{
2172 struct dlm_message *ms;
2173 struct dlm_mhandle *mh;
2174 int to_nodeid, error;
2175
2176 add_to_waiters(lkb, mstype);
2177
2178 to_nodeid = r->res_nodeid;
2179
2180 error = create_message(r, lkb, to_nodeid, mstype, &ms, &mh);
2181 if (error)
2182 goto fail;
2183
2184 send_args(r, lkb, ms);
2185
2186 error = send_message(mh, ms);
2187 if (error)
2188 goto fail;
2189 return 0;
2190
2191 fail:
2192 remove_from_waiters(lkb);
2193 return error;
2194}
2195
/* send a new lock request to the resource master */

static int send_request(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
	return send_common(r, lkb, DLM_MSG_REQUEST);
}
2200
/* send a conversion request to the resource master */

static int send_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
	int error;

	error = send_common(r, lkb, DLM_MSG_CONVERT);

	/* down conversions go without a reply from the master; fake a
	   successful reply locally using the lockspace's stub message */
	if (!error && down_conversion(lkb)) {
		remove_from_waiters(lkb);
		r->res_ls->ls_stub_ms.m_result = 0;
		__receive_convert_reply(r, lkb, &r->res_ls->ls_stub_ms);
	}

	return error;
}
2216
2217/* FIXME: if this lkb is the only lock we hold on the rsb, then set
2218 MASTER_UNCERTAIN to force the next request on the rsb to confirm
2219 that the master is still correct. */
2220
/* send an unlock request to the resource master */

static int send_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
	return send_common(r, lkb, DLM_MSG_UNLOCK);
}
2225
/* send a cancel request to the resource master */

static int send_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
	return send_common(r, lkb, DLM_MSG_CANCEL);
}
2230
2231static int send_grant(struct dlm_rsb *r, struct dlm_lkb *lkb)
2232{
2233 struct dlm_message *ms;
2234 struct dlm_mhandle *mh;
2235 int to_nodeid, error;
2236
2237 to_nodeid = lkb->lkb_nodeid;
2238
2239 error = create_message(r, lkb, to_nodeid, DLM_MSG_GRANT, &ms, &mh);
2240 if (error)
2241 goto out;
2242
2243 send_args(r, lkb, ms);
2244
2245 ms->m_result = 0;
2246
2247 error = send_message(mh, ms);
2248 out:
2249 return error;
2250}
2251
2252static int send_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int mode)
2253{
2254 struct dlm_message *ms;
2255 struct dlm_mhandle *mh;
2256 int to_nodeid, error;
2257
2258 to_nodeid = lkb->lkb_nodeid;
2259
2260 error = create_message(r, NULL, to_nodeid, DLM_MSG_BAST, &ms, &mh);
2261 if (error)
2262 goto out;
2263
2264 send_args(r, lkb, ms);
2265
2266 ms->m_bastmode = mode;
2267
2268 error = send_message(mh, ms);
2269 out:
2270 return error;
2271}
2272
2273static int send_lookup(struct dlm_rsb *r, struct dlm_lkb *lkb)
2274{
2275 struct dlm_message *ms;
2276 struct dlm_mhandle *mh;
2277 int to_nodeid, error;
2278
2279 add_to_waiters(lkb, DLM_MSG_LOOKUP);
2280
2281 to_nodeid = dlm_dir_nodeid(r);
2282
2283 error = create_message(r, NULL, to_nodeid, DLM_MSG_LOOKUP, &ms, &mh);
2284 if (error)
2285 goto fail;
2286
2287 send_args(r, lkb, ms);
2288
2289 error = send_message(mh, ms);
2290 if (error)
2291 goto fail;
2292 return 0;
2293
2294 fail:
2295 remove_from_waiters(lkb);
2296 return error;
2297}
2298
2299static int send_remove(struct dlm_rsb *r)
2300{
2301 struct dlm_message *ms;
2302 struct dlm_mhandle *mh;
2303 int to_nodeid, error;
2304
2305 to_nodeid = dlm_dir_nodeid(r);
2306
2307 error = create_message(r, NULL, to_nodeid, DLM_MSG_REMOVE, &ms, &mh);
2308 if (error)
2309 goto out;
2310
2311 memcpy(ms->m_extra, r->res_name, r->res_length);
2312 ms->m_hash = r->res_hash;
2313
2314 error = send_message(mh, ms);
2315 out:
2316 return error;
2317}
2318
2319static int send_common_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
2320 int mstype, int rv)
2321{
2322 struct dlm_message *ms;
2323 struct dlm_mhandle *mh;
2324 int to_nodeid, error;
2325
2326 to_nodeid = lkb->lkb_nodeid;
2327
2328 error = create_message(r, lkb, to_nodeid, mstype, &ms, &mh);
2329 if (error)
2330 goto out;
2331
2332 send_args(r, lkb, ms);
2333
2334 ms->m_result = rv;
2335
2336 error = send_message(mh, ms);
2337 out:
2338 return error;
2339}
2340
/* reply to a remote request with result rv */

static int send_request_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
{
	return send_common_reply(r, lkb, DLM_MSG_REQUEST_REPLY, rv);
}
2345
/* reply to a remote convert with result rv */

static int send_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
{
	return send_common_reply(r, lkb, DLM_MSG_CONVERT_REPLY, rv);
}
2350
/* reply to a remote unlock with result rv */

static int send_unlock_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
{
	return send_common_reply(r, lkb, DLM_MSG_UNLOCK_REPLY, rv);
}
2355
/* reply to a remote cancel with result rv */

static int send_cancel_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
{
	return send_common_reply(r, lkb, DLM_MSG_CANCEL_REPLY, rv);
}
2360
2361static int send_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms_in,
2362 int ret_nodeid, int rv)
2363{
2364 struct dlm_rsb *r = &ls->ls_stub_rsb;
2365 struct dlm_message *ms;
2366 struct dlm_mhandle *mh;
2367 int error, nodeid = ms_in->m_header.h_nodeid;
2368
2369 error = create_message(r, NULL, nodeid, DLM_MSG_LOOKUP_REPLY, &ms, &mh);
2370 if (error)
2371 goto out;
2372
2373 ms->m_lkid = ms_in->m_lkid;
2374 ms->m_result = rv;
2375 ms->m_nodeid = ret_nodeid;
2376
2377 error = send_message(mh, ms);
2378 out:
2379 return error;
2380}
2381
2382/* which args we save from a received message depends heavily on the type
2383 of message, unlike the send side where we can safely send everything about
2384 the lkb for any type of message */
2385
2386static void receive_flags(struct dlm_lkb *lkb, struct dlm_message *ms)
2387{
2388 lkb->lkb_exflags = ms->m_exflags;
2389 lkb->lkb_flags = (lkb->lkb_flags & 0xFFFF0000) |
2390 (ms->m_flags & 0x0000FFFF);
2391}
2392
2393static void receive_flags_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
2394{
2395 lkb->lkb_sbflags = ms->m_sbflags;
2396 lkb->lkb_flags = (lkb->lkb_flags & 0xFFFF0000) |
2397 (ms->m_flags & 0x0000FFFF);
2398}
2399
/* number of bytes of variable-length data (resource name or lvb)
   appended after the fixed-size message struct */
static int receive_extralen(struct dlm_message *ms)
{
	return (ms->m_header.h_length - sizeof(struct dlm_message));
}
2404
2405static int receive_range(struct dlm_ls *ls, struct dlm_lkb *lkb,
2406 struct dlm_message *ms)
2407{
2408 if (lkb->lkb_flags & DLM_IFL_RANGE) {
2409 if (!lkb->lkb_range)
2410 lkb->lkb_range = allocate_range(ls);
2411 if (!lkb->lkb_range)
2412 return -ENOMEM;
2413 lkb->lkb_range[RQ_RANGE_START] = ms->m_range[0];
2414 lkb->lkb_range[RQ_RANGE_END] = ms->m_range[1];
2415 }
2416 return 0;
2417}
2418
2419static int receive_lvb(struct dlm_ls *ls, struct dlm_lkb *lkb,
2420 struct dlm_message *ms)
2421{
2422 int len;
2423
2424 if (lkb->lkb_exflags & DLM_LKF_VALBLK) {
2425 if (!lkb->lkb_lvbptr)
2426 lkb->lkb_lvbptr = allocate_lvb(ls);
2427 if (!lkb->lkb_lvbptr)
2428 return -ENOMEM;
2429 len = receive_extralen(ms);
2430 memcpy(lkb->lkb_lvbptr, ms->m_extra, len);
2431 }
2432 return 0;
2433}
2434
2435static int receive_request_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
2436 struct dlm_message *ms)
2437{
2438 lkb->lkb_nodeid = ms->m_header.h_nodeid;
2439 lkb->lkb_ownpid = ms->m_pid;
2440 lkb->lkb_remid = ms->m_lkid;
2441 lkb->lkb_grmode = DLM_LOCK_IV;
2442 lkb->lkb_rqmode = ms->m_rqmode;
2443 lkb->lkb_bastaddr = (void *) (long) (ms->m_asts & AST_BAST);
2444 lkb->lkb_astaddr = (void *) (long) (ms->m_asts & AST_COMP);
2445
2446 DLM_ASSERT(is_master_copy(lkb), dlm_print_lkb(lkb););
2447
2448 if (receive_range(ls, lkb, ms))
2449 return -ENOMEM;
2450
2451 if (receive_lvb(ls, lkb, ms))
2452 return -ENOMEM;
2453
2454 return 0;
2455}
2456
/* Validate and apply a convert request to our master copy of the lock.
   Returns 0, -EINVAL (stale or non-master lkb), -EBUSY (lock not in the
   granted state) or -ENOMEM. */
static int receive_convert_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
				struct dlm_message *ms)
{
	/* the lkb we found must be owned by the node sending the convert;
	   a mismatch means the message refers to a stale lock */
	if (lkb->lkb_nodeid != ms->m_header.h_nodeid) {
		log_error(ls, "convert_args nodeid %d %d lkid %x %x",
			  lkb->lkb_nodeid, ms->m_header.h_nodeid,
			  lkb->lkb_id, lkb->lkb_remid);
		return -EINVAL;
	}

	/* converts are only processed on the master's copy of a lock */
	if (!is_master_copy(lkb))
		return -EINVAL;

	/* only a granted lock can be converted */
	if (lkb->lkb_status != DLM_LKSTS_GRANTED)
		return -EBUSY;

	if (receive_range(ls, lkb, ms))
		return -ENOMEM;
	if (lkb->lkb_range) {
		/* the requested range was filled in by receive_range();
		   reset the granted range to cover everything */
		lkb->lkb_range[GR_RANGE_START] = 0LL;
		lkb->lkb_range[GR_RANGE_END] = 0xffffffffffffffffULL;
	}

	if (receive_lvb(ls, lkb, ms))
		return -ENOMEM;

	lkb->lkb_rqmode = ms->m_rqmode;
	lkb->lkb_lvbseq = ms->m_lvbseq;

	return 0;
}
2488
2489static int receive_unlock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
2490 struct dlm_message *ms)
2491{
2492 if (!is_master_copy(lkb))
2493 return -EINVAL;
2494 if (receive_lvb(ls, lkb, ms))
2495 return -ENOMEM;
2496 return 0;
2497}
2498
2499/* We fill in the stub-lkb fields with the info that send_xxxx_reply()
2500 uses to send a reply and that the remote end uses to process the reply. */
2501
2502static void setup_stub_lkb(struct dlm_ls *ls, struct dlm_message *ms)
2503{
2504 struct dlm_lkb *lkb = &ls->ls_stub_lkb;
2505 lkb->lkb_nodeid = ms->m_header.h_nodeid;
2506 lkb->lkb_remid = ms->m_lkid;
2507}
2508
/* A remote node asked us (as master) for a new lock.  Build a
   master-copy lkb, attach it to the rsb named in the message, run the
   request and send back a request reply.  On any setup failure, reply
   through the lockspace stub lkb/rsb so the sender isn't left waiting. */
static void receive_request(struct dlm_ls *ls, struct dlm_message *ms)
{
	struct dlm_lkb *lkb;
	struct dlm_rsb *r;
	int error, namelen;

	error = create_lkb(ls, &lkb);
	if (error)
		goto fail;

	receive_flags(lkb, ms);
	lkb->lkb_flags |= DLM_IFL_MSTCPY;	/* this is the master copy */
	error = receive_request_args(ls, lkb, ms);
	if (error) {
		put_lkb(lkb);
		goto fail;
	}

	/* the resource name rides in the message's extra area */
	namelen = receive_extralen(ms);

	error = find_rsb(ls, ms->m_extra, namelen, R_MASTER, &r);
	if (error) {
		put_lkb(lkb);
		goto fail;
	}

	lock_rsb(r);

	attach_lkb(r, lkb);
	error = do_request(r, lkb);
	send_request_reply(r, lkb, error);

	unlock_rsb(r);
	put_rsb(r);

	/* -EINPROGRESS means the lock was queued and the lkb lives on;
	   any other nonzero result rejected it, so drop our reference */
	if (error == -EINPROGRESS)
		error = 0;
	if (error)
		put_lkb(lkb);
	return;

 fail:
	setup_stub_lkb(ls, ms);
	send_request_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
}
2554
/* A remote node asked us (as master) to convert the mode of one of its
   locks.  Down-conversions complete asynchronously on the holder's side,
   so no reply is sent for them. */
static void receive_convert(struct dlm_ls *ls, struct dlm_message *ms)
{
	struct dlm_lkb *lkb;
	struct dlm_rsb *r;
	int error, reply = 1;

	error = find_lkb(ls, ms->m_remid, &lkb);
	if (error)
		goto fail;

	r = lkb->lkb_resource;

	hold_rsb(r);
	lock_rsb(r);

	receive_flags(lkb, ms);
	error = receive_convert_args(ls, lkb, ms);
	if (error)
		goto out;	/* still replies, carrying the error */
	reply = !down_conversion(lkb);

	error = do_convert(r, lkb);
 out:
	if (reply)
		send_convert_reply(r, lkb, error);

	unlock_rsb(r);
	put_rsb(r);
	put_lkb(lkb);
	return;

 fail:
	/* no such lkb; reply through the lockspace stubs */
	setup_stub_lkb(ls, ms);
	send_convert_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
}
2590
/* A remote node asked us (as master) to unlock one of its locks. */
static void receive_unlock(struct dlm_ls *ls, struct dlm_message *ms)
{
	struct dlm_lkb *lkb;
	struct dlm_rsb *r;
	int error;

	error = find_lkb(ls, ms->m_remid, &lkb);
	if (error)
		goto fail;

	r = lkb->lkb_resource;

	hold_rsb(r);
	lock_rsb(r);

	receive_flags(lkb, ms);
	error = receive_unlock_args(ls, lkb, ms);
	if (error)
		goto out;	/* still replies, carrying the error */

	error = do_unlock(r, lkb);
 out:
	send_unlock_reply(r, lkb, error);

	unlock_rsb(r);
	put_rsb(r);
	put_lkb(lkb);
	return;

 fail:
	/* no such lkb; reply through the lockspace stubs */
	setup_stub_lkb(ls, ms);
	send_unlock_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
}
2624
/* A remote node asked us (as master) to cancel an in-progress lock
   operation on one of its locks. */
static void receive_cancel(struct dlm_ls *ls, struct dlm_message *ms)
{
	struct dlm_lkb *lkb;
	struct dlm_rsb *r;
	int error;

	error = find_lkb(ls, ms->m_remid, &lkb);
	if (error)
		goto fail;

	receive_flags(lkb, ms);

	r = lkb->lkb_resource;

	hold_rsb(r);
	lock_rsb(r);

	error = do_cancel(r, lkb);
	send_cancel_reply(r, lkb, error);

	unlock_rsb(r);
	put_rsb(r);
	put_lkb(lkb);
	return;

 fail:
	/* no such lkb; reply through the lockspace stubs */
	setup_stub_lkb(ls, ms);
	send_cancel_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
}
2654
2655static void receive_grant(struct dlm_ls *ls, struct dlm_message *ms)
2656{
2657 struct dlm_lkb *lkb;
2658 struct dlm_rsb *r;
2659 int error;
2660
2661 error = find_lkb(ls, ms->m_remid, &lkb);
2662 if (error) {
2663 log_error(ls, "receive_grant no lkb");
2664 return;
2665 }
2666 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
2667
2668 r = lkb->lkb_resource;
2669
2670 hold_rsb(r);
2671 lock_rsb(r);
2672
2673 receive_flags_reply(lkb, ms);
2674 grant_lock_pc(r, lkb, ms);
2675 queue_cast(r, lkb, 0);
2676
2677 unlock_rsb(r);
2678 put_rsb(r);
2679 put_lkb(lkb);
2680}
2681
2682static void receive_bast(struct dlm_ls *ls, struct dlm_message *ms)
2683{
2684 struct dlm_lkb *lkb;
2685 struct dlm_rsb *r;
2686 int error;
2687
2688 error = find_lkb(ls, ms->m_remid, &lkb);
2689 if (error) {
2690 log_error(ls, "receive_bast no lkb");
2691 return;
2692 }
2693 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
2694
2695 r = lkb->lkb_resource;
2696
2697 hold_rsb(r);
2698 lock_rsb(r);
2699
2700 queue_bast(r, lkb, ms->m_bastmode);
2701
2702 unlock_rsb(r);
2703 put_rsb(r);
2704 put_lkb(lkb);
2705}
2706
/* We are the directory node for the named resource: look up (or record)
   its master on behalf of from_nodeid and send the answer back. */
static void receive_lookup(struct dlm_ls *ls, struct dlm_message *ms)
{
	int len, error, ret_nodeid, dir_nodeid, from_nodeid, our_nodeid;

	from_nodeid = ms->m_header.h_nodeid;
	our_nodeid = dlm_our_nodeid();

	len = receive_extralen(ms);

	/* sanity check: the name's hash must map to us as dir node */
	dir_nodeid = dlm_hash2nodeid(ls, ms->m_hash);
	if (dir_nodeid != our_nodeid) {
		log_error(ls, "lookup dir_nodeid %d from %d",
			  dir_nodeid, from_nodeid);
		error = -EINVAL;
		ret_nodeid = -1;
		goto out;
	}

	error = dlm_dir_lookup(ls, from_nodeid, ms->m_extra, len, &ret_nodeid);

	/* Optimization: we're master so treat lookup as a request */
	if (!error && ret_nodeid == our_nodeid) {
		receive_request(ls, ms);
		return;
	}
 out:
	send_lookup_reply(ls, ms, ret_nodeid, error);
}
2735
2736static void receive_remove(struct dlm_ls *ls, struct dlm_message *ms)
2737{
2738 int len, dir_nodeid, from_nodeid;
2739
2740 from_nodeid = ms->m_header.h_nodeid;
2741
2742 len = receive_extralen(ms);
2743
2744 dir_nodeid = dlm_hash2nodeid(ls, ms->m_hash);
2745 if (dir_nodeid != dlm_our_nodeid()) {
2746 log_error(ls, "remove dir entry dir_nodeid %d from %d",
2747 dir_nodeid, from_nodeid);
2748 return;
2749 }
2750
2751 dlm_dir_remove_entry(ls, from_nodeid, ms->m_extra, len);
2752}
2753
/* The master's answer to our DLM_MSG_REQUEST (or to a lookup that the
   dir node handled directly as a request -- see the optimization in
   receive_lookup()).  Completes, queues or retries our process copy
   according to the result. */
static void receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms)
{
	struct dlm_lkb *lkb;
	struct dlm_rsb *r;
	int error, mstype;

	error = find_lkb(ls, ms->m_remid, &lkb);
	if (error) {
		log_error(ls, "receive_request_reply no lkb");
		return;
	}
	DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););

	/* remember which message we were waiting on before clearing it */
	mstype = lkb->lkb_wait_type;
	error = remove_from_waiters(lkb);
	if (error) {
		log_error(ls, "receive_request_reply not on waiters");
		goto out;
	}

	/* this is the value returned from do_request() on the master */
	error = ms->m_result;

	r = lkb->lkb_resource;
	hold_rsb(r);
	lock_rsb(r);

	/* Optimization: the dir node was also the master, so it took our
	   lookup as a request and sent request reply instead of lookup reply */
	if (mstype == DLM_MSG_LOOKUP) {
		r->res_nodeid = ms->m_header.h_nodeid;
		lkb->lkb_nodeid = r->res_nodeid;
	}

	switch (error) {
	case -EAGAIN:
		/* request would block (be queued) on remote master;
		   the unhold undoes the original ref from create_lkb()
		   so it leads to the lkb being freed */
		queue_cast(r, lkb, -EAGAIN);
		confirm_master(r, -EAGAIN);
		unhold_lkb(lkb);
		break;

	case -EINPROGRESS:
	case 0:
		/* request was queued or granted on remote master */
		receive_flags_reply(lkb, ms);
		lkb->lkb_remid = ms->m_lkid;	/* master's id for our lock */
		if (error)
			add_lkb(r, lkb, DLM_LKSTS_WAITING);
		else {
			grant_lock_pc(r, lkb, ms);
			queue_cast(r, lkb, 0);
		}
		confirm_master(r, error);
		break;

	case -ENOENT:
	case -ENOTBLK:
		/* find_rsb failed to find rsb or rsb wasn't master */
		/* forget the stale master and start over with a fresh
		   lookup/request */
		r->res_nodeid = -1;
		lkb->lkb_nodeid = -1;
		_request_lock(r, lkb);
		break;

	default:
		log_error(ls, "receive_request_reply error %d", error);
	}

	unlock_rsb(r);
	put_rsb(r);
 out:
	put_lkb(lkb);
}
2829
/* Apply the master's convert result to our process copy.
   Caller holds a reference on r and the rsb lock. */
static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
				    struct dlm_message *ms)
{
	int error = ms->m_result;

	/* this is the value returned from do_convert() on the master */

	switch (error) {
	case -EAGAIN:
		/* convert would block (be queued) on remote master */
		queue_cast(r, lkb, -EAGAIN);
		break;

	case -EINPROGRESS:
		/* convert was queued on remote master */
		del_lkb(r, lkb);
		add_lkb(r, lkb, DLM_LKSTS_CONVERT);
		break;

	case 0:
		/* convert was granted on remote master */
		receive_flags_reply(lkb, ms);
		grant_lock_pc(r, lkb, ms);
		queue_cast(r, lkb, 0);
		break;

	default:
		log_error(r->res_ls, "receive_convert_reply error %d", error);
	}
}
2860
/* Wrap __receive_convert_reply() with the rsb reference and lock.
   Also called with a stub message from recover_convert_waiter(). */
static void _receive_convert_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
{
	struct dlm_rsb *r = lkb->lkb_resource;

	hold_rsb(r);
	lock_rsb(r);

	__receive_convert_reply(r, lkb, ms);

	unlock_rsb(r);
	put_rsb(r);
}
2873
2874static void receive_convert_reply(struct dlm_ls *ls, struct dlm_message *ms)
2875{
2876 struct dlm_lkb *lkb;
2877 int error;
2878
2879 error = find_lkb(ls, ms->m_remid, &lkb);
2880 if (error) {
2881 log_error(ls, "receive_convert_reply no lkb");
2882 return;
2883 }
2884 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
2885
2886 error = remove_from_waiters(lkb);
2887 if (error) {
2888 log_error(ls, "receive_convert_reply not on waiters");
2889 goto out;
2890 }
2891
2892 _receive_convert_reply(lkb, ms);
2893 out:
2894 put_lkb(lkb);
2895}
2896
/* Apply the master's unlock result to our process copy.  Also called
   with a stub message from dlm_recover_waiters_pre(). */
static void _receive_unlock_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
{
	struct dlm_rsb *r = lkb->lkb_resource;
	int error = ms->m_result;

	hold_rsb(r);
	lock_rsb(r);

	/* this is the value returned from do_unlock() on the master */

	switch (error) {
	case -DLM_EUNLOCK:
		/* unlock done: drop the lock and deliver the completion ast */
		receive_flags_reply(lkb, ms);
		remove_lock_pc(r, lkb);
		queue_cast(r, lkb, -DLM_EUNLOCK);
		break;
	default:
		log_error(r->res_ls, "receive_unlock_reply error %d", error);
	}

	unlock_rsb(r);
	put_rsb(r);
}
2920
2921static void receive_unlock_reply(struct dlm_ls *ls, struct dlm_message *ms)
2922{
2923 struct dlm_lkb *lkb;
2924 int error;
2925
2926 error = find_lkb(ls, ms->m_remid, &lkb);
2927 if (error) {
2928 log_error(ls, "receive_unlock_reply no lkb");
2929 return;
2930 }
2931 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
2932
2933 error = remove_from_waiters(lkb);
2934 if (error) {
2935 log_error(ls, "receive_unlock_reply not on waiters");
2936 goto out;
2937 }
2938
2939 _receive_unlock_reply(lkb, ms);
2940 out:
2941 put_lkb(lkb);
2942}
2943
/* Apply the master's cancel result to our process copy.  Also called
   with a stub message from dlm_recover_waiters_pre(). */
static void _receive_cancel_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
{
	struct dlm_rsb *r = lkb->lkb_resource;
	int error = ms->m_result;

	hold_rsb(r);
	lock_rsb(r);

	/* this is the value returned from do_cancel() on the master */

	switch (error) {
	case -DLM_ECANCEL:
		/* cancel done: revert the lock and deliver the completion ast */
		receive_flags_reply(lkb, ms);
		revert_lock_pc(r, lkb);
		queue_cast(r, lkb, -DLM_ECANCEL);
		break;
	default:
		log_error(r->res_ls, "receive_cancel_reply error %d", error);
	}

	unlock_rsb(r);
	put_rsb(r);
}
2967
2968static void receive_cancel_reply(struct dlm_ls *ls, struct dlm_message *ms)
2969{
2970 struct dlm_lkb *lkb;
2971 int error;
2972
2973 error = find_lkb(ls, ms->m_remid, &lkb);
2974 if (error) {
2975 log_error(ls, "receive_cancel_reply no lkb");
2976 return;
2977 }
2978 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
2979
2980 error = remove_from_waiters(lkb);
2981 if (error) {
2982 log_error(ls, "receive_cancel_reply not on waiters");
2983 goto out;
2984 }
2985
2986 _receive_cancel_reply(lkb, ms);
2987 out:
2988 put_lkb(lkb);
2989}
2990
/* The directory node's answer to our DLM_MSG_LOOKUP: it tells us which
   node masters the resource, so record the master and (re)send the
   actual lock request. */
static void receive_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms)
{
	struct dlm_lkb *lkb;
	struct dlm_rsb *r;
	int error, ret_nodeid;

	/* the dir node echoed our own lkid back in m_lkid */
	error = find_lkb(ls, ms->m_lkid, &lkb);
	if (error) {
		log_error(ls, "receive_lookup_reply no lkb");
		return;
	}

	error = remove_from_waiters(lkb);
	if (error) {
		log_error(ls, "receive_lookup_reply not on waiters");
		goto out;
	}

	/* this is the value returned by dlm_dir_lookup on dir node
	   FIXME: will a non-zero error ever be returned? */
	error = ms->m_result;

	r = lkb->lkb_resource;
	hold_rsb(r);
	lock_rsb(r);

	ret_nodeid = ms->m_nodeid;
	if (ret_nodeid == dlm_our_nodeid()) {
		/* we turn out to be the master ourselves */
		r->res_nodeid = 0;
		ret_nodeid = 0;
		r->res_first_lkid = 0;
	} else {
		/* set_master() will copy res_nodeid to lkb_nodeid */
		r->res_nodeid = ret_nodeid;
	}

	_request_lock(r, lkb);

	/* as local master, process other requests queued while the
	   lookup was outstanding */
	if (!ret_nodeid)
		process_lookup_list(r);

	unlock_rsb(r);
	put_rsb(r);
 out:
	put_lkb(lkb);
}
3037
/* Top-level dispatcher for lock messages arriving from other nodes.
   Called by the receive daemon for live traffic (recovery=0), and again
   by the recovery daemon (recovery=1) to drain messages that were saved
   on the requestqueue while recovery was running.  Serializes against
   recovery via the lockspace recovery lock.

   NOTE(review): the -EINTR assigned to 'error' below is never returned;
   the function unconditionally returns 0 after taking the lockspace ref.
   Confirm callers don't depend on seeing -EINTR. */
int dlm_receive_message(struct dlm_header *hd, int nodeid, int recovery)
{
	struct dlm_message *ms = (struct dlm_message *) hd;
	struct dlm_ls *ls;
	int error;

	/* only converted on first receipt; a replay from the
	   requestqueue has already been through dlm_message_in() */
	if (!recovery)
		dlm_message_in(ms);

	ls = dlm_find_lockspace_global(hd->h_lockspace);
	if (!ls) {
		log_print("drop message %d from %d for unknown lockspace %d",
			  ms->m_type, nodeid, hd->h_lockspace);
		return -EINVAL;
	}

	/* recovery may have just ended leaving a bunch of backed-up requests
	   in the requestqueue; wait while dlm_recoverd clears them */

	if (!recovery)
		dlm_wait_requestqueue(ls);

	/* recovery may have just started while there were a bunch of
	   in-flight requests -- save them in requestqueue to be processed
	   after recovery.  we can't let dlm_recvd block on the recovery
	   lock.  if dlm_recoverd is calling this function to clear the
	   requestqueue, it needs to be interrupted (-EINTR) if another
	   recovery operation is starting. */

	while (1) {
		if (dlm_locking_stopped(ls)) {
			if (!recovery)
				dlm_add_requestqueue(ls, nodeid, hd);
			error = -EINTR;
			goto out;
		}

		if (lock_recovery_try(ls))
			break;
		schedule();
	}

	switch (ms->m_type) {

	/* messages sent to a master node */

	case DLM_MSG_REQUEST:
		receive_request(ls, ms);
		break;

	case DLM_MSG_CONVERT:
		receive_convert(ls, ms);
		break;

	case DLM_MSG_UNLOCK:
		receive_unlock(ls, ms);
		break;

	case DLM_MSG_CANCEL:
		receive_cancel(ls, ms);
		break;

	/* messages sent from a master node (replies to above) */

	case DLM_MSG_REQUEST_REPLY:
		receive_request_reply(ls, ms);
		break;

	case DLM_MSG_CONVERT_REPLY:
		receive_convert_reply(ls, ms);
		break;

	case DLM_MSG_UNLOCK_REPLY:
		receive_unlock_reply(ls, ms);
		break;

	case DLM_MSG_CANCEL_REPLY:
		receive_cancel_reply(ls, ms);
		break;

	/* messages sent from a master node (only two types of async msg) */

	case DLM_MSG_GRANT:
		receive_grant(ls, ms);
		break;

	case DLM_MSG_BAST:
		receive_bast(ls, ms);
		break;

	/* messages sent to a dir node */

	case DLM_MSG_LOOKUP:
		receive_lookup(ls, ms);
		break;

	case DLM_MSG_REMOVE:
		receive_remove(ls, ms);
		break;

	/* messages sent from a dir node (remove has no reply) */

	case DLM_MSG_LOOKUP_REPLY:
		receive_lookup_reply(ls, ms);
		break;

	default:
		log_error(ls, "unknown message type %d", ms->m_type);
	}

	unlock_recovery(ls);
 out:
	dlm_put_lockspace(ls);
	dlm_astd_wake();	/* let the ast daemon deliver queued casts/basts */
	return 0;
}
3154
3155
3156/*
3157 * Recovery related
3158 */
3159
/* Recover a convert that was waiting on a failed master: fake an
   -EINPROGRESS reply for middle-mode conversions (they're requeued as
   converting with an invalid grant mode -- same special case as
   receive_rcom_lock_args()); flag up-conversions for resend. */
static void recover_convert_waiter(struct dlm_ls *ls, struct dlm_lkb *lkb)
{
	if (middle_conversion(lkb)) {
		/* extra hold so the fake reply can't drop the last ref */
		hold_lkb(lkb);
		ls->ls_stub_ms.m_result = -EINPROGRESS;
		_remove_from_waiters(lkb);
		_receive_convert_reply(lkb, &ls->ls_stub_ms);

		/* Same special case as in receive_rcom_lock_args() */
		lkb->lkb_grmode = DLM_LOCK_IV;
		rsb_set_flag(lkb->lkb_resource, RSB_RECOVER_CONVERT);
		unhold_lkb(lkb);

	} else if (lkb->lkb_rqmode >= lkb->lkb_grmode) {
		lkb->lkb_flags |= DLM_IFL_RESEND;
	}

	/* lkb->lkb_rqmode < lkb->lkb_grmode shouldn't happen since down
	   conversions are async; there's no reply from the remote master */
}
3180
3181/* A waiting lkb needs recovery if the master node has failed, or
3182 the master node is changing (only when no directory is used) */
3183
3184static int waiter_needs_recovery(struct dlm_ls *ls, struct dlm_lkb *lkb)
3185{
3186 if (dlm_is_removed(ls, lkb->lkb_nodeid))
3187 return 1;
3188
3189 if (!dlm_no_directory(ls))
3190 return 0;
3191
3192 if (dlm_dir_nodeid(lkb->lkb_resource) != lkb->lkb_nodeid)
3193 return 1;
3194
3195 return 0;
3196}
3197
3198/* Recovery for locks that are waiting for replies from nodes that are now
3199 gone. We can just complete unlocks and cancels by faking a reply from the
3200 dead node. Requests and up-conversions we flag to be resent after
3201 recovery. Down-conversions can just be completed with a fake reply like
3202 unlocks. Conversions between PR and CW need special attention. */
3203
/* First phase of waiter recovery: walk every lkb waiting for a remote
   reply and either flag it for resend after recovery or complete it
   locally with a faked reply from the dead node (see the block comment
   above).  Runs with locking stopped for recovery. */
void dlm_recover_waiters_pre(struct dlm_ls *ls)
{
	struct dlm_lkb *lkb, *safe;

	mutex_lock(&ls->ls_waiters_mutex);

	list_for_each_entry_safe(lkb, safe, &ls->ls_waiters, lkb_wait_reply) {
		log_debug(ls, "pre recover waiter lkid %x type %d flags %x",
			  lkb->lkb_id, lkb->lkb_wait_type, lkb->lkb_flags);

		/* all outstanding lookups, regardless of destination will be
		   resent after recovery is done */

		if (lkb->lkb_wait_type == DLM_MSG_LOOKUP) {
			lkb->lkb_flags |= DLM_IFL_RESEND;
			continue;
		}

		if (!waiter_needs_recovery(ls, lkb))
			continue;

		switch (lkb->lkb_wait_type) {

		case DLM_MSG_REQUEST:
			/* requests are simply resent after recovery */
			lkb->lkb_flags |= DLM_IFL_RESEND;
			break;

		case DLM_MSG_CONVERT:
			recover_convert_waiter(ls, lkb);
			break;

		case DLM_MSG_UNLOCK:
			/* complete the unlock with a faked reply; the extra
			   hold/put brackets the reference the fake reply
			   path consumes */
			hold_lkb(lkb);
			ls->ls_stub_ms.m_result = -DLM_EUNLOCK;
			_remove_from_waiters(lkb);
			_receive_unlock_reply(lkb, &ls->ls_stub_ms);
			put_lkb(lkb);
			break;

		case DLM_MSG_CANCEL:
			/* complete the cancel with a faked reply */
			hold_lkb(lkb);
			ls->ls_stub_ms.m_result = -DLM_ECANCEL;
			_remove_from_waiters(lkb);
			_receive_cancel_reply(lkb, &ls->ls_stub_ms);
			put_lkb(lkb);
			break;

		default:
			log_error(ls, "invalid lkb wait_type %d",
				  lkb->lkb_wait_type);
		}
	}
	mutex_unlock(&ls->ls_waiters_mutex);
}
3258
3259static int remove_resend_waiter(struct dlm_ls *ls, struct dlm_lkb **lkb_ret)
3260{
3261 struct dlm_lkb *lkb;
3262 int rv = 0;
3263
3264 mutex_lock(&ls->ls_waiters_mutex);
3265 list_for_each_entry(lkb, &ls->ls_waiters, lkb_wait_reply) {
3266 if (lkb->lkb_flags & DLM_IFL_RESEND) {
3267 rv = lkb->lkb_wait_type;
3268 _remove_from_waiters(lkb);
3269 lkb->lkb_flags &= ~DLM_IFL_RESEND;
3270 break;
3271 }
3272 }
3273 mutex_unlock(&ls->ls_waiters_mutex);
3274
3275 if (!rv)
3276 lkb = NULL;
3277 *lkb_ret = lkb;
3278 return rv;
3279}
3280
3281/* Deal with lookups and lkb's marked RESEND from _pre. We may now be the
3282 master or dir-node for r. Processing the lkb may result in it being placed
3283 back on waiters. */
3284
/* Second phase of waiter recovery: resend every operation flagged
   RESEND by _pre.  We may now be the master or dir node for r ourselves.
   Processing an lkb may put it straight back on the waiters list.
   Returns 0, or -EINTR if another recovery started mid-walk. */
int dlm_recover_waiters_post(struct dlm_ls *ls)
{
	struct dlm_lkb *lkb;
	struct dlm_rsb *r;
	int error = 0, mstype;

	while (1) {
		/* abort if a new recovery cycle has begun */
		if (dlm_locking_stopped(ls)) {
			log_debug(ls, "recover_waiters_post aborted");
			error = -EINTR;
			break;
		}

		mstype = remove_resend_waiter(ls, &lkb);
		if (!mstype)
			break;

		r = lkb->lkb_resource;

		log_debug(ls, "recover_waiters_post %x type %d flags %x %s",
			  lkb->lkb_id, mstype, lkb->lkb_flags, r->res_name);

		switch (mstype) {

		case DLM_MSG_LOOKUP:
			/* redo the lookup/request; if we've become the
			   master, settle any other waiting lookups too */
			hold_rsb(r);
			lock_rsb(r);
			_request_lock(r, lkb);
			if (is_master(r))
				confirm_master(r, 0);
			unlock_rsb(r);
			put_rsb(r);
			break;

		case DLM_MSG_REQUEST:
			hold_rsb(r);
			lock_rsb(r);
			_request_lock(r, lkb);
			unlock_rsb(r);
			put_rsb(r);
			break;

		case DLM_MSG_CONVERT:
			hold_rsb(r);
			lock_rsb(r);
			_convert_lock(r, lkb);
			unlock_rsb(r);
			put_rsb(r);
			break;

		default:
			log_error(ls, "recover_waiters_post type %d", mstype);
		}
	}

	return error;
}
3342
3343static void purge_queue(struct dlm_rsb *r, struct list_head *queue,
3344 int (*test)(struct dlm_ls *ls, struct dlm_lkb *lkb))
3345{
3346 struct dlm_ls *ls = r->res_ls;
3347 struct dlm_lkb *lkb, *safe;
3348
3349 list_for_each_entry_safe(lkb, safe, queue, lkb_statequeue) {
3350 if (test(ls, lkb)) {
3351 del_lkb(r, lkb);
3352 /* this put should free the lkb */
3353 if (!put_lkb(lkb))
3354 log_error(ls, "purged lkb not released");
3355 }
3356 }
3357}
3358
3359static int purge_dead_test(struct dlm_ls *ls, struct dlm_lkb *lkb)
3360{
3361 return (is_master_copy(lkb) && dlm_is_removed(ls, lkb->lkb_nodeid));
3362}
3363
/* predicate: any master copy, regardless of its holder's state */
static int purge_mstcpy_test(struct dlm_ls *ls, struct dlm_lkb *lkb)
{
	return is_master_copy(lkb) ? 1 : 0;
}
3368
3369static void purge_dead_locks(struct dlm_rsb *r)
3370{
3371 purge_queue(r, &r->res_grantqueue, &purge_dead_test);
3372 purge_queue(r, &r->res_convertqueue, &purge_dead_test);
3373 purge_queue(r, &r->res_waitqueue, &purge_dead_test);
3374}
3375
3376void dlm_purge_mstcpy_locks(struct dlm_rsb *r)
3377{
3378 purge_queue(r, &r->res_grantqueue, &purge_mstcpy_test);
3379 purge_queue(r, &r->res_convertqueue, &purge_mstcpy_test);
3380 purge_queue(r, &r->res_waitqueue, &purge_mstcpy_test);
3381}
3382
3383/* Get rid of locks held by nodes that are gone. */
3384
/* Walk every root rsb we master and get rid of locks held by nodes
   that are gone.  Always returns 0. */
int dlm_purge_locks(struct dlm_ls *ls)
{
	struct dlm_rsb *r;

	log_debug(ls, "dlm_purge_locks");

	down_write(&ls->ls_root_sem);
	list_for_each_entry(r, &ls->ls_root_list, res_root_list) {
		hold_rsb(r);
		lock_rsb(r);
		if (is_master(r))
			purge_dead_locks(r);
		unlock_rsb(r);
		unhold_rsb(r);

		/* yield: the root list can be long */
		schedule();
	}
	up_write(&ls->ls_root_sem);

	return 0;
}
3406
/* After purging dead locks, pending locks on rsbs we master may have
   become grantable; walk the whole rsb hash table granting them.
   Always returns 0. */
int dlm_grant_after_purge(struct dlm_ls *ls)
{
	struct dlm_rsb *r;
	int i;

	for (i = 0; i < ls->ls_rsbtbl_size; i++) {
		read_lock(&ls->ls_rsbtbl[i].lock);
		list_for_each_entry(r, &ls->ls_rsbtbl[i].list, res_hashchain) {
			hold_rsb(r);
			lock_rsb(r);
			if (is_master(r)) {
				grant_pending_locks(r);
				confirm_master(r, 0);
			}
			unlock_rsb(r);
			put_rsb(r);
		}
		read_unlock(&ls->ls_rsbtbl[i].lock);
	}

	return 0;
}
3429
3430static struct dlm_lkb *search_remid_list(struct list_head *head, int nodeid,
3431 uint32_t remid)
3432{
3433 struct dlm_lkb *lkb;
3434
3435 list_for_each_entry(lkb, head, lkb_statequeue) {
3436 if (lkb->lkb_nodeid == nodeid && lkb->lkb_remid == remid)
3437 return lkb;
3438 }
3439 return NULL;
3440}
3441
3442static struct dlm_lkb *search_remid(struct dlm_rsb *r, int nodeid,
3443 uint32_t remid)
3444{
3445 struct dlm_lkb *lkb;
3446
3447 lkb = search_remid_list(&r->res_grantqueue, nodeid, remid);
3448 if (lkb)
3449 return lkb;
3450 lkb = search_remid_list(&r->res_convertqueue, nodeid, remid);
3451 if (lkb)
3452 return lkb;
3453 lkb = search_remid_list(&r->res_waitqueue, nodeid, remid);
3454 if (lkb)
3455 return lkb;
3456 return NULL;
3457}
3458
/* Initialize a new master-copy lkb from the rcom_lock image a departing
   master's lock holder sent us during recovery.
   Returns 0 or -ENOMEM. */
static int receive_rcom_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
				  struct dlm_rsb *r, struct dlm_rcom *rc)
{
	struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf;
	int lvblen;

	lkb->lkb_nodeid = rc->rc_header.h_nodeid;
	lkb->lkb_ownpid = rl->rl_ownpid;
	lkb->lkb_remid = rl->rl_lkid;
	lkb->lkb_exflags = rl->rl_exflags;
	/* only the low 16 flag bits are meaningful off the wire */
	lkb->lkb_flags = rl->rl_flags & 0x0000FFFF;
	lkb->lkb_flags |= DLM_IFL_MSTCPY;
	lkb->lkb_lvbseq = rl->rl_lvbseq;
	lkb->lkb_rqmode = rl->rl_rqmode;
	lkb->lkb_grmode = rl->rl_grmode;
	/* don't set lkb_status because add_lkb wants to itself */

	/* record only whether the holder registered each callback */
	lkb->lkb_bastaddr = (void *) (long) (rl->rl_asts & AST_BAST);
	lkb->lkb_astaddr = (void *) (long) (rl->rl_asts & AST_COMP);

	if (lkb->lkb_flags & DLM_IFL_RANGE) {
		lkb->lkb_range = allocate_range(ls);
		if (!lkb->lkb_range)
			return -ENOMEM;
		/* both GR and RQ ranges travel in the rcom image */
		memcpy(lkb->lkb_range, rl->rl_range, 4*sizeof(uint64_t));
	}

	if (lkb->lkb_exflags & DLM_LKF_VALBLK) {
		lkb->lkb_lvbptr = allocate_lvb(ls);
		if (!lkb->lkb_lvbptr)
			return -ENOMEM;
		/* NOTE(review): lvblen comes from the message header and is
		   not clamped to ls_lvblen before the memcpy -- confirm the
		   sender can never make it exceed the allocated lvb size */
		lvblen = rc->rc_header.h_length - sizeof(struct dlm_rcom) -
			 sizeof(struct rcom_lock);
		memcpy(lkb->lkb_lvbptr, rl->rl_lvb, lvblen);
	}

	/* Conversions between PR and CW (middle modes) need special handling.
	   The real granted mode of these converting locks cannot be determined
	   until all locks have been rebuilt on the rsb (recover_conversion) */

	if (rl->rl_wait_type == DLM_MSG_CONVERT && middle_conversion(lkb)) {
		rl->rl_status = DLM_LKSTS_CONVERT;
		lkb->lkb_grmode = DLM_LOCK_IV;
		rsb_set_flag(r, RSB_RECOVER_CONVERT);
	}

	return 0;
}
3507
3508/* This lkb may have been recovered in a previous aborted recovery so we need
3509 to check if the rsb already has an lkb with the given remote nodeid/lkid.
3510 If so we just send back a standard reply. If not, we create a new lkb with
3511 the given values and send back our lkid. We send back our lkid by sending
3512 back the rcom_lock struct we got but with the remid field filled in. */
3513
/* Rebuild a master copy of a remote holder's lock from its rcom image
   (we are the new master).  The lkb may already exist from a previous
   aborted recovery, in which case we just return the standard reply.
   Our lkid goes back to the holder in rl_remid; the status goes back in
   rl_result.  Returns 0 or -errno (also stored in rl_result). */
int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc)
{
	struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf;
	struct dlm_rsb *r;
	struct dlm_lkb *lkb;
	int error;

	/* parent (hierarchical) locks are not supported */
	if (rl->rl_parent_lkid) {
		error = -EOPNOTSUPP;
		goto out;
	}

	error = find_rsb(ls, rl->rl_name, rl->rl_namelen, R_MASTER, &r);
	if (error)
		goto out;

	lock_rsb(r);

	/* a previous aborted recovery may already have rebuilt this lock */
	lkb = search_remid(r, rc->rc_header.h_nodeid, rl->rl_lkid);
	if (lkb) {
		error = -EEXIST;
		goto out_remid;
	}

	error = create_lkb(ls, &lkb);
	if (error)
		goto out_unlock;

	error = receive_rcom_lock_args(ls, lkb, r, rc);
	if (error) {
		put_lkb(lkb);
		goto out_unlock;
	}

	attach_lkb(r, lkb);
	add_lkb(r, lkb, rl->rl_status);
	error = 0;

 out_remid:
	/* this is the new value returned to the lock holder for
	   saving in its process-copy lkb */
	rl->rl_remid = lkb->lkb_id;

 out_unlock:
	unlock_rsb(r);
	put_rsb(r);
 out:
	if (error)
		log_print("recover_master_copy %d %x", error, rl->rl_lkid);
	rl->rl_result = error;
	return error;
}
3566
/* Process the new master's answer to the lock image we sent it in
   dlm_recover_master_copy(): record the master's lkid in our process
   copy (both 0 and -EEXIST carry a valid rl_remid) and acknowledge the
   lock so dlm_recover_locks() can stop waiting for it. */
int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc)
{
	struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf;
	struct dlm_rsb *r;
	struct dlm_lkb *lkb;
	int error;

	error = find_lkb(ls, rl->rl_lkid, &lkb);
	if (error) {
		log_error(ls, "recover_process_copy no lkid %x", rl->rl_lkid);
		return error;
	}

	DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););

	/* the status dlm_recover_master_copy() stored on the master */
	error = rl->rl_result;

	r = lkb->lkb_resource;
	hold_rsb(r);
	lock_rsb(r);

	switch (error) {
	case -EEXIST:
		log_debug(ls, "master copy exists %x", lkb->lkb_id);
		/* fall through */
	case 0:
		lkb->lkb_remid = rl->rl_remid;
		break;
	default:
		log_error(ls, "dlm_recover_process_copy unknown error %d %x",
			  error, lkb->lkb_id);
	}

	/* an ack for dlm_recover_locks() which waits for replies from
	   all the locks it sends to new masters */
	dlm_recovered_lock(r);

	unlock_rsb(r);
	put_rsb(r);
	put_lkb(lkb);

	return 0;
}
3610
diff --git a/fs/dlm/lock.h b/fs/dlm/lock.h
new file mode 100644
index 000000000000..bffab9c88b1d
--- /dev/null
+++ b/fs/dlm/lock.h
@@ -0,0 +1,50 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13#ifndef __LOCK_DOT_H__
14#define __LOCK_DOT_H__
15
/* rsb/lkb lookup, refcounting and message dispatch (implemented in lock.c) */
16void dlm_print_rsb(struct dlm_rsb *r);
17int dlm_receive_message(struct dlm_header *hd, int nodeid, int recovery);
18int dlm_modes_compat(int mode1, int mode2);
19int dlm_find_rsb(struct dlm_ls *ls, char *name, int namelen,
20	unsigned int flags, struct dlm_rsb **r_ret);
21void dlm_put_rsb(struct dlm_rsb *r);
22void dlm_hold_rsb(struct dlm_rsb *r);
23int dlm_put_lkb(struct dlm_lkb *lkb);
24void dlm_scan_rsbs(struct dlm_ls *ls);
25
/* recovery-time lock handling, called from the recovery paths */
26int dlm_purge_locks(struct dlm_ls *ls);
27void dlm_purge_mstcpy_locks(struct dlm_rsb *r);
28int dlm_grant_after_purge(struct dlm_ls *ls);
29int dlm_recover_waiters_post(struct dlm_ls *ls);
30void dlm_recover_waiters_pre(struct dlm_ls *ls);
31int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc);
32int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc);
33
/* true when this node masters r: res_nodeid == 0 means "local master" */
34static inline int is_master(struct dlm_rsb *r)
35{
36	return !r->res_nodeid;
37}
38
/* serialize access to an rsb and its lock queues via res_mutex */
39static inline void lock_rsb(struct dlm_rsb *r)
40{
41	mutex_lock(&r->res_mutex);
42}
43
44static inline void unlock_rsb(struct dlm_rsb *r)
45{
46	mutex_unlock(&r->res_mutex);
47}
48
49#endif
50
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
new file mode 100644
index 000000000000..d2ff505d51cd
--- /dev/null
+++ b/fs/dlm/lockspace.c
@@ -0,0 +1,665 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#include "dlm_internal.h"
15#include "lockspace.h"
16#include "member.h"
17#include "recoverd.h"
18#include "ast.h"
19#include "dir.h"
20#include "lowcomms.h"
21#include "config.h"
22#include "memory.h"
23#include "lock.h"
24
25#ifdef CONFIG_DLM_DEBUG
26int dlm_create_debug_file(struct dlm_ls *ls);
27void dlm_delete_debug_file(struct dlm_ls *ls);
28#else
29static inline int dlm_create_debug_file(struct dlm_ls *ls) { return 0; }
30static inline void dlm_delete_debug_file(struct dlm_ls *ls) { }
31#endif
32
33static int ls_count;
34static struct mutex ls_lock;
35static struct list_head lslist;
36static spinlock_t lslist_lock;
37static struct task_struct * scand_task;
38
39
/* sysfs "control" file: userspace writes 0 to stop the lockspace
   (dlm_ls_stop) or 1 to (re)start it (dlm_ls_start); anything else
   is rejected with -EINVAL */
40static ssize_t dlm_control_store(struct dlm_ls *ls, const char *buf, size_t len)
41{
42	ssize_t ret = len;
43	int n = simple_strtol(buf, NULL, 0);
44
45	switch (n) {
46	case 0:
47		dlm_ls_stop(ls);
48		break;
49	case 1:
50		dlm_ls_start(ls);
51		break;
52	default:
53		ret = -EINVAL;
54	}
55	return ret;
56}
57
/* sysfs "event_done" file: userspace reports the result of a uevent it
   handled; store it and wake the waiter in do_uevent() */
58static ssize_t dlm_event_store(struct dlm_ls *ls, const char *buf, size_t len)
59{
60	ls->ls_uevent_result = simple_strtol(buf, NULL, 0);
61	set_bit(LSFL_UEVENT_WAIT, &ls->ls_flags);
62	wake_up(&ls->ls_uevent_wait);
63	return len;
64}
65
/* sysfs "id" file: read or write the lockspace's cluster-wide global id */
66static ssize_t dlm_id_show(struct dlm_ls *ls, char *buf)
67{
68	return sprintf(buf, "%u\n", ls->ls_global_id);
69}
70
71static ssize_t dlm_id_store(struct dlm_ls *ls, const char *buf, size_t len)
72{
73	ls->ls_global_id = simple_strtoul(buf, NULL, 0);
74	return len;
75}
76
/* per-lockspace sysfs attribute: pairs an attribute with show/store
   callbacks that take the dlm_ls directly */
77struct dlm_attr {
78	struct attribute attr;
79	ssize_t (*show)(struct dlm_ls *, char *);
80	ssize_t (*store)(struct dlm_ls *, const char *, size_t);
81};
82
83static struct dlm_attr dlm_attr_control = {
84	.attr  = {.name = "control", .mode = S_IWUSR},
85	.store = dlm_control_store
86};
87
88static struct dlm_attr dlm_attr_event = {
89	.attr  = {.name = "event_done", .mode = S_IWUSR},
90	.store = dlm_event_store
91};
92
93static struct dlm_attr dlm_attr_id = {
94	.attr  = {.name = "id", .mode = S_IRUGO | S_IWUSR},
95	.show  = dlm_id_show,
96	.store = dlm_id_store
97};
98
/* NULL-terminated list handed to the kobject as default attributes */
99static struct attribute *dlm_attrs[] = {
100	&dlm_attr_control.attr,
101	&dlm_attr_event.attr,
102	&dlm_attr_id.attr,
103	NULL,
104};
105
/* generic sysfs show/store: recover the dlm_ls and dlm_attr from the
   embedded kobject/attribute and dispatch to the per-attr callback */
106static ssize_t dlm_attr_show(struct kobject *kobj, struct attribute *attr,
107			     char *buf)
108{
109	struct dlm_ls *ls  = container_of(kobj, struct dlm_ls, ls_kobj);
110	struct dlm_attr *a = container_of(attr, struct dlm_attr, attr);
111	return a->show ? a->show(ls, buf) : 0;
112}
113
114static ssize_t dlm_attr_store(struct kobject *kobj, struct attribute *attr,
115			      const char *buf, size_t len)
116{
117	struct dlm_ls *ls  = container_of(kobj, struct dlm_ls, ls_kobj);
118	struct dlm_attr *a = container_of(attr, struct dlm_attr, attr);
119	return a->store ? a->store(ls, buf, len) : len;
120}
121
122static struct sysfs_ops dlm_attr_ops = {
123	.show  = dlm_attr_show,
124	.store = dlm_attr_store,
125};
126
127static struct kobj_type dlm_ktype = {
128	.default_attrs = dlm_attrs,
129	.sysfs_ops     = &dlm_attr_ops,
130};
131
/* the "dlm" kset under the kernel subsystem; lockspace kobjects live here */
132static struct kset dlm_kset = {
133	.subsys = &kernel_subsys,
134	.kobj   = {.name = "dlm",},
135	.ktype  = &dlm_ktype,
136};
137
/* Initialize (but do not register) the lockspace kobject: name it after
   the lockspace and attach it to the dlm kset/ktype. */
138static int kobject_setup(struct dlm_ls *ls)
139{
140	char lsname[DLM_LOCKSPACE_LEN];
141	int error;
142
143	memset(lsname, 0, DLM_LOCKSPACE_LEN);
144	snprintf(lsname, DLM_LOCKSPACE_LEN, "%s", ls->ls_name);
145
146	error = kobject_set_name(&ls->ls_kobj, "%s", lsname);
147	if (error)
148		return error;
149
150	ls->ls_kobj.kset = &dlm_kset;
151	ls->ls_kobj.ktype = &dlm_ktype;
152	return 0;
153}
154
/* Emit an ONLINE (in=1) or OFFLINE (in=0) uevent for the lockspace and
   block until userspace acks via the "event_done" sysfs file (which sets
   LSFL_UEVENT_WAIT).  Returns the userspace-supplied result, or the
   -ERESTARTSYS-style error if the wait was interrupted. */
155static int do_uevent(struct dlm_ls *ls, int in)
156{
157	int error;
158
159	if (in)
160		kobject_uevent(&ls->ls_kobj, KOBJ_ONLINE);
161	else
162		kobject_uevent(&ls->ls_kobj, KOBJ_OFFLINE);
163
164	error = wait_event_interruptible(ls->ls_uevent_wait,
165			test_and_clear_bit(LSFL_UEVENT_WAIT, &ls->ls_flags));
166	if (error)
167		goto out;
168
169	error = ls->ls_uevent_result;
170 out:
171	return error;
172}
173
174
/* Module init for the lockspace layer: reset global state and register
   the "dlm" kset in sysfs.  dlm_lockspace_exit() undoes the latter. */
175int dlm_lockspace_init(void)
176{
177	int error;
178
179	ls_count = 0;
180	mutex_init(&ls_lock);
181	INIT_LIST_HEAD(&lslist);
182	spin_lock_init(&lslist_lock);
183
184	error = kset_register(&dlm_kset);
185	if (error)
186		printk("dlm_lockspace_init: cannot register kset %d\n", error);
187	return error;
188}
189
190void dlm_lockspace_exit(void)
191{
192	kset_unregister(&dlm_kset);
193}
194
/* Kthread body: periodically (every dlm_config.scan_secs) walk all
   lockspaces and let dlm_scan_rsbs() age out tossed rsbs.
   NOTE(review): lslist is traversed here without taking lslist_lock --
   appears to rely on lockspace add/remove timing; verify. */
195static int dlm_scand(void *data)
196{
197	struct dlm_ls *ls;
198
199	while (!kthread_should_stop()) {
200		list_for_each_entry(ls, &lslist, ls_list)
201			dlm_scan_rsbs(ls);
202		schedule_timeout_interruptible(dlm_config.scan_secs * HZ);
203	}
204	return 0;
205}
206
/* Start/stop the single global scanning thread. */
207static int dlm_scand_start(void)
208{
209	struct task_struct *p;
210	int error = 0;
211
212	p = kthread_run(dlm_scand, NULL, "dlm_scand");
213	if (IS_ERR(p))
214		error = PTR_ERR(p);
215	else
216		scand_task = p;
217	return error;
218}
219
220static void dlm_scand_stop(void)
221{
222	kthread_stop(scand_task);
223}
224
/* Look up a lockspace by name; returns it WITHOUT taking a reference
   (only used under ls_lock during creation), or NULL if not found. */
225static struct dlm_ls *dlm_find_lockspace_name(char *name, int namelen)
226{
227	struct dlm_ls *ls;
228
229	spin_lock(&lslist_lock);
230
231	list_for_each_entry(ls, &lslist, ls_list) {
232		if (ls->ls_namelen == namelen &&
233		    memcmp(ls->ls_name, name, namelen) == 0)
234			goto out;
235	}
236	ls = NULL;
237 out:
238	spin_unlock(&lslist_lock);
239	return ls;
240}
241
/* Look up by global id; bumps ls_count so the caller must balance with
   dlm_put_lockspace().  Returns NULL if no lockspace has that id. */
242struct dlm_ls *dlm_find_lockspace_global(uint32_t id)
243{
244	struct dlm_ls *ls;
245
246	spin_lock(&lslist_lock);
247
248	list_for_each_entry(ls, &lslist, ls_list) {
249		if (ls->ls_global_id == id) {
250			ls->ls_count++;
251			goto out;
252		}
253	}
254	ls = NULL;
255 out:
256	spin_unlock(&lslist_lock);
257	return ls;
258}
259
/* The "local" handle is the dlm_ls pointer itself; just take a ref. */
260struct dlm_ls *dlm_find_lockspace_local(void *id)
261{
262	struct dlm_ls *ls = id;
263
264	spin_lock(&lslist_lock);
265	ls->ls_count++;
266	spin_unlock(&lslist_lock);
267	return ls;
268}
269
/* Drop a reference taken by one of the find functions above. */
270void dlm_put_lockspace(struct dlm_ls *ls)
271{
272	spin_lock(&lslist_lock);
273	ls->ls_count--;
274	spin_unlock(&lslist_lock);
275}
276
/* Unlink the lockspace from lslist once its refcount drops to zero,
   polling once a second until outstanding references are released. */
277static void remove_lockspace(struct dlm_ls *ls)
278{
279	for (;;) {
280		spin_lock(&lslist_lock);
281		if (ls->ls_count == 0) {
282			list_del(&ls->ls_list);
283			spin_unlock(&lslist_lock);
284			return;
285		}
286		spin_unlock(&lslist_lock);
287		ssleep(1);
288	}
289}
290
/* Start the global dlm daemons (astd, scand, lowcomms) in order,
   unwinding the ones already started if a later one fails.
   threads_stop() tears all three down. */
291static int threads_start(void)
292{
293	int error;
294
295	/* Thread which processes lock requests for all lockspaces */
296	error = dlm_astd_start();
297	if (error) {
298		log_print("cannot start dlm_astd thread %d", error);
299		goto fail;
300	}
301
302	error = dlm_scand_start();
303	if (error) {
304		log_print("cannot start dlm_scand thread %d", error);
305		goto astd_fail;
306	}
307
308	/* Thread for sending/receiving messages for all lockspaces */
309	error = dlm_lowcomms_start();
310	if (error) {
311		log_print("cannot start dlm lowcomms %d", error);
312		goto scand_fail;
313	}
314
315	return 0;
316
317 scand_fail:
318	dlm_scand_stop();
319 astd_fail:
320	dlm_astd_stop();
321 fail:
322	return error;
323}
324
325static void threads_stop(void)
326{
327	dlm_scand_stop();
328	dlm_lowcomms_stop();
329	dlm_astd_stop();
330}
331
/* Create and bring up a new lockspace: allocate the dlm_ls plus its
   rsb/lkb/dir hash tables and recovery state, start recoverd, link it
   into lslist, register its sysfs kobject, and wait for userspace to
   complete the join via an ONLINE uevent.  Returns 0 with *lockspace
   set, -EEXIST if the name already exists (also setting *lockspace),
   or a negative error after unwinding via the goto chain below. */
332static int new_lockspace(char *name, int namelen, void **lockspace,
333			 uint32_t flags, int lvblen)
334{
335	struct dlm_ls *ls;
336	int i, size, error = -ENOMEM;
337
338	if (namelen > DLM_LOCKSPACE_LEN)
339		return -EINVAL;
340
	/* lvb length must be a non-zero multiple of 8 */
341	if (!lvblen || (lvblen % 8))
342		return -EINVAL;
343
344	if (!try_module_get(THIS_MODULE))
345		return -EINVAL;
346
347	ls = dlm_find_lockspace_name(name, namelen);
348	if (ls) {
349		*lockspace = ls;
350		module_put(THIS_MODULE);
351		return -EEXIST;
352	}
353
	/* name is stored in trailing space allocated past the struct */
354	ls = kzalloc(sizeof(struct dlm_ls) + namelen, GFP_KERNEL);
355	if (!ls)
356		goto out;
357	memcpy(ls->ls_name, name, namelen);
358	ls->ls_namelen = namelen;
359	ls->ls_exflags = flags;
360	ls->ls_lvblen = lvblen;
361	ls->ls_count = 0;
362	ls->ls_flags = 0;
363
	/* rsb hash table: active list, toss list, rwlock per bucket */
364	size = dlm_config.rsbtbl_size;
365	ls->ls_rsbtbl_size = size;
366
367	ls->ls_rsbtbl = kmalloc(sizeof(struct dlm_rsbtable) * size, GFP_KERNEL);
368	if (!ls->ls_rsbtbl)
369		goto out_lsfree;
370	for (i = 0; i < size; i++) {
371		INIT_LIST_HEAD(&ls->ls_rsbtbl[i].list);
372		INIT_LIST_HEAD(&ls->ls_rsbtbl[i].toss);
373		rwlock_init(&ls->ls_rsbtbl[i].lock);
374	}
375
	/* lkb hash table; counter seeds per-bucket lkid generation */
376	size = dlm_config.lkbtbl_size;
377	ls->ls_lkbtbl_size = size;
378
379	ls->ls_lkbtbl = kmalloc(sizeof(struct dlm_lkbtable) * size, GFP_KERNEL);
380	if (!ls->ls_lkbtbl)
381		goto out_rsbfree;
382	for (i = 0; i < size; i++) {
383		INIT_LIST_HEAD(&ls->ls_lkbtbl[i].list);
384		rwlock_init(&ls->ls_lkbtbl[i].lock);
385		ls->ls_lkbtbl[i].counter = 1;
386	}
387
	/* resource directory hash table */
388	size = dlm_config.dirtbl_size;
389	ls->ls_dirtbl_size = size;
390
391	ls->ls_dirtbl = kmalloc(sizeof(struct dlm_dirtable) * size, GFP_KERNEL);
392	if (!ls->ls_dirtbl)
393		goto out_lkbfree;
394	for (i = 0; i < size; i++) {
395		INIT_LIST_HEAD(&ls->ls_dirtbl[i].list);
396		rwlock_init(&ls->ls_dirtbl[i].lock);
397	}
398
399	INIT_LIST_HEAD(&ls->ls_waiters);
400	mutex_init(&ls->ls_waiters_mutex);
401
402	INIT_LIST_HEAD(&ls->ls_nodes);
403	INIT_LIST_HEAD(&ls->ls_nodes_gone);
404	ls->ls_num_nodes = 0;
405	ls->ls_low_nodeid = 0;
406	ls->ls_total_weight = 0;
407	ls->ls_node_array = NULL;
408
409	memset(&ls->ls_stub_rsb, 0, sizeof(struct dlm_rsb));
410	ls->ls_stub_rsb.res_ls = ls;
411
412	ls->ls_debug_dentry = NULL;
413
414	init_waitqueue_head(&ls->ls_uevent_wait);
415	ls->ls_uevent_result = 0;
416
417	ls->ls_recoverd_task = NULL;
418	mutex_init(&ls->ls_recoverd_active);
419	spin_lock_init(&ls->ls_recover_lock);
420	ls->ls_recover_status = 0;
421	ls->ls_recover_seq = 0;
422	ls->ls_recover_args = NULL;
423	init_rwsem(&ls->ls_in_recovery);
424	INIT_LIST_HEAD(&ls->ls_requestqueue);
425	mutex_init(&ls->ls_requestqueue_mutex);
426
427	ls->ls_recover_buf = kmalloc(dlm_config.buffer_size, GFP_KERNEL);
428	if (!ls->ls_recover_buf)
429		goto out_dirfree;
430
431	INIT_LIST_HEAD(&ls->ls_recover_list);
432	spin_lock_init(&ls->ls_recover_list_lock);
433	ls->ls_recover_list_count = 0;
434	init_waitqueue_head(&ls->ls_wait_general);
435	INIT_LIST_HEAD(&ls->ls_root_list);
436	init_rwsem(&ls->ls_root_sem);
437
	/* hold in-recovery write-locked; the first recovery releases it */
438	down_write(&ls->ls_in_recovery);
439
440	error = dlm_recoverd_start(ls);
441	if (error) {
442		log_error(ls, "can't start dlm_recoverd %d", error);
443		goto out_rcomfree;
444	}
445
446	spin_lock(&lslist_lock);
447	list_add(&ls->ls_list, &lslist);
448	spin_unlock(&lslist_lock);
449
450	dlm_create_debug_file(ls);
451
452	error = kobject_setup(ls);
453	if (error)
454		goto out_del;
455
456	error = kobject_register(&ls->ls_kobj);
457	if (error)
458		goto out_del;
459
	/* block until userspace finishes the join and acks ONLINE */
460	error = do_uevent(ls, 1);
461	if (error)
462		goto out_unreg;
463
464	*lockspace = ls;
465	return 0;
466
 /* error unwinding: each label frees what was set up above it */
467 out_unreg:
468	kobject_unregister(&ls->ls_kobj);
469 out_del:
470	dlm_delete_debug_file(ls);
471	spin_lock(&lslist_lock);
472	list_del(&ls->ls_list);
473	spin_unlock(&lslist_lock);
474	dlm_recoverd_stop(ls);
475 out_rcomfree:
476	kfree(ls->ls_recover_buf);
477 out_dirfree:
478	kfree(ls->ls_dirtbl);
479 out_lkbfree:
480	kfree(ls->ls_lkbtbl);
481 out_rsbfree:
482	kfree(ls->ls_rsbtbl);
483 out_lsfree:
484	kfree(ls);
485 out:
486	module_put(THIS_MODULE);
487	return error;
488}
489
/* Public entry point for creating a lockspace.  Under ls_lock: start
   the global daemons when the first lockspace is created, then create
   it; ls_count tracks how many lockspaces exist so the daemons can be
   stopped when the last one goes away. */
490int dlm_new_lockspace(char *name, int namelen, void **lockspace,
491		      uint32_t flags, int lvblen)
492{
493	int error = 0;
494
495	mutex_lock(&ls_lock);
496	if (!ls_count)
497		error = threads_start();
498	if (error)
499		goto out;
500
501	error = new_lockspace(name, namelen, lockspace, flags, lvblen);
502	if (!error)
503		ls_count++;
504 out:
505	mutex_unlock(&ls_lock);
506	return error;
507}
508
509/* Return 1 if the lockspace still has active remote locks,
510 * 2 if the lockspace still has active local locks.
511 */
512static int lockspace_busy(struct dlm_ls *ls)
513{
514	int i, lkb_found = 0;
515	struct dlm_lkb *lkb;
516
517	/* NOTE: We check the lockidtbl here rather than the resource table.
518	   This is because there may be LKBs queued as ASTs that have been
519	   unlinked from their RSBs and are pending deletion once the AST has
520	   been delivered */
521
522	for (i = 0; i < ls->ls_lkbtbl_size; i++) {
523		read_lock(&ls->ls_lkbtbl[i].lock);
524		if (!list_empty(&ls->ls_lkbtbl[i].list)) {
525			lkb_found = 1;
526			list_for_each_entry(lkb, &ls->ls_lkbtbl[i].list,
527					    lkb_idtbl_list) {
				/* lkb_nodeid == 0 means a locally-held lock */
528				if (!lkb->lkb_nodeid) {
529					read_unlock(&ls->ls_lkbtbl[i].lock);
530					return 2;
531				}
532			}
533		}
534		read_unlock(&ls->ls_lkbtbl[i].lock);
535	}
536	return lkb_found;
537}
538
/* Tear down a lockspace.  Refuses with -EBUSY when the lockspace is
   busier than the force level allows (see dlm_release_lockspace for the
   force semantics).  Otherwise: notify userspace (OFFLINE uevent unless
   force==3), stop recoverd, unlink from lslist, then free every lkb,
   rsb, table and member structure, and finally stop the global daemons
   if this was the last lockspace. */
539static int release_lockspace(struct dlm_ls *ls, int force)
540{
541	struct dlm_lkb *lkb;
542	struct dlm_rsb *rsb;
543	struct list_head *head;
544	int i;
545	int busy = lockspace_busy(ls);
546
547	if (busy > force)
548		return -EBUSY;
549
550	if (force < 3)
551		do_uevent(ls, 0);
552
553	dlm_recoverd_stop(ls);
554
555	remove_lockspace(ls);
556
557	dlm_delete_debug_file(ls);
558
	/* keep astd from touching lkbs while we free them below */
559	dlm_astd_suspend();
560
561	kfree(ls->ls_recover_buf);
562
563	/*
564	 * Free direntry structs.
565	 */
566
567	dlm_dir_clear(ls);
568	kfree(ls->ls_dirtbl);
569
570	/*
571	 * Free all lkb's on lkbtbl[] lists.
572	 */
573
574	for (i = 0; i < ls->ls_lkbtbl_size; i++) {
575		head = &ls->ls_lkbtbl[i].list;
576		while (!list_empty(head)) {
577			lkb = list_entry(head->next, struct dlm_lkb,
578					 lkb_idtbl_list);
579
580			list_del(&lkb->lkb_idtbl_list);
581
582			dlm_del_ast(lkb);
583
			/* only master-copy lkbs own their lvb buffer here */
584			if (lkb->lkb_lvbptr && lkb->lkb_flags & DLM_IFL_MSTCPY)
585				free_lvb(lkb->lkb_lvbptr);
586
587			free_lkb(lkb);
588		}
589	}
590	dlm_astd_resume();
591
592	kfree(ls->ls_lkbtbl);
593
594	/*
595	 * Free all rsb's on rsbtbl[] lists
596	 */
597
598	for (i = 0; i < ls->ls_rsbtbl_size; i++) {
599		head = &ls->ls_rsbtbl[i].list;
600		while (!list_empty(head)) {
601			rsb = list_entry(head->next, struct dlm_rsb,
602					 res_hashchain);
603
604			list_del(&rsb->res_hashchain);
605			free_rsb(rsb);
606		}
607
608		head = &ls->ls_rsbtbl[i].toss;
609		while (!list_empty(head)) {
610			rsb = list_entry(head->next, struct dlm_rsb,
611					 res_hashchain);
612			list_del(&rsb->res_hashchain);
613			free_rsb(rsb);
614		}
615	}
616
617	kfree(ls->ls_rsbtbl);
618
619	/*
620	 * Free structures on any other lists
621	 */
622
623	kfree(ls->ls_recover_args);
624	dlm_clear_free_entries(ls);
625	dlm_clear_members(ls);
626	dlm_clear_members_gone(ls);
627	kfree(ls->ls_node_array);
628	kobject_unregister(&ls->ls_kobj);
629	kfree(ls);
630
	/* last lockspace gone -> stop the global daemons */
631	mutex_lock(&ls_lock);
632	ls_count--;
633	if (!ls_count)
634		threads_stop();
635	mutex_unlock(&ls_lock);
636
637	module_put(THIS_MODULE);
638	return 0;
639}
640
641/*
642 * Called when a system has released all its locks and is not going to use the
643 * lockspace any longer. We free everything we're managing for this lockspace.
644 * Remaining nodes will go through the recovery process as if we'd died. The
645 * lockspace must continue to function as usual, participating in recoveries,
646 * until this returns.
647 *
648 * Force has 4 possible values:
649 * 0 - don't destroy lockspace if it has any LKBs
650 * 1 - destroy lockspace if it has remote LKBs but not if it has local LKBs
651 * 2 - destroy lockspace regardless of LKBs
652 * 3 - destroy lockspace as part of a forced shutdown
653 */
654
655int dlm_release_lockspace(void *lockspace, int force)
656{
657	struct dlm_ls *ls;
658
	/* validate the handle; drop the ref immediately since
	   release_lockspace() waits for ls_count to reach zero */
659	ls = dlm_find_lockspace_local(lockspace);
660	if (!ls)
661		return -EINVAL;
662	dlm_put_lockspace(ls);
663	return release_lockspace(ls, force);
664}
665
diff --git a/fs/dlm/lockspace.h b/fs/dlm/lockspace.h
new file mode 100644
index 000000000000..17bd3ba863a9
--- /dev/null
+++ b/fs/dlm/lockspace.h
@@ -0,0 +1,24 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#ifndef __LOCKSPACE_DOT_H__
15#define __LOCKSPACE_DOT_H__
16
/* lockspace lifetime and lookup (implemented in lockspace.c);
   the find functions take a reference balanced by dlm_put_lockspace */
17int dlm_lockspace_init(void);
18void dlm_lockspace_exit(void);
19struct dlm_ls *dlm_find_lockspace_global(uint32_t id);
20struct dlm_ls *dlm_find_lockspace_local(void *id);
21void dlm_put_lockspace(struct dlm_ls *ls);
22
23#endif				/* __LOCKSPACE_DOT_H__ */
24
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
new file mode 100644
index 000000000000..09b0124f7fc4
--- /dev/null
+++ b/fs/dlm/lowcomms.c
@@ -0,0 +1,1218 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14/*
15 * lowcomms.c
16 *
17 * This is the "low-level" comms layer.
18 *
19 * It is responsible for sending/receiving messages
20 * from other nodes in the cluster.
21 *
22 * Cluster nodes are referred to by their nodeids. nodeids are
23 * simply 32 bit numbers to the locking module - if they need to
24 * be expanded for the cluster infrastructure then that is its
25 * responsibility. It is this layer's
26 * responsibility to resolve these into IP address or
27 * whatever it needs for inter-node communication.
28 *
29 * The comms level is two kernel threads that deal mainly with
30 * the receiving of messages from other nodes and passing them
31 * up to the mid-level comms layer (which understands the
32 * message format) for execution by the locking core, and
33 * a send thread which does all the setting up of connections
34 * to remote nodes and the sending of data. Threads are not allowed
35 * to send their own data because it may cause them to wait in times
36 * of high load. Also, this way, the sending thread can collect together
37 * messages bound for one node and send them in one block.
38 *
39 * I don't see any problem with the recv thread executing the locking
40 * code on behalf of remote processes as the locking code is
41 * short, efficient and never (well, hardly ever) waits.
42 *
43 */
44
45#include <asm/ioctls.h>
46#include <net/sock.h>
47#include <net/tcp.h>
48#include <net/sctp/user.h>
49#include <linux/pagemap.h>
50#include <linux/socket.h>
51#include <linux/idr.h>
52
53#include "dlm_internal.h"
54#include "lowcomms.h"
55#include "config.h"
56#include "midcomms.h"
57
/* addresses this node listens on, filled from cluster config */
58static struct sockaddr_storage *local_addr[DLM_MAX_ADDR_COUNT];
59static int local_count;
60static int local_nodeid;
61
62/* One of these per connected node */
63
/* nodeinfo.flags bits */
64#define NI_INIT_PENDING 1
65#define NI_WRITE_PENDING 2
66
67struct nodeinfo {
68	spinlock_t lock;
69	sctp_assoc_t assoc_id;
70	unsigned long flags;
71	struct list_head write_list; /* nodes with pending writes */
72	struct list_head writequeue; /* outgoing writequeue_entries */
73	spinlock_t writequeue_lock;
74	int nodeid;
75};
76
/* nodeid -> nodeinfo map, guarded by nodeinfo_lock */
77static DEFINE_IDR(nodeinfo_idr);
78static struct rw_semaphore nodeinfo_lock;
79static int max_nodeid;
80
/* circular receive buffer bookkeeping (size is a power of two; mask = size-1) */
81struct cbuf {
82	unsigned base;
83	unsigned len;
84	unsigned mask;
85};
86
87/* Just the one of these, now. But this struct keeps
88   the connection-specific variables together */
89
90#define CF_READ_PENDING 1
91
92struct connection {
93	struct socket *sock;
94	unsigned long flags;
95	struct page *rx_page;
96	atomic_t waiting_requests;
97	struct cbuf cb;
98	int eagain_flag;
99};
100
101/* An entry waiting to be sent */
102
103struct writequeue_entry {
104	struct list_head list;
105	struct page *page;
106	int offset;
107	int len;
108	int end;
109	int users;
110	struct nodeinfo *ni;
111};
112
/* circular-buffer helpers over struct cbuf */
113#define CBUF_ADD(cb, n) do { (cb)->len += n; } while(0)
114#define CBUF_EMPTY(cb) ((cb)->len == 0)
115#define CBUF_MAY_ADD(cb, n) (((cb)->len + (n)) < ((cb)->mask + 1))
116#define CBUF_DATA(cb) (((cb)->base + (cb)->len) & (cb)->mask)
117
118#define CBUF_INIT(cb, size) \
119do { \
120	(cb)->base = (cb)->len = 0; \
121	(cb)->mask = ((size)-1); \
122} while(0)
123
124#define CBUF_EAT(cb, n) \
125do { \
126	(cb)->len -= (n); \
127	(cb)->base += (n); \
128	(cb)->base &= (cb)->mask; \
129} while(0)
130
131
132/* List of nodes which have writes pending */
133static struct list_head write_nodes;
134static spinlock_t write_nodes_lock;
135
136/* Maximum number of incoming messages to process before
137 * doing a schedule()
138 */
139#define MAX_RX_MSG_COUNT 25
140
141/* Manage daemons */
142static struct task_struct *recv_task;
143static struct task_struct *send_task;
144static wait_queue_head_t lowcomms_recv_wait;
145static atomic_t accepting;
146
147/* The SCTP connection */
148static struct connection sctp_con;
149
150
/* Resolve a nodeid to an IP address via the cluster config and copy just
   the address portion (v4 or v6, matching our local family) into retaddr.
   Returns 0 on success, -1 if no local addresses are configured, or the
   dlm_nodeid_to_addr() error. */
151static int nodeid_to_addr(int nodeid, struct sockaddr *retaddr)
152{
153	struct sockaddr_storage addr;
154	int error;
155
156	if (!local_count)
157		return -1;
158
159	error = dlm_nodeid_to_addr(nodeid, &addr);
160	if (error)
161		return error;
162
163	if (local_addr[0]->ss_family == AF_INET) {
164	        struct sockaddr_in *in4  = (struct sockaddr_in *) &addr;
165	        struct sockaddr_in *ret4 = (struct sockaddr_in *) retaddr;
166		ret4->sin_addr.s_addr = in4->sin_addr.s_addr;
167	} else {
168	        struct sockaddr_in6 *in6  = (struct sockaddr_in6 *) &addr;
169	        struct sockaddr_in6 *ret6 = (struct sockaddr_in6 *) retaddr;
170		memcpy(&ret6->sin6_addr, &in6->sin6_addr,
171		       sizeof(in6->sin6_addr));
172	}
173
174	return 0;
175}
176
/* Find the nodeinfo for a nodeid in the idr; when 'alloc' is a non-zero
   gfp mask, create and insert one if missing (double-checked under the
   write lock).  Returns NULL on lookup miss (alloc==0) or allocation/idr
   failure.  Also tracks max_nodeid for assoc2nodeinfo()'s linear scan. */
177static struct nodeinfo *nodeid2nodeinfo(int nodeid, int alloc)
178{
179	struct nodeinfo *ni;
180	int r;
181	int n;
182
183	down_read(&nodeinfo_lock);
184	ni = idr_find(&nodeinfo_idr, nodeid);
185	up_read(&nodeinfo_lock);
186
187	if (!ni && alloc) {
188		down_write(&nodeinfo_lock);
189
		/* re-check: another task may have inserted it meanwhile */
190		ni = idr_find(&nodeinfo_idr, nodeid);
191		if (ni)
192			goto out_up;
193
194		r = idr_pre_get(&nodeinfo_idr, alloc);
195		if (!r)
196			goto out_up;
197
198		ni = kmalloc(sizeof(struct nodeinfo), alloc);
199		if (!ni)
200			goto out_up;
201
202		r = idr_get_new_above(&nodeinfo_idr, ni, nodeid, &n);
203		if (r) {
204			kfree(ni);
205			ni = NULL;
206			goto out_up;
207		}
		/* idr may hand back a higher id than requested; insist on
		   an exact nodeid slot */
208		if (n != nodeid) {
209			idr_remove(&nodeinfo_idr, n);
210			kfree(ni);
211			ni = NULL;
212			goto out_up;
213		}
214		memset(ni, 0, sizeof(struct nodeinfo));
215		spin_lock_init(&ni->lock);
216		INIT_LIST_HEAD(&ni->writequeue);
217		spin_lock_init(&ni->writequeue_lock);
218		ni->nodeid = nodeid;
219
220		if (nodeid > max_nodeid)
221			max_nodeid = nodeid;
222	out_up:
223		up_write(&nodeinfo_lock);
224	}
225
226	return ni;
227}
228
229/* Don't call this too often... linear scan over all known nodeids to
230   find the nodeinfo holding a given SCTP association id */
230static struct nodeinfo *assoc2nodeinfo(sctp_assoc_t assoc)
231{
232	int i;
233	struct nodeinfo *ni;
234
235	for (i=1; i<=max_nodeid; i++) {
236		ni = nodeid2nodeinfo(i, 0);
237		if (ni && ni->assoc_id == assoc)
238			return ni;
239	}
240	return NULL;
241}
242
243/* Data or notification available on socket: sk_data_ready callback;
244   count a pending request and wake dlm_recvd unless a read is already
245   pending */
244static void lowcomms_data_ready(struct sock *sk, int count_unused)
245{
246	atomic_inc(&sctp_con.waiting_requests);
247	if (test_and_set_bit(CF_READ_PENDING, &sctp_con.flags))
248		return;
249
250	wake_up_interruptible(&lowcomms_recv_wait);
251}
252
253
254/* Add the port number to an IP6 or 4 sockaddr and return the address length.
255   Also pad out the struct with zeros to make comparisons meaningful.
   A zero port means "use the port of our first local address". */
256
257static void make_sockaddr(struct sockaddr_storage *saddr, uint16_t port,
258			  int *addr_len)
259{
260        struct sockaddr_in *local4_addr;
261	struct sockaddr_in6 *local6_addr;
262
263        if (!local_count)
264                return;
265
266        if (!port) {
267                if (local_addr[0]->ss_family == AF_INET) {
268                        local4_addr = (struct sockaddr_in *)local_addr[0];
269                        port = be16_to_cpu(local4_addr->sin_port);
270                } else {
271                        local6_addr = (struct sockaddr_in6 *)local_addr[0];
272                        port = be16_to_cpu(local6_addr->sin6_port);
273                }
274        }
275
276        saddr->ss_family = local_addr[0]->ss_family;
277        if (local_addr[0]->ss_family == AF_INET) {
278                struct sockaddr_in *in4_addr = (struct sockaddr_in *)saddr;
279                in4_addr->sin_port = cpu_to_be16(port);
280                memset(&in4_addr->sin_zero, 0, sizeof(in4_addr->sin_zero));
		/* zero the tail of the storage beyond the v4 struct */
281                memset(in4_addr+1, 0, sizeof(struct sockaddr_storage) -
282                                      sizeof(struct sockaddr_in));
283                *addr_len = sizeof(struct sockaddr_in);
284        } else {
285                struct sockaddr_in6 *in6_addr = (struct sockaddr_in6 *)saddr;
286                in6_addr->sin6_port = cpu_to_be16(port);
287                memset(in6_addr+1, 0, sizeof(struct sockaddr_storage) -
288                                      sizeof(struct sockaddr_in6));
289                *addr_len = sizeof(struct sockaddr_in6);
290        }
291}
292
293/* Close the connection and tidy up: release the socket and free the
294   receive page, if either exists */
294static void close_connection(void)
295{
296	if (sctp_con.sock) {
297		sock_release(sctp_con.sock);
298		sctp_con.sock = NULL;
299	}
300
301	if (sctp_con.rx_page) {
302		__free_page(sctp_con.rx_page);
303		sctp_con.rx_page = NULL;
304	}
305}
306
307/* We only send shutdown messages to nodes that are not part of the cluster:
308   issue an SCTP graceful shutdown (MSG_EOF) on the given association via
309   a zero-length sendmsg carrying SCTP_SNDRCV ancillary data */
308static void send_shutdown(sctp_assoc_t associd)
309{
310	static char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
311	struct msghdr outmessage;
312	struct cmsghdr *cmsg;
313	struct sctp_sndrcvinfo *sinfo;
314	int ret;
315
316	outmessage.msg_name = NULL;
317	outmessage.msg_namelen = 0;
318	outmessage.msg_control = outcmsg;
319	outmessage.msg_controllen = sizeof(outcmsg);
320	outmessage.msg_flags = MSG_EOR;
321
322	cmsg = CMSG_FIRSTHDR(&outmessage);
323	cmsg->cmsg_level = IPPROTO_SCTP;
324	cmsg->cmsg_type = SCTP_SNDRCV;
325	cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
326	outmessage.msg_controllen = cmsg->cmsg_len;
327	sinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg);
328	memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo));
329
	/* MSG_EOF with no data requests association shutdown */
330	sinfo->sinfo_flags |= MSG_EOF;
331	sinfo->sinfo_assoc_id = associd;
332
333	ret = kernel_sendmsg(sctp_con.sock, &outmessage, NULL, 0, 0);
334
335	if (ret != 0)
336		log_print("send EOF to node failed: %d", ret);
337}
338
339
340/* INIT failed but we don't know which node...
341   restart INIT on all pending nodes: clear each pending assoc and
   requeue the node for writing so the send thread retries */
342static void init_failed(void)
343{
344	int i;
345	struct nodeinfo *ni;
346
347	for (i=1; i<=max_nodeid; i++) {
348		ni = nodeid2nodeinfo(i, 0);
349		if (!ni)
350			continue;
351
352		if (test_and_clear_bit(NI_INIT_PENDING, &ni->flags)) {
353			ni->assoc_id = 0;
354			if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
355				spin_lock_bh(&write_nodes_lock);
356				list_add_tail(&ni->write_list, &write_nodes);
357				spin_unlock_bh(&write_nodes_lock);
358			}
359		}
360	}
361	wake_up_process(send_task);
}
363
364/* Something happened to an association: handle SCTP_ASSOC_CHANGE
365   notifications -- map new/restarted associations to cluster nodes,
   clear state on lost associations, and retry failed INITs */
365static void process_sctp_notification(struct msghdr *msg, char *buf)
366{
367	union sctp_notification *sn = (union sctp_notification *)buf;
368
369	if (sn->sn_header.sn_type == SCTP_ASSOC_CHANGE) {
370		switch (sn->sn_assoc_change.sac_state) {
371
372		case SCTP_COMM_UP:
373		case SCTP_RESTART:
374		{
375			/* Check that the new node is in the lockspace */
376			struct sctp_prim prim;
377			mm_segment_t fs;
378			int nodeid;
379			int prim_len, ret;
380			int addr_len;
381			struct nodeinfo *ni;
382
383			/* This seems to happen when we received a connection
384			 * too early... or something... anyway, it happens but
385			 * we always seem to get a real message too, see
386			 * receive_from_sock */
387
388			if ((int)sn->sn_assoc_change.sac_assoc_id <= 0) {
389				log_print("COMM_UP for invalid assoc ID %d",
390					 (int)sn->sn_assoc_change.sac_assoc_id);
391				init_failed();
392				return;
393			}
394			memset(&prim, 0, sizeof(struct sctp_prim));
395			prim_len = sizeof(struct sctp_prim);
396			prim.ssp_assoc_id = sn->sn_assoc_change.sac_assoc_id;
397
			/* getsockopt wants a user pointer; temporarily lift
			   the address-space limit for this kernel buffer */
398			fs = get_fs();
399			set_fs(get_ds());
400			ret = sctp_con.sock->ops->getsockopt(sctp_con.sock,
401						IPPROTO_SCTP, SCTP_PRIMARY_ADDR,
402						(char*)&prim, &prim_len);
403			set_fs(fs);
404			if (ret < 0) {
405				struct nodeinfo *ni;
406
407				log_print("getsockopt/sctp_primary_addr on "
408					  "new assoc %d failed : %d",
409				    (int)sn->sn_assoc_change.sac_assoc_id, ret);
410
411				/* Retry INIT later */
412				ni = assoc2nodeinfo(sn->sn_assoc_change.sac_assoc_id);
413				if (ni)
414					clear_bit(NI_INIT_PENDING, &ni->flags);
415				return;
416			}
417			make_sockaddr(&prim.ssp_addr, 0, &addr_len);
418			if (dlm_addr_to_nodeid(&prim.ssp_addr, &nodeid)) {
419				log_print("reject connect from unknown addr");
420				send_shutdown(prim.ssp_assoc_id);
421				return;
422			}
423
424			ni = nodeid2nodeinfo(nodeid, GFP_KERNEL);
425			if (!ni)
426				return;
427
428			/* Save the assoc ID */
429			spin_lock(&ni->lock);
430			ni->assoc_id = sn->sn_assoc_change.sac_assoc_id;
431			spin_unlock(&ni->lock);
432
433			log_print("got new/restarted association %d nodeid %d",
434			       (int)sn->sn_assoc_change.sac_assoc_id, nodeid);
435
436			/* Send any pending writes */
437			clear_bit(NI_INIT_PENDING, &ni->flags);
438			if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
439				spin_lock_bh(&write_nodes_lock);
440				list_add_tail(&ni->write_list, &write_nodes);
441				spin_unlock_bh(&write_nodes_lock);
442			}
443			wake_up_process(send_task);
444		}
445		break;
446
447		case SCTP_COMM_LOST:
448		case SCTP_SHUTDOWN_COMP:
449		{
450			struct nodeinfo *ni;
451
			/* association gone: forget it so writes re-INIT */
452			ni = assoc2nodeinfo(sn->sn_assoc_change.sac_assoc_id);
453			if (ni) {
454				spin_lock(&ni->lock);
455				ni->assoc_id = 0;
456				spin_unlock(&ni->lock);
457			}
458		}
459		break;
460
461		/* We don't know which INIT failed, so clear the PENDING flags
462		 * on them all.  if assoc_id is zero then it will then try
463		 * again */
464
465		case SCTP_CANT_STR_ASSOC:
466		{
467			log_print("Can't start SCTP association - retrying");
468			init_failed();
469		}
470		break;
471
472		default:
473			log_print("unexpected SCTP assoc change id=%d state=%d",
474				  (int)sn->sn_assoc_change.sac_assoc_id,
475				  sn->sn_assoc_change.sac_state);
476		}
477	}
478}
479
480/* Data received from remote end */
481static int receive_from_sock(void)
482{
483 int ret = 0;
484 struct msghdr msg;
485 struct kvec iov[2];
486 unsigned len;
487 int r;
488 struct sctp_sndrcvinfo *sinfo;
489 struct cmsghdr *cmsg;
490 struct nodeinfo *ni;
491
492 /* These two are marginally too big for stack allocation, but this
493 * function is (currently) only called by dlm_recvd so static should be
494 * OK.
495 */
496 static struct sockaddr_storage msgname;
497 static char incmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
498
499 if (sctp_con.sock == NULL)
500 goto out;
501
502 if (sctp_con.rx_page == NULL) {
503 /*
504 * This doesn't need to be atomic, but I think it should
505 * improve performance if it is.
506 */
507 sctp_con.rx_page = alloc_page(GFP_ATOMIC);
508 if (sctp_con.rx_page == NULL)
509 goto out_resched;
510 CBUF_INIT(&sctp_con.cb, PAGE_CACHE_SIZE);
511 }
512
513 memset(&incmsg, 0, sizeof(incmsg));
514 memset(&msgname, 0, sizeof(msgname));
515
516 memset(incmsg, 0, sizeof(incmsg));
517 msg.msg_name = &msgname;
518 msg.msg_namelen = sizeof(msgname);
519 msg.msg_flags = 0;
520 msg.msg_control = incmsg;
521 msg.msg_controllen = sizeof(incmsg);
522
523 /* I don't see why this circular buffer stuff is necessary for SCTP
524 * which is a packet-based protocol, but the whole thing breaks under
525 * load without it! The overhead is minimal (and is in the TCP lowcomms
526 * anyway, of course) so I'll leave it in until I can figure out what's
527 * really happening.
528 */
529
530 /*
531 * iov[0] is the bit of the circular buffer between the current end
532 * point (cb.base + cb.len) and the end of the buffer.
533 */
534 iov[0].iov_len = sctp_con.cb.base - CBUF_DATA(&sctp_con.cb);
535 iov[0].iov_base = page_address(sctp_con.rx_page) +
536 CBUF_DATA(&sctp_con.cb);
537 iov[1].iov_len = 0;
538
539 /*
540 * iov[1] is the bit of the circular buffer between the start of the
541 * buffer and the start of the currently used section (cb.base)
542 */
543 if (CBUF_DATA(&sctp_con.cb) >= sctp_con.cb.base) {
544 iov[0].iov_len = PAGE_CACHE_SIZE - CBUF_DATA(&sctp_con.cb);
545 iov[1].iov_len = sctp_con.cb.base;
546 iov[1].iov_base = page_address(sctp_con.rx_page);
547 msg.msg_iovlen = 2;
548 }
549 len = iov[0].iov_len + iov[1].iov_len;
550
551 r = ret = kernel_recvmsg(sctp_con.sock, &msg, iov, 1, len,
552 MSG_NOSIGNAL | MSG_DONTWAIT);
553 if (ret <= 0)
554 goto out_close;
555
556 msg.msg_control = incmsg;
557 msg.msg_controllen = sizeof(incmsg);
558 cmsg = CMSG_FIRSTHDR(&msg);
559 sinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg);
560
561 if (msg.msg_flags & MSG_NOTIFICATION) {
562 process_sctp_notification(&msg, page_address(sctp_con.rx_page));
563 return 0;
564 }
565
566 /* Is this a new association ? */
567 ni = nodeid2nodeinfo(le32_to_cpu(sinfo->sinfo_ppid), GFP_KERNEL);
568 if (ni) {
569 ni->assoc_id = sinfo->sinfo_assoc_id;
570 if (test_and_clear_bit(NI_INIT_PENDING, &ni->flags)) {
571
572 if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
573 spin_lock_bh(&write_nodes_lock);
574 list_add_tail(&ni->write_list, &write_nodes);
575 spin_unlock_bh(&write_nodes_lock);
576 }
577 wake_up_process(send_task);
578 }
579 }
580
581 /* INIT sends a message with length of 1 - ignore it */
582 if (r == 1)
583 return 0;
584
585 CBUF_ADD(&sctp_con.cb, ret);
586 ret = dlm_process_incoming_buffer(cpu_to_le32(sinfo->sinfo_ppid),
587 page_address(sctp_con.rx_page),
588 sctp_con.cb.base, sctp_con.cb.len,
589 PAGE_CACHE_SIZE);
590 if (ret < 0)
591 goto out_close;
592 CBUF_EAT(&sctp_con.cb, ret);
593
594 out:
595 ret = 0;
596 goto out_ret;
597
598 out_resched:
599 lowcomms_data_ready(sctp_con.sock->sk, 0);
600 ret = 0;
601 schedule();
602 goto out_ret;
603
604 out_close:
605 if (ret != -EAGAIN)
606 log_print("error reading from sctp socket: %d", ret);
607 out_ret:
608 return ret;
609}
610
611/* Bind to an IP address. SCTP allows multiple address so it can do multi-homing */
612static int add_bind_addr(struct sockaddr_storage *addr, int addr_len, int num)
613{
614 mm_segment_t fs;
615 int result = 0;
616
617 fs = get_fs();
618 set_fs(get_ds());
619 if (num == 1)
620 result = sctp_con.sock->ops->bind(sctp_con.sock,
621 (struct sockaddr *) addr, addr_len);
622 else
623 result = sctp_con.sock->ops->setsockopt(sctp_con.sock, SOL_SCTP,
624 SCTP_SOCKOPT_BINDX_ADD, (char *)addr, addr_len);
625 set_fs(fs);
626
627 if (result < 0)
628 log_print("Can't bind to port %d addr number %d",
629 dlm_config.tcp_port, num);
630
631 return result;
632}
633
634static void init_local(void)
635{
636 struct sockaddr_storage sas, *addr;
637 int i;
638
639 local_nodeid = dlm_our_nodeid();
640
641 for (i = 0; i < DLM_MAX_ADDR_COUNT - 1; i++) {
642 if (dlm_our_addr(&sas, i))
643 break;
644
645 addr = kmalloc(sizeof(*addr), GFP_KERNEL);
646 if (!addr)
647 break;
648 memcpy(addr, &sas, sizeof(*addr));
649 local_addr[local_count++] = addr;
650 }
651}
652
/* Initialise SCTP socket and bind to all interfaces */
static int init_sock(void)
{
	mm_segment_t fs;
	struct socket *sock = NULL;
	struct sockaddr_storage localaddr;
	struct sctp_event_subscribe subscribe;
	int result = -EINVAL, num = 1, i, addr_len;

	/* discover local addresses on first use; with none configured
	   there is nothing to bind to */
	if (!local_count) {
		init_local();
		if (!local_count) {
			log_print("no local IP address has been set");
			goto out;
		}
	}

	result = sock_create_kern(local_addr[0]->ss_family, SOCK_SEQPACKET,
				  IPPROTO_SCTP, &sock);
	if (result < 0) {
		log_print("Can't create comms socket, check SCTP is loaded");
		goto out;
	}

	/* Listen for events */
	memset(&subscribe, 0, sizeof(subscribe));
	subscribe.sctp_data_io_event = 1;
	subscribe.sctp_association_event = 1;
	subscribe.sctp_send_failure_event = 1;
	subscribe.sctp_shutdown_event = 1;
	subscribe.sctp_partial_delivery_event = 1;

	/* setsockopt expects a user buffer; lift the address limit so a
	   kernel pointer is accepted, then restore it */
	fs = get_fs();
	set_fs(get_ds());
	result = sock->ops->setsockopt(sock, SOL_SCTP, SCTP_EVENTS,
				       (char *)&subscribe, sizeof(subscribe));
	set_fs(fs);

	if (result < 0) {
		log_print("Failed to set SCTP_EVENTS on socket: result=%d",
			  result);
		goto create_delsock;
	}

	/* Init con struct */
	sock->sk->sk_user_data = &sctp_con;
	sctp_con.sock = sock;
	sctp_con.sock->sk->sk_data_ready = lowcomms_data_ready;

	/* Bind to all interfaces. */
	for (i = 0; i < local_count; i++) {
		memcpy(&localaddr, local_addr[i], sizeof(localaddr));
		make_sockaddr(&localaddr, dlm_config.tcp_port, &addr_len);

		/* num == 1 does the primary bind; later addresses are
		   added with SCTP_SOCKOPT_BINDX_ADD (multi-homing) */
		result = add_bind_addr(&localaddr, addr_len, num);
		if (result)
			goto create_delsock;
		++num;
	}

	result = sock->ops->listen(sock, 5);
	if (result < 0) {
		log_print("Can't set socket listening");
		goto create_delsock;
	}

	return 0;

 create_delsock:
	sock_release(sock);
	sctp_con.sock = NULL;
 out:
	return result;
}
727
728
729static struct writequeue_entry *new_writequeue_entry(int allocation)
730{
731 struct writequeue_entry *entry;
732
733 entry = kmalloc(sizeof(struct writequeue_entry), allocation);
734 if (!entry)
735 return NULL;
736
737 entry->page = alloc_page(allocation);
738 if (!entry->page) {
739 kfree(entry);
740 return NULL;
741 }
742
743 entry->offset = 0;
744 entry->len = 0;
745 entry->end = 0;
746 entry->users = 0;
747
748 return entry;
749}
750
751void *dlm_lowcomms_get_buffer(int nodeid, int len, int allocation, char **ppc)
752{
753 struct writequeue_entry *e;
754 int offset = 0;
755 int users = 0;
756 struct nodeinfo *ni;
757
758 if (!atomic_read(&accepting))
759 return NULL;
760
761 ni = nodeid2nodeinfo(nodeid, allocation);
762 if (!ni)
763 return NULL;
764
765 spin_lock(&ni->writequeue_lock);
766 e = list_entry(ni->writequeue.prev, struct writequeue_entry, list);
767 if (((struct list_head *) e == &ni->writequeue) ||
768 (PAGE_CACHE_SIZE - e->end < len)) {
769 e = NULL;
770 } else {
771 offset = e->end;
772 e->end += len;
773 users = e->users++;
774 }
775 spin_unlock(&ni->writequeue_lock);
776
777 if (e) {
778 got_one:
779 if (users == 0)
780 kmap(e->page);
781 *ppc = page_address(e->page) + offset;
782 return e;
783 }
784
785 e = new_writequeue_entry(allocation);
786 if (e) {
787 spin_lock(&ni->writequeue_lock);
788 offset = e->end;
789 e->end += len;
790 e->ni = ni;
791 users = e->users++;
792 list_add_tail(&e->list, &ni->writequeue);
793 spin_unlock(&ni->writequeue_lock);
794 goto got_one;
795 }
796 return NULL;
797}
798
799void dlm_lowcomms_commit_buffer(void *arg)
800{
801 struct writequeue_entry *e = (struct writequeue_entry *) arg;
802 int users;
803 struct nodeinfo *ni = e->ni;
804
805 if (!atomic_read(&accepting))
806 return;
807
808 spin_lock(&ni->writequeue_lock);
809 users = --e->users;
810 if (users)
811 goto out;
812 e->len = e->end - e->offset;
813 kunmap(e->page);
814 spin_unlock(&ni->writequeue_lock);
815
816 if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
817 spin_lock_bh(&write_nodes_lock);
818 list_add_tail(&ni->write_list, &write_nodes);
819 spin_unlock_bh(&write_nodes_lock);
820 wake_up_process(send_task);
821 }
822 return;
823
824 out:
825 spin_unlock(&ni->writequeue_lock);
826 return;
827}
828
/* Release a write queue entry and its backing page. */
static void free_entry(struct writequeue_entry *e)
{
	__free_page(e->page);
	kfree(e);
}
834
/* Initiate an SCTP association. In theory we could just use sendmsg() on
   the first IP address and it should work, but this allows us to set up the
   association before sending any valuable data that we can't afford to lose.
   It also keeps the send path clean as it can now always use the association ID */
static void initiate_association(int nodeid)
{
	struct sockaddr_storage rem_addr;
	static char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
	struct msghdr outmessage;
	struct cmsghdr *cmsg;
	struct sctp_sndrcvinfo *sinfo;
	int ret;
	int addrlen;
	char buf[1];
	struct kvec iov[1];
	struct nodeinfo *ni;

	log_print("Initiating association with node %d", nodeid);

	ni = nodeid2nodeinfo(nodeid, GFP_KERNEL);
	if (!ni)
		return;

	/* look up the peer's address in the configured node list */
	if (nodeid_to_addr(nodeid, (struct sockaddr *)&rem_addr)) {
		log_print("no address for nodeid %d", nodeid);
		return;
	}

	make_sockaddr(&rem_addr, dlm_config.tcp_port, &addrlen);

	outmessage.msg_name = &rem_addr;
	outmessage.msg_namelen = addrlen;
	outmessage.msg_control = outcmsg;
	outmessage.msg_controllen = sizeof(outcmsg);
	outmessage.msg_flags = MSG_EOR;

	iov[0].iov_base = buf;
	iov[0].iov_len = 1;

	/* Real INIT messages seem to cause trouble. Just send a 1 byte message
	   we can afford to lose */
	cmsg = CMSG_FIRSTHDR(&outmessage);
	cmsg->cmsg_level = IPPROTO_SCTP;
	cmsg->cmsg_type = SCTP_SNDRCV;
	cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
	sinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg);
	memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo));
	/* ppid carries our nodeid (little-endian) so the peer can
	   identify who the association belongs to */
	sinfo->sinfo_ppid = cpu_to_le32(local_nodeid);

	outmessage.msg_controllen = cmsg->cmsg_len;
	ret = kernel_sendmsg(sctp_con.sock, &outmessage, iov, 1, 1);
	if (ret < 0) {
		log_print("send INIT to node failed: %d", ret);
		/* Try again later */
		clear_bit(NI_INIT_PENDING, &ni->flags);
	}
}
892
/* Send a message: drain nodeinfo's write queue onto the SCTP socket,
   addressing by association id.  Returns 0 or the last send error;
   -EAGAIN is signalled via sctp_con.eagain_flag instead. */
static int send_to_sock(struct nodeinfo *ni)
{
	int ret = 0;
	struct writequeue_entry *e;
	int len, offset;
	struct msghdr outmsg;
	static char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
	struct cmsghdr *cmsg;
	struct sctp_sndrcvinfo *sinfo;
	struct kvec iov;

	/* See if we need to init an association before we start
	   sending precious messages */
	spin_lock(&ni->lock);
	if (!ni->assoc_id && !test_and_set_bit(NI_INIT_PENDING, &ni->flags)) {
		spin_unlock(&ni->lock);
		initiate_association(ni->nodeid);
		return 0;
	}
	spin_unlock(&ni->lock);

	outmsg.msg_name = NULL; /* We use assoc_id */
	outmsg.msg_namelen = 0;
	outmsg.msg_control = outcmsg;
	outmsg.msg_controllen = sizeof(outcmsg);
	outmsg.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL | MSG_EOR;

	/* tag outgoing data with our nodeid in ppid, mirroring
	   initiate_association() */
	cmsg = CMSG_FIRSTHDR(&outmsg);
	cmsg->cmsg_level = IPPROTO_SCTP;
	cmsg->cmsg_type = SCTP_SNDRCV;
	cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
	sinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg);
	memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo));
	sinfo->sinfo_ppid = cpu_to_le32(local_nodeid);
	sinfo->sinfo_assoc_id = ni->assoc_id;
	outmsg.msg_controllen = cmsg->cmsg_len;

	spin_lock(&ni->writequeue_lock);
	for (;;) {
		if (list_empty(&ni->writequeue))
			break;
		e = list_entry(ni->writequeue.next, struct writequeue_entry,
			       list);
		/* NOTE(review): this kmap has no matching kunmap on this
		   path (dlm_lowcomms_commit_buffer kunmaps) -- confirm the
		   nested mapping is intentional */
		kmap(e->page);
		len = e->len;
		offset = e->offset;
		BUG_ON(len == 0 && e->users == 0);
		spin_unlock(&ni->writequeue_lock);

		ret = 0;
		if (len) {
			iov.iov_base = page_address(e->page)+offset;
			iov.iov_len = len;

			ret = kernel_sendmsg(sctp_con.sock, &outmsg, &iov, 1,
					     len);
			if (ret == -EAGAIN) {
				/* dlm_sendd sees this flag and requeues
				   every node once the socket has space */
				sctp_con.eagain_flag = 1;
				goto out;
			} else if (ret < 0)
				goto send_error;
		} else {
			/* Don't starve people filling buffers */
			schedule();
		}

		spin_lock(&ni->writequeue_lock);
		/* a partial send leaves the remainder queued */
		e->offset += ret;
		e->len -= ret;

		if (e->len == 0 && e->users == 0) {
			list_del(&e->list);
			free_entry(e);
			continue;
		}
	}
	spin_unlock(&ni->writequeue_lock);
 out:
	return ret;

 send_error:
	log_print("Error sending to node %d %d", ni->nodeid, ret);
	spin_lock(&ni->lock);
	/* drop the dead association and restart it, unless an INIT is
	   already outstanding */
	if (!test_and_set_bit(NI_INIT_PENDING, &ni->flags)) {
		ni->assoc_id = 0;
		spin_unlock(&ni->lock);
		initiate_association(ni->nodeid);
	} else
		spin_unlock(&ni->lock);

	return ret;
}
986
987/* Try to send any messages that are pending */
988static void process_output_queue(void)
989{
990 struct list_head *list;
991 struct list_head *temp;
992
993 spin_lock_bh(&write_nodes_lock);
994 list_for_each_safe(list, temp, &write_nodes) {
995 struct nodeinfo *ni =
996 list_entry(list, struct nodeinfo, write_list);
997 clear_bit(NI_WRITE_PENDING, &ni->flags);
998 list_del(&ni->write_list);
999
1000 spin_unlock_bh(&write_nodes_lock);
1001
1002 send_to_sock(ni);
1003 spin_lock_bh(&write_nodes_lock);
1004 }
1005 spin_unlock_bh(&write_nodes_lock);
1006}
1007
1008/* Called after we've had -EAGAIN and been woken up */
1009static void refill_write_queue(void)
1010{
1011 int i;
1012
1013 for (i=1; i<=max_nodeid; i++) {
1014 struct nodeinfo *ni = nodeid2nodeinfo(i, 0);
1015
1016 if (ni) {
1017 if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
1018 spin_lock_bh(&write_nodes_lock);
1019 list_add_tail(&ni->write_list, &write_nodes);
1020 spin_unlock_bh(&write_nodes_lock);
1021 }
1022 }
1023 }
1024}
1025
1026static void clean_one_writequeue(struct nodeinfo *ni)
1027{
1028 struct list_head *list;
1029 struct list_head *temp;
1030
1031 spin_lock(&ni->writequeue_lock);
1032 list_for_each_safe(list, temp, &ni->writequeue) {
1033 struct writequeue_entry *e =
1034 list_entry(list, struct writequeue_entry, list);
1035 list_del(&e->list);
1036 free_entry(e);
1037 }
1038 spin_unlock(&ni->writequeue_lock);
1039}
1040
1041static void clean_writequeues(void)
1042{
1043 int i;
1044
1045 for (i=1; i<=max_nodeid; i++) {
1046 struct nodeinfo *ni = nodeid2nodeinfo(i, 0);
1047 if (ni)
1048 clean_one_writequeue(ni);
1049 }
1050}
1051
1052
1053static void dealloc_nodeinfo(void)
1054{
1055 int i;
1056
1057 for (i=1; i<=max_nodeid; i++) {
1058 struct nodeinfo *ni = nodeid2nodeinfo(i, 0);
1059 if (ni) {
1060 idr_remove(&nodeinfo_idr, i);
1061 kfree(ni);
1062 }
1063 }
1064}
1065
1066static int write_list_empty(void)
1067{
1068 int status;
1069
1070 spin_lock_bh(&write_nodes_lock);
1071 status = list_empty(&write_nodes);
1072 spin_unlock_bh(&write_nodes_lock);
1073
1074 return status;
1075}
1076
/* Receive thread: sleeps until lowcomms_data_ready() flags work, then
   drains the socket. */
static int dlm_recvd(void *data)
{
	DECLARE_WAITQUEUE(wait, current);

	while (!kthread_should_stop()) {
		int count = 0;

		/* register on the wait queue before testing the flag so a
		   wakeup between test and schedule() is not lost */
		set_current_state(TASK_INTERRUPTIBLE);
		add_wait_queue(&lowcomms_recv_wait, &wait);
		if (!test_bit(CF_READ_PENDING, &sctp_con.flags))
			schedule();
		remove_wait_queue(&lowcomms_recv_wait, &wait);
		set_current_state(TASK_RUNNING);

		if (test_and_clear_bit(CF_READ_PENDING, &sctp_con.flags)) {
			int ret;

			/* keep receiving until the socket runs dry */
			do {
				ret = receive_from_sock();

				/* Don't starve out everyone else */
				if (++count >= MAX_RX_MSG_COUNT) {
					schedule();
					count = 0;
				}
			} while (!kthread_should_stop() && ret >=0);
		}
		schedule();
	}

	return 0;
}
1109
/* Send thread: wakes when nodes are queued on write_nodes (or the
   socket regains space) and pushes their write queues out. */
static int dlm_sendd(void *data)
{
	DECLARE_WAITQUEUE(wait, current);

	/* sit on the socket's sleep queue so we are also woken when
	   write space becomes available again after -EAGAIN */
	add_wait_queue(sctp_con.sock->sk->sk_sleep, &wait);

	while (!kthread_should_stop()) {
		set_current_state(TASK_INTERRUPTIBLE);
		if (write_list_empty())
			schedule();
		set_current_state(TASK_RUNNING);

		if (sctp_con.eagain_flag) {
			/* a previous send hit -EAGAIN: requeue all nodes */
			sctp_con.eagain_flag = 0;
			refill_write_queue();
		}
		process_output_queue();
	}

	remove_wait_queue(sctp_con.sock->sk->sk_sleep, &wait);

	return 0;
}
1133
/* Stop both comms threads; kthread_stop() blocks until each thread
   has actually exited. */
static void daemons_stop(void)
{
	kthread_stop(recv_task);
	kthread_stop(send_task);
}
1139
1140static int daemons_start(void)
1141{
1142 struct task_struct *p;
1143 int error;
1144
1145 p = kthread_run(dlm_recvd, NULL, "dlm_recvd");
1146 error = IS_ERR(p);
1147 if (error) {
1148 log_print("can't start dlm_recvd %d", error);
1149 return error;
1150 }
1151 recv_task = p;
1152
1153 p = kthread_run(dlm_sendd, NULL, "dlm_sendd");
1154 error = IS_ERR(p);
1155 if (error) {
1156 log_print("can't start dlm_sendd %d", error);
1157 kthread_stop(recv_task);
1158 return error;
1159 }
1160 send_task = p;
1161
1162 return 0;
1163}
1164
/*
 * This is quite likely to sleep...
 *
 * Bring lowcomms up: init the shared write list, create and bind the
 * SCTP socket, start the recv/send threads, then open the gates for
 * dlm_lowcomms_get_buffer()/commit_buffer().
 */
int dlm_lowcomms_start(void)
{
	int error;

	spin_lock_init(&write_nodes_lock);
	INIT_LIST_HEAD(&write_nodes);
	init_rwsem(&nodeinfo_lock);

	error = init_sock();
	if (error)
		goto fail_sock;
	error = daemons_start();
	if (error)
		goto fail_sock;
	/* from here on, buffer requests are accepted */
	atomic_set(&accepting, 1);
	return 0;

 fail_sock:
	close_connection();
	return error;
}
1189
/* Set all the activity flags to prevent any socket activity. */

void dlm_lowcomms_stop(void)
{
	/* refuse new buffers first, then tear down in reverse start order */
	atomic_set(&accepting, 0);
	/* NOTE(review): 0x7 presumably sets all CF_* activity bits at once
	   so the daemons see nothing to do -- confirm against the flag
	   definitions */
	sctp_con.flags = 0x7;
	daemons_stop();
	clean_writequeues();
	close_connection();
	dealloc_nodeinfo();
	max_nodeid = 0;
}
1202
/* One-time module init: set up the wait queue dlm_recvd sleeps on.
   Always succeeds. */
int dlm_lowcomms_init(void)
{
	init_waitqueue_head(&lowcomms_recv_wait);
	return 0;
}
1208
1209void dlm_lowcomms_exit(void)
1210{
1211 int i;
1212
1213 for (i = 0; i < local_count; i++)
1214 kfree(local_addr[i]);
1215 local_count = 0;
1216 local_nodeid = 0;
1217}
1218
diff --git a/fs/dlm/lowcomms.h b/fs/dlm/lowcomms.h
new file mode 100644
index 000000000000..3af8035ff12f
--- /dev/null
+++ b/fs/dlm/lowcomms.h
@@ -0,0 +1,25 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#ifndef __LOWCOMMS_DOT_H__
15#define __LOWCOMMS_DOT_H__
16
17int dlm_lowcomms_init(void);
18void dlm_lowcomms_exit(void);
19int dlm_lowcomms_start(void);
20void dlm_lowcomms_stop(void);
21void *dlm_lowcomms_get_buffer(int nodeid, int len, int allocation, char **ppc);
22void dlm_lowcomms_commit_buffer(void *mh);
23
24#endif /* __LOWCOMMS_DOT_H__ */
25
diff --git a/fs/dlm/lvb_table.h b/fs/dlm/lvb_table.h
new file mode 100644
index 000000000000..cc3e92f3feef
--- /dev/null
+++ b/fs/dlm/lvb_table.h
@@ -0,0 +1,18 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13#ifndef __LVB_TABLE_DOT_H__
14#define __LVB_TABLE_DOT_H__
15
16extern const int dlm_lvb_operations[8][8];
17
18#endif
diff --git a/fs/dlm/main.c b/fs/dlm/main.c
new file mode 100644
index 000000000000..81bf4cb22033
--- /dev/null
+++ b/fs/dlm/main.c
@@ -0,0 +1,89 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#include "dlm_internal.h"
15#include "lockspace.h"
16#include "lock.h"
17#include "memory.h"
18#include "lowcomms.h"
19#include "config.h"
20
21#ifdef CONFIG_DLM_DEBUG
22int dlm_register_debugfs(void);
23void dlm_unregister_debugfs(void);
24#else
25static inline int dlm_register_debugfs(void) { return 0; }
26static inline void dlm_unregister_debugfs(void) { }
27#endif
28
/* Module init: bring the DLM subsystems up in dependency order,
   unwinding everything already initialised if any step fails. */
static int __init init_dlm(void)
{
	int error;

	error = dlm_memory_init();
	if (error)
		goto out;

	error = dlm_lockspace_init();
	if (error)
		goto out_mem;

	error = dlm_config_init();
	if (error)
		goto out_lockspace;

	error = dlm_register_debugfs();
	if (error)
		goto out_config;

	error = dlm_lowcomms_init();
	if (error)
		goto out_debug;

	printk("DLM (built %s %s) installed\n", __DATE__, __TIME__);

	return 0;

 out_debug:
	dlm_unregister_debugfs();
 out_config:
	dlm_config_exit();
 out_lockspace:
	dlm_lockspace_exit();
 out_mem:
	dlm_memory_exit();
 out:
	return error;
}
68
/* Module exit: tear the subsystems down again. */
static void __exit exit_dlm(void)
{
	dlm_lowcomms_exit();
	dlm_config_exit();
	/* NOTE(review): memory caches are destroyed before
	   dlm_lockspace_exit() runs, the reverse of init order --
	   confirm lockspace teardown frees nothing from those caches */
	dlm_memory_exit();
	dlm_lockspace_exit();
	dlm_unregister_debugfs();
}
77
78module_init(init_dlm);
79module_exit(exit_dlm);
80
81MODULE_DESCRIPTION("Distributed Lock Manager");
82MODULE_AUTHOR("Red Hat, Inc.");
83MODULE_LICENSE("GPL");
84
85EXPORT_SYMBOL_GPL(dlm_new_lockspace);
86EXPORT_SYMBOL_GPL(dlm_release_lockspace);
87EXPORT_SYMBOL_GPL(dlm_lock);
88EXPORT_SYMBOL_GPL(dlm_unlock);
89
diff --git a/fs/dlm/member.c b/fs/dlm/member.c
new file mode 100644
index 000000000000..926cd0cb6bff
--- /dev/null
+++ b/fs/dlm/member.c
@@ -0,0 +1,313 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13#include "dlm_internal.h"
14#include "lockspace.h"
15#include "member.h"
16#include "recoverd.h"
17#include "recover.h"
18#include "lowcomms.h"
19#include "rcom.h"
20#include "config.h"
21
22/*
23 * Following called by dlm_recoverd thread
24 */
25
26static void add_ordered_member(struct dlm_ls *ls, struct dlm_member *new)
27{
28 struct dlm_member *memb = NULL;
29 struct list_head *tmp;
30 struct list_head *newlist = &new->list;
31 struct list_head *head = &ls->ls_nodes;
32
33 list_for_each(tmp, head) {
34 memb = list_entry(tmp, struct dlm_member, list);
35 if (new->nodeid < memb->nodeid)
36 break;
37 }
38
39 if (!memb)
40 list_add_tail(newlist, head);
41 else {
42 /* FIXME: can use list macro here */
43 newlist->prev = tmp->prev;
44 newlist->next = tmp;
45 tmp->prev->next = newlist;
46 tmp->prev = newlist;
47 }
48}
49
50static int dlm_add_member(struct dlm_ls *ls, int nodeid)
51{
52 struct dlm_member *memb;
53 int w;
54
55 memb = kzalloc(sizeof(struct dlm_member), GFP_KERNEL);
56 if (!memb)
57 return -ENOMEM;
58
59 w = dlm_node_weight(ls->ls_name, nodeid);
60 if (w < 0)
61 return w;
62
63 memb->nodeid = nodeid;
64 memb->weight = w;
65 add_ordered_member(ls, memb);
66 ls->ls_num_nodes++;
67 return 0;
68}
69
/* Move a departed member from ls_nodes onto ls_nodes_gone; entries
   there are reclaimed later by dlm_clear_members_gone(). */
static void dlm_remove_member(struct dlm_ls *ls, struct dlm_member *memb)
{
	list_move(&memb->list, &ls->ls_nodes_gone);
	ls->ls_num_nodes--;
}
75
76static int dlm_is_member(struct dlm_ls *ls, int nodeid)
77{
78 struct dlm_member *memb;
79
80 list_for_each_entry(memb, &ls->ls_nodes, list) {
81 if (memb->nodeid == nodeid)
82 return 1;
83 }
84 return 0;
85}
86
87int dlm_is_removed(struct dlm_ls *ls, int nodeid)
88{
89 struct dlm_member *memb;
90
91 list_for_each_entry(memb, &ls->ls_nodes_gone, list) {
92 if (memb->nodeid == nodeid)
93 return 1;
94 }
95 return 0;
96}
97
98static void clear_memb_list(struct list_head *head)
99{
100 struct dlm_member *memb;
101
102 while (!list_empty(head)) {
103 memb = list_entry(head->next, struct dlm_member, list);
104 list_del(&memb->list);
105 kfree(memb);
106 }
107}
108
/* Drop all current members and reset the member count. */
void dlm_clear_members(struct dlm_ls *ls)
{
	clear_memb_list(&ls->ls_nodes);
	ls->ls_num_nodes = 0;
}
114
/* Free the departed-member records accumulated by dlm_remove_member(). */
void dlm_clear_members_gone(struct dlm_ls *ls)
{
	clear_memb_list(&ls->ls_nodes_gone);
}
119
/* Rebuild ls_node_array: each nodeid appears once per unit of weight,
   so picking a uniformly random slot distributes work proportionally
   to weight. */
static void make_member_array(struct dlm_ls *ls)
{
	struct dlm_member *memb;
	int i, w, x = 0, total = 0, all_zero = 0, *array;

	kfree(ls->ls_node_array);
	ls->ls_node_array = NULL;

	list_for_each_entry(memb, &ls->ls_nodes, list) {
		if (memb->weight)
			total += memb->weight;
	}

	/* all nodes revert to weight of 1 if all have weight 0 */

	if (!total) {
		total = ls->ls_num_nodes;
		all_zero = 1;
	}

	ls->ls_total_weight = total;

	/* on allocation failure ls_node_array is left NULL; users of the
	   array must tolerate that */
	array = kmalloc(sizeof(int) * total, GFP_KERNEL);
	if (!array)
		return;

	list_for_each_entry(memb, &ls->ls_nodes, list) {
		/* weight-0 nodes get no slots unless everyone is 0 */
		if (!all_zero && !memb->weight)
			continue;

		if (all_zero)
			w = 1;
		else
			w = memb->weight;

		DLM_ASSERT(x < total, printk("total %d x %d\n", total, x););

		for (i = 0; i < w; i++)
			array[x++] = memb->nodeid;
	}

	ls->ls_node_array = array;
}
163
164/* send a status request to all members just to establish comms connections */
165
166static void ping_members(struct dlm_ls *ls)
167{
168 struct dlm_member *memb;
169 list_for_each_entry(memb, &ls->ls_nodes, list)
170 dlm_rcom_status(ls, memb->nodeid);
171}
172
173int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
174{
175 struct dlm_member *memb, *safe;
176 int i, error, found, pos = 0, neg = 0, low = -1;
177
178 /* move departed members from ls_nodes to ls_nodes_gone */
179
180 list_for_each_entry_safe(memb, safe, &ls->ls_nodes, list) {
181 found = 0;
182 for (i = 0; i < rv->node_count; i++) {
183 if (memb->nodeid == rv->nodeids[i]) {
184 found = 1;
185 break;
186 }
187 }
188
189 if (!found) {
190 neg++;
191 dlm_remove_member(ls, memb);
192 log_debug(ls, "remove member %d", memb->nodeid);
193 }
194 }
195
196 /* add new members to ls_nodes */
197
198 for (i = 0; i < rv->node_count; i++) {
199 if (dlm_is_member(ls, rv->nodeids[i]))
200 continue;
201 dlm_add_member(ls, rv->nodeids[i]);
202 pos++;
203 log_debug(ls, "add member %d", rv->nodeids[i]);
204 }
205
206 list_for_each_entry(memb, &ls->ls_nodes, list) {
207 if (low == -1 || memb->nodeid < low)
208 low = memb->nodeid;
209 }
210 ls->ls_low_nodeid = low;
211
212 make_member_array(ls);
213 dlm_set_recover_status(ls, DLM_RS_NODES);
214 *neg_out = neg;
215
216 ping_members(ls);
217
218 error = dlm_recover_members_wait(ls);
219 log_debug(ls, "total members %d", ls->ls_num_nodes);
220 return error;
221}
222
223/*
224 * Following called from lockspace.c
225 */
226
/* Quiesce the lockspace ahead of a membership change: cancel any
   in-progress recovery and block lock processing until a following
   dlm_ls_start()/recovery completes. */
int dlm_ls_stop(struct dlm_ls *ls)
{
	int new;

	/*
	 * A stop cancels any recovery that's in progress (see RECOVERY_STOP,
	 * dlm_recovery_stopped()) and prevents any new locks from being
	 * processed (see RUNNING, dlm_locking_stopped()).
	 */

	spin_lock(&ls->ls_recover_lock);
	set_bit(LSFL_RECOVERY_STOP, &ls->ls_flags);
	new = test_and_clear_bit(LSFL_RUNNING, &ls->ls_flags);
	ls->ls_recover_seq++;
	spin_unlock(&ls->ls_recover_lock);

	/*
	 * This in_recovery lock does two things:
	 *
	 * 1) Keeps this function from returning until all threads are out
	 *    of locking routines and locking is truely stopped.
	 * 2) Keeps any new requests from being processed until it's unlocked
	 *    when recovery is complete.
	 */

	/* only the transition from RUNNING takes the write lock; a second
	   stop while already stopped must not deadlock on it */
	if (new)
		down_write(&ls->ls_in_recovery);

	/*
	 * The recoverd suspend/resume makes sure that dlm_recoverd (if
	 * running) has noticed the clearing of RUNNING above and quit
	 * processing the previous recovery. This will be true for all nodes
	 * before any nodes start the new recovery.
	 */

	dlm_recoverd_suspend(ls);
	ls->ls_recover_status = 0;
	dlm_recoverd_resume(ls);
	return 0;
}
267
/* Begin recovery with a new member set: snapshot the configured node
   ids into a dlm_recover and kick dlm_recoverd to process it.  The
   lockspace must already be stopped (dlm_ls_stop). */
int dlm_ls_start(struct dlm_ls *ls)
{
	struct dlm_recover *rv = NULL, *rv_old;
	int *ids = NULL;
	int error, count;

	rv = kzalloc(sizeof(struct dlm_recover), GFP_KERNEL);
	if (!rv)
		return -ENOMEM;

	/* dlm_nodeid_list allocates ids; on the success path ownership
	   transfers to rv and is released when rv is superseded */
	error = count = dlm_nodeid_list(ls->ls_name, &ids);
	if (error <= 0)
		goto fail;

	spin_lock(&ls->ls_recover_lock);

	/* the lockspace needs to be stopped before it can be started */

	if (!dlm_locking_stopped(ls)) {
		spin_unlock(&ls->ls_recover_lock);
		log_error(ls, "start ignored: lockspace running");
		error = -EINVAL;
		goto fail;
	}

	rv->nodeids = ids;
	rv->node_count = count;
	rv->seq = ++ls->ls_recover_seq;
	rv_old = ls->ls_recover_args;
	ls->ls_recover_args = rv;
	spin_unlock(&ls->ls_recover_lock);

	/* free a superseded start that recoverd never consumed */
	if (rv_old) {
		kfree(rv_old->nodeids);
		kfree(rv_old);
	}

	dlm_recoverd_kick(ls);
	return 0;

 fail:
	kfree(rv);
	kfree(ids);
	return error;
}
313
diff --git a/fs/dlm/member.h b/fs/dlm/member.h
new file mode 100644
index 000000000000..927c08c19214
--- /dev/null
+++ b/fs/dlm/member.h
@@ -0,0 +1,24 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
#ifndef __MEMBER_DOT_H__
#define __MEMBER_DOT_H__

/* lockspace membership and recovery start/stop (member.c) */

int dlm_ls_stop(struct dlm_ls *ls);
int dlm_ls_start(struct dlm_ls *ls);
void dlm_clear_members(struct dlm_ls *ls);
void dlm_clear_members_gone(struct dlm_ls *ls);
int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv,int *neg_out);
int dlm_is_removed(struct dlm_ls *ls, int nodeid);

#endif                          /* __MEMBER_DOT_H__ */
23#endif /* __MEMBER_DOT_H__ */
24
diff --git a/fs/dlm/memory.c b/fs/dlm/memory.c
new file mode 100644
index 000000000000..0b9851d0bdb2
--- /dev/null
+++ b/fs/dlm/memory.c
@@ -0,0 +1,122 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#include "dlm_internal.h"
15#include "config.h"
16#include "memory.h"
17
18static kmem_cache_t *lkb_cache;
19
20
21int dlm_memory_init(void)
22{
23 int ret = 0;
24
25 lkb_cache = kmem_cache_create("dlm_lkb", sizeof(struct dlm_lkb),
26 __alignof__(struct dlm_lkb), 0, NULL, NULL);
27 if (!lkb_cache)
28 ret = -ENOMEM;
29 return ret;
30}
31
/* Tear down the lkb slab cache.  The NULL check guards against init
   having failed; kmem_cache_destroy() may not tolerate NULL here. */
void dlm_memory_exit(void)
{
	if (lkb_cache)
		kmem_cache_destroy(lkb_cache);
}
37
38char *allocate_lvb(struct dlm_ls *ls)
39{
40 char *p;
41
42 p = kmalloc(ls->ls_lvblen, GFP_KERNEL);
43 if (p)
44 memset(p, 0, ls->ls_lvblen);
45 return p;
46}
47
/* Free an lvb from allocate_lvb(); kfree(NULL) is a no-op. */
void free_lvb(char *p)
{
	kfree(p);
}
52
53uint64_t *allocate_range(struct dlm_ls *ls)
54{
55 int ralen = 4*sizeof(uint64_t);
56 uint64_t *p;
57
58 p = kmalloc(ralen, GFP_KERNEL);
59 if (p)
60 memset(p, 0, ralen);
61 return p;
62}
63
/* Free a range array from allocate_range(); kfree(NULL) is a no-op. */
void free_range(uint64_t *p)
{
	kfree(p);
}
68
69/* FIXME: have some minimal space built-in to rsb for the name and
70 kmalloc a separate name if needed, like dentries are done */
71
72struct dlm_rsb *allocate_rsb(struct dlm_ls *ls, int namelen)
73{
74 struct dlm_rsb *r;
75
76 DLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,);
77
78 r = kmalloc(sizeof(*r) + namelen, GFP_KERNEL);
79 if (r)
80 memset(r, 0, sizeof(*r) + namelen);
81 return r;
82}
83
84void free_rsb(struct dlm_rsb *r)
85{
86 if (r->res_lvbptr)
87 free_lvb(r->res_lvbptr);
88 kfree(r);
89}
90
91struct dlm_lkb *allocate_lkb(struct dlm_ls *ls)
92{
93 struct dlm_lkb *lkb;
94
95 lkb = kmem_cache_alloc(lkb_cache, GFP_KERNEL);
96 if (lkb)
97 memset(lkb, 0, sizeof(*lkb));
98 return lkb;
99}
100
/* Return an lkb to the slab cache. */
void free_lkb(struct dlm_lkb *lkb)
{
	kmem_cache_free(lkb_cache, lkb);
}
105
106struct dlm_direntry *allocate_direntry(struct dlm_ls *ls, int namelen)
107{
108 struct dlm_direntry *de;
109
110 DLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,);
111
112 de = kmalloc(sizeof(*de) + namelen, GFP_KERNEL);
113 if (de)
114 memset(de, 0, sizeof(*de) + namelen);
115 return de;
116}
117
/* Free a directory entry from allocate_direntry(). */
void free_direntry(struct dlm_direntry *de)
{
	kfree(de);
}
122
diff --git a/fs/dlm/memory.h b/fs/dlm/memory.h
new file mode 100644
index 000000000000..7b235132b0b4
--- /dev/null
+++ b/fs/dlm/memory.h
@@ -0,0 +1,31 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
#ifndef __MEMORY_DOT_H__
#define __MEMORY_DOT_H__

/* allocators for the dlm's core objects (memory.c); each allocate_*
   returns a zeroed object or NULL, each free_* releases it */

int dlm_memory_init(void);
void dlm_memory_exit(void);
struct dlm_rsb *allocate_rsb(struct dlm_ls *ls, int namelen);
void free_rsb(struct dlm_rsb *r);
struct dlm_lkb *allocate_lkb(struct dlm_ls *ls);
void free_lkb(struct dlm_lkb *l);
struct dlm_direntry *allocate_direntry(struct dlm_ls *ls, int namelen);
void free_direntry(struct dlm_direntry *de);
char *allocate_lvb(struct dlm_ls *ls);
void free_lvb(char *l);
uint64_t *allocate_range(struct dlm_ls *ls);
void free_range(uint64_t *l);

#endif		/* __MEMORY_DOT_H__ */
31
diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c
new file mode 100644
index 000000000000..c9b1c3d535f4
--- /dev/null
+++ b/fs/dlm/midcomms.c
@@ -0,0 +1,140 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14/*
15 * midcomms.c
16 *
17 * This is the appallingly named "mid-level" comms layer.
18 *
19 * Its purpose is to take packets from the "real" comms layer,
20 * split them up into packets and pass them to the interested
21 * part of the locking mechanism.
22 *
23 * It also takes messages from the locking layer, formats them
24 * into packets and sends them to the comms layer.
25 */
26
27#include "dlm_internal.h"
28#include "lowcomms.h"
29#include "config.h"
30#include "rcom.h"
31#include "lock.h"
32#include "midcomms.h"
33
34
/*
 * Copy len bytes out of a circular buffer of size limit, starting at
 * offset; if the run crosses the end of the buffer the remainder is
 * taken from the start.
 */
static void copy_from_cb(void *dst, const void *base, unsigned offset,
			 unsigned len, unsigned limit)
{
	unsigned first = len;

	if (offset + first > limit)
		first = limit - offset;

	memcpy(dst, (const char *)base + offset, first);
	if (first < len)
		memcpy((char *)dst + first, base, len - first);
}
47
48/*
49 * Called from the low-level comms layer to process a buffer of
50 * commands.
51 *
52 * Only complete messages are processed here, any "spare" bytes from
53 * the end of a buffer are saved and tacked onto the front of the next
54 * message that comes in. I doubt this will happen very often but we
55 * need to be able to cope with it and I don't want the task to be waiting
56 * for packets to come in when there is useful work to be done.
57 */
58
/*
 * Parse complete dlm messages out of a (possibly wrapping) receive
 * buffer and dispatch each to the lock or recovery layer.  Returns the
 * number of bytes consumed, or a negative error for a malformed stream.
 */
int dlm_process_incoming_buffer(int nodeid, const void *base,
				unsigned offset, unsigned len, unsigned limit)
{
	/* stack buffer for typical messages; a bigger one is kmalloc'd
	   below the first time a message exceeds it */
	unsigned char __tmp[DLM_INBUF_LEN];
	struct dlm_header *msg = (struct dlm_header *) __tmp;
	int ret = 0;		/* total bytes consumed so far */
	int err = 0;		/* sticky parse error (-EINVAL / -E2BIG) */
	uint16_t msglen;
	uint32_t lockspace;

	while (len > sizeof(struct dlm_header)) {

		/* Copy just the header to check the total length. The
		   message may wrap around the end of the buffer back to the
		   start, so we need to use a temp buffer and copy_from_cb. */

		copy_from_cb(msg, base, offset, sizeof(struct dlm_header),
			     limit);

		msglen = le16_to_cpu(msg->h_length);
		lockspace = msg->h_lockspace;

		err = -EINVAL;
		if (msglen < sizeof(struct dlm_header))
			break;
		err = -E2BIG;
		if (msglen > dlm_config.buffer_size) {
			log_print("message size %d from %d too big, buf len %d",
				  msglen, nodeid, len);
			break;
		}
		err = 0;

		/* If only part of the full message is contained in this
		   buffer, then do nothing and wait for lowcomms to call
		   us again later with more data.  We return 0 meaning
		   we've consumed none of the input buffer. */

		if (msglen > len)
			break;

		/* Allocate a larger temp buffer if the full message won't fit
		   in the buffer on the stack (which should work for most
		   ordinary messages). */

		if (msglen > sizeof(__tmp) &&
		    msg == (struct dlm_header *) __tmp) {
			msg = kmalloc(dlm_config.buffer_size, GFP_KERNEL);
			if (msg == NULL)
				/* give back what was consumed so far;
				   presumably lowcomms re-presents the
				   rest later -- TODO confirm */
				return ret;
		}

		copy_from_cb(msg, base, offset, msglen, limit);

		/* the full copy must agree with the header peeked above */
		BUG_ON(lockspace != msg->h_lockspace);

		ret += msglen;
		offset += msglen;
		/* wrap by masking: assumes limit is a power of two */
		offset &= (limit - 1);
		len -= msglen;

		/* dispatch on message class */
		switch (msg->h_cmd) {
		case DLM_MSG:
			dlm_receive_message(msg, nodeid, 0);
			break;

		case DLM_RCOM:
			dlm_receive_rcom(msg, nodeid);
			break;

		default:
			log_print("unknown msg type %x from %u: %u %u %u %u",
				  msg->h_cmd, nodeid, msglen, len, offset, ret);
		}
	}

	if (msg != (struct dlm_header *) __tmp)
		kfree(msg);

	return err ? err : ret;
}
140
diff --git a/fs/dlm/midcomms.h b/fs/dlm/midcomms.h
new file mode 100644
index 000000000000..95852a5f111d
--- /dev/null
+++ b/fs/dlm/midcomms.h
@@ -0,0 +1,21 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
#ifndef __MIDCOMMS_DOT_H__
#define __MIDCOMMS_DOT_H__

/* split a lowcomms receive buffer into messages (midcomms.c) */

int dlm_process_incoming_buffer(int nodeid, const void *base, unsigned offset,
				unsigned len, unsigned limit);

#endif				/* __MIDCOMMS_DOT_H__ */
21
diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c
new file mode 100644
index 000000000000..4c5c08a8860e
--- /dev/null
+++ b/fs/dlm/rcom.c
@@ -0,0 +1,460 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#include "dlm_internal.h"
15#include "lockspace.h"
16#include "member.h"
17#include "lowcomms.h"
18#include "midcomms.h"
19#include "rcom.h"
20#include "recover.h"
21#include "dir.h"
22#include "config.h"
23#include "memory.h"
24#include "lock.h"
25#include "util.h"
26
27
/* Predicate for dlm_wait_function(): has the awaited rcom reply arrived? */
static int rcom_response(struct dlm_ls *ls)
{
	return test_bit(LSFL_RCOM_READY, &ls->ls_flags);
}
32
/* Allocate a lowcomms send buffer and fill in the common rcom header.
   len is extra payload beyond struct dlm_rcom.  On success, returns 0
   and hands back the message and its send handle via rc_ret/mh_ret;
   the caller completes the body and calls send_rcom(). */
static int create_rcom(struct dlm_ls *ls, int to_nodeid, int type, int len,
		       struct dlm_rcom **rc_ret, struct dlm_mhandle **mh_ret)
{
	struct dlm_rcom *rc;
	struct dlm_mhandle *mh;
	char *mb;
	int mb_len = sizeof(struct dlm_rcom) + len;

	mh = dlm_lowcomms_get_buffer(to_nodeid, mb_len, GFP_KERNEL, &mb);
	if (!mh) {
		log_print("create_rcom to %d type %d len %d ENOBUFS",
			  to_nodeid, type, len);
		return -ENOBUFS;
	}
	memset(mb, 0, mb_len);

	rc = (struct dlm_rcom *) mb;

	rc->rc_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
	rc->rc_header.h_lockspace = ls->ls_global_id;
	rc->rc_header.h_nodeid = dlm_our_nodeid();
	rc->rc_header.h_length = mb_len;
	rc->rc_header.h_cmd = DLM_RCOM;

	rc->rc_type = type;

	*mh_ret = mh;
	*rc_ret = rc;
	return 0;
}
63
/* Convert the rcom to wire byte order and commit it to lowcomms. */
static void send_rcom(struct dlm_ls *ls, struct dlm_mhandle *mh,
		      struct dlm_rcom *rc)
{
	dlm_rcom_out(rc);
	dlm_lowcomms_commit_buffer(mh);
}
70
71/* When replying to a status request, a node also sends back its
72 configuration values. The requesting node then checks that the remote
73 node is configured the same way as itself. */
74
static void make_config(struct dlm_ls *ls, struct rcom_config *rf)
{
	/* report our lvb length and lockspace flags for cross-checking */
	rf->rf_lvblen = ls->ls_lvblen;
	rf->rf_lsflags = ls->ls_exflags;
}
80
81static int check_config(struct dlm_ls *ls, struct rcom_config *rf, int nodeid)
82{
83 if (rf->rf_lvblen != ls->ls_lvblen ||
84 rf->rf_lsflags != ls->ls_exflags) {
85 log_error(ls, "config mismatch: %d,%x nodeid %d: %d,%x",
86 ls->ls_lvblen, ls->ls_exflags,
87 nodeid, rf->rf_lvblen, rf->rf_lsflags);
88 return -EINVAL;
89 }
90 return 0;
91}
92
/* Ask nodeid for its recovery status.  The local node is answered
   directly; a remote node is sent DLM_RCOM_STATUS and we block until
   the reply lands in ls_recover_buf (or recovery is aborted).  The
   caller reads rc_result from ls_recover_buf afterwards. */
int dlm_rcom_status(struct dlm_ls *ls, int nodeid)
{
	struct dlm_rcom *rc;
	struct dlm_mhandle *mh;
	int error = 0;

	memset(ls->ls_recover_buf, 0, dlm_config.buffer_size);

	if (nodeid == dlm_our_nodeid()) {
		rc = (struct dlm_rcom *) ls->ls_recover_buf;
		rc->rc_result = dlm_recover_status(ls);
		goto out;
	}

	error = create_rcom(ls, nodeid, DLM_RCOM_STATUS, 0, &rc, &mh);
	if (error)
		goto out;

	send_rcom(ls, mh, rc);

	/* wait for receive_rcom_status_reply() to set LSFL_RCOM_READY */
	error = dlm_wait_function(ls, &rcom_response);
	clear_bit(LSFL_RCOM_READY, &ls->ls_flags);
	if (error)
		goto out;

	rc = (struct dlm_rcom *) ls->ls_recover_buf;

	if (rc->rc_result == -ESRCH) {
		/* we pretend the remote lockspace exists with 0 status */
		log_debug(ls, "remote node %d not ready", nodeid);
		rc->rc_result = 0;
	} else
		error = check_config(ls, (struct rcom_config *) rc->rc_buf,
				     nodeid);
	/* the caller looks at rc_result for the remote recovery status */
 out:
	return error;
}
131
/* Answer a STATUS request: report our recovery status plus our
   lvblen/exflags config so the requester can cross-check them. */
static void receive_rcom_status(struct dlm_ls *ls, struct dlm_rcom *rc_in)
{
	struct dlm_rcom *rc;
	struct dlm_mhandle *mh;
	int error, nodeid = rc_in->rc_header.h_nodeid;

	error = create_rcom(ls, nodeid, DLM_RCOM_STATUS_REPLY,
			    sizeof(struct rcom_config), &rc, &mh);
	if (error)
		return;
	rc->rc_result = dlm_recover_status(ls);
	make_config(ls, (struct rcom_config *) rc->rc_buf);

	send_rcom(ls, mh, rc);
}
147
148static void receive_rcom_status_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
149{
150 memcpy(ls->ls_recover_buf, rc_in, rc_in->rc_header.h_length);
151 set_bit(LSFL_RCOM_READY, &ls->ls_flags);
152 wake_up(&ls->ls_wait_general);
153}
154
/* Request the next chunk of directory names from nodeid, resuming
   after last_name/last_len.  The local node is served directly into
   ls_recover_buf; otherwise we send DLM_RCOM_NAMES and block until the
   reply arrives (or recovery is aborted). */
int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name, int last_len)
{
	struct dlm_rcom *rc;
	struct dlm_mhandle *mh;
	int error = 0, len = sizeof(struct dlm_rcom);

	memset(ls->ls_recover_buf, 0, dlm_config.buffer_size);

	if (nodeid == dlm_our_nodeid()) {
		dlm_copy_master_names(ls, last_name, last_len,
		                      ls->ls_recover_buf + len,
		                      dlm_config.buffer_size - len, nodeid);
		goto out;
	}

	error = create_rcom(ls, nodeid, DLM_RCOM_NAMES, last_len, &rc, &mh);
	if (error)
		goto out;
	memcpy(rc->rc_buf, last_name, last_len);

	send_rcom(ls, mh, rc);

	/* wait for receive_rcom_names_reply() to set LSFL_RCOM_READY */
	error = dlm_wait_function(ls, &rcom_response);
	clear_bit(LSFL_RCOM_READY, &ls->ls_flags);
 out:
	return error;
}
182
183static void receive_rcom_names(struct dlm_ls *ls, struct dlm_rcom *rc_in)
184{
185 struct dlm_rcom *rc;
186 struct dlm_mhandle *mh;
187 int error, inlen, outlen;
188 int nodeid = rc_in->rc_header.h_nodeid;
189 uint32_t status = dlm_recover_status(ls);
190
191 /*
192 * We can't run dlm_dir_rebuild_send (which uses ls_nodes) while
193 * dlm_recoverd is running ls_nodes_reconfig (which changes ls_nodes).
194 * It could only happen in rare cases where we get a late NAMES
195 * message from a previous instance of recovery.
196 */
197
198 if (!(status & DLM_RS_NODES)) {
199 log_debug(ls, "ignoring RCOM_NAMES from %u", nodeid);
200 return;
201 }
202
203 nodeid = rc_in->rc_header.h_nodeid;
204 inlen = rc_in->rc_header.h_length - sizeof(struct dlm_rcom);
205 outlen = dlm_config.buffer_size - sizeof(struct dlm_rcom);
206
207 error = create_rcom(ls, nodeid, DLM_RCOM_NAMES_REPLY, outlen, &rc, &mh);
208 if (error)
209 return;
210
211 dlm_copy_master_names(ls, rc_in->rc_buf, inlen, rc->rc_buf, outlen,
212 nodeid);
213 send_rcom(ls, mh, rc);
214}
215
216static void receive_rcom_names_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
217{
218 memcpy(ls->ls_recover_buf, rc_in, rc_in->rc_header.h_length);
219 set_bit(LSFL_RCOM_READY, &ls->ls_flags);
220 wake_up(&ls->ls_wait_general);
221}
222
/* Ask the directory node for the master of r.  The rsb's address is
   stuffed into rc_id so the eventual reply can be matched back to r
   (see recover_list_find()). */
int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid)
{
	struct dlm_rcom *rc;
	struct dlm_mhandle *mh;
	struct dlm_ls *ls = r->res_ls;
	int error;

	error = create_rcom(ls, dir_nodeid, DLM_RCOM_LOOKUP, r->res_length,
			    &rc, &mh);
	if (error)
		goto out;
	memcpy(rc->rc_buf, r->res_name, r->res_length);
	rc->rc_id = (unsigned long) r;

	send_rcom(ls, mh, rc);
 out:
	return error;
}
241
/* Answer a LOOKUP: resolve the requested name in our directory and
   return the master nodeid (or a negative error) in rc_result, echoing
   rc_id so the requester can match the reply to its rsb. */
static void receive_rcom_lookup(struct dlm_ls *ls, struct dlm_rcom *rc_in)
{
	struct dlm_rcom *rc;
	struct dlm_mhandle *mh;
	int error, ret_nodeid, nodeid = rc_in->rc_header.h_nodeid;
	int len = rc_in->rc_header.h_length - sizeof(struct dlm_rcom);

	error = create_rcom(ls, nodeid, DLM_RCOM_LOOKUP_REPLY, 0, &rc, &mh);
	if (error)
		return;

	error = dlm_dir_lookup(ls, nodeid, rc_in->rc_buf, len, &ret_nodeid);
	if (error)
		ret_nodeid = error;
	rc->rc_result = ret_nodeid;
	rc->rc_id = rc_in->rc_id;

	send_rcom(ls, mh, rc);
}
261
/* A LOOKUP reply carries the master nodeid; hand it to the recovery
   code to finish setting the new master on the waiting rsb. */
static void receive_rcom_lookup_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
{
	dlm_recover_master_reply(ls, rc_in);
}
266
/* Serialize an lkb (plus its rsb name, optional range and optional
   lvb) into the wire rcom_lock form for sending to the new master. */
static void pack_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb,
			   struct rcom_lock *rl)
{
	memset(rl, 0, sizeof(*rl));

	rl->rl_ownpid = lkb->lkb_ownpid;
	rl->rl_lkid = lkb->lkb_id;
	rl->rl_exflags = lkb->lkb_exflags;
	rl->rl_flags = lkb->lkb_flags;
	rl->rl_lvbseq = lkb->lkb_lvbseq;
	rl->rl_rqmode = lkb->lkb_rqmode;
	rl->rl_grmode = lkb->lkb_grmode;
	rl->rl_status = lkb->lkb_status;
	rl->rl_wait_type = lkb->lkb_wait_type;

	/* record which ast callbacks the lock owner registered */
	if (lkb->lkb_bastaddr)
		rl->rl_asts |= AST_BAST;
	if (lkb->lkb_astaddr)
		rl->rl_asts |= AST_COMP;

	if (lkb->lkb_range)
		memcpy(rl->rl_range, lkb->lkb_range, 4*sizeof(uint64_t));

	rl->rl_namelen = r->res_length;
	memcpy(rl->rl_name, r->res_name, r->res_length);

	/* FIXME: might we have an lvb without DLM_LKF_VALBLK set ?
	   If so, receive_rcom_lock_args() won't take this copy. */

	if (lkb->lkb_lvbptr)
		memcpy(rl->rl_lvb, lkb->lkb_lvbptr, r->res_ls->ls_lvblen);
}
299
/* Send one of our locks on r to the (new) master during recovery; the
   lvb is appended to the message when present.  rc_id carries the rsb
   address so the LOCK_REPLY can be matched back to r. */
int dlm_send_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
	struct dlm_ls *ls = r->res_ls;
	struct dlm_rcom *rc;
	struct dlm_mhandle *mh;
	struct rcom_lock *rl;
	int error, len = sizeof(struct rcom_lock);

	if (lkb->lkb_lvbptr)
		len += ls->ls_lvblen;

	error = create_rcom(ls, r->res_nodeid, DLM_RCOM_LOCK, len, &rc, &mh);
	if (error)
		goto out;

	rl = (struct rcom_lock *) rc->rc_buf;
	pack_rcom_lock(r, lkb, rl);
	rc->rc_id = (unsigned long) r;

	send_rcom(ls, mh, rc);
 out:
	return error;
}
323
/* As the new master, take a LOCK message (a lock copy from a node
   re-sending its locks) and echo the rcom_lock back as a reply. */
static void receive_rcom_lock(struct dlm_ls *ls, struct dlm_rcom *rc_in)
{
	struct dlm_rcom *rc;
	struct dlm_mhandle *mh;
	int error, nodeid = rc_in->rc_header.h_nodeid;

	dlm_recover_master_copy(ls, rc_in);

	error = create_rcom(ls, nodeid, DLM_RCOM_LOCK_REPLY,
			    sizeof(struct rcom_lock), &rc, &mh);
	if (error)
		return;

	/* We send back the same rcom_lock struct we received, but
	   dlm_recover_master_copy() has filled in rl_remid and rl_result */

	memcpy(rc->rc_buf, rc_in->rc_buf, sizeof(struct rcom_lock));
	rc->rc_id = rc_in->rc_id;

	send_rcom(ls, mh, rc);
}
345
/* Process the master's reply to one of our re-sent locks; ignored if
   recovery has not reached the directory-rebuild stage (a stale reply
   from a previous recovery instance). */
static void receive_rcom_lock_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
{
	uint32_t status = dlm_recover_status(ls);

	if (!(status & DLM_RS_DIR)) {
		log_debug(ls, "ignoring RCOM_LOCK_REPLY from %u",
			  rc_in->rc_header.h_nodeid);
		return;
	}

	dlm_recover_process_copy(ls, rc_in);
}
358
/* Reply -ESRCH to a STATUS request for a lockspace we don't have
   (yet).  Built by hand rather than via create_rcom() because there is
   no local ls; the requester's lockspace id is echoed back. */
static int send_ls_not_ready(int nodeid, struct dlm_rcom *rc_in)
{
	struct dlm_rcom *rc;
	struct dlm_mhandle *mh;
	char *mb;
	int mb_len = sizeof(struct dlm_rcom);

	mh = dlm_lowcomms_get_buffer(nodeid, mb_len, GFP_KERNEL, &mb);
	if (!mh)
		return -ENOBUFS;
	memset(mb, 0, mb_len);

	rc = (struct dlm_rcom *) mb;

	rc->rc_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
	rc->rc_header.h_lockspace = rc_in->rc_header.h_lockspace;
	rc->rc_header.h_nodeid = dlm_our_nodeid();
	rc->rc_header.h_length = mb_len;
	rc->rc_header.h_cmd = DLM_RCOM;

	rc->rc_type = DLM_RCOM_STATUS_REPLY;
	rc->rc_result = -ESRCH;

	dlm_rcom_out(rc);
	dlm_lowcomms_commit_buffer(mh);

	return 0;
}
387
388/* Called by dlm_recvd; corresponds to dlm_receive_message() but special
389 recovery-only comms are sent through here. */
390
/* Entry point for all recovery (rcom) messages: byte-swap from wire
   order, locate the target lockspace, screen out stale messages, and
   dispatch on rc_type.  Drops the lockspace reference on exit. */
void dlm_receive_rcom(struct dlm_header *hd, int nodeid)
{
	struct dlm_rcom *rc = (struct dlm_rcom *) hd;
	struct dlm_ls *ls;

	dlm_rcom_in(rc);

	/* If the lockspace doesn't exist then still send a status message
	   back; it's possible that it just doesn't have its global_id yet. */

	ls = dlm_find_lockspace_global(hd->h_lockspace);
	if (!ls) {
		log_print("lockspace %x from %d not found",
			  hd->h_lockspace, nodeid);
		send_ls_not_ready(nodeid, rc);
		return;
	}

	/* only STATUS may be processed while recovery is stopped */
	if (dlm_recovery_stopped(ls) && (rc->rc_type != DLM_RCOM_STATUS)) {
		log_error(ls, "ignoring recovery message %x from %d",
			  rc->rc_type, nodeid);
		goto out;
	}

	/* sender's claimed nodeid must match the connection it came in on */
	if (nodeid != rc->rc_header.h_nodeid) {
		log_error(ls, "bad rcom nodeid %d from %d",
			  rc->rc_header.h_nodeid, nodeid);
		goto out;
	}

	switch (rc->rc_type) {
	case DLM_RCOM_STATUS:
		receive_rcom_status(ls, rc);
		break;

	case DLM_RCOM_NAMES:
		receive_rcom_names(ls, rc);
		break;

	case DLM_RCOM_LOOKUP:
		receive_rcom_lookup(ls, rc);
		break;

	case DLM_RCOM_LOCK:
		receive_rcom_lock(ls, rc);
		break;

	case DLM_RCOM_STATUS_REPLY:
		receive_rcom_status_reply(ls, rc);
		break;

	case DLM_RCOM_NAMES_REPLY:
		receive_rcom_names_reply(ls, rc);
		break;

	case DLM_RCOM_LOOKUP_REPLY:
		receive_rcom_lookup_reply(ls, rc);
		break;

	case DLM_RCOM_LOCK_REPLY:
		receive_rcom_lock_reply(ls, rc);
		break;

	default:
		DLM_ASSERT(0, printk("rc_type=%x\n", rc->rc_type););
	}
 out:
	dlm_put_lockspace(ls);
}
460
diff --git a/fs/dlm/rcom.h b/fs/dlm/rcom.h
new file mode 100644
index 000000000000..d7984321ff41
--- /dev/null
+++ b/fs/dlm/rcom.h
@@ -0,0 +1,24 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
#ifndef __RCOM_DOT_H__
#define __RCOM_DOT_H__

/* recovery-only communications: status/names/lookup/lock exchange (rcom.c) */

int dlm_rcom_status(struct dlm_ls *ls, int nodeid);
int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name,int last_len);
int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid);
int dlm_send_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb);
void dlm_receive_rcom(struct dlm_header *hd, int nodeid);

#endif
24
diff --git a/fs/dlm/recover.c b/fs/dlm/recover.c
new file mode 100644
index 000000000000..b036ee7dcb32
--- /dev/null
+++ b/fs/dlm/recover.c
@@ -0,0 +1,762 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#include "dlm_internal.h"
15#include "lockspace.h"
16#include "dir.h"
17#include "config.h"
18#include "ast.h"
19#include "memory.h"
20#include "rcom.h"
21#include "lock.h"
22#include "lowcomms.h"
23#include "member.h"
24#include "recover.h"
25
26
27/*
28 * Recovery waiting routines: these functions wait for a particular reply from
29 * a remote node, or for the remote node to report a certain status. They need
30 * to abort if the lockspace is stopped indicating a node has failed (perhaps
31 * the one being waited for).
32 */
33
34/*
35 * Wait until given function returns non-zero or lockspace is stopped
36 * (LS_RECOVERY_STOP set due to failure of a node in ls_nodes). When another
37 * function thinks it could have completed the waited-on task, they should wake
38 * up ls_wait_general to get an immediate response rather than waiting for the
39 * timer to detect the result. A timer wakes us up periodically while waiting
40 * to see if we should abort due to a node failure. This should only be called
41 * by the dlm_recoverd thread.
42 */
43
/* Periodic timer callback: re-arm the timer and wake the waiter in
   dlm_wait_function() so it can re-check for an abort condition. */
static void dlm_wait_timer_fn(unsigned long data)
{
	struct dlm_ls *ls = (struct dlm_ls *) data;
	mod_timer(&ls->ls_timer, jiffies + (dlm_config.recover_timer * HZ));
	wake_up(&ls->ls_wait_general);
}
50
/* Sleep until testfn(ls) becomes true or recovery is aborted.  The
   periodic timer above wakes us so the stop condition is polled even
   without an explicit wake_up.  Returns 0 on success, -EINTR if
   recovery was stopped while waiting. */
int dlm_wait_function(struct dlm_ls *ls, int (*testfn) (struct dlm_ls *ls))
{
	int error = 0;

	init_timer(&ls->ls_timer);
	ls->ls_timer.function = dlm_wait_timer_fn;
	ls->ls_timer.data = (long) ls;
	ls->ls_timer.expires = jiffies + (dlm_config.recover_timer * HZ);
	add_timer(&ls->ls_timer);

	wait_event(ls->ls_wait_general, testfn(ls) || dlm_recovery_stopped(ls));
	del_timer_sync(&ls->ls_timer);

	if (dlm_recovery_stopped(ls)) {
		log_debug(ls, "dlm_wait_function aborted");
		error = -EINTR;
	}
	return error;
}
70
71/*
72 * An efficient way for all nodes to wait for all others to have a certain
73 * status. The node with the lowest nodeid polls all the others for their
74 * status (wait_status_all) and all the others poll the node with the low id
75 * for its accumulated result (wait_status_low). When all nodes have set
76 * status flag X, then status flag X_ALL will be set on the low nodeid.
77 */
78
79uint32_t dlm_recover_status(struct dlm_ls *ls)
80{
81 uint32_t status;
82 spin_lock(&ls->ls_recover_lock);
83 status = ls->ls_recover_status;
84 spin_unlock(&ls->ls_recover_lock);
85 return status;
86}
87
/* OR the given flag(s) into the recovery status under the recover lock. */
void dlm_set_recover_status(struct dlm_ls *ls, uint32_t status)
{
	spin_lock(&ls->ls_recover_lock);
	ls->ls_recover_status |= status;
	spin_unlock(&ls->ls_recover_lock);
}
94
/* Poll each member node in turn until it reports wait_status set,
   backing off by 20ms per retry up to 1s; aborts with -EINTR if
   recovery stops.  dlm_rcom_status() leaves each reply in
   ls_recover_buf, which rc points at. */
static int wait_status_all(struct dlm_ls *ls, uint32_t wait_status)
{
	struct dlm_rcom *rc = (struct dlm_rcom *) ls->ls_recover_buf;
	struct dlm_member *memb;
	int error = 0, delay;

	list_for_each_entry(memb, &ls->ls_nodes, list) {
		delay = 0;
		for (;;) {
			if (dlm_recovery_stopped(ls)) {
				error = -EINTR;
				goto out;
			}

			error = dlm_rcom_status(ls, memb->nodeid);
			if (error)
				goto out;

			if (rc->rc_result & wait_status)
				break;
			if (delay < 1000)
				delay += 20;
			msleep(delay);
		}
	}
 out:
	return error;
}
123
/* Poll the low-nodeid node until it reports wait_status set, with the
   same 20ms-per-retry backoff (capped at 1s) as wait_status_all();
   aborts with -EINTR if recovery stops. */
static int wait_status_low(struct dlm_ls *ls, uint32_t wait_status)
{
	struct dlm_rcom *rc = (struct dlm_rcom *) ls->ls_recover_buf;
	int error = 0, delay = 0, nodeid = ls->ls_low_nodeid;

	for (;;) {
		if (dlm_recovery_stopped(ls)) {
			error = -EINTR;
			goto out;
		}

		error = dlm_rcom_status(ls, nodeid);
		if (error)
			break;

		if (rc->rc_result & wait_status)
			break;
		if (delay < 1000)
			delay += 20;
		msleep(delay);
	}
 out:
	return error;
}
148
149static int wait_status(struct dlm_ls *ls, uint32_t status)
150{
151 uint32_t status_all = status << 1;
152 int error;
153
154 if (ls->ls_low_nodeid == dlm_our_nodeid()) {
155 error = wait_status_all(ls, status);
156 if (!error)
157 dlm_set_recover_status(ls, status_all);
158 } else
159 error = wait_status_low(ls, status_all);
160
161 return error;
162}
163
/* Per-phase recovery barriers, all built on wait_status(). */

int dlm_recover_members_wait(struct dlm_ls *ls)
{
	return wait_status(ls, DLM_RS_NODES);
}

int dlm_recover_directory_wait(struct dlm_ls *ls)
{
	return wait_status(ls, DLM_RS_DIR);
}

int dlm_recover_locks_wait(struct dlm_ls *ls)
{
	return wait_status(ls, DLM_RS_LOCKS);
}

int dlm_recover_done_wait(struct dlm_ls *ls)
{
	return wait_status(ls, DLM_RS_DONE);
}
183
184/*
185 * The recover_list contains all the rsb's for which we've requested the new
186 * master nodeid. As replies are returned from the resource directories the
187 * rsb's are removed from the list. When the list is empty we're done.
188 *
189 * The recover_list is later similarly used for all rsb's for which we've sent
190 * new lkb's and need to receive new corresponding lkid's.
191 *
192 * We use the address of the rsb struct as a simple local identifier for the
193 * rsb so we can match an rcom reply with the rsb it was sent for.
194 */
195
196static int recover_list_empty(struct dlm_ls *ls)
197{
198 int empty;
199
200 spin_lock(&ls->ls_recover_list_lock);
201 empty = list_empty(&ls->ls_recover_list);
202 spin_unlock(&ls->ls_recover_list_lock);
203
204 return empty;
205}
206
/* Put r on the recover_list (if it isn't already there) and take a
   reference so the rsb can't go away while a reply is outstanding. */
static void recover_list_add(struct dlm_rsb *r)
{
	struct dlm_ls *ls = r->res_ls;

	spin_lock(&ls->ls_recover_list_lock);
	if (list_empty(&r->res_recover_list)) {
		list_add_tail(&r->res_recover_list, &ls->ls_recover_list);
		ls->ls_recover_list_count++;
		dlm_hold_rsb(r);
	}
	spin_unlock(&ls->ls_recover_list_lock);
}
219
/* Take r off the recover_list and drop the reference taken by
   recover_list_add(). */
static void recover_list_del(struct dlm_rsb *r)
{
	struct dlm_ls *ls = r->res_ls;

	spin_lock(&ls->ls_recover_list_lock);
	list_del_init(&r->res_recover_list);
	ls->ls_recover_list_count--;
	spin_unlock(&ls->ls_recover_list_lock);

	dlm_put_rsb(r);
}
231
/* Find the rsb on the recover_list whose address matches id; id is the
   rsb pointer value that was placed in rc_id when the request was sent
   (see dlm_send_rcom_lookup / dlm_send_rcom_lock).  NULL if absent. */
static struct dlm_rsb *recover_list_find(struct dlm_ls *ls, uint64_t id)
{
	struct dlm_rsb *r = NULL;

	spin_lock(&ls->ls_recover_list_lock);

	list_for_each_entry(r, &ls->ls_recover_list, res_recover_list) {
		if (id == (unsigned long) r)
			goto out;
	}
	r = NULL;
 out:
	spin_unlock(&ls->ls_recover_list_lock);
	return r;
}
247
/* Empty the recover_list, releasing the reference held on each rsb,
   and warn if the count doesn't come out at zero. */
static void recover_list_clear(struct dlm_ls *ls)
{
	struct dlm_rsb *r, *s;

	spin_lock(&ls->ls_recover_list_lock);
	list_for_each_entry_safe(r, s, &ls->ls_recover_list, res_recover_list) {
		list_del_init(&r->res_recover_list);
		dlm_put_rsb(r);
		ls->ls_recover_list_count--;
	}

	if (ls->ls_recover_list_count != 0) {
		log_error(ls, "warning: recover_list_count %d",
			  ls->ls_recover_list_count);
		ls->ls_recover_list_count = 0;
	}
	spin_unlock(&ls->ls_recover_list_lock);
}
266
267
268/* Master recovery: find new master node for rsb's that were
269 mastered on nodes that have been removed.
270
271 dlm_recover_masters
272 recover_master
273 dlm_send_rcom_lookup -> receive_rcom_lookup
274 dlm_dir_lookup
275 receive_rcom_lookup_reply <-
276 dlm_recover_master_reply
277 set_new_master
278 set_master_lkbs
279 set_lock_master
280*/
281
282/*
283 * Set the lock master for all LKBs in a lock queue
284 * If we are the new master of the rsb, we may have received new
285 * MSTCPY locks from other nodes already which we need to ignore
286 * when setting the new nodeid.
287 */
288
289static void set_lock_master(struct list_head *queue, int nodeid)
290{
291 struct dlm_lkb *lkb;
292
293 list_for_each_entry(lkb, queue, lkb_statequeue)
294 if (!(lkb->lkb_flags & DLM_IFL_MSTCPY))
295 lkb->lkb_nodeid = nodeid;
296}
297
298static void set_master_lkbs(struct dlm_rsb *r)
299{
300 set_lock_master(&r->res_grantqueue, r->res_nodeid);
301 set_lock_master(&r->res_convertqueue, r->res_nodeid);
302 set_lock_master(&r->res_waitqueue, r->res_nodeid);
303}
304
/*
 * Propagate the new master nodeid to locks
 * The NEW_MASTER flag tells dlm_recover_locks() which rsb's to consider.
 * The NEW_MASTER2 flag tells recover_lvb() which rsb's to consider.
 */
310
311static void set_new_master(struct dlm_rsb *r, int nodeid)
312{
313 lock_rsb(r);
314 r->res_nodeid = nodeid;
315 set_master_lkbs(r);
316 rsb_set_flag(r, RSB_NEW_MASTER);
317 rsb_set_flag(r, RSB_NEW_MASTER2);
318 unlock_rsb(r);
319}
320
321/*
322 * We do async lookups on rsb's that need new masters. The rsb's
323 * waiting for a lookup reply are kept on the recover_list.
324 */
325
326static int recover_master(struct dlm_rsb *r)
327{
328 struct dlm_ls *ls = r->res_ls;
329 int error, dir_nodeid, ret_nodeid, our_nodeid = dlm_our_nodeid();
330
331 dir_nodeid = dlm_dir_nodeid(r);
332
333 if (dir_nodeid == our_nodeid) {
334 error = dlm_dir_lookup(ls, our_nodeid, r->res_name,
335 r->res_length, &ret_nodeid);
336 if (error)
337 log_error(ls, "recover dir lookup error %d", error);
338
339 if (ret_nodeid == our_nodeid)
340 ret_nodeid = 0;
341 set_new_master(r, ret_nodeid);
342 } else {
343 recover_list_add(r);
344 error = dlm_send_rcom_lookup(r, dir_nodeid);
345 }
346
347 return error;
348}
349
350/*
351 * When not using a directory, most resource names will hash to a new static
352 * master nodeid and the resource will need to be remastered.
353 */
354
355static int recover_master_static(struct dlm_rsb *r)
356{
357 int master = dlm_dir_nodeid(r);
358
359 if (master == dlm_our_nodeid())
360 master = 0;
361
362 if (r->res_nodeid != master) {
363 if (is_master(r))
364 dlm_purge_mstcpy_locks(r);
365 set_new_master(r, master);
366 return 1;
367 }
368 return 0;
369}
370
371/*
372 * Go through local root resources and for each rsb which has a master which
373 * has departed, get the new master nodeid from the directory. The dir will
374 * assign mastery to the first node to look up the new master. That means
375 * we'll discover in this lookup if we're the new master of any rsb's.
376 *
377 * We fire off all the dir lookup requests individually and asynchronously to
378 * the correct dir node.
379 */
380
/*
 * Walk every root rsb and initiate master recovery for each one whose
 * master node has departed.  With no directory, remastering is done
 * synchronously (recover_master_static); with a directory, async rcom
 * lookups are fired (recover_master) and dlm_wait_function() blocks
 * until all replies have drained the recover_list.
 *
 * Returns 0 on success, or -EINTR if a newer recovery event superseded
 * this pass; on error the recover_list is cleared.
 */
int dlm_recover_masters(struct dlm_ls *ls)
{
	struct dlm_rsb *r;
	int error = 0, count = 0;

	log_debug(ls, "dlm_recover_masters");

	down_read(&ls->ls_root_sem);
	list_for_each_entry(r, &ls->ls_root_list, res_root_list) {
		/* a newer stop event makes this whole pass obsolete */
		if (dlm_recovery_stopped(ls)) {
			up_read(&ls->ls_root_sem);
			error = -EINTR;
			goto out;
		}

		if (dlm_no_directory(ls))
			count += recover_master_static(r);
		else if (!is_master(r) && dlm_is_removed(ls, r->res_nodeid)) {
			/* NOTE(review): recover_master()'s return value is
			   ignored; a failed dlm_send_rcom_lookup leaves the
			   rsb parked on the recover_list — confirm intended */
			recover_master(r);
			count++;
		}

		/* the root list can be long; yield between entries */
		schedule();
	}
	up_read(&ls->ls_root_sem);

	log_debug(ls, "dlm_recover_masters %d resources", count);

	/* wait for all outstanding lookup replies */
	error = dlm_wait_function(ls, &recover_list_empty);
 out:
	if (error)
		recover_list_clear(ls);
	return error;
}
415
416int dlm_recover_master_reply(struct dlm_ls *ls, struct dlm_rcom *rc)
417{
418 struct dlm_rsb *r;
419 int nodeid;
420
421 r = recover_list_find(ls, rc->rc_id);
422 if (!r) {
423 log_error(ls, "dlm_recover_master_reply no id %llx",
424 rc->rc_id);
425 goto out;
426 }
427
428 nodeid = rc->rc_result;
429 if (nodeid == dlm_our_nodeid())
430 nodeid = 0;
431
432 set_new_master(r, nodeid);
433 recover_list_del(r);
434
435 if (recover_list_empty(ls))
436 wake_up(&ls->ls_wait_general);
437 out:
438 return 0;
439}
440
441
442/* Lock recovery: rebuild the process-copy locks we hold on a
443 remastered rsb on the new rsb master.
444
445 dlm_recover_locks
446 recover_locks
447 recover_locks_queue
448 dlm_send_rcom_lock -> receive_rcom_lock
449 dlm_recover_master_copy
450 receive_rcom_lock_reply <-
451 dlm_recover_process_copy
452*/
453
454
455/*
456 * keep a count of the number of lkb's we send to the new master; when we get
457 * an equal number of replies then recovery for the rsb is done
458 */
459
460static int recover_locks_queue(struct dlm_rsb *r, struct list_head *head)
461{
462 struct dlm_lkb *lkb;
463 int error = 0;
464
465 list_for_each_entry(lkb, head, lkb_statequeue) {
466 error = dlm_send_rcom_lock(r, lkb);
467 if (error)
468 break;
469 r->res_recover_locks_count++;
470 }
471
472 return error;
473}
474
475static int all_queues_empty(struct dlm_rsb *r)
476{
477 if (!list_empty(&r->res_grantqueue) ||
478 !list_empty(&r->res_convertqueue) ||
479 !list_empty(&r->res_waitqueue))
480 return 0;
481 return 1;
482}
483
/*
 * Send all our locks on rsb r to its new master node.  If any locks
 * were sent, the rsb is parked on the recover_list until the matching
 * replies arrive (see dlm_recovered_lock); if there was nothing to
 * send, NEW_MASTER can be cleared immediately.  Returns the first send
 * error, or 0.
 */
static int recover_locks(struct dlm_rsb *r)
{
	int error = 0;

	lock_rsb(r);
	if (all_queues_empty(r))
		goto out;

	/* any previous pass must have fully completed or been cleared */
	DLM_ASSERT(!r->res_recover_locks_count, dlm_print_rsb(r););

	error = recover_locks_queue(r, &r->res_grantqueue);
	if (error)
		goto out;
	error = recover_locks_queue(r, &r->res_convertqueue);
	if (error)
		goto out;
	error = recover_locks_queue(r, &r->res_waitqueue);
	if (error)
		goto out;

	if (r->res_recover_locks_count)
		recover_list_add(r);
	else
		/* nothing sent (e.g. all MSTCPY), recovery of r is done */
		rsb_clear_flag(r, RSB_NEW_MASTER);
 out:
	unlock_rsb(r);
	return error;
}
512
/*
 * For every remastered rsb (flagged NEW_MASTER) that we do not master
 * ourselves, resend our locks to the new master, then wait for all the
 * lock replies to drain the recover_list.  On success the LOCKS
 * recovery-status bit is set; on failure the recover_list is cleared.
 */
int dlm_recover_locks(struct dlm_ls *ls)
{
	struct dlm_rsb *r;
	int error, count = 0;

	log_debug(ls, "dlm_recover_locks");

	down_read(&ls->ls_root_sem);
	list_for_each_entry(r, &ls->ls_root_list, res_root_list) {
		/* we master this rsb: nothing to send, and the flag has
		   served its purpose */
		if (is_master(r)) {
			rsb_clear_flag(r, RSB_NEW_MASTER);
			continue;
		}

		if (!rsb_flag(r, RSB_NEW_MASTER))
			continue;

		/* a newer stop event makes this pass obsolete */
		if (dlm_recovery_stopped(ls)) {
			error = -EINTR;
			up_read(&ls->ls_root_sem);
			goto out;
		}

		error = recover_locks(r);
		if (error) {
			up_read(&ls->ls_root_sem);
			goto out;
		}

		count += r->res_recover_locks_count;
	}
	up_read(&ls->ls_root_sem);

	log_debug(ls, "dlm_recover_locks %d locks", count);

	/* wait for replies to every lock sent above */
	error = dlm_wait_function(ls, &recover_list_empty);
 out:
	if (error)
		recover_list_clear(ls);
	else
		dlm_set_recover_status(ls, DLM_RS_LOCKS);
	return error;
}
556
557void dlm_recovered_lock(struct dlm_rsb *r)
558{
559 DLM_ASSERT(rsb_flag(r, RSB_NEW_MASTER), dlm_print_rsb(r););
560
561 r->res_recover_locks_count--;
562 if (!r->res_recover_locks_count) {
563 rsb_clear_flag(r, RSB_NEW_MASTER);
564 recover_list_del(r);
565 }
566
567 if (recover_list_empty(r->res_ls))
568 wake_up(&r->res_ls->ls_wait_general);
569}
570
571/*
572 * The lvb needs to be recovered on all master rsb's. This includes setting
573 * the VALNOTVALID flag if necessary, and determining the correct lvb contents
574 * based on the lvb's of the locks held on the rsb.
575 *
576 * RSB_VALNOTVALID is set if there are only NL/CR locks on the rsb. If it
577 * was already set prior to recovery, it's not cleared, regardless of locks.
578 *
579 * The LVB contents are only considered for changing when this is a new master
580 * of the rsb (NEW_MASTER2). Then, the rsb's lvb is taken from any lkb with
581 * mode > CR. If no lkb's exist with mode above CR, the lvb contents are taken
582 * from the lkb with the largest lvb sequence number.
583 */
584
static void recover_lvb(struct dlm_rsb *r)
{
	struct dlm_lkb *lkb, *high_lkb = NULL;
	uint32_t high_seq = 0;
	int lock_lvb_exists = 0;	/* some granted/converting lock uses an lvb */
	int big_lock_exists = 0;	/* some lvb-using lock has grmode > CR */
	int lvblen = r->res_ls->ls_lvblen;

	list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) {
		if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
			continue;

		lock_lvb_exists = 1;

		/* a > CR lock's lvb is authoritative; note the goto leaves
		   'lkb' pointing at this entry, which the copy at the
		   bottom depends on */
		if (lkb->lkb_grmode > DLM_LOCK_CR) {
			big_lock_exists = 1;
			goto setflag;
		}

		/* signed-difference comparison tolerates lvbseq wrap */
		if (((int)lkb->lkb_lvbseq - (int)high_seq) >= 0) {
			high_lkb = lkb;
			high_seq = lkb->lkb_lvbseq;
		}
	}

	/* same scan over locks waiting to convert */
	list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue) {
		if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
			continue;

		lock_lvb_exists = 1;

		if (lkb->lkb_grmode > DLM_LOCK_CR) {
			big_lock_exists = 1;
			goto setflag;
		}

		if (((int)lkb->lkb_lvbseq - (int)high_seq) >= 0) {
			high_lkb = lkb;
			high_seq = lkb->lkb_lvbseq;
		}
	}

 setflag:
	/* no lock on the rsb uses an lvb: nothing to recover */
	if (!lock_lvb_exists)
		goto out;

	/* only NL/CR locks remain, so the lvb contents are suspect */
	if (!big_lock_exists)
		rsb_set_flag(r, RSB_VALNOTVALID);

	/* don't mess with the lvb unless we're the new master */
	if (!rsb_flag(r, RSB_NEW_MASTER2))
		goto out;

	if (!r->res_lvbptr) {
		r->res_lvbptr = allocate_lvb(r->res_ls);
		/* allocation failure: leave the lvb unrecovered */
		if (!r->res_lvbptr)
			goto out;
	}

	if (big_lock_exists) {
		/* 'lkb' is the > CR lock reached via the goto above */
		r->res_lvbseq = lkb->lkb_lvbseq;
		memcpy(r->res_lvbptr, lkb->lkb_lvbptr, lvblen);
	} else if (high_lkb) {
		/* no > CR lock: take the lvb with the highest sequence */
		r->res_lvbseq = high_lkb->lkb_lvbseq;
		memcpy(r->res_lvbptr, high_lkb->lkb_lvbptr, lvblen);
	} else {
		r->res_lvbseq = 0;
		memset(r->res_lvbptr, 0, lvblen);
	}
 out:
	return;
}
657
658/* All master rsb's flagged RECOVER_CONVERT need to be looked at. The locks
659 converting PR->CW or CW->PR need to have their lkb_grmode set. */
660
661static void recover_conversion(struct dlm_rsb *r)
662{
663 struct dlm_lkb *lkb;
664 int grmode = -1;
665
666 list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) {
667 if (lkb->lkb_grmode == DLM_LOCK_PR ||
668 lkb->lkb_grmode == DLM_LOCK_CW) {
669 grmode = lkb->lkb_grmode;
670 break;
671 }
672 }
673
674 list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue) {
675 if (lkb->lkb_grmode != DLM_LOCK_IV)
676 continue;
677 if (grmode == -1)
678 lkb->lkb_grmode = lkb->lkb_rqmode;
679 else
680 lkb->lkb_grmode = grmode;
681 }
682}
683
/*
 * Final per-rsb recovery pass: on every rsb we now master, resolve
 * in-doubt conversions (if flagged) and recover the lvb.  The
 * RECOVER_CONVERT flag is cleared on all rsb's, mastered or not.
 */
void dlm_recover_rsbs(struct dlm_ls *ls)
{
	struct dlm_rsb *r;
	int count = 0;

	log_debug(ls, "dlm_recover_rsbs");

	down_read(&ls->ls_root_sem);
	list_for_each_entry(r, &ls->ls_root_list, res_root_list) {
		lock_rsb(r);
		if (is_master(r)) {
			if (rsb_flag(r, RSB_RECOVER_CONVERT))
				recover_conversion(r);
			recover_lvb(r);
			count++;
		}
		rsb_clear_flag(r, RSB_RECOVER_CONVERT);
		unlock_rsb(r);
	}
	up_read(&ls->ls_root_sem);

	log_debug(ls, "dlm_recover_rsbs %d rsbs", count);
}
707
708/* Create a single list of all root rsb's to be used during recovery */
709
710int dlm_create_root_list(struct dlm_ls *ls)
711{
712 struct dlm_rsb *r;
713 int i, error = 0;
714
715 down_write(&ls->ls_root_sem);
716 if (!list_empty(&ls->ls_root_list)) {
717 log_error(ls, "root list not empty");
718 error = -EINVAL;
719 goto out;
720 }
721
722 for (i = 0; i < ls->ls_rsbtbl_size; i++) {
723 read_lock(&ls->ls_rsbtbl[i].lock);
724 list_for_each_entry(r, &ls->ls_rsbtbl[i].list, res_hashchain) {
725 list_add(&r->res_root_list, &ls->ls_root_list);
726 dlm_hold_rsb(r);
727 }
728 read_unlock(&ls->ls_rsbtbl[i].lock);
729 }
730 out:
731 up_write(&ls->ls_root_sem);
732 return error;
733}
734
735void dlm_release_root_list(struct dlm_ls *ls)
736{
737 struct dlm_rsb *r, *safe;
738
739 down_write(&ls->ls_root_sem);
740 list_for_each_entry_safe(r, safe, &ls->ls_root_list, res_root_list) {
741 list_del_init(&r->res_root_list);
742 dlm_put_rsb(r);
743 }
744 up_write(&ls->ls_root_sem);
745}
746
747void dlm_clear_toss_list(struct dlm_ls *ls)
748{
749 struct dlm_rsb *r, *safe;
750 int i;
751
752 for (i = 0; i < ls->ls_rsbtbl_size; i++) {
753 write_lock(&ls->ls_rsbtbl[i].lock);
754 list_for_each_entry_safe(r, safe, &ls->ls_rsbtbl[i].toss,
755 res_hashchain) {
756 list_del(&r->res_hashchain);
757 free_rsb(r);
758 }
759 write_unlock(&ls->ls_rsbtbl[i].lock);
760 }
761}
762
diff --git a/fs/dlm/recover.h b/fs/dlm/recover.h
new file mode 100644
index 000000000000..ebd0363f1e08
--- /dev/null
+++ b/fs/dlm/recover.h
@@ -0,0 +1,34 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
#ifndef __RECOVER_DOT_H__
#define __RECOVER_DOT_H__

/* generic "block until testfn becomes true" recovery helper */
int dlm_wait_function(struct dlm_ls *ls, int (*testfn) (struct dlm_ls *ls));
/* recovery-status bits shared between recovery phases */
uint32_t dlm_recover_status(struct dlm_ls *ls);
void dlm_set_recover_status(struct dlm_ls *ls, uint32_t status);
/* barriers: wait until all lockspace members reach the given phase */
int dlm_recover_members_wait(struct dlm_ls *ls);
int dlm_recover_directory_wait(struct dlm_ls *ls);
int dlm_recover_locks_wait(struct dlm_ls *ls);
int dlm_recover_done_wait(struct dlm_ls *ls);
/* remastering: find new masters for orphaned rsb's, then resend locks */
int dlm_recover_masters(struct dlm_ls *ls);
int dlm_recover_master_reply(struct dlm_ls *ls, struct dlm_rcom *rc);
int dlm_recover_locks(struct dlm_ls *ls);
void dlm_recovered_lock(struct dlm_rsb *r);
/* root list: snapshot of all rsb's used as the basis for recovery */
int dlm_create_root_list(struct dlm_ls *ls);
void dlm_release_root_list(struct dlm_ls *ls);
void dlm_clear_toss_list(struct dlm_ls *ls);
/* final pass: conversion resolution and lvb recovery on mastered rsb's */
void dlm_recover_rsbs(struct dlm_ls *ls);

#endif				/* __RECOVER_DOT_H__ */
34
diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c
new file mode 100644
index 000000000000..70103533677d
--- /dev/null
+++ b/fs/dlm/recoverd.c
@@ -0,0 +1,285 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#include "dlm_internal.h"
15#include "lockspace.h"
16#include "member.h"
17#include "dir.h"
18#include "ast.h"
19#include "recover.h"
20#include "lowcomms.h"
21#include "lock.h"
22#include "requestqueue.h"
23#include "recoverd.h"
24
25
26/* If the start for which we're re-enabling locking (seq) has been superseded
27 by a newer stop (ls_recover_seq), we need to leave locking disabled. */
28
29static int enable_locking(struct dlm_ls *ls, uint64_t seq)
30{
31 int error = -EINTR;
32
33 spin_lock(&ls->ls_recover_lock);
34 if (ls->ls_recover_seq == seq) {
35 set_bit(LSFL_RUNNING, &ls->ls_flags);
36 up_write(&ls->ls_in_recovery);
37 error = 0;
38 }
39 spin_unlock(&ls->ls_recover_lock);
40 return error;
41}
42
43static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
44{
45 unsigned long start;
46 int error, neg = 0;
47
48 log_debug(ls, "recover %llx", rv->seq);
49
50 mutex_lock(&ls->ls_recoverd_active);
51
52 /*
53 * Suspending and resuming dlm_astd ensures that no lkb's from this ls
54 * will be processed by dlm_astd during recovery.
55 */
56
57 dlm_astd_suspend();
58 dlm_astd_resume();
59
60 /*
61 * This list of root rsb's will be the basis of most of the recovery
62 * routines.
63 */
64
65 dlm_create_root_list(ls);
66
67 /*
68 * Free all the tossed rsb's so we don't have to recover them.
69 */
70
71 dlm_clear_toss_list(ls);
72
73 /*
74 * Add or remove nodes from the lockspace's ls_nodes list.
75 * Also waits for all nodes to complete dlm_recover_members.
76 */
77
78 error = dlm_recover_members(ls, rv, &neg);
79 if (error) {
80 log_error(ls, "recover_members failed %d", error);
81 goto fail;
82 }
83 start = jiffies;
84
85 /*
86 * Rebuild our own share of the directory by collecting from all other
87 * nodes their master rsb names that hash to us.
88 */
89
90 error = dlm_recover_directory(ls);
91 if (error) {
92 log_error(ls, "recover_directory failed %d", error);
93 goto fail;
94 }
95
96 /*
97 * Purge directory-related requests that are saved in requestqueue.
98 * All dir requests from before recovery are invalid now due to the dir
99 * rebuild and will be resent by the requesting nodes.
100 */
101
102 dlm_purge_requestqueue(ls);
103
104 /*
105 * Wait for all nodes to complete directory rebuild.
106 */
107
108 error = dlm_recover_directory_wait(ls);
109 if (error) {
110 log_error(ls, "recover_directory_wait failed %d", error);
111 goto fail;
112 }
113
114 /*
115 * We may have outstanding operations that are waiting for a reply from
116 * a failed node. Mark these to be resent after recovery. Unlock and
117 * cancel ops can just be completed.
118 */
119
120 dlm_recover_waiters_pre(ls);
121
122 error = dlm_recovery_stopped(ls);
123 if (error)
124 goto fail;
125
126 if (neg || dlm_no_directory(ls)) {
127 /*
128 * Clear lkb's for departed nodes.
129 */
130
131 dlm_purge_locks(ls);
132
133 /*
134 * Get new master nodeid's for rsb's that were mastered on
135 * departed nodes.
136 */
137
138 error = dlm_recover_masters(ls);
139 if (error) {
140 log_error(ls, "recover_masters failed %d", error);
141 goto fail;
142 }
143
144 /*
145 * Send our locks on remastered rsb's to the new masters.
146 */
147
148 error = dlm_recover_locks(ls);
149 if (error) {
150 log_error(ls, "recover_locks failed %d", error);
151 goto fail;
152 }
153
154 error = dlm_recover_locks_wait(ls);
155 if (error) {
156 log_error(ls, "recover_locks_wait failed %d", error);
157 goto fail;
158 }
159
160 /*
161 * Finalize state in master rsb's now that all locks can be
162 * checked. This includes conversion resolution and lvb
163 * settings.
164 */
165
166 dlm_recover_rsbs(ls);
167 }
168
169 dlm_release_root_list(ls);
170
171 dlm_set_recover_status(ls, DLM_RS_DONE);
172 error = dlm_recover_done_wait(ls);
173 if (error) {
174 log_error(ls, "recover_done_wait failed %d", error);
175 goto fail;
176 }
177
178 dlm_clear_members_gone(ls);
179
180 error = enable_locking(ls, rv->seq);
181 if (error) {
182 log_error(ls, "enable_locking failed %d", error);
183 goto fail;
184 }
185
186 error = dlm_process_requestqueue(ls);
187 if (error) {
188 log_error(ls, "process_requestqueue failed %d", error);
189 goto fail;
190 }
191
192 error = dlm_recover_waiters_post(ls);
193 if (error) {
194 log_error(ls, "recover_waiters_post failed %d", error);
195 goto fail;
196 }
197
198 dlm_grant_after_purge(ls);
199
200 dlm_astd_wake();
201
202 log_debug(ls, "recover %llx done: %u ms", rv->seq,
203 jiffies_to_msecs(jiffies - start));
204 mutex_unlock(&ls->ls_recoverd_active);
205
206 return 0;
207
208 fail:
209 dlm_release_root_list(ls);
210 log_debug(ls, "recover %llx error %d", rv->seq, error);
211 mutex_unlock(&ls->ls_recoverd_active);
212 return error;
213}
214
215static void do_ls_recovery(struct dlm_ls *ls)
216{
217 struct dlm_recover *rv = NULL;
218
219 spin_lock(&ls->ls_recover_lock);
220 rv = ls->ls_recover_args;
221 ls->ls_recover_args = NULL;
222 clear_bit(LSFL_RECOVERY_STOP, &ls->ls_flags);
223 spin_unlock(&ls->ls_recover_lock);
224
225 if (rv) {
226 ls_recover(ls, rv);
227 kfree(rv->nodeids);
228 kfree(rv);
229 }
230}
231
/*
 * Per-lockspace recovery daemon.  Sleeps until dlm_recoverd_kick()
 * sets LSFL_WORK, runs one recovery pass per kick, and exits when
 * kthread_stop() is called (dlm_recoverd_stop).
 */
static int dlm_recoverd(void *arg)
{
	struct dlm_ls *ls;

	/* pin the lockspace for the daemon's lifetime.
	   NOTE(review): return value not checked for NULL — presumably the
	   starter guarantees the lockspace exists; confirm */
	ls = dlm_find_lockspace_local(arg);

	while (!kthread_should_stop()) {
		/* set sleep state BEFORE testing the flag so a concurrent
		   kick's wake_up_process() cannot be lost */
		set_current_state(TASK_INTERRUPTIBLE);
		if (!test_bit(LSFL_WORK, &ls->ls_flags))
			schedule();
		set_current_state(TASK_RUNNING);

		if (test_and_clear_bit(LSFL_WORK, &ls->ls_flags))
			do_ls_recovery(ls);
	}

	dlm_put_lockspace(ls);
	return 0;
}
251
/* request a recovery pass: the work bit is set before the wake so the
   daemon cannot re-sleep without seeing it */
void dlm_recoverd_kick(struct dlm_ls *ls)
{
	set_bit(LSFL_WORK, &ls->ls_flags);
	wake_up_process(ls->ls_recoverd_task);
}
257
258int dlm_recoverd_start(struct dlm_ls *ls)
259{
260 struct task_struct *p;
261 int error = 0;
262
263 p = kthread_run(dlm_recoverd, ls, "dlm_recoverd");
264 if (IS_ERR(p))
265 error = PTR_ERR(p);
266 else
267 ls->ls_recoverd_task = p;
268 return error;
269}
270
/* stop the recovery kthread; blocks until it has exited */
void dlm_recoverd_stop(struct dlm_ls *ls)
{
	kthread_stop(ls->ls_recoverd_task);
}
275
/* block recovery passes: holds ls_recoverd_active, the same mutex
   ls_recover() holds for the duration of a pass */
void dlm_recoverd_suspend(struct dlm_ls *ls)
{
	mutex_lock(&ls->ls_recoverd_active);
}
280
/* allow recovery passes again; pairs with dlm_recoverd_suspend() */
void dlm_recoverd_resume(struct dlm_ls *ls)
{
	mutex_unlock(&ls->ls_recoverd_active);
}
285
diff --git a/fs/dlm/recoverd.h b/fs/dlm/recoverd.h
new file mode 100644
index 000000000000..866657c5d69d
--- /dev/null
+++ b/fs/dlm/recoverd.h
@@ -0,0 +1,24 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
#ifndef __RECOVERD_DOT_H__
#define __RECOVERD_DOT_H__

/* control interface for the per-lockspace recovery daemon (kthread) */
void dlm_recoverd_kick(struct dlm_ls *ls);	/* schedule a recovery pass */
void dlm_recoverd_stop(struct dlm_ls *ls);
int dlm_recoverd_start(struct dlm_ls *ls);
void dlm_recoverd_suspend(struct dlm_ls *ls);	/* block recovery passes */
void dlm_recoverd_resume(struct dlm_ls *ls);

#endif				/* __RECOVERD_DOT_H__ */
23#endif /* __RECOVERD_DOT_H__ */
24
diff --git a/fs/dlm/requestqueue.c b/fs/dlm/requestqueue.c
new file mode 100644
index 000000000000..7b2b089634a2
--- /dev/null
+++ b/fs/dlm/requestqueue.c
@@ -0,0 +1,184 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13#include "dlm_internal.h"
14#include "member.h"
15#include "lock.h"
16#include "dir.h"
17#include "config.h"
18#include "requestqueue.h"
19
20struct rq_entry {
21 struct list_head list;
22 int nodeid;
23 char request[1];
24};
25
26/*
27 * Requests received while the lockspace is in recovery get added to the
28 * request queue and processed when recovery is complete. This happens when
29 * the lockspace is suspended on some nodes before it is on others, or the
30 * lockspace is enabled on some while still suspended on others.
31 */
32
/*
 * Save a message received from 'nodeid' for replay after recovery.
 * Messages from nodes that have since been removed are dropped, and on
 * allocation failure the message is dropped with only a log line —
 * presumably the sender retries after recovery; confirm against the
 * send path.
 */
void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_header *hd)
{
	struct rq_entry *e;
	/* h_length covers the header plus payload, i.e. the full copy size */
	int length = hd->h_length;

	if (dlm_is_removed(ls, nodeid))
		return;

	e = kmalloc(sizeof(struct rq_entry) + length, GFP_KERNEL);
	if (!e) {
		log_print("dlm_add_requestqueue: out of memory\n");
		return;
	}

	e->nodeid = nodeid;
	memcpy(e->request, hd, length);

	mutex_lock(&ls->ls_requestqueue_mutex);
	list_add_tail(&e->list, &ls->ls_requestqueue);
	mutex_unlock(&ls->ls_requestqueue_mutex);
}
54
/*
 * Replay the messages saved while the lockspace was in recovery, in
 * arrival order.  Returns 0 when the queue drains, or -EINTR if
 * recovery was restarted mid-replay (remaining entries are left queued
 * for the next pass).
 */
int dlm_process_requestqueue(struct dlm_ls *ls)
{
	struct rq_entry *e;
	struct dlm_header *hd;
	int error = 0;

	mutex_lock(&ls->ls_requestqueue_mutex);

	for (;;) {
		if (list_empty(&ls->ls_requestqueue)) {
			mutex_unlock(&ls->ls_requestqueue_mutex);
			error = 0;
			break;
		}
		e = list_entry(ls->ls_requestqueue.next, struct rq_entry, list);
		/* drop the mutex while processing: dlm_receive_message may
		   block, and dlm_add_requestqueue may need the mutex */
		mutex_unlock(&ls->ls_requestqueue_mutex);

		hd = (struct dlm_header *) e->request;
		error = dlm_receive_message(hd, e->nodeid, 1);

		if (error == -EINTR) {
			/* entry is left on requestqueue */
			log_debug(ls, "process_requestqueue abort eintr");
			break;
		}

		mutex_lock(&ls->ls_requestqueue_mutex);
		list_del(&e->list);
		kfree(e);

		/* a new recovery event stopped locking again: abort the
		   replay; the rest of the queue waits for the next pass */
		if (dlm_locking_stopped(ls)) {
			log_debug(ls, "process_requestqueue abort running");
			mutex_unlock(&ls->ls_requestqueue_mutex);
			error = -EINTR;
			break;
		}
		schedule();
	}

	return error;
}
96
97/*
98 * After recovery is done, locking is resumed and dlm_recoverd takes all the
99 * saved requests and processes them as they would have been by dlm_recvd. At
100 * the same time, dlm_recvd will start receiving new requests from remote
101 * nodes. We want to delay dlm_recvd processing new requests until
102 * dlm_recoverd has finished processing the old saved requests.
103 */
104
/*
 * Spin (politely, via schedule) until the requestqueue is empty or
 * locking has been stopped again.  Both break paths exit the loop
 * still holding the mutex, which the final unlock releases.
 */
void dlm_wait_requestqueue(struct dlm_ls *ls)
{
	for (;;) {
		mutex_lock(&ls->ls_requestqueue_mutex);
		if (list_empty(&ls->ls_requestqueue))
			break;
		if (dlm_locking_stopped(ls))
			break;
		mutex_unlock(&ls->ls_requestqueue_mutex);
		schedule();
	}
	mutex_unlock(&ls->ls_requestqueue_mutex);
}
118
/*
 * Decide whether a saved message should be discarded rather than
 * replayed after recovery.  Returns 1 to purge, 0 to keep.
 */
static int purge_request(struct dlm_ls *ls, struct dlm_message *ms, int nodeid)
{
	uint32_t type = ms->m_type;

	/* the sender is gone; its requests are moot */
	if (dlm_is_removed(ls, nodeid))
		return 1;

	/* directory operations are always purged because the directory is
	   always rebuilt during recovery and the lookups resent */

	if (type == DLM_MSG_REMOVE ||
	    type == DLM_MSG_LOOKUP ||
	    type == DLM_MSG_LOOKUP_REPLY)
		return 1;

	/* with a directory, remastering only affects departed nodes,
	   already handled above */
	if (!dlm_no_directory(ls))
		return 0;

	/* with no directory, the master is likely to change as a part of
	   recovery; requests to/from the defunct master need to be purged */

	switch (type) {
	case DLM_MSG_REQUEST:
	case DLM_MSG_CONVERT:
	case DLM_MSG_UNLOCK:
	case DLM_MSG_CANCEL:
		/* we're no longer the master of this resource, the sender
		   will resend to the new master (see waiter_needs_recovery) */

		if (dlm_hash2nodeid(ls, ms->m_hash) != dlm_our_nodeid())
			return 1;
		break;

	case DLM_MSG_REQUEST_REPLY:
	case DLM_MSG_CONVERT_REPLY:
	case DLM_MSG_UNLOCK_REPLY:
	case DLM_MSG_CANCEL_REPLY:
	case DLM_MSG_GRANT:
		/* this reply is from the former master of the resource,
		   we'll resend to the new master if needed */

		if (dlm_hash2nodeid(ls, ms->m_hash) != nodeid)
			return 1;
		break;
	}

	return 0;
}
167
168void dlm_purge_requestqueue(struct dlm_ls *ls)
169{
170 struct dlm_message *ms;
171 struct rq_entry *e, *safe;
172
173 mutex_lock(&ls->ls_requestqueue_mutex);
174 list_for_each_entry_safe(e, safe, &ls->ls_requestqueue, list) {
175 ms = (struct dlm_message *) e->request;
176
177 if (purge_request(ls, ms, e->nodeid)) {
178 list_del(&e->list);
179 kfree(e);
180 }
181 }
182 mutex_unlock(&ls->ls_requestqueue_mutex);
183}
184
diff --git a/fs/dlm/requestqueue.h b/fs/dlm/requestqueue.h
new file mode 100644
index 000000000000..349f0d292d95
--- /dev/null
+++ b/fs/dlm/requestqueue.h
@@ -0,0 +1,22 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
#ifndef __REQUESTQUEUE_DOT_H__
#define __REQUESTQUEUE_DOT_H__

/* save a message that arrived while the lockspace was in recovery */
void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_header *hd);
/* replay saved messages once recovery completes */
int dlm_process_requestqueue(struct dlm_ls *ls);
/* wait for the saved queue to drain before handling new messages */
void dlm_wait_requestqueue(struct dlm_ls *ls);
/* drop saved messages invalidated by the recovery (see purge_request) */
void dlm_purge_requestqueue(struct dlm_ls *ls);

#endif
22
diff --git a/fs/dlm/util.c b/fs/dlm/util.c
new file mode 100644
index 000000000000..826d122edf55
--- /dev/null
+++ b/fs/dlm/util.c
@@ -0,0 +1,173 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13#include "dlm_internal.h"
14#include "rcom.h"
15#include "util.h"
16
/* Convert a dlm_header from host to little-endian wire order, in place. */
static void header_out(struct dlm_header *hd)
{
	hd->h_version = cpu_to_le32(hd->h_version);
	hd->h_lockspace = cpu_to_le32(hd->h_lockspace);
	hd->h_nodeid = cpu_to_le32(hd->h_nodeid);
	hd->h_length = cpu_to_le16(hd->h_length);
}
24
/* Convert a dlm_header from little-endian wire order to host order, in place. */
static void header_in(struct dlm_header *hd)
{
	hd->h_version = le32_to_cpu(hd->h_version);
	hd->h_lockspace = le32_to_cpu(hd->h_lockspace);
	hd->h_nodeid = le32_to_cpu(hd->h_nodeid);
	hd->h_length = le16_to_cpu(hd->h_length);
}
32
/*
 * dlm_message_out - convert a dlm_message to little-endian wire order
 * @ms: the message, converted in place (embedded header included)
 *
 * Every field is converted individually; m_range[] entries are 64-bit.
 */
void dlm_message_out(struct dlm_message *ms)
{
	struct dlm_header *hd = (struct dlm_header *) ms;

	header_out(hd);

	ms->m_type = cpu_to_le32(ms->m_type);
	ms->m_nodeid = cpu_to_le32(ms->m_nodeid);
	ms->m_pid = cpu_to_le32(ms->m_pid);
	ms->m_lkid = cpu_to_le32(ms->m_lkid);
	ms->m_remid = cpu_to_le32(ms->m_remid);
	ms->m_parent_lkid = cpu_to_le32(ms->m_parent_lkid);
	ms->m_parent_remid = cpu_to_le32(ms->m_parent_remid);
	ms->m_exflags = cpu_to_le32(ms->m_exflags);
	ms->m_sbflags = cpu_to_le32(ms->m_sbflags);
	ms->m_flags = cpu_to_le32(ms->m_flags);
	ms->m_lvbseq = cpu_to_le32(ms->m_lvbseq);
	ms->m_hash = cpu_to_le32(ms->m_hash);
	ms->m_status = cpu_to_le32(ms->m_status);
	ms->m_grmode = cpu_to_le32(ms->m_grmode);
	ms->m_rqmode = cpu_to_le32(ms->m_rqmode);
	ms->m_bastmode = cpu_to_le32(ms->m_bastmode);
	ms->m_asts = cpu_to_le32(ms->m_asts);
	ms->m_result = cpu_to_le32(ms->m_result);
	ms->m_range[0] = cpu_to_le64(ms->m_range[0]);
	ms->m_range[1] = cpu_to_le64(ms->m_range[1]);
}
60
/*
 * dlm_message_in - convert a dlm_message from wire order to host order
 * @ms: the message, converted in place (embedded header included)
 *
 * Exact inverse of dlm_message_out(); the two field lists must match.
 */
void dlm_message_in(struct dlm_message *ms)
{
	struct dlm_header *hd = (struct dlm_header *) ms;

	header_in(hd);

	ms->m_type = le32_to_cpu(ms->m_type);
	ms->m_nodeid = le32_to_cpu(ms->m_nodeid);
	ms->m_pid = le32_to_cpu(ms->m_pid);
	ms->m_lkid = le32_to_cpu(ms->m_lkid);
	ms->m_remid = le32_to_cpu(ms->m_remid);
	ms->m_parent_lkid = le32_to_cpu(ms->m_parent_lkid);
	ms->m_parent_remid = le32_to_cpu(ms->m_parent_remid);
	ms->m_exflags = le32_to_cpu(ms->m_exflags);
	ms->m_sbflags = le32_to_cpu(ms->m_sbflags);
	ms->m_flags = le32_to_cpu(ms->m_flags);
	ms->m_lvbseq = le32_to_cpu(ms->m_lvbseq);
	ms->m_hash = le32_to_cpu(ms->m_hash);
	ms->m_status = le32_to_cpu(ms->m_status);
	ms->m_grmode = le32_to_cpu(ms->m_grmode);
	ms->m_rqmode = le32_to_cpu(ms->m_rqmode);
	ms->m_bastmode = le32_to_cpu(ms->m_bastmode);
	ms->m_asts = le32_to_cpu(ms->m_asts);
	ms->m_result = le32_to_cpu(ms->m_result);
	ms->m_range[0] = le64_to_cpu(ms->m_range[0]);
	ms->m_range[1] = le64_to_cpu(ms->m_range[1]);
}
88
/* Convert an rcom_lock payload to little-endian wire order, in place. */
static void rcom_lock_out(struct rcom_lock *rl)
{
	rl->rl_ownpid = cpu_to_le32(rl->rl_ownpid);
	rl->rl_lkid = cpu_to_le32(rl->rl_lkid);
	rl->rl_remid = cpu_to_le32(rl->rl_remid);
	rl->rl_parent_lkid = cpu_to_le32(rl->rl_parent_lkid);
	rl->rl_parent_remid = cpu_to_le32(rl->rl_parent_remid);
	rl->rl_exflags = cpu_to_le32(rl->rl_exflags);
	rl->rl_flags = cpu_to_le32(rl->rl_flags);
	rl->rl_lvbseq = cpu_to_le32(rl->rl_lvbseq);
	rl->rl_result = cpu_to_le32(rl->rl_result);
	rl->rl_wait_type = cpu_to_le16(rl->rl_wait_type);
	rl->rl_namelen = cpu_to_le16(rl->rl_namelen);
	rl->rl_range[0] = cpu_to_le64(rl->rl_range[0]);
	rl->rl_range[1] = cpu_to_le64(rl->rl_range[1]);
	rl->rl_range[2] = cpu_to_le64(rl->rl_range[2]);
	rl->rl_range[3] = cpu_to_le64(rl->rl_range[3]);
}
107
/* Convert an rcom_lock payload from wire order to host order, in place. */
static void rcom_lock_in(struct rcom_lock *rl)
{
	rl->rl_ownpid = le32_to_cpu(rl->rl_ownpid);
	rl->rl_lkid = le32_to_cpu(rl->rl_lkid);
	rl->rl_remid = le32_to_cpu(rl->rl_remid);
	rl->rl_parent_lkid = le32_to_cpu(rl->rl_parent_lkid);
	rl->rl_parent_remid = le32_to_cpu(rl->rl_parent_remid);
	rl->rl_exflags = le32_to_cpu(rl->rl_exflags);
	rl->rl_flags = le32_to_cpu(rl->rl_flags);
	rl->rl_lvbseq = le32_to_cpu(rl->rl_lvbseq);
	rl->rl_result = le32_to_cpu(rl->rl_result);
	rl->rl_wait_type = le16_to_cpu(rl->rl_wait_type);
	rl->rl_namelen = le16_to_cpu(rl->rl_namelen);
	rl->rl_range[0] = le64_to_cpu(rl->rl_range[0]);
	rl->rl_range[1] = le64_to_cpu(rl->rl_range[1]);
	rl->rl_range[2] = le64_to_cpu(rl->rl_range[2]);
	rl->rl_range[3] = le64_to_cpu(rl->rl_range[3]);
}
126
/* Convert an rcom_config payload to little-endian wire order, in place. */
static void rcom_config_out(struct rcom_config *rf)
{
	rf->rf_lvblen = cpu_to_le32(rf->rf_lvblen);
	rf->rf_lsflags = cpu_to_le32(rf->rf_lsflags);
}
132
/* Convert an rcom_config payload from wire order to host order, in place. */
static void rcom_config_in(struct rcom_config *rf)
{
	rf->rf_lvblen = le32_to_cpu(rf->rf_lvblen);
	rf->rf_lsflags = le32_to_cpu(rf->rf_lsflags);
}
138
/*
 * dlm_rcom_out - convert a recovery message to little-endian wire order
 * @rc: the rcom, converted in place; rc_buf is converted per message type
 */
void dlm_rcom_out(struct dlm_rcom *rc)
{
	struct dlm_header *hd = (struct dlm_header *) rc;
	/* save rc_type in host order before it is byte-swapped below,
	   so the trailing buffer can still be dispatched on it */
	int type = rc->rc_type;

	header_out(hd);

	rc->rc_type = cpu_to_le32(rc->rc_type);
	rc->rc_result = cpu_to_le32(rc->rc_result);
	rc->rc_id = cpu_to_le64(rc->rc_id);

	if (type == DLM_RCOM_LOCK)
		rcom_lock_out((struct rcom_lock *) rc->rc_buf);

	else if (type == DLM_RCOM_STATUS_REPLY)
		rcom_config_out((struct rcom_config *) rc->rc_buf);
}
156
/*
 * dlm_rcom_in - convert a recovery message from wire order to host order
 * @rc: the rcom, converted in place; rc_buf is converted per message type
 *
 * Unlike dlm_rcom_out(), rc_type can be used directly after conversion
 * because it is already in host order by the time it is tested.
 */
void dlm_rcom_in(struct dlm_rcom *rc)
{
	struct dlm_header *hd = (struct dlm_header *) rc;

	header_in(hd);

	rc->rc_type = le32_to_cpu(rc->rc_type);
	rc->rc_result = le32_to_cpu(rc->rc_result);
	rc->rc_id = le64_to_cpu(rc->rc_id);

	if (rc->rc_type == DLM_RCOM_LOCK)
		rcom_lock_in((struct rcom_lock *) rc->rc_buf);

	else if (rc->rc_type == DLM_RCOM_STATUS_REPLY)
		rcom_config_in((struct rcom_config *) rc->rc_buf);
}
173
diff --git a/fs/dlm/util.h b/fs/dlm/util.h
new file mode 100644
index 000000000000..2b25915161c0
--- /dev/null
+++ b/fs/dlm/util.h
@@ -0,0 +1,22 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13#ifndef __UTIL_DOT_H__
14#define __UTIL_DOT_H__
15
16void dlm_message_out(struct dlm_message *ms);
17void dlm_message_in(struct dlm_message *ms);
18void dlm_rcom_out(struct dlm_rcom *rc);
19void dlm_rcom_in(struct dlm_rcom *rc);
20
21#endif
22
diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig
new file mode 100644
index 000000000000..17cb44bea1c0
--- /dev/null
+++ b/fs/gfs2/Kconfig
@@ -0,0 +1,46 @@
1config GFS2_FS
2 tristate "GFS2 file system support"
3 default m
4 depends on EXPERIMENTAL
5 select FS_POSIX_ACL
6 select SYSFS
7 help
8 A cluster filesystem.
9
10 Allows a cluster of computers to simultaneously use a block device
11 that is shared between them (with FC, iSCSI, NBD, etc...). GFS reads
12 and writes to the block device like a local filesystem, but also uses
13 a lock module to allow the computers coordinate their I/O so
14 filesystem consistency is maintained. One of the nifty features of
15 GFS is perfect consistency -- changes made to the filesystem on one
16 machine show up immediately on all other machines in the cluster.
17
18 To use the GFS2 filesystem, you will need to enable one or more of
19 the below locking modules. Documentation and utilities for GFS2 can
20 be found here: http://sources.redhat.com/cluster/gfs/
21
22config GFS2_FS_LOCKING_NOLOCK
23 tristate "GFS2 \"nolock\" locking module"
24 depends on GFS2_FS
25 help
26 Single node locking module for GFS2.
27
28 Use this module if you want to use GFS2 on a single node without
29 its clustering features. You can still take advantage of the
30 large file support, and upgrade to running a full cluster later on
31 if required.
32
33 If you will only be using GFS2 in cluster mode, you do not need this
34 module.
35
36config GFS2_FS_LOCKING_DLM
37 tristate "GFS2 DLM locking module"
38 depends on GFS2_FS
39 select DLM
40 help
41 Multiple node locking module for GFS2
42
43 Most users of GFS2 will require this module. It provides the locking
44 interface between GFS2 and the DLM, which is required to use GFS2
45 in a cluster environment.
46
diff --git a/fs/gfs2/Makefile b/fs/gfs2/Makefile
new file mode 100644
index 000000000000..88f927948113
--- /dev/null
+++ b/fs/gfs2/Makefile
@@ -0,0 +1,42 @@
1obj-$(CONFIG_GFS2_FS) += gfs2.o
2gfs2-y := \
3 acl.o \
4 bits.o \
5 bmap.o \
6 daemon.o \
7 dir.o \
8 eaops.o \
9 eattr.o \
10 glock.o \
11 glops.o \
12 inode.o \
13 lm.o \
14 log.o \
15 lops.o \
16 locking.o \
17 lvb.o \
18 main.o \
19 meta_io.o \
20 mount.o \
21 ondisk.o \
22 ops_address.o \
23 ops_dentry.o \
24 ops_export.o \
25 ops_file.o \
26 ops_fstype.o \
27 ops_inode.o \
28 ops_super.o \
29 ops_vm.o \
30 page.o \
31 quota.o \
32 recovery.o \
33 rgrp.o \
34 super.o \
35 sys.o \
36 trans.o \
37 unlinked.o \
38 util.o
39
40obj-$(CONFIG_GFS2_FS_LOCKING_NOLOCK) += locking/nolock/
41obj-$(CONFIG_GFS2_FS_LOCKING_DLM) += locking/dlm/
42
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
new file mode 100644
index 000000000000..9482a677ea47
--- /dev/null
+++ b/fs/gfs2/acl.c
@@ -0,0 +1,312 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/posix_acl.h>
16#include <linux/posix_acl_xattr.h>
17#include <asm/semaphore.h>
18
19#include "gfs2.h"
20#include "acl.h"
21#include "eaops.h"
22#include "eattr.h"
23#include "glock.h"
24#include "inode.h"
25#include "meta_io.h"
26#include "trans.h"
27
28#define ACL_ACCESS 1
29#define ACL_DEFAULT 0
30
/**
 * gfs2_acl_validate_set - validate a request to set a POSIX ACL xattr
 * @ip: the inode the ACL would be attached to
 * @access: 1 for the access ACL, 0 for the default ACL
 * @er: the extended attribute request carrying the raw xattr data
 * @remove: set to 1 if the request should remove the ACL instead of set it
 * @mode: for an access ACL, updated with the equivalent mode bits when
 *        the ACL collapses to plain mode bits
 *
 * Returns: errno
 */
int gfs2_acl_validate_set(struct gfs2_inode *ip, int access,
			  struct gfs2_ea_request *er,
			  int *remove, mode_t *mode)
{
	struct posix_acl *acl;
	int error;

	error = gfs2_acl_validate_remove(ip, access);
	if (error)
		return error;

	if (!er->er_data)
		return -EINVAL;

	acl = posix_acl_from_xattr(er->er_data, er->er_data_len);
	if (IS_ERR(acl))
		return PTR_ERR(acl);
	/* an empty ACL means the caller wants the xattr removed */
	if (!acl) {
		*remove = 1;
		return 0;
	}

	error = posix_acl_valid(acl);
	if (error)
		goto out;

	if (access) {
		/* posix_acl_equiv_mode() returns 0 when the ACL is fully
		   representable as mode bits; then the xattr is redundant
		   and can be removed */
		error = posix_acl_equiv_mode(acl, mode);
		if (!error)
			*remove = 1;
		else if (error > 0)
			error = 0;
	}

 out:
	posix_acl_release(acl);

	return error;
}
70
/**
 * gfs2_acl_validate_remove - check that a POSIX ACL may be removed/changed
 * @ip: the inode holding the ACL
 * @access: 1 for the access ACL, 0 for the default ACL
 *
 * Returns: 0 if allowed; -EOPNOTSUPP if ACLs are disabled by mount option
 * or the inode is a symlink; -EPERM if the caller is not the owner and
 * lacks CAP_FOWNER; -EACCES for a default ACL on a non-directory.
 */
int gfs2_acl_validate_remove(struct gfs2_inode *ip, int access)
{
	if (!ip->i_sbd->sd_args.ar_posix_acl)
		return -EOPNOTSUPP;
	if (current->fsuid != ip->i_di.di_uid && !capable(CAP_FOWNER))
		return -EPERM;
	if (S_ISLNK(ip->i_di.di_mode))
		return -EOPNOTSUPP;
	if (!access && !S_ISDIR(ip->i_di.di_mode))
		return -EACCES;

	return 0;
}
84
/**
 * acl_get - read a POSIX ACL out of an inode's extended attributes
 * @ip: the inode
 * @access: 1 for the access ACL, 0 for the default ACL
 * @acl: if non-NULL, receives the parsed posix_acl (caller must release)
 * @el: if non-NULL, receives the EA location; caller must brelse el->el_bh
 *      on success (on failure, or when @el is NULL, it is released here)
 * @data: if non-NULL, receives the raw xattr buffer (caller must kfree)
 * @len: receives the length of @data
 *
 * Returns: errno; 0 with no outputs set if the inode has no such EA
 */
static int acl_get(struct gfs2_inode *ip, int access, struct posix_acl **acl,
		   struct gfs2_ea_location *el, char **data, unsigned int *len)
{
	struct gfs2_ea_request er;
	struct gfs2_ea_location el_this;
	int error;

	if (!ip->i_di.di_eattr)
		return 0;

	memset(&er, 0, sizeof(struct gfs2_ea_request));
	if (access) {
		er.er_name = GFS2_POSIX_ACL_ACCESS;
		er.er_name_len = GFS2_POSIX_ACL_ACCESS_LEN;
	} else {
		er.er_name = GFS2_POSIX_ACL_DEFAULT;
		er.er_name_len = GFS2_POSIX_ACL_DEFAULT_LEN;
	}
	er.er_type = GFS2_EATYPE_SYS;

	/* use a local EA location when the caller doesn't want one back */
	if (!el)
		el = &el_this;

	error = gfs2_ea_find(ip, &er, el);
	if (error)
		return error;
	if (!el->el_ea)
		return 0;
	if (!GFS2_EA_DATA_LEN(el->el_ea))
		goto out;

	er.er_data_len = GFS2_EA_DATA_LEN(el->el_ea);
	er.er_data = kmalloc(er.er_data_len, GFP_KERNEL);
	error = -ENOMEM;
	if (!er.er_data)
		goto out;

	error = gfs2_ea_get_copy(ip, el, er.er_data);
	if (error)
		goto out_kfree;

	if (acl) {
		*acl = posix_acl_from_xattr(er.er_data, er.er_data_len);
		if (IS_ERR(*acl))
			error = PTR_ERR(*acl);
	}

 out_kfree:
	/* hand the raw buffer to the caller only on success */
	if (error || !data)
		kfree(er.er_data);
	else {
		*data = er.er_data;
		*len = er.er_data_len;
	}

 out:
	/* keep el->el_bh alive only for a caller-supplied @el on success */
	if (error || el == &el_this)
		brelse(el->el_bh);

	return error;
}
146
147/**
148 * gfs2_check_acl_locked - Check an ACL to see if we're allowed to do something
149 * @inode: the file we want to do something to
150 * @mask: what we want to do
151 *
152 * Returns: errno
153 */
154
int gfs2_check_acl_locked(struct inode *inode, int mask)
{
	struct posix_acl *acl = NULL;
	int error;

	error = acl_get(get_v2ip(inode), ACL_ACCESS, &acl, NULL, NULL, NULL);
	if (error)
		return error;

	if (acl) {
		error = posix_acl_permission(inode, acl, mask);
		posix_acl_release(acl);
		return error;
	}

	/* no access ACL present: tell the caller to fall back to the
	   standard mode-bit permission check */
	return -EAGAIN;
}
172
173int gfs2_check_acl(struct inode *inode, int mask)
174{
175 struct gfs2_inode *ip = get_v2ip(inode);
176 struct gfs2_holder i_gh;
177 int error;
178
179 error = gfs2_glock_nq_init(ip->i_gl,
180 LM_ST_SHARED, LM_FLAG_ANY,
181 &i_gh);
182 if (!error) {
183 error = gfs2_check_acl_locked(inode, mask);
184 gfs2_glock_dq_uninit(&i_gh);
185 }
186
187 return error;
188}
189
190static int munge_mode(struct gfs2_inode *ip, mode_t mode)
191{
192 struct gfs2_sbd *sdp = ip->i_sbd;
193 struct buffer_head *dibh;
194 int error;
195
196 error = gfs2_trans_begin(sdp, RES_DINODE, 0);
197 if (error)
198 return error;
199
200 error = gfs2_meta_inode_buffer(ip, &dibh);
201 if (!error) {
202 gfs2_assert_withdraw(sdp,
203 (ip->i_di.di_mode & S_IFMT) == (mode & S_IFMT));
204 ip->i_di.di_mode = mode;
205 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
206 gfs2_dinode_out(&ip->i_di, dibh->b_data);
207 brelse(dibh);
208 }
209
210 gfs2_trans_end(sdp);
211
212 return 0;
213}
214
/**
 * gfs2_acl_create - set up the initial ACLs for a newly created inode
 * @dip: the parent directory (source of the default ACL)
 * @ip: the new inode
 *
 * If the parent has no default ACL, the new inode's mode is simply
 * masked by the process umask. Otherwise the default ACL is inherited:
 * directories get a copy as their own default ACL, and the access ACL
 * (or plain mode bits, when the ACL collapses to them) is derived via
 * posix_acl_create_masq().
 *
 * Returns: errno
 */
int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip)
{
	struct gfs2_sbd *sdp = dip->i_sbd;
	struct posix_acl *acl = NULL, *clone;
	struct gfs2_ea_request er;
	mode_t mode = ip->i_di.di_mode;
	int error;

	if (!sdp->sd_args.ar_posix_acl)
		return 0;
	if (S_ISLNK(ip->i_di.di_mode))
		return 0;

	memset(&er, 0, sizeof(struct gfs2_ea_request));
	er.er_type = GFS2_EATYPE_SYS;

	/* fetch the parent's default ACL plus its raw xattr image,
	   which is reused below when writing the child's ACLs */
	error = acl_get(dip, ACL_DEFAULT, &acl, NULL,
			&er.er_data, &er.er_data_len);
	if (error)
		return error;
	if (!acl) {
		/* no default ACL: classic umask handling */
		mode &= ~current->fs->umask;
		if (mode != ip->i_di.di_mode)
			error = munge_mode(ip, mode);
		return error;
	}

	/* work on a private copy; posix_acl_create_masq() mutates it */
	clone = posix_acl_clone(acl, GFP_KERNEL);
	error = -ENOMEM;
	if (!clone)
		goto out;
	posix_acl_release(acl);
	acl = clone;

	if (S_ISDIR(ip->i_di.di_mode)) {
		er.er_name = GFS2_POSIX_ACL_DEFAULT;
		er.er_name_len = GFS2_POSIX_ACL_DEFAULT_LEN;
		error = gfs2_system_eaops.eo_set(ip, &er);
		if (error)
			goto out;
	}

	error = posix_acl_create_masq(acl, &mode);
	if (error < 0)
		goto out;
	if (error > 0) {
		/* masqed ACL still has named entries: store it as the
		   access ACL together with the updated mode */
		er.er_name = GFS2_POSIX_ACL_ACCESS;
		er.er_name_len = GFS2_POSIX_ACL_ACCESS_LEN;
		posix_acl_to_xattr(acl, er.er_data, er.er_data_len);
		er.er_mode = mode;
		er.er_flags = GFS2_ERF_MODE;
		error = gfs2_system_eaops.eo_set(ip, &er);
		if (error)
			goto out;
	} else
		munge_mode(ip, mode);

 out:
	posix_acl_release(acl);
	kfree(er.er_data);
	return error;
}
277
/**
 * gfs2_acl_chmod - propagate a chmod into the inode's access ACL
 * @ip: the inode being chmodded
 * @attr: the attributes; ia_mode holds the new mode
 *
 * With no access ACL present this is a plain attribute update.
 * Otherwise the ACL is rewritten via posix_acl_chmod_masq() and stored
 * back in place through gfs2_ea_acl_chmod().
 *
 * Returns: errno
 */
int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr)
{
	struct posix_acl *acl = NULL, *clone;
	struct gfs2_ea_location el;
	char *data;
	unsigned int len;
	int error;

	error = acl_get(ip, ACL_ACCESS, &acl, &el, &data, &len);
	if (error)
		return error;
	if (!acl)
		return gfs2_setattr_simple(ip, attr);

	/* work on a private copy; posix_acl_chmod_masq() mutates it */
	clone = posix_acl_clone(acl, GFP_KERNEL);
	error = -ENOMEM;
	if (!clone)
		goto out;
	posix_acl_release(acl);
	acl = clone;

	error = posix_acl_chmod_masq(acl, attr->ia_mode);
	if (!error) {
		/* serialize back into the same-sized buffer we read */
		posix_acl_to_xattr(acl, data, len);
		error = gfs2_ea_acl_chmod(ip, &el, attr, data);
	}

 out:
	posix_acl_release(acl);
	brelse(el.el_bh);
	kfree(data);

	return error;
}
312
diff --git a/fs/gfs2/acl.h b/fs/gfs2/acl.h
new file mode 100644
index 000000000000..a174b4f6bcc2
--- /dev/null
+++ b/fs/gfs2/acl.h
@@ -0,0 +1,37 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __ACL_DOT_H__
11#define __ACL_DOT_H__
12
13#define GFS2_POSIX_ACL_ACCESS "posix_acl_access"
14#define GFS2_POSIX_ACL_ACCESS_LEN 16
15#define GFS2_POSIX_ACL_DEFAULT "posix_acl_default"
16#define GFS2_POSIX_ACL_DEFAULT_LEN 17
17
18#define GFS2_ACL_IS_ACCESS(name, len) \
19 ((len) == GFS2_POSIX_ACL_ACCESS_LEN && \
20 !memcmp(GFS2_POSIX_ACL_ACCESS, (name), (len)))
21
22#define GFS2_ACL_IS_DEFAULT(name, len) \
23 ((len) == GFS2_POSIX_ACL_DEFAULT_LEN && \
24 !memcmp(GFS2_POSIX_ACL_DEFAULT, (name), (len)))
25
26struct gfs2_ea_request;
27
28int gfs2_acl_validate_set(struct gfs2_inode *ip, int access,
29 struct gfs2_ea_request *er,
30 int *remove, mode_t *mode);
31int gfs2_acl_validate_remove(struct gfs2_inode *ip, int access);
32int gfs2_check_acl_locked(struct inode *inode, int mask);
33int gfs2_check_acl(struct inode *inode, int mask);
34int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip);
35int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr);
36
37#endif /* __ACL_DOT_H__ */
diff --git a/fs/gfs2/bits.c b/fs/gfs2/bits.c
new file mode 100644
index 000000000000..57d420a86adf
--- /dev/null
+++ b/fs/gfs2/bits.c
@@ -0,0 +1,178 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10/*
11 * These routines are used by the resource group routines (rgrp.c)
12 * to keep track of block allocation. Each block is represented by two
13 * bits. One bit indicates whether or not the block is used. (1=used,
14 * 0=free) The other bit indicates whether or not the block contains a
15 * dinode or not. (1=dinode, 0=not-dinode) So, each byte represents
16 * GFS2_NBBY (i.e. 4) blocks.
17 */
18
19#include <linux/sched.h>
20#include <linux/slab.h>
21#include <linux/spinlock.h>
22#include <linux/completion.h>
23#include <linux/buffer_head.h>
24#include <asm/semaphore.h>
25
26#include "gfs2.h"
27#include "bits.h"
28
29static const char valid_change[16] = {
30 /* current */
31 /* n */ 0, 1, 0, 1,
32 /* e */ 1, 0, 0, 0,
33 /* w */ 0, 0, 0, 0,
34 1, 0, 0, 0
35};
36
37/**
38 * gfs2_setbit - Set a bit in the bitmaps
39 * @buffer: the buffer that holds the bitmaps
40 * @buflen: the length (in bytes) of the buffer
41 * @block: the block to set
42 * @new_state: the new state of the block
43 *
44 */
45
void gfs2_setbit(struct gfs2_rgrpd *rgd, unsigned char *buffer,
		 unsigned int buflen, uint32_t block, unsigned char new_state)
{
	unsigned char *byte, *end, cur_state;
	unsigned int bit;

	/* four 2-bit states per byte: locate the byte and bit offset */
	byte = buffer + (block / GFS2_NBBY);
	bit = (block % GFS2_NBBY) * GFS2_BIT_SIZE;
	end = buffer + buflen;

	gfs2_assert(rgd->rd_sbd, byte < end);

	cur_state = (*byte >> bit) & GFS2_BIT_MASK;

	/* valid_change[] encodes which state transitions are legal;
	   an illegal transition marks the rgrp as inconsistent */
	if (valid_change[new_state * 4 + cur_state]) {
		/* clear the old pair with XOR, then OR in the new one */
		*byte ^= cur_state << bit;
		*byte |= new_state << bit;
	} else
		gfs2_consist_rgrpd(rgd);
}
66
67/**
68 * gfs2_testbit - test a bit in the bitmaps
69 * @buffer: the buffer that holds the bitmaps
70 * @buflen: the length (in bytes) of the buffer
71 * @block: the block to read
72 *
73 */
74
75unsigned char gfs2_testbit(struct gfs2_rgrpd *rgd, unsigned char *buffer,
76 unsigned int buflen, uint32_t block)
77{
78 unsigned char *byte, *end, cur_state;
79 unsigned int bit;
80
81 byte = buffer + (block / GFS2_NBBY);
82 bit = (block % GFS2_NBBY) * GFS2_BIT_SIZE;
83 end = buffer + buflen;
84
85 gfs2_assert(rgd->rd_sbd, byte < end);
86
87 cur_state = (*byte >> bit) & GFS2_BIT_MASK;
88
89 return cur_state;
90}
91
92/**
93 * gfs2_bitfit - Search an rgrp's bitmap buffer to find a bit-pair representing
94 * a block in a given allocation state.
95 * @buffer: the buffer that holds the bitmaps
96 * @buflen: the length (in bytes) of the buffer
97 * @goal: start search at this block's bit-pair (within @buffer)
98 * @old_state: GFS2_BLKST_XXX the state of the block we're looking for;
99 * bit 0 = alloc(1)/free(0), bit 1 = meta(1)/data(0)
100 *
101 * Scope of @goal and returned block number is only within this bitmap buffer,
102 * not entire rgrp or filesystem. @buffer will be offset from the actual
103 * beginning of a bitmap block buffer, skipping any header structures.
104 *
105 * Return: the block number (bitmap buffer scope) that was found
106 */
107
uint32_t gfs2_bitfit(struct gfs2_rgrpd *rgd, unsigned char *buffer,
		     unsigned int buflen, uint32_t goal,
		     unsigned char old_state)
{
	unsigned char *byte, *end, alloc;
	uint32_t blk = goal;
	unsigned int bit;

	byte = buffer + (goal / GFS2_NBBY);
	bit = (goal % GFS2_NBBY) * GFS2_BIT_SIZE;
	end = buffer + buflen;
	/* 0x55 selects the low ("allocated") bit of every 2-bit pair.
	   If we search for an allocated state, we can skip bytes where
	   no pair has that bit set (mask result 0); if we search for a
	   free state, skip bytes where every pair has it set (0x55). */
	alloc = (old_state & 1) ? 0 : 0x55;

	while (byte < end) {
		if ((*byte & 0x55) == alloc) {
			/* whole byte can't match: jump past its
			   remaining (8 - bit) / 2 block pairs */
			blk += (8 - bit) >> 1;

			bit = 0;
			byte++;

			continue;
		}

		if (((*byte >> bit) & GFS2_BIT_MASK) == old_state)
			return blk;

		bit += GFS2_BIT_SIZE;
		if (bit >= 8) {
			bit = 0;
			byte++;
		}

		blk++;
	}

	return BFITNOENT;
}
145
146/**
147 * gfs2_bitcount - count the number of bits in a certain state
148 * @buffer: the buffer that holds the bitmaps
149 * @buflen: the length (in bytes) of the buffer
150 * @state: the state of the block we're looking for
151 *
152 * Returns: The number of bits
153 */
154
/**
 * gfs2_bitcount - count the number of blocks in a certain state
 * @rgd: the resource group (unused here, kept for interface symmetry)
 * @buffer: the buffer that holds the bitmaps
 * @buflen: the length (in bytes) of the buffer
 * @state: the 2-bit GFS2_BLKST_XXX state to count (assumed <= 3)
 *
 * Each byte packs four 2-bit block states; every pair in every byte
 * is compared against @state.
 *
 * Returns: the number of matching blocks
 */
uint32_t gfs2_bitcount(struct gfs2_rgrpd *rgd, unsigned char *buffer,
		       unsigned int buflen, unsigned char state)
{
	uint32_t total = 0;
	unsigned int i, pair;

	for (i = 0; i < buflen; i++)
		for (pair = 0; pair < 4; pair++)
			if (((buffer[i] >> (pair * 2)) & 0x03) == state)
				total++;

	return total;
}
178
diff --git a/fs/gfs2/bits.h b/fs/gfs2/bits.h
new file mode 100644
index 000000000000..36ccbdcb1eef
--- /dev/null
+++ b/fs/gfs2/bits.h
@@ -0,0 +1,28 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __BITS_DOT_H__
11#define __BITS_DOT_H__
12
13#define BFITNOENT 0xFFFFFFFF
14
15void gfs2_setbit(struct gfs2_rgrpd *rgd,
16 unsigned char *buffer, unsigned int buflen,
17 uint32_t block, unsigned char new_state);
18unsigned char gfs2_testbit(struct gfs2_rgrpd *rgd,
19 unsigned char *buffer, unsigned int buflen,
20 uint32_t block);
21uint32_t gfs2_bitfit(struct gfs2_rgrpd *rgd,
22 unsigned char *buffer, unsigned int buflen,
23 uint32_t goal, unsigned char old_state);
24uint32_t gfs2_bitcount(struct gfs2_rgrpd *rgd,
25 unsigned char *buffer, unsigned int buflen,
26 unsigned char state);
27
28#endif /* __BITS_DOT_H__ */
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
new file mode 100644
index 000000000000..4efcd8a39e98
--- /dev/null
+++ b/fs/gfs2/bmap.c
@@ -0,0 +1,1089 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <asm/semaphore.h>
16
17#include "gfs2.h"
18#include "bmap.h"
19#include "glock.h"
20#include "inode.h"
21#include "meta_io.h"
22#include "page.h"
23#include "quota.h"
24#include "rgrp.h"
25#include "trans.h"
26#include "dir.h"
27
28/* This doesn't need to be that large as max 64 bit pointers in a 4k
29 * block is 512, so __u16 is fine for that. It saves stack space to
30 * keep it small.
31 */
32struct metapath {
33 __u16 mp_list[GFS2_MAX_META_HEIGHT];
34};
35
36typedef int (*block_call_t) (struct gfs2_inode *ip, struct buffer_head *dibh,
37 struct buffer_head *bh, uint64_t *top,
38 uint64_t *bottom, unsigned int height,
39 void *data);
40
41struct strip_mine {
42 int sm_first;
43 unsigned int sm_height;
44};
45
46/**
47 * @gfs2_unstuffer_sync - Synchronously unstuff a dinode
48 * @ip:
49 * @dibh:
50 * @block:
51 * @private:
52 *
53 * Cheat and use a metadata buffer instead of a data page.
54 *
55 * Returns: errno
56 */
57
int gfs2_unstuffer_sync(struct gfs2_inode *ip, struct buffer_head *dibh,
			uint64_t block, void *private)
{
	struct buffer_head *bh;
	int error;

	/* allocate a fresh metadata buffer for @block and copy the data
	   that lived after the dinode header into it */
	bh = gfs2_meta_new(ip->i_gl, block);

	gfs2_buffer_copy_tail(bh, 0, dibh, sizeof(struct gfs2_dinode));

	/* write synchronously: this path bypasses the journal */
	set_buffer_dirty(bh);
	error = sync_dirty_buffer(bh);

	brelse(bh);

	return error;
}
75
76/**
77 * gfs2_unstuff_dinode - Unstuff a dinode when the data has grown too big
78 * @ip: The GFS2 inode to unstuff
79 * @unstuffer: the routine that handles unstuffing a non-zero length file
80 * @private: private data for the unstuffer
81 *
82 * This routine unstuffs a dinode and returns it to a "normal" state such
83 * that the height can be grown in the traditional way.
84 *
85 * Returns: errno
86 */
87
int gfs2_unstuff_dinode(struct gfs2_inode *ip, gfs2_unstuffer_t unstuffer,
			void *private)
{
	struct buffer_head *bh, *dibh;
	uint64_t block = 0;
	int isdir = gfs2_is_dir(ip);
	int error;

	down_write(&ip->i_rw_mutex);

	error = gfs2_meta_inode_buffer(ip, &dibh);
	if (error)
		goto out;

	if (ip->i_di.di_size) {
		/* Get a free block, fill it with the stuffed data,
		   and write it out to disk */

		if (isdir) {
			/* directory data is metadata: copy past the
			   meta header of the new block */
			block = gfs2_alloc_meta(ip);

			error = gfs2_dir_get_buffer(ip, block, 1, &bh);
			if (error)
				goto out_brelse;
			gfs2_buffer_copy_tail(bh,
					      sizeof(struct gfs2_meta_header),
					      dibh, sizeof(struct gfs2_dinode));
			brelse(bh);
		} else {
			/* regular file data goes out via the caller's
			   unstuffer callback */
			block = gfs2_alloc_data(ip);

			error = unstuffer(ip, dibh, block, private);
			if (error)
				goto out_brelse;
		}
	}

	/* Set up the pointer to the new block */

	gfs2_trans_add_bh(ip->i_gl, dibh, 1);

	/* wipe the stuffed data from the dinode itself */
	gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));

	if (ip->i_di.di_size) {
		/* first pointer slot after the dinode header points
		   at the newly allocated block */
		*(uint64_t *)(dibh->b_data + sizeof(struct gfs2_dinode)) = cpu_to_be64(block);
		ip->i_di.di_blocks++;
	}

	ip->i_di.di_height = 1;

	gfs2_dinode_out(&ip->i_di, dibh->b_data);

 out_brelse:
	brelse(dibh);

 out:
	up_write(&ip->i_rw_mutex);

	return error;
}
148
149/**
150 * calc_tree_height - Calculate the height of a metadata tree
151 * @ip: The GFS2 inode
152 * @size: The proposed size of the file
153 *
154 * Work out how tall a metadata tree needs to be in order to accommodate a
155 * file of a particular size. If size is less than the current size of
156 * the inode, then the current size of the inode is used instead of the
157 * supplied one.
158 *
159 * Returns: the height the tree should be
160 */
161
static unsigned int calc_tree_height(struct gfs2_inode *ip, uint64_t size)
{
	struct gfs2_sbd *sdp = ip->i_sbd;
	uint64_t *arr;
	unsigned int max, height;

	/* never compute a height for less than the current file size */
	if (ip->i_di.di_size > size)
		size = ip->i_di.di_size;

	/* directories use the journaled-height table, files the plain one */
	if (gfs2_is_dir(ip)) {
		arr = sdp->sd_jheightsize;
		max = sdp->sd_max_jheight;
	} else {
		arr = sdp->sd_heightsize;
		max = sdp->sd_max_height;
	}

	/* arr[h] is the largest size a tree of height h can address;
	   pick the first height that fits */
	for (height = 0; height < max; height++)
		if (arr[height] >= size)
			break;

	return height;
}
185
186/**
187 * build_height - Build a metadata tree of the requested height
188 * @ip: The GFS2 inode
189 * @height: The height to build to
190 *
191 * This routine makes sure that the metadata tree is tall enough to hold
192 * "size" bytes of data.
193 *
194 * Returns: errno
195 */
196
static int build_height(struct gfs2_inode *ip, int height)
{
	struct gfs2_sbd *sdp = ip->i_sbd;
	struct buffer_head *bh, *dibh;
	uint64_t block = 0, *bp;
	unsigned int x;
	int new_block;
	int error;

	/* grow the tree one level per iteration until tall enough */
	while (ip->i_di.di_height < height) {
		error = gfs2_meta_inode_buffer(ip, &dibh);
		if (error)
			return error;

		/* only allocate an indirect block if the dinode actually
		   holds any non-zero pointers to push down */
		new_block = 0;
		bp = (uint64_t *)(dibh->b_data + sizeof(struct gfs2_dinode));
		for (x = 0; x < sdp->sd_diptrs; x++, bp++)
			if (*bp) {
				new_block = 1;
				break;
			}

		if (new_block) {
			/* Get a new block, fill it with the old direct
			   pointers, and write it out */

			block = gfs2_alloc_meta(ip);

			bh = gfs2_meta_new(ip->i_gl, block);
			gfs2_trans_add_bh(ip->i_gl, bh, 1);
			gfs2_metatype_set(bh,
					  GFS2_METATYPE_IN,
					  GFS2_FORMAT_IN);
			gfs2_buffer_copy_tail(bh,
					      sizeof(struct gfs2_meta_header),
					      dibh, sizeof(struct gfs2_dinode));

			brelse(bh);
		}

		/* Set up the new direct pointer and write it out to disk */

		gfs2_trans_add_bh(ip->i_gl, dibh, 1);

		gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));

		if (new_block) {
			/* the dinode now holds a single pointer to the
			   freshly created indirect block */
			*(uint64_t *)(dibh->b_data + sizeof(struct gfs2_dinode)) = cpu_to_be64(block);
			ip->i_di.di_blocks++;
		}

		ip->i_di.di_height++;

		gfs2_dinode_out(&ip->i_di, dibh->b_data);
		brelse(dibh);
	}

	return 0;
}
256
257/**
258 * find_metapath - Find path through the metadata tree
259 * @ip: The inode pointer
260 * @mp: The metapath to return the result in
261 * @block: The disk block to look up
262 *
263 * This routine returns a struct metapath structure that defines a path
264 * through the metadata of inode "ip" to get to block "block".
265 *
266 * Example:
267 * Given: "ip" is a height 3 file, "offset" is 101342453, and this is a
268 * filesystem with a blocksize of 4096.
269 *
270 * find_metapath() would return a struct metapath structure set to:
271 * mp_offset = 101342453, mp_height = 3, mp_list[0] = 0, mp_list[1] = 48,
272 * and mp_list[2] = 165.
273 *
274 * That means that in order to get to the block containing the byte at
275 * offset 101342453, we would load the indirect block pointed to by pointer
276 * 0 in the dinode. We would then load the indirect block pointed to by
277 * pointer 48 in that indirect block. We would then load the data block
278 * pointed to by pointer 165 in that indirect block.
279 *
280 * ----------------------------------------
281 * | Dinode | |
282 * | | 4|
283 * | |0 1 2 3 4 5 9|
284 * | | 6|
285 * ----------------------------------------
286 * |
287 * |
288 * V
289 * ----------------------------------------
290 * | Indirect Block |
291 * | 5|
292 * | 4 4 4 4 4 5 5 1|
293 * |0 5 6 7 8 9 0 1 2|
294 * ----------------------------------------
295 * |
296 * |
297 * V
298 * ----------------------------------------
299 * | Indirect Block |
300 * | 1 1 1 1 1 5|
301 * | 6 6 6 6 6 1|
302 * |0 3 4 5 6 7 2|
303 * ----------------------------------------
304 * |
305 * |
306 * V
307 * ----------------------------------------
308 * | Data block containing offset |
309 * | 101342453 |
310 * | |
311 * | |
312 * ----------------------------------------
313 *
314 */
315
316static void find_metapath(struct gfs2_inode *ip, uint64_t block, struct metapath *mp)
317{
318 struct gfs2_sbd *sdp = ip->i_sbd;
319 uint64_t b = block;
320 unsigned int i;
321
322 for (i = ip->i_di.di_height; i--;)
323 mp->mp_list[i] = (__u16)do_div(b, sdp->sd_inptrs);
324
325}
326
327/**
328 * metapointer - Return pointer to start of metadata in a buffer
329 * @bh: The buffer
330 * @height: The metadata height (0 = dinode)
331 * @mp: The metapath
332 *
333 * Return a pointer to the block number of the next height of the metadata
334 * tree given a buffer containing the pointer to the current height of the
335 * metadata tree.
336 */
337
338static inline uint64_t *metapointer(struct buffer_head *bh,
339 unsigned int height, struct metapath *mp)
340{
341 unsigned int head_size = (height > 0) ?
342 sizeof(struct gfs2_meta_header) : sizeof(struct gfs2_dinode);
343
344 return ((uint64_t *)(bh->b_data + head_size)) + mp->mp_list[height];
345}
346
347/**
348 * lookup_block - Get the next metadata block in metadata tree
349 * @ip: The GFS2 inode
350 * @bh: Buffer containing the pointers to metadata blocks
351 * @height: The height of the tree (0 = dinode)
352 * @mp: The metapath
 * @create: Non-zero if we may create a new metadata block
354 * @new: Used to indicate if we did create a new metadata block
355 * @block: the returned disk block number
356 *
357 * Given a metatree, complete to a particular height, checks to see if the next
358 * height of the tree exists. If not the next height of the tree is created.
359 * The block number of the next height of the metadata tree is returned.
360 *
361 */
362
363static void lookup_block(struct gfs2_inode *ip, struct buffer_head *bh,
364 unsigned int height, struct metapath *mp, int create,
365 int *new, uint64_t *block)
366{
367 uint64_t *ptr = metapointer(bh, height, mp);
368
369 if (*ptr) {
370 *block = be64_to_cpu(*ptr);
371 return;
372 }
373
374 *block = 0;
375
376 if (!create)
377 return;
378
379 if (height == ip->i_di.di_height - 1 &&
380 !gfs2_is_dir(ip))
381 *block = gfs2_alloc_data(ip);
382 else
383 *block = gfs2_alloc_meta(ip);
384
385 gfs2_trans_add_bh(ip->i_gl, bh, 1);
386
387 *ptr = cpu_to_be64(*block);
388 ip->i_di.di_blocks++;
389
390 *new = 1;
391}
392
393/**
394 * gfs2_block_map - Map a block from an inode to a disk block
395 * @ip: The GFS2 inode
396 * @lblock: The logical block number
397 * @new: Value/Result argument (1 = may create/did create new blocks)
398 * @dblock: the disk block number of the start of an extent
399 * @extlen: the size of the extent
400 *
401 * Find the block number on the current device which corresponds to an
402 * inode's block. If the block had to be created, "new" will be set.
403 *
404 * Returns: errno
405 */
406
int gfs2_block_map(struct gfs2_inode *ip, uint64_t lblock, int *new,
		   uint64_t *dblock, uint32_t *extlen)
{
	struct gfs2_sbd *sdp = ip->i_sbd;
	struct buffer_head *bh;
	struct metapath mp;
	int create = *new;	/* caller signals "may allocate" via *new */
	unsigned int bsize;
	unsigned int height;
	unsigned int end_of_metadata;
	unsigned int x;
	int error = 0;

	/* Results default to "no mapping, nothing created". */
	*new = 0;
	*dblock = 0;
	if (extlen)
		*extlen = 0;

	/* Writers may grow the tree, so take i_rw_mutex exclusively;
	   pure lookups can share it. */
	if (create)
		down_write(&ip->i_rw_mutex);
	else
		down_read(&ip->i_rw_mutex);

	/* Stuffed inodes have no block tree; callers must unstuff first. */
	if (gfs2_assert_warn(sdp, !gfs2_is_stuffed(ip)))
		goto out;

	bsize = (gfs2_is_dir(ip)) ? sdp->sd_jbsize : sdp->sd_sb.sb_bsize;

	/* Grow the metadata tree if lblock lies beyond its current reach. */
	height = calc_tree_height(ip, (lblock + 1) * bsize);
	if (ip->i_di.di_height < height) {
		if (!create)
			goto out;

		error = build_height(ip, height);
		if (error)
			goto out;
	}

	find_metapath(ip, lblock, &mp);
	end_of_metadata = ip->i_di.di_height - 1;

	error = gfs2_meta_inode_buffer(ip, &bh);
	if (error)
		goto out;

	/* Walk down the tree one level at a time, releasing each buffer
	   before reading the next.  A zero pointer with !create means
	   the block simply isn't mapped. */
	for (x = 0; x < end_of_metadata; x++) {
		lookup_block(ip, bh, x, &mp, create, new, dblock);
		brelse(bh);
		if (!*dblock)
			goto out;

		error = gfs2_meta_indirect_buffer(ip, x+1, *dblock, *new, &bh);
		if (error)
			goto out;
	}

	lookup_block(ip, bh, end_of_metadata, &mp, create, new, dblock);

	/* Optionally measure the extent: count how many following
	   pointers in this bottom-level block are physically contiguous
	   with *dblock.  Only done for pre-existing blocks (!*new). */
	if (extlen && *dblock) {
		*extlen = 1;

		if (!*new) {
			uint64_t tmp_dblock;
			int tmp_new;
			unsigned int nptrs;

			nptrs = (end_of_metadata) ? sdp->sd_inptrs :
				sdp->sd_diptrs;

			while (++mp.mp_list[end_of_metadata] < nptrs) {
				lookup_block(ip, bh, end_of_metadata, &mp,
					     0, &tmp_new, &tmp_dblock);

				if (*dblock + *extlen != tmp_dblock)
					break;

				(*extlen)++;
			}
		}
	}

	brelse(bh);

	/* If we allocated anything, write the updated dinode back. */
	if (*new) {
		error = gfs2_meta_inode_buffer(ip, &bh);
		if (!error) {
			gfs2_trans_add_bh(ip->i_gl, bh, 1);
			gfs2_dinode_out(&ip->i_di, bh->b_data);
			brelse(bh);
		}
	}

 out:
	if (create)
		up_write(&ip->i_rw_mutex);
	else
		up_read(&ip->i_rw_mutex);

	return error;
}
507
508/**
509 * recursive_scan - recursively scan through the end of a file
510 * @ip: the inode
511 * @dibh: the dinode buffer
512 * @mp: the path through the metadata to the point to start
513 * @height: the height the recursion is at
514 * @block: the indirect block to look at
515 * @first: 1 if this is the first block
516 * @bc: the call to make for each piece of metadata
517 * @data: data opaque to this function to pass to @bc
518 *
519 * When this is first called @height and @block should be zero and
520 * @first should be 1.
521 *
522 * Returns: errno
523 */
524
static int recursive_scan(struct gfs2_inode *ip, struct buffer_head *dibh,
			  struct metapath *mp, unsigned int height,
			  uint64_t block, int first, block_call_t bc,
			  void *data)
{
	struct gfs2_sbd *sdp = ip->i_sbd;
	struct buffer_head *bh = NULL;
	uint64_t *top, *bottom;
	uint64_t bn;
	int error;
	int mh_size = sizeof(struct gfs2_meta_header);

	if (!height) {
		/* Top of the recursion: read the dinode itself and scan
		   its direct pointer area, starting at mp_list[0]. */
		error = gfs2_meta_inode_buffer(ip, &bh);
		if (error)
			return error;
		dibh = bh;

		top = (uint64_t *)(bh->b_data + sizeof(struct gfs2_dinode)) +
			mp->mp_list[0];
		bottom = (uint64_t *)(bh->b_data + sizeof(struct gfs2_dinode)) +
			sdp->sd_diptrs;
	} else {
		/* Indirect block: only the "first" block at each level
		   starts partway in (at mp_list[height]); later siblings
		   are scanned from pointer 0. */
		error = gfs2_meta_indirect_buffer(ip, height, block, 0, &bh);
		if (error)
			return error;

		top = (uint64_t *)(bh->b_data + mh_size) +
			((first) ? mp->mp_list[height] : 0);

		bottom = (uint64_t *)(bh->b_data + mh_size) + sdp->sd_inptrs;
	}

	/* Invoke the per-block callback before descending. */
	error = bc(ip, dibh, bh, top, bottom, height, data);
	if (error)
		goto out;

	/* Recurse into each non-zero pointer unless we're already at
	   the bottom level of metadata.  "first" stays set only for the
	   leftmost child. */
	if (height < ip->i_di.di_height - 1)
		for (; top < bottom; top++, first = 0) {
			if (!*top)
				continue;

			bn = be64_to_cpu(*top);

			error = recursive_scan(ip, dibh, mp, height + 1, bn,
					       first, bc, data);
			if (error)
				break;
		}

 out:
	brelse(bh);

	return error;
}
580
581/**
 * do_strip - Look for a particular layer of the file and strip it off
583 * @ip: the inode
584 * @dibh: the dinode buffer
585 * @bh: A buffer of pointers
586 * @top: The first pointer in the buffer
587 * @bottom: One more than the last pointer
588 * @height: the height this buffer is at
589 * @data: a pointer to a struct strip_mine
590 *
591 * Returns: errno
592 */
593
static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
		    struct buffer_head *bh, uint64_t *top, uint64_t *bottom,
		    unsigned int height, void *data)
{
	struct strip_mine *sm = (struct strip_mine *)data;
	struct gfs2_sbd *sdp = ip->i_sbd;
	struct gfs2_rgrp_list rlist;
	uint64_t bn, bstart;
	uint32_t blen;
	uint64_t *p;
	unsigned int rg_blocks = 0;
	int metadata;
	unsigned int revokes = 0;
	int x;
	int error;

	/* An unmapped first pointer means the "skip the block that still
	   contains valid data" rule no longer applies. */
	if (!*top)
		sm->sm_first = 0;

	/* Only strip the layer this pass was asked for. */
	if (height != sm->sm_height)
		return 0;

	/* The very first pointer at the target height may still be in
	   use (partial truncate); keep it and clear the flag. */
	if (sm->sm_first) {
		top++;
		sm->sm_first = 0;
	}

	metadata = (height != ip->i_di.di_height - 1);
	if (metadata)
		revokes = (height) ? sdp->sd_inptrs : sdp->sd_diptrs;

	error = gfs2_rindex_hold(sdp, &ip->i_alloc.al_ri_gh);
	if (error)
		return error;

	memset(&rlist, 0, sizeof(struct gfs2_rgrp_list));
	bstart = 0;
	blen = 0;

	/* First pass: coalesce the pointers into extents (bstart/blen)
	   and record which resource groups they fall into, so all the
	   rgrp glocks can be acquired before the transaction starts. */
	for (p = top; p < bottom; p++) {
		if (!*p)
			continue;

		bn = be64_to_cpu(*p);

		if (bstart + blen == bn)
			blen++;
		else {
			if (bstart)
				gfs2_rlist_add(sdp, &rlist, bstart);

			bstart = bn;
			blen = 1;
		}
	}

	if (bstart)
		gfs2_rlist_add(sdp, &rlist, bstart);
	else
		goto out; /* Nothing to do */

	gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE, 0);

	/* Size the transaction: one block per rgrp header we may dirty. */
	for (x = 0; x < rlist.rl_rgrps; x++) {
		struct gfs2_rgrpd *rgd;
		rgd = get_gl2rgd(rlist.rl_ghs[x].gh_gl);
		rg_blocks += rgd->rd_ri.ri_length;
	}

	error = gfs2_glock_nq_m(rlist.rl_rgrps, rlist.rl_ghs);
	if (error)
		goto out_rlist;

	error = gfs2_trans_begin(sdp, rg_blocks + RES_DINODE +
				 RES_INDIRECT + RES_STATFS + RES_QUOTA,
				 revokes);
	if (error)
		goto out_rg_gunlock;

	down_write(&ip->i_rw_mutex);

	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
	gfs2_trans_add_bh(ip->i_gl, bh, 1);

	bstart = 0;
	blen = 0;

	/* Second pass: actually free the extents, zero each pointer and
	   keep the dinode block count in step. */
	for (p = top; p < bottom; p++) {
		if (!*p)
			continue;

		bn = be64_to_cpu(*p);

		if (bstart + blen == bn)
			blen++;
		else {
			if (bstart) {
				if (metadata)
					gfs2_free_meta(ip, bstart, blen);
				else
					gfs2_free_data(ip, bstart, blen);
			}

			bstart = bn;
			blen = 1;
		}

		*p = 0;
		/* di_blocks underflow would mean on-disk corruption. */
		if (!ip->i_di.di_blocks)
			gfs2_consist_inode(ip);
		ip->i_di.di_blocks--;
	}
	/* Free the final coalesced extent. */
	if (bstart) {
		if (metadata)
			gfs2_free_meta(ip, bstart, blen);
		else
			gfs2_free_data(ip, bstart, blen);
	}

	ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();

	gfs2_dinode_out(&ip->i_di, dibh->b_data);

	up_write(&ip->i_rw_mutex);

	gfs2_trans_end(sdp);

 out_rg_gunlock:
	gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs);

 out_rlist:
	gfs2_rlist_free(&rlist);

 out:
	gfs2_glock_dq_uninit(&ip->i_alloc.al_ri_gh);

	return error;
}
732
733/**
734 * do_grow - Make a file look bigger than it is
735 * @ip: the inode
736 * @size: the size to set the file to
737 *
738 * Called with an exclusive lock on @ip.
739 *
740 * Returns: errno
741 */
742
static int do_grow(struct gfs2_inode *ip, uint64_t size)
{
	struct gfs2_sbd *sdp = ip->i_sbd;
	struct gfs2_alloc *al;
	struct buffer_head *dibh;
	unsigned int h;
	int error;

	al = gfs2_alloc_get(ip);

	error = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
	if (error)
		goto out;

	error = gfs2_quota_check(ip, ip->i_di.di_uid, ip->i_di.di_gid);
	if (error)
		goto out_gunlock_q;

	/* Worst case: one new block per tree level plus the data block. */
	al->al_requested = sdp->sd_max_height + RES_DATA;

	error = gfs2_inplace_reserve(ip);
	if (error)
		goto out_gunlock_q;

	error = gfs2_trans_begin(sdp,
		sdp->sd_max_height + al->al_rgd->rd_ri.ri_length +
		RES_JDATA + RES_DINODE + RES_STATFS + RES_QUOTA, 0);
	if (error)
		goto out_ipres;

	/* Only if the new size no longer fits inside the dinode block
	   do we need to unstuff and possibly deepen the tree.  No data
	   is allocated here: growing just moves the size/height. */
	if (size > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) {
		if (gfs2_is_stuffed(ip)) {
			error = gfs2_unstuff_dinode(ip, gfs2_unstuffer_page,
						    NULL);
			if (error)
				goto out_end_trans;
		}

		h = calc_tree_height(ip, size);
		if (ip->i_di.di_height < h) {
			down_write(&ip->i_rw_mutex);
			error = build_height(ip, h);
			up_write(&ip->i_rw_mutex);
			if (error)
				goto out_end_trans;
		}
	}

	ip->i_di.di_size = size;
	ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();

	/* Write the updated dinode back through the journal. */
	error = gfs2_meta_inode_buffer(ip, &dibh);
	if (error)
		goto out_end_trans;

	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
	gfs2_dinode_out(&ip->i_di, dibh->b_data);
	brelse(dibh);

 out_end_trans:
	gfs2_trans_end(sdp);

 out_ipres:
	gfs2_inplace_release(ip);

 out_gunlock_q:
	gfs2_quota_unlock(ip);

 out:
	gfs2_alloc_put(ip);

	return error;
}
816
/* First phase of a shrinking truncate: trim the in-dinode data or the
   last partial block and record the new size.  Returns <0 on error,
   1 if the inode was stuffed (truncate is complete), 0 if block
   deallocation must follow. */
static int trunc_start(struct gfs2_inode *ip, uint64_t size)
{
	struct gfs2_sbd *sdp = ip->i_sbd;
	struct buffer_head *dibh;
	int journaled = gfs2_is_jdata(ip);
	int error;

	error = gfs2_trans_begin(sdp,
				 RES_DINODE + ((journaled) ? RES_JDATA : 0), 0);
	if (error)
		return error;

	error = gfs2_meta_inode_buffer(ip, &dibh);
	if (error)
		goto out;

	if (gfs2_is_stuffed(ip)) {
		/* Data lives in the dinode block: zero everything past
		   the new size and we're done (signalled by error = 1). */
		ip->i_di.di_size = size;
		ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
		gfs2_trans_add_bh(ip->i_gl, dibh, 1);
		gfs2_dinode_out(&ip->i_di, dibh->b_data);
		gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode) + size);
		error = 1;

	} else {
		/* Zero the tail of the last partial block, then flag the
		   inode so an interrupted truncate can be resumed. */
		if (size & (uint64_t)(sdp->sd_sb.sb_bsize - 1))
			error = gfs2_block_truncate_page(ip->i_vnode->i_mapping);

		if (!error) {
			ip->i_di.di_size = size;
			ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
			ip->i_di.di_flags |= GFS2_DIF_TRUNC_IN_PROG;
			gfs2_trans_add_bh(ip->i_gl, dibh, 1);
			gfs2_dinode_out(&ip->i_di, dibh->b_data);
		}
	}

	brelse(dibh);

 out:
	gfs2_trans_end(sdp);

	return error;
}
861
/* Second phase of truncation: free every block beyond @size by
   stripping the metadata tree one layer at a time, bottom-up. */
static int trunc_dealloc(struct gfs2_inode *ip, uint64_t size)
{
	unsigned int height = ip->i_di.di_height;
	uint64_t lblock;
	struct metapath mp;
	int error;

	/* lblock is the last logical block still (partially) in use. */
	if (!size)
		lblock = 0;
	else
		lblock = (size - 1) >> ip->i_sbd->sd_sb.sb_bsize_shift;

	find_metapath(ip, lblock, &mp);
	gfs2_alloc_get(ip);

	error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
	if (error)
		goto out;

	/* Strip the deepest layer first; sm_first protects the block
	   that still holds data when truncating to a non-zero size. */
	while (height--) {
		struct strip_mine sm;
		sm.sm_first = !!size;
		sm.sm_height = height;

		error = recursive_scan(ip, NULL, &mp, 0, 0, 1, do_strip, &sm);
		if (error)
			break;
	}

	gfs2_quota_unhold(ip);

 out:
	gfs2_alloc_put(ip);
	return error;
}
897
/* Final phase of truncation: reset height/goals for an empty file,
   clear the TRUNC_IN_PROG flag, and write the dinode back. */
static int trunc_end(struct gfs2_inode *ip)
{
	struct gfs2_sbd *sdp = ip->i_sbd;
	struct buffer_head *dibh;
	int error;

	error = gfs2_trans_begin(sdp, RES_DINODE, 0);
	if (error)
		return error;

	down_write(&ip->i_rw_mutex);

	error = gfs2_meta_inode_buffer(ip, &dibh);
	if (error)
		goto out;

	if (!ip->i_di.di_size) {
		/* File is now empty: collapse the tree and point the
		   allocation goals back at the dinode's own address. */
		ip->i_di.di_height = 0;
		ip->i_di.di_goal_meta =
			ip->i_di.di_goal_data =
			ip->i_num.no_addr;
		gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
	}
	ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
	ip->i_di.di_flags &= ~GFS2_DIF_TRUNC_IN_PROG;

	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
	gfs2_dinode_out(&ip->i_di, dibh->b_data);
	brelse(dibh);

 out:
	up_write(&ip->i_rw_mutex);

	gfs2_trans_end(sdp);

	return error;
}
935
936/**
937 * do_shrink - make a file smaller
938 * @ip: the inode
 * @size: the size to make the file
941 *
942 * Called with an exclusive lock on @ip.
943 *
944 * Returns: errno
945 */
946
947static int do_shrink(struct gfs2_inode *ip, uint64_t size)
948{
949 int error;
950
951 error = trunc_start(ip, size);
952 if (error < 0)
953 return error;
954 if (error > 0)
955 return 0;
956
957 error = trunc_dealloc(ip, size);
958 if (!error)
959 error = trunc_end(ip);
960
961 return error;
962}
963
964/**
965 * gfs2_truncatei - make a file a given size
966 * @ip: the inode
 * @size: the size to make the file
969 *
970 * The file size can grow, shrink, or stay the same size.
971 *
972 * Returns: errno
973 */
974
975int gfs2_truncatei(struct gfs2_inode *ip, uint64_t size)
976{
977 int error;
978
979 if (gfs2_assert_warn(ip->i_sbd, S_ISREG(ip->i_di.di_mode)))
980 return -EINVAL;
981
982 if (size > ip->i_di.di_size)
983 error = do_grow(ip, size);
984 else
985 error = do_shrink(ip, size);
986
987 return error;
988}
989
990int gfs2_truncatei_resume(struct gfs2_inode *ip)
991{
992 int error;
993 error = trunc_dealloc(ip, ip->i_di.di_size);
994 if (!error)
995 error = trunc_end(ip);
996 return error;
997}
998
int gfs2_file_dealloc(struct gfs2_inode *ip)
{
	/* Deallocating to size zero frees all of the file's data and
	   indirect blocks via the truncation machinery. */
	return trunc_dealloc(ip, 0);
}
1003
1004/**
1005 * gfs2_write_calc_reserv - calculate number of blocks needed to write to a file
1006 * @ip: the file
1007 * @len: the number of bytes to be written to the file
1008 * @data_blocks: returns the number of data blocks required
1009 * @ind_blocks: returns the number of indirect blocks required
1010 *
1011 */
1012
1013void gfs2_write_calc_reserv(struct gfs2_inode *ip, unsigned int len,
1014 unsigned int *data_blocks, unsigned int *ind_blocks)
1015{
1016 struct gfs2_sbd *sdp = ip->i_sbd;
1017 unsigned int tmp;
1018
1019 if (gfs2_is_dir(ip)) {
1020 *data_blocks = DIV_RU(len, sdp->sd_jbsize) + 2;
1021 *ind_blocks = 3 * (sdp->sd_max_jheight - 1);
1022 } else {
1023 *data_blocks = (len >> sdp->sd_sb.sb_bsize_shift) + 3;
1024 *ind_blocks = 3 * (sdp->sd_max_height - 1);
1025 }
1026
1027 for (tmp = *data_blocks; tmp > sdp->sd_diptrs;) {
1028 tmp = DIV_RU(tmp, sdp->sd_inptrs);
1029 *ind_blocks += tmp;
1030 }
1031}
1032
1033/**
1034 * gfs2_write_alloc_required - figure out if a write will require an allocation
1035 * @ip: the file being written to
1036 * @offset: the offset to write to
1037 * @len: the number of bytes being written
1038 * @alloc_required: set to 1 if an alloc is required, 0 otherwise
1039 *
1040 * Returns: errno
1041 */
1042
int gfs2_write_alloc_required(struct gfs2_inode *ip, uint64_t offset,
			      unsigned int len, int *alloc_required)
{
	struct gfs2_sbd *sdp = ip->i_sbd;
	uint64_t lblock, lblock_stop, dblock;
	uint32_t extlen;
	int new = 0;
	int error = 0;

	*alloc_required = 0;

	/* A zero-length write never needs allocation. */
	if (!len)
		return 0;

	/* A stuffed inode needs allocation only when the write would
	   spill past the space available inside the dinode block. */
	if (gfs2_is_stuffed(ip)) {
		if (offset + len >
		    sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode))
			*alloc_required = 1;
		return 0;
	}

	/* Compute the logical block range [lblock, lblock_stop) covered
	   by the write.  Directories use the journaled block size and
	   64-bit division (do_div divides in place); regular files can
	   use a plain shift. */
	if (gfs2_is_dir(ip)) {
		unsigned int bsize = sdp->sd_jbsize;
		lblock = offset;
		do_div(lblock, bsize);
		lblock_stop = offset + len + bsize - 1;
		do_div(lblock_stop, bsize);
	} else {
		unsigned int shift = sdp->sd_sb.sb_bsize_shift;
		lblock = offset >> shift;
		lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift;
	}

	/* Probe the range extent by extent (new == 0, so gfs2_block_map
	   never allocates); any unmapped block means we need allocation. */
	for (; lblock < lblock_stop; lblock += extlen) {
		error = gfs2_block_map(ip, lblock, &new, &dblock, &extlen);
		if (error)
			return error;

		if (!dblock) {
			*alloc_required = 1;
			return 0;
		}
	}

	return 0;
}
1089
diff --git a/fs/gfs2/bmap.h b/fs/gfs2/bmap.h
new file mode 100644
index 000000000000..ee9ec8d7515c
--- /dev/null
+++ b/fs/gfs2/bmap.h
@@ -0,0 +1,35 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __BMAP_DOT_H__
11#define __BMAP_DOT_H__
12
13typedef int (*gfs2_unstuffer_t) (struct gfs2_inode * ip,
14 struct buffer_head * dibh, uint64_t block,
15 void *private);
16int gfs2_unstuffer_sync(struct gfs2_inode *ip, struct buffer_head *dibh,
17 uint64_t block, void *private);
18int gfs2_unstuff_dinode(struct gfs2_inode *ip, gfs2_unstuffer_t unstuffer,
19 void *private);
20
21int gfs2_block_map(struct gfs2_inode *ip,
22 uint64_t lblock, int *new,
23 uint64_t *dblock, uint32_t *extlen);
24
25int gfs2_truncatei(struct gfs2_inode *ip, uint64_t size);
26int gfs2_truncatei_resume(struct gfs2_inode *ip);
27int gfs2_file_dealloc(struct gfs2_inode *ip);
28
29void gfs2_write_calc_reserv(struct gfs2_inode *ip, unsigned int len,
30 unsigned int *data_blocks,
31 unsigned int *ind_blocks);
32int gfs2_write_alloc_required(struct gfs2_inode *ip, uint64_t offset,
33 unsigned int len, int *alloc_required);
34
35#endif /* __BMAP_DOT_H__ */
diff --git a/fs/gfs2/daemon.c b/fs/gfs2/daemon.c
new file mode 100644
index 000000000000..cff8d5368d21
--- /dev/null
+++ b/fs/gfs2/daemon.c
@@ -0,0 +1,225 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/kthread.h>
16#include <linux/delay.h>
17#include <asm/semaphore.h>
18
19#include "gfs2.h"
20#include "daemon.h"
21#include "glock.h"
22#include "log.h"
23#include "quota.h"
24#include "recovery.h"
25#include "super.h"
26#include "unlinked.h"
27
28/* This uses schedule_timeout() instead of msleep() because it's good for
29 the daemons to wake up more often than the timeout when unmounting so
30 the user's unmount doesn't sit there forever.
31
32 The kthread functions used to start these daemons block and flush signals. */
33
34/**
35 * gfs2_scand - Look for cached glocks and inodes to toss from memory
36 * @sdp: Pointer to GFS2 superblock
37 *
38 * One of these daemons runs, finding candidates to add to sd_reclaim_list.
39 * See gfs2_glockd()
40 */
41
42int gfs2_scand(void *data)
43{
44 struct gfs2_sbd *sdp = (struct gfs2_sbd *)data;
45 unsigned long t;
46
47 while (!kthread_should_stop()) {
48 gfs2_scand_internal(sdp);
49 t = gfs2_tune_get(sdp, gt_scand_secs) * HZ;
50 schedule_timeout_interruptible(t);
51 }
52
53 return 0;
54}
55
56/**
57 * gfs2_glockd - Reclaim unused glock structures
58 * @sdp: Pointer to GFS2 superblock
59 *
60 * One or more of these daemons run, reclaiming glocks on sd_reclaim_list.
61 * Number of daemons can be set by user, with num_glockd mount option.
62 */
63
int gfs2_glockd(void *data)
{
	struct gfs2_sbd *sdp = (struct gfs2_sbd *)data;
	DECLARE_WAITQUEUE(wait_chan, current);

	while (!kthread_should_stop()) {
		/* Drain the reclaim list completely before sleeping. */
		while (atomic_read(&sdp->sd_reclaim_count))
			gfs2_reclaim_glock(sdp);

		/* Canonical wait-queue sleep: set the task state and
		   enqueue on the waitqueue *before* re-testing the
		   condition, so a wakeup arriving between the test and
		   schedule() cannot be lost. */
		set_current_state(TASK_INTERRUPTIBLE);
		add_wait_queue(&sdp->sd_reclaim_wq, &wait_chan);
		if (!atomic_read(&sdp->sd_reclaim_count) &&
		    !kthread_should_stop())
			schedule();
		remove_wait_queue(&sdp->sd_reclaim_wq, &wait_chan);
		set_current_state(TASK_RUNNING);
	}

	return 0;
}
84
85/**
86 * gfs2_recoverd - Recover dead machine's journals
87 * @sdp: Pointer to GFS2 superblock
88 *
89 */
90
91int gfs2_recoverd(void *data)
92{
93 struct gfs2_sbd *sdp = (struct gfs2_sbd *)data;
94 unsigned long t;
95
96 while (!kthread_should_stop()) {
97 gfs2_check_journals(sdp);
98 t = gfs2_tune_get(sdp, gt_recoverd_secs) * HZ;
99 schedule_timeout_interruptible(t);
100 }
101
102 return 0;
103}
104
105/**
106 * gfs2_logd - Update log tail as Active Items get flushed to in-place blocks
107 * @sdp: Pointer to GFS2 superblock
108 *
109 * Also, periodically check to make sure that we're using the most recent
110 * journal index.
111 */
112
int gfs2_logd(void *data)
{
	struct gfs2_sbd *sdp = (struct gfs2_sbd *)data;
	struct gfs2_holder ji_gh;
	unsigned long t;

	while (!kthread_should_stop()) {
		/* Advance the log tail */

		t = sdp->sd_log_flush_time +
		    gfs2_tune_get(sdp, gt_log_flush_secs) * HZ;

		gfs2_ail1_empty(sdp, DIO_ALL);

		/* Flush the log only once the flush interval elapses. */
		if (time_after_eq(jiffies, t)) {
			gfs2_log_flush(sdp);
			sdp->sd_log_flush_time = jiffies;
		}

		/* Check for latest journal index */

		t = sdp->sd_jindex_refresh_time +
		    gfs2_tune_get(sdp, gt_jindex_refresh_secs) * HZ;

		if (time_after_eq(jiffies, t)) {
			/* Acquiring and immediately releasing the jindex
			   glock forces a refresh if it changed. */
			if (!gfs2_jindex_hold(sdp, &ji_gh))
				gfs2_glock_dq_uninit(&ji_gh);
			sdp->sd_jindex_refresh_time = jiffies;
		}

		t = gfs2_tune_get(sdp, gt_logd_secs) * HZ;
		schedule_timeout_interruptible(t);
	}

	return 0;
}
149
150/**
151 * gfs2_quotad - Write cached quota changes into the quota file
152 * @sdp: Pointer to GFS2 superblock
153 *
154 */
155
int gfs2_quotad(void *data)
{
	struct gfs2_sbd *sdp = (struct gfs2_sbd *)data;
	unsigned long t;
	int error;

	while (!kthread_should_stop()) {
		/* Update the master statfs file */

		t = sdp->sd_statfs_sync_time +
		    gfs2_tune_get(sdp, gt_statfs_quantum) * HZ;

		if (time_after_eq(jiffies, t)) {
			error = gfs2_statfs_sync(sdp);
			/* -EROFS and a shut-down fs are expected; don't
			   spam the log for those. */
			if (error &&
			    error != -EROFS &&
			    !test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
				fs_err(sdp, "quotad: (1) error=%d\n", error);
			sdp->sd_statfs_sync_time = jiffies;
		}

		/* Update quota file */

		t = sdp->sd_quota_sync_time +
		    gfs2_tune_get(sdp, gt_quota_quantum) * HZ;

		if (time_after_eq(jiffies, t)) {
			error = gfs2_quota_sync(sdp);
			if (error &&
			    error != -EROFS &&
			    !test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
				fs_err(sdp, "quotad: (2) error=%d\n", error);
			sdp->sd_quota_sync_time = jiffies;
		}

		/* Reclaim stale cached quota structures. */
		gfs2_quota_scan(sdp);

		t = gfs2_tune_get(sdp, gt_quotad_secs) * HZ;
		schedule_timeout_interruptible(t);
	}

	return 0;
}
199
200/**
201 * gfs2_inoded - Deallocate unlinked inodes
202 * @sdp: Pointer to GFS2 superblock
203 *
204 */
205
206int gfs2_inoded(void *data)
207{
208 struct gfs2_sbd *sdp = (struct gfs2_sbd *)data;
209 unsigned long t;
210 int error;
211
212 while (!kthread_should_stop()) {
213 error = gfs2_unlinked_dealloc(sdp);
214 if (error &&
215 error != -EROFS &&
216 !test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
217 fs_err(sdp, "inoded: error = %d\n", error);
218
219 t = gfs2_tune_get(sdp, gt_inoded_secs) * HZ;
220 schedule_timeout_interruptible(t);
221 }
222
223 return 0;
224}
225
diff --git a/fs/gfs2/daemon.h b/fs/gfs2/daemon.h
new file mode 100644
index 000000000000..a27fdeda5fbb
--- /dev/null
+++ b/fs/gfs2/daemon.h
@@ -0,0 +1,20 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __DAEMON_DOT_H__
11#define __DAEMON_DOT_H__
12
13int gfs2_scand(void *data);
14int gfs2_glockd(void *data);
15int gfs2_recoverd(void *data);
16int gfs2_logd(void *data);
17int gfs2_quotad(void *data);
18int gfs2_inoded(void *data);
19
20#endif /* __DAEMON_DOT_H__ */
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
new file mode 100644
index 000000000000..65871a2b460e
--- /dev/null
+++ b/fs/gfs2/dir.c
@@ -0,0 +1,2356 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10/*
11* Implements Extendible Hashing as described in:
12* "Extendible Hashing" by Fagin, et al in
13* __ACM Trans. on Database Systems__, Sept 1979.
14*
15*
16* Here's the layout of dirents which is essentially the same as that of ext2
17* within a single block. The field de_name_len is the number of bytes
18* actually required for the name (no null terminator). The field de_rec_len
19* is the number of bytes allocated to the dirent. The offset of the next
20* dirent in the block is (dirent + dirent->de_rec_len). When a dirent is
21* deleted, the preceding dirent inherits its allocated space, ie
22* prev->de_rec_len += deleted->de_rec_len. Since the next dirent is obtained
23* by adding de_rec_len to the current dirent, this essentially causes the
24* deleted dirent to get jumped over when iterating through all the dirents.
25*
26* When deleting the first dirent in a block, there is no previous dirent so
27* the field de_ino is set to zero to designate it as deleted. When allocating
28* a dirent, gfs2_dirent_alloc iterates through the dirents in a block. If the
29* first dirent has (de_ino == 0) and de_rec_len is large enough, this first
30* dirent is allocated. Otherwise it must go through all the 'used' dirents
31* searching for one in which the amount of total space minus the amount of
32* used space will provide enough space for the new dirent.
33*
34* There are two types of blocks in which dirents reside. In a stuffed dinode,
35* the dirents begin at offset sizeof(struct gfs2_dinode) from the beginning of
36* the block. In leaves, they begin at offset sizeof(struct gfs2_leaf) from the
37* beginning of the leaf block. The dirents reside in leaves when
38*
39* dip->i_di.di_flags & GFS2_DIF_EXHASH is true
40*
41* Otherwise, the dirents are "linear", within a single stuffed dinode block.
42*
43* When the dirents are in leaves, the actual contents of the directory file are
44* used as an array of 64-bit block pointers pointing to the leaf blocks. The
45* dirents are NOT in the directory file itself. There can be more than one block
46* pointer in the array that points to the same leaf. In fact, when a directory
47* is first converted from linear to exhash, all of the pointers point to the
48* same leaf.
49*
50* When a leaf is completely full, the size of the hash table can be
51* doubled unless it is already at the maximum size which is hard coded into
52* GFS2_DIR_MAX_DEPTH. After that, leaves are chained together in a linked list,
53* but never before the maximum hash table size has been reached.
54*/
55
56#include <linux/sched.h>
57#include <linux/slab.h>
58#include <linux/spinlock.h>
59#include <linux/completion.h>
60#include <linux/buffer_head.h>
61#include <linux/sort.h>
62#include <asm/semaphore.h>
63
64#include "gfs2.h"
65#include "dir.h"
66#include "glock.h"
67#include "inode.h"
68#include "meta_io.h"
69#include "quota.h"
70#include "rgrp.h"
71#include "trans.h"
72#include "bmap.h"
73
74#define IS_LEAF 1 /* Hashed (leaf) directory */
75#define IS_DINODE 2 /* Linear (stuffed dinode block) directory */
76
77#if 1
78#define gfs2_disk_hash2offset(h) (((uint64_t)(h)) >> 1)
79#define gfs2_dir_offset2hash(p) ((uint32_t)(((uint64_t)(p)) << 1))
80#else
81#define gfs2_disk_hash2offset(h) (((uint64_t)(h)))
82#define gfs2_dir_offset2hash(p) ((uint32_t)(((uint64_t)(p))))
83#endif
84
85typedef int (*leaf_call_t) (struct gfs2_inode *dip,
86 uint32_t index, uint32_t len, uint64_t leaf_no,
87 void *data);
88
89int gfs2_dir_get_buffer(struct gfs2_inode *ip, uint64_t block, int new,
90 struct buffer_head **bhp)
91{
92 struct buffer_head *bh;
93 int error = 0;
94
95 if (new) {
96 bh = gfs2_meta_new(ip->i_gl, block);
97 gfs2_trans_add_bh(ip->i_gl, bh, 1);
98 gfs2_metatype_set(bh, GFS2_METATYPE_JD, GFS2_FORMAT_JD);
99 gfs2_buffer_clear_tail(bh, sizeof(struct gfs2_meta_header));
100 } else {
101 error = gfs2_meta_read(ip->i_gl, block, DIO_START | DIO_WAIT, &bh);
102 if (error)
103 return error;
104 if (gfs2_metatype_check(ip->i_sbd, bh, GFS2_METATYPE_JD)) {
105 brelse(bh);
106 return -EIO;
107 }
108 }
109
110 *bhp = bh;
111 return 0;
112}
113
114
115
116static int gfs2_dir_write_stuffed(struct gfs2_inode *ip, const char *buf,
117 unsigned int offset, unsigned int size)
118
119{
120 struct buffer_head *dibh;
121 int error;
122
123 error = gfs2_meta_inode_buffer(ip, &dibh);
124 if (error)
125 return error;
126
127 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
128 memcpy(dibh->b_data + offset + sizeof(struct gfs2_inode), buf, size);
129 if (ip->i_di.di_size < offset + size)
130 ip->i_di.di_size = offset + size;
131 ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
132 gfs2_dinode_out(&ip->i_di, dibh->b_data);
133
134 brelse(dibh);
135
136 return size;
137}
138
139
140
141/**
142 * gfs2_dir_write_data - Write directory information to the inode
143 * @ip: The GFS2 inode
144 * @buf: The buffer containing information to be written
145 * @offset: The file offset to start writing at
146 * @size: The amount of data to write
147 *
148 * Returns: The number of bytes correctly written or error code
149 */
150static int gfs2_dir_write_data(struct gfs2_inode *ip, const char *buf,
151 uint64_t offset, unsigned int size)
152{
153 struct gfs2_sbd *sdp = ip->i_sbd;
154 struct buffer_head *dibh;
155 uint64_t lblock, dblock;
156 uint32_t extlen = 0;
157 unsigned int o;
158 int copied = 0;
159 int error = 0;
160
161 if (!size)
162 return 0;
163
164 if (gfs2_is_stuffed(ip) &&
165 offset + size <= sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode))
166 return gfs2_dir_write_stuffed(ip, buf, (unsigned int)offset, size);
167
168 if (gfs2_assert_warn(sdp, gfs2_is_jdata(ip)))
169 return -EINVAL;
170
171 if (gfs2_is_stuffed(ip)) {
172 error = gfs2_unstuff_dinode(ip, NULL, NULL);
173 if (error)
174 return error;
175 }
176
177 lblock = offset;
178 o = do_div(lblock, sdp->sd_jbsize) + sizeof(struct gfs2_meta_header);
179
180 while (copied < size) {
181 unsigned int amount;
182 struct buffer_head *bh;
183 int new;
184
185 amount = size - copied;
186 if (amount > sdp->sd_sb.sb_bsize - o)
187 amount = sdp->sd_sb.sb_bsize - o;
188
189 if (!extlen) {
190 new = 1;
191 error = gfs2_block_map(ip, lblock, &new, &dblock, &extlen);
192 if (error)
193 goto fail;
194 error = -EIO;
195 if (gfs2_assert_withdraw(sdp, dblock))
196 goto fail;
197 }
198
199 error = gfs2_dir_get_buffer(ip, dblock, (amount == sdp->sd_jbsize) ? 1 : new, &bh);
200 if (error)
201 goto fail;
202
203 gfs2_trans_add_bh(ip->i_gl, bh, 1);
204 memcpy(bh->b_data + o, buf, amount);
205 brelse(bh);
206 if (error)
207 goto fail;
208
209 copied += amount;
210 lblock++;
211 dblock++;
212 extlen--;
213
214 o = sizeof(struct gfs2_meta_header);
215 }
216
217out:
218 error = gfs2_meta_inode_buffer(ip, &dibh);
219 if (error)
220 return error;
221
222 if (ip->i_di.di_size < offset + copied)
223 ip->i_di.di_size = offset + copied;
224 ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
225
226 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
227 gfs2_dinode_out(&ip->i_di, dibh->b_data);
228 brelse(dibh);
229
230 return copied;
231fail:
232 if (copied)
233 goto out;
234 return error;
235}
236
237static int gfs2_dir_read_stuffed(struct gfs2_inode *ip, char *buf,
238 unsigned int offset, unsigned int size)
239{
240 struct buffer_head *dibh;
241 int error;
242
243 error = gfs2_meta_inode_buffer(ip, &dibh);
244 if (!error) {
245 offset += sizeof(struct gfs2_dinode);
246 memcpy(buf, dibh->b_data + offset, size);
247 brelse(dibh);
248 }
249
250 return (error) ? error : size;
251}
252
253
254/**
 * gfs2_dir_read_data - Read data from a directory inode
 * @ip: The GFS2 Inode
 * @buf: The buffer to place result into
 * @offset: File offset to begin reading from
259 * @size: Amount of data to transfer
260 *
261 * Returns: The amount of data actually copied or the error
262 */
263static int gfs2_dir_read_data(struct gfs2_inode *ip, char *buf,
264 uint64_t offset, unsigned int size)
265{
266 struct gfs2_sbd *sdp = ip->i_sbd;
267 uint64_t lblock, dblock;
268 uint32_t extlen = 0;
269 unsigned int o;
270 int copied = 0;
271 int error = 0;
272
273 if (offset >= ip->i_di.di_size)
274 return 0;
275
276 if ((offset + size) > ip->i_di.di_size)
277 size = ip->i_di.di_size - offset;
278
279 if (!size)
280 return 0;
281
282 if (gfs2_is_stuffed(ip))
283 return gfs2_dir_read_stuffed(ip, buf, (unsigned int)offset, size);
284
285 if (gfs2_assert_warn(sdp, gfs2_is_jdata(ip)))
286 return -EINVAL;
287
288 lblock = offset;
289 o = do_div(lblock, sdp->sd_jbsize) + sizeof(struct gfs2_meta_header);
290
291 while (copied < size) {
292 unsigned int amount;
293 struct buffer_head *bh;
294 int new;
295
296 amount = size - copied;
297 if (amount > sdp->sd_sb.sb_bsize - o)
298 amount = sdp->sd_sb.sb_bsize - o;
299
300 if (!extlen) {
301 new = 0;
302 error = gfs2_block_map(ip, lblock, &new, &dblock, &extlen);
303 if (error)
304 goto fail;
305 }
306
307 if (extlen > 1)
308 gfs2_meta_ra(ip->i_gl, dblock, extlen);
309
310 if (dblock) {
311 error = gfs2_dir_get_buffer(ip, dblock, new, &bh);
312 if (error)
313 goto fail;
314 dblock++;
315 extlen--;
316 } else
317 bh = NULL;
318
319 memcpy(buf, bh->b_data + o, amount);
320 brelse(bh);
321 if (error)
322 goto fail;
323
324 copied += amount;
325 lblock++;
326
327 o = sizeof(struct gfs2_meta_header);
328 }
329
330 return copied;
331fail:
332 return (copied) ? copied : error;
333}
334
335/**
336 * int gfs2_filecmp - Compare two filenames
337 * @file1: The first filename
338 * @file2: The second filename
339 * @len_of_file2: The length of the second file
340 *
341 * This routine compares two filenames and returns 1 if they are equal.
342 *
343 * Returns: 1 if the files are the same, otherwise 0.
344 */
345
346int gfs2_filecmp(struct qstr *file1, char *file2, int len_of_file2)
347{
348 if (file1->len != len_of_file2)
349 return 0;
350 if (memcmp(file1->name, file2, file1->len))
351 return 0;
352 return 1;
353}
354
355/**
356 * dirent_first - Return the first dirent
357 * @dip: the directory
358 * @bh: The buffer
359 * @dent: Pointer to list of dirents
360 *
361 * return first dirent whether bh points to leaf or stuffed dinode
362 *
363 * Returns: IS_LEAF, IS_DINODE, or -errno
364 */
365
366static int dirent_first(struct gfs2_inode *dip, struct buffer_head *bh,
367 struct gfs2_dirent **dent)
368{
369 struct gfs2_meta_header *h = (struct gfs2_meta_header *)bh->b_data;
370
371 if (be16_to_cpu(h->mh_type) == GFS2_METATYPE_LF) {
372 if (gfs2_meta_check(dip->i_sbd, bh))
373 return -EIO;
374 *dent = (struct gfs2_dirent *)(bh->b_data +
375 sizeof(struct gfs2_leaf));
376 return IS_LEAF;
377 } else {
378 if (gfs2_metatype_check(dip->i_sbd, bh, GFS2_METATYPE_DI))
379 return -EIO;
380 *dent = (struct gfs2_dirent *)(bh->b_data +
381 sizeof(struct gfs2_dinode));
382 return IS_DINODE;
383 }
384}
385
386/**
387 * dirent_next - Next dirent
388 * @dip: the directory
389 * @bh: The buffer
390 * @dent: Pointer to list of dirents
391 *
392 * Returns: 0 on success, error code otherwise
393 */
394
395static int dirent_next(struct gfs2_inode *dip, struct buffer_head *bh,
396 struct gfs2_dirent **dent)
397{
398 struct gfs2_dirent *tmp, *cur;
399 char *bh_end;
400 uint16_t cur_rec_len;
401
402 cur = *dent;
403 bh_end = bh->b_data + bh->b_size;
404 cur_rec_len = be16_to_cpu(cur->de_rec_len);
405
406 if ((char *)cur + cur_rec_len >= bh_end) {
407 if ((char *)cur + cur_rec_len > bh_end) {
408 gfs2_consist_inode(dip);
409 return -EIO;
410 }
411 return -ENOENT;
412 }
413
414 tmp = (struct gfs2_dirent *)((char *)cur + cur_rec_len);
415
416 if ((char *)tmp + be16_to_cpu(tmp->de_rec_len) > bh_end) {
417 gfs2_consist_inode(dip);
418 return -EIO;
419 }
420
421 if (cur_rec_len == 0) {
422 gfs2_consist_inode(dip);
423 return -EIO;
424 }
425
426 /* Only the first dent could ever have de_inum.no_addr == 0 */
427 if (!tmp->de_inum.no_addr) {
428 gfs2_consist_inode(dip);
429 return -EIO;
430 }
431
432 *dent = tmp;
433
434 return 0;
435}
436
437/**
438 * dirent_del - Delete a dirent
439 * @dip: The GFS2 inode
440 * @bh: The buffer
441 * @prev: The previous dirent
442 * @cur: The current dirent
443 *
444 */
445
446static void dirent_del(struct gfs2_inode *dip, struct buffer_head *bh,
447 struct gfs2_dirent *prev, struct gfs2_dirent *cur)
448{
449 uint16_t cur_rec_len, prev_rec_len;
450
451 if (!cur->de_inum.no_addr) {
452 gfs2_consist_inode(dip);
453 return;
454 }
455
456 gfs2_trans_add_bh(dip->i_gl, bh, 1);
457
458 /* If there is no prev entry, this is the first entry in the block.
459 The de_rec_len is already as big as it needs to be. Just zero
460 out the inode number and return. */
461
462 if (!prev) {
463 cur->de_inum.no_addr = 0; /* No endianess worries */
464 return;
465 }
466
467 /* Combine this dentry with the previous one. */
468
469 prev_rec_len = be16_to_cpu(prev->de_rec_len);
470 cur_rec_len = be16_to_cpu(cur->de_rec_len);
471
472 if ((char *)prev + prev_rec_len != (char *)cur)
473 gfs2_consist_inode(dip);
474 if ((char *)cur + cur_rec_len > bh->b_data + bh->b_size)
475 gfs2_consist_inode(dip);
476
477 prev_rec_len += cur_rec_len;
478 prev->de_rec_len = cpu_to_be16(prev_rec_len);
479}
480
481/**
482 * gfs2_dirent_alloc - Allocate a directory entry
483 * @dip: The GFS2 inode
484 * @bh: The buffer
485 * @name_len: The length of the name
486 * @dent_out: Pointer to list of dirents
487 *
488 * Returns: 0 on success, error code otherwise
489 */
490
491int gfs2_dirent_alloc(struct gfs2_inode *dip, struct buffer_head *bh,
492 int name_len, struct gfs2_dirent **dent_out)
493{
494 struct gfs2_dirent *dent, *new;
495 unsigned int rec_len = GFS2_DIRENT_SIZE(name_len);
496 unsigned int entries = 0, offset = 0;
497 int type;
498
499 type = dirent_first(dip, bh, &dent);
500 if (type < 0)
501 return type;
502
503 if (type == IS_LEAF) {
504 struct gfs2_leaf *leaf = (struct gfs2_leaf *)bh->b_data;
505 entries = be16_to_cpu(leaf->lf_entries);
506 offset = sizeof(struct gfs2_leaf);
507 } else {
508 struct gfs2_dinode *dinode = (struct gfs2_dinode *)bh->b_data;
509 entries = be32_to_cpu(dinode->di_entries);
510 offset = sizeof(struct gfs2_dinode);
511 }
512
513 if (!entries) {
514 if (dent->de_inum.no_addr) {
515 gfs2_consist_inode(dip);
516 return -EIO;
517 }
518
519 gfs2_trans_add_bh(dip->i_gl, bh, 1);
520
521 dent->de_rec_len = cpu_to_be16(bh->b_size - offset);
522 dent->de_name_len = cpu_to_be16(name_len);
523
524 *dent_out = dent;
525 return 0;
526 }
527
528 do {
529 uint16_t cur_rec_len;
530 uint16_t cur_name_len;
531
532 cur_rec_len = be16_to_cpu(dent->de_rec_len);
533 cur_name_len = be16_to_cpu(dent->de_name_len);
534
535 if ((!dent->de_inum.no_addr && cur_rec_len >= rec_len) ||
536 (cur_rec_len >= GFS2_DIRENT_SIZE(cur_name_len) + rec_len)) {
537 gfs2_trans_add_bh(dip->i_gl, bh, 1);
538
539 if (dent->de_inum.no_addr) {
540 new = (struct gfs2_dirent *)((char *)dent +
541 GFS2_DIRENT_SIZE(cur_name_len));
542 memset(new, 0, sizeof(struct gfs2_dirent));
543
544 new->de_rec_len = cpu_to_be16(cur_rec_len - GFS2_DIRENT_SIZE(cur_name_len));
545 new->de_name_len = cpu_to_be16(name_len);
546
547 dent->de_rec_len = cpu_to_be16(cur_rec_len - be16_to_cpu(new->de_rec_len));
548
549 *dent_out = new;
550 return 0;
551 }
552
553 dent->de_name_len = cpu_to_be16(name_len);
554
555 *dent_out = dent;
556 return 0;
557 }
558 } while (dirent_next(dip, bh, &dent) == 0);
559
560 return -ENOSPC;
561}
562
563/**
564 * dirent_fits - See if we can fit a entry in this buffer
565 * @dip: The GFS2 inode
566 * @bh: The buffer
567 * @name_len: The length of the name
568 *
569 * Returns: 1 if it can fit, 0 otherwise
570 */
571
572static int dirent_fits(struct gfs2_inode *dip, struct buffer_head *bh,
573 int name_len)
574{
575 struct gfs2_dirent *dent;
576 unsigned int rec_len = GFS2_DIRENT_SIZE(name_len);
577 unsigned int entries = 0;
578 int type;
579
580 type = dirent_first(dip, bh, &dent);
581 if (type < 0)
582 return type;
583
584 if (type == IS_LEAF) {
585 struct gfs2_leaf *leaf = (struct gfs2_leaf *)bh->b_data;
586 entries = be16_to_cpu(leaf->lf_entries);
587 } else {
588 struct gfs2_dinode *dinode = (struct gfs2_dinode *)bh->b_data;
589 entries = be32_to_cpu(dinode->di_entries);
590 }
591
592 if (!entries)
593 return 1;
594
595 do {
596 uint16_t cur_rec_len;
597 uint32_t cur_name_len;
598
599 cur_rec_len = be16_to_cpu(dent->de_rec_len);
600 cur_name_len = be16_to_cpu(dent->de_name_len);
601
602 if ((!dent->de_inum.no_addr && cur_rec_len >= rec_len) ||
603 (cur_rec_len >= GFS2_DIRENT_SIZE(cur_name_len) + rec_len))
604 return 1;
605 } while (dirent_next(dip, bh, &dent) == 0);
606
607 return 0;
608}
609
/* Search one leaf or stuffed-dinode block for @filename.  On a match,
   *dent_out receives the entry and, if @dent_prev is non-NULL, *dent_prev
   receives the preceding entry (for later dirent_del).  Returns 0 on a
   match, -ENOENT if absent, or -errno. */
static int leaf_search(struct gfs2_inode *dip, struct buffer_head *bh,
		       struct qstr *filename, struct gfs2_dirent **dent_out,
		       struct gfs2_dirent **dent_prev)
{
	uint32_t hash;
	struct gfs2_dirent *dent, *prev = NULL;
	unsigned int entries = 0;
	int type;

	type = dirent_first(dip, bh, &dent);
	if (type < 0)
		return type;

	/* NOTE(review): entries is computed here but not used below. */
	if (type == IS_LEAF) {
		struct gfs2_leaf *leaf = (struct gfs2_leaf *)bh->b_data;
		entries = be16_to_cpu(leaf->lf_entries);
	} else if (type == IS_DINODE) {
		struct gfs2_dinode *dinode = (struct gfs2_dinode *)bh->b_data;
		entries = be32_to_cpu(dinode->di_entries);
	}

	hash = gfs2_disk_hash(filename->name, filename->len);

	do {
		/* Skip a deleted first entry (no_addr == 0). */
		if (!dent->de_inum.no_addr) {
			prev = dent;
			continue;
		}

		/* Compare the cheap hash first, then the full name. */
		if (be32_to_cpu(dent->de_hash) == hash &&
		    gfs2_filecmp(filename, (char *)(dent + 1),
				 be16_to_cpu(dent->de_name_len))) {
			*dent_out = dent;
			if (dent_prev)
				*dent_prev = prev;

			return 0;
		}

		prev = dent;
	} while (dirent_next(dip, bh, &dent) == 0);

	return -ENOENT;
}
654
655static int get_leaf(struct gfs2_inode *dip, uint64_t leaf_no,
656 struct buffer_head **bhp)
657{
658 int error;
659
660 error = gfs2_meta_read(dip->i_gl, leaf_no, DIO_START | DIO_WAIT, bhp);
661 if (!error && gfs2_metatype_check(dip->i_sbd, *bhp, GFS2_METATYPE_LF))
662 error = -EIO;
663
664 return error;
665}
666
667/**
668 * get_leaf_nr - Get a leaf number associated with the index
669 * @dip: The GFS2 inode
670 * @index:
671 * @leaf_out:
672 *
673 * Returns: 0 on success, error code otherwise
674 */
675
676static int get_leaf_nr(struct gfs2_inode *dip, uint32_t index,
677 uint64_t *leaf_out)
678{
679 uint64_t leaf_no;
680 int error;
681
682 error = gfs2_dir_read_data(dip, (char *)&leaf_no,
683 index * sizeof(uint64_t),
684 sizeof(uint64_t));
685 if (error != sizeof(uint64_t))
686 return (error < 0) ? error : -EIO;
687
688 *leaf_out = be64_to_cpu(leaf_no);
689
690 return 0;
691}
692
/* Look up the leaf number at hash-table @index and read that leaf into
   *bh_out.  Returns 0 on success or -errno. */
static int get_first_leaf(struct gfs2_inode *dip, uint32_t index,
			  struct buffer_head **bh_out)
{
	uint64_t leaf_no;
	int error;

	error = get_leaf_nr(dip, index, &leaf_no);
	if (error)
		return error;
	return get_leaf(dip, leaf_no, bh_out);
}
705
706static int get_next_leaf(struct gfs2_inode *dip, struct buffer_head *bh_in,
707 struct buffer_head **bh_out)
708{
709 struct gfs2_leaf *leaf;
710 int error;
711
712 leaf = (struct gfs2_leaf *)bh_in->b_data;
713
714 if (!leaf->lf_next)
715 error = -ENOENT;
716 else
717 error = get_leaf(dip, be64_to_cpu(leaf->lf_next), bh_out);
718
719 return error;
720}
721
/* Search the chain of leaves that the hash of @filename maps to.  On a
   match, *bh_out holds a reference to the leaf containing the entry and
   *dent_out/*dent_prev are set as in leaf_search().  Returns 0, -ENOENT,
   or -errno. */
static int linked_leaf_search(struct gfs2_inode *dip, struct qstr *filename,
			      struct gfs2_dirent **dent_out,
			      struct gfs2_dirent **dent_prev,
			      struct buffer_head **bh_out)
{
	struct buffer_head *bh = NULL, *bh_next;
	uint32_t hsize, index;
	uint32_t hash;
	int error;

	/* The directory data must be exactly one hash table of
	   2^di_depth leaf pointers. */
	hsize = 1 << dip->i_di.di_depth;
	if (hsize * sizeof(uint64_t) != dip->i_di.di_size) {
		gfs2_consist_inode(dip);
		return -EIO;
	}

	/* Figure out the address of the leaf node. */

	hash = gfs2_disk_hash(filename->name, filename->len);
	index = hash >> (32 - dip->i_di.di_depth);

	error = get_first_leaf(dip, index, &bh_next);
	if (error)
		return error;

	/* Find the entry */

	do {
		/* Drop the previous leaf before moving on; on the first
		   pass bh is NULL, relying on brelse(NULL) being a no-op. */
		brelse(bh);

		bh = bh_next;

		error = leaf_search(dip, bh, filename, dent_out, dent_prev);
		switch (error) {
		case 0:
			/* Found: hand the leaf reference to the caller. */
			*bh_out = bh;
			return 0;

		case -ENOENT:
			break;

		default:
			brelse(bh);
			return error;
		}

		/* Not in this leaf; follow the lf_next chain. */
		error = get_next_leaf(dip, bh, &bh_next);
	}
	while (!error);

	brelse(bh);

	return error;
}
776
777/**
778 * dir_make_exhash - Convert a stuffed directory into an ExHash directory
779 * @dip: The GFS2 inode
780 *
781 * Returns: 0 on success, error code otherwise
782 */
783
784static int dir_make_exhash(struct gfs2_inode *dip)
785{
786 struct gfs2_sbd *sdp = dip->i_sbd;
787 struct gfs2_dirent *dent;
788 struct buffer_head *bh, *dibh;
789 struct gfs2_leaf *leaf;
790 int y;
791 uint32_t x;
792 uint64_t *lp, bn;
793 int error;
794
795 error = gfs2_meta_inode_buffer(dip, &dibh);
796 if (error)
797 return error;
798
799 /* Allocate a new block for the first leaf node */
800
801 bn = gfs2_alloc_meta(dip);
802
803 /* Turn over a new leaf */
804
805 bh = gfs2_meta_new(dip->i_gl, bn);
806 gfs2_trans_add_bh(dip->i_gl, bh, 1);
807 gfs2_metatype_set(bh, GFS2_METATYPE_LF, GFS2_FORMAT_LF);
808 gfs2_buffer_clear_tail(bh, sizeof(struct gfs2_meta_header));
809
810 /* Fill in the leaf structure */
811
812 leaf = (struct gfs2_leaf *)bh->b_data;
813
814 gfs2_assert(sdp, dip->i_di.di_entries < (1 << 16));
815
816 leaf->lf_dirent_format = cpu_to_be32(GFS2_FORMAT_DE);
817 leaf->lf_entries = cpu_to_be16(dip->i_di.di_entries);
818
819 /* Copy dirents */
820
821 gfs2_buffer_copy_tail(bh, sizeof(struct gfs2_leaf), dibh,
822 sizeof(struct gfs2_dinode));
823
824 /* Find last entry */
825
826 x = 0;
827 dirent_first(dip, bh, &dent);
828
829 do {
830 if (!dent->de_inum.no_addr)
831 continue;
832 if (++x == dip->i_di.di_entries)
833 break;
834 }
835 while (dirent_next(dip, bh, &dent) == 0);
836
837 /* Adjust the last dirent's record length
838 (Remember that dent still points to the last entry.) */
839
840 dent->de_rec_len = cpu_to_be16(be16_to_cpu(dent->de_rec_len) +
841 sizeof(struct gfs2_dinode) -
842 sizeof(struct gfs2_leaf));
843
844 brelse(bh);
845
846 /* We're done with the new leaf block, now setup the new
847 hash table. */
848
849 gfs2_trans_add_bh(dip->i_gl, dibh, 1);
850 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
851
852 lp = (uint64_t *)(dibh->b_data + sizeof(struct gfs2_dinode));
853
854 for (x = sdp->sd_hash_ptrs; x--; lp++)
855 *lp = cpu_to_be64(bn);
856
857 dip->i_di.di_size = sdp->sd_sb.sb_bsize / 2;
858 dip->i_di.di_blocks++;
859 dip->i_di.di_flags |= GFS2_DIF_EXHASH;
860 dip->i_di.di_payload_format = 0;
861
862 for (x = sdp->sd_hash_ptrs, y = -1; x; x >>= 1, y++) ;
863 dip->i_di.di_depth = y;
864
865 gfs2_dinode_out(&dip->i_di, dibh->b_data);
866
867 brelse(dibh);
868
869 return 0;
870}
871
872/**
873 * dir_split_leaf - Split a leaf block into two
874 * @dip: The GFS2 inode
875 * @index:
876 * @leaf_no:
877 *
878 * Returns: 0 on success, error code on failure
879 */
880
881static int dir_split_leaf(struct gfs2_inode *dip, uint32_t index,
882 uint64_t leaf_no)
883{
884 struct buffer_head *nbh, *obh, *dibh;
885 struct gfs2_leaf *nleaf, *oleaf;
886 struct gfs2_dirent *dent, *prev = NULL, *next = NULL, *new;
887 uint32_t start, len, half_len, divider;
888 uint64_t bn, *lp;
889 uint32_t name_len;
890 int x, moved = 0;
891 int error;
892
893 /* Allocate the new leaf block */
894
895 bn = gfs2_alloc_meta(dip);
896
897 /* Get the new leaf block */
898
899 nbh = gfs2_meta_new(dip->i_gl, bn);
900 gfs2_trans_add_bh(dip->i_gl, nbh, 1);
901 gfs2_metatype_set(nbh, GFS2_METATYPE_LF, GFS2_FORMAT_LF);
902 gfs2_buffer_clear_tail(nbh, sizeof(struct gfs2_meta_header));
903
904 nleaf = (struct gfs2_leaf *)nbh->b_data;
905
906 nleaf->lf_dirent_format = cpu_to_be32(GFS2_FORMAT_DE);
907
908 /* Get the old leaf block */
909
910 error = get_leaf(dip, leaf_no, &obh);
911 if (error)
912 goto fail;
913
914 gfs2_trans_add_bh(dip->i_gl, obh, 1);
915
916 oleaf = (struct gfs2_leaf *)obh->b_data;
917
918 /* Compute the start and len of leaf pointers in the hash table. */
919
920 len = 1 << (dip->i_di.di_depth - be16_to_cpu(oleaf->lf_depth));
921 half_len = len >> 1;
922 if (!half_len) {
923 gfs2_consist_inode(dip);
924 error = -EIO;
925 goto fail_brelse;
926 }
927
928 start = (index & ~(len - 1));
929
930 /* Change the pointers.
931 Don't bother distinguishing stuffed from non-stuffed.
932 This code is complicated enough already. */
933
934 lp = kcalloc(half_len, sizeof(uint64_t), GFP_KERNEL | __GFP_NOFAIL);
935
936 error = gfs2_dir_read_data(dip, (char *)lp, start * sizeof(uint64_t),
937 half_len * sizeof(uint64_t));
938 if (error != half_len * sizeof(uint64_t)) {
939 if (error >= 0)
940 error = -EIO;
941 goto fail_lpfree;
942 }
943
944 /* Change the pointers */
945
946 for (x = 0; x < half_len; x++)
947 lp[x] = cpu_to_be64(bn);
948
949 error = gfs2_dir_write_data(dip, (char *)lp, start * sizeof(uint64_t),
950 half_len * sizeof(uint64_t));
951 if (error != half_len * sizeof(uint64_t)) {
952 if (error >= 0)
953 error = -EIO;
954 goto fail_lpfree;
955 }
956
957 kfree(lp);
958
959 /* Compute the divider */
960
961 divider = (start + half_len) << (32 - dip->i_di.di_depth);
962
963 /* Copy the entries */
964
965 dirent_first(dip, obh, &dent);
966
967 do {
968 next = dent;
969 if (dirent_next(dip, obh, &next))
970 next = NULL;
971
972 if (dent->de_inum.no_addr &&
973 be32_to_cpu(dent->de_hash) < divider) {
974 name_len = be16_to_cpu(dent->de_name_len);
975
976 gfs2_dirent_alloc(dip, nbh, name_len, &new);
977
978 new->de_inum = dent->de_inum; /* No endian worries */
979 new->de_hash = dent->de_hash; /* No endian worries */
980 new->de_type = dent->de_type; /* No endian worries */
981 memcpy((char *)(new + 1), (char *)(dent + 1),
982 name_len);
983
984 nleaf->lf_entries = be16_to_cpu(nleaf->lf_entries)+1;
985 nleaf->lf_entries = cpu_to_be16(nleaf->lf_entries);
986
987 dirent_del(dip, obh, prev, dent);
988
989 if (!oleaf->lf_entries)
990 gfs2_consist_inode(dip);
991 oleaf->lf_entries = be16_to_cpu(oleaf->lf_entries)-1;
992 oleaf->lf_entries = cpu_to_be16(oleaf->lf_entries);
993
994 if (!prev)
995 prev = dent;
996
997 moved = 1;
998 } else
999 prev = dent;
1000
1001 dent = next;
1002 }
1003 while (dent);
1004
1005 /* If none of the entries got moved into the new leaf,
1006 artificially fill in the first entry. */
1007
1008 if (!moved) {
1009 gfs2_dirent_alloc(dip, nbh, 0, &new);
1010 new->de_inum.no_addr = 0;
1011 }
1012
1013 oleaf->lf_depth = be16_to_cpu(oleaf->lf_depth) + 1;
1014 oleaf->lf_depth = cpu_to_be16(oleaf->lf_depth);
1015 nleaf->lf_depth = oleaf->lf_depth;
1016
1017 error = gfs2_meta_inode_buffer(dip, &dibh);
1018 if (!gfs2_assert_withdraw(dip->i_sbd, !error)) {
1019 dip->i_di.di_blocks++;
1020 gfs2_dinode_out(&dip->i_di, dibh->b_data);
1021 brelse(dibh);
1022 }
1023
1024 brelse(obh);
1025 brelse(nbh);
1026
1027 return error;
1028
1029 fail_lpfree:
1030 kfree(lp);
1031
1032 fail_brelse:
1033 brelse(obh);
1034
1035 fail:
1036 brelse(nbh);
1037 return error;
1038}
1039
1040/**
1041 * dir_double_exhash - Double size of ExHash table
1042 * @dip: The GFS2 dinode
1043 *
1044 * Returns: 0 on success, error code on failure
1045 */
1046
1047static int dir_double_exhash(struct gfs2_inode *dip)
1048{
1049 struct gfs2_sbd *sdp = dip->i_sbd;
1050 struct buffer_head *dibh;
1051 uint32_t hsize;
1052 uint64_t *buf;
1053 uint64_t *from, *to;
1054 uint64_t block;
1055 int x;
1056 int error = 0;
1057
1058 hsize = 1 << dip->i_di.di_depth;
1059 if (hsize * sizeof(uint64_t) != dip->i_di.di_size) {
1060 gfs2_consist_inode(dip);
1061 return -EIO;
1062 }
1063
1064 /* Allocate both the "from" and "to" buffers in one big chunk */
1065
1066 buf = kcalloc(3, sdp->sd_hash_bsize, GFP_KERNEL | __GFP_NOFAIL);
1067
1068 for (block = dip->i_di.di_size >> sdp->sd_hash_bsize_shift; block--;) {
1069 error = gfs2_dir_read_data(dip, (char *)buf,
1070 block * sdp->sd_hash_bsize,
1071 sdp->sd_hash_bsize);
1072 if (error != sdp->sd_hash_bsize) {
1073 if (error >= 0)
1074 error = -EIO;
1075 goto fail;
1076 }
1077
1078 from = buf;
1079 to = (uint64_t *)((char *)buf + sdp->sd_hash_bsize);
1080
1081 for (x = sdp->sd_hash_ptrs; x--; from++) {
1082 *to++ = *from; /* No endianess worries */
1083 *to++ = *from;
1084 }
1085
1086 error = gfs2_dir_write_data(dip,
1087 (char *)buf + sdp->sd_hash_bsize,
1088 block * sdp->sd_sb.sb_bsize,
1089 sdp->sd_sb.sb_bsize);
1090 if (error != sdp->sd_sb.sb_bsize) {
1091 if (error >= 0)
1092 error = -EIO;
1093 goto fail;
1094 }
1095 }
1096
1097 kfree(buf);
1098
1099 error = gfs2_meta_inode_buffer(dip, &dibh);
1100 if (!gfs2_assert_withdraw(sdp, !error)) {
1101 dip->i_di.di_depth++;
1102 gfs2_dinode_out(&dip->i_di, dibh->b_data);
1103 brelse(dibh);
1104 }
1105
1106 return error;
1107
1108 fail:
1109 kfree(buf);
1110
1111 return error;
1112}
1113
1114/**
1115 * compare_dents - compare directory entries by hash value
1116 * @a: first dent
1117 * @b: second dent
1118 *
1119 * When comparing the hash entries of @a to @b:
1120 * gt: returns 1
1121 * lt: returns -1
1122 * eq: returns 0
1123 */
1124
1125static int compare_dents(const void *a, const void *b)
1126{
1127 struct gfs2_dirent *dent_a, *dent_b;
1128 uint32_t hash_a, hash_b;
1129 int ret = 0;
1130
1131 dent_a = *(struct gfs2_dirent **)a;
1132 hash_a = dent_a->de_hash;
1133 hash_a = be32_to_cpu(hash_a);
1134
1135 dent_b = *(struct gfs2_dirent **)b;
1136 hash_b = dent_b->de_hash;
1137 hash_b = be32_to_cpu(hash_b);
1138
1139 if (hash_a > hash_b)
1140 ret = 1;
1141 else if (hash_a < hash_b)
1142 ret = -1;
1143 else {
1144 unsigned int len_a = be16_to_cpu(dent_a->de_name_len);
1145 unsigned int len_b = be16_to_cpu(dent_b->de_name_len);
1146
1147 if (len_a > len_b)
1148 ret = 1;
1149 else if (len_a < len_b)
1150 ret = -1;
1151 else
1152 ret = memcmp((char *)(dent_a + 1),
1153 (char *)(dent_b + 1),
1154 len_a);
1155 }
1156
1157 return ret;
1158}
1159
1160/**
1161 * do_filldir_main - read out directory entries
1162 * @dip: The GFS2 inode
1163 * @offset: The offset in the file to read from
1164 * @opaque: opaque data to pass to filldir
1165 * @filldir: The function to pass entries to
1166 * @darr: an array of struct gfs2_dirent pointers to read
1167 * @entries: the number of entries in darr
1168 * @copied: pointer to int that's non-zero if a entry has been copied out
1169 *
1170 * Jump through some hoops to make sure that if there are hash collsions,
1171 * they are read out at the beginning of a buffer. We want to minimize
1172 * the possibility that they will fall into different readdir buffers or
1173 * that someone will want to seek to that location.
1174 *
1175 * Returns: errno, >0 on exception from filldir
1176 */
1177
1178static int do_filldir_main(struct gfs2_inode *dip, uint64_t *offset,
1179 void *opaque, gfs2_filldir_t filldir,
1180 struct gfs2_dirent **darr, uint32_t entries,
1181 int *copied)
1182{
1183 struct gfs2_dirent *dent, *dent_next;
1184 struct gfs2_inum inum;
1185 uint64_t off, off_next;
1186 unsigned int x, y;
1187 int run = 0;
1188 int error = 0;
1189
1190 sort(darr, entries, sizeof(struct gfs2_dirent *), compare_dents, NULL);
1191
1192 dent_next = darr[0];
1193 off_next = be32_to_cpu(dent_next->de_hash);
1194 off_next = gfs2_disk_hash2offset(off_next);
1195
1196 for (x = 0, y = 1; x < entries; x++, y++) {
1197 dent = dent_next;
1198 off = off_next;
1199
1200 if (y < entries) {
1201 dent_next = darr[y];
1202 off_next = be32_to_cpu(dent_next->de_hash);
1203 off_next = gfs2_disk_hash2offset(off_next);
1204
1205 if (off < *offset)
1206 continue;
1207 *offset = off;
1208
1209 if (off_next == off) {
1210 if (*copied && !run)
1211 return 1;
1212 run = 1;
1213 } else
1214 run = 0;
1215 } else {
1216 if (off < *offset)
1217 continue;
1218 *offset = off;
1219 }
1220
1221 gfs2_inum_in(&inum, (char *)&dent->de_inum);
1222
1223 error = filldir(opaque, (char *)(dent + 1),
1224 be16_to_cpu(dent->de_name_len),
1225 off, &inum,
1226 be16_to_cpu(dent->de_type));
1227 if (error)
1228 return 1;
1229
1230 *copied = 1;
1231 }
1232
1233 /* Increment the *offset by one, so the next time we come into the
1234 do_filldir fxn, we get the next entry instead of the last one in the
1235 current leaf */
1236
1237 (*offset)++;
1238
1239 return 0;
1240}
1241
1242/**
1243 * do_filldir_single - Read directory entries out of a single block
1244 * @dip: The GFS2 inode
1245 * @offset: The offset in the file to read from
1246 * @opaque: opaque data to pass to filldir
1247 * @filldir: The function to pass entries to
1248 * @bh: the block
1249 * @entries: the number of entries in the block
1250 * @copied: pointer to int that's non-zero if a entry has been copied out
1251 *
1252 * Returns: errno, >0 on exception from filldir
1253 */
1254
1255static int do_filldir_single(struct gfs2_inode *dip, uint64_t *offset,
1256 void *opaque, gfs2_filldir_t filldir,
1257 struct buffer_head *bh, uint32_t entries,
1258 int *copied)
1259{
1260 struct gfs2_dirent **darr;
1261 struct gfs2_dirent *de;
1262 unsigned int e = 0;
1263 int error;
1264
1265 if (!entries)
1266 return 0;
1267
1268 darr = kcalloc(entries, sizeof(struct gfs2_dirent *), GFP_KERNEL);
1269 if (!darr)
1270 return -ENOMEM;
1271
1272 dirent_first(dip, bh, &de);
1273 do {
1274 if (!de->de_inum.no_addr)
1275 continue;
1276 if (e >= entries) {
1277 gfs2_consist_inode(dip);
1278 error = -EIO;
1279 goto out;
1280 }
1281 darr[e++] = de;
1282 }
1283 while (dirent_next(dip, bh, &de) == 0);
1284
1285 if (e != entries) {
1286 gfs2_consist_inode(dip);
1287 error = -EIO;
1288 goto out;
1289 }
1290
1291 error = do_filldir_main(dip, offset, opaque, filldir, darr,
1292 entries, copied);
1293
1294 out:
1295 kfree(darr);
1296
1297 return error;
1298}
1299
1300/**
1301 * do_filldir_multi - Read directory entries out of a linked leaf list
1302 * @dip: The GFS2 inode
1303 * @offset: The offset in the file to read from
1304 * @opaque: opaque data to pass to filldir
1305 * @filldir: The function to pass entries to
1306 * @bh: the first leaf in the list
1307 * @copied: pointer to int that's non-zero if a entry has been copied out
1308 *
1309 * Returns: errno, >0 on exception from filldir
1310 */
1311
1312static int do_filldir_multi(struct gfs2_inode *dip, uint64_t *offset,
1313 void *opaque, gfs2_filldir_t filldir,
1314 struct buffer_head *bh, int *copied)
1315{
1316 struct buffer_head **larr = NULL;
1317 struct gfs2_dirent **darr;
1318 struct gfs2_leaf *leaf;
1319 struct buffer_head *tmp_bh;
1320 struct gfs2_dirent *de;
1321 unsigned int entries, e = 0;
1322 unsigned int leaves = 0, l = 0;
1323 unsigned int x;
1324 uint64_t ln;
1325 int error = 0;
1326
1327 /* Count leaves and entries */
1328
1329 leaf = (struct gfs2_leaf *)bh->b_data;
1330 entries = be16_to_cpu(leaf->lf_entries);
1331 ln = leaf->lf_next;
1332
1333 while (ln) {
1334 ln = be64_to_cpu(ln);
1335
1336 error = get_leaf(dip, ln, &tmp_bh);
1337 if (error)
1338 return error;
1339
1340 leaf = (struct gfs2_leaf *)tmp_bh->b_data;
1341 if (leaf->lf_entries) {
1342 entries += be16_to_cpu(leaf->lf_entries);
1343 leaves++;
1344 }
1345 ln = leaf->lf_next;
1346
1347 brelse(tmp_bh);
1348 }
1349
1350 if (!entries)
1351 return 0;
1352
1353 if (leaves) {
1354 larr = kcalloc(leaves, sizeof(struct buffer_head *),GFP_KERNEL);
1355 if (!larr)
1356 return -ENOMEM;
1357 }
1358
1359 darr = kcalloc(entries, sizeof(struct gfs2_dirent *), GFP_KERNEL);
1360 if (!darr) {
1361 kfree(larr);
1362 return -ENOMEM;
1363 }
1364
1365 leaf = (struct gfs2_leaf *)bh->b_data;
1366 if (leaf->lf_entries) {
1367 dirent_first(dip, bh, &de);
1368 do {
1369 if (!de->de_inum.no_addr)
1370 continue;
1371 if (e >= entries) {
1372 gfs2_consist_inode(dip);
1373 error = -EIO;
1374 goto out;
1375 }
1376 darr[e++] = de;
1377 }
1378 while (dirent_next(dip, bh, &de) == 0);
1379 }
1380 ln = leaf->lf_next;
1381
1382 while (ln) {
1383 ln = be64_to_cpu(ln);
1384
1385 error = get_leaf(dip, ln, &tmp_bh);
1386 if (error)
1387 goto out;
1388
1389 leaf = (struct gfs2_leaf *)tmp_bh->b_data;
1390 if (leaf->lf_entries) {
1391 dirent_first(dip, tmp_bh, &de);
1392 do {
1393 if (!de->de_inum.no_addr)
1394 continue;
1395 if (e >= entries) {
1396 gfs2_consist_inode(dip);
1397 error = -EIO;
1398 goto out;
1399 }
1400 darr[e++] = de;
1401 }
1402 while (dirent_next(dip, tmp_bh, &de) == 0);
1403
1404 larr[l++] = tmp_bh;
1405
1406 ln = leaf->lf_next;
1407 } else {
1408 ln = leaf->lf_next;
1409 brelse(tmp_bh);
1410 }
1411 }
1412
1413 if (gfs2_assert_withdraw(dip->i_sbd, l == leaves)) {
1414 error = -EIO;
1415 goto out;
1416 }
1417 if (e != entries) {
1418 gfs2_consist_inode(dip);
1419 error = -EIO;
1420 goto out;
1421 }
1422
1423 error = do_filldir_main(dip, offset, opaque, filldir, darr,
1424 entries, copied);
1425
1426 out:
1427 kfree(darr);
1428 for (x = 0; x < l; x++)
1429 brelse(larr[x]);
1430 kfree(larr);
1431
1432 return error;
1433}
1434
1435/**
1436 * dir_e_search - Search exhash (leaf) dir for inode matching name
1437 * @dip: The GFS2 inode
1438 * @filename: Filename string
1439 * @inode: If non-NULL, function fills with formal inode # and block address
1440 * @type: If non-NULL, function fills with DT_... dinode type
1441 *
1442 * Returns:
1443 */
1444
1445static int dir_e_search(struct gfs2_inode *dip, struct qstr *filename,
1446 struct gfs2_inum *inum, unsigned int *type)
1447{
1448 struct buffer_head *bh;
1449 struct gfs2_dirent *dent;
1450 int error;
1451
1452 error = linked_leaf_search(dip, filename, &dent, NULL, &bh);
1453 if (error)
1454 return error;
1455
1456 if (inum)
1457 gfs2_inum_in(inum, (char *)&dent->de_inum);
1458 if (type)
1459 *type = be16_to_cpu(dent->de_type);
1460
1461 brelse(bh);
1462
1463 return 0;
1464}
1465
/**
 * dir_e_add - Add a new entry to an exhash directory
 * @dip: The GFS2 directory inode
 * @filename: The name to add
 * @inum: The inode number of the entry
 * @type: The DT_... type of the entry
 *
 * If the target leaf is full, try in order: splitting the leaf,
 * doubling the hash table, following the leaf chain, and finally
 * appending a brand new leaf to the chain.
 *
 * Returns: errno
 */

static int dir_e_add(struct gfs2_inode *dip, struct qstr *filename,
		     struct gfs2_inum *inum, unsigned int type)
{
	struct buffer_head *bh, *nbh, *dibh;
	struct gfs2_leaf *leaf, *nleaf;
	struct gfs2_dirent *dent;
	uint32_t hsize, index;
	uint32_t hash;
	uint64_t leaf_no, bn;
	int error;

 restart:
	/* The hash table size recorded in the dinode must match the depth */
	hsize = 1 << dip->i_di.di_depth;
	if (hsize * sizeof(uint64_t) != dip->i_di.di_size) {
		gfs2_consist_inode(dip);
		return -EIO;
	}

	/* Figure out the address of the leaf node. */

	hash = gfs2_disk_hash(filename->name, filename->len);
	index = hash >> (32 - dip->i_di.di_depth);

	error = get_leaf_nr(dip, index, &leaf_no);
	if (error)
		return error;

	/* Add entry to the leaf */

	for (;;) {
		error = get_leaf(dip, leaf_no, &bh);
		if (error)
			return error;

		leaf = (struct gfs2_leaf *)bh->b_data;

		if (gfs2_dirent_alloc(dip, bh, filename->len, &dent)) {

			if (be16_to_cpu(leaf->lf_depth) < dip->i_di.di_depth) {
				/* Can we split the leaf? */

				brelse(bh);

				error = dir_split_leaf(dip, index, leaf_no);
				if (error)
					return error;

				/* Layout changed; recompute from the top */
				goto restart;

			} else if (dip->i_di.di_depth < GFS2_DIR_MAX_DEPTH) {
				/* Can we double the hash table? */

				brelse(bh);

				error = dir_double_exhash(dip);
				if (error)
					return error;

				goto restart;

			} else if (leaf->lf_next) {
				/* Can we try the next leaf in the list? */
				leaf_no = be64_to_cpu(leaf->lf_next);
				brelse(bh);
				continue;

			} else {
				/* Create a new leaf and add it to the list. */

				bn = gfs2_alloc_meta(dip);

				nbh = gfs2_meta_new(dip->i_gl, bn);
				gfs2_trans_add_bh(dip->i_gl, nbh, 1);
				gfs2_metatype_set(nbh,
						  GFS2_METATYPE_LF,
						  GFS2_FORMAT_LF);
				gfs2_buffer_clear_tail(nbh,
					sizeof(struct gfs2_meta_header));

				gfs2_trans_add_bh(dip->i_gl, bh, 1);
				leaf->lf_next = cpu_to_be64(bn);

				nleaf = (struct gfs2_leaf *)nbh->b_data;
				nleaf->lf_depth = leaf->lf_depth;
				nleaf->lf_dirent_format = cpu_to_be32(GFS2_FORMAT_DE);

				/* Allocating in a freshly initialized leaf
				   cannot fail */
				gfs2_dirent_alloc(dip, nbh, filename->len,
						  &dent);

				dip->i_di.di_blocks++;

				brelse(bh);

				bh = nbh;
				leaf = nleaf;
			}
		}

		/* If the gfs2_dirent_alloc() succeeded, it pinned the "bh" */

		gfs2_inum_out(inum, (char *)&dent->de_inum);
		dent->de_hash = cpu_to_be32(hash);
		dent->de_type = cpu_to_be16(type);
		memcpy((char *)(dent + 1), filename->name, filename->len);

		/* Bump the leaf's entry count (kept big-endian on disk) */
		leaf->lf_entries = be16_to_cpu(leaf->lf_entries) + 1;
		leaf->lf_entries = cpu_to_be16(leaf->lf_entries);

		brelse(bh);

		error = gfs2_meta_inode_buffer(dip, &dibh);
		if (error)
			return error;

		/* Update the dinode's entry count and timestamps */
		dip->i_di.di_entries++;
		dip->i_di.di_mtime = dip->i_di.di_ctime = get_seconds();

		gfs2_trans_add_bh(dip->i_gl, dibh, 1);
		gfs2_dinode_out(&dip->i_di, dibh->b_data);
		brelse(dibh);

		return 0;
	}

	return -ENOENT;
}
1592
/**
 * dir_e_del - Delete an entry from an exhash directory
 * @dip: The GFS2 directory inode
 * @filename: The name to delete
 *
 * Returns: errno
 */

static int dir_e_del(struct gfs2_inode *dip, struct qstr *filename)
{
	struct buffer_head *bh, *dibh;
	struct gfs2_dirent *dent, *prev;
	struct gfs2_leaf *leaf;
	unsigned int entries;
	int error;

	error = linked_leaf_search(dip, filename, &dent, &prev, &bh);
	if (error == -ENOENT) {
		/* The caller should only delete entries that exist; a miss
		   here means the directory is inconsistent */
		gfs2_consist_inode(dip);
		return -EIO;
	}
	if (error)
		return error;

	dirent_del(dip, bh, prev, dent);	/* Pins bh */

	/* Decrement the entry count of the leaf that held the dirent */
	leaf = (struct gfs2_leaf *)bh->b_data;
	entries = be16_to_cpu(leaf->lf_entries);
	if (!entries)
		gfs2_consist_inode(dip);
	entries--;
	leaf->lf_entries = cpu_to_be16(entries);

	brelse(bh);

	error = gfs2_meta_inode_buffer(dip, &dibh);
	if (error)
		return error;

	/* Update the dinode's entry count and timestamps */
	if (!dip->i_di.di_entries)
		gfs2_consist_inode(dip);
	dip->i_di.di_entries--;
	dip->i_di.di_mtime = dip->i_di.di_ctime = get_seconds();

	gfs2_trans_add_bh(dip->i_gl, dibh, 1);
	gfs2_dinode_out(&dip->i_di, dibh->b_data);
	brelse(dibh);

	return 0;
}
1635
1636/**
1637 * dir_e_read - Reads the entries from a directory into a filldir buffer
1638 * @dip: dinode pointer
1639 * @offset: the hash of the last entry read shifted to the right once
1640 * @opaque: buffer for the filldir function to fill
1641 * @filldir: points to the filldir function to use
1642 *
1643 * Returns: errno
1644 */
1645
1646static int dir_e_read(struct gfs2_inode *dip, uint64_t *offset, void *opaque,
1647 gfs2_filldir_t filldir)
1648{
1649 struct gfs2_sbd *sdp = dip->i_sbd;
1650 struct buffer_head *bh;
1651 struct gfs2_leaf leaf;
1652 uint32_t hsize, len;
1653 uint32_t ht_offset, lp_offset, ht_offset_cur = -1;
1654 uint32_t hash, index;
1655 uint64_t *lp;
1656 int copied = 0;
1657 int error = 0;
1658
1659 hsize = 1 << dip->i_di.di_depth;
1660 if (hsize * sizeof(uint64_t) != dip->i_di.di_size) {
1661 gfs2_consist_inode(dip);
1662 return -EIO;
1663 }
1664
1665 hash = gfs2_dir_offset2hash(*offset);
1666 index = hash >> (32 - dip->i_di.di_depth);
1667
1668 lp = kmalloc(sdp->sd_hash_bsize, GFP_KERNEL);
1669 if (!lp)
1670 return -ENOMEM;
1671
1672 while (index < hsize) {
1673 lp_offset = index & (sdp->sd_hash_ptrs - 1);
1674 ht_offset = index - lp_offset;
1675
1676 if (ht_offset_cur != ht_offset) {
1677 error = gfs2_dir_read_data(dip, (char *)lp,
1678 ht_offset * sizeof(uint64_t),
1679 sdp->sd_hash_bsize);
1680 if (error != sdp->sd_hash_bsize) {
1681 if (error >= 0)
1682 error = -EIO;
1683 goto out;
1684 }
1685 ht_offset_cur = ht_offset;
1686 }
1687
1688 error = get_leaf(dip, be64_to_cpu(lp[lp_offset]), &bh);
1689 if (error)
1690 goto out;
1691
1692 gfs2_leaf_in(&leaf, bh->b_data);
1693
1694 if (leaf.lf_next)
1695 error = do_filldir_multi(dip, offset, opaque, filldir,
1696 bh, &copied);
1697 else
1698 error = do_filldir_single(dip, offset, opaque, filldir,
1699 bh, leaf.lf_entries, &copied);
1700
1701 brelse(bh);
1702
1703 if (error) {
1704 if (error > 0)
1705 error = 0;
1706 goto out;
1707 }
1708
1709 len = 1 << (dip->i_di.di_depth - leaf.lf_depth);
1710 index = (index & ~(len - 1)) + len;
1711 }
1712
1713 out:
1714 kfree(lp);
1715
1716 return error;
1717}
1718
/**
 * dir_e_mvino - Change the inode number/type of an exhash dirent in place
 * @dip: The GFS2 directory inode
 * @filename: The name of the entry to update
 * @inum: The new inode number for the entry
 * @new_type: The new DT_... type for the entry
 *
 * Returns: errno
 */

static int dir_e_mvino(struct gfs2_inode *dip, struct qstr *filename,
		       struct gfs2_inum *inum, unsigned int new_type)
{
	struct buffer_head *bh, *dibh;
	struct gfs2_dirent *dent;
	int error;

	error = linked_leaf_search(dip, filename, &dent, NULL, &bh);
	if (error == -ENOENT) {
		/* The entry being renamed must exist */
		gfs2_consist_inode(dip);
		return -EIO;
	}
	if (error)
		return error;

	/* Join the leaf buffer to the transaction before modifying it */
	gfs2_trans_add_bh(dip->i_gl, bh, 1);

	gfs2_inum_out(inum, (char *)&dent->de_inum);
	dent->de_type = cpu_to_be16(new_type);

	brelse(bh);

	error = gfs2_meta_inode_buffer(dip, &dibh);
	if (error)
		return error;

	dip->i_di.di_mtime = dip->i_di.di_ctime = get_seconds();

	gfs2_trans_add_bh(dip->i_gl, dibh, 1);
	gfs2_dinode_out(&dip->i_di, dibh->b_data);
	brelse(dibh);

	return 0;
}
1753
1754/**
1755 * dir_l_search - Search linear (stuffed dinode) dir for inode matching name
1756 * @dip: The GFS2 inode
1757 * @filename: Filename string
1758 * @inode: If non-NULL, function fills with formal inode # and block address
1759 * @type: If non-NULL, function fills with DT_... dinode type
1760 *
1761 * Returns:
1762 */
1763
1764static int dir_l_search(struct gfs2_inode *dip, struct qstr *filename,
1765 struct gfs2_inum *inum, unsigned int *type)
1766{
1767 struct buffer_head *dibh;
1768 struct gfs2_dirent *dent;
1769 int error;
1770
1771 if (!gfs2_is_stuffed(dip)) {
1772 gfs2_consist_inode(dip);
1773 return -EIO;
1774 }
1775
1776 error = gfs2_meta_inode_buffer(dip, &dibh);
1777 if (error)
1778 return error;
1779
1780 error = leaf_search(dip, dibh, filename, &dent, NULL);
1781 if (!error) {
1782 if (inum)
1783 gfs2_inum_in(inum, (char *)&dent->de_inum);
1784 if (type)
1785 *type = be16_to_cpu(dent->de_type);
1786 }
1787
1788 brelse(dibh);
1789
1790 return error;
1791}
1792
/**
 * dir_l_add - Add an entry to a linear (stuffed) directory
 * @dip: The GFS2 directory inode
 * @filename: The name to add
 * @inum: The inode number of the entry
 * @type: The DT_... type of the entry
 *
 * If the dinode block is full, convert the directory to exhash form
 * and retry through dir_e_add().
 *
 * Returns: errno
 */

static int dir_l_add(struct gfs2_inode *dip, struct qstr *filename,
		     struct gfs2_inum *inum, unsigned int type)
{
	struct buffer_head *dibh;
	struct gfs2_dirent *dent;
	int error;

	if (!gfs2_is_stuffed(dip)) {
		gfs2_consist_inode(dip);
		return -EIO;
	}

	error = gfs2_meta_inode_buffer(dip, &dibh);
	if (error)
		return error;

	if (gfs2_dirent_alloc(dip, dibh, filename->len, &dent)) {
		/* No room left in the dinode: convert to exhash and retry */
		brelse(dibh);

		error = dir_make_exhash(dip);
		if (!error)
			error = dir_e_add(dip, filename, inum, type);

		return error;
	}

	/* gfs2_dirent_alloc() pins */

	gfs2_inum_out(inum, (char *)&dent->de_inum);
	dent->de_hash = gfs2_disk_hash(filename->name, filename->len);
	dent->de_hash = cpu_to_be32(dent->de_hash);
	dent->de_type = cpu_to_be16(type);
	memcpy((char *)(dent + 1), filename->name, filename->len);

	dip->i_di.di_entries++;
	dip->i_di.di_mtime = dip->i_di.di_ctime = get_seconds();

	gfs2_dinode_out(&dip->i_di, dibh->b_data);
	brelse(dibh);

	return 0;
}
1835
/**
 * dir_l_del - Delete an entry from a linear (stuffed) directory
 * @dip: The GFS2 directory inode
 * @filename: The name to delete
 *
 * Returns: errno
 */

static int dir_l_del(struct gfs2_inode *dip, struct qstr *filename)
{
	struct buffer_head *dibh;
	struct gfs2_dirent *dent, *prev;
	int error;

	if (!gfs2_is_stuffed(dip)) {
		gfs2_consist_inode(dip);
		return -EIO;
	}

	error = gfs2_meta_inode_buffer(dip, &dibh);
	if (error)
		return error;

	error = leaf_search(dip, dibh, filename, &dent, &prev);
	if (error == -ENOENT) {
		/* The entry being deleted must exist */
		gfs2_consist_inode(dip);
		error = -EIO;
		goto out;
	}
	if (error)
		goto out;

	dirent_del(dip, dibh, prev, dent);

	/* dirent_del() pins */

	if (!dip->i_di.di_entries)
		gfs2_consist_inode(dip);
	dip->i_di.di_entries--;

	dip->i_di.di_mtime = dip->i_di.di_ctime = get_seconds();

	gfs2_dinode_out(&dip->i_di, dibh->b_data);

 out:
	brelse(dibh);

	return error;
}
1877
1878static int dir_l_read(struct gfs2_inode *dip, uint64_t *offset, void *opaque,
1879 gfs2_filldir_t filldir)
1880{
1881 struct buffer_head *dibh;
1882 int copied = 0;
1883 int error;
1884
1885 if (!gfs2_is_stuffed(dip)) {
1886 gfs2_consist_inode(dip);
1887 return -EIO;
1888 }
1889
1890 if (!dip->i_di.di_entries)
1891 return 0;
1892
1893 error = gfs2_meta_inode_buffer(dip, &dibh);
1894 if (error)
1895 return error;
1896
1897 error = do_filldir_single(dip, offset,
1898 opaque, filldir,
1899 dibh, dip->i_di.di_entries,
1900 &copied);
1901 if (error > 0)
1902 error = 0;
1903
1904 brelse(dibh);
1905
1906 return error;
1907}
1908
/**
 * dir_l_mvino - Change the inode number/type of a stuffed dirent in place
 * @dip: The GFS2 directory inode
 * @filename: The name of the entry to update
 * @inum: The new inode number for the entry
 * @new_type: The new DT_... type for the entry
 *
 * Returns: errno
 */

static int dir_l_mvino(struct gfs2_inode *dip, struct qstr *filename,
		       struct gfs2_inum *inum, unsigned int new_type)
{
	struct buffer_head *dibh;
	struct gfs2_dirent *dent;
	int error;

	if (!gfs2_is_stuffed(dip)) {
		gfs2_consist_inode(dip);
		return -EIO;
	}

	error = gfs2_meta_inode_buffer(dip, &dibh);
	if (error)
		return error;

	error = leaf_search(dip, dibh, filename, &dent, NULL);
	if (error == -ENOENT) {
		/* The entry being renamed must exist */
		gfs2_consist_inode(dip);
		error = -EIO;
		goto out;
	}
	if (error)
		goto out;

	/* Join the dinode buffer to the transaction before modifying it */
	gfs2_trans_add_bh(dip->i_gl, dibh, 1);

	gfs2_inum_out(inum, (char *)&dent->de_inum);
	dent->de_type = cpu_to_be16(new_type);

	dip->i_di.di_mtime = dip->i_di.di_ctime = get_seconds();

	gfs2_dinode_out(&dip->i_di, dibh->b_data);

 out:
	brelse(dibh);

	return error;
}
1948
1949/**
1950 * gfs2_dir_search - Search a directory
1951 * @dip: The GFS2 inode
1952 * @filename:
1953 * @inode:
1954 *
1955 * This routine searches a directory for a file or another directory.
1956 * Assumes a glock is held on dip.
1957 *
1958 * Returns: errno
1959 */
1960
1961int gfs2_dir_search(struct gfs2_inode *dip, struct qstr *filename,
1962 struct gfs2_inum *inum, unsigned int *type)
1963{
1964 int error;
1965
1966 if (dip->i_di.di_flags & GFS2_DIF_EXHASH)
1967 error = dir_e_search(dip, filename, inum, type);
1968 else
1969 error = dir_l_search(dip, filename, inum, type);
1970
1971 return error;
1972}
1973
1974/**
1975 * gfs2_dir_add - Add new filename into directory
1976 * @dip: The GFS2 inode
1977 * @filename: The new name
1978 * @inode: The inode number of the entry
1979 * @type: The type of the entry
1980 *
1981 * Returns: 0 on success, error code on failure
1982 */
1983
1984int gfs2_dir_add(struct gfs2_inode *dip, struct qstr *filename,
1985 struct gfs2_inum *inum, unsigned int type)
1986{
1987 int error;
1988
1989 if (dip->i_di.di_flags & GFS2_DIF_EXHASH)
1990 error = dir_e_add(dip, filename, inum, type);
1991 else
1992 error = dir_l_add(dip, filename, inum, type);
1993
1994 return error;
1995}
1996
1997/**
1998 * gfs2_dir_del - Delete a directory entry
1999 * @dip: The GFS2 inode
2000 * @filename: The filename
2001 *
2002 * Returns: 0 on success, error code on failure
2003 */
2004
2005int gfs2_dir_del(struct gfs2_inode *dip, struct qstr *filename)
2006{
2007 int error;
2008
2009 if (dip->i_di.di_flags & GFS2_DIF_EXHASH)
2010 error = dir_e_del(dip, filename);
2011 else
2012 error = dir_l_del(dip, filename);
2013
2014 return error;
2015}
2016
2017int gfs2_dir_read(struct gfs2_inode *dip, uint64_t *offset, void *opaque,
2018 gfs2_filldir_t filldir)
2019{
2020 int error;
2021
2022 if (dip->i_di.di_flags & GFS2_DIF_EXHASH)
2023 error = dir_e_read(dip, offset, opaque, filldir);
2024 else
2025 error = dir_l_read(dip, offset, opaque, filldir);
2026
2027 return error;
2028}
2029
2030/**
2031 * gfs2_dir_mvino - Change inode number of directory entry
2032 * @dip: The GFS2 inode
2033 * @filename:
2034 * @new_inode:
2035 *
2036 * This routine changes the inode number of a directory entry. It's used
2037 * by rename to change ".." when a directory is moved.
2038 * Assumes a glock is held on dvp.
2039 *
2040 * Returns: errno
2041 */
2042
2043int gfs2_dir_mvino(struct gfs2_inode *dip, struct qstr *filename,
2044 struct gfs2_inum *inum, unsigned int new_type)
2045{
2046 int error;
2047
2048 if (dip->i_di.di_flags & GFS2_DIF_EXHASH)
2049 error = dir_e_mvino(dip, filename, inum, new_type);
2050 else
2051 error = dir_l_mvino(dip, filename, inum, new_type);
2052
2053 return error;
2054}
2055
2056/**
2057 * foreach_leaf - call a function for each leaf in a directory
2058 * @dip: the directory
2059 * @lc: the function to call for each each
2060 * @data: private data to pass to it
2061 *
2062 * Returns: errno
2063 */
2064
2065static int foreach_leaf(struct gfs2_inode *dip, leaf_call_t lc, void *data)
2066{
2067 struct gfs2_sbd *sdp = dip->i_sbd;
2068 struct buffer_head *bh;
2069 struct gfs2_leaf leaf;
2070 uint32_t hsize, len;
2071 uint32_t ht_offset, lp_offset, ht_offset_cur = -1;
2072 uint32_t index = 0;
2073 uint64_t *lp;
2074 uint64_t leaf_no;
2075 int error = 0;
2076
2077 hsize = 1 << dip->i_di.di_depth;
2078 if (hsize * sizeof(uint64_t) != dip->i_di.di_size) {
2079 gfs2_consist_inode(dip);
2080 return -EIO;
2081 }
2082
2083 lp = kmalloc(sdp->sd_hash_bsize, GFP_KERNEL);
2084 if (!lp)
2085 return -ENOMEM;
2086
2087 while (index < hsize) {
2088 lp_offset = index & (sdp->sd_hash_ptrs - 1);
2089 ht_offset = index - lp_offset;
2090
2091 if (ht_offset_cur != ht_offset) {
2092 error = gfs2_dir_read_data(dip, (char *)lp,
2093 ht_offset * sizeof(uint64_t),
2094 sdp->sd_hash_bsize);
2095 if (error != sdp->sd_hash_bsize) {
2096 if (error >= 0)
2097 error = -EIO;
2098 goto out;
2099 }
2100 ht_offset_cur = ht_offset;
2101 }
2102
2103 leaf_no = be64_to_cpu(lp[lp_offset]);
2104 if (leaf_no) {
2105 error = get_leaf(dip, leaf_no, &bh);
2106 if (error)
2107 goto out;
2108 gfs2_leaf_in(&leaf, bh->b_data);
2109 brelse(bh);
2110
2111 len = 1 << (dip->i_di.di_depth - leaf.lf_depth);
2112
2113 error = lc(dip, index, len, leaf_no, data);
2114 if (error)
2115 goto out;
2116
2117 index = (index & ~(len - 1)) + len;
2118 } else
2119 index++;
2120 }
2121
2122 if (index != hsize) {
2123 gfs2_consist_inode(dip);
2124 error = -EIO;
2125 }
2126
2127 out:
2128 kfree(lp);
2129
2130 return error;
2131}
2132
2133/**
2134 * leaf_dealloc - Deallocate a directory leaf
2135 * @dip: the directory
2136 * @index: the hash table offset in the directory
2137 * @len: the number of pointers to this leaf
2138 * @leaf_no: the leaf number
2139 * @data: not used
2140 *
2141 * Returns: errno
2142 */
2143
2144static int leaf_dealloc(struct gfs2_inode *dip, uint32_t index, uint32_t len,
2145 uint64_t leaf_no, void *data)
2146{
2147 struct gfs2_sbd *sdp = dip->i_sbd;
2148 struct gfs2_leaf tmp_leaf;
2149 struct gfs2_rgrp_list rlist;
2150 struct buffer_head *bh, *dibh;
2151 uint64_t blk;
2152 unsigned int rg_blocks = 0, l_blocks = 0;
2153 char *ht;
2154 unsigned int x, size = len * sizeof(uint64_t);
2155 int error;
2156
2157 memset(&rlist, 0, sizeof(struct gfs2_rgrp_list));
2158
2159 ht = kzalloc(size, GFP_KERNEL);
2160 if (!ht)
2161 return -ENOMEM;
2162
2163 gfs2_alloc_get(dip);
2164
2165 error = gfs2_quota_hold(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
2166 if (error)
2167 goto out;
2168
2169 error = gfs2_rindex_hold(sdp, &dip->i_alloc.al_ri_gh);
2170 if (error)
2171 goto out_qs;
2172
2173 /* Count the number of leaves */
2174
2175 for (blk = leaf_no; blk; blk = tmp_leaf.lf_next) {
2176 error = get_leaf(dip, blk, &bh);
2177 if (error)
2178 goto out_rlist;
2179 gfs2_leaf_in(&tmp_leaf, (bh)->b_data);
2180 brelse(bh);
2181
2182 gfs2_rlist_add(sdp, &rlist, blk);
2183 l_blocks++;
2184 }
2185
2186 gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE, 0);
2187
2188 for (x = 0; x < rlist.rl_rgrps; x++) {
2189 struct gfs2_rgrpd *rgd;
2190 rgd = get_gl2rgd(rlist.rl_ghs[x].gh_gl);
2191 rg_blocks += rgd->rd_ri.ri_length;
2192 }
2193
2194 error = gfs2_glock_nq_m(rlist.rl_rgrps, rlist.rl_ghs);
2195 if (error)
2196 goto out_rlist;
2197
2198 error = gfs2_trans_begin(sdp,
2199 rg_blocks + (DIV_RU(size, sdp->sd_jbsize) + 1) +
2200 RES_DINODE + RES_STATFS + RES_QUOTA, l_blocks);
2201 if (error)
2202 goto out_rg_gunlock;
2203
2204 for (blk = leaf_no; blk; blk = tmp_leaf.lf_next) {
2205 error = get_leaf(dip, blk, &bh);
2206 if (error)
2207 goto out_end_trans;
2208 gfs2_leaf_in(&tmp_leaf, bh->b_data);
2209 brelse(bh);
2210
2211 gfs2_free_meta(dip, blk, 1);
2212
2213 if (!dip->i_di.di_blocks)
2214 gfs2_consist_inode(dip);
2215 dip->i_di.di_blocks--;
2216 }
2217
2218 error = gfs2_dir_write_data(dip, ht, index * sizeof(uint64_t), size);
2219 if (error != size) {
2220 if (error >= 0)
2221 error = -EIO;
2222 goto out_end_trans;
2223 }
2224
2225 error = gfs2_meta_inode_buffer(dip, &dibh);
2226 if (error)
2227 goto out_end_trans;
2228
2229 gfs2_trans_add_bh(dip->i_gl, dibh, 1);
2230 gfs2_dinode_out(&dip->i_di, dibh->b_data);
2231 brelse(dibh);
2232
2233 out_end_trans:
2234 gfs2_trans_end(sdp);
2235
2236 out_rg_gunlock:
2237 gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs);
2238
2239 out_rlist:
2240 gfs2_rlist_free(&rlist);
2241 gfs2_glock_dq_uninit(&dip->i_alloc.al_ri_gh);
2242
2243 out_qs:
2244 gfs2_quota_unhold(dip);
2245
2246 out:
2247 gfs2_alloc_put(dip);
2248 kfree(ht);
2249
2250 return error;
2251}
2252
2253/**
2254 * gfs2_dir_exhash_dealloc - free all the leaf blocks in a directory
2255 * @dip: the directory
2256 *
2257 * Dealloc all on-disk directory leaves to FREEMETA state
2258 * Change on-disk inode type to "regular file"
2259 *
2260 * Returns: errno
2261 */
2262
2263int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip)
2264{
2265 struct gfs2_sbd *sdp = dip->i_sbd;
2266 struct buffer_head *bh;
2267 int error;
2268
2269 /* Dealloc on-disk leaves to FREEMETA state */
2270 error = foreach_leaf(dip, leaf_dealloc, NULL);
2271 if (error)
2272 return error;
2273
2274 /* Make this a regular file in case we crash.
2275 (We don't want to free these blocks a second time.) */
2276
2277 error = gfs2_trans_begin(sdp, RES_DINODE, 0);
2278 if (error)
2279 return error;
2280
2281 error = gfs2_meta_inode_buffer(dip, &bh);
2282 if (!error) {
2283 gfs2_trans_add_bh(dip->i_gl, bh, 1);
2284 ((struct gfs2_dinode *)bh->b_data)->di_mode = cpu_to_be32(S_IFREG);
2285 brelse(bh);
2286 }
2287
2288 gfs2_trans_end(sdp);
2289
2290 return error;
2291}
2292
2293/**
2294 * gfs2_diradd_alloc_required - find if adding entry will require an allocation
2295 * @ip: the file being written to
2296 * @filname: the filename that's going to be added
2297 * @alloc_required: set to 1 if an alloc is required, 0 otherwise
2298 *
2299 * Returns: errno
2300 */
2301
2302int gfs2_diradd_alloc_required(struct gfs2_inode *dip, struct qstr *filename,
2303 int *alloc_required)
2304{
2305 struct buffer_head *bh = NULL, *bh_next;
2306 uint32_t hsize, hash, index;
2307 int error = 0;
2308
2309 *alloc_required = 0;
2310
2311 if (dip->i_di.di_flags & GFS2_DIF_EXHASH) {
2312 hsize = 1 << dip->i_di.di_depth;
2313 if (hsize * sizeof(uint64_t) != dip->i_di.di_size) {
2314 gfs2_consist_inode(dip);
2315 return -EIO;
2316 }
2317
2318 hash = gfs2_disk_hash(filename->name, filename->len);
2319 index = hash >> (32 - dip->i_di.di_depth);
2320
2321 error = get_first_leaf(dip, index, &bh_next);
2322 if (error)
2323 return error;
2324
2325 do {
2326 brelse(bh);
2327
2328 bh = bh_next;
2329
2330 if (dirent_fits(dip, bh, filename->len))
2331 break;
2332
2333 error = get_next_leaf(dip, bh, &bh_next);
2334 if (error == -ENOENT) {
2335 *alloc_required = 1;
2336 error = 0;
2337 break;
2338 }
2339 }
2340 while (!error);
2341
2342 brelse(bh);
2343 } else {
2344 error = gfs2_meta_inode_buffer(dip, &bh);
2345 if (error)
2346 return error;
2347
2348 if (!dirent_fits(dip, bh, filename->len))
2349 *alloc_required = 1;
2350
2351 brelse(bh);
2352 }
2353
2354 return error;
2355}
2356
diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h
new file mode 100644
index 000000000000..5b01497b3ab3
--- /dev/null
+++ b/fs/gfs2/dir.h
@@ -0,0 +1,51 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __DIR_DOT_H__
11#define __DIR_DOT_H__
12
13/**
14 * gfs2_filldir_t - Report a directory entry to the caller of gfs2_dir_read()
15 * @opaque: opaque data used by the function
16 * @name: the name of the directory entry
17 * @length: the length of the name
18 * @offset: the entry's offset in the directory
19 * @inum: the inode number the entry points to
20 * @type: the type of inode the entry points to
21 *
22 * Returns: 0 on success, 1 if buffer full
23 */
24
25typedef int (*gfs2_filldir_t) (void *opaque,
26 const char *name, unsigned int length,
27 uint64_t offset,
28 struct gfs2_inum *inum, unsigned int type);
29
30int gfs2_filecmp(struct qstr *file1, char *file2, int len_of_file2);
31int gfs2_dirent_alloc(struct gfs2_inode *dip, struct buffer_head *bh,
32 int name_len, struct gfs2_dirent **dent_out);
33
34int gfs2_dir_search(struct gfs2_inode *dip, struct qstr *filename,
35 struct gfs2_inum *inum, unsigned int *type);
36int gfs2_dir_add(struct gfs2_inode *dip, struct qstr *filename,
37 struct gfs2_inum *inum, unsigned int type);
38int gfs2_dir_del(struct gfs2_inode *dip, struct qstr *filename);
39int gfs2_dir_read(struct gfs2_inode *dip, uint64_t * offset, void *opaque,
40 gfs2_filldir_t filldir);
41int gfs2_dir_mvino(struct gfs2_inode *dip, struct qstr *filename,
42 struct gfs2_inum *new_inum, unsigned int new_type);
43
44int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip);
45
46int gfs2_diradd_alloc_required(struct gfs2_inode *dip, struct qstr *filename,
47 int *alloc_required);
48int gfs2_dir_get_buffer(struct gfs2_inode *ip, uint64_t block, int new,
49 struct buffer_head **bhp);
50
51#endif /* __DIR_DOT_H__ */
diff --git a/fs/gfs2/eaops.c b/fs/gfs2/eaops.c
new file mode 100644
index 000000000000..2914731250c5
--- /dev/null
+++ b/fs/gfs2/eaops.c
@@ -0,0 +1,185 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/xattr.h>
16#include <asm/semaphore.h>
17#include <asm/uaccess.h>
18
19#include "gfs2.h"
20#include "acl.h"
21#include "eaops.h"
22#include "eattr.h"
23
24/**
25 * gfs2_ea_name2type - get the type of the ea, and truncate type from the name
26 * @namep: ea name, possibly with type appended
27 *
28 * Returns: GFS2_EATYPE_XXX
29 */
30
31unsigned int gfs2_ea_name2type(const char *name, char **truncated_name)
32{
33 unsigned int type;
34
35 if (strncmp(name, "system.", 7) == 0) {
36 type = GFS2_EATYPE_SYS;
37 if (truncated_name)
38 *truncated_name = strchr(name, '.') + 1;
39 } else if (strncmp(name, "user.", 5) == 0) {
40 type = GFS2_EATYPE_USR;
41 if (truncated_name)
42 *truncated_name = strchr(name, '.') + 1;
43 } else {
44 type = GFS2_EATYPE_UNUSED;
45 if (truncated_name)
46 *truncated_name = NULL;
47 }
48
49 return type;
50}
51
52static int user_eo_get(struct gfs2_inode *ip, struct gfs2_ea_request *er)
53{
54 struct inode *inode = ip->i_vnode;
55 int error = permission(inode, MAY_READ, NULL);
56 if (error)
57 return error;
58
59 return gfs2_ea_get_i(ip, er);
60}
61
62static int user_eo_set(struct gfs2_inode *ip, struct gfs2_ea_request *er)
63{
64 struct inode *inode = ip->i_vnode;
65
66 if (S_ISREG(inode->i_mode) ||
67 (S_ISDIR(inode->i_mode) && !(inode->i_mode & S_ISVTX))) {
68 int error = permission(inode, MAY_WRITE, NULL);
69 if (error)
70 return error;
71 } else
72 return -EPERM;
73
74 return gfs2_ea_set_i(ip, er);
75}
76
77static int user_eo_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er)
78{
79 struct inode *inode = ip->i_vnode;
80
81 if (S_ISREG(inode->i_mode) ||
82 (S_ISDIR(inode->i_mode) && !(inode->i_mode & S_ISVTX))) {
83 int error = permission(inode, MAY_WRITE, NULL);
84 if (error)
85 return error;
86 } else
87 return -EPERM;
88
89 return gfs2_ea_remove_i(ip, er);
90}
91
92static int system_eo_get(struct gfs2_inode *ip, struct gfs2_ea_request *er)
93{
94 if (!GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len) &&
95 !GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len) &&
96 !capable(CAP_SYS_ADMIN))
97 return -EPERM;
98
99 if (ip->i_sbd->sd_args.ar_posix_acl == 0 &&
100 (GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len) ||
101 GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len)))
102 return -EOPNOTSUPP;
103
104
105
106 return gfs2_ea_get_i(ip, er);
107}
108
/*
 * Set a "system." xattr.  Only the two POSIX ACL names are settable;
 * everything else gets -EPERM.  gfs2_acl_validate_set() may decide the
 * ACL reduces to nothing (*remove set), in which case the attribute is
 * removed instead of (or after) being written.
 */
static int system_eo_set(struct gfs2_inode *ip, struct gfs2_ea_request *er)
{
	int remove = 0;
	int error;

	if (GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len)) {
		/* An access ACL can change the file mode; make sure the
		   request carries the current mode for validation. */
		if (!(er->er_flags & GFS2_ERF_MODE)) {
			er->er_mode = ip->i_di.di_mode;
			er->er_flags |= GFS2_ERF_MODE;
		}
		error = gfs2_acl_validate_set(ip, 1, er,
					      &remove, &er->er_mode);
		if (error)
			return error;
		error = gfs2_ea_set_i(ip, er);
		if (error)
			return error;
		/* Set first, then remove: the set updates the mode even
		   when the ACL itself is redundant.  Removal failure is
		   deliberately ignored here. */
		if (remove)
			gfs2_ea_remove_i(ip, er);
		return 0;

	} else if (GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len)) {
		error = gfs2_acl_validate_set(ip, 0, er,
					      &remove, NULL);
		if (error)
			return error;
		if (!remove)
			error = gfs2_ea_set_i(ip, er);
		else {
			/* A redundant default ACL that isn't stored isn't
			   an error. */
			error = gfs2_ea_remove_i(ip, er);
			if (error == -ENODATA)
				error = 0;
		}
		return error;
	}

	return -EPERM;
}
147
148static int system_eo_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er)
149{
150 if (GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len)) {
151 int error = gfs2_acl_validate_remove(ip, 1);
152 if (error)
153 return error;
154
155 } else if (GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len)) {
156 int error = gfs2_acl_validate_remove(ip, 0);
157 if (error)
158 return error;
159
160 } else
161 return -EPERM;
162
163 return gfs2_ea_remove_i(ip, er);
164}
165
/* Operations for the "user." xattr namespace */
struct gfs2_eattr_operations gfs2_user_eaops = {
	.eo_get = user_eo_get,
	.eo_set = user_eo_set,
	.eo_remove = user_eo_remove,
	.eo_name = "user",
};

/* Operations for the "system." xattr namespace (POSIX ACLs and friends) */
struct gfs2_eattr_operations gfs2_system_eaops = {
	.eo_get = system_eo_get,
	.eo_set = system_eo_set,
	.eo_remove = system_eo_remove,
	.eo_name = "system",
};

/* Indexed by GFS2_EATYPE_XXX; slot 0 (GFS2_EATYPE_UNUSED) is NULL, so
   callers must validate the type before dispatching through this table. */
struct gfs2_eattr_operations *gfs2_ea_ops[] = {
	NULL,
	&gfs2_user_eaops,
	&gfs2_system_eaops,
};
185
diff --git a/fs/gfs2/eaops.h b/fs/gfs2/eaops.h
new file mode 100644
index 000000000000..f83c497eddca
--- /dev/null
+++ b/fs/gfs2/eaops.h
@@ -0,0 +1,30 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __EAOPS_DOT_H__
11#define __EAOPS_DOT_H__
12
struct gfs2_ea_request;

/* Per-namespace extended attribute operations ("user", "system", ...).
   All three callbacks are called with the inode glock already held. */
struct gfs2_eattr_operations {
	int (*eo_get) (struct gfs2_inode *ip, struct gfs2_ea_request *er);
	int (*eo_set) (struct gfs2_inode *ip, struct gfs2_ea_request *er);
	int (*eo_remove) (struct gfs2_inode *ip, struct gfs2_ea_request *er);
	char *eo_name;	/* namespace prefix without the trailing '.' */
};

/* Map an attribute name to GFS2_EATYPE_XXX, stripping the prefix */
unsigned int gfs2_ea_name2type(const char *name, char **truncated_name);

extern struct gfs2_eattr_operations gfs2_user_eaops;
extern struct gfs2_eattr_operations gfs2_system_eaops;

/* Dispatch table indexed by GFS2_EATYPE_XXX; entry 0 is NULL */
extern struct gfs2_eattr_operations *gfs2_ea_ops[];
28
29#endif /* __EAOPS_DOT_H__ */
30
diff --git a/fs/gfs2/eattr.c b/fs/gfs2/eattr.c
new file mode 100644
index 000000000000..02e45c4ecbec
--- /dev/null
+++ b/fs/gfs2/eattr.c
@@ -0,0 +1,1563 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/xattr.h>
16#include <asm/semaphore.h>
17#include <asm/uaccess.h>
18
19#include "gfs2.h"
20#include "acl.h"
21#include "eaops.h"
22#include "eattr.h"
23#include "glock.h"
24#include "inode.h"
25#include "meta_io.h"
26#include "quota.h"
27#include "rgrp.h"
28#include "trans.h"
29
30/**
31 * ea_calc_size - returns the acutal number of bytes the request will take up
32 * (not counting any unstuffed data blocks)
33 * @sdp:
34 * @er:
35 * @size:
36 *
37 * Returns: 1 if the EA should be stuffed
38 */
39
40static int ea_calc_size(struct gfs2_sbd *sdp, struct gfs2_ea_request *er,
41 unsigned int *size)
42{
43 *size = GFS2_EAREQ_SIZE_STUFFED(er);
44 if (*size <= sdp->sd_jbsize)
45 return 1;
46
47 *size = GFS2_EAREQ_SIZE_UNSTUFFED(sdp, er);
48
49 return 0;
50}
51
52static int ea_check_size(struct gfs2_sbd *sdp, struct gfs2_ea_request *er)
53{
54 unsigned int size;
55
56 if (er->er_data_len > GFS2_EA_MAX_DATA_LEN)
57 return -ERANGE;
58
59 ea_calc_size(sdp, er, &size);
60
61 /* This can only happen with 512 byte blocks */
62 if (size > sdp->sd_jbsize)
63 return -ERANGE;
64
65 return 0;
66}
67
/* Callback invoked for every EA record in a block; a non-zero return
   stops the walk and is propagated to the caller. */
typedef int (*ea_call_t) (struct gfs2_inode *ip,
			  struct buffer_head *bh,
			  struct gfs2_ea_header *ea,
			  struct gfs2_ea_header *prev,
			  void *private);

/*
 * ea_foreach_i - walk every EA record in one EA block, calling @ea_call
 * on each.  Each record is validated before use: non-zero record length,
 * record fully inside the buffer, and a known type.  Any violation marks
 * the inode inconsistent and returns -EIO.
 */
static int ea_foreach_i(struct gfs2_inode *ip, struct buffer_head *bh,
			ea_call_t ea_call, void *data)
{
	struct gfs2_ea_header *ea, *prev = NULL;
	int error = 0;

	if (gfs2_metatype_check(ip->i_sbd, bh, GFS2_METATYPE_EA))
		return -EIO;

	for (ea = GFS2_EA_BH2FIRST(bh);; prev = ea, ea = GFS2_EA2NEXT(ea)) {
		/* A zero rec_len would make the walk loop forever */
		if (!GFS2_EA_REC_LEN(ea))
			goto fail;
		/* The whole record must lie within the buffer */
		if (!(bh->b_data <= (char *)ea &&
		      (char *)GFS2_EA2NEXT(ea) <=
		      bh->b_data + bh->b_size))
			goto fail;
		if (!GFS2_EATYPE_VALID(ea->ea_type))
			goto fail;

		error = ea_call(ip, bh, ea, prev, data);
		if (error)
			return error;

		if (GFS2_EA_IS_LAST(ea)) {
			/* The last record must end exactly at the
			   block boundary */
			if ((char *)GFS2_EA2NEXT(ea) !=
			    bh->b_data + bh->b_size)
				goto fail;
			break;
		}
	}

	return error;

 fail:
	gfs2_consist_inode(ip);
	return -EIO;
}
111
/*
 * ea_foreach - walk every EA record attached to an inode.
 *
 * Handles both layouts: di_eattr pointing directly at a single EA block,
 * or (GFS2_DIF_EA_INDIRECT) at an indirect block holding a list of EA
 * block numbers, terminated by the first zero pointer.
 */
static int ea_foreach(struct gfs2_inode *ip, ea_call_t ea_call, void *data)
{
	struct buffer_head *bh, *eabh;
	uint64_t *eablk, *end;
	int error;

	error = gfs2_meta_read(ip->i_gl, ip->i_di.di_eattr,
			       DIO_START | DIO_WAIT, &bh);
	if (error)
		return error;

	/* Direct layout: di_eattr is itself the (only) EA block */
	if (!(ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT)) {
		error = ea_foreach_i(ip, bh, ea_call, data);
		goto out;
	}

	if (gfs2_metatype_check(ip->i_sbd, bh, GFS2_METATYPE_IN)) {
		error = -EIO;
		goto out;
	}

	/* Indirect layout: iterate the pointer array after the header */
	eablk = (uint64_t *)(bh->b_data + sizeof(struct gfs2_meta_header));
	end = eablk + ip->i_sbd->sd_inptrs;

	for (; eablk < end; eablk++) {
		uint64_t bn;

		if (!*eablk)
			break;
		bn = be64_to_cpu(*eablk);

		error = gfs2_meta_read(ip->i_gl, bn, DIO_START | DIO_WAIT,
				       &eabh);
		if (error)
			break;
		error = ea_foreach_i(ip, eabh, ea_call, data);
		brelse(eabh);
		if (error)
			break;
	}
 out:
	brelse(bh);

	return error;
}
157
/* Private data for the ea_find_i() walk */
struct ea_find {
	struct gfs2_ea_request *ef_er;	/* what we're looking for */
	struct gfs2_ea_location *ef_el;	/* where the match is recorded */
};

/*
 * ea_find_i - ea_foreach callback that matches on (type, name).
 *
 * On a match, fills in the location (taking an extra reference on the
 * buffer head, which the caller must brelse) and returns 1 to stop the
 * walk; otherwise returns 0 to continue.
 */
static int ea_find_i(struct gfs2_inode *ip, struct buffer_head *bh,
		     struct gfs2_ea_header *ea, struct gfs2_ea_header *prev,
		     void *private)
{
	struct ea_find *ef = private;
	struct gfs2_ea_request *er = ef->ef_er;

	if (ea->ea_type == GFS2_EATYPE_UNUSED)
		return 0;

	if (ea->ea_type == er->er_type) {
		if (ea->ea_name_len == er->er_name_len &&
		    !memcmp(GFS2_EA2NAME(ea), er->er_name, ea->ea_name_len)) {
			struct gfs2_ea_location *el = ef->ef_el;
			get_bh(bh);
			el->el_bh = bh;
			el->el_ea = ea;
			el->el_prev = prev;
			return 1;
		}
	}

#if 0
	else if ((ip->i_di.di_flags & GFS2_DIF_EA_PACKED) &&
		 er->er_type == GFS2_EATYPE_SYS)
		return 1;
#endif

	return 0;
}
193
194int gfs2_ea_find(struct gfs2_inode *ip, struct gfs2_ea_request *er,
195 struct gfs2_ea_location *el)
196{
197 struct ea_find ef;
198 int error;
199
200 ef.ef_er = er;
201 ef.ef_el = el;
202
203 memset(el, 0, sizeof(struct gfs2_ea_location));
204
205 error = ea_foreach(ip, ea_find_i, &ef);
206 if (error > 0)
207 return 0;
208
209 return error;
210}
211
/**
 * ea_dealloc_unstuffed - free the data blocks of an unstuffed EA
 * @ip: the inode the EA belongs to
 * @bh: the EA block holding the record
 * @ea: the EA record whose data blocks are freed
 * @prev: the record preceding @ea in the block (may be NULL)
 * @private: non-NULL means "leave": don't merge @ea into @prev afterwards
 *
 * Take advantage of the fact that all unstuffed blocks are
 * allocated from the same RG. But watch, this may not always
 * be true.
 *
 * Returns: errno
 */

static int ea_dealloc_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh,
				struct gfs2_ea_header *ea,
				struct gfs2_ea_header *prev, void *private)
{
	int *leave = private;
	struct gfs2_sbd *sdp = ip->i_sbd;
	struct gfs2_rgrpd *rgd;
	struct gfs2_holder rg_gh;
	struct buffer_head *dibh;
	uint64_t *dataptrs, bn = 0;
	uint64_t bstart = 0;	/* start of the current free run */
	unsigned int blen = 0;	/* length of the current free run */
	unsigned int blks = 0;
	unsigned int x;
	int error;

	if (GFS2_EA_IS_STUFFED(ea))
		return 0;

	/* First pass: count the blocks to free (and remember one block
	   number so we can find the resource group). */
	dataptrs = GFS2_EA2DATAPTRS(ea);
	for (x = 0; x < ea->ea_num_ptrs; x++, dataptrs++)
		if (*dataptrs) {
			blks++;
			bn = be64_to_cpu(*dataptrs);
		}
	if (!blks)
		return 0;

	rgd = gfs2_blk2rgrpd(sdp, bn);
	if (!rgd) {
		gfs2_consist_inode(ip);
		return -EIO;
	}

	error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &rg_gh);
	if (error)
		return error;

	error = gfs2_trans_begin(sdp, rgd->rd_ri.ri_length +
				 RES_DINODE + RES_EATTR + RES_STATFS +
				 RES_QUOTA, blks);
	if (error)
		goto out_gunlock;

	gfs2_trans_add_bh(ip->i_gl, bh, 1);

	/* Second pass: zero the pointers and free the blocks, batching
	   contiguous runs into single gfs2_free_meta() calls. */
	dataptrs = GFS2_EA2DATAPTRS(ea);
	for (x = 0; x < ea->ea_num_ptrs; x++, dataptrs++) {
		if (!*dataptrs)
			break;
		bn = be64_to_cpu(*dataptrs);

		if (bstart + blen == bn)
			blen++;
		else {
			if (bstart)
				gfs2_free_meta(ip, bstart, blen);
			bstart = bn;
			blen = 1;
		}

		*dataptrs = 0;
		if (!ip->i_di.di_blocks)
			gfs2_consist_inode(ip);
		ip->i_di.di_blocks--;
	}
	if (bstart)
		gfs2_free_meta(ip, bstart, blen);

	/* Unless the caller asked us to leave the record in place, merge
	   it into the previous record or mark it unused. */
	if (prev && !leave) {
		uint32_t len;

		len = GFS2_EA_REC_LEN(prev) + GFS2_EA_REC_LEN(ea);
		prev->ea_rec_len = cpu_to_be32(len);

		if (GFS2_EA_IS_LAST(ea))
			prev->ea_flags |= GFS2_EAFLAG_LAST;
	} else {
		ea->ea_type = GFS2_EATYPE_UNUSED;
		ea->ea_num_ptrs = 0;
	}

	error = gfs2_meta_inode_buffer(ip, &dibh);
	if (!error) {
		ip->i_di.di_ctime = get_seconds();
		gfs2_trans_add_bh(ip->i_gl, dibh, 1);
		gfs2_dinode_out(&ip->i_di, dibh->b_data);
		brelse(dibh);
	}

	gfs2_trans_end(sdp);

 out_gunlock:
	gfs2_glock_dq_uninit(&rg_gh);

	return error;
}
324
/*
 * ea_remove_unstuffed - wrap ea_dealloc_unstuffed() with the quota and
 * rindex holds it needs.
 *
 * NOTE(review): when @leave is set, &error is passed as the callback's
 * private pointer purely as a convenient non-NULL value (the callback
 * only tests it against NULL); error is overwritten by the call's
 * return value.  Looks intentional but subtle — worth a second look.
 */
static int ea_remove_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh,
			       struct gfs2_ea_header *ea,
			       struct gfs2_ea_header *prev, int leave)
{
	struct gfs2_alloc *al;
	int error;

	al = gfs2_alloc_get(ip);

	error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
	if (error)
		goto out_alloc;

	error = gfs2_rindex_hold(ip->i_sbd, &al->al_ri_gh);
	if (error)
		goto out_quota;

	error = ea_dealloc_unstuffed(ip,
				     bh, ea, prev,
				     (leave) ? &error : NULL);

	gfs2_glock_dq_uninit(&al->al_ri_gh);

 out_quota:
	gfs2_quota_unhold(ip);

 out_alloc:
	gfs2_alloc_put(ip);

	return error;
}
356
357/******************************************************************************/
358
/* EA repacking is not implemented yet; gfs2_ea_repack() calls this stub. */
static int gfs2_ea_repack_i(struct gfs2_inode *ip)
{
	return -EOPNOTSUPP;
}
363
364int gfs2_ea_repack(struct gfs2_inode *ip)
365{
366 struct gfs2_holder gh;
367 int error;
368
369 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
370 if (error)
371 return error;
372
373 /* Some sort of permissions checking would be nice */
374
375 error = gfs2_ea_repack_i(ip);
376
377 gfs2_glock_dq_uninit(&gh);
378
379 return error;
380}
381
/* Private data for the ea_list_i() walk */
struct ea_list {
	struct gfs2_ea_request *ei_er;	/* er_data/er_data_len is the
					   caller's output buffer (may be
					   NULL/0 for a size-only query) */
	unsigned int ei_size;		/* bytes accumulated so far */
};

/*
 * ea_list_i - ea_foreach callback that appends "prefix.name\0" for each
 * attribute to the request buffer (or just totals the sizes when no
 * buffer was supplied).
 */
static int ea_list_i(struct gfs2_inode *ip, struct buffer_head *bh,
		     struct gfs2_ea_header *ea, struct gfs2_ea_header *prev,
		     void *private)
{
	struct ea_list *ei = private;
	struct gfs2_ea_request *er = ei->ei_er;
	unsigned int ea_size = GFS2_EA_STRLEN(ea);

	if (ea->ea_type == GFS2_EATYPE_UNUSED)
		return 0;

	if (er->er_data_len) {
		char *prefix;
		unsigned int l;
		char c = 0;

		if (ei->ei_size + ea_size > er->er_data_len)
			return -ERANGE;

		if (ea->ea_type == GFS2_EATYPE_USR) {
			prefix = "user.";
			l = 5;
		} else {
			prefix = "system.";
			l = 7;
		}

		/* Layout: prefix, then the raw name, then a NUL at the
		   end of the GFS2_EA_STRLEN-sized slot. */
		memcpy(er->er_data + ei->ei_size,
		       prefix, l);
		memcpy(er->er_data + ei->ei_size + l,
		       GFS2_EA2NAME(ea),
		       ea->ea_name_len);
		memcpy(er->er_data + ei->ei_size +
		       ea_size - 1,
		       &c, 1);
	}

	ei->ei_size += ea_size;

	return 0;
}
428
429/**
430 * gfs2_ea_list -
431 * @ip:
432 * @er:
433 *
434 * Returns: actual size of data on success, -errno on error
435 */
436
437int gfs2_ea_list(struct gfs2_inode *ip, struct gfs2_ea_request *er)
438{
439 struct gfs2_holder i_gh;
440 int error;
441
442 if (!er->er_data || !er->er_data_len) {
443 er->er_data = NULL;
444 er->er_data_len = 0;
445 }
446
447 error = gfs2_glock_nq_init(ip->i_gl,
448 LM_ST_SHARED, LM_FLAG_ANY,
449 &i_gh);
450 if (error)
451 return error;
452
453 if (ip->i_di.di_eattr) {
454 struct ea_list ei = { .ei_er = er, .ei_size = 0 };
455
456 error = ea_foreach(ip, ea_list_i, &ei);
457 if (!error)
458 error = ei.ei_size;
459 }
460
461 gfs2_glock_dq_uninit(&i_gh);
462
463 return error;
464}
465
/**
 * ea_get_unstuffed - actually copies the unstuffed data into the
 * request buffer
 * @ip: the inode
 * @ea: the EA record whose data pointers are read
 * @data: destination buffer, at least GFS2_EA_DATA_LEN(ea) bytes
 *
 * Returns: errno
 */

static int ea_get_unstuffed(struct gfs2_inode *ip, struct gfs2_ea_header *ea,
			    char *data)
{
	struct gfs2_sbd *sdp = ip->i_sbd;
	struct buffer_head **bh;
	unsigned int amount = GFS2_EA_DATA_LEN(ea);
	unsigned int nptrs = DIV_RU(amount, sdp->sd_jbsize);
	uint64_t *dataptrs = GFS2_EA2DATAPTRS(ea);
	unsigned int x;
	int error = 0;

	bh = kcalloc(nptrs, sizeof(struct buffer_head *), GFP_KERNEL);
	if (!bh)
		return -ENOMEM;

	/* Phase one: start reads on all data blocks so they overlap */
	for (x = 0; x < nptrs; x++) {
		error = gfs2_meta_read(ip->i_gl, be64_to_cpu(*dataptrs),
				       DIO_START, bh + x);
		if (error) {
			/* Drop the buffers acquired so far */
			while (x--)
				brelse(bh[x]);
			goto out;
		}
		dataptrs++;
	}

	/* Phase two: wait for each read, verify, and copy out */
	for (x = 0; x < nptrs; x++) {
		error = gfs2_meta_reread(sdp, bh[x], DIO_WAIT);
		if (error) {
			/* Drop this and all not-yet-consumed buffers */
			for (; x < nptrs; x++)
				brelse(bh[x]);
			goto out;
		}
		if (gfs2_metatype_check(sdp, bh[x], GFS2_METATYPE_ED)) {
			for (; x < nptrs; x++)
				brelse(bh[x]);
			error = -EIO;
			goto out;
		}

		/* The last block may be partially used */
		memcpy(data,
		       bh[x]->b_data + sizeof(struct gfs2_meta_header),
		       (sdp->sd_jbsize > amount) ? amount : sdp->sd_jbsize);

		amount -= sdp->sd_jbsize;
		data += sdp->sd_jbsize;

		brelse(bh[x]);
	}

 out:
	kfree(bh);

	return error;
}
531
532int gfs2_ea_get_copy(struct gfs2_inode *ip, struct gfs2_ea_location *el,
533 char *data)
534{
535 if (GFS2_EA_IS_STUFFED(el->el_ea)) {
536 memcpy(data,
537 GFS2_EA2DATA(el->el_ea),
538 GFS2_EA_DATA_LEN(el->el_ea));
539 return 0;
540 } else
541 return ea_get_unstuffed(ip, el->el_ea, data);
542}
543
/**
 * gfs2_ea_get_i - look up an EA and copy its value out
 * @ip: the inode
 * @er: the request; er_data_len == 0 means size-only query
 *
 * Returns: actual size of data on success, -errno on error
 */

int gfs2_ea_get_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
{
	struct gfs2_ea_location el;
	int error;

	if (!ip->i_di.di_eattr)
		return -ENODATA;

	error = gfs2_ea_find(ip, er, &el);
	if (error)
		return error;
	if (!el.el_ea)
		return -ENODATA;

	if (er->er_data_len) {
		if (GFS2_EA_DATA_LEN(el.el_ea) > er->er_data_len)
			error = -ERANGE;
		else
			error = gfs2_ea_get_copy(ip, &el, er->er_data);
	}
	/* On success the return value is the data length, even for a
	   size-only query. */
	if (!error)
		error = GFS2_EA_DATA_LEN(el.el_ea);

	brelse(el.el_bh);

	return error;
}
579
580/**
581 * gfs2_ea_get -
582 * @ip:
583 * @er:
584 *
585 * Returns: actual size of data on success, -errno on error
586 */
587
588int gfs2_ea_get(struct gfs2_inode *ip, struct gfs2_ea_request *er)
589{
590 struct gfs2_holder i_gh;
591 int error;
592
593 if (!er->er_name_len ||
594 er->er_name_len > GFS2_EA_MAX_NAME_LEN)
595 return -EINVAL;
596 if (!er->er_data || !er->er_data_len) {
597 er->er_data = NULL;
598 er->er_data_len = 0;
599 }
600
601 error = gfs2_glock_nq_init(ip->i_gl,
602 LM_ST_SHARED, LM_FLAG_ANY,
603 &i_gh);
604 if (error)
605 return error;
606
607 error = gfs2_ea_ops[er->er_type]->eo_get(ip, er);
608
609 gfs2_glock_dq_uninit(&i_gh);
610
611 return error;
612}
613
/**
 * ea_alloc_blk - allocates a new block for extended attributes.
 * @ip: A pointer to the inode that's getting extended attributes
 * @bhp: filled in with the new block's buffer head (caller must brelse)
 *
 * The new block is initialized with a single UNUSED record spanning the
 * whole journaled-block payload.  Caller must hold an open transaction
 * and an allocation reservation.
 *
 * Returns: errno
 */

static int ea_alloc_blk(struct gfs2_inode *ip, struct buffer_head **bhp)
{
	struct gfs2_sbd *sdp = ip->i_sbd;
	struct gfs2_ea_header *ea;
	uint64_t block;

	block = gfs2_alloc_meta(ip);

	*bhp = gfs2_meta_new(ip->i_gl, block);
	gfs2_trans_add_bh(ip->i_gl, *bhp, 1);
	gfs2_metatype_set(*bhp, GFS2_METATYPE_EA, GFS2_FORMAT_EA);
	gfs2_buffer_clear_tail(*bhp, sizeof(struct gfs2_meta_header));

	/* One empty record covering the whole block */
	ea = GFS2_EA_BH2FIRST(*bhp);
	ea->ea_rec_len = cpu_to_be32(sdp->sd_jbsize);
	ea->ea_type = GFS2_EATYPE_UNUSED;
	ea->ea_flags = GFS2_EAFLAG_LAST;
	ea->ea_num_ptrs = 0;

	ip->i_di.di_blocks++;

	return 0;
}
645
/**
 * ea_write - writes the request info to an ea, creating new blocks if
 * necessary
 * @ip: inode that is being modified
 * @ea: the location of the new ea in a block
 * @er: the write request
 *
 * Note: does not update ea_rec_len or the GFS2_EAFLAG_LAST bin of ea_flags
 *
 * returns : errno
 */

static int ea_write(struct gfs2_inode *ip, struct gfs2_ea_header *ea,
		    struct gfs2_ea_request *er)
{
	struct gfs2_sbd *sdp = ip->i_sbd;

	ea->ea_data_len = cpu_to_be32(er->er_data_len);
	ea->ea_name_len = er->er_name_len;
	ea->ea_type = er->er_type;
	ea->__pad = 0;

	memcpy(GFS2_EA2NAME(ea), er->er_name, er->er_name_len);

	if (GFS2_EAREQ_SIZE_STUFFED(er) <= sdp->sd_jbsize) {
		/* Stuffed: data fits inline after the name */
		ea->ea_num_ptrs = 0;
		memcpy(GFS2_EA2DATA(ea), er->er_data, er->er_data_len);
	} else {
		/* Unstuffed: write the data into newly allocated
		   journaled data blocks and record their addresses */
		uint64_t *dataptr = GFS2_EA2DATAPTRS(ea);
		const char *data = er->er_data;
		unsigned int data_len = er->er_data_len;
		unsigned int copy;
		unsigned int x;

		ea->ea_num_ptrs = DIV_RU(er->er_data_len, sdp->sd_jbsize);
		for (x = 0; x < ea->ea_num_ptrs; x++) {
			struct buffer_head *bh;
			uint64_t block;
			int mh_size = sizeof(struct gfs2_meta_header);

			block = gfs2_alloc_meta(ip);

			bh = gfs2_meta_new(ip->i_gl, block);
			gfs2_trans_add_bh(ip->i_gl, bh, 1);
			gfs2_metatype_set(bh, GFS2_METATYPE_ED, GFS2_FORMAT_ED);

			ip->i_di.di_blocks++;

			copy = (data_len > sdp->sd_jbsize) ? sdp->sd_jbsize :
							     data_len;
			memcpy(bh->b_data + mh_size, data, copy);
			/* Zero-fill the tail of the last block */
			if (copy < sdp->sd_jbsize)
				memset(bh->b_data + mh_size + copy, 0,
				       sdp->sd_jbsize - copy);

			*dataptr++ = cpu_to_be64((uint64_t)bh->b_blocknr);
			data += copy;
			data_len -= copy;

			brelse(bh);
		}

		/* All of the data must have been consumed */
		gfs2_assert_withdraw(sdp, !data_len);
	}

	return 0;
}
713
/* Work function run by ea_alloc_skeleton() once quota, reservation and
   transaction are all in place. */
typedef int (*ea_skeleton_call_t) (struct gfs2_inode *ip,
				   struct gfs2_ea_request *er,
				   void *private);

/*
 * ea_alloc_skeleton - common scaffolding for EA operations that
 * allocate blocks: quota lock/check, in-place reservation, transaction,
 * then the supplied @skeleton_call, then dinode update and teardown in
 * reverse order (classic goto-cleanup chain).
 */
static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er,
			     unsigned int blks,
			     ea_skeleton_call_t skeleton_call,
			     void *private)
{
	struct gfs2_alloc *al;
	struct buffer_head *dibh;
	int error;

	al = gfs2_alloc_get(ip);

	error = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
	if (error)
		goto out;

	error = gfs2_quota_check(ip, ip->i_di.di_uid, ip->i_di.di_gid);
	if (error)
		goto out_gunlock_q;

	al->al_requested = blks;

	error = gfs2_inplace_reserve(ip);
	if (error)
		goto out_gunlock_q;

	error = gfs2_trans_begin(ip->i_sbd,
				 blks + al->al_rgd->rd_ri.ri_length +
				 RES_DINODE + RES_STATFS + RES_QUOTA, 0);
	if (error)
		goto out_ipres;

	error = skeleton_call(ip, er, private);
	if (error)
		goto out_end_trans;

	error = gfs2_meta_inode_buffer(ip, &dibh);
	if (!error) {
		/* An access-ACL set may carry a new file mode; the file
		   type bits must not change. */
		if (er->er_flags & GFS2_ERF_MODE) {
			gfs2_assert_withdraw(ip->i_sbd,
					     (ip->i_di.di_mode & S_IFMT) ==
					     (er->er_mode & S_IFMT));
			ip->i_di.di_mode = er->er_mode;
		}
		ip->i_di.di_ctime = get_seconds();
		gfs2_trans_add_bh(ip->i_gl, dibh, 1);
		gfs2_dinode_out(&ip->i_di, dibh->b_data);
		brelse(dibh);
	}

 out_end_trans:
	gfs2_trans_end(ip->i_sbd);

 out_ipres:
	gfs2_inplace_release(ip);

 out_gunlock_q:
	gfs2_quota_unlock(ip);

 out:
	gfs2_alloc_put(ip);

	return error;
}
781
782static int ea_init_i(struct gfs2_inode *ip, struct gfs2_ea_request *er,
783 void *private)
784{
785 struct buffer_head *bh;
786 int error;
787
788 error = ea_alloc_blk(ip, &bh);
789 if (error)
790 return error;
791
792 ip->i_di.di_eattr = bh->b_blocknr;
793 error = ea_write(ip, GFS2_EA_BH2FIRST(bh), er);
794
795 brelse(bh);
796
797 return error;
798}
799
800/**
801 * ea_init - initializes a new eattr block
802 * @ip:
803 * @er:
804 *
805 * Returns: errno
806 */
807
808static int ea_init(struct gfs2_inode *ip, struct gfs2_ea_request *er)
809{
810 unsigned int jbsize = ip->i_sbd->sd_jbsize;
811 unsigned int blks = 1;
812
813 if (GFS2_EAREQ_SIZE_STUFFED(er) > jbsize)
814 blks += DIV_RU(er->er_data_len, jbsize);
815
816 return ea_alloc_skeleton(ip, er, blks, ea_init_i, NULL);
817}
818
/*
 * ea_split_ea - split the free tail off an EA record.
 *
 * Shrinks @ea to its exact size and creates a new UNUSED record in the
 * leftover space.  The LAST flag, if set on @ea, moves to the new
 * record so the block's terminator stays correct.
 *
 * Returns: the new (unused) record
 */
static struct gfs2_ea_header *ea_split_ea(struct gfs2_ea_header *ea)
{
	uint32_t ea_size = GFS2_EA_SIZE(ea);
	struct gfs2_ea_header *new = (struct gfs2_ea_header *)((char *)ea + ea_size);
	uint32_t new_size = GFS2_EA_REC_LEN(ea) - ea_size;
	int last = ea->ea_flags & GFS2_EAFLAG_LAST;

	ea->ea_rec_len = cpu_to_be32(ea_size);
	ea->ea_flags ^= last;	/* clear LAST on ea iff it was set */

	new->ea_rec_len = cpu_to_be32(new_size);
	new->ea_flags = last;

	return new;
}
834
/*
 * ea_set_remove_stuffed - remove the old (stuffed) record after a set
 * has written the replacement elsewhere.
 *
 * Merges the record into its predecessor when possible, otherwise just
 * marks it unused.  @el->el_prev may be stale (pointing one record too
 * far back) if the walk that found it also visited an intervening
 * record; the GFS2_EA2NEXT resync below handles that case.
 */
static void ea_set_remove_stuffed(struct gfs2_inode *ip,
				  struct gfs2_ea_location *el)
{
	struct gfs2_ea_header *ea = el->el_ea;
	struct gfs2_ea_header *prev = el->el_prev;
	uint32_t len;

	gfs2_trans_add_bh(ip->i_gl, el->el_bh, 1);

	if (!prev || !GFS2_EA_IS_STUFFED(ea)) {
		ea->ea_type = GFS2_EATYPE_UNUSED;
		return;
	} else if (GFS2_EA2NEXT(prev) != ea) {
		prev = GFS2_EA2NEXT(prev);
		gfs2_assert_withdraw(ip->i_sbd, GFS2_EA2NEXT(prev) == ea);
	}

	len = GFS2_EA_REC_LEN(prev) + GFS2_EA_REC_LEN(ea);
	prev->ea_rec_len = cpu_to_be32(len);

	if (GFS2_EA_IS_LAST(ea))
		prev->ea_flags |= GFS2_EAFLAG_LAST;
}
858
/* State shared between ea_set_simple() and its helpers */
struct ea_set {
	int ea_split;	/* reuse requires splitting the found record
			   (note: lacks the es_ prefix the other
			   members use) */

	struct gfs2_ea_request *es_er;	/* the set request */
	struct gfs2_ea_location *es_el;	/* old record to remove, or NULL */

	struct buffer_head *es_bh;	/* block chosen for the new record */
	struct gfs2_ea_header *es_ea;	/* record chosen for reuse */
};

/*
 * ea_set_simple_noalloc - write a stuffed EA into an existing record
 * slot; no block allocation needed, so only a small transaction.
 */
static int ea_set_simple_noalloc(struct gfs2_inode *ip, struct buffer_head *bh,
				 struct gfs2_ea_header *ea, struct ea_set *es)
{
	struct gfs2_ea_request *er = es->es_er;
	struct buffer_head *dibh;
	int error;

	error = gfs2_trans_begin(ip->i_sbd, RES_DINODE + 2 * RES_EATTR, 0);
	if (error)
		return error;

	gfs2_trans_add_bh(ip->i_gl, bh, 1);

	if (es->ea_split)
		ea = ea_split_ea(ea);

	ea_write(ip, ea, er);

	/* Replacing an existing attribute: drop the old record */
	if (es->es_el)
		ea_set_remove_stuffed(ip, es->es_el);

	error = gfs2_meta_inode_buffer(ip, &dibh);
	if (error)
		goto out;

	/* An access-ACL set may carry a new mode; type bits must match */
	if (er->er_flags & GFS2_ERF_MODE) {
		gfs2_assert_withdraw(ip->i_sbd,
			(ip->i_di.di_mode & S_IFMT) == (er->er_mode & S_IFMT));
		ip->i_di.di_mode = er->er_mode;
	}
	ip->i_di.di_ctime = get_seconds();
	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
	gfs2_dinode_out(&ip->i_di, dibh->b_data);
	brelse(dibh);
 out:
	gfs2_trans_end(ip->i_sbd);

	return error;
}
908
/*
 * ea_set_simple_alloc - skeleton_call used when the chosen slot can
 * hold the record header but the data must be unstuffed into newly
 * allocated blocks.  Runs inside the transaction set up by
 * ea_alloc_skeleton().
 */
static int ea_set_simple_alloc(struct gfs2_inode *ip,
			       struct gfs2_ea_request *er, void *private)
{
	struct ea_set *es = private;
	struct gfs2_ea_header *ea = es->es_ea;
	int error;

	gfs2_trans_add_bh(ip->i_gl, es->es_bh, 1);

	if (es->ea_split)
		ea = ea_split_ea(ea);

	error = ea_write(ip, ea, er);
	if (error)
		return error;

	/* Replacing an existing attribute: drop the old record */
	if (es->es_el)
		ea_set_remove_stuffed(ip, es->es_el);

	return 0;
}
930
/*
 * ea_set_simple - ea_foreach callback that tries to place the new EA in
 * an existing block.
 *
 * A record is usable if it is UNUSED and large enough, or if its free
 * tail is large enough to be split off.  Returns 1 (stop the walk) once
 * the set has been done, 0 to keep looking, -errno on error.
 */
static int ea_set_simple(struct gfs2_inode *ip, struct buffer_head *bh,
			 struct gfs2_ea_header *ea, struct gfs2_ea_header *prev,
			 void *private)
{
	struct ea_set *es = private;
	unsigned int size;
	int stuffed;
	int error;

	stuffed = ea_calc_size(ip->i_sbd, es->es_er, &size);

	if (ea->ea_type == GFS2_EATYPE_UNUSED) {
		if (GFS2_EA_REC_LEN(ea) < size)
			return 0;
		/* A stale unused record may still own data blocks;
		   free them (leave=1 keeps the record in place) */
		if (!GFS2_EA_IS_STUFFED(ea)) {
			error = ea_remove_unstuffed(ip, bh, ea, prev, 1);
			if (error)
				return error;
		}
		es->ea_split = 0;
	} else if (GFS2_EA_REC_LEN(ea) - GFS2_EA_SIZE(ea) >= size)
		es->ea_split = 1;
	else
		return 0;

	if (stuffed) {
		error = ea_set_simple_noalloc(ip, bh, ea, es);
		if (error)
			return error;
	} else {
		unsigned int blks;

		es->es_bh = bh;
		es->es_ea = ea;
		/* 2 journaled metadata blocks plus the data blocks */
		blks = 2 + DIV_RU(es->es_er->er_data_len, ip->i_sbd->sd_jbsize);

		error = ea_alloc_skeleton(ip, es->es_er, blks,
					  ea_set_simple_alloc, es);
		if (error)
			return error;
	}

	return 1;
}
975
/*
 * ea_set_block - skeleton_call used when no existing block had room:
 * allocate a fresh EA block and hook it in.
 *
 * If the inode is already indirect, the new block goes in the first
 * free slot of the indirect block (-ENOSPC when full).  Otherwise the
 * inode is converted to the indirect layout first: a new indirect block
 * is allocated whose first pointer is the old direct EA block.
 * @private, when non-NULL, is the location of the old record to remove.
 */
static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er,
			void *private)
{
	struct gfs2_sbd *sdp = ip->i_sbd;
	struct buffer_head *indbh, *newbh;
	uint64_t *eablk;
	int error;
	int mh_size = sizeof(struct gfs2_meta_header);

	if (ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT) {
		uint64_t *end;

		error = gfs2_meta_read(ip->i_gl, ip->i_di.di_eattr,
				       DIO_START | DIO_WAIT, &indbh);
		if (error)
			return error;

		if (gfs2_metatype_check(sdp, indbh, GFS2_METATYPE_IN)) {
			error = -EIO;
			goto out;
		}

		/* Find the first empty pointer slot */
		eablk = (uint64_t *)(indbh->b_data + mh_size);
		end = eablk + sdp->sd_inptrs;

		for (; eablk < end; eablk++)
			if (!*eablk)
				break;

		if (eablk == end) {
			error = -ENOSPC;
			goto out;
		}

		gfs2_trans_add_bh(ip->i_gl, indbh, 1);
	} else {
		/* Convert to the indirect layout */
		uint64_t blk;

		blk = gfs2_alloc_meta(ip);

		indbh = gfs2_meta_new(ip->i_gl, blk);
		gfs2_trans_add_bh(ip->i_gl, indbh, 1);
		gfs2_metatype_set(indbh, GFS2_METATYPE_IN, GFS2_FORMAT_IN);
		gfs2_buffer_clear_tail(indbh, mh_size);

		/* Old direct EA block becomes the first pointer */
		eablk = (uint64_t *)(indbh->b_data + mh_size);
		*eablk = cpu_to_be64(ip->i_di.di_eattr);
		ip->i_di.di_eattr = blk;
		ip->i_di.di_flags |= GFS2_DIF_EA_INDIRECT;
		ip->i_di.di_blocks++;

		eablk++;
	}

	error = ea_alloc_blk(ip, &newbh);
	if (error)
		goto out;

	*eablk = cpu_to_be64((uint64_t)newbh->b_blocknr);
	error = ea_write(ip, GFS2_EA_BH2FIRST(newbh), er);
	brelse(newbh);
	if (error)
		goto out;

	/* Replacing an existing attribute: drop the old record */
	if (private)
		ea_set_remove_stuffed(ip, (struct gfs2_ea_location *)private);

 out:
	brelse(indbh);

	return error;
}
1048
/*
 * ea_set_i - store an EA, trying in order: reuse space in an existing
 * block (ea_set_simple), then allocate a whole new EA block
 * (ea_set_block).  @el, when non-NULL, is an existing record with the
 * same name that must be removed once the new one is in place.
 */
static int ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er,
		    struct gfs2_ea_location *el)
{
	struct ea_set es;
	unsigned int blks = 2;
	int error;

	memset(&es, 0, sizeof(struct ea_set));
	es.es_er = er;
	es.es_el = el;

	error = ea_foreach(ip, ea_set_simple, &es);
	if (error > 0)
		return 0;	/* ea_set_simple found room and did the set */
	if (error)
		return error;

	/* No room anywhere: budget for a new EA block, plus an indirect
	   block if the inode isn't indirect yet, plus data blocks if the
	   value won't stuff. */
	if (!(ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT))
		blks++;
	if (GFS2_EAREQ_SIZE_STUFFED(er) > ip->i_sbd->sd_jbsize)
		blks += DIV_RU(er->er_data_len, ip->i_sbd->sd_jbsize);

	return ea_alloc_skeleton(ip, er, blks, ea_set_block, el);
}
1073
1074static int ea_set_remove_unstuffed(struct gfs2_inode *ip,
1075 struct gfs2_ea_location *el)
1076{
1077 if (el->el_prev && GFS2_EA2NEXT(el->el_prev) != el->el_ea) {
1078 el->el_prev = GFS2_EA2NEXT(el->el_prev);
1079 gfs2_assert_withdraw(ip->i_sbd,
1080 GFS2_EA2NEXT(el->el_prev) == el->el_ea);
1081 }
1082
1083 return ea_remove_unstuffed(ip, el->el_bh, el->el_ea, el->el_prev,0);
1084}
1085
/*
 * gfs2_ea_set_i - set an EA, honoring the XATTR_CREATE / XATTR_REPLACE
 * flags (create fails with -EEXIST if the name exists; replace fails
 * with -ENODATA if it doesn't).
 */
int gfs2_ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
{
	struct gfs2_ea_location el;
	int error;

	/* No EAs at all yet: REPLACE has nothing to replace */
	if (!ip->i_di.di_eattr) {
		if (er->er_flags & XATTR_REPLACE)
			return -ENODATA;
		return ea_init(ip, er);
	}

	error = gfs2_ea_find(ip, er, &el);
	if (error)
		return error;

	if (el.el_ea) {
		if (ip->i_di.di_flags & GFS2_DIF_APPENDONLY) {
			brelse(el.el_bh);
			return -EPERM;
		}

		error = -EEXIST;
		if (!(er->er_flags & XATTR_CREATE)) {
			/* If the old value was unstuffed, its data
			   blocks are freed after the new record lands */
			int unstuffed = !GFS2_EA_IS_STUFFED(el.el_ea);
			error = ea_set_i(ip, er, &el);
			if (!error && unstuffed)
				ea_set_remove_unstuffed(ip, &el);
		}

		brelse(el.el_bh);
	} else {
		error = -ENODATA;
		if (!(er->er_flags & XATTR_REPLACE))
			error = ea_set_i(ip, er, NULL);
	}

	return error;
}
1124
1125int gfs2_ea_set(struct gfs2_inode *ip, struct gfs2_ea_request *er)
1126{
1127 struct gfs2_holder i_gh;
1128 int error;
1129
1130 if (!er->er_name_len ||
1131 er->er_name_len > GFS2_EA_MAX_NAME_LEN)
1132 return -EINVAL;
1133 if (!er->er_data || !er->er_data_len) {
1134 er->er_data = NULL;
1135 er->er_data_len = 0;
1136 }
1137 error = ea_check_size(ip->i_sbd, er);
1138 if (error)
1139 return error;
1140
1141 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
1142 if (error)
1143 return error;
1144
1145 if (IS_IMMUTABLE(ip->i_vnode))
1146 error = -EPERM;
1147 else
1148 error = gfs2_ea_ops[er->er_type]->eo_set(ip, er);
1149
1150 gfs2_glock_dq_uninit(&i_gh);
1151
1152 return error;
1153}
1154
/**
 * ea_remove_stuffed - remove a stuffed (in-block) extended attribute
 * @ip: the inode
 * @el: location of the EA record to remove
 *
 * Returns: errno
 */

static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el)
{
	struct gfs2_ea_header *ea = el->el_ea;
	struct gfs2_ea_header *prev = el->el_prev;
	struct buffer_head *dibh;
	int error;

	error = gfs2_trans_begin(ip->i_sbd, RES_DINODE + RES_EATTR, 0);
	if (error)
		return error;

	gfs2_trans_add_bh(ip->i_gl, el->el_bh, 1);

	if (prev) {
		/* Coalesce the removed record into its predecessor by
		   growing the predecessor's record length over it. */
		uint32_t len;

		len = GFS2_EA_REC_LEN(prev) + GFS2_EA_REC_LEN(ea);
		prev->ea_rec_len = cpu_to_be32(len);

		if (GFS2_EA_IS_LAST(ea))
			prev->ea_flags |= GFS2_EAFLAG_LAST;
	} else
		/* First record in the block: just mark it unused. */
		ea->ea_type = GFS2_EATYPE_UNUSED;

	/* Bump ctime and write the dinode back. */
	error = gfs2_meta_inode_buffer(ip, &dibh);
	if (!error) {
		ip->i_di.di_ctime = get_seconds();
		gfs2_trans_add_bh(ip->i_gl, dibh, 1);
		gfs2_dinode_out(&ip->i_di, dibh->b_data);
		brelse(dibh);
	}

	gfs2_trans_end(ip->i_sbd);

	return error;
}
1191
/**
 * gfs2_ea_remove_i - remove an extended attribute, no inode locking
 * @ip: the inode (caller holds the necessary glock)
 * @er: the request identifying the attribute
 *
 * Returns: errno (-ENODATA if the attribute doesn't exist)
 */

int gfs2_ea_remove_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
{
	struct gfs2_ea_location el;
	int error;

	if (!ip->i_di.di_eattr)
		return -ENODATA;

	error = gfs2_ea_find(ip, er, &el);
	if (error)
		return error;
	if (!el.el_ea)
		return -ENODATA;

	/* Stuffed EAs are removed in place; unstuffed ones must also have
	   their out-of-block data blocks freed. */
	if (GFS2_EA_IS_STUFFED(el.el_ea))
		error = ea_remove_stuffed(ip, &el);
	else
		error = ea_remove_unstuffed(ip, el.el_bh, el.el_ea, el.el_prev,
					    0);

	brelse(el.el_bh);

	return error;
}
1216
/**
 * gfs2_ea_remove - removes an extended attribute
 * @ip: pointer to the inode of the target file
 * @er: request information
 *
 * Returns: errno
 */

int gfs2_ea_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er)
{
	struct gfs2_holder i_gh;
	int error;

	if (!er->er_name_len || er->er_name_len > GFS2_EA_MAX_NAME_LEN)
		return -EINVAL;

	error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
	if (error)
		return error;

	/* Immutable and append-only files may not have EAs removed. */
	if (IS_IMMUTABLE(ip->i_vnode) || IS_APPEND(ip->i_vnode))
		error = -EPERM;
	else
		error = gfs2_ea_ops[er->er_type]->eo_remove(ip, er);

	gfs2_glock_dq_uninit(&i_gh);

	return error;
}
1246
/**
 * ea_acl_chmod_unstuffed - overwrite an unstuffed EA's data blocks
 * @ip: the inode
 * @ea: the (unstuffed) EA record whose data blocks are rewritten
 * @data: the new data, GFS2_EA_DATA_LEN(ea) bytes
 *
 * On success the transaction is left open for the caller to finish;
 * on failure it is ended here.
 *
 * Returns: errno
 */

static int ea_acl_chmod_unstuffed(struct gfs2_inode *ip,
				  struct gfs2_ea_header *ea, char *data)
{
	struct gfs2_sbd *sdp = ip->i_sbd;
	struct buffer_head **bh;
	unsigned int amount = GFS2_EA_DATA_LEN(ea);
	unsigned int nptrs = DIV_RU(amount, sdp->sd_jbsize);
	uint64_t *dataptrs = GFS2_EA2DATAPTRS(ea);
	unsigned int x;
	int error;

	bh = kcalloc(nptrs, sizeof(struct buffer_head *), GFP_KERNEL);
	if (!bh)
		return -ENOMEM;

	error = gfs2_trans_begin(sdp, nptrs + RES_DINODE, 0);
	if (error)
		goto out;

	/* Kick off reads on all data blocks first ... */
	for (x = 0; x < nptrs; x++) {
		error = gfs2_meta_read(ip->i_gl, be64_to_cpu(*dataptrs),
				       DIO_START, bh + x);
		if (error) {
			while (x--)
				brelse(bh[x]);
			goto fail;
		}
		dataptrs++;
	}

	/* ... then wait for each in turn, verify it, and overwrite it. */
	for (x = 0; x < nptrs; x++) {
		error = gfs2_meta_reread(sdp, bh[x], DIO_WAIT);
		if (error) {
			for (; x < nptrs; x++)
				brelse(bh[x]);
			goto fail;
		}
		if (gfs2_metatype_check(sdp, bh[x], GFS2_METATYPE_ED)) {
			for (; x < nptrs; x++)
				brelse(bh[x]);
			error = -EIO;
			goto fail;
		}

		gfs2_trans_add_bh(ip->i_gl, bh[x], 1);

		/* The last block may hold less than a full journaled
		   block's worth of data. */
		memcpy(bh[x]->b_data + sizeof(struct gfs2_meta_header),
		       data,
		       (sdp->sd_jbsize > amount) ? amount : sdp->sd_jbsize);

		amount -= sdp->sd_jbsize;
		data += sdp->sd_jbsize;

		brelse(bh[x]);
	}

 out:
	kfree(bh);

	return error;

 fail:
	gfs2_trans_end(sdp);
	kfree(bh);

	return error;
}
1314
/**
 * gfs2_ea_acl_chmod - rewrite ACL EA data and apply new inode attributes
 * @ip: the inode
 * @el: location of the ACL's EA record
 * @attr: the new attributes to set on the inode
 * @data: the updated ACL data, GFS2_EA_DATA_LEN(el->el_ea) bytes
 *
 * Returns: errno
 */

int gfs2_ea_acl_chmod(struct gfs2_inode *ip, struct gfs2_ea_location *el,
		      struct iattr *attr, char *data)
{
	struct buffer_head *dibh;
	int error;

	if (GFS2_EA_IS_STUFFED(el->el_ea)) {
		/* Data lives in the EA block itself: overwrite in place. */
		error = gfs2_trans_begin(ip->i_sbd, RES_DINODE + RES_EATTR, 0);
		if (error)
			return error;

		gfs2_trans_add_bh(ip->i_gl, el->el_bh, 1);
		memcpy(GFS2_EA2DATA(el->el_ea),
		       data,
		       GFS2_EA_DATA_LEN(el->el_ea));
	} else
		/* Leaves its transaction open on success (ended below);
		   ends it itself on failure. */
		error = ea_acl_chmod_unstuffed(ip, el->el_ea, data);

	if (error)
		return error;

	/* Apply the attribute change and write back the dinode. */
	error = gfs2_meta_inode_buffer(ip, &dibh);
	if (!error) {
		error = inode_setattr(ip->i_vnode, attr);
		gfs2_assert_warn(ip->i_sbd, !error);
		gfs2_inode_attr_out(ip);
		gfs2_trans_add_bh(ip->i_gl, dibh, 1);
		gfs2_dinode_out(&ip->i_di, dibh->b_data);
		brelse(dibh);
	}

	gfs2_trans_end(ip->i_sbd);

	return error;
}
1350
/**
 * ea_dealloc_indirect - free all EA blocks listed in the indirect block
 * @ip: the inode
 *
 * Makes two passes over the indirect block's pointer array: the first
 * builds a resource-group list so all needed rgrp glocks can be taken
 * up front, the second frees the (extent-merged) runs of blocks.
 *
 * Returns: errno
 */

static int ea_dealloc_indirect(struct gfs2_inode *ip)
{
	struct gfs2_sbd *sdp = ip->i_sbd;
	struct gfs2_rgrp_list rlist;
	struct buffer_head *indbh, *dibh;
	uint64_t *eablk, *end;
	unsigned int rg_blocks = 0;
	uint64_t bstart = 0;
	unsigned int blen = 0;
	unsigned int blks = 0;
	unsigned int x;
	int error;

	memset(&rlist, 0, sizeof(struct gfs2_rgrp_list));

	error = gfs2_meta_read(ip->i_gl, ip->i_di.di_eattr,
			       DIO_START | DIO_WAIT, &indbh);
	if (error)
		return error;

	if (gfs2_metatype_check(sdp, indbh, GFS2_METATYPE_IN)) {
		error = -EIO;
		goto out;
	}

	eablk = (uint64_t *)(indbh->b_data + sizeof(struct gfs2_meta_header));
	end = eablk + sdp->sd_inptrs;

	/* Pass 1: collect the resource groups covering every EA block. */
	for (; eablk < end; eablk++) {
		uint64_t bn;

		if (!*eablk)
			break;
		bn = be64_to_cpu(*eablk);

		/* Merge physically contiguous blocks into a single run. */
		if (bstart + blen == bn)
			blen++;
		else {
			if (bstart)
				gfs2_rlist_add(sdp, &rlist, bstart);
			bstart = bn;
			blen = 1;
		}
		blks++;
	}
	if (bstart)
		gfs2_rlist_add(sdp, &rlist, bstart);
	else
		goto out;	/* no blocks at all: nothing to free */

	gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE, 0);

	/* Reserve enough journal space for every rgrp header touched. */
	for (x = 0; x < rlist.rl_rgrps; x++) {
		struct gfs2_rgrpd *rgd;
		rgd = get_gl2rgd(rlist.rl_ghs[x].gh_gl);
		rg_blocks += rgd->rd_ri.ri_length;
	}

	error = gfs2_glock_nq_m(rlist.rl_rgrps, rlist.rl_ghs);
	if (error)
		goto out_rlist_free;

	error = gfs2_trans_begin(sdp, rg_blocks + RES_DINODE +
				 RES_INDIRECT + RES_STATFS +
				 RES_QUOTA, blks);
	if (error)
		goto out_gunlock;

	gfs2_trans_add_bh(ip->i_gl, indbh, 1);

	/* Pass 2: walk the pointers again, freeing each run and clearing
	   the pointers as we go. */
	eablk = (uint64_t *)(indbh->b_data + sizeof(struct gfs2_meta_header));
	bstart = 0;
	blen = 0;

	for (; eablk < end; eablk++) {
		uint64_t bn;

		if (!*eablk)
			break;
		bn = be64_to_cpu(*eablk);

		if (bstart + blen == bn)
			blen++;
		else {
			if (bstart)
				gfs2_free_meta(ip, bstart, blen);
			bstart = bn;
			blen = 1;
		}

		*eablk = 0;
		if (!ip->i_di.di_blocks)
			gfs2_consist_inode(ip);
		ip->i_di.di_blocks--;
	}
	if (bstart)
		gfs2_free_meta(ip, bstart, blen);

	ip->i_di.di_flags &= ~GFS2_DIF_EA_INDIRECT;

	error = gfs2_meta_inode_buffer(ip, &dibh);
	if (!error) {
		gfs2_trans_add_bh(ip->i_gl, dibh, 1);
		gfs2_dinode_out(&ip->i_di, dibh->b_data);
		brelse(dibh);
	}

	gfs2_trans_end(sdp);

 out_gunlock:
	gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs);

 out_rlist_free:
	gfs2_rlist_free(&rlist);

 out:
	brelse(indbh);

	return error;
}
1471
/**
 * ea_dealloc_block - free the EA block referenced by the dinode itself
 * @ip: the inode
 *
 * Returns: errno
 */

static int ea_dealloc_block(struct gfs2_inode *ip)
{
	struct gfs2_sbd *sdp = ip->i_sbd;
	struct gfs2_alloc *al = &ip->i_alloc;
	struct gfs2_rgrpd *rgd;
	struct buffer_head *dibh;
	int error;

	/* Find the resource group the EA block lives in. */
	rgd = gfs2_blk2rgrpd(sdp, ip->i_di.di_eattr);
	if (!rgd) {
		gfs2_consist_inode(ip);
		return -EIO;
	}

	error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0,
				   &al->al_rgd_gh);
	if (error)
		return error;

	error = gfs2_trans_begin(sdp, RES_RG_BIT + RES_DINODE +
				 RES_STATFS + RES_QUOTA, 1);
	if (error)
		goto out_gunlock;

	gfs2_free_meta(ip, ip->i_di.di_eattr, 1);

	/* Detach the EA fork from the dinode. */
	ip->i_di.di_eattr = 0;
	if (!ip->i_di.di_blocks)
		gfs2_consist_inode(ip);
	ip->i_di.di_blocks--;

	error = gfs2_meta_inode_buffer(ip, &dibh);
	if (!error) {
		gfs2_trans_add_bh(ip->i_gl, dibh, 1);
		gfs2_dinode_out(&ip->i_di, dibh->b_data);
		brelse(dibh);
	}

	gfs2_trans_end(sdp);

 out_gunlock:
	gfs2_glock_dq_uninit(&al->al_rgd_gh);

	return error;
}
1517
/**
 * gfs2_ea_dealloc - deallocate the extended attribute fork
 * @ip: the inode
 *
 * Returns: errno
 */

int gfs2_ea_dealloc(struct gfs2_inode *ip)
{
	struct gfs2_alloc *al;
	int error;

	al = gfs2_alloc_get(ip);

	error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
	if (error)
		goto out_alloc;

	error = gfs2_rindex_hold(ip->i_sbd, &al->al_ri_gh);
	if (error)
		goto out_quota;

	/* Free the out-of-block data of every unstuffed EA first ... */
	error = ea_foreach(ip, ea_dealloc_unstuffed, NULL);
	if (error)
		goto out_rindex;

	/* ... then the blocks listed in the indirect block (if any) ... */
	if (ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT) {
		error = ea_dealloc_indirect(ip);
		if (error)
			goto out_rindex;
	}

	/* ... and finally the EA block referenced by the dinode itself. */
	error = ea_dealloc_block(ip);

 out_rindex:
	gfs2_glock_dq_uninit(&al->al_ri_gh);

 out_quota:
	gfs2_quota_unhold(ip);

 out_alloc:
	gfs2_alloc_put(ip);

	return error;
}
1563
diff --git a/fs/gfs2/eattr.h b/fs/gfs2/eattr.h
new file mode 100644
index 000000000000..e5a42abf68a3
--- /dev/null
+++ b/fs/gfs2/eattr.h
@@ -0,0 +1,88 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
#ifndef __EATTR_DOT_H__
#define __EATTR_DOT_H__

/* On-disk EA record fields are big-endian; these helpers convert. */
#define GFS2_EA_REC_LEN(ea) be32_to_cpu((ea)->ea_rec_len)
#define GFS2_EA_DATA_LEN(ea) be32_to_cpu((ea)->ea_data_len)

/* Size of an EA record: header + name + either the inline (stuffed) data
   or the array of data-block pointers, rounded up to 8 bytes. */
#define GFS2_EA_SIZE(ea) \
ALIGN(sizeof(struct gfs2_ea_header) + (ea)->ea_name_len + \
      ((GFS2_EA_IS_STUFFED(ea)) ? GFS2_EA_DATA_LEN(ea) : \
       (sizeof(uint64_t) * (ea)->ea_num_ptrs)), 8)

/* Length of the user-visible "<prefix>name" string including the NUL;
   NOTE(review): 5/7 look like prefix lengths ("user." vs. a 7-char
   prefix) — confirm against the EA-type handling in eaops.c. */
#define GFS2_EA_STRLEN(ea) \
((((ea)->ea_type == GFS2_EATYPE_USR) ? 5 : 7) + (ea)->ea_name_len + 1)

/* A stuffed EA keeps its data inside the EA block (no block pointers). */
#define GFS2_EA_IS_STUFFED(ea) (!(ea)->ea_num_ptrs)
#define GFS2_EA_IS_LAST(ea) ((ea)->ea_flags & GFS2_EAFLAG_LAST)

/* Size a request would occupy if stored stuffed / unstuffed. */
#define GFS2_EAREQ_SIZE_STUFFED(er) \
ALIGN(sizeof(struct gfs2_ea_header) + (er)->er_name_len + (er)->er_data_len, 8)

#define GFS2_EAREQ_SIZE_UNSTUFFED(sdp, er) \
ALIGN(sizeof(struct gfs2_ea_header) + (er)->er_name_len + \
      sizeof(uint64_t) * DIV_RU((er)->er_data_len, (sdp)->sd_jbsize), 8)

/* Navigation within an EA record: name, then data (stuffed) or the
   8-byte-aligned array of data-block pointers (unstuffed). */
#define GFS2_EA2NAME(ea) ((char *)((struct gfs2_ea_header *)(ea) + 1))
#define GFS2_EA2DATA(ea) (GFS2_EA2NAME(ea) + (ea)->ea_name_len)

#define GFS2_EA2DATAPTRS(ea) \
((uint64_t *)(GFS2_EA2NAME(ea) + ALIGN((ea)->ea_name_len, 8)))

/* Step to the next record / locate the first record in an EA block. */
#define GFS2_EA2NEXT(ea) \
((struct gfs2_ea_header *)((char *)(ea) + GFS2_EA_REC_LEN(ea)))

#define GFS2_EA_BH2FIRST(bh) \
((struct gfs2_ea_header *)((bh)->b_data + sizeof(struct gfs2_meta_header)))

/* er_flags bit; by its name it flags that er_mode is meaningful —
   confirm against callers. */
#define GFS2_ERF_MODE 0x80000000

/* An extended-attribute request as passed down from the xattr layer. */
struct gfs2_ea_request {
	char *er_name;
	char *er_data;
	unsigned int er_name_len;
	unsigned int er_data_len;
	unsigned int er_type; /* GFS2_EATYPE_... */
	int er_flags;
	mode_t er_mode;
};

/* Where an EA record was found: its buffer, the record itself, and the
   record immediately preceding it (NULL if it is the first). */
struct gfs2_ea_location {
	struct buffer_head *el_bh;
	struct gfs2_ea_header *el_ea;
	struct gfs2_ea_header *el_prev;
};

int gfs2_ea_repack(struct gfs2_inode *ip);

/* "_i" variants do no inode glock locking; callers lock themselves. */
int gfs2_ea_get_i(struct gfs2_inode *ip, struct gfs2_ea_request *er);
int gfs2_ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er);
int gfs2_ea_remove_i(struct gfs2_inode *ip, struct gfs2_ea_request *er);

/* These take the inode glock themselves. */
int gfs2_ea_list(struct gfs2_inode *ip, struct gfs2_ea_request *er);
int gfs2_ea_get(struct gfs2_inode *ip, struct gfs2_ea_request *er);
int gfs2_ea_set(struct gfs2_inode *ip, struct gfs2_ea_request *er);
int gfs2_ea_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er);

int gfs2_ea_dealloc(struct gfs2_inode *ip);

/* Exported to acl.c */

int gfs2_ea_find(struct gfs2_inode *ip,
		 struct gfs2_ea_request *er,
		 struct gfs2_ea_location *el);
int gfs2_ea_get_copy(struct gfs2_inode *ip,
		     struct gfs2_ea_location *el,
		     char *data);
int gfs2_ea_acl_chmod(struct gfs2_inode *ip, struct gfs2_ea_location *el,
		      struct iattr *attr, char *data);

#endif /* __EATTR_DOT_H__ */
diff --git a/fs/gfs2/format.h b/fs/gfs2/format.h
new file mode 100644
index 000000000000..c7bf32ce3eca
--- /dev/null
+++ b/fs/gfs2/format.h
@@ -0,0 +1,21 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
#ifndef __FORMAT_DOT_H__
#define __FORMAT_DOT_H__

/* Zero-terminated tables of obsolete on-disk format numbers (currently
   empty — only the terminator).  NOTE(review): presumably consulted by
   the superblock version checks — confirm in the mount/super code. */
static const uint32_t gfs2_old_fs_formats[] = {
	0
};

static const uint32_t gfs2_old_multihost_formats[] = {
	0
};

#endif /* __FORMAT_DOT_H__ */
diff --git a/fs/gfs2/gfs2.h b/fs/gfs2/gfs2.h
new file mode 100644
index 000000000000..6c53d080675c
--- /dev/null
+++ b/fs/gfs2/gfs2.h
@@ -0,0 +1,60 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
#ifndef __GFS2_DOT_H__
#define __GFS2_DOT_H__

#include <linux/gfs2_ondisk.h>

#include "lm_interface.h"
#include "lvb.h"
#include "incore.h"
#include "util.h"

/* Self-documenting values for common boolean-style parameters. */
enum {
	NO_CREATE = 0,
	CREATE = 1,
};

enum {
	NO_WAIT = 0,
	WAIT = 1,
};

enum {
	NO_FORCE = 0,
	FORCE = 1,
};

/* Divide num by den.  Round up if there is a remainder. */
#define DIV_RU(num, den) (((num) + (den) - 1) / (den))

#define GFS2_FAST_NAME_SIZE 8

/* Accessors for GFS2 private pointers stashed in VFS structures. */
#define get_v2sdp(sb) ((struct gfs2_sbd *)(sb)->s_fs_info)
#define set_v2sdp(sb, sdp) (sb)->s_fs_info = (sdp)
#define get_v2ip(inode) ((struct gfs2_inode *)(inode)->u.generic_ip)
#define set_v2ip(inode, ip) (inode)->u.generic_ip = (ip)
#define get_v2fp(file) ((struct gfs2_file *)(file)->private_data)
#define set_v2fp(file, fp) (file)->private_data = (fp)
#define get_v2bd(bh) ((struct gfs2_bufdata *)(bh)->b_private)
#define set_v2bd(bh, bd) (bh)->b_private = (bd)

/* The current transaction rides in the task's journal_info pointer. */
#define get_transaction ((struct gfs2_trans *)(current->journal_info))
#define set_transaction(tr) (current->journal_info) = (tr)

/* A glock's gl_object points at whatever object the glock protects. */
#define get_gl2ip(gl) ((struct gfs2_inode *)(gl)->gl_object)
#define set_gl2ip(gl, ip) (gl)->gl_object = (ip)
#define get_gl2rgd(gl) ((struct gfs2_rgrpd *)(gl)->gl_object)
#define set_gl2rgd(gl, rgd) (gl)->gl_object = (rgd)
#define get_gl2gl(gl) ((struct gfs2_glock *)(gl)->gl_object)
#define set_gl2gl(gl, gl2) (gl)->gl_object = (gl2)

#endif /* __GFS2_DOT_H__ */
60
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
new file mode 100644
index 000000000000..4df78ecfeeb3
--- /dev/null
+++ b/fs/gfs2/glock.c
@@ -0,0 +1,2513 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/delay.h>
16#include <linux/sort.h>
17#include <linux/jhash.h>
18#include <linux/kref.h>
19#include <asm/semaphore.h>
20#include <asm/uaccess.h>
21
22#include "gfs2.h"
23#include "glock.h"
24#include "glops.h"
25#include "inode.h"
26#include "lm.h"
27#include "lops.h"
28#include "meta_io.h"
29#include "quota.h"
30#include "super.h"
31
/* Must be kept in sync with the beginning of struct gfs2_glock */
struct glock_plug {
	struct list_head gl_list;
	unsigned long gl_flags;
};

/* Holder embedded in a self-freeing container used while a glock holds
   "greedy" status; rq_greedy() frees it via container_of() on gr_gh. */
struct greedy {
	struct gfs2_holder gr_gh;
	struct work_struct gr_work;
};

/* Signature of per-glock callbacks applied by glock-walking code. */
typedef void (*glock_examiner) (struct gfs2_glock * gl);
44
45/**
46 * relaxed_state_ok - is a requested lock compatible with the current lock mode?
47 * @actual: the current state of the lock
48 * @requested: the lock state that was requested by the caller
49 * @flags: the modifier flags passed in by the caller
50 *
51 * Returns: 1 if the locks are compatible, 0 otherwise
52 */
53
54static inline int relaxed_state_ok(unsigned int actual, unsigned requested,
55 int flags)
56{
57 if (actual == requested)
58 return 1;
59
60 if (flags & GL_EXACT)
61 return 0;
62
63 if (actual == LM_ST_EXCLUSIVE && requested == LM_ST_SHARED)
64 return 1;
65
66 if (actual != LM_ST_UNLOCKED && (flags & LM_FLAG_ANY))
67 return 1;
68
69 return 0;
70}
71
72/**
73 * gl_hash() - Turn glock number into hash bucket number
74 * @lock: The glock number
75 *
76 * Returns: The number of the corresponding hash bucket
77 */
78
79static unsigned int gl_hash(struct lm_lockname *name)
80{
81 unsigned int h;
82
83 h = jhash(&name->ln_number, sizeof(uint64_t), 0);
84 h = jhash(&name->ln_type, sizeof(unsigned int), h);
85 h &= GFS2_GL_HASH_MASK;
86
87 return h;
88}
89
/**
 * glock_free() - Perform a few checks and then release struct gfs2_glock
 * @gl: The glock to release
 *
 * Also calls lock module to release its internal structure for this glock.
 *
 */

static void glock_free(struct gfs2_glock *gl)
{
	struct gfs2_sbd *sdp = gl->gl_sbd;
	struct inode *aspace = gl->gl_aspace;

	/* Release the lock module's per-lock structure first ... */
	gfs2_lm_put_lock(sdp, gl->gl_lock);

	/* ... then the page-cache address space, if this glock had one. */
	if (aspace)
		gfs2_aspace_put(aspace);

	kmem_cache_free(gfs2_glock_cachep, gl);

	atomic_dec(&sdp->sd_glock_count);
}
112
/**
 * gfs2_glock_hold() - increment reference count on glock
 * @gl: The glock to hold
 *
 * The caller must already own a reference (standard kref_get rule).
 */

void gfs2_glock_hold(struct gfs2_glock *gl)
{
	kref_get(&gl->gl_ref);
}
123
/* All work is done after the return from kref_put() so we
   can release the write_lock before the free. */

static void kill_glock(struct kref *kref)
{
	struct gfs2_glock *gl = container_of(kref, struct gfs2_glock, gl_ref);
	struct gfs2_sbd *sdp = gl->gl_sbd;

	/* A glock must be completely idle before its last reference drops:
	   unlocked, not queued for reclaim, no holders, no waiters. */
	gfs2_assert(sdp, gl->gl_state == LM_ST_UNLOCKED);
	gfs2_assert(sdp, list_empty(&gl->gl_reclaim));
	gfs2_assert(sdp, list_empty(&gl->gl_holders));
	gfs2_assert(sdp, list_empty(&gl->gl_waiters1));
	gfs2_assert(sdp, list_empty(&gl->gl_waiters2));
	gfs2_assert(sdp, list_empty(&gl->gl_waiters3));
}
139
/**
 * gfs2_glock_put() - Decrement reference count on glock
 * @gl: The glock to put
 *
 * Returns: 1 if this was the last reference and the glock was freed
 */

int gfs2_glock_put(struct gfs2_glock *gl)
{
	struct gfs2_sbd *sdp = gl->gl_sbd;
	struct gfs2_gl_hash_bucket *bucket = gl->gl_bucket;
	int rv = 0;

	mutex_lock(&sdp->sd_invalidate_inodes_mutex);

	/* Unhash under the bucket's write lock, but do the actual free
	   afterwards (see comment above kill_glock()). */
	write_lock(&bucket->hb_lock);
	if (kref_put(&gl->gl_ref, kill_glock)) {
		list_del_init(&gl->gl_list);
		write_unlock(&bucket->hb_lock);
		glock_free(gl);
		rv = 1;
		goto out;
	}
	write_unlock(&bucket->hb_lock);
 out:
	mutex_unlock(&sdp->sd_invalidate_inodes_mutex);
	return rv;
}
167
168/**
169 * queue_empty - check to see if a glock's queue is empty
170 * @gl: the glock
171 * @head: the head of the queue to check
172 *
173 * This function protects the list in the event that a process already
174 * has a holder on the list and is adding a second holder for itself.
175 * The glmutex lock is what generally prevents processes from working
176 * on the same glock at once, but the special case of adding a second
177 * holder for yourself ("recursive" locking) doesn't involve locking
178 * glmutex, making the spin lock necessary.
179 *
180 * Returns: 1 if the queue is empty
181 */
182
183static inline int queue_empty(struct gfs2_glock *gl, struct list_head *head)
184{
185 int empty;
186 spin_lock(&gl->gl_spin);
187 empty = list_empty(head);
188 spin_unlock(&gl->gl_spin);
189 return empty;
190}
191
192/**
193 * search_bucket() - Find struct gfs2_glock by lock number
194 * @bucket: the bucket to search
195 * @name: The lock name
196 *
197 * Returns: NULL, or the struct gfs2_glock with the requested number
198 */
199
200static struct gfs2_glock *search_bucket(struct gfs2_gl_hash_bucket *bucket,
201 struct lm_lockname *name)
202{
203 struct gfs2_glock *gl;
204
205 list_for_each_entry(gl, &bucket->hb_list, gl_list) {
206 if (test_bit(GLF_PLUG, &gl->gl_flags))
207 continue;
208 if (!lm_name_equal(&gl->gl_name, name))
209 continue;
210
211 kref_get(&gl->gl_ref);
212
213 return gl;
214 }
215
216 return NULL;
217}
218
219/**
220 * gfs2_glock_find() - Find glock by lock number
221 * @sdp: The GFS2 superblock
222 * @name: The lock name
223 *
224 * Returns: NULL, or the struct gfs2_glock with the requested number
225 */
226
227struct gfs2_glock *gfs2_glock_find(struct gfs2_sbd *sdp,
228 struct lm_lockname *name)
229{
230 struct gfs2_gl_hash_bucket *bucket = &sdp->sd_gl_hash[gl_hash(name)];
231 struct gfs2_glock *gl;
232
233 read_lock(&bucket->hb_lock);
234 gl = search_bucket(bucket, name);
235 read_unlock(&bucket->hb_lock);
236
237 return gl;
238}
239
/**
 * gfs2_glock_get() - Get a glock, or create one if one doesn't exist
 * @sdp: The GFS2 superblock
 * @number: the lock number
 * @glops: The glock_operations to use
 * @create: If 0, don't create the glock if it doesn't exist
 * @glp: the glock is returned here
 *
 * This does not lock a glock, just finds/creates structures for one.
 *
 * Returns: errno
 */

int gfs2_glock_get(struct gfs2_sbd *sdp, uint64_t number,
		   struct gfs2_glock_operations *glops, int create,
		   struct gfs2_glock **glp)
{
	struct lm_lockname name;
	struct gfs2_glock *gl, *tmp;
	struct gfs2_gl_hash_bucket *bucket;
	int error;

	name.ln_number = number;
	name.ln_type = glops->go_type;
	bucket = &sdp->sd_gl_hash[gl_hash(&name)];

	/* Fast path: the glock may already exist. */
	read_lock(&bucket->hb_lock);
	gl = search_bucket(bucket, &name);
	read_unlock(&bucket->hb_lock);

	if (gl || !create) {
		*glp = gl;
		return 0;
	}

	/* Not found: build a new glock without holding the bucket lock ... */
	gl = kmem_cache_alloc(gfs2_glock_cachep, GFP_KERNEL);
	if (!gl)
		return -ENOMEM;

	memset(gl, 0, sizeof(struct gfs2_glock));

	INIT_LIST_HEAD(&gl->gl_list);
	gl->gl_name = name;
	kref_init(&gl->gl_ref);

	spin_lock_init(&gl->gl_spin);

	gl->gl_state = LM_ST_UNLOCKED;
	INIT_LIST_HEAD(&gl->gl_holders);
	INIT_LIST_HEAD(&gl->gl_waiters1);
	INIT_LIST_HEAD(&gl->gl_waiters2);
	INIT_LIST_HEAD(&gl->gl_waiters3);

	gl->gl_ops = glops;

	gl->gl_bucket = bucket;
	INIT_LIST_HEAD(&gl->gl_reclaim);

	gl->gl_sbd = sdp;

	lops_init_le(&gl->gl_le, &gfs2_glock_lops);
	INIT_LIST_HEAD(&gl->gl_ail_list);

	/* If this glock protects actual on-disk data or metadata blocks,
	   create a VFS inode to manage the pages/buffers holding them. */
	if (glops == &gfs2_inode_glops ||
	    glops == &gfs2_rgrp_glops ||
	    glops == &gfs2_meta_glops) {
		gl->gl_aspace = gfs2_aspace_get(sdp);
		if (!gl->gl_aspace) {
			error = -ENOMEM;
			goto fail;
		}
	}

	error = gfs2_lm_get_lock(sdp, &name, &gl->gl_lock);
	if (error)
		goto fail_aspace;

	atomic_inc(&sdp->sd_glock_count);

	/* ... then recheck under the write lock: another task may have
	   inserted the same glock while we were allocating.  If so, drop
	   ours and use the existing one. */
	write_lock(&bucket->hb_lock);
	tmp = search_bucket(bucket, &name);
	if (tmp) {
		write_unlock(&bucket->hb_lock);
		glock_free(gl);
		gl = tmp;
	} else {
		list_add_tail(&gl->gl_list, &bucket->hb_list);
		write_unlock(&bucket->hb_lock);
	}

	*glp = gl;

	return 0;

 fail_aspace:
	if (gl->gl_aspace)
		gfs2_aspace_put(gl->gl_aspace);

 fail:
	kmem_cache_free(gfs2_glock_cachep, gl);

	return error;
}
345
346/**
347 * gfs2_holder_init - initialize a struct gfs2_holder in the default way
348 * @gl: the glock
349 * @state: the state we're requesting
350 * @flags: the modifier flags
351 * @gh: the holder structure
352 *
353 */
354
355void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, int flags,
356 struct gfs2_holder *gh)
357{
358 INIT_LIST_HEAD(&gh->gh_list);
359 gh->gh_gl = gl;
360 gh->gh_owner = (flags & GL_NEVER_RECURSE) ? NULL : current;
361 gh->gh_state = state;
362 gh->gh_flags = flags;
363 gh->gh_error = 0;
364 gh->gh_iflags = 0;
365 init_completion(&gh->gh_wait);
366
367 if (gh->gh_state == LM_ST_EXCLUSIVE)
368 gh->gh_flags |= GL_LOCAL_EXCL;
369
370 gfs2_glock_hold(gl);
371}
372
/**
 * gfs2_holder_reinit - reinitialize a struct gfs2_holder so we can requeue it
 * @state: the state we're requesting
 * @flags: the modifier flags
 * @gh: the holder structure
 *
 * Don't mess with the glock.
 *
 */

void gfs2_holder_reinit(unsigned int state, int flags, struct gfs2_holder *gh)
{
	gh->gh_state = state;
	gh->gh_flags = flags;
	if (gh->gh_state == LM_ST_EXCLUSIVE)
		gh->gh_flags |= GL_LOCAL_EXCL;

	/* Keep only HIF_ALLOCED: the holder remembers whether it was heap
	   allocated, but forgets all other state from its previous use. */
	gh->gh_iflags &= 1 << HIF_ALLOCED;
}
392
/**
 * gfs2_holder_uninit - uninitialize a holder structure (drop glock reference)
 * @gh: the holder structure
 *
 */

void gfs2_holder_uninit(struct gfs2_holder *gh)
{
	gfs2_glock_put(gh->gh_gl);
	/* Poison the pointer so reuse without re-init fails fast. */
	gh->gh_gl = NULL;
}
404
405/**
406 * gfs2_holder_get - get a struct gfs2_holder structure
407 * @gl: the glock
408 * @state: the state we're requesting
409 * @flags: the modifier flags
410 * @gfp_flags: __GFP_NOFAIL
411 *
412 * Figure out how big an impact this function has. Either:
413 * 1) Replace it with a cache of structures hanging off the struct gfs2_sbd
414 * 2) Leave it like it is
415 *
416 * Returns: the holder structure, NULL on ENOMEM
417 */
418
419struct gfs2_holder *gfs2_holder_get(struct gfs2_glock *gl, unsigned int state,
420 int flags, gfp_t gfp_flags)
421{
422 struct gfs2_holder *gh;
423
424 gh = kmalloc(sizeof(struct gfs2_holder), gfp_flags);
425 if (!gh)
426 return NULL;
427
428 gfs2_holder_init(gl, state, flags, gh);
429 set_bit(HIF_ALLOCED, &gh->gh_iflags);
430
431 return gh;
432}
433
/**
 * gfs2_holder_put - get rid of a struct gfs2_holder structure
 * @gh: the holder structure
 *
 * Counterpart of gfs2_holder_get(): drops the glock reference and frees
 * the heap-allocated holder.
 */

void gfs2_holder_put(struct gfs2_holder *gh)
{
	gfs2_holder_uninit(gh);
	kfree(gh);
}
445
/**
 * handle_recurse - put other holder structures (marked recursive)
 * into the holders list
 * @gh: the holder structure
 *
 * Called when @gh, a recursive holder owned by some task, is granted:
 * every waiting holder with the same owner is granted along with it.
 */

static void handle_recurse(struct gfs2_holder *gh)
{
	struct gfs2_glock *gl = gh->gh_gl;
	struct gfs2_sbd *sdp = gl->gl_sbd;
	struct gfs2_holder *tmp_gh, *safe;
	int found = 0;

	/* Recursive holders must belong to a task. */
	if (gfs2_assert_warn(sdp, gh->gh_owner))
		return;

	list_for_each_entry_safe(tmp_gh, safe, &gl->gl_waiters3, gh_list) {
		if (tmp_gh->gh_owner != gh->gh_owner)
			continue;

		gfs2_assert_warn(sdp,
				 test_bit(HIF_RECURSE, &tmp_gh->gh_iflags));

		/* Grant the sibling holder and wake its waiter. */
		list_move_tail(&tmp_gh->gh_list, &gl->gl_holders);
		tmp_gh->gh_error = 0;
		set_bit(HIF_HOLDER, &tmp_gh->gh_iflags);

		complete(&tmp_gh->gh_wait);

		found = 1;
	}

	/* A recursive grant with no sibling waiter indicates a bug. */
	gfs2_assert_warn(sdp, found);
}
481
/**
 * do_unrecurse - a recursive holder was just dropped of the waiters3 list
 * @gh: the holder
 *
 * If there is only one other recursive holder, clear its HIF_RECURSE bit.
 * If there is more than one, leave them alone.
 *
 */

static void do_unrecurse(struct gfs2_holder *gh)
{
	struct gfs2_glock *gl = gh->gh_gl;
	struct gfs2_sbd *sdp = gl->gl_sbd;
	struct gfs2_holder *tmp_gh, *last_gh = NULL;
	int found = 0;

	if (gfs2_assert_warn(sdp, gh->gh_owner))
		return;

	list_for_each_entry(tmp_gh, &gl->gl_waiters3, gh_list) {
		if (tmp_gh->gh_owner != gh->gh_owner)
			continue;

		gfs2_assert_warn(sdp,
				 test_bit(HIF_RECURSE, &tmp_gh->gh_iflags));

		/* A second same-owner waiter exists: leave flags alone. */
		if (found)
			return;

		found = 1;
		last_gh = tmp_gh;
	}

	/* Exactly one sibling remains: it is no longer "recursive". */
	if (!gfs2_assert_warn(sdp, found))
		clear_bit(HIF_RECURSE, &last_gh->gh_iflags);
}
518
/**
 * rq_mutex - process a mutex request in the queue
 * @gh: the glock holder
 *
 * Returns: 1 if the queue is blocked
 */

static int rq_mutex(struct gfs2_holder *gh)
{
	struct gfs2_glock *gl = gh->gh_gl;

	list_del_init(&gh->gh_list);
	/* gh->gh_error never examined. */
	/* The waiter now owns the glock's GLF_LOCK "mutex"; queue
	   processing stops until it is released. */
	set_bit(GLF_LOCK, &gl->gl_flags);
	complete(&gh->gh_wait);

	return 1;
}
537
/**
 * rq_promote - process a promote request in the queue
 * @gh: the glock holder
 *
 * Acquire a new inter-node lock, or change a lock state to more restrictive.
 *
 * Called with gl->gl_spin held; drops and reacquires it around the
 * lock-module call.
 *
 * Returns: 1 if the queue is blocked
 */

static int rq_promote(struct gfs2_holder *gh)
{
	struct gfs2_glock *gl = gh->gh_gl;
	struct gfs2_sbd *sdp = gl->gl_sbd;
	struct gfs2_glock_operations *glops = gl->gl_ops;
	int recurse;

	if (!relaxed_state_ok(gl->gl_state, gh->gh_state, gh->gh_flags)) {
		/* Current state is insufficient: once all holders are gone,
		   ask the lock module for a state change. */
		if (list_empty(&gl->gl_holders)) {
			gl->gl_req_gh = gh;
			set_bit(GLF_LOCK, &gl->gl_flags);
			spin_unlock(&gl->gl_spin);

			/* Keep glock growth in check before acquiring a new
			   lock (skipped for priority requests). */
			if (atomic_read(&sdp->sd_reclaim_count) >
			    gfs2_tune_get(sdp, gt_reclaim_limit) &&
			    !(gh->gh_flags & LM_FLAG_PRIORITY)) {
				gfs2_reclaim_glock(sdp);
				gfs2_reclaim_glock(sdp);
			}

			glops->go_xmote_th(gl, gh->gh_state,
					   gh->gh_flags);

			spin_lock(&gl->gl_spin);
		}
		return 1;
	}

	if (list_empty(&gl->gl_holders)) {
		set_bit(HIF_FIRST, &gh->gh_iflags);
		set_bit(GLF_LOCK, &gl->gl_flags);
		recurse = 0;
	} else {
		struct gfs2_holder *next_gh;
		/* Local exclusivity: neither the new holder nor the current
		   first holder may demand to be alone. */
		if (gh->gh_flags & GL_LOCAL_EXCL)
			return 1;
		next_gh = list_entry(gl->gl_holders.next, struct gfs2_holder,
				     gh_list);
		if (next_gh->gh_flags & GL_LOCAL_EXCL)
			return 1;
		recurse = test_bit(HIF_RECURSE, &gh->gh_iflags);
	}

	/* Grant this holder ... */
	list_move_tail(&gh->gh_list, &gl->gl_holders);
	gh->gh_error = 0;
	set_bit(HIF_HOLDER, &gh->gh_iflags);

	/* ... and any same-owner recursive siblings along with it. */
	if (recurse)
		handle_recurse(gh);

	complete(&gh->gh_wait);

	return 0;
}
601
/**
 * rq_demote - process a demote request in the queue
 * @gh: the glock holder
 *
 * Called with gl->gl_spin held; drops and reacquires it around the
 * lock-module call.
 *
 * Returns: 1 if the queue is blocked
 */

static int rq_demote(struct gfs2_holder *gh)
{
	struct gfs2_glock *gl = gh->gh_gl;
	struct gfs2_glock_operations *glops = gl->gl_ops;

	/* Can't demote while anyone still holds the glock. */
	if (!list_empty(&gl->gl_holders))
		return 1;

	if (gl->gl_state == gh->gh_state || gl->gl_state == LM_ST_UNLOCKED) {
		/* Already demoted far enough: just complete the request. */
		list_del_init(&gh->gh_list);
		gh->gh_error = 0;
		spin_unlock(&gl->gl_spin);
		if (test_bit(HIF_DEALLOC, &gh->gh_iflags))
			gfs2_holder_put(gh);
		else
			complete(&gh->gh_wait);
		spin_lock(&gl->gl_spin);
	} else {
		gl->gl_req_gh = gh;
		set_bit(GLF_LOCK, &gl->gl_flags);
		spin_unlock(&gl->gl_spin);

		/* Drop the lock entirely, unless an exclusive lock can be
		   converted down to the requested state. */
		if (gh->gh_state == LM_ST_UNLOCKED ||
		    gl->gl_state != LM_ST_EXCLUSIVE)
			glops->go_drop_th(gl);
		else
			glops->go_xmote_th(gl, gh->gh_state, gh->gh_flags);

		spin_lock(&gl->gl_spin);
	}

	return 0;
}
642
643/**
644 * rq_greedy - process a queued request to drop greedy status
645 * @gh: the glock holder
646 *
647 * Returns: 1 if the queue is blocked
648 */
649
650static int rq_greedy(struct gfs2_holder *gh)
651{
652 struct gfs2_glock *gl = gh->gh_gl;
653
654 list_del_init(&gh->gh_list);
655 /* gh->gh_error never examined. */
656 clear_bit(GLF_GREEDY, &gl->gl_flags);
657 spin_unlock(&gl->gl_spin);
658
659 gfs2_holder_uninit(gh);
660 kfree(container_of(gh, struct greedy, gr_gh));
661
662 spin_lock(&gl->gl_spin);
663
664 return 0;
665}
666
/**
 * run_queue - process holder structures on a glock
 * @gl: the glock
 *
 * Called with gl->gl_spin held.  Dispatches queued requests in
 * priority order -- gl_waiters1 (glmutex), then gl_waiters2
 * (demote/greedy, unless GLF_SKIP_WAITERS2 is set), then gl_waiters3
 * (promote) -- until a request blocks or all queues are empty.
 */

static void run_queue(struct gfs2_glock *gl)
{
	struct gfs2_holder *gh;
	int blocked = 1;

	for (;;) {
		/* Someone owns the glmutex; nothing can be dispatched */
		if (test_bit(GLF_LOCK, &gl->gl_flags))
			break;

		if (!list_empty(&gl->gl_waiters1)) {
			gh = list_entry(gl->gl_waiters1.next,
					struct gfs2_holder, gh_list);

			if (test_bit(HIF_MUTEX, &gh->gh_iflags))
				blocked = rq_mutex(gh);
			else
				gfs2_assert_warn(gl->gl_sbd, 0);

		} else if (!list_empty(&gl->gl_waiters2) &&
			   !test_bit(GLF_SKIP_WAITERS2, &gl->gl_flags)) {
			gh = list_entry(gl->gl_waiters2.next,
					struct gfs2_holder, gh_list);

			if (test_bit(HIF_DEMOTE, &gh->gh_iflags))
				blocked = rq_demote(gh);
			else if (test_bit(HIF_GREEDY, &gh->gh_iflags))
				blocked = rq_greedy(gh);
			else
				gfs2_assert_warn(gl->gl_sbd, 0);

		} else if (!list_empty(&gl->gl_waiters3)) {
			gh = list_entry(gl->gl_waiters3.next,
					struct gfs2_holder, gh_list);

			if (test_bit(HIF_PROMOTE, &gh->gh_iflags))
				blocked = rq_promote(gh);
			else
				gfs2_assert_warn(gl->gl_sbd, 0);

		} else
			break;

		if (blocked)
			break;
	}
}
719
720/**
721 * gfs2_glmutex_lock - acquire a local lock on a glock
722 * @gl: the glock
723 *
724 * Gives caller exclusive access to manipulate a glock structure.
725 */
726
727void gfs2_glmutex_lock(struct gfs2_glock *gl)
728{
729 struct gfs2_holder gh;
730
731 gfs2_holder_init(gl, 0, 0, &gh);
732 set_bit(HIF_MUTEX, &gh.gh_iflags);
733
734 spin_lock(&gl->gl_spin);
735 if (test_and_set_bit(GLF_LOCK, &gl->gl_flags))
736 list_add_tail(&gh.gh_list, &gl->gl_waiters1);
737 else
738 complete(&gh.gh_wait);
739 spin_unlock(&gl->gl_spin);
740
741 wait_for_completion(&gh.gh_wait);
742 gfs2_holder_uninit(&gh);
743}
744
745/**
746 * gfs2_glmutex_trylock - try to acquire a local lock on a glock
747 * @gl: the glock
748 *
749 * Returns: 1 if the glock is acquired
750 */
751
752int gfs2_glmutex_trylock(struct gfs2_glock *gl)
753{
754 int acquired = 1;
755
756 spin_lock(&gl->gl_spin);
757 if (test_and_set_bit(GLF_LOCK, &gl->gl_flags))
758 acquired = 0;
759 spin_unlock(&gl->gl_spin);
760
761 return acquired;
762}
763
/**
 * gfs2_glmutex_unlock - release a local lock on a glock
 * @gl: the glock
 *
 * Clears GLF_LOCK and re-runs the request queue, since dropping the
 * bit may allow queued requests to be dispatched.
 */

void gfs2_glmutex_unlock(struct gfs2_glock *gl)
{
	spin_lock(&gl->gl_spin);
	clear_bit(GLF_LOCK, &gl->gl_flags);
	run_queue(gl);
	spin_unlock(&gl->gl_spin);
}
777
778/**
779 * handle_callback - add a demote request to a lock's queue
780 * @gl: the glock
781 * @state: the state the caller wants us to change to
782 *
783 */
784
785static void handle_callback(struct gfs2_glock *gl, unsigned int state)
786{
787 struct gfs2_holder *gh, *new_gh = NULL;
788
789 restart:
790 spin_lock(&gl->gl_spin);
791
792 list_for_each_entry(gh, &gl->gl_waiters2, gh_list) {
793 if (test_bit(HIF_DEMOTE, &gh->gh_iflags) &&
794 gl->gl_req_gh != gh) {
795 if (gh->gh_state != state)
796 gh->gh_state = LM_ST_UNLOCKED;
797 goto out;
798 }
799 }
800
801 if (new_gh) {
802 list_add_tail(&new_gh->gh_list, &gl->gl_waiters2);
803 new_gh = NULL;
804 } else {
805 spin_unlock(&gl->gl_spin);
806
807 new_gh = gfs2_holder_get(gl, state,
808 LM_FLAG_TRY | GL_NEVER_RECURSE,
809 GFP_KERNEL | __GFP_NOFAIL),
810 set_bit(HIF_DEMOTE, &new_gh->gh_iflags);
811 set_bit(HIF_DEALLOC, &new_gh->gh_iflags);
812
813 goto restart;
814 }
815
816 out:
817 spin_unlock(&gl->gl_spin);
818
819 if (new_gh)
820 gfs2_holder_put(new_gh);
821}
822
823/**
824 * state_change - record that the glock is now in a different state
825 * @gl: the glock
826 * @new_state the new state
827 *
828 */
829
830static void state_change(struct gfs2_glock *gl, unsigned int new_state)
831{
832 struct gfs2_sbd *sdp = gl->gl_sbd;
833 int held1, held2;
834
835 held1 = (gl->gl_state != LM_ST_UNLOCKED);
836 held2 = (new_state != LM_ST_UNLOCKED);
837
838 if (held1 != held2) {
839 if (held2) {
840 atomic_inc(&sdp->sd_glock_held_count);
841 gfs2_glock_hold(gl);
842 } else {
843 atomic_dec(&sdp->sd_glock_held_count);
844 gfs2_glock_put(gl);
845 }
846 }
847
848 gl->gl_state = new_state;
849}
850
/**
 * xmote_bh - Called after the lock module is done acquiring a lock
 * @gl: The glock in question
 * @ret: the int returned from the lock module
 *
 * Bottom half of a lock-state change request.  Records the new state,
 * invalidates cached data where needed, then resolves the requesting
 * holder (gl->gl_req_gh) according to how the request ended.  Drops
 * the glock reference the top half took, and wakes or frees the
 * requesting holder -- except on a grant, where the operation stays
 * open for glock_wait_internal() to finish.
 */

static void xmote_bh(struct gfs2_glock *gl, unsigned int ret)
{
	struct gfs2_sbd *sdp = gl->gl_sbd;
	struct gfs2_glock_operations *glops = gl->gl_ops;
	struct gfs2_holder *gh = gl->gl_req_gh;
	int prev_state = gl->gl_state;
	int op_done = 1;

	gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
	gfs2_assert_warn(sdp, queue_empty(gl, &gl->gl_holders));
	gfs2_assert_warn(sdp, !(ret & LM_OUT_ASYNC));

	state_change(gl, ret & LM_OUT_ST_MASK);

	if (prev_state != LM_ST_UNLOCKED && !(ret & LM_OUT_CACHEABLE)) {
		/* Lock module says our cached copy is no longer valid */
		if (glops->go_inval)
			glops->go_inval(gl, DIO_METADATA | DIO_DATA);
	} else if (gl->gl_state == LM_ST_DEFERRED) {
		/* We might not want to do this here.
		   Look at moving to the inode glops. */
		if (glops->go_inval)
			glops->go_inval(gl, DIO_DATA);
	}

	/* Deal with each possible exit condition */

	if (!gh)
		/* No requesting holder (e.g. prefetch): just timestamp */
		gl->gl_stamp = jiffies;

	else if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) {
		/* Filesystem is being withdrawn: fail the request */
		spin_lock(&gl->gl_spin);
		list_del_init(&gh->gh_list);
		gh->gh_error = -EIO;
		if (test_bit(HIF_RECURSE, &gh->gh_iflags))
			do_unrecurse(gh);
		spin_unlock(&gl->gl_spin);

	} else if (test_bit(HIF_DEMOTE, &gh->gh_iflags)) {
		/* Demote request: success if the target state (or full
		   unlock) was reached; anything else must be a TRY */
		spin_lock(&gl->gl_spin);
		list_del_init(&gh->gh_list);
		if (gl->gl_state == gh->gh_state ||
		    gl->gl_state == LM_ST_UNLOCKED)
			gh->gh_error = 0;
		else {
			if (gfs2_assert_warn(sdp, gh->gh_flags &
					(LM_FLAG_TRY | LM_FLAG_TRY_1CB)) == -1)
				fs_warn(sdp, "ret = 0x%.8X\n", ret);
			gh->gh_error = GLR_TRYFAILED;
		}
		spin_unlock(&gl->gl_spin);

		if (ret & LM_OUT_CANCELED)
			handle_callback(gl, LM_ST_UNLOCKED); /* Lame */

	} else if (ret & LM_OUT_CANCELED) {
		/* Request canceled (see do_cancels()) */
		spin_lock(&gl->gl_spin);
		list_del_init(&gh->gh_list);
		gh->gh_error = GLR_CANCELED;
		if (test_bit(HIF_RECURSE, &gh->gh_iflags))
			do_unrecurse(gh);
		spin_unlock(&gl->gl_spin);

	} else if (relaxed_state_ok(gl->gl_state, gh->gh_state, gh->gh_flags)) {
		/* Granted: promote the holder.  Leave the operation
		   open (op_done = 0) -- the waiter runs go_lock and
		   releases GLF_LOCK in glock_wait_internal() */
		spin_lock(&gl->gl_spin);
		list_move_tail(&gh->gh_list, &gl->gl_holders);
		gh->gh_error = 0;
		set_bit(HIF_HOLDER, &gh->gh_iflags);
		spin_unlock(&gl->gl_spin);

		set_bit(HIF_FIRST, &gh->gh_iflags);

		op_done = 0;

	} else if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) {
		/* TRY request that didn't get the state it wanted */
		spin_lock(&gl->gl_spin);
		list_del_init(&gh->gh_list);
		gh->gh_error = GLR_TRYFAILED;
		if (test_bit(HIF_RECURSE, &gh->gh_iflags))
			do_unrecurse(gh);
		spin_unlock(&gl->gl_spin);

	} else {
		/* No other outcome should be possible */
		if (gfs2_assert_withdraw(sdp, 0) == -1)
			fs_err(sdp, "ret = 0x%.8X\n", ret);
	}

	if (glops->go_xmote_bh)
		glops->go_xmote_bh(gl);

	if (op_done) {
		spin_lock(&gl->gl_spin);
		gl->gl_req_gh = NULL;
		gl->gl_req_bh = NULL;
		clear_bit(GLF_LOCK, &gl->gl_flags);
		run_queue(gl);
		spin_unlock(&gl->gl_spin);
	}

	/* Drop the reference taken by the top half */
	gfs2_glock_put(gl);

	if (gh) {
		if (test_bit(HIF_DEALLOC, &gh->gh_iflags))
			gfs2_holder_put(gh);
		else
			complete(&gh->gh_wait);
	}
}
965
/**
 * gfs2_glock_xmote_th - Call into the lock module to acquire or change a glock
 * @gl: The glock in question
 * @state: the requested state
 * @flags: modifier flags to the lock call
 *
 * Caller must hold the glmutex (GLF_LOCK) and there must be no active
 * holders.  Flushes dirty data out if leaving EXCLUSIVE, takes a
 * glock reference for the bottom half, then calls the lock module;
 * synchronous results are handled immediately via xmote_bh().
 */

void gfs2_glock_xmote_th(struct gfs2_glock *gl, unsigned int state, int flags)
{
	struct gfs2_sbd *sdp = gl->gl_sbd;
	struct gfs2_glock_operations *glops = gl->gl_ops;
	/* Only these flags are passed through to the lock module */
	int lck_flags = flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB |
				 LM_FLAG_NOEXP | LM_FLAG_ANY |
				 LM_FLAG_PRIORITY);
	unsigned int lck_ret;

	gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
	gfs2_assert_warn(sdp, queue_empty(gl, &gl->gl_holders));
	gfs2_assert_warn(sdp, state != LM_ST_UNLOCKED);
	gfs2_assert_warn(sdp, state != gl->gl_state);

	if (gl->gl_state == LM_ST_EXCLUSIVE) {
		/* Flush dirty data before giving up exclusivity */
		if (glops->go_sync)
			glops->go_sync(gl,
				       DIO_METADATA | DIO_DATA | DIO_RELEASE);
	}

	/* Reference for the bottom half; dropped in xmote_bh() */
	gfs2_glock_hold(gl);
	gl->gl_req_bh = xmote_bh;

	atomic_inc(&sdp->sd_lm_lock_calls);

	lck_ret = gfs2_lm_lock(sdp, gl->gl_lock, gl->gl_state, state,
			       lck_flags);

	if (gfs2_assert_withdraw(sdp, !(lck_ret & LM_OUT_ERROR)))
		return;

	if (lck_ret & LM_OUT_ASYNC)
		/* Bottom half will run from the LM_CB_ASYNC callback */
		gfs2_assert_warn(sdp, lck_ret == LM_OUT_ASYNC);
	else
		xmote_bh(gl, lck_ret);
}
1010
/**
 * drop_bh - Called after a lock module unlock completes
 * @gl: the glock
 * @ret: the return status (must be 0)
 *
 * Records the transition to LM_ST_UNLOCKED, invalidates all cached
 * data, retires the requesting holder (if any), releases the glmutex,
 * and drops the reference the top half took out.  The requesting
 * holder is woken -- or freed, for HIF_DEALLOC holders -- at the end.
 * (The previous header comment claimed neither happened here; the
 * code below clearly does both.)
 */

static void drop_bh(struct gfs2_glock *gl, unsigned int ret)
{
	struct gfs2_sbd *sdp = gl->gl_sbd;
	struct gfs2_glock_operations *glops = gl->gl_ops;
	struct gfs2_holder *gh = gl->gl_req_gh;

	clear_bit(GLF_PREFETCH, &gl->gl_flags);

	gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
	gfs2_assert_warn(sdp, queue_empty(gl, &gl->gl_holders));
	gfs2_assert_warn(sdp, !ret);

	state_change(gl, LM_ST_UNLOCKED);

	/* Nothing stays cacheable after a full unlock */
	if (glops->go_inval)
		glops->go_inval(gl, DIO_METADATA | DIO_DATA);

	if (gh) {
		spin_lock(&gl->gl_spin);
		list_del_init(&gh->gh_list);
		gh->gh_error = 0;
		spin_unlock(&gl->gl_spin);
	}

	if (glops->go_drop_bh)
		glops->go_drop_bh(gl);

	spin_lock(&gl->gl_spin);
	gl->gl_req_gh = NULL;
	gl->gl_req_bh = NULL;
	clear_bit(GLF_LOCK, &gl->gl_flags);
	run_queue(gl);
	spin_unlock(&gl->gl_spin);

	/* Drop the reference taken by the top half */
	gfs2_glock_put(gl);

	if (gh) {
		if (test_bit(HIF_DEALLOC, &gh->gh_iflags))
			gfs2_holder_put(gh);
		else
			complete(&gh->gh_wait);
	}
}
1064
/**
 * gfs2_glock_drop_th - call into the lock module to unlock a lock
 * @gl: the glock
 *
 * Caller must hold the glmutex (GLF_LOCK) and there must be no active
 * holders.  Flushes dirty data if the lock is EXCLUSIVE, takes a
 * glock reference for the bottom half, then asks the lock module to
 * unlock; a synchronous result is handled immediately via drop_bh().
 */

void gfs2_glock_drop_th(struct gfs2_glock *gl)
{
	struct gfs2_sbd *sdp = gl->gl_sbd;
	struct gfs2_glock_operations *glops = gl->gl_ops;
	unsigned int ret;

	gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
	gfs2_assert_warn(sdp, queue_empty(gl, &gl->gl_holders));
	gfs2_assert_warn(sdp, gl->gl_state != LM_ST_UNLOCKED);

	if (gl->gl_state == LM_ST_EXCLUSIVE) {
		/* Flush dirty data before giving up the lock */
		if (glops->go_sync)
			glops->go_sync(gl,
				       DIO_METADATA | DIO_DATA | DIO_RELEASE);
	}

	/* Reference for the bottom half; dropped in drop_bh() */
	gfs2_glock_hold(gl);
	gl->gl_req_bh = drop_bh;

	atomic_inc(&sdp->sd_lm_unlock_calls);

	ret = gfs2_lm_unlock(sdp, gl->gl_lock, gl->gl_state);

	if (gfs2_assert_withdraw(sdp, !(ret & LM_OUT_ERROR)))
		return;

	if (!ret)
		drop_bh(gl, ret);
	else
		/* Completion will arrive via the LM_CB_ASYNC callback */
		gfs2_assert_warn(sdp, ret == LM_OUT_ASYNC);
}
1102
1103/**
1104 * do_cancels - cancel requests for locks stuck waiting on an expire flag
1105 * @gh: the LM_FLAG_PRIORITY holder waiting to acquire the lock
1106 *
1107 * Don't cancel GL_NOCANCEL requests.
1108 */
1109
1110static void do_cancels(struct gfs2_holder *gh)
1111{
1112 struct gfs2_glock *gl = gh->gh_gl;
1113
1114 spin_lock(&gl->gl_spin);
1115
1116 while (gl->gl_req_gh != gh &&
1117 !test_bit(HIF_HOLDER, &gh->gh_iflags) &&
1118 !list_empty(&gh->gh_list)) {
1119 if (gl->gl_req_bh &&
1120 !(gl->gl_req_gh &&
1121 (gl->gl_req_gh->gh_flags & GL_NOCANCEL))) {
1122 spin_unlock(&gl->gl_spin);
1123 gfs2_lm_cancel(gl->gl_sbd, gl->gl_lock);
1124 msleep(100);
1125 spin_lock(&gl->gl_spin);
1126 } else {
1127 spin_unlock(&gl->gl_spin);
1128 msleep(100);
1129 spin_lock(&gl->gl_spin);
1130 }
1131 }
1132
1133 spin_unlock(&gl->gl_spin);
1134}
1135
/**
 * glock_wait_internal - wait on a glock acquisition
 * @gh: the glock holder
 *
 * Returns: 0 on success, GLR_* or errno otherwise
 */

static int glock_wait_internal(struct gfs2_holder *gh)
{
	struct gfs2_glock *gl = gh->gh_gl;
	struct gfs2_sbd *sdp = gl->gl_sbd;
	struct gfs2_glock_operations *glops = gl->gl_ops;

	if (test_bit(HIF_ABORTED, &gh->gh_iflags))
		return -EIO;

	if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) {
		/* A TRY request still sitting on a waiters list has
		   failed; dequeue it now instead of blocking */
		spin_lock(&gl->gl_spin);
		if (gl->gl_req_gh != gh &&
		    !test_bit(HIF_HOLDER, &gh->gh_iflags) &&
		    !list_empty(&gh->gh_list)) {
			list_del_init(&gh->gh_list);
			gh->gh_error = GLR_TRYFAILED;
			if (test_bit(HIF_RECURSE, &gh->gh_iflags))
				do_unrecurse(gh);
			run_queue(gl);
			spin_unlock(&gl->gl_spin);
			return gh->gh_error;
		}
		spin_unlock(&gl->gl_spin);
	}

	if (gh->gh_flags & LM_FLAG_PRIORITY)
		/* Cancel requests we're stuck waiting behind */
		do_cancels(gh);

	wait_for_completion(&gh->gh_wait);

	if (gh->gh_error)
		return gh->gh_error;

	gfs2_assert_withdraw(sdp, test_bit(HIF_HOLDER, &gh->gh_iflags));
	gfs2_assert_withdraw(sdp, relaxed_state_ok(gl->gl_state,
						   gh->gh_state,
						   gh->gh_flags));

	if (test_bit(HIF_FIRST, &gh->gh_iflags)) {
		/* First holder after a state change finishes the
		   operation: run go_lock, then release the glmutex
		   that xmote_bh() left held */
		gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));

		if (glops->go_lock) {
			gh->gh_error = glops->go_lock(gh);
			if (gh->gh_error) {
				spin_lock(&gl->gl_spin);
				list_del_init(&gh->gh_list);
				if (test_and_clear_bit(HIF_RECURSE,
						       &gh->gh_iflags))
					do_unrecurse(gh);
				spin_unlock(&gl->gl_spin);
			}
		}

		spin_lock(&gl->gl_spin);
		gl->gl_req_gh = NULL;
		gl->gl_req_bh = NULL;
		clear_bit(GLF_LOCK, &gl->gl_flags);
		if (test_bit(HIF_RECURSE, &gh->gh_iflags))
			handle_recurse(gh);
		run_queue(gl);
		spin_unlock(&gl->gl_spin);
	}

	return gh->gh_error;
}
1208
1209static inline struct gfs2_holder *
1210find_holder_by_owner(struct list_head *head, struct task_struct *owner)
1211{
1212 struct gfs2_holder *gh;
1213
1214 list_for_each_entry(gh, head, gh_list) {
1215 if (gh->gh_owner == owner)
1216 return gh;
1217 }
1218
1219 return NULL;
1220}
1221
/**
 * recurse_check - validate a recursive request against an existing holder
 * @existing: the holder this task already has on the glock
 * @new: the new holder for the same task
 * @state: the lock state to check @new's request against
 *
 * Make sure the new holder is compatible with the pre-existing one:
 * ANY-state, local-exclusivity and state-compatibility constraints
 * must all hold.  Each check warns (via gfs2_assert_warn) on failure.
 *
 * Returns: 0 if compatible, -EINVAL (with HIF_ABORTED set on @new)
 *          otherwise
 */

static int recurse_check(struct gfs2_holder *existing, struct gfs2_holder *new,
			 unsigned int state)
{
	struct gfs2_sbd *sdp = existing->gh_gl->gl_sbd;

	if (gfs2_assert_warn(sdp, (new->gh_flags & LM_FLAG_ANY) ||
			     !(existing->gh_flags & LM_FLAG_ANY)))
		goto fail;

	if (gfs2_assert_warn(sdp, (existing->gh_flags & GL_LOCAL_EXCL) ||
			     !(new->gh_flags & GL_LOCAL_EXCL)))
		goto fail;

	if (gfs2_assert_warn(sdp, relaxed_state_ok(state, new->gh_state,
						   new->gh_flags)))
		goto fail;

	return 0;

 fail:
	set_bit(HIF_ABORTED, &new->gh_iflags);
	return -EINVAL;
}
1252
/**
 * add_to_queue - Add a holder to the wait queue (but look for recursion)
 * @gh: the holder structure to add
 *
 * Called with gl->gl_spin held.  If this task already holds the glock,
 * grant the new request immediately; if it already has a promote
 * request queued, mark both recursive so they are granted together.
 * Otherwise queue on gl_waiters3 (at the head for priority requests).
 */

static void add_to_queue(struct gfs2_holder *gh)
{
	struct gfs2_glock *gl = gh->gh_gl;
	struct gfs2_holder *existing;

	/* Ownerless holders can never be recursive */
	if (!gh->gh_owner)
		goto out;

	existing = find_holder_by_owner(&gl->gl_holders, gh->gh_owner);
	if (existing) {
		if (recurse_check(existing, gh, gl->gl_state))
			return;

		/* Already a holder: grant right away */
		list_add_tail(&gh->gh_list, &gl->gl_holders);
		set_bit(HIF_HOLDER, &gh->gh_iflags);

		gh->gh_error = 0;
		complete(&gh->gh_wait);

		return;
	}

	existing = find_holder_by_owner(&gl->gl_waiters3, gh->gh_owner);
	if (existing) {
		if (recurse_check(existing, gh, existing->gh_state))
			return;

		/* Both requests will be granted together (see
		   handle_recurse()) */
		set_bit(HIF_RECURSE, &gh->gh_iflags);
		set_bit(HIF_RECURSE, &existing->gh_iflags);

		list_add_tail(&gh->gh_list, &gl->gl_waiters3);

		return;
	}

 out:
	if (gh->gh_flags & LM_FLAG_PRIORITY)
		list_add(&gh->gh_list, &gl->gl_waiters3);
	else
		list_add_tail(&gh->gh_list, &gl->gl_waiters3);
}
1300
/**
 * gfs2_glock_nq - enqueue a struct gfs2_holder onto a glock (acquire a glock)
 * @gh: the holder structure
 *
 * if (gh->gh_flags & GL_ASYNC), this never returns an error
 *
 * Returns: 0, GLR_TRYFAILED, or errno on failure
 */

int gfs2_glock_nq(struct gfs2_holder *gh)
{
	struct gfs2_glock *gl = gh->gh_gl;
	struct gfs2_sbd *sdp = gl->gl_sbd;
	int error = 0;

	atomic_inc(&sdp->sd_glock_nq_calls);

 restart:
	if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) {
		/* Filesystem withdrawn: refuse all new requests */
		set_bit(HIF_ABORTED, &gh->gh_iflags);
		return -EIO;
	}

	set_bit(HIF_PROMOTE, &gh->gh_iflags);

	spin_lock(&gl->gl_spin);
	add_to_queue(gh);
	run_queue(gl);
	spin_unlock(&gl->gl_spin);

	if (!(gh->gh_flags & GL_ASYNC)) {
		error = glock_wait_internal(gh);
		if (error == GLR_CANCELED) {
			/* Back off and requeue after a cancel */
			msleep(1000);
			goto restart;
		}
	}

	clear_bit(GLF_PREFETCH, &gl->gl_flags);

	return error;
}
1343
/**
 * gfs2_glock_poll - poll to see if an async request has been completed
 * @gh: the holder
 *
 * Returns: 1 if the request is ready to be gfs2_glock_wait()ed on
 */

int gfs2_glock_poll(struct gfs2_holder *gh)
{
	struct gfs2_glock *gl = gh->gh_gl;
	int ready = 0;

	spin_lock(&gl->gl_spin);

	if (test_bit(HIF_HOLDER, &gh->gh_iflags))
		ready = 1;
	else if (list_empty(&gh->gh_list)) {
		/* Off all queues: request finished one way or another */
		if (gh->gh_error == GLR_CANCELED) {
			/* Canceled: back off, then resubmit; report
			   ready only if the resubmit itself failed */
			spin_unlock(&gl->gl_spin);
			msleep(1000);
			if (gfs2_glock_nq(gh))
				return 1;
			return 0;
		} else
			ready = 1;
	}

	spin_unlock(&gl->gl_spin);

	return ready;
}
1375
1376/**
1377 * gfs2_glock_wait - wait for a lock acquisition that ended in a GLR_ASYNC
1378 * @gh: the holder structure
1379 *
1380 * Returns: 0, GLR_TRYFAILED, or errno on failure
1381 */
1382
1383int gfs2_glock_wait(struct gfs2_holder *gh)
1384{
1385 int error;
1386
1387 error = glock_wait_internal(gh);
1388 if (error == GLR_CANCELED) {
1389 msleep(1000);
1390 gh->gh_flags &= ~GL_ASYNC;
1391 error = gfs2_glock_nq(gh);
1392 }
1393
1394 return error;
1395}
1396
/**
 * gfs2_glock_dq - dequeue a struct gfs2_holder from a glock (release a glock)
 * @gh: the glock holder
 *
 * When the last holder goes away, runs go_unlock and (for GL_SYNC
 * holders) syncs the glock's data, then releases the glmutex and
 * re-runs the request queue.
 */

void gfs2_glock_dq(struct gfs2_holder *gh)
{
	struct gfs2_glock *gl = gh->gh_gl;
	struct gfs2_sbd *sdp = gl->gl_sbd;
	struct gfs2_glock_operations *glops = gl->gl_ops;

	atomic_inc(&sdp->sd_glock_dq_calls);

	if (gh->gh_flags & GL_SYNC)
		set_bit(GLF_SYNC, &gl->gl_flags);

	if (gh->gh_flags & GL_NOCACHE)
		/* Queue a demote so the lock isn't cached after release */
		handle_callback(gl, LM_ST_UNLOCKED);

	gfs2_glmutex_lock(gl);

	spin_lock(&gl->gl_spin);
	list_del_init(&gh->gh_list);

	if (list_empty(&gl->gl_holders)) {
		/* Last holder: do the unlock work with the spinlock
		   dropped (we still own the glmutex) */
		spin_unlock(&gl->gl_spin);

		if (glops->go_unlock)
			glops->go_unlock(gh);

		if (test_bit(GLF_SYNC, &gl->gl_flags)) {
			if (glops->go_sync)
				glops->go_sync(gl, DIO_METADATA | DIO_DATA);
		}

		gl->gl_stamp = jiffies;

		spin_lock(&gl->gl_spin);
	}

	clear_bit(GLF_LOCK, &gl->gl_flags);
	run_queue(gl);
	spin_unlock(&gl->gl_spin);
}
1442
1443/**
1444 * gfs2_glock_prefetch - Try to prefetch a glock
1445 * @gl: the glock
1446 * @state: the state to prefetch in
1447 * @flags: flags passed to go_xmote_th()
1448 *
1449 */
1450
1451void gfs2_glock_prefetch(struct gfs2_glock *gl, unsigned int state, int flags)
1452{
1453 struct gfs2_glock_operations *glops = gl->gl_ops;
1454
1455 spin_lock(&gl->gl_spin);
1456
1457 if (test_bit(GLF_LOCK, &gl->gl_flags) ||
1458 !list_empty(&gl->gl_holders) ||
1459 !list_empty(&gl->gl_waiters1) ||
1460 !list_empty(&gl->gl_waiters2) ||
1461 !list_empty(&gl->gl_waiters3) ||
1462 relaxed_state_ok(gl->gl_state, state, flags)) {
1463 spin_unlock(&gl->gl_spin);
1464 return;
1465 }
1466
1467 set_bit(GLF_PREFETCH, &gl->gl_flags);
1468 set_bit(GLF_LOCK, &gl->gl_flags);
1469 spin_unlock(&gl->gl_spin);
1470
1471 glops->go_xmote_th(gl, state, flags);
1472
1473 atomic_inc(&gl->gl_sbd->sd_glock_prefetch_calls);
1474}
1475
/**
 * gfs2_glock_force_drop - Force a glock to be uncached
 * @gl: the glock
 *
 * Queues a demote-to-UNLOCKED request on gl_waiters2 and waits for it
 * to be processed.
 */

void gfs2_glock_force_drop(struct gfs2_glock *gl)
{
	struct gfs2_holder gh;

	gfs2_holder_init(gl, LM_ST_UNLOCKED, GL_NEVER_RECURSE, &gh);
	set_bit(HIF_DEMOTE, &gh.gh_iflags);

	spin_lock(&gl->gl_spin);
	list_add_tail(&gh.gh_list, &gl->gl_waiters2);
	run_queue(gl);
	spin_unlock(&gl->gl_spin);

	wait_for_completion(&gh.gh_wait);
	gfs2_holder_uninit(&gh);
}
1497
/*
 * greedy_work - delayed-work handler ending a glock's greedy period
 * @data: the struct greedy queued by gfs2_glock_be_greedy()
 *
 * Re-enables processing of gl_waiters2, calls go_greedy (if any),
 * then either frees the greedy structure or, if demote requests
 * arrived meanwhile, queues its embedded holder so rq_greedy() can
 * clear GLF_GREEDY and free it in queue order.
 */
static void greedy_work(void *data)
{
	struct greedy *gr = (struct greedy *)data;
	struct gfs2_holder *gh = &gr->gr_gh;
	struct gfs2_glock *gl = gh->gh_gl;
	struct gfs2_glock_operations *glops = gl->gl_ops;

	clear_bit(GLF_SKIP_WAITERS2, &gl->gl_flags);

	if (glops->go_greedy)
		glops->go_greedy(gl);

	spin_lock(&gl->gl_spin);

	if (list_empty(&gl->gl_waiters2)) {
		clear_bit(GLF_GREEDY, &gl->gl_flags);
		spin_unlock(&gl->gl_spin);
		gfs2_holder_uninit(gh);
		kfree(gr);
	} else {
		/* Hold the glock across run_queue() so it can't vanish
		   before the final put */
		gfs2_glock_hold(gl);
		list_add_tail(&gh->gh_list, &gl->gl_waiters2);
		run_queue(gl);
		spin_unlock(&gl->gl_spin);
		gfs2_glock_put(gl);
	}
}
1525
1526/**
1527 * gfs2_glock_be_greedy -
1528 * @gl:
1529 * @time:
1530 *
1531 * Returns: 0 if go_greedy will be called, 1 otherwise
1532 */
1533
1534int gfs2_glock_be_greedy(struct gfs2_glock *gl, unsigned int time)
1535{
1536 struct greedy *gr;
1537 struct gfs2_holder *gh;
1538
1539 if (!time ||
1540 gl->gl_sbd->sd_args.ar_localcaching ||
1541 test_and_set_bit(GLF_GREEDY, &gl->gl_flags))
1542 return 1;
1543
1544 gr = kmalloc(sizeof(struct greedy), GFP_KERNEL);
1545 if (!gr) {
1546 clear_bit(GLF_GREEDY, &gl->gl_flags);
1547 return 1;
1548 }
1549 gh = &gr->gr_gh;
1550
1551 gfs2_holder_init(gl, 0, GL_NEVER_RECURSE, gh);
1552 set_bit(HIF_GREEDY, &gh->gh_iflags);
1553 INIT_WORK(&gr->gr_work, greedy_work, gr);
1554
1555 set_bit(GLF_SKIP_WAITERS2, &gl->gl_flags);
1556 schedule_delayed_work(&gr->gr_work, time);
1557
1558 return 0;
1559}
1560
/**
 * gfs2_glock_nq_init - initialize a holder and enqueue it on a glock
 * @gl: the glock
 * @state: the state we're requesting
 * @flags: the modifier flags
 * @gh: the holder structure
 *
 * The holder is uninitialized again if the enqueue fails.
 *
 * Returns: 0, GLR_*, or errno
 */

int gfs2_glock_nq_init(struct gfs2_glock *gl, unsigned int state, int flags,
		       struct gfs2_holder *gh)
{
	int error;

	gfs2_holder_init(gl, state, flags, gh);

	error = gfs2_glock_nq(gh);
	if (!error)
		return 0;

	gfs2_holder_uninit(gh);
	return error;
}
1584
/**
 * gfs2_glock_dq_uninit - dequeue a holder from a glock and uninitialize it
 * @gh: the holder structure
 *
 */

void gfs2_glock_dq_uninit(struct gfs2_holder *gh)
{
	gfs2_glock_dq(gh);
	gfs2_holder_uninit(gh);
}
1596
1597/**
1598 * gfs2_glock_nq_num - acquire a glock based on lock number
1599 * @sdp: the filesystem
1600 * @number: the lock number
1601 * @glops: the glock operations for the type of glock
1602 * @state: the state to acquire the glock in
1603 * @flags: modifier flags for the aquisition
1604 * @gh: the struct gfs2_holder
1605 *
1606 * Returns: errno
1607 */
1608
1609int gfs2_glock_nq_num(struct gfs2_sbd *sdp, uint64_t number,
1610 struct gfs2_glock_operations *glops, unsigned int state,
1611 int flags, struct gfs2_holder *gh)
1612{
1613 struct gfs2_glock *gl;
1614 int error;
1615
1616 error = gfs2_glock_get(sdp, number, glops, CREATE, &gl);
1617 if (!error) {
1618 error = gfs2_glock_nq_init(gl, state, flags, gh);
1619 gfs2_glock_put(gl);
1620 }
1621
1622 return error;
1623}
1624
1625/**
1626 * glock_compare - Compare two struct gfs2_glock structures for sorting
1627 * @arg_a: the first structure
1628 * @arg_b: the second structure
1629 *
1630 */
1631
1632static int glock_compare(const void *arg_a, const void *arg_b)
1633{
1634 struct gfs2_holder *gh_a = *(struct gfs2_holder **)arg_a;
1635 struct gfs2_holder *gh_b = *(struct gfs2_holder **)arg_b;
1636 struct lm_lockname *a = &gh_a->gh_gl->gl_name;
1637 struct lm_lockname *b = &gh_b->gh_gl->gl_name;
1638 int ret = 0;
1639
1640 if (a->ln_number > b->ln_number)
1641 ret = 1;
1642 else if (a->ln_number < b->ln_number)
1643 ret = -1;
1644 else {
1645 if (gh_a->gh_state == LM_ST_SHARED &&
1646 gh_b->gh_state == LM_ST_EXCLUSIVE)
1647 ret = 1;
1648 else if (!(gh_a->gh_flags & GL_LOCAL_EXCL) &&
1649 (gh_b->gh_flags & GL_LOCAL_EXCL))
1650 ret = 1;
1651 }
1652
1653 return ret;
1654}
1655
/**
 * nq_m_sync - synchronously acquire more than one glock in deadlock free order
 * @num_gh: the number of structures
 * @ghs: an array of struct gfs2_holder structures
 * @p: scratch array of at least @num_gh holder pointers
 *
 * Sorts the holders with glock_compare() so that all callers acquire
 * their locks in the same global order, then takes them one by one,
 * backing out everything acquired so far if any enqueue fails.
 *
 * Returns: 0 on success (all glocks acquired),
 *          errno on failure (no glocks acquired)
 */

static int nq_m_sync(unsigned int num_gh, struct gfs2_holder *ghs,
		     struct gfs2_holder **p)
{
	unsigned int x;
	int error = 0;

	for (x = 0; x < num_gh; x++)
		p[x] = &ghs[x];

	sort(p, num_gh, sizeof(struct gfs2_holder *), glock_compare, NULL);

	for (x = 0; x < num_gh; x++) {
		/* Force a synchronous, blocking acquisition */
		p[x]->gh_flags &= ~(LM_FLAG_TRY | GL_ASYNC);

		error = gfs2_glock_nq(p[x]);
		if (error) {
			/* Undo everything taken so far */
			while (x--)
				gfs2_glock_dq(p[x]);
			break;
		}
	}

	return error;
}
1689
/**
 * gfs2_glock_nq_m - acquire multiple glocks
 * @num_gh: the number of structures
 * @ghs: an array of struct gfs2_holder structures
 *
 * First fires off all the requests asynchronously with LM_FLAG_TRY;
 * if any fail with only TRY/CANCEL errors, falls back to the sorted,
 * synchronous nq_m_sync().
 *
 * Figure out how big an impact this function has.  Either:
 * 1) Replace this code with code that calls gfs2_glock_prefetch()
 * 2) Forget async stuff and just call nq_m_sync()
 * 3) Leave it like it is
 *
 * Returns: 0 on success (all glocks acquired),
 *          errno on failure (no glocks acquired)
 */

int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs)
{
	/* NOTE(review): e serves double duty -- an int array of
	   per-holder wait results here, then (cast) the pointer
	   scratch array for nq_m_sync().  It is sized for the larger
	   pointer type.  Fragile but apparently intentional. */
	int *e;
	unsigned int x;
	int borked = 0, serious = 0;
	int error = 0;

	if (!num_gh)
		return 0;

	if (num_gh == 1) {
		/* Single lock: no ordering concerns, go synchronous */
		ghs->gh_flags &= ~(LM_FLAG_TRY | GL_ASYNC);
		return gfs2_glock_nq(ghs);
	}

	e = kcalloc(num_gh, sizeof(struct gfs2_holder *), GFP_KERNEL);
	if (!e)
		return -ENOMEM;

	/* Fire off all the requests asynchronously */
	for (x = 0; x < num_gh; x++) {
		ghs[x].gh_flags |= LM_FLAG_TRY | GL_ASYNC;
		error = gfs2_glock_nq(&ghs[x]);
		if (error) {
			borked = 1;
			serious = error;
			num_gh = x;	/* only wait on the submitted ones */
			break;
		}
	}

	/* Wait for them all, recording per-holder results */
	for (x = 0; x < num_gh; x++) {
		error = e[x] = glock_wait_internal(&ghs[x]);
		if (error) {
			borked = 1;
			if (error != GLR_TRYFAILED && error != GLR_CANCELED)
				serious = error;
		}
	}

	if (!borked) {
		kfree(e);
		return 0;
	}

	/* Drop whatever we did manage to acquire */
	for (x = 0; x < num_gh; x++)
		if (!e[x])
			gfs2_glock_dq(&ghs[x]);

	if (serious)
		error = serious;
	else {
		/* Only TRY/CANCEL failures: retry in sorted order */
		for (x = 0; x < num_gh; x++)
			gfs2_holder_reinit(ghs[x].gh_state, ghs[x].gh_flags,
					   &ghs[x]);
		error = nq_m_sync(num_gh, ghs, (struct gfs2_holder **)e);
	}

	kfree(e);

	return error;
}
1765
1766/**
1767 * gfs2_glock_dq_m - release multiple glocks
1768 * @num_gh: the number of structures
1769 * @ghs: an array of struct gfs2_holder structures
1770 *
1771 */
1772
1773void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs)
1774{
1775 unsigned int x;
1776
1777 for (x = 0; x < num_gh; x++)
1778 gfs2_glock_dq(&ghs[x]);
1779}
1780
1781/**
1782 * gfs2_glock_dq_uninit_m - release multiple glocks
1783 * @num_gh: the number of structures
1784 * @ghs: an array of struct gfs2_holder structures
1785 *
1786 */
1787
1788void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs)
1789{
1790 unsigned int x;
1791
1792 for (x = 0; x < num_gh; x++)
1793 gfs2_glock_dq_uninit(&ghs[x]);
1794}
1795
1796/**
1797 * gfs2_glock_prefetch_num - prefetch a glock based on lock number
1798 * @sdp: the filesystem
1799 * @number: the lock number
1800 * @glops: the glock operations for the type of glock
1801 * @state: the state to acquire the glock in
1802 * @flags: modifier flags for the aquisition
1803 *
1804 * Returns: errno
1805 */
1806
1807void gfs2_glock_prefetch_num(struct gfs2_sbd *sdp, uint64_t number,
1808 struct gfs2_glock_operations *glops,
1809 unsigned int state, int flags)
1810{
1811 struct gfs2_glock *gl;
1812 int error;
1813
1814 if (atomic_read(&sdp->sd_reclaim_count) <
1815 gfs2_tune_get(sdp, gt_reclaim_limit)) {
1816 error = gfs2_glock_get(sdp, number, glops, CREATE, &gl);
1817 if (!error) {
1818 gfs2_glock_prefetch(gl, state, flags);
1819 gfs2_glock_put(gl);
1820 }
1821 }
1822}
1823
1824/**
1825 * gfs2_lvb_hold - attach a LVB from a glock
1826 * @gl: The glock in question
1827 *
1828 */
1829
1830int gfs2_lvb_hold(struct gfs2_glock *gl)
1831{
1832 int error;
1833
1834 gfs2_glmutex_lock(gl);
1835
1836 if (!atomic_read(&gl->gl_lvb_count)) {
1837 error = gfs2_lm_hold_lvb(gl->gl_sbd, gl->gl_lock, &gl->gl_lvb);
1838 if (error) {
1839 gfs2_glmutex_unlock(gl);
1840 return error;
1841 }
1842 gfs2_glock_hold(gl);
1843 }
1844 atomic_inc(&gl->gl_lvb_count);
1845
1846 gfs2_glmutex_unlock(gl);
1847
1848 return 0;
1849}
1850
/**
 * gfs2_lvb_unhold - detach a LVB from a glock
 * @gl: The glock in question
 *
 * Drops one reference on the lock value block; the last one releases
 * the LVB back to the lock module and drops the glock reference taken
 * by gfs2_lvb_hold().
 */

void gfs2_lvb_unhold(struct gfs2_glock *gl)
{
	/* Temporary reference so the glock can't vanish under us */
	gfs2_glock_hold(gl);
	gfs2_glmutex_lock(gl);

	gfs2_assert(gl->gl_sbd, atomic_read(&gl->gl_lvb_count) > 0);
	if (atomic_dec_and_test(&gl->gl_lvb_count)) {
		gfs2_lm_unhold_lvb(gl->gl_sbd, gl->gl_lock, gl->gl_lvb);
		gl->gl_lvb = NULL;
		gfs2_glock_put(gl);
	}

	gfs2_glmutex_unlock(gl);
	gfs2_glock_put(gl);
}
1872
/*
 * gfs2_lvb_sync - have the lock module write out the LVB contents
 * @gl: the glock; warns (and skips the sync) unless held exclusively
 */
void gfs2_lvb_sync(struct gfs2_glock *gl)
{
	gfs2_glmutex_lock(gl);

	gfs2_assert(gl->gl_sbd, atomic_read(&gl->gl_lvb_count));
	if (!gfs2_assert_warn(gl->gl_sbd, gfs2_glock_is_held_excl(gl)))
		gfs2_lm_sync_lvb(gl->gl_sbd, gl->gl_lock, gl->gl_lvb);

	gfs2_glmutex_unlock(gl);
}
1883
/*
 * blocking_cb - handle a blocking callback from the lock module
 * @sdp: the filesystem
 * @name: the name of the lock another node wants
 * @state: the demote target state
 *
 * Queues a demote request on the glock (if we have it cached) and
 * kicks the queue.
 */
static void blocking_cb(struct gfs2_sbd *sdp, struct lm_lockname *name,
			unsigned int state)
{
	struct gfs2_glock *gl;

	gl = gfs2_glock_find(sdp, name);
	if (!gl)
		/* We don't have this lock cached: nothing to give up */
		return;

	if (gl->gl_ops->go_callback)
		gl->gl_ops->go_callback(gl, state);
	handle_callback(gl, state);

	spin_lock(&gl->gl_spin);
	run_queue(gl);
	spin_unlock(&gl->gl_spin);

	/* presumably pairs with a reference taken by gfs2_glock_find()
	   -- verify against its definition */
	gfs2_glock_put(gl);
}
1903
/**
 * gfs2_glock_cb - Callback used by locking module
 * @fsdata: Pointer to the superblock
 * @type: Type of callback
 * @data: Type dependent data pointer
 *
 * Called by the locking module when it wants to tell us something.
 * Either we need to drop a lock, one of our ASYNC requests completed, or
 * a journal from another client needs to be recovered.
 */

void gfs2_glock_cb(lm_fsdata_t *fsdata, unsigned int type, void *data)
{
	struct gfs2_sbd *sdp = (struct gfs2_sbd *)fsdata;

	atomic_inc(&sdp->sd_lm_callbacks);

	switch (type) {
	case LM_CB_NEED_E:
		blocking_cb(sdp, (struct lm_lockname *)data, LM_ST_UNLOCKED);
		return;

	case LM_CB_NEED_D:
		blocking_cb(sdp, (struct lm_lockname *)data, LM_ST_DEFERRED);
		return;

	case LM_CB_NEED_S:
		blocking_cb(sdp, (struct lm_lockname *)data, LM_ST_SHARED);
		return;

	case LM_CB_ASYNC: {
		/* An earlier asynchronous lock request completed; hand the
		   result to the glock's registered completion callback */
		struct lm_async_cb *async = (struct lm_async_cb *)data;
		struct gfs2_glock *gl;

		gl = gfs2_glock_find(sdp, &async->lc_name);
		if (gfs2_assert_warn(sdp, gl))
			return;
		if (!gfs2_assert_warn(sdp, gl->gl_req_bh))
			gl->gl_req_bh(gl, async->lc_ret);
		gfs2_glock_put(gl);

		return;
	}

	case LM_CB_NEED_RECOVERY:
		/* Another node's journal needs replay: mark it dirty and
		   wake the recovery daemon */
		gfs2_jdesc_make_dirty(sdp, *(unsigned int *)data);
		if (sdp->sd_recoverd_process)
			wake_up_process(sdp->sd_recoverd_process);
		return;

	case LM_CB_DROPLOCKS:
		/* Lock manager is low on resources: shed cached locks
		   without waiting */
		gfs2_gl_hash_clear(sdp, NO_WAIT);
		gfs2_quota_scan(sdp);
		return;

	default:
		gfs2_assert_warn(sdp, 0);
		return;
	}
}
1964
/**
 * gfs2_try_toss_inode - try to remove a particular inode struct from cache
 * @sdp: the filesystem
 * @inum: the inode number
 *
 * Best effort only: silently gives up if the glock doesn't already
 * exist (NO_CREATE), is busy, or the inode is still referenced.
 */

void gfs2_try_toss_inode(struct gfs2_sbd *sdp, struct gfs2_inum *inum)
{
	struct gfs2_glock *gl;
	struct gfs2_inode *ip;
	int error;

	error = gfs2_glock_get(sdp, inum->no_addr, &gfs2_inode_glops,
			       NO_CREATE, &gl);
	if (error || !gl)
		return;

	if (!gfs2_glmutex_trylock(gl))
		goto out;

	ip = get_gl2ip(gl);
	if (!ip)
		goto out_unlock;

	/* Only destroy an inode nobody else is using */
	if (atomic_read(&ip->i_count))
		goto out_unlock;

	gfs2_inode_destroy(ip);

 out_unlock:
	gfs2_glmutex_unlock(gl);

 out:
	gfs2_glock_put(gl);
}
2001
2002/**
2003 * gfs2_iopen_go_callback - Try to kick the inode/vnode associated with an
2004 * iopen glock from memory
2005 * @io_gl: the iopen glock
2006 * @state: the state into which the glock should be put
2007 *
2008 */
2009
2010void gfs2_iopen_go_callback(struct gfs2_glock *io_gl, unsigned int state)
2011{
2012 struct gfs2_glock *i_gl;
2013
2014 if (state != LM_ST_UNLOCKED)
2015 return;
2016
2017 spin_lock(&io_gl->gl_spin);
2018 i_gl = get_gl2gl(io_gl);
2019 if (i_gl) {
2020 gfs2_glock_hold(i_gl);
2021 spin_unlock(&io_gl->gl_spin);
2022 } else {
2023 spin_unlock(&io_gl->gl_spin);
2024 return;
2025 }
2026
2027 if (gfs2_glmutex_trylock(i_gl)) {
2028 struct gfs2_inode *ip = get_gl2ip(i_gl);
2029 if (ip) {
2030 gfs2_try_toss_vnode(ip);
2031 gfs2_glmutex_unlock(i_gl);
2032 gfs2_glock_schedule_for_reclaim(i_gl);
2033 goto out;
2034 }
2035 gfs2_glmutex_unlock(i_gl);
2036 }
2037
2038 out:
2039 gfs2_glock_put(i_gl);
2040}
2041
2042/**
2043 * demote_ok - Check to see if it's ok to unlock a glock
2044 * @gl: the glock
2045 *
2046 * Returns: 1 if it's ok
2047 */
2048
2049static int demote_ok(struct gfs2_glock *gl)
2050{
2051 struct gfs2_sbd *sdp = gl->gl_sbd;
2052 struct gfs2_glock_operations *glops = gl->gl_ops;
2053 int demote = 1;
2054
2055 if (test_bit(GLF_STICKY, &gl->gl_flags))
2056 demote = 0;
2057 else if (test_bit(GLF_PREFETCH, &gl->gl_flags))
2058 demote = time_after_eq(jiffies,
2059 gl->gl_stamp +
2060 gfs2_tune_get(sdp, gt_prefetch_secs) * HZ);
2061 else if (glops->go_demote_ok)
2062 demote = glops->go_demote_ok(gl);
2063
2064 return demote;
2065}
2066
/**
 * gfs2_glock_schedule_for_reclaim - Add a glock to the reclaim list
 * @gl: the glock
 *
 * Takes a glock reference for the list entry; a no-op if the glock is
 * already queued.  Always wakes the reclaim daemon.
 */

void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl)
{
	struct gfs2_sbd *sdp = gl->gl_sbd;

	spin_lock(&sdp->sd_reclaim_lock);
	if (list_empty(&gl->gl_reclaim)) {
		gfs2_glock_hold(gl);	/* dropped by gfs2_reclaim_glock() */
		list_add(&gl->gl_reclaim, &sdp->sd_reclaim_list);
		atomic_inc(&sdp->sd_reclaim_count);
	}
	spin_unlock(&sdp->sd_reclaim_lock);

	wake_up(&sdp->sd_reclaim_wq);
}
2087
/**
 * gfs2_reclaim_glock - process the next glock on the filesystem's reclaim list
 * @sdp: the filesystem
 *
 * Called from gfs2_glockd() glock reclaim daemon, or when promoting a
 * different glock and we notice that there are a lot of glocks in the
 * reclaim list.
 *
 * Pops one glock off the list, tries to destroy an unreferenced inode
 * attached to it, and queues a demote if the glock is idle and eligible.
 */

void gfs2_reclaim_glock(struct gfs2_sbd *sdp)
{
	struct gfs2_glock *gl;

	spin_lock(&sdp->sd_reclaim_lock);
	if (list_empty(&sdp->sd_reclaim_list)) {
		spin_unlock(&sdp->sd_reclaim_lock);
		return;
	}
	gl = list_entry(sdp->sd_reclaim_list.next,
			struct gfs2_glock, gl_reclaim);
	list_del_init(&gl->gl_reclaim);
	spin_unlock(&sdp->sd_reclaim_lock);

	atomic_dec(&sdp->sd_reclaim_count);
	atomic_inc(&sdp->sd_reclaimed);

	/* Best effort: skip if the glock is busy */
	if (gfs2_glmutex_trylock(gl)) {
		if (gl->gl_ops == &gfs2_inode_glops) {
			struct gfs2_inode *ip = get_gl2ip(gl);
			if (ip && !atomic_read(&ip->i_count))
				gfs2_inode_destroy(ip);
		}
		if (queue_empty(gl, &gl->gl_holders) &&
		    gl->gl_state != LM_ST_UNLOCKED &&
		    demote_ok(gl))
			handle_callback(gl, LM_ST_UNLOCKED);
		gfs2_glmutex_unlock(gl);
	}

	/* drops the ref taken by gfs2_glock_schedule_for_reclaim() */
	gfs2_glock_put(gl);
}
2130
/**
 * examine_bucket - Call a function for glock in a hash bucket
 * @examiner: the function
 * @sdp: the filesystem
 * @bucket: the bucket
 *
 * Walks the bucket without holding its lock across the examiner call:
 * a dummy "plug" entry marks our position so the list may change while
 * the lock is dropped.  The examiner receives a held reference and is
 * responsible for the matching gfs2_glock_put().
 *
 * Returns: 1 if the bucket has entries
 */

static int examine_bucket(glock_examiner examiner, struct gfs2_sbd *sdp,
			  struct gfs2_gl_hash_bucket *bucket)
{
	struct glock_plug plug;
	struct list_head *tmp;
	struct gfs2_glock *gl;
	int entries;

	/* Add "plug" to end of bucket list, work back up list from there */
	memset(&plug.gl_flags, 0, sizeof(unsigned long));
	set_bit(GLF_PLUG, &plug.gl_flags);

	write_lock(&bucket->hb_lock);
	list_add(&plug.gl_list, &bucket->hb_list);
	write_unlock(&bucket->hb_lock);

	for (;;) {
		write_lock(&bucket->hb_lock);

		for (;;) {
			tmp = plug.gl_list.next;

			/* Reached the head: remove the plug and finish */
			if (tmp == &bucket->hb_list) {
				list_del(&plug.gl_list);
				entries = !list_empty(&bucket->hb_list);
				write_unlock(&bucket->hb_lock);
				return entries;
			}
			gl = list_entry(tmp, struct gfs2_glock, gl_list);

			/* Move plug up list */
			list_move(&plug.gl_list, &gl->gl_list);

			/* Skip other walkers' plug entries */
			if (test_bit(GLF_PLUG, &gl->gl_flags))
				continue;

			/* examiner() must glock_put() */
			gfs2_glock_hold(gl);

			break;
		}

		write_unlock(&bucket->hb_lock);

		examiner(gl);
	}
}
2187
/**
 * scan_glock - look at a glock and see if we can reclaim it
 * @gl: the glock to look at
 *
 * examine_bucket() callback.  Schedules the glock for reclaim when it
 * carries an unreferenced inode, or is idle and eligible for demotion.
 * Drops the reference taken by examine_bucket().
 */

static void scan_glock(struct gfs2_glock *gl)
{
	if (gfs2_glmutex_trylock(gl)) {
		if (gl->gl_ops == &gfs2_inode_glops) {
			struct gfs2_inode *ip = get_gl2ip(gl);
			if (ip && !atomic_read(&ip->i_count))
				goto out_schedule;
		}
		if (queue_empty(gl, &gl->gl_holders) &&
		    gl->gl_state != LM_ST_UNLOCKED &&
		    demote_ok(gl))
			goto out_schedule;

		gfs2_glmutex_unlock(gl);
	}

	gfs2_glock_put(gl);

	return;

 out_schedule:
	gfs2_glmutex_unlock(gl);
	gfs2_glock_schedule_for_reclaim(gl);
	gfs2_glock_put(gl);
}
2219
/**
 * gfs2_scand_internal - Look for glocks and inodes to toss from memory
 * @sdp: the filesystem
 *
 * Scans every hash bucket, rescheduling between buckets to stay
 * preemption-friendly.
 */

void gfs2_scand_internal(struct gfs2_sbd *sdp)
{
	unsigned int x;

	for (x = 0; x < GFS2_GL_HASH_SIZE; x++) {
		examine_bucket(scan_glock, sdp, &sdp->sd_gl_hash[x]);
		cond_resched();
	}
}
2235
/**
 * clear_glock - look at a glock and see if we can free it from glock cache
 * @gl: the glock to look at
 *
 * examine_bucket() callback used while emptying the hash table.  Pulls
 * the glock off the reclaim list if present, destroys any unreferenced
 * inode, and unconditionally demotes an idle, locked glock.  Drops the
 * reference taken by examine_bucket().
 */

static void clear_glock(struct gfs2_glock *gl)
{
	struct gfs2_sbd *sdp = gl->gl_sbd;
	int released;

	/* Remove from the reclaim list and drop that list's reference;
	   it must not be the last one while we still use the glock */
	spin_lock(&sdp->sd_reclaim_lock);
	if (!list_empty(&gl->gl_reclaim)) {
		list_del_init(&gl->gl_reclaim);
		atomic_dec(&sdp->sd_reclaim_count);
		released = gfs2_glock_put(gl);
		gfs2_assert(sdp, !released);
	}
	spin_unlock(&sdp->sd_reclaim_lock);

	if (gfs2_glmutex_trylock(gl)) {
		if (gl->gl_ops == &gfs2_inode_glops) {
			struct gfs2_inode *ip = get_gl2ip(gl);
			if (ip && !atomic_read(&ip->i_count))
				gfs2_inode_destroy(ip);
		}
		/* Unlike scan_glock(), no demote_ok() check: we're tearing
		   the cache down */
		if (queue_empty(gl, &gl->gl_holders) &&
		    gl->gl_state != LM_ST_UNLOCKED)
			handle_callback(gl, LM_ST_UNLOCKED);

		gfs2_glmutex_unlock(gl);
	}

	gfs2_glock_put(gl);
}
2271
/**
 * gfs2_gl_hash_clear - Empty out the glock hash table
 * @sdp: the filesystem
 * @wait: wait until it's all gone
 *
 * Called when unmounting the filesystem, or when inter-node lock manager
 * requests DROPLOCKS because it is running out of capacity.
 *
 * With @wait set, loops until every bucket is empty, periodically
 * dumping lock state if progress appears stalled.
 */

void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait)
{
	unsigned long t;
	unsigned int x;
	int cont;

	t = jiffies;

	for (;;) {
		cont = 0;

		for (x = 0; x < GFS2_GL_HASH_SIZE; x++)
			if (examine_bucket(clear_glock, sdp,
					   &sdp->sd_gl_hash[x]))
				cont = 1;

		if (!wait || !cont)
			break;

		/* Warn (and reset the timer) if we've made no visible
		   progress for gt_stall_secs */
		if (time_after_eq(jiffies,
				  t + gfs2_tune_get(sdp, gt_stall_secs) * HZ)) {
			fs_warn(sdp, "Unmount seems to be stalled. "
				     "Dumping lock state...\n");
			gfs2_dump_lockstate(sdp);
			t = jiffies;
		}

		/* invalidate_inodes() requires that the sb inodes list
		   not change, but an async completion callback for an
		   unlock can occur which does glock_put() which
		   can call iput() which will change the sb inodes list.
		   invalidate_inodes_mutex prevents glock_put()'s during
		   an invalidate_inodes() */

		mutex_lock(&sdp->sd_invalidate_inodes_mutex);
		invalidate_inodes(sdp->sd_vfs);
		mutex_unlock(&sdp->sd_invalidate_inodes_mutex);
		yield();
	}
}
2321
2322/*
2323 * Diagnostic routines to help debug distributed deadlock
2324 */
2325
2326/**
2327 * dump_holder - print information about a glock holder
2328 * @str: a string naming the type of holder
2329 * @gh: the glock holder
2330 *
2331 * Returns: 0 on success, -ENOBUFS when we run out of space
2332 */
2333
2334static int dump_holder(char *str, struct gfs2_holder *gh)
2335{
2336 unsigned int x;
2337 int error = -ENOBUFS;
2338
2339 printk(" %s\n", str);
2340 printk(" owner = %ld\n",
2341 (gh->gh_owner) ? (long)gh->gh_owner->pid : -1);
2342 printk(" gh_state = %u\n", gh->gh_state);
2343 printk(" gh_flags =");
2344 for (x = 0; x < 32; x++)
2345 if (gh->gh_flags & (1 << x))
2346 printk(" %u", x);
2347 printk(" \n");
2348 printk(" error = %d\n", gh->gh_error);
2349 printk(" gh_iflags =");
2350 for (x = 0; x < 32; x++)
2351 if (test_bit(x, &gh->gh_iflags))
2352 printk(" %u", x);
2353 printk(" \n");
2354
2355 error = 0;
2356
2357 return error;
2358}
2359
2360/**
2361 * dump_inode - print information about an inode
2362 * @ip: the inode
2363 *
2364 * Returns: 0 on success, -ENOBUFS when we run out of space
2365 */
2366
2367static int dump_inode(struct gfs2_inode *ip)
2368{
2369 unsigned int x;
2370 int error = -ENOBUFS;
2371
2372 printk(" Inode:\n");
2373 printk(" num = %llu %llu\n",
2374 ip->i_num.no_formal_ino, ip->i_num.no_addr);
2375 printk(" type = %u\n", IF2DT(ip->i_di.di_mode));
2376 printk(" i_count = %d\n", atomic_read(&ip->i_count));
2377 printk(" i_flags =");
2378 for (x = 0; x < 32; x++)
2379 if (test_bit(x, &ip->i_flags))
2380 printk(" %u", x);
2381 printk(" \n");
2382 printk(" vnode = %s\n", (ip->i_vnode) ? "yes" : "no");
2383
2384 error = 0;
2385
2386 return error;
2387}
2388
/**
 * dump_glock - print information about a glock
 * @gl: the glock
 *
 * Dumps the glock's state, flags, reference counts, and every attached
 * holder/waiter to the console while holding gl_spin.
 *
 * Returns: 0 on success, -ENOBUFS when we run out of space
 */

static int dump_glock(struct gfs2_glock *gl)
{
	struct gfs2_holder *gh;
	unsigned int x;
	int error = -ENOBUFS;

	spin_lock(&gl->gl_spin);

	printk("Glock (%u, %llu)\n",
	       gl->gl_name.ln_type,
	       gl->gl_name.ln_number);
	printk(" gl_flags =");
	for (x = 0; x < 32; x++)
		if (test_bit(x, &gl->gl_flags))
			printk(" %u", x);
	printk(" \n");
	printk(" gl_ref = %d\n", atomic_read(&gl->gl_ref.refcount));
	printk(" gl_state = %u\n", gl->gl_state);
	printk(" req_gh = %s\n", (gl->gl_req_gh) ? "yes" : "no");
	printk(" req_bh = %s\n", (gl->gl_req_bh) ? "yes" : "no");
	printk(" lvb_count = %d\n", atomic_read(&gl->gl_lvb_count));
	printk(" object = %s\n", (gl->gl_object) ? "yes" : "no");
	printk(" le = %s\n",
	       (list_empty(&gl->gl_le.le_list)) ? "no" : "yes");
	printk(" reclaim = %s\n",
	       (list_empty(&gl->gl_reclaim)) ? "no" : "yes");
	if (gl->gl_aspace)
		printk(" aspace = %lu\n",
		       gl->gl_aspace->i_mapping->nrpages);
	else
		printk(" aspace = no\n");
	printk(" ail = %d\n", atomic_read(&gl->gl_ail_count));
	if (gl->gl_req_gh) {
		error = dump_holder("Request", gl->gl_req_gh);
		if (error)
			goto out;
	}
	list_for_each_entry(gh, &gl->gl_holders, gh_list) {
		error = dump_holder("Holder", gh);
		if (error)
			goto out;
	}
	list_for_each_entry(gh, &gl->gl_waiters1, gh_list) {
		error = dump_holder("Waiter1", gh);
		if (error)
			goto out;
	}
	list_for_each_entry(gh, &gl->gl_waiters2, gh_list) {
		error = dump_holder("Waiter2", gh);
		if (error)
			goto out;
	}
	list_for_each_entry(gh, &gl->gl_waiters3, gh_list) {
		error = dump_holder("Waiter3", gh);
		if (error)
			goto out;
	}
	if (gl->gl_ops == &gfs2_inode_glops && get_gl2ip(gl)) {
		/* Only dump the inode when it isn't in flux */
		if (!test_bit(GLF_LOCK, &gl->gl_flags) &&
		    list_empty(&gl->gl_holders)) {
			error = dump_inode(get_gl2ip(gl));
			if (error)
				goto out;
		} else {
			error = -ENOBUFS;
			printk(" Inode: busy\n");
		}
	}

	error = 0;

 out:
	spin_unlock(&gl->gl_spin);

	return error;
}
2473
/**
 * gfs2_dump_lockstate - print out the current lockstate
 * @sdp: the filesystem
 *
 * Dumps every non-plug glock in every hash bucket to the console,
 * stopping at the first error.
 *
 * Returns: 0 on success, or the first error from dump_glock()
 */

int gfs2_dump_lockstate(struct gfs2_sbd *sdp)
{
	struct gfs2_gl_hash_bucket *bucket;
	struct gfs2_glock *gl;
	unsigned int x;
	int error = 0;

	for (x = 0; x < GFS2_GL_HASH_SIZE; x++) {
		bucket = &sdp->sd_gl_hash[x];

		read_lock(&bucket->hb_lock);

		list_for_each_entry(gl, &bucket->hb_list, gl_list) {
			/* Skip examine_bucket() placeholder entries */
			if (test_bit(GLF_PLUG, &gl->gl_flags))
				continue;

			error = dump_glock(gl);
			if (error)
				break;
		}

		read_unlock(&bucket->hb_lock);

		if (error)
			break;
	}


	return error;
}
2513
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
new file mode 100644
index 000000000000..06847ebebdee
--- /dev/null
+++ b/fs/gfs2/glock.h
@@ -0,0 +1,143 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __GLOCK_DOT_H__
11#define __GLOCK_DOT_H__
12
13/* Flags for lock requests; used in gfs2_holder gh_flag field.
14 From lm_interface.h:
15#define LM_FLAG_TRY 0x00000001
16#define LM_FLAG_TRY_1CB 0x00000002
17#define LM_FLAG_NOEXP 0x00000004
18#define LM_FLAG_ANY 0x00000008
19#define LM_FLAG_PRIORITY 0x00000010 */
20
21#define GL_LOCAL_EXCL 0x00000020
22#define GL_ASYNC 0x00000040
23#define GL_EXACT 0x00000080
24#define GL_SKIP 0x00000100
25#define GL_ATIME 0x00000200
26#define GL_NOCACHE 0x00000400
27#define GL_SYNC 0x00000800
28#define GL_NOCANCEL 0x00001000
29#define GL_NEVER_RECURSE 0x00002000
30
31#define GLR_TRYFAILED 13
32#define GLR_CANCELED 14
33
34static inline int gfs2_glock_is_locked_by_me(struct gfs2_glock *gl)
35{
36 struct gfs2_holder *gh;
37 int locked = 0;
38
39 /* Look in glock's list of holders for one with current task as owner */
40 spin_lock(&gl->gl_spin);
41 list_for_each_entry(gh, &gl->gl_holders, gh_list) {
42 if (gh->gh_owner == current) {
43 locked = 1;
44 break;
45 }
46 }
47 spin_unlock(&gl->gl_spin);
48
49 return locked;
50}
51
/* Glock state predicates: true if the glock is currently granted in the
   named mode on this node */

static inline int gfs2_glock_is_held_excl(struct gfs2_glock *gl)
{
	return (gl->gl_state == LM_ST_EXCLUSIVE);
}

static inline int gfs2_glock_is_held_dfrd(struct gfs2_glock *gl)
{
	return (gl->gl_state == LM_ST_DEFERRED);
}

static inline int gfs2_glock_is_held_shrd(struct gfs2_glock *gl)
{
	return (gl->gl_state == LM_ST_SHARED);
}
66
67static inline int gfs2_glock_is_blocking(struct gfs2_glock *gl)
68{
69 int ret;
70 spin_lock(&gl->gl_spin);
71 ret = !list_empty(&gl->gl_waiters2) || !list_empty(&gl->gl_waiters3);
72 spin_unlock(&gl->gl_spin);
73 return ret;
74}
75
76struct gfs2_glock *gfs2_glock_find(struct gfs2_sbd *sdp,
77 struct lm_lockname *name);
78int gfs2_glock_get(struct gfs2_sbd *sdp,
79 uint64_t number, struct gfs2_glock_operations *glops,
80 int create, struct gfs2_glock **glp);
81void gfs2_glock_hold(struct gfs2_glock *gl);
82int gfs2_glock_put(struct gfs2_glock *gl);
83
84void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, int flags,
85 struct gfs2_holder *gh);
86void gfs2_holder_reinit(unsigned int state, int flags, struct gfs2_holder *gh);
87void gfs2_holder_uninit(struct gfs2_holder *gh);
88struct gfs2_holder *gfs2_holder_get(struct gfs2_glock *gl, unsigned int state,
89 int flags, gfp_t gfp_flags);
90void gfs2_holder_put(struct gfs2_holder *gh);
91
92void gfs2_glock_xmote_th(struct gfs2_glock *gl, unsigned int state, int flags);
93void gfs2_glock_drop_th(struct gfs2_glock *gl);
94
95void gfs2_glmutex_lock(struct gfs2_glock *gl);
96int gfs2_glmutex_trylock(struct gfs2_glock *gl);
97void gfs2_glmutex_unlock(struct gfs2_glock *gl);
98
99int gfs2_glock_nq(struct gfs2_holder *gh);
100int gfs2_glock_poll(struct gfs2_holder *gh);
101int gfs2_glock_wait(struct gfs2_holder *gh);
102void gfs2_glock_dq(struct gfs2_holder *gh);
103
104void gfs2_glock_prefetch(struct gfs2_glock *gl, unsigned int state, int flags);
105void gfs2_glock_force_drop(struct gfs2_glock *gl);
106
107int gfs2_glock_be_greedy(struct gfs2_glock *gl, unsigned int time);
108
109int gfs2_glock_nq_init(struct gfs2_glock *gl, unsigned int state, int flags,
110 struct gfs2_holder *gh);
111void gfs2_glock_dq_uninit(struct gfs2_holder *gh);
112int gfs2_glock_nq_num(struct gfs2_sbd *sdp,
113 uint64_t number, struct gfs2_glock_operations *glops,
114 unsigned int state, int flags, struct gfs2_holder *gh);
115
116int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs);
117void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs);
118void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs);
119
120void gfs2_glock_prefetch_num(struct gfs2_sbd *sdp, uint64_t number,
121 struct gfs2_glock_operations *glops,
122 unsigned int state, int flags);
123
124/* Lock Value Block functions */
125
126int gfs2_lvb_hold(struct gfs2_glock *gl);
127void gfs2_lvb_unhold(struct gfs2_glock *gl);
128void gfs2_lvb_sync(struct gfs2_glock *gl);
129
130void gfs2_glock_cb(lm_fsdata_t *fsdata, unsigned int type, void *data);
131
132void gfs2_try_toss_inode(struct gfs2_sbd *sdp, struct gfs2_inum *inum);
133void gfs2_iopen_go_callback(struct gfs2_glock *gl, unsigned int state);
134
135void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl);
136void gfs2_reclaim_glock(struct gfs2_sbd *sdp);
137
138void gfs2_scand_internal(struct gfs2_sbd *sdp);
139void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait);
140
141int gfs2_dump_lockstate(struct gfs2_sbd *sdp);
142
143#endif /* __GLOCK_DOT_H__ */
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
new file mode 100644
index 000000000000..27374306ecde
--- /dev/null
+++ b/fs/gfs2/glops.c
@@ -0,0 +1,487 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <asm/semaphore.h>
16
17#include "gfs2.h"
18#include "bmap.h"
19#include "glock.h"
20#include "glops.h"
21#include "inode.h"
22#include "log.h"
23#include "meta_io.h"
24#include "page.h"
25#include "recovery.h"
26#include "rgrp.h"
27
/**
 * meta_go_sync - sync out the metadata for this glock
 * @gl: the glock
 * @flags: DIO_*
 *
 * Called when demoting or unlocking an EX glock. We must flush
 * to disk all dirty buffers/pages relating to this glock, and must not
 * not return to caller to demote/unlock the glock until I/O is complete.
 */

static void meta_go_sync(struct gfs2_glock *gl, int flags)
{
	if (!(flags & DIO_METADATA))
		return;

	if (test_and_clear_bit(GLF_DIRTY, &gl->gl_flags)) {
		/* Log first, then metadata; wait for both before the AIL
		   is emptied on release */
		gfs2_log_flush_glock(gl);
		gfs2_meta_sync(gl, flags | DIO_START | DIO_WAIT);
		if (flags & DIO_RELEASE)
			gfs2_ail_empty_gl(gl);
	}

	clear_bit(GLF_SYNC, &gl->gl_flags);
}
52
53/**
54 * meta_go_inval - invalidate the metadata for this glock
55 * @gl: the glock
56 * @flags:
57 *
58 */
59
60static void meta_go_inval(struct gfs2_glock *gl, int flags)
61{
62 if (!(flags & DIO_METADATA))
63 return;
64
65 gfs2_meta_inval(gl);
66 gl->gl_vn++;
67}
68
69/**
70 * meta_go_demote_ok - Check to see if it's ok to unlock a glock
71 * @gl: the glock
72 *
73 * Returns: 1 if we have no cached data; ok to demote meta glock
74 */
75
76static int meta_go_demote_ok(struct gfs2_glock *gl)
77{
78 return !gl->gl_aspace->i_mapping->nrpages;
79}
80
/**
 * inode_go_xmote_th - promote/demote a glock
 * @gl: the glock
 * @state: the requested state
 * @flags:
 *
 * Page-table entries must be invalidated before the default state
 * change, unless the glock was already unlocked.
 */

static void inode_go_xmote_th(struct gfs2_glock *gl, unsigned int state,
			      int flags)
{
	if (gl->gl_state != LM_ST_UNLOCKED)
		gfs2_pte_inval(gl);
	gfs2_glock_xmote_th(gl, state, flags);
}
96
/**
 * inode_go_xmote_bh - After promoting/demoting a glock
 * @gl: the glock
 *
 * Reads the inode's disk block to prime the metadata cache, unless the
 * glock ended up unlocked or the requester asked to skip it (GL_SKIP).
 */

static void inode_go_xmote_bh(struct gfs2_glock *gl)
{
	struct gfs2_holder *gh = gl->gl_req_gh;
	struct buffer_head *bh;
	int error;

	if (gl->gl_state != LM_ST_UNLOCKED &&
	    (!gh || !(gh->gh_flags & GL_SKIP))) {
		error = gfs2_meta_read(gl, gl->gl_name.ln_number, DIO_START,
				       &bh);
		/* Read is only for cache warming; errors are ignored */
		if (!error)
			brelse(bh);
	}
}
117
/**
 * inode_go_drop_th - unlock a glock
 * @gl: the glock
 *
 * Invoked from rq_demote().
 * Another node needs the lock in EXCLUSIVE mode, or lock (unused for too long)
 * is being purged from our node's glock cache; we're dropping lock.
 */

static void inode_go_drop_th(struct gfs2_glock *gl)
{
	/* Tear down page-table mappings before the lock goes away */
	gfs2_pte_inval(gl);
	gfs2_glock_drop_th(gl);
}
132
/**
 * inode_go_sync - Sync the dirty data and/or metadata for an inode glock
 * @gl: the glock protecting the inode
 * @flags: DIO_METADATA and/or DIO_DATA, plus optional DIO_RELEASE
 *
 * GLF_DIRTY is only cleared when both metadata and data are synced;
 * a partial sync must leave the glock marked dirty.
 */

static void inode_go_sync(struct gfs2_glock *gl, int flags)
{
	int meta = (flags & DIO_METADATA);
	int data = (flags & DIO_DATA);

	if (test_bit(GLF_DIRTY, &gl->gl_flags)) {
		if (meta && data) {
			/* Start data I/O, flush log + metadata, then wait
			   for the data writes to complete */
			gfs2_page_sync(gl, flags | DIO_START);
			gfs2_log_flush_glock(gl);
			gfs2_meta_sync(gl, flags | DIO_START | DIO_WAIT);
			gfs2_page_sync(gl, flags | DIO_WAIT);
			clear_bit(GLF_DIRTY, &gl->gl_flags);
		} else if (meta) {
			gfs2_log_flush_glock(gl);
			gfs2_meta_sync(gl, flags | DIO_START | DIO_WAIT);
		} else if (data)
			gfs2_page_sync(gl, flags | DIO_START | DIO_WAIT);
		if (flags & DIO_RELEASE)
			gfs2_ail_empty_gl(gl);
	}

	clear_bit(GLF_SYNC, &gl->gl_flags);
}
163
164/**
165 * inode_go_inval - prepare a inode glock to be released
166 * @gl: the glock
167 * @flags:
168 *
169 */
170
171static void inode_go_inval(struct gfs2_glock *gl, int flags)
172{
173 int meta = (flags & DIO_METADATA);
174 int data = (flags & DIO_DATA);
175
176 if (meta) {
177 gfs2_meta_inval(gl);
178 gl->gl_vn++;
179 }
180 if (data)
181 gfs2_page_inval(gl);
182}
183
184/**
185 * inode_go_demote_ok - Check to see if it's ok to unlock an inode glock
186 * @gl: the glock
187 *
188 * Returns: 1 if it's ok
189 */
190
191static int inode_go_demote_ok(struct gfs2_glock *gl)
192{
193 struct gfs2_sbd *sdp = gl->gl_sbd;
194 int demote = 0;
195
196 if (!get_gl2ip(gl) && !gl->gl_aspace->i_mapping->nrpages)
197 demote = 1;
198 else if (!sdp->sd_args.ar_localcaching &&
199 time_after_eq(jiffies, gl->gl_stamp +
200 gfs2_tune_get(sdp, gt_demote_secs) * HZ))
201 demote = 1;
202
203 return demote;
204}
205
/**
 * inode_go_lock - operation done after an inode lock is locked by a process
 * @gh: the holder that was granted
 *
 * Refreshes the in-core inode if its version is stale, and resumes an
 * interrupted truncate when we now hold the lock exclusively.
 *
 * Returns: errno
 */

static int inode_go_lock(struct gfs2_holder *gh)
{
	struct gfs2_glock *gl = gh->gh_gl;
	struct gfs2_inode *ip = get_gl2ip(gl);
	int error = 0;

	if (!ip)
		return 0;

	/* Re-read the dinode if another node changed it */
	if (ip->i_vn != gl->gl_vn) {
		error = gfs2_inode_refresh(ip);
		if (error)
			return error;
		gfs2_inode_attr_in(ip);
	}

	if ((ip->i_di.di_flags & GFS2_DIF_TRUNC_IN_PROG) &&
	    (gl->gl_state == LM_ST_EXCLUSIVE) &&
	    (gh->gh_flags & GL_LOCAL_EXCL))
		error = gfs2_truncatei_resume(ip);

	return error;
}
237
238/**
239 * inode_go_unlock - operation done before an inode lock is unlocked by a
240 * process
241 * @gl: the glock
242 * @flags:
243 *
244 */
245
246static void inode_go_unlock(struct gfs2_holder *gh)
247{
248 struct gfs2_glock *gl = gh->gh_gl;
249 struct gfs2_inode *ip = get_gl2ip(gl);
250
251 if (ip && test_bit(GLF_DIRTY, &gl->gl_flags))
252 gfs2_inode_attr_in(ip);
253
254 if (ip)
255 gfs2_meta_cache_flush(ip);
256}
257
/**
 * inode_greedy - adjust an inode's "greedy" hold time
 * @gl: the glock
 *
 * Adapts how long the node holds onto this inode's lock: page faults
 * arriving faster than gt_greedy_quantum grow the hold time (capped at
 * gt_greedy_max); otherwise it shrinks toward 1.  Drops the inode
 * reference taken when the greedy work was queued.
 */

static void inode_greedy(struct gfs2_glock *gl)
{
	struct gfs2_sbd *sdp = gl->gl_sbd;
	struct gfs2_inode *ip = get_gl2ip(gl);
	unsigned int quantum = gfs2_tune_get(sdp, gt_greedy_quantum);
	unsigned int max = gfs2_tune_get(sdp, gt_greedy_max);
	unsigned int new_time;

	spin_lock(&ip->i_spin);

	if (time_after(ip->i_last_pfault + quantum, jiffies)) {
		new_time = ip->i_greedy + quantum;
		if (new_time > max)
			new_time = max;
	} else {
		new_time = ip->i_greedy - quantum;
		/* new_time > max also catches unsigned wraparound when
		   quantum exceeds i_greedy */
		if (!new_time || new_time > max)
			new_time = 1;
	}

	ip->i_greedy = new_time;

	spin_unlock(&ip->i_spin);

	gfs2_inode_put(ip);
}
290
291/**
292 * rgrp_go_demote_ok - Check to see if it's ok to unlock a RG's glock
293 * @gl: the glock
294 *
295 * Returns: 1 if it's ok
296 */
297
298static int rgrp_go_demote_ok(struct gfs2_glock *gl)
299{
300 return !gl->gl_aspace->i_mapping->nrpages;
301}
302
/**
 * rgrp_go_lock - operation done after an rgrp lock is locked by
 *                a first holder on this node.
 * @gh: the holder that was granted
 *
 * Reads the resource group's bitmaps into memory.
 *
 * Returns: errno
 */

static int rgrp_go_lock(struct gfs2_holder *gh)
{
	return gfs2_rgrp_bh_get(get_gl2rgd(gh->gh_gl));
}
316
/**
 * rgrp_go_unlock - operation done before an rgrp lock is unlocked by
 *                  a last holder on this node.
 * @gh: the holder being released
 *
 * Releases the resource group's in-memory bitmap buffers.
 */

static void rgrp_go_unlock(struct gfs2_holder *gh)
{
	gfs2_rgrp_bh_put(get_gl2rgd(gh->gh_gl));
}
329
/**
 * trans_go_xmote_th - promote/demote the transaction glock
 * @gl: the glock
 * @state: the requested state
 * @flags:
 *
 * Giving up the transaction glock requires the journal to be quiesced
 * first: sync all metadata and shut the log down.
 */

static void trans_go_xmote_th(struct gfs2_glock *gl, unsigned int state,
			      int flags)
{
	struct gfs2_sbd *sdp = gl->gl_sbd;

	if (gl->gl_state != LM_ST_UNLOCKED &&
	    test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
		gfs2_meta_syncfs(sdp);
		gfs2_log_shutdown(sdp);
	}

	gfs2_glock_xmote_th(gl, state, flags);
}
351
/**
 * trans_go_xmote_bh - After promoting/demoting the transaction glock
 * @gl: the glock
 *
 * On reacquisition, invalidates the journal glock's cache, verifies the
 * journal was cleanly shut down, and re-initializes the log head.
 */

static void trans_go_xmote_bh(struct gfs2_glock *gl)
{
	struct gfs2_sbd *sdp = gl->gl_sbd;
	struct gfs2_glock *j_gl = get_v2ip(sdp->sd_jdesc->jd_inode)->i_gl;
	struct gfs2_log_header head;
	int error;

	if (gl->gl_state != LM_ST_UNLOCKED &&
	    test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
		gfs2_meta_cache_flush(get_v2ip(sdp->sd_jdesc->jd_inode));
		j_gl->gl_ops->go_inval(j_gl, DIO_METADATA | DIO_DATA);

		/* The journal must have been unmounted cleanly while we
		   didn't hold the glock */
		error = gfs2_find_jhead(sdp->sd_jdesc, &head);
		if (error)
			gfs2_consist(sdp);
		if (!(head.lh_flags & GFS2_LOG_HEAD_UNMOUNT))
			gfs2_consist(sdp);

		/* Initialize some head of the log stuff */
		if (!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) {
			sdp->sd_log_sequence = head.lh_sequence + 1;
			gfs2_log_pointers_init(sdp, head.lh_blkno);
		}
	}
}
383
/**
 * trans_go_drop_th - unlock the transaction glock
 * @gl: the glock
 *
 * We want to sync the device even with localcaching.  Remember
 * that localcaching journal replay only marks buffers dirty.
 */

static void trans_go_drop_th(struct gfs2_glock *gl)
{
	struct gfs2_sbd *sdp = gl->gl_sbd;

	if (test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
		gfs2_meta_syncfs(sdp);
		gfs2_log_shutdown(sdp);
	}

	gfs2_glock_drop_th(gl);
}
403
404/**
405 * quota_go_demote_ok - Check to see if it's ok to unlock a quota glock
406 * @gl: the glock
407 *
408 * Returns: 1 if it's ok
409 */
410
411static int quota_go_demote_ok(struct gfs2_glock *gl)
412{
413 return !atomic_read(&gl->gl_lvb_count);
414}
415
/* Per-glock-type operation tables.  Entries left unset fall back to the
   generic behavior in glock.c. */

/* Generic metadata */
struct gfs2_glock_operations gfs2_meta_glops = {
	.go_xmote_th = gfs2_glock_xmote_th,
	.go_drop_th = gfs2_glock_drop_th,
	.go_sync = meta_go_sync,
	.go_inval = meta_go_inval,
	.go_demote_ok = meta_go_demote_ok,
	.go_type = LM_TYPE_META
};

/* Inodes: PTE invalidation on transition, refresh on lock */
struct gfs2_glock_operations gfs2_inode_glops = {
	.go_xmote_th = inode_go_xmote_th,
	.go_xmote_bh = inode_go_xmote_bh,
	.go_drop_th = inode_go_drop_th,
	.go_sync = inode_go_sync,
	.go_inval = inode_go_inval,
	.go_demote_ok = inode_go_demote_ok,
	.go_lock = inode_go_lock,
	.go_unlock = inode_go_unlock,
	.go_greedy = inode_greedy,
	.go_type = LM_TYPE_INODE
};

/* Resource groups: bitmap buffers cached between lock and unlock */
struct gfs2_glock_operations gfs2_rgrp_glops = {
	.go_xmote_th = gfs2_glock_xmote_th,
	.go_drop_th = gfs2_glock_drop_th,
	.go_sync = meta_go_sync,
	.go_inval = meta_go_inval,
	.go_demote_ok = rgrp_go_demote_ok,
	.go_lock = rgrp_go_lock,
	.go_unlock = rgrp_go_unlock,
	.go_type = LM_TYPE_RGRP
};

/* Transaction glock: quiesces the journal on transition/drop */
struct gfs2_glock_operations gfs2_trans_glops = {
	.go_xmote_th = trans_go_xmote_th,
	.go_xmote_bh = trans_go_xmote_bh,
	.go_drop_th = trans_go_drop_th,
	.go_type = LM_TYPE_NONDISK
};

/* iopen glocks: blocking callback tries to toss the inode */
struct gfs2_glock_operations gfs2_iopen_glops = {
	.go_xmote_th = gfs2_glock_xmote_th,
	.go_drop_th = gfs2_glock_drop_th,
	.go_callback = gfs2_iopen_go_callback,
	.go_type = LM_TYPE_IOPEN
};

struct gfs2_glock_operations gfs2_flock_glops = {
	.go_xmote_th = gfs2_glock_xmote_th,
	.go_drop_th = gfs2_glock_drop_th,
	.go_type = LM_TYPE_FLOCK
};

struct gfs2_glock_operations gfs2_nondisk_glops = {
	.go_xmote_th = gfs2_glock_xmote_th,
	.go_drop_th = gfs2_glock_drop_th,
	.go_type = LM_TYPE_NONDISK
};

/* Quota glocks: demotable once no LVB holders remain */
struct gfs2_glock_operations gfs2_quota_glops = {
	.go_xmote_th = gfs2_glock_xmote_th,
	.go_drop_th = gfs2_glock_drop_th,
	.go_demote_ok = quota_go_demote_ok,
	.go_type = LM_TYPE_QUOTA
};

struct gfs2_glock_operations gfs2_journal_glops = {
	.go_xmote_th = gfs2_glock_xmote_th,
	.go_drop_th = gfs2_glock_drop_th,
	.go_type = LM_TYPE_JOURNAL
};
487
diff --git a/fs/gfs2/glops.h b/fs/gfs2/glops.h
new file mode 100644
index 000000000000..94f2d264aa64
--- /dev/null
+++ b/fs/gfs2/glops.h
@@ -0,0 +1,23 @@
/*
 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
 *
 * This copyrighted material is made available to anyone wishing to use,
 * modify, copy, or redistribute it subject to the terms and conditions
 * of the GNU General Public License v.2.
 */

#ifndef __GLOPS_DOT_H__
#define __GLOPS_DOT_H__

/* The per-lock-type glock operation tables, one per LM_TYPE_* class. */
extern struct gfs2_glock_operations gfs2_meta_glops;
extern struct gfs2_glock_operations gfs2_inode_glops;
extern struct gfs2_glock_operations gfs2_rgrp_glops;
extern struct gfs2_glock_operations gfs2_trans_glops;
extern struct gfs2_glock_operations gfs2_iopen_glops;
extern struct gfs2_glock_operations gfs2_flock_glops;
extern struct gfs2_glock_operations gfs2_nondisk_glops;
extern struct gfs2_glock_operations gfs2_quota_glops;
extern struct gfs2_glock_operations gfs2_journal_glops;

#endif /* __GLOPS_DOT_H__ */
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
new file mode 100644
index 000000000000..7fe422537ff0
--- /dev/null
+++ b/fs/gfs2/incore.h
@@ -0,0 +1,702 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __INCORE_DOT_H__
11#define __INCORE_DOT_H__
12
/*
 * DIO_* : bitmask flags (distinct power-of-two bits) passed to the
 * data/metadata I/O and sync helpers.  Exact semantics are defined at
 * the call sites, which are not visible in this header.
 */
#define DIO_FORCE 0x00000001
#define DIO_CLEAN 0x00000002
#define DIO_DIRTY 0x00000004
#define DIO_START 0x00000008
#define DIO_WAIT 0x00000010
#define DIO_METADATA 0x00000020
#define DIO_DATA 0x00000040
#define DIO_RELEASE 0x00000080
#define DIO_ALL 0x00000100
22
/* Forward declarations for the types defined later in this header. */
struct gfs2_log_operations;
struct gfs2_log_element;
struct gfs2_bitmap;
struct gfs2_rgrpd;
struct gfs2_bufdata;
struct gfs2_glock_operations;
struct gfs2_holder;
struct gfs2_glock;
struct gfs2_alloc;
struct gfs2_inode;
struct gfs2_file;
struct gfs2_revoke;
struct gfs2_revoke_replay;
struct gfs2_unlinked;
struct gfs2_quota_data;
struct gfs2_log_buf;
struct gfs2_trans;
struct gfs2_ail;
struct gfs2_jdesc;
struct gfs2_args;
struct gfs2_tune;
struct gfs2_gl_hash_bucket;
struct gfs2_sbd;

/* Callback type stored in gfs2_glock.gl_req_bh (see struct gfs2_glock below). */
typedef void (*gfs2_glop_bh_t) (struct gfs2_glock *gl, unsigned int ret);
48
/*
 * Structure of operations that are associated with each
 * type of element in the log.
 */

struct gfs2_log_operations {
        void (*lo_add) (struct gfs2_sbd *sdp, struct gfs2_log_element *le);
        void (*lo_incore_commit) (struct gfs2_sbd *sdp, struct gfs2_trans *tr);
        void (*lo_before_commit) (struct gfs2_sbd *sdp);
        void (*lo_after_commit) (struct gfs2_sbd *sdp, struct gfs2_ail *ai);
        void (*lo_before_scan) (struct gfs2_jdesc *jd,
                                struct gfs2_log_header *head, int pass);
        int (*lo_scan_elements) (struct gfs2_jdesc *jd, unsigned int start,
                                 struct gfs2_log_descriptor *ld, __be64 *ptr,
                                 int pass);
        void (*lo_after_scan) (struct gfs2_jdesc *jd, int error, int pass);
        char *lo_name;  /* presumably a printable name for this element type — confirm at users */
};
67
/* A log element: links an object into the log lists via its ops vector. */
struct gfs2_log_element {
        struct list_head le_list;
        struct gfs2_log_operations *le_ops;
};

/* One bitmap region of a resource group (see gfs2_rgrpd.rd_bits). */
struct gfs2_bitmap {
        struct buffer_head *bi_bh;
        char *bi_clone;
        uint32_t bi_offset;
        uint32_t bi_start;
        uint32_t bi_len;
};
80
/* In-core resource group descriptor. */
struct gfs2_rgrpd {
        struct list_head rd_list;       /* Link with superblock */
        struct list_head rd_list_mru;
        struct list_head rd_recent;     /* Recently used rgrps */
        struct gfs2_glock *rd_gl;       /* Glock for this rgrp */
        struct gfs2_rindex rd_ri;
        struct gfs2_rgrp rd_rg;
        uint64_t rd_rg_vn;              /* version number of the cached rd_rg copy */
        struct gfs2_bitmap *rd_bits;
        unsigned int rd_bh_count;
        struct mutex rd_mutex;
        uint32_t rd_free_clone;
        struct gfs2_log_element rd_le;
        uint32_t rd_last_alloc_data;
        uint32_t rd_last_alloc_meta;
        struct gfs2_sbd *rd_sbd;
};
98
/* Extra buffer_head state bits, taken from the private (fs-owned) range. */
enum gfs2_state_bits {
        BH_Pinned = BH_PrivateStart,
        BH_Escaped = BH_PrivateStart + 1,
};

/* Generate the set/clear/test (and test-and-set) helpers for the bits above. */
BUFFER_FNS(Pinned, pinned)
TAS_BUFFER_FNS(Pinned, pinned)
BUFFER_FNS(Escaped, escaped)
TAS_BUFFER_FNS(Escaped, escaped)
108
/* Per-buffer journaling bookkeeping: ties a buffer_head to its glock, the
   current transaction, and (once written) an AIL entry. */
struct gfs2_bufdata {
        struct buffer_head *bd_bh;
        struct gfs2_glock *bd_gl;

        struct list_head bd_list_tr;    /* membership in a transaction's buffer list */
        struct gfs2_log_element bd_le;

        struct gfs2_ail *bd_ail;
        struct list_head bd_ail_st_list;
        struct list_head bd_ail_gl_list;
};
120
/* Per-lock-type glock operations vector.  Hooks are optional: the tables in
   glops.c leave many of them unset (NULL). */
struct gfs2_glock_operations {
        void (*go_xmote_th) (struct gfs2_glock * gl, unsigned int state,
                             int flags);
        void (*go_xmote_bh) (struct gfs2_glock * gl);
        void (*go_drop_th) (struct gfs2_glock * gl);
        void (*go_drop_bh) (struct gfs2_glock * gl);
        void (*go_sync) (struct gfs2_glock * gl, int flags);
        void (*go_inval) (struct gfs2_glock * gl, int flags);
        int (*go_demote_ok) (struct gfs2_glock * gl);
        int (*go_lock) (struct gfs2_holder * gh);
        void (*go_unlock) (struct gfs2_holder * gh);
        void (*go_callback) (struct gfs2_glock * gl, unsigned int state);
        void (*go_greedy) (struct gfs2_glock * gl);
        int go_type;    /* one of the LM_TYPE_* lock classes */
};
136
/* HIF_* : bit numbers used in gfs2_holder.gh_iflags. */
enum {
        /* Actions */
        HIF_MUTEX = 0,
        HIF_PROMOTE = 1,
        HIF_DEMOTE = 2,
        HIF_GREEDY = 3,

        /* States */
        HIF_ALLOCED = 4,
        HIF_DEALLOC = 5,
        HIF_HOLDER = 6,
        HIF_FIRST = 7,
        HIF_RECURSE = 8,
        HIF_ABORTED = 9,
};

/* One holder (or pending request) of a glock. */
struct gfs2_holder {
        struct list_head gh_list;

        struct gfs2_glock *gh_gl;
        struct task_struct *gh_owner;
        unsigned int gh_state;  /* requested LM_ST_* state */
        int gh_flags;

        int gh_error;
        unsigned long gh_iflags;        /* HIF_... */
        struct completion gh_wait;
};
165
/* GLF_* : bit numbers used in gfs2_glock.gl_flags. */
enum {
        GLF_PLUG = 0,
        GLF_LOCK = 1,
        GLF_STICKY = 2,
        GLF_PREFETCH = 3,
        GLF_SYNC = 4,
        GLF_DIRTY = 5,
        GLF_SKIP_WAITERS2 = 6,
        GLF_GREEDY = 7,
};

/* In-core glock (cluster lock), reference counted via gl_ref. */
struct gfs2_glock {
        struct list_head gl_list;
        unsigned long gl_flags;         /* GLF_... */
        struct lm_lockname gl_name;
        struct kref gl_ref;

        spinlock_t gl_spin;

        unsigned int gl_state;
        struct list_head gl_holders;
        struct list_head gl_waiters1;   /* HIF_MUTEX */
        struct list_head gl_waiters2;   /* HIF_DEMOTE, HIF_GREEDY */
        struct list_head gl_waiters3;   /* HIF_PROMOTE */

        struct gfs2_glock_operations *gl_ops;

        struct gfs2_holder *gl_req_gh;
        gfs2_glop_bh_t gl_req_bh;       /* completion callback for the pending request */

        lm_lock_t *gl_lock;             /* opaque lock-module handle */
        char *gl_lvb;
        atomic_t gl_lvb_count;

        uint64_t gl_vn;                 /* version number bumped on invalidation */
        unsigned long gl_stamp;
        void *gl_object;                /* object protected by this glock */

        struct gfs2_gl_hash_bucket *gl_bucket;
        struct list_head gl_reclaim;

        struct gfs2_sbd *gl_sbd;

        struct inode *gl_aspace;
        struct gfs2_log_element gl_le;
        struct list_head gl_ail_list;
        atomic_t gl_ail_count;
};
214
/* Per-operation block allocation context, embedded in gfs2_inode.i_alloc. */
struct gfs2_alloc {
        /* Quota stuff */

        unsigned int al_qd_num;
        struct gfs2_quota_data *al_qd[4];
        struct gfs2_holder al_qd_ghs[4];

        /* Filled in by the caller to gfs2_inplace_reserve() */

        uint32_t al_requested;

        /* Filled in by gfs2_inplace_reserve() */

        char *al_file;          /* source file/line of the reservation, for debugging */
        unsigned int al_line;
        struct gfs2_holder al_ri_gh;
        struct gfs2_holder al_rgd_gh;
        struct gfs2_rgrpd *al_rgd;

        /* Filled in by gfs2_alloc_*() */

        uint32_t al_alloced;
};

/* GIF_* : bit numbers used in gfs2_inode.i_flags. */
enum {
        GIF_MIN_INIT = 0,
        GIF_QD_LOCKED = 1,
        GIF_PAGED = 2,
        GIF_SW_PAGED = 3,
};
245
/* In-core GFS2 inode; i_vnode points at the VFS inode when one exists. */
struct gfs2_inode {
        struct gfs2_inum i_num;

        atomic_t i_count;
        unsigned long i_flags;          /* GIF_... */

        uint64_t i_vn;                  /* version of i_di relative to i_gl->gl_vn */
        struct gfs2_dinode i_di;        /* cached copy of the on-disk dinode */

        struct gfs2_glock *i_gl;
        struct gfs2_sbd *i_sbd;
        struct inode *i_vnode;

        struct gfs2_holder i_iopen_gh;
        struct gfs2_holder i_gh;        /* for prepare/commit_write only */
        struct gfs2_alloc i_alloc;
        uint64_t i_last_rg_alloc;

        spinlock_t i_spin;
        struct rw_semaphore i_rw_mutex;

        unsigned int i_greedy;
        unsigned long i_last_pfault;

        struct buffer_head *i_cache[GFS2_MAX_META_HEIGHT];
};
272
/* GFF_* : bit numbers used in gfs2_file.f_flags. */
enum {
        GFF_DID_DIRECT_ALLOC = 0,
};

/* Per-open-file private data, attached to the VFS struct file. */
struct gfs2_file {
        unsigned long f_flags;          /* GFF_... */

        struct mutex f_fl_mutex;
        struct gfs2_holder f_fl_gh;

        struct gfs2_inode *f_inode;
        struct file *f_vfile;
};

/* A revoke record queued into the log. */
struct gfs2_revoke {
        struct gfs2_log_element rv_le;
        uint64_t rv_blkno;
};

/* A revoke seen during journal replay, and where it was found. */
struct gfs2_revoke_replay {
        struct list_head rr_list;
        uint64_t rr_blkno;
        unsigned int rr_where;
};
297
/* ULF_* : bit numbers used in gfs2_unlinked.ul_flags. */
enum {
        ULF_LOCKED = 0,
};

/* An entry in the unlinked-inode tracking file. */
struct gfs2_unlinked {
        struct list_head ul_list;
        unsigned int ul_count;
        struct gfs2_unlinked_tag ul_ut;
        unsigned long ul_flags;         /* ULF_... */
        unsigned int ul_slot;
};
309
/* QDF_* : bit numbers used in gfs2_quota_data.qd_flags. */
enum {
        QDF_USER = 0,
        QDF_CHANGE = 1,
        QDF_LOCKED = 2,
};

/* In-core quota record for one user or group id. */
struct gfs2_quota_data {
        struct list_head qd_list;
        unsigned int qd_count;

        uint32_t qd_id;
        unsigned long qd_flags;         /* QDF_... */

        int64_t qd_change;
        int64_t qd_change_sync;

        unsigned int qd_slot;
        unsigned int qd_slot_count;

        struct buffer_head *qd_bh;
        struct gfs2_quota_change *qd_bh_qc;
        unsigned int qd_bh_count;

        struct gfs2_glock *qd_gl;
        struct gfs2_quota_lvb qd_qb;

        uint64_t qd_sync_gen;
        unsigned long qd_last_warn;
        unsigned long qd_last_touched;
};
340
/* A buffer belonging to the log; lb_real is set when lb_bh shadows a real block. */
struct gfs2_log_buf {
        struct list_head lb_list;
        struct buffer_head *lb_bh;
        struct buffer_head *lb_real;
};

/* An in-core transaction. */
struct gfs2_trans {
        char *tr_file;                  /* source file/line that began the transaction */
        unsigned int tr_line;

        unsigned int tr_blocks;
        unsigned int tr_revokes;
        unsigned int tr_reserved;

        struct gfs2_holder *tr_t_gh;

        int tr_touched;

        unsigned int tr_num_buf;
        unsigned int tr_num_buf_new;
        unsigned int tr_num_buf_rm;
        struct list_head tr_list_buf;

        unsigned int tr_num_revoke;
        unsigned int tr_num_revoke_rm;
};

/* One generation of the Active Items List (buffers written but not yet on disk in place). */
struct gfs2_ail {
        struct list_head ai_list;

        unsigned int ai_first;
        struct list_head ai_ail1_list;
        struct list_head ai_ail2_list;

        uint64_t ai_sync_gen;
};

/* Journal descriptor: one per journal in the filesystem. */
struct gfs2_jdesc {
        struct list_head jd_list;

        struct inode *jd_inode;
        unsigned int jd_jid;
        int jd_dirty;

        unsigned int jd_blocks;
};
387
/* glockd daemon thread count limits (see gfs2_args.ar_num_glockd). */
#define GFS2_GLOCKD_DEFAULT 1
#define GFS2_GLOCKD_MAX 16

/* Quota enforcement modes (see gfs2_args.ar_quota). */
#define GFS2_QUOTA_DEFAULT GFS2_QUOTA_OFF
#define GFS2_QUOTA_OFF 0
#define GFS2_QUOTA_ACCOUNT 1
#define GFS2_QUOTA_ON 2

/* Data journaling modes (see gfs2_args.ar_data). */
#define GFS2_DATA_DEFAULT GFS2_DATA_ORDERED
#define GFS2_DATA_WRITEBACK 1
#define GFS2_DATA_ORDERED 2
399
/* Parsed mount arguments. */
struct gfs2_args {
        char ar_lockproto[GFS2_LOCKNAME_LEN];   /* Name of the Lock Protocol */
        char ar_locktable[GFS2_LOCKNAME_LEN];   /* Name of the Lock Table */
        char ar_hostdata[GFS2_LOCKNAME_LEN];    /* Host specific data */
        int ar_spectator;       /* Don't get a journal because we're always RO */
        int ar_ignore_local_fs; /* Don't optimize even if local_fs is 1 */
        int ar_localflocks;     /* Let the VFS do flock|fcntl locks for us */
        int ar_localcaching;    /* Local-style caching (dangerous on multihost) */
        int ar_debug;           /* Oops on errors instead of trying to be graceful */
        int ar_upgrade;         /* Upgrade ondisk/multihost format */
        unsigned int ar_num_glockd;     /* Number of glockd threads */
        int ar_posix_acl;       /* Enable posix acls */
        int ar_quota;           /* off/account/on */
        int ar_suiddir;         /* suiddir support */
        int ar_data;            /* ordered/writeback */
};
416
/* Runtime-tunable parameters.  NOTE(review): gt_spin presumably guards
   concurrent updates to these fields — confirm against the accessors. */
struct gfs2_tune {
        spinlock_t gt_spin;

        unsigned int gt_ilimit;
        unsigned int gt_ilimit_tries;
        unsigned int gt_ilimit_min;
        unsigned int gt_demote_secs;    /* Cache retention for unheld glock */
        unsigned int gt_incore_log_blocks;
        unsigned int gt_log_flush_secs;
        unsigned int gt_jindex_refresh_secs;    /* Check for new journal index */

        unsigned int gt_scand_secs;
        unsigned int gt_recoverd_secs;
        unsigned int gt_logd_secs;
        unsigned int gt_quotad_secs;
        unsigned int gt_inoded_secs;

        unsigned int gt_quota_simul_sync;       /* Max quotavals to sync at once */
        unsigned int gt_quota_warn_period;      /* Secs between quota warn msgs */
        unsigned int gt_quota_scale_num;        /* Numerator */
        unsigned int gt_quota_scale_den;        /* Denominator */
        unsigned int gt_quota_cache_secs;
        unsigned int gt_quota_quantum;  /* Secs between syncs to quota file */
        unsigned int gt_atime_quantum;  /* Min secs between atime updates */
        unsigned int gt_new_files_jdata;
        unsigned int gt_new_files_directio;
        unsigned int gt_max_atomic_write;       /* Split big writes into this size */
        unsigned int gt_max_readahead;  /* Max bytes to read-ahead from disk */
        unsigned int gt_lockdump_size;
        unsigned int gt_stall_secs;     /* Detects trouble! */
        unsigned int gt_complain_secs;
        unsigned int gt_reclaim_limit;  /* Max num of glocks in reclaim list */
        unsigned int gt_entries_per_readdir;
        unsigned int gt_prefetch_secs;  /* Usage window for prefetched glocks */
        unsigned int gt_greedy_default;
        unsigned int gt_greedy_quantum;
        unsigned int gt_greedy_max;
        unsigned int gt_statfs_quantum;
        unsigned int gt_statfs_slow;
};
457
/* One bucket of the glock hash table (sd_gl_hash). */
struct gfs2_gl_hash_bucket {
        rwlock_t hb_lock;
        struct list_head hb_list;
};

/* SDF_* : bit numbers used in gfs2_sbd.sd_flags. */
enum {
        SDF_JOURNAL_CHECKED = 0,
        SDF_JOURNAL_LIVE = 1,
        SDF_SHUTDOWN = 2,
        SDF_NOATIME = 3,
};

/* Glock hash table geometry and filesystem name length. */
#define GFS2_GL_HASH_SHIFT 13
#define GFS2_GL_HASH_SIZE (1 << GFS2_GL_HASH_SHIFT)
#define GFS2_GL_HASH_MASK (GFS2_GL_HASH_SIZE - 1)
#define GFS2_FSNAME_LEN 256
474
/* The per-mount superblock structure: everything GFS2 knows about one
   mounted filesystem instance. */
struct gfs2_sbd {
        struct super_block *sd_vfs;
        struct kobject sd_kobj;
        unsigned long sd_flags;         /* SDF_... */
        struct gfs2_sb sd_sb;

        /* Constants computed on mount */

        uint32_t sd_fsb2bb;
        uint32_t sd_fsb2bb_shift;
        uint32_t sd_diptrs;     /* Number of pointers in a dinode */
        uint32_t sd_inptrs;     /* Number of pointers in a indirect block */
        uint32_t sd_jbsize;     /* Size of a journaled data block */
        uint32_t sd_hash_bsize; /* sizeof(exhash block) */
        uint32_t sd_hash_bsize_shift;
        uint32_t sd_hash_ptrs;  /* Number of pointers in a hash block */
        uint32_t sd_ut_per_block;
        uint32_t sd_qc_per_block;
        uint32_t sd_max_dirres; /* Max blocks needed to add a directory entry */
        uint32_t sd_max_height; /* Max height of a file's metadata tree */
        uint64_t sd_heightsize[GFS2_MAX_META_HEIGHT];
        uint32_t sd_max_jheight;        /* Max height of journaled file's meta tree */
        uint64_t sd_jheightsize[GFS2_MAX_META_HEIGHT];

        struct gfs2_args sd_args;       /* Mount arguments */
        struct gfs2_tune sd_tune;       /* Filesystem tuning structure */

        /* Lock Stuff */

        struct lm_lockstruct sd_lockstruct;
        struct gfs2_gl_hash_bucket sd_gl_hash[GFS2_GL_HASH_SIZE];
        struct list_head sd_reclaim_list;
        spinlock_t sd_reclaim_lock;
        wait_queue_head_t sd_reclaim_wq;
        atomic_t sd_reclaim_count;
        struct gfs2_holder sd_live_gh;
        struct gfs2_glock *sd_rename_gl;
        struct gfs2_glock *sd_trans_gl;
        struct mutex sd_invalidate_inodes_mutex;

        /* Inode Stuff */

        struct inode *sd_master_dir;
        struct inode *sd_jindex;
        struct inode *sd_inum_inode;
        struct inode *sd_statfs_inode;
        struct inode *sd_ir_inode;
        struct inode *sd_sc_inode;
        struct inode *sd_ut_inode;
        struct inode *sd_qc_inode;
        struct inode *sd_rindex;
        struct inode *sd_quota_inode;
        struct inode *sd_root_dir;

        /* Inum stuff */

        struct mutex sd_inum_mutex;

        /* StatFS stuff */

        spinlock_t sd_statfs_spin;
        struct mutex sd_statfs_mutex;
        struct gfs2_statfs_change sd_statfs_master;
        struct gfs2_statfs_change sd_statfs_local;
        unsigned long sd_statfs_sync_time;

        /* Resource group stuff */

        uint64_t sd_rindex_vn;
        spinlock_t sd_rindex_spin;
        struct mutex sd_rindex_mutex;
        struct list_head sd_rindex_list;
        struct list_head sd_rindex_mru_list;
        struct list_head sd_rindex_recent_list;
        struct gfs2_rgrpd *sd_rindex_forward;
        unsigned int sd_rgrps;

        /* Journal index stuff */

        struct list_head sd_jindex_list;
        spinlock_t sd_jindex_spin;
        struct mutex sd_jindex_mutex;
        unsigned int sd_journals;
        unsigned long sd_jindex_refresh_time;

        struct gfs2_jdesc *sd_jdesc;
        struct gfs2_holder sd_journal_gh;
        struct gfs2_holder sd_jinode_gh;

        struct gfs2_holder sd_ir_gh;
        struct gfs2_holder sd_sc_gh;
        struct gfs2_holder sd_ut_gh;
        struct gfs2_holder sd_qc_gh;

        /* Daemon stuff */

        struct task_struct *sd_scand_process;
        struct task_struct *sd_recoverd_process;
        struct task_struct *sd_logd_process;
        struct task_struct *sd_quotad_process;
        struct task_struct *sd_inoded_process;
        struct task_struct *sd_glockd_process[GFS2_GLOCKD_MAX];
        unsigned int sd_glockd_num;

        /* Unlinked inode stuff */

        struct list_head sd_unlinked_list;
        atomic_t sd_unlinked_count;
        spinlock_t sd_unlinked_spin;
        struct mutex sd_unlinked_mutex;

        unsigned int sd_unlinked_slots;
        unsigned int sd_unlinked_chunks;
        unsigned char **sd_unlinked_bitmap;

        /* Quota stuff */

        struct list_head sd_quota_list;
        atomic_t sd_quota_count;
        spinlock_t sd_quota_spin;
        struct mutex sd_quota_mutex;

        unsigned int sd_quota_slots;
        unsigned int sd_quota_chunks;
        unsigned char **sd_quota_bitmap;

        uint64_t sd_quota_sync_gen;
        unsigned long sd_quota_sync_time;

        /* Log stuff */

        spinlock_t sd_log_lock;
        atomic_t sd_log_trans_count;
        wait_queue_head_t sd_log_trans_wq;
        atomic_t sd_log_flush_count;
        wait_queue_head_t sd_log_flush_wq;

        unsigned int sd_log_blks_reserved;
        unsigned int sd_log_commited_buf;
        unsigned int sd_log_commited_revoke;

        unsigned int sd_log_num_gl;
        unsigned int sd_log_num_buf;
        unsigned int sd_log_num_revoke;
        unsigned int sd_log_num_rg;
        unsigned int sd_log_num_databuf;
        unsigned int sd_log_num_jdata;

        struct list_head sd_log_le_gl;
        struct list_head sd_log_le_buf;
        struct list_head sd_log_le_revoke;
        struct list_head sd_log_le_rg;
        struct list_head sd_log_le_databuf;

        unsigned int sd_log_blks_free;
        struct list_head sd_log_blks_list;
        wait_queue_head_t sd_log_blks_wait;

        uint64_t sd_log_sequence;
        unsigned int sd_log_head;
        unsigned int sd_log_tail;
        uint64_t sd_log_wraps;
        int sd_log_idle;

        unsigned long sd_log_flush_time;
        struct mutex sd_log_flush_lock;
        struct list_head sd_log_flush_list;

        unsigned int sd_log_flush_head;
        uint64_t sd_log_flush_wrapped;

        struct list_head sd_ail1_list;
        struct list_head sd_ail2_list;
        uint64_t sd_ail_sync_gen;

        /* Replay stuff */

        struct list_head sd_revoke_list;
        unsigned int sd_replay_tail;

        unsigned int sd_found_blocks;
        unsigned int sd_found_revokes;
        unsigned int sd_replayed_blocks;

        /* For quiescing the filesystem */

        struct gfs2_holder sd_freeze_gh;
        struct mutex sd_freeze_lock;
        unsigned int sd_freeze_count;

        /* Counters */

        atomic_t sd_glock_count;
        atomic_t sd_glock_held_count;
        atomic_t sd_inode_count;
        atomic_t sd_bufdata_count;

        atomic_t sd_fh2dentry_misses;
        atomic_t sd_reclaimed;
        atomic_t sd_log_flush_incore;
        atomic_t sd_log_flush_ondisk;

        atomic_t sd_glock_nq_calls;
        atomic_t sd_glock_dq_calls;
        atomic_t sd_glock_prefetch_calls;
        atomic_t sd_lm_lock_calls;
        atomic_t sd_lm_unlock_calls;
        atomic_t sd_lm_callbacks;

        atomic_t sd_ops_address;
        atomic_t sd_ops_dentry;
        atomic_t sd_ops_export;
        atomic_t sd_ops_file;
        atomic_t sd_ops_inode;
        atomic_t sd_ops_super;
        atomic_t sd_ops_vm;

        char sd_fsname[GFS2_FSNAME_LEN];
        char sd_table_name[GFS2_FSNAME_LEN];
        char sd_proto_name[GFS2_FSNAME_LEN];

        /* Debugging crud */

        unsigned long sd_last_warning;
};
700
701#endif /* __INCORE_DOT_H__ */
702
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
new file mode 100644
index 000000000000..30ca82a1addf
--- /dev/null
+++ b/fs/gfs2/inode.c
@@ -0,0 +1,1835 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/posix_acl.h>
16#include <linux/sort.h>
17#include <asm/semaphore.h>
18
19#include "gfs2.h"
20#include "acl.h"
21#include "bmap.h"
22#include "dir.h"
23#include "eattr.h"
24#include "glock.h"
25#include "glops.h"
26#include "inode.h"
27#include "log.h"
28#include "meta_io.h"
29#include "ops_address.h"
30#include "ops_file.h"
31#include "ops_inode.h"
32#include "quota.h"
33#include "rgrp.h"
34#include "trans.h"
35#include "unlinked.h"
36
37/**
38 * inode_attr_in - Copy attributes from the dinode into the VFS inode
39 * @ip: The GFS2 inode (with embedded disk inode data)
40 * @inode: The Linux VFS inode
41 *
42 */
43
44static void inode_attr_in(struct gfs2_inode *ip, struct inode *inode)
45{
46 inode->i_ino = ip->i_num.no_formal_ino;
47
48 switch (ip->i_di.di_mode & S_IFMT) {
49 case S_IFBLK:
50 case S_IFCHR:
51 inode->i_rdev = MKDEV(ip->i_di.di_major, ip->i_di.di_minor);
52 break;
53 default:
54 inode->i_rdev = 0;
55 break;
56 };
57
58 inode->i_mode = ip->i_di.di_mode;
59 inode->i_nlink = ip->i_di.di_nlink;
60 inode->i_uid = ip->i_di.di_uid;
61 inode->i_gid = ip->i_di.di_gid;
62 i_size_write(inode, ip->i_di.di_size);
63 inode->i_atime.tv_sec = ip->i_di.di_atime;
64 inode->i_mtime.tv_sec = ip->i_di.di_mtime;
65 inode->i_ctime.tv_sec = ip->i_di.di_ctime;
66 inode->i_atime.tv_nsec = 0;
67 inode->i_mtime.tv_nsec = 0;
68 inode->i_ctime.tv_nsec = 0;
69 inode->i_blksize = PAGE_SIZE;
70 inode->i_blocks = ip->i_di.di_blocks <<
71 (ip->i_sbd->sd_sb.sb_bsize_shift - GFS2_BASIC_BLOCK_SHIFT);
72
73 if (ip->i_di.di_flags & GFS2_DIF_IMMUTABLE)
74 inode->i_flags |= S_IMMUTABLE;
75 else
76 inode->i_flags &= ~S_IMMUTABLE;
77
78 if (ip->i_di.di_flags & GFS2_DIF_APPENDONLY)
79 inode->i_flags |= S_APPEND;
80 else
81 inode->i_flags &= ~S_APPEND;
82}
83
/**
 * gfs2_inode_attr_in - Copy attributes from the dinode into the VFS inode
 * @ip: The GFS2 inode (with embedded disk inode data)
 *
 * If a VFS inode currently exists for @ip, refresh its cached attributes
 * and drop the reference taken by the lookup.  A missing VFS inode is
 * not an error; there is simply nothing to update.
 */

void gfs2_inode_attr_in(struct gfs2_inode *ip)
{
        struct inode *inode = gfs2_ip2v_lookup(ip);

        if (!inode)
                return;

        inode_attr_in(ip, inode);
        iput(inode);
}
100
/**
 * gfs2_inode_attr_out - Copy attributes from VFS inode into the dinode
 * @ip: The GFS2 inode
 *
 * Only copy out the attributes that we want the VFS layer
 * to be able to modify.
 */

void gfs2_inode_attr_out(struct gfs2_inode *ip)
{
        struct inode *inode = ip->i_vnode;

        /* The file-type bits must never change behind our back. */
        gfs2_assert_withdraw(ip->i_sbd,
                (ip->i_di.di_mode & S_IFMT) == (inode->i_mode & S_IFMT));
        ip->i_di.di_mode = inode->i_mode;
        ip->i_di.di_uid = inode->i_uid;
        ip->i_di.di_gid = inode->i_gid;
        ip->i_di.di_atime = inode->i_atime.tv_sec;
        ip->i_di.di_mtime = inode->i_mtime.tv_sec;
        ip->i_di.di_ctime = inode->i_ctime.tv_sec;
}
122
123/**
124 * gfs2_ip2v_lookup - Get the struct inode for a struct gfs2_inode
125 * @ip: the struct gfs2_inode to get the struct inode for
126 *
127 * Returns: A VFS inode, or NULL if none
128 */
129
130struct inode *gfs2_ip2v_lookup(struct gfs2_inode *ip)
131{
132 struct inode *inode = NULL;
133
134 gfs2_assert_warn(ip->i_sbd, test_bit(GIF_MIN_INIT, &ip->i_flags));
135
136 spin_lock(&ip->i_spin);
137 if (ip->i_vnode)
138 inode = igrab(ip->i_vnode);
139 spin_unlock(&ip->i_spin);
140
141 return inode;
142}
143
/**
 * gfs2_ip2v - Get/Create a struct inode for a struct gfs2_inode
 * @ip: the struct gfs2_inode to get the struct inode for
 *
 * Returns: A VFS inode, or NULL if no mem
 */

struct inode *gfs2_ip2v(struct gfs2_inode *ip)
{
        struct inode *inode, *tmp;

        inode = gfs2_ip2v_lookup(ip);
        if (inode)
                return inode;

        /* No VFS inode yet: build a candidate before taking any locks. */
        tmp = new_inode(ip->i_sbd->sd_vfs);
        if (!tmp)
                return NULL;

        inode_attr_in(ip, tmp);

        /* Wire up the per-type operation vectors. */
        if (S_ISREG(ip->i_di.di_mode)) {
                tmp->i_op = &gfs2_file_iops;
                tmp->i_fop = &gfs2_file_fops;
                tmp->i_mapping->a_ops = &gfs2_file_aops;
        } else if (S_ISDIR(ip->i_di.di_mode)) {
                tmp->i_op = &gfs2_dir_iops;
                tmp->i_fop = &gfs2_dir_fops;
        } else if (S_ISLNK(ip->i_di.di_mode)) {
                tmp->i_op = &gfs2_symlink_iops;
        } else {
                tmp->i_op = &gfs2_dev_iops;
                init_special_inode(tmp, tmp->i_mode, tmp->i_rdev);
        }

        set_v2ip(tmp, NULL);

        /*
         * Race with concurrent creators.  NOTE: the "break" below exits
         * the loop with ip->i_spin HELD; it is released only after the
         * new inode has been published via ip->i_vnode.
         */
        for (;;) {
                spin_lock(&ip->i_spin);
                if (!ip->i_vnode)
                        break;          /* we win; still holding i_spin */
                inode = igrab(ip->i_vnode);
                spin_unlock(&ip->i_spin);

                if (inode) {
                        /* Someone else's inode is usable; drop ours. */
                        iput(tmp);
                        return inode;
                }
                /* igrab() failed: the other inode is going away; retry. */
                yield();
        }

        inode = tmp;

        /* Publish while holding i_spin (taken inside the loop above). */
        gfs2_inode_hold(ip);
        ip->i_vnode = inode;
        set_v2ip(inode, ip);

        spin_unlock(&ip->i_spin);

        insert_inode_hash(inode);

        return inode;
}
207
208static int iget_test(struct inode *inode, void *opaque)
209{
210 struct gfs2_inode *ip = get_v2ip(inode);
211 struct gfs2_inum *inum = (struct gfs2_inum *)opaque;
212
213 if (ip && ip->i_num.no_addr == inum->no_addr)
214 return 1;
215
216 return 0;
217}
218
219struct inode *gfs2_iget(struct super_block *sb, struct gfs2_inum *inum)
220{
221 return ilookup5(sb, (unsigned long)inum->no_formal_ino,
222 iget_test, inum);
223}
224
/**
 * gfs2_inode_min_init - minimally initialize the in-core dinode fields
 * @ip: the in-core inode
 * @type: directory-entry type used to derive the initial mode
 *
 * NOTE(review): DT2IF presumably maps a DT_* value to S_IF* mode bits —
 * confirm against its definition.
 */

void gfs2_inode_min_init(struct gfs2_inode *ip, unsigned int type)
{
        spin_lock(&ip->i_spin);
        /* First caller wins: seed the defaults only once per in-core inode. */
        if (!test_and_set_bit(GIF_MIN_INIT, &ip->i_flags)) {
                ip->i_di.di_nlink = 1;
                ip->i_di.di_mode = DT2IF(type);
        }
        spin_unlock(&ip->i_spin);
}
234
/**
 * gfs2_inode_refresh - Refresh the incore copy of the dinode
 * @ip: The GFS2 inode
 *
 * Re-reads the dinode block and refreshes ip->i_di, then marks the copy
 * current by recording the glock's version number in ip->i_vn.
 *
 * Returns: errno (-EIO on corruption, -ESTALE if the block was reused
 * for a different inode)
 */

int gfs2_inode_refresh(struct gfs2_inode *ip)
{
        struct buffer_head *dibh;
        int error;

        error = gfs2_meta_inode_buffer(ip, &dibh);
        if (error)
                return error;

        /* Block on disk is not a dinode at all: corruption. */
        if (gfs2_metatype_check(ip->i_sbd, dibh, GFS2_METATYPE_DI)) {
                brelse(dibh);
                return -EIO;
        }

        /* Copy in under i_spin so readers never see a half-updated i_di. */
        spin_lock(&ip->i_spin);
        gfs2_dinode_in(&ip->i_di, dibh->b_data);
        set_bit(GIF_MIN_INIT, &ip->i_flags);
        spin_unlock(&ip->i_spin);

        brelse(dibh);

        /* Self-address mismatch means on-disk corruption... */
        if (ip->i_num.no_addr != ip->i_di.di_num.no_addr) {
                if (gfs2_consist_inode(ip))
                        gfs2_dinode_print(&ip->i_di);
                return -EIO;
        }
        /* ...while a formal-ino mismatch just means the block was reused. */
        if (ip->i_num.no_formal_ino != ip->i_di.di_num.no_formal_ino)
                return -ESTALE;

        ip->i_vn = ip->i_gl->gl_vn;

        return 0;
}
275
/**
 * inode_create - create a struct gfs2_inode
 * @i_gl: The glock covering the inode
 * @inum: The inode number
 * @io_gl: the iopen glock to acquire/hold (using holder in new gfs2_inode)
 * @io_state: the state the iopen glock should be acquired in
 * @ipp: pointer to put the returned inode in
 *
 * Returns: errno
 */

static int inode_create(struct gfs2_glock *i_gl, struct gfs2_inum *inum,
                        struct gfs2_glock *io_gl, unsigned int io_state,
                        struct gfs2_inode **ipp)
{
        struct gfs2_sbd *sdp = i_gl->gl_sbd;
        struct gfs2_inode *ip;
        int error = 0;

        ip = kmem_cache_alloc(gfs2_inode_cachep, GFP_KERNEL);
        if (!ip)
                return -ENOMEM;
        memset(ip, 0, sizeof(struct gfs2_inode));

        ip->i_num = *inum;

        atomic_set(&ip->i_count, 1);

        /* One behind the glock's version so the dinode data reads as stale
           until gfs2_inode_refresh() is called. */
        ip->i_vn = i_gl->gl_vn - 1;

        ip->i_gl = i_gl;
        ip->i_sbd = sdp;

        spin_lock_init(&ip->i_spin);
        init_rwsem(&ip->i_rw_mutex);

        ip->i_greedy = gfs2_tune_get(sdp, gt_greedy_default);

        /* Hold the iopen glock for the whole life of the in-core inode. */
        error = gfs2_glock_nq_init(io_gl,
                                   io_state, GL_LOCAL_EXCL | GL_EXACT,
                                   &ip->i_iopen_gh);
        if (error)
                goto fail;
        /* The holder outlives the acquiring task, so it has no owner. */
        ip->i_iopen_gh.gh_owner = NULL;

        /* Two extra refs on i_gl: one for the iopen glock's back-pointer
           (gl2gl) and one for the inode glock's object pointer (gl2ip);
           both are dropped in gfs2_inode_destroy(). */
        spin_lock(&io_gl->gl_spin);
        gfs2_glock_hold(i_gl);
        set_gl2gl(io_gl, i_gl);
        spin_unlock(&io_gl->gl_spin);

        gfs2_glock_hold(i_gl);
        set_gl2ip(i_gl, ip);

        atomic_inc(&sdp->sd_inode_count);

        *ipp = ip;

        return 0;

 fail:
        /* ip is still freshly zeroed here, so the cache flush is a no-op
           safety net before the slab object is returned. */
        gfs2_meta_cache_flush(ip);
        kmem_cache_free(gfs2_inode_cachep, ip);
        *ipp = NULL;

        return error;
}
342
/**
 * gfs2_inode_get - Create or get a reference on an inode
 * @i_gl: The glock covering the inode
 * @inum: The inode number
 * @create: if false, only return an already-attached inode
 * @ipp: pointer to put the returned inode in
 *
 * The glock mutex serializes against concurrent attach/detach of the
 * in-core inode on @i_gl.
 *
 * Returns: errno (-ESTALE if an attached inode has a different formal
 * inode number, i.e. the disk block was reused)
 */

int gfs2_inode_get(struct gfs2_glock *i_gl, struct gfs2_inum *inum, int create,
                   struct gfs2_inode **ipp)
{
        struct gfs2_sbd *sdp = i_gl->gl_sbd;
        struct gfs2_glock *io_gl;
        int error = 0;

        gfs2_glmutex_lock(i_gl);

        /* Fast path: an inode is already attached to this glock. */
        *ipp = get_gl2ip(i_gl);
        if (*ipp) {
                error = -ESTALE;
                if ((*ipp)->i_num.no_formal_ino != inum->no_formal_ino)
                        goto out;
                atomic_inc(&(*ipp)->i_count);
                error = 0;
                goto out;
        }

        if (!create)
                goto out;

        /* inode_create() takes its own references on io_gl as needed, so
           this temporary ref can be dropped immediately afterwards. */
        error = gfs2_glock_get(sdp, inum->no_addr, &gfs2_iopen_glops,
                               CREATE, &io_gl);
        if (!error) {
                error = inode_create(i_gl, inum, io_gl, LM_ST_SHARED, ipp);
                gfs2_glock_put(io_gl);
        }

 out:
        gfs2_glmutex_unlock(i_gl);

        return error;
}
387
/* Take an additional reference on @ip; the count must already be positive. */
void gfs2_inode_hold(struct gfs2_inode *ip)
{
        gfs2_assert(ip->i_sbd, atomic_read(&ip->i_count) > 0);
        atomic_inc(&ip->i_count);
}
393
/* Drop a reference on @ip.  Note this does NOT free the inode when the
   count reaches zero — gfs2_inode_destroy() does that separately. */
void gfs2_inode_put(struct gfs2_inode *ip)
{
        gfs2_assert(ip->i_sbd, atomic_read(&ip->i_count) > 0);
        atomic_dec(&ip->i_count);
}
399
/* Tear down an in-core inode whose reference count has reached zero:
   undo everything inode_create() set up (iopen back-pointer, the two
   extra glock refs, the glock's object pointer) and free the slab object. */
void gfs2_inode_destroy(struct gfs2_inode *ip)
{
        struct gfs2_sbd *sdp = ip->i_sbd;
        struct gfs2_glock *io_gl = ip->i_iopen_gh.gh_gl;
        struct gfs2_glock *i_gl = ip->i_gl;

        gfs2_assert_warn(sdp, !atomic_read(&ip->i_count));
        gfs2_assert(sdp, get_gl2gl(io_gl) == i_gl);

        /* Clear the iopen glock's back-pointer and drop its ref on i_gl. */
        spin_lock(&io_gl->gl_spin);
        set_gl2gl(io_gl, NULL);
        gfs2_glock_put(i_gl);
        spin_unlock(&io_gl->gl_spin);

        gfs2_glock_dq_uninit(&ip->i_iopen_gh);

        gfs2_meta_cache_flush(ip);
        kmem_cache_free(gfs2_inode_cachep, ip);

        /* Detach from the inode glock and drop its object ref last. */
        set_gl2ip(i_gl, NULL);
        gfs2_glock_put(i_gl);

        atomic_dec(&sdp->sd_inode_count);
}
424
/* Free the dinode block itself and remove the unlinked-list record, in
   one transaction.  All data/metadata blocks must already have been
   freed: di_blocks must be exactly 1 (just the dinode).  Resources are
   released via the goto cleanup chain in reverse acquisition order. */
static int dinode_dealloc(struct gfs2_inode *ip, struct gfs2_unlinked *ul)
{
        struct gfs2_sbd *sdp = ip->i_sbd;
        struct gfs2_alloc *al;
        struct gfs2_rgrpd *rgd;
        int error;

        /* Anything other than the dinode block itself still allocated
           means the earlier dealloc passes failed: corruption. */
        if (ip->i_di.di_blocks != 1) {
                if (gfs2_consist_inode(ip))
                        gfs2_dinode_print(&ip->i_di);
                return -EIO;
        }

        al = gfs2_alloc_get(ip);

        error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
        if (error)
                goto out;

        error = gfs2_rindex_hold(sdp, &al->al_ri_gh);
        if (error)
                goto out_qs;

        /* Find the resource group containing the dinode block. */
        rgd = gfs2_blk2rgrpd(sdp, ip->i_num.no_addr);
        if (!rgd) {
                gfs2_consist_inode(ip);
                error = -EIO;
                goto out_rindex_relse;
        }

        error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0,
                                   &al->al_rgd_gh);
        if (error)
                goto out_rindex_relse;

        error = gfs2_trans_begin(sdp, RES_RG_BIT + RES_UNLINKED +
                                 RES_STATFS + RES_QUOTA, 1);
        if (error)
                goto out_rg_gunlock;

        gfs2_trans_add_gl(ip->i_gl);

        gfs2_free_di(rgd, ip);

        error = gfs2_unlinked_ondisk_rm(sdp, ul);

        gfs2_trans_end(sdp);
        /* The glock no longer needs to be kept in core. */
        clear_bit(GLF_STICKY, &ip->i_gl->gl_flags);

 out_rg_gunlock:
        gfs2_glock_dq_uninit(&al->al_rgd_gh);

 out_rindex_relse:
        gfs2_glock_dq_uninit(&al->al_ri_gh);

 out_qs:
        gfs2_quota_unhold(ip);

 out:
        gfs2_alloc_put(ip);

        return error;
}
488
/**
 * inode_dealloc - Deallocate all on-disk blocks for an inode (dinode)
 * @sdp: the filesystem
 * @ul: the unlinked-inode record naming the inode to deallocate
 * @io_gh: a holder for the iopen glock for this inode
 *
 * Returns: errno; 1 means the iopen glock try-lock failed (inode busy)
 */

static int inode_dealloc(struct gfs2_sbd *sdp, struct gfs2_unlinked *ul,
			 struct gfs2_holder *io_gh)
{
	struct gfs2_inode *ip;
	struct gfs2_holder i_gh;
	int error;

	error = gfs2_glock_nq_num(sdp,
				  ul->ul_ut.ut_inum.no_addr, &gfs2_inode_glops,
				  LM_ST_EXCLUSIVE, 0, &i_gh);
	if (error)
		return error;

	/* We reacquire the iopen lock here to avoid a race with the NFS server
	   calling gfs2_read_inode() with the inode number of a inode we're in
	   the process of deallocating.  And we can't keep our hold on the lock
	   from inode_dealloc_init() for deadlock reasons. */

	gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY, io_gh);
	error = gfs2_glock_nq(io_gh);
	switch (error) {
	case 0:
		break;
	case GLR_TRYFAILED:
		error = 1;	/* busy: someone still holds the iopen lock */
		/* fall through */
	default:
		goto out;
	}

	gfs2_assert_warn(sdp, !get_gl2ip(i_gh.gh_gl));
	error = inode_create(i_gh.gh_gl, &ul->ul_ut.ut_inum, io_gh->gh_gl,
			     LM_ST_EXCLUSIVE, &ip);

	gfs2_glock_dq(io_gh);

	if (error)
		goto out;

	error = gfs2_inode_refresh(ip);
	if (error)
		goto out_iput;

	/* A deallocation candidate must have no remaining links. */
	if (ip->i_di.di_nlink) {
		if (gfs2_consist_inode(ip))
			gfs2_dinode_print(&ip->i_di);
		error = -EIO;
		goto out_iput;
	}

	/* Free the inode's metadata/data in stages: exhash directory
	   leaves, extended attributes, then regular data blocks. */
	if (S_ISDIR(ip->i_di.di_mode) &&
	    (ip->i_di.di_flags & GFS2_DIF_EXHASH)) {
		error = gfs2_dir_exhash_dealloc(ip);
		if (error)
			goto out_iput;
	}

	if (ip->i_di.di_eattr) {
		error = gfs2_ea_dealloc(ip);
		if (error)
			goto out_iput;
	}

	if (!gfs2_is_stuffed(ip)) {
		error = gfs2_file_dealloc(ip);
		if (error)
			goto out_iput;
	}

	error = dinode_dealloc(ip, ul);
	if (error)
		goto out_iput;

	/* Note: reached on success as well — always tear down the
	   in-core inode under the glock mutex. */
 out_iput:
	gfs2_glmutex_lock(i_gh.gh_gl);
	gfs2_inode_put(ip);
	gfs2_inode_destroy(ip);
	gfs2_glmutex_unlock(i_gh.gh_gl);

 out:
	gfs2_glock_dq_uninit(&i_gh);

	return error;
}
581
/**
 * try_inode_dealloc - Try to deallocate an inode and all its blocks
 * @sdp: the filesystem
 * @ul: the unlinked-inode record naming the inode to deallocate
 *
 * Returns: 0 on success, -errno on error, 1 on busy (inode open)
 */

static int try_inode_dealloc(struct gfs2_sbd *sdp, struct gfs2_unlinked *ul)
{
	struct gfs2_holder io_gh;
	int error = 0;

	/* Evict any in-core copy of the inode first. */
	gfs2_try_toss_inode(sdp, &ul->ul_ut.ut_inum);

	/* Try-lock the iopen glock; if someone still holds it the inode
	   is in use, so report "busy" rather than blocking. */
	error = gfs2_glock_nq_num(sdp,
				  ul->ul_ut.ut_inum.no_addr, &gfs2_iopen_glops,
				  LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB, &io_gh);
	switch (error) {
	case 0:
		break;
	case GLR_TRYFAILED:
		return 1;
	default:
		return error;
	}

	/* Drop the lock but keep the holder; inode_dealloc() reacquires
	   it (see the NFS-race comment there). */
	gfs2_glock_dq(&io_gh);
	error = inode_dealloc(sdp, ul, &io_gh);
	gfs2_holder_uninit(&io_gh);

	return error;
}
614
615static int inode_dealloc_uninit(struct gfs2_sbd *sdp, struct gfs2_unlinked *ul)
616{
617 struct gfs2_rgrpd *rgd;
618 struct gfs2_holder ri_gh, rgd_gh;
619 int error;
620
621 error = gfs2_rindex_hold(sdp, &ri_gh);
622 if (error)
623 return error;
624
625 rgd = gfs2_blk2rgrpd(sdp, ul->ul_ut.ut_inum.no_addr);
626 if (!rgd) {
627 gfs2_consist(sdp);
628 error = -EIO;
629 goto out;
630 }
631
632 error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &rgd_gh);
633 if (error)
634 goto out;
635
636 error = gfs2_trans_begin(sdp,
637 RES_RG_BIT + RES_UNLINKED + RES_STATFS,
638 0);
639 if (error)
640 goto out_gunlock;
641
642 gfs2_free_uninit_di(rgd, ul->ul_ut.ut_inum.no_addr);
643 gfs2_unlinked_ondisk_rm(sdp, ul);
644
645 gfs2_trans_end(sdp);
646
647 out_gunlock:
648 gfs2_glock_dq_uninit(&rgd_gh);
649 out:
650 gfs2_glock_dq_uninit(&ri_gh);
651
652 return error;
653}
654
655int gfs2_inode_dealloc(struct gfs2_sbd *sdp, struct gfs2_unlinked *ul)
656{
657 if (ul->ul_ut.ut_flags & GFS2_UTF_UNINIT)
658 return inode_dealloc_uninit(sdp, ul);
659 else
660 return try_inode_dealloc(sdp, ul);
661}
662
/**
 * gfs2_change_nlink - Change nlink count on inode
 * @ip: The GFS2 inode
 * @diff: The change in the nlink count required
 *
 * Assumes an active transaction: the dinode buffer is journaled with
 * gfs2_trans_add_bh() but no transaction is started here.
 *
 * Returns: errno
 */

int gfs2_change_nlink(struct gfs2_inode *ip, int diff)
{
	struct buffer_head *dibh;
	uint32_t nlink;
	int error;

	nlink = ip->i_di.di_nlink + diff;

	/* If we are reducing the nlink count, but the new value ends up being
	   bigger than the old one, we must have underflowed. */
	if (diff < 0 && nlink > ip->i_di.di_nlink) {
		if (gfs2_consist_inode(ip))
			gfs2_dinode_print(&ip->i_di);
		return -EIO;
	}

	error = gfs2_meta_inode_buffer(ip, &dibh);
	if (error)
		return error;

	/* Link-count changes also bump ctime. */
	ip->i_di.di_nlink = nlink;
	ip->i_di.di_ctime = get_seconds();

	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
	gfs2_dinode_out(&ip->i_di, dibh->b_data);
	brelse(dibh);

	return 0;
}
700
/**
 * gfs2_lookupi - Look up a filename in a directory and return its inode
 * @dir: the directory to search
 * @name: The name of the inode to look for
 * @is_root: If 1, ignore the caller's permissions
 * @inodep: result: the VFS inode, set only on success
 *
 * "." always resolves to @dir itself, as does ".." when @dir is the
 * filesystem root.
 *
 * There will always be a vnode (Linux VFS inode) for the directory unless
 * @is_root is true.
 *
 * Returns: errno
 */

int gfs2_lookupi(struct inode *dir, struct qstr *name, int is_root,
		 struct inode **inodep)
{
	struct gfs2_inode *ipp;
	struct gfs2_inode *dip = get_v2ip(dir);
	struct gfs2_sbd *sdp = dip->i_sbd;
	struct gfs2_holder d_gh;
	struct gfs2_inum inum;
	unsigned int type;
	struct gfs2_glock *gl;
	int error = 0;

	*inodep = NULL;

	if (!name->len || name->len > GFS2_FNAMESIZE)
		return -ENAMETOOLONG;

	/* "." (and ".." at the root) short-circuit to the directory itself;
	   no directory lock or search is needed. */
	if (gfs2_filecmp(name, ".", 1) ||
	    (gfs2_filecmp(name, "..", 2) && dir == sdp->sd_root_dir)) {
		gfs2_inode_hold(dip);
		ipp = dip;
		goto done;
	}

	error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh);
	if (error)
		return error;

	if (!is_root) {
		error = gfs2_repermission(dip->i_vnode, MAY_EXEC, NULL);
		if (error)
			goto out;
	}

	error = gfs2_dir_search(dip, name, &inum, &type);
	if (error)
		goto out;

	error = gfs2_glock_get(sdp, inum.no_addr, &gfs2_inode_glops,
			       CREATE, &gl);
	if (error)
		goto out;

	/* gfs2_inode_get() takes its own glock reference on success, so the
	   local one is dropped unconditionally below. */
	error = gfs2_inode_get(gl, &inum, CREATE, &ipp);
	if (!error)
		gfs2_inode_min_init(ipp, type);

	gfs2_glock_put(gl);

out:
	gfs2_glock_dq_uninit(&d_gh);
done:
	if (error == 0) {
		*inodep = gfs2_ip2v(ipp);
		if (!*inodep)
			error = -ENOMEM;
		gfs2_inode_put(ipp);
	}
	return error;
}
774
/**
 * pick_formal_ino_1 - Take a formal inode number from the local inum range
 * @sdp: the filesystem
 * @formal_ino: result
 *
 * Reads the inum range stored after the dinode header of sd_ir_inode and,
 * if non-empty, consumes one number from it.
 *
 * Returns: 0 on success, 1 if the range is empty (caller refills via
 * pick_formal_ino_2()), or -errno
 */
static int pick_formal_ino_1(struct gfs2_sbd *sdp, uint64_t *formal_ino)
{
	struct gfs2_inode *ip = get_v2ip(sdp->sd_ir_inode);
	struct buffer_head *bh;
	struct gfs2_inum_range ir;
	int error;

	error = gfs2_trans_begin(sdp, RES_DINODE, 0);
	if (error)
		return error;
	mutex_lock(&sdp->sd_inum_mutex);

	error = gfs2_meta_inode_buffer(ip, &bh);
	if (error) {
		mutex_unlock(&sdp->sd_inum_mutex);
		gfs2_trans_end(sdp);
		return error;
	}

	/* The range lives immediately after the dinode header. */
	gfs2_inum_range_in(&ir, bh->b_data + sizeof(struct gfs2_dinode));

	if (ir.ir_length) {
		/* Consume one number and write the shrunken range back. */
		*formal_ino = ir.ir_start++;
		ir.ir_length--;
		gfs2_trans_add_bh(ip->i_gl, bh, 1);
		gfs2_inum_range_out(&ir,
				    bh->b_data + sizeof(struct gfs2_dinode));
		brelse(bh);
		mutex_unlock(&sdp->sd_inum_mutex);
		gfs2_trans_end(sdp);
		return 0;
	}

	brelse(bh);

	mutex_unlock(&sdp->sd_inum_mutex);
	gfs2_trans_end(sdp);

	/* Range exhausted: signal the caller to refill it. */
	return 1;
}
815
/**
 * pick_formal_ino_2 - Refill the inum range if needed, then take a number
 * @sdp: the filesystem
 * @formal_ino: result
 *
 * Slow path for pick_formal_ino_1(): holds the glock of sd_inum_inode
 * exclusively, grabs a quantum of GFS2_INUM_QUANTUM numbers from the
 * global counter when the local range is empty, and consumes one number.
 *
 * Returns: errno
 */
static int pick_formal_ino_2(struct gfs2_sbd *sdp, uint64_t *formal_ino)
{
	struct gfs2_inode *ip = get_v2ip(sdp->sd_ir_inode);
	struct gfs2_inode *m_ip = get_v2ip(sdp->sd_inum_inode);
	struct gfs2_holder gh;
	struct buffer_head *bh;
	struct gfs2_inum_range ir;
	int error;

	error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
	if (error)
		return error;

	error = gfs2_trans_begin(sdp, 2 * RES_DINODE, 0);
	if (error)
		goto out;
	mutex_lock(&sdp->sd_inum_mutex);

	error = gfs2_meta_inode_buffer(ip, &bh);
	if (error)
		goto out_end_trans;

	gfs2_inum_range_in(&ir, bh->b_data + sizeof(struct gfs2_dinode));

	if (!ir.ir_length) {
		/* Local range empty: carve a new quantum out of the
		   big-endian global counter in sd_inum_inode. */
		struct buffer_head *m_bh;
		uint64_t x, y;

		error = gfs2_meta_inode_buffer(m_ip, &m_bh);
		if (error)
			goto out_brelse;

		x = *(uint64_t *)(m_bh->b_data + sizeof(struct gfs2_dinode));
		x = y = be64_to_cpu(x);
		ir.ir_start = x;
		ir.ir_length = GFS2_INUM_QUANTUM;
		x += GFS2_INUM_QUANTUM;
		/* Wraparound of the 64-bit counter indicates corruption. */
		if (x < y)
			gfs2_consist_inode(m_ip);
		x = cpu_to_be64(x);
		gfs2_trans_add_bh(m_ip->i_gl, m_bh, 1);
		*(uint64_t *)(m_bh->b_data + sizeof(struct gfs2_dinode)) = x;

		brelse(m_bh);
	}

	/* Consume one number from the (possibly just refilled) range. */
	*formal_ino = ir.ir_start++;
	ir.ir_length--;

	gfs2_trans_add_bh(ip->i_gl, bh, 1);
	gfs2_inum_range_out(&ir, bh->b_data + sizeof(struct gfs2_dinode));

 out_brelse:
	brelse(bh);

 out_end_trans:
	mutex_unlock(&sdp->sd_inum_mutex);
	gfs2_trans_end(sdp);

 out:
	gfs2_glock_dq_uninit(&gh);

	return error;
}
880
881static int pick_formal_ino(struct gfs2_sbd *sdp, uint64_t *inum)
882{
883 int error;
884
885 error = pick_formal_ino_1(sdp, inum);
886 if (error <= 0)
887 return error;
888
889 error = pick_formal_ino_2(sdp, inum);
890
891 return error;
892}
893
894/**
895 * create_ok - OK to create a new on-disk inode here?
896 * @dip: Directory in which dinode is to be created
897 * @name: Name of new dinode
898 * @mode:
899 *
900 * Returns: errno
901 */
902
903static int create_ok(struct gfs2_inode *dip, struct qstr *name,
904 unsigned int mode)
905{
906 int error;
907
908 error = gfs2_repermission(dip->i_vnode, MAY_WRITE | MAY_EXEC, NULL);
909 if (error)
910 return error;
911
912 /* Don't create entries in an unlinked directory */
913 if (!dip->i_di.di_nlink)
914 return -EPERM;
915
916 error = gfs2_dir_search(dip, name, NULL, NULL);
917 switch (error) {
918 case -ENOENT:
919 error = 0;
920 break;
921 case 0:
922 return -EEXIST;
923 default:
924 return error;
925 }
926
927 if (dip->i_di.di_entries == (uint32_t)-1)
928 return -EFBIG;
929 if (S_ISDIR(mode) && dip->i_di.di_nlink == (uint32_t)-1)
930 return -EMLINK;
931
932 return 0;
933}
934
935static void munge_mode_uid_gid(struct gfs2_inode *dip, unsigned int *mode,
936 unsigned int *uid, unsigned int *gid)
937{
938 if (dip->i_sbd->sd_args.ar_suiddir &&
939 (dip->i_di.di_mode & S_ISUID) &&
940 dip->i_di.di_uid) {
941 if (S_ISDIR(*mode))
942 *mode |= S_ISUID;
943 else if (dip->i_di.di_uid != current->fsuid)
944 *mode &= ~07111;
945 *uid = dip->i_di.di_uid;
946 } else
947 *uid = current->fsuid;
948
949 if (dip->i_di.di_mode & S_ISGID) {
950 if (S_ISDIR(*mode))
951 *mode |= S_ISGID;
952 *gid = dip->i_di.di_gid;
953 } else
954 *gid = current->fsgid;
955}
956
/**
 * alloc_dinode - Allocate a block for a new dinode
 * @dip: the directory the inode is being created in
 * @ul: unlinked record to fill in with the new block's address
 *
 * The new block is recorded on disk as an *uninitialized* unlinked inode
 * (GFS2_UTF_UNINIT), presumably so that a crash before make_dinode()
 * runs can be cleaned up by inode_dealloc_uninit() — see
 * gfs2_inode_dealloc().
 *
 * Returns: errno
 */
static int alloc_dinode(struct gfs2_inode *dip, struct gfs2_unlinked *ul)
{
	struct gfs2_sbd *sdp = dip->i_sbd;
	int error;

	gfs2_alloc_get(dip);

	/* Reserve space for the dinode block itself. */
	dip->i_alloc.al_requested = RES_DINODE;
	error = gfs2_inplace_reserve(dip);
	if (error)
		goto out;

	error = gfs2_trans_begin(sdp, RES_RG_BIT + RES_UNLINKED +
				 RES_STATFS, 0);
	if (error)
		goto out_ipreserv;

	ul->ul_ut.ut_inum.no_addr = gfs2_alloc_di(dip);

	ul->ul_ut.ut_flags = GFS2_UTF_UNINIT;
	error = gfs2_unlinked_ondisk_add(sdp, ul);

	gfs2_trans_end(sdp);

 out_ipreserv:
	gfs2_inplace_release(dip);

 out:
	gfs2_alloc_put(dip);

	return error;
}
989
/**
 * init_dinode - Fill in a new dinode structure
 * @dip: the directory this inode is being created in
 * @gl: The glock covering the new inode
 * @inum: the inode number
 * @mode: the file permissions
 * @uid: the owner uid of the new inode
 * @gid: the owner gid of the new inode
 *
 * Writes a complete, zero-link dinode into a fresh metadata buffer for
 * the block at @inum->no_addr.  The caller must be inside a transaction;
 * the buffer is journaled via gfs2_trans_add_bh().
 */

static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
			struct gfs2_inum *inum, unsigned int mode,
			unsigned int uid, unsigned int gid)
{
	struct gfs2_sbd *sdp = dip->i_sbd;
	struct gfs2_dinode *di;
	struct buffer_head *dibh;

	dibh = gfs2_meta_new(gl, inum->no_addr);
	gfs2_trans_add_bh(gl, dibh, 1);
	gfs2_metatype_set(dibh, GFS2_METATYPE_DI, GFS2_FORMAT_DI);
	gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
	di = (struct gfs2_dinode *)dibh->b_data;

	/* All on-disk fields are big-endian. */
	di->di_num.no_formal_ino = cpu_to_be64(inum->no_formal_ino);
	di->di_num.no_addr = cpu_to_be64(inum->no_addr);
	di->di_mode = cpu_to_be32(mode);
	di->di_uid = cpu_to_be32(uid);
	di->di_gid = cpu_to_be32(gid);
	/* nlink starts at 0; link_dinode() sets it to 1 when the
	   directory entry is added. */
	di->di_nlink = cpu_to_be32(0);
	di->di_size = cpu_to_be64(0);
	di->di_blocks = cpu_to_be64(1);
	di->di_atime = di->di_mtime = di->di_ctime = cpu_to_be64(get_seconds());
	di->di_major = di->di_minor = cpu_to_be32(0);
	/* Start allocation goals at the dinode's own block. */
	di->di_goal_meta = di->di_goal_data = cpu_to_be64(inum->no_addr);
	di->__pad[0] = di->__pad[1] = 0;
	di->di_flags = cpu_to_be32(0);

	/* Inherit jdata/directio behavior from the parent directory or
	   the filesystem-wide tunables. */
	if (S_ISREG(mode)) {
		if ((dip->i_di.di_flags & GFS2_DIF_INHERIT_JDATA) ||
		    gfs2_tune_get(sdp, gt_new_files_jdata))
			di->di_flags |= cpu_to_be32(GFS2_DIF_JDATA);
		if ((dip->i_di.di_flags & GFS2_DIF_INHERIT_DIRECTIO) ||
		    gfs2_tune_get(sdp, gt_new_files_directio))
			di->di_flags |= cpu_to_be32(GFS2_DIF_DIRECTIO);
	} else if (S_ISDIR(mode)) {
		di->di_flags |= cpu_to_be32(dip->i_di.di_flags & GFS2_DIF_INHERIT_DIRECTIO);
		di->di_flags |= cpu_to_be32(dip->i_di.di_flags & GFS2_DIF_INHERIT_JDATA);
	}

	di->__pad1 = 0;
	di->di_height = cpu_to_be32(0);
	di->__pad2 = 0;
	di->__pad3 = 0;
	di->di_depth = cpu_to_be16(0);
	di->di_entries = cpu_to_be32(0);
	memset(&di->__pad4, 0, sizeof(di->__pad4));
	di->di_eattr = cpu_to_be64(0);
	memset(&di->di_reserved, 0, sizeof(di->di_reserved));

	brelse(dibh);
}
1053
/**
 * make_dinode - Initialize the newly allocated dinode on disk
 * @dip: the directory the inode is being created in
 * @gl: the glock covering the new inode
 * @mode: the requested file mode (adjusted by munge_mode_uid_gid())
 * @ul: the unlinked record created by alloc_dinode()
 *
 * Clears GFS2_UTF_UNINIT on the unlinked record, writes the dinode via
 * init_dinode(), and charges one inode to the owner's quota — all in one
 * transaction.
 *
 * Returns: errno
 */
static int make_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
		       unsigned int mode, struct gfs2_unlinked *ul)
{
	struct gfs2_sbd *sdp = dip->i_sbd;
	unsigned int uid, gid;
	int error;

	munge_mode_uid_gid(dip, &mode, &uid, &gid);

	gfs2_alloc_get(dip);

	error = gfs2_quota_lock(dip, uid, gid);
	if (error)
		goto out;

	error = gfs2_quota_check(dip, uid, gid);
	if (error)
		goto out_quota;

	error = gfs2_trans_begin(sdp, RES_DINODE + RES_UNLINKED +
				 RES_QUOTA, 0);
	if (error)
		goto out_quota;

	/* The dinode is about to be initialized, so the unlinked record
	   no longer marks an uninitialized block. */
	ul->ul_ut.ut_flags = 0;
	error = gfs2_unlinked_ondisk_munge(sdp, ul);

	init_dinode(dip, gl, &ul->ul_ut.ut_inum,
		    mode, uid, gid);

	gfs2_quota_change(dip, +1, uid, gid);

	gfs2_trans_end(sdp);

 out_quota:
	gfs2_quota_unlock(dip);

 out:
	gfs2_alloc_put(dip);

	return error;
}
1096
1097static int link_dinode(struct gfs2_inode *dip, struct qstr *name,
1098 struct gfs2_inode *ip, struct gfs2_unlinked *ul)
1099{
1100 struct gfs2_sbd *sdp = dip->i_sbd;
1101 struct gfs2_alloc *al;
1102 int alloc_required;
1103 struct buffer_head *dibh;
1104 int error;
1105
1106 al = gfs2_alloc_get(dip);
1107
1108 error = gfs2_quota_lock(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
1109 if (error)
1110 goto fail;
1111
1112 error = gfs2_diradd_alloc_required(dip, name, &alloc_required);
1113 if (alloc_required) {
1114 error = gfs2_quota_check(dip, dip->i_di.di_uid,
1115 dip->i_di.di_gid);
1116 if (error)
1117 goto fail_quota_locks;
1118
1119 al->al_requested = sdp->sd_max_dirres;
1120
1121 error = gfs2_inplace_reserve(dip);
1122 if (error)
1123 goto fail_quota_locks;
1124
1125 error = gfs2_trans_begin(sdp,
1126 sdp->sd_max_dirres +
1127 al->al_rgd->rd_ri.ri_length +
1128 2 * RES_DINODE + RES_UNLINKED +
1129 RES_STATFS + RES_QUOTA, 0);
1130 if (error)
1131 goto fail_ipreserv;
1132 } else {
1133 error = gfs2_trans_begin(sdp,
1134 RES_LEAF +
1135 2 * RES_DINODE +
1136 RES_UNLINKED, 0);
1137 if (error)
1138 goto fail_quota_locks;
1139 }
1140
1141 error = gfs2_dir_add(dip, name, &ip->i_num, IF2DT(ip->i_di.di_mode));
1142 if (error)
1143 goto fail_end_trans;
1144
1145 error = gfs2_meta_inode_buffer(ip, &dibh);
1146 if (error)
1147 goto fail_end_trans;
1148 ip->i_di.di_nlink = 1;
1149 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1150 gfs2_dinode_out(&ip->i_di, dibh->b_data);
1151 brelse(dibh);
1152
1153 error = gfs2_unlinked_ondisk_rm(sdp, ul);
1154 if (error)
1155 goto fail_end_trans;
1156
1157 return 0;
1158
1159 fail_end_trans:
1160 gfs2_trans_end(sdp);
1161
1162 fail_ipreserv:
1163 if (dip->i_alloc.al_rgd)
1164 gfs2_inplace_release(dip);
1165
1166 fail_quota_locks:
1167 gfs2_quota_unlock(dip);
1168
1169 fail:
1170 gfs2_alloc_put(dip);
1171
1172 return error;
1173}
1174
/**
 * gfs2_createi - Create a new inode
 * @ghs: An array of two holders
 * @name: The name of the new file
 * @mode: the permissions on the new inode
 *
 * @ghs[0] is an initialized holder for the directory
 * @ghs[1] is the holder for the inode lock
 *
 * If the return value is not NULL, the glocks on both the directory and the new
 * file are held. A transaction has been started and an inplace reservation
 * is held, as well.
 *
 * Returns: An inode
 */

struct inode *gfs2_createi(struct gfs2_holder *ghs, struct qstr *name, unsigned int mode)
{
	struct inode *inode;
	struct gfs2_inode *dip = get_gl2ip(ghs->gh_gl);
	struct gfs2_sbd *sdp = dip->i_sbd;
	struct gfs2_unlinked *ul;
	struct gfs2_inode *ip;
	int error;

	if (!name->len || name->len > GFS2_FNAMESIZE)
		return ERR_PTR(-ENAMETOOLONG);

	error = gfs2_unlinked_get(sdp, &ul);
	if (error)
		return ERR_PTR(error);

	gfs2_holder_reinit(LM_ST_EXCLUSIVE, 0, ghs);
	error = gfs2_glock_nq(ghs);
	if (error)
		goto fail;

	error = create_ok(dip, name, mode);
	if (error)
		goto fail_gunlock;

	error = pick_formal_ino(sdp, &ul->ul_ut.ut_inum.no_formal_ino);
	if (error)
		goto fail_gunlock;

	error = alloc_dinode(dip, ul);
	if (error)
		goto fail_gunlock;

	/* Inode glocks are acquired in ascending block-number order,
	   presumably to avoid deadlock with concurrent lockers.  If the
	   new inode's block number sorts before the directory's, drop
	   the directory lock, take the new inode's lock first, retake
	   the directory lock, and re-verify the create is still OK. */
	if (ul->ul_ut.ut_inum.no_addr < dip->i_num.no_addr) {
		gfs2_glock_dq(ghs);

		error = gfs2_glock_nq_num(sdp,
					  ul->ul_ut.ut_inum.no_addr,
					  &gfs2_inode_glops,
					  LM_ST_EXCLUSIVE, GL_SKIP,
					  ghs + 1);
		if (error) {
			gfs2_unlinked_put(sdp, ul);
			return ERR_PTR(error);
		}

		gfs2_holder_reinit(LM_ST_EXCLUSIVE, 0, ghs);
		error = gfs2_glock_nq(ghs);
		if (error) {
			gfs2_glock_dq_uninit(ghs + 1);
			gfs2_unlinked_put(sdp, ul);
			return ERR_PTR(error);
		}

		/* The directory was unlocked for a while; re-check. */
		error = create_ok(dip, name, mode);
		if (error)
			goto fail_gunlock2;
	} else {
		error = gfs2_glock_nq_num(sdp,
					  ul->ul_ut.ut_inum.no_addr,
					  &gfs2_inode_glops,
					  LM_ST_EXCLUSIVE, GL_SKIP,
					  ghs + 1);
		if (error)
			goto fail_gunlock;
	}

	error = make_dinode(dip, ghs[1].gh_gl, mode, ul);
	if (error)
		goto fail_gunlock2;

	error = gfs2_inode_get(ghs[1].gh_gl, &ul->ul_ut.ut_inum, CREATE, &ip);
	if (error)
		goto fail_gunlock2;

	error = gfs2_inode_refresh(ip);
	if (error)
		goto fail_iput;

	error = gfs2_acl_create(dip, ip);
	if (error)
		goto fail_iput;

	/* Add the directory entry; this also removes the unlinked
	   record covering the new inode. */
	error = link_dinode(dip, name, ip, ul);
	if (error)
		goto fail_iput;

	gfs2_unlinked_put(sdp, ul);

	inode = gfs2_ip2v(ip);
	gfs2_inode_put(ip);
	if (!inode)
		return ERR_PTR(-ENOMEM);
	return inode;

 fail_iput:
	gfs2_inode_put(ip);

 fail_gunlock2:
	gfs2_glock_dq_uninit(ghs + 1);

 fail_gunlock:
	gfs2_glock_dq(ghs);

 fail:
	gfs2_unlinked_put(sdp, ul);

	return ERR_PTR(error);
}
1300
/**
 * gfs2_unlinki - Unlink a file
 * @dip: The inode of the directory
 * @name: The name of the file to be unlinked
 * @ip: The inode of the file to be removed
 * @ul: unlinked record to fill in if the last link goes away
 *
 * Assumes Glocks on both dip and ip are held.
 *
 * Returns: errno
 */

int gfs2_unlinki(struct gfs2_inode *dip, struct qstr *name,
		 struct gfs2_inode *ip, struct gfs2_unlinked *ul)
{
	struct gfs2_sbd *sdp = dip->i_sbd;
	int error;

	error = gfs2_dir_del(dip, name);
	if (error)
		return error;

	error = gfs2_change_nlink(ip, -1);
	if (error)
		return error;

	/* If this inode is being unlinked from the directory structure,
	   we need to mark that in the log so that it isn't lost during
	   a crash. */

	if (!ip->i_di.di_nlink) {
		ul->ul_ut.ut_inum = ip->i_num;
		error = gfs2_unlinked_ondisk_add(sdp, ul);
		if (!error)
			/* Keep the glock cached until deallocation. */
			set_bit(GLF_STICKY, &ip->i_gl->gl_flags);
	}

	return error;
}
1339
/**
 * gfs2_rmdiri - Remove a directory
 * @dip: The parent directory of the directory to be removed
 * @name: The name of the directory to be removed
 * @ip: The GFS2 inode of the directory to be removed
 * @ul: unlinked record to fill in for the removed directory
 *
 * Assumes Glocks on dip and ip are held
 *
 * Returns: errno
 */

int gfs2_rmdiri(struct gfs2_inode *dip, struct qstr *name,
		struct gfs2_inode *ip, struct gfs2_unlinked *ul)
{
	struct gfs2_sbd *sdp = dip->i_sbd;
	struct qstr dotname;
	int error;

	/* An empty directory holds exactly "." and "..". */
	if (ip->i_di.di_entries != 2) {
		if (gfs2_consist_inode(ip))
			gfs2_dinode_print(&ip->i_di);
		return -EIO;
	}

	error = gfs2_dir_del(dip, name);
	if (error)
		return error;

	/* The victim's ".." entry accounted for one link on the parent. */
	error = gfs2_change_nlink(dip, -1);
	if (error)
		return error;

	dotname.len = 1;
	dotname.name = ".";
	error = gfs2_dir_del(ip, &dotname);
	if (error)
		return error;

	dotname.len = 2;
	dotname.name = "..";
	error = gfs2_dir_del(ip, &dotname);
	if (error)
		return error;

	/* Drop the victim's two remaining links (parent entry + "."). */
	error = gfs2_change_nlink(ip, -2);
	if (error)
		return error;

	/* This inode is being unlinked from the directory structure and
	   we need to mark that in the log so that it isn't lost during
	   a crash. */

	ul->ul_ut.ut_inum = ip->i_num;
	error = gfs2_unlinked_ondisk_add(sdp, ul);
	if (!error)
		/* Keep the glock cached until deallocation. */
		set_bit(GLF_STICKY, &ip->i_gl->gl_flags);

	return error;
}
1399
/*
 * gfs2_unlink_ok - check to see that a inode is still in a directory
 * @dip: the directory
 * @name: the name of the file
 * @ip: the inode
 *
 * Assumes that the lock on (at least) @dip is held.
 *
 * Returns: 0 if the parent/child relationship is correct, errno if it isn't
 */

int gfs2_unlink_ok(struct gfs2_inode *dip, struct qstr *name,
		   struct gfs2_inode *ip)
{
	struct gfs2_inum inum;
	unsigned int type;
	int error;

	if (IS_IMMUTABLE(ip->i_vnode) || IS_APPEND(ip->i_vnode))
		return -EPERM;

	/* Sticky directory: only the file owner, the directory owner, or
	   a CAP_FOWNER holder may remove the entry. */
	if ((dip->i_di.di_mode & S_ISVTX) &&
	    dip->i_di.di_uid != current->fsuid &&
	    ip->i_di.di_uid != current->fsuid &&
	    !capable(CAP_FOWNER))
		return -EPERM;

	if (IS_APPEND(dip->i_vnode))
		return -EPERM;

	error = gfs2_repermission(dip->i_vnode, MAY_WRITE | MAY_EXEC, NULL);
	if (error)
		return error;

	/* Re-verify that @name still maps to @ip in @dip. */
	error = gfs2_dir_search(dip, name, &inum, &type);
	if (error)
		return error;

	if (!gfs2_inum_equal(&inum, &ip->i_num))
		return -ENOENT;

	/* The entry's recorded type must agree with the inode's mode. */
	if (IF2DT(ip->i_di.di_mode) != type) {
		gfs2_consist_inode(dip);
		return -EIO;
	}

	return 0;
}
1448
/*
 * gfs2_ok_to_move - check if it's ok to move a directory to another directory
 * @this: move this
 * @to: to here
 *
 * Follow @to back to the root and make sure we don't encounter @this
 * Assumes we already hold the rename lock.
 *
 * Returns: errno
 */

int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to)
{
	struct gfs2_sbd *sdp = this->i_sbd;
	struct inode *dir = to->i_vnode;
	struct inode *tmp;
	struct qstr dotdot;
	int error = 0;

	memset(&dotdot, 0, sizeof(struct qstr));
	dotdot.name = "..";
	dotdot.len = 2;

	/* Take a reference on the starting point; each ".." lookup below
	   returns a referenced inode and the previous one is iput(). */
	igrab(dir);

	for (;;) {
		/* Found @this on the path from @to to the root: the move
		   would create a cycle. */
		if (dir == this->i_vnode) {
			error = -EINVAL;
			break;
		}
		if (dir == sdp->sd_root_dir) {
			error = 0;
			break;
		}

		/* is_root=1: permission checks are skipped on the walk. */
		error = gfs2_lookupi(dir, &dotdot, 1, &tmp);
		if (error)
			break;

		iput(dir);
		dir = tmp;
	}

	iput(dir);

	return error;
}
1496
/**
 * gfs2_readlinki - return the contents of a symlink
 * @ip: the symlink's inode
 * @buf: a pointer to the buffer to be filled
 * @len: a pointer to the length of @buf; updated to the copied length
 *
 * If @buf is too small, a piece of memory is kmalloc()ed and needs
 * to be freed by the caller.
 *
 * Returns: errno
 */

int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len)
{
	struct gfs2_holder i_gh;
	struct buffer_head *dibh;
	unsigned int x;
	int error;

	gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &i_gh);
	error = gfs2_glock_nq_atime(&i_gh);
	if (error) {
		gfs2_holder_uninit(&i_gh);
		return error;
	}

	/* A symlink must have a non-empty target. */
	if (!ip->i_di.di_size) {
		gfs2_consist_inode(ip);
		error = -EIO;
		goto out;
	}

	error = gfs2_meta_inode_buffer(ip, &dibh);
	if (error)
		goto out;

	/* Copy di_size + 1 bytes: the target plus its terminator
	   (presumably a NUL stored on disk — the extra byte is always
	   included). */
	x = ip->i_di.di_size + 1;
	if (x > *len) {
		*buf = kmalloc(x, GFP_KERNEL);
		if (!*buf) {
			error = -ENOMEM;
			goto out_brelse;
		}
	}

	/* The target is stuffed directly after the dinode header. */
	memcpy(*buf, dibh->b_data + sizeof(struct gfs2_dinode), x);
	*len = x;

 out_brelse:
	brelse(dibh);

 out:
	gfs2_glock_dq_uninit(&i_gh);

	return error;
}
1553
/**
 * gfs2_glock_nq_atime - Acquire a hold on an inode's glock, and
 *       conditionally update the inode's atime
 * @gh: the holder to acquire
 *
 * Tests atime (access time) for gfs2_read, gfs2_readdir and gfs2_mmap
 * Update if the difference between the current time and the inode's current
 * atime is greater than an interval specified at mount.
 *
 * Returns: errno
 */

int gfs2_glock_nq_atime(struct gfs2_holder *gh)
{
	struct gfs2_glock *gl = gh->gh_gl;
	struct gfs2_sbd *sdp = gl->gl_sbd;
	struct gfs2_inode *ip = get_gl2ip(gl);
	int64_t curtime, quantum = gfs2_tune_get(sdp, gt_atime_quantum);
	unsigned int state;
	int flags;
	int error;

	/* Caller must request GL_ATIME, must not be async, and the glock
	   must really be an inode glock. */
	if (gfs2_assert_warn(sdp, gh->gh_flags & GL_ATIME) ||
	    gfs2_assert_warn(sdp, !(gh->gh_flags & GL_ASYNC)) ||
	    gfs2_assert_warn(sdp, gl->gl_ops == &gfs2_inode_glops))
		return -EINVAL;

	/* Remember the originally requested state/flags so we can restore
	   them if we have to upgrade to exclusive below. */
	state = gh->gh_state;
	flags = gh->gh_flags;

	error = gfs2_glock_nq(gh);
	if (error)
		return error;

	/* No atime updates on noatime or read-only mounts. */
	if (test_bit(SDF_NOATIME, &sdp->sd_flags) ||
	    (sdp->sd_vfs->s_flags & MS_RDONLY))
		return 0;

	curtime = get_seconds();
	if (curtime - ip->i_di.di_atime >= quantum) {
		/* atime is stale: upgrade to an exclusive hold so the
		   dinode can be rewritten. */
		gfs2_glock_dq(gh);
		gfs2_holder_reinit(LM_ST_EXCLUSIVE,
				   gh->gh_flags & ~LM_FLAG_ANY,
				   gh);
		error = gfs2_glock_nq(gh);
		if (error)
			return error;

		/* Verify that atime hasn't been updated while we were
		   trying to get exclusive lock. */

		curtime = get_seconds();
		if (curtime - ip->i_di.di_atime >= quantum) {
			struct buffer_head *dibh;

			error = gfs2_trans_begin(sdp, RES_DINODE, 0);
			/* -EROFS: filesystem went read-only — silently
			   skip the update but keep the lock. */
			if (error == -EROFS)
				return 0;
			if (error)
				goto fail;

			error = gfs2_meta_inode_buffer(ip, &dibh);
			if (error)
				goto fail_end_trans;

			ip->i_di.di_atime = curtime;

			gfs2_trans_add_bh(ip->i_gl, dibh, 1);
			gfs2_dinode_out(&ip->i_di, dibh->b_data);
			brelse(dibh);

			gfs2_trans_end(sdp);
		}

		/* If someone else has asked for the glock,
		   unlock and let them have it.  Then reacquire
		   in the original state. */
		if (gfs2_glock_is_blocking(gl)) {
			gfs2_glock_dq(gh);
			gfs2_holder_reinit(state, flags, gh);
			return gfs2_glock_nq(gh);
		}
	}

	return 0;

 fail_end_trans:
	gfs2_trans_end(sdp);

 fail:
	gfs2_glock_dq(gh);

	return error;
}
1648
1649/**
1650 * glock_compare_atime - Compare two struct gfs2_glock structures for sort
1651 * @arg_a: the first structure
1652 * @arg_b: the second structure
1653 *
1654 * Returns: 1 if A > B
1655 * -1 if A < B
1656 * 0 if A = B
1657 */
1658
1659static int glock_compare_atime(const void *arg_a, const void *arg_b)
1660{
1661 struct gfs2_holder *gh_a = *(struct gfs2_holder **)arg_a;
1662 struct gfs2_holder *gh_b = *(struct gfs2_holder **)arg_b;
1663 struct lm_lockname *a = &gh_a->gh_gl->gl_name;
1664 struct lm_lockname *b = &gh_b->gh_gl->gl_name;
1665 int ret = 0;
1666
1667 if (a->ln_number > b->ln_number)
1668 ret = 1;
1669 else if (a->ln_number < b->ln_number)
1670 ret = -1;
1671 else {
1672 if (gh_a->gh_state == LM_ST_SHARED &&
1673 gh_b->gh_state == LM_ST_EXCLUSIVE)
1674 ret = 1;
1675 else if (gh_a->gh_state == LM_ST_SHARED &&
1676 (gh_b->gh_flags & GL_ATIME))
1677 ret = 1;
1678 }
1679
1680 return ret;
1681}
1682
/**
 * gfs2_glock_nq_m_atime - acquire multiple glocks where one may need an
 *      atime update
 * @num_gh: the number of structures
 * @ghs: an array of struct gfs2_holder structures
 *
 * Returns: 0 on success (all glocks acquired),
 *          errno on failure (no glocks acquired)
 */

int gfs2_glock_nq_m_atime(unsigned int num_gh, struct gfs2_holder *ghs)
{
	struct gfs2_holder **p;
	unsigned int x;
	int error = 0;

	if (!num_gh)
		return 0;

	/* Single holder: no ordering needed; go straight to acquisition. */
	if (num_gh == 1) {
		ghs->gh_flags &= ~(LM_FLAG_TRY | GL_ASYNC);
		if (ghs->gh_flags & GL_ATIME)
			error = gfs2_glock_nq_atime(ghs);
		else
			error = gfs2_glock_nq(ghs);
		return error;
	}

	p = kcalloc(num_gh, sizeof(struct gfs2_holder *), GFP_KERNEL);
	if (!p)
		return -ENOMEM;

	for (x = 0; x < num_gh; x++)
		p[x] = &ghs[x];

	/* Acquire in a canonical order (by lock number; see
	   glock_compare_atime) to avoid lock-ordering deadlocks. */
	sort(p, num_gh, sizeof(struct gfs2_holder *), glock_compare_atime,NULL);

	for (x = 0; x < num_gh; x++) {
		p[x]->gh_flags &= ~(LM_FLAG_TRY | GL_ASYNC);

		if (p[x]->gh_flags & GL_ATIME)
			error = gfs2_glock_nq_atime(p[x]);
		else
			error = gfs2_glock_nq(p[x]);

		/* On failure, release everything acquired so far. */
		if (error) {
			while (x--)
				gfs2_glock_dq(p[x]);
			break;
		}
	}

	kfree(p);

	return error;
}
1739
/**
 * gfs2_try_toss_vnode - See if we can toss a vnode from memory
 * @ip: the inode
 *
 * Prunes dcache aliases (and, for directories, any unmounted subtree)
 * and drops the VFS inode.  No return value: if the vnode is still in
 * use this is simply a best-effort no-op.
 */

void gfs2_try_toss_vnode(struct gfs2_inode *ip)
{
	struct inode *inode;

	inode = gfs2_ip2v_lookup(ip);
	if (!inode)
		return;

	d_prune_aliases(inode);

	if (S_ISDIR(ip->i_di.di_mode)) {
		struct list_head *head = &inode->i_dentry;
		struct dentry *d = NULL;

		/* Grab the first alias under dcache_lock, then shrink its
		   subtree — unless something is mounted on it. */
		spin_lock(&dcache_lock);
		if (list_empty(head))
			spin_unlock(&dcache_lock);
		else {
			d = list_entry(head->next, struct dentry, d_alias);
			dget_locked(d);
			spin_unlock(&dcache_lock);

			if (have_submounts(d))
				dput(d);
			else {
				shrink_dcache_parent(d);
				dput(d);
				d_prune_aliases(inode);
			}
		}
	}

	/* Zero the link count so the final iput() evicts the inode. */
	inode->i_nlink = 0;
	iput(inode);
}
1782
1783
1784static int
1785__gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr)
1786{
1787 struct buffer_head *dibh;
1788 int error;
1789
1790 error = gfs2_meta_inode_buffer(ip, &dibh);
1791 if (!error) {
1792 error = inode_setattr(ip->i_vnode, attr);
1793 gfs2_assert_warn(ip->i_sbd, !error);
1794 gfs2_inode_attr_out(ip);
1795
1796 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1797 gfs2_dinode_out(&ip->i_di, dibh->b_data);
1798 brelse(dibh);
1799 }
1800 return error;
1801}
1802
1803/**
1804 * gfs2_setattr_simple -
1805 * @ip:
1806 * @attr:
1807 *
1808 * Called with a reference on the vnode.
1809 *
1810 * Returns: errno
1811 */
1812
1813int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr)
1814{
1815 int error;
1816
1817 if (get_transaction)
1818 return __gfs2_setattr_simple(ip, attr);
1819
1820 error = gfs2_trans_begin(ip->i_sbd, RES_DINODE, 0);
1821 if (error)
1822 return error;
1823
1824 error = __gfs2_setattr_simple(ip, attr);
1825
1826 gfs2_trans_end(ip->i_sbd);
1827
1828 return error;
1829}
1830
/* Thin wrapper around the VFS permission() call — presumably kept as a
   single GFS2-wide hook point for permission checks. */
int gfs2_repermission(struct inode *inode, int mask, struct nameidata *nd)
{
	return permission(inode, mask, nd);
}
1835
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
new file mode 100644
index 000000000000..8ef85f5feb1b
--- /dev/null
+++ b/fs/gfs2/inode.h
@@ -0,0 +1,81 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __INODE_DOT_H__
11#define __INODE_DOT_H__
12
13static inline int gfs2_is_stuffed(struct gfs2_inode *ip)
14{
15 return !ip->i_di.di_height;
16}
17
/* Test the journaled-data flag.  Note: returns the raw GFS2_DIF_JDATA
   bit (nonzero if set), not a normalized 0/1 boolean. */
static inline int gfs2_is_jdata(struct gfs2_inode *ip)
{
	return ip->i_di.di_flags & GFS2_DIF_JDATA;
}
22
/* True if the on-disk inode mode marks this inode as a directory. */
static inline int gfs2_is_dir(struct gfs2_inode *ip)
{
	return S_ISDIR(ip->i_di.di_mode);
}
27
28void gfs2_inode_attr_in(struct gfs2_inode *ip);
29void gfs2_inode_attr_out(struct gfs2_inode *ip);
30struct inode *gfs2_ip2v_lookup(struct gfs2_inode *ip);
31struct inode *gfs2_ip2v(struct gfs2_inode *ip);
32struct inode *gfs2_iget(struct super_block *sb, struct gfs2_inum *inum);
33
34void gfs2_inode_min_init(struct gfs2_inode *ip, unsigned int type);
35int gfs2_inode_refresh(struct gfs2_inode *ip);
36
37int gfs2_inode_get(struct gfs2_glock *i_gl,
38 struct gfs2_inum *inum, int create,
39 struct gfs2_inode **ipp);
40void gfs2_inode_hold(struct gfs2_inode *ip);
41void gfs2_inode_put(struct gfs2_inode *ip);
42void gfs2_inode_destroy(struct gfs2_inode *ip);
43
44int gfs2_inode_dealloc(struct gfs2_sbd *sdp, struct gfs2_unlinked *ul);
45
46int gfs2_change_nlink(struct gfs2_inode *ip, int diff);
47int gfs2_lookupi(struct inode *dir, struct qstr *name, int is_root,
48 struct inode **ipp);
49struct inode *gfs2_createi(struct gfs2_holder *ghs, struct qstr *name, unsigned int mode);
50int gfs2_unlinki(struct gfs2_inode *dip, struct qstr *name,
51 struct gfs2_inode *ip, struct gfs2_unlinked *ul);
52int gfs2_rmdiri(struct gfs2_inode *dip, struct qstr *name,
53 struct gfs2_inode *ip, struct gfs2_unlinked *ul);
54int gfs2_unlink_ok(struct gfs2_inode *dip, struct qstr *name,
55 struct gfs2_inode *ip);
56int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to);
57int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len);
58
59int gfs2_glock_nq_atime(struct gfs2_holder *gh);
60int gfs2_glock_nq_m_atime(unsigned int num_gh, struct gfs2_holder *ghs);
61
62void gfs2_try_toss_vnode(struct gfs2_inode *ip);
63
64int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr);
65
66int gfs2_repermission(struct inode *inode, int mask, struct nameidata *nd);
67
68static inline int gfs2_lookup_simple(struct inode *dip, char *name,
69 struct inode **ipp)
70{
71 struct qstr qstr;
72 int err;
73 memset(&qstr, 0, sizeof(struct qstr));
74 qstr.name = name;
75 qstr.len = strlen(name);
76 err = gfs2_lookupi(dip, &qstr, 1, ipp);
77 return err;
78}
79
80#endif /* __INODE_DOT_H__ */
81
diff --git a/fs/gfs2/lm.c b/fs/gfs2/lm.c
new file mode 100644
index 000000000000..cc7442261b2e
--- /dev/null
+++ b/fs/gfs2/lm.c
@@ -0,0 +1,235 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/delay.h>
16#include <asm/semaphore.h>
17
18#include "gfs2.h"
19#include "glock.h"
20#include "lm.h"
21#include "super.h"
22
23/**
24 * gfs2_lm_mount - mount a locking protocol
25 * @sdp: the filesystem
26 * @args: mount arguements
27 * @silent: if 1, don't complain if the FS isn't a GFS2 fs
28 *
29 * Returns: errno
30 */
31
32int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent)
33{
34 char *proto = sdp->sd_proto_name;
35 char *table = sdp->sd_table_name;
36 int flags = 0;
37 int error;
38
39 if (sdp->sd_args.ar_spectator)
40 flags |= LM_MFLAG_SPECTATOR;
41
42 fs_info(sdp, "Trying to join cluster \"%s\", \"%s\"\n", proto, table);
43
44 error = gfs2_mount_lockproto(proto, table, sdp->sd_args.ar_hostdata,
45 gfs2_glock_cb, sdp,
46 GFS2_MIN_LVB_SIZE, flags,
47 &sdp->sd_lockstruct, &sdp->sd_kobj);
48 if (error) {
49 fs_info(sdp, "can't mount proto=%s, table=%s, hostdata=%s\n",
50 proto, table, sdp->sd_args.ar_hostdata);
51 goto out;
52 }
53
54 if (gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_lockspace) ||
55 gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_ops) ||
56 gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_lvb_size >=
57 GFS2_MIN_LVB_SIZE)) {
58 gfs2_unmount_lockproto(&sdp->sd_lockstruct);
59 goto out;
60 }
61
62 if (sdp->sd_args.ar_spectator)
63 snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.s", table);
64 else
65 snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.%u", table,
66 sdp->sd_lockstruct.ls_jid);
67
68 fs_info(sdp, "Joined cluster. Now mounting FS...\n");
69
70 if ((sdp->sd_lockstruct.ls_flags & LM_LSFLAG_LOCAL) &&
71 !sdp->sd_args.ar_ignore_local_fs) {
72 sdp->sd_args.ar_localflocks = 1;
73 sdp->sd_args.ar_localcaching = 1;
74 }
75
76 out:
77 return error;
78}
79
80void gfs2_lm_others_may_mount(struct gfs2_sbd *sdp)
81{
82 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
83 sdp->sd_lockstruct.ls_ops->lm_others_may_mount(sdp->sd_lockstruct.ls_lockspace);
84}
85
86void gfs2_lm_unmount(struct gfs2_sbd *sdp)
87{
88 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
89 gfs2_unmount_lockproto(&sdp->sd_lockstruct);
90}
91
/*
 * gfs2_lm_withdraw - withdraw this node from the cluster after a fatal error
 * @sdp: the filesystem
 * @fmt: printk-style message describing why
 *
 * Idempotent: only the first caller performs the withdraw (guarded by
 * SDF_SHUTDOWN); later callers return 0 immediately.
 *
 * Returns: 0 if already withdrawn, otherwise -1
 */
int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...)
{
	va_list args;

	if (test_and_set_bit(SDF_SHUTDOWN, &sdp->sd_flags))
		return 0;

	va_start(args, fmt);
	vprintk(fmt, args);
	va_end(args);

	fs_err(sdp, "about to withdraw from the cluster\n");
	/* With the debug mount option, crash here instead of withdrawing
	   so the failure can be inspected. */
	if (sdp->sd_args.ar_debug)
		BUG();

	fs_err(sdp, "waiting for outstanding I/O\n");

	/* FIXME: suspend dm device so outstanding bio's complete
	   and all further io requests fail */

	fs_err(sdp, "telling LM to withdraw\n");
	gfs2_withdraw_lockproto(&sdp->sd_lockstruct);
	fs_err(sdp, "withdrawn\n");
	dump_stack();

	return -1;
}
119
120int gfs2_lm_get_lock(struct gfs2_sbd *sdp, struct lm_lockname *name,
121 lm_lock_t **lockp)
122{
123 int error;
124 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
125 error = -EIO;
126 else
127 error = sdp->sd_lockstruct.ls_ops->lm_get_lock(sdp->sd_lockstruct.ls_lockspace, name, lockp);
128 return error;
129}
130
131void gfs2_lm_put_lock(struct gfs2_sbd *sdp, lm_lock_t *lock)
132{
133 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
134 sdp->sd_lockstruct.ls_ops->lm_put_lock(lock);
135}
136
137unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, lm_lock_t *lock,
138 unsigned int cur_state, unsigned int req_state,
139 unsigned int flags)
140{
141 int ret;
142 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
143 ret = 0;
144 else
145 ret = sdp->sd_lockstruct.ls_ops->lm_lock(lock,
146 cur_state,
147 req_state, flags);
148 return ret;
149}
150
151unsigned int gfs2_lm_unlock(struct gfs2_sbd *sdp, lm_lock_t *lock,
152 unsigned int cur_state)
153{
154 int ret;
155 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
156 ret = 0;
157 else
158 ret = sdp->sd_lockstruct.ls_ops->lm_unlock(lock, cur_state);
159 return ret;
160}
161
162void gfs2_lm_cancel(struct gfs2_sbd *sdp, lm_lock_t *lock)
163{
164 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
165 sdp->sd_lockstruct.ls_ops->lm_cancel(lock);
166}
167
168int gfs2_lm_hold_lvb(struct gfs2_sbd *sdp, lm_lock_t *lock, char **lvbp)
169{
170 int error;
171 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
172 error = -EIO;
173 else
174 error = sdp->sd_lockstruct.ls_ops->lm_hold_lvb(lock, lvbp);
175 return error;
176}
177
178void gfs2_lm_unhold_lvb(struct gfs2_sbd *sdp, lm_lock_t *lock, char *lvb)
179{
180 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
181 sdp->sd_lockstruct.ls_ops->lm_unhold_lvb(lock, lvb);
182}
183
184void gfs2_lm_sync_lvb(struct gfs2_sbd *sdp, lm_lock_t *lock, char *lvb)
185{
186 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
187 sdp->sd_lockstruct.ls_ops->lm_sync_lvb(lock, lvb);
188}
189
190int gfs2_lm_plock_get(struct gfs2_sbd *sdp, struct lm_lockname *name,
191 struct file *file, struct file_lock *fl)
192{
193 int error;
194 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
195 error = -EIO;
196 else
197 error = sdp->sd_lockstruct.ls_ops->lm_plock_get(
198 sdp->sd_lockstruct.ls_lockspace,
199 name, file, fl);
200 return error;
201}
202
203int gfs2_lm_plock(struct gfs2_sbd *sdp, struct lm_lockname *name,
204 struct file *file, int cmd, struct file_lock *fl)
205{
206 int error;
207 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
208 error = -EIO;
209 else
210 error = sdp->sd_lockstruct.ls_ops->lm_plock(
211 sdp->sd_lockstruct.ls_lockspace,
212 name, file, cmd, fl);
213 return error;
214}
215
216int gfs2_lm_punlock(struct gfs2_sbd *sdp, struct lm_lockname *name,
217 struct file *file, struct file_lock *fl)
218{
219 int error;
220 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
221 error = -EIO;
222 else
223 error = sdp->sd_lockstruct.ls_ops->lm_punlock(
224 sdp->sd_lockstruct.ls_lockspace,
225 name, file, fl);
226 return error;
227}
228
229void gfs2_lm_recovery_done(struct gfs2_sbd *sdp, unsigned int jid,
230 unsigned int message)
231{
232 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
233 sdp->sd_lockstruct.ls_ops->lm_recovery_done(sdp->sd_lockstruct.ls_lockspace, jid, message);
234}
235
diff --git a/fs/gfs2/lm.h b/fs/gfs2/lm.h
new file mode 100644
index 000000000000..ec812424fdec
--- /dev/null
+++ b/fs/gfs2/lm.h
@@ -0,0 +1,42 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __LM_DOT_H__
11#define __LM_DOT_H__
12
13int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent);
14void gfs2_lm_others_may_mount(struct gfs2_sbd *sdp);
15void gfs2_lm_unmount(struct gfs2_sbd *sdp);
16int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...)
17__attribute__ ((format(printf, 2, 3)));
18int gfs2_lm_get_lock(struct gfs2_sbd *sdp,
19 struct lm_lockname *name, lm_lock_t **lockp);
20void gfs2_lm_put_lock(struct gfs2_sbd *sdp, lm_lock_t *lock);
21unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, lm_lock_t *lock,
22 unsigned int cur_state, unsigned int req_state,
23 unsigned int flags);
24unsigned int gfs2_lm_unlock(struct gfs2_sbd *sdp, lm_lock_t *lock,
25 unsigned int cur_state);
26void gfs2_lm_cancel(struct gfs2_sbd *sdp, lm_lock_t *lock);
27int gfs2_lm_hold_lvb(struct gfs2_sbd *sdp, lm_lock_t *lock, char **lvbp);
28void gfs2_lm_unhold_lvb(struct gfs2_sbd *sdp, lm_lock_t *lock, char *lvb);
29void gfs2_lm_sync_lvb(struct gfs2_sbd *sdp, lm_lock_t *lock, char *lvb);
30int gfs2_lm_plock_get(struct gfs2_sbd *sdp,
31 struct lm_lockname *name,
32 struct file *file, struct file_lock *fl);
33int gfs2_lm_plock(struct gfs2_sbd *sdp,
34 struct lm_lockname *name,
35 struct file *file, int cmd, struct file_lock *fl);
36int gfs2_lm_punlock(struct gfs2_sbd *sdp,
37 struct lm_lockname *name,
38 struct file *file, struct file_lock *fl);
39void gfs2_lm_recovery_done(struct gfs2_sbd *sdp,
40 unsigned int jid, unsigned int message);
41
42#endif /* __LM_DOT_H__ */
diff --git a/fs/gfs2/lm_interface.h b/fs/gfs2/lm_interface.h
new file mode 100644
index 000000000000..378432f17f27
--- /dev/null
+++ b/fs/gfs2/lm_interface.h
@@ -0,0 +1,295 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __LM_INTERFACE_DOT_H__
11#define __LM_INTERFACE_DOT_H__
12
13/*
14 * Opaque handles represent the lock module's lockspace structure, the lock
15 * module's lock structures, and GFS's file system (superblock) structure.
16 */
17
18typedef void lm_lockspace_t;
19typedef void lm_lock_t;
20typedef void lm_fsdata_t;
21
22typedef void (*lm_callback_t) (lm_fsdata_t *fsdata, unsigned int type,
23 void *data);
24
25/*
26 * lm_mount() flags
27 *
28 * LM_MFLAG_SPECTATOR
29 * GFS is asking to join the filesystem's lockspace, but it doesn't want to
30 * modify the filesystem. The lock module shouldn't assign a journal to the FS
31 * mount. It shouldn't send recovery callbacks to the FS mount. If the node
32 * dies or withdraws, all locks can be wiped immediately.
33 */
34
35#define LM_MFLAG_SPECTATOR 0x00000001
36
37/*
38 * lm_lockstruct flags
39 *
40 * LM_LSFLAG_LOCAL
41 * The lock_nolock module returns LM_LSFLAG_LOCAL to GFS, indicating that GFS
42 * can make single-node optimizations.
43 */
44
45#define LM_LSFLAG_LOCAL 0x00000001
46
47/*
48 * lm_lockname types
49 */
50
51#define LM_TYPE_RESERVED 0x00
52#define LM_TYPE_NONDISK 0x01
53#define LM_TYPE_INODE 0x02
54#define LM_TYPE_RGRP 0x03
55#define LM_TYPE_META 0x04
56#define LM_TYPE_IOPEN 0x05
57#define LM_TYPE_FLOCK 0x06
58#define LM_TYPE_PLOCK 0x07
59#define LM_TYPE_QUOTA 0x08
60#define LM_TYPE_JOURNAL 0x09
61
62/*
63 * lm_lock() states
64 *
65 * SHARED is compatible with SHARED, not with DEFERRED or EX.
66 * DEFERRED is compatible with DEFERRED, not with SHARED or EX.
67 */
68
69#define LM_ST_UNLOCKED 0
70#define LM_ST_EXCLUSIVE 1
71#define LM_ST_DEFERRED 2
72#define LM_ST_SHARED 3
73
74/*
75 * lm_lock() flags
76 *
77 * LM_FLAG_TRY
78 * Don't wait to acquire the lock if it can't be granted immediately.
79 *
80 * LM_FLAG_TRY_1CB
81 * Send one blocking callback if TRY is set and the lock is not granted.
82 *
83 * LM_FLAG_NOEXP
84 * GFS sets this flag on lock requests it makes while doing journal recovery.
85 * These special requests should not be blocked due to the recovery like
86 * ordinary locks would be.
87 *
88 * LM_FLAG_ANY
89 * A SHARED request may also be granted in DEFERRED, or a DEFERRED request may
90 * also be granted in SHARED. The preferred state is whichever is compatible
91 * with other granted locks, or the specified state if no other locks exist.
92 *
93 * LM_FLAG_PRIORITY
94 * Override fairness considerations. Suppose a lock is held in a shared state
95 * and there is a pending request for the deferred state. A shared lock
96 * request with the priority flag would be allowed to bypass the deferred
97 * request and directly join the other shared lock. A shared lock request
98 * without the priority flag might be forced to wait until the deferred
99 * requested had acquired and released the lock.
100 */
101
102#define LM_FLAG_TRY 0x00000001
103#define LM_FLAG_TRY_1CB 0x00000002
104#define LM_FLAG_NOEXP 0x00000004
105#define LM_FLAG_ANY 0x00000008
106#define LM_FLAG_PRIORITY 0x00000010
107
108/*
109 * lm_lock() and lm_async_cb return flags
110 *
111 * LM_OUT_ST_MASK
112 * Masks the lower two bits of lock state in the returned value.
113 *
114 * LM_OUT_CACHEABLE
115 * The lock hasn't been released so GFS can continue to cache data for it.
116 *
117 * LM_OUT_CANCELED
118 * The lock request was canceled.
119 *
120 * LM_OUT_ASYNC
121 * The result of the request will be returned in an LM_CB_ASYNC callback.
122 */
123
124#define LM_OUT_ST_MASK 0x00000003
125#define LM_OUT_CACHEABLE 0x00000004
126#define LM_OUT_CANCELED 0x00000008
127#define LM_OUT_ASYNC 0x00000080
128#define LM_OUT_ERROR 0x00000100
129
130/*
131 * lm_callback_t types
132 *
133 * LM_CB_NEED_E LM_CB_NEED_D LM_CB_NEED_S
134 * Blocking callback, a remote node is requesting the given lock in
135 * EXCLUSIVE, DEFERRED, or SHARED.
136 *
137 * LM_CB_NEED_RECOVERY
138 * The given journal needs to be recovered.
139 *
140 * LM_CB_DROPLOCKS
141 * Reduce the number of cached locks.
142 *
143 * LM_CB_ASYNC
144 * The given lock has been granted.
145 */
146
147#define LM_CB_NEED_E 257
148#define LM_CB_NEED_D 258
149#define LM_CB_NEED_S 259
150#define LM_CB_NEED_RECOVERY 260
151#define LM_CB_DROPLOCKS 261
152#define LM_CB_ASYNC 262
153
154/*
155 * lm_recovery_done() messages
156 */
157
158#define LM_RD_GAVEUP 308
159#define LM_RD_SUCCESS 309
160
161
/* Identifies a lock: a 64-bit number qualified by an LM_TYPE_* namespace. */
struct lm_lockname {
	uint64_t ln_number;
	unsigned int ln_type;	/* LM_TYPE_* */
};
166
/* Two lock names are equal iff both number and type match.
   (Removed a stray trailing '\' that continued the macro onto the
   following line.) */
#define lm_name_equal(name1, name2) \
	(((name1)->ln_number == (name2)->ln_number) && \
	 ((name1)->ln_type == (name2)->ln_type))
170
/* Data passed with an LM_CB_ASYNC callback: the completed lock's name
   and its result code. */
struct lm_async_cb {
	struct lm_lockname lc_name;
	int lc_ret;
};
175
176struct lm_lockstruct;
177
/* Operations vector a lock module registers with GFS.  All ops except
   lm_mount operate on the opaque lockspace/lock handles returned by the
   module itself. */
struct lm_lockops {
	char lm_proto_name[256];	/* unique protocol name, matched at mount */

	/*
	 * Mount/Unmount
	 */

	int (*lm_mount) (char *table_name, char *host_data,
			 lm_callback_t cb, lm_fsdata_t *fsdata,
			 unsigned int min_lvb_size, int flags,
			 struct lm_lockstruct *lockstruct,
			 struct kobject *fskobj);

	void (*lm_others_may_mount) (lm_lockspace_t *lockspace);

	void (*lm_unmount) (lm_lockspace_t *lockspace);

	/* abnormal unmount: this node is withdrawing from the cluster */
	void (*lm_withdraw) (lm_lockspace_t *lockspace);

	/*
	 * Lock oriented operations
	 */

	int (*lm_get_lock) (lm_lockspace_t *lockspace,
			    struct lm_lockname *name, lm_lock_t **lockp);

	void (*lm_put_lock) (lm_lock_t *lock);

	/* returns LM_OUT_* flags, see above */
	unsigned int (*lm_lock) (lm_lock_t *lock, unsigned int cur_state,
				 unsigned int req_state, unsigned int flags);

	unsigned int (*lm_unlock) (lm_lock_t *lock, unsigned int cur_state);

	void (*lm_cancel) (lm_lock_t *lock);

	/* lock value block handling */
	int (*lm_hold_lvb) (lm_lock_t *lock, char **lvbp);
	void (*lm_unhold_lvb) (lm_lock_t *lock, char *lvb);
	void (*lm_sync_lvb) (lm_lock_t *lock, char *lvb);

	/*
	 * Posix Lock oriented operations
	 */

	int (*lm_plock_get) (lm_lockspace_t *lockspace,
			     struct lm_lockname *name,
			     struct file *file, struct file_lock *fl);

	int (*lm_plock) (lm_lockspace_t *lockspace,
			 struct lm_lockname *name,
			 struct file *file, int cmd, struct file_lock *fl);

	int (*lm_punlock) (lm_lockspace_t *lockspace,
			   struct lm_lockname *name,
			   struct file *file, struct file_lock *fl);

	/*
	 * Client oriented operations
	 */

	/* GFS reports the LM_RD_* outcome of a journal recovery */
	void (*lm_recovery_done) (lm_lockspace_t *lockspace, unsigned int jid,
				  unsigned int message);

	struct module *lm_owner;	/* for try_module_get()/module_put() */
};
242
243/*
244 * lm_mount() return values
245 *
246 * ls_jid - the journal ID this node should use
247 * ls_first - this node is the first to mount the file system
248 * ls_lvb_size - size in bytes of lock value blocks
249 * ls_lockspace - lock module's context for this file system
250 * ls_ops - lock module's functions
251 * ls_flags - lock module features
252 */
253
/* Filled in by the lock module's lm_mount(); see the field descriptions
   in the comment above. */
struct lm_lockstruct {
	unsigned int ls_jid;		/* journal ID this node should use */
	unsigned int ls_first;		/* nonzero if first to mount the FS */
	unsigned int ls_lvb_size;	/* size in bytes of lock value blocks */
	lm_lockspace_t *ls_lockspace;	/* module's per-FS context */
	struct lm_lockops *ls_ops;	/* module's operations vector */
	int ls_flags;			/* LM_LSFLAG_* features */
};
262
263void __init gfs2_init_lmh(void);
264
265/*
266 * Lock module bottom interface. A lock module makes itself available to GFS
267 * with these functions.
268 *
269 * For the time being, we copy the gfs1 lock module bottom interface so the
270 * same lock modules can be used with both gfs1 and gfs2 (it won't be possible
271 * to load both gfs1 and gfs2 at once.) Eventually the lock modules will fork
272 * for gfs1/gfs2 and this API can change to the gfs2_ prefix.
273 */
274
275int gfs_register_lockproto(struct lm_lockops *proto);
276
277void gfs_unregister_lockproto(struct lm_lockops *proto);
278
279/*
280 * Lock module top interface. GFS calls these functions when mounting or
281 * unmounting a file system.
282 */
283
284int gfs2_mount_lockproto(char *proto_name, char *table_name, char *host_data,
285 lm_callback_t cb, lm_fsdata_t *fsdata,
286 unsigned int min_lvb_size, int flags,
287 struct lm_lockstruct *lockstruct,
288 struct kobject *fskobj);
289
290void gfs2_unmount_lockproto(struct lm_lockstruct *lockstruct);
291
292void gfs2_withdraw_lockproto(struct lm_lockstruct *lockstruct);
293
294#endif /* __LM_INTERFACE_DOT_H__ */
295
diff --git a/fs/gfs2/locking.c b/fs/gfs2/locking.c
new file mode 100644
index 000000000000..2d2f8fe53999
--- /dev/null
+++ b/fs/gfs2/locking.c
@@ -0,0 +1,192 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/module.h>
11#include <linux/init.h>
12#include <linux/string.h>
13#include <linux/slab.h>
14#include <linux/wait.h>
15#include <linux/sched.h>
16#include <linux/kmod.h>
17#include <linux/fs.h>
18#include <linux/delay.h>
19
20#include "lm_interface.h"
21
22struct lmh_wrapper {
23 struct list_head lw_list;
24 struct lm_lockops *lw_ops;
25};
26
27/* List of registered low-level locking protocols. A file system selects one
28 of them by name at mount time, e.g. lock_nolock, lock_dlm. */
29
30static struct list_head lmh_list;
31static struct semaphore lmh_lock;
32
33/**
34 * gfs_register_lockproto - Register a low-level locking protocol
35 * @proto: the protocol definition
36 *
37 * Returns: 0 on success, -EXXX on failure
38 */
39
40int gfs_register_lockproto(struct lm_lockops *proto)
41{
42 struct lmh_wrapper *lw;
43
44 down(&lmh_lock);
45
46 list_for_each_entry(lw, &lmh_list, lw_list) {
47 if (!strcmp(lw->lw_ops->lm_proto_name, proto->lm_proto_name)) {
48 up(&lmh_lock);
49 printk("GFS2: protocol %s already exists\n",
50 proto->lm_proto_name);
51 return -EEXIST;
52 }
53 }
54
55 lw = kmalloc(sizeof(struct lmh_wrapper), GFP_KERNEL);
56 if (!lw) {
57 up(&lmh_lock);
58 return -ENOMEM;
59 }
60 memset(lw, 0, sizeof(struct lmh_wrapper));
61
62 lw->lw_ops = proto;
63 list_add(&lw->lw_list, &lmh_list);
64
65 up(&lmh_lock);
66
67 return 0;
68}
69
70/**
71 * gfs_unregister_lockproto - Unregister a low-level locking protocol
72 * @proto: the protocol definition
73 *
74 */
75
76void gfs_unregister_lockproto(struct lm_lockops *proto)
77{
78 struct lmh_wrapper *lw;
79
80 down(&lmh_lock);
81
82 list_for_each_entry(lw, &lmh_list, lw_list) {
83 if (!strcmp(lw->lw_ops->lm_proto_name, proto->lm_proto_name)) {
84 list_del(&lw->lw_list);
85 up(&lmh_lock);
86 kfree(lw);
87 return;
88 }
89 }
90
91 up(&lmh_lock);
92
93 printk("GFS2: can't unregister lock protocol %s\n",
94 proto->lm_proto_name);
95}
96
/**
 * gfs2_mount_lockproto - Mount a lock protocol
 * @proto_name - the name of the protocol
 * @table_name - the name of the lock space
 * @host_data - data specific to this host
 * @cb - the callback to the code using the lock module
 * @fsdata - data to pass back with the callback
 * @min_lvb_size - the minimum LVB size that the caller can deal with
 * @flags - LM_MFLAG_*
 * @lockstruct - a structure returned describing the mount
 * @fskobj - sysfs kobject for the FS, handed to the module's lm_mount
 *
 * Returns: 0 on success, -EXXX on failure
 */

int gfs2_mount_lockproto(char *proto_name, char *table_name, char *host_data,
			 lm_callback_t cb, lm_fsdata_t *fsdata,
			 unsigned int min_lvb_size, int flags,
			 struct lm_lockstruct *lockstruct,
			 struct kobject *fskobj)
{
	struct lmh_wrapper *lw = NULL;
	int try = 0;
	int error, found;

retry:
	down(&lmh_lock);

	found = 0;
	list_for_each_entry(lw, &lmh_list, lw_list) {
		if (!strcmp(lw->lw_ops->lm_proto_name, proto_name)) {
			found = 1;
			break;
		}
	}

	if (!found) {
		/* Not registered yet: try (at most once) to load a module
		   with the protocol's name, then search again. */
		if (!try && capable(CAP_SYS_MODULE)) {
			try = 1;
			up(&lmh_lock);
			request_module(proto_name);
			goto retry;
		}
		printk("GFS2: can't find protocol %s\n", proto_name);
		error = -ENOENT;
		goto out;
	}

	/* NOTE(review): presumably the module is mid-unload here; back off
	   and restart the whole lookup (try reset so request_module may
	   run again). */
	if (!try_module_get(lw->lw_ops->lm_owner)) {
		try = 0;
		up(&lmh_lock);
		msleep(1000);
		goto retry;
	}

	error = lw->lw_ops->lm_mount(table_name, host_data, cb, fsdata,
				     min_lvb_size, flags, lockstruct, fskobj);
	if (error)
		module_put(lw->lw_ops->lm_owner);
out:
	up(&lmh_lock);
	return error;
}
159
160void gfs2_unmount_lockproto(struct lm_lockstruct *lockstruct)
161{
162 down(&lmh_lock);
163 lockstruct->ls_ops->lm_unmount(lockstruct->ls_lockspace);
164 if (lockstruct->ls_ops->lm_owner)
165 module_put(lockstruct->ls_ops->lm_owner);
166 up(&lmh_lock);
167}
168
169/**
170 * gfs2_withdraw_lockproto - abnormally unmount a lock module
171 * @lockstruct: the lockstruct passed into mount
172 *
173 */
174
175void gfs2_withdraw_lockproto(struct lm_lockstruct *lockstruct)
176{
177 down(&lmh_lock);
178 lockstruct->ls_ops->lm_withdraw(lockstruct->ls_lockspace);
179 if (lockstruct->ls_ops->lm_owner)
180 module_put(lockstruct->ls_ops->lm_owner);
181 up(&lmh_lock);
182}
183
184void __init gfs2_init_lmh(void)
185{
186 init_MUTEX(&lmh_lock);
187 INIT_LIST_HEAD(&lmh_list);
188}
189
190EXPORT_SYMBOL_GPL(gfs_register_lockproto);
191EXPORT_SYMBOL_GPL(gfs_unregister_lockproto);
192
diff --git a/fs/gfs2/locking/dlm/Makefile b/fs/gfs2/locking/dlm/Makefile
new file mode 100644
index 000000000000..a9733ff80371
--- /dev/null
+++ b/fs/gfs2/locking/dlm/Makefile
@@ -0,0 +1,3 @@
1obj-$(CONFIG_GFS2_FS) += lock_dlm.o
2lock_dlm-y := lock.o main.o mount.o sysfs.o thread.o plock.o
3
diff --git a/fs/gfs2/locking/dlm/lock.c b/fs/gfs2/locking/dlm/lock.c
new file mode 100644
index 000000000000..d799865b64a4
--- /dev/null
+++ b/fs/gfs2/locking/dlm/lock.c
@@ -0,0 +1,537 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include "lock_dlm.h"
11
12static char junk_lvb[GDLM_LVB_SIZE];
13
/* Move a lock whose DLM operation finished onto the lockspace's
   completion list and wake the thread that delivers results to GFS. */
static void queue_complete(struct gdlm_lock *lp)
{
	struct gdlm_ls *ls = lp->ls;

	clear_bit(LFL_ACTIVE, &lp->flags);

	spin_lock(&ls->async_lock);
	list_add_tail(&lp->clist, &ls->complete);
	spin_unlock(&ls->async_lock);
	wake_up(&ls->thread_wait);
}
25
/* DLM completion (AST) callback; astarg is the gdlm_lock. */
static inline void gdlm_ast(void *astarg)
{
	struct gdlm_lock *lp = astarg;

	queue_complete(lp);
}
30
31static inline void gdlm_bast(void *astarg, int mode)
32{
33 struct gdlm_lock *lp = astarg;
34 struct gdlm_ls *ls = lp->ls;
35
36 if (!mode) {
37 printk("lock_dlm: bast mode zero %x,%llx\n",
38 lp->lockname.ln_type, lp->lockname.ln_number);
39 return;
40 }
41
42 spin_lock(&ls->async_lock);
43 if (!lp->bast_mode) {
44 list_add_tail(&lp->blist, &ls->blocking);
45 lp->bast_mode = mode;
46 } else if (lp->bast_mode < mode)
47 lp->bast_mode = mode;
48 spin_unlock(&ls->async_lock);
49 wake_up(&ls->thread_wait);
50}
51
/* Park a lock request on the delayed list; requests queued here are
   submitted once recovery completes (see the DFL_BLOCK_LOCKS check in
   gdlm_do_lock()). */
void gdlm_queue_delayed(struct gdlm_lock *lp)
{
	struct gdlm_ls *ls = lp->ls;

	spin_lock(&ls->async_lock);
	list_add_tail(&lp->delay_list, &ls->delayed);
	spin_unlock(&ls->async_lock);
}
60
/* convert gfs lock-state to dlm lock-mode */

static int16_t make_mode(int16_t lmstate)
{
	switch (lmstate) {
	case LM_ST_UNLOCKED:
		return DLM_LOCK_NL;	/* "unlocked" maps to null, not no-lock */
	case LM_ST_EXCLUSIVE:
		return DLM_LOCK_EX;
	case LM_ST_DEFERRED:
		return DLM_LOCK_CW;	/* concurrent write */
	case LM_ST_SHARED:
		return DLM_LOCK_PR;	/* protected read */
	}
	/* Any other value is a caller bug. */
	gdlm_assert(0, "unknown LM state %d", lmstate);
	return -1;
}
78
/* convert dlm lock-mode to gfs lock-state */

int16_t gdlm_make_lmstate(int16_t dlmmode)
{
	switch (dlmmode) {
	case DLM_LOCK_IV:	/* invalid (no lock) and null both count */
	case DLM_LOCK_NL:	/* as unlocked from GFS's point of view */
		return LM_ST_UNLOCKED;
	case DLM_LOCK_EX:
		return LM_ST_EXCLUSIVE;
	case DLM_LOCK_CW:
		return LM_ST_DEFERRED;
	case DLM_LOCK_PR:
		return LM_ST_SHARED;
	}
	gdlm_assert(0, "unknown DLM mode %d", dlmmode);
	return -1;
}
97
/* verify agreement with GFS on the current lock state, NB: DLM_LOCK_NL and
   DLM_LOCK_IV are both considered LM_ST_UNLOCKED by GFS. */

static void check_cur_state(struct gdlm_lock *lp, unsigned int cur_state)
{
	int16_t cur = make_mode(cur_state);
	/* IV means no DLM lock exists yet, so there is nothing to compare. */
	if (lp->cur != DLM_LOCK_IV)
		gdlm_assert(lp->cur == cur, "%d, %d", lp->cur, cur);
}
107
/* Translate GFS LM_FLAG_* request flags (plus per-lock state) into the
   DLM_LKF_* flags passed to dlm_lock(). */
static inline unsigned int make_flags(struct gdlm_lock *lp,
				      unsigned int gfs_flags,
				      int16_t cur, int16_t req)
{
	unsigned int lkf = 0;

	if (gfs_flags & LM_FLAG_TRY)
		lkf |= DLM_LKF_NOQUEUE;

	if (gfs_flags & LM_FLAG_TRY_1CB) {
		lkf |= DLM_LKF_NOQUEUE;
		lkf |= DLM_LKF_NOQUEUEBAST;
	}

	if (gfs_flags & LM_FLAG_PRIORITY) {
		lkf |= DLM_LKF_NOORDER;
		lkf |= DLM_LKF_HEADQUE;
	}

	/* LM_FLAG_ANY: let the DLM grant the alternate compatible mode. */
	if (gfs_flags & LM_FLAG_ANY) {
		if (req == DLM_LOCK_PR)
			lkf |= DLM_LKF_ALTCW;
		else if (req == DLM_LOCK_CW)
			lkf |= DLM_LKF_ALTPR;
	}

	/* A nonzero lock id means a DLM lock already exists, so this
	   request is a conversion. */
	if (lp->lksb.sb_lkid != 0) {
		lkf |= DLM_LKF_CONVERT;

		/* Conversion deadlock avoidance by DLM */

		if (!test_bit(LFL_FORCE_PROMOTE, &lp->flags) &&
		    !(lkf & DLM_LKF_NOQUEUE) &&
		    cur > DLM_LOCK_NL && req > DLM_LOCK_NL && cur != req)
			lkf |= DLM_LKF_CONVDEADLK;
	}

	if (lp->lvb)
		lkf |= DLM_LKF_VALBLK;

	return lkf;
}
150
151/* make_strname - convert GFS lock numbers to a string */
152
153static inline void make_strname(struct lm_lockname *lockname,
154 struct gdlm_strname *str)
155{
156 sprintf(str->name, "%8x%16llx", lockname->ln_type,
157 lockname->ln_number);
158 str->namelen = GDLM_STRNAME_BYTES;
159}
160
/* Allocate and initialize a gdlm_lock for @name and link it onto the
   lockspace's all_locks list.  On success sets *lpp and returns 0;
   returns -ENOMEM (leaving *lpp untouched) on allocation failure. */
int gdlm_create_lp(struct gdlm_ls *ls, struct lm_lockname *name,
		   struct gdlm_lock **lpp)
{
	struct gdlm_lock *lp;

	lp = kzalloc(sizeof(struct gdlm_lock), GFP_KERNEL);
	if (!lp)
		return -ENOMEM;

	lp->lockname = *name;
	lp->ls = ls;
	lp->cur = DLM_LOCK_IV;	/* no DLM lock exists yet */
	lp->lvb = NULL;
	lp->hold_null = NULL;
	init_completion(&lp->ast_wait);
	/* Self-pointing heads let gdlm_delete_lp() use list_empty() to
	   tell which queues the lock is currently on. */
	INIT_LIST_HEAD(&lp->clist);
	INIT_LIST_HEAD(&lp->blist);
	INIT_LIST_HEAD(&lp->delay_list);

	spin_lock(&ls->async_lock);
	list_add(&lp->all_list, &ls->all_locks);
	ls->all_locks_count++;
	spin_unlock(&ls->async_lock);

	*lpp = lp;
	return 0;
}
188
/* Unlink @lp from any pending work queues and from the lockspace's
   all_locks list, then free it.  The lock must still be on all_locks. */
void gdlm_delete_lp(struct gdlm_lock *lp)
{
	struct gdlm_ls *ls = lp->ls;

	spin_lock(&ls->async_lock);
	if (!list_empty(&lp->clist))
		list_del_init(&lp->clist);
	if (!list_empty(&lp->blist))
		list_del_init(&lp->blist);
	if (!list_empty(&lp->delay_list))
		list_del_init(&lp->delay_list);
	gdlm_assert(!list_empty(&lp->all_list),
		    "%x,%llx", lp->lockname.ln_type, lp->lockname.ln_number);
	list_del_init(&lp->all_list);
	ls->all_locks_count--;
	spin_unlock(&ls->async_lock);

	kfree(lp);
}
208
209int gdlm_get_lock(lm_lockspace_t *lockspace, struct lm_lockname *name,
210 lm_lock_t **lockp)
211{
212 struct gdlm_lock *lp;
213 int error;
214
215 error = gdlm_create_lp((struct gdlm_ls *) lockspace, name, &lp);
216
217 *lockp = (lm_lock_t *) lp;
218 return error;
219}
220
221void gdlm_put_lock(lm_lock_t *lock)
222{
223 gdlm_delete_lp((struct gdlm_lock *) lock);
224}
225
/* Submit a lock/conversion request to the DLM.  Returns LM_OUT_ASYNC
   (the result arrives later via gdlm_ast -> queue_complete) or
   LM_OUT_ERROR if dlm_lock() failed outright. */
unsigned int gdlm_do_lock(struct gdlm_lock *lp, struct dlm_range *range)
{
	struct gdlm_ls *ls = lp->ls;
	struct gdlm_strname str;
	int error, bast = 1;

	/*
	 * When recovery is in progress, delay lock requests for submission
	 * once recovery is done.  Requests for recovery (NOEXP) and unlocks
	 * can pass.
	 */

	if (test_bit(DFL_BLOCK_LOCKS, &ls->flags) &&
	    !test_bit(LFL_NOBLOCK, &lp->flags) && lp->req != DLM_LOCK_NL) {
		gdlm_queue_delayed(lp);
		return LM_OUT_ASYNC;
	}

	/*
	 * Submit the actual lock request.
	 */

	if (test_bit(LFL_NOBAST, &lp->flags))
		bast = 0;

	make_strname(&lp->lockname, &str);

	set_bit(LFL_ACTIVE, &lp->flags);

	log_debug("lk %x,%llx id %x %d,%d %x", lp->lockname.ln_type,
		  lp->lockname.ln_number, lp->lksb.sb_lkid,
		  lp->cur, lp->req, lp->lkf);

	error = dlm_lock(ls->dlm_lockspace, lp->req, &lp->lksb, lp->lkf,
			 str.name, str.namelen, 0, gdlm_ast, (void *) lp,
			 bast ? gdlm_bast : NULL, range);

	/* With NOQUEUE, -EAGAIN just means "not grantable right now":
	   fabricate a completion with that status instead of failing. */
	if ((error == -EAGAIN) && (lp->lkf & DLM_LKF_NOQUEUE)) {
		lp->lksb.sb_status = -EAGAIN;
		queue_complete(lp);
		error = 0;
	}

	if (error) {
		log_debug("%s: gdlm_lock %x,%llx err=%d cur=%d req=%d lkf=%x "
			  "flags=%lx", ls->fsname, lp->lockname.ln_type,
			  lp->lockname.ln_number, error, lp->cur, lp->req,
			  lp->lkf, lp->flags);
		return LM_OUT_ERROR;
	}
	return LM_OUT_ASYNC;
}
278
279unsigned int gdlm_do_unlock(struct gdlm_lock *lp)
280{
281 struct gdlm_ls *ls = lp->ls;
282 unsigned int lkf = 0;
283 int error;
284
285 set_bit(LFL_DLM_UNLOCK, &lp->flags);
286 set_bit(LFL_ACTIVE, &lp->flags);
287
288 if (lp->lvb)
289 lkf = DLM_LKF_VALBLK;
290
291 log_debug("un %x,%llx %x %d %x", lp->lockname.ln_type,
292 lp->lockname.ln_number, lp->lksb.sb_lkid, lp->cur, lkf);
293
294 error = dlm_unlock(ls->dlm_lockspace, lp->lksb.sb_lkid, lkf, NULL, lp);
295
296 if (error) {
297 log_debug("%s: gdlm_unlock %x,%llx err=%d cur=%d req=%d lkf=%x "
298 "flags=%lx", ls->fsname, lp->lockname.ln_type,
299 lp->lockname.ln_number, error, lp->cur, lp->req,
300 lp->lkf, lp->flags);
301 return LM_OUT_ERROR;
302 }
303 return LM_OUT_ASYNC;
304}
305
306unsigned int gdlm_lock(lm_lock_t *lock, unsigned int cur_state,
307 unsigned int req_state, unsigned int flags)
308{
309 struct gdlm_lock *lp = (struct gdlm_lock *) lock;
310
311 clear_bit(LFL_DLM_CANCEL, &lp->flags);
312 if (flags & LM_FLAG_NOEXP)
313 set_bit(LFL_NOBLOCK, &lp->flags);
314
315 check_cur_state(lp, cur_state);
316 lp->req = make_mode(req_state);
317 lp->lkf = make_flags(lp, flags, lp->cur, lp->req);
318
319 return gdlm_do_lock(lp, NULL);
320}
321
322unsigned int gdlm_unlock(lm_lock_t *lock, unsigned int cur_state)
323{
324 struct gdlm_lock *lp = (struct gdlm_lock *) lock;
325
326 clear_bit(LFL_DLM_CANCEL, &lp->flags);
327 if (lp->cur == DLM_LOCK_IV)
328 return 0;
329 return gdlm_do_unlock(lp);
330}
331
/* lm_lockops cancel: try to cancel an in-progress lock request.  Two
   cases: the request is still queued on our local delayed list (recovery
   blocked it), or it has been handed to the dlm and is blocked there. */
void gdlm_cancel(lm_lock_t *lock)
{
	struct gdlm_lock *lp = (struct gdlm_lock *) lock;
	struct gdlm_ls *ls = lp->ls;
	int error, delay_list = 0;

	/* a dlm-level cancel for this lock is already outstanding */
	if (test_bit(LFL_DLM_CANCEL, &lp->flags))
		return;

	log_info("gdlm_cancel %x,%llx flags %lx",
		 lp->lockname.ln_type, lp->lockname.ln_number, lp->flags);

	/* case 1: the request never reached the dlm; pull it off the
	   delayed queue and complete it locally as canceled */
	spin_lock(&ls->async_lock);
	if (!list_empty(&lp->delay_list)) {
		list_del_init(&lp->delay_list);
		delay_list = 1;
	}
	spin_unlock(&ls->async_lock);

	if (delay_list) {
		set_bit(LFL_CANCEL, &lp->flags);
		set_bit(LFL_ACTIVE, &lp->flags);
		queue_complete(lp);
		return;
	}

	/* nothing to cancel: no request is active in the dlm, or an
	   unlock is already in flight for this lock */
	if (!test_bit(LFL_ACTIVE, &lp->flags) ||
	    test_bit(LFL_DLM_UNLOCK, &lp->flags)) {
		log_info("gdlm_cancel skip %x,%llx flags %lx",
			 lp->lockname.ln_type, lp->lockname.ln_number,
			 lp->flags);
		return;
	}

	/* the lock is blocked in the dlm */

	/* case 2: ask the dlm to cancel the blocked request */
	set_bit(LFL_DLM_CANCEL, &lp->flags);
	set_bit(LFL_ACTIVE, &lp->flags);

	error = dlm_unlock(ls->dlm_lockspace, lp->lksb.sb_lkid, DLM_LKF_CANCEL,
			   NULL, lp);

	log_info("gdlm_cancel rv %d %x,%llx flags %lx", error,
		 lp->lockname.ln_type, lp->lockname.ln_number, lp->flags);

	/* -EBUSY: the cancel could not be queued; drop the flag so a
	   later cancel attempt is possible */
	if (error == -EBUSY)
		clear_bit(LFL_DLM_CANCEL, &lp->flags);
}
380
381int gdlm_add_lvb(struct gdlm_lock *lp)
382{
383 char *lvb;
384
385 lvb = kzalloc(GDLM_LVB_SIZE, GFP_KERNEL);
386 if (!lvb)
387 return -ENOMEM;
388
389 lp->lksb.sb_lvbptr = lvb;
390 lp->lvb = lvb;
391 return 0;
392}
393
394void gdlm_del_lvb(struct gdlm_lock *lp)
395{
396 kfree(lp->lvb);
397 lp->lvb = NULL;
398 lp->lksb.sb_lvbptr = NULL;
399}
400
401/* This can do a synchronous dlm request (requiring a lock_dlm thread to get
402 the completion) because gfs won't call hold_lvb() during a callback (from
403 the context of a lock_dlm thread). */
404
405static int hold_null_lock(struct gdlm_lock *lp)
406{
407 struct gdlm_lock *lpn = NULL;
408 int error;
409
410 if (lp->hold_null) {
411 printk("lock_dlm: lvb already held\n");
412 return 0;
413 }
414
415 error = gdlm_create_lp(lp->ls, &lp->lockname, &lpn);
416 if (error)
417 goto out;
418
419 lpn->lksb.sb_lvbptr = junk_lvb;
420 lpn->lvb = junk_lvb;
421
422 lpn->req = DLM_LOCK_NL;
423 lpn->lkf = DLM_LKF_VALBLK | DLM_LKF_EXPEDITE;
424 set_bit(LFL_NOBAST, &lpn->flags);
425 set_bit(LFL_INLOCK, &lpn->flags);
426
427 init_completion(&lpn->ast_wait);
428 gdlm_do_lock(lpn, NULL);
429 wait_for_completion(&lpn->ast_wait);
430 error = lp->lksb.sb_status;
431 if (error) {
432 printk("lock_dlm: hold_null_lock dlm error %d\n", error);
433 gdlm_delete_lp(lpn);
434 lpn = NULL;
435 }
436 out:
437 lp->hold_null = lpn;
438 return error;
439}
440
/* This cannot do a synchronous dlm request (requiring a lock_dlm thread to get
   the completion) because gfs may call unhold_lvb() during a callback (from
   the context of a lock_dlm thread) which could cause a deadlock since the
   other lock_dlm thread could be engaged in recovery. */

static void unhold_null_lock(struct gdlm_lock *lp)
{
	struct gdlm_lock *lpn = lp->hold_null;

	gdlm_assert(lpn, "%x,%llx",
		    lp->lockname.ln_type, lp->lockname.ln_number);
	/* detach the shared junk_lvb before unlocking so it can't be
	   treated as a private lvb (and freed) later */
	lpn->lksb.sb_lvbptr = NULL;
	lpn->lvb = NULL;
	/* NOTE(review): LFL_UNLOCK_DELETE presumably makes the async
	   unlock completion free lpn — confirm against thread.c */
	set_bit(LFL_UNLOCK_DELETE, &lpn->flags);
	gdlm_do_unlock(lpn);
	lp->hold_null = NULL;
}
458
459/* Acquire a NL lock because gfs requires the value block to remain
460 intact on the resource while the lvb is "held" even if it's holding no locks
461 on the resource. */
462
463int gdlm_hold_lvb(lm_lock_t *lock, char **lvbp)
464{
465 struct gdlm_lock *lp = (struct gdlm_lock *) lock;
466 int error;
467
468 error = gdlm_add_lvb(lp);
469 if (error)
470 return error;
471
472 *lvbp = lp->lvb;
473
474 error = hold_null_lock(lp);
475 if (error)
476 gdlm_del_lvb(lp);
477
478 return error;
479}
480
/* lm_lockops unhold_lvb: drop the NL hold lock, then free the lvb.
   Order matters: the NL lock references the resource the lvb lives on. */
void gdlm_unhold_lvb(lm_lock_t *lock, char *lvb)
{
	struct gdlm_lock *lp = (struct gdlm_lock *) lock;

	unhold_null_lock(lp);
	gdlm_del_lvb(lp);
}
488
/* lm_lockops sync_lvb: push the lvb contents to the dlm by resubmitting
   the lock EX->EX with LFL_SYNC_LVB set.  Only meaningful while the lock
   is held exclusively.  Synchronous: waits for the ast to complete. */
void gdlm_sync_lvb(lm_lock_t *lock, char *lvb)
{
	struct gdlm_lock *lp = (struct gdlm_lock *) lock;

	if (lp->cur != DLM_LOCK_EX)
		return;

	init_completion(&lp->ast_wait);
	set_bit(LFL_SYNC_LVB, &lp->flags);

	lp->req = DLM_LOCK_EX;
	lp->lkf = make_flags(lp, 0, lp->cur, lp->req);

	gdlm_do_lock(lp, NULL);
	wait_for_completion(&lp->ast_wait);
}
505
506void gdlm_submit_delayed(struct gdlm_ls *ls)
507{
508 struct gdlm_lock *lp, *safe;
509
510 spin_lock(&ls->async_lock);
511 list_for_each_entry_safe(lp, safe, &ls->delayed, delay_list) {
512 list_del_init(&lp->delay_list);
513 list_add_tail(&lp->delay_list, &ls->submit);
514 }
515 spin_unlock(&ls->async_lock);
516 wake_up(&ls->thread_wait);
517}
518
/* Free every lock still on the lockspace's all_locks list; used at
   unmount/withdraw after the dlm lockspace has been released.  Returns
   the number of stray locks freed (normally 0). */
int gdlm_release_all_locks(struct gdlm_ls *ls)
{
	struct gdlm_lock *lp, *safe;
	int count = 0;

	spin_lock(&ls->async_lock);
	list_for_each_entry_safe(lp, safe, &ls->all_locks, all_list) {
		list_del_init(&lp->all_list);

		/* junk_lvb is a shared static buffer (used by the NL
		   hold_null locks) and must not be freed here */
		if (lp->lvb && lp->lvb != junk_lvb)
			kfree(lp->lvb);
		kfree(lp);
		count++;
	}
	spin_unlock(&ls->async_lock);

	return count;
}
537
diff --git a/fs/gfs2/locking/dlm/lock_dlm.h b/fs/gfs2/locking/dlm/lock_dlm.h
new file mode 100644
index 000000000000..fa545f7872e8
--- /dev/null
+++ b/fs/gfs2/locking/dlm/lock_dlm.h
@@ -0,0 +1,191 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef LOCK_DLM_DOT_H
11#define LOCK_DLM_DOT_H
12
13#include <linux/module.h>
14#include <linux/slab.h>
15#include <linux/spinlock.h>
16#include <linux/module.h>
17#include <linux/types.h>
18#include <linux/string.h>
19#include <linux/list.h>
20#include <linux/socket.h>
21#include <linux/delay.h>
22#include <linux/kthread.h>
23#include <linux/kobject.h>
24#include <linux/fcntl.h>
25#include <linux/wait.h>
26#include <net/sock.h>
27
28#include <linux/dlm.h>
29#include "../../lm_interface.h"
30
31/*
32 * Internally, we prefix things with gdlm_ and GDLM_ (for gfs-dlm) since a
33 * prefix of lock_dlm_ gets awkward. Externally, GFS refers to this module
34 * as "lock_dlm".
35 */
36
37#define GDLM_STRNAME_BYTES 24
38#define GDLM_LVB_SIZE 32
39#define GDLM_DROP_COUNT 50000
40#define GDLM_DROP_PERIOD 60
41#define GDLM_NAME_LEN 128
42
43/* GFS uses 12 bytes to identify a resource (32 bit type + 64 bit number).
44 We sprintf these numbers into a 24 byte string of hex values to make them
45 human-readable (to make debugging simpler.) */
46
47struct gdlm_strname {
48 unsigned char name[GDLM_STRNAME_BYTES];
49 unsigned short namelen;
50};
51
52enum {
53 DFL_BLOCK_LOCKS = 0,
54 DFL_SPECTATOR = 1,
55 DFL_WITHDRAW = 2,
56};
57
/* Per-mount lockspace state: one per gfs filesystem using lock_dlm. */
struct gdlm_ls {
	uint32_t id;			/* filesystem id from mount args */
	int jid;			/* journal id; -1 until set via mount args */
	int first;			/* "first mounter" flag from mount args */
	int first_done;			/* set by gdlm_others_may_mount */
	unsigned long flags;		/* DFL_ bits */
	struct kobject kobj;		/* sysfs "lock_module" directory */
	char clustername[GDLM_NAME_LEN];
	char fsname[GDLM_NAME_LEN];
	int fsflags;
	dlm_lockspace_t *dlm_lockspace;	/* handle from dlm_new_lockspace */
	lm_callback_t fscb;		/* callback into gfs */
	lm_fsdata_t *fsdata;		/* opaque gfs argument for fscb */
	int recover_jid;		/* journal needing recovery (via sysfs) */
	int recover_jid_done;		/* last journal reported recovered */
	spinlock_t async_lock;		/* protects the lists below */
	struct list_head complete;
	struct list_head blocking;
	struct list_head delayed;
	struct list_head submit;
	struct list_head all_locks;
	uint32_t all_locks_count;
	wait_queue_head_t wait_control;	/* withdraw handshake with userspace */
	struct task_struct *thread1;
	struct task_struct *thread2;
	wait_queue_head_t thread_wait;
	unsigned long drop_time;
	int drop_locks_count;
	int drop_locks_period;
};
88
89enum {
90 LFL_NOBLOCK = 0,
91 LFL_NOCACHE = 1,
92 LFL_DLM_UNLOCK = 2,
93 LFL_DLM_CANCEL = 3,
94 LFL_SYNC_LVB = 4,
95 LFL_FORCE_PROMOTE = 5,
96 LFL_REREQUEST = 6,
97 LFL_ACTIVE = 7,
98 LFL_INLOCK = 8,
99 LFL_CANCEL = 9,
100 LFL_NOBAST = 10,
101 LFL_HEADQUE = 11,
102 LFL_UNLOCK_DELETE = 12,
103};
104
/* One gfs lock as tracked by lock_dlm: wraps a dlm lksb plus the
   queueing state used by the lock_dlm threads. */
struct gdlm_lock {
	struct gdlm_ls *ls;		/* owning lockspace */
	struct lm_lockname lockname;	/* gfs name: type + number */
	char *lvb;			/* lock value block, or NULL */
	struct dlm_lksb lksb;		/* dlm status block (lkid, status, lvb) */

	int16_t cur;			/* current dlm mode */
	int16_t req;			/* requested dlm mode */
	int16_t prev_req;		/* previously requested mode */
	uint32_t lkf;			/* dlm flags DLM_LKF_ */
	unsigned long flags;		/* lock_dlm flags LFL_ */

	int bast_mode;			/* protected by async_lock */
	struct completion ast_wait;	/* for synchronous requests */

	struct list_head clist;		/* complete */
	struct list_head blist;		/* blocking */
	struct list_head delay_list;	/* delayed */
	struct list_head all_list;	/* all locks for the fs */
	struct gdlm_lock *hold_null;	/* NL lock for hold_lvb */
};
126
127#define gdlm_assert(assertion, fmt, args...) \
128do { \
129 if (unlikely(!(assertion))) { \
130 printk(KERN_EMERG "lock_dlm: fatal assertion failed \"%s\"\n" \
131 "lock_dlm: " fmt "\n", \
132 #assertion, ##args); \
133 BUG(); \
134 } \
135} while (0)
136
137#define log_print(lev, fmt, arg...) printk(lev "lock_dlm: " fmt "\n" , ## arg)
138#define log_info(fmt, arg...) log_print(KERN_INFO , fmt , ## arg)
139#define log_error(fmt, arg...) log_print(KERN_ERR , fmt , ## arg)
140#ifdef LOCK_DLM_LOG_DEBUG
141#define log_debug(fmt, arg...) log_print(KERN_DEBUG , fmt , ## arg)
142#else
143#define log_debug(fmt, arg...)
144#endif
145
146/* sysfs.c */
147
148int gdlm_sysfs_init(void);
149void gdlm_sysfs_exit(void);
150int gdlm_kobject_setup(struct gdlm_ls *, struct kobject *);
151void gdlm_kobject_release(struct gdlm_ls *);
152
153/* thread.c */
154
155int gdlm_init_threads(struct gdlm_ls *);
156void gdlm_release_threads(struct gdlm_ls *);
157
158/* lock.c */
159
160int16_t gdlm_make_lmstate(int16_t);
161void gdlm_queue_delayed(struct gdlm_lock *);
162void gdlm_submit_delayed(struct gdlm_ls *);
163int gdlm_release_all_locks(struct gdlm_ls *);
164int gdlm_create_lp(struct gdlm_ls *, struct lm_lockname *, struct gdlm_lock **);
165void gdlm_delete_lp(struct gdlm_lock *);
166int gdlm_add_lvb(struct gdlm_lock *);
167void gdlm_del_lvb(struct gdlm_lock *);
168unsigned int gdlm_do_lock(struct gdlm_lock *, struct dlm_range *);
169unsigned int gdlm_do_unlock(struct gdlm_lock *);
170
171int gdlm_get_lock(lm_lockspace_t *, struct lm_lockname *, lm_lock_t **);
172void gdlm_put_lock(lm_lock_t *);
173unsigned int gdlm_lock(lm_lock_t *, unsigned int, unsigned int, unsigned int);
174unsigned int gdlm_unlock(lm_lock_t *, unsigned int);
175void gdlm_cancel(lm_lock_t *);
176int gdlm_hold_lvb(lm_lock_t *, char **);
177void gdlm_unhold_lvb(lm_lock_t *, char *);
178void gdlm_sync_lvb(lm_lock_t *, char *);
179
180/* plock.c */
181
182int gdlm_plock_init(void);
183void gdlm_plock_exit(void);
184int gdlm_plock(lm_lockspace_t *, struct lm_lockname *, struct file *, int,
185 struct file_lock *);
186int gdlm_plock_get(lm_lockspace_t *, struct lm_lockname *, struct file *,
187 struct file_lock *);
188int gdlm_punlock(lm_lockspace_t *, struct lm_lockname *, struct file *,
189 struct file_lock *);
190#endif
191
diff --git a/fs/gfs2/locking/dlm/main.c b/fs/gfs2/locking/dlm/main.c
new file mode 100644
index 000000000000..2c13c916a352
--- /dev/null
+++ b/fs/gfs2/locking/dlm/main.c
@@ -0,0 +1,62 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/init.h>
11
12#include "lock_dlm.h"
13
14extern int gdlm_drop_count;
15extern int gdlm_drop_period;
16
17extern struct lm_lockops gdlm_ops;
18
19int __init init_lock_dlm(void)
20{
21 int error;
22
23 error = gfs_register_lockproto(&gdlm_ops);
24 if (error) {
25 printk("lock_dlm: can't register protocol: %d\n", error);
26 return error;
27 }
28
29 error = gdlm_sysfs_init();
30 if (error) {
31 gfs_unregister_lockproto(&gdlm_ops);
32 return error;
33 }
34
35 error = gdlm_plock_init();
36 if (error) {
37 gdlm_sysfs_exit();
38 gfs_unregister_lockproto(&gdlm_ops);
39 return error;
40 }
41
42 gdlm_drop_count = GDLM_DROP_COUNT;
43 gdlm_drop_period = GDLM_DROP_PERIOD;
44
45 printk("Lock_DLM (built %s %s) installed\n", __DATE__, __TIME__);
46 return 0;
47}
48
/* Module exit: tear down in reverse order of init_lock_dlm. */
void __exit exit_lock_dlm(void)
{
	gdlm_plock_exit();
	gdlm_sysfs_exit();
	gfs_unregister_lockproto(&gdlm_ops);
}
55
56module_init(init_lock_dlm);
57module_exit(exit_lock_dlm);
58
59MODULE_DESCRIPTION("GFS DLM Locking Module");
60MODULE_AUTHOR("Red Hat, Inc.");
61MODULE_LICENSE("GPL");
62
diff --git a/fs/gfs2/locking/dlm/mount.c b/fs/gfs2/locking/dlm/mount.c
new file mode 100644
index 000000000000..042f3a75c441
--- /dev/null
+++ b/fs/gfs2/locking/dlm/mount.c
@@ -0,0 +1,247 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include "lock_dlm.h"
11
12int gdlm_drop_count;
13int gdlm_drop_period;
14struct lm_lockops gdlm_ops;
15
16
17static struct gdlm_ls *init_gdlm(lm_callback_t cb, lm_fsdata_t *fsdata,
18 int flags, char *table_name)
19{
20 struct gdlm_ls *ls;
21 char buf[256], *p;
22
23 ls = kzalloc(sizeof(struct gdlm_ls), GFP_KERNEL);
24 if (!ls)
25 return NULL;
26
27 ls->drop_locks_count = gdlm_drop_count;
28 ls->drop_locks_period = gdlm_drop_period;
29 ls->fscb = cb;
30 ls->fsdata = fsdata;
31 ls->fsflags = flags;
32 spin_lock_init(&ls->async_lock);
33 INIT_LIST_HEAD(&ls->complete);
34 INIT_LIST_HEAD(&ls->blocking);
35 INIT_LIST_HEAD(&ls->delayed);
36 INIT_LIST_HEAD(&ls->submit);
37 INIT_LIST_HEAD(&ls->all_locks);
38 init_waitqueue_head(&ls->thread_wait);
39 init_waitqueue_head(&ls->wait_control);
40 ls->thread1 = NULL;
41 ls->thread2 = NULL;
42 ls->drop_time = jiffies;
43 ls->jid = -1;
44
45 strncpy(buf, table_name, 256);
46 buf[255] = '\0';
47
48 p = strstr(buf, ":");
49 if (!p) {
50 log_info("invalid table_name \"%s\"", table_name);
51 kfree(ls);
52 return NULL;
53 }
54 *p = '\0';
55 p++;
56
57 strncpy(ls->clustername, buf, GDLM_NAME_LEN);
58 strncpy(ls->fsname, p, GDLM_NAME_LEN);
59
60 return ls;
61}
62
63static int make_args(struct gdlm_ls *ls, char *data_arg)
64{
65 char data[256];
66 char *options, *x, *y;
67 int error = 0;
68
69 memset(data, 0, 256);
70 strncpy(data, data_arg, 255);
71
72 for (options = data; (x = strsep(&options, ":")); ) {
73 if (!*x)
74 continue;
75
76 y = strchr(x, '=');
77 if (y)
78 *y++ = 0;
79
80 if (!strcmp(x, "jid")) {
81 if (!y) {
82 log_error("need argument to jid");
83 error = -EINVAL;
84 break;
85 }
86 sscanf(y, "%u", &ls->jid);
87
88 } else if (!strcmp(x, "first")) {
89 if (!y) {
90 log_error("need argument to first");
91 error = -EINVAL;
92 break;
93 }
94 sscanf(y, "%u", &ls->first);
95
96 } else if (!strcmp(x, "id")) {
97 if (!y) {
98 log_error("need argument to id");
99 error = -EINVAL;
100 break;
101 }
102 sscanf(y, "%u", &ls->id);
103
104 } else {
105 log_error("unkonwn option: %s", x);
106 error = -EINVAL;
107 break;
108 }
109 }
110
111 return error;
112}
113
/* lm_lockops mount: build the lockspace object, start the lock_dlm
   threads, join the dlm lockspace, expose the sysfs interface, and
   parse mount options.  Fills in lockstruct for gfs on success. */
static int gdlm_mount(char *table_name, char *host_data,
		      lm_callback_t cb, lm_fsdata_t *fsdata,
		      unsigned int min_lvb_size, int flags,
		      struct lm_lockstruct *lockstruct,
		      struct kobject *fskobj)
{
	struct gdlm_ls *ls;
	int error = -ENOMEM;

	/* NOTE(review): an oversized min_lvb_size returns -ENOMEM here;
	   -EINVAL would describe the condition better — confirm callers */
	if (min_lvb_size > GDLM_LVB_SIZE)
		goto out;

	ls = init_gdlm(cb, fsdata, flags, table_name);
	if (!ls)
		goto out;

	error = gdlm_init_threads(ls);
	if (error)
		goto out_free;

	error = dlm_new_lockspace(ls->fsname, strlen(ls->fsname),
				  &ls->dlm_lockspace, 0, GDLM_LVB_SIZE);
	if (error) {
		log_error("dlm_new_lockspace error %d", error);
		goto out_thread;
	}

	error = gdlm_kobject_setup(ls, fskobj);
	if (error)
		goto out_dlm;

	error = make_args(ls, host_data);
	if (error)
		goto out_sysfs;

	/* hand the results back to gfs */
	lockstruct->ls_jid = ls->jid;
	lockstruct->ls_first = ls->first;
	lockstruct->ls_lockspace = ls;
	lockstruct->ls_ops = &gdlm_ops;
	lockstruct->ls_flags = 0;
	lockstruct->ls_lvb_size = GDLM_LVB_SIZE;
	return 0;

	/* unwind in reverse order of setup */
 out_sysfs:
	gdlm_kobject_release(ls);
 out_dlm:
	dlm_release_lockspace(ls->dlm_lockspace, 2);
 out_thread:
	gdlm_release_threads(ls);
 out_free:
	kfree(ls);
 out:
	return error;
}
168
/* lm_lockops unmount: leave the dlm lockspace and free everything.
   After a withdraw the teardown already happened in gdlm_withdraw, so
   only the lockspace struct itself is freed. */
static void gdlm_unmount(lm_lockspace_t *lockspace)
{
	struct gdlm_ls *ls = (struct gdlm_ls *) lockspace;
	int rv;

	log_debug("unmount flags %lx", ls->flags);

	/* FIXME: serialize unmount and withdraw in case they
	   happen at once.  Also, if unmount follows withdraw,
	   wait for withdraw to finish. */

	if (test_bit(DFL_WITHDRAW, &ls->flags))
		goto out;

	gdlm_kobject_release(ls);
	dlm_release_lockspace(ls->dlm_lockspace, 2);
	gdlm_release_threads(ls);
	/* any locks still around at this point are stray */
	rv = gdlm_release_all_locks(ls);
	if (rv)
		log_info("gdlm_unmount: %d stray locks freed", rv);
 out:
	kfree(ls);
}
192
/* lm_lockops recovery_done: gfs reports that journal jid has been
   recovered; record it and notify userspace via a sysfs uevent.
   (The message argument is currently unused.) */
static void gdlm_recovery_done(lm_lockspace_t *lockspace, unsigned int jid,
			       unsigned int message)
{
	struct gdlm_ls *ls = (struct gdlm_ls *) lockspace;
	ls->recover_jid_done = jid;
	kobject_uevent(&ls->kobj, KOBJ_CHANGE);
}
200
/* lm_lockops others_may_mount: the first mounter has finished initial
   recovery; flag it and notify userspace via a sysfs uevent. */
static void gdlm_others_may_mount(lm_lockspace_t *lockspace)
{
	struct gdlm_ls *ls = (struct gdlm_ls *) lockspace;
	ls->first_done = 1;
	kobject_uevent(&ls->kobj, KOBJ_CHANGE);
}
207
/* Userspace gets the offline uevent, blocks new gfs locks on
   other mounters, and lets us know (sets WITHDRAW flag).  Then,
   userspace leaves the mount group while we leave the lockspace. */

static void gdlm_withdraw(lm_lockspace_t *lockspace)
{
	struct gdlm_ls *ls = (struct gdlm_ls *) lockspace;

	kobject_uevent(&ls->kobj, KOBJ_OFFLINE);

	/* woken by withdraw_store() once userspace sets DFL_WITHDRAW */
	wait_event_interruptible(ls->wait_control,
				 test_bit(DFL_WITHDRAW, &ls->flags));

	dlm_release_lockspace(ls->dlm_lockspace, 2);
	gdlm_release_threads(ls);
	gdlm_release_all_locks(ls);
	gdlm_kobject_release(ls);
}
226
227struct lm_lockops gdlm_ops = {
228 .lm_proto_name = "lock_dlm",
229 .lm_mount = gdlm_mount,
230 .lm_others_may_mount = gdlm_others_may_mount,
231 .lm_unmount = gdlm_unmount,
232 .lm_withdraw = gdlm_withdraw,
233 .lm_get_lock = gdlm_get_lock,
234 .lm_put_lock = gdlm_put_lock,
235 .lm_lock = gdlm_lock,
236 .lm_unlock = gdlm_unlock,
237 .lm_plock = gdlm_plock,
238 .lm_punlock = gdlm_punlock,
239 .lm_plock_get = gdlm_plock_get,
240 .lm_cancel = gdlm_cancel,
241 .lm_hold_lvb = gdlm_hold_lvb,
242 .lm_unhold_lvb = gdlm_unhold_lvb,
243 .lm_sync_lvb = gdlm_sync_lvb,
244 .lm_recovery_done = gdlm_recovery_done,
245 .lm_owner = THIS_MODULE,
246};
247
diff --git a/fs/gfs2/locking/dlm/plock.c b/fs/gfs2/locking/dlm/plock.c
new file mode 100644
index 000000000000..382847205bc1
--- /dev/null
+++ b/fs/gfs2/locking/dlm/plock.c
@@ -0,0 +1,297 @@
1/*
2 * Copyright (C) 2005 Red Hat, Inc. All rights reserved.
3 *
4 * This copyrighted material is made available to anyone wishing to use,
5 * modify, copy, or redistribute it subject to the terms and conditions
6 * of the GNU General Public License v.2.
7 */
8
9#include <linux/miscdevice.h>
10#include <linux/lock_dlm_plock.h>
11
12#include "lock_dlm.h"
13
14
15static spinlock_t ops_lock;
16static struct list_head send_list;
17static struct list_head recv_list;
18static wait_queue_head_t send_wq;
19static wait_queue_head_t recv_wq;
20
21struct plock_op {
22 struct list_head list;
23 int done;
24 struct gdlm_plock_info info;
25};
26
/* Stamp the kernel's plock protocol version into an outgoing request. */
static inline void set_version(struct gdlm_plock_info *info)
{
	info->version[0] = GDLM_PLOCK_VERSION_MAJOR;
	info->version[1] = GDLM_PLOCK_VERSION_MINOR;
	info->version[2] = GDLM_PLOCK_VERSION_PATCH;
}
33
34static int check_version(struct gdlm_plock_info *info)
35{
36 if ((GDLM_PLOCK_VERSION_MAJOR != info->version[0]) ||
37 (GDLM_PLOCK_VERSION_MINOR < info->version[1])) {
38 log_error("plock device version mismatch: "
39 "kernel (%u.%u.%u), user (%u.%u.%u)",
40 GDLM_PLOCK_VERSION_MAJOR,
41 GDLM_PLOCK_VERSION_MINOR,
42 GDLM_PLOCK_VERSION_PATCH,
43 info->version[0],
44 info->version[1],
45 info->version[2]);
46 return -EINVAL;
47 }
48 return 0;
49}
50
/* Queue a plock request for the userspace daemon and wake any reader
   blocked in dev_read/dev_poll. */
static void send_op(struct plock_op *op)
{
	set_version(&op->info);
	INIT_LIST_HEAD(&op->list);
	spin_lock(&ops_lock);
	list_add_tail(&op->list, &send_list);
	spin_unlock(&ops_lock);
	/* wake after dropping the lock; readers re-take it themselves */
	wake_up(&send_wq);
}
60
/* lm_lockops plock: forward a posix lock request to the userspace
   daemon through the misc device and wait (uninterruptibly) for the
   answer.  On success the granted lock is mirrored into the local vfs
   lock table so F_GETLK and unlock bookkeeping work. */
int gdlm_plock(lm_lockspace_t *lockspace, struct lm_lockname *name,
	       struct file *file, int cmd, struct file_lock *fl)
{
	struct gdlm_ls *ls = (struct gdlm_ls *) lockspace;
	struct plock_op *op;
	int rv;

	op = kzalloc(sizeof(*op), GFP_KERNEL);
	if (!op)
		return -ENOMEM;

	op->info.optype = GDLM_PLOCK_OP_LOCK;
	/* NOTE(review): fl_owner is a pointer truncated to 32 bits here;
	   assumes the low bits identify the owner uniquely — confirm */
	op->info.pid = (uint32_t) fl->fl_owner;
	op->info.ex = (fl->fl_type == F_WRLCK);
	op->info.wait = IS_SETLKW(cmd);
	op->info.fsid = ls->id;
	op->info.number = name->ln_number;
	op->info.start = fl->fl_start;
	op->info.end = fl->fl_end;

	send_op(op);
	wait_event(recv_wq, (op->done != 0));

	/* dev_write normally unlinks the op; a still-linked op signals a
	   protocol problem, so log and unlink defensively */
	spin_lock(&ops_lock);
	if (!list_empty(&op->list)) {
		printk("plock op on list\n");
		list_del(&op->list);
	}
	spin_unlock(&ops_lock);

	rv = op->info.rv;

	if (!rv) {
		if (posix_lock_file_wait(file, fl) < 0)
			log_error("gdlm_plock: vfs lock error %x,%llx",
				  name->ln_type, name->ln_number);
	}

	kfree(op);
	return rv;
}
102
/* lm_lockops punlock: release a posix lock.  The local vfs state is
   dropped first, then the daemon is told to release the cluster-wide
   lock; waits uninterruptibly for the reply. */
int gdlm_punlock(lm_lockspace_t *lockspace, struct lm_lockname *name,
		 struct file *file, struct file_lock *fl)
{
	struct gdlm_ls *ls = (struct gdlm_ls *) lockspace;
	struct plock_op *op;
	int rv;

	op = kzalloc(sizeof(*op), GFP_KERNEL);
	if (!op)
		return -ENOMEM;

	/* drop the vfs-level lock before telling the daemon */
	if (posix_lock_file_wait(file, fl) < 0)
		log_error("gdlm_punlock: vfs unlock error %x,%llx",
			  name->ln_type, name->ln_number);

	op->info.optype = GDLM_PLOCK_OP_UNLOCK;
	op->info.pid = (uint32_t) fl->fl_owner;
	op->info.fsid = ls->id;
	op->info.number = name->ln_number;
	op->info.start = fl->fl_start;
	op->info.end = fl->fl_end;

	send_op(op);
	wait_event(recv_wq, (op->done != 0));

	/* defensive unlink; see gdlm_plock */
	spin_lock(&ops_lock);
	if (!list_empty(&op->list)) {
		printk("punlock op on list\n");
		list_del(&op->list);
	}
	spin_unlock(&ops_lock);

	rv = op->info.rv;

	kfree(op);
	return rv;
}
140
/* lm_lockops plock_get (F_GETLK): ask the daemon whether a conflicting
   posix lock exists.  rv == 0 means no conflict (fl_type set to F_UNLCK);
   rv > 0 means a conflict, whose details are copied back into fl. */
int gdlm_plock_get(lm_lockspace_t *lockspace, struct lm_lockname *name,
		   struct file *file, struct file_lock *fl)
{
	struct gdlm_ls *ls = (struct gdlm_ls *) lockspace;
	struct plock_op *op;
	int rv;

	op = kzalloc(sizeof(*op), GFP_KERNEL);
	if (!op)
		return -ENOMEM;

	op->info.optype = GDLM_PLOCK_OP_GET;
	op->info.pid = (uint32_t) fl->fl_owner;
	op->info.ex = (fl->fl_type == F_WRLCK);
	op->info.fsid = ls->id;
	op->info.number = name->ln_number;
	op->info.start = fl->fl_start;
	op->info.end = fl->fl_end;

	send_op(op);
	wait_event(recv_wq, (op->done != 0));

	/* defensive unlink; see gdlm_plock */
	spin_lock(&ops_lock);
	if (!list_empty(&op->list)) {
		printk("plock_get op on list\n");
		list_del(&op->list);
	}
	spin_unlock(&ops_lock);

	rv = op->info.rv;

	if (rv == 0)
		fl->fl_type = F_UNLCK;
	else if (rv > 0) {
		/* describe the conflicting holder to the caller */
		fl->fl_type = (op->info.ex) ? F_WRLCK : F_RDLCK;
		fl->fl_pid = op->info.pid;
		fl->fl_start = op->info.start;
		fl->fl_end = op->info.end;
	}

	kfree(op);
	return rv;
}
184
/* a read copies out one plock request from the send list */
static ssize_t dev_read(struct file *file, char __user *u, size_t count,
			loff_t *ppos)
{
	struct gdlm_plock_info info;
	struct plock_op *op = NULL;

	if (count < sizeof(info))
		return -EINVAL;

	/* move the op to recv_list so dev_write can match the daemon's
	   reply by (fsid, number); snapshot info while holding the lock */
	spin_lock(&ops_lock);
	if (!list_empty(&send_list)) {
		op = list_entry(send_list.next, struct plock_op, list);
		list_move(&op->list, &recv_list);
		memcpy(&info, &op->info, sizeof(info));
	}
	spin_unlock(&ops_lock);

	/* nothing queued: the daemon should poll and retry */
	if (!op)
		return -EAGAIN;

	if (copy_to_user(u, &info, sizeof(info)))
		return -EFAULT;
	return sizeof(info);
}
210
/* a write copies in one plock result that should match a plock_op
   on the recv list */
static ssize_t dev_write(struct file *file, const char __user *u, size_t count,
			 loff_t *ppos)
{
	struct gdlm_plock_info info;
	struct plock_op *op;
	int found = 0;

	if (count != sizeof(info))
		return -EINVAL;

	if (copy_from_user(&info, u, sizeof(info)))
		return -EFAULT;

	if (check_version(&info))
		return -EINVAL;

	/* match the reply to its pending op by (fsid, number); mark it
	   done and copy the result in before any waiter can observe it */
	spin_lock(&ops_lock);
	list_for_each_entry(op, &recv_list, list) {
		if (op->info.fsid == info.fsid &&
		    op->info.number == info.number) {
			list_del_init(&op->list);
			found = 1;
			op->done = 1;
			memcpy(&op->info, &info, sizeof(info));
			break;
		}
	}
	spin_unlock(&ops_lock);

	if (found)
		wake_up(&recv_wq);
	else
		printk("gdlm dev_write no op %x %llx\n", info.fsid,
			info.number);
	return count;
}
249
250static unsigned int dev_poll(struct file *file, poll_table *wait)
251{
252 poll_wait(file, &send_wq, wait);
253
254 spin_lock(&ops_lock);
255 if (!list_empty(&send_list)) {
256 spin_unlock(&ops_lock);
257 return POLLIN | POLLRDNORM;
258 }
259 spin_unlock(&ops_lock);
260 return 0;
261}
262
263static struct file_operations dev_fops = {
264 .read = dev_read,
265 .write = dev_write,
266 .poll = dev_poll,
267 .owner = THIS_MODULE
268};
269
270static struct miscdevice plock_dev_misc = {
271 .minor = MISC_DYNAMIC_MINOR,
272 .name = GDLM_PLOCK_MISC_NAME,
273 .fops = &dev_fops
274};
275
276int gdlm_plock_init(void)
277{
278 int rv;
279
280 spin_lock_init(&ops_lock);
281 INIT_LIST_HEAD(&send_list);
282 INIT_LIST_HEAD(&recv_list);
283 init_waitqueue_head(&send_wq);
284 init_waitqueue_head(&recv_wq);
285
286 rv = misc_register(&plock_dev_misc);
287 if (rv)
288 printk("gdlm_plock_init: misc_register failed %d", rv);
289 return rv;
290}
291
292void gdlm_plock_exit(void)
293{
294 if (misc_deregister(&plock_dev_misc) < 0)
295 printk("gdlm_plock_exit: misc_deregister failed");
296}
297
diff --git a/fs/gfs2/locking/dlm/sysfs.c b/fs/gfs2/locking/dlm/sysfs.c
new file mode 100644
index 000000000000..e1e5186c97c9
--- /dev/null
+++ b/fs/gfs2/locking/dlm/sysfs.c
@@ -0,0 +1,218 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/ctype.h>
11#include <linux/stat.h>
12
13#include "lock_dlm.h"
14
15extern struct lm_lockops gdlm_ops;
16
/* Report the lock protocol name ("lock_dlm"). */
static ssize_t proto_name_show(struct gdlm_ls *ls, char *buf)
{
	return sprintf(buf, "%s\n", gdlm_ops.lm_proto_name);
}

/* Report whether new lock requests are currently blocked (recovery). */
static ssize_t block_show(struct gdlm_ls *ls, char *buf)
{
	ssize_t ret;
	int val = 0;

	if (test_bit(DFL_BLOCK_LOCKS, &ls->flags))
		val = 1;
	ret = sprintf(buf, "%d\n", val);
	return ret;
}

/* Userspace writes 1 to block locks during recovery, 0 to unblock;
   unblocking resubmits everything queued on the delayed list. */
static ssize_t block_store(struct gdlm_ls *ls, const char *buf, size_t len)
{
	ssize_t ret = len;
	int val;

	val = simple_strtol(buf, NULL, 0);

	if (val == 1)
		set_bit(DFL_BLOCK_LOCKS, &ls->flags);
	else if (val == 0) {
		clear_bit(DFL_BLOCK_LOCKS, &ls->flags);
		gdlm_submit_delayed(ls);
	} else
		ret = -EINVAL;
	return ret;
}

/* Report whether this mount has withdrawn. */
static ssize_t withdraw_show(struct gdlm_ls *ls, char *buf)
{
	ssize_t ret;
	int val = 0;

	if (test_bit(DFL_WITHDRAW, &ls->flags))
		val = 1;
	ret = sprintf(buf, "%d\n", val);
	return ret;
}

/* Userspace writes 1 to complete the withdraw handshake; wakes the
   waiter in gdlm_withdraw().  NOTE(review): wake_up runs even for
   invalid input — original behavior, likely harmless. */
static ssize_t withdraw_store(struct gdlm_ls *ls, const char *buf, size_t len)
{
	ssize_t ret = len;
	int val;

	val = simple_strtol(buf, NULL, 0);

	if (val == 1)
		set_bit(DFL_WITHDRAW, &ls->flags);
	else
		ret = -EINVAL;
	wake_up(&ls->wait_control);
	return ret;
}

/* Filesystem id (from mount args). */
static ssize_t id_show(struct gdlm_ls *ls, char *buf)
{
	return sprintf(buf, "%u\n", ls->id);
}

/* Journal id (-1 until set). */
static ssize_t jid_show(struct gdlm_ls *ls, char *buf)
{
	return sprintf(buf, "%d\n", ls->jid);
}

/* First-mounter flag. */
static ssize_t first_show(struct gdlm_ls *ls, char *buf)
{
	return sprintf(buf, "%d\n", ls->first);
}

/* Set once the first mounter finishes initial recovery. */
static ssize_t first_done_show(struct gdlm_ls *ls, char *buf)
{
	return sprintf(buf, "%d\n", ls->first_done);
}

/* Journal currently requested for recovery. */
static ssize_t recover_show(struct gdlm_ls *ls, char *buf)
{
	return sprintf(buf, "%d\n", ls->recover_jid);
}

/* Userspace writes a jid to trigger recovery of that journal; forwards
   the request straight into gfs via the mount callback. */
static ssize_t recover_store(struct gdlm_ls *ls, const char *buf, size_t len)
{
	ls->recover_jid = simple_strtol(buf, NULL, 0);
	ls->fscb(ls->fsdata, LM_CB_NEED_RECOVERY, &ls->recover_jid);
	return len;
}

/* Last journal gfs reported as recovered (see gdlm_recovery_done). */
static ssize_t recover_done_show(struct gdlm_ls *ls, char *buf)
{
	return sprintf(buf, "%d\n", ls->recover_jid_done);
}
112
113struct gdlm_attr {
114 struct attribute attr;
115 ssize_t (*show)(struct gdlm_ls *, char *);
116 ssize_t (*store)(struct gdlm_ls *, const char *, size_t);
117};
118
119#define GDLM_ATTR(_name,_mode,_show,_store) \
120static struct gdlm_attr gdlm_attr_##_name = __ATTR(_name,_mode,_show,_store)
121
122GDLM_ATTR(proto_name, 0444, proto_name_show, NULL);
123GDLM_ATTR(block, 0644, block_show, block_store);
124GDLM_ATTR(withdraw, 0644, withdraw_show, withdraw_store);
125GDLM_ATTR(id, 0444, id_show, NULL);
126GDLM_ATTR(jid, 0444, jid_show, NULL);
127GDLM_ATTR(first, 0444, first_show, NULL);
128GDLM_ATTR(first_done, 0444, first_done_show, NULL);
129GDLM_ATTR(recover, 0644, recover_show, recover_store);
130GDLM_ATTR(recover_done, 0444, recover_done_show, NULL);
131
132static struct attribute *gdlm_attrs[] = {
133 &gdlm_attr_proto_name.attr,
134 &gdlm_attr_block.attr,
135 &gdlm_attr_withdraw.attr,
136 &gdlm_attr_id.attr,
137 &gdlm_attr_jid.attr,
138 &gdlm_attr_first.attr,
139 &gdlm_attr_first_done.attr,
140 &gdlm_attr_recover.attr,
141 &gdlm_attr_recover_done.attr,
142 NULL,
143};
144
145static ssize_t gdlm_attr_show(struct kobject *kobj, struct attribute *attr,
146 char *buf)
147{
148 struct gdlm_ls *ls = container_of(kobj, struct gdlm_ls, kobj);
149 struct gdlm_attr *a = container_of(attr, struct gdlm_attr, attr);
150 return a->show ? a->show(ls, buf) : 0;
151}
152
153static ssize_t gdlm_attr_store(struct kobject *kobj, struct attribute *attr,
154 const char *buf, size_t len)
155{
156 struct gdlm_ls *ls = container_of(kobj, struct gdlm_ls, kobj);
157 struct gdlm_attr *a = container_of(attr, struct gdlm_attr, attr);
158 return a->store ? a->store(ls, buf, len) : len;
159}
160
161static struct sysfs_ops gdlm_attr_ops = {
162 .show = gdlm_attr_show,
163 .store = gdlm_attr_store,
164};
165
166static struct kobj_type gdlm_ktype = {
167 .default_attrs = gdlm_attrs,
168 .sysfs_ops = &gdlm_attr_ops,
169};
170
171static struct kset gdlm_kset = {
172 .subsys = &kernel_subsys,
173 .kobj = {.name = "lock_dlm",},
174 .ktype = &gdlm_ktype,
175};
176
/* Register this lockspace's "lock_module" kobject under the gfs
   filesystem's kobject so userspace can drive recovery via sysfs. */
int gdlm_kobject_setup(struct gdlm_ls *ls, struct kobject *fskobj)
{
	int error;

	error = kobject_set_name(&ls->kobj, "%s", "lock_module");
	if (error) {
		log_error("can't set kobj name %d", error);
		return error;
	}

	ls->kobj.kset = &gdlm_kset;
	ls->kobj.ktype = &gdlm_ktype;
	ls->kobj.parent = fskobj;

	error = kobject_register(&ls->kobj);
	if (error)
		log_error("can't register kobj %d", error);

	return error;
}
197
/* Remove the lockspace's sysfs directory. */
void gdlm_kobject_release(struct gdlm_ls *ls)
{
	kobject_unregister(&ls->kobj);
}
202
203int gdlm_sysfs_init(void)
204{
205 int error;
206
207 error = kset_register(&gdlm_kset);
208 if (error)
209 printk("lock_dlm: cannot register kset %d\n", error);
210
211 return error;
212}
213
/* Unregister the module-wide "lock_dlm" kset. */
void gdlm_sysfs_exit(void)
{
	kset_unregister(&gdlm_kset);
}
218
diff --git a/fs/gfs2/locking/dlm/thread.c b/fs/gfs2/locking/dlm/thread.c
new file mode 100644
index 000000000000..6fe669cd334b
--- /dev/null
+++ b/fs/gfs2/locking/dlm/thread.c
@@ -0,0 +1,352 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include "lock_dlm.h"
11
12/* A lock placed on this queue is re-submitted to DLM as soon as the lock_dlm
13 thread gets to it. */
14
15static void queue_submit(struct gdlm_lock *lp)
16{
17 struct gdlm_ls *ls = lp->ls;
18
19 spin_lock(&ls->async_lock);
20 list_add_tail(&lp->delay_list, &ls->submit);
21 spin_unlock(&ls->async_lock);
22 wake_up(&ls->thread_wait);
23}
24
25static void process_blocking(struct gdlm_lock *lp, int bast_mode)
26{
27 struct gdlm_ls *ls = lp->ls;
28 unsigned int cb = 0;
29
30 switch (gdlm_make_lmstate(bast_mode)) {
31 case LM_ST_EXCLUSIVE:
32 cb = LM_CB_NEED_E;
33 break;
34 case LM_ST_DEFERRED:
35 cb = LM_CB_NEED_D;
36 break;
37 case LM_ST_SHARED:
38 cb = LM_CB_NEED_S;
39 break;
40 default:
41 gdlm_assert(0, "unknown bast mode %u", lp->bast_mode);
42 }
43
44 ls->fscb(ls->fsdata, cb, &lp->lockname);
45}
46
/*
 * Handle a completion AST from the DLM for lock @lp: decode the status and
 * flags in the lksb, update the lock's cur/req modes, and usually finish by
 * delivering an LM_CB_ASYNC callback to GFS.  Several special cases
 * (unlocks, cancels, recovery re-requests, sync-LVB conversions) return
 * early without the callback.  Runs in lock_dlm thread context.
 */
static void process_complete(struct gdlm_lock *lp)
{
	struct gdlm_ls *ls = lp->ls;
	struct lm_async_cb acb;
	int16_t prev_mode = lp->cur;	/* mode before this completion */

	memset(&acb, 0, sizeof(acb));

	/* A cancel that the DLM actually processed. */
	if (lp->lksb.sb_status == -DLM_ECANCEL) {
		log_info("complete dlm cancel %x,%llx flags %lx",
			 lp->lockname.ln_type, lp->lockname.ln_number,
			 lp->flags);

		lp->req = lp->cur;
		acb.lc_ret |= LM_OUT_CANCELED;
		if (lp->cur == DLM_LOCK_IV)
			lp->lksb.sb_lkid = 0;
		goto out;
	}

	/* Completion of an unlock we issued. */
	if (test_and_clear_bit(LFL_DLM_UNLOCK, &lp->flags)) {
		if (lp->lksb.sb_status != -DLM_EUNLOCK) {
			log_info("unlock sb_status %d %x,%llx flags %lx",
				 lp->lksb.sb_status, lp->lockname.ln_type,
				 lp->lockname.ln_number, lp->flags);
			return;
		}

		lp->cur = DLM_LOCK_IV;
		lp->req = DLM_LOCK_IV;
		lp->lksb.sb_lkid = 0;

		if (test_and_clear_bit(LFL_UNLOCK_DELETE, &lp->flags)) {
			gdlm_delete_lp(lp);
			return;
		}
		goto out;
	}

	/* DLM says the LVB contents can no longer be trusted: wipe them. */
	if (lp->lksb.sb_flags & DLM_SBF_VALNOTVALID)
		memset(lp->lksb.sb_lvbptr, 0, GDLM_LVB_SIZE);

	/* DLM granted the alternate mode (PR<->CW); record what we got. */
	if (lp->lksb.sb_flags & DLM_SBF_ALTMODE) {
		if (lp->req == DLM_LOCK_PR)
			lp->req = DLM_LOCK_CW;
		else if (lp->req == DLM_LOCK_CW)
			lp->req = DLM_LOCK_PR;
	}

	/*
	 * A canceled lock request.  The lock was just taken off the delayed
	 * list and was never even submitted to dlm.
	 */

	if (test_and_clear_bit(LFL_CANCEL, &lp->flags)) {
		log_info("complete internal cancel %x,%llx",
			 lp->lockname.ln_type, lp->lockname.ln_number);
		lp->req = lp->cur;
		acb.lc_ret |= LM_OUT_CANCELED;
		goto out;
	}

	/*
	 * An error occured.
	 */

	if (lp->lksb.sb_status) {
		/* a "normal" error: a NOQUEUE request that could not be
		   granted immediately; revert to the previous mode */
		if ((lp->lksb.sb_status == -EAGAIN) &&
		    (lp->lkf & DLM_LKF_NOQUEUE)) {
			lp->req = lp->cur;
			if (lp->cur == DLM_LOCK_IV)
				lp->lksb.sb_lkid = 0;
			goto out;
		}

		/* this could only happen with cancels I think */
		log_info("ast sb_status %d %x,%llx flags %lx",
			 lp->lksb.sb_status, lp->lockname.ln_type,
			 lp->lockname.ln_number, lp->flags);
		return;
	}

	/*
	 * This is an AST for an EX->EX conversion for sync_lvb from GFS.
	 */

	if (test_and_clear_bit(LFL_SYNC_LVB, &lp->flags)) {
		complete(&lp->ast_wait);
		return;
	}

	/*
	 * A lock has been demoted to NL because it initially completed during
	 * BLOCK_LOCKS.  Now it must be requested in the originally requested
	 * mode.
	 */

	if (test_and_clear_bit(LFL_REREQUEST, &lp->flags)) {
		gdlm_assert(lp->req == DLM_LOCK_NL, "%x,%llx",
			    lp->lockname.ln_type, lp->lockname.ln_number);
		gdlm_assert(lp->prev_req > DLM_LOCK_NL, "%x,%llx",
			    lp->lockname.ln_type, lp->lockname.ln_number);

		lp->cur = DLM_LOCK_NL;
		lp->req = lp->prev_req;
		lp->prev_req = DLM_LOCK_IV;
		lp->lkf &= ~DLM_LKF_CONVDEADLK;

		set_bit(LFL_NOCACHE, &lp->flags);

		if (test_bit(DFL_BLOCK_LOCKS, &ls->flags) &&
		    !test_bit(LFL_NOBLOCK, &lp->flags))
			gdlm_queue_delayed(lp);
		else
			queue_submit(lp);
		return;
	}

	/*
	 * A request is granted during dlm recovery.  It may be granted
	 * because the locks of a failed node were cleared.  In that case,
	 * there may be inconsistent data beneath this lock and we must wait
	 * for recovery to complete to use it.  When gfs recovery is done this
	 * granted lock will be converted to NL and then reacquired in this
	 * granted state.
	 */

	if (test_bit(DFL_BLOCK_LOCKS, &ls->flags) &&
	    !test_bit(LFL_NOBLOCK, &lp->flags) &&
	    lp->req != DLM_LOCK_NL) {

		lp->cur = lp->req;
		lp->prev_req = lp->req;
		lp->req = DLM_LOCK_NL;
		lp->lkf |= DLM_LKF_CONVERT;
		lp->lkf &= ~DLM_LKF_CONVDEADLK;

		log_debug("rereq %x,%llx id %x %d,%d",
			  lp->lockname.ln_type, lp->lockname.ln_number,
			  lp->lksb.sb_lkid, lp->cur, lp->req);

		set_bit(LFL_REREQUEST, &lp->flags);
		queue_submit(lp);
		return;
	}

	/*
	 * DLM demoted the lock to NL before it was granted so GFS must be
	 * told it cannot cache data for this lock.
	 */

	if (lp->lksb.sb_flags & DLM_SBF_DEMOTED)
		set_bit(LFL_NOCACHE, &lp->flags);

 out:
	/*
	 * This is an internal lock_dlm lock
	 */

	if (test_bit(LFL_INLOCK, &lp->flags)) {
		clear_bit(LFL_NOBLOCK, &lp->flags);
		lp->cur = lp->req;
		complete(&lp->ast_wait);
		return;
	}

	/*
	 * Normal completion of a lock request.  Tell GFS it now has the lock.
	 */

	clear_bit(LFL_NOBLOCK, &lp->flags);
	lp->cur = lp->req;

	acb.lc_name = lp->lockname;
	acb.lc_ret |= gdlm_make_lmstate(lp->cur);

	/* Caching is only allowed when both old and new modes were above NL
	   and nothing flagged the lock NOCACHE along the way. */
	if (!test_and_clear_bit(LFL_NOCACHE, &lp->flags) &&
	    (lp->cur > DLM_LOCK_NL) && (prev_mode > DLM_LOCK_NL))
		acb.lc_ret |= LM_OUT_CACHEABLE;

	ls->fscb(ls->fsdata, LM_CB_ASYNC, &acb);
}
230
231static inline int no_work(struct gdlm_ls *ls, int blocking)
232{
233 int ret;
234
235 spin_lock(&ls->async_lock);
236 ret = list_empty(&ls->complete) && list_empty(&ls->submit);
237 if (ret && blocking)
238 ret = list_empty(&ls->blocking);
239 spin_unlock(&ls->async_lock);
240
241 return ret;
242}
243
244static inline int check_drop(struct gdlm_ls *ls)
245{
246 if (!ls->drop_locks_count)
247 return 0;
248
249 if (time_after(jiffies, ls->drop_time + ls->drop_locks_period * HZ)) {
250 ls->drop_time = jiffies;
251 if (ls->all_locks_count >= ls->drop_locks_count)
252 return 1;
253 }
254 return 0;
255}
256
/*
 * Worker loop shared by the two lock_dlm kthreads.  Sleeps until work
 * appears, then drains one item per iteration, in priority order:
 * blocking callbacks (thread1 only), then completions, then submissions.
 * Also periodically asks GFS to drop cached locks via check_drop().
 */
static int gdlm_thread(void *data)
{
	struct gdlm_ls *ls = (struct gdlm_ls *) data;
	struct gdlm_lock *lp = NULL;
	int blist = 0;
	/* NOTE(review): bast modes are assumed to fit in a uint8_t here —
	   DLM modes are small integers, but confirm against dlm.h */
	uint8_t complete, blocking, submit, drop;
	DECLARE_WAITQUEUE(wait, current);

	/* Only thread1 is allowed to do blocking callbacks since gfs
	   may wait for a completion callback within a blocking cb. */

	if (current == ls->thread1)
		blist = 1;

	while (!kthread_should_stop()) {
		/* sleep only if no_work() confirms all relevant queues
		   are empty; the waitqueue add happens first to avoid
		   missing a wake-up */
		set_current_state(TASK_INTERRUPTIBLE);
		add_wait_queue(&ls->thread_wait, &wait);
		if (no_work(ls, blist))
			schedule();
		remove_wait_queue(&ls->thread_wait, &wait);
		set_current_state(TASK_RUNNING);

		complete = blocking = submit = drop = 0;

		spin_lock(&ls->async_lock);

		if (blist && !list_empty(&ls->blocking)) {
			lp = list_entry(ls->blocking.next, struct gdlm_lock,
					blist);
			list_del_init(&lp->blist);
			/* capture and clear the mode while still locked;
			   process_blocking gets the captured value */
			blocking = lp->bast_mode;
			lp->bast_mode = 0;
		} else if (!list_empty(&ls->complete)) {
			lp = list_entry(ls->complete.next, struct gdlm_lock,
					clist);
			list_del_init(&lp->clist);
			complete = 1;
		} else if (!list_empty(&ls->submit)) {
			lp = list_entry(ls->submit.next, struct gdlm_lock,
					delay_list);
			list_del_init(&lp->delay_list);
			submit = 1;
		}

		drop = check_drop(ls);
		spin_unlock(&ls->async_lock);

		if (complete)
			process_complete(lp);

		else if (blocking)
			process_blocking(lp, blocking);

		else if (submit)
			gdlm_do_lock(lp, NULL);

		if (drop)
			ls->fscb(ls->fsdata, LM_CB_DROPLOCKS, NULL);

		schedule();
	}

	return 0;
}
321
322int gdlm_init_threads(struct gdlm_ls *ls)
323{
324 struct task_struct *p;
325 int error;
326
327 p = kthread_run(gdlm_thread, ls, "lock_dlm1");
328 error = IS_ERR(p);
329 if (error) {
330 log_error("can't start lock_dlm1 thread %d", error);
331 return error;
332 }
333 ls->thread1 = p;
334
335 p = kthread_run(gdlm_thread, ls, "lock_dlm2");
336 error = IS_ERR(p);
337 if (error) {
338 log_error("can't start lock_dlm2 thread %d", error);
339 kthread_stop(ls->thread1);
340 return error;
341 }
342 ls->thread2 = p;
343
344 return 0;
345}
346
/* Stop both worker threads started by gdlm_init_threads. */
void gdlm_release_threads(struct gdlm_ls *ls)
{
	kthread_stop(ls->thread1);
	kthread_stop(ls->thread2);
}
352
diff --git a/fs/gfs2/locking/nolock/Makefile b/fs/gfs2/locking/nolock/Makefile
new file mode 100644
index 000000000000..cdadf956c831
--- /dev/null
+++ b/fs/gfs2/locking/nolock/Makefile
@@ -0,0 +1,3 @@
# lock_nolock is built whenever GFS2 itself is enabled; it is a single
# object compiled from main.c.
obj-$(CONFIG_GFS2_FS) += lock_nolock.o
lock_nolock-y := main.o
3
diff --git a/fs/gfs2/locking/nolock/main.c b/fs/gfs2/locking/nolock/main.c
new file mode 100644
index 000000000000..b716e336c073
--- /dev/null
+++ b/fs/gfs2/locking/nolock/main.c
@@ -0,0 +1,268 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/module.h>
11#include <linux/slab.h>
12#include <linux/module.h>
13#include <linux/init.h>
14#include <linux/types.h>
15#include <linux/fs.h>
16#include <linux/smp_lock.h>
17
18#include "../../lm_interface.h"
19
/* Per-mount state for the no-op lock module: all we need to remember is
   the LVB size GFS requested, so nolock_hold_lvb can allocate buffers
   of the right size. */
struct nolock_lockspace {
	unsigned int nl_lvb_size;
};

/* Tentative definition so nolock_mount can take the address of the ops
   table before its initialized definition at the bottom of the file. */
struct lm_lockops nolock_ops;
25
26static int nolock_mount(char *table_name, char *host_data,
27 lm_callback_t cb, lm_fsdata_t *fsdata,
28 unsigned int min_lvb_size, int flags,
29 struct lm_lockstruct *lockstruct,
30 struct kobject *fskobj)
31{
32 char *c;
33 unsigned int jid;
34 struct nolock_lockspace *nl;
35
36 /* If there is a "jid=" in the hostdata, return that jid.
37 Otherwise, return zero. */
38
39 c = strstr(host_data, "jid=");
40 if (!c)
41 jid = 0;
42 else {
43 c += 4;
44 sscanf(c, "%u", &jid);
45 }
46
47 nl = kmalloc(sizeof(struct nolock_lockspace), GFP_KERNEL);
48 if (!nl)
49 return -ENOMEM;
50
51 memset(nl, 0, sizeof(struct nolock_lockspace));
52 nl->nl_lvb_size = min_lvb_size;
53
54 lockstruct->ls_jid = jid;
55 lockstruct->ls_first = 1;
56 lockstruct->ls_lvb_size = min_lvb_size;
57 lockstruct->ls_lockspace = (lm_lockspace_t *)nl;
58 lockstruct->ls_ops = &nolock_ops;
59 lockstruct->ls_flags = LM_LSFLAG_LOCAL;
60
61 return 0;
62}
63
/* No-op: with local-only locking there are no other nodes to admit. */
static void nolock_others_may_mount(lm_lockspace_t *lockspace)
{
}

/* Free the per-mount state allocated in nolock_mount. */
static void nolock_unmount(lm_lockspace_t *lockspace)
{
	struct nolock_lockspace *nl = (struct nolock_lockspace *)lockspace;
	kfree(nl);
}

/* No-op: local-only locking has no cluster state to withdraw from. */
static void nolock_withdraw(lm_lockspace_t *lockspace)
{
}
77
/**
 * nolock_get_lock - get a lm_lock_t given a description of the lock
 * @lockspace: the lockspace the lock lives in
 * @name: the name of the lock
 * @lockp: return the lm_lock_t here
 *
 * Every "lock" is simply the lockspace pointer itself; no per-lock state
 * is allocated, so this cannot fail in practice.
 *
 * Returns: 0 on success, -EXXX on failure
 */

static int nolock_get_lock(lm_lockspace_t *lockspace, struct lm_lockname *name,
			   lm_lock_t **lockp)
{
	*lockp = (lm_lock_t *)lockspace;
	return 0;
}

/**
 * nolock_put_lock - get rid of a lock structure
 * @lock: the lock to throw away
 *
 * No-op: nolock_get_lock allocated nothing.
 */

static void nolock_put_lock(lm_lock_t *lock)
{
}
103
/**
 * nolock_lock - acquire a lock
 * @lock: the lock to manipulate
 * @cur_state: the current state
 * @req_state: the requested state
 * @flags: modifier flags
 *
 * Grants every request immediately: with no other nodes the lock is
 * always available and caching is always safe.
 *
 * Returns: A bitmap of LM_OUT_*
 */

static unsigned int nolock_lock(lm_lock_t *lock, unsigned int cur_state,
				unsigned int req_state, unsigned int flags)
{
	return req_state | LM_OUT_CACHEABLE;
}

/**
 * nolock_unlock - unlock a lock
 * @lock: the lock to manipulate
 * @cur_state: the current state
 *
 * Nothing is tracked, so nothing needs releasing.
 *
 * Returns: 0
 */

static unsigned int nolock_unlock(lm_lock_t *lock, unsigned int cur_state)
{
	return 0;
}

/* No-op: requests complete synchronously, so there is never anything
   in flight to cancel. */
static void nolock_cancel(lm_lock_t *lock)
{
}
136
137/**
138 * nolock_hold_lvb - hold on to a lock value block
139 * @lock: the lock the LVB is associated with
140 * @lvbp: return the lm_lvb_t here
141 *
142 * Returns: 0 on success, -EXXX on failure
143 */
144
145static int nolock_hold_lvb(lm_lock_t *lock, char **lvbp)
146{
147 struct nolock_lockspace *nl = (struct nolock_lockspace *)lock;
148 int error = 0;
149
150 *lvbp = kmalloc(nl->nl_lvb_size, GFP_KERNEL);
151 if (*lvbp)
152 memset(*lvbp, 0, nl->nl_lvb_size);
153 else
154 error = -ENOMEM;
155
156 return error;
157}
158
/**
 * nolock_unhold_lvb - release a LVB
 * @lock: the lock the LVB is associated with
 * @lvb: the lock value block
 *
 * Frees the buffer allocated by nolock_hold_lvb.
 */

static void nolock_unhold_lvb(lm_lock_t *lock, char *lvb)
{
	kfree(lvb);
}

/**
 * nolock_sync_lvb - sync out the value of a lvb
 * @lock: the lock the LVB is associated with
 * @lvb: the lock value block
 *
 * No-op: there is no other node to sync the value to.
 */

static void nolock_sync_lvb(lm_lock_t *lock, char *lvb)
{
}
181
/* POSIX-lock test: delegate to the VFS posix lock table under the BKL.
   If a conflicting lock exists it is copied into @fl, otherwise @fl is
   reported as F_UNLCK. */
static int nolock_plock_get(lm_lockspace_t *lockspace, struct lm_lockname *name,
			    struct file *file, struct file_lock *fl)
{
	struct file_lock *tmp;

	lock_kernel();
	tmp = posix_test_lock(file, fl);
	fl->fl_type = F_UNLCK;
	if (tmp)
		memcpy(fl, tmp, sizeof(struct file_lock));
	unlock_kernel();

	return 0;
}

/* POSIX lock: apply @fl locally via the VFS; @cmd is unused because
   posix_lock_file_wait always waits. */
static int nolock_plock(lm_lockspace_t *lockspace, struct lm_lockname *name,
			struct file *file, int cmd, struct file_lock *fl)
{
	int error;
	lock_kernel();
	error = posix_lock_file_wait(file, fl);
	unlock_kernel();
	return error;
}

/* POSIX unlock: identical body to nolock_plock — presumably relies on
   the caller passing @fl with F_UNLCK set; verify against callers. */
static int nolock_punlock(lm_lockspace_t *lockspace, struct lm_lockname *name,
			  struct file *file, struct file_lock *fl)
{
	int error;
	lock_kernel();
	error = posix_lock_file_wait(file, fl);
	unlock_kernel();
	return error;
}

/* No-op: there is no journal recovery coordination without a cluster. */
static void nolock_recovery_done(lm_lockspace_t *lockspace, unsigned int jid,
				 unsigned int message)
{
}
221
/* The lock-module operations table registered with GFS: every entry
   maps to one of the trivial local-only implementations above. */
struct lm_lockops nolock_ops = {
	.lm_proto_name = "lock_nolock",
	.lm_mount = nolock_mount,
	.lm_others_may_mount = nolock_others_may_mount,
	.lm_unmount = nolock_unmount,
	.lm_withdraw = nolock_withdraw,
	.lm_get_lock = nolock_get_lock,
	.lm_put_lock = nolock_put_lock,
	.lm_lock = nolock_lock,
	.lm_unlock = nolock_unlock,
	.lm_cancel = nolock_cancel,
	.lm_hold_lvb = nolock_hold_lvb,
	.lm_unhold_lvb = nolock_unhold_lvb,
	.lm_sync_lvb = nolock_sync_lvb,
	.lm_plock_get = nolock_plock_get,
	.lm_plock = nolock_plock,
	.lm_punlock = nolock_punlock,
	.lm_recovery_done = nolock_recovery_done,
	.lm_owner = THIS_MODULE,
};
242
/*
 * Module init: register the nolock protocol with the GFS lock harness.
 * NOTE(review): __DATE__/__TIME__ in the banner defeat reproducible
 * builds — consider dropping if this code is ever modernized.
 */
int __init init_nolock(void)
{
	int error;

	error = gfs_register_lockproto(&nolock_ops);
	if (error) {
		printk("lock_nolock: can't register protocol: %d\n", error);
		return error;
	}

	printk("Lock_Nolock (built %s %s) installed\n", __DATE__, __TIME__);
	return 0;
}

/* Module exit: unregister the protocol registered in init_nolock. */
void __exit exit_nolock(void)
{
	gfs_unregister_lockproto(&nolock_ops);
}

module_init(init_nolock);
module_exit(exit_nolock);

MODULE_DESCRIPTION("GFS Nolock Locking Module");
MODULE_AUTHOR("Red Hat, Inc.");
MODULE_LICENSE("GPL");
268
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
new file mode 100644
index 000000000000..e6acb41332e7
--- /dev/null
+++ b/fs/gfs2/log.c
@@ -0,0 +1,643 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <asm/semaphore.h>
16
17#include "gfs2.h"
18#include "bmap.h"
19#include "glock.h"
20#include "log.h"
21#include "lops.h"
22#include "meta_io.h"
23
24#define PULL 1
25
26static void do_lock_wait(struct gfs2_sbd *sdp, wait_queue_head_t *wq,
27 atomic_t *a)
28{
29 wait_event(*wq, atomic_read(a) ? 0 : 1);
30}
31
/* Transaction side of the trans/flush exclusion: wait until no flush is
   in progress, then count ourselves as an active transaction. */
static void lock_for_trans(struct gfs2_sbd *sdp)
{
	do_lock_wait(sdp, &sdp->sd_log_trans_wq, &sdp->sd_log_flush_count);
	atomic_inc(&sdp->sd_log_trans_count);
}

/* Drop our transaction count; the last one out wakes any waiting flush. */
static void unlock_from_trans(struct gfs2_sbd *sdp)
{
	gfs2_assert_warn(sdp, atomic_read(&sdp->sd_log_trans_count));
	if (atomic_dec_and_test(&sdp->sd_log_trans_count))
		wake_up(&sdp->sd_log_flush_wq);
}

/* Flush side: announce a flush first (blocks new transactions), then
   wait for the active transactions to drain. */
static void gfs2_lock_for_flush(struct gfs2_sbd *sdp)
{
	atomic_inc(&sdp->sd_log_flush_count);
	do_lock_wait(sdp, &sdp->sd_log_flush_wq, &sdp->sd_log_trans_count);
}

/* Drop our flush count; the last one out lets transactions resume. */
static void gfs2_unlock_from_flush(struct gfs2_sbd *sdp)
{
	gfs2_assert_warn(sdp, atomic_read(&sdp->sd_log_flush_count));
	if (atomic_dec_and_test(&sdp->sd_log_flush_count))
		wake_up(&sdp->sd_log_trans_wq);
}
57
58/**
59 * gfs2_struct2blk - compute stuff
60 * @sdp: the filesystem
61 * @nstruct: the number of structures
62 * @ssize: the size of the structures
63 *
64 * Compute the number of log descriptor blocks needed to hold a certain number
65 * of structures of a certain size.
66 *
67 * Returns: the number of blocks needed (minimum is always 1)
68 */
69
70unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct,
71 unsigned int ssize)
72{
73 unsigned int blks;
74 unsigned int first, second;
75
76 blks = 1;
77 first = (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_log_descriptor)) / ssize;
78
79 if (nstruct > first) {
80 second = (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_meta_header)) / ssize;
81 blks += DIV_RU(nstruct - first, second);
82 }
83
84 return blks;
85}
86
/*
 * gfs2_ail1_start - start writeback of AIL-1 entries, oldest first
 * @sdp: the filesystem
 * @flags: DIO_ALL to push every entry; otherwise stop once the oldest
 *         entry has drained or is no longer at the tail
 *
 * Entries are stamped with a fresh sync generation so each is started at
 * most once per pass.  New entries are list_add()ed at the head (see
 * gfs2_log_flush_i), so the oldest entry lives at head->prev.
 */
void gfs2_ail1_start(struct gfs2_sbd *sdp, int flags)
{
	struct list_head *head = &sdp->sd_ail1_list;
	uint64_t sync_gen;
	struct list_head *first, *tmp;
	struct gfs2_ail *first_ai, *ai;

	gfs2_log_lock(sdp);
	if (list_empty(head)) {
		gfs2_log_unlock(sdp);
		return;
	}
	sync_gen = sdp->sd_ail_sync_gen++;

	/* always kick the oldest (tail) entry first */
	first = head->prev;
	first_ai = list_entry(first, struct gfs2_ail, ai_list);
	first_ai->ai_sync_gen = sync_gen;
	gfs2_ail1_start_one(sdp, first_ai);

	/* DIO_ALL: forget the "stop at oldest" anchor and push everything */
	if (flags & DIO_ALL)
		first = NULL;

	for (;;) {
		if (first &&
		    (head->prev != first ||
		     gfs2_ail1_empty_one(sdp, first_ai, 0)))
			break;

		/* find the next not-yet-stamped entry, oldest first */
		for (tmp = head->prev; tmp != head; tmp = tmp->prev) {
			ai = list_entry(tmp, struct gfs2_ail, ai_list);
			if (ai->ai_sync_gen >= sync_gen)
				continue;
			ai->ai_sync_gen = sync_gen;
			gfs2_ail1_start_one(sdp, ai);
			break;
		}

		if (tmp == head)
			break;
	}

	gfs2_log_unlock(sdp);
}
130
/*
 * gfs2_ail1_empty - move fully-synced AIL-1 entries onto the AIL-2 list
 * @sdp: the filesystem
 * @flags: DIO_ALL to scan every entry; otherwise stop at the first entry
 *         that is not yet empty (entries are walked oldest-first)
 *
 * Returns: 1 if the AIL-1 list is now empty, 0 otherwise
 */
int gfs2_ail1_empty(struct gfs2_sbd *sdp, int flags)
{
	struct gfs2_ail *ai, *s;
	int ret;

	gfs2_log_lock(sdp);

	list_for_each_entry_safe_reverse(ai, s, &sdp->sd_ail1_list, ai_list) {
		if (gfs2_ail1_empty_one(sdp, ai, flags))
			list_move(&ai->ai_list, &sdp->sd_ail2_list);
		else if (!(flags & DIO_ALL))
			break;
	}

	ret = list_empty(&sdp->sd_ail1_list);

	gfs2_log_unlock(sdp);

	return ret;
}
151
/*
 * ail2_empty - discard AIL-2 entries the advancing log tail has passed
 * @sdp: the filesystem
 * @new_tail: the journal block the tail is moving to
 *
 * An entry is removed when its first journal block lies in the half-open
 * range [old_tail, new_tail); when the tail wraps around the circular
 * journal the interval test is inverted (union instead of intersection).
 */
static void ail2_empty(struct gfs2_sbd *sdp, unsigned int new_tail)
{
	struct gfs2_ail *ai, *safe;
	unsigned int old_tail = sdp->sd_log_tail;
	int wrap = (new_tail < old_tail);
	int a, b, rm;

	gfs2_log_lock(sdp);

	list_for_each_entry_safe(ai, safe, &sdp->sd_ail2_list, ai_list) {
		a = (old_tail <= ai->ai_first);
		b = (ai->ai_first < new_tail);
		rm = (wrap) ? (a || b) : (a && b);
		if (!rm)
			continue;

		gfs2_ail2_empty_one(sdp, ai);
		list_del(&ai->ai_list);
		gfs2_assert_warn(sdp, list_empty(&ai->ai_ail1_list));
		gfs2_assert_warn(sdp, list_empty(&ai->ai_ail2_list));
		kfree(ai);
	}

	gfs2_log_unlock(sdp);
}
177
/**
 * gfs2_log_reserve - Make a log reservation
 * @sdp: The GFS2 superblock
 * @blks: The number of blocks to reserve
 *
 * Queues on a FIFO of reservers (via an on-stack list node) so blocks
 * are handed out in arrival order; when blocks are short the log is
 * flushed and, on repeat attempts, AIL writeback is kicked to free
 * more.  On success the caller also holds the transaction-side lock
 * (lock_for_trans).
 *
 * Returns: errno
 */

int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks)
{
	LIST_HEAD(list);
	unsigned int try = 0;

	if (gfs2_assert_warn(sdp, blks) ||
	    gfs2_assert_warn(sdp, blks <= sdp->sd_jdesc->jd_blocks))
		return -EINVAL;

	for (;;) {
		gfs2_log_lock(sdp);
		/* join the queue once, then sleep until we are at its head */
		if (list_empty(&list)) {
			list_add_tail(&list, &sdp->sd_log_blks_list);
			while (sdp->sd_log_blks_list.next != &list) {
				DECLARE_WAITQUEUE(__wait_chan, current);
				set_current_state(TASK_UNINTERRUPTIBLE);
				add_wait_queue(&sdp->sd_log_blks_wait,
					       &__wait_chan);
				gfs2_log_unlock(sdp);
				schedule();
				gfs2_log_lock(sdp);
				remove_wait_queue(&sdp->sd_log_blks_wait,
						  &__wait_chan);
				set_current_state(TASK_RUNNING);
			}
		}
		/* Never give away the last block so we can
		   always pull the tail if we need to. */
		if (sdp->sd_log_blks_free > blks) {
			sdp->sd_log_blks_free -= blks;
			list_del(&list);
			gfs2_log_unlock(sdp);
			wake_up(&sdp->sd_log_blks_wait);
			break;
		}

		/* not enough free blocks: try to reclaim some and retry */
		gfs2_log_unlock(sdp);
		gfs2_ail1_empty(sdp, 0);
		gfs2_log_flush(sdp);

		if (try++)
			gfs2_ail1_start(sdp, 0);
	}
	lock_for_trans(sdp);

	return 0;
}
233
/**
 * gfs2_log_release - Release a given number of log blocks
 * @sdp: The GFS2 superblock
 * @blks: The number of blocks
 *
 * Undoes a gfs2_log_reserve: drops the transaction-side lock and
 * returns @blks to the free pool.
 */

void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks)
{
	unlock_from_trans(sdp);

	gfs2_log_lock(sdp);
	sdp->sd_log_blks_free += blks;
	gfs2_assert_withdraw(sdp,
			sdp->sd_log_blks_free <= sdp->sd_jdesc->jd_blocks);
	gfs2_log_unlock(sdp);
}
251
/* Map journal-relative block @lbn to its on-disk block number via the
   journal inode's block map.  A mapping failure (error or hole) is a
   filesystem-withdrawing condition. */
static uint64_t log_bmap(struct gfs2_sbd *sdp, unsigned int lbn)
{
	int new = 0;
	uint64_t dbn;
	int error;

	error = gfs2_block_map(get_v2ip(sdp->sd_jdesc->jd_inode), lbn, &new, &dbn, NULL);
	gfs2_assert_withdraw(sdp, !error && dbn);

	return dbn;
}
263
264/**
265 * log_distance - Compute distance between two journal blocks
266 * @sdp: The GFS2 superblock
267 * @newer: The most recent journal block of the pair
268 * @older: The older journal block of the pair
269 *
270 * Compute the distance (in the journal direction) between two
271 * blocks in the journal
272 *
273 * Returns: the distance in blocks
274 */
275
276static inline unsigned int log_distance(struct gfs2_sbd *sdp,
277 unsigned int newer,
278 unsigned int older)
279{
280 int dist;
281
282 dist = newer - older;
283 if (dist < 0)
284 dist += sdp->sd_jdesc->jd_blocks;
285
286 return dist;
287}
288
/* Return the current log tail: the first block of the oldest AIL-1
   entry (list tail), or the log head when no AIL entries remain. */
static unsigned int current_tail(struct gfs2_sbd *sdp)
{
	struct gfs2_ail *ai;
	unsigned int tail;

	gfs2_log_lock(sdp);

	if (list_empty(&sdp->sd_ail1_list))
		tail = sdp->sd_log_head;
	else {
		ai = list_entry(sdp->sd_ail1_list.prev,
				struct gfs2_ail, ai_list);
		tail = ai->ai_first;
	}

	gfs2_log_unlock(sdp);

	return tail;
}
308
/* Advance the flush head one journal block, wrapping at the end of the
   journal.  The head may only catch the tail when the log is fully
   idle (flush head == head), which the assert checks. */
static inline void log_incr_head(struct gfs2_sbd *sdp)
{
	if (sdp->sd_log_flush_head == sdp->sd_log_tail)
		gfs2_assert_withdraw(sdp,
				sdp->sd_log_flush_head == sdp->sd_log_head);

	if (++sdp->sd_log_flush_head == sdp->sd_jdesc->jd_blocks) {
		sdp->sd_log_flush_head = 0;
		sdp->sd_log_flush_wrapped = 1;
	}
}
320
/**
 * gfs2_log_get_buf - Get and initialize a buffer to use for log control data
 * @sdp: The GFS2 superblock
 *
 * Allocates a tracking record on the flush list, grabs and zeroes the
 * buffer for the current flush-head block, and advances the flush head.
 * Allocation uses __GFP_NOFAIL, so this cannot return NULL.
 *
 * Returns: the buffer_head
 */

struct buffer_head *gfs2_log_get_buf(struct gfs2_sbd *sdp)
{
	uint64_t blkno = log_bmap(sdp, sdp->sd_log_flush_head);
	struct gfs2_log_buf *lb;
	struct buffer_head *bh;

	lb = kzalloc(sizeof(struct gfs2_log_buf), GFP_NOFS | __GFP_NOFAIL);
	list_add(&lb->lb_list, &sdp->sd_log_flush_list);

	bh = lb->lb_bh = sb_getblk(sdp->sd_vfs, blkno);
	lock_buffer(bh);
	memset(bh->b_data, 0, bh->b_size);
	set_buffer_uptodate(bh);
	clear_buffer_dirty(bh);
	unlock_buffer(bh);

	log_incr_head(sdp);

	return bh;
}
348
/**
 * gfs2_log_fake_buf - Build a fake buffer head to write metadata buffer to log
 * @sdp: the filesystem
 * @real: the in-place metadata buffer whose page data should be written
 *        to the journal
 *
 * Creates a throwaway buffer_head aliasing @real's page contents but
 * mapped at the current flush-head journal block, so the metadata can
 * be written into the log without copying.  log_flush_commit tears the
 * fake head down again after I/O completes.
 *
 * Returns: the log buffer descriptor
 */

struct buffer_head *gfs2_log_fake_buf(struct gfs2_sbd *sdp,
				      struct buffer_head *real)
{
	uint64_t blkno = log_bmap(sdp, sdp->sd_log_flush_head);
	struct gfs2_log_buf *lb;
	struct buffer_head *bh;

	lb = kzalloc(sizeof(struct gfs2_log_buf), GFP_NOFS | __GFP_NOFAIL);
	list_add(&lb->lb_list, &sdp->sd_log_flush_list);
	lb->lb_real = real;

	bh = lb->lb_bh = alloc_buffer_head(GFP_NOFS | __GFP_NOFAIL);
	atomic_set(&bh->b_count, 1);
	bh->b_state = (1 << BH_Mapped) | (1 << BH_Uptodate);
	set_bh_page(bh, real->b_page, bh_offset(real));
	bh->b_blocknr = blkno;
	bh->b_size = sdp->sd_sb.sb_bsize;
	bh->b_bdev = sdp->sd_vfs->s_bdev;

	log_incr_head(sdp);

	return bh;
}
380
/* Advance the log tail to @new_tail: retire AIL-2 entries the tail has
   passed and return the reclaimed blocks to the free pool.  @pull set
   means one block was consumed writing the header that moved the tail,
   so one fewer block is freed. */
static void log_pull_tail(struct gfs2_sbd *sdp, unsigned int new_tail, int pull)
{
	unsigned int dist = log_distance(sdp, new_tail, sdp->sd_log_tail);

	ail2_empty(sdp, new_tail);

	gfs2_log_lock(sdp);
	sdp->sd_log_blks_free += dist - ((pull) ? 1 : 0);
	gfs2_assert_withdraw(sdp,
			sdp->sd_log_blks_free <= sdp->sd_jdesc->jd_blocks);
	gfs2_log_unlock(sdp);

	sdp->sd_log_tail = new_tail;
}
395
396/**
397 * log_write_header - Get and initialize a journal header buffer
398 * @sdp: The GFS2 superblock
399 *
400 * Returns: the initialized log buffer descriptor
401 */
402
403static void log_write_header(struct gfs2_sbd *sdp, uint32_t flags, int pull)
404{
405 uint64_t blkno = log_bmap(sdp, sdp->sd_log_flush_head);
406 struct buffer_head *bh;
407 struct gfs2_log_header *lh;
408 unsigned int tail;
409 uint32_t hash;
410
411 atomic_inc(&sdp->sd_log_flush_ondisk);
412
413 bh = sb_getblk(sdp->sd_vfs, blkno);
414 lock_buffer(bh);
415 memset(bh->b_data, 0, bh->b_size);
416 set_buffer_uptodate(bh);
417 clear_buffer_dirty(bh);
418 unlock_buffer(bh);
419
420 gfs2_ail1_empty(sdp, 0);
421 tail = current_tail(sdp);
422
423 lh = (struct gfs2_log_header *)bh->b_data;
424 memset(lh, 0, sizeof(struct gfs2_log_header));
425 lh->lh_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
426 lh->lh_header.mh_type = cpu_to_be16(GFS2_METATYPE_LH);
427 lh->lh_header.mh_format = cpu_to_be16(GFS2_FORMAT_LH);
428 lh->lh_sequence = be64_to_cpu(sdp->sd_log_sequence++);
429 lh->lh_flags = be32_to_cpu(flags);
430 lh->lh_tail = be32_to_cpu(tail);
431 lh->lh_blkno = be32_to_cpu(sdp->sd_log_flush_head);
432 hash = gfs2_disk_hash(bh->b_data, sizeof(struct gfs2_log_header));
433 lh->lh_hash = cpu_to_be32(hash);
434
435 set_buffer_dirty(bh);
436 if (sync_dirty_buffer(bh))
437 gfs2_io_error_bh(sdp, bh);
438 brelse(bh);
439
440 if (sdp->sd_log_tail != tail)
441 log_pull_tail(sdp, tail, pull);
442 else
443 gfs2_assert_withdraw(sdp, !pull);
444
445 sdp->sd_log_idle = (tail == sdp->sd_log_flush_head);
446 log_incr_head(sdp);
447}
448
/* Wait for every buffer queued on the flush list to hit disk, release
   the tracking records, then commit the flush by writing a log header.
   Fake buffer heads (lb_real set) are freed once their only reference
   is ours; real buffers are simply brelse'd. */
static void log_flush_commit(struct gfs2_sbd *sdp)
{
	struct list_head *head = &sdp->sd_log_flush_list;
	struct gfs2_log_buf *lb;
	struct buffer_head *bh;
	unsigned int d;

	d = log_distance(sdp, sdp->sd_log_flush_head, sdp->sd_log_head);

	/* everything reserved for this flush must have been written,
	   plus one block for the header we are about to emit */
	gfs2_assert_withdraw(sdp, d + 1 == sdp->sd_log_blks_reserved);

	while (!list_empty(head)) {
		lb = list_entry(head->next, struct gfs2_log_buf, lb_list);
		list_del(&lb->lb_list);
		bh = lb->lb_bh;

		wait_on_buffer(bh);
		if (!buffer_uptodate(bh))
			gfs2_io_error_bh(sdp, bh);
		if (lb->lb_real) {
			while (atomic_read(&bh->b_count) != 1) /* Grrrr... */
				schedule();
			free_buffer_head(bh);
		} else
			brelse(bh);
		kfree(lb);
	}

	log_write_header(sdp, 0, 0);
}
479
/**
 * gfs2_log_flush_i - flush incore transaction(s)
 * @sdp: the filesystem
 * @gl: The glock structure to flush. If NULL, flush the whole incore log
 *
 * Serializes against transactions (gfs2_lock_for_flush) and against
 * concurrent flushes (sd_log_flush_lock), writes the committed buffers
 * plus a log header, and moves anything left in flight onto the AIL-1
 * list for later writeback.  A glock-targeted flush is a no-op when
 * the glock has nothing on the log.
 */

void gfs2_log_flush_i(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
{
	struct gfs2_ail *ai;

	atomic_inc(&sdp->sd_log_flush_incore);

	ai = kzalloc(sizeof(struct gfs2_ail), GFP_NOFS | __GFP_NOFAIL);
	INIT_LIST_HEAD(&ai->ai_ail1_list);
	INIT_LIST_HEAD(&ai->ai_ail2_list);
	gfs2_lock_for_flush(sdp);

	/* targeted flush: bail out early if the glock isn't on the log */
	if (gl) {
		gfs2_log_lock(sdp);
		if (list_empty(&gl->gl_le.le_list)) {
			gfs2_log_unlock(sdp);
			gfs2_unlock_from_flush(sdp);
			kfree(ai);
			return;
		}
		gfs2_log_unlock(sdp);
	}

	mutex_lock(&sdp->sd_log_flush_lock);

	gfs2_assert_withdraw(sdp,
			sdp->sd_log_num_buf == sdp->sd_log_commited_buf);
	gfs2_assert_withdraw(sdp,
			sdp->sd_log_num_revoke == sdp->sd_log_commited_revoke);

	sdp->sd_log_flush_head = sdp->sd_log_head;
	sdp->sd_log_flush_wrapped = 0;
	ai->ai_first = sdp->sd_log_flush_head;

	lops_before_commit(sdp);
	if (!list_empty(&sdp->sd_log_flush_list))
		log_flush_commit(sdp);
	else if (sdp->sd_log_tail != current_tail(sdp) && !sdp->sd_log_idle)
		/* nothing to write, but the tail can move: emit a header
		   purely to pull the tail forward */
		log_write_header(sdp, 0, PULL);
	lops_after_commit(sdp, ai);
	sdp->sd_log_head = sdp->sd_log_flush_head;
	if (sdp->sd_log_flush_wrapped)
		sdp->sd_log_wraps++;

	sdp->sd_log_blks_reserved =
		sdp->sd_log_commited_buf =
		sdp->sd_log_commited_revoke = 0;

	/* hand the AIL record over only if buffers are still in flight;
	   ai becomes NULL so the kfree below is a no-op in that case */
	gfs2_log_lock(sdp);
	if (!list_empty(&ai->ai_ail1_list)) {
		list_add(&ai->ai_list, &sdp->sd_ail1_list);
		ai = NULL;
	}
	gfs2_log_unlock(sdp);

	mutex_unlock(&sdp->sd_log_flush_lock);
	sdp->sd_vfs->s_dirt = 0;
	gfs2_unlock_from_flush(sdp);

	kfree(ai);
}
547
/* Re-account log blocks at commit time: fold the transaction's buffer
   and revoke deltas into the running committed totals, recompute the
   reservation actually needed, and refund the difference to the free
   pool.  The free count must never shrink here. */
static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
{
	unsigned int reserved = 1;	/* one block for the log header */
	unsigned int old;

	gfs2_log_lock(sdp);

	sdp->sd_log_commited_buf += tr->tr_num_buf_new - tr->tr_num_buf_rm;
	gfs2_assert_withdraw(sdp, ((int)sdp->sd_log_commited_buf) >= 0);
	sdp->sd_log_commited_revoke += tr->tr_num_revoke - tr->tr_num_revoke_rm;
	gfs2_assert_withdraw(sdp, ((int)sdp->sd_log_commited_revoke) >= 0);

	/* NOTE(review): 503 looks like the number of block pointers that
	   fit in one descriptor block — confirm against the on-disk
	   format before changing */
	if (sdp->sd_log_commited_buf)
		reserved += 1 + sdp->sd_log_commited_buf + sdp->sd_log_commited_buf/503;
	if (sdp->sd_log_commited_revoke)
		reserved += gfs2_struct2blk(sdp, sdp->sd_log_commited_revoke,
					    sizeof(uint64_t));

	old = sdp->sd_log_blks_free;
	sdp->sd_log_blks_free += tr->tr_reserved -
				 (reserved - sdp->sd_log_blks_reserved);

	gfs2_assert_withdraw(sdp,
			sdp->sd_log_blks_free >= old);
	gfs2_assert_withdraw(sdp,
			sdp->sd_log_blks_free <= sdp->sd_jdesc->jd_blocks);

	sdp->sd_log_blks_reserved = reserved;

	gfs2_log_unlock(sdp);
}
579
/**
 * gfs2_log_commit - Commit a transaction to the log
 * @sdp: the filesystem
 * @tr: the transaction (freed here; ownership passes to this function)
 *
 * Refunds unused reservation, merges the transaction into the incore
 * log, and kicks off a full log flush if the incore log has grown past
 * the tunable threshold.
 *
 * Returns: errno
 */

void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
{
	log_refund(sdp, tr);
	lops_incore_commit(sdp, tr);

	sdp->sd_vfs->s_dirt = 1;
	unlock_from_trans(sdp);

	kfree(tr);

	gfs2_log_lock(sdp);
	if (sdp->sd_log_num_buf > gfs2_tune_get(sdp, gt_incore_log_blocks)) {
		gfs2_log_unlock(sdp);
		gfs2_log_flush(sdp);
	} else
		gfs2_log_unlock(sdp);
}
605
/**
 * gfs2_log_shutdown - write a shutdown header into a journal
 * @sdp: the filesystem
 *
 * Called once the log is quiescent: asserts that no transactions,
 * reservations, or incore log elements remain, writes an UNMOUNT
 * header, and leaves head == tail so the journal is clean on disk.
 */

void gfs2_log_shutdown(struct gfs2_sbd *sdp)
{
	mutex_lock(&sdp->sd_log_flush_lock);

	gfs2_assert_withdraw(sdp, !atomic_read(&sdp->sd_log_trans_count));
	gfs2_assert_withdraw(sdp, !sdp->sd_log_blks_reserved);
	gfs2_assert_withdraw(sdp, !sdp->sd_log_num_gl);
	gfs2_assert_withdraw(sdp, !sdp->sd_log_num_buf);
	gfs2_assert_withdraw(sdp, !sdp->sd_log_num_jdata);
	gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke);
	gfs2_assert_withdraw(sdp, !sdp->sd_log_num_rg);
	gfs2_assert_withdraw(sdp, !sdp->sd_log_num_databuf);
	gfs2_assert_withdraw(sdp, list_empty(&sdp->sd_ail1_list));

	sdp->sd_log_flush_head = sdp->sd_log_head;
	sdp->sd_log_flush_wrapped = 0;

	log_write_header(sdp, GFS2_LOG_HEAD_UNMOUNT, 0);

	/* after the header the journal must be fully free and clean */
	gfs2_assert_withdraw(sdp, sdp->sd_log_blks_free ==
			     sdp->sd_jdesc->jd_blocks);
	gfs2_assert_withdraw(sdp, sdp->sd_log_head == sdp->sd_log_tail);
	gfs2_assert_withdraw(sdp, list_empty(&sdp->sd_ail2_list));

	sdp->sd_log_head = sdp->sd_log_flush_head;
	if (sdp->sd_log_flush_wrapped)
		sdp->sd_log_wraps++;
	sdp->sd_log_tail = sdp->sd_log_head;

	mutex_unlock(&sdp->sd_log_flush_lock);
}
643
diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h
new file mode 100644
index 000000000000..e7a6a65c530f
--- /dev/null
+++ b/fs/gfs2/log.h
@@ -0,0 +1,65 @@
/*
 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
 *
 * This copyrighted material is made available to anyone wishing to use,
 * modify, copy, or redistribute it subject to the terms and conditions
 * of the GNU General Public License v.2.
 */

#ifndef __LOG_DOT_H__
#define __LOG_DOT_H__

/**
 * gfs2_log_lock - acquire the right to mess with the log manager
 * @sdp: the filesystem
 *
 * Protects the sd_log_num_* counters and sd_log_le_* lists.
 */

static inline void gfs2_log_lock(struct gfs2_sbd *sdp)
{
	spin_lock(&sdp->sd_log_lock);
}

/**
 * gfs2_log_unlock - release the right to mess with the log manager
 * @sdp: the filesystem
 *
 */

static inline void gfs2_log_unlock(struct gfs2_sbd *sdp)
{
	spin_unlock(&sdp->sd_log_lock);
}

/* Set the log head and tail to the journal block after @value,
   wrapping to block 0 (and counting a wrap) at the end of the journal */
static inline void gfs2_log_pointers_init(struct gfs2_sbd *sdp,
					  unsigned int value)
{
	if (++value == sdp->sd_jdesc->jd_blocks) {
		value = 0;
		sdp->sd_log_wraps++;
	}
	sdp->sd_log_head = sdp->sd_log_tail = value;
}

/* Number of journal blocks needed to log nstruct structures of ssize bytes */
unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct,
			     unsigned int ssize);

void gfs2_ail1_start(struct gfs2_sbd *sdp, int flags);
int gfs2_ail1_empty(struct gfs2_sbd *sdp, int flags);

int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks);
void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks);

struct buffer_head *gfs2_log_get_buf(struct gfs2_sbd *sdp);
struct buffer_head *gfs2_log_fake_buf(struct gfs2_sbd *sdp,
				      struct buffer_head *real);

/* Flush the whole log, or only the elements protected by one glock */
#define gfs2_log_flush(sdp) gfs2_log_flush_i((sdp), NULL)
#define gfs2_log_flush_glock(gl) gfs2_log_flush_i((gl)->gl_sbd, (gl))
void gfs2_log_flush_i(struct gfs2_sbd *sdp, struct gfs2_glock *gl);
void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *trans);

void gfs2_log_shutdown(struct gfs2_sbd *sdp);

#endif /* __LOG_DOT_H__ */
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
new file mode 100644
index 000000000000..3d792f81e48c
--- /dev/null
+++ b/fs/gfs2/lops.c
@@ -0,0 +1,768 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <asm/semaphore.h>
16
17#include "gfs2.h"
18#include "glock.h"
19#include "log.h"
20#include "lops.h"
21#include "meta_io.h"
22#include "recovery.h"
23#include "rgrp.h"
24#include "trans.h"
25
/* Queue a glock's log element onto the log's glock list, taking a
   reference that is dropped in glock_lo_after_commit(). */
static void glock_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
{
	struct gfs2_glock *gl;

	/* Mark the current transaction as having done something, even if
	   the glock is already queued (get_transaction presumably yields
	   the transaction being built by the current context) */
	get_transaction->tr_touched = 1;

	if (!list_empty(&le->le_list))
		return;		/* already on sd_log_le_gl */

	gl = container_of(le, struct gfs2_glock, gl_le);
	if (gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(gl)))
		return;
	/* Hold a reference for the log list */
	gfs2_glock_hold(gl);
	set_bit(GLF_DIRTY, &gl->gl_flags);

	gfs2_log_lock(sdp);
	sdp->sd_log_num_gl++;
	list_add(&le->le_list, &sdp->sd_log_le_gl);
	gfs2_log_unlock(sdp);
}
46
47static void glock_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
48{
49 struct list_head *head = &sdp->sd_log_le_gl;
50 struct gfs2_glock *gl;
51
52 while (!list_empty(head)) {
53 gl = list_entry(head->next, struct gfs2_glock, gl_le.le_list);
54 list_del_init(&gl->gl_le.le_list);
55 sdp->sd_log_num_gl--;
56
57 gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(gl));
58 gfs2_glock_put(gl);
59 }
60 gfs2_assert_warn(sdp, !sdp->sd_log_num_gl);
61}
62
/* Add a metadata buffer to the current transaction and, the first time
   it is seen, pin it and queue it on the log's buffer list. */
static void buf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
{
	struct gfs2_bufdata *bd = container_of(le, struct gfs2_bufdata, bd_le);
	struct gfs2_trans *tr;

	/* Already part of the current transaction */
	if (!list_empty(&bd->bd_list_tr))
		return;

	tr = get_transaction;
	tr->tr_touched = 1;
	tr->tr_num_buf++;
	list_add(&bd->bd_list_tr, &tr->tr_list_buf);

	/* Already queued for the log by an earlier transaction */
	if (!list_empty(&le->le_list))
		return;

	gfs2_trans_add_gl(bd->bd_gl);

	gfs2_meta_check(sdp, bd->bd_bh);
	/* Pinned until written to the log; unpinned in buf_lo_after_commit() */
	gfs2_pin(sdp, bd->bd_bh);

	gfs2_log_lock(sdp);
	sdp->sd_log_num_buf++;
	list_add(&le->le_list, &sdp->sd_log_le_buf);
	gfs2_log_unlock(sdp);

	tr->tr_num_buf_new++;
}
91
92static void buf_lo_incore_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
93{
94 struct list_head *head = &tr->tr_list_buf;
95 struct gfs2_bufdata *bd;
96
97 while (!list_empty(head)) {
98 bd = list_entry(head->next, struct gfs2_bufdata, bd_list_tr);
99 list_del_init(&bd->bd_list_tr);
100 tr->tr_num_buf--;
101 }
102 gfs2_assert_warn(sdp, !tr->tr_num_buf);
103}
104
/**
 * buf_lo_before_commit - write queued metadata buffers into the log
 * @sdp: the filesystem
 *
 * Packs the block numbers of all queued metadata buffers into log
 * descriptor blocks (up to "limit" per descriptor) and follows each
 * descriptor with a log-resident copy of every described buffer.
 */
static void buf_lo_before_commit(struct gfs2_sbd *sdp)
{
	struct buffer_head *bh;
	struct gfs2_log_descriptor *ld;
	struct gfs2_bufdata *bd1 = NULL, *bd2;
	unsigned int total = sdp->sd_log_num_buf;
	unsigned int offset = sizeof(struct gfs2_log_descriptor);
	unsigned int limit;
	unsigned int num;
	unsigned n;
	__be64 *ptr;

	/* Round the descriptor header up to a __be64 boundary */
	offset += (sizeof(__be64) - 1);
	offset &= ~(sizeof(__be64) - 1);
	limit = (sdp->sd_sb.sb_bsize - offset)/sizeof(__be64);
	/* for 4k blocks, limit = 503 */

	/* bd1 walks ahead filling descriptors with block numbers; bd2
	   trails behind writing the buffer contents themselves */
	bd1 = bd2 = list_prepare_entry(bd1, &sdp->sd_log_le_buf, bd_le.le_list);
	while(total) {
		num = total;
		if (total > limit)
			num = limit;
		bh = gfs2_log_get_buf(sdp);
		ld = (struct gfs2_log_descriptor *)bh->b_data;
		ptr = (__be64 *)(bh->b_data + offset);
		ld->ld_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
		ld->ld_header.mh_type = cpu_to_be16(GFS2_METATYPE_LD);
		ld->ld_header.mh_format = cpu_to_be16(GFS2_FORMAT_LD);
		ld->ld_type = cpu_to_be32(GFS2_LOG_DESC_METADATA);
		/* length counts the descriptor block plus the data blocks */
		ld->ld_length = cpu_to_be32(num + 1);
		ld->ld_data1 = cpu_to_be32(num);
		ld->ld_data2 = cpu_to_be32(0);
		memset(ld->ld_reserved, 0, sizeof(ld->ld_reserved));

		n = 0;
		list_for_each_entry_continue(bd1, &sdp->sd_log_le_buf, bd_le.le_list) {
			*ptr++ = cpu_to_be64(bd1->bd_bh->b_blocknr);
			if (++n >= num)
				break;
		}

		set_buffer_dirty(bh);
		ll_rw_block(WRITE, 1, &bh);

		n = 0;
		list_for_each_entry_continue(bd2, &sdp->sd_log_le_buf, bd_le.le_list) {
			/* A fake buffer head aliasing the data at the
			   next log block */
			bh = gfs2_log_fake_buf(sdp, bd2->bd_bh);
			set_buffer_dirty(bh);
			ll_rw_block(WRITE, 1, &bh);
			if (++n >= num)
				break;
		}

		total -= num;
	}
}
161
162static void buf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
163{
164 struct list_head *head = &sdp->sd_log_le_buf;
165 struct gfs2_bufdata *bd;
166
167 while (!list_empty(head)) {
168 bd = list_entry(head->next, struct gfs2_bufdata, bd_le.le_list);
169 list_del_init(&bd->bd_le.le_list);
170 sdp->sd_log_num_buf--;
171
172 gfs2_unpin(sdp, bd->bd_bh, ai);
173 }
174 gfs2_assert_warn(sdp, !sdp->sd_log_num_buf);
175}
176
177static void buf_lo_before_scan(struct gfs2_jdesc *jd,
178 struct gfs2_log_header *head, int pass)
179{
180 struct gfs2_sbd *sdp = get_v2ip(jd->jd_inode)->i_sbd;
181
182 if (pass != 0)
183 return;
184
185 sdp->sd_found_blocks = 0;
186 sdp->sd_replayed_blocks = 0;
187}
188
/**
 * buf_lo_scan_elements - replay journaled metadata blocks
 * @jd: the journal being recovered
 * @start: log block of the descriptor
 * @ld: the log descriptor
 * @ptr: the array of block numbers following the descriptor header
 * @pass: recovery pass (metadata is replayed on pass 1)
 *
 * Returns: errno
 */
static int buf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
				struct gfs2_log_descriptor *ld, __be64 *ptr,
				int pass)
{
	struct gfs2_sbd *sdp = get_v2ip(jd->jd_inode)->i_sbd;
	struct gfs2_glock *gl = get_v2ip(jd->jd_inode)->i_gl;
	unsigned int blks = be32_to_cpu(ld->ld_data1);
	struct buffer_head *bh_log, *bh_ip;
	uint64_t blkno;
	int error = 0;

	if (pass != 1 || be32_to_cpu(ld->ld_type) != GFS2_LOG_DESC_METADATA)
		return 0;

	/* Step past the descriptor block itself */
	gfs2_replay_incr_blk(sdp, &start);

	for (; blks; gfs2_replay_incr_blk(sdp, &start), blks--) {
		blkno = be64_to_cpu(*ptr++);

		sdp->sd_found_blocks++;

		/* Skip blocks that were revoked after this log entry */
		if (gfs2_revoke_check(sdp, blkno, start))
			continue;

		error = gfs2_replay_read_block(jd, start, &bh_log);
		if (error)
			return error;

		/* Copy the journal copy over the in-place block */
		bh_ip = gfs2_meta_new(gl, blkno);
		memcpy(bh_ip->b_data, bh_log->b_data, bh_log->b_size);

		if (gfs2_meta_check(sdp, bh_ip))
			error = -EIO;
		else
			mark_buffer_dirty(bh_ip);

		brelse(bh_log);
		brelse(bh_ip);

		if (error)
			break;

		sdp->sd_replayed_blocks++;
	}

	return error;
}
236
237static void buf_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass)
238{
239 struct gfs2_sbd *sdp = get_v2ip(jd->jd_inode)->i_sbd;
240
241 if (error) {
242 gfs2_meta_sync(get_v2ip(jd->jd_inode)->i_gl, DIO_START | DIO_WAIT);
243 return;
244 }
245 if (pass != 1)
246 return;
247
248 gfs2_meta_sync(get_v2ip(jd->jd_inode)->i_gl, DIO_START | DIO_WAIT);
249
250 fs_info(sdp, "jid=%u: Replayed %u of %u blocks\n",
251 jd->jd_jid, sdp->sd_replayed_blocks, sdp->sd_found_blocks);
252}
253
254static void revoke_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
255{
256 struct gfs2_trans *tr;
257
258 tr = get_transaction;
259 tr->tr_touched = 1;
260 tr->tr_num_revoke++;
261
262 gfs2_log_lock(sdp);
263 sdp->sd_log_num_revoke++;
264 list_add(&le->le_list, &sdp->sd_log_le_revoke);
265 gfs2_log_unlock(sdp);
266}
267
/**
 * revoke_lo_before_commit - pack all queued revokes into the log
 * @sdp: the filesystem
 *
 * Writes one descriptor block, then as many continuation (LB) blocks as
 * needed, each packed with big-endian 64-bit revoked block numbers.
 */
static void revoke_lo_before_commit(struct gfs2_sbd *sdp)
{
	struct gfs2_log_descriptor *ld;
	struct gfs2_meta_header *mh;
	struct buffer_head *bh;
	unsigned int offset;
	struct list_head *head = &sdp->sd_log_le_revoke;
	struct gfs2_revoke *rv;

	if (!sdp->sd_log_num_revoke)
		return;

	bh = gfs2_log_get_buf(sdp);
	ld = (struct gfs2_log_descriptor *)bh->b_data;
	ld->ld_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
	ld->ld_header.mh_type = cpu_to_be16(GFS2_METATYPE_LD);
	ld->ld_header.mh_format = cpu_to_be16(GFS2_FORMAT_LD);
	ld->ld_type = cpu_to_be32(GFS2_LOG_DESC_REVOKE);
	/* Total log blocks needed to hold all the revoke records */
	ld->ld_length = cpu_to_be32(gfs2_struct2blk(sdp, sdp->sd_log_num_revoke, sizeof(uint64_t)));
	ld->ld_data1 = cpu_to_be32(sdp->sd_log_num_revoke);
	ld->ld_data2 = cpu_to_be32(0);
	memset(ld->ld_reserved, 0, sizeof(ld->ld_reserved));
	offset = sizeof(struct gfs2_log_descriptor);

	while (!list_empty(head)) {
		rv = list_entry(head->next, struct gfs2_revoke, rv_le.le_list);
		list_del_init(&rv->rv_le.le_list);
		sdp->sd_log_num_revoke--;

		/* Current block is full: write it out and start a
		   continuation block */
		if (offset + sizeof(uint64_t) > sdp->sd_sb.sb_bsize) {
			set_buffer_dirty(bh);
			ll_rw_block(WRITE, 1, &bh);

			bh = gfs2_log_get_buf(sdp);
			mh = (struct gfs2_meta_header *)bh->b_data;
			mh->mh_magic = cpu_to_be32(GFS2_MAGIC);
			mh->mh_type = cpu_to_be16(GFS2_METATYPE_LB);
			mh->mh_format = cpu_to_be16(GFS2_FORMAT_LB);
			offset = sizeof(struct gfs2_meta_header);
		}

		*(__be64 *)(bh->b_data + offset) = cpu_to_be64(rv->rv_blkno);
		kfree(rv);	/* revoke record is consumed */

		offset += sizeof(uint64_t);
	}
	gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke);

	/* Flush the final (possibly partial) block */
	set_buffer_dirty(bh);
	ll_rw_block(WRITE, 1, &bh);
}
319
320static void revoke_lo_before_scan(struct gfs2_jdesc *jd,
321 struct gfs2_log_header *head, int pass)
322{
323 struct gfs2_sbd *sdp = get_v2ip(jd->jd_inode)->i_sbd;
324
325 if (pass != 0)
326 return;
327
328 sdp->sd_found_revokes = 0;
329 sdp->sd_replay_tail = head->lh_tail;
330}
331
/**
 * revoke_lo_scan_elements - collect revoke records during recovery
 * @jd: the journal being recovered
 * @start: log block of the descriptor
 * @ld: the log descriptor
 * @ptr: unused here (revoke data is read from the blocks themselves)
 * @pass: recovery pass (revokes are gathered on pass 0)
 *
 * Returns: errno
 */
static int revoke_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
				   struct gfs2_log_descriptor *ld, __be64 *ptr,
				   int pass)
{
	struct gfs2_sbd *sdp = get_v2ip(jd->jd_inode)->i_sbd;
	unsigned int blks = be32_to_cpu(ld->ld_length);
	unsigned int revokes = be32_to_cpu(ld->ld_data1);
	struct buffer_head *bh;
	unsigned int offset;
	uint64_t blkno;
	int first = 1;
	int error;

	if (pass != 0 || be32_to_cpu(ld->ld_type) != GFS2_LOG_DESC_REVOKE)
		return 0;

	/* The first block starts with the descriptor itself; later
	   (continuation) blocks start with a plain meta header */
	offset = sizeof(struct gfs2_log_descriptor);

	for (; blks; gfs2_replay_incr_blk(sdp, &start), blks--) {
		error = gfs2_replay_read_block(jd, start, &bh);
		if (error)
			return error;

		if (!first)
			gfs2_metatype_check(sdp, bh, GFS2_METATYPE_LB);

		while (offset + sizeof(uint64_t) <= sdp->sd_sb.sb_bsize) {
			blkno = be64_to_cpu(*(__be64 *)(bh->b_data + offset));

			/* gfs2_revoke_add() returns >0 for a new revoke,
			   0 for a duplicate, <0 on error */
			error = gfs2_revoke_add(sdp, blkno, start);
			if (error < 0)
				return error;
			else if (error)
				sdp->sd_found_revokes++;

			if (!--revokes)
				break;
			offset += sizeof(uint64_t);
		}

		brelse(bh);
		offset = sizeof(struct gfs2_meta_header);
		first = 0;
	}

	return 0;
}
379
380static void revoke_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass)
381{
382 struct gfs2_sbd *sdp = get_v2ip(jd->jd_inode)->i_sbd;
383
384 if (error) {
385 gfs2_revoke_clean(sdp);
386 return;
387 }
388 if (pass != 1)
389 return;
390
391 fs_info(sdp, "jid=%u: Found %u revoke tags\n",
392 jd->jd_jid, sdp->sd_found_revokes);
393
394 gfs2_revoke_clean(sdp);
395}
396
/* Queue a resource group's log element onto the log's rgrp list,
   holding its bitmap buffers until rg_lo_after_commit(). */
static void rg_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
{
	struct gfs2_rgrpd *rgd;

	get_transaction->tr_touched = 1;

	if (!list_empty(&le->le_list))
		return;		/* already on sd_log_le_rg */

	rgd = container_of(le, struct gfs2_rgrpd, rd_le);
	/* Keep the rgrp's buffers around while it is on the log list */
	gfs2_rgrp_bh_hold(rgd);

	gfs2_log_lock(sdp);
	sdp->sd_log_num_rg++;
	list_add(&le->le_list, &sdp->sd_log_le_rg);
	gfs2_log_unlock(sdp);
}
414
415static void rg_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
416{
417 struct list_head *head = &sdp->sd_log_le_rg;
418 struct gfs2_rgrpd *rgd;
419
420 while (!list_empty(head)) {
421 rgd = list_entry(head->next, struct gfs2_rgrpd, rd_le.le_list);
422 list_del_init(&rgd->rd_le.le_list);
423 sdp->sd_log_num_rg--;
424
425 gfs2_rgrp_repolish_clones(rgd);
426 gfs2_rgrp_bh_put(rgd);
427 }
428 gfs2_assert_warn(sdp, !sdp->sd_log_num_rg);
429}
430
431/**
432 * databuf_lo_add - Add a databuf to the transaction.
433 *
434 * This is used in two distinct cases:
435 * i) In ordered write mode
436 * We put the data buffer on a list so that we can ensure that its
437 * synced to disk at the right time
438 * ii) In journaled data mode
439 * We need to journal the data block in the same way as metadata in
440 * the functions above. The difference is that here we have a tag
441 * which is two __be64's being the block number (as per meta data)
442 * and a flag which says whether the data block needs escaping or
443 * not. This means we need a new log entry for each 251 or so data
444 * blocks, which isn't an enormous overhead but twice as much as
445 * for normal metadata blocks.
446 */
447static void databuf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
448{
449 struct gfs2_bufdata *bd = container_of(le, struct gfs2_bufdata, bd_le);
450 struct gfs2_trans *tr = get_transaction;
451 struct address_space *mapping = bd->bd_bh->b_page->mapping;
452 struct gfs2_inode *ip = get_v2ip(mapping->host);
453
454 tr->tr_touched = 1;
455 if (!list_empty(&bd->bd_list_tr) &&
456 (ip->i_di.di_flags & GFS2_DIF_JDATA)) {
457 tr->tr_num_buf++;
458 gfs2_trans_add_gl(bd->bd_gl);
459 list_add(&bd->bd_list_tr, &tr->tr_list_buf);
460 gfs2_pin(sdp, bd->bd_bh);
461 }
462 gfs2_log_lock(sdp);
463 if (!list_empty(&le->le_list)) {
464 if (ip->i_di.di_flags & GFS2_DIF_JDATA)
465 sdp->sd_log_num_jdata++;
466 sdp->sd_log_num_databuf++;
467 list_add(&le->le_list, &sdp->sd_log_le_databuf);
468 }
469 gfs2_log_unlock(sdp);
470}
471
472static int gfs2_check_magic(struct buffer_head *bh)
473{
474 struct page *page = bh->b_page;
475 void *kaddr;
476 __be32 *ptr;
477 int rv = 0;
478
479 kaddr = kmap_atomic(page, KM_USER0);
480 ptr = kaddr + bh_offset(bh);
481 if (*ptr == cpu_to_be32(GFS2_MAGIC))
482 rv = 1;
483 kunmap_atomic(page, KM_USER0);
484
485 return rv;
486}
487
/**
 * databuf_lo_before_commit - Scan the data buffers, writing as we go
 *
 * Here we scan through the lists of buffers and make the assumption
 * that any buffer thats been pinned is being journaled, and that
 * any unpinned buffer is an ordered write data buffer and therefore
 * will be written back rather than journaled.
 */
static void databuf_lo_before_commit(struct gfs2_sbd *sdp)
{
	LIST_HEAD(started);
	struct gfs2_bufdata *bd1 = NULL, *bd2, *bdt;
	struct buffer_head *bh = NULL;
	unsigned int offset = sizeof(struct gfs2_log_descriptor);
	struct gfs2_log_descriptor *ld;
	unsigned int limit;
	unsigned int total_dbuf = sdp->sd_log_num_databuf;
	unsigned int total_jdata = sdp->sd_log_num_jdata;
	unsigned int num, n;
	__be64 *ptr = NULL;

	/* Each jdata entry is a pair of __be64s (blkno + escape flag),
	   so align to and divide by pairs */
	offset += (2*sizeof(__be64) - 1);
	offset &= ~(2*sizeof(__be64) - 1);
	limit = (sdp->sd_sb.sb_bsize - offset)/sizeof(__be64);

	/*
	 * Start writing ordered buffers, write journaled buffers
	 * into the log along with a header
	 */
	gfs2_log_lock(sdp);
	/* bd1 walks ahead classifying buffers and filling descriptors;
	   bd2 trails behind writing journaled buffer contents */
	bd2 = bd1 = list_prepare_entry(bd1, &sdp->sd_log_le_databuf, bd_le.le_list);
	while(total_dbuf) {
		num = total_jdata;
		if (num > limit)
			num = limit;
		n = 0;
		list_for_each_entry_safe_continue(bd1, bdt, &sdp->sd_log_le_databuf, bd_le.le_list) {
			/* An ordered write buffer */
			if (bd1->bd_bh && !buffer_pinned(bd1->bd_bh)) {
				list_move(&bd1->bd_le.le_list, &started);
				/* Don't leave bd2 pointing at an entry
				   we just moved off the list */
				if (bd1 == bd2) {
					bd2 = NULL;
					bd2 = list_prepare_entry(bd2, &sdp->sd_log_le_databuf, bd_le.le_list);
				}
				total_dbuf--;
				if (bd1->bd_bh) {
					get_bh(bd1->bd_bh);
					if (buffer_dirty(bd1->bd_bh)) {
						/* NOTE(review): waits on the
						   buffer *before* submitting
						   the write — confirm this
						   ordering is intended */
						gfs2_log_unlock(sdp);
						wait_on_buffer(bd1->bd_bh);
						ll_rw_block(WRITE, 1, &bd1->bd_bh);
						gfs2_log_lock(sdp);
					}
					brelse(bd1->bd_bh);
					continue;
				}
				continue;
			} else if (bd1->bd_bh) { /* A journaled buffer */
				int magic;
				gfs2_log_unlock(sdp);
				/* Lazily start a new descriptor block */
				if (!bh) {
					bh = gfs2_log_get_buf(sdp);
					ld = (struct gfs2_log_descriptor *)bh->b_data;
					ptr = (__be64 *)(bh->b_data + offset);
					ld->ld_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
					ld->ld_header.mh_type = cpu_to_be16(GFS2_METATYPE_LD);
					ld->ld_header.mh_format = cpu_to_be16(GFS2_FORMAT_LD);
					ld->ld_type = cpu_to_be32(GFS2_LOG_DESC_JDATA);
					ld->ld_length = cpu_to_be32(num + 1);
					ld->ld_data1 = cpu_to_be32(num);
					ld->ld_data2 = cpu_to_be32(0);
					memset(ld->ld_reserved, 0, sizeof(ld->ld_reserved));
				}
				/* Tag: block number + escape flag */
				magic = gfs2_check_magic(bd1->bd_bh);
				*ptr++ = cpu_to_be64(bd1->bd_bh->b_blocknr);
				*ptr++ = cpu_to_be64((__u64)magic);
				clear_buffer_escaped(bd1->bd_bh);
				if (unlikely(magic != 0))
					set_buffer_escaped(bd1->bd_bh);
				gfs2_log_lock(sdp);
				/* NOTE(review): "n++ > num" admits one more
				   entry than the "++n >= num" test used for
				   metadata descriptors — confirm intended */
				if (n++ > num)
					break;
			}
		}
		gfs2_log_unlock(sdp);
		if (bh) {
			set_buffer_dirty(bh);
			ll_rw_block(WRITE, 1, &bh);
			bh = NULL;
		}
		n = 0;
		gfs2_log_lock(sdp);
		list_for_each_entry_continue(bd2, &sdp->sd_log_le_databuf, bd_le.le_list) {
			if (!bd2->bd_bh)
				continue;
			/* copy buffer if it needs escaping */
			gfs2_log_unlock(sdp);
			if (unlikely(buffer_escaped(bd2->bd_bh))) {
				void *kaddr;
				struct page *page = bd2->bd_bh->b_page;
				bh = gfs2_log_get_buf(sdp);
				kaddr = kmap_atomic(page, KM_USER0);
				memcpy(bh->b_data, kaddr + bh_offset(bd2->bd_bh), sdp->sd_sb.sb_bsize);
				kunmap_atomic(page, KM_USER0);
				/* Escape: zero the leading magic; restored
				   on replay in databuf_lo_scan_elements() */
				*(__be32 *)bh->b_data = 0;
			} else {
				bh = gfs2_log_fake_buf(sdp, bd2->bd_bh);
			}
			set_buffer_dirty(bh);
			ll_rw_block(WRITE, 1, &bh);
			gfs2_log_lock(sdp);
			if (++n >= num)
				break;
		}
		bh = NULL;
		total_dbuf -= num;
		total_jdata -= num;
	}
	gfs2_log_unlock(sdp);

	/* Wait on all ordered buffers */
	while (!list_empty(&started)) {
		gfs2_log_lock(sdp);
		bd1 = list_entry(started.next, struct gfs2_bufdata, bd_le.le_list);
		list_del(&bd1->bd_le.le_list);
		sdp->sd_log_num_databuf--;

		bh = bd1->bd_bh;
		if (bh) {
			set_v2bd(bh, NULL);
			gfs2_log_unlock(sdp);
			wait_on_buffer(bh);
			brelse(bh);
		} else
			gfs2_log_unlock(sdp);

		kfree(bd1);
	}

	/* We've removed all the ordered write bufs here, so only jdata left */
	gfs2_assert_warn(sdp, sdp->sd_log_num_databuf == sdp->sd_log_num_jdata);
}
630
/**
 * databuf_lo_scan_elements - replay journaled data blocks
 * @jd: the journal being recovered
 * @start: log block of the descriptor
 * @ld: the log descriptor
 * @ptr: array of (block number, escape flag) __be64 pairs
 * @pass: recovery pass (data is replayed on pass 1)
 *
 * Returns: errno
 */
static int databuf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
				    struct gfs2_log_descriptor *ld,
				    __be64 *ptr, int pass)
{
	struct gfs2_sbd *sdp = get_v2ip(jd->jd_inode)->i_sbd;
	struct gfs2_glock *gl = get_v2ip(jd->jd_inode)->i_gl;
	unsigned int blks = be32_to_cpu(ld->ld_data1);
	struct buffer_head *bh_log, *bh_ip;
	uint64_t blkno;
	uint64_t esc;
	int error = 0;

	if (pass != 1 || be32_to_cpu(ld->ld_type) != GFS2_LOG_DESC_JDATA)
		return 0;

	/* Step past the descriptor block itself */
	gfs2_replay_incr_blk(sdp, &start);
	for (; blks; gfs2_replay_incr_blk(sdp, &start), blks--) {
		blkno = be64_to_cpu(*ptr++);
		esc = be64_to_cpu(*ptr++);

		sdp->sd_found_blocks++;

		/* Skip blocks revoked after this log entry */
		if (gfs2_revoke_check(sdp, blkno, start))
			continue;

		error = gfs2_replay_read_block(jd, start, &bh_log);
		if (error)
			return error;

		bh_ip = gfs2_meta_new(gl, blkno);
		memcpy(bh_ip->b_data, bh_log->b_data, bh_log->b_size);

		/* Unescape: restore the magic zeroed at commit time */
		if (esc) {
			__be32 *eptr = (__be32 *)bh_ip->b_data;
			*eptr = cpu_to_be32(GFS2_MAGIC);
		}
		mark_buffer_dirty(bh_ip);

		brelse(bh_log);
		brelse(bh_ip);
		if (error)
			break;

		sdp->sd_replayed_blocks++;
	}

	return error;
}
680
681/* FIXME: sort out accounting for log blocks etc. */
682
683static void databuf_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass)
684{
685 struct gfs2_sbd *sdp = get_v2ip(jd->jd_inode)->i_sbd;
686
687 if (error) {
688 gfs2_meta_sync(get_v2ip(jd->jd_inode)->i_gl, DIO_START | DIO_WAIT);
689 return;
690 }
691 if (pass != 1)
692 return;
693
694 /* data sync? */
695 gfs2_meta_sync(get_v2ip(jd->jd_inode)->i_gl, DIO_START | DIO_WAIT);
696
697 fs_info(sdp, "jid=%u: Replayed %u of %u data blocks\n",
698 jd->jd_jid, sdp->sd_replayed_blocks, sdp->sd_found_blocks);
699}
700
701static void databuf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
702{
703 struct list_head *head = &sdp->sd_log_le_databuf;
704 struct gfs2_bufdata *bd;
705
706 while (!list_empty(head)) {
707 bd = list_entry(head->next, struct gfs2_bufdata, bd_le.le_list);
708 list_del(&bd->bd_le.le_list);
709 sdp->sd_log_num_databuf--;
710 sdp->sd_log_num_jdata--;
711 gfs2_unpin(sdp, bd->bd_bh, ai);
712 }
713 gfs2_assert_warn(sdp, !sdp->sd_log_num_databuf);
714 gfs2_assert_warn(sdp, !sdp->sd_log_num_jdata);
715}
716

/* Log element operations for glocks: added/released around commits,
   never written to the journal themselves */
struct gfs2_log_operations gfs2_glock_lops = {
	.lo_add = glock_lo_add,
	.lo_after_commit = glock_lo_after_commit,
	.lo_name = "glock"
};

/* Journaled metadata buffers: full commit and recovery-scan support */
struct gfs2_log_operations gfs2_buf_lops = {
	.lo_add = buf_lo_add,
	.lo_incore_commit = buf_lo_incore_commit,
	.lo_before_commit = buf_lo_before_commit,
	.lo_after_commit = buf_lo_after_commit,
	.lo_before_scan = buf_lo_before_scan,
	.lo_scan_elements = buf_lo_scan_elements,
	.lo_after_scan = buf_lo_after_scan,
	.lo_name = "buf"
};

/* Revoke records: written at commit, consumed during recovery pass 0 */
struct gfs2_log_operations gfs2_revoke_lops = {
	.lo_add = revoke_lo_add,
	.lo_before_commit = revoke_lo_before_commit,
	.lo_before_scan = revoke_lo_before_scan,
	.lo_scan_elements = revoke_lo_scan_elements,
	.lo_after_scan = revoke_lo_after_scan,
	.lo_name = "revoke"
};

/* Resource groups: held across a commit, nothing journaled directly */
struct gfs2_log_operations gfs2_rg_lops = {
	.lo_add = rg_lo_add,
	.lo_after_commit = rg_lo_after_commit,
	.lo_name = "rg"
};

/* Data buffers: ordered writes and journaled (jdata) blocks; shares
   incore_commit with the metadata buffer ops */
struct gfs2_log_operations gfs2_databuf_lops = {
	.lo_add = databuf_lo_add,
	.lo_incore_commit = buf_lo_incore_commit,
	.lo_before_commit = databuf_lo_before_commit,
	.lo_after_commit = databuf_lo_after_commit,
	.lo_scan_elements = databuf_lo_scan_elements,
	.lo_after_scan = databuf_lo_after_scan,
	.lo_name = "databuf"
};

/* NULL-terminated table iterated by the lops_* dispatchers in lops.h */
struct gfs2_log_operations *gfs2_log_ops[] = {
	&gfs2_glock_lops,
	&gfs2_buf_lops,
	&gfs2_revoke_lops,
	&gfs2_rg_lops,
	&gfs2_databuf_lops,
	NULL
};

768
diff --git a/fs/gfs2/lops.h b/fs/gfs2/lops.h
new file mode 100644
index 000000000000..417f5aade4b1
--- /dev/null
+++ b/fs/gfs2/lops.h
@@ -0,0 +1,96 @@
/*
 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
 *
 * This copyrighted material is made available to anyone wishing to use,
 * modify, copy, or redistribute it subject to the terms and conditions
 * of the GNU General Public License v.2.
 */

#ifndef __LOPS_DOT_H__
#define __LOPS_DOT_H__

/* Per-element-type log operation vectors, defined in lops.c */
extern struct gfs2_log_operations gfs2_glock_lops;
extern struct gfs2_log_operations gfs2_buf_lops;
extern struct gfs2_log_operations gfs2_revoke_lops;
extern struct gfs2_log_operations gfs2_rg_lops;
extern struct gfs2_log_operations gfs2_databuf_lops;

/* NULL-terminated table of all of the above */
extern struct gfs2_log_operations *gfs2_log_ops[];

/* Bind a log element to its operations vector */
static inline void lops_init_le(struct gfs2_log_element *le,
				struct gfs2_log_operations *lops)
{
	INIT_LIST_HEAD(&le->le_list);
	le->le_ops = lops;
}

/* Dispatch a single element to its own lo_add hook (if any) */
static inline void lops_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
{
	if (le->le_ops->lo_add)
		le->le_ops->lo_add(sdp, le);
}

/* The remaining dispatchers fan out to every registered operations
   vector in gfs2_log_ops[], skipping vectors without the hook. */

static inline void lops_incore_commit(struct gfs2_sbd *sdp,
				      struct gfs2_trans *tr)
{
	int x;
	for (x = 0; gfs2_log_ops[x]; x++)
		if (gfs2_log_ops[x]->lo_incore_commit)
			gfs2_log_ops[x]->lo_incore_commit(sdp, tr);
}

static inline void lops_before_commit(struct gfs2_sbd *sdp)
{
	int x;
	for (x = 0; gfs2_log_ops[x]; x++)
		if (gfs2_log_ops[x]->lo_before_commit)
			gfs2_log_ops[x]->lo_before_commit(sdp);
}

static inline void lops_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
{
	int x;
	for (x = 0; gfs2_log_ops[x]; x++)
		if (gfs2_log_ops[x]->lo_after_commit)
			gfs2_log_ops[x]->lo_after_commit(sdp, ai);
}

static inline void lops_before_scan(struct gfs2_jdesc *jd,
				    struct gfs2_log_header *head,
				    unsigned int pass)
{
	int x;
	for (x = 0; gfs2_log_ops[x]; x++)
		if (gfs2_log_ops[x]->lo_before_scan)
			gfs2_log_ops[x]->lo_before_scan(jd, head, pass);
}

/* Returns the first error from any lo_scan_elements hook, else 0 */
static inline int lops_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
				     struct gfs2_log_descriptor *ld,
				     __be64 *ptr,
				     unsigned int pass)
{
	int x, error;
	for (x = 0; gfs2_log_ops[x]; x++)
		if (gfs2_log_ops[x]->lo_scan_elements) {
			error = gfs2_log_ops[x]->lo_scan_elements(jd, start,
								  ld, ptr, pass);
			if (error)
				return error;
		}

	return 0;
}
85
86static inline void lops_after_scan(struct gfs2_jdesc *jd, int error,
87 unsigned int pass)
88{
89 int x;
90 for (x = 0; gfs2_log_ops[x]; x++)
91 if (gfs2_log_ops[x]->lo_before_scan)
92 gfs2_log_ops[x]->lo_after_scan(jd, error, pass);
93}
94
95#endif /* __LOPS_DOT_H__ */
96
diff --git a/fs/gfs2/lvb.c b/fs/gfs2/lvb.c
new file mode 100644
index 000000000000..8af62568a3f4
--- /dev/null
+++ b/fs/gfs2/lvb.c
@@ -0,0 +1,48 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <asm/semaphore.h>
16
17#include "gfs2.h"
18
19#define pv(struct, member, fmt) printk(" "#member" = "fmt"\n", struct->member);
20
21void gfs2_quota_lvb_in(struct gfs2_quota_lvb *qb, char *lvb)
22{
23 struct gfs2_quota_lvb *str = (struct gfs2_quota_lvb *)lvb;
24
25 qb->qb_magic = be32_to_cpu(str->qb_magic);
26 qb->qb_limit = be64_to_cpu(str->qb_limit);
27 qb->qb_warn = be64_to_cpu(str->qb_warn);
28 qb->qb_value = be64_to_cpu(str->qb_value);
29}
30
31void gfs2_quota_lvb_out(struct gfs2_quota_lvb *qb, char *lvb)
32{
33 struct gfs2_quota_lvb *str = (struct gfs2_quota_lvb *)lvb;
34
35 str->qb_magic = cpu_to_be32(qb->qb_magic);
36 str->qb_limit = cpu_to_be64(qb->qb_limit);
37 str->qb_warn = cpu_to_be64(qb->qb_warn);
38 str->qb_value = cpu_to_be64(qb->qb_value);
39}
40
41void gfs2_quota_lvb_print(struct gfs2_quota_lvb *qb)
42{
43 pv(qb, qb_magic, "%u");
44 pv(qb, qb_limit, "%llu");
45 pv(qb, qb_warn, "%llu");
46 pv(qb, qb_value, "%lld");
47}
48
diff --git a/fs/gfs2/lvb.h b/fs/gfs2/lvb.h
new file mode 100644
index 000000000000..ca9732b2d9f4
--- /dev/null
+++ b/fs/gfs2/lvb.h
@@ -0,0 +1,28 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __LVB_DOT_H__
11#define __LVB_DOT_H__
12
13#define GFS2_MIN_LVB_SIZE 32
14
15struct gfs2_quota_lvb {
16 uint32_t qb_magic;
17 uint32_t __pad;
18 uint64_t qb_limit; /* Hard limit of # blocks to alloc */
19 uint64_t qb_warn; /* Warn user when alloc is above this # */
20 int64_t qb_value; /* Current # blocks allocated */
21};
22
23void gfs2_quota_lvb_in(struct gfs2_quota_lvb *qb, char *lvb);
24void gfs2_quota_lvb_out(struct gfs2_quota_lvb *qb, char *lvb);
25void gfs2_quota_lvb_print(struct gfs2_quota_lvb *qb);
26
27#endif /* __LVB_DOT_H__ */
28
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
new file mode 100644
index 000000000000..0c60f2b10fdd
--- /dev/null
+++ b/fs/gfs2/main.c
@@ -0,0 +1,103 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/module.h>
16#include <linux/init.h>
17#include <asm/semaphore.h>
18
19#include "gfs2.h"
20#include "ops_fstype.h"
21#include "sys.h"
22
23/**
24 * init_gfs2_fs - Register GFS2 as a filesystem
25 *
26 * Returns: 0 on success, error code on failure
27 */
28
29static int __init init_gfs2_fs(void)
30{
31 int error;
32
33 gfs2_init_lmh();
34
35 error = gfs2_sys_init();
36 if (error)
37 return error;
38
39 error = -ENOMEM;
40
41 gfs2_glock_cachep = kmem_cache_create("gfs2_glock",
42 sizeof(struct gfs2_glock),
43 0, 0, NULL, NULL);
44 if (!gfs2_glock_cachep)
45 goto fail;
46
47 gfs2_inode_cachep = kmem_cache_create("gfs2_inode",
48 sizeof(struct gfs2_inode),
49 0, 0, NULL, NULL);
50 if (!gfs2_inode_cachep)
51 goto fail;
52
53 gfs2_bufdata_cachep = kmem_cache_create("gfs2_bufdata",
54 sizeof(struct gfs2_bufdata),
55 0, 0, NULL, NULL);
56 if (!gfs2_bufdata_cachep)
57 goto fail;
58
59 error = register_filesystem(&gfs2_fs_type);
60 if (error)
61 goto fail;
62
63 printk("GFS2 (built %s %s) installed\n", __DATE__, __TIME__);
64
65 return 0;
66
67 fail:
68 if (gfs2_bufdata_cachep)
69 kmem_cache_destroy(gfs2_bufdata_cachep);
70
71 if (gfs2_inode_cachep)
72 kmem_cache_destroy(gfs2_inode_cachep);
73
74 if (gfs2_glock_cachep)
75 kmem_cache_destroy(gfs2_glock_cachep);
76
77 gfs2_sys_uninit();
78 return error;
79}
80
/**
 * exit_gfs2_fs - Unregister the file system
 *
 * Reverses init_gfs2_fs(): drops the filesystem registration, destroys
 * the slab caches and removes the sysfs entries.
 */

static void __exit exit_gfs2_fs(void)
{
	unregister_filesystem(&gfs2_fs_type);

	kmem_cache_destroy(gfs2_bufdata_cachep);
	kmem_cache_destroy(gfs2_inode_cachep);
	kmem_cache_destroy(gfs2_glock_cachep);

	gfs2_sys_uninit();
}

MODULE_DESCRIPTION("Global File System");
MODULE_AUTHOR("Red Hat, Inc.");
MODULE_LICENSE("GPL");

module_init(init_gfs2_fs);
module_exit(exit_gfs2_fs);

103
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
new file mode 100644
index 000000000000..8fba84306755
--- /dev/null
+++ b/fs/gfs2/meta_io.c
@@ -0,0 +1,883 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/mm.h>
16#include <linux/pagemap.h>
17#include <linux/writeback.h>
18#include <linux/swap.h>
19#include <linux/delay.h>
20#include <asm/semaphore.h>
21
22#include "gfs2.h"
23#include "glock.h"
24#include "glops.h"
25#include "inode.h"
26#include "log.h"
27#include "lops.h"
28#include "meta_io.h"
29#include "rgrp.h"
30#include "trans.h"
31
32#define buffer_busy(bh) \
33((bh)->b_state & ((1ul << BH_Dirty) | (1ul << BH_Lock) | (1ul << BH_Pinned)))
34#define buffer_in_io(bh) \
35((bh)->b_state & ((1ul << BH_Dirty) | (1ul << BH_Lock)))
36
37static int aspace_get_block(struct inode *inode, sector_t lblock,
38 struct buffer_head *bh_result, int create)
39{
40 gfs2_assert_warn(get_v2sdp(inode->i_sb), 0);
41 return -EOPNOTSUPP;
42}
43
44static int gfs2_aspace_writepage(struct page *page,
45 struct writeback_control *wbc)
46{
47 return block_write_full_page(page, aspace_get_block, wbc);
48}
49
50/**
51 * stuck_releasepage - We're stuck in gfs2_releasepage(). Print stuff out.
52 * @bh: the buffer we're stuck on
53 *
54 */
55
56static void stuck_releasepage(struct buffer_head *bh)
57{
58 struct gfs2_sbd *sdp = get_v2sdp(bh->b_page->mapping->host->i_sb);
59 struct gfs2_bufdata *bd = get_v2bd(bh);
60 struct gfs2_glock *gl;
61
62 fs_warn(sdp, "stuck in gfs2_releasepage()\n");
63 fs_warn(sdp, "blkno = %llu, bh->b_count = %d\n",
64 (uint64_t)bh->b_blocknr, atomic_read(&bh->b_count));
65 fs_warn(sdp, "pinned = %u\n", buffer_pinned(bh));
66 fs_warn(sdp, "get_v2bd(bh) = %s\n", (bd) ? "!NULL" : "NULL");
67
68 if (!bd)
69 return;
70
71 gl = bd->bd_gl;
72
73 fs_warn(sdp, "gl = (%u, %llu)\n",
74 gl->gl_name.ln_type, gl->gl_name.ln_number);
75
76 fs_warn(sdp, "bd_list_tr = %s, bd_le.le_list = %s\n",
77 (list_empty(&bd->bd_list_tr)) ? "no" : "yes",
78 (list_empty(&bd->bd_le.le_list)) ? "no" : "yes");
79
80 if (gl->gl_ops == &gfs2_inode_glops) {
81 struct gfs2_inode *ip = get_gl2ip(gl);
82 unsigned int x;
83
84 if (!ip)
85 return;
86
87 fs_warn(sdp, "ip = %llu %llu\n",
88 ip->i_num.no_formal_ino, ip->i_num.no_addr);
89 fs_warn(sdp, "ip->i_count = %d, ip->i_vnode = %s\n",
90 atomic_read(&ip->i_count),
91 (ip->i_vnode) ? "!NULL" : "NULL");
92
93 for (x = 0; x < GFS2_MAX_META_HEIGHT; x++)
94 fs_warn(sdp, "ip->i_cache[%u] = %s\n",
95 x, (ip->i_cache[x]) ? "!NULL" : "NULL");
96 }
97}
98
99/**
100 * gfs2_aspace_releasepage - free the metadata associated with a page
101 * @page: the page that's being released
102 * @gfp_mask: passed from Linux VFS, ignored by us
103 *
104 * Call try_to_free_buffers() if the buffers in this page can be
105 * released.
106 *
107 * Returns: 0
108 */
109
110static int gfs2_aspace_releasepage(struct page *page, gfp_t gfp_mask)
111{
112 struct inode *aspace = page->mapping->host;
113 struct gfs2_sbd *sdp = get_v2sdp(aspace->i_sb);
114 struct buffer_head *bh, *head;
115 struct gfs2_bufdata *bd;
116 unsigned long t;
117
118 if (!page_has_buffers(page))
119 goto out;
120
121 head = bh = page_buffers(page);
122 do {
123 t = jiffies;
124
125 while (atomic_read(&bh->b_count)) {
126 if (atomic_read(&aspace->i_writecount)) {
127 if (time_after_eq(jiffies, t +
128 gfs2_tune_get(sdp, gt_stall_secs) * HZ)) {
129 stuck_releasepage(bh);
130 t = jiffies;
131 }
132
133 yield();
134 continue;
135 }
136
137 return 0;
138 }
139
140 gfs2_assert_warn(sdp, !buffer_pinned(bh));
141
142 bd = get_v2bd(bh);
143 if (bd) {
144 gfs2_assert_warn(sdp, bd->bd_bh == bh);
145 gfs2_assert_warn(sdp, list_empty(&bd->bd_list_tr));
146 gfs2_assert_warn(sdp, list_empty(&bd->bd_le.le_list));
147 gfs2_assert_warn(sdp, !bd->bd_ail);
148 kmem_cache_free(gfs2_bufdata_cachep, bd);
149 atomic_dec(&sdp->sd_bufdata_count);
150 set_v2bd(bh, NULL);
151 }
152
153 bh = bh->b_this_page;
154 }
155 while (bh != head);
156
157 out:
158 return try_to_free_buffers(page);
159}
160
/* Minimal address-space ops for the per-glock metadata aspace:
   buffer-head writeback plus careful release of attached bufdata. */
static struct address_space_operations aspace_aops = {
	.writepage = gfs2_aspace_writepage,
	.releasepage = gfs2_aspace_releasepage,
};
165
166/**
167 * gfs2_aspace_get - Create and initialize a struct inode structure
168 * @sdp: the filesystem the aspace is in
169 *
170 * Right now a struct inode is just a struct inode. Maybe Linux
171 * will supply a more lightweight address space construct (that works)
172 * in the future.
173 *
174 * Make sure pages/buffers in this aspace aren't in high memory.
175 *
176 * Returns: the aspace
177 */
178
179struct inode *gfs2_aspace_get(struct gfs2_sbd *sdp)
180{
181 struct inode *aspace;
182
183 aspace = new_inode(sdp->sd_vfs);
184 if (aspace) {
185 mapping_set_gfp_mask(aspace->i_mapping, GFP_KERNEL);
186 aspace->i_mapping->a_ops = &aspace_aops;
187 aspace->i_size = ~0ULL;
188 set_v2ip(aspace, NULL);
189 insert_inode_hash(aspace);
190 }
191
192 return aspace;
193}
194
/* Release an aspace inode: unhash it first so it cannot be found
   again, then drop the last reference. */
void gfs2_aspace_put(struct inode *aspace)
{
	remove_inode_hash(aspace);
	iput(aspace);
}
200
201/**
202 * gfs2_ail1_start_one - Start I/O on a part of the AIL
203 * @sdp: the filesystem
204 * @tr: the part of the AIL
205 *
206 */
207
208void gfs2_ail1_start_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
209{
210 struct gfs2_bufdata *bd, *s;
211 struct buffer_head *bh;
212 int retry;
213
214 do {
215 retry = 0;
216
217 list_for_each_entry_safe_reverse(bd, s, &ai->ai_ail1_list,
218 bd_ail_st_list) {
219 bh = bd->bd_bh;
220
221 gfs2_assert(sdp, bd->bd_ail == ai);
222
223 if (!buffer_busy(bh)) {
224 if (!buffer_uptodate(bh))
225 gfs2_io_error_bh(sdp, bh);
226 list_move(&bd->bd_ail_st_list,
227 &ai->ai_ail2_list);
228 continue;
229 }
230
231 if (!buffer_dirty(bh))
232 continue;
233
234 list_move(&bd->bd_ail_st_list, &ai->ai_ail1_list);
235
236 gfs2_log_unlock(sdp);
237 wait_on_buffer(bh);
238 ll_rw_block(WRITE, 1, &bh);
239 gfs2_log_lock(sdp);
240
241 retry = 1;
242 break;
243 }
244 } while (retry);
245}
246
247/**
248 * gfs2_ail1_empty_one - Check whether or not a trans in the AIL has been synced
249 * @sdp: the filesystem
250 * @ai: the AIL entry
251 *
252 */
253
254int gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai, int flags)
255{
256 struct gfs2_bufdata *bd, *s;
257 struct buffer_head *bh;
258
259 list_for_each_entry_safe_reverse(bd, s, &ai->ai_ail1_list,
260 bd_ail_st_list) {
261 bh = bd->bd_bh;
262
263 gfs2_assert(sdp, bd->bd_ail == ai);
264
265 if (buffer_busy(bh)) {
266 if (flags & DIO_ALL)
267 continue;
268 else
269 break;
270 }
271
272 if (!buffer_uptodate(bh))
273 gfs2_io_error_bh(sdp, bh);
274
275 list_move(&bd->bd_ail_st_list, &ai->ai_ail2_list);
276 }
277
278 return list_empty(&ai->ai_ail1_list);
279}
280
281/**
282 * gfs2_ail2_empty_one - Check whether or not a trans in the AIL has been synced
283 * @sdp: the filesystem
284 * @ai: the AIL entry
285 *
286 */
287
288void gfs2_ail2_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
289{
290 struct list_head *head = &ai->ai_ail2_list;
291 struct gfs2_bufdata *bd;
292
293 while (!list_empty(head)) {
294 bd = list_entry(head->prev, struct gfs2_bufdata,
295 bd_ail_st_list);
296 gfs2_assert(sdp, bd->bd_ail == ai);
297 bd->bd_ail = NULL;
298 list_del(&bd->bd_ail_st_list);
299 list_del(&bd->bd_ail_gl_list);
300 atomic_dec(&bd->bd_gl->gl_ail_count);
301 brelse(bd->bd_bh);
302 }
303}
304
305/**
306 * ail_empty_gl - remove all buffers for a given lock from the AIL
307 * @gl: the glock
308 *
309 * None of the buffers should be dirty, locked, or pinned.
310 */
311
312void gfs2_ail_empty_gl(struct gfs2_glock *gl)
313{
314 struct gfs2_sbd *sdp = gl->gl_sbd;
315 unsigned int blocks;
316 struct list_head *head = &gl->gl_ail_list;
317 struct gfs2_bufdata *bd;
318 struct buffer_head *bh;
319 uint64_t blkno;
320 int error;
321
322 blocks = atomic_read(&gl->gl_ail_count);
323 if (!blocks)
324 return;
325
326 error = gfs2_trans_begin(sdp, 0, blocks);
327 if (gfs2_assert_withdraw(sdp, !error))
328 return;
329
330 gfs2_log_lock(sdp);
331 while (!list_empty(head)) {
332 bd = list_entry(head->next, struct gfs2_bufdata,
333 bd_ail_gl_list);
334 bh = bd->bd_bh;
335 blkno = bh->b_blocknr;
336 gfs2_assert_withdraw(sdp, !buffer_busy(bh));
337
338 bd->bd_ail = NULL;
339 list_del(&bd->bd_ail_st_list);
340 list_del(&bd->bd_ail_gl_list);
341 atomic_dec(&gl->gl_ail_count);
342 brelse(bh);
343 gfs2_log_unlock(sdp);
344
345 gfs2_trans_add_revoke(sdp, blkno);
346
347 gfs2_log_lock(sdp);
348 }
349 gfs2_assert_withdraw(sdp, !atomic_read(&gl->gl_ail_count));
350 gfs2_log_unlock(sdp);
351
352 gfs2_trans_end(sdp);
353 gfs2_log_flush(sdp);
354}
355
356/**
357 * gfs2_meta_inval - Invalidate all buffers associated with a glock
358 * @gl: the glock
359 *
360 */
361
362void gfs2_meta_inval(struct gfs2_glock *gl)
363{
364 struct gfs2_sbd *sdp = gl->gl_sbd;
365 struct inode *aspace = gl->gl_aspace;
366 struct address_space *mapping = gl->gl_aspace->i_mapping;
367
368 gfs2_assert_withdraw(sdp, !atomic_read(&gl->gl_ail_count));
369
370 atomic_inc(&aspace->i_writecount);
371 truncate_inode_pages(mapping, 0);
372 atomic_dec(&aspace->i_writecount);
373
374 gfs2_assert_withdraw(sdp, !mapping->nrpages);
375}
376
377/**
378 * gfs2_meta_sync - Sync all buffers associated with a glock
379 * @gl: The glock
380 * @flags: DIO_START | DIO_WAIT
381 *
382 */
383
384void gfs2_meta_sync(struct gfs2_glock *gl, int flags)
385{
386 struct address_space *mapping = gl->gl_aspace->i_mapping;
387 int error = 0;
388
389 if (flags & DIO_START)
390 filemap_fdatawrite(mapping);
391 if (!error && (flags & DIO_WAIT))
392 error = filemap_fdatawait(mapping);
393
394 if (error)
395 gfs2_io_error(gl->gl_sbd);
396}
397
398/**
399 * getbuf - Get a buffer with a given address space
400 * @sdp: the filesystem
401 * @aspace: the address space
402 * @blkno: the block number (filesystem scope)
403 * @create: 1 if the buffer should be created
404 *
405 * Returns: the buffer
406 */
407
408static struct buffer_head *getbuf(struct gfs2_sbd *sdp, struct inode *aspace,
409 uint64_t blkno, int create)
410{
411 struct page *page;
412 struct buffer_head *bh;
413 unsigned int shift;
414 unsigned long index;
415 unsigned int bufnum;
416
417 shift = PAGE_CACHE_SHIFT - sdp->sd_sb.sb_bsize_shift;
418 index = blkno >> shift; /* convert block to page */
419 bufnum = blkno - (index << shift); /* block buf index within page */
420
421 if (create) {
422 for (;;) {
423 page = grab_cache_page(aspace->i_mapping, index);
424 if (page)
425 break;
426 yield();
427 }
428 } else {
429 page = find_lock_page(aspace->i_mapping, index);
430 if (!page)
431 return NULL;
432 }
433
434 if (!page_has_buffers(page))
435 create_empty_buffers(page, sdp->sd_sb.sb_bsize, 0);
436
437 /* Locate header for our buffer within our page */
438 for (bh = page_buffers(page); bufnum--; bh = bh->b_this_page)
439 /* Do nothing */;
440 get_bh(bh);
441
442 if (!buffer_mapped(bh))
443 map_bh(bh, sdp->sd_vfs, blkno);
444
445 unlock_page(page);
446 mark_page_accessed(page);
447 page_cache_release(page);
448
449 return bh;
450}
451
452static void meta_prep_new(struct buffer_head *bh)
453{
454 struct gfs2_meta_header *mh = (struct gfs2_meta_header *)bh->b_data;
455
456 lock_buffer(bh);
457 clear_buffer_dirty(bh);
458 set_buffer_uptodate(bh);
459 unlock_buffer(bh);
460
461 mh->mh_magic = cpu_to_be32(GFS2_MAGIC);
462}
463
464/**
465 * gfs2_meta_new - Get a block
466 * @gl: The glock associated with this block
467 * @blkno: The block number
468 *
469 * Returns: The buffer
470 */
471
472struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, uint64_t blkno)
473{
474 struct buffer_head *bh;
475 bh = getbuf(gl->gl_sbd, gl->gl_aspace, blkno, CREATE);
476 meta_prep_new(bh);
477 return bh;
478}
479
480/**
481 * gfs2_meta_read - Read a block from disk
482 * @gl: The glock covering the block
483 * @blkno: The block number
484 * @flags: flags to gfs2_dreread()
485 * @bhp: the place where the buffer is returned (NULL on failure)
486 *
487 * Returns: errno
488 */
489
490int gfs2_meta_read(struct gfs2_glock *gl, uint64_t blkno, int flags,
491 struct buffer_head **bhp)
492{
493 int error;
494
495 *bhp = getbuf(gl->gl_sbd, gl->gl_aspace, blkno, CREATE);
496 error = gfs2_meta_reread(gl->gl_sbd, *bhp, flags);
497 if (error)
498 brelse(*bhp);
499
500 return error;
501}
502
503/**
504 * gfs2_meta_reread - Reread a block from disk
505 * @sdp: the filesystem
506 * @bh: The block to read
507 * @flags: Flags that control the read
508 *
509 * Returns: errno
510 */
511
512int gfs2_meta_reread(struct gfs2_sbd *sdp, struct buffer_head *bh, int flags)
513{
514 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
515 return -EIO;
516
517 if (flags & DIO_FORCE)
518 clear_buffer_uptodate(bh);
519
520 if ((flags & DIO_START) && !buffer_uptodate(bh))
521 ll_rw_block(READ, 1, &bh);
522
523 if (flags & DIO_WAIT) {
524 wait_on_buffer(bh);
525
526 if (!buffer_uptodate(bh)) {
527 struct gfs2_trans *tr = get_transaction;
528 if (tr && tr->tr_touched)
529 gfs2_io_error_bh(sdp, bh);
530 return -EIO;
531 }
532 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
533 return -EIO;
534 }
535
536 return 0;
537}
538
539/**
540 * gfs2_attach_bufdata - attach a struct gfs2_bufdata structure to a buffer
541 * @gl: the glock the buffer belongs to
542 * @bh: The buffer to be attached to
543 * @meta: Flag to indicate whether its metadata or not
544 */
545
546void gfs2_attach_bufdata(struct gfs2_glock *gl, struct buffer_head *bh, int meta)
547{
548 struct gfs2_bufdata *bd;
549
550 if (meta)
551 lock_page(bh->b_page);
552
553 if (get_v2bd(bh)) {
554 if (meta)
555 unlock_page(bh->b_page);
556 return;
557 }
558
559 bd = kmem_cache_alloc(gfs2_bufdata_cachep, GFP_NOFS | __GFP_NOFAIL),
560 atomic_inc(&gl->gl_sbd->sd_bufdata_count);
561
562 memset(bd, 0, sizeof(struct gfs2_bufdata));
563
564 bd->bd_bh = bh;
565 bd->bd_gl = gl;
566
567 INIT_LIST_HEAD(&bd->bd_list_tr);
568 if (meta) {
569 lops_init_le(&bd->bd_le, &gfs2_buf_lops);
570 } else {
571 lops_init_le(&bd->bd_le, &gfs2_databuf_lops);
572 get_bh(bh);
573 }
574 set_v2bd(bh, bd);
575
576 if (meta)
577 unlock_page(bh->b_page);
578}
579
580/**
581 * gfs2_pin - Pin a buffer in memory
582 * @sdp: the filesystem the buffer belongs to
583 * @bh: The buffer to be pinned
584 *
585 */
586
587void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh)
588{
589 struct gfs2_bufdata *bd = get_v2bd(bh);
590
591 gfs2_assert_withdraw(sdp, test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags));
592
593 if (test_set_buffer_pinned(bh))
594 gfs2_assert_withdraw(sdp, 0);
595
596 wait_on_buffer(bh);
597
598 /* If this buffer is in the AIL and it has already been written
599 to in-place disk block, remove it from the AIL. */
600
601 gfs2_log_lock(sdp);
602 if (bd->bd_ail && !buffer_in_io(bh))
603 list_move(&bd->bd_ail_st_list, &bd->bd_ail->ai_ail2_list);
604 gfs2_log_unlock(sdp);
605
606 clear_buffer_dirty(bh);
607 wait_on_buffer(bh);
608
609 if (!buffer_uptodate(bh))
610 gfs2_io_error_bh(sdp, bh);
611
612 get_bh(bh);
613}
614
615/**
616 * gfs2_unpin - Unpin a buffer
617 * @sdp: the filesystem the buffer belongs to
618 * @bh: The buffer to unpin
619 * @ai:
620 *
621 */
622
623void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh,
624 struct gfs2_ail *ai)
625{
626 struct gfs2_bufdata *bd = get_v2bd(bh);
627
628 gfs2_assert_withdraw(sdp, buffer_uptodate(bh));
629
630 if (!buffer_pinned(bh))
631 gfs2_assert_withdraw(sdp, 0);
632
633 mark_buffer_dirty(bh);
634 clear_buffer_pinned(bh);
635
636 gfs2_log_lock(sdp);
637 if (bd->bd_ail) {
638 list_del(&bd->bd_ail_st_list);
639 brelse(bh);
640 } else {
641 struct gfs2_glock *gl = bd->bd_gl;
642 list_add(&bd->bd_ail_gl_list, &gl->gl_ail_list);
643 atomic_inc(&gl->gl_ail_count);
644 }
645 bd->bd_ail = ai;
646 list_add(&bd->bd_ail_st_list, &ai->ai_ail1_list);
647 gfs2_log_unlock(sdp);
648}
649
650/**
651 * gfs2_meta_wipe - make inode's buffers so they aren't dirty/pinned anymore
652 * @ip: the inode who owns the buffers
653 * @bstart: the first buffer in the run
654 * @blen: the number of buffers in the run
655 *
656 */
657
658void gfs2_meta_wipe(struct gfs2_inode *ip, uint64_t bstart, uint32_t blen)
659{
660 struct gfs2_sbd *sdp = ip->i_sbd;
661 struct inode *aspace = ip->i_gl->gl_aspace;
662 struct buffer_head *bh;
663
664 while (blen) {
665 bh = getbuf(sdp, aspace, bstart, NO_CREATE);
666 if (bh) {
667 struct gfs2_bufdata *bd = get_v2bd(bh);
668
669 if (test_clear_buffer_pinned(bh)) {
670 gfs2_log_lock(sdp);
671 list_del_init(&bd->bd_le.le_list);
672 gfs2_assert_warn(sdp, sdp->sd_log_num_buf);
673 sdp->sd_log_num_buf--;
674 gfs2_log_unlock(sdp);
675 get_transaction->tr_num_buf_rm++;
676 brelse(bh);
677 }
678 if (bd) {
679 gfs2_log_lock(sdp);
680 if (bd->bd_ail) {
681 uint64_t blkno = bh->b_blocknr;
682 bd->bd_ail = NULL;
683 list_del(&bd->bd_ail_st_list);
684 list_del(&bd->bd_ail_gl_list);
685 atomic_dec(&bd->bd_gl->gl_ail_count);
686 brelse(bh);
687 gfs2_log_unlock(sdp);
688 gfs2_trans_add_revoke(sdp, blkno);
689 } else
690 gfs2_log_unlock(sdp);
691 }
692
693 lock_buffer(bh);
694 clear_buffer_dirty(bh);
695 clear_buffer_uptodate(bh);
696 unlock_buffer(bh);
697
698 brelse(bh);
699 }
700
701 bstart++;
702 blen--;
703 }
704}
705
706/**
707 * gfs2_meta_cache_flush - get rid of any references on buffers for this inode
708 * @ip: The GFS2 inode
709 *
710 * This releases buffers that are in the most-recently-used array of
711 * blocks used for indirect block addressing for this inode.
712 */
713
714void gfs2_meta_cache_flush(struct gfs2_inode *ip)
715{
716 struct buffer_head **bh_slot;
717 unsigned int x;
718
719 spin_lock(&ip->i_spin);
720
721 for (x = 0; x < GFS2_MAX_META_HEIGHT; x++) {
722 bh_slot = &ip->i_cache[x];
723 if (!*bh_slot)
724 break;
725 brelse(*bh_slot);
726 *bh_slot = NULL;
727 }
728
729 spin_unlock(&ip->i_spin);
730}
731
732/**
733 * gfs2_meta_indirect_buffer - Get a metadata buffer
734 * @ip: The GFS2 inode
735 * @height: The level of this buf in the metadata (indir addr) tree (if any)
736 * @num: The block number (device relative) of the buffer
737 * @new: Non-zero if we may create a new buffer
738 * @bhp: the buffer is returned here
739 *
740 * Try to use the gfs2_inode's MRU metadata tree cache.
741 *
742 * Returns: errno
743 */
744
745int gfs2_meta_indirect_buffer(struct gfs2_inode *ip, int height, uint64_t num,
746 int new, struct buffer_head **bhp)
747{
748 struct buffer_head *bh, **bh_slot = ip->i_cache + height;
749 int error;
750
751 spin_lock(&ip->i_spin);
752 bh = *bh_slot;
753 if (bh) {
754 if (bh->b_blocknr == num)
755 get_bh(bh);
756 else
757 bh = NULL;
758 }
759 spin_unlock(&ip->i_spin);
760
761 if (bh) {
762 if (new)
763 meta_prep_new(bh);
764 else {
765 error = gfs2_meta_reread(ip->i_sbd, bh,
766 DIO_START | DIO_WAIT);
767 if (error) {
768 brelse(bh);
769 return error;
770 }
771 }
772 } else {
773 if (new)
774 bh = gfs2_meta_new(ip->i_gl, num);
775 else {
776 error = gfs2_meta_read(ip->i_gl, num,
777 DIO_START | DIO_WAIT, &bh);
778 if (error)
779 return error;
780 }
781
782 spin_lock(&ip->i_spin);
783 if (*bh_slot != bh) {
784 brelse(*bh_slot);
785 *bh_slot = bh;
786 get_bh(bh);
787 }
788 spin_unlock(&ip->i_spin);
789 }
790
791 if (new) {
792 if (gfs2_assert_warn(ip->i_sbd, height)) {
793 brelse(bh);
794 return -EIO;
795 }
796 gfs2_trans_add_bh(ip->i_gl, bh, 1);
797 gfs2_metatype_set(bh, GFS2_METATYPE_IN, GFS2_FORMAT_IN);
798 gfs2_buffer_clear_tail(bh, sizeof(struct gfs2_meta_header));
799
800 } else if (gfs2_metatype_check(ip->i_sbd, bh,
801 (height) ? GFS2_METATYPE_IN : GFS2_METATYPE_DI)) {
802 brelse(bh);
803 return -EIO;
804 }
805
806 *bhp = bh;
807
808 return 0;
809}
810
811/**
812 * gfs2_meta_ra - start readahead on an extent of a file
813 * @gl: the glock the blocks belong to
814 * @dblock: the starting disk block
815 * @extlen: the number of blocks in the extent
816 *
817 */
818
819void gfs2_meta_ra(struct gfs2_glock *gl, uint64_t dblock, uint32_t extlen)
820{
821 struct gfs2_sbd *sdp = gl->gl_sbd;
822 struct inode *aspace = gl->gl_aspace;
823 struct buffer_head *first_bh, *bh;
824 uint32_t max_ra = gfs2_tune_get(sdp, gt_max_readahead) >> sdp->sd_sb.sb_bsize_shift;
825 int error;
826
827 if (!extlen || !max_ra)
828 return;
829 if (extlen > max_ra)
830 extlen = max_ra;
831
832 first_bh = getbuf(sdp, aspace, dblock, CREATE);
833
834 if (buffer_uptodate(first_bh))
835 goto out;
836 if (!buffer_locked(first_bh)) {
837 error = gfs2_meta_reread(sdp, first_bh, DIO_START);
838 if (error)
839 goto out;
840 }
841
842 dblock++;
843 extlen--;
844
845 while (extlen) {
846 bh = getbuf(sdp, aspace, dblock, CREATE);
847
848 if (!buffer_uptodate(bh) && !buffer_locked(bh)) {
849 error = gfs2_meta_reread(sdp, bh, DIO_START);
850 brelse(bh);
851 if (error)
852 goto out;
853 } else
854 brelse(bh);
855
856 dblock++;
857 extlen--;
858
859 if (buffer_uptodate(first_bh))
860 break;
861 }
862
863 out:
864 brelse(first_bh);
865}
866
867/**
868 * gfs2_meta_syncfs - sync all the buffers in a filesystem
869 * @sdp: the filesystem
870 *
871 */
872
873void gfs2_meta_syncfs(struct gfs2_sbd *sdp)
874{
875 gfs2_log_flush(sdp);
876 for (;;) {
877 gfs2_ail1_start(sdp, DIO_ALL);
878 if (gfs2_ail1_empty(sdp, DIO_ALL))
879 break;
880 msleep(100);
881 }
882}
883
diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h
new file mode 100644
index 000000000000..887cac302c1b
--- /dev/null
+++ b/fs/gfs2/meta_io.h
@@ -0,0 +1,88 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __DIO_DOT_H__
11#define __DIO_DOT_H__
12
13static inline void gfs2_buffer_clear(struct buffer_head *bh)
14{
15 memset(bh->b_data, 0, bh->b_size);
16}
17
18static inline void gfs2_buffer_clear_tail(struct buffer_head *bh, int head)
19{
20 memset(bh->b_data + head, 0, bh->b_size - head);
21}
22
23static inline void gfs2_buffer_clear_ends(struct buffer_head *bh, int offset,
24 int amount, int journaled)
25{
26 int z_off1 = (journaled) ? sizeof(struct gfs2_meta_header) : 0;
27 int z_len1 = offset - z_off1;
28 int z_off2 = offset + amount;
29 int z_len2 = (bh)->b_size - z_off2;
30
31 if (z_len1)
32 memset(bh->b_data + z_off1, 0, z_len1);
33
34 if (z_len2)
35 memset(bh->b_data + z_off2, 0, z_len2);
36}
37
/* Copy everything past @from_head in the source buffer to just past
   @to_head in the destination, then zero the slack left at the end of
   the destination.
   NOTE(review): assumes from_head >= to_head and that the copied tail
   fits in to_bh — callers must guarantee both. */
static inline void gfs2_buffer_copy_tail(struct buffer_head *to_bh,
					 int to_head,
					 struct buffer_head *from_bh,
					 int from_head)
{
	memcpy(to_bh->b_data + to_head,
	       from_bh->b_data + from_head,
	       from_bh->b_size - from_head);
	memset(to_bh->b_data + to_bh->b_size + to_head - from_head,
	       0,
	       from_head - to_head);
}
50
51struct inode *gfs2_aspace_get(struct gfs2_sbd *sdp);
52void gfs2_aspace_put(struct inode *aspace);
53
54void gfs2_ail1_start_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai);
55int gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai, int flags);
56void gfs2_ail2_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai);
57void gfs2_ail_empty_gl(struct gfs2_glock *gl);
58
59void gfs2_meta_inval(struct gfs2_glock *gl);
60void gfs2_meta_sync(struct gfs2_glock *gl, int flags);
61
62struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, uint64_t blkno);
63int gfs2_meta_read(struct gfs2_glock *gl, uint64_t blkno,
64 int flags, struct buffer_head **bhp);
65int gfs2_meta_reread(struct gfs2_sbd *sdp, struct buffer_head *bh, int flags);
66
67void gfs2_attach_bufdata(struct gfs2_glock *gl, struct buffer_head *bh, int meta);
68void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh);
69void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh,
70 struct gfs2_ail *ai);
71
72void gfs2_meta_wipe(struct gfs2_inode *ip, uint64_t bstart, uint32_t blen);
73
74void gfs2_meta_cache_flush(struct gfs2_inode *ip);
75int gfs2_meta_indirect_buffer(struct gfs2_inode *ip, int height, uint64_t num,
76 int new, struct buffer_head **bhp);
77
/* Read the inode's own dinode block (height 0, no creation). */
static inline int gfs2_meta_inode_buffer(struct gfs2_inode *ip,
					 struct buffer_head **bhp)
{
	return gfs2_meta_indirect_buffer(ip, 0, ip->i_num.no_addr, 0, bhp);
}
83
84void gfs2_meta_ra(struct gfs2_glock *gl, uint64_t dblock, uint32_t extlen);
85void gfs2_meta_syncfs(struct gfs2_sbd *sdp);
86
87#endif /* __DIO_DOT_H__ */
88
diff --git a/fs/gfs2/mount.c b/fs/gfs2/mount.c
new file mode 100644
index 000000000000..3e42697aafc7
--- /dev/null
+++ b/fs/gfs2/mount.c
@@ -0,0 +1,211 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <asm/semaphore.h>
16
17#include "gfs2.h"
18#include "mount.h"
19#include "sys.h"
20
21/**
22 * gfs2_mount_args - Parse mount options
23 * @sdp:
24 * @data:
25 *
26 * Return: errno
27 */
28
29int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount)
30{
31 struct gfs2_args *args = &sdp->sd_args;
32 char *data = data_arg;
33 char *options, *o, *v;
34 int error = 0;
35
36 if (!remount) {
37 /* If someone preloaded options, use those instead */
38 spin_lock(&gfs2_sys_margs_lock);
39 if (gfs2_sys_margs) {
40 data = gfs2_sys_margs;
41 gfs2_sys_margs = NULL;
42 }
43 spin_unlock(&gfs2_sys_margs_lock);
44
45 /* Set some defaults */
46 args->ar_num_glockd = GFS2_GLOCKD_DEFAULT;
47 args->ar_quota = GFS2_QUOTA_DEFAULT;
48 args->ar_data = GFS2_DATA_DEFAULT;
49 }
50
51 /* Split the options into tokens with the "," character and
52 process them */
53
54 for (options = data; (o = strsep(&options, ",")); ) {
55 if (!*o)
56 continue;
57
58 v = strchr(o, '=');
59 if (v)
60 *v++ = 0;
61
62 if (!strcmp(o, "lockproto")) {
63 if (!v)
64 goto need_value;
65 if (remount && strcmp(v, args->ar_lockproto))
66 goto cant_remount;
67 strncpy(args->ar_lockproto, v, GFS2_LOCKNAME_LEN);
68 args->ar_lockproto[GFS2_LOCKNAME_LEN - 1] = 0;
69 }
70
71 else if (!strcmp(o, "locktable")) {
72 if (!v)
73 goto need_value;
74 if (remount && strcmp(v, args->ar_locktable))
75 goto cant_remount;
76 strncpy(args->ar_locktable, v, GFS2_LOCKNAME_LEN);
77 args->ar_locktable[GFS2_LOCKNAME_LEN - 1] = 0;
78 }
79
80 else if (!strcmp(o, "hostdata")) {
81 if (!v)
82 goto need_value;
83 if (remount && strcmp(v, args->ar_hostdata))
84 goto cant_remount;
85 strncpy(args->ar_hostdata, v, GFS2_LOCKNAME_LEN);
86 args->ar_hostdata[GFS2_LOCKNAME_LEN - 1] = 0;
87 }
88
89 else if (!strcmp(o, "spectator")) {
90 if (remount && !args->ar_spectator)
91 goto cant_remount;
92 args->ar_spectator = 1;
93 sdp->sd_vfs->s_flags |= MS_RDONLY;
94 }
95
96 else if (!strcmp(o, "ignore_local_fs")) {
97 if (remount && !args->ar_ignore_local_fs)
98 goto cant_remount;
99 args->ar_ignore_local_fs = 1;
100 }
101
102 else if (!strcmp(o, "localflocks")) {
103 if (remount && !args->ar_localflocks)
104 goto cant_remount;
105 args->ar_localflocks = 1;
106 }
107
108 else if (!strcmp(o, "localcaching")) {
109 if (remount && !args->ar_localcaching)
110 goto cant_remount;
111 args->ar_localcaching = 1;
112 }
113
114 else if (!strcmp(o, "debug"))
115 args->ar_debug = 1;
116
117 else if (!strcmp(o, "nodebug"))
118 args->ar_debug = 0;
119
120 else if (!strcmp(o, "upgrade")) {
121 if (remount && !args->ar_upgrade)
122 goto cant_remount;
123 args->ar_upgrade = 1;
124 }
125
126 else if (!strcmp(o, "num_glockd")) {
127 unsigned int x;
128 if (!v)
129 goto need_value;
130 sscanf(v, "%u", &x);
131 if (remount && x != args->ar_num_glockd)
132 goto cant_remount;
133 if (!x || x > GFS2_GLOCKD_MAX) {
134 fs_info(sdp, "0 < num_glockd <= %u (not %u)\n",
135 GFS2_GLOCKD_MAX, x);
136 error = -EINVAL;
137 break;
138 }
139 args->ar_num_glockd = x;
140 }
141
142 else if (!strcmp(o, "acl")) {
143 args->ar_posix_acl = 1;
144 sdp->sd_vfs->s_flags |= MS_POSIXACL;
145 }
146
147 else if (!strcmp(o, "noacl")) {
148 args->ar_posix_acl = 0;
149 sdp->sd_vfs->s_flags &= ~MS_POSIXACL;
150 }
151
152 else if (!strcmp(o, "quota")) {
153 if (!v)
154 goto need_value;
155 if (!strcmp(v, "off"))
156 args->ar_quota = GFS2_QUOTA_OFF;
157 else if (!strcmp(v, "account"))
158 args->ar_quota = GFS2_QUOTA_ACCOUNT;
159 else if (!strcmp(v, "on"))
160 args->ar_quota = GFS2_QUOTA_ON;
161 else {
162 fs_info(sdp, "invalid value for quota\n");
163 error = -EINVAL;
164 break;
165 }
166 }
167
168 else if (!strcmp(o, "suiddir"))
169 args->ar_suiddir = 1;
170
171 else if (!strcmp(o, "nosuiddir"))
172 args->ar_suiddir = 0;
173
174 else if (!strcmp(o, "data")) {
175 if (!v)
176 goto need_value;
177 if (!strcmp(v, "writeback"))
178 args->ar_data = GFS2_DATA_WRITEBACK;
179 else if (!strcmp(v, "ordered"))
180 args->ar_data = GFS2_DATA_ORDERED;
181 else {
182 fs_info(sdp, "invalid value for data\n");
183 error = -EINVAL;
184 break;
185 }
186 }
187
188 else {
189 fs_info(sdp, "unknown option: %s\n", o);
190 error = -EINVAL;
191 break;
192 }
193 }
194
195 if (error)
196 fs_info(sdp, "invalid mount option(s)\n");
197
198 if (data != data_arg)
199 kfree(data);
200
201 return error;
202
203 need_value:
204 fs_info(sdp, "need value for option %s\n", o);
205 return -EINVAL;
206
207 cant_remount:
208 fs_info(sdp, "can't remount with option %s\n", o);
209 return -EINVAL;
210}
211
diff --git a/fs/gfs2/mount.h b/fs/gfs2/mount.h
new file mode 100644
index 000000000000..bc8331cd7b2c
--- /dev/null
+++ b/fs/gfs2/mount.h
@@ -0,0 +1,15 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __MOUNT_DOT_H__
11#define __MOUNT_DOT_H__
12
13int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount);
14
15#endif /* __MOUNT_DOT_H__ */
diff --git a/fs/gfs2/ondisk.c b/fs/gfs2/ondisk.c
new file mode 100644
index 000000000000..854b5049b8d5
--- /dev/null
+++ b/fs/gfs2/ondisk.c
@@ -0,0 +1,527 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <asm/semaphore.h>
16
17#include "gfs2.h"
18#include <linux/gfs2_ondisk.h>
19
20#define pv(struct, member, fmt) printk(" "#member" = "fmt"\n", struct->member);
21#define pa(struct, member, count) print_array(#member, struct->member, count);
22
23/**
24 * print_array - Print out an array of bytes
25 * @title: what to print before the array
26 * @buf: the array
27 * @count: the number of bytes
28 *
29 */
30
/**
 * print_array - dump a byte array as hex, 16 bytes per output line
 * @title: label printed before the array
 * @buf: the bytes to dump
 * @count: number of bytes in @buf
 */
static void print_array(char *title, char *buf, int count)
{
	int i;

	printk("  %s =\n", title);
	for (i = 0; i < count; i++) {
		printk("%.2X ", (unsigned char)buf[i]);
		if ((i & 15) == 15)
			printk("\n");
	}
	/* terminate a final partial line of hex bytes */
	if (count & 15)
		printk("\n");
}
44
45/*
46 * gfs2_xxx_in - read in an xxx struct
47 * first arg: the cpu-order structure
48 * buf: the disk-order buffer
49 *
50 * gfs2_xxx_out - write out an xxx struct
51 * first arg: the cpu-order structure
52 * buf: the disk-order buffer
53 *
54 * gfs2_xxx_print - print out an xxx struct
55 * first arg: the cpu-order structure
56 */
57
/* Converters and debug dump for struct gfs2_inum (inode number pair).
 * "_in" reads a disk-order (big-endian) buffer into a cpu-order struct,
 * "_out" writes it back out, "_print" dumps the fields via printk. */

void gfs2_inum_in(struct gfs2_inum *no, char *buf)
{
	struct gfs2_inum *str = (struct gfs2_inum *)buf;

	no->no_formal_ino = be64_to_cpu(str->no_formal_ino);
	no->no_addr = be64_to_cpu(str->no_addr);
}

void gfs2_inum_out(struct gfs2_inum *no, char *buf)
{
	struct gfs2_inum *str = (struct gfs2_inum *)buf;

	str->no_formal_ino = cpu_to_be64(no->no_formal_ino);
	str->no_addr = cpu_to_be64(no->no_addr);
}

void gfs2_inum_print(struct gfs2_inum *no)
{
	pv(no, no_formal_ino, "%llu");
	pv(no, no_addr, "%llu");
}
79
/* Converters and debug dump for struct gfs2_meta_header, the common
 * header at the start of every GFS2 metadata block.  The "_in"/"_out"
 * helpers are static: callers pass the whole metadata buffer and the
 * header is read from / written to its start. */

static void gfs2_meta_header_in(struct gfs2_meta_header *mh, char *buf)
{
	struct gfs2_meta_header *str = (struct gfs2_meta_header *)buf;

	mh->mh_magic = be32_to_cpu(str->mh_magic);
	mh->mh_type = be16_to_cpu(str->mh_type);
	mh->mh_format = be16_to_cpu(str->mh_format);
}

static void gfs2_meta_header_out(struct gfs2_meta_header *mh, char *buf)
{
	struct gfs2_meta_header *str = (struct gfs2_meta_header *)buf;

	str->mh_magic = cpu_to_be32(mh->mh_magic);
	str->mh_type = cpu_to_be16(mh->mh_type);
	str->mh_format = cpu_to_be16(mh->mh_format);
}

void gfs2_meta_header_print(struct gfs2_meta_header *mh)
{
	pv(mh, mh_magic, "0x%.8X");
	pv(mh, mh_type, "%u");
	pv(mh, mh_format, "%u");
}
104
/**
 * gfs2_sb_in - read a superblock from a disk-order buffer
 * @sb: cpu-order superblock to fill in
 * @buf: disk-order (big-endian) on-disk superblock image
 *
 * The embedded meta header sits at the start of the buffer, so @buf is
 * passed straight to gfs2_meta_header_in().  Both the master and root
 * directory inum fields are converted, and the lock protocol/table
 * names are copied verbatim (they are plain byte strings, no swabbing).
 */
void gfs2_sb_in(struct gfs2_sb *sb, char *buf)
{
	struct gfs2_sb *str = (struct gfs2_sb *)buf;

	gfs2_meta_header_in(&sb->sb_header, buf);

	sb->sb_fs_format = be32_to_cpu(str->sb_fs_format);
	sb->sb_multihost_format = be32_to_cpu(str->sb_multihost_format);
	sb->sb_bsize = be32_to_cpu(str->sb_bsize);
	sb->sb_bsize_shift = be32_to_cpu(str->sb_bsize_shift);

	gfs2_inum_in(&sb->sb_master_dir, (char *)&str->sb_master_dir);
	gfs2_inum_in(&sb->sb_root_dir, (char *)&str->sb_root_dir);

	memcpy(sb->sb_lockproto, str->sb_lockproto, GFS2_LOCKNAME_LEN);
	memcpy(sb->sb_locktable, str->sb_locktable, GFS2_LOCKNAME_LEN);
}
122
123void gfs2_sb_print(struct gfs2_sb *sb)
124{
125 gfs2_meta_header_print(&sb->sb_header);
126
127 pv(sb, sb_fs_format, "%u");
128 pv(sb, sb_multihost_format, "%u");
129
130 pv(sb, sb_bsize, "%u");
131 pv(sb, sb_bsize_shift, "%u");
132
133 gfs2_inum_print(&sb->sb_master_dir);
134
135 pv(sb, sb_lockproto, "%s");
136 pv(sb, sb_locktable, "%s");
137}
138
/* Converters and debug dump for struct gfs2_rindex (resource group
 * index entry).  "_out" also zeroes the padding and reserved bytes so
 * no stale kernel memory reaches the disk. */

void gfs2_rindex_in(struct gfs2_rindex *ri, char *buf)
{
	struct gfs2_rindex *str = (struct gfs2_rindex *)buf;

	ri->ri_addr = be64_to_cpu(str->ri_addr);
	ri->ri_length = be32_to_cpu(str->ri_length);
	ri->ri_data0 = be64_to_cpu(str->ri_data0);
	ri->ri_data = be32_to_cpu(str->ri_data);
	ri->ri_bitbytes = be32_to_cpu(str->ri_bitbytes);

}

void gfs2_rindex_out(struct gfs2_rindex *ri, char *buf)
{
	struct gfs2_rindex *str = (struct gfs2_rindex *)buf;

	str->ri_addr = cpu_to_be64(ri->ri_addr);
	str->ri_length = cpu_to_be32(ri->ri_length);
	str->__pad = 0;

	str->ri_data0 = cpu_to_be64(ri->ri_data0);
	str->ri_data = cpu_to_be32(ri->ri_data);
	str->ri_bitbytes = cpu_to_be32(ri->ri_bitbytes);
	memset(str->ri_reserved, 0, sizeof(str->ri_reserved));
}

void gfs2_rindex_print(struct gfs2_rindex *ri)
{
	pv(ri, ri_addr, "%llu");
	pv(ri, ri_length, "%u");

	pv(ri, ri_data0, "%llu");
	pv(ri, ri_data, "%u");

	pv(ri, ri_bitbytes, "%u");
}
175
/* Converters and debug dump for struct gfs2_rgrp (resource group
 * header).  The embedded meta header is at the start of the buffer.
 * "_out" zeroes the reserved area before it hits disk. */

void gfs2_rgrp_in(struct gfs2_rgrp *rg, char *buf)
{
	struct gfs2_rgrp *str = (struct gfs2_rgrp *)buf;

	gfs2_meta_header_in(&rg->rg_header, buf);
	rg->rg_flags = be32_to_cpu(str->rg_flags);
	rg->rg_free = be32_to_cpu(str->rg_free);
	rg->rg_dinodes = be32_to_cpu(str->rg_dinodes);
}

void gfs2_rgrp_out(struct gfs2_rgrp *rg, char *buf)
{
	struct gfs2_rgrp *str = (struct gfs2_rgrp *)buf;

	gfs2_meta_header_out(&rg->rg_header, buf);
	str->rg_flags = cpu_to_be32(rg->rg_flags);
	str->rg_free = cpu_to_be32(rg->rg_free);
	str->rg_dinodes = cpu_to_be32(rg->rg_dinodes);

	memset(&str->rg_reserved, 0, sizeof(str->rg_reserved));
}

void gfs2_rgrp_print(struct gfs2_rgrp *rg)
{
	gfs2_meta_header_print(&rg->rg_header);
	pv(rg, rg_flags, "%u");
	pv(rg, rg_free, "%u");
	pv(rg, rg_dinodes, "%u");

	pa(rg, rg_reserved, 36);
}
207
/* Converters and debug dump for struct gfs2_quota (on-disk quota
 * record: limit, warn threshold and current value). */

void gfs2_quota_in(struct gfs2_quota *qu, char *buf)
{
	struct gfs2_quota *str = (struct gfs2_quota *)buf;

	qu->qu_limit = be64_to_cpu(str->qu_limit);
	qu->qu_warn = be64_to_cpu(str->qu_warn);
	qu->qu_value = be64_to_cpu(str->qu_value);
}

void gfs2_quota_out(struct gfs2_quota *qu, char *buf)
{
	struct gfs2_quota *str = (struct gfs2_quota *)buf;

	str->qu_limit = cpu_to_be64(qu->qu_limit);
	str->qu_warn = cpu_to_be64(qu->qu_warn);
	str->qu_value = cpu_to_be64(qu->qu_value);
}

void gfs2_quota_print(struct gfs2_quota *qu)
{
	pv(qu, qu_limit, "%llu");
	pv(qu, qu_warn, "%llu");
	/* qu_value is signed: usage deltas can be negative */
	pv(qu, qu_value, "%lld");
}
232
/* Converters and debug dump for struct gfs2_dinode (on-disk inode).
 * The embedded meta header and inum both live at fixed offsets within
 * the dinode; the remaining fields are converted one by one.  Keep the
 * field order in "_in" and "_out" mirrored so omissions are obvious. */

void gfs2_dinode_in(struct gfs2_dinode *di, char *buf)
{
	struct gfs2_dinode *str = (struct gfs2_dinode *)buf;

	gfs2_meta_header_in(&di->di_header, buf);
	gfs2_inum_in(&di->di_num, (char *)&str->di_num);

	di->di_mode = be32_to_cpu(str->di_mode);
	di->di_uid = be32_to_cpu(str->di_uid);
	di->di_gid = be32_to_cpu(str->di_gid);
	di->di_nlink = be32_to_cpu(str->di_nlink);
	di->di_size = be64_to_cpu(str->di_size);
	di->di_blocks = be64_to_cpu(str->di_blocks);
	di->di_atime = be64_to_cpu(str->di_atime);
	di->di_mtime = be64_to_cpu(str->di_mtime);
	di->di_ctime = be64_to_cpu(str->di_ctime);
	di->di_major = be32_to_cpu(str->di_major);
	di->di_minor = be32_to_cpu(str->di_minor);

	/* block allocation goals for metadata and data */
	di->di_goal_meta = be64_to_cpu(str->di_goal_meta);
	di->di_goal_data = be64_to_cpu(str->di_goal_data);

	di->di_flags = be32_to_cpu(str->di_flags);
	di->di_payload_format = be32_to_cpu(str->di_payload_format);
	di->di_height = be16_to_cpu(str->di_height);

	/* directory-only fields: exhash depth and entry count */
	di->di_depth = be16_to_cpu(str->di_depth);
	di->di_entries = be32_to_cpu(str->di_entries);

	di->di_eattr = be64_to_cpu(str->di_eattr);

}

void gfs2_dinode_out(struct gfs2_dinode *di, char *buf)
{
	struct gfs2_dinode *str = (struct gfs2_dinode *)buf;

	gfs2_meta_header_out(&di->di_header, buf);
	gfs2_inum_out(&di->di_num, (char *)&str->di_num);

	str->di_mode = cpu_to_be32(di->di_mode);
	str->di_uid = cpu_to_be32(di->di_uid);
	str->di_gid = cpu_to_be32(di->di_gid);
	str->di_nlink = cpu_to_be32(di->di_nlink);
	str->di_size = cpu_to_be64(di->di_size);
	str->di_blocks = cpu_to_be64(di->di_blocks);
	str->di_atime = cpu_to_be64(di->di_atime);
	str->di_mtime = cpu_to_be64(di->di_mtime);
	str->di_ctime = cpu_to_be64(di->di_ctime);
	str->di_major = cpu_to_be32(di->di_major);
	str->di_minor = cpu_to_be32(di->di_minor);

	str->di_goal_meta = cpu_to_be64(di->di_goal_meta);
	str->di_goal_data = cpu_to_be64(di->di_goal_data);

	str->di_flags = cpu_to_be32(di->di_flags);
	str->di_payload_format = cpu_to_be32(di->di_payload_format);
	str->di_height = cpu_to_be16(di->di_height);

	str->di_depth = cpu_to_be16(di->di_depth);
	str->di_entries = cpu_to_be32(di->di_entries);

	str->di_eattr = cpu_to_be64(di->di_eattr);

}

void gfs2_dinode_print(struct gfs2_dinode *di)
{
	gfs2_meta_header_print(&di->di_header);
	gfs2_inum_print(&di->di_num);

	pv(di, di_mode, "0%o");
	pv(di, di_uid, "%u");
	pv(di, di_gid, "%u");
	pv(di, di_nlink, "%u");
	pv(di, di_size, "%llu");
	pv(di, di_blocks, "%llu");
	pv(di, di_atime, "%lld");
	pv(di, di_mtime, "%lld");
	pv(di, di_ctime, "%lld");
	pv(di, di_major, "%u");
	pv(di, di_minor, "%u");

	pv(di, di_goal_meta, "%llu");
	pv(di, di_goal_data, "%llu");

	pv(di, di_flags, "0x%.8X");
	pv(di, di_payload_format, "%u");
	pv(di, di_height, "%u");

	pv(di, di_depth, "%u");
	pv(di, di_entries, "%u");

	pv(di, di_eattr, "%llu");
}
328
/**
 * gfs2_dirent_print - dump a directory entry via printk
 * @de: the (cpu-order) dirent to print
 * @name: the entry's name bytes (not NUL-terminated on disk)
 *
 * The name is copied into a local, zeroed buffer so it can be printed
 * as a NUL-terminated string; de_name_len is bounded by GFS2_FNAMESIZE.
 */
void gfs2_dirent_print(struct gfs2_dirent *de, char *name)
{
	char buf[GFS2_FNAMESIZE + 1];

	gfs2_inum_print(&de->de_inum);
	pv(de, de_hash, "0x%.8X");
	pv(de, de_rec_len, "%u");
	pv(de, de_name_len, "%u");
	pv(de, de_type, "%u");

	memset(buf, 0, GFS2_FNAMESIZE + 1);
	memcpy(buf, name, de->de_name_len);
	printk("  name = %s\n", buf);
}
343
/* Converter and debug dump for struct gfs2_leaf (exhash directory leaf
 * block header).  No "_out" variant exists in this file. */

void gfs2_leaf_in(struct gfs2_leaf *lf, char *buf)
{
	struct gfs2_leaf *str = (struct gfs2_leaf *)buf;

	gfs2_meta_header_in(&lf->lf_header, buf);
	lf->lf_depth = be16_to_cpu(str->lf_depth);
	lf->lf_entries = be16_to_cpu(str->lf_entries);
	lf->lf_dirent_format = be32_to_cpu(str->lf_dirent_format);
	lf->lf_next = be64_to_cpu(str->lf_next);
}

void gfs2_leaf_print(struct gfs2_leaf *lf)
{
	gfs2_meta_header_print(&lf->lf_header);
	pv(lf, lf_depth, "%u");
	pv(lf, lf_entries, "%u");
	pv(lf, lf_dirent_format, "%u");
	pv(lf, lf_next, "%llu");

	pa(lf, lf_reserved, 32);
}
365
/* Converters and debug dump for struct gfs2_ea_header (extended
 * attribute header).  The single-byte fields need no byte swapping;
 * "_out" zeroes the pad byte so stale memory never reaches disk. */

void gfs2_ea_header_in(struct gfs2_ea_header *ea, char *buf)
{
	struct gfs2_ea_header *str = (struct gfs2_ea_header *)buf;

	ea->ea_rec_len = be32_to_cpu(str->ea_rec_len);
	ea->ea_data_len = be32_to_cpu(str->ea_data_len);
	ea->ea_name_len = str->ea_name_len;
	ea->ea_type = str->ea_type;
	ea->ea_flags = str->ea_flags;
	ea->ea_num_ptrs = str->ea_num_ptrs;
}

void gfs2_ea_header_out(struct gfs2_ea_header *ea, char *buf)
{
	struct gfs2_ea_header *str = (struct gfs2_ea_header *)buf;

	str->ea_rec_len = cpu_to_be32(ea->ea_rec_len);
	str->ea_data_len = cpu_to_be32(ea->ea_data_len);
	str->ea_name_len = ea->ea_name_len;
	str->ea_type = ea->ea_type;
	str->ea_flags = ea->ea_flags;
	str->ea_num_ptrs = ea->ea_num_ptrs;
	str->__pad = 0;
}

/* The EA name is not NUL-terminated on disk, so copy it into a local
 * zeroed buffer before printing; ea_name_len <= GFS2_EA_MAX_NAME_LEN. */
void gfs2_ea_header_print(struct gfs2_ea_header *ea, char *name)
{
	char buf[GFS2_EA_MAX_NAME_LEN + 1];

	pv(ea, ea_rec_len, "%u");
	pv(ea, ea_data_len, "%u");
	pv(ea, ea_name_len, "%u");
	pv(ea, ea_type, "%u");
	pv(ea, ea_flags, "%u");
	pv(ea, ea_num_ptrs, "%u");

	memset(buf, 0, GFS2_EA_MAX_NAME_LEN + 1);
	memcpy(buf, name, ea->ea_name_len);
	printk("  name = %s\n", buf);
}
406
/* Converter and debug dumps for the journal structures: the log header
 * (written at the head of each journal flush) and the log descriptor.
 * Only "_in" exists for the header; descriptors are print-only here. */

void gfs2_log_header_in(struct gfs2_log_header *lh, char *buf)
{
	struct gfs2_log_header *str = (struct gfs2_log_header *)buf;

	gfs2_meta_header_in(&lh->lh_header, buf);
	lh->lh_sequence = be64_to_cpu(str->lh_sequence);
	lh->lh_flags = be32_to_cpu(str->lh_flags);
	lh->lh_tail = be32_to_cpu(str->lh_tail);
	lh->lh_blkno = be32_to_cpu(str->lh_blkno);
	lh->lh_hash = be32_to_cpu(str->lh_hash);
}

void gfs2_log_header_print(struct gfs2_log_header *lh)
{
	gfs2_meta_header_print(&lh->lh_header);
	pv(lh, lh_sequence, "%llu");
	pv(lh, lh_flags, "0x%.8X");
	pv(lh, lh_tail, "%u");
	pv(lh, lh_blkno, "%u");
	pv(lh, lh_hash, "0x%.8X");
}

void gfs2_log_descriptor_print(struct gfs2_log_descriptor *ld)
{
	gfs2_meta_header_print(&ld->ld_header);
	pv(ld, ld_type, "%u");
	pv(ld, ld_length, "%u");
	pv(ld, ld_data1, "%u");
	pv(ld, ld_data2, "%u");

	pa(ld, ld_reserved, 32);
}
439
/* Converters and debug dumps for struct gfs2_inum_range (an inode
 * number allocation range) and struct gfs2_statfs_change (a delta
 * applied to the filesystem statistics). */

void gfs2_inum_range_in(struct gfs2_inum_range *ir, char *buf)
{
	struct gfs2_inum_range *str = (struct gfs2_inum_range *)buf;

	ir->ir_start = be64_to_cpu(str->ir_start);
	ir->ir_length = be64_to_cpu(str->ir_length);
}

void gfs2_inum_range_out(struct gfs2_inum_range *ir, char *buf)
{
	struct gfs2_inum_range *str = (struct gfs2_inum_range *)buf;

	str->ir_start = cpu_to_be64(ir->ir_start);
	str->ir_length = cpu_to_be64(ir->ir_length);
}

void gfs2_inum_range_print(struct gfs2_inum_range *ir)
{
	pv(ir, ir_start, "%llu");
	pv(ir, ir_length, "%llu");
}

void gfs2_statfs_change_in(struct gfs2_statfs_change *sc, char *buf)
{
	struct gfs2_statfs_change *str = (struct gfs2_statfs_change *)buf;

	sc->sc_total = be64_to_cpu(str->sc_total);
	sc->sc_free = be64_to_cpu(str->sc_free);
	sc->sc_dinodes = be64_to_cpu(str->sc_dinodes);
}

void gfs2_statfs_change_out(struct gfs2_statfs_change *sc, char *buf)
{
	struct gfs2_statfs_change *str = (struct gfs2_statfs_change *)buf;

	str->sc_total = cpu_to_be64(sc->sc_total);
	str->sc_free = cpu_to_be64(sc->sc_free);
	str->sc_dinodes = cpu_to_be64(sc->sc_dinodes);
}

void gfs2_statfs_change_print(struct gfs2_statfs_change *sc)
{
	/* printed signed: these are deltas, which may be negative */
	pv(sc, sc_total, "%lld");
	pv(sc, sc_free, "%lld");
	pv(sc, sc_dinodes, "%lld");
}
486
/* Converters and debug dumps for struct gfs2_unlinked_tag (records an
 * unlinked-but-open inode) and struct gfs2_quota_change (a journaled
 * quota delta).  The unlinked tag starts with an inum, so the raw
 * buffer is handed straight to gfs2_inum_in/out. */

void gfs2_unlinked_tag_in(struct gfs2_unlinked_tag *ut, char *buf)
{
	struct gfs2_unlinked_tag *str = (struct gfs2_unlinked_tag *)buf;

	gfs2_inum_in(&ut->ut_inum, buf);
	ut->ut_flags = be32_to_cpu(str->ut_flags);
}

void gfs2_unlinked_tag_out(struct gfs2_unlinked_tag *ut, char *buf)
{
	struct gfs2_unlinked_tag *str = (struct gfs2_unlinked_tag *)buf;

	gfs2_inum_out(&ut->ut_inum, buf);
	str->ut_flags = cpu_to_be32(ut->ut_flags);
	str->__pad = 0;
}

void gfs2_unlinked_tag_print(struct gfs2_unlinked_tag *ut)
{
	gfs2_inum_print(&ut->ut_inum);
	pv(ut, ut_flags, "%u");
}

void gfs2_quota_change_in(struct gfs2_quota_change *qc, char *buf)
{
	struct gfs2_quota_change *str = (struct gfs2_quota_change *)buf;

	qc->qc_change = be64_to_cpu(str->qc_change);
	qc->qc_flags = be32_to_cpu(str->qc_flags);
	qc->qc_id = be32_to_cpu(str->qc_id);
}

void gfs2_quota_change_print(struct gfs2_quota_change *qc)
{
	/* qc_change is signed: quota usage can shrink */
	pv(qc, qc_change, "%lld");
	pv(qc, qc_flags, "0x%.8X");
	pv(qc, qc_id, "%u");
}
525
526
527
diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c
new file mode 100644
index 000000000000..c719a2a40698
--- /dev/null
+++ b/fs/gfs2/ops_address.c
@@ -0,0 +1,642 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/pagemap.h>
16#include <linux/mpage.h>
17#include <linux/fs.h>
18#include <asm/semaphore.h>
19
20#include "gfs2.h"
21#include "bmap.h"
22#include "glock.h"
23#include "inode.h"
24#include "log.h"
25#include "meta_io.h"
26#include "ops_address.h"
27#include "page.h"
28#include "quota.h"
29#include "trans.h"
30#include "rgrp.h"
31#include "ops_file.h"
32
33/**
34 * gfs2_get_block - Fills in a buffer head with details about a block
35 * @inode: The inode
36 * @lblock: The block number to look up
37 * @bh_result: The buffer head to return the result in
38 * @create: Non-zero if we may add block to the file
39 *
40 * Returns: errno
41 */
42
43int gfs2_get_block(struct inode *inode, sector_t lblock,
44 struct buffer_head *bh_result, int create)
45{
46 struct gfs2_inode *ip = get_v2ip(inode);
47 int new = create;
48 uint64_t dblock;
49 int error;
50
51 error = gfs2_block_map(ip, lblock, &new, &dblock, NULL);
52 if (error)
53 return error;
54
55 if (!dblock)
56 return 0;
57
58 map_bh(bh_result, inode->i_sb, dblock);
59 if (new)
60 set_buffer_new(bh_result);
61
62 return 0;
63}
64
65/**
66 * get_block_noalloc - Fills in a buffer head with details about a block
67 * @inode: The inode
68 * @lblock: The block number to look up
69 * @bh_result: The buffer head to return the result in
70 * @create: Non-zero if we may add block to the file
71 *
72 * Returns: errno
73 */
74
static int get_block_noalloc(struct inode *inode, sector_t lblock,
			     struct buffer_head *bh_result, int create)
{
	struct gfs2_inode *ip = get_v2ip(inode);
	int new = 0;	/* never allocate, regardless of @create */
	uint64_t dblock;
	int error;

	error = gfs2_block_map(ip, lblock, &new, &dblock, NULL);
	if (error)
		return error;

	if (dblock)
		map_bh(bh_result, inode->i_sb, dblock);
	else if (gfs2_assert_withdraw(ip->i_sbd, !create))
		/* caller asked for allocation on a no-alloc path: the
		 * block should already exist, so this is an error */
		error = -EIO;

	return error;
}
94
/* Multi-block variant of gfs2_get_block: also reports the length of
 * the contiguous extent starting at @lblock (capped at @max_blocks)
 * via bh_result->b_size, for use by the direct I/O read path. */
static int get_blocks(struct inode *inode, sector_t lblock,
		      unsigned long max_blocks, struct buffer_head *bh_result,
		      int create)
{
	struct gfs2_inode *ip = get_v2ip(inode);
	int new = create;
	uint64_t dblock;
	uint32_t extlen;
	int error;

	error = gfs2_block_map(ip, lblock, &new, &dblock, &extlen);
	if (error)
		return error;

	if (!dblock)
		return 0;	/* hole: leave bh unmapped */

	map_bh(bh_result, inode->i_sb, dblock);
	if (new)
		set_buffer_new(bh_result);

	if (extlen > max_blocks)
		extlen = max_blocks;
	bh_result->b_size = extlen << inode->i_blkbits;

	return 0;
}
122
/* Like get_blocks() but never allocates; used for direct I/O writes
 * where every block must already exist.  Withdraws the filesystem and
 * returns -EIO if @create is set for an unmapped block. */
static int get_blocks_noalloc(struct inode *inode, sector_t lblock,
			      unsigned long max_blocks,
			      struct buffer_head *bh_result, int create)
{
	struct gfs2_inode *ip = get_v2ip(inode);
	int new = 0;	/* never allocate */
	uint64_t dblock;
	uint32_t extlen;
	int error;

	error = gfs2_block_map(ip, lblock, &new, &dblock, &extlen);
	if (error)
		return error;

	if (dblock) {
		map_bh(bh_result, inode->i_sb, dblock);
		if (extlen > max_blocks)
			extlen = max_blocks;
		bh_result->b_size = extlen << inode->i_blkbits;
	} else if (gfs2_assert_withdraw(ip->i_sbd, !create))
		error = -EIO;

	return error;
}
147
148/**
149 * gfs2_writepage - Write complete page
150 * @page: Page to write
151 *
152 * Returns: errno
153 *
154 * Some of this is copied from block_write_full_page() although we still
155 * call it to do most of the work.
156 */
157
static int gfs2_writepage(struct page *page, struct writeback_control *wbc)
{
	struct inode *inode = page->mapping->host;
	struct gfs2_inode *ip = get_v2ip(page->mapping->host);
	struct gfs2_sbd *sdp = ip->i_sbd;
	loff_t i_size = i_size_read(inode);
	pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
	unsigned offset;
	int error;
	int done_trans = 0;

	/* per-ops counter, apparently for debugging/statistics */
	atomic_inc(&sdp->sd_ops_address);
	/* writepage requires the glock held exclusively by this node */
	if (gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(ip->i_gl))) {
		unlock_page(page);
		return -EIO;
	}
	/* NOTE(review): if a transaction is already active on this task
	 * we must not start a nested one — just redirty and bail */
	if (get_transaction)
		goto out_ignore;

	/* Is the page fully outside i_size? (truncate in progress) */
	offset = i_size & (PAGE_CACHE_SIZE-1);
	if (page->index >= end_index+1 || !offset) {
		page->mapping->a_ops->invalidatepage(page, 0);
		unlock_page(page);
		return 0; /* don't care */
	}

	/* ordered/jdata writes must be journaled, so open a transaction
	 * and attach the page's buffers to it before writing */
	if (sdp->sd_args.ar_data == GFS2_DATA_ORDERED || gfs2_is_jdata(ip)) {
		error = gfs2_trans_begin(sdp, RES_DINODE + 1, 0);
		if (error)
			goto out_ignore;
		gfs2_page_add_databufs(ip, page, 0, sdp->sd_vfs->s_blocksize-1);
		done_trans = 1;
	}
	error = block_write_full_page(page, get_block_noalloc, wbc);
	if (done_trans)
		gfs2_trans_end(sdp);
	gfs2_meta_cache_flush(ip);
	return error;

out_ignore:
	/* couldn't write now; leave the page dirty for a later pass */
	redirty_page_for_writepage(wbc, page);
	unlock_page(page);
	return 0;
}
203
204/**
205 * stuffed_readpage - Fill in a Linux page with stuffed file data
206 * @ip: the inode
207 * @page: the page
208 *
209 * Returns: errno
210 */
211
212static int stuffed_readpage(struct gfs2_inode *ip, struct page *page)
213{
214 struct buffer_head *dibh;
215 void *kaddr;
216 int error;
217
218 error = gfs2_meta_inode_buffer(ip, &dibh);
219 if (error)
220 return error;
221
222 kaddr = kmap_atomic(page, KM_USER0);
223 memcpy((char *)kaddr,
224 dibh->b_data + sizeof(struct gfs2_dinode),
225 ip->i_di.di_size);
226 memset((char *)kaddr + ip->i_di.di_size,
227 0,
228 PAGE_CACHE_SIZE - ip->i_di.di_size);
229 kunmap_atomic(page, KM_USER0);
230
231 brelse(dibh);
232
233 SetPageUptodate(page);
234
235 return 0;
236}
237
238static int zero_readpage(struct page *page)
239{
240 void *kaddr;
241
242 kaddr = kmap_atomic(page, KM_USER0);
243 memset(kaddr, 0, PAGE_CACHE_SIZE);
244 kunmap_atomic(page, KM_USER0);
245
246 SetPageUptodate(page);
247 unlock_page(page);
248
249 return 0;
250}
251
252/**
253 * gfs2_readpage - readpage with locking
254 * @file: The file to read a page for. N.B. This may be NULL if we are
255 * reading an internal file.
256 * @page: The page to read
257 *
258 * Returns: errno
259 */
260
static int gfs2_readpage(struct file *file, struct page *page)
{
	struct gfs2_inode *ip = get_v2ip(page->mapping->host);
	struct gfs2_sbd *sdp = ip->i_sbd;
	struct gfs2_holder gh;
	int error;

	atomic_inc(&sdp->sd_ops_address);

	/* internal reads arrive with the glock already held, flagged by
	 * the sentinel file pointer; otherwise take a shared glock */
	if (file != &gfs2_internal_file_sentinal) {
		gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &gh);
		error = gfs2_glock_nq_m_atime(1, &gh);
		if (error)
			goto out_unlock;
	}

	if (gfs2_is_stuffed(ip)) {
		/* stuffed data lives in the dinode; only page 0 has
		 * content, every later page is all zeros */
		if (!page->index) {
			error = stuffed_readpage(ip, page);
			unlock_page(page);
		} else
			error = zero_readpage(page);
	} else
		error = mpage_readpage(page, gfs2_get_block);

	if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
		error = -EIO;

	if (file != &gfs2_internal_file_sentinal) {
		gfs2_glock_dq_m(1, &gh);
		gfs2_holder_uninit(&gh);
	}
out:
	return error;
out_unlock:
	/* glock failed: the page is still locked, release it */
	unlock_page(page);
	goto out;
}
299
300/**
301 * gfs2_prepare_write - Prepare to write a page to a file
302 * @file: The file to write to
303 * @page: The page which is to be prepared for writing
304 * @from: From (byte range within page)
305 * @to: To (byte range within page)
306 *
307 * Returns: errno
308 */
309
310static int gfs2_prepare_write(struct file *file, struct page *page,
311 unsigned from, unsigned to)
312{
313 struct gfs2_inode *ip = get_v2ip(page->mapping->host);
314 struct gfs2_sbd *sdp = ip->i_sbd;
315 unsigned int data_blocks, ind_blocks, rblocks;
316 int alloc_required;
317 int error = 0;
318 loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + from;
319 loff_t end = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
320 struct gfs2_alloc *al;
321
322 atomic_inc(&sdp->sd_ops_address);
323
324 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_ATIME, &ip->i_gh);
325 error = gfs2_glock_nq_m_atime(1, &ip->i_gh);
326 if (error)
327 goto out_uninit;
328
329 gfs2_write_calc_reserv(ip, to - from, &data_blocks, &ind_blocks);
330
331 error = gfs2_write_alloc_required(ip, pos, from - to, &alloc_required);
332 if (error)
333 goto out_unlock;
334
335
336 if (alloc_required) {
337 al = gfs2_alloc_get(ip);
338
339 error = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
340 if (error)
341 goto out_alloc_put;
342
343 error = gfs2_quota_check(ip, ip->i_di.di_uid, ip->i_di.di_gid);
344 if (error)
345 goto out_qunlock;
346
347 al->al_requested = data_blocks + ind_blocks;
348 error = gfs2_inplace_reserve(ip);
349 if (error)
350 goto out_qunlock;
351 }
352
353 rblocks = RES_DINODE + ind_blocks;
354 if (gfs2_is_jdata(ip))
355 rblocks += data_blocks ? data_blocks : 1;
356 if (ind_blocks || data_blocks)
357 rblocks += RES_STATFS + RES_QUOTA;
358
359 error = gfs2_trans_begin(sdp, rblocks, 0);
360 if (error)
361 goto out;
362
363 if (gfs2_is_stuffed(ip)) {
364 if (end > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) {
365 error = gfs2_unstuff_dinode(ip, gfs2_unstuffer_page, page);
366 if (error == 0)
367 goto prepare_write;
368 } else if (!PageUptodate(page))
369 error = stuffed_readpage(ip, page);
370 goto out;
371 }
372
373prepare_write:
374 error = block_prepare_write(page, from, to, gfs2_get_block);
375
376out:
377 if (error) {
378 gfs2_trans_end(sdp);
379 if (alloc_required) {
380 gfs2_inplace_release(ip);
381out_qunlock:
382 gfs2_quota_unlock(ip);
383out_alloc_put:
384 gfs2_alloc_put(ip);
385 }
386out_unlock:
387 gfs2_glock_dq_m(1, &ip->i_gh);
388out_uninit:
389 gfs2_holder_uninit(&ip->i_gh);
390 }
391
392 return error;
393}
394
395/**
396 * gfs2_commit_write - Commit write to a file
397 * @file: The file to write to
398 * @page: The page containing the data
399 * @from: From (byte range within page)
400 * @to: To (byte range within page)
401 *
402 * Returns: errno
403 */
404
405static int gfs2_commit_write(struct file *file, struct page *page,
406 unsigned from, unsigned to)
407{
408 struct inode *inode = page->mapping->host;
409 struct gfs2_inode *ip = get_v2ip(inode);
410 struct gfs2_sbd *sdp = ip->i_sbd;
411 int error = -EOPNOTSUPP;
412 struct buffer_head *dibh;
413 struct gfs2_alloc *al = &ip->i_alloc;;
414
415 atomic_inc(&sdp->sd_ops_address);
416
417
418 if (gfs2_assert_withdraw(sdp, gfs2_glock_is_locked_by_me(ip->i_gl)))
419 goto fail_nounlock;
420
421 error = gfs2_meta_inode_buffer(ip, &dibh);
422 if (error)
423 goto fail_endtrans;
424
425 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
426
427 if (gfs2_is_stuffed(ip)) {
428 uint64_t file_size;
429 void *kaddr;
430
431 file_size = ((uint64_t)page->index << PAGE_CACHE_SHIFT) + to;
432
433 kaddr = kmap_atomic(page, KM_USER0);
434 memcpy(dibh->b_data + sizeof(struct gfs2_dinode) + from,
435 (char *)kaddr + from, to - from);
436 kunmap_atomic(page, KM_USER0);
437
438 SetPageUptodate(page);
439
440 if (inode->i_size < file_size)
441 i_size_write(inode, file_size);
442 } else {
443 if (sdp->sd_args.ar_data == GFS2_DATA_ORDERED || gfs2_is_jdata(ip))
444 gfs2_page_add_databufs(ip, page, from, to);
445 error = generic_commit_write(file, page, from, to);
446 if (error)
447 goto fail;
448 }
449
450 if (ip->i_di.di_size < inode->i_size)
451 ip->i_di.di_size = inode->i_size;
452
453 gfs2_dinode_out(&ip->i_di, dibh->b_data);
454 brelse(dibh);
455 gfs2_trans_end(sdp);
456 if (al->al_requested) {
457 gfs2_inplace_release(ip);
458 gfs2_quota_unlock(ip);
459 gfs2_alloc_put(ip);
460 }
461 gfs2_glock_dq_m(1, &ip->i_gh);
462 gfs2_holder_uninit(&ip->i_gh);
463 return 0;
464
465fail:
466 brelse(dibh);
467fail_endtrans:
468 gfs2_trans_end(sdp);
469 if (al->al_requested) {
470 gfs2_inplace_release(ip);
471 gfs2_quota_unlock(ip);
472 gfs2_alloc_put(ip);
473 }
474 gfs2_glock_dq_m(1, &ip->i_gh);
475 gfs2_holder_uninit(&ip->i_gh);
476fail_nounlock:
477 ClearPageUptodate(page);
478 return error;
479}
480
481/**
482 * gfs2_bmap - Block map function
483 * @mapping: Address space info
484 * @lblock: The block to map
485 *
486 * Returns: The disk address for the block or 0 on hole or error
487 */
488
489static sector_t gfs2_bmap(struct address_space *mapping, sector_t lblock)
490{
491 struct gfs2_inode *ip = get_v2ip(mapping->host);
492 struct gfs2_holder i_gh;
493 sector_t dblock = 0;
494 int error;
495
496 atomic_inc(&ip->i_sbd->sd_ops_address);
497
498 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
499 if (error)
500 return 0;
501
502 if (!gfs2_is_stuffed(ip))
503 dblock = generic_block_bmap(mapping, lblock, gfs2_get_block);
504
505 gfs2_glock_dq_uninit(&i_gh);
506
507 return dblock;
508}
509
/* Detach a buffer head from GFS2's logging machinery and reset its
 * state, as part of invalidating its page.  The bufdata back-pointer
 * is cleared under the log lock; the brelse() drops the reference the
 * bufdata held on the bh. */
static void discard_buffer(struct gfs2_sbd *sdp, struct buffer_head *bh)
{
	struct gfs2_bufdata *bd;

	gfs2_log_lock(sdp);
	bd = get_v2bd(bh);
	if (bd) {
		bd->bd_bh = NULL;
		set_v2bd(bh, NULL);
		gfs2_log_unlock(sdp);
		brelse(bh);
	} else
		gfs2_log_unlock(sdp);

	/* strip all mapping/IO state so the bh reads as never used */
	lock_buffer(bh);
	clear_buffer_dirty(bh);
	bh->b_bdev = NULL;
	clear_buffer_mapped(bh);
	clear_buffer_req(bh);
	clear_buffer_new(bh);
	clear_buffer_delay(bh);
	unlock_buffer(bh);
}
533
/* Invalidate (part of) a page: discard every buffer that starts at or
 * after @offset, and try to free the whole page when @offset is 0.
 * Returns 1 if the page was released (or had no buffers), else the
 * try_to_release_page() result. */
static int gfs2_invalidatepage(struct page *page, unsigned long offset)
{
	struct gfs2_sbd *sdp = get_v2sdp(page->mapping->host->i_sb);
	struct buffer_head *head, *bh, *next;
	unsigned int curr_off = 0;	/* byte offset of bh within the page */
	int ret = 1;

	BUG_ON(!PageLocked(page));
	if (!page_has_buffers(page))
		return 1;

	bh = head = page_buffers(page);
	do {
		unsigned int next_off = curr_off + bh->b_size;
		next = bh->b_this_page;

		/* only buffers lying wholly past @offset are discarded */
		if (offset <= curr_off)
			discard_buffer(sdp, bh);

		curr_off = next_off;
		bh = next;
	} while (bh != head);

	if (!offset)
		ret = try_to_release_page(page, 0);

	return ret;
}
562
static ssize_t gfs2_direct_IO_write(struct kiocb *iocb, const struct iovec *iov,
				    loff_t offset, unsigned long nr_segs)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file->f_mapping->host;
	struct gfs2_inode *ip = get_v2ip(inode);
	struct gfs2_holder gh;
	int rv;

	/*
	 * Shared lock, even though its write, since we do no allocation
	 * on this path. All we need change is atime.
	 */
	gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &gh);
	rv = gfs2_glock_nq_m_atime(1, &gh);
	if (rv)
		goto out;

	/*
	 * Should we return an error here? I can't see that O_DIRECT for
	 * a journaled file makes any sense. For now we'll silently fall
	 * back to buffered I/O, likewise we do the same for stuffed
	 * files since they are (a) small and (b) unaligned.
	 */
	/* rv is 0 here, so these paths return 0 — presumably the caller
	 * treats that as "fall back to buffered I/O"; verify in callers */
	if (gfs2_is_jdata(ip))
		goto out;

	if (gfs2_is_stuffed(ip))
		goto out;

	/* no-allocation mapping: every block must already exist */
	rv = __blockdev_direct_IO(WRITE, iocb, inode, inode->i_sb->s_bdev,
				  iov, offset, nr_segs, get_blocks_noalloc,
				  NULL, DIO_OWN_LOCKING);
out:
	gfs2_glock_dq_m(1, &gh);
	gfs2_holder_uninit(&gh);

	return rv;
}
602
603/**
604 * gfs2_direct_IO
605 *
606 * This is called with a shared lock already held for the read path.
607 * Currently, no locks are held when the write path is called.
608 */
static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb,
			      const struct iovec *iov, loff_t offset,
			      unsigned long nr_segs)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file->f_mapping->host;
	struct gfs2_inode *ip = get_v2ip(inode);
	struct gfs2_sbd *sdp = ip->i_sbd;

	atomic_inc(&sdp->sd_ops_address);

	/* writes take their own lock inside gfs2_direct_IO_write() */
	if (rw == WRITE)
		return gfs2_direct_IO_write(iocb, iov, offset, nr_segs);

	/* reads arrive with the shared glock already held, and stuffed
	 * files never come through the direct path */
	if (gfs2_assert_warn(sdp, gfs2_glock_is_locked_by_me(ip->i_gl)) ||
	    gfs2_assert_warn(sdp, !gfs2_is_stuffed(ip)))
		return -EINVAL;

	return __blockdev_direct_IO(READ, iocb, inode, inode->i_sb->s_bdev, iov,
				    offset, nr_segs, get_blocks, NULL,
				    DIO_OWN_LOCKING);
}
631
/* Address-space operations for regular GFS2 files.  Exported via
 * ops_address.h; the lookup callback shared by several paths is
 * gfs2_get_block(). */
struct address_space_operations gfs2_file_aops = {
	.writepage = gfs2_writepage,
	.readpage = gfs2_readpage,
	.sync_page = block_sync_page,
	.prepare_write = gfs2_prepare_write,
	.commit_write = gfs2_commit_write,
	.bmap = gfs2_bmap,
	.invalidatepage = gfs2_invalidatepage,
	.direct_IO = gfs2_direct_IO,
};
642
diff --git a/fs/gfs2/ops_address.h b/fs/gfs2/ops_address.h
new file mode 100644
index 000000000000..f201a059fd91
--- /dev/null
+++ b/fs/gfs2/ops_address.h
@@ -0,0 +1,17 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __OPS_ADDRESS_DOT_H__
11#define __OPS_ADDRESS_DOT_H__
12
/* Address-space operations table and block-mapping helper from ops_address.c */
13extern struct address_space_operations gfs2_file_aops;
14extern int gfs2_get_block(struct inode *inode, sector_t lblock,
15			  struct buffer_head *bh_result, int create);
16
17#endif /* __OPS_ADDRESS_DOT_H__ */
diff --git a/fs/gfs2/ops_dentry.c b/fs/gfs2/ops_dentry.c
new file mode 100644
index 000000000000..5c618611c11b
--- /dev/null
+++ b/fs/gfs2/ops_dentry.c
@@ -0,0 +1,117 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/smp_lock.h>
16#include <asm/semaphore.h>
17
18#include "gfs2.h"
19#include "dir.h"
20#include "glock.h"
21#include "ops_dentry.h"
22
23/**
24 * gfs2_drevalidate - Check directory lookup consistency
25 * @dentry: the mapping to check
 * @nd: nameidata of the lookup in progress (unused here)
27 *
28 * Check to make sure the lookup necessary to arrive at this inode from its
29 * parent is still good.
 *
 * Re-runs the directory search under the parent's shared glock and compares
 * the result with what the dcache holds.  Any error (including failure to
 * get the glock) is treated as "invalid" so the VFS will redo the lookup.
30 *
31 * Returns: 1 if the dentry is ok, 0 if it isn't
32 */
33
34static int gfs2_drevalidate(struct dentry *dentry, struct nameidata *nd)
35{
36	struct dentry *parent = dget_parent(dentry);
37	struct gfs2_inode *dip = get_v2ip(parent->d_inode);
38	struct gfs2_sbd *sdp = dip->i_sbd;
39	struct inode *inode;
40	struct gfs2_holder d_gh;
41	struct gfs2_inode *ip;
42	struct gfs2_inum inum;
43	unsigned int type;
44	int error;
45
	/* BKL guards the dcache manipulation below (pre-d_lock era code) */
46	lock_kernel();
47
48	atomic_inc(&sdp->sd_ops_dentry);
49
50	inode = dentry->d_inode;
51	if (inode && is_bad_inode(inode))
52		goto invalid;
53
54	error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh);
55	if (error)
56		goto fail;
57
58	error = gfs2_dir_search(dip, &dentry->d_name, &inum, &type);
59	switch (error) {
60	case 0:
		/* name exists on disk; a negative dentry is now stale */
61		if (!inode)
62			goto invalid_gunlock;
63		break;
64	case -ENOENT:
		/* name gone; a negative dentry is exactly right */
65		if (!inode)
66			goto valid_gunlock;
67		goto invalid_gunlock;
68	default:
69		goto fail_gunlock;
70	}
71
72	ip = get_v2ip(inode);
73
	/* same name but a different inode number means it was replaced */
74	if (!gfs2_inum_equal(&ip->i_num, &inum))
75		goto invalid_gunlock;
76
	/* type mismatch with matching inum indicates on-disk corruption */
77	if (IF2DT(ip->i_di.di_mode) != type) {
78		gfs2_consist_inode(dip);
79		goto fail_gunlock;
80	}
81
82 valid_gunlock:
83	gfs2_glock_dq_uninit(&d_gh);
84
85 valid:
86	unlock_kernel();
87	dput(parent);
88	return 1;
89
90 invalid_gunlock:
91	gfs2_glock_dq_uninit(&d_gh);
92
93 invalid:
	/* Cannot drop a directory that is a mount point; prune children first */
94	if (inode && S_ISDIR(inode->i_mode)) {
95		if (have_submounts(dentry))
96			goto valid;
97		shrink_dcache_parent(dentry);
98	}
99	d_drop(dentry);
100
101	unlock_kernel();
102	dput(parent);
103	return 0;
104
105 fail_gunlock:
106	gfs2_glock_dq_uninit(&d_gh);
107
108	fail:
109	unlock_kernel();
110	dput(parent);
111	return 0;
112}
113
/* Dentry operations installed on every GFS2 dentry; only revalidation
   is cluster-aware, all other hooks use the VFS defaults. */
114struct dentry_operations gfs2_dops = {
115	.d_revalidate = gfs2_drevalidate,
116};
117
diff --git a/fs/gfs2/ops_dentry.h b/fs/gfs2/ops_dentry.h
new file mode 100644
index 000000000000..94e3ee170165
--- /dev/null
+++ b/fs/gfs2/ops_dentry.h
@@ -0,0 +1,15 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __OPS_DENTRY_DOT_H__
11#define __OPS_DENTRY_DOT_H__
12
/* Dentry operations table defined in ops_dentry.c */
13extern struct dentry_operations gfs2_dops;
14
15#endif /* __OPS_DENTRY_DOT_H__ */
diff --git a/fs/gfs2/ops_export.c b/fs/gfs2/ops_export.c
new file mode 100644
index 000000000000..8389f771d28b
--- /dev/null
+++ b/fs/gfs2/ops_export.c
@@ -0,0 +1,303 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <asm/semaphore.h>
16
17#include "gfs2.h"
18#include "dir.h"
19#include "glock.h"
20#include "glops.h"
21#include "inode.h"
22#include "ops_export.h"
23#include "rgrp.h"
24
/**
 * gfs2_decode_fh - turn an NFS file handle back into a dentry
 *
 * Handles are either 4 or 8 big-endian 32-bit words: the child's
 * (no_formal_ino, no_addr) pair, optionally followed by the parent's.
 * fh_type must equal fh_len (see gfs2_encode_fh), anything else is
 * rejected as a foreign/corrupt handle.
 */
25static struct dentry *gfs2_decode_fh(struct super_block *sb,
26				     __u32 *fh,
27				     int fh_len,
28				     int fh_type,
29				     int (*acceptable)(void *context,
30						       struct dentry *dentry),
31				     void *context)
32{
33	struct gfs2_inum this, parent;
34
35	atomic_inc(&get_v2sdp(sb)->sd_ops_export);
36
37	if (fh_type != fh_len)
38		return NULL;
39
	/* a 4-word handle leaves the parent inum all-zero */
40	memset(&parent, 0, sizeof(struct gfs2_inum));
41
42	switch (fh_type) {
43	case 8:
44		parent.no_formal_ino = ((uint64_t)be32_to_cpu(fh[4])) << 32;
45		parent.no_formal_ino |= be32_to_cpu(fh[5]);
46		parent.no_addr = ((uint64_t)be32_to_cpu(fh[6])) << 32;
47		parent.no_addr |= be32_to_cpu(fh[7]);
		/* fall through: an 8-word handle also carries the child */
48	case 4:
49		this.no_formal_ino = ((uint64_t)be32_to_cpu(fh[0])) << 32;
50		this.no_formal_ino |= be32_to_cpu(fh[1]);
51		this.no_addr = ((uint64_t)be32_to_cpu(fh[2])) << 32;
52		this.no_addr |= be32_to_cpu(fh[3]);
53		break;
54	default:
55		return NULL;
56	}
57
	/* generic exportfs helper resolves via gfs2_get_dentry/get_parent */
58	return gfs2_export_ops.find_exported_dentry(sb, &this, &parent,
59						    acceptable, context);
60}
61
/**
 * gfs2_encode_fh - build an NFS file handle for a dentry
 *
 * Emits the child inum as 4 big-endian words, plus the parent inum
 * (another 4 words) when a connectable handle was requested and the
 * dentry is not the root.  Returns the word count used, which doubles
 * as the fh_type checked in gfs2_decode_fh.  255 is the exportfs
 * convention for "buffer too small".
 */
62static int gfs2_encode_fh(struct dentry *dentry, __u32 *fh, int *len,
63			  int connectable)
64{
65	struct inode *inode = dentry->d_inode;
66	struct gfs2_inode *ip = get_v2ip(inode);
67	struct gfs2_sbd *sdp = ip->i_sbd;
68
69	atomic_inc(&sdp->sd_ops_export);
70
71	if (*len < 4 || (connectable && *len < 8))
72		return 255;
73
74	fh[0] = ip->i_num.no_formal_ino >> 32;
75	fh[0] = cpu_to_be32(fh[0]);
76	fh[1] = ip->i_num.no_formal_ino & 0xFFFFFFFF;
77	fh[1] = cpu_to_be32(fh[1]);
78	fh[2] = ip->i_num.no_addr >> 32;
79	fh[2] = cpu_to_be32(fh[2]);
80	fh[3] = ip->i_num.no_addr & 0xFFFFFFFF;
81	fh[3] = cpu_to_be32(fh[3]);
82	*len = 4;
83
84	if (!connectable || ip == get_v2ip(sdp->sd_root_dir))
85		return *len;
86
	/* d_lock pins d_parent while we take a ref on the parent inode */
87	spin_lock(&dentry->d_lock);
88	inode = dentry->d_parent->d_inode;
89	ip = get_v2ip(inode);
90	gfs2_inode_hold(ip);
91	spin_unlock(&dentry->d_lock);
92
93	fh[4] = ip->i_num.no_formal_ino >> 32;
94	fh[4] = cpu_to_be32(fh[4]);
95	fh[5] = ip->i_num.no_formal_ino & 0xFFFFFFFF;
96	fh[5] = cpu_to_be32(fh[5]);
97	fh[6] = ip->i_num.no_addr >> 32;
98	fh[6] = cpu_to_be32(fh[6]);
99	fh[7] = ip->i_num.no_addr & 0xFFFFFFFF;
100	fh[7] = cpu_to_be32(fh[7]);
101	*len = 8;
102
103	gfs2_inode_put(ip);
104
105	return *len;
106}
107
/* Context for gfs2_get_name()'s directory scan: the inum we are looking
   for and the caller-supplied buffer the matching name is copied into. */
108struct get_name_filldir {
109	struct gfs2_inum inum;
110	char *name;
111};

/* Directory-read callback: copy out the entry whose inum matches and
   return 1 to stop the scan; 0 keeps scanning. */
113static int get_name_filldir(void *opaque, const char *name, unsigned int length,
114			    uint64_t offset, struct gfs2_inum *inum,
115			    unsigned int type)
116{
117	struct get_name_filldir *gnfd = (struct get_name_filldir *)opaque;
118
119	if (!gfs2_inum_equal(inum, &gnfd->inum))
120		return 0;
121
	/* NUL-terminate; caller's buffer is assumed big enough (NAME_MAX+1)
	   — TODO confirm against export_operations get_name contract */
122	memcpy(gnfd->name, name, length);
123	gnfd->name[length] = 0;
124
125	return 1;
126}
127
/**
 * gfs2_get_name - find the name of @child within @parent (NFS export hook)
 *
 * Scans the whole parent directory under a shared glock looking for an
 * entry whose inum matches the child, and copies that name into @name.
 * Returns 0 on success, -ENOENT if no entry matched, or a scan error.
 */
128static int gfs2_get_name(struct dentry *parent, char *name,
129			 struct dentry *child)
130{
131	struct inode *dir = parent->d_inode;
132	struct inode *inode = child->d_inode;
133	struct gfs2_inode *dip, *ip;
134	struct get_name_filldir gnfd;
135	struct gfs2_holder gh;
136	uint64_t offset = 0;
137	int error;
138
139	if (!dir)
140		return -EINVAL;
141
142	atomic_inc(&get_v2sdp(dir->i_sb)->sd_ops_export);
143
144	if (!S_ISDIR(dir->i_mode) || !inode)
145		return -EINVAL;
146
147	dip = get_v2ip(dir);
148	ip = get_v2ip(inode);
149
	/* empty name marks "not found yet" for the check below */
150	*name = 0;
151	gnfd.inum = ip->i_num;
152	gnfd.name = name;
153
154	error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &gh);
155	if (error)
156		return error;
157
158	error = gfs2_dir_read(dip, &offset, &gnfd, get_name_filldir);
159
160	gfs2_glock_dq_uninit(&gh);
161
162	if (!error && !*name)
163		error = -ENOENT;
164
165	return error;
166}
167
/**
 * gfs2_get_parent - get a dentry for ".." of an exported directory
 *
 * Looks up ".." via gfs2_lookupi (which takes its own locks) and wraps
 * the resulting inode in an anonymous dentry for exportfs to connect.
 */
168static struct dentry *gfs2_get_parent(struct dentry *child)
169{
170	struct gfs2_inode *dip = get_v2ip(child->d_inode);
171	struct qstr dotdot = { .name = "..", .len = 2 };
172	struct inode *inode;
173	struct dentry *dentry;
174	int error;
175
176	atomic_inc(&dip->i_sbd->sd_ops_export);
177
178	error = gfs2_lookupi(child->d_inode, &dotdot, 1, &inode);
179	if (error)
180		return ERR_PTR(error);
181
182	dentry = d_alloc_anon(inode);
183	if (!dentry) {
		/* drop the reference gfs2_lookupi gave us */
184		iput(inode);
185		return ERR_PTR(-ENOMEM);
186	}
187
188	return dentry;
189}
190
/**
 * gfs2_get_dentry - look up a dentry from a decoded file-handle inum
 *
 * Fast path: the inode is already in the cache (gfs2_iget); a stale
 * handle is detected by a no_formal_ino mismatch.  Slow path: take the
 * inode glock by block number, and if the in-core inode does not exist
 * yet, verify via the resource group bitmap that the block really is a
 * dinode before instantiating it — otherwise a forged/stale handle
 * could wrap an arbitrary block.  System inodes are never exported.
 */
191static struct dentry *gfs2_get_dentry(struct super_block *sb, void *inum_p)
192{
193	struct gfs2_sbd *sdp = get_v2sdp(sb);
194	struct gfs2_inum *inum = (struct gfs2_inum *)inum_p;
195	struct gfs2_holder i_gh, ri_gh, rgd_gh;
196	struct gfs2_rgrpd *rgd;
197	struct gfs2_inode *ip;
198	struct inode *inode;
199	struct dentry *dentry;
200	int error;
201
202	atomic_inc(&sdp->sd_ops_export);
203
204	/* System files? */
205
206	inode = gfs2_iget(sb, inum);
207	if (inode) {
208		ip = get_v2ip(inode);
209		if (ip->i_num.no_formal_ino != inum->no_formal_ino) {
210			iput(inode);
211			return ERR_PTR(-ESTALE);
212		}
213		goto out_inode;
214	}
215
216	error = gfs2_glock_nq_num(sdp,
217				  inum->no_addr, &gfs2_inode_glops,
218				  LM_ST_SHARED, LM_FLAG_ANY | GL_LOCAL_EXCL,
219				  &i_gh);
220	if (error)
221		return ERR_PTR(error);
222
	/* NO_CREATE: only attach if the in-core inode already exists */
223	error = gfs2_inode_get(i_gh.gh_gl, inum, NO_CREATE, &ip);
224	if (error)
225		goto fail;
226	if (ip)
227		goto out_ip;
228
229	error = gfs2_rindex_hold(sdp, &ri_gh);
230	if (error)
231		goto fail;
232
233	error = -EINVAL;
234	rgd = gfs2_blk2rgrpd(sdp, inum->no_addr);
235	if (!rgd)
236		goto fail_rindex;
237
238	error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_SHARED, 0, &rgd_gh);
239	if (error)
240		goto fail_rindex;
241
	/* the handle is stale if the block is no longer a dinode */
242	error = -ESTALE;
243	if (gfs2_get_block_type(rgd, inum->no_addr) != GFS2_BLKST_DINODE)
244		goto fail_rgd;
245
246	gfs2_glock_dq_uninit(&rgd_gh);
247	gfs2_glock_dq_uninit(&ri_gh);
248
249	error = gfs2_inode_get(i_gh.gh_gl, inum, CREATE, &ip);
250	if (error)
251		goto fail;
252
253	error = gfs2_inode_refresh(ip);
254	if (error) {
255		gfs2_inode_put(ip);
256		goto fail;
257	}
258
259	atomic_inc(&sdp->sd_fh2dentry_misses);
260
261 out_ip:
262	error = -EIO;
263	if (ip->i_di.di_flags & GFS2_DIF_SYSTEM) {
264		gfs2_inode_put(ip);
265		goto fail;
266	}
267
268	gfs2_glock_dq_uninit(&i_gh);
269
270	inode = gfs2_ip2v(ip);
271	gfs2_inode_put(ip);
272
273	if (!inode)
274		return ERR_PTR(-ENOMEM);
275
276 out_inode:
277	dentry = d_alloc_anon(inode);
278	if (!dentry) {
279		iput(inode);
280		return ERR_PTR(-ENOMEM);
281	}
282
283	return dentry;
284
285 fail_rgd:
286	gfs2_glock_dq_uninit(&rgd_gh);
287
288 fail_rindex:
289	gfs2_glock_dq_uninit(&ri_gh);
290
291 fail:
292	gfs2_glock_dq_uninit(&i_gh);
293	return ERR_PTR(error);
294}
295
/* NFS export operations; find_exported_dentry is filled in by the
   exportfs layer and used from gfs2_decode_fh above. */
296struct export_operations gfs2_export_ops = {
297	.decode_fh = gfs2_decode_fh,
298	.encode_fh = gfs2_encode_fh,
299	.get_name = gfs2_get_name,
300	.get_parent = gfs2_get_parent,
301	.get_dentry = gfs2_get_dentry,
302};
303
diff --git a/fs/gfs2/ops_export.h b/fs/gfs2/ops_export.h
new file mode 100644
index 000000000000..2f342f3d8755
--- /dev/null
+++ b/fs/gfs2/ops_export.h
@@ -0,0 +1,15 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __OPS_EXPORT_DOT_H__
11#define __OPS_EXPORT_DOT_H__
12
/* NFS export operations table defined in ops_export.c */
13extern struct export_operations gfs2_export_ops;
14
15#endif /* __OPS_EXPORT_DOT_H__ */
diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c
new file mode 100644
index 000000000000..cf2e26e07245
--- /dev/null
+++ b/fs/gfs2/ops_file.c
@@ -0,0 +1,968 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/pagemap.h>
16#include <linux/uio.h>
17#include <linux/blkdev.h>
18#include <linux/mm.h>
19#include <linux/smp_lock.h>
20#include <linux/gfs2_ioctl.h>
21#include <linux/fs.h>
22#include <asm/semaphore.h>
23#include <asm/uaccess.h>
24
25#include "gfs2.h"
26#include "bmap.h"
27#include "dir.h"
28#include "glock.h"
29#include "glops.h"
30#include "inode.h"
31#include "lm.h"
32#include "log.h"
33#include "meta_io.h"
34#include "ops_file.h"
35#include "ops_vm.h"
36#include "quota.h"
37#include "rgrp.h"
38#include "trans.h"
39
40/* "bad" is for NFS support */
/* One buffered directory entry, used when we must collect entries first
   and replay them to the VFS filldir afterwards (see readdir_bad). */
41struct filldir_bad_entry {
42	char *fbe_name;
43	unsigned int fbe_length;
44	uint64_t fbe_offset;
45	struct gfs2_inum fbe_inum;
46	unsigned int fbe_type;
47};

/* Buffering state for readdir_bad: a fixed array of entries plus a flat
   name arena, both carved out of one allocation. */
49struct filldir_bad {
50	struct gfs2_sbd *fdb_sbd;

52	struct filldir_bad_entry *fdb_entry;
53	unsigned int fdb_entry_num;
54	unsigned int fdb_entry_off;

56	char *fdb_name;
57	unsigned int fdb_name_size;
58	unsigned int fdb_name_off;
59};

61/* For regular, non-NFS */
/* Pass-through state: forwards each entry straight to the VFS filldir,
   optionally prefetching glocks for the named inodes. */
62struct filldir_reg {
63	struct gfs2_sbd *fdr_sbd;
64	int fdr_prefetch;

66	filldir_t fdr_filldir;
67	void *fdr_opaque;
68};

70/*
71 * Most fields left uninitialised to catch anybody who tries to
72 * use them. f_flags set to prevent file_accessed() from touching
73 * any other part of this. Its use is purely as a flag so that we
74 * know (in readpage()) whether or not do to locking.
 *
 * NOTE(review): "sentinal" is a typo for "sentinel", but the symbol has
 * external linkage and cannot be renamed here.
75 */
76struct file gfs2_internal_file_sentinal = {
77	.f_flags = O_NOATIME|O_RDONLY,
78};
79
80static int gfs2_read_actor(read_descriptor_t *desc, struct page *page,
81 unsigned long offset, unsigned long size)
82{
83 char *kaddr;
84 unsigned long count = desc->count;
85
86 if (size > count)
87 size = count;
88
89 kaddr = kmap(page);
90 memcpy(desc->arg.buf, kaddr + offset, size);
91 kunmap(page);
92
93 desc->count = count - size;
94 desc->written += size;
95 desc->arg.buf += size;
96 return size;
97}
98
/**
 * gfs2_internal_read - read file data into a kernel buffer
 * @ip: the inode to read from
 * @ra_state: readahead state for this internal reader
 * @buf: destination kernel buffer
 * @pos: file position, updated by the read
 * @size: number of bytes requested
 *
 * Drives the generic page-cache read path with gfs2_read_actor and the
 * internal-file sentinel (so file_accessed() and locking are skipped).
 *
 * Returns: bytes read, or a negative errno if nothing was read
 */
99int gfs2_internal_read(struct gfs2_inode *ip, struct file_ra_state *ra_state,
100		       char *buf, loff_t *pos, unsigned size)
101{
102	struct inode *inode = ip->i_vnode;
103	read_descriptor_t desc;
104	desc.written = 0;
105	desc.arg.buf = buf;
106	desc.count = size;
107	desc.error = 0;
108	do_generic_mapping_read(inode->i_mapping, ra_state,
109				&gfs2_internal_file_sentinal, pos, &desc,
110				gfs2_read_actor);
111	return desc.written ? desc.written : desc.error;
112}
113
114/**
115 * gfs2_llseek - seek to a location in a file
116 * @file: the file
117 * @offset: the offset
118 * @origin: Where to seek from (SEEK_SET, SEEK_CUR, or SEEK_END)
119 *
120 * SEEK_END requires the glock for the file because it references the
121 * file's size.
122 *
123 * Returns: The new offset, or errno
124 */
125
126static loff_t gfs2_llseek(struct file *file, loff_t offset, int origin)
127{
128	struct gfs2_inode *ip = get_v2ip(file->f_mapping->host);
129	struct gfs2_holder i_gh;
130	loff_t error;
131
132	atomic_inc(&ip->i_sbd->sd_ops_file);
133
	/* origin == 2 is SEEK_END: take a shared glock so i_size is current */
134	if (origin == 2) {
135		error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY,
136					   &i_gh);
137		if (!error) {
138			error = remote_llseek(file, offset, origin);
139			gfs2_glock_dq_uninit(&i_gh);
140		}
141	} else
142		error = remote_llseek(file, offset, origin);
143
144	return error;
145}
146
147
148static ssize_t gfs2_direct_IO_read(struct kiocb *iocb, const struct iovec *iov,
149 loff_t offset, unsigned long nr_segs)
150{
151 struct file *file = iocb->ki_filp;
152 struct address_space *mapping = file->f_mapping;
153 ssize_t retval;
154
155 retval = filemap_write_and_wait(mapping);
156 if (retval == 0) {
157 retval = mapping->a_ops->direct_IO(READ, iocb, iov, offset,
158 nr_segs);
159 }
160 return retval;
161}
162
163/**
164 * __gfs2_file_aio_read - The main GFS2 read function
165 *
166 * N.B. This is almost, but not quite the same as __generic_file_aio_read()
167 * the important subtle different being that inode->i_size isn't valid
168 * unless we are holding a lock, and we do this _only_ on the O_DIRECT
169 * path since otherwise locking is done entirely at the page cache
170 * layer.
 *
 * A stuffed (inline-data) inode cannot do direct I/O, so the O_DIRECT
 * path drops its glock and jumps to the buffered fallback below.
171 */
172static ssize_t __gfs2_file_aio_read(struct kiocb *iocb,
173				    const struct iovec *iov,
174				    unsigned long nr_segs, loff_t *ppos)
175{
176	struct file *filp = iocb->ki_filp;
177	struct gfs2_inode *ip = get_v2ip(filp->f_mapping->host);
178	struct gfs2_holder gh;
179	ssize_t retval;
180	unsigned long seg;
181	size_t count;
182
	/* validate the iovec: total length and per-segment writability */
183	count = 0;
184	for (seg = 0; seg < nr_segs; seg++) {
185		const struct iovec *iv = &iov[seg];
186
187		/*
188		 * If any segment has a negative length, or the cumulative
189		 * length ever wraps negative then return -EINVAL.
190		 */
191		count += iv->iov_len;
192		if (unlikely((ssize_t)(count|iv->iov_len) < 0))
193			return -EINVAL;
194		if (access_ok(VERIFY_WRITE, iv->iov_base, iv->iov_len))
195			continue;
196		if (seg == 0)
197			return -EFAULT;
		/* truncate the request at the first bad segment */
198		nr_segs = seg;
199		count -= iv->iov_len;	/* This segment is no good */
200		break;
201	}
202
203	/* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
204	if (filp->f_flags & O_DIRECT) {
205		loff_t pos = *ppos, size;
206		struct address_space *mapping;
207		struct inode *inode;
208
209		mapping = filp->f_mapping;
210		inode = mapping->host;
211		retval = 0;
212		if (!count)
213			goto out; /* skip atime */
214
		/* shared glock makes i_size trustworthy for the check below */
215		gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &gh);
216		retval = gfs2_glock_nq_m_atime(1, &gh);
217		if (retval)
218			goto out;
219		if (gfs2_is_stuffed(ip)) {
220			gfs2_glock_dq_m(1, &gh);
221			gfs2_holder_uninit(&gh);
222			goto fallback_to_normal;
223		}
224		size = i_size_read(inode);
225		if (pos < size) {
226			retval = gfs2_direct_IO_read(iocb, iov, pos, nr_segs);
227			if (retval > 0 && !is_sync_kiocb(iocb))
228				retval = -EIOCBQUEUED;
229			if (retval > 0)
230				*ppos = pos + retval;
231		}
232		file_accessed(filp);
233		gfs2_glock_dq_m(1, &gh);
234		gfs2_holder_uninit(&gh);
235		goto out;
236	}
237
	/* buffered path: page cache handles locking per-page */
238fallback_to_normal:
239	retval = 0;
240	if (count) {
241		for (seg = 0; seg < nr_segs; seg++) {
242			read_descriptor_t desc;
243
244			desc.written = 0;
245			desc.arg.buf = iov[seg].iov_base;
246			desc.count = iov[seg].iov_len;
247			if (desc.count == 0)
248				continue;
249			desc.error = 0;
250			do_generic_file_read(filp,ppos,&desc,file_read_actor);
251			retval += desc.written;
252			if (desc.error) {
				/* report the error only if nothing was read */
253				retval = retval ?: desc.error;
254				break;
255			}
256		}
257	}
258out:
259	return retval;
260}
261
262/**
263 * gfs2_read - Read bytes from a file
264 * @file: The file to read from
265 * @buf: The buffer to copy into
266 * @size: The amount of data requested
267 * @offset: The current file offset
268 *
269 * Outputs: Offset - updated according to number of bytes read
270 *
271 * Returns: The number of bytes read, errno on failure
 *
 * Synchronous wrapper: builds a one-segment iovec and a sync kiocb,
 * then waits if the aio path queued the I/O.
272 */
273
274static ssize_t gfs2_read(struct file *filp, char __user *buf, size_t size,
275			 loff_t *offset)
276{
277	struct iovec local_iov = { .iov_base = buf, .iov_len = size };
278	struct kiocb kiocb;
279	ssize_t ret;
280
281	atomic_inc(&get_v2sdp(filp->f_mapping->host->i_sb)->sd_ops_file);
282
283	init_sync_kiocb(&kiocb, filp);
284	ret = __gfs2_file_aio_read(&kiocb, &local_iov, 1, offset);
285	if (-EIOCBQUEUED == ret)
286		ret = wait_on_sync_kiocb(&kiocb);
287	return ret;
288}
289
290static ssize_t gfs2_file_readv(struct file *filp, const struct iovec *iov,
291 unsigned long nr_segs, loff_t *ppos)
292{
293 struct kiocb kiocb;
294 ssize_t ret;
295
296 atomic_inc(&get_v2sdp(filp->f_mapping->host->i_sb)->sd_ops_file);
297
298 init_sync_kiocb(&kiocb, filp);
299 ret = __gfs2_file_aio_read(&kiocb, iov, nr_segs, ppos);
300 if (-EIOCBQUEUED == ret)
301 ret = wait_on_sync_kiocb(&kiocb);
302 return ret;
303}
304
/**
 * gfs2_file_aio_read - asynchronous read entry point
 *
 * Thin wrapper: packs the single user buffer into an iovec and hands
 * off to __gfs2_file_aio_read() at the kiocb's current position.
 */
305static ssize_t gfs2_file_aio_read(struct kiocb *iocb, char __user *buf,
306				  size_t count, loff_t pos)
307{
308	struct file *filp = iocb->ki_filp;
309	struct iovec local_iov = { .iov_base = buf, .iov_len = count };
310
311	atomic_inc(&get_v2sdp(filp->f_mapping->host->i_sb)->sd_ops_file);
312
	/* the VFS guarantees pos matches the kiocb; anything else is a bug */
313	BUG_ON(iocb->ki_pos != pos);
314	return __gfs2_file_aio_read(iocb, &local_iov, 1, &iocb->ki_pos);
315}
316
317
318/**
319 * filldir_reg_func - Report a directory entry to the caller of gfs2_dir_read()
320 * @opaque: opaque data used by the function
321 * @name: the name of the directory entry
322 * @length: the length of the name
323 * @offset: the entry's offset in the directory
324 * @inum: the inode number the entry points to
325 * @type: the type of inode the entry points to
326 *
327 * Returns: 0 on success, 1 if buffer full
328 */
329
330static int filldir_reg_func(void *opaque, const char *name, unsigned int length,
331			    uint64_t offset, struct gfs2_inum *inum,
332			    unsigned int type)
333{
334	struct filldir_reg *fdr = (struct filldir_reg *)opaque;
335	struct gfs2_sbd *sdp = fdr->fdr_sbd;
336	int error;
337
	/* forward to the VFS filldir; nonzero means its buffer is full */
338	error = fdr->fdr_filldir(fdr->fdr_opaque, name, length, offset,
339				 inum->no_formal_ino, type);
340	if (error)
341		return 1;
342
	/* warm up the glocks the caller is likely to take next; skip "." */
343	if (fdr->fdr_prefetch && !(length == 1 && *name == '.')) {
344		gfs2_glock_prefetch_num(sdp,
345				       inum->no_addr, &gfs2_inode_glops,
346				       LM_ST_SHARED, LM_FLAG_TRY | LM_FLAG_ANY);
347		gfs2_glock_prefetch_num(sdp,
348				       inum->no_addr, &gfs2_iopen_glops,
349				       LM_ST_SHARED, LM_FLAG_TRY);
350	}
351
352	return 0;
353}
354
355/**
356 * readdir_reg - Read directory entries from a directory
357 * @file: The directory to read from
358 * @dirent: Buffer for dirents
359 * @filldir: Function used to do the copying
360 *
361 * Returns: errno
 *
 * Regular (non-nfsd) path: streams entries straight into the VFS
 * filldir under the directory's shared glock, with glock prefetch on.
362 */
363
364static int readdir_reg(struct file *file, void *dirent, filldir_t filldir)
365{
366	struct gfs2_inode *dip = get_v2ip(file->f_mapping->host);
367	struct filldir_reg fdr;
368	struct gfs2_holder d_gh;
369	uint64_t offset = file->f_pos;
370	int error;
371
372	fdr.fdr_sbd = dip->i_sbd;
373	fdr.fdr_prefetch = 1;
374	fdr.fdr_filldir = filldir;
375	fdr.fdr_opaque = dirent;
376
377	gfs2_holder_init(dip->i_gl, LM_ST_SHARED, GL_ATIME, &d_gh);
378	error = gfs2_glock_nq_atime(&d_gh);
379	if (error) {
380		gfs2_holder_uninit(&d_gh);
381		return error;
382	}
383
384	error = gfs2_dir_read(dip, &offset, &fdr, filldir_reg_func);
385
386	gfs2_glock_dq_uninit(&d_gh);
387
	/* gfs2_dir_read advanced offset past the entries it delivered */
388	file->f_pos = offset;
389
390	return error;
391}
392
393/**
394 * filldir_bad_func - Report a directory entry to the caller of gfs2_dir_read()
395 * @opaque: opaque data used by the function
396 * @name: the name of the directory entry
397 * @length: the length of the name
398 * @offset: the entry's offset in the directory
399 * @inum: the inode number the entry points to
400 * @type: the type of inode the entry points to
401 *
402 * For supporting NFS.
 *
 * Buffers the entry into the filldir_bad arrays instead of calling the
 * VFS filldir directly; readdir_bad replays the buffer afterwards.
403 *
404 * Returns: 0 on success, 1 if buffer full
405 */
406
407static int filldir_bad_func(void *opaque, const char *name, unsigned int length,
408			    uint64_t offset, struct gfs2_inum *inum,
409			    unsigned int type)
410{
411	struct filldir_bad *fdb = (struct filldir_bad *)opaque;
412	struct gfs2_sbd *sdp = fdb->fdb_sbd;
413	struct filldir_bad_entry *fbe;
414
	/* stop when either the entry array or the name arena is exhausted */
415	if (fdb->fdb_entry_off == fdb->fdb_entry_num ||
416	    fdb->fdb_name_off + length > fdb->fdb_name_size)
417		return 1;
418
419	fbe = &fdb->fdb_entry[fdb->fdb_entry_off];
420	fbe->fbe_name = fdb->fdb_name + fdb->fdb_name_off;
421	memcpy(fbe->fbe_name, name, length);
422	fbe->fbe_length = length;
423	fbe->fbe_offset = offset;
424	fbe->fbe_inum = *inum;
425	fbe->fbe_type = type;
426
427	fdb->fdb_entry_off++;
428	fdb->fdb_name_off += length;
429
	/* same glock prefetch as the regular path; skip "." */
430	if (!(length == 1 && *name == '.')) {
431		gfs2_glock_prefetch_num(sdp,
432				       inum->no_addr, &gfs2_inode_glops,
433				       LM_ST_SHARED, LM_FLAG_TRY | LM_FLAG_ANY);
434		gfs2_glock_prefetch_num(sdp,
435				       inum->no_addr, &gfs2_iopen_glops,
436				       LM_ST_SHARED, LM_FLAG_TRY);
437	}
438
439	return 0;
440}
441
442/**
443 * readdir_bad - Read directory entries from a directory
444 * @file: The directory to read from
445 * @dirent: Buffer for dirents
446 * @filldir: Function used to do the copying
447 *
448 * For supporting NFS.
 *
 * Collects a bounded batch of entries into one allocation while holding
 * the directory glock, then replays them to the VFS filldir after the
 * glock is dropped.  This avoids calling back into the VFS (and hence
 * potentially back into the filesystem) with the glock held.
449 *
450 * Returns: errno
451 */
452
453static int readdir_bad(struct file *file, void *dirent, filldir_t filldir)
454{
455	struct gfs2_inode *dip = get_v2ip(file->f_mapping->host);
456	struct gfs2_sbd *sdp = dip->i_sbd;
457	struct filldir_reg fdr;
458	unsigned int entries, size;
459	struct filldir_bad *fdb;
460	struct gfs2_holder d_gh;
461	uint64_t offset = file->f_pos;
462	unsigned int x;
463	struct filldir_bad_entry *fbe;
464	int error;
465
	/* one allocation: header, then entry array, then flat name arena */
466	entries = gfs2_tune_get(sdp, gt_entries_per_readdir);
467	size = sizeof(struct filldir_bad) +
468	    entries * (sizeof(struct filldir_bad_entry) + GFS2_FAST_NAME_SIZE);
469
470	fdb = kzalloc(size, GFP_KERNEL);
471	if (!fdb)
472		return -ENOMEM;
473
474	fdb->fdb_sbd = sdp;
475	fdb->fdb_entry = (struct filldir_bad_entry *)(fdb + 1);
476	fdb->fdb_entry_num = entries;
477	fdb->fdb_name = ((char *)fdb) + sizeof(struct filldir_bad) +
478		entries * sizeof(struct filldir_bad_entry);
479	fdb->fdb_name_size = entries * GFS2_FAST_NAME_SIZE;
480
481	gfs2_holder_init(dip->i_gl, LM_ST_SHARED, GL_ATIME, &d_gh);
482	error = gfs2_glock_nq_atime(&d_gh);
483	if (error) {
484		gfs2_holder_uninit(&d_gh);
485		goto out;
486	}
487
488	error = gfs2_dir_read(dip, &offset, fdb, filldir_bad_func);
489
490	gfs2_glock_dq_uninit(&d_gh);
491
	/* replay the buffered batch; prefetch already done during collection */
492	fdr.fdr_sbd = sdp;
493	fdr.fdr_prefetch = 0;
494	fdr.fdr_filldir = filldir;
495	fdr.fdr_opaque = dirent;
496
497	for (x = 0; x < fdb->fdb_entry_off; x++) {
498		fbe = &fdb->fdb_entry[x];
499
500		error = filldir_reg_func(&fdr,
501					 fbe->fbe_name, fbe->fbe_length,
502					 fbe->fbe_offset,
503					 &fbe->fbe_inum, fbe->fbe_type);
504		if (error) {
			/* caller's buffer filled mid-batch: resume here next time */
505			file->f_pos = fbe->fbe_offset;
506			error = 0;
507			goto out;
508		}
509	}
510
511	file->f_pos = offset;
512
513 out:
514	kfree(fdb);
515
516	return error;
517}
518
519/**
520 * gfs2_readdir - Read directory entries from a directory
521 * @file: The directory to read from
522 * @dirent: Buffer for dirents
523 * @filldir: Function used to do the copying
524 *
525 * Returns: errno
526 */
527
528static int gfs2_readdir(struct file *file, void *dirent, filldir_t filldir)
529{
530 int error;
531
532 atomic_inc(&get_v2sdp(file->f_mapping->host->i_sb)->sd_ops_file);
533
534 if (strcmp(current->comm, "nfsd") != 0)
535 error = readdir_reg(file, dirent, filldir);
536 else
537 error = readdir_bad(file, dirent, filldir);
538
539 return error;
540}
541
/**
 * gfs2_ioctl_flags - get or set the GFS2 dinode flags
 * @ip: the inode
 * @cmd: GFS2_IOCTL_SETFLAGS or GFS2_IOCTL_GETFLAGS
 * @arg: userspace pointer to a __u32 flags word
 *
 * SETFLAGS takes the glock exclusively, validates permissions and flag
 * applicability, and rewrites the dinode inside a transaction.
 * GETFLAGS reads the flags under a shared glock and copies them out.
 *
 * Returns: errno
 */
542static int gfs2_ioctl_flags(struct gfs2_inode *ip, unsigned int cmd, unsigned long arg)
543{
544	unsigned int lmode = (cmd == GFS2_IOCTL_SETFLAGS) ? LM_ST_EXCLUSIVE : LM_ST_SHARED;
545	struct buffer_head *dibh;
546	struct gfs2_holder i_gh;
547	int error;
548	__u32 flags = 0, change;
549
550	if (cmd == GFS2_IOCTL_SETFLAGS) {
551		error = get_user(flags, (__u32 __user *)arg);
552		if (error)
553			return -EFAULT;
554	}
555
556	error = gfs2_glock_nq_init(ip->i_gl, lmode, 0, &i_gh);
557	if (error)
558		return error;
559
560	if (cmd == GFS2_IOCTL_SETFLAGS) {
561		change = flags ^ ip->i_di.di_flags;
		/* toggling immutable/append-only needs CAP_LINUX_IMMUTABLE */
562		error = -EPERM;
563		if (change & (GFS2_DIF_IMMUTABLE|GFS2_DIF_APPENDONLY)) {
564			if (!capable(CAP_LINUX_IMMUTABLE))
565				goto out;
566		}
		/* jdata/directio apply to files; the inherit forms to dirs */
567		error = -EINVAL;
568		if (flags & (GFS2_DIF_JDATA|GFS2_DIF_DIRECTIO)) {
569			if (!S_ISREG(ip->i_di.di_mode))
570				goto out;
571		}
572		if (flags & (GFS2_DIF_INHERIT_JDATA|GFS2_DIF_INHERIT_DIRECTIO)) {
573			if (!S_ISDIR(ip->i_di.di_mode))
574				goto out;
575		}
576
577		error = gfs2_trans_begin(ip->i_sbd, RES_DINODE, 0);
578		if (error)
579			goto out;
580
581		error = gfs2_meta_inode_buffer(ip, &dibh);
582		if (error)
583			goto out_trans_end;
584
585		ip->i_di.di_flags = flags;
586
		/* journal the dinode buffer and write the updated flags back */
587		gfs2_trans_add_bh(ip->i_gl, dibh, 1);
588		gfs2_dinode_out(&ip->i_di, dibh->b_data);
589
590		brelse(dibh);
591
592out_trans_end:
593		gfs2_trans_end(ip->i_sbd);
594	} else {
595		flags = ip->i_di.di_flags;
596	}
597out:
598	gfs2_glock_dq_uninit(&i_gh);
	/* copy-out happens after the glock is dropped */
599	if (cmd == GFS2_IOCTL_GETFLAGS) {
600		if (put_user(flags, (__u32 __user *)arg))
601			return -EFAULT;
602	}
603	return error;
604}
605
606/**
607 * gfs2_ioctl - do an ioctl on a file
608 * @inode: the inode
609 * @file: the file pointer
610 * @cmd: the ioctl command
611 * @arg: the argument
612 *
613 * Returns: errno
614 */
615
616static int gfs2_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
617 unsigned long arg)
618{
619 struct gfs2_inode *ip = get_v2ip(inode);
620
621 atomic_inc(&ip->i_sbd->sd_ops_file);
622
623 switch (cmd) {
624 case GFS2_IOCTL_SETFLAGS:
625 case GFS2_IOCTL_GETFLAGS:
626 return gfs2_ioctl_flags(ip, cmd, arg);
627
628 default:
629 return -ENOTTY;
630 }
631}
632
633/**
634 * gfs2_mmap -
635 * @file: The file to map
636 * @vma: The VMA which described the mapping
637 *
638 * Returns: 0 or error code
 *
 * Takes the glock briefly (for atime) and selects the vm_ops: shared
 * writable mappings need the allocate-on-fault variant.
639 */
640
641static int gfs2_mmap(struct file *file, struct vm_area_struct *vma)
642{
643	struct gfs2_inode *ip = get_v2ip(file->f_mapping->host);
644	struct gfs2_holder i_gh;
645	int error;
646
647	atomic_inc(&ip->i_sbd->sd_ops_file);
648
649	gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &i_gh);
650	error = gfs2_glock_nq_atime(&i_gh);
651	if (error) {
652		gfs2_holder_uninit(&i_gh);
653		return error;
654	}
655
656	/* This is VM_MAYWRITE instead of VM_WRITE because a call
657	   to mprotect() can turn on VM_WRITE later. */
658
659	if ((vma->vm_flags & (VM_MAYSHARE | VM_MAYWRITE)) ==
660	    (VM_MAYSHARE | VM_MAYWRITE))
661		vma->vm_ops = &gfs2_vm_ops_sharewrite;
662	else
663		vma->vm_ops = &gfs2_vm_ops_private;
664
665	gfs2_glock_dq_uninit(&i_gh);
666
	/* error is 0 here; the failure path returned above */
667	return error;
668}
669
670/**
671 * gfs2_open - open a file
672 * @inode: the inode to open
673 * @file: the struct file for this opening
674 *
675 * Returns: errno
 *
 * Allocates the per-open gfs2_file state and, for regular files,
 * checks the LFS size limit and honours the on-disk DIRECTIO flag.
676 */
677
678static int gfs2_open(struct inode *inode, struct file *file)
679{
680	struct gfs2_inode *ip = get_v2ip(inode);
681	struct gfs2_holder i_gh;
682	struct gfs2_file *fp;
683	int error;
684
685	atomic_inc(&ip->i_sbd->sd_ops_file);
686
687	fp = kzalloc(sizeof(struct gfs2_file), GFP_KERNEL);
688	if (!fp)
689		return -ENOMEM;
690
691	mutex_init(&fp->f_fl_mutex);
692
693	fp->f_inode = ip;
694	fp->f_vfile = file;
695
	/* no private data should be attached to this file yet */
696	gfs2_assert_warn(ip->i_sbd, !get_v2fp(file));
697	set_v2fp(file, fp);
698
699	if (S_ISREG(ip->i_di.di_mode)) {
700		error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY,
701					   &i_gh);
702		if (error)
703			goto fail;
704
		/* glock held: di_size is valid for the large-file check */
705		if (!(file->f_flags & O_LARGEFILE) &&
706		    ip->i_di.di_size > MAX_NON_LFS) {
707			error = -EFBIG;
708			goto fail_gunlock;
709		}
710
711		/* Listen to the Direct I/O flag */
712
713		if (ip->i_di.di_flags & GFS2_DIF_DIRECTIO)
714			file->f_flags |= O_DIRECT;
715
716		gfs2_glock_dq_uninit(&i_gh);
717	}
718
719	return 0;
720
721 fail_gunlock:
722	gfs2_glock_dq_uninit(&i_gh);
723
724 fail:
725	set_v2fp(file, NULL);
726	kfree(fp);
727
728	return error;
729}
730
731/**
732 * gfs2_close - called to close a struct file
733 * @inode: the inode the struct file belongs to
734 * @file: the struct file being closed
735 *
736 * Returns: errno
737 */
738
739static int gfs2_close(struct inode *inode, struct file *file)
740{
741 struct gfs2_sbd *sdp = get_v2sdp(inode->i_sb);
742 struct gfs2_file *fp;
743
744 atomic_inc(&sdp->sd_ops_file);
745
746 fp = get_v2fp(file);
747 set_v2fp(file, NULL);
748
749 if (gfs2_assert_warn(sdp, fp))
750 return -EIO;
751
752 kfree(fp);
753
754 return 0;
755}
756
757/**
758 * gfs2_fsync - sync the dirty data for a file (across the cluster)
759 * @file: the file that points to the dentry (we ignore this)
760 * @dentry: the dentry that points to the inode to sync
761 *
762 * Returns: errno
763 */
764
765static int gfs2_fsync(struct file *file, struct dentry *dentry, int datasync)
766{
767 struct gfs2_inode *ip = get_v2ip(dentry->d_inode);
768
769 atomic_inc(&ip->i_sbd->sd_ops_file);
770 gfs2_log_flush_glock(ip->i_gl);
771
772 return 0;
773}
774
/**
 * gfs2_lock - acquire/release a posix lock on a file
 * @file: the file pointer
 * @cmd: either modify or retrieve lock state, possibly wait
 * @fl: type and range of lock
 *
 * With the "localflocks" mount option the lock stays node-local and is
 * handled by the generic posix-lock code; otherwise the request is
 * forwarded to the cluster lock manager as a plock keyed on this
 * inode's block address.
 *
 * Returns: errno
 */

static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl)
{
	struct gfs2_inode *ip = get_v2ip(file->f_mapping->host);
	struct gfs2_sbd *sdp = ip->i_sbd;
	struct lm_lockname name =
		{ .ln_number = ip->i_num.no_addr,
		  .ln_type = LM_TYPE_PLOCK };

	atomic_inc(&sdp->sd_ops_file);

	if (!(fl->fl_flags & FL_POSIX))
		return -ENOLCK;
	/* setgid without group-exec marks mandatory locking: unsupported */
	if ((ip->i_di.di_mode & (S_ISGID | S_IXGRP)) == S_ISGID)
		return -ENOLCK;

	if (sdp->sd_args.ar_localflocks) {
		if (IS_GETLK(cmd)) {
			struct file_lock *tmp;
			lock_kernel();
			tmp = posix_test_lock(file, fl);
			/* report F_UNLCK unless a conflicting lock exists */
			fl->fl_type = F_UNLCK;
			if (tmp)
				memcpy(fl, tmp, sizeof(struct file_lock));
			unlock_kernel();
			return 0;
		} else {
			int error;
			lock_kernel();
			error = posix_lock_file_wait(file, fl);
			unlock_kernel();
			return error;
		}
	}

	if (IS_GETLK(cmd))
		return gfs2_lm_plock_get(sdp, &name, file, fl);
	else if (fl->fl_type == F_UNLCK)
		return gfs2_lm_punlock(sdp, &name, file, fl);
	else
		return gfs2_lm_plock(sdp, &name, file, cmd, fl);
}
825
/**
 * gfs2_sendfile - Send bytes from a file to a consumer (file or socket)
 * @in_file: The file to read from
 * @offset: The beginning file offset; updated by the number of bytes read
 * @count: The amount of data
 * @actor: Routine invoked on each chunk of read data
 * @target: Opaque cookie passed through to @actor (e.g. the destination)
 *
 * Thin wrapper around generic_file_sendfile() that only bumps the
 * per-sb file-operation counter first.
 *
 * Returns: The number of bytes sent, errno on failure
 */

static ssize_t gfs2_sendfile(struct file *in_file, loff_t *offset, size_t count,
			     read_actor_t actor, void *target)
{
	struct gfs2_inode *ip = get_v2ip(in_file->f_mapping->host);

	atomic_inc(&ip->i_sbd->sd_ops_file);

	return generic_file_sendfile(in_file, offset, count, actor, target);
}
847
/*
 * do_flock - back a BSD flock with a cluster-wide flock glock
 *
 * F_WRLCK maps to the exclusive glock state, anything else to shared.
 * Non-blocking requests (not SETLKW) use LM_FLAG_TRY so the glock
 * acquisition fails with GLR_TRYFAILED instead of sleeping, which is
 * translated to -EAGAIN for the caller.
 */
static int do_flock(struct file *file, int cmd, struct file_lock *fl)
{
	struct gfs2_file *fp = get_v2fp(file);
	struct gfs2_holder *fl_gh = &fp->f_fl_gh;
	struct gfs2_inode *ip = fp->f_inode;
	struct gfs2_glock *gl;
	unsigned int state;
	int flags;
	int error = 0;

	state = (fl->fl_type == F_WRLCK) ? LM_ST_EXCLUSIVE : LM_ST_SHARED;
	flags = ((IS_SETLKW(cmd)) ? 0 : LM_FLAG_TRY) | GL_EXACT | GL_NOCACHE;

	mutex_lock(&fp->f_fl_mutex);

	gl = fl_gh->gh_gl;
	if (gl) {
		/* an flock glock is already held: nothing to do if the
		   state matches; otherwise drop the local flock and the
		   old glock before re-acquiring in the new state.  The
		   extra hold keeps gl alive across the dq_uninit. */
		if (fl_gh->gh_state == state)
			goto out;
		gfs2_glock_hold(gl);
		flock_lock_file_wait(file,
				     &(struct file_lock){.fl_type = F_UNLCK});
		gfs2_glock_dq_uninit(fl_gh);
	} else {
		error = gfs2_glock_get(ip->i_sbd,
				       ip->i_num.no_addr, &gfs2_flock_glops,
				       CREATE, &gl);
		if (error)
			goto out;
	}

	/* presumably gfs2_holder_init takes its own glock reference, so
	   this put balances the hold/get above -- TODO confirm */
	gfs2_holder_init(gl, state, flags, fl_gh);
	gfs2_glock_put(gl);

	error = gfs2_glock_nq(fl_gh);
	if (error) {
		gfs2_holder_uninit(fl_gh);
		if (error == GLR_TRYFAILED)
			error = -EAGAIN;
	} else {
		error = flock_lock_file_wait(file, fl);
		gfs2_assert_warn(ip->i_sbd, !error);
	}

 out:
	mutex_unlock(&fp->f_fl_mutex);

	return error;
}
897
898static void do_unflock(struct file *file, struct file_lock *fl)
899{
900 struct gfs2_file *fp = get_v2fp(file);
901 struct gfs2_holder *fl_gh = &fp->f_fl_gh;
902
903 mutex_lock(&fp->f_fl_mutex);
904 flock_lock_file_wait(file, fl);
905 if (fl_gh->gh_gl)
906 gfs2_glock_dq_uninit(fl_gh);
907 mutex_unlock(&fp->f_fl_mutex);
908}
909
910/**
911 * gfs2_flock - acquire/release a flock lock on a file
912 * @file: the file pointer
913 * @cmd: either modify or retrieve lock state, possibly wait
914 * @fl: type and range of lock
915 *
916 * Returns: errno
917 */
918
919static int gfs2_flock(struct file *file, int cmd, struct file_lock *fl)
920{
921 struct gfs2_inode *ip = get_v2ip(file->f_mapping->host);
922 struct gfs2_sbd *sdp = ip->i_sbd;
923
924 atomic_inc(&ip->i_sbd->sd_ops_file);
925
926 if (!(fl->fl_flags & FL_FLOCK))
927 return -ENOLCK;
928 if ((ip->i_di.di_mode & (S_ISGID | S_IXGRP)) == S_ISGID)
929 return -ENOLCK;
930
931 if (sdp->sd_args.ar_localflocks)
932 return flock_lock_file_wait(file, fl);
933
934 if (fl->fl_type == F_UNLCK) {
935 do_unflock(file, fl);
936 return 0;
937 } else
938 return do_flock(file, cmd, fl);
939}
940
/* File operations for regular GFS2 files */
struct file_operations gfs2_file_fops = {
	.llseek = gfs2_llseek,
	.read = gfs2_read,
	.readv = gfs2_file_readv,
	.aio_read = gfs2_file_aio_read,
	.write = generic_file_write,
	.writev = generic_file_writev,
	.aio_write = generic_file_aio_write,
	.ioctl = gfs2_ioctl,
	.mmap = gfs2_mmap,
	.open = gfs2_open,
	.release = gfs2_close,
	.fsync = gfs2_fsync,
	.lock = gfs2_lock,
	.sendfile = gfs2_sendfile,
	.flock = gfs2_flock,
};
958
/* File operations for GFS2 directories */
struct file_operations gfs2_dir_fops = {
	.readdir = gfs2_readdir,
	.ioctl = gfs2_ioctl,
	.open = gfs2_open,
	.release = gfs2_close,
	.fsync = gfs2_fsync,
	.lock = gfs2_lock,
	.flock = gfs2_flock,
};
968
diff --git a/fs/gfs2/ops_file.h b/fs/gfs2/ops_file.h
new file mode 100644
index 000000000000..192577b411f0
--- /dev/null
+++ b/fs/gfs2/ops_file.h
@@ -0,0 +1,20 @@
/*
 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
 *
 * This copyrighted material is made available to anyone wishing to use,
 * modify, copy, or redistribute it subject to the terms and conditions
 * of the GNU General Public License v.2.
 */

#ifndef __OPS_FILE_DOT_H__
#define __OPS_FILE_DOT_H__
/* NOTE(review): "sentinal" is a misspelling of "sentinel", but the name
   is referenced by other translation units and is kept as-is */
extern struct file gfs2_internal_file_sentinal;
extern int gfs2_internal_read(struct gfs2_inode *ip,
			      struct file_ra_state *ra_state,
			      char *buf, loff_t *pos, unsigned size);

extern struct file_operations gfs2_file_fops;
extern struct file_operations gfs2_dir_fops;

#endif /* __OPS_FILE_DOT_H__ */
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
new file mode 100644
index 000000000000..751178ab497c
--- /dev/null
+++ b/fs/gfs2/ops_fstype.c
@@ -0,0 +1,882 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/vmalloc.h>
16#include <linux/blkdev.h>
17#include <linux/kthread.h>
18#include <asm/semaphore.h>
19
20#include "gfs2.h"
21#include "daemon.h"
22#include "glock.h"
23#include "glops.h"
24#include "inode.h"
25#include "lm.h"
26#include "mount.h"
27#include "ops_export.h"
28#include "ops_fstype.h"
29#include "ops_super.h"
30#include "recovery.h"
31#include "rgrp.h"
32#include "super.h"
33#include "unlinked.h"
34#include "sys.h"
35
36#define DO 0
37#define UNDO 1
38
39static struct gfs2_sbd *init_sbd(struct super_block *sb)
40{
41 struct gfs2_sbd *sdp;
42 unsigned int x;
43
44 sdp = vmalloc(sizeof(struct gfs2_sbd));
45 if (!sdp)
46 return NULL;
47
48 memset(sdp, 0, sizeof(struct gfs2_sbd));
49
50 set_v2sdp(sb, sdp);
51 sdp->sd_vfs = sb;
52
53 gfs2_tune_init(&sdp->sd_tune);
54
55 for (x = 0; x < GFS2_GL_HASH_SIZE; x++) {
56 sdp->sd_gl_hash[x].hb_lock = RW_LOCK_UNLOCKED;
57 INIT_LIST_HEAD(&sdp->sd_gl_hash[x].hb_list);
58 }
59 INIT_LIST_HEAD(&sdp->sd_reclaim_list);
60 spin_lock_init(&sdp->sd_reclaim_lock);
61 init_waitqueue_head(&sdp->sd_reclaim_wq);
62 mutex_init(&sdp->sd_invalidate_inodes_mutex);
63
64 mutex_init(&sdp->sd_inum_mutex);
65 spin_lock_init(&sdp->sd_statfs_spin);
66 mutex_init(&sdp->sd_statfs_mutex);
67
68 spin_lock_init(&sdp->sd_rindex_spin);
69 mutex_init(&sdp->sd_rindex_mutex);
70 INIT_LIST_HEAD(&sdp->sd_rindex_list);
71 INIT_LIST_HEAD(&sdp->sd_rindex_mru_list);
72 INIT_LIST_HEAD(&sdp->sd_rindex_recent_list);
73
74 INIT_LIST_HEAD(&sdp->sd_jindex_list);
75 spin_lock_init(&sdp->sd_jindex_spin);
76 mutex_init(&sdp->sd_jindex_mutex);
77
78 INIT_LIST_HEAD(&sdp->sd_unlinked_list);
79 spin_lock_init(&sdp->sd_unlinked_spin);
80 mutex_init(&sdp->sd_unlinked_mutex);
81
82 INIT_LIST_HEAD(&sdp->sd_quota_list);
83 spin_lock_init(&sdp->sd_quota_spin);
84 mutex_init(&sdp->sd_quota_mutex);
85
86 spin_lock_init(&sdp->sd_log_lock);
87 init_waitqueue_head(&sdp->sd_log_trans_wq);
88 init_waitqueue_head(&sdp->sd_log_flush_wq);
89
90 INIT_LIST_HEAD(&sdp->sd_log_le_gl);
91 INIT_LIST_HEAD(&sdp->sd_log_le_buf);
92 INIT_LIST_HEAD(&sdp->sd_log_le_revoke);
93 INIT_LIST_HEAD(&sdp->sd_log_le_rg);
94 INIT_LIST_HEAD(&sdp->sd_log_le_databuf);
95
96 INIT_LIST_HEAD(&sdp->sd_log_blks_list);
97 init_waitqueue_head(&sdp->sd_log_blks_wait);
98
99 INIT_LIST_HEAD(&sdp->sd_ail1_list);
100 INIT_LIST_HEAD(&sdp->sd_ail2_list);
101
102 mutex_init(&sdp->sd_log_flush_lock);
103 INIT_LIST_HEAD(&sdp->sd_log_flush_list);
104
105 INIT_LIST_HEAD(&sdp->sd_revoke_list);
106
107 mutex_init(&sdp->sd_freeze_lock);
108
109 return sdp;
110}
111
/*
 * init_vfs - fill in the VFS super_block fields GFS2 controls and set a
 * provisional block size so the on-disk superblock can be read
 */
static void init_vfs(struct gfs2_sbd *sdp)
{
	struct super_block *sb = sdp->sd_vfs;

	sb->s_magic = GFS2_MAGIC;
	sb->s_op = &gfs2_super_ops;
	sb->s_export_op = &gfs2_export_ops;
	sb->s_maxbytes = MAX_LFS_FILESIZE;

	/* remember the user's noatime preference before overriding it */
	if (sb->s_flags & (MS_NOATIME | MS_NODIRATIME))
		set_bit(SDF_NOATIME, &sdp->sd_flags);

	/* Don't let the VFS update atimes.  GFS2 handles this itself. */
	sb->s_flags |= MS_NOATIME | MS_NODIRATIME;

	/* Set up the buffer cache and fill in some fake block size values
	   to allow us to read-in the on-disk superblock. */
	sdp->sd_sb.sb_bsize = sb_min_blocksize(sb, GFS2_BASIC_BLOCK);
	sdp->sd_sb.sb_bsize_shift = sb->s_blocksize_bits;
	sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift - GFS2_BASIC_BLOCK_SHIFT;
	sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift;
}
134
135static int init_names(struct gfs2_sbd *sdp, int silent)
136{
137 struct gfs2_sb *sb = NULL;
138 char *proto, *table;
139 int error = 0;
140
141 proto = sdp->sd_args.ar_lockproto;
142 table = sdp->sd_args.ar_locktable;
143
144 /* Try to autodetect */
145
146 if (!proto[0] || !table[0]) {
147 struct buffer_head *bh;
148 bh = sb_getblk(sdp->sd_vfs,
149 GFS2_SB_ADDR >> sdp->sd_fsb2bb_shift);
150 lock_buffer(bh);
151 clear_buffer_uptodate(bh);
152 clear_buffer_dirty(bh);
153 unlock_buffer(bh);
154 ll_rw_block(READ, 1, &bh);
155 wait_on_buffer(bh);
156
157 if (!buffer_uptodate(bh)) {
158 brelse(bh);
159 return -EIO;
160 }
161
162 sb = kmalloc(sizeof(struct gfs2_sb), GFP_KERNEL);
163 if (!sb) {
164 brelse(bh);
165 return -ENOMEM;
166 }
167 gfs2_sb_in(sb, bh->b_data);
168 brelse(bh);
169
170 error = gfs2_check_sb(sdp, sb, silent);
171 if (error)
172 goto out;
173
174 if (!proto[0])
175 proto = sb->sb_lockproto;
176 if (!table[0])
177 table = sb->sb_locktable;
178 }
179
180 if (!table[0])
181 table = sdp->sd_vfs->s_id;
182
183 snprintf(sdp->sd_proto_name, GFS2_FSNAME_LEN, "%s", proto);
184 snprintf(sdp->sd_table_name, GFS2_FSNAME_LEN, "%s", table);
185
186 out:
187 kfree(sb);
188
189 return error;
190}
191
/**
 * init_locking - start glock daemons and acquire the core glocks
 * @sdp: the in-core superblock
 * @mount_gh: filled in with the exclusive mount glock holder
 * @undo: if set, tear down everything a previous call set up
 *
 * Returns: errno
 */

static int init_locking(struct gfs2_sbd *sdp, struct gfs2_holder *mount_gh,
			int undo)
{
	struct task_struct *p;
	int error = 0;

	/* undo enters at the deepest label; the labels below fall through
	   so each one releases its step and continues with the earlier ones */
	if (undo)
		goto fail_trans;

	p = kthread_run(gfs2_scand, sdp, "gfs2_scand");
	error = IS_ERR(p);
	if (error) {
		fs_err(sdp, "can't start scand thread: %d\n", error);
		return error;
	}
	sdp->sd_scand_process = p;

	/* sd_glockd_num doubles as loop counter and record of how many
	   daemons actually started (used by the fail path) */
	for (sdp->sd_glockd_num = 0;
	     sdp->sd_glockd_num < sdp->sd_args.ar_num_glockd;
	     sdp->sd_glockd_num++) {
		p = kthread_run(gfs2_glockd, sdp, "gfs2_glockd");
		error = IS_ERR(p);
		if (error) {
			fs_err(sdp, "can't start glockd thread: %d\n", error);
			goto fail;
		}
		sdp->sd_glockd_process[sdp->sd_glockd_num] = p;
	}

	error = gfs2_glock_nq_num(sdp,
				  GFS2_MOUNT_LOCK, &gfs2_nondisk_glops,
				  LM_ST_EXCLUSIVE, LM_FLAG_NOEXP | GL_NOCACHE,
				  mount_gh);
	if (error) {
		fs_err(sdp, "can't acquire mount glock: %d\n", error);
		goto fail;
	}

	error = gfs2_glock_nq_num(sdp,
				  GFS2_LIVE_LOCK, &gfs2_nondisk_glops,
				  LM_ST_SHARED,
				  LM_FLAG_NOEXP | GL_EXACT | GL_NEVER_RECURSE,
				  &sdp->sd_live_gh);
	if (error) {
		fs_err(sdp, "can't acquire live glock: %d\n", error);
		goto fail_mount;
	}

	error = gfs2_glock_get(sdp, GFS2_RENAME_LOCK, &gfs2_nondisk_glops,
			       CREATE, &sdp->sd_rename_gl);
	if (error) {
		fs_err(sdp, "can't create rename glock: %d\n", error);
		goto fail_live;
	}

	error = gfs2_glock_get(sdp, GFS2_TRANS_LOCK, &gfs2_trans_glops,
			       CREATE, &sdp->sd_trans_gl);
	if (error) {
		fs_err(sdp, "can't create transaction glock: %d\n", error);
		goto fail_rename;
	}
	set_bit(GLF_STICKY, &sdp->sd_trans_gl->gl_flags);

	return 0;

 fail_trans:
	gfs2_glock_put(sdp->sd_trans_gl);

 fail_rename:
	gfs2_glock_put(sdp->sd_rename_gl);

 fail_live:
	gfs2_glock_dq_uninit(&sdp->sd_live_gh);

 fail_mount:
	gfs2_glock_dq_uninit(mount_gh);

 fail:
	while (sdp->sd_glockd_num--)
		kthread_stop(sdp->sd_glockd_process[sdp->sd_glockd_num]);

	kthread_stop(sdp->sd_scand_process);

	return error;
}
277
278int gfs2_lookup_root(struct gfs2_sbd *sdp)
279{
280 int error;
281 struct gfs2_glock *gl;
282 struct gfs2_inode *ip;
283
284 error = gfs2_glock_get(sdp, sdp->sd_sb.sb_root_dir.no_addr,
285 &gfs2_inode_glops, CREATE, &gl);
286 if (!error) {
287 error = gfs2_inode_get(gl, &sdp->sd_sb.sb_root_dir,
288 CREATE, &ip);
289 if (!error) {
290 if (!error)
291 gfs2_inode_min_init(ip, DT_DIR);
292 sdp->sd_root_dir = gfs2_ip2v(ip);
293 gfs2_inode_put(ip);
294 }
295 gfs2_glock_put(gl);
296 }
297
298 return error;
299}
300
/**
 * init_sb - read the on-disk superblock and set up the root dentry
 * @sdp: the in-core superblock
 * @silent: don't complain if it's not a GFS2 filesystem
 * @undo: if set, release what a previous call set up
 *
 * Returns: errno
 */

static int init_sb(struct gfs2_sbd *sdp, int silent, int undo)
{
	struct super_block *sb = sdp->sd_vfs;
	struct gfs2_holder sb_gh;
	struct inode *inode;
	int error = 0;

	if (undo) {
		iput(sdp->sd_master_dir);
		return 0;
	}

	error = gfs2_glock_nq_num(sdp,
				  GFS2_SB_LOCK, &gfs2_meta_glops,
				  LM_ST_SHARED, 0, &sb_gh);
	if (error) {
		fs_err(sdp, "can't acquire superblock glock: %d\n", error);
		return error;
	}

	error = gfs2_read_sb(sdp, sb_gh.gh_gl, silent);
	if (error) {
		fs_err(sdp, "can't read superblock: %d\n", error);
		goto out;
	}

	/* Set up the buffer cache and SB for real */
	error = -EINVAL;
	if (sdp->sd_sb.sb_bsize < bdev_hardsect_size(sb->s_bdev)) {
		fs_err(sdp, "FS block size (%u) is too small for device "
		       "block size (%u)\n",
		       sdp->sd_sb.sb_bsize, bdev_hardsect_size(sb->s_bdev));
		goto out;
	}
	if (sdp->sd_sb.sb_bsize > PAGE_SIZE) {
		fs_err(sdp, "FS block size (%u) is too big for machine "
		       "page size (%u)\n",
		       sdp->sd_sb.sb_bsize, (unsigned int)PAGE_SIZE);
		goto out;
	}

	/* Get rid of buffers from the original block size */
	sb_gh.gh_gl->gl_ops->go_inval(sb_gh.gh_gl, DIO_METADATA | DIO_DATA);
	sb_gh.gh_gl->gl_aspace->i_blkbits = sdp->sd_sb.sb_bsize_shift;

	sb_set_blocksize(sb, sdp->sd_sb.sb_bsize);

	/* Get the root inode */
	error = gfs2_lookup_root(sdp);
	if (error) {
		fs_err(sdp, "can't read in root inode: %d\n", error);
		goto out;
	}

	/* Get the root inode/dentry */
	inode = sdp->sd_root_dir;
	if (!inode) {
		fs_err(sdp, "can't get root inode\n");
		error = -ENOMEM;
		goto out_rooti;
	}

	/* NOTE(review): if d_alloc_root() fails, the reference taken by
	   igrab() appears to leak -- out_rooti only iputs sd_root_dir
	   once; verify against the inode refcounting rules */
	igrab(inode);
	sb->s_root = d_alloc_root(inode);
	if (!sb->s_root) {
		fs_err(sdp, "can't get root dentry\n");
		error = -ENOMEM;
		goto out_rooti;
	}

out:
	gfs2_glock_dq_uninit(&sb_gh);

	return error;
out_rooti:
	iput(sdp->sd_root_dir);
	goto out;
}
379
/**
 * init_journal - look up, validate and (if needed) recover journals
 * @sdp: the in-core superblock
 * @undo: if set, tear down what a previous call set up (reverse order)
 *
 * Spectator mounts read journal 0's description but take no journal
 * locks.  If this node is the first mounter, all journals are recovered;
 * otherwise only this node's own journal is.
 *
 * Returns: errno
 */

static int init_journal(struct gfs2_sbd *sdp, int undo)
{
	struct gfs2_holder ji_gh;
	struct task_struct *p;
	int jindex = 1;		/* tracks whether ji_gh is held for cleanup */
	int error = 0;

	if (undo) {
		jindex = 0;
		goto fail_recoverd;
	}

	error = gfs2_lookup_simple(sdp->sd_master_dir, "jindex",
				   &sdp->sd_jindex);
	if (error) {
		fs_err(sdp, "can't lookup journal index: %d\n", error);
		return error;
	}
	set_bit(GLF_STICKY, &get_v2ip(sdp->sd_jindex)->i_gl->gl_flags);

	/* Load in the journal index special file */

	error = gfs2_jindex_hold(sdp, &ji_gh);
	if (error) {
		fs_err(sdp, "can't read journal index: %d\n", error);
		goto fail;
	}

	error = -EINVAL;
	if (!gfs2_jindex_size(sdp)) {
		fs_err(sdp, "no journals!\n");
		goto fail_jindex;
	}

	if (sdp->sd_args.ar_spectator) {
		/* spectators read journal 0's geometry but lock nothing */
		sdp->sd_jdesc = gfs2_jdesc_find(sdp, 0);
		sdp->sd_log_blks_free = sdp->sd_jdesc->jd_blocks;
	} else {
		if (sdp->sd_lockstruct.ls_jid >= gfs2_jindex_size(sdp)) {
			fs_err(sdp, "can't mount journal #%u\n",
			       sdp->sd_lockstruct.ls_jid);
			fs_err(sdp, "there are only %u journals (0 - %u)\n",
			       gfs2_jindex_size(sdp),
			       gfs2_jindex_size(sdp) - 1);
			goto fail_jindex;
		}
		sdp->sd_jdesc = gfs2_jdesc_find(sdp, sdp->sd_lockstruct.ls_jid);

		error = gfs2_glock_nq_num(sdp,
					  sdp->sd_lockstruct.ls_jid,
					  &gfs2_journal_glops,
					  LM_ST_EXCLUSIVE, LM_FLAG_NOEXP,
					  &sdp->sd_journal_gh);
		if (error) {
			fs_err(sdp, "can't acquire journal glock: %d\n", error);
			goto fail_jindex;
		}

		error = gfs2_glock_nq_init(get_v2ip(sdp->sd_jdesc->jd_inode)->i_gl,
					   LM_ST_SHARED,
					   LM_FLAG_NOEXP | GL_EXACT,
					   &sdp->sd_jinode_gh);
		if (error) {
			fs_err(sdp, "can't acquire journal inode glock: %d\n",
			       error);
			goto fail_journal_gh;
		}

		error = gfs2_jdesc_check(sdp->sd_jdesc);
		if (error) {
			fs_err(sdp, "my journal (%u) is bad: %d\n",
			       sdp->sd_jdesc->jd_jid, error);
			goto fail_jinode_gh;
		}
		sdp->sd_log_blks_free = sdp->sd_jdesc->jd_blocks;
	}

	if (sdp->sd_lockstruct.ls_first) {
		/* first mounter recovers every journal before letting
		   other nodes in */
		unsigned int x;
		for (x = 0; x < sdp->sd_journals; x++) {
			error = gfs2_recover_journal(gfs2_jdesc_find(sdp, x),
						     WAIT);
			if (error) {
				fs_err(sdp, "error recovering journal %u: %d\n",
				       x, error);
				goto fail_jinode_gh;
			}
		}

		gfs2_lm_others_may_mount(sdp);
	} else if (!sdp->sd_args.ar_spectator) {
		error = gfs2_recover_journal(sdp->sd_jdesc, WAIT);
		if (error) {
			fs_err(sdp, "error recovering my journal: %d\n", error);
			goto fail_jinode_gh;
		}
	}

	set_bit(SDF_JOURNAL_CHECKED, &sdp->sd_flags);
	gfs2_glock_dq_uninit(&ji_gh);
	jindex = 0;

	/* Disown my Journal glock */

	sdp->sd_journal_gh.gh_owner = NULL;
	sdp->sd_jinode_gh.gh_owner = NULL;

	p = kthread_run(gfs2_recoverd, sdp, "gfs2_recoverd");
	error = IS_ERR(p);
	if (error) {
		fs_err(sdp, "can't start recoverd thread: %d\n", error);
		goto fail_jinode_gh;
	}
	sdp->sd_recoverd_process = p;

	return 0;

	/* error labels fall through, undoing one step each */
 fail_recoverd:
	kthread_stop(sdp->sd_recoverd_process);

 fail_jinode_gh:
	if (!sdp->sd_args.ar_spectator)
		gfs2_glock_dq_uninit(&sdp->sd_jinode_gh);

 fail_journal_gh:
	if (!sdp->sd_args.ar_spectator)
		gfs2_glock_dq_uninit(&sdp->sd_journal_gh);

 fail_jindex:
	gfs2_jindex_free(sdp);
	if (jindex)
		gfs2_glock_dq_uninit(&ji_gh);

 fail:
	iput(sdp->sd_jindex);

	return error;
}
518
519
/*
 * init_inodes - look up the master directory and the system inodes it
 * contains (inum, statfs, rindex, quota), and initialize the journals.
 * With @undo set, release them all in reverse order.
 */
static int init_inodes(struct gfs2_sbd *sdp, int undo)
{
	int error = 0;

	if (undo)
		goto fail_qinode;

	error = gfs2_lookup_master_dir(sdp);
	if (error) {
		fs_err(sdp, "can't read in master directory: %d\n", error);
		goto fail;
	}

	error = init_journal(sdp, undo);
	if (error)
		goto fail_master;

	/* Read in the master inode number inode */
	error = gfs2_lookup_simple(sdp->sd_master_dir, "inum",
				   &sdp->sd_inum_inode);
	if (error) {
		fs_err(sdp, "can't read in inum inode: %d\n", error);
		goto fail_journal;
	}


	/* Read in the master statfs inode */
	error = gfs2_lookup_simple(sdp->sd_master_dir, "statfs",
				   &sdp->sd_statfs_inode);
	if (error) {
		fs_err(sdp, "can't read in statfs inode: %d\n", error);
		goto fail_inum;
	}

	/* Read in the resource index inode */
	error = gfs2_lookup_simple(sdp->sd_master_dir, "rindex",
				   &sdp->sd_rindex);
	if (error) {
		fs_err(sdp, "can't get resource index inode: %d\n", error);
		goto fail_statfs;
	}
	set_bit(GLF_STICKY, &get_v2ip(sdp->sd_rindex)->i_gl->gl_flags);
	/* start one behind the glock version number so the first access
	   sees the cached rindex as stale and rereads it */
	sdp->sd_rindex_vn = get_v2ip(sdp->sd_rindex)->i_gl->gl_vn - 1;

	/* Read in the quota inode */
	error = gfs2_lookup_simple(sdp->sd_master_dir, "quota",
				   &sdp->sd_quota_inode);
	if (error) {
		fs_err(sdp, "can't get quota file inode: %d\n", error);
		goto fail_rindex;
	}
	return 0;

	/* error labels fall through, undoing one step each */
fail_qinode:
	iput(sdp->sd_quota_inode);

fail_rindex:
	gfs2_clear_rgrpd(sdp);
	iput(sdp->sd_rindex);

fail_statfs:
	iput(sdp->sd_statfs_inode);

fail_inum:
	iput(sdp->sd_inum_inode);
fail_journal:
	init_journal(sdp, UNDO);
fail_master:
	iput(sdp->sd_master_dir);
fail:
	return error;
}
592
/*
 * init_per_node - look up and lock this node's per-journal system files
 * (inum_range, statfs_change, unlinked_tag, quota_change), each held
 * exclusively for the lifetime of the mount.  Spectator mounts skip
 * this entirely.  With @undo set, everything is released in reverse.
 */
static int init_per_node(struct gfs2_sbd *sdp, int undo)
{
	struct inode *pn = NULL;
	char buf[30];	/* "statfs_change" + jid fits comfortably */
	int error = 0;

	if (sdp->sd_args.ar_spectator)
		return 0;

	if (undo)
		goto fail_qc_gh;

	error = gfs2_lookup_simple(sdp->sd_master_dir, "per_node", &pn);
	if (error) {
		fs_err(sdp, "can't find per_node directory: %d\n", error);
		return error;
	}

	sprintf(buf, "inum_range%u", sdp->sd_jdesc->jd_jid);
	error = gfs2_lookup_simple(pn, buf, &sdp->sd_ir_inode);
	if (error) {
		fs_err(sdp, "can't find local \"ir\" file: %d\n", error);
		goto fail;
	}

	sprintf(buf, "statfs_change%u", sdp->sd_jdesc->jd_jid);
	error = gfs2_lookup_simple(pn, buf, &sdp->sd_sc_inode);
	if (error) {
		fs_err(sdp, "can't find local \"sc\" file: %d\n", error);
		goto fail_ir_i;
	}

	sprintf(buf, "unlinked_tag%u", sdp->sd_jdesc->jd_jid);
	error = gfs2_lookup_simple(pn, buf, &sdp->sd_ut_inode);
	if (error) {
		fs_err(sdp, "can't find local \"ut\" file: %d\n", error);
		goto fail_sc_i;
	}

	sprintf(buf, "quota_change%u", sdp->sd_jdesc->jd_jid);
	error = gfs2_lookup_simple(pn, buf, &sdp->sd_qc_inode);
	if (error) {
		fs_err(sdp, "can't find local \"qc\" file: %d\n", error);
		goto fail_ut_i;
	}

	/* done with the per_node directory itself */
	iput(pn);
	pn = NULL;

	error = gfs2_glock_nq_init(get_v2ip(sdp->sd_ir_inode)->i_gl,
				   LM_ST_EXCLUSIVE, GL_NEVER_RECURSE,
				   &sdp->sd_ir_gh);
	if (error) {
		fs_err(sdp, "can't lock local \"ir\" file: %d\n", error);
		goto fail_qc_i;
	}

	error = gfs2_glock_nq_init(get_v2ip(sdp->sd_sc_inode)->i_gl,
				   LM_ST_EXCLUSIVE, GL_NEVER_RECURSE,
				   &sdp->sd_sc_gh);
	if (error) {
		fs_err(sdp, "can't lock local \"sc\" file: %d\n", error);
		goto fail_ir_gh;
	}

	error = gfs2_glock_nq_init(get_v2ip(sdp->sd_ut_inode)->i_gl,
				   LM_ST_EXCLUSIVE, GL_NEVER_RECURSE,
				   &sdp->sd_ut_gh);
	if (error) {
		fs_err(sdp, "can't lock local \"ut\" file: %d\n", error);
		goto fail_sc_gh;
	}

	error = gfs2_glock_nq_init(get_v2ip(sdp->sd_qc_inode)->i_gl,
				   LM_ST_EXCLUSIVE, GL_NEVER_RECURSE,
				   &sdp->sd_qc_gh);
	if (error) {
		fs_err(sdp, "can't lock local \"qc\" file: %d\n", error);
		goto fail_ut_gh;
	}

	return 0;

	/* error labels fall through, undoing one step each */
 fail_qc_gh:
	gfs2_glock_dq_uninit(&sdp->sd_qc_gh);

 fail_ut_gh:
	gfs2_glock_dq_uninit(&sdp->sd_ut_gh);

 fail_sc_gh:
	gfs2_glock_dq_uninit(&sdp->sd_sc_gh);

 fail_ir_gh:
	gfs2_glock_dq_uninit(&sdp->sd_ir_gh);

 fail_qc_i:
	iput(sdp->sd_qc_inode);

 fail_ut_i:
	iput(sdp->sd_ut_inode);

 fail_sc_i:
	iput(sdp->sd_sc_inode);

 fail_ir_i:
	iput(sdp->sd_ir_inode);

 fail:
	if (pn)
		iput(pn);
	return error;
}
705
/*
 * init_threads - start the logd, quotad and inoded daemons.
 * With @undo set, stop all three in reverse order.
 */
static int init_threads(struct gfs2_sbd *sdp, int undo)
{
	struct task_struct *p;
	int error = 0;

	if (undo)
		goto fail_inoded;

	/* baseline timestamps for the daemons' periodic work */
	sdp->sd_log_flush_time = jiffies;
	sdp->sd_jindex_refresh_time = jiffies;

	p = kthread_run(gfs2_logd, sdp, "gfs2_logd");
	error = IS_ERR(p);
	if (error) {
		fs_err(sdp, "can't start logd thread: %d\n", error);
		return error;
	}
	sdp->sd_logd_process = p;

	sdp->sd_statfs_sync_time = jiffies;
	sdp->sd_quota_sync_time = jiffies;

	p = kthread_run(gfs2_quotad, sdp, "gfs2_quotad");
	error = IS_ERR(p);
	if (error) {
		fs_err(sdp, "can't start quotad thread: %d\n", error);
		goto fail;
	}
	sdp->sd_quotad_process = p;

	p = kthread_run(gfs2_inoded, sdp, "gfs2_inoded");
	error = IS_ERR(p);
	if (error) {
		fs_err(sdp, "can't start inoded thread: %d\n", error);
		goto fail_quotad;
	}
	sdp->sd_inoded_process = p;

	return 0;

	/* error labels fall through, stopping one daemon each */
 fail_inoded:
	kthread_stop(sdp->sd_inoded_process);

 fail_quotad:
	kthread_stop(sdp->sd_quotad_process);

 fail:
	kthread_stop(sdp->sd_logd_process);

	return error;
}
757
/**
 * fill_super - Read in superblock
 * @sb: The VFS superblock
 * @data: Mount options
 * @silent: Don't complain if it's not a GFS2 filesystem
 *
 * Drives the whole mount sequence: allocate the in-core sb, parse
 * options, bring up locking, read the superblock, look up system
 * inodes, start daemons, and finally go read-write if requested.
 * Each init_*() helper is re-entered with UNDO on the failure path.
 *
 * Returns: errno
 */

static int fill_super(struct super_block *sb, void *data, int silent)
{
	struct gfs2_sbd *sdp;
	struct gfs2_holder mount_gh;
	int error;

	sdp = init_sbd(sb);
	if (!sdp) {
		printk("GFS2: can't alloc struct gfs2_sbd\n");
		return -ENOMEM;
	}

	error = gfs2_mount_args(sdp, (char *)data, 0);
	if (error) {
		printk("GFS2: can't parse mount arguments\n");
		goto fail;
	}

	init_vfs(sdp);

	error = init_names(sdp, silent);
	if (error)
		goto fail;

	error = gfs2_sys_fs_add(sdp);
	if (error)
		goto fail;

	error = gfs2_lm_mount(sdp, silent);
	if (error)
		goto fail_sys;

	error = init_locking(sdp, &mount_gh, DO);
	if (error)
		goto fail_lm;

	error = init_sb(sdp, silent, DO);
	if (error)
		goto fail_locking;

	error = init_inodes(sdp, DO);
	if (error)
		goto fail_sb;

	error = init_per_node(sdp, DO);
	if (error)
		goto fail_inodes;

	error = gfs2_statfs_init(sdp);
	if (error) {
		fs_err(sdp, "can't initialize statfs subsystem: %d\n", error);
		goto fail_per_node;
	}

	error = init_threads(sdp, DO);
	if (error)
		goto fail_per_node;

	if (!(sb->s_flags & MS_RDONLY)) {
		error = gfs2_make_fs_rw(sdp);
		if (error) {
			fs_err(sdp, "can't make FS RW: %d\n", error);
			goto fail_threads;
		}
	}

	/* the exclusive mount glock is only needed during the mount */
	gfs2_glock_dq_uninit(&mount_gh);

	return 0;

	/* unwind in strict reverse order of the setup above */
 fail_threads:
	init_threads(sdp, UNDO);

 fail_per_node:
	init_per_node(sdp, UNDO);

 fail_inodes:
	init_inodes(sdp, UNDO);

 fail_sb:
	init_sb(sdp, 0, UNDO);

 fail_locking:
	init_locking(sdp, &mount_gh, UNDO);

 fail_lm:
	gfs2_gl_hash_clear(sdp, WAIT);
	gfs2_lm_unmount(sdp);
	while (invalidate_inodes(sb))
		yield();

 fail_sys:
	gfs2_sys_fs_del(sdp);

 fail:
	vfree(sdp);
	set_v2sdp(sb, NULL);

	return error;
}
867
/*
 * gfs2_get_sb - block-device-backed mount entry point; delegates to the
 * generic helper, which calls fill_super() to do the real work
 */
static struct super_block *gfs2_get_sb(struct file_system_type *fs_type,
				       int flags, const char *dev_name,
				       void *data)
{
	return get_sb_bdev(fs_type, flags, dev_name, data, fill_super);
}
874
/* The gfs2 filesystem type: requires a block device; mounts go through
   gfs2_get_sb() and are torn down by the generic kill_block_super() */
struct file_system_type gfs2_fs_type = {
	.name = "gfs2",
	.fs_flags = FS_REQUIRES_DEV,
	.get_sb = gfs2_get_sb,
	.kill_sb = kill_block_super,
	.owner = THIS_MODULE,
};
882
diff --git a/fs/gfs2/ops_fstype.h b/fs/gfs2/ops_fstype.h
new file mode 100644
index 000000000000..7008364e76ea
--- /dev/null
+++ b/fs/gfs2/ops_fstype.h
@@ -0,0 +1,15 @@
/*
 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
 *
 * This copyrighted material is made available to anyone wishing to use,
 * modify, copy, or redistribute it subject to the terms and conditions
 * of the GNU General Public License v.2.
 */

#ifndef __OPS_FSTYPE_DOT_H__
#define __OPS_FSTYPE_DOT_H__

/* the gfs2 filesystem type, defined in ops_fstype.c; presumably
   registered/unregistered by the module init/exit code */
extern struct file_system_type gfs2_fs_type;

#endif /* __OPS_FSTYPE_DOT_H__ */
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
new file mode 100644
index 000000000000..9fb9490eb67a
--- /dev/null
+++ b/fs/gfs2/ops_inode.c
@@ -0,0 +1,1234 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/namei.h>
16#include <linux/utsname.h>
17#include <linux/mm.h>
18#include <linux/xattr.h>
19#include <linux/posix_acl.h>
20#include <asm/semaphore.h>
21#include <asm/uaccess.h>
22
23#include "gfs2.h"
24#include "acl.h"
25#include "bmap.h"
26#include "dir.h"
27#include "eaops.h"
28#include "eattr.h"
29#include "glock.h"
30#include "inode.h"
31#include "meta_io.h"
32#include "ops_dentry.h"
33#include "ops_inode.h"
34#include "page.h"
35#include "quota.h"
36#include "rgrp.h"
37#include "trans.h"
38#include "unlinked.h"
39
40/**
41 * gfs2_create - Create a file
42 * @dir: The directory in which to create the file
43 * @dentry: The dentry of the new file
44 * @mode: The mode of the new file
45 *
46 * Returns: errno
47 */
48
49static int gfs2_create(struct inode *dir, struct dentry *dentry,
50 int mode, struct nameidata *nd)
51{
52 struct gfs2_inode *dip = get_v2ip(dir);
53 struct gfs2_sbd *sdp = dip->i_sbd;
54 struct gfs2_holder ghs[2];
55 struct inode *inode;
56 int new = 1;
57 int error;
58
59 atomic_inc(&sdp->sd_ops_inode);
60
61 gfs2_holder_init(dip->i_gl, 0, 0, ghs);
62
63 for (;;) {
64 inode = gfs2_createi(ghs, &dentry->d_name, S_IFREG | mode);
65 if (!IS_ERR(inode)) {
66 gfs2_trans_end(sdp);
67 if (dip->i_alloc.al_rgd)
68 gfs2_inplace_release(dip);
69 gfs2_quota_unlock(dip);
70 gfs2_alloc_put(dip);
71 gfs2_glock_dq_uninit_m(2, ghs);
72 break;
73 } else if (PTR_ERR(inode) != -EEXIST ||
74 (nd->intent.open.flags & O_EXCL)) {
75 gfs2_holder_uninit(ghs);
76 return PTR_ERR(inode);
77 }
78
79 error = gfs2_lookupi(dir, &dentry->d_name, 0, &inode);
80 if (!error) {
81 new = 0;
82 gfs2_holder_uninit(ghs);
83 break;
84 } else if (error != -ENOENT) {
85 gfs2_holder_uninit(ghs);
86 return error;
87 }
88 }
89
90 d_instantiate(dentry, inode);
91 if (new)
92 mark_inode_dirty(inode);
93
94 return 0;
95}
96
97/**
98 * gfs2_lookup - Look up a filename in a directory and return its inode
99 * @dir: The directory inode
100 * @dentry: The dentry of the new inode
101 * @nd: passed from Linux VFS, ignored by us
102 *
103 * Called by the VFS layer. Lock dir and call gfs2_lookupi()
104 *
105 * Returns: errno
106 */
107
108static struct dentry *gfs2_lookup(struct inode *dir, struct dentry *dentry,
109 struct nameidata *nd)
110{
111 struct gfs2_inode *dip = get_v2ip(dir);
112 struct gfs2_sbd *sdp = dip->i_sbd;
113 struct inode *inode = NULL;
114 int error;
115
116 atomic_inc(&sdp->sd_ops_inode);
117
118 if (!sdp->sd_args.ar_localcaching)
119 dentry->d_op = &gfs2_dops;
120
121 error = gfs2_lookupi(dir, &dentry->d_name, 0, &inode);
122 if (error && error != -ENOENT)
123 return ERR_PTR(error);
124
125 if (inode)
126 return d_splice_alias(inode, dentry);
127 d_add(dentry, inode);
128
129 return NULL;
130}
131
/**
 * gfs2_link - Link to a file
 * @old_dentry: The inode to link
 * @dir: Add link to this directory
 * @dentry: The name of the link
 *
 * Link the inode in "old_dentry" into the directory "dir" with the
 * name in "dentry".
 *
 * Both the directory glock and the target inode glock are acquired
 * exclusively as a batch via gfs2_glock_nq_m() (which handles the
 * ordering, avoiding ABBA deadlocks) and held across the directory
 * add and the nlink bump.
 *
 * Returns: errno
 */

static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
		     struct dentry *dentry)
{
	struct gfs2_inode *dip = get_v2ip(dir);
	struct gfs2_sbd *sdp = dip->i_sbd;
	struct inode *inode = old_dentry->d_inode;
	struct gfs2_inode *ip = get_v2ip(inode);
	struct gfs2_holder ghs[2];
	int alloc_required;
	int error;

	atomic_inc(&sdp->sd_ops_inode);

	/* Hard links to directories are not allowed */
	if (S_ISDIR(ip->i_di.di_mode))
		return -EPERM;

	gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1);

	error = gfs2_glock_nq_m(2, ghs);
	if (error)
		goto out;

	error = gfs2_repermission(dir, MAY_WRITE | MAY_EXEC, NULL);
	if (error)
		goto out_gunlock;

	/* The new name must not already exist in the directory */
	error = gfs2_dir_search(dip, &dentry->d_name, NULL, NULL);
	switch (error) {
	case -ENOENT:
		break;
	case 0:
		error = -EEXIST;
		/* fall through */
	default:
		goto out_gunlock;
	}

	error = -EINVAL;
	if (!dip->i_di.di_nlink)	/* directory was already unlinked */
		goto out_gunlock;
	error = -EFBIG;
	if (dip->i_di.di_entries == (uint32_t)-1)	/* entry count would wrap */
		goto out_gunlock;
	error = -EPERM;
	if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
		goto out_gunlock;
	error = -EINVAL;
	if (!ip->i_di.di_nlink)		/* target was already unlinked */
		goto out_gunlock;
	error = -EMLINK;
	if (ip->i_di.di_nlink == (uint32_t)-1)	/* link count would wrap */
		goto out_gunlock;

	error = gfs2_diradd_alloc_required(dip, &dentry->d_name,
					   &alloc_required);
	if (error)
		goto out_gunlock;

	if (alloc_required) {
		/* Adding the entry needs new directory blocks: take quota
		   and an in-place reservation before the transaction, and
		   size the transaction to cover the rgrp bitmaps too. */
		struct gfs2_alloc *al = gfs2_alloc_get(dip);

		error = gfs2_quota_lock(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
		if (error)
			goto out_alloc;

		error = gfs2_quota_check(dip, dip->i_di.di_uid,
					 dip->i_di.di_gid);
		if (error)
			goto out_gunlock_q;

		al->al_requested = sdp->sd_max_dirres;

		error = gfs2_inplace_reserve(dip);
		if (error)
			goto out_gunlock_q;

		error = gfs2_trans_begin(sdp,
					 sdp->sd_max_dirres +
					 al->al_rgd->rd_ri.ri_length +
					 2 * RES_DINODE + RES_STATFS +
					 RES_QUOTA, 0);
		if (error)
			goto out_ipres;
	} else {
		error = gfs2_trans_begin(sdp, 2 * RES_DINODE + RES_LEAF, 0);
		if (error)
			goto out_ipres;
	}

	error = gfs2_dir_add(dip, &dentry->d_name, &ip->i_num,
			     IF2DT(ip->i_di.di_mode));
	if (error)
		goto out_end_trans;

	error = gfs2_change_nlink(ip, +1);

	/* Unwind in strict reverse order of acquisition; the labels below
	   the failure point are no-ops thanks to the alloc_required guards. */
 out_end_trans:
	gfs2_trans_end(sdp);

 out_ipres:
	if (alloc_required)
		gfs2_inplace_release(dip);

 out_gunlock_q:
	if (alloc_required)
		gfs2_quota_unlock(dip);

 out_alloc:
	if (alloc_required)
		gfs2_alloc_put(dip);

 out_gunlock:
	gfs2_glock_dq_m(2, ghs);

 out:
	gfs2_holder_uninit(ghs);
	gfs2_holder_uninit(ghs + 1);

	if (!error) {
		atomic_inc(&inode->i_count);
		d_instantiate(dentry, inode);
		mark_inode_dirty(inode);
	}

	return error;
}
270
/**
 * gfs2_unlink - Unlink a file
 * @dir: The inode of the directory containing the file to unlink
 * @dentry: The file itself
 *
 * Unlink a file.  Takes the directory and inode glocks exclusively
 * (as a batch, so ordering is handled for us), then calls
 * gfs2_unlinki() inside a transaction.  The gfs2_unlinked structure
 * tracks the inode for later deallocation once the last user is gone.
 *
 * Returns: errno
 */

static int gfs2_unlink(struct inode *dir, struct dentry *dentry)
{
	struct gfs2_inode *dip = get_v2ip(dir);
	struct gfs2_sbd *sdp = dip->i_sbd;
	struct gfs2_inode *ip = get_v2ip(dentry->d_inode);
	struct gfs2_unlinked *ul;
	struct gfs2_holder ghs[2];
	int error;

	atomic_inc(&sdp->sd_ops_inode);

	/* Reserve an unlinked-inode slot before taking any glocks */
	error = gfs2_unlinked_get(sdp, &ul);
	if (error)
		return error;

	gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1);

	error = gfs2_glock_nq_m(2, ghs);
	if (error)
		goto out;

	/* Permission / sticky-bit / immutability checks */
	error = gfs2_unlink_ok(dip, &dentry->d_name, ip);
	if (error)
		goto out_gunlock;

	error = gfs2_trans_begin(sdp, 2 * RES_DINODE + RES_LEAF +
				 RES_UNLINKED, 0);
	if (error)
		goto out_gunlock;

	error = gfs2_unlinki(dip, &dentry->d_name, ip,ul);

	gfs2_trans_end(sdp);

 out_gunlock:
	gfs2_glock_dq_m(2, ghs);

 out:
	gfs2_holder_uninit(ghs);
	gfs2_holder_uninit(ghs + 1);

	gfs2_unlinked_put(sdp, ul);

	return error;
}
327
/**
 * gfs2_symlink - Create a symlink
 * @dir: The directory to create the symlink in
 * @dentry: The dentry to put the symlink in
 * @symname: The thing which the link points to
 *
 * The target string is stored "stuffed" directly in the dinode block,
 * so its length is limited to one filesystem block minus the dinode
 * header (minus one byte for the NUL terminator).
 *
 * Returns: errno
 */

static int gfs2_symlink(struct inode *dir, struct dentry *dentry,
			const char *symname)
{
	struct gfs2_inode *dip = get_v2ip(dir), *ip;
	struct gfs2_sbd *sdp = dip->i_sbd;
	struct gfs2_holder ghs[2];
	struct inode *inode;
	struct buffer_head *dibh;
	int size;
	int error;

	atomic_inc(&sdp->sd_ops_inode);

	/* Must be stuffed with a null terminator for gfs2_follow_link() */
	size = strlen(symname);
	if (size > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode) - 1)
		return -ENAMETOOLONG;

	gfs2_holder_init(dip->i_gl, 0, 0, ghs);

	/* On success, gfs2_createi() leaves a transaction open and holders
	   on the directory (ghs[0]) and new inode (ghs[1]) glocks. */
	inode = gfs2_createi(ghs, &dentry->d_name, S_IFLNK | S_IRWXUGO);
	if (IS_ERR(inode)) {
		gfs2_holder_uninit(ghs);
		return PTR_ERR(inode);
	}

	ip = get_gl2ip(ghs[1].gh_gl);

	ip->i_di.di_size = size;

	error = gfs2_meta_inode_buffer(ip, &dibh);

	/* A read failure here withdraws the fs; otherwise write the
	   target string immediately after the on-disk dinode header. */
	if (!gfs2_assert_withdraw(sdp, !error)) {
		gfs2_dinode_out(&ip->i_di, dibh->b_data);
		memcpy(dibh->b_data + sizeof(struct gfs2_dinode), symname,
		       size);
		brelse(dibh);
	}

	/* Wind down the state gfs2_createi() left open */
	gfs2_trans_end(sdp);
	if (dip->i_alloc.al_rgd)
		gfs2_inplace_release(dip);
	gfs2_quota_unlock(dip);
	gfs2_alloc_put(dip);

	gfs2_glock_dq_uninit_m(2, ghs);

	d_instantiate(dentry, inode);
	mark_inode_dirty(inode);

	return 0;
}
389
/**
 * gfs2_mkdir - Make a directory
 * @dir: The parent directory of the new one
 * @dentry: The dentry of the new directory
 * @mode: The mode of the new directory
 *
 * Creates the inode via gfs2_createi() and then hand-builds the "."
 * and ".." entries directly in the stuffed dinode block.
 *
 * Returns: errno
 */

static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode)
{
	struct gfs2_inode *dip = get_v2ip(dir), *ip;
	struct gfs2_sbd *sdp = dip->i_sbd;
	struct gfs2_holder ghs[2];
	struct inode *inode;
	struct buffer_head *dibh;
	int error;

	atomic_inc(&sdp->sd_ops_inode);

	gfs2_holder_init(dip->i_gl, 0, 0, ghs);

	/* On success, gfs2_createi() leaves a transaction open and holders
	   on the directory (ghs[0]) and new inode (ghs[1]) glocks. */
	inode = gfs2_createi(ghs, &dentry->d_name, S_IFDIR | mode);
	if (IS_ERR(inode)) {
		gfs2_holder_uninit(ghs);
		return PTR_ERR(inode);
	}

	ip = get_gl2ip(ghs[1].gh_gl);

	ip->i_di.di_nlink = 2;		/* "." and the parent's entry */
	ip->i_di.di_size = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode);
	ip->i_di.di_flags |= GFS2_DIF_JDATA;	/* dir data is journaled */
	ip->i_di.di_payload_format = GFS2_FORMAT_DE;
	ip->i_di.di_entries = 2;	/* "." and ".." */

	error = gfs2_meta_inode_buffer(ip, &dibh);

	/* A read failure withdraws the fs; otherwise build the two
	   initial dirents directly in the stuffed dinode block. */
	if (!gfs2_assert_withdraw(sdp, !error)) {
		struct gfs2_dinode *di = (struct gfs2_dinode *)dibh->b_data;
		struct gfs2_dirent *dent;

		gfs2_dirent_alloc(ip, dibh, 1, &dent);

		dent->de_inum = di->di_num; /* already GFS2 endian */
		dent->de_hash = gfs2_disk_hash(".", 1);
		dent->de_hash = cpu_to_be32(dent->de_hash);
		dent->de_type = DT_DIR;
		memcpy((char *) (dent + 1), ".", 1);
		di->di_entries = cpu_to_be32(1);

		gfs2_dirent_alloc(ip, dibh, 2, &dent);

		gfs2_inum_out(&dip->i_num, (char *) &dent->de_inum);
		dent->de_hash = gfs2_disk_hash("..", 2);
		dent->de_hash = cpu_to_be32(dent->de_hash);
		dent->de_type = DT_DIR;
		memcpy((char *) (dent + 1), "..", 2);

		/* Write the in-core dinode (di_entries = 2) over the top */
		gfs2_dinode_out(&ip->i_di, (char *)di);

		brelse(dibh);
	}

	/* The new ".." entry adds a link to the parent */
	error = gfs2_change_nlink(dip, +1);
	gfs2_assert_withdraw(sdp, !error); /* dip already pinned */

	/* Wind down the state gfs2_createi() left open */
	gfs2_trans_end(sdp);
	if (dip->i_alloc.al_rgd)
		gfs2_inplace_release(dip);
	gfs2_quota_unlock(dip);
	gfs2_alloc_put(dip);

	gfs2_glock_dq_uninit_m(2, ghs);

	d_instantiate(dentry, inode);
	mark_inode_dirty(inode);

	return 0;
}
470
/**
 * gfs2_rmdir - Remove a directory
 * @dir: The parent directory of the directory to be removed
 * @dentry: The dentry of the directory to remove
 *
 * Remove a directory.  Verifies the directory holds exactly its two
 * mandatory entries ("." and ".."), then calls gfs2_rmdiri() inside a
 * transaction.
 *
 * Returns: errno
 */

static int gfs2_rmdir(struct inode *dir, struct dentry *dentry)
{
	struct gfs2_inode *dip = get_v2ip(dir);
	struct gfs2_sbd *sdp = dip->i_sbd;
	struct gfs2_inode *ip = get_v2ip(dentry->d_inode);
	struct gfs2_unlinked *ul;
	struct gfs2_holder ghs[2];
	int error;

	atomic_inc(&sdp->sd_ops_inode);

	/* Reserve an unlinked-inode slot before taking any glocks */
	error = gfs2_unlinked_get(sdp, &ul);
	if (error)
		return error;

	gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1);

	error = gfs2_glock_nq_m(2, ghs);
	if (error)
		goto out;

	error = gfs2_unlink_ok(dip, &dentry->d_name, ip);
	if (error)
		goto out_gunlock;

	/* Fewer than 2 entries means "." or ".." is missing: on-disk
	   corruption.  More than 2 means the directory isn't empty. */
	if (ip->i_di.di_entries < 2) {
		if (gfs2_consist_inode(ip))
			gfs2_dinode_print(&ip->i_di);
		error = -EIO;
		goto out_gunlock;
	}
	if (ip->i_di.di_entries > 2) {
		error = -ENOTEMPTY;
		goto out_gunlock;
	}

	error = gfs2_trans_begin(sdp, 2 * RES_DINODE + 3 * RES_LEAF +
				 RES_UNLINKED, 0);
	if (error)
		goto out_gunlock;

	error = gfs2_rmdiri(dip, &dentry->d_name, ip, ul);

	gfs2_trans_end(sdp);

 out_gunlock:
	gfs2_glock_dq_m(2, ghs);

 out:
	gfs2_holder_uninit(ghs);
	gfs2_holder_uninit(ghs + 1);

	gfs2_unlinked_put(sdp, ul);

	return error;
}
538
539/**
540 * gfs2_mknod - Make a special file
541 * @dir: The directory in which the special file will reside
542 * @dentry: The dentry of the special file
543 * @mode: The mode of the special file
544 * @rdev: The device specification of the special file
545 *
546 */
547
548static int gfs2_mknod(struct inode *dir, struct dentry *dentry, int mode,
549 dev_t dev)
550{
551 struct gfs2_inode *dip = get_v2ip(dir), *ip;
552 struct gfs2_sbd *sdp = dip->i_sbd;
553 struct gfs2_holder ghs[2];
554 struct inode *inode;
555 struct buffer_head *dibh;
556 uint32_t major = 0, minor = 0;
557 int error;
558
559 atomic_inc(&sdp->sd_ops_inode);
560
561 switch (mode & S_IFMT) {
562 case S_IFBLK:
563 case S_IFCHR:
564 major = MAJOR(dev);
565 minor = MINOR(dev);
566 break;
567 case S_IFIFO:
568 case S_IFSOCK:
569 break;
570 default:
571 return -EOPNOTSUPP;
572 };
573
574 gfs2_holder_init(dip->i_gl, 0, 0, ghs);
575
576 inode = gfs2_createi(ghs, &dentry->d_name, mode);
577 if (IS_ERR(inode)) {
578 gfs2_holder_uninit(ghs);
579 return PTR_ERR(inode);
580 }
581
582 ip = get_gl2ip(ghs[1].gh_gl);
583
584 ip->i_di.di_major = major;
585 ip->i_di.di_minor = minor;
586
587 error = gfs2_meta_inode_buffer(ip, &dibh);
588
589 if (!gfs2_assert_withdraw(sdp, !error)) {
590 gfs2_dinode_out(&ip->i_di, dibh->b_data);
591 brelse(dibh);
592 }
593
594 gfs2_trans_end(sdp);
595 if (dip->i_alloc.al_rgd)
596 gfs2_inplace_release(dip);
597 gfs2_quota_unlock(dip);
598 gfs2_alloc_put(dip);
599
600 gfs2_glock_dq_uninit_m(2, ghs);
601
602 d_instantiate(dentry, inode);
603 mark_inode_dirty(inode);
604
605 return 0;
606}
607
/**
 * gfs2_rename - Rename a file
 * @odir: Parent directory of old file name
 * @odentry: The old dentry of the file
 * @ndir: Parent directory of new file name
 * @ndentry: The new dentry of the file
 *
 * Up to four inode glocks (old dir, new dir, victim, target) are taken
 * exclusively as a batch via gfs2_glock_nq_m(); for a cross-directory
 * rename of a directory, the filesystem-wide sd_rename_gl is taken
 * first to serialize against concurrent directory moves.
 *
 * Returns: errno
 */

static int gfs2_rename(struct inode *odir, struct dentry *odentry,
		       struct inode *ndir, struct dentry *ndentry)
{
	struct gfs2_inode *odip = get_v2ip(odir);
	struct gfs2_inode *ndip = get_v2ip(ndir);
	struct gfs2_inode *ip = get_v2ip(odentry->d_inode);
	struct gfs2_inode *nip = NULL;
	struct gfs2_sbd *sdp = odip->i_sbd;
	struct gfs2_unlinked *ul;
	struct gfs2_holder ghs[4], r_gh;
	unsigned int num_gh;
	int dir_rename = 0;
	int alloc_required;
	unsigned int x;
	int error;

	atomic_inc(&sdp->sd_ops_inode);

	/* Renaming something onto itself is a no-op */
	if (ndentry->d_inode) {
		nip = get_v2ip(ndentry->d_inode);
		if (ip == nip)
			return 0;
	}

	error = gfs2_unlinked_get(sdp, &ul);
	if (error)
		return error;

	/* Make sure we aren't trying to move a directory into its own
	   subdirectory */

	if (S_ISDIR(ip->i_di.di_mode) && odip != ndip) {
		dir_rename = 1;

		/* Serialize all cross-directory dir renames so the
		   ancestry walk in gfs2_ok_to_move() stays valid */
		error = gfs2_glock_nq_init(sdp->sd_rename_gl,
					   LM_ST_EXCLUSIVE, 0,
					   &r_gh);
		if (error)
			goto out;

		error = gfs2_ok_to_move(ip, ndip);
		if (error)
			goto out_gunlock_r;
	}

	gfs2_holder_init(odip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
	gfs2_holder_init(ndip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1);
	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 2);
	num_gh = 3;

	/* Only lock the target inode if the new name already exists */
	if (nip)
		gfs2_holder_init(nip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh++);

	error = gfs2_glock_nq_m(num_gh, ghs);
	if (error)
		goto out_uninit;

	/* Check out the old directory */

	error = gfs2_unlink_ok(odip, &odentry->d_name, ip);
	if (error)
		goto out_gunlock;

	/* Check out the new directory */

	if (nip) {
		error = gfs2_unlink_ok(ndip, &ndentry->d_name, nip);
		if (error)
			goto out_gunlock;

		/* An existing target directory must be empty; fewer than
		   2 entries means "."/".." is missing (corruption). */
		if (S_ISDIR(nip->i_di.di_mode)) {
			if (nip->i_di.di_entries < 2) {
				if (gfs2_consist_inode(nip))
					gfs2_dinode_print(&nip->i_di);
				error = -EIO;
				goto out_gunlock;
			}
			if (nip->i_di.di_entries > 2) {
				error = -ENOTEMPTY;
				goto out_gunlock;
			}
		}
	} else {
		error = gfs2_repermission(ndir, MAY_WRITE | MAY_EXEC, NULL);
		if (error)
			goto out_gunlock;

		error = gfs2_dir_search(ndip, &ndentry->d_name, NULL, NULL);
		switch (error) {
		case -ENOENT:
			error = 0;
			break;
		case 0:
			error = -EEXIST;
			/* fall through */
		default:
			goto out_gunlock;
		};

		if (odip != ndip) {
			if (!ndip->i_di.di_nlink) {
				/* new dir already unlinked */
				error = -EINVAL;
				goto out_gunlock;
			}
			if (ndip->i_di.di_entries == (uint32_t)-1) {
				error = -EFBIG;
				goto out_gunlock;
			}
			if (S_ISDIR(ip->i_di.di_mode) &&
			    ndip->i_di.di_nlink == (uint32_t)-1) {
				/* new dir's ".." link would wrap */
				error = -EMLINK;
				goto out_gunlock;
			}
		}
	}

	/* Check out the dir to be renamed */

	if (dir_rename) {
		error = gfs2_repermission(odentry->d_inode, MAY_WRITE, NULL);
		if (error)
			goto out_gunlock;
	}

	error = gfs2_diradd_alloc_required(ndip, &ndentry->d_name,
					   &alloc_required);
	if (error)
		goto out_gunlock;

	if (alloc_required) {
		/* The new entry needs fresh directory blocks: reserve
		   quota and rgrp space, and size the transaction to
		   cover the rgrp bitmaps as well. */
		struct gfs2_alloc *al = gfs2_alloc_get(ndip);

		error = gfs2_quota_lock(ndip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
		if (error)
			goto out_alloc;

		error = gfs2_quota_check(ndip, ndip->i_di.di_uid,
					 ndip->i_di.di_gid);
		if (error)
			goto out_gunlock_q;

		al->al_requested = sdp->sd_max_dirres;

		error = gfs2_inplace_reserve(ndip);
		if (error)
			goto out_gunlock_q;

		error = gfs2_trans_begin(sdp,
					 sdp->sd_max_dirres +
					 al->al_rgd->rd_ri.ri_length +
					 4 * RES_DINODE + 4 * RES_LEAF +
					 RES_UNLINKED + RES_STATFS +
					 RES_QUOTA, 0);
		if (error)
			goto out_ipreserv;
	} else {
		error = gfs2_trans_begin(sdp, 4 * RES_DINODE +
					 5 * RES_LEAF +
					 RES_UNLINKED, 0);
		if (error)
			goto out_gunlock;
	}

	/* Remove the target file, if it exists */

	if (nip) {
		if (S_ISDIR(nip->i_di.di_mode))
			error = gfs2_rmdiri(ndip, &ndentry->d_name, nip, ul);
		else
			error = gfs2_unlinki(ndip, &ndentry->d_name, nip, ul);
		if (error)
			goto out_end_trans;
	}

	if (dir_rename) {
		/* Repoint the victim's ".." entry and move one link from
		   the old parent to the new one. */
		struct qstr name;
		name.len = 2;
		name.name = "..";

		error = gfs2_change_nlink(ndip, +1);
		if (error)
			goto out_end_trans;
		error = gfs2_change_nlink(odip, -1);
		if (error)
			goto out_end_trans;

		error = gfs2_dir_mvino(ip, &name, &ndip->i_num, DT_DIR);
		if (error)
			goto out_end_trans;
	} else {
		/* Plain rename: just bump the victim's ctime */
		struct buffer_head *dibh;
		error = gfs2_meta_inode_buffer(ip, &dibh);
		if (error)
			goto out_end_trans;
		ip->i_di.di_ctime = get_seconds();
		gfs2_trans_add_bh(ip->i_gl, dibh, 1);
		gfs2_dinode_out(&ip->i_di, dibh->b_data);
		brelse(dibh);
	}

	error = gfs2_dir_del(odip, &odentry->d_name);
	if (error)
		goto out_end_trans;

	error = gfs2_dir_add(ndip, &ndentry->d_name, &ip->i_num,
			     IF2DT(ip->i_di.di_mode));
	if (error)
		goto out_end_trans;

	/* Unwind in strict reverse order of acquisition; labels below the
	   failure point are no-ops thanks to the conditional guards. */
 out_end_trans:
	gfs2_trans_end(sdp);

 out_ipreserv:
	if (alloc_required)
		gfs2_inplace_release(ndip);

 out_gunlock_q:
	if (alloc_required)
		gfs2_quota_unlock(ndip);

 out_alloc:
	if (alloc_required)
		gfs2_alloc_put(ndip);

 out_gunlock:
	gfs2_glock_dq_m(num_gh, ghs);

 out_uninit:
	for (x = 0; x < num_gh; x++)
		gfs2_holder_uninit(ghs + x);

 out_gunlock_r:
	if (dir_rename)
		gfs2_glock_dq_uninit(&r_gh);

 out:
	gfs2_unlinked_put(sdp, ul);

	return error;
}
856
857/**
858 * gfs2_readlink - Read the value of a symlink
859 * @dentry: the symlink
860 * @buf: the buffer to read the symlink data into
861 * @size: the size of the buffer
862 *
863 * Returns: errno
864 */
865
866static int gfs2_readlink(struct dentry *dentry, char __user *user_buf,
867 int user_size)
868{
869 struct gfs2_inode *ip = get_v2ip(dentry->d_inode);
870 char array[GFS2_FAST_NAME_SIZE], *buf = array;
871 unsigned int len = GFS2_FAST_NAME_SIZE;
872 int error;
873
874 atomic_inc(&ip->i_sbd->sd_ops_inode);
875
876 error = gfs2_readlinki(ip, &buf, &len);
877 if (error)
878 return error;
879
880 if (user_size > len - 1)
881 user_size = len - 1;
882
883 if (copy_to_user(user_buf, buf, user_size))
884 error = -EFAULT;
885 else
886 error = user_size;
887
888 if (buf != array)
889 kfree(buf);
890
891 return error;
892}
893
894/**
895 * gfs2_follow_link - Follow a symbolic link
896 * @dentry: The dentry of the link
897 * @nd: Data that we pass to vfs_follow_link()
898 *
899 * This can handle symlinks of any size. It is optimised for symlinks
900 * under GFS2_FAST_NAME_SIZE.
901 *
902 * Returns: 0 on success or error code
903 */
904
905static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd)
906{
907 struct gfs2_inode *ip = get_v2ip(dentry->d_inode);
908 char array[GFS2_FAST_NAME_SIZE], *buf = array;
909 unsigned int len = GFS2_FAST_NAME_SIZE;
910 int error;
911
912 atomic_inc(&ip->i_sbd->sd_ops_inode);
913
914 error = gfs2_readlinki(ip, &buf, &len);
915 if (!error) {
916 error = vfs_follow_link(nd, buf);
917 if (buf != array)
918 kfree(buf);
919 }
920
921 return ERR_PTR(error);
922}
923
/**
 * gfs2_permission - Check access permissions on an inode
 * @inode: the inode being accessed
 * @mask: the access being requested (MAY_* flags)
 * @nd: passed from Linux VFS, ignored by us
 *
 * Returns: errno
 */

static int gfs2_permission(struct inode *inode, int mask, struct nameidata *nd)
{
	struct gfs2_inode *ip = get_v2ip(inode);
	struct gfs2_holder i_gh;
	int error;

	atomic_inc(&ip->i_sbd->sd_ops_inode);

	/* Fast path: if the inode's cached version matches the glock's,
	   the in-core attributes are current and no locking is needed. */
	if (ip->i_vn == ip->i_gl->gl_vn)
		return generic_permission(inode, mask, gfs2_check_acl);

	/* Otherwise take a shared glock (LM_FLAG_ANY: any compatible
	   held state will do) to refresh the inode before checking. */
	error = gfs2_glock_nq_init(ip->i_gl,
				   LM_ST_SHARED, LM_FLAG_ANY,
				   &i_gh);
	if (!error) {
		/* _locked variant: we already hold the glock here */
		error = generic_permission(inode, mask, gfs2_check_acl_locked);
		gfs2_glock_dq_uninit(&i_gh);
	}

	return error;
}
954
955static int setattr_size(struct inode *inode, struct iattr *attr)
956{
957 struct gfs2_inode *ip = get_v2ip(inode);
958 int error;
959
960 if (attr->ia_size != ip->i_di.di_size) {
961 error = vmtruncate(inode, attr->ia_size);
962 if (error)
963 return error;
964 }
965
966 error = gfs2_truncatei(ip, attr->ia_size);
967 if (error)
968 return error;
969
970 return error;
971}
972
/**
 * setattr_chown - Handle the ATTR_UID/ATTR_GID part of gfs2_setattr()
 * @inode: the VFS inode whose ownership is changing
 * @attr: the attribute change; ia_uid/ia_gid and ia_valid are used
 *
 * Moves the inode's block usage from the old owner's quota to the new
 * owner's.  Caller (gfs2_setattr) already holds the inode glock
 * exclusively.
 *
 * Returns: errno
 */

static int setattr_chown(struct inode *inode, struct iattr *attr)
{
	struct gfs2_inode *ip = get_v2ip(inode);
	struct gfs2_sbd *sdp = ip->i_sbd;
	struct buffer_head *dibh;
	uint32_t ouid, ogid, nuid, ngid;
	int error;

	ouid = ip->i_di.di_uid;
	ogid = ip->i_di.di_gid;
	nuid = attr->ia_uid;
	ngid = attr->ia_gid;

	/* Collapse no-op changes to NO_QUOTA_CHANGE so the quota code
	   skips the corresponding id entirely */
	if (!(attr->ia_valid & ATTR_UID) || ouid == nuid)
		ouid = nuid = NO_QUOTA_CHANGE;
	if (!(attr->ia_valid & ATTR_GID) || ogid == ngid)
		ogid = ngid = NO_QUOTA_CHANGE;

	gfs2_alloc_get(ip);

	error = gfs2_quota_lock(ip, nuid, ngid);
	if (error)
		goto out_alloc;

	/* Make sure the new owner has room for this inode's blocks */
	if (ouid != NO_QUOTA_CHANGE || ogid != NO_QUOTA_CHANGE) {
		error = gfs2_quota_check(ip, nuid, ngid);
		if (error)
			goto out_gunlock_q;
	}

	error = gfs2_trans_begin(sdp, RES_DINODE + 2 * RES_QUOTA, 0);
	if (error)
		goto out_gunlock_q;

	error = gfs2_meta_inode_buffer(ip, &dibh);
	if (error)
		goto out_end_trans;

	/* Update the VFS inode, then mirror it into the dinode */
	error = inode_setattr(inode, attr);
	gfs2_assert_warn(sdp, !error);
	gfs2_inode_attr_out(ip);

	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
	gfs2_dinode_out(&ip->i_di, dibh->b_data);
	brelse(dibh);

	/* Transfer the block count: debit the old owner, credit the new */
	if (ouid != NO_QUOTA_CHANGE || ogid != NO_QUOTA_CHANGE) {
		gfs2_quota_change(ip, -ip->i_di.di_blocks,
				  ouid, ogid);
		gfs2_quota_change(ip, ip->i_di.di_blocks,
				  nuid, ngid);
	}

 out_end_trans:
	gfs2_trans_end(sdp);

 out_gunlock_q:
	gfs2_quota_unlock(ip);

 out_alloc:
	gfs2_alloc_put(ip);

	return error;
}
1037
/**
 * gfs2_setattr - Change attributes on an inode
 * @dentry: The dentry which is changing
 * @attr: The structure describing the change
 *
 * The VFS layer wants to change one or more of an inodes attributes.  Write
 * that change out to disk.  The inode glock is held exclusively across the
 * whole operation; the actual work is dispatched by attribute class
 * (size, then ownership, then mode/ACL, then everything else).
 *
 * Returns: errno
 */

static int gfs2_setattr(struct dentry *dentry, struct iattr *attr)
{
	struct inode *inode = dentry->d_inode;
	struct gfs2_inode *ip = get_v2ip(inode);
	struct gfs2_holder i_gh;
	int error;

	atomic_inc(&ip->i_sbd->sd_ops_inode);

	error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
	if (error)
		return error;

	error = -EPERM;
	if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
		goto out;

	error = inode_change_ok(inode, attr);
	if (error)
		goto out;

	/* Dispatch by attribute class; first match wins */
	if (attr->ia_valid & ATTR_SIZE)
		error = setattr_size(inode, attr);
	else if (attr->ia_valid & (ATTR_UID | ATTR_GID))
		error = setattr_chown(inode, attr);
	else if ((attr->ia_valid & ATTR_MODE) && IS_POSIXACL(inode))
		error = gfs2_acl_chmod(ip, attr);
	else
		error = gfs2_setattr_simple(ip, attr);

 out:
	gfs2_glock_dq_uninit(&i_gh);

	if (!error)
		mark_inode_dirty(inode);

	return error;
}
1087
1088/**
1089 * gfs2_getattr - Read out an inode's attributes
1090 * @mnt: ?
1091 * @dentry: The dentry to stat
1092 * @stat: The inode's stats
1093 *
1094 * Returns: errno
1095 */
1096
1097static int gfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
1098 struct kstat *stat)
1099{
1100 struct inode *inode = dentry->d_inode;
1101 struct gfs2_inode *ip = get_v2ip(inode);
1102 struct gfs2_holder gh;
1103 int error;
1104
1105 atomic_inc(&ip->i_sbd->sd_ops_inode);
1106
1107 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &gh);
1108 if (!error) {
1109 generic_fillattr(inode, stat);
1110 gfs2_glock_dq_uninit(&gh);
1111 }
1112
1113 return error;
1114}
1115
1116static int gfs2_setxattr(struct dentry *dentry, const char *name,
1117 const void *data, size_t size, int flags)
1118{
1119 struct gfs2_inode *ip = get_v2ip(dentry->d_inode);
1120 struct gfs2_ea_request er;
1121
1122 atomic_inc(&ip->i_sbd->sd_ops_inode);
1123
1124 memset(&er, 0, sizeof(struct gfs2_ea_request));
1125 er.er_type = gfs2_ea_name2type(name, &er.er_name);
1126 if (er.er_type == GFS2_EATYPE_UNUSED)
1127 return -EOPNOTSUPP;
1128 er.er_data = (char *)data;
1129 er.er_name_len = strlen(er.er_name);
1130 er.er_data_len = size;
1131 er.er_flags = flags;
1132
1133 gfs2_assert_warn(ip->i_sbd, !(er.er_flags & GFS2_ERF_MODE));
1134
1135 return gfs2_ea_set(ip, &er);
1136}
1137
1138static ssize_t gfs2_getxattr(struct dentry *dentry, const char *name,
1139 void *data, size_t size)
1140{
1141 struct gfs2_ea_request er;
1142
1143 atomic_inc(&get_v2sdp(dentry->d_inode->i_sb)->sd_ops_inode);
1144
1145 memset(&er, 0, sizeof(struct gfs2_ea_request));
1146 er.er_type = gfs2_ea_name2type(name, &er.er_name);
1147 if (er.er_type == GFS2_EATYPE_UNUSED)
1148 return -EOPNOTSUPP;
1149 er.er_data = data;
1150 er.er_name_len = strlen(er.er_name);
1151 er.er_data_len = size;
1152
1153 return gfs2_ea_get(get_v2ip(dentry->d_inode), &er);
1154}
1155
1156static ssize_t gfs2_listxattr(struct dentry *dentry, char *buffer, size_t size)
1157{
1158 struct gfs2_ea_request er;
1159
1160 atomic_inc(&get_v2sdp(dentry->d_inode->i_sb)->sd_ops_inode);
1161
1162 memset(&er, 0, sizeof(struct gfs2_ea_request));
1163 er.er_data = (size) ? buffer : NULL;
1164 er.er_data_len = size;
1165
1166 return gfs2_ea_list(get_v2ip(dentry->d_inode), &er);
1167}
1168
1169static int gfs2_removexattr(struct dentry *dentry, const char *name)
1170{
1171 struct gfs2_ea_request er;
1172
1173 atomic_inc(&get_v2sdp(dentry->d_inode->i_sb)->sd_ops_inode);
1174
1175 memset(&er, 0, sizeof(struct gfs2_ea_request));
1176 er.er_type = gfs2_ea_name2type(name, &er.er_name);
1177 if (er.er_type == GFS2_EATYPE_UNUSED)
1178 return -EOPNOTSUPP;
1179 er.er_name_len = strlen(er.er_name);
1180
1181 return gfs2_ea_remove(get_v2ip(dentry->d_inode), &er);
1182}
1183
/* Inode operations for regular files */
struct inode_operations gfs2_file_iops = {
	.permission = gfs2_permission,
	.setattr = gfs2_setattr,
	.getattr = gfs2_getattr,
	.setxattr = gfs2_setxattr,
	.getxattr = gfs2_getxattr,
	.listxattr = gfs2_listxattr,
	.removexattr = gfs2_removexattr,
};
1193
/* Inode operations for device nodes, FIFOs and sockets (same set as
 * regular files; the device-specific file ops come from the VFS) */
struct inode_operations gfs2_dev_iops = {
	.permission = gfs2_permission,
	.setattr = gfs2_setattr,
	.getattr = gfs2_getattr,
	.setxattr = gfs2_setxattr,
	.getxattr = gfs2_getxattr,
	.listxattr = gfs2_listxattr,
	.removexattr = gfs2_removexattr,
};
1203
/* Inode operations for directories: the file-type set plus all the
 * namespace-changing operations */
struct inode_operations gfs2_dir_iops = {
	.create = gfs2_create,
	.lookup = gfs2_lookup,
	.link = gfs2_link,
	.unlink = gfs2_unlink,
	.symlink = gfs2_symlink,
	.mkdir = gfs2_mkdir,
	.rmdir = gfs2_rmdir,
	.mknod = gfs2_mknod,
	.rename = gfs2_rename,
	.permission = gfs2_permission,
	.setattr = gfs2_setattr,
	.getattr = gfs2_getattr,
	.setxattr = gfs2_setxattr,
	.getxattr = gfs2_getxattr,
	.listxattr = gfs2_listxattr,
	.removexattr = gfs2_removexattr,
};
1222
/* Inode operations for symlinks: the file-type set plus readlink /
 * follow_link */
struct inode_operations gfs2_symlink_iops = {
	.readlink = gfs2_readlink,
	.follow_link = gfs2_follow_link,
	.permission = gfs2_permission,
	.setattr = gfs2_setattr,
	.getattr = gfs2_getattr,
	.setxattr = gfs2_setxattr,
	.getxattr = gfs2_getxattr,
	.listxattr = gfs2_listxattr,
	.removexattr = gfs2_removexattr,
};
1234
diff --git a/fs/gfs2/ops_inode.h b/fs/gfs2/ops_inode.h
new file mode 100644
index 000000000000..5fafd87c8d7b
--- /dev/null
+++ b/fs/gfs2/ops_inode.h
@@ -0,0 +1,18 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __OPS_INODE_DOT_H__
11#define __OPS_INODE_DOT_H__
12
13extern struct inode_operations gfs2_file_iops;
14extern struct inode_operations gfs2_dir_iops;
15extern struct inode_operations gfs2_symlink_iops;
16extern struct inode_operations gfs2_dev_iops;
17
18#endif /* __OPS_INODE_DOT_H__ */
diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c
new file mode 100644
index 000000000000..9130d0d0df3c
--- /dev/null
+++ b/fs/gfs2/ops_super.c
@@ -0,0 +1,401 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/vmalloc.h>
16#include <linux/statfs.h>
17#include <linux/seq_file.h>
18#include <linux/mount.h>
19#include <linux/kthread.h>
20#include <linux/delay.h>
21#include <asm/semaphore.h>
22
23#include "gfs2.h"
24#include "glock.h"
25#include "inode.h"
26#include "lm.h"
27#include "log.h"
28#include "mount.h"
29#include "ops_super.h"
30#include "page.h"
31#include "quota.h"
32#include "recovery.h"
33#include "rgrp.h"
34#include "super.h"
35#include "sys.h"
36
37/**
38 * gfs2_write_inode - Make sure the inode is stable on the disk
39 * @inode: The inode
40 * @sync: synchronous write flag
41 *
42 * Returns: errno
43 */
44
45static int gfs2_write_inode(struct inode *inode, int sync)
46{
47 struct gfs2_inode *ip = get_v2ip(inode);
48
49 atomic_inc(&ip->i_sbd->sd_ops_super);
50
51 if (current->flags & PF_MEMALLOC)
52 return 0;
53 if (ip && sync)
54 gfs2_log_flush_glock(ip->i_gl);
55
56 return 0;
57}
58
59/**
60 * gfs2_put_super - Unmount the filesystem
61 * @sb: The VFS superblock
62 *
63 */
64
static void gfs2_put_super(struct super_block *sb)
{
	struct gfs2_sbd *sdp = get_v2sdp(sb);
	int error;

	/* sdp can be NULL if mount failed partway through setup */
	if (!sdp)
		return;

	atomic_inc(&sdp->sd_ops_super);

	/* Unfreeze the filesystem, if we need to */

	mutex_lock(&sdp->sd_freeze_lock);
	if (sdp->sd_freeze_count)
		gfs2_glock_dq_uninit(&sdp->sd_freeze_gh);
	mutex_unlock(&sdp->sd_freeze_lock);

	/* Stop all the per-sb daemons before changing on-disk state */
	kthread_stop(sdp->sd_inoded_process);
	kthread_stop(sdp->sd_quotad_process);
	kthread_stop(sdp->sd_logd_process);
	kthread_stop(sdp->sd_recoverd_process);
	while (sdp->sd_glockd_num--)
		kthread_stop(sdp->sd_glockd_process[sdp->sd_glockd_num]);
	kthread_stop(sdp->sd_scand_process);

	/* Quiesce the disk: a failed remount-ro is only reported, since
	   there is nothing else we can do during unmount. */
	if (!(sb->s_flags & MS_RDONLY)) {
		error = gfs2_make_fs_ro(sdp);
		if (error)
			gfs2_io_error(sdp);
	}

	/* At this point, we're through modifying the disk */

	/* Release stuff */

	iput(sdp->sd_master_dir);
	iput(sdp->sd_jindex);
	iput(sdp->sd_inum_inode);
	iput(sdp->sd_statfs_inode);
	iput(sdp->sd_rindex);
	iput(sdp->sd_quota_inode);
	iput(sdp->sd_root_dir);

	gfs2_glock_put(sdp->sd_rename_gl);
	gfs2_glock_put(sdp->sd_trans_gl);

	/* Spectator mounts never acquired the per-journal resources */
	if (!sdp->sd_args.ar_spectator) {
		gfs2_glock_dq_uninit(&sdp->sd_journal_gh);
		gfs2_glock_dq_uninit(&sdp->sd_jinode_gh);
		gfs2_glock_dq_uninit(&sdp->sd_ir_gh);
		gfs2_glock_dq_uninit(&sdp->sd_sc_gh);
		gfs2_glock_dq_uninit(&sdp->sd_ut_gh);
		gfs2_glock_dq_uninit(&sdp->sd_qc_gh);
		iput(sdp->sd_ir_inode);
		iput(sdp->sd_sc_inode);
		iput(sdp->sd_ut_inode);
		iput(sdp->sd_qc_inode);
	}

	gfs2_glock_dq_uninit(&sdp->sd_live_gh);

	gfs2_clear_rgrpd(sdp);
	gfs2_jindex_free(sdp);

	/* Take apart glock structures and buffer lists */
	gfs2_gl_hash_clear(sdp, WAIT);

	/* Unmount the locking protocol */
	gfs2_lm_unmount(sdp);

	/* At this point, we're through participating in the lockspace */

	gfs2_sys_fs_del(sdp);

	/* Get rid of any extra inodes */
	while (invalidate_inodes(sb))
		yield();

	vfree(sdp);

	set_v2sdp(sb, NULL);
}
147
148/**
149 * gfs2_write_super - disk commit all incore transactions
150 * @sb: the filesystem
151 *
152 * This function is called every time sync(2) is called.
153 * After this exits, all dirty buffers are synced.
154 */
155
156static void gfs2_write_super(struct super_block *sb)
157{
158 struct gfs2_sbd *sdp = get_v2sdp(sb);
159 atomic_inc(&sdp->sd_ops_super);
160 gfs2_log_flush(sdp);
161}
162
163/**
164 * gfs2_write_super_lockfs - prevent further writes to the filesystem
165 * @sb: the VFS structure for the filesystem
166 *
167 */
168
169static void gfs2_write_super_lockfs(struct super_block *sb)
170{
171 struct gfs2_sbd *sdp = get_v2sdp(sb);
172 int error;
173
174 atomic_inc(&sdp->sd_ops_super);
175
176 for (;;) {
177 error = gfs2_freeze_fs(sdp);
178 if (!error)
179 break;
180
181 switch (error) {
182 case -EBUSY:
183 fs_err(sdp, "waiting for recovery before freeze\n");
184 break;
185
186 default:
187 fs_err(sdp, "error freezing FS: %d\n", error);
188 break;
189 }
190
191 fs_err(sdp, "retrying...\n");
192 msleep(1000);
193 }
194}
195
196/**
197 * gfs2_unlockfs - reallow writes to the filesystem
198 * @sb: the VFS structure for the filesystem
199 *
200 */
201
202static void gfs2_unlockfs(struct super_block *sb)
203{
204 struct gfs2_sbd *sdp = get_v2sdp(sb);
205
206 atomic_inc(&sdp->sd_ops_super);
207 gfs2_unfreeze_fs(sdp);
208}
209
210/**
211 * gfs2_statfs - Gather and return stats about the filesystem
212 * @sb: The superblock
213 * @statfsbuf: The buffer
214 *
215 * Returns: 0 on success or error code
216 */
217
static int gfs2_statfs(struct super_block *sb, struct kstatfs *buf)
{
	struct gfs2_sbd *sdp = get_v2sdp(sb);
	struct gfs2_statfs_change sc;
	int error;

	atomic_inc(&sdp->sd_ops_super);

	/* The "slow" path computes exact numbers; the default path uses
	   cached statfs data and may lag slightly behind reality. */
	if (gfs2_tune_get(sdp, gt_statfs_slow))
		error = gfs2_statfs_slow(sdp, &sc);
	else
		error = gfs2_statfs_i(sdp, &sc);

	if (error)
		return error;

	memset(buf, 0, sizeof(struct kstatfs));

	buf->f_type = GFS2_MAGIC;
	buf->f_bsize = sdp->sd_sb.sb_bsize;
	buf->f_blocks = sc.sc_total;
	buf->f_bfree = sc.sc_free;
	buf->f_bavail = sc.sc_free;
	/* There is no fixed inode table: any free block can become a
	   dinode, so inode counts are derived from free block counts. */
	buf->f_files = sc.sc_dinodes + sc.sc_free;
	buf->f_ffree = sc.sc_free;
	buf->f_namelen = GFS2_FNAMESIZE;

	return 0;
}
247
248/**
249 * gfs2_remount_fs - called when the FS is remounted
250 * @sb: the filesystem
251 * @flags: the remount flags
252 * @data: extra data passed in (not used right now)
253 *
254 * Returns: errno
255 */
256
static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
{
	struct gfs2_sbd *sdp = get_v2sdp(sb);
	int error;

	atomic_inc(&sdp->sd_ops_super);

	/* Re-parse the (possibly changed) mount options; "1" marks this
	   as a remount so immutable options are rejected. */
	error = gfs2_mount_args(sdp, data, 1);
	if (error)
		return error;

	/* Spectator mounts are always read-only; otherwise switch the
	   journal state to match the requested ro/rw direction. */
	if (sdp->sd_args.ar_spectator)
		*flags |= MS_RDONLY;
	else {
		if (*flags & MS_RDONLY) {
			if (!(sb->s_flags & MS_RDONLY))
				error = gfs2_make_fs_ro(sdp);
		} else if (!(*flags & MS_RDONLY) &&
			   (sb->s_flags & MS_RDONLY)) {
			error = gfs2_make_fs_rw(sdp);
		}
	}

	if (*flags & (MS_NOATIME | MS_NODIRATIME))
		set_bit(SDF_NOATIME, &sdp->sd_flags);
	else
		clear_bit(SDF_NOATIME, &sdp->sd_flags);

	/* Don't let the VFS update atimes.  GFS2 handles this itself. */
	*flags |= MS_NOATIME | MS_NODIRATIME;

	return error;
}
290
291/**
292 * gfs2_clear_inode - Deallocate an inode when VFS is done with it
293 * @inode: The VFS inode
294 *
295 */
296
297static void gfs2_clear_inode(struct inode *inode)
298{
299 struct gfs2_inode *ip = get_v2ip(inode);
300
301 atomic_inc(&get_v2sdp(inode->i_sb)->sd_ops_super);
302
303 if (ip) {
304 spin_lock(&ip->i_spin);
305 ip->i_vnode = NULL;
306 set_v2ip(inode, NULL);
307 spin_unlock(&ip->i_spin);
308
309 gfs2_glock_schedule_for_reclaim(ip->i_gl);
310 gfs2_inode_put(ip);
311 }
312}
313
314/**
315 * gfs2_show_options - Show mount options for /proc/mounts
316 * @s: seq_file structure
317 * @mnt: vfsmount
318 *
319 * Returns: 0 on success or error code
320 */
321
static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
{
	struct gfs2_sbd *sdp = get_v2sdp(mnt->mnt_sb);
	struct gfs2_args *args = &sdp->sd_args;

	atomic_inc(&sdp->sd_ops_super);

	/* Only emit options that differ from the defaults, so the
	   /proc/mounts line stays short for a plain mount. */
	if (args->ar_lockproto[0])
		seq_printf(s, ",lockproto=%s", args->ar_lockproto);
	if (args->ar_locktable[0])
		seq_printf(s, ",locktable=%s", args->ar_locktable);
	if (args->ar_hostdata[0])
		seq_printf(s, ",hostdata=%s", args->ar_hostdata);
	if (args->ar_spectator)
		seq_printf(s, ",spectator");
	if (args->ar_ignore_local_fs)
		seq_printf(s, ",ignore_local_fs");
	if (args->ar_localflocks)
		seq_printf(s, ",localflocks");
	if (args->ar_localcaching)
		seq_printf(s, ",localcaching");
	if (args->ar_debug)
		seq_printf(s, ",debug");
	if (args->ar_upgrade)
		seq_printf(s, ",upgrade");
	if (args->ar_num_glockd != GFS2_GLOCKD_DEFAULT)
		seq_printf(s, ",num_glockd=%u", args->ar_num_glockd);
	if (args->ar_posix_acl)
		seq_printf(s, ",acl");
	if (args->ar_quota != GFS2_QUOTA_DEFAULT) {
		char *state;
		switch (args->ar_quota) {
		case GFS2_QUOTA_OFF:
			state = "off";
			break;
		case GFS2_QUOTA_ACCOUNT:
			state = "account";
			break;
		case GFS2_QUOTA_ON:
			state = "on";
			break;
		default:
			state = "unknown";
			break;
		}
		seq_printf(s, ",quota=%s", state);
	}
	if (args->ar_suiddir)
		seq_printf(s, ",suiddir");
	if (args->ar_data != GFS2_DATA_DEFAULT) {
		char *state;
		switch (args->ar_data) {
		case GFS2_DATA_WRITEBACK:
			state = "writeback";
			break;
		case GFS2_DATA_ORDERED:
			state = "ordered";
			break;
		default:
			state = "unknown";
			break;
		}
		seq_printf(s, ",data=%s", state);
	}

	return 0;
}
389
/* Superblock operations registered with the VFS at mount time */
struct super_operations gfs2_super_ops = {
	.write_inode = gfs2_write_inode,
	.put_super = gfs2_put_super,
	.write_super = gfs2_write_super,
	.write_super_lockfs = gfs2_write_super_lockfs,
	.unlockfs = gfs2_unlockfs,
	.statfs = gfs2_statfs,
	.remount_fs = gfs2_remount_fs,
	.clear_inode = gfs2_clear_inode,
	.show_options = gfs2_show_options,
};
401
diff --git a/fs/gfs2/ops_super.h b/fs/gfs2/ops_super.h
new file mode 100644
index 000000000000..a41d208dc558
--- /dev/null
+++ b/fs/gfs2/ops_super.h
@@ -0,0 +1,15 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
#ifndef __OPS_SUPER_DOT_H__
#define __OPS_SUPER_DOT_H__

/* Superblock operation table defined in ops_super.c */
extern struct super_operations gfs2_super_ops;

#endif /* __OPS_SUPER_DOT_H__ */
15#endif /* __OPS_SUPER_DOT_H__ */
diff --git a/fs/gfs2/ops_vm.c b/fs/gfs2/ops_vm.c
new file mode 100644
index 000000000000..8f77bb7896bd
--- /dev/null
+++ b/fs/gfs2/ops_vm.c
@@ -0,0 +1,196 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/mm.h>
16#include <linux/pagemap.h>
17#include <asm/semaphore.h>
18
19#include "gfs2.h"
20#include "bmap.h"
21#include "glock.h"
22#include "inode.h"
23#include "ops_vm.h"
24#include "page.h"
25#include "quota.h"
26#include "rgrp.h"
27#include "trans.h"
28
/* Ask the glock to stay held for the inode's "greedy" period after a
   page fault, to avoid thrashing the lock under mmap access. */
static void pfault_be_greedy(struct gfs2_inode *ip)
{
	unsigned int time;

	spin_lock(&ip->i_spin);
	time = ip->i_greedy;
	ip->i_last_pfault = jiffies;
	spin_unlock(&ip->i_spin);

	gfs2_inode_hold(ip);
	/* NOTE(review): if gfs2_glock_be_greedy() returns nonzero the
	   extra inode reference is dropped here; presumably zero means
	   the glock kept the reference for the greedy period - confirm
	   against glock.c. */
	if (gfs2_glock_be_greedy(ip->i_gl, time))
		gfs2_inode_put(ip);
}
42
/* nopage handler for MAP_PRIVATE mappings: take a shared glock around
   the generic fault so the page read is cluster-coherent. */
static struct page *gfs2_private_nopage(struct vm_area_struct *area,
					unsigned long address, int *type)
{
	struct gfs2_inode *ip = get_v2ip(area->vm_file->f_mapping->host);
	struct gfs2_holder i_gh;
	struct page *result;
	int error;

	atomic_inc(&ip->i_sbd->sd_ops_vm);

	/* A failed lock attempt looks like a fault failure (NULL) */
	error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &i_gh);
	if (error)
		return NULL;

	/* Remember that this inode has mmap'ed pages that may need
	   invalidating when the glock is demoted */
	set_bit(GIF_PAGED, &ip->i_flags);

	result = filemap_nopage(area, address, type);

	if (result && result != NOPAGE_OOM)
		pfault_be_greedy(ip);

	gfs2_glock_dq_uninit(&i_gh);

	return result;
}
68
/* Allocate on-disk blocks backing one page of a shared-writable
   mapping: quota check, block reservation, transaction, then map every
   logical block covered by the page.  Uses the standard goto-cleanup
   chain to unwind in reverse order of acquisition. */
static int alloc_page_backing(struct gfs2_inode *ip, struct page *page)
{
	struct gfs2_sbd *sdp = ip->i_sbd;
	unsigned long index = page->index;
	/* Convert page index to filesystem logical block number */
	uint64_t lblock = index << (PAGE_CACHE_SHIFT - sdp->sd_sb.sb_bsize_shift);
	unsigned int blocks = PAGE_CACHE_SIZE >> sdp->sd_sb.sb_bsize_shift;
	struct gfs2_alloc *al;
	unsigned int data_blocks, ind_blocks;
	unsigned int x;
	int error;

	al = gfs2_alloc_get(ip);

	error = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
	if (error)
		goto out;

	error = gfs2_quota_check(ip, ip->i_di.di_uid, ip->i_di.di_gid);
	if (error)
		goto out_gunlock_q;

	/* Worst-case block count for a full-page write, incl. metadata */
	gfs2_write_calc_reserv(ip, PAGE_CACHE_SIZE,
			       &data_blocks, &ind_blocks);

	al->al_requested = data_blocks + ind_blocks;

	error = gfs2_inplace_reserve(ip);
	if (error)
		goto out_gunlock_q;

	error = gfs2_trans_begin(sdp,
				 al->al_rgd->rd_ri.ri_length +
				 ind_blocks + RES_DINODE +
				 RES_STATFS + RES_QUOTA, 0);
	if (error)
		goto out_ipres;

	/* Inline ("stuffed") data must be pushed into a real block
	   before per-block mapping makes sense */
	if (gfs2_is_stuffed(ip)) {
		error = gfs2_unstuff_dinode(ip, gfs2_unstuffer_page, NULL);
		if (error)
			goto out_trans;
	}

	/* Map (allocating as needed) every block the page covers;
	   extents let us advance several blocks per call */
	for (x = 0; x < blocks; ) {
		uint64_t dblock;
		unsigned int extlen;
		int new = 1;

		error = gfs2_block_map(ip, lblock, &new, &dblock, &extlen);
		if (error)
			goto out_trans;

		lblock += extlen;
		x += extlen;
	}

	gfs2_assert_warn(sdp, al->al_alloced);

 out_trans:
	gfs2_trans_end(sdp);

 out_ipres:
	gfs2_inplace_release(ip);

 out_gunlock_q:
	gfs2_quota_unlock(ip);

 out:
	gfs2_alloc_put(ip);

	return error;
}
141
/* nopage handler for shared-writable mappings: take an exclusive glock,
   fault the page in, and allocate backing blocks if the page's range is
   not yet allocated on disk. */
static struct page *gfs2_sharewrite_nopage(struct vm_area_struct *area,
					   unsigned long address, int *type)
{
	struct gfs2_inode *ip = get_v2ip(area->vm_file->f_mapping->host);
	struct gfs2_holder i_gh;
	struct page *result = NULL;
	/* File page index of the faulting address */
	unsigned long index = ((address - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff;
	int alloc_required;
	int error;

	atomic_inc(&ip->i_sbd->sd_ops_vm);

	error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
	if (error)
		return NULL;

	/* Track both plain mmap and writable-shared mmap state for
	   later invalidation on glock demotion */
	set_bit(GIF_PAGED, &ip->i_flags);
	set_bit(GIF_SW_PAGED, &ip->i_flags);

	error = gfs2_write_alloc_required(ip,
					  (uint64_t)index << PAGE_CACHE_SHIFT,
					  PAGE_CACHE_SIZE, &alloc_required);
	if (error)
		goto out;

	result = filemap_nopage(area, address, type);
	if (!result || result == NOPAGE_OOM)
		goto out;

	if (alloc_required) {
		error = alloc_page_backing(ip, result);
		if (error) {
			/* Drop the fault's page reference on failure */
			page_cache_release(result);
			result = NULL;
			goto out;
		}
		set_page_dirty(result);
	}

	pfault_be_greedy(ip);

 out:
	gfs2_glock_dq_uninit(&i_gh);

	return result;
}
188
/* VMA operations for private (read-only or MAP_PRIVATE) mappings */
struct vm_operations_struct gfs2_vm_ops_private = {
	.nopage = gfs2_private_nopage,
};

/* VMA operations for shared-writable mappings */
struct vm_operations_struct gfs2_vm_ops_sharewrite = {
	.nopage = gfs2_sharewrite_nopage,
};
196
diff --git a/fs/gfs2/ops_vm.h b/fs/gfs2/ops_vm.h
new file mode 100644
index 000000000000..54e3a8769cbb
--- /dev/null
+++ b/fs/gfs2/ops_vm.h
@@ -0,0 +1,16 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
#ifndef __OPS_VM_DOT_H__
#define __OPS_VM_DOT_H__

/* mmap operation tables defined in ops_vm.c */
extern struct vm_operations_struct gfs2_vm_ops_private;
extern struct vm_operations_struct gfs2_vm_ops_sharewrite;

#endif /* __OPS_VM_DOT_H__ */
16#endif /* __OPS_VM_DOT_H__ */
diff --git a/fs/gfs2/page.c b/fs/gfs2/page.c
new file mode 100644
index 000000000000..3542aa6b01c4
--- /dev/null
+++ b/fs/gfs2/page.c
@@ -0,0 +1,279 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/pagemap.h>
16#include <linux/mm.h>
17#include <asm/semaphore.h>
18
19#include "gfs2.h"
20#include "bmap.h"
21#include "inode.h"
22#include "page.h"
23#include "trans.h"
24#include "ops_address.h"
25
26/**
27 * gfs2_pte_inval - Sync and invalidate all PTEs associated with a glock
28 * @gl: the glock
29 *
30 */
31
void gfs2_pte_inval(struct gfs2_glock *gl)
{
	struct gfs2_inode *ip;
	struct inode *inode;

	/* Only regular files can have mmap'ed page tables */
	ip = get_gl2ip(gl);
	if (!ip || !S_ISREG(ip->i_di.di_mode))
		return;

	if (!test_bit(GIF_PAGED, &ip->i_flags))
		return;

	inode = gfs2_ip2v_lookup(ip);
	if (inode) {
		unmap_shared_mapping_range(inode->i_mapping, 0, 0);
		iput(inode);

		/* Writable-shared mappings may have dirtied pages via
		   the PTEs just torn down, so mark the glock dirty */
		if (test_bit(GIF_SW_PAGED, &ip->i_flags))
			set_bit(GLF_DIRTY, &gl->gl_flags);
	}

	/* Cleared even when no VFS inode was found above */
	clear_bit(GIF_SW_PAGED, &ip->i_flags);
}
55
56/**
57 * gfs2_page_inval - Invalidate all pages associated with a glock
58 * @gl: the glock
59 *
60 */
61
62void gfs2_page_inval(struct gfs2_glock *gl)
63{
64 struct gfs2_inode *ip;
65 struct inode *inode;
66
67 ip = get_gl2ip(gl);
68 if (!ip || !S_ISREG(ip->i_di.di_mode))
69 return;
70
71 inode = gfs2_ip2v_lookup(ip);
72 if (inode) {
73 struct address_space *mapping = inode->i_mapping;
74
75 truncate_inode_pages(mapping, 0);
76 gfs2_assert_withdraw(ip->i_sbd, !mapping->nrpages);
77
78 iput(inode);
79 }
80
81 clear_bit(GIF_PAGED, &ip->i_flags);
82}
83
84/**
85 * gfs2_page_sync - Sync the data pages (not metadata) associated with a glock
86 * @gl: the glock
87 * @flags: DIO_START | DIO_WAIT
88 *
89 * Syncs data (not metadata) for a regular file.
90 * No-op for all other types.
91 */
92
void gfs2_page_sync(struct gfs2_glock *gl, int flags)
{
	struct gfs2_inode *ip;
	struct inode *inode;

	ip = get_gl2ip(gl);
	if (!ip || !S_ISREG(ip->i_di.di_mode))
		return;

	inode = gfs2_ip2v_lookup(ip);
	if (inode) {
		struct address_space *mapping = inode->i_mapping;
		int error = 0;

		/* NOTE(review): filemap_fdatawrite()'s return value is
		   discarded, so "!error" below is always true here -
		   presumably intentional (wait errors dominate), but
		   worth confirming. */
		if (flags & DIO_START)
			filemap_fdatawrite(mapping);
		if (!error && (flags & DIO_WAIT))
			error = filemap_fdatawait(mapping);

		/* Put back any errors cleared by filemap_fdatawait()
		   so they can be caught by someone who can pass them
		   up to user space. */

		if (error == -ENOSPC)
			set_bit(AS_ENOSPC, &mapping->flags);
		else if (error)
			set_bit(AS_EIO, &mapping->flags);

		iput(inode);
	}
}
124
125/**
126 * gfs2_unstuffer_page - unstuff a stuffed inode into a block cached by a page
127 * @ip: the inode
128 * @dibh: the dinode buffer
129 * @block: the block number that was allocated
130 * @private: any locked page held by the caller process
131 *
132 * Returns: errno
133 */
134
int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
			uint64_t block, void *private)
{
	struct gfs2_sbd *sdp = ip->i_sbd;
	struct inode *inode = ip->i_vnode;
	struct page *page = (struct page *)private;
	struct buffer_head *bh;
	int release = 0;

	/* The caller may already hold page 0 locked; otherwise grab it
	   (a stuffed inode's data always lands in page 0). */
	if (!page || page->index) {
		page = grab_cache_page(inode->i_mapping, 0);
		if (!page)
			return -ENOMEM;
		release = 1;
	}

	/* Populate the page from the inline data in the dinode, zeroing
	   the tail beyond di_size */
	if (!PageUptodate(page)) {
		void *kaddr = kmap(page);

		memcpy(kaddr,
		       dibh->b_data + sizeof(struct gfs2_dinode),
		       ip->i_di.di_size);
		memset(kaddr + ip->i_di.di_size,
		       0,
		       PAGE_CACHE_SIZE - ip->i_di.di_size);
		kunmap(page);

		SetPageUptodate(page);
	}

	if (!page_has_buffers(page))
		create_empty_buffers(page, 1 << inode->i_blkbits,
				     (1 << BH_Uptodate));

	bh = page_buffers(page);

	/* Point the buffer at the freshly allocated data block */
	if (!buffer_mapped(bh))
		map_bh(bh, inode->i_sb, block);

	set_buffer_uptodate(bh);
	/* Ordered/journaled data must go through the transaction */
	if ((sdp->sd_args.ar_data == GFS2_DATA_ORDERED) || gfs2_is_jdata(ip))
		gfs2_trans_add_bh(ip->i_gl, bh, 0);
	mark_buffer_dirty(bh);

	if (release) {
		unlock_page(page);
		page_cache_release(page);
	}

	return 0;
}
186
187/**
188 * gfs2_block_truncate_page - Deal with zeroing out data for truncate
189 *
190 * This is partly borrowed from ext3.
191 */
/* Zero the partial block at the new EOF after a truncate so stale data
   is not exposed.  Partly borrowed from ext3. */
int gfs2_block_truncate_page(struct address_space *mapping)
{
	struct inode *inode = mapping->host;
	struct gfs2_inode *ip = get_v2ip(inode);
	struct gfs2_sbd *sdp = ip->i_sbd;
	loff_t from = inode->i_size;
	unsigned long index = from >> PAGE_CACHE_SHIFT;
	unsigned offset = from & (PAGE_CACHE_SIZE-1);
	unsigned blocksize, iblock, length, pos;
	struct buffer_head *bh;
	struct page *page;
	void *kaddr;
	int err;

	/* NOTE(review): failure to grab the page returns 0 (success),
	   silently skipping the zeroing - confirm this is intended. */
	page = grab_cache_page(mapping, index);
	if (!page)
		return 0;

	blocksize = inode->i_sb->s_blocksize;
	/* Bytes to zero: from "offset" to the end of its block */
	length = blocksize - (offset & (blocksize - 1));
	iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);

	if (!page_has_buffers(page))
		create_empty_buffers(page, blocksize, 0);

	/* Find the buffer that contains "offset" */
	bh = page_buffers(page);
	pos = blocksize;
	while (offset >= pos) {
		bh = bh->b_this_page;
		iblock++;
		pos += blocksize;
	}

	err = 0;

	if (!buffer_mapped(bh)) {
		gfs2_get_block(inode, iblock, bh, 0);
		/* unmapped? It's a hole - nothing to do */
		if (!buffer_mapped(bh))
			goto unlock;
	}

	/* Ok, it's mapped. Make sure it's up-to-date */
	if (PageUptodate(page))
		set_buffer_uptodate(bh);

	if (!buffer_uptodate(bh)) {
		err = -EIO;
		ll_rw_block(READ, 1, &bh);
		wait_on_buffer(bh);
		/* Uhhuh. Read error. Complain and punt. */
		if (!buffer_uptodate(bh))
			goto unlock;
	}

	/* Ordered/journaled data writes go through the transaction */
	if (sdp->sd_args.ar_data == GFS2_DATA_ORDERED || gfs2_is_jdata(ip))
		gfs2_trans_add_bh(ip->i_gl, bh, 0);

	kaddr = kmap_atomic(page, KM_USER0);
	memset(kaddr + offset, 0, length);
	flush_dcache_page(page);
	kunmap_atomic(kaddr, KM_USER0);

unlock:
	unlock_page(page);
	page_cache_release(page);
	return err;
}
261
262void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page,
263 unsigned int from, unsigned int to)
264{
265 struct buffer_head *head = page_buffers(page);
266 unsigned int bsize = head->b_size;
267 struct buffer_head *bh;
268 unsigned int start, end;
269
270 for (bh = head, start = 0;
271 bh != head || !start;
272 bh = bh->b_this_page, start = end) {
273 end = start + bsize;
274 if (end <= from || start >= to)
275 continue;
276 gfs2_trans_add_bh(ip->i_gl, bh, 0);
277 }
278}
279
diff --git a/fs/gfs2/page.h b/fs/gfs2/page.h
new file mode 100644
index 000000000000..346e296420c6
--- /dev/null
+++ b/fs/gfs2/page.h
@@ -0,0 +1,23 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
#ifndef __PAGE_DOT_H__
#define __PAGE_DOT_H__

/* Glock-driven PTE/page invalidation and data sync (page.c) */
void gfs2_pte_inval(struct gfs2_glock *gl);
void gfs2_page_inval(struct gfs2_glock *gl);
void gfs2_page_sync(struct gfs2_glock *gl, int flags);

/* Page-level helpers for unstuffing and truncation (page.c) */
int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
			uint64_t block, void *private);
int gfs2_block_truncate_page(struct address_space *mapping);
void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page,
			    unsigned int from, unsigned int to);
23#endif /* __PAGE_DOT_H__ */
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
new file mode 100644
index 000000000000..c0352cf330a3
--- /dev/null
+++ b/fs/gfs2/quota.c
@@ -0,0 +1,1293 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10/*
11 * Quota change tags are associated with each transaction that allocates or
12 * deallocates space. Those changes are accumulated locally to each node (in a
13 * per-node file) and then are periodically synced to the quota file. This
14 * avoids the bottleneck of constantly touching the quota file, but introduces
15 * fuzziness in the current usage value of IDs that are being used on different
16 * nodes in the cluster simultaneously. So, it is possible for a user on
17 * multiple nodes to overrun their quota, but that overrun is controllable.
18 * Since quota tags are part of transactions, there is no need for a quota check
19 * program to be run on node crashes or anything like that.
20 *
21 * There are couple of knobs that let the administrator manage the quota
22 * fuzziness. "quota_quantum" sets the maximum time a quota change can be
23 * sitting on one node before being synced to the quota file. (The default is
24 * 60 seconds.) Another knob, "quota_scale" controls how quickly the frequency
25 * of quota file syncs increases as the user moves closer to their limit. The
26 * more frequent the syncs, the more accurate the quota enforcement, but that
27 * means that there is more contention between the nodes for the quota file.
28 * The default value is one. This sets the maximum theoretical quota overrun
29 * (with an infinite number of nodes and infinite bandwidth) to twice the user's limit. (In
30 * practice, the maximum overrun you see should be much less.) A "quota_scale"
31 * number greater than one makes quota syncs more frequent and reduces the
32 * maximum overrun. Numbers less than one (but greater than zero) make quota
33 * syncs less frequent.
34 *
35 * GFS quotas also use per-ID Lock Value Blocks (LVBs) to cache the contents of
36 * the quota file, so it is not being constantly read.
37 */
38
39#include <linux/sched.h>
40#include <linux/slab.h>
41#include <linux/spinlock.h>
42#include <linux/completion.h>
43#include <linux/buffer_head.h>
44#include <linux/tty.h>
45#include <linux/sort.h>
46#include <linux/fs.h>
47#include <asm/semaphore.h>
48
49#include "gfs2.h"
50#include "bmap.h"
51#include "glock.h"
52#include "glops.h"
53#include "log.h"
54#include "meta_io.h"
55#include "quota.h"
56#include "rgrp.h"
57#include "super.h"
58#include "trans.h"
59#include "inode.h"
60#include "ops_file.h"
61#include "ops_address.h"
62
/* Values for the "user" flag passed to the qd_* helpers below;
   presumably distinguish user-ID from group-ID quotas - see callers. */
#define QUOTA_USER 1
#define QUOTA_GROUP 0
65
66static uint64_t qd2offset(struct gfs2_quota_data *qd)
67{
68 uint64_t offset;
69
70 offset = 2 * (uint64_t)qd->qd_id + !test_bit(QDF_USER, &qd->qd_flags);
71 offset *= sizeof(struct gfs2_quota);
72
73 return offset;
74}
75
/* Allocate and initialize one quota-data struct for (user, id),
   including its glock and LVB.  On success *qdp holds one reference. */
static int qd_alloc(struct gfs2_sbd *sdp, int user, uint32_t id,
		    struct gfs2_quota_data **qdp)
{
	struct gfs2_quota_data *qd;
	int error;

	qd = kzalloc(sizeof(struct gfs2_quota_data), GFP_KERNEL);
	if (!qd)
		return -ENOMEM;

	qd->qd_count = 1;
	qd->qd_id = id;
	if (user)
		set_bit(QDF_USER, &qd->qd_flags);
	qd->qd_slot = -1;

	/* Glock number matches the record slot: 2*id (+1 for group) */
	error = gfs2_glock_get(sdp, 2 * (uint64_t)id + !user,
			       &gfs2_quota_glops, CREATE, &qd->qd_gl);
	if (error)
		goto fail;

	/* Keep the LVB (which caches the quota values) but drop the
	   glock reference taken by gfs2_glock_get() */
	error = gfs2_lvb_hold(qd->qd_gl);
	gfs2_glock_put(qd->qd_gl);
	if (error)
		goto fail;

	*qdp = qd;

	return 0;

 fail:
	kfree(qd);
	return error;
}
110
/* Look up (or, if @create, allocate) the quota-data for (user, id).
   Allocation happens outside the spinlock, so loop: allocate, retake
   the lock, and either insert our new entry or discard it if another
   thread raced us in. */
static int qd_get(struct gfs2_sbd *sdp, int user, uint32_t id, int create,
		  struct gfs2_quota_data **qdp)
{
	struct gfs2_quota_data *qd = NULL, *new_qd = NULL;
	int error, found;

	*qdp = NULL;

	for (;;) {
		found = 0;
		spin_lock(&sdp->sd_quota_spin);
		list_for_each_entry(qd, &sdp->sd_quota_list, qd_list) {
			/* "!a == !b" compares the two as booleans */
			if (qd->qd_id == id &&
			    !test_bit(QDF_USER, &qd->qd_flags) == !user) {
				qd->qd_count++;
				found = 1;
				break;
			}
		}

		if (!found)
			qd = NULL;

		/* Not found but we allocated one on a previous pass:
		   insert it now, while the lock is held */
		if (!qd && new_qd) {
			qd = new_qd;
			list_add(&qd->qd_list, &sdp->sd_quota_list);
			atomic_inc(&sdp->sd_quota_count);
			new_qd = NULL;
		}

		spin_unlock(&sdp->sd_quota_spin);

		if (qd || !create) {
			/* Lost the race: free the unused allocation */
			if (new_qd) {
				gfs2_lvb_unhold(new_qd->qd_gl);
				kfree(new_qd);
			}
			*qdp = qd;
			return 0;
		}

		error = qd_alloc(sdp, user, id, &new_qd);
		if (error)
			return error;
	}
}
157
158static void qd_hold(struct gfs2_quota_data *qd)
159{
160 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
161
162 spin_lock(&sdp->sd_quota_spin);
163 gfs2_assert(sdp, qd->qd_count);
164 qd->qd_count++;
165 spin_unlock(&sdp->sd_quota_spin);
166}
167
168static void qd_put(struct gfs2_quota_data *qd)
169{
170 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
171 spin_lock(&sdp->sd_quota_spin);
172 gfs2_assert(sdp, qd->qd_count);
173 if (!--qd->qd_count)
174 qd->qd_last_touched = jiffies;
175 spin_unlock(&sdp->sd_quota_spin);
176}
177
/* Claim a slot in the per-node quota-change file for @qd.  The bitmap
   is an array of PAGE_SIZE chunks; find the first zero bit and set it.
   Returns -ENOSPC when every slot is in use. */
static int slot_get(struct gfs2_quota_data *qd)
{
	struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
	unsigned int c, o = 0, b;
	unsigned char byte = 0;

	spin_lock(&sdp->sd_quota_spin);

	/* Already have a slot: just bump the count */
	if (qd->qd_slot_count++) {
		spin_unlock(&sdp->sd_quota_spin);
		return 0;
	}

	/* Scan for the first byte with a free bit */
	for (c = 0; c < sdp->sd_quota_chunks; c++)
		for (o = 0; o < PAGE_SIZE; o++) {
			byte = sdp->sd_quota_bitmap[c][o];
			if (byte != 0xFF)
				goto found;
		}

	goto fail;

 found:
	for (b = 0; b < 8; b++)
		if (!(byte & (1 << b)))
			break;
	qd->qd_slot = c * (8 * PAGE_SIZE) + o * 8 + b;

	/* The bitmap may extend past the number of valid slots */
	if (qd->qd_slot >= sdp->sd_quota_slots)
		goto fail;

	sdp->sd_quota_bitmap[c][o] |= 1 << b;

	spin_unlock(&sdp->sd_quota_spin);

	return 0;

 fail:
	qd->qd_slot_count--;
	spin_unlock(&sdp->sd_quota_spin);
	return -ENOSPC;
}
220
221static void slot_hold(struct gfs2_quota_data *qd)
222{
223 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
224
225 spin_lock(&sdp->sd_quota_spin);
226 gfs2_assert(sdp, qd->qd_slot_count);
227 qd->qd_slot_count++;
228 spin_unlock(&sdp->sd_quota_spin);
229}
230
231static void slot_put(struct gfs2_quota_data *qd)
232{
233 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
234
235 spin_lock(&sdp->sd_quota_spin);
236 gfs2_assert(sdp, qd->qd_slot_count);
237 if (!--qd->qd_slot_count) {
238 gfs2_icbit_munge(sdp, sdp->sd_quota_bitmap, qd->qd_slot, 0);
239 qd->qd_slot = -1;
240 }
241 spin_unlock(&sdp->sd_quota_spin);
242}
243
244static int bh_get(struct gfs2_quota_data *qd)
245{
246 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
247 struct gfs2_inode *ip = get_v2ip(sdp->sd_qc_inode);
248 unsigned int block, offset;
249 uint64_t dblock;
250 int new = 0;
251 struct buffer_head *bh;
252 int error;
253
254 mutex_lock(&sdp->sd_quota_mutex);
255
256 if (qd->qd_bh_count++) {
257 mutex_unlock(&sdp->sd_quota_mutex);
258 return 0;
259 }
260
261 block = qd->qd_slot / sdp->sd_qc_per_block;
262 offset = qd->qd_slot % sdp->sd_qc_per_block;;
263
264 error = gfs2_block_map(ip, block, &new, &dblock, NULL);
265 if (error)
266 goto fail;
267 error = gfs2_meta_read(ip->i_gl, dblock, DIO_START | DIO_WAIT, &bh);
268 if (error)
269 goto fail;
270 error = -EIO;
271 if (gfs2_metatype_check(sdp, bh, GFS2_METATYPE_QC))
272 goto fail_brelse;
273
274 qd->qd_bh = bh;
275 qd->qd_bh_qc = (struct gfs2_quota_change *)
276 (bh->b_data + sizeof(struct gfs2_meta_header) +
277 offset * sizeof(struct gfs2_quota_change));
278
279 mutex_lock(&sdp->sd_quota_mutex);
280
281 return 0;
282
283 fail_brelse:
284 brelse(bh);
285
286 fail:
287 qd->qd_bh_count--;
288 mutex_unlock(&sdp->sd_quota_mutex);
289 return error;
290}
291
292static void bh_put(struct gfs2_quota_data *qd)
293{
294 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
295
296 mutex_lock(&sdp->sd_quota_mutex);
297 gfs2_assert(sdp, qd->qd_bh_count);
298 if (!--qd->qd_bh_count) {
299 brelse(qd->qd_bh);
300 qd->qd_bh = NULL;
301 qd->qd_bh_qc = NULL;
302 }
303 mutex_unlock(&sdp->sd_quota_mutex);
304}
305
/**
 * qd_fish - find one dirty quota-data entry that needs syncing
 * @sdp: the filesystem
 * @qdp: filled in with the entry found (or NULL if none)
 *
 * Scans sd_quota_list for an entry with pending changes (QDF_CHANGE set,
 * QDF_LOCKED clear) that has not yet been synced in this generation.
 * On success the entry is marked QDF_LOCKED, extra qd/slot references are
 * taken, and its change-file buffer is attached via bh_get().
 *
 * Returns: 0 (with *qdp set or NULL), or errno from bh_get()
 */
static int qd_fish(struct gfs2_sbd *sdp, struct gfs2_quota_data **qdp)
{
	struct gfs2_quota_data *qd = NULL;
	int error;
	int found = 0;

	*qdp = NULL;

	/* Nothing to sync on a read-only mount. */
	if (sdp->sd_vfs->s_flags & MS_RDONLY)
		return 0;

	spin_lock(&sdp->sd_quota_spin);

	list_for_each_entry(qd, &sdp->sd_quota_list, qd_list) {
		if (test_bit(QDF_LOCKED, &qd->qd_flags) ||
		    !test_bit(QDF_CHANGE, &qd->qd_flags) ||
		    qd->qd_sync_gen >= sdp->sd_quota_sync_gen)
			continue;

		/* Rotate to the tail so repeated calls cycle the list. */
		list_move_tail(&qd->qd_list, &sdp->sd_quota_list);

		/* Claim the entry and snapshot the change to be synced,
		 * all while still under sd_quota_spin. */
		set_bit(QDF_LOCKED, &qd->qd_flags);
		gfs2_assert_warn(sdp, qd->qd_count);
		qd->qd_count++;
		qd->qd_change_sync = qd->qd_change;
		gfs2_assert_warn(sdp, qd->qd_slot_count);
		qd->qd_slot_count++;
		found = 1;

		break;
	}

	/* If the loop completed without a hit, qd points at list head junk. */
	if (!found)
		qd = NULL;

	spin_unlock(&sdp->sd_quota_spin);

	if (qd) {
		gfs2_assert_warn(sdp, qd->qd_change_sync);
		/* bh_get() may sleep, so it happens outside the spinlock. */
		error = bh_get(qd);
		if (error) {
			/* Undo the claim taken above. */
			clear_bit(QDF_LOCKED, &qd->qd_flags);
			slot_put(qd);
			qd_put(qd);
			return error;
		}
	}

	*qdp = qd;

	return 0;
}
358
/**
 * qd_trylock - try to claim a specific quota-data entry for syncing
 * @qd: the quota data entry
 *
 * Like qd_fish(), but for one known entry.  Claims it only if it has
 * pending changes and is not already being synced.
 *
 * Returns: 1 if the entry was claimed (caller must qd_unlock()), 0 if not
 */
static int qd_trylock(struct gfs2_quota_data *qd)
{
	struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;

	/* Nothing to sync on a read-only mount. */
	if (sdp->sd_vfs->s_flags & MS_RDONLY)
		return 0;

	spin_lock(&sdp->sd_quota_spin);

	if (test_bit(QDF_LOCKED, &qd->qd_flags) ||
	    !test_bit(QDF_CHANGE, &qd->qd_flags)) {
		spin_unlock(&sdp->sd_quota_spin);
		return 0;
	}

	/* Rotate to the tail, mirroring qd_fish()'s list behavior. */
	list_move_tail(&qd->qd_list, &sdp->sd_quota_list);

	/* Claim the entry and snapshot the change under sd_quota_spin. */
	set_bit(QDF_LOCKED, &qd->qd_flags);
	gfs2_assert_warn(sdp, qd->qd_count);
	qd->qd_count++;
	qd->qd_change_sync = qd->qd_change;
	gfs2_assert_warn(sdp, qd->qd_slot_count);
	qd->qd_slot_count++;

	spin_unlock(&sdp->sd_quota_spin);

	gfs2_assert_warn(sdp, qd->qd_change_sync);
	/* bh_get() may sleep; on failure, undo the claim and report "not
	 * locked" — the sync is simply skipped this time around. */
	if (bh_get(qd)) {
		clear_bit(QDF_LOCKED, &qd->qd_flags);
		slot_put(qd);
		qd_put(qd);
		return 0;
	}

	return 1;
}
395
396static void qd_unlock(struct gfs2_quota_data *qd)
397{
398 gfs2_assert_warn(qd->qd_gl->gl_sbd, test_bit(QDF_LOCKED, &qd->qd_flags));
399 clear_bit(QDF_LOCKED, &qd->qd_flags);
400 bh_put(qd);
401 slot_put(qd);
402 qd_put(qd);
403}
404
405static int qdsb_get(struct gfs2_sbd *sdp, int user, uint32_t id, int create,
406 struct gfs2_quota_data **qdp)
407{
408 int error;
409
410 error = qd_get(sdp, user, id, create, qdp);
411 if (error)
412 return error;
413
414 error = slot_get(*qdp);
415 if (error)
416 goto fail;
417
418 error = bh_get(*qdp);
419 if (error)
420 goto fail_slot;
421
422 return 0;
423
424 fail_slot:
425 slot_put(*qdp);
426
427 fail:
428 qd_put(*qdp);
429 return error;
430}
431
/* Release everything qdsb_get() acquired, in reverse order:
 * change-file buffer, slot, then the qd reference itself. */
static void qdsb_put(struct gfs2_quota_data *qd)
{
	bh_put(qd);
	slot_put(qd);
	qd_put(qd);
}
438
/**
 * gfs2_quota_hold - acquire quota-data references for an inode operation
 * @ip: the inode
 * @uid: new owner uid, or NO_QUOTA_CHANGE if ownership is unchanged
 * @gid: new owner gid, or NO_QUOTA_CHANGE if ownership is unchanged
 *
 * Fills ip->i_alloc.al_qd[] with up to four entries: the inode's current
 * user and group quotas, plus the target uid/gid quotas when a chown is
 * changing ownership.  al_qd_num tracks how many were acquired.
 *
 * Returns: 0 on success, errno on failure (partially acquired entries are
 * released via gfs2_quota_unhold())
 */
int gfs2_quota_hold(struct gfs2_inode *ip, uint32_t uid, uint32_t gid)
{
	struct gfs2_sbd *sdp = ip->i_sbd;
	struct gfs2_alloc *al = &ip->i_alloc;
	struct gfs2_quota_data **qd = al->al_qd;
	int error;

	/* Calling this twice, or while quotas are locked, is a caller bug. */
	if (gfs2_assert_warn(sdp, !al->al_qd_num) ||
	    gfs2_assert_warn(sdp, !test_bit(GIF_QD_LOCKED, &ip->i_flags)))
		return -EIO;

	if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF)
		return 0;

	/* Current owner's user quota. */
	error = qdsb_get(sdp, QUOTA_USER, ip->i_di.di_uid, CREATE, qd);
	if (error)
		goto out;
	al->al_qd_num++;
	qd++;

	/* Current owner's group quota. */
	error = qdsb_get(sdp, QUOTA_GROUP, ip->i_di.di_gid, CREATE, qd);
	if (error)
		goto out;
	al->al_qd_num++;
	qd++;

	/* Target user quota, if a chown changes the uid. */
	if (uid != NO_QUOTA_CHANGE && uid != ip->i_di.di_uid) {
		error = qdsb_get(sdp, QUOTA_USER, uid, CREATE, qd);
		if (error)
			goto out;
		al->al_qd_num++;
		qd++;
	}

	/* Target group quota, if a chgrp changes the gid. */
	if (gid != NO_QUOTA_CHANGE && gid != ip->i_di.di_gid) {
		error = qdsb_get(sdp, QUOTA_GROUP, gid, CREATE, qd);
		if (error)
			goto out;
		al->al_qd_num++;
		qd++;
	}

 out:
	if (error)
		gfs2_quota_unhold(ip);

	return error;
}
487
488void gfs2_quota_unhold(struct gfs2_inode *ip)
489{
490 struct gfs2_sbd *sdp = ip->i_sbd;
491 struct gfs2_alloc *al = &ip->i_alloc;
492 unsigned int x;
493
494 gfs2_assert_warn(sdp, !test_bit(GIF_QD_LOCKED, &ip->i_flags));
495
496 for (x = 0; x < al->al_qd_num; x++) {
497 qdsb_put(al->al_qd[x]);
498 al->al_qd[x] = NULL;
499 }
500 al->al_qd_num = 0;
501}
502
503static int sort_qd(const void *a, const void *b)
504{
505 struct gfs2_quota_data *qd_a = *(struct gfs2_quota_data **)a;
506 struct gfs2_quota_data *qd_b = *(struct gfs2_quota_data **)b;
507 int ret = 0;
508
509 if (!test_bit(QDF_USER, &qd_a->qd_flags) !=
510 !test_bit(QDF_USER, &qd_b->qd_flags)) {
511 if (test_bit(QDF_USER, &qd_a->qd_flags))
512 ret = -1;
513 else
514 ret = 1;
515 } else {
516 if (qd_a->qd_id < qd_b->qd_id)
517 ret = -1;
518 else if (qd_a->qd_id > qd_b->qd_id)
519 ret = 1;
520 }
521
522 return ret;
523}
524
/**
 * do_qc - apply a delta to a quota-change file entry
 * @qd: the quota data entry (its change-file buffer must be attached)
 * @change: signed delta to add to the on-disk pending change
 *
 * Updates the gfs2_quota_change record in place (within a transaction) and
 * keeps qd->qd_change and the QDF_CHANGE flag / reference counts in sync:
 * an entry going to zero releases its slot and qd references; an entry
 * becoming nonzero takes them.  Serialized by sd_quota_mutex.
 */
static void do_qc(struct gfs2_quota_data *qd, int64_t change)
{
	struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
	struct gfs2_inode *ip = get_v2ip(sdp->sd_qc_inode);
	struct gfs2_quota_change *qc = qd->qd_bh_qc;
	int64_t x;

	mutex_lock(&sdp->sd_quota_mutex);
	gfs2_trans_add_bh(ip->i_gl, qd->qd_bh, 1);

	/* First change for this entry: initialize the on-disk record. */
	if (!test_bit(QDF_CHANGE, &qd->qd_flags)) {
		qc->qc_change = 0;
		qc->qc_flags = 0;
		if (test_bit(QDF_USER, &qd->qd_flags))
			qc->qc_flags = cpu_to_be32(GFS2_QCF_USER);
		qc->qc_id = cpu_to_be32(qd->qd_id);
	}

	/* Apply the delta in host byte order, then write it back. */
	x = qc->qc_change;
	x = be64_to_cpu(x) + change;
	qc->qc_change = cpu_to_be64(x);

	spin_lock(&sdp->sd_quota_spin);
	qd->qd_change = x;
	spin_unlock(&sdp->sd_quota_spin);

	if (!x) {
		/* Pending change cancelled out: clear the record and drop
		 * the references QDF_CHANGE was holding. */
		gfs2_assert_warn(sdp, test_bit(QDF_CHANGE, &qd->qd_flags));
		clear_bit(QDF_CHANGE, &qd->qd_flags);
		qc->qc_flags = 0;
		qc->qc_id = 0;
		slot_put(qd);
		qd_put(qd);
	} else if (!test_and_set_bit(QDF_CHANGE, &qd->qd_flags)) {
		/* Entry just became dirty: pin it until it is synced. */
		qd_hold(qd);
		slot_hold(qd);
	}

	mutex_unlock(&sdp->sd_quota_mutex);
}
565
566/**
567 * gfs2_adjust_quota
568 *
569 * This function was mostly borrowed from gfs2_block_truncate_page which was
570 * in turn mostly borrowed from ext3
571 */
572static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
573 int64_t change, struct gfs2_quota_data *qd)
574{
575 struct inode *inode = ip->i_vnode;
576 struct address_space *mapping = inode->i_mapping;
577 unsigned long index = loc >> PAGE_CACHE_SHIFT;
578 unsigned offset = loc & (PAGE_CACHE_SHIFT - 1);
579 unsigned blocksize, iblock, pos;
580 struct buffer_head *bh;
581 struct page *page;
582 void *kaddr;
583 __be64 *ptr;
584 u64 value;
585 int err = -EIO;
586
587 page = grab_cache_page(mapping, index);
588 if (!page)
589 return -ENOMEM;
590
591 blocksize = inode->i_sb->s_blocksize;
592 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
593
594 if (!page_has_buffers(page))
595 create_empty_buffers(page, blocksize, 0);
596
597 bh = page_buffers(page);
598 pos = blocksize;
599 while (offset >= pos) {
600 bh = bh->b_this_page;
601 iblock++;
602 pos += blocksize;
603 }
604
605 if (!buffer_mapped(bh)) {
606 gfs2_get_block(inode, iblock, bh, 1);
607 if (!buffer_mapped(bh))
608 goto unlock;
609 }
610
611 if (PageUptodate(page))
612 set_buffer_uptodate(bh);
613
614 if (!buffer_uptodate(bh)) {
615 ll_rw_block(READ, 1, &bh);
616 wait_on_buffer(bh);
617 if (!buffer_uptodate(bh))
618 goto unlock;
619 }
620
621 gfs2_trans_add_bh(ip->i_gl, bh, 0);
622
623 kaddr = kmap_atomic(page, KM_USER0);
624 ptr = (__be64 *)(kaddr + offset);
625 value = *ptr = cpu_to_be64(be64_to_cpu(*ptr) + change);
626 flush_dcache_page(page);
627 kunmap_atomic(kaddr, KM_USER0);
628 err = 0;
629 qd->qd_qb.qb_magic = cpu_to_be32(GFS2_MAGIC);
630#if 0
631 qd->qd_qb.qb_limit = cpu_to_be64(q.qu_limit);
632 qd->qd_qb.qb_warn = cpu_to_be64(q.qu_warn);
633#endif
634 qd->qd_qb.qb_value = cpu_to_be64(value);
635unlock:
636 unlock_page(page);
637 page_cache_release(page);
638 return err;
639}
640
/**
 * do_sync - write a batch of pending quota changes into the quota file
 * @num_qd: number of claimed entries in @qda
 * @qda: array of quota-data entries (each claimed via qd_fish/qd_trylock)
 *
 * Locks all the quota glocks plus the quota-file inode, reserves blocks if
 * any entry needs allocation, then inside one transaction applies each
 * entry's snapshotted change to the quota file and subtracts it from the
 * quota-change file via do_qc().  Resources are unwound in reverse order
 * through the labelled cleanup chain.
 *
 * Returns: 0 on success, errno on failure
 */
static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
{
	struct gfs2_sbd *sdp = (*qda)->qd_gl->gl_sbd;
	struct gfs2_inode *ip = get_v2ip(sdp->sd_quota_inode);
	unsigned int data_blocks, ind_blocks;
	struct file_ra_state ra_state;
	struct gfs2_holder *ghs, i_gh;
	unsigned int qx, x;
	struct gfs2_quota_data *qd;
	loff_t offset;
	unsigned int nalloc = 0;
	struct gfs2_alloc *al = NULL;
	int error;

	gfs2_write_calc_reserv(ip, sizeof(struct gfs2_quota),
			       &data_blocks, &ind_blocks);

	ghs = kcalloc(num_qd, sizeof(struct gfs2_holder), GFP_KERNEL);
	if (!ghs)
		return -ENOMEM;

	/* Sort first so glocks are always taken in a consistent order. */
	sort(qda, num_qd, sizeof(struct gfs2_quota_data *), sort_qd, NULL);
	for (qx = 0; qx < num_qd; qx++) {
		error = gfs2_glock_nq_init(qda[qx]->qd_gl,
					   LM_ST_EXCLUSIVE,
					   GL_NOCACHE, &ghs[qx]);
		if (error)
			goto out;
	}

	error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
	if (error)
		goto out;

	/* Count how many entries will need new blocks in the quota file. */
	for (x = 0; x < num_qd; x++) {
		int alloc_required;

		offset = qd2offset(qda[x]);
		error = gfs2_write_alloc_required(ip, offset,
						  sizeof(struct gfs2_quota),
						  &alloc_required);
		if (error)
			goto out_gunlock;
		if (alloc_required)
			nalloc++;
	}

	if (nalloc) {
		/* Reserve blocks and start a transaction sized for both the
		 * data writes and the allocations. */
		al = gfs2_alloc_get(ip);

		al->al_requested = nalloc * (data_blocks + ind_blocks);

		error = gfs2_inplace_reserve(ip);
		if (error)
			goto out_alloc;

		error = gfs2_trans_begin(sdp,
					 al->al_rgd->rd_ri.ri_length +
					 num_qd * data_blocks +
					 nalloc * ind_blocks +
					 RES_DINODE + num_qd +
					 RES_STATFS, 0);
		if (error)
			goto out_ipres;
	} else {
		error = gfs2_trans_begin(sdp,
					 num_qd * data_blocks +
					 RES_DINODE + num_qd, 0);
		if (error)
			goto out_gunlock;
	}

	file_ra_state_init(&ra_state, ip->i_vnode->i_mapping);
	for (x = 0; x < num_qd; x++) {
		qd = qda[x];
		offset = qd2offset(qd);
		/* NOTE(review): the last argument casts the glock LVB to a
		 * struct gfs2_quota_data *, yet gfs2_adjust_quota() writes
		 * the result into qd->qd_qb — looks like it should simply be
		 * `qd`; confirm against the LVB layout before changing. */
		error = gfs2_adjust_quota(ip, offset, qd->qd_change_sync,
					  (struct gfs2_quota_data *)qd->qd_gl->gl_lvb);
		if (error)
			goto out_end_trans;

		/* The pending change is now on disk; cancel it in the
		 * quota-change file. */
		do_qc(qd, -qd->qd_change_sync);
	}

	error = 0;

 out_end_trans:
	gfs2_trans_end(sdp);

 out_ipres:
	if (nalloc)
		gfs2_inplace_release(ip);

 out_alloc:
	if (nalloc)
		gfs2_alloc_put(ip);

 out_gunlock:
	gfs2_glock_dq_uninit(&i_gh);

 out:
	/* qx is the count of successfully acquired holders. */
	while (qx--)
		gfs2_glock_dq_uninit(&ghs[qx]);
	kfree(ghs);
	gfs2_log_flush_glock(ip->i_gl);

	return error;
}
749
/**
 * do_glock - acquire a quota glock and ensure its LVB data is current
 * @qd: the quota data entry
 * @force_refresh: if set, reread the quota from disk even if the LVB is valid
 * @q_gh: holder filled in for the acquired glock
 *
 * Normally takes the glock shared and trusts the lock value block.  If the
 * LVB is uninitialized (bad magic) or a refresh is forced, upgrades to
 * exclusive, reads the quota record from the quota file, rewrites the LVB,
 * and — if another node is contending — retries from the top in shared mode.
 *
 * Returns: 0 on success (holder acquired), errno on failure
 */
static int do_glock(struct gfs2_quota_data *qd, int force_refresh,
		    struct gfs2_holder *q_gh)
{
	struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
	struct gfs2_holder i_gh;
	struct gfs2_quota q;
	char buf[sizeof(struct gfs2_quota)];
	struct file_ra_state ra_state;
	int error;

	file_ra_state_init(&ra_state, sdp->sd_quota_inode->i_mapping);
 restart:
	error = gfs2_glock_nq_init(qd->qd_gl, LM_ST_SHARED, 0, q_gh);
	if (error)
		return error;

	/* Decode the cross-node lock value block into qd->qd_qb. */
	gfs2_quota_lvb_in(&qd->qd_qb, qd->qd_gl->gl_lvb);

	if (force_refresh || qd->qd_qb.qb_magic != GFS2_MAGIC) {
		loff_t pos;
		/* Re-acquire exclusively so we may rewrite the LVB. */
		gfs2_glock_dq_uninit(q_gh);
		error = gfs2_glock_nq_init(qd->qd_gl,
					   LM_ST_EXCLUSIVE, GL_NOCACHE,
					   q_gh);
		if (error)
			return error;

		error = gfs2_glock_nq_init(get_v2ip(sdp->sd_quota_inode)->i_gl,
					   LM_ST_SHARED, 0,
					   &i_gh);
		if (error)
			goto fail;

		/* Read this id's quota record from the quota file. */
		memset(buf, 0, sizeof(struct gfs2_quota));
		pos = qd2offset(qd);
		error = gfs2_internal_read(get_v2ip(sdp->sd_quota_inode),
					   &ra_state, buf,
					   &pos,
					   sizeof(struct gfs2_quota));
		if (error < 0)
			goto fail_gunlock;

		gfs2_glock_dq_uninit(&i_gh);

		gfs2_quota_in(&q, buf);

		/* Publish the fresh values through the LVB. */
		memset(&qd->qd_qb, 0, sizeof(struct gfs2_quota_lvb));
		qd->qd_qb.qb_magic = GFS2_MAGIC;
		qd->qd_qb.qb_limit = q.qu_limit;
		qd->qd_qb.qb_warn = q.qu_warn;
		qd->qd_qb.qb_value = q.qu_value;

		gfs2_quota_lvb_out(&qd->qd_qb, qd->qd_gl->gl_lvb);

		/* Another node wants this lock: drop to shared by retrying
		 * (the LVB is now valid, so the refresh branch is skipped). */
		if (gfs2_glock_is_blocking(qd->qd_gl)) {
			gfs2_glock_dq_uninit(q_gh);
			force_refresh = 0;
			goto restart;
		}
	}

	return 0;

 fail_gunlock:
	gfs2_glock_dq_uninit(&i_gh);

 fail:
	gfs2_glock_dq_uninit(q_gh);

	return error;
}
821
822int gfs2_quota_lock(struct gfs2_inode *ip, uint32_t uid, uint32_t gid)
823{
824 struct gfs2_sbd *sdp = ip->i_sbd;
825 struct gfs2_alloc *al = &ip->i_alloc;
826 unsigned int x;
827 int error = 0;
828
829 gfs2_quota_hold(ip, uid, gid);
830
831 if (capable(CAP_SYS_RESOURCE) ||
832 sdp->sd_args.ar_quota != GFS2_QUOTA_ON)
833 return 0;
834
835 sort(al->al_qd, al->al_qd_num, sizeof(struct gfs2_quota_data *),
836 sort_qd, NULL);
837
838 for (x = 0; x < al->al_qd_num; x++) {
839 error = do_glock(al->al_qd[x], NO_FORCE, &al->al_qd_ghs[x]);
840 if (error)
841 break;
842 }
843
844 if (!error)
845 set_bit(GIF_QD_LOCKED, &ip->i_flags);
846 else {
847 while (x--)
848 gfs2_glock_dq_uninit(&al->al_qd_ghs[x]);
849 gfs2_quota_unhold(ip);
850 }
851
852 return error;
853}
854
855static int need_sync(struct gfs2_quota_data *qd)
856{
857 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
858 struct gfs2_tune *gt = &sdp->sd_tune;
859 int64_t value;
860 unsigned int num, den;
861 int do_sync = 1;
862
863 if (!qd->qd_qb.qb_limit)
864 return 0;
865
866 spin_lock(&sdp->sd_quota_spin);
867 value = qd->qd_change;
868 spin_unlock(&sdp->sd_quota_spin);
869
870 spin_lock(&gt->gt_spin);
871 num = gt->gt_quota_scale_num;
872 den = gt->gt_quota_scale_den;
873 spin_unlock(&gt->gt_spin);
874
875 if (value < 0)
876 do_sync = 0;
877 else if (qd->qd_qb.qb_value >= (int64_t)qd->qd_qb.qb_limit)
878 do_sync = 0;
879 else {
880 value *= gfs2_jindex_size(sdp) * num;
881 do_div(value, den);
882 value += qd->qd_qb.qb_value;
883 if (value < (int64_t)qd->qd_qb.qb_limit)
884 do_sync = 0;
885 }
886
887 return do_sync;
888}
889
/**
 * gfs2_quota_unlock - release the quota glocks held by gfs2_quota_lock
 * @ip: the inode
 *
 * Drops each entry's glock holder; entries whose pending change warrants a
 * sync (per need_sync()) and that can be claimed are synced to the quota
 * file before being released.  Finally drops the hold references.
 */
void gfs2_quota_unlock(struct gfs2_inode *ip)
{
	struct gfs2_alloc *al = &ip->i_alloc;
	/* At most 4 entries can be held: user, group, chown uid, chown gid. */
	struct gfs2_quota_data *qda[4];
	unsigned int count = 0;
	unsigned int x;

	/* If quotas were never glocked (enforcement off, CAP_SYS_RESOURCE),
	 * there are no holders to drop — just unhold. */
	if (!test_and_clear_bit(GIF_QD_LOCKED, &ip->i_flags))
		goto out;

	for (x = 0; x < al->al_qd_num; x++) {
		struct gfs2_quota_data *qd;
		int sync;

		qd = al->al_qd[x];
		sync = need_sync(qd);

		gfs2_glock_dq_uninit(&al->al_qd_ghs[x]);

		/* Claim the entry for syncing; if another syncer beat us to
		 * it, qd_trylock() fails and we simply skip it. */
		if (sync && qd_trylock(qd))
			qda[count++] = qd;
	}

	if (count) {
		do_sync(count, qda);
		for (x = 0; x < count; x++)
			qd_unlock(qda[x]);
	}

 out:
	gfs2_quota_unhold(ip);
}
922
923#define MAX_LINE 256
924
925static int print_message(struct gfs2_quota_data *qd, char *type)
926{
927 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
928 char *line;
929 int len;
930
931 line = kmalloc(MAX_LINE, GFP_KERNEL);
932 if (!line)
933 return -ENOMEM;
934
935 len = snprintf(line, MAX_LINE-1, "GFS2: fsid=%s: quota %s for %s %u\r\n",
936 sdp->sd_fsname, type,
937 (test_bit(QDF_USER, &qd->qd_flags)) ? "user" : "group",
938 qd->qd_id);
939 line[MAX_LINE-1] = 0;
940
941 if (current->signal) { /* Is this test still required? */
942 tty_write_message(current->signal->tty, line);
943 }
944
945 kfree(line);
946
947 return 0;
948}
949
/**
 * gfs2_quota_check - test whether an allocation would violate quota
 * @ip: the inode (quotas must be locked via gfs2_quota_lock)
 * @uid: the uid being charged
 * @gid: the gid being charged
 *
 * For each held quota entry matching @uid/@gid, compares the synced value
 * plus the local pending change against the hard limit, and emits a rate-
 * limited warning when past the soft (warn) threshold.
 *
 * Returns: 0 if within quota, -EDQUOT if the hard limit is exceeded,
 *          or an error from print_message()
 */
int gfs2_quota_check(struct gfs2_inode *ip, uint32_t uid, uint32_t gid)
{
	struct gfs2_sbd *sdp = ip->i_sbd;
	struct gfs2_alloc *al = &ip->i_alloc;
	struct gfs2_quota_data *qd;
	int64_t value;
	unsigned int x;
	int error = 0;

	/* Nothing held — enforcement was skipped at lock time. */
	if (!test_bit(GIF_QD_LOCKED, &ip->i_flags))
		return 0;

	if (sdp->sd_args.ar_quota != GFS2_QUOTA_ON)
		return 0;

	for (x = 0; x < al->al_qd_num; x++) {
		qd = al->al_qd[x];

		/* Only check the entries for the ids actually being charged
		 * (a chown may also hold the old owner's entries). */
		if (!((qd->qd_id == uid && test_bit(QDF_USER, &qd->qd_flags)) ||
		      (qd->qd_id == gid && !test_bit(QDF_USER, &qd->qd_flags))))
			continue;

		/* Cluster-wide synced value plus this node's pending delta. */
		value = qd->qd_qb.qb_value;
		spin_lock(&sdp->sd_quota_spin);
		value += qd->qd_change;
		spin_unlock(&sdp->sd_quota_spin);

		if (qd->qd_qb.qb_limit && (int64_t)qd->qd_qb.qb_limit < value) {
			print_message(qd, "exceeded");
			error = -EDQUOT;
			break;
		} else if (qd->qd_qb.qb_warn &&
			   (int64_t)qd->qd_qb.qb_warn < value &&
			   time_after_eq(jiffies, qd->qd_last_warn +
					 gfs2_tune_get(sdp, gt_quota_warn_period) * HZ)) {
			/* Soft limit: warn at most once per warn period. */
			error = print_message(qd, "warning");
			qd->qd_last_warn = jiffies;
		}
	}

	return error;
}
992
993void gfs2_quota_change(struct gfs2_inode *ip, int64_t change,
994 uint32_t uid, uint32_t gid)
995{
996 struct gfs2_alloc *al = &ip->i_alloc;
997 struct gfs2_quota_data *qd;
998 unsigned int x;
999 unsigned int found = 0;
1000
1001 if (gfs2_assert_warn(ip->i_sbd, change))
1002 return;
1003 if (ip->i_di.di_flags & GFS2_DIF_SYSTEM)
1004 return;
1005
1006 for (x = 0; x < al->al_qd_num; x++) {
1007 qd = al->al_qd[x];
1008
1009 if ((qd->qd_id == uid && test_bit(QDF_USER, &qd->qd_flags)) ||
1010 (qd->qd_id == gid && !test_bit(QDF_USER, &qd->qd_flags))) {
1011 do_qc(qd, change);
1012 found++;
1013 }
1014 }
1015}
1016
/**
 * gfs2_quota_sync - sync all dirty quota-data entries to the quota file
 * @sdp: the filesystem
 *
 * Bumps the sync generation, then repeatedly fishes batches of up to
 * gt_quota_simul_sync dirty entries and writes them out via do_sync(),
 * stamping each synced entry with the new generation so it is not picked
 * up again this pass.
 *
 * Returns: 0 on success, errno on failure
 */
int gfs2_quota_sync(struct gfs2_sbd *sdp)
{
	struct gfs2_quota_data **qda;
	unsigned int max_qd = gfs2_tune_get(sdp, gt_quota_simul_sync);
	unsigned int num_qd;
	unsigned int x;
	int error = 0;

	sdp->sd_quota_sync_gen++;

	qda = kcalloc(max_qd, sizeof(struct gfs2_quota_data *), GFP_KERNEL);
	if (!qda)
		return -ENOMEM;

	do {
		num_qd = 0;

		/* Collect a batch; qda[num_qd] == NULL means no more work. */
		for (;;) {
			error = qd_fish(sdp, qda + num_qd);
			if (error || !qda[num_qd])
				break;
			if (++num_qd == max_qd)
				break;
		}

		if (num_qd) {
			if (!error)
				error = do_sync(num_qd, qda);
			if (!error)
				for (x = 0; x < num_qd; x++)
					qda[x]->qd_sync_gen =
						sdp->sd_quota_sync_gen;

			/* Release claims even if the sync failed. */
			for (x = 0; x < num_qd; x++)
				qd_unlock(qda[x]);
		}
	} while (!error && num_qd == max_qd);

	kfree(qda);

	return error;
}
1059
1060int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, uint32_t id)
1061{
1062 struct gfs2_quota_data *qd;
1063 struct gfs2_holder q_gh;
1064 int error;
1065
1066 error = qd_get(sdp, user, id, CREATE, &qd);
1067 if (error)
1068 return error;
1069
1070 error = do_glock(qd, FORCE, &q_gh);
1071 if (!error)
1072 gfs2_glock_dq_uninit(&q_gh);
1073
1074 qd_put(qd);
1075
1076 return error;
1077}
1078
/**
 * gfs2_quota_read - report the limits and current usage for one quota id
 * @sdp: the filesystem
 * @user: nonzero for a user quota, zero for a group quota
 * @id: the uid or gid to query
 * @q: filled in with limit, warn threshold, and value (including this
 *     node's pending local change)
 *
 * A process may query its own uid or a group it belongs to; anything else
 * requires CAP_SYS_ADMIN.
 *
 * Returns: 0 on success, -EACCES if not permitted, errno on failure
 */
int gfs2_quota_read(struct gfs2_sbd *sdp, int user, uint32_t id,
		    struct gfs2_quota *q)
{
	struct gfs2_quota_data *qd;
	struct gfs2_holder q_gh;
	int error;

	if (((user) ? (id != current->fsuid) : (!in_group_p(id))) &&
	    !capable(CAP_SYS_ADMIN))
		return -EACCES;

	error = qd_get(sdp, user, id, CREATE, &qd);
	if (error)
		return error;

	/* Shared glock guarantees the LVB copy (qd_qb) is current. */
	error = do_glock(qd, NO_FORCE, &q_gh);
	if (error)
		goto out;

	memset(q, 0, sizeof(struct gfs2_quota));
	q->qu_limit = qd->qd_qb.qb_limit;
	q->qu_warn = qd->qd_qb.qb_warn;
	q->qu_value = qd->qd_qb.qb_value;

	/* Fold in the change pending on this node but not yet synced. */
	spin_lock(&sdp->sd_quota_spin);
	q->qu_value += qd->qd_change;
	spin_unlock(&sdp->sd_quota_spin);

	gfs2_glock_dq_uninit(&q_gh);

 out:
	qd_put(qd);

	return error;
}
1114
/**
 * gfs2_quota_init - rebuild in-memory quota state from the quota-change file
 * @sdp: the filesystem
 *
 * Called at mount.  Sizes and allocates the slot bitmap, then walks every
 * gfs2_quota_change record in the quota-change file; each record with a
 * nonzero pending change gets an in-memory quota-data entry marked
 * QDF_CHANGE so it will be synced later.
 *
 * Returns: 0 on success, errno on failure (partial state is torn down via
 * gfs2_quota_cleanup())
 */
int gfs2_quota_init(struct gfs2_sbd *sdp)
{
	struct gfs2_inode *ip = get_v2ip(sdp->sd_qc_inode);
	unsigned int blocks = ip->i_di.di_size >> sdp->sd_sb.sb_bsize_shift;
	unsigned int x, slot = 0;
	unsigned int found = 0;
	uint64_t dblock;
	uint32_t extlen = 0;
	int error;

	/* Sanity-check the quota-change file: nonzero, <= 64MB, and an
	 * exact multiple of the block size. */
	if (!ip->i_di.di_size ||
	    ip->i_di.di_size > (64 << 20) ||
	    ip->i_di.di_size & (sdp->sd_sb.sb_bsize - 1)) {
		gfs2_consist_inode(ip);
		return -EIO;
	}
	sdp->sd_quota_slots = blocks * sdp->sd_qc_per_block;
	sdp->sd_quota_chunks = DIV_RU(sdp->sd_quota_slots, 8 * PAGE_SIZE);

	error = -ENOMEM;

	/* The slot bitmap is an array of page-sized chunks. */
	sdp->sd_quota_bitmap = kcalloc(sdp->sd_quota_chunks,
				       sizeof(unsigned char *), GFP_KERNEL);
	if (!sdp->sd_quota_bitmap)
		return error;

	for (x = 0; x < sdp->sd_quota_chunks; x++) {
		sdp->sd_quota_bitmap[x] = kzalloc(PAGE_SIZE, GFP_KERNEL);
		if (!sdp->sd_quota_bitmap[x])
			goto fail;
	}

	for (x = 0; x < blocks; x++) {
		struct buffer_head *bh;
		unsigned int y;

		/* Map the next extent of the file when the last runs out. */
		if (!extlen) {
			int new = 0;
			error = gfs2_block_map(ip, x, &new, &dblock, &extlen);
			if (error)
				goto fail;
		}
		gfs2_meta_ra(ip->i_gl, dblock, extlen);
		error = gfs2_meta_read(ip->i_gl, dblock, DIO_START | DIO_WAIT,
				       &bh);
		if (error)
			goto fail;
		error = -EIO;
		if (gfs2_metatype_check(sdp, bh, GFS2_METATYPE_QC)) {
			brelse(bh);
			goto fail;
		}

		/* Scan each quota-change record in this block. */
		for (y = 0;
		     y < sdp->sd_qc_per_block && slot < sdp->sd_quota_slots;
		     y++, slot++) {
			struct gfs2_quota_change qc;
			struct gfs2_quota_data *qd;

			gfs2_quota_change_in(&qc, bh->b_data +
					  sizeof(struct gfs2_meta_header) +
					  y * sizeof(struct gfs2_quota_change));
			/* Zero change means the slot is unused. */
			if (!qc.qc_change)
				continue;

			error = qd_alloc(sdp, (qc.qc_flags & GFS2_QCF_USER),
					 qc.qc_id, &qd);
			if (error) {
				brelse(bh);
				goto fail;
			}

			/* QDF_CHANGE owns one slot reference (qd_slot_count
			 * starts at 1); mark the slot busy in the bitmap. */
			set_bit(QDF_CHANGE, &qd->qd_flags);
			qd->qd_change = qc.qc_change;
			qd->qd_slot = slot;
			qd->qd_slot_count = 1;
			qd->qd_last_touched = jiffies;

			spin_lock(&sdp->sd_quota_spin);
			gfs2_icbit_munge(sdp, sdp->sd_quota_bitmap, slot, 1);
			list_add(&qd->qd_list, &sdp->sd_quota_list);
			atomic_inc(&sdp->sd_quota_count);
			spin_unlock(&sdp->sd_quota_spin);

			found++;
		}

		brelse(bh);
		dblock++;
		extlen--;
	}

	if (found)
		fs_info(sdp, "found %u quota changes\n", found);

	return 0;

 fail:
	gfs2_quota_cleanup(sdp);
	return error;
}
1216
/**
 * gfs2_quota_scan - reap quota-data entries that have been idle too long
 * @sdp: the filesystem
 *
 * Moves unreferenced entries older than gt_quota_cache_secs onto a private
 * list under the spinlock, then frees them outside it (gfs2_lvb_unhold may
 * sleep).
 */
void gfs2_quota_scan(struct gfs2_sbd *sdp)
{
	struct gfs2_quota_data *qd, *safe;
	LIST_HEAD(dead);

	spin_lock(&sdp->sd_quota_spin);
	list_for_each_entry_safe(qd, safe, &sdp->sd_quota_list, qd_list) {
		/* Only reap entries with no holders that have aged out. */
		if (!qd->qd_count &&
		    time_after_eq(jiffies, qd->qd_last_touched +
			        gfs2_tune_get(sdp, gt_quota_cache_secs) * HZ)) {
			list_move(&qd->qd_list, &dead);
			gfs2_assert_warn(sdp,
					 atomic_read(&sdp->sd_quota_count) > 0);
			atomic_dec(&sdp->sd_quota_count);
		}
	}
	spin_unlock(&sdp->sd_quota_spin);

	/* Free the reaped entries without holding the spinlock. */
	while (!list_empty(&dead)) {
		qd = list_entry(dead.next, struct gfs2_quota_data, qd_list);
		list_del(&qd->qd_list);

		/* A dead entry must carry no pending change or references. */
		gfs2_assert_warn(sdp, !qd->qd_change);
		gfs2_assert_warn(sdp, !qd->qd_slot_count);
		gfs2_assert_warn(sdp, !qd->qd_bh_count);

		gfs2_lvb_unhold(qd->qd_gl);
		kfree(qd);
	}
}
1247
/**
 * gfs2_quota_cleanup - tear down all in-memory quota state
 * @sdp: the filesystem
 *
 * Called at unmount (and on gfs2_quota_init() failure).  Waits for busy
 * entries to be released, frees every quota-data entry, and releases the
 * slot bitmap.
 */
void gfs2_quota_cleanup(struct gfs2_sbd *sdp)
{
	struct list_head *head = &sdp->sd_quota_list;
	struct gfs2_quota_data *qd;
	unsigned int x;

	spin_lock(&sdp->sd_quota_spin);
	while (!list_empty(head)) {
		qd = list_entry(head->prev, struct gfs2_quota_data, qd_list);

		/* Entry still in use (beyond the single reference QDF_CHANGE
		 * may hold): rotate it and yield until it is released. */
		if (qd->qd_count > 1 ||
		    (qd->qd_count && !test_bit(QDF_CHANGE, &qd->qd_flags))) {
			list_move(&qd->qd_list, head);
			spin_unlock(&sdp->sd_quota_spin);
			schedule();
			spin_lock(&sdp->sd_quota_spin);
			continue;
		}

		list_del(&qd->qd_list);
		atomic_dec(&sdp->sd_quota_count);
		spin_unlock(&sdp->sd_quota_spin);

		/* An unreferenced entry must be fully clean; a QDF_CHANGE
		 * entry still owns exactly its one slot reference. */
		if (!qd->qd_count) {
			gfs2_assert_warn(sdp, !qd->qd_change);
			gfs2_assert_warn(sdp, !qd->qd_slot_count);
		} else
			gfs2_assert_warn(sdp, qd->qd_slot_count == 1);
		gfs2_assert_warn(sdp, !qd->qd_bh_count);

		/* Free outside the spinlock; gfs2_lvb_unhold may sleep. */
		gfs2_lvb_unhold(qd->qd_gl);
		kfree(qd);

		spin_lock(&sdp->sd_quota_spin);
	}
	spin_unlock(&sdp->sd_quota_spin);

	gfs2_assert_warn(sdp, !atomic_read(&sdp->sd_quota_count));

	if (sdp->sd_quota_bitmap) {
		for (x = 0; x < sdp->sd_quota_chunks; x++)
			kfree(sdp->sd_quota_bitmap[x]);
		kfree(sdp->sd_quota_bitmap);
	}
}
1293
diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h
new file mode 100644
index 000000000000..005529f6895d
--- /dev/null
+++ b/fs/gfs2/quota.h
@@ -0,0 +1,34 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
#ifndef __QUOTA_DOT_H__
#define __QUOTA_DOT_H__

/* Passed as uid/gid when a chown leaves that id unchanged. */
#define NO_QUOTA_CHANGE ((uint32_t)-1)

/* Acquire/release quota-data references for an inode operation. */
int gfs2_quota_hold(struct gfs2_inode *ip, uint32_t uid, uint32_t gid);
void gfs2_quota_unhold(struct gfs2_inode *ip);

/* Hold plus glock the quota data (enforcement path). */
int gfs2_quota_lock(struct gfs2_inode *ip, uint32_t uid, uint32_t gid);
void gfs2_quota_unlock(struct gfs2_inode *ip);

/* Check limits and record block-count changes against held quotas. */
int gfs2_quota_check(struct gfs2_inode *ip, uint32_t uid, uint32_t gid);
void gfs2_quota_change(struct gfs2_inode *ip, int64_t change,
		       uint32_t uid, uint32_t gid);

/* Whole-filesystem sync, forced refresh, and userspace query. */
int gfs2_quota_sync(struct gfs2_sbd *sdp);
int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, uint32_t id);
int gfs2_quota_read(struct gfs2_sbd *sdp, int user, uint32_t id,
		    struct gfs2_quota *q);

/* Mount-time init, periodic idle-entry reaping, unmount teardown. */
int gfs2_quota_init(struct gfs2_sbd *sdp);
void gfs2_quota_scan(struct gfs2_sbd *sdp);
void gfs2_quota_cleanup(struct gfs2_sbd *sdp);

#endif /* __QUOTA_DOT_H__ */
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
new file mode 100644
index 000000000000..bcb81c768c8b
--- /dev/null
+++ b/fs/gfs2/recovery.c
@@ -0,0 +1,570 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <asm/semaphore.h>
16
17#include "gfs2.h"
18#include "bmap.h"
19#include "glock.h"
20#include "glops.h"
21#include "lm.h"
22#include "lops.h"
23#include "meta_io.h"
24#include "recovery.h"
25#include "super.h"
26
27int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk,
28 struct buffer_head **bh)
29{
30 struct gfs2_glock *gl = get_v2ip(jd->jd_inode)->i_gl;
31 int new = 0;
32 uint64_t dblock;
33 uint32_t extlen;
34 int error;
35
36 error = gfs2_block_map(get_v2ip(jd->jd_inode), blk, &new, &dblock, &extlen);
37 if (error)
38 return error;
39 if (!dblock) {
40 gfs2_consist_inode(get_v2ip(jd->jd_inode));
41 return -EIO;
42 }
43
44 gfs2_meta_ra(gl, dblock, extlen);
45 error = gfs2_meta_read(gl, dblock, DIO_START | DIO_WAIT, bh);
46
47 return error;
48}
49
50int gfs2_revoke_add(struct gfs2_sbd *sdp, uint64_t blkno, unsigned int where)
51{
52 struct list_head *head = &sdp->sd_revoke_list;
53 struct gfs2_revoke_replay *rr;
54 int found = 0;
55
56 list_for_each_entry(rr, head, rr_list) {
57 if (rr->rr_blkno == blkno) {
58 found = 1;
59 break;
60 }
61 }
62
63 if (found) {
64 rr->rr_where = where;
65 return 0;
66 }
67
68 rr = kmalloc(sizeof(struct gfs2_revoke_replay), GFP_KERNEL);
69 if (!rr)
70 return -ENOMEM;
71
72 rr->rr_blkno = blkno;
73 rr->rr_where = where;
74 list_add(&rr->rr_list, head);
75
76 return 1;
77}
78
/**
 * gfs2_revoke_check - decide whether a replayed block is revoked
 * @sdp: the filesystem
 * @blkno: the disk block being replayed
 * @where: the journal position of the block being replayed
 *
 * A block is revoked if its revoke entry lies between the replay position
 * and the journal tail, accounting for the journal wrapping around.
 *
 * Returns: 1 if the block should be skipped (revoked), 0 otherwise
 */
int gfs2_revoke_check(struct gfs2_sbd *sdp, uint64_t blkno, unsigned int where)
{
	struct gfs2_revoke_replay *rr;
	int wrap, a, b, revoke;
	int found = 0;

	list_for_each_entry(rr, &sdp->sd_revoke_list, rr_list) {
		if (rr->rr_blkno == blkno) {
			found = 1;
			break;
		}
	}

	/* No revoke recorded for this block at all. */
	if (!found)
		return 0;

	/* The revoke applies if `where` falls in the interval from the
	 * replay tail up to the revoke's position.  If that interval wraps
	 * past the journal end, the two comparisons combine with OR instead
	 * of AND. */
	wrap = (rr->rr_where < sdp->sd_replay_tail);
	a = (sdp->sd_replay_tail < where);
	b = (where < rr->rr_where);
	revoke = (wrap) ? (a || b) : (a && b);

	return revoke;
}
102
/* Free every revoke-replay entry accumulated during journal recovery. */
void gfs2_revoke_clean(struct gfs2_sbd *sdp)
{
	struct list_head *head = &sdp->sd_revoke_list;
	struct gfs2_revoke_replay *rr;

	while (!list_empty(head)) {
		rr = list_entry(head->next, struct gfs2_revoke_replay, rr_list);
		list_del(&rr->rr_list);
		kfree(rr);
	}
}
114
115/**
116 * get_log_header - read the log header for a given segment
117 * @jd: the journal
118 * @blk: the block to look at
119 * @lh: the log header to return
120 *
121 * Read the log header for a given segement in a given journal. Do a few
122 * sanity checks on it.
123 *
124 * Returns: 0 on success,
125 * 1 if the header was invalid or incomplete,
126 * errno on error
127 */
128
static int get_log_header(struct gfs2_jdesc *jd, unsigned int blk,
			  struct gfs2_log_header *head)
{
	struct buffer_head *bh;
	struct gfs2_log_header lh;
	uint32_t hash;
	int error;

	error = gfs2_replay_read_block(jd, blk, &bh);
	if (error)
		return error;

	/* Hash the raw on-disk header with its hash field zeroed — that is
	 * how the hash was computed when the header was written. */
	memcpy(&lh, bh->b_data, sizeof(struct gfs2_log_header));
	lh.lh_hash = 0;
	hash = gfs2_disk_hash((char *)&lh, sizeof(struct gfs2_log_header));
	/* Now decode the header into host byte order for the checks below. */
	gfs2_log_header_in(&lh, bh->b_data);

	brelse(bh);

	/* Wrong magic/type, wrong self-reported block, or a hash mismatch
	 * (e.g. a torn write) all mean "not a valid header here". */
	if (lh.lh_header.mh_magic != GFS2_MAGIC ||
	    lh.lh_header.mh_type != GFS2_METATYPE_LH ||
	    lh.lh_blkno != blk ||
	    lh.lh_hash != hash)
		return 1;

	*head = lh;

	return 0;
}
158
159/**
160 * find_good_lh - find a good log header
161 * @jd: the journal
162 * @blk: the segment to start searching from
163 * @lh: the log header to fill in
164 * @forward: if true search forward in the log, else search backward
165 *
166 * Call get_log_header() to get a log header for a segment, but if the
167 * segment is bad, either scan forward or backward until we find a good one.
168 *
169 * Returns: errno
170 */
171
static int find_good_lh(struct gfs2_jdesc *jd, unsigned int *blk,
		        struct gfs2_log_header *head)
{
	unsigned int orig_blk = *blk;
	int error;

	for (;;) {
		/* 0 = good header (done), <0 = I/O error, 1 = keep looking. */
		error = get_log_header(jd, *blk, head);
		if (error <= 0)
			return error;

		/* Advance with wrap-around at the end of the journal. */
		if (++*blk == jd->jd_blocks)
			*blk = 0;

		/* Came all the way around without finding any valid header:
		 * the journal is unusable. */
		if (*blk == orig_blk) {
			gfs2_consist_inode(get_v2ip(jd->jd_inode));
			return -EIO;
		}
	}
}
192
193/**
194 * jhead_scan - make sure we've found the head of the log
195 * @jd: the journal
196 * @head: this is filled in with the log descriptor of the head
197 *
198 * At this point, seg and lh should be either the head of the log or just
199 * before. Scan forward until we find the head.
200 *
201 * Returns: errno
202 */
203
static int jhead_scan(struct gfs2_jdesc *jd, struct gfs2_log_header *head)
{
	unsigned int blk = head->lh_blkno;
	struct gfs2_log_header lh;
	int error;

	for (;;) {
		/* Advance with wrap-around at the end of the journal. */
		if (++blk == jd->jd_blocks)
			blk = 0;

		error = get_log_header(jd, blk, &lh);
		if (error < 0)
			return error;
		/* Not a valid header at this block; keep scanning forward. */
		if (error == 1)
			continue;

		/* Sequence numbers are unique; a duplicate means the
		 * journal is corrupt. */
		if (lh.lh_sequence == head->lh_sequence) {
			gfs2_consist_inode(get_v2ip(jd->jd_inode));
			return -EIO;
		}
		/* Sequence dropped: the previous header was the log head. */
		if (lh.lh_sequence < head->lh_sequence)
			break;

		*head = lh;
	}

	return 0;
}
232
233/**
234 * gfs2_find_jhead - find the head of a log
235 * @jd: the journal
236 * @head: the log descriptor for the head of the log is returned here
237 *
238 * Do a binary search of a journal and find the valid log entry with the
239 * highest sequence number. (i.e. the log head)
240 *
241 * Returns: errno
242 */
243
int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header *head)
{
	struct gfs2_log_header lh_1, lh_m;
	uint32_t blk_1, blk_2, blk_m;
	int error;

	blk_1 = 0;
	blk_2 = jd->jd_blocks - 1;

	/* Binary search: sequence numbers increase from the head of the
	 * active region, so narrow [blk_1, blk_2] toward the highest one.
	 * find_good_lh() may move a probe point past bad headers. */
	for (;;) {
		blk_m = (blk_1 + blk_2) / 2;

		error = find_good_lh(jd, &blk_1, &lh_1);
		if (error)
			return error;

		error = find_good_lh(jd, &blk_m, &lh_m);
		if (error)
			return error;

		/* Interval can't shrink further; lh_1 is our candidate. */
		if (blk_1 == blk_m || blk_m == blk_2)
			break;

		if (lh_1.lh_sequence <= lh_m.lh_sequence)
			blk_1 = blk_m;
		else
			blk_2 = blk_m;
	}

	/* The search lands at or just before the head; scan forward to be
	 * certain we have the very last valid header. */
	error = jhead_scan(jd, &lh_1);
	if (error)
		return error;

	*head = lh_1;

	return error;
}
281
/**
 * foreach_descriptor - go through the active part of the log
 * @jd: the journal
 * @start: the first log header in the active region
 * @end: the last log header (don't process the contents of this entry)
 * @pass: the recovery pass number, handed through to the log operations
 *
 * Call a given function once for every log descriptor in the active
 * portion of the log.
 *
 * Returns: errno
 */

static int foreach_descriptor(struct gfs2_jdesc *jd, unsigned int start,
			      unsigned int end, int pass)
{
	struct gfs2_sbd *sdp = get_v2ip(jd->jd_inode)->i_sbd;
	struct buffer_head *bh;
	struct gfs2_log_descriptor *ld;
	int error = 0;
	u32 length;
	__be64 *ptr;
	/* offset of the first block pointer: the descriptor struct
	   rounded up to __be64 alignment */
	unsigned int offset = sizeof(struct gfs2_log_descriptor);
	offset += (sizeof(__be64)-1);
	offset &= ~(sizeof(__be64)-1);

	while (start != end) {
		error = gfs2_replay_read_block(jd, start, &bh);
		if (error)
			return error;
		if (gfs2_meta_check(sdp, bh)) {
			brelse(bh);
			return -EIO;
		}
		ld = (struct gfs2_log_descriptor *)bh->b_data;
		length = be32_to_cpu(ld->ld_length);

		if (be16_to_cpu(ld->ld_header.mh_type) == GFS2_METATYPE_LH) {
			/* a log header inside the active region is fine as
			   long as it validates; just step over it */
			struct gfs2_log_header lh;
			error = get_log_header(jd, start, &lh);
			if (!error) {
				gfs2_replay_incr_blk(sdp, &start);
				continue;
			}
			/* 1 == structurally bad header: journal corrupt */
			if (error == 1) {
				gfs2_consist_inode(get_v2ip(jd->jd_inode));
				error = -EIO;
			}
			brelse(bh);
			return error;
		} else if (gfs2_metatype_check(sdp, bh, GFS2_METATYPE_LD)) {
			brelse(bh);
			return -EIO;
		}
		ptr = (__be64 *)(bh->b_data + offset);
		error = lops_scan_elements(jd, start, ld, ptr, pass);
		if (error) {
			brelse(bh);
			return error;
		}

		/* skip over the blocks this descriptor covers */
		while (length--)
			gfs2_replay_incr_blk(sdp, &start);

		brelse(bh);
	}

	return 0;
}
350
351/**
352 * clean_journal - mark a dirty journal as being clean
353 * @sdp: the filesystem
354 * @jd: the journal
355 * @gl: the journal's glock
356 * @head: the head journal to start from
357 *
358 * Returns: errno
359 */
360
361static int clean_journal(struct gfs2_jdesc *jd, struct gfs2_log_header *head)
362{
363 struct gfs2_inode *ip = get_v2ip(jd->jd_inode);
364 struct gfs2_sbd *sdp = ip->i_sbd;
365 unsigned int lblock;
366 int new = 0;
367 uint64_t dblock;
368 struct gfs2_log_header *lh;
369 uint32_t hash;
370 struct buffer_head *bh;
371 int error;
372
373 lblock = head->lh_blkno;
374 gfs2_replay_incr_blk(sdp, &lblock);
375 error = gfs2_block_map(ip, lblock, &new, &dblock, NULL);
376 if (error)
377 return error;
378 if (!dblock) {
379 gfs2_consist_inode(ip);
380 return -EIO;
381 }
382
383 bh = sb_getblk(sdp->sd_vfs, dblock);
384 lock_buffer(bh);
385 memset(bh->b_data, 0, bh->b_size);
386 set_buffer_uptodate(bh);
387 clear_buffer_dirty(bh);
388 unlock_buffer(bh);
389
390 lh = (struct gfs2_log_header *)bh->b_data;
391 memset(lh, 0, sizeof(struct gfs2_log_header));
392 lh->lh_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
393 lh->lh_header.mh_type = cpu_to_be16(GFS2_METATYPE_LH);
394 lh->lh_header.mh_format = cpu_to_be16(GFS2_FORMAT_LH);
395 lh->lh_sequence = cpu_to_be64(head->lh_sequence + 1);
396 lh->lh_flags = cpu_to_be32(GFS2_LOG_HEAD_UNMOUNT);
397 lh->lh_blkno = cpu_to_be32(lblock);
398 hash = gfs2_disk_hash((const char *)lh, sizeof(struct gfs2_log_header));
399 lh->lh_hash = cpu_to_be32(hash);
400
401 set_buffer_dirty(bh);
402 if (sync_dirty_buffer(bh))
403 gfs2_io_error_bh(sdp, bh);
404 brelse(bh);
405
406 return error;
407}
408
/**
 * gfs2_recover_journal - recovery a given journal
 * @jd: the struct gfs2_jdesc describing the journal
 * @wait: Don't return until the journal is clean (or an error is encountered)
 *
 * Acquire the journal's lock, check to see if the journal is clean, and
 * do recovery if necessary.
 *
 * Returns: errno
 */

int gfs2_recover_journal(struct gfs2_jdesc *jd, int wait)
{
	struct gfs2_sbd *sdp = get_v2ip(jd->jd_inode)->i_sbd;
	struct gfs2_log_header head;
	struct gfs2_holder j_gh, ji_gh, t_gh;
	unsigned long t;
	int ro = 0;
	unsigned int pass;
	int error;

	fs_info(sdp, "jid=%u: Trying to acquire journal lock...\n", jd->jd_jid);

	/* Acquire the journal lock so we can do recovery */

	error = gfs2_glock_nq_num(sdp,
				  jd->jd_jid, &gfs2_journal_glops,
				  LM_ST_EXCLUSIVE,
				  LM_FLAG_NOEXP |
				  ((wait) ? 0 : LM_FLAG_TRY) |
				  GL_NOCACHE, &j_gh);
	switch (error) {
	case 0:
		break;

	case GLR_TRYFAILED:
		/* another node holds the journal; give up quietly */
		fs_info(sdp, "jid=%u: Busy\n", jd->jd_jid);
		error = 0;
		/* fall through */

	default:
		goto fail;
	};

	error = gfs2_glock_nq_init(get_v2ip(jd->jd_inode)->i_gl, LM_ST_SHARED,
				   LM_FLAG_NOEXP, &ji_gh);
	if (error)
		goto fail_gunlock_j;

	fs_info(sdp, "jid=%u: Looking at journal...\n", jd->jd_jid);

	error = gfs2_jdesc_check(jd);
	if (error)
		goto fail_gunlock_ji;

	error = gfs2_find_jhead(jd, &head);
	if (error)
		goto fail_gunlock_ji;

	/* no UNMOUNT flag in the head means the journal is dirty: replay */
	if (!(head.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) {
		fs_info(sdp, "jid=%u: Acquiring the transaction lock...\n",
			jd->jd_jid);

		t = jiffies;

		/* Acquire a shared hold on the transaction lock */

		error = gfs2_glock_nq_init(sdp->sd_trans_gl,
					   LM_ST_SHARED,
					   LM_FLAG_NOEXP |
					   LM_FLAG_PRIORITY |
					   GL_NEVER_RECURSE |
					   GL_NOCANCEL |
					   GL_NOCACHE,
					   &t_gh);
		if (error)
			goto fail_gunlock_ji;

		/* we can't replay into a read-only filesystem */
		if (test_bit(SDF_JOURNAL_CHECKED, &sdp->sd_flags)) {
			if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags))
				ro = 1;
		} else {
			if (sdp->sd_vfs->s_flags & MS_RDONLY)
				ro = 1;
		}

		if (ro) {
			fs_warn(sdp, "jid=%u: Can't replay: read-only FS\n",
				jd->jd_jid);
			error = -EROFS;
			goto fail_gunlock_tr;
		}

		fs_info(sdp, "jid=%u: Replaying journal...\n", jd->jd_jid);

		/* two passes over the active region of the journal */
		for (pass = 0; pass < 2; pass++) {
			lops_before_scan(jd, &head, pass);
			error = foreach_descriptor(jd, head.lh_tail,
						   head.lh_blkno, pass);
			lops_after_scan(jd, error, pass);
			if (error)
				goto fail_gunlock_tr;
		}

		/* write an unmount header so the journal reads as clean */
		error = clean_journal(jd, &head);
		if (error)
			goto fail_gunlock_tr;

		gfs2_glock_dq_uninit(&t_gh);

		t = DIV_RU(jiffies - t, HZ);

		fs_info(sdp, "jid=%u: Journal replayed in %lus\n",
			jd->jd_jid, t);
	}

	gfs2_glock_dq_uninit(&ji_gh);

	gfs2_lm_recovery_done(sdp, jd->jd_jid, LM_RD_SUCCESS);

	gfs2_glock_dq_uninit(&j_gh);

	fs_info(sdp, "jid=%u: Done\n", jd->jd_jid);

	return 0;

 fail_gunlock_tr:
	gfs2_glock_dq_uninit(&t_gh);

 fail_gunlock_ji:
	gfs2_glock_dq_uninit(&ji_gh);

 fail_gunlock_j:
	gfs2_glock_dq_uninit(&j_gh);

	fs_info(sdp, "jid=%u: %s\n", jd->jd_jid, (error) ? "Failed" : "Done");

 fail:
	/* tell the lock module recovery did not complete */
	gfs2_lm_recovery_done(sdp, jd->jd_jid, LM_RD_GAVEUP);

	return error;
}
550
551/**
552 * gfs2_check_journals - Recover any dirty journals
553 * @sdp: the filesystem
554 *
555 */
556
557void gfs2_check_journals(struct gfs2_sbd *sdp)
558{
559 struct gfs2_jdesc *jd;
560
561 for (;;) {
562 jd = gfs2_jdesc_find_dirty(sdp);
563 if (!jd)
564 break;
565
566 if (jd != sdp->sd_jdesc)
567 gfs2_recover_journal(jd, NO_WAIT);
568 }
569}
570
diff --git a/fs/gfs2/recovery.h b/fs/gfs2/recovery.h
new file mode 100644
index 000000000000..50d7eb57881c
--- /dev/null
+++ b/fs/gfs2/recovery.h
@@ -0,0 +1,32 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
#ifndef __RECOVERY_DOT_H__
#define __RECOVERY_DOT_H__

/* Advance *blk by one journal block, wrapping to 0 at the journal end. */
static inline void gfs2_replay_incr_blk(struct gfs2_sbd *sdp, unsigned int *blk)
{
	if (++*blk == sdp->sd_jdesc->jd_blocks)
		*blk = 0;
}

int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk,
			   struct buffer_head **bh);

int gfs2_revoke_add(struct gfs2_sbd *sdp, uint64_t blkno, unsigned int where);
int gfs2_revoke_check(struct gfs2_sbd *sdp, uint64_t blkno, unsigned int where);
void gfs2_revoke_clean(struct gfs2_sbd *sdp);

int gfs2_find_jhead(struct gfs2_jdesc *jd,
		    struct gfs2_log_header *head);
int gfs2_recover_journal(struct gfs2_jdesc *gfs2_jd, int wait);
void gfs2_check_journals(struct gfs2_sbd *sdp);

#endif /* __RECOVERY_DOT_H__ */
32
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
new file mode 100644
index 000000000000..2e69e5cda967
--- /dev/null
+++ b/fs/gfs2/rgrp.c
@@ -0,0 +1,1364 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/fs.h>
16#include <asm/semaphore.h>
17
18#include "gfs2.h"
19#include "bits.h"
20#include "glock.h"
21#include "glops.h"
22#include "lops.h"
23#include "meta_io.h"
24#include "quota.h"
25#include "rgrp.h"
26#include "super.h"
27#include "trans.h"
28#include "ops_file.h"
29
/**
 * gfs2_rgrp_verify - Verify that a resource group is consistent
 * @rgd: the rgrp
 *
 * Recounts the four possible 2-bit allocation states across the rgrp's
 * bitmaps and cross-checks them against the counts in the rgrp header.
 */

void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd)
{
	struct gfs2_sbd *sdp = rgd->rd_sbd;
	struct gfs2_bitmap *bi = NULL;
	uint32_t length = rgd->rd_ri.ri_length;
	uint32_t count[4], tmp;
	int buf, x;

	memset(count, 0, 4 * sizeof(uint32_t));

	/* Count # blocks in each of 4 possible allocation states */
	for (buf = 0; buf < length; buf++) {
		bi = rgd->rd_bits + buf;
		for (x = 0; x < 4; x++)
			count[x] += gfs2_bitcount(rgd,
						  bi->bi_bh->b_data +
						  bi->bi_offset,
						  bi->bi_len, x);
	}

	if (count[0] != rgd->rd_rg.rg_free) {
		if (gfs2_consist_rgrpd(rgd))
			fs_err(sdp, "free data mismatch: %u != %u\n",
			       count[0], rgd->rd_rg.rg_free);
		return;
	}

	/* expected used-data count = total data - free - dinodes */
	tmp = rgd->rd_ri.ri_data -
		rgd->rd_rg.rg_free -
		rgd->rd_rg.rg_dinodes;
	if (count[1] != tmp) {
		if (gfs2_consist_rgrpd(rgd))
			fs_err(sdp, "used data mismatch: %u != %u\n",
			       count[1], tmp);
		return;
	}

	/* state 2 ("free metadata") should never occur */
	if (count[2]) {
		if (gfs2_consist_rgrpd(rgd))
			fs_err(sdp, "free metadata mismatch: %u != 0\n",
			       count[2]);
		return;
	}

	if (count[3] != rgd->rd_rg.rg_dinodes) {
		if (gfs2_consist_rgrpd(rgd))
			fs_err(sdp, "used metadata mismatch: %u != %u\n",
			       count[3], rgd->rd_rg.rg_dinodes);
		return;
	}
}
88
89static inline int rgrp_contains_block(struct gfs2_rindex *ri, uint64_t block)
90{
91 uint64_t first = ri->ri_data0;
92 uint64_t last = first + ri->ri_data;
93 return !!(first <= block && block < last);
94}
95
96/**
97 * gfs2_blk2rgrpd - Find resource group for a given data/meta block number
98 * @sdp: The GFS2 superblock
99 * @n: The data block number
100 *
101 * Returns: The resource group, or NULL if not found
102 */
103
104struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, uint64_t blk)
105{
106 struct gfs2_rgrpd *rgd;
107
108 spin_lock(&sdp->sd_rindex_spin);
109
110 list_for_each_entry(rgd, &sdp->sd_rindex_mru_list, rd_list_mru) {
111 if (rgrp_contains_block(&rgd->rd_ri, blk)) {
112 list_move(&rgd->rd_list_mru, &sdp->sd_rindex_mru_list);
113 spin_unlock(&sdp->sd_rindex_spin);
114 return rgd;
115 }
116 }
117
118 spin_unlock(&sdp->sd_rindex_spin);
119
120 return NULL;
121}
122
123/**
124 * gfs2_rgrpd_get_first - get the first Resource Group in the filesystem
125 * @sdp: The GFS2 superblock
126 *
127 * Returns: The first rgrp in the filesystem
128 */
129
130struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp)
131{
132 gfs2_assert(sdp, !list_empty(&sdp->sd_rindex_list));
133 return list_entry(sdp->sd_rindex_list.next, struct gfs2_rgrpd, rd_list);
134}
135
136/**
137 * gfs2_rgrpd_get_next - get the next RG
138 * @rgd: A RG
139 *
140 * Returns: The next rgrp
141 */
142
143struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd)
144{
145 if (rgd->rd_list.next == &rgd->rd_sbd->sd_rindex_list)
146 return NULL;
147 return list_entry(rgd->rd_list.next, struct gfs2_rgrpd, rd_list);
148}
149
/* Tear down all in-core rgrp state: empty the recent list, then free
   every rgrpd (dropping its glock reference) from the rindex list.
   Both call sites hold sd_rindex_mutex. */
static void clear_rgrpdi(struct gfs2_sbd *sdp)
{
	struct list_head *head;
	struct gfs2_rgrpd *rgd;
	struct gfs2_glock *gl;

	/* the recent list only borrows rgrpds; just unlink them */
	spin_lock(&sdp->sd_rindex_spin);
	sdp->sd_rindex_forward = NULL;
	head = &sdp->sd_rindex_recent_list;
	while (!list_empty(head)) {
		rgd = list_entry(head->next, struct gfs2_rgrpd, rd_recent);
		list_del(&rgd->rd_recent);
	}
	spin_unlock(&sdp->sd_rindex_spin);

	/* the rindex list owns the rgrpds; unlink and free them */
	head = &sdp->sd_rindex_list;
	while (!list_empty(head)) {
		rgd = list_entry(head->next, struct gfs2_rgrpd, rd_list);
		gl = rgd->rd_gl;

		list_del(&rgd->rd_list);
		list_del(&rgd->rd_list_mru);

		if (gl) {
			/* clear the glock's back-pointer before dropping it */
			set_gl2rgd(gl, NULL);
			gfs2_glock_put(gl);
		}

		kfree(rgd->rd_bits);
		kfree(rgd);
	}
}
182
/* Public wrapper: clear all in-core rgrp data under the rindex mutex. */
void gfs2_clear_rgrpd(struct gfs2_sbd *sdp)
{
	mutex_lock(&sdp->sd_rindex_mutex);
	clear_rgrpdi(sdp);
	mutex_unlock(&sdp->sd_rindex_mutex);
}
189
/**
 * compute_bitstructs - Compute the bitmap sizes
 * @rgd: The resource group descriptor
 *
 * Calculates bitmap descriptors, one for each block that contains bitmap data
 *
 * Returns: errno
 */

static int compute_bitstructs(struct gfs2_rgrpd *rgd)
{
	struct gfs2_sbd *sdp = rgd->rd_sbd;
	struct gfs2_bitmap *bi;
	uint32_t length = rgd->rd_ri.ri_length; /* # blocks in hdr & bitmap */
	uint32_t bytes_left, bytes;
	int x;

	rgd->rd_bits = kcalloc(length, sizeof(struct gfs2_bitmap), GFP_KERNEL);
	if (!rgd->rd_bits)
		return -ENOMEM;

	bytes_left = rgd->rd_ri.ri_bitbytes;

	for (x = 0; x < length; x++) {
		bi = rgd->rd_bits + x;

		/* small rgrp; bitmap stored completely in header block */
		if (length == 1) {
			bytes = bytes_left;
			bi->bi_offset = sizeof(struct gfs2_rgrp);
			bi->bi_start = 0;
			bi->bi_len = bytes;
		/* header block */
		} else if (x == 0) {
			bytes = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_rgrp);
			bi->bi_offset = sizeof(struct gfs2_rgrp);
			bi->bi_start = 0;
			bi->bi_len = bytes;
		/* last block */
		} else if (x + 1 == length) {
			bytes = bytes_left;
			bi->bi_offset = sizeof(struct gfs2_meta_header);
			bi->bi_start = rgd->rd_ri.ri_bitbytes - bytes_left;
			bi->bi_len = bytes;
		/* other blocks */
		} else {
			bytes = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_meta_header);
			bi->bi_offset = sizeof(struct gfs2_meta_header);
			bi->bi_start = rgd->rd_ri.ri_bitbytes - bytes_left;
			bi->bi_len = bytes;
		}

		bytes_left -= bytes;
	}

	/* every byte of ri_bitbytes must be accounted for exactly */
	if (bytes_left) {
		gfs2_consist_rgrpd(rgd);
		return -EIO;
	}
	/* the last slice must end exactly at ri_data blocks (each bitmap
	   byte describes GFS2_NBBY blocks) */
	bi = rgd->rd_bits + (length - 1);
	if ((bi->bi_start + bi->bi_len) * GFS2_NBBY != rgd->rd_ri.ri_data) {
		if (gfs2_consist_rgrpd(rgd)) {
			gfs2_rindex_print(&rgd->rd_ri);
			fs_err(sdp, "start=%u len=%u offset=%u\n",
			       bi->bi_start, bi->bi_len, bi->bi_offset);
		}
		return -EIO;
	}

	return 0;
}
261
/**
 * gfs2_ri_update - Pull in a new resource index from the disk
 * @ip: the rindex inode
 *
 * Reads rindex entries one at a time, building an in-core rgrpd for
 * each.  Called with sd_rindex_mutex held (see gfs2_rindex_hold()).
 *
 * Returns: 0 on successful update, error code otherwise
 */

static int gfs2_ri_update(struct gfs2_inode *ip)
{
	struct gfs2_sbd *sdp = ip->i_sbd;
	struct inode *inode = ip->i_vnode;
	struct gfs2_rgrpd *rgd;
	char buf[sizeof(struct gfs2_rindex)];
	struct file_ra_state ra_state;
	uint64_t junk = ip->i_di.di_size;
	int error;

	/* the rindex file must hold a whole number of entries */
	if (do_div(junk, sizeof(struct gfs2_rindex))) {
		gfs2_consist_inode(ip);
		return -EIO;
	}

	clear_rgrpdi(sdp);

	file_ra_state_init(&ra_state, inode->i_mapping);
	for (sdp->sd_rgrps = 0;; sdp->sd_rgrps++) {
		loff_t pos = sdp->sd_rgrps * sizeof(struct gfs2_rindex);
		error = gfs2_internal_read(ip, &ra_state, buf, &pos,
					   sizeof(struct gfs2_rindex));
		if (!error)	/* clean EOF: all entries read */
			break;
		if (error != sizeof(struct gfs2_rindex)) {
			/* a short read counts as corruption */
			if (error > 0)
				error = -EIO;
			goto fail;
		}

		rgd = kzalloc(sizeof(struct gfs2_rgrpd), GFP_KERNEL);
		error = -ENOMEM;
		if (!rgd)
			goto fail;

		mutex_init(&rgd->rd_mutex);
		lops_init_le(&rgd->rd_le, &gfs2_rg_lops);
		rgd->rd_sbd = sdp;

		/* link in before anything can fail, so the fail path's
		   clear_rgrpdi() frees this rgd too */
		list_add_tail(&rgd->rd_list, &sdp->sd_rindex_list);
		list_add_tail(&rgd->rd_list_mru, &sdp->sd_rindex_mru_list);

		gfs2_rindex_in(&rgd->rd_ri, buf);

		error = compute_bitstructs(rgd);
		if (error)
			goto fail;

		error = gfs2_glock_get(sdp, rgd->rd_ri.ri_addr,
				       &gfs2_rgrp_glops, CREATE, &rgd->rd_gl);
		if (error)
			goto fail;

		set_gl2rgd(rgd->rd_gl, rgd);
		/* force a reread of the rgrp header on first use */
		rgd->rd_rg_vn = rgd->rd_gl->gl_vn - 1;
	}

	/* record which version of the rindex is now in core */
	sdp->sd_rindex_vn = ip->i_gl->gl_vn;

	return 0;

 fail:
	clear_rgrpdi(sdp);

	return error;
}
335
/**
 * gfs2_rindex_hold - Grab a lock on the rindex
 * @sdp: The GFS2 superblock
 * @ri_gh: the glock holder
 *
 * We grab a lock on the rindex inode to make sure that it doesn't
 * change whilst we are performing an operation. We keep this lock
 * for quite long periods of time compared to other locks. This
 * doesn't matter, since it is shared and it is very, very rarely
 * accessed in the exclusive mode (i.e. only when expanding the filesystem).
 *
 * This makes sure that we're using the latest copy of the resource index
 * special file, which might have been updated if someone expanded the
 * filesystem (via gfs2_grow utility), which adds new resource groups.
 *
 * Returns: 0 on success, error code otherwise
 */

int gfs2_rindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ri_gh)
{
	struct gfs2_inode *ip = get_v2ip(sdp->sd_rindex);
	struct gfs2_glock *gl = ip->i_gl;
	int error;

	error = gfs2_glock_nq_init(gl, LM_ST_SHARED, 0, ri_gh);
	if (error)
		return error;

	/* Read new copy from disk if we don't have the latest */
	if (sdp->sd_rindex_vn != gl->gl_vn) {
		mutex_lock(&sdp->sd_rindex_mutex);
		/* recheck under the mutex: another task may have won the
		   race and already refreshed the rindex */
		if (sdp->sd_rindex_vn != gl->gl_vn) {
			error = gfs2_ri_update(ip);
			if (error)
				gfs2_glock_dq_uninit(ri_gh);
		}
		mutex_unlock(&sdp->sd_rindex_mutex);
	}

	return error;
}
377
/**
 * gfs2_rgrp_bh_get - Read in a RG's header and bitmaps
 * @rgd: the struct gfs2_rgrpd describing the RG to read in
 *
 * Read in all of a Resource Group's header and bitmap blocks.
 * Caller must eventually call gfs2_rgrp_bh_put() to free the bitmaps.
 *
 * Returns: errno
 */

int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd)
{
	struct gfs2_sbd *sdp = rgd->rd_sbd;
	struct gfs2_glock *gl = rgd->rd_gl;
	unsigned int length = rgd->rd_ri.ri_length;
	struct gfs2_bitmap *bi;
	unsigned int x, y;
	int error;

	mutex_lock(&rgd->rd_mutex);

	/* already read in: just take another reference */
	spin_lock(&sdp->sd_rindex_spin);
	if (rgd->rd_bh_count) {
		rgd->rd_bh_count++;
		spin_unlock(&sdp->sd_rindex_spin);
		mutex_unlock(&rgd->rd_mutex);
		return 0;
	}
	spin_unlock(&sdp->sd_rindex_spin);

	/* start reads on all bitmap blocks ... */
	for (x = 0; x < length; x++) {
		bi = rgd->rd_bits + x;
		error = gfs2_meta_read(gl, rgd->rd_ri.ri_addr + x, DIO_START,
				       &bi->bi_bh);
		if (error)
			goto fail;
	}

	/* ... then wait for and validate them */
	for (y = length; y--;) {
		bi = rgd->rd_bits + y;
		error = gfs2_meta_reread(sdp, bi->bi_bh, DIO_WAIT);
		if (error)
			goto fail;
		/* block 0 is the RG header; the rest are RB blocks */
		if (gfs2_metatype_check(sdp, bi->bi_bh,
					(y) ? GFS2_METATYPE_RB :
					GFS2_METATYPE_RG)) {
			error = -EIO;
			goto fail;
		}
	}

	if (rgd->rd_rg_vn != gl->gl_vn) {
		/* header may have changed since last read: decode again */
		gfs2_rgrp_in(&rgd->rd_rg, (rgd->rd_bits[0].bi_bh)->b_data);
		rgd->rd_rg_vn = gl->gl_vn;
	}

	spin_lock(&sdp->sd_rindex_spin);
	rgd->rd_free_clone = rgd->rd_rg.rg_free;
	rgd->rd_bh_count++;
	spin_unlock(&sdp->sd_rindex_spin);

	mutex_unlock(&rgd->rd_mutex);

	return 0;

 fail:
	/* drop whatever buffers we obtained; x is one past the last
	   successful gfs2_meta_read() (== length if the wait loop failed) */
	while (x--) {
		bi = rgd->rd_bits + x;
		brelse(bi->bi_bh);
		bi->bi_bh = NULL;
		gfs2_assert_warn(sdp, !bi->bi_clone);
	}
	mutex_unlock(&rgd->rd_mutex);

	return error;
}
454
455void gfs2_rgrp_bh_hold(struct gfs2_rgrpd *rgd)
456{
457 struct gfs2_sbd *sdp = rgd->rd_sbd;
458
459 spin_lock(&sdp->sd_rindex_spin);
460 gfs2_assert_warn(rgd->rd_sbd, rgd->rd_bh_count);
461 rgd->rd_bh_count++;
462 spin_unlock(&sdp->sd_rindex_spin);
463}
464
465/**
466 * gfs2_rgrp_bh_put - Release RG bitmaps read in with gfs2_rgrp_bh_get()
467 * @rgd: the struct gfs2_rgrpd describing the RG to read in
468 *
469 */
470
471void gfs2_rgrp_bh_put(struct gfs2_rgrpd *rgd)
472{
473 struct gfs2_sbd *sdp = rgd->rd_sbd;
474 int x, length = rgd->rd_ri.ri_length;
475
476 spin_lock(&sdp->sd_rindex_spin);
477 gfs2_assert_warn(rgd->rd_sbd, rgd->rd_bh_count);
478 if (--rgd->rd_bh_count) {
479 spin_unlock(&sdp->sd_rindex_spin);
480 return;
481 }
482
483 for (x = 0; x < length; x++) {
484 struct gfs2_bitmap *bi = rgd->rd_bits + x;
485 kfree(bi->bi_clone);
486 bi->bi_clone = NULL;
487 brelse(bi->bi_bh);
488 bi->bi_bh = NULL;
489 }
490
491 spin_unlock(&sdp->sd_rindex_spin);
492}
493
494void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd)
495{
496 struct gfs2_sbd *sdp = rgd->rd_sbd;
497 unsigned int length = rgd->rd_ri.ri_length;
498 unsigned int x;
499
500 for (x = 0; x < length; x++) {
501 struct gfs2_bitmap *bi = rgd->rd_bits + x;
502 if (!bi->bi_clone)
503 continue;
504 memcpy(bi->bi_clone + bi->bi_offset,
505 bi->bi_bh->b_data + bi->bi_offset,
506 bi->bi_len);
507 }
508
509 spin_lock(&sdp->sd_rindex_spin);
510 rgd->rd_free_clone = rgd->rd_rg.rg_free;
511 spin_unlock(&sdp->sd_rindex_spin);
512}
513
514/**
515 * gfs2_alloc_get - get the struct gfs2_alloc structure for an inode
516 * @ip: the incore GFS2 inode structure
517 *
518 * Returns: the struct gfs2_alloc
519 */
520
521struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip)
522{
523 struct gfs2_alloc *al = &ip->i_alloc;
524
525 /* FIXME: Should assert that the correct locks are held here... */
526 memset(al, 0, sizeof(*al));
527 return al;
528}
529
/**
 * gfs2_alloc_put - throw away the struct gfs2_alloc for an inode
 * @ip: the inode
 *
 * The allocation context is embedded in the inode (see gfs2_alloc_get()),
 * so there is nothing to free; this exists to pair with gfs2_alloc_get().
 */

void gfs2_alloc_put(struct gfs2_inode *ip)
{
}
540
541/**
542 * try_rgrp_fit - See if a given reservation will fit in a given RG
543 * @rgd: the RG data
544 * @al: the struct gfs2_alloc structure describing the reservation
545 *
546 * If there's room for the requested blocks to be allocated from the RG:
547 * Sets the $al_reserved_data field in @al.
548 * Sets the $al_reserved_meta field in @al.
549 * Sets the $al_rgd field in @al.
550 *
551 * Returns: 1 on success (it fits), 0 on failure (it doesn't fit)
552 */
553
554static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_alloc *al)
555{
556 struct gfs2_sbd *sdp = rgd->rd_sbd;
557 int ret = 0;
558
559 spin_lock(&sdp->sd_rindex_spin);
560 if (rgd->rd_free_clone >= al->al_requested) {
561 al->al_rgd = rgd;
562 ret = 1;
563 }
564 spin_unlock(&sdp->sd_rindex_spin);
565
566 return ret;
567}
568
/**
 * recent_rgrp_first - get first RG from "recent" list
 * @sdp: The GFS2 superblock
 * @rglast: address of the rgrp used last (0 = none)
 *
 * Returns: the rgrp at address @rglast if it is on the recent list,
 * otherwise the first entry of the list, or NULL if the list is empty
 */

static struct gfs2_rgrpd *recent_rgrp_first(struct gfs2_sbd *sdp,
					    uint64_t rglast)
{
	struct gfs2_rgrpd *rgd = NULL;

	spin_lock(&sdp->sd_rindex_spin);

	if (list_empty(&sdp->sd_rindex_recent_list))
		goto out;

	if (!rglast)
		goto first;

	/* prefer to resume at the rgrp we allocated from last time;
	   if the loop completes without a match we fall into "first" */
	list_for_each_entry(rgd, &sdp->sd_rindex_recent_list, rd_recent) {
		if (rgd->rd_ri.ri_addr == rglast)
			goto out;
	}

 first:
	rgd = list_entry(sdp->sd_rindex_recent_list.next, struct gfs2_rgrpd,
			 rd_recent);

 out:
	spin_unlock(&sdp->sd_rindex_spin);

	return rgd;
}
604
/**
 * recent_rgrp_next - get next RG from "recent" list
 * @cur_rgd: current rgrp
 * @remove: if set, also unlink @cur_rgd from the recent list
 *
 * Returns: the entry after @cur_rgd on the recent list (NULL if it was
 * last); if @cur_rgd is not on the list at all, the list's first entry
 * (or NULL if the list is empty)
 */

static struct gfs2_rgrpd *recent_rgrp_next(struct gfs2_rgrpd *cur_rgd,
					   int remove)
{
	struct gfs2_sbd *sdp = cur_rgd->rd_sbd;
	struct list_head *head;
	struct gfs2_rgrpd *rgd;

	spin_lock(&sdp->sd_rindex_spin);

	head = &sdp->sd_rindex_recent_list;

	list_for_each_entry(rgd, head, rd_recent) {
		if (rgd == cur_rgd) {
			/* pick the successor before (possibly) unlinking */
			if (cur_rgd->rd_recent.next != head)
				rgd = list_entry(cur_rgd->rd_recent.next,
						 struct gfs2_rgrpd, rd_recent);
			else
				rgd = NULL;

			if (remove)
				list_del(&cur_rgd->rd_recent);

			goto out;
		}
	}

	/* cur_rgd wasn't on the list; fall back to the first entry */
	rgd = NULL;
	if (!list_empty(head))
		rgd = list_entry(head->next, struct gfs2_rgrpd, rd_recent);

 out:
	spin_unlock(&sdp->sd_rindex_spin);

	return rgd;
}
648
/**
 * recent_rgrp_add - add an RG to tail of "recent" list
 * @new_rgd: The rgrp to add
 *
 * Nothing is added if @new_rgd is already on the list or if the list
 * has reached its cap of roughly (total rgrps / number of journals).
 */

static void recent_rgrp_add(struct gfs2_rgrpd *new_rgd)
{
	struct gfs2_sbd *sdp = new_rgd->rd_sbd;
	struct gfs2_rgrpd *rgd;
	unsigned int count = 0;
	/* NOTE(review): assumes gfs2_jindex_size() != 0 here -- a mounted
	   fs presumably always has at least one journal; confirm */
	unsigned int max = sdp->sd_rgrps / gfs2_jindex_size(sdp);

	spin_lock(&sdp->sd_rindex_spin);

	list_for_each_entry(rgd, &sdp->sd_rindex_recent_list, rd_recent) {
		if (rgd == new_rgd)
			goto out;

		if (++count >= max)
			goto out;
	}
	list_add_tail(&new_rgd->rd_recent, &sdp->sd_rindex_recent_list);

 out:
	spin_unlock(&sdp->sd_rindex_spin);
}
676
/**
 * forward_rgrp_get - get an rgrp to try next from full list
 * @sdp: The GFS2 superblock
 *
 * If no forward pointer is set yet, seed it at an offset in the rgrp
 * list proportional to this node's journal id, so different nodes
 * start their scans at different points in the filesystem.
 *
 * Returns: The rgrp to try next
 */

static struct gfs2_rgrpd *forward_rgrp_get(struct gfs2_sbd *sdp)
{
	struct gfs2_rgrpd *rgd;
	unsigned int journals = gfs2_jindex_size(sdp);
	unsigned int rg = 0, x;

	spin_lock(&sdp->sd_rindex_spin);

	rgd = sdp->sd_rindex_forward;
	if (!rgd) {
		/* offset proportional to our journal id */
		if (sdp->sd_rgrps >= journals)
			rg = sdp->sd_rgrps * sdp->sd_jdesc->jd_jid / journals;

		/* walk forward rg entries from the head of the list */
		for (x = 0, rgd = gfs2_rgrpd_get_first(sdp);
		     x < rg;
		     x++, rgd = gfs2_rgrpd_get_next(rgd))
			/* Do Nothing */;

		sdp->sd_rindex_forward = rgd;
	}

	spin_unlock(&sdp->sd_rindex_spin);

	return rgd;
}
709
710/**
711 * forward_rgrp_set - set the forward rgrp pointer
712 * @sdp: the filesystem
713 * @rgd: The new forward rgrp
714 *
715 */
716
717static void forward_rgrp_set(struct gfs2_sbd *sdp, struct gfs2_rgrpd *rgd)
718{
719 spin_lock(&sdp->sd_rindex_spin);
720 sdp->sd_rindex_forward = rgd;
721 spin_unlock(&sdp->sd_rindex_spin);
722}
723
/**
 * get_local_rgrp - Choose and lock a rgrp for allocation
 * @ip: the inode to reserve space for
 *
 * Try to acquire rgrp in way which avoids contending with others.
 * First retries recently successful rgrps with trylocks, then walks
 * the full rgrp list: trylocks on the first lap, blocking locks on
 * the second.
 *
 * Returns: errno
 */

static int get_local_rgrp(struct gfs2_inode *ip)
{
	struct gfs2_sbd *sdp = ip->i_sbd;
	struct gfs2_rgrpd *rgd, *begin = NULL;
	struct gfs2_alloc *al = &ip->i_alloc;
	int flags = LM_FLAG_TRY;
	int skipped = 0;
	int loops = 0;
	int error;

	/* Try recently successful rgrps */

	rgd = recent_rgrp_first(sdp, ip->i_last_rg_alloc);

	while (rgd) {
		error = gfs2_glock_nq_init(rgd->rd_gl,
					  LM_ST_EXCLUSIVE, LM_FLAG_TRY,
					  &al->al_rgd_gh);
		switch (error) {
		case 0:
			if (try_rgrp_fit(rgd, al))
				goto out;
			/* locked but full: drop it from the recent list */
			gfs2_glock_dq_uninit(&al->al_rgd_gh);
			rgd = recent_rgrp_next(rgd, 1);
			break;

		case GLR_TRYFAILED:
			/* contended: keep it on the list, try the next */
			rgd = recent_rgrp_next(rgd, 0);
			break;

		default:
			return error;
		}
	}

	/* Go through full list of rgrps */

	begin = rgd = forward_rgrp_get(sdp);

	for (;;) {
		error = gfs2_glock_nq_init(rgd->rd_gl,
					  LM_ST_EXCLUSIVE, flags,
					  &al->al_rgd_gh);
		switch (error) {
		case 0:
			if (try_rgrp_fit(rgd, al))
				goto out;
			gfs2_glock_dq_uninit(&al->al_rgd_gh);
			break;

		case GLR_TRYFAILED:
			skipped++;
			break;

		default:
			return error;
		}

		rgd = gfs2_rgrpd_get_next(rgd);
		if (!rgd)
			rgd = gfs2_rgrpd_get_first(sdp);

		if (rgd == begin) {
			/* back at the start: the second lap drops TRY and
			   blocks; two laps without a fit means no space */
			if (++loops >= 2 || !skipped)
				return -ENOSPC;
			flags = 0;
		}
	}

 out:
	/* remember where we allocated for next time */
	ip->i_last_rg_alloc = rgd->rd_ri.ri_addr;

	if (begin) {
		/* came from the full-list scan: cache this rgrp on the
		   recent list and push the shared forward pointer past it */
		recent_rgrp_add(rgd);
		rgd = gfs2_rgrpd_get_next(rgd);
		if (!rgd)
			rgd = gfs2_rgrpd_get_first(sdp);
		forward_rgrp_set(sdp, rgd);
	}

	return 0;
}
816
817/**
818 * gfs2_inplace_reserve_i - Reserve space in the filesystem
819 * @ip: the inode to reserve space for
820 *
821 * Returns: errno
822 */
823
824int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file, unsigned int line)
825{
826 struct gfs2_sbd *sdp = ip->i_sbd;
827 struct gfs2_alloc *al = &ip->i_alloc;
828 int error;
829
830 if (gfs2_assert_warn(sdp, al->al_requested))
831 return -EINVAL;
832
833 error = gfs2_rindex_hold(sdp, &al->al_ri_gh);
834 if (error)
835 return error;
836
837 error = get_local_rgrp(ip);
838 if (error) {
839 gfs2_glock_dq_uninit(&al->al_ri_gh);
840 return error;
841 }
842
843 al->al_file = file;
844 al->al_line = line;
845
846 return 0;
847}
848
849/**
850 * gfs2_inplace_release - release an inplace reservation
851 * @ip: the inode the reservation was taken out on
852 *
853 * Release a reservation made by gfs2_inplace_reserve().
854 */
855
856void gfs2_inplace_release(struct gfs2_inode *ip)
857{
858 struct gfs2_sbd *sdp = ip->i_sbd;
859 struct gfs2_alloc *al = &ip->i_alloc;
860
861 if (gfs2_assert_warn(sdp, al->al_alloced <= al->al_requested) == -1)
862 fs_warn(sdp, "al_alloced = %u, al_requested = %u "
863 "al_file = %s, al_line = %u\n",
864 al->al_alloced, al->al_requested, al->al_file,
865 al->al_line);
866
867 al->al_rgd = NULL;
868 gfs2_glock_dq_uninit(&al->al_rgd_gh);
869 gfs2_glock_dq_uninit(&al->al_ri_gh);
870}
871
872/**
873 * gfs2_get_block_type - Check a block in a RG is of given type
874 * @rgd: the resource group holding the block
875 * @block: the block number
876 *
877 * Returns: The block type (GFS2_BLKST_*)
878 */
879
880unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, uint64_t block)
881{
882 struct gfs2_bitmap *bi = NULL;
883 uint32_t length, rgrp_block, buf_block;
884 unsigned int buf;
885 unsigned char type;
886
887 length = rgd->rd_ri.ri_length;
888 rgrp_block = block - rgd->rd_ri.ri_data0;
889
890 for (buf = 0; buf < length; buf++) {
891 bi = rgd->rd_bits + buf;
892 if (rgrp_block < (bi->bi_start + bi->bi_len) * GFS2_NBBY)
893 break;
894 }
895
896 gfs2_assert(rgd->rd_sbd, buf < length);
897 buf_block = rgrp_block - bi->bi_start * GFS2_NBBY;
898
899 type = gfs2_testbit(rgd,
900 bi->bi_bh->b_data + bi->bi_offset,
901 bi->bi_len, buf_block);
902
903 return type;
904}
905
/**
 * rgblk_search - find a block in @old_state, change allocation
 *           state to @new_state
 * @rgd: the resource group descriptor
 * @goal: the goal block within the RG (start here to search for avail block)
 * @old_state: GFS2_BLKST_XXX the before-allocation state to find
 * @new_state: GFS2_BLKST_XXX the after-allocation block state
 *
 * Walk rgrp's bitmap to find bits that represent a block in @old_state.
 * Add the found bitmap buffer to the transaction.
 * Set the found bits to @new_state to change block's allocation state.
 *
 * This function never fails, because we wouldn't call it unless we
 * know (from reservation results, etc.) that a block is available.
 *
 * Scope of @goal and returned block is just within rgrp, not the whole
 * filesystem.
 *
 * Returns: the block number allocated
 */

static uint32_t rgblk_search(struct gfs2_rgrpd *rgd, uint32_t goal,
			     unsigned char old_state, unsigned char new_state)
{
	struct gfs2_bitmap *bi = NULL;
	uint32_t length = rgd->rd_ri.ri_length;
	uint32_t blk = 0;
	unsigned int buf, x;

	/* Find bitmap block that contains bits for goal block */
	for (buf = 0; buf < length; buf++) {
		bi = rgd->rd_bits + buf;
		if (goal < (bi->bi_start + bi->bi_len) * GFS2_NBBY)
			break;
	}

	gfs2_assert(rgd->rd_sbd, buf < length);

	/* Convert scope of "goal" from rgrp-wide to within found bit block */
	goal -= bi->bi_start * GFS2_NBBY;

	/* Search (up to entire) bitmap in this rgrp for allocatable block.
	   "x <= length", instead of "x < length", because we typically start
	   the search in the middle of a bit block, but if we can't find an
	   allocatable block anywhere else, we want to be able wrap around and
	   search in the first part of our first-searched bit block. */
	for (x = 0; x <= length; x++) {
		/* Search the clone bitmap, if one exists, in preference to
		   the live buffer data.  NOTE(review): the clone is created
		   by rgblk_free(); presumably it masks freed-but-uncommitted
		   blocks from reallocation — confirm. */
		if (bi->bi_clone)
			blk = gfs2_bitfit(rgd,
					  bi->bi_clone + bi->bi_offset,
					  bi->bi_len, goal, old_state);
		else
			blk = gfs2_bitfit(rgd,
					  bi->bi_bh->b_data + bi->bi_offset,
					  bi->bi_len, goal, old_state);
		if (blk != BFITNOENT)
			break;

		/* Try next bitmap block (wrap back to rgrp header if at end) */
		buf = (buf + 1) % length;
		bi = rgd->rd_bits + buf;
		goal = 0;
	}

	/* Paranoia: the caller reserved a block, so the search must have
	   succeeded; if it didn't (x == length + 1), withdraw the fs and
	   fall back to block 0 so we don't index past the bitmap. */
	if (gfs2_assert_withdraw(rgd->rd_sbd, x <= length))
		blk = 0;

	/* Flip the bits in the live buffer (under the transaction) and,
	   if present, in the clone as well */
	gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1);
	gfs2_setbit(rgd,
		    bi->bi_bh->b_data + bi->bi_offset,
		    bi->bi_len, blk, new_state);
	if (bi->bi_clone)
		gfs2_setbit(rgd,
			    bi->bi_clone + bi->bi_offset,
			    bi->bi_len, blk, new_state);

	return bi->bi_start * GFS2_NBBY + blk;
}
984
985/**
986 * rgblk_free - Change alloc state of given block(s)
987 * @sdp: the filesystem
988 * @bstart: the start of a run of blocks to free
989 * @blen: the length of the block run (all must lie within ONE RG!)
990 * @new_state: GFS2_BLKST_XXX the after-allocation block state
991 *
992 * Returns: Resource group containing the block(s)
993 */
994
995static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, uint64_t bstart,
996 uint32_t blen, unsigned char new_state)
997{
998 struct gfs2_rgrpd *rgd;
999 struct gfs2_bitmap *bi = NULL;
1000 uint32_t length, rgrp_blk, buf_blk;
1001 unsigned int buf;
1002
1003 rgd = gfs2_blk2rgrpd(sdp, bstart);
1004 if (!rgd) {
1005 if (gfs2_consist(sdp))
1006 fs_err(sdp, "block = %llu\n", bstart);
1007 return NULL;
1008 }
1009
1010 length = rgd->rd_ri.ri_length;
1011
1012 rgrp_blk = bstart - rgd->rd_ri.ri_data0;
1013
1014 while (blen--) {
1015 for (buf = 0; buf < length; buf++) {
1016 bi = rgd->rd_bits + buf;
1017 if (rgrp_blk < (bi->bi_start + bi->bi_len) * GFS2_NBBY)
1018 break;
1019 }
1020
1021 gfs2_assert(rgd->rd_sbd, buf < length);
1022
1023 buf_blk = rgrp_blk - bi->bi_start * GFS2_NBBY;
1024 rgrp_blk++;
1025
1026 if (!bi->bi_clone) {
1027 bi->bi_clone = kmalloc(bi->bi_bh->b_size,
1028 GFP_KERNEL | __GFP_NOFAIL);
1029 memcpy(bi->bi_clone + bi->bi_offset,
1030 bi->bi_bh->b_data + bi->bi_offset,
1031 bi->bi_len);
1032 }
1033 gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1);
1034 gfs2_setbit(rgd,
1035 bi->bi_bh->b_data + bi->bi_offset,
1036 bi->bi_len, buf_blk, new_state);
1037 }
1038
1039 return rgd;
1040}
1041
/**
 * gfs2_alloc_data - Allocate a data block
 * @ip: the inode to allocate the data block for
 *
 * Caller must hold an in-place reservation (al_rgd set by
 * gfs2_inplace_reserve()) and an active transaction.
 *
 * Returns: the allocated block (filesystem-wide block number)
 */

uint64_t gfs2_alloc_data(struct gfs2_inode *ip)
{
	struct gfs2_sbd *sdp = ip->i_sbd;
	struct gfs2_alloc *al = &ip->i_alloc;
	struct gfs2_rgrpd *rgd = al->al_rgd;
	uint32_t goal, blk;
	uint64_t block;

	/* Start the search at the inode's data-allocation goal if it lies
	   in this rgrp, otherwise at the rgrp's last data allocation */
	if (rgrp_contains_block(&rgd->rd_ri, ip->i_di.di_goal_data))
		goal = ip->i_di.di_goal_data - rgd->rd_ri.ri_data0;
	else
		goal = rgd->rd_last_alloc_data;

	blk = rgblk_search(rgd, goal,
			   GFS2_BLKST_FREE, GFS2_BLKST_USED);
	rgd->rd_last_alloc_data = blk;

	/* Convert back to a filesystem-wide block number; it becomes the
	   goal for the inode's next data allocation */
	block = rgd->rd_ri.ri_data0 + blk;
	ip->i_di.di_goal_data = block;

	gfs2_assert_withdraw(sdp, rgd->rd_rg.rg_free);
	rgd->rd_rg.rg_free--;

	/* The rgrp header lives in the first bitmap buffer */
	gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
	gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data);

	al->al_alloced++;

	/* One fewer free block; charge the inode's owner for it */
	gfs2_statfs_change(sdp, 0, -1, 0);
	gfs2_quota_change(ip, +1, ip->i_di.di_uid, ip->i_di.di_gid);

	spin_lock(&sdp->sd_rindex_spin);
	rgd->rd_free_clone--;
	spin_unlock(&sdp->sd_rindex_spin);

	return block;
}
1086
/**
 * gfs2_alloc_meta - Allocate a metadata block
 * @ip: the inode to allocate the metadata block for
 *
 * Caller must hold an in-place reservation (al_rgd set by
 * gfs2_inplace_reserve()) and an active transaction.
 *
 * Returns: the allocated block (filesystem-wide block number)
 */

uint64_t gfs2_alloc_meta(struct gfs2_inode *ip)
{
	struct gfs2_sbd *sdp = ip->i_sbd;
	struct gfs2_alloc *al = &ip->i_alloc;
	struct gfs2_rgrpd *rgd = al->al_rgd;
	uint32_t goal, blk;
	uint64_t block;

	/* Start the search at the inode's metadata goal if it lies in
	   this rgrp, otherwise at the rgrp's last metadata allocation */
	if (rgrp_contains_block(&rgd->rd_ri, ip->i_di.di_goal_meta))
		goal = ip->i_di.di_goal_meta - rgd->rd_ri.ri_data0;
	else
		goal = rgd->rd_last_alloc_meta;

	blk = rgblk_search(rgd, goal,
			   GFS2_BLKST_FREE, GFS2_BLKST_USED);
	rgd->rd_last_alloc_meta = blk;

	block = rgd->rd_ri.ri_data0 + blk;
	ip->i_di.di_goal_meta = block;

	gfs2_assert_withdraw(sdp, rgd->rd_rg.rg_free);
	rgd->rd_rg.rg_free--;

	/* The rgrp header lives in the first bitmap buffer */
	gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
	gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data);

	al->al_alloced++;

	gfs2_statfs_change(sdp, 0, -1, 0);
	gfs2_quota_change(ip, +1, ip->i_di.di_uid, ip->i_di.di_gid);
	/* NOTE(review): unlike gfs2_alloc_data(), metadata allocations also
	   call gfs2_trans_add_unrevoke() — presumably to cancel any pending
	   revoke on a reused block; confirm against the log code */
	gfs2_trans_add_unrevoke(sdp, block);

	spin_lock(&sdp->sd_rindex_spin);
	rgd->rd_free_clone--;
	spin_unlock(&sdp->sd_rindex_spin);

	return block;
}
1132
/**
 * gfs2_alloc_di - Allocate a dinode
 * @dip: the directory that the inode is going in
 *
 * Returns: the block allocated
 */

uint64_t gfs2_alloc_di(struct gfs2_inode *dip)
{
	struct gfs2_sbd *sdp = dip->i_sbd;
	struct gfs2_alloc *al = &dip->i_alloc;
	struct gfs2_rgrpd *rgd = al->al_rgd;
	uint32_t blk;
	uint64_t block;

	/* Dinodes get their own bitmap state (GFS2_BLKST_DINODE) so they
	   are distinguishable from ordinary used blocks */
	blk = rgblk_search(rgd, rgd->rd_last_alloc_meta,
			   GFS2_BLKST_FREE, GFS2_BLKST_DINODE);

	rgd->rd_last_alloc_meta = blk;

	block = rgd->rd_ri.ri_data0 + blk;

	gfs2_assert_withdraw(sdp, rgd->rd_rg.rg_free);
	rgd->rd_rg.rg_free--;
	rgd->rd_rg.rg_dinodes++;

	/* The rgrp header lives in the first bitmap buffer */
	gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
	gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data);

	al->al_alloced++;

	/* NOTE(review): no gfs2_quota_change() here, unlike the data/meta
	   allocators — presumably the caller accounts the new inode;
	   confirm against the inode-creation path */
	gfs2_statfs_change(sdp, 0, -1, +1);
	gfs2_trans_add_unrevoke(sdp, block);

	spin_lock(&sdp->sd_rindex_spin);
	rgd->rd_free_clone--;
	spin_unlock(&sdp->sd_rindex_spin);

	return block;
}
1173
/**
 * gfs2_free_data - free a contiguous run of data block(s)
 * @ip: the inode these blocks are being freed from
 * @bstart: first block of a run of contiguous blocks
 * @blen: the length of the block run (must lie within one RG)
 *
 */

void gfs2_free_data(struct gfs2_inode *ip, uint64_t bstart, uint32_t blen)
{
	struct gfs2_sbd *sdp = ip->i_sbd;
	struct gfs2_rgrpd *rgd;

	rgd = rgblk_free(sdp, bstart, blen, GFS2_BLKST_FREE);
	if (!rgd)
		return;		/* rgblk_free() already flagged the inconsistency */

	rgd->rd_rg.rg_free += blen;

	/* The rgrp header lives in the first bitmap buffer */
	gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
	gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data);

	gfs2_trans_add_rg(rgd);

	/* Return the blocks to statfs and credit the inode's owner */
	gfs2_statfs_change(sdp, 0, +blen, 0);
	gfs2_quota_change(ip, -(int64_t)blen,
			  ip->i_di.di_uid, ip->i_di.di_gid);
}
1202
/**
 * gfs2_free_meta - free a contiguous run of metadata block(s)
 * @ip: the inode these blocks are being freed from
 * @bstart: first block of a run of contiguous blocks
 * @blen: the length of the block run (must lie within one RG)
 *
 * Same as gfs2_free_data(), plus a gfs2_meta_wipe() of the freed
 * blocks' buffers.
 */

void gfs2_free_meta(struct gfs2_inode *ip, uint64_t bstart, uint32_t blen)
{
	struct gfs2_sbd *sdp = ip->i_sbd;
	struct gfs2_rgrpd *rgd;

	rgd = rgblk_free(sdp, bstart, blen, GFS2_BLKST_FREE);
	if (!rgd)
		return;		/* rgblk_free() already flagged the inconsistency */

	rgd->rd_rg.rg_free += blen;

	/* The rgrp header lives in the first bitmap buffer */
	gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
	gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data);

	gfs2_trans_add_rg(rgd);

	gfs2_statfs_change(sdp, 0, +blen, 0);
	gfs2_quota_change(ip, -(int64_t)blen,
			  ip->i_di.di_uid, ip->i_di.di_gid);
	gfs2_meta_wipe(ip, bstart, blen);
}
1232
/**
 * gfs2_free_uninit_di - free a dinode block without quota or wipe
 * @rgd: the resource group that contains the dinode
 * @blkno: the block number of the dinode
 *
 * Marks the block free in the bitmaps and updates the rgrp and statfs
 * accounting.  gfs2_free_di() builds on this and additionally adjusts
 * quota and wipes the dinode's buffers.
 */

void gfs2_free_uninit_di(struct gfs2_rgrpd *rgd, uint64_t blkno)
{
	struct gfs2_sbd *sdp = rgd->rd_sbd;
	struct gfs2_rgrpd *tmp_rgd;

	tmp_rgd = rgblk_free(sdp, blkno, 1, GFS2_BLKST_FREE);
	if (!tmp_rgd)
		return;
	/* The block must actually belong to the rgrp we were handed */
	gfs2_assert_withdraw(sdp, rgd == tmp_rgd);

	if (!rgd->rd_rg.rg_dinodes)
		gfs2_consist_rgrpd(rgd);
	rgd->rd_rg.rg_dinodes--;
	rgd->rd_rg.rg_free++;

	/* The rgrp header lives in the first bitmap buffer */
	gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
	gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data);

	/* One more free block, one fewer dinode */
	gfs2_statfs_change(sdp, 0, +1, -1);
	gfs2_trans_add_rg(rgd);
}
1254
/**
 * gfs2_free_di - free a dinode block
 * @rgd: the resource group that contains the dinode
 * @ip: the inode
 *
 */

void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip)
{
	gfs2_free_uninit_di(rgd, ip->i_num.no_addr);
	/* Credit back the quota for the dinode and discard its buffers */
	gfs2_quota_change(ip, -1, ip->i_di.di_uid, ip->i_di.di_gid);
	gfs2_meta_wipe(ip, ip->i_num.no_addr, 1);
}
1268
1269/**
1270 * gfs2_rlist_add - add a RG to a list of RGs
1271 * @sdp: the filesystem
1272 * @rlist: the list of resource groups
1273 * @block: the block
1274 *
1275 * Figure out what RG a block belongs to and add that RG to the list
1276 *
1277 * FIXME: Don't use NOFAIL
1278 *
1279 */
1280
1281void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist,
1282 uint64_t block)
1283{
1284 struct gfs2_rgrpd *rgd;
1285 struct gfs2_rgrpd **tmp;
1286 unsigned int new_space;
1287 unsigned int x;
1288
1289 if (gfs2_assert_warn(sdp, !rlist->rl_ghs))
1290 return;
1291
1292 rgd = gfs2_blk2rgrpd(sdp, block);
1293 if (!rgd) {
1294 if (gfs2_consist(sdp))
1295 fs_err(sdp, "block = %llu\n", block);
1296 return;
1297 }
1298
1299 for (x = 0; x < rlist->rl_rgrps; x++)
1300 if (rlist->rl_rgd[x] == rgd)
1301 return;
1302
1303 if (rlist->rl_rgrps == rlist->rl_space) {
1304 new_space = rlist->rl_space + 10;
1305
1306 tmp = kcalloc(new_space, sizeof(struct gfs2_rgrpd *),
1307 GFP_KERNEL | __GFP_NOFAIL);
1308
1309 if (rlist->rl_rgd) {
1310 memcpy(tmp, rlist->rl_rgd,
1311 rlist->rl_space * sizeof(struct gfs2_rgrpd *));
1312 kfree(rlist->rl_rgd);
1313 }
1314
1315 rlist->rl_space = new_space;
1316 rlist->rl_rgd = tmp;
1317 }
1318
1319 rlist->rl_rgd[rlist->rl_rgrps++] = rgd;
1320}
1321
1322/**
1323 * gfs2_rlist_alloc - all RGs have been added to the rlist, now allocate
1324 * and initialize an array of glock holders for them
1325 * @rlist: the list of resource groups
1326 * @state: the lock state to acquire the RG lock in
1327 * @flags: the modifier flags for the holder structures
1328 *
1329 * FIXME: Don't use NOFAIL
1330 *
1331 */
1332
1333void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state,
1334 int flags)
1335{
1336 unsigned int x;
1337
1338 rlist->rl_ghs = kcalloc(rlist->rl_rgrps, sizeof(struct gfs2_holder),
1339 GFP_KERNEL | __GFP_NOFAIL);
1340 for (x = 0; x < rlist->rl_rgrps; x++)
1341 gfs2_holder_init(rlist->rl_rgd[x]->rd_gl,
1342 state, flags,
1343 &rlist->rl_ghs[x]);
1344}
1345
1346/**
1347 * gfs2_rlist_free - free a resource group list
1348 * @list: the list of resource groups
1349 *
1350 */
1351
1352void gfs2_rlist_free(struct gfs2_rgrp_list *rlist)
1353{
1354 unsigned int x;
1355
1356 kfree(rlist->rl_rgd);
1357
1358 if (rlist->rl_ghs) {
1359 for (x = 0; x < rlist->rl_rgrps; x++)
1360 gfs2_holder_uninit(&rlist->rl_ghs[x]);
1361 kfree(rlist->rl_ghs);
1362 }
1363}
1364
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
new file mode 100644
index 000000000000..4c44a191b1c1
--- /dev/null
+++ b/fs/gfs2/rgrp.h
@@ -0,0 +1,62 @@
/*
 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
 *
 * This copyrighted material is made available to anyone wishing to use,
 * modify, copy, or redistribute it subject to the terms and conditions
 * of the GNU General Public License v.2.
 */

#ifndef __RGRP_DOT_H__
#define __RGRP_DOT_H__

void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd);

/* Resource group lookup and iteration */
struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, uint64_t blk);
struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp);
struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd);

void gfs2_clear_rgrpd(struct gfs2_sbd *sdp);
int gfs2_rindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ri_gh);

/* Reference counting on an rgrp's in-core bitmap buffers */
int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd);
void gfs2_rgrp_bh_hold(struct gfs2_rgrpd *rgd);
void gfs2_rgrp_bh_put(struct gfs2_rgrpd *rgd);

void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd);

/* Per-inode allocation context (ip->i_alloc) */
struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip);
void gfs2_alloc_put(struct gfs2_inode *ip);

/* The macro records the caller's file/line for diagnostics */
int gfs2_inplace_reserve_i(struct gfs2_inode *ip,
			 char *file, unsigned int line);
#define gfs2_inplace_reserve(ip) \
gfs2_inplace_reserve_i((ip), __FILE__, __LINE__)

void gfs2_inplace_release(struct gfs2_inode *ip);

unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, uint64_t block);

/* Block allocation: data blocks, metadata blocks, dinodes */
uint64_t gfs2_alloc_data(struct gfs2_inode *ip);
uint64_t gfs2_alloc_meta(struct gfs2_inode *ip);
uint64_t gfs2_alloc_di(struct gfs2_inode *ip);

/* Block deallocation */
void gfs2_free_data(struct gfs2_inode *ip, uint64_t bstart, uint32_t blen);
void gfs2_free_meta(struct gfs2_inode *ip, uint64_t bstart, uint32_t blen);
void gfs2_free_uninit_di(struct gfs2_rgrpd *rgd, uint64_t blkno);
void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip);

/* A growable set of resource groups with one glock holder per RG */
struct gfs2_rgrp_list {
	unsigned int rl_rgrps;		/* number of RGs on the list */
	unsigned int rl_space;		/* capacity of the rl_rgd array */
	struct gfs2_rgrpd **rl_rgd;	/* the RGs */
	struct gfs2_holder *rl_ghs;	/* holders, built by gfs2_rlist_alloc */
};

void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist,
		    uint64_t block);
void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state,
		      int flags);
void gfs2_rlist_free(struct gfs2_rgrp_list *rlist);

#endif /* __RGRP_DOT_H__ */
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
new file mode 100644
index 000000000000..be80771c414d
--- /dev/null
+++ b/fs/gfs2/super.c
@@ -0,0 +1,945 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <asm/semaphore.h>
16
17#include "gfs2.h"
18#include "bmap.h"
19#include "dir.h"
20#include "format.h"
21#include "glock.h"
22#include "glops.h"
23#include "inode.h"
24#include "log.h"
25#include "meta_io.h"
26#include "quota.h"
27#include "recovery.h"
28#include "rgrp.h"
29#include "super.h"
30#include "trans.h"
31#include "unlinked.h"
32
/**
 * gfs2_tune_init - Fill a gfs2_tune structure with default values
 * @gt: tune
 *
 */

void gfs2_tune_init(struct gfs2_tune *gt)
{
	spin_lock_init(&gt->gt_spin);

	/* NOTE(review): fields named *_secs appear to be in seconds and
	   the gt_greedy_* values in jiffies (HZ-based) — confirm against
	   the daemons and sysfs code that consume them */
	gt->gt_ilimit = 100;
	gt->gt_ilimit_tries = 3;
	gt->gt_ilimit_min = 1;
	gt->gt_demote_secs = 300;
	gt->gt_incore_log_blocks = 1024;
	gt->gt_log_flush_secs = 60;
	gt->gt_jindex_refresh_secs = 60;
	gt->gt_scand_secs = 15;
	gt->gt_recoverd_secs = 60;
	gt->gt_logd_secs = 1;
	gt->gt_quotad_secs = 5;
	gt->gt_inoded_secs = 15;
	gt->gt_quota_simul_sync = 64;
	gt->gt_quota_warn_period = 10;
	gt->gt_quota_scale_num = 1;
	gt->gt_quota_scale_den = 1;
	gt->gt_quota_cache_secs = 300;
	gt->gt_quota_quantum = 60;
	gt->gt_atime_quantum = 3600;
	gt->gt_new_files_jdata = 0;
	gt->gt_new_files_directio = 0;
	gt->gt_max_atomic_write = 4 << 20;	/* 4 MiB */
	gt->gt_max_readahead = 1 << 18;		/* 256 KiB */
	gt->gt_lockdump_size = 131072;		/* 128 KiB */
	gt->gt_stall_secs = 600;
	gt->gt_complain_secs = 10;
	gt->gt_reclaim_limit = 5000;
	gt->gt_entries_per_readdir = 32;
	gt->gt_prefetch_secs = 10;
	gt->gt_greedy_default = HZ / 10;
	gt->gt_greedy_quantum = HZ / 40;
	gt->gt_greedy_max = HZ / 4;
	gt->gt_statfs_quantum = 30;
	gt->gt_statfs_slow = 0;
}
78
79/**
80 * gfs2_check_sb - Check superblock
81 * @sdp: the filesystem
82 * @sb: The superblock
83 * @silent: Don't print a message if the check fails
84 *
85 * Checks the version code of the FS is one that we understand how to
86 * read and that the sizes of the various on-disk structures have not
87 * changed.
88 */
89
90int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb *sb, int silent)
91{
92 unsigned int x;
93
94 if (sb->sb_header.mh_magic != GFS2_MAGIC ||
95 sb->sb_header.mh_type != GFS2_METATYPE_SB) {
96 if (!silent)
97 printk("GFS2: not a GFS2 filesystem\n");
98 return -EINVAL;
99 }
100
101 /* If format numbers match exactly, we're done. */
102
103 if (sb->sb_fs_format == GFS2_FORMAT_FS &&
104 sb->sb_multihost_format == GFS2_FORMAT_MULTI)
105 return 0;
106
107 if (sb->sb_fs_format != GFS2_FORMAT_FS) {
108 for (x = 0; gfs2_old_fs_formats[x]; x++)
109 if (gfs2_old_fs_formats[x] == sb->sb_fs_format)
110 break;
111
112 if (!gfs2_old_fs_formats[x]) {
113 printk("GFS2: code version (%u, %u) is incompatible "
114 "with ondisk format (%u, %u)\n",
115 GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
116 sb->sb_fs_format, sb->sb_multihost_format);
117 printk("GFS2: I don't know how to upgrade this FS\n");
118 return -EINVAL;
119 }
120 }
121
122 if (sb->sb_multihost_format != GFS2_FORMAT_MULTI) {
123 for (x = 0; gfs2_old_multihost_formats[x]; x++)
124 if (gfs2_old_multihost_formats[x] == sb->sb_multihost_format)
125 break;
126
127 if (!gfs2_old_multihost_formats[x]) {
128 printk("GFS2: code version (%u, %u) is incompatible "
129 "with ondisk format (%u, %u)\n",
130 GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
131 sb->sb_fs_format, sb->sb_multihost_format);
132 printk("GFS2: I don't know how to upgrade this FS\n");
133 return -EINVAL;
134 }
135 }
136
137 if (!sdp->sd_args.ar_upgrade) {
138 printk("GFS2: code version (%u, %u) is incompatible "
139 "with ondisk format (%u, %u)\n",
140 GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
141 sb->sb_fs_format, sb->sb_multihost_format);
142 printk("GFS2: Use the \"upgrade\" mount option to upgrade "
143 "the FS\n");
144 printk("GFS2: See the manual for more details\n");
145 return -EINVAL;
146 }
147
148 return 0;
149}
150
/**
 * gfs2_read_sb - Read super block
 * @sdp: The GFS2 superblock
 * @gl: the glock for the superblock (assumed to be held)
 * @silent: Don't print message if mount fails
 *
 * Reads and validates the on-disk superblock, then derives all the
 * block-size-dependent constants kept in @sdp.
 *
 * Returns: errno
 */

int gfs2_read_sb(struct gfs2_sbd *sdp, struct gfs2_glock *gl, int silent)
{
	struct buffer_head *bh;
	uint32_t hash_blocks, ind_blocks, leaf_blocks;
	uint32_t tmp_blocks;
	unsigned int x;
	int error;

	error = gfs2_meta_read(gl, GFS2_SB_ADDR >> sdp->sd_fsb2bb_shift,
			       DIO_FORCE | DIO_START | DIO_WAIT, &bh);
	if (error) {
		if (!silent)
			fs_err(sdp, "can't read superblock\n");
		return error;
	}

	gfs2_assert(sdp, sizeof(struct gfs2_sb) <= bh->b_size);
	gfs2_sb_in(&sdp->sd_sb, bh->b_data);
	brelse(bh);

	error = gfs2_check_sb(sdp, &sdp->sd_sb, silent);
	if (error)
		return error;

	/* Derived sizes: pointers per dinode/indirect block, journaled
	   block payload, directory hash table geometry, and tags per
	   unlinked/quota-change block */
	sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift -
			       GFS2_BASIC_BLOCK_SHIFT;
	sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift;
	sdp->sd_diptrs = (sdp->sd_sb.sb_bsize -
			  sizeof(struct gfs2_dinode)) / sizeof(uint64_t);
	sdp->sd_inptrs = (sdp->sd_sb.sb_bsize -
			  sizeof(struct gfs2_meta_header)) / sizeof(uint64_t);
	sdp->sd_jbsize = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_meta_header);
	sdp->sd_hash_bsize = sdp->sd_sb.sb_bsize / 2;
	sdp->sd_hash_bsize_shift = sdp->sd_sb.sb_bsize_shift - 1;
	sdp->sd_hash_ptrs = sdp->sd_hash_bsize / sizeof(uint64_t);
	sdp->sd_ut_per_block = (sdp->sd_sb.sb_bsize -
				sizeof(struct gfs2_meta_header)) /
			       sizeof(struct gfs2_unlinked_tag);
	sdp->sd_qc_per_block = (sdp->sd_sb.sb_bsize -
				sizeof(struct gfs2_meta_header)) /
			       sizeof(struct gfs2_quota_change);

	/* Compute maximum reservation required to add a entry to a directory */

	hash_blocks = DIV_RU(sizeof(uint64_t) * (1 << GFS2_DIR_MAX_DEPTH),
			     sdp->sd_jbsize);

	ind_blocks = 0;
	for (tmp_blocks = hash_blocks; tmp_blocks > sdp->sd_diptrs;) {
		tmp_blocks = DIV_RU(tmp_blocks, sdp->sd_inptrs);
		ind_blocks += tmp_blocks;
	}

	leaf_blocks = 2 + GFS2_DIR_MAX_DEPTH;

	sdp->sd_max_dirres = hash_blocks + ind_blocks + leaf_blocks;

	/* Compute the byte capacity of a metadata tree at each height.
	   The loop stops at the first height whose capacity would overflow
	   a uint64_t: if dividing the product back by sd_inptrs doesn't
	   recover the previous value (or leaves a remainder), the multiply
	   wrapped. */

	sdp->sd_heightsize[0] = sdp->sd_sb.sb_bsize -
				sizeof(struct gfs2_dinode);
	sdp->sd_heightsize[1] = sdp->sd_sb.sb_bsize * sdp->sd_diptrs;
	for (x = 2;; x++) {
		uint64_t space, d;
		uint32_t m;

		space = sdp->sd_heightsize[x - 1] * sdp->sd_inptrs;
		d = space;
		m = do_div(d, sdp->sd_inptrs);

		if (d != sdp->sd_heightsize[x - 1] || m)
			break;
		sdp->sd_heightsize[x] = space;
	}
	sdp->sd_max_height = x;
	gfs2_assert(sdp, sdp->sd_max_height <= GFS2_MAX_META_HEIGHT);

	/* Same computation, but based on the journaled-data payload size
	   (sd_jbsize) instead of the raw block size */

	sdp->sd_jheightsize[0] = sdp->sd_sb.sb_bsize -
				 sizeof(struct gfs2_dinode);
	sdp->sd_jheightsize[1] = sdp->sd_jbsize * sdp->sd_diptrs;
	for (x = 2;; x++) {
		uint64_t space, d;
		uint32_t m;

		space = sdp->sd_jheightsize[x - 1] * sdp->sd_inptrs;
		d = space;
		m = do_div(d, sdp->sd_inptrs);

		if (d != sdp->sd_jheightsize[x - 1] || m)
			break;
		sdp->sd_jheightsize[x] = space;
	}
	sdp->sd_max_jheight = x;
	gfs2_assert(sdp, sdp->sd_max_jheight <= GFS2_MAX_META_HEIGHT);

	return 0;
}
254
/**
 * gfs2_do_upgrade - upgrade the ondisk format
 * @sdp: the filesystem
 * @sb_gl: the superblock glock
 *
 * Currently a stub: no upgrade work is implemented.
 *
 * Returns: 0
 */

int gfs2_do_upgrade(struct gfs2_sbd *sdp, struct gfs2_glock *sb_gl)
{
	return 0;
}
259
/**
 * gfs2_jindex_hold - Grab a lock on the jindex
 * @sdp: The GFS2 superblock
 * @ji_gh: the holder for the jindex glock
 *
 * This is very similar to the gfs2_rindex_hold() function, except that
 * in general we hold the jindex lock for longer periods of time and
 * we grab it far less frequently (in general) then the rgrp lock.
 *
 * On success (0), @ji_gh is left held; it is released on every failure
 * path and between instantiation passes.
 *
 * Returns: errno
 */

int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh)
{
	struct gfs2_inode *dip = get_v2ip(sdp->sd_jindex);
	struct qstr name;
	char buf[20];	/* holds "journal" + a decimal u32 */
	struct gfs2_jdesc *jd;
	int error;

	name.name = buf;

	mutex_lock(&sdp->sd_jindex_mutex);

	/* Each pass looks for the next "journal<N>" directory entry and
	   instantiates a descriptor for it; the loop ends when the lookup
	   returns -ENOENT (all journals found) or a real error occurs. */
	for (;;) {
		error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED,
					   GL_LOCAL_EXCL, ji_gh);
		if (error)
			break;

		name.len = sprintf(buf, "journal%u", sdp->sd_journals);

		error = gfs2_dir_search(get_v2ip(sdp->sd_jindex), &name, NULL, NULL);
		if (error == -ENOENT) {
			/* No more journals: success, ji_gh stays held */
			error = 0;
			break;
		}

		/* Drop the glock while instantiating the journal inode;
		   it is reacquired at the top of the next pass */
		gfs2_glock_dq_uninit(ji_gh);

		if (error)
			break;

		error = -ENOMEM;
		jd = kzalloc(sizeof(struct gfs2_jdesc), GFP_KERNEL);
		if (!jd)
			break;

		error = gfs2_lookupi(sdp->sd_jindex, &name, 1, &jd->jd_inode);
		if (error) {
			kfree(jd);
			break;
		}

		/* Publish the new descriptor under the jindex spinlock */
		spin_lock(&sdp->sd_jindex_spin);
		jd->jd_jid = sdp->sd_journals++;
		list_add_tail(&jd->jd_list, &sdp->sd_jindex_list);
		spin_unlock(&sdp->sd_jindex_spin);
	}

	mutex_unlock(&sdp->sd_jindex_mutex);

	return error;
}
324
/**
 * gfs2_jindex_free - Clear all the journal index information
 * @sdp: The GFS2 superblock
 *
 */

void gfs2_jindex_free(struct gfs2_sbd *sdp)
{
	struct list_head list;
	struct gfs2_jdesc *jd;

	/* Splice the whole jindex list onto a local head while holding
	   the lock, so the descriptors can be torn down without it */
	spin_lock(&sdp->sd_jindex_spin);
	list_add(&list, &sdp->sd_jindex_list);
	list_del_init(&sdp->sd_jindex_list);
	sdp->sd_journals = 0;
	spin_unlock(&sdp->sd_jindex_spin);

	/* Drop each journal inode and free its descriptor */
	while (!list_empty(&list)) {
		jd = list_entry(list.next, struct gfs2_jdesc, jd_list);
		list_del(&jd->jd_list);
		iput(jd->jd_inode);
		kfree(jd);
	}
}
349
350static struct gfs2_jdesc *jdesc_find_i(struct list_head *head, unsigned int jid)
351{
352 struct gfs2_jdesc *jd;
353 int found = 0;
354
355 list_for_each_entry(jd, head, jd_list) {
356 if (jd->jd_jid == jid) {
357 found = 1;
358 break;
359 }
360 }
361
362 if (!found)
363 jd = NULL;
364
365 return jd;
366}
367
368struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid)
369{
370 struct gfs2_jdesc *jd;
371
372 spin_lock(&sdp->sd_jindex_spin);
373 jd = jdesc_find_i(&sdp->sd_jindex_list, jid);
374 spin_unlock(&sdp->sd_jindex_spin);
375
376 return jd;
377}
378
/**
 * gfs2_jdesc_make_dirty - mark a journal's descriptor dirty
 * @sdp: the filesystem
 * @jid: the journal id
 *
 * Sets jd_dirty so gfs2_jdesc_find_dirty() will return this journal.
 * An unknown @jid is silently ignored.
 */

void gfs2_jdesc_make_dirty(struct gfs2_sbd *sdp, unsigned int jid)
{
	struct gfs2_jdesc *jd;

	spin_lock(&sdp->sd_jindex_spin);
	jd = jdesc_find_i(&sdp->sd_jindex_list, jid);
	if (jd)
		jd->jd_dirty = 1;
	spin_unlock(&sdp->sd_jindex_spin);
}
389
390struct gfs2_jdesc *gfs2_jdesc_find_dirty(struct gfs2_sbd *sdp)
391{
392 struct gfs2_jdesc *jd;
393 int found = 0;
394
395 spin_lock(&sdp->sd_jindex_spin);
396
397 list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
398 if (jd->jd_dirty) {
399 jd->jd_dirty = 0;
400 found = 1;
401 break;
402 }
403 }
404 spin_unlock(&sdp->sd_jindex_spin);
405
406 if (!found)
407 jd = NULL;
408
409 return jd;
410}
411
/**
 * gfs2_jdesc_check - sanity-check a journal's size and allocation
 * @jd: the journal descriptor
 *
 * Verifies the journal inode is between 8MB and 1GB, block-aligned,
 * and fully allocated on disk; also fills in jd->jd_blocks.
 *
 * Returns: 0 on success, -EIO (after marking the fs inconsistent)
 *          or another errno on failure
 */

int gfs2_jdesc_check(struct gfs2_jdesc *jd)
{
	struct gfs2_inode *ip = get_v2ip(jd->jd_inode);
	struct gfs2_sbd *sdp = ip->i_sbd;
	int ar;
	int error;

	/* 8MB <= size <= 1GB, and a multiple of the block size */
	if (ip->i_di.di_size < (8 << 20) ||
	    ip->i_di.di_size > (1 << 30) ||
	    (ip->i_di.di_size & (sdp->sd_sb.sb_bsize - 1))) {
		gfs2_consist_inode(ip);
		return -EIO;
	}
	jd->jd_blocks = ip->i_di.di_size >> sdp->sd_sb.sb_bsize_shift;

	/* The journal must not be sparse: writing anywhere inside it
	   must never require block allocation */
	error = gfs2_write_alloc_required(ip,
					  0, ip->i_di.di_size,
					  &ar);
	if (!error && ar) {
		gfs2_consist_inode(ip);
		error = -EIO;
	}

	return error;
}
437
438int gfs2_lookup_master_dir(struct gfs2_sbd *sdp)
439{
440 struct inode *inode = NULL;
441 struct gfs2_glock *gl;
442 int error;
443
444 error = gfs2_glock_get(sdp,
445 sdp->sd_sb.sb_master_dir.no_addr,
446 &gfs2_inode_glops, CREATE, &gl);
447 if (!error) {
448 error = gfs2_lookup_simple(sdp->sd_root_dir, ".gfs2_admin", &inode);
449 sdp->sd_master_dir = inode;
450 gfs2_glock_put(gl);
451 }
452
453 return error;
454}
455
/**
 * gfs2_make_fs_rw - Turn a Read-Only FS into a Read-Write one
 * @sdp: the filesystem
 *
 * Returns: errno
 */

int gfs2_make_fs_rw(struct gfs2_sbd *sdp)
{
	struct gfs2_glock *j_gl = get_v2ip(sdp->sd_jdesc->jd_inode)->i_gl;
	struct gfs2_holder t_gh;
	struct gfs2_log_header head;
	int error;

	error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED,
				   GL_LOCAL_EXCL | GL_NEVER_RECURSE, &t_gh);
	if (error)
		return error;

	/* Toss cached journal metadata so the log head is read fresh */
	gfs2_meta_cache_flush(get_v2ip(sdp->sd_jdesc->jd_inode));
	j_gl->gl_ops->go_inval(j_gl, DIO_METADATA | DIO_DATA);

	error = gfs2_find_jhead(sdp->sd_jdesc, &head);
	if (error)
		goto fail;

	/* Our journal must end with a clean-unmount record; anything else
	   means it needs recovery and going rw now would corrupt the fs */
	if (!(head.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) {
		gfs2_consist(sdp);
		error = -EIO;
		goto fail;
	}

	/* Initialize some head of the log stuff */
	sdp->sd_log_sequence = head.lh_sequence + 1;
	gfs2_log_pointers_init(sdp, head.lh_blkno);

	error = gfs2_unlinked_init(sdp);
	if (error)
		goto fail;
	error = gfs2_quota_init(sdp);
	if (error)
		goto fail_unlinked;

	set_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags);

	gfs2_glock_dq_uninit(&t_gh);

	return 0;

 fail_unlinked:
	gfs2_unlinked_cleanup(sdp);

 fail:
	/* Don't keep the transaction glock cached when bailing out */
	t_gh.gh_flags |= GL_NOCACHE;
	gfs2_glock_dq_uninit(&t_gh);

	return error;
}
514
/**
 * gfs2_make_fs_ro - Turn a Read-Write FS into a Read-Only one
 * @sdp: the filesystem
 *
 * Returns: errno
 */

int gfs2_make_fs_ro(struct gfs2_sbd *sdp)
{
	struct gfs2_holder t_gh;
	int error;

	/* Push this node's dirty state to disk before quiescing */
	gfs2_unlinked_dealloc(sdp);
	gfs2_quota_sync(sdp);
	gfs2_statfs_sync(sdp);

	error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED,
				   GL_LOCAL_EXCL | GL_NEVER_RECURSE | GL_NOCACHE,
				   &t_gh);
	/* If the fs has already been shut down, keep tearing down anyway */
	if (error && !test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
		return error;

	gfs2_meta_syncfs(sdp);
	gfs2_log_shutdown(sdp);

	clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags);

	/* The holder only has a glock if gfs2_glock_nq_init() succeeded */
	if (t_gh.gh_gl)
		gfs2_glock_dq_uninit(&t_gh);

	gfs2_unlinked_cleanup(sdp);
	gfs2_quota_cleanup(sdp);

	return error;
}
550
551int gfs2_statfs_init(struct gfs2_sbd *sdp)
552{
553 struct gfs2_inode *m_ip = get_v2ip(sdp->sd_statfs_inode);
554 struct gfs2_statfs_change *m_sc = &sdp->sd_statfs_master;
555 struct gfs2_inode *l_ip = get_v2ip(sdp->sd_sc_inode);
556 struct gfs2_statfs_change *l_sc = &sdp->sd_statfs_local;
557 struct buffer_head *m_bh, *l_bh;
558 struct gfs2_holder gh;
559 int error;
560
561 error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE, GL_NOCACHE,
562 &gh);
563 if (error)
564 return error;
565
566 error = gfs2_meta_inode_buffer(m_ip, &m_bh);
567 if (error)
568 goto out;
569
570 if (sdp->sd_args.ar_spectator) {
571 spin_lock(&sdp->sd_statfs_spin);
572 gfs2_statfs_change_in(m_sc, m_bh->b_data +
573 sizeof(struct gfs2_dinode));
574 spin_unlock(&sdp->sd_statfs_spin);
575 } else {
576 error = gfs2_meta_inode_buffer(l_ip, &l_bh);
577 if (error)
578 goto out_m_bh;
579
580 spin_lock(&sdp->sd_statfs_spin);
581 gfs2_statfs_change_in(m_sc, m_bh->b_data +
582 sizeof(struct gfs2_dinode));
583 gfs2_statfs_change_in(l_sc, l_bh->b_data +
584 sizeof(struct gfs2_dinode));
585 spin_unlock(&sdp->sd_statfs_spin);
586
587 brelse(l_bh);
588 }
589
590 out_m_bh:
591 brelse(m_bh);
592
593 out:
594 gfs2_glock_dq_uninit(&gh);
595
596 return 0;
597}
598
/**
 * gfs2_statfs_change - record a change in this node's local statfs deltas
 * @sdp: the filesystem
 * @total: change in the total block count
 * @free: change in the free block count
 * @dinodes: change in the dinode count
 *
 * The change is accumulated in the node-local statfs file; it is folded
 * into the master statfs file later by gfs2_statfs_sync().
 */

void gfs2_statfs_change(struct gfs2_sbd *sdp, int64_t total, int64_t free,
			int64_t dinodes)
{
	struct gfs2_inode *l_ip = get_v2ip(sdp->sd_sc_inode);
	struct gfs2_statfs_change *l_sc = &sdp->sd_statfs_local;
	struct buffer_head *l_bh;
	int error;

	error = gfs2_meta_inode_buffer(l_ip, &l_bh);
	if (error)
		return;		/* the change is silently dropped on read failure */

	/* sd_statfs_mutex serializes adding the buffer to the transaction;
	   sd_statfs_spin protects the in-core counters themselves */
	mutex_lock(&sdp->sd_statfs_mutex);
	gfs2_trans_add_bh(l_ip->i_gl, l_bh, 1);
	mutex_unlock(&sdp->sd_statfs_mutex);

	spin_lock(&sdp->sd_statfs_spin);
	l_sc->sc_total += total;
	l_sc->sc_free += free;
	l_sc->sc_dinodes += dinodes;
	gfs2_statfs_change_out(l_sc, l_bh->b_data +
			       sizeof(struct gfs2_dinode));
	spin_unlock(&sdp->sd_statfs_spin);

	brelse(l_bh);
}
625
/**
 * gfs2_statfs_sync - fold this node's local statfs deltas into the master
 * @sdp: the filesystem
 *
 * Re-reads the master counts under an exclusive glock, then, if the
 * local deltas are nonzero, adds them into the master file and zeroes
 * the local file — all inside one transaction.
 *
 * Returns: errno
 */

int gfs2_statfs_sync(struct gfs2_sbd *sdp)
{
	struct gfs2_inode *m_ip = get_v2ip(sdp->sd_statfs_inode);
	struct gfs2_inode *l_ip = get_v2ip(sdp->sd_sc_inode);
	struct gfs2_statfs_change *m_sc = &sdp->sd_statfs_master;
	struct gfs2_statfs_change *l_sc = &sdp->sd_statfs_local;
	struct gfs2_holder gh;
	struct buffer_head *m_bh, *l_bh;
	int error;

	error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE, GL_NOCACHE,
				   &gh);
	if (error)
		return error;

	error = gfs2_meta_inode_buffer(m_ip, &m_bh);
	if (error)
		goto out;

	/* Refresh the in-core master counts; nothing to do if this node
	   has no accumulated deltas */
	spin_lock(&sdp->sd_statfs_spin);
	gfs2_statfs_change_in(m_sc, m_bh->b_data +
			      sizeof(struct gfs2_dinode));
	if (!l_sc->sc_total && !l_sc->sc_free && !l_sc->sc_dinodes) {
		spin_unlock(&sdp->sd_statfs_spin);
		goto out_bh;
	}
	spin_unlock(&sdp->sd_statfs_spin);

	error = gfs2_meta_inode_buffer(l_ip, &l_bh);
	if (error)
		goto out_bh;

	error = gfs2_trans_begin(sdp, 2 * RES_DINODE, 0);
	if (error)
		goto out_bh2;

	mutex_lock(&sdp->sd_statfs_mutex);
	gfs2_trans_add_bh(l_ip->i_gl, l_bh, 1);
	mutex_unlock(&sdp->sd_statfs_mutex);

	/* Fold the local deltas into the master counts and zero them,
	   both in core and on disk, under the spinlock */
	spin_lock(&sdp->sd_statfs_spin);
	m_sc->sc_total += l_sc->sc_total;
	m_sc->sc_free += l_sc->sc_free;
	m_sc->sc_dinodes += l_sc->sc_dinodes;
	memset(l_sc, 0, sizeof(struct gfs2_statfs_change));
	memset(l_bh->b_data + sizeof(struct gfs2_dinode),
	       0, sizeof(struct gfs2_statfs_change));
	spin_unlock(&sdp->sd_statfs_spin);

	gfs2_trans_add_bh(m_ip->i_gl, m_bh, 1);
	gfs2_statfs_change_out(m_sc, m_bh->b_data + sizeof(struct gfs2_dinode));

	gfs2_trans_end(sdp);

 out_bh2:
	brelse(l_bh);

 out_bh:
	brelse(m_bh);

 out:
	gfs2_glock_dq_uninit(&gh);

	return error;
}
691
692/**
693 * gfs2_statfs_i - Do a statfs
694 * @sdp: the filesystem
695 * @sg: the sg structure
696 *
697 * Returns: errno
698 */
699
700int gfs2_statfs_i(struct gfs2_sbd *sdp, struct gfs2_statfs_change *sc)
701{
702 struct gfs2_statfs_change *m_sc = &sdp->sd_statfs_master;
703 struct gfs2_statfs_change *l_sc = &sdp->sd_statfs_local;
704
705 spin_lock(&sdp->sd_statfs_spin);
706
707 *sc = *m_sc;
708 sc->sc_total += l_sc->sc_total;
709 sc->sc_free += l_sc->sc_free;
710 sc->sc_dinodes += l_sc->sc_dinodes;
711
712 spin_unlock(&sdp->sd_statfs_spin);
713
714 if (sc->sc_free < 0)
715 sc->sc_free = 0;
716 if (sc->sc_free > sc->sc_total)
717 sc->sc_free = sc->sc_total;
718 if (sc->sc_dinodes < 0)
719 sc->sc_dinodes = 0;
720
721 return 0;
722}
723
/**
 * statfs_slow_fill - add one resource group's counts into the totals
 * @rgd: the RG whose counters are accumulated
 * @sc: the statfs accumulator to update
 *
 * Returns: 0 (always; the old "-ESTALE if the LVB is invalid" return
 * documented here no longer exists in the code)
 */

static int statfs_slow_fill(struct gfs2_rgrpd *rgd,
			    struct gfs2_statfs_change *sc)
{
	gfs2_rgrp_verify(rgd);
	sc->sc_total += rgd->rd_ri.ri_data;
	sc->sc_free += rgd->rd_rg.rg_free;
	sc->sc_dinodes += rgd->rd_rg.rg_dinodes;
	return 0;
}
741
/**
 * gfs2_statfs_slow - Stat a filesystem using asynchronous locking
 * @sdp: the filesystem
 * @sc: the sc info that will be returned
 *
 * Walks every resource group, keeping a window of up to 64 rgrp glocks
 * in flight (GL_ASYNC) and folding each group's counts into @sc as the
 * grants complete.  After the first error, no new locks are requested
 * but all outstanding holders are still drained before returning.
 *
 * Any error (other than a signal) will cause this routine to fall back
 * to the synchronous version.
 *
 * FIXME: This really shouldn't busy wait like this.
 *
 * Returns: errno
 */

int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change *sc)
{
	struct gfs2_holder ri_gh;
	struct gfs2_rgrpd *rgd_next;
	struct gfs2_holder *gha, *gh;
	unsigned int slots = 64;	/* max concurrent async lock requests */
	unsigned int x;
	int done;
	int error = 0, err;

	memset(sc, 0, sizeof(struct gfs2_statfs_change));
	gha = kcalloc(slots, sizeof(struct gfs2_holder), GFP_KERNEL);
	if (!gha)
		return -ENOMEM;

	/* Hold the rindex so the set of resource groups is stable */
	error = gfs2_rindex_hold(sdp, &ri_gh);
	if (error)
		goto out;

	rgd_next = gfs2_rgrpd_get_first(sdp);

	for (;;) {
		done = 1;

		for (x = 0; x < slots; x++) {
			gh = gha + x;

			/* Reap any slot whose async grant has completed;
			   count the RG only if no earlier error occurred */
			if (gh->gh_gl && gfs2_glock_poll(gh)) {
				err = gfs2_glock_wait(gh);
				if (err) {
					gfs2_holder_uninit(gh);
					error = err;
				} else {
					if (!error)
						error = statfs_slow_fill(get_gl2rgd(gh->gh_gl), sc);
					gfs2_glock_dq_uninit(gh);
				}
			}

			/* Slot still busy, or refill it with the next RG
			   (only while no error has been seen) */
			if (gh->gh_gl)
				done = 0;
			else if (rgd_next && !error) {
				error = gfs2_glock_nq_init(rgd_next->rd_gl,
							   LM_ST_SHARED,
							   GL_ASYNC,
							   gh);
				rgd_next = gfs2_rgrpd_get_next(rgd_next);
				done = 0;
			}

			if (signal_pending(current))
				error = -ERESTARTSYS;
		}

		if (done)
			break;

		yield();	/* busy-wait between polling sweeps (see FIXME) */
	}

	gfs2_glock_dq_uninit(&ri_gh);

 out:
	kfree(gha);

	return error;
}
822
/* One journal glock held during gfs2_lock_fs_check_clean(); kept on a
   local list so all holders can be dropped on the way out */
struct lfcc {
	struct list_head list;
	struct gfs2_holder gh;
};
827
828/**
829 * gfs2_lock_fs_check_clean - Stop all writes to the FS and check that all
830 * journals are clean
831 * @sdp: the file system
832 * @state: the state to put the transaction lock into
833 * @t_gh: the hold on the transaction lock
834 *
835 * Returns: errno
836 */
837
838int gfs2_lock_fs_check_clean(struct gfs2_sbd *sdp, struct gfs2_holder *t_gh)
839{
840 struct gfs2_holder ji_gh;
841 struct gfs2_jdesc *jd;
842 struct lfcc *lfcc;
843 LIST_HEAD(list);
844 struct gfs2_log_header lh;
845 int error;
846
847 error = gfs2_jindex_hold(sdp, &ji_gh);
848 if (error)
849 return error;
850
851 list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
852 lfcc = kmalloc(sizeof(struct lfcc), GFP_KERNEL);
853 if (!lfcc) {
854 error = -ENOMEM;
855 goto out;
856 }
857 error = gfs2_glock_nq_init(get_v2ip(jd->jd_inode)->i_gl, LM_ST_SHARED, 0,
858 &lfcc->gh);
859 if (error) {
860 kfree(lfcc);
861 goto out;
862 }
863 list_add(&lfcc->list, &list);
864 }
865
866 error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_DEFERRED,
867 LM_FLAG_PRIORITY | GL_NEVER_RECURSE | GL_NOCACHE,
868 t_gh);
869
870 list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
871 error = gfs2_jdesc_check(jd);
872 if (error)
873 break;
874 error = gfs2_find_jhead(jd, &lh);
875 if (error)
876 break;
877 if (!(lh.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) {
878 error = -EBUSY;
879 break;
880 }
881 }
882
883 if (error)
884 gfs2_glock_dq_uninit(t_gh);
885
886 out:
887 while (!list_empty(&list)) {
888 lfcc = list_entry(list.next, struct lfcc, list);
889 list_del(&lfcc->list);
890 gfs2_glock_dq_uninit(&lfcc->gh);
891 kfree(lfcc);
892 }
893 gfs2_glock_dq_uninit(&ji_gh);
894
895 return error;
896}
897
898/**
899 * gfs2_freeze_fs - freezes the file system
900 * @sdp: the file system
901 *
902 * This function flushes data and meta data for all machines by
903 * aquiring the transaction log exclusively. All journals are
904 * ensured to be in a clean state as well.
905 *
906 * Returns: errno
907 */
908
909int gfs2_freeze_fs(struct gfs2_sbd *sdp)
910{
911 int error = 0;
912
913 mutex_lock(&sdp->sd_freeze_lock);
914
915 if (!sdp->sd_freeze_count++) {
916 error = gfs2_lock_fs_check_clean(sdp, &sdp->sd_freeze_gh);
917 if (error)
918 sdp->sd_freeze_count--;
919 }
920
921 mutex_unlock(&sdp->sd_freeze_lock);
922
923 return error;
924}
925
926/**
927 * gfs2_unfreeze_fs - unfreezes the file system
928 * @sdp: the file system
929 *
930 * This function allows the file system to proceed by unlocking
931 * the exclusively held transaction lock. Other GFS2 nodes are
932 * now free to acquire the lock shared and go on with their lives.
933 *
934 */
935
936void gfs2_unfreeze_fs(struct gfs2_sbd *sdp)
937{
938 mutex_lock(&sdp->sd_freeze_lock);
939
940 if (sdp->sd_freeze_count && !--sdp->sd_freeze_count)
941 gfs2_glock_dq_uninit(&sdp->sd_freeze_gh);
942
943 mutex_unlock(&sdp->sd_freeze_lock);
944}
945
diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h
new file mode 100644
index 000000000000..cc1a3df1949a
--- /dev/null
+++ b/fs/gfs2/super.h
@@ -0,0 +1,55 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __SUPER_DOT_H__
11#define __SUPER_DOT_H__
12
13void gfs2_tune_init(struct gfs2_tune *gt);
14
15int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb *sb, int silent);
16int gfs2_read_sb(struct gfs2_sbd *sdp, struct gfs2_glock *gl, int silent);
17int gfs2_do_upgrade(struct gfs2_sbd *sdp, struct gfs2_glock *gl_sb);
18
/* Number of journals in the jindex; sd_journals is read under
   sd_jindex_spin to get a consistent snapshot */
static inline unsigned int gfs2_jindex_size(struct gfs2_sbd *sdp)
{
	unsigned int x;
	spin_lock(&sdp->sd_jindex_spin);
	x = sdp->sd_journals;
	spin_unlock(&sdp->sd_jindex_spin);
	return x;
}
27
28int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh);
29void gfs2_jindex_free(struct gfs2_sbd *sdp);
30
31struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid);
32void gfs2_jdesc_make_dirty(struct gfs2_sbd *sdp, unsigned int jid);
33struct gfs2_jdesc *gfs2_jdesc_find_dirty(struct gfs2_sbd *sdp);
34int gfs2_jdesc_check(struct gfs2_jdesc *jd);
35
36int gfs2_lookup_master_dir(struct gfs2_sbd *sdp);
37int gfs2_lookup_in_master_dir(struct gfs2_sbd *sdp, char *filename,
38 struct gfs2_inode **ipp);
39
40int gfs2_make_fs_rw(struct gfs2_sbd *sdp);
41int gfs2_make_fs_ro(struct gfs2_sbd *sdp);
42
43int gfs2_statfs_init(struct gfs2_sbd *sdp);
44void gfs2_statfs_change(struct gfs2_sbd *sdp,
45 int64_t total, int64_t free, int64_t dinodes);
46int gfs2_statfs_sync(struct gfs2_sbd *sdp);
47int gfs2_statfs_i(struct gfs2_sbd *sdp, struct gfs2_statfs_change *sc);
48int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change *sc);
49
50int gfs2_lock_fs_check_clean(struct gfs2_sbd *sdp, struct gfs2_holder *t_gh);
51int gfs2_freeze_fs(struct gfs2_sbd *sdp);
52void gfs2_unfreeze_fs(struct gfs2_sbd *sdp);
53
54#endif /* __SUPER_DOT_H__ */
55
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
new file mode 100644
index 000000000000..c5984351e4d8
--- /dev/null
+++ b/fs/gfs2/sys.c
@@ -0,0 +1,640 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/module.h>
16#include <linux/kobject.h>
17#include <asm/semaphore.h>
18#include <asm/uaccess.h>
19
20#include "gfs2.h"
21#include "lm.h"
22#include "sys.h"
23#include "super.h"
24#include "glock.h"
25#include "quota.h"
26
27char *gfs2_sys_margs;
28spinlock_t gfs2_sys_margs_lock;
29
30static ssize_t id_show(struct gfs2_sbd *sdp, char *buf)
31{
32 return sprintf(buf, "%s\n", sdp->sd_vfs->s_id);
33}
34
35static ssize_t fsname_show(struct gfs2_sbd *sdp, char *buf)
36{
37 return sprintf(buf, "%s\n", sdp->sd_fsname);
38}
39
40static ssize_t freeze_show(struct gfs2_sbd *sdp, char *buf)
41{
42 unsigned int count;
43
44 mutex_lock(&sdp->sd_freeze_lock);
45 count = sdp->sd_freeze_count;
46 mutex_unlock(&sdp->sd_freeze_lock);
47
48 return sprintf(buf, "%u\n", count);
49}
50
51static ssize_t freeze_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
52{
53 ssize_t ret = len;
54 int error = 0;
55 int n = simple_strtol(buf, NULL, 0);
56
57 if (!capable(CAP_SYS_ADMIN))
58 return -EACCES;
59
60 switch (n) {
61 case 0:
62 gfs2_unfreeze_fs(sdp);
63 break;
64 case 1:
65 error = gfs2_freeze_fs(sdp);
66 break;
67 default:
68 ret = -EINVAL;
69 }
70
71 if (error)
72 fs_warn(sdp, "freeze %d error %d", n, error);
73
74 return ret;
75}
76
/* sysfs read: 1 if this node has withdrawn (SDF_SHUTDOWN set), else 0 */
static ssize_t withdraw_show(struct gfs2_sbd *sdp, char *buf)
{
	unsigned int b = test_bit(SDF_SHUTDOWN, &sdp->sd_flags);
	return sprintf(buf, "%u\n", b);
}
82
/* sysfs write: "1" forces this node to withdraw from the cluster */
static ssize_t withdraw_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
{
	if (!capable(CAP_SYS_ADMIN))
		return -EACCES;

	if (simple_strtol(buf, NULL, 0) != 1)
		return -EINVAL;

	gfs2_lm_withdraw(sdp,
		"GFS2: fsid=%s: withdrawing from cluster at user's request\n",
		sdp->sd_fsname);
	return len;
}
96
97static ssize_t statfs_sync_store(struct gfs2_sbd *sdp, const char *buf,
98 size_t len)
99{
100 if (!capable(CAP_SYS_ADMIN))
101 return -EACCES;
102
103 if (simple_strtol(buf, NULL, 0) != 1)
104 return -EINVAL;
105
106 gfs2_statfs_sync(sdp);
107 return len;
108}
109
/* sysfs write: "1" clears unheld cached glocks without waiting */
static ssize_t shrink_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
{
	if (!capable(CAP_SYS_ADMIN))
		return -EACCES;

	if (simple_strtol(buf, NULL, 0) != 1)
		return -EINVAL;

	gfs2_gl_hash_clear(sdp, NO_WAIT);
	return len;
}
121
122static ssize_t quota_sync_store(struct gfs2_sbd *sdp, const char *buf,
123 size_t len)
124{
125 if (!capable(CAP_SYS_ADMIN))
126 return -EACCES;
127
128 if (simple_strtol(buf, NULL, 0) != 1)
129 return -EINVAL;
130
131 gfs2_quota_sync(sdp);
132 return len;
133}
134
/* sysfs write: refresh the quota info for one user id (second argument
   1 selects the user — as opposed to group — quota) */
static ssize_t quota_refresh_user_store(struct gfs2_sbd *sdp, const char *buf,
					size_t len)
{
	uint32_t id;

	if (!capable(CAP_SYS_ADMIN))
		return -EACCES;

	id = simple_strtoul(buf, NULL, 0);

	gfs2_quota_refresh(sdp, 1, id);
	return len;
}
148
/* sysfs write: refresh the quota info for one group id (second argument
   0 selects the group quota) */
static ssize_t quota_refresh_group_store(struct gfs2_sbd *sdp, const char *buf,
					 size_t len)
{
	uint32_t id;

	if (!capable(CAP_SYS_ADMIN))
		return -EACCES;

	id = simple_strtoul(buf, NULL, 0);

	gfs2_quota_refresh(sdp, 0, id);
	return len;
}
162
/* Top-level sysfs attribute: name plus optional show/store callbacks
   that take the superblock directly */
struct gfs2_attr {
	struct attribute attr;
	ssize_t (*show)(struct gfs2_sbd *, char *);
	ssize_t (*store)(struct gfs2_sbd *, const char *, size_t);
};

#define GFS2_ATTR(name, mode, show, store) \
static struct gfs2_attr gfs2_attr_##name = __ATTR(name, mode, show, store)

/* mode 0444 = read-only, 0200 = write-only, 0644 = read-write */
GFS2_ATTR(id, 0444, id_show, NULL);
GFS2_ATTR(fsname, 0444, fsname_show, NULL);
GFS2_ATTR(freeze, 0644, freeze_show, freeze_store);
GFS2_ATTR(shrink, 0200, NULL, shrink_store);
GFS2_ATTR(withdraw, 0644, withdraw_show, withdraw_store);
GFS2_ATTR(statfs_sync, 0200, NULL, statfs_sync_store);
GFS2_ATTR(quota_sync, 0200, NULL, quota_sync_store);
GFS2_ATTR(quota_refresh_user, 0200, NULL, quota_refresh_user_store);
GFS2_ATTR(quota_refresh_group, 0200, NULL, quota_refresh_group_store);

/* Default attributes installed with the per-sb kobject */
static struct attribute *gfs2_attrs[] = {
	&gfs2_attr_id.attr,
	&gfs2_attr_fsname.attr,
	&gfs2_attr_freeze.attr,
	&gfs2_attr_shrink.attr,
	&gfs2_attr_withdraw.attr,
	&gfs2_attr_statfs_sync.attr,
	&gfs2_attr_quota_sync.attr,
	&gfs2_attr_quota_refresh_user.attr,
	&gfs2_attr_quota_refresh_group.attr,
	NULL,
};
194
195static ssize_t gfs2_attr_show(struct kobject *kobj, struct attribute *attr,
196 char *buf)
197{
198 struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj);
199 struct gfs2_attr *a = container_of(attr, struct gfs2_attr, attr);
200 return a->show ? a->show(sdp, buf) : 0;
201}
202
203static ssize_t gfs2_attr_store(struct kobject *kobj, struct attribute *attr,
204 const char *buf, size_t len)
205{
206 struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj);
207 struct gfs2_attr *a = container_of(attr, struct gfs2_attr, attr);
208 return a->store ? a->store(sdp, buf, len) : len;
209}
210
/* Glue the gfs2_attr show/store dispatchers into generic sysfs */
static struct sysfs_ops gfs2_attr_ops = {
	.show = gfs2_attr_show,
	.store = gfs2_attr_store,
};

static struct kobj_type gfs2_ktype = {
	.default_attrs = gfs2_attrs,
	.sysfs_ops = &gfs2_attr_ops,
};

/* /sys/fs/gfs2 — parent of every per-superblock kobject */
static struct kset gfs2_kset = {
	.subsys = &fs_subsys,
	.kobj = {.name = "gfs2",},
	.ktype = &gfs2_ktype,
};
226
/*
 * display struct lm_lockstruct fields
 * (read-only; one file per field under <sb>/lockstruct/)
 */

struct lockstruct_attr {
	struct attribute attr;
	ssize_t (*show)(struct gfs2_sbd *, char *);
};

/* Generates a show function printing sd_lockstruct.ls_<name> with fmt */
#define LOCKSTRUCT_ATTR(name, fmt) \
static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf) \
{ \
	return sprintf(buf, fmt, sdp->sd_lockstruct.ls_##name); \
} \
static struct lockstruct_attr lockstruct_attr_##name = __ATTR_RO(name)

LOCKSTRUCT_ATTR(jid, "%u\n");
LOCKSTRUCT_ATTR(first, "%u\n");
LOCKSTRUCT_ATTR(lvb_size, "%u\n");
LOCKSTRUCT_ATTR(flags, "%d\n");

static struct attribute *lockstruct_attrs[] = {
	&lockstruct_attr_jid.attr,
	&lockstruct_attr_first.attr,
	&lockstruct_attr_lvb_size.attr,
	&lockstruct_attr_flags.attr,
	NULL
};

/*
 * display struct gfs2_args fields
 * (read-only; one file per mount argument under <sb>/args/)
 */

struct args_attr {
	struct attribute attr;
	ssize_t (*show)(struct gfs2_sbd *, char *);
};

/* Generates a show function printing sd_args.ar_<name> with fmt */
#define ARGS_ATTR(name, fmt) \
static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf) \
{ \
	return sprintf(buf, fmt, sdp->sd_args.ar_##name); \
} \
static struct args_attr args_attr_##name = __ATTR_RO(name)

ARGS_ATTR(lockproto, "%s\n");
ARGS_ATTR(locktable, "%s\n");
ARGS_ATTR(hostdata, "%s\n");
ARGS_ATTR(spectator, "%d\n");
ARGS_ATTR(ignore_local_fs, "%d\n");
ARGS_ATTR(localcaching, "%d\n");
ARGS_ATTR(localflocks, "%d\n");
ARGS_ATTR(debug, "%d\n");
ARGS_ATTR(upgrade, "%d\n");
ARGS_ATTR(num_glockd, "%u\n");
ARGS_ATTR(posix_acl, "%d\n");
ARGS_ATTR(quota, "%u\n");
ARGS_ATTR(suiddir, "%d\n");
ARGS_ATTR(data, "%d\n");

/* one oddball doesn't fit the macro mold: noatime lives in sd_flags,
   not sd_args */
static ssize_t noatime_show(struct gfs2_sbd *sdp, char *buf)
{
	return sprintf(buf, "%d\n", !!test_bit(SDF_NOATIME, &sdp->sd_flags));
}
static struct args_attr args_attr_noatime = __ATTR_RO(noatime);

static struct attribute *args_attrs[] = {
	&args_attr_lockproto.attr,
	&args_attr_locktable.attr,
	&args_attr_hostdata.attr,
	&args_attr_spectator.attr,
	&args_attr_ignore_local_fs.attr,
	&args_attr_localcaching.attr,
	&args_attr_localflocks.attr,
	&args_attr_debug.attr,
	&args_attr_upgrade.attr,
	&args_attr_num_glockd.attr,
	&args_attr_posix_acl.attr,
	&args_attr_quota.attr,
	&args_attr_suiddir.attr,
	&args_attr_data.attr,
	&args_attr_noatime.attr,
	NULL
};
312
/*
 * display counters from superblock
 * (read-only; one file per counter under <sb>/counters/)
 */

struct counters_attr {
	struct attribute attr;
	ssize_t (*show)(struct gfs2_sbd *, char *);
};

/* Generates a show function printing an arbitrary expression with fmt */
#define COUNTERS_ATTR_GENERAL(name, fmt, val) \
static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf) \
{ \
	return sprintf(buf, fmt, val); \
} \
static struct counters_attr counters_attr_##name = __ATTR_RO(name)

/* Plain sd_<name> field */
#define COUNTERS_ATTR_SIMPLE(name, fmt) \
	COUNTERS_ATTR_GENERAL(name, fmt, sdp->sd_##name)

/* atomic_t sd_<name> field, read without a lock */
#define COUNTERS_ATTR_ATOMIC(name, fmt) \
	COUNTERS_ATTR_GENERAL(name, fmt, (unsigned int)atomic_read(&sdp->sd_##name))

COUNTERS_ATTR_ATOMIC(glock_count, "%u\n");
COUNTERS_ATTR_ATOMIC(glock_held_count, "%u\n");
COUNTERS_ATTR_ATOMIC(inode_count, "%u\n");
COUNTERS_ATTR_ATOMIC(bufdata_count, "%u\n");
COUNTERS_ATTR_ATOMIC(unlinked_count, "%u\n");
COUNTERS_ATTR_ATOMIC(quota_count, "%u\n");
COUNTERS_ATTR_SIMPLE(log_num_gl, "%u\n");
COUNTERS_ATTR_SIMPLE(log_num_buf, "%u\n");
COUNTERS_ATTR_SIMPLE(log_num_revoke, "%u\n");
COUNTERS_ATTR_SIMPLE(log_num_rg, "%u\n");
COUNTERS_ATTR_SIMPLE(log_num_databuf, "%u\n");
COUNTERS_ATTR_SIMPLE(log_blks_free, "%u\n");
COUNTERS_ATTR_GENERAL(jd_blocks, "%u\n", sdp->sd_jdesc->jd_blocks);
COUNTERS_ATTR_ATOMIC(reclaim_count, "%u\n");
COUNTERS_ATTR_SIMPLE(log_wraps, "%llu\n");
COUNTERS_ATTR_ATOMIC(fh2dentry_misses, "%u\n");
COUNTERS_ATTR_ATOMIC(reclaimed, "%u\n");
COUNTERS_ATTR_ATOMIC(log_flush_incore, "%u\n");
COUNTERS_ATTR_ATOMIC(log_flush_ondisk, "%u\n");
COUNTERS_ATTR_ATOMIC(glock_nq_calls, "%u\n");
COUNTERS_ATTR_ATOMIC(glock_dq_calls, "%u\n");
COUNTERS_ATTR_ATOMIC(glock_prefetch_calls, "%u\n");
COUNTERS_ATTR_ATOMIC(lm_lock_calls, "%u\n");
COUNTERS_ATTR_ATOMIC(lm_unlock_calls, "%u\n");
COUNTERS_ATTR_ATOMIC(lm_callbacks, "%u\n");
COUNTERS_ATTR_ATOMIC(ops_address, "%u\n");
COUNTERS_ATTR_ATOMIC(ops_dentry, "%u\n");
COUNTERS_ATTR_ATOMIC(ops_export, "%u\n");
COUNTERS_ATTR_ATOMIC(ops_file, "%u\n");
COUNTERS_ATTR_ATOMIC(ops_inode, "%u\n");
COUNTERS_ATTR_ATOMIC(ops_super, "%u\n");
COUNTERS_ATTR_ATOMIC(ops_vm, "%u\n");

static struct attribute *counters_attrs[] = {
	&counters_attr_glock_count.attr,
	&counters_attr_glock_held_count.attr,
	&counters_attr_inode_count.attr,
	&counters_attr_bufdata_count.attr,
	&counters_attr_unlinked_count.attr,
	&counters_attr_quota_count.attr,
	&counters_attr_log_num_gl.attr,
	&counters_attr_log_num_buf.attr,
	&counters_attr_log_num_revoke.attr,
	&counters_attr_log_num_rg.attr,
	&counters_attr_log_num_databuf.attr,
	&counters_attr_log_blks_free.attr,
	&counters_attr_jd_blocks.attr,
	&counters_attr_reclaim_count.attr,
	&counters_attr_log_wraps.attr,
	&counters_attr_fh2dentry_misses.attr,
	&counters_attr_reclaimed.attr,
	&counters_attr_log_flush_incore.attr,
	&counters_attr_log_flush_ondisk.attr,
	&counters_attr_glock_nq_calls.attr,
	&counters_attr_glock_dq_calls.attr,
	&counters_attr_glock_prefetch_calls.attr,
	&counters_attr_lm_lock_calls.attr,
	&counters_attr_lm_unlock_calls.attr,
	&counters_attr_lm_callbacks.attr,
	&counters_attr_ops_address.attr,
	&counters_attr_ops_dentry.attr,
	&counters_attr_ops_export.attr,
	&counters_attr_ops_file.attr,
	&counters_attr_ops_inode.attr,
	&counters_attr_ops_super.attr,
	&counters_attr_ops_vm.attr,
	NULL
};
403
/*
 * get and set struct gfs2_tune fields
 */

/* sysfs read: quota scale as "numerator denominator"; the two fields are
   read without gt_spin, so a concurrent store could be seen torn */
static ssize_t quota_scale_show(struct gfs2_sbd *sdp, char *buf)
{
	return sprintf(buf, "%u %u\n", sdp->sd_tune.gt_quota_scale_num,
		       sdp->sd_tune.gt_quota_scale_den);
}
413
414static ssize_t quota_scale_store(struct gfs2_sbd *sdp, const char *buf,
415 size_t len)
416{
417 struct gfs2_tune *gt = &sdp->sd_tune;
418 unsigned int x, y;
419
420 if (!capable(CAP_SYS_ADMIN))
421 return -EACCES;
422
423 if (sscanf(buf, "%u %u", &x, &y) != 2 || !y)
424 return -EINVAL;
425
426 spin_lock(&gt->gt_spin);
427 gt->gt_quota_scale_num = x;
428 gt->gt_quota_scale_den = y;
429 spin_unlock(&gt->gt_spin);
430 return len;
431}
432
/* Common store helper for gfs2_tune fields: parses an unsigned value and
   assigns it to *field under gt_spin.  check_zero rejects 0 for fields
   where that would be nonsensical. */
static ssize_t tune_set(struct gfs2_sbd *sdp, unsigned int *field,
			int check_zero, const char *buf, size_t len)
{
	struct gfs2_tune *gt = &sdp->sd_tune;
	unsigned int x;

	if (!capable(CAP_SYS_ADMIN))
		return -EACCES;

	x = simple_strtoul(buf, NULL, 0);

	if (check_zero && !x)
		return -EINVAL;

	spin_lock(&gt->gt_spin);
	*field = x;
	spin_unlock(&gt->gt_spin);
	return len;
}
452
/* Tunable attribute: name plus show/store over one gfs2_tune field */
struct tune_attr {
	struct attribute attr;
	ssize_t (*show)(struct gfs2_sbd *, char *);
	ssize_t (*store)(struct gfs2_sbd *, const char *, size_t);
};

/* Explicit show and store functions */
#define TUNE_ATTR_3(name, show, store) \
static struct tune_attr tune_attr_##name = __ATTR(name, 0644, show, store)

/* Generated show (prints gt_<name>), explicit store */
#define TUNE_ATTR_2(name, store) \
static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf) \
{ \
	return sprintf(buf, "%u\n", sdp->sd_tune.gt_##name); \
} \
TUNE_ATTR_3(name, name##_show, store)

/* Fully generated show and store via tune_set();
   check_zero selects whether 0 is rejected */
#define TUNE_ATTR(name, check_zero) \
static ssize_t name##_store(struct gfs2_sbd *sdp, const char *buf, size_t len)\
{ \
	return tune_set(sdp, &sdp->sd_tune.gt_##name, check_zero, buf, len); \
} \
TUNE_ATTR_2(name, name##_store)

/* Like TUNE_ATTR(name, 1) but also wakes the daemon whose period the
   field controls, so the new interval takes effect immediately */
#define TUNE_ATTR_DAEMON(name, process) \
static ssize_t name##_store(struct gfs2_sbd *sdp, const char *buf, size_t len)\
{ \
	ssize_t r = tune_set(sdp, &sdp->sd_tune.gt_##name, 1, buf, len); \
	wake_up_process(sdp->sd_##process); \
	return r; \
} \
TUNE_ATTR_2(name, name##_store)

TUNE_ATTR(ilimit, 0);
TUNE_ATTR(ilimit_tries, 0);
TUNE_ATTR(ilimit_min, 0);
TUNE_ATTR(demote_secs, 0);
TUNE_ATTR(incore_log_blocks, 0);
TUNE_ATTR(log_flush_secs, 0);
TUNE_ATTR(jindex_refresh_secs, 0);
TUNE_ATTR(quota_warn_period, 0);
TUNE_ATTR(quota_quantum, 0);
TUNE_ATTR(atime_quantum, 0);
TUNE_ATTR(max_readahead, 0);
TUNE_ATTR(complain_secs, 0);
TUNE_ATTR(reclaim_limit, 0);
TUNE_ATTR(prefetch_secs, 0);
TUNE_ATTR(statfs_slow, 0);
TUNE_ATTR(new_files_jdata, 0);
TUNE_ATTR(new_files_directio, 0);
TUNE_ATTR(quota_simul_sync, 1);
TUNE_ATTR(quota_cache_secs, 1);
TUNE_ATTR(max_atomic_write, 1);
TUNE_ATTR(stall_secs, 1);
TUNE_ATTR(entries_per_readdir, 1);
TUNE_ATTR(greedy_default, 1);
TUNE_ATTR(greedy_quantum, 1);
TUNE_ATTR(greedy_max, 1);
TUNE_ATTR(statfs_quantum, 1);
TUNE_ATTR_DAEMON(scand_secs, scand_process);
TUNE_ATTR_DAEMON(recoverd_secs, recoverd_process);
TUNE_ATTR_DAEMON(logd_secs, logd_process);
TUNE_ATTR_DAEMON(quotad_secs, quotad_process);
TUNE_ATTR_DAEMON(inoded_secs, inoded_process);
TUNE_ATTR_3(quota_scale, quota_scale_show, quota_scale_store);

static struct attribute *tune_attrs[] = {
	&tune_attr_ilimit.attr,
	&tune_attr_ilimit_tries.attr,
	&tune_attr_ilimit_min.attr,
	&tune_attr_demote_secs.attr,
	&tune_attr_incore_log_blocks.attr,
	&tune_attr_log_flush_secs.attr,
	&tune_attr_jindex_refresh_secs.attr,
	&tune_attr_quota_warn_period.attr,
	&tune_attr_quota_quantum.attr,
	&tune_attr_atime_quantum.attr,
	&tune_attr_max_readahead.attr,
	&tune_attr_complain_secs.attr,
	&tune_attr_reclaim_limit.attr,
	&tune_attr_prefetch_secs.attr,
	&tune_attr_statfs_slow.attr,
	&tune_attr_quota_simul_sync.attr,
	&tune_attr_quota_cache_secs.attr,
	&tune_attr_max_atomic_write.attr,
	&tune_attr_stall_secs.attr,
	&tune_attr_entries_per_readdir.attr,
	&tune_attr_greedy_default.attr,
	&tune_attr_greedy_quantum.attr,
	&tune_attr_greedy_max.attr,
	&tune_attr_statfs_quantum.attr,
	&tune_attr_scand_secs.attr,
	&tune_attr_recoverd_secs.attr,
	&tune_attr_logd_secs.attr,
	&tune_attr_quotad_secs.attr,
	&tune_attr_inoded_secs.attr,
	&tune_attr_quota_scale.attr,
	&tune_attr_new_files_jdata.attr,
	&tune_attr_new_files_directio.attr,
	NULL
};

/* The four attribute subdirectories created under each sb's kobject */
static struct attribute_group lockstruct_group = {
	.name = "lockstruct",
	.attrs = lockstruct_attrs
};

static struct attribute_group counters_group = {
	.name = "counters",
	.attrs = counters_attrs
};

static struct attribute_group args_group = {
	.name = "args",
	.attrs = args_attrs
};

static struct attribute_group tune_group = {
	.name = "tune",
	.attrs = tune_attrs
};
573
/**
 * gfs2_sys_fs_add - register one filesystem's sysfs tree
 * @sdp: the filesystem
 *
 * Creates /sys/fs/gfs2/<table_name> plus the lockstruct/, counters/,
 * args/ and tune/ attribute groups.  On any failure everything created
 * so far is torn down via the fall-through goto ladder, which removes
 * the groups in reverse order of creation.
 *
 * Returns: errno
 */
int gfs2_sys_fs_add(struct gfs2_sbd *sdp)
{
	int error;

	sdp->sd_kobj.kset = &gfs2_kset;
	sdp->sd_kobj.ktype = &gfs2_ktype;

	error = kobject_set_name(&sdp->sd_kobj, "%s", sdp->sd_table_name);
	if (error)
		goto fail;

	error = kobject_register(&sdp->sd_kobj);
	if (error)
		goto fail;

	error = sysfs_create_group(&sdp->sd_kobj, &lockstruct_group);
	if (error)
		goto fail_reg;

	error = sysfs_create_group(&sdp->sd_kobj, &counters_group);
	if (error)
		goto fail_lockstruct;

	error = sysfs_create_group(&sdp->sd_kobj, &args_group);
	if (error)
		goto fail_counters;

	error = sysfs_create_group(&sdp->sd_kobj, &tune_group);
	if (error)
		goto fail_args;

	return 0;

 fail_args:
	sysfs_remove_group(&sdp->sd_kobj, &args_group);
 fail_counters:
	sysfs_remove_group(&sdp->sd_kobj, &counters_group);
 fail_lockstruct:
	sysfs_remove_group(&sdp->sd_kobj, &lockstruct_group);
 fail_reg:
	kobject_unregister(&sdp->sd_kobj);
 fail:
	return error;
}
618
/* Tear down one filesystem's sysfs tree; groups are removed in reverse
   order of creation, then the kobject itself is unregistered */
void gfs2_sys_fs_del(struct gfs2_sbd *sdp)
{
	sysfs_remove_group(&sdp->sd_kobj, &tune_group);
	sysfs_remove_group(&sdp->sd_kobj, &args_group);
	sysfs_remove_group(&sdp->sd_kobj, &counters_group);
	sysfs_remove_group(&sdp->sd_kobj, &lockstruct_group);
	kobject_unregister(&sdp->sd_kobj);
}
627
/* Module init: reset the mount-args stash and register /sys/fs/gfs2 */
int gfs2_sys_init(void)
{
	gfs2_sys_margs = NULL;
	spin_lock_init(&gfs2_sys_margs_lock);
	return kset_register(&gfs2_kset);
}
634
/* Module exit: free any stashed mount args (kfree(NULL) is a no-op if
   none were ever set) and drop /sys/fs/gfs2 */
void gfs2_sys_uninit(void)
{
	kfree(gfs2_sys_margs);
	kset_unregister(&gfs2_kset);
}
640
diff --git a/fs/gfs2/sys.h b/fs/gfs2/sys.h
new file mode 100644
index 000000000000..62c8ed89ab9c
--- /dev/null
+++ b/fs/gfs2/sys.h
@@ -0,0 +1,24 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __SYS_DOT_H__
11#define __SYS_DOT_H__
12
13/* Allow args to be passed to GFS2 when using an initial ram disk */
14extern char *gfs2_sys_margs;
15extern spinlock_t gfs2_sys_margs_lock;
16
17int gfs2_sys_fs_add(struct gfs2_sbd *sdp);
18void gfs2_sys_fs_del(struct gfs2_sbd *sdp);
19
20int gfs2_sys_init(void);
21void gfs2_sys_uninit(void);
22
23#endif /* __SYS_DOT_H__ */
24
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
new file mode 100644
index 000000000000..0a0ea70eac4c
--- /dev/null
+++ b/fs/gfs2/trans.c
@@ -0,0 +1,198 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <asm/semaphore.h>
16
17#include "gfs2.h"
18#include "glock.h"
19#include "log.h"
20#include "lops.h"
21#include "meta_io.h"
22#include "trans.h"
23
/**
 * gfs2_trans_begin_i - start a new transaction (use the gfs2_trans_begin
 *                      macro, which supplies file/line)
 * @sdp: the filesystem
 * @blocks: number of metadata blocks this transaction may dirty
 * @revokes: number of revokes this transaction may issue
 * @file: source file of the caller (for diagnostics)
 * @line: source line of the caller (for diagnostics)
 *
 * Allocates the per-task transaction, takes the transaction glock
 * shared, and reserves log space.  Nesting is not allowed: a transaction
 * must not already be current on this task.
 *
 * Returns: errno
 */
int gfs2_trans_begin_i(struct gfs2_sbd *sdp, unsigned int blocks,
		       unsigned int revokes, char *file, unsigned int line)
{
	struct gfs2_trans *tr;
	int error;

	/* No nested transactions, and an empty reservation is a caller bug */
	if (gfs2_assert_warn(sdp, !get_transaction) ||
	    gfs2_assert_warn(sdp, blocks || revokes)) {
		fs_warn(sdp, "(%s, %u)\n", file, line);
		return -EINVAL;
	}

	tr = kzalloc(sizeof(struct gfs2_trans), GFP_NOFS);
	if (!tr)
		return -ENOMEM;

	tr->tr_file = file;
	tr->tr_line = line;
	tr->tr_blocks = blocks;
	tr->tr_revokes = revokes;
	/* Log reservation: 1 fixed block, plus blocks + 1 when metadata is
	   dirtied, plus enough blocks to hold the revoke records */
	tr->tr_reserved = 1;
	if (blocks)
		tr->tr_reserved += 1 + blocks;
	if (revokes)
		tr->tr_reserved += gfs2_struct2blk(sdp, revokes,
						   sizeof(uint64_t));
	INIT_LIST_HEAD(&tr->tr_list_buf);

	error = -ENOMEM;
	tr->tr_t_gh = gfs2_holder_get(sdp->sd_trans_gl, LM_ST_SHARED,
				      GL_NEVER_RECURSE, GFP_NOFS);
	if (!tr->tr_t_gh)
		goto fail;

	error = gfs2_glock_nq(tr->tr_t_gh);
	if (error)
		goto fail_holder_put;

	/* The journal died while we were acquiring the glock: fail the
	   transaction and mark the holder non-cacheable on release */
	if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
		tr->tr_t_gh->gh_flags |= GL_NOCACHE;
		error = -EROFS;
		goto fail_gunlock;
	}

	error = gfs2_log_reserve(sdp, tr->tr_reserved);
	if (error)
		goto fail_gunlock;

	/* Make this the task's current transaction */
	set_transaction(tr);

	return 0;

 fail_gunlock:
	gfs2_glock_dq(tr->tr_t_gh);

 fail_holder_put:
	gfs2_holder_put(tr->tr_t_gh);

 fail:
	kfree(tr);

	return error;
}
87
/**
 * gfs2_trans_end - end the current task's transaction
 * @sdp: the filesystem
 *
 * Commits the transaction to the log if it dirtied anything, otherwise
 * just gives back the reserved log space.  Either way the transaction
 * glock holder is released.  Over-use of the reservation withdraws the
 * filesystem, citing the begin_i caller's file/line.
 */
void gfs2_trans_end(struct gfs2_sbd *sdp)
{
	struct gfs2_trans *tr;
	struct gfs2_holder *t_gh;

	/* Detach the transaction from the task before anything else */
	tr = get_transaction;
	set_transaction(NULL);

	if (gfs2_assert_warn(sdp, tr))
		return;

	t_gh = tr->tr_t_gh;
	tr->tr_t_gh = NULL;

	/* Untouched transaction: nothing to commit, just release the
	   log reservation and the glock */
	if (!tr->tr_touched) {
		gfs2_log_release(sdp, tr->tr_reserved);
		kfree(tr);

		gfs2_glock_dq(t_gh);
		gfs2_holder_put(t_gh);

		return;
	}

	/* The caller must not have dirtied more than it reserved */
	if (gfs2_assert_withdraw(sdp, tr->tr_num_buf <= tr->tr_blocks))
		fs_err(sdp, "tr_num_buf = %u, tr_blocks = %u "
		       "tr_file = %s, tr_line = %u\n",
		       tr->tr_num_buf, tr->tr_blocks,
		       tr->tr_file, tr->tr_line);
	if (gfs2_assert_withdraw(sdp, tr->tr_num_revoke <= tr->tr_revokes))
		fs_err(sdp, "tr_num_revoke = %u, tr_revokes = %u "
		       "tr_file = %s, tr_line = %u\n",
		       tr->tr_num_revoke, tr->tr_revokes,
		       tr->tr_file, tr->tr_line);

	/* gfs2_log_commit() takes ownership of tr */
	gfs2_log_commit(sdp, tr);

	gfs2_glock_dq(t_gh);
	gfs2_holder_put(t_gh);

	/* Honor sync mounts by flushing the log immediately */
	if (sdp->sd_vfs->s_flags & MS_SYNCHRONOUS)
		gfs2_log_flush(sdp);
}
131
/* Add a glock's log element to the current transaction */
void gfs2_trans_add_gl(struct gfs2_glock *gl)
{
	lops_add(gl->gl_sbd, &gl->gl_le);
}
136
/**
 * gfs2_trans_add_bh - Add a to-be-modified buffer to the current transaction
 * @gl: the glock the buffer belongs to
 * @bh: The buffer to add
 * @meta: True in the case of adding metadata
 *
 * Attaches a gfs2_bufdata to the buffer if it does not already have one,
 * then queues the buffer's log element on the transaction.  A buffer may
 * only ever be protected by one glock.
 */

void gfs2_trans_add_bh(struct gfs2_glock *gl, struct buffer_head *bh, int meta)
{
	struct gfs2_sbd *sdp = gl->gl_sbd;
	struct gfs2_bufdata *bd;

	bd = get_v2bd(bh);
	if (bd)
		gfs2_assert(sdp, bd->bd_gl == gl);
	else {
		/* First time this buffer enters a transaction: give it
		   a bufdata tied to this glock */
		gfs2_attach_bufdata(gl, bh, meta);
		bd = get_v2bd(bh);
	}
	lops_add(sdp, &bd->bd_le);
}
159
/* Queue a revoke for @blkno on the current transaction.  __GFP_NOFAIL
   means the allocation cannot return NULL, so no error path is needed. */
void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, uint64_t blkno)
{
	struct gfs2_revoke *rv = kmalloc(sizeof(struct gfs2_revoke),
					 GFP_NOFS | __GFP_NOFAIL);
	lops_init_le(&rv->rv_le, &gfs2_revoke_lops);
	rv->rv_blkno = blkno;
	lops_add(sdp, &rv->rv_le);
}
168
/* Cancel a pending revoke for @blkno, if one is queued: unlink it from
   the log's revoke list under the log lock, free it outside the lock,
   and credit the removal to the current transaction. */
void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, uint64_t blkno)
{
	struct gfs2_revoke *rv;
	int found = 0;

	gfs2_log_lock(sdp);

	list_for_each_entry(rv, &sdp->sd_log_le_revoke, rv_le.le_list) {
		if (rv->rv_blkno == blkno) {
			list_del(&rv->rv_le.le_list);
			/* the counter must never underflow */
			gfs2_assert_withdraw(sdp, sdp->sd_log_num_revoke);
			sdp->sd_log_num_revoke--;
			found = 1;
			break;
		}
	}

	gfs2_log_unlock(sdp);

	if (found) {
		kfree(rv);
		get_transaction->tr_num_revoke_rm++;
	}
}
193
/* Add a resource group's log element to the current transaction */
void gfs2_trans_add_rg(struct gfs2_rgrpd *rgd)
{
	lops_add(rgd->rd_sbd, &rgd->rd_le);
}
198
diff --git a/fs/gfs2/trans.h b/fs/gfs2/trans.h
new file mode 100644
index 000000000000..f7f3e2a3d590
--- /dev/null
+++ b/fs/gfs2/trans.h
@@ -0,0 +1,39 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
#ifndef __TRANS_DOT_H__
#define __TRANS_DOT_H__

/* Per-object block-reservation sizes for transactions; presumably summed
   by callers and passed as @blocks to gfs2_trans_begin() — verify against
   callers */
#define RES_DINODE 1
#define RES_INDIRECT 1
#define RES_JDATA 1
#define RES_DATA 1
#define RES_LEAF 1
#define RES_RG_BIT 2
#define RES_EATTR 1
#define RES_UNLINKED 1
#define RES_STATFS 1
#define RES_QUOTA 2

/* Records the caller's location so failed/long transactions can be traced */
#define gfs2_trans_begin(sdp, blocks, revokes) \
gfs2_trans_begin_i((sdp), (blocks), (revokes), __FILE__, __LINE__)

int gfs2_trans_begin_i(struct gfs2_sbd *sdp,
		       unsigned int blocks, unsigned int revokes,
		       char *file, unsigned int line);

void gfs2_trans_end(struct gfs2_sbd *sdp);

/* Add a modified object to the transaction currently being built */
void gfs2_trans_add_gl(struct gfs2_glock *gl);
void gfs2_trans_add_bh(struct gfs2_glock *gl, struct buffer_head *bh, int meta);
void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, uint64_t blkno);
void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, uint64_t blkno);
void gfs2_trans_add_rg(struct gfs2_rgrpd *rgd);

#endif /* __TRANS_DOT_H__ */
diff --git a/fs/gfs2/unlinked.c b/fs/gfs2/unlinked.c
new file mode 100644
index 000000000000..e92a3a11815b
--- /dev/null
+++ b/fs/gfs2/unlinked.c
@@ -0,0 +1,453 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/kthread.h>
16#include <asm/semaphore.h>
17
18#include "gfs2.h"
19#include "bmap.h"
20#include "inode.h"
21#include "meta_io.h"
22#include "trans.h"
23#include "unlinked.h"
24
/*
 * munge_ondisk - write one unlinked tag into the on-disk unlinked table
 * @sdp: the filesystem
 * @slot: index of the tag within the hidden "ut" inode
 * @ut: the tag contents to write
 *
 * Maps the slot to its disk block, reads the block, then copies the tag
 * out under sd_unlinked_mutex within the current transaction.
 *
 * Returns: errno
 */

static int munge_ondisk(struct gfs2_sbd *sdp, unsigned int slot,
			struct gfs2_unlinked_tag *ut)
{
	struct gfs2_inode *ip = get_v2ip(sdp->sd_ut_inode);
	unsigned int block, offset;
	uint64_t dblock;
	int new = 0;
	struct buffer_head *bh;
	int error;

	/* Which block of the ut file holds this slot, and where within it */
	block = slot / sdp->sd_ut_per_block;
	offset = slot % sdp->sd_ut_per_block;

	error = gfs2_block_map(ip, block, &new, &dblock, NULL);
	if (error)
		return error;
	error = gfs2_meta_read(ip->i_gl, dblock, DIO_START | DIO_WAIT, &bh);
	if (error)
		return error;
	if (gfs2_metatype_check(sdp, bh, GFS2_METATYPE_UT)) {
		error = -EIO;
		goto out;
	}

	mutex_lock(&sdp->sd_unlinked_mutex);
	/* Pin the buffer into the transaction before modifying it */
	gfs2_trans_add_bh(ip->i_gl, bh, 1);
	gfs2_unlinked_tag_out(ut, bh->b_data +
			      sizeof(struct gfs2_meta_header) +
			      offset * sizeof(struct gfs2_unlinked_tag));
	mutex_unlock(&sdp->sd_unlinked_mutex);

 out:
	brelse(bh);

	return error;
}
61
62static void ul_hash(struct gfs2_sbd *sdp, struct gfs2_unlinked *ul)
63{
64 spin_lock(&sdp->sd_unlinked_spin);
65 list_add(&ul->ul_list, &sdp->sd_unlinked_list);
66 gfs2_assert(sdp, ul->ul_count);
67 ul->ul_count++;
68 atomic_inc(&sdp->sd_unlinked_count);
69 spin_unlock(&sdp->sd_unlinked_spin);
70}
71
72static void ul_unhash(struct gfs2_sbd *sdp, struct gfs2_unlinked *ul)
73{
74 spin_lock(&sdp->sd_unlinked_spin);
75 list_del_init(&ul->ul_list);
76 gfs2_assert(sdp, ul->ul_count > 1);
77 ul->ul_count--;
78 gfs2_assert_warn(sdp, atomic_read(&sdp->sd_unlinked_count) > 0);
79 atomic_dec(&sdp->sd_unlinked_count);
80 spin_unlock(&sdp->sd_unlinked_spin);
81}
82
83static struct gfs2_unlinked *ul_fish(struct gfs2_sbd *sdp)
84{
85 struct list_head *head;
86 struct gfs2_unlinked *ul;
87 int found = 0;
88
89 if (sdp->sd_vfs->s_flags & MS_RDONLY)
90 return NULL;
91
92 spin_lock(&sdp->sd_unlinked_spin);
93
94 head = &sdp->sd_unlinked_list;
95
96 list_for_each_entry(ul, head, ul_list) {
97 if (test_bit(ULF_LOCKED, &ul->ul_flags))
98 continue;
99
100 list_move_tail(&ul->ul_list, head);
101 ul->ul_count++;
102 set_bit(ULF_LOCKED, &ul->ul_flags);
103 found = 1;
104
105 break;
106 }
107
108 if (!found)
109 ul = NULL;
110
111 spin_unlock(&sdp->sd_unlinked_spin);
112
113 return ul;
114}
115
/**
 * enforce_limit - limit the number of inodes waiting to be deallocated
 * @sdp: the filesystem
 *
 * When the count of unlinked-but-not-deallocated inodes reaches
 * gt_ilimit, make up to gt_ilimit_tries attempts at deallocation,
 * stopping early after gt_ilimit_min successes.  An error of 1 from
 * gfs2_inode_dealloc() is treated as a soft "skip this one" result;
 * any other error aborts the loop.
 */

static void enforce_limit(struct gfs2_sbd *sdp)
{
	unsigned int tries = 0, min = 0;
	int error;

	if (atomic_read(&sdp->sd_unlinked_count) >=
	    gfs2_tune_get(sdp, gt_ilimit)) {
		tries = gfs2_tune_get(sdp, gt_ilimit_tries);
		min = gfs2_tune_get(sdp, gt_ilimit_min);
	}

	while (tries--) {
		struct gfs2_unlinked *ul = ul_fish(sdp);
		if (!ul)
			break;
		error = gfs2_inode_dealloc(sdp, ul);
		gfs2_unlinked_put(sdp, ul);

		if (!error) {
			if (!--min)
				break;
		} else if (error != 1)
			break;
	}
}
148
149static struct gfs2_unlinked *ul_alloc(struct gfs2_sbd *sdp)
150{
151 struct gfs2_unlinked *ul;
152
153 ul = kzalloc(sizeof(struct gfs2_unlinked), GFP_KERNEL);
154 if (ul) {
155 INIT_LIST_HEAD(&ul->ul_list);
156 ul->ul_count = 1;
157 set_bit(ULF_LOCKED, &ul->ul_flags);
158 }
159
160 return ul;
161}
162
/*
 * gfs2_unlinked_get - allocate an in-core entry and reserve a free slot
 * @sdp: the filesystem
 * @ul: set to the new entry on success
 *
 * Scans the in-core bitmap for a free on-disk slot and claims it.
 *
 * Returns: errno (-ENOMEM or -ENOSPC on failure)
 */

int gfs2_unlinked_get(struct gfs2_sbd *sdp, struct gfs2_unlinked **ul)
{
	unsigned int c, o = 0, b;
	unsigned char byte = 0;

	enforce_limit(sdp);

	*ul = ul_alloc(sdp);
	if (!*ul)
		return -ENOMEM;

	spin_lock(&sdp->sd_unlinked_spin);

	/* Find the first bitmap byte with a clear bit */
	for (c = 0; c < sdp->sd_unlinked_chunks; c++)
		for (o = 0; o < PAGE_SIZE; o++) {
			byte = sdp->sd_unlinked_bitmap[c][o];
			if (byte != 0xFF)
				goto found;
		}

	goto fail;

 found:
	/* First clear bit within the byte */
	for (b = 0; b < 8; b++)
		if (!(byte & (1 << b)))
			break;
	(*ul)->ul_slot = c * (8 * PAGE_SIZE) + o * 8 + b;

	/* The last chunk may cover more bits than there are real slots */
	if ((*ul)->ul_slot >= sdp->sd_unlinked_slots)
		goto fail;

	sdp->sd_unlinked_bitmap[c][o] |= 1 << b;

	spin_unlock(&sdp->sd_unlinked_spin);

	return 0;

 fail:
	spin_unlock(&sdp->sd_unlinked_spin);
	kfree(*ul);
	return -ENOSPC;
}
205
206void gfs2_unlinked_put(struct gfs2_sbd *sdp, struct gfs2_unlinked *ul)
207{
208 gfs2_assert_warn(sdp, test_and_clear_bit(ULF_LOCKED, &ul->ul_flags));
209
210 spin_lock(&sdp->sd_unlinked_spin);
211 gfs2_assert(sdp, ul->ul_count);
212 ul->ul_count--;
213 if (!ul->ul_count) {
214 gfs2_icbit_munge(sdp, sdp->sd_unlinked_bitmap, ul->ul_slot, 0);
215 spin_unlock(&sdp->sd_unlinked_spin);
216 kfree(ul);
217 } else
218 spin_unlock(&sdp->sd_unlinked_spin);
219}
220
221int gfs2_unlinked_ondisk_add(struct gfs2_sbd *sdp, struct gfs2_unlinked *ul)
222{
223 int error;
224
225 gfs2_assert_warn(sdp, test_bit(ULF_LOCKED, &ul->ul_flags));
226 gfs2_assert_warn(sdp, list_empty(&ul->ul_list));
227
228 error = munge_ondisk(sdp, ul->ul_slot, &ul->ul_ut);
229 if (!error)
230 ul_hash(sdp, ul);
231
232 return error;
233}
234
235int gfs2_unlinked_ondisk_munge(struct gfs2_sbd *sdp, struct gfs2_unlinked *ul)
236{
237 int error;
238
239 gfs2_assert_warn(sdp, test_bit(ULF_LOCKED, &ul->ul_flags));
240 gfs2_assert_warn(sdp, !list_empty(&ul->ul_list));
241
242 error = munge_ondisk(sdp, ul->ul_slot, &ul->ul_ut);
243
244 return error;
245}
246
247int gfs2_unlinked_ondisk_rm(struct gfs2_sbd *sdp, struct gfs2_unlinked *ul)
248{
249 struct gfs2_unlinked_tag ut;
250 int error;
251
252 gfs2_assert_warn(sdp, test_bit(ULF_LOCKED, &ul->ul_flags));
253 gfs2_assert_warn(sdp, !list_empty(&ul->ul_list));
254
255 memset(&ut, 0, sizeof(struct gfs2_unlinked_tag));
256
257 error = munge_ondisk(sdp, ul->ul_slot, &ut);
258 if (error)
259 return error;
260
261 ul_unhash(sdp, ul);
262
263 return 0;
264}
265
/**
 * gfs2_unlinked_dealloc - Go through the list of inodes to be deallocated
 * @sdp: the filesystem
 *
 * Repeatedly fishes entries off the unlinked list and tries to
 * deallocate them.  An error of 1 from gfs2_inode_dealloc() is a soft
 * failure ("strike"); once strikes reach the number of remaining
 * entries the inner pass gives up.  The outer loop ends when a pass
 * deallocated nothing, or when the calling kthread is asked to stop.
 *
 * Returns: errno
 */

int gfs2_unlinked_dealloc(struct gfs2_sbd *sdp)
{
	unsigned int hits, strikes;
	int error;

	for (;;) {
		hits = 0;
		strikes = 0;

		for (;;) {
			struct gfs2_unlinked *ul = ul_fish(sdp);
			if (!ul)
				return 0;
			error = gfs2_inode_dealloc(sdp, ul);
			gfs2_unlinked_put(sdp, ul);

			if (!error) {
				hits++;
				/* A success forgives one strike */
				if (strikes)
					strikes--;
			} else if (error == 1) {
				strikes++;
				if (strikes >=
				    atomic_read(&sdp->sd_unlinked_count)) {
					error = 0;
					break;
				}
			} else
				return error;
		}

		if (!hits || kthread_should_stop())
			break;

		cond_resched();
	}

	return 0;
}
312
/*
 * gfs2_unlinked_init - read the on-disk unlinked table at mount time
 * @sdp: the filesystem
 *
 * Sizes the in-core bitmap from the hidden "ut" inode, then walks every
 * tag in the file, building an in-core entry for each occupied slot.
 *
 * Returns: errno
 */

int gfs2_unlinked_init(struct gfs2_sbd *sdp)
{
	struct gfs2_inode *ip = get_v2ip(sdp->sd_ut_inode);
	unsigned int blocks = ip->i_di.di_size >> sdp->sd_sb.sb_bsize_shift;
	unsigned int x, slot = 0;
	unsigned int found = 0;
	uint64_t dblock;
	uint32_t extlen = 0;
	int error;

	/* Sanity check: non-empty, at most 64MB, whole number of blocks */
	if (!ip->i_di.di_size ||
	    ip->i_di.di_size > (64 << 20) ||
	    ip->i_di.di_size & (sdp->sd_sb.sb_bsize - 1)) {
		gfs2_consist_inode(ip);
		return -EIO;
	}
	sdp->sd_unlinked_slots = blocks * sdp->sd_ut_per_block;
	sdp->sd_unlinked_chunks = DIV_RU(sdp->sd_unlinked_slots, 8 * PAGE_SIZE);

	error = -ENOMEM;

	/* One bitmap page covers 8 * PAGE_SIZE slots */
	sdp->sd_unlinked_bitmap = kcalloc(sdp->sd_unlinked_chunks,
					  sizeof(unsigned char *),
					  GFP_KERNEL);
	if (!sdp->sd_unlinked_bitmap)
		return error;

	for (x = 0; x < sdp->sd_unlinked_chunks; x++) {
		sdp->sd_unlinked_bitmap[x] = kzalloc(PAGE_SIZE, GFP_KERNEL);
		if (!sdp->sd_unlinked_bitmap[x])
			goto fail;
	}

	for (x = 0; x < blocks; x++) {
		struct buffer_head *bh;
		unsigned int y;

		/* Map the next extent of the file when the last runs out */
		if (!extlen) {
			int new = 0;
			error = gfs2_block_map(ip, x, &new, &dblock, &extlen);
			if (error)
				goto fail;
		}
		gfs2_meta_ra(ip->i_gl, dblock, extlen);
		error = gfs2_meta_read(ip->i_gl, dblock, DIO_START | DIO_WAIT,
				       &bh);
		if (error)
			goto fail;
		error = -EIO;
		if (gfs2_metatype_check(sdp, bh, GFS2_METATYPE_UT)) {
			brelse(bh);
			goto fail;
		}

		for (y = 0;
		     y < sdp->sd_ut_per_block && slot < sdp->sd_unlinked_slots;
		     y++, slot++) {
			struct gfs2_unlinked_tag ut;
			struct gfs2_unlinked *ul;

			gfs2_unlinked_tag_in(&ut, bh->b_data +
					  sizeof(struct gfs2_meta_header) +
					  y * sizeof(struct gfs2_unlinked_tag));
			/* A zero inode address means the slot is unused */
			if (!ut.ut_inum.no_addr)
				continue;

			error = -ENOMEM;
			ul = ul_alloc(sdp);
			if (!ul) {
				brelse(bh);
				goto fail;
			}
			ul->ul_ut = ut;
			ul->ul_slot = slot;

			spin_lock(&sdp->sd_unlinked_spin);
			gfs2_icbit_munge(sdp, sdp->sd_unlinked_bitmap, slot, 1);
			spin_unlock(&sdp->sd_unlinked_spin);
			ul_hash(sdp, ul);

			/* Drop ul_alloc()'s reference; the list keeps one */
			gfs2_unlinked_put(sdp, ul);
			found++;
		}

		brelse(bh);
		dblock++;
		extlen--;
	}

	if (found)
		fs_info(sdp, "found %u unlinked inodes\n", found);

	return 0;

 fail:
	gfs2_unlinked_cleanup(sdp);
	return error;
}
411
/**
 * gfs2_unlinked_cleanup - get rid of any extra struct gfs2_unlinked structures
 * @sdp: the filesystem
 *
 * Frees every entry on the unlinked list and the in-core bitmap,
 * spinning (via schedule()) on entries that still hold extra references.
 */

void gfs2_unlinked_cleanup(struct gfs2_sbd *sdp)
{
	struct list_head *head = &sdp->sd_unlinked_list;
	struct gfs2_unlinked *ul;
	unsigned int x;

	spin_lock(&sdp->sd_unlinked_spin);
	while (!list_empty(head)) {
		ul = list_entry(head->next, struct gfs2_unlinked, ul_list);

		/* Busy entry: rotate it to the tail and wait for its
		   other reference holders to finish */
		if (ul->ul_count > 1) {
			list_move_tail(&ul->ul_list, head);
			spin_unlock(&sdp->sd_unlinked_spin);
			schedule();
			spin_lock(&sdp->sd_unlinked_spin);
			continue;
		}

		list_del_init(&ul->ul_list);
		atomic_dec(&sdp->sd_unlinked_count);

		gfs2_assert_warn(sdp, ul->ul_count == 1);
		gfs2_assert_warn(sdp, !test_bit(ULF_LOCKED, &ul->ul_flags));
		kfree(ul);
	}
	spin_unlock(&sdp->sd_unlinked_spin);

	gfs2_assert_warn(sdp, !atomic_read(&sdp->sd_unlinked_count));

	if (sdp->sd_unlinked_bitmap) {
		for (x = 0; x < sdp->sd_unlinked_chunks; x++)
			kfree(sdp->sd_unlinked_bitmap[x]);
		kfree(sdp->sd_unlinked_bitmap);
	}
}
453
diff --git a/fs/gfs2/unlinked.h b/fs/gfs2/unlinked.h
new file mode 100644
index 000000000000..51e77f88d74f
--- /dev/null
+++ b/fs/gfs2/unlinked.h
@@ -0,0 +1,25 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
#ifndef __UNLINKED_DOT_H__
#define __UNLINKED_DOT_H__

/* Allocate an in-core entry with a free on-disk slot / drop a reference */
int gfs2_unlinked_get(struct gfs2_sbd *sdp, struct gfs2_unlinked **ul);
void gfs2_unlinked_put(struct gfs2_sbd *sdp, struct gfs2_unlinked *ul);

/* Write, update, or clear an entry's tag in the on-disk unlinked table */
int gfs2_unlinked_ondisk_add(struct gfs2_sbd *sdp, struct gfs2_unlinked *ul);
int gfs2_unlinked_ondisk_munge(struct gfs2_sbd *sdp, struct gfs2_unlinked *ul);
int gfs2_unlinked_ondisk_rm(struct gfs2_sbd *sdp, struct gfs2_unlinked *ul);

/* Deallocate inodes queued on the unlinked list */
int gfs2_unlinked_dealloc(struct gfs2_sbd *sdp);

/* Mount-time setup and unmount-time teardown of the unlinked table */
int gfs2_unlinked_init(struct gfs2_sbd *sdp);
void gfs2_unlinked_cleanup(struct gfs2_sbd *sdp);

#endif /* __UNLINKED_DOT_H__ */
diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c
new file mode 100644
index 000000000000..4fb1704aac10
--- /dev/null
+++ b/fs/gfs2/util.c
@@ -0,0 +1,246 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/crc32.h>
16#include <asm/semaphore.h>
17#include <asm/uaccess.h>
18
19#include "gfs2.h"
20#include "glock.h"
21#include "lm.h"
22
23kmem_cache_t *gfs2_glock_cachep __read_mostly;
24kmem_cache_t *gfs2_inode_cachep __read_mostly;
25kmem_cache_t *gfs2_bufdata_cachep __read_mostly;
26
/**
 * gfs2_disk_hash - hash a byte array
 * @data: the data to hash
 * @len: the length of the data
 *
 * Returns: the little-endian CRC32 of @data, with pre- and
 *          post-inversion of the accumulator
 */
uint32_t gfs2_disk_hash(const char *data, int len)
{
	uint32_t hash;

	hash = crc32_le(0xFFFFFFFF, data, len);
	hash ^= 0xFFFFFFFF;

	return hash;
}
31
/* Out-of-line reporter for the gfs2_assert() macro: log the failure at
   emergency level before the macro's BUG() fires. */
void gfs2_assert_i(struct gfs2_sbd *sdp)
{
	printk(KERN_EMERG "GFS2: fsid=%s: fatal assertion failed\n",
	       sdp->sd_fsname);
}
37
38/**
39 * gfs2_assert_withdraw_i - Cause the machine to withdraw if @assertion is false
40 * Returns: -1 if this call withdrew the machine,
41 * -2 if it was already withdrawn
42 */
43
44int gfs2_assert_withdraw_i(struct gfs2_sbd *sdp, char *assertion,
45 const char *function, char *file, unsigned int line)
46{
47 int me;
48 me = gfs2_lm_withdraw(sdp,
49 "GFS2: fsid=%s: fatal: assertion \"%s\" failed\n"
50 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
51 sdp->sd_fsname, assertion,
52 sdp->sd_fsname, function, file, line);
53 dump_stack();
54 return (me) ? -1 : -2;
55}
56
/**
 * gfs2_assert_warn_i - Print a message to the console if @assertion is false
 * Returns: -1 if we printed something
 *          -2 if we didn't
 */

int gfs2_assert_warn_i(struct gfs2_sbd *sdp, char *assertion,
		       const char *function, char *file, unsigned int line)
{
	/* Rate-limit: stay quiet within gt_complain_secs of the last warning */
	if (time_before(jiffies,
			sdp->sd_last_warning +
			gfs2_tune_get(sdp, gt_complain_secs) * HZ))
		return -2;

	printk(KERN_WARNING
	       "GFS2: fsid=%s: warning: assertion \"%s\" failed\n"
	       "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
	       sdp->sd_fsname, assertion,
	       sdp->sd_fsname, function, file, line);

	/* With the debug mount option, escalate warnings to a full BUG() */
	if (sdp->sd_args.ar_debug)
		BUG();
	else
		dump_stack();

	sdp->sd_last_warning = jiffies;

	return -1;
}
86
87/**
88 * gfs2_consist_i - Flag a filesystem consistency error and withdraw
89 * Returns: -1 if this call withdrew the machine,
90 * 0 if it was already withdrawn
91 */
92
93int gfs2_consist_i(struct gfs2_sbd *sdp, int cluster_wide, const char *function,
94 char *file, unsigned int line)
95{
96 int rv;
97 rv = gfs2_lm_withdraw(sdp,
98 "GFS2: fsid=%s: fatal: filesystem consistency error\n"
99 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
100 sdp->sd_fsname,
101 sdp->sd_fsname, function, file, line);
102 return rv;
103}
104
105/**
106 * gfs2_consist_inode_i - Flag an inode consistency error and withdraw
107 * Returns: -1 if this call withdrew the machine,
108 * 0 if it was already withdrawn
109 */
110
111int gfs2_consist_inode_i(struct gfs2_inode *ip, int cluster_wide,
112 const char *function, char *file, unsigned int line)
113{
114 struct gfs2_sbd *sdp = ip->i_sbd;
115 int rv;
116 rv = gfs2_lm_withdraw(sdp,
117 "GFS2: fsid=%s: fatal: filesystem consistency error\n"
118 "GFS2: fsid=%s: inode = %llu %llu\n"
119 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
120 sdp->sd_fsname,
121 sdp->sd_fsname, ip->i_num.no_formal_ino, ip->i_num.no_addr,
122 sdp->sd_fsname, function, file, line);
123 return rv;
124}
125
126/**
127 * gfs2_consist_rgrpd_i - Flag a RG consistency error and withdraw
128 * Returns: -1 if this call withdrew the machine,
129 * 0 if it was already withdrawn
130 */
131
132int gfs2_consist_rgrpd_i(struct gfs2_rgrpd *rgd, int cluster_wide,
133 const char *function, char *file, unsigned int line)
134{
135 struct gfs2_sbd *sdp = rgd->rd_sbd;
136 int rv;
137 rv = gfs2_lm_withdraw(sdp,
138 "GFS2: fsid=%s: fatal: filesystem consistency error\n"
139 "GFS2: fsid=%s: RG = %llu\n"
140 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
141 sdp->sd_fsname,
142 sdp->sd_fsname, rgd->rd_ri.ri_addr,
143 sdp->sd_fsname, function, file, line);
144 return rv;
145}
146
147/**
148 * gfs2_meta_check_ii - Flag a magic number consistency error and withdraw
149 * Returns: -1 if this call withdrew the machine,
150 * -2 if it was already withdrawn
151 */
152
153int gfs2_meta_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh,
154 const char *type, const char *function, char *file,
155 unsigned int line)
156{
157 int me;
158 me = gfs2_lm_withdraw(sdp,
159 "GFS2: fsid=%s: fatal: invalid metadata block\n"
160 "GFS2: fsid=%s: bh = %llu (%s)\n"
161 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
162 sdp->sd_fsname,
163 sdp->sd_fsname, (uint64_t)bh->b_blocknr, type,
164 sdp->sd_fsname, function, file, line);
165 return (me) ? -1 : -2;
166}
167
168/**
169 * gfs2_metatype_check_ii - Flag a metadata type consistency error and withdraw
170 * Returns: -1 if this call withdrew the machine,
171 * -2 if it was already withdrawn
172 */
173
174int gfs2_metatype_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh,
175 uint16_t type, uint16_t t, const char *function,
176 char *file, unsigned int line)
177{
178 int me;
179 me = gfs2_lm_withdraw(sdp,
180 "GFS2: fsid=%s: fatal: invalid metadata block\n"
181 "GFS2: fsid=%s: bh = %llu (type: exp=%u, found=%u)\n"
182 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
183 sdp->sd_fsname,
184 sdp->sd_fsname, (uint64_t)bh->b_blocknr, type, t,
185 sdp->sd_fsname, function, file, line);
186 return (me) ? -1 : -2;
187}
188
189/**
190 * gfs2_io_error_i - Flag an I/O error and withdraw
191 * Returns: -1 if this call withdrew the machine,
192 * 0 if it was already withdrawn
193 */
194
195int gfs2_io_error_i(struct gfs2_sbd *sdp, const char *function, char *file,
196 unsigned int line)
197{
198 int rv;
199 rv = gfs2_lm_withdraw(sdp,
200 "GFS2: fsid=%s: fatal: I/O error\n"
201 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
202 sdp->sd_fsname,
203 sdp->sd_fsname, function, file, line);
204 return rv;
205}
206
207/**
208 * gfs2_io_error_bh_i - Flag a buffer I/O error and withdraw
209 * Returns: -1 if this call withdrew the machine,
210 * 0 if it was already withdrawn
211 */
212
213int gfs2_io_error_bh_i(struct gfs2_sbd *sdp, struct buffer_head *bh,
214 const char *function, char *file, unsigned int line)
215{
216 int rv;
217 rv = gfs2_lm_withdraw(sdp,
218 "GFS2: fsid=%s: fatal: I/O error\n"
219 "GFS2: fsid=%s: block = %llu\n"
220 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
221 sdp->sd_fsname,
222 sdp->sd_fsname, (uint64_t)bh->b_blocknr,
223 sdp->sd_fsname, function, file, line);
224 return rv;
225}
226
227void gfs2_icbit_munge(struct gfs2_sbd *sdp, unsigned char **bitmap,
228 unsigned int bit, int new_value)
229{
230 unsigned int c, o, b = bit;
231 int old_value;
232
233 c = b / (8 * PAGE_SIZE);
234 b %= 8 * PAGE_SIZE;
235 o = b / 8;
236 b %= 8;
237
238 old_value = (bitmap[c][o] & (1 << b));
239 gfs2_assert_withdraw(sdp, !old_value != !new_value);
240
241 if (new_value)
242 bitmap[c][o] |= 1 << b;
243 else
244 bitmap[c][o] &= ~(1 << b);
245}
246
diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h
new file mode 100644
index 000000000000..8d6eba3bdf0a
--- /dev/null
+++ b/fs/gfs2/util.h
@@ -0,0 +1,172 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __UTIL_DOT_H__
11#define __UTIL_DOT_H__
12
/* CRC32-based hash used for on-disk directory entries */
uint32_t gfs2_disk_hash(const char *data, int len);


/* printk wrappers that prefix every message with the filesystem id */
#define fs_printk(level, fs, fmt, arg...) \
	printk(level "GFS2: fsid=%s: " fmt , (fs)->sd_fsname , ## arg)

#define fs_info(fs, fmt, arg...) \
	fs_printk(KERN_INFO , fs , fmt , ## arg)

#define fs_warn(fs, fmt, arg...) \
	fs_printk(KERN_WARNING , fs , fmt , ## arg)

#define fs_err(fs, fmt, arg...) \
	fs_printk(KERN_ERR, fs , fmt , ## arg)


void gfs2_assert_i(struct gfs2_sbd *sdp);

/* Hard assertion: log via gfs2_assert_i() and crash the kernel */
#define gfs2_assert(sdp, assertion) \
do { \
	if (unlikely(!(assertion))) { \
		gfs2_assert_i(sdp); \
		BUG(); \
	} \
} while (0)
38
39
int gfs2_assert_withdraw_i(struct gfs2_sbd *sdp, char *assertion,
			   const char *function, char *file, unsigned int line);

/* Withdraw the filesystem if @assertion is false; evaluates to 0 when
   the assertion holds, otherwise to gfs2_assert_withdraw_i()'s result */
#define gfs2_assert_withdraw(sdp, assertion) \
((likely(assertion)) ? 0 : gfs2_assert_withdraw_i((sdp), #assertion, \
					__FUNCTION__, __FILE__, __LINE__))


int gfs2_assert_warn_i(struct gfs2_sbd *sdp, char *assertion,
		       const char *function, char *file, unsigned int line);

/* Soft assertion: warn (rate-limited) but keep running */
#define gfs2_assert_warn(sdp, assertion) \
((likely(assertion)) ? 0 : gfs2_assert_warn_i((sdp), #assertion, \
					__FUNCTION__, __FILE__, __LINE__))


int gfs2_consist_i(struct gfs2_sbd *sdp, int cluster_wide,
		   const char *function, char *file, unsigned int line);

/* Flag a filesystem consistency error and withdraw */
#define gfs2_consist(sdp) \
gfs2_consist_i((sdp), 0, __FUNCTION__, __FILE__, __LINE__)


int gfs2_consist_inode_i(struct gfs2_inode *ip, int cluster_wide,
			 const char *function, char *file, unsigned int line);

/* Flag an inode consistency error and withdraw */
#define gfs2_consist_inode(ip) \
gfs2_consist_inode_i((ip), 0, __FUNCTION__, __FILE__, __LINE__)


int gfs2_consist_rgrpd_i(struct gfs2_rgrpd *rgd, int cluster_wide,
			 const char *function, char *file, unsigned int line);

/* Flag a resource-group consistency error and withdraw */
#define gfs2_consist_rgrpd(rgd) \
gfs2_consist_rgrpd_i((rgd), 0, __FUNCTION__, __FILE__, __LINE__)


int gfs2_meta_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh,
		       const char *type, const char *function,
		       char *file, unsigned int line);

/* Verify the GFS2 magic number in a metadata buffer.
   Returns 0 on success, otherwise the result of gfs2_meta_check_ii(). */
static inline int gfs2_meta_check_i(struct gfs2_sbd *sdp,
				    struct buffer_head *bh,
				    const char *function,
				    char *file, unsigned int line)
{
	struct gfs2_meta_header *mh = (struct gfs2_meta_header *)bh->b_data;
	uint32_t magic = mh->mh_magic;
	magic = be32_to_cpu(magic);
	if (unlikely(magic != GFS2_MAGIC))
		return gfs2_meta_check_ii(sdp, bh, "magic number", function,
					  file, line);
	return 0;
}

#define gfs2_meta_check(sdp, bh) \
gfs2_meta_check_i((sdp), (bh), __FUNCTION__, __FILE__, __LINE__)


int gfs2_metatype_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh,
			   uint16_t type, uint16_t t,
			   const char *function,
			   char *file, unsigned int line);

/* Verify both the magic number and the metadata type of a buffer.
   Returns 0 on success, otherwise the corresponding *_ii() result. */
static inline int gfs2_metatype_check_i(struct gfs2_sbd *sdp,
					struct buffer_head *bh,
					uint16_t type,
					const char *function,
					char *file, unsigned int line)
{
	struct gfs2_meta_header *mh = (struct gfs2_meta_header *)bh->b_data;
	uint32_t magic = mh->mh_magic;
	uint16_t t = mh->mh_type;
	magic = be32_to_cpu(magic);
	if (unlikely(magic != GFS2_MAGIC))
		return gfs2_meta_check_ii(sdp, bh, "magic number", function,
					  file, line);
	t = be16_to_cpu(t);
	if (unlikely(t != type))
		return gfs2_metatype_check_ii(sdp, bh, type, t, function,
					      file, line);
	return 0;
}

#define gfs2_metatype_check(sdp, bh, type) \
gfs2_metatype_check_i((sdp), (bh), (type), __FUNCTION__, __FILE__, __LINE__)

/* Stamp a buffer's metadata header with its type and format (big-endian) */
static inline void gfs2_metatype_set(struct buffer_head *bh, uint16_t type,
				     uint16_t format)
{
	struct gfs2_meta_header *mh;
	mh = (struct gfs2_meta_header *)bh->b_data;
	mh->mh_type = cpu_to_be16(type);
	mh->mh_format = cpu_to_be16(format);
}
135
136
int gfs2_io_error_i(struct gfs2_sbd *sdp, const char *function,
		    char *file, unsigned int line);

/* Flag an I/O error and withdraw.  No trailing semicolon in the
   expansion: the previous definition ended with ";", which produced a
   double semicolon at call sites and broke unbraced if/else bodies. */
#define gfs2_io_error(sdp) \
gfs2_io_error_i((sdp), __FUNCTION__, __FILE__, __LINE__)


int gfs2_io_error_bh_i(struct gfs2_sbd *sdp, struct buffer_head *bh,
		       const char *function, char *file, unsigned int line);

/* Flag a buffer I/O error and withdraw */
#define gfs2_io_error_bh(sdp, bh) \
gfs2_io_error_bh_i((sdp), (bh), __FUNCTION__, __FILE__, __LINE__)
149
150
/* Slab caches shared across the filesystem (defined in util.c) */
extern kmem_cache_t *gfs2_glock_cachep;
extern kmem_cache_t *gfs2_inode_cachep;
extern kmem_cache_t *gfs2_bufdata_cachep;

/* Read one tunable field consistently under the tune spinlock */
static inline unsigned int gfs2_tune_get_i(struct gfs2_tune *gt,
					   unsigned int *p)
{
	unsigned int x;
	spin_lock(&gt->gt_spin);
	x = *p;
	spin_unlock(&gt->gt_spin);
	return x;
}

#define gfs2_tune_get(sdp, field) \
gfs2_tune_get_i(&(sdp)->sd_tune, &(sdp)->sd_tune.field)

/* Flip one bit in a chunked inode bitmap; see util.c */
void gfs2_icbit_munge(struct gfs2_sbd *sdp, unsigned char **bitmap,
		      unsigned int bit, int new_value);
171#endif /* __UTIL_DOT_H__ */
172