-rw-r--r--  CREDITS | 6
-rw-r--r--  Documentation/filesystems/gfs2.txt | 43
-rw-r--r--  MAINTAINERS | 18
-rw-r--r--  fs/Kconfig | 2
-rw-r--r--  fs/Makefile | 2
-rw-r--r--  fs/configfs/item.c | 2
-rw-r--r--  fs/dlm/Kconfig | 21
-rw-r--r--  fs/dlm/Makefile | 19
-rw-r--r--  fs/dlm/ast.c | 172
-rw-r--r--  fs/dlm/ast.h | 26
-rw-r--r--  fs/dlm/config.c | 789
-rw-r--r--  fs/dlm/config.h | 42
-rw-r--r--  fs/dlm/debug_fs.c | 296
-rw-r--r--  fs/dlm/dir.c | 423
-rw-r--r--  fs/dlm/dir.h | 30
-rw-r--r--  fs/dlm/dlm_internal.h | 538
-rw-r--r--  fs/dlm/lock.c | 3839
-rw-r--r--  fs/dlm/lock.h | 61
-rw-r--r--  fs/dlm/lockspace.c | 704
-rw-r--r--  fs/dlm/lockspace.h | 25
-rw-r--r--  fs/dlm/lowcomms.c | 1238
-rw-r--r--  fs/dlm/lowcomms.h | 26
-rw-r--r--  fs/dlm/lvb_table.h | 18
-rw-r--r--  fs/dlm/main.c | 97
-rw-r--r--  fs/dlm/member.c | 312
-rw-r--r--  fs/dlm/member.h | 24
-rw-r--r--  fs/dlm/memory.c | 115
-rw-r--r--  fs/dlm/memory.h | 29
-rw-r--r--  fs/dlm/midcomms.c | 140
-rw-r--r--  fs/dlm/midcomms.h | 21
-rw-r--r--  fs/dlm/rcom.c | 457
-rw-r--r--  fs/dlm/rcom.h | 24
-rw-r--r--  fs/dlm/recover.c | 762
-rw-r--r--  fs/dlm/recover.h | 34
-rw-r--r--  fs/dlm/recoverd.c | 285
-rw-r--r--  fs/dlm/recoverd.h | 24
-rw-r--r--  fs/dlm/requestqueue.c | 184
-rw-r--r--  fs/dlm/requestqueue.h | 22
-rw-r--r--  fs/dlm/user.c | 769
-rw-r--r--  fs/dlm/user.h | 16
-rw-r--r--  fs/dlm/util.c | 161
-rw-r--r--  fs/dlm/util.h | 22
-rw-r--r--  fs/gfs2/Kconfig | 44
-rw-r--r--  fs/gfs2/Makefile | 10
-rw-r--r--  fs/gfs2/acl.c | 313
-rw-r--r--  fs/gfs2/acl.h | 37
-rw-r--r--  fs/gfs2/bmap.c | 1103
-rw-r--r--  fs/gfs2/bmap.h | 32
-rw-r--r--  fs/gfs2/daemon.c | 196
-rw-r--r--  fs/gfs2/daemon.h | 19
-rw-r--r--  fs/gfs2/dir.c | 1975
-rw-r--r--  fs/gfs2/dir.h | 73
-rw-r--r--  fs/gfs2/eaops.c | 230
-rw-r--r--  fs/gfs2/eaops.h | 29
-rw-r--r--  fs/gfs2/eattr.c | 1548
-rw-r--r--  fs/gfs2/eattr.h | 97
-rw-r--r--  fs/gfs2/format.h | 21
-rw-r--r--  fs/gfs2/gfs2.h | 31
-rw-r--r--  fs/gfs2/glock.c | 2279
-rw-r--r--  fs/gfs2/glock.h | 152
-rw-r--r--  fs/gfs2/glops.c | 491
-rw-r--r--  fs/gfs2/glops.h | 23
-rw-r--r--  fs/gfs2/incore.h | 658
-rw-r--r--  fs/gfs2/inode.c | 1354
-rw-r--r--  fs/gfs2/inode.h | 56
-rw-r--r--  fs/gfs2/lm.c | 244
-rw-r--r--  fs/gfs2/lm.h | 41
-rw-r--r--  fs/gfs2/lm_interface.h | 295
-rw-r--r--  fs/gfs2/locking.c | 191
-rw-r--r--  fs/gfs2/locking/dlm/Makefile | 3
-rw-r--r--  fs/gfs2/locking/dlm/lock.c | 541
-rw-r--r--  fs/gfs2/locking/dlm/lock_dlm.h | 188
-rw-r--r--  fs/gfs2/locking/dlm/main.c | 64
-rw-r--r--  fs/gfs2/locking/dlm/mount.c | 256
-rw-r--r--  fs/gfs2/locking/dlm/plock.c | 299
-rw-r--r--  fs/gfs2/locking/dlm/sysfs.c | 225
-rw-r--r--  fs/gfs2/locking/dlm/thread.c | 359
-rw-r--r--  fs/gfs2/locking/nolock/Makefile | 3
-rw-r--r--  fs/gfs2/locking/nolock/main.c | 259
-rw-r--r--  fs/gfs2/log.c | 601
-rw-r--r--  fs/gfs2/log.h | 61
-rw-r--r--  fs/gfs2/lops.c | 800
-rw-r--r--  fs/gfs2/lops.h | 96
-rw-r--r--  fs/gfs2/lvb.c | 45
-rw-r--r--  fs/gfs2/lvb.h | 19
-rw-r--r--  fs/gfs2/main.c | 127
-rw-r--r--  fs/gfs2/meta_io.c | 780
-rw-r--r--  fs/gfs2/meta_io.h | 89
-rw-r--r--  fs/gfs2/mount.c | 214
-rw-r--r--  fs/gfs2/mount.h | 15
-rw-r--r--  fs/gfs2/ondisk.c | 308
-rw-r--r--  fs/gfs2/ops_address.c | 784
-rw-r--r--  fs/gfs2/ops_address.h | 18
-rw-r--r--  fs/gfs2/ops_dentry.c | 123
-rw-r--r--  fs/gfs2/ops_dentry.h | 15
-rw-r--r--  fs/gfs2/ops_export.c | 293
-rw-r--r--  fs/gfs2/ops_export.h | 19
-rw-r--r--  fs/gfs2/ops_file.c | 982
-rw-r--r--  fs/gfs2/ops_file.h | 20
-rw-r--r--  fs/gfs2/ops_fstype.c | 841
-rw-r--r--  fs/gfs2/ops_fstype.h | 16
-rw-r--r--  fs/gfs2/ops_inode.c | 1166
-rw-r--r--  fs/gfs2/ops_inode.h | 18
-rw-r--r--  fs/gfs2/ops_super.c | 472
-rw-r--r--  fs/gfs2/ops_super.h | 15
-rw-r--r--  fs/gfs2/ops_vm.c | 195
-rw-r--r--  fs/gfs2/ops_vm.h | 16
-rw-r--r--  fs/gfs2/page.c | 267
-rw-r--r--  fs/gfs2/page.h | 23
-rw-r--r--  fs/gfs2/quota.c | 1286
-rw-r--r--  fs/gfs2/quota.h | 32
-rw-r--r--  fs/gfs2/recovery.c | 575
-rw-r--r--  fs/gfs2/recovery.h | 32
-rw-r--r--  fs/gfs2/rgrp.c | 1529
-rw-r--r--  fs/gfs2/rgrp.h | 62
-rw-r--r--  fs/gfs2/super.c | 928
-rw-r--r--  fs/gfs2/super.h | 52
-rw-r--r--  fs/gfs2/sys.c | 579
-rw-r--r--  fs/gfs2/sys.h | 24
-rw-r--r--  fs/gfs2/trans.c | 184
-rw-r--r--  fs/gfs2/trans.h | 34
-rw-r--r--  fs/gfs2/util.c | 245
-rw-r--r--  fs/gfs2/util.h | 169
-rw-r--r--  include/linux/Kbuild | 33
-rw-r--r--  include/linux/dlm.h | 302
-rw-r--r--  include/linux/dlm_device.h | 86
-rw-r--r--  include/linux/fs.h | 3
-rw-r--r--  include/linux/gfs2_ondisk.h | 443
-rw-r--r--  include/linux/iflags.h | 102
-rw-r--r--  include/linux/kernel.h | 1
-rw-r--r--  include/linux/lock_dlm_plock.h | 40
-rw-r--r--  mm/filemap.c | 1
-rw-r--r--  mm/readahead.c | 1
133 files changed, 40738 insertions, 20 deletions
diff --git a/CREDITS b/CREDITS
index 29be6d1fdf49..f41e1d2952c9 100644
--- a/CREDITS
+++ b/CREDITS
@@ -3541,11 +3541,11 @@ S: Fargo, North Dakota 58122
 S: USA

 N: Steven Whitehouse
-E: SteveW@ACM.org
+E: steve@chygwyn.com
 W: http://www.chygwyn.com/~steve
-D: Linux DECnet project: http://www.sucs.swan.ac.uk/~rohan/DECnet/index.html
+D: Linux DECnet project
 D: Minor debugging of other networking protocols.
-D: Misc bug fixes and filesystem development
+D: Misc bug fixes and GFS2 filesystem development

 N: Hans-Joachim Widmaier
 E: hjw@zvw.de
diff --git a/Documentation/filesystems/gfs2.txt b/Documentation/filesystems/gfs2.txt
new file mode 100644
index 000000000000..593004b6bbab
--- /dev/null
+++ b/Documentation/filesystems/gfs2.txt
@@ -0,0 +1,43 @@
1Global File System
2------------------
3
4http://sources.redhat.com/cluster/
5
6GFS is a cluster file system. It allows a cluster of computers to
7simultaneously use a block device that is shared between them (with FC,
8iSCSI, NBD, etc). GFS reads and writes to the block device like a local
9file system, but also uses a lock module to allow the computers to coordinate
10their I/O so file system consistency is maintained. One of the nifty
11features of GFS is perfect consistency -- changes made to the file system
12on one machine show up immediately on all other machines in the cluster.
13
14GFS uses interchangeable inter-node locking mechanisms. Different lock
15modules can plug into GFS and each file system selects the appropriate
16lock module at mount time. Lock modules include:
17
18 lock_nolock -- allows gfs to be used as a local file system
19
20 lock_dlm -- uses a distributed lock manager (dlm) for inter-node locking
21 The dlm is found at linux/fs/dlm/
22
23In addition to interfacing with an external locking manager, a gfs lock
24module is responsible for interacting with external cluster management
25systems. Lock_dlm depends on user space cluster management systems found
26at the URL above.
27
28To use gfs as a local file system, no external clustering systems are
29needed, simply:
30
31 $ mkfs -t gfs2 -p lock_nolock -j 1 /dev/block_device
32 $ mount -t gfs2 /dev/block_device /dir
33
34GFS2 is not on-disk compatible with previous versions of GFS.
35
36The following man pages can be found at the URL above:
37 gfs2_fsck to repair a filesystem
38 gfs2_grow to expand a filesystem online
39 gfs2_jadd to add journals to a filesystem online
40 gfs2_tool to manipulate, examine and tune a filesystem
41 gfs2_quota to examine and change quota values in a filesystem
42 mount.gfs2 to help mount(8) mount a filesystem
43 mkfs.gfs2 to make a filesystem
diff --git a/MAINTAINERS b/MAINTAINERS
index e99028ca2f7c..c67c3e338105 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -862,6 +862,16 @@ M: jack@suse.cz
 L: linux-kernel@vger.kernel.org
 S: Maintained

+DISTRIBUTED LOCK MANAGER
+P: Patrick Caulfield
+M: pcaulfie@redhat.com
+P: David Teigland
+M: teigland@redhat.com
+L: cluster-devel@redhat.com
+W: http://sources.redhat.com/cluster/
+T: git kernel.org:/pub/scm/linux/kernel/git/steve/gfs-2.6.git
+S: Supported
+
 DAVICOM FAST ETHERNET (DMFE) NETWORK DRIVER
 P: Tobias Ringstrom
 M: tori@unhappy.mine.nu
@@ -1112,6 +1122,14 @@ M: khc@pm.waw.pl
 W: http://www.kernel.org/pub/linux/utils/net/hdlc/
 S: Maintained

+GFS2 FILE SYSTEM
+P: Steven Whitehouse
+M: swhiteho@redhat.com
+L: cluster-devel@redhat.com
+W: http://sources.redhat.com/cluster/
+T: git kernel.org:/pub/scm/linux/kernel/git/steve/gfs-2.6.git
+S: Supported
+
 GIGASET ISDN DRIVERS
 P: Hansjoerg Lipp
 M: hjlipp@web.de
diff --git a/fs/Kconfig b/fs/Kconfig
index 3f00a9faabcb..ddc7462ddb56 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -323,6 +323,7 @@ config FS_POSIX_ACL
 	default n

 source "fs/xfs/Kconfig"
+source "fs/gfs2/Kconfig"

 config OCFS2_FS
 	tristate "OCFS2 file system support (EXPERIMENTAL)"
@@ -1930,6 +1931,7 @@ source "fs/partitions/Kconfig"
 endmenu

 source "fs/nls/Kconfig"
+source "fs/dlm/Kconfig"

 endmenu

diff --git a/fs/Makefile b/fs/Makefile
index 89135428a539..64df11047ccc 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -50,6 +50,7 @@ obj-$(CONFIG_CONFIGFS_FS) += configfs/
 obj-y				+= devpts/

 obj-$(CONFIG_PROFILING)	+= dcookies.o
+obj-$(CONFIG_DLM)		+= dlm/

 # Do not add any filesystems before this line
 obj-$(CONFIG_REISERFS_FS)	+= reiserfs/
@@ -102,3 +103,4 @@ obj-$(CONFIG_HOSTFS) += hostfs/
 obj-$(CONFIG_HPPFS)		+= hppfs/
 obj-$(CONFIG_DEBUG_FS)		+= debugfs/
 obj-$(CONFIG_OCFS2_FS)		+= ocfs2/
+obj-$(CONFIG_GFS2_FS)		+= gfs2/
diff --git a/fs/configfs/item.c b/fs/configfs/item.c
index e07485ac50ad..24421209f854 100644
--- a/fs/configfs/item.c
+++ b/fs/configfs/item.c
@@ -224,4 +224,4 @@ EXPORT_SYMBOL(config_item_init);
 EXPORT_SYMBOL(config_group_init);
 EXPORT_SYMBOL(config_item_get);
 EXPORT_SYMBOL(config_item_put);
-
+EXPORT_SYMBOL(config_group_find_obj);
diff --git a/fs/dlm/Kconfig b/fs/dlm/Kconfig
new file mode 100644
index 000000000000..490f85b3fa59
--- /dev/null
+++ b/fs/dlm/Kconfig
@@ -0,0 +1,21 @@
1menu "Distributed Lock Manager"
2 depends on INET && EXPERIMENTAL
3
4config DLM
5 tristate "Distributed Lock Manager (DLM)"
6 depends on IPV6 || IPV6=n
7 depends on IP_SCTP
8 select CONFIGFS_FS
9 help
10 A general purpose distributed lock manager for kernel or userspace
11 applications.
12
13config DLM_DEBUG
14 bool "DLM debugging"
15 depends on DLM
16 help
17 Under the debugfs mount point, the name of each lockspace will
18 appear as a file in the "dlm" directory. The output is the
19 list of resources and locks the local node knows about.
20
21endmenu
diff --git a/fs/dlm/Makefile b/fs/dlm/Makefile
new file mode 100644
index 000000000000..1832e0297f7d
--- /dev/null
+++ b/fs/dlm/Makefile
@@ -0,0 +1,19 @@
1obj-$(CONFIG_DLM) += dlm.o
2dlm-y := ast.o \
3 config.o \
4 dir.o \
5 lock.o \
6 lockspace.o \
7 lowcomms.o \
8 main.o \
9 member.o \
10 memory.o \
11 midcomms.o \
12 rcom.o \
13 recover.o \
14 recoverd.o \
15 requestqueue.o \
16 user.o \
17 util.o
18dlm-$(CONFIG_DLM_DEBUG) += debug_fs.o
19
diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c
new file mode 100644
index 000000000000..a211330cbc42
--- /dev/null
+++ b/fs/dlm/ast.c
@@ -0,0 +1,172 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#include "dlm_internal.h"
15#include "lock.h"
16#include "user.h"
17
18#define WAKE_ASTS 0
19
20static struct list_head ast_queue;
21static spinlock_t ast_queue_lock;
22static struct task_struct * astd_task;
23static unsigned long astd_wakeflags;
24static struct mutex astd_running;
25
26
27void dlm_del_ast(struct dlm_lkb *lkb)
28{
29 spin_lock(&ast_queue_lock);
30 if (lkb->lkb_ast_type & (AST_COMP | AST_BAST))
31 list_del(&lkb->lkb_astqueue);
32 spin_unlock(&ast_queue_lock);
33}
34
35void dlm_add_ast(struct dlm_lkb *lkb, int type)
36{
37 if (lkb->lkb_flags & DLM_IFL_USER) {
38 dlm_user_add_ast(lkb, type);
39 return;
40 }
41
42 spin_lock(&ast_queue_lock);
43 if (!(lkb->lkb_ast_type & (AST_COMP | AST_BAST))) {
44 kref_get(&lkb->lkb_ref);
45 list_add_tail(&lkb->lkb_astqueue, &ast_queue);
46 }
47 lkb->lkb_ast_type |= type;
48 spin_unlock(&ast_queue_lock);
49
50 set_bit(WAKE_ASTS, &astd_wakeflags);
51 wake_up_process(astd_task);
52}
53
54static void process_asts(void)
55{
56 struct dlm_ls *ls = NULL;
57 struct dlm_rsb *r = NULL;
58 struct dlm_lkb *lkb;
59 void (*cast) (long param);
60 void (*bast) (long param, int mode);
61 int type = 0, found, bmode;
62
63 for (;;) {
64 found = 0;
65 spin_lock(&ast_queue_lock);
66 list_for_each_entry(lkb, &ast_queue, lkb_astqueue) {
67 r = lkb->lkb_resource;
68 ls = r->res_ls;
69
70 if (dlm_locking_stopped(ls))
71 continue;
72
73 list_del(&lkb->lkb_astqueue);
74 type = lkb->lkb_ast_type;
75 lkb->lkb_ast_type = 0;
76 found = 1;
77 break;
78 }
79 spin_unlock(&ast_queue_lock);
80
81 if (!found)
82 break;
83
84 cast = lkb->lkb_astaddr;
85 bast = lkb->lkb_bastaddr;
86 bmode = lkb->lkb_bastmode;
87
88 if ((type & AST_COMP) && cast)
89 cast(lkb->lkb_astparam);
90
91 /* FIXME: Is it safe to look at lkb_grmode here
92 without doing a lock_rsb() ?
93 Look at other checks in v1 to avoid basts. */
94
95 if ((type & AST_BAST) && bast)
96 if (!dlm_modes_compat(lkb->lkb_grmode, bmode))
97 bast(lkb->lkb_astparam, bmode);
98
99 /* this removes the reference added by dlm_add_ast
100 and may result in the lkb being freed */
101 dlm_put_lkb(lkb);
102
103 schedule();
104 }
105}
106
107static inline int no_asts(void)
108{
109 int ret;
110
111 spin_lock(&ast_queue_lock);
112 ret = list_empty(&ast_queue);
113 spin_unlock(&ast_queue_lock);
114 return ret;
115}
116
117static int dlm_astd(void *data)
118{
119 while (!kthread_should_stop()) {
120 set_current_state(TASK_INTERRUPTIBLE);
121 if (!test_bit(WAKE_ASTS, &astd_wakeflags))
122 schedule();
123 set_current_state(TASK_RUNNING);
124
125 mutex_lock(&astd_running);
126 if (test_and_clear_bit(WAKE_ASTS, &astd_wakeflags))
127 process_asts();
128 mutex_unlock(&astd_running);
129 }
130 return 0;
131}
132
133void dlm_astd_wake(void)
134{
135 if (!no_asts()) {
136 set_bit(WAKE_ASTS, &astd_wakeflags);
137 wake_up_process(astd_task);
138 }
139}
140
141int dlm_astd_start(void)
142{
143 struct task_struct *p;
144 int error = 0;
145
146 INIT_LIST_HEAD(&ast_queue);
147 spin_lock_init(&ast_queue_lock);
148 mutex_init(&astd_running);
149
150 p = kthread_run(dlm_astd, NULL, "dlm_astd");
151 if (IS_ERR(p))
152 error = PTR_ERR(p);
153 else
154 astd_task = p;
155 return error;
156}
157
158void dlm_astd_stop(void)
159{
160 kthread_stop(astd_task);
161}
162
163void dlm_astd_suspend(void)
164{
165 mutex_lock(&astd_running);
166}
167
168void dlm_astd_resume(void)
169{
170 mutex_unlock(&astd_running);
171}
172
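
Note on the file above: the astd_running mutex doubles as a quiescing mechanism, since dlm_astd_suspend() cannot return while process_asts() is mid-delivery. A minimal sketch of how a caller might use the suspend/resume pair (hypothetical caller, not taken from this patch):

	/* Sketch only: quiesce callback delivery around a critical section. */
	dlm_astd_suspend();	/* blocks until any in-flight process_asts() finishes */
	/* ... work that must not race with completion/blocking ASTs ... */
	dlm_astd_resume();	/* let the dlm_astd thread run again */
	dlm_astd_wake();	/* redeliver anything queued while suspended */
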
diff --git a/fs/dlm/ast.h b/fs/dlm/ast.h
new file mode 100644
index 000000000000..6ee276c74c52
--- /dev/null
+++ b/fs/dlm/ast.h
@@ -0,0 +1,26 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13#ifndef __ASTD_DOT_H__
14#define __ASTD_DOT_H__
15
16void dlm_add_ast(struct dlm_lkb *lkb, int type);
17void dlm_del_ast(struct dlm_lkb *lkb);
18
19void dlm_astd_wake(void);
20int dlm_astd_start(void);
21void dlm_astd_stop(void);
22void dlm_astd_suspend(void);
23void dlm_astd_resume(void);
24
25#endif
26
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
new file mode 100644
index 000000000000..88553054bbfa
--- /dev/null
+++ b/fs/dlm/config.c
@@ -0,0 +1,789 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#include <linux/kernel.h>
15#include <linux/module.h>
16#include <linux/configfs.h>
17#include <net/sock.h>
18
19#include "config.h"
20#include "lowcomms.h"
21
22/*
23 * /config/dlm/<cluster>/spaces/<space>/nodes/<node>/nodeid
24 * /config/dlm/<cluster>/spaces/<space>/nodes/<node>/weight
25 * /config/dlm/<cluster>/comms/<comm>/nodeid
26 * /config/dlm/<cluster>/comms/<comm>/local
27 * /config/dlm/<cluster>/comms/<comm>/addr
28 * The <cluster> level is useless, but I haven't figured out how to avoid it.
29 */
30
31static struct config_group *space_list;
32static struct config_group *comm_list;
33static struct comm *local_comm;
34
35struct clusters;
36struct cluster;
37struct spaces;
38struct space;
39struct comms;
40struct comm;
41struct nodes;
42struct node;
43
44static struct config_group *make_cluster(struct config_group *, const char *);
45static void drop_cluster(struct config_group *, struct config_item *);
46static void release_cluster(struct config_item *);
47static struct config_group *make_space(struct config_group *, const char *);
48static void drop_space(struct config_group *, struct config_item *);
49static void release_space(struct config_item *);
50static struct config_item *make_comm(struct config_group *, const char *);
51static void drop_comm(struct config_group *, struct config_item *);
52static void release_comm(struct config_item *);
53static struct config_item *make_node(struct config_group *, const char *);
54static void drop_node(struct config_group *, struct config_item *);
55static void release_node(struct config_item *);
56
57static ssize_t show_comm(struct config_item *i, struct configfs_attribute *a,
58 char *buf);
59static ssize_t store_comm(struct config_item *i, struct configfs_attribute *a,
60 const char *buf, size_t len);
61static ssize_t show_node(struct config_item *i, struct configfs_attribute *a,
62 char *buf);
63static ssize_t store_node(struct config_item *i, struct configfs_attribute *a,
64 const char *buf, size_t len);
65
66static ssize_t comm_nodeid_read(struct comm *cm, char *buf);
67static ssize_t comm_nodeid_write(struct comm *cm, const char *buf, size_t len);
68static ssize_t comm_local_read(struct comm *cm, char *buf);
69static ssize_t comm_local_write(struct comm *cm, const char *buf, size_t len);
70static ssize_t comm_addr_write(struct comm *cm, const char *buf, size_t len);
71static ssize_t node_nodeid_read(struct node *nd, char *buf);
72static ssize_t node_nodeid_write(struct node *nd, const char *buf, size_t len);
73static ssize_t node_weight_read(struct node *nd, char *buf);
74static ssize_t node_weight_write(struct node *nd, const char *buf, size_t len);
75
76enum {
77 COMM_ATTR_NODEID = 0,
78 COMM_ATTR_LOCAL,
79 COMM_ATTR_ADDR,
80};
81
82struct comm_attribute {
83 struct configfs_attribute attr;
84 ssize_t (*show)(struct comm *, char *);
85 ssize_t (*store)(struct comm *, const char *, size_t);
86};
87
88static struct comm_attribute comm_attr_nodeid = {
89 .attr = { .ca_owner = THIS_MODULE,
90 .ca_name = "nodeid",
91 .ca_mode = S_IRUGO | S_IWUSR },
92 .show = comm_nodeid_read,
93 .store = comm_nodeid_write,
94};
95
96static struct comm_attribute comm_attr_local = {
97 .attr = { .ca_owner = THIS_MODULE,
98 .ca_name = "local",
99 .ca_mode = S_IRUGO | S_IWUSR },
100 .show = comm_local_read,
101 .store = comm_local_write,
102};
103
104static struct comm_attribute comm_attr_addr = {
105 .attr = { .ca_owner = THIS_MODULE,
106 .ca_name = "addr",
107 .ca_mode = S_IRUGO | S_IWUSR },
108 .store = comm_addr_write,
109};
110
111static struct configfs_attribute *comm_attrs[] = {
112 [COMM_ATTR_NODEID] = &comm_attr_nodeid.attr,
113 [COMM_ATTR_LOCAL] = &comm_attr_local.attr,
114 [COMM_ATTR_ADDR] = &comm_attr_addr.attr,
115 NULL,
116};
117
118enum {
119 NODE_ATTR_NODEID = 0,
120 NODE_ATTR_WEIGHT,
121};
122
123struct node_attribute {
124 struct configfs_attribute attr;
125 ssize_t (*show)(struct node *, char *);
126 ssize_t (*store)(struct node *, const char *, size_t);
127};
128
129static struct node_attribute node_attr_nodeid = {
130 .attr = { .ca_owner = THIS_MODULE,
131 .ca_name = "nodeid",
132 .ca_mode = S_IRUGO | S_IWUSR },
133 .show = node_nodeid_read,
134 .store = node_nodeid_write,
135};
136
137static struct node_attribute node_attr_weight = {
138 .attr = { .ca_owner = THIS_MODULE,
139 .ca_name = "weight",
140 .ca_mode = S_IRUGO | S_IWUSR },
141 .show = node_weight_read,
142 .store = node_weight_write,
143};
144
145static struct configfs_attribute *node_attrs[] = {
146 [NODE_ATTR_NODEID] = &node_attr_nodeid.attr,
147 [NODE_ATTR_WEIGHT] = &node_attr_weight.attr,
148 NULL,
149};
150
151struct clusters {
152 struct configfs_subsystem subsys;
153};
154
155struct cluster {
156 struct config_group group;
157};
158
159struct spaces {
160 struct config_group ss_group;
161};
162
163struct space {
164 struct config_group group;
165 struct list_head members;
166 struct mutex members_lock;
167 int members_count;
168};
169
170struct comms {
171 struct config_group cs_group;
172};
173
174struct comm {
175 struct config_item item;
176 int nodeid;
177 int local;
178 int addr_count;
179 struct sockaddr_storage *addr[DLM_MAX_ADDR_COUNT];
180};
181
182struct nodes {
183 struct config_group ns_group;
184};
185
186struct node {
187 struct config_item item;
188 struct list_head list; /* space->members */
189 int nodeid;
190 int weight;
191};
192
193static struct configfs_group_operations clusters_ops = {
194 .make_group = make_cluster,
195 .drop_item = drop_cluster,
196};
197
198static struct configfs_item_operations cluster_ops = {
199 .release = release_cluster,
200};
201
202static struct configfs_group_operations spaces_ops = {
203 .make_group = make_space,
204 .drop_item = drop_space,
205};
206
207static struct configfs_item_operations space_ops = {
208 .release = release_space,
209};
210
211static struct configfs_group_operations comms_ops = {
212 .make_item = make_comm,
213 .drop_item = drop_comm,
214};
215
216static struct configfs_item_operations comm_ops = {
217 .release = release_comm,
218 .show_attribute = show_comm,
219 .store_attribute = store_comm,
220};
221
222static struct configfs_group_operations nodes_ops = {
223 .make_item = make_node,
224 .drop_item = drop_node,
225};
226
227static struct configfs_item_operations node_ops = {
228 .release = release_node,
229 .show_attribute = show_node,
230 .store_attribute = store_node,
231};
232
233static struct config_item_type clusters_type = {
234 .ct_group_ops = &clusters_ops,
235 .ct_owner = THIS_MODULE,
236};
237
238static struct config_item_type cluster_type = {
239 .ct_item_ops = &cluster_ops,
240 .ct_owner = THIS_MODULE,
241};
242
243static struct config_item_type spaces_type = {
244 .ct_group_ops = &spaces_ops,
245 .ct_owner = THIS_MODULE,
246};
247
248static struct config_item_type space_type = {
249 .ct_item_ops = &space_ops,
250 .ct_owner = THIS_MODULE,
251};
252
253static struct config_item_type comms_type = {
254 .ct_group_ops = &comms_ops,
255 .ct_owner = THIS_MODULE,
256};
257
258static struct config_item_type comm_type = {
259 .ct_item_ops = &comm_ops,
260 .ct_attrs = comm_attrs,
261 .ct_owner = THIS_MODULE,
262};
263
264static struct config_item_type nodes_type = {
265 .ct_group_ops = &nodes_ops,
266 .ct_owner = THIS_MODULE,
267};
268
269static struct config_item_type node_type = {
270 .ct_item_ops = &node_ops,
271 .ct_attrs = node_attrs,
272 .ct_owner = THIS_MODULE,
273};
274
275static struct cluster *to_cluster(struct config_item *i)
276{
277 return i ? container_of(to_config_group(i), struct cluster, group):NULL;
278}
279
280static struct space *to_space(struct config_item *i)
281{
282 return i ? container_of(to_config_group(i), struct space, group) : NULL;
283}
284
285static struct comm *to_comm(struct config_item *i)
286{
287 return i ? container_of(i, struct comm, item) : NULL;
288}
289
290static struct node *to_node(struct config_item *i)
291{
292 return i ? container_of(i, struct node, item) : NULL;
293}
294
295static struct config_group *make_cluster(struct config_group *g,
296 const char *name)
297{
298 struct cluster *cl = NULL;
299 struct spaces *sps = NULL;
300 struct comms *cms = NULL;
301 void *gps = NULL;
302
303 cl = kzalloc(sizeof(struct cluster), GFP_KERNEL);
304 gps = kcalloc(3, sizeof(struct config_group *), GFP_KERNEL);
305 sps = kzalloc(sizeof(struct spaces), GFP_KERNEL);
306 cms = kzalloc(sizeof(struct comms), GFP_KERNEL);
307
308 if (!cl || !gps || !sps || !cms)
309 goto fail;
310
311 config_group_init_type_name(&cl->group, name, &cluster_type);
312 config_group_init_type_name(&sps->ss_group, "spaces", &spaces_type);
313 config_group_init_type_name(&cms->cs_group, "comms", &comms_type);
314
315 cl->group.default_groups = gps;
316 cl->group.default_groups[0] = &sps->ss_group;
317 cl->group.default_groups[1] = &cms->cs_group;
318 cl->group.default_groups[2] = NULL;
319
320 space_list = &sps->ss_group;
321 comm_list = &cms->cs_group;
322 return &cl->group;
323
324 fail:
325 kfree(cl);
326 kfree(gps);
327 kfree(sps);
328 kfree(cms);
329 return NULL;
330}
331
332static void drop_cluster(struct config_group *g, struct config_item *i)
333{
334 struct cluster *cl = to_cluster(i);
335 struct config_item *tmp;
336 int j;
337
338 for (j = 0; cl->group.default_groups[j]; j++) {
339 tmp = &cl->group.default_groups[j]->cg_item;
340 cl->group.default_groups[j] = NULL;
341 config_item_put(tmp);
342 }
343
344 space_list = NULL;
345 comm_list = NULL;
346
347 config_item_put(i);
348}
349
350static void release_cluster(struct config_item *i)
351{
352 struct cluster *cl = to_cluster(i);
353 kfree(cl->group.default_groups);
354 kfree(cl);
355}
356
357static struct config_group *make_space(struct config_group *g, const char *name)
358{
359 struct space *sp = NULL;
360 struct nodes *nds = NULL;
361 void *gps = NULL;
362
363 sp = kzalloc(sizeof(struct space), GFP_KERNEL);
364 gps = kcalloc(2, sizeof(struct config_group *), GFP_KERNEL);
365 nds = kzalloc(sizeof(struct nodes), GFP_KERNEL);
366
367 if (!sp || !gps || !nds)
368 goto fail;
369
370 config_group_init_type_name(&sp->group, name, &space_type);
371 config_group_init_type_name(&nds->ns_group, "nodes", &nodes_type);
372
373 sp->group.default_groups = gps;
374 sp->group.default_groups[0] = &nds->ns_group;
375 sp->group.default_groups[1] = NULL;
376
377 INIT_LIST_HEAD(&sp->members);
378 mutex_init(&sp->members_lock);
379 sp->members_count = 0;
380 return &sp->group;
381
382 fail:
383 kfree(sp);
384 kfree(gps);
385 kfree(nds);
386 return NULL;
387}
388
389static void drop_space(struct config_group *g, struct config_item *i)
390{
391 struct space *sp = to_space(i);
392 struct config_item *tmp;
393 int j;
394
395 /* assert list_empty(&sp->members) */
396
397 for (j = 0; sp->group.default_groups[j]; j++) {
398 tmp = &sp->group.default_groups[j]->cg_item;
399 sp->group.default_groups[j] = NULL;
400 config_item_put(tmp);
401 }
402
403 config_item_put(i);
404}
405
406static void release_space(struct config_item *i)
407{
408 struct space *sp = to_space(i);
409 kfree(sp->group.default_groups);
410 kfree(sp);
411}
412
413static struct config_item *make_comm(struct config_group *g, const char *name)
414{
415 struct comm *cm;
416
417 cm = kzalloc(sizeof(struct comm), GFP_KERNEL);
418 if (!cm)
419 return NULL;
420
421 config_item_init_type_name(&cm->item, name, &comm_type);
422 cm->nodeid = -1;
423 cm->local = 0;
424 cm->addr_count = 0;
425 return &cm->item;
426}
427
428static void drop_comm(struct config_group *g, struct config_item *i)
429{
430 struct comm *cm = to_comm(i);
431 if (local_comm == cm)
432 local_comm = NULL;
433 dlm_lowcomms_close(cm->nodeid);
434 while (cm->addr_count--)
435 kfree(cm->addr[cm->addr_count]);
436 config_item_put(i);
437}
438
439static void release_comm(struct config_item *i)
440{
441 struct comm *cm = to_comm(i);
442 kfree(cm);
443}
444
445static struct config_item *make_node(struct config_group *g, const char *name)
446{
447 struct space *sp = to_space(g->cg_item.ci_parent);
448 struct node *nd;
449
450 nd = kzalloc(sizeof(struct node), GFP_KERNEL);
451 if (!nd)
452 return NULL;
453
454 config_item_init_type_name(&nd->item, name, &node_type);
455 nd->nodeid = -1;
456 nd->weight = 1; /* default weight of 1 if none is set */
457
458 mutex_lock(&sp->members_lock);
459 list_add(&nd->list, &sp->members);
460 sp->members_count++;
461 mutex_unlock(&sp->members_lock);
462
463 return &nd->item;
464}
465
466static void drop_node(struct config_group *g, struct config_item *i)
467{
468 struct space *sp = to_space(g->cg_item.ci_parent);
469 struct node *nd = to_node(i);
470
471 mutex_lock(&sp->members_lock);
472 list_del(&nd->list);
473 sp->members_count--;
474 mutex_unlock(&sp->members_lock);
475
476 config_item_put(i);
477}
478
479static void release_node(struct config_item *i)
480{
481 struct node *nd = to_node(i);
482 kfree(nd);
483}
484
485static struct clusters clusters_root = {
486 .subsys = {
487 .su_group = {
488 .cg_item = {
489 .ci_namebuf = "dlm",
490 .ci_type = &clusters_type,
491 },
492 },
493 },
494};
495
496int dlm_config_init(void)
497{
498 config_group_init(&clusters_root.subsys.su_group);
499 init_MUTEX(&clusters_root.subsys.su_sem);
500 return configfs_register_subsystem(&clusters_root.subsys);
501}
502
503void dlm_config_exit(void)
504{
505 configfs_unregister_subsystem(&clusters_root.subsys);
506}
507
508/*
509 * Functions for user space to read/write attributes
510 */
511
512static ssize_t show_comm(struct config_item *i, struct configfs_attribute *a,
513 char *buf)
514{
515 struct comm *cm = to_comm(i);
516 struct comm_attribute *cma =
517 container_of(a, struct comm_attribute, attr);
518 return cma->show ? cma->show(cm, buf) : 0;
519}
520
521static ssize_t store_comm(struct config_item *i, struct configfs_attribute *a,
522 const char *buf, size_t len)
523{
524 struct comm *cm = to_comm(i);
525 struct comm_attribute *cma =
526 container_of(a, struct comm_attribute, attr);
527 return cma->store ? cma->store(cm, buf, len) : -EINVAL;
528}
529
530static ssize_t comm_nodeid_read(struct comm *cm, char *buf)
531{
532 return sprintf(buf, "%d\n", cm->nodeid);
533}
534
535static ssize_t comm_nodeid_write(struct comm *cm, const char *buf, size_t len)
536{
537 cm->nodeid = simple_strtol(buf, NULL, 0);
538 return len;
539}
540
541static ssize_t comm_local_read(struct comm *cm, char *buf)
542{
543 return sprintf(buf, "%d\n", cm->local);
544}
545
546static ssize_t comm_local_write(struct comm *cm, const char *buf, size_t len)
547{
548 cm->local= simple_strtol(buf, NULL, 0);
549 if (cm->local && !local_comm)
550 local_comm = cm;
551 return len;
552}
553
554static ssize_t comm_addr_write(struct comm *cm, const char *buf, size_t len)
555{
556 struct sockaddr_storage *addr;
557
558 if (len != sizeof(struct sockaddr_storage))
559 return -EINVAL;
560
561 if (cm->addr_count >= DLM_MAX_ADDR_COUNT)
562 return -ENOSPC;
563
564 addr = kzalloc(sizeof(*addr), GFP_KERNEL);
565 if (!addr)
566 return -ENOMEM;
567
568 memcpy(addr, buf, len);
569 cm->addr[cm->addr_count++] = addr;
570 return len;
571}
572
573static ssize_t show_node(struct config_item *i, struct configfs_attribute *a,
574 char *buf)
575{
576 struct node *nd = to_node(i);
577 struct node_attribute *nda =
578 container_of(a, struct node_attribute, attr);
579 return nda->show ? nda->show(nd, buf) : 0;
580}
581
582static ssize_t store_node(struct config_item *i, struct configfs_attribute *a,
583 const char *buf, size_t len)
584{
585 struct node *nd = to_node(i);
586 struct node_attribute *nda =
587 container_of(a, struct node_attribute, attr);
588 return nda->store ? nda->store(nd, buf, len) : -EINVAL;
589}
590
591static ssize_t node_nodeid_read(struct node *nd, char *buf)
592{
593 return sprintf(buf, "%d\n", nd->nodeid);
594}
595
596static ssize_t node_nodeid_write(struct node *nd, const char *buf, size_t len)
597{
598 nd->nodeid = simple_strtol(buf, NULL, 0);
599 return len;
600}
601
602static ssize_t node_weight_read(struct node *nd, char *buf)
603{
604 return sprintf(buf, "%d\n", nd->weight);
605}
606
607static ssize_t node_weight_write(struct node *nd, const char *buf, size_t len)
608{
609 nd->weight = simple_strtol(buf, NULL, 0);
610 return len;
611}
612
613/*
614 * Functions for the dlm to get the info that's been configured
615 */
616
617static struct space *get_space(char *name)
618{
619 if (!space_list)
620 return NULL;
621 return to_space(config_group_find_obj(space_list, name));
622}
623
624static void put_space(struct space *sp)
625{
626 config_item_put(&sp->group.cg_item);
627}
628
629static struct comm *get_comm(int nodeid, struct sockaddr_storage *addr)
630{
631 struct config_item *i;
632 struct comm *cm = NULL;
633 int found = 0;
634
635 if (!comm_list)
636 return NULL;
637
638 down(&clusters_root.subsys.su_sem);
639
640 list_for_each_entry(i, &comm_list->cg_children, ci_entry) {
641 cm = to_comm(i);
642
643 if (nodeid) {
644 if (cm->nodeid != nodeid)
645 continue;
646 found = 1;
647 break;
648 } else {
649 if (!cm->addr_count ||
650 memcmp(cm->addr[0], addr, sizeof(*addr)))
651 continue;
652 found = 1;
653 break;
654 }
655 }
656 up(&clusters_root.subsys.su_sem);
657
658 if (found)
659 config_item_get(i);
660 else
661 cm = NULL;
662 return cm;
663}
664
665static void put_comm(struct comm *cm)
666{
667 config_item_put(&cm->item);
668}
669
670/* caller must free mem */
671int dlm_nodeid_list(char *lsname, int **ids_out)
672{
673 struct space *sp;
674 struct node *nd;
675 int i = 0, rv = 0;
676 int *ids;
677
678 sp = get_space(lsname);
679 if (!sp)
680 return -EEXIST;
681
682 mutex_lock(&sp->members_lock);
683 if (!sp->members_count) {
684 rv = 0;
685 goto out;
686 }
687
688 ids = kcalloc(sp->members_count, sizeof(int), GFP_KERNEL);
689 if (!ids) {
690 rv = -ENOMEM;
691 goto out;
692 }
693
694 rv = sp->members_count;
695 list_for_each_entry(nd, &sp->members, list)
696 ids[i++] = nd->nodeid;
697
698 if (rv != i)
699 printk("bad nodeid count %d %d\n", rv, i);
700
701 *ids_out = ids;
702 out:
703 mutex_unlock(&sp->members_lock);
704 put_space(sp);
705 return rv;
706}
707
708int dlm_node_weight(char *lsname, int nodeid)
709{
710 struct space *sp;
711 struct node *nd;
712 int w = -EEXIST;
713
714 sp = get_space(lsname);
715 if (!sp)
716 goto out;
717
718 mutex_lock(&sp->members_lock);
719 list_for_each_entry(nd, &sp->members, list) {
720 if (nd->nodeid != nodeid)
721 continue;
722 w = nd->weight;
723 break;
724 }
725 mutex_unlock(&sp->members_lock);
726 put_space(sp);
727 out:
728 return w;
729}
730
731int dlm_nodeid_to_addr(int nodeid, struct sockaddr_storage *addr)
732{
733 struct comm *cm = get_comm(nodeid, NULL);
734 if (!cm)
735 return -EEXIST;
736 if (!cm->addr_count)
737 return -ENOENT;
738 memcpy(addr, cm->addr[0], sizeof(*addr));
739 put_comm(cm);
740 return 0;
741}
742
743int dlm_addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid)
744{
745 struct comm *cm = get_comm(0, addr);
746 if (!cm)
747 return -EEXIST;
748 *nodeid = cm->nodeid;
749 put_comm(cm);
750 return 0;
751}
752
753int dlm_our_nodeid(void)
754{
755 return local_comm ? local_comm->nodeid : 0;
756}
757
758/* num 0 is first addr, num 1 is second addr */
759int dlm_our_addr(struct sockaddr_storage *addr, int num)
760{
761 if (!local_comm)
762 return -1;
763 if (num + 1 > local_comm->addr_count)
764 return -1;
765 memcpy(addr, local_comm->addr[num], sizeof(*addr));
766 return 0;
767}
768
769/* Config file defaults */
770#define DEFAULT_TCP_PORT 21064
771#define DEFAULT_BUFFER_SIZE 4096
772#define DEFAULT_RSBTBL_SIZE 256
773#define DEFAULT_LKBTBL_SIZE 1024
774#define DEFAULT_DIRTBL_SIZE 512
775#define DEFAULT_RECOVER_TIMER 5
776#define DEFAULT_TOSS_SECS 10
777#define DEFAULT_SCAN_SECS 5
778
779struct dlm_config_info dlm_config = {
780 .tcp_port = DEFAULT_TCP_PORT,
781 .buffer_size = DEFAULT_BUFFER_SIZE,
782 .rsbtbl_size = DEFAULT_RSBTBL_SIZE,
783 .lkbtbl_size = DEFAULT_LKBTBL_SIZE,
784 .dirtbl_size = DEFAULT_DIRTBL_SIZE,
785 .recover_timer = DEFAULT_RECOVER_TIMER,
786 .toss_secs = DEFAULT_TOSS_SECS,
787 .scan_secs = DEFAULT_SCAN_SECS
788};
789
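
Note on the file above: user space populates the /config/dlm/<cluster>/ tree described in the comment at the top of config.c (nodeid, weight, local and addr attributes), and the rest of the dlm pulls that data back out through the helpers above. A minimal kernel-side sketch, with a hypothetical lockspace name:

	/* Sketch only: dump the configured members of a lockspace. */
	static void example_dump_members(void)
	{
		char name[] = "alpha";	/* hypothetical lockspace name */
		int *ids = NULL;
		int i, count;

		count = dlm_nodeid_list(name, &ids);	/* member count, or -EEXIST/-ENOMEM */
		for (i = 0; i < count; i++)
			printk("node %d weight %d\n", ids[i],
			       dlm_node_weight(name, ids[i]));
		kfree(ids);	/* dlm_nodeid_list allocates; caller must free */
	}
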
diff --git a/fs/dlm/config.h b/fs/dlm/config.h
new file mode 100644
index 000000000000..9da7839958a9
--- /dev/null
+++ b/fs/dlm/config.h
@@ -0,0 +1,42 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#ifndef __CONFIG_DOT_H__
15#define __CONFIG_DOT_H__
16
17#define DLM_MAX_ADDR_COUNT 3
18
19struct dlm_config_info {
20 int tcp_port;
21 int buffer_size;
22 int rsbtbl_size;
23 int lkbtbl_size;
24 int dirtbl_size;
25 int recover_timer;
26 int toss_secs;
27 int scan_secs;
28};
29
30extern struct dlm_config_info dlm_config;
31
32int dlm_config_init(void);
33void dlm_config_exit(void);
34int dlm_node_weight(char *lsname, int nodeid);
35int dlm_nodeid_list(char *lsname, int **ids_out);
36int dlm_nodeid_to_addr(int nodeid, struct sockaddr_storage *addr);
37int dlm_addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid);
38int dlm_our_nodeid(void);
39int dlm_our_addr(struct sockaddr_storage *addr, int num);
40
41#endif /* __CONFIG_DOT_H__ */
42
diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c
new file mode 100644
index 000000000000..49deca845dba
--- /dev/null
+++ b/fs/dlm/debug_fs.c
@@ -0,0 +1,296 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13#include <linux/pagemap.h>
14#include <linux/seq_file.h>
15#include <linux/module.h>
16#include <linux/ctype.h>
17#include <linux/debugfs.h>
18
19#include "dlm_internal.h"
20
21
22static struct dentry *dlm_root;
23
24struct rsb_iter {
25 int entry;
26 struct dlm_ls *ls;
27 struct list_head *next;
28 struct dlm_rsb *rsb;
29};
30
31static char *print_lockmode(int mode)
32{
33 switch (mode) {
34 case DLM_LOCK_IV:
35 return "--";
36 case DLM_LOCK_NL:
37 return "NL";
38 case DLM_LOCK_CR:
39 return "CR";
40 case DLM_LOCK_CW:
41 return "CW";
42 case DLM_LOCK_PR:
43 return "PR";
44 case DLM_LOCK_PW:
45 return "PW";
46 case DLM_LOCK_EX:
47 return "EX";
48 default:
49 return "??";
50 }
51}
52
53static void print_lock(struct seq_file *s, struct dlm_lkb *lkb,
54 struct dlm_rsb *res)
55{
56 seq_printf(s, "%08x %s", lkb->lkb_id, print_lockmode(lkb->lkb_grmode));
57
58 if (lkb->lkb_status == DLM_LKSTS_CONVERT
59 || lkb->lkb_status == DLM_LKSTS_WAITING)
60 seq_printf(s, " (%s)", print_lockmode(lkb->lkb_rqmode));
61
62 if (lkb->lkb_nodeid) {
63 if (lkb->lkb_nodeid != res->res_nodeid)
64 seq_printf(s, " Remote: %3d %08x", lkb->lkb_nodeid,
65 lkb->lkb_remid);
66 else
67 seq_printf(s, " Master: %08x", lkb->lkb_remid);
68 }
69
70 if (lkb->lkb_wait_type)
71 seq_printf(s, " wait_type: %d", lkb->lkb_wait_type);
72
73 seq_printf(s, "\n");
74}
75
76static int print_resource(struct dlm_rsb *res, struct seq_file *s)
77{
78 struct dlm_lkb *lkb;
79 int i, lvblen = res->res_ls->ls_lvblen;
80
81 seq_printf(s, "\nResource %p Name (len=%d) \"", res, res->res_length);
82 for (i = 0; i < res->res_length; i++) {
83 if (isprint(res->res_name[i]))
84 seq_printf(s, "%c", res->res_name[i]);
85 else
86 seq_printf(s, "%c", '.');
87 }
88 if (res->res_nodeid > 0)
89 seq_printf(s, "\" \nLocal Copy, Master is node %d\n",
90 res->res_nodeid);
91 else if (res->res_nodeid == 0)
92 seq_printf(s, "\" \nMaster Copy\n");
93 else if (res->res_nodeid == -1)
94 seq_printf(s, "\" \nLooking up master (lkid %x)\n",
95 res->res_first_lkid);
96 else
97 seq_printf(s, "\" \nInvalid master %d\n", res->res_nodeid);
98
99 /* Print the LVB: */
100 if (res->res_lvbptr) {
101 seq_printf(s, "LVB: ");
102 for (i = 0; i < lvblen; i++) {
103 if (i == lvblen / 2)
104 seq_printf(s, "\n ");
105 seq_printf(s, "%02x ",
106 (unsigned char) res->res_lvbptr[i]);
107 }
108 if (rsb_flag(res, RSB_VALNOTVALID))
109 seq_printf(s, " (INVALID)");
110 seq_printf(s, "\n");
111 }
112
113 /* Print the locks attached to this resource */
114 seq_printf(s, "Granted Queue\n");
115 list_for_each_entry(lkb, &res->res_grantqueue, lkb_statequeue)
116 print_lock(s, lkb, res);
117
118 seq_printf(s, "Conversion Queue\n");
119 list_for_each_entry(lkb, &res->res_convertqueue, lkb_statequeue)
120 print_lock(s, lkb, res);
121
122 seq_printf(s, "Waiting Queue\n");
123 list_for_each_entry(lkb, &res->res_waitqueue, lkb_statequeue)
124 print_lock(s, lkb, res);
125
126 return 0;
127}
128
129static int rsb_iter_next(struct rsb_iter *ri)
130{
131 struct dlm_ls *ls = ri->ls;
132 int i;
133
134 if (!ri->next) {
135 top:
136 /* Find the next non-empty hash bucket */
137 for (i = ri->entry; i < ls->ls_rsbtbl_size; i++) {
138 read_lock(&ls->ls_rsbtbl[i].lock);
139 if (!list_empty(&ls->ls_rsbtbl[i].list)) {
140 ri->next = ls->ls_rsbtbl[i].list.next;
141 read_unlock(&ls->ls_rsbtbl[i].lock);
142 break;
143 }
144 read_unlock(&ls->ls_rsbtbl[i].lock);
145 }
146 ri->entry = i;
147
148 if (ri->entry >= ls->ls_rsbtbl_size)
149 return 1;
150 } else {
151 i = ri->entry;
152 read_lock(&ls->ls_rsbtbl[i].lock);
153 ri->next = ri->next->next;
154 if (ri->next->next == ls->ls_rsbtbl[i].list.next) {
155 /* End of list - move to next bucket */
156 ri->next = NULL;
157 ri->entry++;
158 read_unlock(&ls->ls_rsbtbl[i].lock);
159 goto top;
160 }
161 read_unlock(&ls->ls_rsbtbl[i].lock);
162 }
163 ri->rsb = list_entry(ri->next, struct dlm_rsb, res_hashchain);
164
165 return 0;
166}
167
168static void rsb_iter_free(struct rsb_iter *ri)
169{
170 kfree(ri);
171}
172
173static struct rsb_iter *rsb_iter_init(struct dlm_ls *ls)
174{
175 struct rsb_iter *ri;
176
177 ri = kmalloc(sizeof *ri, GFP_KERNEL);
178 if (!ri)
179 return NULL;
180
181 ri->ls = ls;
182 ri->entry = 0;
183 ri->next = NULL;
184
185 if (rsb_iter_next(ri)) {
186 rsb_iter_free(ri);
187 return NULL;
188 }
189
190 return ri;
191}
192
193static void *seq_start(struct seq_file *file, loff_t *pos)
194{
195 struct rsb_iter *ri;
196 loff_t n = *pos;
197
198 ri = rsb_iter_init(file->private);
199 if (!ri)
200 return NULL;
201
202 while (n--) {
203 if (rsb_iter_next(ri)) {
204 rsb_iter_free(ri);
205 return NULL;
206 }
207 }
208
209 return ri;
210}
211
212static void *seq_next(struct seq_file *file, void *iter_ptr, loff_t *pos)
213{
214 struct rsb_iter *ri = iter_ptr;
215
216 (*pos)++;
217
218 if (rsb_iter_next(ri)) {
219 rsb_iter_free(ri);
220 return NULL;
221 }
222
223 return ri;
224}
225
226static void seq_stop(struct seq_file *file, void *iter_ptr)
227{
228 /* nothing for now */
229}
230
231static int seq_show(struct seq_file *file, void *iter_ptr)
232{
233 struct rsb_iter *ri = iter_ptr;
234
235 print_resource(ri->rsb, file);
236
237 return 0;
238}
239
240static struct seq_operations dlm_seq_ops = {
241 .start = seq_start,
242 .next = seq_next,
243 .stop = seq_stop,
244 .show = seq_show,
245};
246
247static int do_open(struct inode *inode, struct file *file)
248{
249 struct seq_file *seq;
250 int ret;
251
252 ret = seq_open(file, &dlm_seq_ops);
253 if (ret)
254 return ret;
255
256 seq = file->private_data;
257 seq->private = inode->u.generic_ip;
258
259 return 0;
260}
261
262static struct file_operations dlm_fops = {
263 .owner = THIS_MODULE,
264 .open = do_open,
265 .read = seq_read,
266 .llseek = seq_lseek,
267 .release = seq_release
268};
269
270int dlm_create_debug_file(struct dlm_ls *ls)
271{
272 ls->ls_debug_dentry = debugfs_create_file(ls->ls_name,
273 S_IFREG | S_IRUGO,
274 dlm_root,
275 ls,
276 &dlm_fops);
277 return ls->ls_debug_dentry ? 0 : -ENOMEM;
278}
279
280void dlm_delete_debug_file(struct dlm_ls *ls)
281{
282 if (ls->ls_debug_dentry)
283 debugfs_remove(ls->ls_debug_dentry);
284}
285
286int dlm_register_debugfs(void)
287{
288 dlm_root = debugfs_create_dir("dlm", NULL);
289 return dlm_root ? 0 : -ENOMEM;
290}
291
292void dlm_unregister_debugfs(void)
293{
294 debugfs_remove(dlm_root);
295}
296
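
Note on the file above: with CONFIG_DLM_DEBUG enabled, each lockspace appears as a file in the "dlm" debugfs directory, and its contents come from print_resource()/print_lock() above. A hand-constructed illustration of the output (invented values, not captured from a real system):

	Resource ffff81003f0a7d00 Name (len=9) "example-1"
	Master Copy
	Granted Queue
	00010001 EX
	Conversion Queue
	Waiting Queue
	000201f3 -- (PR) Remote:   2 00030001
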
diff --git a/fs/dlm/dir.c b/fs/dlm/dir.c
new file mode 100644
index 000000000000..46754553fdcc
--- /dev/null
+++ b/fs/dlm/dir.c
@@ -0,0 +1,423 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#include "dlm_internal.h"
15#include "lockspace.h"
16#include "member.h"
17#include "lowcomms.h"
18#include "rcom.h"
19#include "config.h"
20#include "memory.h"
21#include "recover.h"
22#include "util.h"
23#include "lock.h"
24#include "dir.h"
25
26
27static void put_free_de(struct dlm_ls *ls, struct dlm_direntry *de)
28{
29 spin_lock(&ls->ls_recover_list_lock);
30 list_add(&de->list, &ls->ls_recover_list);
31 spin_unlock(&ls->ls_recover_list_lock);
32}
33
34static struct dlm_direntry *get_free_de(struct dlm_ls *ls, int len)
35{
36 int found = 0;
37 struct dlm_direntry *de;
38
39 spin_lock(&ls->ls_recover_list_lock);
40 list_for_each_entry(de, &ls->ls_recover_list, list) {
41 if (de->length == len) {
42 list_del(&de->list);
43 de->master_nodeid = 0;
44 memset(de->name, 0, len);
45 found = 1;
46 break;
47 }
48 }
49 spin_unlock(&ls->ls_recover_list_lock);
50
51 if (!found)
52 de = allocate_direntry(ls, len);
53 return de;
54}
55
56void dlm_clear_free_entries(struct dlm_ls *ls)
57{
58 struct dlm_direntry *de;
59
60 spin_lock(&ls->ls_recover_list_lock);
61 while (!list_empty(&ls->ls_recover_list)) {
62 de = list_entry(ls->ls_recover_list.next, struct dlm_direntry,
63 list);
64 list_del(&de->list);
65 free_direntry(de);
66 }
67 spin_unlock(&ls->ls_recover_list_lock);
68}
69
70/*
71 * We use the upper 16 bits of the hash value to select the directory node.
72 * Low bits are used for distribution of rsb's among hash buckets on each node.
73 *
74 * To give the exact range wanted (0 to num_nodes-1), we apply a modulus of
75 * num_nodes to the hash value. This value in the desired range is used as an
76 * offset into the sorted list of nodeid's to give the particular nodeid.
77 */
78
79int dlm_hash2nodeid(struct dlm_ls *ls, uint32_t hash)
80{
81 struct list_head *tmp;
82 struct dlm_member *memb = NULL;
83 uint32_t node, n = 0;
84 int nodeid;
85
86 if (ls->ls_num_nodes == 1) {
87 nodeid = dlm_our_nodeid();
88 goto out;
89 }
90
91 if (ls->ls_node_array) {
92 node = (hash >> 16) % ls->ls_total_weight;
93 nodeid = ls->ls_node_array[node];
94 goto out;
95 }
96
97 /* make_member_array() failed to kmalloc ls_node_array... */
98
99 node = (hash >> 16) % ls->ls_num_nodes;
100
101 list_for_each(tmp, &ls->ls_nodes) {
102 if (n++ != node)
103 continue;
104 memb = list_entry(tmp, struct dlm_member, list);
105 break;
106 }
107
108 DLM_ASSERT(memb , printk("num_nodes=%u n=%u node=%u\n",
109 ls->ls_num_nodes, n, node););
110 nodeid = memb->nodeid;
111 out:
112 return nodeid;
113}
114
115int dlm_dir_nodeid(struct dlm_rsb *r)
116{
117 return dlm_hash2nodeid(r->res_ls, r->res_hash);
118}
119
120static inline uint32_t dir_hash(struct dlm_ls *ls, char *name, int len)
121{
122 uint32_t val;
123
124 val = jhash(name, len, 0);
125 val &= (ls->ls_dirtbl_size - 1);
126
127 return val;
128}
129
130static void add_entry_to_hash(struct dlm_ls *ls, struct dlm_direntry *de)
131{
132 uint32_t bucket;
133
134 bucket = dir_hash(ls, de->name, de->length);
135 list_add_tail(&de->list, &ls->ls_dirtbl[bucket].list);
136}
137
138static struct dlm_direntry *search_bucket(struct dlm_ls *ls, char *name,
139 int namelen, uint32_t bucket)
140{
141 struct dlm_direntry *de;
142
143 list_for_each_entry(de, &ls->ls_dirtbl[bucket].list, list) {
144 if (de->length == namelen && !memcmp(name, de->name, namelen))
145 goto out;
146 }
147 de = NULL;
148 out:
149 return de;
150}
151
152void dlm_dir_remove_entry(struct dlm_ls *ls, int nodeid, char *name, int namelen)
153{
154 struct dlm_direntry *de;
155 uint32_t bucket;
156
157 bucket = dir_hash(ls, name, namelen);
158
159 write_lock(&ls->ls_dirtbl[bucket].lock);
160
161 de = search_bucket(ls, name, namelen, bucket);
162
163 if (!de) {
164 log_error(ls, "remove fr %u none", nodeid);
165 goto out;
166 }
167
168 if (de->master_nodeid != nodeid) {
169 log_error(ls, "remove fr %u ID %u", nodeid, de->master_nodeid);
170 goto out;
171 }
172
173 list_del(&de->list);
174 free_direntry(de);
175 out:
176 write_unlock(&ls->ls_dirtbl[bucket].lock);
177}
178
179void dlm_dir_clear(struct dlm_ls *ls)
180{
181 struct list_head *head;
182 struct dlm_direntry *de;
183 int i;
184
185 DLM_ASSERT(list_empty(&ls->ls_recover_list), );
186
187 for (i = 0; i < ls->ls_dirtbl_size; i++) {
188 write_lock(&ls->ls_dirtbl[i].lock);
189 head = &ls->ls_dirtbl[i].list;
190 while (!list_empty(head)) {
191 de = list_entry(head->next, struct dlm_direntry, list);
192 list_del(&de->list);
193 put_free_de(ls, de);
194 }
195 write_unlock(&ls->ls_dirtbl[i].lock);
196 }
197}
198
199int dlm_recover_directory(struct dlm_ls *ls)
200{
201 struct dlm_member *memb;
202 struct dlm_direntry *de;
203 char *b, *last_name = NULL;
204 int error = -ENOMEM, last_len, count = 0;
205 uint16_t namelen;
206
207 log_debug(ls, "dlm_recover_directory");
208
209 if (dlm_no_directory(ls))
210 goto out_status;
211
212 dlm_dir_clear(ls);
213
214 last_name = kmalloc(DLM_RESNAME_MAXLEN, GFP_KERNEL);
215 if (!last_name)
216 goto out;
217
218 list_for_each_entry(memb, &ls->ls_nodes, list) {
219 memset(last_name, 0, DLM_RESNAME_MAXLEN);
220 last_len = 0;
221
222 for (;;) {
223 error = dlm_recovery_stopped(ls);
224 if (error)
225 goto out_free;
226
227 error = dlm_rcom_names(ls, memb->nodeid,
228 last_name, last_len);
229 if (error)
230 goto out_free;
231
232 schedule();
233
234 /*
235 * pick namelen/name pairs out of received buffer
236 */
237
238 b = ls->ls_recover_buf + sizeof(struct dlm_rcom);
239
240 for (;;) {
241 memcpy(&namelen, b, sizeof(uint16_t));
242 namelen = be16_to_cpu(namelen);
243 b += sizeof(uint16_t);
244
245 /* namelen of 0xFFFF marks end of names for
246 this node; namelen of 0 marks end of the
247 buffer */
248
249 if (namelen == 0xFFFF)
250 goto done;
251 if (!namelen)
252 break;
253
254 error = -ENOMEM;
255 de = get_free_de(ls, namelen);
256 if (!de)
257 goto out_free;
258
259 de->master_nodeid = memb->nodeid;
260 de->length = namelen;
261 last_len = namelen;
262 memcpy(de->name, b, namelen);
263 memcpy(last_name, b, namelen);
264 b += namelen;
265
266 add_entry_to_hash(ls, de);
267 count++;
268 }
269 }
270 done:
271 ;
272 }
273
274 out_status:
275 error = 0;
276 dlm_set_recover_status(ls, DLM_RS_DIR);
277 log_debug(ls, "dlm_recover_directory %d entries", count);
278 out_free:
279 kfree(last_name);
280 out:
281 dlm_clear_free_entries(ls);
282 return error;
283}
284
285static int get_entry(struct dlm_ls *ls, int nodeid, char *name,
286 int namelen, int *r_nodeid)
287{
288 struct dlm_direntry *de, *tmp;
289 uint32_t bucket;
290
291 bucket = dir_hash(ls, name, namelen);
292
293 write_lock(&ls->ls_dirtbl[bucket].lock);
294 de = search_bucket(ls, name, namelen, bucket);
295 if (de) {
296 *r_nodeid = de->master_nodeid;
297 write_unlock(&ls->ls_dirtbl[bucket].lock);
298 if (*r_nodeid == nodeid)
299 return -EEXIST;
300 return 0;
301 }
302
303 write_unlock(&ls->ls_dirtbl[bucket].lock);
304
305 de = allocate_direntry(ls, namelen);
306 if (!de)
307 return -ENOMEM;
308
309 de->master_nodeid = nodeid;
310 de->length = namelen;
311 memcpy(de->name, name, namelen);
312
313 write_lock(&ls->ls_dirtbl[bucket].lock);
314 tmp = search_bucket(ls, name, namelen, bucket);
315 if (tmp) {
316 free_direntry(de);
317 de = tmp;
318 } else {
319 list_add_tail(&de->list, &ls->ls_dirtbl[bucket].list);
320 }
321 *r_nodeid = de->master_nodeid;
322 write_unlock(&ls->ls_dirtbl[bucket].lock);
323 return 0;
324}
325
326int dlm_dir_lookup(struct dlm_ls *ls, int nodeid, char *name, int namelen,
327 int *r_nodeid)
328{
329 return get_entry(ls, nodeid, name, namelen, r_nodeid);
330}
331
332/* Copy the names of master rsb's into the buffer provided.
333 Only select names whose dir node is the given nodeid. */
334
335void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen,
336 char *outbuf, int outlen, int nodeid)
337{
338 struct list_head *list;
339 struct dlm_rsb *start_r = NULL, *r = NULL;
340 int offset = 0, start_namelen, error, dir_nodeid;
341 char *start_name;
342 uint16_t be_namelen;
343
344 /*
345 * Find the rsb where we left off (or start again)
346 */
347
348 start_namelen = inlen;
349 start_name = inbuf;
350
351 if (start_namelen > 1) {
352 /*
353 * We could also use a find_rsb_root() function here that
354 * searched the ls_root_list.
355 */
356 error = dlm_find_rsb(ls, start_name, start_namelen, R_MASTER,
357 &start_r);
358 DLM_ASSERT(!error && start_r,
359 printk("error %d\n", error););
360 DLM_ASSERT(!list_empty(&start_r->res_root_list),
361 dlm_print_rsb(start_r););
362 dlm_put_rsb(start_r);
363 }
364
365 /*
366 * Send rsb names for rsb's we're master of and whose directory node
367 * matches the requesting node.
368 */
369
370 down_read(&ls->ls_root_sem);
371 if (start_r)
372 list = start_r->res_root_list.next;
373 else
374 list = ls->ls_root_list.next;
375
376 for (offset = 0; list != &ls->ls_root_list; list = list->next) {
377 r = list_entry(list, struct dlm_rsb, res_root_list);
378 if (r->res_nodeid)
379 continue;
380
381 dir_nodeid = dlm_dir_nodeid(r);
382 if (dir_nodeid != nodeid)
383 continue;
384
385 /*
386 * The block ends when we can't fit the following in the
387 * remaining buffer space:
388 * namelen (uint16_t) +
389 * name (r->res_length) +
390 * end-of-block record 0x0000 (uint16_t)
391 */
392
393 if (offset + sizeof(uint16_t)*2 + r->res_length > outlen) {
394 /* Write end-of-block record */
395 be_namelen = 0;
396 memcpy(outbuf + offset, &be_namelen, sizeof(uint16_t));
397 offset += sizeof(uint16_t);
398 goto out;
399 }
400
401 be_namelen = cpu_to_be16(r->res_length);
402 memcpy(outbuf + offset, &be_namelen, sizeof(uint16_t));
403 offset += sizeof(uint16_t);
404 memcpy(outbuf + offset, r->res_name, r->res_length);
405 offset += r->res_length;
406 }
407
408 /*
409 * If we've reached the end of the list (and there's room) write a
410 * terminating record.
411 */
412
413 if ((list == &ls->ls_root_list) &&
414 (offset + sizeof(uint16_t) <= outlen)) {
415 be_namelen = 0xFFFF;
416 memcpy(outbuf + offset, &be_namelen, sizeof(uint16_t));
417 offset += sizeof(uint16_t);
418 }
419
420 out:
421 up_read(&ls->ls_root_sem);
422}
423
diff --git a/fs/dlm/dir.h b/fs/dlm/dir.h
new file mode 100644
index 000000000000..0b0eb1267b6e
--- /dev/null
+++ b/fs/dlm/dir.h
@@ -0,0 +1,30 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#ifndef __DIR_DOT_H__
15#define __DIR_DOT_H__
16
17
18int dlm_dir_nodeid(struct dlm_rsb *rsb);
19int dlm_hash2nodeid(struct dlm_ls *ls, uint32_t hash);
20void dlm_dir_remove_entry(struct dlm_ls *ls, int nodeid, char *name, int len);
21void dlm_dir_clear(struct dlm_ls *ls);
22void dlm_clear_free_entries(struct dlm_ls *ls);
23int dlm_recover_directory(struct dlm_ls *ls);
24int dlm_dir_lookup(struct dlm_ls *ls, int nodeid, char *name, int namelen,
25 int *r_nodeid);
26void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen,
27 char *outbuf, int outlen, int nodeid);
28
29#endif /* __DIR_DOT_H__ */
30
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
new file mode 100644
index 000000000000..db080de2a7e9
--- /dev/null
+++ b/fs/dlm/dlm_internal.h
@@ -0,0 +1,538 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#ifndef __DLM_INTERNAL_DOT_H__
15#define __DLM_INTERNAL_DOT_H__
16
17/*
18 * This is the main header file to be included in each DLM source file.
19 */
20
21#include <linux/module.h>
22#include <linux/slab.h>
23#include <linux/sched.h>
24#include <linux/types.h>
25#include <linux/ctype.h>
26#include <linux/spinlock.h>
27#include <linux/vmalloc.h>
28#include <linux/list.h>
29#include <linux/errno.h>
30#include <linux/random.h>
31#include <linux/delay.h>
32#include <linux/socket.h>
33#include <linux/kthread.h>
34#include <linux/kobject.h>
35#include <linux/kref.h>
36#include <linux/kernel.h>
37#include <linux/jhash.h>
38#include <linux/miscdevice.h>
39#include <linux/mutex.h>
40#include <asm/semaphore.h>
41#include <asm/uaccess.h>
42
43#include <linux/dlm.h>
44
45#define DLM_LOCKSPACE_LEN 64
46
47/* Size of the temp buffer midcomms allocates on the stack.
48 We try to make this large enough so most messages fit.
49 FIXME: should sctp make this unnecessary? */
50
51#define DLM_INBUF_LEN 148
52
53struct dlm_ls;
54struct dlm_lkb;
55struct dlm_rsb;
56struct dlm_member;
57struct dlm_lkbtable;
58struct dlm_rsbtable;
59struct dlm_dirtable;
60struct dlm_direntry;
61struct dlm_recover;
62struct dlm_header;
63struct dlm_message;
64struct dlm_rcom;
65struct dlm_mhandle;
66
67#define log_print(fmt, args...) \
68 printk(KERN_ERR "dlm: "fmt"\n" , ##args)
69#define log_error(ls, fmt, args...) \
70 printk(KERN_ERR "dlm: %s: " fmt "\n", (ls)->ls_name , ##args)
71
72#define DLM_LOG_DEBUG
73#ifdef DLM_LOG_DEBUG
74#define log_debug(ls, fmt, args...) log_error(ls, fmt, ##args)
75#else
76#define log_debug(ls, fmt, args...)
77#endif
78
79#define DLM_ASSERT(x, do) \
80{ \
81 if (!(x)) \
82 { \
83 printk(KERN_ERR "\nDLM: Assertion failed on line %d of file %s\n" \
84 "DLM: assertion: \"%s\"\n" \
85 "DLM: time = %lu\n", \
86 __LINE__, __FILE__, #x, jiffies); \
87 {do} \
88 printk("\n"); \
89 BUG(); \
90 panic("DLM: Record message above and reboot.\n"); \
91 } \
92}
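/* Illustrative usage only (a sketch mirroring calls made elsewhere in this
 * patch): the second argument is a statement block run before BUG()/panic(),
 * typically a debug dump of the object that failed the check, e.g.
 *
 *	DLM_ASSERT(r->res_nodeid >= 0, dlm_print_rsb(r););
 *
 * or an empty block when there is nothing useful to print:
 *
 *	DLM_ASSERT(list_empty(&r->res_lookup),);
 */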
93
94
95struct dlm_direntry {
96 struct list_head list;
97 uint32_t master_nodeid;
98 uint16_t length;
99 char name[1];
100};
101
102struct dlm_dirtable {
103 struct list_head list;
104 rwlock_t lock;
105};
106
107struct dlm_rsbtable {
108 struct list_head list;
109 struct list_head toss;
110 rwlock_t lock;
111};
112
113struct dlm_lkbtable {
114 struct list_head list;
115 rwlock_t lock;
116 uint16_t counter;
117};
118
119/*
120 * Lockspace member (per node in a ls)
121 */
122
123struct dlm_member {
124 struct list_head list;
125 int nodeid;
126 int weight;
127};
128
129/*
130 * Save and manage recovery state for a lockspace.
131 */
132
133struct dlm_recover {
134 struct list_head list;
135 int *nodeids;
136 int node_count;
137 uint64_t seq;
138};
139
140/*
141 * Pass input args to second stage locking function.
142 */
143
144struct dlm_args {
145 uint32_t flags;
146 void *astaddr;
147 long astparam;
148 void *bastaddr;
149 int mode;
150 struct dlm_lksb *lksb;
151};
152
153
154/*
155 * Lock block
156 *
157 * A lock can be one of three types:
158 *
159 * local copy lock is mastered locally
160 * (lkb_nodeid is zero and DLM_LKF_MSTCPY is not set)
161 * process copy lock is mastered on a remote node
162 * (lkb_nodeid is non-zero and DLM_LKF_MSTCPY is not set)
163 * master copy master node's copy of a lock owned by remote node
164 * (lkb_nodeid is non-zero and DLM_LKF_MSTCPY is set)
165 *
166 * lkb_exflags: a copy of the most recent flags arg provided to dlm_lock or
167 * dlm_unlock. The dlm does not modify these or use any private flags in
168 * this field; it only contains DLM_LKF_ flags from dlm.h. These flags
169 * are sent as-is to the remote master when the lock is remote.
170 *
171 * lkb_flags: internal dlm flags (DLM_IFL_ prefix) from dlm_internal.h.
172 * Some internal flags are shared between the master and process nodes;
173 * these shared flags are kept in the lower two bytes. One of these
174 * flags set on the master copy will be propagated to the process copy
175 * and v.v. Other internal flags are private to the master or process
176 * node (e.g. DLM_IFL_MSTCPY). These are kept in the high two bytes.
177 *
178 * lkb_sbflags: status block flags. These flags are copied directly into
179 * the caller's lksb.sb_flags prior to the dlm_lock/dlm_unlock completion
180 * ast. All defined in dlm.h with DLM_SBF_ prefix.
181 *
182 * lkb_status: the lock status indicates which rsb queue the lock is
183 * on, grant, convert, or wait. DLM_LKSTS_ WAITING/GRANTED/CONVERT
184 *
185 * lkb_wait_type: the dlm message type (DLM_MSG_ prefix) for which a
186 * reply is needed. Only set when the lkb is on the lockspace waiters
187 * list awaiting a reply from a remote node.
188 *
189 * lkb_nodeid: when the lkb is a local copy, nodeid is 0; when the lkb
190 * is a master copy, nodeid specifies the remote lock holder; when the
191 * lkb is a process copy, the nodeid specifies the lock master.
192 */
193
194/* lkb_ast_type */
195
196#define AST_COMP 1
197#define AST_BAST 2
198
199/* lkb_status */
200
201#define DLM_LKSTS_WAITING 1
202#define DLM_LKSTS_GRANTED 2
203#define DLM_LKSTS_CONVERT 3
204
205/* lkb_flags */
206
207#define DLM_IFL_MSTCPY 0x00010000
208#define DLM_IFL_RESEND 0x00020000
209#define DLM_IFL_DEAD 0x00040000
210#define DLM_IFL_USER 0x00000001
211#define DLM_IFL_ORPHAN 0x00000002
212
213struct dlm_lkb {
214 struct dlm_rsb *lkb_resource; /* the rsb */
215 struct kref lkb_ref;
216 int lkb_nodeid; /* copied from rsb */
217 int lkb_ownpid; /* pid of lock owner */
218 uint32_t lkb_id; /* our lock ID */
219 uint32_t lkb_remid; /* lock ID on remote partner */
220 uint32_t lkb_exflags; /* external flags from caller */
221 uint32_t lkb_sbflags; /* lksb flags */
222 uint32_t lkb_flags; /* internal flags */
223 uint32_t lkb_lvbseq; /* lvb sequence number */
224
225 int8_t lkb_status; /* granted, waiting, convert */
226 int8_t lkb_rqmode; /* requested lock mode */
227 int8_t lkb_grmode; /* granted lock mode */
228 int8_t lkb_bastmode; /* requested mode */
229 int8_t lkb_highbast; /* highest mode bast sent for */
230
231 int8_t lkb_wait_type; /* type of reply waiting for */
232 int8_t lkb_ast_type; /* type of ast queued for */
233
234 struct list_head lkb_idtbl_list; /* lockspace lkbtbl */
235 struct list_head lkb_statequeue; /* rsb g/c/w list */
236 struct list_head lkb_rsb_lookup; /* waiting for rsb lookup */
237 struct list_head lkb_wait_reply; /* waiting for remote reply */
238 struct list_head lkb_astqueue; /* need ast to be sent */
239 struct list_head lkb_ownqueue; /* list of locks for a process */
240
241 char *lkb_lvbptr;
242 struct dlm_lksb *lkb_lksb; /* caller's status block */
243 void *lkb_astaddr; /* caller's ast function */
244 void *lkb_bastaddr; /* caller's bast function */
245 long lkb_astparam; /* caller's ast arg */
246};
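/* For illustration only, not part of this header: the three lkb types
 * described in the comment above can be told apart from lkb_nodeid and the
 * DLM_IFL_MSTCPY flag alone. A hypothetical helper:
 */
static inline const char *lkb_copy_type(const struct dlm_lkb *lkb)
{
	if (!lkb->lkb_nodeid)
		return "local copy";	/* mastered on this node */
	if (lkb->lkb_flags & DLM_IFL_MSTCPY)
		return "master copy";	/* our copy of a remote node's lock */
	return "process copy";		/* lock mastered on a remote node */
}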
247
248
249struct dlm_rsb {
250 struct dlm_ls *res_ls; /* the lockspace */
251 struct kref res_ref;
252 struct mutex res_mutex;
253 unsigned long res_flags;
254 int res_length; /* length of rsb name */
255 int res_nodeid;
256 uint32_t res_lvbseq;
257 uint32_t res_hash;
258 uint32_t res_bucket; /* rsbtbl */
259 unsigned long res_toss_time;
260 uint32_t res_first_lkid;
261 struct list_head res_lookup; /* lkbs waiting on first */
262 struct list_head res_hashchain; /* rsbtbl */
263 struct list_head res_grantqueue;
264 struct list_head res_convertqueue;
265 struct list_head res_waitqueue;
266
267 struct list_head res_root_list; /* used for recovery */
268 struct list_head res_recover_list; /* used for recovery */
269 int res_recover_locks_count;
270
271 char *res_lvbptr;
272 char res_name[1];
273};
274
275/* find_rsb() flags */
276
277#define R_MASTER 1 /* only return rsb if it's a master */
278#define R_CREATE 2 /* create/add rsb if not found */
279
280/* rsb_flags */
281
282enum rsb_flags {
283 RSB_MASTER_UNCERTAIN,
284 RSB_VALNOTVALID,
285 RSB_VALNOTVALID_PREV,
286 RSB_NEW_MASTER,
287 RSB_NEW_MASTER2,
288 RSB_RECOVER_CONVERT,
289 RSB_LOCKS_PURGED,
290};
291
292static inline void rsb_set_flag(struct dlm_rsb *r, enum rsb_flags flag)
293{
294 __set_bit(flag, &r->res_flags);
295}
296
297static inline void rsb_clear_flag(struct dlm_rsb *r, enum rsb_flags flag)
298{
299 __clear_bit(flag, &r->res_flags);
300}
301
302static inline int rsb_flag(struct dlm_rsb *r, enum rsb_flags flag)
303{
304 return test_bit(flag, &r->res_flags);
305}
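/* A hypothetical caller, just to illustrate the helpers above; the real
 * users are in lock.c (e.g. set_lvb_lock()/set_lvb_unlock()):
 */
static inline void rsb_flag_example(struct dlm_rsb *r)
{
	/* mark the cached LVB as suspect, then test and clear the flag */
	rsb_set_flag(r, RSB_VALNOTVALID);
	if (rsb_flag(r, RSB_VALNOTVALID))
		rsb_clear_flag(r, RSB_VALNOTVALID);
}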
306
307
308/* dlm_header is first element of all structs sent between nodes */
309
310#define DLM_HEADER_MAJOR 0x00020000
311#define DLM_HEADER_MINOR 0x00000001
312
313#define DLM_MSG 1
314#define DLM_RCOM 2
315
316struct dlm_header {
317 uint32_t h_version;
318 uint32_t h_lockspace;
319 uint32_t h_nodeid; /* nodeid of sender */
320 uint16_t h_length;
321 uint8_t h_cmd; /* DLM_MSG, DLM_RCOM */
322 uint8_t h_pad;
323};
324
325
326#define DLM_MSG_REQUEST 1
327#define DLM_MSG_CONVERT 2
328#define DLM_MSG_UNLOCK 3
329#define DLM_MSG_CANCEL 4
330#define DLM_MSG_REQUEST_REPLY 5
331#define DLM_MSG_CONVERT_REPLY 6
332#define DLM_MSG_UNLOCK_REPLY 7
333#define DLM_MSG_CANCEL_REPLY 8
334#define DLM_MSG_GRANT 9
335#define DLM_MSG_BAST 10
336#define DLM_MSG_LOOKUP 11
337#define DLM_MSG_REMOVE 12
338#define DLM_MSG_LOOKUP_REPLY 13
339
340struct dlm_message {
341 struct dlm_header m_header;
342 uint32_t m_type; /* DLM_MSG_ */
343 uint32_t m_nodeid;
344 uint32_t m_pid;
345 uint32_t m_lkid; /* lkid on sender */
346 uint32_t m_remid; /* lkid on receiver */
347 uint32_t m_parent_lkid;
348 uint32_t m_parent_remid;
349 uint32_t m_exflags;
350 uint32_t m_sbflags;
351 uint32_t m_flags;
352 uint32_t m_lvbseq;
353 uint32_t m_hash;
354 int m_status;
355 int m_grmode;
356 int m_rqmode;
357 int m_bastmode;
358 int m_asts;
359 int m_result; /* 0 or -EXXX */
360 char m_extra[0]; /* name or lvb */
361};
362
363
364#define DLM_RS_NODES 0x00000001
365#define DLM_RS_NODES_ALL 0x00000002
366#define DLM_RS_DIR 0x00000004
367#define DLM_RS_DIR_ALL 0x00000008
368#define DLM_RS_LOCKS 0x00000010
369#define DLM_RS_LOCKS_ALL 0x00000020
370#define DLM_RS_DONE 0x00000040
371#define DLM_RS_DONE_ALL 0x00000080
372
373#define DLM_RCOM_STATUS 1
374#define DLM_RCOM_NAMES 2
375#define DLM_RCOM_LOOKUP 3
376#define DLM_RCOM_LOCK 4
377#define DLM_RCOM_STATUS_REPLY 5
378#define DLM_RCOM_NAMES_REPLY 6
379#define DLM_RCOM_LOOKUP_REPLY 7
380#define DLM_RCOM_LOCK_REPLY 8
381
382struct dlm_rcom {
383 struct dlm_header rc_header;
384 uint32_t rc_type; /* DLM_RCOM_ */
385 int rc_result; /* multi-purpose */
386 uint64_t rc_id; /* match reply with request */
387 char rc_buf[0];
388};
389
390struct rcom_config {
391 uint32_t rf_lvblen;
392 uint32_t rf_lsflags;
393 uint64_t rf_unused;
394};
395
396struct rcom_lock {
397 uint32_t rl_ownpid;
398 uint32_t rl_lkid;
399 uint32_t rl_remid;
400 uint32_t rl_parent_lkid;
401 uint32_t rl_parent_remid;
402 uint32_t rl_exflags;
403 uint32_t rl_flags;
404 uint32_t rl_lvbseq;
405 int rl_result;
406 int8_t rl_rqmode;
407 int8_t rl_grmode;
408 int8_t rl_status;
409 int8_t rl_asts;
410 uint16_t rl_wait_type;
411 uint16_t rl_namelen;
412 char rl_name[DLM_RESNAME_MAXLEN];
413 char rl_lvb[0];
414};
415
416struct dlm_ls {
417 struct list_head ls_list; /* list of lockspaces */
418 dlm_lockspace_t *ls_local_handle;
419 uint32_t ls_global_id; /* global unique lockspace ID */
420 uint32_t ls_exflags;
421 int ls_lvblen;
422 int ls_count; /* reference count */
423 unsigned long ls_flags; /* LSFL_ */
424 struct kobject ls_kobj;
425
426 struct dlm_rsbtable *ls_rsbtbl;
427 uint32_t ls_rsbtbl_size;
428
429 struct dlm_lkbtable *ls_lkbtbl;
430 uint32_t ls_lkbtbl_size;
431
432 struct dlm_dirtable *ls_dirtbl;
433 uint32_t ls_dirtbl_size;
434
435 struct mutex ls_waiters_mutex;
436 struct list_head ls_waiters; /* lkbs needing a reply */
437
438 struct list_head ls_nodes; /* current nodes in ls */
439 struct list_head ls_nodes_gone; /* dead node list, recovery */
440 int ls_num_nodes; /* number of nodes in ls */
441 int ls_low_nodeid;
442 int ls_total_weight;
443 int *ls_node_array;
444
445 struct dlm_rsb ls_stub_rsb; /* for returning errors */
446 struct dlm_lkb ls_stub_lkb; /* for returning errors */
447 struct dlm_message ls_stub_ms; /* for faking a reply */
448
449 struct dentry *ls_debug_dentry; /* debugfs */
450
451 wait_queue_head_t ls_uevent_wait; /* user part of join/leave */
452 int ls_uevent_result;
453
454 struct miscdevice ls_device;
455
456 /* recovery related */
457
458 struct timer_list ls_timer;
459 struct task_struct *ls_recoverd_task;
460 struct mutex ls_recoverd_active;
461 spinlock_t ls_recover_lock;
462 uint32_t ls_recover_status; /* DLM_RS_ */
463 uint64_t ls_recover_seq;
464 struct dlm_recover *ls_recover_args;
465 struct rw_semaphore ls_in_recovery; /* block local requests */
466 struct list_head ls_requestqueue;/* queue remote requests */
467 struct mutex ls_requestqueue_mutex;
468 char *ls_recover_buf;
469 struct list_head ls_recover_list;
470 spinlock_t ls_recover_list_lock;
471 int ls_recover_list_count;
472 wait_queue_head_t ls_wait_general;
473 struct mutex ls_clear_proc_locks;
474
475 struct list_head ls_root_list; /* root resources */
476 struct rw_semaphore ls_root_sem; /* protect root_list */
477
478 int ls_namelen;
479 char ls_name[1];
480};
481
482#define LSFL_WORK 0
483#define LSFL_RUNNING 1
484#define LSFL_RECOVERY_STOP 2
485#define LSFL_RCOM_READY 3
486#define LSFL_UEVENT_WAIT 4
487
488/* much of this is just saving user space pointers associated with the
489 lock that we pass back to the user lib with an ast */
490
491struct dlm_user_args {
492 struct dlm_user_proc *proc; /* each process that opens the lockspace
493 device has private data
494 (dlm_user_proc) on the struct file,
495 the process's locks point back to it*/
496 struct dlm_lksb lksb;
497 int old_mode;
498 int update_user_lvb;
499 struct dlm_lksb __user *user_lksb;
500 void __user *castparam;
501 void __user *castaddr;
502 void __user *bastparam;
503 void __user *bastaddr;
504};
505
506#define DLM_PROC_FLAGS_CLOSING 1
507#define DLM_PROC_FLAGS_COMPAT 2
508
509/* locks list is kept so we can remove all a process's locks when it
510 exits (or orphan those that are persistent) */
511
512struct dlm_user_proc {
513 dlm_lockspace_t *lockspace;
514 unsigned long flags; /* DLM_PROC_FLAGS */
515 struct list_head asts;
516 spinlock_t asts_spin;
517 struct list_head locks;
518 spinlock_t locks_spin;
519 wait_queue_head_t wait;
520};
521
522static inline int dlm_locking_stopped(struct dlm_ls *ls)
523{
524 return !test_bit(LSFL_RUNNING, &ls->ls_flags);
525}
526
527static inline int dlm_recovery_stopped(struct dlm_ls *ls)
528{
529 return test_bit(LSFL_RECOVERY_STOP, &ls->ls_flags);
530}
531
532static inline int dlm_no_directory(struct dlm_ls *ls)
533{
534 return (ls->ls_exflags & DLM_LSFL_NODIR) ? 1 : 0;
535}
536
537#endif /* __DLM_INTERNAL_DOT_H__ */
538
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
new file mode 100644
index 000000000000..4e222f873b6c
--- /dev/null
+++ b/fs/dlm/lock.c
@@ -0,0 +1,3839 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13/* Central locking logic has four stages:
14
15 dlm_lock()
16 dlm_unlock()
17
18 request_lock(ls, lkb)
19 convert_lock(ls, lkb)
20 unlock_lock(ls, lkb)
21 cancel_lock(ls, lkb)
22
23 _request_lock(r, lkb)
24 _convert_lock(r, lkb)
25 _unlock_lock(r, lkb)
26 _cancel_lock(r, lkb)
27
28 do_request(r, lkb)
29 do_convert(r, lkb)
30 do_unlock(r, lkb)
31 do_cancel(r, lkb)
32
33 Stage 1 (lock, unlock) is mainly about checking input args and
34 splitting into one of the four main operations:
35
36 dlm_lock = request_lock
37 dlm_lock+CONVERT = convert_lock
38 dlm_unlock = unlock_lock
39 dlm_unlock+CANCEL = cancel_lock
40
41 Stage 2, xxxx_lock(), just finds and locks the relevant rsb which is
42 provided to the next stage.
43
44 Stage 3, _xxxx_lock(), determines if the operation is local or remote.
45 When remote, it calls send_xxxx(), when local it calls do_xxxx().
46
47 Stage 4, do_xxxx(), is the guts of the operation. It manipulates the
48 given rsb and lkb and queues callbacks.
49
50 For remote operations, send_xxxx() results in the corresponding do_xxxx()
51 function being executed on the remote node. The connecting send/receive
52 calls on local (L) and remote (R) nodes:
53
54 L: send_xxxx() -> R: receive_xxxx()
55 R: do_xxxx()
56 L: receive_xxxx_reply() <- R: send_xxxx_reply()
57*/
58#include <linux/types.h>
59#include "dlm_internal.h"
60#include <linux/dlm_device.h>
61#include "memory.h"
62#include "lowcomms.h"
63#include "requestqueue.h"
64#include "util.h"
65#include "dir.h"
66#include "member.h"
67#include "lockspace.h"
68#include "ast.h"
69#include "lock.h"
70#include "rcom.h"
71#include "recover.h"
72#include "lvb_table.h"
73#include "user.h"
74#include "config.h"
75
76static int send_request(struct dlm_rsb *r, struct dlm_lkb *lkb);
77static int send_convert(struct dlm_rsb *r, struct dlm_lkb *lkb);
78static int send_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb);
79static int send_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb);
80static int send_grant(struct dlm_rsb *r, struct dlm_lkb *lkb);
81static int send_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int mode);
82static int send_lookup(struct dlm_rsb *r, struct dlm_lkb *lkb);
83static int send_remove(struct dlm_rsb *r);
84static int _request_lock(struct dlm_rsb *r, struct dlm_lkb *lkb);
85static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
86 struct dlm_message *ms);
87static int receive_extralen(struct dlm_message *ms);
88
89#define FAKE_USER_AST (void*)0xff00ff00
90
91/*
92 * Lock compatibility matrix - thanks Steve
93 * UN = Unlocked state. Not really a state, used as a flag
94 * PD = Padding. Used to make the matrix a nice power of two in size
95 * Other states are the same as the VMS DLM.
96 * Usage: matrix[grmode+1][rqmode+1] (although m[rq+1][gr+1] is the same)
97 */
98
99static const int __dlm_compat_matrix[8][8] = {
100 /* UN NL CR CW PR PW EX PD */
101 {1, 1, 1, 1, 1, 1, 1, 0}, /* UN */
102 {1, 1, 1, 1, 1, 1, 1, 0}, /* NL */
103 {1, 1, 1, 1, 1, 1, 0, 0}, /* CR */
104 {1, 1, 1, 1, 0, 0, 0, 0}, /* CW */
105 {1, 1, 1, 0, 1, 0, 0, 0}, /* PR */
106 {1, 1, 1, 0, 0, 0, 0, 0}, /* PW */
107 {1, 1, 0, 0, 0, 0, 0, 0}, /* EX */
108 {0, 0, 0, 0, 0, 0, 0, 0} /* PD */
109};
110
111/*
112 * This defines the direction of transfer of LVB data.
113 * Granted mode is the row; requested mode is the column.
114 * Usage: matrix[grmode+1][rqmode+1]
115 * 1 = LVB is returned to the caller
116 * 0 = LVB is written to the resource
117 * -1 = nothing happens to the LVB
118 */
119
120const int dlm_lvb_operations[8][8] = {
121 /* UN NL CR CW PR PW EX PD*/
122 { -1, 1, 1, 1, 1, 1, 1, -1 }, /* UN */
123 { -1, 1, 1, 1, 1, 1, 1, 0 }, /* NL */
124 { -1, -1, 1, 1, 1, 1, 1, 0 }, /* CR */
125 { -1, -1, -1, 1, 1, 1, 1, 0 }, /* CW */
126 { -1, -1, -1, -1, 1, 1, 1, 0 }, /* PR */
127 { -1, 0, 0, 0, 0, 0, 1, 0 }, /* PW */
128 { -1, 0, 0, 0, 0, 0, 0, 0 }, /* EX */
129 { -1, 0, 0, 0, 0, 0, 0, 0 } /* PD */
130};
131EXPORT_SYMBOL_GPL(dlm_lvb_operations);
132
133#define modes_compat(gr, rq) \
134 __dlm_compat_matrix[(gr)->lkb_grmode + 1][(rq)->lkb_rqmode + 1]
135
136int dlm_modes_compat(int mode1, int mode2)
137{
138 return __dlm_compat_matrix[mode1 + 1][mode2 + 1];
139}
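/* Reading the matrix above (illustrative examples): two protected-read
 * locks are compatible with each other, protected read and exclusive
 * are not:
 *
 *	dlm_modes_compat(DLM_LOCK_PR, DLM_LOCK_PR) == 1
 *	dlm_modes_compat(DLM_LOCK_PR, DLM_LOCK_EX) == 0
 */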
140
141/*
142 * Compatibility matrix for conversions with QUECVT set.
143 * Granted mode is the row; requested mode is the column.
144 * Usage: matrix[grmode+1][rqmode+1]
145 */
146
147static const int __quecvt_compat_matrix[8][8] = {
148 /* UN NL CR CW PR PW EX PD */
149 {0, 0, 0, 0, 0, 0, 0, 0}, /* UN */
150 {0, 0, 1, 1, 1, 1, 1, 0}, /* NL */
151 {0, 0, 0, 1, 1, 1, 1, 0}, /* CR */
152 {0, 0, 0, 0, 1, 1, 1, 0}, /* CW */
153 {0, 0, 0, 1, 0, 1, 1, 0}, /* PR */
154 {0, 0, 0, 0, 0, 0, 1, 0}, /* PW */
155 {0, 0, 0, 0, 0, 0, 0, 0}, /* EX */
156 {0, 0, 0, 0, 0, 0, 0, 0} /* PD */
157};
158
159void dlm_print_lkb(struct dlm_lkb *lkb)
160{
161 printk(KERN_ERR "lkb: nodeid %d id %x remid %x exflags %x flags %x\n"
162 " status %d rqmode %d grmode %d wait_type %d ast_type %d\n",
163 lkb->lkb_nodeid, lkb->lkb_id, lkb->lkb_remid, lkb->lkb_exflags,
164 lkb->lkb_flags, lkb->lkb_status, lkb->lkb_rqmode,
165 lkb->lkb_grmode, lkb->lkb_wait_type, lkb->lkb_ast_type);
166}
167
168void dlm_print_rsb(struct dlm_rsb *r)
169{
170 printk(KERN_ERR "rsb: nodeid %d flags %lx first %x rlc %d name %s\n",
171 r->res_nodeid, r->res_flags, r->res_first_lkid,
172 r->res_recover_locks_count, r->res_name);
173}
174
175/* Threads cannot use the lockspace while it's being recovered */
176
177static inline void lock_recovery(struct dlm_ls *ls)
178{
179 down_read(&ls->ls_in_recovery);
180}
181
182static inline void unlock_recovery(struct dlm_ls *ls)
183{
184 up_read(&ls->ls_in_recovery);
185}
186
187static inline int lock_recovery_try(struct dlm_ls *ls)
188{
189 return down_read_trylock(&ls->ls_in_recovery);
190}
191
192static inline int can_be_queued(struct dlm_lkb *lkb)
193{
194 return !(lkb->lkb_exflags & DLM_LKF_NOQUEUE);
195}
196
197static inline int force_blocking_asts(struct dlm_lkb *lkb)
198{
199 return (lkb->lkb_exflags & DLM_LKF_NOQUEUEBAST);
200}
201
202static inline int is_demoted(struct dlm_lkb *lkb)
203{
204 return (lkb->lkb_sbflags & DLM_SBF_DEMOTED);
205}
206
207static inline int is_remote(struct dlm_rsb *r)
208{
209 DLM_ASSERT(r->res_nodeid >= 0, dlm_print_rsb(r););
210 return !!r->res_nodeid;
211}
212
213static inline int is_process_copy(struct dlm_lkb *lkb)
214{
215 return (lkb->lkb_nodeid && !(lkb->lkb_flags & DLM_IFL_MSTCPY));
216}
217
218static inline int is_master_copy(struct dlm_lkb *lkb)
219{
220 if (lkb->lkb_flags & DLM_IFL_MSTCPY)
221 DLM_ASSERT(lkb->lkb_nodeid, dlm_print_lkb(lkb););
222 return (lkb->lkb_flags & DLM_IFL_MSTCPY) ? 1 : 0;
223}
224
225static inline int middle_conversion(struct dlm_lkb *lkb)
226{
227 if ((lkb->lkb_grmode==DLM_LOCK_PR && lkb->lkb_rqmode==DLM_LOCK_CW) ||
228 (lkb->lkb_rqmode==DLM_LOCK_PR && lkb->lkb_grmode==DLM_LOCK_CW))
229 return 1;
230 return 0;
231}
232
233static inline int down_conversion(struct dlm_lkb *lkb)
234{
235 return (!middle_conversion(lkb) && lkb->lkb_rqmode < lkb->lkb_grmode);
236}
237
238static void queue_cast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
239{
240 if (is_master_copy(lkb))
241 return;
242
243 DLM_ASSERT(lkb->lkb_lksb, dlm_print_lkb(lkb););
244
245 lkb->lkb_lksb->sb_status = rv;
246 lkb->lkb_lksb->sb_flags = lkb->lkb_sbflags;
247
248 dlm_add_ast(lkb, AST_COMP);
249}
250
251static void queue_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rqmode)
252{
253 if (is_master_copy(lkb))
254 send_bast(r, lkb, rqmode);
255 else {
256 lkb->lkb_bastmode = rqmode;
257 dlm_add_ast(lkb, AST_BAST);
258 }
259}
260
261/*
262 * Basic operations on rsb's and lkb's
263 */
264
265static struct dlm_rsb *create_rsb(struct dlm_ls *ls, char *name, int len)
266{
267 struct dlm_rsb *r;
268
269 r = allocate_rsb(ls, len);
270 if (!r)
271 return NULL;
272
273 r->res_ls = ls;
274 r->res_length = len;
275 memcpy(r->res_name, name, len);
276 mutex_init(&r->res_mutex);
277
278 INIT_LIST_HEAD(&r->res_lookup);
279 INIT_LIST_HEAD(&r->res_grantqueue);
280 INIT_LIST_HEAD(&r->res_convertqueue);
281 INIT_LIST_HEAD(&r->res_waitqueue);
282 INIT_LIST_HEAD(&r->res_root_list);
283 INIT_LIST_HEAD(&r->res_recover_list);
284
285 return r;
286}
287
288static int search_rsb_list(struct list_head *head, char *name, int len,
289 unsigned int flags, struct dlm_rsb **r_ret)
290{
291 struct dlm_rsb *r;
292 int error = 0;
293
294 list_for_each_entry(r, head, res_hashchain) {
295 if (len == r->res_length && !memcmp(name, r->res_name, len))
296 goto found;
297 }
298 return -EBADR;
299
300 found:
301 if (r->res_nodeid && (flags & R_MASTER))
302 error = -ENOTBLK;
303 *r_ret = r;
304 return error;
305}
306
307static int _search_rsb(struct dlm_ls *ls, char *name, int len, int b,
308 unsigned int flags, struct dlm_rsb **r_ret)
309{
310 struct dlm_rsb *r;
311 int error;
312
313 error = search_rsb_list(&ls->ls_rsbtbl[b].list, name, len, flags, &r);
314 if (!error) {
315 kref_get(&r->res_ref);
316 goto out;
317 }
318 error = search_rsb_list(&ls->ls_rsbtbl[b].toss, name, len, flags, &r);
319 if (error)
320 goto out;
321
322 list_move(&r->res_hashchain, &ls->ls_rsbtbl[b].list);
323
324 if (dlm_no_directory(ls))
325 goto out;
326
327 if (r->res_nodeid == -1) {
328 rsb_clear_flag(r, RSB_MASTER_UNCERTAIN);
329 r->res_first_lkid = 0;
330 } else if (r->res_nodeid > 0) {
331 rsb_set_flag(r, RSB_MASTER_UNCERTAIN);
332 r->res_first_lkid = 0;
333 } else {
334 DLM_ASSERT(r->res_nodeid == 0, dlm_print_rsb(r););
335 DLM_ASSERT(!rsb_flag(r, RSB_MASTER_UNCERTAIN),);
336 }
337 out:
338 *r_ret = r;
339 return error;
340}
341
342static int search_rsb(struct dlm_ls *ls, char *name, int len, int b,
343 unsigned int flags, struct dlm_rsb **r_ret)
344{
345 int error;
346 write_lock(&ls->ls_rsbtbl[b].lock);
347 error = _search_rsb(ls, name, len, b, flags, r_ret);
348 write_unlock(&ls->ls_rsbtbl[b].lock);
349 return error;
350}
351
352/*
353 * Find rsb in rsbtbl and potentially create/add one
354 *
355 * Delaying the release of rsb's has a similar benefit to applications keeping
356 * NL locks on an rsb, but without the guarantee that the cached master value
357 * will still be valid when the rsb is reused. Apps aren't always smart enough
358 * to keep NL locks on an rsb that they may lock again shortly; this can lead
359 * to excessive master lookups and removals if we don't delay the release.
360 *
361 * Searching for an rsb means looking through both the normal list and toss
362 * list. When found on the toss list the rsb is moved to the normal list with
363 * ref count of 1; when found on normal list the ref count is incremented.
364 */
365
366static int find_rsb(struct dlm_ls *ls, char *name, int namelen,
367 unsigned int flags, struct dlm_rsb **r_ret)
368{
369 struct dlm_rsb *r, *tmp;
370 uint32_t hash, bucket;
371 int error = 0;
372
373 if (dlm_no_directory(ls))
374 flags |= R_CREATE;
375
376 hash = jhash(name, namelen, 0);
377 bucket = hash & (ls->ls_rsbtbl_size - 1);
378
379 error = search_rsb(ls, name, namelen, bucket, flags, &r);
380 if (!error)
381 goto out;
382
383 if (error == -EBADR && !(flags & R_CREATE))
384 goto out;
385
386 /* the rsb was found but wasn't a master copy */
387 if (error == -ENOTBLK)
388 goto out;
389
390 error = -ENOMEM;
391 r = create_rsb(ls, name, namelen);
392 if (!r)
393 goto out;
394
395 r->res_hash = hash;
396 r->res_bucket = bucket;
397 r->res_nodeid = -1;
398 kref_init(&r->res_ref);
399
400 /* With no directory, the master can be set immediately */
401 if (dlm_no_directory(ls)) {
402 int nodeid = dlm_dir_nodeid(r);
403 if (nodeid == dlm_our_nodeid())
404 nodeid = 0;
405 r->res_nodeid = nodeid;
406 }
407
408 write_lock(&ls->ls_rsbtbl[bucket].lock);
409 error = _search_rsb(ls, name, namelen, bucket, 0, &tmp);
410 if (!error) {
411 write_unlock(&ls->ls_rsbtbl[bucket].lock);
412 free_rsb(r);
413 r = tmp;
414 goto out;
415 }
416 list_add(&r->res_hashchain, &ls->ls_rsbtbl[bucket].list);
417 write_unlock(&ls->ls_rsbtbl[bucket].lock);
418 error = 0;
419 out:
420 *r_ret = r;
421 return error;
422}
423
424int dlm_find_rsb(struct dlm_ls *ls, char *name, int namelen,
425 unsigned int flags, struct dlm_rsb **r_ret)
426{
427 return find_rsb(ls, name, namelen, flags, r_ret);
428}
429
430/* This is only called to add a reference when the code already holds
431 a valid reference to the rsb, so there's no need for locking. */
432
433static inline void hold_rsb(struct dlm_rsb *r)
434{
435 kref_get(&r->res_ref);
436}
437
438void dlm_hold_rsb(struct dlm_rsb *r)
439{
440 hold_rsb(r);
441}
442
443static void toss_rsb(struct kref *kref)
444{
445 struct dlm_rsb *r = container_of(kref, struct dlm_rsb, res_ref);
446 struct dlm_ls *ls = r->res_ls;
447
448 DLM_ASSERT(list_empty(&r->res_root_list), dlm_print_rsb(r););
449 kref_init(&r->res_ref);
450 list_move(&r->res_hashchain, &ls->ls_rsbtbl[r->res_bucket].toss);
451 r->res_toss_time = jiffies;
452 if (r->res_lvbptr) {
453 free_lvb(r->res_lvbptr);
454 r->res_lvbptr = NULL;
455 }
456}
457
458/* When all references to the rsb are gone it's transferred to
459 the tossed list for later disposal. */
460
461static void put_rsb(struct dlm_rsb *r)
462{
463 struct dlm_ls *ls = r->res_ls;
464 uint32_t bucket = r->res_bucket;
465
466 write_lock(&ls->ls_rsbtbl[bucket].lock);
467 kref_put(&r->res_ref, toss_rsb);
468 write_unlock(&ls->ls_rsbtbl[bucket].lock);
469}
470
471void dlm_put_rsb(struct dlm_rsb *r)
472{
473 put_rsb(r);
474}
475
476/* See comment for unhold_lkb */
477
478static void unhold_rsb(struct dlm_rsb *r)
479{
480 int rv;
481 rv = kref_put(&r->res_ref, toss_rsb);
482 DLM_ASSERT(!rv, dlm_print_rsb(r););
483}
484
485static void kill_rsb(struct kref *kref)
486{
487 struct dlm_rsb *r = container_of(kref, struct dlm_rsb, res_ref);
488
489 /* All work is done after the return from kref_put() so we
490 can release the write_lock before the remove and free. */
491
492 DLM_ASSERT(list_empty(&r->res_lookup),);
493 DLM_ASSERT(list_empty(&r->res_grantqueue),);
494 DLM_ASSERT(list_empty(&r->res_convertqueue),);
495 DLM_ASSERT(list_empty(&r->res_waitqueue),);
496 DLM_ASSERT(list_empty(&r->res_root_list),);
497 DLM_ASSERT(list_empty(&r->res_recover_list),);
498}
499
500/* Attaching/detaching lkb's from rsb's is for rsb reference counting.
501 The rsb must exist as long as any lkb's for it do. */
502
503static void attach_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb)
504{
505 hold_rsb(r);
506 lkb->lkb_resource = r;
507}
508
509static void detach_lkb(struct dlm_lkb *lkb)
510{
511 if (lkb->lkb_resource) {
512 put_rsb(lkb->lkb_resource);
513 lkb->lkb_resource = NULL;
514 }
515}
516
517static int create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret)
518{
519 struct dlm_lkb *lkb, *tmp;
520 uint32_t lkid = 0;
521 uint16_t bucket;
522
523 lkb = allocate_lkb(ls);
524 if (!lkb)
525 return -ENOMEM;
526
527 lkb->lkb_nodeid = -1;
528 lkb->lkb_grmode = DLM_LOCK_IV;
529 kref_init(&lkb->lkb_ref);
530
531 get_random_bytes(&bucket, sizeof(bucket));
532 bucket &= (ls->ls_lkbtbl_size - 1);
533
534 write_lock(&ls->ls_lkbtbl[bucket].lock);
535
536 /* counter can roll over so we must verify lkid is not in use */
537
538 while (lkid == 0) {
539 lkid = bucket | (ls->ls_lkbtbl[bucket].counter++ << 16);
540
541 list_for_each_entry(tmp, &ls->ls_lkbtbl[bucket].list,
542 lkb_idtbl_list) {
543 if (tmp->lkb_id != lkid)
544 continue;
545 lkid = 0;
546 break;
547 }
548 }
549
550 lkb->lkb_id = lkid;
551 list_add(&lkb->lkb_idtbl_list, &ls->ls_lkbtbl[bucket].list);
552 write_unlock(&ls->ls_lkbtbl[bucket].lock);
553
554 *lkb_ret = lkb;
555 return 0;
556}
557
558static struct dlm_lkb *__find_lkb(struct dlm_ls *ls, uint32_t lkid)
559{
560 uint16_t bucket = lkid & 0xFFFF;
561 struct dlm_lkb *lkb;
562
563 list_for_each_entry(lkb, &ls->ls_lkbtbl[bucket].list, lkb_idtbl_list) {
564 if (lkb->lkb_id == lkid)
565 return lkb;
566 }
567 return NULL;
568}
569
570static int find_lkb(struct dlm_ls *ls, uint32_t lkid, struct dlm_lkb **lkb_ret)
571{
572 struct dlm_lkb *lkb;
573 uint16_t bucket = lkid & 0xFFFF;
574
575 if (bucket >= ls->ls_lkbtbl_size)
576 return -EBADSLT;
577
578 read_lock(&ls->ls_lkbtbl[bucket].lock);
579 lkb = __find_lkb(ls, lkid);
580 if (lkb)
581 kref_get(&lkb->lkb_ref);
582 read_unlock(&ls->ls_lkbtbl[bucket].lock);
583
584 *lkb_ret = lkb;
585 return lkb ? 0 : -ENOENT;
586}
587
588static void kill_lkb(struct kref *kref)
589{
590 struct dlm_lkb *lkb = container_of(kref, struct dlm_lkb, lkb_ref);
591
592 /* All work is done after the return from kref_put() so we
593 can release the write_lock before the detach_lkb */
594
595 DLM_ASSERT(!lkb->lkb_status, dlm_print_lkb(lkb););
596}
597
598/* __put_lkb() is used when an lkb may not have an rsb attached to
599 it so we need to provide the lockspace explicitly */
600
601static int __put_lkb(struct dlm_ls *ls, struct dlm_lkb *lkb)
602{
603 uint16_t bucket = lkb->lkb_id & 0xFFFF;
604
605 write_lock(&ls->ls_lkbtbl[bucket].lock);
606 if (kref_put(&lkb->lkb_ref, kill_lkb)) {
607 list_del(&lkb->lkb_idtbl_list);
608 write_unlock(&ls->ls_lkbtbl[bucket].lock);
609
610 detach_lkb(lkb);
611
612 /* for local/process lkbs, lvbptr points to caller's lksb */
613 if (lkb->lkb_lvbptr && is_master_copy(lkb))
614 free_lvb(lkb->lkb_lvbptr);
615 free_lkb(lkb);
616 return 1;
617 } else {
618 write_unlock(&ls->ls_lkbtbl[bucket].lock);
619 return 0;
620 }
621}
622
623int dlm_put_lkb(struct dlm_lkb *lkb)
624{
625 struct dlm_ls *ls;
626
627 DLM_ASSERT(lkb->lkb_resource, dlm_print_lkb(lkb););
628 DLM_ASSERT(lkb->lkb_resource->res_ls, dlm_print_lkb(lkb););
629
630 ls = lkb->lkb_resource->res_ls;
631 return __put_lkb(ls, lkb);
632}
633
634/* This is only called to add a reference when the code already holds
635 a valid reference to the lkb, so there's no need for locking. */
636
637static inline void hold_lkb(struct dlm_lkb *lkb)
638{
639 kref_get(&lkb->lkb_ref);
640}
641
642/* This is called when we need to remove a reference and are certain
643 it's not the last ref. e.g. del_lkb is always called between a
644 find_lkb/put_lkb and is always the inverse of a previous add_lkb.
645 put_lkb would work fine, but would involve unnecessary locking */
646
647static inline void unhold_lkb(struct dlm_lkb *lkb)
648{
649 int rv;
650 rv = kref_put(&lkb->lkb_ref, kill_lkb);
651 DLM_ASSERT(!rv, dlm_print_lkb(lkb););
652}
653
654static void lkb_add_ordered(struct list_head *new, struct list_head *head,
655 int mode)
656{
657 struct dlm_lkb *lkb = NULL;
658
659 list_for_each_entry(lkb, head, lkb_statequeue)
660 if (lkb->lkb_rqmode < mode)
661 break;
662
663 if (!lkb)
664 list_add_tail(new, head);
665 else
666 __list_add(new, lkb->lkb_statequeue.prev, &lkb->lkb_statequeue);
667}
668
669/* add/remove lkb to rsb's grant/convert/wait queue */
670
671static void add_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb, int status)
672{
673 kref_get(&lkb->lkb_ref);
674
675 DLM_ASSERT(!lkb->lkb_status, dlm_print_lkb(lkb););
676
677 lkb->lkb_status = status;
678
679 switch (status) {
680 case DLM_LKSTS_WAITING:
681 if (lkb->lkb_exflags & DLM_LKF_HEADQUE)
682 list_add(&lkb->lkb_statequeue, &r->res_waitqueue);
683 else
684 list_add_tail(&lkb->lkb_statequeue, &r->res_waitqueue);
685 break;
686 case DLM_LKSTS_GRANTED:
687 /* convention says granted locks kept in order of grmode */
688 lkb_add_ordered(&lkb->lkb_statequeue, &r->res_grantqueue,
689 lkb->lkb_grmode);
690 break;
691 case DLM_LKSTS_CONVERT:
692 if (lkb->lkb_exflags & DLM_LKF_HEADQUE)
693 list_add(&lkb->lkb_statequeue, &r->res_convertqueue);
694 else
695 list_add_tail(&lkb->lkb_statequeue,
696 &r->res_convertqueue);
697 break;
698 default:
699 DLM_ASSERT(0, dlm_print_lkb(lkb); printk("sts=%d\n", status););
700 }
701}
702
703static void del_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb)
704{
705 lkb->lkb_status = 0;
706 list_del(&lkb->lkb_statequeue);
707 unhold_lkb(lkb);
708}
709
710static void move_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb, int sts)
711{
712 hold_lkb(lkb);
713 del_lkb(r, lkb);
714 add_lkb(r, lkb, sts);
715 unhold_lkb(lkb);
716}
717
718/* add/remove lkb from global waiters list of lkb's waiting for
719 a reply from a remote node */
720
721static void add_to_waiters(struct dlm_lkb *lkb, int mstype)
722{
723 struct dlm_ls *ls = lkb->lkb_resource->res_ls;
724
725 mutex_lock(&ls->ls_waiters_mutex);
726 if (lkb->lkb_wait_type) {
727 log_print("add_to_waiters error %d", lkb->lkb_wait_type);
728 goto out;
729 }
730 lkb->lkb_wait_type = mstype;
731 kref_get(&lkb->lkb_ref);
732 list_add(&lkb->lkb_wait_reply, &ls->ls_waiters);
733 out:
734 mutex_unlock(&ls->ls_waiters_mutex);
735}
736
737static int _remove_from_waiters(struct dlm_lkb *lkb)
738{
739 int error = 0;
740
741 if (!lkb->lkb_wait_type) {
742 log_print("remove_from_waiters error");
743 error = -EINVAL;
744 goto out;
745 }
746 lkb->lkb_wait_type = 0;
747 list_del(&lkb->lkb_wait_reply);
748 unhold_lkb(lkb);
749 out:
750 return error;
751}
752
753static int remove_from_waiters(struct dlm_lkb *lkb)
754{
755 struct dlm_ls *ls = lkb->lkb_resource->res_ls;
756 int error;
757
758 mutex_lock(&ls->ls_waiters_mutex);
759 error = _remove_from_waiters(lkb);
760 mutex_unlock(&ls->ls_waiters_mutex);
761 return error;
762}
763
764static void dir_remove(struct dlm_rsb *r)
765{
766 int to_nodeid;
767
768 if (dlm_no_directory(r->res_ls))
769 return;
770
771 to_nodeid = dlm_dir_nodeid(r);
772 if (to_nodeid != dlm_our_nodeid())
773 send_remove(r);
774 else
775 dlm_dir_remove_entry(r->res_ls, to_nodeid,
776 r->res_name, r->res_length);
777}
778
779/* FIXME: shouldn't this be able to exit as soon as one non-due rsb is
780 found since they are in order of newest to oldest? */
781
782static int shrink_bucket(struct dlm_ls *ls, int b)
783{
784 struct dlm_rsb *r;
785 int count = 0, found;
786
787 for (;;) {
788 found = 0;
789 write_lock(&ls->ls_rsbtbl[b].lock);
790 list_for_each_entry_reverse(r, &ls->ls_rsbtbl[b].toss,
791 res_hashchain) {
792 if (!time_after_eq(jiffies, r->res_toss_time +
793 dlm_config.toss_secs * HZ))
794 continue;
795 found = 1;
796 break;
797 }
798
799 if (!found) {
800 write_unlock(&ls->ls_rsbtbl[b].lock);
801 break;
802 }
803
804 if (kref_put(&r->res_ref, kill_rsb)) {
805 list_del(&r->res_hashchain);
806 write_unlock(&ls->ls_rsbtbl[b].lock);
807
808 if (is_master(r))
809 dir_remove(r);
810 free_rsb(r);
811 count++;
812 } else {
813 write_unlock(&ls->ls_rsbtbl[b].lock);
814 log_error(ls, "tossed rsb in use %s", r->res_name);
815 }
816 }
817
818 return count;
819}
820
821void dlm_scan_rsbs(struct dlm_ls *ls)
822{
823 int i;
824
825 if (dlm_locking_stopped(ls))
826 return;
827
828 for (i = 0; i < ls->ls_rsbtbl_size; i++) {
829 shrink_bucket(ls, i);
830 cond_resched();
831 }
832}
833
834/* lkb is master or local copy */
835
836static void set_lvb_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
837{
838 int b, len = r->res_ls->ls_lvblen;
839
840 /* b=1 lvb returned to caller
841 b=0 lvb written to rsb or invalidated
842 b=-1 do nothing */
843
844 b = dlm_lvb_operations[lkb->lkb_grmode + 1][lkb->lkb_rqmode + 1];
845
846 if (b == 1) {
847 if (!lkb->lkb_lvbptr)
848 return;
849
850 if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
851 return;
852
853 if (!r->res_lvbptr)
854 return;
855
856 memcpy(lkb->lkb_lvbptr, r->res_lvbptr, len);
857 lkb->lkb_lvbseq = r->res_lvbseq;
858
859 } else if (b == 0) {
860 if (lkb->lkb_exflags & DLM_LKF_IVVALBLK) {
861 rsb_set_flag(r, RSB_VALNOTVALID);
862 return;
863 }
864
865 if (!lkb->lkb_lvbptr)
866 return;
867
868 if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
869 return;
870
871 if (!r->res_lvbptr)
872 r->res_lvbptr = allocate_lvb(r->res_ls);
873
874 if (!r->res_lvbptr)
875 return;
876
877 memcpy(r->res_lvbptr, lkb->lkb_lvbptr, len);
878 r->res_lvbseq++;
879 lkb->lkb_lvbseq = r->res_lvbseq;
880 rsb_clear_flag(r, RSB_VALNOTVALID);
881 }
882
883 if (rsb_flag(r, RSB_VALNOTVALID))
884 lkb->lkb_sbflags |= DLM_SBF_VALNOTVALID;
885}
886
887static void set_lvb_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb)
888{
889 if (lkb->lkb_grmode < DLM_LOCK_PW)
890 return;
891
892 if (lkb->lkb_exflags & DLM_LKF_IVVALBLK) {
893 rsb_set_flag(r, RSB_VALNOTVALID);
894 return;
895 }
896
897 if (!lkb->lkb_lvbptr)
898 return;
899
900 if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
901 return;
902
903 if (!r->res_lvbptr)
904 r->res_lvbptr = allocate_lvb(r->res_ls);
905
906 if (!r->res_lvbptr)
907 return;
908
909 memcpy(r->res_lvbptr, lkb->lkb_lvbptr, r->res_ls->ls_lvblen);
910 r->res_lvbseq++;
911 rsb_clear_flag(r, RSB_VALNOTVALID);
912}
913
914/* lkb is process copy (pc) */
915
916static void set_lvb_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb,
917 struct dlm_message *ms)
918{
919 int b;
920
921 if (!lkb->lkb_lvbptr)
922 return;
923
924 if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
925 return;
926
927 b = dlm_lvb_operations[lkb->lkb_grmode + 1][lkb->lkb_rqmode + 1];
928 if (b == 1) {
929 int len = receive_extralen(ms);
930 memcpy(lkb->lkb_lvbptr, ms->m_extra, len);
931 lkb->lkb_lvbseq = ms->m_lvbseq;
932 }
933}
934
935/* Manipulate lkb's on rsb's convert/granted/waiting queues
936 remove_lock -- used for unlock, removes lkb from granted
937 revert_lock -- used for cancel, moves lkb from convert to granted
938 grant_lock -- used for request and convert, adds lkb to granted or
939 moves lkb from convert or waiting to granted
940
941 Each of these is used for master or local copy lkb's. There is
942 also a _pc() variation used to make the corresponding change on
943 a process copy (pc) lkb. */
944
945static void _remove_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
946{
947 del_lkb(r, lkb);
948 lkb->lkb_grmode = DLM_LOCK_IV;
949 /* this unhold undoes the original ref from create_lkb()
950 so this leads to the lkb being freed */
951 unhold_lkb(lkb);
952}
953
954static void remove_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
955{
956 set_lvb_unlock(r, lkb);
957 _remove_lock(r, lkb);
958}
959
960static void remove_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb)
961{
962 _remove_lock(r, lkb);
963}
964
965static void revert_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
966{
967 lkb->lkb_rqmode = DLM_LOCK_IV;
968
969 switch (lkb->lkb_status) {
970 case DLM_LKSTS_GRANTED:
971 break;
972 case DLM_LKSTS_CONVERT:
973 move_lkb(r, lkb, DLM_LKSTS_GRANTED);
974 break;
975 case DLM_LKSTS_WAITING:
976 del_lkb(r, lkb);
977 lkb->lkb_grmode = DLM_LOCK_IV;
978 /* this unhold undoes the original ref from create_lkb()
979 so this leads to the lkb being freed */
980 unhold_lkb(lkb);
981 break;
982 default:
983 log_print("invalid status for revert %d", lkb->lkb_status);
984 }
985}
986
987static void revert_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb)
988{
989 revert_lock(r, lkb);
990}
991
992static void _grant_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
993{
994 if (lkb->lkb_grmode != lkb->lkb_rqmode) {
995 lkb->lkb_grmode = lkb->lkb_rqmode;
996 if (lkb->lkb_status)
997 move_lkb(r, lkb, DLM_LKSTS_GRANTED);
998 else
999 add_lkb(r, lkb, DLM_LKSTS_GRANTED);
1000 }
1001
1002 lkb->lkb_rqmode = DLM_LOCK_IV;
1003}
1004
1005static void grant_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
1006{
1007 set_lvb_lock(r, lkb);
1008 _grant_lock(r, lkb);
1009 lkb->lkb_highbast = 0;
1010}
1011
1012static void grant_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb,
1013 struct dlm_message *ms)
1014{
1015 set_lvb_lock_pc(r, lkb, ms);
1016 _grant_lock(r, lkb);
1017}
1018
1019/* called by grant_pending_locks() which means an async grant message must
1020 be sent to the requesting node in addition to granting the lock if the
1021 lkb belongs to a remote node. */
1022
1023static void grant_lock_pending(struct dlm_rsb *r, struct dlm_lkb *lkb)
1024{
1025 grant_lock(r, lkb);
1026 if (is_master_copy(lkb))
1027 send_grant(r, lkb);
1028 else
1029 queue_cast(r, lkb, 0);
1030}
1031
1032static inline int first_in_list(struct dlm_lkb *lkb, struct list_head *head)
1033{
1034 struct dlm_lkb *first = list_entry(head->next, struct dlm_lkb,
1035 lkb_statequeue);
1036 if (lkb->lkb_id == first->lkb_id)
1037 return 1;
1038
1039 return 0;
1040}
1041
1042/* Check if the given lkb conflicts with another lkb on the queue. */
1043
1044static int queue_conflict(struct list_head *head, struct dlm_lkb *lkb)
1045{
1046 struct dlm_lkb *this;
1047
1048 list_for_each_entry(this, head, lkb_statequeue) {
1049 if (this == lkb)
1050 continue;
1051 if (!modes_compat(this, lkb))
1052 return 1;
1053 }
1054 return 0;
1055}
1056
1057/*
1058 * "A conversion deadlock arises with a pair of lock requests in the converting
1059 * queue for one resource. The granted mode of each lock blocks the requested
1060 * mode of the other lock."
1061 *
1062 * Part 2: if the granted mode of lkb is preventing the first lkb in the
1063 * convert queue from being granted, then demote lkb (set grmode to NL).
1064 * This second form requires that we check for conv-deadlk even when
1065 * now == 0 in _can_be_granted().
1066 *
1067 * Example:
1068 * Granted Queue: empty
1069 * Convert Queue: NL->EX (first lock)
1070 * PR->EX (second lock)
1071 *
1072 * The first lock can't be granted because of the granted mode of the second
1073 * lock and the second lock can't be granted because it's not first in the
1074 * list. We demote the granted mode of the second lock (the lkb passed to this
1075 * function).
1076 *
1077 * After the resolution, the "grant pending" function needs to go back and try
1078 * to grant locks on the convert queue again since the first lock can now be
1079 * granted.
1080 */
1081
1082static int conversion_deadlock_detect(struct dlm_rsb *rsb, struct dlm_lkb *lkb)
1083{
1084 struct dlm_lkb *this, *first = NULL, *self = NULL;
1085
1086 list_for_each_entry(this, &rsb->res_convertqueue, lkb_statequeue) {
1087 if (!first)
1088 first = this;
1089 if (this == lkb) {
1090 self = lkb;
1091 continue;
1092 }
1093
1094 if (!modes_compat(this, lkb) && !modes_compat(lkb, this))
1095 return 1;
1096 }
1097
1098 /* if lkb is on the convert queue and is preventing the first
1099 from being granted, then there's deadlock and we demote lkb.
1100 multiple converting locks may need to do this before the first
1101 converting lock can be granted. */
1102
1103 if (self && self != first) {
1104 if (!modes_compat(lkb, first) &&
1105 !queue_conflict(&rsb->res_grantqueue, first))
1106 return 1;
1107 }
1108
1109 return 0;
1110}
1111
1112/*
1113 * Return 1 if the lock can be granted, 0 otherwise.
1114 * Also detect and resolve conversion deadlocks.
1115 *
1116 * lkb is the lock to be granted
1117 *
1118 * now is 1 if the function is being called in the context of the
1119 * immediate request, it is 0 if called later, after the lock has been
1120 * queued.
1121 *
1122 * References are from chapter 6 of "VAXcluster Principles" by Roy Davis
1123 */
1124
1125static int _can_be_granted(struct dlm_rsb *r, struct dlm_lkb *lkb, int now)
1126{
1127 int8_t conv = (lkb->lkb_grmode != DLM_LOCK_IV);
1128
1129 /*
1130 * 6-10: Version 5.4 introduced an option to address the phenomenon of
1131 * a new request for a NL mode lock being blocked.
1132 *
1133 * 6-11: If the optional EXPEDITE flag is used with the new NL mode
1134 * request, then it would be granted. In essence, the use of this flag
1135 * tells the Lock Manager to expedite this request by not considering
1136 * what may be in the CONVERTING or WAITING queues... As of this
1137 * writing, the EXPEDITE flag can be used only with new requests for NL
1138 * mode locks. This flag is not valid for conversion requests.
1139 *
1140 * A shortcut. Earlier checks return an error if EXPEDITE is used in a
1141 * conversion or used with a non-NL requested mode. We also know an
1142 * EXPEDITE request is always granted immediately, so now must always
1143 * be 1. The full condition to grant an expedite request: (now &&
1144 * !conv && lkb->rqmode == DLM_LOCK_NL && (flags & EXPEDITE)) can
1145 * therefore be shortened to just checking the flag.
1146 */
1147
1148 if (lkb->lkb_exflags & DLM_LKF_EXPEDITE)
1149 return 1;
1150
1151 /*
1152 * A shortcut. Without this, !queue_conflict(grantqueue, lkb) would be
1153 * added to the remaining conditions.
1154 */
1155
1156 if (queue_conflict(&r->res_grantqueue, lkb))
1157 goto out;
1158
1159 /*
1160 * 6-3: By default, a conversion request is immediately granted if the
1161 * requested mode is compatible with the modes of all other granted
1162 * locks
1163 */
1164
1165 if (queue_conflict(&r->res_convertqueue, lkb))
1166 goto out;
1167
1168 /*
1169 * 6-5: But the default algorithm for deciding whether to grant or
1170 * queue conversion requests does not by itself guarantee that such
1171 * requests are serviced on a "first come first serve" basis. This, in
1172 * turn, can lead to a phenomenon known as "indefinite postponement".
1173 *
1174 * 6-7: This issue is dealt with by using the optional QUECVT flag with
1175 * the system service employed to request a lock conversion. This flag
1176 * forces certain conversion requests to be queued, even if they are
1177 * compatible with the granted modes of other locks on the same
1178 * resource. Thus, the use of this flag results in conversion requests
1179 * being ordered on a "first come first serve" basis.
1180 *
1181 * DCT: This condition is all about new conversions being able to occur
1182 * "in place" while the lock remains on the granted queue (assuming
1183 * nothing else conflicts.) IOW if QUECVT isn't set, a conversion
1184 * doesn't _have_ to go onto the convert queue where it's processed in
1185 * order. The "now" variable is necessary to distinguish converts
1186 * being received and processed for the first time now, because once a
1187 * convert is moved to the conversion queue the condition below applies
1188 * requiring fifo granting.
1189 */
1190
1191 if (now && conv && !(lkb->lkb_exflags & DLM_LKF_QUECVT))
1192 return 1;
1193
1194 /*
1195 * The NOORDER flag is set to avoid the standard vms rules on grant
1196 * order.
1197 */
1198
1199 if (lkb->lkb_exflags & DLM_LKF_NOORDER)
1200 return 1;
1201
1202 /*
1203 * 6-3: Once in that queue [CONVERTING], a conversion request cannot be
1204 * granted until all other conversion requests ahead of it are granted
1205 * and/or canceled.
1206 */
1207
1208 if (!now && conv && first_in_list(lkb, &r->res_convertqueue))
1209 return 1;
1210
1211 /*
1212 * 6-4: By default, a new request is immediately granted only if all
1213 * three of the following conditions are satisfied when the request is
1214 * issued:
1215 * - The queue of ungranted conversion requests for the resource is
1216 * empty.
1217 * - The queue of ungranted new requests for the resource is empty.
1218 * - The mode of the new request is compatible with the most
1219 * restrictive mode of all granted locks on the resource.
1220 */
1221
1222 if (now && !conv && list_empty(&r->res_convertqueue) &&
1223 list_empty(&r->res_waitqueue))
1224 return 1;
1225
1226 /*
1227 * 6-4: Once a lock request is in the queue of ungranted new requests,
1228 * it cannot be granted until the queue of ungranted conversion
1229 * requests is empty, all ungranted new requests ahead of it are
1230 * granted and/or canceled, and it is compatible with the granted mode
1231 * of the most restrictive lock granted on the resource.
1232 */
1233
1234 if (!now && !conv && list_empty(&r->res_convertqueue) &&
1235 first_in_list(lkb, &r->res_waitqueue))
1236 return 1;
1237
1238 out:
1239 /*
1240 * The following, enabled by CONVDEADLK, departs from VMS.
1241 */
1242
1243 if (conv && (lkb->lkb_exflags & DLM_LKF_CONVDEADLK) &&
1244 conversion_deadlock_detect(r, lkb)) {
1245 lkb->lkb_grmode = DLM_LOCK_NL;
1246 lkb->lkb_sbflags |= DLM_SBF_DEMOTED;
1247 }
1248
1249 return 0;
1250}
1251
1252/*
1253 * The ALTPR and ALTCW flags aren't traditional lock manager flags, but are a
1254 * simple way to provide a big optimization to applications that can use them.
1255 */
1256
1257static int can_be_granted(struct dlm_rsb *r, struct dlm_lkb *lkb, int now)
1258{
1259 uint32_t flags = lkb->lkb_exflags;
1260 int rv;
1261 int8_t alt = 0, rqmode = lkb->lkb_rqmode;
1262
1263 rv = _can_be_granted(r, lkb, now);
1264 if (rv)
1265 goto out;
1266
1267 if (lkb->lkb_sbflags & DLM_SBF_DEMOTED)
1268 goto out;
1269
1270 if (rqmode != DLM_LOCK_PR && flags & DLM_LKF_ALTPR)
1271 alt = DLM_LOCK_PR;
1272 else if (rqmode != DLM_LOCK_CW && flags & DLM_LKF_ALTCW)
1273 alt = DLM_LOCK_CW;
1274
1275 if (alt) {
1276 lkb->lkb_rqmode = alt;
1277 rv = _can_be_granted(r, lkb, now);
1278 if (rv)
1279 lkb->lkb_sbflags |= DLM_SBF_ALTMODE;
1280 else
1281 lkb->lkb_rqmode = rqmode;
1282 }
1283 out:
1284 return rv;
1285}
1286
1287static int grant_pending_convert(struct dlm_rsb *r, int high)
1288{
1289 struct dlm_lkb *lkb, *s;
1290 int hi, demoted, quit, grant_restart, demote_restart;
1291
1292 quit = 0;
1293 restart:
1294 grant_restart = 0;
1295 demote_restart = 0;
1296 hi = DLM_LOCK_IV;
1297
1298 list_for_each_entry_safe(lkb, s, &r->res_convertqueue, lkb_statequeue) {
1299 demoted = is_demoted(lkb);
1300 if (can_be_granted(r, lkb, 0)) {
1301 grant_lock_pending(r, lkb);
1302 grant_restart = 1;
1303 } else {
1304 hi = max_t(int, lkb->lkb_rqmode, hi);
1305 if (!demoted && is_demoted(lkb))
1306 demote_restart = 1;
1307 }
1308 }
1309
1310 if (grant_restart)
1311 goto restart;
1312 if (demote_restart && !quit) {
1313 quit = 1;
1314 goto restart;
1315 }
1316
1317 return max_t(int, high, hi);
1318}
1319
1320static int grant_pending_wait(struct dlm_rsb *r, int high)
1321{
1322 struct dlm_lkb *lkb, *s;
1323
1324 list_for_each_entry_safe(lkb, s, &r->res_waitqueue, lkb_statequeue) {
1325 if (can_be_granted(r, lkb, 0))
1326 grant_lock_pending(r, lkb);
1327 else
1328 high = max_t(int, lkb->lkb_rqmode, high);
1329 }
1330
1331 return high;
1332}
1333
1334static void grant_pending_locks(struct dlm_rsb *r)
1335{
1336 struct dlm_lkb *lkb, *s;
1337 int high = DLM_LOCK_IV;
1338
1339 DLM_ASSERT(is_master(r), dlm_print_rsb(r););
1340
1341 high = grant_pending_convert(r, high);
1342 high = grant_pending_wait(r, high);
1343
1344 if (high == DLM_LOCK_IV)
1345 return;
1346
1347 /*
1348 * If there are locks left on the wait/convert queue then send blocking
1349 * ASTs to granted locks based on the largest requested mode (high)
1350 * found above. FIXME: highbast < high comparison not valid for PR/CW.
1351 */
1352
1353 list_for_each_entry_safe(lkb, s, &r->res_grantqueue, lkb_statequeue) {
1354 if (lkb->lkb_bastaddr && (lkb->lkb_highbast < high) &&
1355 !__dlm_compat_matrix[lkb->lkb_grmode+1][high+1]) {
1356 queue_bast(r, lkb, high);
1357 lkb->lkb_highbast = high;
1358 }
1359 }
1360}
1361
1362static void send_bast_queue(struct dlm_rsb *r, struct list_head *head,
1363 struct dlm_lkb *lkb)
1364{
1365 struct dlm_lkb *gr;
1366
1367 list_for_each_entry(gr, head, lkb_statequeue) {
1368 if (gr->lkb_bastaddr &&
1369 gr->lkb_highbast < lkb->lkb_rqmode &&
1370 !modes_compat(gr, lkb)) {
1371 queue_bast(r, gr, lkb->lkb_rqmode);
1372 gr->lkb_highbast = lkb->lkb_rqmode;
1373 }
1374 }
1375}
1376
1377static void send_blocking_asts(struct dlm_rsb *r, struct dlm_lkb *lkb)
1378{
1379 send_bast_queue(r, &r->res_grantqueue, lkb);
1380}
1381
1382static void send_blocking_asts_all(struct dlm_rsb *r, struct dlm_lkb *lkb)
1383{
1384 send_bast_queue(r, &r->res_grantqueue, lkb);
1385 send_bast_queue(r, &r->res_convertqueue, lkb);
1386}
1387
1388/* set_master(r, lkb) -- set the master nodeid of a resource
1389
1390 The purpose of this function is to set the nodeid field in the given
1391 lkb using the nodeid field in the given rsb. If the rsb's nodeid is
1392 known, it can just be copied to the lkb and the function will return
1393 0. If the rsb's nodeid is _not_ known, it needs to be looked up
1394 before it can be copied to the lkb.
1395
1396 When the rsb nodeid is being looked up remotely, the initial lkb
1397 causing the lookup is kept on the ls_waiters list waiting for the
1398 lookup reply. Other lkb's waiting for the same rsb lookup are kept
1399 on the rsb's res_lookup list until the master is verified.
1400
1401 Return values:
1402 0: nodeid is set in rsb/lkb and the caller should go ahead and use it
1403 1: the rsb master is not available and the lkb has been placed on
1404 a wait queue
1405*/
1406
1407static int set_master(struct dlm_rsb *r, struct dlm_lkb *lkb)
1408{
1409 struct dlm_ls *ls = r->res_ls;
1410 int error, dir_nodeid, ret_nodeid, our_nodeid = dlm_our_nodeid();
1411
1412 if (rsb_flag(r, RSB_MASTER_UNCERTAIN)) {
1413 rsb_clear_flag(r, RSB_MASTER_UNCERTAIN);
1414 r->res_first_lkid = lkb->lkb_id;
1415 lkb->lkb_nodeid = r->res_nodeid;
1416 return 0;
1417 }
1418
1419 if (r->res_first_lkid && r->res_first_lkid != lkb->lkb_id) {
1420 list_add_tail(&lkb->lkb_rsb_lookup, &r->res_lookup);
1421 return 1;
1422 }
1423
1424 if (r->res_nodeid == 0) {
1425 lkb->lkb_nodeid = 0;
1426 return 0;
1427 }
1428
1429 if (r->res_nodeid > 0) {
1430 lkb->lkb_nodeid = r->res_nodeid;
1431 return 0;
1432 }
1433
1434 DLM_ASSERT(r->res_nodeid == -1, dlm_print_rsb(r););
1435
1436 dir_nodeid = dlm_dir_nodeid(r);
1437
1438 if (dir_nodeid != our_nodeid) {
1439 r->res_first_lkid = lkb->lkb_id;
1440 send_lookup(r, lkb);
1441 return 1;
1442 }
1443
1444 for (;;) {
1445 /* It's possible for dlm_scand to remove an old rsb for
1446 this same resource from the toss list, for us to create
1447 a new one, look up the master locally, and find that it
1448 already exists just before dlm_scand does the
1449 dir_remove() on the previous rsb. */
1450
1451 error = dlm_dir_lookup(ls, our_nodeid, r->res_name,
1452 r->res_length, &ret_nodeid);
1453 if (!error)
1454 break;
1455 log_debug(ls, "dir_lookup error %d %s", error, r->res_name);
1456 schedule();
1457 }
1458
1459 if (ret_nodeid == our_nodeid) {
1460 r->res_first_lkid = 0;
1461 r->res_nodeid = 0;
1462 lkb->lkb_nodeid = 0;
1463 } else {
1464 r->res_first_lkid = lkb->lkb_id;
1465 r->res_nodeid = ret_nodeid;
1466 lkb->lkb_nodeid = ret_nodeid;
1467 }
1468 return 0;
1469}
1470
1471static void process_lookup_list(struct dlm_rsb *r)
1472{
1473 struct dlm_lkb *lkb, *safe;
1474
1475 list_for_each_entry_safe(lkb, safe, &r->res_lookup, lkb_rsb_lookup) {
1476 list_del(&lkb->lkb_rsb_lookup);
1477 _request_lock(r, lkb);
1478 schedule();
1479 }
1480}
1481
1482/* confirm_master -- confirm (or deny) an rsb's master nodeid */
1483
1484static void confirm_master(struct dlm_rsb *r, int error)
1485{
1486 struct dlm_lkb *lkb;
1487
1488 if (!r->res_first_lkid)
1489 return;
1490
1491 switch (error) {
1492 case 0:
1493 case -EINPROGRESS:
1494 r->res_first_lkid = 0;
1495 process_lookup_list(r);
1496 break;
1497
1498 case -EAGAIN:
1499 /* the remote master didn't queue our NOQUEUE request;
1500 make a waiting lkb the first_lkid */
1501
1502 r->res_first_lkid = 0;
1503
1504 if (!list_empty(&r->res_lookup)) {
1505 lkb = list_entry(r->res_lookup.next, struct dlm_lkb,
1506 lkb_rsb_lookup);
1507 list_del(&lkb->lkb_rsb_lookup);
1508 r->res_first_lkid = lkb->lkb_id;
1509 _request_lock(r, lkb);
1510 } else
1511 r->res_nodeid = -1;
1512 break;
1513
1514 default:
1515 log_error(r->res_ls, "confirm_master unknown error %d", error);
1516 }
1517}
1518
1519static int set_lock_args(int mode, struct dlm_lksb *lksb, uint32_t flags,
1520 int namelen, uint32_t parent_lkid, void *ast,
1521 void *astarg, void *bast, struct dlm_args *args)
1522{
1523 int rv = -EINVAL;
1524
1525 /* check for invalid arg usage */
1526
1527 if (mode < 0 || mode > DLM_LOCK_EX)
1528 goto out;
1529
1530 if (!(flags & DLM_LKF_CONVERT) && (namelen > DLM_RESNAME_MAXLEN))
1531 goto out;
1532
1533 if (flags & DLM_LKF_CANCEL)
1534 goto out;
1535
1536 if (flags & DLM_LKF_QUECVT && !(flags & DLM_LKF_CONVERT))
1537 goto out;
1538
1539 if (flags & DLM_LKF_CONVDEADLK && !(flags & DLM_LKF_CONVERT))
1540 goto out;
1541
1542 if (flags & DLM_LKF_CONVDEADLK && flags & DLM_LKF_NOQUEUE)
1543 goto out;
1544
1545 if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_CONVERT)
1546 goto out;
1547
1548 if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_QUECVT)
1549 goto out;
1550
1551 if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_NOQUEUE)
1552 goto out;
1553
1554 if (flags & DLM_LKF_EXPEDITE && mode != DLM_LOCK_NL)
1555 goto out;
1556
1557 if (!ast || !lksb)
1558 goto out;
1559
1560 if (flags & DLM_LKF_VALBLK && !lksb->sb_lvbptr)
1561 goto out;
1562
1563 /* parent/child locks not yet supported */
1564 if (parent_lkid)
1565 goto out;
1566
1567 if (flags & DLM_LKF_CONVERT && !lksb->sb_lkid)
1568 goto out;
1569
1570 /* these args will be copied to the lkb in validate_lock_args;
1571 it cannot be done now because, when converting locks, fields in
1572 an active lkb cannot be modified before locking the rsb */
1573
1574 args->flags = flags;
1575 args->astaddr = ast;
1576 args->astparam = (long) astarg;
1577 args->bastaddr = bast;
1578 args->mode = mode;
1579 args->lksb = lksb;
1580 rv = 0;
1581 out:
1582 return rv;
1583}
1584
1585static int set_unlock_args(uint32_t flags, void *astarg, struct dlm_args *args)
1586{
1587 if (flags & ~(DLM_LKF_CANCEL | DLM_LKF_VALBLK | DLM_LKF_IVVALBLK |
1588 DLM_LKF_FORCEUNLOCK))
1589 return -EINVAL;
1590
1591 args->flags = flags;
1592 args->astparam = (long) astarg;
1593 return 0;
1594}
1595
1596static int validate_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
1597 struct dlm_args *args)
1598{
1599 int rv = -EINVAL;
1600
1601 if (args->flags & DLM_LKF_CONVERT) {
1602 if (lkb->lkb_flags & DLM_IFL_MSTCPY)
1603 goto out;
1604
1605 if (args->flags & DLM_LKF_QUECVT &&
1606 !__quecvt_compat_matrix[lkb->lkb_grmode+1][args->mode+1])
1607 goto out;
1608
1609 rv = -EBUSY;
1610 if (lkb->lkb_status != DLM_LKSTS_GRANTED)
1611 goto out;
1612
1613 if (lkb->lkb_wait_type)
1614 goto out;
1615 }
1616
1617 lkb->lkb_exflags = args->flags;
1618 lkb->lkb_sbflags = 0;
1619 lkb->lkb_astaddr = args->astaddr;
1620 lkb->lkb_astparam = args->astparam;
1621 lkb->lkb_bastaddr = args->bastaddr;
1622 lkb->lkb_rqmode = args->mode;
1623 lkb->lkb_lksb = args->lksb;
1624 lkb->lkb_lvbptr = args->lksb->sb_lvbptr;
1625 lkb->lkb_ownpid = (int) current->pid;
1626 rv = 0;
1627 out:
1628 return rv;
1629}
1630
1631static int validate_unlock_args(struct dlm_lkb *lkb, struct dlm_args *args)
1632{
1633 int rv = -EINVAL;
1634
1635 if (lkb->lkb_flags & DLM_IFL_MSTCPY)
1636 goto out;
1637
1638 if (args->flags & DLM_LKF_FORCEUNLOCK)
1639 goto out_ok;
1640
1641 if (args->flags & DLM_LKF_CANCEL &&
1642 lkb->lkb_status == DLM_LKSTS_GRANTED)
1643 goto out;
1644
1645 if (!(args->flags & DLM_LKF_CANCEL) &&
1646 lkb->lkb_status != DLM_LKSTS_GRANTED)
1647 goto out;
1648
1649 rv = -EBUSY;
1650 if (lkb->lkb_wait_type)
1651 goto out;
1652
1653 out_ok:
1654 lkb->lkb_exflags = args->flags;
1655 lkb->lkb_sbflags = 0;
1656 lkb->lkb_astparam = args->astparam;
1657
1658 rv = 0;
1659 out:
1660 return rv;
1661}
1662
1663/*
1664 * Four stage 4 varieties:
1665 * do_request(), do_convert(), do_unlock(), do_cancel()
1666 * These are called on the master node for the given lock and
1667 * from the central locking logic.
1668 */
1669
1670static int do_request(struct dlm_rsb *r, struct dlm_lkb *lkb)
1671{
1672 int error = 0;
1673
1674 if (can_be_granted(r, lkb, 1)) {
1675 grant_lock(r, lkb);
1676 queue_cast(r, lkb, 0);
1677 goto out;
1678 }
1679
1680 if (can_be_queued(lkb)) {
1681 error = -EINPROGRESS;
1682 add_lkb(r, lkb, DLM_LKSTS_WAITING);
1683 send_blocking_asts(r, lkb);
1684 goto out;
1685 }
1686
1687 error = -EAGAIN;
1688 if (force_blocking_asts(lkb))
1689 send_blocking_asts_all(r, lkb);
1690 queue_cast(r, lkb, -EAGAIN);
1691
1692 out:
1693 return error;
1694}
1695
1696static int do_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
1697{
1698 int error = 0;
1699
1700 /* changing an existing lock may allow others to be granted */
1701
1702 if (can_be_granted(r, lkb, 1)) {
1703 grant_lock(r, lkb);
1704 queue_cast(r, lkb, 0);
1705 grant_pending_locks(r);
1706 goto out;
1707 }
1708
1709 if (can_be_queued(lkb)) {
1710 if (is_demoted(lkb))
1711 grant_pending_locks(r);
1712 error = -EINPROGRESS;
1713 del_lkb(r, lkb);
1714 add_lkb(r, lkb, DLM_LKSTS_CONVERT);
1715 send_blocking_asts(r, lkb);
1716 goto out;
1717 }
1718
1719 error = -EAGAIN;
1720 if (force_blocking_asts(lkb))
1721 send_blocking_asts_all(r, lkb);
1722 queue_cast(r, lkb, -EAGAIN);
1723
1724 out:
1725 return error;
1726}
1727
1728static int do_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb)
1729{
1730 remove_lock(r, lkb);
1731 queue_cast(r, lkb, -DLM_EUNLOCK);
1732 grant_pending_locks(r);
1733 return -DLM_EUNLOCK;
1734}
1735
1736/* FIXME: if revert_lock() finds that the lkb is granted, we should
1737 skip the queue_cast(ECANCEL). It indicates that the request/convert
1738 completed (and queued a normal ast) just before the cancel; we don't
1739 want to clobber the sb_result for the normal ast with ECANCEL. */
1740
1741static int do_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb)
1742{
1743 revert_lock(r, lkb);
1744 queue_cast(r, lkb, -DLM_ECANCEL);
1745 grant_pending_locks(r);
1746 return -DLM_ECANCEL;
1747}
1748
1749/*
1750 * Four stage 3 varieties:
1751 * _request_lock(), _convert_lock(), _unlock_lock(), _cancel_lock()
1752 */
1753
1754/* add a new lkb to a possibly new rsb, called by requesting process */
1755
1756static int _request_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
1757{
1758 int error;
1759
1760 /* set_master: sets lkb nodeid from r */
1761
1762 error = set_master(r, lkb);
1763 if (error < 0)
1764 goto out;
1765 if (error) {
1766 error = 0;
1767 goto out;
1768 }
1769
1770 if (is_remote(r))
1771 /* receive_request() calls do_request() on remote node */
1772 error = send_request(r, lkb);
1773 else
1774 error = do_request(r, lkb);
1775 out:
1776 return error;
1777}
1778
1779/* change some property of an existing lkb, e.g. mode */
1780
1781static int _convert_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
1782{
1783 int error;
1784
1785 if (is_remote(r))
1786 /* receive_convert() calls do_convert() on remote node */
1787 error = send_convert(r, lkb);
1788 else
1789 error = do_convert(r, lkb);
1790
1791 return error;
1792}
1793
1794/* remove an existing lkb from the granted queue */
1795
1796static int _unlock_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
1797{
1798 int error;
1799
1800 if (is_remote(r))
1801 /* receive_unlock() calls do_unlock() on remote node */
1802 error = send_unlock(r, lkb);
1803 else
1804 error = do_unlock(r, lkb);
1805
1806 return error;
1807}
1808
1809/* remove an existing lkb from the convert or wait queue */
1810
1811static int _cancel_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
1812{
1813 int error;
1814
1815 if (is_remote(r))
1816 /* receive_cancel() calls do_cancel() on remote node */
1817 error = send_cancel(r, lkb);
1818 else
1819 error = do_cancel(r, lkb);
1820
1821 return error;
1822}
1823
1824/*
1825 * Four stage 2 varieties:
1826 * request_lock(), convert_lock(), unlock_lock(), cancel_lock()
1827 */
1828
1829static int request_lock(struct dlm_ls *ls, struct dlm_lkb *lkb, char *name,
1830 int len, struct dlm_args *args)
1831{
1832 struct dlm_rsb *r;
1833 int error;
1834
1835 error = validate_lock_args(ls, lkb, args);
1836 if (error)
1837 goto out;
1838
1839 error = find_rsb(ls, name, len, R_CREATE, &r);
1840 if (error)
1841 goto out;
1842
1843 lock_rsb(r);
1844
1845 attach_lkb(r, lkb);
1846 lkb->lkb_lksb->sb_lkid = lkb->lkb_id;
1847
1848 error = _request_lock(r, lkb);
1849
1850 unlock_rsb(r);
1851 put_rsb(r);
1852
1853 out:
1854 return error;
1855}
1856
1857static int convert_lock(struct dlm_ls *ls, struct dlm_lkb *lkb,
1858 struct dlm_args *args)
1859{
1860 struct dlm_rsb *r;
1861 int error;
1862
1863 r = lkb->lkb_resource;
1864
1865 hold_rsb(r);
1866 lock_rsb(r);
1867
1868 error = validate_lock_args(ls, lkb, args);
1869 if (error)
1870 goto out;
1871
1872 error = _convert_lock(r, lkb);
1873 out:
1874 unlock_rsb(r);
1875 put_rsb(r);
1876 return error;
1877}
1878
1879static int unlock_lock(struct dlm_ls *ls, struct dlm_lkb *lkb,
1880 struct dlm_args *args)
1881{
1882 struct dlm_rsb *r;
1883 int error;
1884
1885 r = lkb->lkb_resource;
1886
1887 hold_rsb(r);
1888 lock_rsb(r);
1889
1890 error = validate_unlock_args(lkb, args);
1891 if (error)
1892 goto out;
1893
1894 error = _unlock_lock(r, lkb);
1895 out:
1896 unlock_rsb(r);
1897 put_rsb(r);
1898 return error;
1899}
1900
1901static int cancel_lock(struct dlm_ls *ls, struct dlm_lkb *lkb,
1902 struct dlm_args *args)
1903{
1904 struct dlm_rsb *r;
1905 int error;
1906
1907 r = lkb->lkb_resource;
1908
1909 hold_rsb(r);
1910 lock_rsb(r);
1911
1912 error = validate_unlock_args(lkb, args);
1913 if (error)
1914 goto out;
1915
1916 error = _cancel_lock(r, lkb);
1917 out:
1918 unlock_rsb(r);
1919 put_rsb(r);
1920 return error;
1921}
1922
1923/*
1924 * Two stage 1 varieties: dlm_lock() and dlm_unlock()
1925 */
1926
1927int dlm_lock(dlm_lockspace_t *lockspace,
1928 int mode,
1929 struct dlm_lksb *lksb,
1930 uint32_t flags,
1931 void *name,
1932 unsigned int namelen,
1933 uint32_t parent_lkid,
1934 void (*ast) (void *astarg),
1935 void *astarg,
1936 void (*bast) (void *astarg, int mode))
1937{
1938 struct dlm_ls *ls;
1939 struct dlm_lkb *lkb;
1940 struct dlm_args args;
1941 int error, convert = flags & DLM_LKF_CONVERT;
1942
1943 ls = dlm_find_lockspace_local(lockspace);
1944 if (!ls)
1945 return -EINVAL;
1946
1947 lock_recovery(ls);
1948
1949 if (convert)
1950 error = find_lkb(ls, lksb->sb_lkid, &lkb);
1951 else
1952 error = create_lkb(ls, &lkb);
1953
1954 if (error)
1955 goto out;
1956
1957 error = set_lock_args(mode, lksb, flags, namelen, parent_lkid, ast,
1958 astarg, bast, &args);
1959 if (error)
1960 goto out_put;
1961
1962 if (convert)
1963 error = convert_lock(ls, lkb, &args);
1964 else
1965 error = request_lock(ls, lkb, name, namelen, &args);
1966
1967 if (error == -EINPROGRESS)
1968 error = 0;
1969 out_put:
1970 if (convert || error)
1971 __put_lkb(ls, lkb);
1972 if (error == -EAGAIN)
1973 error = 0;
1974 out:
1975 unlock_recovery(ls);
1976 dlm_put_lockspace(ls);
1977 return error;
1978}
1979
1980int dlm_unlock(dlm_lockspace_t *lockspace,
1981 uint32_t lkid,
1982 uint32_t flags,
1983 struct dlm_lksb *lksb,
1984 void *astarg)
1985{
1986 struct dlm_ls *ls;
1987 struct dlm_lkb *lkb;
1988 struct dlm_args args;
1989 int error;
1990
1991 ls = dlm_find_lockspace_local(lockspace);
1992 if (!ls)
1993 return -EINVAL;
1994
1995 lock_recovery(ls);
1996
1997 error = find_lkb(ls, lkid, &lkb);
1998 if (error)
1999 goto out;
2000
2001 error = set_unlock_args(flags, astarg, &args);
2002 if (error)
2003 goto out_put;
2004
2005 if (flags & DLM_LKF_CANCEL)
2006 error = cancel_lock(ls, lkb, &args);
2007 else
2008 error = unlock_lock(ls, lkb, &args);
2009
2010 if (error == -DLM_EUNLOCK || error == -DLM_ECANCEL)
2011 error = 0;
2012 out_put:
2013 dlm_put_lkb(lkb);
2014 out:
2015 unlock_recovery(ls);
2016 dlm_put_lockspace(ls);
2017 return error;
2018}
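
/* Illustrative sketch, not part of the original file: a minimal synchronous
 * caller of dlm_lock()/dlm_unlock() as defined above.  Everything named
 * example_* is an assumption made for the sketch, as is the use of a
 * completion (<linux/completion.h>) to wait for the ast callbacks; real
 * callers keep their own per-lock state instead of a static lksb. */
#if 0
static struct dlm_lksb example_lksb;

static void example_ast(void *astarg)
{
	/* request/convert/unlock has completed; result is in sb_status */
	complete((struct completion *) astarg);
}

static void example_bast(void *astarg, int mode)
{
	/* another node is blocked on a mode incompatible with our grant */
}

static int example_lock_unlock(dlm_lockspace_t *ls)
{
	struct completion done;
	int error;

	init_completion(&done);

	/* "example-resource" is 16 bytes; parent_lkid must be 0 since
	   parent/child locks are rejected by set_lock_args() */
	error = dlm_lock(ls, DLM_LOCK_EX, &example_lksb, 0,
			 "example-resource", 16, 0,
			 example_ast, &done, example_bast);
	if (error)
		return error;
	wait_for_completion(&done);
	if (example_lksb.sb_status)
		return example_lksb.sb_status;

	init_completion(&done);
	error = dlm_unlock(ls, example_lksb.sb_lkid, 0, &example_lksb, &done);
	if (error)
		return error;
	wait_for_completion(&done);

	/* do_unlock() completes the ast with -DLM_EUNLOCK; treat it as ok */
	return example_lksb.sb_status == -DLM_EUNLOCK ? 0 :
						example_lksb.sb_status;
}
#endif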
2019
2020/*
2021 * send/receive routines for remote operations and replies
2022 *
2023 * send_args
2024 * send_common
2025 * send_request receive_request
2026 * send_convert receive_convert
2027 * send_unlock receive_unlock
2028 * send_cancel receive_cancel
2029 * send_grant receive_grant
2030 * send_bast receive_bast
2031 * send_lookup receive_lookup
2032 * send_remove receive_remove
2033 *
2034 * send_common_reply
2035 * receive_request_reply send_request_reply
2036 * receive_convert_reply send_convert_reply
2037 * receive_unlock_reply send_unlock_reply
2038 * receive_cancel_reply send_cancel_reply
2039 * receive_lookup_reply send_lookup_reply
2040 */
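
/* Illustrative trace, added for clarity and not part of the original source:
 * for a request whose resource is mastered on another node, the routines
 * above and below combine roughly as follows:
 *
 *   this node:   dlm_lock() -> request_lock() -> _request_lock() ->
 *                send_request()            (lkb placed on ls_waiters)
 *   master node: receive_request() -> do_request() -> send_request_reply()
 *   this node:   receive_request_reply()   (lkb removed from ls_waiters),
 *                which queues the completion ast on a grant, adds the lkb
 *                to the wait queue on -EINPROGRESS, or queues -EAGAIN. */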
2041
2042static int create_message(struct dlm_rsb *r, struct dlm_lkb *lkb,
2043 int to_nodeid, int mstype,
2044 struct dlm_message **ms_ret,
2045 struct dlm_mhandle **mh_ret)
2046{
2047 struct dlm_message *ms;
2048 struct dlm_mhandle *mh;
2049 char *mb;
2050 int mb_len = sizeof(struct dlm_message);
2051
2052 switch (mstype) {
2053 case DLM_MSG_REQUEST:
2054 case DLM_MSG_LOOKUP:
2055 case DLM_MSG_REMOVE:
2056 mb_len += r->res_length;
2057 break;
2058 case DLM_MSG_CONVERT:
2059 case DLM_MSG_UNLOCK:
2060 case DLM_MSG_REQUEST_REPLY:
2061 case DLM_MSG_CONVERT_REPLY:
2062 case DLM_MSG_GRANT:
2063 if (lkb && lkb->lkb_lvbptr)
2064 mb_len += r->res_ls->ls_lvblen;
2065 break;
2066 }
2067
2068 /* get_buffer gives us a message handle (mh) that we need to
2069 pass into lowcomms_commit and a message buffer (mb) that we
2070 write our data into */
2071
2072 mh = dlm_lowcomms_get_buffer(to_nodeid, mb_len, GFP_KERNEL, &mb);
2073 if (!mh)
2074 return -ENOBUFS;
2075
2076 memset(mb, 0, mb_len);
2077
2078 ms = (struct dlm_message *) mb;
2079
2080 ms->m_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
2081 ms->m_header.h_lockspace = r->res_ls->ls_global_id;
2082 ms->m_header.h_nodeid = dlm_our_nodeid();
2083 ms->m_header.h_length = mb_len;
2084 ms->m_header.h_cmd = DLM_MSG;
2085
2086 ms->m_type = mstype;
2087
2088 *mh_ret = mh;
2089 *ms_ret = ms;
2090 return 0;
2091}
2092
2093/* further lowcomms enhancements or alternate implementations may make
2094 the return value from this function useful at some point */
2095
2096static int send_message(struct dlm_mhandle *mh, struct dlm_message *ms)
2097{
2098 dlm_message_out(ms);
2099 dlm_lowcomms_commit_buffer(mh);
2100 return 0;
2101}
2102
2103static void send_args(struct dlm_rsb *r, struct dlm_lkb *lkb,
2104 struct dlm_message *ms)
2105{
2106 ms->m_nodeid = lkb->lkb_nodeid;
2107 ms->m_pid = lkb->lkb_ownpid;
2108 ms->m_lkid = lkb->lkb_id;
2109 ms->m_remid = lkb->lkb_remid;
2110 ms->m_exflags = lkb->lkb_exflags;
2111 ms->m_sbflags = lkb->lkb_sbflags;
2112 ms->m_flags = lkb->lkb_flags;
2113 ms->m_lvbseq = lkb->lkb_lvbseq;
2114 ms->m_status = lkb->lkb_status;
2115 ms->m_grmode = lkb->lkb_grmode;
2116 ms->m_rqmode = lkb->lkb_rqmode;
2117 ms->m_hash = r->res_hash;
2118
2119 /* m_result and m_bastmode are set from function args,
2120 not from lkb fields */
2121
2122 if (lkb->lkb_bastaddr)
2123 ms->m_asts |= AST_BAST;
2124 if (lkb->lkb_astaddr)
2125 ms->m_asts |= AST_COMP;
2126
2127 if (ms->m_type == DLM_MSG_REQUEST || ms->m_type == DLM_MSG_LOOKUP)
2128 memcpy(ms->m_extra, r->res_name, r->res_length);
2129
2130 else if (lkb->lkb_lvbptr)
2131 memcpy(ms->m_extra, lkb->lkb_lvbptr, r->res_ls->ls_lvblen);
2132
2133}
2134
2135static int send_common(struct dlm_rsb *r, struct dlm_lkb *lkb, int mstype)
2136{
2137 struct dlm_message *ms;
2138 struct dlm_mhandle *mh;
2139 int to_nodeid, error;
2140
2141 add_to_waiters(lkb, mstype);
2142
2143 to_nodeid = r->res_nodeid;
2144
2145 error = create_message(r, lkb, to_nodeid, mstype, &ms, &mh);
2146 if (error)
2147 goto fail;
2148
2149 send_args(r, lkb, ms);
2150
2151 error = send_message(mh, ms);
2152 if (error)
2153 goto fail;
2154 return 0;
2155
2156 fail:
2157 remove_from_waiters(lkb);
2158 return error;
2159}
2160
2161static int send_request(struct dlm_rsb *r, struct dlm_lkb *lkb)
2162{
2163 return send_common(r, lkb, DLM_MSG_REQUEST);
2164}
2165
2166static int send_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
2167{
2168 int error;
2169
2170 error = send_common(r, lkb, DLM_MSG_CONVERT);
2171
2172 /* down conversions go without a reply from the master */
2173 if (!error && down_conversion(lkb)) {
2174 remove_from_waiters(lkb);
2175 r->res_ls->ls_stub_ms.m_result = 0;
2176 __receive_convert_reply(r, lkb, &r->res_ls->ls_stub_ms);
2177 }
2178
2179 return error;
2180}
2181
2182/* FIXME: if this lkb is the only lock we hold on the rsb, then set
2183 MASTER_UNCERTAIN to force the next request on the rsb to confirm
2184 that the master is still correct. */
2185
2186static int send_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb)
2187{
2188 return send_common(r, lkb, DLM_MSG_UNLOCK);
2189}
2190
2191static int send_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb)
2192{
2193 return send_common(r, lkb, DLM_MSG_CANCEL);
2194}
2195
2196static int send_grant(struct dlm_rsb *r, struct dlm_lkb *lkb)
2197{
2198 struct dlm_message *ms;
2199 struct dlm_mhandle *mh;
2200 int to_nodeid, error;
2201
2202 to_nodeid = lkb->lkb_nodeid;
2203
2204 error = create_message(r, lkb, to_nodeid, DLM_MSG_GRANT, &ms, &mh);
2205 if (error)
2206 goto out;
2207
2208 send_args(r, lkb, ms);
2209
2210 ms->m_result = 0;
2211
2212 error = send_message(mh, ms);
2213 out:
2214 return error;
2215}
2216
2217static int send_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int mode)
2218{
2219 struct dlm_message *ms;
2220 struct dlm_mhandle *mh;
2221 int to_nodeid, error;
2222
2223 to_nodeid = lkb->lkb_nodeid;
2224
2225 error = create_message(r, NULL, to_nodeid, DLM_MSG_BAST, &ms, &mh);
2226 if (error)
2227 goto out;
2228
2229 send_args(r, lkb, ms);
2230
2231 ms->m_bastmode = mode;
2232
2233 error = send_message(mh, ms);
2234 out:
2235 return error;
2236}
2237
2238static int send_lookup(struct dlm_rsb *r, struct dlm_lkb *lkb)
2239{
2240 struct dlm_message *ms;
2241 struct dlm_mhandle *mh;
2242 int to_nodeid, error;
2243
2244 add_to_waiters(lkb, DLM_MSG_LOOKUP);
2245
2246 to_nodeid = dlm_dir_nodeid(r);
2247
2248 error = create_message(r, NULL, to_nodeid, DLM_MSG_LOOKUP, &ms, &mh);
2249 if (error)
2250 goto fail;
2251
2252 send_args(r, lkb, ms);
2253
2254 error = send_message(mh, ms);
2255 if (error)
2256 goto fail;
2257 return 0;
2258
2259 fail:
2260 remove_from_waiters(lkb);
2261 return error;
2262}
2263
2264static int send_remove(struct dlm_rsb *r)
2265{
2266 struct dlm_message *ms;
2267 struct dlm_mhandle *mh;
2268 int to_nodeid, error;
2269
2270 to_nodeid = dlm_dir_nodeid(r);
2271
2272 error = create_message(r, NULL, to_nodeid, DLM_MSG_REMOVE, &ms, &mh);
2273 if (error)
2274 goto out;
2275
2276 memcpy(ms->m_extra, r->res_name, r->res_length);
2277 ms->m_hash = r->res_hash;
2278
2279 error = send_message(mh, ms);
2280 out:
2281 return error;
2282}
2283
2284static int send_common_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
2285 int mstype, int rv)
2286{
2287 struct dlm_message *ms;
2288 struct dlm_mhandle *mh;
2289 int to_nodeid, error;
2290
2291 to_nodeid = lkb->lkb_nodeid;
2292
2293 error = create_message(r, lkb, to_nodeid, mstype, &ms, &mh);
2294 if (error)
2295 goto out;
2296
2297 send_args(r, lkb, ms);
2298
2299 ms->m_result = rv;
2300
2301 error = send_message(mh, ms);
2302 out:
2303 return error;
2304}
2305
2306static int send_request_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
2307{
2308 return send_common_reply(r, lkb, DLM_MSG_REQUEST_REPLY, rv);
2309}
2310
2311static int send_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
2312{
2313 return send_common_reply(r, lkb, DLM_MSG_CONVERT_REPLY, rv);
2314}
2315
2316static int send_unlock_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
2317{
2318 return send_common_reply(r, lkb, DLM_MSG_UNLOCK_REPLY, rv);
2319}
2320
2321static int send_cancel_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
2322{
2323 return send_common_reply(r, lkb, DLM_MSG_CANCEL_REPLY, rv);
2324}
2325
2326static int send_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms_in,
2327 int ret_nodeid, int rv)
2328{
2329 struct dlm_rsb *r = &ls->ls_stub_rsb;
2330 struct dlm_message *ms;
2331 struct dlm_mhandle *mh;
2332 int error, nodeid = ms_in->m_header.h_nodeid;
2333
2334 error = create_message(r, NULL, nodeid, DLM_MSG_LOOKUP_REPLY, &ms, &mh);
2335 if (error)
2336 goto out;
2337
2338 ms->m_lkid = ms_in->m_lkid;
2339 ms->m_result = rv;
2340 ms->m_nodeid = ret_nodeid;
2341
2342 error = send_message(mh, ms);
2343 out:
2344 return error;
2345}
2346
2347/* which args we save from a received message depends heavily on the type
2348 of message, unlike the send side where we can safely send everything about
2349 the lkb for any type of message */
2350
2351static void receive_flags(struct dlm_lkb *lkb, struct dlm_message *ms)
2352{
2353 lkb->lkb_exflags = ms->m_exflags;
2354 lkb->lkb_flags = (lkb->lkb_flags & 0xFFFF0000) |
2355 (ms->m_flags & 0x0000FFFF);
2356}
2357
2358static void receive_flags_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
2359{
2360 lkb->lkb_sbflags = ms->m_sbflags;
2361 lkb->lkb_flags = (lkb->lkb_flags & 0xFFFF0000) |
2362 (ms->m_flags & 0x0000FFFF);
2363}
2364
2365static int receive_extralen(struct dlm_message *ms)
2366{
2367 return (ms->m_header.h_length - sizeof(struct dlm_message));
2368}
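
/* Worked example, added for clarity and not part of the original source:
 * create_message() above sizes a DLM_MSG_REQUEST as
 * sizeof(struct dlm_message) + r->res_length, so for such a message
 * receive_extralen() returns the resource name length; for a message that
 * carries an LVB it returns ls_lvblen. */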
2369
2370static int receive_lvb(struct dlm_ls *ls, struct dlm_lkb *lkb,
2371 struct dlm_message *ms)
2372{
2373 int len;
2374
2375 if (lkb->lkb_exflags & DLM_LKF_VALBLK) {
2376 if (!lkb->lkb_lvbptr)
2377 lkb->lkb_lvbptr = allocate_lvb(ls);
2378 if (!lkb->lkb_lvbptr)
2379 return -ENOMEM;
2380 len = receive_extralen(ms);
2381 memcpy(lkb->lkb_lvbptr, ms->m_extra, len);
2382 }
2383 return 0;
2384}
2385
2386static int receive_request_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
2387 struct dlm_message *ms)
2388{
2389 lkb->lkb_nodeid = ms->m_header.h_nodeid;
2390 lkb->lkb_ownpid = ms->m_pid;
2391 lkb->lkb_remid = ms->m_lkid;
2392 lkb->lkb_grmode = DLM_LOCK_IV;
2393 lkb->lkb_rqmode = ms->m_rqmode;
2394 lkb->lkb_bastaddr = (void *) (long) (ms->m_asts & AST_BAST);
2395 lkb->lkb_astaddr = (void *) (long) (ms->m_asts & AST_COMP);
2396
2397 DLM_ASSERT(is_master_copy(lkb), dlm_print_lkb(lkb););
2398
2399 if (receive_lvb(ls, lkb, ms))
2400 return -ENOMEM;
2401
2402 return 0;
2403}
2404
2405static int receive_convert_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
2406 struct dlm_message *ms)
2407{
2408 if (lkb->lkb_nodeid != ms->m_header.h_nodeid) {
2409 log_error(ls, "convert_args nodeid %d %d lkid %x %x",
2410 lkb->lkb_nodeid, ms->m_header.h_nodeid,
2411 lkb->lkb_id, lkb->lkb_remid);
2412 return -EINVAL;
2413 }
2414
2415 if (!is_master_copy(lkb))
2416 return -EINVAL;
2417
2418 if (lkb->lkb_status != DLM_LKSTS_GRANTED)
2419 return -EBUSY;
2420
2421 if (receive_lvb(ls, lkb, ms))
2422 return -ENOMEM;
2423
2424 lkb->lkb_rqmode = ms->m_rqmode;
2425 lkb->lkb_lvbseq = ms->m_lvbseq;
2426
2427 return 0;
2428}
2429
2430static int receive_unlock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
2431 struct dlm_message *ms)
2432{
2433 if (!is_master_copy(lkb))
2434 return -EINVAL;
2435 if (receive_lvb(ls, lkb, ms))
2436 return -ENOMEM;
2437 return 0;
2438}
2439
2440/* We fill in the stub-lkb fields with the info that send_xxxx_reply()
2441 uses to send a reply and that the remote end uses to process the reply. */
2442
2443static void setup_stub_lkb(struct dlm_ls *ls, struct dlm_message *ms)
2444{
2445 struct dlm_lkb *lkb = &ls->ls_stub_lkb;
2446 lkb->lkb_nodeid = ms->m_header.h_nodeid;
2447 lkb->lkb_remid = ms->m_lkid;
2448}
2449
2450static void receive_request(struct dlm_ls *ls, struct dlm_message *ms)
2451{
2452 struct dlm_lkb *lkb;
2453 struct dlm_rsb *r;
2454 int error, namelen;
2455
2456 error = create_lkb(ls, &lkb);
2457 if (error)
2458 goto fail;
2459
2460 receive_flags(lkb, ms);
2461 lkb->lkb_flags |= DLM_IFL_MSTCPY;
2462 error = receive_request_args(ls, lkb, ms);
2463 if (error) {
2464 __put_lkb(ls, lkb);
2465 goto fail;
2466 }
2467
2468 namelen = receive_extralen(ms);
2469
2470 error = find_rsb(ls, ms->m_extra, namelen, R_MASTER, &r);
2471 if (error) {
2472 __put_lkb(ls, lkb);
2473 goto fail;
2474 }
2475
2476 lock_rsb(r);
2477
2478 attach_lkb(r, lkb);
2479 error = do_request(r, lkb);
2480 send_request_reply(r, lkb, error);
2481
2482 unlock_rsb(r);
2483 put_rsb(r);
2484
2485 if (error == -EINPROGRESS)
2486 error = 0;
2487 if (error)
2488 dlm_put_lkb(lkb);
2489 return;
2490
2491 fail:
2492 setup_stub_lkb(ls, ms);
2493 send_request_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
2494}
2495
2496static void receive_convert(struct dlm_ls *ls, struct dlm_message *ms)
2497{
2498 struct dlm_lkb *lkb;
2499 struct dlm_rsb *r;
2500 int error, reply = 1;
2501
2502 error = find_lkb(ls, ms->m_remid, &lkb);
2503 if (error)
2504 goto fail;
2505
2506 r = lkb->lkb_resource;
2507
2508 hold_rsb(r);
2509 lock_rsb(r);
2510
2511 receive_flags(lkb, ms);
2512 error = receive_convert_args(ls, lkb, ms);
2513 if (error)
2514 goto out;
2515 reply = !down_conversion(lkb);
2516
2517 error = do_convert(r, lkb);
2518 out:
2519 if (reply)
2520 send_convert_reply(r, lkb, error);
2521
2522 unlock_rsb(r);
2523 put_rsb(r);
2524 dlm_put_lkb(lkb);
2525 return;
2526
2527 fail:
2528 setup_stub_lkb(ls, ms);
2529 send_convert_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
2530}
2531
2532static void receive_unlock(struct dlm_ls *ls, struct dlm_message *ms)
2533{
2534 struct dlm_lkb *lkb;
2535 struct dlm_rsb *r;
2536 int error;
2537
2538 error = find_lkb(ls, ms->m_remid, &lkb);
2539 if (error)
2540 goto fail;
2541
2542 r = lkb->lkb_resource;
2543
2544 hold_rsb(r);
2545 lock_rsb(r);
2546
2547 receive_flags(lkb, ms);
2548 error = receive_unlock_args(ls, lkb, ms);
2549 if (error)
2550 goto out;
2551
2552 error = do_unlock(r, lkb);
2553 out:
2554 send_unlock_reply(r, lkb, error);
2555
2556 unlock_rsb(r);
2557 put_rsb(r);
2558 dlm_put_lkb(lkb);
2559 return;
2560
2561 fail:
2562 setup_stub_lkb(ls, ms);
2563 send_unlock_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
2564}
2565
2566static void receive_cancel(struct dlm_ls *ls, struct dlm_message *ms)
2567{
2568 struct dlm_lkb *lkb;
2569 struct dlm_rsb *r;
2570 int error;
2571
2572 error = find_lkb(ls, ms->m_remid, &lkb);
2573 if (error)
2574 goto fail;
2575
2576 receive_flags(lkb, ms);
2577
2578 r = lkb->lkb_resource;
2579
2580 hold_rsb(r);
2581 lock_rsb(r);
2582
2583 error = do_cancel(r, lkb);
2584 send_cancel_reply(r, lkb, error);
2585
2586 unlock_rsb(r);
2587 put_rsb(r);
2588 dlm_put_lkb(lkb);
2589 return;
2590
2591 fail:
2592 setup_stub_lkb(ls, ms);
2593 send_cancel_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
2594}
2595
2596static void receive_grant(struct dlm_ls *ls, struct dlm_message *ms)
2597{
2598 struct dlm_lkb *lkb;
2599 struct dlm_rsb *r;
2600 int error;
2601
2602 error = find_lkb(ls, ms->m_remid, &lkb);
2603 if (error) {
2604 log_error(ls, "receive_grant no lkb");
2605 return;
2606 }
2607 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
2608
2609 r = lkb->lkb_resource;
2610
2611 hold_rsb(r);
2612 lock_rsb(r);
2613
2614 receive_flags_reply(lkb, ms);
2615 grant_lock_pc(r, lkb, ms);
2616 queue_cast(r, lkb, 0);
2617
2618 unlock_rsb(r);
2619 put_rsb(r);
2620 dlm_put_lkb(lkb);
2621}
2622
2623static void receive_bast(struct dlm_ls *ls, struct dlm_message *ms)
2624{
2625 struct dlm_lkb *lkb;
2626 struct dlm_rsb *r;
2627 int error;
2628
2629 error = find_lkb(ls, ms->m_remid, &lkb);
2630 if (error) {
2631 log_error(ls, "receive_bast no lkb");
2632 return;
2633 }
2634 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
2635
2636 r = lkb->lkb_resource;
2637
2638 hold_rsb(r);
2639 lock_rsb(r);
2640
2641 queue_bast(r, lkb, ms->m_bastmode);
2642
2643 unlock_rsb(r);
2644 put_rsb(r);
2645 dlm_put_lkb(lkb);
2646}
2647
2648static void receive_lookup(struct dlm_ls *ls, struct dlm_message *ms)
2649{
2650 int len, error, ret_nodeid, dir_nodeid, from_nodeid, our_nodeid;
2651
2652 from_nodeid = ms->m_header.h_nodeid;
2653 our_nodeid = dlm_our_nodeid();
2654
2655 len = receive_extralen(ms);
2656
2657 dir_nodeid = dlm_hash2nodeid(ls, ms->m_hash);
2658 if (dir_nodeid != our_nodeid) {
2659 log_error(ls, "lookup dir_nodeid %d from %d",
2660 dir_nodeid, from_nodeid);
2661 error = -EINVAL;
2662 ret_nodeid = -1;
2663 goto out;
2664 }
2665
2666 error = dlm_dir_lookup(ls, from_nodeid, ms->m_extra, len, &ret_nodeid);
2667
2668 /* Optimization: we're master so treat lookup as a request */
2669 if (!error && ret_nodeid == our_nodeid) {
2670 receive_request(ls, ms);
2671 return;
2672 }
2673 out:
2674 send_lookup_reply(ls, ms, ret_nodeid, error);
2675}
2676
2677static void receive_remove(struct dlm_ls *ls, struct dlm_message *ms)
2678{
2679 int len, dir_nodeid, from_nodeid;
2680
2681 from_nodeid = ms->m_header.h_nodeid;
2682
2683 len = receive_extralen(ms);
2684
2685 dir_nodeid = dlm_hash2nodeid(ls, ms->m_hash);
2686 if (dir_nodeid != dlm_our_nodeid()) {
2687 log_error(ls, "remove dir entry dir_nodeid %d from %d",
2688 dir_nodeid, from_nodeid);
2689 return;
2690 }
2691
2692 dlm_dir_remove_entry(ls, from_nodeid, ms->m_extra, len);
2693}
2694
2695static void receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms)
2696{
2697 struct dlm_lkb *lkb;
2698 struct dlm_rsb *r;
2699 int error, mstype;
2700
2701 error = find_lkb(ls, ms->m_remid, &lkb);
2702 if (error) {
2703 log_error(ls, "receive_request_reply no lkb");
2704 return;
2705 }
2706 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
2707
2708 mstype = lkb->lkb_wait_type;
2709 error = remove_from_waiters(lkb);
2710 if (error) {
2711 log_error(ls, "receive_request_reply not on waiters");
2712 goto out;
2713 }
2714
2715 /* this is the value returned from do_request() on the master */
2716 error = ms->m_result;
2717
2718 r = lkb->lkb_resource;
2719 hold_rsb(r);
2720 lock_rsb(r);
2721
2722 /* Optimization: the dir node was also the master, so it took our
2723 lookup as a request and sent request reply instead of lookup reply */
2724 if (mstype == DLM_MSG_LOOKUP) {
2725 r->res_nodeid = ms->m_header.h_nodeid;
2726 lkb->lkb_nodeid = r->res_nodeid;
2727 }
2728
2729 switch (error) {
2730 case -EAGAIN:
2731 /* request would block (be queued) on remote master;
2732 the unhold undoes the original ref from create_lkb()
2733 so it leads to the lkb being freed */
2734 queue_cast(r, lkb, -EAGAIN);
2735 confirm_master(r, -EAGAIN);
2736 unhold_lkb(lkb);
2737 break;
2738
2739 case -EINPROGRESS:
2740 case 0:
2741 /* request was queued or granted on remote master */
2742 receive_flags_reply(lkb, ms);
2743 lkb->lkb_remid = ms->m_lkid;
2744 if (error)
2745 add_lkb(r, lkb, DLM_LKSTS_WAITING);
2746 else {
2747 grant_lock_pc(r, lkb, ms);
2748 queue_cast(r, lkb, 0);
2749 }
2750 confirm_master(r, error);
2751 break;
2752
2753 case -EBADR:
2754 case -ENOTBLK:
2755 /* find_rsb failed to find rsb or rsb wasn't master */
2756 r->res_nodeid = -1;
2757 lkb->lkb_nodeid = -1;
2758 _request_lock(r, lkb);
2759 break;
2760
2761 default:
2762 log_error(ls, "receive_request_reply error %d", error);
2763 }
2764
2765 unlock_rsb(r);
2766 put_rsb(r);
2767 out:
2768 dlm_put_lkb(lkb);
2769}
2770
2771static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
2772 struct dlm_message *ms)
2773{
2774 int error = ms->m_result;
2775
2776 /* this is the value returned from do_convert() on the master */
2777
2778 switch (error) {
2779 case -EAGAIN:
2780 /* convert would block (be queued) on remote master */
2781 queue_cast(r, lkb, -EAGAIN);
2782 break;
2783
2784 case -EINPROGRESS:
2785 /* convert was queued on remote master */
2786 del_lkb(r, lkb);
2787 add_lkb(r, lkb, DLM_LKSTS_CONVERT);
2788 break;
2789
2790 case 0:
2791 /* convert was granted on remote master */
2792 receive_flags_reply(lkb, ms);
2793 grant_lock_pc(r, lkb, ms);
2794 queue_cast(r, lkb, 0);
2795 break;
2796
2797 default:
2798 log_error(r->res_ls, "receive_convert_reply error %d", error);
2799 }
2800}
2801
2802static void _receive_convert_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
2803{
2804 struct dlm_rsb *r = lkb->lkb_resource;
2805
2806 hold_rsb(r);
2807 lock_rsb(r);
2808
2809 __receive_convert_reply(r, lkb, ms);
2810
2811 unlock_rsb(r);
2812 put_rsb(r);
2813}
2814
2815static void receive_convert_reply(struct dlm_ls *ls, struct dlm_message *ms)
2816{
2817 struct dlm_lkb *lkb;
2818 int error;
2819
2820 error = find_lkb(ls, ms->m_remid, &lkb);
2821 if (error) {
2822 log_error(ls, "receive_convert_reply no lkb");
2823 return;
2824 }
2825 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
2826
2827 error = remove_from_waiters(lkb);
2828 if (error) {
2829 log_error(ls, "receive_convert_reply not on waiters");
2830 goto out;
2831 }
2832
2833 _receive_convert_reply(lkb, ms);
2834 out:
2835 dlm_put_lkb(lkb);
2836}
2837
2838static void _receive_unlock_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
2839{
2840 struct dlm_rsb *r = lkb->lkb_resource;
2841 int error = ms->m_result;
2842
2843 hold_rsb(r);
2844 lock_rsb(r);
2845
2846 /* this is the value returned from do_unlock() on the master */
2847
2848 switch (error) {
2849 case -DLM_EUNLOCK:
2850 receive_flags_reply(lkb, ms);
2851 remove_lock_pc(r, lkb);
2852 queue_cast(r, lkb, -DLM_EUNLOCK);
2853 break;
2854 default:
2855 log_error(r->res_ls, "receive_unlock_reply error %d", error);
2856 }
2857
2858 unlock_rsb(r);
2859 put_rsb(r);
2860}
2861
2862static void receive_unlock_reply(struct dlm_ls *ls, struct dlm_message *ms)
2863{
2864 struct dlm_lkb *lkb;
2865 int error;
2866
2867 error = find_lkb(ls, ms->m_remid, &lkb);
2868 if (error) {
2869 log_error(ls, "receive_unlock_reply no lkb");
2870 return;
2871 }
2872 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
2873
2874 error = remove_from_waiters(lkb);
2875 if (error) {
2876 log_error(ls, "receive_unlock_reply not on waiters");
2877 goto out;
2878 }
2879
2880 _receive_unlock_reply(lkb, ms);
2881 out:
2882 dlm_put_lkb(lkb);
2883}
2884
2885static void _receive_cancel_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
2886{
2887 struct dlm_rsb *r = lkb->lkb_resource;
2888 int error = ms->m_result;
2889
2890 hold_rsb(r);
2891 lock_rsb(r);
2892
2893 /* this is the value returned from do_cancel() on the master */
2894
2895 switch (error) {
2896 case -DLM_ECANCEL:
2897 receive_flags_reply(lkb, ms);
2898 revert_lock_pc(r, lkb);
2899 queue_cast(r, lkb, -DLM_ECANCEL);
2900 break;
2901 default:
2902 log_error(r->res_ls, "receive_cancel_reply error %d", error);
2903 }
2904
2905 unlock_rsb(r);
2906 put_rsb(r);
2907}
2908
2909static void receive_cancel_reply(struct dlm_ls *ls, struct dlm_message *ms)
2910{
2911 struct dlm_lkb *lkb;
2912 int error;
2913
2914 error = find_lkb(ls, ms->m_remid, &lkb);
2915 if (error) {
2916 log_error(ls, "receive_cancel_reply no lkb");
2917 return;
2918 }
2919 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
2920
2921 error = remove_from_waiters(lkb);
2922 if (error) {
2923 log_error(ls, "receive_cancel_reply not on waiters");
2924 goto out;
2925 }
2926
2927 _receive_cancel_reply(lkb, ms);
2928 out:
2929 dlm_put_lkb(lkb);
2930}
2931
2932static void receive_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms)
2933{
2934 struct dlm_lkb *lkb;
2935 struct dlm_rsb *r;
2936 int error, ret_nodeid;
2937
2938 error = find_lkb(ls, ms->m_lkid, &lkb);
2939 if (error) {
2940 log_error(ls, "receive_lookup_reply no lkb");
2941 return;
2942 }
2943
2944 error = remove_from_waiters(lkb);
2945 if (error) {
2946 log_error(ls, "receive_lookup_reply not on waiters");
2947 goto out;
2948 }
2949
2950 /* this is the value returned by dlm_dir_lookup on dir node
2951 FIXME: will a non-zero error ever be returned? */
2952 error = ms->m_result;
2953
2954 r = lkb->lkb_resource;
2955 hold_rsb(r);
2956 lock_rsb(r);
2957
2958 ret_nodeid = ms->m_nodeid;
2959 if (ret_nodeid == dlm_our_nodeid()) {
2960 r->res_nodeid = 0;
2961 ret_nodeid = 0;
2962 r->res_first_lkid = 0;
2963 } else {
2964 /* set_master() will copy res_nodeid to lkb_nodeid */
2965 r->res_nodeid = ret_nodeid;
2966 }
2967
2968 _request_lock(r, lkb);
2969
2970 if (!ret_nodeid)
2971 process_lookup_list(r);
2972
2973 unlock_rsb(r);
2974 put_rsb(r);
2975 out:
2976 dlm_put_lkb(lkb);
2977}
2978
2979int dlm_receive_message(struct dlm_header *hd, int nodeid, int recovery)
2980{
2981 struct dlm_message *ms = (struct dlm_message *) hd;
2982 struct dlm_ls *ls;
2983 int error;
2984
2985 if (!recovery)
2986 dlm_message_in(ms);
2987
2988 ls = dlm_find_lockspace_global(hd->h_lockspace);
2989 if (!ls) {
2990 log_print("drop message %d from %d for unknown lockspace %d",
2991 ms->m_type, nodeid, hd->h_lockspace);
2992 return -EINVAL;
2993 }
2994
2995 /* recovery may have just ended leaving a bunch of backed-up requests
2996 in the requestqueue; wait while dlm_recoverd clears them */
2997
2998 if (!recovery)
2999 dlm_wait_requestqueue(ls);
3000
3001 /* recovery may have just started while there were a bunch of
3002 in-flight requests -- save them in requestqueue to be processed
3003 after recovery. we can't let dlm_recvd block on the recovery
3004 lock. if dlm_recoverd is calling this function to clear the
3005 requestqueue, it needs to be interrupted (-EINTR) if another
3006 recovery operation is starting. */
3007
3008 while (1) {
3009 if (dlm_locking_stopped(ls)) {
3010 if (!recovery)
3011 dlm_add_requestqueue(ls, nodeid, hd);
3012 error = -EINTR;
3013 goto out;
3014 }
3015
3016 if (lock_recovery_try(ls))
3017 break;
3018 schedule();
3019 }
3020
3021 switch (ms->m_type) {
3022
3023 /* messages sent to a master node */
3024
3025 case DLM_MSG_REQUEST:
3026 receive_request(ls, ms);
3027 break;
3028
3029 case DLM_MSG_CONVERT:
3030 receive_convert(ls, ms);
3031 break;
3032
3033 case DLM_MSG_UNLOCK:
3034 receive_unlock(ls, ms);
3035 break;
3036
3037 case DLM_MSG_CANCEL:
3038 receive_cancel(ls, ms);
3039 break;
3040
3041 /* messages sent from a master node (replies to above) */
3042
3043 case DLM_MSG_REQUEST_REPLY:
3044 receive_request_reply(ls, ms);
3045 break;
3046
3047 case DLM_MSG_CONVERT_REPLY:
3048 receive_convert_reply(ls, ms);
3049 break;
3050
3051 case DLM_MSG_UNLOCK_REPLY:
3052 receive_unlock_reply(ls, ms);
3053 break;
3054
3055 case DLM_MSG_CANCEL_REPLY:
3056 receive_cancel_reply(ls, ms);
3057 break;
3058
3059 /* messages sent from a master node (only two types of async msg) */
3060
3061 case DLM_MSG_GRANT:
3062 receive_grant(ls, ms);
3063 break;
3064
3065 case DLM_MSG_BAST:
3066 receive_bast(ls, ms);
3067 break;
3068
3069 /* messages sent to a dir node */
3070
3071 case DLM_MSG_LOOKUP:
3072 receive_lookup(ls, ms);
3073 break;
3074
3075 case DLM_MSG_REMOVE:
3076 receive_remove(ls, ms);
3077 break;
3078
3079 /* messages sent from a dir node (remove has no reply) */
3080
3081 case DLM_MSG_LOOKUP_REPLY:
3082 receive_lookup_reply(ls, ms);
3083 break;
3084
3085 default:
3086 log_error(ls, "unknown message type %d", ms->m_type);
3087 }
3088
3089 unlock_recovery(ls);
3090 out:
3091 dlm_put_lockspace(ls);
3092 dlm_astd_wake();
3093 return 0;
3094}
3095
3096
3097/*
3098 * Recovery related
3099 */
3100
3101static void recover_convert_waiter(struct dlm_ls *ls, struct dlm_lkb *lkb)
3102{
3103 if (middle_conversion(lkb)) {
3104 hold_lkb(lkb);
3105 ls->ls_stub_ms.m_result = -EINPROGRESS;
3106 _remove_from_waiters(lkb);
3107 _receive_convert_reply(lkb, &ls->ls_stub_ms);
3108
3109 /* Same special case as in receive_rcom_lock_args() */
3110 lkb->lkb_grmode = DLM_LOCK_IV;
3111 rsb_set_flag(lkb->lkb_resource, RSB_RECOVER_CONVERT);
3112 unhold_lkb(lkb);
3113
3114 } else if (lkb->lkb_rqmode >= lkb->lkb_grmode) {
3115 lkb->lkb_flags |= DLM_IFL_RESEND;
3116 }
3117
3118 /* lkb->lkb_rqmode < lkb->lkb_grmode shouldn't happen since down
3119 conversions are async; there's no reply from the remote master */
3120}
3121
3122/* A waiting lkb needs recovery if the master node has failed, or
3123 the master node is changing (only when no directory is used) */
3124
3125static int waiter_needs_recovery(struct dlm_ls *ls, struct dlm_lkb *lkb)
3126{
3127 if (dlm_is_removed(ls, lkb->lkb_nodeid))
3128 return 1;
3129
3130 if (!dlm_no_directory(ls))
3131 return 0;
3132
3133 if (dlm_dir_nodeid(lkb->lkb_resource) != lkb->lkb_nodeid)
3134 return 1;
3135
3136 return 0;
3137}
3138
3139/* Recovery for locks that are waiting for replies from nodes that are now
3140 gone. We can just complete unlocks and cancels by faking a reply from the
3141 dead node. Requests and up-conversions we flag to be resent after
3142 recovery. Down-conversions can just be completed with a fake reply like
3143 unlocks. Conversions between PR and CW need special attention. */
3144
3145void dlm_recover_waiters_pre(struct dlm_ls *ls)
3146{
3147 struct dlm_lkb *lkb, *safe;
3148
3149 mutex_lock(&ls->ls_waiters_mutex);
3150
3151 list_for_each_entry_safe(lkb, safe, &ls->ls_waiters, lkb_wait_reply) {
3152 log_debug(ls, "pre recover waiter lkid %x type %d flags %x",
3153 lkb->lkb_id, lkb->lkb_wait_type, lkb->lkb_flags);
3154
3155 /* all outstanding lookups, regardless of destination, will be
3156 resent after recovery is done */
3157
3158 if (lkb->lkb_wait_type == DLM_MSG_LOOKUP) {
3159 lkb->lkb_flags |= DLM_IFL_RESEND;
3160 continue;
3161 }
3162
3163 if (!waiter_needs_recovery(ls, lkb))
3164 continue;
3165
3166 switch (lkb->lkb_wait_type) {
3167
3168 case DLM_MSG_REQUEST:
3169 lkb->lkb_flags |= DLM_IFL_RESEND;
3170 break;
3171
3172 case DLM_MSG_CONVERT:
3173 recover_convert_waiter(ls, lkb);
3174 break;
3175
3176 case DLM_MSG_UNLOCK:
3177 hold_lkb(lkb);
3178 ls->ls_stub_ms.m_result = -DLM_EUNLOCK;
3179 _remove_from_waiters(lkb);
3180 _receive_unlock_reply(lkb, &ls->ls_stub_ms);
3181 dlm_put_lkb(lkb);
3182 break;
3183
3184 case DLM_MSG_CANCEL:
3185 hold_lkb(lkb);
3186 ls->ls_stub_ms.m_result = -DLM_ECANCEL;
3187 _remove_from_waiters(lkb);
3188 _receive_cancel_reply(lkb, &ls->ls_stub_ms);
3189 dlm_put_lkb(lkb);
3190 break;
3191
3192 default:
3193 log_error(ls, "invalid lkb wait_type %d",
3194 lkb->lkb_wait_type);
3195 }
3196 }
3197 mutex_unlock(&ls->ls_waiters_mutex);
3198}
3199
3200static int remove_resend_waiter(struct dlm_ls *ls, struct dlm_lkb **lkb_ret)
3201{
3202 struct dlm_lkb *lkb;
3203 int rv = 0;
3204
3205 mutex_lock(&ls->ls_waiters_mutex);
3206 list_for_each_entry(lkb, &ls->ls_waiters, lkb_wait_reply) {
3207 if (lkb->lkb_flags & DLM_IFL_RESEND) {
3208 rv = lkb->lkb_wait_type;
3209 _remove_from_waiters(lkb);
3210 lkb->lkb_flags &= ~DLM_IFL_RESEND;
3211 break;
3212 }
3213 }
3214 mutex_unlock(&ls->ls_waiters_mutex);
3215
3216 if (!rv)
3217 lkb = NULL;
3218 *lkb_ret = lkb;
3219 return rv;
3220}
3221
3222/* Deal with lookups and lkb's marked RESEND from _pre. We may now be the
3223 master or dir-node for r. Processing the lkb may result in it being placed
3224 back on waiters. */
3225
3226int dlm_recover_waiters_post(struct dlm_ls *ls)
3227{
3228 struct dlm_lkb *lkb;
3229 struct dlm_rsb *r;
3230 int error = 0, mstype;
3231
3232 while (1) {
3233 if (dlm_locking_stopped(ls)) {
3234 log_debug(ls, "recover_waiters_post aborted");
3235 error = -EINTR;
3236 break;
3237 }
3238
3239 mstype = remove_resend_waiter(ls, &lkb);
3240 if (!mstype)
3241 break;
3242
3243 r = lkb->lkb_resource;
3244
3245 log_debug(ls, "recover_waiters_post %x type %d flags %x %s",
3246 lkb->lkb_id, mstype, lkb->lkb_flags, r->res_name);
3247
3248 switch (mstype) {
3249
3250 case DLM_MSG_LOOKUP:
3251 hold_rsb(r);
3252 lock_rsb(r);
3253 _request_lock(r, lkb);
3254 if (is_master(r))
3255 confirm_master(r, 0);
3256 unlock_rsb(r);
3257 put_rsb(r);
3258 break;
3259
3260 case DLM_MSG_REQUEST:
3261 hold_rsb(r);
3262 lock_rsb(r);
3263 _request_lock(r, lkb);
3264 unlock_rsb(r);
3265 put_rsb(r);
3266 break;
3267
3268 case DLM_MSG_CONVERT:
3269 hold_rsb(r);
3270 lock_rsb(r);
3271 _convert_lock(r, lkb);
3272 unlock_rsb(r);
3273 put_rsb(r);
3274 break;
3275
3276 default:
3277 log_error(ls, "recover_waiters_post type %d", mstype);
3278 }
3279 }
3280
3281 return error;
3282}
3283
3284static void purge_queue(struct dlm_rsb *r, struct list_head *queue,
3285 int (*test)(struct dlm_ls *ls, struct dlm_lkb *lkb))
3286{
3287 struct dlm_ls *ls = r->res_ls;
3288 struct dlm_lkb *lkb, *safe;
3289
3290 list_for_each_entry_safe(lkb, safe, queue, lkb_statequeue) {
3291 if (test(ls, lkb)) {
3292 rsb_set_flag(r, RSB_LOCKS_PURGED);
3293 del_lkb(r, lkb);
3294 /* this put should free the lkb */
3295 if (!dlm_put_lkb(lkb))
3296 log_error(ls, "purged lkb not released");
3297 }
3298 }
3299}
3300
3301static int purge_dead_test(struct dlm_ls *ls, struct dlm_lkb *lkb)
3302{
3303 return (is_master_copy(lkb) && dlm_is_removed(ls, lkb->lkb_nodeid));
3304}
3305
3306static int purge_mstcpy_test(struct dlm_ls *ls, struct dlm_lkb *lkb)
3307{
3308 return is_master_copy(lkb);
3309}
3310
3311static void purge_dead_locks(struct dlm_rsb *r)
3312{
3313 purge_queue(r, &r->res_grantqueue, &purge_dead_test);
3314 purge_queue(r, &r->res_convertqueue, &purge_dead_test);
3315 purge_queue(r, &r->res_waitqueue, &purge_dead_test);
3316}
3317
3318void dlm_purge_mstcpy_locks(struct dlm_rsb *r)
3319{
3320 purge_queue(r, &r->res_grantqueue, &purge_mstcpy_test);
3321 purge_queue(r, &r->res_convertqueue, &purge_mstcpy_test);
3322 purge_queue(r, &r->res_waitqueue, &purge_mstcpy_test);
3323}
3324
3325/* Get rid of locks held by nodes that are gone. */
3326
3327int dlm_purge_locks(struct dlm_ls *ls)
3328{
3329 struct dlm_rsb *r;
3330
3331 log_debug(ls, "dlm_purge_locks");
3332
3333 down_write(&ls->ls_root_sem);
3334 list_for_each_entry(r, &ls->ls_root_list, res_root_list) {
3335 hold_rsb(r);
3336 lock_rsb(r);
3337 if (is_master(r))
3338 purge_dead_locks(r);
3339 unlock_rsb(r);
3340 unhold_rsb(r);
3341
3342 schedule();
3343 }
3344 up_write(&ls->ls_root_sem);
3345
3346 return 0;
3347}
3348
3349static struct dlm_rsb *find_purged_rsb(struct dlm_ls *ls, int bucket)
3350{
3351 struct dlm_rsb *r, *r_ret = NULL;
3352
3353 read_lock(&ls->ls_rsbtbl[bucket].lock);
3354 list_for_each_entry(r, &ls->ls_rsbtbl[bucket].list, res_hashchain) {
3355 if (!rsb_flag(r, RSB_LOCKS_PURGED))
3356 continue;
3357 hold_rsb(r);
3358 rsb_clear_flag(r, RSB_LOCKS_PURGED);
3359 r_ret = r;
3360 break;
3361 }
3362 read_unlock(&ls->ls_rsbtbl[bucket].lock);
3363 return r_ret;
3364}
3365
3366void dlm_grant_after_purge(struct dlm_ls *ls)
3367{
3368 struct dlm_rsb *r;
3369 int i;
3370
3371 for (i = 0; i < ls->ls_rsbtbl_size; i++) {
3372 r = find_purged_rsb(ls, i);
3373 if (!r)
3374 continue;
3375 lock_rsb(r);
3376 if (is_master(r)) {
3377 grant_pending_locks(r);
3378 confirm_master(r, 0);
3379 }
3380 unlock_rsb(r);
3381 put_rsb(r);
3382 }
3383}
3384
3385static struct dlm_lkb *search_remid_list(struct list_head *head, int nodeid,
3386 uint32_t remid)
3387{
3388 struct dlm_lkb *lkb;
3389
3390 list_for_each_entry(lkb, head, lkb_statequeue) {
3391 if (lkb->lkb_nodeid == nodeid && lkb->lkb_remid == remid)
3392 return lkb;
3393 }
3394 return NULL;
3395}
3396
3397static struct dlm_lkb *search_remid(struct dlm_rsb *r, int nodeid,
3398 uint32_t remid)
3399{
3400 struct dlm_lkb *lkb;
3401
3402 lkb = search_remid_list(&r->res_grantqueue, nodeid, remid);
3403 if (lkb)
3404 return lkb;
3405 lkb = search_remid_list(&r->res_convertqueue, nodeid, remid);
3406 if (lkb)
3407 return lkb;
3408 lkb = search_remid_list(&r->res_waitqueue, nodeid, remid);
3409 if (lkb)
3410 return lkb;
3411 return NULL;
3412}
3413
3414static int receive_rcom_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
3415 struct dlm_rsb *r, struct dlm_rcom *rc)
3416{
3417 struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf;
3418 int lvblen;
3419
3420 lkb->lkb_nodeid = rc->rc_header.h_nodeid;
3421 lkb->lkb_ownpid = rl->rl_ownpid;
3422 lkb->lkb_remid = rl->rl_lkid;
3423 lkb->lkb_exflags = rl->rl_exflags;
3424 lkb->lkb_flags = rl->rl_flags & 0x0000FFFF;
3425 lkb->lkb_flags |= DLM_IFL_MSTCPY;
3426 lkb->lkb_lvbseq = rl->rl_lvbseq;
3427 lkb->lkb_rqmode = rl->rl_rqmode;
3428 lkb->lkb_grmode = rl->rl_grmode;
3429 /* don't set lkb_status because add_lkb wants to set it itself */
3430
3431 lkb->lkb_bastaddr = (void *) (long) (rl->rl_asts & AST_BAST);
3432 lkb->lkb_astaddr = (void *) (long) (rl->rl_asts & AST_COMP);
3433
3434 if (lkb->lkb_exflags & DLM_LKF_VALBLK) {
3435 lkb->lkb_lvbptr = allocate_lvb(ls);
3436 if (!lkb->lkb_lvbptr)
3437 return -ENOMEM;
3438 lvblen = rc->rc_header.h_length - sizeof(struct dlm_rcom) -
3439 sizeof(struct rcom_lock);
3440 memcpy(lkb->lkb_lvbptr, rl->rl_lvb, lvblen);
3441 }
3442
3443 /* Conversions between PR and CW (middle modes) need special handling.
3444 The real granted mode of these converting locks cannot be determined
3445 until all locks have been rebuilt on the rsb (recover_conversion) */
3446
3447 if (rl->rl_wait_type == DLM_MSG_CONVERT && middle_conversion(lkb)) {
3448 rl->rl_status = DLM_LKSTS_CONVERT;
3449 lkb->lkb_grmode = DLM_LOCK_IV;
3450 rsb_set_flag(r, RSB_RECOVER_CONVERT);
3451 }
3452
3453 return 0;
3454}
3455
3456/* This lkb may have been recovered in a previous aborted recovery so we need
3457 to check if the rsb already has an lkb with the given remote nodeid/lkid.
3458 If so we just send back a standard reply. If not, we create a new lkb with
3459 the given values and send back our lkid. We send back our lkid by sending
3460 back the rcom_lock struct we got but with the remid field filled in. */
3461
3462int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc)
3463{
3464 struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf;
3465 struct dlm_rsb *r;
3466 struct dlm_lkb *lkb;
3467 int error;
3468
3469 if (rl->rl_parent_lkid) {
3470 error = -EOPNOTSUPP;
3471 goto out;
3472 }
3473
3474 error = find_rsb(ls, rl->rl_name, rl->rl_namelen, R_MASTER, &r);
3475 if (error)
3476 goto out;
3477
3478 lock_rsb(r);
3479
3480 lkb = search_remid(r, rc->rc_header.h_nodeid, rl->rl_lkid);
3481 if (lkb) {
3482 error = -EEXIST;
3483 goto out_remid;
3484 }
3485
3486 error = create_lkb(ls, &lkb);
3487 if (error)
3488 goto out_unlock;
3489
3490 error = receive_rcom_lock_args(ls, lkb, r, rc);
3491 if (error) {
3492 __put_lkb(ls, lkb);
3493 goto out_unlock;
3494 }
3495
3496 attach_lkb(r, lkb);
3497 add_lkb(r, lkb, rl->rl_status);
3498 error = 0;
3499
3500 out_remid:
3501 /* this is the new value returned to the lock holder for
3502 saving in its process-copy lkb */
3503 rl->rl_remid = lkb->lkb_id;
3504
3505 out_unlock:
3506 unlock_rsb(r);
3507 put_rsb(r);
3508 out:
3509 if (error)
3510 log_print("recover_master_copy %d %x", error, rl->rl_lkid);
3511 rl->rl_result = error;
3512 return error;
3513}
3514
3515int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc)
3516{
3517 struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf;
3518 struct dlm_rsb *r;
3519 struct dlm_lkb *lkb;
3520 int error;
3521
3522 error = find_lkb(ls, rl->rl_lkid, &lkb);
3523 if (error) {
3524 log_error(ls, "recover_process_copy no lkid %x", rl->rl_lkid);
3525 return error;
3526 }
3527
3528 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
3529
3530 error = rl->rl_result;
3531
3532 r = lkb->lkb_resource;
3533 hold_rsb(r);
3534 lock_rsb(r);
3535
3536 switch (error) {
3537 case -EEXIST:
3538 log_debug(ls, "master copy exists %x", lkb->lkb_id);
3539 /* fall through */
3540 case 0:
3541 lkb->lkb_remid = rl->rl_remid;
3542 break;
3543 default:
3544 log_error(ls, "dlm_recover_process_copy unknown error %d %x",
3545 error, lkb->lkb_id);
3546 }
3547
3548 /* an ack for dlm_recover_locks() which waits for replies from
3549 all the locks it sends to new masters */
3550 dlm_recovered_lock(r);
3551
3552 unlock_rsb(r);
3553 put_rsb(r);
3554 dlm_put_lkb(lkb);
3555
3556 return 0;
3557}
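
/* Worked example, added for clarity and not part of the original source
 * (the lock ids are hypothetical): if this node holds a process-copy lkb
 * with lkb_id 0x10001 whose old master has died, dlm_recover_locks() sends
 * it to the new master; dlm_recover_master_copy() there creates a
 * master-copy lkb, say 0x20002, and returns it in rl_remid.
 * dlm_recover_process_copy() above then stores 0x20002 in our lkb_remid so
 * that later messages for this lock address the correct remote lkb. */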
3558
3559int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua,
3560 int mode, uint32_t flags, void *name, unsigned int namelen,
3561 uint32_t parent_lkid)
3562{
3563 struct dlm_lkb *lkb;
3564 struct dlm_args args;
3565 int error;
3566
3567 lock_recovery(ls);
3568
3569 error = create_lkb(ls, &lkb);
3570 if (error) {
3571 kfree(ua);
3572 goto out;
3573 }
3574
3575 if (flags & DLM_LKF_VALBLK) {
3576 ua->lksb.sb_lvbptr = kmalloc(DLM_USER_LVB_LEN, GFP_KERNEL);
3577 if (!ua->lksb.sb_lvbptr) {
3578 kfree(ua);
3579 __put_lkb(ls, lkb);
3580 error = -ENOMEM;
3581 goto out;
3582 }
3583 }
3584
3585 /* After ua is attached to lkb it will be freed by free_lkb().
3586 When DLM_IFL_USER is set, the dlm knows that this is a userspace
3587 lock and that lkb_astparam is the dlm_user_args structure. */
3588
3589 error = set_lock_args(mode, &ua->lksb, flags, namelen, parent_lkid,
3590 FAKE_USER_AST, ua, FAKE_USER_AST, &args);
3591 lkb->lkb_flags |= DLM_IFL_USER;
3592 ua->old_mode = DLM_LOCK_IV;
3593
3594 if (error) {
3595 __put_lkb(ls, lkb);
3596 goto out;
3597 }
3598
3599 error = request_lock(ls, lkb, name, namelen, &args);
3600
3601 switch (error) {
3602 case 0:
3603 break;
3604 case -EINPROGRESS:
3605 error = 0;
3606 break;
3607 case -EAGAIN:
3608 error = 0;
3609 /* fall through */
3610 default:
3611 __put_lkb(ls, lkb);
3612 goto out;
3613 }
3614
3615 /* add this new lkb to the per-process list of locks */
3616 spin_lock(&ua->proc->locks_spin);
3617 kref_get(&lkb->lkb_ref);
3618 list_add_tail(&lkb->lkb_ownqueue, &ua->proc->locks);
3619 spin_unlock(&ua->proc->locks_spin);
3620 out:
3621 unlock_recovery(ls);
3622 return error;
3623}
3624
3625int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
3626 int mode, uint32_t flags, uint32_t lkid, char *lvb_in)
3627{
3628 struct dlm_lkb *lkb;
3629 struct dlm_args args;
3630 struct dlm_user_args *ua;
3631 int error;
3632
3633 lock_recovery(ls);
3634
3635 error = find_lkb(ls, lkid, &lkb);
3636 if (error)
3637 goto out;
3638
3639 /* user can change the params on its lock when it converts it, or
3640 add an lvb that didn't exist before */
3641
3642 ua = (struct dlm_user_args *)lkb->lkb_astparam;
3643
3644 if (flags & DLM_LKF_VALBLK && !ua->lksb.sb_lvbptr) {
3645 ua->lksb.sb_lvbptr = kmalloc(DLM_USER_LVB_LEN, GFP_KERNEL);
3646 if (!ua->lksb.sb_lvbptr) {
3647 error = -ENOMEM;
3648 goto out_put;
3649 }
3650 }
3651 if (lvb_in && ua->lksb.sb_lvbptr)
3652 memcpy(ua->lksb.sb_lvbptr, lvb_in, DLM_USER_LVB_LEN);
3653
3654 ua->castparam = ua_tmp->castparam;
3655 ua->castaddr = ua_tmp->castaddr;
3656 ua->bastparam = ua_tmp->bastparam;
3657 ua->bastaddr = ua_tmp->bastaddr;
3658 ua->old_mode = lkb->lkb_grmode;
3659
3660 error = set_lock_args(mode, &ua->lksb, flags, 0, 0, FAKE_USER_AST, ua,
3661 FAKE_USER_AST, &args);
3662 if (error)
3663 goto out_put;
3664
3665 error = convert_lock(ls, lkb, &args);
3666
3667 if (error == -EINPROGRESS || error == -EAGAIN)
3668 error = 0;
3669 out_put:
3670 dlm_put_lkb(lkb);
3671 out:
3672 unlock_recovery(ls);
3673 kfree(ua_tmp);
3674 return error;
3675}
3676
3677int dlm_user_unlock(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
3678 uint32_t flags, uint32_t lkid, char *lvb_in)
3679{
3680 struct dlm_lkb *lkb;
3681 struct dlm_args args;
3682 struct dlm_user_args *ua;
3683 int error;
3684
3685 lock_recovery(ls);
3686
3687 error = find_lkb(ls, lkid, &lkb);
3688 if (error)
3689 goto out;
3690
3691 ua = (struct dlm_user_args *)lkb->lkb_astparam;
3692
3693 if (lvb_in && ua->lksb.sb_lvbptr)
3694 memcpy(ua->lksb.sb_lvbptr, lvb_in, DLM_USER_LVB_LEN);
3695 ua->castparam = ua_tmp->castparam;
3696
3697 error = set_unlock_args(flags, ua, &args);
3698 if (error)
3699 goto out_put;
3700
3701 error = unlock_lock(ls, lkb, &args);
3702
3703 if (error == -DLM_EUNLOCK)
3704 error = 0;
3705 if (error)
3706 goto out_put;
3707
3708 spin_lock(&ua->proc->locks_spin);
3709 list_del(&lkb->lkb_ownqueue);
3710 spin_unlock(&ua->proc->locks_spin);
3711
3712 /* this removes the reference for the proc->locks list added by
3713 dlm_user_request */
3714 unhold_lkb(lkb);
3715 out_put:
3716 dlm_put_lkb(lkb);
3717 out:
3718 unlock_recovery(ls);
3719 return error;
3720}
3721
3722int dlm_user_cancel(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
3723 uint32_t flags, uint32_t lkid)
3724{
3725 struct dlm_lkb *lkb;
3726 struct dlm_args args;
3727 struct dlm_user_args *ua;
3728 int error;
3729
3730 lock_recovery(ls);
3731
3732 error = find_lkb(ls, lkid, &lkb);
3733 if (error)
3734 goto out;
3735
3736 ua = (struct dlm_user_args *)lkb->lkb_astparam;
3737 ua->castparam = ua_tmp->castparam;
3738
3739 error = set_unlock_args(flags, ua, &args);
3740 if (error)
3741 goto out_put;
3742
3743 error = cancel_lock(ls, lkb, &args);
3744
3745 if (error == -DLM_ECANCEL)
3746 error = 0;
3747 if (error)
3748 goto out_put;
3749
3750 /* this lkb was removed from the WAITING queue */
3751 if (lkb->lkb_grmode == DLM_LOCK_IV) {
3752 spin_lock(&ua->proc->locks_spin);
3753 list_del(&lkb->lkb_ownqueue);
3754 spin_unlock(&ua->proc->locks_spin);
3755 unhold_lkb(lkb);
3756 }
3757 out_put:
3758 dlm_put_lkb(lkb);
3759 out:
3760 unlock_recovery(ls);
3761 return error;
3762}
3763
3764static int orphan_proc_lock(struct dlm_ls *ls, struct dlm_lkb *lkb)
3765{
3766 struct dlm_user_args *ua = (struct dlm_user_args *)lkb->lkb_astparam;
3767
3768 if (ua->lksb.sb_lvbptr)
3769 kfree(ua->lksb.sb_lvbptr);
3770 kfree(ua);
3771 lkb->lkb_astparam = (long)NULL;
3772
3773	/* TODO: propagate to master if needed */
3774 return 0;
3775}
3776
3777/* The force flag allows the unlock to go ahead even if the lkb isn't granted.
3778 Regardless of what rsb queue the lock is on, it's removed and freed. */
3779
3780static int unlock_proc_lock(struct dlm_ls *ls, struct dlm_lkb *lkb)
3781{
3782 struct dlm_user_args *ua = (struct dlm_user_args *)lkb->lkb_astparam;
3783 struct dlm_args args;
3784 int error;
3785
3786 /* FIXME: we need to handle the case where the lkb is in limbo
3787 while the rsb is being looked up, currently we assert in
3788 _unlock_lock/is_remote because rsb nodeid is -1. */
3789
3790 set_unlock_args(DLM_LKF_FORCEUNLOCK, ua, &args);
3791
3792 error = unlock_lock(ls, lkb, &args);
3793 if (error == -DLM_EUNLOCK)
3794 error = 0;
3795 return error;
3796}
3797
3798/* The ls_clear_proc_locks mutex protects against dlm_user_add_asts() which
3799 1) references lkb->ua which we free here and 2) adds lkbs to proc->asts,
3800 which we clear here. */
3801
3802/* proc CLOSING flag is set so no more device_reads should look at proc->asts
3803 list, and no more device_writes should add lkb's to proc->locks list; so we
3804   shouldn't need to take asts_spin or locks_spin here. This assumes that
3805 device reads/writes/closes are serialized -- FIXME: we may need to serialize
3806   them ourselves. */
3807
3808void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc)
3809{
3810 struct dlm_lkb *lkb, *safe;
3811
3812 lock_recovery(ls);
3813 mutex_lock(&ls->ls_clear_proc_locks);
3814
3815 list_for_each_entry_safe(lkb, safe, &proc->locks, lkb_ownqueue) {
3816 if (lkb->lkb_ast_type) {
3817 list_del(&lkb->lkb_astqueue);
3818 unhold_lkb(lkb);
3819 }
3820
3821 list_del(&lkb->lkb_ownqueue);
3822
3823 if (lkb->lkb_exflags & DLM_LKF_PERSISTENT) {
3824 lkb->lkb_flags |= DLM_IFL_ORPHAN;
3825 orphan_proc_lock(ls, lkb);
3826 } else {
3827 lkb->lkb_flags |= DLM_IFL_DEAD;
3828 unlock_proc_lock(ls, lkb);
3829 }
3830
3831 /* this removes the reference for the proc->locks list
3832		   added by dlm_user_request; it may result in the lkb
3833 being freed */
3834
3835 dlm_put_lkb(lkb);
3836 }
3837 mutex_unlock(&ls->ls_clear_proc_locks);
3838 unlock_recovery(ls);
3839}
diff --git a/fs/dlm/lock.h b/fs/dlm/lock.h
new file mode 100644
index 000000000000..8d2660f0ab10
--- /dev/null
+++ b/fs/dlm/lock.h
@@ -0,0 +1,61 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13#ifndef __LOCK_DOT_H__
14#define __LOCK_DOT_H__
15
16void dlm_print_rsb(struct dlm_rsb *r);
17void dlm_print_lkb(struct dlm_lkb *lkb);
18int dlm_receive_message(struct dlm_header *hd, int nodeid, int recovery);
19int dlm_modes_compat(int mode1, int mode2);
20int dlm_find_rsb(struct dlm_ls *ls, char *name, int namelen,
21 unsigned int flags, struct dlm_rsb **r_ret);
22void dlm_put_rsb(struct dlm_rsb *r);
23void dlm_hold_rsb(struct dlm_rsb *r);
24int dlm_put_lkb(struct dlm_lkb *lkb);
25void dlm_scan_rsbs(struct dlm_ls *ls);
26
27int dlm_purge_locks(struct dlm_ls *ls);
28void dlm_purge_mstcpy_locks(struct dlm_rsb *r);
29void dlm_grant_after_purge(struct dlm_ls *ls);
30int dlm_recover_waiters_post(struct dlm_ls *ls);
31void dlm_recover_waiters_pre(struct dlm_ls *ls);
32int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc);
33int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc);
34
35int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, int mode,
36 uint32_t flags, void *name, unsigned int namelen, uint32_t parent_lkid);
37int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
38 int mode, uint32_t flags, uint32_t lkid, char *lvb_in);
39int dlm_user_unlock(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
40 uint32_t flags, uint32_t lkid, char *lvb_in);
41int dlm_user_cancel(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
42 uint32_t flags, uint32_t lkid);
43void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc);
44
45static inline int is_master(struct dlm_rsb *r)
46{
47 return !r->res_nodeid;
48}
49
50static inline void lock_rsb(struct dlm_rsb *r)
51{
52 mutex_lock(&r->res_mutex);
53}
54
55static inline void unlock_rsb(struct dlm_rsb *r)
56{
57 mutex_unlock(&r->res_mutex);
58}
59
60#endif
61
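
The inline helpers above (is_master(), lock_rsb(), unlock_rsb()) pair with the hold/put reference calls used throughout lock.c, for example in dlm_recover_process_copy() earlier in this patch. A minimal sketch of that discipline follows; it is illustrative only, the function name and field update are hypothetical, and hold_rsb()/put_rsb() are lock.c-internal names (the exported equivalents are dlm_hold_rsb()/dlm_put_rsb()).

static void example_rsb_update(struct dlm_lkb *lkb, uint32_t new_remid)
{
	struct dlm_rsb *r = lkb->lkb_resource;

	hold_rsb(r);			/* pin the rsb before taking its mutex */
	lock_rsb(r);			/* serialize with other lock operations */

	lkb->lkb_remid = new_remid;	/* mutate lkb/rsb state under res_mutex */

	unlock_rsb(r);
	put_rsb(r);			/* drop the reference taken by hold_rsb() */
}
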
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
new file mode 100644
index 000000000000..3f6cb422ac4b
--- /dev/null
+++ b/fs/dlm/lockspace.c
@@ -0,0 +1,704 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#include "dlm_internal.h"
15#include "lockspace.h"
16#include "member.h"
17#include "recoverd.h"
18#include "ast.h"
19#include "dir.h"
20#include "lowcomms.h"
21#include "config.h"
22#include "memory.h"
23#include "lock.h"
24#include "recover.h"
25
26#ifdef CONFIG_DLM_DEBUG
27int dlm_create_debug_file(struct dlm_ls *ls);
28void dlm_delete_debug_file(struct dlm_ls *ls);
29#else
30static inline int dlm_create_debug_file(struct dlm_ls *ls) { return 0; }
31static inline void dlm_delete_debug_file(struct dlm_ls *ls) { }
32#endif
33
34static int ls_count;
35static struct mutex ls_lock;
36static struct list_head lslist;
37static spinlock_t lslist_lock;
38static struct task_struct * scand_task;
39
40
41static ssize_t dlm_control_store(struct dlm_ls *ls, const char *buf, size_t len)
42{
43 ssize_t ret = len;
44 int n = simple_strtol(buf, NULL, 0);
45
46 switch (n) {
47 case 0:
48 dlm_ls_stop(ls);
49 break;
50 case 1:
51 dlm_ls_start(ls);
52 break;
53 default:
54 ret = -EINVAL;
55 }
56 return ret;
57}
58
59static ssize_t dlm_event_store(struct dlm_ls *ls, const char *buf, size_t len)
60{
61 ls->ls_uevent_result = simple_strtol(buf, NULL, 0);
62 set_bit(LSFL_UEVENT_WAIT, &ls->ls_flags);
63 wake_up(&ls->ls_uevent_wait);
64 return len;
65}
66
67static ssize_t dlm_id_show(struct dlm_ls *ls, char *buf)
68{
69 return sprintf(buf, "%u\n", ls->ls_global_id);
70}
71
72static ssize_t dlm_id_store(struct dlm_ls *ls, const char *buf, size_t len)
73{
74 ls->ls_global_id = simple_strtoul(buf, NULL, 0);
75 return len;
76}
77
78static ssize_t dlm_recover_status_show(struct dlm_ls *ls, char *buf)
79{
80 uint32_t status = dlm_recover_status(ls);
81 return sprintf(buf, "%x\n", status);
82}
83
84struct dlm_attr {
85 struct attribute attr;
86 ssize_t (*show)(struct dlm_ls *, char *);
87 ssize_t (*store)(struct dlm_ls *, const char *, size_t);
88};
89
90static struct dlm_attr dlm_attr_control = {
91 .attr = {.name = "control", .mode = S_IWUSR},
92 .store = dlm_control_store
93};
94
95static struct dlm_attr dlm_attr_event = {
96 .attr = {.name = "event_done", .mode = S_IWUSR},
97 .store = dlm_event_store
98};
99
100static struct dlm_attr dlm_attr_id = {
101 .attr = {.name = "id", .mode = S_IRUGO | S_IWUSR},
102 .show = dlm_id_show,
103 .store = dlm_id_store
104};
105
106static struct dlm_attr dlm_attr_recover_status = {
107 .attr = {.name = "recover_status", .mode = S_IRUGO},
108 .show = dlm_recover_status_show
109};
110
111static struct attribute *dlm_attrs[] = {
112 &dlm_attr_control.attr,
113 &dlm_attr_event.attr,
114 &dlm_attr_id.attr,
115 &dlm_attr_recover_status.attr,
116 NULL,
117};
118
119static ssize_t dlm_attr_show(struct kobject *kobj, struct attribute *attr,
120 char *buf)
121{
122 struct dlm_ls *ls = container_of(kobj, struct dlm_ls, ls_kobj);
123 struct dlm_attr *a = container_of(attr, struct dlm_attr, attr);
124 return a->show ? a->show(ls, buf) : 0;
125}
126
127static ssize_t dlm_attr_store(struct kobject *kobj, struct attribute *attr,
128 const char *buf, size_t len)
129{
130 struct dlm_ls *ls = container_of(kobj, struct dlm_ls, ls_kobj);
131 struct dlm_attr *a = container_of(attr, struct dlm_attr, attr);
132 return a->store ? a->store(ls, buf, len) : len;
133}
134
135static struct sysfs_ops dlm_attr_ops = {
136 .show = dlm_attr_show,
137 .store = dlm_attr_store,
138};
139
140static struct kobj_type dlm_ktype = {
141 .default_attrs = dlm_attrs,
142 .sysfs_ops = &dlm_attr_ops,
143};
144
145static struct kset dlm_kset = {
146 .subsys = &kernel_subsys,
147 .kobj = {.name = "dlm",},
148 .ktype = &dlm_ktype,
149};
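
The attribute/kset plumbing above is what userspace drives during recovery: writing 0 or 1 to the per-lockspace control file calls dlm_control_store(), which stops or starts the lockspace. A hedged userspace sketch follows; the /sys/kernel/dlm/<name>/ path is inferred from the kset registration above (name "dlm" under kernel_subsys) and is an assumption, not something stated in this patch.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

/* Write "0" (stop) or "1" (start) to a lockspace control file; the sysfs
 * path is inferred from the kset above, not quoted from this patch. */
static int example_control_write(const char *lockspace, const char *val)
{
	char path[256];
	int fd, rv;

	snprintf(path, sizeof(path), "/sys/kernel/dlm/%s/control", lockspace);
	fd = open(path, O_WRONLY);
	if (fd < 0)
		return -1;
	rv = write(fd, val, strlen(val));
	close(fd);
	return rv < 0 ? -1 : 0;
}
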
150
151static int kobject_setup(struct dlm_ls *ls)
152{
153 char lsname[DLM_LOCKSPACE_LEN];
154 int error;
155
156 memset(lsname, 0, DLM_LOCKSPACE_LEN);
157 snprintf(lsname, DLM_LOCKSPACE_LEN, "%s", ls->ls_name);
158
159 error = kobject_set_name(&ls->ls_kobj, "%s", lsname);
160 if (error)
161 return error;
162
163 ls->ls_kobj.kset = &dlm_kset;
164 ls->ls_kobj.ktype = &dlm_ktype;
165 return 0;
166}
167
168static int do_uevent(struct dlm_ls *ls, int in)
169{
170 int error;
171
172 if (in)
173 kobject_uevent(&ls->ls_kobj, KOBJ_ONLINE);
174 else
175 kobject_uevent(&ls->ls_kobj, KOBJ_OFFLINE);
176
177 error = wait_event_interruptible(ls->ls_uevent_wait,
178 test_and_clear_bit(LSFL_UEVENT_WAIT, &ls->ls_flags));
179 if (error)
180 goto out;
181
182 error = ls->ls_uevent_result;
183 out:
184 return error;
185}
186
187
188int dlm_lockspace_init(void)
189{
190 int error;
191
192 ls_count = 0;
193 mutex_init(&ls_lock);
194 INIT_LIST_HEAD(&lslist);
195 spin_lock_init(&lslist_lock);
196
197 error = kset_register(&dlm_kset);
198 if (error)
199 printk("dlm_lockspace_init: cannot register kset %d\n", error);
200 return error;
201}
202
203void dlm_lockspace_exit(void)
204{
205 kset_unregister(&dlm_kset);
206}
207
208static int dlm_scand(void *data)
209{
210 struct dlm_ls *ls;
211
212 while (!kthread_should_stop()) {
213 list_for_each_entry(ls, &lslist, ls_list)
214 dlm_scan_rsbs(ls);
215 schedule_timeout_interruptible(dlm_config.scan_secs * HZ);
216 }
217 return 0;
218}
219
220static int dlm_scand_start(void)
221{
222 struct task_struct *p;
223 int error = 0;
224
225 p = kthread_run(dlm_scand, NULL, "dlm_scand");
226 if (IS_ERR(p))
227 error = PTR_ERR(p);
228 else
229 scand_task = p;
230 return error;
231}
232
233static void dlm_scand_stop(void)
234{
235 kthread_stop(scand_task);
236}
237
238static struct dlm_ls *dlm_find_lockspace_name(char *name, int namelen)
239{
240 struct dlm_ls *ls;
241
242 spin_lock(&lslist_lock);
243
244 list_for_each_entry(ls, &lslist, ls_list) {
245 if (ls->ls_namelen == namelen &&
246 memcmp(ls->ls_name, name, namelen) == 0)
247 goto out;
248 }
249 ls = NULL;
250 out:
251 spin_unlock(&lslist_lock);
252 return ls;
253}
254
255struct dlm_ls *dlm_find_lockspace_global(uint32_t id)
256{
257 struct dlm_ls *ls;
258
259 spin_lock(&lslist_lock);
260
261 list_for_each_entry(ls, &lslist, ls_list) {
262 if (ls->ls_global_id == id) {
263 ls->ls_count++;
264 goto out;
265 }
266 }
267 ls = NULL;
268 out:
269 spin_unlock(&lslist_lock);
270 return ls;
271}
272
273struct dlm_ls *dlm_find_lockspace_local(dlm_lockspace_t *lockspace)
274{
275 struct dlm_ls *ls;
276
277 spin_lock(&lslist_lock);
278 list_for_each_entry(ls, &lslist, ls_list) {
279 if (ls->ls_local_handle == lockspace) {
280 ls->ls_count++;
281 goto out;
282 }
283 }
284 ls = NULL;
285 out:
286 spin_unlock(&lslist_lock);
287 return ls;
288}
289
290struct dlm_ls *dlm_find_lockspace_device(int minor)
291{
292 struct dlm_ls *ls;
293
294 spin_lock(&lslist_lock);
295 list_for_each_entry(ls, &lslist, ls_list) {
296 if (ls->ls_device.minor == minor) {
297 ls->ls_count++;
298 goto out;
299 }
300 }
301 ls = NULL;
302 out:
303 spin_unlock(&lslist_lock);
304 return ls;
305}
306
307void dlm_put_lockspace(struct dlm_ls *ls)
308{
309 spin_lock(&lslist_lock);
310 ls->ls_count--;
311 spin_unlock(&lslist_lock);
312}
313
314static void remove_lockspace(struct dlm_ls *ls)
315{
316 for (;;) {
317 spin_lock(&lslist_lock);
318 if (ls->ls_count == 0) {
319 list_del(&ls->ls_list);
320 spin_unlock(&lslist_lock);
321 return;
322 }
323 spin_unlock(&lslist_lock);
324 ssleep(1);
325 }
326}
327
328static int threads_start(void)
329{
330 int error;
331
332	/* Thread which processes lock requests for all lockspaces */
333 error = dlm_astd_start();
334 if (error) {
335 log_print("cannot start dlm_astd thread %d", error);
336 goto fail;
337 }
338
339 error = dlm_scand_start();
340 if (error) {
341 log_print("cannot start dlm_scand thread %d", error);
342 goto astd_fail;
343 }
344
345	/* Thread for sending/receiving messages for all lockspaces */
346 error = dlm_lowcomms_start();
347 if (error) {
348 log_print("cannot start dlm lowcomms %d", error);
349 goto scand_fail;
350 }
351
352 return 0;
353
354 scand_fail:
355 dlm_scand_stop();
356 astd_fail:
357 dlm_astd_stop();
358 fail:
359 return error;
360}
361
362static void threads_stop(void)
363{
364 dlm_scand_stop();
365 dlm_lowcomms_stop();
366 dlm_astd_stop();
367}
368
369static int new_lockspace(char *name, int namelen, void **lockspace,
370 uint32_t flags, int lvblen)
371{
372 struct dlm_ls *ls;
373 int i, size, error = -ENOMEM;
374
375 if (namelen > DLM_LOCKSPACE_LEN)
376 return -EINVAL;
377
378 if (!lvblen || (lvblen % 8))
379 return -EINVAL;
380
381 if (!try_module_get(THIS_MODULE))
382 return -EINVAL;
383
384 ls = dlm_find_lockspace_name(name, namelen);
385 if (ls) {
386 *lockspace = ls;
387 module_put(THIS_MODULE);
388 return -EEXIST;
389 }
390
391 ls = kzalloc(sizeof(struct dlm_ls) + namelen, GFP_KERNEL);
392 if (!ls)
393 goto out;
394 memcpy(ls->ls_name, name, namelen);
395 ls->ls_namelen = namelen;
396 ls->ls_exflags = flags;
397 ls->ls_lvblen = lvblen;
398 ls->ls_count = 0;
399 ls->ls_flags = 0;
400
401 size = dlm_config.rsbtbl_size;
402 ls->ls_rsbtbl_size = size;
403
404 ls->ls_rsbtbl = kmalloc(sizeof(struct dlm_rsbtable) * size, GFP_KERNEL);
405 if (!ls->ls_rsbtbl)
406 goto out_lsfree;
407 for (i = 0; i < size; i++) {
408 INIT_LIST_HEAD(&ls->ls_rsbtbl[i].list);
409 INIT_LIST_HEAD(&ls->ls_rsbtbl[i].toss);
410 rwlock_init(&ls->ls_rsbtbl[i].lock);
411 }
412
413 size = dlm_config.lkbtbl_size;
414 ls->ls_lkbtbl_size = size;
415
416 ls->ls_lkbtbl = kmalloc(sizeof(struct dlm_lkbtable) * size, GFP_KERNEL);
417 if (!ls->ls_lkbtbl)
418 goto out_rsbfree;
419 for (i = 0; i < size; i++) {
420 INIT_LIST_HEAD(&ls->ls_lkbtbl[i].list);
421 rwlock_init(&ls->ls_lkbtbl[i].lock);
422 ls->ls_lkbtbl[i].counter = 1;
423 }
424
425 size = dlm_config.dirtbl_size;
426 ls->ls_dirtbl_size = size;
427
428 ls->ls_dirtbl = kmalloc(sizeof(struct dlm_dirtable) * size, GFP_KERNEL);
429 if (!ls->ls_dirtbl)
430 goto out_lkbfree;
431 for (i = 0; i < size; i++) {
432 INIT_LIST_HEAD(&ls->ls_dirtbl[i].list);
433 rwlock_init(&ls->ls_dirtbl[i].lock);
434 }
435
436 INIT_LIST_HEAD(&ls->ls_waiters);
437 mutex_init(&ls->ls_waiters_mutex);
438
439 INIT_LIST_HEAD(&ls->ls_nodes);
440 INIT_LIST_HEAD(&ls->ls_nodes_gone);
441 ls->ls_num_nodes = 0;
442 ls->ls_low_nodeid = 0;
443 ls->ls_total_weight = 0;
444 ls->ls_node_array = NULL;
445
446 memset(&ls->ls_stub_rsb, 0, sizeof(struct dlm_rsb));
447 ls->ls_stub_rsb.res_ls = ls;
448
449 ls->ls_debug_dentry = NULL;
450
451 init_waitqueue_head(&ls->ls_uevent_wait);
452 ls->ls_uevent_result = 0;
453
454 ls->ls_recoverd_task = NULL;
455 mutex_init(&ls->ls_recoverd_active);
456 spin_lock_init(&ls->ls_recover_lock);
457 ls->ls_recover_status = 0;
458 ls->ls_recover_seq = 0;
459 ls->ls_recover_args = NULL;
460 init_rwsem(&ls->ls_in_recovery);
461 INIT_LIST_HEAD(&ls->ls_requestqueue);
462 mutex_init(&ls->ls_requestqueue_mutex);
463 mutex_init(&ls->ls_clear_proc_locks);
464
465 ls->ls_recover_buf = kmalloc(dlm_config.buffer_size, GFP_KERNEL);
466 if (!ls->ls_recover_buf)
467 goto out_dirfree;
468
469 INIT_LIST_HEAD(&ls->ls_recover_list);
470 spin_lock_init(&ls->ls_recover_list_lock);
471 ls->ls_recover_list_count = 0;
472 ls->ls_local_handle = ls;
473 init_waitqueue_head(&ls->ls_wait_general);
474 INIT_LIST_HEAD(&ls->ls_root_list);
475 init_rwsem(&ls->ls_root_sem);
476
477 down_write(&ls->ls_in_recovery);
478
479 error = dlm_recoverd_start(ls);
480 if (error) {
481 log_error(ls, "can't start dlm_recoverd %d", error);
482 goto out_rcomfree;
483 }
484
485 spin_lock(&lslist_lock);
486 list_add(&ls->ls_list, &lslist);
487 spin_unlock(&lslist_lock);
488
489 dlm_create_debug_file(ls);
490
491 error = kobject_setup(ls);
492 if (error)
493 goto out_del;
494
495 error = kobject_register(&ls->ls_kobj);
496 if (error)
497 goto out_del;
498
499 error = do_uevent(ls, 1);
500 if (error)
501 goto out_unreg;
502
503 *lockspace = ls;
504 return 0;
505
506 out_unreg:
507 kobject_unregister(&ls->ls_kobj);
508 out_del:
509 dlm_delete_debug_file(ls);
510 spin_lock(&lslist_lock);
511 list_del(&ls->ls_list);
512 spin_unlock(&lslist_lock);
513 dlm_recoverd_stop(ls);
514 out_rcomfree:
515 kfree(ls->ls_recover_buf);
516 out_dirfree:
517 kfree(ls->ls_dirtbl);
518 out_lkbfree:
519 kfree(ls->ls_lkbtbl);
520 out_rsbfree:
521 kfree(ls->ls_rsbtbl);
522 out_lsfree:
523 kfree(ls);
524 out:
525 module_put(THIS_MODULE);
526 return error;
527}
528
529int dlm_new_lockspace(char *name, int namelen, void **lockspace,
530 uint32_t flags, int lvblen)
531{
532 int error = 0;
533
534 mutex_lock(&ls_lock);
535 if (!ls_count)
536 error = threads_start();
537 if (error)
538 goto out;
539
540 error = new_lockspace(name, namelen, lockspace, flags, lvblen);
541 if (!error)
542 ls_count++;
543 out:
544 mutex_unlock(&ls_lock);
545 return error;
546}
547
548/* Return 1 if the lockspace still has active remote locks,
549 * 2 if the lockspace still has active local locks.
550 */
551static int lockspace_busy(struct dlm_ls *ls)
552{
553 int i, lkb_found = 0;
554 struct dlm_lkb *lkb;
555
556 /* NOTE: We check the lockidtbl here rather than the resource table.
557 This is because there may be LKBs queued as ASTs that have been
558 unlinked from their RSBs and are pending deletion once the AST has
559 been delivered */
560
561 for (i = 0; i < ls->ls_lkbtbl_size; i++) {
562 read_lock(&ls->ls_lkbtbl[i].lock);
563 if (!list_empty(&ls->ls_lkbtbl[i].list)) {
564 lkb_found = 1;
565 list_for_each_entry(lkb, &ls->ls_lkbtbl[i].list,
566 lkb_idtbl_list) {
567 if (!lkb->lkb_nodeid) {
568 read_unlock(&ls->ls_lkbtbl[i].lock);
569 return 2;
570 }
571 }
572 }
573 read_unlock(&ls->ls_lkbtbl[i].lock);
574 }
575 return lkb_found;
576}
577
578static int release_lockspace(struct dlm_ls *ls, int force)
579{
580 struct dlm_lkb *lkb;
581 struct dlm_rsb *rsb;
582 struct list_head *head;
583 int i;
584 int busy = lockspace_busy(ls);
585
586 if (busy > force)
587 return -EBUSY;
588
589 if (force < 3)
590 do_uevent(ls, 0);
591
592 dlm_recoverd_stop(ls);
593
594 remove_lockspace(ls);
595
596 dlm_delete_debug_file(ls);
597
598 dlm_astd_suspend();
599
600 kfree(ls->ls_recover_buf);
601
602 /*
603 * Free direntry structs.
604 */
605
606 dlm_dir_clear(ls);
607 kfree(ls->ls_dirtbl);
608
609 /*
610 * Free all lkb's on lkbtbl[] lists.
611 */
612
613 for (i = 0; i < ls->ls_lkbtbl_size; i++) {
614 head = &ls->ls_lkbtbl[i].list;
615 while (!list_empty(head)) {
616 lkb = list_entry(head->next, struct dlm_lkb,
617 lkb_idtbl_list);
618
619 list_del(&lkb->lkb_idtbl_list);
620
621 dlm_del_ast(lkb);
622
623 if (lkb->lkb_lvbptr && lkb->lkb_flags & DLM_IFL_MSTCPY)
624 free_lvb(lkb->lkb_lvbptr);
625
626 free_lkb(lkb);
627 }
628 }
629 dlm_astd_resume();
630
631 kfree(ls->ls_lkbtbl);
632
633 /*
634 * Free all rsb's on rsbtbl[] lists
635 */
636
637 for (i = 0; i < ls->ls_rsbtbl_size; i++) {
638 head = &ls->ls_rsbtbl[i].list;
639 while (!list_empty(head)) {
640 rsb = list_entry(head->next, struct dlm_rsb,
641 res_hashchain);
642
643 list_del(&rsb->res_hashchain);
644 free_rsb(rsb);
645 }
646
647 head = &ls->ls_rsbtbl[i].toss;
648 while (!list_empty(head)) {
649 rsb = list_entry(head->next, struct dlm_rsb,
650 res_hashchain);
651 list_del(&rsb->res_hashchain);
652 free_rsb(rsb);
653 }
654 }
655
656 kfree(ls->ls_rsbtbl);
657
658 /*
659 * Free structures on any other lists
660 */
661
662 kfree(ls->ls_recover_args);
663 dlm_clear_free_entries(ls);
664 dlm_clear_members(ls);
665 dlm_clear_members_gone(ls);
666 kfree(ls->ls_node_array);
667 kobject_unregister(&ls->ls_kobj);
668 kfree(ls);
669
670 mutex_lock(&ls_lock);
671 ls_count--;
672 if (!ls_count)
673 threads_stop();
674 mutex_unlock(&ls_lock);
675
676 module_put(THIS_MODULE);
677 return 0;
678}
679
680/*
681 * Called when a system has released all its locks and is not going to use the
682 * lockspace any longer. We free everything we're managing for this lockspace.
683 * Remaining nodes will go through the recovery process as if we'd died. The
684 * lockspace must continue to function as usual, participating in recoveries,
685 * until this returns.
686 *
687 * Force has 4 possible values:
688 * 0 - don't destroy lockspace if it has any LKBs
689 * 1 - destroy lockspace if it has remote LKBs but not if it has local LKBs
690 * 2 - destroy lockspace regardless of LKBs
691 * 3 - destroy lockspace as part of a forced shutdown
692 */
693
694int dlm_release_lockspace(void *lockspace, int force)
695{
696 struct dlm_ls *ls;
697
698 ls = dlm_find_lockspace_local(lockspace);
699 if (!ls)
700 return -EINVAL;
701 dlm_put_lockspace(ls);
702 return release_lockspace(ls, force);
703}
704
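
For orientation, a sketch of how an in-kernel caller might drive the lockspace lifecycle implemented above. The function name is hypothetical and the lvblen of 32 is only an example; new_lockspace() requires a non-zero multiple of 8, and dlm_release_lockspace() interprets force exactly as documented in the comment above.

static int example_lockspace_lifecycle(void)
{
	char name[] = "example";
	void *ls;
	int error;

	error = dlm_new_lockspace(name, strlen(name), &ls, 0, 32);
	if (error)
		return error;		/* e.g. -EEXIST if the name is in use */

	/* ... take and release locks in the lockspace ... */

	/* force 0: fail with -EBUSY if any LKBs remain;
	   force 2 would tear the lockspace down regardless of LKBs */
	return dlm_release_lockspace(ls, 0);
}
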
diff --git a/fs/dlm/lockspace.h b/fs/dlm/lockspace.h
new file mode 100644
index 000000000000..891eabbdd021
--- /dev/null
+++ b/fs/dlm/lockspace.h
@@ -0,0 +1,25 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#ifndef __LOCKSPACE_DOT_H__
15#define __LOCKSPACE_DOT_H__
16
17int dlm_lockspace_init(void);
18void dlm_lockspace_exit(void);
19struct dlm_ls *dlm_find_lockspace_global(uint32_t id);
20struct dlm_ls *dlm_find_lockspace_local(void *id);
21struct dlm_ls *dlm_find_lockspace_device(int minor);
22void dlm_put_lockspace(struct dlm_ls *ls);
23
24#endif /* __LOCKSPACE_DOT_H__ */
25
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
new file mode 100644
index 000000000000..7ab40422ab57
--- /dev/null
+++ b/fs/dlm/lowcomms.c
@@ -0,0 +1,1238 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14/*
15 * lowcomms.c
16 *
17 * This is the "low-level" comms layer.
18 *
19 * It is responsible for sending/receiving messages
20 * from other nodes in the cluster.
21 *
22 * Cluster nodes are referred to by their nodeids. nodeids are
23 * simply 32 bit numbers to the locking module - if they need to
24 * be expanded for the cluster infrastructure then that is its
25 * responsibility. It is this layer's responsibility to resolve
26 * these into IP addresses or whatever else it needs for
27 * inter-node communication.
28 *
29 * The comms level is two kernel threads that deal mainly with
30 * the receiving of messages from other nodes and passing them
31 * up to the mid-level comms layer (which understands the
32 * message format) for execution by the locking core, and
33 * a send thread which does all the setting up of connections
34 * to remote nodes and the sending of data. Threads are not allowed
35 * to send their own data because it may cause them to wait in times
36 * of high load. Also, this way, the sending thread can collect together
37 * messages bound for one node and send them in one block.
38 *
39 * I don't see any problem with the recv thread executing the locking
40 * code on behalf of remote processes as the locking code is
41 * short, efficient and never (well, hardly ever) waits.
42 *
43 */
44
45#include <asm/ioctls.h>
46#include <net/sock.h>
47#include <net/tcp.h>
48#include <net/sctp/user.h>
49#include <linux/pagemap.h>
50#include <linux/socket.h>
51#include <linux/idr.h>
52
53#include "dlm_internal.h"
54#include "lowcomms.h"
55#include "config.h"
56#include "midcomms.h"
57
58static struct sockaddr_storage *dlm_local_addr[DLM_MAX_ADDR_COUNT];
59static int dlm_local_count;
60static int dlm_local_nodeid;
61
62/* One of these per connected node */
63
64#define NI_INIT_PENDING 1
65#define NI_WRITE_PENDING 2
66
67struct nodeinfo {
68 spinlock_t lock;
69 sctp_assoc_t assoc_id;
70 unsigned long flags;
71 struct list_head write_list; /* nodes with pending writes */
72 struct list_head writequeue; /* outgoing writequeue_entries */
73 spinlock_t writequeue_lock;
74 int nodeid;
75};
76
77static DEFINE_IDR(nodeinfo_idr);
78static struct rw_semaphore nodeinfo_lock;
79static int max_nodeid;
80
81struct cbuf {
82 unsigned base;
83 unsigned len;
84 unsigned mask;
85};
86
87/* Just the one of these, now. But this struct keeps
88 the connection-specific variables together */
89
90#define CF_READ_PENDING 1
91
92struct connection {
93 struct socket *sock;
94 unsigned long flags;
95 struct page *rx_page;
96 atomic_t waiting_requests;
97 struct cbuf cb;
98 int eagain_flag;
99};
100
101/* An entry waiting to be sent */
102
103struct writequeue_entry {
104 struct list_head list;
105 struct page *page;
106 int offset;
107 int len;
108 int end;
109 int users;
110 struct nodeinfo *ni;
111};
112
113#define CBUF_ADD(cb, n) do { (cb)->len += n; } while(0)
114#define CBUF_EMPTY(cb) ((cb)->len == 0)
115#define CBUF_MAY_ADD(cb, n) (((cb)->len + (n)) < ((cb)->mask + 1))
116#define CBUF_DATA(cb) (((cb)->base + (cb)->len) & (cb)->mask)
117
118#define CBUF_INIT(cb, size) \
119do { \
120 (cb)->base = (cb)->len = 0; \
121 (cb)->mask = ((size)-1); \
122} while(0)
123
124#define CBUF_EAT(cb, n) \
125do { \
126 (cb)->len -= (n); \
127 (cb)->base += (n); \
128 (cb)->base &= (cb)->mask; \
129} while(0)
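
A short, purely illustrative walk-through of the CBUF_* ring-buffer macros above; the numbers are arbitrary, and the buffer size must be a power of two for the mask arithmetic to hold.

static void example_cbuf_usage(void)
{
	struct cbuf cb;

	CBUF_INIT(&cb, 16);	/* base = 0, len = 0, mask = 15 */
	CBUF_ADD(&cb, 10);	/* 10 bytes received: len = 10 */
	CBUF_EAT(&cb, 9);	/* 9 bytes consumed: base = 9, len = 1 */

	/* CBUF_DATA(&cb) == (9 + 1) & 15 == 10: the next received byte
	   lands at offset 10, and offsets wrap once base + len passes 15 */
}
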
130
131
132/* List of nodes which have writes pending */
133static struct list_head write_nodes;
134static spinlock_t write_nodes_lock;
135
136/* Maximum number of incoming messages to process before
137 * doing a schedule()
138 */
139#define MAX_RX_MSG_COUNT 25
140
141/* Manage daemons */
142static struct task_struct *recv_task;
143static struct task_struct *send_task;
144static wait_queue_head_t lowcomms_recv_wait;
145static atomic_t accepting;
146
147/* The SCTP connection */
148static struct connection sctp_con;
149
150
151static int nodeid_to_addr(int nodeid, struct sockaddr *retaddr)
152{
153 struct sockaddr_storage addr;
154 int error;
155
156 if (!dlm_local_count)
157 return -1;
158
159 error = dlm_nodeid_to_addr(nodeid, &addr);
160 if (error)
161 return error;
162
163 if (dlm_local_addr[0]->ss_family == AF_INET) {
164 struct sockaddr_in *in4 = (struct sockaddr_in *) &addr;
165 struct sockaddr_in *ret4 = (struct sockaddr_in *) retaddr;
166 ret4->sin_addr.s_addr = in4->sin_addr.s_addr;
167 } else {
168 struct sockaddr_in6 *in6 = (struct sockaddr_in6 *) &addr;
169 struct sockaddr_in6 *ret6 = (struct sockaddr_in6 *) retaddr;
170 memcpy(&ret6->sin6_addr, &in6->sin6_addr,
171 sizeof(in6->sin6_addr));
172 }
173
174 return 0;
175}
176
177static struct nodeinfo *nodeid2nodeinfo(int nodeid, int alloc)
178{
179 struct nodeinfo *ni;
180 int r;
181 int n;
182
183 down_read(&nodeinfo_lock);
184 ni = idr_find(&nodeinfo_idr, nodeid);
185 up_read(&nodeinfo_lock);
186
187 if (!ni && alloc) {
188 down_write(&nodeinfo_lock);
189
190 ni = idr_find(&nodeinfo_idr, nodeid);
191 if (ni)
192 goto out_up;
193
194 r = idr_pre_get(&nodeinfo_idr, alloc);
195 if (!r)
196 goto out_up;
197
198 ni = kmalloc(sizeof(struct nodeinfo), alloc);
199 if (!ni)
200 goto out_up;
201
202 r = idr_get_new_above(&nodeinfo_idr, ni, nodeid, &n);
203 if (r) {
204 kfree(ni);
205 ni = NULL;
206 goto out_up;
207 }
208 if (n != nodeid) {
209 idr_remove(&nodeinfo_idr, n);
210 kfree(ni);
211 ni = NULL;
212 goto out_up;
213 }
214 memset(ni, 0, sizeof(struct nodeinfo));
215 spin_lock_init(&ni->lock);
216 INIT_LIST_HEAD(&ni->writequeue);
217 spin_lock_init(&ni->writequeue_lock);
218 ni->nodeid = nodeid;
219
220 if (nodeid > max_nodeid)
221 max_nodeid = nodeid;
222 out_up:
223 up_write(&nodeinfo_lock);
224 }
225
226 return ni;
227}
228
229/* Don't call this too often... */
230static struct nodeinfo *assoc2nodeinfo(sctp_assoc_t assoc)
231{
232 int i;
233 struct nodeinfo *ni;
234
235 for (i=1; i<=max_nodeid; i++) {
236 ni = nodeid2nodeinfo(i, 0);
237 if (ni && ni->assoc_id == assoc)
238 return ni;
239 }
240 return NULL;
241}
242
243/* Data or notification available on socket */
244static void lowcomms_data_ready(struct sock *sk, int count_unused)
245{
246 atomic_inc(&sctp_con.waiting_requests);
247 if (test_and_set_bit(CF_READ_PENDING, &sctp_con.flags))
248 return;
249
250 wake_up_interruptible(&lowcomms_recv_wait);
251}
252
253
254/* Add the port number to an IPv6 or IPv4 sockaddr and return the address length.
255   Also pad out the struct with zeros to make comparisons meaningful */
256
257static void make_sockaddr(struct sockaddr_storage *saddr, uint16_t port,
258 int *addr_len)
259{
260 struct sockaddr_in *local4_addr;
261 struct sockaddr_in6 *local6_addr;
262
263 if (!dlm_local_count)
264 return;
265
266 if (!port) {
267 if (dlm_local_addr[0]->ss_family == AF_INET) {
268 local4_addr = (struct sockaddr_in *)dlm_local_addr[0];
269 port = be16_to_cpu(local4_addr->sin_port);
270 } else {
271 local6_addr = (struct sockaddr_in6 *)dlm_local_addr[0];
272 port = be16_to_cpu(local6_addr->sin6_port);
273 }
274 }
275
276 saddr->ss_family = dlm_local_addr[0]->ss_family;
277 if (dlm_local_addr[0]->ss_family == AF_INET) {
278 struct sockaddr_in *in4_addr = (struct sockaddr_in *)saddr;
279 in4_addr->sin_port = cpu_to_be16(port);
280 memset(&in4_addr->sin_zero, 0, sizeof(in4_addr->sin_zero));
281 memset(in4_addr+1, 0, sizeof(struct sockaddr_storage) -
282 sizeof(struct sockaddr_in));
283 *addr_len = sizeof(struct sockaddr_in);
284 } else {
285 struct sockaddr_in6 *in6_addr = (struct sockaddr_in6 *)saddr;
286 in6_addr->sin6_port = cpu_to_be16(port);
287 memset(in6_addr+1, 0, sizeof(struct sockaddr_storage) -
288 sizeof(struct sockaddr_in6));
289 *addr_len = sizeof(struct sockaddr_in6);
290 }
291}
292
293/* Close the connection and tidy up */
294static void close_connection(void)
295{
296 if (sctp_con.sock) {
297 sock_release(sctp_con.sock);
298 sctp_con.sock = NULL;
299 }
300
301 if (sctp_con.rx_page) {
302 __free_page(sctp_con.rx_page);
303 sctp_con.rx_page = NULL;
304 }
305}
306
307/* We only send shutdown messages to nodes that are not part of the cluster */
308static void send_shutdown(sctp_assoc_t associd)
309{
310 static char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
311 struct msghdr outmessage;
312 struct cmsghdr *cmsg;
313 struct sctp_sndrcvinfo *sinfo;
314 int ret;
315
316 outmessage.msg_name = NULL;
317 outmessage.msg_namelen = 0;
318 outmessage.msg_control = outcmsg;
319 outmessage.msg_controllen = sizeof(outcmsg);
320 outmessage.msg_flags = MSG_EOR;
321
322 cmsg = CMSG_FIRSTHDR(&outmessage);
323 cmsg->cmsg_level = IPPROTO_SCTP;
324 cmsg->cmsg_type = SCTP_SNDRCV;
325 cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
326 outmessage.msg_controllen = cmsg->cmsg_len;
327 sinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg);
328 memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo));
329
330 sinfo->sinfo_flags |= MSG_EOF;
331 sinfo->sinfo_assoc_id = associd;
332
333 ret = kernel_sendmsg(sctp_con.sock, &outmessage, NULL, 0, 0);
334
335 if (ret != 0)
336 log_print("send EOF to node failed: %d", ret);
337}
338
339
340/* INIT failed but we don't know which node...
341 restart INIT on all pending nodes */
342static void init_failed(void)
343{
344 int i;
345 struct nodeinfo *ni;
346
347 for (i=1; i<=max_nodeid; i++) {
348 ni = nodeid2nodeinfo(i, 0);
349 if (!ni)
350 continue;
351
352 if (test_and_clear_bit(NI_INIT_PENDING, &ni->flags)) {
353 ni->assoc_id = 0;
354 if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
355 spin_lock_bh(&write_nodes_lock);
356 list_add_tail(&ni->write_list, &write_nodes);
357 spin_unlock_bh(&write_nodes_lock);
358 }
359 }
360 }
361 wake_up_process(send_task);
362}
363
364/* Something happened to an association */
365static void process_sctp_notification(struct msghdr *msg, char *buf)
366{
367 union sctp_notification *sn = (union sctp_notification *)buf;
368
369 if (sn->sn_header.sn_type == SCTP_ASSOC_CHANGE) {
370 switch (sn->sn_assoc_change.sac_state) {
371
372 case SCTP_COMM_UP:
373 case SCTP_RESTART:
374 {
375 /* Check that the new node is in the lockspace */
376 struct sctp_prim prim;
377 mm_segment_t fs;
378 int nodeid;
379 int prim_len, ret;
380 int addr_len;
381 struct nodeinfo *ni;
382
383 /* This seems to happen when we received a connection
384 * too early... or something... anyway, it happens but
385 * we always seem to get a real message too, see
386 * receive_from_sock */
387
388 if ((int)sn->sn_assoc_change.sac_assoc_id <= 0) {
389 log_print("COMM_UP for invalid assoc ID %d",
390 (int)sn->sn_assoc_change.sac_assoc_id);
391 init_failed();
392 return;
393 }
394 memset(&prim, 0, sizeof(struct sctp_prim));
395 prim_len = sizeof(struct sctp_prim);
396 prim.ssp_assoc_id = sn->sn_assoc_change.sac_assoc_id;
397
398 fs = get_fs();
399 set_fs(get_ds());
400 ret = sctp_con.sock->ops->getsockopt(sctp_con.sock,
401 IPPROTO_SCTP, SCTP_PRIMARY_ADDR,
402 (char*)&prim, &prim_len);
403 set_fs(fs);
404 if (ret < 0) {
405 struct nodeinfo *ni;
406
407 log_print("getsockopt/sctp_primary_addr on "
408 "new assoc %d failed : %d",
409 (int)sn->sn_assoc_change.sac_assoc_id, ret);
410
411 /* Retry INIT later */
412 ni = assoc2nodeinfo(sn->sn_assoc_change.sac_assoc_id);
413 if (ni)
414 clear_bit(NI_INIT_PENDING, &ni->flags);
415 return;
416 }
417 make_sockaddr(&prim.ssp_addr, 0, &addr_len);
418 if (dlm_addr_to_nodeid(&prim.ssp_addr, &nodeid)) {
419 log_print("reject connect from unknown addr");
420 send_shutdown(prim.ssp_assoc_id);
421 return;
422 }
423
424 ni = nodeid2nodeinfo(nodeid, GFP_KERNEL);
425 if (!ni)
426 return;
427
428 /* Save the assoc ID */
429 spin_lock(&ni->lock);
430 ni->assoc_id = sn->sn_assoc_change.sac_assoc_id;
431 spin_unlock(&ni->lock);
432
433 log_print("got new/restarted association %d nodeid %d",
434 (int)sn->sn_assoc_change.sac_assoc_id, nodeid);
435
436 /* Send any pending writes */
437 clear_bit(NI_INIT_PENDING, &ni->flags);
438 if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
439 spin_lock_bh(&write_nodes_lock);
440 list_add_tail(&ni->write_list, &write_nodes);
441 spin_unlock_bh(&write_nodes_lock);
442 }
443 wake_up_process(send_task);
444 }
445 break;
446
447 case SCTP_COMM_LOST:
448 case SCTP_SHUTDOWN_COMP:
449 {
450 struct nodeinfo *ni;
451
452 ni = assoc2nodeinfo(sn->sn_assoc_change.sac_assoc_id);
453 if (ni) {
454 spin_lock(&ni->lock);
455 ni->assoc_id = 0;
456 spin_unlock(&ni->lock);
457 }
458 }
459 break;
460
461 /* We don't know which INIT failed, so clear the PENDING flags
462	 * on them all. If assoc_id is zero it will then try
463 * again */
464
465 case SCTP_CANT_STR_ASSOC:
466 {
467 log_print("Can't start SCTP association - retrying");
468 init_failed();
469 }
470 break;
471
472 default:
473 log_print("unexpected SCTP assoc change id=%d state=%d",
474 (int)sn->sn_assoc_change.sac_assoc_id,
475 sn->sn_assoc_change.sac_state);
476 }
477 }
478}
479
480/* Data received from remote end */
481static int receive_from_sock(void)
482{
483 int ret = 0;
484 struct msghdr msg;
485 struct kvec iov[2];
486 unsigned len;
487 int r;
488 struct sctp_sndrcvinfo *sinfo;
489 struct cmsghdr *cmsg;
490 struct nodeinfo *ni;
491
492 /* These two are marginally too big for stack allocation, but this
493 * function is (currently) only called by dlm_recvd so static should be
494 * OK.
495 */
496 static struct sockaddr_storage msgname;
497 static char incmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
498
499 if (sctp_con.sock == NULL)
500 goto out;
501
502 if (sctp_con.rx_page == NULL) {
503 /*
504 * This doesn't need to be atomic, but I think it should
505 * improve performance if it is.
506 */
507 sctp_con.rx_page = alloc_page(GFP_ATOMIC);
508 if (sctp_con.rx_page == NULL)
509 goto out_resched;
510 CBUF_INIT(&sctp_con.cb, PAGE_CACHE_SIZE);
511 }
512
513 memset(&incmsg, 0, sizeof(incmsg));
514 memset(&msgname, 0, sizeof(msgname));
515
516 memset(incmsg, 0, sizeof(incmsg));
517 msg.msg_name = &msgname;
518 msg.msg_namelen = sizeof(msgname);
519 msg.msg_flags = 0;
520 msg.msg_control = incmsg;
521 msg.msg_controllen = sizeof(incmsg);
522
523 /* I don't see why this circular buffer stuff is necessary for SCTP
524 * which is a packet-based protocol, but the whole thing breaks under
525 * load without it! The overhead is minimal (and is in the TCP lowcomms
526 * anyway, of course) so I'll leave it in until I can figure out what's
527 * really happening.
528 */
529
530 /*
531 * iov[0] is the bit of the circular buffer between the current end
532 * point (cb.base + cb.len) and the end of the buffer.
533 */
534 iov[0].iov_len = sctp_con.cb.base - CBUF_DATA(&sctp_con.cb);
535 iov[0].iov_base = page_address(sctp_con.rx_page) +
536 CBUF_DATA(&sctp_con.cb);
537 iov[1].iov_len = 0;
538
539 /*
540 * iov[1] is the bit of the circular buffer between the start of the
541 * buffer and the start of the currently used section (cb.base)
542 */
543 if (CBUF_DATA(&sctp_con.cb) >= sctp_con.cb.base) {
544 iov[0].iov_len = PAGE_CACHE_SIZE - CBUF_DATA(&sctp_con.cb);
545 iov[1].iov_len = sctp_con.cb.base;
546 iov[1].iov_base = page_address(sctp_con.rx_page);
547 msg.msg_iovlen = 2;
548 }
549 len = iov[0].iov_len + iov[1].iov_len;
550
551 r = ret = kernel_recvmsg(sctp_con.sock, &msg, iov, 1, len,
552 MSG_NOSIGNAL | MSG_DONTWAIT);
553 if (ret <= 0)
554 goto out_close;
555
556 msg.msg_control = incmsg;
557 msg.msg_controllen = sizeof(incmsg);
558 cmsg = CMSG_FIRSTHDR(&msg);
559 sinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg);
560
561 if (msg.msg_flags & MSG_NOTIFICATION) {
562 process_sctp_notification(&msg, page_address(sctp_con.rx_page));
563 return 0;
564 }
565
566 /* Is this a new association ? */
567 ni = nodeid2nodeinfo(le32_to_cpu(sinfo->sinfo_ppid), GFP_KERNEL);
568 if (ni) {
569 ni->assoc_id = sinfo->sinfo_assoc_id;
570 if (test_and_clear_bit(NI_INIT_PENDING, &ni->flags)) {
571
572 if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
573 spin_lock_bh(&write_nodes_lock);
574 list_add_tail(&ni->write_list, &write_nodes);
575 spin_unlock_bh(&write_nodes_lock);
576 }
577 wake_up_process(send_task);
578 }
579 }
580
581 /* INIT sends a message with length of 1 - ignore it */
582 if (r == 1)
583 return 0;
584
585 CBUF_ADD(&sctp_con.cb, ret);
586 ret = dlm_process_incoming_buffer(cpu_to_le32(sinfo->sinfo_ppid),
587 page_address(sctp_con.rx_page),
588 sctp_con.cb.base, sctp_con.cb.len,
589 PAGE_CACHE_SIZE);
590 if (ret < 0)
591 goto out_close;
592 CBUF_EAT(&sctp_con.cb, ret);
593
594 out:
595 ret = 0;
596 goto out_ret;
597
598 out_resched:
599 lowcomms_data_ready(sctp_con.sock->sk, 0);
600 ret = 0;
601 schedule();
602 goto out_ret;
603
604 out_close:
605 if (ret != -EAGAIN)
606 log_print("error reading from sctp socket: %d", ret);
607 out_ret:
608 return ret;
609}
610
611/* Bind to an IP address. SCTP allows multiple addresses so it can do multi-homing */
612static int add_bind_addr(struct sockaddr_storage *addr, int addr_len, int num)
613{
614 mm_segment_t fs;
615 int result = 0;
616
617 fs = get_fs();
618 set_fs(get_ds());
619 if (num == 1)
620 result = sctp_con.sock->ops->bind(sctp_con.sock,
621 (struct sockaddr *) addr, addr_len);
622 else
623 result = sctp_con.sock->ops->setsockopt(sctp_con.sock, SOL_SCTP,
624 SCTP_SOCKOPT_BINDX_ADD, (char *)addr, addr_len);
625 set_fs(fs);
626
627 if (result < 0)
628 log_print("Can't bind to port %d addr number %d",
629 dlm_config.tcp_port, num);
630
631 return result;
632}
633
634static void init_local(void)
635{
636 struct sockaddr_storage sas, *addr;
637 int i;
638
639 dlm_local_nodeid = dlm_our_nodeid();
640
641 for (i = 0; i < DLM_MAX_ADDR_COUNT - 1; i++) {
642 if (dlm_our_addr(&sas, i))
643 break;
644
645 addr = kmalloc(sizeof(*addr), GFP_KERNEL);
646 if (!addr)
647 break;
648 memcpy(addr, &sas, sizeof(*addr));
649 dlm_local_addr[dlm_local_count++] = addr;
650 }
651}
652
653/* Initialise SCTP socket and bind to all interfaces */
654static int init_sock(void)
655{
656 mm_segment_t fs;
657 struct socket *sock = NULL;
658 struct sockaddr_storage localaddr;
659 struct sctp_event_subscribe subscribe;
660 int result = -EINVAL, num = 1, i, addr_len;
661
662 if (!dlm_local_count) {
663 init_local();
664 if (!dlm_local_count) {
665 log_print("no local IP address has been set");
666 goto out;
667 }
668 }
669
670 result = sock_create_kern(dlm_local_addr[0]->ss_family, SOCK_SEQPACKET,
671 IPPROTO_SCTP, &sock);
672 if (result < 0) {
673 log_print("Can't create comms socket, check SCTP is loaded");
674 goto out;
675 }
676
677 /* Listen for events */
678 memset(&subscribe, 0, sizeof(subscribe));
679 subscribe.sctp_data_io_event = 1;
680 subscribe.sctp_association_event = 1;
681 subscribe.sctp_send_failure_event = 1;
682 subscribe.sctp_shutdown_event = 1;
683 subscribe.sctp_partial_delivery_event = 1;
684
685 fs = get_fs();
686 set_fs(get_ds());
687 result = sock->ops->setsockopt(sock, SOL_SCTP, SCTP_EVENTS,
688 (char *)&subscribe, sizeof(subscribe));
689 set_fs(fs);
690
691 if (result < 0) {
692 log_print("Failed to set SCTP_EVENTS on socket: result=%d",
693 result);
694 goto create_delsock;
695 }
696
697 /* Init con struct */
698 sock->sk->sk_user_data = &sctp_con;
699 sctp_con.sock = sock;
700 sctp_con.sock->sk->sk_data_ready = lowcomms_data_ready;
701
702 /* Bind to all interfaces. */
703 for (i = 0; i < dlm_local_count; i++) {
704 memcpy(&localaddr, dlm_local_addr[i], sizeof(localaddr));
705 make_sockaddr(&localaddr, dlm_config.tcp_port, &addr_len);
706
707 result = add_bind_addr(&localaddr, addr_len, num);
708 if (result)
709 goto create_delsock;
710 ++num;
711 }
712
713 result = sock->ops->listen(sock, 5);
714 if (result < 0) {
715 log_print("Can't set socket listening");
716 goto create_delsock;
717 }
718
719 return 0;
720
721 create_delsock:
722 sock_release(sock);
723 sctp_con.sock = NULL;
724 out:
725 return result;
726}
727
728
729static struct writequeue_entry *new_writequeue_entry(int allocation)
730{
731 struct writequeue_entry *entry;
732
733 entry = kmalloc(sizeof(struct writequeue_entry), allocation);
734 if (!entry)
735 return NULL;
736
737 entry->page = alloc_page(allocation);
738 if (!entry->page) {
739 kfree(entry);
740 return NULL;
741 }
742
743 entry->offset = 0;
744 entry->len = 0;
745 entry->end = 0;
746 entry->users = 0;
747
748 return entry;
749}
750
751void *dlm_lowcomms_get_buffer(int nodeid, int len, int allocation, char **ppc)
752{
753 struct writequeue_entry *e;
754 int offset = 0;
755 int users = 0;
756 struct nodeinfo *ni;
757
758 if (!atomic_read(&accepting))
759 return NULL;
760
761 ni = nodeid2nodeinfo(nodeid, allocation);
762 if (!ni)
763 return NULL;
764
765 spin_lock(&ni->writequeue_lock);
766 e = list_entry(ni->writequeue.prev, struct writequeue_entry, list);
767 if (((struct list_head *) e == &ni->writequeue) ||
768 (PAGE_CACHE_SIZE - e->end < len)) {
769 e = NULL;
770 } else {
771 offset = e->end;
772 e->end += len;
773 users = e->users++;
774 }
775 spin_unlock(&ni->writequeue_lock);
776
777 if (e) {
778 got_one:
779 if (users == 0)
780 kmap(e->page);
781 *ppc = page_address(e->page) + offset;
782 return e;
783 }
784
785 e = new_writequeue_entry(allocation);
786 if (e) {
787 spin_lock(&ni->writequeue_lock);
788 offset = e->end;
789 e->end += len;
790 e->ni = ni;
791 users = e->users++;
792 list_add_tail(&e->list, &ni->writequeue);
793 spin_unlock(&ni->writequeue_lock);
794 goto got_one;
795 }
796 return NULL;
797}
798
799void dlm_lowcomms_commit_buffer(void *arg)
800{
801 struct writequeue_entry *e = (struct writequeue_entry *) arg;
802 int users;
803 struct nodeinfo *ni = e->ni;
804
805 if (!atomic_read(&accepting))
806 return;
807
808 spin_lock(&ni->writequeue_lock);
809 users = --e->users;
810 if (users)
811 goto out;
812 e->len = e->end - e->offset;
813 kunmap(e->page);
814 spin_unlock(&ni->writequeue_lock);
815
816 if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
817 spin_lock_bh(&write_nodes_lock);
818 list_add_tail(&ni->write_list, &write_nodes);
819 spin_unlock_bh(&write_nodes_lock);
820 wake_up_process(send_task);
821 }
822 return;
823
824 out:
825 spin_unlock(&ni->writequeue_lock);
826 return;
827}
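
dlm_lowcomms_get_buffer() and dlm_lowcomms_commit_buffer() above form the send-side interface of this file: reserve space in the per-node writequeue, build the message in place, then commit it so dlm_sendd pushes it out. A hedged sketch of a caller (presumably the midcomms layer does something similar); the function name and the GFP_KERNEL allocation flag are illustrative.

static int example_send_message(int nodeid, void *msg, int len)
{
	void *handle;
	char *p;

	handle = dlm_lowcomms_get_buffer(nodeid, len, GFP_KERNEL, &p);
	if (!handle)
		return -ENOMEM;		/* not accepting, or no nodeinfo */

	memcpy(p, msg, len);			/* build the message in place */
	dlm_lowcomms_commit_buffer(handle);	/* queue it for dlm_sendd */
	return 0;
}
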
828
829static void free_entry(struct writequeue_entry *e)
830{
831 __free_page(e->page);
832 kfree(e);
833}
834
835/* Initiate an SCTP association. In theory we could just use sendmsg() on
836 the first IP address and it should work, but this allows us to set up the
837 association before sending any valuable data that we can't afford to lose.
838 It also keeps the send path clean as it can now always use the association ID */
839static void initiate_association(int nodeid)
840{
841 struct sockaddr_storage rem_addr;
842 static char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
843 struct msghdr outmessage;
844 struct cmsghdr *cmsg;
845 struct sctp_sndrcvinfo *sinfo;
846 int ret;
847 int addrlen;
848 char buf[1];
849 struct kvec iov[1];
850 struct nodeinfo *ni;
851
852 log_print("Initiating association with node %d", nodeid);
853
854 ni = nodeid2nodeinfo(nodeid, GFP_KERNEL);
855 if (!ni)
856 return;
857
858 if (nodeid_to_addr(nodeid, (struct sockaddr *)&rem_addr)) {
859 log_print("no address for nodeid %d", nodeid);
860 return;
861 }
862
863 make_sockaddr(&rem_addr, dlm_config.tcp_port, &addrlen);
864
865 outmessage.msg_name = &rem_addr;
866 outmessage.msg_namelen = addrlen;
867 outmessage.msg_control = outcmsg;
868 outmessage.msg_controllen = sizeof(outcmsg);
869 outmessage.msg_flags = MSG_EOR;
870
871 iov[0].iov_base = buf;
872 iov[0].iov_len = 1;
873
874 /* Real INIT messages seem to cause trouble. Just send a 1 byte message
875 we can afford to lose */
876 cmsg = CMSG_FIRSTHDR(&outmessage);
877 cmsg->cmsg_level = IPPROTO_SCTP;
878 cmsg->cmsg_type = SCTP_SNDRCV;
879 cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
880 sinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg);
881 memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo));
882 sinfo->sinfo_ppid = cpu_to_le32(dlm_local_nodeid);
883
884 outmessage.msg_controllen = cmsg->cmsg_len;
885 ret = kernel_sendmsg(sctp_con.sock, &outmessage, iov, 1, 1);
886 if (ret < 0) {
887 log_print("send INIT to node failed: %d", ret);
888 /* Try again later */
889 clear_bit(NI_INIT_PENDING, &ni->flags);
890 }
891}
892
893/* Send a message */
894static int send_to_sock(struct nodeinfo *ni)
895{
896 int ret = 0;
897 struct writequeue_entry *e;
898 int len, offset;
899 struct msghdr outmsg;
900 static char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
901 struct cmsghdr *cmsg;
902 struct sctp_sndrcvinfo *sinfo;
903 struct kvec iov;
904
905 /* See if we need to init an association before we start
906 sending precious messages */
907 spin_lock(&ni->lock);
908 if (!ni->assoc_id && !test_and_set_bit(NI_INIT_PENDING, &ni->flags)) {
909 spin_unlock(&ni->lock);
910 initiate_association(ni->nodeid);
911 return 0;
912 }
913 spin_unlock(&ni->lock);
914
915 outmsg.msg_name = NULL; /* We use assoc_id */
916 outmsg.msg_namelen = 0;
917 outmsg.msg_control = outcmsg;
918 outmsg.msg_controllen = sizeof(outcmsg);
919 outmsg.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL | MSG_EOR;
920
921 cmsg = CMSG_FIRSTHDR(&outmsg);
922 cmsg->cmsg_level = IPPROTO_SCTP;
923 cmsg->cmsg_type = SCTP_SNDRCV;
924 cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
925 sinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg);
926 memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo));
927 sinfo->sinfo_ppid = cpu_to_le32(dlm_local_nodeid);
928 sinfo->sinfo_assoc_id = ni->assoc_id;
929 outmsg.msg_controllen = cmsg->cmsg_len;
930
931 spin_lock(&ni->writequeue_lock);
932 for (;;) {
933 if (list_empty(&ni->writequeue))
934 break;
935 e = list_entry(ni->writequeue.next, struct writequeue_entry,
936 list);
937 kmap(e->page);
938 len = e->len;
939 offset = e->offset;
940 BUG_ON(len == 0 && e->users == 0);
941 spin_unlock(&ni->writequeue_lock);
942
943 ret = 0;
944 if (len) {
945 iov.iov_base = page_address(e->page)+offset;
946 iov.iov_len = len;
947
948 ret = kernel_sendmsg(sctp_con.sock, &outmsg, &iov, 1,
949 len);
950 if (ret == -EAGAIN) {
951 sctp_con.eagain_flag = 1;
952 goto out;
953 } else if (ret < 0)
954 goto send_error;
955 } else {
956 /* Don't starve people filling buffers */
957 schedule();
958 }
959
960 spin_lock(&ni->writequeue_lock);
961 e->offset += ret;
962 e->len -= ret;
963
964 if (e->len == 0 && e->users == 0) {
965 list_del(&e->list);
966 free_entry(e);
967 continue;
968 }
969 }
970 spin_unlock(&ni->writequeue_lock);
971 out:
972 return ret;
973
974 send_error:
975 log_print("Error sending to node %d %d", ni->nodeid, ret);
976 spin_lock(&ni->lock);
977 if (!test_and_set_bit(NI_INIT_PENDING, &ni->flags)) {
978 ni->assoc_id = 0;
979 spin_unlock(&ni->lock);
980 initiate_association(ni->nodeid);
981 } else
982 spin_unlock(&ni->lock);
983
984 return ret;
985}
986
987/* Try to send any messages that are pending */
988static void process_output_queue(void)
989{
990 struct list_head *list;
991 struct list_head *temp;
992
993 spin_lock_bh(&write_nodes_lock);
994 list_for_each_safe(list, temp, &write_nodes) {
995 struct nodeinfo *ni =
996 list_entry(list, struct nodeinfo, write_list);
997 clear_bit(NI_WRITE_PENDING, &ni->flags);
998 list_del(&ni->write_list);
999
1000 spin_unlock_bh(&write_nodes_lock);
1001
1002 send_to_sock(ni);
1003 spin_lock_bh(&write_nodes_lock);
1004 }
1005 spin_unlock_bh(&write_nodes_lock);
1006}
1007
1008/* Called after we've had -EAGAIN and been woken up */
1009static void refill_write_queue(void)
1010{
1011 int i;
1012
1013 for (i=1; i<=max_nodeid; i++) {
1014 struct nodeinfo *ni = nodeid2nodeinfo(i, 0);
1015
1016 if (ni) {
1017 if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
1018 spin_lock_bh(&write_nodes_lock);
1019 list_add_tail(&ni->write_list, &write_nodes);
1020 spin_unlock_bh(&write_nodes_lock);
1021 }
1022 }
1023 }
1024}
1025
1026static void clean_one_writequeue(struct nodeinfo *ni)
1027{
1028 struct list_head *list;
1029 struct list_head *temp;
1030
1031 spin_lock(&ni->writequeue_lock);
1032 list_for_each_safe(list, temp, &ni->writequeue) {
1033 struct writequeue_entry *e =
1034 list_entry(list, struct writequeue_entry, list);
1035 list_del(&e->list);
1036 free_entry(e);
1037 }
1038 spin_unlock(&ni->writequeue_lock);
1039}
1040
1041static void clean_writequeues(void)
1042{
1043 int i;
1044
1045 for (i=1; i<=max_nodeid; i++) {
1046 struct nodeinfo *ni = nodeid2nodeinfo(i, 0);
1047 if (ni)
1048 clean_one_writequeue(ni);
1049 }
1050}
1051
1052
1053static void dealloc_nodeinfo(void)
1054{
1055 int i;
1056
1057 for (i=1; i<=max_nodeid; i++) {
1058 struct nodeinfo *ni = nodeid2nodeinfo(i, 0);
1059 if (ni) {
1060 idr_remove(&nodeinfo_idr, i);
1061 kfree(ni);
1062 }
1063 }
1064}
1065
1066int dlm_lowcomms_close(int nodeid)
1067{
1068 struct nodeinfo *ni;
1069
1070 ni = nodeid2nodeinfo(nodeid, 0);
1071 if (!ni)
1072 return -1;
1073
1074 spin_lock(&ni->lock);
1075 if (ni->assoc_id) {
1076 ni->assoc_id = 0;
1077 /* Don't send shutdown here, sctp will just queue it
1078 till the node comes back up! */
1079 }
1080 spin_unlock(&ni->lock);
1081
1082 clean_one_writequeue(ni);
1083 clear_bit(NI_INIT_PENDING, &ni->flags);
1084 return 0;
1085}
1086
1087static int write_list_empty(void)
1088{
1089 int status;
1090
1091 spin_lock_bh(&write_nodes_lock);
1092 status = list_empty(&write_nodes);
1093 spin_unlock_bh(&write_nodes_lock);
1094
1095 return status;
1096}
1097
1098static int dlm_recvd(void *data)
1099{
1100 DECLARE_WAITQUEUE(wait, current);
1101
1102 while (!kthread_should_stop()) {
1103 int count = 0;
1104
1105 set_current_state(TASK_INTERRUPTIBLE);
1106 add_wait_queue(&lowcomms_recv_wait, &wait);
1107 if (!test_bit(CF_READ_PENDING, &sctp_con.flags))
1108 schedule();
1109 remove_wait_queue(&lowcomms_recv_wait, &wait);
1110 set_current_state(TASK_RUNNING);
1111
1112 if (test_and_clear_bit(CF_READ_PENDING, &sctp_con.flags)) {
1113 int ret;
1114
1115 do {
1116 ret = receive_from_sock();
1117
1118 /* Don't starve out everyone else */
1119 if (++count >= MAX_RX_MSG_COUNT) {
1120 schedule();
1121 count = 0;
1122 }
1123 } while (!kthread_should_stop() && ret >=0);
1124 }
1125 schedule();
1126 }
1127
1128 return 0;
1129}
1130
1131static int dlm_sendd(void *data)
1132{
1133 DECLARE_WAITQUEUE(wait, current);
1134
1135 add_wait_queue(sctp_con.sock->sk->sk_sleep, &wait);
1136
1137 while (!kthread_should_stop()) {
1138 set_current_state(TASK_INTERRUPTIBLE);
1139 if (write_list_empty())
1140 schedule();
1141 set_current_state(TASK_RUNNING);
1142
1143 if (sctp_con.eagain_flag) {
1144 sctp_con.eagain_flag = 0;
1145 refill_write_queue();
1146 }
1147 process_output_queue();
1148 }
1149
1150 remove_wait_queue(sctp_con.sock->sk->sk_sleep, &wait);
1151
1152 return 0;
1153}
1154
1155static void daemons_stop(void)
1156{
1157 kthread_stop(recv_task);
1158 kthread_stop(send_task);
1159}
1160
1161static int daemons_start(void)
1162{
1163 struct task_struct *p;
1164 int error;
1165
1166 p = kthread_run(dlm_recvd, NULL, "dlm_recvd");
 1167	error = IS_ERR(p) ? PTR_ERR(p) : 0;
1168 if (error) {
1169 log_print("can't start dlm_recvd %d", error);
1170 return error;
1171 }
1172 recv_task = p;
1173
1174 p = kthread_run(dlm_sendd, NULL, "dlm_sendd");
 1175	error = IS_ERR(p) ? PTR_ERR(p) : 0;
1176 if (error) {
1177 log_print("can't start dlm_sendd %d", error);
1178 kthread_stop(recv_task);
1179 return error;
1180 }
1181 send_task = p;
1182
1183 return 0;
1184}
1185
1186/*
1187 * This is quite likely to sleep...
1188 */
1189int dlm_lowcomms_start(void)
1190{
1191 int error;
1192
1193 error = init_sock();
1194 if (error)
1195 goto fail_sock;
1196 error = daemons_start();
1197 if (error)
1198 goto fail_sock;
1199 atomic_set(&accepting, 1);
1200 return 0;
1201
1202 fail_sock:
1203 close_connection();
1204 return error;
1205}
1206
1207/* Set all the activity flags to prevent any socket activity. */
1208
1209void dlm_lowcomms_stop(void)
1210{
1211 atomic_set(&accepting, 0);
1212 sctp_con.flags = 0x7;
1213 daemons_stop();
1214 clean_writequeues();
1215 close_connection();
1216 dealloc_nodeinfo();
1217 max_nodeid = 0;
1218}
1219
1220int dlm_lowcomms_init(void)
1221{
1222 init_waitqueue_head(&lowcomms_recv_wait);
1223 spin_lock_init(&write_nodes_lock);
1224 INIT_LIST_HEAD(&write_nodes);
1225 init_rwsem(&nodeinfo_lock);
1226 return 0;
1227}
1228
1229void dlm_lowcomms_exit(void)
1230{
1231 int i;
1232
1233 for (i = 0; i < dlm_local_count; i++)
1234 kfree(dlm_local_addr[i]);
1235 dlm_local_count = 0;
1236 dlm_local_nodeid = 0;
1237}
1238
diff --git a/fs/dlm/lowcomms.h b/fs/dlm/lowcomms.h
new file mode 100644
index 000000000000..6c04bb09cfa8
--- /dev/null
+++ b/fs/dlm/lowcomms.h
@@ -0,0 +1,26 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#ifndef __LOWCOMMS_DOT_H__
15#define __LOWCOMMS_DOT_H__
16
17int dlm_lowcomms_init(void);
18void dlm_lowcomms_exit(void);
19int dlm_lowcomms_start(void);
20void dlm_lowcomms_stop(void);
21int dlm_lowcomms_close(int nodeid);
22void *dlm_lowcomms_get_buffer(int nodeid, int len, int allocation, char **ppc);
23void dlm_lowcomms_commit_buffer(void *mh);
24
25#endif /* __LOWCOMMS_DOT_H__ */
26
diff --git a/fs/dlm/lvb_table.h b/fs/dlm/lvb_table.h
new file mode 100644
index 000000000000..cc3e92f3feef
--- /dev/null
+++ b/fs/dlm/lvb_table.h
@@ -0,0 +1,18 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13#ifndef __LVB_TABLE_DOT_H__
14#define __LVB_TABLE_DOT_H__
15
16extern const int dlm_lvb_operations[8][8];
17
18#endif
diff --git a/fs/dlm/main.c b/fs/dlm/main.c
new file mode 100644
index 000000000000..a8da8dc36b2e
--- /dev/null
+++ b/fs/dlm/main.c
@@ -0,0 +1,97 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#include "dlm_internal.h"
15#include "lockspace.h"
16#include "lock.h"
17#include "user.h"
18#include "memory.h"
19#include "lowcomms.h"
20#include "config.h"
21
22#ifdef CONFIG_DLM_DEBUG
23int dlm_register_debugfs(void);
24void dlm_unregister_debugfs(void);
25#else
26static inline int dlm_register_debugfs(void) { return 0; }
27static inline void dlm_unregister_debugfs(void) { }
28#endif
29
30static int __init init_dlm(void)
31{
32 int error;
33
34 error = dlm_memory_init();
35 if (error)
36 goto out;
37
38 error = dlm_lockspace_init();
39 if (error)
40 goto out_mem;
41
42 error = dlm_config_init();
43 if (error)
44 goto out_lockspace;
45
46 error = dlm_register_debugfs();
47 if (error)
48 goto out_config;
49
50 error = dlm_lowcomms_init();
51 if (error)
52 goto out_debug;
53
54 error = dlm_user_init();
55 if (error)
56 goto out_lowcomms;
57
58 printk("DLM (built %s %s) installed\n", __DATE__, __TIME__);
59
60 return 0;
61
62 out_lowcomms:
63 dlm_lowcomms_exit();
64 out_debug:
65 dlm_unregister_debugfs();
66 out_config:
67 dlm_config_exit();
68 out_lockspace:
69 dlm_lockspace_exit();
70 out_mem:
71 dlm_memory_exit();
72 out:
73 return error;
74}
75
76static void __exit exit_dlm(void)
77{
78 dlm_user_exit();
79 dlm_lowcomms_exit();
80 dlm_config_exit();
81 dlm_memory_exit();
82 dlm_lockspace_exit();
83 dlm_unregister_debugfs();
84}
85
86module_init(init_dlm);
87module_exit(exit_dlm);
88
89MODULE_DESCRIPTION("Distributed Lock Manager");
90MODULE_AUTHOR("Red Hat, Inc.");
91MODULE_LICENSE("GPL");
92
93EXPORT_SYMBOL_GPL(dlm_new_lockspace);
94EXPORT_SYMBOL_GPL(dlm_release_lockspace);
95EXPORT_SYMBOL_GPL(dlm_lock);
96EXPORT_SYMBOL_GPL(dlm_unlock);
97
diff --git a/fs/dlm/member.c b/fs/dlm/member.c
new file mode 100644
index 000000000000..cd0c51e724e0
--- /dev/null
+++ b/fs/dlm/member.c
@@ -0,0 +1,312 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13#include "dlm_internal.h"
14#include "lockspace.h"
15#include "member.h"
16#include "recoverd.h"
17#include "recover.h"
18#include "rcom.h"
19#include "config.h"
20
21/*
22 * Following called by dlm_recoverd thread
23 */
24
25static void add_ordered_member(struct dlm_ls *ls, struct dlm_member *new)
26{
27 struct dlm_member *memb = NULL;
28 struct list_head *tmp;
29 struct list_head *newlist = &new->list;
30 struct list_head *head = &ls->ls_nodes;
31
32 list_for_each(tmp, head) {
33 memb = list_entry(tmp, struct dlm_member, list);
34 if (new->nodeid < memb->nodeid)
35 break;
36 }
37
38 if (!memb)
39 list_add_tail(newlist, head);
40 else {
41 /* FIXME: can use list macro here */
42 newlist->prev = tmp->prev;
43 newlist->next = tmp;
44 tmp->prev->next = newlist;
45 tmp->prev = newlist;
46 }
47}
48
49static int dlm_add_member(struct dlm_ls *ls, int nodeid)
50{
51 struct dlm_member *memb;
52 int w;
53
 54	w = dlm_node_weight(ls->ls_name, nodeid);
 55	if (w < 0)
 56		return w;
 57
 58	memb = kzalloc(sizeof(struct dlm_member), GFP_KERNEL);
 59	if (!memb)
 60		return -ENOMEM;
61
62 memb->nodeid = nodeid;
63 memb->weight = w;
64 add_ordered_member(ls, memb);
65 ls->ls_num_nodes++;
66 return 0;
67}
68
69static void dlm_remove_member(struct dlm_ls *ls, struct dlm_member *memb)
70{
71 list_move(&memb->list, &ls->ls_nodes_gone);
72 ls->ls_num_nodes--;
73}
74
75static int dlm_is_member(struct dlm_ls *ls, int nodeid)
76{
77 struct dlm_member *memb;
78
79 list_for_each_entry(memb, &ls->ls_nodes, list) {
80 if (memb->nodeid == nodeid)
81 return 1;
82 }
83 return 0;
84}
85
86int dlm_is_removed(struct dlm_ls *ls, int nodeid)
87{
88 struct dlm_member *memb;
89
90 list_for_each_entry(memb, &ls->ls_nodes_gone, list) {
91 if (memb->nodeid == nodeid)
92 return 1;
93 }
94 return 0;
95}
96
97static void clear_memb_list(struct list_head *head)
98{
99 struct dlm_member *memb;
100
101 while (!list_empty(head)) {
102 memb = list_entry(head->next, struct dlm_member, list);
103 list_del(&memb->list);
104 kfree(memb);
105 }
106}
107
108void dlm_clear_members(struct dlm_ls *ls)
109{
110 clear_memb_list(&ls->ls_nodes);
111 ls->ls_num_nodes = 0;
112}
113
114void dlm_clear_members_gone(struct dlm_ls *ls)
115{
116 clear_memb_list(&ls->ls_nodes_gone);
117}
118
119static void make_member_array(struct dlm_ls *ls)
120{
121 struct dlm_member *memb;
122 int i, w, x = 0, total = 0, all_zero = 0, *array;
123
124 kfree(ls->ls_node_array);
125 ls->ls_node_array = NULL;
126
127 list_for_each_entry(memb, &ls->ls_nodes, list) {
128 if (memb->weight)
129 total += memb->weight;
130 }
131
132 /* all nodes revert to weight of 1 if all have weight 0 */
133
134 if (!total) {
135 total = ls->ls_num_nodes;
136 all_zero = 1;
137 }
138
139 ls->ls_total_weight = total;
140
141 array = kmalloc(sizeof(int) * total, GFP_KERNEL);
142 if (!array)
143 return;
144
145 list_for_each_entry(memb, &ls->ls_nodes, list) {
146 if (!all_zero && !memb->weight)
147 continue;
148
149 if (all_zero)
150 w = 1;
151 else
152 w = memb->weight;
153
154 DLM_ASSERT(x < total, printk("total %d x %d\n", total, x););
155
156 for (i = 0; i < w; i++)
157 array[x++] = memb->nodeid;
158 }
159
160 ls->ls_node_array = array;
161}
162
163/* send a status request to all members just to establish comms connections */
164
165static void ping_members(struct dlm_ls *ls)
166{
167 struct dlm_member *memb;
168 list_for_each_entry(memb, &ls->ls_nodes, list)
169 dlm_rcom_status(ls, memb->nodeid);
170}
171
172int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
173{
174 struct dlm_member *memb, *safe;
175 int i, error, found, pos = 0, neg = 0, low = -1;
176
177 /* move departed members from ls_nodes to ls_nodes_gone */
178
179 list_for_each_entry_safe(memb, safe, &ls->ls_nodes, list) {
180 found = 0;
181 for (i = 0; i < rv->node_count; i++) {
182 if (memb->nodeid == rv->nodeids[i]) {
183 found = 1;
184 break;
185 }
186 }
187
188 if (!found) {
189 neg++;
190 dlm_remove_member(ls, memb);
191 log_debug(ls, "remove member %d", memb->nodeid);
192 }
193 }
194
195 /* add new members to ls_nodes */
196
197 for (i = 0; i < rv->node_count; i++) {
198 if (dlm_is_member(ls, rv->nodeids[i]))
199 continue;
200 dlm_add_member(ls, rv->nodeids[i]);
201 pos++;
202 log_debug(ls, "add member %d", rv->nodeids[i]);
203 }
204
205 list_for_each_entry(memb, &ls->ls_nodes, list) {
206 if (low == -1 || memb->nodeid < low)
207 low = memb->nodeid;
208 }
209 ls->ls_low_nodeid = low;
210
211 make_member_array(ls);
212 dlm_set_recover_status(ls, DLM_RS_NODES);
213 *neg_out = neg;
214
215 ping_members(ls);
216
217 error = dlm_recover_members_wait(ls);
218 log_debug(ls, "total members %d", ls->ls_num_nodes);
219 return error;
220}
221
222/*
223 * Following called from lockspace.c
224 */
225
226int dlm_ls_stop(struct dlm_ls *ls)
227{
228 int new;
229
230 /*
231 * A stop cancels any recovery that's in progress (see RECOVERY_STOP,
232 * dlm_recovery_stopped()) and prevents any new locks from being
233 * processed (see RUNNING, dlm_locking_stopped()).
234 */
235
236 spin_lock(&ls->ls_recover_lock);
237 set_bit(LSFL_RECOVERY_STOP, &ls->ls_flags);
238 new = test_and_clear_bit(LSFL_RUNNING, &ls->ls_flags);
239 ls->ls_recover_seq++;
240 spin_unlock(&ls->ls_recover_lock);
241
242 /*
243 * This in_recovery lock does two things:
244 *
245 * 1) Keeps this function from returning until all threads are out
 246	 *    of locking routines and locking is truly stopped.
247 * 2) Keeps any new requests from being processed until it's unlocked
248 * when recovery is complete.
249 */
250
251 if (new)
252 down_write(&ls->ls_in_recovery);
253
254 /*
255 * The recoverd suspend/resume makes sure that dlm_recoverd (if
256 * running) has noticed the clearing of RUNNING above and quit
257 * processing the previous recovery. This will be true for all nodes
258 * before any nodes start the new recovery.
259 */
260
261 dlm_recoverd_suspend(ls);
262 ls->ls_recover_status = 0;
263 dlm_recoverd_resume(ls);
264 return 0;
265}
266
267int dlm_ls_start(struct dlm_ls *ls)
268{
269 struct dlm_recover *rv = NULL, *rv_old;
270 int *ids = NULL;
271 int error, count;
272
273 rv = kzalloc(sizeof(struct dlm_recover), GFP_KERNEL);
274 if (!rv)
275 return -ENOMEM;
276
277 error = count = dlm_nodeid_list(ls->ls_name, &ids);
278 if (error <= 0)
279 goto fail;
280
281 spin_lock(&ls->ls_recover_lock);
282
283 /* the lockspace needs to be stopped before it can be started */
284
285 if (!dlm_locking_stopped(ls)) {
286 spin_unlock(&ls->ls_recover_lock);
287 log_error(ls, "start ignored: lockspace running");
288 error = -EINVAL;
289 goto fail;
290 }
291
292 rv->nodeids = ids;
293 rv->node_count = count;
294 rv->seq = ++ls->ls_recover_seq;
295 rv_old = ls->ls_recover_args;
296 ls->ls_recover_args = rv;
297 spin_unlock(&ls->ls_recover_lock);
298
299 if (rv_old) {
300 kfree(rv_old->nodeids);
301 kfree(rv_old);
302 }
303
304 dlm_recoverd_kick(ls);
305 return 0;
306
307 fail:
308 kfree(rv);
309 kfree(ids);
310 return error;
311}
312
diff --git a/fs/dlm/member.h b/fs/dlm/member.h
new file mode 100644
index 000000000000..927c08c19214
--- /dev/null
+++ b/fs/dlm/member.h
@@ -0,0 +1,24 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13#ifndef __MEMBER_DOT_H__
14#define __MEMBER_DOT_H__
15
16int dlm_ls_stop(struct dlm_ls *ls);
17int dlm_ls_start(struct dlm_ls *ls);
18void dlm_clear_members(struct dlm_ls *ls);
19void dlm_clear_members_gone(struct dlm_ls *ls);
20int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv,int *neg_out);
21int dlm_is_removed(struct dlm_ls *ls, int nodeid);
22
23#endif /* __MEMBER_DOT_H__ */
24
diff --git a/fs/dlm/memory.c b/fs/dlm/memory.c
new file mode 100644
index 000000000000..48dfc27861f4
--- /dev/null
+++ b/fs/dlm/memory.c
@@ -0,0 +1,115 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#include "dlm_internal.h"
15#include "config.h"
16#include "memory.h"
17
18static kmem_cache_t *lkb_cache;
19
20
21int dlm_memory_init(void)
22{
23 int ret = 0;
24
25 lkb_cache = kmem_cache_create("dlm_lkb", sizeof(struct dlm_lkb),
26 __alignof__(struct dlm_lkb), 0, NULL, NULL);
27 if (!lkb_cache)
28 ret = -ENOMEM;
29 return ret;
30}
31
32void dlm_memory_exit(void)
33{
34 if (lkb_cache)
35 kmem_cache_destroy(lkb_cache);
36}
37
38char *allocate_lvb(struct dlm_ls *ls)
39{
40 char *p;
41
42 p = kmalloc(ls->ls_lvblen, GFP_KERNEL);
43 if (p)
44 memset(p, 0, ls->ls_lvblen);
45 return p;
46}
47
48void free_lvb(char *p)
49{
50 kfree(p);
51}
52
53/* FIXME: have some minimal space built-in to rsb for the name and
54 kmalloc a separate name if needed, like dentries are done */
55
56struct dlm_rsb *allocate_rsb(struct dlm_ls *ls, int namelen)
57{
58 struct dlm_rsb *r;
59
60 DLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,);
61
62 r = kmalloc(sizeof(*r) + namelen, GFP_KERNEL);
63 if (r)
64 memset(r, 0, sizeof(*r) + namelen);
65 return r;
66}
67
68void free_rsb(struct dlm_rsb *r)
69{
70 if (r->res_lvbptr)
71 free_lvb(r->res_lvbptr);
72 kfree(r);
73}
74
75struct dlm_lkb *allocate_lkb(struct dlm_ls *ls)
76{
77 struct dlm_lkb *lkb;
78
79 lkb = kmem_cache_alloc(lkb_cache, GFP_KERNEL);
80 if (lkb)
81 memset(lkb, 0, sizeof(*lkb));
82 return lkb;
83}
84
85void free_lkb(struct dlm_lkb *lkb)
86{
87 if (lkb->lkb_flags & DLM_IFL_USER) {
88 struct dlm_user_args *ua;
89 ua = (struct dlm_user_args *)lkb->lkb_astparam;
90 if (ua) {
91 if (ua->lksb.sb_lvbptr)
92 kfree(ua->lksb.sb_lvbptr);
93 kfree(ua);
94 }
95 }
96 kmem_cache_free(lkb_cache, lkb);
97}
98
99struct dlm_direntry *allocate_direntry(struct dlm_ls *ls, int namelen)
100{
101 struct dlm_direntry *de;
102
103 DLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,);
104
105 de = kmalloc(sizeof(*de) + namelen, GFP_KERNEL);
106 if (de)
107 memset(de, 0, sizeof(*de) + namelen);
108 return de;
109}
110
111void free_direntry(struct dlm_direntry *de)
112{
113 kfree(de);
114}
115
diff --git a/fs/dlm/memory.h b/fs/dlm/memory.h
new file mode 100644
index 000000000000..6ead158ccc5c
--- /dev/null
+++ b/fs/dlm/memory.h
@@ -0,0 +1,29 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#ifndef __MEMORY_DOT_H__
15#define __MEMORY_DOT_H__
16
17int dlm_memory_init(void);
18void dlm_memory_exit(void);
19struct dlm_rsb *allocate_rsb(struct dlm_ls *ls, int namelen);
20void free_rsb(struct dlm_rsb *r);
21struct dlm_lkb *allocate_lkb(struct dlm_ls *ls);
22void free_lkb(struct dlm_lkb *l);
23struct dlm_direntry *allocate_direntry(struct dlm_ls *ls, int namelen);
24void free_direntry(struct dlm_direntry *de);
25char *allocate_lvb(struct dlm_ls *ls);
26void free_lvb(char *l);
27
28#endif /* __MEMORY_DOT_H__ */
29
diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c
new file mode 100644
index 000000000000..c9b1c3d535f4
--- /dev/null
+++ b/fs/dlm/midcomms.c
@@ -0,0 +1,140 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14/*
15 * midcomms.c
16 *
17 * This is the appallingly named "mid-level" comms layer.
18 *
 19 * Its purpose is to take buffers from the "real" comms layer,
 20 * split them up into complete messages and pass them to the
 21 * interested part of the locking mechanism.
22 *
23 * It also takes messages from the locking layer, formats them
24 * into packets and sends them to the comms layer.
25 */
26
27#include "dlm_internal.h"
28#include "lowcomms.h"
29#include "config.h"
30#include "rcom.h"
31#include "lock.h"
32#include "midcomms.h"
33
34
35static void copy_from_cb(void *dst, const void *base, unsigned offset,
36 unsigned len, unsigned limit)
37{
38 unsigned copy = len;
39
40 if ((copy + offset) > limit)
41 copy = limit - offset;
42 memcpy(dst, base + offset, copy);
43 len -= copy;
44 if (len)
45 memcpy(dst + copy, base, len);
46}
47
48/*
49 * Called from the low-level comms layer to process a buffer of
50 * commands.
51 *
52 * Only complete messages are processed here, any "spare" bytes from
53 * the end of a buffer are saved and tacked onto the front of the next
54 * message that comes in. I doubt this will happen very often but we
55 * need to be able to cope with it and I don't want the task to be waiting
56 * for packets to come in when there is useful work to be done.
57 */
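/* Implementation note: the incoming data is treated as a circular buffer
   of size 'limit'; copy_from_cb() copes with a message that wraps past the
   end, and 'offset' is masked with (limit - 1) below, so the buffer size
   is assumed to be a power of two. */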
58
59int dlm_process_incoming_buffer(int nodeid, const void *base,
60 unsigned offset, unsigned len, unsigned limit)
61{
62 unsigned char __tmp[DLM_INBUF_LEN];
63 struct dlm_header *msg = (struct dlm_header *) __tmp;
64 int ret = 0;
65 int err = 0;
66 uint16_t msglen;
67 uint32_t lockspace;
68
69 while (len > sizeof(struct dlm_header)) {
70
71 /* Copy just the header to check the total length. The
72 message may wrap around the end of the buffer back to the
73 start, so we need to use a temp buffer and copy_from_cb. */
74
75 copy_from_cb(msg, base, offset, sizeof(struct dlm_header),
76 limit);
77
78 msglen = le16_to_cpu(msg->h_length);
79 lockspace = msg->h_lockspace;
80
81 err = -EINVAL;
82 if (msglen < sizeof(struct dlm_header))
83 break;
84 err = -E2BIG;
85 if (msglen > dlm_config.buffer_size) {
86 log_print("message size %d from %d too big, buf len %d",
87 msglen, nodeid, len);
88 break;
89 }
90 err = 0;
91
92 /* If only part of the full message is contained in this
93 buffer, then do nothing and wait for lowcomms to call
94 us again later with more data. We return 0 meaning
95 we've consumed none of the input buffer. */
96
97 if (msglen > len)
98 break;
99
100 /* Allocate a larger temp buffer if the full message won't fit
101 in the buffer on the stack (which should work for most
102 ordinary messages). */
103
104 if (msglen > sizeof(__tmp) &&
105 msg == (struct dlm_header *) __tmp) {
106 msg = kmalloc(dlm_config.buffer_size, GFP_KERNEL);
107 if (msg == NULL)
108 return ret;
109 }
110
111 copy_from_cb(msg, base, offset, msglen, limit);
112
113 BUG_ON(lockspace != msg->h_lockspace);
114
115 ret += msglen;
116 offset += msglen;
117 offset &= (limit - 1);
118 len -= msglen;
119
120 switch (msg->h_cmd) {
121 case DLM_MSG:
122 dlm_receive_message(msg, nodeid, 0);
123 break;
124
125 case DLM_RCOM:
126 dlm_receive_rcom(msg, nodeid);
127 break;
128
129 default:
130 log_print("unknown msg type %x from %u: %u %u %u %u",
131 msg->h_cmd, nodeid, msglen, len, offset, ret);
132 }
133 }
134
135 if (msg != (struct dlm_header *) __tmp)
136 kfree(msg);
137
138 return err ? err : ret;
139}
140
diff --git a/fs/dlm/midcomms.h b/fs/dlm/midcomms.h
new file mode 100644
index 000000000000..95852a5f111d
--- /dev/null
+++ b/fs/dlm/midcomms.h
@@ -0,0 +1,21 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#ifndef __MIDCOMMS_DOT_H__
15#define __MIDCOMMS_DOT_H__
16
17int dlm_process_incoming_buffer(int nodeid, const void *base, unsigned offset,
18 unsigned len, unsigned limit);
19
20#endif /* __MIDCOMMS_DOT_H__ */
21
diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c
new file mode 100644
index 000000000000..55fbe313340e
--- /dev/null
+++ b/fs/dlm/rcom.c
@@ -0,0 +1,457 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#include "dlm_internal.h"
15#include "lockspace.h"
16#include "member.h"
17#include "lowcomms.h"
18#include "midcomms.h"
19#include "rcom.h"
20#include "recover.h"
21#include "dir.h"
22#include "config.h"
23#include "memory.h"
24#include "lock.h"
25#include "util.h"
26
27
28static int rcom_response(struct dlm_ls *ls)
29{
30 return test_bit(LSFL_RCOM_READY, &ls->ls_flags);
31}
32
33static int create_rcom(struct dlm_ls *ls, int to_nodeid, int type, int len,
34 struct dlm_rcom **rc_ret, struct dlm_mhandle **mh_ret)
35{
36 struct dlm_rcom *rc;
37 struct dlm_mhandle *mh;
38 char *mb;
39 int mb_len = sizeof(struct dlm_rcom) + len;
40
41 mh = dlm_lowcomms_get_buffer(to_nodeid, mb_len, GFP_KERNEL, &mb);
42 if (!mh) {
43 log_print("create_rcom to %d type %d len %d ENOBUFS",
44 to_nodeid, type, len);
45 return -ENOBUFS;
46 }
47 memset(mb, 0, mb_len);
48
49 rc = (struct dlm_rcom *) mb;
50
51 rc->rc_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
52 rc->rc_header.h_lockspace = ls->ls_global_id;
53 rc->rc_header.h_nodeid = dlm_our_nodeid();
54 rc->rc_header.h_length = mb_len;
55 rc->rc_header.h_cmd = DLM_RCOM;
56
57 rc->rc_type = type;
58
59 *mh_ret = mh;
60 *rc_ret = rc;
61 return 0;
62}
63
64static void send_rcom(struct dlm_ls *ls, struct dlm_mhandle *mh,
65 struct dlm_rcom *rc)
66{
67 dlm_rcom_out(rc);
68 dlm_lowcomms_commit_buffer(mh);
69}
70
71/* When replying to a status request, a node also sends back its
72 configuration values. The requesting node then checks that the remote
73 node is configured the same way as itself. */
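/* A configuration mismatch is treated as fatal for recovery: check_config()
   returns -EINVAL, which dlm_rcom_status() passes back to its caller. */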
74
75static void make_config(struct dlm_ls *ls, struct rcom_config *rf)
76{
77 rf->rf_lvblen = ls->ls_lvblen;
78 rf->rf_lsflags = ls->ls_exflags;
79}
80
81static int check_config(struct dlm_ls *ls, struct rcom_config *rf, int nodeid)
82{
83 if (rf->rf_lvblen != ls->ls_lvblen ||
84 rf->rf_lsflags != ls->ls_exflags) {
85 log_error(ls, "config mismatch: %d,%x nodeid %d: %d,%x",
86 ls->ls_lvblen, ls->ls_exflags,
87 nodeid, rf->rf_lvblen, rf->rf_lsflags);
88 return -EINVAL;
89 }
90 return 0;
91}
92
93int dlm_rcom_status(struct dlm_ls *ls, int nodeid)
94{
95 struct dlm_rcom *rc;
96 struct dlm_mhandle *mh;
97 int error = 0;
98
99 memset(ls->ls_recover_buf, 0, dlm_config.buffer_size);
100
101 if (nodeid == dlm_our_nodeid()) {
102 rc = (struct dlm_rcom *) ls->ls_recover_buf;
103 rc->rc_result = dlm_recover_status(ls);
104 goto out;
105 }
106
107 error = create_rcom(ls, nodeid, DLM_RCOM_STATUS, 0, &rc, &mh);
108 if (error)
109 goto out;
110
111 send_rcom(ls, mh, rc);
112
113 error = dlm_wait_function(ls, &rcom_response);
114 clear_bit(LSFL_RCOM_READY, &ls->ls_flags);
115 if (error)
116 goto out;
117
118 rc = (struct dlm_rcom *) ls->ls_recover_buf;
119
120 if (rc->rc_result == -ESRCH) {
121 /* we pretend the remote lockspace exists with 0 status */
122 log_debug(ls, "remote node %d not ready", nodeid);
123 rc->rc_result = 0;
124 } else
125 error = check_config(ls, (struct rcom_config *) rc->rc_buf,
126 nodeid);
127 /* the caller looks at rc_result for the remote recovery status */
128 out:
129 return error;
130}
131
132static void receive_rcom_status(struct dlm_ls *ls, struct dlm_rcom *rc_in)
133{
134 struct dlm_rcom *rc;
135 struct dlm_mhandle *mh;
136 int error, nodeid = rc_in->rc_header.h_nodeid;
137
138 error = create_rcom(ls, nodeid, DLM_RCOM_STATUS_REPLY,
139 sizeof(struct rcom_config), &rc, &mh);
140 if (error)
141 return;
142 rc->rc_result = dlm_recover_status(ls);
143 make_config(ls, (struct rcom_config *) rc->rc_buf);
144
145 send_rcom(ls, mh, rc);
146}
147
148static void receive_rcom_status_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
149{
150 memcpy(ls->ls_recover_buf, rc_in, rc_in->rc_header.h_length);
151 set_bit(LSFL_RCOM_READY, &ls->ls_flags);
152 wake_up(&ls->ls_wait_general);
153}
154
155int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name, int last_len)
156{
157 struct dlm_rcom *rc;
158 struct dlm_mhandle *mh;
159 int error = 0, len = sizeof(struct dlm_rcom);
160
161 memset(ls->ls_recover_buf, 0, dlm_config.buffer_size);
162
163 if (nodeid == dlm_our_nodeid()) {
164 dlm_copy_master_names(ls, last_name, last_len,
165 ls->ls_recover_buf + len,
166 dlm_config.buffer_size - len, nodeid);
167 goto out;
168 }
169
170 error = create_rcom(ls, nodeid, DLM_RCOM_NAMES, last_len, &rc, &mh);
171 if (error)
172 goto out;
173 memcpy(rc->rc_buf, last_name, last_len);
174
175 send_rcom(ls, mh, rc);
176
177 error = dlm_wait_function(ls, &rcom_response);
178 clear_bit(LSFL_RCOM_READY, &ls->ls_flags);
179 out:
180 return error;
181}
182
183static void receive_rcom_names(struct dlm_ls *ls, struct dlm_rcom *rc_in)
184{
185 struct dlm_rcom *rc;
186 struct dlm_mhandle *mh;
187 int error, inlen, outlen;
188 int nodeid = rc_in->rc_header.h_nodeid;
189 uint32_t status = dlm_recover_status(ls);
190
 191	/*
 192	 * We can't answer a NAMES request (dlm_copy_master_names below) while
 193	 * dlm_recoverd is still reconfiguring the member list (ls_nodes), hence
 194	 * the DLM_RS_NODES check. It could only happen in rare cases where we
 195	 * get a late NAMES message from a previous instance of recovery.
 196	 */
197
198 if (!(status & DLM_RS_NODES)) {
199 log_debug(ls, "ignoring RCOM_NAMES from %u", nodeid);
200 return;
201 }
202
203 nodeid = rc_in->rc_header.h_nodeid;
204 inlen = rc_in->rc_header.h_length - sizeof(struct dlm_rcom);
205 outlen = dlm_config.buffer_size - sizeof(struct dlm_rcom);
206
207 error = create_rcom(ls, nodeid, DLM_RCOM_NAMES_REPLY, outlen, &rc, &mh);
208 if (error)
209 return;
210
211 dlm_copy_master_names(ls, rc_in->rc_buf, inlen, rc->rc_buf, outlen,
212 nodeid);
213 send_rcom(ls, mh, rc);
214}
215
216static void receive_rcom_names_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
217{
218 memcpy(ls->ls_recover_buf, rc_in, rc_in->rc_header.h_length);
219 set_bit(LSFL_RCOM_READY, &ls->ls_flags);
220 wake_up(&ls->ls_wait_general);
221}
222
223int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid)
224{
225 struct dlm_rcom *rc;
226 struct dlm_mhandle *mh;
227 struct dlm_ls *ls = r->res_ls;
228 int error;
229
230 error = create_rcom(ls, dir_nodeid, DLM_RCOM_LOOKUP, r->res_length,
231 &rc, &mh);
232 if (error)
233 goto out;
234 memcpy(rc->rc_buf, r->res_name, r->res_length);
235 rc->rc_id = (unsigned long) r;
236
237 send_rcom(ls, mh, rc);
238 out:
239 return error;
240}
241
242static void receive_rcom_lookup(struct dlm_ls *ls, struct dlm_rcom *rc_in)
243{
244 struct dlm_rcom *rc;
245 struct dlm_mhandle *mh;
246 int error, ret_nodeid, nodeid = rc_in->rc_header.h_nodeid;
247 int len = rc_in->rc_header.h_length - sizeof(struct dlm_rcom);
248
249 error = create_rcom(ls, nodeid, DLM_RCOM_LOOKUP_REPLY, 0, &rc, &mh);
250 if (error)
251 return;
252
253 error = dlm_dir_lookup(ls, nodeid, rc_in->rc_buf, len, &ret_nodeid);
254 if (error)
255 ret_nodeid = error;
256 rc->rc_result = ret_nodeid;
257 rc->rc_id = rc_in->rc_id;
258
259 send_rcom(ls, mh, rc);
260}
261
262static void receive_rcom_lookup_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
263{
264 dlm_recover_master_reply(ls, rc_in);
265}
266
267static void pack_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb,
268 struct rcom_lock *rl)
269{
270 memset(rl, 0, sizeof(*rl));
271
272 rl->rl_ownpid = lkb->lkb_ownpid;
273 rl->rl_lkid = lkb->lkb_id;
274 rl->rl_exflags = lkb->lkb_exflags;
275 rl->rl_flags = lkb->lkb_flags;
276 rl->rl_lvbseq = lkb->lkb_lvbseq;
277 rl->rl_rqmode = lkb->lkb_rqmode;
278 rl->rl_grmode = lkb->lkb_grmode;
279 rl->rl_status = lkb->lkb_status;
280 rl->rl_wait_type = lkb->lkb_wait_type;
281
282 if (lkb->lkb_bastaddr)
283 rl->rl_asts |= AST_BAST;
284 if (lkb->lkb_astaddr)
285 rl->rl_asts |= AST_COMP;
286
287 rl->rl_namelen = r->res_length;
288 memcpy(rl->rl_name, r->res_name, r->res_length);
289
290 /* FIXME: might we have an lvb without DLM_LKF_VALBLK set ?
291 If so, receive_rcom_lock_args() won't take this copy. */
292
293 if (lkb->lkb_lvbptr)
294 memcpy(rl->rl_lvb, lkb->lkb_lvbptr, r->res_ls->ls_lvblen);
295}
296
297int dlm_send_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
298{
299 struct dlm_ls *ls = r->res_ls;
300 struct dlm_rcom *rc;
301 struct dlm_mhandle *mh;
302 struct rcom_lock *rl;
303 int error, len = sizeof(struct rcom_lock);
304
305 if (lkb->lkb_lvbptr)
306 len += ls->ls_lvblen;
307
308 error = create_rcom(ls, r->res_nodeid, DLM_RCOM_LOCK, len, &rc, &mh);
309 if (error)
310 goto out;
311
312 rl = (struct rcom_lock *) rc->rc_buf;
313 pack_rcom_lock(r, lkb, rl);
314 rc->rc_id = (unsigned long) r;
315
316 send_rcom(ls, mh, rc);
317 out:
318 return error;
319}
320
321static void receive_rcom_lock(struct dlm_ls *ls, struct dlm_rcom *rc_in)
322{
323 struct dlm_rcom *rc;
324 struct dlm_mhandle *mh;
325 int error, nodeid = rc_in->rc_header.h_nodeid;
326
327 dlm_recover_master_copy(ls, rc_in);
328
329 error = create_rcom(ls, nodeid, DLM_RCOM_LOCK_REPLY,
330 sizeof(struct rcom_lock), &rc, &mh);
331 if (error)
332 return;
333
334 /* We send back the same rcom_lock struct we received, but
335 dlm_recover_master_copy() has filled in rl_remid and rl_result */
336
337 memcpy(rc->rc_buf, rc_in->rc_buf, sizeof(struct rcom_lock));
338 rc->rc_id = rc_in->rc_id;
339
340 send_rcom(ls, mh, rc);
341}
342
343static void receive_rcom_lock_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
344{
345 uint32_t status = dlm_recover_status(ls);
346
347 if (!(status & DLM_RS_DIR)) {
348 log_debug(ls, "ignoring RCOM_LOCK_REPLY from %u",
349 rc_in->rc_header.h_nodeid);
350 return;
351 }
352
353 dlm_recover_process_copy(ls, rc_in);
354}
355
356static int send_ls_not_ready(int nodeid, struct dlm_rcom *rc_in)
357{
358 struct dlm_rcom *rc;
359 struct dlm_mhandle *mh;
360 char *mb;
361 int mb_len = sizeof(struct dlm_rcom);
362
363 mh = dlm_lowcomms_get_buffer(nodeid, mb_len, GFP_KERNEL, &mb);
364 if (!mh)
365 return -ENOBUFS;
366 memset(mb, 0, mb_len);
367
368 rc = (struct dlm_rcom *) mb;
369
370 rc->rc_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
371 rc->rc_header.h_lockspace = rc_in->rc_header.h_lockspace;
372 rc->rc_header.h_nodeid = dlm_our_nodeid();
373 rc->rc_header.h_length = mb_len;
374 rc->rc_header.h_cmd = DLM_RCOM;
375
376 rc->rc_type = DLM_RCOM_STATUS_REPLY;
377 rc->rc_result = -ESRCH;
378
379 dlm_rcom_out(rc);
380 dlm_lowcomms_commit_buffer(mh);
381
382 return 0;
383}
384
385/* Called by dlm_recvd; corresponds to dlm_receive_message() but special
386 recovery-only comms are sent through here. */
387
388void dlm_receive_rcom(struct dlm_header *hd, int nodeid)
389{
390 struct dlm_rcom *rc = (struct dlm_rcom *) hd;
391 struct dlm_ls *ls;
392
393 dlm_rcom_in(rc);
394
395 /* If the lockspace doesn't exist then still send a status message
396 back; it's possible that it just doesn't have its global_id yet. */
397
398 ls = dlm_find_lockspace_global(hd->h_lockspace);
399 if (!ls) {
400 log_print("lockspace %x from %d not found",
401 hd->h_lockspace, nodeid);
402 send_ls_not_ready(nodeid, rc);
403 return;
404 }
405
406 if (dlm_recovery_stopped(ls) && (rc->rc_type != DLM_RCOM_STATUS)) {
407 log_error(ls, "ignoring recovery message %x from %d",
408 rc->rc_type, nodeid);
409 goto out;
410 }
411
412 if (nodeid != rc->rc_header.h_nodeid) {
413 log_error(ls, "bad rcom nodeid %d from %d",
414 rc->rc_header.h_nodeid, nodeid);
415 goto out;
416 }
417
418 switch (rc->rc_type) {
419 case DLM_RCOM_STATUS:
420 receive_rcom_status(ls, rc);
421 break;
422
423 case DLM_RCOM_NAMES:
424 receive_rcom_names(ls, rc);
425 break;
426
427 case DLM_RCOM_LOOKUP:
428 receive_rcom_lookup(ls, rc);
429 break;
430
431 case DLM_RCOM_LOCK:
432 receive_rcom_lock(ls, rc);
433 break;
434
435 case DLM_RCOM_STATUS_REPLY:
436 receive_rcom_status_reply(ls, rc);
437 break;
438
439 case DLM_RCOM_NAMES_REPLY:
440 receive_rcom_names_reply(ls, rc);
441 break;
442
443 case DLM_RCOM_LOOKUP_REPLY:
444 receive_rcom_lookup_reply(ls, rc);
445 break;
446
447 case DLM_RCOM_LOCK_REPLY:
448 receive_rcom_lock_reply(ls, rc);
449 break;
450
451 default:
452 DLM_ASSERT(0, printk("rc_type=%x\n", rc->rc_type););
453 }
454 out:
455 dlm_put_lockspace(ls);
456}
457
diff --git a/fs/dlm/rcom.h b/fs/dlm/rcom.h
new file mode 100644
index 000000000000..d7984321ff41
--- /dev/null
+++ b/fs/dlm/rcom.h
@@ -0,0 +1,24 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#ifndef __RCOM_DOT_H__
15#define __RCOM_DOT_H__
16
17int dlm_rcom_status(struct dlm_ls *ls, int nodeid);
18int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name,int last_len);
19int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid);
20int dlm_send_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb);
21void dlm_receive_rcom(struct dlm_header *hd, int nodeid);
22
23#endif
24
diff --git a/fs/dlm/recover.c b/fs/dlm/recover.c
new file mode 100644
index 000000000000..34876f60f298
--- /dev/null
+++ b/fs/dlm/recover.c
@@ -0,0 +1,762 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#include "dlm_internal.h"
15#include "lockspace.h"
16#include "dir.h"
17#include "config.h"
18#include "ast.h"
19#include "memory.h"
20#include "rcom.h"
21#include "lock.h"
22#include "lowcomms.h"
23#include "member.h"
24#include "recover.h"
25
26
27/*
28 * Recovery waiting routines: these functions wait for a particular reply from
29 * a remote node, or for the remote node to report a certain status. They need
 30 * to abort if the lockspace is stopped, indicating a node has failed (perhaps
31 * the one being waited for).
32 */
33
34/*
 35 * (LSFL_RECOVERY_STOP set due to failure of a node in ls_nodes). When another
 36 * function thinks it could have completed the waited-on task, it should wake
 37 * up ls_wait_general to get an immediate response rather than waiting for the
38 * up ls_wait_general to get an immediate response rather than waiting for the
39 * timer to detect the result. A timer wakes us up periodically while waiting
40 * to see if we should abort due to a node failure. This should only be called
41 * by the dlm_recoverd thread.
42 */
43
44static void dlm_wait_timer_fn(unsigned long data)
45{
46 struct dlm_ls *ls = (struct dlm_ls *) data;
47 mod_timer(&ls->ls_timer, jiffies + (dlm_config.recover_timer * HZ));
48 wake_up(&ls->ls_wait_general);
49}
50
51int dlm_wait_function(struct dlm_ls *ls, int (*testfn) (struct dlm_ls *ls))
52{
53 int error = 0;
54
55 init_timer(&ls->ls_timer);
56 ls->ls_timer.function = dlm_wait_timer_fn;
57 ls->ls_timer.data = (long) ls;
58 ls->ls_timer.expires = jiffies + (dlm_config.recover_timer * HZ);
59 add_timer(&ls->ls_timer);
60
61 wait_event(ls->ls_wait_general, testfn(ls) || dlm_recovery_stopped(ls));
62 del_timer_sync(&ls->ls_timer);
63
64 if (dlm_recovery_stopped(ls)) {
65 log_debug(ls, "dlm_wait_function aborted");
66 error = -EINTR;
67 }
68 return error;
69}
70
71/*
72 * An efficient way for all nodes to wait for all others to have a certain
73 * status. The node with the lowest nodeid polls all the others for their
74 * status (wait_status_all) and all the others poll the node with the low id
75 * for its accumulated result (wait_status_low). When all nodes have set
76 * status flag X, then status flag X_ALL will be set on the low nodeid.
77 */
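/* The accumulated "X_ALL" flag is simply the per-node flag shifted left by
   one bit; see wait_status() below. */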
78
79uint32_t dlm_recover_status(struct dlm_ls *ls)
80{
81 uint32_t status;
82 spin_lock(&ls->ls_recover_lock);
83 status = ls->ls_recover_status;
84 spin_unlock(&ls->ls_recover_lock);
85 return status;
86}
87
88void dlm_set_recover_status(struct dlm_ls *ls, uint32_t status)
89{
90 spin_lock(&ls->ls_recover_lock);
91 ls->ls_recover_status |= status;
92 spin_unlock(&ls->ls_recover_lock);
93}
94
95static int wait_status_all(struct dlm_ls *ls, uint32_t wait_status)
96{
97 struct dlm_rcom *rc = (struct dlm_rcom *) ls->ls_recover_buf;
98 struct dlm_member *memb;
99 int error = 0, delay;
100
101 list_for_each_entry(memb, &ls->ls_nodes, list) {
102 delay = 0;
103 for (;;) {
104 if (dlm_recovery_stopped(ls)) {
105 error = -EINTR;
106 goto out;
107 }
108
109 error = dlm_rcom_status(ls, memb->nodeid);
110 if (error)
111 goto out;
112
113 if (rc->rc_result & wait_status)
114 break;
115 if (delay < 1000)
116 delay += 20;
117 msleep(delay);
118 }
119 }
120 out:
121 return error;
122}
123
124static int wait_status_low(struct dlm_ls *ls, uint32_t wait_status)
125{
126 struct dlm_rcom *rc = (struct dlm_rcom *) ls->ls_recover_buf;
127 int error = 0, delay = 0, nodeid = ls->ls_low_nodeid;
128
129 for (;;) {
130 if (dlm_recovery_stopped(ls)) {
131 error = -EINTR;
132 goto out;
133 }
134
135 error = dlm_rcom_status(ls, nodeid);
136 if (error)
137 break;
138
139 if (rc->rc_result & wait_status)
140 break;
141 if (delay < 1000)
142 delay += 20;
143 msleep(delay);
144 }
145 out:
146 return error;
147}
148
149static int wait_status(struct dlm_ls *ls, uint32_t status)
150{
151 uint32_t status_all = status << 1;
152 int error;
153
154 if (ls->ls_low_nodeid == dlm_our_nodeid()) {
155 error = wait_status_all(ls, status);
156 if (!error)
157 dlm_set_recover_status(ls, status_all);
158 } else
159 error = wait_status_low(ls, status_all);
160
161 return error;
162}
163
164int dlm_recover_members_wait(struct dlm_ls *ls)
165{
166 return wait_status(ls, DLM_RS_NODES);
167}
168
169int dlm_recover_directory_wait(struct dlm_ls *ls)
170{
171 return wait_status(ls, DLM_RS_DIR);
172}
173
174int dlm_recover_locks_wait(struct dlm_ls *ls)
175{
176 return wait_status(ls, DLM_RS_LOCKS);
177}
178
179int dlm_recover_done_wait(struct dlm_ls *ls)
180{
181 return wait_status(ls, DLM_RS_DONE);
182}
183
184/*
185 * The recover_list contains all the rsb's for which we've requested the new
186 * master nodeid. As replies are returned from the resource directories the
187 * rsb's are removed from the list. When the list is empty we're done.
188 *
189 * The recover_list is later similarly used for all rsb's for which we've sent
190 * new lkb's and need to receive new corresponding lkid's.
191 *
192 * We use the address of the rsb struct as a simple local identifier for the
193 * rsb so we can match an rcom reply with the rsb it was sent for.
194 */
195
196static int recover_list_empty(struct dlm_ls *ls)
197{
198 int empty;
199
200 spin_lock(&ls->ls_recover_list_lock);
201 empty = list_empty(&ls->ls_recover_list);
202 spin_unlock(&ls->ls_recover_list_lock);
203
204 return empty;
205}
206
207static void recover_list_add(struct dlm_rsb *r)
208{
209 struct dlm_ls *ls = r->res_ls;
210
211 spin_lock(&ls->ls_recover_list_lock);
212 if (list_empty(&r->res_recover_list)) {
213 list_add_tail(&r->res_recover_list, &ls->ls_recover_list);
214 ls->ls_recover_list_count++;
215 dlm_hold_rsb(r);
216 }
217 spin_unlock(&ls->ls_recover_list_lock);
218}
219
220static void recover_list_del(struct dlm_rsb *r)
221{
222 struct dlm_ls *ls = r->res_ls;
223
224 spin_lock(&ls->ls_recover_list_lock);
225 list_del_init(&r->res_recover_list);
226 ls->ls_recover_list_count--;
227 spin_unlock(&ls->ls_recover_list_lock);
228
229 dlm_put_rsb(r);
230}
231
232static struct dlm_rsb *recover_list_find(struct dlm_ls *ls, uint64_t id)
233{
234 struct dlm_rsb *r = NULL;
235
236 spin_lock(&ls->ls_recover_list_lock);
237
238 list_for_each_entry(r, &ls->ls_recover_list, res_recover_list) {
239 if (id == (unsigned long) r)
240 goto out;
241 }
242 r = NULL;
243 out:
244 spin_unlock(&ls->ls_recover_list_lock);
245 return r;
246}
247
248static void recover_list_clear(struct dlm_ls *ls)
249{
250 struct dlm_rsb *r, *s;
251
252 spin_lock(&ls->ls_recover_list_lock);
253 list_for_each_entry_safe(r, s, &ls->ls_recover_list, res_recover_list) {
254 list_del_init(&r->res_recover_list);
255 dlm_put_rsb(r);
256 ls->ls_recover_list_count--;
257 }
258
259 if (ls->ls_recover_list_count != 0) {
260 log_error(ls, "warning: recover_list_count %d",
261 ls->ls_recover_list_count);
262 ls->ls_recover_list_count = 0;
263 }
264 spin_unlock(&ls->ls_recover_list_lock);
265}
266
267
268/* Master recovery: find new master node for rsb's that were
269 mastered on nodes that have been removed.
270
271 dlm_recover_masters
272 recover_master
273 dlm_send_rcom_lookup -> receive_rcom_lookup
274 dlm_dir_lookup
275 receive_rcom_lookup_reply <-
276 dlm_recover_master_reply
277 set_new_master
278 set_master_lkbs
279 set_lock_master
280*/
281
282/*
283 * Set the lock master for all LKBs in a lock queue
284 * If we are the new master of the rsb, we may have received new
285 * MSTCPY locks from other nodes already which we need to ignore
286 * when setting the new nodeid.
287 */
288
289static void set_lock_master(struct list_head *queue, int nodeid)
290{
291 struct dlm_lkb *lkb;
292
293 list_for_each_entry(lkb, queue, lkb_statequeue)
294 if (!(lkb->lkb_flags & DLM_IFL_MSTCPY))
295 lkb->lkb_nodeid = nodeid;
296}
297
298static void set_master_lkbs(struct dlm_rsb *r)
299{
300 set_lock_master(&r->res_grantqueue, r->res_nodeid);
301 set_lock_master(&r->res_convertqueue, r->res_nodeid);
302 set_lock_master(&r->res_waitqueue, r->res_nodeid);
303}
304
305/*
 306 * Propagate the new master nodeid to locks
307 * The NEW_MASTER flag tells dlm_recover_locks() which rsb's to consider.
308 * The NEW_MASTER2 flag tells recover_lvb() which rsb's to consider.
309 */
310
311static void set_new_master(struct dlm_rsb *r, int nodeid)
312{
313 lock_rsb(r);
314 r->res_nodeid = nodeid;
315 set_master_lkbs(r);
316 rsb_set_flag(r, RSB_NEW_MASTER);
317 rsb_set_flag(r, RSB_NEW_MASTER2);
318 unlock_rsb(r);
319}
320
321/*
322 * We do async lookups on rsb's that need new masters. The rsb's
323 * waiting for a lookup reply are kept on the recover_list.
324 */
325
326static int recover_master(struct dlm_rsb *r)
327{
328 struct dlm_ls *ls = r->res_ls;
329 int error, dir_nodeid, ret_nodeid, our_nodeid = dlm_our_nodeid();
330
331 dir_nodeid = dlm_dir_nodeid(r);
332
333 if (dir_nodeid == our_nodeid) {
334 error = dlm_dir_lookup(ls, our_nodeid, r->res_name,
335 r->res_length, &ret_nodeid);
336 if (error)
337 log_error(ls, "recover dir lookup error %d", error);
338
339 if (ret_nodeid == our_nodeid)
340 ret_nodeid = 0;
341 set_new_master(r, ret_nodeid);
342 } else {
343 recover_list_add(r);
344 error = dlm_send_rcom_lookup(r, dir_nodeid);
345 }
346
347 return error;
348}
349
350/*
351 * When not using a directory, most resource names will hash to a new static
352 * master nodeid and the resource will need to be remastered.
353 */
354
355static int recover_master_static(struct dlm_rsb *r)
356{
357 int master = dlm_dir_nodeid(r);
358
359 if (master == dlm_our_nodeid())
360 master = 0;
361
362 if (r->res_nodeid != master) {
363 if (is_master(r))
364 dlm_purge_mstcpy_locks(r);
365 set_new_master(r, master);
366 return 1;
367 }
368 return 0;
369}
370
371/*
372 * Go through local root resources and for each rsb which has a master which
373 * has departed, get the new master nodeid from the directory. The dir will
374 * assign mastery to the first node to look up the new master. That means
375 * we'll discover in this lookup if we're the new master of any rsb's.
376 *
377 * We fire off all the dir lookup requests individually and asynchronously to
378 * the correct dir node.
379 */
380
381int dlm_recover_masters(struct dlm_ls *ls)
382{
383 struct dlm_rsb *r;
384 int error = 0, count = 0;
385
386 log_debug(ls, "dlm_recover_masters");
387
388 down_read(&ls->ls_root_sem);
389 list_for_each_entry(r, &ls->ls_root_list, res_root_list) {
390 if (dlm_recovery_stopped(ls)) {
391 up_read(&ls->ls_root_sem);
392 error = -EINTR;
393 goto out;
394 }
395
396 if (dlm_no_directory(ls))
397 count += recover_master_static(r);
398 else if (!is_master(r) && dlm_is_removed(ls, r->res_nodeid)) {
399 recover_master(r);
400 count++;
401 }
402
403 schedule();
404 }
405 up_read(&ls->ls_root_sem);
406
407 log_debug(ls, "dlm_recover_masters %d resources", count);
408
409 error = dlm_wait_function(ls, &recover_list_empty);
410 out:
411 if (error)
412 recover_list_clear(ls);
413 return error;
414}
415
416int dlm_recover_master_reply(struct dlm_ls *ls, struct dlm_rcom *rc)
417{
418 struct dlm_rsb *r;
419 int nodeid;
420
421 r = recover_list_find(ls, rc->rc_id);
422 if (!r) {
423 log_error(ls, "dlm_recover_master_reply no id %llx",
424 (unsigned long long)rc->rc_id);
425 goto out;
426 }
427
428 nodeid = rc->rc_result;
429 if (nodeid == dlm_our_nodeid())
430 nodeid = 0;
431
432 set_new_master(r, nodeid);
433 recover_list_del(r);
434
435 if (recover_list_empty(ls))
436 wake_up(&ls->ls_wait_general);
437 out:
438 return 0;
439}
440
441
442/* Lock recovery: rebuild the process-copy locks we hold on a
443 remastered rsb on the new rsb master.
444
445 dlm_recover_locks
446 recover_locks
447 recover_locks_queue
448 dlm_send_rcom_lock -> receive_rcom_lock
449 dlm_recover_master_copy
450 receive_rcom_lock_reply <-
451 dlm_recover_process_copy
452*/
453
454
455/*
456 * keep a count of the number of lkb's we send to the new master; when we get
457 * an equal number of replies then recovery for the rsb is done
458 */
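/* res_recover_locks_count is incremented in recover_locks_queue() below for
   each lkb sent and decremented in dlm_recovered_lock() as replies come back. */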
459
460static int recover_locks_queue(struct dlm_rsb *r, struct list_head *head)
461{
462 struct dlm_lkb *lkb;
463 int error = 0;
464
465 list_for_each_entry(lkb, head, lkb_statequeue) {
466 error = dlm_send_rcom_lock(r, lkb);
467 if (error)
468 break;
469 r->res_recover_locks_count++;
470 }
471
472 return error;
473}
474
475static int all_queues_empty(struct dlm_rsb *r)
476{
477 if (!list_empty(&r->res_grantqueue) ||
478 !list_empty(&r->res_convertqueue) ||
479 !list_empty(&r->res_waitqueue))
480 return 0;
481 return 1;
482}
483
484static int recover_locks(struct dlm_rsb *r)
485{
486 int error = 0;
487
488 lock_rsb(r);
489 if (all_queues_empty(r))
490 goto out;
491
492 DLM_ASSERT(!r->res_recover_locks_count, dlm_print_rsb(r););
493
494 error = recover_locks_queue(r, &r->res_grantqueue);
495 if (error)
496 goto out;
497 error = recover_locks_queue(r, &r->res_convertqueue);
498 if (error)
499 goto out;
500 error = recover_locks_queue(r, &r->res_waitqueue);
501 if (error)
502 goto out;
503
504 if (r->res_recover_locks_count)
505 recover_list_add(r);
506 else
507 rsb_clear_flag(r, RSB_NEW_MASTER);
508 out:
509 unlock_rsb(r);
510 return error;
511}
512
513int dlm_recover_locks(struct dlm_ls *ls)
514{
515 struct dlm_rsb *r;
516 int error, count = 0;
517
518 log_debug(ls, "dlm_recover_locks");
519
520 down_read(&ls->ls_root_sem);
521 list_for_each_entry(r, &ls->ls_root_list, res_root_list) {
522 if (is_master(r)) {
523 rsb_clear_flag(r, RSB_NEW_MASTER);
524 continue;
525 }
526
527 if (!rsb_flag(r, RSB_NEW_MASTER))
528 continue;
529
530 if (dlm_recovery_stopped(ls)) {
531 error = -EINTR;
532 up_read(&ls->ls_root_sem);
533 goto out;
534 }
535
536 error = recover_locks(r);
537 if (error) {
538 up_read(&ls->ls_root_sem);
539 goto out;
540 }
541
542 count += r->res_recover_locks_count;
543 }
544 up_read(&ls->ls_root_sem);
545
546 log_debug(ls, "dlm_recover_locks %d locks", count);
547
548 error = dlm_wait_function(ls, &recover_list_empty);
549 out:
550 if (error)
551 recover_list_clear(ls);
552 else
553 dlm_set_recover_status(ls, DLM_RS_LOCKS);
554 return error;
555}
556
557void dlm_recovered_lock(struct dlm_rsb *r)
558{
559 DLM_ASSERT(rsb_flag(r, RSB_NEW_MASTER), dlm_print_rsb(r););
560
561 r->res_recover_locks_count--;
562 if (!r->res_recover_locks_count) {
563 rsb_clear_flag(r, RSB_NEW_MASTER);
564 recover_list_del(r);
565 }
566
567 if (recover_list_empty(r->res_ls))
568 wake_up(&r->res_ls->ls_wait_general);
569}
570
571/*
572 * The lvb needs to be recovered on all master rsb's. This includes setting
573 * the VALNOTVALID flag if necessary, and determining the correct lvb contents
574 * based on the lvb's of the locks held on the rsb.
575 *
576 * RSB_VALNOTVALID is set if there are only NL/CR locks on the rsb. If it
577 * was already set prior to recovery, it's not cleared, regardless of locks.
578 *
579 * The LVB contents are only considered for changing when this is a new master
580 * of the rsb (NEW_MASTER2). Then, the rsb's lvb is taken from any lkb with
581 * mode > CR. If no lkb's exist with mode above CR, the lvb contents are taken
582 * from the lkb with the largest lvb sequence number.
583 */
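/* lvb sequence numbers are compared using a signed difference so that
   lkb_lvbseq wrapping around 2^32 is handled correctly. */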
584
585static void recover_lvb(struct dlm_rsb *r)
586{
587 struct dlm_lkb *lkb, *high_lkb = NULL;
588 uint32_t high_seq = 0;
589 int lock_lvb_exists = 0;
590 int big_lock_exists = 0;
591 int lvblen = r->res_ls->ls_lvblen;
592
593 list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) {
594 if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
595 continue;
596
597 lock_lvb_exists = 1;
598
599 if (lkb->lkb_grmode > DLM_LOCK_CR) {
600 big_lock_exists = 1;
601 goto setflag;
602 }
603
604 if (((int)lkb->lkb_lvbseq - (int)high_seq) >= 0) {
605 high_lkb = lkb;
606 high_seq = lkb->lkb_lvbseq;
607 }
608 }
609
610 list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue) {
611 if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
612 continue;
613
614 lock_lvb_exists = 1;
615
616 if (lkb->lkb_grmode > DLM_LOCK_CR) {
617 big_lock_exists = 1;
618 goto setflag;
619 }
620
621 if (((int)lkb->lkb_lvbseq - (int)high_seq) >= 0) {
622 high_lkb = lkb;
623 high_seq = lkb->lkb_lvbseq;
624 }
625 }
626
627 setflag:
628 if (!lock_lvb_exists)
629 goto out;
630
631 if (!big_lock_exists)
632 rsb_set_flag(r, RSB_VALNOTVALID);
633
634 /* don't mess with the lvb unless we're the new master */
635 if (!rsb_flag(r, RSB_NEW_MASTER2))
636 goto out;
637
638 if (!r->res_lvbptr) {
639 r->res_lvbptr = allocate_lvb(r->res_ls);
640 if (!r->res_lvbptr)
641 goto out;
642 }
643
644 if (big_lock_exists) {
645 r->res_lvbseq = lkb->lkb_lvbseq;
646 memcpy(r->res_lvbptr, lkb->lkb_lvbptr, lvblen);
647 } else if (high_lkb) {
648 r->res_lvbseq = high_lkb->lkb_lvbseq;
649 memcpy(r->res_lvbptr, high_lkb->lkb_lvbptr, lvblen);
650 } else {
651 r->res_lvbseq = 0;
652 memset(r->res_lvbptr, 0, lvblen);
653 }
654 out:
655 return;
656}
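/*
 * Illustrative user-space sketch, not part of the patch: the signed
 * subtraction used in recover_lvb() above is meant as a wraparound-
 * tolerant "newer or equal" test on the 32-bit lvb sequence numbers,
 * the same serial-number idiom used for TCP sequence numbers.  The
 * helper name below is hypothetical.
 */
#include <stdint.h>
#include <stdio.h>

/* nonzero if sequence a is as new as or newer than b, modulo 2^32 */
static int lvbseq_newer_or_equal(uint32_t a, uint32_t b)
{
	return (int32_t)(a - b) >= 0;
}

int main(void)
{
	/* 0x00000002 is newer than 0xfffffffe even though it is
	 * numerically smaller, because the counter has wrapped */
	printf("%d\n", lvbseq_newer_or_equal(0x00000002, 0xfffffffe)); /* 1 */
	printf("%d\n", lvbseq_newer_or_equal(0xfffffffe, 0x00000002)); /* 0 */
	return 0;
}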
657
658/* All master rsb's flagged RECOVER_CONVERT need to be looked at. The locks
659 converting PR->CW or CW->PR need to have their lkb_grmode set. */
660
661static void recover_conversion(struct dlm_rsb *r)
662{
663 struct dlm_lkb *lkb;
664 int grmode = -1;
665
666 list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) {
667 if (lkb->lkb_grmode == DLM_LOCK_PR ||
668 lkb->lkb_grmode == DLM_LOCK_CW) {
669 grmode = lkb->lkb_grmode;
670 break;
671 }
672 }
673
674 list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue) {
675 if (lkb->lkb_grmode != DLM_LOCK_IV)
676 continue;
677 if (grmode == -1)
678 lkb->lkb_grmode = lkb->lkb_rqmode;
679 else
680 lkb->lkb_grmode = grmode;
681 }
682}
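/*
 * Illustrative user-space sketch, not part of the patch: the rule that
 * recover_conversion() applies is small enough to state on its own -- a
 * rebuilt converting lock whose granted mode was lost (IV) inherits a
 * PR/CW mode already granted on the resource, and otherwise falls back
 * to its own requested mode.  The enum values are simplified stand-ins
 * for the real DLM lock modes.
 */
#include <stdio.h>

enum mode { IV = -1, NL, CR, CW, PR, EX };

static enum mode recovered_grmode(enum mode granted_pr_cw, enum mode rqmode)
{
	return (granted_pr_cw == IV) ? rqmode : granted_pr_cw;
}

int main(void)
{
	printf("%d\n", recovered_grmode(PR, CW)); /* inherits PR */
	printf("%d\n", recovered_grmode(IV, CW)); /* falls back to CW */
	return 0;
}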
683
684void dlm_recover_rsbs(struct dlm_ls *ls)
685{
686 struct dlm_rsb *r;
687 int count = 0;
688
689 log_debug(ls, "dlm_recover_rsbs");
690
691 down_read(&ls->ls_root_sem);
692 list_for_each_entry(r, &ls->ls_root_list, res_root_list) {
693 lock_rsb(r);
694 if (is_master(r)) {
695 if (rsb_flag(r, RSB_RECOVER_CONVERT))
696 recover_conversion(r);
697 recover_lvb(r);
698 count++;
699 }
700 rsb_clear_flag(r, RSB_RECOVER_CONVERT);
701 unlock_rsb(r);
702 }
703 up_read(&ls->ls_root_sem);
704
705 log_debug(ls, "dlm_recover_rsbs %d rsbs", count);
706}
707
708/* Create a single list of all root rsb's to be used during recovery */
709
710int dlm_create_root_list(struct dlm_ls *ls)
711{
712 struct dlm_rsb *r;
713 int i, error = 0;
714
715 down_write(&ls->ls_root_sem);
716 if (!list_empty(&ls->ls_root_list)) {
717 log_error(ls, "root list not empty");
718 error = -EINVAL;
719 goto out;
720 }
721
722 for (i = 0; i < ls->ls_rsbtbl_size; i++) {
723 read_lock(&ls->ls_rsbtbl[i].lock);
724 list_for_each_entry(r, &ls->ls_rsbtbl[i].list, res_hashchain) {
725 list_add(&r->res_root_list, &ls->ls_root_list);
726 dlm_hold_rsb(r);
727 }
728 read_unlock(&ls->ls_rsbtbl[i].lock);
729 }
730 out:
731 up_write(&ls->ls_root_sem);
732 return error;
733}
734
735void dlm_release_root_list(struct dlm_ls *ls)
736{
737 struct dlm_rsb *r, *safe;
738
739 down_write(&ls->ls_root_sem);
740 list_for_each_entry_safe(r, safe, &ls->ls_root_list, res_root_list) {
741 list_del_init(&r->res_root_list);
742 dlm_put_rsb(r);
743 }
744 up_write(&ls->ls_root_sem);
745}
746
747void dlm_clear_toss_list(struct dlm_ls *ls)
748{
749 struct dlm_rsb *r, *safe;
750 int i;
751
752 for (i = 0; i < ls->ls_rsbtbl_size; i++) {
753 write_lock(&ls->ls_rsbtbl[i].lock);
754 list_for_each_entry_safe(r, safe, &ls->ls_rsbtbl[i].toss,
755 res_hashchain) {
756 list_del(&r->res_hashchain);
757 free_rsb(r);
758 }
759 write_unlock(&ls->ls_rsbtbl[i].lock);
760 }
761}
762
diff --git a/fs/dlm/recover.h b/fs/dlm/recover.h
new file mode 100644
index 000000000000..ebd0363f1e08
--- /dev/null
+++ b/fs/dlm/recover.h
@@ -0,0 +1,34 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#ifndef __RECOVER_DOT_H__
15#define __RECOVER_DOT_H__
16
17int dlm_wait_function(struct dlm_ls *ls, int (*testfn) (struct dlm_ls *ls));
18uint32_t dlm_recover_status(struct dlm_ls *ls);
19void dlm_set_recover_status(struct dlm_ls *ls, uint32_t status);
20int dlm_recover_members_wait(struct dlm_ls *ls);
21int dlm_recover_directory_wait(struct dlm_ls *ls);
22int dlm_recover_locks_wait(struct dlm_ls *ls);
23int dlm_recover_done_wait(struct dlm_ls *ls);
24int dlm_recover_masters(struct dlm_ls *ls);
25int dlm_recover_master_reply(struct dlm_ls *ls, struct dlm_rcom *rc);
26int dlm_recover_locks(struct dlm_ls *ls);
27void dlm_recovered_lock(struct dlm_rsb *r);
28int dlm_create_root_list(struct dlm_ls *ls);
29void dlm_release_root_list(struct dlm_ls *ls);
30void dlm_clear_toss_list(struct dlm_ls *ls);
31void dlm_recover_rsbs(struct dlm_ls *ls);
32
33#endif /* __RECOVER_DOT_H__ */
34
diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c
new file mode 100644
index 000000000000..70103533677d
--- /dev/null
+++ b/fs/dlm/recoverd.c
@@ -0,0 +1,285 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#include "dlm_internal.h"
15#include "lockspace.h"
16#include "member.h"
17#include "dir.h"
18#include "ast.h"
19#include "recover.h"
20#include "lowcomms.h"
21#include "lock.h"
22#include "requestqueue.h"
23#include "recoverd.h"
24
25
26/* If the start for which we're re-enabling locking (seq) has been superseded
27 by a newer stop (ls_recover_seq), we need to leave locking disabled. */
28
29static int enable_locking(struct dlm_ls *ls, uint64_t seq)
30{
31 int error = -EINTR;
32
33 spin_lock(&ls->ls_recover_lock);
34 if (ls->ls_recover_seq == seq) {
35 set_bit(LSFL_RUNNING, &ls->ls_flags);
36 up_write(&ls->ls_in_recovery);
37 error = 0;
38 }
39 spin_unlock(&ls->ls_recover_lock);
40 return error;
41}
42
43static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
44{
45 unsigned long start;
46 int error, neg = 0;
47
48 log_debug(ls, "recover %llx", rv->seq);
49
50 mutex_lock(&ls->ls_recoverd_active);
51
52 /*
53 * Suspending and resuming dlm_astd ensures that no lkb's from this ls
54 * will be processed by dlm_astd during recovery.
55 */
56
57 dlm_astd_suspend();
58 dlm_astd_resume();
59
60 /*
61 * This list of root rsb's will be the basis of most of the recovery
62 * routines.
63 */
64
65 dlm_create_root_list(ls);
66
67 /*
68 * Free all the tossed rsb's so we don't have to recover them.
69 */
70
71 dlm_clear_toss_list(ls);
72
73 /*
74 * Add or remove nodes from the lockspace's ls_nodes list.
75 * Also waits for all nodes to complete dlm_recover_members.
76 */
77
78 error = dlm_recover_members(ls, rv, &neg);
79 if (error) {
80 log_error(ls, "recover_members failed %d", error);
81 goto fail;
82 }
83 start = jiffies;
84
85 /*
86 * Rebuild our own share of the directory by collecting from all other
87 * nodes their master rsb names that hash to us.
88 */
89
90 error = dlm_recover_directory(ls);
91 if (error) {
92 log_error(ls, "recover_directory failed %d", error);
93 goto fail;
94 }
95
96 /*
97 * Purge directory-related requests that are saved in requestqueue.
98 * All dir requests from before recovery are invalid now due to the dir
99 * rebuild and will be resent by the requesting nodes.
100 */
101
102 dlm_purge_requestqueue(ls);
103
104 /*
105 * Wait for all nodes to complete directory rebuild.
106 */
107
108 error = dlm_recover_directory_wait(ls);
109 if (error) {
110 log_error(ls, "recover_directory_wait failed %d", error);
111 goto fail;
112 }
113
114 /*
115 * We may have outstanding operations that are waiting for a reply from
116 * a failed node. Mark these to be resent after recovery. Unlock and
117 * cancel ops can just be completed.
118 */
119
120 dlm_recover_waiters_pre(ls);
121
122 error = dlm_recovery_stopped(ls);
123 if (error)
124 goto fail;
125
126 if (neg || dlm_no_directory(ls)) {
127 /*
128 * Clear lkb's for departed nodes.
129 */
130
131 dlm_purge_locks(ls);
132
133 /*
134 * Get new master nodeid's for rsb's that were mastered on
135 * departed nodes.
136 */
137
138 error = dlm_recover_masters(ls);
139 if (error) {
140 log_error(ls, "recover_masters failed %d", error);
141 goto fail;
142 }
143
144 /*
145 * Send our locks on remastered rsb's to the new masters.
146 */
147
148 error = dlm_recover_locks(ls);
149 if (error) {
150 log_error(ls, "recover_locks failed %d", error);
151 goto fail;
152 }
153
154 error = dlm_recover_locks_wait(ls);
155 if (error) {
156 log_error(ls, "recover_locks_wait failed %d", error);
157 goto fail;
158 }
159
160 /*
161 * Finalize state in master rsb's now that all locks can be
162 * checked. This includes conversion resolution and lvb
163 * settings.
164 */
165
166 dlm_recover_rsbs(ls);
167 }
168
169 dlm_release_root_list(ls);
170
171 dlm_set_recover_status(ls, DLM_RS_DONE);
172 error = dlm_recover_done_wait(ls);
173 if (error) {
174 log_error(ls, "recover_done_wait failed %d", error);
175 goto fail;
176 }
177
178 dlm_clear_members_gone(ls);
179
180 error = enable_locking(ls, rv->seq);
181 if (error) {
182 log_error(ls, "enable_locking failed %d", error);
183 goto fail;
184 }
185
186 error = dlm_process_requestqueue(ls);
187 if (error) {
188 log_error(ls, "process_requestqueue failed %d", error);
189 goto fail;
190 }
191
192 error = dlm_recover_waiters_post(ls);
193 if (error) {
194 log_error(ls, "recover_waiters_post failed %d", error);
195 goto fail;
196 }
197
198 dlm_grant_after_purge(ls);
199
200 dlm_astd_wake();
201
202 log_debug(ls, "recover %llx done: %u ms", rv->seq,
203 jiffies_to_msecs(jiffies - start));
204 mutex_unlock(&ls->ls_recoverd_active);
205
206 return 0;
207
208 fail:
209 dlm_release_root_list(ls);
210 log_debug(ls, "recover %llx error %d", rv->seq, error);
211 mutex_unlock(&ls->ls_recoverd_active);
212 return error;
213}
214
215static void do_ls_recovery(struct dlm_ls *ls)
216{
217 struct dlm_recover *rv = NULL;
218
219 spin_lock(&ls->ls_recover_lock);
220 rv = ls->ls_recover_args;
221 ls->ls_recover_args = NULL;
222 clear_bit(LSFL_RECOVERY_STOP, &ls->ls_flags);
223 spin_unlock(&ls->ls_recover_lock);
224
225 if (rv) {
226 ls_recover(ls, rv);
227 kfree(rv->nodeids);
228 kfree(rv);
229 }
230}
231
232static int dlm_recoverd(void *arg)
233{
234 struct dlm_ls *ls;
235
236 ls = dlm_find_lockspace_local(arg);
237
238 while (!kthread_should_stop()) {
239 set_current_state(TASK_INTERRUPTIBLE);
240 if (!test_bit(LSFL_WORK, &ls->ls_flags))
241 schedule();
242 set_current_state(TASK_RUNNING);
243
244 if (test_and_clear_bit(LSFL_WORK, &ls->ls_flags))
245 do_ls_recovery(ls);
246 }
247
248 dlm_put_lockspace(ls);
249 return 0;
250}
251
252void dlm_recoverd_kick(struct dlm_ls *ls)
253{
254 set_bit(LSFL_WORK, &ls->ls_flags);
255 wake_up_process(ls->ls_recoverd_task);
256}
257
258int dlm_recoverd_start(struct dlm_ls *ls)
259{
260 struct task_struct *p;
261 int error = 0;
262
263 p = kthread_run(dlm_recoverd, ls, "dlm_recoverd");
264 if (IS_ERR(p))
265 error = PTR_ERR(p);
266 else
267 ls->ls_recoverd_task = p;
268 return error;
269}
270
271void dlm_recoverd_stop(struct dlm_ls *ls)
272{
273 kthread_stop(ls->ls_recoverd_task);
274}
275
276void dlm_recoverd_suspend(struct dlm_ls *ls)
277{
278 mutex_lock(&ls->ls_recoverd_active);
279}
280
281void dlm_recoverd_resume(struct dlm_ls *ls)
282{
283 mutex_unlock(&ls->ls_recoverd_active);
284}
285
diff --git a/fs/dlm/recoverd.h b/fs/dlm/recoverd.h
new file mode 100644
index 000000000000..866657c5d69d
--- /dev/null
+++ b/fs/dlm/recoverd.h
@@ -0,0 +1,24 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#ifndef __RECOVERD_DOT_H__
15#define __RECOVERD_DOT_H__
16
17void dlm_recoverd_kick(struct dlm_ls *ls);
18void dlm_recoverd_stop(struct dlm_ls *ls);
19int dlm_recoverd_start(struct dlm_ls *ls);
20void dlm_recoverd_suspend(struct dlm_ls *ls);
21void dlm_recoverd_resume(struct dlm_ls *ls);
22
23#endif /* __RECOVERD_DOT_H__ */
24
diff --git a/fs/dlm/requestqueue.c b/fs/dlm/requestqueue.c
new file mode 100644
index 000000000000..7b2b089634a2
--- /dev/null
+++ b/fs/dlm/requestqueue.c
@@ -0,0 +1,184 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13#include "dlm_internal.h"
14#include "member.h"
15#include "lock.h"
16#include "dir.h"
17#include "config.h"
18#include "requestqueue.h"
19
20struct rq_entry {
21 struct list_head list;
22 int nodeid;
23 char request[1];
24};
25
26/*
27 * Requests received while the lockspace is in recovery get added to the
28 * request queue and processed when recovery is complete. This happens when
29 * the lockspace is suspended on some nodes before it is on others, or the
30 * lockspace is enabled on some while still suspended on others.
31 */
32
33void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_header *hd)
34{
35 struct rq_entry *e;
36 int length = hd->h_length;
37
38 if (dlm_is_removed(ls, nodeid))
39 return;
40
41 e = kmalloc(sizeof(struct rq_entry) + length, GFP_KERNEL);
42 if (!e) {
43 log_print("dlm_add_requestqueue: out of memory\n");
44 return;
45 }
46
47 e->nodeid = nodeid;
48 memcpy(e->request, hd, length);
49
50 mutex_lock(&ls->ls_requestqueue_mutex);
51 list_add_tail(&e->list, &ls->ls_requestqueue);
52 mutex_unlock(&ls->ls_requestqueue_mutex);
53}
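/*
 * Illustrative user-space sketch, not part of the patch: rq_entry above
 * uses the classic one-element trailing array so that a single
 * allocation holds both the list bookkeeping and a copy of the
 * variable-length message.  The names below are hypothetical.
 */
#include <stdlib.h>
#include <string.h>

struct saved_request {
	struct saved_request *next;
	int nodeid;
	char request[1];		/* message bytes follow the struct */
};

static struct saved_request *save_request(int nodeid, const void *msg,
					  int length)
{
	/* one allocation: header plus the message payload */
	struct saved_request *e = malloc(sizeof(struct saved_request) + length);

	if (!e)
		return NULL;
	e->next = NULL;
	e->nodeid = nodeid;
	memcpy(e->request, msg, length);
	return e;
}

int main(void)
{
	const char msg[] = "hello";
	struct saved_request *e = save_request(3, msg, sizeof(msg));

	return e ? 0 : 1;
}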
54
55int dlm_process_requestqueue(struct dlm_ls *ls)
56{
57 struct rq_entry *e;
58 struct dlm_header *hd;
59 int error = 0;
60
61 mutex_lock(&ls->ls_requestqueue_mutex);
62
63 for (;;) {
64 if (list_empty(&ls->ls_requestqueue)) {
65 mutex_unlock(&ls->ls_requestqueue_mutex);
66 error = 0;
67 break;
68 }
69 e = list_entry(ls->ls_requestqueue.next, struct rq_entry, list);
70 mutex_unlock(&ls->ls_requestqueue_mutex);
71
72 hd = (struct dlm_header *) e->request;
73 error = dlm_receive_message(hd, e->nodeid, 1);
74
75 if (error == -EINTR) {
76 /* entry is left on requestqueue */
77 log_debug(ls, "process_requestqueue abort eintr");
78 break;
79 }
80
81 mutex_lock(&ls->ls_requestqueue_mutex);
82 list_del(&e->list);
83 kfree(e);
84
85 if (dlm_locking_stopped(ls)) {
86 log_debug(ls, "process_requestqueue abort running");
87 mutex_unlock(&ls->ls_requestqueue_mutex);
88 error = -EINTR;
89 break;
90 }
91 schedule();
92 }
93
94 return error;
95}
96
97/*
98 * After recovery is done, locking is resumed and dlm_recoverd takes all the
99 * saved requests and processes them as dlm_recvd would have. At
100 * the same time, dlm_recvd will start receiving new requests from remote
101 * nodes. We want to delay dlm_recvd processing new requests until
102 * dlm_recoverd has finished processing the old saved requests.
103 */
104
105void dlm_wait_requestqueue(struct dlm_ls *ls)
106{
107 for (;;) {
108 mutex_lock(&ls->ls_requestqueue_mutex);
109 if (list_empty(&ls->ls_requestqueue))
110 break;
111 if (dlm_locking_stopped(ls))
112 break;
113 mutex_unlock(&ls->ls_requestqueue_mutex);
114 schedule();
115 }
116 mutex_unlock(&ls->ls_requestqueue_mutex);
117}
118
119static int purge_request(struct dlm_ls *ls, struct dlm_message *ms, int nodeid)
120{
121 uint32_t type = ms->m_type;
122
123 if (dlm_is_removed(ls, nodeid))
124 return 1;
125
126 /* directory operations are always purged because the directory is
127 always rebuilt during recovery and the lookups resent */
128
129 if (type == DLM_MSG_REMOVE ||
130 type == DLM_MSG_LOOKUP ||
131 type == DLM_MSG_LOOKUP_REPLY)
132 return 1;
133
134 if (!dlm_no_directory(ls))
135 return 0;
136
137 /* with no directory, the master is likely to change as a part of
138 recovery; requests to/from the defunct master need to be purged */
139
140 switch (type) {
141 case DLM_MSG_REQUEST:
142 case DLM_MSG_CONVERT:
143 case DLM_MSG_UNLOCK:
144 case DLM_MSG_CANCEL:
145 /* we're no longer the master of this resource, the sender
146 will resend to the new master (see waiter_needs_recovery) */
147
148 if (dlm_hash2nodeid(ls, ms->m_hash) != dlm_our_nodeid())
149 return 1;
150 break;
151
152 case DLM_MSG_REQUEST_REPLY:
153 case DLM_MSG_CONVERT_REPLY:
154 case DLM_MSG_UNLOCK_REPLY:
155 case DLM_MSG_CANCEL_REPLY:
156 case DLM_MSG_GRANT:
157 /* this reply is from the former master of the resource,
158 we'll resend to the new master if needed */
159
160 if (dlm_hash2nodeid(ls, ms->m_hash) != nodeid)
161 return 1;
162 break;
163 }
164
165 return 0;
166}
167
168void dlm_purge_requestqueue(struct dlm_ls *ls)
169{
170 struct dlm_message *ms;
171 struct rq_entry *e, *safe;
172
173 mutex_lock(&ls->ls_requestqueue_mutex);
174 list_for_each_entry_safe(e, safe, &ls->ls_requestqueue, list) {
175 ms = (struct dlm_message *) e->request;
176
177 if (purge_request(ls, ms, e->nodeid)) {
178 list_del(&e->list);
179 kfree(e);
180 }
181 }
182 mutex_unlock(&ls->ls_requestqueue_mutex);
183}
184
diff --git a/fs/dlm/requestqueue.h b/fs/dlm/requestqueue.h
new file mode 100644
index 000000000000..349f0d292d95
--- /dev/null
+++ b/fs/dlm/requestqueue.h
@@ -0,0 +1,22 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13#ifndef __REQUESTQUEUE_DOT_H__
14#define __REQUESTQUEUE_DOT_H__
15
16void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_header *hd);
17int dlm_process_requestqueue(struct dlm_ls *ls);
18void dlm_wait_requestqueue(struct dlm_ls *ls);
19void dlm_purge_requestqueue(struct dlm_ls *ls);
20
21#endif
22
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
new file mode 100644
index 000000000000..1f05960a916f
--- /dev/null
+++ b/fs/dlm/user.c
@@ -0,0 +1,769 @@
1/*
2 * Copyright (C) 2006 Red Hat, Inc. All rights reserved.
3 *
4 * This copyrighted material is made available to anyone wishing to use,
5 * modify, copy, or redistribute it subject to the terms and conditions
6 * of the GNU General Public License v.2.
7 */
8
9#include <linux/miscdevice.h>
10#include <linux/init.h>
11#include <linux/wait.h>
12#include <linux/module.h>
13#include <linux/file.h>
14#include <linux/fs.h>
15#include <linux/poll.h>
16#include <linux/signal.h>
17#include <linux/spinlock.h>
18#include <linux/dlm.h>
19#include <linux/dlm_device.h>
20
21#include "dlm_internal.h"
22#include "lockspace.h"
23#include "lock.h"
24#include "lvb_table.h"
25
26static const char *name_prefix="dlm";
27static struct miscdevice ctl_device;
28static struct file_operations device_fops;
29
30#ifdef CONFIG_COMPAT
31
32struct dlm_lock_params32 {
33 __u8 mode;
34 __u8 namelen;
35 __u16 flags;
36 __u32 lkid;
37 __u32 parent;
38
39 __u32 castparam;
40 __u32 castaddr;
41 __u32 bastparam;
42 __u32 bastaddr;
43 __u32 lksb;
44
45 char lvb[DLM_USER_LVB_LEN];
46 char name[0];
47};
48
49struct dlm_write_request32 {
50 __u32 version[3];
51 __u8 cmd;
52 __u8 is64bit;
53 __u8 unused[2];
54
55 union {
56 struct dlm_lock_params32 lock;
57 struct dlm_lspace_params lspace;
58 } i;
59};
60
61struct dlm_lksb32 {
62 __u32 sb_status;
63 __u32 sb_lkid;
64 __u8 sb_flags;
65 __u32 sb_lvbptr;
66};
67
68struct dlm_lock_result32 {
69 __u32 length;
70 __u32 user_astaddr;
71 __u32 user_astparam;
72 __u32 user_lksb;
73 struct dlm_lksb32 lksb;
74 __u8 bast_mode;
75 __u8 unused[3];
76 /* Offsets may be zero if no data is present */
77 __u32 lvb_offset;
78};
79
80static void compat_input(struct dlm_write_request *kb,
81 struct dlm_write_request32 *kb32)
82{
83 kb->version[0] = kb32->version[0];
84 kb->version[1] = kb32->version[1];
85 kb->version[2] = kb32->version[2];
86
87 kb->cmd = kb32->cmd;
88 kb->is64bit = kb32->is64bit;
89 if (kb->cmd == DLM_USER_CREATE_LOCKSPACE ||
90 kb->cmd == DLM_USER_REMOVE_LOCKSPACE) {
91 kb->i.lspace.flags = kb32->i.lspace.flags;
92 kb->i.lspace.minor = kb32->i.lspace.minor;
93 strcpy(kb->i.lspace.name, kb32->i.lspace.name);
94 } else {
95 kb->i.lock.mode = kb32->i.lock.mode;
96 kb->i.lock.namelen = kb32->i.lock.namelen;
97 kb->i.lock.flags = kb32->i.lock.flags;
98 kb->i.lock.lkid = kb32->i.lock.lkid;
99 kb->i.lock.parent = kb32->i.lock.parent;
100 kb->i.lock.castparam = (void *)(long)kb32->i.lock.castparam;
101 kb->i.lock.castaddr = (void *)(long)kb32->i.lock.castaddr;
102 kb->i.lock.bastparam = (void *)(long)kb32->i.lock.bastparam;
103 kb->i.lock.bastaddr = (void *)(long)kb32->i.lock.bastaddr;
104 kb->i.lock.lksb = (void *)(long)kb32->i.lock.lksb;
105 memcpy(kb->i.lock.lvb, kb32->i.lock.lvb, DLM_USER_LVB_LEN);
106 memcpy(kb->i.lock.name, kb32->i.lock.name, kb->i.lock.namelen);
107 }
108}
109
110static void compat_output(struct dlm_lock_result *res,
111 struct dlm_lock_result32 *res32)
112{
113 res32->length = res->length - (sizeof(struct dlm_lock_result) -
114 sizeof(struct dlm_lock_result32));
115 res32->user_astaddr = (__u32)(long)res->user_astaddr;
116 res32->user_astparam = (__u32)(long)res->user_astparam;
117 res32->user_lksb = (__u32)(long)res->user_lksb;
118 res32->bast_mode = res->bast_mode;
119
120 res32->lvb_offset = res->lvb_offset;
121 res32->length = res->length;
122
123 res32->lksb.sb_status = res->lksb.sb_status;
124 res32->lksb.sb_flags = res->lksb.sb_flags;
125 res32->lksb.sb_lkid = res->lksb.sb_lkid;
126 res32->lksb.sb_lvbptr = (__u32)(long)res->lksb.sb_lvbptr;
127}
128#endif
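/*
 * Illustrative user-space sketch, not part of the patch: compat_input()
 * and compat_output() above carry 32-bit userspace pointers in __u32
 * fields and widen or narrow them with casts through long.  This shows
 * the same round trip with a stand-in object; on a 64-bit ABI the
 * narrowing is lossy, which is exactly why it is only done for values
 * that originated in a 32-bit process.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	int lksb;			/* stand-in for a user's lksb */
	uint32_t wire32;
	void *kernel_view;

	/* a 32-bit process hands the kernel a 32-bit value ... */
	wire32 = (uint32_t)(long)&lksb;

	/* ... which the kernel widens back into a usable pointer */
	kernel_view = (void *)(long)wire32;

	printf("round trip %s\n",
	       kernel_view == (void *)&lksb ? "ok" : "lossy on this ABI");
	return 0;
}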
129
130
131void dlm_user_add_ast(struct dlm_lkb *lkb, int type)
132{
133 struct dlm_ls *ls;
134 struct dlm_user_args *ua;
135 struct dlm_user_proc *proc;
136
137 /* dlm_clear_proc_locks() sets ORPHAN/DEAD flag on each
138 lkb before dealing with it. We need to check this
139 flag before taking ls_clear_proc_locks mutex because if
140 it's set, dlm_clear_proc_locks() holds the mutex. */
141
142 if (lkb->lkb_flags & (DLM_IFL_ORPHAN | DLM_IFL_DEAD)) {
143 /* log_print("user_add_ast skip1 %x", lkb->lkb_flags); */
144 return;
145 }
146
147 ls = lkb->lkb_resource->res_ls;
148 mutex_lock(&ls->ls_clear_proc_locks);
149
150 /* If ORPHAN/DEAD flag is set, it means the process is dead so an ast
151 can't be delivered. For ORPHAN's, dlm_clear_proc_locks() freed
152 lkb->ua so we can't try to use it. */
153
154 if (lkb->lkb_flags & (DLM_IFL_ORPHAN | DLM_IFL_DEAD)) {
155 /* log_print("user_add_ast skip2 %x", lkb->lkb_flags); */
156 goto out;
157 }
158
159 DLM_ASSERT(lkb->lkb_astparam, dlm_print_lkb(lkb););
160 ua = (struct dlm_user_args *)lkb->lkb_astparam;
161 proc = ua->proc;
162
163 if (type == AST_BAST && ua->bastaddr == NULL)
164 goto out;
165
166 spin_lock(&proc->asts_spin);
167 if (!(lkb->lkb_ast_type & (AST_COMP | AST_BAST))) {
168 kref_get(&lkb->lkb_ref);
169 list_add_tail(&lkb->lkb_astqueue, &proc->asts);
170 lkb->lkb_ast_type |= type;
171 wake_up_interruptible(&proc->wait);
172 }
173
174 /* We want to copy the lvb to userspace when the completion
175 ast is read if the status is 0, the lock has an lvb and
176 lvb_ops says we should. We could probably have set_lvb_lock()
177 set update_user_lvb instead and not need old_mode */
178
179 if ((lkb->lkb_ast_type & AST_COMP) &&
180 (lkb->lkb_lksb->sb_status == 0) &&
181 lkb->lkb_lksb->sb_lvbptr &&
182 dlm_lvb_operations[ua->old_mode + 1][lkb->lkb_grmode + 1])
183 ua->update_user_lvb = 1;
184 else
185 ua->update_user_lvb = 0;
186
187 spin_unlock(&proc->asts_spin);
188 out:
189 mutex_unlock(&ls->ls_clear_proc_locks);
190}
191
192static int device_user_lock(struct dlm_user_proc *proc,
193 struct dlm_lock_params *params)
194{
195 struct dlm_ls *ls;
196 struct dlm_user_args *ua;
197 int error = -ENOMEM;
198
199 ls = dlm_find_lockspace_local(proc->lockspace);
200 if (!ls)
201 return -ENOENT;
202
203 if (!params->castaddr || !params->lksb) {
204 error = -EINVAL;
205 goto out;
206 }
207
208 ua = kzalloc(sizeof(struct dlm_user_args), GFP_KERNEL);
209 if (!ua)
210 goto out;
211 ua->proc = proc;
212 ua->user_lksb = params->lksb;
213 ua->castparam = params->castparam;
214 ua->castaddr = params->castaddr;
215 ua->bastparam = params->bastparam;
216 ua->bastaddr = params->bastaddr;
217
218 if (params->flags & DLM_LKF_CONVERT)
219 error = dlm_user_convert(ls, ua,
220 params->mode, params->flags,
221 params->lkid, params->lvb);
222 else {
223 error = dlm_user_request(ls, ua,
224 params->mode, params->flags,
225 params->name, params->namelen,
226 params->parent);
227 if (!error)
228 error = ua->lksb.sb_lkid;
229 }
230 out:
231 dlm_put_lockspace(ls);
232 return error;
233}
234
235static int device_user_unlock(struct dlm_user_proc *proc,
236 struct dlm_lock_params *params)
237{
238 struct dlm_ls *ls;
239 struct dlm_user_args *ua;
240 int error = -ENOMEM;
241
242 ls = dlm_find_lockspace_local(proc->lockspace);
243 if (!ls)
244 return -ENOENT;
245
246 ua = kzalloc(sizeof(struct dlm_user_args), GFP_KERNEL);
247 if (!ua)
248 goto out;
249 ua->proc = proc;
250 ua->user_lksb = params->lksb;
251 ua->castparam = params->castparam;
252 ua->castaddr = params->castaddr;
253
254 if (params->flags & DLM_LKF_CANCEL)
255 error = dlm_user_cancel(ls, ua, params->flags, params->lkid);
256 else
257 error = dlm_user_unlock(ls, ua, params->flags, params->lkid,
258 params->lvb);
259 out:
260 dlm_put_lockspace(ls);
261 return error;
262}
263
264static int device_create_lockspace(struct dlm_lspace_params *params)
265{
266 dlm_lockspace_t *lockspace;
267 struct dlm_ls *ls;
268 int error, len;
269
270 if (!capable(CAP_SYS_ADMIN))
271 return -EPERM;
272
273 error = dlm_new_lockspace(params->name, strlen(params->name),
274 &lockspace, 0, DLM_USER_LVB_LEN);
275 if (error)
276 return error;
277
278 ls = dlm_find_lockspace_local(lockspace);
279 if (!ls)
280 return -ENOENT;
281
282 error = -ENOMEM;
283 len = strlen(params->name) + strlen(name_prefix) + 2;
284 ls->ls_device.name = kzalloc(len, GFP_KERNEL);
285 if (!ls->ls_device.name)
286 goto fail;
287 snprintf((char *)ls->ls_device.name, len, "%s_%s", name_prefix,
288 params->name);
289 ls->ls_device.fops = &device_fops;
290 ls->ls_device.minor = MISC_DYNAMIC_MINOR;
291
292 error = misc_register(&ls->ls_device);
293 if (error) {
294 kfree(ls->ls_device.name);
295 goto fail;
296 }
297
298 error = ls->ls_device.minor;
299 dlm_put_lockspace(ls);
300 return error;
301
302 fail:
303 dlm_put_lockspace(ls);
304 dlm_release_lockspace(lockspace, 0);
305 return error;
306}
307
308static int device_remove_lockspace(struct dlm_lspace_params *params)
309{
310 dlm_lockspace_t *lockspace;
311 struct dlm_ls *ls;
312 int error;
313
314 if (!capable(CAP_SYS_ADMIN))
315 return -EPERM;
316
317 ls = dlm_find_lockspace_device(params->minor);
318 if (!ls)
319 return -ENOENT;
320
321 error = misc_deregister(&ls->ls_device);
322 if (error) {
323 dlm_put_lockspace(ls);
324 goto out;
325 }
326 kfree(ls->ls_device.name);
327
328 lockspace = ls->ls_local_handle;
329
330 /* dlm_release_lockspace waits for references to go to zero,
331 so all processes will need to close their device for the ls
332 before the release will proceed */
333
334 dlm_put_lockspace(ls);
335 error = dlm_release_lockspace(lockspace, 0);
336out:
337 return error;
338}
339
340/* Check the user's version matches ours */
341static int check_version(struct dlm_write_request *req)
342{
343 if (req->version[0] != DLM_DEVICE_VERSION_MAJOR ||
344 (req->version[0] == DLM_DEVICE_VERSION_MAJOR &&
345 req->version[1] > DLM_DEVICE_VERSION_MINOR)) {
346
347 printk(KERN_DEBUG "dlm: process %s (%d) version mismatch "
348 "user (%d.%d.%d) kernel (%d.%d.%d)\n",
349 current->comm,
350 current->pid,
351 req->version[0],
352 req->version[1],
353 req->version[2],
354 DLM_DEVICE_VERSION_MAJOR,
355 DLM_DEVICE_VERSION_MINOR,
356 DLM_DEVICE_VERSION_PATCH);
357 return -EINVAL;
358 }
359 return 0;
360}
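/*
 * Illustrative user-space sketch, not part of the patch: check_version()
 * above accepts a request only if the major version matches exactly and
 * the minor version is no newer than what the kernel implements.  The
 * version numbers below are made up for the example.
 */
#include <stdio.h>

#define KERNEL_MAJOR 5
#define KERNEL_MINOR 1

static int version_ok(int major, int minor)
{
	return major == KERNEL_MAJOR && minor <= KERNEL_MINOR;
}

int main(void)
{
	printf("%d %d %d\n",
	       version_ok(5, 0),	/* older minor: accepted */
	       version_ok(5, 2),	/* newer minor: rejected */
	       version_ok(4, 1));	/* wrong major: rejected */
	return 0;
}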
361
362/*
363 * device_write
364 *
365 * device_user_lock
366 * dlm_user_request -> request_lock
367 * dlm_user_convert -> convert_lock
368 *
369 * device_user_unlock
370 * dlm_user_unlock -> unlock_lock
371 * dlm_user_cancel -> cancel_lock
372 *
373 * device_create_lockspace
374 * dlm_new_lockspace
375 *
376 * device_remove_lockspace
377 * dlm_release_lockspace
378 */
379
380/* a write to a lockspace device is a lock or unlock request, a write
381 to the control device is to create/remove a lockspace */
382
383static ssize_t device_write(struct file *file, const char __user *buf,
384 size_t count, loff_t *ppos)
385{
386 struct dlm_user_proc *proc = file->private_data;
387 struct dlm_write_request *kbuf;
388 sigset_t tmpsig, allsigs;
389 int error;
390
391#ifdef CONFIG_COMPAT
392 if (count < sizeof(struct dlm_write_request32))
393#else
394 if (count < sizeof(struct dlm_write_request))
395#endif
396 return -EINVAL;
397
398 kbuf = kmalloc(count, GFP_KERNEL);
399 if (!kbuf)
400 return -ENOMEM;
401
402 if (copy_from_user(kbuf, buf, count)) {
403 error = -EFAULT;
404 goto out_free;
405 }
406
407 if (check_version(kbuf)) {
408 error = -EBADE;
409 goto out_free;
410 }
411
412#ifdef CONFIG_COMPAT
413 if (!kbuf->is64bit) {
414 struct dlm_write_request32 *k32buf;
415 k32buf = (struct dlm_write_request32 *)kbuf;
416 kbuf = kmalloc(count + (sizeof(struct dlm_write_request) -
417 sizeof(struct dlm_write_request32)), GFP_KERNEL);
418 if (!kbuf)
419 return -ENOMEM;
420
421 if (proc)
422 set_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags);
423 compat_input(kbuf, k32buf);
424 kfree(k32buf);
425 }
426#endif
427
428 /* do we really need this? can a write happen after a close? */
429 if ((kbuf->cmd == DLM_USER_LOCK || kbuf->cmd == DLM_USER_UNLOCK) &&
430 test_bit(DLM_PROC_FLAGS_CLOSING, &proc->flags))
431 return -EINVAL;
432
433 sigfillset(&allsigs);
434 sigprocmask(SIG_BLOCK, &allsigs, &tmpsig);
435
436 error = -EINVAL;
437
438 switch (kbuf->cmd)
439 {
440 case DLM_USER_LOCK:
441 if (!proc) {
442 log_print("no locking on control device");
443 goto out_sig;
444 }
445 error = device_user_lock(proc, &kbuf->i.lock);
446 break;
447
448 case DLM_USER_UNLOCK:
449 if (!proc) {
450 log_print("no locking on control device");
451 goto out_sig;
452 }
453 error = device_user_unlock(proc, &kbuf->i.lock);
454 break;
455
456 case DLM_USER_CREATE_LOCKSPACE:
457 if (proc) {
458 log_print("create/remove only on control device");
459 goto out_sig;
460 }
461 error = device_create_lockspace(&kbuf->i.lspace);
462 break;
463
464 case DLM_USER_REMOVE_LOCKSPACE:
465 if (proc) {
466 log_print("create/remove only on control device");
467 goto out_sig;
468 }
469 error = device_remove_lockspace(&kbuf->i.lspace);
470 break;
471
472 default:
473 log_print("Unknown command passed to DLM device : %d\n",
474 kbuf->cmd);
475 }
476
477 out_sig:
478 sigprocmask(SIG_SETMASK, &tmpsig, NULL);
479 recalc_sigpending();
480 out_free:
481 kfree(kbuf);
482 return error;
483}
484
485/* Every process that opens the lockspace device has its own "proc" structure
486 hanging off the open file that's used to keep track of locks owned by the
487 process and asts that need to be delivered to the process. */
488
489static int device_open(struct inode *inode, struct file *file)
490{
491 struct dlm_user_proc *proc;
492 struct dlm_ls *ls;
493
494 ls = dlm_find_lockspace_device(iminor(inode));
495 if (!ls)
496 return -ENOENT;
497
498 proc = kzalloc(sizeof(struct dlm_user_proc), GFP_KERNEL);
499 if (!proc) {
500 dlm_put_lockspace(ls);
501 return -ENOMEM;
502 }
503
504 proc->lockspace = ls->ls_local_handle;
505 INIT_LIST_HEAD(&proc->asts);
506 INIT_LIST_HEAD(&proc->locks);
507 spin_lock_init(&proc->asts_spin);
508 spin_lock_init(&proc->locks_spin);
509 init_waitqueue_head(&proc->wait);
510 file->private_data = proc;
511
512 return 0;
513}
514
515static int device_close(struct inode *inode, struct file *file)
516{
517 struct dlm_user_proc *proc = file->private_data;
518 struct dlm_ls *ls;
519 sigset_t tmpsig, allsigs;
520
521 ls = dlm_find_lockspace_local(proc->lockspace);
522 if (!ls)
523 return -ENOENT;
524
525 sigfillset(&allsigs);
526 sigprocmask(SIG_BLOCK, &allsigs, &tmpsig);
527
528 set_bit(DLM_PROC_FLAGS_CLOSING, &proc->flags);
529
530 dlm_clear_proc_locks(ls, proc);
531
532 /* at this point no more lkb's should exist for this lockspace,
533 so there's no chance of dlm_user_add_ast() being called and
534 looking for lkb->ua->proc */
535
536 kfree(proc);
537 file->private_data = NULL;
538
539 dlm_put_lockspace(ls);
540 dlm_put_lockspace(ls); /* for the find in device_open() */
541
542 /* FIXME: AUTOFREE: if this ls is no longer used do
543 device_remove_lockspace() */
544
545 sigprocmask(SIG_SETMASK, &tmpsig, NULL);
546 recalc_sigpending();
547
548 return 0;
549}
550
551static int copy_result_to_user(struct dlm_user_args *ua, int compat, int type,
552 int bmode, char __user *buf, size_t count)
553{
554#ifdef CONFIG_COMPAT
555 struct dlm_lock_result32 result32;
556#endif
557 struct dlm_lock_result result;
558 void *resultptr;
559 int error=0;
560 int len;
561 int struct_len;
562
563 memset(&result, 0, sizeof(struct dlm_lock_result));
564 memcpy(&result.lksb, &ua->lksb, sizeof(struct dlm_lksb));
565 result.user_lksb = ua->user_lksb;
566
567 /* FIXME: dlm1 provides for the user's bastparam/addr to not be updated
568 in a conversion unless the conversion is successful. See code
569 in dlm_user_convert() for updating ua from ua_tmp. OpenVMS, though,
570 notes that a new blocking AST address and parameter are set even if
571 the conversion fails, so maybe we should just do that. */
572
573 if (type == AST_BAST) {
574 result.user_astaddr = ua->bastaddr;
575 result.user_astparam = ua->bastparam;
576 result.bast_mode = bmode;
577 } else {
578 result.user_astaddr = ua->castaddr;
579 result.user_astparam = ua->castparam;
580 }
581
582#ifdef CONFIG_COMPAT
583 if (compat)
584 len = sizeof(struct dlm_lock_result32);
585 else
586#endif
587 len = sizeof(struct dlm_lock_result);
588 struct_len = len;
589
590 /* copy lvb to userspace if there is one, it's been updated, and
591 the user buffer has space for it */
592
593 if (ua->update_user_lvb && ua->lksb.sb_lvbptr &&
594 count >= len + DLM_USER_LVB_LEN) {
595 if (copy_to_user(buf+len, ua->lksb.sb_lvbptr,
596 DLM_USER_LVB_LEN)) {
597 error = -EFAULT;
598 goto out;
599 }
600
601 result.lvb_offset = len;
602 len += DLM_USER_LVB_LEN;
603 }
604
605 result.length = len;
606 resultptr = &result;
607#ifdef CONFIG_COMPAT
608 if (compat) {
609 compat_output(&result, &result32);
610 resultptr = &result32;
611 }
612#endif
613
614 if (copy_to_user(buf, resultptr, struct_len))
615 error = -EFAULT;
616 else
617 error = len;
618 out:
619 return error;
620}
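/*
 * Illustrative user-space sketch, not part of the patch: a read() from
 * the lockspace device returns one fixed-size result structure,
 * optionally followed by the LVB.  copy_result_to_user() above records
 * where the LVB starts in lvb_offset and the total size in length; the
 * struct below is a simplified stand-in for struct dlm_lock_result.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define LVB_LEN 32

struct result_hdr {
	uint32_t length;	/* total bytes: header plus optional lvb */
	uint32_t lvb_offset;	/* 0 if no lvb follows the header */
};

static void parse_result(const char *buf)
{
	struct result_hdr hdr;
	char lvb[LVB_LEN];

	memcpy(&hdr, buf, sizeof(hdr));
	if (hdr.lvb_offset) {
		memcpy(lvb, buf + hdr.lvb_offset, LVB_LEN);
		printf("got lvb, first byte %d\n", lvb[0]);
	} else {
		printf("no lvb in this result\n");
	}
}

int main(void)
{
	char buf[sizeof(struct result_hdr) + LVB_LEN] = { 0 };
	struct result_hdr hdr = {
		.length = sizeof(buf),
		.lvb_offset = sizeof(struct result_hdr),
	};

	memcpy(buf, &hdr, sizeof(hdr));
	buf[hdr.lvb_offset] = 42;
	parse_result(buf);
	return 0;
}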
621
622/* a read returns a single ast described in a struct dlm_lock_result */
623
624static ssize_t device_read(struct file *file, char __user *buf, size_t count,
625 loff_t *ppos)
626{
627 struct dlm_user_proc *proc = file->private_data;
628 struct dlm_lkb *lkb;
629 struct dlm_user_args *ua;
630 DECLARE_WAITQUEUE(wait, current);
631 int error, type=0, bmode=0, removed = 0;
632
633#ifdef CONFIG_COMPAT
634 if (count < sizeof(struct dlm_lock_result32))
635#else
636 if (count < sizeof(struct dlm_lock_result))
637#endif
638 return -EINVAL;
639
640 /* do we really need this? can a read happen after a close? */
641 if (test_bit(DLM_PROC_FLAGS_CLOSING, &proc->flags))
642 return -EINVAL;
643
644 spin_lock(&proc->asts_spin);
645 if (list_empty(&proc->asts)) {
646 if (file->f_flags & O_NONBLOCK) {
647 spin_unlock(&proc->asts_spin);
648 return -EAGAIN;
649 }
650
651 add_wait_queue(&proc->wait, &wait);
652
653 repeat:
654 set_current_state(TASK_INTERRUPTIBLE);
655 if (list_empty(&proc->asts) && !signal_pending(current)) {
656 spin_unlock(&proc->asts_spin);
657 schedule();
658 spin_lock(&proc->asts_spin);
659 goto repeat;
660 }
661 set_current_state(TASK_RUNNING);
662 remove_wait_queue(&proc->wait, &wait);
663
664 if (signal_pending(current)) {
665 spin_unlock(&proc->asts_spin);
666 return -ERESTARTSYS;
667 }
668 }
669
670 if (list_empty(&proc->asts)) {
671 spin_unlock(&proc->asts_spin);
672 return -EAGAIN;
673 }
674
675 /* there may be both completion and blocking asts to return for
676 the lkb, don't remove lkb from asts list unless no asts remain */
677
678 lkb = list_entry(proc->asts.next, struct dlm_lkb, lkb_astqueue);
679
680 if (lkb->lkb_ast_type & AST_COMP) {
681 lkb->lkb_ast_type &= ~AST_COMP;
682 type = AST_COMP;
683 } else if (lkb->lkb_ast_type & AST_BAST) {
684 lkb->lkb_ast_type &= ~AST_BAST;
685 type = AST_BAST;
686 bmode = lkb->lkb_bastmode;
687 }
688
689 if (!lkb->lkb_ast_type) {
690 list_del(&lkb->lkb_astqueue);
691 removed = 1;
692 }
693 spin_unlock(&proc->asts_spin);
694
695 ua = (struct dlm_user_args *)lkb->lkb_astparam;
696 error = copy_result_to_user(ua,
697 test_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags),
698 type, bmode, buf, count);
699
700 /* removes reference for the proc->asts lists added by
701 dlm_user_add_ast() and may result in the lkb being freed */
702 if (removed)
703 dlm_put_lkb(lkb);
704
705 return error;
706}
707
708static unsigned int device_poll(struct file *file, poll_table *wait)
709{
710 struct dlm_user_proc *proc = file->private_data;
711
712 poll_wait(file, &proc->wait, wait);
713
714 spin_lock(&proc->asts_spin);
715 if (!list_empty(&proc->asts)) {
716 spin_unlock(&proc->asts_spin);
717 return POLLIN | POLLRDNORM;
718 }
719 spin_unlock(&proc->asts_spin);
720 return 0;
721}
722
723static int ctl_device_open(struct inode *inode, struct file *file)
724{
725 file->private_data = NULL;
726 return 0;
727}
728
729static int ctl_device_close(struct inode *inode, struct file *file)
730{
731 return 0;
732}
733
734static struct file_operations device_fops = {
735 .open = device_open,
736 .release = device_close,
737 .read = device_read,
738 .write = device_write,
739 .poll = device_poll,
740 .owner = THIS_MODULE,
741};
742
743static struct file_operations ctl_device_fops = {
744 .open = ctl_device_open,
745 .release = ctl_device_close,
746 .write = device_write,
747 .owner = THIS_MODULE,
748};
749
750int dlm_user_init(void)
751{
752 int error;
753
754 ctl_device.name = "dlm-control";
755 ctl_device.fops = &ctl_device_fops;
756 ctl_device.minor = MISC_DYNAMIC_MINOR;
757
758 error = misc_register(&ctl_device);
759 if (error)
760 log_print("misc_register failed for control device");
761
762 return error;
763}
764
765void dlm_user_exit(void)
766{
767 misc_deregister(&ctl_device);
768}
769
diff --git a/fs/dlm/user.h b/fs/dlm/user.h
new file mode 100644
index 000000000000..d38e9f3e4151
--- /dev/null
+++ b/fs/dlm/user.h
@@ -0,0 +1,16 @@
1/*
2 * Copyright (C) 2006 Red Hat, Inc. All rights reserved.
3 *
4 * This copyrighted material is made available to anyone wishing to use,
5 * modify, copy, or redistribute it subject to the terms and conditions
6 * of the GNU General Public License v.2.
7 */
8
9#ifndef __USER_DOT_H__
10#define __USER_DOT_H__
11
12void dlm_user_add_ast(struct dlm_lkb *lkb, int type);
13int dlm_user_init(void);
14void dlm_user_exit(void);
15
16#endif
diff --git a/fs/dlm/util.c b/fs/dlm/util.c
new file mode 100644
index 000000000000..767197db9944
--- /dev/null
+++ b/fs/dlm/util.c
@@ -0,0 +1,161 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13#include "dlm_internal.h"
14#include "rcom.h"
15#include "util.h"
16
17static void header_out(struct dlm_header *hd)
18{
19 hd->h_version = cpu_to_le32(hd->h_version);
20 hd->h_lockspace = cpu_to_le32(hd->h_lockspace);
21 hd->h_nodeid = cpu_to_le32(hd->h_nodeid);
22 hd->h_length = cpu_to_le16(hd->h_length);
23}
24
25static void header_in(struct dlm_header *hd)
26{
27 hd->h_version = le32_to_cpu(hd->h_version);
28 hd->h_lockspace = le32_to_cpu(hd->h_lockspace);
29 hd->h_nodeid = le32_to_cpu(hd->h_nodeid);
30 hd->h_length = le16_to_cpu(hd->h_length);
31}
32
33void dlm_message_out(struct dlm_message *ms)
34{
35 struct dlm_header *hd = (struct dlm_header *) ms;
36
37 header_out(hd);
38
39 ms->m_type = cpu_to_le32(ms->m_type);
40 ms->m_nodeid = cpu_to_le32(ms->m_nodeid);
41 ms->m_pid = cpu_to_le32(ms->m_pid);
42 ms->m_lkid = cpu_to_le32(ms->m_lkid);
43 ms->m_remid = cpu_to_le32(ms->m_remid);
44 ms->m_parent_lkid = cpu_to_le32(ms->m_parent_lkid);
45 ms->m_parent_remid = cpu_to_le32(ms->m_parent_remid);
46 ms->m_exflags = cpu_to_le32(ms->m_exflags);
47 ms->m_sbflags = cpu_to_le32(ms->m_sbflags);
48 ms->m_flags = cpu_to_le32(ms->m_flags);
49 ms->m_lvbseq = cpu_to_le32(ms->m_lvbseq);
50 ms->m_hash = cpu_to_le32(ms->m_hash);
51 ms->m_status = cpu_to_le32(ms->m_status);
52 ms->m_grmode = cpu_to_le32(ms->m_grmode);
53 ms->m_rqmode = cpu_to_le32(ms->m_rqmode);
54 ms->m_bastmode = cpu_to_le32(ms->m_bastmode);
55 ms->m_asts = cpu_to_le32(ms->m_asts);
56 ms->m_result = cpu_to_le32(ms->m_result);
57}
58
59void dlm_message_in(struct dlm_message *ms)
60{
61 struct dlm_header *hd = (struct dlm_header *) ms;
62
63 header_in(hd);
64
65 ms->m_type = le32_to_cpu(ms->m_type);
66 ms->m_nodeid = le32_to_cpu(ms->m_nodeid);
67 ms->m_pid = le32_to_cpu(ms->m_pid);
68 ms->m_lkid = le32_to_cpu(ms->m_lkid);
69 ms->m_remid = le32_to_cpu(ms->m_remid);
70 ms->m_parent_lkid = le32_to_cpu(ms->m_parent_lkid);
71 ms->m_parent_remid = le32_to_cpu(ms->m_parent_remid);
72 ms->m_exflags = le32_to_cpu(ms->m_exflags);
73 ms->m_sbflags = le32_to_cpu(ms->m_sbflags);
74 ms->m_flags = le32_to_cpu(ms->m_flags);
75 ms->m_lvbseq = le32_to_cpu(ms->m_lvbseq);
76 ms->m_hash = le32_to_cpu(ms->m_hash);
77 ms->m_status = le32_to_cpu(ms->m_status);
78 ms->m_grmode = le32_to_cpu(ms->m_grmode);
79 ms->m_rqmode = le32_to_cpu(ms->m_rqmode);
80 ms->m_bastmode = le32_to_cpu(ms->m_bastmode);
81 ms->m_asts = le32_to_cpu(ms->m_asts);
82 ms->m_result = le32_to_cpu(ms->m_result);
83}
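/*
 * Illustrative user-space sketch, not part of the patch: the same
 * pattern as dlm_message_out()/dlm_message_in(), converting a header in
 * place between host and little-endian byte order before it goes on the
 * wire and after it comes off.  glibc's <endian.h> helpers stand in for
 * cpu_to_le32()/le32_to_cpu(); the struct is a simplified stand-in for
 * struct dlm_header.
 */
#include <endian.h>
#include <stdint.h>
#include <stdio.h>

struct wire_header {
	uint32_t version;
	uint32_t nodeid;
	uint16_t length;
};

static void header_to_wire(struct wire_header *hd)
{
	hd->version = htole32(hd->version);
	hd->nodeid = htole32(hd->nodeid);
	hd->length = htole16(hd->length);
}

static void header_from_wire(struct wire_header *hd)
{
	hd->version = le32toh(hd->version);
	hd->nodeid = le32toh(hd->nodeid);
	hd->length = le16toh(hd->length);
}

int main(void)
{
	struct wire_header hd = { .version = 0x00030001, .nodeid = 7, .length = 24 };

	header_to_wire(&hd);	/* ready to send */
	header_from_wire(&hd);	/* after receive */
	printf("nodeid %u length %u\n", hd.nodeid, hd.length);
	return 0;
}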
84
85static void rcom_lock_out(struct rcom_lock *rl)
86{
87 rl->rl_ownpid = cpu_to_le32(rl->rl_ownpid);
88 rl->rl_lkid = cpu_to_le32(rl->rl_lkid);
89 rl->rl_remid = cpu_to_le32(rl->rl_remid);
90 rl->rl_parent_lkid = cpu_to_le32(rl->rl_parent_lkid);
91 rl->rl_parent_remid = cpu_to_le32(rl->rl_parent_remid);
92 rl->rl_exflags = cpu_to_le32(rl->rl_exflags);
93 rl->rl_flags = cpu_to_le32(rl->rl_flags);
94 rl->rl_lvbseq = cpu_to_le32(rl->rl_lvbseq);
95 rl->rl_result = cpu_to_le32(rl->rl_result);
96 rl->rl_wait_type = cpu_to_le16(rl->rl_wait_type);
97 rl->rl_namelen = cpu_to_le16(rl->rl_namelen);
98}
99
100static void rcom_lock_in(struct rcom_lock *rl)
101{
102 rl->rl_ownpid = le32_to_cpu(rl->rl_ownpid);
103 rl->rl_lkid = le32_to_cpu(rl->rl_lkid);
104 rl->rl_remid = le32_to_cpu(rl->rl_remid);
105 rl->rl_parent_lkid = le32_to_cpu(rl->rl_parent_lkid);
106 rl->rl_parent_remid = le32_to_cpu(rl->rl_parent_remid);
107 rl->rl_exflags = le32_to_cpu(rl->rl_exflags);
108 rl->rl_flags = le32_to_cpu(rl->rl_flags);
109 rl->rl_lvbseq = le32_to_cpu(rl->rl_lvbseq);
110 rl->rl_result = le32_to_cpu(rl->rl_result);
111 rl->rl_wait_type = le16_to_cpu(rl->rl_wait_type);
112 rl->rl_namelen = le16_to_cpu(rl->rl_namelen);
113}
114
115static void rcom_config_out(struct rcom_config *rf)
116{
117 rf->rf_lvblen = cpu_to_le32(rf->rf_lvblen);
118 rf->rf_lsflags = cpu_to_le32(rf->rf_lsflags);
119}
120
121static void rcom_config_in(struct rcom_config *rf)
122{
123 rf->rf_lvblen = le32_to_cpu(rf->rf_lvblen);
124 rf->rf_lsflags = le32_to_cpu(rf->rf_lsflags);
125}
126
127void dlm_rcom_out(struct dlm_rcom *rc)
128{
129 struct dlm_header *hd = (struct dlm_header *) rc;
130 int type = rc->rc_type;
131
132 header_out(hd);
133
134 rc->rc_type = cpu_to_le32(rc->rc_type);
135 rc->rc_result = cpu_to_le32(rc->rc_result);
136 rc->rc_id = cpu_to_le64(rc->rc_id);
137
138 if (type == DLM_RCOM_LOCK)
139 rcom_lock_out((struct rcom_lock *) rc->rc_buf);
140
141 else if (type == DLM_RCOM_STATUS_REPLY)
142 rcom_config_out((struct rcom_config *) rc->rc_buf);
143}
144
145void dlm_rcom_in(struct dlm_rcom *rc)
146{
147 struct dlm_header *hd = (struct dlm_header *) rc;
148
149 header_in(hd);
150
151 rc->rc_type = le32_to_cpu(rc->rc_type);
152 rc->rc_result = le32_to_cpu(rc->rc_result);
153 rc->rc_id = le64_to_cpu(rc->rc_id);
154
155 if (rc->rc_type == DLM_RCOM_LOCK)
156 rcom_lock_in((struct rcom_lock *) rc->rc_buf);
157
158 else if (rc->rc_type == DLM_RCOM_STATUS_REPLY)
159 rcom_config_in((struct rcom_config *) rc->rc_buf);
160}
161
diff --git a/fs/dlm/util.h b/fs/dlm/util.h
new file mode 100644
index 000000000000..2b25915161c0
--- /dev/null
+++ b/fs/dlm/util.h
@@ -0,0 +1,22 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13#ifndef __UTIL_DOT_H__
14#define __UTIL_DOT_H__
15
16void dlm_message_out(struct dlm_message *ms);
17void dlm_message_in(struct dlm_message *ms);
18void dlm_rcom_out(struct dlm_rcom *rc);
19void dlm_rcom_in(struct dlm_rcom *rc);
20
21#endif
22
diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig
new file mode 100644
index 000000000000..8c27de8b9568
--- /dev/null
+++ b/fs/gfs2/Kconfig
@@ -0,0 +1,44 @@
1config GFS2_FS
2 tristate "GFS2 file system support"
3 depends on EXPERIMENTAL
4 select FS_POSIX_ACL
5 help
6 A cluster filesystem.
7
8 Allows a cluster of computers to simultaneously use a block device
9 that is shared between them (with FC, iSCSI, NBD, etc...). GFS reads
10 and writes to the block device like a local filesystem, but also uses
11 a lock module to allow the computers to coordinate their I/O so
12 filesystem consistency is maintained. One of the nifty features of
13 GFS is perfect consistency -- changes made to the filesystem on one
14 machine show up immediately on all other machines in the cluster.
15
16 To use the GFS2 filesystem, you will need to enable one or more of
17 the below locking modules. Documentation and utilities for GFS2 can
18 be found here: http://sources.redhat.com/cluster
19
20config GFS2_FS_LOCKING_NOLOCK
21 tristate "GFS2 \"nolock\" locking module"
22 depends on GFS2_FS
23 help
24 Single node locking module for GFS2.
25
26 Use this module if you want to use GFS2 on a single node without
27 its clustering features. You can still take advantage of the
28 large file support, and upgrade to running a full cluster later on
29 if required.
30
31 If you will only be using GFS2 in cluster mode, you do not need this
32 module.
33
34config GFS2_FS_LOCKING_DLM
35 tristate "GFS2 DLM locking module"
36 depends on GFS2_FS
37 select DLM
38 help
39 Multiple node locking module for GFS2
40
41 Most users of GFS2 will require this module. It provides the locking
42 interface between GFS2 and the DLM, which is required to use GFS2
43 in a cluster environment.
44
diff --git a/fs/gfs2/Makefile b/fs/gfs2/Makefile
new file mode 100644
index 000000000000..0b7977623b80
--- /dev/null
+++ b/fs/gfs2/Makefile
@@ -0,0 +1,10 @@
1obj-$(CONFIG_GFS2_FS) += gfs2.o
2gfs2-y := acl.o bmap.o daemon.o dir.o eaops.o eattr.o glock.o \
3 glops.o inode.o lm.o log.o lops.o locking.o lvb.o main.o meta_io.o \
4 mount.o ondisk.o ops_address.o ops_dentry.o ops_export.o ops_file.o \
5 ops_fstype.o ops_inode.o ops_super.o ops_vm.o page.o quota.o \
6 recovery.o rgrp.o super.o sys.o trans.o util.o
7
8obj-$(CONFIG_GFS2_FS_LOCKING_NOLOCK) += locking/nolock/
9obj-$(CONFIG_GFS2_FS_LOCKING_DLM) += locking/dlm/
10
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
new file mode 100644
index 000000000000..399317841501
--- /dev/null
+++ b/fs/gfs2/acl.c
@@ -0,0 +1,313 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/posix_acl.h>
16#include <linux/posix_acl_xattr.h>
17#include <linux/gfs2_ondisk.h>
18
19#include "gfs2.h"
20#include "lm_interface.h"
21#include "incore.h"
22#include "acl.h"
23#include "eaops.h"
24#include "eattr.h"
25#include "glock.h"
26#include "inode.h"
27#include "meta_io.h"
28#include "trans.h"
29#include "util.h"
30
31#define ACL_ACCESS 1
32#define ACL_DEFAULT 0
33
34int gfs2_acl_validate_set(struct gfs2_inode *ip, int access,
35 struct gfs2_ea_request *er,
36 int *remove, mode_t *mode)
37{
38 struct posix_acl *acl;
39 int error;
40
41 error = gfs2_acl_validate_remove(ip, access);
42 if (error)
43 return error;
44
45 if (!er->er_data)
46 return -EINVAL;
47
48 acl = posix_acl_from_xattr(er->er_data, er->er_data_len);
49 if (IS_ERR(acl))
50 return PTR_ERR(acl);
51 if (!acl) {
52 *remove = 1;
53 return 0;
54 }
55
56 error = posix_acl_valid(acl);
57 if (error)
58 goto out;
59
60 if (access) {
61 error = posix_acl_equiv_mode(acl, mode);
62 if (!error)
63 *remove = 1;
64 else if (error > 0)
65 error = 0;
66 }
67
68 out:
69 posix_acl_release(acl);
70
71 return error;
72}
73
74int gfs2_acl_validate_remove(struct gfs2_inode *ip, int access)
75{
76 if (!GFS2_SB(&ip->i_inode)->sd_args.ar_posix_acl)
77 return -EOPNOTSUPP;
78 if (current->fsuid != ip->i_di.di_uid && !capable(CAP_FOWNER))
79 return -EPERM;
80 if (S_ISLNK(ip->i_di.di_mode))
81 return -EOPNOTSUPP;
82 if (!access && !S_ISDIR(ip->i_di.di_mode))
83 return -EACCES;
84
85 return 0;
86}
87
88static int acl_get(struct gfs2_inode *ip, int access, struct posix_acl **acl,
89 struct gfs2_ea_location *el, char **data, unsigned int *len)
90{
91 struct gfs2_ea_request er;
92 struct gfs2_ea_location el_this;
93 int error;
94
95 if (!ip->i_di.di_eattr)
96 return 0;
97
98 memset(&er, 0, sizeof(struct gfs2_ea_request));
99 if (access) {
100 er.er_name = GFS2_POSIX_ACL_ACCESS;
101 er.er_name_len = GFS2_POSIX_ACL_ACCESS_LEN;
102 } else {
103 er.er_name = GFS2_POSIX_ACL_DEFAULT;
104 er.er_name_len = GFS2_POSIX_ACL_DEFAULT_LEN;
105 }
106 er.er_type = GFS2_EATYPE_SYS;
107
108 if (!el)
109 el = &el_this;
110
111 error = gfs2_ea_find(ip, &er, el);
112 if (error)
113 return error;
114 if (!el->el_ea)
115 return 0;
116 if (!GFS2_EA_DATA_LEN(el->el_ea))
117 goto out;
118
119 er.er_data_len = GFS2_EA_DATA_LEN(el->el_ea);
120 er.er_data = kmalloc(er.er_data_len, GFP_KERNEL);
121 error = -ENOMEM;
122 if (!er.er_data)
123 goto out;
124
125 error = gfs2_ea_get_copy(ip, el, er.er_data);
126 if (error)
127 goto out_kfree;
128
129 if (acl) {
130 *acl = posix_acl_from_xattr(er.er_data, er.er_data_len);
131 if (IS_ERR(*acl))
132 error = PTR_ERR(*acl);
133 }
134
135 out_kfree:
136 if (error || !data)
137 kfree(er.er_data);
138 else {
139 *data = er.er_data;
140 *len = er.er_data_len;
141 }
142
143 out:
144 if (error || el == &el_this)
145 brelse(el->el_bh);
146
147 return error;
148}
149
150/**
151 * gfs2_check_acl_locked - Check an ACL to see if we're allowed to do something
152 * @inode: the file we want to do something to
153 * @mask: what we want to do
154 *
155 * Returns: errno
156 */
157
158int gfs2_check_acl_locked(struct inode *inode, int mask)
159{
160 struct posix_acl *acl = NULL;
161 int error;
162
163 error = acl_get(GFS2_I(inode), ACL_ACCESS, &acl, NULL, NULL, NULL);
164 if (error)
165 return error;
166
167 if (acl) {
168 error = posix_acl_permission(inode, acl, mask);
169 posix_acl_release(acl);
170 return error;
171 }
172
173 return -EAGAIN;
174}
175
176int gfs2_check_acl(struct inode *inode, int mask)
177{
178 struct gfs2_inode *ip = GFS2_I(inode);
179 struct gfs2_holder i_gh;
180 int error;
181
182 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
183 if (!error) {
184 error = gfs2_check_acl_locked(inode, mask);
185 gfs2_glock_dq_uninit(&i_gh);
186 }
187
188 return error;
189}
190
191static int munge_mode(struct gfs2_inode *ip, mode_t mode)
192{
193 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
194 struct buffer_head *dibh;
195 int error;
196
197 error = gfs2_trans_begin(sdp, RES_DINODE, 0);
198 if (error)
199 return error;
200
201 error = gfs2_meta_inode_buffer(ip, &dibh);
202 if (!error) {
203 gfs2_assert_withdraw(sdp,
204 (ip->i_di.di_mode & S_IFMT) == (mode & S_IFMT));
205 ip->i_di.di_mode = mode;
206 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
207 gfs2_dinode_out(&ip->i_di, dibh->b_data);
208 brelse(dibh);
209 }
210
211 gfs2_trans_end(sdp);
212
213 return 0;
214}
215
216int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip)
217{
218 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
219 struct posix_acl *acl = NULL, *clone;
220 struct gfs2_ea_request er;
221 mode_t mode = ip->i_di.di_mode;
222 int error;
223
224 if (!sdp->sd_args.ar_posix_acl)
225 return 0;
226 if (S_ISLNK(ip->i_di.di_mode))
227 return 0;
228
229 memset(&er, 0, sizeof(struct gfs2_ea_request));
230 er.er_type = GFS2_EATYPE_SYS;
231
232 error = acl_get(dip, ACL_DEFAULT, &acl, NULL,
233 &er.er_data, &er.er_data_len);
234 if (error)
235 return error;
236 if (!acl) {
237 mode &= ~current->fs->umask;
238 if (mode != ip->i_di.di_mode)
239 error = munge_mode(ip, mode);
240 return error;
241 }
242
243 clone = posix_acl_clone(acl, GFP_KERNEL);
244 error = -ENOMEM;
245 if (!clone)
246 goto out;
247 posix_acl_release(acl);
248 acl = clone;
249
250 if (S_ISDIR(ip->i_di.di_mode)) {
251 er.er_name = GFS2_POSIX_ACL_DEFAULT;
252 er.er_name_len = GFS2_POSIX_ACL_DEFAULT_LEN;
253 error = gfs2_system_eaops.eo_set(ip, &er);
254 if (error)
255 goto out;
256 }
257
258 error = posix_acl_create_masq(acl, &mode);
259 if (error < 0)
260 goto out;
261 if (error > 0) {
262 er.er_name = GFS2_POSIX_ACL_ACCESS;
263 er.er_name_len = GFS2_POSIX_ACL_ACCESS_LEN;
264 posix_acl_to_xattr(acl, er.er_data, er.er_data_len);
265 er.er_mode = mode;
266 er.er_flags = GFS2_ERF_MODE;
267 error = gfs2_system_eaops.eo_set(ip, &er);
268 if (error)
269 goto out;
270 } else
271 munge_mode(ip, mode);
272
273 out:
274 posix_acl_release(acl);
275 kfree(er.er_data);
276 return error;
277}
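/*
 * Illustrative user-space sketch, not part of the patch: when the parent
 * directory has no default ACL, gfs2_acl_create() above falls back to
 * the classic rule of clearing the process umask bits from the new
 * inode's mode.  The values below are only an example.
 */
#include <stdio.h>
#include <sys/stat.h>

int main(void)
{
	mode_t requested = 0666;	/* mode passed in by the caller */
	mode_t umask_bits = 0022;	/* typical process umask */
	mode_t effective = requested & ~umask_bits;

	printf("effective mode %04o\n", (unsigned int)effective); /* 0644 */
	return 0;
}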
278
279int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr)
280{
281 struct posix_acl *acl = NULL, *clone;
282 struct gfs2_ea_location el;
283 char *data;
284 unsigned int len;
285 int error;
286
287 error = acl_get(ip, ACL_ACCESS, &acl, &el, &data, &len);
288 if (error)
289 return error;
290 if (!acl)
291 return gfs2_setattr_simple(ip, attr);
292
293 clone = posix_acl_clone(acl, GFP_KERNEL);
294 error = -ENOMEM;
295 if (!clone)
296 goto out;
297 posix_acl_release(acl);
298 acl = clone;
299
300 error = posix_acl_chmod_masq(acl, attr->ia_mode);
301 if (!error) {
302 posix_acl_to_xattr(acl, data, len);
303 error = gfs2_ea_acl_chmod(ip, &el, attr, data);
304 }
305
306 out:
307 posix_acl_release(acl);
308 brelse(el.el_bh);
309 kfree(data);
310
311 return error;
312}
313
diff --git a/fs/gfs2/acl.h b/fs/gfs2/acl.h
new file mode 100644
index 000000000000..067105786eaa
--- /dev/null
+++ b/fs/gfs2/acl.h
@@ -0,0 +1,37 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __ACL_DOT_H__
11#define __ACL_DOT_H__
12
13#define GFS2_POSIX_ACL_ACCESS "posix_acl_access"
14#define GFS2_POSIX_ACL_ACCESS_LEN 16
15#define GFS2_POSIX_ACL_DEFAULT "posix_acl_default"
16#define GFS2_POSIX_ACL_DEFAULT_LEN 17
17
18#define GFS2_ACL_IS_ACCESS(name, len) \
19 ((len) == GFS2_POSIX_ACL_ACCESS_LEN && \
20 !memcmp(GFS2_POSIX_ACL_ACCESS, (name), (len)))
21
22#define GFS2_ACL_IS_DEFAULT(name, len) \
23 ((len) == GFS2_POSIX_ACL_DEFAULT_LEN && \
24 !memcmp(GFS2_POSIX_ACL_DEFAULT, (name), (len)))
25
26struct gfs2_ea_request;
27
28int gfs2_acl_validate_set(struct gfs2_inode *ip, int access,
29 struct gfs2_ea_request *er,
30 int *remove, mode_t *mode);
31int gfs2_acl_validate_remove(struct gfs2_inode *ip, int access);
32int gfs2_check_acl_locked(struct inode *inode, int mask);
33int gfs2_check_acl(struct inode *inode, int mask);
34int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip);
35int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr);
36
37#endif /* __ACL_DOT_H__ */
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
new file mode 100644
index 000000000000..98fa07c2b710
--- /dev/null
+++ b/fs/gfs2/bmap.c
@@ -0,0 +1,1103 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/gfs2_ondisk.h>
16#include <linux/crc32.h>
17
18#include "gfs2.h"
19#include "lm_interface.h"
20#include "incore.h"
21#include "bmap.h"
22#include "glock.h"
23#include "inode.h"
24#include "meta_io.h"
25#include "page.h"
26#include "quota.h"
27#include "rgrp.h"
28#include "trans.h"
29#include "dir.h"
30#include "util.h"
31
32/* This doesn't need to be that large, since the maximum number of 64-bit
33 * pointers in a 4k block is 512, so a __u16 is enough. It saves stack
34 * space to keep it small.
35 */
36struct metapath {
37 __u16 mp_list[GFS2_MAX_META_HEIGHT];
38};
39
40typedef int (*block_call_t) (struct gfs2_inode *ip, struct buffer_head *dibh,
41 struct buffer_head *bh, uint64_t *top,
42 uint64_t *bottom, unsigned int height,
43 void *data);
44
45struct strip_mine {
46 int sm_first;
47 unsigned int sm_height;
48};
49
50/**
51 * gfs2_unstuff_dinode - Unstuff a dinode when the data has grown too big
52 * @ip: The GFS2 inode to unstuff
53 * @unstuffer: the routine that handles unstuffing a non-zero length file
54 * @private: private data for the unstuffer
55 *
56 * This routine unstuffs a dinode and returns it to a "normal" state such
57 * that the height can be grown in the traditional way.
58 *
59 * Returns: errno
60 */
61
62int gfs2_unstuff_dinode(struct gfs2_inode *ip, gfs2_unstuffer_t unstuffer,
63 void *private)
64{
65 struct buffer_head *bh, *dibh;
66 uint64_t block = 0;
67 int isdir = gfs2_is_dir(ip);
68 int error;
69
70 down_write(&ip->i_rw_mutex);
71
72 error = gfs2_meta_inode_buffer(ip, &dibh);
73 if (error)
74 goto out;
75
76 if (ip->i_di.di_size) {
77 /* Get a free block, fill it with the stuffed data,
78 and write it out to disk */
79
80 if (isdir) {
81 block = gfs2_alloc_meta(ip);
82
83 error = gfs2_dir_get_new_buffer(ip, block, &bh);
84 if (error)
85 goto out_brelse;
86 gfs2_buffer_copy_tail(bh,
87 sizeof(struct gfs2_meta_header),
88 dibh, sizeof(struct gfs2_dinode));
89 brelse(bh);
90 } else {
91 block = gfs2_alloc_data(ip);
92
93 error = unstuffer(ip, dibh, block, private);
94 if (error)
95 goto out_brelse;
96 }
97 }
98
99 /* Set up the pointer to the new block */
100
101 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
102
103 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
104
105 if (ip->i_di.di_size) {
106 *(uint64_t *)(dibh->b_data + sizeof(struct gfs2_dinode)) =
107 cpu_to_be64(block);
108 ip->i_di.di_blocks++;
109 }
110
111 ip->i_di.di_height = 1;
112
113 gfs2_dinode_out(&ip->i_di, dibh->b_data);
114
115 out_brelse:
116 brelse(dibh);
117
118 out:
119 up_write(&ip->i_rw_mutex);
120
121 return error;
122}
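
[Editorial sketch] For readers unfamiliar with the unstuffer callback convention used above, here is a purely illustrative sketch of a gfs2_unstuffer_t implementation. It is not the gfs2_unstuffer_page() routine actually used for regular files (which copies the data through the page cache), and example_unstuffer is a hypothetical name; it only shows the contract: copy the stuffed data that follows the dinode header in @dibh into the freshly allocated block @block and return 0 or an errno.

static int example_unstuffer(struct gfs2_inode *ip, struct buffer_head *dibh,
			     uint64_t block, void *private)
{
	/* Sketch only: grab a buffer for the newly allocated block */
	struct buffer_head *bh = gfs2_meta_new(ip->i_gl, block);

	if (!bh)
		return -EIO;
	/* The stuffed file data lives immediately after the dinode header */
	memcpy(bh->b_data, dibh->b_data + sizeof(struct gfs2_dinode),
	       ip->i_di.di_size);
	mark_buffer_dirty(bh);
	brelse(bh);
	return 0;
}
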
123
124/**
125 * calc_tree_height - Calculate the height of a metadata tree
126 * @ip: The GFS2 inode
127 * @size: The proposed size of the file
128 *
129 * Work out how tall a metadata tree needs to be in order to accommodate a
130 * file of a particular size. If size is less than the current size of
131 * the inode, then the current size of the inode is used instead of the
132 * supplied one.
133 *
134 * Returns: the height the tree should be
135 */
136
137static unsigned int calc_tree_height(struct gfs2_inode *ip, uint64_t size)
138{
139 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
140 uint64_t *arr;
141 unsigned int max, height;
142
143 if (ip->i_di.di_size > size)
144 size = ip->i_di.di_size;
145
146 if (gfs2_is_dir(ip)) {
147 arr = sdp->sd_jheightsize;
148 max = sdp->sd_max_jheight;
149 } else {
150 arr = sdp->sd_heightsize;
151 max = sdp->sd_max_height;
152 }
153
154 for (height = 0; height < max; height++)
155 if (arr[height] >= size)
156 break;
157
158 return height;
159}
160
161/**
162 * build_height - Build a metadata tree of the requested height
163 * @inode: The inode
164 * @height: The height to build to
165 *
166 *
167 * Returns: errno
168 */
169
170static int build_height(struct inode *inode, unsigned height)
171{
172 struct gfs2_inode *ip = GFS2_I(inode);
173 unsigned new_height = height - ip->i_di.di_height;
174 struct buffer_head *dibh;
175 struct buffer_head *blocks[GFS2_MAX_META_HEIGHT];
176 int error;
177 u64 *bp;
178 u64 bn;
179 unsigned n;
180
181 if (height <= ip->i_di.di_height)
182 return 0;
183
184 error = gfs2_meta_inode_buffer(ip, &dibh);
185 if (error)
186 return error;
187
188 for(n = 0; n < new_height; n++) {
189 bn = gfs2_alloc_meta(ip);
190 blocks[n] = gfs2_meta_new(ip->i_gl, bn);
191 gfs2_trans_add_bh(ip->i_gl, blocks[n], 1);
192 }
193
194 n = 0;
195 bn = blocks[0]->b_blocknr;
196 if (new_height > 1) {
197 for(; n < new_height-1; n++) {
198 gfs2_metatype_set(blocks[n], GFS2_METATYPE_IN,
199 GFS2_FORMAT_IN);
200 gfs2_buffer_clear_tail(blocks[n],
201 sizeof(struct gfs2_meta_header));
202 bp = (u64 *)(blocks[n]->b_data +
203 sizeof(struct gfs2_meta_header));
204 *bp = cpu_to_be64(blocks[n+1]->b_blocknr);
205 brelse(blocks[n]);
206 blocks[n] = NULL;
207 }
208 }
209 gfs2_metatype_set(blocks[n], GFS2_METATYPE_IN, GFS2_FORMAT_IN);
210 gfs2_buffer_copy_tail(blocks[n], sizeof(struct gfs2_meta_header),
211 dibh, sizeof(struct gfs2_dinode));
212 brelse(blocks[n]);
213 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
214 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
215 bp = (u64 *)(dibh->b_data + sizeof(struct gfs2_dinode));
216 *bp = cpu_to_be64(bn);
217 ip->i_di.di_height += new_height;
218 ip->i_di.di_blocks += new_height;
219 gfs2_dinode_out(&ip->i_di, dibh->b_data);
220 brelse(dibh);
221 return error;
222}
223
224/**
225 * find_metapath - Find path through the metadata tree
226 * @ip: The inode pointer
227 * @mp: The metapath to return the result in
228 * @block: The disk block to look up
229 *
230 * This routine returns a struct metapath structure that defines a path
231 * through the metadata of inode "ip" to get to block "block".
232 *
233 * Example:
234 * Given: "ip" is a height 3 file, "offset" is 101342453, and this is a
235 * filesystem with a blocksize of 4096.
236 *
237 * find_metapath() would return a struct metapath structure set to:
238 * mp_list[0] = 0, mp_list[1] = 48,
239 * and mp_list[2] = 165.
240 *
241 * That means that in order to get to the block containing the byte at
242 * offset 101342453, we would load the indirect block pointed to by pointer
243 * 0 in the dinode. We would then load the indirect block pointed to by
244 * pointer 48 in that indirect block. We would then load the data block
245 * pointed to by pointer 165 in that indirect block.
246 *
247 * ----------------------------------------
248 * | Dinode | |
249 * | | 4|
250 * | |0 1 2 3 4 5 9|
251 * | | 6|
252 * ----------------------------------------
253 * |
254 * |
255 * V
256 * ----------------------------------------
257 * | Indirect Block |
258 * | 5|
259 * | 4 4 4 4 4 5 5 1|
260 * |0 5 6 7 8 9 0 1 2|
261 * ----------------------------------------
262 * |
263 * |
264 * V
265 * ----------------------------------------
266 * | Indirect Block |
267 * | 1 1 1 1 1 5|
268 * | 6 6 6 6 6 1|
269 * |0 3 4 5 6 7 2|
270 * ----------------------------------------
271 * |
272 * |
273 * V
274 * ----------------------------------------
275 * | Data block containing offset |
276 * | 101342453 |
277 * | |
278 * | |
279 * ----------------------------------------
280 *
281 */
282
283static void find_metapath(struct gfs2_inode *ip, uint64_t block,
284 struct metapath *mp)
285{
286 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
287 uint64_t b = block;
288 unsigned int i;
289
290 for (i = ip->i_di.di_height; i--;)
291 mp->mp_list[i] = (__u16)do_div(b, sdp->sd_inptrs);
292
293}
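
[Editorial sketch] To make the decomposition above concrete, the following stand-alone sketch mimics the do_div() loop of find_metapath() in userspace. The 509 pointers per indirect block and the tree height of 3 are assumed values chosen for illustration; the real numbers come from sdp->sd_inptrs and ip->i_di.di_height.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	const uint64_t inptrs = 509;	/* assumed pointers per indirect block */
	uint64_t b = 1000000;		/* an arbitrary logical block number */
	uint16_t mp_list[3];		/* assumed tree height of 3 */
	int i;

	/* Same idea as the do_div() loop: the remainder at each step is the
	 * index to follow at that height, the quotient is what is left for
	 * the heights above it. */
	for (i = 3; i--;) {
		mp_list[i] = (uint16_t)(b % inptrs);
		b /= inptrs;
	}

	for (i = 0; i < 3; i++)
		printf("mp_list[%d] = %u\n", i, mp_list[i]);
	return 0;
}
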
294
295/**
296 * metapointer - Return pointer to start of metadata in a buffer
297 * @bh: The buffer
298 * @height: The metadata height (0 = dinode)
299 * @mp: The metapath
300 *
301 * Return a pointer to the block number of the next height of the metadata
302 * tree given a buffer containing the pointer to the current height of the
303 * metadata tree.
304 */
305
306static inline u64 *metapointer(struct buffer_head *bh, int *boundary,
307 unsigned int height, const struct metapath *mp)
308{
309 unsigned int head_size = (height > 0) ?
310 sizeof(struct gfs2_meta_header) : sizeof(struct gfs2_dinode);
311 u64 *ptr;
312 *boundary = 0;
313 ptr = ((u64 *)(bh->b_data + head_size)) + mp->mp_list[height];
314 if (ptr + 1 == (u64*)(bh->b_data + bh->b_size))
315 *boundary = 1;
316 return ptr;
317}
318
319/**
320 * lookup_block - Get the next metadata block in metadata tree
321 * @ip: The GFS2 inode
322 * @bh: Buffer containing the pointers to metadata blocks
323 * @height: The height of the tree (0 = dinode)
324 * @mp: The metapath
325 * @create: Non-zero if we may create a new metadata block
326 * @new: Used to indicate if we did create a new metadata block
327 * @block: the returned disk block number
328 *
329 * Given a metatree, complete to a particular height, checks to see if the next
330 * height of the tree exists. If not the next height of the tree is created.
331 * The block number of the next height of the metadata tree is returned.
332 *
333 */
334
335static int lookup_block(struct gfs2_inode *ip, struct buffer_head *bh,
336 unsigned int height, struct metapath *mp, int create,
337 int *new, uint64_t *block)
338{
339 int boundary;
340 uint64_t *ptr = metapointer(bh, &boundary, height, mp);
341
342 if (*ptr) {
343 *block = be64_to_cpu(*ptr);
344 return boundary;
345 }
346
347 *block = 0;
348
349 if (!create)
350 return 0;
351
352 if (height == ip->i_di.di_height - 1 && !gfs2_is_dir(ip))
353 *block = gfs2_alloc_data(ip);
354 else
355 *block = gfs2_alloc_meta(ip);
356
357 gfs2_trans_add_bh(ip->i_gl, bh, 1);
358
359 *ptr = cpu_to_be64(*block);
360 ip->i_di.di_blocks++;
361
362 *new = 1;
363 return 0;
364}
365
366/**
367 * gfs2_block_pointers - Map a block from an inode to a disk block
368 * @inode: The inode
369 * @lblock: The logical block number
370 * @new: Value/Result argument (1 = may create/did create new blocks)
371 * @boundary: gets set if we've hit a block boundary
372 * @mp: metapath to use
373 *
374 * Find the block number on the current device which corresponds to an
375 * inode's block. If the block had to be created, "new" will be set.
376 *
377 * Returns: the buffer head for the final metadata block, NULL, or an ERR_PTR
378 */
379
380static struct buffer_head *gfs2_block_pointers(struct inode *inode, u64 lblock,
381 int *new, u64 *dblock,
382 int *boundary,
383 struct metapath *mp)
384{
385 struct gfs2_inode *ip = GFS2_I(inode);
386 struct gfs2_sbd *sdp = GFS2_SB(inode);
387 struct buffer_head *bh;
388 int create = *new;
389 unsigned int bsize;
390 unsigned int height;
391 unsigned int end_of_metadata;
392 unsigned int x;
393 int error = 0;
394
395 *new = 0;
396 *dblock = 0;
397
398 if (gfs2_assert_warn(sdp, !gfs2_is_stuffed(ip)))
399 goto out;
400
401 bsize = (gfs2_is_dir(ip)) ? sdp->sd_jbsize : sdp->sd_sb.sb_bsize;
402
403 height = calc_tree_height(ip, (lblock + 1) * bsize);
404 if (ip->i_di.di_height < height) {
405 if (!create)
406 goto out;
407
408 error = build_height(inode, height);
409 if (error)
410 goto out;
411 }
412
413 find_metapath(ip, lblock, mp);
414 end_of_metadata = ip->i_di.di_height - 1;
415
416 error = gfs2_meta_inode_buffer(ip, &bh);
417 if (error)
418 goto out;
419
420 for (x = 0; x < end_of_metadata; x++) {
421 lookup_block(ip, bh, x, mp, create, new, dblock);
422 brelse(bh);
423 if (!*dblock)
424 goto out;
425
426 error = gfs2_meta_indirect_buffer(ip, x+1, *dblock, *new, &bh);
427 if (error)
428 goto out;
429 }
430
431 *boundary = lookup_block(ip, bh, end_of_metadata, mp, create, new, dblock);
432 if (*new) {
433 struct buffer_head *dibh;
434 error = gfs2_meta_inode_buffer(ip, &dibh);
435 if (!error) {
436 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
437 gfs2_dinode_out(&ip->i_di, dibh->b_data);
438 brelse(dibh);
439 }
440 }
441 return bh;
442out:
443 return ERR_PTR(error);
444}
445
446
447static inline void bmap_lock(struct inode *inode, int create)
448{
449 struct gfs2_inode *ip = GFS2_I(inode);
450 if (create)
451 down_write(&ip->i_rw_mutex);
452 else
453 down_read(&ip->i_rw_mutex);
454}
455
456static inline void bmap_unlock(struct inode *inode, int create)
457{
458 struct gfs2_inode *ip = GFS2_I(inode);
459 if (create)
460 up_write(&ip->i_rw_mutex);
461 else
462 up_read(&ip->i_rw_mutex);
463}
464
465int gfs2_block_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, int *boundary)
466{
467 struct metapath mp;
468 struct buffer_head *bh;
469 int create = *new;
470
471 bmap_lock(inode, create);
472 bh = gfs2_block_pointers(inode, lblock, new, dblock, boundary, &mp);
473 bmap_unlock(inode, create);
474 if (!bh)
475 return 0;
476 if (IS_ERR(bh))
477 return PTR_ERR(bh);
478 brelse(bh);
479 return 0;
480}
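
[Editorial sketch] Since *new doubles as an input flag (may we allocate?) and an output flag (did we allocate?), a short hedged sketch of the calling convention may help. example_lookup is a hypothetical caller, and returning -ENOENT for a hole is an example policy rather than anything the patch itself does.

/* Sketch: read-only lookup of the disk block backing a logical block. */
static int example_lookup(struct inode *inode, u64 lblock, u64 *dblock)
{
	int new = 0;		/* 0 = do not allocate; set on return if allocated */
	int boundary = 0;
	int error;

	error = gfs2_block_map(inode, lblock, &new, dblock, &boundary);
	if (error)
		return error;
	if (!*dblock)
		return -ENOENT;	/* hole: no block mapped (example policy) */
	return 0;
}
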
481
482int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsigned *extlen)
483{
484 struct gfs2_inode *ip = GFS2_I(inode);
485 struct gfs2_sbd *sdp = GFS2_SB(inode);
486 struct metapath mp;
487 struct buffer_head *bh;
488 int boundary;
489 int create = *new;
490
491 BUG_ON(!extlen);
492 BUG_ON(!dblock);
493 BUG_ON(!new);
494
495 bmap_lock(inode, create);
496 bh = gfs2_block_pointers(inode, lblock, new, dblock, &boundary, &mp);
497 *extlen = 1;
498
499 if (bh && !IS_ERR(bh) && *dblock && !*new) {
500 u64 tmp_dblock;
501 int tmp_new;
502 unsigned int nptrs;
503 unsigned end_of_metadata = ip->i_di.di_height - 1;
504
505 nptrs = (end_of_metadata) ? sdp->sd_inptrs : sdp->sd_diptrs;
506 while (++mp.mp_list[end_of_metadata] < nptrs) {
507 lookup_block(ip, bh, end_of_metadata, &mp, 0, &tmp_new, &tmp_dblock);
508 if (*dblock + *extlen != tmp_dblock)
509 break;
510 (*extlen)++;
511 }
512 }
513 bmap_unlock(inode, create);
514 if (!bh)
515 return 0;
516 if (IS_ERR(bh))
517 return PTR_ERR(bh);
518 brelse(bh);
519 return 0;
520}
521
522/**
523 * recursive_scan - recursively scan through the end of a file
524 * @ip: the inode
525 * @dibh: the dinode buffer
526 * @mp: the path through the metadata to the point to start
527 * @height: the height the recursion is at
528 * @block: the indirect block to look at
529 * @first: 1 if this is the first block
530 * @bc: the call to make for each piece of metadata
531 * @data: data opaque to this function to pass to @bc
532 *
533 * When this is first called @height and @block should be zero and
534 * @first should be 1.
535 *
536 * Returns: errno
537 */
538
539static int recursive_scan(struct gfs2_inode *ip, struct buffer_head *dibh,
540 struct metapath *mp, unsigned int height,
541 uint64_t block, int first, block_call_t bc,
542 void *data)
543{
544 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
545 struct buffer_head *bh = NULL;
546 uint64_t *top, *bottom;
547 uint64_t bn;
548 int error;
549 int mh_size = sizeof(struct gfs2_meta_header);
550
551 if (!height) {
552 error = gfs2_meta_inode_buffer(ip, &bh);
553 if (error)
554 return error;
555 dibh = bh;
556
557 top = (uint64_t *)(bh->b_data + sizeof(struct gfs2_dinode)) +
558 mp->mp_list[0];
559 bottom = (uint64_t *)(bh->b_data + sizeof(struct gfs2_dinode)) +
560 sdp->sd_diptrs;
561 } else {
562 error = gfs2_meta_indirect_buffer(ip, height, block, 0, &bh);
563 if (error)
564 return error;
565
566 top = (uint64_t *)(bh->b_data + mh_size) +
567 ((first) ? mp->mp_list[height] : 0);
568
569 bottom = (uint64_t *)(bh->b_data + mh_size) + sdp->sd_inptrs;
570 }
571
572 error = bc(ip, dibh, bh, top, bottom, height, data);
573 if (error)
574 goto out;
575
576 if (height < ip->i_di.di_height - 1)
577 for (; top < bottom; top++, first = 0) {
578 if (!*top)
579 continue;
580
581 bn = be64_to_cpu(*top);
582
583 error = recursive_scan(ip, dibh, mp, height + 1, bn,
584 first, bc, data);
585 if (error)
586 break;
587 }
588
589 out:
590 brelse(bh);
591
592 return error;
593}
594
595/**
596 * do_strip - Look for a particular layer of the file and strip it off
597 * @ip: the inode
598 * @dibh: the dinode buffer
599 * @bh: A buffer of pointers
600 * @top: The first pointer in the buffer
601 * @bottom: One more than the last pointer
602 * @height: the height this buffer is at
603 * @data: a pointer to a struct strip_mine
604 *
605 * Returns: errno
606 */
607
608static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
609 struct buffer_head *bh, uint64_t *top, uint64_t *bottom,
610 unsigned int height, void *data)
611{
612 struct strip_mine *sm = data;
613 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
614 struct gfs2_rgrp_list rlist;
615 uint64_t bn, bstart;
616 uint32_t blen;
617 uint64_t *p;
618 unsigned int rg_blocks = 0;
619 int metadata;
620 unsigned int revokes = 0;
621 int x;
622 int error;
623
624 if (!*top)
625 sm->sm_first = 0;
626
627 if (height != sm->sm_height)
628 return 0;
629
630 if (sm->sm_first) {
631 top++;
632 sm->sm_first = 0;
633 }
634
635 metadata = (height != ip->i_di.di_height - 1);
636 if (metadata)
637 revokes = (height) ? sdp->sd_inptrs : sdp->sd_diptrs;
638
639 error = gfs2_rindex_hold(sdp, &ip->i_alloc.al_ri_gh);
640 if (error)
641 return error;
642
643 memset(&rlist, 0, sizeof(struct gfs2_rgrp_list));
644 bstart = 0;
645 blen = 0;
646
647 for (p = top; p < bottom; p++) {
648 if (!*p)
649 continue;
650
651 bn = be64_to_cpu(*p);
652
653 if (bstart + blen == bn)
654 blen++;
655 else {
656 if (bstart)
657 gfs2_rlist_add(sdp, &rlist, bstart);
658
659 bstart = bn;
660 blen = 1;
661 }
662 }
663
664 if (bstart)
665 gfs2_rlist_add(sdp, &rlist, bstart);
666 else
667 goto out; /* Nothing to do */
668
669 gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE, 0);
670
671 for (x = 0; x < rlist.rl_rgrps; x++) {
672 struct gfs2_rgrpd *rgd;
673 rgd = rlist.rl_ghs[x].gh_gl->gl_object;
674 rg_blocks += rgd->rd_ri.ri_length;
675 }
676
677 error = gfs2_glock_nq_m(rlist.rl_rgrps, rlist.rl_ghs);
678 if (error)
679 goto out_rlist;
680
681 error = gfs2_trans_begin(sdp, rg_blocks + RES_DINODE +
682 RES_INDIRECT + RES_STATFS + RES_QUOTA,
683 revokes);
684 if (error)
685 goto out_rg_gunlock;
686
687 down_write(&ip->i_rw_mutex);
688
689 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
690 gfs2_trans_add_bh(ip->i_gl, bh, 1);
691
692 bstart = 0;
693 blen = 0;
694
695 for (p = top; p < bottom; p++) {
696 if (!*p)
697 continue;
698
699 bn = be64_to_cpu(*p);
700
701 if (bstart + blen == bn)
702 blen++;
703 else {
704 if (bstart) {
705 if (metadata)
706 gfs2_free_meta(ip, bstart, blen);
707 else
708 gfs2_free_data(ip, bstart, blen);
709 }
710
711 bstart = bn;
712 blen = 1;
713 }
714
715 *p = 0;
716 if (!ip->i_di.di_blocks)
717 gfs2_consist_inode(ip);
718 ip->i_di.di_blocks--;
719 }
720 if (bstart) {
721 if (metadata)
722 gfs2_free_meta(ip, bstart, blen);
723 else
724 gfs2_free_data(ip, bstart, blen);
725 }
726
727 ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
728
729 gfs2_dinode_out(&ip->i_di, dibh->b_data);
730
731 up_write(&ip->i_rw_mutex);
732
733 gfs2_trans_end(sdp);
734
735 out_rg_gunlock:
736 gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs);
737
738 out_rlist:
739 gfs2_rlist_free(&rlist);
740
741 out:
742 gfs2_glock_dq_uninit(&ip->i_alloc.al_ri_gh);
743
744 return error;
745}
746
747/**
748 * do_grow - Make a file look bigger than it is
749 * @ip: the inode
750 * @size: the size to set the file to
751 *
752 * Called with an exclusive lock on @ip.
753 *
754 * Returns: errno
755 */
756
757static int do_grow(struct gfs2_inode *ip, uint64_t size)
758{
759 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
760 struct gfs2_alloc *al;
761 struct buffer_head *dibh;
762 unsigned int h;
763 int error;
764
765 al = gfs2_alloc_get(ip);
766
767 error = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
768 if (error)
769 goto out;
770
771 error = gfs2_quota_check(ip, ip->i_di.di_uid, ip->i_di.di_gid);
772 if (error)
773 goto out_gunlock_q;
774
775 al->al_requested = sdp->sd_max_height + RES_DATA;
776
777 error = gfs2_inplace_reserve(ip);
778 if (error)
779 goto out_gunlock_q;
780
781 error = gfs2_trans_begin(sdp,
782 sdp->sd_max_height + al->al_rgd->rd_ri.ri_length +
783 RES_JDATA + RES_DINODE + RES_STATFS + RES_QUOTA, 0);
784 if (error)
785 goto out_ipres;
786
787 if (size > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) {
788 if (gfs2_is_stuffed(ip)) {
789 error = gfs2_unstuff_dinode(ip, gfs2_unstuffer_page,
790 NULL);
791 if (error)
792 goto out_end_trans;
793 }
794
795 h = calc_tree_height(ip, size);
796 if (ip->i_di.di_height < h) {
797 down_write(&ip->i_rw_mutex);
798 error = build_height(&ip->i_inode, h);
799 up_write(&ip->i_rw_mutex);
800 if (error)
801 goto out_end_trans;
802 }
803 }
804
805 ip->i_di.di_size = size;
806 ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
807
808 error = gfs2_meta_inode_buffer(ip, &dibh);
809 if (error)
810 goto out_end_trans;
811
812 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
813 gfs2_dinode_out(&ip->i_di, dibh->b_data);
814 brelse(dibh);
815
816 out_end_trans:
817 gfs2_trans_end(sdp);
818
819 out_ipres:
820 gfs2_inplace_release(ip);
821
822 out_gunlock_q:
823 gfs2_quota_unlock(ip);
824
825 out:
826 gfs2_alloc_put(ip);
827
828 return error;
829}
830
831static int trunc_start(struct gfs2_inode *ip, uint64_t size)
832{
833 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
834 struct buffer_head *dibh;
835 int journaled = gfs2_is_jdata(ip);
836 int error;
837
838 error = gfs2_trans_begin(sdp,
839 RES_DINODE + ((journaled) ? RES_JDATA : 0), 0);
840 if (error)
841 return error;
842
843 error = gfs2_meta_inode_buffer(ip, &dibh);
844 if (error)
845 goto out;
846
847 if (gfs2_is_stuffed(ip)) {
848 ip->i_di.di_size = size;
849 ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
850 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
851 gfs2_dinode_out(&ip->i_di, dibh->b_data);
852 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode) + size);
853 error = 1;
854
855 } else {
856 if (size & (uint64_t)(sdp->sd_sb.sb_bsize - 1))
857 error = gfs2_block_truncate_page(ip->i_inode.i_mapping);
858
859 if (!error) {
860 ip->i_di.di_size = size;
861 ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
862 ip->i_di.di_flags |= GFS2_DIF_TRUNC_IN_PROG;
863 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
864 gfs2_dinode_out(&ip->i_di, dibh->b_data);
865 }
866 }
867
868 brelse(dibh);
869
870 out:
871 gfs2_trans_end(sdp);
872
873 return error;
874}
875
876static int trunc_dealloc(struct gfs2_inode *ip, uint64_t size)
877{
878 unsigned int height = ip->i_di.di_height;
879 uint64_t lblock;
880 struct metapath mp;
881 int error;
882
883 if (!size)
884 lblock = 0;
885 else
886 lblock = (size - 1) >> GFS2_SB(&ip->i_inode)->sd_sb.sb_bsize_shift;
887
888 find_metapath(ip, lblock, &mp);
889 gfs2_alloc_get(ip);
890
891 error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
892 if (error)
893 goto out;
894
895 while (height--) {
896 struct strip_mine sm;
897 sm.sm_first = !!size;
898 sm.sm_height = height;
899
900 error = recursive_scan(ip, NULL, &mp, 0, 0, 1, do_strip, &sm);
901 if (error)
902 break;
903 }
904
905 gfs2_quota_unhold(ip);
906
907 out:
908 gfs2_alloc_put(ip);
909 return error;
910}
911
912static int trunc_end(struct gfs2_inode *ip)
913{
914 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
915 struct buffer_head *dibh;
916 int error;
917
918 error = gfs2_trans_begin(sdp, RES_DINODE, 0);
919 if (error)
920 return error;
921
922 down_write(&ip->i_rw_mutex);
923
924 error = gfs2_meta_inode_buffer(ip, &dibh);
925 if (error)
926 goto out;
927
928 if (!ip->i_di.di_size) {
929 ip->i_di.di_height = 0;
930 ip->i_di.di_goal_meta =
931 ip->i_di.di_goal_data =
932 ip->i_num.no_addr;
933 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
934 }
935 ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
936 ip->i_di.di_flags &= ~GFS2_DIF_TRUNC_IN_PROG;
937
938 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
939 gfs2_dinode_out(&ip->i_di, dibh->b_data);
940 brelse(dibh);
941
942 out:
943 up_write(&ip->i_rw_mutex);
944
945 gfs2_trans_end(sdp);
946
947 return error;
948}
949
950/**
951 * do_shrink - make a file smaller
952 * @ip: the inode
953 * @size: the size to make the file
955 *
956 * Called with an exclusive lock on @ip.
957 *
958 * Returns: errno
959 */
960
961static int do_shrink(struct gfs2_inode *ip, uint64_t size)
962{
963 int error;
964
965 error = trunc_start(ip, size);
966 if (error < 0)
967 return error;
968 if (error > 0)
969 return 0;
970
971 error = trunc_dealloc(ip, size);
972 if (!error)
973 error = trunc_end(ip);
974
975 return error;
976}
977
978/**
979 * gfs2_truncatei - make a file a given size
980 * @ip: the inode
981 * @size: the size to make the file
983 *
984 * The file size can grow, shrink, or stay the same size.
985 *
986 * Returns: errno
987 */
988
989int gfs2_truncatei(struct gfs2_inode *ip, uint64_t size)
990{
991 int error;
992
993 if (gfs2_assert_warn(GFS2_SB(&ip->i_inode), S_ISREG(ip->i_di.di_mode)))
994 return -EINVAL;
995
996 if (size > ip->i_di.di_size)
997 error = do_grow(ip, size);
998 else
999 error = do_shrink(ip, size);
1000
1001 return error;
1002}
1003
1004int gfs2_truncatei_resume(struct gfs2_inode *ip)
1005{
1006 int error;
1007 error = trunc_dealloc(ip, ip->i_di.di_size);
1008 if (!error)
1009 error = trunc_end(ip);
1010 return error;
1011}
1012
1013int gfs2_file_dealloc(struct gfs2_inode *ip)
1014{
1015 return trunc_dealloc(ip, 0);
1016}
1017
1018/**
1019 * gfs2_write_calc_reserv - calculate number of blocks needed to write to a file
1020 * @ip: the file
1021 * @len: the number of bytes to be written to the file
1022 * @data_blocks: returns the number of data blocks required
1023 * @ind_blocks: returns the number of indirect blocks required
1024 *
1025 */
1026
1027void gfs2_write_calc_reserv(struct gfs2_inode *ip, unsigned int len,
1028 unsigned int *data_blocks, unsigned int *ind_blocks)
1029{
1030 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1031 unsigned int tmp;
1032
1033 if (gfs2_is_dir(ip)) {
1034 *data_blocks = DIV_ROUND_UP(len, sdp->sd_jbsize) + 2;
1035 *ind_blocks = 3 * (sdp->sd_max_jheight - 1);
1036 } else {
1037 *data_blocks = (len >> sdp->sd_sb.sb_bsize_shift) + 3;
1038 *ind_blocks = 3 * (sdp->sd_max_height - 1);
1039 }
1040
1041 for (tmp = *data_blocks; tmp > sdp->sd_diptrs;) {
1042 tmp = DIV_ROUND_UP(tmp, sdp->sd_inptrs);
1043 *ind_blocks += tmp;
1044 }
1045}
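
[Editorial sketch] As a rough worked example of the estimate above (a sketch only: the block size, pointer counts and tree height below are assumed values for a 4k-block filesystem, not taken from the patch), a 1 MiB write to a regular file reserves 259 data blocks plus a fixed indirect-block allowance, and the loop only adds more once the request exceeds the dinode's direct pointer capacity:

#include <stdio.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

int main(void)
{
	unsigned int len = 1 << 20;		/* 1 MiB write */
	unsigned int bsize_shift = 12;		/* 4k blocks (assumed) */
	unsigned int diptrs = 483, inptrs = 509; /* assumed sd_diptrs/sd_inptrs */
	unsigned int max_height = 10;		/* assumed sd_max_height */
	unsigned int data_blocks, ind_blocks, tmp;

	data_blocks = (len >> bsize_shift) + 3;	/* 256 + 3 = 259 */
	ind_blocks = 3 * (max_height - 1);

	/* Extra indirect blocks are only needed once the request exceeds
	 * the dinode's direct pointer capacity. */
	for (tmp = data_blocks; tmp > diptrs;) {
		tmp = DIV_ROUND_UP(tmp, inptrs);
		ind_blocks += tmp;
	}

	printf("data_blocks=%u ind_blocks=%u\n", data_blocks, ind_blocks);
	return 0;
}
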
1046
1047/**
1048 * gfs2_write_alloc_required - figure out if a write will require an allocation
1049 * @ip: the file being written to
1050 * @offset: the offset to write to
1051 * @len: the number of bytes being written
1052 * @alloc_required: set to 1 if an alloc is required, 0 otherwise
1053 *
1054 * Returns: errno
1055 */
1056
1057int gfs2_write_alloc_required(struct gfs2_inode *ip, uint64_t offset,
1058 unsigned int len, int *alloc_required)
1059{
1060 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1061 uint64_t lblock, lblock_stop, dblock;
1062 uint32_t extlen;
1063 int new = 0;
1064 int error = 0;
1065
1066 *alloc_required = 0;
1067
1068 if (!len)
1069 return 0;
1070
1071 if (gfs2_is_stuffed(ip)) {
1072 if (offset + len >
1073 sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode))
1074 *alloc_required = 1;
1075 return 0;
1076 }
1077
1078 if (gfs2_is_dir(ip)) {
1079 unsigned int bsize = sdp->sd_jbsize;
1080 lblock = offset;
1081 do_div(lblock, bsize);
1082 lblock_stop = offset + len + bsize - 1;
1083 do_div(lblock_stop, bsize);
1084 } else {
1085 unsigned int shift = sdp->sd_sb.sb_bsize_shift;
1086 lblock = offset >> shift;
1087 lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift;
1088 }
1089
1090 for (; lblock < lblock_stop; lblock += extlen) {
1091 error = gfs2_extent_map(&ip->i_inode, lblock, &new, &dblock, &extlen);
1092 if (error)
1093 return error;
1094
1095 if (!dblock) {
1096 *alloc_required = 1;
1097 return 0;
1098 }
1099 }
1100
1101 return 0;
1102}
1103
diff --git a/fs/gfs2/bmap.h b/fs/gfs2/bmap.h
new file mode 100644
index 000000000000..06ccb2d808ad
--- /dev/null
+++ b/fs/gfs2/bmap.h
@@ -0,0 +1,32 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __BMAP_DOT_H__
11#define __BMAP_DOT_H__
12
13typedef int (*gfs2_unstuffer_t) (struct gfs2_inode * ip,
14 struct buffer_head * dibh, uint64_t block,
15 void *private);
16int gfs2_unstuff_dinode(struct gfs2_inode *ip, gfs2_unstuffer_t unstuffer,
17 void *private);
18
19int gfs2_block_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, int *boundary);
20int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsigned *extlen);
21
22int gfs2_truncatei(struct gfs2_inode *ip, uint64_t size);
23int gfs2_truncatei_resume(struct gfs2_inode *ip);
24int gfs2_file_dealloc(struct gfs2_inode *ip);
25
26void gfs2_write_calc_reserv(struct gfs2_inode *ip, unsigned int len,
27 unsigned int *data_blocks,
28 unsigned int *ind_blocks);
29int gfs2_write_alloc_required(struct gfs2_inode *ip, uint64_t offset,
30 unsigned int len, int *alloc_required);
31
32#endif /* __BMAP_DOT_H__ */
diff --git a/fs/gfs2/daemon.c b/fs/gfs2/daemon.c
new file mode 100644
index 000000000000..1453605c8f32
--- /dev/null
+++ b/fs/gfs2/daemon.c
@@ -0,0 +1,196 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/kthread.h>
16#include <linux/delay.h>
17#include <linux/gfs2_ondisk.h>
18
19#include "gfs2.h"
20#include "lm_interface.h"
21#include "incore.h"
22#include "daemon.h"
23#include "glock.h"
24#include "log.h"
25#include "quota.h"
26#include "recovery.h"
27#include "super.h"
28#include "util.h"
29
30/* This uses schedule_timeout() instead of msleep() because it's good for
31 the daemons to wake up more often than the timeout when unmounting so
32 the user's unmount doesn't sit there forever.
33
34 The kthread functions used to start these daemons block and flush signals. */
35
36/**
37 * gfs2_scand - Look for cached glocks and inodes to toss from memory
38 * @sdp: Pointer to GFS2 superblock
39 *
40 * One of these daemons runs, finding candidates to add to sd_reclaim_list.
41 * See gfs2_glockd()
42 */
43
44int gfs2_scand(void *data)
45{
46 struct gfs2_sbd *sdp = data;
47 unsigned long t;
48
49 while (!kthread_should_stop()) {
50 gfs2_scand_internal(sdp);
51 t = gfs2_tune_get(sdp, gt_scand_secs) * HZ;
52 schedule_timeout_interruptible(t);
53 }
54
55 return 0;
56}
57
58/**
59 * gfs2_glockd - Reclaim unused glock structures
60 * @sdp: Pointer to GFS2 superblock
61 *
62 * One or more of these daemons run, reclaiming glocks on sd_reclaim_list.
63 * Number of daemons can be set by user, with num_glockd mount option.
64 */
65
66int gfs2_glockd(void *data)
67{
68 struct gfs2_sbd *sdp = data;
69
70 while (!kthread_should_stop()) {
71 while (atomic_read(&sdp->sd_reclaim_count))
72 gfs2_reclaim_glock(sdp);
73
74 wait_event_interruptible(sdp->sd_reclaim_wq,
75 (atomic_read(&sdp->sd_reclaim_count) ||
76 kthread_should_stop()));
77 }
78
79 return 0;
80}
81
82/**
83 * gfs2_recoverd - Recover dead machine's journals
84 * @sdp: Pointer to GFS2 superblock
85 *
86 */
87
88int gfs2_recoverd(void *data)
89{
90 struct gfs2_sbd *sdp = data;
91 unsigned long t;
92
93 while (!kthread_should_stop()) {
94 gfs2_check_journals(sdp);
95 t = gfs2_tune_get(sdp, gt_recoverd_secs) * HZ;
96 schedule_timeout_interruptible(t);
97 }
98
99 return 0;
100}
101
102/**
103 * gfs2_logd - Update log tail as Active Items get flushed to in-place blocks
104 * @sdp: Pointer to GFS2 superblock
105 *
106 * Also, periodically check to make sure that we're using the most recent
107 * journal index.
108 */
109
110int gfs2_logd(void *data)
111{
112 struct gfs2_sbd *sdp = data;
113 struct gfs2_holder ji_gh;
114 unsigned long t;
115
116 while (!kthread_should_stop()) {
117 /* Advance the log tail */
118
119 t = sdp->sd_log_flush_time +
120 gfs2_tune_get(sdp, gt_log_flush_secs) * HZ;
121
122 gfs2_ail1_empty(sdp, DIO_ALL);
123
124 if (time_after_eq(jiffies, t)) {
125 gfs2_log_flush(sdp, NULL);
126 sdp->sd_log_flush_time = jiffies;
127 }
128
129 /* Check for latest journal index */
130
131 t = sdp->sd_jindex_refresh_time +
132 gfs2_tune_get(sdp, gt_jindex_refresh_secs) * HZ;
133
134 if (time_after_eq(jiffies, t)) {
135 if (!gfs2_jindex_hold(sdp, &ji_gh))
136 gfs2_glock_dq_uninit(&ji_gh);
137 sdp->sd_jindex_refresh_time = jiffies;
138 }
139
140 t = gfs2_tune_get(sdp, gt_logd_secs) * HZ;
141 schedule_timeout_interruptible(t);
142 }
143
144 return 0;
145}
146
147/**
148 * gfs2_quotad - Write cached quota changes into the quota file
149 * @sdp: Pointer to GFS2 superblock
150 *
151 */
152
153int gfs2_quotad(void *data)
154{
155 struct gfs2_sbd *sdp = data;
156 unsigned long t;
157 int error;
158
159 while (!kthread_should_stop()) {
160 /* Update the master statfs file */
161
162 t = sdp->sd_statfs_sync_time +
163 gfs2_tune_get(sdp, gt_statfs_quantum) * HZ;
164
165 if (time_after_eq(jiffies, t)) {
166 error = gfs2_statfs_sync(sdp);
167 if (error &&
168 error != -EROFS &&
169 !test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
170 fs_err(sdp, "quotad: (1) error=%d\n", error);
171 sdp->sd_statfs_sync_time = jiffies;
172 }
173
174 /* Update quota file */
175
176 t = sdp->sd_quota_sync_time +
177 gfs2_tune_get(sdp, gt_quota_quantum) * HZ;
178
179 if (time_after_eq(jiffies, t)) {
180 error = gfs2_quota_sync(sdp);
181 if (error &&
182 error != -EROFS &&
183 !test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
184 fs_err(sdp, "quotad: (2) error=%d\n", error);
185 sdp->sd_quota_sync_time = jiffies;
186 }
187
188 gfs2_quota_scan(sdp);
189
190 t = gfs2_tune_get(sdp, gt_quotad_secs) * HZ;
191 schedule_timeout_interruptible(t);
192 }
193
194 return 0;
195}
196
diff --git a/fs/gfs2/daemon.h b/fs/gfs2/daemon.h
new file mode 100644
index 000000000000..aa93eb6f668e
--- /dev/null
+++ b/fs/gfs2/daemon.h
@@ -0,0 +1,19 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __DAEMON_DOT_H__
11#define __DAEMON_DOT_H__
12
13int gfs2_scand(void *data);
14int gfs2_glockd(void *data);
15int gfs2_recoverd(void *data);
16int gfs2_logd(void *data);
17int gfs2_quotad(void *data);
18
19#endif /* __DAEMON_DOT_H__ */
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
new file mode 100644
index 000000000000..e96b5322c843
--- /dev/null
+++ b/fs/gfs2/dir.c
@@ -0,0 +1,1975 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10/*
11 * Implements Extendible Hashing as described in:
12 * "Extendible Hashing" by Fagin, et al in
13 * __ACM Trans. on Database Systems__, Sept 1979.
14 *
15 *
16 * Here's the layout of dirents which is essentially the same as that of ext2
17 * within a single block. The field de_name_len is the number of bytes
18 * actually required for the name (no null terminator). The field de_rec_len
19 * is the number of bytes allocated to the dirent. The offset of the next
20 * dirent in the block is (dirent + dirent->de_rec_len). When a dirent is
21 * deleted, the preceding dirent inherits its allocated space, ie
22 * prev->de_rec_len += deleted->de_rec_len. Since the next dirent is obtained
23 * by adding de_rec_len to the current dirent, this essentially causes the
24 * deleted dirent to get jumped over when iterating through all the dirents.
25 *
26 * When deleting the first dirent in a block, there is no previous dirent so
27 * the field de_inum.no_addr is set to zero to designate it as deleted. When allocating
28 * a dirent, gfs2_dirent_alloc iterates through the dirents in a block. If the
29 * first dirent has (de_inum.no_addr == 0) and de_rec_len is large enough, this first
30 * dirent is allocated. Otherwise it must go through all the 'used' dirents
31 * searching for one in which the amount of total space minus the amount of
32 * used space will provide enough space for the new dirent.
33 *
34 * There are two types of blocks in which dirents reside. In a stuffed dinode,
35 * the dirents begin at offset sizeof(struct gfs2_dinode) from the beginning of
36 * the block. In leaves, they begin at offset sizeof(struct gfs2_leaf) from the
37 * beginning of the leaf block. The dirents reside in leaves when
38 *
39 * dip->i_di.di_flags & GFS2_DIF_EXHASH is true
40 *
41 * Otherwise, the dirents are "linear", within a single stuffed dinode block.
42 *
43 * When the dirents are in leaves, the actual contents of the directory file are
44 * used as an array of 64-bit block pointers pointing to the leaf blocks. The
45 * dirents are NOT in the directory file itself. There can be more than one
46 * block pointer in the array that points to the same leaf. In fact, when a
47 * directory is first converted from linear to exhash, all of the pointers
48 * point to the same leaf.
49 *
50 * When a leaf is completely full, the size of the hash table can be
51 * doubled unless it is already at the maximum size which is hard coded into
52 * GFS2_DIR_MAX_DEPTH. After that, leaves are chained together in a linked list,
53 * but never before the maximum hash table size has been reached.
54 */
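
[Editorial sketch] To illustrate the de_rec_len chaining described above, here is a hedged sketch of walking every live dirent in one block. walk_dirents_example is a hypothetical helper; the real scanning loop, with full validation, is gfs2_dirent_scan() later in this file.

/* Sketch: visit every live dirent in a block.  'buf' points at the first
 * dirent (just past the leaf or dinode header) and 'len' is the number of
 * bytes of dirent space in the block. */
static void walk_dirents_example(const void *buf, unsigned int len)
{
	const struct gfs2_dirent *dent = buf;
	unsigned int offset = 0;

	while (offset < len) {
		unsigned int rec_len = be16_to_cpu(dent->de_rec_len);

		if (!rec_len)
			break;	/* corrupt; the real code calls gfs2_consist_inode() */

		/* A zero de_inum.no_addr marks a deleted first entry; its
		 * space is simply skipped via de_rec_len. */
		if (dent->de_inum.no_addr)
			printk(KERN_INFO "entry: %.*s\n",
			       be16_to_cpu(dent->de_name_len),
			       (const char *)(dent + 1));

		offset += rec_len;
		dent = (const struct gfs2_dirent *)((const char *)buf + offset);
	}
}
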
55
56#include <linux/sched.h>
57#include <linux/slab.h>
58#include <linux/spinlock.h>
59#include <linux/buffer_head.h>
60#include <linux/sort.h>
61#include <linux/gfs2_ondisk.h>
62#include <linux/crc32.h>
63#include <linux/vmalloc.h>
64
65#include "gfs2.h"
66#include "lm_interface.h"
67#include "incore.h"
68#include "dir.h"
69#include "glock.h"
70#include "inode.h"
71#include "meta_io.h"
72#include "quota.h"
73#include "rgrp.h"
74#include "trans.h"
75#include "bmap.h"
76#include "util.h"
77
78#define IS_LEAF 1 /* Hashed (leaf) directory */
79#define IS_DINODE 2 /* Linear (stuffed dinode block) directory */
80
81#define gfs2_disk_hash2offset(h) (((uint64_t)(h)) >> 1)
82#define gfs2_dir_offset2hash(p) ((uint32_t)(((uint64_t)(p)) << 1))
83
84typedef int (*leaf_call_t) (struct gfs2_inode *dip,
85 uint32_t index, uint32_t len, uint64_t leaf_no,
86 void *data);
87
88
89int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, uint64_t block,
90 struct buffer_head **bhp)
91{
92 struct buffer_head *bh;
93
94 bh = gfs2_meta_new(ip->i_gl, block);
95 gfs2_trans_add_bh(ip->i_gl, bh, 1);
96 gfs2_metatype_set(bh, GFS2_METATYPE_JD, GFS2_FORMAT_JD);
97 gfs2_buffer_clear_tail(bh, sizeof(struct gfs2_meta_header));
98 *bhp = bh;
99 return 0;
100}
101
102static int gfs2_dir_get_existing_buffer(struct gfs2_inode *ip, uint64_t block,
103 struct buffer_head **bhp)
104{
105 struct buffer_head *bh;
106 int error;
107
108 error = gfs2_meta_read(ip->i_gl, block, DIO_START | DIO_WAIT, &bh);
109 if (error)
110 return error;
111 if (gfs2_metatype_check(GFS2_SB(&ip->i_inode), bh, GFS2_METATYPE_JD)) {
112 brelse(bh);
113 return -EIO;
114 }
115 *bhp = bh;
116 return 0;
117}
118
119static int gfs2_dir_write_stuffed(struct gfs2_inode *ip, const char *buf,
120 unsigned int offset, unsigned int size)
121
122{
123 struct buffer_head *dibh;
124 int error;
125
126 error = gfs2_meta_inode_buffer(ip, &dibh);
127 if (error)
128 return error;
129
130 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
131 memcpy(dibh->b_data + offset + sizeof(struct gfs2_dinode), buf, size);
132 if (ip->i_di.di_size < offset + size)
133 ip->i_di.di_size = offset + size;
134 ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
135 gfs2_dinode_out(&ip->i_di, dibh->b_data);
136
137 brelse(dibh);
138
139 return size;
140}
141
142
143
144/**
145 * gfs2_dir_write_data - Write directory information to the inode
146 * @ip: The GFS2 inode
147 * @buf: The buffer containing information to be written
148 * @offset: The file offset to start writing at
149 * @size: The amount of data to write
150 *
151 * Returns: The number of bytes correctly written or error code
152 */
153static int gfs2_dir_write_data(struct gfs2_inode *ip, const char *buf,
154 uint64_t offset, unsigned int size)
155{
156 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
157 struct buffer_head *dibh;
158 uint64_t lblock, dblock;
159 uint32_t extlen = 0;
160 unsigned int o;
161 int copied = 0;
162 int error = 0;
163
164 if (!size)
165 return 0;
166
167 if (gfs2_is_stuffed(ip) &&
168 offset + size <= sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode))
169 return gfs2_dir_write_stuffed(ip, buf, (unsigned int)offset,
170 size);
171
172 if (gfs2_assert_warn(sdp, gfs2_is_jdata(ip)))
173 return -EINVAL;
174
175 if (gfs2_is_stuffed(ip)) {
176 error = gfs2_unstuff_dinode(ip, NULL, NULL);
177 if (error)
178 return error;
179 }
180
181 lblock = offset;
182 o = do_div(lblock, sdp->sd_jbsize) + sizeof(struct gfs2_meta_header);
183
184 while (copied < size) {
185 unsigned int amount;
186 struct buffer_head *bh;
187 int new;
188
189 amount = size - copied;
190 if (amount > sdp->sd_sb.sb_bsize - o)
191 amount = sdp->sd_sb.sb_bsize - o;
192
193 if (!extlen) {
194 new = 1;
195 error = gfs2_extent_map(&ip->i_inode, lblock, &new,
196 &dblock, &extlen);
197 if (error)
198 goto fail;
199 error = -EIO;
200 if (gfs2_assert_withdraw(sdp, dblock))
201 goto fail;
202 }
203
204 if (amount == sdp->sd_jbsize || new)
205 error = gfs2_dir_get_new_buffer(ip, dblock, &bh);
206 else
207 error = gfs2_dir_get_existing_buffer(ip, dblock, &bh);
208
209 if (error)
210 goto fail;
211
212 gfs2_trans_add_bh(ip->i_gl, bh, 1);
213 memcpy(bh->b_data + o, buf, amount);
214 brelse(bh);
215 if (error)
216 goto fail;
217
218 copied += amount;
219 lblock++;
220 dblock++;
221 extlen--;
222
223 o = sizeof(struct gfs2_meta_header);
224 }
225
226out:
227 error = gfs2_meta_inode_buffer(ip, &dibh);
228 if (error)
229 return error;
230
231 if (ip->i_di.di_size < offset + copied)
232 ip->i_di.di_size = offset + copied;
233 ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
234
235 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
236 gfs2_dinode_out(&ip->i_di, dibh->b_data);
237 brelse(dibh);
238
239 return copied;
240fail:
241 if (copied)
242 goto out;
243 return error;
244}
245
246static int gfs2_dir_read_stuffed(struct gfs2_inode *ip, char *buf,
247 unsigned int offset, unsigned int size)
248{
249 struct buffer_head *dibh;
250 int error;
251
252 error = gfs2_meta_inode_buffer(ip, &dibh);
253 if (!error) {
254 offset += sizeof(struct gfs2_dinode);
255 memcpy(buf, dibh->b_data + offset, size);
256 brelse(dibh);
257 }
258
259 return (error) ? error : size;
260}
261
262
263/**
264 * gfs2_dir_read_data - Read data from a directory inode
265 * @ip: The GFS2 Inode
266 * @buf: The buffer to place result into
267 * @offset: File offset to begin reading from
268 * @size: Amount of data to transfer
269 *
270 * Returns: The amount of data actually copied or the error
271 */
272static int gfs2_dir_read_data(struct gfs2_inode *ip, char *buf,
273 uint64_t offset, unsigned int size)
274{
275 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
276 uint64_t lblock, dblock;
277 uint32_t extlen = 0;
278 unsigned int o;
279 int copied = 0;
280 int error = 0;
281
282 if (offset >= ip->i_di.di_size)
283 return 0;
284
285 if ((offset + size) > ip->i_di.di_size)
286 size = ip->i_di.di_size - offset;
287
288 if (!size)
289 return 0;
290
291 if (gfs2_is_stuffed(ip))
292 return gfs2_dir_read_stuffed(ip, buf, (unsigned int)offset,
293 size);
294
295 if (gfs2_assert_warn(sdp, gfs2_is_jdata(ip)))
296 return -EINVAL;
297
298 lblock = offset;
299 o = do_div(lblock, sdp->sd_jbsize) + sizeof(struct gfs2_meta_header);
300
301 while (copied < size) {
302 unsigned int amount;
303 struct buffer_head *bh;
304 int new;
305
306 amount = size - copied;
307 if (amount > sdp->sd_sb.sb_bsize - o)
308 amount = sdp->sd_sb.sb_bsize - o;
309
310 if (!extlen) {
311 new = 0;
312 error = gfs2_extent_map(&ip->i_inode, lblock, &new,
313 &dblock, &extlen);
314 if (error)
315 goto fail;
316 }
317
318 if (extlen > 1)
319 gfs2_meta_ra(ip->i_gl, dblock, extlen);
320
321 if (dblock) {
322 if (new)
323 error = gfs2_dir_get_new_buffer(ip, dblock, &bh);
324 else
325 error = gfs2_dir_get_existing_buffer(ip, dblock, &bh);
326 if (error)
327 goto fail;
328 dblock++;
329 extlen--;
330 } else
331 bh = NULL;
332
333 if (bh) memcpy(buf, bh->b_data + o, amount); else memset(buf, 0, amount);
334 brelse(bh);
335 if (error)
336 goto fail;
337
338 copied += amount;
339 lblock++;
340
341 o = sizeof(struct gfs2_meta_header);
342 }
343
344 return copied;
345fail:
346 return (copied) ? copied : error;
347}
348
349typedef int (*gfs2_dscan_t)(const struct gfs2_dirent *dent,
350 const struct qstr *name,
351 void *opaque);
352
353static inline int __gfs2_dirent_find(const struct gfs2_dirent *dent,
354 const struct qstr *name, int ret)
355{
356 if (dent->de_inum.no_addr != 0 &&
357 be32_to_cpu(dent->de_hash) == name->hash &&
358 be16_to_cpu(dent->de_name_len) == name->len &&
359 memcmp((char *)(dent+1), name->name, name->len) == 0)
360 return ret;
361 return 0;
362}
363
364static int gfs2_dirent_find(const struct gfs2_dirent *dent,
365 const struct qstr *name,
366 void *opaque)
367{
368 return __gfs2_dirent_find(dent, name, 1);
369}
370
371static int gfs2_dirent_prev(const struct gfs2_dirent *dent,
372 const struct qstr *name,
373 void *opaque)
374{
375 return __gfs2_dirent_find(dent, name, 2);
376}
377
378/*
379 * name->name holds ptr to start of block.
380 * name->len holds size of block.
381 */
382static int gfs2_dirent_last(const struct gfs2_dirent *dent,
383 const struct qstr *name,
384 void *opaque)
385{
386 const char *start = name->name;
387 const char *end = (const char *)dent + be16_to_cpu(dent->de_rec_len);
388 if (name->len == (end - start))
389 return 1;
390 return 0;
391}
392
393static int gfs2_dirent_find_space(const struct gfs2_dirent *dent,
394 const struct qstr *name,
395 void *opaque)
396{
397 unsigned required = GFS2_DIRENT_SIZE(name->len);
398 unsigned actual = GFS2_DIRENT_SIZE(be16_to_cpu(dent->de_name_len));
399 unsigned totlen = be16_to_cpu(dent->de_rec_len);
400
401 if (!dent->de_inum.no_addr)
402 actual = GFS2_DIRENT_SIZE(0);
403 if ((totlen - actual) >= required)
404 return 1;
405 return 0;
406}
407
408struct dirent_gather {
409 const struct gfs2_dirent **pdent;
410 unsigned offset;
411};
412
413static int gfs2_dirent_gather(const struct gfs2_dirent *dent,
414 const struct qstr *name,
415 void *opaque)
416{
417 struct dirent_gather *g = opaque;
418 if (dent->de_inum.no_addr) {
419 g->pdent[g->offset++] = dent;
420 }
421 return 0;
422}
423
424/*
425 * Other possible things to check:
426 * - Inode located within filesystem size (and on valid block)
427 * - Valid directory entry type
428 * Not sure how heavy-weight we want to make this... could also check
429 * hash is correct for example, but that would take a lot of extra time.
430 * For now the most important thing is to check that the various sizes
431 * are correct.
432 */
433static int gfs2_check_dirent(struct gfs2_dirent *dent, unsigned int offset,
434 unsigned int size, unsigned int len, int first)
435{
436 const char *msg = "gfs2_dirent too small";
437 if (unlikely(size < sizeof(struct gfs2_dirent)))
438 goto error;
439 msg = "gfs2_dirent misaligned";
440 if (unlikely(offset & 0x7))
441 goto error;
442 msg = "gfs2_dirent points beyond end of block";
443 if (unlikely(offset + size > len))
444 goto error;
445 msg = "zero inode number";
446 if (unlikely(!first && !dent->de_inum.no_addr))
447 goto error;
448 msg = "name length is greater than space in dirent";
449 if (dent->de_inum.no_addr &&
450 unlikely(sizeof(struct gfs2_dirent)+be16_to_cpu(dent->de_name_len) >
451 size))
452 goto error;
453 return 0;
454error:
455 printk(KERN_WARNING "gfs2_check_dirent: %s (%s)\n", msg,
456 first ? "first in block" : "not first in block");
457 return -EIO;
458}
459
460static int gfs2_dirent_offset(const void *buf)
461{
462 const struct gfs2_meta_header *h = buf;
463 int offset;
464
465 BUG_ON(buf == NULL);
466
467 switch(be32_to_cpu(h->mh_type)) {
468 case GFS2_METATYPE_LF:
469 offset = sizeof(struct gfs2_leaf);
470 break;
471 case GFS2_METATYPE_DI:
472 offset = sizeof(struct gfs2_dinode);
473 break;
474 default:
475 goto wrong_type;
476 }
477 return offset;
478wrong_type:
479 printk(KERN_WARNING "gfs2_dirent_offset: wrong block type %u\n",
480 be32_to_cpu(h->mh_type));
481 return -1;
482}
483
484static struct gfs2_dirent *gfs2_dirent_scan(struct inode *inode,
485 void *buf,
486 unsigned int len, gfs2_dscan_t scan,
487 const struct qstr *name,
488 void *opaque)
489{
490 struct gfs2_dirent *dent, *prev;
491 unsigned offset;
492 unsigned size;
493 int ret = 0;
494
495 ret = gfs2_dirent_offset(buf);
496 if (ret < 0)
497 goto consist_inode;
498
499 offset = ret;
500 prev = NULL;
501 dent = (struct gfs2_dirent *)(buf + offset);
502 size = be16_to_cpu(dent->de_rec_len);
503 if (gfs2_check_dirent(dent, offset, size, len, 1))
504 goto consist_inode;
505 do {
506 ret = scan(dent, name, opaque);
507 if (ret)
508 break;
509 offset += size;
510 if (offset == len)
511 break;
512 prev = dent;
513 dent = (struct gfs2_dirent *)(buf + offset);
514 size = be16_to_cpu(dent->de_rec_len);
515 if (gfs2_check_dirent(dent, offset, size, len, 0))
516 goto consist_inode;
517 } while(1);
518
519 switch(ret) {
520 case 0:
521 return NULL;
522 case 1:
523 return dent;
524 case 2:
525 return prev ? prev : dent;
526 default:
527 BUG_ON(ret > 0);
528 return ERR_PTR(ret);
529 }
530
531consist_inode:
532 gfs2_consist_inode(GFS2_I(inode));
533 return ERR_PTR(-EIO);
534}
535
536
537/**
538 * dirent_first - Return the first dirent
539 * @dip: the directory
540 * @bh: The buffer
541 * @dent: Pointer to list of dirents
542 *
543 * return first dirent whether bh points to leaf or stuffed dinode
544 *
545 * Returns: IS_LEAF, IS_DINODE, or -errno
546 */
547
548static int dirent_first(struct gfs2_inode *dip, struct buffer_head *bh,
549 struct gfs2_dirent **dent)
550{
551 struct gfs2_meta_header *h = (struct gfs2_meta_header *)bh->b_data;
552
553 if (be32_to_cpu(h->mh_type) == GFS2_METATYPE_LF) {
554 if (gfs2_meta_check(GFS2_SB(&dip->i_inode), bh))
555 return -EIO;
556 *dent = (struct gfs2_dirent *)(bh->b_data +
557 sizeof(struct gfs2_leaf));
558 return IS_LEAF;
559 } else {
560 if (gfs2_metatype_check(GFS2_SB(&dip->i_inode), bh, GFS2_METATYPE_DI))
561 return -EIO;
562 *dent = (struct gfs2_dirent *)(bh->b_data +
563 sizeof(struct gfs2_dinode));
564 return IS_DINODE;
565 }
566}
567
568/**
569 * dirent_next - Next dirent
570 * @dip: the directory
571 * @bh: The buffer
572 * @dent: Pointer to list of dirents
573 *
574 * Returns: 0 on success, error code otherwise
575 */
576
577static int dirent_next(struct gfs2_inode *dip, struct buffer_head *bh,
578 struct gfs2_dirent **dent)
579{
580 struct gfs2_dirent *tmp, *cur;
581 char *bh_end;
582 uint16_t cur_rec_len;
583
584 cur = *dent;
585 bh_end = bh->b_data + bh->b_size;
586 cur_rec_len = be16_to_cpu(cur->de_rec_len);
587
588 if ((char *)cur + cur_rec_len >= bh_end) {
589 if ((char *)cur + cur_rec_len > bh_end) {
590 gfs2_consist_inode(dip);
591 return -EIO;
592 }
593 return -ENOENT;
594 }
595
596 tmp = (struct gfs2_dirent *)((char *)cur + cur_rec_len);
597
598 if ((char *)tmp + be16_to_cpu(tmp->de_rec_len) > bh_end) {
599 gfs2_consist_inode(dip);
600 return -EIO;
601 }
602
603 if (cur_rec_len == 0) {
604 gfs2_consist_inode(dip);
605 return -EIO;
606 }
607
608 /* Only the first dent could ever have de_inum.no_addr == 0 */
609 if (!tmp->de_inum.no_addr) {
610 gfs2_consist_inode(dip);
611 return -EIO;
612 }
613
614 *dent = tmp;
615
616 return 0;
617}
618
619/**
620 * dirent_del - Delete a dirent
621 * @dip: The GFS2 inode
622 * @bh: The buffer
623 * @prev: The previous dirent
624 * @cur: The current dirent
625 *
626 */
627
628static void dirent_del(struct gfs2_inode *dip, struct buffer_head *bh,
629 struct gfs2_dirent *prev, struct gfs2_dirent *cur)
630{
631 uint16_t cur_rec_len, prev_rec_len;
632
633 if (!cur->de_inum.no_addr) {
634 gfs2_consist_inode(dip);
635 return;
636 }
637
638 gfs2_trans_add_bh(dip->i_gl, bh, 1);
639
640 /* If there is no prev entry, this is the first entry in the block.
641 The de_rec_len is already as big as it needs to be. Just zero
642 out the inode number and return. */
643
644 if (!prev) {
645 cur->de_inum.no_addr = 0; /* No endianness worries */
646 return;
647 }
648
649 /* Combine this dentry with the previous one. */
650
651 prev_rec_len = be16_to_cpu(prev->de_rec_len);
652 cur_rec_len = be16_to_cpu(cur->de_rec_len);
653
654 if ((char *)prev + prev_rec_len != (char *)cur)
655 gfs2_consist_inode(dip);
656 if ((char *)cur + cur_rec_len > bh->b_data + bh->b_size)
657 gfs2_consist_inode(dip);
658
659 prev_rec_len += cur_rec_len;
660 prev->de_rec_len = cpu_to_be16(prev_rec_len);
661}
662
663/*
664 * Takes a dent from which to grab space as an argument. Returns the
665 * newly created dent.
666 */
667static struct gfs2_dirent *gfs2_init_dirent(struct inode *inode,
668 struct gfs2_dirent *dent,
669 const struct qstr *name,
670 struct buffer_head *bh)
671{
672 struct gfs2_inode *ip = GFS2_I(inode);
673 struct gfs2_dirent *ndent;
674 unsigned offset = 0, totlen;
675
676 if (dent->de_inum.no_addr)
677 offset = GFS2_DIRENT_SIZE(be16_to_cpu(dent->de_name_len));
678 totlen = be16_to_cpu(dent->de_rec_len);
679 BUG_ON(offset + name->len > totlen);
680 gfs2_trans_add_bh(ip->i_gl, bh, 1);
681 ndent = (struct gfs2_dirent *)((char *)dent + offset);
682 dent->de_rec_len = cpu_to_be16(offset);
683 gfs2_qstr2dirent(name, totlen - offset, ndent);
684 return ndent;
685}
686
687static struct gfs2_dirent *gfs2_dirent_alloc(struct inode *inode,
688 struct buffer_head *bh,
689 const struct qstr *name)
690{
691 struct gfs2_dirent *dent;
692 dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size,
693 gfs2_dirent_find_space, name, NULL);
694 if (!dent || IS_ERR(dent))
695 return dent;
696 return gfs2_init_dirent(inode, dent, name, bh);
697}
698
699static int get_leaf(struct gfs2_inode *dip, uint64_t leaf_no,
700 struct buffer_head **bhp)
701{
702 int error;
703
704 error = gfs2_meta_read(dip->i_gl, leaf_no, DIO_START | DIO_WAIT, bhp);
705 if (!error && gfs2_metatype_check(GFS2_SB(&dip->i_inode), *bhp, GFS2_METATYPE_LF)) {
706 /* printk(KERN_INFO "block num=%llu\n", leaf_no); */
707 error = -EIO;
708 }
709
710 return error;
711}
712
713/**
714 * get_leaf_nr - Get a leaf number associated with the index
715 * @dip: The GFS2 inode
716 * @index:
717 * @leaf_out:
718 *
719 * Returns: 0 on success, error code otherwise
720 */
721
722static int get_leaf_nr(struct gfs2_inode *dip, uint32_t index,
723 uint64_t *leaf_out)
724{
725 uint64_t leaf_no;
726 int error;
727
728 error = gfs2_dir_read_data(dip, (char *)&leaf_no,
729 index * sizeof(uint64_t),
730 sizeof(uint64_t));
731 if (error != sizeof(uint64_t))
732 return (error < 0) ? error : -EIO;
733
734 *leaf_out = be64_to_cpu(leaf_no);
735
736 return 0;
737}
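
/*
 * Illustrative sketch (plain C, not part of this file): how a name hash maps
 * to the hash-table slot that get_leaf_nr() reads.  The top di_depth bits of
 * the 32-bit hash select the slot; the slot's byte offset in the table is
 * then index * sizeof(uint64_t).  di_depth is at least 1 for an exhash dir.
 */
static unsigned int example_hash_to_index(unsigned int hash,
					  unsigned int di_depth)
{
	return hash >> (32 - di_depth);	/* e.g. depth 10 -> top 10 bits */
}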
738
739static int get_first_leaf(struct gfs2_inode *dip, uint32_t index,
740 struct buffer_head **bh_out)
741{
742 uint64_t leaf_no;
743 int error;
744
745 error = get_leaf_nr(dip, index, &leaf_no);
746 if (!error)
747 error = get_leaf(dip, leaf_no, bh_out);
748
749 return error;
750}
751
752static struct gfs2_dirent *gfs2_dirent_search(struct inode *inode,
753 const struct qstr *name,
754 gfs2_dscan_t scan,
755 struct buffer_head **pbh)
756{
757 struct buffer_head *bh;
758 struct gfs2_dirent *dent;
759 struct gfs2_inode *ip = GFS2_I(inode);
760 int error;
761
762 if (ip->i_di.di_flags & GFS2_DIF_EXHASH) {
763 struct gfs2_leaf *leaf;
764 unsigned hsize = 1 << ip->i_di.di_depth;
765 unsigned index;
766 u64 ln;
767 if (hsize * sizeof(u64) != ip->i_di.di_size) {
768 gfs2_consist_inode(ip);
769 return ERR_PTR(-EIO);
770 }
771
772 index = name->hash >> (32 - ip->i_di.di_depth);
773 error = get_first_leaf(ip, index, &bh);
774 if (error)
775 return ERR_PTR(error);
776 do {
777 dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size,
778 scan, name, NULL);
779 if (dent)
780 goto got_dent;
781 leaf = (struct gfs2_leaf *)bh->b_data;
782 ln = be64_to_cpu(leaf->lf_next);
783 brelse(bh);
784 if (!ln)
785 break;
786
787 error = get_leaf(ip, ln, &bh);
788 } while(!error);
789
790 return error ? ERR_PTR(error) : NULL;
791 }
792
793
794 error = gfs2_meta_inode_buffer(ip, &bh);
795 if (error)
796 return ERR_PTR(error);
797 dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size, scan, name, NULL);
798got_dent:
799 if (unlikely(dent == NULL || IS_ERR(dent))) {
800 brelse(bh);
801 bh = NULL;
802 }
803 *pbh = bh;
804 return dent;
805}
806
807static struct gfs2_leaf *new_leaf(struct inode *inode, struct buffer_head **pbh, u16 depth)
808{
809 struct gfs2_inode *ip = GFS2_I(inode);
810 u64 bn = gfs2_alloc_meta(ip);
811 struct buffer_head *bh = gfs2_meta_new(ip->i_gl, bn);
812 struct gfs2_leaf *leaf;
813 struct gfs2_dirent *dent;
814 struct qstr name = { .name = "", .len = 0, .hash = 0 };
815 if (!bh)
816 return NULL;
817
818 gfs2_trans_add_bh(ip->i_gl, bh, 1);
819 gfs2_metatype_set(bh, GFS2_METATYPE_LF, GFS2_FORMAT_LF);
820 leaf = (struct gfs2_leaf *)bh->b_data;
821 leaf->lf_depth = cpu_to_be16(depth);
822 leaf->lf_entries = cpu_to_be16(0);
823 leaf->lf_dirent_format = cpu_to_be16(GFS2_FORMAT_DE);
824 leaf->lf_next = cpu_to_be64(0);
825 memset(leaf->lf_reserved, 0, sizeof(leaf->lf_reserved));
826 dent = (struct gfs2_dirent *)(leaf+1);
827 gfs2_qstr2dirent(&name, bh->b_size - sizeof(struct gfs2_leaf), dent);
828 *pbh = bh;
829 return leaf;
830}
831
832/**
833 * dir_make_exhash - Convert a stuffed directory into an ExHash directory
834 * @inode: The directory inode
835 *
836 * Returns: 0 on success, error code otherwise
837 */
838
839static int dir_make_exhash(struct inode *inode)
840{
841 struct gfs2_inode *dip = GFS2_I(inode);
842 struct gfs2_sbd *sdp = GFS2_SB(inode);
843 struct gfs2_dirent *dent;
844 struct qstr args;
845 struct buffer_head *bh, *dibh;
846 struct gfs2_leaf *leaf;
847 int y;
848 uint32_t x;
849 uint64_t *lp, bn;
850 int error;
851
852 error = gfs2_meta_inode_buffer(dip, &dibh);
853 if (error)
854 return error;
855
856 /* Turn over a new leaf */
857
858 leaf = new_leaf(inode, &bh, 0);
859 if (!leaf)
860 return -ENOSPC;
861 bn = bh->b_blocknr;
862
863 gfs2_assert(sdp, dip->i_di.di_entries < (1 << 16));
864 leaf->lf_entries = cpu_to_be16(dip->i_di.di_entries);
865
866 /* Copy dirents */
867
868 gfs2_buffer_copy_tail(bh, sizeof(struct gfs2_leaf), dibh,
869 sizeof(struct gfs2_dinode));
870
871 /* Find last entry */
872
873 x = 0;
874 args.len = bh->b_size - sizeof(struct gfs2_dinode) +
875 sizeof(struct gfs2_leaf);
876 args.name = bh->b_data;
877 dent = gfs2_dirent_scan(&dip->i_inode, bh->b_data, bh->b_size,
878 gfs2_dirent_last, &args, NULL);
879 if (!dent) {
880 brelse(bh);
881 brelse(dibh);
882 return -EIO;
883 }
884 if (IS_ERR(dent)) {
885 brelse(bh);
886 brelse(dibh);
887 return PTR_ERR(dent);
888 }
889
890 /* Adjust the last dirent's record length
891 (Remember that dent still points to the last entry.) */
892
893 dent->de_rec_len = cpu_to_be16(be16_to_cpu(dent->de_rec_len) +
894 sizeof(struct gfs2_dinode) -
895 sizeof(struct gfs2_leaf));
896
897 brelse(bh);
898
899	/* We're done with the new leaf block, now set up the new
900 hash table. */
901
902 gfs2_trans_add_bh(dip->i_gl, dibh, 1);
903 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
904
905 lp = (uint64_t *)(dibh->b_data + sizeof(struct gfs2_dinode));
906
907 for (x = sdp->sd_hash_ptrs; x--; lp++)
908 *lp = cpu_to_be64(bn);
909
910 dip->i_di.di_size = sdp->sd_sb.sb_bsize / 2;
911 dip->i_di.di_blocks++;
912 dip->i_di.di_flags |= GFS2_DIF_EXHASH;
913 dip->i_di.di_payload_format = 0;
914
915 for (x = sdp->sd_hash_ptrs, y = -1; x; x >>= 1, y++) ;
916 dip->i_di.di_depth = y;
917
918 gfs2_dinode_out(&dip->i_di, dibh->b_data);
919
920 brelse(dibh);
921
922 return 0;
923}
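
/*
 * Illustrative sketch (plain C, not part of this file): the small loop near
 * the end of dir_make_exhash() computes log2(sd_hash_ptrs), i.e. the initial
 * table depth when every hash pointer still refers to the single new leaf.
 */
static int example_initial_depth(unsigned int hash_ptrs)
{
	int depth = -1;

	while (hash_ptrs) {	/* e.g. 512 pointers -> depth 9 */
		hash_ptrs >>= 1;
		depth++;
	}
	return depth;
}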
924
925/**
926 * dir_split_leaf - Split a leaf block into two
927 * @inode: The directory inode
928 * @name: The name of the entry being added; its hash selects
929 * the leaf to split
930 *
931 * Returns: 0 on success, error code on failure
932 */
933
934static int dir_split_leaf(struct inode *inode, const struct qstr *name)
935{
936 struct gfs2_inode *dip = GFS2_I(inode);
937 struct buffer_head *nbh, *obh, *dibh;
938 struct gfs2_leaf *nleaf, *oleaf;
939 struct gfs2_dirent *dent = NULL, *prev = NULL, *next = NULL, *new;
940 uint32_t start, len, half_len, divider;
941 uint64_t bn, *lp, leaf_no;
942 uint32_t index;
943 int x, moved = 0;
944 int error;
945
946 index = name->hash >> (32 - dip->i_di.di_depth);
947 error = get_leaf_nr(dip, index, &leaf_no);
948 if (error)
949 return error;
950
951 /* Get the old leaf block */
952 error = get_leaf(dip, leaf_no, &obh);
953 if (error)
954 return error;
955
956 oleaf = (struct gfs2_leaf *)obh->b_data;
957 if (dip->i_di.di_depth == be16_to_cpu(oleaf->lf_depth)) {
958 brelse(obh);
959 return 1; /* can't split */
960 }
961
962 gfs2_trans_add_bh(dip->i_gl, obh, 1);
963
964 nleaf = new_leaf(inode, &nbh, be16_to_cpu(oleaf->lf_depth) + 1);
965 if (!nleaf) {
966 brelse(obh);
967 return -ENOSPC;
968 }
969 bn = nbh->b_blocknr;
970
971 /* Compute the start and len of leaf pointers in the hash table. */
972 len = 1 << (dip->i_di.di_depth - be16_to_cpu(oleaf->lf_depth));
973 half_len = len >> 1;
974 if (!half_len) {
975 printk(KERN_WARNING "di_depth %u lf_depth %u index %u\n", dip->i_di.di_depth, be16_to_cpu(oleaf->lf_depth), index);
976 gfs2_consist_inode(dip);
977 error = -EIO;
978 goto fail_brelse;
979 }
980
981 start = (index & ~(len - 1));
982
983 /* Change the pointers.
984 Don't bother distinguishing stuffed from non-stuffed.
985 This code is complicated enough already. */
986 lp = kmalloc(half_len * sizeof(uint64_t), GFP_NOFS | __GFP_NOFAIL);
987 /* Change the pointers */
988 for (x = 0; x < half_len; x++)
989 lp[x] = cpu_to_be64(bn);
990
991 error = gfs2_dir_write_data(dip, (char *)lp, start * sizeof(uint64_t),
992 half_len * sizeof(uint64_t));
993 if (error != half_len * sizeof(uint64_t)) {
994 if (error >= 0)
995 error = -EIO;
996 goto fail_lpfree;
997 }
998
999 kfree(lp);
1000
1001 /* Compute the divider */
1002 divider = (start + half_len) << (32 - dip->i_di.di_depth);
1003
1004 /* Copy the entries */
1005 dirent_first(dip, obh, &dent);
1006
1007 do {
1008 next = dent;
1009 if (dirent_next(dip, obh, &next))
1010 next = NULL;
1011
1012 if (dent->de_inum.no_addr &&
1013 be32_to_cpu(dent->de_hash) < divider) {
1014 struct qstr str;
1015 str.name = (char*)(dent+1);
1016 str.len = be16_to_cpu(dent->de_name_len);
1017 str.hash = be32_to_cpu(dent->de_hash);
1018 new = gfs2_dirent_alloc(inode, nbh, &str);
1019 if (IS_ERR(new)) {
1020 error = PTR_ERR(new);
1021 break;
1022 }
1023
1024 new->de_inum = dent->de_inum; /* No endian worries */
1025 new->de_type = dent->de_type; /* No endian worries */
1026 nleaf->lf_entries = cpu_to_be16(be16_to_cpu(nleaf->lf_entries)+1);
1027
1028 dirent_del(dip, obh, prev, dent);
1029
1030 if (!oleaf->lf_entries)
1031 gfs2_consist_inode(dip);
1032 oleaf->lf_entries = cpu_to_be16(be16_to_cpu(oleaf->lf_entries)-1);
1033
1034 if (!prev)
1035 prev = dent;
1036
1037 moved = 1;
1038 } else {
1039 prev = dent;
1040 }
1041 dent = next;
1042 } while (dent);
1043
1044 oleaf->lf_depth = nleaf->lf_depth;
1045
1046 error = gfs2_meta_inode_buffer(dip, &dibh);
1047 if (!gfs2_assert_withdraw(GFS2_SB(&dip->i_inode), !error)) {
1048 dip->i_di.di_blocks++;
1049 gfs2_dinode_out(&dip->i_di, dibh->b_data);
1050 brelse(dibh);
1051 }
1052
1053 brelse(obh);
1054 brelse(nbh);
1055
1056 return error;
1057
1058fail_lpfree:
1059 kfree(lp);
1060
1061fail_brelse:
1062 brelse(obh);
1063 brelse(nbh);
1064 return error;
1065}
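
/*
 * Illustrative sketch (plain C, not part of this file): the arithmetic used
 * by dir_split_leaf() above.  A leaf at depth lf_depth is referenced by a
 * run of "len" consecutive hash-table pointers; the lower half of that run
 * is rewritten to point at the new leaf, and entries whose hash is below
 * "divider" are moved into it.
 */
static void example_split_range(unsigned int di_depth, unsigned int lf_depth,
				unsigned int index, unsigned int *start,
				unsigned int *half_len, unsigned int *divider)
{
	unsigned int len = 1u << (di_depth - lf_depth);

	*half_len = len >> 1;
	*start = index & ~(len - 1);
	*divider = (*start + *half_len) << (32 - di_depth);
}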
1066
1067/**
1068 * dir_double_exhash - Double size of ExHash table
1069 * @dip: The GFS2 dinode
1070 *
1071 * Returns: 0 on success, error code on failure
1072 */
1073
1074static int dir_double_exhash(struct gfs2_inode *dip)
1075{
1076 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
1077 struct buffer_head *dibh;
1078 uint32_t hsize;
1079 uint64_t *buf;
1080 uint64_t *from, *to;
1081 uint64_t block;
1082 int x;
1083 int error = 0;
1084
1085 hsize = 1 << dip->i_di.di_depth;
1086 if (hsize * sizeof(uint64_t) != dip->i_di.di_size) {
1087 gfs2_consist_inode(dip);
1088 return -EIO;
1089 }
1090
1091 /* Allocate both the "from" and "to" buffers in one big chunk */
1092
1093 buf = kcalloc(3, sdp->sd_hash_bsize, GFP_KERNEL | __GFP_NOFAIL);
1094
1095 for (block = dip->i_di.di_size >> sdp->sd_hash_bsize_shift; block--;) {
1096 error = gfs2_dir_read_data(dip, (char *)buf,
1097 block * sdp->sd_hash_bsize,
1098 sdp->sd_hash_bsize);
1099 if (error != sdp->sd_hash_bsize) {
1100 if (error >= 0)
1101 error = -EIO;
1102 goto fail;
1103 }
1104
1105 from = buf;
1106 to = (uint64_t *)((char *)buf + sdp->sd_hash_bsize);
1107
1108 for (x = sdp->sd_hash_ptrs; x--; from++) {
1109			*to++ = *from;	/* No endianness worries */
1110 *to++ = *from;
1111 }
1112
1113 error = gfs2_dir_write_data(dip,
1114 (char *)buf + sdp->sd_hash_bsize,
1115 block * sdp->sd_sb.sb_bsize,
1116 sdp->sd_sb.sb_bsize);
1117 if (error != sdp->sd_sb.sb_bsize) {
1118 if (error >= 0)
1119 error = -EIO;
1120 goto fail;
1121 }
1122 }
1123
1124 kfree(buf);
1125
1126 error = gfs2_meta_inode_buffer(dip, &dibh);
1127 if (!gfs2_assert_withdraw(sdp, !error)) {
1128 dip->i_di.di_depth++;
1129 gfs2_dinode_out(&dip->i_di, dibh->b_data);
1130 brelse(dibh);
1131 }
1132
1133 return error;
1134
1135 fail:
1136 kfree(buf);
1137
1138 return error;
1139}
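
/*
 * Illustrative sketch (plain C, not part of this file): the copy loop inside
 * dir_double_exhash().  Doubling the table just writes every existing leaf
 * pointer out twice, so both halves of each new pointer pair keep pointing
 * at the same leaf until that leaf is eventually split.
 */
static void example_double_table(const unsigned long long *from,
				 unsigned long long *to, unsigned int nptrs)
{
	unsigned int i;

	for (i = 0; i < nptrs; i++) {
		*to++ = from[i];
		*to++ = from[i];
	}
}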
1140
1141/**
1142 * compare_dents - compare directory entries by hash value
1143 * @a: first dent
1144 * @b: second dent
1145 *
1146 * When comparing the hash entries of @a to @b:
1147 * gt: returns 1
1148 * lt: returns -1
1149 * eq: returns 0
1150 */
1151
1152static int compare_dents(const void *a, const void *b)
1153{
1154 struct gfs2_dirent *dent_a, *dent_b;
1155 uint32_t hash_a, hash_b;
1156 int ret = 0;
1157
1158 dent_a = *(struct gfs2_dirent **)a;
1159 hash_a = be32_to_cpu(dent_a->de_hash);
1160
1161 dent_b = *(struct gfs2_dirent **)b;
1162 hash_b = be32_to_cpu(dent_b->de_hash);
1163
1164 if (hash_a > hash_b)
1165 ret = 1;
1166 else if (hash_a < hash_b)
1167 ret = -1;
1168 else {
1169 unsigned int len_a = be16_to_cpu(dent_a->de_name_len);
1170 unsigned int len_b = be16_to_cpu(dent_b->de_name_len);
1171
1172 if (len_a > len_b)
1173 ret = 1;
1174 else if (len_a < len_b)
1175 ret = -1;
1176 else
1177 ret = memcmp((char *)(dent_a + 1),
1178 (char *)(dent_b + 1),
1179 len_a);
1180 }
1181
1182 return ret;
1183}
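
/*
 * Illustrative usage sketch (not part of this file): compare_dents() orders
 * an array of *pointers* to dirents (by hash, then name length, then name),
 * so each comparator argument is really a struct gfs2_dirent **.  A
 * userspace equivalent would pass it to qsort() from <stdlib.h> like this;
 * darr and entries are hypothetical.
 */
static void example_sort_dirents(const struct gfs2_dirent **darr,
				 size_t entries)
{
	qsort((void *)darr, entries, sizeof(*darr), compare_dents);
}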
1184
1185/**
1186 * do_filldir_main - read out directory entries
1187 * @dip: The GFS2 inode
1188 * @offset: The offset in the file to read from
1189 * @opaque: opaque data to pass to filldir
1190 * @filldir: The function to pass entries to
1191 * @darr: an array of struct gfs2_dirent pointers to read
1192 * @entries: the number of entries in darr
1193 * @copied: pointer to int that's non-zero if an entry has been copied out
1194 *
1195 * Jump through some hoops to make sure that if there are hash collisions,
1196 * they are read out at the beginning of a buffer. We want to minimize
1197 * the possibility that they will fall into different readdir buffers or
1198 * that someone will want to seek to that location.
1199 *
1200 * Returns: errno, >0 on exception from filldir
1201 */
1202
1203static int do_filldir_main(struct gfs2_inode *dip, uint64_t *offset,
1204 void *opaque, gfs2_filldir_t filldir,
1205 const struct gfs2_dirent **darr, uint32_t entries,
1206 int *copied)
1207{
1208 const struct gfs2_dirent *dent, *dent_next;
1209 struct gfs2_inum inum;
1210 uint64_t off, off_next;
1211 unsigned int x, y;
1212 int run = 0;
1213 int error = 0;
1214
1215 sort(darr, entries, sizeof(struct gfs2_dirent *), compare_dents, NULL);
1216
1217 dent_next = darr[0];
1218 off_next = be32_to_cpu(dent_next->de_hash);
1219 off_next = gfs2_disk_hash2offset(off_next);
1220
1221 for (x = 0, y = 1; x < entries; x++, y++) {
1222 dent = dent_next;
1223 off = off_next;
1224
1225 if (y < entries) {
1226 dent_next = darr[y];
1227 off_next = be32_to_cpu(dent_next->de_hash);
1228 off_next = gfs2_disk_hash2offset(off_next);
1229
1230 if (off < *offset)
1231 continue;
1232 *offset = off;
1233
1234 if (off_next == off) {
1235 if (*copied && !run)
1236 return 1;
1237 run = 1;
1238 } else
1239 run = 0;
1240 } else {
1241 if (off < *offset)
1242 continue;
1243 *offset = off;
1244 }
1245
1246 gfs2_inum_in(&inum, (char *)&dent->de_inum);
1247
1248 error = filldir(opaque, (char *)(dent + 1),
1249 be16_to_cpu(dent->de_name_len),
1250 off, &inum,
1251 be16_to_cpu(dent->de_type));
1252 if (error)
1253 return 1;
1254
1255 *copied = 1;
1256 }
1257
1258	/* Increment *offset by one, so the next time we come into this
1259	   function, we get the next entry instead of the last one in the
1260	   current leaf */
1261
1262 (*offset)++;
1263
1264 return 0;
1265}
1266
1267static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque,
1268 gfs2_filldir_t filldir, int *copied,
1269 unsigned *depth, u64 leaf_no)
1270{
1271 struct gfs2_inode *ip = GFS2_I(inode);
1272 struct buffer_head *bh;
1273 struct gfs2_leaf *lf;
1274 unsigned entries = 0;
1275 unsigned leaves = 0;
1276 const struct gfs2_dirent **darr, *dent;
1277 struct dirent_gather g;
1278 struct buffer_head **larr;
1279 int leaf = 0;
1280 int error, i;
1281 u64 lfn = leaf_no;
1282
1283 do {
1284 error = get_leaf(ip, lfn, &bh);
1285 if (error)
1286 goto out;
1287 lf = (struct gfs2_leaf *)bh->b_data;
1288 if (leaves == 0)
1289 *depth = be16_to_cpu(lf->lf_depth);
1290 entries += be16_to_cpu(lf->lf_entries);
1291 leaves++;
1292 lfn = be64_to_cpu(lf->lf_next);
1293 brelse(bh);
1294 } while(lfn);
1295
1296 if (!entries)
1297 return 0;
1298
1299 error = -ENOMEM;
1300 larr = vmalloc((leaves + entries) * sizeof(void*));
1301 if (!larr)
1302 goto out;
1303 darr = (const struct gfs2_dirent **)(larr + leaves);
1304 g.pdent = darr;
1305 g.offset = 0;
1306 lfn = leaf_no;
1307
1308 do {
1309 error = get_leaf(ip, lfn, &bh);
1310 if (error)
1311 goto out_kfree;
1312 lf = (struct gfs2_leaf *)bh->b_data;
1313 lfn = be64_to_cpu(lf->lf_next);
1314 if (lf->lf_entries) {
1315 dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size,
1316 gfs2_dirent_gather, NULL, &g);
1317 error = PTR_ERR(dent);
1318 if (IS_ERR(dent)) {
1319 goto out_kfree;
1320 }
1321 error = 0;
1322 larr[leaf++] = bh;
1323 } else {
1324 brelse(bh);
1325 }
1326 } while(lfn);
1327
1328 error = do_filldir_main(ip, offset, opaque, filldir, darr,
1329 entries, copied);
1330out_kfree:
1331 for(i = 0; i < leaf; i++)
1332 brelse(larr[i]);
1333 vfree(larr);
1334out:
1335 return error;
1336}
1337
1338/**
1339 * dir_e_read - Reads the entries from a directory into a filldir buffer
1340 * @dip: dinode pointer
1341 * @offset: the hash of the last entry read shifted to the right once
1342 * @opaque: opaque data passed to the filldir function
1343 * @filldir: points to the filldir function to use
1344 *
1345 * Returns: errno
1346 */
1347
1348static int dir_e_read(struct inode *inode, uint64_t *offset, void *opaque,
1349 gfs2_filldir_t filldir)
1350{
1351 struct gfs2_inode *dip = GFS2_I(inode);
1352 struct gfs2_sbd *sdp = GFS2_SB(inode);
1353 uint32_t hsize, len = 0;
1354 uint32_t ht_offset, lp_offset, ht_offset_cur = -1;
1355 uint32_t hash, index;
1356 uint64_t *lp;
1357 int copied = 0;
1358 int error = 0;
1359 unsigned depth = 0;
1360
1361 hsize = 1 << dip->i_di.di_depth;
1362 if (hsize * sizeof(uint64_t) != dip->i_di.di_size) {
1363 gfs2_consist_inode(dip);
1364 return -EIO;
1365 }
1366
1367 hash = gfs2_dir_offset2hash(*offset);
1368 index = hash >> (32 - dip->i_di.di_depth);
1369
1370 lp = kmalloc(sdp->sd_hash_bsize, GFP_KERNEL);
1371 if (!lp)
1372 return -ENOMEM;
1373
1374 while (index < hsize) {
1375 lp_offset = index & (sdp->sd_hash_ptrs - 1);
1376 ht_offset = index - lp_offset;
1377
1378 if (ht_offset_cur != ht_offset) {
1379 error = gfs2_dir_read_data(dip, (char *)lp,
1380 ht_offset * sizeof(uint64_t),
1381 sdp->sd_hash_bsize);
1382 if (error != sdp->sd_hash_bsize) {
1383 if (error >= 0)
1384 error = -EIO;
1385 goto out;
1386 }
1387 ht_offset_cur = ht_offset;
1388 }
1389
1390 error = gfs2_dir_read_leaf(inode, offset, opaque, filldir,
1391 &copied, &depth,
1392 be64_to_cpu(lp[lp_offset]));
1393 if (error)
1394 break;
1395
1396 len = 1 << (dip->i_di.di_depth - depth);
1397 index = (index & ~(len - 1)) + len;
1398 }
1399
1400out:
1401 kfree(lp);
1402 if (error > 0)
1403 error = 0;
1404 return error;
1405}
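
/*
 * Illustrative sketch (plain C, not part of this file): the index advance at
 * the bottom of the dir_e_read() loop.  A leaf chain read at depth "depth"
 * is referenced by a run of identical hash-table pointers, so the scan can
 * jump straight to the next aligned run rather than revisiting the leaf.
 */
static unsigned int example_next_index(unsigned int index,
				       unsigned int di_depth,
				       unsigned int depth)
{
	unsigned int len = 1u << (di_depth - depth);

	return (index & ~(len - 1)) + len;
}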
1406
1407int gfs2_dir_read(struct inode *inode, uint64_t *offset, void *opaque,
1408 gfs2_filldir_t filldir)
1409{
1410 struct gfs2_inode *dip = GFS2_I(inode);
1411 struct dirent_gather g;
1412 const struct gfs2_dirent **darr, *dent;
1413 struct buffer_head *dibh;
1414 int copied = 0;
1415 int error;
1416
1417 if (!dip->i_di.di_entries)
1418 return 0;
1419
1420 if (dip->i_di.di_flags & GFS2_DIF_EXHASH)
1421 return dir_e_read(inode, offset, opaque, filldir);
1422
1423 if (!gfs2_is_stuffed(dip)) {
1424 gfs2_consist_inode(dip);
1425 return -EIO;
1426 }
1427
1428 error = gfs2_meta_inode_buffer(dip, &dibh);
1429 if (error)
1430 return error;
1431
1432 error = -ENOMEM;
1433 darr = kmalloc(dip->i_di.di_entries * sizeof(struct gfs2_dirent *),
1434 GFP_KERNEL);
1435 if (darr) {
1436 g.pdent = darr;
1437 g.offset = 0;
1438 dent = gfs2_dirent_scan(inode, dibh->b_data, dibh->b_size,
1439 gfs2_dirent_gather, NULL, &g);
1440 if (IS_ERR(dent)) {
1441 error = PTR_ERR(dent);
1442 goto out;
1443 }
1444 error = do_filldir_main(dip, offset, opaque, filldir, darr,
1445 dip->i_di.di_entries, &copied);
1446out:
1447 kfree(darr);
1448 }
1449
1450 if (error > 0)
1451 error = 0;
1452
1453 brelse(dibh);
1454
1455 return error;
1456}
1457
1458/**
1459 * gfs2_dir_search - Search a directory
1460 * @dir: The directory to search
1461 * @name: The name of the entry to look for
1462 * @inum: If found, filled with the entry's inode number (@type with its type)
1463 *
1464 * This routine searches a directory for a file or another directory.
1465 * Assumes a glock is held on dip.
1466 *
1467 * Returns: errno
1468 */
1469
1470int gfs2_dir_search(struct inode *dir, const struct qstr *name,
1471 struct gfs2_inum *inum, unsigned int *type)
1472{
1473 struct buffer_head *bh;
1474 struct gfs2_dirent *dent;
1475
1476 dent = gfs2_dirent_search(dir, name, gfs2_dirent_find, &bh);
1477 if (dent) {
1478 if (IS_ERR(dent))
1479 return PTR_ERR(dent);
1480 if (inum)
1481 gfs2_inum_in(inum, (char *)&dent->de_inum);
1482 if (type)
1483 *type = be16_to_cpu(dent->de_type);
1484 brelse(bh);
1485 return 0;
1486 }
1487 return -ENOENT;
1488}
1489
1490static int dir_new_leaf(struct inode *inode, const struct qstr *name)
1491{
1492 struct buffer_head *bh, *obh;
1493 struct gfs2_inode *ip = GFS2_I(inode);
1494 struct gfs2_leaf *leaf, *oleaf;
1495 int error;
1496 u32 index;
1497 u64 bn;
1498
1499 index = name->hash >> (32 - ip->i_di.di_depth);
1500 error = get_first_leaf(ip, index, &obh);
1501 if (error)
1502 return error;
1503 do {
1504 oleaf = (struct gfs2_leaf *)obh->b_data;
1505 bn = be64_to_cpu(oleaf->lf_next);
1506 if (!bn)
1507 break;
1508 brelse(obh);
1509 error = get_leaf(ip, bn, &obh);
1510 if (error)
1511 return error;
1512 } while(1);
1513
1514 gfs2_trans_add_bh(ip->i_gl, obh, 1);
1515
1516 leaf = new_leaf(inode, &bh, be16_to_cpu(oleaf->lf_depth));
1517 if (!leaf) {
1518 brelse(obh);
1519 return -ENOSPC;
1520 }
1521 oleaf->lf_next = cpu_to_be64(bh->b_blocknr);
1522 brelse(bh);
1523 brelse(obh);
1524
1525 error = gfs2_meta_inode_buffer(ip, &bh);
1526 if (error)
1527 return error;
1528 gfs2_trans_add_bh(ip->i_gl, bh, 1);
1529 ip->i_di.di_blocks++;
1530 gfs2_dinode_out(&ip->i_di, bh->b_data);
1531 brelse(bh);
1532 return 0;
1533}
1534
1535/**
1536 * gfs2_dir_add - Add new filename into directory
1537 * @inode: The directory inode
1538 * @name: The new name
1539 * @inum: The inode number of the entry
1540 * @type: The type of the entry
1541 *
1542 * Returns: 0 on success, error code on failure
1543 */
1544
1545int gfs2_dir_add(struct inode *inode, const struct qstr *name,
1546 const struct gfs2_inum *inum, unsigned type)
1547{
1548 struct gfs2_inode *ip = GFS2_I(inode);
1549 struct buffer_head *bh;
1550 struct gfs2_dirent *dent;
1551 struct gfs2_leaf *leaf;
1552 int error;
1553
1554 while(1) {
1555 dent = gfs2_dirent_search(inode, name, gfs2_dirent_find_space,
1556 &bh);
1557 if (dent) {
1558 if (IS_ERR(dent))
1559 return PTR_ERR(dent);
1560 dent = gfs2_init_dirent(inode, dent, name, bh);
1561 gfs2_inum_out(inum, (char *)&dent->de_inum);
1562 dent->de_type = cpu_to_be16(type);
1563 if (ip->i_di.di_flags & GFS2_DIF_EXHASH) {
1564 leaf = (struct gfs2_leaf *)bh->b_data;
1565 leaf->lf_entries = cpu_to_be16(be16_to_cpu(leaf->lf_entries) + 1);
1566 }
1567 brelse(bh);
1568 error = gfs2_meta_inode_buffer(ip, &bh);
1569 if (error)
1570 break;
1571 gfs2_trans_add_bh(ip->i_gl, bh, 1);
1572 ip->i_di.di_entries++;
1573 ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
1574 gfs2_dinode_out(&ip->i_di, bh->b_data);
1575 brelse(bh);
1576 error = 0;
1577 break;
1578 }
1579 if (!(ip->i_di.di_flags & GFS2_DIF_EXHASH)) {
1580 error = dir_make_exhash(inode);
1581 if (error)
1582 break;
1583 continue;
1584 }
1585 error = dir_split_leaf(inode, name);
1586 if (error == 0)
1587 continue;
1588 if (error < 0)
1589 break;
1590 if (ip->i_di.di_depth < GFS2_DIR_MAX_DEPTH) {
1591 error = dir_double_exhash(ip);
1592 if (error)
1593 break;
1594 error = dir_split_leaf(inode, name);
1595 if (error < 0)
1596 break;
1597 if (error == 0)
1598 continue;
1599 }
1600 error = dir_new_leaf(inode, name);
1601 if (!error)
1602 continue;
1603 error = -ENOSPC;
1604 break;
1605 }
1606 return error;
1607}
1608
1609
1610/**
1611 * gfs2_dir_del - Delete a directory entry
1612 * @dip: The GFS2 inode
1613 * @filename: The filename
1614 *
1615 * Returns: 0 on success, error code on failure
1616 */
1617
1618int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *name)
1619{
1620 struct gfs2_dirent *dent, *prev = NULL;
1621 struct buffer_head *bh;
1622 int error;
1623
1624	/* Returns _either_ the entry (if it's first in the block) or the
1625	   previous entry otherwise */
1626 dent = gfs2_dirent_search(&dip->i_inode, name, gfs2_dirent_prev, &bh);
1627 if (!dent) {
1628 gfs2_consist_inode(dip);
1629 return -EIO;
1630 }
1631 if (IS_ERR(dent)) {
1632 gfs2_consist_inode(dip);
1633 return PTR_ERR(dent);
1634 }
1635 /* If not first in block, adjust pointers accordingly */
1636 if (gfs2_dirent_find(dent, name, NULL) == 0) {
1637 prev = dent;
1638 dent = (struct gfs2_dirent *)((char *)dent + be16_to_cpu(prev->de_rec_len));
1639 }
1640
1641 dirent_del(dip, bh, prev, dent);
1642 if (dip->i_di.di_flags & GFS2_DIF_EXHASH) {
1643 struct gfs2_leaf *leaf = (struct gfs2_leaf *)bh->b_data;
1644 u16 entries = be16_to_cpu(leaf->lf_entries);
1645 if (!entries)
1646 gfs2_consist_inode(dip);
1647 leaf->lf_entries = cpu_to_be16(--entries);
1648 }
1649 brelse(bh);
1650
1651 error = gfs2_meta_inode_buffer(dip, &bh);
1652 if (error)
1653 return error;
1654
1655 if (!dip->i_di.di_entries)
1656 gfs2_consist_inode(dip);
1657 gfs2_trans_add_bh(dip->i_gl, bh, 1);
1658 dip->i_di.di_entries--;
1659 dip->i_di.di_mtime = dip->i_di.di_ctime = get_seconds();
1660 gfs2_dinode_out(&dip->i_di, bh->b_data);
1661 brelse(bh);
1662 mark_inode_dirty(&dip->i_inode);
1663
1664 return error;
1665}
1666
1667/**
1668 * gfs2_dir_mvino - Change inode number of directory entry
1669 * @dip: The GFS2 inode
1670 * @filename: The name of the entry to update
1671 * @inum: The new inode number (@new_type gives its type)
1672 *
1673 * This routine changes the inode number of a directory entry. It's used
1674 * by rename to change ".." when a directory is moved.
1675 * Assumes a glock is held on dip.
1676 *
1677 * Returns: errno
1678 */
1679
1680int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
1681 struct gfs2_inum *inum, unsigned int new_type)
1682{
1683 struct buffer_head *bh;
1684 struct gfs2_dirent *dent;
1685 int error;
1686
1687 dent = gfs2_dirent_search(&dip->i_inode, filename, gfs2_dirent_find, &bh);
1688 if (!dent) {
1689 gfs2_consist_inode(dip);
1690 return -EIO;
1691 }
1692 if (IS_ERR(dent))
1693 return PTR_ERR(dent);
1694
1695 gfs2_trans_add_bh(dip->i_gl, bh, 1);
1696 gfs2_inum_out(inum, (char *)&dent->de_inum);
1697 dent->de_type = cpu_to_be16(new_type);
1698
1699 if (dip->i_di.di_flags & GFS2_DIF_EXHASH) {
1700 brelse(bh);
1701 error = gfs2_meta_inode_buffer(dip, &bh);
1702 if (error)
1703 return error;
1704 gfs2_trans_add_bh(dip->i_gl, bh, 1);
1705 }
1706
1707 dip->i_di.di_mtime = dip->i_di.di_ctime = get_seconds();
1708 gfs2_dinode_out(&dip->i_di, bh->b_data);
1709 brelse(bh);
1710 return 0;
1711}
1712
1713/**
1714 * foreach_leaf - call a function for each leaf in a directory
1715 * @dip: the directory
1716 * @lc: the function to call for each leaf
1717 * @data: private data to pass to it
1718 *
1719 * Returns: errno
1720 */
1721
1722static int foreach_leaf(struct gfs2_inode *dip, leaf_call_t lc, void *data)
1723{
1724 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
1725 struct buffer_head *bh;
1726 struct gfs2_leaf *leaf;
1727 uint32_t hsize, len;
1728 uint32_t ht_offset, lp_offset, ht_offset_cur = -1;
1729 uint32_t index = 0;
1730 uint64_t *lp;
1731 uint64_t leaf_no;
1732 int error = 0;
1733
1734 hsize = 1 << dip->i_di.di_depth;
1735 if (hsize * sizeof(uint64_t) != dip->i_di.di_size) {
1736 gfs2_consist_inode(dip);
1737 return -EIO;
1738 }
1739
1740 lp = kmalloc(sdp->sd_hash_bsize, GFP_KERNEL);
1741 if (!lp)
1742 return -ENOMEM;
1743
1744 while (index < hsize) {
1745 lp_offset = index & (sdp->sd_hash_ptrs - 1);
1746 ht_offset = index - lp_offset;
1747
1748 if (ht_offset_cur != ht_offset) {
1749 error = gfs2_dir_read_data(dip, (char *)lp,
1750 ht_offset * sizeof(uint64_t),
1751 sdp->sd_hash_bsize);
1752 if (error != sdp->sd_hash_bsize) {
1753 if (error >= 0)
1754 error = -EIO;
1755 goto out;
1756 }
1757 ht_offset_cur = ht_offset;
1758 }
1759
1760 leaf_no = be64_to_cpu(lp[lp_offset]);
1761 if (leaf_no) {
1762 error = get_leaf(dip, leaf_no, &bh);
1763 if (error)
1764 goto out;
1765 leaf = (struct gfs2_leaf *)bh->b_data;
1766 brelse(bh);
1767
1768 len = 1 << (dip->i_di.di_depth - be16_to_cpu(leaf->lf_depth));
1769
1770 error = lc(dip, index, len, leaf_no, data);
1771 if (error)
1772 goto out;
1773
1774 index = (index & ~(len - 1)) + len;
1775 } else
1776 index++;
1777 }
1778
1779 if (index != hsize) {
1780 gfs2_consist_inode(dip);
1781 error = -EIO;
1782 }
1783
1784 out:
1785 kfree(lp);
1786
1787 return error;
1788}
1789
1790/**
1791 * leaf_dealloc - Deallocate a directory leaf
1792 * @dip: the directory
1793 * @index: the hash table offset in the directory
1794 * @len: the number of pointers to this leaf
1795 * @leaf_no: the leaf number
1796 * @data: not used
1797 *
1798 * Returns: errno
1799 */
1800
1801static int leaf_dealloc(struct gfs2_inode *dip, uint32_t index, uint32_t len,
1802 uint64_t leaf_no, void *data)
1803{
1804 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
1805 struct gfs2_leaf *tmp_leaf;
1806 struct gfs2_rgrp_list rlist;
1807 struct buffer_head *bh, *dibh;
1808 uint64_t blk, nblk;
1809 unsigned int rg_blocks = 0, l_blocks = 0;
1810 char *ht;
1811 unsigned int x, size = len * sizeof(uint64_t);
1812 int error;
1813
1814 memset(&rlist, 0, sizeof(struct gfs2_rgrp_list));
1815
1816 ht = kzalloc(size, GFP_KERNEL);
1817 if (!ht)
1818 return -ENOMEM;
1819
1820 gfs2_alloc_get(dip);
1821
1822 error = gfs2_quota_hold(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
1823 if (error)
1824 goto out;
1825
1826 error = gfs2_rindex_hold(sdp, &dip->i_alloc.al_ri_gh);
1827 if (error)
1828 goto out_qs;
1829
1830 /* Count the number of leaves */
1831
1832 for (blk = leaf_no; blk; blk = nblk) {
1833 error = get_leaf(dip, blk, &bh);
1834 if (error)
1835 goto out_rlist;
1836 tmp_leaf = (struct gfs2_leaf *)bh->b_data;
1837 nblk = be64_to_cpu(tmp_leaf->lf_next);
1838 brelse(bh);
1839
1840 gfs2_rlist_add(sdp, &rlist, blk);
1841 l_blocks++;
1842 }
1843
1844 gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE, 0);
1845
1846 for (x = 0; x < rlist.rl_rgrps; x++) {
1847 struct gfs2_rgrpd *rgd;
1848 rgd = rlist.rl_ghs[x].gh_gl->gl_object;
1849 rg_blocks += rgd->rd_ri.ri_length;
1850 }
1851
1852 error = gfs2_glock_nq_m(rlist.rl_rgrps, rlist.rl_ghs);
1853 if (error)
1854 goto out_rlist;
1855
1856 error = gfs2_trans_begin(sdp,
1857 rg_blocks + (DIV_ROUND_UP(size, sdp->sd_jbsize) + 1) +
1858 RES_DINODE + RES_STATFS + RES_QUOTA, l_blocks);
1859 if (error)
1860 goto out_rg_gunlock;
1861
1862 for (blk = leaf_no; blk; blk = nblk) {
1863 error = get_leaf(dip, blk, &bh);
1864 if (error)
1865 goto out_end_trans;
1866 tmp_leaf = (struct gfs2_leaf *)bh->b_data;
1867 nblk = be64_to_cpu(tmp_leaf->lf_next);
1868 brelse(bh);
1869
1870 gfs2_free_meta(dip, blk, 1);
1871
1872 if (!dip->i_di.di_blocks)
1873 gfs2_consist_inode(dip);
1874 dip->i_di.di_blocks--;
1875 }
1876
1877 error = gfs2_dir_write_data(dip, ht, index * sizeof(uint64_t), size);
1878 if (error != size) {
1879 if (error >= 0)
1880 error = -EIO;
1881 goto out_end_trans;
1882 }
1883
1884 error = gfs2_meta_inode_buffer(dip, &dibh);
1885 if (error)
1886 goto out_end_trans;
1887
1888 gfs2_trans_add_bh(dip->i_gl, dibh, 1);
1889 gfs2_dinode_out(&dip->i_di, dibh->b_data);
1890 brelse(dibh);
1891
1892 out_end_trans:
1893 gfs2_trans_end(sdp);
1894
1895 out_rg_gunlock:
1896 gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs);
1897
1898 out_rlist:
1899 gfs2_rlist_free(&rlist);
1900 gfs2_glock_dq_uninit(&dip->i_alloc.al_ri_gh);
1901
1902 out_qs:
1903 gfs2_quota_unhold(dip);
1904
1905 out:
1906 gfs2_alloc_put(dip);
1907 kfree(ht);
1908
1909 return error;
1910}
1911
1912/**
1913 * gfs2_dir_exhash_dealloc - free all the leaf blocks in a directory
1914 * @dip: the directory
1915 *
1916 * Dealloc all on-disk directory leaves to FREEMETA state
1917 * Change on-disk inode type to "regular file"
1918 *
1919 * Returns: errno
1920 */
1921
1922int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip)
1923{
1924 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
1925 struct buffer_head *bh;
1926 int error;
1927
1928 /* Dealloc on-disk leaves to FREEMETA state */
1929 error = foreach_leaf(dip, leaf_dealloc, NULL);
1930 if (error)
1931 return error;
1932
1933 /* Make this a regular file in case we crash.
1934 (We don't want to free these blocks a second time.) */
1935
1936 error = gfs2_trans_begin(sdp, RES_DINODE, 0);
1937 if (error)
1938 return error;
1939
1940 error = gfs2_meta_inode_buffer(dip, &bh);
1941 if (!error) {
1942 gfs2_trans_add_bh(dip->i_gl, bh, 1);
1943 ((struct gfs2_dinode *)bh->b_data)->di_mode =
1944 cpu_to_be32(S_IFREG);
1945 brelse(bh);
1946 }
1947
1948 gfs2_trans_end(sdp);
1949
1950 return error;
1951}
1952
1953/**
1954 * gfs2_diradd_alloc_required - find if adding entry will require an allocation
1955 * @inode: the directory being written to
1956 * @name: the filename that's going to be added
1957 *
1958 * Returns: 1 if alloc required, 0 if not, -ve on error
1959 */
1960
1961int gfs2_diradd_alloc_required(struct inode *inode, const struct qstr *name)
1962{
1963 struct gfs2_dirent *dent;
1964 struct buffer_head *bh;
1965
1966 dent = gfs2_dirent_search(inode, name, gfs2_dirent_find_space, &bh);
1967 if (!dent) {
1968 return 1;
1969 }
1970 if (IS_ERR(dent))
1971 return PTR_ERR(dent);
1972 brelse(bh);
1973 return 0;
1974}
1975
diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h
new file mode 100644
index 000000000000..173403095eb2
--- /dev/null
+++ b/fs/gfs2/dir.h
@@ -0,0 +1,73 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __DIR_DOT_H__
11#define __DIR_DOT_H__
12
13/**
14 * gfs2_filldir_t - Report a directory entry to the caller of gfs2_dir_read()
15 * @opaque: opaque data used by the function
16 * @name: the name of the directory entry
17 * @length: the length of the name
18 * @offset: the entry's offset in the directory
19 * @inum: the inode number the entry points to
20 * @type: the type of inode the entry points to
21 *
22 * Returns: 0 on success, 1 if buffer full
23 */
24
25typedef int (*gfs2_filldir_t) (void *opaque,
26 const char *name, unsigned int length,
27 uint64_t offset,
28 struct gfs2_inum *inum, unsigned int type);
29
30int gfs2_dir_search(struct inode *dir, const struct qstr *filename,
31 struct gfs2_inum *inum, unsigned int *type);
32int gfs2_dir_add(struct inode *inode, const struct qstr *filename,
33 const struct gfs2_inum *inum, unsigned int type);
34int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *filename);
35int gfs2_dir_read(struct inode *inode, uint64_t * offset, void *opaque,
36 gfs2_filldir_t filldir);
37int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
38 struct gfs2_inum *new_inum, unsigned int new_type);
39
40int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip);
41
42int gfs2_diradd_alloc_required(struct inode *dir,
43 const struct qstr *filename);
44int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, uint64_t block,
45 struct buffer_head **bhp);
46
47static inline uint32_t gfs2_disk_hash(const char *data, int len)
48{
49 return crc32_le(0xFFFFFFFF, data, len) ^ 0xFFFFFFFF;
50}
51
52
53static inline void gfs2_str2qstr(struct qstr *name, const char *fname)
54{
55 name->name = fname;
56 name->len = strlen(fname);
57 name->hash = gfs2_disk_hash(name->name, name->len);
58}
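
/* Illustrative usage sketch (not part of this header): callers typically
 * fill a qstr with gfs2_str2qstr() before a directory operation so the
 * on-disk hash is computed exactly once; the wrapper below is hypothetical. */
static inline int example_lookup_by_name(struct inode *dir, const char *fname,
					 struct gfs2_inum *inum,
					 unsigned int *type)
{
	struct qstr q;

	gfs2_str2qstr(&q, fname);
	return gfs2_dir_search(dir, &q, inum, type);
}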
59
60/* N.B. This probably ought to take inum & type as args as well */
61static inline void gfs2_qstr2dirent(const struct qstr *name, u16 reclen, struct gfs2_dirent *dent)
62{
63 dent->de_inum.no_addr = cpu_to_be64(0);
64 dent->de_inum.no_formal_ino = cpu_to_be64(0);
65 dent->de_hash = cpu_to_be32(name->hash);
66 dent->de_rec_len = cpu_to_be16(reclen);
67 dent->de_name_len = cpu_to_be16(name->len);
68 dent->de_type = cpu_to_be16(0);
69 memset(dent->__pad, 0, sizeof(dent->__pad));
70 memcpy((char*)(dent+1), name->name, name->len);
71}
72
73#endif /* __DIR_DOT_H__ */
diff --git a/fs/gfs2/eaops.c b/fs/gfs2/eaops.c
new file mode 100644
index 000000000000..3ace242f2b16
--- /dev/null
+++ b/fs/gfs2/eaops.c
@@ -0,0 +1,230 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/xattr.h>
16#include <linux/gfs2_ondisk.h>
17#include <asm/uaccess.h>
18
19#include "gfs2.h"
20#include "lm_interface.h"
21#include "incore.h"
22#include "acl.h"
23#include "eaops.h"
24#include "eattr.h"
25#include "util.h"
26
27/**
28 * gfs2_ea_name2type - get the type of the ea, and strip the type prefix from the name
29 * @name: ea name, with its type prefix; @truncated_name returns a pointer past the prefix
30 *
31 * Returns: GFS2_EATYPE_XXX
32 */
33
34unsigned int gfs2_ea_name2type(const char *name, char **truncated_name)
35{
36 unsigned int type;
37
38 if (strncmp(name, "system.", 7) == 0) {
39 type = GFS2_EATYPE_SYS;
40 if (truncated_name)
41 *truncated_name = strchr(name, '.') + 1;
42 } else if (strncmp(name, "user.", 5) == 0) {
43 type = GFS2_EATYPE_USR;
44 if (truncated_name)
45 *truncated_name = strchr(name, '.') + 1;
46 } else if (strncmp(name, "security.", 9) == 0) {
47 type = GFS2_EATYPE_SECURITY;
48 if (truncated_name)
49 *truncated_name = strchr(name, '.') + 1;
50 } else {
51 type = GFS2_EATYPE_UNUSED;
52 if (truncated_name)
53 *truncated_name = NULL;
54 }
55
56 return type;
57}
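
/*
 * Illustrative usage sketch (not part of this file): gfs2_ea_name2type()
 * splits a VFS xattr name into its GFS2 type and the on-disk name.  For
 * example "user.comment" yields GFS2_EATYPE_USR with the truncated name
 * pointing at "comment"; an unknown prefix yields GFS2_EATYPE_UNUSED.
 */
static int example_xattr_is_supported(const char *name)
{
	char *stripped;

	return gfs2_ea_name2type(name, &stripped) != GFS2_EATYPE_UNUSED;
}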
58
59static int user_eo_get(struct gfs2_inode *ip, struct gfs2_ea_request *er)
60{
61 struct inode *inode = &ip->i_inode;
62 int error = permission(inode, MAY_READ, NULL);
63 if (error)
64 return error;
65
66 return gfs2_ea_get_i(ip, er);
67}
68
69static int user_eo_set(struct gfs2_inode *ip, struct gfs2_ea_request *er)
70{
71 struct inode *inode = &ip->i_inode;
72
73 if (S_ISREG(inode->i_mode) ||
74 (S_ISDIR(inode->i_mode) && !(inode->i_mode & S_ISVTX))) {
75 int error = permission(inode, MAY_WRITE, NULL);
76 if (error)
77 return error;
78 } else
79 return -EPERM;
80
81 return gfs2_ea_set_i(ip, er);
82}
83
84static int user_eo_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er)
85{
86 struct inode *inode = &ip->i_inode;
87
88 if (S_ISREG(inode->i_mode) ||
89 (S_ISDIR(inode->i_mode) && !(inode->i_mode & S_ISVTX))) {
90 int error = permission(inode, MAY_WRITE, NULL);
91 if (error)
92 return error;
93 } else
94 return -EPERM;
95
96 return gfs2_ea_remove_i(ip, er);
97}
98
99static int system_eo_get(struct gfs2_inode *ip, struct gfs2_ea_request *er)
100{
101 if (!GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len) &&
102 !GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len) &&
103 !capable(CAP_SYS_ADMIN))
104 return -EPERM;
105
106 if (GFS2_SB(&ip->i_inode)->sd_args.ar_posix_acl == 0 &&
107 (GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len) ||
108 GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len)))
109 return -EOPNOTSUPP;
110
111
112
113 return gfs2_ea_get_i(ip, er);
114}
115
116static int system_eo_set(struct gfs2_inode *ip, struct gfs2_ea_request *er)
117{
118 int remove = 0;
119 int error;
120
121 if (GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len)) {
122 if (!(er->er_flags & GFS2_ERF_MODE)) {
123 er->er_mode = ip->i_di.di_mode;
124 er->er_flags |= GFS2_ERF_MODE;
125 }
126 error = gfs2_acl_validate_set(ip, 1, er,
127 &remove, &er->er_mode);
128 if (error)
129 return error;
130 error = gfs2_ea_set_i(ip, er);
131 if (error)
132 return error;
133 if (remove)
134 gfs2_ea_remove_i(ip, er);
135 return 0;
136
137 } else if (GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len)) {
138 error = gfs2_acl_validate_set(ip, 0, er,
139 &remove, NULL);
140 if (error)
141 return error;
142 if (!remove)
143 error = gfs2_ea_set_i(ip, er);
144 else {
145 error = gfs2_ea_remove_i(ip, er);
146 if (error == -ENODATA)
147 error = 0;
148 }
149 return error;
150 }
151
152 return -EPERM;
153}
154
155static int system_eo_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er)
156{
157 if (GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len)) {
158 int error = gfs2_acl_validate_remove(ip, 1);
159 if (error)
160 return error;
161
162 } else if (GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len)) {
163 int error = gfs2_acl_validate_remove(ip, 0);
164 if (error)
165 return error;
166
167 } else
168 return -EPERM;
169
170 return gfs2_ea_remove_i(ip, er);
171}
172
173static int security_eo_get(struct gfs2_inode *ip, struct gfs2_ea_request *er)
174{
175 struct inode *inode = &ip->i_inode;
176 int error = permission(inode, MAY_READ, NULL);
177 if (error)
178 return error;
179
180 return gfs2_ea_get_i(ip, er);
181}
182
183static int security_eo_set(struct gfs2_inode *ip, struct gfs2_ea_request *er)
184{
185 struct inode *inode = &ip->i_inode;
186 int error = permission(inode, MAY_WRITE, NULL);
187 if (error)
188 return error;
189
190 return gfs2_ea_set_i(ip, er);
191}
192
193static int security_eo_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er)
194{
195 struct inode *inode = &ip->i_inode;
196 int error = permission(inode, MAY_WRITE, NULL);
197 if (error)
198 return error;
199
200 return gfs2_ea_remove_i(ip, er);
201}
202
203static struct gfs2_eattr_operations gfs2_user_eaops = {
204 .eo_get = user_eo_get,
205 .eo_set = user_eo_set,
206 .eo_remove = user_eo_remove,
207 .eo_name = "user",
208};
209
210struct gfs2_eattr_operations gfs2_system_eaops = {
211 .eo_get = system_eo_get,
212 .eo_set = system_eo_set,
213 .eo_remove = system_eo_remove,
214 .eo_name = "system",
215};
216
217static struct gfs2_eattr_operations gfs2_security_eaops = {
218 .eo_get = security_eo_get,
219 .eo_set = security_eo_set,
220 .eo_remove = security_eo_remove,
221 .eo_name = "security",
222};
223
224struct gfs2_eattr_operations *gfs2_ea_ops[] = {
225 NULL,
226 &gfs2_user_eaops,
227 &gfs2_system_eaops,
228 &gfs2_security_eaops,
229};
230
diff --git a/fs/gfs2/eaops.h b/fs/gfs2/eaops.h
new file mode 100644
index 000000000000..3dece17e3116
--- /dev/null
+++ b/fs/gfs2/eaops.h
@@ -0,0 +1,29 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __EAOPS_DOT_H__
11#define __EAOPS_DOT_H__
12
13struct gfs2_ea_request;
14
15struct gfs2_eattr_operations {
16 int (*eo_get) (struct gfs2_inode *ip, struct gfs2_ea_request *er);
17 int (*eo_set) (struct gfs2_inode *ip, struct gfs2_ea_request *er);
18 int (*eo_remove) (struct gfs2_inode *ip, struct gfs2_ea_request *er);
19 char *eo_name;
20};
21
22unsigned int gfs2_ea_name2type(const char *name, char **truncated_name);
23
24extern struct gfs2_eattr_operations gfs2_system_eaops;
25
26extern struct gfs2_eattr_operations *gfs2_ea_ops[];
27
28#endif /* __EAOPS_DOT_H__ */
29
diff --git a/fs/gfs2/eattr.c b/fs/gfs2/eattr.c
new file mode 100644
index 000000000000..96736932260f
--- /dev/null
+++ b/fs/gfs2/eattr.c
@@ -0,0 +1,1548 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/xattr.h>
16#include <linux/gfs2_ondisk.h>
17#include <asm/uaccess.h>
18
19#include "gfs2.h"
20#include "lm_interface.h"
21#include "incore.h"
22#include "acl.h"
23#include "eaops.h"
24#include "eattr.h"
25#include "glock.h"
26#include "inode.h"
27#include "meta_io.h"
28#include "quota.h"
29#include "rgrp.h"
30#include "trans.h"
31#include "util.h"
32
33/**
34 * ea_calc_size - returns the actual number of bytes the request will take up
35 * (not counting any unstuffed data blocks)
36 * @sdp: the filesystem superblock
37 * @er: the extended attribute request
38 * @size: returns the computed size, in bytes
39 *
40 * Returns: 1 if the EA should be stuffed
41 */
42
43static int ea_calc_size(struct gfs2_sbd *sdp, struct gfs2_ea_request *er,
44 unsigned int *size)
45{
46 *size = GFS2_EAREQ_SIZE_STUFFED(er);
47 if (*size <= sdp->sd_jbsize)
48 return 1;
49
50 *size = GFS2_EAREQ_SIZE_UNSTUFFED(sdp, er);
51
52 return 0;
53}
54
55static int ea_check_size(struct gfs2_sbd *sdp, struct gfs2_ea_request *er)
56{
57 unsigned int size;
58
59 if (er->er_data_len > GFS2_EA_MAX_DATA_LEN)
60 return -ERANGE;
61
62 ea_calc_size(sdp, er, &size);
63
64 /* This can only happen with 512 byte blocks */
65 if (size > sdp->sd_jbsize)
66 return -ERANGE;
67
68 return 0;
69}
70
71typedef int (*ea_call_t) (struct gfs2_inode *ip,
72 struct buffer_head *bh,
73 struct gfs2_ea_header *ea,
74 struct gfs2_ea_header *prev,
75 void *private);
76
77static int ea_foreach_i(struct gfs2_inode *ip, struct buffer_head *bh,
78 ea_call_t ea_call, void *data)
79{
80 struct gfs2_ea_header *ea, *prev = NULL;
81 int error = 0;
82
83 if (gfs2_metatype_check(GFS2_SB(&ip->i_inode), bh, GFS2_METATYPE_EA))
84 return -EIO;
85
86 for (ea = GFS2_EA_BH2FIRST(bh);; prev = ea, ea = GFS2_EA2NEXT(ea)) {
87 if (!GFS2_EA_REC_LEN(ea))
88 goto fail;
89 if (!(bh->b_data <= (char *)ea &&
90 (char *)GFS2_EA2NEXT(ea) <=
91 bh->b_data + bh->b_size))
92 goto fail;
93 if (!GFS2_EATYPE_VALID(ea->ea_type))
94 goto fail;
95
96 error = ea_call(ip, bh, ea, prev, data);
97 if (error)
98 return error;
99
100 if (GFS2_EA_IS_LAST(ea)) {
101 if ((char *)GFS2_EA2NEXT(ea) !=
102 bh->b_data + bh->b_size)
103 goto fail;
104 break;
105 }
106 }
107
108 return error;
109
110 fail:
111 gfs2_consist_inode(ip);
112 return -EIO;
113}
114
115static int ea_foreach(struct gfs2_inode *ip, ea_call_t ea_call, void *data)
116{
117 struct buffer_head *bh, *eabh;
118 uint64_t *eablk, *end;
119 int error;
120
121 error = gfs2_meta_read(ip->i_gl, ip->i_di.di_eattr,
122 DIO_START | DIO_WAIT, &bh);
123 if (error)
124 return error;
125
126 if (!(ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT)) {
127 error = ea_foreach_i(ip, bh, ea_call, data);
128 goto out;
129 }
130
131 if (gfs2_metatype_check(GFS2_SB(&ip->i_inode), bh, GFS2_METATYPE_IN)) {
132 error = -EIO;
133 goto out;
134 }
135
136 eablk = (uint64_t *)(bh->b_data + sizeof(struct gfs2_meta_header));
137 end = eablk + GFS2_SB(&ip->i_inode)->sd_inptrs;
138
139 for (; eablk < end; eablk++) {
140 uint64_t bn;
141
142 if (!*eablk)
143 break;
144 bn = be64_to_cpu(*eablk);
145
146 error = gfs2_meta_read(ip->i_gl, bn, DIO_START | DIO_WAIT,
147 &eabh);
148 if (error)
149 break;
150 error = ea_foreach_i(ip, eabh, ea_call, data);
151 brelse(eabh);
152 if (error)
153 break;
154 }
155 out:
156 brelse(bh);
157
158 return error;
159}
160
161struct ea_find {
162 struct gfs2_ea_request *ef_er;
163 struct gfs2_ea_location *ef_el;
164};
165
166static int ea_find_i(struct gfs2_inode *ip, struct buffer_head *bh,
167 struct gfs2_ea_header *ea, struct gfs2_ea_header *prev,
168 void *private)
169{
170 struct ea_find *ef = private;
171 struct gfs2_ea_request *er = ef->ef_er;
172
173 if (ea->ea_type == GFS2_EATYPE_UNUSED)
174 return 0;
175
176 if (ea->ea_type == er->er_type) {
177 if (ea->ea_name_len == er->er_name_len &&
178 !memcmp(GFS2_EA2NAME(ea), er->er_name, ea->ea_name_len)) {
179 struct gfs2_ea_location *el = ef->ef_el;
180 get_bh(bh);
181 el->el_bh = bh;
182 el->el_ea = ea;
183 el->el_prev = prev;
184 return 1;
185 }
186 }
187
188#if 0
189 else if ((ip->i_di.di_flags & GFS2_DIF_EA_PACKED) &&
190 er->er_type == GFS2_EATYPE_SYS)
191 return 1;
192#endif
193
194 return 0;
195}
196
197int gfs2_ea_find(struct gfs2_inode *ip, struct gfs2_ea_request *er,
198 struct gfs2_ea_location *el)
199{
200 struct ea_find ef;
201 int error;
202
203 ef.ef_er = er;
204 ef.ef_el = el;
205
206 memset(el, 0, sizeof(struct gfs2_ea_location));
207
208 error = ea_foreach(ip, ea_find_i, &ef);
209 if (error > 0)
210 return 0;
211
212 return error;
213}
214
215/**
216 * ea_dealloc_unstuffed -
217 * @ip:
218 * @bh:
219 * @ea:
220 * @prev:
221 * @private:
222 *
223 * Takes advantage of the fact that all unstuffed blocks are
224 * allocated from the same RG. Note that this may not always
225 * be true.
226 *
227 * Returns: errno
228 */
229
230static int ea_dealloc_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh,
231 struct gfs2_ea_header *ea,
232 struct gfs2_ea_header *prev, void *private)
233{
234 int *leave = private;
235 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
236 struct gfs2_rgrpd *rgd;
237 struct gfs2_holder rg_gh;
238 struct buffer_head *dibh;
239 uint64_t *dataptrs, bn = 0;
240 uint64_t bstart = 0;
241 unsigned int blen = 0;
242 unsigned int blks = 0;
243 unsigned int x;
244 int error;
245
246 if (GFS2_EA_IS_STUFFED(ea))
247 return 0;
248
249 dataptrs = GFS2_EA2DATAPTRS(ea);
250 for (x = 0; x < ea->ea_num_ptrs; x++, dataptrs++)
251 if (*dataptrs) {
252 blks++;
253 bn = be64_to_cpu(*dataptrs);
254 }
255 if (!blks)
256 return 0;
257
258 rgd = gfs2_blk2rgrpd(sdp, bn);
259 if (!rgd) {
260 gfs2_consist_inode(ip);
261 return -EIO;
262 }
263
264 error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &rg_gh);
265 if (error)
266 return error;
267
268 error = gfs2_trans_begin(sdp, rgd->rd_ri.ri_length +
269 RES_DINODE + RES_EATTR + RES_STATFS +
270 RES_QUOTA, blks);
271 if (error)
272 goto out_gunlock;
273
274 gfs2_trans_add_bh(ip->i_gl, bh, 1);
275
276 dataptrs = GFS2_EA2DATAPTRS(ea);
277 for (x = 0; x < ea->ea_num_ptrs; x++, dataptrs++) {
278 if (!*dataptrs)
279 break;
280 bn = be64_to_cpu(*dataptrs);
281
282 if (bstart + blen == bn)
283 blen++;
284 else {
285 if (bstart)
286 gfs2_free_meta(ip, bstart, blen);
287 bstart = bn;
288 blen = 1;
289 }
290
291 *dataptrs = 0;
292 if (!ip->i_di.di_blocks)
293 gfs2_consist_inode(ip);
294 ip->i_di.di_blocks--;
295 }
296 if (bstart)
297 gfs2_free_meta(ip, bstart, blen);
298
299 if (prev && !leave) {
300 uint32_t len;
301
302 len = GFS2_EA_REC_LEN(prev) + GFS2_EA_REC_LEN(ea);
303 prev->ea_rec_len = cpu_to_be32(len);
304
305 if (GFS2_EA_IS_LAST(ea))
306 prev->ea_flags |= GFS2_EAFLAG_LAST;
307 } else {
308 ea->ea_type = GFS2_EATYPE_UNUSED;
309 ea->ea_num_ptrs = 0;
310 }
311
312 error = gfs2_meta_inode_buffer(ip, &dibh);
313 if (!error) {
314 ip->i_di.di_ctime = get_seconds();
315 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
316 gfs2_dinode_out(&ip->i_di, dibh->b_data);
317 brelse(dibh);
318 }
319
320 gfs2_trans_end(sdp);
321
322 out_gunlock:
323 gfs2_glock_dq_uninit(&rg_gh);
324
325 return error;
326}
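
/*
 * Illustrative sketch (plain C, not part of this file): the bstart/blen
 * pattern used by ea_dealloc_unstuffed() above.  Consecutive data block
 * numbers are coalesced into runs so each run is freed with one call;
 * free_run() is a hypothetical stand-in for gfs2_free_meta(), and block 0
 * is never a valid data block, so bstart == 0 means "no run open yet".
 */
static void example_free_runs(const unsigned long long *blocks, unsigned int n,
			      void (*free_run)(unsigned long long start,
					       unsigned int len))
{
	unsigned long long bstart = 0;
	unsigned int blen = 0, i;

	for (i = 0; i < n; i++) {
		if (bstart + blen == blocks[i]) {
			blen++;
		} else {
			if (bstart)
				free_run(bstart, blen);
			bstart = blocks[i];
			blen = 1;
		}
	}
	if (bstart)
		free_run(bstart, blen);
}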
327
328static int ea_remove_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh,
329 struct gfs2_ea_header *ea,
330 struct gfs2_ea_header *prev, int leave)
331{
332 struct gfs2_alloc *al;
333 int error;
334
335 al = gfs2_alloc_get(ip);
336
337 error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
338 if (error)
339 goto out_alloc;
340
341 error = gfs2_rindex_hold(GFS2_SB(&ip->i_inode), &al->al_ri_gh);
342 if (error)
343 goto out_quota;
344
345 error = ea_dealloc_unstuffed(ip,
346 bh, ea, prev,
347 (leave) ? &error : NULL);
348
349 gfs2_glock_dq_uninit(&al->al_ri_gh);
350
351 out_quota:
352 gfs2_quota_unhold(ip);
353
354 out_alloc:
355 gfs2_alloc_put(ip);
356
357 return error;
358}
359
360struct ea_list {
361 struct gfs2_ea_request *ei_er;
362 unsigned int ei_size;
363};
364
365static int ea_list_i(struct gfs2_inode *ip, struct buffer_head *bh,
366 struct gfs2_ea_header *ea, struct gfs2_ea_header *prev,
367 void *private)
368{
369 struct ea_list *ei = private;
370 struct gfs2_ea_request *er = ei->ei_er;
371 unsigned int ea_size = gfs2_ea_strlen(ea);
372
373 if (ea->ea_type == GFS2_EATYPE_UNUSED)
374 return 0;
375
376 if (er->er_data_len) {
377 char *prefix = NULL;
378 unsigned int l = 0;
379 char c = 0;
380
381 if (ei->ei_size + ea_size > er->er_data_len)
382 return -ERANGE;
383
384 switch (ea->ea_type) {
385 case GFS2_EATYPE_USR:
386 prefix = "user.";
387 l = 5;
388 break;
389 case GFS2_EATYPE_SYS:
390 prefix = "system.";
391 l = 7;
392 break;
393 case GFS2_EATYPE_SECURITY:
394 prefix = "security.";
395 l = 9;
396 break;
397 }
398
399 BUG_ON(l == 0);
400
401 memcpy(er->er_data + ei->ei_size, prefix, l);
402 memcpy(er->er_data + ei->ei_size + l, GFS2_EA2NAME(ea),
403 ea->ea_name_len);
404 memcpy(er->er_data + ei->ei_size + ea_size - 1, &c, 1);
405 }
406
407 ei->ei_size += ea_size;
408
409 return 0;
410}
411
412/**
413 * gfs2_ea_list -
414 * @ip:
415 * @er:
416 *
417 * Returns: actual size of data on success, -errno on error
418 */
419
420int gfs2_ea_list(struct gfs2_inode *ip, struct gfs2_ea_request *er)
421{
422 struct gfs2_holder i_gh;
423 int error;
424
425 if (!er->er_data || !er->er_data_len) {
426 er->er_data = NULL;
427 er->er_data_len = 0;
428 }
429
430 error = gfs2_glock_nq_init(ip->i_gl,
431 LM_ST_SHARED, LM_FLAG_ANY,
432 &i_gh);
433 if (error)
434 return error;
435
436 if (ip->i_di.di_eattr) {
437 struct ea_list ei = { .ei_er = er, .ei_size = 0 };
438
439 error = ea_foreach(ip, ea_list_i, &ei);
440 if (!error)
441 error = ei.ei_size;
442 }
443
444 gfs2_glock_dq_uninit(&i_gh);
445
446 return error;
447}
448
449/**
450 * ea_get_unstuffed - actually copies the unstuffed data into the
451 * request buffer
452 * @ip:
453 * @ea:
454 * @data:
455 *
456 * Returns: errno
457 */
458
459static int ea_get_unstuffed(struct gfs2_inode *ip, struct gfs2_ea_header *ea,
460 char *data)
461{
462 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
463 struct buffer_head **bh;
464 unsigned int amount = GFS2_EA_DATA_LEN(ea);
465 unsigned int nptrs = DIV_ROUND_UP(amount, sdp->sd_jbsize);
466 uint64_t *dataptrs = GFS2_EA2DATAPTRS(ea);
467 unsigned int x;
468 int error = 0;
469
470 bh = kcalloc(nptrs, sizeof(struct buffer_head *), GFP_KERNEL);
471 if (!bh)
472 return -ENOMEM;
473
474 for (x = 0; x < nptrs; x++) {
475 error = gfs2_meta_read(ip->i_gl, be64_to_cpu(*dataptrs),
476 DIO_START, bh + x);
477 if (error) {
478 while (x--)
479 brelse(bh[x]);
480 goto out;
481 }
482 dataptrs++;
483 }
484
485 for (x = 0; x < nptrs; x++) {
486 error = gfs2_meta_reread(sdp, bh[x], DIO_WAIT);
487 if (error) {
488 for (; x < nptrs; x++)
489 brelse(bh[x]);
490 goto out;
491 }
492 if (gfs2_metatype_check(sdp, bh[x], GFS2_METATYPE_ED)) {
493 for (; x < nptrs; x++)
494 brelse(bh[x]);
495 error = -EIO;
496 goto out;
497 }
498
499 memcpy(data,
500 bh[x]->b_data + sizeof(struct gfs2_meta_header),
501 (sdp->sd_jbsize > amount) ? amount : sdp->sd_jbsize);
502
503 amount -= sdp->sd_jbsize;
504 data += sdp->sd_jbsize;
505
506 brelse(bh[x]);
507 }
508
509 out:
510 kfree(bh);
511
512 return error;
513}
514
515int gfs2_ea_get_copy(struct gfs2_inode *ip, struct gfs2_ea_location *el,
516 char *data)
517{
518 if (GFS2_EA_IS_STUFFED(el->el_ea)) {
519 memcpy(data,
520 GFS2_EA2DATA(el->el_ea),
521 GFS2_EA_DATA_LEN(el->el_ea));
522 return 0;
523 } else
524 return ea_get_unstuffed(ip, el->el_ea, data);
525}
526
527/**
528 * gfs2_ea_get_i -
529 * @ip:
530 * @er:
531 *
532 * Returns: actual size of data on success, -errno on error
533 */
534
535int gfs2_ea_get_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
536{
537 struct gfs2_ea_location el;
538 int error;
539
540 if (!ip->i_di.di_eattr)
541 return -ENODATA;
542
543 error = gfs2_ea_find(ip, er, &el);
544 if (error)
545 return error;
546 if (!el.el_ea)
547 return -ENODATA;
548
549 if (er->er_data_len) {
550 if (GFS2_EA_DATA_LEN(el.el_ea) > er->er_data_len)
551 error = -ERANGE;
552 else
553 error = gfs2_ea_get_copy(ip, &el, er->er_data);
554 }
555 if (!error)
556 error = GFS2_EA_DATA_LEN(el.el_ea);
557
558 brelse(el.el_bh);
559
560 return error;
561}
562
563/**
564 * gfs2_ea_get -
565 * @ip:
566 * @er:
567 *
568 * Returns: actual size of data on success, -errno on error
569 */
570
571int gfs2_ea_get(struct gfs2_inode *ip, struct gfs2_ea_request *er)
572{
573 struct gfs2_holder i_gh;
574 int error;
575
576 if (!er->er_name_len ||
577 er->er_name_len > GFS2_EA_MAX_NAME_LEN)
578 return -EINVAL;
579 if (!er->er_data || !er->er_data_len) {
580 er->er_data = NULL;
581 er->er_data_len = 0;
582 }
583
584 error = gfs2_glock_nq_init(ip->i_gl,
585 LM_ST_SHARED, LM_FLAG_ANY,
586 &i_gh);
587 if (error)
588 return error;
589
590 error = gfs2_ea_ops[er->er_type]->eo_get(ip, er);
591
592 gfs2_glock_dq_uninit(&i_gh);
593
594 return error;
595}
596
597/**
598 * ea_alloc_blk - allocates a new block for extended attributes.
599 * @ip: A pointer to the inode that's getting extended attributes
600 * @bhp:
601 *
602 * Returns: errno
603 */
604
605static int ea_alloc_blk(struct gfs2_inode *ip, struct buffer_head **bhp)
606{
607 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
608 struct gfs2_ea_header *ea;
609 uint64_t block;
610
611 block = gfs2_alloc_meta(ip);
612
613 *bhp = gfs2_meta_new(ip->i_gl, block);
614 gfs2_trans_add_bh(ip->i_gl, *bhp, 1);
615 gfs2_metatype_set(*bhp, GFS2_METATYPE_EA, GFS2_FORMAT_EA);
616 gfs2_buffer_clear_tail(*bhp, sizeof(struct gfs2_meta_header));
617
618 ea = GFS2_EA_BH2FIRST(*bhp);
619 ea->ea_rec_len = cpu_to_be32(sdp->sd_jbsize);
620 ea->ea_type = GFS2_EATYPE_UNUSED;
621 ea->ea_flags = GFS2_EAFLAG_LAST;
622 ea->ea_num_ptrs = 0;
623
624 ip->i_di.di_blocks++;
625
626 return 0;
627}
628
629/**
630 * ea_write - writes the request info to an ea, creating new blocks if
631 * necessary
632 * @ip: inode that is being modified
633 * @ea: the location of the new ea in a block
634 * @er: the write request
635 *
636 * Note: does not update ea_rec_len or the GFS2_EAFLAG_LAST bit of ea_flags
637 *
638 * Returns: errno
639 */
640
641static int ea_write(struct gfs2_inode *ip, struct gfs2_ea_header *ea,
642 struct gfs2_ea_request *er)
643{
644 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
645
646 ea->ea_data_len = cpu_to_be32(er->er_data_len);
647 ea->ea_name_len = er->er_name_len;
648 ea->ea_type = er->er_type;
649 ea->__pad = 0;
650
651 memcpy(GFS2_EA2NAME(ea), er->er_name, er->er_name_len);
652
653 if (GFS2_EAREQ_SIZE_STUFFED(er) <= sdp->sd_jbsize) {
654 ea->ea_num_ptrs = 0;
655 memcpy(GFS2_EA2DATA(ea), er->er_data, er->er_data_len);
656 } else {
657 uint64_t *dataptr = GFS2_EA2DATAPTRS(ea);
658 const char *data = er->er_data;
659 unsigned int data_len = er->er_data_len;
660 unsigned int copy;
661 unsigned int x;
662
663 ea->ea_num_ptrs = DIV_ROUND_UP(er->er_data_len, sdp->sd_jbsize);
664 for (x = 0; x < ea->ea_num_ptrs; x++) {
665 struct buffer_head *bh;
666 uint64_t block;
667 int mh_size = sizeof(struct gfs2_meta_header);
668
669 block = gfs2_alloc_meta(ip);
670
671 bh = gfs2_meta_new(ip->i_gl, block);
672 gfs2_trans_add_bh(ip->i_gl, bh, 1);
673 gfs2_metatype_set(bh, GFS2_METATYPE_ED, GFS2_FORMAT_ED);
674
675 ip->i_di.di_blocks++;
676
677 copy = (data_len > sdp->sd_jbsize) ? sdp->sd_jbsize :
678 data_len;
679 memcpy(bh->b_data + mh_size, data, copy);
680 if (copy < sdp->sd_jbsize)
681 memset(bh->b_data + mh_size + copy, 0,
682 sdp->sd_jbsize - copy);
683
684 *dataptr++ = cpu_to_be64((uint64_t)bh->b_blocknr);
685 data += copy;
686 data_len -= copy;
687
688 brelse(bh);
689 }
690
691 gfs2_assert_withdraw(sdp, !data_len);
692 }
693
694 return 0;
695}
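/*
 * Worked example (numbers purely illustrative): with an sd_jbsize of 4000
 * bytes, a 10000-byte value fails the GFS2_EAREQ_SIZE_STUFFED() test above
 * and is written unstuffed: DIV_ROUND_UP(10000, 4000) = 3 GFS2_METATYPE_ED
 * blocks are allocated, holding 4000, 4000 and 2000 bytes of data after
 * their meta headers, and the three block numbers are stored big-endian in
 * the pointer array that follows the (8-byte aligned) name.
 */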
696
697typedef int (*ea_skeleton_call_t) (struct gfs2_inode *ip,
698 struct gfs2_ea_request *er,
699 void *private);
700
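/*
 * ea_alloc_skeleton() is the common scaffolding for every EA operation that
 * may allocate blocks: take the inode's allocation structure, lock and check
 * quotas, reserve @blks blocks, open a transaction sized for those blocks
 * plus the resource group bitmaps and the dinode, statfs and quota changes,
 * run @skeleton_call to do the real work, stamp the dinode (ctime, and mode
 * if GFS2_ERF_MODE is set), then unwind each step in reverse order.
 */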
701static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er,
702 unsigned int blks,
703 ea_skeleton_call_t skeleton_call,
704 void *private)
705{
706 struct gfs2_alloc *al;
707 struct buffer_head *dibh;
708 int error;
709
710 al = gfs2_alloc_get(ip);
711
712 error = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
713 if (error)
714 goto out;
715
716 error = gfs2_quota_check(ip, ip->i_di.di_uid, ip->i_di.di_gid);
717 if (error)
718 goto out_gunlock_q;
719
720 al->al_requested = blks;
721
722 error = gfs2_inplace_reserve(ip);
723 if (error)
724 goto out_gunlock_q;
725
726 error = gfs2_trans_begin(GFS2_SB(&ip->i_inode),
727 blks + al->al_rgd->rd_ri.ri_length +
728 RES_DINODE + RES_STATFS + RES_QUOTA, 0);
729 if (error)
730 goto out_ipres;
731
732 error = skeleton_call(ip, er, private);
733 if (error)
734 goto out_end_trans;
735
736 error = gfs2_meta_inode_buffer(ip, &dibh);
737 if (!error) {
738 if (er->er_flags & GFS2_ERF_MODE) {
739 gfs2_assert_withdraw(GFS2_SB(&ip->i_inode),
740 (ip->i_di.di_mode & S_IFMT) ==
741 (er->er_mode & S_IFMT));
742 ip->i_di.di_mode = er->er_mode;
743 }
744 ip->i_di.di_ctime = get_seconds();
745 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
746 gfs2_dinode_out(&ip->i_di, dibh->b_data);
747 brelse(dibh);
748 }
749
750 out_end_trans:
751 gfs2_trans_end(GFS2_SB(&ip->i_inode));
752
753 out_ipres:
754 gfs2_inplace_release(ip);
755
756 out_gunlock_q:
757 gfs2_quota_unlock(ip);
758
759 out:
760 gfs2_alloc_put(ip);
761
762 return error;
763}
764
765static int ea_init_i(struct gfs2_inode *ip, struct gfs2_ea_request *er,
766 void *private)
767{
768 struct buffer_head *bh;
769 int error;
770
771 error = ea_alloc_blk(ip, &bh);
772 if (error)
773 return error;
774
775 ip->i_di.di_eattr = bh->b_blocknr;
776 error = ea_write(ip, GFS2_EA_BH2FIRST(bh), er);
777
778 brelse(bh);
779
780 return error;
781}
782
783/**
784 * ea_init - initializes a new eattr block
785 * @ip: the inode
786 * @er: the attribute to write into the new block
787 *
788 * Returns: errno
789 */
790
791static int ea_init(struct gfs2_inode *ip, struct gfs2_ea_request *er)
792{
793 unsigned int jbsize = GFS2_SB(&ip->i_inode)->sd_jbsize;
794 unsigned int blks = 1;
795
796 if (GFS2_EAREQ_SIZE_STUFFED(er) > jbsize)
797 blks += DIV_ROUND_UP(er->er_data_len, jbsize);
798
799 return ea_alloc_skeleton(ip, er, blks, ea_init_i, NULL);
800}
801
802static struct gfs2_ea_header *ea_split_ea(struct gfs2_ea_header *ea)
803{
804 uint32_t ea_size = GFS2_EA_SIZE(ea);
805 struct gfs2_ea_header *new = (struct gfs2_ea_header *)((char *)ea +
806 ea_size);
807 uint32_t new_size = GFS2_EA_REC_LEN(ea) - ea_size;
808 int last = ea->ea_flags & GFS2_EAFLAG_LAST;
809
810 ea->ea_rec_len = cpu_to_be32(ea_size);
811 ea->ea_flags ^= last;
812
813 new->ea_rec_len = cpu_to_be32(new_size);
814 new->ea_flags = last;
815
816 return new;
817}
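/*
 * Example of the split arithmetic above: a record with ea_rec_len 96 whose
 * live contents only need GFS2_EA_SIZE() = 40 bytes is trimmed to rec_len
 * 40, and a new (as yet unwritten) record of rec_len 96 - 40 = 56 begins
 * immediately after it; the GFS2_EAFLAG_LAST bit, if the original carried
 * it, migrates to the new tail record.
 */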
818
819static void ea_set_remove_stuffed(struct gfs2_inode *ip,
820 struct gfs2_ea_location *el)
821{
822 struct gfs2_ea_header *ea = el->el_ea;
823 struct gfs2_ea_header *prev = el->el_prev;
824 uint32_t len;
825
826 gfs2_trans_add_bh(ip->i_gl, el->el_bh, 1);
827
828 if (!prev || !GFS2_EA_IS_STUFFED(ea)) {
829 ea->ea_type = GFS2_EATYPE_UNUSED;
830 return;
831 } else if (GFS2_EA2NEXT(prev) != ea) {
832 prev = GFS2_EA2NEXT(prev);
833 gfs2_assert_withdraw(GFS2_SB(&ip->i_inode), GFS2_EA2NEXT(prev) == ea);
834 }
835
836 len = GFS2_EA_REC_LEN(prev) + GFS2_EA_REC_LEN(ea);
837 prev->ea_rec_len = cpu_to_be32(len);
838
839 if (GFS2_EA_IS_LAST(ea))
840 prev->ea_flags |= GFS2_EAFLAG_LAST;
841}
842
843struct ea_set {
844 int ea_split;
845
846 struct gfs2_ea_request *es_er;
847 struct gfs2_ea_location *es_el;
848
849 struct buffer_head *es_bh;
850 struct gfs2_ea_header *es_ea;
851};
852
853static int ea_set_simple_noalloc(struct gfs2_inode *ip, struct buffer_head *bh,
854 struct gfs2_ea_header *ea, struct ea_set *es)
855{
856 struct gfs2_ea_request *er = es->es_er;
857 struct buffer_head *dibh;
858 int error;
859
860 error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), RES_DINODE + 2 * RES_EATTR, 0);
861 if (error)
862 return error;
863
864 gfs2_trans_add_bh(ip->i_gl, bh, 1);
865
866 if (es->ea_split)
867 ea = ea_split_ea(ea);
868
869 ea_write(ip, ea, er);
870
871 if (es->es_el)
872 ea_set_remove_stuffed(ip, es->es_el);
873
874 error = gfs2_meta_inode_buffer(ip, &dibh);
875 if (error)
876 goto out;
877
878 if (er->er_flags & GFS2_ERF_MODE) {
879 gfs2_assert_withdraw(GFS2_SB(&ip->i_inode),
880 (ip->i_di.di_mode & S_IFMT) == (er->er_mode & S_IFMT));
881 ip->i_di.di_mode = er->er_mode;
882 }
883 ip->i_di.di_ctime = get_seconds();
884 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
885 gfs2_dinode_out(&ip->i_di, dibh->b_data);
886 brelse(dibh);
887 out:
888 gfs2_trans_end(GFS2_SB(&ip->i_inode));
889
890 return error;
891}
892
893static int ea_set_simple_alloc(struct gfs2_inode *ip,
894 struct gfs2_ea_request *er, void *private)
895{
896 struct ea_set *es = private;
897 struct gfs2_ea_header *ea = es->es_ea;
898 int error;
899
900 gfs2_trans_add_bh(ip->i_gl, es->es_bh, 1);
901
902 if (es->ea_split)
903 ea = ea_split_ea(ea);
904
905 error = ea_write(ip, ea, er);
906 if (error)
907 return error;
908
909 if (es->es_el)
910 ea_set_remove_stuffed(ip, es->es_el);
911
912 return 0;
913}
914
915static int ea_set_simple(struct gfs2_inode *ip, struct buffer_head *bh,
916 struct gfs2_ea_header *ea, struct gfs2_ea_header *prev,
917 void *private)
918{
919 struct ea_set *es = private;
920 unsigned int size;
921 int stuffed;
922 int error;
923
924 stuffed = ea_calc_size(GFS2_SB(&ip->i_inode), es->es_er, &size);
925
926 if (ea->ea_type == GFS2_EATYPE_UNUSED) {
927 if (GFS2_EA_REC_LEN(ea) < size)
928 return 0;
929 if (!GFS2_EA_IS_STUFFED(ea)) {
930 error = ea_remove_unstuffed(ip, bh, ea, prev, 1);
931 if (error)
932 return error;
933 }
934 es->ea_split = 0;
935 } else if (GFS2_EA_REC_LEN(ea) - GFS2_EA_SIZE(ea) >= size)
936 es->ea_split = 1;
937 else
938 return 0;
939
940 if (stuffed) {
941 error = ea_set_simple_noalloc(ip, bh, ea, es);
942 if (error)
943 return error;
944 } else {
945 unsigned int blks;
946
947 es->es_bh = bh;
948 es->es_ea = ea;
949 blks = 2 + DIV_ROUND_UP(es->es_er->er_data_len,
950 GFS2_SB(&ip->i_inode)->sd_jbsize);
951
952 error = ea_alloc_skeleton(ip, es->es_er, blks,
953 ea_set_simple_alloc, es);
954 if (error)
955 return error;
956 }
957
958 return 1;
959}
960
961static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er,
962 void *private)
963{
964 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
965 struct buffer_head *indbh, *newbh;
966 uint64_t *eablk;
967 int error;
968 int mh_size = sizeof(struct gfs2_meta_header);
969
970 if (ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT) {
971 uint64_t *end;
972
973 error = gfs2_meta_read(ip->i_gl, ip->i_di.di_eattr,
974 DIO_START | DIO_WAIT, &indbh);
975 if (error)
976 return error;
977
978 if (gfs2_metatype_check(sdp, indbh, GFS2_METATYPE_IN)) {
979 error = -EIO;
980 goto out;
981 }
982
983 eablk = (uint64_t *)(indbh->b_data + mh_size);
984 end = eablk + sdp->sd_inptrs;
985
986 for (; eablk < end; eablk++)
987 if (!*eablk)
988 break;
989
990 if (eablk == end) {
991 error = -ENOSPC;
992 goto out;
993 }
994
995 gfs2_trans_add_bh(ip->i_gl, indbh, 1);
996 } else {
997 uint64_t blk;
998
999 blk = gfs2_alloc_meta(ip);
1000
1001 indbh = gfs2_meta_new(ip->i_gl, blk);
1002 gfs2_trans_add_bh(ip->i_gl, indbh, 1);
1003 gfs2_metatype_set(indbh, GFS2_METATYPE_IN, GFS2_FORMAT_IN);
1004 gfs2_buffer_clear_tail(indbh, mh_size);
1005
1006 eablk = (uint64_t *)(indbh->b_data + mh_size);
1007 *eablk = cpu_to_be64(ip->i_di.di_eattr);
1008 ip->i_di.di_eattr = blk;
1009 ip->i_di.di_flags |= GFS2_DIF_EA_INDIRECT;
1010 ip->i_di.di_blocks++;
1011
1012 eablk++;
1013 }
1014
1015 error = ea_alloc_blk(ip, &newbh);
1016 if (error)
1017 goto out;
1018
1019 *eablk = cpu_to_be64((uint64_t)newbh->b_blocknr);
1020 error = ea_write(ip, GFS2_EA_BH2FIRST(newbh), er);
1021 brelse(newbh);
1022 if (error)
1023 goto out;
1024
1025 if (private)
1026 ea_set_remove_stuffed(ip, (struct gfs2_ea_location *)private);
1027
1028 out:
1029 brelse(indbh);
1030
1031 return error;
1032}
1033
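/*
 * Setting an attribute is a two-step strategy: ea_set_i() first walks the
 * existing records with ea_set_simple(), looking for an unused or oversized
 * record that can take the new value (splitting it if necessary); only if
 * nothing fits does it fall back to ea_set_block(), which adds a whole new
 * EA block and, when the inode already has one, promotes di_eattr to an
 * indirect block of pointers.
 */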
1034static int ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er,
1035 struct gfs2_ea_location *el)
1036{
1037 struct ea_set es;
1038 unsigned int blks = 2;
1039 int error;
1040
1041 memset(&es, 0, sizeof(struct ea_set));
1042 es.es_er = er;
1043 es.es_el = el;
1044
1045 error = ea_foreach(ip, ea_set_simple, &es);
1046 if (error > 0)
1047 return 0;
1048 if (error)
1049 return error;
1050
1051 if (!(ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT))
1052 blks++;
1053 if (GFS2_EAREQ_SIZE_STUFFED(er) > GFS2_SB(&ip->i_inode)->sd_jbsize)
1054 blks += DIV_ROUND_UP(er->er_data_len, GFS2_SB(&ip->i_inode)->sd_jbsize);
1055
1056 return ea_alloc_skeleton(ip, er, blks, ea_set_block, el);
1057}
1058
1059static int ea_set_remove_unstuffed(struct gfs2_inode *ip,
1060 struct gfs2_ea_location *el)
1061{
1062 if (el->el_prev && GFS2_EA2NEXT(el->el_prev) != el->el_ea) {
1063 el->el_prev = GFS2_EA2NEXT(el->el_prev);
1064 gfs2_assert_withdraw(GFS2_SB(&ip->i_inode),
1065 GFS2_EA2NEXT(el->el_prev) == el->el_ea);
1066 }
1067
1068	return ea_remove_unstuffed(ip, el->el_bh, el->el_ea, el->el_prev, 0);
1069}
1070
1071int gfs2_ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
1072{
1073 struct gfs2_ea_location el;
1074 int error;
1075
1076 if (!ip->i_di.di_eattr) {
1077 if (er->er_flags & XATTR_REPLACE)
1078 return -ENODATA;
1079 return ea_init(ip, er);
1080 }
1081
1082 error = gfs2_ea_find(ip, er, &el);
1083 if (error)
1084 return error;
1085
1086 if (el.el_ea) {
1087 if (ip->i_di.di_flags & GFS2_DIF_APPENDONLY) {
1088 brelse(el.el_bh);
1089 return -EPERM;
1090 }
1091
1092 error = -EEXIST;
1093 if (!(er->er_flags & XATTR_CREATE)) {
1094 int unstuffed = !GFS2_EA_IS_STUFFED(el.el_ea);
1095 error = ea_set_i(ip, er, &el);
1096 if (!error && unstuffed)
1097 ea_set_remove_unstuffed(ip, &el);
1098 }
1099
1100 brelse(el.el_bh);
1101 } else {
1102 error = -ENODATA;
1103 if (!(er->er_flags & XATTR_REPLACE))
1104 error = ea_set_i(ip, er, NULL);
1105 }
1106
1107 return error;
1108}
1109
1110int gfs2_ea_set(struct gfs2_inode *ip, struct gfs2_ea_request *er)
1111{
1112 struct gfs2_holder i_gh;
1113 int error;
1114
1115 if (!er->er_name_len ||
1116 er->er_name_len > GFS2_EA_MAX_NAME_LEN)
1117 return -EINVAL;
1118 if (!er->er_data || !er->er_data_len) {
1119 er->er_data = NULL;
1120 er->er_data_len = 0;
1121 }
1122 error = ea_check_size(GFS2_SB(&ip->i_inode), er);
1123 if (error)
1124 return error;
1125
1126 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
1127 if (error)
1128 return error;
1129
1130 if (IS_IMMUTABLE(&ip->i_inode))
1131 error = -EPERM;
1132 else
1133 error = gfs2_ea_ops[er->er_type]->eo_set(ip, er);
1134
1135 gfs2_glock_dq_uninit(&i_gh);
1136
1137 return error;
1138}
1139
1140static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el)
1141{
1142 struct gfs2_ea_header *ea = el->el_ea;
1143 struct gfs2_ea_header *prev = el->el_prev;
1144 struct buffer_head *dibh;
1145 int error;
1146
1147 error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), RES_DINODE + RES_EATTR, 0);
1148 if (error)
1149 return error;
1150
1151 gfs2_trans_add_bh(ip->i_gl, el->el_bh, 1);
1152
1153 if (prev) {
1154 uint32_t len;
1155
1156 len = GFS2_EA_REC_LEN(prev) + GFS2_EA_REC_LEN(ea);
1157 prev->ea_rec_len = cpu_to_be32(len);
1158
1159 if (GFS2_EA_IS_LAST(ea))
1160 prev->ea_flags |= GFS2_EAFLAG_LAST;
1161 } else
1162 ea->ea_type = GFS2_EATYPE_UNUSED;
1163
1164 error = gfs2_meta_inode_buffer(ip, &dibh);
1165 if (!error) {
1166 ip->i_di.di_ctime = get_seconds();
1167 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1168 gfs2_dinode_out(&ip->i_di, dibh->b_data);
1169 brelse(dibh);
1170 }
1171
1172 gfs2_trans_end(GFS2_SB(&ip->i_inode));
1173
1174 return error;
1175}
1176
1177int gfs2_ea_remove_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
1178{
1179 struct gfs2_ea_location el;
1180 int error;
1181
1182 if (!ip->i_di.di_eattr)
1183 return -ENODATA;
1184
1185 error = gfs2_ea_find(ip, er, &el);
1186 if (error)
1187 return error;
1188 if (!el.el_ea)
1189 return -ENODATA;
1190
1191 if (GFS2_EA_IS_STUFFED(el.el_ea))
1192 error = ea_remove_stuffed(ip, &el);
1193 else
1194 error = ea_remove_unstuffed(ip, el.el_bh, el.el_ea, el.el_prev,
1195 0);
1196
1197 brelse(el.el_bh);
1198
1199 return error;
1200}
1201
1202/**
1203 * gfs2_ea_remove - removes an extended attribute
1204 * @ip: pointer to the inode of the target file
1205 * @er: request information
1206 *
1207 * Returns: errno
1208 */
1209
1210int gfs2_ea_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er)
1211{
1212 struct gfs2_holder i_gh;
1213 int error;
1214
1215 if (!er->er_name_len || er->er_name_len > GFS2_EA_MAX_NAME_LEN)
1216 return -EINVAL;
1217
1218 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
1219 if (error)
1220 return error;
1221
1222 if (IS_IMMUTABLE(&ip->i_inode) || IS_APPEND(&ip->i_inode))
1223 error = -EPERM;
1224 else
1225 error = gfs2_ea_ops[er->er_type]->eo_remove(ip, er);
1226
1227 gfs2_glock_dq_uninit(&i_gh);
1228
1229 return error;
1230}
1231
1232static int ea_acl_chmod_unstuffed(struct gfs2_inode *ip,
1233 struct gfs2_ea_header *ea, char *data)
1234{
1235 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1236 struct buffer_head **bh;
1237 unsigned int amount = GFS2_EA_DATA_LEN(ea);
1238 unsigned int nptrs = DIV_ROUND_UP(amount, sdp->sd_jbsize);
1239 uint64_t *dataptrs = GFS2_EA2DATAPTRS(ea);
1240 unsigned int x;
1241 int error;
1242
1243 bh = kcalloc(nptrs, sizeof(struct buffer_head *), GFP_KERNEL);
1244 if (!bh)
1245 return -ENOMEM;
1246
1247 error = gfs2_trans_begin(sdp, nptrs + RES_DINODE, 0);
1248 if (error)
1249 goto out;
1250
1251 for (x = 0; x < nptrs; x++) {
1252 error = gfs2_meta_read(ip->i_gl, be64_to_cpu(*dataptrs),
1253 DIO_START, bh + x);
1254 if (error) {
1255 while (x--)
1256 brelse(bh[x]);
1257 goto fail;
1258 }
1259 dataptrs++;
1260 }
1261
1262 for (x = 0; x < nptrs; x++) {
1263 error = gfs2_meta_reread(sdp, bh[x], DIO_WAIT);
1264 if (error) {
1265 for (; x < nptrs; x++)
1266 brelse(bh[x]);
1267 goto fail;
1268 }
1269 if (gfs2_metatype_check(sdp, bh[x], GFS2_METATYPE_ED)) {
1270 for (; x < nptrs; x++)
1271 brelse(bh[x]);
1272 error = -EIO;
1273 goto fail;
1274 }
1275
1276 gfs2_trans_add_bh(ip->i_gl, bh[x], 1);
1277
1278 memcpy(bh[x]->b_data + sizeof(struct gfs2_meta_header),
1279 data,
1280 (sdp->sd_jbsize > amount) ? amount : sdp->sd_jbsize);
1281
1282 amount -= sdp->sd_jbsize;
1283 data += sdp->sd_jbsize;
1284
1285 brelse(bh[x]);
1286 }
1287
1288 out:
1289 kfree(bh);
1290
1291 return error;
1292
1293 fail:
1294 gfs2_trans_end(sdp);
1295 kfree(bh);
1296
1297 return error;
1298}
1299
1300int gfs2_ea_acl_chmod(struct gfs2_inode *ip, struct gfs2_ea_location *el,
1301 struct iattr *attr, char *data)
1302{
1303 struct buffer_head *dibh;
1304 int error;
1305
1306 if (GFS2_EA_IS_STUFFED(el->el_ea)) {
1307 error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), RES_DINODE + RES_EATTR, 0);
1308 if (error)
1309 return error;
1310
1311 gfs2_trans_add_bh(ip->i_gl, el->el_bh, 1);
1312 memcpy(GFS2_EA2DATA(el->el_ea),
1313 data,
1314 GFS2_EA_DATA_LEN(el->el_ea));
1315 } else
1316 error = ea_acl_chmod_unstuffed(ip, el->el_ea, data);
1317
1318 if (error)
1319 return error;
1320
1321 error = gfs2_meta_inode_buffer(ip, &dibh);
1322 if (!error) {
1323 error = inode_setattr(&ip->i_inode, attr);
1324 gfs2_assert_warn(GFS2_SB(&ip->i_inode), !error);
1325 gfs2_inode_attr_out(ip);
1326 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1327 gfs2_dinode_out(&ip->i_di, dibh->b_data);
1328 brelse(dibh);
1329 }
1330
1331 gfs2_trans_end(GFS2_SB(&ip->i_inode));
1332
1333 return error;
1334}
1335
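/*
 * ea_dealloc_indirect() makes two passes over the indirect block: the first
 * collects the resource groups holding the pointed-to EA blocks so they can
 * all be locked and a single transaction sized; the second actually frees
 * the blocks and clears the pointers before dropping the
 * GFS2_DIF_EA_INDIRECT flag.
 */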
1336static int ea_dealloc_indirect(struct gfs2_inode *ip)
1337{
1338 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1339 struct gfs2_rgrp_list rlist;
1340 struct buffer_head *indbh, *dibh;
1341 uint64_t *eablk, *end;
1342 unsigned int rg_blocks = 0;
1343 uint64_t bstart = 0;
1344 unsigned int blen = 0;
1345 unsigned int blks = 0;
1346 unsigned int x;
1347 int error;
1348
1349 memset(&rlist, 0, sizeof(struct gfs2_rgrp_list));
1350
1351 error = gfs2_meta_read(ip->i_gl, ip->i_di.di_eattr,
1352 DIO_START | DIO_WAIT, &indbh);
1353 if (error)
1354 return error;
1355
1356 if (gfs2_metatype_check(sdp, indbh, GFS2_METATYPE_IN)) {
1357 error = -EIO;
1358 goto out;
1359 }
1360
1361 eablk = (uint64_t *)(indbh->b_data + sizeof(struct gfs2_meta_header));
1362 end = eablk + sdp->sd_inptrs;
1363
1364 for (; eablk < end; eablk++) {
1365 uint64_t bn;
1366
1367 if (!*eablk)
1368 break;
1369 bn = be64_to_cpu(*eablk);
1370
1371 if (bstart + blen == bn)
1372 blen++;
1373 else {
1374 if (bstart)
1375 gfs2_rlist_add(sdp, &rlist, bstart);
1376 bstart = bn;
1377 blen = 1;
1378 }
1379 blks++;
1380 }
1381 if (bstart)
1382 gfs2_rlist_add(sdp, &rlist, bstart);
1383 else
1384 goto out;
1385
1386 gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE, 0);
1387
1388 for (x = 0; x < rlist.rl_rgrps; x++) {
1389 struct gfs2_rgrpd *rgd;
1390 rgd = rlist.rl_ghs[x].gh_gl->gl_object;
1391 rg_blocks += rgd->rd_ri.ri_length;
1392 }
1393
1394 error = gfs2_glock_nq_m(rlist.rl_rgrps, rlist.rl_ghs);
1395 if (error)
1396 goto out_rlist_free;
1397
1398 error = gfs2_trans_begin(sdp, rg_blocks + RES_DINODE +
1399 RES_INDIRECT + RES_STATFS +
1400 RES_QUOTA, blks);
1401 if (error)
1402 goto out_gunlock;
1403
1404 gfs2_trans_add_bh(ip->i_gl, indbh, 1);
1405
1406 eablk = (uint64_t *)(indbh->b_data + sizeof(struct gfs2_meta_header));
1407 bstart = 0;
1408 blen = 0;
1409
1410 for (; eablk < end; eablk++) {
1411 uint64_t bn;
1412
1413 if (!*eablk)
1414 break;
1415 bn = be64_to_cpu(*eablk);
1416
1417 if (bstart + blen == bn)
1418 blen++;
1419 else {
1420 if (bstart)
1421 gfs2_free_meta(ip, bstart, blen);
1422 bstart = bn;
1423 blen = 1;
1424 }
1425
1426 *eablk = 0;
1427 if (!ip->i_di.di_blocks)
1428 gfs2_consist_inode(ip);
1429 ip->i_di.di_blocks--;
1430 }
1431 if (bstart)
1432 gfs2_free_meta(ip, bstart, blen);
1433
1434 ip->i_di.di_flags &= ~GFS2_DIF_EA_INDIRECT;
1435
1436 error = gfs2_meta_inode_buffer(ip, &dibh);
1437 if (!error) {
1438 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1439 gfs2_dinode_out(&ip->i_di, dibh->b_data);
1440 brelse(dibh);
1441 }
1442
1443 gfs2_trans_end(sdp);
1444
1445 out_gunlock:
1446 gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs);
1447
1448 out_rlist_free:
1449 gfs2_rlist_free(&rlist);
1450
1451 out:
1452 brelse(indbh);
1453
1454 return error;
1455}
1456
1457static int ea_dealloc_block(struct gfs2_inode *ip)
1458{
1459 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1460 struct gfs2_alloc *al = &ip->i_alloc;
1461 struct gfs2_rgrpd *rgd;
1462 struct buffer_head *dibh;
1463 int error;
1464
1465 rgd = gfs2_blk2rgrpd(sdp, ip->i_di.di_eattr);
1466 if (!rgd) {
1467 gfs2_consist_inode(ip);
1468 return -EIO;
1469 }
1470
1471 error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0,
1472 &al->al_rgd_gh);
1473 if (error)
1474 return error;
1475
1476 error = gfs2_trans_begin(sdp, RES_RG_BIT + RES_DINODE +
1477 RES_STATFS + RES_QUOTA, 1);
1478 if (error)
1479 goto out_gunlock;
1480
1481 gfs2_free_meta(ip, ip->i_di.di_eattr, 1);
1482
1483 ip->i_di.di_eattr = 0;
1484 if (!ip->i_di.di_blocks)
1485 gfs2_consist_inode(ip);
1486 ip->i_di.di_blocks--;
1487
1488 error = gfs2_meta_inode_buffer(ip, &dibh);
1489 if (!error) {
1490 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1491 gfs2_dinode_out(&ip->i_di, dibh->b_data);
1492 brelse(dibh);
1493 }
1494
1495 gfs2_trans_end(sdp);
1496
1497 out_gunlock:
1498 gfs2_glock_dq_uninit(&al->al_rgd_gh);
1499
1500 return error;
1501}
1502
1503/**
1504 * gfs2_ea_dealloc - deallocate the extended attribute fork
1505 * @ip: the inode
1506 *
1507 * Returns: errno
1508 */
1509
1510int gfs2_ea_dealloc(struct gfs2_inode *ip)
1511{
1512 struct gfs2_alloc *al;
1513 int error;
1514
1515 al = gfs2_alloc_get(ip);
1516
1517 error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
1518 if (error)
1519 goto out_alloc;
1520
1521 error = gfs2_rindex_hold(GFS2_SB(&ip->i_inode), &al->al_ri_gh);
1522 if (error)
1523 goto out_quota;
1524
1525 error = ea_foreach(ip, ea_dealloc_unstuffed, NULL);
1526 if (error)
1527 goto out_rindex;
1528
1529 if (ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT) {
1530 error = ea_dealloc_indirect(ip);
1531 if (error)
1532 goto out_rindex;
1533 }
1534
1535 error = ea_dealloc_block(ip);
1536
1537 out_rindex:
1538 gfs2_glock_dq_uninit(&al->al_ri_gh);
1539
1540 out_quota:
1541 gfs2_quota_unhold(ip);
1542
1543 out_alloc:
1544 gfs2_alloc_put(ip);
1545
1546 return error;
1547}
1548
diff --git a/fs/gfs2/eattr.h b/fs/gfs2/eattr.h
new file mode 100644
index 000000000000..ae199692e51d
--- /dev/null
+++ b/fs/gfs2/eattr.h
@@ -0,0 +1,97 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __EATTR_DOT_H__
11#define __EATTR_DOT_H__
12
13#define GFS2_EA_REC_LEN(ea) be32_to_cpu((ea)->ea_rec_len)
14#define GFS2_EA_DATA_LEN(ea) be32_to_cpu((ea)->ea_data_len)
15
16#define GFS2_EA_SIZE(ea) \
17ALIGN(sizeof(struct gfs2_ea_header) + (ea)->ea_name_len + \
18 ((GFS2_EA_IS_STUFFED(ea)) ? GFS2_EA_DATA_LEN(ea) : \
19 (sizeof(uint64_t) * (ea)->ea_num_ptrs)), 8)
20
21#define GFS2_EA_IS_STUFFED(ea) (!(ea)->ea_num_ptrs)
22#define GFS2_EA_IS_LAST(ea) ((ea)->ea_flags & GFS2_EAFLAG_LAST)
23
24#define GFS2_EAREQ_SIZE_STUFFED(er) \
25ALIGN(sizeof(struct gfs2_ea_header) + (er)->er_name_len + (er)->er_data_len, 8)
26
27#define GFS2_EAREQ_SIZE_UNSTUFFED(sdp, er) \
28ALIGN(sizeof(struct gfs2_ea_header) + (er)->er_name_len + \
29 sizeof(uint64_t) * DIV_ROUND_UP((er)->er_data_len, (sdp)->sd_jbsize), 8)
30
31#define GFS2_EA2NAME(ea) ((char *)((struct gfs2_ea_header *)(ea) + 1))
32#define GFS2_EA2DATA(ea) (GFS2_EA2NAME(ea) + (ea)->ea_name_len)
33
34#define GFS2_EA2DATAPTRS(ea) \
35((uint64_t *)(GFS2_EA2NAME(ea) + ALIGN((ea)->ea_name_len, 8)))
36
37#define GFS2_EA2NEXT(ea) \
38((struct gfs2_ea_header *)((char *)(ea) + GFS2_EA_REC_LEN(ea)))
39
40#define GFS2_EA_BH2FIRST(bh) \
41((struct gfs2_ea_header *)((bh)->b_data + sizeof(struct gfs2_meta_header)))
42
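/*
 * On-disk layout implied by the macros above: each extended attribute is an
 * 8-byte aligned record consisting of a struct gfs2_ea_header, the name,
 * and then either the value itself (a "stuffed" attribute, ea_num_ptrs == 0)
 * or an array of big-endian pointers to GFS2_METATYPE_ED blocks holding the
 * value.  Records are chained with GFS2_EA2NEXT() via ea_rec_len until one
 * flagged GFS2_EAFLAG_LAST is reached.
 */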
43#define GFS2_ERF_MODE 0x80000000
44
45struct gfs2_ea_request {
46 char *er_name;
47 char *er_data;
48 unsigned int er_name_len;
49 unsigned int er_data_len;
50 unsigned int er_type; /* GFS2_EATYPE_... */
51 int er_flags;
52 mode_t er_mode;
53};
54
55struct gfs2_ea_location {
56 struct buffer_head *el_bh;
57 struct gfs2_ea_header *el_ea;
58 struct gfs2_ea_header *el_prev;
59};
60
61int gfs2_ea_get_i(struct gfs2_inode *ip, struct gfs2_ea_request *er);
62int gfs2_ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er);
63int gfs2_ea_remove_i(struct gfs2_inode *ip, struct gfs2_ea_request *er);
64
65int gfs2_ea_list(struct gfs2_inode *ip, struct gfs2_ea_request *er);
66int gfs2_ea_get(struct gfs2_inode *ip, struct gfs2_ea_request *er);
67int gfs2_ea_set(struct gfs2_inode *ip, struct gfs2_ea_request *er);
68int gfs2_ea_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er);
69
70int gfs2_ea_dealloc(struct gfs2_inode *ip);
71
72/* Exported to acl.c */
73
74int gfs2_ea_find(struct gfs2_inode *ip,
75 struct gfs2_ea_request *er,
76 struct gfs2_ea_location *el);
77int gfs2_ea_get_copy(struct gfs2_inode *ip,
78 struct gfs2_ea_location *el,
79 char *data);
80int gfs2_ea_acl_chmod(struct gfs2_inode *ip, struct gfs2_ea_location *el,
81 struct iattr *attr, char *data);
82
83static inline unsigned int gfs2_ea_strlen(struct gfs2_ea_header *ea)
84{
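	/*
	 * Returned length is strlen(prefix) + ea_name_len + 1 for the
	 * terminating NUL: "user." is 5 characters, "system." is 7 and
	 * "security." is 9.
	 */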
85 switch (ea->ea_type) {
86 case GFS2_EATYPE_USR:
87 return (5 + (ea->ea_name_len + 1));
88 case GFS2_EATYPE_SYS:
89 return (7 + (ea->ea_name_len + 1));
90 case GFS2_EATYPE_SECURITY:
91 return (9 + (ea->ea_name_len + 1));
92 default:
93 return (0);
94 }
95}
96
97#endif /* __EATTR_DOT_H__ */
diff --git a/fs/gfs2/format.h b/fs/gfs2/format.h
new file mode 100644
index 000000000000..239f0c3553fc
--- /dev/null
+++ b/fs/gfs2/format.h
@@ -0,0 +1,21 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __FORMAT_DOT_H__
11#define __FORMAT_DOT_H__
12
13static const uint32_t gfs2_old_fs_formats[] = {
14 0
15};
16
17static const uint32_t gfs2_old_multihost_formats[] = {
18 0
19};
20
21#endif /* __FORMAT_DOT_H__ */
diff --git a/fs/gfs2/gfs2.h b/fs/gfs2/gfs2.h
new file mode 100644
index 000000000000..6edbd551a4c0
--- /dev/null
+++ b/fs/gfs2/gfs2.h
@@ -0,0 +1,31 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __GFS2_DOT_H__
11#define __GFS2_DOT_H__
12
13enum {
14 NO_CREATE = 0,
15 CREATE = 1,
16};
17
18enum {
19 NO_WAIT = 0,
20 WAIT = 1,
21};
22
23enum {
24 NO_FORCE = 0,
25 FORCE = 1,
26};
27
28#define GFS2_FAST_NAME_SIZE 8
29
30#endif /* __GFS2_DOT_H__ */
31
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
new file mode 100644
index 000000000000..0381d4cc4146
--- /dev/null
+++ b/fs/gfs2/glock.c
@@ -0,0 +1,2279 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/delay.h>
16#include <linux/sort.h>
17#include <linux/jhash.h>
18#include <linux/kref.h>
19#include <linux/kallsyms.h>
20#include <linux/gfs2_ondisk.h>
21#include <asm/uaccess.h>
22
23#include "gfs2.h"
24#include "lm_interface.h"
25#include "incore.h"
26#include "glock.h"
27#include "glops.h"
28#include "inode.h"
29#include "lm.h"
30#include "lops.h"
31#include "meta_io.h"
32#include "quota.h"
33#include "super.h"
34#include "util.h"
35
36/* Must be kept in sync with the beginning of struct gfs2_glock */
37struct glock_plug {
38 struct list_head gl_list;
39 unsigned long gl_flags;
40};
41
42struct greedy {
43 struct gfs2_holder gr_gh;
44 struct work_struct gr_work;
45};
46
47typedef void (*glock_examiner) (struct gfs2_glock * gl);
48
49static int gfs2_dump_lockstate(struct gfs2_sbd *sdp);
50static int dump_glock(struct gfs2_glock *gl);
51
52/**
53 * relaxed_state_ok - is a requested lock compatible with the current lock mode?
54 * @actual: the current state of the lock
55 * @requested: the lock state that was requested by the caller
56 * @flags: the modifier flags passed in by the caller
57 *
58 * Returns: 1 if the locks are compatible, 0 otherwise
59 */
60
61static inline int relaxed_state_ok(unsigned int actual, unsigned requested,
62 int flags)
63{
64 if (actual == requested)
65 return 1;
66
67 if (flags & GL_EXACT)
68 return 0;
69
70 if (actual == LM_ST_EXCLUSIVE && requested == LM_ST_SHARED)
71 return 1;
72
73 if (actual != LM_ST_UNLOCKED && (flags & LM_FLAG_ANY))
74 return 1;
75
76 return 0;
77}
78
79/**
80 * gl_hash() - Turn a lock name into a hash bucket number
81 * @name: The lock name
82 *
83 * Returns: The number of the corresponding hash bucket
84 */
85
86static unsigned int gl_hash(struct lm_lockname *name)
87{
88 unsigned int h;
89
90 h = jhash(&name->ln_number, sizeof(uint64_t), 0);
91 h = jhash(&name->ln_type, sizeof(unsigned int), h);
92 h &= GFS2_GL_HASH_MASK;
93
94 return h;
95}
96
97/**
98 * glock_free() - Perform a few checks and then release struct gfs2_glock
99 * @gl: The glock to release
100 *
101 * Also calls lock module to release its internal structure for this glock.
102 *
103 */
104
105static void glock_free(struct gfs2_glock *gl)
106{
107 struct gfs2_sbd *sdp = gl->gl_sbd;
108 struct inode *aspace = gl->gl_aspace;
109
110 gfs2_lm_put_lock(sdp, gl->gl_lock);
111
112 if (aspace)
113 gfs2_aspace_put(aspace);
114
115 kmem_cache_free(gfs2_glock_cachep, gl);
116}
117
118/**
119 * gfs2_glock_hold() - increment reference count on glock
120 * @gl: The glock to hold
121 *
122 */
123
124void gfs2_glock_hold(struct gfs2_glock *gl)
125{
126 kref_get(&gl->gl_ref);
127}
128
129/* All work is done after the return from kref_put() so we
130 can release the write_lock before the free. */
131
132static void kill_glock(struct kref *kref)
133{
134 struct gfs2_glock *gl = container_of(kref, struct gfs2_glock, gl_ref);
135 struct gfs2_sbd *sdp = gl->gl_sbd;
136
137 gfs2_assert(sdp, gl->gl_state == LM_ST_UNLOCKED);
138 gfs2_assert(sdp, list_empty(&gl->gl_reclaim));
139 gfs2_assert(sdp, list_empty(&gl->gl_holders));
140 gfs2_assert(sdp, list_empty(&gl->gl_waiters1));
141 gfs2_assert(sdp, list_empty(&gl->gl_waiters2));
142 gfs2_assert(sdp, list_empty(&gl->gl_waiters3));
143}
144
145/**
146 * gfs2_glock_put() - Decrement reference count on glock
147 * @gl: The glock to put
148 *
149 */
150
151int gfs2_glock_put(struct gfs2_glock *gl)
152{
153 struct gfs2_sbd *sdp = gl->gl_sbd;
154 struct gfs2_gl_hash_bucket *bucket = gl->gl_bucket;
155 int rv = 0;
156
157 mutex_lock(&sdp->sd_invalidate_inodes_mutex);
158
159 write_lock(&bucket->hb_lock);
160 if (kref_put(&gl->gl_ref, kill_glock)) {
161 list_del_init(&gl->gl_list);
162 write_unlock(&bucket->hb_lock);
163 BUG_ON(spin_is_locked(&gl->gl_spin));
164 glock_free(gl);
165 rv = 1;
166 goto out;
167 }
168 write_unlock(&bucket->hb_lock);
169 out:
170 mutex_unlock(&sdp->sd_invalidate_inodes_mutex);
171 return rv;
172}
173
174/**
175 * queue_empty - check to see if a glock's queue is empty
176 * @gl: the glock
177 * @head: the head of the queue to check
178 *
179 * This function protects the list in the event that a process already
180 * has a holder on the list and is adding a second holder for itself.
181 * The glmutex lock is what generally prevents processes from working
182 * on the same glock at once, but the special case of adding a second
183 * holder for yourself ("recursive" locking) doesn't involve locking
184 * glmutex, making the spin lock necessary.
185 *
186 * Returns: 1 if the queue is empty
187 */
188
189static inline int queue_empty(struct gfs2_glock *gl, struct list_head *head)
190{
191 int empty;
192 spin_lock(&gl->gl_spin);
193 empty = list_empty(head);
194 spin_unlock(&gl->gl_spin);
195 return empty;
196}
197
198/**
199 * search_bucket() - Find struct gfs2_glock by lock number
200 * @bucket: the bucket to search
201 * @name: The lock name
202 *
203 * Returns: NULL, or the struct gfs2_glock with the requested number
204 */
205
206static struct gfs2_glock *search_bucket(struct gfs2_gl_hash_bucket *bucket,
207 struct lm_lockname *name)
208{
209 struct gfs2_glock *gl;
210
211 list_for_each_entry(gl, &bucket->hb_list, gl_list) {
212 if (test_bit(GLF_PLUG, &gl->gl_flags))
213 continue;
214 if (!lm_name_equal(&gl->gl_name, name))
215 continue;
216
217 kref_get(&gl->gl_ref);
218
219 return gl;
220 }
221
222 return NULL;
223}
224
225/**
226 * gfs2_glock_find() - Find glock by lock number
227 * @sdp: The GFS2 superblock
228 * @name: The lock name
229 *
230 * Returns: NULL, or the struct gfs2_glock with the requested number
231 */
232
233static struct gfs2_glock *gfs2_glock_find(struct gfs2_sbd *sdp,
234 struct lm_lockname *name)
235{
236 struct gfs2_gl_hash_bucket *bucket = &sdp->sd_gl_hash[gl_hash(name)];
237 struct gfs2_glock *gl;
238
239 read_lock(&bucket->hb_lock);
240 gl = search_bucket(bucket, name);
241 read_unlock(&bucket->hb_lock);
242
243 return gl;
244}
245
246/**
247 * gfs2_glock_get() - Get a glock, or create one if one doesn't exist
248 * @sdp: The GFS2 superblock
249 * @number: the lock number
250 * @glops: The glock_operations to use
251 * @create: If 0, don't create the glock if it doesn't exist
252 * @glp: the glock is returned here
253 *
254 * This does not lock a glock, just finds/creates structures for one.
255 *
256 * Returns: errno
257 */
258
259int gfs2_glock_get(struct gfs2_sbd *sdp, uint64_t number,
260 struct gfs2_glock_operations *glops, int create,
261 struct gfs2_glock **glp)
262{
263 struct lm_lockname name;
264 struct gfs2_glock *gl, *tmp;
265 struct gfs2_gl_hash_bucket *bucket;
266 int error;
267
268 name.ln_number = number;
269 name.ln_type = glops->go_type;
270 bucket = &sdp->sd_gl_hash[gl_hash(&name)];
271
272 read_lock(&bucket->hb_lock);
273 gl = search_bucket(bucket, &name);
274 read_unlock(&bucket->hb_lock);
275
276 if (gl || !create) {
277 *glp = gl;
278 return 0;
279 }
280
281 gl = kmem_cache_alloc(gfs2_glock_cachep, GFP_KERNEL);
282 if (!gl)
283 return -ENOMEM;
284
285 memset(gl, 0, sizeof(struct gfs2_glock));
286
287 INIT_LIST_HEAD(&gl->gl_list);
288 gl->gl_name = name;
289 kref_init(&gl->gl_ref);
290
291 spin_lock_init(&gl->gl_spin);
292
293 gl->gl_state = LM_ST_UNLOCKED;
294 gl->gl_owner = NULL;
295 gl->gl_ip = 0;
296 INIT_LIST_HEAD(&gl->gl_holders);
297 INIT_LIST_HEAD(&gl->gl_waiters1);
298 INIT_LIST_HEAD(&gl->gl_waiters2);
299 INIT_LIST_HEAD(&gl->gl_waiters3);
300
301 gl->gl_ops = glops;
302
303 gl->gl_bucket = bucket;
304 INIT_LIST_HEAD(&gl->gl_reclaim);
305
306 gl->gl_sbd = sdp;
307
308 lops_init_le(&gl->gl_le, &gfs2_glock_lops);
309 INIT_LIST_HEAD(&gl->gl_ail_list);
310
311 /* If this glock protects actual on-disk data or metadata blocks,
312 create a VFS inode to manage the pages/buffers holding them. */
313 if (glops == &gfs2_inode_glops ||
314 glops == &gfs2_rgrp_glops ||
315 glops == &gfs2_meta_glops) {
316 gl->gl_aspace = gfs2_aspace_get(sdp);
317 if (!gl->gl_aspace) {
318 error = -ENOMEM;
319 goto fail;
320 }
321 }
322
323 error = gfs2_lm_get_lock(sdp, &name, &gl->gl_lock);
324 if (error)
325 goto fail_aspace;
326
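	/* The bucket lock was dropped while allocating, so another CPU may
	   have created this glock in the meantime; if so, use the existing
	   one and free ours. */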
327 write_lock(&bucket->hb_lock);
328 tmp = search_bucket(bucket, &name);
329 if (tmp) {
330 write_unlock(&bucket->hb_lock);
331 glock_free(gl);
332 gl = tmp;
333 } else {
334 list_add_tail(&gl->gl_list, &bucket->hb_list);
335 write_unlock(&bucket->hb_lock);
336 }
337
338 *glp = gl;
339
340 return 0;
341
342 fail_aspace:
343 if (gl->gl_aspace)
344 gfs2_aspace_put(gl->gl_aspace);
345
346 fail:
347 kmem_cache_free(gfs2_glock_cachep, gl);
348
349 return error;
350}
351
352/**
353 * gfs2_holder_init - initialize a struct gfs2_holder in the default way
354 * @gl: the glock
355 * @state: the state we're requesting
356 * @flags: the modifier flags
357 * @gh: the holder structure
358 *
359 */
360
361void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags,
362 struct gfs2_holder *gh)
363{
364 INIT_LIST_HEAD(&gh->gh_list);
365 gh->gh_gl = gl;
366 gh->gh_ip = (unsigned long)__builtin_return_address(0);
367 gh->gh_owner = current;
368 gh->gh_state = state;
369 gh->gh_flags = flags;
370 gh->gh_error = 0;
371 gh->gh_iflags = 0;
372 init_completion(&gh->gh_wait);
373
374 if (gh->gh_state == LM_ST_EXCLUSIVE)
375 gh->gh_flags |= GL_LOCAL_EXCL;
376
377 gfs2_glock_hold(gl);
378}
379
380/**
381 * gfs2_holder_reinit - reinitialize a struct gfs2_holder so we can requeue it
382 * @state: the state we're requesting
383 * @flags: the modifier flags
384 * @gh: the holder structure
385 *
386 * Don't mess with the glock.
387 *
388 */
389
390void gfs2_holder_reinit(unsigned int state, unsigned flags, struct gfs2_holder *gh)
391{
392 gh->gh_state = state;
393 gh->gh_flags = flags;
394 if (gh->gh_state == LM_ST_EXCLUSIVE)
395 gh->gh_flags |= GL_LOCAL_EXCL;
396
397 gh->gh_iflags &= 1 << HIF_ALLOCED;
398 gh->gh_ip = (unsigned long)__builtin_return_address(0);
399}
400
401/**
402 * gfs2_holder_uninit - uninitialize a holder structure (drop glock reference)
403 * @gh: the holder structure
404 *
405 */
406
407void gfs2_holder_uninit(struct gfs2_holder *gh)
408{
409 gfs2_glock_put(gh->gh_gl);
410 gh->gh_gl = NULL;
411 gh->gh_ip = 0;
412}
413
414/**
415 * gfs2_holder_get - get a struct gfs2_holder structure
416 * @gl: the glock
417 * @state: the state we're requesting
418 * @flags: the modifier flags
419 * @gfp_flags: memory allocation flags for the new holder
420 *
421 * Figure out how big an impact this function has. Either:
422 * 1) Replace it with a cache of structures hanging off the struct gfs2_sbd
423 * 2) Leave it like it is
424 *
425 * Returns: the holder structure, NULL on ENOMEM
426 */
427
428static struct gfs2_holder *gfs2_holder_get(struct gfs2_glock *gl,
429 unsigned int state,
430 int flags, gfp_t gfp_flags)
431{
432 struct gfs2_holder *gh;
433
434 gh = kmalloc(sizeof(struct gfs2_holder), gfp_flags);
435 if (!gh)
436 return NULL;
437
438 gfs2_holder_init(gl, state, flags, gh);
439 set_bit(HIF_ALLOCED, &gh->gh_iflags);
440 gh->gh_ip = (unsigned long)__builtin_return_address(0);
441 return gh;
442}
443
444/**
445 * gfs2_holder_put - get rid of a struct gfs2_holder structure
446 * @gh: the holder structure
447 *
448 */
449
450static void gfs2_holder_put(struct gfs2_holder *gh)
451{
452 gfs2_holder_uninit(gh);
453 kfree(gh);
454}
455
456/**
457 * rq_mutex - process a mutex request in the queue
458 * @gh: the glock holder
459 *
460 * Returns: 1 if the queue is blocked
461 */
462
463static int rq_mutex(struct gfs2_holder *gh)
464{
465 struct gfs2_glock *gl = gh->gh_gl;
466
467 list_del_init(&gh->gh_list);
468 /* gh->gh_error never examined. */
469 set_bit(GLF_LOCK, &gl->gl_flags);
470 complete(&gh->gh_wait);
471
472 return 1;
473}
474
475/**
476 * rq_promote - process a promote request in the queue
477 * @gh: the glock holder
478 *
479 * Acquire a new inter-node lock, or change a lock state to more restrictive.
480 *
481 * Returns: 1 if the queue is blocked
482 */
483
484static int rq_promote(struct gfs2_holder *gh)
485{
486 struct gfs2_glock *gl = gh->gh_gl;
487 struct gfs2_sbd *sdp = gl->gl_sbd;
488 struct gfs2_glock_operations *glops = gl->gl_ops;
489
490 if (!relaxed_state_ok(gl->gl_state, gh->gh_state, gh->gh_flags)) {
491 if (list_empty(&gl->gl_holders)) {
492 gl->gl_req_gh = gh;
493 set_bit(GLF_LOCK, &gl->gl_flags);
494 spin_unlock(&gl->gl_spin);
495
496 if (atomic_read(&sdp->sd_reclaim_count) >
497 gfs2_tune_get(sdp, gt_reclaim_limit) &&
498 !(gh->gh_flags & LM_FLAG_PRIORITY)) {
499 gfs2_reclaim_glock(sdp);
500 gfs2_reclaim_glock(sdp);
501 }
502
503 glops->go_xmote_th(gl, gh->gh_state,
504 gh->gh_flags);
505
506 spin_lock(&gl->gl_spin);
507 }
508 return 1;
509 }
510
511 if (list_empty(&gl->gl_holders)) {
512 set_bit(HIF_FIRST, &gh->gh_iflags);
513 set_bit(GLF_LOCK, &gl->gl_flags);
514 } else {
515 struct gfs2_holder *next_gh;
516 if (gh->gh_flags & GL_LOCAL_EXCL)
517 return 1;
518 next_gh = list_entry(gl->gl_holders.next, struct gfs2_holder,
519 gh_list);
520 if (next_gh->gh_flags & GL_LOCAL_EXCL)
521 return 1;
522 }
523
524 list_move_tail(&gh->gh_list, &gl->gl_holders);
525 gh->gh_error = 0;
526 set_bit(HIF_HOLDER, &gh->gh_iflags);
527
528 complete(&gh->gh_wait);
529
530 return 0;
531}
532
533/**
534 * rq_demote - process a demote request in the queue
535 * @gh: the glock holder
536 *
537 * Returns: 1 if the queue is blocked
538 */
539
540static int rq_demote(struct gfs2_holder *gh)
541{
542 struct gfs2_glock *gl = gh->gh_gl;
543 struct gfs2_glock_operations *glops = gl->gl_ops;
544
545 if (!list_empty(&gl->gl_holders))
546 return 1;
547
548 if (gl->gl_state == gh->gh_state || gl->gl_state == LM_ST_UNLOCKED) {
549 list_del_init(&gh->gh_list);
550 gh->gh_error = 0;
551 spin_unlock(&gl->gl_spin);
552 if (test_bit(HIF_DEALLOC, &gh->gh_iflags))
553 gfs2_holder_put(gh);
554 else
555 complete(&gh->gh_wait);
556 spin_lock(&gl->gl_spin);
557 } else {
558 gl->gl_req_gh = gh;
559 set_bit(GLF_LOCK, &gl->gl_flags);
560 spin_unlock(&gl->gl_spin);
561
562 if (gh->gh_state == LM_ST_UNLOCKED ||
563 gl->gl_state != LM_ST_EXCLUSIVE)
564 glops->go_drop_th(gl);
565 else
566 glops->go_xmote_th(gl, gh->gh_state, gh->gh_flags);
567
568 spin_lock(&gl->gl_spin);
569 }
570
571 return 0;
572}
573
574/**
575 * rq_greedy - process a queued request to drop greedy status
576 * @gh: the glock holder
577 *
578 * Returns: 1 if the queue is blocked
579 */
580
581static int rq_greedy(struct gfs2_holder *gh)
582{
583 struct gfs2_glock *gl = gh->gh_gl;
584
585 list_del_init(&gh->gh_list);
586 /* gh->gh_error never examined. */
587 clear_bit(GLF_GREEDY, &gl->gl_flags);
588 spin_unlock(&gl->gl_spin);
589
590 gfs2_holder_uninit(gh);
591 kfree(container_of(gh, struct greedy, gr_gh));
592
593 spin_lock(&gl->gl_spin);
594
595 return 0;
596}
597
598/**
599 * run_queue - process holder structures on a glock
600 * @gl: the glock
601 *
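 * The three waiter lists are serviced in priority order: gl_waiters1
 * (glmutex requests), then gl_waiters2 (demote and greedy requests, unless
 * GLF_SKIP_WAITERS2 is set), then gl_waiters3 (promote requests), stopping
 * as soon as a request reports that it blocked.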
602 */
603static void run_queue(struct gfs2_glock *gl)
604{
605 struct gfs2_holder *gh;
606 int blocked = 1;
607
608 for (;;) {
609 if (test_bit(GLF_LOCK, &gl->gl_flags))
610 break;
611
612 if (!list_empty(&gl->gl_waiters1)) {
613 gh = list_entry(gl->gl_waiters1.next,
614 struct gfs2_holder, gh_list);
615
616 if (test_bit(HIF_MUTEX, &gh->gh_iflags))
617 blocked = rq_mutex(gh);
618 else
619 gfs2_assert_warn(gl->gl_sbd, 0);
620
621 } else if (!list_empty(&gl->gl_waiters2) &&
622 !test_bit(GLF_SKIP_WAITERS2, &gl->gl_flags)) {
623 gh = list_entry(gl->gl_waiters2.next,
624 struct gfs2_holder, gh_list);
625
626 if (test_bit(HIF_DEMOTE, &gh->gh_iflags))
627 blocked = rq_demote(gh);
628 else if (test_bit(HIF_GREEDY, &gh->gh_iflags))
629 blocked = rq_greedy(gh);
630 else
631 gfs2_assert_warn(gl->gl_sbd, 0);
632
633 } else if (!list_empty(&gl->gl_waiters3)) {
634 gh = list_entry(gl->gl_waiters3.next,
635 struct gfs2_holder, gh_list);
636
637 if (test_bit(HIF_PROMOTE, &gh->gh_iflags))
638 blocked = rq_promote(gh);
639 else
640 gfs2_assert_warn(gl->gl_sbd, 0);
641
642 } else
643 break;
644
645 if (blocked)
646 break;
647 }
648}
649
650/**
651 * gfs2_glmutex_lock - acquire a local lock on a glock
652 * @gl: the glock
653 *
654 * Gives caller exclusive access to manipulate a glock structure.
655 */
656
657static void gfs2_glmutex_lock(struct gfs2_glock *gl)
658{
659 struct gfs2_holder gh;
660
661 gfs2_holder_init(gl, 0, 0, &gh);
662 set_bit(HIF_MUTEX, &gh.gh_iflags);
663
664 spin_lock(&gl->gl_spin);
665 if (test_and_set_bit(GLF_LOCK, &gl->gl_flags))
666 list_add_tail(&gh.gh_list, &gl->gl_waiters1);
667 else {
668 gl->gl_owner = current;
669 gl->gl_ip = (unsigned long)__builtin_return_address(0);
670 complete(&gh.gh_wait);
671 }
672 spin_unlock(&gl->gl_spin);
673
674 wait_for_completion(&gh.gh_wait);
675 gfs2_holder_uninit(&gh);
676}
677
678/**
679 * gfs2_glmutex_trylock - try to acquire a local lock on a glock
680 * @gl: the glock
681 *
682 * Returns: 1 if the glock is acquired
683 */
684
685static int gfs2_glmutex_trylock(struct gfs2_glock *gl)
686{
687 int acquired = 1;
688
689 spin_lock(&gl->gl_spin);
690 if (test_and_set_bit(GLF_LOCK, &gl->gl_flags))
691 acquired = 0;
692 else {
693 gl->gl_owner = current;
694 gl->gl_ip = (unsigned long)__builtin_return_address(0);
695 }
696 spin_unlock(&gl->gl_spin);
697
698 return acquired;
699}
700
701/**
702 * gfs2_glmutex_unlock - release a local lock on a glock
703 * @gl: the glock
704 *
705 */
706
707static void gfs2_glmutex_unlock(struct gfs2_glock *gl)
708{
709 spin_lock(&gl->gl_spin);
710 clear_bit(GLF_LOCK, &gl->gl_flags);
711 gl->gl_owner = NULL;
712 gl->gl_ip = 0;
713 run_queue(gl);
714 BUG_ON(!spin_is_locked(&gl->gl_spin));
715 spin_unlock(&gl->gl_spin);
716}
717
718/**
719 * handle_callback - add a demote request to a lock's queue
720 * @gl: the glock
721 * @state: the state the caller wants us to change to
722 *
723 * Note: This may fail silently if we are out of memory.
724 */
725
726static void handle_callback(struct gfs2_glock *gl, unsigned int state)
727{
728 struct gfs2_holder *gh, *new_gh = NULL;
729
730restart:
731 spin_lock(&gl->gl_spin);
732
733 list_for_each_entry(gh, &gl->gl_waiters2, gh_list) {
734 if (test_bit(HIF_DEMOTE, &gh->gh_iflags) &&
735 gl->gl_req_gh != gh) {
736 if (gh->gh_state != state)
737 gh->gh_state = LM_ST_UNLOCKED;
738 goto out;
739 }
740 }
741
742 if (new_gh) {
743 list_add_tail(&new_gh->gh_list, &gl->gl_waiters2);
744 new_gh = NULL;
745 } else {
746 spin_unlock(&gl->gl_spin);
747
748 new_gh = gfs2_holder_get(gl, state, LM_FLAG_TRY, GFP_KERNEL);
749 if (!new_gh)
750 return;
751 set_bit(HIF_DEMOTE, &new_gh->gh_iflags);
752 set_bit(HIF_DEALLOC, &new_gh->gh_iflags);
753
754 goto restart;
755 }
756
757out:
758 spin_unlock(&gl->gl_spin);
759
760 if (new_gh)
761 gfs2_holder_put(new_gh);
762}
763
764void gfs2_glock_inode_squish(struct inode *inode)
765{
766 struct gfs2_holder gh;
767 struct gfs2_glock *gl = GFS2_I(inode)->i_gl;
768 gfs2_holder_init(gl, LM_ST_UNLOCKED, 0, &gh);
769 set_bit(HIF_DEMOTE, &gh.gh_iflags);
770 spin_lock(&gl->gl_spin);
771 gfs2_assert(inode->i_sb->s_fs_info, list_empty(&gl->gl_holders));
772 list_add_tail(&gh.gh_list, &gl->gl_waiters2);
773 run_queue(gl);
774 spin_unlock(&gl->gl_spin);
775 gfs2_holder_uninit(&gh);
776}
777
778/**
779 * state_change - record that the glock is now in a different state
780 * @gl: the glock
781 * @new_state: the new state
782 *
783 */
784
785static void state_change(struct gfs2_glock *gl, unsigned int new_state)
786{
787 int held1, held2;
788
789 held1 = (gl->gl_state != LM_ST_UNLOCKED);
790 held2 = (new_state != LM_ST_UNLOCKED);
791
792 if (held1 != held2) {
793 if (held2)
794 gfs2_glock_hold(gl);
795 else
796 gfs2_glock_put(gl);
797 }
798
799 gl->gl_state = new_state;
800}
801
802/**
803 * xmote_bh - Called after the lock module is done acquiring a lock
804 * @gl: The glock in question
805 * @ret: the int returned from the lock module
806 *
807 */
808
809static void xmote_bh(struct gfs2_glock *gl, unsigned int ret)
810{
811 struct gfs2_sbd *sdp = gl->gl_sbd;
812 struct gfs2_glock_operations *glops = gl->gl_ops;
813 struct gfs2_holder *gh = gl->gl_req_gh;
814 int prev_state = gl->gl_state;
815 int op_done = 1;
816
817 gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
818 gfs2_assert_warn(sdp, queue_empty(gl, &gl->gl_holders));
819 gfs2_assert_warn(sdp, !(ret & LM_OUT_ASYNC));
820
821 state_change(gl, ret & LM_OUT_ST_MASK);
822
823 if (prev_state != LM_ST_UNLOCKED && !(ret & LM_OUT_CACHEABLE)) {
824 if (glops->go_inval)
825 glops->go_inval(gl, DIO_METADATA | DIO_DATA);
826 } else if (gl->gl_state == LM_ST_DEFERRED) {
827 /* We might not want to do this here.
828 Look at moving to the inode glops. */
829 if (glops->go_inval)
830 glops->go_inval(gl, DIO_DATA);
831 }
832
833 /* Deal with each possible exit condition */
834
835 if (!gh)
836 gl->gl_stamp = jiffies;
837
838 else if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) {
839 spin_lock(&gl->gl_spin);
840 list_del_init(&gh->gh_list);
841 gh->gh_error = -EIO;
842 spin_unlock(&gl->gl_spin);
843
844 } else if (test_bit(HIF_DEMOTE, &gh->gh_iflags)) {
845 spin_lock(&gl->gl_spin);
846 list_del_init(&gh->gh_list);
847 if (gl->gl_state == gh->gh_state ||
848 gl->gl_state == LM_ST_UNLOCKED)
849 gh->gh_error = 0;
850 else {
851 if (gfs2_assert_warn(sdp, gh->gh_flags &
852 (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) == -1)
853 fs_warn(sdp, "ret = 0x%.8X\n", ret);
854 gh->gh_error = GLR_TRYFAILED;
855 }
856 spin_unlock(&gl->gl_spin);
857
858 if (ret & LM_OUT_CANCELED)
859 handle_callback(gl, LM_ST_UNLOCKED); /* Lame */
860
861 } else if (ret & LM_OUT_CANCELED) {
862 spin_lock(&gl->gl_spin);
863 list_del_init(&gh->gh_list);
864 gh->gh_error = GLR_CANCELED;
865 spin_unlock(&gl->gl_spin);
866
867 } else if (relaxed_state_ok(gl->gl_state, gh->gh_state, gh->gh_flags)) {
868 spin_lock(&gl->gl_spin);
869 list_move_tail(&gh->gh_list, &gl->gl_holders);
870 gh->gh_error = 0;
871 set_bit(HIF_HOLDER, &gh->gh_iflags);
872 spin_unlock(&gl->gl_spin);
873
874 set_bit(HIF_FIRST, &gh->gh_iflags);
875
876 op_done = 0;
877
878 } else if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) {
879 spin_lock(&gl->gl_spin);
880 list_del_init(&gh->gh_list);
881 gh->gh_error = GLR_TRYFAILED;
882 spin_unlock(&gl->gl_spin);
883
884 } else {
885 if (gfs2_assert_withdraw(sdp, 0) == -1)
886 fs_err(sdp, "ret = 0x%.8X\n", ret);
887 }
888
889 if (glops->go_xmote_bh)
890 glops->go_xmote_bh(gl);
891
892 if (op_done) {
893 spin_lock(&gl->gl_spin);
894 gl->gl_req_gh = NULL;
895 gl->gl_req_bh = NULL;
896 clear_bit(GLF_LOCK, &gl->gl_flags);
897 run_queue(gl);
898 spin_unlock(&gl->gl_spin);
899 }
900
901 gfs2_glock_put(gl);
902
903 if (gh) {
904 if (test_bit(HIF_DEALLOC, &gh->gh_iflags))
905 gfs2_holder_put(gh);
906 else
907 complete(&gh->gh_wait);
908 }
909}
910
911/**
912 * gfs2_glock_xmote_th - Call into the lock module to acquire or change a glock
913 * @gl: The glock in question
914 * @state: the requested state
915 * @flags: modifier flags to the lock call
916 *
917 */
918
919void gfs2_glock_xmote_th(struct gfs2_glock *gl, unsigned int state, int flags)
920{
921 struct gfs2_sbd *sdp = gl->gl_sbd;
922 struct gfs2_glock_operations *glops = gl->gl_ops;
923 int lck_flags = flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB |
924 LM_FLAG_NOEXP | LM_FLAG_ANY |
925 LM_FLAG_PRIORITY);
926 unsigned int lck_ret;
927
928 gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
929 gfs2_assert_warn(sdp, queue_empty(gl, &gl->gl_holders));
930 gfs2_assert_warn(sdp, state != LM_ST_UNLOCKED);
931 gfs2_assert_warn(sdp, state != gl->gl_state);
932
933 if (gl->gl_state == LM_ST_EXCLUSIVE) {
934 if (glops->go_sync)
935 glops->go_sync(gl,
936 DIO_METADATA | DIO_DATA | DIO_RELEASE);
937 }
938
939 gfs2_glock_hold(gl);
940 gl->gl_req_bh = xmote_bh;
941
942 lck_ret = gfs2_lm_lock(sdp, gl->gl_lock, gl->gl_state, state,
943 lck_flags);
944
945 if (gfs2_assert_withdraw(sdp, !(lck_ret & LM_OUT_ERROR)))
946 return;
947
948 if (lck_ret & LM_OUT_ASYNC)
949 gfs2_assert_warn(sdp, lck_ret == LM_OUT_ASYNC);
950 else
951 xmote_bh(gl, lck_ret);
952}
953
954/**
955 * drop_bh - Called after a lock module unlock completes
956 * @gl: the glock
957 * @ret: the return status
958 *
959 * Doesn't wake up the process waiting on the struct gfs2_holder (if any)
960 * Doesn't drop the reference on the glock the top half took out
961 *
962 */
963
964static void drop_bh(struct gfs2_glock *gl, unsigned int ret)
965{
966 struct gfs2_sbd *sdp = gl->gl_sbd;
967 struct gfs2_glock_operations *glops = gl->gl_ops;
968 struct gfs2_holder *gh = gl->gl_req_gh;
969
970 clear_bit(GLF_PREFETCH, &gl->gl_flags);
971
972 gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
973 gfs2_assert_warn(sdp, queue_empty(gl, &gl->gl_holders));
974 gfs2_assert_warn(sdp, !ret);
975
976 state_change(gl, LM_ST_UNLOCKED);
977
978 if (glops->go_inval)
979 glops->go_inval(gl, DIO_METADATA | DIO_DATA);
980
981 if (gh) {
982 spin_lock(&gl->gl_spin);
983 list_del_init(&gh->gh_list);
984 gh->gh_error = 0;
985 spin_unlock(&gl->gl_spin);
986 }
987
988 if (glops->go_drop_bh)
989 glops->go_drop_bh(gl);
990
991 spin_lock(&gl->gl_spin);
992 gl->gl_req_gh = NULL;
993 gl->gl_req_bh = NULL;
994 clear_bit(GLF_LOCK, &gl->gl_flags);
995 run_queue(gl);
996 spin_unlock(&gl->gl_spin);
997
998 gfs2_glock_put(gl);
999
1000 if (gh) {
1001 if (test_bit(HIF_DEALLOC, &gh->gh_iflags))
1002 gfs2_holder_put(gh);
1003 else
1004 complete(&gh->gh_wait);
1005 }
1006}
1007
1008/**
1009 * gfs2_glock_drop_th - call into the lock module to unlock a lock
1010 * @gl: the glock
1011 *
1012 */
1013
1014void gfs2_glock_drop_th(struct gfs2_glock *gl)
1015{
1016 struct gfs2_sbd *sdp = gl->gl_sbd;
1017 struct gfs2_glock_operations *glops = gl->gl_ops;
1018 unsigned int ret;
1019
1020 gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
1021 gfs2_assert_warn(sdp, queue_empty(gl, &gl->gl_holders));
1022 gfs2_assert_warn(sdp, gl->gl_state != LM_ST_UNLOCKED);
1023
1024 if (gl->gl_state == LM_ST_EXCLUSIVE) {
1025 if (glops->go_sync)
1026 glops->go_sync(gl,
1027 DIO_METADATA | DIO_DATA | DIO_RELEASE);
1028 }
1029
1030 gfs2_glock_hold(gl);
1031 gl->gl_req_bh = drop_bh;
1032
1033 ret = gfs2_lm_unlock(sdp, gl->gl_lock, gl->gl_state);
1034
1035 if (gfs2_assert_withdraw(sdp, !(ret & LM_OUT_ERROR)))
1036 return;
1037
1038 if (!ret)
1039 drop_bh(gl, ret);
1040 else
1041 gfs2_assert_warn(sdp, ret == LM_OUT_ASYNC);
1042}
1043
1044/**
1045 * do_cancels - cancel requests for locks stuck waiting on an expire flag
1046 * @gh: the LM_FLAG_PRIORITY holder waiting to acquire the lock
1047 *
1048 * Don't cancel GL_NOCANCEL requests.
1049 */
1050
1051static void do_cancels(struct gfs2_holder *gh)
1052{
1053 struct gfs2_glock *gl = gh->gh_gl;
1054
1055 spin_lock(&gl->gl_spin);
1056
1057 while (gl->gl_req_gh != gh &&
1058 !test_bit(HIF_HOLDER, &gh->gh_iflags) &&
1059 !list_empty(&gh->gh_list)) {
1060 if (gl->gl_req_bh &&
1061 !(gl->gl_req_gh &&
1062 (gl->gl_req_gh->gh_flags & GL_NOCANCEL))) {
1063 spin_unlock(&gl->gl_spin);
1064 gfs2_lm_cancel(gl->gl_sbd, gl->gl_lock);
1065 msleep(100);
1066 spin_lock(&gl->gl_spin);
1067 } else {
1068 spin_unlock(&gl->gl_spin);
1069 msleep(100);
1070 spin_lock(&gl->gl_spin);
1071 }
1072 }
1073
1074 spin_unlock(&gl->gl_spin);
1075}
1076
1077/**
1078 * glock_wait_internal - wait on a glock acquisition
1079 * @gh: the glock holder
1080 *
1081 * Returns: 0 on success
1082 */
1083
1084static int glock_wait_internal(struct gfs2_holder *gh)
1085{
1086 struct gfs2_glock *gl = gh->gh_gl;
1087 struct gfs2_sbd *sdp = gl->gl_sbd;
1088 struct gfs2_glock_operations *glops = gl->gl_ops;
1089
1090 if (test_bit(HIF_ABORTED, &gh->gh_iflags))
1091 return -EIO;
1092
1093 if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) {
1094 spin_lock(&gl->gl_spin);
1095 if (gl->gl_req_gh != gh &&
1096 !test_bit(HIF_HOLDER, &gh->gh_iflags) &&
1097 !list_empty(&gh->gh_list)) {
1098 list_del_init(&gh->gh_list);
1099 gh->gh_error = GLR_TRYFAILED;
1100 run_queue(gl);
1101 spin_unlock(&gl->gl_spin);
1102 return gh->gh_error;
1103 }
1104 spin_unlock(&gl->gl_spin);
1105 }
1106
1107 if (gh->gh_flags & LM_FLAG_PRIORITY)
1108 do_cancels(gh);
1109
1110 wait_for_completion(&gh->gh_wait);
1111
1112 if (gh->gh_error)
1113 return gh->gh_error;
1114
1115 gfs2_assert_withdraw(sdp, test_bit(HIF_HOLDER, &gh->gh_iflags));
1116 gfs2_assert_withdraw(sdp, relaxed_state_ok(gl->gl_state,
1117 gh->gh_state,
1118 gh->gh_flags));
1119
1120 if (test_bit(HIF_FIRST, &gh->gh_iflags)) {
1121 gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
1122
1123 if (glops->go_lock) {
1124 gh->gh_error = glops->go_lock(gh);
1125 if (gh->gh_error) {
1126 spin_lock(&gl->gl_spin);
1127 list_del_init(&gh->gh_list);
1128 spin_unlock(&gl->gl_spin);
1129 }
1130 }
1131
1132 spin_lock(&gl->gl_spin);
1133 gl->gl_req_gh = NULL;
1134 gl->gl_req_bh = NULL;
1135 clear_bit(GLF_LOCK, &gl->gl_flags);
1136 run_queue(gl);
1137 spin_unlock(&gl->gl_spin);
1138 }
1139
1140 return gh->gh_error;
1141}
1142
1143static inline struct gfs2_holder *
1144find_holder_by_owner(struct list_head *head, struct task_struct *owner)
1145{
1146 struct gfs2_holder *gh;
1147
1148 list_for_each_entry(gh, head, gh_list) {
1149 if (gh->gh_owner == owner)
1150 return gh;
1151 }
1152
1153 return NULL;
1154}
1155
1156/**
1157 * add_to_queue - Add a holder to the wait queue (but look for recursion)
1158 * @gh: the holder structure to add
1159 *
1160 */
1161
1162static void add_to_queue(struct gfs2_holder *gh)
1163{
1164 struct gfs2_glock *gl = gh->gh_gl;
1165 struct gfs2_holder *existing;
1166
1167 BUG_ON(!gh->gh_owner);
1168
1169 existing = find_holder_by_owner(&gl->gl_holders, gh->gh_owner);
1170 if (existing) {
1171 print_symbol(KERN_WARNING "original: %s\n", existing->gh_ip);
1172 print_symbol(KERN_WARNING "new: %s\n", gh->gh_ip);
1173 BUG();
1174 }
1175
1176 existing = find_holder_by_owner(&gl->gl_waiters3, gh->gh_owner);
1177 if (existing) {
1178 print_symbol(KERN_WARNING "original: %s\n", existing->gh_ip);
1179 print_symbol(KERN_WARNING "new: %s\n", gh->gh_ip);
1180 BUG();
1181 }
1182
1183 if (gh->gh_flags & LM_FLAG_PRIORITY)
1184 list_add(&gh->gh_list, &gl->gl_waiters3);
1185 else
1186 list_add_tail(&gh->gh_list, &gl->gl_waiters3);
1187}
1188
1189/**
1190 * gfs2_glock_nq - enqueue a struct gfs2_holder onto a glock (acquire a glock)
1191 * @gh: the holder structure
1192 *
1193 * if (gh->gh_flags & GL_ASYNC), this never returns an error
1194 *
1195 * Returns: 0, GLR_TRYFAILED, or errno on failure
1196 */
1197
1198int gfs2_glock_nq(struct gfs2_holder *gh)
1199{
1200 struct gfs2_glock *gl = gh->gh_gl;
1201 struct gfs2_sbd *sdp = gl->gl_sbd;
1202 int error = 0;
1203
1204restart:
1205 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) {
1206 set_bit(HIF_ABORTED, &gh->gh_iflags);
1207 return -EIO;
1208 }
1209
1210 set_bit(HIF_PROMOTE, &gh->gh_iflags);
1211
1212 spin_lock(&gl->gl_spin);
1213 add_to_queue(gh);
1214 run_queue(gl);
1215 spin_unlock(&gl->gl_spin);
1216
1217 if (!(gh->gh_flags & GL_ASYNC)) {
1218 error = glock_wait_internal(gh);
1219 if (error == GLR_CANCELED) {
1220 msleep(100);
1221 goto restart;
1222 }
1223 }
1224
1225 clear_bit(GLF_PREFETCH, &gl->gl_flags);
1226
1227 if (error == GLR_TRYFAILED && (gh->gh_flags & GL_DUMP))
1228 dump_glock(gl);
1229
1230 return error;
1231}
1232
1233/**
1234 * gfs2_glock_poll - poll to see if an async request has been completed
1235 * @gh: the holder
1236 *
1237 * Returns: 1 if the request is ready to be gfs2_glock_wait()ed on
1238 */
1239
1240int gfs2_glock_poll(struct gfs2_holder *gh)
1241{
1242 struct gfs2_glock *gl = gh->gh_gl;
1243 int ready = 0;
1244
1245 spin_lock(&gl->gl_spin);
1246
1247 if (test_bit(HIF_HOLDER, &gh->gh_iflags))
1248 ready = 1;
1249 else if (list_empty(&gh->gh_list)) {
1250 if (gh->gh_error == GLR_CANCELED) {
1251 spin_unlock(&gl->gl_spin);
1252 msleep(100);
1253 if (gfs2_glock_nq(gh))
1254 return 1;
1255 return 0;
1256 } else
1257 ready = 1;
1258 }
1259
1260 spin_unlock(&gl->gl_spin);
1261
1262 return ready;
1263}
1264
1265/**
1266 * gfs2_glock_wait - wait for a lock acquisition that ended in a GLR_ASYNC
1267 * @gh: the holder structure
1268 *
1269 * Returns: 0, GLR_TRYFAILED, or errno on failure
1270 */
1271
1272int gfs2_glock_wait(struct gfs2_holder *gh)
1273{
1274 int error;
1275
1276 error = glock_wait_internal(gh);
1277 if (error == GLR_CANCELED) {
1278 msleep(100);
1279 gh->gh_flags &= ~GL_ASYNC;
1280 error = gfs2_glock_nq(gh);
1281 }
1282
1283 return error;
1284}
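
For orientation, a minimal editorial sketch (not part of this patch) of how a caller might combine GL_ASYNC with gfs2_glock_poll()/gfs2_glock_wait(); gl is assumed to be a glock already obtained elsewhere:

        struct gfs2_holder gh;

        gfs2_holder_init(gl, LM_ST_SHARED, GL_ASYNC, &gh);
        gfs2_glock_nq(&gh);                     /* with GL_ASYNC this never returns an error */

        /* ... do other work while the grant proceeds in the background ... */

        if (gfs2_glock_poll(&gh) &&             /* non-blocking readiness check */
            gfs2_glock_wait(&gh) == 0) {        /* reap the final status */
                /* ... the glock is now held in LM_ST_SHARED ... */
                gfs2_glock_dq_uninit(&gh);      /* release and tear down the holder */
        }
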
1285
1286/**
1287 * gfs2_glock_dq - dequeue a struct gfs2_holder from a glock (release a glock)
1288 * @gh: the glock holder
1289 *
1290 */
1291
1292void gfs2_glock_dq(struct gfs2_holder *gh)
1293{
1294 struct gfs2_glock *gl = gh->gh_gl;
1295 struct gfs2_glock_operations *glops = gl->gl_ops;
1296
1297 if (gh->gh_flags & GL_SYNC)
1298 set_bit(GLF_SYNC, &gl->gl_flags);
1299
1300 if (gh->gh_flags & GL_NOCACHE)
1301 handle_callback(gl, LM_ST_UNLOCKED);
1302
1303 gfs2_glmutex_lock(gl);
1304
1305 spin_lock(&gl->gl_spin);
1306 list_del_init(&gh->gh_list);
1307
1308 if (list_empty(&gl->gl_holders)) {
1309 spin_unlock(&gl->gl_spin);
1310
1311 if (glops->go_unlock)
1312 glops->go_unlock(gh);
1313
1314 if (test_bit(GLF_SYNC, &gl->gl_flags)) {
1315 if (glops->go_sync)
1316 glops->go_sync(gl, DIO_METADATA | DIO_DATA);
1317 }
1318
1319 gl->gl_stamp = jiffies;
1320
1321 spin_lock(&gl->gl_spin);
1322 }
1323
1324 clear_bit(GLF_LOCK, &gl->gl_flags);
1325 run_queue(gl);
1326 spin_unlock(&gl->gl_spin);
1327}
1328
1329/**
1330 * gfs2_glock_prefetch - Try to prefetch a glock
1331 * @gl: the glock
1332 * @state: the state to prefetch in
1333 * @flags: flags passed to go_xmote_th()
1334 *
1335 */
1336
1337static void gfs2_glock_prefetch(struct gfs2_glock *gl, unsigned int state,
1338 int flags)
1339{
1340 struct gfs2_glock_operations *glops = gl->gl_ops;
1341
1342 spin_lock(&gl->gl_spin);
1343
1344 if (test_bit(GLF_LOCK, &gl->gl_flags) ||
1345 !list_empty(&gl->gl_holders) ||
1346 !list_empty(&gl->gl_waiters1) ||
1347 !list_empty(&gl->gl_waiters2) ||
1348 !list_empty(&gl->gl_waiters3) ||
1349 relaxed_state_ok(gl->gl_state, state, flags)) {
1350 spin_unlock(&gl->gl_spin);
1351 return;
1352 }
1353
1354 set_bit(GLF_PREFETCH, &gl->gl_flags);
1355 set_bit(GLF_LOCK, &gl->gl_flags);
1356 spin_unlock(&gl->gl_spin);
1357
1358 glops->go_xmote_th(gl, state, flags);
1359}
1360
1361static void greedy_work(void *data)
1362{
1363 struct greedy *gr = data;
1364 struct gfs2_holder *gh = &gr->gr_gh;
1365 struct gfs2_glock *gl = gh->gh_gl;
1366 struct gfs2_glock_operations *glops = gl->gl_ops;
1367
1368 clear_bit(GLF_SKIP_WAITERS2, &gl->gl_flags);
1369
1370 if (glops->go_greedy)
1371 glops->go_greedy(gl);
1372
1373 spin_lock(&gl->gl_spin);
1374
1375 if (list_empty(&gl->gl_waiters2)) {
1376 clear_bit(GLF_GREEDY, &gl->gl_flags);
1377 spin_unlock(&gl->gl_spin);
1378 gfs2_holder_uninit(gh);
1379 kfree(gr);
1380 } else {
1381 gfs2_glock_hold(gl);
1382 list_add_tail(&gh->gh_list, &gl->gl_waiters2);
1383 run_queue(gl);
1384 spin_unlock(&gl->gl_spin);
1385 gfs2_glock_put(gl);
1386 }
1387}
1388
1389/**
1390 * gfs2_glock_be_greedy - arrange for go_greedy to be called after a delay
1391 * @gl: the glock
1392 * @time: the delay, in jiffies, before go_greedy is called
1393 *
1394 * Returns: 0 if go_greedy will be called, 1 otherwise
1395 */
1396
1397int gfs2_glock_be_greedy(struct gfs2_glock *gl, unsigned int time)
1398{
1399 struct greedy *gr;
1400 struct gfs2_holder *gh;
1401
1402 if (!time || gl->gl_sbd->sd_args.ar_localcaching ||
1403 test_and_set_bit(GLF_GREEDY, &gl->gl_flags))
1404 return 1;
1405
1406 gr = kmalloc(sizeof(struct greedy), GFP_KERNEL);
1407 if (!gr) {
1408 clear_bit(GLF_GREEDY, &gl->gl_flags);
1409 return 1;
1410 }
1411 gh = &gr->gr_gh;
1412
1413 gfs2_holder_init(gl, 0, 0, gh);
1414 set_bit(HIF_GREEDY, &gh->gh_iflags);
1415 INIT_WORK(&gr->gr_work, greedy_work, gr);
1416
1417 set_bit(GLF_SKIP_WAITERS2, &gl->gl_flags);
1418 schedule_delayed_work(&gr->gr_work, time);
1419
1420 return 0;
1421}
1422
1423/**
1424 * gfs2_glock_dq_uninit - dequeue a holder from a glock and initialize it
1425 * @gh: the holder structure
1426 *
1427 */
1428
1429void gfs2_glock_dq_uninit(struct gfs2_holder *gh)
1430{
1431 gfs2_glock_dq(gh);
1432 gfs2_holder_uninit(gh);
1433}
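
A minimal synchronous usage sketch (editorial, not part of the patch), assuming gl is a glock the caller already holds a reference on:

        struct gfs2_holder gh;
        int error;

        gfs2_holder_init(gl, LM_ST_EXCLUSIVE, 0, &gh);
        error = gfs2_glock_nq(&gh);             /* blocks until granted or failed */
        if (!error) {
                /* ... touch the object protected by this glock ... */
                gfs2_glock_dq_uninit(&gh);      /* dequeue and uninitialize in one call */
        } else {
                gfs2_holder_uninit(&gh);        /* never granted; just tear down */
        }
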
1434
1435/**
1436 * gfs2_glock_nq_num - acquire a glock based on lock number
1437 * @sdp: the filesystem
1438 * @number: the lock number
1439 * @glops: the glock operations for the type of glock
1440 * @state: the state to acquire the glock in
1441 * @flags: modifier flags for the acquisition
1442 * @gh: the struct gfs2_holder
1443 *
1444 * Returns: errno
1445 */
1446
1447int gfs2_glock_nq_num(struct gfs2_sbd *sdp, uint64_t number,
1448 struct gfs2_glock_operations *glops, unsigned int state,
1449 int flags, struct gfs2_holder *gh)
1450{
1451 struct gfs2_glock *gl;
1452 int error;
1453
1454 error = gfs2_glock_get(sdp, number, glops, CREATE, &gl);
1455 if (!error) {
1456 error = gfs2_glock_nq_init(gl, state, flags, gh);
1457 gfs2_glock_put(gl);
1458 }
1459
1460 return error;
1461}
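
The same pattern driven by a lock number rather than an existing glock (editorial sketch; the lock number and the choice of gfs2_inode_glops are illustrative assumptions):

        struct gfs2_holder gh;
        int error;

        /* Look up (or create) the glock for this number and enqueue in one step. */
        error = gfs2_glock_nq_num(sdp, number, &gfs2_inode_glops,
                                  LM_ST_SHARED, 0, &gh);
        if (!error) {
                /* ... read the protected object ... */
                gfs2_glock_dq_uninit(&gh);
        }
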
1462
1463/**
1464 * glock_compare - Compare two struct gfs2_glock structures for sorting
1465 * @arg_a: the first structure
1466 * @arg_b: the second structure
1467 *
1468 */
1469
1470static int glock_compare(const void *arg_a, const void *arg_b)
1471{
1472 struct gfs2_holder *gh_a = *(struct gfs2_holder **)arg_a;
1473 struct gfs2_holder *gh_b = *(struct gfs2_holder **)arg_b;
1474 struct lm_lockname *a = &gh_a->gh_gl->gl_name;
1475 struct lm_lockname *b = &gh_b->gh_gl->gl_name;
1476 int ret = 0;
1477
1478 if (a->ln_number > b->ln_number)
1479 ret = 1;
1480 else if (a->ln_number < b->ln_number)
1481 ret = -1;
1482 else {
1483 if (gh_a->gh_state == LM_ST_SHARED &&
1484 gh_b->gh_state == LM_ST_EXCLUSIVE)
1485 ret = 1;
1486 else if (!(gh_a->gh_flags & GL_LOCAL_EXCL) &&
1487 (gh_b->gh_flags & GL_LOCAL_EXCL))
1488 ret = 1;
1489 }
1490
1491 return ret;
1492}
1493
1494/**
1495 * nq_m_sync - synchronously acquire more than one glock in deadlock-free order
1496 * @num_gh: the number of structures
1497 * @ghs: an array of struct gfs2_holder structures
1498 *
1499 * Returns: 0 on success (all glocks acquired),
1500 * errno on failure (no glocks acquired)
1501 */
1502
1503static int nq_m_sync(unsigned int num_gh, struct gfs2_holder *ghs,
1504 struct gfs2_holder **p)
1505{
1506 unsigned int x;
1507 int error = 0;
1508
1509 for (x = 0; x < num_gh; x++)
1510 p[x] = &ghs[x];
1511
1512 sort(p, num_gh, sizeof(struct gfs2_holder *), glock_compare, NULL);
1513
1514 for (x = 0; x < num_gh; x++) {
1515 p[x]->gh_flags &= ~(LM_FLAG_TRY | GL_ASYNC);
1516
1517 error = gfs2_glock_nq(p[x]);
1518 if (error) {
1519 while (x--)
1520 gfs2_glock_dq(p[x]);
1521 break;
1522 }
1523 }
1524
1525 return error;
1526}
1527
1528/**
1529 * gfs2_glock_nq_m - acquire multiple glocks
1530 * @num_gh: the number of structures
1531 * @ghs: an array of struct gfs2_holder structures
1532 *
1533 * Figure out how big an impact this function has. Either:
1534 * 1) Replace this code with code that calls gfs2_glock_prefetch()
1535 * 2) Forget async stuff and just call nq_m_sync()
1536 * 3) Leave it like it is
1537 *
1538 * Returns: 0 on success (all glocks acquired),
1539 * errno on failure (no glocks acquired)
1540 */
1541
1542int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs)
1543{
1544 int *e;
1545 unsigned int x;
1546 int borked = 0, serious = 0;
1547 int error = 0;
1548
1549 if (!num_gh)
1550 return 0;
1551
1552 if (num_gh == 1) {
1553 ghs->gh_flags &= ~(LM_FLAG_TRY | GL_ASYNC);
1554 return gfs2_glock_nq(ghs);
1555 }
1556
1557 e = kcalloc(num_gh, sizeof(struct gfs2_holder *), GFP_KERNEL);
1558 if (!e)
1559 return -ENOMEM;
1560
1561 for (x = 0; x < num_gh; x++) {
1562 ghs[x].gh_flags |= LM_FLAG_TRY | GL_ASYNC;
1563 error = gfs2_glock_nq(&ghs[x]);
1564 if (error) {
1565 borked = 1;
1566 serious = error;
1567 num_gh = x;
1568 break;
1569 }
1570 }
1571
1572 for (x = 0; x < num_gh; x++) {
1573 error = e[x] = glock_wait_internal(&ghs[x]);
1574 if (error) {
1575 borked = 1;
1576 if (error != GLR_TRYFAILED && error != GLR_CANCELED)
1577 serious = error;
1578 }
1579 }
1580
1581 if (!borked) {
1582 kfree(e);
1583 return 0;
1584 }
1585
1586 for (x = 0; x < num_gh; x++)
1587 if (!e[x])
1588 gfs2_glock_dq(&ghs[x]);
1589
1590 if (serious)
1591 error = serious;
1592 else {
1593 for (x = 0; x < num_gh; x++)
1594 gfs2_holder_reinit(ghs[x].gh_state, ghs[x].gh_flags,
1595 &ghs[x]);
1596 error = nq_m_sync(num_gh, ghs, (struct gfs2_holder **)e);
1597 }
1598
1599 kfree(e);
1600
1601 return error;
1602}
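
An illustrative sketch of the multi-glock interface (editorial; gl_a and gl_b stand for two glocks the caller already references). gfs2_glock_nq_m() either grants every holder or none:

        struct gfs2_holder ghs[2];
        int error;

        gfs2_holder_init(gl_a, LM_ST_EXCLUSIVE, 0, &ghs[0]);
        gfs2_holder_init(gl_b, LM_ST_EXCLUSIVE, 0, &ghs[1]);

        error = gfs2_glock_nq_m(2, ghs);        /* all-or-nothing acquisition */
        if (!error) {
                /* ... both glocks are held ... */
                gfs2_glock_dq_m(2, ghs);        /* release both */
        }

        gfs2_holder_uninit(&ghs[0]);
        gfs2_holder_uninit(&ghs[1]);
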
1603
1604/**
1605 * gfs2_glock_dq_m - release multiple glocks
1606 * @num_gh: the number of structures
1607 * @ghs: an array of struct gfs2_holder structures
1608 *
1609 */
1610
1611void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs)
1612{
1613 unsigned int x;
1614
1615 for (x = 0; x < num_gh; x++)
1616 gfs2_glock_dq(&ghs[x]);
1617}
1618
1619/**
1620 * gfs2_glock_dq_uninit_m - release multiple glocks
1621 * @num_gh: the number of structures
1622 * @ghs: an array of struct gfs2_holder structures
1623 *
1624 */
1625
1626void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs)
1627{
1628 unsigned int x;
1629
1630 for (x = 0; x < num_gh; x++)
1631 gfs2_glock_dq_uninit(&ghs[x]);
1632}
1633
1634/**
1635 * gfs2_glock_prefetch_num - prefetch a glock based on lock number
1636 * @sdp: the filesystem
1637 * @number: the lock number
1638 * @glops: the glock operations for the type of glock
1639 * @state: the state to acquire the glock in
1640 * @flags: modifier flags for the acquisition
1641 *
1642 *
1643 */
1644
1645void gfs2_glock_prefetch_num(struct gfs2_sbd *sdp, uint64_t number,
1646 struct gfs2_glock_operations *glops,
1647 unsigned int state, int flags)
1648{
1649 struct gfs2_glock *gl;
1650 int error;
1651
1652 if (atomic_read(&sdp->sd_reclaim_count) <
1653 gfs2_tune_get(sdp, gt_reclaim_limit)) {
1654 error = gfs2_glock_get(sdp, number, glops, CREATE, &gl);
1655 if (!error) {
1656 gfs2_glock_prefetch(gl, state, flags);
1657 gfs2_glock_put(gl);
1658 }
1659 }
1660}
1661
1662/**
1663 * gfs2_lvb_hold - attach a LVB to a glock
1664 * @gl: The glock in question
1665 *
1666 */
1667
1668int gfs2_lvb_hold(struct gfs2_glock *gl)
1669{
1670 int error;
1671
1672 gfs2_glmutex_lock(gl);
1673
1674 if (!atomic_read(&gl->gl_lvb_count)) {
1675 error = gfs2_lm_hold_lvb(gl->gl_sbd, gl->gl_lock, &gl->gl_lvb);
1676 if (error) {
1677 gfs2_glmutex_unlock(gl);
1678 return error;
1679 }
1680 gfs2_glock_hold(gl);
1681 }
1682 atomic_inc(&gl->gl_lvb_count);
1683
1684 gfs2_glmutex_unlock(gl);
1685
1686 return 0;
1687}
1688
1689/**
1690 * gfs2_lvb_unhold - detach a LVB from a glock
1691 * @gl: The glock in question
1692 *
1693 */
1694
1695void gfs2_lvb_unhold(struct gfs2_glock *gl)
1696{
1697 gfs2_glock_hold(gl);
1698 gfs2_glmutex_lock(gl);
1699
1700 gfs2_assert(gl->gl_sbd, atomic_read(&gl->gl_lvb_count) > 0);
1701 if (atomic_dec_and_test(&gl->gl_lvb_count)) {
1702 gfs2_lm_unhold_lvb(gl->gl_sbd, gl->gl_lock, gl->gl_lvb);
1703 gl->gl_lvb = NULL;
1704 gfs2_glock_put(gl);
1705 }
1706
1707 gfs2_glmutex_unlock(gl);
1708 gfs2_glock_put(gl);
1709}
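
A short sketch of the LVB reference pattern (editorial, not part of the patch); gl is a glock the caller already references:

        int error;

        error = gfs2_lvb_hold(gl);              /* attach and pin the lock value block */
        if (!error) {
                /* ... read or update the buffer at gl->gl_lvb ... */
                gfs2_lvb_unhold(gl);            /* drop the reference; freed on the last put */
        }
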
1710
1711#if 0
1712void gfs2_lvb_sync(struct gfs2_glock *gl)
1713{
1714 gfs2_glmutex_lock(gl);
1715
1716 gfs2_assert(gl->gl_sbd, atomic_read(&gl->gl_lvb_count));
1717 if (!gfs2_assert_warn(gl->gl_sbd, gfs2_glock_is_held_excl(gl)))
1718 gfs2_lm_sync_lvb(gl->gl_sbd, gl->gl_lock, gl->gl_lvb);
1719
1720 gfs2_glmutex_unlock(gl);
1721}
1722#endif /* 0 */
1723
1724static void blocking_cb(struct gfs2_sbd *sdp, struct lm_lockname *name,
1725 unsigned int state)
1726{
1727 struct gfs2_glock *gl;
1728
1729 gl = gfs2_glock_find(sdp, name);
1730 if (!gl)
1731 return;
1732
1733 if (gl->gl_ops->go_callback)
1734 gl->gl_ops->go_callback(gl, state);
1735 handle_callback(gl, state);
1736
1737 spin_lock(&gl->gl_spin);
1738 run_queue(gl);
1739 spin_unlock(&gl->gl_spin);
1740
1741 gfs2_glock_put(gl);
1742}
1743
1744/**
1745 * gfs2_glock_cb - Callback used by locking module
1746 * @fsdata: Pointer to the superblock
1747 * @type: Type of callback
1748 * @data: Type dependent data pointer
1749 *
1750 * Called by the locking module when it wants to tell us something.
1751 * Either we need to drop a lock, one of our ASYNC requests completed, or
1752 * a journal from another client needs to be recovered.
1753 */
1754
1755void gfs2_glock_cb(lm_fsdata_t *fsdata, unsigned int type, void *data)
1756{
1757 struct gfs2_sbd *sdp = (struct gfs2_sbd *)fsdata;
1758
1759 switch (type) {
1760 case LM_CB_NEED_E:
1761 blocking_cb(sdp, data, LM_ST_UNLOCKED);
1762 return;
1763
1764 case LM_CB_NEED_D:
1765 blocking_cb(sdp, data, LM_ST_DEFERRED);
1766 return;
1767
1768 case LM_CB_NEED_S:
1769 blocking_cb(sdp, data, LM_ST_SHARED);
1770 return;
1771
1772 case LM_CB_ASYNC: {
1773 struct lm_async_cb *async = data;
1774 struct gfs2_glock *gl;
1775
1776 gl = gfs2_glock_find(sdp, &async->lc_name);
1777 if (gfs2_assert_warn(sdp, gl))
1778 return;
1779 if (!gfs2_assert_warn(sdp, gl->gl_req_bh))
1780 gl->gl_req_bh(gl, async->lc_ret);
1781 gfs2_glock_put(gl);
1782 return;
1783 }
1784
1785 case LM_CB_NEED_RECOVERY:
1786 gfs2_jdesc_make_dirty(sdp, *(unsigned int *)data);
1787 if (sdp->sd_recoverd_process)
1788 wake_up_process(sdp->sd_recoverd_process);
1789 return;
1790
1791 case LM_CB_DROPLOCKS:
1792 gfs2_gl_hash_clear(sdp, NO_WAIT);
1793 gfs2_quota_scan(sdp);
1794 return;
1795
1796 default:
1797 gfs2_assert_warn(sdp, 0);
1798 return;
1799 }
1800}
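
For context, a hedged sketch of how a lock module might deliver callbacks through this entry point (editorial; the fsdata pointer, lock number, and initializer values are assumptions, not taken from the patch):

        struct lm_lockname name = {
                .ln_number = 0x1234,            /* hypothetical lock number */
                .ln_type = LM_TYPE_INODE,
        };
        struct lm_async_cb async = { .lc_name = name, .lc_ret = 0 };

        /* Another node wants the lock; ask GFS2 to demote it to shared. */
        gfs2_glock_cb(fsdata, LM_CB_NEED_S, &name);

        /* An earlier asynchronous request finished; report its status. */
        gfs2_glock_cb(fsdata, LM_CB_ASYNC, &async);
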
1801
1802/**
1803 * gfs2_iopen_go_callback - Try to kick the inode/vnode associated with an
1804 * iopen glock from memory
1805 * @io_gl: the iopen glock
1806 * @state: the state into which the glock should be put
1807 *
1808 */
1809
1810void gfs2_iopen_go_callback(struct gfs2_glock *io_gl, unsigned int state)
1811{
1812
1813 if (state != LM_ST_UNLOCKED)
1814 return;
1815 /* FIXME: remove this? */
1816}
1817
1818/**
1819 * demote_ok - Check to see if it's ok to unlock a glock
1820 * @gl: the glock
1821 *
1822 * Returns: 1 if it's ok
1823 */
1824
1825static int demote_ok(struct gfs2_glock *gl)
1826{
1827 struct gfs2_sbd *sdp = gl->gl_sbd;
1828 struct gfs2_glock_operations *glops = gl->gl_ops;
1829 int demote = 1;
1830
1831 if (test_bit(GLF_STICKY, &gl->gl_flags))
1832 demote = 0;
1833 else if (test_bit(GLF_PREFETCH, &gl->gl_flags))
1834 demote = time_after_eq(jiffies,
1835 gl->gl_stamp +
1836 gfs2_tune_get(sdp, gt_prefetch_secs) * HZ);
1837 else if (glops->go_demote_ok)
1838 demote = glops->go_demote_ok(gl);
1839
1840 return demote;
1841}
1842
1843/**
1844 * gfs2_glock_schedule_for_reclaim - Add a glock to the reclaim list
1845 * @gl: the glock
1846 *
1847 */
1848
1849void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl)
1850{
1851 struct gfs2_sbd *sdp = gl->gl_sbd;
1852
1853 spin_lock(&sdp->sd_reclaim_lock);
1854 if (list_empty(&gl->gl_reclaim)) {
1855 gfs2_glock_hold(gl);
1856 list_add(&gl->gl_reclaim, &sdp->sd_reclaim_list);
1857 atomic_inc(&sdp->sd_reclaim_count);
1858 }
1859 spin_unlock(&sdp->sd_reclaim_lock);
1860
1861 wake_up(&sdp->sd_reclaim_wq);
1862}
1863
1864/**
1865 * gfs2_reclaim_glock - process the next glock on the filesystem's reclaim list
1866 * @sdp: the filesystem
1867 *
1868 * Called from gfs2_glockd() glock reclaim daemon, or when promoting a
1869 * different glock and we notice that there are a lot of glocks in the
1870 * reclaim list.
1871 *
1872 */
1873
1874void gfs2_reclaim_glock(struct gfs2_sbd *sdp)
1875{
1876 struct gfs2_glock *gl;
1877
1878 spin_lock(&sdp->sd_reclaim_lock);
1879 if (list_empty(&sdp->sd_reclaim_list)) {
1880 spin_unlock(&sdp->sd_reclaim_lock);
1881 return;
1882 }
1883 gl = list_entry(sdp->sd_reclaim_list.next,
1884 struct gfs2_glock, gl_reclaim);
1885 list_del_init(&gl->gl_reclaim);
1886 spin_unlock(&sdp->sd_reclaim_lock);
1887
1888 atomic_dec(&sdp->sd_reclaim_count);
1889 atomic_inc(&sdp->sd_reclaimed);
1890
1891 if (gfs2_glmutex_trylock(gl)) {
1892 if (queue_empty(gl, &gl->gl_holders) &&
1893 gl->gl_state != LM_ST_UNLOCKED &&
1894 demote_ok(gl))
1895 handle_callback(gl, LM_ST_UNLOCKED);
1896 gfs2_glmutex_unlock(gl);
1897 }
1898
1899 gfs2_glock_put(gl);
1900}
1901
1902/**
1903 * examine_bucket - Call a function for each glock in a hash bucket
1904 * @examiner: the function
1905 * @sdp: the filesystem
1906 * @bucket: the bucket
1907 *
1908 * Returns: 1 if the bucket has entries
1909 */
1910
1911static int examine_bucket(glock_examiner examiner, struct gfs2_sbd *sdp,
1912 struct gfs2_gl_hash_bucket *bucket)
1913{
1914 struct glock_plug plug;
1915 struct list_head *tmp;
1916 struct gfs2_glock *gl;
1917 int entries;
1918
1919 /* Add "plug" to end of bucket list, work back up list from there */
1920 memset(&plug.gl_flags, 0, sizeof(unsigned long));
1921 set_bit(GLF_PLUG, &plug.gl_flags);
1922
1923 write_lock(&bucket->hb_lock);
1924 list_add(&plug.gl_list, &bucket->hb_list);
1925 write_unlock(&bucket->hb_lock);
1926
1927 for (;;) {
1928 write_lock(&bucket->hb_lock);
1929
1930 for (;;) {
1931 tmp = plug.gl_list.next;
1932
1933 if (tmp == &bucket->hb_list) {
1934 list_del(&plug.gl_list);
1935 entries = !list_empty(&bucket->hb_list);
1936 write_unlock(&bucket->hb_lock);
1937 return entries;
1938 }
1939 gl = list_entry(tmp, struct gfs2_glock, gl_list);
1940
1941 /* Move plug up list */
1942 list_move(&plug.gl_list, &gl->gl_list);
1943
1944 if (test_bit(GLF_PLUG, &gl->gl_flags))
1945 continue;
1946
1947 /* examiner() must glock_put() */
1948 gfs2_glock_hold(gl);
1949
1950 break;
1951 }
1952
1953 write_unlock(&bucket->hb_lock);
1954
1955 examiner(gl);
1956 }
1957}
1958
1959/**
1960 * scan_glock - look at a glock and see if we can reclaim it
1961 * @gl: the glock to look at
1962 *
1963 */
1964
1965static void scan_glock(struct gfs2_glock *gl)
1966{
1967 if (gfs2_glmutex_trylock(gl)) {
1968 if (gl->gl_ops == &gfs2_inode_glops)
1969 goto out;
1970 if (queue_empty(gl, &gl->gl_holders) &&
1971 gl->gl_state != LM_ST_UNLOCKED &&
1972 demote_ok(gl))
1973 goto out_schedule;
1974out:
1975 gfs2_glmutex_unlock(gl);
1976 }
1977
1978 gfs2_glock_put(gl);
1979
1980 return;
1981
1982out_schedule:
1983 gfs2_glmutex_unlock(gl);
1984 gfs2_glock_schedule_for_reclaim(gl);
1985 gfs2_glock_put(gl);
1986}
1987
1988/**
1989 * gfs2_scand_internal - Look for glocks and inodes to toss from memory
1990 * @sdp: the filesystem
1991 *
1992 */
1993
1994void gfs2_scand_internal(struct gfs2_sbd *sdp)
1995{
1996 unsigned int x;
1997
1998 for (x = 0; x < GFS2_GL_HASH_SIZE; x++) {
1999 examine_bucket(scan_glock, sdp, &sdp->sd_gl_hash[x]);
2000 cond_resched();
2001 }
2002}
2003
2004/**
2005 * clear_glock - look at a glock and see if we can free it from glock cache
2006 * @gl: the glock to look at
2007 *
2008 */
2009
2010static void clear_glock(struct gfs2_glock *gl)
2011{
2012 struct gfs2_sbd *sdp = gl->gl_sbd;
2013 int released;
2014
2015 spin_lock(&sdp->sd_reclaim_lock);
2016 if (!list_empty(&gl->gl_reclaim)) {
2017 list_del_init(&gl->gl_reclaim);
2018 atomic_dec(&sdp->sd_reclaim_count);
2019 spin_unlock(&sdp->sd_reclaim_lock);
2020 released = gfs2_glock_put(gl);
2021 gfs2_assert(sdp, !released);
2022 } else {
2023 spin_unlock(&sdp->sd_reclaim_lock);
2024 }
2025
2026 if (gfs2_glmutex_trylock(gl)) {
2027 if (queue_empty(gl, &gl->gl_holders) &&
2028 gl->gl_state != LM_ST_UNLOCKED)
2029 handle_callback(gl, LM_ST_UNLOCKED);
2030
2031 gfs2_glmutex_unlock(gl);
2032 }
2033
2034 gfs2_glock_put(gl);
2035}
2036
2037/**
2038 * gfs2_gl_hash_clear - Empty out the glock hash table
2039 * @sdp: the filesystem
2040 * @wait: wait until it's all gone
2041 *
2042 * Called when unmounting the filesystem, or when inter-node lock manager
2043 * requests DROPLOCKS because it is running out of capacity.
2044 */
2045
2046void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait)
2047{
2048 unsigned long t;
2049 unsigned int x;
2050 int cont;
2051
2052 t = jiffies;
2053
2054 for (;;) {
2055 cont = 0;
2056
2057 for (x = 0; x < GFS2_GL_HASH_SIZE; x++)
2058 if (examine_bucket(clear_glock, sdp,
2059 &sdp->sd_gl_hash[x]))
2060 cont = 1;
2061
2062 if (!wait || !cont)
2063 break;
2064
2065 if (time_after_eq(jiffies,
2066 t + gfs2_tune_get(sdp, gt_stall_secs) * HZ)) {
2067 fs_warn(sdp, "Unmount seems to be stalled. "
2068 "Dumping lock state...\n");
2069 gfs2_dump_lockstate(sdp);
2070 t = jiffies;
2071 }
2072
2073 /* invalidate_inodes() requires that the sb inodes list
2074 not change, but an async completion callback for an
2075 unlock can occur which does glock_put() which
2076 can call iput() which will change the sb inodes list.
2077 invalidate_inodes_mutex prevents glock_put()'s during
2078 an invalidate_inodes() */
2079
2080 mutex_lock(&sdp->sd_invalidate_inodes_mutex);
2081 invalidate_inodes(sdp->sd_vfs);
2082 mutex_unlock(&sdp->sd_invalidate_inodes_mutex);
2083 msleep(10);
2084 }
2085}
2086
2087/*
2088 * Diagnostic routines to help debug distributed deadlock
2089 */
2090
2091/**
2092 * dump_holder - print information about a glock holder
2093 * @str: a string naming the type of holder
2094 * @gh: the glock holder
2095 *
2096 * Returns: 0 on success, -ENOBUFS when we run out of space
2097 */
2098
2099static int dump_holder(char *str, struct gfs2_holder *gh)
2100{
2101 unsigned int x;
2102 int error = -ENOBUFS;
2103
2104 printk(KERN_INFO " %s\n", str);
2105 printk(KERN_INFO " owner = %ld\n",
2106 (gh->gh_owner) ? (long)gh->gh_owner->pid : -1);
2107 printk(KERN_INFO " gh_state = %u\n", gh->gh_state);
2108 printk(KERN_INFO " gh_flags =");
2109 for (x = 0; x < 32; x++)
2110 if (gh->gh_flags & (1 << x))
2111 printk(" %u", x);
2112 printk(" \n");
2113 printk(KERN_INFO " error = %d\n", gh->gh_error);
2114 printk(KERN_INFO " gh_iflags =");
2115 for (x = 0; x < 32; x++)
2116 if (test_bit(x, &gh->gh_iflags))
2117 printk(" %u", x);
2118 printk(" \n");
2119 print_symbol(KERN_INFO " initialized at: %s\n", gh->gh_ip);
2120
2121 error = 0;
2122
2123 return error;
2124}
2125
2126/**
2127 * dump_inode - print information about an inode
2128 * @ip: the inode
2129 *
2130 * Returns: 0 on success, -ENOBUFS when we run out of space
2131 */
2132
2133static int dump_inode(struct gfs2_inode *ip)
2134{
2135 unsigned int x;
2136 int error = -ENOBUFS;
2137
2138 printk(KERN_INFO " Inode:\n");
2139 printk(KERN_INFO " num = %llu %llu\n",
2140 (unsigned long long)ip->i_num.no_formal_ino,
2141 (unsigned long long)ip->i_num.no_addr);
2142 printk(KERN_INFO " type = %u\n", IF2DT(ip->i_di.di_mode));
2143 printk(KERN_INFO " i_flags =");
2144 for (x = 0; x < 32; x++)
2145 if (test_bit(x, &ip->i_flags))
2146 printk(" %u", x);
2147 printk(" \n");
2148
2149 error = 0;
2150
2151 return error;
2152}
2153
2154/**
2155 * dump_glock - print information about a glock
2156 * @gl: the glock
2157 *
2158 *
2159 * Returns: 0 on success, -ENOBUFS when we run out of space
2160 */
2161
2162static int dump_glock(struct gfs2_glock *gl)
2163{
2164 struct gfs2_holder *gh;
2165 unsigned int x;
2166 int error = -ENOBUFS;
2167
2168 spin_lock(&gl->gl_spin);
2169
2170 printk(KERN_INFO "Glock (%u, %llu)\n", gl->gl_name.ln_type,
2171 (unsigned long long)gl->gl_name.ln_number);
2172 printk(KERN_INFO " gl_flags =");
2173 for (x = 0; x < 32; x++)
2174 if (test_bit(x, &gl->gl_flags))
2175 printk(" %u", x);
2176 printk(" \n");
2177 printk(KERN_INFO " gl_ref = %d\n", atomic_read(&gl->gl_ref.refcount));
2178 printk(KERN_INFO " gl_state = %u\n", gl->gl_state);
2179 printk(KERN_INFO " gl_owner = %s\n", gl->gl_owner->comm);
2180 print_symbol(KERN_INFO " gl_ip = %s\n", gl->gl_ip);
2181 printk(KERN_INFO " req_gh = %s\n", (gl->gl_req_gh) ? "yes" : "no");
2182 printk(KERN_INFO " req_bh = %s\n", (gl->gl_req_bh) ? "yes" : "no");
2183 printk(KERN_INFO " lvb_count = %d\n", atomic_read(&gl->gl_lvb_count));
2184 printk(KERN_INFO " object = %s\n", (gl->gl_object) ? "yes" : "no");
2185 printk(KERN_INFO " le = %s\n",
2186 (list_empty(&gl->gl_le.le_list)) ? "no" : "yes");
2187 printk(KERN_INFO " reclaim = %s\n",
2188 (list_empty(&gl->gl_reclaim)) ? "no" : "yes");
2189 if (gl->gl_aspace)
2190 printk(KERN_INFO " aspace = %lu\n",
2191 gl->gl_aspace->i_mapping->nrpages);
2192 else
2193 printk(KERN_INFO " aspace = no\n");
2194 printk(KERN_INFO " ail = %d\n", atomic_read(&gl->gl_ail_count));
2195 if (gl->gl_req_gh) {
2196 error = dump_holder("Request", gl->gl_req_gh);
2197 if (error)
2198 goto out;
2199 }
2200 list_for_each_entry(gh, &gl->gl_holders, gh_list) {
2201 error = dump_holder("Holder", gh);
2202 if (error)
2203 goto out;
2204 }
2205 list_for_each_entry(gh, &gl->gl_waiters1, gh_list) {
2206 error = dump_holder("Waiter1", gh);
2207 if (error)
2208 goto out;
2209 }
2210 list_for_each_entry(gh, &gl->gl_waiters2, gh_list) {
2211 error = dump_holder("Waiter2", gh);
2212 if (error)
2213 goto out;
2214 }
2215 list_for_each_entry(gh, &gl->gl_waiters3, gh_list) {
2216 error = dump_holder("Waiter3", gh);
2217 if (error)
2218 goto out;
2219 }
2220 if (gl->gl_ops == &gfs2_inode_glops && gl->gl_object) {
2221 if (!test_bit(GLF_LOCK, &gl->gl_flags) &&
2222 list_empty(&gl->gl_holders)) {
2223 error = dump_inode(gl->gl_object);
2224 if (error)
2225 goto out;
2226 } else {
2227 error = -ENOBUFS;
2228 printk(KERN_INFO " Inode: busy\n");
2229 }
2230 }
2231
2232 error = 0;
2233
2234 out:
2235 spin_unlock(&gl->gl_spin);
2236
2237 return error;
2238}
2239
2240/**
2241 * gfs2_dump_lockstate - print out the current lockstate
2242 * @sdp: the filesystem
2243 *
2244 * The lock state is dumped to the console.
2245 *
2246 *
2247 */
2248
2249static int gfs2_dump_lockstate(struct gfs2_sbd *sdp)
2250{
2251 struct gfs2_gl_hash_bucket *bucket;
2252 struct gfs2_glock *gl;
2253 unsigned int x;
2254 int error = 0;
2255
2256 for (x = 0; x < GFS2_GL_HASH_SIZE; x++) {
2257 bucket = &sdp->sd_gl_hash[x];
2258
2259 read_lock(&bucket->hb_lock);
2260
2261 list_for_each_entry(gl, &bucket->hb_list, gl_list) {
2262 if (test_bit(GLF_PLUG, &gl->gl_flags))
2263 continue;
2264
2265 error = dump_glock(gl);
2266 if (error)
2267 break;
2268 }
2269
2270 read_unlock(&bucket->hb_lock);
2271
2272 if (error)
2273 break;
2274 }
2275
2276
2277 return error;
2278}
2279
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
new file mode 100644
index 000000000000..fdf58db44ae3
--- /dev/null
+++ b/fs/gfs2/glock.h
@@ -0,0 +1,152 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __GLOCK_DOT_H__
11#define __GLOCK_DOT_H__
12
13/* Flags for lock requests; used in gfs2_holder gh_flags field.
14 From lm_interface.h:
15#define LM_FLAG_TRY 0x00000001
16#define LM_FLAG_TRY_1CB 0x00000002
17#define LM_FLAG_NOEXP 0x00000004
18#define LM_FLAG_ANY 0x00000008
19#define LM_FLAG_PRIORITY 0x00000010 */
20
21#define GL_LOCAL_EXCL 0x00000020
22#define GL_ASYNC 0x00000040
23#define GL_EXACT 0x00000080
24#define GL_SKIP 0x00000100
25#define GL_ATIME 0x00000200
26#define GL_NOCACHE 0x00000400
27#define GL_SYNC 0x00000800
28#define GL_NOCANCEL 0x00001000
29#define GL_AOP 0x00004000
30#define GL_DUMP 0x00008000
31
32#define GLR_TRYFAILED 13
33#define GLR_CANCELED 14
34
35static inline int gfs2_glock_is_locked_by_me(struct gfs2_glock *gl)
36{
37 struct gfs2_holder *gh;
38 int locked = 0;
39
40 /* Look in glock's list of holders for one with current task as owner */
41 spin_lock(&gl->gl_spin);
42 list_for_each_entry(gh, &gl->gl_holders, gh_list) {
43 if (gh->gh_owner == current) {
44 locked = 1;
45 break;
46 }
47 }
48 spin_unlock(&gl->gl_spin);
49
50 return locked;
51}
52
53static inline int gfs2_glock_is_held_excl(struct gfs2_glock *gl)
54{
55 return (gl->gl_state == LM_ST_EXCLUSIVE);
56}
57
58static inline int gfs2_glock_is_held_dfrd(struct gfs2_glock *gl)
59{
60 return (gl->gl_state == LM_ST_DEFERRED);
61}
62
63static inline int gfs2_glock_is_held_shrd(struct gfs2_glock *gl)
64{
65 return (gl->gl_state == LM_ST_SHARED);
66}
67
68static inline int gfs2_glock_is_blocking(struct gfs2_glock *gl)
69{
70 int ret;
71 spin_lock(&gl->gl_spin);
72 ret = !list_empty(&gl->gl_waiters2) || !list_empty(&gl->gl_waiters3);
73 spin_unlock(&gl->gl_spin);
74 return ret;
75}
76
77int gfs2_glock_get(struct gfs2_sbd *sdp,
78 uint64_t number, struct gfs2_glock_operations *glops,
79 int create, struct gfs2_glock **glp);
80void gfs2_glock_hold(struct gfs2_glock *gl);
81int gfs2_glock_put(struct gfs2_glock *gl);
82void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags,
83 struct gfs2_holder *gh);
84void gfs2_holder_reinit(unsigned int state, unsigned flags,
85 struct gfs2_holder *gh);
86void gfs2_holder_uninit(struct gfs2_holder *gh);
87
88void gfs2_glock_xmote_th(struct gfs2_glock *gl, unsigned int state, int flags);
89void gfs2_glock_drop_th(struct gfs2_glock *gl);
90
91int gfs2_glock_nq(struct gfs2_holder *gh);
92int gfs2_glock_poll(struct gfs2_holder *gh);
93int gfs2_glock_wait(struct gfs2_holder *gh);
94void gfs2_glock_dq(struct gfs2_holder *gh);
95
96int gfs2_glock_be_greedy(struct gfs2_glock *gl, unsigned int time);
97
98void gfs2_glock_dq_uninit(struct gfs2_holder *gh);
99int gfs2_glock_nq_num(struct gfs2_sbd *sdp,
100 uint64_t number, struct gfs2_glock_operations *glops,
101 unsigned int state, int flags, struct gfs2_holder *gh);
102
103int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs);
104void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs);
105void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs);
106
107void gfs2_glock_prefetch_num(struct gfs2_sbd *sdp, uint64_t number,
108 struct gfs2_glock_operations *glops,
109 unsigned int state, int flags);
110void gfs2_glock_inode_squish(struct inode *inode);
111
112/**
113 * gfs2_glock_nq_init - initialize a holder and enqueue it on a glock
114 * @gl: the glock
115 * @state: the state we're requesting
116 * @flags: the modifier flags
117 * @gh: the holder structure
118 *
119 * Returns: 0, GLR_*, or errno
120 */
121
122static inline int gfs2_glock_nq_init(struct gfs2_glock *gl,
123 unsigned int state, int flags,
124 struct gfs2_holder *gh)
125{
126 int error;
127
128 gfs2_holder_init(gl, state, flags, gh);
129
130 error = gfs2_glock_nq(gh);
131 if (error)
132 gfs2_holder_uninit(gh);
133
134 return error;
135}
136
137/* Lock Value Block functions */
138
139int gfs2_lvb_hold(struct gfs2_glock *gl);
140void gfs2_lvb_unhold(struct gfs2_glock *gl);
141
142void gfs2_glock_cb(lm_fsdata_t *fsdata, unsigned int type, void *data);
143
144void gfs2_iopen_go_callback(struct gfs2_glock *gl, unsigned int state);
145
146void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl);
147void gfs2_reclaim_glock(struct gfs2_sbd *sdp);
148
149void gfs2_scand_internal(struct gfs2_sbd *sdp);
150void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait);
151
152#endif /* __GLOCK_DOT_H__ */
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
new file mode 100644
index 000000000000..a01874c58834
--- /dev/null
+++ b/fs/gfs2/glops.c
@@ -0,0 +1,491 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/gfs2_ondisk.h>
16
17#include "gfs2.h"
18#include "lm_interface.h"
19#include "incore.h"
20#include "bmap.h"
21#include "glock.h"
22#include "glops.h"
23#include "inode.h"
24#include "log.h"
25#include "meta_io.h"
26#include "page.h"
27#include "recovery.h"
28#include "rgrp.h"
29#include "util.h"
30
31/**
32 * meta_go_sync - sync out the metadata for this glock
33 * @gl: the glock
34 * @flags: DIO_*
35 *
36 * Called when demoting or unlocking an EX glock. We must flush
37 * to disk all dirty buffers/pages relating to this glock, and must not
38 * return to the caller to demote/unlock the glock until I/O is complete.
39 */
40
41static void meta_go_sync(struct gfs2_glock *gl, int flags)
42{
43 if (!(flags & DIO_METADATA))
44 return;
45
46 if (test_and_clear_bit(GLF_DIRTY, &gl->gl_flags)) {
47 gfs2_log_flush(gl->gl_sbd, gl);
48 gfs2_meta_sync(gl, flags | DIO_START | DIO_WAIT);
49 if (flags & DIO_RELEASE)
50 gfs2_ail_empty_gl(gl);
51 }
52
53 clear_bit(GLF_SYNC, &gl->gl_flags);
54}
55
56/**
57 * meta_go_inval - invalidate the metadata for this glock
58 * @gl: the glock
59 * @flags:
60 *
61 */
62
63static void meta_go_inval(struct gfs2_glock *gl, int flags)
64{
65 if (!(flags & DIO_METADATA))
66 return;
67
68 gfs2_meta_inval(gl);
69 gl->gl_vn++;
70}
71
72/**
73 * meta_go_demote_ok - Check to see if it's ok to unlock a glock
74 * @gl: the glock
75 *
76 * Returns: 1 if we have no cached data; ok to demote meta glock
77 */
78
79static int meta_go_demote_ok(struct gfs2_glock *gl)
80{
81 return !gl->gl_aspace->i_mapping->nrpages;
82}
83
84/**
85 * inode_go_xmote_th - promote/demote a glock
86 * @gl: the glock
87 * @state: the requested state
88 * @flags:
89 *
90 */
91
92static void inode_go_xmote_th(struct gfs2_glock *gl, unsigned int state,
93 int flags)
94{
95 if (gl->gl_state != LM_ST_UNLOCKED)
96 gfs2_pte_inval(gl);
97 gfs2_glock_xmote_th(gl, state, flags);
98}
99
100/**
101 * inode_go_xmote_bh - After promoting/demoting a glock
102 * @gl: the glock
103 *
104 */
105
106static void inode_go_xmote_bh(struct gfs2_glock *gl)
107{
108 struct gfs2_holder *gh = gl->gl_req_gh;
109 struct buffer_head *bh;
110 int error;
111
112 if (gl->gl_state != LM_ST_UNLOCKED &&
113 (!gh || !(gh->gh_flags & GL_SKIP))) {
114 error = gfs2_meta_read(gl, gl->gl_name.ln_number, DIO_START,
115 &bh);
116 if (!error)
117 brelse(bh);
118 }
119}
120
121/**
122 * inode_go_drop_th - unlock a glock
123 * @gl: the glock
124 *
125 * Invoked from rq_demote().
126 * Another node needs the lock in EXCLUSIVE mode, or the lock (unused for too
127 * long) is being purged from our node's glock cache; we're dropping the lock.
128 */
129
130static void inode_go_drop_th(struct gfs2_glock *gl)
131{
132 gfs2_pte_inval(gl);
133 gfs2_glock_drop_th(gl);
134}
135
136/**
137 * inode_go_sync - Sync the dirty data and/or metadata for an inode glock
138 * @gl: the glock protecting the inode
139 * @flags:
140 *
141 */
142
143static void inode_go_sync(struct gfs2_glock *gl, int flags)
144{
145 int meta = (flags & DIO_METADATA);
146 int data = (flags & DIO_DATA);
147
148 if (test_bit(GLF_DIRTY, &gl->gl_flags)) {
149 if (meta && data) {
150 gfs2_page_sync(gl, flags | DIO_START);
151 gfs2_log_flush(gl->gl_sbd, gl);
152 gfs2_meta_sync(gl, flags | DIO_START | DIO_WAIT);
153 gfs2_page_sync(gl, flags | DIO_WAIT);
154 clear_bit(GLF_DIRTY, &gl->gl_flags);
155 } else if (meta) {
156 gfs2_log_flush(gl->gl_sbd, gl);
157 gfs2_meta_sync(gl, flags | DIO_START | DIO_WAIT);
158 } else if (data)
159 gfs2_page_sync(gl, flags | DIO_START | DIO_WAIT);
160 if (flags & DIO_RELEASE)
161 gfs2_ail_empty_gl(gl);
162 }
163
164 clear_bit(GLF_SYNC, &gl->gl_flags);
165}
166
167/**
168 * inode_go_inval - prepare an inode glock to be released
169 * @gl: the glock
170 * @flags:
171 *
172 */
173
174static void inode_go_inval(struct gfs2_glock *gl, int flags)
175{
176 int meta = (flags & DIO_METADATA);
177 int data = (flags & DIO_DATA);
178
179 if (meta) {
180 gfs2_meta_inval(gl);
181 gl->gl_vn++;
182 }
183 if (data)
184 gfs2_page_inval(gl);
185}
186
187/**
188 * inode_go_demote_ok - Check to see if it's ok to unlock an inode glock
189 * @gl: the glock
190 *
191 * Returns: 1 if it's ok
192 */
193
194static int inode_go_demote_ok(struct gfs2_glock *gl)
195{
196 struct gfs2_sbd *sdp = gl->gl_sbd;
197 int demote = 0;
198
199 if (!gl->gl_object && !gl->gl_aspace->i_mapping->nrpages)
200 demote = 1;
201 else if (!sdp->sd_args.ar_localcaching &&
202 time_after_eq(jiffies, gl->gl_stamp +
203 gfs2_tune_get(sdp, gt_demote_secs) * HZ))
204 demote = 1;
205
206 return demote;
207}
208
209/**
210 * inode_go_lock - operation done after an inode lock is locked by a process
211 * @gh: the holder
212 *
213 *
214 * Returns: errno
215 */
216
217static int inode_go_lock(struct gfs2_holder *gh)
218{
219 struct gfs2_glock *gl = gh->gh_gl;
220 struct gfs2_inode *ip = gl->gl_object;
221 int error = 0;
222
223 if (!ip)
224 return 0;
225
226 if (ip->i_vn != gl->gl_vn) {
227 error = gfs2_inode_refresh(ip);
228 if (error)
229 return error;
230 gfs2_inode_attr_in(ip);
231 }
232
233 if ((ip->i_di.di_flags & GFS2_DIF_TRUNC_IN_PROG) &&
234 (gl->gl_state == LM_ST_EXCLUSIVE) &&
235 (gh->gh_flags & GL_LOCAL_EXCL))
236 error = gfs2_truncatei_resume(ip);
237
238 return error;
239}
240
241/**
242 * inode_go_unlock - operation done before an inode lock is unlocked by a
243 * process
244 * @gh: the holder
245 *
246 *
247 */
248
249static void inode_go_unlock(struct gfs2_holder *gh)
250{
251 struct gfs2_glock *gl = gh->gh_gl;
252 struct gfs2_inode *ip = gl->gl_object;
253
254 if (ip && test_bit(GLF_DIRTY, &gl->gl_flags))
255 gfs2_inode_attr_in(ip);
256
257 if (ip)
258 gfs2_meta_cache_flush(ip);
259}
260
261/**
262 * inode_greedy - adjust the inode's greedy hold time based on page-fault activity
263 * @gl: the glock
264 *
265 */
266
267static void inode_greedy(struct gfs2_glock *gl)
268{
269 struct gfs2_sbd *sdp = gl->gl_sbd;
270 struct gfs2_inode *ip = gl->gl_object;
271 unsigned int quantum = gfs2_tune_get(sdp, gt_greedy_quantum);
272 unsigned int max = gfs2_tune_get(sdp, gt_greedy_max);
273 unsigned int new_time;
274
275 spin_lock(&ip->i_spin);
276
277 if (time_after(ip->i_last_pfault + quantum, jiffies)) {
278 new_time = ip->i_greedy + quantum;
279 if (new_time > max)
280 new_time = max;
281 } else {
282 new_time = ip->i_greedy - quantum;
283 if (!new_time || new_time > max)
284 new_time = 1;
285 }
286
287 ip->i_greedy = new_time;
288
289 spin_unlock(&ip->i_spin);
290
291 iput(&ip->i_inode);
292}
293
294/**
295 * rgrp_go_demote_ok - Check to see if it's ok to unlock a RG's glock
296 * @gl: the glock
297 *
298 * Returns: 1 if it's ok
299 */
300
301static int rgrp_go_demote_ok(struct gfs2_glock *gl)
302{
303 return !gl->gl_aspace->i_mapping->nrpages;
304}
305
306/**
307 * rgrp_go_lock - operation done after an rgrp lock is locked by
308 * the first holder on this node.
309 * @gh: the holder
310 *
311 *
312 * Returns: errno
313 */
314
315static int rgrp_go_lock(struct gfs2_holder *gh)
316{
317 return gfs2_rgrp_bh_get(gh->gh_gl->gl_object);
318}
319
320/**
321 * rgrp_go_unlock - operation done before an rgrp lock is unlocked by
322 * the last holder on this node.
323 * @gh: the holder
324 *
325 *
326 */
327
328static void rgrp_go_unlock(struct gfs2_holder *gh)
329{
330 gfs2_rgrp_bh_put(gh->gh_gl->gl_object);
331}
332
333/**
334 * trans_go_xmote_th - promote/demote the transaction glock
335 * @gl: the glock
336 * @state: the requested state
337 * @flags:
338 *
339 */
340
341static void trans_go_xmote_th(struct gfs2_glock *gl, unsigned int state,
342 int flags)
343{
344 struct gfs2_sbd *sdp = gl->gl_sbd;
345
346 if (gl->gl_state != LM_ST_UNLOCKED &&
347 test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
348 gfs2_meta_syncfs(sdp);
349 gfs2_log_shutdown(sdp);
350 }
351
352 gfs2_glock_xmote_th(gl, state, flags);
353}
354
355/**
356 * trans_go_xmote_bh - After promoting/demoting the transaction glock
357 * @gl: the glock
358 *
359 */
360
361static void trans_go_xmote_bh(struct gfs2_glock *gl)
362{
363 struct gfs2_sbd *sdp = gl->gl_sbd;
364 struct gfs2_inode *ip = GFS2_I(sdp->sd_jdesc->jd_inode);
365 struct gfs2_glock *j_gl = ip->i_gl;
366 struct gfs2_log_header head;
367 int error;
368
369 if (gl->gl_state != LM_ST_UNLOCKED &&
370 test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
371 gfs2_meta_cache_flush(GFS2_I(sdp->sd_jdesc->jd_inode));
372 j_gl->gl_ops->go_inval(j_gl, DIO_METADATA | DIO_DATA);
373
374 error = gfs2_find_jhead(sdp->sd_jdesc, &head);
375 if (error)
376 gfs2_consist(sdp);
377 if (!(head.lh_flags & GFS2_LOG_HEAD_UNMOUNT))
378 gfs2_consist(sdp);
379
380 /* Initialize the log head */
381 if (!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) {
382 sdp->sd_log_sequence = head.lh_sequence + 1;
383 gfs2_log_pointers_init(sdp, head.lh_blkno);
384 }
385 }
386}
387
388/**
389 * trans_go_drop_th - unlock the transaction glock
390 * @gl: the glock
391 *
392 * We want to sync the device even with localcaching. Remember
393 * that localcaching journal replay only marks buffers dirty.
394 */
395
396static void trans_go_drop_th(struct gfs2_glock *gl)
397{
398 struct gfs2_sbd *sdp = gl->gl_sbd;
399
400 if (test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
401 gfs2_meta_syncfs(sdp);
402 gfs2_log_shutdown(sdp);
403 }
404
405 gfs2_glock_drop_th(gl);
406}
407
408/**
409 * quota_go_demote_ok - Check to see if it's ok to unlock a quota glock
410 * @gl: the glock
411 *
412 * Returns: 1 if it's ok
413 */
414
415static int quota_go_demote_ok(struct gfs2_glock *gl)
416{
417 return !atomic_read(&gl->gl_lvb_count);
418}
419
420struct gfs2_glock_operations gfs2_meta_glops = {
421 .go_xmote_th = gfs2_glock_xmote_th,
422 .go_drop_th = gfs2_glock_drop_th,
423 .go_sync = meta_go_sync,
424 .go_inval = meta_go_inval,
425 .go_demote_ok = meta_go_demote_ok,
426 .go_type = LM_TYPE_META
427};
428
429struct gfs2_glock_operations gfs2_inode_glops = {
430 .go_xmote_th = inode_go_xmote_th,
431 .go_xmote_bh = inode_go_xmote_bh,
432 .go_drop_th = inode_go_drop_th,
433 .go_sync = inode_go_sync,
434 .go_inval = inode_go_inval,
435 .go_demote_ok = inode_go_demote_ok,
436 .go_lock = inode_go_lock,
437 .go_unlock = inode_go_unlock,
438 .go_greedy = inode_greedy,
439 .go_type = LM_TYPE_INODE
440};
441
442struct gfs2_glock_operations gfs2_rgrp_glops = {
443 .go_xmote_th = gfs2_glock_xmote_th,
444 .go_drop_th = gfs2_glock_drop_th,
445 .go_sync = meta_go_sync,
446 .go_inval = meta_go_inval,
447 .go_demote_ok = rgrp_go_demote_ok,
448 .go_lock = rgrp_go_lock,
449 .go_unlock = rgrp_go_unlock,
450 .go_type = LM_TYPE_RGRP
451};
452
453struct gfs2_glock_operations gfs2_trans_glops = {
454 .go_xmote_th = trans_go_xmote_th,
455 .go_xmote_bh = trans_go_xmote_bh,
456 .go_drop_th = trans_go_drop_th,
457 .go_type = LM_TYPE_NONDISK
458};
459
460struct gfs2_glock_operations gfs2_iopen_glops = {
461 .go_xmote_th = gfs2_glock_xmote_th,
462 .go_drop_th = gfs2_glock_drop_th,
463 .go_callback = gfs2_iopen_go_callback,
464 .go_type = LM_TYPE_IOPEN
465};
466
467struct gfs2_glock_operations gfs2_flock_glops = {
468 .go_xmote_th = gfs2_glock_xmote_th,
469 .go_drop_th = gfs2_glock_drop_th,
470 .go_type = LM_TYPE_FLOCK
471};
472
473struct gfs2_glock_operations gfs2_nondisk_glops = {
474 .go_xmote_th = gfs2_glock_xmote_th,
475 .go_drop_th = gfs2_glock_drop_th,
476 .go_type = LM_TYPE_NONDISK
477};
478
479struct gfs2_glock_operations gfs2_quota_glops = {
480 .go_xmote_th = gfs2_glock_xmote_th,
481 .go_drop_th = gfs2_glock_drop_th,
482 .go_demote_ok = quota_go_demote_ok,
483 .go_type = LM_TYPE_QUOTA
484};
485
486struct gfs2_glock_operations gfs2_journal_glops = {
487 .go_xmote_th = gfs2_glock_xmote_th,
488 .go_drop_th = gfs2_glock_drop_th,
489 .go_type = LM_TYPE_JOURNAL
490};
491
diff --git a/fs/gfs2/glops.h b/fs/gfs2/glops.h
new file mode 100644
index 000000000000..5c1e9491024f
--- /dev/null
+++ b/fs/gfs2/glops.h
@@ -0,0 +1,23 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __GLOPS_DOT_H__
11#define __GLOPS_DOT_H__
12
13extern struct gfs2_glock_operations gfs2_meta_glops;
14extern struct gfs2_glock_operations gfs2_inode_glops;
15extern struct gfs2_glock_operations gfs2_rgrp_glops;
16extern struct gfs2_glock_operations gfs2_trans_glops;
17extern struct gfs2_glock_operations gfs2_iopen_glops;
18extern struct gfs2_glock_operations gfs2_flock_glops;
19extern struct gfs2_glock_operations gfs2_nondisk_glops;
20extern struct gfs2_glock_operations gfs2_quota_glops;
21extern struct gfs2_glock_operations gfs2_journal_glops;
22
23#endif /* __GLOPS_DOT_H__ */
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
new file mode 100644
index 000000000000..90e0624d8065
--- /dev/null
+++ b/fs/gfs2/incore.h
@@ -0,0 +1,658 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __INCORE_DOT_H__
11#define __INCORE_DOT_H__
12
13#define DIO_FORCE 0x00000001
14#define DIO_CLEAN 0x00000002
15#define DIO_DIRTY 0x00000004
16#define DIO_START 0x00000008
17#define DIO_WAIT 0x00000010
18#define DIO_METADATA 0x00000020
19#define DIO_DATA 0x00000040
20#define DIO_RELEASE 0x00000080
21#define DIO_ALL 0x00000100
22
23struct gfs2_log_operations;
24struct gfs2_log_element;
25struct gfs2_bitmap;
26struct gfs2_rgrpd;
27struct gfs2_bufdata;
28struct gfs2_glock_operations;
29struct gfs2_holder;
30struct gfs2_glock;
31struct gfs2_alloc;
32struct gfs2_inode;
33struct gfs2_file;
34struct gfs2_revoke;
35struct gfs2_revoke_replay;
36struct gfs2_quota_data;
37struct gfs2_log_buf;
38struct gfs2_trans;
39struct gfs2_ail;
40struct gfs2_jdesc;
41struct gfs2_args;
42struct gfs2_tune;
43struct gfs2_gl_hash_bucket;
44struct gfs2_sbd;
45
46typedef void (*gfs2_glop_bh_t) (struct gfs2_glock *gl, unsigned int ret);
47
48/*
49 * Structure of operations that are associated with each
50 * type of element in the log.
51 */
52
53struct gfs2_log_operations {
54 void (*lo_add) (struct gfs2_sbd *sdp, struct gfs2_log_element *le);
55 void (*lo_incore_commit) (struct gfs2_sbd *sdp, struct gfs2_trans *tr);
56 void (*lo_before_commit) (struct gfs2_sbd *sdp);
57 void (*lo_after_commit) (struct gfs2_sbd *sdp, struct gfs2_ail *ai);
58 void (*lo_before_scan) (struct gfs2_jdesc *jd,
59 struct gfs2_log_header *head, int pass);
60 int (*lo_scan_elements) (struct gfs2_jdesc *jd, unsigned int start,
61 struct gfs2_log_descriptor *ld, __be64 *ptr,
62 int pass);
63 void (*lo_after_scan) (struct gfs2_jdesc *jd, int error, int pass);
64 const char *lo_name;
65};
66
67struct gfs2_log_element {
68 struct list_head le_list;
69 const struct gfs2_log_operations *le_ops;
70};
71
72struct gfs2_bitmap {
73 struct buffer_head *bi_bh;
74 char *bi_clone;
75 uint32_t bi_offset;
76 uint32_t bi_start;
77 uint32_t bi_len;
78};
79
80struct gfs2_rgrpd {
81 struct list_head rd_list; /* Link with superblock */
82 struct list_head rd_list_mru;
83 struct list_head rd_recent; /* Recently used rgrps */
84 struct gfs2_glock *rd_gl; /* Glock for this rgrp */
85 struct gfs2_rindex rd_ri;
86 struct gfs2_rgrp rd_rg;
87 uint64_t rd_rg_vn;
88 struct gfs2_bitmap *rd_bits;
89 unsigned int rd_bh_count;
90 struct mutex rd_mutex;
91 uint32_t rd_free_clone;
92 struct gfs2_log_element rd_le;
93 uint32_t rd_last_alloc_data;
94 uint32_t rd_last_alloc_meta;
95 struct gfs2_sbd *rd_sbd;
96};
97
98enum gfs2_state_bits {
99 BH_Pinned = BH_PrivateStart,
100 BH_Escaped = BH_PrivateStart + 1,
101};
102
103BUFFER_FNS(Pinned, pinned)
104TAS_BUFFER_FNS(Pinned, pinned)
105BUFFER_FNS(Escaped, escaped)
106TAS_BUFFER_FNS(Escaped, escaped)
107
108struct gfs2_bufdata {
109 struct buffer_head *bd_bh;
110 struct gfs2_glock *bd_gl;
111
112 struct list_head bd_list_tr;
113 struct gfs2_log_element bd_le;
114
115 struct gfs2_ail *bd_ail;
116 struct list_head bd_ail_st_list;
117 struct list_head bd_ail_gl_list;
118};
119
120struct gfs2_glock_operations {
121 void (*go_xmote_th) (struct gfs2_glock * gl, unsigned int state,
122 int flags);
123 void (*go_xmote_bh) (struct gfs2_glock * gl);
124 void (*go_drop_th) (struct gfs2_glock * gl);
125 void (*go_drop_bh) (struct gfs2_glock * gl);
126 void (*go_sync) (struct gfs2_glock * gl, int flags);
127 void (*go_inval) (struct gfs2_glock * gl, int flags);
128 int (*go_demote_ok) (struct gfs2_glock * gl);
129 int (*go_lock) (struct gfs2_holder * gh);
130 void (*go_unlock) (struct gfs2_holder * gh);
131 void (*go_callback) (struct gfs2_glock * gl, unsigned int state);
132 void (*go_greedy) (struct gfs2_glock * gl);
133 int go_type;
134};
135
136enum {
137 /* Actions */
138 HIF_MUTEX = 0,
139 HIF_PROMOTE = 1,
140 HIF_DEMOTE = 2,
141 HIF_GREEDY = 3,
142
143 /* States */
144 HIF_ALLOCED = 4,
145 HIF_DEALLOC = 5,
146 HIF_HOLDER = 6,
147 HIF_FIRST = 7,
148 HIF_ABORTED = 9,
149};
150
151struct gfs2_holder {
152 struct list_head gh_list;
153
154 struct gfs2_glock *gh_gl;
155 struct task_struct *gh_owner;
156 unsigned int gh_state;
157 unsigned gh_flags;
158
159 int gh_error;
160 unsigned long gh_iflags;
161 struct completion gh_wait;
162 unsigned long gh_ip;
163};
164
165enum {
166 GLF_PLUG = 0,
167 GLF_LOCK = 1,
168 GLF_STICKY = 2,
169 GLF_PREFETCH = 3,
170 GLF_SYNC = 4,
171 GLF_DIRTY = 5,
172 GLF_SKIP_WAITERS2 = 6,
173 GLF_GREEDY = 7,
174};
175
176struct gfs2_glock {
177 struct list_head gl_list;
178 unsigned long gl_flags; /* GLF_... */
179 struct lm_lockname gl_name;
180 struct kref gl_ref;
181
182 spinlock_t gl_spin;
183
184 unsigned int gl_state;
185 struct task_struct *gl_owner;
186 unsigned long gl_ip;
187 struct list_head gl_holders;
188 struct list_head gl_waiters1; /* HIF_MUTEX */
189 struct list_head gl_waiters2; /* HIF_DEMOTE, HIF_GREEDY */
190 struct list_head gl_waiters3; /* HIF_PROMOTE */
191
192 struct gfs2_glock_operations *gl_ops;
193
194 struct gfs2_holder *gl_req_gh;
195 gfs2_glop_bh_t gl_req_bh;
196
197 lm_lock_t *gl_lock;
198 char *gl_lvb;
199 atomic_t gl_lvb_count;
200
201 uint64_t gl_vn;
202 unsigned long gl_stamp;
203 void *gl_object;
204
205 struct gfs2_gl_hash_bucket *gl_bucket;
206 struct list_head gl_reclaim;
207
208 struct gfs2_sbd *gl_sbd;
209
210 struct inode *gl_aspace;
211 struct gfs2_log_element gl_le;
212 struct list_head gl_ail_list;
213 atomic_t gl_ail_count;
214};
215
216struct gfs2_alloc {
217 /* Quota stuff */
218
219 struct gfs2_quota_data *al_qd[4];
220 struct gfs2_holder al_qd_ghs[4];
221 unsigned int al_qd_num;
222
223 u32 al_requested; /* Filled in by caller of gfs2_inplace_reserve() */
224 u32 al_alloced; /* Filled in by gfs2_alloc_*() */
225
226 /* Filled in by gfs2_inplace_reserve() */
227
228 unsigned int al_line;
229 char *al_file;
230 struct gfs2_holder al_ri_gh;
231 struct gfs2_holder al_rgd_gh;
232 struct gfs2_rgrpd *al_rgd;
233
234};
235
236enum {
237 GIF_QD_LOCKED = 1,
238 GIF_PAGED = 2,
239 GIF_SW_PAGED = 3,
240};
241
242struct gfs2_inode {
243 struct inode i_inode;
244 struct gfs2_inum i_num;
245
246 unsigned long i_flags; /* GIF_... */
247
248 uint64_t i_vn;
249 struct gfs2_dinode i_di; /* To be replaced by ref to block */
250
251 struct gfs2_glock *i_gl; /* Move into i_gh? */
252 struct gfs2_holder i_iopen_gh;
253 struct gfs2_holder i_gh; /* for prepare/commit_write only */
254 struct gfs2_alloc i_alloc;
255 uint64_t i_last_rg_alloc;
256
257 spinlock_t i_spin;
258 struct rw_semaphore i_rw_mutex;
259 unsigned int i_greedy;
260 unsigned long i_last_pfault;
261
262 struct buffer_head *i_cache[GFS2_MAX_META_HEIGHT];
263};
264
265/*
266 * Since i_inode is the first element of struct gfs2_inode,
267 * this is effectively a cast.
268 */
269static inline struct gfs2_inode *GFS2_I(struct inode *inode)
270{
271 return container_of(inode, struct gfs2_inode, i_inode);
272}
273
274/* To be removed? */
275static inline struct gfs2_sbd *GFS2_SB(struct inode *inode)
276{
277 return inode->i_sb->s_fs_info;
278}
279
280enum {
281 GFF_DID_DIRECT_ALLOC = 0,
282};
283
284struct gfs2_file {
285 unsigned long f_flags; /* GFF_... */
286 struct mutex f_fl_mutex;
287 struct gfs2_holder f_fl_gh;
288};
289
290struct gfs2_revoke {
291 struct gfs2_log_element rv_le;
292 uint64_t rv_blkno;
293};
294
295struct gfs2_revoke_replay {
296 struct list_head rr_list;
297 uint64_t rr_blkno;
298 unsigned int rr_where;
299};
300
301enum {
302 QDF_USER = 0,
303 QDF_CHANGE = 1,
304 QDF_LOCKED = 2,
305};
306
307struct gfs2_quota_lvb {
308 uint32_t qb_magic;
309 uint32_t __pad;
310 uint64_t qb_limit; /* Hard limit of # blocks to alloc */
311 uint64_t qb_warn; /* Warn user when alloc is above this # */
312 int64_t qb_value; /* Current # blocks allocated */
313};
314
315struct gfs2_quota_data {
316 struct list_head qd_list;
317 unsigned int qd_count;
318
319 uint32_t qd_id;
320 unsigned long qd_flags; /* QDF_... */
321
322 int64_t qd_change;
323 int64_t qd_change_sync;
324
325 unsigned int qd_slot;
326 unsigned int qd_slot_count;
327
328 struct buffer_head *qd_bh;
329 struct gfs2_quota_change *qd_bh_qc;
330 unsigned int qd_bh_count;
331
332 struct gfs2_glock *qd_gl;
333 struct gfs2_quota_lvb qd_qb;
334
335 uint64_t qd_sync_gen;
336 unsigned long qd_last_warn;
337 unsigned long qd_last_touched;
338};
339
340struct gfs2_log_buf {
341 struct list_head lb_list;
342 struct buffer_head *lb_bh;
343 struct buffer_head *lb_real;
344};
345
346struct gfs2_trans {
347 unsigned long tr_ip;
348
349 unsigned int tr_blocks;
350 unsigned int tr_revokes;
351 unsigned int tr_reserved;
352
353 struct gfs2_holder tr_t_gh;
354
355 int tr_touched;
356
357 unsigned int tr_num_buf;
358 unsigned int tr_num_buf_new;
359 unsigned int tr_num_buf_rm;
360 struct list_head tr_list_buf;
361
362 unsigned int tr_num_revoke;
363 unsigned int tr_num_revoke_rm;
364};
365
366struct gfs2_ail {
367 struct list_head ai_list;
368
369 unsigned int ai_first;
370 struct list_head ai_ail1_list;
371 struct list_head ai_ail2_list;
372
373 uint64_t ai_sync_gen;
374};
375
376struct gfs2_jdesc {
377 struct list_head jd_list;
378
379 struct inode *jd_inode;
380 unsigned int jd_jid;
381 int jd_dirty;
382
383 unsigned int jd_blocks;
384};
385
386#define GFS2_GLOCKD_DEFAULT 1
387#define GFS2_GLOCKD_MAX 16
388
389#define GFS2_QUOTA_DEFAULT GFS2_QUOTA_OFF
390#define GFS2_QUOTA_OFF 0
391#define GFS2_QUOTA_ACCOUNT 1
392#define GFS2_QUOTA_ON 2
393
394#define GFS2_DATA_DEFAULT GFS2_DATA_ORDERED
395#define GFS2_DATA_WRITEBACK 1
396#define GFS2_DATA_ORDERED 2
397
398struct gfs2_args {
399 char ar_lockproto[GFS2_LOCKNAME_LEN]; /* Name of the Lock Protocol */
400 char ar_locktable[GFS2_LOCKNAME_LEN]; /* Name of the Lock Table */
401 char ar_hostdata[GFS2_LOCKNAME_LEN]; /* Host specific data */
402 int ar_spectator; /* Don't get a journal because we're always RO */
403 int ar_ignore_local_fs; /* Don't optimize even if local_fs is 1 */
404 int ar_localflocks; /* Let the VFS do flock|fcntl locks for us */
405 int ar_localcaching; /* Local-style caching (dangerous on multihost) */
406 int ar_debug; /* Oops on errors instead of trying to be graceful */
407 int ar_upgrade; /* Upgrade ondisk/multihost format */
408 unsigned int ar_num_glockd; /* Number of glockd threads */
409 int ar_posix_acl; /* Enable posix acls */
410 int ar_quota; /* off/account/on */
411 int ar_suiddir; /* suiddir support */
412 int ar_data; /* ordered/writeback */
413};
414
415struct gfs2_tune {
416 spinlock_t gt_spin;
417
418 unsigned int gt_ilimit;
419 unsigned int gt_ilimit_tries;
420 unsigned int gt_ilimit_min;
421 unsigned int gt_demote_secs; /* Cache retention for unheld glock */
422 unsigned int gt_incore_log_blocks;
423 unsigned int gt_log_flush_secs;
424 unsigned int gt_jindex_refresh_secs; /* Check for new journal index */
425
426 unsigned int gt_scand_secs;
427 unsigned int gt_recoverd_secs;
428 unsigned int gt_logd_secs;
429 unsigned int gt_quotad_secs;
430
431 unsigned int gt_quota_simul_sync; /* Max quotavals to sync at once */
432 unsigned int gt_quota_warn_period; /* Secs between quota warn msgs */
433 unsigned int gt_quota_scale_num; /* Numerator */
434 unsigned int gt_quota_scale_den; /* Denominator */
435 unsigned int gt_quota_cache_secs;
436 unsigned int gt_quota_quantum; /* Secs between syncs to quota file */
437 unsigned int gt_atime_quantum; /* Min secs between atime updates */
438 unsigned int gt_new_files_jdata;
439 unsigned int gt_new_files_directio;
440 unsigned int gt_max_atomic_write; /* Split big writes into this size */
441 unsigned int gt_max_readahead; /* Max bytes to read-ahead from disk */
442 unsigned int gt_lockdump_size;
443 unsigned int gt_stall_secs; /* Detects trouble! */
444 unsigned int gt_complain_secs;
445 unsigned int gt_reclaim_limit; /* Max num of glocks in reclaim list */
446 unsigned int gt_entries_per_readdir;
447 unsigned int gt_prefetch_secs; /* Usage window for prefetched glocks */
448 unsigned int gt_greedy_default;
449 unsigned int gt_greedy_quantum;
450 unsigned int gt_greedy_max;
451 unsigned int gt_statfs_quantum;
452 unsigned int gt_statfs_slow;
453};
454
455struct gfs2_gl_hash_bucket {
456 rwlock_t hb_lock;
457 struct list_head hb_list;
458};
459
460enum {
461 SDF_JOURNAL_CHECKED = 0,
462 SDF_JOURNAL_LIVE = 1,
463 SDF_SHUTDOWN = 2,
464 SDF_NOATIME = 3,
465};
466
467#define GFS2_GL_HASH_SHIFT 13
468#define GFS2_GL_HASH_SIZE (1 << GFS2_GL_HASH_SHIFT)
469#define GFS2_GL_HASH_MASK (GFS2_GL_HASH_SIZE - 1)
470#define GFS2_FSNAME_LEN 256
471
472struct gfs2_sbd {
473 struct super_block *sd_vfs;
474 struct kobject sd_kobj;
475 unsigned long sd_flags; /* SDF_... */
476 struct gfs2_sb sd_sb;
477
478 /* Constants computed on mount */
479
480 uint32_t sd_fsb2bb;
481 uint32_t sd_fsb2bb_shift;
482 uint32_t sd_diptrs; /* Number of pointers in a dinode */
483	uint32_t sd_inptrs;	/* Number of pointers in an indirect block */
484 uint32_t sd_jbsize; /* Size of a journaled data block */
485 uint32_t sd_hash_bsize; /* sizeof(exhash block) */
486 uint32_t sd_hash_bsize_shift;
487 uint32_t sd_hash_ptrs; /* Number of pointers in a hash block */
488 uint32_t sd_qc_per_block;
489 uint32_t sd_max_dirres; /* Max blocks needed to add a directory entry */
490 uint32_t sd_max_height; /* Max height of a file's metadata tree */
491 uint64_t sd_heightsize[GFS2_MAX_META_HEIGHT];
492 uint32_t sd_max_jheight; /* Max height of journaled file's meta tree */
493 uint64_t sd_jheightsize[GFS2_MAX_META_HEIGHT];
494
495 struct gfs2_args sd_args; /* Mount arguments */
496 struct gfs2_tune sd_tune; /* Filesystem tuning structure */
497
498 /* Lock Stuff */
499
500 struct lm_lockstruct sd_lockstruct;
501 struct gfs2_gl_hash_bucket sd_gl_hash[GFS2_GL_HASH_SIZE];
502 struct list_head sd_reclaim_list;
503 spinlock_t sd_reclaim_lock;
504 wait_queue_head_t sd_reclaim_wq;
505 atomic_t sd_reclaim_count;
506 struct gfs2_holder sd_live_gh;
507 struct gfs2_glock *sd_rename_gl;
508 struct gfs2_glock *sd_trans_gl;
509 struct mutex sd_invalidate_inodes_mutex;
510
511 /* Inode Stuff */
512
513 struct inode *sd_master_dir;
514 struct inode *sd_jindex;
515 struct inode *sd_inum_inode;
516 struct inode *sd_statfs_inode;
517 struct inode *sd_ir_inode;
518 struct inode *sd_sc_inode;
519 struct inode *sd_qc_inode;
520 struct inode *sd_rindex;
521 struct inode *sd_quota_inode;
522
523 /* Inum stuff */
524
525 struct mutex sd_inum_mutex;
526
527 /* StatFS stuff */
528
529 spinlock_t sd_statfs_spin;
530 struct mutex sd_statfs_mutex;
531 struct gfs2_statfs_change sd_statfs_master;
532 struct gfs2_statfs_change sd_statfs_local;
533 unsigned long sd_statfs_sync_time;
534
535 /* Resource group stuff */
536
537 uint64_t sd_rindex_vn;
538 spinlock_t sd_rindex_spin;
539 struct mutex sd_rindex_mutex;
540 struct list_head sd_rindex_list;
541 struct list_head sd_rindex_mru_list;
542 struct list_head sd_rindex_recent_list;
543 struct gfs2_rgrpd *sd_rindex_forward;
544 unsigned int sd_rgrps;
545
546 /* Journal index stuff */
547
548 struct list_head sd_jindex_list;
549 spinlock_t sd_jindex_spin;
550 struct mutex sd_jindex_mutex;
551 unsigned int sd_journals;
552 unsigned long sd_jindex_refresh_time;
553
554 struct gfs2_jdesc *sd_jdesc;
555 struct gfs2_holder sd_journal_gh;
556 struct gfs2_holder sd_jinode_gh;
557
558 struct gfs2_holder sd_ir_gh;
559 struct gfs2_holder sd_sc_gh;
560 struct gfs2_holder sd_qc_gh;
561
562 /* Daemon stuff */
563
564 struct task_struct *sd_scand_process;
565 struct task_struct *sd_recoverd_process;
566 struct task_struct *sd_logd_process;
567 struct task_struct *sd_quotad_process;
568 struct task_struct *sd_glockd_process[GFS2_GLOCKD_MAX];
569 unsigned int sd_glockd_num;
570
571 /* Quota stuff */
572
573 struct list_head sd_quota_list;
574 atomic_t sd_quota_count;
575 spinlock_t sd_quota_spin;
576 struct mutex sd_quota_mutex;
577
578 unsigned int sd_quota_slots;
579 unsigned int sd_quota_chunks;
580 unsigned char **sd_quota_bitmap;
581
582 uint64_t sd_quota_sync_gen;
583 unsigned long sd_quota_sync_time;
584
585 /* Log stuff */
586
587 spinlock_t sd_log_lock;
588
589 unsigned int sd_log_blks_reserved;
590 unsigned int sd_log_commited_buf;
591 unsigned int sd_log_commited_revoke;
592
593 unsigned int sd_log_num_gl;
594 unsigned int sd_log_num_buf;
595 unsigned int sd_log_num_revoke;
596 unsigned int sd_log_num_rg;
597 unsigned int sd_log_num_databuf;
598 unsigned int sd_log_num_jdata;
599 unsigned int sd_log_num_hdrs;
600
601 struct list_head sd_log_le_gl;
602 struct list_head sd_log_le_buf;
603 struct list_head sd_log_le_revoke;
604 struct list_head sd_log_le_rg;
605 struct list_head sd_log_le_databuf;
606
607 unsigned int sd_log_blks_free;
608 struct mutex sd_log_reserve_mutex;
609
610 uint64_t sd_log_sequence;
611 unsigned int sd_log_head;
612 unsigned int sd_log_tail;
613 int sd_log_idle;
614
615 unsigned long sd_log_flush_time;
616 struct rw_semaphore sd_log_flush_lock;
617 struct list_head sd_log_flush_list;
618
619 unsigned int sd_log_flush_head;
620 uint64_t sd_log_flush_wrapped;
621
622 struct list_head sd_ail1_list;
623 struct list_head sd_ail2_list;
624 uint64_t sd_ail_sync_gen;
625
626 /* Replay stuff */
627
628 struct list_head sd_revoke_list;
629 unsigned int sd_replay_tail;
630
631 unsigned int sd_found_blocks;
632 unsigned int sd_found_revokes;
633 unsigned int sd_replayed_blocks;
634
635 /* For quiescing the filesystem */
636
637 struct gfs2_holder sd_freeze_gh;
638 struct mutex sd_freeze_lock;
639 unsigned int sd_freeze_count;
640
641 /* Counters */
642
643 atomic_t sd_glock_count;
644 atomic_t sd_glock_held_count;
645 atomic_t sd_inode_count;
646 atomic_t sd_reclaimed;
647
648 char sd_fsname[GFS2_FSNAME_LEN];
649 char sd_table_name[GFS2_FSNAME_LEN];
650 char sd_proto_name[GFS2_FSNAME_LEN];
651
652 /* Debugging crud */
653
654 unsigned long sd_last_warning;
655};
656
657#endif /* __INCORE_DOT_H__ */
658
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
new file mode 100644
index 000000000000..e76f345517b7
--- /dev/null
+++ b/fs/gfs2/inode.c
@@ -0,0 +1,1354 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/posix_acl.h>
16#include <linux/sort.h>
17#include <linux/gfs2_ondisk.h>
18#include <linux/crc32.h>
19
20#include "gfs2.h"
21#include "lm_interface.h"
22#include "incore.h"
23#include "acl.h"
24#include "bmap.h"
25#include "dir.h"
26#include "eattr.h"
27#include "glock.h"
28#include "glops.h"
29#include "inode.h"
30#include "log.h"
31#include "meta_io.h"
32#include "ops_address.h"
33#include "ops_file.h"
34#include "ops_inode.h"
35#include "quota.h"
36#include "rgrp.h"
37#include "trans.h"
38#include "util.h"
39
40/**
41 * gfs2_inode_attr_in - Copy attributes from the dinode into the VFS inode
42 * @ip: The GFS2 inode (with embedded disk inode data)
43 *
44 * The VFS inode embedded in @ip is updated in place.
45 */
46
47void gfs2_inode_attr_in(struct gfs2_inode *ip)
48{
49 struct inode *inode = &ip->i_inode;
50
51 inode->i_ino = ip->i_num.no_addr;
52
53 switch (ip->i_di.di_mode & S_IFMT) {
54 case S_IFBLK:
55 case S_IFCHR:
56 inode->i_rdev = MKDEV(ip->i_di.di_major, ip->i_di.di_minor);
57 break;
58 default:
59 inode->i_rdev = 0;
60 break;
61 };
62
63 inode->i_mode = ip->i_di.di_mode;
64 inode->i_nlink = ip->i_di.di_nlink;
65 inode->i_uid = ip->i_di.di_uid;
66 inode->i_gid = ip->i_di.di_gid;
67 i_size_write(inode, ip->i_di.di_size);
68 inode->i_atime.tv_sec = ip->i_di.di_atime;
69 inode->i_mtime.tv_sec = ip->i_di.di_mtime;
70 inode->i_ctime.tv_sec = ip->i_di.di_ctime;
71 inode->i_atime.tv_nsec = 0;
72 inode->i_mtime.tv_nsec = 0;
73 inode->i_ctime.tv_nsec = 0;
74 inode->i_blksize = PAGE_SIZE;
75 inode->i_blocks = ip->i_di.di_blocks <<
76 (GFS2_SB(inode)->sd_sb.sb_bsize_shift - GFS2_BASIC_BLOCK_SHIFT);
77
78 if (ip->i_di.di_flags & GFS2_DIF_IMMUTABLE)
79 inode->i_flags |= S_IMMUTABLE;
80 else
81 inode->i_flags &= ~S_IMMUTABLE;
82
83 if (ip->i_di.di_flags & GFS2_DIF_APPENDONLY)
84 inode->i_flags |= S_APPEND;
85 else
86 inode->i_flags &= ~S_APPEND;
87}
88
89/**
90 * gfs2_inode_attr_out - Copy attributes from VFS inode into the dinode
91 * @ip: The GFS2 inode
92 *
93 * Only copy out the attributes that we want the VFS layer
94 * to be able to modify.
95 */
96
97void gfs2_inode_attr_out(struct gfs2_inode *ip)
98{
99 struct inode *inode = &ip->i_inode;
100
101 gfs2_assert_withdraw(GFS2_SB(inode),
102 (ip->i_di.di_mode & S_IFMT) == (inode->i_mode & S_IFMT));
103 ip->i_di.di_mode = inode->i_mode;
104 ip->i_di.di_uid = inode->i_uid;
105 ip->i_di.di_gid = inode->i_gid;
106 ip->i_di.di_atime = inode->i_atime.tv_sec;
107 ip->i_di.di_mtime = inode->i_mtime.tv_sec;
108 ip->i_di.di_ctime = inode->i_ctime.tv_sec;
109}
110
111static int iget_test(struct inode *inode, void *opaque)
112{
113 struct gfs2_inode *ip = GFS2_I(inode);
114 struct gfs2_inum *inum = opaque;
115
116 if (ip && ip->i_num.no_addr == inum->no_addr)
117 return 1;
118
119 return 0;
120}
121
122static int iget_set(struct inode *inode, void *opaque)
123{
124 struct gfs2_inode *ip = GFS2_I(inode);
125 struct gfs2_inum *inum = opaque;
126
127 ip->i_num = *inum;
128 return 0;
129}
130
131struct inode *gfs2_ilookup(struct super_block *sb, struct gfs2_inum *inum)
132{
133 return ilookup5(sb, (unsigned long)inum->no_formal_ino,
134 iget_test, inum);
135}
136
137static struct inode *gfs2_iget(struct super_block *sb, struct gfs2_inum *inum)
138{
139 return iget5_locked(sb, (unsigned long)inum->no_formal_ino,
140 iget_test, iget_set, inum);
141}
142
143/**
144 * gfs2_inode_lookup - Lookup an inode
145 * @sb: The super block
146 * @inum: The inode number
147 * @type: The type of the inode
148 *
149 * Returns: A VFS inode, or an error
150 */
151
152struct inode *gfs2_inode_lookup(struct super_block *sb, struct gfs2_inum *inum, unsigned int type)
153{
154 struct inode *inode = gfs2_iget(sb, inum);
155 struct gfs2_inode *ip = GFS2_I(inode);
156 struct gfs2_glock *io_gl;
157 int error;
158
159 if (inode->i_state & I_NEW) {
160 struct gfs2_sbd *sdp = GFS2_SB(inode);
161 umode_t mode = DT2IF(type);
162 inode->u.generic_ip = ip;
163 inode->i_mode = mode;
164
165 if (S_ISREG(mode)) {
166 inode->i_op = &gfs2_file_iops;
167 inode->i_fop = &gfs2_file_fops;
168 inode->i_mapping->a_ops = &gfs2_file_aops;
169 } else if (S_ISDIR(mode)) {
170 inode->i_op = &gfs2_dir_iops;
171 inode->i_fop = &gfs2_dir_fops;
172 } else if (S_ISLNK(mode)) {
173 inode->i_op = &gfs2_symlink_iops;
174 } else {
175 inode->i_op = &gfs2_dev_iops;
176 }
177
178 error = gfs2_glock_get(sdp, inum->no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl);
179 if (unlikely(error))
180 goto fail;
181 ip->i_gl->gl_object = ip;
182
183 error = gfs2_glock_get(sdp, inum->no_addr, &gfs2_iopen_glops, CREATE, &io_gl);
184 if (unlikely(error))
185 goto fail_put;
186
187 ip->i_vn = ip->i_gl->gl_vn - 1;
188 error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh);
189 if (unlikely(error))
190 goto fail_iopen;
191
192 gfs2_glock_put(io_gl);
193 unlock_new_inode(inode);
194 }
195
196 return inode;
197fail_iopen:
198 gfs2_glock_put(io_gl);
199fail_put:
200 ip->i_gl->gl_object = NULL;
201 gfs2_glock_put(ip->i_gl);
202fail:
203 iput(inode);
204 return ERR_PTR(error);
205}
206
207/**
208 * gfs2_inode_refresh - Refresh the incore copy of the dinode
209 * @ip: The GFS2 inode
210 *
211 * Returns: errno
212 */
213
214int gfs2_inode_refresh(struct gfs2_inode *ip)
215{
216 struct buffer_head *dibh;
217 int error;
218
219 error = gfs2_meta_inode_buffer(ip, &dibh);
220 if (error)
221 return error;
222
223 if (gfs2_metatype_check(GFS2_SB(&ip->i_inode), dibh, GFS2_METATYPE_DI)) {
224 brelse(dibh);
225 return -EIO;
226 }
227
228 gfs2_dinode_in(&ip->i_di, dibh->b_data);
229
230 brelse(dibh);
231
232 if (ip->i_num.no_addr != ip->i_di.di_num.no_addr) {
233 if (gfs2_consist_inode(ip))
234 gfs2_dinode_print(&ip->i_di);
235 return -EIO;
236 }
237 if (ip->i_num.no_formal_ino != ip->i_di.di_num.no_formal_ino)
238 return -ESTALE;
239
240 ip->i_vn = ip->i_gl->gl_vn;
241
242 return 0;
243}
244
245int gfs2_dinode_dealloc(struct gfs2_inode *ip)
246{
247 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
248 struct gfs2_alloc *al;
249 struct gfs2_rgrpd *rgd;
250 int error;
251
252 if (ip->i_di.di_blocks != 1) {
253 if (gfs2_consist_inode(ip))
254 gfs2_dinode_print(&ip->i_di);
255 return -EIO;
256 }
257
258 al = gfs2_alloc_get(ip);
259
260 error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
261 if (error)
262 goto out;
263
264 error = gfs2_rindex_hold(sdp, &al->al_ri_gh);
265 if (error)
266 goto out_qs;
267
268 rgd = gfs2_blk2rgrpd(sdp, ip->i_num.no_addr);
269 if (!rgd) {
270 gfs2_consist_inode(ip);
271 error = -EIO;
272 goto out_rindex_relse;
273 }
274
275 error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0,
276 &al->al_rgd_gh);
277 if (error)
278 goto out_rindex_relse;
279
280 error = gfs2_trans_begin(sdp, RES_RG_BIT +
281 RES_STATFS + RES_QUOTA, 1);
282 if (error)
283 goto out_rg_gunlock;
284
285 gfs2_trans_add_gl(ip->i_gl);
286
287 gfs2_free_di(rgd, ip);
288
289 gfs2_trans_end(sdp);
290 clear_bit(GLF_STICKY, &ip->i_gl->gl_flags);
291
292out_rg_gunlock:
293 gfs2_glock_dq_uninit(&al->al_rgd_gh);
294out_rindex_relse:
295 gfs2_glock_dq_uninit(&al->al_ri_gh);
296out_qs:
297 gfs2_quota_unhold(ip);
298out:
299 gfs2_alloc_put(ip);
300 return error;
301}
302
303/**
304 * gfs2_change_nlink - Change nlink count on inode
305 * @ip: The GFS2 inode
306 * @diff: The change in the nlink count required
307 *
308 * Returns: errno
309 */
310
311int gfs2_change_nlink(struct gfs2_inode *ip, int diff)
312{
313 struct gfs2_sbd *sdp = ip->i_inode.i_sb->s_fs_info;
314 struct buffer_head *dibh;
315 uint32_t nlink;
316 int error;
317
318 BUG_ON(ip->i_di.di_nlink != ip->i_inode.i_nlink);
319 nlink = ip->i_di.di_nlink + diff;
320
321 /* If we are reducing the nlink count, but the new value ends up being
322 bigger than the old one, we must have underflowed. */
323 if (diff < 0 && nlink > ip->i_di.di_nlink) {
324 if (gfs2_consist_inode(ip))
325 gfs2_dinode_print(&ip->i_di);
326 return -EIO;
327 }
328
329 error = gfs2_meta_inode_buffer(ip, &dibh);
330 if (error)
331 return error;
332
333 ip->i_di.di_nlink = nlink;
334 ip->i_di.di_ctime = get_seconds();
335 ip->i_inode.i_nlink = nlink;
336
337 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
338 gfs2_dinode_out(&ip->i_di, dibh->b_data);
339 brelse(dibh);
340 mark_inode_dirty(&ip->i_inode);
341
342 if (ip->i_di.di_nlink == 0) {
343 struct gfs2_rgrpd *rgd;
344 struct gfs2_holder ri_gh, rg_gh;
345
346 error = gfs2_rindex_hold(sdp, &ri_gh);
347 if (error)
348 goto out;
349 error = -EIO;
350 rgd = gfs2_blk2rgrpd(sdp, ip->i_num.no_addr);
351 if (!rgd)
352 goto out_norgrp;
353 error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &rg_gh);
354 if (error)
355 goto out_norgrp;
356
357 gfs2_unlink_di(&ip->i_inode); /* mark inode unlinked */
358 gfs2_glock_dq_uninit(&rg_gh);
359out_norgrp:
360 gfs2_glock_dq_uninit(&ri_gh);
361 }
362out:
363 return error;
364}
365
366struct inode *gfs2_lookup_simple(struct inode *dip, const char *name)
367{
368 struct qstr qstr;
369 gfs2_str2qstr(&qstr, name);
370 return gfs2_lookupi(dip, &qstr, 1, NULL);
371}
372
373
374/**
375 * gfs2_lookupi - Look up a filename in a directory and return its inode
376 * @dir: The directory to search in
377 * @name: The name of the inode to look for
378 * @is_root: If 1, ignore the caller's permissions
379 * @nd: The nameidata from the VFS lookup; may be NULL
380 *
381 * There will always be a vnode (Linux VFS inode) for the directory @dir
382 * unless @is_root is true.
383 *
384 * Returns: the inode on success, NULL if not found, or an ERR_PTR on failure
385 */
386
387struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name,
388 int is_root, struct nameidata *nd)
389
390{
391 struct super_block *sb = dir->i_sb;
392 struct gfs2_inode *dip = GFS2_I(dir);
393 struct gfs2_holder d_gh;
394 struct gfs2_inum inum;
395 unsigned int type;
396 int error = 0;
397 struct inode *inode = NULL;
398
399 if (!name->len || name->len > GFS2_FNAMESIZE)
400 return ERR_PTR(-ENAMETOOLONG);
401
402 if ((name->len == 1 && memcmp(name->name, ".", 1) == 0) ||
403 (name->len == 2 && memcmp(name->name, "..", 2) == 0 &&
404 dir == sb->s_root->d_inode)) {
405 igrab(dir);
406 return dir;
407 }
408
409 error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh);
410 if (error)
411 return ERR_PTR(error);
412
413 if (!is_root) {
414 error = permission(dir, MAY_EXEC, NULL);
415 if (error)
416 goto out;
417 }
418
419 error = gfs2_dir_search(dir, name, &inum, &type);
420 if (error)
421 goto out;
422
423 inode = gfs2_inode_lookup(sb, &inum, type);
424
425out:
426 gfs2_glock_dq_uninit(&d_gh);
427 if (error == -ENOENT)
428 return NULL;
429 return inode;
430}
431
432static int pick_formal_ino_1(struct gfs2_sbd *sdp, uint64_t *formal_ino)
433{
434 struct gfs2_inode *ip = GFS2_I(sdp->sd_ir_inode);
435 struct buffer_head *bh;
436 struct gfs2_inum_range ir;
437 int error;
438
439 error = gfs2_trans_begin(sdp, RES_DINODE, 0);
440 if (error)
441 return error;
442 mutex_lock(&sdp->sd_inum_mutex);
443
444 error = gfs2_meta_inode_buffer(ip, &bh);
445 if (error) {
446 mutex_unlock(&sdp->sd_inum_mutex);
447 gfs2_trans_end(sdp);
448 return error;
449 }
450
451 gfs2_inum_range_in(&ir, bh->b_data + sizeof(struct gfs2_dinode));
452
453 if (ir.ir_length) {
454 *formal_ino = ir.ir_start++;
455 ir.ir_length--;
456 gfs2_trans_add_bh(ip->i_gl, bh, 1);
457 gfs2_inum_range_out(&ir,
458 bh->b_data + sizeof(struct gfs2_dinode));
459 brelse(bh);
460 mutex_unlock(&sdp->sd_inum_mutex);
461 gfs2_trans_end(sdp);
462 return 0;
463 }
464
465 brelse(bh);
466
467 mutex_unlock(&sdp->sd_inum_mutex);
468 gfs2_trans_end(sdp);
469
470 return 1;
471}
472
473static int pick_formal_ino_2(struct gfs2_sbd *sdp, uint64_t *formal_ino)
474{
475 struct gfs2_inode *ip = GFS2_I(sdp->sd_ir_inode);
476 struct gfs2_inode *m_ip = GFS2_I(sdp->sd_inum_inode);
477 struct gfs2_holder gh;
478 struct buffer_head *bh;
479 struct gfs2_inum_range ir;
480 int error;
481
482 error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
483 if (error)
484 return error;
485
486 error = gfs2_trans_begin(sdp, 2 * RES_DINODE, 0);
487 if (error)
488 goto out;
489 mutex_lock(&sdp->sd_inum_mutex);
490
491 error = gfs2_meta_inode_buffer(ip, &bh);
492 if (error)
493 goto out_end_trans;
494
495 gfs2_inum_range_in(&ir, bh->b_data + sizeof(struct gfs2_dinode));
496
497 if (!ir.ir_length) {
498 struct buffer_head *m_bh;
499 uint64_t x, y;
500
501 error = gfs2_meta_inode_buffer(m_ip, &m_bh);
502 if (error)
503 goto out_brelse;
504
505 x = *(uint64_t *)(m_bh->b_data + sizeof(struct gfs2_dinode));
506 x = y = be64_to_cpu(x);
507 ir.ir_start = x;
508 ir.ir_length = GFS2_INUM_QUANTUM;
509 x += GFS2_INUM_QUANTUM;
510 if (x < y)
511 gfs2_consist_inode(m_ip);
512 x = cpu_to_be64(x);
513 gfs2_trans_add_bh(m_ip->i_gl, m_bh, 1);
514 *(uint64_t *)(m_bh->b_data + sizeof(struct gfs2_dinode)) = x;
515
516 brelse(m_bh);
517 }
518
519 *formal_ino = ir.ir_start++;
520 ir.ir_length--;
521
522 gfs2_trans_add_bh(ip->i_gl, bh, 1);
523 gfs2_inum_range_out(&ir, bh->b_data + sizeof(struct gfs2_dinode));
524
525 out_brelse:
526 brelse(bh);
527
528 out_end_trans:
529 mutex_unlock(&sdp->sd_inum_mutex);
530 gfs2_trans_end(sdp);
531
532 out:
533 gfs2_glock_dq_uninit(&gh);
534
535 return error;
536}
537
538static int pick_formal_ino(struct gfs2_sbd *sdp, uint64_t *inum)
539{
540 int error;
541
542 error = pick_formal_ino_1(sdp, inum);
543 if (error <= 0)
544 return error;
545
546 error = pick_formal_ino_2(sdp, inum);
547
548 return error;
549}
550
551/**
552 * create_ok - OK to create a new on-disk inode here?
553 * @dip: Directory in which dinode is to be created
554 * @name: Name of new dinode
555 * @mode:
556 *
557 * Returns: errno
558 */
559
560static int create_ok(struct gfs2_inode *dip, const struct qstr *name,
561 unsigned int mode)
562{
563 int error;
564
565 error = permission(&dip->i_inode, MAY_WRITE | MAY_EXEC, NULL);
566 if (error)
567 return error;
568
569 /* Don't create entries in an unlinked directory */
570 if (!dip->i_di.di_nlink)
571 return -EPERM;
572
573 error = gfs2_dir_search(&dip->i_inode, name, NULL, NULL);
574 switch (error) {
575 case -ENOENT:
576 error = 0;
577 break;
578 case 0:
579 return -EEXIST;
580 default:
581 return error;
582 }
583
584 if (dip->i_di.di_entries == (uint32_t)-1)
585 return -EFBIG;
586 if (S_ISDIR(mode) && dip->i_di.di_nlink == (uint32_t)-1)
587 return -EMLINK;
588
589 return 0;
590}
591
592static void munge_mode_uid_gid(struct gfs2_inode *dip, unsigned int *mode,
593 unsigned int *uid, unsigned int *gid)
594{
595 if (GFS2_SB(&dip->i_inode)->sd_args.ar_suiddir &&
596 (dip->i_di.di_mode & S_ISUID) &&
597 dip->i_di.di_uid) {
598 if (S_ISDIR(*mode))
599 *mode |= S_ISUID;
600 else if (dip->i_di.di_uid != current->fsuid)
601 *mode &= ~07111;
602 *uid = dip->i_di.di_uid;
603 } else
604 *uid = current->fsuid;
605
606 if (dip->i_di.di_mode & S_ISGID) {
607 if (S_ISDIR(*mode))
608 *mode |= S_ISGID;
609 *gid = dip->i_di.di_gid;
610 } else
611 *gid = current->fsgid;
612}
613
614static int alloc_dinode(struct gfs2_inode *dip, struct gfs2_inum *inum,
615 u64 *generation)
616{
617 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
618 int error;
619
620 gfs2_alloc_get(dip);
621
622 dip->i_alloc.al_requested = RES_DINODE;
623 error = gfs2_inplace_reserve(dip);
624 if (error)
625 goto out;
626
627 error = gfs2_trans_begin(sdp, RES_RG_BIT + RES_STATFS, 0);
628 if (error)
629 goto out_ipreserv;
630
631 inum->no_addr = gfs2_alloc_di(dip, generation);
632
633 gfs2_trans_end(sdp);
634
635out_ipreserv:
636 gfs2_inplace_release(dip);
637
638out:
639 gfs2_alloc_put(dip);
640
641 return error;
642}
643
644/**
645 * init_dinode - Fill in a new dinode structure
646 * @dip: the directory this inode is being created in
647 * @gl: The glock covering the new inode
648 * @inum: the inode number
649 * @mode: the file permissions
650 * @uid:
651 * @gid:
652 *
653 */
654
655static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
656 const struct gfs2_inum *inum, unsigned int mode,
657 unsigned int uid, unsigned int gid,
658 const u64 *generation)
659{
660 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
661 struct gfs2_dinode *di;
662 struct buffer_head *dibh;
663
664 dibh = gfs2_meta_new(gl, inum->no_addr);
665 gfs2_trans_add_bh(gl, dibh, 1);
666 gfs2_metatype_set(dibh, GFS2_METATYPE_DI, GFS2_FORMAT_DI);
667 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
668 di = (struct gfs2_dinode *)dibh->b_data;
669
670 di->di_num.no_formal_ino = cpu_to_be64(inum->no_formal_ino);
671 di->di_num.no_addr = cpu_to_be64(inum->no_addr);
672 di->di_mode = cpu_to_be32(mode);
673 di->di_uid = cpu_to_be32(uid);
674 di->di_gid = cpu_to_be32(gid);
675 di->di_nlink = cpu_to_be32(0);
676 di->di_size = cpu_to_be64(0);
677 di->di_blocks = cpu_to_be64(1);
678 di->di_atime = di->di_mtime = di->di_ctime = cpu_to_be64(get_seconds());
679 di->di_major = di->di_minor = cpu_to_be32(0);
680 di->di_goal_meta = di->di_goal_data = cpu_to_be64(inum->no_addr);
681 di->di_generation = cpu_to_be64(*generation);
682 di->di_flags = cpu_to_be32(0);
683
684 if (S_ISREG(mode)) {
685 if ((dip->i_di.di_flags & GFS2_DIF_INHERIT_JDATA) ||
686 gfs2_tune_get(sdp, gt_new_files_jdata))
687 di->di_flags |= cpu_to_be32(GFS2_DIF_JDATA);
688 if ((dip->i_di.di_flags & GFS2_DIF_INHERIT_DIRECTIO) ||
689 gfs2_tune_get(sdp, gt_new_files_directio))
690 di->di_flags |= cpu_to_be32(GFS2_DIF_DIRECTIO);
691 } else if (S_ISDIR(mode)) {
692 di->di_flags |= cpu_to_be32(dip->i_di.di_flags &
693 GFS2_DIF_INHERIT_DIRECTIO);
694 di->di_flags |= cpu_to_be32(dip->i_di.di_flags &
695 GFS2_DIF_INHERIT_JDATA);
696 }
697
698 di->__pad1 = 0;
699 di->di_payload_format = cpu_to_be32(0);
700 di->di_height = cpu_to_be32(0);
701 di->__pad2 = 0;
702 di->__pad3 = 0;
703 di->di_depth = cpu_to_be16(0);
704 di->di_entries = cpu_to_be32(0);
705 memset(&di->__pad4, 0, sizeof(di->__pad4));
706 di->di_eattr = cpu_to_be64(0);
707 memset(&di->di_reserved, 0, sizeof(di->di_reserved));
708
709 brelse(dibh);
710}
711
712static int make_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
713 unsigned int mode, const struct gfs2_inum *inum,
714 const u64 *generation)
715{
716 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
717 unsigned int uid, gid;
718 int error;
719
720 munge_mode_uid_gid(dip, &mode, &uid, &gid);
721 gfs2_alloc_get(dip);
722
723 error = gfs2_quota_lock(dip, uid, gid);
724 if (error)
725 goto out;
726
727 error = gfs2_quota_check(dip, uid, gid);
728 if (error)
729 goto out_quota;
730
731 error = gfs2_trans_begin(sdp, RES_DINODE + RES_QUOTA, 0);
732 if (error)
733 goto out_quota;
734
735 init_dinode(dip, gl, inum, mode, uid, gid, generation);
736 gfs2_quota_change(dip, +1, uid, gid);
737 gfs2_trans_end(sdp);
738
739out_quota:
740 gfs2_quota_unlock(dip);
741out:
742 gfs2_alloc_put(dip);
743 return error;
744}
745
746static int link_dinode(struct gfs2_inode *dip, const struct qstr *name,
747 struct gfs2_inode *ip)
748{
749 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
750 struct gfs2_alloc *al;
751 int alloc_required;
752 struct buffer_head *dibh;
753 int error;
754
755 al = gfs2_alloc_get(dip);
756
757 error = gfs2_quota_lock(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
758 if (error)
759 goto fail;
760
761 error = alloc_required = gfs2_diradd_alloc_required(&dip->i_inode, name);
762 if (alloc_required < 0)
763 goto fail;
764 if (alloc_required) {
765 error = gfs2_quota_check(dip, dip->i_di.di_uid,
766 dip->i_di.di_gid);
767 if (error)
768 goto fail_quota_locks;
769
770 al->al_requested = sdp->sd_max_dirres;
771
772 error = gfs2_inplace_reserve(dip);
773 if (error)
774 goto fail_quota_locks;
775
776 error = gfs2_trans_begin(sdp, sdp->sd_max_dirres +
777 al->al_rgd->rd_ri.ri_length +
778 2 * RES_DINODE +
779 RES_STATFS + RES_QUOTA, 0);
780 if (error)
781 goto fail_ipreserv;
782 } else {
783 error = gfs2_trans_begin(sdp, RES_LEAF + 2 * RES_DINODE, 0);
784 if (error)
785 goto fail_quota_locks;
786 }
787
788 error = gfs2_dir_add(&dip->i_inode, name, &ip->i_num, IF2DT(ip->i_di.di_mode));
789 if (error)
790 goto fail_end_trans;
791
792 error = gfs2_meta_inode_buffer(ip, &dibh);
793 if (error)
794 goto fail_end_trans;
795 ip->i_di.di_nlink = 1;
796 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
797 gfs2_dinode_out(&ip->i_di, dibh->b_data);
798 brelse(dibh);
799 return 0;
800
801fail_end_trans:
802 gfs2_trans_end(sdp);
803
804fail_ipreserv:
805 if (dip->i_alloc.al_rgd)
806 gfs2_inplace_release(dip);
807
808fail_quota_locks:
809 gfs2_quota_unlock(dip);
810
811fail:
812 gfs2_alloc_put(dip);
813 return error;
814}
815
816/**
817 * gfs2_createi - Create a new inode
818 * @ghs: An array of two holders
819 * @name: The name of the new file
820 * @mode: the permissions on the new inode
821 *
822 * @ghs[0] is an initialized holder for the directory
823 * @ghs[1] is the holder for the inode lock
824 *
825 * If the return value is not an error, the glocks on both the directory and the new
826 * file are held. A transaction has been started and an inplace reservation
827 * is held, as well.
828 *
829 * Returns: An inode
830 */
831
832struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
833 unsigned int mode)
834{
835 struct inode *inode;
836 struct gfs2_inode *dip = ghs->gh_gl->gl_object;
837 struct inode *dir = &dip->i_inode;
838 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
839 struct gfs2_inum inum;
840 int error;
841 u64 generation;
842
843 if (!name->len || name->len > GFS2_FNAMESIZE)
844 return ERR_PTR(-ENAMETOOLONG);
845
846 gfs2_holder_reinit(LM_ST_EXCLUSIVE, 0, ghs);
847 error = gfs2_glock_nq(ghs);
848 if (error)
849 goto fail;
850
851 error = create_ok(dip, name, mode);
852 if (error)
853 goto fail_gunlock;
854
855 error = pick_formal_ino(sdp, &inum.no_formal_ino);
856 if (error)
857 goto fail_gunlock;
858
859 error = alloc_dinode(dip, &inum, &generation);
860 if (error)
861 goto fail_gunlock;
862
863 if (inum.no_addr < dip->i_num.no_addr) {
864 gfs2_glock_dq(ghs);
865
866 error = gfs2_glock_nq_num(sdp, inum.no_addr,
867 &gfs2_inode_glops, LM_ST_EXCLUSIVE,
868 GL_SKIP, ghs + 1);
869 if (error) {
870 return ERR_PTR(error);
871 }
872
873 gfs2_holder_reinit(LM_ST_EXCLUSIVE, 0, ghs);
874 error = gfs2_glock_nq(ghs);
875 if (error) {
876 gfs2_glock_dq_uninit(ghs + 1);
877 return ERR_PTR(error);
878 }
879
880 error = create_ok(dip, name, mode);
881 if (error)
882 goto fail_gunlock2;
883 } else {
884 error = gfs2_glock_nq_num(sdp, inum.no_addr,
885 &gfs2_inode_glops, LM_ST_EXCLUSIVE,
886 GL_SKIP, ghs + 1);
887 if (error)
888 goto fail_gunlock;
889 }
890
891 error = make_dinode(dip, ghs[1].gh_gl, mode, &inum, &generation);
892 if (error)
893 goto fail_gunlock2;
894
895 inode = gfs2_inode_lookup(dir->i_sb, &inum, IF2DT(mode));
896 if (IS_ERR(inode))
897 goto fail_gunlock2;
898
899 error = gfs2_inode_refresh(GFS2_I(inode));
900 if (error)
901 goto fail_iput;
902
903 error = gfs2_acl_create(dip, GFS2_I(inode));
904 if (error)
905 goto fail_iput;
906
907 error = link_dinode(dip, name, GFS2_I(inode));
908 if (error)
909 goto fail_iput;
910
911 if (!inode)
912 return ERR_PTR(-ENOMEM);
913 return inode;
914
915fail_iput:
916 iput(inode);
917fail_gunlock2:
918 gfs2_glock_dq_uninit(ghs + 1);
919fail_gunlock:
920 gfs2_glock_dq(ghs);
921fail:
922 return ERR_PTR(error);
923}
924
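The calling convention described above for gfs2_createi() (two holders, with a transaction and an inplace reservation left open on success) means the caller has to unwind all of that itself. The fragment below is only a hedged sketch of such a caller, assembled from helpers visible elsewhere in this patch; "demo_create" is a hypothetical name, the retry handling a real caller needs for -EEXIST is omitted, and a real caller would hand the new inode to the VFS instead of just dropping its reference.

static int demo_create(struct gfs2_inode *dip, const struct qstr *name,
		       unsigned int mode)
{
	struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
	struct gfs2_holder ghs[2];	/* ghs[0]: directory, ghs[1]: new inode */
	struct inode *inode;

	gfs2_holder_init(dip->i_gl, 0, 0, ghs);

	inode = gfs2_createi(ghs, name, mode);
	if (IS_ERR(inode)) {
		gfs2_holder_uninit(ghs);
		return PTR_ERR(inode);
	}

	/* Success: a transaction and an inplace reservation are still open. */
	gfs2_trans_end(sdp);
	if (dip->i_alloc.al_rgd)
		gfs2_inplace_release(dip);
	gfs2_quota_unlock(dip);
	gfs2_alloc_put(dip);

	gfs2_glock_dq_uninit(ghs);	/* directory glock */
	gfs2_glock_dq_uninit(ghs + 1);	/* new inode glock */

	iput(inode);	/* a real caller keeps the inode (d_instantiate etc.) */
	return 0;
}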
925/**
926 * gfs2_rmdiri - Remove a directory
927 * @dip: The parent directory of the directory to be removed
928 * @name: The name of the directory to be removed
929 * @ip: The GFS2 inode of the directory to be removed
930 *
931 * Assumes Glocks on dip and ip are held
932 *
933 * Returns: errno
934 */
935
936int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name,
937 struct gfs2_inode *ip)
938{
939 struct qstr dotname;
940 int error;
941
942 if (ip->i_di.di_entries != 2) {
943 if (gfs2_consist_inode(ip))
944 gfs2_dinode_print(&ip->i_di);
945 return -EIO;
946 }
947
948 error = gfs2_dir_del(dip, name);
949 if (error)
950 return error;
951
952 error = gfs2_change_nlink(dip, -1);
953 if (error)
954 return error;
955
956 gfs2_str2qstr(&dotname, ".");
957 error = gfs2_dir_del(ip, &dotname);
958 if (error)
959 return error;
960
961 gfs2_str2qstr(&dotname, "..");
962 error = gfs2_dir_del(ip, &dotname);
963 if (error)
964 return error;
965
966 error = gfs2_change_nlink(ip, -2);
967 if (error)
968 return error;
969
970 return error;
971}
972
973/*
974 * gfs2_unlink_ok - check to see that an inode is still in a directory
975 * @dip: the directory
976 * @name: the name of the file
977 * @ip: the inode
978 *
979 * Assumes that the lock on (at least) @dip is held.
980 *
981 * Returns: 0 if the parent/child relationship is correct, errno if it isn't
982 */
983
984int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
985 struct gfs2_inode *ip)
986{
987 struct gfs2_inum inum;
988 unsigned int type;
989 int error;
990
991 if (IS_IMMUTABLE(&ip->i_inode) || IS_APPEND(&ip->i_inode))
992 return -EPERM;
993
994 if ((dip->i_di.di_mode & S_ISVTX) &&
995 dip->i_di.di_uid != current->fsuid &&
996 ip->i_di.di_uid != current->fsuid && !capable(CAP_FOWNER))
997 return -EPERM;
998
999 if (IS_APPEND(&dip->i_inode))
1000 return -EPERM;
1001
1002 error = permission(&dip->i_inode, MAY_WRITE | MAY_EXEC, NULL);
1003 if (error)
1004 return error;
1005
1006 error = gfs2_dir_search(&dip->i_inode, name, &inum, &type);
1007 if (error)
1008 return error;
1009
1010 if (!gfs2_inum_equal(&inum, &ip->i_num))
1011 return -ENOENT;
1012
1013 if (IF2DT(ip->i_di.di_mode) != type) {
1014 gfs2_consist_inode(dip);
1015 return -EIO;
1016 }
1017
1018 return 0;
1019}
1020
1021/*
1022 * gfs2_ok_to_move - check if it's ok to move a directory to another directory
1023 * @this: move this
1024 * @to: to here
1025 *
1026 * Follow @to back to the root and make sure we don't encounter @this
1027 * Assumes we already hold the rename lock.
1028 *
1029 * Returns: errno
1030 */
1031
1032int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to)
1033{
1034 struct inode *dir = &to->i_inode;
1035 struct super_block *sb = dir->i_sb;
1036 struct inode *tmp;
1037 struct qstr dotdot;
1038 int error = 0;
1039
1040 gfs2_str2qstr(&dotdot, "..");
1041
1042 igrab(dir);
1043
1044 for (;;) {
1045 if (dir == &this->i_inode) {
1046 error = -EINVAL;
1047 break;
1048 }
1049 if (dir == sb->s_root->d_inode) {
1050 error = 0;
1051 break;
1052 }
1053
1054 tmp = gfs2_lookupi(dir, &dotdot, 1, NULL);
1055 if (IS_ERR(tmp)) {
1056 error = PTR_ERR(tmp);
1057 break;
1058 }
1059
1060 iput(dir);
1061 dir = tmp;
1062 }
1063
1064 iput(dir);
1065
1066 return error;
1067}
1068
1069/**
1070 * gfs2_readlinki - return the contents of a symlink
1071 * @ip: the symlink's inode
1072 * @buf: a pointer to the buffer to be filled
1073 * @len: a pointer to the length of @buf
1074 *
1075 * If @buf is too small, a piece of memory is kmalloc()ed and needs
1076 * to be freed by the caller.
1077 *
1078 * Returns: errno
1079 */
1080
1081int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len)
1082{
1083 struct gfs2_holder i_gh;
1084 struct buffer_head *dibh;
1085 unsigned int x;
1086 int error;
1087
1088 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &i_gh);
1089 error = gfs2_glock_nq_atime(&i_gh);
1090 if (error) {
1091 gfs2_holder_uninit(&i_gh);
1092 return error;
1093 }
1094
1095 if (!ip->i_di.di_size) {
1096 gfs2_consist_inode(ip);
1097 error = -EIO;
1098 goto out;
1099 }
1100
1101 error = gfs2_meta_inode_buffer(ip, &dibh);
1102 if (error)
1103 goto out;
1104
1105 x = ip->i_di.di_size + 1;
1106 if (x > *len) {
1107 *buf = kmalloc(x, GFP_KERNEL);
1108 if (!*buf) {
1109 error = -ENOMEM;
1110 goto out_brelse;
1111 }
1112 }
1113
1114 memcpy(*buf, dibh->b_data + sizeof(struct gfs2_dinode), x);
1115 *len = x;
1116
1117out_brelse:
1118 brelse(dibh);
1119out:
1120 gfs2_glock_dq_uninit(&i_gh);
1121 return error;
1122}
1123
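The kmalloc-on-demand contract described above puts the free on the caller: if the returned buffer is not the one passed in, it was allocated and must be released. A hedged sketch of that calling convention follows (the 64-byte stack buffer and the "demo_" name are illustrative only):

static int demo_print_link_target(struct gfs2_inode *ip)
{
	char array[64];			/* arbitrary small stack buffer */
	char *buf = array;
	unsigned int len = sizeof(array);
	int error;

	error = gfs2_readlinki(ip, &buf, &len);
	if (error)
		return error;

	/* ... use the symlink target held in buf[0..len-1] ... */

	if (buf != array)		/* gfs2_readlinki() allocated a bigger buffer */
		kfree(buf);
	return 0;
}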
1124/**
1125 * gfs2_glock_nq_atime - Acquire a hold on an inode's glock, and
1126 * conditionally update the inode's atime
1127 * @gh: the holder to acquire
1128 *
1129 * Tests atime (access time) for gfs2_read, gfs2_readdir and gfs2_mmap
1130 * Update if the difference between the current time and the inode's current
1131 * atime is greater than an interval specified at mount.
1132 *
1133 * Returns: errno
1134 */
1135
1136int gfs2_glock_nq_atime(struct gfs2_holder *gh)
1137{
1138 struct gfs2_glock *gl = gh->gh_gl;
1139 struct gfs2_sbd *sdp = gl->gl_sbd;
1140 struct gfs2_inode *ip = gl->gl_object;
1141 int64_t curtime, quantum = gfs2_tune_get(sdp, gt_atime_quantum);
1142 unsigned int state;
1143 int flags;
1144 int error;
1145
1146 if (gfs2_assert_warn(sdp, gh->gh_flags & GL_ATIME) ||
1147 gfs2_assert_warn(sdp, !(gh->gh_flags & GL_ASYNC)) ||
1148 gfs2_assert_warn(sdp, gl->gl_ops == &gfs2_inode_glops))
1149 return -EINVAL;
1150
1151 state = gh->gh_state;
1152 flags = gh->gh_flags;
1153
1154 error = gfs2_glock_nq(gh);
1155 if (error)
1156 return error;
1157
1158 if (test_bit(SDF_NOATIME, &sdp->sd_flags) ||
1159 (sdp->sd_vfs->s_flags & MS_RDONLY))
1160 return 0;
1161
1162 curtime = get_seconds();
1163 if (curtime - ip->i_di.di_atime >= quantum) {
1164 gfs2_glock_dq(gh);
1165 gfs2_holder_reinit(LM_ST_EXCLUSIVE, gh->gh_flags & ~LM_FLAG_ANY,
1166 gh);
1167 error = gfs2_glock_nq(gh);
1168 if (error)
1169 return error;
1170
1171 /* Verify that atime hasn't been updated while we were
1172 trying to get exclusive lock. */
1173
1174 curtime = get_seconds();
1175 if (curtime - ip->i_di.di_atime >= quantum) {
1176 struct buffer_head *dibh;
1177
1178 error = gfs2_trans_begin(sdp, RES_DINODE, 0);
1179 if (error == -EROFS)
1180 return 0;
1181 if (error)
1182 goto fail;
1183
1184 error = gfs2_meta_inode_buffer(ip, &dibh);
1185 if (error)
1186 goto fail_end_trans;
1187
1188 ip->i_di.di_atime = curtime;
1189
1190 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1191 gfs2_dinode_out(&ip->i_di, dibh->b_data);
1192 brelse(dibh);
1193
1194 gfs2_trans_end(sdp);
1195 }
1196
1197 /* If someone else has asked for the glock,
1198 unlock and let them have it. Then reacquire
1199 in the original state. */
1200 if (gfs2_glock_is_blocking(gl)) {
1201 gfs2_glock_dq(gh);
1202 gfs2_holder_reinit(state, flags, gh);
1203 return gfs2_glock_nq(gh);
1204 }
1205 }
1206
1207 return 0;
1208
1209fail_end_trans:
1210 gfs2_trans_end(sdp);
1211fail:
1212 gfs2_glock_dq(gh);
1213 return error;
1214}
1215
1216/**
1217 * glock_compare_atime - Compare two struct gfs2_glock structures for sort
1218 * @arg_a: the first structure
1219 * @arg_b: the second structure
1220 *
1221 * Returns: 1 if A > B
1222 * -1 if A < B
1223 * 0 if A = B
1224 */
1225
1226static int glock_compare_atime(const void *arg_a, const void *arg_b)
1227{
1228 struct gfs2_holder *gh_a = *(struct gfs2_holder **)arg_a;
1229 struct gfs2_holder *gh_b = *(struct gfs2_holder **)arg_b;
1230 struct lm_lockname *a = &gh_a->gh_gl->gl_name;
1231 struct lm_lockname *b = &gh_b->gh_gl->gl_name;
1232 int ret = 0;
1233
1234 if (a->ln_number > b->ln_number)
1235 ret = 1;
1236 else if (a->ln_number < b->ln_number)
1237 ret = -1;
1238 else {
1239 if (gh_a->gh_state == LM_ST_SHARED &&
1240 gh_b->gh_state == LM_ST_EXCLUSIVE)
1241 ret = 1;
1242 else if (gh_a->gh_state == LM_ST_SHARED &&
1243 (gh_b->gh_flags & GL_ATIME))
1244 ret = 1;
1245 }
1246
1247 return ret;
1248}
1249
1250/**
1251 * gfs2_glock_nq_m_atime - acquire multiple glocks where one may need an
1252 * atime update
1253 * @num_gh: the number of structures
1254 * @ghs: an array of struct gfs2_holder structures
1255 *
1256 * Returns: 0 on success (all glocks acquired),
1257 * errno on failure (no glocks acquired)
1258 */
1259
1260int gfs2_glock_nq_m_atime(unsigned int num_gh, struct gfs2_holder *ghs)
1261{
1262 struct gfs2_holder **p;
1263 unsigned int x;
1264 int error = 0;
1265
1266 if (!num_gh)
1267 return 0;
1268
1269 if (num_gh == 1) {
1270 ghs->gh_flags &= ~(LM_FLAG_TRY | GL_ASYNC);
1271 if (ghs->gh_flags & GL_ATIME)
1272 error = gfs2_glock_nq_atime(ghs);
1273 else
1274 error = gfs2_glock_nq(ghs);
1275 return error;
1276 }
1277
1278 p = kcalloc(num_gh, sizeof(struct gfs2_holder *), GFP_KERNEL);
1279 if (!p)
1280 return -ENOMEM;
1281
1282 for (x = 0; x < num_gh; x++)
1283 p[x] = &ghs[x];
1284
1285 sort(p, num_gh, sizeof(struct gfs2_holder *), glock_compare_atime,NULL);
1286
1287 for (x = 0; x < num_gh; x++) {
1288 p[x]->gh_flags &= ~(LM_FLAG_TRY | GL_ASYNC);
1289
1290 if (p[x]->gh_flags & GL_ATIME)
1291 error = gfs2_glock_nq_atime(p[x]);
1292 else
1293 error = gfs2_glock_nq(p[x]);
1294
1295 if (error) {
1296 while (x--)
1297 gfs2_glock_dq(p[x]);
1298 break;
1299 }
1300 }
1301
1302 kfree(p);
1303
1304 return error;
1305}
1306
1307
1308static int
1309__gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr)
1310{
1311 struct buffer_head *dibh;
1312 int error;
1313
1314 error = gfs2_meta_inode_buffer(ip, &dibh);
1315 if (!error) {
1316 error = inode_setattr(&ip->i_inode, attr);
1317 gfs2_assert_warn(GFS2_SB(&ip->i_inode), !error);
1318 gfs2_inode_attr_out(ip);
1319
1320 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1321 gfs2_dinode_out(&ip->i_di, dibh->b_data);
1322 brelse(dibh);
1323 }
1324 return error;
1325}
1326
1327/**
1328 * gfs2_setattr_simple -
1329 * @ip:
1330 * @attr:
1331 *
1332 * Called with a reference on the vnode.
1333 *
1334 * Returns: errno
1335 */
1336
1337int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr)
1338{
1339 int error;
1340
1341 if (current->journal_info)
1342 return __gfs2_setattr_simple(ip, attr);
1343
1344 error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), RES_DINODE, 0);
1345 if (error)
1346 return error;
1347
1348 error = __gfs2_setattr_simple(ip, attr);
1349
1350 gfs2_trans_end(GFS2_SB(&ip->i_inode));
1351
1352 return error;
1353}
1354
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
new file mode 100644
index 000000000000..8bb8b559bcea
--- /dev/null
+++ b/fs/gfs2/inode.h
@@ -0,0 +1,56 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __INODE_DOT_H__
11#define __INODE_DOT_H__
12
13static inline int gfs2_is_stuffed(struct gfs2_inode *ip)
14{
15 return !ip->i_di.di_height;
16}
17
18static inline int gfs2_is_jdata(struct gfs2_inode *ip)
19{
20 return ip->i_di.di_flags & GFS2_DIF_JDATA;
21}
22
23static inline int gfs2_is_dir(struct gfs2_inode *ip)
24{
25 return S_ISDIR(ip->i_di.di_mode);
26}
27
28void gfs2_inode_attr_in(struct gfs2_inode *ip);
29void gfs2_inode_attr_out(struct gfs2_inode *ip);
30struct inode *gfs2_inode_lookup(struct super_block *sb, struct gfs2_inum *inum, unsigned type);
31struct inode *gfs2_ilookup(struct super_block *sb, struct gfs2_inum *inum);
32
33int gfs2_inode_refresh(struct gfs2_inode *ip);
34
35int gfs2_dinode_dealloc(struct gfs2_inode *inode);
36int gfs2_change_nlink(struct gfs2_inode *ip, int diff);
37struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name,
38 int is_root, struct nameidata *nd);
39struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
40 unsigned int mode);
41int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name,
42 struct gfs2_inode *ip);
43int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
44 struct gfs2_inode *ip);
45int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to);
46int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len);
47
48int gfs2_glock_nq_atime(struct gfs2_holder *gh);
49int gfs2_glock_nq_m_atime(unsigned int num_gh, struct gfs2_holder *ghs);
50
51int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr);
52
53struct inode *gfs2_lookup_simple(struct inode *dip, const char *name);
54
55#endif /* __INODE_DOT_H__ */
56
diff --git a/fs/gfs2/lm.c b/fs/gfs2/lm.c
new file mode 100644
index 000000000000..f45c0ffd1c35
--- /dev/null
+++ b/fs/gfs2/lm.c
@@ -0,0 +1,244 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/delay.h>
16#include <linux/gfs2_ondisk.h>
17
18#include "gfs2.h"
19#include "lm_interface.h"
20#include "incore.h"
21#include "glock.h"
22#include "lm.h"
23#include "super.h"
24#include "util.h"
25#include "lvb.h"
26
27/**
28 * gfs2_lm_mount - mount a locking protocol
29 * @sdp: the filesystem
30 * @silent: if 1, don't complain if the FS isn't a GFS2 fs
31 * (the mount arguments themselves are taken from @sdp->sd_args)
32 *
33 * Returns: errno
34 */
35
36int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent)
37{
38 char *proto = sdp->sd_proto_name;
39 char *table = sdp->sd_table_name;
40 int flags = 0;
41 int error;
42
43 if (sdp->sd_args.ar_spectator)
44 flags |= LM_MFLAG_SPECTATOR;
45
46 fs_info(sdp, "Trying to join cluster \"%s\", \"%s\"\n", proto, table);
47
48 error = gfs2_mount_lockproto(proto, table, sdp->sd_args.ar_hostdata,
49 gfs2_glock_cb, sdp,
50 GFS2_MIN_LVB_SIZE, flags,
51 &sdp->sd_lockstruct, &sdp->sd_kobj);
52 if (error) {
53 fs_info(sdp, "can't mount proto=%s, table=%s, hostdata=%s\n",
54 proto, table, sdp->sd_args.ar_hostdata);
55 goto out;
56 }
57
58 if (gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_lockspace) ||
59 gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_ops) ||
60 gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_lvb_size >=
61 GFS2_MIN_LVB_SIZE)) {
62 gfs2_unmount_lockproto(&sdp->sd_lockstruct);
63 goto out;
64 }
65
66 if (sdp->sd_args.ar_spectator)
67 snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.s", table);
68 else
69 snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.%u", table,
70 sdp->sd_lockstruct.ls_jid);
71
72 fs_info(sdp, "Joined cluster. Now mounting FS...\n");
73
74 if ((sdp->sd_lockstruct.ls_flags & LM_LSFLAG_LOCAL) &&
75 !sdp->sd_args.ar_ignore_local_fs) {
76 sdp->sd_args.ar_localflocks = 1;
77 sdp->sd_args.ar_localcaching = 1;
78 }
79
80 out:
81 return error;
82}
83
84void gfs2_lm_others_may_mount(struct gfs2_sbd *sdp)
85{
86 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
87 sdp->sd_lockstruct.ls_ops->lm_others_may_mount(
88 sdp->sd_lockstruct.ls_lockspace);
89}
90
91void gfs2_lm_unmount(struct gfs2_sbd *sdp)
92{
93 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
94 gfs2_unmount_lockproto(&sdp->sd_lockstruct);
95}
96
97int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...)
98{
99 va_list args;
100
101 if (test_and_set_bit(SDF_SHUTDOWN, &sdp->sd_flags))
102 return 0;
103
104 va_start(args, fmt);
105 vprintk(fmt, args);
106 va_end(args);
107
108 fs_err(sdp, "about to withdraw from the cluster\n");
109 BUG_ON(sdp->sd_args.ar_debug);
110
111
112 fs_err(sdp, "waiting for outstanding I/O\n");
113
114	/* FIXME: suspend dm device so outstanding bios complete
115 and all further io requests fail */
116
117 fs_err(sdp, "telling LM to withdraw\n");
118 gfs2_withdraw_lockproto(&sdp->sd_lockstruct);
119 fs_err(sdp, "withdrawn\n");
120 dump_stack();
121
122 return -1;
123}
124
125int gfs2_lm_get_lock(struct gfs2_sbd *sdp, struct lm_lockname *name,
126 lm_lock_t **lockp)
127{
128 int error;
129 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
130 error = -EIO;
131 else
132 error = sdp->sd_lockstruct.ls_ops->lm_get_lock(
133 sdp->sd_lockstruct.ls_lockspace, name, lockp);
134 return error;
135}
136
137void gfs2_lm_put_lock(struct gfs2_sbd *sdp, lm_lock_t *lock)
138{
139 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
140 sdp->sd_lockstruct.ls_ops->lm_put_lock(lock);
141}
142
143unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, lm_lock_t *lock,
144 unsigned int cur_state, unsigned int req_state,
145 unsigned int flags)
146{
147 int ret;
148 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
149 ret = 0;
150 else
151 ret = sdp->sd_lockstruct.ls_ops->lm_lock(lock,
152 cur_state,
153 req_state, flags);
154 return ret;
155}
156
157unsigned int gfs2_lm_unlock(struct gfs2_sbd *sdp, lm_lock_t *lock,
158 unsigned int cur_state)
159{
160 int ret;
161 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
162 ret = 0;
163 else
164 ret = sdp->sd_lockstruct.ls_ops->lm_unlock(lock, cur_state);
165 return ret;
166}
167
168void gfs2_lm_cancel(struct gfs2_sbd *sdp, lm_lock_t *lock)
169{
170 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
171 sdp->sd_lockstruct.ls_ops->lm_cancel(lock);
172}
173
174int gfs2_lm_hold_lvb(struct gfs2_sbd *sdp, lm_lock_t *lock, char **lvbp)
175{
176 int error;
177 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
178 error = -EIO;
179 else
180 error = sdp->sd_lockstruct.ls_ops->lm_hold_lvb(lock, lvbp);
181 return error;
182}
183
184void gfs2_lm_unhold_lvb(struct gfs2_sbd *sdp, lm_lock_t *lock, char *lvb)
185{
186 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
187 sdp->sd_lockstruct.ls_ops->lm_unhold_lvb(lock, lvb);
188}
189
190#if 0
191void gfs2_lm_sync_lvb(struct gfs2_sbd *sdp, lm_lock_t *lock, char *lvb)
192{
193 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
194 sdp->sd_lockstruct.ls_ops->lm_sync_lvb(lock, lvb);
195}
196#endif /* 0 */
197
198int gfs2_lm_plock_get(struct gfs2_sbd *sdp, struct lm_lockname *name,
199 struct file *file, struct file_lock *fl)
200{
201 int error;
202 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
203 error = -EIO;
204 else
205 error = sdp->sd_lockstruct.ls_ops->lm_plock_get(
206 sdp->sd_lockstruct.ls_lockspace,
207 name, file, fl);
208 return error;
209}
210
211int gfs2_lm_plock(struct gfs2_sbd *sdp, struct lm_lockname *name,
212 struct file *file, int cmd, struct file_lock *fl)
213{
214 int error;
215 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
216 error = -EIO;
217 else
218 error = sdp->sd_lockstruct.ls_ops->lm_plock(
219 sdp->sd_lockstruct.ls_lockspace,
220 name, file, cmd, fl);
221 return error;
222}
223
224int gfs2_lm_punlock(struct gfs2_sbd *sdp, struct lm_lockname *name,
225 struct file *file, struct file_lock *fl)
226{
227 int error;
228 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
229 error = -EIO;
230 else
231 error = sdp->sd_lockstruct.ls_ops->lm_punlock(
232 sdp->sd_lockstruct.ls_lockspace,
233 name, file, fl);
234 return error;
235}
236
237void gfs2_lm_recovery_done(struct gfs2_sbd *sdp, unsigned int jid,
238 unsigned int message)
239{
240 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
241 sdp->sd_lockstruct.ls_ops->lm_recovery_done(
242 sdp->sd_lockstruct.ls_lockspace, jid, message);
243}
244
diff --git a/fs/gfs2/lm.h b/fs/gfs2/lm.h
new file mode 100644
index 000000000000..e821101d19c0
--- /dev/null
+++ b/fs/gfs2/lm.h
@@ -0,0 +1,41 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __LM_DOT_H__
11#define __LM_DOT_H__
12
13int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent);
14void gfs2_lm_others_may_mount(struct gfs2_sbd *sdp);
15void gfs2_lm_unmount(struct gfs2_sbd *sdp);
16int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...)
17__attribute__ ((format(printf, 2, 3)));
18int gfs2_lm_get_lock(struct gfs2_sbd *sdp,
19 struct lm_lockname *name, lm_lock_t **lockp);
20void gfs2_lm_put_lock(struct gfs2_sbd *sdp, lm_lock_t *lock);
21unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, lm_lock_t *lock,
22 unsigned int cur_state, unsigned int req_state,
23 unsigned int flags);
24unsigned int gfs2_lm_unlock(struct gfs2_sbd *sdp, lm_lock_t *lock,
25 unsigned int cur_state);
26void gfs2_lm_cancel(struct gfs2_sbd *sdp, lm_lock_t *lock);
27int gfs2_lm_hold_lvb(struct gfs2_sbd *sdp, lm_lock_t *lock, char **lvbp);
28void gfs2_lm_unhold_lvb(struct gfs2_sbd *sdp, lm_lock_t *lock, char *lvb);
29int gfs2_lm_plock_get(struct gfs2_sbd *sdp,
30 struct lm_lockname *name,
31 struct file *file, struct file_lock *fl);
32int gfs2_lm_plock(struct gfs2_sbd *sdp,
33 struct lm_lockname *name,
34 struct file *file, int cmd, struct file_lock *fl);
35int gfs2_lm_punlock(struct gfs2_sbd *sdp,
36 struct lm_lockname *name,
37 struct file *file, struct file_lock *fl);
38void gfs2_lm_recovery_done(struct gfs2_sbd *sdp,
39 unsigned int jid, unsigned int message);
40
41#endif /* __LM_DOT_H__ */
diff --git a/fs/gfs2/lm_interface.h b/fs/gfs2/lm_interface.h
new file mode 100644
index 000000000000..9d34bf3df103
--- /dev/null
+++ b/fs/gfs2/lm_interface.h
@@ -0,0 +1,295 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __LM_INTERFACE_DOT_H__
11#define __LM_INTERFACE_DOT_H__
12
13/*
14 * Opaque handles represent the lock module's lockspace structure, the lock
15 * module's lock structures, and GFS's file system (superblock) structure.
16 */
17
18typedef void lm_lockspace_t;
19typedef void lm_lock_t;
20typedef void lm_fsdata_t;
21
22typedef void (*lm_callback_t) (lm_fsdata_t *fsdata, unsigned int type,
23 void *data);
24
25/*
26 * lm_mount() flags
27 *
28 * LM_MFLAG_SPECTATOR
29 * GFS is asking to join the filesystem's lockspace, but it doesn't want to
30 * modify the filesystem. The lock module shouldn't assign a journal to the FS
31 * mount. It shouldn't send recovery callbacks to the FS mount. If the node
32 * dies or withdraws, all locks can be wiped immediately.
33 */
34
35#define LM_MFLAG_SPECTATOR 0x00000001
36
37/*
38 * lm_lockstruct flags
39 *
40 * LM_LSFLAG_LOCAL
41 * The lock_nolock module returns LM_LSFLAG_LOCAL to GFS, indicating that GFS
42 * can make single-node optimizations.
43 */
44
45#define LM_LSFLAG_LOCAL 0x00000001
46
47/*
48 * lm_lockname types
49 */
50
51#define LM_TYPE_RESERVED 0x00
52#define LM_TYPE_NONDISK 0x01
53#define LM_TYPE_INODE 0x02
54#define LM_TYPE_RGRP 0x03
55#define LM_TYPE_META 0x04
56#define LM_TYPE_IOPEN 0x05
57#define LM_TYPE_FLOCK 0x06
58#define LM_TYPE_PLOCK 0x07
59#define LM_TYPE_QUOTA 0x08
60#define LM_TYPE_JOURNAL 0x09
61
62/*
63 * lm_lock() states
64 *
65 * SHARED is compatible with SHARED, not with DEFERRED or EX.
66 * DEFERRED is compatible with DEFERRED, not with SHARED or EX.
67 */
68
69#define LM_ST_UNLOCKED 0
70#define LM_ST_EXCLUSIVE 1
71#define LM_ST_DEFERRED 2
72#define LM_ST_SHARED 3
73
74/*
75 * lm_lock() flags
76 *
77 * LM_FLAG_TRY
78 * Don't wait to acquire the lock if it can't be granted immediately.
79 *
80 * LM_FLAG_TRY_1CB
81 * Send one blocking callback if TRY is set and the lock is not granted.
82 *
83 * LM_FLAG_NOEXP
84 * GFS sets this flag on lock requests it makes while doing journal recovery.
85 * These special requests should not be blocked by recovery the way
86 * ordinary lock requests would be.
87 *
88 * LM_FLAG_ANY
89 * A SHARED request may also be granted in DEFERRED, or a DEFERRED request may
90 * also be granted in SHARED. The preferred state is whichever is compatible
91 * with other granted locks, or the specified state if no other locks exist.
92 *
93 * LM_FLAG_PRIORITY
94 * Override fairness considerations. Suppose a lock is held in a shared state
95 * and there is a pending request for the deferred state. A shared lock
96 * request with the priority flag would be allowed to bypass the deferred
97 * request and directly join the other shared lock. A shared lock request
98 * without the priority flag might be forced to wait until the deferred
99 * requested had acquired and released the lock.
100 */
101
102#define LM_FLAG_TRY 0x00000001
103#define LM_FLAG_TRY_1CB 0x00000002
104#define LM_FLAG_NOEXP 0x00000004
105#define LM_FLAG_ANY 0x00000008
106#define LM_FLAG_PRIORITY 0x00000010
107
108/*
109 * lm_lock() and lm_async_cb return flags
110 *
111 * LM_OUT_ST_MASK
112 * Masks the lower two bits of lock state in the returned value.
113 *
114 * LM_OUT_CACHEABLE
115 * The lock hasn't been released so GFS can continue to cache data for it.
116 *
117 * LM_OUT_CANCELED
118 * The lock request was canceled.
119 *
120 * LM_OUT_ASYNC
121 * The result of the request will be returned in an LM_CB_ASYNC callback.
122 */
123
124#define LM_OUT_ST_MASK 0x00000003
125#define LM_OUT_CACHEABLE 0x00000004
126#define LM_OUT_CANCELED 0x00000008
127#define LM_OUT_ASYNC 0x00000080
128#define LM_OUT_ERROR 0x00000100
129
130/*
131 * lm_callback_t types
132 *
133 * LM_CB_NEED_E LM_CB_NEED_D LM_CB_NEED_S
134 * Blocking callback, a remote node is requesting the given lock in
135 * EXCLUSIVE, DEFERRED, or SHARED.
136 *
137 * LM_CB_NEED_RECOVERY
138 * The given journal needs to be recovered.
139 *
140 * LM_CB_DROPLOCKS
141 * Reduce the number of cached locks.
142 *
143 * LM_CB_ASYNC
144 * The given lock has been granted.
145 */
146
147#define LM_CB_NEED_E 257
148#define LM_CB_NEED_D 258
149#define LM_CB_NEED_S 259
150#define LM_CB_NEED_RECOVERY 260
151#define LM_CB_DROPLOCKS 261
152#define LM_CB_ASYNC 262
153
154/*
155 * lm_recovery_done() messages
156 */
157
158#define LM_RD_GAVEUP 308
159#define LM_RD_SUCCESS 309
160
161
162struct lm_lockname {
163 uint64_t ln_number;
164 unsigned int ln_type;
165};
166
167#define lm_name_equal(name1, name2) \
168 (((name1)->ln_number == (name2)->ln_number) && \
169 ((name1)->ln_type == (name2)->ln_type))
170
171struct lm_async_cb {
172 struct lm_lockname lc_name;
173 int lc_ret;
174};
175
176struct lm_lockstruct;
177
178struct lm_lockops {
179 char lm_proto_name[256];
180
181 /*
182 * Mount/Unmount
183 */
184
185 int (*lm_mount) (char *table_name, char *host_data,
186 lm_callback_t cb, lm_fsdata_t *fsdata,
187 unsigned int min_lvb_size, int flags,
188 struct lm_lockstruct *lockstruct,
189 struct kobject *fskobj);
190
191 void (*lm_others_may_mount) (lm_lockspace_t *lockspace);
192
193 void (*lm_unmount) (lm_lockspace_t *lockspace);
194
195 void (*lm_withdraw) (lm_lockspace_t *lockspace);
196
197 /*
198 * Lock oriented operations
199 */
200
201 int (*lm_get_lock) (lm_lockspace_t *lockspace,
202 struct lm_lockname *name, lm_lock_t **lockp);
203
204 void (*lm_put_lock) (lm_lock_t *lock);
205
206 unsigned int (*lm_lock) (lm_lock_t *lock, unsigned int cur_state,
207 unsigned int req_state, unsigned int flags);
208
209 unsigned int (*lm_unlock) (lm_lock_t *lock, unsigned int cur_state);
210
211 void (*lm_cancel) (lm_lock_t *lock);
212
213 int (*lm_hold_lvb) (lm_lock_t *lock, char **lvbp);
214 void (*lm_unhold_lvb) (lm_lock_t *lock, char *lvb);
215 void (*lm_sync_lvb) (lm_lock_t *lock, char *lvb);
216
217 /*
218 * Posix Lock oriented operations
219 */
220
221 int (*lm_plock_get) (lm_lockspace_t *lockspace,
222 struct lm_lockname *name,
223 struct file *file, struct file_lock *fl);
224
225 int (*lm_plock) (lm_lockspace_t *lockspace,
226 struct lm_lockname *name,
227 struct file *file, int cmd, struct file_lock *fl);
228
229 int (*lm_punlock) (lm_lockspace_t *lockspace,
230 struct lm_lockname *name,
231 struct file *file, struct file_lock *fl);
232
233 /*
234 * Client oriented operations
235 */
236
237 void (*lm_recovery_done) (lm_lockspace_t *lockspace, unsigned int jid,
238 unsigned int message);
239
240 struct module *lm_owner;
241};
242
243/*
244 * lm_mount() return values
245 *
246 * ls_jid - the journal ID this node should use
247 * ls_first - this node is the first to mount the file system
248 * ls_lvb_size - size in bytes of lock value blocks
249 * ls_lockspace - lock module's context for this file system
250 * ls_ops - lock module's functions
251 * ls_flags - lock module features
252 */
253
254struct lm_lockstruct {
255 unsigned int ls_jid;
256 unsigned int ls_first;
257 unsigned int ls_lvb_size;
258 lm_lockspace_t *ls_lockspace;
259 struct lm_lockops *ls_ops;
260 int ls_flags;
261};
262
263void __init gfs2_init_lmh(void);
264
265/*
266 * Lock module bottom interface. A lock module makes itself available to GFS
267 * with these functions.
268 *
269 * For the time being, we copy the gfs1 lock module bottom interface so the
270 * same lock modules can be used with both gfs1 and gfs2 (it won't be possible
271 * to load both gfs1 and gfs2 at once). Eventually the lock modules will fork
272 * for gfs1/gfs2 and this API can change to the gfs2_ prefix.
273 */
274
275int gfs_register_lockproto(struct lm_lockops *proto);
276
277void gfs_unregister_lockproto(struct lm_lockops *proto);
278
279/*
280 * Lock module top interface. GFS calls these functions when mounting or
281 * unmounting a file system.
282 */
283
284int gfs2_mount_lockproto(char *proto_name, char *table_name, char *host_data,
285 lm_callback_t cb, lm_fsdata_t *fsdata,
286 unsigned int min_lvb_size, int flags,
287 struct lm_lockstruct *lockstruct,
288 struct kobject *fskobj);
289
290void gfs2_unmount_lockproto(struct lm_lockstruct *lockstruct);
291
292void gfs2_withdraw_lockproto(struct lm_lockstruct *lockstruct);
293
294#endif /* __LM_INTERFACE_DOT_H__ */
295
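
To illustrate how a lock module plugs into the interface above, here is a minimal, hypothetical skeleton (not code from this patch): it fills in lm_mount the way a local-only module would, registers its lm_lockops table, and omits the lock-manipulation callbacks that real modules (lock_nolock, lock_dlm below) implement.

	#include <linux/module.h>
	#include "lm_interface.h"

	static struct lm_lockops example_ops;

	static int example_mount(char *table_name, char *host_data,
				 lm_callback_t cb, lm_fsdata_t *fsdata,
				 unsigned int min_lvb_size, int flags,
				 struct lm_lockstruct *lockstruct,
				 struct kobject *fskobj)
	{
		/* A local-only module: hand back journal 0, claim to be the
		   first (and only) mounter, and set LM_LSFLAG_LOCAL so GFS
		   can apply its single-node optimizations. */
		lockstruct->ls_jid = 0;
		lockstruct->ls_first = 1;
		lockstruct->ls_lvb_size = min_lvb_size;
		lockstruct->ls_lockspace = NULL;
		lockstruct->ls_ops = &example_ops;
		lockstruct->ls_flags = LM_LSFLAG_LOCAL;
		return 0;
	}

	static struct lm_lockops example_ops = {
		.lm_proto_name = "lock_example",
		.lm_mount = example_mount,
		/* .lm_unmount, .lm_get_lock, .lm_lock, ... omitted here;
		   a usable module must provide them. */
		.lm_owner = THIS_MODULE,
	};

	static int __init example_init(void)
	{
		return gfs_register_lockproto(&example_ops);
	}

	static void __exit example_exit(void)
	{
		gfs_unregister_lockproto(&example_ops);
	}

	module_init(example_init);
	module_exit(example_exit);
	MODULE_LICENSE("GPL");
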
diff --git a/fs/gfs2/locking.c b/fs/gfs2/locking.c
new file mode 100644
index 000000000000..183192836e98
--- /dev/null
+++ b/fs/gfs2/locking.c
@@ -0,0 +1,191 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/module.h>
11#include <linux/init.h>
12#include <linux/string.h>
13#include <linux/slab.h>
14#include <linux/wait.h>
15#include <linux/sched.h>
16#include <linux/kmod.h>
17#include <linux/fs.h>
18#include <linux/delay.h>
19
20#include "lm_interface.h"
21
22struct lmh_wrapper {
23 struct list_head lw_list;
24 struct lm_lockops *lw_ops;
25};
26
27/* List of registered low-level locking protocols. A file system selects one
28 of them by name at mount time, e.g. lock_nolock, lock_dlm. */
29
30static struct list_head lmh_list;
31static struct mutex lmh_lock;
32
33/**
34 * gfs_register_lockproto - Register a low-level locking protocol
35 * @proto: the protocol definition
36 *
37 * Returns: 0 on success, -EXXX on failure
38 */
39
40int gfs_register_lockproto(struct lm_lockops *proto)
41{
42 struct lmh_wrapper *lw;
43
44 mutex_lock(&lmh_lock);
45
46 list_for_each_entry(lw, &lmh_list, lw_list) {
47 if (!strcmp(lw->lw_ops->lm_proto_name, proto->lm_proto_name)) {
48 mutex_unlock(&lmh_lock);
49 printk(KERN_INFO "GFS2: protocol %s already exists\n",
50 proto->lm_proto_name);
51 return -EEXIST;
52 }
53 }
54
55 lw = kzalloc(sizeof(struct lmh_wrapper), GFP_KERNEL);
56 if (!lw) {
57 mutex_unlock(&lmh_lock);
58 return -ENOMEM;
59 }
60
61 lw->lw_ops = proto;
62 list_add(&lw->lw_list, &lmh_list);
63
64 mutex_unlock(&lmh_lock);
65
66 return 0;
67}
68
69/**
70 * gfs_unregister_lockproto - Unregister a low-level locking protocol
71 * @proto: the protocol definition
72 *
73 */
74
75void gfs_unregister_lockproto(struct lm_lockops *proto)
76{
77 struct lmh_wrapper *lw;
78
79 mutex_lock(&lmh_lock);
80
81 list_for_each_entry(lw, &lmh_list, lw_list) {
82 if (!strcmp(lw->lw_ops->lm_proto_name, proto->lm_proto_name)) {
83 list_del(&lw->lw_list);
84 mutex_unlock(&lmh_lock);
85 kfree(lw);
86 return;
87 }
88 }
89
90 mutex_unlock(&lmh_lock);
91
92 printk(KERN_WARNING "GFS2: can't unregister lock protocol %s\n",
93 proto->lm_proto_name);
94}
95
96/**
97 * gfs2_mount_lockproto - Mount a lock protocol
98 * @proto_name - the name of the protocol
99 * @table_name - the name of the lock space
100 * @host_data - data specific to this host
101 * @cb - the callback to the code using the lock module
102 * @fsdata - data to pass back with the callback
103 * @min_lvb_size - the minimum LVB size that the caller can deal with
104 * @flags - LM_MFLAG_*
105 * @lockstruct - a structure returned describing the mount
106 *
107 * Returns: 0 on success, -EXXX on failure
108 */
109
110int gfs2_mount_lockproto(char *proto_name, char *table_name, char *host_data,
111 lm_callback_t cb, lm_fsdata_t *fsdata,
112 unsigned int min_lvb_size, int flags,
113 struct lm_lockstruct *lockstruct,
114 struct kobject *fskobj)
115{
116 struct lmh_wrapper *lw = NULL;
117 int try = 0;
118 int error, found;
119
120 retry:
121 mutex_lock(&lmh_lock);
122
123 found = 0;
124 list_for_each_entry(lw, &lmh_list, lw_list) {
125 if (!strcmp(lw->lw_ops->lm_proto_name, proto_name)) {
126 found = 1;
127 break;
128 }
129 }
130
131 if (!found) {
132 if (!try && capable(CAP_SYS_MODULE)) {
133 try = 1;
134 mutex_unlock(&lmh_lock);
135 request_module(proto_name);
136 goto retry;
137 }
138 printk(KERN_INFO "GFS2: can't find protocol %s\n", proto_name);
139 error = -ENOENT;
140 goto out;
141 }
142
143 if (!try_module_get(lw->lw_ops->lm_owner)) {
144 try = 0;
145 mutex_unlock(&lmh_lock);
146 msleep(1000);
147 goto retry;
148 }
149
150 error = lw->lw_ops->lm_mount(table_name, host_data, cb, fsdata,
151 min_lvb_size, flags, lockstruct, fskobj);
152 if (error)
153 module_put(lw->lw_ops->lm_owner);
154 out:
155 mutex_unlock(&lmh_lock);
156 return error;
157}
158
159void gfs2_unmount_lockproto(struct lm_lockstruct *lockstruct)
160{
161 mutex_lock(&lmh_lock);
162 lockstruct->ls_ops->lm_unmount(lockstruct->ls_lockspace);
163 if (lockstruct->ls_ops->lm_owner)
164 module_put(lockstruct->ls_ops->lm_owner);
165 mutex_unlock(&lmh_lock);
166}
167
168/**
169 * gfs2_withdraw_lockproto - abnormally unmount a lock module
170 * @lockstruct: the lockstruct passed into mount
171 *
172 */
173
174void gfs2_withdraw_lockproto(struct lm_lockstruct *lockstruct)
175{
176 mutex_lock(&lmh_lock);
177 lockstruct->ls_ops->lm_withdraw(lockstruct->ls_lockspace);
178 if (lockstruct->ls_ops->lm_owner)
179 module_put(lockstruct->ls_ops->lm_owner);
180 mutex_unlock(&lmh_lock);
181}
182
183void __init gfs2_init_lmh(void)
184{
185 mutex_init(&lmh_lock);
186 INIT_LIST_HEAD(&lmh_list);
187}
188
189EXPORT_SYMBOL_GPL(gfs_register_lockproto);
190EXPORT_SYMBOL_GPL(gfs_unregister_lockproto);
191
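
The top interface above is what the filesystem calls at mount time. A hedged sketch of that call follows; the callback, private data and kobject arguments are placeholders, and the table/host_data strings are made up rather than taken from this patch.

	struct lm_lockstruct ls;
	int error;

	/* "mycluster:myfs" names the lockspace; the host_data string is
	   parsed by the chosen lock module (lock_dlm's make_args() later in
	   this patch understands jid=, id=, first=, nodir=). */
	error = gfs2_mount_lockproto("lock_dlm", "mycluster:myfs",
				     "jid=0:id=196609:first=1",
				     my_callback, my_fsdata, 32, 0,
				     &ls, my_fskobj);
	if (error)
		return error;

	/* ls.ls_ops and ls.ls_lockspace drive all later lock calls ... */

	gfs2_unmount_lockproto(&ls);
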
diff --git a/fs/gfs2/locking/dlm/Makefile b/fs/gfs2/locking/dlm/Makefile
new file mode 100644
index 000000000000..a9733ff80371
--- /dev/null
+++ b/fs/gfs2/locking/dlm/Makefile
@@ -0,0 +1,3 @@
1obj-$(CONFIG_GFS2_FS) += lock_dlm.o
2lock_dlm-y := lock.o main.o mount.o sysfs.o thread.o plock.o
3
diff --git a/fs/gfs2/locking/dlm/lock.c b/fs/gfs2/locking/dlm/lock.c
new file mode 100644
index 000000000000..e74f1215672f
--- /dev/null
+++ b/fs/gfs2/locking/dlm/lock.c
@@ -0,0 +1,541 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include "lock_dlm.h"
11
12static char junk_lvb[GDLM_LVB_SIZE];
13
14static void queue_complete(struct gdlm_lock *lp)
15{
16 struct gdlm_ls *ls = lp->ls;
17
18 clear_bit(LFL_ACTIVE, &lp->flags);
19
20 spin_lock(&ls->async_lock);
21 list_add_tail(&lp->clist, &ls->complete);
22 spin_unlock(&ls->async_lock);
23 wake_up(&ls->thread_wait);
24}
25
26static inline void gdlm_ast(void *astarg)
27{
28 queue_complete(astarg);
29}
30
31static inline void gdlm_bast(void *astarg, int mode)
32{
33 struct gdlm_lock *lp = astarg;
34 struct gdlm_ls *ls = lp->ls;
35
36 if (!mode) {
37 printk(KERN_INFO "lock_dlm: bast mode zero %x,%llx\n",
38 lp->lockname.ln_type,
39 (unsigned long long)lp->lockname.ln_number);
40 return;
41 }
42
43 spin_lock(&ls->async_lock);
44 if (!lp->bast_mode) {
45 list_add_tail(&lp->blist, &ls->blocking);
46 lp->bast_mode = mode;
47 } else if (lp->bast_mode < mode)
48 lp->bast_mode = mode;
49 spin_unlock(&ls->async_lock);
50 wake_up(&ls->thread_wait);
51}
52
53void gdlm_queue_delayed(struct gdlm_lock *lp)
54{
55 struct gdlm_ls *ls = lp->ls;
56
57 spin_lock(&ls->async_lock);
58 list_add_tail(&lp->delay_list, &ls->delayed);
59 spin_unlock(&ls->async_lock);
60}
61
62/* convert gfs lock-state to dlm lock-mode */
63
64static int16_t make_mode(int16_t lmstate)
65{
66 switch (lmstate) {
67 case LM_ST_UNLOCKED:
68 return DLM_LOCK_NL;
69 case LM_ST_EXCLUSIVE:
70 return DLM_LOCK_EX;
71 case LM_ST_DEFERRED:
72 return DLM_LOCK_CW;
73 case LM_ST_SHARED:
74 return DLM_LOCK_PR;
75 }
76 gdlm_assert(0, "unknown LM state %d", lmstate);
77 return -1;
78}
79
80/* convert dlm lock-mode to gfs lock-state */
81
82int16_t gdlm_make_lmstate(int16_t dlmmode)
83{
84 switch (dlmmode) {
85 case DLM_LOCK_IV:
86 case DLM_LOCK_NL:
87 return LM_ST_UNLOCKED;
88 case DLM_LOCK_EX:
89 return LM_ST_EXCLUSIVE;
90 case DLM_LOCK_CW:
91 return LM_ST_DEFERRED;
92 case DLM_LOCK_PR:
93 return LM_ST_SHARED;
94 }
95 gdlm_assert(0, "unknown DLM mode %d", dlmmode);
96 return -1;
97}
98
99/* verify agreement with GFS on the current lock state, NB: DLM_LOCK_NL and
100 DLM_LOCK_IV are both considered LM_ST_UNLOCKED by GFS. */
101
102static void check_cur_state(struct gdlm_lock *lp, unsigned int cur_state)
103{
104 int16_t cur = make_mode(cur_state);
105 if (lp->cur != DLM_LOCK_IV)
106 gdlm_assert(lp->cur == cur, "%d, %d", lp->cur, cur);
107}
108
109static inline unsigned int make_flags(struct gdlm_lock *lp,
110 unsigned int gfs_flags,
111 int16_t cur, int16_t req)
112{
113 unsigned int lkf = 0;
114
115 if (gfs_flags & LM_FLAG_TRY)
116 lkf |= DLM_LKF_NOQUEUE;
117
118 if (gfs_flags & LM_FLAG_TRY_1CB) {
119 lkf |= DLM_LKF_NOQUEUE;
120 lkf |= DLM_LKF_NOQUEUEBAST;
121 }
122
123 if (gfs_flags & LM_FLAG_PRIORITY) {
124 lkf |= DLM_LKF_NOORDER;
125 lkf |= DLM_LKF_HEADQUE;
126 }
127
128 if (gfs_flags & LM_FLAG_ANY) {
129 if (req == DLM_LOCK_PR)
130 lkf |= DLM_LKF_ALTCW;
131 else if (req == DLM_LOCK_CW)
132 lkf |= DLM_LKF_ALTPR;
133 }
134
135 if (lp->lksb.sb_lkid != 0) {
136 lkf |= DLM_LKF_CONVERT;
137
138 /* Conversion deadlock avoidance by DLM */
139
140 if (!test_bit(LFL_FORCE_PROMOTE, &lp->flags) &&
141 !(lkf & DLM_LKF_NOQUEUE) &&
142 cur > DLM_LOCK_NL && req > DLM_LOCK_NL && cur != req)
143 lkf |= DLM_LKF_CONVDEADLK;
144 }
145
146 if (lp->lvb)
147 lkf |= DLM_LKF_VALBLK;
148
149 return lkf;
150}
151
152/* make_strname - convert GFS lock numbers to a string */
153
154static inline void make_strname(struct lm_lockname *lockname,
155 struct gdlm_strname *str)
156{
157 sprintf(str->name, "%8x%16llx", lockname->ln_type,
158 (unsigned long long)lockname->ln_number);
159 str->namelen = GDLM_STRNAME_BYTES;
160}
161
162static int gdlm_create_lp(struct gdlm_ls *ls, struct lm_lockname *name,
163 struct gdlm_lock **lpp)
164{
165 struct gdlm_lock *lp;
166
167 lp = kzalloc(sizeof(struct gdlm_lock), GFP_KERNEL);
168 if (!lp)
169 return -ENOMEM;
170
171 lp->lockname = *name;
172 lp->ls = ls;
173 lp->cur = DLM_LOCK_IV;
174 lp->lvb = NULL;
175 lp->hold_null = NULL;
176 init_completion(&lp->ast_wait);
177 INIT_LIST_HEAD(&lp->clist);
178 INIT_LIST_HEAD(&lp->blist);
179 INIT_LIST_HEAD(&lp->delay_list);
180
181 spin_lock(&ls->async_lock);
182 list_add(&lp->all_list, &ls->all_locks);
183 ls->all_locks_count++;
184 spin_unlock(&ls->async_lock);
185
186 *lpp = lp;
187 return 0;
188}
189
190void gdlm_delete_lp(struct gdlm_lock *lp)
191{
192 struct gdlm_ls *ls = lp->ls;
193
194 spin_lock(&ls->async_lock);
195 if (!list_empty(&lp->clist))
196 list_del_init(&lp->clist);
197 if (!list_empty(&lp->blist))
198 list_del_init(&lp->blist);
199 if (!list_empty(&lp->delay_list))
200 list_del_init(&lp->delay_list);
201 gdlm_assert(!list_empty(&lp->all_list), "%x,%llx", lp->lockname.ln_type,
202 (unsigned long long)lp->lockname.ln_number);
203 list_del_init(&lp->all_list);
204 ls->all_locks_count--;
205 spin_unlock(&ls->async_lock);
206
207 kfree(lp);
208}
209
210int gdlm_get_lock(lm_lockspace_t *lockspace, struct lm_lockname *name,
211 lm_lock_t **lockp)
212{
213 struct gdlm_lock *lp;
214 int error;
215
216 error = gdlm_create_lp((struct gdlm_ls *) lockspace, name, &lp);
217
218 *lockp = (lm_lock_t *) lp;
219 return error;
220}
221
222void gdlm_put_lock(lm_lock_t *lock)
223{
224 gdlm_delete_lp((struct gdlm_lock *) lock);
225}
226
227unsigned int gdlm_do_lock(struct gdlm_lock *lp)
228{
229 struct gdlm_ls *ls = lp->ls;
230 struct gdlm_strname str;
231 int error, bast = 1;
232
233 /*
234 * When recovery is in progress, delay lock requests so that they are
235 * submitted once recovery is done. Requests for recovery (NOEXP) and
236 * unlocks can pass.
237 */
238
239 if (test_bit(DFL_BLOCK_LOCKS, &ls->flags) &&
240 !test_bit(LFL_NOBLOCK, &lp->flags) && lp->req != DLM_LOCK_NL) {
241 gdlm_queue_delayed(lp);
242 return LM_OUT_ASYNC;
243 }
244
245 /*
246 * Submit the actual lock request.
247 */
248
249 if (test_bit(LFL_NOBAST, &lp->flags))
250 bast = 0;
251
252 make_strname(&lp->lockname, &str);
253
254 set_bit(LFL_ACTIVE, &lp->flags);
255
256 log_debug("lk %x,%llx id %x %d,%d %x", lp->lockname.ln_type,
257 (unsigned long long)lp->lockname.ln_number, lp->lksb.sb_lkid,
258 lp->cur, lp->req, lp->lkf);
259
260 error = dlm_lock(ls->dlm_lockspace, lp->req, &lp->lksb, lp->lkf,
261 str.name, str.namelen, 0, gdlm_ast, lp,
262 bast ? gdlm_bast : NULL);
263
264 if ((error == -EAGAIN) && (lp->lkf & DLM_LKF_NOQUEUE)) {
265 lp->lksb.sb_status = -EAGAIN;
266 queue_complete(lp);
267 error = 0;
268 }
269
270 if (error) {
271 log_debug("%s: gdlm_lock %x,%llx err=%d cur=%d req=%d lkf=%x "
272 "flags=%lx", ls->fsname, lp->lockname.ln_type,
273 (unsigned long long)lp->lockname.ln_number, error,
274 lp->cur, lp->req, lp->lkf, lp->flags);
275 return LM_OUT_ERROR;
276 }
277 return LM_OUT_ASYNC;
278}
279
280static unsigned int gdlm_do_unlock(struct gdlm_lock *lp)
281{
282 struct gdlm_ls *ls = lp->ls;
283 unsigned int lkf = 0;
284 int error;
285
286 set_bit(LFL_DLM_UNLOCK, &lp->flags);
287 set_bit(LFL_ACTIVE, &lp->flags);
288
289 if (lp->lvb)
290 lkf = DLM_LKF_VALBLK;
291
292 log_debug("un %x,%llx %x %d %x", lp->lockname.ln_type,
293 (unsigned long long)lp->lockname.ln_number,
294 lp->lksb.sb_lkid, lp->cur, lkf);
295
296 error = dlm_unlock(ls->dlm_lockspace, lp->lksb.sb_lkid, lkf, NULL, lp);
297
298 if (error) {
299 log_debug("%s: gdlm_unlock %x,%llx err=%d cur=%d req=%d lkf=%x "
300 "flags=%lx", ls->fsname, lp->lockname.ln_type,
301 (unsigned long long)lp->lockname.ln_number, error,
302 lp->cur, lp->req, lp->lkf, lp->flags);
303 return LM_OUT_ERROR;
304 }
305 return LM_OUT_ASYNC;
306}
307
308unsigned int gdlm_lock(lm_lock_t *lock, unsigned int cur_state,
309 unsigned int req_state, unsigned int flags)
310{
311 struct gdlm_lock *lp = (struct gdlm_lock *) lock;
312
313 clear_bit(LFL_DLM_CANCEL, &lp->flags);
314 if (flags & LM_FLAG_NOEXP)
315 set_bit(LFL_NOBLOCK, &lp->flags);
316
317 check_cur_state(lp, cur_state);
318 lp->req = make_mode(req_state);
319 lp->lkf = make_flags(lp, flags, lp->cur, lp->req);
320
321 return gdlm_do_lock(lp);
322}
323
324unsigned int gdlm_unlock(lm_lock_t *lock, unsigned int cur_state)
325{
326 struct gdlm_lock *lp = (struct gdlm_lock *) lock;
327
328 clear_bit(LFL_DLM_CANCEL, &lp->flags);
329 if (lp->cur == DLM_LOCK_IV)
330 return 0;
331 return gdlm_do_unlock(lp);
332}
333
334void gdlm_cancel(lm_lock_t *lock)
335{
336 struct gdlm_lock *lp = (struct gdlm_lock *) lock;
337 struct gdlm_ls *ls = lp->ls;
338 int error, delay_list = 0;
339
340 if (test_bit(LFL_DLM_CANCEL, &lp->flags))
341 return;
342
343 log_info("gdlm_cancel %x,%llx flags %lx", lp->lockname.ln_type,
344 (unsigned long long)lp->lockname.ln_number, lp->flags);
345
346 spin_lock(&ls->async_lock);
347 if (!list_empty(&lp->delay_list)) {
348 list_del_init(&lp->delay_list);
349 delay_list = 1;
350 }
351 spin_unlock(&ls->async_lock);
352
353 if (delay_list) {
354 set_bit(LFL_CANCEL, &lp->flags);
355 set_bit(LFL_ACTIVE, &lp->flags);
356 queue_complete(lp);
357 return;
358 }
359
360 if (!test_bit(LFL_ACTIVE, &lp->flags) ||
361 test_bit(LFL_DLM_UNLOCK, &lp->flags)) {
362 log_info("gdlm_cancel skip %x,%llx flags %lx",
363 lp->lockname.ln_type,
364 (unsigned long long)lp->lockname.ln_number, lp->flags);
365 return;
366 }
367
368 /* the lock is blocked in the dlm */
369
370 set_bit(LFL_DLM_CANCEL, &lp->flags);
371 set_bit(LFL_ACTIVE, &lp->flags);
372
373 error = dlm_unlock(ls->dlm_lockspace, lp->lksb.sb_lkid, DLM_LKF_CANCEL,
374 NULL, lp);
375
376 log_info("gdlm_cancel rv %d %x,%llx flags %lx", error,
377 lp->lockname.ln_type,
378 (unsigned long long)lp->lockname.ln_number, lp->flags);
379
380 if (error == -EBUSY)
381 clear_bit(LFL_DLM_CANCEL, &lp->flags);
382}
383
384static int gdlm_add_lvb(struct gdlm_lock *lp)
385{
386 char *lvb;
387
388 lvb = kzalloc(GDLM_LVB_SIZE, GFP_KERNEL);
389 if (!lvb)
390 return -ENOMEM;
391
392 lp->lksb.sb_lvbptr = lvb;
393 lp->lvb = lvb;
394 return 0;
395}
396
397static void gdlm_del_lvb(struct gdlm_lock *lp)
398{
399 kfree(lp->lvb);
400 lp->lvb = NULL;
401 lp->lksb.sb_lvbptr = NULL;
402}
403
404/* This can do a synchronous dlm request (requiring a lock_dlm thread to get
405 the completion) because gfs won't call hold_lvb() during a callback (from
406 the context of a lock_dlm thread). */
407
408static int hold_null_lock(struct gdlm_lock *lp)
409{
410 struct gdlm_lock *lpn = NULL;
411 int error;
412
413 if (lp->hold_null) {
414 printk(KERN_INFO "lock_dlm: lvb already held\n");
415 return 0;
416 }
417
418 error = gdlm_create_lp(lp->ls, &lp->lockname, &lpn);
419 if (error)
420 goto out;
421
422 lpn->lksb.sb_lvbptr = junk_lvb;
423 lpn->lvb = junk_lvb;
424
425 lpn->req = DLM_LOCK_NL;
426 lpn->lkf = DLM_LKF_VALBLK | DLM_LKF_EXPEDITE;
427 set_bit(LFL_NOBAST, &lpn->flags);
428 set_bit(LFL_INLOCK, &lpn->flags);
429
430 init_completion(&lpn->ast_wait);
431 gdlm_do_lock(lpn);
432 wait_for_completion(&lpn->ast_wait);
433 error = lpn->lksb.sb_status;
434 if (error) {
435 printk(KERN_INFO "lock_dlm: hold_null_lock dlm error %d\n",
436 error);
437 gdlm_delete_lp(lpn);
438 lpn = NULL;
439 }
440 out:
441 lp->hold_null = lpn;
442 return error;
443}
444
445/* This cannot do a synchronous dlm request (requiring a lock_dlm thread to get
446 the completion) because gfs may call unhold_lvb() during a callback (from
447 the context of a lock_dlm thread) which could cause a deadlock since the
448 other lock_dlm thread could be engaged in recovery. */
449
450static void unhold_null_lock(struct gdlm_lock *lp)
451{
452 struct gdlm_lock *lpn = lp->hold_null;
453
454 gdlm_assert(lpn, "%x,%llx", lp->lockname.ln_type,
455 (unsigned long long)lp->lockname.ln_number);
456 lpn->lksb.sb_lvbptr = NULL;
457 lpn->lvb = NULL;
458 set_bit(LFL_UNLOCK_DELETE, &lpn->flags);
459 gdlm_do_unlock(lpn);
460 lp->hold_null = NULL;
461}
462
463 /* Acquire an NL lock because gfs requires the value block to remain
464    intact on the resource while the lvb is "held", even if gfs holds no locks
465 on the resource. */
466
467int gdlm_hold_lvb(lm_lock_t *lock, char **lvbp)
468{
469 struct gdlm_lock *lp = (struct gdlm_lock *) lock;
470 int error;
471
472 error = gdlm_add_lvb(lp);
473 if (error)
474 return error;
475
476 *lvbp = lp->lvb;
477
478 error = hold_null_lock(lp);
479 if (error)
480 gdlm_del_lvb(lp);
481
482 return error;
483}
484
485void gdlm_unhold_lvb(lm_lock_t *lock, char *lvb)
486{
487 struct gdlm_lock *lp = (struct gdlm_lock *) lock;
488
489 unhold_null_lock(lp);
490 gdlm_del_lvb(lp);
491}
492
493void gdlm_sync_lvb(lm_lock_t *lock, char *lvb)
494{
495 struct gdlm_lock *lp = (struct gdlm_lock *) lock;
496
497 if (lp->cur != DLM_LOCK_EX)
498 return;
499
500 init_completion(&lp->ast_wait);
501 set_bit(LFL_SYNC_LVB, &lp->flags);
502
503 lp->req = DLM_LOCK_EX;
504 lp->lkf = make_flags(lp, 0, lp->cur, lp->req);
505
506 gdlm_do_lock(lp);
507 wait_for_completion(&lp->ast_wait);
508}
509
510void gdlm_submit_delayed(struct gdlm_ls *ls)
511{
512 struct gdlm_lock *lp, *safe;
513
514 spin_lock(&ls->async_lock);
515 list_for_each_entry_safe(lp, safe, &ls->delayed, delay_list) {
516 list_del_init(&lp->delay_list);
517 list_add_tail(&lp->delay_list, &ls->submit);
518 }
519 spin_unlock(&ls->async_lock);
520 wake_up(&ls->thread_wait);
521}
522
523int gdlm_release_all_locks(struct gdlm_ls *ls)
524{
525 struct gdlm_lock *lp, *safe;
526 int count = 0;
527
528 spin_lock(&ls->async_lock);
529 list_for_each_entry_safe(lp, safe, &ls->all_locks, all_list) {
530 list_del_init(&lp->all_list);
531
532 if (lp->lvb && lp->lvb != junk_lvb)
533 kfree(lp->lvb);
534 kfree(lp);
535 count++;
536 }
537 spin_unlock(&ls->async_lock);
538
539 return count;
540}
541
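
A couple of worked cases for the state/flag translation above, derived from make_mode(), make_flags() and make_strname() in this file (the numeric values are illustrative):

	/* First request (lksb.sb_lkid == 0) for LM_ST_SHARED with
	   LM_FLAG_TRY: req becomes DLM_LOCK_PR and lkf becomes
	   DLM_LKF_NOQUEUE (plus DLM_LKF_VALBLK if an lvb is attached). */

	/* Converting an existing PR lock to LM_ST_EXCLUSIVE without TRY:
	   req = DLM_LOCK_EX, lkf = DLM_LKF_CONVERT | DLM_LKF_CONVDEADLK,
	   letting the dlm resolve conversion deadlocks. */

	/* make_strname() renders type 0x2 (LM_TYPE_INODE), number 0x805f
	   as the 24-byte resource name "       2            805f". */
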
diff --git a/fs/gfs2/locking/dlm/lock_dlm.h b/fs/gfs2/locking/dlm/lock_dlm.h
new file mode 100644
index 000000000000..530c2f542584
--- /dev/null
+++ b/fs/gfs2/locking/dlm/lock_dlm.h
@@ -0,0 +1,188 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef LOCK_DLM_DOT_H
11#define LOCK_DLM_DOT_H
12
13#include <linux/module.h>
14#include <linux/slab.h>
15#include <linux/spinlock.h>
16#include <linux/module.h>
17#include <linux/types.h>
18#include <linux/string.h>
19#include <linux/list.h>
20#include <linux/socket.h>
21#include <linux/delay.h>
22#include <linux/kthread.h>
23#include <linux/kobject.h>
24#include <linux/fcntl.h>
25#include <linux/wait.h>
26#include <net/sock.h>
27
28#include <linux/dlm.h>
29#include "../../lm_interface.h"
30
31/*
32 * Internally, we prefix things with gdlm_ and GDLM_ (for gfs-dlm) since a
33 * prefix of lock_dlm_ gets awkward. Externally, GFS refers to this module
34 * as "lock_dlm".
35 */
36
37#define GDLM_STRNAME_BYTES 24
38#define GDLM_LVB_SIZE 32
39#define GDLM_DROP_COUNT 50000
40#define GDLM_DROP_PERIOD 60
41#define GDLM_NAME_LEN 128
42
43/* GFS uses 12 bytes to identify a resource (32 bit type + 64 bit number).
44 We sprintf these numbers into a 24 byte string of hex values to make them
45 human-readable (to make debugging simpler). */
46
47struct gdlm_strname {
48 unsigned char name[GDLM_STRNAME_BYTES];
49 unsigned short namelen;
50};
51
52enum {
53 DFL_BLOCK_LOCKS = 0,
54 DFL_SPECTATOR = 1,
55 DFL_WITHDRAW = 2,
56};
57
58struct gdlm_ls {
59 uint32_t id;
60 int jid;
61 int first;
62 int first_done;
63 unsigned long flags;
64 struct kobject kobj;
65 char clustername[GDLM_NAME_LEN];
66 char fsname[GDLM_NAME_LEN];
67 int fsflags;
68 dlm_lockspace_t *dlm_lockspace;
69 lm_callback_t fscb;
70 lm_fsdata_t *fsdata;
71 int recover_jid;
72 int recover_jid_done;
73 int recover_jid_status;
74 spinlock_t async_lock;
75 struct list_head complete;
76 struct list_head blocking;
77 struct list_head delayed;
78 struct list_head submit;
79 struct list_head all_locks;
80 uint32_t all_locks_count;
81 wait_queue_head_t wait_control;
82 struct task_struct *thread1;
83 struct task_struct *thread2;
84 wait_queue_head_t thread_wait;
85 unsigned long drop_time;
86 int drop_locks_count;
87 int drop_locks_period;
88};
89
90enum {
91 LFL_NOBLOCK = 0,
92 LFL_NOCACHE = 1,
93 LFL_DLM_UNLOCK = 2,
94 LFL_DLM_CANCEL = 3,
95 LFL_SYNC_LVB = 4,
96 LFL_FORCE_PROMOTE = 5,
97 LFL_REREQUEST = 6,
98 LFL_ACTIVE = 7,
99 LFL_INLOCK = 8,
100 LFL_CANCEL = 9,
101 LFL_NOBAST = 10,
102 LFL_HEADQUE = 11,
103 LFL_UNLOCK_DELETE = 12,
104};
105
106struct gdlm_lock {
107 struct gdlm_ls *ls;
108 struct lm_lockname lockname;
109 char *lvb;
110 struct dlm_lksb lksb;
111
112 int16_t cur;
113 int16_t req;
114 int16_t prev_req;
115 uint32_t lkf; /* dlm flags DLM_LKF_ */
116 unsigned long flags; /* lock_dlm flags LFL_ */
117
118 int bast_mode; /* protected by async_lock */
119 struct completion ast_wait;
120
121 struct list_head clist; /* complete */
122 struct list_head blist; /* blocking */
123 struct list_head delay_list; /* delayed */
124 struct list_head all_list; /* all locks for the fs */
125 struct gdlm_lock *hold_null; /* NL lock for hold_lvb */
126};
127
128#define gdlm_assert(assertion, fmt, args...) \
129do { \
130 if (unlikely(!(assertion))) { \
131 printk(KERN_EMERG "lock_dlm: fatal assertion failed \"%s\"\n" \
132 "lock_dlm: " fmt "\n", \
133 #assertion, ##args); \
134 BUG(); \
135 } \
136} while (0)
137
138#define log_print(lev, fmt, arg...) printk(lev "lock_dlm: " fmt "\n" , ## arg)
139#define log_info(fmt, arg...) log_print(KERN_INFO , fmt , ## arg)
140#define log_error(fmt, arg...) log_print(KERN_ERR , fmt , ## arg)
141#ifdef LOCK_DLM_LOG_DEBUG
142#define log_debug(fmt, arg...) log_print(KERN_DEBUG , fmt , ## arg)
143#else
144#define log_debug(fmt, arg...)
145#endif
146
147/* sysfs.c */
148
149int gdlm_sysfs_init(void);
150void gdlm_sysfs_exit(void);
151int gdlm_kobject_setup(struct gdlm_ls *, struct kobject *);
152void gdlm_kobject_release(struct gdlm_ls *);
153
154/* thread.c */
155
156int gdlm_init_threads(struct gdlm_ls *);
157void gdlm_release_threads(struct gdlm_ls *);
158
159/* lock.c */
160
161int16_t gdlm_make_lmstate(int16_t);
162void gdlm_queue_delayed(struct gdlm_lock *);
163void gdlm_submit_delayed(struct gdlm_ls *);
164int gdlm_release_all_locks(struct gdlm_ls *);
165void gdlm_delete_lp(struct gdlm_lock *);
166unsigned int gdlm_do_lock(struct gdlm_lock *);
167
168int gdlm_get_lock(lm_lockspace_t *, struct lm_lockname *, lm_lock_t **);
169void gdlm_put_lock(lm_lock_t *);
170unsigned int gdlm_lock(lm_lock_t *, unsigned int, unsigned int, unsigned int);
171unsigned int gdlm_unlock(lm_lock_t *, unsigned int);
172void gdlm_cancel(lm_lock_t *);
173int gdlm_hold_lvb(lm_lock_t *, char **);
174void gdlm_unhold_lvb(lm_lock_t *, char *);
175void gdlm_sync_lvb(lm_lock_t *, char *);
176
177/* plock.c */
178
179int gdlm_plock_init(void);
180void gdlm_plock_exit(void);
181int gdlm_plock(lm_lockspace_t *, struct lm_lockname *, struct file *, int,
182 struct file_lock *);
183int gdlm_plock_get(lm_lockspace_t *, struct lm_lockname *, struct file *,
184 struct file_lock *);
185int gdlm_punlock(lm_lockspace_t *, struct lm_lockname *, struct file *,
186 struct file_lock *);
187#endif
188
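
An informal summary of how the structures above are used, restating the code elsewhere in this patch rather than adding to it:

	/* Every gdlm_lock stays on ls->all_locks for its lifetime and moves
	   across three work lists under ls->async_lock:
	     - submit:   requests to (re)issue to the dlm,
	     - complete: completion asts to report back to GFS (LM_CB_ASYNC),
	     - blocking: basts to report back to GFS (LM_CB_NEED_E/D/S).
	   The two kthreads created in thread.c drain these lists; "delayed"
	   parks requests while DFL_BLOCK_LOCKS is set during recovery, and
	   gdlm_submit_delayed() moves them to "submit" when the block is
	   lifted. */
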
diff --git a/fs/gfs2/locking/dlm/main.c b/fs/gfs2/locking/dlm/main.c
new file mode 100644
index 000000000000..89728c91665f
--- /dev/null
+++ b/fs/gfs2/locking/dlm/main.c
@@ -0,0 +1,64 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/init.h>
11
12#include "lock_dlm.h"
13
14extern int gdlm_drop_count;
15extern int gdlm_drop_period;
16
17extern struct lm_lockops gdlm_ops;
18
19static int __init init_lock_dlm(void)
20{
21 int error;
22
23 error = gfs_register_lockproto(&gdlm_ops);
24 if (error) {
25 printk(KERN_WARNING "lock_dlm: can't register protocol: %d\n",
26 error);
27 return error;
28 }
29
30 error = gdlm_sysfs_init();
31 if (error) {
32 gfs_unregister_lockproto(&gdlm_ops);
33 return error;
34 }
35
36 error = gdlm_plock_init();
37 if (error) {
38 gdlm_sysfs_exit();
39 gfs_unregister_lockproto(&gdlm_ops);
40 return error;
41 }
42
43 gdlm_drop_count = GDLM_DROP_COUNT;
44 gdlm_drop_period = GDLM_DROP_PERIOD;
45
46 printk(KERN_INFO
47 "Lock_DLM (built %s %s) installed\n", __DATE__, __TIME__);
48 return 0;
49}
50
51static void __exit exit_lock_dlm(void)
52{
53 gdlm_plock_exit();
54 gdlm_sysfs_exit();
55 gfs_unregister_lockproto(&gdlm_ops);
56}
57
58module_init(init_lock_dlm);
59module_exit(exit_lock_dlm);
60
61MODULE_DESCRIPTION("GFS DLM Locking Module");
62MODULE_AUTHOR("Red Hat, Inc.");
63MODULE_LICENSE("GPL");
64
diff --git a/fs/gfs2/locking/dlm/mount.c b/fs/gfs2/locking/dlm/mount.c
new file mode 100644
index 000000000000..3caeafc02a1b
--- /dev/null
+++ b/fs/gfs2/locking/dlm/mount.c
@@ -0,0 +1,256 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include "lock_dlm.h"
11
12int gdlm_drop_count;
13int gdlm_drop_period;
14struct lm_lockops gdlm_ops;
15
16
17static struct gdlm_ls *init_gdlm(lm_callback_t cb, lm_fsdata_t *fsdata,
18 int flags, char *table_name)
19{
20 struct gdlm_ls *ls;
21 char buf[256], *p;
22
23 ls = kzalloc(sizeof(struct gdlm_ls), GFP_KERNEL);
24 if (!ls)
25 return NULL;
26
27 ls->drop_locks_count = gdlm_drop_count;
28 ls->drop_locks_period = gdlm_drop_period;
29 ls->fscb = cb;
30 ls->fsdata = fsdata;
31 ls->fsflags = flags;
32 spin_lock_init(&ls->async_lock);
33 INIT_LIST_HEAD(&ls->complete);
34 INIT_LIST_HEAD(&ls->blocking);
35 INIT_LIST_HEAD(&ls->delayed);
36 INIT_LIST_HEAD(&ls->submit);
37 INIT_LIST_HEAD(&ls->all_locks);
38 init_waitqueue_head(&ls->thread_wait);
39 init_waitqueue_head(&ls->wait_control);
40 ls->thread1 = NULL;
41 ls->thread2 = NULL;
42 ls->drop_time = jiffies;
43 ls->jid = -1;
44
45 strncpy(buf, table_name, 256);
46 buf[255] = '\0';
47
48 p = strstr(buf, ":");
49 if (!p) {
50 log_info("invalid table_name \"%s\"", table_name);
51 kfree(ls);
52 return NULL;
53 }
54 *p = '\0';
55 p++;
56
57 strncpy(ls->clustername, buf, GDLM_NAME_LEN);
58 strncpy(ls->fsname, p, GDLM_NAME_LEN);
59
60 return ls;
61}
62
63static int make_args(struct gdlm_ls *ls, char *data_arg, int *nodir)
64{
65 char data[256];
66 char *options, *x, *y;
67 int error = 0;
68
69 memset(data, 0, 256);
70 strncpy(data, data_arg, 255);
71
72 for (options = data; (x = strsep(&options, ":")); ) {
73 if (!*x)
74 continue;
75
76 y = strchr(x, '=');
77 if (y)
78 *y++ = 0;
79
80 if (!strcmp(x, "jid")) {
81 if (!y) {
82 log_error("need argument to jid");
83 error = -EINVAL;
84 break;
85 }
86 sscanf(y, "%u", &ls->jid);
87
88 } else if (!strcmp(x, "first")) {
89 if (!y) {
90 log_error("need argument to first");
91 error = -EINVAL;
92 break;
93 }
94 sscanf(y, "%u", &ls->first);
95
96 } else if (!strcmp(x, "id")) {
97 if (!y) {
98 log_error("need argument to id");
99 error = -EINVAL;
100 break;
101 }
102 sscanf(y, "%u", &ls->id);
103
104 } else if (!strcmp(x, "nodir")) {
105 if (!y) {
106 log_error("need argument to nodir");
107 error = -EINVAL;
108 break;
109 }
110 sscanf(y, "%u", nodir);
111
112 } else {
113 log_error("unknown option: %s", x);
114 error = -EINVAL;
115 break;
116 }
117 }
118
119 return error;
120}
121
122static int gdlm_mount(char *table_name, char *host_data,
123 lm_callback_t cb, lm_fsdata_t *fsdata,
124 unsigned int min_lvb_size, int flags,
125 struct lm_lockstruct *lockstruct,
126 struct kobject *fskobj)
127{
128 struct gdlm_ls *ls;
129 int error = -ENOMEM, nodir = 0;
130
131 if (min_lvb_size > GDLM_LVB_SIZE)
132 goto out;
133
134 ls = init_gdlm(cb, fsdata, flags, table_name);
135 if (!ls)
136 goto out;
137
138 error = make_args(ls, host_data, &nodir);
139 if (error)
140 goto out_free;
141
142 error = gdlm_init_threads(ls);
143 if (error)
144 goto out_free;
145
146 error = gdlm_kobject_setup(ls, fskobj);
147 if (error)
148 goto out_thread;
149
150 error = dlm_new_lockspace(ls->fsname, strlen(ls->fsname),
151 &ls->dlm_lockspace,
152 nodir ? DLM_LSFL_NODIR : 0,
153 GDLM_LVB_SIZE);
154 if (error) {
155 log_error("dlm_new_lockspace error %d", error);
156 goto out_kobj;
157 }
158
159 lockstruct->ls_jid = ls->jid;
160 lockstruct->ls_first = ls->first;
161 lockstruct->ls_lockspace = ls;
162 lockstruct->ls_ops = &gdlm_ops;
163 lockstruct->ls_flags = 0;
164 lockstruct->ls_lvb_size = GDLM_LVB_SIZE;
165 return 0;
166
167 out_kobj:
168 gdlm_kobject_release(ls);
169 out_thread:
170 gdlm_release_threads(ls);
171 out_free:
172 kfree(ls);
173 out:
174 return error;
175}
176
177static void gdlm_unmount(lm_lockspace_t *lockspace)
178{
179 struct gdlm_ls *ls = (struct gdlm_ls *) lockspace;
180 int rv;
181
182 log_debug("unmount flags %lx", ls->flags);
183
184 /* FIXME: serialize unmount and withdraw in case they
185 happen at once. Also, if unmount follows withdraw,
186 wait for withdraw to finish. */
187
188 if (test_bit(DFL_WITHDRAW, &ls->flags))
189 goto out;
190
191 gdlm_kobject_release(ls);
192 dlm_release_lockspace(ls->dlm_lockspace, 2);
193 gdlm_release_threads(ls);
194 rv = gdlm_release_all_locks(ls);
195 if (rv)
196 log_info("gdlm_unmount: %d stray locks freed", rv);
197 out:
198 kfree(ls);
199}
200
201static void gdlm_recovery_done(lm_lockspace_t *lockspace, unsigned int jid,
202 unsigned int message)
203{
204 struct gdlm_ls *ls = (struct gdlm_ls *) lockspace;
205 ls->recover_jid_done = jid;
206 ls->recover_jid_status = message;
207 kobject_uevent(&ls->kobj, KOBJ_CHANGE);
208}
209
210static void gdlm_others_may_mount(lm_lockspace_t *lockspace)
211{
212 struct gdlm_ls *ls = (struct gdlm_ls *) lockspace;
213 ls->first_done = 1;
214 kobject_uevent(&ls->kobj, KOBJ_CHANGE);
215}
216
217/* Userspace gets the offline uevent, blocks new gfs locks on
218 other mounters, and lets us know (sets WITHDRAW flag). Then,
219 userspace leaves the mount group while we leave the lockspace. */
220
221static void gdlm_withdraw(lm_lockspace_t *lockspace)
222{
223 struct gdlm_ls *ls = (struct gdlm_ls *) lockspace;
224
225 kobject_uevent(&ls->kobj, KOBJ_OFFLINE);
226
227 wait_event_interruptible(ls->wait_control,
228 test_bit(DFL_WITHDRAW, &ls->flags));
229
230 dlm_release_lockspace(ls->dlm_lockspace, 2);
231 gdlm_release_threads(ls);
232 gdlm_release_all_locks(ls);
233 gdlm_kobject_release(ls);
234}
235
236struct lm_lockops gdlm_ops = {
237 .lm_proto_name = "lock_dlm",
238 .lm_mount = gdlm_mount,
239 .lm_others_may_mount = gdlm_others_may_mount,
240 .lm_unmount = gdlm_unmount,
241 .lm_withdraw = gdlm_withdraw,
242 .lm_get_lock = gdlm_get_lock,
243 .lm_put_lock = gdlm_put_lock,
244 .lm_lock = gdlm_lock,
245 .lm_unlock = gdlm_unlock,
246 .lm_plock = gdlm_plock,
247 .lm_punlock = gdlm_punlock,
248 .lm_plock_get = gdlm_plock_get,
249 .lm_cancel = gdlm_cancel,
250 .lm_hold_lvb = gdlm_hold_lvb,
251 .lm_unhold_lvb = gdlm_unhold_lvb,
252 .lm_sync_lvb = gdlm_sync_lvb,
253 .lm_recovery_done = gdlm_recovery_done,
254 .lm_owner = THIS_MODULE,
255};
256
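
For reference, the mount arguments parsed above look roughly like this (the values are made up):

	/* table_name: "mycluster:myfs"
	     -> ls->clustername = "mycluster", ls->fsname = "myfs",
	        and "myfs" names the dlm lockspace.

	   host_data:  "jid=1:id=3276802:first=0:nodir=1"
	     -> this node uses journal 1, is not the first mounter, and the
	        lockspace is created with DLM_LSFL_NODIR. The string is
	        assembled by the userspace mount helper, which is assumed
	        here rather than shown in this patch. */
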
diff --git a/fs/gfs2/locking/dlm/plock.c b/fs/gfs2/locking/dlm/plock.c
new file mode 100644
index 000000000000..6adfb2d4fd8c
--- /dev/null
+++ b/fs/gfs2/locking/dlm/plock.c
@@ -0,0 +1,299 @@
1/*
2 * Copyright (C) 2005 Red Hat, Inc. All rights reserved.
3 *
4 * This copyrighted material is made available to anyone wishing to use,
5 * modify, copy, or redistribute it subject to the terms and conditions
6 * of the GNU General Public License v.2.
7 */
8
9#include <linux/miscdevice.h>
10#include <linux/lock_dlm_plock.h>
11
12#include "lock_dlm.h"
13
14
15static spinlock_t ops_lock;
16static struct list_head send_list;
17static struct list_head recv_list;
18static wait_queue_head_t send_wq;
19static wait_queue_head_t recv_wq;
20
21struct plock_op {
22 struct list_head list;
23 int done;
24 struct gdlm_plock_info info;
25};
26
27static inline void set_version(struct gdlm_plock_info *info)
28{
29 info->version[0] = GDLM_PLOCK_VERSION_MAJOR;
30 info->version[1] = GDLM_PLOCK_VERSION_MINOR;
31 info->version[2] = GDLM_PLOCK_VERSION_PATCH;
32}
33
34static int check_version(struct gdlm_plock_info *info)
35{
36 if ((GDLM_PLOCK_VERSION_MAJOR != info->version[0]) ||
37 (GDLM_PLOCK_VERSION_MINOR < info->version[1])) {
38 log_error("plock device version mismatch: "
39 "kernel (%u.%u.%u), user (%u.%u.%u)",
40 GDLM_PLOCK_VERSION_MAJOR,
41 GDLM_PLOCK_VERSION_MINOR,
42 GDLM_PLOCK_VERSION_PATCH,
43 info->version[0],
44 info->version[1],
45 info->version[2]);
46 return -EINVAL;
47 }
48 return 0;
49}
50
51static void send_op(struct plock_op *op)
52{
53 set_version(&op->info);
54 INIT_LIST_HEAD(&op->list);
55 spin_lock(&ops_lock);
56 list_add_tail(&op->list, &send_list);
57 spin_unlock(&ops_lock);
58 wake_up(&send_wq);
59}
60
61int gdlm_plock(lm_lockspace_t *lockspace, struct lm_lockname *name,
62 struct file *file, int cmd, struct file_lock *fl)
63{
64 struct gdlm_ls *ls = (struct gdlm_ls *) lockspace;
65 struct plock_op *op;
66 int rv;
67
68 op = kzalloc(sizeof(*op), GFP_KERNEL);
69 if (!op)
70 return -ENOMEM;
71
72 op->info.optype = GDLM_PLOCK_OP_LOCK;
73 op->info.pid = fl->fl_pid;
74 op->info.ex = (fl->fl_type == F_WRLCK);
75 op->info.wait = IS_SETLKW(cmd);
76 op->info.fsid = ls->id;
77 op->info.number = name->ln_number;
78 op->info.start = fl->fl_start;
79 op->info.end = fl->fl_end;
80
81 send_op(op);
82 wait_event(recv_wq, (op->done != 0));
83
84 spin_lock(&ops_lock);
85 if (!list_empty(&op->list)) {
86 printk(KERN_INFO "plock op on list\n");
87 list_del(&op->list);
88 }
89 spin_unlock(&ops_lock);
90
91 rv = op->info.rv;
92
93 if (!rv) {
94 if (posix_lock_file_wait(file, fl) < 0)
95 log_error("gdlm_plock: vfs lock error %x,%llx",
96 name->ln_type,
97 (unsigned long long)name->ln_number);
98 }
99
100 kfree(op);
101 return rv;
102}
103
104int gdlm_punlock(lm_lockspace_t *lockspace, struct lm_lockname *name,
105 struct file *file, struct file_lock *fl)
106{
107 struct gdlm_ls *ls = (struct gdlm_ls *) lockspace;
108 struct plock_op *op;
109 int rv;
110
111 op = kzalloc(sizeof(*op), GFP_KERNEL);
112 if (!op)
113 return -ENOMEM;
114
115 if (posix_lock_file_wait(file, fl) < 0)
116 log_error("gdlm_punlock: vfs unlock error %x,%llx",
117 name->ln_type, (unsigned long long)name->ln_number);
118
119 op->info.optype = GDLM_PLOCK_OP_UNLOCK;
120 op->info.pid = fl->fl_pid;
121 op->info.fsid = ls->id;
122 op->info.number = name->ln_number;
123 op->info.start = fl->fl_start;
124 op->info.end = fl->fl_end;
125
126 send_op(op);
127 wait_event(recv_wq, (op->done != 0));
128
129 spin_lock(&ops_lock);
130 if (!list_empty(&op->list)) {
131 printk(KERN_INFO "punlock op on list\n");
132 list_del(&op->list);
133 }
134 spin_unlock(&ops_lock);
135
136 rv = op->info.rv;
137
138 kfree(op);
139 return rv;
140}
141
142int gdlm_plock_get(lm_lockspace_t *lockspace, struct lm_lockname *name,
143 struct file *file, struct file_lock *fl)
144{
145 struct gdlm_ls *ls = (struct gdlm_ls *) lockspace;
146 struct plock_op *op;
147 int rv;
148
149 op = kzalloc(sizeof(*op), GFP_KERNEL);
150 if (!op)
151 return -ENOMEM;
152
153 op->info.optype = GDLM_PLOCK_OP_GET;
154 op->info.pid = fl->fl_pid;
155 op->info.ex = (fl->fl_type == F_WRLCK);
156 op->info.fsid = ls->id;
157 op->info.number = name->ln_number;
158 op->info.start = fl->fl_start;
159 op->info.end = fl->fl_end;
160
161 send_op(op);
162 wait_event(recv_wq, (op->done != 0));
163
164 spin_lock(&ops_lock);
165 if (!list_empty(&op->list)) {
166 printk(KERN_INFO "plock_get op on list\n");
167 list_del(&op->list);
168 }
169 spin_unlock(&ops_lock);
170
171 rv = op->info.rv;
172
173 if (rv == 0)
174 fl->fl_type = F_UNLCK;
175 else if (rv > 0) {
176 fl->fl_type = (op->info.ex) ? F_WRLCK : F_RDLCK;
177 fl->fl_pid = op->info.pid;
178 fl->fl_start = op->info.start;
179 fl->fl_end = op->info.end;
180 }
181
182 kfree(op);
183 return rv;
184}
185
186/* a read copies out one plock request from the send list */
187static ssize_t dev_read(struct file *file, char __user *u, size_t count,
188 loff_t *ppos)
189{
190 struct gdlm_plock_info info;
191 struct plock_op *op = NULL;
192
193 if (count < sizeof(info))
194 return -EINVAL;
195
196 spin_lock(&ops_lock);
197 if (!list_empty(&send_list)) {
198 op = list_entry(send_list.next, struct plock_op, list);
199 list_move(&op->list, &recv_list);
200 memcpy(&info, &op->info, sizeof(info));
201 }
202 spin_unlock(&ops_lock);
203
204 if (!op)
205 return -EAGAIN;
206
207 if (copy_to_user(u, &info, sizeof(info)))
208 return -EFAULT;
209 return sizeof(info);
210}
211
212/* a write copies in one plock result that should match a plock_op
213 on the recv list */
214static ssize_t dev_write(struct file *file, const char __user *u, size_t count,
215 loff_t *ppos)
216{
217 struct gdlm_plock_info info;
218 struct plock_op *op;
219 int found = 0;
220
221 if (count != sizeof(info))
222 return -EINVAL;
223
224 if (copy_from_user(&info, u, sizeof(info)))
225 return -EFAULT;
226
227 if (check_version(&info))
228 return -EINVAL;
229
230 spin_lock(&ops_lock);
231 list_for_each_entry(op, &recv_list, list) {
232 if (op->info.fsid == info.fsid &&
233 op->info.number == info.number) {
234 list_del_init(&op->list);
235 found = 1;
236 op->done = 1;
237 memcpy(&op->info, &info, sizeof(info));
238 break;
239 }
240 }
241 spin_unlock(&ops_lock);
242
243 if (found)
244 wake_up(&recv_wq);
245 else
246 printk(KERN_INFO "gdlm dev_write no op %x %llx\n", info.fsid,
247 (unsigned long long)info.number);
248 return count;
249}
250
251static unsigned int dev_poll(struct file *file, poll_table *wait)
252{
253 poll_wait(file, &send_wq, wait);
254
255 spin_lock(&ops_lock);
256 if (!list_empty(&send_list)) {
257 spin_unlock(&ops_lock);
258 return POLLIN | POLLRDNORM;
259 }
260 spin_unlock(&ops_lock);
261 return 0;
262}
263
264static struct file_operations dev_fops = {
265 .read = dev_read,
266 .write = dev_write,
267 .poll = dev_poll,
268 .owner = THIS_MODULE
269};
270
271static struct miscdevice plock_dev_misc = {
272 .minor = MISC_DYNAMIC_MINOR,
273 .name = GDLM_PLOCK_MISC_NAME,
274 .fops = &dev_fops
275};
276
277int gdlm_plock_init(void)
278{
279 int rv;
280
281 spin_lock_init(&ops_lock);
282 INIT_LIST_HEAD(&send_list);
283 INIT_LIST_HEAD(&recv_list);
284 init_waitqueue_head(&send_wq);
285 init_waitqueue_head(&recv_wq);
286
287 rv = misc_register(&plock_dev_misc);
288 if (rv)
289 printk(KERN_INFO "gdlm_plock_init: misc_register failed %d\n",
290 rv);
291 return rv;
292}
293
294void gdlm_plock_exit(void)
295{
296 if (misc_deregister(&plock_dev_misc) < 0)
297 printk(KERN_INFO "gdlm_plock_exit: misc_deregister failed\n");
298}
299
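
The other side of this misc device is a userspace daemon in the cluster suite. A minimal, hypothetical consumer loop is sketched below (the caller is assumed to have opened the device node; error handling is simplified):

	#include <fcntl.h>
	#include <poll.h>
	#include <unistd.h>
	#include <linux/lock_dlm_plock.h>

	static void plock_loop(int fd)
	{
		struct gdlm_plock_info info;
		struct pollfd pfd = { .fd = fd, .events = POLLIN };

		for (;;) {
			poll(&pfd, 1, -1);
			if (read(fd, &info, sizeof(info)) != sizeof(info))
				continue;	/* nothing queued yet */
			/* resolve the posix lock request cluster-wide ... */
			info.rv = 0;		/* 0 = granted, -errno = denied */
			write(fd, &info, sizeof(info));
		}
	}
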
diff --git a/fs/gfs2/locking/dlm/sysfs.c b/fs/gfs2/locking/dlm/sysfs.c
new file mode 100644
index 000000000000..0d8bd0806dba
--- /dev/null
+++ b/fs/gfs2/locking/dlm/sysfs.c
@@ -0,0 +1,225 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/ctype.h>
11#include <linux/stat.h>
12
13#include "lock_dlm.h"
14
15extern struct lm_lockops gdlm_ops;
16
17static ssize_t proto_name_show(struct gdlm_ls *ls, char *buf)
18{
19 return sprintf(buf, "%s\n", gdlm_ops.lm_proto_name);
20}
21
22static ssize_t block_show(struct gdlm_ls *ls, char *buf)
23{
24 ssize_t ret;
25 int val = 0;
26
27 if (test_bit(DFL_BLOCK_LOCKS, &ls->flags))
28 val = 1;
29 ret = sprintf(buf, "%d\n", val);
30 return ret;
31}
32
33static ssize_t block_store(struct gdlm_ls *ls, const char *buf, size_t len)
34{
35 ssize_t ret = len;
36 int val;
37
38 val = simple_strtol(buf, NULL, 0);
39
40 if (val == 1)
41 set_bit(DFL_BLOCK_LOCKS, &ls->flags);
42 else if (val == 0) {
43 clear_bit(DFL_BLOCK_LOCKS, &ls->flags);
44 gdlm_submit_delayed(ls);
45 } else
46 ret = -EINVAL;
47 return ret;
48}
49
50static ssize_t withdraw_show(struct gdlm_ls *ls, char *buf)
51{
52 ssize_t ret;
53 int val = 0;
54
55 if (test_bit(DFL_WITHDRAW, &ls->flags))
56 val = 1;
57 ret = sprintf(buf, "%d\n", val);
58 return ret;
59}
60
61static ssize_t withdraw_store(struct gdlm_ls *ls, const char *buf, size_t len)
62{
63 ssize_t ret = len;
64 int val;
65
66 val = simple_strtol(buf, NULL, 0);
67
68 if (val == 1)
69 set_bit(DFL_WITHDRAW, &ls->flags);
70 else
71 ret = -EINVAL;
72 wake_up(&ls->wait_control);
73 return ret;
74}
75
76static ssize_t id_show(struct gdlm_ls *ls, char *buf)
77{
78 return sprintf(buf, "%u\n", ls->id);
79}
80
81static ssize_t jid_show(struct gdlm_ls *ls, char *buf)
82{
83 return sprintf(buf, "%d\n", ls->jid);
84}
85
86static ssize_t first_show(struct gdlm_ls *ls, char *buf)
87{
88 return sprintf(buf, "%d\n", ls->first);
89}
90
91static ssize_t first_done_show(struct gdlm_ls *ls, char *buf)
92{
93 return sprintf(buf, "%d\n", ls->first_done);
94}
95
96static ssize_t recover_show(struct gdlm_ls *ls, char *buf)
97{
98 return sprintf(buf, "%d\n", ls->recover_jid);
99}
100
101static ssize_t recover_store(struct gdlm_ls *ls, const char *buf, size_t len)
102{
103 ls->recover_jid = simple_strtol(buf, NULL, 0);
104 ls->fscb(ls->fsdata, LM_CB_NEED_RECOVERY, &ls->recover_jid);
105 return len;
106}
107
108static ssize_t recover_done_show(struct gdlm_ls *ls, char *buf)
109{
110 return sprintf(buf, "%d\n", ls->recover_jid_done);
111}
112
113static ssize_t recover_status_show(struct gdlm_ls *ls, char *buf)
114{
115 return sprintf(buf, "%d\n", ls->recover_jid_status);
116}
117
118struct gdlm_attr {
119 struct attribute attr;
120 ssize_t (*show)(struct gdlm_ls *, char *);
121 ssize_t (*store)(struct gdlm_ls *, const char *, size_t);
122};
123
124#define GDLM_ATTR(_name,_mode,_show,_store) \
125static struct gdlm_attr gdlm_attr_##_name = __ATTR(_name,_mode,_show,_store)
126
127GDLM_ATTR(proto_name, 0444, proto_name_show, NULL);
128GDLM_ATTR(block, 0644, block_show, block_store);
129GDLM_ATTR(withdraw, 0644, withdraw_show, withdraw_store);
130GDLM_ATTR(id, 0444, id_show, NULL);
131GDLM_ATTR(jid, 0444, jid_show, NULL);
132GDLM_ATTR(first, 0444, first_show, NULL);
133GDLM_ATTR(first_done, 0444, first_done_show, NULL);
134GDLM_ATTR(recover, 0644, recover_show, recover_store);
135GDLM_ATTR(recover_done, 0444, recover_done_show, NULL);
136GDLM_ATTR(recover_status, 0444, recover_status_show, NULL);
137
138static struct attribute *gdlm_attrs[] = {
139 &gdlm_attr_proto_name.attr,
140 &gdlm_attr_block.attr,
141 &gdlm_attr_withdraw.attr,
142 &gdlm_attr_id.attr,
143 &gdlm_attr_jid.attr,
144 &gdlm_attr_first.attr,
145 &gdlm_attr_first_done.attr,
146 &gdlm_attr_recover.attr,
147 &gdlm_attr_recover_done.attr,
148 &gdlm_attr_recover_status.attr,
149 NULL,
150};
151
152static ssize_t gdlm_attr_show(struct kobject *kobj, struct attribute *attr,
153 char *buf)
154{
155 struct gdlm_ls *ls = container_of(kobj, struct gdlm_ls, kobj);
156 struct gdlm_attr *a = container_of(attr, struct gdlm_attr, attr);
157 return a->show ? a->show(ls, buf) : 0;
158}
159
160static ssize_t gdlm_attr_store(struct kobject *kobj, struct attribute *attr,
161 const char *buf, size_t len)
162{
163 struct gdlm_ls *ls = container_of(kobj, struct gdlm_ls, kobj);
164 struct gdlm_attr *a = container_of(attr, struct gdlm_attr, attr);
165 return a->store ? a->store(ls, buf, len) : len;
166}
167
168static struct sysfs_ops gdlm_attr_ops = {
169 .show = gdlm_attr_show,
170 .store = gdlm_attr_store,
171};
172
173static struct kobj_type gdlm_ktype = {
174 .default_attrs = gdlm_attrs,
175 .sysfs_ops = &gdlm_attr_ops,
176};
177
178static struct kset gdlm_kset = {
179 .subsys = &kernel_subsys,
180 .kobj = {.name = "lock_dlm",},
181 .ktype = &gdlm_ktype,
182};
183
184int gdlm_kobject_setup(struct gdlm_ls *ls, struct kobject *fskobj)
185{
186 int error;
187
188 error = kobject_set_name(&ls->kobj, "%s", "lock_module");
189 if (error) {
190 log_error("can't set kobj name %d", error);
191 return error;
192 }
193
194 ls->kobj.kset = &gdlm_kset;
195 ls->kobj.ktype = &gdlm_ktype;
196 ls->kobj.parent = fskobj;
197
198 error = kobject_register(&ls->kobj);
199 if (error)
200 log_error("can't register kobj %d", error);
201
202 return error;
203}
204
205void gdlm_kobject_release(struct gdlm_ls *ls)
206{
207 kobject_unregister(&ls->kobj);
208}
209
210int gdlm_sysfs_init(void)
211{
212 int error;
213
214 error = kset_register(&gdlm_kset);
215 if (error)
216 printk("lock_dlm: cannot register kset %d\n", error);
217
218 return error;
219}
220
221void gdlm_sysfs_exit(void)
222{
223 kset_unregister(&gdlm_kset);
224}
225
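
The files registered above end up under the per-filesystem kobject that gfs2 passes in as fskobj. The recovery flow they implement, roughly (the layout shown is illustrative):

	/* <fskobj>/lock_module/block, withdraw, id, jid, first, first_done,
	   <fskobj>/lock_module/recover, recover_done, recover_status

	   Cluster userspace writes a journal id to "recover"; recover_store()
	   forwards it to gfs2 via the LM_CB_NEED_RECOVERY callback. When
	   journal recovery finishes, gfs2 calls lm_recovery_done(), and
	   gdlm_recovery_done() (mount.c) records the result so userspace can
	   read it back from recover_done/recover_status after the change
	   uevent. */
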
diff --git a/fs/gfs2/locking/dlm/thread.c b/fs/gfs2/locking/dlm/thread.c
new file mode 100644
index 000000000000..489235b2edba
--- /dev/null
+++ b/fs/gfs2/locking/dlm/thread.c
@@ -0,0 +1,359 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include "lock_dlm.h"
11
12/* A lock placed on this queue is re-submitted to DLM as soon as the lock_dlm
13 thread gets to it. */
14
15static void queue_submit(struct gdlm_lock *lp)
16{
17 struct gdlm_ls *ls = lp->ls;
18
19 spin_lock(&ls->async_lock);
20 list_add_tail(&lp->delay_list, &ls->submit);
21 spin_unlock(&ls->async_lock);
22 wake_up(&ls->thread_wait);
23}
24
25static void process_blocking(struct gdlm_lock *lp, int bast_mode)
26{
27 struct gdlm_ls *ls = lp->ls;
28 unsigned int cb = 0;
29
30 switch (gdlm_make_lmstate(bast_mode)) {
31 case LM_ST_EXCLUSIVE:
32 cb = LM_CB_NEED_E;
33 break;
34 case LM_ST_DEFERRED:
35 cb = LM_CB_NEED_D;
36 break;
37 case LM_ST_SHARED:
38 cb = LM_CB_NEED_S;
39 break;
40 default:
41 gdlm_assert(0, "unknown bast mode %u", lp->bast_mode);
42 }
43
44 ls->fscb(ls->fsdata, cb, &lp->lockname);
45}
46
47static void process_complete(struct gdlm_lock *lp)
48{
49 struct gdlm_ls *ls = lp->ls;
50 struct lm_async_cb acb;
51 int16_t prev_mode = lp->cur;
52
53 memset(&acb, 0, sizeof(acb));
54
55 if (lp->lksb.sb_status == -DLM_ECANCEL) {
56 log_info("complete dlm cancel %x,%llx flags %lx",
57 lp->lockname.ln_type,
58 (unsigned long long)lp->lockname.ln_number,
59 lp->flags);
60
61 lp->req = lp->cur;
62 acb.lc_ret |= LM_OUT_CANCELED;
63 if (lp->cur == DLM_LOCK_IV)
64 lp->lksb.sb_lkid = 0;
65 goto out;
66 }
67
68 if (test_and_clear_bit(LFL_DLM_UNLOCK, &lp->flags)) {
69 if (lp->lksb.sb_status != -DLM_EUNLOCK) {
70 log_info("unlock sb_status %d %x,%llx flags %lx",
71 lp->lksb.sb_status, lp->lockname.ln_type,
72 (unsigned long long)lp->lockname.ln_number,
73 lp->flags);
74 return;
75 }
76
77 lp->cur = DLM_LOCK_IV;
78 lp->req = DLM_LOCK_IV;
79 lp->lksb.sb_lkid = 0;
80
81 if (test_and_clear_bit(LFL_UNLOCK_DELETE, &lp->flags)) {
82 gdlm_delete_lp(lp);
83 return;
84 }
85 goto out;
86 }
87
88 if (lp->lksb.sb_flags & DLM_SBF_VALNOTVALID)
89 memset(lp->lksb.sb_lvbptr, 0, GDLM_LVB_SIZE);
90
91 if (lp->lksb.sb_flags & DLM_SBF_ALTMODE) {
92 if (lp->req == DLM_LOCK_PR)
93 lp->req = DLM_LOCK_CW;
94 else if (lp->req == DLM_LOCK_CW)
95 lp->req = DLM_LOCK_PR;
96 }
97
98 /*
99 * A canceled lock request. The lock was just taken off the delayed
100 * list and was never even submitted to dlm.
101 */
102
103 if (test_and_clear_bit(LFL_CANCEL, &lp->flags)) {
104 log_info("complete internal cancel %x,%llx",
105 lp->lockname.ln_type,
106 (unsigned long long)lp->lockname.ln_number);
107 lp->req = lp->cur;
108 acb.lc_ret |= LM_OUT_CANCELED;
109 goto out;
110 }
111
112 /*
113 * An error occurred.
114 */
115
116 if (lp->lksb.sb_status) {
117 /* a "normal" error */
118 if ((lp->lksb.sb_status == -EAGAIN) &&
119 (lp->lkf & DLM_LKF_NOQUEUE)) {
120 lp->req = lp->cur;
121 if (lp->cur == DLM_LOCK_IV)
122 lp->lksb.sb_lkid = 0;
123 goto out;
124 }
125
126 /* this could only happen with cancels I think */
127 log_info("ast sb_status %d %x,%llx flags %lx",
128 lp->lksb.sb_status, lp->lockname.ln_type,
129 (unsigned long long)lp->lockname.ln_number,
130 lp->flags);
131 return;
132 }
133
134 /*
135 * This is an AST for an EX->EX conversion for sync_lvb from GFS.
136 */
137
138 if (test_and_clear_bit(LFL_SYNC_LVB, &lp->flags)) {
139 complete(&lp->ast_wait);
140 return;
141 }
142
143 /*
144 * A lock has been demoted to NL because it initially completed during
145 * BLOCK_LOCKS. Now it must be requested in the originally requested
146 * mode.
147 */
148
149 if (test_and_clear_bit(LFL_REREQUEST, &lp->flags)) {
150 gdlm_assert(lp->req == DLM_LOCK_NL, "%x,%llx",
151 lp->lockname.ln_type,
152 (unsigned long long)lp->lockname.ln_number);
153 gdlm_assert(lp->prev_req > DLM_LOCK_NL, "%x,%llx",
154 lp->lockname.ln_type,
155 (unsigned long long)lp->lockname.ln_number);
156
157 lp->cur = DLM_LOCK_NL;
158 lp->req = lp->prev_req;
159 lp->prev_req = DLM_LOCK_IV;
160 lp->lkf &= ~DLM_LKF_CONVDEADLK;
161
162 set_bit(LFL_NOCACHE, &lp->flags);
163
164 if (test_bit(DFL_BLOCK_LOCKS, &ls->flags) &&
165 !test_bit(LFL_NOBLOCK, &lp->flags))
166 gdlm_queue_delayed(lp);
167 else
168 queue_submit(lp);
169 return;
170 }
171
172 /*
173 * A request is granted during dlm recovery. It may be granted
174 * because the locks of a failed node were cleared. In that case,
175 * there may be inconsistent data beneath this lock and we must wait
176 * for recovery to complete to use it. When gfs recovery is done this
177 * granted lock will be converted to NL and then reacquired in this
178 * granted state.
179 */
180
181 if (test_bit(DFL_BLOCK_LOCKS, &ls->flags) &&
182 !test_bit(LFL_NOBLOCK, &lp->flags) &&
183 lp->req != DLM_LOCK_NL) {
184
185 lp->cur = lp->req;
186 lp->prev_req = lp->req;
187 lp->req = DLM_LOCK_NL;
188 lp->lkf |= DLM_LKF_CONVERT;
189 lp->lkf &= ~DLM_LKF_CONVDEADLK;
190
191 log_debug("rereq %x,%llx id %x %d,%d",
192 lp->lockname.ln_type,
193 (unsigned long long)lp->lockname.ln_number,
194 lp->lksb.sb_lkid, lp->cur, lp->req);
195
196 set_bit(LFL_REREQUEST, &lp->flags);
197 queue_submit(lp);
198 return;
199 }
200
201 /*
202 * DLM demoted the lock to NL before it was granted so GFS must be
203 * told it cannot cache data for this lock.
204 */
205
206 if (lp->lksb.sb_flags & DLM_SBF_DEMOTED)
207 set_bit(LFL_NOCACHE, &lp->flags);
208
209 out:
210 /*
211 * This is an internal lock_dlm lock
212 */
213
214 if (test_bit(LFL_INLOCK, &lp->flags)) {
215 clear_bit(LFL_NOBLOCK, &lp->flags);
216 lp->cur = lp->req;
217 complete(&lp->ast_wait);
218 return;
219 }
220
221 /*
222 * Normal completion of a lock request. Tell GFS it now has the lock.
223 */
224
225 clear_bit(LFL_NOBLOCK, &lp->flags);
226 lp->cur = lp->req;
227
228 acb.lc_name = lp->lockname;
229 acb.lc_ret |= gdlm_make_lmstate(lp->cur);
230
231 if (!test_and_clear_bit(LFL_NOCACHE, &lp->flags) &&
232 (lp->cur > DLM_LOCK_NL) && (prev_mode > DLM_LOCK_NL))
233 acb.lc_ret |= LM_OUT_CACHEABLE;
234
235 ls->fscb(ls->fsdata, LM_CB_ASYNC, &acb);
236}
237
238static inline int no_work(struct gdlm_ls *ls, int blocking)
239{
240 int ret;
241
242 spin_lock(&ls->async_lock);
243 ret = list_empty(&ls->complete) && list_empty(&ls->submit);
244 if (ret && blocking)
245 ret = list_empty(&ls->blocking);
246 spin_unlock(&ls->async_lock);
247
248 return ret;
249}
250
251static inline int check_drop(struct gdlm_ls *ls)
252{
253 if (!ls->drop_locks_count)
254 return 0;
255
256 if (time_after(jiffies, ls->drop_time + ls->drop_locks_period * HZ)) {
257 ls->drop_time = jiffies;
258 if (ls->all_locks_count >= ls->drop_locks_count)
259 return 1;
260 }
261 return 0;
262}
263
264static int gdlm_thread(void *data)
265{
266 struct gdlm_ls *ls = (struct gdlm_ls *) data;
267 struct gdlm_lock *lp = NULL;
268 int blist = 0;
269 uint8_t complete, blocking, submit, drop;
270 DECLARE_WAITQUEUE(wait, current);
271
272 /* Only thread1 is allowed to do blocking callbacks since gfs
273 may wait for a completion callback within a blocking cb. */
274
275 if (current == ls->thread1)
276 blist = 1;
277
278 while (!kthread_should_stop()) {
279 set_current_state(TASK_INTERRUPTIBLE);
280 add_wait_queue(&ls->thread_wait, &wait);
281 if (no_work(ls, blist))
282 schedule();
283 remove_wait_queue(&ls->thread_wait, &wait);
284 set_current_state(TASK_RUNNING);
285
286 complete = blocking = submit = drop = 0;
287
288 spin_lock(&ls->async_lock);
289
290 if (blist && !list_empty(&ls->blocking)) {
291 lp = list_entry(ls->blocking.next, struct gdlm_lock,
292 blist);
293 list_del_init(&lp->blist);
294 blocking = lp->bast_mode;
295 lp->bast_mode = 0;
296 } else if (!list_empty(&ls->complete)) {
297 lp = list_entry(ls->complete.next, struct gdlm_lock,
298 clist);
299 list_del_init(&lp->clist);
300 complete = 1;
301 } else if (!list_empty(&ls->submit)) {
302 lp = list_entry(ls->submit.next, struct gdlm_lock,
303 delay_list);
304 list_del_init(&lp->delay_list);
305 submit = 1;
306 }
307
308 drop = check_drop(ls);
309 spin_unlock(&ls->async_lock);
310
311 if (complete)
312 process_complete(lp);
313
314 else if (blocking)
315 process_blocking(lp, blocking);
316
317 else if (submit)
318 gdlm_do_lock(lp);
319
320 if (drop)
321 ls->fscb(ls->fsdata, LM_CB_DROPLOCKS, NULL);
322
323 schedule();
324 }
325
326 return 0;
327}
328
329int gdlm_init_threads(struct gdlm_ls *ls)
330{
331 struct task_struct *p;
332 int error;
333
334 p = kthread_run(gdlm_thread, ls, "lock_dlm1");
335	error = IS_ERR(p) ? PTR_ERR(p) : 0;
336 if (error) {
337 log_error("can't start lock_dlm1 thread %d", error);
338 return error;
339 }
340 ls->thread1 = p;
341
342 p = kthread_run(gdlm_thread, ls, "lock_dlm2");
343	error = IS_ERR(p) ? PTR_ERR(p) : 0;
344 if (error) {
345 log_error("can't start lock_dlm2 thread %d", error);
346 kthread_stop(ls->thread1);
347 return error;
348 }
349 ls->thread2 = p;
350
351 return 0;
352}
353
354void gdlm_release_threads(struct gdlm_ls *ls)
355{
356 kthread_stop(ls->thread1);
357 kthread_stop(ls->thread2);
358}
359
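The two lock_dlm threads created above share this dispatch loop; only thread1 takes items off the blocking list because GFS may wait for a completion callback from inside a blocking callback, so the second thread must stay free to deliver completions. A rough userspace analogue of that split, shown purely as an illustration (pthreads, invented names, not lock_dlm code):

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

/* Illustrative work counters: "blocking" work is only taken by thread 1. */
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t work_ready = PTHREAD_COND_INITIALIZER;
static int blocking_work, other_work, shutting_down;

static void *worker(void *arg)
{
	int is_thread1 = (int)(long)arg;	/* 1 for the thread allowed to block */

	for (;;) {
		int do_blocking = 0, do_other = 0;

		pthread_mutex_lock(&lock);
		while (!shutting_down && !other_work &&
		       !(is_thread1 && blocking_work))
			pthread_cond_wait(&work_ready, &lock);
		if (shutting_down) {
			pthread_mutex_unlock(&lock);
			break;
		}
		if (is_thread1 && blocking_work) {
			blocking_work--;
			do_blocking = 1;
		} else if (other_work) {
			other_work--;
			do_other = 1;
		}
		pthread_mutex_unlock(&lock);

		if (do_blocking)
			printf("thread%d: blocking callback\n", is_thread1 + 1);
		else if (do_other)
			printf("thread%d: completion/submit\n", is_thread1 + 1);
	}
	return NULL;
}

int main(void)
{
	pthread_t t1, t2;

	pthread_create(&t1, NULL, worker, (void *)1L);
	pthread_create(&t2, NULL, worker, (void *)0L);

	pthread_mutex_lock(&lock);
	blocking_work = 1;
	other_work = 2;
	pthread_cond_broadcast(&work_ready);
	pthread_mutex_unlock(&lock);

	sleep(1);
	pthread_mutex_lock(&lock);
	shutting_down = 1;
	pthread_cond_broadcast(&work_ready);
	pthread_mutex_unlock(&lock);

	pthread_join(t1, NULL);
	pthread_join(t2, NULL);
	return 0;
}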
diff --git a/fs/gfs2/locking/nolock/Makefile b/fs/gfs2/locking/nolock/Makefile
new file mode 100644
index 000000000000..cdadf956c831
--- /dev/null
+++ b/fs/gfs2/locking/nolock/Makefile
@@ -0,0 +1,3 @@
1obj-$(CONFIG_GFS2_FS) += lock_nolock.o
2lock_nolock-y := main.o
3
diff --git a/fs/gfs2/locking/nolock/main.c b/fs/gfs2/locking/nolock/main.c
new file mode 100644
index 000000000000..97ffac5cdefb
--- /dev/null
+++ b/fs/gfs2/locking/nolock/main.c
@@ -0,0 +1,259 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/module.h>
11#include <linux/slab.h>
12#include <linux/string.h>
13#include <linux/init.h>
14#include <linux/types.h>
15#include <linux/fs.h>
16#include <linux/smp_lock.h>
17
18#include "../../lm_interface.h"
19
20struct nolock_lockspace {
21 unsigned int nl_lvb_size;
22};
23
24static struct lm_lockops nolock_ops;
25
26static int nolock_mount(char *table_name, char *host_data,
27 lm_callback_t cb, lm_fsdata_t *fsdata,
28 unsigned int min_lvb_size, int flags,
29 struct lm_lockstruct *lockstruct,
30 struct kobject *fskobj)
31{
32 char *c;
33 unsigned int jid;
34 struct nolock_lockspace *nl;
35
36 c = strstr(host_data, "jid=");
37 if (!c)
38 jid = 0;
39 else {
40 c += 4;
41 sscanf(c, "%u", &jid);
42 }
43
44 nl = kzalloc(sizeof(struct nolock_lockspace), GFP_KERNEL);
45 if (!nl)
46 return -ENOMEM;
47
48 nl->nl_lvb_size = min_lvb_size;
49
50 lockstruct->ls_jid = jid;
51 lockstruct->ls_first = 1;
52 lockstruct->ls_lvb_size = min_lvb_size;
53 lockstruct->ls_lockspace = (lm_lockspace_t *)nl;
54 lockstruct->ls_ops = &nolock_ops;
55 lockstruct->ls_flags = LM_LSFLAG_LOCAL;
56
57 return 0;
58}
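The mount hook above pulls an optional journal id out of the host_data string with strstr()/sscanf(), defaulting to 0 when no "jid=" token is present. A stand-alone sketch of that parsing, with a made-up host_data value, looks like this:

#include <stdio.h>
#include <string.h>

int main(void)
{
	const char *host_data = "jid=2";	/* hypothetical mount data */
	unsigned int jid = 0;			/* default when no jid= is given */
	const char *c = strstr(host_data, "jid=");

	if (c)
		sscanf(c + 4, "%u", &jid);
	printf("journal id = %u\n", jid);
	return 0;
}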
59
60static void nolock_others_may_mount(lm_lockspace_t *lockspace)
61{
62}
63
64static void nolock_unmount(lm_lockspace_t *lockspace)
65{
66 struct nolock_lockspace *nl = (struct nolock_lockspace *)lockspace;
67 kfree(nl);
68}
69
70static void nolock_withdraw(lm_lockspace_t *lockspace)
71{
72}
73
74/**
75 * nolock_get_lock - get an lm_lock_t given a description of the lock
76 * @lockspace: the lockspace the lock lives in
77 * @name: the name of the lock
78 * @lockp: return the lm_lock_t here
79 *
80 * Returns: 0 on success, -EXXX on failure
81 */
82
83static int nolock_get_lock(lm_lockspace_t *lockspace, struct lm_lockname *name,
84 lm_lock_t **lockp)
85{
86 *lockp = (lm_lock_t *)lockspace;
87 return 0;
88}
89
90/**
91 * nolock_put_lock - get rid of a lock structure
92 * @lock: the lock to throw away
93 *
94 */
95
96static void nolock_put_lock(lm_lock_t *lock)
97{
98}
99
100/**
101 * nolock_lock - acquire a lock
102 * @lock: the lock to manipulate
103 * @cur_state: the current state
104 * @req_state: the requested state
105 * @flags: modifier flags
106 *
107 * Returns: A bitmap of LM_OUT_*
108 */
109
110static unsigned int nolock_lock(lm_lock_t *lock, unsigned int cur_state,
111 unsigned int req_state, unsigned int flags)
112{
113 return req_state | LM_OUT_CACHEABLE;
114}
115
116/**
117 * nolock_unlock - unlock a lock
118 * @lock: the lock to manipulate
119 * @cur_state: the current state
120 *
121 * Returns: 0
122 */
123
124static unsigned int nolock_unlock(lm_lock_t *lock, unsigned int cur_state)
125{
126 return 0;
127}
128
129static void nolock_cancel(lm_lock_t *lock)
130{
131}
132
133/**
134 * nolock_hold_lvb - hold on to a lock value block
135 * @lock: the lock the LVB is associated with
136 * @lvbp: return the lm_lvb_t here
137 *
138 * Returns: 0 on success, -EXXX on failure
139 */
140
141static int nolock_hold_lvb(lm_lock_t *lock, char **lvbp)
142{
143 struct nolock_lockspace *nl = (struct nolock_lockspace *)lock;
144 int error = 0;
145
146 *lvbp = kzalloc(nl->nl_lvb_size, GFP_KERNEL);
147 if (!*lvbp)
148 error = -ENOMEM;
149
150 return error;
151}
152
153/**
154 * nolock_unhold_lvb - release a LVB
155 * @lock: the lock the LVB is associated with
156 * @lvb: the lock value block
157 *
158 */
159
160static void nolock_unhold_lvb(lm_lock_t *lock, char *lvb)
161{
162 kfree(lvb);
163}
164
165/**
166 * nolock_sync_lvb - sync out the value of an LVB
167 * @lock: the lock the LVB is associated with
168 * @lvb: the lock value block
169 *
170 */
171
172static void nolock_sync_lvb(lm_lock_t *lock, char *lvb)
173{
174}
175
176static int nolock_plock_get(lm_lockspace_t *lockspace, struct lm_lockname *name,
177 struct file *file, struct file_lock *fl)
178{
179 struct file_lock tmp;
180 int ret;
181
182 ret = posix_test_lock(file, fl, &tmp);
183 fl->fl_type = F_UNLCK;
184 if (ret)
185 memcpy(fl, &tmp, sizeof(struct file_lock));
186
187 return 0;
188}
189
190static int nolock_plock(lm_lockspace_t *lockspace, struct lm_lockname *name,
191 struct file *file, int cmd, struct file_lock *fl)
192{
193 int error;
194 error = posix_lock_file_wait(file, fl);
195 return error;
196}
197
198static int nolock_punlock(lm_lockspace_t *lockspace, struct lm_lockname *name,
199 struct file *file, struct file_lock *fl)
200{
201 int error;
202 error = posix_lock_file_wait(file, fl);
203 return error;
204}
205
206static void nolock_recovery_done(lm_lockspace_t *lockspace, unsigned int jid,
207 unsigned int message)
208{
209}
210
211static struct lm_lockops nolock_ops = {
212 .lm_proto_name = "lock_nolock",
213 .lm_mount = nolock_mount,
214 .lm_others_may_mount = nolock_others_may_mount,
215 .lm_unmount = nolock_unmount,
216 .lm_withdraw = nolock_withdraw,
217 .lm_get_lock = nolock_get_lock,
218 .lm_put_lock = nolock_put_lock,
219 .lm_lock = nolock_lock,
220 .lm_unlock = nolock_unlock,
221 .lm_cancel = nolock_cancel,
222 .lm_hold_lvb = nolock_hold_lvb,
223 .lm_unhold_lvb = nolock_unhold_lvb,
224 .lm_sync_lvb = nolock_sync_lvb,
225 .lm_plock_get = nolock_plock_get,
226 .lm_plock = nolock_plock,
227 .lm_punlock = nolock_punlock,
228 .lm_recovery_done = nolock_recovery_done,
229 .lm_owner = THIS_MODULE,
230};
231
232static int __init init_nolock(void)
233{
234 int error;
235
236 error = gfs_register_lockproto(&nolock_ops);
237 if (error) {
238 printk(KERN_WARNING
239 "lock_nolock: can't register protocol: %d\n", error);
240 return error;
241 }
242
243 printk(KERN_INFO
244 "Lock_Nolock (built %s %s) installed\n", __DATE__, __TIME__);
245 return 0;
246}
247
248static void __exit exit_nolock(void)
249{
250 gfs_unregister_lockproto(&nolock_ops);
251}
252
253module_init(init_nolock);
254module_exit(exit_nolock);
255
256MODULE_DESCRIPTION("GFS Nolock Locking Module");
257MODULE_AUTHOR("Red Hat, Inc.");
258MODULE_LICENSE("GPL");
259
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
new file mode 100644
index 000000000000..1596e9436c42
--- /dev/null
+++ b/fs/gfs2/log.c
@@ -0,0 +1,601 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/gfs2_ondisk.h>
16#include <linux/crc32.h>
17
18#include "gfs2.h"
19#include "lm_interface.h"
20#include "incore.h"
21#include "bmap.h"
22#include "glock.h"
23#include "log.h"
24#include "lops.h"
25#include "meta_io.h"
26#include "util.h"
27#include "dir.h"
28
29#define PULL 1
30
31/**
32 * gfs2_struct2blk - compute number of log descriptor blocks needed
33 * @sdp: the filesystem
34 * @nstruct: the number of structures
35 * @ssize: the size of the structures
36 *
37 * Compute the number of log descriptor blocks needed to hold a certain number
38 * of structures of a certain size.
39 *
40 * Returns: the number of blocks needed (minimum is always 1)
41 */
42
43unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct,
44 unsigned int ssize)
45{
46 unsigned int blks;
47 unsigned int first, second;
48
49 blks = 1;
50 first = (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_log_descriptor)) /
51 ssize;
52
53 if (nstruct > first) {
54 second = (sdp->sd_sb.sb_bsize -
55 sizeof(struct gfs2_meta_header)) / ssize;
56 blks += DIV_ROUND_UP(nstruct - first, second);
57 }
58
59 return blks;
60}
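As a worked example of the calculation above, with illustrative sizes (4096-byte blocks and a 72-byte descriptor, as the "4k blocks, limit = 503" comment elsewhere in this commit implies, plus a 24-byte meta header assumed for the continuation blocks): the first block holds (4096 - 72) / 8 = 503 eight-byte revoke entries, each further block holds (4096 - 24) / 8 = 509, so 1000 revokes need 1 + ceil((1000 - 503) / 509) = 2 blocks. The same arithmetic stand-alone:

#include <stdio.h>

/* Same arithmetic as gfs2_struct2blk(), with illustrative on-disk sizes. */
static unsigned int struct2blk(unsigned int bsize, unsigned int desc_size,
			       unsigned int mh_size, unsigned int nstruct,
			       unsigned int ssize)
{
	unsigned int blks = 1;
	unsigned int first = (bsize - desc_size) / ssize;
	unsigned int second = (bsize - mh_size) / ssize;

	if (nstruct > first)
		blks += (nstruct - first + second - 1) / second; /* DIV_ROUND_UP */
	return blks;
}

int main(void)
{
	/* 4k blocks, 8-byte revoke entries: 1000 revokes -> 2 log blocks */
	printf("%u\n", struct2blk(4096, 72, 24, 1000, 8));
	return 0;
}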
61
62void gfs2_ail1_start(struct gfs2_sbd *sdp, int flags)
63{
64 struct list_head *head = &sdp->sd_ail1_list;
65 uint64_t sync_gen;
66 struct list_head *first, *tmp;
67 struct gfs2_ail *first_ai, *ai;
68
69 gfs2_log_lock(sdp);
70 if (list_empty(head)) {
71 gfs2_log_unlock(sdp);
72 return;
73 }
74 sync_gen = sdp->sd_ail_sync_gen++;
75
76 first = head->prev;
77 first_ai = list_entry(first, struct gfs2_ail, ai_list);
78 first_ai->ai_sync_gen = sync_gen;
79 gfs2_ail1_start_one(sdp, first_ai);
80
81 if (flags & DIO_ALL)
82 first = NULL;
83
84 for (;;) {
85 if (first && (head->prev != first ||
86 gfs2_ail1_empty_one(sdp, first_ai, 0)))
87 break;
88
89 for (tmp = head->prev; tmp != head; tmp = tmp->prev) {
90 ai = list_entry(tmp, struct gfs2_ail, ai_list);
91 if (ai->ai_sync_gen >= sync_gen)
92 continue;
93 ai->ai_sync_gen = sync_gen;
94 gfs2_ail1_start_one(sdp, ai);
95 break;
96 }
97
98 if (tmp == head)
99 break;
100 }
101
102 gfs2_log_unlock(sdp);
103}
104
105int gfs2_ail1_empty(struct gfs2_sbd *sdp, int flags)
106{
107 struct gfs2_ail *ai, *s;
108 int ret;
109
110 gfs2_log_lock(sdp);
111
112 list_for_each_entry_safe_reverse(ai, s, &sdp->sd_ail1_list, ai_list) {
113 if (gfs2_ail1_empty_one(sdp, ai, flags))
114 list_move(&ai->ai_list, &sdp->sd_ail2_list);
115 else if (!(flags & DIO_ALL))
116 break;
117 }
118
119 ret = list_empty(&sdp->sd_ail1_list);
120
121 gfs2_log_unlock(sdp);
122
123 return ret;
124}
125
126static void ail2_empty(struct gfs2_sbd *sdp, unsigned int new_tail)
127{
128 struct gfs2_ail *ai, *safe;
129 unsigned int old_tail = sdp->sd_log_tail;
130 int wrap = (new_tail < old_tail);
131 int a, b, rm;
132
133 gfs2_log_lock(sdp);
134
135 list_for_each_entry_safe(ai, safe, &sdp->sd_ail2_list, ai_list) {
136 a = (old_tail <= ai->ai_first);
137 b = (ai->ai_first < new_tail);
138 rm = (wrap) ? (a || b) : (a && b);
139 if (!rm)
140 continue;
141
142 gfs2_ail2_empty_one(sdp, ai);
143 list_del(&ai->ai_list);
144 gfs2_assert_warn(sdp, list_empty(&ai->ai_ail1_list));
145 gfs2_assert_warn(sdp, list_empty(&ai->ai_ail2_list));
146 kfree(ai);
147 }
148
149 gfs2_log_unlock(sdp);
150}
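The removal test in ail2_empty() above is a circular-range check: an AIL2 entry is discarded when its first block falls in the half-open interval [old_tail, new_tail) measured around the journal, with the wrap flag selecting the or/and form of the test. A tiny stand-alone version of that test (journal size and sample values invented for the illustration):

#include <stdio.h>

/* Is blk inside [old_tail, new_tail) on a circular journal? */
static int in_pulled_range(unsigned int old_tail, unsigned int new_tail,
			   unsigned int blk)
{
	int wrap = new_tail < old_tail;
	int a = old_tail <= blk;
	int b = blk < new_tail;

	return wrap ? (a || b) : (a && b);
}

int main(void)
{
	/* journal of 100 blocks, tail pulled from 90 to 10 (wrapping) */
	printf("%d %d %d\n",
	       in_pulled_range(90, 10, 95),	/* 1: removed */
	       in_pulled_range(90, 10, 5),	/* 1: removed */
	       in_pulled_range(90, 10, 50));	/* 0: kept */
	return 0;
}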
151
152/**
153 * gfs2_log_reserve - Make a log reservation
154 * @sdp: The GFS2 superblock
155 * @blks: The number of blocks to reserve
156 *
157 * Returns: errno
158 */
159
160int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks)
161{
162 unsigned int try = 0;
163
164 if (gfs2_assert_warn(sdp, blks) ||
165 gfs2_assert_warn(sdp, blks <= sdp->sd_jdesc->jd_blocks))
166 return -EINVAL;
167
168 mutex_lock(&sdp->sd_log_reserve_mutex);
169 gfs2_log_lock(sdp);
170 while(sdp->sd_log_blks_free <= blks) {
171 gfs2_log_unlock(sdp);
172 gfs2_ail1_empty(sdp, 0);
173 gfs2_log_flush(sdp, NULL);
174
175 if (try++)
176 gfs2_ail1_start(sdp, 0);
177 gfs2_log_lock(sdp);
178 }
179 sdp->sd_log_blks_free -= blks;
180 /* printk(KERN_INFO "reserved %u blocks (%u left)\n", blks, sdp->sd_log_blks_free); */
181 gfs2_log_unlock(sdp);
182 mutex_unlock(&sdp->sd_log_reserve_mutex);
183
184 down_read(&sdp->sd_log_flush_lock);
185
186 return 0;
187}
188
189/**
190 * gfs2_log_release - Release a given number of log blocks
191 * @sdp: The GFS2 superblock
192 * @blks: The number of blocks
193 *
194 */
195
196void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks)
197{
198
199 gfs2_log_lock(sdp);
200 sdp->sd_log_blks_free += blks;
201 /* printk(KERN_INFO "released %u blocks (%u left)\n", blks, sdp->sd_log_blks_free); */
202 gfs2_assert_withdraw(sdp,
203 sdp->sd_log_blks_free <= sdp->sd_jdesc->jd_blocks);
204 gfs2_log_unlock(sdp);
205 up_read(&sdp->sd_log_flush_lock);
206}
207
208static uint64_t log_bmap(struct gfs2_sbd *sdp, unsigned int lbn)
209{
210 int new = 0;
211 uint64_t dbn;
212 int error;
213 int bdy;
214
215 error = gfs2_block_map(sdp->sd_jdesc->jd_inode, lbn, &new, &dbn, &bdy);
216 if (!(!error && dbn)) {
217 printk(KERN_INFO "error=%d, dbn=%llu lbn=%u", error, (unsigned long long)dbn, lbn);
218 }
219 gfs2_assert_withdraw(sdp, !error && dbn);
220
221 return dbn;
222}
223
224/**
225 * log_distance - Compute distance between two journal blocks
226 * @sdp: The GFS2 superblock
227 * @newer: The most recent journal block of the pair
228 * @older: The older journal block of the pair
229 *
230 * Compute the distance (in the journal direction) between two
231 * blocks in the journal
232 *
233 * Returns: the distance in blocks
234 */
235
236static inline unsigned int log_distance(struct gfs2_sbd *sdp,
237 unsigned int newer,
238 unsigned int older)
239{
240 int dist;
241
242 dist = newer - older;
243 if (dist < 0)
244 dist += sdp->sd_jdesc->jd_blocks;
245
246 return dist;
247}
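Because the journal is circular, the subtraction above goes negative whenever the newer block has wrapped past the end of the journal, and adding jd_blocks brings it back into range: with an 8192-block journal, the distance from block 8000 to block 10 is 10 - 8000 + 8192 = 202 blocks. The same computation stand-alone (sizes chosen only for the example):

#include <stdio.h>

static unsigned int log_distance(unsigned int jd_blocks, unsigned int newer,
				 unsigned int older)
{
	int dist = (int)newer - (int)older;

	if (dist < 0)
		dist += jd_blocks;
	return dist;
}

int main(void)
{
	printf("%u\n", log_distance(8192, 10, 8000));	/* prints 202 */
	return 0;
}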
248
249static unsigned int current_tail(struct gfs2_sbd *sdp)
250{
251 struct gfs2_ail *ai;
252 unsigned int tail;
253
254 gfs2_log_lock(sdp);
255
256 if (list_empty(&sdp->sd_ail1_list))
257 tail = sdp->sd_log_head;
258 else {
259 ai = list_entry(sdp->sd_ail1_list.prev,
260 struct gfs2_ail, ai_list);
261 tail = ai->ai_first;
262 }
263
264 gfs2_log_unlock(sdp);
265
266 return tail;
267}
268
269static inline void log_incr_head(struct gfs2_sbd *sdp)
270{
271 if (sdp->sd_log_flush_head == sdp->sd_log_tail)
272 gfs2_assert_withdraw(sdp,
273 sdp->sd_log_flush_head == sdp->sd_log_head);
274
275 if (++sdp->sd_log_flush_head == sdp->sd_jdesc->jd_blocks) {
276 sdp->sd_log_flush_head = 0;
277 sdp->sd_log_flush_wrapped = 1;
278 }
279}
280
281/**
282 * gfs2_log_get_buf - Get and initialize a buffer to use for log control data
283 * @sdp: The GFS2 superblock
284 *
285 * Returns: the buffer_head
286 */
287
288struct buffer_head *gfs2_log_get_buf(struct gfs2_sbd *sdp)
289{
290 uint64_t blkno = log_bmap(sdp, sdp->sd_log_flush_head);
291 struct gfs2_log_buf *lb;
292 struct buffer_head *bh;
293
294 lb = kzalloc(sizeof(struct gfs2_log_buf), GFP_NOFS | __GFP_NOFAIL);
295 list_add(&lb->lb_list, &sdp->sd_log_flush_list);
296
297 bh = lb->lb_bh = sb_getblk(sdp->sd_vfs, blkno);
298 lock_buffer(bh);
299 memset(bh->b_data, 0, bh->b_size);
300 set_buffer_uptodate(bh);
301 clear_buffer_dirty(bh);
302 unlock_buffer(bh);
303
304 log_incr_head(sdp);
305
306 return bh;
307}
308
309/**
310 * gfs2_log_fake_buf - Build a fake buffer head to write metadata buffer to log
311 * @sdp: the filesystem
312 * @real: the buffer head whose data is to be written to the log
313 *
314 * Returns: the fake buffer_head describing the log copy
315 */
316
317struct buffer_head *gfs2_log_fake_buf(struct gfs2_sbd *sdp,
318 struct buffer_head *real)
319{
320 uint64_t blkno = log_bmap(sdp, sdp->sd_log_flush_head);
321 struct gfs2_log_buf *lb;
322 struct buffer_head *bh;
323
324 lb = kzalloc(sizeof(struct gfs2_log_buf), GFP_NOFS | __GFP_NOFAIL);
325 list_add(&lb->lb_list, &sdp->sd_log_flush_list);
326 lb->lb_real = real;
327
328 bh = lb->lb_bh = alloc_buffer_head(GFP_NOFS | __GFP_NOFAIL);
329 atomic_set(&bh->b_count, 1);
330 bh->b_state = (1 << BH_Mapped) | (1 << BH_Uptodate);
331 set_bh_page(bh, real->b_page, bh_offset(real));
332 bh->b_blocknr = blkno;
333 bh->b_size = sdp->sd_sb.sb_bsize;
334 bh->b_bdev = sdp->sd_vfs->s_bdev;
335
336 log_incr_head(sdp);
337
338 return bh;
339}
340
341static void log_pull_tail(struct gfs2_sbd *sdp, unsigned int new_tail, int pull)
342{
343 unsigned int dist = log_distance(sdp, new_tail, sdp->sd_log_tail);
344
345 ail2_empty(sdp, new_tail);
346
347 gfs2_log_lock(sdp);
348 sdp->sd_log_blks_free += dist - ((pull) ? 1 : 0);
349 /* printk(KERN_INFO "pull tail refunding %u blocks (%u left) pull=%d\n", dist - ((pull) ? 1 : 0), sdp->sd_log_blks_free, pull); */
350 gfs2_assert_withdraw(sdp,
351 sdp->sd_log_blks_free <= sdp->sd_jdesc->jd_blocks);
352 gfs2_log_unlock(sdp);
353
354 sdp->sd_log_tail = new_tail;
355}
356
357/**
358 * log_write_header - Write a log header into the journal
359 * @sdp: The GFS2 superblock
360 * @flags: log header flags (e.g. GFS2_LOG_HEAD_UNMOUNT)
361 * @pull: non-zero when the journal tail is being pulled as part of this write
362 */
363
364static void log_write_header(struct gfs2_sbd *sdp, uint32_t flags, int pull)
365{
366 uint64_t blkno = log_bmap(sdp, sdp->sd_log_flush_head);
367 struct buffer_head *bh;
368 struct gfs2_log_header *lh;
369 unsigned int tail;
370 uint32_t hash;
371
372 /* printk(KERN_INFO "log write header start (flags=%08x, pull=%d)\n", flags, pull); */
373
374 bh = sb_getblk(sdp->sd_vfs, blkno);
375 lock_buffer(bh);
376 memset(bh->b_data, 0, bh->b_size);
377 set_buffer_uptodate(bh);
378 clear_buffer_dirty(bh);
379 unlock_buffer(bh);
380
381 gfs2_ail1_empty(sdp, 0);
382 tail = current_tail(sdp);
383
384 lh = (struct gfs2_log_header *)bh->b_data;
385 memset(lh, 0, sizeof(struct gfs2_log_header));
386 lh->lh_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
387 lh->lh_header.mh_type = cpu_to_be32(GFS2_METATYPE_LH);
388 lh->lh_header.mh_format = cpu_to_be32(GFS2_FORMAT_LH);
389	lh->lh_sequence = cpu_to_be64(sdp->sd_log_sequence++);
390	lh->lh_flags = cpu_to_be32(flags);
391	lh->lh_tail = cpu_to_be32(tail);
392	lh->lh_blkno = cpu_to_be32(sdp->sd_log_flush_head);
393 hash = gfs2_disk_hash(bh->b_data, sizeof(struct gfs2_log_header));
394 lh->lh_hash = cpu_to_be32(hash);
395
396 set_buffer_dirty(bh);
397 if (sync_dirty_buffer(bh))
398 gfs2_io_error_bh(sdp, bh);
399 brelse(bh);
400
401 if (sdp->sd_log_tail != tail)
402 log_pull_tail(sdp, tail, pull);
403 else
404 gfs2_assert_withdraw(sdp, !pull);
405
406 sdp->sd_log_idle = (tail == sdp->sd_log_flush_head);
407 log_incr_head(sdp);
408
409 /* printk(KERN_INFO "log write header out\n"); */
410}
411
412static void log_flush_commit(struct gfs2_sbd *sdp)
413{
414 struct list_head *head = &sdp->sd_log_flush_list;
415 struct gfs2_log_buf *lb;
416 struct buffer_head *bh;
417#if 0
418 unsigned int d;
419
420 d = log_distance(sdp, sdp->sd_log_flush_head, sdp->sd_log_head);
421
422 gfs2_assert_withdraw(sdp, d + 1 == sdp->sd_log_blks_reserved);
423#endif
424
425 while (!list_empty(head)) {
426 lb = list_entry(head->next, struct gfs2_log_buf, lb_list);
427 list_del(&lb->lb_list);
428 bh = lb->lb_bh;
429
430 wait_on_buffer(bh);
431 if (!buffer_uptodate(bh))
432 gfs2_io_error_bh(sdp, bh);
433 if (lb->lb_real) {
434 while (atomic_read(&bh->b_count) != 1) /* Grrrr... */
435 schedule();
436 free_buffer_head(bh);
437 } else
438 brelse(bh);
439 kfree(lb);
440 }
441
442 log_write_header(sdp, 0, 0);
443}
444
445/**
446 * gfs2_log_flush - flush incore transaction(s)
447 * @sdp: the filesystem
448 * @gl: The glock structure to flush. If NULL, flush the whole incore log
449 *
450 */
451
452void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
453{
454 struct gfs2_ail *ai;
455
456 down_write(&sdp->sd_log_flush_lock);
457
458 if (gl) {
459 gfs2_log_lock(sdp);
460 if (list_empty(&gl->gl_le.le_list)) {
461 gfs2_log_unlock(sdp);
462 up_write(&sdp->sd_log_flush_lock);
463 return;
464 }
465 gfs2_log_unlock(sdp);
466 }
467
468 ai = kzalloc(sizeof(struct gfs2_ail), GFP_NOFS | __GFP_NOFAIL);
469 INIT_LIST_HEAD(&ai->ai_ail1_list);
470 INIT_LIST_HEAD(&ai->ai_ail2_list);
471
472 gfs2_assert_withdraw(sdp,
473 sdp->sd_log_num_buf == sdp->sd_log_commited_buf);
474 gfs2_assert_withdraw(sdp,
475 sdp->sd_log_num_revoke == sdp->sd_log_commited_revoke);
476
477 sdp->sd_log_flush_head = sdp->sd_log_head;
478 sdp->sd_log_flush_wrapped = 0;
479 ai->ai_first = sdp->sd_log_flush_head;
480
481 lops_before_commit(sdp);
482 if (!list_empty(&sdp->sd_log_flush_list))
483 log_flush_commit(sdp);
484 else if (sdp->sd_log_tail != current_tail(sdp) && !sdp->sd_log_idle)
485 log_write_header(sdp, 0, PULL);
486 lops_after_commit(sdp, ai);
487 sdp->sd_log_head = sdp->sd_log_flush_head;
488
489 /* printk(KERN_INFO "sd_log_num_hdrs %u\n", sdp->sd_log_num_hdrs); */
490 sdp->sd_log_blks_free -= sdp->sd_log_num_hdrs;
491
492 sdp->sd_log_blks_reserved =
493 sdp->sd_log_commited_buf =
494 sdp->sd_log_num_hdrs =
495 sdp->sd_log_commited_revoke = 0;
496
497 gfs2_log_lock(sdp);
498 if (!list_empty(&ai->ai_ail1_list)) {
499 list_add(&ai->ai_list, &sdp->sd_ail1_list);
500 ai = NULL;
501 }
502 gfs2_log_unlock(sdp);
503
504 sdp->sd_vfs->s_dirt = 0;
505 up_write(&sdp->sd_log_flush_lock);
506
507 kfree(ai);
508}
509
510static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
511{
512 unsigned int reserved = 1;
513 unsigned int old;
514
515 gfs2_log_lock(sdp);
516
517 sdp->sd_log_commited_buf += tr->tr_num_buf_new - tr->tr_num_buf_rm;
518 gfs2_assert_withdraw(sdp, ((int)sdp->sd_log_commited_buf) >= 0);
519 sdp->sd_log_commited_revoke += tr->tr_num_revoke - tr->tr_num_revoke_rm;
520 gfs2_assert_withdraw(sdp, ((int)sdp->sd_log_commited_revoke) >= 0);
521
522 if (sdp->sd_log_commited_buf)
523 reserved += sdp->sd_log_commited_buf;
524 if (sdp->sd_log_commited_revoke)
525 reserved += gfs2_struct2blk(sdp, sdp->sd_log_commited_revoke,
526 sizeof(uint64_t));
527
528 old = sdp->sd_log_blks_free;
529 sdp->sd_log_blks_free += tr->tr_reserved -
530 (reserved - sdp->sd_log_blks_reserved);
531
532 gfs2_assert_withdraw(sdp, sdp->sd_log_blks_free >= old);
533 gfs2_assert_withdraw(sdp,
534 sdp->sd_log_blks_free <= sdp->sd_jdesc->jd_blocks +
535 sdp->sd_log_num_hdrs);
536
537 sdp->sd_log_blks_reserved = reserved;
538
539 gfs2_log_unlock(sdp);
540}
541
542/**
543 * gfs2_log_commit - Commit a transaction to the log
544 * @sdp: the filesystem
545 * @tr: the transaction
546 * Adds the transaction's log elements to the incore log, then flushes the
547 * log if the number of incore buffers has grown past the tunable threshold.
548 */
549
550void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
551{
552 log_refund(sdp, tr);
553 lops_incore_commit(sdp, tr);
554
555 sdp->sd_vfs->s_dirt = 1;
556 up_read(&sdp->sd_log_flush_lock);
557
558 gfs2_log_lock(sdp);
559 if (sdp->sd_log_num_buf > gfs2_tune_get(sdp, gt_incore_log_blocks)) {
560 gfs2_log_unlock(sdp);
561 gfs2_log_flush(sdp, NULL);
562 } else
563 gfs2_log_unlock(sdp);
564}
565
566/**
567 * gfs2_log_shutdown - write a shutdown header into a journal
568 * @sdp: the filesystem
569 *
570 */
571
572void gfs2_log_shutdown(struct gfs2_sbd *sdp)
573{
574 down_write(&sdp->sd_log_flush_lock);
575
576 gfs2_assert_withdraw(sdp, !sdp->sd_log_blks_reserved);
577 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_gl);
578 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_buf);
579 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_jdata);
580 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke);
581 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_rg);
582 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_databuf);
583 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_hdrs);
584 gfs2_assert_withdraw(sdp, list_empty(&sdp->sd_ail1_list));
585
586 sdp->sd_log_flush_head = sdp->sd_log_head;
587 sdp->sd_log_flush_wrapped = 0;
588
589 log_write_header(sdp, GFS2_LOG_HEAD_UNMOUNT, 0);
590
591 /* printk(KERN_INFO "sd_log_blks_free %u, sd_jdesc->jd_blocks %u\n", sdp->sd_log_blks_free, sdp->sd_jdesc->jd_blocks); */
592 gfs2_assert_warn(sdp, sdp->sd_log_blks_free == sdp->sd_jdesc->jd_blocks);
593 gfs2_assert_warn(sdp, sdp->sd_log_head == sdp->sd_log_tail);
594 gfs2_assert_warn(sdp, list_empty(&sdp->sd_ail2_list));
595
596 sdp->sd_log_head = sdp->sd_log_flush_head;
597 sdp->sd_log_tail = sdp->sd_log_head;
598
599 up_write(&sdp->sd_log_flush_lock);
600}
601
diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h
new file mode 100644
index 000000000000..8cfd0f1d29f8
--- /dev/null
+++ b/fs/gfs2/log.h
@@ -0,0 +1,61 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __LOG_DOT_H__
11#define __LOG_DOT_H__
12
13/**
14 * gfs2_log_lock - acquire the right to mess with the log manager
15 * @sdp: the filesystem
16 *
17 */
18
19static inline void gfs2_log_lock(struct gfs2_sbd *sdp)
20{
21 spin_lock(&sdp->sd_log_lock);
22}
23
24/**
25 * gfs2_log_unlock - release the right to mess with the log manager
26 * @sdp: the filesystem
27 *
28 */
29
30static inline void gfs2_log_unlock(struct gfs2_sbd *sdp)
31{
32 spin_unlock(&sdp->sd_log_lock);
33}
34
35static inline void gfs2_log_pointers_init(struct gfs2_sbd *sdp,
36 unsigned int value)
37{
38 if (++value == sdp->sd_jdesc->jd_blocks) {
39 value = 0;
40 }
41 sdp->sd_log_head = sdp->sd_log_tail = value;
42}
43
44unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct,
45 unsigned int ssize);
46
47void gfs2_ail1_start(struct gfs2_sbd *sdp, int flags);
48int gfs2_ail1_empty(struct gfs2_sbd *sdp, int flags);
49
50int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks);
51void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks);
52
53struct buffer_head *gfs2_log_get_buf(struct gfs2_sbd *sdp);
54struct buffer_head *gfs2_log_fake_buf(struct gfs2_sbd *sdp,
55 struct buffer_head *real);
56void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl);
57void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *trans);
58
59void gfs2_log_shutdown(struct gfs2_sbd *sdp);
60
61#endif /* __LOG_DOT_H__ */
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
new file mode 100644
index 000000000000..af03bf380f46
--- /dev/null
+++ b/fs/gfs2/lops.c
@@ -0,0 +1,800 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/gfs2_ondisk.h>
16
17#include "gfs2.h"
18#include "lm_interface.h"
19#include "incore.h"
20#include "glock.h"
21#include "log.h"
22#include "lops.h"
23#include "meta_io.h"
24#include "recovery.h"
25#include "rgrp.h"
26#include "trans.h"
27#include "util.h"
28
29static void glock_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
30{
31 struct gfs2_glock *gl;
32 struct gfs2_trans *tr = current->journal_info;
33
34 tr->tr_touched = 1;
35
36 if (!list_empty(&le->le_list))
37 return;
38
39 gl = container_of(le, struct gfs2_glock, gl_le);
40 if (gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(gl)))
41 return;
42 gfs2_glock_hold(gl);
43 set_bit(GLF_DIRTY, &gl->gl_flags);
44
45 gfs2_log_lock(sdp);
46 sdp->sd_log_num_gl++;
47 list_add(&le->le_list, &sdp->sd_log_le_gl);
48 gfs2_log_unlock(sdp);
49}
50
51static void glock_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
52{
53 struct list_head *head = &sdp->sd_log_le_gl;
54 struct gfs2_glock *gl;
55
56 while (!list_empty(head)) {
57 gl = list_entry(head->next, struct gfs2_glock, gl_le.le_list);
58 list_del_init(&gl->gl_le.le_list);
59 sdp->sd_log_num_gl--;
60
61 gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(gl));
62 gfs2_glock_put(gl);
63 }
64 gfs2_assert_warn(sdp, !sdp->sd_log_num_gl);
65}
66
67static void buf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
68{
69 struct gfs2_bufdata *bd = container_of(le, struct gfs2_bufdata, bd_le);
70 struct gfs2_trans *tr;
71
72 if (!list_empty(&bd->bd_list_tr))
73 return;
74
75 tr = current->journal_info;
76 tr->tr_touched = 1;
77 tr->tr_num_buf++;
78 list_add(&bd->bd_list_tr, &tr->tr_list_buf);
79
80 if (!list_empty(&le->le_list))
81 return;
82
83 gfs2_trans_add_gl(bd->bd_gl);
84
85 gfs2_meta_check(sdp, bd->bd_bh);
86 gfs2_pin(sdp, bd->bd_bh);
87
88 gfs2_log_lock(sdp);
89 sdp->sd_log_num_buf++;
90 list_add(&le->le_list, &sdp->sd_log_le_buf);
91 gfs2_log_unlock(sdp);
92
93 tr->tr_num_buf_new++;
94}
95
96static void buf_lo_incore_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
97{
98 struct list_head *head = &tr->tr_list_buf;
99 struct gfs2_bufdata *bd;
100
101 while (!list_empty(head)) {
102 bd = list_entry(head->next, struct gfs2_bufdata, bd_list_tr);
103 list_del_init(&bd->bd_list_tr);
104 tr->tr_num_buf--;
105 }
106 gfs2_assert_warn(sdp, !tr->tr_num_buf);
107}
108
109static void buf_lo_before_commit(struct gfs2_sbd *sdp)
110{
111 struct buffer_head *bh;
112 struct gfs2_log_descriptor *ld;
113 struct gfs2_bufdata *bd1 = NULL, *bd2;
114 unsigned int total = sdp->sd_log_num_buf;
115 unsigned int offset = sizeof(struct gfs2_log_descriptor);
116 unsigned int limit;
117 unsigned int num;
118 unsigned n;
119 __be64 *ptr;
120
121 offset += (sizeof(__be64) - 1);
122 offset &= ~(sizeof(__be64) - 1);
123 limit = (sdp->sd_sb.sb_bsize - offset)/sizeof(__be64);
124 /* for 4k blocks, limit = 503 */
125
126 bd1 = bd2 = list_prepare_entry(bd1, &sdp->sd_log_le_buf, bd_le.le_list);
127 while(total) {
128 num = total;
129 if (total > limit)
130 num = limit;
131 bh = gfs2_log_get_buf(sdp);
132 sdp->sd_log_num_hdrs++;
133 ld = (struct gfs2_log_descriptor *)bh->b_data;
134 ptr = (__be64 *)(bh->b_data + offset);
135 ld->ld_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
136 ld->ld_header.mh_type = cpu_to_be32(GFS2_METATYPE_LD);
137 ld->ld_header.mh_format = cpu_to_be32(GFS2_FORMAT_LD);
138 ld->ld_type = cpu_to_be32(GFS2_LOG_DESC_METADATA);
139 ld->ld_length = cpu_to_be32(num + 1);
140 ld->ld_data1 = cpu_to_be32(num);
141 ld->ld_data2 = cpu_to_be32(0);
142 memset(ld->ld_reserved, 0, sizeof(ld->ld_reserved));
143
144 n = 0;
145 list_for_each_entry_continue(bd1, &sdp->sd_log_le_buf,
146 bd_le.le_list) {
147 *ptr++ = cpu_to_be64(bd1->bd_bh->b_blocknr);
148 if (++n >= num)
149 break;
150 }
151
152 set_buffer_dirty(bh);
153 ll_rw_block(WRITE, 1, &bh);
154
155 n = 0;
156 list_for_each_entry_continue(bd2, &sdp->sd_log_le_buf,
157 bd_le.le_list) {
158 bh = gfs2_log_fake_buf(sdp, bd2->bd_bh);
159 set_buffer_dirty(bh);
160 ll_rw_block(WRITE, 1, &bh);
161 if (++n >= num)
162 break;
163 }
164
165 total -= num;
166 }
167}
168
169static void buf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
170{
171 struct list_head *head = &sdp->sd_log_le_buf;
172 struct gfs2_bufdata *bd;
173
174 while (!list_empty(head)) {
175 bd = list_entry(head->next, struct gfs2_bufdata, bd_le.le_list);
176 list_del_init(&bd->bd_le.le_list);
177 sdp->sd_log_num_buf--;
178
179 gfs2_unpin(sdp, bd->bd_bh, ai);
180 }
181 gfs2_assert_warn(sdp, !sdp->sd_log_num_buf);
182}
183
184static void buf_lo_before_scan(struct gfs2_jdesc *jd,
185 struct gfs2_log_header *head, int pass)
186{
187 struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
188
189 if (pass != 0)
190 return;
191
192 sdp->sd_found_blocks = 0;
193 sdp->sd_replayed_blocks = 0;
194}
195
196static int buf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
197 struct gfs2_log_descriptor *ld, __be64 *ptr,
198 int pass)
199{
200 struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
201 struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
202 struct gfs2_glock *gl = ip->i_gl;
203 unsigned int blks = be32_to_cpu(ld->ld_data1);
204 struct buffer_head *bh_log, *bh_ip;
205 uint64_t blkno;
206 int error = 0;
207
208 if (pass != 1 || be32_to_cpu(ld->ld_type) != GFS2_LOG_DESC_METADATA)
209 return 0;
210
211 gfs2_replay_incr_blk(sdp, &start);
212
213 for (; blks; gfs2_replay_incr_blk(sdp, &start), blks--) {
214 blkno = be64_to_cpu(*ptr++);
215
216 sdp->sd_found_blocks++;
217
218 if (gfs2_revoke_check(sdp, blkno, start))
219 continue;
220
221 error = gfs2_replay_read_block(jd, start, &bh_log);
222 if (error)
223 return error;
224
225 bh_ip = gfs2_meta_new(gl, blkno);
226 memcpy(bh_ip->b_data, bh_log->b_data, bh_log->b_size);
227
228 if (gfs2_meta_check(sdp, bh_ip))
229 error = -EIO;
230 else
231 mark_buffer_dirty(bh_ip);
232
233 brelse(bh_log);
234 brelse(bh_ip);
235
236 if (error)
237 break;
238
239 sdp->sd_replayed_blocks++;
240 }
241
242 return error;
243}
244
245static void buf_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass)
246{
247 struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
248 struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
249
250 if (error) {
251 gfs2_meta_sync(ip->i_gl,
252 DIO_START | DIO_WAIT);
253 return;
254 }
255 if (pass != 1)
256 return;
257
258 gfs2_meta_sync(ip->i_gl, DIO_START | DIO_WAIT);
259
260 fs_info(sdp, "jid=%u: Replayed %u of %u blocks\n",
261 jd->jd_jid, sdp->sd_replayed_blocks, sdp->sd_found_blocks);
262}
263
264static void revoke_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
265{
266 struct gfs2_trans *tr;
267
268 tr = current->journal_info;
269 tr->tr_touched = 1;
270 tr->tr_num_revoke++;
271
272 gfs2_log_lock(sdp);
273 sdp->sd_log_num_revoke++;
274 list_add(&le->le_list, &sdp->sd_log_le_revoke);
275 gfs2_log_unlock(sdp);
276}
277
278static void revoke_lo_before_commit(struct gfs2_sbd *sdp)
279{
280 struct gfs2_log_descriptor *ld;
281 struct gfs2_meta_header *mh;
282 struct buffer_head *bh;
283 unsigned int offset;
284 struct list_head *head = &sdp->sd_log_le_revoke;
285 struct gfs2_revoke *rv;
286
287 if (!sdp->sd_log_num_revoke)
288 return;
289
290 bh = gfs2_log_get_buf(sdp);
291 ld = (struct gfs2_log_descriptor *)bh->b_data;
292 ld->ld_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
293 ld->ld_header.mh_type = cpu_to_be32(GFS2_METATYPE_LD);
294 ld->ld_header.mh_format = cpu_to_be32(GFS2_FORMAT_LD);
295 ld->ld_type = cpu_to_be32(GFS2_LOG_DESC_REVOKE);
296 ld->ld_length = cpu_to_be32(gfs2_struct2blk(sdp, sdp->sd_log_num_revoke,
297 sizeof(uint64_t)));
298 ld->ld_data1 = cpu_to_be32(sdp->sd_log_num_revoke);
299 ld->ld_data2 = cpu_to_be32(0);
300 memset(ld->ld_reserved, 0, sizeof(ld->ld_reserved));
301 offset = sizeof(struct gfs2_log_descriptor);
302
303 while (!list_empty(head)) {
304 rv = list_entry(head->next, struct gfs2_revoke, rv_le.le_list);
305 list_del_init(&rv->rv_le.le_list);
306 sdp->sd_log_num_revoke--;
307
308 if (offset + sizeof(uint64_t) > sdp->sd_sb.sb_bsize) {
309 set_buffer_dirty(bh);
310 ll_rw_block(WRITE, 1, &bh);
311
312 bh = gfs2_log_get_buf(sdp);
313 mh = (struct gfs2_meta_header *)bh->b_data;
314 mh->mh_magic = cpu_to_be32(GFS2_MAGIC);
315 mh->mh_type = cpu_to_be32(GFS2_METATYPE_LB);
316 mh->mh_format = cpu_to_be32(GFS2_FORMAT_LB);
317 offset = sizeof(struct gfs2_meta_header);
318 }
319
320 *(__be64 *)(bh->b_data + offset) = cpu_to_be64(rv->rv_blkno);
321 kfree(rv);
322
323 offset += sizeof(uint64_t);
324 }
325 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke);
326
327 set_buffer_dirty(bh);
328 ll_rw_block(WRITE, 1, &bh);
329}
330
331static void revoke_lo_before_scan(struct gfs2_jdesc *jd,
332 struct gfs2_log_header *head, int pass)
333{
334 struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
335
336 if (pass != 0)
337 return;
338
339 sdp->sd_found_revokes = 0;
340 sdp->sd_replay_tail = head->lh_tail;
341}
342
343static int revoke_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
344 struct gfs2_log_descriptor *ld, __be64 *ptr,
345 int pass)
346{
347 struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
348 unsigned int blks = be32_to_cpu(ld->ld_length);
349 unsigned int revokes = be32_to_cpu(ld->ld_data1);
350 struct buffer_head *bh;
351 unsigned int offset;
352 uint64_t blkno;
353 int first = 1;
354 int error;
355
356 if (pass != 0 || be32_to_cpu(ld->ld_type) != GFS2_LOG_DESC_REVOKE)
357 return 0;
358
359 offset = sizeof(struct gfs2_log_descriptor);
360
361 for (; blks; gfs2_replay_incr_blk(sdp, &start), blks--) {
362 error = gfs2_replay_read_block(jd, start, &bh);
363 if (error)
364 return error;
365
366 if (!first)
367 gfs2_metatype_check(sdp, bh, GFS2_METATYPE_LB);
368
369 while (offset + sizeof(uint64_t) <= sdp->sd_sb.sb_bsize) {
370 blkno = be64_to_cpu(*(__be64 *)(bh->b_data + offset));
371
372 error = gfs2_revoke_add(sdp, blkno, start);
373 if (error < 0)
374 return error;
375 else if (error)
376 sdp->sd_found_revokes++;
377
378 if (!--revokes)
379 break;
380 offset += sizeof(uint64_t);
381 }
382
383 brelse(bh);
384 offset = sizeof(struct gfs2_meta_header);
385 first = 0;
386 }
387
388 return 0;
389}
390
391static void revoke_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass)
392{
393 struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
394
395 if (error) {
396 gfs2_revoke_clean(sdp);
397 return;
398 }
399 if (pass != 1)
400 return;
401
402 fs_info(sdp, "jid=%u: Found %u revoke tags\n",
403 jd->jd_jid, sdp->sd_found_revokes);
404
405 gfs2_revoke_clean(sdp);
406}
407
408static void rg_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
409{
410 struct gfs2_rgrpd *rgd;
411 struct gfs2_trans *tr = current->journal_info;
412
413 tr->tr_touched = 1;
414
415 if (!list_empty(&le->le_list))
416 return;
417
418 rgd = container_of(le, struct gfs2_rgrpd, rd_le);
419 gfs2_rgrp_bh_hold(rgd);
420
421 gfs2_log_lock(sdp);
422 sdp->sd_log_num_rg++;
423 list_add(&le->le_list, &sdp->sd_log_le_rg);
424 gfs2_log_unlock(sdp);
425}
426
427static void rg_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
428{
429 struct list_head *head = &sdp->sd_log_le_rg;
430 struct gfs2_rgrpd *rgd;
431
432 while (!list_empty(head)) {
433 rgd = list_entry(head->next, struct gfs2_rgrpd, rd_le.le_list);
434 list_del_init(&rgd->rd_le.le_list);
435 sdp->sd_log_num_rg--;
436
437 gfs2_rgrp_repolish_clones(rgd);
438 gfs2_rgrp_bh_put(rgd);
439 }
440 gfs2_assert_warn(sdp, !sdp->sd_log_num_rg);
441}
442
443/**
444 * databuf_lo_add - Add a databuf to the transaction.
445 *
446 * This is used in two distinct cases:
447 * i) In ordered write mode
448 * We put the data buffer on a list so that we can ensure that it's
449 * synced to disk at the right time
450 * ii) In journaled data mode
451 * We need to journal the data block in the same way as metadata in
452 * the functions above. The difference is that here we have a tag
453 * which is two __be64's being the block number (as per meta data)
454 * and a flag which says whether the data block needs escaping or
455 * not. This means we need a new log entry for each 251 or so data
456 * blocks, which isn't an enormous overhead but twice as much as
457 * for normal metadata blocks.
458 */
459static void databuf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
460{
461 struct gfs2_bufdata *bd = container_of(le, struct gfs2_bufdata, bd_le);
462 struct gfs2_trans *tr = current->journal_info;
463 struct address_space *mapping = bd->bd_bh->b_page->mapping;
464 struct gfs2_inode *ip = GFS2_I(mapping->host);
465
466 tr->tr_touched = 1;
467	if (list_empty(&bd->bd_list_tr) &&
468 (ip->i_di.di_flags & GFS2_DIF_JDATA)) {
469 tr->tr_num_buf++;
470 list_add(&bd->bd_list_tr, &tr->tr_list_buf);
471 gfs2_pin(sdp, bd->bd_bh);
472 tr->tr_num_buf_new++;
473 }
474 gfs2_trans_add_gl(bd->bd_gl);
475 gfs2_log_lock(sdp);
476	if (list_empty(&le->le_list)) {
477 if (ip->i_di.di_flags & GFS2_DIF_JDATA)
478 sdp->sd_log_num_jdata++;
479 sdp->sd_log_num_databuf++;
480 list_add(&le->le_list, &sdp->sd_log_le_databuf);
481 }
482 gfs2_log_unlock(sdp);
483}
484
485static int gfs2_check_magic(struct buffer_head *bh)
486{
487 struct page *page = bh->b_page;
488 void *kaddr;
489 __be32 *ptr;
490 int rv = 0;
491
492 kaddr = kmap_atomic(page, KM_USER0);
493 ptr = kaddr + bh_offset(bh);
494 if (*ptr == cpu_to_be32(GFS2_MAGIC))
495 rv = 1;
496 kunmap_atomic(page, KM_USER0);
497
498 return rv;
499}
500
501/**
502 * databuf_lo_before_commit - Scan the data buffers, writing as we go
503 *
504 * Here we scan through the lists of buffers and make the assumption
505 * that any buffer that's been pinned is being journaled, and that
506 * any unpinned buffer is an ordered write data buffer and therefore
507 * will be written back rather than journaled.
508 */
509static void databuf_lo_before_commit(struct gfs2_sbd *sdp)
510{
511 LIST_HEAD(started);
512 struct gfs2_bufdata *bd1 = NULL, *bd2, *bdt;
513 struct buffer_head *bh = NULL;
514 unsigned int offset = sizeof(struct gfs2_log_descriptor);
515 struct gfs2_log_descriptor *ld;
516 unsigned int limit;
517 unsigned int total_dbuf = sdp->sd_log_num_databuf;
518 unsigned int total_jdata = sdp->sd_log_num_jdata;
519 unsigned int num, n;
520 __be64 *ptr = NULL;
521
522 offset += (2*sizeof(__be64) - 1);
523 offset &= ~(2*sizeof(__be64) - 1);
524 limit = (sdp->sd_sb.sb_bsize - offset)/sizeof(__be64);
525
526 /*
527 * Start writing ordered buffers, write journaled buffers
528 * into the log along with a header
529 */
530 gfs2_log_lock(sdp);
531 bd2 = bd1 = list_prepare_entry(bd1, &sdp->sd_log_le_databuf,
532 bd_le.le_list);
533 while(total_dbuf) {
534 num = total_jdata;
535 if (num > limit)
536 num = limit;
537 n = 0;
538 list_for_each_entry_safe_continue(bd1, bdt,
539 &sdp->sd_log_le_databuf,
540 bd_le.le_list) {
541 /* An ordered write buffer */
542 if (bd1->bd_bh && !buffer_pinned(bd1->bd_bh)) {
543 list_move(&bd1->bd_le.le_list, &started);
544 if (bd1 == bd2) {
545 bd2 = NULL;
546 bd2 = list_prepare_entry(bd2,
547 &sdp->sd_log_le_databuf,
548 bd_le.le_list);
549 }
550 total_dbuf--;
551 if (bd1->bd_bh) {
552 get_bh(bd1->bd_bh);
553 if (buffer_dirty(bd1->bd_bh)) {
554 gfs2_log_unlock(sdp);
555 wait_on_buffer(bd1->bd_bh);
556 ll_rw_block(WRITE, 1,
557 &bd1->bd_bh);
558 gfs2_log_lock(sdp);
559 }
560 brelse(bd1->bd_bh);
561 continue;
562 }
563 continue;
564 } else if (bd1->bd_bh) { /* A journaled buffer */
565 int magic;
566 gfs2_log_unlock(sdp);
567 if (!bh) {
568 bh = gfs2_log_get_buf(sdp);
569 sdp->sd_log_num_hdrs++;
570 ld = (struct gfs2_log_descriptor *)
571 bh->b_data;
572 ptr = (__be64 *)(bh->b_data + offset);
573 ld->ld_header.mh_magic =
574 cpu_to_be32(GFS2_MAGIC);
575 ld->ld_header.mh_type =
576 cpu_to_be32(GFS2_METATYPE_LD);
577 ld->ld_header.mh_format =
578 cpu_to_be32(GFS2_FORMAT_LD);
579 ld->ld_type =
580 cpu_to_be32(GFS2_LOG_DESC_JDATA);
581 ld->ld_length = cpu_to_be32(num + 1);
582 ld->ld_data1 = cpu_to_be32(num);
583 ld->ld_data2 = cpu_to_be32(0);
584 memset(ld->ld_reserved, 0, sizeof(ld->ld_reserved));
585 }
586 magic = gfs2_check_magic(bd1->bd_bh);
587 *ptr++ = cpu_to_be64(bd1->bd_bh->b_blocknr);
588 *ptr++ = cpu_to_be64((__u64)magic);
589 clear_buffer_escaped(bd1->bd_bh);
590 if (unlikely(magic != 0))
591 set_buffer_escaped(bd1->bd_bh);
592 gfs2_log_lock(sdp);
593 if (n++ > num)
594 break;
595 }
596 }
597 gfs2_log_unlock(sdp);
598 if (bh) {
599 set_buffer_dirty(bh);
600 ll_rw_block(WRITE, 1, &bh);
601 bh = NULL;
602 }
603 n = 0;
604 gfs2_log_lock(sdp);
605 list_for_each_entry_continue(bd2, &sdp->sd_log_le_databuf,
606 bd_le.le_list) {
607 if (!bd2->bd_bh)
608 continue;
609 /* copy buffer if it needs escaping */
610 gfs2_log_unlock(sdp);
611 if (unlikely(buffer_escaped(bd2->bd_bh))) {
612 void *kaddr;
613 struct page *page = bd2->bd_bh->b_page;
614 bh = gfs2_log_get_buf(sdp);
615 kaddr = kmap_atomic(page, KM_USER0);
616 memcpy(bh->b_data,
617 kaddr + bh_offset(bd2->bd_bh),
618 sdp->sd_sb.sb_bsize);
619 kunmap_atomic(page, KM_USER0);
620 *(__be32 *)bh->b_data = 0;
621 } else {
622 bh = gfs2_log_fake_buf(sdp, bd2->bd_bh);
623 }
624 set_buffer_dirty(bh);
625 ll_rw_block(WRITE, 1, &bh);
626 gfs2_log_lock(sdp);
627 if (++n >= num)
628 break;
629 }
630 bh = NULL;
631 total_dbuf -= num;
632 total_jdata -= num;
633 }
634 gfs2_log_unlock(sdp);
635
636 /* Wait on all ordered buffers */
637 while (!list_empty(&started)) {
638 gfs2_log_lock(sdp);
639 bd1 = list_entry(started.next, struct gfs2_bufdata,
640 bd_le.le_list);
641 list_del(&bd1->bd_le.le_list);
642 sdp->sd_log_num_databuf--;
643
644 bh = bd1->bd_bh;
645 if (bh) {
646 bh->b_private = NULL;
647 gfs2_log_unlock(sdp);
648 wait_on_buffer(bh);
649 brelse(bh);
650 } else
651 gfs2_log_unlock(sdp);
652
653 kmem_cache_free(gfs2_bufdata_cachep, bd1);
654 }
655
656 /* We've removed all the ordered write bufs here, so only jdata left */
657 gfs2_assert_warn(sdp, sdp->sd_log_num_databuf == sdp->sd_log_num_jdata);
658}
659
660static int databuf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
661 struct gfs2_log_descriptor *ld,
662 __be64 *ptr, int pass)
663{
664 struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
665 struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
666 struct gfs2_glock *gl = ip->i_gl;
667 unsigned int blks = be32_to_cpu(ld->ld_data1);
668 struct buffer_head *bh_log, *bh_ip;
669 uint64_t blkno;
670 uint64_t esc;
671 int error = 0;
672
673 if (pass != 1 || be32_to_cpu(ld->ld_type) != GFS2_LOG_DESC_JDATA)
674 return 0;
675
676 gfs2_replay_incr_blk(sdp, &start);
677 for (; blks; gfs2_replay_incr_blk(sdp, &start), blks--) {
678 blkno = be64_to_cpu(*ptr++);
679 esc = be64_to_cpu(*ptr++);
680
681 sdp->sd_found_blocks++;
682
683 if (gfs2_revoke_check(sdp, blkno, start))
684 continue;
685
686 error = gfs2_replay_read_block(jd, start, &bh_log);
687 if (error)
688 return error;
689
690 bh_ip = gfs2_meta_new(gl, blkno);
691 memcpy(bh_ip->b_data, bh_log->b_data, bh_log->b_size);
692
693 /* Unescape */
694 if (esc) {
695 __be32 *eptr = (__be32 *)bh_ip->b_data;
696 *eptr = cpu_to_be32(GFS2_MAGIC);
697 }
698 mark_buffer_dirty(bh_ip);
699
700 brelse(bh_log);
701 brelse(bh_ip);
702 if (error)
703 break;
704
705 sdp->sd_replayed_blocks++;
706 }
707
708 return error;
709}
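The escape handling above, together with the escaping done in databuf_lo_before_commit(), exists because a journaled data block that happens to begin with GFS2_MAGIC would be indistinguishable from metadata during replay: the logged copy gets its first four bytes zeroed and the tag records that it was escaped, and replay puts the magic back. A userspace sketch of that round trip (block size and buffers invented for the illustration):

#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include <arpa/inet.h>	/* htonl() stands in for cpu_to_be32() */

#define GFS2_MAGIC 0x01161970
#define BSIZE 4096

int main(void)
{
	unsigned char data[BSIZE] = {0}, logged[BSIZE], replayed[BSIZE];
	uint32_t be_magic = htonl(GFS2_MAGIC);
	int escaped;

	memcpy(data, &be_magic, sizeof(be_magic));	/* block starts with the magic */

	/* write side (databuf_lo_before_commit): escape if needed */
	memcpy(logged, data, BSIZE);
	escaped = (memcmp(logged, &be_magic, sizeof(be_magic)) == 0);
	if (escaped)
		memset(logged, 0, sizeof(be_magic));

	/* replay side (databuf_lo_scan_elements): undo the escape */
	memcpy(replayed, logged, BSIZE);
	if (escaped)
		memcpy(replayed, &be_magic, sizeof(be_magic));

	printf("round trip %s\n",
	       memcmp(data, replayed, BSIZE) == 0 ? "ok" : "corrupted");
	return 0;
}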
710
711/* FIXME: sort out accounting for log blocks etc. */
712
713static void databuf_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass)
714{
715 struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
716 struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
717
718 if (error) {
719 gfs2_meta_sync(ip->i_gl,
720 DIO_START | DIO_WAIT);
721 return;
722 }
723 if (pass != 1)
724 return;
725
726 /* data sync? */
727 gfs2_meta_sync(ip->i_gl, DIO_START | DIO_WAIT);
728
729 fs_info(sdp, "jid=%u: Replayed %u of %u data blocks\n",
730 jd->jd_jid, sdp->sd_replayed_blocks, sdp->sd_found_blocks);
731}
732
733static void databuf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
734{
735 struct list_head *head = &sdp->sd_log_le_databuf;
736 struct gfs2_bufdata *bd;
737
738 while (!list_empty(head)) {
739 bd = list_entry(head->next, struct gfs2_bufdata, bd_le.le_list);
740 list_del(&bd->bd_le.le_list);
741 sdp->sd_log_num_databuf--;
742 sdp->sd_log_num_jdata--;
743 gfs2_unpin(sdp, bd->bd_bh, ai);
744 }
745 gfs2_assert_warn(sdp, !sdp->sd_log_num_databuf);
746 gfs2_assert_warn(sdp, !sdp->sd_log_num_jdata);
747}
748
749
750const struct gfs2_log_operations gfs2_glock_lops = {
751 .lo_add = glock_lo_add,
752 .lo_after_commit = glock_lo_after_commit,
753 .lo_name = "glock"
754};
755
756const struct gfs2_log_operations gfs2_buf_lops = {
757 .lo_add = buf_lo_add,
758 .lo_incore_commit = buf_lo_incore_commit,
759 .lo_before_commit = buf_lo_before_commit,
760 .lo_after_commit = buf_lo_after_commit,
761 .lo_before_scan = buf_lo_before_scan,
762 .lo_scan_elements = buf_lo_scan_elements,
763 .lo_after_scan = buf_lo_after_scan,
764 .lo_name = "buf"
765};
766
767const struct gfs2_log_operations gfs2_revoke_lops = {
768 .lo_add = revoke_lo_add,
769 .lo_before_commit = revoke_lo_before_commit,
770 .lo_before_scan = revoke_lo_before_scan,
771 .lo_scan_elements = revoke_lo_scan_elements,
772 .lo_after_scan = revoke_lo_after_scan,
773 .lo_name = "revoke"
774};
775
776const struct gfs2_log_operations gfs2_rg_lops = {
777 .lo_add = rg_lo_add,
778 .lo_after_commit = rg_lo_after_commit,
779 .lo_name = "rg"
780};
781
782const struct gfs2_log_operations gfs2_databuf_lops = {
783 .lo_add = databuf_lo_add,
784 .lo_incore_commit = buf_lo_incore_commit,
785 .lo_before_commit = databuf_lo_before_commit,
786 .lo_after_commit = databuf_lo_after_commit,
787 .lo_scan_elements = databuf_lo_scan_elements,
788 .lo_after_scan = databuf_lo_after_scan,
789 .lo_name = "databuf"
790};
791
792const struct gfs2_log_operations *gfs2_log_ops[] = {
793 &gfs2_glock_lops,
794 &gfs2_buf_lops,
795 &gfs2_revoke_lops,
796 &gfs2_rg_lops,
797 &gfs2_databuf_lops,
798 NULL
799};
800
diff --git a/fs/gfs2/lops.h b/fs/gfs2/lops.h
new file mode 100644
index 000000000000..8a1029d3d389
--- /dev/null
+++ b/fs/gfs2/lops.h
@@ -0,0 +1,96 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __LOPS_DOT_H__
11#define __LOPS_DOT_H__
12
13extern const struct gfs2_log_operations gfs2_glock_lops;
14extern const struct gfs2_log_operations gfs2_buf_lops;
15extern const struct gfs2_log_operations gfs2_revoke_lops;
16extern const struct gfs2_log_operations gfs2_rg_lops;
17extern const struct gfs2_log_operations gfs2_databuf_lops;
18
19extern const struct gfs2_log_operations *gfs2_log_ops[];
20
21static inline void lops_init_le(struct gfs2_log_element *le,
22 const struct gfs2_log_operations *lops)
23{
24 INIT_LIST_HEAD(&le->le_list);
25 le->le_ops = lops;
26}
27
28static inline void lops_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
29{
30 if (le->le_ops->lo_add)
31 le->le_ops->lo_add(sdp, le);
32}
33
34static inline void lops_incore_commit(struct gfs2_sbd *sdp,
35 struct gfs2_trans *tr)
36{
37 int x;
38 for (x = 0; gfs2_log_ops[x]; x++)
39 if (gfs2_log_ops[x]->lo_incore_commit)
40 gfs2_log_ops[x]->lo_incore_commit(sdp, tr);
41}
42
43static inline void lops_before_commit(struct gfs2_sbd *sdp)
44{
45 int x;
46 for (x = 0; gfs2_log_ops[x]; x++)
47 if (gfs2_log_ops[x]->lo_before_commit)
48 gfs2_log_ops[x]->lo_before_commit(sdp);
49}
50
51static inline void lops_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
52{
53 int x;
54 for (x = 0; gfs2_log_ops[x]; x++)
55 if (gfs2_log_ops[x]->lo_after_commit)
56 gfs2_log_ops[x]->lo_after_commit(sdp, ai);
57}
58
59static inline void lops_before_scan(struct gfs2_jdesc *jd,
60 struct gfs2_log_header *head,
61 unsigned int pass)
62{
63 int x;
64 for (x = 0; gfs2_log_ops[x]; x++)
65 if (gfs2_log_ops[x]->lo_before_scan)
66 gfs2_log_ops[x]->lo_before_scan(jd, head, pass);
67}
68
69static inline int lops_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
70 struct gfs2_log_descriptor *ld,
71 __be64 *ptr,
72 unsigned int pass)
73{
74 int x, error;
75 for (x = 0; gfs2_log_ops[x]; x++)
76 if (gfs2_log_ops[x]->lo_scan_elements) {
77 error = gfs2_log_ops[x]->lo_scan_elements(jd, start,
78 ld, ptr, pass);
79 if (error)
80 return error;
81 }
82
83 return 0;
84}
85
86static inline void lops_after_scan(struct gfs2_jdesc *jd, int error,
87 unsigned int pass)
88{
89 int x;
90 for (x = 0; gfs2_log_ops[x]; x++)
91		if (gfs2_log_ops[x]->lo_after_scan)
92 gfs2_log_ops[x]->lo_after_scan(jd, error, pass);
93}
94
95#endif /* __LOPS_DOT_H__ */
96
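The helpers above all follow the same shape: walk the NULL-terminated gfs2_log_ops[] table and call a hook only when the element type provides it. A minimal, self-contained illustration of that dispatch pattern (the element types and hooks here are invented for the example, not GFS2's):

#include <stdio.h>

struct log_ops {
	void (*lo_before_commit)(void);
	void (*lo_after_commit)(void);
	const char *lo_name;
};

static void buf_before_commit(void) { printf("buf: before commit\n"); }
static void buf_after_commit(void)  { printf("buf: after commit\n"); }
static void rg_after_commit(void)   { printf("rg: after commit\n"); }

static const struct log_ops buf_ops = {
	.lo_before_commit = buf_before_commit,
	.lo_after_commit  = buf_after_commit,
	.lo_name          = "buf",
};

static const struct log_ops rg_ops = {
	.lo_after_commit = rg_after_commit,	/* no before_commit hook */
	.lo_name         = "rg",
};

static const struct log_ops *log_ops[] = { &buf_ops, &rg_ops, NULL };

static void ops_before_commit(void)
{
	int x;
	for (x = 0; log_ops[x]; x++)
		if (log_ops[x]->lo_before_commit)
			log_ops[x]->lo_before_commit();
}

static void ops_after_commit(void)
{
	int x;
	for (x = 0; log_ops[x]; x++)
		if (log_ops[x]->lo_after_commit)
			log_ops[x]->lo_after_commit();
}

int main(void)
{
	ops_before_commit();
	ops_after_commit();
	return 0;
}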
diff --git a/fs/gfs2/lvb.c b/fs/gfs2/lvb.c
new file mode 100644
index 000000000000..e88e9cce14e7
--- /dev/null
+++ b/fs/gfs2/lvb.c
@@ -0,0 +1,45 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/gfs2_ondisk.h>
16
17#include "gfs2.h"
18#include "lm_interface.h"
19#include "incore.h"
20#include "lvb.h"
21
22#define pv(struct, member, fmt) printk(KERN_INFO " "#member" = "fmt"\n", \
23 struct->member);
24
25void gfs2_quota_lvb_in(struct gfs2_quota_lvb *qb, char *lvb)
26{
27 struct gfs2_quota_lvb *str = (struct gfs2_quota_lvb *)lvb;
28
29 qb->qb_magic = be32_to_cpu(str->qb_magic);
30 qb->qb_limit = be64_to_cpu(str->qb_limit);
31 qb->qb_warn = be64_to_cpu(str->qb_warn);
32 qb->qb_value = be64_to_cpu(str->qb_value);
33}
34
35void gfs2_quota_lvb_out(struct gfs2_quota_lvb *qb, char *lvb)
36{
37 struct gfs2_quota_lvb *str = (struct gfs2_quota_lvb *)lvb;
38
39 str->qb_magic = cpu_to_be32(qb->qb_magic);
40 str->qb_limit = cpu_to_be64(qb->qb_limit);
41 str->qb_warn = cpu_to_be64(qb->qb_warn);
42 str->qb_value = cpu_to_be64(qb->qb_value);
43}
44
45
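These two helpers are the whole of the LVB marshalling for quotas: the lock value block stores its fields big-endian, so values are byte-swapped on the way in and out. A userspace sketch of the same round trip (the struct is redeclared here for the example, and glibc's <endian.h> stands in for the kernel byte-order helpers):

#include <stdint.h>
#include <stdio.h>
#include <endian.h>

/* Field layout mimics gfs2_quota_lvb, redeclared for the example. */
struct quota_lvb {
	uint32_t qb_magic;
	uint32_t __pad;
	uint64_t qb_limit;
	uint64_t qb_warn;
	int64_t  qb_value;
};

int main(void)
{
	struct quota_lvb host = { .qb_magic = 0x11111111, .qb_limit = 1000,
				  .qb_warn = 900, .qb_value = 42 };
	struct quota_lvb wire, back;

	/* "out": host order to big endian, as written into the LVB */
	wire.qb_magic = htobe32(host.qb_magic);
	wire.qb_limit = htobe64(host.qb_limit);
	wire.qb_warn  = htobe64(host.qb_warn);
	wire.qb_value = htobe64((uint64_t)host.qb_value);

	/* "in": big endian back to host order, as read from the LVB */
	back.qb_magic = be32toh(wire.qb_magic);
	back.qb_limit = be64toh(wire.qb_limit);
	back.qb_warn  = be64toh(wire.qb_warn);
	back.qb_value = (int64_t)be64toh(wire.qb_value);

	printf("round trip %s\n", back.qb_value == host.qb_value ? "ok" : "bad");
	return 0;
}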
diff --git a/fs/gfs2/lvb.h b/fs/gfs2/lvb.h
new file mode 100644
index 000000000000..1b1a8b75219a
--- /dev/null
+++ b/fs/gfs2/lvb.h
@@ -0,0 +1,19 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __LVB_DOT_H__
11#define __LVB_DOT_H__
12
13#define GFS2_MIN_LVB_SIZE 32
14
15void gfs2_quota_lvb_in(struct gfs2_quota_lvb *qb, char *lvb);
16void gfs2_quota_lvb_out(struct gfs2_quota_lvb *qb, char *lvb);
17
18#endif /* __LVB_DOT_H__ */
19
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
new file mode 100644
index 000000000000..c112943ee8c1
--- /dev/null
+++ b/fs/gfs2/main.c
@@ -0,0 +1,127 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/module.h>
16#include <linux/init.h>
17#include <linux/gfs2_ondisk.h>
18
19#include "gfs2.h"
20#include "lm_interface.h"
21#include "incore.h"
22#include "ops_fstype.h"
23#include "sys.h"
24#include "util.h"
25
26static void gfs2_init_inode_once(void *foo, kmem_cache_t *cachep, unsigned long flags)
27{
28 struct gfs2_inode *ip = foo;
29 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
30 SLAB_CTOR_CONSTRUCTOR) {
31 inode_init_once(&ip->i_inode);
32 spin_lock_init(&ip->i_spin);
33 init_rwsem(&ip->i_rw_mutex);
34 memset(ip->i_cache, 0, sizeof(ip->i_cache));
35 }
36}
37
38/**
39 * init_gfs2_fs - Register GFS2 as a filesystem
40 *
41 * Returns: 0 on success, error code on failure
42 */
43
44static int __init init_gfs2_fs(void)
45{
46 int error;
47
48 gfs2_init_lmh();
49
50 error = gfs2_sys_init();
51 if (error)
52 return error;
53
54 error = -ENOMEM;
55
56 gfs2_glock_cachep = kmem_cache_create("gfs2_glock",
57 sizeof(struct gfs2_glock),
58 0, 0, NULL, NULL);
59 if (!gfs2_glock_cachep)
60 goto fail;
61
62 gfs2_inode_cachep = kmem_cache_create("gfs2_inode",
63 sizeof(struct gfs2_inode),
64 0, (SLAB_RECLAIM_ACCOUNT|
65 SLAB_PANIC|SLAB_MEM_SPREAD),
66 gfs2_init_inode_once, NULL);
67 if (!gfs2_inode_cachep)
68 goto fail;
69
70 gfs2_bufdata_cachep = kmem_cache_create("gfs2_bufdata",
71 sizeof(struct gfs2_bufdata),
72 0, 0, NULL, NULL);
73 if (!gfs2_bufdata_cachep)
74 goto fail;
75
76 error = register_filesystem(&gfs2_fs_type);
77 if (error)
78 goto fail;
79
80 error = register_filesystem(&gfs2meta_fs_type);
81 if (error)
82 goto fail_unregister;
83
84 printk("GFS2 (built %s %s) installed\n", __DATE__, __TIME__);
85
86 return 0;
87
88fail_unregister:
89 unregister_filesystem(&gfs2_fs_type);
90fail:
91 if (gfs2_bufdata_cachep)
92 kmem_cache_destroy(gfs2_bufdata_cachep);
93
94 if (gfs2_inode_cachep)
95 kmem_cache_destroy(gfs2_inode_cachep);
96
97 if (gfs2_glock_cachep)
98 kmem_cache_destroy(gfs2_glock_cachep);
99
100 gfs2_sys_uninit();
101 return error;
102}
103
104/**
105 * exit_gfs2_fs - Unregister the file system
106 *
107 */
108
109static void __exit exit_gfs2_fs(void)
110{
111 unregister_filesystem(&gfs2_fs_type);
112 unregister_filesystem(&gfs2meta_fs_type);
113
114 kmem_cache_destroy(gfs2_bufdata_cachep);
115 kmem_cache_destroy(gfs2_inode_cachep);
116 kmem_cache_destroy(gfs2_glock_cachep);
117
118 gfs2_sys_uninit();
119}
120
121MODULE_DESCRIPTION("Global File System");
122MODULE_AUTHOR("Red Hat, Inc.");
123MODULE_LICENSE("GPL");
124
125module_init(init_gfs2_fs);
126module_exit(exit_gfs2_fs);
127
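init_gfs2_fs() creates its slab caches one at a time and, on the first failure, jumps to a single error path that tears down whatever was already set up; the NULL checks make the unwind correct no matter how far initialization got. The same create-or-unwind shape reduced to a small user-space sketch (the resource names are stand-ins, and free(NULL) plays the role of the guarded kmem_cache_destroy() calls):

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

/* Hypothetical resources standing in for the three slab caches. */
static void *glock_cache, *inode_cache, *bufdata_cache;

static int init_example(void)
{
	int error = -ENOMEM;

	glock_cache = malloc(64);
	if (!glock_cache)
		goto fail;

	inode_cache = malloc(64);
	if (!inode_cache)
		goto fail;

	bufdata_cache = malloc(64);
	if (!bufdata_cache)
		goto fail;

	return 0;

fail:
	/* Unwind: each pointer is either valid or still NULL, so freeing is safe. */
	free(bufdata_cache);
	free(inode_cache);
	free(glock_cache);
	return error;
}

int main(void)
{
	printf("init: %d\n", init_example());
	return 0;
}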
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
new file mode 100644
index 000000000000..42dfd32059bc
--- /dev/null
+++ b/fs/gfs2/meta_io.c
@@ -0,0 +1,780 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/mm.h>
16#include <linux/pagemap.h>
17#include <linux/writeback.h>
18#include <linux/swap.h>
19#include <linux/delay.h>
20#include <linux/gfs2_ondisk.h>
21
22#include "gfs2.h"
23#include "lm_interface.h"
24#include "incore.h"
25#include "glock.h"
26#include "glops.h"
27#include "inode.h"
28#include "log.h"
29#include "lops.h"
30#include "meta_io.h"
31#include "rgrp.h"
32#include "trans.h"
33#include "util.h"
34#include "ops_address.h"
35
36#define buffer_busy(bh) \
37((bh)->b_state & ((1ul << BH_Dirty) | (1ul << BH_Lock) | (1ul << BH_Pinned)))
38#define buffer_in_io(bh) \
39((bh)->b_state & ((1ul << BH_Dirty) | (1ul << BH_Lock)))
40
41static int aspace_get_block(struct inode *inode, sector_t lblock,
42 struct buffer_head *bh_result, int create)
43{
44 gfs2_assert_warn(inode->i_sb->s_fs_info, 0);
45 return -EOPNOTSUPP;
46}
47
48static int gfs2_aspace_writepage(struct page *page,
49 struct writeback_control *wbc)
50{
51 return block_write_full_page(page, aspace_get_block, wbc);
52}
53
54static const struct address_space_operations aspace_aops = {
55 .writepage = gfs2_aspace_writepage,
56 .releasepage = gfs2_releasepage,
57};
58
59/**
60 * gfs2_aspace_get - Create and initialize a struct inode to hold an address space
61 * @sdp: the filesystem the aspace is in
62 *
63 * Right now a struct inode is just a struct inode. Maybe Linux
64 * will supply a more lightweight address space construct (that works)
65 * in the future.
66 *
67 * Make sure pages/buffers in this aspace aren't in high memory.
68 *
69 * Returns: the aspace
70 */
71
72struct inode *gfs2_aspace_get(struct gfs2_sbd *sdp)
73{
74 struct inode *aspace;
75
76 aspace = new_inode(sdp->sd_vfs);
77 if (aspace) {
78 mapping_set_gfp_mask(aspace->i_mapping, GFP_NOFS);
79 aspace->i_mapping->a_ops = &aspace_aops;
80 aspace->i_size = ~0ULL;
81 aspace->u.generic_ip = NULL;
82 insert_inode_hash(aspace);
83 }
84 return aspace;
85}
86
87void gfs2_aspace_put(struct inode *aspace)
88{
89 remove_inode_hash(aspace);
90 iput(aspace);
91}
92
93/**
94 * gfs2_ail1_start_one - Start I/O on a part of the AIL
95 * @sdp: the filesystem
96 * @ai: the AIL entry
97 *
98 */
99
100void gfs2_ail1_start_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
101{
102 struct gfs2_bufdata *bd, *s;
103 struct buffer_head *bh;
104 int retry;
105
106 BUG_ON(!spin_is_locked(&sdp->sd_log_lock));
107
108 do {
109 retry = 0;
110
111 list_for_each_entry_safe_reverse(bd, s, &ai->ai_ail1_list,
112 bd_ail_st_list) {
113 bh = bd->bd_bh;
114
115 gfs2_assert(sdp, bd->bd_ail == ai);
116
117 if (!buffer_busy(bh)) {
118 if (!buffer_uptodate(bh)) {
119 gfs2_log_unlock(sdp);
120 gfs2_io_error_bh(sdp, bh);
121 gfs2_log_lock(sdp);
122 }
123 list_move(&bd->bd_ail_st_list,
124 &ai->ai_ail2_list);
125 continue;
126 }
127
128 if (!buffer_dirty(bh))
129 continue;
130
131 list_move(&bd->bd_ail_st_list, &ai->ai_ail1_list);
132
133 gfs2_log_unlock(sdp);
134 wait_on_buffer(bh);
135 ll_rw_block(WRITE, 1, &bh);
136 gfs2_log_lock(sdp);
137
138 retry = 1;
139 break;
140 }
141 } while (retry);
142}
143
144/**
145 * gfs2_ail1_empty_one - Check whether or not a trans in the AIL has been synced
146 * @sdp: the filesystem
147 * @ai: the AIL entry
148 *
149 */
150
151int gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai, int flags)
152{
153 struct gfs2_bufdata *bd, *s;
154 struct buffer_head *bh;
155
156 list_for_each_entry_safe_reverse(bd, s, &ai->ai_ail1_list,
157 bd_ail_st_list) {
158 bh = bd->bd_bh;
159
160 gfs2_assert(sdp, bd->bd_ail == ai);
161
162 if (buffer_busy(bh)) {
163 if (flags & DIO_ALL)
164 continue;
165 else
166 break;
167 }
168
169 if (!buffer_uptodate(bh))
170 gfs2_io_error_bh(sdp, bh);
171
172 list_move(&bd->bd_ail_st_list, &ai->ai_ail2_list);
173 }
174
175 return list_empty(&ai->ai_ail1_list);
176}
177
178/**
179 * gfs2_ail2_empty_one - Remove all remaining buffers from an AIL entry
180 * @sdp: the filesystem
181 * @ai: the AIL entry
182 *
183 */
184
185void gfs2_ail2_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
186{
187 struct list_head *head = &ai->ai_ail2_list;
188 struct gfs2_bufdata *bd;
189
190 while (!list_empty(head)) {
191 bd = list_entry(head->prev, struct gfs2_bufdata,
192 bd_ail_st_list);
193 gfs2_assert(sdp, bd->bd_ail == ai);
194 bd->bd_ail = NULL;
195 list_del(&bd->bd_ail_st_list);
196 list_del(&bd->bd_ail_gl_list);
197 atomic_dec(&bd->bd_gl->gl_ail_count);
198 brelse(bd->bd_bh);
199 }
200}
201
202/**
203 * gfs2_ail_empty_gl - remove all buffers for a given lock from the AIL
204 * @gl: the glock
205 *
206 * None of the buffers should be dirty, locked, or pinned.
207 */
208
209void gfs2_ail_empty_gl(struct gfs2_glock *gl)
210{
211 struct gfs2_sbd *sdp = gl->gl_sbd;
212 unsigned int blocks;
213 struct list_head *head = &gl->gl_ail_list;
214 struct gfs2_bufdata *bd;
215 struct buffer_head *bh;
216 uint64_t blkno;
217 int error;
218
219 blocks = atomic_read(&gl->gl_ail_count);
220 if (!blocks)
221 return;
222
223 error = gfs2_trans_begin(sdp, 0, blocks);
224 if (gfs2_assert_withdraw(sdp, !error))
225 return;
226
227 gfs2_log_lock(sdp);
228 while (!list_empty(head)) {
229 bd = list_entry(head->next, struct gfs2_bufdata,
230 bd_ail_gl_list);
231 bh = bd->bd_bh;
232 blkno = bh->b_blocknr;
233 gfs2_assert_withdraw(sdp, !buffer_busy(bh));
234
235 bd->bd_ail = NULL;
236 list_del(&bd->bd_ail_st_list);
237 list_del(&bd->bd_ail_gl_list);
238 atomic_dec(&gl->gl_ail_count);
239 brelse(bh);
240 gfs2_log_unlock(sdp);
241
242 gfs2_trans_add_revoke(sdp, blkno);
243
244 gfs2_log_lock(sdp);
245 }
246 gfs2_assert_withdraw(sdp, !atomic_read(&gl->gl_ail_count));
247 gfs2_log_unlock(sdp);
248
249 gfs2_trans_end(sdp);
250 gfs2_log_flush(sdp, NULL);
251}
252
253/**
254 * gfs2_meta_inval - Invalidate all buffers associated with a glock
255 * @gl: the glock
256 *
257 */
258
259void gfs2_meta_inval(struct gfs2_glock *gl)
260{
261 struct gfs2_sbd *sdp = gl->gl_sbd;
262 struct inode *aspace = gl->gl_aspace;
263 struct address_space *mapping = gl->gl_aspace->i_mapping;
264
265 gfs2_assert_withdraw(sdp, !atomic_read(&gl->gl_ail_count));
266
267 atomic_inc(&aspace->i_writecount);
268 truncate_inode_pages(mapping, 0);
269 atomic_dec(&aspace->i_writecount);
270
271 gfs2_assert_withdraw(sdp, !mapping->nrpages);
272}
273
274/**
275 * gfs2_meta_sync - Sync all buffers associated with a glock
276 * @gl: The glock
277 * @flags: DIO_START | DIO_WAIT
278 *
279 */
280
281void gfs2_meta_sync(struct gfs2_glock *gl, int flags)
282{
283 struct address_space *mapping = gl->gl_aspace->i_mapping;
284 int error = 0;
285
286 if (flags & DIO_START)
287 filemap_fdatawrite(mapping);
288 if (!error && (flags & DIO_WAIT))
289 error = filemap_fdatawait(mapping);
290
291 if (error)
292 gfs2_io_error(gl->gl_sbd);
293}
294
295/**
296 * getbuf - Get a buffer with a given address space
297 * @sdp: the filesystem
298 * @aspace: the address space
299 * @blkno: the block number (filesystem scope)
300 * @create: 1 if the buffer should be created
301 *
302 * Returns: the buffer
303 */
304
305static struct buffer_head *getbuf(struct gfs2_sbd *sdp, struct inode *aspace,
306 uint64_t blkno, int create)
307{
308 struct page *page;
309 struct buffer_head *bh;
310 unsigned int shift;
311 unsigned long index;
312 unsigned int bufnum;
313
314 shift = PAGE_CACHE_SHIFT - sdp->sd_sb.sb_bsize_shift;
315 index = blkno >> shift; /* convert block to page */
316 bufnum = blkno - (index << shift); /* block buf index within page */
317
318 if (create) {
319 for (;;) {
320 page = grab_cache_page(aspace->i_mapping, index);
321 if (page)
322 break;
323 yield();
324 }
325 } else {
326 page = find_lock_page(aspace->i_mapping, index);
327 if (!page)
328 return NULL;
329 }
330
331 if (!page_has_buffers(page))
332 create_empty_buffers(page, sdp->sd_sb.sb_bsize, 0);
333
334 /* Locate header for our buffer within our page */
335 for (bh = page_buffers(page); bufnum--; bh = bh->b_this_page)
336 /* Do nothing */;
337 get_bh(bh);
338
339 if (!buffer_mapped(bh))
340 map_bh(bh, sdp->sd_vfs, blkno);
341
342 unlock_page(page);
343 mark_page_accessed(page);
344 page_cache_release(page);
345
346 return bh;
347}
348
349static void meta_prep_new(struct buffer_head *bh)
350{
351 struct gfs2_meta_header *mh = (struct gfs2_meta_header *)bh->b_data;
352
353 lock_buffer(bh);
354 clear_buffer_dirty(bh);
355 set_buffer_uptodate(bh);
356 unlock_buffer(bh);
357
358 mh->mh_magic = cpu_to_be32(GFS2_MAGIC);
359}
360
361/**
362 * gfs2_meta_new - Get a block
363 * @gl: The glock associated with this block
364 * @blkno: The block number
365 *
366 * Returns: The buffer
367 */
368
369struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, uint64_t blkno)
370{
371 struct buffer_head *bh;
372 bh = getbuf(gl->gl_sbd, gl->gl_aspace, blkno, CREATE);
373 meta_prep_new(bh);
374 return bh;
375}
376
377/**
378 * gfs2_meta_read - Read a block from disk
379 * @gl: The glock covering the block
380 * @blkno: The block number
381 * @flags: flags to gfs2_meta_reread()
382 * @bhp: the place where the buffer is returned (NULL on failure)
383 *
384 * Returns: errno
385 */
386
387int gfs2_meta_read(struct gfs2_glock *gl, uint64_t blkno, int flags,
388 struct buffer_head **bhp)
389{
390 int error;
391
392 *bhp = getbuf(gl->gl_sbd, gl->gl_aspace, blkno, CREATE);
393 error = gfs2_meta_reread(gl->gl_sbd, *bhp, flags);
394 if (error)
395 brelse(*bhp);
396
397 return error;
398}
399
400/**
401 * gfs2_meta_reread - Reread a block from disk
402 * @sdp: the filesystem
403 * @bh: The block to read
404 * @flags: Flags that control the read
405 *
406 * Returns: errno
407 */
408
409int gfs2_meta_reread(struct gfs2_sbd *sdp, struct buffer_head *bh, int flags)
410{
411 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
412 return -EIO;
413
414 if (flags & DIO_FORCE)
415 clear_buffer_uptodate(bh);
416
417 if ((flags & DIO_START) && !buffer_uptodate(bh))
418 ll_rw_block(READ, 1, &bh);
419
420 if (flags & DIO_WAIT) {
421 wait_on_buffer(bh);
422
423 if (!buffer_uptodate(bh)) {
424 struct gfs2_trans *tr = current->journal_info;
425 if (tr && tr->tr_touched)
426 gfs2_io_error_bh(sdp, bh);
427 return -EIO;
428 }
429 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
430 return -EIO;
431 }
432
433 return 0;
434}
435
436/**
437 * gfs2_attach_bufdata - attach a struct gfs2_bufdata structure to a buffer
438 * @gl: the glock the buffer belongs to
439 * @bh: The buffer to be attached to
440 * @meta: Flag to indicate whether its metadata or not
441 */
442
443void gfs2_attach_bufdata(struct gfs2_glock *gl, struct buffer_head *bh,
444 int meta)
445{
446 struct gfs2_bufdata *bd;
447
448 if (meta)
449 lock_page(bh->b_page);
450
451 if (bh->b_private) {
452 if (meta)
453 unlock_page(bh->b_page);
454 return;
455 }
456
457 bd = kmem_cache_alloc(gfs2_bufdata_cachep, GFP_NOFS | __GFP_NOFAIL);
458 memset(bd, 0, sizeof(struct gfs2_bufdata));
459 bd->bd_bh = bh;
460 bd->bd_gl = gl;
461
462 INIT_LIST_HEAD(&bd->bd_list_tr);
463 if (meta) {
464 lops_init_le(&bd->bd_le, &gfs2_buf_lops);
465 } else {
466 lops_init_le(&bd->bd_le, &gfs2_databuf_lops);
467 get_bh(bh);
468 }
469 bh->b_private = bd;
470
471 if (meta)
472 unlock_page(bh->b_page);
473}
474
475/**
476 * gfs2_pin - Pin a buffer in memory
477 * @sdp: the filesystem the buffer belongs to
478 * @bh: The buffer to be pinned
479 *
480 */
481
482void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh)
483{
484 struct gfs2_bufdata *bd = bh->b_private;
485
486 gfs2_assert_withdraw(sdp, test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags));
487
488 if (test_set_buffer_pinned(bh))
489 gfs2_assert_withdraw(sdp, 0);
490
491 wait_on_buffer(bh);
492
493 /* If this buffer is in the AIL and it has already been written
494 to its in-place disk block, remove it from the AIL. */
495
496 gfs2_log_lock(sdp);
497 if (bd->bd_ail && !buffer_in_io(bh))
498 list_move(&bd->bd_ail_st_list, &bd->bd_ail->ai_ail2_list);
499 gfs2_log_unlock(sdp);
500
501 clear_buffer_dirty(bh);
502 wait_on_buffer(bh);
503
504 if (!buffer_uptodate(bh))
505 gfs2_io_error_bh(sdp, bh);
506
507 get_bh(bh);
508}
509
510/**
511 * gfs2_unpin - Unpin a buffer
512 * @sdp: the filesystem the buffer belongs to
513 * @bh: The buffer to unpin
514 * @ai: the AIL entry to attach the buffer to
515 *
516 */
517
518void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh,
519 struct gfs2_ail *ai)
520{
521 struct gfs2_bufdata *bd = bh->b_private;
522
523 gfs2_assert_withdraw(sdp, buffer_uptodate(bh));
524
525 if (!buffer_pinned(bh))
526 gfs2_assert_withdraw(sdp, 0);
527
528 mark_buffer_dirty(bh);
529 clear_buffer_pinned(bh);
530
531 gfs2_log_lock(sdp);
532 if (bd->bd_ail) {
533 list_del(&bd->bd_ail_st_list);
534 brelse(bh);
535 } else {
536 struct gfs2_glock *gl = bd->bd_gl;
537 list_add(&bd->bd_ail_gl_list, &gl->gl_ail_list);
538 atomic_inc(&gl->gl_ail_count);
539 }
540 bd->bd_ail = ai;
541 list_add(&bd->bd_ail_st_list, &ai->ai_ail1_list);
542 gfs2_log_unlock(sdp);
543}
544
545/**
546 * gfs2_meta_wipe - make sure an inode's buffers are no longer dirty or pinned
547 * @ip: the inode who owns the buffers
548 * @bstart: the first buffer in the run
549 * @blen: the number of buffers in the run
550 *
551 */
552
553void gfs2_meta_wipe(struct gfs2_inode *ip, uint64_t bstart, uint32_t blen)
554{
555 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
556 struct inode *aspace = ip->i_gl->gl_aspace;
557 struct buffer_head *bh;
558
559 while (blen) {
560 bh = getbuf(sdp, aspace, bstart, NO_CREATE);
561 if (bh) {
562 struct gfs2_bufdata *bd = bh->b_private;
563
564 if (test_clear_buffer_pinned(bh)) {
565 struct gfs2_trans *tr = current->journal_info;
566 gfs2_log_lock(sdp);
567 list_del_init(&bd->bd_le.le_list);
568 gfs2_assert_warn(sdp, sdp->sd_log_num_buf);
569 sdp->sd_log_num_buf--;
570 gfs2_log_unlock(sdp);
571 tr->tr_num_buf_rm++;
572 brelse(bh);
573 }
574 if (bd) {
575 gfs2_log_lock(sdp);
576 if (bd->bd_ail) {
577 uint64_t blkno = bh->b_blocknr;
578 bd->bd_ail = NULL;
579 list_del(&bd->bd_ail_st_list);
580 list_del(&bd->bd_ail_gl_list);
581 atomic_dec(&bd->bd_gl->gl_ail_count);
582 brelse(bh);
583 gfs2_log_unlock(sdp);
584 gfs2_trans_add_revoke(sdp, blkno);
585 } else
586 gfs2_log_unlock(sdp);
587 }
588
589 lock_buffer(bh);
590 clear_buffer_dirty(bh);
591 clear_buffer_uptodate(bh);
592 unlock_buffer(bh);
593
594 brelse(bh);
595 }
596
597 bstart++;
598 blen--;
599 }
600}
601
602/**
603 * gfs2_meta_cache_flush - get rid of any references on buffers for this inode
604 * @ip: The GFS2 inode
605 *
606 * This releases buffers that are in the most-recently-used array of
607 * blocks used for indirect block addressing for this inode.
608 */
609
610void gfs2_meta_cache_flush(struct gfs2_inode *ip)
611{
612 struct buffer_head **bh_slot;
613 unsigned int x;
614
615 spin_lock(&ip->i_spin);
616
617 for (x = 0; x < GFS2_MAX_META_HEIGHT; x++) {
618 bh_slot = &ip->i_cache[x];
619 if (!*bh_slot)
620 break;
621 brelse(*bh_slot);
622 *bh_slot = NULL;
623 }
624
625 spin_unlock(&ip->i_spin);
626}
627
628/**
629 * gfs2_meta_indirect_buffer - Get a metadata buffer
630 * @ip: The GFS2 inode
631 * @height: The level of this buf in the metadata (indir addr) tree (if any)
632 * @num: The block number (device relative) of the buffer
633 * @new: Non-zero if we may create a new buffer
634 * @bhp: the buffer is returned here
635 *
636 * Try to use the gfs2_inode's MRU metadata tree cache.
637 *
638 * Returns: errno
639 */
640
641int gfs2_meta_indirect_buffer(struct gfs2_inode *ip, int height, uint64_t num,
642 int new, struct buffer_head **bhp)
643{
644 struct buffer_head *bh, **bh_slot = ip->i_cache + height;
645 int error;
646
647 spin_lock(&ip->i_spin);
648 bh = *bh_slot;
649 if (bh) {
650 if (bh->b_blocknr == num)
651 get_bh(bh);
652 else
653 bh = NULL;
654 }
655 spin_unlock(&ip->i_spin);
656
657 if (bh) {
658 if (new)
659 meta_prep_new(bh);
660 else {
661 error = gfs2_meta_reread(GFS2_SB(&ip->i_inode), bh,
662 DIO_START | DIO_WAIT);
663 if (error) {
664 brelse(bh);
665 return error;
666 }
667 }
668 } else {
669 if (new)
670 bh = gfs2_meta_new(ip->i_gl, num);
671 else {
672 error = gfs2_meta_read(ip->i_gl, num,
673 DIO_START | DIO_WAIT, &bh);
674 if (error)
675 return error;
676 }
677
678 spin_lock(&ip->i_spin);
679 if (*bh_slot != bh) {
680 brelse(*bh_slot);
681 *bh_slot = bh;
682 get_bh(bh);
683 }
684 spin_unlock(&ip->i_spin);
685 }
686
687 if (new) {
688 if (gfs2_assert_warn(GFS2_SB(&ip->i_inode), height)) {
689 brelse(bh);
690 return -EIO;
691 }
692 gfs2_trans_add_bh(ip->i_gl, bh, 1);
693 gfs2_metatype_set(bh, GFS2_METATYPE_IN, GFS2_FORMAT_IN);
694 gfs2_buffer_clear_tail(bh, sizeof(struct gfs2_meta_header));
695
696 } else if (gfs2_metatype_check(GFS2_SB(&ip->i_inode), bh,
697 (height) ? GFS2_METATYPE_IN : GFS2_METATYPE_DI)) {
698 brelse(bh);
699 return -EIO;
700 }
701
702 *bhp = bh;
703
704 return 0;
705}
706
707/**
708 * gfs2_meta_ra - start readahead on an extent of a file
709 * @gl: the glock the blocks belong to
710 * @dblock: the starting disk block
711 * @extlen: the number of blocks in the extent
712 *
713 */
714
715void gfs2_meta_ra(struct gfs2_glock *gl, uint64_t dblock, uint32_t extlen)
716{
717 struct gfs2_sbd *sdp = gl->gl_sbd;
718 struct inode *aspace = gl->gl_aspace;
719 struct buffer_head *first_bh, *bh;
720 uint32_t max_ra = gfs2_tune_get(sdp, gt_max_readahead) >>
721 sdp->sd_sb.sb_bsize_shift;
722 int error;
723
724 if (!extlen || !max_ra)
725 return;
726 if (extlen > max_ra)
727 extlen = max_ra;
728
729 first_bh = getbuf(sdp, aspace, dblock, CREATE);
730
731 if (buffer_uptodate(first_bh))
732 goto out;
733 if (!buffer_locked(first_bh)) {
734 error = gfs2_meta_reread(sdp, first_bh, DIO_START);
735 if (error)
736 goto out;
737 }
738
739 dblock++;
740 extlen--;
741
742 while (extlen) {
743 bh = getbuf(sdp, aspace, dblock, CREATE);
744
745 if (!buffer_uptodate(bh) && !buffer_locked(bh)) {
746 error = gfs2_meta_reread(sdp, bh, DIO_START);
747 brelse(bh);
748 if (error)
749 goto out;
750 } else
751 brelse(bh);
752
753 dblock++;
754 extlen--;
755
756 if (buffer_uptodate(first_bh))
757 break;
758 }
759
760 out:
761 brelse(first_bh);
762}
763
764/**
765 * gfs2_meta_syncfs - sync all the buffers in a filesystem
766 * @sdp: the filesystem
767 *
768 */
769
770void gfs2_meta_syncfs(struct gfs2_sbd *sdp)
771{
772 gfs2_log_flush(sdp, NULL);
773 for (;;) {
774 gfs2_ail1_start(sdp, DIO_ALL);
775 if (gfs2_ail1_empty(sdp, DIO_ALL))
776 break;
777 msleep(10);
778 }
779}
780
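getbuf() locates the page holding a metadata block by shifting the block number down by (PAGE_CACHE_SHIFT - sb_bsize_shift); the remainder selects the buffer head within that page. A worked version of just that arithmetic, assuming 4096-byte pages and 1024-byte filesystem blocks:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	const unsigned int page_shift = 12;	/* 4096-byte pages (assumed) */
	const unsigned int bsize_shift = 10;	/* 1024-byte fs blocks (assumed) */
	const unsigned int shift = page_shift - bsize_shift;	/* 2: four blocks per page */

	uint64_t blkno = 4103;

	unsigned long index = blkno >> shift;		/* page index in the aspace */
	unsigned int bufnum = blkno - (index << shift);	/* buffer slot within the page */

	/* 4103 >> 2 = 1025, 1025 << 2 = 4100, so bufnum = 3 */
	printf("block %llu -> page %lu, buffer %u of %u\n",
	       (unsigned long long)blkno, index, bufnum, 1u << shift);
	return 0;
}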
diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h
new file mode 100644
index 000000000000..23c6a596fd9e
--- /dev/null
+++ b/fs/gfs2/meta_io.h
@@ -0,0 +1,89 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __DIO_DOT_H__
11#define __DIO_DOT_H__
12
13static inline void gfs2_buffer_clear(struct buffer_head *bh)
14{
15 memset(bh->b_data, 0, bh->b_size);
16}
17
18static inline void gfs2_buffer_clear_tail(struct buffer_head *bh, int head)
19{
20 memset(bh->b_data + head, 0, bh->b_size - head);
21}
22
23static inline void gfs2_buffer_clear_ends(struct buffer_head *bh, int offset,
24 int amount, int journaled)
25{
26 int z_off1 = (journaled) ? sizeof(struct gfs2_meta_header) : 0;
27 int z_len1 = offset - z_off1;
28 int z_off2 = offset + amount;
29 int z_len2 = (bh)->b_size - z_off2;
30
31 if (z_len1)
32 memset(bh->b_data + z_off1, 0, z_len1);
33
34 if (z_len2)
35 memset(bh->b_data + z_off2, 0, z_len2);
36}
37
38static inline void gfs2_buffer_copy_tail(struct buffer_head *to_bh,
39 int to_head,
40 struct buffer_head *from_bh,
41 int from_head)
42{
43 memcpy(to_bh->b_data + to_head,
44 from_bh->b_data + from_head,
45 from_bh->b_size - from_head);
46 memset(to_bh->b_data + to_bh->b_size + to_head - from_head,
47 0,
48 from_head - to_head);
49}
50
51struct inode *gfs2_aspace_get(struct gfs2_sbd *sdp);
52void gfs2_aspace_put(struct inode *aspace);
53
54void gfs2_ail1_start_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai);
55int gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai, int flags);
56void gfs2_ail2_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai);
57void gfs2_ail_empty_gl(struct gfs2_glock *gl);
58
59void gfs2_meta_inval(struct gfs2_glock *gl);
60void gfs2_meta_sync(struct gfs2_glock *gl, int flags);
61
62struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, uint64_t blkno);
63int gfs2_meta_read(struct gfs2_glock *gl, uint64_t blkno,
64 int flags, struct buffer_head **bhp);
65int gfs2_meta_reread(struct gfs2_sbd *sdp, struct buffer_head *bh, int flags);
66
67void gfs2_attach_bufdata(struct gfs2_glock *gl, struct buffer_head *bh,
68 int meta);
69void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh);
70void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh,
71 struct gfs2_ail *ai);
72
73void gfs2_meta_wipe(struct gfs2_inode *ip, uint64_t bstart, uint32_t blen);
74
75void gfs2_meta_cache_flush(struct gfs2_inode *ip);
76int gfs2_meta_indirect_buffer(struct gfs2_inode *ip, int height, uint64_t num,
77 int new, struct buffer_head **bhp);
78
79static inline int gfs2_meta_inode_buffer(struct gfs2_inode *ip,
80 struct buffer_head **bhp)
81{
82 return gfs2_meta_indirect_buffer(ip, 0, ip->i_num.no_addr, 0, bhp);
83}
84
85void gfs2_meta_ra(struct gfs2_glock *gl, uint64_t dblock, uint32_t extlen);
86void gfs2_meta_syncfs(struct gfs2_sbd *sdp);
87
88#endif /* __DIO_DOT_H__ */
89
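gfs2_buffer_clear_tail() above zeroes everything in a block past a given header, which is how fresh metadata blocks are scrubbed once their gfs2_meta_header has been written. The same idea on a plain byte buffer (the sizes here are made up for the example):

#include <stdio.h>
#include <string.h>

#define BLOCK_SIZE 64		/* stand-in for sb_bsize */
#define HEADER_SIZE 16		/* stand-in for sizeof(struct gfs2_meta_header) */

/* Zero everything after the first 'head' bytes, like gfs2_buffer_clear_tail(). */
static void buffer_clear_tail(char *data, size_t size, size_t head)
{
	memset(data + head, 0, size - head);
}

int main(void)
{
	char block[BLOCK_SIZE];

	memset(block, 0xff, sizeof(block));	/* pretend this is stale data */
	buffer_clear_tail(block, sizeof(block), HEADER_SIZE);

	printf("byte %d = %d, byte %d = %d\n",
	       HEADER_SIZE - 1, (unsigned char)block[HEADER_SIZE - 1],
	       HEADER_SIZE, (unsigned char)block[HEADER_SIZE]);
	return 0;
}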
diff --git a/fs/gfs2/mount.c b/fs/gfs2/mount.c
new file mode 100644
index 000000000000..0d4b230785af
--- /dev/null
+++ b/fs/gfs2/mount.c
@@ -0,0 +1,214 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/gfs2_ondisk.h>
16
17#include "gfs2.h"
18#include "lm_interface.h"
19#include "incore.h"
20#include "mount.h"
21#include "sys.h"
22#include "util.h"
23
24/**
25 * gfs2_mount_args - Parse mount options
26 * @sdp: the filesystem
27 * @data_arg: the mount option string
28 *
29 * Returns: errno
30 */
31
32int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount)
33{
34 struct gfs2_args *args = &sdp->sd_args;
35 char *data = data_arg;
36 char *options, *o, *v;
37 int error = 0;
38
39 if (!remount) {
40 /* If someone preloaded options, use those instead */
41 spin_lock(&gfs2_sys_margs_lock);
42 if (gfs2_sys_margs) {
43 data = gfs2_sys_margs;
44 gfs2_sys_margs = NULL;
45 }
46 spin_unlock(&gfs2_sys_margs_lock);
47
48 /* Set some defaults */
49 args->ar_num_glockd = GFS2_GLOCKD_DEFAULT;
50 args->ar_quota = GFS2_QUOTA_DEFAULT;
51 args->ar_data = GFS2_DATA_DEFAULT;
52 }
53
54 /* Split the options into tokens with the "," character and
55 process them */
56
57 for (options = data; (o = strsep(&options, ",")); ) {
58 if (!*o)
59 continue;
60
61 v = strchr(o, '=');
62 if (v)
63 *v++ = 0;
64
65 if (!strcmp(o, "lockproto")) {
66 if (!v)
67 goto need_value;
68 if (remount && strcmp(v, args->ar_lockproto))
69 goto cant_remount;
70 strncpy(args->ar_lockproto, v, GFS2_LOCKNAME_LEN);
71 args->ar_lockproto[GFS2_LOCKNAME_LEN - 1] = 0;
72 }
73
74 else if (!strcmp(o, "locktable")) {
75 if (!v)
76 goto need_value;
77 if (remount && strcmp(v, args->ar_locktable))
78 goto cant_remount;
79 strncpy(args->ar_locktable, v, GFS2_LOCKNAME_LEN);
80 args->ar_locktable[GFS2_LOCKNAME_LEN - 1] = 0;
81 }
82
83 else if (!strcmp(o, "hostdata")) {
84 if (!v)
85 goto need_value;
86 if (remount && strcmp(v, args->ar_hostdata))
87 goto cant_remount;
88 strncpy(args->ar_hostdata, v, GFS2_LOCKNAME_LEN);
89 args->ar_hostdata[GFS2_LOCKNAME_LEN - 1] = 0;
90 }
91
92 else if (!strcmp(o, "spectator")) {
93 if (remount && !args->ar_spectator)
94 goto cant_remount;
95 args->ar_spectator = 1;
96 sdp->sd_vfs->s_flags |= MS_RDONLY;
97 }
98
99 else if (!strcmp(o, "ignore_local_fs")) {
100 if (remount && !args->ar_ignore_local_fs)
101 goto cant_remount;
102 args->ar_ignore_local_fs = 1;
103 }
104
105 else if (!strcmp(o, "localflocks")) {
106 if (remount && !args->ar_localflocks)
107 goto cant_remount;
108 args->ar_localflocks = 1;
109 }
110
111 else if (!strcmp(o, "localcaching")) {
112 if (remount && !args->ar_localcaching)
113 goto cant_remount;
114 args->ar_localcaching = 1;
115 }
116
117 else if (!strcmp(o, "debug"))
118 args->ar_debug = 1;
119
120 else if (!strcmp(o, "nodebug"))
121 args->ar_debug = 0;
122
123 else if (!strcmp(o, "upgrade")) {
124 if (remount && !args->ar_upgrade)
125 goto cant_remount;
126 args->ar_upgrade = 1;
127 }
128
129 else if (!strcmp(o, "num_glockd")) {
130 unsigned int x;
131 if (!v)
132 goto need_value;
133 sscanf(v, "%u", &x);
134 if (remount && x != args->ar_num_glockd)
135 goto cant_remount;
136 if (!x || x > GFS2_GLOCKD_MAX) {
137 fs_info(sdp, "0 < num_glockd <= %u (not %u)\n",
138 GFS2_GLOCKD_MAX, x);
139 error = -EINVAL;
140 break;
141 }
142 args->ar_num_glockd = x;
143 }
144
145 else if (!strcmp(o, "acl")) {
146 args->ar_posix_acl = 1;
147 sdp->sd_vfs->s_flags |= MS_POSIXACL;
148 }
149
150 else if (!strcmp(o, "noacl")) {
151 args->ar_posix_acl = 0;
152 sdp->sd_vfs->s_flags &= ~MS_POSIXACL;
153 }
154
155 else if (!strcmp(o, "quota")) {
156 if (!v)
157 goto need_value;
158 if (!strcmp(v, "off"))
159 args->ar_quota = GFS2_QUOTA_OFF;
160 else if (!strcmp(v, "account"))
161 args->ar_quota = GFS2_QUOTA_ACCOUNT;
162 else if (!strcmp(v, "on"))
163 args->ar_quota = GFS2_QUOTA_ON;
164 else {
165 fs_info(sdp, "invalid value for quota\n");
166 error = -EINVAL;
167 break;
168 }
169 }
170
171 else if (!strcmp(o, "suiddir"))
172 args->ar_suiddir = 1;
173
174 else if (!strcmp(o, "nosuiddir"))
175 args->ar_suiddir = 0;
176
177 else if (!strcmp(o, "data")) {
178 if (!v)
179 goto need_value;
180 if (!strcmp(v, "writeback"))
181 args->ar_data = GFS2_DATA_WRITEBACK;
182 else if (!strcmp(v, "ordered"))
183 args->ar_data = GFS2_DATA_ORDERED;
184 else {
185 fs_info(sdp, "invalid value for data\n");
186 error = -EINVAL;
187 break;
188 }
189 }
190
191 else {
192 fs_info(sdp, "unknown option: %s\n", o);
193 error = -EINVAL;
194 break;
195 }
196 }
197
198 if (error)
199 fs_info(sdp, "invalid mount option(s)\n");
200
201 if (data != data_arg)
202 kfree(data);
203
204 return error;
205
206 need_value:
207 fs_info(sdp, "need value for option %s\n", o);
208 return -EINVAL;
209
210 cant_remount:
211 fs_info(sdp, "can't remount with option %s\n", o);
212 return -EINVAL;
213}
214
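gfs2_mount_args() tokenizes the option string with strsep() on ',' and then splits each token in place at '=' into a key and an optional value. The same loop in isolation as a user-space sketch (strsep() is a BSD/glibc extension, and the option names are invented):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
	char *data = strdup("lockproto=lock_dlm,debug,quota=on");
	char *options, *o, *v;

	for (options = data; (o = strsep(&options, ",")); ) {
		if (!*o)
			continue;	/* skip empty tokens such as ",," */

		v = strchr(o, '=');
		if (v)
			*v++ = 0;	/* split "key=value" in place */

		printf("option '%s' value '%s'\n", o, v ? v : "(none)");
	}

	free(data);
	return 0;
}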
diff --git a/fs/gfs2/mount.h b/fs/gfs2/mount.h
new file mode 100644
index 000000000000..2eb14722144f
--- /dev/null
+++ b/fs/gfs2/mount.h
@@ -0,0 +1,15 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __MOUNT_DOT_H__
11#define __MOUNT_DOT_H__
12
13int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount);
14
15#endif /* __MOUNT_DOT_H__ */
diff --git a/fs/gfs2/ondisk.c b/fs/gfs2/ondisk.c
new file mode 100644
index 000000000000..39c7f0345fc6
--- /dev/null
+++ b/fs/gfs2/ondisk.c
@@ -0,0 +1,308 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15
16#include "gfs2.h"
17#include <linux/gfs2_ondisk.h>
18
19#define pv(struct, member, fmt) printk(KERN_INFO " "#member" = "fmt"\n", \
20 struct->member);
21
22/*
23 * gfs2_xxx_in - read in an xxx struct
24 * first arg: the cpu-order structure
25 * buf: the disk-order buffer
26 *
27 * gfs2_xxx_out - write out an xxx struct
28 * first arg: the cpu-order structure
29 * buf: the disk-order buffer
30 *
31 * gfs2_xxx_print - print out an xxx struct
32 * first arg: the cpu-order structure
33 */
34
35void gfs2_inum_in(struct gfs2_inum *no, char *buf)
36{
37 struct gfs2_inum *str = (struct gfs2_inum *)buf;
38
39 no->no_formal_ino = be64_to_cpu(str->no_formal_ino);
40 no->no_addr = be64_to_cpu(str->no_addr);
41}
42
43void gfs2_inum_out(const struct gfs2_inum *no, char *buf)
44{
45 struct gfs2_inum *str = (struct gfs2_inum *)buf;
46
47 str->no_formal_ino = cpu_to_be64(no->no_formal_ino);
48 str->no_addr = cpu_to_be64(no->no_addr);
49}
50
51static void gfs2_inum_print(struct gfs2_inum *no)
52{
53 printk(KERN_INFO " no_formal_ino = %llu\n", (unsigned long long)no->no_formal_ino);
54 printk(KERN_INFO " no_addr = %llu\n", (unsigned long long)no->no_addr);
55}
56
57static void gfs2_meta_header_in(struct gfs2_meta_header *mh, char *buf)
58{
59 struct gfs2_meta_header *str = (struct gfs2_meta_header *)buf;
60
61 mh->mh_magic = be32_to_cpu(str->mh_magic);
62 mh->mh_type = be32_to_cpu(str->mh_type);
63 mh->mh_format = be32_to_cpu(str->mh_format);
64}
65
66static void gfs2_meta_header_out(struct gfs2_meta_header *mh, char *buf)
67{
68 struct gfs2_meta_header *str = (struct gfs2_meta_header *)buf;
69
70 str->mh_magic = cpu_to_be32(mh->mh_magic);
71 str->mh_type = cpu_to_be32(mh->mh_type);
72 str->mh_format = cpu_to_be32(mh->mh_format);
73}
74
75static void gfs2_meta_header_print(struct gfs2_meta_header *mh)
76{
77 pv(mh, mh_magic, "0x%.8X");
78 pv(mh, mh_type, "%u");
79 pv(mh, mh_format, "%u");
80}
81
82void gfs2_sb_in(struct gfs2_sb *sb, char *buf)
83{
84 struct gfs2_sb *str = (struct gfs2_sb *)buf;
85
86 gfs2_meta_header_in(&sb->sb_header, buf);
87
88 sb->sb_fs_format = be32_to_cpu(str->sb_fs_format);
89 sb->sb_multihost_format = be32_to_cpu(str->sb_multihost_format);
90 sb->sb_bsize = be32_to_cpu(str->sb_bsize);
91 sb->sb_bsize_shift = be32_to_cpu(str->sb_bsize_shift);
92
93 gfs2_inum_in(&sb->sb_master_dir, (char *)&str->sb_master_dir);
94 gfs2_inum_in(&sb->sb_root_dir, (char *)&str->sb_root_dir);
95
96 memcpy(sb->sb_lockproto, str->sb_lockproto, GFS2_LOCKNAME_LEN);
97 memcpy(sb->sb_locktable, str->sb_locktable, GFS2_LOCKNAME_LEN);
98}
99
100void gfs2_rindex_in(struct gfs2_rindex *ri, char *buf)
101{
102 struct gfs2_rindex *str = (struct gfs2_rindex *)buf;
103
104 ri->ri_addr = be64_to_cpu(str->ri_addr);
105 ri->ri_length = be32_to_cpu(str->ri_length);
106 ri->ri_data0 = be64_to_cpu(str->ri_data0);
107 ri->ri_data = be32_to_cpu(str->ri_data);
108 ri->ri_bitbytes = be32_to_cpu(str->ri_bitbytes);
109
110}
111
112void gfs2_rindex_print(struct gfs2_rindex *ri)
113{
114 printk(KERN_INFO " ri_addr = %llu\n", (unsigned long long)ri->ri_addr);
115 pv(ri, ri_length, "%u");
116
117 printk(KERN_INFO " ri_data0 = %llu\n", (unsigned long long)ri->ri_data0);
118 pv(ri, ri_data, "%u");
119
120 pv(ri, ri_bitbytes, "%u");
121}
122
123void gfs2_rgrp_in(struct gfs2_rgrp *rg, char *buf)
124{
125 struct gfs2_rgrp *str = (struct gfs2_rgrp *)buf;
126
127 gfs2_meta_header_in(&rg->rg_header, buf);
128 rg->rg_flags = be32_to_cpu(str->rg_flags);
129 rg->rg_free = be32_to_cpu(str->rg_free);
130 rg->rg_dinodes = be32_to_cpu(str->rg_dinodes);
131 rg->rg_igeneration = be64_to_cpu(str->rg_igeneration);
132}
133
134void gfs2_rgrp_out(struct gfs2_rgrp *rg, char *buf)
135{
136 struct gfs2_rgrp *str = (struct gfs2_rgrp *)buf;
137
138 gfs2_meta_header_out(&rg->rg_header, buf);
139 str->rg_flags = cpu_to_be32(rg->rg_flags);
140 str->rg_free = cpu_to_be32(rg->rg_free);
141 str->rg_dinodes = cpu_to_be32(rg->rg_dinodes);
142 str->__pad = cpu_to_be32(0);
143 str->rg_igeneration = cpu_to_be64(rg->rg_igeneration);
144 memset(&str->rg_reserved, 0, sizeof(str->rg_reserved));
145}
146
147void gfs2_quota_in(struct gfs2_quota *qu, char *buf)
148{
149 struct gfs2_quota *str = (struct gfs2_quota *)buf;
150
151 qu->qu_limit = be64_to_cpu(str->qu_limit);
152 qu->qu_warn = be64_to_cpu(str->qu_warn);
153 qu->qu_value = be64_to_cpu(str->qu_value);
154}
155
156void gfs2_dinode_in(struct gfs2_dinode *di, char *buf)
157{
158 struct gfs2_dinode *str = (struct gfs2_dinode *)buf;
159
160 gfs2_meta_header_in(&di->di_header, buf);
161 gfs2_inum_in(&di->di_num, (char *)&str->di_num);
162
163 di->di_mode = be32_to_cpu(str->di_mode);
164 di->di_uid = be32_to_cpu(str->di_uid);
165 di->di_gid = be32_to_cpu(str->di_gid);
166 di->di_nlink = be32_to_cpu(str->di_nlink);
167 di->di_size = be64_to_cpu(str->di_size);
168 di->di_blocks = be64_to_cpu(str->di_blocks);
169 di->di_atime = be64_to_cpu(str->di_atime);
170 di->di_mtime = be64_to_cpu(str->di_mtime);
171 di->di_ctime = be64_to_cpu(str->di_ctime);
172 di->di_major = be32_to_cpu(str->di_major);
173 di->di_minor = be32_to_cpu(str->di_minor);
174
175 di->di_goal_meta = be64_to_cpu(str->di_goal_meta);
176 di->di_goal_data = be64_to_cpu(str->di_goal_data);
177 di->di_generation = be64_to_cpu(str->di_generation);
178
179 di->di_flags = be32_to_cpu(str->di_flags);
180 di->di_payload_format = be32_to_cpu(str->di_payload_format);
181 di->di_height = be16_to_cpu(str->di_height);
182
183 di->di_depth = be16_to_cpu(str->di_depth);
184 di->di_entries = be32_to_cpu(str->di_entries);
185
186 di->di_eattr = be64_to_cpu(str->di_eattr);
187
188}
189
190void gfs2_dinode_out(struct gfs2_dinode *di, char *buf)
191{
192 struct gfs2_dinode *str = (struct gfs2_dinode *)buf;
193
194 gfs2_meta_header_out(&di->di_header, buf);
195 gfs2_inum_out(&di->di_num, (char *)&str->di_num);
196
197 str->di_mode = cpu_to_be32(di->di_mode);
198 str->di_uid = cpu_to_be32(di->di_uid);
199 str->di_gid = cpu_to_be32(di->di_gid);
200 str->di_nlink = cpu_to_be32(di->di_nlink);
201 str->di_size = cpu_to_be64(di->di_size);
202 str->di_blocks = cpu_to_be64(di->di_blocks);
203 str->di_atime = cpu_to_be64(di->di_atime);
204 str->di_mtime = cpu_to_be64(di->di_mtime);
205 str->di_ctime = cpu_to_be64(di->di_ctime);
206 str->di_major = cpu_to_be32(di->di_major);
207 str->di_minor = cpu_to_be32(di->di_minor);
208
209 str->di_goal_meta = cpu_to_be64(di->di_goal_meta);
210 str->di_goal_data = cpu_to_be64(di->di_goal_data);
211 str->di_generation = cpu_to_be64(di->di_generation);
212
213 str->di_flags = cpu_to_be32(di->di_flags);
214 str->di_payload_format = cpu_to_be32(di->di_payload_format);
215 str->di_height = cpu_to_be16(di->di_height);
216
217 str->di_depth = cpu_to_be16(di->di_depth);
218 str->di_entries = cpu_to_be32(di->di_entries);
219
220 str->di_eattr = cpu_to_be64(di->di_eattr);
221
222}
223
224void gfs2_dinode_print(struct gfs2_dinode *di)
225{
226 gfs2_meta_header_print(&di->di_header);
227 gfs2_inum_print(&di->di_num);
228
229 pv(di, di_mode, "0%o");
230 pv(di, di_uid, "%u");
231 pv(di, di_gid, "%u");
232 pv(di, di_nlink, "%u");
233 printk(KERN_INFO " di_size = %llu\n", (unsigned long long)di->di_size);
234 printk(KERN_INFO " di_blocks = %llu\n", (unsigned long long)di->di_blocks);
235 printk(KERN_INFO " di_atime = %lld\n", (long long)di->di_atime);
236 printk(KERN_INFO " di_mtime = %lld\n", (long long)di->di_mtime);
237 printk(KERN_INFO " di_ctime = %lld\n", (long long)di->di_ctime);
238 pv(di, di_major, "%u");
239 pv(di, di_minor, "%u");
240
241 printk(KERN_INFO " di_goal_meta = %llu\n", (unsigned long long)di->di_goal_meta);
242 printk(KERN_INFO " di_goal_data = %llu\n", (unsigned long long)di->di_goal_data);
243
244 pv(di, di_flags, "0x%.8X");
245 pv(di, di_payload_format, "%u");
246 pv(di, di_height, "%u");
247
248 pv(di, di_depth, "%u");
249 pv(di, di_entries, "%u");
250
251 printk(KERN_INFO " di_eattr = %llu\n", (unsigned long long)di->di_eattr);
252}
253
254void gfs2_log_header_in(struct gfs2_log_header *lh, char *buf)
255{
256 struct gfs2_log_header *str = (struct gfs2_log_header *)buf;
257
258 gfs2_meta_header_in(&lh->lh_header, buf);
259 lh->lh_sequence = be64_to_cpu(str->lh_sequence);
260 lh->lh_flags = be32_to_cpu(str->lh_flags);
261 lh->lh_tail = be32_to_cpu(str->lh_tail);
262 lh->lh_blkno = be32_to_cpu(str->lh_blkno);
263 lh->lh_hash = be32_to_cpu(str->lh_hash);
264}
265
266void gfs2_inum_range_in(struct gfs2_inum_range *ir, char *buf)
267{
268 struct gfs2_inum_range *str = (struct gfs2_inum_range *)buf;
269
270 ir->ir_start = be64_to_cpu(str->ir_start);
271 ir->ir_length = be64_to_cpu(str->ir_length);
272}
273
274void gfs2_inum_range_out(struct gfs2_inum_range *ir, char *buf)
275{
276 struct gfs2_inum_range *str = (struct gfs2_inum_range *)buf;
277
278 str->ir_start = cpu_to_be64(ir->ir_start);
279 str->ir_length = cpu_to_be64(ir->ir_length);
280}
281
282void gfs2_statfs_change_in(struct gfs2_statfs_change *sc, char *buf)
283{
284 struct gfs2_statfs_change *str = (struct gfs2_statfs_change *)buf;
285
286 sc->sc_total = be64_to_cpu(str->sc_total);
287 sc->sc_free = be64_to_cpu(str->sc_free);
288 sc->sc_dinodes = be64_to_cpu(str->sc_dinodes);
289}
290
291void gfs2_statfs_change_out(struct gfs2_statfs_change *sc, char *buf)
292{
293 struct gfs2_statfs_change *str = (struct gfs2_statfs_change *)buf;
294
295 str->sc_total = cpu_to_be64(sc->sc_total);
296 str->sc_free = cpu_to_be64(sc->sc_free);
297 str->sc_dinodes = cpu_to_be64(sc->sc_dinodes);
298}
299
300void gfs2_quota_change_in(struct gfs2_quota_change *qc, char *buf)
301{
302 struct gfs2_quota_change *str = (struct gfs2_quota_change *)buf;
303
304 qc->qc_change = be64_to_cpu(str->qc_change);
305 qc->qc_flags = be32_to_cpu(str->qc_flags);
306 qc->qc_id = be32_to_cpu(str->qc_id);
307}
308
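Every conversion in this file boils down to be{32,64}_to_cpu() and cpu_to_be{32,64}(). A manual version of the 32-bit case makes explicit the byte reordering those helpers perform (the kernel versions compile to a byte swap or a no-op depending on host endianness); the value used below is GFS2_MAGIC, 0x01161970:

#include <stdint.h>
#include <stdio.h>

/* Interpret 4 bytes stored in big-endian (on-disk) order as a CPU integer. */
static uint32_t be32_to_cpu_manual(const unsigned char *p)
{
	return ((uint32_t)p[0] << 24) | ((uint32_t)p[1] << 16) |
	       ((uint32_t)p[2] << 8)  | (uint32_t)p[3];
}

int main(void)
{
	/* GFS2_MAGIC as it appears on disk, most significant byte first. */
	const unsigned char disk[4] = { 0x01, 0x16, 0x19, 0x70 };

	printf("0x%08x\n", be32_to_cpu_manual(disk));	/* prints 0x01161970 */
	return 0;
}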
diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c
new file mode 100644
index 000000000000..031270ad55e2
--- /dev/null
+++ b/fs/gfs2/ops_address.c
@@ -0,0 +1,784 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/pagemap.h>
16#include <linux/pagevec.h>
17#include <linux/mpage.h>
18#include <linux/fs.h>
19#include <linux/gfs2_ondisk.h>
20
21#include "gfs2.h"
22#include "lm_interface.h"
23#include "incore.h"
24#include "bmap.h"
25#include "glock.h"
26#include "inode.h"
27#include "log.h"
28#include "meta_io.h"
29#include "ops_address.h"
30#include "page.h"
31#include "quota.h"
32#include "trans.h"
33#include "rgrp.h"
34#include "ops_file.h"
35#include "util.h"
36#include "glops.h"
37
38/**
39 * gfs2_get_block - Fills in a buffer head with details about a block
40 * @inode: The inode
41 * @lblock: The block number to look up
42 * @bh_result: The buffer head to return the result in
43 * @create: Non-zero if we may add block to the file
44 *
45 * Returns: errno
46 */
47
48int gfs2_get_block(struct inode *inode, sector_t lblock,
49 struct buffer_head *bh_result, int create)
50{
51 int new = create;
52 uint64_t dblock;
53 int error;
54 int boundary;
55
56 error = gfs2_block_map(inode, lblock, &new, &dblock, &boundary);
57 if (error)
58 return error;
59
60 if (!dblock)
61 return 0;
62
63 map_bh(bh_result, inode->i_sb, dblock);
64 if (new)
65 set_buffer_new(bh_result);
66 if (boundary)
67 set_buffer_boundary(bh_result);
68
69 return 0;
70}
71
72/**
73 * get_block_noalloc - Fills in a buffer head with details about a block
74 * @inode: The inode
75 * @lblock: The block number to look up
76 * @bh_result: The buffer head to return the result in
77 * @create: Non-zero if we may add block to the file
78 *
79 * Returns: errno
80 */
81
82static int get_block_noalloc(struct inode *inode, sector_t lblock,
83 struct buffer_head *bh_result, int create)
84{
85 int new = 0;
86 uint64_t dblock;
87 int error;
88 int boundary;
89
90 error = gfs2_block_map(inode, lblock, &new, &dblock, &boundary);
91 if (error)
92 return error;
93
94 if (dblock)
95 map_bh(bh_result, inode->i_sb, dblock);
96 else if (gfs2_assert_withdraw(GFS2_SB(inode), !create))
97 error = -EIO;
98 if (boundary)
99 set_buffer_boundary(bh_result);
100
101 return error;
102}
103
104/**
105 * gfs2_writepage - Write complete page
106 * @page: Page to write
107 *
108 * Returns: errno
109 *
110 * Some of this is copied from block_write_full_page() although we still
111 * call it to do most of the work.
112 */
113
114static int gfs2_writepage(struct page *page, struct writeback_control *wbc)
115{
116 struct inode *inode = page->mapping->host;
117 struct gfs2_inode *ip = GFS2_I(page->mapping->host);
118 struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host);
119 loff_t i_size = i_size_read(inode);
120 pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
121 unsigned offset;
122 int error;
123 int done_trans = 0;
124
125 if (gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(ip->i_gl))) {
126 unlock_page(page);
127 return -EIO;
128 }
129 if (current->journal_info)
130 goto out_ignore;
131
132 /* Is the page fully outside i_size? (truncate in progress) */
133 offset = i_size & (PAGE_CACHE_SIZE-1);
134 if (page->index > end_index || (page->index == end_index && !offset)) {
135 page->mapping->a_ops->invalidatepage(page, 0);
136 unlock_page(page);
137 return 0; /* don't care */
138 }
139
140 if (sdp->sd_args.ar_data == GFS2_DATA_ORDERED || gfs2_is_jdata(ip)) {
141 error = gfs2_trans_begin(sdp, RES_DINODE + 1, 0);
142 if (error)
143 goto out_ignore;
144 gfs2_page_add_databufs(ip, page, 0, sdp->sd_vfs->s_blocksize-1);
145 done_trans = 1;
146 }
147 error = block_write_full_page(page, get_block_noalloc, wbc);
148 if (done_trans)
149 gfs2_trans_end(sdp);
150 gfs2_meta_cache_flush(ip);
151 return error;
152
153out_ignore:
154 redirty_page_for_writepage(wbc, page);
155 unlock_page(page);
156 return 0;
157}
158
159static int zero_readpage(struct page *page)
160{
161 void *kaddr;
162
163 kaddr = kmap_atomic(page, KM_USER0);
164 memset(kaddr, 0, PAGE_CACHE_SIZE);
165 kunmap_atomic(page, KM_USER0);
166
167 SetPageUptodate(page);
168
169 return 0;
170}
171
172/**
173 * stuffed_readpage - Fill in a Linux page with stuffed file data
174 * @ip: the inode
175 * @page: the page
176 *
177 * Returns: errno
178 */
179
180static int stuffed_readpage(struct gfs2_inode *ip, struct page *page)
181{
182 struct buffer_head *dibh;
183 void *kaddr;
184 int error;
185
186 /* Only the first page of a stuffed file might contain data */
187 if (unlikely(page->index))
188 return zero_readpage(page);
189
190 error = gfs2_meta_inode_buffer(ip, &dibh);
191 if (error)
192 return error;
193
194 kaddr = kmap_atomic(page, KM_USER0);
195 memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode),
196 ip->i_di.di_size);
197 memset(kaddr + ip->i_di.di_size, 0, PAGE_CACHE_SIZE - ip->i_di.di_size);
198 kunmap_atomic(page, KM_USER0);
199
200 brelse(dibh);
201
202 SetPageUptodate(page);
203
204 return 0;
205}
206
207
208/**
209 * gfs2_readpage - readpage with locking
210 * @file: The file to read a page for. N.B. This may be NULL if we are
211 * reading an internal file.
212 * @page: The page to read
213 *
214 * Returns: errno
215 */
216
217static int gfs2_readpage(struct file *file, struct page *page)
218{
219 struct gfs2_inode *ip = GFS2_I(page->mapping->host);
220 struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host);
221 struct gfs2_holder gh;
222 int error;
223
224 if (likely(file != &gfs2_internal_file_sentinal)) {
225 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME|GL_AOP, &gh);
226 error = gfs2_glock_nq_m_atime(1, &gh);
227 if (unlikely(error))
228 goto out_unlock;
229 }
230
231 if (gfs2_is_stuffed(ip)) {
232 error = stuffed_readpage(ip, page);
233 unlock_page(page);
234 } else
235 error = mpage_readpage(page, gfs2_get_block);
236
237 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
238 error = -EIO;
239
240 if (file != &gfs2_internal_file_sentinal) {
241 gfs2_glock_dq_m(1, &gh);
242 gfs2_holder_uninit(&gh);
243 }
244out:
245 return error;
246out_unlock:
247 unlock_page(page);
248 if (file != &gfs2_internal_file_sentinal)
249 gfs2_holder_uninit(&gh);
250 goto out;
251}
252
253/**
254 * gfs2_readpages - Read a bunch of pages at once
255 *
256 * Some notes:
257 * 1. This is only for readahead, so we can simply ignore anything
258 * which are slightly inconvenient (such as locking conflicts between
259 * the page lock and the glock) and return having done no I/O. Its
260 * obviously not something we'd want to do on too regular a basis.
261 * Any I/O we ignore at this time will be done via readpage later.
262 * 2. We have to handle stuffed files here too.
263 * 3. mpage_readpages() does most of the heavy lifting in the common case.
264 * 4. gfs2_get_block() is relied upon to set BH_Boundary in the right places.
265 * 5. We use LM_FLAG_TRY_1CB here, effectively we then have lock-ahead as
266 * well as read-ahead.
267 */
268static int gfs2_readpages(struct file *file, struct address_space *mapping,
269 struct list_head *pages, unsigned nr_pages)
270{
271 struct inode *inode = mapping->host;
272 struct gfs2_inode *ip = GFS2_I(inode);
273 struct gfs2_sbd *sdp = GFS2_SB(inode);
274 struct gfs2_holder gh;
275 unsigned page_idx;
276 int ret;
277
278 if (likely(file != &gfs2_internal_file_sentinal)) {
279 gfs2_holder_init(ip->i_gl, LM_ST_SHARED,
280 LM_FLAG_TRY_1CB|GL_ATIME|GL_AOP, &gh);
281 ret = gfs2_glock_nq_m_atime(1, &gh);
282 if (ret == GLR_TRYFAILED)
283 goto out_noerror;
284 if (unlikely(ret))
285 goto out_unlock;
286 }
287
288 if (gfs2_is_stuffed(ip)) {
289 struct pagevec lru_pvec;
290 pagevec_init(&lru_pvec, 0);
291 for (page_idx = 0; page_idx < nr_pages; page_idx++) {
292 struct page *page = list_entry(pages->prev, struct page, lru);
293 prefetchw(&page->flags);
294 list_del(&page->lru);
295 if (!add_to_page_cache(page, mapping,
296 page->index, GFP_KERNEL)) {
297 ret = stuffed_readpage(ip, page);
298 unlock_page(page);
299 if (!pagevec_add(&lru_pvec, page))
300 __pagevec_lru_add(&lru_pvec);
301 } else {
302 page_cache_release(page);
303 }
304 }
305 pagevec_lru_add(&lru_pvec);
306 ret = 0;
307 } else {
308 /* What we really want to do .... */
309 ret = mpage_readpages(mapping, pages, nr_pages, gfs2_get_block);
310 }
311
312 if (likely(file != &gfs2_internal_file_sentinal)) {
313 gfs2_glock_dq_m(1, &gh);
314 gfs2_holder_uninit(&gh);
315 }
316out:
317 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
318 ret = -EIO;
319 return ret;
320out_noerror:
321 ret = 0;
322out_unlock:
323 /* unlock all pages, we can't do any I/O right now */
324 for (page_idx = 0; page_idx < nr_pages; page_idx++) {
325 struct page *page = list_entry(pages->prev, struct page, lru);
326 list_del(&page->lru);
327 unlock_page(page);
328 page_cache_release(page);
329 }
330 if (likely(file != &gfs2_internal_file_sentinal))
331 gfs2_holder_uninit(&gh);
332 goto out;
333}
334
335/**
336 * gfs2_prepare_write - Prepare to write a page to a file
337 * @file: The file to write to
338 * @page: The page which is to be prepared for writing
339 * @from: From (byte range within page)
340 * @to: To (byte range within page)
341 *
342 * Returns: errno
343 */
344
345static int gfs2_prepare_write(struct file *file, struct page *page,
346 unsigned from, unsigned to)
347{
348 struct gfs2_inode *ip = GFS2_I(page->mapping->host);
349 struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host);
350 unsigned int data_blocks, ind_blocks, rblocks;
351 int alloc_required;
352 int error = 0;
353 loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + from;
354 loff_t end = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
355 struct gfs2_alloc *al;
356
357 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_ATIME|GL_AOP, &ip->i_gh);
358 error = gfs2_glock_nq_m_atime(1, &ip->i_gh);
359 if (error)
360 goto out_uninit;
361
362 gfs2_write_calc_reserv(ip, to - from, &data_blocks, &ind_blocks);
363
364 error = gfs2_write_alloc_required(ip, pos, to - from, &alloc_required);
365 if (error)
366 goto out_unlock;
367
368
369 if (alloc_required) {
370 al = gfs2_alloc_get(ip);
371
372 error = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
373 if (error)
374 goto out_alloc_put;
375
376 error = gfs2_quota_check(ip, ip->i_di.di_uid, ip->i_di.di_gid);
377 if (error)
378 goto out_qunlock;
379
380 al->al_requested = data_blocks + ind_blocks;
381 error = gfs2_inplace_reserve(ip);
382 if (error)
383 goto out_qunlock;
384 }
385
386 rblocks = RES_DINODE + ind_blocks;
387 if (gfs2_is_jdata(ip))
388 rblocks += data_blocks ? data_blocks : 1;
389 if (ind_blocks || data_blocks)
390 rblocks += RES_STATFS + RES_QUOTA;
391
392 error = gfs2_trans_begin(sdp, rblocks, 0);
393 if (error)
394 goto out;
395
396 if (gfs2_is_stuffed(ip)) {
397 if (end > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) {
398 error = gfs2_unstuff_dinode(ip, gfs2_unstuffer_page,
399 page);
400 if (error == 0)
401 goto prepare_write;
402 } else if (!PageUptodate(page))
403 error = stuffed_readpage(ip, page);
404 goto out;
405 }
406
407prepare_write:
408 error = block_prepare_write(page, from, to, gfs2_get_block);
409
410out:
411 if (error) {
412 gfs2_trans_end(sdp);
413 if (alloc_required) {
414 gfs2_inplace_release(ip);
415out_qunlock:
416 gfs2_quota_unlock(ip);
417out_alloc_put:
418 gfs2_alloc_put(ip);
419 }
420out_unlock:
421 gfs2_glock_dq_m(1, &ip->i_gh);
422out_uninit:
423 gfs2_holder_uninit(&ip->i_gh);
424 }
425
426 return error;
427}
428
429/**
430 * gfs2_commit_write - Commit write to a file
431 * @file: The file to write to
432 * @page: The page containing the data
433 * @from: From (byte range within page)
434 * @to: To (byte range within page)
435 *
436 * Returns: errno
437 */
438
439static int gfs2_commit_write(struct file *file, struct page *page,
440 unsigned from, unsigned to)
441{
442 struct inode *inode = page->mapping->host;
443 struct gfs2_inode *ip = GFS2_I(inode);
444 struct gfs2_sbd *sdp = GFS2_SB(inode);
445 int error = -EOPNOTSUPP;
446 struct buffer_head *dibh;
447 struct gfs2_alloc *al = &ip->i_alloc;
448
449 if (gfs2_assert_withdraw(sdp, gfs2_glock_is_locked_by_me(ip->i_gl)))
450 goto fail_nounlock;
451
452 error = gfs2_meta_inode_buffer(ip, &dibh);
453 if (error)
454 goto fail_endtrans;
455
456 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
457
458 if (gfs2_is_stuffed(ip)) {
459 uint64_t file_size;
460 void *kaddr;
461
462 file_size = ((uint64_t)page->index << PAGE_CACHE_SHIFT) + to;
463
464 kaddr = kmap_atomic(page, KM_USER0);
465 memcpy(dibh->b_data + sizeof(struct gfs2_dinode) + from,
466 (char *)kaddr + from, to - from);
467 kunmap_atomic(page, KM_USER0);
468
469 SetPageUptodate(page);
470
471 if (inode->i_size < file_size)
472 i_size_write(inode, file_size);
473 } else {
474 if (sdp->sd_args.ar_data == GFS2_DATA_ORDERED ||
475 gfs2_is_jdata(ip))
476 gfs2_page_add_databufs(ip, page, from, to);
477 error = generic_commit_write(file, page, from, to);
478 if (error)
479 goto fail;
480 }
481
482 if (ip->i_di.di_size < inode->i_size)
483 ip->i_di.di_size = inode->i_size;
484
485 gfs2_dinode_out(&ip->i_di, dibh->b_data);
486 brelse(dibh);
487 gfs2_trans_end(sdp);
488 if (al->al_requested) {
489 gfs2_inplace_release(ip);
490 gfs2_quota_unlock(ip);
491 gfs2_alloc_put(ip);
492 }
493 gfs2_glock_dq_m(1, &ip->i_gh);
494 gfs2_holder_uninit(&ip->i_gh);
495 return 0;
496
497fail:
498 brelse(dibh);
499fail_endtrans:
500 gfs2_trans_end(sdp);
501 if (al->al_requested) {
502 gfs2_inplace_release(ip);
503 gfs2_quota_unlock(ip);
504 gfs2_alloc_put(ip);
505 }
506 gfs2_glock_dq_m(1, &ip->i_gh);
507 gfs2_holder_uninit(&ip->i_gh);
508fail_nounlock:
509 ClearPageUptodate(page);
510 return error;
511}
512
513/**
514 * gfs2_bmap - Block map function
515 * @mapping: Address space info
516 * @lblock: The block to map
517 *
518 * Returns: The disk address for the block or 0 on hole or error
519 */
520
521static sector_t gfs2_bmap(struct address_space *mapping, sector_t lblock)
522{
523 struct gfs2_inode *ip = GFS2_I(mapping->host);
524 struct gfs2_holder i_gh;
525 sector_t dblock = 0;
526 int error;
527
528 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
529 if (error)
530 return 0;
531
532 if (!gfs2_is_stuffed(ip))
533 dblock = generic_block_bmap(mapping, lblock, gfs2_get_block);
534
535 gfs2_glock_dq_uninit(&i_gh);
536
537 return dblock;
538}
539
540static void discard_buffer(struct gfs2_sbd *sdp, struct buffer_head *bh)
541{
542 struct gfs2_bufdata *bd;
543
544 gfs2_log_lock(sdp);
545 bd = bh->b_private;
546 if (bd) {
547 bd->bd_bh = NULL;
548 bh->b_private = NULL;
549 gfs2_log_unlock(sdp);
550 brelse(bh);
551 } else
552 gfs2_log_unlock(sdp);
553
554 lock_buffer(bh);
555 clear_buffer_dirty(bh);
556 bh->b_bdev = NULL;
557 clear_buffer_mapped(bh);
558 clear_buffer_req(bh);
559 clear_buffer_new(bh);
560 clear_buffer_delay(bh);
561 unlock_buffer(bh);
562}
563
564static void gfs2_invalidatepage(struct page *page, unsigned long offset)
565{
566 struct gfs2_sbd *sdp = page->mapping->host->i_sb->s_fs_info;
567 struct buffer_head *head, *bh, *next;
568 unsigned int curr_off = 0;
569
570 BUG_ON(!PageLocked(page));
571 if (!page_has_buffers(page))
572 return;
573
574 bh = head = page_buffers(page);
575 do {
576 unsigned int next_off = curr_off + bh->b_size;
577 next = bh->b_this_page;
578
579 if (offset <= curr_off)
580 discard_buffer(sdp, bh);
581
582 curr_off = next_off;
583 bh = next;
584 } while (bh != head);
585
586 if (!offset)
587 try_to_release_page(page, 0);
588
589 return;
590}
591
592static ssize_t gfs2_direct_IO_write(struct kiocb *iocb, const struct iovec *iov,
593 loff_t offset, unsigned long nr_segs)
594{
595 struct file *file = iocb->ki_filp;
596 struct inode *inode = file->f_mapping->host;
597 struct gfs2_inode *ip = GFS2_I(inode);
598 struct gfs2_holder gh;
599 int rv;
600
601 /*
602 * Shared lock, even though it's a write, since we do no allocation
603 * on this path. All we need to change is atime.
604 */
605 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &gh);
606 rv = gfs2_glock_nq_m_atime(1, &gh);
607 if (rv)
608 goto out;
609
610 /*
611 * Should we return an error here? I can't see that O_DIRECT for
612 * a journaled file makes any sense. For now we'll silently fall
613 * back to buffered I/O; likewise for stuffed files, since they are
614 * (a) small and (b) unaligned.
615 */
616 if (gfs2_is_jdata(ip))
617 goto out;
618
619 if (gfs2_is_stuffed(ip))
620 goto out;
621
622 rv = __blockdev_direct_IO(WRITE, iocb, inode, inode->i_sb->s_bdev,
623 iov, offset, nr_segs, gfs2_get_block,
624 NULL, DIO_OWN_LOCKING);
625out:
626 gfs2_glock_dq_m(1, &gh);
627 gfs2_holder_uninit(&gh);
628
629 return rv;
630}
631
632/**
633 * gfs2_direct_IO
634 *
635 * This is called with a shared lock already held for the read path.
636 * Currently, no locks are held when the write path is called.
637 */
638static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb,
639 const struct iovec *iov, loff_t offset,
640 unsigned long nr_segs)
641{
642 struct file *file = iocb->ki_filp;
643 struct inode *inode = file->f_mapping->host;
644 struct gfs2_inode *ip = GFS2_I(inode);
645 struct gfs2_sbd *sdp = GFS2_SB(inode);
646 int ret;
647
648 if (rw == WRITE)
649 return gfs2_direct_IO_write(iocb, iov, offset, nr_segs);
650
651 if (gfs2_assert_warn(sdp, gfs2_glock_is_locked_by_me(ip->i_gl)) ||
652 gfs2_assert_warn(sdp, !gfs2_is_stuffed(ip)))
653 return -EINVAL;
654
655 mutex_lock(&inode->i_mutex);
656 ret = __blockdev_direct_IO(READ, iocb, inode, inode->i_sb->s_bdev, iov,
657 offset, nr_segs, gfs2_get_block, NULL,
658 DIO_OWN_LOCKING);
659 mutex_unlock(&inode->i_mutex);
660 return ret;
661}
662
663/**
664 * stuck_releasepage - We're stuck in gfs2_releasepage(). Print stuff out.
665 * @bh: the buffer we're stuck on
666 *
667 */
668
669static void stuck_releasepage(struct buffer_head *bh)
670{
671 struct inode *inode = bh->b_page->mapping->host;
672 struct gfs2_sbd *sdp = inode->i_sb->s_fs_info;
673 struct gfs2_bufdata *bd = bh->b_private;
674 struct gfs2_glock *gl;
675
676 fs_warn(sdp, "stuck in gfs2_releasepage() %p\n", inode);
677 fs_warn(sdp, "blkno = %llu, bh->b_count = %d\n",
678 (unsigned long long)bh->b_blocknr, atomic_read(&bh->b_count));
679 fs_warn(sdp, "pinned = %u\n", buffer_pinned(bh));
680 fs_warn(sdp, "bh->b_private = %s\n", (bd) ? "!NULL" : "NULL");
681
682 if (!bd)
683 return;
684
685 gl = bd->bd_gl;
686
687 fs_warn(sdp, "gl = (%u, %llu)\n",
688 gl->gl_name.ln_type, (unsigned long long)gl->gl_name.ln_number);
689
690 fs_warn(sdp, "bd_list_tr = %s, bd_le.le_list = %s\n",
691 (list_empty(&bd->bd_list_tr)) ? "no" : "yes",
692 (list_empty(&bd->bd_le.le_list)) ? "no" : "yes");
693
694 if (gl->gl_ops == &gfs2_inode_glops) {
695 struct gfs2_inode *ip = gl->gl_object;
696 unsigned int x;
697
698 if (!ip)
699 return;
700
701 fs_warn(sdp, "ip = %llu %llu\n",
702 (unsigned long long)ip->i_num.no_formal_ino,
703 (unsigned long long)ip->i_num.no_addr);
704
705 for (x = 0; x < GFS2_MAX_META_HEIGHT; x++)
706 fs_warn(sdp, "ip->i_cache[%u] = %s\n",
707 x, (ip->i_cache[x]) ? "!NULL" : "NULL");
708 }
709}
710
711/**
712 * gfs2_aspace_releasepage - free the metadata associated with a page
713 * @page: the page that's being released
714 * @gfp_mask: passed from Linux VFS, ignored by us
715 *
716 * Call try_to_free_buffers() if the buffers in this page can be
717 * released.
718 *
719 * Returns: 0
720 */
721
722int gfs2_releasepage(struct page *page, gfp_t gfp_mask)
723{
724 struct inode *aspace = page->mapping->host;
725 struct gfs2_sbd *sdp = aspace->i_sb->s_fs_info;
726 struct buffer_head *bh, *head;
727 struct gfs2_bufdata *bd;
728 unsigned long t;
729
730 if (!page_has_buffers(page))
731 goto out;
732
733 head = bh = page_buffers(page);
734 do {
735 t = jiffies;
736
737 while (atomic_read(&bh->b_count)) {
738 if (atomic_read(&aspace->i_writecount)) {
739 if (time_after_eq(jiffies, t +
740 gfs2_tune_get(sdp, gt_stall_secs) * HZ)) {
741 stuck_releasepage(bh);
742 t = jiffies;
743 }
744
745 yield();
746 continue;
747 }
748
749 return 0;
750 }
751
752 gfs2_assert_warn(sdp, !buffer_pinned(bh));
753
754 bd = bh->b_private;
755 if (bd) {
756 gfs2_assert_warn(sdp, bd->bd_bh == bh);
757 gfs2_assert_warn(sdp, list_empty(&bd->bd_list_tr));
758 gfs2_assert_warn(sdp, list_empty(&bd->bd_le.le_list));
759 gfs2_assert_warn(sdp, !bd->bd_ail);
760 kmem_cache_free(gfs2_bufdata_cachep, bd);
761 bh->b_private = NULL;
762 }
763
764 bh = bh->b_this_page;
765 }
766 while (bh != head);
767
768 out:
769 return try_to_free_buffers(page);
770}
771
772const struct address_space_operations gfs2_file_aops = {
773 .writepage = gfs2_writepage,
774 .readpage = gfs2_readpage,
775 .readpages = gfs2_readpages,
776 .sync_page = block_sync_page,
777 .prepare_write = gfs2_prepare_write,
778 .commit_write = gfs2_commit_write,
779 .bmap = gfs2_bmap,
780 .invalidatepage = gfs2_invalidatepage,
781 .releasepage = gfs2_releasepage,
782 .direct_IO = gfs2_direct_IO,
783};
784
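The gfs2_file_aops table above only takes effect once an inode's page-cache mapping points at it. A minimal sketch of that wiring, assuming a hypothetical helper on the inode-creation path (the real assignment in GFS2 happens outside this hunk):

/* Hypothetical illustration only: attach the GFS2 address space and file
 * operations to a freshly created regular-file inode. */
static void gfs2_set_ops_example(struct inode *inode)
{
	inode->i_mapping->a_ops = &gfs2_file_aops;	/* page cache entry points */
	inode->i_fop = &gfs2_file_fops;			/* read/write/mmap entry points */
}
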
diff --git a/fs/gfs2/ops_address.h b/fs/gfs2/ops_address.h
new file mode 100644
index 000000000000..dfc3dda6de11
--- /dev/null
+++ b/fs/gfs2/ops_address.h
@@ -0,0 +1,18 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __OPS_ADDRESS_DOT_H__
11#define __OPS_ADDRESS_DOT_H__
12
13extern const struct address_space_operations gfs2_file_aops;
14extern int gfs2_get_block(struct inode *inode, sector_t lblock,
15 struct buffer_head *bh_result, int create);
16extern int gfs2_releasepage(struct page *page, gfp_t gfp_mask);
17
18#endif /* __OPS_ADDRESS_DOT_H__ */
diff --git a/fs/gfs2/ops_dentry.c b/fs/gfs2/ops_dentry.c
new file mode 100644
index 000000000000..fd55979ec428
--- /dev/null
+++ b/fs/gfs2/ops_dentry.c
@@ -0,0 +1,123 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/smp_lock.h>
16#include <linux/gfs2_ondisk.h>
17#include <linux/crc32.h>
18
19#include "gfs2.h"
20#include "lm_interface.h"
21#include "incore.h"
22#include "dir.h"
23#include "glock.h"
24#include "ops_dentry.h"
25#include "util.h"
26
27/**
28 * gfs2_drevalidate - Check directory lookup consistency
29 * @dentry: the dentry to check
30 * @nd:
31 *
32 * Check to make sure the lookup necessary to arrive at this inode from its
33 * parent is still good.
34 *
35 * Returns: 1 if the dentry is ok, 0 if it isn't
36 */
37
38static int gfs2_drevalidate(struct dentry *dentry, struct nameidata *nd)
39{
40 struct dentry *parent = dget_parent(dentry);
41 struct gfs2_sbd *sdp = GFS2_SB(parent->d_inode);
42 struct gfs2_inode *dip = GFS2_I(parent->d_inode);
43 struct inode *inode = dentry->d_inode;
44 struct gfs2_holder d_gh;
45 struct gfs2_inode *ip;
46 struct gfs2_inum inum;
47 unsigned int type;
48 int error;
49
50 if (inode && is_bad_inode(inode))
51 goto invalid;
52
53 if (sdp->sd_args.ar_localcaching)
54 goto valid;
55
56 error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh);
57 if (error)
58 goto fail;
59
60 error = gfs2_dir_search(parent->d_inode, &dentry->d_name, &inum, &type);
61 switch (error) {
62 case 0:
63 if (!inode)
64 goto invalid_gunlock;
65 break;
66 case -ENOENT:
67 if (!inode)
68 goto valid_gunlock;
69 goto invalid_gunlock;
70 default:
71 goto fail_gunlock;
72 }
73
74 ip = GFS2_I(inode);
75
76 if (!gfs2_inum_equal(&ip->i_num, &inum))
77 goto invalid_gunlock;
78
79 if (IF2DT(ip->i_di.di_mode) != type) {
80 gfs2_consist_inode(dip);
81 goto fail_gunlock;
82 }
83
84 valid_gunlock:
85 gfs2_glock_dq_uninit(&d_gh);
86
87 valid:
88 dput(parent);
89 return 1;
90
91 invalid_gunlock:
92 gfs2_glock_dq_uninit(&d_gh);
93
94 invalid:
95 if (inode && S_ISDIR(inode->i_mode)) {
96 if (have_submounts(dentry))
97 goto valid;
98 shrink_dcache_parent(dentry);
99 }
100 d_drop(dentry);
101
102 dput(parent);
103 return 0;
104
105 fail_gunlock:
106 gfs2_glock_dq_uninit(&d_gh);
107
108 fail:
109 dput(parent);
110 return 0;
111}
112
113static int gfs2_dhash(struct dentry *dentry, struct qstr *str)
114{
115 str->hash = gfs2_disk_hash(str->name, str->len);
116 return 0;
117}
118
119struct dentry_operations gfs2_dops = {
120 .d_revalidate = gfs2_drevalidate,
121 .d_hash = gfs2_dhash,
122};
123
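gfs2_dops only applies to dentries that carry it. The root dentry is given these operations in init_sb() further down in this patch; dentries created by lookup would be tagged in the lookup path along roughly these lines (illustrative sketch, the helper name is made up):

/* Illustrative only: a lookup routine would set d_op before instantiating
 * the dentry so later revalidation and hashing go through gfs2_dops. */
static struct dentry *gfs2_lookup_sketch(struct inode *dir, struct dentry *dentry,
					 struct nameidata *nd)
{
	dentry->d_op = &gfs2_dops;
	/* ... perform the on-disk directory search and d_add() the result ... */
	return NULL;
}
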
diff --git a/fs/gfs2/ops_dentry.h b/fs/gfs2/ops_dentry.h
new file mode 100644
index 000000000000..1b6e75c0a4a7
--- /dev/null
+++ b/fs/gfs2/ops_dentry.h
@@ -0,0 +1,15 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __OPS_DENTRY_DOT_H__
11#define __OPS_DENTRY_DOT_H__
12
13extern struct dentry_operations gfs2_dops;
14
15#endif /* __OPS_DENTRY_DOT_H__ */
diff --git a/fs/gfs2/ops_export.c b/fs/gfs2/ops_export.c
new file mode 100644
index 000000000000..6354f4799e68
--- /dev/null
+++ b/fs/gfs2/ops_export.c
@@ -0,0 +1,293 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/gfs2_ondisk.h>
16#include <linux/crc32.h>
17
18#include "gfs2.h"
19#include "lm_interface.h"
20#include "incore.h"
21#include "dir.h"
22#include "glock.h"
23#include "glops.h"
24#include "inode.h"
25#include "ops_export.h"
26#include "rgrp.h"
27#include "util.h"
28
29static struct dentry *gfs2_decode_fh(struct super_block *sb,
30 __u32 *fh,
31 int fh_len,
32 int fh_type,
33 int (*acceptable)(void *context,
34 struct dentry *dentry),
35 void *context)
36{
37 struct gfs2_fh_obj fh_obj;
38 struct gfs2_inum *this, parent;
39
40 if (fh_type != fh_len)
41 return NULL;
42
43 this = &fh_obj.this;
44 fh_obj.imode = DT_UNKNOWN;
45 memset(&parent, 0, sizeof(struct gfs2_inum));
46
47 switch (fh_type) {
48 case 10:
49 parent.no_formal_ino = ((uint64_t)be32_to_cpu(fh[4])) << 32;
50 parent.no_formal_ino |= be32_to_cpu(fh[5]);
51 parent.no_addr = ((uint64_t)be32_to_cpu(fh[6])) << 32;
52 parent.no_addr |= be32_to_cpu(fh[7]);
53 fh_obj.imode = be32_to_cpu(fh[8]);
54 case 4:
55 this->no_formal_ino = ((uint64_t)be32_to_cpu(fh[0])) << 32;
56 this->no_formal_ino |= be32_to_cpu(fh[1]);
57 this->no_addr = ((uint64_t)be32_to_cpu(fh[2])) << 32;
58 this->no_addr |= be32_to_cpu(fh[3]);
59 break;
60 default:
61 return NULL;
62 }
63
64 return gfs2_export_ops.find_exported_dentry(sb, &fh_obj, &parent,
65 acceptable, context);
66}
67
68static int gfs2_encode_fh(struct dentry *dentry, __u32 *fh, int *len,
69 int connectable)
70{
71 struct inode *inode = dentry->d_inode;
72 struct super_block *sb = inode->i_sb;
73 struct gfs2_inode *ip = GFS2_I(inode);
74
75 if (*len < 4 || (connectable && *len < 10))
76 return 255;
77
78 fh[0] = ip->i_num.no_formal_ino >> 32;
79 fh[0] = cpu_to_be32(fh[0]);
80 fh[1] = ip->i_num.no_formal_ino & 0xFFFFFFFF;
81 fh[1] = cpu_to_be32(fh[1]);
82 fh[2] = ip->i_num.no_addr >> 32;
83 fh[2] = cpu_to_be32(fh[2]);
84 fh[3] = ip->i_num.no_addr & 0xFFFFFFFF;
85 fh[3] = cpu_to_be32(fh[3]);
86 *len = 4;
87
88 if (!connectable || inode == sb->s_root->d_inode)
89 return *len;
90
91 spin_lock(&dentry->d_lock);
92 inode = dentry->d_parent->d_inode;
93 ip = GFS2_I(inode);
94 igrab(inode);
95 spin_unlock(&dentry->d_lock);
96
97 fh[4] = ip->i_num.no_formal_ino >> 32;
98 fh[4] = cpu_to_be32(fh[4]);
99 fh[5] = ip->i_num.no_formal_ino & 0xFFFFFFFF;
100 fh[5] = cpu_to_be32(fh[5]);
101 fh[6] = ip->i_num.no_addr >> 32;
102 fh[6] = cpu_to_be32(fh[6]);
103 fh[7] = ip->i_num.no_addr & 0xFFFFFFFF;
104 fh[7] = cpu_to_be32(fh[7]);
105
106 fh[8] = cpu_to_be32(inode->i_mode);
107 fh[9] = 0; /* pad to double word */
108 *len = 10;
109
110 iput(inode);
111
112 return *len;
113}
114
115struct get_name_filldir {
116 struct gfs2_inum inum;
117 char *name;
118};
119
120static int get_name_filldir(void *opaque, const char *name, unsigned int length,
121 uint64_t offset, struct gfs2_inum *inum,
122 unsigned int type)
123{
124 struct get_name_filldir *gnfd = (struct get_name_filldir *)opaque;
125
126 if (!gfs2_inum_equal(inum, &gnfd->inum))
127 return 0;
128
129 memcpy(gnfd->name, name, length);
130 gnfd->name[length] = 0;
131
132 return 1;
133}
134
135static int gfs2_get_name(struct dentry *parent, char *name,
136 struct dentry *child)
137{
138 struct inode *dir = parent->d_inode;
139 struct inode *inode = child->d_inode;
140 struct gfs2_inode *dip, *ip;
141 struct get_name_filldir gnfd;
142 struct gfs2_holder gh;
143 uint64_t offset = 0;
144 int error;
145
146 if (!dir)
147 return -EINVAL;
148
149 if (!S_ISDIR(dir->i_mode) || !inode)
150 return -EINVAL;
151
152 dip = GFS2_I(dir);
153 ip = GFS2_I(inode);
154
155 *name = 0;
156 gnfd.inum = ip->i_num;
157 gnfd.name = name;
158
159 error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &gh);
160 if (error)
161 return error;
162
163 error = gfs2_dir_read(dir, &offset, &gnfd, get_name_filldir);
164
165 gfs2_glock_dq_uninit(&gh);
166
167 if (!error && !*name)
168 error = -ENOENT;
169
170 return error;
171}
172
173static struct dentry *gfs2_get_parent(struct dentry *child)
174{
175 struct qstr dotdot;
176 struct inode *inode;
177 struct dentry *dentry;
178
179 gfs2_str2qstr(&dotdot, "..");
180 inode = gfs2_lookupi(child->d_inode, &dotdot, 1, NULL);
181
182 if (!inode)
183 return ERR_PTR(-ENOENT);
184 if (IS_ERR(inode))
185 return ERR_PTR(PTR_ERR(inode));
186
187 dentry = d_alloc_anon(inode);
188 if (!dentry) {
189 iput(inode);
190 return ERR_PTR(-ENOMEM);
191 }
192
193 return dentry;
194}
195
196static struct dentry *gfs2_get_dentry(struct super_block *sb, void *inum_obj)
197{
198 struct gfs2_sbd *sdp = sb->s_fs_info;
199 struct gfs2_fh_obj *fh_obj = (struct gfs2_fh_obj *)inum_obj;
200 struct gfs2_inum *inum = &fh_obj->this;
201 struct gfs2_holder i_gh, ri_gh, rgd_gh;
202 struct gfs2_rgrpd *rgd;
203 struct inode *inode;
204 struct dentry *dentry;
205 int error;
206
207 /* System files? */
208
209 inode = gfs2_ilookup(sb, inum);
210 if (inode) {
211 if (GFS2_I(inode)->i_num.no_formal_ino != inum->no_formal_ino) {
212 iput(inode);
213 return ERR_PTR(-ESTALE);
214 }
215 goto out_inode;
216 }
217
218 error = gfs2_glock_nq_num(sdp, inum->no_addr, &gfs2_inode_glops,
219 LM_ST_SHARED, LM_FLAG_ANY | GL_LOCAL_EXCL,
220 &i_gh);
221 if (error)
222 return ERR_PTR(error);
223
224 error = gfs2_rindex_hold(sdp, &ri_gh);
225 if (error)
226 goto fail;
227
228 error = -EINVAL;
229 rgd = gfs2_blk2rgrpd(sdp, inum->no_addr);
230 if (!rgd)
231 goto fail_rindex;
232
233 error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_SHARED, 0, &rgd_gh);
234 if (error)
235 goto fail_rindex;
236
237 error = -ESTALE;
238 if (gfs2_get_block_type(rgd, inum->no_addr) != GFS2_BLKST_DINODE)
239 goto fail_rgd;
240
241 gfs2_glock_dq_uninit(&rgd_gh);
242 gfs2_glock_dq_uninit(&ri_gh);
243
244 inode = gfs2_inode_lookup(sb, inum, fh_obj->imode);
245 if (!inode)
246 goto fail;
247 if (IS_ERR(inode)) {
248 error = PTR_ERR(inode);
249 goto fail;
250 }
251
252 error = gfs2_inode_refresh(GFS2_I(inode));
253 if (error) {
254 iput(inode);
255 goto fail;
256 }
257
258 error = -EIO;
259 if (GFS2_I(inode)->i_di.di_flags & GFS2_DIF_SYSTEM) {
260 iput(inode);
261 goto fail;
262 }
263
264 gfs2_glock_dq_uninit(&i_gh);
265
266out_inode:
267 dentry = d_alloc_anon(inode);
268 if (!dentry) {
269 iput(inode);
270 return ERR_PTR(-ENOMEM);
271 }
272
273 return dentry;
274
275fail_rgd:
276 gfs2_glock_dq_uninit(&rgd_gh);
277
278fail_rindex:
279 gfs2_glock_dq_uninit(&ri_gh);
280
281fail:
282 gfs2_glock_dq_uninit(&i_gh);
283 return ERR_PTR(error);
284}
285
286struct export_operations gfs2_export_ops = {
287 .decode_fh = gfs2_decode_fh,
288 .encode_fh = gfs2_encode_fh,
289 .get_name = gfs2_get_name,
290 .get_parent = gfs2_get_parent,
291 .get_dentry = gfs2_get_dentry,
292};
293
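For reference, gfs2_encode_fh()/gfs2_decode_fh() above use a four-word handle in the plain case and a ten-word handle in the connectable case, which is why decode insists that fh_type equal fh_len. An illustrative view of the ten-word layout (the code packs the words by hand rather than using a struct like this):

/* Illustration only, not a struct used by the code above. */
struct gfs2_fh_view {
	__be32 this_formal_ino_hi;	/* word 0 */
	__be32 this_formal_ino_lo;	/* word 1 */
	__be32 this_addr_hi;		/* word 2 */
	__be32 this_addr_lo;		/* word 3 */
	__be32 parent_formal_ino_hi;	/* word 4 */
	__be32 parent_formal_ino_lo;	/* word 5 */
	__be32 parent_addr_hi;		/* word 6 */
	__be32 parent_addr_lo;		/* word 7 */
	__be32 mode;			/* word 8 */
	__be32 pad;			/* word 9, pads to a double word */
};
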
diff --git a/fs/gfs2/ops_export.h b/fs/gfs2/ops_export.h
new file mode 100644
index 000000000000..09fc077657d1
--- /dev/null
+++ b/fs/gfs2/ops_export.h
@@ -0,0 +1,19 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __OPS_EXPORT_DOT_H__
11#define __OPS_EXPORT_DOT_H__
12
13extern struct export_operations gfs2_export_ops;
14struct gfs2_fh_obj {
15 struct gfs2_inum this;
16 __u32 imode;
17};
18
19#endif /* __OPS_EXPORT_DOT_H__ */
diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c
new file mode 100644
index 000000000000..26f1d3249b0f
--- /dev/null
+++ b/fs/gfs2/ops_file.c
@@ -0,0 +1,982 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/pagemap.h>
16#include <linux/uio.h>
17#include <linux/blkdev.h>
18#include <linux/mm.h>
19#include <linux/smp_lock.h>
20#include <linux/fs.h>
21#include <linux/gfs2_ondisk.h>
22#include <linux/ext2_fs.h>
23#include <linux/crc32.h>
24#include <linux/iflags.h>
25#include <asm/uaccess.h>
26
27#include "gfs2.h"
28#include "lm_interface.h"
29#include "incore.h"
30#include "bmap.h"
31#include "dir.h"
32#include "glock.h"
33#include "glops.h"
34#include "inode.h"
35#include "lm.h"
36#include "log.h"
37#include "meta_io.h"
38#include "ops_file.h"
39#include "ops_vm.h"
40#include "quota.h"
41#include "rgrp.h"
42#include "trans.h"
43#include "util.h"
44#include "eaops.h"
45
46/* "bad" is for NFS support */
47struct filldir_bad_entry {
48 char *fbe_name;
49 unsigned int fbe_length;
50 uint64_t fbe_offset;
51 struct gfs2_inum fbe_inum;
52 unsigned int fbe_type;
53};
54
55struct filldir_bad {
56 struct gfs2_sbd *fdb_sbd;
57
58 struct filldir_bad_entry *fdb_entry;
59 unsigned int fdb_entry_num;
60 unsigned int fdb_entry_off;
61
62 char *fdb_name;
63 unsigned int fdb_name_size;
64 unsigned int fdb_name_off;
65};
66
67/* For regular, non-NFS */
68struct filldir_reg {
69 struct gfs2_sbd *fdr_sbd;
70 int fdr_prefetch;
71
72 filldir_t fdr_filldir;
73 void *fdr_opaque;
74};
75
76/*
77 * Most fields left uninitialised to catch anybody who tries to
78 * use them. f_flags set to prevent file_accessed() from touching
79 * any other part of this. Its use is purely as a flag so that we
80 * know (in readpage()) whether or not to do locking.
81 */
82struct file gfs2_internal_file_sentinal = {
83 .f_flags = O_NOATIME|O_RDONLY,
84};
85
86static int gfs2_read_actor(read_descriptor_t *desc, struct page *page,
87 unsigned long offset, unsigned long size)
88{
89 char *kaddr;
90 unsigned long count = desc->count;
91
92 if (size > count)
93 size = count;
94
95 kaddr = kmap(page);
96 memcpy(desc->arg.buf, kaddr + offset, size);
97 kunmap(page);
98
99 desc->count = count - size;
100 desc->written += size;
101 desc->arg.buf += size;
102 return size;
103}
104
105int gfs2_internal_read(struct gfs2_inode *ip, struct file_ra_state *ra_state,
106 char *buf, loff_t *pos, unsigned size)
107{
108 struct inode *inode = &ip->i_inode;
109 read_descriptor_t desc;
110 desc.written = 0;
111 desc.arg.buf = buf;
112 desc.count = size;
113 desc.error = 0;
114 do_generic_mapping_read(inode->i_mapping, ra_state,
115 &gfs2_internal_file_sentinal, pos, &desc,
116 gfs2_read_actor);
117 return desc.written ? desc.written : desc.error;
118}
119
120/**
121 * gfs2_llseek - seek to a location in a file
122 * @file: the file
123 * @offset: the offset
124 * @origin: Where to seek from (SEEK_SET, SEEK_CUR, or SEEK_END)
125 *
126 * SEEK_END requires the glock for the file because it references the
127 * file's size.
128 *
129 * Returns: The new offset, or errno
130 */
131
132static loff_t gfs2_llseek(struct file *file, loff_t offset, int origin)
133{
134 struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
135 struct gfs2_holder i_gh;
136 loff_t error;
137
138 if (origin == 2) {
139 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY,
140 &i_gh);
141 if (!error) {
142 error = remote_llseek(file, offset, origin);
143 gfs2_glock_dq_uninit(&i_gh);
144 }
145 } else
146 error = remote_llseek(file, offset, origin);
147
148 return error;
149}
150
151
152static ssize_t gfs2_direct_IO_read(struct kiocb *iocb, const struct iovec *iov,
153 loff_t offset, unsigned long nr_segs)
154{
155 struct file *file = iocb->ki_filp;
156 struct address_space *mapping = file->f_mapping;
157 ssize_t retval;
158
159 retval = filemap_write_and_wait(mapping);
160 if (retval == 0) {
161 retval = mapping->a_ops->direct_IO(READ, iocb, iov, offset,
162 nr_segs);
163 }
164 return retval;
165}
166
167/**
168 * __gfs2_file_aio_read - The main GFS2 read function
169 *
170 * N.B. This is almost, but not quite, the same as __generic_file_aio_read(),
171 * the important subtle difference being that inode->i_size isn't valid
172 * unless we are holding a lock, and we do this _only_ on the O_DIRECT
173 * path since otherwise locking is done entirely at the page cache
174 * layer.
175 */
176static ssize_t __gfs2_file_aio_read(struct kiocb *iocb,
177 const struct iovec *iov,
178 unsigned long nr_segs, loff_t *ppos)
179{
180 struct file *filp = iocb->ki_filp;
181 struct gfs2_inode *ip = GFS2_I(filp->f_mapping->host);
182 struct gfs2_holder gh;
183 ssize_t retval;
184 unsigned long seg;
185 size_t count;
186
187 count = 0;
188 for (seg = 0; seg < nr_segs; seg++) {
189 const struct iovec *iv = &iov[seg];
190
191 /*
192 * If any segment has a negative length, or the cumulative
193 * length ever wraps negative then return -EINVAL.
194 */
195 count += iv->iov_len;
196 if (unlikely((ssize_t)(count|iv->iov_len) < 0))
197 return -EINVAL;
198 if (access_ok(VERIFY_WRITE, iv->iov_base, iv->iov_len))
199 continue;
200 if (seg == 0)
201 return -EFAULT;
202 nr_segs = seg;
203 count -= iv->iov_len; /* This segment is no good */
204 break;
205 }
206
207 /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
208 if (filp->f_flags & O_DIRECT) {
209 loff_t pos = *ppos, size;
210 struct address_space *mapping;
211 struct inode *inode;
212
213 mapping = filp->f_mapping;
214 inode = mapping->host;
215 retval = 0;
216 if (!count)
217 goto out; /* skip atime */
218
219 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &gh);
220 retval = gfs2_glock_nq_m_atime(1, &gh);
221 if (retval)
222 goto out;
223 if (gfs2_is_stuffed(ip)) {
224 gfs2_glock_dq_m(1, &gh);
225 gfs2_holder_uninit(&gh);
226 goto fallback_to_normal;
227 }
228 size = i_size_read(inode);
229 if (pos < size) {
230 retval = gfs2_direct_IO_read(iocb, iov, pos, nr_segs);
231 if (retval > 0 && !is_sync_kiocb(iocb))
232 retval = -EIOCBQUEUED;
233 if (retval > 0)
234 *ppos = pos + retval;
235 }
236 file_accessed(filp);
237 gfs2_glock_dq_m(1, &gh);
238 gfs2_holder_uninit(&gh);
239 goto out;
240 }
241
242fallback_to_normal:
243 retval = 0;
244 if (count) {
245 for (seg = 0; seg < nr_segs; seg++) {
246 read_descriptor_t desc;
247
248 desc.written = 0;
249 desc.arg.buf = iov[seg].iov_base;
250 desc.count = iov[seg].iov_len;
251 if (desc.count == 0)
252 continue;
253 desc.error = 0;
254 do_generic_file_read(filp,ppos,&desc,file_read_actor);
255 retval += desc.written;
256 if (desc.error) {
257 retval = retval ?: desc.error;
258 break;
259 }
260 }
261 }
262out:
263 return retval;
264}
265
266/**
267 * gfs2_read - Read bytes from a file
268 * @file: The file to read from
269 * @buf: The buffer to copy into
270 * @size: The amount of data requested
271 * @offset: The current file offset
272 *
273 * Outputs: Offset - updated according to number of bytes read
274 *
275 * Returns: The number of bytes read, errno on failure
276 */
277
278static ssize_t gfs2_read(struct file *filp, char __user *buf, size_t size,
279 loff_t *offset)
280{
281 struct iovec local_iov = { .iov_base = buf, .iov_len = size };
282 struct kiocb kiocb;
283 ssize_t ret;
284
285 init_sync_kiocb(&kiocb, filp);
286 ret = __gfs2_file_aio_read(&kiocb, &local_iov, 1, offset);
287 if (-EIOCBQUEUED == ret)
288 ret = wait_on_sync_kiocb(&kiocb);
289 return ret;
290}
291
292static ssize_t gfs2_file_readv(struct file *filp, const struct iovec *iov,
293 unsigned long nr_segs, loff_t *ppos)
294{
295 struct kiocb kiocb;
296 ssize_t ret;
297
298 init_sync_kiocb(&kiocb, filp);
299 ret = __gfs2_file_aio_read(&kiocb, iov, nr_segs, ppos);
300 if (-EIOCBQUEUED == ret)
301 ret = wait_on_sync_kiocb(&kiocb);
302 return ret;
303}
304
305static ssize_t gfs2_file_aio_read(struct kiocb *iocb, char __user *buf,
306 size_t count, loff_t pos)
307{
308 struct iovec local_iov = { .iov_base = buf, .iov_len = count };
309
310 BUG_ON(iocb->ki_pos != pos);
311 return __gfs2_file_aio_read(iocb, &local_iov, 1, &iocb->ki_pos);
312}
313
314
315/**
316 * filldir_reg_func - Report a directory entry to the caller of gfs2_dir_read()
317 * @opaque: opaque data used by the function
318 * @name: the name of the directory entry
319 * @length: the length of the name
320 * @offset: the entry's offset in the directory
321 * @inum: the inode number the entry points to
322 * @type: the type of inode the entry points to
323 *
324 * Returns: 0 on success, 1 if buffer full
325 */
326
327static int filldir_reg_func(void *opaque, const char *name, unsigned int length,
328 uint64_t offset, struct gfs2_inum *inum,
329 unsigned int type)
330{
331 struct filldir_reg *fdr = (struct filldir_reg *)opaque;
332 struct gfs2_sbd *sdp = fdr->fdr_sbd;
333 int error;
334
335 error = fdr->fdr_filldir(fdr->fdr_opaque, name, length, offset,
336 inum->no_addr, type);
337 if (error)
338 return 1;
339
340 if (fdr->fdr_prefetch && !(length == 1 && *name == '.')) {
341 gfs2_glock_prefetch_num(sdp,
342 inum->no_addr, &gfs2_inode_glops,
343 LM_ST_SHARED, LM_FLAG_TRY | LM_FLAG_ANY);
344 gfs2_glock_prefetch_num(sdp,
345 inum->no_addr, &gfs2_iopen_glops,
346 LM_ST_SHARED, LM_FLAG_TRY);
347 }
348
349 return 0;
350}
351
352/**
353 * readdir_reg - Read directory entries from a directory
354 * @file: The directory to read from
355 * @dirent: Buffer for dirents
356 * @filldir: Function used to do the copying
357 *
358 * Returns: errno
359 */
360
361static int readdir_reg(struct file *file, void *dirent, filldir_t filldir)
362{
363 struct inode *dir = file->f_mapping->host;
364 struct gfs2_inode *dip = GFS2_I(dir);
365 struct filldir_reg fdr;
366 struct gfs2_holder d_gh;
367 uint64_t offset = file->f_pos;
368 int error;
369
370 fdr.fdr_sbd = GFS2_SB(dir);
371 fdr.fdr_prefetch = 1;
372 fdr.fdr_filldir = filldir;
373 fdr.fdr_opaque = dirent;
374
375 gfs2_holder_init(dip->i_gl, LM_ST_SHARED, GL_ATIME, &d_gh);
376 error = gfs2_glock_nq_atime(&d_gh);
377 if (error) {
378 gfs2_holder_uninit(&d_gh);
379 return error;
380 }
381
382 error = gfs2_dir_read(dir, &offset, &fdr, filldir_reg_func);
383
384 gfs2_glock_dq_uninit(&d_gh);
385
386 file->f_pos = offset;
387
388 return error;
389}
390
391/**
392 * filldir_bad_func - Report a directory entry to the caller of gfs2_dir_read()
393 * @opaque: opaque data used by the function
394 * @name: the name of the directory entry
395 * @length: the length of the name
396 * @offset: the entry's offset in the directory
397 * @inum: the inode number the entry points to
398 * @type: the type of inode the entry points to
399 *
400 * For supporting NFS.
401 *
402 * Returns: 0 on success, 1 if buffer full
403 */
404
405static int filldir_bad_func(void *opaque, const char *name, unsigned int length,
406 uint64_t offset, struct gfs2_inum *inum,
407 unsigned int type)
408{
409 struct filldir_bad *fdb = (struct filldir_bad *)opaque;
410 struct gfs2_sbd *sdp = fdb->fdb_sbd;
411 struct filldir_bad_entry *fbe;
412
413 if (fdb->fdb_entry_off == fdb->fdb_entry_num ||
414 fdb->fdb_name_off + length > fdb->fdb_name_size)
415 return 1;
416
417 fbe = &fdb->fdb_entry[fdb->fdb_entry_off];
418 fbe->fbe_name = fdb->fdb_name + fdb->fdb_name_off;
419 memcpy(fbe->fbe_name, name, length);
420 fbe->fbe_length = length;
421 fbe->fbe_offset = offset;
422 fbe->fbe_inum = *inum;
423 fbe->fbe_type = type;
424
425 fdb->fdb_entry_off++;
426 fdb->fdb_name_off += length;
427
428 if (!(length == 1 && *name == '.')) {
429 gfs2_glock_prefetch_num(sdp,
430 inum->no_addr, &gfs2_inode_glops,
431 LM_ST_SHARED, LM_FLAG_TRY | LM_FLAG_ANY);
432 gfs2_glock_prefetch_num(sdp,
433 inum->no_addr, &gfs2_iopen_glops,
434 LM_ST_SHARED, LM_FLAG_TRY);
435 }
436
437 return 0;
438}
439
440/**
441 * readdir_bad - Read directory entries from a directory
442 * @file: The directory to read from
443 * @dirent: Buffer for dirents
444 * @filldir: Function used to do the copying
445 *
446 * For supporting NFS.
447 *
448 * Returns: errno
449 */
450
451static int readdir_bad(struct file *file, void *dirent, filldir_t filldir)
452{
453 struct inode *dir = file->f_mapping->host;
454 struct gfs2_inode *dip = GFS2_I(dir);
455 struct gfs2_sbd *sdp = GFS2_SB(dir);
456 struct filldir_reg fdr;
457 unsigned int entries, size;
458 struct filldir_bad *fdb;
459 struct gfs2_holder d_gh;
460 uint64_t offset = file->f_pos;
461 unsigned int x;
462 struct filldir_bad_entry *fbe;
463 int error;
464
465 entries = gfs2_tune_get(sdp, gt_entries_per_readdir);
466 size = sizeof(struct filldir_bad) +
467 entries * (sizeof(struct filldir_bad_entry) + GFS2_FAST_NAME_SIZE);
468
469 fdb = kzalloc(size, GFP_KERNEL);
470 if (!fdb)
471 return -ENOMEM;
472
473 fdb->fdb_sbd = sdp;
474 fdb->fdb_entry = (struct filldir_bad_entry *)(fdb + 1);
475 fdb->fdb_entry_num = entries;
476 fdb->fdb_name = ((char *)fdb) + sizeof(struct filldir_bad) +
477 entries * sizeof(struct filldir_bad_entry);
478 fdb->fdb_name_size = entries * GFS2_FAST_NAME_SIZE;
479
480 gfs2_holder_init(dip->i_gl, LM_ST_SHARED, GL_ATIME, &d_gh);
481 error = gfs2_glock_nq_atime(&d_gh);
482 if (error) {
483 gfs2_holder_uninit(&d_gh);
484 goto out;
485 }
486
487 error = gfs2_dir_read(dir, &offset, fdb, filldir_bad_func);
488
489 gfs2_glock_dq_uninit(&d_gh);
490
491 fdr.fdr_sbd = sdp;
492 fdr.fdr_prefetch = 0;
493 fdr.fdr_filldir = filldir;
494 fdr.fdr_opaque = dirent;
495
496 for (x = 0; x < fdb->fdb_entry_off; x++) {
497 fbe = &fdb->fdb_entry[x];
498
499 error = filldir_reg_func(&fdr,
500 fbe->fbe_name, fbe->fbe_length,
501 fbe->fbe_offset,
502 &fbe->fbe_inum, fbe->fbe_type);
503 if (error) {
504 file->f_pos = fbe->fbe_offset;
505 error = 0;
506 goto out;
507 }
508 }
509
510 file->f_pos = offset;
511
512 out:
513 kfree(fdb);
514
515 return error;
516}
517
518/**
519 * gfs2_readdir - Read directory entries from a directory
520 * @file: The directory to read from
521 * @dirent: Buffer for dirents
522 * @filldir: Function used to do the copying
523 *
524 * Returns: errno
525 */
526
527static int gfs2_readdir(struct file *file, void *dirent, filldir_t filldir)
528{
529 int error;
530
531 if (strcmp(current->comm, "nfsd") != 0)
532 error = readdir_reg(file, dirent, filldir);
533 else
534 error = readdir_bad(file, dirent, filldir);
535
536 return error;
537}
538
539static const u32 iflags_to_gfs2[32] = {
540 [iflag_Sync] = GFS2_DIF_SYNC,
541 [iflag_Immutable] = GFS2_DIF_IMMUTABLE,
542 [iflag_Append] = GFS2_DIF_APPENDONLY,
543 [iflag_NoAtime] = GFS2_DIF_NOATIME,
544 [iflag_Index] = GFS2_DIF_EXHASH,
545 [iflag_JournalData] = GFS2_DIF_JDATA,
546 [iflag_DirectIO] = GFS2_DIF_DIRECTIO,
547};
548
549static const u32 gfs2_to_iflags[32] = {
550 [gfs2fl_Sync] = IFLAG_SYNC,
551 [gfs2fl_Immutable] = IFLAG_IMMUTABLE,
552 [gfs2fl_AppendOnly] = IFLAG_APPEND,
553 [gfs2fl_NoAtime] = IFLAG_NOATIME,
554 [gfs2fl_ExHash] = IFLAG_INDEX,
555 [gfs2fl_Jdata] = IFLAG_JOURNAL_DATA,
556 [gfs2fl_Directio] = IFLAG_DIRECTIO,
557 [gfs2fl_InheritDirectio] = IFLAG_DIRECTIO,
558 [gfs2fl_InheritJdata] = IFLAG_JOURNAL_DATA,
559};
560
561static int gfs2_get_flags(struct file *filp, u32 __user *ptr)
562{
563 struct inode *inode = filp->f_dentry->d_inode;
564 struct gfs2_inode *ip = GFS2_I(inode);
565 struct gfs2_holder gh;
566 int error;
567 u32 iflags;
568
569 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &gh);
570 error = gfs2_glock_nq_m_atime(1, &gh);
571 if (error)
572 return error;
573
574 iflags = iflags_cvt(gfs2_to_iflags, ip->i_di.di_flags);
575 if (put_user(iflags, ptr))
576 error = -EFAULT;
577
578 gfs2_glock_dq_m(1, &gh);
579 gfs2_holder_uninit(&gh);
580 return error;
581}
582
583/* Flags that can be set by user space */
584#define GFS2_FLAGS_USER_SET (GFS2_DIF_JDATA| \
585 GFS2_DIF_DIRECTIO| \
586 GFS2_DIF_IMMUTABLE| \
587 GFS2_DIF_APPENDONLY| \
588 GFS2_DIF_NOATIME| \
589 GFS2_DIF_SYNC| \
590 GFS2_DIF_SYSTEM| \
591 GFS2_DIF_INHERIT_DIRECTIO| \
592 GFS2_DIF_INHERIT_JDATA)
593
594/**
595 * do_gfs2_set_flags - set flags on an inode
596 * @filp: file pointer
597 * @reqflags: The flags to set
598 * @mask: Indicates which flags are valid
599 *
600 */
601static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask)
602{
603 struct inode *inode = filp->f_dentry->d_inode;
604 struct gfs2_inode *ip = GFS2_I(inode);
605 struct gfs2_sbd *sdp = GFS2_SB(inode);
606 struct buffer_head *bh;
607 struct gfs2_holder gh;
608 int error;
609 u32 new_flags, flags;
610
611 /* gfs2_glock_nq_init() initialises the holder itself */
612 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
613 if (error) {
614 gfs2_holder_uninit(&gh);
615 return error;
616 }
617
618 flags = ip->i_di.di_flags;
619 new_flags = (flags & ~mask) | (reqflags & mask);
620 if ((new_flags ^ flags) == 0)
621 goto out;
622
623 if (S_ISDIR(inode->i_mode)) {
624 if ((new_flags ^ flags) & GFS2_DIF_JDATA)
625 new_flags ^= (GFS2_DIF_JDATA|GFS2_DIF_INHERIT_JDATA);
626 if ((new_flags ^ flags) & GFS2_DIF_DIRECTIO)
627 new_flags ^= (GFS2_DIF_DIRECTIO|GFS2_DIF_INHERIT_DIRECTIO);
628 }
629
630 error = -EINVAL;
631 if ((new_flags ^ flags) & ~GFS2_FLAGS_USER_SET)
632 goto out;
633
634 error = -EPERM;
635 if (IS_IMMUTABLE(inode) && (new_flags & GFS2_DIF_IMMUTABLE))
636 goto out;
637 if (IS_APPEND(inode) && (new_flags & GFS2_DIF_APPENDONLY))
638 goto out;
639 if (((new_flags ^ flags) & GFS2_DIF_IMMUTABLE) &&
640 !capable(CAP_LINUX_IMMUTABLE))
641 goto out;
642 if (!IS_IMMUTABLE(inode)) {
643 error = permission(inode, MAY_WRITE, NULL);
644 if (error)
645 goto out;
646 }
647
648 error = gfs2_trans_begin(sdp, RES_DINODE, 0);
649 if (error)
650 goto out;
651 error = gfs2_meta_inode_buffer(ip, &bh);
652 if (error)
653 goto out_trans_end;
654 gfs2_trans_add_bh(ip->i_gl, bh, 1);
655 ip->i_di.di_flags = new_flags;
656 gfs2_dinode_out(&ip->i_di, bh->b_data);
657 brelse(bh);
658out_trans_end:
659 gfs2_trans_end(sdp);
660out:
661 gfs2_glock_dq_uninit(&gh);
662 return error;
663}
664
665static int gfs2_set_flags(struct file *filp, u32 __user *ptr)
666{
667 u32 iflags, gfsflags;
668 if (get_user(iflags, ptr))
669 return -EFAULT;
670 gfsflags = iflags_cvt(iflags_to_gfs2, iflags);
671 return do_gfs2_set_flags(filp, gfsflags, ~0);
672}
673
674static long gfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
675{
676 switch(cmd) {
677 case IFLAGS_GET_IOC:
678 return gfs2_get_flags(filp, (u32 __user *)arg);
679 case IFLAGS_SET_IOC:
680 return gfs2_set_flags(filp, (u32 __user *)arg);
681 }
682 return -ENOTTY;
683}
684
685
686/**
687 * gfs2_mmap - set up a memory mapping of a file
688 * @file: The file to map
689 * @vma: The VMA which describes the mapping
690 *
691 * Returns: 0 or error code
692 */
693
694static int gfs2_mmap(struct file *file, struct vm_area_struct *vma)
695{
696 struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
697 struct gfs2_holder i_gh;
698 int error;
699
700 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &i_gh);
701 error = gfs2_glock_nq_atime(&i_gh);
702 if (error) {
703 gfs2_holder_uninit(&i_gh);
704 return error;
705 }
706
707 /* This is VM_MAYWRITE instead of VM_WRITE because a call
708 to mprotect() can turn on VM_WRITE later. */
709
710 if ((vma->vm_flags & (VM_MAYSHARE | VM_MAYWRITE)) ==
711 (VM_MAYSHARE | VM_MAYWRITE))
712 vma->vm_ops = &gfs2_vm_ops_sharewrite;
713 else
714 vma->vm_ops = &gfs2_vm_ops_private;
715
716 gfs2_glock_dq_uninit(&i_gh);
717
718 return error;
719}
720
721/**
722 * gfs2_open - open a file
723 * @inode: the inode to open
724 * @file: the struct file for this opening
725 *
726 * Returns: errno
727 */
728
729static int gfs2_open(struct inode *inode, struct file *file)
730{
731 struct gfs2_inode *ip = GFS2_I(inode);
732 struct gfs2_holder i_gh;
733 struct gfs2_file *fp;
734 int error;
735
736 fp = kzalloc(sizeof(struct gfs2_file), GFP_KERNEL);
737 if (!fp)
738 return -ENOMEM;
739
740 mutex_init(&fp->f_fl_mutex);
741
742 gfs2_assert_warn(GFS2_SB(inode), !file->private_data);
743 file->private_data = fp;
744
745 if (S_ISREG(ip->i_di.di_mode)) {
746 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY,
747 &i_gh);
748 if (error)
749 goto fail;
750
751 if (!(file->f_flags & O_LARGEFILE) &&
752 ip->i_di.di_size > MAX_NON_LFS) {
753 error = -EFBIG;
754 goto fail_gunlock;
755 }
756
757 /* Listen to the Direct I/O flag */
758
759 if (ip->i_di.di_flags & GFS2_DIF_DIRECTIO)
760 file->f_flags |= O_DIRECT;
761
762 gfs2_glock_dq_uninit(&i_gh);
763 }
764
765 return 0;
766
767 fail_gunlock:
768 gfs2_glock_dq_uninit(&i_gh);
769
770 fail:
771 file->private_data = NULL;
772 kfree(fp);
773
774 return error;
775}
776
777/**
778 * gfs2_close - called to close a struct file
779 * @inode: the inode the struct file belongs to
780 * @file: the struct file being closed
781 *
782 * Returns: errno
783 */
784
785static int gfs2_close(struct inode *inode, struct file *file)
786{
787 struct gfs2_sbd *sdp = inode->i_sb->s_fs_info;
788 struct gfs2_file *fp;
789
790 fp = file->private_data;
791 file->private_data = NULL;
792
793 if (gfs2_assert_warn(sdp, fp))
794 return -EIO;
795
796 kfree(fp);
797
798 return 0;
799}
800
801/**
802 * gfs2_fsync - sync the dirty data for a file (across the cluster)
803 * @file: the file that points to the dentry (we ignore this)
804 * @dentry: the dentry that points to the inode to sync
805 *
806 * Returns: errno
807 */
808
809static int gfs2_fsync(struct file *file, struct dentry *dentry, int datasync)
810{
811 struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
812
813 gfs2_log_flush(ip->i_gl->gl_sbd, ip->i_gl);
814
815 return 0;
816}
817
818/**
819 * gfs2_lock - acquire/release a posix lock on a file
820 * @file: the file pointer
821 * @cmd: either modify or retrieve lock state, possibly wait
822 * @fl: type and range of lock
823 *
824 * Returns: errno
825 */
826
827static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl)
828{
829 struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
830 struct gfs2_sbd *sdp = GFS2_SB(file->f_mapping->host);
831 struct lm_lockname name =
832 { .ln_number = ip->i_num.no_addr,
833 .ln_type = LM_TYPE_PLOCK };
834
835 if (!(fl->fl_flags & FL_POSIX))
836 return -ENOLCK;
837 if ((ip->i_di.di_mode & (S_ISGID | S_IXGRP)) == S_ISGID)
838 return -ENOLCK;
839
840 if (sdp->sd_args.ar_localflocks) {
841 if (IS_GETLK(cmd)) {
842 struct file_lock tmp;
843 int ret;
844 ret = posix_test_lock(file, fl, &tmp);
845 fl->fl_type = F_UNLCK;
846 if (ret)
847 memcpy(fl, &tmp, sizeof(struct file_lock));
848 return 0;
849 } else {
850 return posix_lock_file_wait(file, fl);
851 }
852 }
853
854 if (IS_GETLK(cmd))
855 return gfs2_lm_plock_get(sdp, &name, file, fl);
856 else if (fl->fl_type == F_UNLCK)
857 return gfs2_lm_punlock(sdp, &name, file, fl);
858 else
859 return gfs2_lm_plock(sdp, &name, file, cmd, fl);
860}
861
862static int do_flock(struct file *file, int cmd, struct file_lock *fl)
863{
864 struct gfs2_file *fp = file->private_data;
865 struct gfs2_holder *fl_gh = &fp->f_fl_gh;
866 struct gfs2_inode *ip = GFS2_I(file->f_dentry->d_inode);
867 struct gfs2_glock *gl;
868 unsigned int state;
869 int flags;
870 int error = 0;
871
872 state = (fl->fl_type == F_WRLCK) ? LM_ST_EXCLUSIVE : LM_ST_SHARED;
873 flags = ((IS_SETLKW(cmd)) ? 0 : LM_FLAG_TRY) | GL_EXACT | GL_NOCACHE;
874
875 mutex_lock(&fp->f_fl_mutex);
876
877 gl = fl_gh->gh_gl;
878 if (gl) {
879 if (fl_gh->gh_state == state)
880 goto out;
881 gfs2_glock_hold(gl);
882 flock_lock_file_wait(file,
883 &(struct file_lock){.fl_type = F_UNLCK});
884 gfs2_glock_dq_uninit(fl_gh);
885 } else {
886 error = gfs2_glock_get(GFS2_SB(&ip->i_inode),
887 ip->i_num.no_addr, &gfs2_flock_glops,
888 CREATE, &gl);
889 if (error)
890 goto out;
891 }
892
893 gfs2_holder_init(gl, state, flags, fl_gh);
894 gfs2_glock_put(gl);
895
896 error = gfs2_glock_nq(fl_gh);
897 if (error) {
898 gfs2_holder_uninit(fl_gh);
899 if (error == GLR_TRYFAILED)
900 error = -EAGAIN;
901 } else {
902 error = flock_lock_file_wait(file, fl);
903 gfs2_assert_warn(GFS2_SB(&ip->i_inode), !error);
904 }
905
906 out:
907 mutex_unlock(&fp->f_fl_mutex);
908
909 return error;
910}
911
912static void do_unflock(struct file *file, struct file_lock *fl)
913{
914 struct gfs2_file *fp = file->private_data;
915 struct gfs2_holder *fl_gh = &fp->f_fl_gh;
916
917 mutex_lock(&fp->f_fl_mutex);
918 flock_lock_file_wait(file, fl);
919 if (fl_gh->gh_gl)
920 gfs2_glock_dq_uninit(fl_gh);
921 mutex_unlock(&fp->f_fl_mutex);
922}
923
924/**
925 * gfs2_flock - acquire/release a flock lock on a file
926 * @file: the file pointer
927 * @cmd: either modify or retrieve lock state, possibly wait
928 * @fl: type and range of lock
929 *
930 * Returns: errno
931 */
932
933static int gfs2_flock(struct file *file, int cmd, struct file_lock *fl)
934{
935 struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
936 struct gfs2_sbd *sdp = GFS2_SB(file->f_mapping->host);
937
938 if (!(fl->fl_flags & FL_FLOCK))
939 return -ENOLCK;
940 if ((ip->i_di.di_mode & (S_ISGID | S_IXGRP)) == S_ISGID)
941 return -ENOLCK;
942
943 if (sdp->sd_args.ar_localflocks)
944 return flock_lock_file_wait(file, fl);
945
946 if (fl->fl_type == F_UNLCK) {
947 do_unflock(file, fl);
948 return 0;
949 } else
950 return do_flock(file, cmd, fl);
951}
952
953const struct file_operations gfs2_file_fops = {
954 .llseek = gfs2_llseek,
955 .read = gfs2_read,
956 .readv = gfs2_file_readv,
957 .aio_read = gfs2_file_aio_read,
958 .write = generic_file_write,
959 .writev = generic_file_writev,
960 .aio_write = generic_file_aio_write,
961 .unlocked_ioctl = gfs2_ioctl,
962 .mmap = gfs2_mmap,
963 .open = gfs2_open,
964 .release = gfs2_close,
965 .fsync = gfs2_fsync,
966 .lock = gfs2_lock,
967 .sendfile = generic_file_sendfile,
968 .flock = gfs2_flock,
969 .splice_read = generic_file_splice_read,
970 .splice_write = generic_file_splice_write,
971};
972
973const struct file_operations gfs2_dir_fops = {
974 .readdir = gfs2_readdir,
975 .unlocked_ioctl = gfs2_ioctl,
976 .open = gfs2_open,
977 .release = gfs2_close,
978 .fsync = gfs2_fsync,
979 .lock = gfs2_lock,
980 .flock = gfs2_flock,
981};
982
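The iflags_to_gfs2 and gfs2_to_iflags arrays earlier in ops_file.c are indexed by bit position: if bit i is set in the source word, table[i] is OR'd into the result. iflags_cvt() itself is declared outside this patch; a plausible equivalent, shown purely for illustration:

/* Sketch of the conversion helper assumed by gfs2_get_flags()/gfs2_set_flags(). */
static u32 iflags_cvt_sketch(const u32 *table, u32 val)
{
	u32 res = 0;
	int i;

	for (i = 0; i < 32; i++)
		if (val & (1u << i))
			res |= table[i];
	return res;
}
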
diff --git a/fs/gfs2/ops_file.h b/fs/gfs2/ops_file.h
new file mode 100644
index 000000000000..46302b513937
--- /dev/null
+++ b/fs/gfs2/ops_file.h
@@ -0,0 +1,20 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __OPS_FILE_DOT_H__
11#define __OPS_FILE_DOT_H__
12extern struct file gfs2_internal_file_sentinal;
13extern int gfs2_internal_read(struct gfs2_inode *ip,
14 struct file_ra_state *ra_state,
15 char *buf, loff_t *pos, unsigned size);
16
17extern const struct file_operations gfs2_file_fops;
18extern const struct file_operations gfs2_dir_fops;
19
20#endif /* __OPS_FILE_DOT_H__ */
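gfs2_internal_read() gives the rest of the filesystem a way to read internal inodes through the page cache without opening a real struct file; the sentinel file stands in for one so that file_accessed() and the readpage locking check have something well-defined to look at. A hedged usage sketch (the caller, buffer, and length are purely illustrative):

/* Illustration only: read the first len bytes of an internal inode. */
static int gfs2_internal_read_sketch(struct gfs2_inode *ip, char *buf, unsigned int len)
{
	struct file_ra_state ra_state;
	loff_t pos = 0;
	int copied;

	file_ra_state_init(&ra_state, ip->i_inode.i_mapping);	/* per-call readahead state */
	copied = gfs2_internal_read(ip, &ra_state, buf, &pos, len);
	return copied < 0 ? copied : 0;
}
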
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
new file mode 100644
index 000000000000..a86ce67949db
--- /dev/null
+++ b/fs/gfs2/ops_fstype.c
@@ -0,0 +1,841 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/vmalloc.h>
16#include <linux/blkdev.h>
17#include <linux/kthread.h>
18#include <linux/gfs2_ondisk.h>
19
20#include "gfs2.h"
21#include "lm_interface.h"
22#include "incore.h"
23#include "daemon.h"
24#include "glock.h"
25#include "glops.h"
26#include "inode.h"
27#include "lm.h"
28#include "mount.h"
29#include "ops_export.h"
30#include "ops_fstype.h"
31#include "ops_super.h"
32#include "recovery.h"
33#include "rgrp.h"
34#include "super.h"
35#include "sys.h"
36#include "util.h"
37
38#define DO 0
39#define UNDO 1
40
41extern struct dentry_operations gfs2_dops;
42
43static struct gfs2_sbd *init_sbd(struct super_block *sb)
44{
45 struct gfs2_sbd *sdp;
46 unsigned int x;
47
48 sdp = vmalloc(sizeof(struct gfs2_sbd));
49 if (!sdp)
50 return NULL;
51
52 memset(sdp, 0, sizeof(struct gfs2_sbd));
53
54 sb->s_fs_info = sdp;
55 sdp->sd_vfs = sb;
56
57 gfs2_tune_init(&sdp->sd_tune);
58
59 for (x = 0; x < GFS2_GL_HASH_SIZE; x++) {
60 rwlock_init(&sdp->sd_gl_hash[x].hb_lock);
61 INIT_LIST_HEAD(&sdp->sd_gl_hash[x].hb_list);
62 }
63 INIT_LIST_HEAD(&sdp->sd_reclaim_list);
64 spin_lock_init(&sdp->sd_reclaim_lock);
65 init_waitqueue_head(&sdp->sd_reclaim_wq);
66 mutex_init(&sdp->sd_invalidate_inodes_mutex);
67
68 mutex_init(&sdp->sd_inum_mutex);
69 spin_lock_init(&sdp->sd_statfs_spin);
70 mutex_init(&sdp->sd_statfs_mutex);
71
72 spin_lock_init(&sdp->sd_rindex_spin);
73 mutex_init(&sdp->sd_rindex_mutex);
74 INIT_LIST_HEAD(&sdp->sd_rindex_list);
75 INIT_LIST_HEAD(&sdp->sd_rindex_mru_list);
76 INIT_LIST_HEAD(&sdp->sd_rindex_recent_list);
77
78 INIT_LIST_HEAD(&sdp->sd_jindex_list);
79 spin_lock_init(&sdp->sd_jindex_spin);
80 mutex_init(&sdp->sd_jindex_mutex);
81
82 INIT_LIST_HEAD(&sdp->sd_quota_list);
83 spin_lock_init(&sdp->sd_quota_spin);
84 mutex_init(&sdp->sd_quota_mutex);
85
86 spin_lock_init(&sdp->sd_log_lock);
87
88 INIT_LIST_HEAD(&sdp->sd_log_le_gl);
89 INIT_LIST_HEAD(&sdp->sd_log_le_buf);
90 INIT_LIST_HEAD(&sdp->sd_log_le_revoke);
91 INIT_LIST_HEAD(&sdp->sd_log_le_rg);
92 INIT_LIST_HEAD(&sdp->sd_log_le_databuf);
93
94 mutex_init(&sdp->sd_log_reserve_mutex);
95 INIT_LIST_HEAD(&sdp->sd_ail1_list);
96 INIT_LIST_HEAD(&sdp->sd_ail2_list);
97
98 init_rwsem(&sdp->sd_log_flush_lock);
99 INIT_LIST_HEAD(&sdp->sd_log_flush_list);
100
101 INIT_LIST_HEAD(&sdp->sd_revoke_list);
102
103 mutex_init(&sdp->sd_freeze_lock);
104
105 return sdp;
106}
107
108static void init_vfs(struct super_block *sb, unsigned noatime)
109{
110 struct gfs2_sbd *sdp = sb->s_fs_info;
111
112 sb->s_magic = GFS2_MAGIC;
113 sb->s_op = &gfs2_super_ops;
114 sb->s_export_op = &gfs2_export_ops;
115 sb->s_maxbytes = MAX_LFS_FILESIZE;
116
117 if (sb->s_flags & (MS_NOATIME | MS_NODIRATIME))
118 set_bit(noatime, &sdp->sd_flags);
119
120 /* Don't let the VFS update atimes. GFS2 handles this itself. */
121 sb->s_flags |= MS_NOATIME | MS_NODIRATIME;
122}
123
124static int init_names(struct gfs2_sbd *sdp, int silent)
125{
126 struct gfs2_sb *sb = NULL;
127 char *proto, *table;
128 int error = 0;
129
130 proto = sdp->sd_args.ar_lockproto;
131 table = sdp->sd_args.ar_locktable;
132
133 /* Try to autodetect */
134
135 if (!proto[0] || !table[0]) {
136 struct buffer_head *bh;
137 bh = sb_getblk(sdp->sd_vfs,
138 GFS2_SB_ADDR >> sdp->sd_fsb2bb_shift);
139 lock_buffer(bh);
140 clear_buffer_uptodate(bh);
141 clear_buffer_dirty(bh);
142 unlock_buffer(bh);
143 ll_rw_block(READ, 1, &bh);
144 wait_on_buffer(bh);
145
146 if (!buffer_uptodate(bh)) {
147 brelse(bh);
148 return -EIO;
149 }
150
151 sb = kmalloc(sizeof(struct gfs2_sb), GFP_KERNEL);
152 if (!sb) {
153 brelse(bh);
154 return -ENOMEM;
155 }
156 gfs2_sb_in(sb, bh->b_data);
157 brelse(bh);
158
159 error = gfs2_check_sb(sdp, sb, silent);
160 if (error)
161 goto out;
162
163 if (!proto[0])
164 proto = sb->sb_lockproto;
165 if (!table[0])
166 table = sb->sb_locktable;
167 }
168
169 if (!table[0])
170 table = sdp->sd_vfs->s_id;
171
172 snprintf(sdp->sd_proto_name, GFS2_FSNAME_LEN, "%s", proto);
173 snprintf(sdp->sd_table_name, GFS2_FSNAME_LEN, "%s", table);
174
175 out:
176 kfree(sb);
177
178 return error;
179}
180
181static int init_locking(struct gfs2_sbd *sdp, struct gfs2_holder *mount_gh,
182 int undo)
183{
184 struct task_struct *p;
185 int error = 0;
186
187 if (undo)
188 goto fail_trans;
189
190 p = kthread_run(gfs2_scand, sdp, "gfs2_scand");
191 error = IS_ERR(p);
192 if (error) {
193 fs_err(sdp, "can't start scand thread: %d\n", error);
194 return error;
195 }
196 sdp->sd_scand_process = p;
197
198 for (sdp->sd_glockd_num = 0;
199 sdp->sd_glockd_num < sdp->sd_args.ar_num_glockd;
200 sdp->sd_glockd_num++) {
201 p = kthread_run(gfs2_glockd, sdp, "gfs2_glockd");
202 error = IS_ERR(p);
203 if (error) {
204 fs_err(sdp, "can't start glockd thread: %d\n", error);
205 goto fail;
206 }
207 sdp->sd_glockd_process[sdp->sd_glockd_num] = p;
208 }
209
210 error = gfs2_glock_nq_num(sdp,
211 GFS2_MOUNT_LOCK, &gfs2_nondisk_glops,
212 LM_ST_EXCLUSIVE, LM_FLAG_NOEXP | GL_NOCACHE,
213 mount_gh);
214 if (error) {
215 fs_err(sdp, "can't acquire mount glock: %d\n", error);
216 goto fail;
217 }
218
219 error = gfs2_glock_nq_num(sdp,
220 GFS2_LIVE_LOCK, &gfs2_nondisk_glops,
221 LM_ST_SHARED,
222 LM_FLAG_NOEXP | GL_EXACT,
223 &sdp->sd_live_gh);
224 if (error) {
225 fs_err(sdp, "can't acquire live glock: %d\n", error);
226 goto fail_mount;
227 }
228
229 error = gfs2_glock_get(sdp, GFS2_RENAME_LOCK, &gfs2_nondisk_glops,
230 CREATE, &sdp->sd_rename_gl);
231 if (error) {
232 fs_err(sdp, "can't create rename glock: %d\n", error);
233 goto fail_live;
234 }
235
236 error = gfs2_glock_get(sdp, GFS2_TRANS_LOCK, &gfs2_trans_glops,
237 CREATE, &sdp->sd_trans_gl);
238 if (error) {
239 fs_err(sdp, "can't create transaction glock: %d\n", error);
240 goto fail_rename;
241 }
242 set_bit(GLF_STICKY, &sdp->sd_trans_gl->gl_flags);
243
244 return 0;
245
246fail_trans:
247 gfs2_glock_put(sdp->sd_trans_gl);
248
249fail_rename:
250 gfs2_glock_put(sdp->sd_rename_gl);
251
252fail_live:
253 gfs2_glock_dq_uninit(&sdp->sd_live_gh);
254
255fail_mount:
256 gfs2_glock_dq_uninit(mount_gh);
257
258fail:
259 while (sdp->sd_glockd_num--)
260 kthread_stop(sdp->sd_glockd_process[sdp->sd_glockd_num]);
261
262 kthread_stop(sdp->sd_scand_process);
263
264 return error;
265}
266
267static struct inode *gfs2_lookup_root(struct super_block *sb,
268 struct gfs2_inum *inum)
269{
270 return gfs2_inode_lookup(sb, inum, DT_DIR);
271}
272
273static int init_sb(struct gfs2_sbd *sdp, int silent, int undo)
274{
275 struct super_block *sb = sdp->sd_vfs;
276 struct gfs2_holder sb_gh;
277 struct gfs2_inum *inum;
278 struct inode *inode;
279 int error = 0;
280
281 if (undo) {
282 return 0;
283 }
284
285 error = gfs2_glock_nq_num(sdp, GFS2_SB_LOCK, &gfs2_meta_glops,
286 LM_ST_SHARED, 0, &sb_gh);
287 if (error) {
288 fs_err(sdp, "can't acquire superblock glock: %d\n", error);
289 return error;
290 }
291
292 error = gfs2_read_sb(sdp, sb_gh.gh_gl, silent);
293 if (error) {
294 fs_err(sdp, "can't read superblock: %d\n", error);
295 goto out;
296 }
297
298 /* Set up the buffer cache and SB for real */
299 if (sdp->sd_sb.sb_bsize < bdev_hardsect_size(sb->s_bdev)) {
300 error = -EINVAL;
301 fs_err(sdp, "FS block size (%u) is too small for device "
302 "block size (%u)\n",
303 sdp->sd_sb.sb_bsize, bdev_hardsect_size(sb->s_bdev));
304 goto out;
305 }
306 if (sdp->sd_sb.sb_bsize > PAGE_SIZE) {
307 error = -EINVAL;
308 fs_err(sdp, "FS block size (%u) is too big for machine "
309 "page size (%u)\n",
310 sdp->sd_sb.sb_bsize, (unsigned int)PAGE_SIZE);
311 goto out;
312 }
313
314 /* Get rid of buffers from the original block size */
315 sb_gh.gh_gl->gl_ops->go_inval(sb_gh.gh_gl, DIO_METADATA | DIO_DATA);
316 sb_gh.gh_gl->gl_aspace->i_blkbits = sdp->sd_sb.sb_bsize_shift;
317
318 sb_set_blocksize(sb, sdp->sd_sb.sb_bsize);
319
320 /* Get the root inode */
321 inum = &sdp->sd_sb.sb_root_dir;
322 if (sb->s_type == &gfs2meta_fs_type)
323 inum = &sdp->sd_sb.sb_master_dir;
324 inode = gfs2_lookup_root(sb, inum);
325 if (IS_ERR(inode)) {
326 error = PTR_ERR(inode);
327 fs_err(sdp, "can't read in root inode: %d\n", error);
328 goto out;
329 }
330
331 sb->s_root = d_alloc_root(inode);
332 if (!sb->s_root) {
333 fs_err(sdp, "can't get root dentry\n");
334 error = -ENOMEM;
335 iput(inode);
336 } else
337 sb->s_root->d_op = &gfs2_dops;
338out:
339 gfs2_glock_dq_uninit(&sb_gh);
340 return error;
341}
342
343static int init_journal(struct gfs2_sbd *sdp, int undo)
344{
345 struct gfs2_holder ji_gh;
346 struct task_struct *p;
347 struct gfs2_inode *ip;
348 int jindex = 1;
349 int error = 0;
350
351 if (undo) {
352 jindex = 0;
353 goto fail_recoverd;
354 }
355
356 sdp->sd_jindex = gfs2_lookup_simple(sdp->sd_master_dir, "jindex");
357 if (IS_ERR(sdp->sd_jindex)) {
358 fs_err(sdp, "can't lookup journal index: %ld\n", PTR_ERR(sdp->sd_jindex));
359 return PTR_ERR(sdp->sd_jindex);
360 }
361 ip = GFS2_I(sdp->sd_jindex);
362 set_bit(GLF_STICKY, &ip->i_gl->gl_flags);
363
364 /* Load in the journal index special file */
365
366 error = gfs2_jindex_hold(sdp, &ji_gh);
367 if (error) {
368 fs_err(sdp, "can't read journal index: %d\n", error);
369 goto fail;
370 }
371
372 error = -EINVAL;
373 if (!gfs2_jindex_size(sdp)) {
374 fs_err(sdp, "no journals!\n");
375 goto fail_jindex;
376 }
377
378 if (sdp->sd_args.ar_spectator) {
379 sdp->sd_jdesc = gfs2_jdesc_find(sdp, 0);
380 sdp->sd_log_blks_free = sdp->sd_jdesc->jd_blocks;
381 } else {
382 if (sdp->sd_lockstruct.ls_jid >= gfs2_jindex_size(sdp)) {
383 fs_err(sdp, "can't mount journal #%u\n",
384 sdp->sd_lockstruct.ls_jid);
385 fs_err(sdp, "there are only %u journals (0 - %u)\n",
386 gfs2_jindex_size(sdp),
387 gfs2_jindex_size(sdp) - 1);
388 goto fail_jindex;
389 }
390 sdp->sd_jdesc = gfs2_jdesc_find(sdp, sdp->sd_lockstruct.ls_jid);
391
392 error = gfs2_glock_nq_num(sdp, sdp->sd_lockstruct.ls_jid,
393 &gfs2_journal_glops,
394 LM_ST_EXCLUSIVE, LM_FLAG_NOEXP,
395 &sdp->sd_journal_gh);
396 if (error) {
397 fs_err(sdp, "can't acquire journal glock: %d\n", error);
398 goto fail_jindex;
399 }
400
401 ip = GFS2_I(sdp->sd_jdesc->jd_inode);
402 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED,
403 LM_FLAG_NOEXP | GL_EXACT,
404 &sdp->sd_jinode_gh);
405 if (error) {
406 fs_err(sdp, "can't acquire journal inode glock: %d\n",
407 error);
408 goto fail_journal_gh;
409 }
410
411 error = gfs2_jdesc_check(sdp->sd_jdesc);
412 if (error) {
413 fs_err(sdp, "my journal (%u) is bad: %d\n",
414 sdp->sd_jdesc->jd_jid, error);
415 goto fail_jinode_gh;
416 }
417 sdp->sd_log_blks_free = sdp->sd_jdesc->jd_blocks;
418 }
419
420 if (sdp->sd_lockstruct.ls_first) {
421 unsigned int x;
422 for (x = 0; x < sdp->sd_journals; x++) {
423 error = gfs2_recover_journal(gfs2_jdesc_find(sdp, x));
424 if (error) {
425 fs_err(sdp, "error recovering journal %u: %d\n",
426 x, error);
427 goto fail_jinode_gh;
428 }
429 }
430
431 gfs2_lm_others_may_mount(sdp);
432 } else if (!sdp->sd_args.ar_spectator) {
433 error = gfs2_recover_journal(sdp->sd_jdesc);
434 if (error) {
435 fs_err(sdp, "error recovering my journal: %d\n", error);
436 goto fail_jinode_gh;
437 }
438 }
439
440 set_bit(SDF_JOURNAL_CHECKED, &sdp->sd_flags);
441 gfs2_glock_dq_uninit(&ji_gh);
442 jindex = 0;
443
444 p = kthread_run(gfs2_recoverd, sdp, "gfs2_recoverd");
445	error = IS_ERR(p) ? PTR_ERR(p) : 0;
446 if (error) {
447 fs_err(sdp, "can't start recoverd thread: %d\n", error);
448 goto fail_jinode_gh;
449 }
450 sdp->sd_recoverd_process = p;
451
452 return 0;
453
454 fail_recoverd:
455 kthread_stop(sdp->sd_recoverd_process);
456
457 fail_jinode_gh:
458 if (!sdp->sd_args.ar_spectator)
459 gfs2_glock_dq_uninit(&sdp->sd_jinode_gh);
460
461 fail_journal_gh:
462 if (!sdp->sd_args.ar_spectator)
463 gfs2_glock_dq_uninit(&sdp->sd_journal_gh);
464
465 fail_jindex:
466 gfs2_jindex_free(sdp);
467 if (jindex)
468 gfs2_glock_dq_uninit(&ji_gh);
469
470 fail:
471 iput(sdp->sd_jindex);
472
473 return error;
474}
475
476
477static int init_inodes(struct gfs2_sbd *sdp, int undo)
478{
479 int error = 0;
480 struct gfs2_inode *ip;
481 struct inode *inode;
482
483 if (undo)
484 goto fail_qinode;
485
486 inode = gfs2_lookup_root(sdp->sd_vfs, &sdp->sd_sb.sb_master_dir);
487 if (IS_ERR(inode)) {
488 error = PTR_ERR(inode);
489 fs_err(sdp, "can't read in master directory: %d\n", error);
490 goto fail;
491 }
492 sdp->sd_master_dir = inode;
493
494 error = init_journal(sdp, undo);
495 if (error)
496 goto fail_master;
497
498 /* Read in the master inode number inode */
499 sdp->sd_inum_inode = gfs2_lookup_simple(sdp->sd_master_dir, "inum");
500 if (IS_ERR(sdp->sd_inum_inode)) {
501 error = PTR_ERR(sdp->sd_inum_inode);
502 fs_err(sdp, "can't read in inum inode: %d\n", error);
503 goto fail_journal;
504 }
505
506
507 /* Read in the master statfs inode */
508 sdp->sd_statfs_inode = gfs2_lookup_simple(sdp->sd_master_dir, "statfs");
509 if (IS_ERR(sdp->sd_statfs_inode)) {
510 error = PTR_ERR(sdp->sd_statfs_inode);
511 fs_err(sdp, "can't read in statfs inode: %d\n", error);
512 goto fail_inum;
513 }
514
515 /* Read in the resource index inode */
516 sdp->sd_rindex = gfs2_lookup_simple(sdp->sd_master_dir, "rindex");
517 if (IS_ERR(sdp->sd_rindex)) {
518 error = PTR_ERR(sdp->sd_rindex);
519 fs_err(sdp, "can't get resource index inode: %d\n", error);
520 goto fail_statfs;
521 }
522 ip = GFS2_I(sdp->sd_rindex);
523 set_bit(GLF_STICKY, &ip->i_gl->gl_flags);
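	/* Start the cached rindex version one behind the glock's version so
	   that the first resource group lookup forces a re-read from disk. */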
524 sdp->sd_rindex_vn = ip->i_gl->gl_vn - 1;
525
526 /* Read in the quota inode */
527 sdp->sd_quota_inode = gfs2_lookup_simple(sdp->sd_master_dir, "quota");
528 if (IS_ERR(sdp->sd_quota_inode)) {
529 error = PTR_ERR(sdp->sd_quota_inode);
530 fs_err(sdp, "can't get quota file inode: %d\n", error);
531 goto fail_rindex;
532 }
533 return 0;
534
535fail_qinode:
536 iput(sdp->sd_quota_inode);
537
538fail_rindex:
539 gfs2_clear_rgrpd(sdp);
540 iput(sdp->sd_rindex);
541
542fail_statfs:
543 iput(sdp->sd_statfs_inode);
544
545fail_inum:
546 iput(sdp->sd_inum_inode);
547fail_journal:
548 init_journal(sdp, UNDO);
549fail_master:
550 iput(sdp->sd_master_dir);
551fail:
552 return error;
553}
554
555static int init_per_node(struct gfs2_sbd *sdp, int undo)
556{
557 struct inode *pn = NULL;
558 char buf[30];
559 int error = 0;
560 struct gfs2_inode *ip;
561
562 if (sdp->sd_args.ar_spectator)
563 return 0;
564
565 if (undo)
566 goto fail_qc_gh;
567
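	/* Each journal id has its own set of local files in the per_node
	   directory (inum_range<jid>, statfs_change<jid>, quota_change<jid>);
	   a spectator mount owns no journal, so there is nothing to set up. */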
568 pn = gfs2_lookup_simple(sdp->sd_master_dir, "per_node");
569 if (IS_ERR(pn)) {
570 error = PTR_ERR(pn);
571 fs_err(sdp, "can't find per_node directory: %d\n", error);
572 return error;
573 }
574
575 sprintf(buf, "inum_range%u", sdp->sd_jdesc->jd_jid);
576 sdp->sd_ir_inode = gfs2_lookup_simple(pn, buf);
577 if (IS_ERR(sdp->sd_ir_inode)) {
578 error = PTR_ERR(sdp->sd_ir_inode);
579 fs_err(sdp, "can't find local \"ir\" file: %d\n", error);
580 goto fail;
581 }
582
583 sprintf(buf, "statfs_change%u", sdp->sd_jdesc->jd_jid);
584 sdp->sd_sc_inode = gfs2_lookup_simple(pn, buf);
585 if (IS_ERR(sdp->sd_sc_inode)) {
586 error = PTR_ERR(sdp->sd_sc_inode);
587 fs_err(sdp, "can't find local \"sc\" file: %d\n", error);
588 goto fail_ir_i;
589 }
590
591 sprintf(buf, "quota_change%u", sdp->sd_jdesc->jd_jid);
592 sdp->sd_qc_inode = gfs2_lookup_simple(pn, buf);
593 if (IS_ERR(sdp->sd_qc_inode)) {
594 error = PTR_ERR(sdp->sd_qc_inode);
595 fs_err(sdp, "can't find local \"qc\" file: %d\n", error);
596 goto fail_ut_i;
597 }
598
599 iput(pn);
600 pn = NULL;
601
602 ip = GFS2_I(sdp->sd_ir_inode);
603 error = gfs2_glock_nq_init(ip->i_gl,
604 LM_ST_EXCLUSIVE, 0,
605 &sdp->sd_ir_gh);
606 if (error) {
607 fs_err(sdp, "can't lock local \"ir\" file: %d\n", error);
608 goto fail_qc_i;
609 }
610
611 ip = GFS2_I(sdp->sd_sc_inode);
612 error = gfs2_glock_nq_init(ip->i_gl,
613 LM_ST_EXCLUSIVE, 0,
614 &sdp->sd_sc_gh);
615 if (error) {
616 fs_err(sdp, "can't lock local \"sc\" file: %d\n", error);
617 goto fail_ir_gh;
618 }
619
620 ip = GFS2_I(sdp->sd_qc_inode);
621 error = gfs2_glock_nq_init(ip->i_gl,
622 LM_ST_EXCLUSIVE, 0,
623 &sdp->sd_qc_gh);
624 if (error) {
625 fs_err(sdp, "can't lock local \"qc\" file: %d\n", error);
626 goto fail_ut_gh;
627 }
628
629 return 0;
630
631 fail_qc_gh:
632 gfs2_glock_dq_uninit(&sdp->sd_qc_gh);
633
634 fail_ut_gh:
635
636 gfs2_glock_dq_uninit(&sdp->sd_sc_gh);
637
638 fail_ir_gh:
639 gfs2_glock_dq_uninit(&sdp->sd_ir_gh);
640
641 fail_qc_i:
642 iput(sdp->sd_qc_inode);
643
644 fail_ut_i:
645
646 iput(sdp->sd_sc_inode);
647
648 fail_ir_i:
649 iput(sdp->sd_ir_inode);
650
651 fail:
652 if (pn)
653 iput(pn);
654 return error;
655}
656
657static int init_threads(struct gfs2_sbd *sdp, int undo)
658{
659 struct task_struct *p;
660 int error = 0;
661
662 if (undo)
663 goto fail_quotad;
664
665 sdp->sd_log_flush_time = jiffies;
666 sdp->sd_jindex_refresh_time = jiffies;
667
668 p = kthread_run(gfs2_logd, sdp, "gfs2_logd");
669	error = IS_ERR(p) ? PTR_ERR(p) : 0;
670 if (error) {
671 fs_err(sdp, "can't start logd thread: %d\n", error);
672 return error;
673 }
674 sdp->sd_logd_process = p;
675
676 sdp->sd_statfs_sync_time = jiffies;
677 sdp->sd_quota_sync_time = jiffies;
678
679 p = kthread_run(gfs2_quotad, sdp, "gfs2_quotad");
680	error = IS_ERR(p) ? PTR_ERR(p) : 0;
681 if (error) {
682 fs_err(sdp, "can't start quotad thread: %d\n", error);
683 goto fail;
684 }
685 sdp->sd_quotad_process = p;
686
687 return 0;
688
689
690fail_quotad:
691 kthread_stop(sdp->sd_quotad_process);
692fail:
693 kthread_stop(sdp->sd_logd_process);
694 return error;
695}
696
697/**
698 * fill_super - Read in superblock
699 * @sb: The VFS superblock
700 * @data: Mount options
701 * @silent: Don't complain if it's not a GFS2 filesystem
702 *
703 * Returns: errno
704 */
705
706static int fill_super(struct super_block *sb, void *data, int silent)
707{
708 struct gfs2_sbd *sdp;
709 struct gfs2_holder mount_gh;
710 int error;
711
712 sdp = init_sbd(sb);
713 if (!sdp) {
714 printk(KERN_WARNING "GFS2: can't alloc struct gfs2_sbd\n");
715 return -ENOMEM;
716 }
717
718 error = gfs2_mount_args(sdp, (char *)data, 0);
719 if (error) {
720 printk(KERN_WARNING "GFS2: can't parse mount arguments\n");
721 goto fail;
722 }
723
724 init_vfs(sb, SDF_NOATIME);
725
726 /* Set up the buffer cache and fill in some fake block size values
727	   to allow us to read in the on-disk superblock. */
728 sdp->sd_sb.sb_bsize = sb_min_blocksize(sb, GFS2_BASIC_BLOCK);
729 sdp->sd_sb.sb_bsize_shift = sb->s_blocksize_bits;
730 sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift -
731 GFS2_BASIC_BLOCK_SHIFT;
732 sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift;
733
734 error = init_names(sdp, silent);
735 if (error)
736 goto fail;
737
738 error = gfs2_sys_fs_add(sdp);
739 if (error)
740 goto fail;
741
742 error = gfs2_lm_mount(sdp, silent);
743 if (error)
744 goto fail_sys;
745
746 error = init_locking(sdp, &mount_gh, DO);
747 if (error)
748 goto fail_lm;
749
750 error = init_sb(sdp, silent, DO);
751 if (error)
752 goto fail_locking;
753
754 error = init_inodes(sdp, DO);
755 if (error)
756 goto fail_sb;
757
758 error = init_per_node(sdp, DO);
759 if (error)
760 goto fail_inodes;
761
762 error = gfs2_statfs_init(sdp);
763 if (error) {
764 fs_err(sdp, "can't initialize statfs subsystem: %d\n", error);
765 goto fail_per_node;
766 }
767
768 error = init_threads(sdp, DO);
769 if (error)
770 goto fail_per_node;
771
772 if (!(sb->s_flags & MS_RDONLY)) {
773 error = gfs2_make_fs_rw(sdp);
774 if (error) {
775 fs_err(sdp, "can't make FS RW: %d\n", error);
776 goto fail_threads;
777 }
778 }
779
780 gfs2_glock_dq_uninit(&mount_gh);
781
782 return 0;
783
784 fail_threads:
785 init_threads(sdp, UNDO);
786
787 fail_per_node:
788 init_per_node(sdp, UNDO);
789
790 fail_inodes:
791 init_inodes(sdp, UNDO);
792
793 fail_sb:
794 init_sb(sdp, 0, UNDO);
795
796 fail_locking:
797 init_locking(sdp, &mount_gh, UNDO);
798
799 fail_lm:
800 gfs2_gl_hash_clear(sdp, WAIT);
801 gfs2_lm_unmount(sdp);
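	/* Cached glocks can keep inodes pinned, so invalidate_inodes() may
	   need several passes before the last one goes away. */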
802 while (invalidate_inodes(sb))
803 yield();
804
805 fail_sys:
806 gfs2_sys_fs_del(sdp);
807
808 fail:
809 vfree(sdp);
810 sb->s_fs_info = NULL;
811
812 return error;
813}
814
815static int gfs2_get_sb(struct file_system_type *fs_type, int flags,
816 const char *dev_name, void *data, struct vfsmount *mnt)
817{
818 return get_sb_bdev(fs_type, flags, dev_name, data, fill_super, mnt);
819}
820
821static void gfs2_kill_sb(struct super_block *sb)
822{
823 kill_block_super(sb);
824}
825
826struct file_system_type gfs2_fs_type = {
827 .name = "gfs2",
828 .fs_flags = FS_REQUIRES_DEV,
829 .get_sb = gfs2_get_sb,
830 .kill_sb = gfs2_kill_sb,
831 .owner = THIS_MODULE,
832};
833
834struct file_system_type gfs2meta_fs_type = {
835 .name = "gfs2meta",
836 .fs_flags = FS_REQUIRES_DEV,
837 .get_sb = gfs2_get_sb,
838 .kill_sb = gfs2_kill_sb,
839 .owner = THIS_MODULE,
840};
841
diff --git a/fs/gfs2/ops_fstype.h b/fs/gfs2/ops_fstype.h
new file mode 100644
index 000000000000..622f5760d6b2
--- /dev/null
+++ b/fs/gfs2/ops_fstype.h
@@ -0,0 +1,16 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __OPS_FSTYPE_DOT_H__
11#define __OPS_FSTYPE_DOT_H__
12
13extern struct file_system_type gfs2_fs_type;
14extern struct file_system_type gfs2meta_fs_type;
15
16#endif /* __OPS_FSTYPE_DOT_H__ */
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
new file mode 100644
index 000000000000..caecafe0469b
--- /dev/null
+++ b/fs/gfs2/ops_inode.c
@@ -0,0 +1,1166 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/namei.h>
16#include <linux/utsname.h>
17#include <linux/mm.h>
18#include <linux/xattr.h>
19#include <linux/posix_acl.h>
20#include <linux/gfs2_ondisk.h>
21#include <linux/crc32.h>
22#include <asm/uaccess.h>
23
24#include "gfs2.h"
25#include "lm_interface.h"
26#include "incore.h"
27#include "acl.h"
28#include "bmap.h"
29#include "dir.h"
30#include "eaops.h"
31#include "eattr.h"
32#include "glock.h"
33#include "inode.h"
34#include "meta_io.h"
35#include "ops_dentry.h"
36#include "ops_inode.h"
37#include "page.h"
38#include "quota.h"
39#include "rgrp.h"
40#include "trans.h"
41#include "util.h"
42
43/**
44 * gfs2_create - Create a file
45 * @dir: The directory in which to create the file
46 * @dentry: The dentry of the new file
47 * @mode: The mode of the new file
48 *
49 * Returns: errno
50 */
51
52static int gfs2_create(struct inode *dir, struct dentry *dentry,
53 int mode, struct nameidata *nd)
54{
55 struct gfs2_inode *dip = GFS2_I(dir);
56 struct gfs2_sbd *sdp = GFS2_SB(dir);
57 struct gfs2_holder ghs[2];
58 struct inode *inode;
59
60 gfs2_holder_init(dip->i_gl, 0, 0, ghs);
61
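	/* Try the create; on EEXIST without O_EXCL fall back to a lookup.
	   If the lookup comes back empty the entry has vanished underneath
	   us (e.g. a remote unlink), so go round and try the create again. */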
62 for (;;) {
63 inode = gfs2_createi(ghs, &dentry->d_name, S_IFREG | mode);
64 if (!IS_ERR(inode)) {
65 gfs2_trans_end(sdp);
66 if (dip->i_alloc.al_rgd)
67 gfs2_inplace_release(dip);
68 gfs2_quota_unlock(dip);
69 gfs2_alloc_put(dip);
70 gfs2_glock_dq_uninit_m(2, ghs);
71 mark_inode_dirty(inode);
72 break;
73 } else if (PTR_ERR(inode) != -EEXIST ||
74 (nd->intent.open.flags & O_EXCL)) {
75 gfs2_holder_uninit(ghs);
76 return PTR_ERR(inode);
77 }
78
79 inode = gfs2_lookupi(dir, &dentry->d_name, 0, nd);
80 if (inode) {
81 if (!IS_ERR(inode)) {
82 gfs2_holder_uninit(ghs);
83 break;
84 } else {
85 gfs2_holder_uninit(ghs);
86 return PTR_ERR(inode);
87 }
88 }
89 }
90
91 d_instantiate(dentry, inode);
92
93 return 0;
94}
95
96/**
97 * gfs2_lookup - Look up a filename in a directory and return its inode
98 * @dir: The directory inode
99 * @dentry: The dentry of the new inode
100 * @nd: passed from Linux VFS, ignored by us
101 *
102 * Called by the VFS layer. Lock dir and call gfs2_lookupi()
103 *
104 * Returns: errno
105 */
106
107static struct dentry *gfs2_lookup(struct inode *dir, struct dentry *dentry,
108 struct nameidata *nd)
109{
110 struct inode *inode = NULL;
111
112 dentry->d_op = &gfs2_dops;
113
114 inode = gfs2_lookupi(dir, &dentry->d_name, 0, nd);
115 if (inode && IS_ERR(inode))
116 return ERR_PTR(PTR_ERR(inode));
117
118 if (inode)
119 return d_splice_alias(inode, dentry);
120 d_add(dentry, inode);
121
122 return NULL;
123}
124
125/**
126 * gfs2_link - Link to a file
127 * @old_dentry: The inode to link
128 * @dir: Add link to this directory
129 * @dentry: The name of the link
130 *
131 * Link the inode in "old_dentry" into the directory "dir" with the
132 * name in "dentry".
133 *
134 * Returns: errno
135 */
136
137static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
138 struct dentry *dentry)
139{
140 struct gfs2_inode *dip = GFS2_I(dir);
141 struct gfs2_sbd *sdp = GFS2_SB(dir);
142 struct inode *inode = old_dentry->d_inode;
143 struct gfs2_inode *ip = GFS2_I(inode);
144 struct gfs2_holder ghs[2];
145 int alloc_required;
146 int error;
147
148 if (S_ISDIR(ip->i_di.di_mode))
149 return -EPERM;
150
151 gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
152 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1);
153
154 error = gfs2_glock_nq_m(2, ghs);
155 if (error)
156 goto out;
157
158 error = permission(dir, MAY_WRITE | MAY_EXEC, NULL);
159 if (error)
160 goto out_gunlock;
161
162 error = gfs2_dir_search(dir, &dentry->d_name, NULL, NULL);
163 switch (error) {
164 case -ENOENT:
165 break;
166 case 0:
167 error = -EEXIST;
168 default:
169 goto out_gunlock;
170 }
171
172 error = -EINVAL;
173 if (!dip->i_di.di_nlink)
174 goto out_gunlock;
175 error = -EFBIG;
176 if (dip->i_di.di_entries == (uint32_t)-1)
177 goto out_gunlock;
178 error = -EPERM;
179 if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
180 goto out_gunlock;
181 error = -EINVAL;
182 if (!ip->i_di.di_nlink)
183 goto out_gunlock;
184 error = -EMLINK;
185 if (ip->i_di.di_nlink == (uint32_t)-1)
186 goto out_gunlock;
187
188 alloc_required = error = gfs2_diradd_alloc_required(dir, &dentry->d_name);
189 if (error < 0)
190 goto out_gunlock;
191 error = 0;
192
193 if (alloc_required) {
194 struct gfs2_alloc *al = gfs2_alloc_get(dip);
195
196 error = gfs2_quota_lock(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
197 if (error)
198 goto out_alloc;
199
200 error = gfs2_quota_check(dip, dip->i_di.di_uid,
201 dip->i_di.di_gid);
202 if (error)
203 goto out_gunlock_q;
204
205 al->al_requested = sdp->sd_max_dirres;
206
207 error = gfs2_inplace_reserve(dip);
208 if (error)
209 goto out_gunlock_q;
210
211 error = gfs2_trans_begin(sdp, sdp->sd_max_dirres +
212 al->al_rgd->rd_ri.ri_length +
213 2 * RES_DINODE + RES_STATFS +
214 RES_QUOTA, 0);
215 if (error)
216 goto out_ipres;
217 } else {
218 error = gfs2_trans_begin(sdp, 2 * RES_DINODE + RES_LEAF, 0);
219 if (error)
220 goto out_ipres;
221 }
222
223 error = gfs2_dir_add(dir, &dentry->d_name, &ip->i_num,
224 IF2DT(ip->i_di.di_mode));
225 if (error)
226 goto out_end_trans;
227
228 error = gfs2_change_nlink(ip, +1);
229
230out_end_trans:
231 gfs2_trans_end(sdp);
232
233out_ipres:
234 if (alloc_required)
235 gfs2_inplace_release(dip);
236
237out_gunlock_q:
238 if (alloc_required)
239 gfs2_quota_unlock(dip);
240
241out_alloc:
242 if (alloc_required)
243 gfs2_alloc_put(dip);
244
245out_gunlock:
246 gfs2_glock_dq_m(2, ghs);
247
248out:
249 gfs2_holder_uninit(ghs);
250 gfs2_holder_uninit(ghs + 1);
251
252 if (!error) {
253 atomic_inc(&inode->i_count);
254 d_instantiate(dentry, inode);
255 mark_inode_dirty(inode);
256 }
257
258 return error;
259}
260
261/**
262 * gfs2_unlink - Unlink a file
263 * @dir: The inode of the directory containing the file to unlink
264 * @dentry: The file itself
265 *
266 * Unlink a file: remove the directory entry and drop the inode's link count.
267 *
268 * Returns: errno
269 */
270
271static int gfs2_unlink(struct inode *dir, struct dentry *dentry)
272{
273 struct gfs2_inode *dip = GFS2_I(dir);
274 struct gfs2_sbd *sdp = GFS2_SB(dir);
275 struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
276 struct gfs2_holder ghs[2];
277 int error;
278
279 gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
280 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1);
281
282 error = gfs2_glock_nq_m(2, ghs);
283 if (error)
284 goto out;
285
286 error = gfs2_unlink_ok(dip, &dentry->d_name, ip);
287 if (error)
288 goto out_gunlock;
289
290 error = gfs2_trans_begin(sdp, 2*RES_DINODE + RES_LEAF + RES_RG_BIT, 0);
291 if (error)
292 goto out_gunlock;
293
294 error = gfs2_dir_del(dip, &dentry->d_name);
295 if (error)
296 goto out_end_trans;
297
298 error = gfs2_change_nlink(ip, -1);
299
300out_end_trans:
301 gfs2_trans_end(sdp);
302out_gunlock:
303 gfs2_glock_dq_m(2, ghs);
304out:
305 gfs2_holder_uninit(ghs);
306 gfs2_holder_uninit(ghs + 1);
307 return error;
308}
309
310/**
311 * gfs2_symlink - Create a symlink
312 * @dir: The directory to create the symlink in
313 * @dentry: The dentry to put the symlink in
314 * @symname: The thing which the link points to
315 *
316 * Returns: errno
317 */
318
319static int gfs2_symlink(struct inode *dir, struct dentry *dentry,
320 const char *symname)
321{
322 struct gfs2_inode *dip = GFS2_I(dir), *ip;
323 struct gfs2_sbd *sdp = GFS2_SB(dir);
324 struct gfs2_holder ghs[2];
325 struct inode *inode;
326 struct buffer_head *dibh;
327 int size;
328 int error;
329
330 /* Must be stuffed with a null terminator for gfs2_follow_link() */
331 size = strlen(symname);
332 if (size > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode) - 1)
333 return -ENAMETOOLONG;
334
335 gfs2_holder_init(dip->i_gl, 0, 0, ghs);
336
337 inode = gfs2_createi(ghs, &dentry->d_name, S_IFLNK | S_IRWXUGO);
338 if (IS_ERR(inode)) {
339 gfs2_holder_uninit(ghs);
340 return PTR_ERR(inode);
341 }
342
343 ip = ghs[1].gh_gl->gl_object;
344
345 ip->i_di.di_size = size;
346
347 error = gfs2_meta_inode_buffer(ip, &dibh);
348
349 if (!gfs2_assert_withdraw(sdp, !error)) {
350 gfs2_dinode_out(&ip->i_di, dibh->b_data);
351 memcpy(dibh->b_data + sizeof(struct gfs2_dinode), symname,
352 size);
353 brelse(dibh);
354 }
355
356 gfs2_trans_end(sdp);
357 if (dip->i_alloc.al_rgd)
358 gfs2_inplace_release(dip);
359 gfs2_quota_unlock(dip);
360 gfs2_alloc_put(dip);
361
362 gfs2_glock_dq_uninit_m(2, ghs);
363
364 d_instantiate(dentry, inode);
365 mark_inode_dirty(inode);
366
367 return 0;
368}
369
370/**
371 * gfs2_mkdir - Make a directory
372 * @dir: The parent directory of the new one
373 * @dentry: The dentry of the new directory
374 * @mode: The mode of the new directory
375 *
376 * Returns: errno
377 */
378
379static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode)
380{
381 struct gfs2_inode *dip = GFS2_I(dir), *ip;
382 struct gfs2_sbd *sdp = GFS2_SB(dir);
383 struct gfs2_holder ghs[2];
384 struct inode *inode;
385 struct buffer_head *dibh;
386 int error;
387
388 gfs2_holder_init(dip->i_gl, 0, 0, ghs);
389
390 inode = gfs2_createi(ghs, &dentry->d_name, S_IFDIR | mode);
391 if (IS_ERR(inode)) {
392 gfs2_holder_uninit(ghs);
393 return PTR_ERR(inode);
394 }
395
396 ip = ghs[1].gh_gl->gl_object;
397
398 ip->i_di.di_nlink = 2;
399 ip->i_di.di_size = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode);
400 ip->i_di.di_flags |= GFS2_DIF_JDATA;
401 ip->i_di.di_payload_format = GFS2_FORMAT_DE;
402 ip->i_di.di_entries = 2;
403
404 error = gfs2_meta_inode_buffer(ip, &dibh);
405
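	/* A brand new directory is stuffed, so build its "." and ".."
	   entries by hand directly in the dinode block rather than going
	   through the normal directory-add path. */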
406 if (!gfs2_assert_withdraw(sdp, !error)) {
407 struct gfs2_dinode *di = (struct gfs2_dinode *)dibh->b_data;
408 struct gfs2_dirent *dent = (struct gfs2_dirent *)(di+1);
409 struct qstr str;
410
411 gfs2_str2qstr(&str, ".");
412 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
413 gfs2_qstr2dirent(&str, GFS2_DIRENT_SIZE(str.len), dent);
414 dent->de_inum = di->di_num; /* already GFS2 endian */
415 dent->de_type = DT_DIR;
416 di->di_entries = cpu_to_be32(1);
417
418 gfs2_str2qstr(&str, "..");
419 dent = (struct gfs2_dirent *)((char*)dent + GFS2_DIRENT_SIZE(1));
420 gfs2_qstr2dirent(&str, dibh->b_size - GFS2_DIRENT_SIZE(1) - sizeof(struct gfs2_dinode), dent);
421
422 gfs2_inum_out(&dip->i_num, (char *) &dent->de_inum);
423 dent->de_type = DT_DIR;
424
425 gfs2_dinode_out(&ip->i_di, (char *)di);
426
427 brelse(dibh);
428 }
429
430 error = gfs2_change_nlink(dip, +1);
431 gfs2_assert_withdraw(sdp, !error); /* dip already pinned */
432
433 gfs2_trans_end(sdp);
434 if (dip->i_alloc.al_rgd)
435 gfs2_inplace_release(dip);
436 gfs2_quota_unlock(dip);
437 gfs2_alloc_put(dip);
438
439 gfs2_glock_dq_uninit_m(2, ghs);
440
441 d_instantiate(dentry, inode);
442 mark_inode_dirty(inode);
443
444 return 0;
445}
446
447/**
448 * gfs2_rmdir - Remove a directory
449 * @dir: The parent directory of the directory to be removed
450 * @dentry: The dentry of the directory to remove
451 *
452 * Remove a directory. Call gfs2_rmdiri()
453 *
454 * Returns: errno
455 */
456
457static int gfs2_rmdir(struct inode *dir, struct dentry *dentry)
458{
459 struct gfs2_inode *dip = GFS2_I(dir);
460 struct gfs2_sbd *sdp = GFS2_SB(dir);
461 struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
462 struct gfs2_holder ghs[2];
463 int error;
464
465 gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
466 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1);
467
468 error = gfs2_glock_nq_m(2, ghs);
469 if (error)
470 goto out;
471
472 error = gfs2_unlink_ok(dip, &dentry->d_name, ip);
473 if (error)
474 goto out_gunlock;
475
476 if (ip->i_di.di_entries < 2) {
477 if (gfs2_consist_inode(ip))
478 gfs2_dinode_print(&ip->i_di);
479 error = -EIO;
480 goto out_gunlock;
481 }
482 if (ip->i_di.di_entries > 2) {
483 error = -ENOTEMPTY;
484 goto out_gunlock;
485 }
486
487 error = gfs2_trans_begin(sdp, 2 * RES_DINODE + 3 * RES_LEAF + RES_RG_BIT, 0);
488 if (error)
489 goto out_gunlock;
490
491 error = gfs2_rmdiri(dip, &dentry->d_name, ip);
492
493 gfs2_trans_end(sdp);
494
495 out_gunlock:
496 gfs2_glock_dq_m(2, ghs);
497
498 out:
499 gfs2_holder_uninit(ghs);
500 gfs2_holder_uninit(ghs + 1);
501
502 return error;
503}
504
505/**
506 * gfs2_mknod - Make a special file
507 * @dir: The directory in which the special file will reside
508 * @dentry: The dentry of the special file
509 * @mode: The mode of the special file
510 * @dev: The device specification of the special file
511 *
512 */
513
514static int gfs2_mknod(struct inode *dir, struct dentry *dentry, int mode,
515 dev_t dev)
516{
517 struct gfs2_inode *dip = GFS2_I(dir), *ip;
518 struct gfs2_sbd *sdp = GFS2_SB(dir);
519 struct gfs2_holder ghs[2];
520 struct inode *inode;
521 struct buffer_head *dibh;
522 uint32_t major = 0, minor = 0;
523 int error;
524
525 switch (mode & S_IFMT) {
526 case S_IFBLK:
527 case S_IFCHR:
528 major = MAJOR(dev);
529 minor = MINOR(dev);
530 break;
531 case S_IFIFO:
532 case S_IFSOCK:
533 break;
534 default:
535 return -EOPNOTSUPP;
536	}
537
538 gfs2_holder_init(dip->i_gl, 0, 0, ghs);
539
540 inode = gfs2_createi(ghs, &dentry->d_name, mode);
541 if (IS_ERR(inode)) {
542 gfs2_holder_uninit(ghs);
543 return PTR_ERR(inode);
544 }
545
546 ip = ghs[1].gh_gl->gl_object;
547
548 ip->i_di.di_major = major;
549 ip->i_di.di_minor = minor;
550
551 error = gfs2_meta_inode_buffer(ip, &dibh);
552
553 if (!gfs2_assert_withdraw(sdp, !error)) {
554 gfs2_dinode_out(&ip->i_di, dibh->b_data);
555 brelse(dibh);
556 }
557
558 gfs2_trans_end(sdp);
559 if (dip->i_alloc.al_rgd)
560 gfs2_inplace_release(dip);
561 gfs2_quota_unlock(dip);
562 gfs2_alloc_put(dip);
563
564 gfs2_glock_dq_uninit_m(2, ghs);
565
566 d_instantiate(dentry, inode);
567 mark_inode_dirty(inode);
568
569 return 0;
570}
571
572/**
573 * gfs2_rename - Rename a file
574 * @odir: Parent directory of old file name
575 * @odentry: The old dentry of the file
576 * @ndir: Parent directory of new file name
577 * @ndentry: The new dentry of the file
578 *
579 * Returns: errno
580 */
581
582static int gfs2_rename(struct inode *odir, struct dentry *odentry,
583 struct inode *ndir, struct dentry *ndentry)
584{
585 struct gfs2_inode *odip = GFS2_I(odir);
586 struct gfs2_inode *ndip = GFS2_I(ndir);
587 struct gfs2_inode *ip = GFS2_I(odentry->d_inode);
588 struct gfs2_inode *nip = NULL;
589 struct gfs2_sbd *sdp = GFS2_SB(odir);
590 struct gfs2_holder ghs[4], r_gh;
591 unsigned int num_gh;
592 int dir_rename = 0;
593 int alloc_required;
594 unsigned int x;
595 int error;
596
597 if (ndentry->d_inode) {
598 nip = GFS2_I(ndentry->d_inode);
599 if (ip == nip)
600 return 0;
601 }
602
603	/* Make sure we aren't trying to move a directory into its own subdir */
604
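	/* Cross-directory directory renames take the filesystem-wide rename
	   glock so that the ancestry check in gfs2_ok_to_move() below stays
	   valid for the whole operation. */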
605 if (S_ISDIR(ip->i_di.di_mode) && odip != ndip) {
606 dir_rename = 1;
607
608 error = gfs2_glock_nq_init(sdp->sd_rename_gl,
609 LM_ST_EXCLUSIVE, 0,
610 &r_gh);
611 if (error)
612 goto out;
613
614 error = gfs2_ok_to_move(ip, ndip);
615 if (error)
616 goto out_gunlock_r;
617 }
618
619 num_gh = 1;
620 gfs2_holder_init(odip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
621 if (odip != ndip) {
622 gfs2_holder_init(ndip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh);
623 num_gh++;
624 }
625 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh);
626 num_gh++;
627
628 if (nip) {
629 gfs2_holder_init(nip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh);
630 num_gh++;
631 }
632
633 error = gfs2_glock_nq_m(num_gh, ghs);
634 if (error)
635 goto out_uninit;
636
637 /* Check out the old directory */
638
639 error = gfs2_unlink_ok(odip, &odentry->d_name, ip);
640 if (error)
641 goto out_gunlock;
642
643 /* Check out the new directory */
644
645 if (nip) {
646 error = gfs2_unlink_ok(ndip, &ndentry->d_name, nip);
647 if (error)
648 goto out_gunlock;
649
650 if (S_ISDIR(nip->i_di.di_mode)) {
651 if (nip->i_di.di_entries < 2) {
652 if (gfs2_consist_inode(nip))
653 gfs2_dinode_print(&nip->i_di);
654 error = -EIO;
655 goto out_gunlock;
656 }
657 if (nip->i_di.di_entries > 2) {
658 error = -ENOTEMPTY;
659 goto out_gunlock;
660 }
661 }
662 } else {
663 error = permission(ndir, MAY_WRITE | MAY_EXEC, NULL);
664 if (error)
665 goto out_gunlock;
666
667 error = gfs2_dir_search(ndir, &ndentry->d_name, NULL, NULL);
668 switch (error) {
669 case -ENOENT:
670 error = 0;
671 break;
672 case 0:
673 error = -EEXIST;
674 default:
675 goto out_gunlock;
676		}
677
678 if (odip != ndip) {
679 if (!ndip->i_di.di_nlink) {
680 error = -EINVAL;
681 goto out_gunlock;
682 }
683 if (ndip->i_di.di_entries == (uint32_t)-1) {
684 error = -EFBIG;
685 goto out_gunlock;
686 }
687 if (S_ISDIR(ip->i_di.di_mode) &&
688 ndip->i_di.di_nlink == (uint32_t)-1) {
689 error = -EMLINK;
690 goto out_gunlock;
691 }
692 }
693 }
694
695 /* Check out the dir to be renamed */
696
697 if (dir_rename) {
698 error = permission(odentry->d_inode, MAY_WRITE, NULL);
699 if (error)
700 goto out_gunlock;
701 }
702
703 alloc_required = error = gfs2_diradd_alloc_required(ndir, &ndentry->d_name);
704 if (error < 0)
705 goto out_gunlock;
706 error = 0;
707
708 if (alloc_required) {
709 struct gfs2_alloc *al = gfs2_alloc_get(ndip);
710
711 error = gfs2_quota_lock(ndip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
712 if (error)
713 goto out_alloc;
714
715 error = gfs2_quota_check(ndip, ndip->i_di.di_uid,
716 ndip->i_di.di_gid);
717 if (error)
718 goto out_gunlock_q;
719
720 al->al_requested = sdp->sd_max_dirres;
721
722 error = gfs2_inplace_reserve(ndip);
723 if (error)
724 goto out_gunlock_q;
725
726 error = gfs2_trans_begin(sdp, sdp->sd_max_dirres +
727 al->al_rgd->rd_ri.ri_length +
728 4 * RES_DINODE + 4 * RES_LEAF +
729 RES_STATFS + RES_QUOTA, 0);
730 if (error)
731 goto out_ipreserv;
732 } else {
733 error = gfs2_trans_begin(sdp, 4 * RES_DINODE +
734 5 * RES_LEAF, 0);
735 if (error)
736 goto out_gunlock;
737 }
738
739 /* Remove the target file, if it exists */
740
741 if (nip) {
742 if (S_ISDIR(nip->i_di.di_mode))
743 error = gfs2_rmdiri(ndip, &ndentry->d_name, nip);
744 else {
745 error = gfs2_dir_del(ndip, &ndentry->d_name);
746 if (error)
747 goto out_end_trans;
748 error = gfs2_change_nlink(nip, -1);
749 }
750 if (error)
751 goto out_end_trans;
752 }
753
754 if (dir_rename) {
755 struct qstr name;
756 gfs2_str2qstr(&name, "..");
757
758 error = gfs2_change_nlink(ndip, +1);
759 if (error)
760 goto out_end_trans;
761 error = gfs2_change_nlink(odip, -1);
762 if (error)
763 goto out_end_trans;
764
765 error = gfs2_dir_mvino(ip, &name, &ndip->i_num, DT_DIR);
766 if (error)
767 goto out_end_trans;
768 } else {
769 struct buffer_head *dibh;
770 error = gfs2_meta_inode_buffer(ip, &dibh);
771 if (error)
772 goto out_end_trans;
773 ip->i_di.di_ctime = get_seconds();
774 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
775 gfs2_dinode_out(&ip->i_di, dibh->b_data);
776 brelse(dibh);
777 }
778
779 error = gfs2_dir_del(odip, &odentry->d_name);
780 if (error)
781 goto out_end_trans;
782
783 error = gfs2_dir_add(ndir, &ndentry->d_name, &ip->i_num,
784 IF2DT(ip->i_di.di_mode));
785 if (error)
786 goto out_end_trans;
787
788out_end_trans:
789 gfs2_trans_end(sdp);
790out_ipreserv:
791 if (alloc_required)
792 gfs2_inplace_release(ndip);
793out_gunlock_q:
794 if (alloc_required)
795 gfs2_quota_unlock(ndip);
796out_alloc:
797 if (alloc_required)
798 gfs2_alloc_put(ndip);
799out_gunlock:
800 gfs2_glock_dq_m(num_gh, ghs);
801out_uninit:
802 for (x = 0; x < num_gh; x++)
803 gfs2_holder_uninit(ghs + x);
804out_gunlock_r:
805 if (dir_rename)
806 gfs2_glock_dq_uninit(&r_gh);
807out:
808 return error;
809}
810
811/**
812 * gfs2_readlink - Read the value of a symlink
813 * @dentry: the symlink
814 * @buf: the buffer to read the symlink data into
815 * @size: the size of the buffer
816 *
817 * Returns: errno
818 */
819
820static int gfs2_readlink(struct dentry *dentry, char __user *user_buf,
821 int user_size)
822{
823 struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
824 char array[GFS2_FAST_NAME_SIZE], *buf = array;
825 unsigned int len = GFS2_FAST_NAME_SIZE;
826 int error;
827
828 error = gfs2_readlinki(ip, &buf, &len);
829 if (error)
830 return error;
831
832 if (user_size > len - 1)
833 user_size = len - 1;
834
835 if (copy_to_user(user_buf, buf, user_size))
836 error = -EFAULT;
837 else
838 error = user_size;
839
840 if (buf != array)
841 kfree(buf);
842
843 return error;
844}
845
846/**
847 * gfs2_follow_link - Follow a symbolic link
848 * @dentry: The dentry of the link
849 * @nd: Data that we pass to vfs_follow_link()
850 *
851 * This can handle symlinks of any size. It is optimised for symlinks
852 * under GFS2_FAST_NAME_SIZE.
853 *
854 * Returns: 0 on success or error code
855 */
856
857static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd)
858{
859 struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
860 char array[GFS2_FAST_NAME_SIZE], *buf = array;
861 unsigned int len = GFS2_FAST_NAME_SIZE;
862 int error;
863
864 error = gfs2_readlinki(ip, &buf, &len);
865 if (!error) {
866 error = vfs_follow_link(nd, buf);
867 if (buf != array)
868 kfree(buf);
869 }
870
871 return ERR_PTR(error);
872}
873
874/**
875 * gfs2_permission - Check an inode's access permissions
876 * @inode: The inode to check
877 * @mask: The access mask (MAY_READ, MAY_WRITE, MAY_EXEC)
878 * @nd: passed from Linux VFS, ignored by us
879 *
880 * Returns: errno
881 */
882
883static int gfs2_permission(struct inode *inode, int mask, struct nameidata *nd)
884{
885 struct gfs2_inode *ip = GFS2_I(inode);
886 struct gfs2_holder i_gh;
887 int error;
888
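	/* If the cached inode is in sync with its glock, answer from the
	   cache; otherwise take the glock shared and use the ACL checker
	   that expects the lock to already be held. */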
889 if (ip->i_vn == ip->i_gl->gl_vn)
890 return generic_permission(inode, mask, gfs2_check_acl);
891
892 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
893 if (!error) {
894 error = generic_permission(inode, mask, gfs2_check_acl_locked);
895 gfs2_glock_dq_uninit(&i_gh);
896 }
897
898 return error;
899}
900
901static int setattr_size(struct inode *inode, struct iattr *attr)
902{
903 struct gfs2_inode *ip = GFS2_I(inode);
904 int error;
905
906 if (attr->ia_size != ip->i_di.di_size) {
907 error = vmtruncate(inode, attr->ia_size);
908 if (error)
909 return error;
910 }
911
912	return gfs2_truncatei(ip, attr->ia_size);
917}
918
919static int setattr_chown(struct inode *inode, struct iattr *attr)
920{
921 struct gfs2_inode *ip = GFS2_I(inode);
922 struct gfs2_sbd *sdp = GFS2_SB(inode);
923 struct buffer_head *dibh;
924 uint32_t ouid, ogid, nuid, ngid;
925 int error;
926
927 ouid = ip->i_di.di_uid;
928 ogid = ip->i_di.di_gid;
929 nuid = attr->ia_uid;
930 ngid = attr->ia_gid;
931
932 if (!(attr->ia_valid & ATTR_UID) || ouid == nuid)
933 ouid = nuid = NO_QUOTA_CHANGE;
934 if (!(attr->ia_valid & ATTR_GID) || ogid == ngid)
935 ogid = ngid = NO_QUOTA_CHANGE;
936
937 gfs2_alloc_get(ip);
938
939 error = gfs2_quota_lock(ip, nuid, ngid);
940 if (error)
941 goto out_alloc;
942
943 if (ouid != NO_QUOTA_CHANGE || ogid != NO_QUOTA_CHANGE) {
944 error = gfs2_quota_check(ip, nuid, ngid);
945 if (error)
946 goto out_gunlock_q;
947 }
948
949 error = gfs2_trans_begin(sdp, RES_DINODE + 2 * RES_QUOTA, 0);
950 if (error)
951 goto out_gunlock_q;
952
953 error = gfs2_meta_inode_buffer(ip, &dibh);
954 if (error)
955 goto out_end_trans;
956
957 error = inode_setattr(inode, attr);
958 gfs2_assert_warn(sdp, !error);
959 gfs2_inode_attr_out(ip);
960
961 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
962 gfs2_dinode_out(&ip->i_di, dibh->b_data);
963 brelse(dibh);
964
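	/* Move this inode's block count from the old owner's quota to the
	   new owner's. */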
965 if (ouid != NO_QUOTA_CHANGE || ogid != NO_QUOTA_CHANGE) {
966 gfs2_quota_change(ip, -ip->i_di.di_blocks, ouid, ogid);
967 gfs2_quota_change(ip, ip->i_di.di_blocks, nuid, ngid);
968 }
969
970 out_end_trans:
971 gfs2_trans_end(sdp);
972
973 out_gunlock_q:
974 gfs2_quota_unlock(ip);
975
976 out_alloc:
977 gfs2_alloc_put(ip);
978
979 return error;
980}
981
982/**
983 * gfs2_setattr - Change attributes on an inode
984 * @dentry: The dentry which is changing
985 * @attr: The structure describing the change
986 *
987 * The VFS layer wants to change one or more of an inode's attributes. Write
988 * that change out to disk.
989 *
990 * Returns: errno
991 */
992
993static int gfs2_setattr(struct dentry *dentry, struct iattr *attr)
994{
995 struct inode *inode = dentry->d_inode;
996 struct gfs2_inode *ip = GFS2_I(inode);
997 struct gfs2_holder i_gh;
998 int error;
999
1000 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
1001 if (error)
1002 return error;
1003
1004 error = -EPERM;
1005 if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
1006 goto out;
1007
1008 error = inode_change_ok(inode, attr);
1009 if (error)
1010 goto out;
1011
1012 if (attr->ia_valid & ATTR_SIZE)
1013 error = setattr_size(inode, attr);
1014 else if (attr->ia_valid & (ATTR_UID | ATTR_GID))
1015 error = setattr_chown(inode, attr);
1016 else if ((attr->ia_valid & ATTR_MODE) && IS_POSIXACL(inode))
1017 error = gfs2_acl_chmod(ip, attr);
1018 else
1019 error = gfs2_setattr_simple(ip, attr);
1020
1021 out:
1022 gfs2_glock_dq_uninit(&i_gh);
1023
1024 if (!error)
1025 mark_inode_dirty(inode);
1026
1027 return error;
1028}
1029
1030/**
1031 * gfs2_getattr - Read out an inode's attributes
1032 * @mnt: ?
1033 * @dentry: The dentry to stat
1034 * @stat: The inode's stats
1035 *
1036 * Returns: errno
1037 */
1038
1039static int gfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
1040 struct kstat *stat)
1041{
1042 struct inode *inode = dentry->d_inode;
1043 struct gfs2_inode *ip = GFS2_I(inode);
1044 struct gfs2_holder gh;
1045 int error;
1046
1047 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &gh);
1048 if (!error) {
1049 generic_fillattr(inode, stat);
1050 gfs2_glock_dq_uninit(&gh);
1051 }
1052
1053 return error;
1054}
1055
1056static int gfs2_setxattr(struct dentry *dentry, const char *name,
1057 const void *data, size_t size, int flags)
1058{
1059 struct inode *inode = dentry->d_inode;
1060 struct gfs2_ea_request er;
1061
1062 memset(&er, 0, sizeof(struct gfs2_ea_request));
1063 er.er_type = gfs2_ea_name2type(name, &er.er_name);
1064 if (er.er_type == GFS2_EATYPE_UNUSED)
1065 return -EOPNOTSUPP;
1066 er.er_data = (char *)data;
1067 er.er_name_len = strlen(er.er_name);
1068 er.er_data_len = size;
1069 er.er_flags = flags;
1070
1071 gfs2_assert_warn(GFS2_SB(inode), !(er.er_flags & GFS2_ERF_MODE));
1072
1073 return gfs2_ea_set(GFS2_I(inode), &er);
1074}
1075
1076static ssize_t gfs2_getxattr(struct dentry *dentry, const char *name,
1077 void *data, size_t size)
1078{
1079 struct gfs2_ea_request er;
1080
1081 memset(&er, 0, sizeof(struct gfs2_ea_request));
1082 er.er_type = gfs2_ea_name2type(name, &er.er_name);
1083 if (er.er_type == GFS2_EATYPE_UNUSED)
1084 return -EOPNOTSUPP;
1085 er.er_data = data;
1086 er.er_name_len = strlen(er.er_name);
1087 er.er_data_len = size;
1088
1089 return gfs2_ea_get(GFS2_I(dentry->d_inode), &er);
1090}
1091
1092static ssize_t gfs2_listxattr(struct dentry *dentry, char *buffer, size_t size)
1093{
1094 struct gfs2_ea_request er;
1095
1096 memset(&er, 0, sizeof(struct gfs2_ea_request));
1097 er.er_data = (size) ? buffer : NULL;
1098 er.er_data_len = size;
1099
1100 return gfs2_ea_list(GFS2_I(dentry->d_inode), &er);
1101}
1102
1103static int gfs2_removexattr(struct dentry *dentry, const char *name)
1104{
1105 struct gfs2_ea_request er;
1106
1107 memset(&er, 0, sizeof(struct gfs2_ea_request));
1108 er.er_type = gfs2_ea_name2type(name, &er.er_name);
1109 if (er.er_type == GFS2_EATYPE_UNUSED)
1110 return -EOPNOTSUPP;
1111 er.er_name_len = strlen(er.er_name);
1112
1113 return gfs2_ea_remove(GFS2_I(dentry->d_inode), &er);
1114}
1115
1116struct inode_operations gfs2_file_iops = {
1117 .permission = gfs2_permission,
1118 .setattr = gfs2_setattr,
1119 .getattr = gfs2_getattr,
1120 .setxattr = gfs2_setxattr,
1121 .getxattr = gfs2_getxattr,
1122 .listxattr = gfs2_listxattr,
1123 .removexattr = gfs2_removexattr,
1124};
1125
1126struct inode_operations gfs2_dev_iops = {
1127 .permission = gfs2_permission,
1128 .setattr = gfs2_setattr,
1129 .getattr = gfs2_getattr,
1130 .setxattr = gfs2_setxattr,
1131 .getxattr = gfs2_getxattr,
1132 .listxattr = gfs2_listxattr,
1133 .removexattr = gfs2_removexattr,
1134};
1135
1136struct inode_operations gfs2_dir_iops = {
1137 .create = gfs2_create,
1138 .lookup = gfs2_lookup,
1139 .link = gfs2_link,
1140 .unlink = gfs2_unlink,
1141 .symlink = gfs2_symlink,
1142 .mkdir = gfs2_mkdir,
1143 .rmdir = gfs2_rmdir,
1144 .mknod = gfs2_mknod,
1145 .rename = gfs2_rename,
1146 .permission = gfs2_permission,
1147 .setattr = gfs2_setattr,
1148 .getattr = gfs2_getattr,
1149 .setxattr = gfs2_setxattr,
1150 .getxattr = gfs2_getxattr,
1151 .listxattr = gfs2_listxattr,
1152 .removexattr = gfs2_removexattr,
1153};
1154
1155struct inode_operations gfs2_symlink_iops = {
1156 .readlink = gfs2_readlink,
1157 .follow_link = gfs2_follow_link,
1158 .permission = gfs2_permission,
1159 .setattr = gfs2_setattr,
1160 .getattr = gfs2_getattr,
1161 .setxattr = gfs2_setxattr,
1162 .getxattr = gfs2_getxattr,
1163 .listxattr = gfs2_listxattr,
1164 .removexattr = gfs2_removexattr,
1165};
1166
diff --git a/fs/gfs2/ops_inode.h b/fs/gfs2/ops_inode.h
new file mode 100644
index 000000000000..930aaae91377
--- /dev/null
+++ b/fs/gfs2/ops_inode.h
@@ -0,0 +1,18 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __OPS_INODE_DOT_H__
11#define __OPS_INODE_DOT_H__
12
13extern struct inode_operations gfs2_file_iops;
14extern struct inode_operations gfs2_dir_iops;
15extern struct inode_operations gfs2_symlink_iops;
16extern struct inode_operations gfs2_dev_iops;
17
18#endif /* __OPS_INODE_DOT_H__ */
diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c
new file mode 100644
index 000000000000..50ea7f21f9cf
--- /dev/null
+++ b/fs/gfs2/ops_super.c
@@ -0,0 +1,472 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/statfs.h>
16#include <linux/vmalloc.h>
17#include <linux/seq_file.h>
18#include <linux/mount.h>
19#include <linux/kthread.h>
20#include <linux/delay.h>
21#include <linux/gfs2_ondisk.h>
22#include <linux/crc32.h>
23
24#include "gfs2.h"
25#include "lm_interface.h"
26#include "incore.h"
27#include "glock.h"
28#include "inode.h"
29#include "lm.h"
30#include "log.h"
31#include "mount.h"
32#include "ops_super.h"
33#include "page.h"
34#include "quota.h"
35#include "recovery.h"
36#include "rgrp.h"
37#include "super.h"
38#include "sys.h"
39#include "util.h"
40#include "trans.h"
41#include "dir.h"
42#include "eattr.h"
43#include "bmap.h"
44
45/**
46 * gfs2_write_inode - Make sure the inode is stable on the disk
47 * @inode: The inode
48 * @sync: synchronous write flag
49 *
50 * Returns: errno
51 */
52
53static int gfs2_write_inode(struct inode *inode, int sync)
54{
55 struct gfs2_inode *ip = GFS2_I(inode);
56
57 /* Check this is a "normal" inode */
58 if (inode->u.generic_ip) {
59 if (current->flags & PF_MEMALLOC)
60 return 0;
61 if (sync)
62 gfs2_log_flush(GFS2_SB(inode), ip->i_gl);
63 }
64
65 return 0;
66}
67
68/**
69 * gfs2_put_super - Unmount the filesystem
70 * @sb: The VFS superblock
71 *
72 */
73
74static void gfs2_put_super(struct super_block *sb)
75{
76 struct gfs2_sbd *sdp = sb->s_fs_info;
77 int error;
78
79 if (!sdp)
80 return;
81
82 /* Unfreeze the filesystem, if we need to */
83
84 mutex_lock(&sdp->sd_freeze_lock);
85 if (sdp->sd_freeze_count)
86 gfs2_glock_dq_uninit(&sdp->sd_freeze_gh);
87 mutex_unlock(&sdp->sd_freeze_lock);
88
89 kthread_stop(sdp->sd_quotad_process);
90 kthread_stop(sdp->sd_logd_process);
91 kthread_stop(sdp->sd_recoverd_process);
92 while (sdp->sd_glockd_num--)
93 kthread_stop(sdp->sd_glockd_process[sdp->sd_glockd_num]);
94 kthread_stop(sdp->sd_scand_process);
95
96 if (!(sb->s_flags & MS_RDONLY)) {
97 error = gfs2_make_fs_ro(sdp);
98 if (error)
99 gfs2_io_error(sdp);
100 }
101 /* At this point, we're through modifying the disk */
102
103 /* Release stuff */
104
105 iput(sdp->sd_master_dir);
106 iput(sdp->sd_jindex);
107 iput(sdp->sd_inum_inode);
108 iput(sdp->sd_statfs_inode);
109 iput(sdp->sd_rindex);
110 iput(sdp->sd_quota_inode);
111
112 gfs2_glock_put(sdp->sd_rename_gl);
113 gfs2_glock_put(sdp->sd_trans_gl);
114
115 if (!sdp->sd_args.ar_spectator) {
116 gfs2_glock_dq_uninit(&sdp->sd_journal_gh);
117 gfs2_glock_dq_uninit(&sdp->sd_jinode_gh);
118 gfs2_glock_dq_uninit(&sdp->sd_ir_gh);
119 gfs2_glock_dq_uninit(&sdp->sd_sc_gh);
120 gfs2_glock_dq_uninit(&sdp->sd_qc_gh);
121 iput(sdp->sd_ir_inode);
122 iput(sdp->sd_sc_inode);
123 iput(sdp->sd_qc_inode);
124 }
125
126 gfs2_glock_dq_uninit(&sdp->sd_live_gh);
127 gfs2_clear_rgrpd(sdp);
128 gfs2_jindex_free(sdp);
129 /* Take apart glock structures and buffer lists */
130 gfs2_gl_hash_clear(sdp, WAIT);
131 /* Unmount the locking protocol */
132 gfs2_lm_unmount(sdp);
133
134 /* At this point, we're through participating in the lockspace */
135 gfs2_sys_fs_del(sdp);
136 vfree(sdp);
137 sb->s_fs_info = NULL;
138}
139
140/**
141 * gfs2_write_super - disk commit all incore transactions
142 * @sb: the filesystem
143 *
144 * This function is called every time sync(2) is called.
145 * After this exits, all dirty buffers are synced.
146 */
147
148static void gfs2_write_super(struct super_block *sb)
149{
150 struct gfs2_sbd *sdp = sb->s_fs_info;
151 gfs2_log_flush(sdp, NULL);
152}
153
154/**
155 * gfs2_write_super_lockfs - prevent further writes to the filesystem
156 * @sb: the VFS structure for the filesystem
157 *
158 */
159
160static void gfs2_write_super_lockfs(struct super_block *sb)
161{
162 struct gfs2_sbd *sdp = sb->s_fs_info;
163 int error;
164
165 for (;;) {
166 error = gfs2_freeze_fs(sdp);
167 if (!error)
168 break;
169
170 switch (error) {
171 case -EBUSY:
172 fs_err(sdp, "waiting for recovery before freeze\n");
173 break;
174
175 default:
176 fs_err(sdp, "error freezing FS: %d\n", error);
177 break;
178 }
179
180 fs_err(sdp, "retrying...\n");
181 msleep(1000);
182 }
183}
184
185/**
186 * gfs2_unlockfs - reallow writes to the filesystem
187 * @sb: the VFS structure for the filesystem
188 *
189 */
190
191static void gfs2_unlockfs(struct super_block *sb)
192{
193 struct gfs2_sbd *sdp = sb->s_fs_info;
194 gfs2_unfreeze_fs(sdp);
195}
196
197/**
198 * gfs2_statfs - Gather and return stats about the filesystem
199 * @dentry: The dentry of a file in the filesystem being queried
200 * @buf: The kstatfs buffer to fill in
201 *
202 * Returns: 0 on success or error code
203 */
204
205static int gfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
206{
207 struct super_block *sb = dentry->d_inode->i_sb;
208 struct gfs2_sbd *sdp = sb->s_fs_info;
209 struct gfs2_statfs_change sc;
210 int error;
211
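	/* gt_statfs_slow trades speed for accuracy: recompute the figures
	   rather than trusting the locally maintained statfs counters. */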
212 if (gfs2_tune_get(sdp, gt_statfs_slow))
213 error = gfs2_statfs_slow(sdp, &sc);
214 else
215 error = gfs2_statfs_i(sdp, &sc);
216
217 if (error)
218 return error;
219
220 memset(buf, 0, sizeof(struct kstatfs));
221
222 buf->f_type = GFS2_MAGIC;
223 buf->f_bsize = sdp->sd_sb.sb_bsize;
224 buf->f_blocks = sc.sc_total;
225 buf->f_bfree = sc.sc_free;
226 buf->f_bavail = sc.sc_free;
227 buf->f_files = sc.sc_dinodes + sc.sc_free;
228 buf->f_ffree = sc.sc_free;
229 buf->f_namelen = GFS2_FNAMESIZE;
230
231 return 0;
232}
233
234/**
235 * gfs2_remount_fs - called when the FS is remounted
236 * @sb: the filesystem
237 * @flags: the remount flags
238 * @data: extra data passed in (not used right now)
239 *
240 * Returns: errno
241 */
242
243static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
244{
245 struct gfs2_sbd *sdp = sb->s_fs_info;
246 int error;
247
248 error = gfs2_mount_args(sdp, data, 1);
249 if (error)
250 return error;
251
252 if (sdp->sd_args.ar_spectator)
253 *flags |= MS_RDONLY;
254 else {
255 if (*flags & MS_RDONLY) {
256 if (!(sb->s_flags & MS_RDONLY))
257 error = gfs2_make_fs_ro(sdp);
258 } else if (!(*flags & MS_RDONLY) &&
259 (sb->s_flags & MS_RDONLY)) {
260 error = gfs2_make_fs_rw(sdp);
261 }
262 }
263
264 if (*flags & (MS_NOATIME | MS_NODIRATIME))
265 set_bit(SDF_NOATIME, &sdp->sd_flags);
266 else
267 clear_bit(SDF_NOATIME, &sdp->sd_flags);
268
269 /* Don't let the VFS update atimes. GFS2 handles this itself. */
270 *flags |= MS_NOATIME | MS_NODIRATIME;
271
272 return error;
273}
274
275/**
276 * gfs2_clear_inode - Deallocate an inode when VFS is done with it
277 * @inode: The VFS inode
278 *
279 */
280
281static void gfs2_clear_inode(struct inode *inode)
282{
283	/* This tells us it's a "real" inode and not one which only
284 * serves to contain an address space (see rgrp.c, meta_io.c)
285 * which therefore doesn't have its own glocks.
286 */
287 if (inode->u.generic_ip) {
288 struct gfs2_inode *ip = GFS2_I(inode);
289 gfs2_glock_inode_squish(inode);
290 gfs2_assert(inode->i_sb->s_fs_info, ip->i_gl->gl_state == LM_ST_UNLOCKED);
291 ip->i_gl->gl_object = NULL;
292 gfs2_glock_schedule_for_reclaim(ip->i_gl);
293 gfs2_glock_put(ip->i_gl);
294 ip->i_gl = NULL;
295 if (ip->i_iopen_gh.gh_gl)
296 gfs2_glock_dq_uninit(&ip->i_iopen_gh);
297 }
298}
299
300/**
301 * gfs2_show_options - Show mount options for /proc/mounts
302 * @s: seq_file structure
303 * @mnt: vfsmount
304 *
305 * Returns: 0 on success or error code
306 */
307
308static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
309{
310 struct gfs2_sbd *sdp = mnt->mnt_sb->s_fs_info;
311 struct gfs2_args *args = &sdp->sd_args;
312
313 if (args->ar_lockproto[0])
314 seq_printf(s, ",lockproto=%s", args->ar_lockproto);
315 if (args->ar_locktable[0])
316 seq_printf(s, ",locktable=%s", args->ar_locktable);
317 if (args->ar_hostdata[0])
318 seq_printf(s, ",hostdata=%s", args->ar_hostdata);
319 if (args->ar_spectator)
320 seq_printf(s, ",spectator");
321 if (args->ar_ignore_local_fs)
322 seq_printf(s, ",ignore_local_fs");
323 if (args->ar_localflocks)
324 seq_printf(s, ",localflocks");
325 if (args->ar_localcaching)
326 seq_printf(s, ",localcaching");
327 if (args->ar_debug)
328 seq_printf(s, ",debug");
329 if (args->ar_upgrade)
330 seq_printf(s, ",upgrade");
331 if (args->ar_num_glockd != GFS2_GLOCKD_DEFAULT)
332 seq_printf(s, ",num_glockd=%u", args->ar_num_glockd);
333 if (args->ar_posix_acl)
334 seq_printf(s, ",acl");
335 if (args->ar_quota != GFS2_QUOTA_DEFAULT) {
336 char *state;
337 switch (args->ar_quota) {
338 case GFS2_QUOTA_OFF:
339 state = "off";
340 break;
341 case GFS2_QUOTA_ACCOUNT:
342 state = "account";
343 break;
344 case GFS2_QUOTA_ON:
345 state = "on";
346 break;
347 default:
348 state = "unknown";
349 break;
350 }
351 seq_printf(s, ",quota=%s", state);
352 }
353 if (args->ar_suiddir)
354 seq_printf(s, ",suiddir");
355 if (args->ar_data != GFS2_DATA_DEFAULT) {
356 char *state;
357 switch (args->ar_data) {
358 case GFS2_DATA_WRITEBACK:
359 state = "writeback";
360 break;
361 case GFS2_DATA_ORDERED:
362 state = "ordered";
363 break;
364 default:
365 state = "unknown";
366 break;
367 }
368 seq_printf(s, ",data=%s", state);
369 }
370
371 return 0;
372}
373
374/*
375 * We have to (at the moment) hold the inodes main lock to cover
376 * the gap between unlocking the shared lock on the iopen lock and
377 * taking the exclusive lock. I'd rather do a shared -> exclusive
378 * conversion on the iopen lock, but we can change that later. This
379 * is safe, just less efficient.
380 */
381static void gfs2_delete_inode(struct inode *inode)
382{
383 struct gfs2_sbd *sdp = inode->i_sb->s_fs_info;
384 struct gfs2_inode *ip = GFS2_I(inode);
385 struct gfs2_holder gh;
386 int error;
387
388 if (!inode->u.generic_ip)
389 goto out;
390
391 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &gh);
392 if (unlikely(error)) {
393 gfs2_glock_dq_uninit(&ip->i_iopen_gh);
394 goto out;
395 }
396
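	/* Promote the iopen glock from shared to exclusive.  The try-lock
	   fails while another node still holds the inode open, in which
	   case deallocation is left to whoever drops the last reference. */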
397 gfs2_glock_dq(&ip->i_iopen_gh);
398 gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &ip->i_iopen_gh);
399 error = gfs2_glock_nq(&ip->i_iopen_gh);
400 if (error)
401 goto out_uninit;
402
403 if (S_ISDIR(ip->i_di.di_mode) &&
404 (ip->i_di.di_flags & GFS2_DIF_EXHASH)) {
405 error = gfs2_dir_exhash_dealloc(ip);
406 if (error)
407 goto out_unlock;
408 }
409
410 if (ip->i_di.di_eattr) {
411 error = gfs2_ea_dealloc(ip);
412 if (error)
413 goto out_unlock;
414 }
415
416 if (!gfs2_is_stuffed(ip)) {
417 error = gfs2_file_dealloc(ip);
418 if (error)
419 goto out_unlock;
420 }
421
422 error = gfs2_dinode_dealloc(ip);
423
424out_unlock:
425 gfs2_glock_dq(&ip->i_iopen_gh);
426out_uninit:
427 gfs2_holder_uninit(&ip->i_iopen_gh);
428 gfs2_glock_dq_uninit(&gh);
429 if (error)
430 fs_warn(sdp, "gfs2_delete_inode: %d\n", error);
431out:
432 truncate_inode_pages(&inode->i_data, 0);
433 clear_inode(inode);
434}
435
436
437
438static struct inode *gfs2_alloc_inode(struct super_block *sb)
439{
440 struct gfs2_sbd *sdp = sb->s_fs_info;
441 struct gfs2_inode *ip;
442
443 ip = kmem_cache_alloc(gfs2_inode_cachep, GFP_KERNEL);
444 if (ip) {
445 ip->i_flags = 0;
446 ip->i_gl = NULL;
447 ip->i_greedy = gfs2_tune_get(sdp, gt_greedy_default);
448 ip->i_last_pfault = jiffies;
449 }
450 return &ip->i_inode;
451}
452
453static void gfs2_destroy_inode(struct inode *inode)
454{
455 kmem_cache_free(gfs2_inode_cachep, inode);
456}
457
458struct super_operations gfs2_super_ops = {
459 .alloc_inode = gfs2_alloc_inode,
460 .destroy_inode = gfs2_destroy_inode,
461 .write_inode = gfs2_write_inode,
462 .delete_inode = gfs2_delete_inode,
463 .put_super = gfs2_put_super,
464 .write_super = gfs2_write_super,
465 .write_super_lockfs = gfs2_write_super_lockfs,
466 .unlockfs = gfs2_unlockfs,
467 .statfs = gfs2_statfs,
468 .remount_fs = gfs2_remount_fs,
469 .clear_inode = gfs2_clear_inode,
470 .show_options = gfs2_show_options,
471};
472
diff --git a/fs/gfs2/ops_super.h b/fs/gfs2/ops_super.h
new file mode 100644
index 000000000000..a15ccc276113
--- /dev/null
+++ b/fs/gfs2/ops_super.h
@@ -0,0 +1,15 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __OPS_SUPER_DOT_H__
11#define __OPS_SUPER_DOT_H__
12
13extern struct super_operations gfs2_super_ops;
14
15#endif /* __OPS_SUPER_DOT_H__ */
diff --git a/fs/gfs2/ops_vm.c b/fs/gfs2/ops_vm.c
new file mode 100644
index 000000000000..08709f19ea98
--- /dev/null
+++ b/fs/gfs2/ops_vm.c
@@ -0,0 +1,195 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/mm.h>
16#include <linux/pagemap.h>
17#include <linux/gfs2_ondisk.h>
18
19#include "gfs2.h"
20#include "lm_interface.h"
21#include "incore.h"
22#include "bmap.h"
23#include "glock.h"
24#include "inode.h"
25#include "ops_vm.h"
26#include "page.h"
27#include "quota.h"
28#include "rgrp.h"
29#include "trans.h"
30#include "util.h"
31
32static void pfault_be_greedy(struct gfs2_inode *ip)
33{
34 unsigned int time;
35
36 spin_lock(&ip->i_spin);
37 time = ip->i_greedy;
38 ip->i_last_pfault = jiffies;
39 spin_unlock(&ip->i_spin);
40
41 igrab(&ip->i_inode);
42 if (gfs2_glock_be_greedy(ip->i_gl, time))
43 iput(&ip->i_inode);
44}
45
46static struct page *gfs2_private_nopage(struct vm_area_struct *area,
47 unsigned long address, int *type)
48{
49 struct gfs2_inode *ip = GFS2_I(area->vm_file->f_mapping->host);
50 struct gfs2_holder i_gh;
51 struct page *result;
52 int error;
53
54 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &i_gh);
55 if (error)
56 return NULL;
57
58 set_bit(GIF_PAGED, &ip->i_flags);
59
60 result = filemap_nopage(area, address, type);
61
62 if (result && result != NOPAGE_OOM)
63 pfault_be_greedy(ip);
64
65 gfs2_glock_dq_uninit(&i_gh);
66
67 return result;
68}
69
70static int alloc_page_backing(struct gfs2_inode *ip, struct page *page)
71{
72 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
73 unsigned long index = page->index;
74 uint64_t lblock = index << (PAGE_CACHE_SHIFT -
75 sdp->sd_sb.sb_bsize_shift);
76 unsigned int blocks = PAGE_CACHE_SIZE >> sdp->sd_sb.sb_bsize_shift;
77 struct gfs2_alloc *al;
78 unsigned int data_blocks, ind_blocks;
79 unsigned int x;
80 int error;
81
82 al = gfs2_alloc_get(ip);
83
84 error = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
85 if (error)
86 goto out;
87
88 error = gfs2_quota_check(ip, ip->i_di.di_uid, ip->i_di.di_gid);
89 if (error)
90 goto out_gunlock_q;
91
92 gfs2_write_calc_reserv(ip, PAGE_CACHE_SIZE, &data_blocks, &ind_blocks);
93
94 al->al_requested = data_blocks + ind_blocks;
95
96 error = gfs2_inplace_reserve(ip);
97 if (error)
98 goto out_gunlock_q;
99
100 error = gfs2_trans_begin(sdp, al->al_rgd->rd_ri.ri_length +
101 ind_blocks + RES_DINODE +
102 RES_STATFS + RES_QUOTA, 0);
103 if (error)
104 goto out_ipres;
105
106 if (gfs2_is_stuffed(ip)) {
107 error = gfs2_unstuff_dinode(ip, gfs2_unstuffer_page, NULL);
108 if (error)
109 goto out_trans;
110 }
111
112 for (x = 0; x < blocks; ) {
113 uint64_t dblock;
114 unsigned int extlen;
115 int new = 1;
116
117 error = gfs2_extent_map(&ip->i_inode, lblock, &new, &dblock, &extlen);
118 if (error)
119 goto out_trans;
120
121 lblock += extlen;
122 x += extlen;
123 }
124
125 gfs2_assert_warn(sdp, al->al_alloced);
126
127 out_trans:
128 gfs2_trans_end(sdp);
129
130 out_ipres:
131 gfs2_inplace_release(ip);
132
133 out_gunlock_q:
134 gfs2_quota_unlock(ip);
135
136 out:
137 gfs2_alloc_put(ip);
138
139 return error;
140}
141
142static struct page *gfs2_sharewrite_nopage(struct vm_area_struct *area,
143 unsigned long address, int *type)
144{
145 struct gfs2_inode *ip = GFS2_I(area->vm_file->f_mapping->host);
146 struct gfs2_holder i_gh;
147 struct page *result = NULL;
148 unsigned long index = ((address - area->vm_start) >> PAGE_CACHE_SHIFT) +
149 area->vm_pgoff;
150 int alloc_required;
151 int error;
152
153 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
154 if (error)
155 return NULL;
156
157 set_bit(GIF_PAGED, &ip->i_flags);
158 set_bit(GIF_SW_PAGED, &ip->i_flags);
159
160 error = gfs2_write_alloc_required(ip,
161 (uint64_t)index << PAGE_CACHE_SHIFT,
162 PAGE_CACHE_SIZE, &alloc_required);
163 if (error)
164 goto out;
165
166 result = filemap_nopage(area, address, type);
167 if (!result || result == NOPAGE_OOM)
168 goto out;
169
170 if (alloc_required) {
171 error = alloc_page_backing(ip, result);
172 if (error) {
173 page_cache_release(result);
174 result = NULL;
175 goto out;
176 }
177 set_page_dirty(result);
178 }
179
180 pfault_be_greedy(ip);
181
182 out:
183 gfs2_glock_dq_uninit(&i_gh);
184
185 return result;
186}
187
188struct vm_operations_struct gfs2_vm_ops_private = {
189 .nopage = gfs2_private_nopage,
190};
191
192struct vm_operations_struct gfs2_vm_ops_sharewrite = {
193 .nopage = gfs2_sharewrite_nopage,
194};
195
diff --git a/fs/gfs2/ops_vm.h b/fs/gfs2/ops_vm.h
new file mode 100644
index 000000000000..077cffcd4085
--- /dev/null
+++ b/fs/gfs2/ops_vm.h
@@ -0,0 +1,16 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __OPS_VM_DOT_H__
11#define __OPS_VM_DOT_H__
12
13extern struct vm_operations_struct gfs2_vm_ops_private;
14extern struct vm_operations_struct gfs2_vm_ops_sharewrite;
15
16#endif /* __OPS_VM_DOT_H__ */
diff --git a/fs/gfs2/page.c b/fs/gfs2/page.c
new file mode 100644
index 000000000000..b93caf294b9f
--- /dev/null
+++ b/fs/gfs2/page.c
@@ -0,0 +1,267 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/pagemap.h>
16#include <linux/mm.h>
17#include <linux/gfs2_ondisk.h>
18
19#include "gfs2.h"
20#include "lm_interface.h"
21#include "incore.h"
22#include "bmap.h"
23#include "inode.h"
24#include "page.h"
25#include "trans.h"
26#include "ops_address.h"
27#include "util.h"
28
29/**
30 * gfs2_pte_inval - Sync and invalidate all PTEs associated with a glock
31 * @gl: the glock
32 *
33 */
34
35void gfs2_pte_inval(struct gfs2_glock *gl)
36{
37 struct gfs2_inode *ip;
38 struct inode *inode;
39
40 ip = gl->gl_object;
41 inode = &ip->i_inode;
42 if (!ip || !S_ISREG(ip->i_di.di_mode))
43 return;
44
45 if (!test_bit(GIF_PAGED, &ip->i_flags))
46 return;
47
48 unmap_shared_mapping_range(inode->i_mapping, 0, 0);
49
50 if (test_bit(GIF_SW_PAGED, &ip->i_flags))
51 set_bit(GLF_DIRTY, &gl->gl_flags);
52
53 clear_bit(GIF_SW_PAGED, &ip->i_flags);
54}
55
56/**
57 * gfs2_page_inval - Invalidate all pages associated with a glock
58 * @gl: the glock
59 *
60 */
61
62void gfs2_page_inval(struct gfs2_glock *gl)
63{
64 struct gfs2_inode *ip;
65 struct inode *inode;
66
67 ip = gl->gl_object;
68 inode = &ip->i_inode;
69 if (!ip || !S_ISREG(ip->i_di.di_mode))
70 return;
71
72 truncate_inode_pages(inode->i_mapping, 0);
73 gfs2_assert_withdraw(GFS2_SB(&ip->i_inode), !inode->i_mapping->nrpages);
74 clear_bit(GIF_PAGED, &ip->i_flags);
75}
76
77/**
78 * gfs2_page_sync - Sync the data pages (not metadata) associated with a glock
79 * @gl: the glock
80 * @flags: DIO_START | DIO_WAIT
81 *
82 * Syncs data (not metadata) for a regular file.
83 * No-op for all other types.
84 */
85
86void gfs2_page_sync(struct gfs2_glock *gl, int flags)
87{
88 struct gfs2_inode *ip;
89 struct inode *inode;
90 struct address_space *mapping;
91 int error = 0;
92
93 ip = gl->gl_object;
94 inode = &ip->i_inode;
95 if (!ip || !S_ISREG(ip->i_di.di_mode))
96 return;
97
98 mapping = inode->i_mapping;
99
100 if (flags & DIO_START)
101 filemap_fdatawrite(mapping);
102 if (!error && (flags & DIO_WAIT))
103 error = filemap_fdatawait(mapping);
104
105 /* Put back any errors cleared by filemap_fdatawait()
106 so they can be caught by someone who can pass them
107 up to user space. */
108
109 if (error == -ENOSPC)
110 set_bit(AS_ENOSPC, &mapping->flags);
111 else if (error)
112 set_bit(AS_EIO, &mapping->flags);
113
114}
115
116/**
117 * gfs2_unstuffer_page - unstuff a stuffed inode into a block cached by a page
118 * @ip: the inode
119 * @dibh: the dinode buffer
120 * @block: the block number that was allocated
121 * @private: any locked page held by the caller process
122 *
123 * Returns: errno
124 */
125
126int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
127 uint64_t block, void *private)
128{
129 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
130 struct inode *inode = &ip->i_inode;
131 struct page *page = (struct page *)private;
132 struct buffer_head *bh;
133 int release = 0;
134
135 if (!page || page->index) {
136 page = grab_cache_page(inode->i_mapping, 0);
137 if (!page)
138 return -ENOMEM;
139 release = 1;
140 }
141
142 if (!PageUptodate(page)) {
143 void *kaddr = kmap(page);
144
145 memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode),
146 ip->i_di.di_size);
147 memset(kaddr + ip->i_di.di_size, 0,
148 PAGE_CACHE_SIZE - ip->i_di.di_size);
149 kunmap(page);
150
151 SetPageUptodate(page);
152 }
153
154 if (!page_has_buffers(page))
155 create_empty_buffers(page, 1 << inode->i_blkbits,
156 (1 << BH_Uptodate));
157
158 bh = page_buffers(page);
159
160 if (!buffer_mapped(bh))
161 map_bh(bh, inode->i_sb, block);
162
163 set_buffer_uptodate(bh);
164 if ((sdp->sd_args.ar_data == GFS2_DATA_ORDERED) || gfs2_is_jdata(ip))
165 gfs2_trans_add_bh(ip->i_gl, bh, 0);
166 mark_buffer_dirty(bh);
167
168 if (release) {
169 unlock_page(page);
170 page_cache_release(page);
171 }
172
173 return 0;
174}
175
176/**
177 * gfs2_block_truncate_page - Deal with zeroing out data for truncate
178 *
179 * This is partly borrowed from ext3.
180 */
181int gfs2_block_truncate_page(struct address_space *mapping)
182{
183 struct inode *inode = mapping->host;
184 struct gfs2_inode *ip = GFS2_I(inode);
185 struct gfs2_sbd *sdp = GFS2_SB(inode);
186 loff_t from = inode->i_size;
187 unsigned long index = from >> PAGE_CACHE_SHIFT;
188 unsigned offset = from & (PAGE_CACHE_SIZE-1);
189 unsigned blocksize, iblock, length, pos;
190 struct buffer_head *bh;
191 struct page *page;
192 void *kaddr;
193 int err;
194
195 page = grab_cache_page(mapping, index);
196 if (!page)
197 return 0;
198
199 blocksize = inode->i_sb->s_blocksize;
200 length = blocksize - (offset & (blocksize - 1));
201 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
202
203 if (!page_has_buffers(page))
204 create_empty_buffers(page, blocksize, 0);
205
206 /* Find the buffer that contains "offset" */
207 bh = page_buffers(page);
208 pos = blocksize;
209 while (offset >= pos) {
210 bh = bh->b_this_page;
211 iblock++;
212 pos += blocksize;
213 }
214
215 err = 0;
216
217 if (!buffer_mapped(bh)) {
218 gfs2_get_block(inode, iblock, bh, 0);
219 /* unmapped? It's a hole - nothing to do */
220 if (!buffer_mapped(bh))
221 goto unlock;
222 }
223
224 /* Ok, it's mapped. Make sure it's up-to-date */
225 if (PageUptodate(page))
226 set_buffer_uptodate(bh);
227
228 if (!buffer_uptodate(bh)) {
229 err = -EIO;
230 ll_rw_block(READ, 1, &bh);
231 wait_on_buffer(bh);
232 /* Uhhuh. Read error. Complain and punt. */
233 if (!buffer_uptodate(bh))
234 goto unlock;
235 }
236
237 if (sdp->sd_args.ar_data == GFS2_DATA_ORDERED || gfs2_is_jdata(ip))
238 gfs2_trans_add_bh(ip->i_gl, bh, 0);
239
240 kaddr = kmap_atomic(page, KM_USER0);
241 memset(kaddr + offset, 0, length);
242 flush_dcache_page(page);
243 kunmap_atomic(kaddr, KM_USER0);
244
245unlock:
246 unlock_page(page);
247 page_cache_release(page);
248 return err;
249}
250
251void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page,
252 unsigned int from, unsigned int to)
253{
254 struct buffer_head *head = page_buffers(page);
255 unsigned int bsize = head->b_size;
256 struct buffer_head *bh;
257 unsigned int start, end;
258
259 for (bh = head, start = 0; bh != head || !start;
260 bh = bh->b_this_page, start = end) {
261 end = start + bsize;
262 if (end <= from || start >= to)
263 continue;
264 gfs2_trans_add_bh(ip->i_gl, bh, 0);
265 }
266}
267
diff --git a/fs/gfs2/page.h b/fs/gfs2/page.h
new file mode 100644
index 000000000000..2c853a90ac04
--- /dev/null
+++ b/fs/gfs2/page.h
@@ -0,0 +1,23 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __PAGE_DOT_H__
11#define __PAGE_DOT_H__
12
13void gfs2_pte_inval(struct gfs2_glock *gl);
14void gfs2_page_inval(struct gfs2_glock *gl);
15void gfs2_page_sync(struct gfs2_glock *gl, int flags);
16
17int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
18 uint64_t block, void *private);
19int gfs2_block_truncate_page(struct address_space *mapping);
20void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page,
21 unsigned int from, unsigned int to);
22
23#endif /* __PAGE_DOT_H__ */
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
new file mode 100644
index 000000000000..3ca65c37c354
--- /dev/null
+++ b/fs/gfs2/quota.c
@@ -0,0 +1,1286 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10/*
11 * Quota change tags are associated with each transaction that allocates or
12 * deallocates space. Those changes are accumulated locally to each node (in a
13 * per-node file) and then are periodically synced to the quota file. This
14 * avoids the bottleneck of constantly touching the quota file, but introduces
15 * fuzziness in the current usage value of IDs that are being used on different
16 * nodes in the cluster simultaneously. So, it is possible for a user on
17 * multiple nodes to overrun their quota, but that overrun is controllable.
18 * Since quota tags are part of transactions, there is no need for a quota check
19 * program to be run after node crashes or anything like that.
20 *
21 * There are a couple of knobs that let the administrator manage the quota
22 * fuzziness. "quota_quantum" sets the maximum time a quota change can be
23 * sitting on one node before being synced to the quota file. (The default is
24 * 60 seconds.) Another knob, "quota_scale", controls how quickly the frequency
25 * of quota file syncs increases as the user moves closer to their limit. The
26 * more frequent the syncs, the more accurate the quota enforcement, but that
27 * means that there is more contention between the nodes for the quota file.
28 * The default value is one. This sets the maximum theoretical quota overrun
29 * (with an infinite number of nodes and infinite bandwidth) to twice the user's limit. (In
30 * practice, the maximum overrun you see should be much less.) A "quota_scale"
31 * number greater than one makes quota syncs more frequent and reduces the
32 * maximum overrun. Numbers less than one (but greater than zero) make quota
33 * syncs less frequent.
34 *
35 * GFS quotas also use per-ID Lock Value Blocks (LVBs) to cache the contents of
36 * the quota file, so it is not being constantly read.
37 */
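For illustration, here is a minimal user-space sketch of the "quota_scale" arithmetic described above. The names are invented for the example; the in-kernel version of this test is need_sync() further down in this file.

#include <stdint.h>

/*
 * Decide whether a node's pending local quota change should force a sync
 * to the quota file.  The local change is scaled by the number of journals
 * (worst case: every node holds a similar unsynced change) and by the
 * quota_scale fraction num/den, then added to the last globally known
 * value and compared against the limit.
 */
static int quota_needs_sync(int64_t local_change, int64_t known_value,
			    int64_t limit, unsigned int journals,
			    unsigned int scale_num, unsigned int scale_den)
{
	int64_t projected;

	if (!limit || local_change <= 0 || known_value >= limit)
		return 0;
	projected = known_value +
		    (local_change * journals * scale_num) / scale_den;
	return projected >= limit;
}

A larger scale_num/scale_den makes the projection overshoot sooner, so syncs happen earlier and the possible overrun shrinks; values below one delay syncs and allow more fuzziness.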
38
39#include <linux/sched.h>
40#include <linux/slab.h>
41#include <linux/spinlock.h>
42#include <linux/completion.h>
43#include <linux/buffer_head.h>
44#include <linux/sort.h>
45#include <linux/fs.h>
46#include <linux/gfs2_ondisk.h>
47
48#include "gfs2.h"
49#include "lm_interface.h"
50#include "incore.h"
51#include "bmap.h"
52#include "glock.h"
53#include "glops.h"
54#include "log.h"
55#include "lvb.h"
56#include "meta_io.h"
57#include "quota.h"
58#include "rgrp.h"
59#include "super.h"
60#include "trans.h"
61#include "inode.h"
62#include "ops_file.h"
63#include "ops_address.h"
64#include "util.h"
65
66#define QUOTA_USER 1
67#define QUOTA_GROUP 0
68
69static uint64_t qd2offset(struct gfs2_quota_data *qd)
70{
71 uint64_t offset;
72
73 offset = 2 * (uint64_t)qd->qd_id + !test_bit(QDF_USER, &qd->qd_flags);
74 offset *= sizeof(struct gfs2_quota);
75
76 return offset;
77}
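For example, with the layout qd2offset() encodes, the user and group records for an ID are interleaved one struct gfs2_quota apart: the user record for ID 5 lives at offset (2*5 + 0) * sizeof(struct gfs2_quota) and the group record for ID 5 at (2*5 + 1) * sizeof(struct gfs2_quota), since a clear QDF_USER flag contributes the extra +1.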
78
79static int qd_alloc(struct gfs2_sbd *sdp, int user, uint32_t id,
80 struct gfs2_quota_data **qdp)
81{
82 struct gfs2_quota_data *qd;
83 int error;
84
85 qd = kzalloc(sizeof(struct gfs2_quota_data), GFP_KERNEL);
86 if (!qd)
87 return -ENOMEM;
88
89 qd->qd_count = 1;
90 qd->qd_id = id;
91 if (user)
92 set_bit(QDF_USER, &qd->qd_flags);
93 qd->qd_slot = -1;
94
95 error = gfs2_glock_get(sdp, 2 * (uint64_t)id + !user,
96 &gfs2_quota_glops, CREATE, &qd->qd_gl);
97 if (error)
98 goto fail;
99
100 error = gfs2_lvb_hold(qd->qd_gl);
101 gfs2_glock_put(qd->qd_gl);
102 if (error)
103 goto fail;
104
105 *qdp = qd;
106
107 return 0;
108
109 fail:
110 kfree(qd);
111 return error;
112}
113
114static int qd_get(struct gfs2_sbd *sdp, int user, uint32_t id, int create,
115 struct gfs2_quota_data **qdp)
116{
117 struct gfs2_quota_data *qd = NULL, *new_qd = NULL;
118 int error, found;
119
120 *qdp = NULL;
121
122 for (;;) {
123 found = 0;
124 spin_lock(&sdp->sd_quota_spin);
125 list_for_each_entry(qd, &sdp->sd_quota_list, qd_list) {
126 if (qd->qd_id == id &&
127 !test_bit(QDF_USER, &qd->qd_flags) == !user) {
128 qd->qd_count++;
129 found = 1;
130 break;
131 }
132 }
133
134 if (!found)
135 qd = NULL;
136
137 if (!qd && new_qd) {
138 qd = new_qd;
139 list_add(&qd->qd_list, &sdp->sd_quota_list);
140 atomic_inc(&sdp->sd_quota_count);
141 new_qd = NULL;
142 }
143
144 spin_unlock(&sdp->sd_quota_spin);
145
146 if (qd || !create) {
147 if (new_qd) {
148 gfs2_lvb_unhold(new_qd->qd_gl);
149 kfree(new_qd);
150 }
151 *qdp = qd;
152 return 0;
153 }
154
155 error = qd_alloc(sdp, user, id, &new_qd);
156 if (error)
157 return error;
158 }
159}
160
161static void qd_hold(struct gfs2_quota_data *qd)
162{
163 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
164
165 spin_lock(&sdp->sd_quota_spin);
166 gfs2_assert(sdp, qd->qd_count);
167 qd->qd_count++;
168 spin_unlock(&sdp->sd_quota_spin);
169}
170
171static void qd_put(struct gfs2_quota_data *qd)
172{
173 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
174 spin_lock(&sdp->sd_quota_spin);
175 gfs2_assert(sdp, qd->qd_count);
176 if (!--qd->qd_count)
177 qd->qd_last_touched = jiffies;
178 spin_unlock(&sdp->sd_quota_spin);
179}
180
181static int slot_get(struct gfs2_quota_data *qd)
182{
183 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
184 unsigned int c, o = 0, b;
185 unsigned char byte = 0;
186
187 spin_lock(&sdp->sd_quota_spin);
188
189 if (qd->qd_slot_count++) {
190 spin_unlock(&sdp->sd_quota_spin);
191 return 0;
192 }
193
194 for (c = 0; c < sdp->sd_quota_chunks; c++)
195 for (o = 0; o < PAGE_SIZE; o++) {
196 byte = sdp->sd_quota_bitmap[c][o];
197 if (byte != 0xFF)
198 goto found;
199 }
200
201 goto fail;
202
203 found:
204 for (b = 0; b < 8; b++)
205 if (!(byte & (1 << b)))
206 break;
207 qd->qd_slot = c * (8 * PAGE_SIZE) + o * 8 + b;
208
209 if (qd->qd_slot >= sdp->sd_quota_slots)
210 goto fail;
211
212 sdp->sd_quota_bitmap[c][o] |= 1 << b;
213
214 spin_unlock(&sdp->sd_quota_spin);
215
216 return 0;
217
218 fail:
219 qd->qd_slot_count--;
220 spin_unlock(&sdp->sd_quota_spin);
221 return -ENOSPC;
222}
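The slot number built above packs a (chunk, byte, bit) position into one index. Below is a sketch of the inverse mapping, mirroring the expression in slot_get(); the helper is purely illustrative and not part of the patch:

static void quota_slot_to_bit(unsigned int slot, unsigned int page_size,
			      unsigned int *chunk, unsigned int *byte,
			      unsigned int *bit)
{
	*chunk = slot / (8 * page_size);	/* which bitmap page */
	*byte = (slot % (8 * page_size)) / 8;	/* byte within that page */
	*bit = slot % 8;			/* bit within that byte */
}

(page_size is passed in rather than using PAGE_SIZE so the sketch stands alone.)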
223
224static void slot_hold(struct gfs2_quota_data *qd)
225{
226 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
227
228 spin_lock(&sdp->sd_quota_spin);
229 gfs2_assert(sdp, qd->qd_slot_count);
230 qd->qd_slot_count++;
231 spin_unlock(&sdp->sd_quota_spin);
232}
233
234static void slot_put(struct gfs2_quota_data *qd)
235{
236 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
237
238 spin_lock(&sdp->sd_quota_spin);
239 gfs2_assert(sdp, qd->qd_slot_count);
240 if (!--qd->qd_slot_count) {
241 gfs2_icbit_munge(sdp, sdp->sd_quota_bitmap, qd->qd_slot, 0);
242 qd->qd_slot = -1;
243 }
244 spin_unlock(&sdp->sd_quota_spin);
245}
246
247static int bh_get(struct gfs2_quota_data *qd)
248{
249 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
250 struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode);
251 unsigned int block, offset;
252 uint64_t dblock;
253 int new = 0;
254 struct buffer_head *bh;
255 int error;
256 int boundary;
257
258 mutex_lock(&sdp->sd_quota_mutex);
259
260 if (qd->qd_bh_count++) {
261 mutex_unlock(&sdp->sd_quota_mutex);
262 return 0;
263 }
264
265 block = qd->qd_slot / sdp->sd_qc_per_block;
266 offset = qd->qd_slot % sdp->sd_qc_per_block;
267
268 error = gfs2_block_map(&ip->i_inode, block, &new, &dblock, &boundary);
269 if (error)
270 goto fail;
271 error = gfs2_meta_read(ip->i_gl, dblock, DIO_START | DIO_WAIT, &bh);
272 if (error)
273 goto fail;
274 error = -EIO;
275 if (gfs2_metatype_check(sdp, bh, GFS2_METATYPE_QC))
276 goto fail_brelse;
277
278 qd->qd_bh = bh;
279 qd->qd_bh_qc = (struct gfs2_quota_change *)
280 (bh->b_data + sizeof(struct gfs2_meta_header) +
281 offset * sizeof(struct gfs2_quota_change));
282
283 mutex_unlock(&sdp->sd_quota_mutex);
284
285 return 0;
286
287 fail_brelse:
288 brelse(bh);
289
290 fail:
291 qd->qd_bh_count--;
292 mutex_unlock(&sdp->sd_quota_mutex);
293 return error;
294}
295
296static void bh_put(struct gfs2_quota_data *qd)
297{
298 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
299
300 mutex_lock(&sdp->sd_quota_mutex);
301 gfs2_assert(sdp, qd->qd_bh_count);
302 if (!--qd->qd_bh_count) {
303 brelse(qd->qd_bh);
304 qd->qd_bh = NULL;
305 qd->qd_bh_qc = NULL;
306 }
307 mutex_unlock(&sdp->sd_quota_mutex);
308}
309
310static int qd_fish(struct gfs2_sbd *sdp, struct gfs2_quota_data **qdp)
311{
312 struct gfs2_quota_data *qd = NULL;
313 int error;
314 int found = 0;
315
316 *qdp = NULL;
317
318 if (sdp->sd_vfs->s_flags & MS_RDONLY)
319 return 0;
320
321 spin_lock(&sdp->sd_quota_spin);
322
323 list_for_each_entry(qd, &sdp->sd_quota_list, qd_list) {
324 if (test_bit(QDF_LOCKED, &qd->qd_flags) ||
325 !test_bit(QDF_CHANGE, &qd->qd_flags) ||
326 qd->qd_sync_gen >= sdp->sd_quota_sync_gen)
327 continue;
328
329 list_move_tail(&qd->qd_list, &sdp->sd_quota_list);
330
331 set_bit(QDF_LOCKED, &qd->qd_flags);
332 gfs2_assert_warn(sdp, qd->qd_count);
333 qd->qd_count++;
334 qd->qd_change_sync = qd->qd_change;
335 gfs2_assert_warn(sdp, qd->qd_slot_count);
336 qd->qd_slot_count++;
337 found = 1;
338
339 break;
340 }
341
342 if (!found)
343 qd = NULL;
344
345 spin_unlock(&sdp->sd_quota_spin);
346
347 if (qd) {
348 gfs2_assert_warn(sdp, qd->qd_change_sync);
349 error = bh_get(qd);
350 if (error) {
351 clear_bit(QDF_LOCKED, &qd->qd_flags);
352 slot_put(qd);
353 qd_put(qd);
354 return error;
355 }
356 }
357
358 *qdp = qd;
359
360 return 0;
361}
362
363static int qd_trylock(struct gfs2_quota_data *qd)
364{
365 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
366
367 if (sdp->sd_vfs->s_flags & MS_RDONLY)
368 return 0;
369
370 spin_lock(&sdp->sd_quota_spin);
371
372 if (test_bit(QDF_LOCKED, &qd->qd_flags) ||
373 !test_bit(QDF_CHANGE, &qd->qd_flags)) {
374 spin_unlock(&sdp->sd_quota_spin);
375 return 0;
376 }
377
378 list_move_tail(&qd->qd_list, &sdp->sd_quota_list);
379
380 set_bit(QDF_LOCKED, &qd->qd_flags);
381 gfs2_assert_warn(sdp, qd->qd_count);
382 qd->qd_count++;
383 qd->qd_change_sync = qd->qd_change;
384 gfs2_assert_warn(sdp, qd->qd_slot_count);
385 qd->qd_slot_count++;
386
387 spin_unlock(&sdp->sd_quota_spin);
388
389 gfs2_assert_warn(sdp, qd->qd_change_sync);
390 if (bh_get(qd)) {
391 clear_bit(QDF_LOCKED, &qd->qd_flags);
392 slot_put(qd);
393 qd_put(qd);
394 return 0;
395 }
396
397 return 1;
398}
399
400static void qd_unlock(struct gfs2_quota_data *qd)
401{
402 gfs2_assert_warn(qd->qd_gl->gl_sbd,
403 test_bit(QDF_LOCKED, &qd->qd_flags));
404 clear_bit(QDF_LOCKED, &qd->qd_flags);
405 bh_put(qd);
406 slot_put(qd);
407 qd_put(qd);
408}
409
410static int qdsb_get(struct gfs2_sbd *sdp, int user, uint32_t id, int create,
411 struct gfs2_quota_data **qdp)
412{
413 int error;
414
415 error = qd_get(sdp, user, id, create, qdp);
416 if (error)
417 return error;
418
419 error = slot_get(*qdp);
420 if (error)
421 goto fail;
422
423 error = bh_get(*qdp);
424 if (error)
425 goto fail_slot;
426
427 return 0;
428
429 fail_slot:
430 slot_put(*qdp);
431
432 fail:
433 qd_put(*qdp);
434 return error;
435}
436
437static void qdsb_put(struct gfs2_quota_data *qd)
438{
439 bh_put(qd);
440 slot_put(qd);
441 qd_put(qd);
442}
443
444int gfs2_quota_hold(struct gfs2_inode *ip, uint32_t uid, uint32_t gid)
445{
446 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
447 struct gfs2_alloc *al = &ip->i_alloc;
448 struct gfs2_quota_data **qd = al->al_qd;
449 int error;
450
451 if (gfs2_assert_warn(sdp, !al->al_qd_num) ||
452 gfs2_assert_warn(sdp, !test_bit(GIF_QD_LOCKED, &ip->i_flags)))
453 return -EIO;
454
455 if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF)
456 return 0;
457
458 error = qdsb_get(sdp, QUOTA_USER, ip->i_di.di_uid, CREATE, qd);
459 if (error)
460 goto out;
461 al->al_qd_num++;
462 qd++;
463
464 error = qdsb_get(sdp, QUOTA_GROUP, ip->i_di.di_gid, CREATE, qd);
465 if (error)
466 goto out;
467 al->al_qd_num++;
468 qd++;
469
470 if (uid != NO_QUOTA_CHANGE && uid != ip->i_di.di_uid) {
471 error = qdsb_get(sdp, QUOTA_USER, uid, CREATE, qd);
472 if (error)
473 goto out;
474 al->al_qd_num++;
475 qd++;
476 }
477
478 if (gid != NO_QUOTA_CHANGE && gid != ip->i_di.di_gid) {
479 error = qdsb_get(sdp, QUOTA_GROUP, gid, CREATE, qd);
480 if (error)
481 goto out;
482 al->al_qd_num++;
483 qd++;
484 }
485
486 out:
487 if (error)
488 gfs2_quota_unhold(ip);
489
490 return error;
491}
492
493void gfs2_quota_unhold(struct gfs2_inode *ip)
494{
495 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
496 struct gfs2_alloc *al = &ip->i_alloc;
497 unsigned int x;
498
499 gfs2_assert_warn(sdp, !test_bit(GIF_QD_LOCKED, &ip->i_flags));
500
501 for (x = 0; x < al->al_qd_num; x++) {
502 qdsb_put(al->al_qd[x]);
503 al->al_qd[x] = NULL;
504 }
505 al->al_qd_num = 0;
506}
507
508static int sort_qd(const void *a, const void *b)
509{
510 struct gfs2_quota_data *qd_a = *(struct gfs2_quota_data **)a;
511 struct gfs2_quota_data *qd_b = *(struct gfs2_quota_data **)b;
512 int ret = 0;
513
514 if (!test_bit(QDF_USER, &qd_a->qd_flags) !=
515 !test_bit(QDF_USER, &qd_b->qd_flags)) {
516 if (test_bit(QDF_USER, &qd_a->qd_flags))
517 ret = -1;
518 else
519 ret = 1;
520 } else {
521 if (qd_a->qd_id < qd_b->qd_id)
522 ret = -1;
523 else if (qd_a->qd_id > qd_b->qd_id)
524 ret = 1;
525 }
526
527 return ret;
528}
529
530static void do_qc(struct gfs2_quota_data *qd, int64_t change)
531{
532 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
533 struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode);
534 struct gfs2_quota_change *qc = qd->qd_bh_qc;
535 int64_t x;
536
537 mutex_lock(&sdp->sd_quota_mutex);
538 gfs2_trans_add_bh(ip->i_gl, qd->qd_bh, 1);
539
540 if (!test_bit(QDF_CHANGE, &qd->qd_flags)) {
541 qc->qc_change = 0;
542 qc->qc_flags = 0;
543 if (test_bit(QDF_USER, &qd->qd_flags))
544 qc->qc_flags = cpu_to_be32(GFS2_QCF_USER);
545 qc->qc_id = cpu_to_be32(qd->qd_id);
546 }
547
548 x = qc->qc_change;
549 x = be64_to_cpu(x) + change;
550 qc->qc_change = cpu_to_be64(x);
551
552 spin_lock(&sdp->sd_quota_spin);
553 qd->qd_change = x;
554 spin_unlock(&sdp->sd_quota_spin);
555
556 if (!x) {
557 gfs2_assert_warn(sdp, test_bit(QDF_CHANGE, &qd->qd_flags));
558 clear_bit(QDF_CHANGE, &qd->qd_flags);
559 qc->qc_flags = 0;
560 qc->qc_id = 0;
561 slot_put(qd);
562 qd_put(qd);
563 } else if (!test_and_set_bit(QDF_CHANGE, &qd->qd_flags)) {
564 qd_hold(qd);
565 slot_hold(qd);
566 }
567
568 mutex_unlock(&sdp->sd_quota_mutex);
569}
570
571/**
572 * gfs2_adjust_quota
573 *
574 * This function was mostly borrowed from gfs2_block_truncate_page which was
575 * in turn mostly borrowed from ext3
576 */
577static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
578 int64_t change, struct gfs2_quota_data *qd)
579{
580 struct inode *inode = &ip->i_inode;
581 struct address_space *mapping = inode->i_mapping;
582 unsigned long index = loc >> PAGE_CACHE_SHIFT;
583 unsigned offset = loc & (PAGE_CACHE_SIZE - 1);
584 unsigned blocksize, iblock, pos;
585 struct buffer_head *bh;
586 struct page *page;
587 void *kaddr;
588 __be64 *ptr;
589 u64 value;
590 int err = -EIO;
591
592 page = grab_cache_page(mapping, index);
593 if (!page)
594 return -ENOMEM;
595
596 blocksize = inode->i_sb->s_blocksize;
597 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
598
599 if (!page_has_buffers(page))
600 create_empty_buffers(page, blocksize, 0);
601
602 bh = page_buffers(page);
603 pos = blocksize;
604 while (offset >= pos) {
605 bh = bh->b_this_page;
606 iblock++;
607 pos += blocksize;
608 }
609
610 if (!buffer_mapped(bh)) {
611 gfs2_get_block(inode, iblock, bh, 1);
612 if (!buffer_mapped(bh))
613 goto unlock;
614 }
615
616 if (PageUptodate(page))
617 set_buffer_uptodate(bh);
618
619 if (!buffer_uptodate(bh)) {
620 ll_rw_block(READ, 1, &bh);
621 wait_on_buffer(bh);
622 if (!buffer_uptodate(bh))
623 goto unlock;
624 }
625
626 gfs2_trans_add_bh(ip->i_gl, bh, 0);
627
628 kaddr = kmap_atomic(page, KM_USER0);
629 ptr = (__be64 *)(kaddr + offset);
630 value = *ptr = cpu_to_be64(be64_to_cpu(*ptr) + change);
631 flush_dcache_page(page);
632 kunmap_atomic(kaddr, KM_USER0);
633 err = 0;
634 qd->qd_qb.qb_magic = cpu_to_be32(GFS2_MAGIC);
635#if 0
636 qd->qd_qb.qb_limit = cpu_to_be64(q.qu_limit);
637 qd->qd_qb.qb_warn = cpu_to_be64(q.qu_warn);
638#endif
639 qd->qd_qb.qb_value = cpu_to_be64(value);
640unlock:
641 unlock_page(page);
642 page_cache_release(page);
643 return err;
644}
645
646static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
647{
648 struct gfs2_sbd *sdp = (*qda)->qd_gl->gl_sbd;
649 struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode);
650 unsigned int data_blocks, ind_blocks;
651 struct gfs2_holder *ghs, i_gh;
652 unsigned int qx, x;
653 struct gfs2_quota_data *qd;
654 loff_t offset;
655 unsigned int nalloc = 0;
656 struct gfs2_alloc *al = NULL;
657 int error;
658
659 gfs2_write_calc_reserv(ip, sizeof(struct gfs2_quota),
660 &data_blocks, &ind_blocks);
661
662 ghs = kcalloc(num_qd, sizeof(struct gfs2_holder), GFP_KERNEL);
663 if (!ghs)
664 return -ENOMEM;
665
666 sort(qda, num_qd, sizeof(struct gfs2_quota_data *), sort_qd, NULL);
667 for (qx = 0; qx < num_qd; qx++) {
668 error = gfs2_glock_nq_init(qda[qx]->qd_gl,
669 LM_ST_EXCLUSIVE,
670 GL_NOCACHE, &ghs[qx]);
671 if (error)
672 goto out;
673 }
674
675 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
676 if (error)
677 goto out;
678
679 for (x = 0; x < num_qd; x++) {
680 int alloc_required;
681
682 offset = qd2offset(qda[x]);
683 error = gfs2_write_alloc_required(ip, offset,
684 sizeof(struct gfs2_quota),
685 &alloc_required);
686 if (error)
687 goto out_gunlock;
688 if (alloc_required)
689 nalloc++;
690 }
691
692 if (nalloc) {
693 al = gfs2_alloc_get(ip);
694
695 al->al_requested = nalloc * (data_blocks + ind_blocks);
696
697 error = gfs2_inplace_reserve(ip);
698 if (error)
699 goto out_alloc;
700
701 error = gfs2_trans_begin(sdp,
702 al->al_rgd->rd_ri.ri_length +
703 num_qd * data_blocks +
704 nalloc * ind_blocks +
705 RES_DINODE + num_qd +
706 RES_STATFS, 0);
707 if (error)
708 goto out_ipres;
709 } else {
710 error = gfs2_trans_begin(sdp,
711 num_qd * data_blocks +
712 RES_DINODE + num_qd, 0);
713 if (error)
714 goto out_gunlock;
715 }
716
717 for (x = 0; x < num_qd; x++) {
718 qd = qda[x];
719 offset = qd2offset(qd);
720 error = gfs2_adjust_quota(ip, offset, qd->qd_change_sync,
721 (struct gfs2_quota_data *)
722 qd->qd_gl->gl_lvb);
723 if (error)
724 goto out_end_trans;
725
726 do_qc(qd, -qd->qd_change_sync);
727 }
728
729 error = 0;
730
731 out_end_trans:
732 gfs2_trans_end(sdp);
733
734 out_ipres:
735 if (nalloc)
736 gfs2_inplace_release(ip);
737
738 out_alloc:
739 if (nalloc)
740 gfs2_alloc_put(ip);
741
742 out_gunlock:
743 gfs2_glock_dq_uninit(&i_gh);
744
745 out:
746 while (qx--)
747 gfs2_glock_dq_uninit(&ghs[qx]);
748 kfree(ghs);
749 gfs2_log_flush(ip->i_gl->gl_sbd, ip->i_gl);
750
751 return error;
752}
753
754static int do_glock(struct gfs2_quota_data *qd, int force_refresh,
755 struct gfs2_holder *q_gh)
756{
757 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
758 struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode);
759 struct gfs2_holder i_gh;
760 struct gfs2_quota q;
761 char buf[sizeof(struct gfs2_quota)];
762 struct file_ra_state ra_state;
763 int error;
764
765 file_ra_state_init(&ra_state, sdp->sd_quota_inode->i_mapping);
766 restart:
767 error = gfs2_glock_nq_init(qd->qd_gl, LM_ST_SHARED, 0, q_gh);
768 if (error)
769 return error;
770
771 gfs2_quota_lvb_in(&qd->qd_qb, qd->qd_gl->gl_lvb);
772
773 if (force_refresh || qd->qd_qb.qb_magic != GFS2_MAGIC) {
774 loff_t pos;
775 gfs2_glock_dq_uninit(q_gh);
776 error = gfs2_glock_nq_init(qd->qd_gl,
777 LM_ST_EXCLUSIVE, GL_NOCACHE,
778 q_gh);
779 if (error)
780 return error;
781
782 error = gfs2_glock_nq_init(ip->i_gl,
783 LM_ST_SHARED, 0,
784 &i_gh);
785 if (error)
786 goto fail;
787
788 memset(buf, 0, sizeof(struct gfs2_quota));
789 pos = qd2offset(qd);
790 error = gfs2_internal_read(ip, &ra_state, buf,
791 &pos, sizeof(struct gfs2_quota));
792 if (error < 0)
793 goto fail_gunlock;
794
795 gfs2_glock_dq_uninit(&i_gh);
796
797 gfs2_quota_in(&q, buf);
798
799 memset(&qd->qd_qb, 0, sizeof(struct gfs2_quota_lvb));
800 qd->qd_qb.qb_magic = GFS2_MAGIC;
801 qd->qd_qb.qb_limit = q.qu_limit;
802 qd->qd_qb.qb_warn = q.qu_warn;
803 qd->qd_qb.qb_value = q.qu_value;
804
805 gfs2_quota_lvb_out(&qd->qd_qb, qd->qd_gl->gl_lvb);
806
807 if (gfs2_glock_is_blocking(qd->qd_gl)) {
808 gfs2_glock_dq_uninit(q_gh);
809 force_refresh = 0;
810 goto restart;
811 }
812 }
813
814 return 0;
815
816 fail_gunlock:
817 gfs2_glock_dq_uninit(&i_gh);
818
819 fail:
820 gfs2_glock_dq_uninit(q_gh);
821
822 return error;
823}
824
825int gfs2_quota_lock(struct gfs2_inode *ip, uint32_t uid, uint32_t gid)
826{
827 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
828 struct gfs2_alloc *al = &ip->i_alloc;
829 unsigned int x;
830 int error = 0;
831
832 gfs2_quota_hold(ip, uid, gid);
833
834 if (capable(CAP_SYS_RESOURCE) ||
835 sdp->sd_args.ar_quota != GFS2_QUOTA_ON)
836 return 0;
837
838 sort(al->al_qd, al->al_qd_num, sizeof(struct gfs2_quota_data *),
839 sort_qd, NULL);
840
841 for (x = 0; x < al->al_qd_num; x++) {
842 error = do_glock(al->al_qd[x], NO_FORCE, &al->al_qd_ghs[x]);
843 if (error)
844 break;
845 }
846
847 if (!error)
848 set_bit(GIF_QD_LOCKED, &ip->i_flags);
849 else {
850 while (x--)
851 gfs2_glock_dq_uninit(&al->al_qd_ghs[x]);
852 gfs2_quota_unhold(ip);
853 }
854
855 return error;
856}
857
858static int need_sync(struct gfs2_quota_data *qd)
859{
860 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
861 struct gfs2_tune *gt = &sdp->sd_tune;
862 int64_t value;
863 unsigned int num, den;
864 int do_sync = 1;
865
866 if (!qd->qd_qb.qb_limit)
867 return 0;
868
869 spin_lock(&sdp->sd_quota_spin);
870 value = qd->qd_change;
871 spin_unlock(&sdp->sd_quota_spin);
872
873 spin_lock(&gt->gt_spin);
874 num = gt->gt_quota_scale_num;
875 den = gt->gt_quota_scale_den;
876 spin_unlock(&gt->gt_spin);
877
878 if (value < 0)
879 do_sync = 0;
880 else if (qd->qd_qb.qb_value >= (int64_t)qd->qd_qb.qb_limit)
881 do_sync = 0;
882 else {
883 value *= gfs2_jindex_size(sdp) * num;
884 do_div(value, den);
885 value += qd->qd_qb.qb_value;
886 if (value < (int64_t)qd->qd_qb.qb_limit)
887 do_sync = 0;
888 }
889
890 return do_sync;
891}
892
893void gfs2_quota_unlock(struct gfs2_inode *ip)
894{
895 struct gfs2_alloc *al = &ip->i_alloc;
896 struct gfs2_quota_data *qda[4];
897 unsigned int count = 0;
898 unsigned int x;
899
900 if (!test_and_clear_bit(GIF_QD_LOCKED, &ip->i_flags))
901 goto out;
902
903 for (x = 0; x < al->al_qd_num; x++) {
904 struct gfs2_quota_data *qd;
905 int sync;
906
907 qd = al->al_qd[x];
908 sync = need_sync(qd);
909
910 gfs2_glock_dq_uninit(&al->al_qd_ghs[x]);
911
912 if (sync && qd_trylock(qd))
913 qda[count++] = qd;
914 }
915
916 if (count) {
917 do_sync(count, qda);
918 for (x = 0; x < count; x++)
919 qd_unlock(qda[x]);
920 }
921
922 out:
923 gfs2_quota_unhold(ip);
924}
925
926#define MAX_LINE 256
927
928static int print_message(struct gfs2_quota_data *qd, char *type)
929{
930 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
931
932 printk(KERN_INFO "GFS2: fsid=%s: quota %s for %s %u\r\n",
933 sdp->sd_fsname, type,
934 (test_bit(QDF_USER, &qd->qd_flags)) ? "user" : "group",
935 qd->qd_id);
936
937 return 0;
938}
939
940int gfs2_quota_check(struct gfs2_inode *ip, uint32_t uid, uint32_t gid)
941{
942 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
943 struct gfs2_alloc *al = &ip->i_alloc;
944 struct gfs2_quota_data *qd;
945 int64_t value;
946 unsigned int x;
947 int error = 0;
948
949 if (!test_bit(GIF_QD_LOCKED, &ip->i_flags))
950 return 0;
951
952 if (sdp->sd_args.ar_quota != GFS2_QUOTA_ON)
953 return 0;
954
955 for (x = 0; x < al->al_qd_num; x++) {
956 qd = al->al_qd[x];
957
958 if (!((qd->qd_id == uid && test_bit(QDF_USER, &qd->qd_flags)) ||
959 (qd->qd_id == gid && !test_bit(QDF_USER, &qd->qd_flags))))
960 continue;
961
962 value = qd->qd_qb.qb_value;
963 spin_lock(&sdp->sd_quota_spin);
964 value += qd->qd_change;
965 spin_unlock(&sdp->sd_quota_spin);
966
967 if (qd->qd_qb.qb_limit && (int64_t)qd->qd_qb.qb_limit < value) {
968 print_message(qd, "exceeded");
969 error = -EDQUOT;
970 break;
971 } else if (qd->qd_qb.qb_warn &&
972 (int64_t)qd->qd_qb.qb_warn < value &&
973 time_after_eq(jiffies, qd->qd_last_warn +
974 gfs2_tune_get(sdp,
975 gt_quota_warn_period) * HZ)) {
976 error = print_message(qd, "warning");
977 qd->qd_last_warn = jiffies;
978 }
979 }
980
981 return error;
982}
983
984void gfs2_quota_change(struct gfs2_inode *ip, int64_t change,
985 uint32_t uid, uint32_t gid)
986{
987 struct gfs2_alloc *al = &ip->i_alloc;
988 struct gfs2_quota_data *qd;
989 unsigned int x;
990 unsigned int found = 0;
991
992 if (gfs2_assert_warn(GFS2_SB(&ip->i_inode), change))
993 return;
994 if (ip->i_di.di_flags & GFS2_DIF_SYSTEM)
995 return;
996
997 for (x = 0; x < al->al_qd_num; x++) {
998 qd = al->al_qd[x];
999
1000 if ((qd->qd_id == uid && test_bit(QDF_USER, &qd->qd_flags)) ||
1001 (qd->qd_id == gid && !test_bit(QDF_USER, &qd->qd_flags))) {
1002 do_qc(qd, change);
1003 found++;
1004 }
1005 }
1006}
1007
1008int gfs2_quota_sync(struct gfs2_sbd *sdp)
1009{
1010 struct gfs2_quota_data **qda;
1011 unsigned int max_qd = gfs2_tune_get(sdp, gt_quota_simul_sync);
1012 unsigned int num_qd;
1013 unsigned int x;
1014 int error = 0;
1015
1016 sdp->sd_quota_sync_gen++;
1017
1018 qda = kcalloc(max_qd, sizeof(struct gfs2_quota_data *), GFP_KERNEL);
1019 if (!qda)
1020 return -ENOMEM;
1021
1022 do {
1023 num_qd = 0;
1024
1025 for (;;) {
1026 error = qd_fish(sdp, qda + num_qd);
1027 if (error || !qda[num_qd])
1028 break;
1029 if (++num_qd == max_qd)
1030 break;
1031 }
1032
1033 if (num_qd) {
1034 if (!error)
1035 error = do_sync(num_qd, qda);
1036 if (!error)
1037 for (x = 0; x < num_qd; x++)
1038 qda[x]->qd_sync_gen =
1039 sdp->sd_quota_sync_gen;
1040
1041 for (x = 0; x < num_qd; x++)
1042 qd_unlock(qda[x]);
1043 }
1044 } while (!error && num_qd == max_qd);
1045
1046 kfree(qda);
1047
1048 return error;
1049}
1050
1051int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, uint32_t id)
1052{
1053 struct gfs2_quota_data *qd;
1054 struct gfs2_holder q_gh;
1055 int error;
1056
1057 error = qd_get(sdp, user, id, CREATE, &qd);
1058 if (error)
1059 return error;
1060
1061 error = do_glock(qd, FORCE, &q_gh);
1062 if (!error)
1063 gfs2_glock_dq_uninit(&q_gh);
1064
1065 qd_put(qd);
1066
1067 return error;
1068}
1069
1070#if 0
1071int gfs2_quota_read(struct gfs2_sbd *sdp, int user, uint32_t id,
1072 struct gfs2_quota *q)
1073{
1074 struct gfs2_quota_data *qd;
1075 struct gfs2_holder q_gh;
1076 int error;
1077
1078 if (((user) ? (id != current->fsuid) : (!in_group_p(id))) &&
1079 !capable(CAP_SYS_ADMIN))
1080 return -EACCES;
1081
1082 error = qd_get(sdp, user, id, CREATE, &qd);
1083 if (error)
1084 return error;
1085
1086 error = do_glock(qd, NO_FORCE, &q_gh);
1087 if (error)
1088 goto out;
1089
1090 memset(q, 0, sizeof(struct gfs2_quota));
1091 q->qu_limit = qd->qd_qb.qb_limit;
1092 q->qu_warn = qd->qd_qb.qb_warn;
1093 q->qu_value = qd->qd_qb.qb_value;
1094
1095 spin_lock(&sdp->sd_quota_spin);
1096 q->qu_value += qd->qd_change;
1097 spin_unlock(&sdp->sd_quota_spin);
1098
1099 gfs2_glock_dq_uninit(&q_gh);
1100
1101 out:
1102 qd_put(qd);
1103
1104 return error;
1105}
1106#endif /* 0 */
1107
1108int gfs2_quota_init(struct gfs2_sbd *sdp)
1109{
1110 struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode);
1111 unsigned int blocks = ip->i_di.di_size >> sdp->sd_sb.sb_bsize_shift;
1112 unsigned int x, slot = 0;
1113 unsigned int found = 0;
1114 uint64_t dblock;
1115 uint32_t extlen = 0;
1116 int error;
1117
1118 if (!ip->i_di.di_size ||
1119 ip->i_di.di_size > (64 << 20) ||
1120 ip->i_di.di_size & (sdp->sd_sb.sb_bsize - 1)) {
1121 gfs2_consist_inode(ip);
1122 return -EIO;
1123 }
1124 sdp->sd_quota_slots = blocks * sdp->sd_qc_per_block;
1125 sdp->sd_quota_chunks = DIV_ROUND_UP(sdp->sd_quota_slots, 8 * PAGE_SIZE);
1126
1127 error = -ENOMEM;
1128
1129 sdp->sd_quota_bitmap = kcalloc(sdp->sd_quota_chunks,
1130 sizeof(unsigned char *), GFP_KERNEL);
1131 if (!sdp->sd_quota_bitmap)
1132 return error;
1133
1134 for (x = 0; x < sdp->sd_quota_chunks; x++) {
1135 sdp->sd_quota_bitmap[x] = kzalloc(PAGE_SIZE, GFP_KERNEL);
1136 if (!sdp->sd_quota_bitmap[x])
1137 goto fail;
1138 }
1139
1140 for (x = 0; x < blocks; x++) {
1141 struct buffer_head *bh;
1142 unsigned int y;
1143
1144 if (!extlen) {
1145 int new = 0;
1146 error = gfs2_extent_map(&ip->i_inode, x, &new, &dblock, &extlen);
1147 if (error)
1148 goto fail;
1149 }
1150 gfs2_meta_ra(ip->i_gl, dblock, extlen);
1151 error = gfs2_meta_read(ip->i_gl, dblock, DIO_START | DIO_WAIT,
1152 &bh);
1153 if (error)
1154 goto fail;
1155 error = -EIO;
1156 if (gfs2_metatype_check(sdp, bh, GFS2_METATYPE_QC)) {
1157 brelse(bh);
1158 goto fail;
1159 }
1160
1161 for (y = 0;
1162 y < sdp->sd_qc_per_block && slot < sdp->sd_quota_slots;
1163 y++, slot++) {
1164 struct gfs2_quota_change qc;
1165 struct gfs2_quota_data *qd;
1166
1167 gfs2_quota_change_in(&qc, bh->b_data +
1168 sizeof(struct gfs2_meta_header) +
1169 y * sizeof(struct gfs2_quota_change));
1170 if (!qc.qc_change)
1171 continue;
1172
1173 error = qd_alloc(sdp, (qc.qc_flags & GFS2_QCF_USER),
1174 qc.qc_id, &qd);
1175 if (error) {
1176 brelse(bh);
1177 goto fail;
1178 }
1179
1180 set_bit(QDF_CHANGE, &qd->qd_flags);
1181 qd->qd_change = qc.qc_change;
1182 qd->qd_slot = slot;
1183 qd->qd_slot_count = 1;
1184 qd->qd_last_touched = jiffies;
1185
1186 spin_lock(&sdp->sd_quota_spin);
1187 gfs2_icbit_munge(sdp, sdp->sd_quota_bitmap, slot, 1);
1188 list_add(&qd->qd_list, &sdp->sd_quota_list);
1189 atomic_inc(&sdp->sd_quota_count);
1190 spin_unlock(&sdp->sd_quota_spin);
1191
1192 found++;
1193 }
1194
1195 brelse(bh);
1196 dblock++;
1197 extlen--;
1198 }
1199
1200 if (found)
1201 fs_info(sdp, "found %u quota changes\n", found);
1202
1203 return 0;
1204
1205 fail:
1206 gfs2_quota_cleanup(sdp);
1207 return error;
1208}
1209
1210void gfs2_quota_scan(struct gfs2_sbd *sdp)
1211{
1212 struct gfs2_quota_data *qd, *safe;
1213 LIST_HEAD(dead);
1214
1215 spin_lock(&sdp->sd_quota_spin);
1216 list_for_each_entry_safe(qd, safe, &sdp->sd_quota_list, qd_list) {
1217 if (!qd->qd_count &&
1218 time_after_eq(jiffies, qd->qd_last_touched +
1219 gfs2_tune_get(sdp, gt_quota_cache_secs) * HZ)) {
1220 list_move(&qd->qd_list, &dead);
1221 gfs2_assert_warn(sdp,
1222 atomic_read(&sdp->sd_quota_count) > 0);
1223 atomic_dec(&sdp->sd_quota_count);
1224 }
1225 }
1226 spin_unlock(&sdp->sd_quota_spin);
1227
1228 while (!list_empty(&dead)) {
1229 qd = list_entry(dead.next, struct gfs2_quota_data, qd_list);
1230 list_del(&qd->qd_list);
1231
1232 gfs2_assert_warn(sdp, !qd->qd_change);
1233 gfs2_assert_warn(sdp, !qd->qd_slot_count);
1234 gfs2_assert_warn(sdp, !qd->qd_bh_count);
1235
1236 gfs2_lvb_unhold(qd->qd_gl);
1237 kfree(qd);
1238 }
1239}
1240
1241void gfs2_quota_cleanup(struct gfs2_sbd *sdp)
1242{
1243 struct list_head *head = &sdp->sd_quota_list;
1244 struct gfs2_quota_data *qd;
1245 unsigned int x;
1246
1247 spin_lock(&sdp->sd_quota_spin);
1248 while (!list_empty(head)) {
1249 qd = list_entry(head->prev, struct gfs2_quota_data, qd_list);
1250
1251 if (qd->qd_count > 1 ||
1252 (qd->qd_count && !test_bit(QDF_CHANGE, &qd->qd_flags))) {
1253 list_move(&qd->qd_list, head);
1254 spin_unlock(&sdp->sd_quota_spin);
1255 schedule();
1256 spin_lock(&sdp->sd_quota_spin);
1257 continue;
1258 }
1259
1260 list_del(&qd->qd_list);
1261 atomic_dec(&sdp->sd_quota_count);
1262 spin_unlock(&sdp->sd_quota_spin);
1263
1264 if (!qd->qd_count) {
1265 gfs2_assert_warn(sdp, !qd->qd_change);
1266 gfs2_assert_warn(sdp, !qd->qd_slot_count);
1267 } else
1268 gfs2_assert_warn(sdp, qd->qd_slot_count == 1);
1269 gfs2_assert_warn(sdp, !qd->qd_bh_count);
1270
1271 gfs2_lvb_unhold(qd->qd_gl);
1272 kfree(qd);
1273
1274 spin_lock(&sdp->sd_quota_spin);
1275 }
1276 spin_unlock(&sdp->sd_quota_spin);
1277
1278 gfs2_assert_warn(sdp, !atomic_read(&sdp->sd_quota_count));
1279
1280 if (sdp->sd_quota_bitmap) {
1281 for (x = 0; x < sdp->sd_quota_chunks; x++)
1282 kfree(sdp->sd_quota_bitmap[x]);
1283 kfree(sdp->sd_quota_bitmap);
1284 }
1285}
1286
diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h
new file mode 100644
index 000000000000..af05492f9644
--- /dev/null
+++ b/fs/gfs2/quota.h
@@ -0,0 +1,32 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __QUOTA_DOT_H__
11#define __QUOTA_DOT_H__
12
13#define NO_QUOTA_CHANGE ((uint32_t)-1)
14
15int gfs2_quota_hold(struct gfs2_inode *ip, uint32_t uid, uint32_t gid);
16void gfs2_quota_unhold(struct gfs2_inode *ip);
17
18int gfs2_quota_lock(struct gfs2_inode *ip, uint32_t uid, uint32_t gid);
19void gfs2_quota_unlock(struct gfs2_inode *ip);
20
21int gfs2_quota_check(struct gfs2_inode *ip, uint32_t uid, uint32_t gid);
22void gfs2_quota_change(struct gfs2_inode *ip, int64_t change,
23 uint32_t uid, uint32_t gid);
24
25int gfs2_quota_sync(struct gfs2_sbd *sdp);
26int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, uint32_t id);
27
28int gfs2_quota_init(struct gfs2_sbd *sdp);
29void gfs2_quota_scan(struct gfs2_sbd *sdp);
30void gfs2_quota_cleanup(struct gfs2_sbd *sdp);
31
32#endif /* __QUOTA_DOT_H__ */
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
new file mode 100644
index 000000000000..7aabc03e4abd
--- /dev/null
+++ b/fs/gfs2/recovery.c
@@ -0,0 +1,575 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/gfs2_ondisk.h>
16#include <linux/crc32.h>
17
18#include "gfs2.h"
19#include "lm_interface.h"
20#include "incore.h"
21#include "bmap.h"
22#include "glock.h"
23#include "glops.h"
24#include "lm.h"
25#include "lops.h"
26#include "meta_io.h"
27#include "recovery.h"
28#include "super.h"
29#include "util.h"
30#include "dir.h"
31
32int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk,
33 struct buffer_head **bh)
34{
35 struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
36 struct gfs2_glock *gl = ip->i_gl;
37 int new = 0;
38 uint64_t dblock;
39 uint32_t extlen;
40 int error;
41
42 error = gfs2_extent_map(&ip->i_inode, blk, &new, &dblock, &extlen);
43 if (error)
44 return error;
45 if (!dblock) {
46 gfs2_consist_inode(ip);
47 return -EIO;
48 }
49
50 gfs2_meta_ra(gl, dblock, extlen);
51 error = gfs2_meta_read(gl, dblock, DIO_START | DIO_WAIT, bh);
52
53 return error;
54}
55
56int gfs2_revoke_add(struct gfs2_sbd *sdp, uint64_t blkno, unsigned int where)
57{
58 struct list_head *head = &sdp->sd_revoke_list;
59 struct gfs2_revoke_replay *rr;
60 int found = 0;
61
62 list_for_each_entry(rr, head, rr_list) {
63 if (rr->rr_blkno == blkno) {
64 found = 1;
65 break;
66 }
67 }
68
69 if (found) {
70 rr->rr_where = where;
71 return 0;
72 }
73
74 rr = kmalloc(sizeof(struct gfs2_revoke_replay), GFP_KERNEL);
75 if (!rr)
76 return -ENOMEM;
77
78 rr->rr_blkno = blkno;
79 rr->rr_where = where;
80 list_add(&rr->rr_list, head);
81
82 return 1;
83}
84
85int gfs2_revoke_check(struct gfs2_sbd *sdp, uint64_t blkno, unsigned int where)
86{
87 struct gfs2_revoke_replay *rr;
88 int wrap, a, b, revoke;
89 int found = 0;
90
91 list_for_each_entry(rr, &sdp->sd_revoke_list, rr_list) {
92 if (rr->rr_blkno == blkno) {
93 found = 1;
94 break;
95 }
96 }
97
98 if (!found)
99 return 0;
100
101 wrap = (rr->rr_where < sdp->sd_replay_tail);
102 a = (sdp->sd_replay_tail < where);
103 b = (where < rr->rr_where);
104 revoke = (wrap) ? (a || b) : (a && b);
105
106 return revoke;
107}
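A worked example of the wrap-around interval test above, with illustrative journal positions: suppose the replay tail is at block 90 and the revoke was logged at rr_where = 10, so the active region wraps past the end of the journal.

/*
 * wrap   = (10 < 90) = 1
 * a      = (90 < where)
 * b      = (where < 10)
 * revoke = a || b
 *
 * Metadata seen at where = 95 or where = 5 is still covered by the revoke,
 * while where = 50 (outside the wrapped tail..rr_where region) is not.
 */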
108
109void gfs2_revoke_clean(struct gfs2_sbd *sdp)
110{
111 struct list_head *head = &sdp->sd_revoke_list;
112 struct gfs2_revoke_replay *rr;
113
114 while (!list_empty(head)) {
115 rr = list_entry(head->next, struct gfs2_revoke_replay, rr_list);
116 list_del(&rr->rr_list);
117 kfree(rr);
118 }
119}
120
121/**
122 * get_log_header - read the log header for a given segment
123 * @jd: the journal
124 * @blk: the block to look at
125 * @lh: the log header to return
126 *
127 * Read the log header for a given segment in a given journal. Do a few
128 * sanity checks on it.
129 *
130 * Returns: 0 on success,
131 * 1 if the header was invalid or incomplete,
132 * errno on error
133 */
134
135static int get_log_header(struct gfs2_jdesc *jd, unsigned int blk,
136 struct gfs2_log_header *head)
137{
138 struct buffer_head *bh;
139 struct gfs2_log_header lh;
140 uint32_t hash;
141 int error;
142
143 error = gfs2_replay_read_block(jd, blk, &bh);
144 if (error)
145 return error;
146
147 memcpy(&lh, bh->b_data, sizeof(struct gfs2_log_header));
148 lh.lh_hash = 0;
149 hash = gfs2_disk_hash((char *)&lh, sizeof(struct gfs2_log_header));
150 gfs2_log_header_in(&lh, bh->b_data);
151
152 brelse(bh);
153
154 if (lh.lh_header.mh_magic != GFS2_MAGIC ||
155 lh.lh_header.mh_type != GFS2_METATYPE_LH ||
156 lh.lh_blkno != blk ||
157 lh.lh_hash != hash)
158 return 1;
159
160 *head = lh;
161
162 return 0;
163}
164
165/**
166 * find_good_lh - find a good log header
167 * @jd: the journal
168 * @blk: the segment to start searching from
169 * @lh: the log header to fill in
170 * @forward: if true search forward in the log, else search backward
171 *
172 * Call get_log_header() to get a log header for a segment, but if the
173 * segment is bad, either scan forward or backward until we find a good one.
174 *
175 * Returns: errno
176 */
177
178static int find_good_lh(struct gfs2_jdesc *jd, unsigned int *blk,
179 struct gfs2_log_header *head)
180{
181 unsigned int orig_blk = *blk;
182 int error;
183
184 for (;;) {
185 error = get_log_header(jd, *blk, head);
186 if (error <= 0)
187 return error;
188
189 if (++*blk == jd->jd_blocks)
190 *blk = 0;
191
192 if (*blk == orig_blk) {
193 gfs2_consist_inode(GFS2_I(jd->jd_inode));
194 return -EIO;
195 }
196 }
197}
198
199/**
200 * jhead_scan - make sure we've found the head of the log
201 * @jd: the journal
202 * @head: this is filled in with the log descriptor of the head
203 *
204 * At this point, head should point either at the head of the log or just
205 * before it. Scan forward until we find the head.
206 *
207 * Returns: errno
208 */
209
210static int jhead_scan(struct gfs2_jdesc *jd, struct gfs2_log_header *head)
211{
212 unsigned int blk = head->lh_blkno;
213 struct gfs2_log_header lh;
214 int error;
215
216 for (;;) {
217 if (++blk == jd->jd_blocks)
218 blk = 0;
219
220 error = get_log_header(jd, blk, &lh);
221 if (error < 0)
222 return error;
223 if (error == 1)
224 continue;
225
226 if (lh.lh_sequence == head->lh_sequence) {
227 gfs2_consist_inode(GFS2_I(jd->jd_inode));
228 return -EIO;
229 }
230 if (lh.lh_sequence < head->lh_sequence)
231 break;
232
233 *head = lh;
234 }
235
236 return 0;
237}
238
239/**
240 * gfs2_find_jhead - find the head of a log
241 * @jd: the journal
242 * @head: the log descriptor for the head of the log is returned here
243 *
244 * Do a binary search of a journal and find the valid log entry with the
245 * highest sequence number. (i.e. the log head)
246 *
247 * Returns: errno
248 */
249
250int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header *head)
251{
252 struct gfs2_log_header lh_1, lh_m;
253 uint32_t blk_1, blk_2, blk_m;
254 int error;
255
256 blk_1 = 0;
257 blk_2 = jd->jd_blocks - 1;
258
259 for (;;) {
260 blk_m = (blk_1 + blk_2) / 2;
261
262 error = find_good_lh(jd, &blk_1, &lh_1);
263 if (error)
264 return error;
265
266 error = find_good_lh(jd, &blk_m, &lh_m);
267 if (error)
268 return error;
269
270 if (blk_1 == blk_m || blk_m == blk_2)
271 break;
272
273 if (lh_1.lh_sequence <= lh_m.lh_sequence)
274 blk_1 = blk_m;
275 else
276 blk_2 = blk_m;
277 }
278
279 error = jhead_scan(jd, &lh_1);
280 if (error)
281 return error;
282
283 *head = lh_1;
284
285 return error;
286}
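A worked example of the search above, with illustrative sequence numbers: if the per-block log headers carry sequences 7 8 9 10 4 5 6, the head is the block holding 10. Sequences rise up to the head and then drop, so when the sequence at blk_1 is less than or equal to the sequence at the midpoint, the head lies at or after the midpoint and blk_1 moves up to it; otherwise the head is before the midpoint and blk_2 moves down. Once the window closes, jhead_scan() walks forward from blk_1 to the exact head.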
287
288/**
289 * foreach_descriptor - go through the active part of the log
290 * @jd: the journal
291 * @start: the first log header in the active region
292 * @end: the last log header (don't process the contents of this entry)
293 *
294 * Call a given function once for every log descriptor in the active
295 * portion of the log.
296 *
297 * Returns: errno
298 */
299
300static int foreach_descriptor(struct gfs2_jdesc *jd, unsigned int start,
301 unsigned int end, int pass)
302{
303 struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
304 struct buffer_head *bh;
305 struct gfs2_log_descriptor *ld;
306 int error = 0;
307 u32 length;
308 __be64 *ptr;
309 unsigned int offset = sizeof(struct gfs2_log_descriptor);
310 offset += (sizeof(__be64)-1);
311 offset &= ~(sizeof(__be64)-1);
312
313 while (start != end) {
314 error = gfs2_replay_read_block(jd, start, &bh);
315 if (error)
316 return error;
317 if (gfs2_meta_check(sdp, bh)) {
318 brelse(bh);
319 return -EIO;
320 }
321 ld = (struct gfs2_log_descriptor *)bh->b_data;
322 length = be32_to_cpu(ld->ld_length);
323
324 if (be32_to_cpu(ld->ld_header.mh_type) == GFS2_METATYPE_LH) {
325 struct gfs2_log_header lh;
326 error = get_log_header(jd, start, &lh);
327 if (!error) {
328 gfs2_replay_incr_blk(sdp, &start);
329 continue;
330 }
331 if (error == 1) {
332 gfs2_consist_inode(GFS2_I(jd->jd_inode));
333 error = -EIO;
334 }
335 brelse(bh);
336 return error;
337 } else if (gfs2_metatype_check(sdp, bh, GFS2_METATYPE_LD)) {
338 brelse(bh);
339 return -EIO;
340 }
341 ptr = (__be64 *)(bh->b_data + offset);
342 error = lops_scan_elements(jd, start, ld, ptr, pass);
343 if (error) {
344 brelse(bh);
345 return error;
346 }
347
348 while (length--)
349 gfs2_replay_incr_blk(sdp, &start);
350
351 brelse(bh);
352 }
353
354 return 0;
355}
356
357/**
358 * clean_journal - mark a dirty journal as being clean
359 * @sdp: the filesystem
360 * @jd: the journal
361 * @gl: the journal's glock
362 * @head: the head journal to start from
363 *
364 * Returns: errno
365 */
366
367static int clean_journal(struct gfs2_jdesc *jd, struct gfs2_log_header *head)
368{
369 struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
370 struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
371 unsigned int lblock;
372 int new = 0;
373 uint64_t dblock;
374 struct gfs2_log_header *lh;
375 uint32_t hash;
376 struct buffer_head *bh;
377 int error;
378 int boundary;
379
380 lblock = head->lh_blkno;
381 gfs2_replay_incr_blk(sdp, &lblock);
382 error = gfs2_block_map(&ip->i_inode, lblock, &new, &dblock, &boundary);
383 if (error)
384 return error;
385 if (!dblock) {
386 gfs2_consist_inode(ip);
387 return -EIO;
388 }
389
390 bh = sb_getblk(sdp->sd_vfs, dblock);
391 lock_buffer(bh);
392 memset(bh->b_data, 0, bh->b_size);
393 set_buffer_uptodate(bh);
394 clear_buffer_dirty(bh);
395 unlock_buffer(bh);
396
397 lh = (struct gfs2_log_header *)bh->b_data;
398 memset(lh, 0, sizeof(struct gfs2_log_header));
399 lh->lh_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
400 lh->lh_header.mh_type = cpu_to_be16(GFS2_METATYPE_LH);
401 lh->lh_header.mh_format = cpu_to_be32(GFS2_FORMAT_LH);
402 lh->lh_sequence = cpu_to_be64(head->lh_sequence + 1);
403 lh->lh_flags = cpu_to_be32(GFS2_LOG_HEAD_UNMOUNT);
404 lh->lh_blkno = cpu_to_be32(lblock);
405 hash = gfs2_disk_hash((const char *)lh, sizeof(struct gfs2_log_header));
406 lh->lh_hash = cpu_to_be32(hash);
407
408 set_buffer_dirty(bh);
409 if (sync_dirty_buffer(bh))
410 gfs2_io_error_bh(sdp, bh);
411 brelse(bh);
412
413 return error;
414}
415
416/**
417 * gfs2_recover_journal - recover a given journal
418 * @jd: the struct gfs2_jdesc describing the journal
419 *
420 * Acquire the journal's lock, check to see if the journal is clean, and
421 * do recovery if necessary.
422 *
423 * Returns: errno
424 */
425
426int gfs2_recover_journal(struct gfs2_jdesc *jd)
427{
428 struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
429 struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
430 struct gfs2_log_header head;
431 struct gfs2_holder j_gh, ji_gh, t_gh;
432 unsigned long t;
433 int ro = 0;
434 unsigned int pass;
435 int error;
436
437 if (jd->jd_jid != sdp->sd_lockstruct.ls_jid) {
438 fs_info(sdp, "jid=%u: Trying to acquire journal lock...\n",
439 jd->jd_jid);
440
441 /* Acquire the journal lock so we can do recovery */
442
443 error = gfs2_glock_nq_num(sdp, jd->jd_jid, &gfs2_journal_glops,
444 LM_ST_EXCLUSIVE,
445 LM_FLAG_NOEXP | LM_FLAG_TRY | GL_NOCACHE,
446 &j_gh);
447 switch (error) {
448 case 0:
449 break;
450
451 case GLR_TRYFAILED:
452 fs_info(sdp, "jid=%u: Busy\n", jd->jd_jid);
453 error = 0;
454
455 default:
456 goto fail;
457 };
458
459 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED,
460 LM_FLAG_NOEXP, &ji_gh);
461 if (error)
462 goto fail_gunlock_j;
463 } else {
464 fs_info(sdp, "jid=%u, already locked for use\n", jd->jd_jid);
465 }
466
467 fs_info(sdp, "jid=%u: Looking at journal...\n", jd->jd_jid);
468
469 error = gfs2_jdesc_check(jd);
470 if (error)
471 goto fail_gunlock_ji;
472
473 error = gfs2_find_jhead(jd, &head);
474 if (error)
475 goto fail_gunlock_ji;
476
477 if (!(head.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) {
478 fs_info(sdp, "jid=%u: Acquiring the transaction lock...\n",
479 jd->jd_jid);
480
481 t = jiffies;
482
483 /* Acquire a shared hold on the transaction lock */
484
485 error = gfs2_glock_nq_init(sdp->sd_trans_gl,
486 LM_ST_SHARED,
487 LM_FLAG_NOEXP | LM_FLAG_PRIORITY |
488 GL_NOCANCEL | GL_NOCACHE,
489 &t_gh);
490 if (error)
491 goto fail_gunlock_ji;
492
493 if (test_bit(SDF_JOURNAL_CHECKED, &sdp->sd_flags)) {
494 if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags))
495 ro = 1;
496 } else {
497 if (sdp->sd_vfs->s_flags & MS_RDONLY)
498 ro = 1;
499 }
500
501 if (ro) {
502 fs_warn(sdp, "jid=%u: Can't replay: read-only FS\n",
503 jd->jd_jid);
504 error = -EROFS;
505 goto fail_gunlock_tr;
506 }
507
508 fs_info(sdp, "jid=%u: Replaying journal...\n", jd->jd_jid);
509
510 for (pass = 0; pass < 2; pass++) {
511 lops_before_scan(jd, &head, pass);
512 error = foreach_descriptor(jd, head.lh_tail,
513 head.lh_blkno, pass);
514 lops_after_scan(jd, error, pass);
515 if (error)
516 goto fail_gunlock_tr;
517 }
518
519 error = clean_journal(jd, &head);
520 if (error)
521 goto fail_gunlock_tr;
522
523 gfs2_glock_dq_uninit(&t_gh);
524 t = DIV_ROUND_UP(jiffies - t, HZ);
525 fs_info(sdp, "jid=%u: Journal replayed in %lus\n",
526 jd->jd_jid, t);
527 }
528
529 if (jd->jd_jid != sdp->sd_lockstruct.ls_jid)
530 gfs2_glock_dq_uninit(&ji_gh);
531
532 gfs2_lm_recovery_done(sdp, jd->jd_jid, LM_RD_SUCCESS);
533
534 if (jd->jd_jid != sdp->sd_lockstruct.ls_jid)
535 gfs2_glock_dq_uninit(&j_gh);
536
537 fs_info(sdp, "jid=%u: Done\n", jd->jd_jid);
538 return 0;
539
540fail_gunlock_tr:
541 gfs2_glock_dq_uninit(&t_gh);
542fail_gunlock_ji:
543 if (jd->jd_jid != sdp->sd_lockstruct.ls_jid) {
544 gfs2_glock_dq_uninit(&ji_gh);
545fail_gunlock_j:
546 gfs2_glock_dq_uninit(&j_gh);
547 }
548
549 fs_info(sdp, "jid=%u: %s\n", jd->jd_jid, (error) ? "Failed" : "Done");
550
551fail:
552 gfs2_lm_recovery_done(sdp, jd->jd_jid, LM_RD_GAVEUP);
553 return error;
554}
555
556/**
557 * gfs2_check_journals - Recover any dirty journals
558 * @sdp: the filesystem
559 *
560 */
561
562void gfs2_check_journals(struct gfs2_sbd *sdp)
563{
564 struct gfs2_jdesc *jd;
565
566 for (;;) {
567 jd = gfs2_jdesc_find_dirty(sdp);
568 if (!jd)
569 break;
570
571 if (jd != sdp->sd_jdesc)
572 gfs2_recover_journal(jd);
573 }
574}
575
diff --git a/fs/gfs2/recovery.h b/fs/gfs2/recovery.h
new file mode 100644
index 000000000000..ac0f1d6ce456
--- /dev/null
+++ b/fs/gfs2/recovery.h
@@ -0,0 +1,32 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __RECOVERY_DOT_H__
11#define __RECOVERY_DOT_H__
12
13static inline void gfs2_replay_incr_blk(struct gfs2_sbd *sdp, unsigned int *blk)
14{
15 if (++*blk == sdp->sd_jdesc->jd_blocks)
16 *blk = 0;
17}
18
19int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk,
20 struct buffer_head **bh);
21
22int gfs2_revoke_add(struct gfs2_sbd *sdp, uint64_t blkno, unsigned int where);
23int gfs2_revoke_check(struct gfs2_sbd *sdp, uint64_t blkno, unsigned int where);
24void gfs2_revoke_clean(struct gfs2_sbd *sdp);
25
26int gfs2_find_jhead(struct gfs2_jdesc *jd,
27 struct gfs2_log_header *head);
28int gfs2_recover_journal(struct gfs2_jdesc *gfs2_jd);
29void gfs2_check_journals(struct gfs2_sbd *sdp);
30
31#endif /* __RECOVERY_DOT_H__ */
32
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
new file mode 100644
index 000000000000..65eea0b88bf7
--- /dev/null
+++ b/fs/gfs2/rgrp.c
@@ -0,0 +1,1529 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/fs.h>
16#include <linux/gfs2_ondisk.h>
17
18#include "gfs2.h"
19#include "lm_interface.h"
20#include "incore.h"
21#include "glock.h"
22#include "glops.h"
23#include "lops.h"
24#include "meta_io.h"
25#include "quota.h"
26#include "rgrp.h"
27#include "super.h"
28#include "trans.h"
29#include "ops_file.h"
30#include "util.h"
31
32#define BFITNOENT 0xFFFFFFFF
33
34/*
35 * These routines are used by the resource group routines (rgrp.c)
36 * to keep track of block allocation. Each block is represented by two
37 * bits. So, each byte represents GFS2_NBBY (i.e. 4) blocks.
38 *
39 * 0 = Free
40 * 1 = Used (not metadata)
41 * 2 = Unlinked (still in use) inode
42 * 3 = Used (metadata)
43 */
44
45static const char valid_change[16] = {
46 /* current */
47 /* n */ 0, 1, 1, 1,
48 /* e */ 1, 0, 0, 0,
49 /* w */ 0, 0, 0, 1,
50 1, 0, 0, 0
51};
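To make the two-bit encoding described above concrete, here is a minimal standalone sketch (ordinary userspace C, not part of the patch; the constants are redefined locally rather than taken from gfs2_ondisk.h) that decodes a single bitmap byte into its four block states:

#include <stdio.h>

#define NBBY_GFS2  4            /* blocks per byte, as GFS2_NBBY */
#define BIT_SIZE   2            /* bits per block, as GFS2_BIT_SIZE */
#define BIT_MASK   0x3

int main(void)
{
	unsigned char byte = 0xD4;      /* binary 11 01 01 00 */
	unsigned int block;

	for (block = 0; block < NBBY_GFS2; block++) {
		unsigned int bit = block * BIT_SIZE;
		unsigned int state = (byte >> bit) & BIT_MASK;

		printf("block %u: state %u\n", block, state);
	}
	return 0;	/* prints states 0 (free), 1 (used), 1 (used), 3 (metadata) */
}

gfs2_setbit() and gfs2_testbit() below use the same shift/mask arithmetic, and gfs2_setbit() additionally consults the valid_change table above before accepting a state transition.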
52
53/**
54 * gfs2_setbit - Set a bit in the bitmaps
55 * @buffer: the buffer that holds the bitmaps
56 * @buflen: the length (in bytes) of the buffer
57 * @block: the block to set
58 * @new_state: the new state of the block
59 *
60 */
61
62static void gfs2_setbit(struct gfs2_rgrpd *rgd, unsigned char *buffer,
63 unsigned int buflen, uint32_t block,
64 unsigned char new_state)
65{
66 unsigned char *byte, *end, cur_state;
67 unsigned int bit;
68
69 byte = buffer + (block / GFS2_NBBY);
70 bit = (block % GFS2_NBBY) * GFS2_BIT_SIZE;
71 end = buffer + buflen;
72
73 gfs2_assert(rgd->rd_sbd, byte < end);
74
75 cur_state = (*byte >> bit) & GFS2_BIT_MASK;
76
77 if (valid_change[new_state * 4 + cur_state]) {
78 *byte ^= cur_state << bit;
79 *byte |= new_state << bit;
80 } else
81 gfs2_consist_rgrpd(rgd);
82}
83
84/**
85 * gfs2_testbit - test a bit in the bitmaps
86 * @buffer: the buffer that holds the bitmaps
87 * @buflen: the length (in bytes) of the buffer
88 * @block: the block to read
89 *
90 */
91
92static unsigned char gfs2_testbit(struct gfs2_rgrpd *rgd, unsigned char *buffer,
93 unsigned int buflen, uint32_t block)
94{
95 unsigned char *byte, *end, cur_state;
96 unsigned int bit;
97
98 byte = buffer + (block / GFS2_NBBY);
99 bit = (block % GFS2_NBBY) * GFS2_BIT_SIZE;
100 end = buffer + buflen;
101
102 gfs2_assert(rgd->rd_sbd, byte < end);
103
104 cur_state = (*byte >> bit) & GFS2_BIT_MASK;
105
106 return cur_state;
107}
108
109/**
110 * gfs2_bitfit - Search an rgrp's bitmap buffer to find a bit-pair representing
111 * a block in a given allocation state.
112 * @buffer: the buffer that holds the bitmaps
113 * @buflen: the length (in bytes) of the buffer
114 * @goal: start search at this block's bit-pair (within @buffer)
115 * @old_state: GFS2_BLKST_XXX the state of the block we're looking for;
116 * bit 0 = alloc(1)/free(0), bit 1 = meta(1)/data(0)
117 *
118 * Scope of @goal and returned block number is only within this bitmap buffer,
119 * not entire rgrp or filesystem. @buffer will be offset from the actual
120 * beginning of a bitmap block buffer, skipping any header structures.
121 *
122 * Returns: the block number (bitmap buffer scope) that was found
123 */
124
125static uint32_t gfs2_bitfit(struct gfs2_rgrpd *rgd, unsigned char *buffer,
126 unsigned int buflen, uint32_t goal,
127 unsigned char old_state)
128{
129 unsigned char *byte, *end, alloc;
130 uint32_t blk = goal;
131 unsigned int bit;
132
133 byte = buffer + (goal / GFS2_NBBY);
134 bit = (goal % GFS2_NBBY) * GFS2_BIT_SIZE;
135 end = buffer + buflen;
136 alloc = (old_state & 1) ? 0 : 0x55;
137
138 while (byte < end) {
139 if ((*byte & 0x55) == alloc) {
140 blk += (8 - bit) >> 1;
141
142 bit = 0;
143 byte++;
144
145 continue;
146 }
147
148 if (((*byte >> bit) & GFS2_BIT_MASK) == old_state)
149 return blk;
150
151 bit += GFS2_BIT_SIZE;
152 if (bit >= 8) {
153 bit = 0;
154 byte++;
155 }
156
157 blk++;
158 }
159
160 return BFITNOENT;
161}
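The scan above is easier to follow in a simplified, self-contained form. The sketch below is illustrative only: it drops the 0x55 fast-skip of fully allocated bytes, redefines the constants locally, and searches a two-byte bitmap for the first bit-pair matching old_state.

#include <stdio.h>
#include <stdint.h>

#define NBBY_GFS2 4
#define BIT_SIZE  2
#define BIT_MASK  0x3
#define NOENT     0xFFFFFFFFu   /* stands in for BFITNOENT */

static uint32_t bitfit(const unsigned char *buffer, unsigned int buflen,
		       uint32_t goal, unsigned char old_state)
{
	const unsigned char *byte = buffer + (goal / NBBY_GFS2);
	const unsigned char *end = buffer + buflen;
	unsigned int bit = (goal % NBBY_GFS2) * BIT_SIZE;
	uint32_t blk = goal;

	while (byte < end) {
		if (((*byte >> bit) & BIT_MASK) == old_state)
			return blk;
		bit += BIT_SIZE;
		if (bit >= 8) {
			bit = 0;
			byte++;
		}
		blk++;
	}
	return NOENT;
}

int main(void)
{
	unsigned char bits[2] = { 0x55, 0x15 }; /* blocks 0-6 used, block 7 free */

	printf("first free block: %u\n", bitfit(bits, 2, 3, 0)); /* prints 7 */
	return 0;
}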
162
163/**
164 * gfs2_bitcount - count the number of bits in a certain state
165 * @buffer: the buffer that holds the bitmaps
166 * @buflen: the length (in bytes) of the buffer
167 * @state: the state of the block we're looking for
168 *
169 * Returns: The number of bits
170 */
171
172static uint32_t gfs2_bitcount(struct gfs2_rgrpd *rgd, unsigned char *buffer,
173 unsigned int buflen, unsigned char state)
174{
175 unsigned char *byte = buffer;
176 unsigned char *end = buffer + buflen;
177 unsigned char state1 = state << 2;
178 unsigned char state2 = state << 4;
179 unsigned char state3 = state << 6;
180 uint32_t count = 0;
181
182 for (; byte < end; byte++) {
183 if (((*byte) & 0x03) == state)
184 count++;
185 if (((*byte) & 0x0C) == state1)
186 count++;
187 if (((*byte) & 0x30) == state2)
188 count++;
189 if (((*byte) & 0xC0) == state3)
190 count++;
191 }
192
193 return count;
194}
195
196/**
197 * gfs2_rgrp_verify - Verify that a resource group is consistent
198 * @rgd: the resource group to verify
199 *
200 *
201 */
202
203void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd)
204{
205 struct gfs2_sbd *sdp = rgd->rd_sbd;
206 struct gfs2_bitmap *bi = NULL;
207 uint32_t length = rgd->rd_ri.ri_length;
208 uint32_t count[4], tmp;
209 int buf, x;
210
211 memset(count, 0, 4 * sizeof(uint32_t));
212
213 /* Count # blocks in each of 4 possible allocation states */
214 for (buf = 0; buf < length; buf++) {
215 bi = rgd->rd_bits + buf;
216 for (x = 0; x < 4; x++)
217 count[x] += gfs2_bitcount(rgd,
218 bi->bi_bh->b_data +
219 bi->bi_offset,
220 bi->bi_len, x);
221 }
222
223 if (count[0] != rgd->rd_rg.rg_free) {
224 if (gfs2_consist_rgrpd(rgd))
225 fs_err(sdp, "free data mismatch: %u != %u\n",
226 count[0], rgd->rd_rg.rg_free);
227 return;
228 }
229
230 tmp = rgd->rd_ri.ri_data -
231 rgd->rd_rg.rg_free -
232 rgd->rd_rg.rg_dinodes;
233 if (count[1] + count[2] != tmp) {
234 if (gfs2_consist_rgrpd(rgd))
235 fs_err(sdp, "used data mismatch: %u != %u\n",
236 count[1], tmp);
237 return;
238 }
239
240 if (count[3] != rgd->rd_rg.rg_dinodes) {
241 if (gfs2_consist_rgrpd(rgd))
242 fs_err(sdp, "used metadata mismatch: %u != %u\n",
243 count[3], rgd->rd_rg.rg_dinodes);
244 return;
245 }
246
247 if (count[2] > count[3]) {
248 if (gfs2_consist_rgrpd(rgd))
249 fs_err(sdp, "unlinked inodes > inodes: %u\n",
250 count[2]);
251 return;
252 }
253
254}
255
256static inline int rgrp_contains_block(struct gfs2_rindex *ri, uint64_t block)
257{
258 uint64_t first = ri->ri_data0;
259 uint64_t last = first + ri->ri_data;
260 return !!(first <= block && block < last);
261}
262
263/**
264 * gfs2_blk2rgrpd - Find resource group for a given data/meta block number
265 * @sdp: The GFS2 superblock
266 * @blk: The data block number
267 *
268 * Returns: The resource group, or NULL if not found
269 */
270
271struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, uint64_t blk)
272{
273 struct gfs2_rgrpd *rgd;
274
275 spin_lock(&sdp->sd_rindex_spin);
276
277 list_for_each_entry(rgd, &sdp->sd_rindex_mru_list, rd_list_mru) {
278 if (rgrp_contains_block(&rgd->rd_ri, blk)) {
279 list_move(&rgd->rd_list_mru, &sdp->sd_rindex_mru_list);
280 spin_unlock(&sdp->sd_rindex_spin);
281 return rgd;
282 }
283 }
284
285 spin_unlock(&sdp->sd_rindex_spin);
286
287 return NULL;
288}
289
290/**
291 * gfs2_rgrpd_get_first - get the first Resource Group in the filesystem
292 * @sdp: The GFS2 superblock
293 *
294 * Returns: The first rgrp in the filesystem
295 */
296
297struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp)
298{
299 gfs2_assert(sdp, !list_empty(&sdp->sd_rindex_list));
300 return list_entry(sdp->sd_rindex_list.next, struct gfs2_rgrpd, rd_list);
301}
302
303/**
304 * gfs2_rgrpd_get_next - get the next RG
305 * @rgd: A RG
306 *
307 * Returns: The next rgrp
308 */
309
310struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd)
311{
312 if (rgd->rd_list.next == &rgd->rd_sbd->sd_rindex_list)
313 return NULL;
314 return list_entry(rgd->rd_list.next, struct gfs2_rgrpd, rd_list);
315}
316
317static void clear_rgrpdi(struct gfs2_sbd *sdp)
318{
319 struct list_head *head;
320 struct gfs2_rgrpd *rgd;
321 struct gfs2_glock *gl;
322
323 spin_lock(&sdp->sd_rindex_spin);
324 sdp->sd_rindex_forward = NULL;
325 head = &sdp->sd_rindex_recent_list;
326 while (!list_empty(head)) {
327 rgd = list_entry(head->next, struct gfs2_rgrpd, rd_recent);
328 list_del(&rgd->rd_recent);
329 }
330 spin_unlock(&sdp->sd_rindex_spin);
331
332 head = &sdp->sd_rindex_list;
333 while (!list_empty(head)) {
334 rgd = list_entry(head->next, struct gfs2_rgrpd, rd_list);
335 gl = rgd->rd_gl;
336
337 list_del(&rgd->rd_list);
338 list_del(&rgd->rd_list_mru);
339
340 if (gl) {
341 gl->gl_object = NULL;
342 gfs2_glock_put(gl);
343 }
344
345 kfree(rgd->rd_bits);
346 kfree(rgd);
347 }
348}
349
350void gfs2_clear_rgrpd(struct gfs2_sbd *sdp)
351{
352 mutex_lock(&sdp->sd_rindex_mutex);
353 clear_rgrpdi(sdp);
354 mutex_unlock(&sdp->sd_rindex_mutex);
355}
356
357/**
358 * compute_bitstructs - Compute the bitmap sizes
359 * @rgd: The resource group descriptor
360 *
361 * Calculates bitmap descriptors, one for each block that contains bitmap data
362 *
363 * Returns: errno
364 */
365
366static int compute_bitstructs(struct gfs2_rgrpd *rgd)
367{
368 struct gfs2_sbd *sdp = rgd->rd_sbd;
369 struct gfs2_bitmap *bi;
370 uint32_t length = rgd->rd_ri.ri_length; /* # blocks in hdr & bitmap */
371 uint32_t bytes_left, bytes;
372 int x;
373
374 if (!length)
375 return -EINVAL;
376
377 rgd->rd_bits = kcalloc(length, sizeof(struct gfs2_bitmap), GFP_KERNEL);
378 if (!rgd->rd_bits)
379 return -ENOMEM;
380
381 bytes_left = rgd->rd_ri.ri_bitbytes;
382
383 for (x = 0; x < length; x++) {
384 bi = rgd->rd_bits + x;
385
386 /* small rgrp; bitmap stored completely in header block */
387 if (length == 1) {
388 bytes = bytes_left;
389 bi->bi_offset = sizeof(struct gfs2_rgrp);
390 bi->bi_start = 0;
391 bi->bi_len = bytes;
392 /* header block */
393 } else if (x == 0) {
394 bytes = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_rgrp);
395 bi->bi_offset = sizeof(struct gfs2_rgrp);
396 bi->bi_start = 0;
397 bi->bi_len = bytes;
398 /* last block */
399 } else if (x + 1 == length) {
400 bytes = bytes_left;
401 bi->bi_offset = sizeof(struct gfs2_meta_header);
402 bi->bi_start = rgd->rd_ri.ri_bitbytes - bytes_left;
403 bi->bi_len = bytes;
404 /* other blocks */
405 } else {
406 bytes = sdp->sd_sb.sb_bsize -
407 sizeof(struct gfs2_meta_header);
408 bi->bi_offset = sizeof(struct gfs2_meta_header);
409 bi->bi_start = rgd->rd_ri.ri_bitbytes - bytes_left;
410 bi->bi_len = bytes;
411 }
412
413 bytes_left -= bytes;
414 }
415
416 if (bytes_left) {
417 gfs2_consist_rgrpd(rgd);
418 return -EIO;
419 }
420 bi = rgd->rd_bits + (length - 1);
421 if ((bi->bi_start + bi->bi_len) * GFS2_NBBY != rgd->rd_ri.ri_data) {
422 if (gfs2_consist_rgrpd(rgd)) {
423 gfs2_rindex_print(&rgd->rd_ri);
424 fs_err(sdp, "start=%u len=%u offset=%u\n",
425 bi->bi_start, bi->bi_len, bi->bi_offset);
426 }
427 return -EIO;
428 }
429
430 return 0;
431}
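A small worked example makes the four cases above easier to check. This standalone sketch uses made-up sizes (a 4096-byte block, a 128-byte rgrp header, a 24-byte meta header and 9000 bytes of bitmap data; the real sizes come from the on-disk structures) and splits the bitmap across three blocks the same way compute_bitstructs() does:

#include <stdio.h>

int main(void)
{
	unsigned int bsize = 4096, rgrp_hdr = 128, meta_hdr = 24;
	unsigned int bitbytes = 9000, length = 3;	/* bitmap blocks */
	unsigned int bytes_left = bitbytes, x;

	for (x = 0; x < length; x++) {
		unsigned int offset, bytes;

		if (length == 1) {		/* whole bitmap in the header block */
			offset = rgrp_hdr;
			bytes = bytes_left;
		} else if (x == 0) {		/* header block */
			offset = rgrp_hdr;
			bytes = bsize - rgrp_hdr;
		} else if (x + 1 == length) {	/* last block */
			offset = meta_hdr;
			bytes = bytes_left;
		} else {			/* middle blocks */
			offset = meta_hdr;
			bytes = bsize - meta_hdr;
		}
		printf("block %u: bi_start=%u bi_len=%u bi_offset=%u\n",
		       x, bitbytes - bytes_left, bytes, offset);
		bytes_left -= bytes;
	}
	return 0;	/* prints 0/3968/128, 3968/4072/24, 8040/960/24 */
}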
432
433/**
434 * gfs2_ri_update - Pull in a new resource index from the disk
435 * @ip: pointer to the rindex inode
436 *
437 * Returns: 0 on successful update, error code otherwise
438 */
439
440static int gfs2_ri_update(struct gfs2_inode *ip)
441{
442 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
443 struct inode *inode = &ip->i_inode;
444 struct gfs2_rgrpd *rgd;
445 char buf[sizeof(struct gfs2_rindex)];
446 struct file_ra_state ra_state;
447 uint64_t junk = ip->i_di.di_size;
448 int error;
449
450 if (do_div(junk, sizeof(struct gfs2_rindex))) {
451 gfs2_consist_inode(ip);
452 return -EIO;
453 }
454
455 clear_rgrpdi(sdp);
456
457 file_ra_state_init(&ra_state, inode->i_mapping);
458 for (sdp->sd_rgrps = 0;; sdp->sd_rgrps++) {
459 loff_t pos = sdp->sd_rgrps * sizeof(struct gfs2_rindex);
460 error = gfs2_internal_read(ip, &ra_state, buf, &pos,
461 sizeof(struct gfs2_rindex));
462 if (!error)
463 break;
464 if (error != sizeof(struct gfs2_rindex)) {
465 if (error > 0)
466 error = -EIO;
467 goto fail;
468 }
469
470 rgd = kzalloc(sizeof(struct gfs2_rgrpd), GFP_KERNEL);
471 error = -ENOMEM;
472 if (!rgd)
473 goto fail;
474
475 mutex_init(&rgd->rd_mutex);
476 lops_init_le(&rgd->rd_le, &gfs2_rg_lops);
477 rgd->rd_sbd = sdp;
478
479 list_add_tail(&rgd->rd_list, &sdp->sd_rindex_list);
480 list_add_tail(&rgd->rd_list_mru, &sdp->sd_rindex_mru_list);
481
482 gfs2_rindex_in(&rgd->rd_ri, buf);
483 error = compute_bitstructs(rgd);
484 if (error)
485 goto fail;
486
487 error = gfs2_glock_get(sdp, rgd->rd_ri.ri_addr,
488 &gfs2_rgrp_glops, CREATE, &rgd->rd_gl);
489 if (error)
490 goto fail;
491
492 rgd->rd_gl->gl_object = rgd;
493 rgd->rd_rg_vn = rgd->rd_gl->gl_vn - 1;
494 }
495
496 sdp->sd_rindex_vn = ip->i_gl->gl_vn;
497 return 0;
498
499fail:
500 clear_rgrpdi(sdp);
501 return error;
502}
503
504/**
505 * gfs2_rindex_hold - Grab a lock on the rindex
506 * @sdp: The GFS2 superblock
507 * @ri_gh: the glock holder
508 *
509 * We grab a lock on the rindex inode to make sure that it doesn't
510 * change whilst we are performing an operation. We keep this lock
511 * for quite long periods of time compared to other locks. This
512 * doesn't matter, since it is shared and it is very, very rarely
513 * accessed in the exclusive mode (i.e. only when expanding the filesystem).
514 *
515 * This makes sure that we're using the latest copy of the resource index
516 * special file, which might have been updated if someone expanded the
517 * filesystem (via gfs2_grow utility), which adds new resource groups.
518 *
519 * Returns: 0 on success, error code otherwise
520 */
521
522int gfs2_rindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ri_gh)
523{
524 struct gfs2_inode *ip = GFS2_I(sdp->sd_rindex);
525 struct gfs2_glock *gl = ip->i_gl;
526 int error;
527
528 error = gfs2_glock_nq_init(gl, LM_ST_SHARED, 0, ri_gh);
529 if (error)
530 return error;
531
532 /* Read new copy from disk if we don't have the latest */
533 if (sdp->sd_rindex_vn != gl->gl_vn) {
534 mutex_lock(&sdp->sd_rindex_mutex);
535 if (sdp->sd_rindex_vn != gl->gl_vn) {
536 error = gfs2_ri_update(ip);
537 if (error)
538 gfs2_glock_dq_uninit(ri_gh);
539 }
540 mutex_unlock(&sdp->sd_rindex_mutex);
541 }
542
543 return error;
544}
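The shape of gfs2_rindex_hold() is a double-checked update: test the version number, take the mutex, test again, and only then do the expensive re-read, so concurrent callers refresh the index at most once. A minimal userspace sketch of that pattern (hypothetical names, nothing GFS2-specific):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t update_mutex = PTHREAD_MUTEX_INITIALIZER;
static unsigned int cached_version;	/* version of our in-memory copy */
static unsigned int latest_version = 5;	/* version the lock says is current */

static void expensive_refresh(void)
{
	printf("re-reading the index (version %u)\n", latest_version);
	cached_version = latest_version;
}

static void hold_index(void)
{
	if (cached_version != latest_version) {		/* cheap first check */
		pthread_mutex_lock(&update_mutex);
		if (cached_version != latest_version)	/* re-check under mutex */
			expensive_refresh();
		pthread_mutex_unlock(&update_mutex);
	}
}

int main(void)
{
	hold_index();	/* performs the refresh */
	hold_index();	/* finds the cache already up to date */
	return 0;
}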
545
546/**
547 * gfs2_rgrp_bh_get - Read in a RG's header and bitmaps
548 * @rgd: the struct gfs2_rgrpd describing the RG to read in
549 *
550 * Read in all of a Resource Group's header and bitmap blocks.
551 * Caller must eventually call gfs2_rgrp_bh_put() to release the bitmaps.
552 *
553 * Returns: errno
554 */
555
556int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd)
557{
558 struct gfs2_sbd *sdp = rgd->rd_sbd;
559 struct gfs2_glock *gl = rgd->rd_gl;
560 unsigned int length = rgd->rd_ri.ri_length;
561 struct gfs2_bitmap *bi;
562 unsigned int x, y;
563 int error;
564
565 mutex_lock(&rgd->rd_mutex);
566
567 spin_lock(&sdp->sd_rindex_spin);
568 if (rgd->rd_bh_count) {
569 rgd->rd_bh_count++;
570 spin_unlock(&sdp->sd_rindex_spin);
571 mutex_unlock(&rgd->rd_mutex);
572 return 0;
573 }
574 spin_unlock(&sdp->sd_rindex_spin);
575
576 for (x = 0; x < length; x++) {
577 bi = rgd->rd_bits + x;
578 error = gfs2_meta_read(gl, rgd->rd_ri.ri_addr + x, DIO_START,
579 &bi->bi_bh);
580 if (error)
581 goto fail;
582 }
583
584 for (y = length; y--;) {
585 bi = rgd->rd_bits + y;
586 error = gfs2_meta_reread(sdp, bi->bi_bh, DIO_WAIT);
587 if (error)
588 goto fail;
589 if (gfs2_metatype_check(sdp, bi->bi_bh, y ? GFS2_METATYPE_RB :
590 GFS2_METATYPE_RG)) {
591 error = -EIO;
592 goto fail;
593 }
594 }
595
596 if (rgd->rd_rg_vn != gl->gl_vn) {
597 gfs2_rgrp_in(&rgd->rd_rg, (rgd->rd_bits[0].bi_bh)->b_data);
598 rgd->rd_rg_vn = gl->gl_vn;
599 }
600
601 spin_lock(&sdp->sd_rindex_spin);
602 rgd->rd_free_clone = rgd->rd_rg.rg_free;
603 rgd->rd_bh_count++;
604 spin_unlock(&sdp->sd_rindex_spin);
605
606 mutex_unlock(&rgd->rd_mutex);
607
608 return 0;
609
610fail:
611 while (x--) {
612 bi = rgd->rd_bits + x;
613 brelse(bi->bi_bh);
614 bi->bi_bh = NULL;
615 gfs2_assert_warn(sdp, !bi->bi_clone);
616 }
617 mutex_unlock(&rgd->rd_mutex);
618
619 return error;
620}
621
622void gfs2_rgrp_bh_hold(struct gfs2_rgrpd *rgd)
623{
624 struct gfs2_sbd *sdp = rgd->rd_sbd;
625
626 spin_lock(&sdp->sd_rindex_spin);
627 gfs2_assert_warn(rgd->rd_sbd, rgd->rd_bh_count);
628 rgd->rd_bh_count++;
629 spin_unlock(&sdp->sd_rindex_spin);
630}
631
632/**
633 * gfs2_rgrp_bh_put - Release RG bitmaps read in with gfs2_rgrp_bh_get()
634 * @rgd: the struct gfs2_rgrpd describing the RG to read in
635 *
636 */
637
638void gfs2_rgrp_bh_put(struct gfs2_rgrpd *rgd)
639{
640 struct gfs2_sbd *sdp = rgd->rd_sbd;
641 int x, length = rgd->rd_ri.ri_length;
642
643 spin_lock(&sdp->sd_rindex_spin);
644 gfs2_assert_warn(rgd->rd_sbd, rgd->rd_bh_count);
645 if (--rgd->rd_bh_count) {
646 spin_unlock(&sdp->sd_rindex_spin);
647 return;
648 }
649
650 for (x = 0; x < length; x++) {
651 struct gfs2_bitmap *bi = rgd->rd_bits + x;
652 kfree(bi->bi_clone);
653 bi->bi_clone = NULL;
654 brelse(bi->bi_bh);
655 bi->bi_bh = NULL;
656 }
657
658 spin_unlock(&sdp->sd_rindex_spin);
659}
660
661void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd)
662{
663 struct gfs2_sbd *sdp = rgd->rd_sbd;
664 unsigned int length = rgd->rd_ri.ri_length;
665 unsigned int x;
666
667 for (x = 0; x < length; x++) {
668 struct gfs2_bitmap *bi = rgd->rd_bits + x;
669 if (!bi->bi_clone)
670 continue;
671 memcpy(bi->bi_clone + bi->bi_offset,
672 bi->bi_bh->b_data + bi->bi_offset, bi->bi_len);
673 }
674
675 spin_lock(&sdp->sd_rindex_spin);
676 rgd->rd_free_clone = rgd->rd_rg.rg_free;
677 spin_unlock(&sdp->sd_rindex_spin);
678}
679
680/**
681 * gfs2_alloc_get - get the struct gfs2_alloc structure for an inode
682 * @ip: the incore GFS2 inode structure
683 *
684 * Returns: the struct gfs2_alloc
685 */
686
687struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip)
688{
689 struct gfs2_alloc *al = &ip->i_alloc;
690
691 /* FIXME: Should assert that the correct locks are held here... */
692 memset(al, 0, sizeof(*al));
693 return al;
694}
695
696/**
697 * gfs2_alloc_put - throw away the struct gfs2_alloc for an inode
698 * @ip: the inode
699 *
700 */
701
702void gfs2_alloc_put(struct gfs2_inode *ip)
703{
704 return;
705}
706
707/**
708 * try_rgrp_fit - See if a given reservation will fit in a given RG
709 * @rgd: the RG data
710 * @al: the struct gfs2_alloc structure describing the reservation
711 *
712 * If there's room for the requested blocks to be allocated from the RG,
713 * sets the @al_rgd field in @al to point at @rgd.
716 *
717 * Returns: 1 on success (it fits), 0 on failure (it doesn't fit)
718 */
719
720static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_alloc *al)
721{
722 struct gfs2_sbd *sdp = rgd->rd_sbd;
723 int ret = 0;
724
725 spin_lock(&sdp->sd_rindex_spin);
726 if (rgd->rd_free_clone >= al->al_requested) {
727 al->al_rgd = rgd;
728 ret = 1;
729 }
730 spin_unlock(&sdp->sd_rindex_spin);
731
732 return ret;
733}
734
735/**
736 * recent_rgrp_first - get first RG from "recent" list
737 * @sdp: The GFS2 superblock
738 * @rglast: address of the rgrp used last
739 *
740 * Returns: The first rgrp in the recent list
741 */
742
743static struct gfs2_rgrpd *recent_rgrp_first(struct gfs2_sbd *sdp,
744 uint64_t rglast)
745{
746 struct gfs2_rgrpd *rgd = NULL;
747
748 spin_lock(&sdp->sd_rindex_spin);
749
750 if (list_empty(&sdp->sd_rindex_recent_list))
751 goto out;
752
753 if (!rglast)
754 goto first;
755
756 list_for_each_entry(rgd, &sdp->sd_rindex_recent_list, rd_recent) {
757 if (rgd->rd_ri.ri_addr == rglast)
758 goto out;
759 }
760
761first:
762 rgd = list_entry(sdp->sd_rindex_recent_list.next, struct gfs2_rgrpd,
763 rd_recent);
764out:
765 spin_unlock(&sdp->sd_rindex_spin);
766 return rgd;
767}
768
769/**
770 * recent_rgrp_next - get next RG from "recent" list
771 * @cur_rgd: current rgrp
772 * @remove: if set, remove @cur_rgd from the recent list as we go
773 *
774 * Returns: The next rgrp in the recent list
775 */
776
777static struct gfs2_rgrpd *recent_rgrp_next(struct gfs2_rgrpd *cur_rgd,
778 int remove)
779{
780 struct gfs2_sbd *sdp = cur_rgd->rd_sbd;
781 struct list_head *head;
782 struct gfs2_rgrpd *rgd;
783
784 spin_lock(&sdp->sd_rindex_spin);
785
786 head = &sdp->sd_rindex_recent_list;
787
788 list_for_each_entry(rgd, head, rd_recent) {
789 if (rgd == cur_rgd) {
790 if (cur_rgd->rd_recent.next != head)
791 rgd = list_entry(cur_rgd->rd_recent.next,
792 struct gfs2_rgrpd, rd_recent);
793 else
794 rgd = NULL;
795
796 if (remove)
797 list_del(&cur_rgd->rd_recent);
798
799 goto out;
800 }
801 }
802
803 rgd = NULL;
804 if (!list_empty(head))
805 rgd = list_entry(head->next, struct gfs2_rgrpd, rd_recent);
806
807out:
808 spin_unlock(&sdp->sd_rindex_spin);
809 return rgd;
810}
811
812/**
813 * recent_rgrp_add - add an RG to tail of "recent" list
814 * @new_rgd: The rgrp to add
815 *
816 */
817
818static void recent_rgrp_add(struct gfs2_rgrpd *new_rgd)
819{
820 struct gfs2_sbd *sdp = new_rgd->rd_sbd;
821 struct gfs2_rgrpd *rgd;
822 unsigned int count = 0;
823 unsigned int max = sdp->sd_rgrps / gfs2_jindex_size(sdp);
824
825 spin_lock(&sdp->sd_rindex_spin);
826
827 list_for_each_entry(rgd, &sdp->sd_rindex_recent_list, rd_recent) {
828 if (rgd == new_rgd)
829 goto out;
830
831 if (++count >= max)
832 goto out;
833 }
834 list_add_tail(&new_rgd->rd_recent, &sdp->sd_rindex_recent_list);
835
836out:
837 spin_unlock(&sdp->sd_rindex_spin);
838}
839
840/**
841 * forward_rgrp_get - get an rgrp to try next from full list
842 * @sdp: The GFS2 superblock
843 *
844 * Returns: The rgrp to try next
845 */
846
847static struct gfs2_rgrpd *forward_rgrp_get(struct gfs2_sbd *sdp)
848{
849 struct gfs2_rgrpd *rgd;
850 unsigned int journals = gfs2_jindex_size(sdp);
851 unsigned int rg = 0, x;
852
853 spin_lock(&sdp->sd_rindex_spin);
854
855 rgd = sdp->sd_rindex_forward;
856 if (!rgd) {
857 if (sdp->sd_rgrps >= journals)
858 rg = sdp->sd_rgrps * sdp->sd_jdesc->jd_jid / journals;
859
860 for (x = 0, rgd = gfs2_rgrpd_get_first(sdp);
861 x < rg;
862 x++, rgd = gfs2_rgrpd_get_next(rgd))
863 /* Do Nothing */;
864
865 sdp->sd_rindex_forward = rgd;
866 }
867
868 spin_unlock(&sdp->sd_rindex_spin);
869
870 return rgd;
871}
872
873/**
874 * forward_rgrp_set - set the forward rgrp pointer
875 * @sdp: the filesystem
876 * @rgd: The new forward rgrp
877 *
878 */
879
880static void forward_rgrp_set(struct gfs2_sbd *sdp, struct gfs2_rgrpd *rgd)
881{
882 spin_lock(&sdp->sd_rindex_spin);
883 sdp->sd_rindex_forward = rgd;
884 spin_unlock(&sdp->sd_rindex_spin);
885}
886
887/**
888 * get_local_rgrp - Choose and lock a rgrp for allocation
889 * @ip: the inode to reserve space for
890 *
891 * Try to acquire an rgrp in a way which avoids contending with others;
892 * the chosen and locked rgrp is recorded in the inode's gfs2_alloc struct.
893 *
894 * Returns: errno
895 */
896
897static int get_local_rgrp(struct gfs2_inode *ip)
898{
899 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
900 struct gfs2_rgrpd *rgd, *begin = NULL;
901 struct gfs2_alloc *al = &ip->i_alloc;
902 int flags = LM_FLAG_TRY;
903 int skipped = 0;
904 int loops = 0;
905 int error;
906
907 /* Try recently successful rgrps */
908
909 rgd = recent_rgrp_first(sdp, ip->i_last_rg_alloc);
910
911 while (rgd) {
912 error = gfs2_glock_nq_init(rgd->rd_gl,
913 LM_ST_EXCLUSIVE, LM_FLAG_TRY,
914 &al->al_rgd_gh);
915 switch (error) {
916 case 0:
917 if (try_rgrp_fit(rgd, al))
918 goto out;
919 gfs2_glock_dq_uninit(&al->al_rgd_gh);
920 rgd = recent_rgrp_next(rgd, 1);
921 break;
922
923 case GLR_TRYFAILED:
924 rgd = recent_rgrp_next(rgd, 0);
925 break;
926
927 default:
928 return error;
929 }
930 }
931
932 /* Go through full list of rgrps */
933
934 begin = rgd = forward_rgrp_get(sdp);
935
936 for (;;) {
937 error = gfs2_glock_nq_init(rgd->rd_gl,
938 LM_ST_EXCLUSIVE, flags,
939 &al->al_rgd_gh);
940 switch (error) {
941 case 0:
942 if (try_rgrp_fit(rgd, al))
943 goto out;
944 gfs2_glock_dq_uninit(&al->al_rgd_gh);
945 break;
946
947 case GLR_TRYFAILED:
948 skipped++;
949 break;
950
951 default:
952 return error;
953 }
954
955 rgd = gfs2_rgrpd_get_next(rgd);
956 if (!rgd)
957 rgd = gfs2_rgrpd_get_first(sdp);
958
959 if (rgd == begin) {
960 if (++loops >= 2 || !skipped)
961 return -ENOSPC;
962 flags = 0;
963 }
964 }
965
966out:
967 ip->i_last_rg_alloc = rgd->rd_ri.ri_addr;
968
969 if (begin) {
970 recent_rgrp_add(rgd);
971 rgd = gfs2_rgrpd_get_next(rgd);
972 if (!rgd)
973 rgd = gfs2_rgrpd_get_first(sdp);
974 forward_rgrp_set(sdp, rgd);
975 }
976
977 return 0;
978}
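get_local_rgrp() tries candidates with LM_FLAG_TRY first, so it never blocks on a resource group that another node is busy with, and only after a full unsuccessful sweep does it drop the try flag and allow blocking requests. A rough userspace analogue of that two-sweep strategy, using pthread_mutex_trylock() (the group/space functions are hypothetical; this is just the shape of the loop):

#include <pthread.h>
#include <stdio.h>

#define NGROUPS 4

static pthread_mutex_t group_lock[NGROUPS] = {
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
};

static int group_has_space(int g)
{
	return g == 2;			/* pretend only group 2 fits the request */
}

static int pick_group(void)
{
	int g, loops, use_trylock = 1;

	for (loops = 0; loops < 2; loops++) {
		for (g = 0; g < NGROUPS; g++) {
			if (use_trylock) {
				if (pthread_mutex_trylock(&group_lock[g]))
					continue;	/* busy: skip, don't wait */
			} else {
				pthread_mutex_lock(&group_lock[g]);
			}
			if (group_has_space(g))
				return g;		/* keep the lock held */
			pthread_mutex_unlock(&group_lock[g]);
		}
		use_trylock = 0;	/* second sweep is allowed to block */
	}
	return -1;			/* the kernel returns -ENOSPC here */
}

int main(void)
{
	int g = pick_group();

	printf("chose group %d\n", g);
	if (g >= 0)
		pthread_mutex_unlock(&group_lock[g]);
	return 0;
}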
979
980/**
981 * gfs2_inplace_reserve_i - Reserve space in the filesystem
982 * @ip: the inode to reserve space for
983 *
984 * Returns: errno
985 */
986
987int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file, unsigned int line)
988{
989 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
990 struct gfs2_alloc *al = &ip->i_alloc;
991 int error;
992
993 if (gfs2_assert_warn(sdp, al->al_requested))
994 return -EINVAL;
995
996 error = gfs2_rindex_hold(sdp, &al->al_ri_gh);
997 if (error)
998 return error;
999
1000 error = get_local_rgrp(ip);
1001 if (error) {
1002 gfs2_glock_dq_uninit(&al->al_ri_gh);
1003 return error;
1004 }
1005
1006 al->al_file = file;
1007 al->al_line = line;
1008
1009 return 0;
1010}
1011
1012/**
1013 * gfs2_inplace_release - release an inplace reservation
1014 * @ip: the inode the reservation was taken out on
1015 *
1016 * Release a reservation made by gfs2_inplace_reserve().
1017 */
1018
1019void gfs2_inplace_release(struct gfs2_inode *ip)
1020{
1021 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1022 struct gfs2_alloc *al = &ip->i_alloc;
1023
1024 if (gfs2_assert_warn(sdp, al->al_alloced <= al->al_requested) == -1)
1025 fs_warn(sdp, "al_alloced = %u, al_requested = %u "
1026 "al_file = %s, al_line = %u\n",
1027 al->al_alloced, al->al_requested, al->al_file,
1028 al->al_line);
1029
1030 al->al_rgd = NULL;
1031 gfs2_glock_dq_uninit(&al->al_rgd_gh);
1032 gfs2_glock_dq_uninit(&al->al_ri_gh);
1033}
1034
1035/**
1036 * gfs2_get_block_type - Determine the allocation state of a block in an RG
1037 * @rgd: the resource group holding the block
1038 * @block: the block number
1039 *
1040 * Returns: The block type (GFS2_BLKST_*)
1041 */
1042
1043unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, uint64_t block)
1044{
1045 struct gfs2_bitmap *bi = NULL;
1046 uint32_t length, rgrp_block, buf_block;
1047 unsigned int buf;
1048 unsigned char type;
1049
1050 length = rgd->rd_ri.ri_length;
1051 rgrp_block = block - rgd->rd_ri.ri_data0;
1052
1053 for (buf = 0; buf < length; buf++) {
1054 bi = rgd->rd_bits + buf;
1055 if (rgrp_block < (bi->bi_start + bi->bi_len) * GFS2_NBBY)
1056 break;
1057 }
1058
1059 gfs2_assert(rgd->rd_sbd, buf < length);
1060 buf_block = rgrp_block - bi->bi_start * GFS2_NBBY;
1061
1062 type = gfs2_testbit(rgd, bi->bi_bh->b_data + bi->bi_offset,
1063 bi->bi_len, buf_block);
1064
1065 return type;
1066}
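The coordinate conversion above (repeated in rgblk_search() and rgblk_free()) maps a filesystem block number to an rgrp-relative block, then to a bitmap buffer and a position within it. A worked sketch with made-up geometry, reusing the illustrative sizes from the bitmap-layout example earlier:

#include <stdio.h>
#include <stdint.h>

#define NBBY_GFS2 4

struct bits { unsigned int start, len; };	/* bi_start/bi_len, in bytes */

int main(void)
{
	struct bits bi[3] = { { 0, 3968 }, { 3968, 4072 }, { 8040, 960 } };
	uint64_t ri_data0 = 10000;		/* first block of the rgrp's data */
	uint64_t block = 30000;			/* block we want to look up */
	uint32_t rgrp_block = block - ri_data0;	/* = 20000, rgrp-relative */
	unsigned int buf;

	for (buf = 0; buf < 3; buf++)
		if (rgrp_block < (bi[buf].start + bi[buf].len) * NBBY_GFS2)
			break;

	printf("bitmap buffer %u, block %u within it\n",
	       buf, rgrp_block - bi[buf].start * NBBY_GFS2);	/* prints 1, 4128 */
	return 0;
}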
1067
1068/**
1069 * rgblk_search - find a block in @old_state, change allocation
1070 * state to @new_state
1071 * @rgd: the resource group descriptor
1072 * @goal: the goal block within the RG (start here to search for avail block)
1073 * @old_state: GFS2_BLKST_XXX the before-allocation state to find
1074 * @new_state: GFS2_BLKST_XXX the after-allocation block state
1075 *
1076 * Walk rgrp's bitmap to find bits that represent a block in @old_state.
1077 * Add the found bitmap buffer to the transaction.
1078 * Set the found bits to @new_state to change block's allocation state.
1079 *
1080 * This function never fails, because we wouldn't call it unless we
1081 * know (from reservation results, etc.) that a block is available.
1082 *
1083 * Scope of @goal and returned block is just within rgrp, not the whole
1084 * filesystem.
1085 *
1086 * Returns: the block number allocated
1087 */
1088
1089static uint32_t rgblk_search(struct gfs2_rgrpd *rgd, uint32_t goal,
1090 unsigned char old_state, unsigned char new_state)
1091{
1092 struct gfs2_bitmap *bi = NULL;
1093 uint32_t length = rgd->rd_ri.ri_length;
1094 uint32_t blk = 0;
1095 unsigned int buf, x;
1096
1097 /* Find bitmap block that contains bits for goal block */
1098 for (buf = 0; buf < length; buf++) {
1099 bi = rgd->rd_bits + buf;
1100 if (goal < (bi->bi_start + bi->bi_len) * GFS2_NBBY)
1101 break;
1102 }
1103
1104 gfs2_assert(rgd->rd_sbd, buf < length);
1105
1106 /* Convert scope of "goal" from rgrp-wide to within found bit block */
1107 goal -= bi->bi_start * GFS2_NBBY;
1108
1109 /* Search (up to entire) bitmap in this rgrp for allocatable block.
1110 "x <= length", instead of "x < length", because we typically start
1111 the search in the middle of a bit block, but if we can't find an
1112 allocatable block anywhere else, we want to be able wrap around and
1113 search in the first part of our first-searched bit block. */
1114 for (x = 0; x <= length; x++) {
1115 if (bi->bi_clone)
1116 blk = gfs2_bitfit(rgd, bi->bi_clone + bi->bi_offset,
1117 bi->bi_len, goal, old_state);
1118 else
1119 blk = gfs2_bitfit(rgd,
1120 bi->bi_bh->b_data + bi->bi_offset,
1121 bi->bi_len, goal, old_state);
1122 if (blk != BFITNOENT)
1123 break;
1124
1125 /* Try next bitmap block (wrap back to rgrp header if at end) */
1126 buf = (buf + 1) % length;
1127 bi = rgd->rd_bits + buf;
1128 goal = 0;
1129 }
1130
1131 if (gfs2_assert_withdraw(rgd->rd_sbd, x <= length))
1132 blk = 0;
1133
1134 gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1);
1135 gfs2_setbit(rgd, bi->bi_bh->b_data + bi->bi_offset,
1136 bi->bi_len, blk, new_state);
1137 if (bi->bi_clone)
1138 gfs2_setbit(rgd, bi->bi_clone + bi->bi_offset,
1139 bi->bi_len, blk, new_state);
1140
1141 return bi->bi_start * GFS2_NBBY + blk;
1142}
1143
1144/**
1145 * rgblk_free - Change alloc state of given block(s)
1146 * @sdp: the filesystem
1147 * @bstart: the start of a run of blocks to free
1148 * @blen: the length of the block run (all must lie within ONE RG!)
1149 * @new_state: GFS2_BLKST_XXX the after-allocation block state
1150 *
1151 * Returns: Resource group containing the block(s)
1152 */
1153
1154static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, uint64_t bstart,
1155 uint32_t blen, unsigned char new_state)
1156{
1157 struct gfs2_rgrpd *rgd;
1158 struct gfs2_bitmap *bi = NULL;
1159 uint32_t length, rgrp_blk, buf_blk;
1160 unsigned int buf;
1161
1162 rgd = gfs2_blk2rgrpd(sdp, bstart);
1163 if (!rgd) {
1164 if (gfs2_consist(sdp))
1165 fs_err(sdp, "block = %llu\n", (unsigned long long)bstart);
1166 return NULL;
1167 }
1168
1169 length = rgd->rd_ri.ri_length;
1170
1171 rgrp_blk = bstart - rgd->rd_ri.ri_data0;
1172
1173 while (blen--) {
1174 for (buf = 0; buf < length; buf++) {
1175 bi = rgd->rd_bits + buf;
1176 if (rgrp_blk < (bi->bi_start + bi->bi_len) * GFS2_NBBY)
1177 break;
1178 }
1179
1180 gfs2_assert(rgd->rd_sbd, buf < length);
1181
1182 buf_blk = rgrp_blk - bi->bi_start * GFS2_NBBY;
1183 rgrp_blk++;
1184
1185 if (!bi->bi_clone) {
1186 bi->bi_clone = kmalloc(bi->bi_bh->b_size,
1187 GFP_KERNEL | __GFP_NOFAIL);
1188 memcpy(bi->bi_clone + bi->bi_offset,
1189 bi->bi_bh->b_data + bi->bi_offset,
1190 bi->bi_len);
1191 }
1192 gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1);
1193 gfs2_setbit(rgd,
1194 bi->bi_bh->b_data + bi->bi_offset,
1195 bi->bi_len, buf_blk, new_state);
1196 }
1197
1198 return rgd;
1199}
1200
1201/**
1202 * gfs2_alloc_data - Allocate a data block
1203 * @ip: the inode to allocate the data block for
1204 *
1205 * Returns: the allocated block
1206 */
1207
1208u64 gfs2_alloc_data(struct gfs2_inode *ip)
1209{
1210 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1211 struct gfs2_alloc *al = &ip->i_alloc;
1212 struct gfs2_rgrpd *rgd = al->al_rgd;
1213 uint32_t goal, blk;
1214 uint64_t block;
1215
1216 if (rgrp_contains_block(&rgd->rd_ri, ip->i_di.di_goal_data))
1217 goal = ip->i_di.di_goal_data - rgd->rd_ri.ri_data0;
1218 else
1219 goal = rgd->rd_last_alloc_data;
1220
1221 blk = rgblk_search(rgd, goal, GFS2_BLKST_FREE, GFS2_BLKST_USED);
1222 rgd->rd_last_alloc_data = blk;
1223
1224 block = rgd->rd_ri.ri_data0 + blk;
1225 ip->i_di.di_goal_data = block;
1226
1227 gfs2_assert_withdraw(sdp, rgd->rd_rg.rg_free);
1228 rgd->rd_rg.rg_free--;
1229
1230 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
1231 gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data);
1232
1233 al->al_alloced++;
1234
1235 gfs2_statfs_change(sdp, 0, -1, 0);
1236 gfs2_quota_change(ip, +1, ip->i_di.di_uid, ip->i_di.di_gid);
1237
1238 spin_lock(&sdp->sd_rindex_spin);
1239 rgd->rd_free_clone--;
1240 spin_unlock(&sdp->sd_rindex_spin);
1241
1242 return block;
1243}
1244
1245/**
1246 * gfs2_alloc_meta - Allocate a metadata block
1247 * @ip: the inode to allocate the metadata block for
1248 *
1249 * Returns: the allocated block
1250 */
1251
1252u64 gfs2_alloc_meta(struct gfs2_inode *ip)
1253{
1254 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1255 struct gfs2_alloc *al = &ip->i_alloc;
1256 struct gfs2_rgrpd *rgd = al->al_rgd;
1257 uint32_t goal, blk;
1258 uint64_t block;
1259
1260 if (rgrp_contains_block(&rgd->rd_ri, ip->i_di.di_goal_meta))
1261 goal = ip->i_di.di_goal_meta - rgd->rd_ri.ri_data0;
1262 else
1263 goal = rgd->rd_last_alloc_meta;
1264
1265 blk = rgblk_search(rgd, goal, GFS2_BLKST_FREE, GFS2_BLKST_USED);
1266 rgd->rd_last_alloc_meta = blk;
1267
1268 block = rgd->rd_ri.ri_data0 + blk;
1269 ip->i_di.di_goal_meta = block;
1270
1271 gfs2_assert_withdraw(sdp, rgd->rd_rg.rg_free);
1272 rgd->rd_rg.rg_free--;
1273
1274 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
1275 gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data);
1276
1277 al->al_alloced++;
1278
1279 gfs2_statfs_change(sdp, 0, -1, 0);
1280 gfs2_quota_change(ip, +1, ip->i_di.di_uid, ip->i_di.di_gid);
1281 gfs2_trans_add_unrevoke(sdp, block);
1282
1283 spin_lock(&sdp->sd_rindex_spin);
1284 rgd->rd_free_clone--;
1285 spin_unlock(&sdp->sd_rindex_spin);
1286
1287 return block;
1288}
1289
1290/**
1291 * gfs2_alloc_di - Allocate a dinode
1292 * @dip: the directory that the inode is going in
1293 *
1294 * Returns: the block allocated
1295 */
1296
1297u64 gfs2_alloc_di(struct gfs2_inode *dip, u64 *generation)
1298{
1299 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
1300 struct gfs2_alloc *al = &dip->i_alloc;
1301 struct gfs2_rgrpd *rgd = al->al_rgd;
1302 u32 blk;
1303 u64 block;
1304
1305 blk = rgblk_search(rgd, rgd->rd_last_alloc_meta,
1306 GFS2_BLKST_FREE, GFS2_BLKST_DINODE);
1307
1308 rgd->rd_last_alloc_meta = blk;
1309
1310 block = rgd->rd_ri.ri_data0 + blk;
1311
1312 gfs2_assert_withdraw(sdp, rgd->rd_rg.rg_free);
1313 rgd->rd_rg.rg_free--;
1314 rgd->rd_rg.rg_dinodes++;
1315 *generation = rgd->rd_rg.rg_igeneration++;
1316 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
1317 gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data);
1318
1319 al->al_alloced++;
1320
1321 gfs2_statfs_change(sdp, 0, -1, +1);
1322 gfs2_trans_add_unrevoke(sdp, block);
1323
1324 spin_lock(&sdp->sd_rindex_spin);
1325 rgd->rd_free_clone--;
1326 spin_unlock(&sdp->sd_rindex_spin);
1327
1328 return block;
1329}
1330
1331/**
1332 * gfs2_free_data - free a contiguous run of data block(s)
1333 * @ip: the inode these blocks are being freed from
1334 * @bstart: first block of a run of contiguous blocks
1335 * @blen: the length of the block run
1336 *
1337 */
1338
1339void gfs2_free_data(struct gfs2_inode *ip, uint64_t bstart, uint32_t blen)
1340{
1341 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1342 struct gfs2_rgrpd *rgd;
1343
1344 rgd = rgblk_free(sdp, bstart, blen, GFS2_BLKST_FREE);
1345 if (!rgd)
1346 return;
1347
1348 rgd->rd_rg.rg_free += blen;
1349
1350 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
1351 gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data);
1352
1353 gfs2_trans_add_rg(rgd);
1354
1355 gfs2_statfs_change(sdp, 0, +blen, 0);
1356 gfs2_quota_change(ip, -(int64_t)blen,
1357 ip->i_di.di_uid, ip->i_di.di_gid);
1358}
1359
1360/**
1361 * gfs2_free_meta - free a contiguous run of metadata block(s)
1362 * @ip: the inode these blocks are being freed from
1363 * @bstart: first block of a run of contiguous blocks
1364 * @blen: the length of the block run
1365 *
1366 */
1367
1368void gfs2_free_meta(struct gfs2_inode *ip, uint64_t bstart, uint32_t blen)
1369{
1370 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1371 struct gfs2_rgrpd *rgd;
1372
1373 rgd = rgblk_free(sdp, bstart, blen, GFS2_BLKST_FREE);
1374 if (!rgd)
1375 return;
1376
1377 rgd->rd_rg.rg_free += blen;
1378
1379 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
1380 gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data);
1381
1382 gfs2_trans_add_rg(rgd);
1383
1384 gfs2_statfs_change(sdp, 0, +blen, 0);
1385 gfs2_quota_change(ip, -(int64_t)blen, ip->i_di.di_uid, ip->i_di.di_gid);
1386 gfs2_meta_wipe(ip, bstart, blen);
1387}
1388
1389void gfs2_unlink_di(struct inode *inode)
1390{
1391 struct gfs2_inode *ip = GFS2_I(inode);
1392 struct gfs2_sbd *sdp = GFS2_SB(inode);
1393 struct gfs2_rgrpd *rgd;
1394 u64 blkno = ip->i_num.no_addr;
1395
1396 rgd = rgblk_free(sdp, blkno, 1, GFS2_BLKST_UNLINKED);
1397 if (!rgd)
1398 return;
1399 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
1400 gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data);
1401 gfs2_trans_add_rg(rgd);
1402}
1403
1404static void gfs2_free_uninit_di(struct gfs2_rgrpd *rgd, uint64_t blkno)
1405{
1406 struct gfs2_sbd *sdp = rgd->rd_sbd;
1407 struct gfs2_rgrpd *tmp_rgd;
1408
1409 tmp_rgd = rgblk_free(sdp, blkno, 1, GFS2_BLKST_FREE);
1410 if (!tmp_rgd)
1411 return;
1412 gfs2_assert_withdraw(sdp, rgd == tmp_rgd);
1413
1414 if (!rgd->rd_rg.rg_dinodes)
1415 gfs2_consist_rgrpd(rgd);
1416 rgd->rd_rg.rg_dinodes--;
1417 rgd->rd_rg.rg_free++;
1418
1419 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
1420 gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data);
1421
1422 gfs2_statfs_change(sdp, 0, +1, -1);
1423 gfs2_trans_add_rg(rgd);
1424}
1425
1426
1427void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip)
1428{
1429 gfs2_free_uninit_di(rgd, ip->i_num.no_addr);
1430 gfs2_quota_change(ip, -1, ip->i_di.di_uid, ip->i_di.di_gid);
1431 gfs2_meta_wipe(ip, ip->i_num.no_addr, 1);
1432}
1433
1434/**
1435 * gfs2_rlist_add - add a RG to a list of RGs
1436 * @sdp: the filesystem
1437 * @rlist: the list of resource groups
1438 * @block: the block
1439 *
1440 * Figure out what RG a block belongs to and add that RG to the list
1441 *
1442 * FIXME: Don't use NOFAIL
1443 *
1444 */
1445
1446void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist,
1447 uint64_t block)
1448{
1449 struct gfs2_rgrpd *rgd;
1450 struct gfs2_rgrpd **tmp;
1451 unsigned int new_space;
1452 unsigned int x;
1453
1454 if (gfs2_assert_warn(sdp, !rlist->rl_ghs))
1455 return;
1456
1457 rgd = gfs2_blk2rgrpd(sdp, block);
1458 if (!rgd) {
1459 if (gfs2_consist(sdp))
1460 fs_err(sdp, "block = %llu\n", (unsigned long long)block);
1461 return;
1462 }
1463
1464 for (x = 0; x < rlist->rl_rgrps; x++)
1465 if (rlist->rl_rgd[x] == rgd)
1466 return;
1467
1468 if (rlist->rl_rgrps == rlist->rl_space) {
1469 new_space = rlist->rl_space + 10;
1470
1471 tmp = kcalloc(new_space, sizeof(struct gfs2_rgrpd *),
1472 GFP_KERNEL | __GFP_NOFAIL);
1473
1474 if (rlist->rl_rgd) {
1475 memcpy(tmp, rlist->rl_rgd,
1476 rlist->rl_space * sizeof(struct gfs2_rgrpd *));
1477 kfree(rlist->rl_rgd);
1478 }
1479
1480 rlist->rl_space = new_space;
1481 rlist->rl_rgd = tmp;
1482 }
1483
1484 rlist->rl_rgd[rlist->rl_rgrps++] = rgd;
1485}
1486
1487/**
1488 * gfs2_rlist_alloc - all RGs have been added to the rlist, now allocate
1489 * and initialize an array of glock holders for them
1490 * @rlist: the list of resource groups
1491 * @state: the lock state to acquire the RG lock in
1492 * @flags: the modifier flags for the holder structures
1493 *
1494 * FIXME: Don't use NOFAIL
1495 *
1496 */
1497
1498void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state,
1499 int flags)
1500{
1501 unsigned int x;
1502
1503 rlist->rl_ghs = kcalloc(rlist->rl_rgrps, sizeof(struct gfs2_holder),
1504 GFP_KERNEL | __GFP_NOFAIL);
1505 for (x = 0; x < rlist->rl_rgrps; x++)
1506 gfs2_holder_init(rlist->rl_rgd[x]->rd_gl,
1507 state, flags,
1508 &rlist->rl_ghs[x]);
1509}
1510
1511/**
1512 * gfs2_rlist_free - free a resource group list
1513 * @rlist: the list of resource groups
1514 *
1515 */
1516
1517void gfs2_rlist_free(struct gfs2_rgrp_list *rlist)
1518{
1519 unsigned int x;
1520
1521 kfree(rlist->rl_rgd);
1522
1523 if (rlist->rl_ghs) {
1524 for (x = 0; x < rlist->rl_rgrps; x++)
1525 gfs2_holder_uninit(&rlist->rl_ghs[x]);
1526 kfree(rlist->rl_ghs);
1527 }
1528}
1529
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
new file mode 100644
index 000000000000..14600944d184
--- /dev/null
+++ b/fs/gfs2/rgrp.h
@@ -0,0 +1,62 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __RGRP_DOT_H__
11#define __RGRP_DOT_H__
12
13void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd);
14
15struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, uint64_t blk);
16struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp);
17struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd);
18
19void gfs2_clear_rgrpd(struct gfs2_sbd *sdp);
20int gfs2_rindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ri_gh);
21
22int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd);
23void gfs2_rgrp_bh_hold(struct gfs2_rgrpd *rgd);
24void gfs2_rgrp_bh_put(struct gfs2_rgrpd *rgd);
25
26void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd);
27
28struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip);
29void gfs2_alloc_put(struct gfs2_inode *ip);
30
31int gfs2_inplace_reserve_i(struct gfs2_inode *ip,
32 char *file, unsigned int line);
33#define gfs2_inplace_reserve(ip) \
34gfs2_inplace_reserve_i((ip), __FILE__, __LINE__)
35
36void gfs2_inplace_release(struct gfs2_inode *ip);
37
38unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, uint64_t block);
39
40u64 gfs2_alloc_data(struct gfs2_inode *ip);
41u64 gfs2_alloc_meta(struct gfs2_inode *ip);
42u64 gfs2_alloc_di(struct gfs2_inode *ip, u64 *generation);
43
44void gfs2_free_data(struct gfs2_inode *ip, uint64_t bstart, uint32_t blen);
45void gfs2_free_meta(struct gfs2_inode *ip, uint64_t bstart, uint32_t blen);
46void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip);
47void gfs2_unlink_di(struct inode *inode);
48
49struct gfs2_rgrp_list {
50 unsigned int rl_rgrps;
51 unsigned int rl_space;
52 struct gfs2_rgrpd **rl_rgd;
53 struct gfs2_holder *rl_ghs;
54};
55
56void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist,
57 uint64_t block);
58void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state,
59 int flags);
60void gfs2_rlist_free(struct gfs2_rgrp_list *rlist);
61
62#endif /* __RGRP_DOT_H__ */
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
new file mode 100644
index 000000000000..f2d287660cc9
--- /dev/null
+++ b/fs/gfs2/super.c
@@ -0,0 +1,928 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/crc32.h>
16#include <linux/gfs2_ondisk.h>
17
18#include "gfs2.h"
19#include "lm_interface.h"
20#include "incore.h"
21#include "bmap.h"
22#include "dir.h"
23#include "format.h"
24#include "glock.h"
25#include "glops.h"
26#include "inode.h"
27#include "log.h"
28#include "meta_io.h"
29#include "quota.h"
30#include "recovery.h"
31#include "rgrp.h"
32#include "super.h"
33#include "trans.h"
34#include "util.h"
35
36/**
37 * gfs2_tune_init - Fill a gfs2_tune structure with default values
38 * @gt: the gfs2_tune structure to fill in with default values
39 *
40 */
41
42void gfs2_tune_init(struct gfs2_tune *gt)
43{
44 spin_lock_init(&gt->gt_spin);
45
46 gt->gt_ilimit = 100;
47 gt->gt_ilimit_tries = 3;
48 gt->gt_ilimit_min = 1;
49 gt->gt_demote_secs = 300;
50 gt->gt_incore_log_blocks = 1024;
51 gt->gt_log_flush_secs = 60;
52 gt->gt_jindex_refresh_secs = 60;
53 gt->gt_scand_secs = 15;
54 gt->gt_recoverd_secs = 60;
55 gt->gt_logd_secs = 1;
56 gt->gt_quotad_secs = 5;
57 gt->gt_quota_simul_sync = 64;
58 gt->gt_quota_warn_period = 10;
59 gt->gt_quota_scale_num = 1;
60 gt->gt_quota_scale_den = 1;
61 gt->gt_quota_cache_secs = 300;
62 gt->gt_quota_quantum = 60;
63 gt->gt_atime_quantum = 3600;
64 gt->gt_new_files_jdata = 0;
65 gt->gt_new_files_directio = 0;
66 gt->gt_max_atomic_write = 4 << 20;
67 gt->gt_max_readahead = 1 << 18;
68 gt->gt_lockdump_size = 131072;
69 gt->gt_stall_secs = 600;
70 gt->gt_complain_secs = 10;
71 gt->gt_reclaim_limit = 5000;
72 gt->gt_entries_per_readdir = 32;
73 gt->gt_prefetch_secs = 10;
74 gt->gt_greedy_default = HZ / 10;
75 gt->gt_greedy_quantum = HZ / 40;
76 gt->gt_greedy_max = HZ / 4;
77 gt->gt_statfs_quantum = 30;
78 gt->gt_statfs_slow = 0;
79}
80
81/**
82 * gfs2_check_sb - Check superblock
83 * @sdp: the filesystem
84 * @sb: The superblock
85 * @silent: Don't print a message if the check fails
86 *
87 * Checks that the version code of the FS is one that we understand how to
88 * read and that the sizes of the various on-disk structures have not
89 * changed.
90 */
91
92int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb *sb, int silent)
93{
94 unsigned int x;
95
96 if (sb->sb_header.mh_magic != GFS2_MAGIC ||
97 sb->sb_header.mh_type != GFS2_METATYPE_SB) {
98 if (!silent)
99 printk(KERN_WARNING "GFS2: not a GFS2 filesystem\n");
100 return -EINVAL;
101 }
102
103 /* If format numbers match exactly, we're done. */
104
105 if (sb->sb_fs_format == GFS2_FORMAT_FS &&
106 sb->sb_multihost_format == GFS2_FORMAT_MULTI)
107 return 0;
108
109 if (sb->sb_fs_format != GFS2_FORMAT_FS) {
110 for (x = 0; gfs2_old_fs_formats[x]; x++)
111 if (gfs2_old_fs_formats[x] == sb->sb_fs_format)
112 break;
113
114 if (!gfs2_old_fs_formats[x]) {
115 printk(KERN_WARNING
116 "GFS2: code version (%u, %u) is incompatible "
117 "with ondisk format (%u, %u)\n",
118 GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
119 sb->sb_fs_format, sb->sb_multihost_format);
120 printk(KERN_WARNING
121 "GFS2: I don't know how to upgrade this FS\n");
122 return -EINVAL;
123 }
124 }
125
126 if (sb->sb_multihost_format != GFS2_FORMAT_MULTI) {
127 for (x = 0; gfs2_old_multihost_formats[x]; x++)
128 if (gfs2_old_multihost_formats[x] ==
129 sb->sb_multihost_format)
130 break;
131
132 if (!gfs2_old_multihost_formats[x]) {
133 printk(KERN_WARNING
134 "GFS2: code version (%u, %u) is incompatible "
135 "with ondisk format (%u, %u)\n",
136 GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
137 sb->sb_fs_format, sb->sb_multihost_format);
138 printk(KERN_WARNING
139 "GFS2: I don't know how to upgrade this FS\n");
140 return -EINVAL;
141 }
142 }
143
144 if (!sdp->sd_args.ar_upgrade) {
145 printk(KERN_WARNING
146 "GFS2: code version (%u, %u) is incompatible "
147 "with ondisk format (%u, %u)\n",
148 GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
149 sb->sb_fs_format, sb->sb_multihost_format);
150 printk(KERN_INFO
151 "GFS2: Use the \"upgrade\" mount option to upgrade "
152 "the FS\n");
153 printk(KERN_INFO "GFS2: See the manual for more details\n");
154 return -EINVAL;
155 }
156
157 return 0;
158}
159
160/**
161 * gfs2_read_sb - Read super block
162 * @sdp: The GFS2 superblock
163 * @gl: the glock for the superblock (assumed to be held)
164 * @silent: Don't print message if mount fails
165 *
166 */
167
168int gfs2_read_sb(struct gfs2_sbd *sdp, struct gfs2_glock *gl, int silent)
169{
170 struct buffer_head *bh;
171 uint32_t hash_blocks, ind_blocks, leaf_blocks;
172 uint32_t tmp_blocks;
173 unsigned int x;
174 int error;
175
176 error = gfs2_meta_read(gl, GFS2_SB_ADDR >> sdp->sd_fsb2bb_shift,
177 DIO_FORCE | DIO_START | DIO_WAIT, &bh);
178 if (error) {
179 if (!silent)
180 fs_err(sdp, "can't read superblock\n");
181 return error;
182 }
183
184 gfs2_assert(sdp, sizeof(struct gfs2_sb) <= bh->b_size);
185 gfs2_sb_in(&sdp->sd_sb, bh->b_data);
186 brelse(bh);
187
188 error = gfs2_check_sb(sdp, &sdp->sd_sb, silent);
189 if (error)
190 return error;
191
192 sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift -
193 GFS2_BASIC_BLOCK_SHIFT;
194 sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift;
195 sdp->sd_diptrs = (sdp->sd_sb.sb_bsize -
196 sizeof(struct gfs2_dinode)) / sizeof(uint64_t);
197 sdp->sd_inptrs = (sdp->sd_sb.sb_bsize -
198 sizeof(struct gfs2_meta_header)) / sizeof(uint64_t);
199 sdp->sd_jbsize = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_meta_header);
200 sdp->sd_hash_bsize = sdp->sd_sb.sb_bsize / 2;
201 sdp->sd_hash_bsize_shift = sdp->sd_sb.sb_bsize_shift - 1;
202 sdp->sd_hash_ptrs = sdp->sd_hash_bsize / sizeof(uint64_t);
203 sdp->sd_qc_per_block = (sdp->sd_sb.sb_bsize -
204 sizeof(struct gfs2_meta_header)) /
205 sizeof(struct gfs2_quota_change);
206
207 /* Compute maximum reservation required to add an entry to a directory */
208
209 hash_blocks = DIV_ROUND_UP(sizeof(uint64_t) * (1 << GFS2_DIR_MAX_DEPTH),
210 sdp->sd_jbsize);
211
212 ind_blocks = 0;
213 for (tmp_blocks = hash_blocks; tmp_blocks > sdp->sd_diptrs;) {
214 tmp_blocks = DIV_ROUND_UP(tmp_blocks, sdp->sd_inptrs);
215 ind_blocks += tmp_blocks;
216 }
217
218 leaf_blocks = 2 + GFS2_DIR_MAX_DEPTH;
219
220 sdp->sd_max_dirres = hash_blocks + ind_blocks + leaf_blocks;
221
222 sdp->sd_heightsize[0] = sdp->sd_sb.sb_bsize -
223 sizeof(struct gfs2_dinode);
224 sdp->sd_heightsize[1] = sdp->sd_sb.sb_bsize * sdp->sd_diptrs;
225 for (x = 2;; x++) {
226 uint64_t space, d;
227 uint32_t m;
228
229 space = sdp->sd_heightsize[x - 1] * sdp->sd_inptrs;
230 d = space;
231 m = do_div(d, sdp->sd_inptrs);
232
233 if (d != sdp->sd_heightsize[x - 1] || m)
234 break;
235 sdp->sd_heightsize[x] = space;
236 }
237 sdp->sd_max_height = x;
238 gfs2_assert(sdp, sdp->sd_max_height <= GFS2_MAX_META_HEIGHT);
239
240 sdp->sd_jheightsize[0] = sdp->sd_sb.sb_bsize -
241 sizeof(struct gfs2_dinode);
242 sdp->sd_jheightsize[1] = sdp->sd_jbsize * sdp->sd_diptrs;
243 for (x = 2;; x++) {
244 uint64_t space, d;
245 uint32_t m;
246
247 space = sdp->sd_jheightsize[x - 1] * sdp->sd_inptrs;
248 d = space;
249 m = do_div(d, sdp->sd_inptrs);
250
251 if (d != sdp->sd_jheightsize[x - 1] || m)
252 break;
253 sdp->sd_jheightsize[x] = space;
254 }
255 sdp->sd_max_jheight = x;
256 gfs2_assert(sdp, sdp->sd_max_jheight <= GFS2_MAX_META_HEIGHT);
257
258 return 0;
259}
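The sd_heightsize loops above grow the per-height file capacity geometrically (the dinode's own payload, then multiplied by the number of pointers per indirect block for each extra level of metadata) and stop once the 64-bit product would overflow, which is what the do_div() check detects. A standalone sketch of the same computation, with assumed sizes (4096-byte blocks, 483 direct and 509 indirect pointers, 232-byte dinode header):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t bsize = 4096, diptrs = 483, inptrs = 509;
	uint64_t heightsize[16];
	unsigned int x;

	heightsize[0] = bsize - 232;		/* payload left in the dinode */
	heightsize[1] = bsize * diptrs;		/* one level of indirection */

	for (x = 2; x < 16; x++) {
		uint64_t space = heightsize[x - 1] * inptrs;

		/* Stop when the multiplication no longer fits in 64 bits. */
		if (space / inptrs != heightsize[x - 1])
			break;
		heightsize[x] = space;
	}

	printf("max height %u, capacity at that height ~%llu bytes\n",
	       x, (unsigned long long)heightsize[x - 1]);
	return 0;
}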
260
261/**
262 * gfs2_jindex_hold - Grab a lock on the jindex
263 * @sdp: The GFS2 superblock
264 * @ji_gh: the holder for the jindex glock
265 *
266 * This is very similar to the gfs2_rindex_hold() function, except that
267 * in general we hold the jindex lock for longer periods of time and
268 * we grab it far less frequently than the rgrp lock.
269 *
270 * Returns: errno
271 */
272
273int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh)
274{
275 struct gfs2_inode *dip = GFS2_I(sdp->sd_jindex);
276 struct qstr name;
277 char buf[20];
278 struct gfs2_jdesc *jd;
279 int error;
280
281 name.name = buf;
282
283 mutex_lock(&sdp->sd_jindex_mutex);
284
285 for (;;) {
286 error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED,
287 GL_LOCAL_EXCL, ji_gh);
288 if (error)
289 break;
290
291 name.len = sprintf(buf, "journal%u", sdp->sd_journals);
292 name.hash = gfs2_disk_hash(name.name, name.len);
293
294 error = gfs2_dir_search(sdp->sd_jindex, &name, NULL, NULL);
295 if (error == -ENOENT) {
296 error = 0;
297 break;
298 }
299
300 gfs2_glock_dq_uninit(ji_gh);
301
302 if (error)
303 break;
304
305 error = -ENOMEM;
306 jd = kzalloc(sizeof(struct gfs2_jdesc), GFP_KERNEL);
307 if (!jd)
308 break;
309
310 jd->jd_inode = gfs2_lookupi(sdp->sd_jindex, &name, 1, NULL);
311 if (!jd->jd_inode || IS_ERR(jd->jd_inode)) {
312 if (!jd->jd_inode)
313 error = -ENOENT;
314 else
315 error = PTR_ERR(jd->jd_inode);
316 kfree(jd);
317 break;
318 }
319
320 spin_lock(&sdp->sd_jindex_spin);
321 jd->jd_jid = sdp->sd_journals++;
322 list_add_tail(&jd->jd_list, &sdp->sd_jindex_list);
323 spin_unlock(&sdp->sd_jindex_spin);
324 }
325
326 mutex_unlock(&sdp->sd_jindex_mutex);
327
328 return error;
329}
330
331/**
332 * gfs2_jindex_free - Clear all the journal index information
333 * @sdp: The GFS2 superblock
334 *
335 */
336
337void gfs2_jindex_free(struct gfs2_sbd *sdp)
338{
339 struct list_head list;
340 struct gfs2_jdesc *jd;
341
342 spin_lock(&sdp->sd_jindex_spin);
343 list_add(&list, &sdp->sd_jindex_list);
344 list_del_init(&sdp->sd_jindex_list);
345 sdp->sd_journals = 0;
346 spin_unlock(&sdp->sd_jindex_spin);
347
348 while (!list_empty(&list)) {
349 jd = list_entry(list.next, struct gfs2_jdesc, jd_list);
350 list_del(&jd->jd_list);
351 iput(jd->jd_inode);
352 kfree(jd);
353 }
354}
355
356static struct gfs2_jdesc *jdesc_find_i(struct list_head *head, unsigned int jid)
357{
358 struct gfs2_jdesc *jd;
359 int found = 0;
360
361 list_for_each_entry(jd, head, jd_list) {
362 if (jd->jd_jid == jid) {
363 found = 1;
364 break;
365 }
366 }
367
368 if (!found)
369 jd = NULL;
370
371 return jd;
372}
373
374struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid)
375{
376 struct gfs2_jdesc *jd;
377
378 spin_lock(&sdp->sd_jindex_spin);
379 jd = jdesc_find_i(&sdp->sd_jindex_list, jid);
380 spin_unlock(&sdp->sd_jindex_spin);
381
382 return jd;
383}
384
385void gfs2_jdesc_make_dirty(struct gfs2_sbd *sdp, unsigned int jid)
386{
387 struct gfs2_jdesc *jd;
388
389 spin_lock(&sdp->sd_jindex_spin);
390 jd = jdesc_find_i(&sdp->sd_jindex_list, jid);
391 if (jd)
392 jd->jd_dirty = 1;
393 spin_unlock(&sdp->sd_jindex_spin);
394}
395
396struct gfs2_jdesc *gfs2_jdesc_find_dirty(struct gfs2_sbd *sdp)
397{
398 struct gfs2_jdesc *jd;
399 int found = 0;
400
401 spin_lock(&sdp->sd_jindex_spin);
402
403 list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
404 if (jd->jd_dirty) {
405 jd->jd_dirty = 0;
406 found = 1;
407 break;
408 }
409 }
410 spin_unlock(&sdp->sd_jindex_spin);
411
412 if (!found)
413 jd = NULL;
414
415 return jd;
416}
417
418int gfs2_jdesc_check(struct gfs2_jdesc *jd)
419{
420 struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
421 struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
422 int ar;
423 int error;
424
425 if (ip->i_di.di_size < (8 << 20) || ip->i_di.di_size > (1 << 30) ||
426 (ip->i_di.di_size & (sdp->sd_sb.sb_bsize - 1))) {
427 gfs2_consist_inode(ip);
428 return -EIO;
429 }
430 jd->jd_blocks = ip->i_di.di_size >> sdp->sd_sb.sb_bsize_shift;
431
432 error = gfs2_write_alloc_required(ip, 0, ip->i_di.di_size, &ar);
433 if (!error && ar) {
434 gfs2_consist_inode(ip);
435 error = -EIO;
436 }
437
438 return error;
439}
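In other words, a journal must be at least 8 MiB (8 << 20 bytes), at most
1 GiB, an exact multiple of the filesystem block size, and fully allocated.
For example, a 128 MiB journal on a 4 KiB-block filesystem gives
jd_blocks = 134217728 >> 12 = 32768.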
440
441/**
442 * gfs2_make_fs_rw - Turn a Read-Only FS into a Read-Write one
443 * @sdp: the filesystem
444 *
445 * Returns: errno
446 */
447
448int gfs2_make_fs_rw(struct gfs2_sbd *sdp)
449{
450 struct gfs2_inode *ip = GFS2_I(sdp->sd_jdesc->jd_inode);
451 struct gfs2_glock *j_gl = ip->i_gl;
452 struct gfs2_holder t_gh;
453 struct gfs2_log_header head;
454 int error;
455
456 error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED,
457 GL_LOCAL_EXCL, &t_gh);
458 if (error)
459 return error;
460
461 gfs2_meta_cache_flush(ip);
462 j_gl->gl_ops->go_inval(j_gl, DIO_METADATA | DIO_DATA);
463
464 error = gfs2_find_jhead(sdp->sd_jdesc, &head);
465 if (error)
466 goto fail;
467
468 if (!(head.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) {
469 gfs2_consist(sdp);
470 error = -EIO;
471 goto fail;
472 }
473
474 /* Initialize the log head */
475 sdp->sd_log_sequence = head.lh_sequence + 1;
476 gfs2_log_pointers_init(sdp, head.lh_blkno);
477
478 error = gfs2_quota_init(sdp);
479 if (error)
480 goto fail_unlinked;
481
482 set_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags);
483
484 gfs2_glock_dq_uninit(&t_gh);
485
486 return 0;
487
488 fail_unlinked:
489
490 fail:
491 t_gh.gh_flags |= GL_NOCACHE;
492 gfs2_glock_dq_uninit(&t_gh);
493
494 return error;
495}
496
497/**
498 * gfs2_make_fs_ro - Turn a Read-Write FS into a Read-Only one
499 * @sdp: the filesystem
500 *
501 * Returns: errno
502 */
503
504int gfs2_make_fs_ro(struct gfs2_sbd *sdp)
505{
506 struct gfs2_holder t_gh;
507 int error;
508
509 gfs2_quota_sync(sdp);
510 gfs2_statfs_sync(sdp);
511
512 error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED,
513 GL_LOCAL_EXCL | GL_NOCACHE,
514 &t_gh);
515 if (error && !test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
516 return error;
517
518 gfs2_meta_syncfs(sdp);
519 gfs2_log_shutdown(sdp);
520
521 clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags);
522
523 if (t_gh.gh_gl)
524 gfs2_glock_dq_uninit(&t_gh);
525
526 gfs2_quota_cleanup(sdp);
527
528 return error;
529}
530
531int gfs2_statfs_init(struct gfs2_sbd *sdp)
532{
533 struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
534 struct gfs2_statfs_change *m_sc = &sdp->sd_statfs_master;
535 struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode);
536 struct gfs2_statfs_change *l_sc = &sdp->sd_statfs_local;
537 struct buffer_head *m_bh, *l_bh;
538 struct gfs2_holder gh;
539 int error;
540
541 error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE, GL_NOCACHE,
542 &gh);
543 if (error)
544 return error;
545
546 error = gfs2_meta_inode_buffer(m_ip, &m_bh);
547 if (error)
548 goto out;
549
550 if (sdp->sd_args.ar_spectator) {
551 spin_lock(&sdp->sd_statfs_spin);
552 gfs2_statfs_change_in(m_sc, m_bh->b_data +
553 sizeof(struct gfs2_dinode));
554 spin_unlock(&sdp->sd_statfs_spin);
555 } else {
556 error = gfs2_meta_inode_buffer(l_ip, &l_bh);
557 if (error)
558 goto out_m_bh;
559
560 spin_lock(&sdp->sd_statfs_spin);
561 gfs2_statfs_change_in(m_sc, m_bh->b_data +
562 sizeof(struct gfs2_dinode));
563 gfs2_statfs_change_in(l_sc, l_bh->b_data +
564 sizeof(struct gfs2_dinode));
565 spin_unlock(&sdp->sd_statfs_spin);
566
567 brelse(l_bh);
568 }
569
570 out_m_bh:
571 brelse(m_bh);
572
573 out:
574 gfs2_glock_dq_uninit(&gh);
575
576 return 0;
577}
578
579void gfs2_statfs_change(struct gfs2_sbd *sdp, int64_t total, int64_t free,
580 int64_t dinodes)
581{
582 struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode);
583 struct gfs2_statfs_change *l_sc = &sdp->sd_statfs_local;
584 struct buffer_head *l_bh;
585 int error;
586
587 error = gfs2_meta_inode_buffer(l_ip, &l_bh);
588 if (error)
589 return;
590
591 mutex_lock(&sdp->sd_statfs_mutex);
592 gfs2_trans_add_bh(l_ip->i_gl, l_bh, 1);
593 mutex_unlock(&sdp->sd_statfs_mutex);
594
595 spin_lock(&sdp->sd_statfs_spin);
596 l_sc->sc_total += total;
597 l_sc->sc_free += free;
598 l_sc->sc_dinodes += dinodes;
599 gfs2_statfs_change_out(l_sc, l_bh->b_data +
600 sizeof(struct gfs2_dinode));
601 spin_unlock(&sdp->sd_statfs_spin);
602
603 brelse(l_bh);
604}
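A hypothetical caller that has just allocated one block to hold a new dinode
would account for it with the deltas below: the total block count is
unchanged, there is one less free block, and one more dinode.  Sketch only,
not a line from the patch.

	gfs2_statfs_change(sdp, 0, -1, +1);	/* total, free, dinodes */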
605
606int gfs2_statfs_sync(struct gfs2_sbd *sdp)
607{
608 struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
609 struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode);
610 struct gfs2_statfs_change *m_sc = &sdp->sd_statfs_master;
611 struct gfs2_statfs_change *l_sc = &sdp->sd_statfs_local;
612 struct gfs2_holder gh;
613 struct buffer_head *m_bh, *l_bh;
614 int error;
615
616 error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE, GL_NOCACHE,
617 &gh);
618 if (error)
619 return error;
620
621 error = gfs2_meta_inode_buffer(m_ip, &m_bh);
622 if (error)
623 goto out;
624
625 spin_lock(&sdp->sd_statfs_spin);
626 gfs2_statfs_change_in(m_sc, m_bh->b_data +
627 sizeof(struct gfs2_dinode));
628 if (!l_sc->sc_total && !l_sc->sc_free && !l_sc->sc_dinodes) {
629 spin_unlock(&sdp->sd_statfs_spin);
630 goto out_bh;
631 }
632 spin_unlock(&sdp->sd_statfs_spin);
633
634 error = gfs2_meta_inode_buffer(l_ip, &l_bh);
635 if (error)
636 goto out_bh;
637
638 error = gfs2_trans_begin(sdp, 2 * RES_DINODE, 0);
639 if (error)
640 goto out_bh2;
641
642 mutex_lock(&sdp->sd_statfs_mutex);
643 gfs2_trans_add_bh(l_ip->i_gl, l_bh, 1);
644 mutex_unlock(&sdp->sd_statfs_mutex);
645
646 spin_lock(&sdp->sd_statfs_spin);
647 m_sc->sc_total += l_sc->sc_total;
648 m_sc->sc_free += l_sc->sc_free;
649 m_sc->sc_dinodes += l_sc->sc_dinodes;
650 memset(l_sc, 0, sizeof(struct gfs2_statfs_change));
651 memset(l_bh->b_data + sizeof(struct gfs2_dinode),
652 0, sizeof(struct gfs2_statfs_change));
653 spin_unlock(&sdp->sd_statfs_spin);
654
655 gfs2_trans_add_bh(m_ip->i_gl, m_bh, 1);
656 gfs2_statfs_change_out(m_sc, m_bh->b_data + sizeof(struct gfs2_dinode));
657
658 gfs2_trans_end(sdp);
659
660 out_bh2:
661 brelse(l_bh);
662
663 out_bh:
664 brelse(m_bh);
665
666 out:
667 gfs2_glock_dq_uninit(&gh);
668
669 return error;
670}
671
672/**
673 * gfs2_statfs_i - Do a statfs
674 * @sdp: the filesystem
675 * @sc: the sc structure to fill in
676 *
677 * Returns: errno
678 */
679
680int gfs2_statfs_i(struct gfs2_sbd *sdp, struct gfs2_statfs_change *sc)
681{
682 struct gfs2_statfs_change *m_sc = &sdp->sd_statfs_master;
683 struct gfs2_statfs_change *l_sc = &sdp->sd_statfs_local;
684
685 spin_lock(&sdp->sd_statfs_spin);
686
687 *sc = *m_sc;
688 sc->sc_total += l_sc->sc_total;
689 sc->sc_free += l_sc->sc_free;
690 sc->sc_dinodes += l_sc->sc_dinodes;
691
692 spin_unlock(&sdp->sd_statfs_spin);
693
694 if (sc->sc_free < 0)
695 sc->sc_free = 0;
696 if (sc->sc_free > sc->sc_total)
697 sc->sc_free = sc->sc_total;
698 if (sc->sc_dinodes < 0)
699 sc->sc_dinodes = 0;
700
701 return 0;
702}
703
704/**
705 * statfs_slow_fill - fill in the sc for a given RG
706 * @rgd: the RG
707 * @sc: the sc structure
708 *
709 * Returns: 0 on success
710 */
711
712static int statfs_slow_fill(struct gfs2_rgrpd *rgd,
713 struct gfs2_statfs_change *sc)
714{
715 gfs2_rgrp_verify(rgd);
716 sc->sc_total += rgd->rd_ri.ri_data;
717 sc->sc_free += rgd->rd_rg.rg_free;
718 sc->sc_dinodes += rgd->rd_rg.rg_dinodes;
719 return 0;
720}
721
722/**
723 * gfs2_statfs_slow - Stat a filesystem using asynchronous locking
724 * @sdp: the filesystem
725 * @sc: the sc info that will be returned
726 *
727 * Any error (other than a signal) will cause this routine to fall back
728 * to the synchronous version.
729 *
730 * FIXME: This really shouldn't busy wait like this.
731 *
732 * Returns: errno
733 */
734
735int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change *sc)
736{
737 struct gfs2_holder ri_gh;
738 struct gfs2_rgrpd *rgd_next;
739 struct gfs2_holder *gha, *gh;
740 unsigned int slots = 64;
741 unsigned int x;
742 int done;
743 int error = 0, err;
744
745 memset(sc, 0, sizeof(struct gfs2_statfs_change));
746 gha = kcalloc(slots, sizeof(struct gfs2_holder), GFP_KERNEL);
747 if (!gha)
748 return -ENOMEM;
749
750 error = gfs2_rindex_hold(sdp, &ri_gh);
751 if (error)
752 goto out;
753
754 rgd_next = gfs2_rgrpd_get_first(sdp);
755
756 for (;;) {
757 done = 1;
758
759 for (x = 0; x < slots; x++) {
760 gh = gha + x;
761
762 if (gh->gh_gl && gfs2_glock_poll(gh)) {
763 err = gfs2_glock_wait(gh);
764 if (err) {
765 gfs2_holder_uninit(gh);
766 error = err;
767 } else {
768 if (!error)
769 error = statfs_slow_fill(
770 gh->gh_gl->gl_object, sc);
771 gfs2_glock_dq_uninit(gh);
772 }
773 }
774
775 if (gh->gh_gl)
776 done = 0;
777 else if (rgd_next && !error) {
778 error = gfs2_glock_nq_init(rgd_next->rd_gl,
779 LM_ST_SHARED,
780 GL_ASYNC,
781 gh);
782 rgd_next = gfs2_rgrpd_get_next(rgd_next);
783 done = 0;
784 }
785
786 if (signal_pending(current))
787 error = -ERESTARTSYS;
788 }
789
790 if (done)
791 break;
792
793 yield();
794 }
795
796 gfs2_glock_dq_uninit(&ri_gh);
797
798 out:
799 kfree(gha);
800
801 return error;
802}
803
804struct lfcc {
805 struct list_head list;
806 struct gfs2_holder gh;
807};
808
809/**
810 * gfs2_lock_fs_check_clean - Stop all writes to the FS and check that all
811 * journals are clean
812 * @sdp: the file system
814 * @t_gh: the hold on the transaction lock
815 *
816 * Returns: errno
817 */
818
819static int gfs2_lock_fs_check_clean(struct gfs2_sbd *sdp,
820 struct gfs2_holder *t_gh)
821{
822 struct gfs2_inode *ip;
823 struct gfs2_holder ji_gh;
824 struct gfs2_jdesc *jd;
825 struct lfcc *lfcc;
826 LIST_HEAD(list);
827 struct gfs2_log_header lh;
828 int error;
829
830 error = gfs2_jindex_hold(sdp, &ji_gh);
831 if (error)
832 return error;
833
834 list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
835 lfcc = kmalloc(sizeof(struct lfcc), GFP_KERNEL);
836 if (!lfcc) {
837 error = -ENOMEM;
838 goto out;
839 }
840 ip = GFS2_I(jd->jd_inode);
841 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &lfcc->gh);
842 if (error) {
843 kfree(lfcc);
844 goto out;
845 }
846 list_add(&lfcc->list, &list);
847 }
848
849 error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_DEFERRED,
850 LM_FLAG_PRIORITY | GL_NOCACHE,
851 t_gh);
852
853 list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
854 error = gfs2_jdesc_check(jd);
855 if (error)
856 break;
857 error = gfs2_find_jhead(jd, &lh);
858 if (error)
859 break;
860 if (!(lh.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) {
861 error = -EBUSY;
862 break;
863 }
864 }
865
866 if (error)
867 gfs2_glock_dq_uninit(t_gh);
868
869 out:
870 while (!list_empty(&list)) {
871 lfcc = list_entry(list.next, struct lfcc, list);
872 list_del(&lfcc->list);
873 gfs2_glock_dq_uninit(&lfcc->gh);
874 kfree(lfcc);
875 }
876 gfs2_glock_dq_uninit(&ji_gh);
877
878 return error;
879}
880
881/**
882 * gfs2_freeze_fs - freezes the file system
883 * @sdp: the file system
884 *
885 * This function flushes data and metadata for all machines by
886 * acquiring the transaction lock exclusively. It also checks that
887 * all journals are in a clean state.
888 *
889 * Returns: errno
890 */
891
892int gfs2_freeze_fs(struct gfs2_sbd *sdp)
893{
894 int error = 0;
895
896 mutex_lock(&sdp->sd_freeze_lock);
897
898 if (!sdp->sd_freeze_count++) {
899 error = gfs2_lock_fs_check_clean(sdp, &sdp->sd_freeze_gh);
900 if (error)
901 sdp->sd_freeze_count--;
902 }
903
904 mutex_unlock(&sdp->sd_freeze_lock);
905
906 return error;
907}
908
909/**
910 * gfs2_unfreeze_fs - unfreezes the file system
911 * @sdp: the file system
912 *
913 * This function allows the file system to proceed by unlocking
914 * the exclusively held transaction lock. Other GFS2 nodes are
915 * now free to acquire the lock shared and go on with their lives.
916 *
917 */
918
919void gfs2_unfreeze_fs(struct gfs2_sbd *sdp)
920{
921 mutex_lock(&sdp->sd_freeze_lock);
922
923 if (sdp->sd_freeze_count && !--sdp->sd_freeze_count)
924 gfs2_glock_dq_uninit(&sdp->sd_freeze_gh);
925
926 mutex_unlock(&sdp->sd_freeze_lock);
927}
928
diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h
new file mode 100644
index 000000000000..df2495230402
--- /dev/null
+++ b/fs/gfs2/super.h
@@ -0,0 +1,52 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __SUPER_DOT_H__
11#define __SUPER_DOT_H__
12
13void gfs2_tune_init(struct gfs2_tune *gt);
14
15int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb *sb, int silent);
16int gfs2_read_sb(struct gfs2_sbd *sdp, struct gfs2_glock *gl, int silent);
17
18static inline unsigned int gfs2_jindex_size(struct gfs2_sbd *sdp)
19{
20 unsigned int x;
21 spin_lock(&sdp->sd_jindex_spin);
22 x = sdp->sd_journals;
23 spin_unlock(&sdp->sd_jindex_spin);
24 return x;
25}
26
27int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh);
28void gfs2_jindex_free(struct gfs2_sbd *sdp);
29
30struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid);
31void gfs2_jdesc_make_dirty(struct gfs2_sbd *sdp, unsigned int jid);
32struct gfs2_jdesc *gfs2_jdesc_find_dirty(struct gfs2_sbd *sdp);
33int gfs2_jdesc_check(struct gfs2_jdesc *jd);
34
35int gfs2_lookup_in_master_dir(struct gfs2_sbd *sdp, char *filename,
36 struct gfs2_inode **ipp);
37
38int gfs2_make_fs_rw(struct gfs2_sbd *sdp);
39int gfs2_make_fs_ro(struct gfs2_sbd *sdp);
40
41int gfs2_statfs_init(struct gfs2_sbd *sdp);
42void gfs2_statfs_change(struct gfs2_sbd *sdp,
43 int64_t total, int64_t free, int64_t dinodes);
44int gfs2_statfs_sync(struct gfs2_sbd *sdp);
45int gfs2_statfs_i(struct gfs2_sbd *sdp, struct gfs2_statfs_change *sc);
46int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change *sc);
47
48int gfs2_freeze_fs(struct gfs2_sbd *sdp);
49void gfs2_unfreeze_fs(struct gfs2_sbd *sdp);
50
51#endif /* __SUPER_DOT_H__ */
52
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
new file mode 100644
index 000000000000..3c4cb4558905
--- /dev/null
+++ b/fs/gfs2/sys.c
@@ -0,0 +1,579 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/module.h>
16#include <linux/kobject.h>
17#include <linux/gfs2_ondisk.h>
18#include <asm/uaccess.h>
19
20#include "gfs2.h"
21#include "lm_interface.h"
22#include "incore.h"
23#include "lm.h"
24#include "sys.h"
25#include "super.h"
26#include "glock.h"
27#include "quota.h"
28#include "util.h"
29
30char *gfs2_sys_margs;
31spinlock_t gfs2_sys_margs_lock;
32
33static ssize_t id_show(struct gfs2_sbd *sdp, char *buf)
34{
35 return sprintf(buf, "%s\n", sdp->sd_vfs->s_id);
36}
37
38static ssize_t fsname_show(struct gfs2_sbd *sdp, char *buf)
39{
40 return sprintf(buf, "%s\n", sdp->sd_fsname);
41}
42
43static ssize_t freeze_show(struct gfs2_sbd *sdp, char *buf)
44{
45 unsigned int count;
46
47 mutex_lock(&sdp->sd_freeze_lock);
48 count = sdp->sd_freeze_count;
49 mutex_unlock(&sdp->sd_freeze_lock);
50
51 return sprintf(buf, "%u\n", count);
52}
53
54static ssize_t freeze_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
55{
56 ssize_t ret = len;
57 int error = 0;
58 int n = simple_strtol(buf, NULL, 0);
59
60 if (!capable(CAP_SYS_ADMIN))
61 return -EACCES;
62
63 switch (n) {
64 case 0:
65 gfs2_unfreeze_fs(sdp);
66 break;
67 case 1:
68 error = gfs2_freeze_fs(sdp);
69 break;
70 default:
71 ret = -EINVAL;
72 }
73
74 if (error)
75 fs_warn(sdp, "freeze %d error %d", n, error);
76
77 return ret;
78}
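Given the kset and kobject naming set up later in this file (gfs2_kset and
gfs2_sys_fs_add()), the attribute above ends up at
/sys/fs/gfs2/<locktable>/freeze.  A minimal userspace sketch of driving it,
assuming a made-up lock table name; editor's illustration, not part of the
patch:

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	/* write "1" to freeze the filesystem, "0" to thaw it */
	static int gfs2_set_freeze(const char *locktable, int on)
	{
		char path[128];
		int fd, ok;

		snprintf(path, sizeof(path), "/sys/fs/gfs2/%s/freeze", locktable);
		fd = open(path, O_WRONLY);
		if (fd < 0)
			return -1;
		ok = (write(fd, on ? "1" : "0", 1) == 1);
		close(fd);
		return ok ? 0 : -1;
	}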
79
80static ssize_t withdraw_show(struct gfs2_sbd *sdp, char *buf)
81{
82 unsigned int b = test_bit(SDF_SHUTDOWN, &sdp->sd_flags);
83 return sprintf(buf, "%u\n", b);
84}
85
86static ssize_t withdraw_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
87{
88 if (!capable(CAP_SYS_ADMIN))
89 return -EACCES;
90
91 if (simple_strtol(buf, NULL, 0) != 1)
92 return -EINVAL;
93
94 gfs2_lm_withdraw(sdp,
95 "GFS2: fsid=%s: withdrawing from cluster at user's request\n",
96 sdp->sd_fsname);
97 return len;
98}
99
100static ssize_t statfs_sync_store(struct gfs2_sbd *sdp, const char *buf,
101 size_t len)
102{
103 if (!capable(CAP_SYS_ADMIN))
104 return -EACCES;
105
106 if (simple_strtol(buf, NULL, 0) != 1)
107 return -EINVAL;
108
109 gfs2_statfs_sync(sdp);
110 return len;
111}
112
113static ssize_t shrink_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
114{
115 if (!capable(CAP_SYS_ADMIN))
116 return -EACCES;
117
118 if (simple_strtol(buf, NULL, 0) != 1)
119 return -EINVAL;
120
121 gfs2_gl_hash_clear(sdp, NO_WAIT);
122 return len;
123}
124
125static ssize_t quota_sync_store(struct gfs2_sbd *sdp, const char *buf,
126 size_t len)
127{
128 if (!capable(CAP_SYS_ADMIN))
129 return -EACCES;
130
131 if (simple_strtol(buf, NULL, 0) != 1)
132 return -EINVAL;
133
134 gfs2_quota_sync(sdp);
135 return len;
136}
137
138static ssize_t quota_refresh_user_store(struct gfs2_sbd *sdp, const char *buf,
139 size_t len)
140{
141 uint32_t id;
142
143 if (!capable(CAP_SYS_ADMIN))
144 return -EACCES;
145
146 id = simple_strtoul(buf, NULL, 0);
147
148 gfs2_quota_refresh(sdp, 1, id);
149 return len;
150}
151
152static ssize_t quota_refresh_group_store(struct gfs2_sbd *sdp, const char *buf,
153 size_t len)
154{
155 uint32_t id;
156
157 if (!capable(CAP_SYS_ADMIN))
158 return -EACCES;
159
160 id = simple_strtoul(buf, NULL, 0);
161
162 gfs2_quota_refresh(sdp, 0, id);
163 return len;
164}
165
166struct gfs2_attr {
167 struct attribute attr;
168 ssize_t (*show)(struct gfs2_sbd *, char *);
169 ssize_t (*store)(struct gfs2_sbd *, const char *, size_t);
170};
171
172#define GFS2_ATTR(name, mode, show, store) \
173static struct gfs2_attr gfs2_attr_##name = __ATTR(name, mode, show, store)
174
175GFS2_ATTR(id, 0444, id_show, NULL);
176GFS2_ATTR(fsname, 0444, fsname_show, NULL);
177GFS2_ATTR(freeze, 0644, freeze_show, freeze_store);
178GFS2_ATTR(shrink, 0200, NULL, shrink_store);
179GFS2_ATTR(withdraw, 0644, withdraw_show, withdraw_store);
180GFS2_ATTR(statfs_sync, 0200, NULL, statfs_sync_store);
181GFS2_ATTR(quota_sync, 0200, NULL, quota_sync_store);
182GFS2_ATTR(quota_refresh_user, 0200, NULL, quota_refresh_user_store);
183GFS2_ATTR(quota_refresh_group, 0200, NULL, quota_refresh_group_store);
184
185static struct attribute *gfs2_attrs[] = {
186 &gfs2_attr_id.attr,
187 &gfs2_attr_fsname.attr,
188 &gfs2_attr_freeze.attr,
189 &gfs2_attr_shrink.attr,
190 &gfs2_attr_withdraw.attr,
191 &gfs2_attr_statfs_sync.attr,
192 &gfs2_attr_quota_sync.attr,
193 &gfs2_attr_quota_refresh_user.attr,
194 &gfs2_attr_quota_refresh_group.attr,
195 NULL,
196};
197
198static ssize_t gfs2_attr_show(struct kobject *kobj, struct attribute *attr,
199 char *buf)
200{
201 struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj);
202 struct gfs2_attr *a = container_of(attr, struct gfs2_attr, attr);
203 return a->show ? a->show(sdp, buf) : 0;
204}
205
206static ssize_t gfs2_attr_store(struct kobject *kobj, struct attribute *attr,
207 const char *buf, size_t len)
208{
209 struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj);
210 struct gfs2_attr *a = container_of(attr, struct gfs2_attr, attr);
211 return a->store ? a->store(sdp, buf, len) : len;
212}
213
214static struct sysfs_ops gfs2_attr_ops = {
215 .show = gfs2_attr_show,
216 .store = gfs2_attr_store,
217};
218
219static struct kobj_type gfs2_ktype = {
220 .default_attrs = gfs2_attrs,
221 .sysfs_ops = &gfs2_attr_ops,
222};
223
224static struct kset gfs2_kset = {
225 .subsys = &fs_subsys,
226 .kobj = {.name = "gfs2",},
227 .ktype = &gfs2_ktype,
228};
229
230/*
231 * display struct lm_lockstruct fields
232 */
233
234struct lockstruct_attr {
235 struct attribute attr;
236 ssize_t (*show)(struct gfs2_sbd *, char *);
237};
238
239#define LOCKSTRUCT_ATTR(name, fmt) \
240static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf) \
241{ \
242 return sprintf(buf, fmt, sdp->sd_lockstruct.ls_##name); \
243} \
244static struct lockstruct_attr lockstruct_attr_##name = __ATTR_RO(name)
245
246LOCKSTRUCT_ATTR(jid, "%u\n");
247LOCKSTRUCT_ATTR(first, "%u\n");
248LOCKSTRUCT_ATTR(lvb_size, "%u\n");
249LOCKSTRUCT_ATTR(flags, "%d\n");
250
251static struct attribute *lockstruct_attrs[] = {
252 &lockstruct_attr_jid.attr,
253 &lockstruct_attr_first.attr,
254 &lockstruct_attr_lvb_size.attr,
255 &lockstruct_attr_flags.attr,
256 NULL
257};
258
259/*
260 * display struct gfs2_args fields
261 */
262
263struct args_attr {
264 struct attribute attr;
265 ssize_t (*show)(struct gfs2_sbd *, char *);
266};
267
268#define ARGS_ATTR(name, fmt) \
269static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf) \
270{ \
271 return sprintf(buf, fmt, sdp->sd_args.ar_##name); \
272} \
273static struct args_attr args_attr_##name = __ATTR_RO(name)
274
275ARGS_ATTR(lockproto, "%s\n");
276ARGS_ATTR(locktable, "%s\n");
277ARGS_ATTR(hostdata, "%s\n");
278ARGS_ATTR(spectator, "%d\n");
279ARGS_ATTR(ignore_local_fs, "%d\n");
280ARGS_ATTR(localcaching, "%d\n");
281ARGS_ATTR(localflocks, "%d\n");
282ARGS_ATTR(debug, "%d\n");
283ARGS_ATTR(upgrade, "%d\n");
284ARGS_ATTR(num_glockd, "%u\n");
285ARGS_ATTR(posix_acl, "%d\n");
286ARGS_ATTR(quota, "%u\n");
287ARGS_ATTR(suiddir, "%d\n");
288ARGS_ATTR(data, "%d\n");
289
290/* one oddball doesn't fit the macro mold */
291static ssize_t noatime_show(struct gfs2_sbd *sdp, char *buf)
292{
293 return sprintf(buf, "%d\n", !!test_bit(SDF_NOATIME, &sdp->sd_flags));
294}
295static struct args_attr args_attr_noatime = __ATTR_RO(noatime);
296
297static struct attribute *args_attrs[] = {
298 &args_attr_lockproto.attr,
299 &args_attr_locktable.attr,
300 &args_attr_hostdata.attr,
301 &args_attr_spectator.attr,
302 &args_attr_ignore_local_fs.attr,
303 &args_attr_localcaching.attr,
304 &args_attr_localflocks.attr,
305 &args_attr_debug.attr,
306 &args_attr_upgrade.attr,
307 &args_attr_num_glockd.attr,
308 &args_attr_posix_acl.attr,
309 &args_attr_quota.attr,
310 &args_attr_suiddir.attr,
311 &args_attr_data.attr,
312 &args_attr_noatime.attr,
313 NULL
314};
315
316/*
317 * display counters from superblock
318 */
319
320struct counters_attr {
321 struct attribute attr;
322 ssize_t (*show)(struct gfs2_sbd *, char *);
323};
324
325#define COUNTERS_ATTR(name, fmt) \
326static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf) \
327{ \
328 return sprintf(buf, fmt, (unsigned int)atomic_read(&sdp->sd_##name)); \
329} \
330static struct counters_attr counters_attr_##name = __ATTR_RO(name)
331
332COUNTERS_ATTR(glock_count, "%u\n");
333COUNTERS_ATTR(glock_held_count, "%u\n");
334COUNTERS_ATTR(inode_count, "%u\n");
335COUNTERS_ATTR(reclaimed, "%u\n");
336
337static struct attribute *counters_attrs[] = {
338 &counters_attr_glock_count.attr,
339 &counters_attr_glock_held_count.attr,
340 &counters_attr_inode_count.attr,
341 &counters_attr_reclaimed.attr,
342 NULL
343};
344
345/*
346 * get and set struct gfs2_tune fields
347 */
348
349static ssize_t quota_scale_show(struct gfs2_sbd *sdp, char *buf)
350{
351 return sprintf(buf, "%u %u\n", sdp->sd_tune.gt_quota_scale_num,
352 sdp->sd_tune.gt_quota_scale_den);
353}
354
355static ssize_t quota_scale_store(struct gfs2_sbd *sdp, const char *buf,
356 size_t len)
357{
358 struct gfs2_tune *gt = &sdp->sd_tune;
359 unsigned int x, y;
360
361 if (!capable(CAP_SYS_ADMIN))
362 return -EACCES;
363
364 if (sscanf(buf, "%u %u", &x, &y) != 2 || !y)
365 return -EINVAL;
366
367 spin_lock(&gt->gt_spin);
368 gt->gt_quota_scale_num = x;
369 gt->gt_quota_scale_den = y;
370 spin_unlock(&gt->gt_spin);
371 return len;
372}
373
374static ssize_t tune_set(struct gfs2_sbd *sdp, unsigned int *field,
375 int check_zero, const char *buf, size_t len)
376{
377 struct gfs2_tune *gt = &sdp->sd_tune;
378 unsigned int x;
379
380 if (!capable(CAP_SYS_ADMIN))
381 return -EACCES;
382
383 x = simple_strtoul(buf, NULL, 0);
384
385 if (check_zero && !x)
386 return -EINVAL;
387
388 spin_lock(&gt->gt_spin);
389 *field = x;
390 spin_unlock(&gt->gt_spin);
391 return len;
392}
393
394struct tune_attr {
395 struct attribute attr;
396 ssize_t (*show)(struct gfs2_sbd *, char *);
397 ssize_t (*store)(struct gfs2_sbd *, const char *, size_t);
398};
399
400#define TUNE_ATTR_3(name, show, store) \
401static struct tune_attr tune_attr_##name = __ATTR(name, 0644, show, store)
402
403#define TUNE_ATTR_2(name, store) \
404static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf) \
405{ \
406 return sprintf(buf, "%u\n", sdp->sd_tune.gt_##name); \
407} \
408TUNE_ATTR_3(name, name##_show, store)
409
410#define TUNE_ATTR(name, check_zero) \
411static ssize_t name##_store(struct gfs2_sbd *sdp, const char *buf, size_t len)\
412{ \
413 return tune_set(sdp, &sdp->sd_tune.gt_##name, check_zero, buf, len); \
414} \
415TUNE_ATTR_2(name, name##_store)
416
417#define TUNE_ATTR_DAEMON(name, process) \
418static ssize_t name##_store(struct gfs2_sbd *sdp, const char *buf, size_t len)\
419{ \
420 ssize_t r = tune_set(sdp, &sdp->sd_tune.gt_##name, 1, buf, len); \
421 wake_up_process(sdp->sd_##process); \
422 return r; \
423} \
424TUNE_ATTR_2(name, name##_store)
425
426TUNE_ATTR(ilimit, 0);
427TUNE_ATTR(ilimit_tries, 0);
428TUNE_ATTR(ilimit_min, 0);
429TUNE_ATTR(demote_secs, 0);
430TUNE_ATTR(incore_log_blocks, 0);
431TUNE_ATTR(log_flush_secs, 0);
432TUNE_ATTR(jindex_refresh_secs, 0);
433TUNE_ATTR(quota_warn_period, 0);
434TUNE_ATTR(quota_quantum, 0);
435TUNE_ATTR(atime_quantum, 0);
436TUNE_ATTR(max_readahead, 0);
437TUNE_ATTR(complain_secs, 0);
438TUNE_ATTR(reclaim_limit, 0);
439TUNE_ATTR(prefetch_secs, 0);
440TUNE_ATTR(statfs_slow, 0);
441TUNE_ATTR(new_files_jdata, 0);
442TUNE_ATTR(new_files_directio, 0);
443TUNE_ATTR(quota_simul_sync, 1);
444TUNE_ATTR(quota_cache_secs, 1);
445TUNE_ATTR(max_atomic_write, 1);
446TUNE_ATTR(stall_secs, 1);
447TUNE_ATTR(entries_per_readdir, 1);
448TUNE_ATTR(greedy_default, 1);
449TUNE_ATTR(greedy_quantum, 1);
450TUNE_ATTR(greedy_max, 1);
451TUNE_ATTR(statfs_quantum, 1);
452TUNE_ATTR_DAEMON(scand_secs, scand_process);
453TUNE_ATTR_DAEMON(recoverd_secs, recoverd_process);
454TUNE_ATTR_DAEMON(logd_secs, logd_process);
455TUNE_ATTR_DAEMON(quotad_secs, quotad_process);
456TUNE_ATTR_3(quota_scale, quota_scale_show, quota_scale_store);
457
458static struct attribute *tune_attrs[] = {
459 &tune_attr_ilimit.attr,
460 &tune_attr_ilimit_tries.attr,
461 &tune_attr_ilimit_min.attr,
462 &tune_attr_demote_secs.attr,
463 &tune_attr_incore_log_blocks.attr,
464 &tune_attr_log_flush_secs.attr,
465 &tune_attr_jindex_refresh_secs.attr,
466 &tune_attr_quota_warn_period.attr,
467 &tune_attr_quota_quantum.attr,
468 &tune_attr_atime_quantum.attr,
469 &tune_attr_max_readahead.attr,
470 &tune_attr_complain_secs.attr,
471 &tune_attr_reclaim_limit.attr,
472 &tune_attr_prefetch_secs.attr,
473 &tune_attr_statfs_slow.attr,
474 &tune_attr_quota_simul_sync.attr,
475 &tune_attr_quota_cache_secs.attr,
476 &tune_attr_max_atomic_write.attr,
477 &tune_attr_stall_secs.attr,
478 &tune_attr_entries_per_readdir.attr,
479 &tune_attr_greedy_default.attr,
480 &tune_attr_greedy_quantum.attr,
481 &tune_attr_greedy_max.attr,
482 &tune_attr_statfs_quantum.attr,
483 &tune_attr_scand_secs.attr,
484 &tune_attr_recoverd_secs.attr,
485 &tune_attr_logd_secs.attr,
486 &tune_attr_quotad_secs.attr,
487 &tune_attr_quota_scale.attr,
488 &tune_attr_new_files_jdata.attr,
489 &tune_attr_new_files_directio.attr,
490 NULL
491};
492
493static struct attribute_group lockstruct_group = {
494 .name = "lockstruct",
495 .attrs = lockstruct_attrs
496};
497
498static struct attribute_group counters_group = {
499 .name = "counters",
500 .attrs = counters_attrs
501};
502
503static struct attribute_group args_group = {
504 .name = "args",
505 .attrs = args_attrs
506};
507
508static struct attribute_group tune_group = {
509 .name = "tune",
510 .attrs = tune_attrs
511};
512
513int gfs2_sys_fs_add(struct gfs2_sbd *sdp)
514{
515 int error;
516
517 sdp->sd_kobj.kset = &gfs2_kset;
518 sdp->sd_kobj.ktype = &gfs2_ktype;
519
520 error = kobject_set_name(&sdp->sd_kobj, "%s", sdp->sd_table_name);
521 if (error)
522 goto fail;
523
524 error = kobject_register(&sdp->sd_kobj);
525 if (error)
526 goto fail;
527
528 error = sysfs_create_group(&sdp->sd_kobj, &lockstruct_group);
529 if (error)
530 goto fail_reg;
531
532 error = sysfs_create_group(&sdp->sd_kobj, &counters_group);
533 if (error)
534 goto fail_lockstruct;
535
536 error = sysfs_create_group(&sdp->sd_kobj, &args_group);
537 if (error)
538 goto fail_counters;
539
540 error = sysfs_create_group(&sdp->sd_kobj, &tune_group);
541 if (error)
542 goto fail_args;
543
544 return 0;
545
546 fail_args:
547 sysfs_remove_group(&sdp->sd_kobj, &args_group);
548 fail_counters:
549 sysfs_remove_group(&sdp->sd_kobj, &counters_group);
550 fail_lockstruct:
551 sysfs_remove_group(&sdp->sd_kobj, &lockstruct_group);
552 fail_reg:
553 kobject_unregister(&sdp->sd_kobj);
554 fail:
555 return error;
556}
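For a filesystem whose lock table is, say, mycluster:gfs01 (a made-up name),
a successful gfs2_sys_fs_add() therefore leaves this layout under sysfs:

	/sys/fs/gfs2/mycluster:gfs01/
		id  fsname  freeze  shrink  withdraw  statfs_sync
		quota_sync  quota_refresh_user  quota_refresh_group
		lockstruct/  counters/  args/  tune/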
557
558void gfs2_sys_fs_del(struct gfs2_sbd *sdp)
559{
560 sysfs_remove_group(&sdp->sd_kobj, &tune_group);
561 sysfs_remove_group(&sdp->sd_kobj, &args_group);
562 sysfs_remove_group(&sdp->sd_kobj, &counters_group);
563 sysfs_remove_group(&sdp->sd_kobj, &lockstruct_group);
564 kobject_unregister(&sdp->sd_kobj);
565}
566
567int gfs2_sys_init(void)
568{
569 gfs2_sys_margs = NULL;
570 spin_lock_init(&gfs2_sys_margs_lock);
571 return kset_register(&gfs2_kset);
572}
573
574void gfs2_sys_uninit(void)
575{
576 kfree(gfs2_sys_margs);
577 kset_unregister(&gfs2_kset);
578}
579
diff --git a/fs/gfs2/sys.h b/fs/gfs2/sys.h
new file mode 100644
index 000000000000..c46a700e801e
--- /dev/null
+++ b/fs/gfs2/sys.h
@@ -0,0 +1,24 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __SYS_DOT_H__
11#define __SYS_DOT_H__
12
13/* Allow args to be passed to GFS2 when using an initial ram disk */
14extern char *gfs2_sys_margs;
15extern spinlock_t gfs2_sys_margs_lock;
16
17int gfs2_sys_fs_add(struct gfs2_sbd *sdp);
18void gfs2_sys_fs_del(struct gfs2_sbd *sdp);
19
20int gfs2_sys_init(void);
21void gfs2_sys_uninit(void);
22
23#endif /* __SYS_DOT_H__ */
24
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
new file mode 100644
index 000000000000..05e0b72d56ff
--- /dev/null
+++ b/fs/gfs2/trans.c
@@ -0,0 +1,184 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/gfs2_ondisk.h>
16#include <linux/kallsyms.h>
17
18#include "gfs2.h"
19#include "lm_interface.h"
20#include "incore.h"
21#include "glock.h"
22#include "log.h"
23#include "lops.h"
24#include "meta_io.h"
25#include "trans.h"
26#include "util.h"
27
28int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks,
29 unsigned int revokes)
30{
31 struct gfs2_trans *tr;
32 int error;
33
34 BUG_ON(current->journal_info);
35 BUG_ON(blocks == 0 && revokes == 0);
36
37 tr = kzalloc(sizeof(struct gfs2_trans), GFP_NOFS);
38 if (!tr)
39 return -ENOMEM;
40
41 tr->tr_ip = (unsigned long)__builtin_return_address(0);
42 tr->tr_blocks = blocks;
43 tr->tr_revokes = revokes;
44 tr->tr_reserved = 1;
45 if (blocks)
46 tr->tr_reserved += 6 + blocks;
47 if (revokes)
48 tr->tr_reserved += gfs2_struct2blk(sdp, revokes,
49 sizeof(uint64_t));
50 INIT_LIST_HEAD(&tr->tr_list_buf);
51
52 gfs2_holder_init(sdp->sd_trans_gl, LM_ST_SHARED, 0, &tr->tr_t_gh);
53
54 error = gfs2_glock_nq(&tr->tr_t_gh);
55 if (error)
56 goto fail_holder_uninit;
57
58 if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
59 tr->tr_t_gh.gh_flags |= GL_NOCACHE;
60 error = -EROFS;
61 goto fail_gunlock;
62 }
63
64 error = gfs2_log_reserve(sdp, tr->tr_reserved);
65 if (error)
66 goto fail_gunlock;
67
68 current->journal_info = tr;
69
70 return 0;
71
72fail_gunlock:
73 gfs2_glock_dq(&tr->tr_t_gh);
74
75fail_holder_uninit:
76 gfs2_holder_uninit(&tr->tr_t_gh);
77 kfree(tr);
78
79 return error;
80}
81
82void gfs2_trans_end(struct gfs2_sbd *sdp)
83{
84 struct gfs2_trans *tr = current->journal_info;
85
86 BUG_ON(!tr);
87 current->journal_info = NULL;
88
89 if (!tr->tr_touched) {
90 gfs2_log_release(sdp, tr->tr_reserved);
91 gfs2_glock_dq(&tr->tr_t_gh);
92 gfs2_holder_uninit(&tr->tr_t_gh);
93 kfree(tr);
94 return;
95 }
96
97 if (gfs2_assert_withdraw(sdp, tr->tr_num_buf <= tr->tr_blocks)) {
98 fs_err(sdp, "tr_num_buf = %u, tr_blocks = %u ",
99 tr->tr_num_buf, tr->tr_blocks);
100 print_symbol(KERN_WARNING "GFS2: Transaction created at: %s\n", tr->tr_ip);
101 }
102 if (gfs2_assert_withdraw(sdp, tr->tr_num_revoke <= tr->tr_revokes)) {
103 fs_err(sdp, "tr_num_revoke = %u, tr_revokes = %u ",
104 tr->tr_num_revoke, tr->tr_revokes);
105 print_symbol(KERN_WARNING "GFS2: Transaction created at: %s\n", tr->tr_ip);
106 }
107
108 gfs2_log_commit(sdp, tr);
109 gfs2_glock_dq(&tr->tr_t_gh);
110 gfs2_holder_uninit(&tr->tr_t_gh);
111 kfree(tr);
112
113 if (sdp->sd_vfs->s_flags & MS_SYNCHRONOUS)
114 gfs2_log_flush(sdp, NULL);
115}
116
117void gfs2_trans_add_gl(struct gfs2_glock *gl)
118{
119 lops_add(gl->gl_sbd, &gl->gl_le);
120}
121
122/**
123 * gfs2_trans_add_bh - Add a to-be-modified buffer to the current transaction
124 * @gl: the glock the buffer belongs to
125 * @bh: The buffer to add
126 * @meta: True in the case of adding metadata
127 *
128 */
129
130void gfs2_trans_add_bh(struct gfs2_glock *gl, struct buffer_head *bh, int meta)
131{
132 struct gfs2_sbd *sdp = gl->gl_sbd;
133 struct gfs2_bufdata *bd;
134
135 bd = bh->b_private;
136 if (bd)
137 gfs2_assert(sdp, bd->bd_gl == gl);
138 else {
139 gfs2_attach_bufdata(gl, bh, meta);
140 bd = bh->b_private;
141 }
142 lops_add(sdp, &bd->bd_le);
143}
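A sketch of the usual lifecycle around gfs2_trans_add_bh(), mirroring the
gfs2_statfs_sync() caller earlier in this patch (RES_DINODE comes from
trans.h below).  With two dinode blocks and no revokes, the reservation taken
by gfs2_trans_begin() works out to 1 + 6 + 2 = 9 log blocks.  "ip" and "dibh"
are hypothetical and error handling is trimmed.

	error = gfs2_trans_begin(sdp, 2 * RES_DINODE, 0);
	if (error)
		return error;

	gfs2_trans_add_bh(ip->i_gl, dibh, 1);	/* journal this metadata buffer */
	/* ... modify dibh->b_data ... */

	gfs2_trans_end(sdp);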
144
145void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, uint64_t blkno)
146{
147 struct gfs2_revoke *rv = kmalloc(sizeof(struct gfs2_revoke),
148 GFP_NOFS | __GFP_NOFAIL);
149 lops_init_le(&rv->rv_le, &gfs2_revoke_lops);
150 rv->rv_blkno = blkno;
151 lops_add(sdp, &rv->rv_le);
152}
153
154void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, uint64_t blkno)
155{
156 struct gfs2_revoke *rv;
157 int found = 0;
158
159 gfs2_log_lock(sdp);
160
161 list_for_each_entry(rv, &sdp->sd_log_le_revoke, rv_le.le_list) {
162 if (rv->rv_blkno == blkno) {
163 list_del(&rv->rv_le.le_list);
164 gfs2_assert_withdraw(sdp, sdp->sd_log_num_revoke);
165 sdp->sd_log_num_revoke--;
166 found = 1;
167 break;
168 }
169 }
170
171 gfs2_log_unlock(sdp);
172
173 if (found) {
174 struct gfs2_trans *tr = current->journal_info;
175 kfree(rv);
176 tr->tr_num_revoke_rm++;
177 }
178}
179
180void gfs2_trans_add_rg(struct gfs2_rgrpd *rgd)
181{
182 lops_add(rgd->rd_sbd, &rgd->rd_le);
183}
184
diff --git a/fs/gfs2/trans.h b/fs/gfs2/trans.h
new file mode 100644
index 000000000000..fbef3f5a99e3
--- /dev/null
+++ b/fs/gfs2/trans.h
@@ -0,0 +1,34 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __TRANS_DOT_H__
11#define __TRANS_DOT_H__
12
13#define RES_DINODE 1
14#define RES_INDIRECT 1
15#define RES_JDATA 1
16#define RES_DATA 1
17#define RES_LEAF 1
18#define RES_RG_BIT 2
19#define RES_EATTR 1
20#define RES_STATFS 1
21#define RES_QUOTA 2
22
23int gfs2_trans_begin(struct gfs2_sbd *sdp,
24 unsigned int blocks, unsigned int revokes);
25
26void gfs2_trans_end(struct gfs2_sbd *sdp);
27
28void gfs2_trans_add_gl(struct gfs2_glock *gl);
29void gfs2_trans_add_bh(struct gfs2_glock *gl, struct buffer_head *bh, int meta);
30void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, uint64_t blkno);
31void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, uint64_t blkno);
32void gfs2_trans_add_rg(struct gfs2_rgrpd *rgd);
33
34#endif /* __TRANS_DOT_H__ */
diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c
new file mode 100644
index 000000000000..39e67b1ec70a
--- /dev/null
+++ b/fs/gfs2/util.c
@@ -0,0 +1,245 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/crc32.h>
16#include <linux/gfs2_ondisk.h>
17#include <asm/uaccess.h>
18
19#include "gfs2.h"
20#include "lm_interface.h"
21#include "incore.h"
22#include "glock.h"
23#include "lm.h"
24#include "util.h"
25
26kmem_cache_t *gfs2_glock_cachep __read_mostly;
27kmem_cache_t *gfs2_inode_cachep __read_mostly;
28kmem_cache_t *gfs2_bufdata_cachep __read_mostly;
29
30void gfs2_assert_i(struct gfs2_sbd *sdp)
31{
32 printk(KERN_EMERG "GFS2: fsid=%s: fatal assertion failed\n",
33 sdp->sd_fsname);
34}
35
36/**
37 * gfs2_assert_withdraw_i - Cause the machine to withdraw if @assertion is false
38 * Returns: -1 if this call withdrew the machine,
39 * -2 if it was already withdrawn
40 */
41
42int gfs2_assert_withdraw_i(struct gfs2_sbd *sdp, char *assertion,
43 const char *function, char *file, unsigned int line)
44{
45 int me;
46 me = gfs2_lm_withdraw(sdp,
47 "GFS2: fsid=%s: fatal: assertion \"%s\" failed\n"
48 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
49 sdp->sd_fsname, assertion,
50 sdp->sd_fsname, function, file, line);
51 dump_stack();
52 return (me) ? -1 : -2;
53}
54
55/**
56 * gfs2_assert_warn_i - Print a message to the console if @assertion is false
57 * Returns: -1 if we printed something
58 * -2 if we didn't
59 */
60
61int gfs2_assert_warn_i(struct gfs2_sbd *sdp, char *assertion,
62 const char *function, char *file, unsigned int line)
63{
64 if (time_before(jiffies,
65 sdp->sd_last_warning +
66 gfs2_tune_get(sdp, gt_complain_secs) * HZ))
67 return -2;
68
69 printk(KERN_WARNING
70 "GFS2: fsid=%s: warning: assertion \"%s\" failed\n"
71 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
72 sdp->sd_fsname, assertion,
73 sdp->sd_fsname, function, file, line);
74
75 if (sdp->sd_args.ar_debug)
76 BUG();
77 else
78 dump_stack();
79
80 sdp->sd_last_warning = jiffies;
81
82 return -1;
83}
84
85/**
86 * gfs2_consist_i - Flag a filesystem consistency error and withdraw
87 * Returns: -1 if this call withdrew the machine,
88 * 0 if it was already withdrawn
89 */
90
91int gfs2_consist_i(struct gfs2_sbd *sdp, int cluster_wide, const char *function,
92 char *file, unsigned int line)
93{
94 int rv;
95 rv = gfs2_lm_withdraw(sdp,
96 "GFS2: fsid=%s: fatal: filesystem consistency error\n"
97 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
98 sdp->sd_fsname,
99 sdp->sd_fsname, function, file, line);
100 return rv;
101}
102
103/**
104 * gfs2_consist_inode_i - Flag an inode consistency error and withdraw
105 * Returns: -1 if this call withdrew the machine,
106 * 0 if it was already withdrawn
107 */
108
109int gfs2_consist_inode_i(struct gfs2_inode *ip, int cluster_wide,
110 const char *function, char *file, unsigned int line)
111{
112 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
113 int rv;
114 rv = gfs2_lm_withdraw(sdp,
115 "GFS2: fsid=%s: fatal: filesystem consistency error\n"
116 "GFS2: fsid=%s: inode = %llu %llu\n"
117 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
118 sdp->sd_fsname,
119 sdp->sd_fsname, (unsigned long long)ip->i_num.no_formal_ino,
120 (unsigned long long)ip->i_num.no_addr,
121 sdp->sd_fsname, function, file, line);
122 return rv;
123}
124
125/**
126 * gfs2_consist_rgrpd_i - Flag a RG consistency error and withdraw
127 * Returns: -1 if this call withdrew the machine,
128 * 0 if it was already withdrawn
129 */
130
131int gfs2_consist_rgrpd_i(struct gfs2_rgrpd *rgd, int cluster_wide,
132 const char *function, char *file, unsigned int line)
133{
134 struct gfs2_sbd *sdp = rgd->rd_sbd;
135 int rv;
136 rv = gfs2_lm_withdraw(sdp,
137 "GFS2: fsid=%s: fatal: filesystem consistency error\n"
138 "GFS2: fsid=%s: RG = %llu\n"
139 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
140 sdp->sd_fsname,
141 sdp->sd_fsname, (unsigned long long)rgd->rd_ri.ri_addr,
142 sdp->sd_fsname, function, file, line);
143 return rv;
144}
145
146/**
147 * gfs2_meta_check_ii - Flag a magic number consistency error and withdraw
148 * Returns: -1 if this call withdrew the machine,
149 * -2 if it was already withdrawn
150 */
151
152int gfs2_meta_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh,
153 const char *type, const char *function, char *file,
154 unsigned int line)
155{
156 int me;
157 me = gfs2_lm_withdraw(sdp,
158 "GFS2: fsid=%s: fatal: invalid metadata block\n"
159 "GFS2: fsid=%s: bh = %llu (%s)\n"
160 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
161 sdp->sd_fsname,
162 sdp->sd_fsname, (unsigned long long)bh->b_blocknr, type,
163 sdp->sd_fsname, function, file, line);
164 return (me) ? -1 : -2;
165}
166
167/**
168 * gfs2_metatype_check_ii - Flag a metadata type consistency error and withdraw
169 * Returns: -1 if this call withdrew the machine,
170 * -2 if it was already withdrawn
171 */
172
173int gfs2_metatype_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh,
174 uint16_t type, uint16_t t, const char *function,
175 char *file, unsigned int line)
176{
177 int me;
178 me = gfs2_lm_withdraw(sdp,
179 "GFS2: fsid=%s: fatal: invalid metadata block\n"
180 "GFS2: fsid=%s: bh = %llu (type: exp=%u, found=%u)\n"
181 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
182 sdp->sd_fsname,
183 sdp->sd_fsname, (unsigned long long)bh->b_blocknr, type, t,
184 sdp->sd_fsname, function, file, line);
185 return (me) ? -1 : -2;
186}
187
188/**
189 * gfs2_io_error_i - Flag an I/O error and withdraw
190 * Returns: -1 if this call withdrew the machine,
191 * 0 if it was already withdrawn
192 */
193
194int gfs2_io_error_i(struct gfs2_sbd *sdp, const char *function, char *file,
195 unsigned int line)
196{
197 int rv;
198 rv = gfs2_lm_withdraw(sdp,
199 "GFS2: fsid=%s: fatal: I/O error\n"
200 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
201 sdp->sd_fsname,
202 sdp->sd_fsname, function, file, line);
203 return rv;
204}
205
206/**
207 * gfs2_io_error_bh_i - Flag a buffer I/O error and withdraw
208 * Returns: -1 if this call withdrew the machine,
209 * 0 if it was already withdrawn
210 */
211
212int gfs2_io_error_bh_i(struct gfs2_sbd *sdp, struct buffer_head *bh,
213 const char *function, char *file, unsigned int line)
214{
215 int rv;
216 rv = gfs2_lm_withdraw(sdp,
217 "GFS2: fsid=%s: fatal: I/O error\n"
218 "GFS2: fsid=%s: block = %llu\n"
219 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
220 sdp->sd_fsname,
221 sdp->sd_fsname, (unsigned long long)bh->b_blocknr,
222 sdp->sd_fsname, function, file, line);
223 return rv;
224}
225
226void gfs2_icbit_munge(struct gfs2_sbd *sdp, unsigned char **bitmap,
227 unsigned int bit, int new_value)
228{
229 unsigned int c, o, b = bit;
230 int old_value;
231
232 c = b / (8 * PAGE_SIZE);
233 b %= 8 * PAGE_SIZE;
234 o = b / 8;
235 b %= 8;
236
237 old_value = (bitmap[c][o] & (1 << b));
238 gfs2_assert_withdraw(sdp, !old_value != !new_value);
239
240 if (new_value)
241 bitmap[c][o] |= 1 << b;
242 else
243 bitmap[c][o] &= ~(1 << b);
244}
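As a worked example of the index arithmetic: with 4 KiB pages there are
8 * PAGE_SIZE = 32768 bits per page, so bit 70000 falls in page c = 2,
leaving b = 4464, hence byte o = 558 and bit 0 of bitmap[2] is the one
toggled.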
245
diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h
new file mode 100644
index 000000000000..8216d28bd816
--- /dev/null
+++ b/fs/gfs2/util.h
@@ -0,0 +1,169 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __UTIL_DOT_H__
11#define __UTIL_DOT_H__
12
13
14#define fs_printk(level, fs, fmt, arg...) \
15 printk(level "GFS2: fsid=%s: " fmt , (fs)->sd_fsname , ## arg)
16
17#define fs_info(fs, fmt, arg...) \
18 fs_printk(KERN_INFO , fs , fmt , ## arg)
19
20#define fs_warn(fs, fmt, arg...) \
21 fs_printk(KERN_WARNING , fs , fmt , ## arg)
22
23#define fs_err(fs, fmt, arg...) \
24 fs_printk(KERN_ERR, fs , fmt , ## arg)
25
26
27void gfs2_assert_i(struct gfs2_sbd *sdp);
28
29#define gfs2_assert(sdp, assertion) \
30do { \
31 if (unlikely(!(assertion))) { \
32 gfs2_assert_i(sdp); \
33 BUG(); \
34 } \
35} while (0)
36
37
38int gfs2_assert_withdraw_i(struct gfs2_sbd *sdp, char *assertion,
39 const char *function, char *file, unsigned int line);
40
41#define gfs2_assert_withdraw(sdp, assertion) \
42((likely(assertion)) ? 0 : gfs2_assert_withdraw_i((sdp), #assertion, \
43 __FUNCTION__, __FILE__, __LINE__))
44
45
46int gfs2_assert_warn_i(struct gfs2_sbd *sdp, char *assertion,
47 const char *function, char *file, unsigned int line);
48
49#define gfs2_assert_warn(sdp, assertion) \
50((likely(assertion)) ? 0 : gfs2_assert_warn_i((sdp), #assertion, \
51 __FUNCTION__, __FILE__, __LINE__))
52
53
54int gfs2_consist_i(struct gfs2_sbd *sdp, int cluster_wide,
55 const char *function, char *file, unsigned int line);
56
57#define gfs2_consist(sdp) \
58gfs2_consist_i((sdp), 0, __FUNCTION__, __FILE__, __LINE__)
59
60
61int gfs2_consist_inode_i(struct gfs2_inode *ip, int cluster_wide,
62 const char *function, char *file, unsigned int line);
63
64#define gfs2_consist_inode(ip) \
65gfs2_consist_inode_i((ip), 0, __FUNCTION__, __FILE__, __LINE__)
66
67
68int gfs2_consist_rgrpd_i(struct gfs2_rgrpd *rgd, int cluster_wide,
69 const char *function, char *file, unsigned int line);
70
71#define gfs2_consist_rgrpd(rgd) \
72gfs2_consist_rgrpd_i((rgd), 0, __FUNCTION__, __FILE__, __LINE__)
73
74
75int gfs2_meta_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh,
76 const char *type, const char *function,
77 char *file, unsigned int line);
78
79static inline int gfs2_meta_check_i(struct gfs2_sbd *sdp,
80 struct buffer_head *bh,
81 const char *function,
82 char *file, unsigned int line)
83{
84 struct gfs2_meta_header *mh = (struct gfs2_meta_header *)bh->b_data;
85 uint32_t magic = mh->mh_magic;
86 magic = be32_to_cpu(magic);
87 if (unlikely(magic != GFS2_MAGIC))
88 return gfs2_meta_check_ii(sdp, bh, "magic number", function,
89 file, line);
90 return 0;
91}
92
93#define gfs2_meta_check(sdp, bh) \
94gfs2_meta_check_i((sdp), (bh), __FUNCTION__, __FILE__, __LINE__)
95
96
97int gfs2_metatype_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh,
98 uint16_t type, uint16_t t,
99 const char *function,
100 char *file, unsigned int line);
101
102static inline int gfs2_metatype_check_i(struct gfs2_sbd *sdp,
103 struct buffer_head *bh,
104 uint16_t type,
105 const char *function,
106 char *file, unsigned int line)
107{
108 struct gfs2_meta_header *mh = (struct gfs2_meta_header *)bh->b_data;
109 uint32_t magic = mh->mh_magic;
110 uint16_t t = be32_to_cpu(mh->mh_type);
111 magic = be32_to_cpu(magic);
112 if (unlikely(magic != GFS2_MAGIC))
113 return gfs2_meta_check_ii(sdp, bh, "magic number", function,
114 file, line);
115 if (unlikely(t != type))
116 return gfs2_metatype_check_ii(sdp, bh, type, t, function,
117 file, line);
118 return 0;
119}
120
121#define gfs2_metatype_check(sdp, bh, type) \
122gfs2_metatype_check_i((sdp), (bh), (type), __FUNCTION__, __FILE__, __LINE__)
123
124static inline void gfs2_metatype_set(struct buffer_head *bh, uint16_t type,
125 uint16_t format)
126{
127 struct gfs2_meta_header *mh;
128 mh = (struct gfs2_meta_header *)bh->b_data;
129 mh->mh_type = cpu_to_be32(type);
130 mh->mh_format = cpu_to_be32(format);
131}
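A sketch of the read-and-verify sequence these helpers are intended for
(editor's illustration, not from the patch; GFS2_METATYPE_DI comes from
<linux/gfs2_ondisk.h>, "ip" is hypothetical and error handling is trimmed):

	error = gfs2_meta_inode_buffer(ip, &bh);
	if (error)
		return error;
	if (gfs2_metatype_check(GFS2_SB(&ip->i_inode), bh, GFS2_METATYPE_DI)) {
		brelse(bh);
		return -EIO;
	}
	/* bh is now known to hold a GFS2 dinode block */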
132
133
134int gfs2_io_error_i(struct gfs2_sbd *sdp, const char *function,
135 char *file, unsigned int line);
136
137#define gfs2_io_error(sdp) \
138gfs2_io_error_i((sdp), __FUNCTION__, __FILE__, __LINE__);
139
140
141int gfs2_io_error_bh_i(struct gfs2_sbd *sdp, struct buffer_head *bh,
142 const char *function, char *file, unsigned int line);
143
144#define gfs2_io_error_bh(sdp, bh) \
145gfs2_io_error_bh_i((sdp), (bh), __FUNCTION__, __FILE__, __LINE__);
146
147
148extern kmem_cache_t *gfs2_glock_cachep;
149extern kmem_cache_t *gfs2_inode_cachep;
150extern kmem_cache_t *gfs2_bufdata_cachep;
151
152static inline unsigned int gfs2_tune_get_i(struct gfs2_tune *gt,
153 unsigned int *p)
154{
155 unsigned int x;
156 spin_lock(&gt->gt_spin);
157 x = *p;
158 spin_unlock(&gt->gt_spin);
159 return x;
160}
161
162#define gfs2_tune_get(sdp, field) \
163gfs2_tune_get_i(&(sdp)->sd_tune, &(sdp)->sd_tune.field)
164
165void gfs2_icbit_munge(struct gfs2_sbd *sdp, unsigned char **bitmap,
166 unsigned int bit, int new_value);
167
168#endif /* __UTIL_DOT_H__ */
169
diff --git a/include/linux/Kbuild b/include/linux/Kbuild
index 2b8a7d68fae3..2121cde187d8 100644
--- a/include/linux/Kbuild
+++ b/include/linux/Kbuild
@@ -8,7 +8,7 @@ header-y += affs_fs.h affs_hardblocks.h aio_abi.h a.out.h arcfb.h \
 	atmppp.h atmsap.h atmsvc.h atm_zatm.h auto_fs4.h auxvec.h \
 	awe_voice.h ax25.h b1lli.h baycom.h bfs_fs.h blkpg.h \
 	bpqether.h cdk.h chio.h coda_psdev.h coff.h comstats.h \
-	consolemap.h cycx_cfm.h dm-ioctl.h dn.h dqblk_v1.h \
+	consolemap.h cycx_cfm.h dlm_device.h dm-ioctl.h dn.h dqblk_v1.h \
 	dqblk_v2.h dqblk_xfs.h efs_fs_sb.h elf-fdpic.h elf.h elf-em.h \
 	fadvise.h fd.h fdreg.h ftape-header-segment.h ftape-vendors.h \
 	fuse.h futex.h genetlink.h gen_stats.h gigaset_dev.h hdsmart.h \
@@ -18,28 +18,29 @@ header-y += affs_fs.h affs_hardblocks.h aio_abi.h a.out.h arcfb.h \
 	if_plip.h if_ppp.h if_slip.h if_strip.h if_tunnel.h in6.h \
 	in_route.h ioctl.h ip.h ipmi_msgdefs.h ip_mp_alg.h ipsec.h \
 	ipx.h irda.h isdn_divertif.h iso_fs.h ite_gpio.h ixjuser.h \
-	jffs2.h keyctl.h limits.h major.h matroxfb.h meye.h minix_fs.h \
-	mmtimer.h mqueue.h mtio.h ncp_no.h netfilter_arp.h netrom.h \
-	nfs2.h nfs4_mount.h nfs_mount.h openprom_fs.h param.h \
-	pci_ids.h pci_regs.h personality.h pfkeyv2.h pg.h pkt_cls.h \
-	pkt_sched.h posix_types.h ppdev.h prctl.h ps2esdi.h qic117.h \
-	qnxtypes.h quotaio_v1.h quotaio_v2.h radeonfb.h raw.h \
-	resource.h rose.h sctp.h smbno.h snmp.h sockios.h som.h \
-	sound.h stddef.h synclink.h telephony.h termios.h ticable.h \
-	times.h tiocl.h tipc.h toshiba.h ultrasound.h un.h utime.h \
-	utsname.h video_decoder.h video_encoder.h videotext.h vt.h \
-	wavefront.h wireless.h xattr.h x25.h zorro_ids.h
+	jffs2.h keyctl.h limits.h lock_dlm_plock.h major.h matroxfb.h \
+	meye.h minix_fs.h mmtimer.h mqueue.h mtio.h ncp_no.h \
+	netfilter_arp.h netrom.h nfs2.h nfs4_mount.h nfs_mount.h \
+	openprom_fs.h param.h pci_ids.h pci_regs.h personality.h \
+	pfkeyv2.h pg.h pkt_cls.h pkt_sched.h posix_types.h ppdev.h \
+	prctl.h ps2esdi.h qic117.h qnxtypes.h quotaio_v1.h quotaio_v2.h \
+	radeonfb.h raw.h resource.h rose.h sctp.h smbno.h snmp.h \
+	sockios.h som.h sound.h stddef.h synclink.h telephony.h \
+	termios.h ticable.h times.h tiocl.h tipc.h toshiba.h \
+	ultrasound.h un.h utime.h utsname.h video_decoder.h \
+	video_encoder.h videotext.h vt.h wavefront.h wireless.h xattr.h \
+	x25.h zorro_ids.h
 
 unifdef-y += acct.h adb.h adfs_fs.h agpgart.h apm_bios.h atalk.h \
 	atmarp.h atmdev.h atm.h atm_tcp.h audit.h auto_fs.h binfmts.h \
 	capability.h capi.h cciss_ioctl.h cdrom.h cm4000_cs.h \
 	cn_proc.h coda.h connector.h cramfs_fs.h cuda.h cyclades.h \
-	dccp.h dirent.h divert.h elfcore.h errno.h errqueue.h \
+	dccp.h dirent.h divert.h dlm.h elfcore.h errno.h errqueue.h \
 	ethtool.h eventpoll.h ext2_fs.h ext3_fs.h fb.h fcntl.h \
 	filter.h flat.h fs.h ftape.h gameport.h generic_serial.h \
-	genhd.h hayesesp.h hdlcdrv.h hdlc.h hdreg.h hiddev.h hpet.h \
-	i2c.h i2o-dev.h icmpv6.h if_bridge.h if_ec.h \
-	if_eql.h if_ether.h if_frad.h if_ltalk.h if_pppox.h \
+	genhd.h gfs2_ondisk.h hayesesp.h hdlcdrv.h hdlc.h hdreg.h \
+	hiddev.h hpet.h i2c.h i2o-dev.h icmpv6.h iflags.h if_bridge.h \
+	if_ec.h if_eql.h if_ether.h if_frad.h if_ltalk.h if_pppox.h \
 	if_shaper.h if_tr.h if_tun.h if_vlan.h if_wanpipe.h igmp.h \
 	inet_diag.h in.h inotify.h input.h ipc.h ipmi.h ipv6.h \
 	ipv6_route.h isdn.h isdnif.h isdn_ppp.h isicom.h jbd.h \
diff --git a/include/linux/dlm.h b/include/linux/dlm.h
new file mode 100644
index 000000000000..1b1dcb9a40bb
--- /dev/null
+++ b/include/linux/dlm.h
@@ -0,0 +1,302 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#ifndef __DLM_DOT_H__
15#define __DLM_DOT_H__
16
17/*
18 * Interface to Distributed Lock Manager (DLM)
19 * routines and structures to use DLM lockspaces
20 */
21
22/*
23 * Lock Modes
24 */
25
26#define DLM_LOCK_IV -1 /* invalid */
27#define DLM_LOCK_NL 0 /* null */
28#define DLM_LOCK_CR 1 /* concurrent read */
29#define DLM_LOCK_CW 2 /* concurrent write */
30#define DLM_LOCK_PR 3 /* protected read */
31#define DLM_LOCK_PW 4 /* protected write */
32#define DLM_LOCK_EX 5 /* exclusive */
33
34/*
35 * Maximum size in bytes of a dlm_lock name
36 */
37
38#define DLM_RESNAME_MAXLEN 64
39
40/*
41 * Flags to dlm_lock
42 *
43 * DLM_LKF_NOQUEUE
44 *
45 * Do not queue the lock request on the wait queue if it cannot be granted
46 * immediately. If the lock cannot be granted because of this flag, DLM will
47 * either return -EAGAIN from the dlm_lock call or will return 0 from
48 * dlm_lock and -EAGAIN in the lock status block when the AST is executed.
49 *
50 * DLM_LKF_CANCEL
51 *
52 * Used to cancel a pending lock request or conversion. A converting lock is
53 * returned to its previously granted mode.
54 *
55 * DLM_LKF_CONVERT
56 *
57 * Indicates a lock conversion request. For conversions the name and namelen
58 * are ignored and the lock ID in the LKSB is used to identify the lock.
59 *
60 * DLM_LKF_VALBLK
61 *
62 * Requests DLM to return the current contents of the lock value block in the
63 * lock status block. When this flag is set in a lock conversion from PW or EX
64 * modes, DLM assigns the value specified in the lock status block to the lock
65 * value block of the lock resource. The LVB is a DLM_LVB_LEN size array
66 * containing application-specific information.
67 *
68 * DLM_LKF_QUECVT
69 *
70 * Force a conversion request to be queued, even if it is compatible with
71 * the granted modes of other locks on the same resource.
72 *
73 * DLM_LKF_IVVALBLK
74 *
75 * Invalidate the lock value block.
76 *
77 * DLM_LKF_CONVDEADLK
78 *
79 * Allows the dlm to resolve conversion deadlocks internally by demoting the
80 * granted mode of a converting lock to NL. The DLM_SBF_DEMOTED flag is
81 * returned for a conversion that's been effected by this.
82 *
83 * DLM_LKF_PERSISTENT
84 *
85 * Only relevant to locks originating in userspace. A persistent lock will not
86 * be removed if the process holding the lock exits.
87 *
 88 * DLM_LKF_NODLCKWT
 89 * DLM_LKF_NODLCKBLK
 90 *
 91 * not yet implemented
92 *
93 * DLM_LKF_EXPEDITE
94 *
95 * Used only with new requests for NL mode locks. Tells the lock manager
96 * to grant the lock, ignoring other locks in convert and wait queues.
97 *
98 * DLM_LKF_NOQUEUEBAST
99 *
 100 * Send blocking ASTs before returning -EAGAIN to the caller. It is only
 101 * used along with the NOQUEUE flag. Blocking ASTs are not sent for failed
 102 * NOQUEUE requests otherwise.
103 *
104 * DLM_LKF_HEADQUE
105 *
106 * Add a lock to the head of the convert or wait queue rather than the tail.
107 *
108 * DLM_LKF_NOORDER
109 *
110 * Disregard the standard grant order rules and grant a lock as soon as it
111 * is compatible with other granted locks.
112 *
113 * DLM_LKF_ORPHAN
114 *
115 * not yet implemented
116 *
117 * DLM_LKF_ALTPR
118 *
119 * If the requested mode cannot be granted immediately, try to grant the lock
120 * in PR mode instead. If this alternate mode is granted instead of the
121 * requested mode, DLM_SBF_ALTMODE is returned in the lksb.
122 *
123 * DLM_LKF_ALTCW
124 *
125 * The same as ALTPR, but the alternate mode is CW.
126 *
127 * DLM_LKF_FORCEUNLOCK
128 *
129 * Unlock the lock even if it is converting or waiting or has sublocks.
130 * Only really for use by the userland device.c code.
131 *
132 */
133
134#define DLM_LKF_NOQUEUE 0x00000001
135#define DLM_LKF_CANCEL 0x00000002
136#define DLM_LKF_CONVERT 0x00000004
137#define DLM_LKF_VALBLK 0x00000008
138#define DLM_LKF_QUECVT 0x00000010
139#define DLM_LKF_IVVALBLK 0x00000020
140#define DLM_LKF_CONVDEADLK 0x00000040
141#define DLM_LKF_PERSISTENT 0x00000080
142#define DLM_LKF_NODLCKWT 0x00000100
143#define DLM_LKF_NODLCKBLK 0x00000200
144#define DLM_LKF_EXPEDITE 0x00000400
145#define DLM_LKF_NOQUEUEBAST 0x00000800
146#define DLM_LKF_HEADQUE 0x00001000
147#define DLM_LKF_NOORDER 0x00002000
148#define DLM_LKF_ORPHAN 0x00004000
149#define DLM_LKF_ALTPR 0x00008000
150#define DLM_LKF_ALTCW 0x00010000
151#define DLM_LKF_FORCEUNLOCK 0x00020000
152
153/*
154 * Some return codes that are not in errno.h
155 */
156
157#define DLM_ECANCEL 0x10001
158#define DLM_EUNLOCK 0x10002
159
160typedef void dlm_lockspace_t;
161
162/*
163 * Lock status block
164 *
165 * Use this structure to specify the contents of the lock value block. For a
166 * conversion request, this structure is used to specify the lock ID of the
167 * lock. DLM writes the status of the lock request and the lock ID assigned
168 * to the request in the lock status block.
169 *
170 * sb_lkid: the returned lock ID. It is set on new (non-conversion) requests.
171 * It is available when dlm_lock returns.
172 *
173 * sb_lvbptr: saves or returns the contents of the lock's LVB according to rules
174 * shown for the DLM_LKF_VALBLK flag.
175 *
 176 * sb_flags: DLM_SBF_DEMOTED is returned if, while a lock was being converted
 177 * to a higher mode, it was first demoted to NL to avoid conversion deadlock.
178 * DLM_SBF_VALNOTVALID is returned if the resource's LVB is marked invalid.
179 *
180 * sb_status: the returned status of the lock request set prior to AST
181 * execution. Possible return values:
182 *
183 * 0 if lock request was successful
184 * -EAGAIN if request would block and is flagged DLM_LKF_NOQUEUE
185 * -ENOMEM if there is no memory to process request
186 * -EINVAL if there are invalid parameters
187 * -DLM_EUNLOCK if unlock request was successful
188 * -DLM_ECANCEL if a cancel completed successfully
189 */
190
191#define DLM_SBF_DEMOTED 0x01
192#define DLM_SBF_VALNOTVALID 0x02
193#define DLM_SBF_ALTMODE 0x04
194
195struct dlm_lksb {
196 int sb_status;
197 uint32_t sb_lkid;
198 char sb_flags;
199 char * sb_lvbptr;
200};
201
202
203#ifdef __KERNEL__
204
205#define DLM_LSFL_NODIR 0x00000001
206
207/*
208 * dlm_new_lockspace
209 *
210 * Starts a lockspace with the given name. If the named lockspace exists in
211 * the cluster, the calling node joins it.
212 */
213
214int dlm_new_lockspace(char *name, int namelen, dlm_lockspace_t **lockspace,
215 uint32_t flags, int lvblen);
216
217/*
218 * dlm_release_lockspace
219 *
220 * Stop a lockspace.
221 */
222
223int dlm_release_lockspace(dlm_lockspace_t *lockspace, int force);
224
225/*
226 * dlm_lock
227 *
 228 * Make an asynchronous request to acquire or convert a lock on a named
229 * resource.
230 *
231 * lockspace: context for the request
232 * mode: the requested mode of the lock (DLM_LOCK_)
233 * lksb: lock status block for input and async return values
234 * flags: input flags (DLM_LKF_)
235 * name: name of the resource to lock, can be binary
 236 * namelen: the length in bytes of the resource name (DLM_RESNAME_MAXLEN max)
237 * parent: the lock ID of a parent lock or 0 if none
238 * lockast: function DLM executes when it completes processing the request
239 * astarg: argument passed to lockast and bast functions
240 * bast: function DLM executes when this lock later blocks another request
241 *
242 * Returns:
243 * 0 if request is successfully queued for processing
244 * -EINVAL if any input parameters are invalid
245 * -EAGAIN if request would block and is flagged DLM_LKF_NOQUEUE
246 * -ENOMEM if there is no memory to process request
247 * -ENOTCONN if there is a communication error
248 *
249 * If the call to dlm_lock returns an error then the operation has failed and
250 * the AST routine will not be called. If dlm_lock returns 0 it is still
251 * possible that the lock operation will fail. The AST routine will be called
252 * when the locking is complete and the status is returned in the lksb.
253 *
 254 * If AST routines or an astarg are passed to a conversion operation, they
 255 * overwrite the values that were passed to the previous dlm_lock
 256 * call.
257 *
258 * AST routines should not block (at least not for long), but may make
259 * any locking calls they please.
260 */
261
262int dlm_lock(dlm_lockspace_t *lockspace,
263 int mode,
264 struct dlm_lksb *lksb,
265 uint32_t flags,
266 void *name,
267 unsigned int namelen,
268 uint32_t parent_lkid,
269 void (*lockast) (void *astarg),
270 void *astarg,
271 void (*bast) (void *astarg, int mode));
272
273/*
274 * dlm_unlock
275 *
276 * Asynchronously release a lock on a resource. The AST routine is called
277 * when the resource is successfully unlocked.
278 *
279 * lockspace: context for the request
280 * lkid: the lock ID as returned in the lksb
281 * flags: input flags (DLM_LKF_)
 282 * lksb: if NULL, the lksb passed to the last lock request is used
283 * astarg: the arg used with the completion ast for the unlock
284 *
285 * Returns:
286 * 0 if request is successfully queued for processing
287 * -EINVAL if any input parameters are invalid
288 * -ENOTEMPTY if the lock still has sublocks
289 * -EBUSY if the lock is waiting for a remote lock operation
290 * -ENOTCONN if there is a communication error
291 */
292
293int dlm_unlock(dlm_lockspace_t *lockspace,
294 uint32_t lkid,
295 uint32_t flags,
296 struct dlm_lksb *lksb,
297 void *astarg);
298
299#endif /* __KERNEL__ */
300
301#endif /* __DLM_DOT_H__ */
302
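
Taken together, dlm_new_lockspace(), dlm_lock() and dlm_unlock() give kernel code a fully asynchronous locking API: the call only queues the request, and the result arrives later in the lksb via the completion AST. The sketch below is one plausible way a kernel user might drive it synchronously; the lockspace handle, resource name, single shared completion and callback names are illustrative assumptions, not part of the interface.

#include <linux/dlm.h>
#include <linux/completion.h>

static struct dlm_lksb my_lksb;
static DECLARE_COMPLETION(my_ast_done);

/* Completion AST: the request has finished, result is in sb_status. */
static void my_ast(void *astarg)
{
	complete(&my_ast_done);
}

/* Blocking AST: another node wants the resource in `mode'.  A real
 * user would arrange to release or demote the lock here. */
static void my_bast(void *astarg, int mode)
{
}

/* Illustrative: take an EX lock on "my_resource" and drop it again. */
static int example_take_and_drop(dlm_lockspace_t *ls)
{
	int error;

	error = dlm_lock(ls, DLM_LOCK_EX, &my_lksb, DLM_LKF_NOQUEUE,
			 "my_resource", 11, 0, my_ast, NULL, my_bast);
	if (error)
		return error;
	wait_for_completion(&my_ast_done);
	if (my_lksb.sb_status)		/* e.g. -EAGAIN for NOQUEUE */
		return my_lksb.sb_status;

	error = dlm_unlock(ls, my_lksb.sb_lkid, 0, &my_lksb, NULL);
	if (error)
		return error;
	wait_for_completion(&my_ast_done);
	return my_lksb.sb_status == -DLM_EUNLOCK ? 0 : my_lksb.sb_status;
}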
diff --git a/include/linux/dlm_device.h b/include/linux/dlm_device.h
new file mode 100644
index 000000000000..2a2dd189b9fd
--- /dev/null
+++ b/include/linux/dlm_device.h
@@ -0,0 +1,86 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
 14/* This is the device interface for dlm; most users will use a library
 15 * interface instead.
16 */
17
18#define DLM_USER_LVB_LEN 32
19
20/* Version of the device interface */
21#define DLM_DEVICE_VERSION_MAJOR 5
22#define DLM_DEVICE_VERSION_MINOR 0
23#define DLM_DEVICE_VERSION_PATCH 0
24
25/* struct passed to the lock write */
26struct dlm_lock_params {
27 __u8 mode;
28 __u8 namelen;
29 __u16 flags;
30 __u32 lkid;
31 __u32 parent;
32 void __user *castparam;
33 void __user *castaddr;
34 void __user *bastparam;
35 void __user *bastaddr;
36 struct dlm_lksb __user *lksb;
37 char lvb[DLM_USER_LVB_LEN];
38 char name[0];
39};
40
41struct dlm_lspace_params {
42 __u32 flags;
43 __u32 minor;
44 char name[0];
45};
46
47struct dlm_write_request {
48 __u32 version[3];
49 __u8 cmd;
50 __u8 is64bit;
51 __u8 unused[2];
52
53 union {
54 struct dlm_lock_params lock;
55 struct dlm_lspace_params lspace;
56 } i;
57};
58
 59/* struct read from the "device" fd; it consists mainly of
 60 userspace pointers for the library to use */
61struct dlm_lock_result {
62 __u32 length;
63 void __user * user_astaddr;
64 void __user * user_astparam;
65 struct dlm_lksb __user * user_lksb;
66 struct dlm_lksb lksb;
67 __u8 bast_mode;
68 __u8 unused[3];
69 /* Offsets may be zero if no data is present */
70 __u32 lvb_offset;
71};
72
73/* Commands passed to the device */
74#define DLM_USER_LOCK 1
75#define DLM_USER_UNLOCK 2
76#define DLM_USER_QUERY 3
77#define DLM_USER_CREATE_LOCKSPACE 4
78#define DLM_USER_REMOVE_LOCKSPACE 5
79
80/* Arbitrary length restriction */
81#define MAX_LS_NAME_LEN 64
82
83/* Lockspace flags */
84#define DLM_USER_LSFLG_AUTOFREE 1
85#define DLM_USER_LSFLG_FORCEFREE 2
86
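
The write/read protocol above is how a userspace library drives the DLM: it packs a struct dlm_write_request, writes it to a lockspace device, and later reads a struct dlm_lock_result back when the operation completes. The fragment below sketches packing a DLM_USER_LOCK request. It assumes a userspace-usable copy of these headers, glosses over the exact write-length accounting for the flexible name[] member, and leaves castaddr/castparam unset (a real library sets them so it can dispatch the completion) — in practice this is libdlm's job.

#include <string.h>
#include <unistd.h>
#include <linux/dlm.h>
#include <linux/dlm_device.h>

/* Sketch: pack and submit a lock request on an already-open lockspace
 * device fd.  Completion routing is omitted for brevity. */
static int request_lock(int ls_fd, struct dlm_lksb *lksb,
			const char *name, unsigned char namelen)
{
	char buf[sizeof(struct dlm_write_request) + DLM_RESNAME_MAXLEN];
	struct dlm_write_request *req = (struct dlm_write_request *)buf;

	memset(buf, 0, sizeof(buf));
	req->version[0] = DLM_DEVICE_VERSION_MAJOR;
	req->version[1] = DLM_DEVICE_VERSION_MINOR;
	req->version[2] = DLM_DEVICE_VERSION_PATCH;
	req->cmd = DLM_USER_LOCK;
	req->is64bit = (sizeof(long) == 8);

	req->i.lock.mode = DLM_LOCK_PR;
	req->i.lock.flags = DLM_LKF_NOQUEUE;
	req->i.lock.namelen = namelen;
	req->i.lock.lksb = lksb;
	memcpy(req->i.lock.name, name, namelen);

	if (write(ls_fd, req, sizeof(*req) + namelen) < 0)
		return -1;
	return 0;
}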
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 25610205c90d..83abd9d7898f 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1375,6 +1375,9 @@ extern struct subsystem fs_subsys;
1375#define FLOCK_VERIFY_READ 1 1375#define FLOCK_VERIFY_READ 1
1376#define FLOCK_VERIFY_WRITE 2 1376#define FLOCK_VERIFY_WRITE 2
1377 1377
1378/* /sys/fs */
1379extern struct subsystem fs_subsys;
1380
1378extern int locks_mandatory_locked(struct inode *); 1381extern int locks_mandatory_locked(struct inode *);
1379extern int locks_mandatory_area(int, struct inode *, struct file *, loff_t, size_t); 1382extern int locks_mandatory_area(int, struct inode *, struct file *, loff_t, size_t);
1380 1383
diff --git a/include/linux/gfs2_ondisk.h b/include/linux/gfs2_ondisk.h
new file mode 100644
index 000000000000..3ebd8743ce8c
--- /dev/null
+++ b/include/linux/gfs2_ondisk.h
@@ -0,0 +1,443 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __GFS2_ONDISK_DOT_H__
11#define __GFS2_ONDISK_DOT_H__
12
13#define GFS2_MAGIC 0x01161970
14#define GFS2_BASIC_BLOCK 512
15#define GFS2_BASIC_BLOCK_SHIFT 9
16
17/* Lock numbers of the LM_TYPE_NONDISK type */
18
19#define GFS2_MOUNT_LOCK 0
20#define GFS2_LIVE_LOCK 1
21#define GFS2_TRANS_LOCK 2
22#define GFS2_RENAME_LOCK 3
23
24/* Format numbers for various metadata types */
25
26#define GFS2_FORMAT_NONE 0
27#define GFS2_FORMAT_SB 100
28#define GFS2_FORMAT_RG 200
29#define GFS2_FORMAT_RB 300
30#define GFS2_FORMAT_DI 400
31#define GFS2_FORMAT_IN 500
32#define GFS2_FORMAT_LF 600
33#define GFS2_FORMAT_JD 700
34#define GFS2_FORMAT_LH 800
35#define GFS2_FORMAT_LD 900
36#define GFS2_FORMAT_LB 1000
37#define GFS2_FORMAT_EA 1600
38#define GFS2_FORMAT_ED 1700
39#define GFS2_FORMAT_QC 1400
40/* These are format numbers for entities contained in files */
41#define GFS2_FORMAT_RI 1100
42#define GFS2_FORMAT_DE 1200
43#define GFS2_FORMAT_QU 1500
44/* These are part of the superblock */
45#define GFS2_FORMAT_FS 1801
46#define GFS2_FORMAT_MULTI 1900
47
48/*
49 * An on-disk inode number
50 */
51
52struct gfs2_inum {
53 __be64 no_formal_ino;
54 __be64 no_addr;
55};
56
57static inline int gfs2_inum_equal(const struct gfs2_inum *ino1,
58 const struct gfs2_inum *ino2)
59{
60 return ino1->no_formal_ino == ino2->no_formal_ino &&
61 ino1->no_addr == ino2->no_addr;
62}
63
64/*
65 * Generic metadata head structure
 66 * Every in-place buffer logged in the journal must start with this.
67 */
68
69#define GFS2_METATYPE_NONE 0
70#define GFS2_METATYPE_SB 1
71#define GFS2_METATYPE_RG 2
72#define GFS2_METATYPE_RB 3
73#define GFS2_METATYPE_DI 4
74#define GFS2_METATYPE_IN 5
75#define GFS2_METATYPE_LF 6
76#define GFS2_METATYPE_JD 7
77#define GFS2_METATYPE_LH 8
78#define GFS2_METATYPE_LD 9
79#define GFS2_METATYPE_LB 12
80#define GFS2_METATYPE_EA 10
81#define GFS2_METATYPE_ED 11
82#define GFS2_METATYPE_QC 14
83
84struct gfs2_meta_header {
85 __be32 mh_magic;
86 __be32 mh_type;
87 __be64 __pad0; /* Was generation number in gfs1 */
88 __be32 mh_format;
89 __be32 __pad1; /* Was incarnation number in gfs1 */
90};
91
92/*
93 * super-block structure
94 *
95 * It's probably good if SIZEOF_SB <= GFS2_BASIC_BLOCK (512 bytes)
96 *
97 * Order is important, need to be able to read old superblocks to do on-disk
98 * version upgrades.
99 */
100
101/* Address of superblock in GFS2 basic blocks */
102#define GFS2_SB_ADDR 128
103
104/* The lock number for the superblock (must be zero) */
105#define GFS2_SB_LOCK 0
106
107/* Requirement: GFS2_LOCKNAME_LEN % 8 == 0
108 Includes: the fencing zero at the end */
109#define GFS2_LOCKNAME_LEN 64
110
111struct gfs2_sb {
112 struct gfs2_meta_header sb_header;
113
114 __be32 sb_fs_format;
115 __be32 sb_multihost_format;
116 __u32 __pad0; /* Was superblock flags in gfs1 */
117
118 __be32 sb_bsize;
119 __be32 sb_bsize_shift;
120 __u32 __pad1; /* Was journal segment size in gfs1 */
121
122 struct gfs2_inum sb_master_dir; /* Was jindex dinode in gfs1 */
123 struct gfs2_inum __pad2; /* Was rindex dinode in gfs1 */
124 struct gfs2_inum sb_root_dir;
125
126 char sb_lockproto[GFS2_LOCKNAME_LEN];
127 char sb_locktable[GFS2_LOCKNAME_LEN];
128 /* In gfs1, quota and license dinodes followed */
129};
130
131/*
132 * resource index structure
133 */
134
135struct gfs2_rindex {
136 __be64 ri_addr; /* grp block disk address */
137 __be32 ri_length; /* length of rgrp header in fs blocks */
138 __u32 __pad;
139
140 __be64 ri_data0; /* first data location */
141 __be32 ri_data; /* num of data blocks in rgrp */
142
143 __be32 ri_bitbytes; /* number of bytes in data bitmaps */
144
145 __u8 ri_reserved[64];
146};
147
148/*
149 * resource group header structure
150 */
151
152/* Number of blocks per byte in rgrp */
153#define GFS2_NBBY 4
154#define GFS2_BIT_SIZE 2
155#define GFS2_BIT_MASK 0x00000003
156
157#define GFS2_BLKST_FREE 0
158#define GFS2_BLKST_USED 1
159#define GFS2_BLKST_UNLINKED 2
160#define GFS2_BLKST_DINODE 3
161
162#define GFS2_RGF_JOURNAL 0x00000001
163#define GFS2_RGF_METAONLY 0x00000002
164#define GFS2_RGF_DATAONLY 0x00000004
165#define GFS2_RGF_NOALLOC 0x00000008
166
167struct gfs2_rgrp {
168 struct gfs2_meta_header rg_header;
169
170 __be32 rg_flags;
171 __be32 rg_free;
172 __be32 rg_dinodes;
173 __be32 __pad;
174 __be64 rg_igeneration;
175
176 __u8 rg_reserved[80]; /* Several fields from gfs1 now reserved */
177};
178
179/*
180 * quota structure
181 */
182
183struct gfs2_quota {
184 __be64 qu_limit;
185 __be64 qu_warn;
186 __be64 qu_value;
187 __u8 qu_reserved[64];
188};
189
190/*
191 * dinode structure
192 */
193
194#define GFS2_MAX_META_HEIGHT 10
195#define GFS2_DIR_MAX_DEPTH 17
196
197#define DT2IF(dt) (((dt) << 12) & S_IFMT)
198#define IF2DT(sif) (((sif) & S_IFMT) >> 12)
199
200enum {
201 gfs2fl_Jdata = 0,
202 gfs2fl_ExHash = 1,
203 gfs2fl_Unused = 2,
204 gfs2fl_EaIndirect = 3,
205 gfs2fl_Directio = 4,
206 gfs2fl_Immutable = 5,
207 gfs2fl_AppendOnly = 6,
208 gfs2fl_NoAtime = 7,
209 gfs2fl_Sync = 8,
210 gfs2fl_System = 9,
211 gfs2fl_TruncInProg = 29,
212 gfs2fl_InheritDirectio = 30,
213 gfs2fl_InheritJdata = 31,
214};
215
216/* Dinode flags */
217#define GFS2_DIF_JDATA 0x00000001
218#define GFS2_DIF_EXHASH 0x00000002
219#define GFS2_DIF_UNUSED 0x00000004 /* only in gfs1 */
220#define GFS2_DIF_EA_INDIRECT 0x00000008
221#define GFS2_DIF_DIRECTIO 0x00000010
222#define GFS2_DIF_IMMUTABLE 0x00000020
223#define GFS2_DIF_APPENDONLY 0x00000040
224#define GFS2_DIF_NOATIME 0x00000080
225#define GFS2_DIF_SYNC 0x00000100
226#define GFS2_DIF_SYSTEM 0x00000200 /* New in gfs2 */
227#define GFS2_DIF_TRUNC_IN_PROG 0x20000000 /* New in gfs2 */
228#define GFS2_DIF_INHERIT_DIRECTIO 0x40000000
229#define GFS2_DIF_INHERIT_JDATA 0x80000000
230
231struct gfs2_dinode {
232 struct gfs2_meta_header di_header;
233
234 struct gfs2_inum di_num;
235
236 __be32 di_mode; /* mode of file */
237 __be32 di_uid; /* owner's user id */
238 __be32 di_gid; /* owner's group id */
239 __be32 di_nlink; /* number of links to this file */
240 __be64 di_size; /* number of bytes in file */
241 __be64 di_blocks; /* number of blocks in file */
242 __be64 di_atime; /* time last accessed */
243 __be64 di_mtime; /* time last modified */
244 __be64 di_ctime; /* time last changed */
245 __be32 di_major; /* device major number */
246 __be32 di_minor; /* device minor number */
247
248 /* This section varies from gfs1. Padding added to align with
249 * remainder of dinode
250 */
251 __be64 di_goal_meta; /* rgrp to alloc from next */
252 __be64 di_goal_data; /* data block goal */
253 __be64 di_generation; /* generation number for NFS */
254
255 __be32 di_flags; /* GFS2_DIF_... */
256 __be32 di_payload_format; /* GFS2_FORMAT_... */
257 __u16 __pad1; /* Was ditype in gfs1 */
258 __be16 di_height; /* height of metadata */
259 __u32 __pad2; /* Unused incarnation number from gfs1 */
260
261 /* These only apply to directories */
262 __u16 __pad3; /* Padding */
263 __be16 di_depth; /* Number of bits in the table */
264 __be32 di_entries; /* The number of entries in the directory */
265
266 struct gfs2_inum __pad4; /* Unused even in current gfs1 */
267
268 __be64 di_eattr; /* extended attribute block number */
269
270 __u8 di_reserved[56];
271};
272
273/*
274 * directory structure - many of these per directory file
275 */
276
277#define GFS2_FNAMESIZE 255
278#define GFS2_DIRENT_SIZE(name_len) ((sizeof(struct gfs2_dirent) + (name_len) + 7) & ~7)
279
280struct gfs2_dirent {
281 struct gfs2_inum de_inum;
282 __be32 de_hash;
283 __be16 de_rec_len;
284 __be16 de_name_len;
285 __be16 de_type;
286 __u8 __pad[14];
287};
288
289/*
290 * Header of leaf directory nodes
291 */
292
293struct gfs2_leaf {
294 struct gfs2_meta_header lf_header;
295
296 __be16 lf_depth; /* Depth of leaf */
297 __be16 lf_entries; /* Number of dirents in leaf */
298 __be32 lf_dirent_format; /* Format of the dirents */
299 __be64 lf_next; /* Next leaf, if overflow */
300
301 __u8 lf_reserved[64];
302};
303
304/*
305 * Extended attribute header format
306 */
307
308#define GFS2_EA_MAX_NAME_LEN 255
309#define GFS2_EA_MAX_DATA_LEN 65536
310
311#define GFS2_EATYPE_UNUSED 0
312#define GFS2_EATYPE_USR 1
313#define GFS2_EATYPE_SYS 2
314#define GFS2_EATYPE_SECURITY 3
315
316#define GFS2_EATYPE_LAST 3
317#define GFS2_EATYPE_VALID(x) ((x) <= GFS2_EATYPE_LAST)
318
319#define GFS2_EAFLAG_LAST 0x01 /* last ea in block */
320
321struct gfs2_ea_header {
322 __be32 ea_rec_len;
323 __be32 ea_data_len;
 324 __u8 ea_name_len; /* no NUL terminator after the string */
325 __u8 ea_type; /* GFS2_EATYPE_... */
326 __u8 ea_flags; /* GFS2_EAFLAG_... */
327 __u8 ea_num_ptrs;
328 __u32 __pad;
329};
330
331/*
332 * Log header structure
333 */
334
335#define GFS2_LOG_HEAD_UNMOUNT 0x00000001 /* log is clean */
336
337struct gfs2_log_header {
338 struct gfs2_meta_header lh_header;
339
340 __be64 lh_sequence; /* Sequence number of this transaction */
341 __be32 lh_flags; /* GFS2_LOG_HEAD_... */
342 __be32 lh_tail; /* Block number of log tail */
343 __be32 lh_blkno;
344 __be32 lh_hash;
345};
346
347/*
348 * Log type descriptor
349 */
350
351#define GFS2_LOG_DESC_METADATA 300
352/* ld_data1 is the number of metadata blocks in the descriptor.
353 ld_data2 is unused. */
354
355#define GFS2_LOG_DESC_REVOKE 301
356/* ld_data1 is the number of revoke blocks in the descriptor.
357 ld_data2 is unused. */
358
359#define GFS2_LOG_DESC_JDATA 302
360/* ld_data1 is the number of data blocks in the descriptor.
361 ld_data2 is unused. */
362
363struct gfs2_log_descriptor {
364 struct gfs2_meta_header ld_header;
365
366 __be32 ld_type; /* GFS2_LOG_DESC_... */
367 __be32 ld_length; /* Number of buffers in this chunk */
368 __be32 ld_data1; /* descriptor-specific field */
369 __be32 ld_data2; /* descriptor-specific field */
370
371 __u8 ld_reserved[32];
372};
373
374/*
375 * Inum Range
376 * Describe a range of formal inode numbers allocated to
377 * one machine to assign to inodes.
378 */
379
380#define GFS2_INUM_QUANTUM 1048576
381
382struct gfs2_inum_range {
383 __be64 ir_start;
384 __be64 ir_length;
385};
386
387/*
388 * Statfs change
 389 * Describes a change to the pool of free and allocated
390 * blocks.
391 */
392
393struct gfs2_statfs_change {
394 __be64 sc_total;
395 __be64 sc_free;
396 __be64 sc_dinodes;
397};
398
399/*
400 * Quota change
401 * Describes an allocation change for a particular
402 * user or group.
403 */
404
405#define GFS2_QCF_USER 0x00000001
406
407struct gfs2_quota_change {
408 __be64 qc_change;
409 __be32 qc_flags; /* GFS2_QCF_... */
410 __be32 qc_id;
411};
412
413#ifdef __KERNEL__
414/* Translation functions */
415
416extern void gfs2_inum_in(struct gfs2_inum *no, char *buf);
417extern void gfs2_inum_out(const struct gfs2_inum *no, char *buf);
418extern void gfs2_sb_in(struct gfs2_sb *sb, char *buf);
419extern void gfs2_rindex_in(struct gfs2_rindex *ri, char *buf);
420extern void gfs2_rindex_out(struct gfs2_rindex *ri, char *buf);
421extern void gfs2_rgrp_in(struct gfs2_rgrp *rg, char *buf);
422extern void gfs2_rgrp_out(struct gfs2_rgrp *rg, char *buf);
423extern void gfs2_quota_in(struct gfs2_quota *qu, char *buf);
424extern void gfs2_quota_out(struct gfs2_quota *qu, char *buf);
425extern void gfs2_dinode_in(struct gfs2_dinode *di, char *buf);
426extern void gfs2_dinode_out(struct gfs2_dinode *di, char *buf);
427extern void gfs2_ea_header_in(struct gfs2_ea_header *ea, char *buf);
428extern void gfs2_ea_header_out(struct gfs2_ea_header *ea, char *buf);
429extern void gfs2_log_header_in(struct gfs2_log_header *lh, char *buf);
430extern void gfs2_inum_range_in(struct gfs2_inum_range *ir, char *buf);
431extern void gfs2_inum_range_out(struct gfs2_inum_range *ir, char *buf);
432extern void gfs2_statfs_change_in(struct gfs2_statfs_change *sc, char *buf);
433extern void gfs2_statfs_change_out(struct gfs2_statfs_change *sc, char *buf);
434extern void gfs2_quota_change_in(struct gfs2_quota_change *qc, char *buf);
435
436/* Printing functions */
437
438extern void gfs2_rindex_print(struct gfs2_rindex *ri);
439extern void gfs2_dinode_print(struct gfs2_dinode *di);
440
441#endif /* __KERNEL__ */
442
443#endif /* __GFS2_ONDISK_DOT_H__ */
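
All multi-byte fields in the structures above are stored big-endian on disk (hence the __be32/__be64 annotations); the translation functions declared under #ifdef __KERNEL__ move them to and from CPU byte order. The fragment below is only a sketch of the kind of conversion those helpers perform — the host-order mirror structure and the function names are assumptions; the real implementations live in fs/gfs2/ondisk.c.

#include <linux/types.h>
#include <asm/byteorder.h>
#include <linux/gfs2_ondisk.h>

/* Hypothetical host-order mirror of struct gfs2_inum. */
struct gfs2_inum_host {
	u64 no_formal_ino;
	u64 no_addr;
};

/* Read an inum from an on-disk buffer into CPU byte order. */
static void example_inum_in(struct gfs2_inum_host *no, const char *buf)
{
	const struct gfs2_inum *str = (const struct gfs2_inum *)buf;

	no->no_formal_ino = be64_to_cpu(str->no_formal_ino);
	no->no_addr = be64_to_cpu(str->no_addr);
}

/* Write an inum back out in on-disk (big-endian) order. */
static void example_inum_out(const struct gfs2_inum_host *no, char *buf)
{
	struct gfs2_inum *str = (struct gfs2_inum *)buf;

	str->no_formal_ino = cpu_to_be64(no->no_formal_ino);
	str->no_addr = cpu_to_be64(no->no_addr);
}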
diff --git a/include/linux/iflags.h b/include/linux/iflags.h
new file mode 100644
index 000000000000..5b27102dfeaf
--- /dev/null
+++ b/include/linux/iflags.h
@@ -0,0 +1,102 @@
1#ifndef _LINUX_IFLAGS_H
2#define _LINUX_IFLAGS_H
3
4/*
5 * A universal set of inode flags.
6 *
7 * Originally taken from ext2/3 with additions for other filesystems.
8 * Filesystems supporting this interface should interoperate with
9 * the lsattr and chattr command line tools.
10 *
11 * This interface is supported in whole or in part by:
12 * ext2
13 * ext3
14 * xfs
15 * jfs
16 * gfs2
17 *
18 */
19
20#define IFLAGS_GET_IOC _IOR('f', 1, long)
21#define IFLAGS_SET_IOC _IOW('f', 2, long)
22
23/*
 24 * These values are provided as indices into the 32-entry tables
 25 * used with the iflags_cvt function below
26 */
27enum {
28 iflag_SecureRm = 0, /* Secure deletion */
29 iflag_Unrm = 1, /* Undelete */
30 iflag_Compress = 2, /* Compress file */
31 iflag_Sync = 3, /* Synchronous updates */
32 iflag_Immutable = 4, /* Immutable */
33 iflag_Append = 5, /* Append */
34 iflag_NoDump = 6, /* Don't dump file */
35 iflag_NoAtime = 7, /* No atime updates */
36 /* Reserved for compression usage */
37 iflag_Dirty = 8,
38 iflag_ComprBlk = 9, /* One or more compressed clusters */
39 iflag_NoComp = 10, /* Don't compress */
40 iflag_Ecompr = 11, /* Compression error */
41 /* End of compression flags */
42 iflag_Btree = 12, /* btree format dir */
43 iflag_Index = 12, /* hash-indexed directory */
44 iflag_Imagic = 13, /* AFS directory */
45 iflag_JournalData = 14, /* file data should be journaled */
46 iflag_NoTail = 15, /* file tail should not be merged */
47 iflag_DirSync = 16, /* dirsync behaviour */
48 iflag_TopDir = 17, /* Top of directory hierarchies */
49 iflag_Extent = 19, /* Extents */
50 iflag_DirectIO = 20, /* Always use direct I/O on this file */
51 iflag_Reserved = 31 /* reserved for ext2/3 lib */
52};
53
54#define __IFL(x) (1<<(iflag_##x))
55#define IFLAG_SECRM __IFL(SecureRm) /* 0x00000001 */
56#define IFLAG_UNRM __IFL(Unrm) /* 0x00000002 */
 57#define IFLAG_COMPR __IFL(Compress) /* 0x00000004 */
58#define IFLAG_SYNC __IFL(Sync) /* 0x00000008 */
59#define IFLAG_IMMUTABLE __IFL(Immutable) /* 0x00000010 */
60#define IFLAG_APPEND __IFL(Append) /* 0x00000020 */
61#define IFLAG_NODUMP __IFL(NoDump) /* 0x00000040 */
62#define IFLAG_NOATIME __IFL(NoAtime) /* 0x00000080 */
63#define IFLAG_DIRTY __IFL(Dirty) /* 0x00000100 */
64#define IFLAG_COMPRBLK __IFL(ComprBlk) /* 0x00000200 */
65#define IFLAG_NOCOMP __IFL(NoComp) /* 0x00000400 */
66#define IFLAG_ECOMPR __IFL(Ecompr) /* 0x00000800 */
67#define IFLAG_BTREE __IFL(Btree) /* 0x00001000 */
68#define IFLAG_INDEX __IFL(Index) /* 0x00001000 */
69#define IFLAG_IMAGIC __IFL(Imagic) /* 0x00002000 */
70#define IFLAG_JOURNAL_DATA __IFL(JournalData) /* 0x00004000 */
71#define IFLAG_NOTAIL __IFL(NoTail) /* 0x00008000 */
72#define IFLAG_DIRSYNC __IFL(DirSync) /* 0x00010000 */
73#define IFLAG_TOPDIR __IFL(TopDir) /* 0x00020000 */
74#define IFLAG_EXTENT __IFL(Extent) /* 0x00080000 */
75#define IFLAG_DIRECTIO __IFL(DirectIO) /* 0x00100000 */
76#define IFLAG_RESERVED __IFL(Reserved) /* 0x80000000 */
77
78#ifdef __KERNEL__
79/**
80 * iflags_cvt
81 * @table: A table of 32 u32 flags
82 * @val: a 32 bit value to convert
83 *
84 * This function can be used to convert between IFLAGS values and
85 * the filesystem's own flags values.
86 *
87 * Returns: the converted flags
88 */
89static inline u32 iflags_cvt(const u32 *table, u32 val)
90{
91 u32 res = 0;
92 while(val) {
93 if (val & 1)
94 res |= *table;
95 table++;
96 val >>= 1;
97 }
98 return res;
99}
100#endif /* __KERNEL__ */
101
102#endif /* _LINUX_IFLAGS_H */
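
iflags_cvt() simply walks a 32-entry table indexed by the iflag_* bit positions, so each filesystem converts between the generic flags and its own bits by supplying such a table. The sketch below maps a handful of the generic flags onto the GFS2_DIF_* values from gfs2_ondisk.h; the table contents are illustrative, not the actual gfs2 mapping.

#include <linux/types.h>
#include <linux/iflags.h>
#include <linux/gfs2_ondisk.h>

/* Illustrative table: the index is the generic iflag_* bit position,
 * the value is the filesystem's own flag.  Zero entries drop bits the
 * filesystem does not support. */
static const u32 example_iflags_to_gfs2[32] = {
	[iflag_Sync]		= GFS2_DIF_SYNC,
	[iflag_Immutable]	= GFS2_DIF_IMMUTABLE,
	[iflag_Append]		= GFS2_DIF_APPENDONLY,
	[iflag_NoAtime]		= GFS2_DIF_NOATIME,
	[iflag_JournalData]	= GFS2_DIF_JDATA,
	[iflag_DirectIO]	= GFS2_DIF_DIRECTIO,
};

static u32 example_user_to_disk(u32 user_flags)
{
	/* e.g. IFLAG_IMMUTABLE | IFLAG_SYNC
	 *       -> GFS2_DIF_IMMUTABLE | GFS2_DIF_SYNC */
	return iflags_cvt(example_iflags_to_gfs2, user_flags);
}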
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 181c69cad4e3..06c2768e1330 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -32,6 +32,7 @@ extern const char linux_banner[];
32 32
33#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) 33#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
34#define ALIGN(x,a) (((x)+(a)-1)&~((a)-1)) 34#define ALIGN(x,a) (((x)+(a)-1)&~((a)-1))
35#define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d))
35#define FIELD_SIZEOF(t, f) (sizeof(((t*)0)->f)) 36#define FIELD_SIZEOF(t, f) (sizeof(((t*)0)->f))
36#define roundup(x, y) ((((x) + ((y) - 1)) / (y)) * (y)) 37#define roundup(x, y) ((((x) + ((y) - 1)) / (y)) * (y))
37 38
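
DIV_ROUND_UP() is plain integer ceiling division, complementing the ALIGN() and roundup() helpers already in this header. A trivial usage sketch with made-up sizes:

#include <linux/kernel.h>

/* How many bsize-byte blocks are needed to hold len bytes?
 * e.g. DIV_ROUND_UP(10000, 4096) == 3. */
static inline unsigned int example_blocks_needed(unsigned int len,
						 unsigned int bsize)
{
	return DIV_ROUND_UP(len, bsize);
}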
diff --git a/include/linux/lock_dlm_plock.h b/include/linux/lock_dlm_plock.h
new file mode 100644
index 000000000000..007b07a178ab
--- /dev/null
+++ b/include/linux/lock_dlm_plock.h
@@ -0,0 +1,40 @@
1/*
2 * Copyright (C) 2005 Red Hat, Inc. All rights reserved.
3 *
4 * This copyrighted material is made available to anyone wishing to use,
5 * modify, copy, or redistribute it subject to the terms and conditions
6 * of the GNU General Public License v.2.
7 */
8
9#ifndef __LOCK_DLM_PLOCK_DOT_H__
10#define __LOCK_DLM_PLOCK_DOT_H__
11
12#define GDLM_PLOCK_MISC_NAME "lock_dlm_plock"
13
14#define GDLM_PLOCK_VERSION_MAJOR 1
15#define GDLM_PLOCK_VERSION_MINOR 0
16#define GDLM_PLOCK_VERSION_PATCH 0
17
18enum {
19 GDLM_PLOCK_OP_LOCK = 1,
20 GDLM_PLOCK_OP_UNLOCK,
21 GDLM_PLOCK_OP_GET,
22};
23
24struct gdlm_plock_info {
25 __u32 version[3];
26 __u8 optype;
27 __u8 ex;
28 __u8 wait;
29 __u8 pad;
30 __u32 pid;
31 __s32 nodeid;
32 __s32 rv;
33 __u32 fsid;
34 __u64 number;
35 __u64 start;
36 __u64 end;
37};
38
39#endif
40
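
struct gdlm_plock_info is the message the lock_dlm module exchanges with its userspace helper to arbitrate POSIX (fcntl) locks across the cluster. The sketch below shows how such a request might be filled in from a struct file_lock; the function name and the fsid/number arguments are assumptions for illustration.

#include <linux/types.h>
#include <linux/string.h>
#include <linux/fs.h>
#include <linux/fcntl.h>
#include <linux/lock_dlm_plock.h>

/* Sketch: translate a VFS file_lock into a plock request destined for
 * the userspace helper.  fsid identifies the filesystem, inum the inode. */
static void example_fill_plock(struct gdlm_plock_info *info,
			       const struct file_lock *fl,
			       u32 fsid, u64 inum)
{
	memset(info, 0, sizeof(*info));
	info->version[0] = GDLM_PLOCK_VERSION_MAJOR;
	info->version[1] = GDLM_PLOCK_VERSION_MINOR;
	info->version[2] = GDLM_PLOCK_VERSION_PATCH;
	info->optype	= GDLM_PLOCK_OP_LOCK;
	info->ex	= (fl->fl_type == F_WRLCK);
	info->wait	= !!(fl->fl_flags & FL_SLEEP);
	info->pid	= fl->fl_pid;
	info->fsid	= fsid;
	info->number	= inum;
	info->start	= fl->fl_start;
	info->end	= fl->fl_end;
}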
diff --git a/mm/filemap.c b/mm/filemap.c
index d087fc3d3281..b9c91ab7f0f8 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1122,6 +1122,7 @@ success:
1122 desc->arg.buf += size; 1122 desc->arg.buf += size;
1123 return size; 1123 return size;
1124} 1124}
1125EXPORT_SYMBOL_GPL(file_read_actor);
1125 1126
1126/** 1127/**
1127 * __generic_file_aio_read - generic filesystem read routine 1128 * __generic_file_aio_read - generic filesystem read routine
diff --git a/mm/readahead.c b/mm/readahead.c
index aa7ec424656a..1ba736ac0367 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -38,6 +38,7 @@ file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping)
38 ra->ra_pages = mapping->backing_dev_info->ra_pages; 38 ra->ra_pages = mapping->backing_dev_info->ra_pages;
39 ra->prev_page = -1; 39 ra->prev_page = -1;
40} 40}
41EXPORT_SYMBOL_GPL(file_ra_state_init);
41 42
42/* 43/*
43 * Return max readahead size for this inode in number-of-pages. 44 * Return max readahead size for this inode in number-of-pages.