aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--CREDITS6
-rw-r--r--Documentation/filesystems/gfs2.txt43
-rw-r--r--MAINTAINERS18
-rw-r--r--fs/Kconfig2
-rw-r--r--fs/Makefile2
-rw-r--r--fs/configfs/item.c2
-rw-r--r--fs/dlm/Kconfig21
-rw-r--r--fs/dlm/Makefile19
-rw-r--r--fs/dlm/ast.c172
-rw-r--r--fs/dlm/ast.h26
-rw-r--r--fs/dlm/config.c789
-rw-r--r--fs/dlm/config.h42
-rw-r--r--fs/dlm/debug_fs.c384
-rw-r--r--fs/dlm/dir.c423
-rw-r--r--fs/dlm/dir.h30
-rw-r--r--fs/dlm/dlm_internal.h539
-rw-r--r--fs/dlm/lock.c3845
-rw-r--r--fs/dlm/lock.h61
-rw-r--r--fs/dlm/lockspace.c705
-rw-r--r--fs/dlm/lockspace.h25
-rw-r--r--fs/dlm/lowcomms.c1238
-rw-r--r--fs/dlm/lowcomms.h26
-rw-r--r--fs/dlm/lvb_table.h18
-rw-r--r--fs/dlm/main.c97
-rw-r--r--fs/dlm/member.c312
-rw-r--r--fs/dlm/member.h24
-rw-r--r--fs/dlm/memory.c115
-rw-r--r--fs/dlm/memory.h29
-rw-r--r--fs/dlm/midcomms.c140
-rw-r--r--fs/dlm/midcomms.h21
-rw-r--r--fs/dlm/rcom.c457
-rw-r--r--fs/dlm/rcom.h24
-rw-r--r--fs/dlm/recover.c776
-rw-r--r--fs/dlm/recover.h34
-rw-r--r--fs/dlm/recoverd.c285
-rw-r--r--fs/dlm/recoverd.h24
-rw-r--r--fs/dlm/requestqueue.c184
-rw-r--r--fs/dlm/requestqueue.h22
-rw-r--r--fs/dlm/user.c785
-rw-r--r--fs/dlm/user.h16
-rw-r--r--fs/dlm/util.c161
-rw-r--r--fs/dlm/util.h22
-rw-r--r--fs/gfs2/Kconfig44
-rw-r--r--fs/gfs2/Makefile10
-rw-r--r--fs/gfs2/acl.c313
-rw-r--r--fs/gfs2/acl.h37
-rw-r--r--fs/gfs2/bmap.c1236
-rw-r--r--fs/gfs2/bmap.h27
-rw-r--r--fs/gfs2/daemon.c196
-rw-r--r--fs/gfs2/daemon.h19
-rw-r--r--fs/gfs2/dir.c1974
-rw-r--r--fs/gfs2/dir.h73
-rw-r--r--fs/gfs2/eaops.c230
-rw-r--r--fs/gfs2/eaops.h29
-rw-r--r--fs/gfs2/eattr.c1548
-rw-r--r--fs/gfs2/eattr.h97
-rw-r--r--fs/gfs2/format.h21
-rw-r--r--fs/gfs2/gfs2.h31
-rw-r--r--fs/gfs2/glock.c2279
-rw-r--r--fs/gfs2/glock.h152
-rw-r--r--fs/gfs2/glops.c564
-rw-r--r--fs/gfs2/glops.h23
-rw-r--r--fs/gfs2/incore.h658
-rw-r--r--fs/gfs2/inode.c1354
-rw-r--r--fs/gfs2/inode.h56
-rw-r--r--fs/gfs2/lm.c244
-rw-r--r--fs/gfs2/lm.h41
-rw-r--r--fs/gfs2/lm_interface.h295
-rw-r--r--fs/gfs2/locking.c191
-rw-r--r--fs/gfs2/locking/dlm/Makefile3
-rw-r--r--fs/gfs2/locking/dlm/lock.c541
-rw-r--r--fs/gfs2/locking/dlm/lock_dlm.h188
-rw-r--r--fs/gfs2/locking/dlm/main.c64
-rw-r--r--fs/gfs2/locking/dlm/mount.c256
-rw-r--r--fs/gfs2/locking/dlm/plock.c299
-rw-r--r--fs/gfs2/locking/dlm/sysfs.c225
-rw-r--r--fs/gfs2/locking/dlm/thread.c359
-rw-r--r--fs/gfs2/locking/nolock/Makefile3
-rw-r--r--fs/gfs2/locking/nolock/main.c259
-rw-r--r--fs/gfs2/log.c601
-rw-r--r--fs/gfs2/log.h61
-rw-r--r--fs/gfs2/lops.c800
-rw-r--r--fs/gfs2/lops.h96
-rw-r--r--fs/gfs2/lvb.c45
-rw-r--r--fs/gfs2/lvb.h19
-rw-r--r--fs/gfs2/main.c127
-rw-r--r--fs/gfs2/meta_io.c780
-rw-r--r--fs/gfs2/meta_io.h89
-rw-r--r--fs/gfs2/mount.c214
-rw-r--r--fs/gfs2/mount.h15
-rw-r--r--fs/gfs2/ondisk.c308
-rw-r--r--fs/gfs2/ops_address.c778
-rw-r--r--fs/gfs2/ops_address.h18
-rw-r--r--fs/gfs2/ops_dentry.c123
-rw-r--r--fs/gfs2/ops_dentry.h15
-rw-r--r--fs/gfs2/ops_export.c293
-rw-r--r--fs/gfs2/ops_export.h19
-rw-r--r--fs/gfs2/ops_file.c815
-rw-r--r--fs/gfs2/ops_file.h20
-rw-r--r--fs/gfs2/ops_fstype.c836
-rw-r--r--fs/gfs2/ops_fstype.h16
-rw-r--r--fs/gfs2/ops_inode.c1165
-rw-r--r--fs/gfs2/ops_inode.h18
-rw-r--r--fs/gfs2/ops_super.c471
-rw-r--r--fs/gfs2/ops_super.h15
-rw-r--r--fs/gfs2/ops_vm.c194
-rw-r--r--fs/gfs2/ops_vm.h16
-rw-r--r--fs/gfs2/quota.c1286
-rw-r--r--fs/gfs2/quota.h32
-rw-r--r--fs/gfs2/recovery.c575
-rw-r--r--fs/gfs2/recovery.h32
-rw-r--r--fs/gfs2/rgrp.c1528
-rw-r--r--fs/gfs2/rgrp.h62
-rw-r--r--fs/gfs2/super.c979
-rw-r--r--fs/gfs2/super.h52
-rw-r--r--fs/gfs2/sys.c579
-rw-r--r--fs/gfs2/sys.h24
-rw-r--r--fs/gfs2/trans.c184
-rw-r--r--fs/gfs2/trans.h34
-rw-r--r--fs/gfs2/util.c245
-rw-r--r--fs/gfs2/util.h169
-rw-r--r--include/linux/Kbuild33
-rw-r--r--include/linux/dlm.h302
-rw-r--r--include/linux/dlm_device.h86
-rw-r--r--include/linux/fs.h3
-rw-r--r--include/linux/gfs2_ondisk.h443
-rw-r--r--include/linux/iflags.h102
-rw-r--r--include/linux/kernel.h1
-rw-r--r--include/linux/lock_dlm_plock.h40
-rw-r--r--mm/filemap.c3
-rw-r--r--mm/readahead.c1
131 files changed, 40644 insertions, 21 deletions
diff --git a/CREDITS b/CREDITS
index 29be6d1fdf49..f41e1d2952c9 100644
--- a/CREDITS
+++ b/CREDITS
@@ -3541,11 +3541,11 @@ S: Fargo, North Dakota 58122
3541S: USA 3541S: USA
3542 3542
3543N: Steven Whitehouse 3543N: Steven Whitehouse
3544E: SteveW@ACM.org 3544E: steve@chygwyn.com
3545W: http://www.chygwyn.com/~steve 3545W: http://www.chygwyn.com/~steve
3546D: Linux DECnet project: http://www.sucs.swan.ac.uk/~rohan/DECnet/index.html 3546D: Linux DECnet project
3547D: Minor debugging of other networking protocols. 3547D: Minor debugging of other networking protocols.
3548D: Misc bug fixes and filesystem development 3548D: Misc bug fixes and GFS2 filesystem development
3549 3549
3550N: Hans-Joachim Widmaier 3550N: Hans-Joachim Widmaier
3551E: hjw@zvw.de 3551E: hjw@zvw.de
diff --git a/Documentation/filesystems/gfs2.txt b/Documentation/filesystems/gfs2.txt
new file mode 100644
index 000000000000..593004b6bbab
--- /dev/null
+++ b/Documentation/filesystems/gfs2.txt
@@ -0,0 +1,43 @@
1Global File System
2------------------
3
4http://sources.redhat.com/cluster/
5
6GFS is a cluster file system. It allows a cluster of computers to
7simultaneously use a block device that is shared between them (with FC,
8iSCSI, NBD, etc). GFS reads and writes to the block device like a local
9file system, but also uses a lock module to allow the computers to coordinate
10their I/O so file system consistency is maintained. One of the nifty
11features of GFS is perfect consistency -- changes made to the file system
12on one machine show up immediately on all other machines in the cluster.
13
14GFS uses interchangeable inter-node locking mechanisms. Different lock
15modules can plug into GFS and each file system selects the appropriate
16lock module at mount time. Lock modules include:
17
18 lock_nolock -- allows gfs to be used as a local file system
19
20 lock_dlm -- uses a distributed lock manager (dlm) for inter-node locking
21 The dlm is found at linux/fs/dlm/
22
23In addition to interfacing with an external locking manager, a gfs lock
24module is responsible for interacting with external cluster management
25systems. Lock_dlm depends on user space cluster management systems found
26at the URL above.
27
28To use gfs as a local file system, no external clustering systems are
29needed, simply:
30
31 $ mkfs -t gfs2 -p lock_nolock -j 1 /dev/block_device
32 $ mount -t gfs2 /dev/block_device /dir
33
34GFS2 is not on-disk compatible with previous versions of GFS.
35
36The following man pages can be found at the URL above:
37 gfs2_fsck to repair a filesystem
38 gfs2_grow to expand a filesystem online
39 gfs2_jadd to add journals to a filesystem online
40 gfs2_tool to manipulate, examine and tune a filesystem
41 gfs2_quota to examine and change quota values in a filesystem
42 mount.gfs2 to help mount(8) mount a filesystem
43 mkfs.gfs2 to make a filesystem
diff --git a/MAINTAINERS b/MAINTAINERS
index b2afc7ae965b..e42e14335194 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -862,6 +862,16 @@ M: jack@suse.cz
862L: linux-kernel@vger.kernel.org 862L: linux-kernel@vger.kernel.org
863S: Maintained 863S: Maintained
864 864
865DISTRIBUTED LOCK MANAGER
866P: Patrick Caulfield
867M: pcaulfie@redhat.com
868P: David Teigland
869M: teigland@redhat.com
870L: cluster-devel@redhat.com
871W: http://sources.redhat.com/cluster/
872T: git kernel.org:/pub/scm/linux/kernel/git/steve/gfs-2.6.git
873S: Supported
874
865DAVICOM FAST ETHERNET (DMFE) NETWORK DRIVER 875DAVICOM FAST ETHERNET (DMFE) NETWORK DRIVER
866P: Tobias Ringstrom 876P: Tobias Ringstrom
867M: tori@unhappy.mine.nu 877M: tori@unhappy.mine.nu
@@ -1112,6 +1122,14 @@ M: khc@pm.waw.pl
1112W: http://www.kernel.org/pub/linux/utils/net/hdlc/ 1122W: http://www.kernel.org/pub/linux/utils/net/hdlc/
1113S: Maintained 1123S: Maintained
1114 1124
1125GFS2 FILE SYSTEM
1126P: Steven Whitehouse
1127M: swhiteho@redhat.com
1128L: cluster-devel@redhat.com
1129W: http://sources.redhat.com/cluster/
1130T: git kernel.org:/pub/scm/linux/kernel/git/steve/gfs-2.6.git
1131S: Supported
1132
1115GIGASET ISDN DRIVERS 1133GIGASET ISDN DRIVERS
1116P: Hansjoerg Lipp 1134P: Hansjoerg Lipp
1117M: hjlipp@web.de 1135M: hjlipp@web.de
diff --git a/fs/Kconfig b/fs/Kconfig
index 3f00a9faabcb..ddc7462ddb56 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -323,6 +323,7 @@ config FS_POSIX_ACL
323 default n 323 default n
324 324
325source "fs/xfs/Kconfig" 325source "fs/xfs/Kconfig"
326source "fs/gfs2/Kconfig"
326 327
327config OCFS2_FS 328config OCFS2_FS
328 tristate "OCFS2 file system support (EXPERIMENTAL)" 329 tristate "OCFS2 file system support (EXPERIMENTAL)"
@@ -1930,6 +1931,7 @@ source "fs/partitions/Kconfig"
1930endmenu 1931endmenu
1931 1932
1932source "fs/nls/Kconfig" 1933source "fs/nls/Kconfig"
1934source "fs/dlm/Kconfig"
1933 1935
1934endmenu 1936endmenu
1935 1937
diff --git a/fs/Makefile b/fs/Makefile
index 89135428a539..64df11047ccc 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -50,6 +50,7 @@ obj-$(CONFIG_CONFIGFS_FS) += configfs/
50obj-y += devpts/ 50obj-y += devpts/
51 51
52obj-$(CONFIG_PROFILING) += dcookies.o 52obj-$(CONFIG_PROFILING) += dcookies.o
53obj-$(CONFIG_DLM) += dlm/
53 54
54# Do not add any filesystems before this line 55# Do not add any filesystems before this line
55obj-$(CONFIG_REISERFS_FS) += reiserfs/ 56obj-$(CONFIG_REISERFS_FS) += reiserfs/
@@ -102,3 +103,4 @@ obj-$(CONFIG_HOSTFS) += hostfs/
102obj-$(CONFIG_HPPFS) += hppfs/ 103obj-$(CONFIG_HPPFS) += hppfs/
103obj-$(CONFIG_DEBUG_FS) += debugfs/ 104obj-$(CONFIG_DEBUG_FS) += debugfs/
104obj-$(CONFIG_OCFS2_FS) += ocfs2/ 105obj-$(CONFIG_OCFS2_FS) += ocfs2/
106obj-$(CONFIG_GFS2_FS) += gfs2/
diff --git a/fs/configfs/item.c b/fs/configfs/item.c
index e07485ac50ad..24421209f854 100644
--- a/fs/configfs/item.c
+++ b/fs/configfs/item.c
@@ -224,4 +224,4 @@ EXPORT_SYMBOL(config_item_init);
224EXPORT_SYMBOL(config_group_init); 224EXPORT_SYMBOL(config_group_init);
225EXPORT_SYMBOL(config_item_get); 225EXPORT_SYMBOL(config_item_get);
226EXPORT_SYMBOL(config_item_put); 226EXPORT_SYMBOL(config_item_put);
227 227EXPORT_SYMBOL(config_group_find_obj);
diff --git a/fs/dlm/Kconfig b/fs/dlm/Kconfig
new file mode 100644
index 000000000000..490f85b3fa59
--- /dev/null
+++ b/fs/dlm/Kconfig
@@ -0,0 +1,21 @@
1menu "Distributed Lock Manager"
2 depends on INET && EXPERIMENTAL
3
4config DLM
5 tristate "Distributed Lock Manager (DLM)"
6 depends on IPV6 || IPV6=n
7 depends on IP_SCTP
8 select CONFIGFS_FS
9 help
10 A general purpose distributed lock manager for kernel or userspace
11 applications.
12
13config DLM_DEBUG
14 bool "DLM debugging"
15 depends on DLM
16 help
17 Under the debugfs mount point, the name of each lockspace will
18 appear as a file in the "dlm" directory. The output is the
19 list of resources and locks the local node knows about.
20
21endmenu
diff --git a/fs/dlm/Makefile b/fs/dlm/Makefile
new file mode 100644
index 000000000000..1832e0297f7d
--- /dev/null
+++ b/fs/dlm/Makefile
@@ -0,0 +1,19 @@
1obj-$(CONFIG_DLM) += dlm.o
2dlm-y := ast.o \
3 config.o \
4 dir.o \
5 lock.o \
6 lockspace.o \
7 lowcomms.o \
8 main.o \
9 member.o \
10 memory.o \
11 midcomms.o \
12 rcom.o \
13 recover.o \
14 recoverd.o \
15 requestqueue.o \
16 user.o \
17 util.o
18dlm-$(CONFIG_DLM_DEBUG) += debug_fs.o
19
diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c
new file mode 100644
index 000000000000..a211330cbc42
--- /dev/null
+++ b/fs/dlm/ast.c
@@ -0,0 +1,172 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#include "dlm_internal.h"
15#include "lock.h"
16#include "user.h"
17
/* Bit number within astd_wakeflags; set when the dlm_astd thread has AST
   work to look at. */
18#define WAKE_ASTS 0
19
/* lkbs with a pending AST, linked through lkb_astqueue.  The list and the
   lkb_ast_type bits of queued lkbs are guarded by ast_queue_lock. */
20static struct list_head ast_queue;
21static spinlock_t ast_queue_lock;
/* The dlm_astd kernel thread, recorded by dlm_astd_start(). */
22static struct task_struct * astd_task;
23static unsigned long astd_wakeflags;
/* Held by dlm_astd around each delivery pass; dlm_astd_suspend() takes it
   to hold delivery off until dlm_astd_resume(). */
24static struct mutex astd_running;
25
26
/*
 * Take an lkb off the AST queue.  A non-zero AST_COMP/AST_BAST bit in
 * lkb_ast_type is what marks the lkb as queued (see dlm_add_ast), so the
 * list_del is only done when one of those bits is set.
 *
 * NOTE(review): this neither clears lkb_ast_type nor drops the reference
 * that dlm_add_ast took when queueing -- confirm callers account for both.
 */
27void dlm_del_ast(struct dlm_lkb *lkb)
28{
29 spin_lock(&ast_queue_lock);
30 if (lkb->lkb_ast_type & (AST_COMP | AST_BAST))
31 list_del(&lkb->lkb_astqueue);
32 spin_unlock(&ast_queue_lock);
33}
34
/*
 * Request delivery of an AST for lkb.
 *
 * lkb:  lock whose callback should run
 * type: AST bit(s) to add to lkb_ast_type (AST_COMP and/or AST_BAST)
 *
 * Locks flagged DLM_IFL_USER are handed to dlm_user_add_ast() instead.
 * Otherwise the lkb is appended to ast_queue (taking a reference) unless it
 * is already queued, the requested type is OR-ed in, and the dlm_astd thread
 * is woken.
 */
35void dlm_add_ast(struct dlm_lkb *lkb, int type)
36{
37 if (lkb->lkb_flags & DLM_IFL_USER) {
38 dlm_user_add_ast(lkb, type);
39 return;
40 }
41
42 spin_lock(&ast_queue_lock);
/* no AST bits set means the lkb is not on ast_queue yet */
43 if (!(lkb->lkb_ast_type & (AST_COMP | AST_BAST))) {
44 kref_get(&lkb->lkb_ref);
45 list_add_tail(&lkb->lkb_astqueue, &ast_queue);
46 }
47 lkb->lkb_ast_type |= type;
48 spin_unlock(&ast_queue_lock);
49
50 set_bit(WAKE_ASTS, &astd_wakeflags);
51 wake_up_process(astd_task);
52}
53
/*
 * Deliver queued ASTs, one lkb per iteration, until no eligible lkb is left.
 *
 * Each pass scans ast_queue under ast_queue_lock for the first lkb whose
 * lockspace does not report dlm_locking_stopped(); lkbs of lockspaces that
 * do are skipped and stay queued.  The chosen lkb is unlinked and its pending
 * type bits are captured and cleared before the lock is dropped, so the
 * callbacks run without ast_queue_lock held.
 */
54static void process_asts(void)
55{
56 struct dlm_ls *ls = NULL;
57 struct dlm_rsb *r = NULL;
58 struct dlm_lkb *lkb;
59 void (*cast) (long param);
60 void (*bast) (long param, int mode);
61 int type = 0, found, bmode;
62
63 for (;;) {
64 found = 0;
65 spin_lock(&ast_queue_lock);
66 list_for_each_entry(lkb, &ast_queue, lkb_astqueue) {
67 r = lkb->lkb_resource;
68 ls = r->res_ls;
69
70 if (dlm_locking_stopped(ls))
71 continue;
72
73 list_del(&lkb->lkb_astqueue);
74 type = lkb->lkb_ast_type;
75 lkb->lkb_ast_type = 0;
76 found = 1;
77 break;
78 }
79 spin_unlock(&ast_queue_lock);
80
/* lkb only refers to a real entry when found is set */
81 if (!found)
82 break;
83
84 cast = lkb->lkb_astaddr;
85 bast = lkb->lkb_bastaddr;
86 bmode = lkb->lkb_bastmode;
87
/* completion AST */
88 if ((type & AST_COMP) && cast)
89 cast(lkb->lkb_astparam);
90
91 /* FIXME: Is it safe to look at lkb_grmode here
92 without doing a lock_rsb() ?
93 Look at other checks in v1 to avoid basts. */
94
/* blocking AST, only when dlm_modes_compat() reports the granted mode as
   not compatible with the recorded blocking mode */
95 if ((type & AST_BAST) && bast)
96 if (!dlm_modes_compat(lkb->lkb_grmode, bmode))
97 bast(lkb->lkb_astparam, bmode);
98
99 /* this removes the reference added by dlm_add_ast
100 and may result in the lkb being freed */
101 dlm_put_lkb(lkb);
102
/* let other tasks run between ASTs */
103 schedule();
104 }
105}
106
/* Return non-zero when ast_queue is empty; the check is made under
   ast_queue_lock. */
107static inline int no_asts(void)
108{
109 int ret;
110
111 spin_lock(&ast_queue_lock);
112 ret = list_empty(&ast_queue);
113 spin_unlock(&ast_queue_lock);
114 return ret;
115}
116
/*
 * Main loop of the "dlm_astd" kernel thread (data is unused).
 *
 * Sleeps interruptibly unless WAKE_ASTS is already set, then, holding
 * astd_running, clears the flag and runs process_asts() if it was set.
 * Returns 0 once kthread_should_stop() is true.
 *
 * NOTE(review): kthread_should_stop() is tested before the task state is
 * set to TASK_INTERRUPTIBLE; confirm that a stop request arriving between
 * the two cannot leave the thread asleep in schedule().
 */
117static int dlm_astd(void *data)
118{
119 while (!kthread_should_stop()) {
120 set_current_state(TASK_INTERRUPTIBLE);
121 if (!test_bit(WAKE_ASTS, &astd_wakeflags))
122 schedule();
123 set_current_state(TASK_RUNNING);
124
125 mutex_lock(&astd_running);
126 if (test_and_clear_bit(WAKE_ASTS, &astd_wakeflags))
127 process_asts();
128 mutex_unlock(&astd_running);
129 }
130 return 0;
131}
132
/* Wake the dlm_astd thread, but only if ast_queue currently holds
   something for it to deliver. */
133void dlm_astd_wake(void)
134{
135 if (!no_asts()) {
136 set_bit(WAKE_ASTS, &astd_wakeflags);
137 wake_up_process(astd_task);
138 }
139}
140
/*
 * Initialise the AST queue, its spinlock and the astd_running mutex, then
 * start the "dlm_astd" kernel thread.
 *
 * Returns 0 on success, or the error encoded in the pointer returned by
 * kthread_run(); astd_task is only assigned on success.
 */
141int dlm_astd_start(void)
142{
143 struct task_struct *p;
144 int error = 0;
145
146 INIT_LIST_HEAD(&ast_queue);
147 spin_lock_init(&ast_queue_lock);
148 mutex_init(&astd_running);
149
150 p = kthread_run(dlm_astd, NULL, "dlm_astd");
151 if (IS_ERR(p))
152 error = PTR_ERR(p);
153 else
154 astd_task = p;
155 return error;
156}
157
/* Stop the dlm_astd thread recorded in astd_task via kthread_stop(). */
158void dlm_astd_stop(void)
159{
160 kthread_stop(astd_task);
161}
162
/* Take astd_running.  dlm_astd holds the same mutex around process_asts(),
   so no AST delivery pass runs until dlm_astd_resume() releases it. */
163void dlm_astd_suspend(void)
164{
165 mutex_lock(&astd_running);
166}
167
/* Release astd_running, which dlm_astd_suspend() acquires, letting
   dlm_astd deliver ASTs again. */
168void dlm_astd_resume(void)
169{
170 mutex_unlock(&astd_running);
171}
172
diff --git a/fs/dlm/ast.h b/fs/dlm/ast.h
new file mode 100644
index 000000000000..6ee276c74c52
--- /dev/null
+++ b/fs/dlm/ast.h
@@ -0,0 +1,26 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13#ifndef __ASTD_DOT_H__
14#define __ASTD_DOT_H__
15
/* Queue an lkb for AST delivery / take it back off the queue. */
16void dlm_add_ast(struct dlm_lkb *lkb, int type);
17void dlm_del_ast(struct dlm_lkb *lkb);
18
/* Control of the dlm_astd delivery thread: wake it, start and stop it, and
   temporarily hold off delivery (suspend/resume). */
19void dlm_astd_wake(void);
20int dlm_astd_start(void);
21void dlm_astd_stop(void);
22void dlm_astd_suspend(void);
23void dlm_astd_resume(void);
24
25#endif
26
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
new file mode 100644
index 000000000000..88553054bbfa
--- /dev/null
+++ b/fs/dlm/config.c
@@ -0,0 +1,789 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#include <linux/kernel.h>
15#include <linux/module.h>
16#include <linux/configfs.h>
17#include <net/sock.h>
18
19#include "config.h"
20#include "lowcomms.h"
21
22/*
23 * /config/dlm/<cluster>/spaces/<space>/nodes/<node>/nodeid
24 * /config/dlm/<cluster>/spaces/<space>/nodes/<node>/weight
25 * /config/dlm/<cluster>/comms/<comm>/nodeid
26 * /config/dlm/<cluster>/comms/<comm>/local
27 * /config/dlm/<cluster>/comms/<comm>/addr
28 * The <cluster> level is useless, but I haven't figured out how to avoid it.
29 */
30
/* The "spaces" and "comms" default groups of the cluster; set by
   make_cluster() and reset to NULL by drop_cluster().  There is only one
   of each, so a later make_cluster() overwrites the earlier values. */
31static struct config_group *space_list;
32static struct config_group *comm_list;
/* First comm written with a non-zero "local" attribute (see
   comm_local_write()); cleared by drop_comm() when that comm goes away. */
33static struct comm *local_comm;
34
/* Forward declarations of the configfs object types defined further down. */
35struct clusters;
36struct cluster;
37struct spaces;
38struct space;
39struct comms;
40struct comm;
41struct nodes;
42struct node;
43
/* configfs make/drop/release callbacks, one set per directory level. */
44static struct config_group *make_cluster(struct config_group *, const char *);
45static void drop_cluster(struct config_group *, struct config_item *);
46static void release_cluster(struct config_item *);
47static struct config_group *make_space(struct config_group *, const char *);
48static void drop_space(struct config_group *, struct config_item *);
49static void release_space(struct config_item *);
50static struct config_item *make_comm(struct config_group *, const char *);
51static void drop_comm(struct config_group *, struct config_item *);
52static void release_comm(struct config_item *);
53static struct config_item *make_node(struct config_group *, const char *);
54static void drop_node(struct config_group *, struct config_item *);
55static void release_node(struct config_item *);
56
/* Generic attribute show/store entry points; they dispatch to the
   per-attribute handlers declared below. */
57static ssize_t show_comm(struct config_item *i, struct configfs_attribute *a,
58 char *buf);
59static ssize_t store_comm(struct config_item *i, struct configfs_attribute *a,
60 const char *buf, size_t len);
61static ssize_t show_node(struct config_item *i, struct configfs_attribute *a,
62 char *buf);
63static ssize_t store_node(struct config_item *i, struct configfs_attribute *a,
64 const char *buf, size_t len);
65
/* Per-attribute handlers.  Note there is no read handler for comm "addr". */
66static ssize_t comm_nodeid_read(struct comm *cm, char *buf);
67static ssize_t comm_nodeid_write(struct comm *cm, const char *buf, size_t len);
68static ssize_t comm_local_read(struct comm *cm, char *buf);
69static ssize_t comm_local_write(struct comm *cm, const char *buf, size_t len);
70static ssize_t comm_addr_write(struct comm *cm, const char *buf, size_t len);
71static ssize_t node_nodeid_read(struct node *nd, char *buf);
72static ssize_t node_nodeid_write(struct node *nd, const char *buf, size_t len);
73static ssize_t node_weight_read(struct node *nd, char *buf);
74static ssize_t node_weight_write(struct node *nd, const char *buf, size_t len);
75
/* Indices into comm_attrs[]. */
76enum {
77 COMM_ATTR_NODEID = 0,
78 COMM_ATTR_LOCAL,
79 COMM_ATTR_ADDR,
80};
81
/* A configfs attribute of a comm together with its typed handlers; either
   handler may be left NULL (show_comm/store_comm check for that). */
82struct comm_attribute {
83 struct configfs_attribute attr;
84 ssize_t (*show)(struct comm *, char *);
85 ssize_t (*store)(struct comm *, const char *, size_t);
86};
87
88static struct comm_attribute comm_attr_nodeid = {
89 .attr = { .ca_owner = THIS_MODULE,
90 .ca_name = "nodeid",
91 .ca_mode = S_IRUGO | S_IWUSR },
92 .show = comm_nodeid_read,
93 .store = comm_nodeid_write,
94};
95
96static struct comm_attribute comm_attr_local = {
97 .attr = { .ca_owner = THIS_MODULE,
98 .ca_name = "local",
99 .ca_mode = S_IRUGO | S_IWUSR },
100 .show = comm_local_read,
101 .store = comm_local_write,
102};
103
/* "addr" has a store handler only: although the mode includes S_IRUGO,
   show_comm() returns 0 for it, so a read yields no data. */
104static struct comm_attribute comm_attr_addr = {
105 .attr = { .ca_owner = THIS_MODULE,
106 .ca_name = "addr",
107 .ca_mode = S_IRUGO | S_IWUSR },
108 .store = comm_addr_write,
109};
110
/* NULL-terminated attribute list referenced by comm_type.ct_attrs. */
111static struct configfs_attribute *comm_attrs[] = {
112 [COMM_ATTR_NODEID] = &comm_attr_nodeid.attr,
113 [COMM_ATTR_LOCAL] = &comm_attr_local.attr,
114 [COMM_ATTR_ADDR] = &comm_attr_addr.attr,
115 NULL,
116};
117
/* Indices into node_attrs[]. */
118enum {
119 NODE_ATTR_NODEID = 0,
120 NODE_ATTR_WEIGHT,
121};
122
/* A configfs attribute of a node together with its typed handlers; either
   handler may be left NULL (show_node/store_node check for that). */
123struct node_attribute {
124 struct configfs_attribute attr;
125 ssize_t (*show)(struct node *, char *);
126 ssize_t (*store)(struct node *, const char *, size_t);
127};
128
129static struct node_attribute node_attr_nodeid = {
130 .attr = { .ca_owner = THIS_MODULE,
131 .ca_name = "nodeid",
132 .ca_mode = S_IRUGO | S_IWUSR },
133 .show = node_nodeid_read,
134 .store = node_nodeid_write,
135};
136
137static struct node_attribute node_attr_weight = {
138 .attr = { .ca_owner = THIS_MODULE,
139 .ca_name = "weight",
140 .ca_mode = S_IRUGO | S_IWUSR },
141 .show = node_weight_read,
142 .store = node_weight_write,
143};
144
/* NULL-terminated attribute list referenced by node_type.ct_attrs. */
145static struct configfs_attribute *node_attrs[] = {
146 [NODE_ATTR_NODEID] = &node_attr_nodeid.attr,
147 [NODE_ATTR_WEIGHT] = &node_attr_weight.attr,
148 NULL,
149};
150
/* Root of the hierarchy: the configfs subsystem (see clusters_root). */
151struct clusters {
152 struct configfs_subsystem subsys;
153};
154
/* One <cluster> directory; its default groups are "spaces" and "comms"
   (set up in make_cluster()). */
155struct cluster {
156 struct config_group group;
157};
158
/* The "spaces" directory of a cluster. */
159struct spaces {
160 struct config_group ss_group;
161};
162
/* One <space> directory.  members lists the struct node objects created
   under it; the list and members_count are updated under members_lock. */
163struct space {
164 struct config_group group;
165 struct list_head members;
166 struct mutex members_lock;
167 int members_count;
168};
169
/* The "comms" directory of a cluster. */
170struct comms {
171 struct config_group cs_group;
172};
173
/* One <comm> item.  nodeid starts at -1 (make_comm()); addr[] holds
   addr_count allocated addresses, appended by comm_addr_write() and freed
   by drop_comm(). */
174struct comm {
175 struct config_item item;
176 int nodeid;
177 int local;
178 int addr_count;
179 struct sockaddr_storage *addr[DLM_MAX_ADDR_COUNT];
180};
181
/* The "nodes" directory of a space. */
182struct nodes {
183 struct config_group ns_group;
184};
185
/* One <node> item.  make_node() starts it with nodeid -1 and weight 1. */
186struct node {
187 struct config_item item;
188 struct list_head list; /* space->members */
189 int nodeid;
190 int weight;
191};
192
/*
 * configfs callback tables.  The *s_ops group tables say how children of a
 * directory are created and dropped; the singular item tables say how one
 * object is released and, for comm and node, how its attributes are shown
 * and stored.
 */
193static struct configfs_group_operations clusters_ops = {
194 .make_group = make_cluster,
195 .drop_item = drop_cluster,
196};
197
198static struct configfs_item_operations cluster_ops = {
199 .release = release_cluster,
200};
201
202static struct configfs_group_operations spaces_ops = {
203 .make_group = make_space,
204 .drop_item = drop_space,
205};
206
207static struct configfs_item_operations space_ops = {
208 .release = release_space,
209};
210
/* comms and nodes hold plain items, hence make_item rather than make_group */
211static struct configfs_group_operations comms_ops = {
212 .make_item = make_comm,
213 .drop_item = drop_comm,
214};
215
216static struct configfs_item_operations comm_ops = {
217 .release = release_comm,
218 .show_attribute = show_comm,
219 .store_attribute = store_comm,
220};
221
222static struct configfs_group_operations nodes_ops = {
223 .make_item = make_node,
224 .drop_item = drop_node,
225};
226
227static struct configfs_item_operations node_ops = {
228 .release = release_node,
229 .show_attribute = show_node,
230 .store_attribute = store_node,
231};
232
/*
 * Item types tying the callback tables (and, for comm and node, the
 * attribute lists) to each level of the hierarchy.
 *
 * NOTE(review): clusters_type, spaces_type, comms_type and nodes_type carry
 * group ops only, so objects of those types have no release callback here.
 */
233static struct config_item_type clusters_type = {
234 .ct_group_ops = &clusters_ops,
235 .ct_owner = THIS_MODULE,
236};
237
238static struct config_item_type cluster_type = {
239 .ct_item_ops = &cluster_ops,
240 .ct_owner = THIS_MODULE,
241};
242
243static struct config_item_type spaces_type = {
244 .ct_group_ops = &spaces_ops,
245 .ct_owner = THIS_MODULE,
246};
247
248static struct config_item_type space_type = {
249 .ct_item_ops = &space_ops,
250 .ct_owner = THIS_MODULE,
251};
252
253static struct config_item_type comms_type = {
254 .ct_group_ops = &comms_ops,
255 .ct_owner = THIS_MODULE,
256};
257
258static struct config_item_type comm_type = {
259 .ct_item_ops = &comm_ops,
260 .ct_attrs = comm_attrs,
261 .ct_owner = THIS_MODULE,
262};
263
264static struct config_item_type nodes_type = {
265 .ct_group_ops = &nodes_ops,
266 .ct_owner = THIS_MODULE,
267};
268
269static struct config_item_type node_type = {
270 .ct_item_ops = &node_ops,
271 .ct_attrs = node_attrs,
272 .ct_owner = THIS_MODULE,
273};
274
/* Map a config_item to the struct cluster whose group contains it;
   a NULL item yields NULL. */
275static struct cluster *to_cluster(struct config_item *i)
276{
277 return i ? container_of(to_config_group(i), struct cluster, group):NULL;
278}
279
/* Map a config_item to the struct space whose group contains it;
   a NULL item yields NULL. */
280static struct space *to_space(struct config_item *i)
281{
282 return i ? container_of(to_config_group(i), struct space, group) : NULL;
283}
284
/* Map a config_item to the struct comm embedding it; NULL yields NULL. */
285static struct comm *to_comm(struct config_item *i)
286{
287 return i ? container_of(i, struct comm, item) : NULL;
288}
289
/* Map a config_item to the struct node embedding it; NULL yields NULL. */
290static struct node *to_node(struct config_item *i)
291{
292 return i ? container_of(i, struct node, item) : NULL;
293}
294
/*
 * configfs make_group callback for the root directory: create a <cluster>
 * group named name with two default sub-groups, "spaces" and "comms".
 *
 * g is the parent group (unused here).  Returns the new cluster's group, or
 * NULL if any of the four allocations fails, in which case everything that
 * was allocated is freed again.
 *
 * The sub-groups are also published in the file-scope space_list and
 * comm_list; those are plain single pointers, so creating a second cluster
 * redirects them to the newest one.
 *
 * NOTE(review): sps and cms use spaces_type/comms_type, which have no
 * release callback in this file -- confirm where they are freed.
 */
295static struct config_group *make_cluster(struct config_group *g,
296 const char *name)
297{
298 struct cluster *cl = NULL;
299 struct spaces *sps = NULL;
300 struct comms *cms = NULL;
301 void *gps = NULL;
302
303 cl = kzalloc(sizeof(struct cluster), GFP_KERNEL);
/* two default groups plus the terminating NULL slot */
304 gps = kcalloc(3, sizeof(struct config_group *), GFP_KERNEL);
305 sps = kzalloc(sizeof(struct spaces), GFP_KERNEL);
306 cms = kzalloc(sizeof(struct comms), GFP_KERNEL);
307
308 if (!cl || !gps || !sps || !cms)
309 goto fail;
310
311 config_group_init_type_name(&cl->group, name, &cluster_type);
312 config_group_init_type_name(&sps->ss_group, "spaces", &spaces_type);
313 config_group_init_type_name(&cms->cs_group, "comms", &comms_type);
314
315 cl->group.default_groups = gps;
316 cl->group.default_groups[0] = &sps->ss_group;
317 cl->group.default_groups[1] = &cms->cs_group;
318 cl->group.default_groups[2] = NULL;
319
320 space_list = &sps->ss_group;
321 comm_list = &cms->cs_group;
322 return &cl->group;
323
324 fail:
325 kfree(cl);
326 kfree(gps);
327 kfree(sps);
328 kfree(cms);
329 return NULL;
330}
331
/*
 * configfs drop_item callback for a <cluster>: drop a reference on each
 * default group (clearing its slot in the array first), reset the
 * file-scope space_list/comm_list to NULL, then drop a reference on the
 * cluster item itself.  g is the parent group (unused here).
 */
332static void drop_cluster(struct config_group *g, struct config_item *i)
333{
334 struct cluster *cl = to_cluster(i);
335 struct config_item *tmp;
336 int j;
337
338 for (j = 0; cl->group.default_groups[j]; j++) {
339 tmp = &cl->group.default_groups[j]->cg_item;
340 cl->group.default_groups[j] = NULL;
341 config_item_put(tmp);
342 }
343
344 space_list = NULL;
345 comm_list = NULL;
346
347 config_item_put(i);
348}
349
/* configfs release callback for a <cluster>: free the default_groups
   pointer array allocated in make_cluster() and the cluster itself. */
350static void release_cluster(struct config_item *i)
351{
352 struct cluster *cl = to_cluster(i);
353 kfree(cl->group.default_groups);
354 kfree(cl);
355}
356
/*
 * configfs make_group callback for "spaces": create a <space> group named
 * name with one default sub-group, "nodes", and an empty member list.
 *
 * g is the parent group (unused here).  Returns the new space's group, or
 * NULL if any allocation fails (all allocations are then freed).
 *
 * NOTE(review): nds uses nodes_type, which has no release callback in this
 * file -- confirm where it is freed.
 */
357static struct config_group *make_space(struct config_group *g, const char *name)
358{
359 struct space *sp = NULL;
360 struct nodes *nds = NULL;
361 void *gps = NULL;
362
363 sp = kzalloc(sizeof(struct space), GFP_KERNEL);
/* one default group plus the terminating NULL slot */
364 gps = kcalloc(2, sizeof(struct config_group *), GFP_KERNEL);
365 nds = kzalloc(sizeof(struct nodes), GFP_KERNEL);
366
367 if (!sp || !gps || !nds)
368 goto fail;
369
370 config_group_init_type_name(&sp->group, name, &space_type);
371 config_group_init_type_name(&nds->ns_group, "nodes", &nodes_type);
372
373 sp->group.default_groups = gps;
374 sp->group.default_groups[0] = &nds->ns_group;
375 sp->group.default_groups[1] = NULL;
376
377 INIT_LIST_HEAD(&sp->members);
378 mutex_init(&sp->members_lock);
379 sp->members_count = 0;
380 return &sp->group;
381
382 fail:
383 kfree(sp);
384 kfree(gps);
385 kfree(nds);
386 return NULL;
387}
388
/*
 * configfs drop_item callback for a <space>: drop a reference on each
 * default group (clearing its slot first), then on the space item itself.
 * The member list is expected to be empty by now, but that is not checked.
 * g is the parent group (unused here).
 */
389static void drop_space(struct config_group *g, struct config_item *i)
390{
391 struct space *sp = to_space(i);
392 struct config_item *tmp;
393 int j;
394
395 /* assert list_empty(&sp->members) */
396
397 for (j = 0; sp->group.default_groups[j]; j++) {
398 tmp = &sp->group.default_groups[j]->cg_item;
399 sp->group.default_groups[j] = NULL;
400 config_item_put(tmp);
401 }
402
403 config_item_put(i);
404}
405
/* configfs release callback for a <space>: free the default_groups pointer
   array allocated in make_space() and the space itself. */
406static void release_space(struct config_item *i)
407{
408 struct space *sp = to_space(i);
409 kfree(sp->group.default_groups);
410 kfree(sp);
411}
412
/*
 * configfs make_item callback for "comms": create a <comm> item named name
 * with nodeid -1, local 0 and no addresses.  g is the parent group (unused
 * here).  Returns the new item, or NULL if the allocation fails.
 */
413static struct config_item *make_comm(struct config_group *g, const char *name)
414{
415 struct comm *cm;
416
417 cm = kzalloc(sizeof(struct comm), GFP_KERNEL);
418 if (!cm)
419 return NULL;
420
421 config_item_init_type_name(&cm->item, name, &comm_type);
422 cm->nodeid = -1;
423 cm->local = 0;
424 cm->addr_count = 0;
425 return &cm->item;
426}
427
/*
 * configfs drop_item callback for a <comm>: forget it as local_comm if it
 * was the local one, call dlm_lowcomms_close() for its nodeid, free every
 * stored address and drop a reference on the item.  g is the parent group
 * (unused here).
 */
428static void drop_comm(struct config_group *g, struct config_item *i)
429{
430 struct comm *cm = to_comm(i);
431 if (local_comm == cm)
432 local_comm = NULL;
433 dlm_lowcomms_close(cm->nodeid);
/* frees addr[addr_count-1] down to addr[0]; the post-decrement leaves
   addr_count at -1 once the loop ends */
434 while (cm->addr_count--)
435 kfree(cm->addr[cm->addr_count]);
436 config_item_put(i);
437}
438
/* configfs release callback for a <comm>: free the struct comm.  The
   addresses are not freed here; drop_comm() does that. */
439static void release_comm(struct config_item *i)
440{
441 struct comm *cm = to_comm(i);
442 kfree(cm);
443}
444
/*
 * configfs make_item callback for "nodes": create a <node> item named name
 * (nodeid -1, weight 1) and add it to the member list of the owning space.
 *
 * g is the "nodes" group; the space is taken from that group's parent item.
 * Returns the new item, or NULL if the allocation fails.
 */
445static struct config_item *make_node(struct config_group *g, const char *name)
446{
447 struct space *sp = to_space(g->cg_item.ci_parent);
448 struct node *nd;
449
450 nd = kzalloc(sizeof(struct node), GFP_KERNEL);
451 if (!nd)
452 return NULL;
453
454 config_item_init_type_name(&nd->item, name, &node_type);
455 nd->nodeid = -1;
456 nd->weight = 1; /* default weight of 1 if none is set */
457
458 mutex_lock(&sp->members_lock);
459 list_add(&nd->list, &sp->members);
460 sp->members_count++;
461 mutex_unlock(&sp->members_lock);
462
463 return &nd->item;
464}
465
/*
 * configfs drop_item callback for a <node>: unlink it from the member list
 * of the owning space (the parent of the "nodes" group g), decrement the
 * member count, and drop a reference on the item.
 */
466static void drop_node(struct config_group *g, struct config_item *i)
467{
468 struct space *sp = to_space(g->cg_item.ci_parent);
469 struct node *nd = to_node(i);
470
471 mutex_lock(&sp->members_lock);
472 list_del(&nd->list);
473 sp->members_count--;
474 mutex_unlock(&sp->members_lock);
475
476 config_item_put(i);
477}
478
/* configfs release callback for a <node>: free the struct node. */
479static void release_node(struct config_item *i)
480{
481 struct node *nd = to_node(i);
482 kfree(nd);
483}
484
/* The configfs subsystem registered by dlm_config_init(): a root group
   named "dlm" of type clusters_type, under which clusters are created. */
485static struct clusters clusters_root = {
486 .subsys = {
487 .su_group = {
488 .cg_item = {
489 .ci_namebuf = "dlm",
490 .ci_type = &clusters_type,
491 },
492 },
493 },
494};
495
/* Initialise the root group and the subsystem semaphore, then register the
   "dlm" subsystem with configfs.  Returns the result of
   configfs_register_subsystem(). */
496int dlm_config_init(void)
497{
498 config_group_init(&clusters_root.subsys.su_group);
499 init_MUTEX(&clusters_root.subsys.su_sem);
500 return configfs_register_subsystem(&clusters_root.subsys);
501}
502
/* Unregister the "dlm" configfs subsystem registered by dlm_config_init(). */
503void dlm_config_exit(void)
504{
505 configfs_unregister_subsystem(&clusters_root.subsys);
506}
507
508/*
509 * Functions for user space to read/write attributes
510 */
511
/*
 * configfs show_attribute callback for a comm: recover the comm and the
 * enclosing comm_attribute, then call the attribute's show handler.
 * Returns that handler's result, or 0 when the attribute has no show
 * handler (as is the case for "addr").
 */
512static ssize_t show_comm(struct config_item *i, struct configfs_attribute *a,
513 char *buf)
514{
515 struct comm *cm = to_comm(i);
516 struct comm_attribute *cma =
517 container_of(a, struct comm_attribute, attr);
518 return cma->show ? cma->show(cm, buf) : 0;
519}
520
/*
 * configfs store_attribute callback for a comm: recover the comm and the
 * enclosing comm_attribute, then call the attribute's store handler.
 * Returns that handler's result, or -EINVAL when there is no store handler.
 */
521static ssize_t store_comm(struct config_item *i, struct configfs_attribute *a,
522 const char *buf, size_t len)
523{
524 struct comm *cm = to_comm(i);
525 struct comm_attribute *cma =
526 container_of(a, struct comm_attribute, attr);
527 return cma->store ? cma->store(cm, buf, len) : -EINVAL;
528}
529
/* Format the comm's nodeid as decimal text plus newline into buf; returns
   the number of characters written. */
530static ssize_t comm_nodeid_read(struct comm *cm, char *buf)
531{
532 return sprintf(buf, "%d\n", cm->nodeid);
533}
534
/* Parse buf with simple_strtol() (base 0, i.e. auto-detected) and store the
   result as the comm's nodeid.  The input is not validated; len is always
   returned, so the write is reported as fully consumed. */
535static ssize_t comm_nodeid_write(struct comm *cm, const char *buf, size_t len)
536{
537 cm->nodeid = simple_strtol(buf, NULL, 0);
538 return len;
539}
540
/* Format the comm's local flag as decimal text plus newline into buf;
   returns the number of characters written. */
541static ssize_t comm_local_read(struct comm *cm, char *buf)
542{
543 return sprintf(buf, "%d\n", cm->local);
544}
545
/*
 * Parse buf with simple_strtol() (base 0) into the comm's local flag.  If
 * the flag is now non-zero and no local comm is recorded yet, this comm
 * becomes local_comm; an already recorded local_comm is never replaced, and
 * writing 0 does not clear it.  Always returns len.
 */
546static ssize_t comm_local_write(struct comm *cm, const char *buf, size_t len)
547{
548 cm->local= simple_strtol(buf, NULL, 0);
549 if (cm->local && !local_comm)
550 local_comm = cm;
551 return len;
552}
553
554static ssize_t comm_addr_write(struct comm *cm, const char *buf, size_t len)
555{
556 struct sockaddr_storage *addr;
557
558 if (len != sizeof(struct sockaddr_storage))
559 return -EINVAL;
560
561 if (cm->addr_count >= DLM_MAX_ADDR_COUNT)
562 return -ENOSPC;
563
564 addr = kzalloc(sizeof(*addr), GFP_KERNEL);
565 if (!addr)
566 return -ENOMEM;
567
568 memcpy(addr, buf, len);
569 cm->addr[cm->addr_count++] = addr;
570 return len;
571}
572
573static ssize_t show_node(struct config_item *i, struct configfs_attribute *a,
574 char *buf)
575{
576 struct node *nd = to_node(i);
577 struct node_attribute *nda =
578 container_of(a, struct node_attribute, attr);
579 return nda->show ? nda->show(nd, buf) : 0;
580}
581
582static ssize_t store_node(struct config_item *i, struct configfs_attribute *a,
583 const char *buf, size_t len)
584{
585 struct node *nd = to_node(i);
586 struct node_attribute *nda =
587 container_of(a, struct node_attribute, attr);
588 return nda->store ? nda->store(nd, buf, len) : -EINVAL;
589}
590
591static ssize_t node_nodeid_read(struct node *nd, char *buf)
592{
593 return sprintf(buf, "%d\n", nd->nodeid);
594}
595
596static ssize_t node_nodeid_write(struct node *nd, const char *buf, size_t len)
597{
598 nd->nodeid = simple_strtol(buf, NULL, 0);
599 return len;
600}
601
602static ssize_t node_weight_read(struct node *nd, char *buf)
603{
604 return sprintf(buf, "%d\n", nd->weight);
605}
606
607static ssize_t node_weight_write(struct node *nd, const char *buf, size_t len)
608{
609 nd->weight = simple_strtol(buf, NULL, 0);
610 return len;
611}
612
613/*
614 * Functions for the dlm to get the info that's been configured
615 */
616
617static struct space *get_space(char *name)
618{
619 if (!space_list)
620 return NULL;
621 return to_space(config_group_find_obj(space_list, name));
622}
623
624static void put_space(struct space *sp)
625{
626 config_item_put(&sp->group.cg_item);
627}
628
629static struct comm *get_comm(int nodeid, struct sockaddr_storage *addr)
630{
631 struct config_item *i;
632 struct comm *cm = NULL;
633 int found = 0;
634
635 if (!comm_list)
636 return NULL;
637
638 down(&clusters_root.subsys.su_sem);
639
640 list_for_each_entry(i, &comm_list->cg_children, ci_entry) {
641 cm = to_comm(i);
642
643 if (nodeid) {
644 if (cm->nodeid != nodeid)
645 continue;
646 found = 1;
647 break;
648 } else {
649 if (!cm->addr_count ||
650 memcmp(cm->addr[0], addr, sizeof(*addr)))
651 continue;
652 found = 1;
653 break;
654 }
655 }
656 up(&clusters_root.subsys.su_sem);
657
658 if (found)
659 config_item_get(i);
660 else
661 cm = NULL;
662 return cm;
663}
664
665static void put_comm(struct comm *cm)
666{
667 config_item_put(&cm->item);
668}
669
670/* caller must free mem */
671int dlm_nodeid_list(char *lsname, int **ids_out)
672{
673 struct space *sp;
674 struct node *nd;
675 int i = 0, rv = 0;
676 int *ids;
677
678 sp = get_space(lsname);
679 if (!sp)
680 return -EEXIST;
681
682 mutex_lock(&sp->members_lock);
683 if (!sp->members_count) {
684 rv = 0;
685 goto out;
686 }
687
688 ids = kcalloc(sp->members_count, sizeof(int), GFP_KERNEL);
689 if (!ids) {
690 rv = -ENOMEM;
691 goto out;
692 }
693
694 rv = sp->members_count;
695 list_for_each_entry(nd, &sp->members, list)
696 ids[i++] = nd->nodeid;
697
698 if (rv != i)
699 printk("bad nodeid count %d %d\n", rv, i);
700
701 *ids_out = ids;
702 out:
703 mutex_unlock(&sp->members_lock);
704 put_space(sp);
705 return rv;
706}
707
708int dlm_node_weight(char *lsname, int nodeid)
709{
710 struct space *sp;
711 struct node *nd;
712 int w = -EEXIST;
713
714 sp = get_space(lsname);
715 if (!sp)
716 goto out;
717
718 mutex_lock(&sp->members_lock);
719 list_for_each_entry(nd, &sp->members, list) {
720 if (nd->nodeid != nodeid)
721 continue;
722 w = nd->weight;
723 break;
724 }
725 mutex_unlock(&sp->members_lock);
726 put_space(sp);
727 out:
728 return w;
729}
730
731int dlm_nodeid_to_addr(int nodeid, struct sockaddr_storage *addr)
732{
733 struct comm *cm = get_comm(nodeid, NULL);
734 if (!cm)
735 return -EEXIST;
736 if (!cm->addr_count)
737 return -ENOENT;
738 memcpy(addr, cm->addr[0], sizeof(*addr));
739 put_comm(cm);
740 return 0;
741}
742
743int dlm_addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid)
744{
745 struct comm *cm = get_comm(0, addr);
746 if (!cm)
747 return -EEXIST;
748 *nodeid = cm->nodeid;
749 put_comm(cm);
750 return 0;
751}
752
753int dlm_our_nodeid(void)
754{
755 return local_comm ? local_comm->nodeid : 0;
756}
757
758/* num 0 is first addr, num 1 is second addr */
759int dlm_our_addr(struct sockaddr_storage *addr, int num)
760{
761 if (!local_comm)
762 return -1;
763 if (num + 1 > local_comm->addr_count)
764 return -1;
765 memcpy(addr, local_comm->addr[num], sizeof(*addr));
766 return 0;
767}
768
769/* Config file defaults */
770#define DEFAULT_TCP_PORT 21064
771#define DEFAULT_BUFFER_SIZE 4096
772#define DEFAULT_RSBTBL_SIZE 256
773#define DEFAULT_LKBTBL_SIZE 1024
774#define DEFAULT_DIRTBL_SIZE 512
775#define DEFAULT_RECOVER_TIMER 5
776#define DEFAULT_TOSS_SECS 10
777#define DEFAULT_SCAN_SECS 5
778
779struct dlm_config_info dlm_config = {
780 .tcp_port = DEFAULT_TCP_PORT,
781 .buffer_size = DEFAULT_BUFFER_SIZE,
782 .rsbtbl_size = DEFAULT_RSBTBL_SIZE,
783 .lkbtbl_size = DEFAULT_LKBTBL_SIZE,
784 .dirtbl_size = DEFAULT_DIRTBL_SIZE,
785 .recover_timer = DEFAULT_RECOVER_TIMER,
786 .toss_secs = DEFAULT_TOSS_SECS,
787 .scan_secs = DEFAULT_SCAN_SECS
788};
789
diff --git a/fs/dlm/config.h b/fs/dlm/config.h
new file mode 100644
index 000000000000..9da7839958a9
--- /dev/null
+++ b/fs/dlm/config.h
@@ -0,0 +1,42 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#ifndef __CONFIG_DOT_H__
15#define __CONFIG_DOT_H__
16
17#define DLM_MAX_ADDR_COUNT 3
18
19struct dlm_config_info {
20 int tcp_port;
21 int buffer_size;
22 int rsbtbl_size;
23 int lkbtbl_size;
24 int dirtbl_size;
25 int recover_timer;
26 int toss_secs;
27 int scan_secs;
28};
29
30extern struct dlm_config_info dlm_config;
31
32int dlm_config_init(void);
33void dlm_config_exit(void);
34int dlm_node_weight(char *lsname, int nodeid);
35int dlm_nodeid_list(char *lsname, int **ids_out);
36int dlm_nodeid_to_addr(int nodeid, struct sockaddr_storage *addr);
37int dlm_addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid);
38int dlm_our_nodeid(void);
39int dlm_our_addr(struct sockaddr_storage *addr, int num);
40
41#endif /* __CONFIG_DOT_H__ */
42
diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c
new file mode 100644
index 000000000000..8f471d9a9e3a
--- /dev/null
+++ b/fs/dlm/debug_fs.c
@@ -0,0 +1,384 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13#include <linux/pagemap.h>
14#include <linux/seq_file.h>
15#include <linux/module.h>
16#include <linux/ctype.h>
17#include <linux/debugfs.h>
18
19#include "dlm_internal.h"
20
21#define DLM_DEBUG_BUF_LEN 4096
22static char debug_buf[DLM_DEBUG_BUF_LEN];
23static struct mutex debug_buf_lock;
24
25static struct dentry *dlm_root;
26
27struct rsb_iter {
28 int entry;
29 struct dlm_ls *ls;
30 struct list_head *next;
31 struct dlm_rsb *rsb;
32};
33
34/*
35 * dump all rsb's in the lockspace hash table
36 */
37
38static char *print_lockmode(int mode)
39{
40 switch (mode) {
41 case DLM_LOCK_IV:
42 return "--";
43 case DLM_LOCK_NL:
44 return "NL";
45 case DLM_LOCK_CR:
46 return "CR";
47 case DLM_LOCK_CW:
48 return "CW";
49 case DLM_LOCK_PR:
50 return "PR";
51 case DLM_LOCK_PW:
52 return "PW";
53 case DLM_LOCK_EX:
54 return "EX";
55 default:
56 return "??";
57 }
58}
59
60static void print_lock(struct seq_file *s, struct dlm_lkb *lkb,
61 struct dlm_rsb *res)
62{
63 seq_printf(s, "%08x %s", lkb->lkb_id, print_lockmode(lkb->lkb_grmode));
64
65 if (lkb->lkb_status == DLM_LKSTS_CONVERT
66 || lkb->lkb_status == DLM_LKSTS_WAITING)
67 seq_printf(s, " (%s)", print_lockmode(lkb->lkb_rqmode));
68
69 if (lkb->lkb_nodeid) {
70 if (lkb->lkb_nodeid != res->res_nodeid)
71 seq_printf(s, " Remote: %3d %08x", lkb->lkb_nodeid,
72 lkb->lkb_remid);
73 else
74 seq_printf(s, " Master: %08x", lkb->lkb_remid);
75 }
76
77 if (lkb->lkb_wait_type)
78 seq_printf(s, " wait_type: %d", lkb->lkb_wait_type);
79
80 seq_printf(s, "\n");
81}
82
83static int print_resource(struct dlm_rsb *res, struct seq_file *s)
84{
85 struct dlm_lkb *lkb;
86 int i, lvblen = res->res_ls->ls_lvblen, recover_list, root_list;
87
88 seq_printf(s, "\nResource %p Name (len=%d) \"", res, res->res_length);
89 for (i = 0; i < res->res_length; i++) {
90 if (isprint(res->res_name[i]))
91 seq_printf(s, "%c", res->res_name[i]);
92 else
93 seq_printf(s, "%c", '.');
94 }
95 if (res->res_nodeid > 0)
96 seq_printf(s, "\" \nLocal Copy, Master is node %d\n",
97 res->res_nodeid);
98 else if (res->res_nodeid == 0)
99 seq_printf(s, "\" \nMaster Copy\n");
100 else if (res->res_nodeid == -1)
101 seq_printf(s, "\" \nLooking up master (lkid %x)\n",
102 res->res_first_lkid);
103 else
104 seq_printf(s, "\" \nInvalid master %d\n", res->res_nodeid);
105
106 /* Print the LVB: */
107 if (res->res_lvbptr) {
108 seq_printf(s, "LVB: ");
109 for (i = 0; i < lvblen; i++) {
110 if (i == lvblen / 2)
111 seq_printf(s, "\n ");
112 seq_printf(s, "%02x ",
113 (unsigned char) res->res_lvbptr[i]);
114 }
115 if (rsb_flag(res, RSB_VALNOTVALID))
116 seq_printf(s, " (INVALID)");
117 seq_printf(s, "\n");
118 }
119
120 root_list = !list_empty(&res->res_root_list);
121 recover_list = !list_empty(&res->res_recover_list);
122
123 if (root_list || recover_list) {
124 seq_printf(s, "Recovery: root %d recover %d flags %lx "
125 "count %d\n", root_list, recover_list,
126 res->res_flags, res->res_recover_locks_count);
127 }
128
129 /* Print the locks attached to this resource */
130 seq_printf(s, "Granted Queue\n");
131 list_for_each_entry(lkb, &res->res_grantqueue, lkb_statequeue)
132 print_lock(s, lkb, res);
133
134 seq_printf(s, "Conversion Queue\n");
135 list_for_each_entry(lkb, &res->res_convertqueue, lkb_statequeue)
136 print_lock(s, lkb, res);
137
138 seq_printf(s, "Waiting Queue\n");
139 list_for_each_entry(lkb, &res->res_waitqueue, lkb_statequeue)
140 print_lock(s, lkb, res);
141
142 if (list_empty(&res->res_lookup))
143 goto out;
144
145 seq_printf(s, "Lookup Queue\n");
146 list_for_each_entry(lkb, &res->res_lookup, lkb_rsb_lookup) {
147 seq_printf(s, "%08x %s", lkb->lkb_id,
148 print_lockmode(lkb->lkb_rqmode));
149 if (lkb->lkb_wait_type)
150 seq_printf(s, " wait_type: %d", lkb->lkb_wait_type);
151 seq_printf(s, "\n");
152 }
153 out:
154 return 0;
155}
156
157static int rsb_iter_next(struct rsb_iter *ri)
158{
159 struct dlm_ls *ls = ri->ls;
160 int i;
161
162 if (!ri->next) {
163 top:
164 /* Find the next non-empty hash bucket */
165 for (i = ri->entry; i < ls->ls_rsbtbl_size; i++) {
166 read_lock(&ls->ls_rsbtbl[i].lock);
167 if (!list_empty(&ls->ls_rsbtbl[i].list)) {
168 ri->next = ls->ls_rsbtbl[i].list.next;
169 read_unlock(&ls->ls_rsbtbl[i].lock);
170 break;
171 }
172 read_unlock(&ls->ls_rsbtbl[i].lock);
173 }
174 ri->entry = i;
175
176 if (ri->entry >= ls->ls_rsbtbl_size)
177 return 1;
178 } else {
179 i = ri->entry;
180 read_lock(&ls->ls_rsbtbl[i].lock);
181 ri->next = ri->next->next;
182 if (ri->next->next == ls->ls_rsbtbl[i].list.next) {
183 /* End of list - move to next bucket */
184 ri->next = NULL;
185 ri->entry++;
186 read_unlock(&ls->ls_rsbtbl[i].lock);
187 goto top;
188 }
189 read_unlock(&ls->ls_rsbtbl[i].lock);
190 }
191 ri->rsb = list_entry(ri->next, struct dlm_rsb, res_hashchain);
192
193 return 0;
194}
195
/* Release an iterator obtained from rsb_iter_init(). */
static void rsb_iter_free(struct rsb_iter *ri)
{
	kfree(ri);
}
200
201static struct rsb_iter *rsb_iter_init(struct dlm_ls *ls)
202{
203 struct rsb_iter *ri;
204
205 ri = kmalloc(sizeof *ri, GFP_KERNEL);
206 if (!ri)
207 return NULL;
208
209 ri->ls = ls;
210 ri->entry = 0;
211 ri->next = NULL;
212
213 if (rsb_iter_next(ri)) {
214 rsb_iter_free(ri);
215 return NULL;
216 }
217
218 return ri;
219}
220
221static void *rsb_seq_start(struct seq_file *file, loff_t *pos)
222{
223 struct rsb_iter *ri;
224 loff_t n = *pos;
225
226 ri = rsb_iter_init(file->private);
227 if (!ri)
228 return NULL;
229
230 while (n--) {
231 if (rsb_iter_next(ri)) {
232 rsb_iter_free(ri);
233 return NULL;
234 }
235 }
236
237 return ri;
238}
239
240static void *rsb_seq_next(struct seq_file *file, void *iter_ptr, loff_t *pos)
241{
242 struct rsb_iter *ri = iter_ptr;
243
244 (*pos)++;
245
246 if (rsb_iter_next(ri)) {
247 rsb_iter_free(ri);
248 return NULL;
249 }
250
251 return ri;
252}
253
/*
 * seq_file stop: release the iterator for this pass.
 *
 * rsb_seq_start() allocates a new iterator on every pass, and it is only
 * freed elsewhere when the end of the table is reached (in which case NULL
 * is handed back to seq_file).  When a pass ends early with a live iterator
 * it arrives here and must be freed, otherwise it is leaked.
 * iter_ptr may be NULL; kfree(NULL) is a no-op.
 */
static void rsb_seq_stop(struct seq_file *file, void *iter_ptr)
{
	rsb_iter_free(iter_ptr);
}
258
259static int rsb_seq_show(struct seq_file *file, void *iter_ptr)
260{
261 struct rsb_iter *ri = iter_ptr;
262
263 print_resource(ri->rsb, file);
264
265 return 0;
266}
267
268static struct seq_operations rsb_seq_ops = {
269 .start = rsb_seq_start,
270 .next = rsb_seq_next,
271 .stop = rsb_seq_stop,
272 .show = rsb_seq_show,
273};
274
275static int rsb_open(struct inode *inode, struct file *file)
276{
277 struct seq_file *seq;
278 int ret;
279
280 ret = seq_open(file, &rsb_seq_ops);
281 if (ret)
282 return ret;
283
284 seq = file->private_data;
285 seq->private = inode->u.generic_ip;
286
287 return 0;
288}
289
290static struct file_operations rsb_fops = {
291 .owner = THIS_MODULE,
292 .open = rsb_open,
293 .read = seq_read,
294 .llseek = seq_lseek,
295 .release = seq_release
296};
297
298/*
299 * dump lkb's on the ls_waiters list
300 */
301
302static int waiters_open(struct inode *inode, struct file *file)
303{
304 file->private_data = inode->u.generic_ip;
305 return 0;
306}
307
308static ssize_t waiters_read(struct file *file, char __user *userbuf,
309 size_t count, loff_t *ppos)
310{
311 struct dlm_ls *ls = file->private_data;
312 struct dlm_lkb *lkb;
313 size_t len = DLM_DEBUG_BUF_LEN, pos = 0, rv;
314
315 mutex_lock(&debug_buf_lock);
316 mutex_lock(&ls->ls_waiters_mutex);
317 memset(debug_buf, 0, sizeof(debug_buf));
318
319 list_for_each_entry(lkb, &ls->ls_waiters, lkb_wait_reply) {
320 pos += snprintf(debug_buf + pos, len - pos, "%x %d %d %s\n",
321 lkb->lkb_id, lkb->lkb_wait_type,
322 lkb->lkb_nodeid, lkb->lkb_resource->res_name);
323 }
324 mutex_unlock(&ls->ls_waiters_mutex);
325
326 rv = simple_read_from_buffer(userbuf, count, ppos, debug_buf, pos);
327 mutex_unlock(&debug_buf_lock);
328 return rv;
329}
330
331static struct file_operations waiters_fops = {
332 .owner = THIS_MODULE,
333 .open = waiters_open,
334 .read = waiters_read
335};
336
337int dlm_create_debug_file(struct dlm_ls *ls)
338{
339 char name[DLM_LOCKSPACE_LEN+8];
340
341 ls->ls_debug_rsb_dentry = debugfs_create_file(ls->ls_name,
342 S_IFREG | S_IRUGO,
343 dlm_root,
344 ls,
345 &rsb_fops);
346 if (!ls->ls_debug_rsb_dentry)
347 return -ENOMEM;
348
349 memset(name, 0, sizeof(name));
350 snprintf(name, DLM_LOCKSPACE_LEN+8, "%s_waiters", ls->ls_name);
351
352 ls->ls_debug_waiters_dentry = debugfs_create_file(name,
353 S_IFREG | S_IRUGO,
354 dlm_root,
355 ls,
356 &waiters_fops);
357 if (!ls->ls_debug_waiters_dentry) {
358 debugfs_remove(ls->ls_debug_rsb_dentry);
359 return -ENOMEM;
360 }
361
362 return 0;
363}
364
365void dlm_delete_debug_file(struct dlm_ls *ls)
366{
367 if (ls->ls_debug_rsb_dentry)
368 debugfs_remove(ls->ls_debug_rsb_dentry);
369 if (ls->ls_debug_waiters_dentry)
370 debugfs_remove(ls->ls_debug_waiters_dentry);
371}
372
373int dlm_register_debugfs(void)
374{
375 mutex_init(&debug_buf_lock);
376 dlm_root = debugfs_create_dir("dlm", NULL);
377 return dlm_root ? 0 : -ENOMEM;
378}
379
380void dlm_unregister_debugfs(void)
381{
382 debugfs_remove(dlm_root);
383}
384
diff --git a/fs/dlm/dir.c b/fs/dlm/dir.c
new file mode 100644
index 000000000000..46754553fdcc
--- /dev/null
+++ b/fs/dlm/dir.c
@@ -0,0 +1,423 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#include "dlm_internal.h"
15#include "lockspace.h"
16#include "member.h"
17#include "lowcomms.h"
18#include "rcom.h"
19#include "config.h"
20#include "memory.h"
21#include "recover.h"
22#include "util.h"
23#include "lock.h"
24#include "dir.h"
25
26
27static void put_free_de(struct dlm_ls *ls, struct dlm_direntry *de)
28{
29 spin_lock(&ls->ls_recover_list_lock);
30 list_add(&de->list, &ls->ls_recover_list);
31 spin_unlock(&ls->ls_recover_list_lock);
32}
33
34static struct dlm_direntry *get_free_de(struct dlm_ls *ls, int len)
35{
36 int found = 0;
37 struct dlm_direntry *de;
38
39 spin_lock(&ls->ls_recover_list_lock);
40 list_for_each_entry(de, &ls->ls_recover_list, list) {
41 if (de->length == len) {
42 list_del(&de->list);
43 de->master_nodeid = 0;
44 memset(de->name, 0, len);
45 found = 1;
46 break;
47 }
48 }
49 spin_unlock(&ls->ls_recover_list_lock);
50
51 if (!found)
52 de = allocate_direntry(ls, len);
53 return de;
54}
55
56void dlm_clear_free_entries(struct dlm_ls *ls)
57{
58 struct dlm_direntry *de;
59
60 spin_lock(&ls->ls_recover_list_lock);
61 while (!list_empty(&ls->ls_recover_list)) {
62 de = list_entry(ls->ls_recover_list.next, struct dlm_direntry,
63 list);
64 list_del(&de->list);
65 free_direntry(de);
66 }
67 spin_unlock(&ls->ls_recover_list_lock);
68}
69
70/*
71 * We use the upper 16 bits of the hash value to select the directory node.
72 * Low bits are used for distribution of rsb's among hash buckets on each node.
73 *
74 * To give the exact range wanted (0 to num_nodes-1), we apply a modulus of
75 * num_nodes to the hash value. This value in the desired range is used as an
76 * offset into the sorted list of nodeid's to give the particular nodeid.
77 */
78
79int dlm_hash2nodeid(struct dlm_ls *ls, uint32_t hash)
80{
81 struct list_head *tmp;
82 struct dlm_member *memb = NULL;
83 uint32_t node, n = 0;
84 int nodeid;
85
86 if (ls->ls_num_nodes == 1) {
87 nodeid = dlm_our_nodeid();
88 goto out;
89 }
90
91 if (ls->ls_node_array) {
92 node = (hash >> 16) % ls->ls_total_weight;
93 nodeid = ls->ls_node_array[node];
94 goto out;
95 }
96
97 /* make_member_array() failed to kmalloc ls_node_array... */
98
99 node = (hash >> 16) % ls->ls_num_nodes;
100
101 list_for_each(tmp, &ls->ls_nodes) {
102 if (n++ != node)
103 continue;
104 memb = list_entry(tmp, struct dlm_member, list);
105 break;
106 }
107
108 DLM_ASSERT(memb , printk("num_nodes=%u n=%u node=%u\n",
109 ls->ls_num_nodes, n, node););
110 nodeid = memb->nodeid;
111 out:
112 return nodeid;
113}
114
115int dlm_dir_nodeid(struct dlm_rsb *r)
116{
117 return dlm_hash2nodeid(r->res_ls, r->res_hash);
118}
119
120static inline uint32_t dir_hash(struct dlm_ls *ls, char *name, int len)
121{
122 uint32_t val;
123
124 val = jhash(name, len, 0);
125 val &= (ls->ls_dirtbl_size - 1);
126
127 return val;
128}
129
130static void add_entry_to_hash(struct dlm_ls *ls, struct dlm_direntry *de)
131{
132 uint32_t bucket;
133
134 bucket = dir_hash(ls, de->name, de->length);
135 list_add_tail(&de->list, &ls->ls_dirtbl[bucket].list);
136}
137
138static struct dlm_direntry *search_bucket(struct dlm_ls *ls, char *name,
139 int namelen, uint32_t bucket)
140{
141 struct dlm_direntry *de;
142
143 list_for_each_entry(de, &ls->ls_dirtbl[bucket].list, list) {
144 if (de->length == namelen && !memcmp(name, de->name, namelen))
145 goto out;
146 }
147 de = NULL;
148 out:
149 return de;
150}
151
152void dlm_dir_remove_entry(struct dlm_ls *ls, int nodeid, char *name, int namelen)
153{
154 struct dlm_direntry *de;
155 uint32_t bucket;
156
157 bucket = dir_hash(ls, name, namelen);
158
159 write_lock(&ls->ls_dirtbl[bucket].lock);
160
161 de = search_bucket(ls, name, namelen, bucket);
162
163 if (!de) {
164 log_error(ls, "remove fr %u none", nodeid);
165 goto out;
166 }
167
168 if (de->master_nodeid != nodeid) {
169 log_error(ls, "remove fr %u ID %u", nodeid, de->master_nodeid);
170 goto out;
171 }
172
173 list_del(&de->list);
174 free_direntry(de);
175 out:
176 write_unlock(&ls->ls_dirtbl[bucket].lock);
177}
178
179void dlm_dir_clear(struct dlm_ls *ls)
180{
181 struct list_head *head;
182 struct dlm_direntry *de;
183 int i;
184
185 DLM_ASSERT(list_empty(&ls->ls_recover_list), );
186
187 for (i = 0; i < ls->ls_dirtbl_size; i++) {
188 write_lock(&ls->ls_dirtbl[i].lock);
189 head = &ls->ls_dirtbl[i].list;
190 while (!list_empty(head)) {
191 de = list_entry(head->next, struct dlm_direntry, list);
192 list_del(&de->list);
193 put_free_de(ls, de);
194 }
195 write_unlock(&ls->ls_dirtbl[i].lock);
196 }
197}
198
199int dlm_recover_directory(struct dlm_ls *ls)
200{
201 struct dlm_member *memb;
202 struct dlm_direntry *de;
203 char *b, *last_name = NULL;
204 int error = -ENOMEM, last_len, count = 0;
205 uint16_t namelen;
206
207 log_debug(ls, "dlm_recover_directory");
208
209 if (dlm_no_directory(ls))
210 goto out_status;
211
212 dlm_dir_clear(ls);
213
214 last_name = kmalloc(DLM_RESNAME_MAXLEN, GFP_KERNEL);
215 if (!last_name)
216 goto out;
217
218 list_for_each_entry(memb, &ls->ls_nodes, list) {
219 memset(last_name, 0, DLM_RESNAME_MAXLEN);
220 last_len = 0;
221
222 for (;;) {
223 error = dlm_recovery_stopped(ls);
224 if (error)
225 goto out_free;
226
227 error = dlm_rcom_names(ls, memb->nodeid,
228 last_name, last_len);
229 if (error)
230 goto out_free;
231
232 schedule();
233
234 /*
235 * pick namelen/name pairs out of received buffer
236 */
237
238 b = ls->ls_recover_buf + sizeof(struct dlm_rcom);
239
240 for (;;) {
241 memcpy(&namelen, b, sizeof(uint16_t));
242 namelen = be16_to_cpu(namelen);
243 b += sizeof(uint16_t);
244
245 /* namelen of 0xFFFFF marks end of names for
246 this node; namelen of 0 marks end of the
247 buffer */
248
249 if (namelen == 0xFFFF)
250 goto done;
251 if (!namelen)
252 break;
253
254 error = -ENOMEM;
255 de = get_free_de(ls, namelen);
256 if (!de)
257 goto out_free;
258
259 de->master_nodeid = memb->nodeid;
260 de->length = namelen;
261 last_len = namelen;
262 memcpy(de->name, b, namelen);
263 memcpy(last_name, b, namelen);
264 b += namelen;
265
266 add_entry_to_hash(ls, de);
267 count++;
268 }
269 }
270 done:
271 ;
272 }
273
274 out_status:
275 error = 0;
276 dlm_set_recover_status(ls, DLM_RS_DIR);
277 log_debug(ls, "dlm_recover_directory %d entries", count);
278 out_free:
279 kfree(last_name);
280 out:
281 dlm_clear_free_entries(ls);
282 return error;
283}
284
285static int get_entry(struct dlm_ls *ls, int nodeid, char *name,
286 int namelen, int *r_nodeid)
287{
288 struct dlm_direntry *de, *tmp;
289 uint32_t bucket;
290
291 bucket = dir_hash(ls, name, namelen);
292
293 write_lock(&ls->ls_dirtbl[bucket].lock);
294 de = search_bucket(ls, name, namelen, bucket);
295 if (de) {
296 *r_nodeid = de->master_nodeid;
297 write_unlock(&ls->ls_dirtbl[bucket].lock);
298 if (*r_nodeid == nodeid)
299 return -EEXIST;
300 return 0;
301 }
302
303 write_unlock(&ls->ls_dirtbl[bucket].lock);
304
305 de = allocate_direntry(ls, namelen);
306 if (!de)
307 return -ENOMEM;
308
309 de->master_nodeid = nodeid;
310 de->length = namelen;
311 memcpy(de->name, name, namelen);
312
313 write_lock(&ls->ls_dirtbl[bucket].lock);
314 tmp = search_bucket(ls, name, namelen, bucket);
315 if (tmp) {
316 free_direntry(de);
317 de = tmp;
318 } else {
319 list_add_tail(&de->list, &ls->ls_dirtbl[bucket].list);
320 }
321 *r_nodeid = de->master_nodeid;
322 write_unlock(&ls->ls_dirtbl[bucket].lock);
323 return 0;
324}
325
/*
 * Public directory lookup: thin wrapper around get_entry(), with the same
 * arguments and return values.
 */
int dlm_dir_lookup(struct dlm_ls *ls, int nodeid, char *name, int namelen,
		   int *r_nodeid)
{
	int rv = get_entry(ls, nodeid, name, namelen, r_nodeid);

	return rv;
}
331
332/* Copy the names of master rsb's into the buffer provided.
333 Only select names whose dir node is the given nodeid. */
334
335void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen,
336 char *outbuf, int outlen, int nodeid)
337{
338 struct list_head *list;
339 struct dlm_rsb *start_r = NULL, *r = NULL;
340 int offset = 0, start_namelen, error, dir_nodeid;
341 char *start_name;
342 uint16_t be_namelen;
343
344 /*
345 * Find the rsb where we left off (or start again)
346 */
347
348 start_namelen = inlen;
349 start_name = inbuf;
350
351 if (start_namelen > 1) {
352 /*
353 * We could also use a find_rsb_root() function here that
354 * searched the ls_root_list.
355 */
356 error = dlm_find_rsb(ls, start_name, start_namelen, R_MASTER,
357 &start_r);
358 DLM_ASSERT(!error && start_r,
359 printk("error %d\n", error););
360 DLM_ASSERT(!list_empty(&start_r->res_root_list),
361 dlm_print_rsb(start_r););
362 dlm_put_rsb(start_r);
363 }
364
365 /*
366 * Send rsb names for rsb's we're master of and whose directory node
367 * matches the requesting node.
368 */
369
370 down_read(&ls->ls_root_sem);
371 if (start_r)
372 list = start_r->res_root_list.next;
373 else
374 list = ls->ls_root_list.next;
375
376 for (offset = 0; list != &ls->ls_root_list; list = list->next) {
377 r = list_entry(list, struct dlm_rsb, res_root_list);
378 if (r->res_nodeid)
379 continue;
380
381 dir_nodeid = dlm_dir_nodeid(r);
382 if (dir_nodeid != nodeid)
383 continue;
384
385 /*
386 * The block ends when we can't fit the following in the
387 * remaining buffer space:
388 * namelen (uint16_t) +
389 * name (r->res_length) +
390 * end-of-block record 0x0000 (uint16_t)
391 */
392
393 if (offset + sizeof(uint16_t)*2 + r->res_length > outlen) {
394 /* Write end-of-block record */
395 be_namelen = 0;
396 memcpy(outbuf + offset, &be_namelen, sizeof(uint16_t));
397 offset += sizeof(uint16_t);
398 goto out;
399 }
400
401 be_namelen = cpu_to_be16(r->res_length);
402 memcpy(outbuf + offset, &be_namelen, sizeof(uint16_t));
403 offset += sizeof(uint16_t);
404 memcpy(outbuf + offset, r->res_name, r->res_length);
405 offset += r->res_length;
406 }
407
408 /*
409 * If we've reached the end of the list (and there's room) write a
410 * terminating record.
411 */
412
413 if ((list == &ls->ls_root_list) &&
414 (offset + sizeof(uint16_t) <= outlen)) {
415 be_namelen = 0xFFFF;
416 memcpy(outbuf + offset, &be_namelen, sizeof(uint16_t));
417 offset += sizeof(uint16_t);
418 }
419
420 out:
421 up_read(&ls->ls_root_sem);
422}
423
diff --git a/fs/dlm/dir.h b/fs/dlm/dir.h
new file mode 100644
index 000000000000..0b0eb1267b6e
--- /dev/null
+++ b/fs/dlm/dir.h
@@ -0,0 +1,30 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#ifndef __DIR_DOT_H__
15#define __DIR_DOT_H__
16
17
18int dlm_dir_nodeid(struct dlm_rsb *rsb);
19int dlm_hash2nodeid(struct dlm_ls *ls, uint32_t hash);
20void dlm_dir_remove_entry(struct dlm_ls *ls, int nodeid, char *name, int len);
21void dlm_dir_clear(struct dlm_ls *ls);
22void dlm_clear_free_entries(struct dlm_ls *ls);
23int dlm_recover_directory(struct dlm_ls *ls);
24int dlm_dir_lookup(struct dlm_ls *ls, int nodeid, char *name, int namelen,
25 int *r_nodeid);
26void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen,
27 char *outbuf, int outlen, int nodeid);
28
29#endif /* __DIR_DOT_H__ */
30
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
new file mode 100644
index 000000000000..7c3c2d27c012
--- /dev/null
+++ b/fs/dlm/dlm_internal.h
@@ -0,0 +1,539 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#ifndef __DLM_INTERNAL_DOT_H__
15#define __DLM_INTERNAL_DOT_H__
16
17/*
18 * This is the main header file to be included in each DLM source file.
19 */
20
21#include <linux/module.h>
22#include <linux/slab.h>
23#include <linux/sched.h>
24#include <linux/types.h>
25#include <linux/ctype.h>
26#include <linux/spinlock.h>
27#include <linux/vmalloc.h>
28#include <linux/list.h>
29#include <linux/errno.h>
30#include <linux/random.h>
31#include <linux/delay.h>
32#include <linux/socket.h>
33#include <linux/kthread.h>
34#include <linux/kobject.h>
35#include <linux/kref.h>
36#include <linux/kernel.h>
37#include <linux/jhash.h>
38#include <linux/miscdevice.h>
39#include <linux/mutex.h>
40#include <asm/semaphore.h>
41#include <asm/uaccess.h>
42
43#include <linux/dlm.h>
44
45#define DLM_LOCKSPACE_LEN 64
46
47/* Size of the temp buffer midcomms allocates on the stack.
48 We try to make this large enough so most messages fit.
49 FIXME: should sctp make this unnecessary? */
50
51#define DLM_INBUF_LEN 148
52
53struct dlm_ls;
54struct dlm_lkb;
55struct dlm_rsb;
56struct dlm_member;
57struct dlm_lkbtable;
58struct dlm_rsbtable;
59struct dlm_dirtable;
60struct dlm_direntry;
61struct dlm_recover;
62struct dlm_header;
63struct dlm_message;
64struct dlm_rcom;
65struct dlm_mhandle;
66
/*
 * Logging helpers.  Every message is emitted at KERN_ERR, prefixed with
 * "dlm: " and terminated with a newline; log_error() additionally prints
 * the lockspace name taken from (ls)->ls_name.
 */
#define log_print(fmt, args...) \
	printk(KERN_ERR "dlm: "fmt"\n" , ##args)
#define log_error(ls, fmt, args...) \
	printk(KERN_ERR "dlm: %s: " fmt "\n", (ls)->ls_name , ##args)

/*
 * DLM_LOG_DEBUG is defined unconditionally here, so log_debug() always
 * expands to log_error().  Without the define, log_debug() expands to
 * nothing and its arguments are not evaluated.
 */
#define DLM_LOG_DEBUG
#ifdef DLM_LOG_DEBUG
#define log_debug(ls, fmt, args...) log_error(ls, fmt, ##args)
#else
#define log_debug(ls, fmt, args...)
#endif
78
79#define DLM_ASSERT(x, do) \
80{ \
81 if (!(x)) \
82 { \
83 printk(KERN_ERR "\nDLM: Assertion failed on line %d of file %s\n" \
84 "DLM: assertion: \"%s\"\n" \
85 "DLM: time = %lu\n", \
86 __LINE__, __FILE__, #x, jiffies); \
87 {do} \
88 printk("\n"); \
89 BUG(); \
90 panic("DLM: Record message above and reboot.\n"); \
91 } \
92}
93
94
95struct dlm_direntry {
96 struct list_head list;
97 uint32_t master_nodeid;
98 uint16_t length;
99 char name[1];
100};
101
102struct dlm_dirtable {
103 struct list_head list;
104 rwlock_t lock;
105};
106
/*
 * One bucket of the lockspace rsb hash table (ls_rsbtbl[]).  lock.c picks
 * the bucket as jhash(name) & (ls_rsbtbl_size - 1).
 */
struct dlm_rsbtable {
	struct list_head list;	/* rsbs in use, linked via res_hashchain */
	struct list_head toss;	/* unreferenced rsbs parked by toss_rsb() */
	rwlock_t lock;		/* lock.c takes this for writing around
				   searches, insertions and put_rsb() */
};
112
/*
 * One bucket of the lockspace lkb table (ls_lkbtbl[]).  The low 16 bits of
 * a lock id select the bucket.
 */
struct dlm_lkbtable {
	struct list_head list;	/* lkbs in this bucket, via lkb_idtbl_list */
	rwlock_t lock;		/* guards list and counter in lock.c */
	uint16_t counter;	/* supplies the upper 16 bits of new lock
				   ids in create_lkb(); may wrap around */
};
118
119/*
120 * Lockspace member (per node in a ls)
121 */
122
123struct dlm_member {
124 struct list_head list;
125 int nodeid;
126 int weight;
127};
128
129/*
130 * Save and manage recovery state for a lockspace.
131 */
132
133struct dlm_recover {
134 struct list_head list;
135 int *nodeids;
136 int node_count;
137 uint64_t seq;
138};
139
140/*
141 * Pass input args to second stage locking function.
142 */
143
144struct dlm_args {
145 uint32_t flags;
146 void *astaddr;
147 long astparam;
148 void *bastaddr;
149 int mode;
150 struct dlm_lksb *lksb;
151};
152
153
154/*
155 * Lock block
156 *
157 * A lock can be one of three types:
158 *
159 * local copy lock is mastered locally
160 * (lkb_nodeid is zero and DLM_LKF_MSTCPY is not set)
161 * process copy lock is mastered on a remote node
162 * (lkb_nodeid is non-zero and DLM_LKF_MSTCPY is not set)
163 * master copy master node's copy of a lock owned by remote node
164 * (lkb_nodeid is non-zero and DLM_LKF_MSTCPY is set)
165 *
166 * lkb_exflags: a copy of the most recent flags arg provided to dlm_lock or
167 * dlm_unlock. The dlm does not modify these or use any private flags in
168 * this field; it only contains DLM_LKF_ flags from dlm.h. These flags
169 * are sent as-is to the remote master when the lock is remote.
170 *
171 * lkb_flags: internal dlm flags (DLM_IFL_ prefix) from dlm_internal.h.
172 * Some internal flags are shared between the master and process nodes;
173 * these shared flags are kept in the lower two bytes. One of these
174 * flags set on the master copy will be propagated to the process copy
175 * and v.v. Other internal flags are private to the master or process
176 * node (e.g. DLM_IFL_MSTCPY). These are kept in the high two bytes.
177 *
178 * lkb_sbflags: status block flags. These flags are copied directly into
179 * the caller's lksb.sb_flags prior to the dlm_lock/dlm_unlock completion
180 * ast. All defined in dlm.h with DLM_SBF_ prefix.
181 *
182 * lkb_status: the lock status indicates which rsb queue the lock is
183 * on, grant, convert, or wait. DLM_LKSTS_ WAITING/GRANTED/CONVERT
184 *
185 * lkb_wait_type: the dlm message type (DLM_MSG_ prefix) for which a
186 * reply is needed. Only set when the lkb is on the lockspace waiters
187 * list awaiting a reply from a remote node.
188 *
189 * lkb_nodeid: when the lkb is a local copy, nodeid is 0; when the lkb
190 * is a master copy, nodeid specifies the remote lock holder, when the
191 * lkb is a process copy, the nodeid specifies the lock master.
192 */
193
194/* lkb_ast_type */
195
196#define AST_COMP 1
197#define AST_BAST 2
198
199/* lkb_status */
200
201#define DLM_LKSTS_WAITING 1
202#define DLM_LKSTS_GRANTED 2
203#define DLM_LKSTS_CONVERT 3
204
205/* lkb_flags */
206
207#define DLM_IFL_MSTCPY 0x00010000
208#define DLM_IFL_RESEND 0x00020000
209#define DLM_IFL_DEAD 0x00040000
210#define DLM_IFL_USER 0x00000001
211#define DLM_IFL_ORPHAN 0x00000002
212
213struct dlm_lkb {
214 struct dlm_rsb *lkb_resource; /* the rsb */
215 struct kref lkb_ref;
216 int lkb_nodeid; /* copied from rsb */
217 int lkb_ownpid; /* pid of lock owner */
218 uint32_t lkb_id; /* our lock ID */
219 uint32_t lkb_remid; /* lock ID on remote partner */
220 uint32_t lkb_exflags; /* external flags from caller */
221 uint32_t lkb_sbflags; /* lksb flags */
222 uint32_t lkb_flags; /* internal flags */
223 uint32_t lkb_lvbseq; /* lvb sequence number */
224
225 int8_t lkb_status; /* granted, waiting, convert */
226 int8_t lkb_rqmode; /* requested lock mode */
227 int8_t lkb_grmode; /* granted lock mode */
228 int8_t lkb_bastmode; /* requested mode */
229 int8_t lkb_highbast; /* highest mode bast sent for */
230
231 int8_t lkb_wait_type; /* type of reply waiting for */
232 int8_t lkb_ast_type; /* type of ast queued for */
233
234 struct list_head lkb_idtbl_list; /* lockspace lkbtbl */
235 struct list_head lkb_statequeue; /* rsb g/c/w list */
236 struct list_head lkb_rsb_lookup; /* waiting for rsb lookup */
237 struct list_head lkb_wait_reply; /* waiting for remote reply */
238 struct list_head lkb_astqueue; /* need ast to be sent */
239 struct list_head lkb_ownqueue; /* list of locks for a process */
240
241 char *lkb_lvbptr;
242 struct dlm_lksb *lkb_lksb; /* caller's status block */
243 void *lkb_astaddr; /* caller's ast function */
244 void *lkb_bastaddr; /* caller's bast function */
245 long lkb_astparam; /* caller's ast arg */
246};
247
248
/*
 * Resource block (rsb): one per named lock resource in a lockspace.  The
 * resource name is stored inline at the end of the structure.
 */
struct dlm_rsb {
	struct dlm_ls *res_ls; /* the lockspace */
	struct kref res_ref;
	struct mutex res_mutex;
	unsigned long res_flags; /* bit numbers come from enum rsb_flags */
	int res_length; /* length of rsb name */
	int res_nodeid; /* -1 when first created by find_rsb(), 0 when
			   mastered locally; is_remote() treats any
			   non-zero value as a remote master */
	uint32_t res_lvbseq;
	uint32_t res_hash; /* jhash() of the name, set by find_rsb() */
	uint32_t res_bucket; /* rsbtbl */
	unsigned long res_toss_time; /* jiffies when moved to the toss list */
	uint32_t res_first_lkid;
	struct list_head res_lookup; /* lkbs waiting on first */
	struct list_head res_hashchain; /* rsbtbl */
	struct list_head res_grantqueue;
	struct list_head res_convertqueue;
	struct list_head res_waitqueue;

	struct list_head res_root_list; /* used for recovery */
	struct list_head res_recover_list; /* used for recovery */
	int res_recover_locks_count;

	char *res_lvbptr; /* freed and cleared when the rsb is tossed */
	char res_name[1]; /* res_length bytes copied in by create_rsb();
			     no terminator is written there */
};
274
275/* find_rsb() flags */
276
277#define R_MASTER 1 /* only return rsb if it's a master */
278#define R_CREATE 2 /* create/add rsb if not found */
279
280/* rsb_flags */
281
282enum rsb_flags {
283 RSB_MASTER_UNCERTAIN,
284 RSB_VALNOTVALID,
285 RSB_VALNOTVALID_PREV,
286 RSB_NEW_MASTER,
287 RSB_NEW_MASTER2,
288 RSB_RECOVER_CONVERT,
289 RSB_LOCKS_PURGED,
290};
291
292static inline void rsb_set_flag(struct dlm_rsb *r, enum rsb_flags flag)
293{
294 __set_bit(flag, &r->res_flags);
295}
296
297static inline void rsb_clear_flag(struct dlm_rsb *r, enum rsb_flags flag)
298{
299 __clear_bit(flag, &r->res_flags);
300}
301
302static inline int rsb_flag(struct dlm_rsb *r, enum rsb_flags flag)
303{
304 return test_bit(flag, &r->res_flags);
305}
306
307
308/* dlm_header is first element of all structs sent between nodes */
309
310#define DLM_HEADER_MAJOR 0x00020000
311#define DLM_HEADER_MINOR 0x00000001
312
313#define DLM_MSG 1
314#define DLM_RCOM 2
315
316struct dlm_header {
317 uint32_t h_version;
318 uint32_t h_lockspace;
319 uint32_t h_nodeid; /* nodeid of sender */
320 uint16_t h_length;
321 uint8_t h_cmd; /* DLM_MSG, DLM_RCOM */
322 uint8_t h_pad;
323};
324
325
326#define DLM_MSG_REQUEST 1
327#define DLM_MSG_CONVERT 2
328#define DLM_MSG_UNLOCK 3
329#define DLM_MSG_CANCEL 4
330#define DLM_MSG_REQUEST_REPLY 5
331#define DLM_MSG_CONVERT_REPLY 6
332#define DLM_MSG_UNLOCK_REPLY 7
333#define DLM_MSG_CANCEL_REPLY 8
334#define DLM_MSG_GRANT 9
335#define DLM_MSG_BAST 10
336#define DLM_MSG_LOOKUP 11
337#define DLM_MSG_REMOVE 12
338#define DLM_MSG_LOOKUP_REPLY 13
339
340struct dlm_message {
341 struct dlm_header m_header;
342 uint32_t m_type; /* DLM_MSG_ */
343 uint32_t m_nodeid;
344 uint32_t m_pid;
345 uint32_t m_lkid; /* lkid on sender */
346 uint32_t m_remid; /* lkid on receiver */
347 uint32_t m_parent_lkid;
348 uint32_t m_parent_remid;
349 uint32_t m_exflags;
350 uint32_t m_sbflags;
351 uint32_t m_flags;
352 uint32_t m_lvbseq;
353 uint32_t m_hash;
354 int m_status;
355 int m_grmode;
356 int m_rqmode;
357 int m_bastmode;
358 int m_asts;
359 int m_result; /* 0 or -EXXX */
360 char m_extra[0]; /* name or lvb */
361};
362
363
364#define DLM_RS_NODES 0x00000001
365#define DLM_RS_NODES_ALL 0x00000002
366#define DLM_RS_DIR 0x00000004
367#define DLM_RS_DIR_ALL 0x00000008
368#define DLM_RS_LOCKS 0x00000010
369#define DLM_RS_LOCKS_ALL 0x00000020
370#define DLM_RS_DONE 0x00000040
371#define DLM_RS_DONE_ALL 0x00000080
372
373#define DLM_RCOM_STATUS 1
374#define DLM_RCOM_NAMES 2
375#define DLM_RCOM_LOOKUP 3
376#define DLM_RCOM_LOCK 4
377#define DLM_RCOM_STATUS_REPLY 5
378#define DLM_RCOM_NAMES_REPLY 6
379#define DLM_RCOM_LOOKUP_REPLY 7
380#define DLM_RCOM_LOCK_REPLY 8
381
382struct dlm_rcom {
383 struct dlm_header rc_header;
384 uint32_t rc_type; /* DLM_RCOM_ */
385 int rc_result; /* multi-purpose */
386 uint64_t rc_id; /* match reply with request */
387 char rc_buf[0];
388};
389
390struct rcom_config {
391 uint32_t rf_lvblen;
392 uint32_t rf_lsflags;
393 uint64_t rf_unused;
394};
395
396struct rcom_lock {
397 uint32_t rl_ownpid;
398 uint32_t rl_lkid;
399 uint32_t rl_remid;
400 uint32_t rl_parent_lkid;
401 uint32_t rl_parent_remid;
402 uint32_t rl_exflags;
403 uint32_t rl_flags;
404 uint32_t rl_lvbseq;
405 int rl_result;
406 int8_t rl_rqmode;
407 int8_t rl_grmode;
408 int8_t rl_status;
409 int8_t rl_asts;
410 uint16_t rl_wait_type;
411 uint16_t rl_namelen;
412 char rl_name[DLM_RESNAME_MAXLEN];
413 char rl_lvb[0];
414};
415
416struct dlm_ls {
417 struct list_head ls_list; /* list of lockspaces */
418 dlm_lockspace_t *ls_local_handle;
419 uint32_t ls_global_id; /* global unique lockspace ID */
420 uint32_t ls_exflags;
421 int ls_lvblen;
422 int ls_count; /* reference count */
423 unsigned long ls_flags; /* LSFL_ */
424 struct kobject ls_kobj;
425
426 struct dlm_rsbtable *ls_rsbtbl;
427 uint32_t ls_rsbtbl_size;
428
429 struct dlm_lkbtable *ls_lkbtbl;
430 uint32_t ls_lkbtbl_size;
431
432 struct dlm_dirtable *ls_dirtbl;
433 uint32_t ls_dirtbl_size;
434
435 struct mutex ls_waiters_mutex;
436 struct list_head ls_waiters; /* lkbs needing a reply */
437
438 struct list_head ls_nodes; /* current nodes in ls */
439 struct list_head ls_nodes_gone; /* dead node list, recovery */
440 int ls_num_nodes; /* number of nodes in ls */
441 int ls_low_nodeid;
442 int ls_total_weight;
443 int *ls_node_array;
444
445 struct dlm_rsb ls_stub_rsb; /* for returning errors */
446 struct dlm_lkb ls_stub_lkb; /* for returning errors */
447 struct dlm_message ls_stub_ms; /* for faking a reply */
448
449 struct dentry *ls_debug_rsb_dentry; /* debugfs */
450 struct dentry *ls_debug_waiters_dentry; /* debugfs */
451
452 wait_queue_head_t ls_uevent_wait; /* user part of join/leave */
453 int ls_uevent_result;
454
455 struct miscdevice ls_device;
456
457 /* recovery related */
458
459 struct timer_list ls_timer;
460 struct task_struct *ls_recoverd_task;
461 struct mutex ls_recoverd_active;
462 spinlock_t ls_recover_lock;
463 uint32_t ls_recover_status; /* DLM_RS_ */
464 uint64_t ls_recover_seq;
465 struct dlm_recover *ls_recover_args;
466 struct rw_semaphore ls_in_recovery; /* block local requests */
467 struct list_head ls_requestqueue;/* queue remote requests */
468 struct mutex ls_requestqueue_mutex;
469 char *ls_recover_buf;
470 struct list_head ls_recover_list;
471 spinlock_t ls_recover_list_lock;
472 int ls_recover_list_count;
473 wait_queue_head_t ls_wait_general;
474 struct mutex ls_clear_proc_locks;
475
476 struct list_head ls_root_list; /* root resources */
477 struct rw_semaphore ls_root_sem; /* protect root_list */
478
479 int ls_namelen;
480 char ls_name[1];
481};
482
483#define LSFL_WORK 0
484#define LSFL_RUNNING 1
485#define LSFL_RECOVERY_STOP 2
486#define LSFL_RCOM_READY 3
487#define LSFL_UEVENT_WAIT 4
488
489/* much of this is just saving user space pointers associated with the
490 lock that we pass back to the user lib with an ast */
491
492struct dlm_user_args {
493 struct dlm_user_proc *proc; /* each process that opens the lockspace
494 device has private data
495 (dlm_user_proc) on the struct file,
496 the process's locks point back to it*/
497 struct dlm_lksb lksb;
498 int old_mode;
499 int update_user_lvb;
500 struct dlm_lksb __user *user_lksb;
501 void __user *castparam;
502 void __user *castaddr;
503 void __user *bastparam;
504 void __user *bastaddr;
505};
506
507#define DLM_PROC_FLAGS_CLOSING 1
508#define DLM_PROC_FLAGS_COMPAT 2
509
510/* locks list is kept so we can remove all a process's locks when it
511 exits (or orphan those that are persistent) */
512
513struct dlm_user_proc {
514 dlm_lockspace_t *lockspace;
515 unsigned long flags; /* DLM_PROC_FLAGS */
516 struct list_head asts;
517 spinlock_t asts_spin;
518 struct list_head locks;
519 spinlock_t locks_spin;
520 wait_queue_head_t wait;
521};
522
523static inline int dlm_locking_stopped(struct dlm_ls *ls)
524{
525 return !test_bit(LSFL_RUNNING, &ls->ls_flags);
526}
527
528static inline int dlm_recovery_stopped(struct dlm_ls *ls)
529{
530 return test_bit(LSFL_RECOVERY_STOP, &ls->ls_flags);
531}
532
533static inline int dlm_no_directory(struct dlm_ls *ls)
534{
535 return (ls->ls_exflags & DLM_LSFL_NODIR) ? 1 : 0;
536}
537
538#endif /* __DLM_INTERNAL_DOT_H__ */
539
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
new file mode 100644
index 000000000000..7d38f914c5b9
--- /dev/null
+++ b/fs/dlm/lock.c
@@ -0,0 +1,3845 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13/* Central locking logic has four stages:
14
15 dlm_lock()
16 dlm_unlock()
17
18 request_lock(ls, lkb)
19 convert_lock(ls, lkb)
20 unlock_lock(ls, lkb)
21 cancel_lock(ls, lkb)
22
23 _request_lock(r, lkb)
24 _convert_lock(r, lkb)
25 _unlock_lock(r, lkb)
26 _cancel_lock(r, lkb)
27
28 do_request(r, lkb)
29 do_convert(r, lkb)
30 do_unlock(r, lkb)
31 do_cancel(r, lkb)
32
33 Stage 1 (lock, unlock) is mainly about checking input args and
34 splitting into one of the four main operations:
35
36 dlm_lock = request_lock
37 dlm_lock+CONVERT = convert_lock
38 dlm_unlock = unlock_lock
39 dlm_unlock+CANCEL = cancel_lock
40
41 Stage 2, xxxx_lock(), just finds and locks the relevant rsb which is
42 provided to the next stage.
43
44 Stage 3, _xxxx_lock(), determines if the operation is local or remote.
45 When remote, it calls send_xxxx(), when local it calls do_xxxx().
46
47 Stage 4, do_xxxx(), is the guts of the operation. It manipulates the
48 given rsb and lkb and queues callbacks.
49
50 For remote operations, send_xxxx() results in the corresponding do_xxxx()
51 function being executed on the remote node. The connecting send/receive
52 calls on local (L) and remote (R) nodes:
53
54 L: send_xxxx() -> R: receive_xxxx()
55 R: do_xxxx()
56 L: receive_xxxx_reply() <- R: send_xxxx_reply()
57*/
58#include <linux/types.h>
59#include "dlm_internal.h"
60#include <linux/dlm_device.h>
61#include "memory.h"
62#include "lowcomms.h"
63#include "requestqueue.h"
64#include "util.h"
65#include "dir.h"
66#include "member.h"
67#include "lockspace.h"
68#include "ast.h"
69#include "lock.h"
70#include "rcom.h"
71#include "recover.h"
72#include "lvb_table.h"
73#include "user.h"
74#include "config.h"
75
76static int send_request(struct dlm_rsb *r, struct dlm_lkb *lkb);
77static int send_convert(struct dlm_rsb *r, struct dlm_lkb *lkb);
78static int send_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb);
79static int send_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb);
80static int send_grant(struct dlm_rsb *r, struct dlm_lkb *lkb);
81static int send_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int mode);
82static int send_lookup(struct dlm_rsb *r, struct dlm_lkb *lkb);
83static int send_remove(struct dlm_rsb *r);
84static int _request_lock(struct dlm_rsb *r, struct dlm_lkb *lkb);
85static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
86 struct dlm_message *ms);
87static int receive_extralen(struct dlm_message *ms);
88
89#define FAKE_USER_AST (void*)0xff00ff00
90
/*
 * Lock compatibility matrix - thanks Steve
 * UN = Unlocked state. Not really a state, used as a flag
 * PD = Padding. Used to make the matrix a nice power of two in size
 * Other states are the same as the VMS DLM.
 * Usage: matrix[grmode+1][rqmode+1] (although m[rq+1][gr+1] is the same)
 * An entry of 1 means the two modes are compatible, 0 that they conflict.
 */

static const int __dlm_compat_matrix[8][8] = {
      /* UN NL CR CW PR PW EX PD */
	{1, 1, 1, 1, 1, 1, 1, 0},	/* UN */
	{1, 1, 1, 1, 1, 1, 1, 0},	/* NL */
	{1, 1, 1, 1, 1, 1, 0, 0},	/* CR */
	{1, 1, 1, 1, 0, 0, 0, 0},	/* CW */
	{1, 1, 1, 0, 1, 0, 0, 0},	/* PR */
	{1, 1, 1, 0, 0, 0, 0, 0},	/* PW */
	{1, 1, 0, 0, 0, 0, 0, 0},	/* EX */
	{0, 0, 0, 0, 0, 0, 0, 0}	/* PD */
};
110
111/*
112 * This defines the direction of transfer of LVB data.
113 * Granted mode is the row; requested mode is the column.
114 * Usage: matrix[grmode+1][rqmode+1]
115 * 1 = LVB is returned to the caller
116 * 0 = LVB is written to the resource
117 * -1 = nothing happens to the LVB
118 */
119
120const int dlm_lvb_operations[8][8] = {
121 /* UN NL CR CW PR PW EX PD*/
122 { -1, 1, 1, 1, 1, 1, 1, -1 }, /* UN */
123 { -1, 1, 1, 1, 1, 1, 1, 0 }, /* NL */
124 { -1, -1, 1, 1, 1, 1, 1, 0 }, /* CR */
125 { -1, -1, -1, 1, 1, 1, 1, 0 }, /* CW */
126 { -1, -1, -1, -1, 1, 1, 1, 0 }, /* PR */
127 { -1, 0, 0, 0, 0, 0, 1, 0 }, /* PW */
128 { -1, 0, 0, 0, 0, 0, 0, 0 }, /* EX */
129 { -1, 0, 0, 0, 0, 0, 0, 0 } /* PD */
130};
131
132#define modes_compat(gr, rq) \
133 __dlm_compat_matrix[(gr)->lkb_grmode + 1][(rq)->lkb_rqmode + 1]
134
135int dlm_modes_compat(int mode1, int mode2)
136{
137 return __dlm_compat_matrix[mode1 + 1][mode2 + 1];
138}
139
140/*
141 * Compatibility matrix for conversions with QUECVT set.
142 * Granted mode is the row; requested mode is the column.
143 * Usage: matrix[grmode+1][rqmode+1]
144 */
145
146static const int __quecvt_compat_matrix[8][8] = {
147 /* UN NL CR CW PR PW EX PD */
148 {0, 0, 0, 0, 0, 0, 0, 0}, /* UN */
149 {0, 0, 1, 1, 1, 1, 1, 0}, /* NL */
150 {0, 0, 0, 1, 1, 1, 1, 0}, /* CR */
151 {0, 0, 0, 0, 1, 1, 1, 0}, /* CW */
152 {0, 0, 0, 1, 0, 1, 1, 0}, /* PR */
153 {0, 0, 0, 0, 0, 0, 1, 0}, /* PW */
154 {0, 0, 0, 0, 0, 0, 0, 0}, /* EX */
155 {0, 0, 0, 0, 0, 0, 0, 0} /* PD */
156};
157
158void dlm_print_lkb(struct dlm_lkb *lkb)
159{
160 printk(KERN_ERR "lkb: nodeid %d id %x remid %x exflags %x flags %x\n"
161 " status %d rqmode %d grmode %d wait_type %d ast_type %d\n",
162 lkb->lkb_nodeid, lkb->lkb_id, lkb->lkb_remid, lkb->lkb_exflags,
163 lkb->lkb_flags, lkb->lkb_status, lkb->lkb_rqmode,
164 lkb->lkb_grmode, lkb->lkb_wait_type, lkb->lkb_ast_type);
165}
166
167void dlm_print_rsb(struct dlm_rsb *r)
168{
169 printk(KERN_ERR "rsb: nodeid %d flags %lx first %x rlc %d name %s\n",
170 r->res_nodeid, r->res_flags, r->res_first_lkid,
171 r->res_recover_locks_count, r->res_name);
172}
173
174/* Threads cannot use the lockspace while it's being recovered */
175
176static inline void lock_recovery(struct dlm_ls *ls)
177{
178 down_read(&ls->ls_in_recovery);
179}
180
181static inline void unlock_recovery(struct dlm_ls *ls)
182{
183 up_read(&ls->ls_in_recovery);
184}
185
186static inline int lock_recovery_try(struct dlm_ls *ls)
187{
188 return down_read_trylock(&ls->ls_in_recovery);
189}
190
191static inline int can_be_queued(struct dlm_lkb *lkb)
192{
193 return !(lkb->lkb_exflags & DLM_LKF_NOQUEUE);
194}
195
196static inline int force_blocking_asts(struct dlm_lkb *lkb)
197{
198 return (lkb->lkb_exflags & DLM_LKF_NOQUEUEBAST);
199}
200
201static inline int is_demoted(struct dlm_lkb *lkb)
202{
203 return (lkb->lkb_sbflags & DLM_SBF_DEMOTED);
204}
205
206static inline int is_remote(struct dlm_rsb *r)
207{
208 DLM_ASSERT(r->res_nodeid >= 0, dlm_print_rsb(r););
209 return !!r->res_nodeid;
210}
211
212static inline int is_process_copy(struct dlm_lkb *lkb)
213{
214 return (lkb->lkb_nodeid && !(lkb->lkb_flags & DLM_IFL_MSTCPY));
215}
216
217static inline int is_master_copy(struct dlm_lkb *lkb)
218{
219 if (lkb->lkb_flags & DLM_IFL_MSTCPY)
220 DLM_ASSERT(lkb->lkb_nodeid, dlm_print_lkb(lkb););
221 return (lkb->lkb_flags & DLM_IFL_MSTCPY) ? 1 : 0;
222}
223
224static inline int middle_conversion(struct dlm_lkb *lkb)
225{
226 if ((lkb->lkb_grmode==DLM_LOCK_PR && lkb->lkb_rqmode==DLM_LOCK_CW) ||
227 (lkb->lkb_rqmode==DLM_LOCK_PR && lkb->lkb_grmode==DLM_LOCK_CW))
228 return 1;
229 return 0;
230}
231
232static inline int down_conversion(struct dlm_lkb *lkb)
233{
234 return (!middle_conversion(lkb) && lkb->lkb_rqmode < lkb->lkb_grmode);
235}
236
237static void queue_cast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
238{
239 if (is_master_copy(lkb))
240 return;
241
242 DLM_ASSERT(lkb->lkb_lksb, dlm_print_lkb(lkb););
243
244 lkb->lkb_lksb->sb_status = rv;
245 lkb->lkb_lksb->sb_flags = lkb->lkb_sbflags;
246
247 dlm_add_ast(lkb, AST_COMP);
248}
249
250static void queue_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rqmode)
251{
252 if (is_master_copy(lkb))
253 send_bast(r, lkb, rqmode);
254 else {
255 lkb->lkb_bastmode = rqmode;
256 dlm_add_ast(lkb, AST_BAST);
257 }
258}
259
260/*
261 * Basic operations on rsb's and lkb's
262 */
263
264static struct dlm_rsb *create_rsb(struct dlm_ls *ls, char *name, int len)
265{
266 struct dlm_rsb *r;
267
268 r = allocate_rsb(ls, len);
269 if (!r)
270 return NULL;
271
272 r->res_ls = ls;
273 r->res_length = len;
274 memcpy(r->res_name, name, len);
275 mutex_init(&r->res_mutex);
276
277 INIT_LIST_HEAD(&r->res_lookup);
278 INIT_LIST_HEAD(&r->res_grantqueue);
279 INIT_LIST_HEAD(&r->res_convertqueue);
280 INIT_LIST_HEAD(&r->res_waitqueue);
281 INIT_LIST_HEAD(&r->res_root_list);
282 INIT_LIST_HEAD(&r->res_recover_list);
283
284 return r;
285}
286
287static int search_rsb_list(struct list_head *head, char *name, int len,
288 unsigned int flags, struct dlm_rsb **r_ret)
289{
290 struct dlm_rsb *r;
291 int error = 0;
292
293 list_for_each_entry(r, head, res_hashchain) {
294 if (len == r->res_length && !memcmp(name, r->res_name, len))
295 goto found;
296 }
297 return -EBADR;
298
299 found:
300 if (r->res_nodeid && (flags & R_MASTER))
301 error = -ENOTBLK;
302 *r_ret = r;
303 return error;
304}
305
306static int _search_rsb(struct dlm_ls *ls, char *name, int len, int b,
307 unsigned int flags, struct dlm_rsb **r_ret)
308{
309 struct dlm_rsb *r;
310 int error;
311
312 error = search_rsb_list(&ls->ls_rsbtbl[b].list, name, len, flags, &r);
313 if (!error) {
314 kref_get(&r->res_ref);
315 goto out;
316 }
317 error = search_rsb_list(&ls->ls_rsbtbl[b].toss, name, len, flags, &r);
318 if (error)
319 goto out;
320
321 list_move(&r->res_hashchain, &ls->ls_rsbtbl[b].list);
322
323 if (dlm_no_directory(ls))
324 goto out;
325
326 if (r->res_nodeid == -1) {
327 rsb_clear_flag(r, RSB_MASTER_UNCERTAIN);
328 r->res_first_lkid = 0;
329 } else if (r->res_nodeid > 0) {
330 rsb_set_flag(r, RSB_MASTER_UNCERTAIN);
331 r->res_first_lkid = 0;
332 } else {
333 DLM_ASSERT(r->res_nodeid == 0, dlm_print_rsb(r););
334 DLM_ASSERT(!rsb_flag(r, RSB_MASTER_UNCERTAIN),);
335 }
336 out:
337 *r_ret = r;
338 return error;
339}
340
341static int search_rsb(struct dlm_ls *ls, char *name, int len, int b,
342 unsigned int flags, struct dlm_rsb **r_ret)
343{
344 int error;
345 write_lock(&ls->ls_rsbtbl[b].lock);
346 error = _search_rsb(ls, name, len, b, flags, r_ret);
347 write_unlock(&ls->ls_rsbtbl[b].lock);
348 return error;
349}
350
351/*
352 * Find rsb in rsbtbl and potentially create/add one
353 *
354 * Delaying the release of rsb's has a similar benefit to applications keeping
355 * NL locks on an rsb, but without the guarantee that the cached master value
356 * will still be valid when the rsb is reused. Apps aren't always smart enough
357 * to keep NL locks on an rsb that they may lock again shortly; this can lead
358 * to excessive master lookups and removals if we don't delay the release.
359 *
360 * Searching for an rsb means looking through both the normal list and toss
361 * list. When found on the toss list the rsb is moved to the normal list with
362 * ref count of 1; when found on normal list the ref count is incremented.
363 */
364
365static int find_rsb(struct dlm_ls *ls, char *name, int namelen,
366 unsigned int flags, struct dlm_rsb **r_ret)
367{
368 struct dlm_rsb *r, *tmp;
369 uint32_t hash, bucket;
370 int error = 0;
371
372 if (dlm_no_directory(ls))
373 flags |= R_CREATE;
374
375 hash = jhash(name, namelen, 0);
376 bucket = hash & (ls->ls_rsbtbl_size - 1);
377
378 error = search_rsb(ls, name, namelen, bucket, flags, &r);
379 if (!error)
380 goto out;
381
382 if (error == -EBADR && !(flags & R_CREATE))
383 goto out;
384
385 /* the rsb was found but wasn't a master copy */
386 if (error == -ENOTBLK)
387 goto out;
388
389 error = -ENOMEM;
390 r = create_rsb(ls, name, namelen);
391 if (!r)
392 goto out;
393
394 r->res_hash = hash;
395 r->res_bucket = bucket;
396 r->res_nodeid = -1;
397 kref_init(&r->res_ref);
398
399 /* With no directory, the master can be set immediately */
400 if (dlm_no_directory(ls)) {
401 int nodeid = dlm_dir_nodeid(r);
402 if (nodeid == dlm_our_nodeid())
403 nodeid = 0;
404 r->res_nodeid = nodeid;
405 }
406
407 write_lock(&ls->ls_rsbtbl[bucket].lock);
408 error = _search_rsb(ls, name, namelen, bucket, 0, &tmp);
409 if (!error) {
410 write_unlock(&ls->ls_rsbtbl[bucket].lock);
411 free_rsb(r);
412 r = tmp;
413 goto out;
414 }
415 list_add(&r->res_hashchain, &ls->ls_rsbtbl[bucket].list);
416 write_unlock(&ls->ls_rsbtbl[bucket].lock);
417 error = 0;
418 out:
419 *r_ret = r;
420 return error;
421}
422
/* Non-static entry point that forwards to find_rsb() unchanged. */
int dlm_find_rsb(struct dlm_ls *ls, char *name, int namelen,
		 unsigned int flags, struct dlm_rsb **r_ret)
{
	int rv = find_rsb(ls, name, namelen, flags, r_ret);

	return rv;
}
428
429/* This is only called to add a reference when the code already holds
430 a valid reference to the rsb, so there's no need for locking. */
431
432static inline void hold_rsb(struct dlm_rsb *r)
433{
434 kref_get(&r->res_ref);
435}
436
437void dlm_hold_rsb(struct dlm_rsb *r)
438{
439 hold_rsb(r);
440}
441
442static void toss_rsb(struct kref *kref)
443{
444 struct dlm_rsb *r = container_of(kref, struct dlm_rsb, res_ref);
445 struct dlm_ls *ls = r->res_ls;
446
447 DLM_ASSERT(list_empty(&r->res_root_list), dlm_print_rsb(r););
448 kref_init(&r->res_ref);
449 list_move(&r->res_hashchain, &ls->ls_rsbtbl[r->res_bucket].toss);
450 r->res_toss_time = jiffies;
451 if (r->res_lvbptr) {
452 free_lvb(r->res_lvbptr);
453 r->res_lvbptr = NULL;
454 }
455}
456
457/* When all references to the rsb are gone it's transfered to
458 the tossed list for later disposal. */
459
460static void put_rsb(struct dlm_rsb *r)
461{
462 struct dlm_ls *ls = r->res_ls;
463 uint32_t bucket = r->res_bucket;
464
465 write_lock(&ls->ls_rsbtbl[bucket].lock);
466 kref_put(&r->res_ref, toss_rsb);
467 write_unlock(&ls->ls_rsbtbl[bucket].lock);
468}
469
/* Non-static wrapper around put_rsb(): drops one reference on @r. */
void dlm_put_rsb(struct dlm_rsb *r)
{
	struct dlm_rsb *rsb = r;

	put_rsb(rsb);
}
474
475/* See comment for unhold_lkb */
476
477static void unhold_rsb(struct dlm_rsb *r)
478{
479 int rv;
480 rv = kref_put(&r->res_ref, toss_rsb);
481 DLM_ASSERT(!rv, dlm_print_rsb(r););
482}
483
/*
 * kref release function that only sanity-checks the rsb: every queue and
 * recovery list on it must already be empty.  Nothing is unlinked or freed
 * here.
 */
static void kill_rsb(struct kref *kref)
{
	struct dlm_rsb *r = container_of(kref, struct dlm_rsb, res_ref);

	/* All work is done after the return from kref_put() so we
	   can release the write_lock before the remove and free. */

	DLM_ASSERT(list_empty(&r->res_lookup),);
	DLM_ASSERT(list_empty(&r->res_grantqueue),);
	DLM_ASSERT(list_empty(&r->res_convertqueue),);
	DLM_ASSERT(list_empty(&r->res_waitqueue),);
	DLM_ASSERT(list_empty(&r->res_root_list),);
	DLM_ASSERT(list_empty(&r->res_recover_list),);
}
498
499/* Attaching/detaching lkb's from rsb's is for rsb reference counting.
500 The rsb must exist as long as any lkb's for it do. */
501
502static void attach_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb)
503{
504 hold_rsb(r);
505 lkb->lkb_resource = r;
506}
507
508static void detach_lkb(struct dlm_lkb *lkb)
509{
510 if (lkb->lkb_resource) {
511 put_rsb(lkb->lkb_resource);
512 lkb->lkb_resource = NULL;
513 }
514}
515
516static int create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret)
517{
518 struct dlm_lkb *lkb, *tmp;
519 uint32_t lkid = 0;
520 uint16_t bucket;
521
522 lkb = allocate_lkb(ls);
523 if (!lkb)
524 return -ENOMEM;
525
526 lkb->lkb_nodeid = -1;
527 lkb->lkb_grmode = DLM_LOCK_IV;
528 kref_init(&lkb->lkb_ref);
529 INIT_LIST_HEAD(&lkb->lkb_ownqueue);
530
531 get_random_bytes(&bucket, sizeof(bucket));
532 bucket &= (ls->ls_lkbtbl_size - 1);
533
534 write_lock(&ls->ls_lkbtbl[bucket].lock);
535
536 /* counter can roll over so we must verify lkid is not in use */
537
538 while (lkid == 0) {
539 lkid = bucket | (ls->ls_lkbtbl[bucket].counter++ << 16);
540
541 list_for_each_entry(tmp, &ls->ls_lkbtbl[bucket].list,
542 lkb_idtbl_list) {
543 if (tmp->lkb_id != lkid)
544 continue;
545 lkid = 0;
546 break;
547 }
548 }
549
550 lkb->lkb_id = lkid;
551 list_add(&lkb->lkb_idtbl_list, &ls->ls_lkbtbl[bucket].list);
552 write_unlock(&ls->ls_lkbtbl[bucket].lock);
553
554 *lkb_ret = lkb;
555 return 0;
556}
557
558static struct dlm_lkb *__find_lkb(struct dlm_ls *ls, uint32_t lkid)
559{
560 uint16_t bucket = lkid & 0xFFFF;
561 struct dlm_lkb *lkb;
562
563 list_for_each_entry(lkb, &ls->ls_lkbtbl[bucket].list, lkb_idtbl_list) {
564 if (lkb->lkb_id == lkid)
565 return lkb;
566 }
567 return NULL;
568}
569
570static int find_lkb(struct dlm_ls *ls, uint32_t lkid, struct dlm_lkb **lkb_ret)
571{
572 struct dlm_lkb *lkb;
573 uint16_t bucket = lkid & 0xFFFF;
574
575 if (bucket >= ls->ls_lkbtbl_size)
576 return -EBADSLT;
577
578 read_lock(&ls->ls_lkbtbl[bucket].lock);
579 lkb = __find_lkb(ls, lkid);
580 if (lkb)
581 kref_get(&lkb->lkb_ref);
582 read_unlock(&ls->ls_lkbtbl[bucket].lock);
583
584 *lkb_ret = lkb;
585 return lkb ? 0 : -ENOENT;
586}
587
588static void kill_lkb(struct kref *kref)
589{
590 struct dlm_lkb *lkb = container_of(kref, struct dlm_lkb, lkb_ref);
591
592 /* All work is done after the return from kref_put() so we
593 can release the write_lock before the detach_lkb */
594
595 DLM_ASSERT(!lkb->lkb_status, dlm_print_lkb(lkb););
596}
597
598/* __put_lkb() is used when an lkb may not have an rsb attached to
599 it so we need to provide the lockspace explicitly */
600
601static int __put_lkb(struct dlm_ls *ls, struct dlm_lkb *lkb)
602{
603 uint16_t bucket = lkb->lkb_id & 0xFFFF;
604
605 write_lock(&ls->ls_lkbtbl[bucket].lock);
606 if (kref_put(&lkb->lkb_ref, kill_lkb)) {
607 list_del(&lkb->lkb_idtbl_list);
608 write_unlock(&ls->ls_lkbtbl[bucket].lock);
609
610 detach_lkb(lkb);
611
612 /* for local/process lkbs, lvbptr points to caller's lksb */
613 if (lkb->lkb_lvbptr && is_master_copy(lkb))
614 free_lvb(lkb->lkb_lvbptr);
615 free_lkb(lkb);
616 return 1;
617 } else {
618 write_unlock(&ls->ls_lkbtbl[bucket].lock);
619 return 0;
620 }
621}
622
623int dlm_put_lkb(struct dlm_lkb *lkb)
624{
625 struct dlm_ls *ls;
626
627 DLM_ASSERT(lkb->lkb_resource, dlm_print_lkb(lkb););
628 DLM_ASSERT(lkb->lkb_resource->res_ls, dlm_print_lkb(lkb););
629
630 ls = lkb->lkb_resource->res_ls;
631 return __put_lkb(ls, lkb);
632}
633
634/* This is only called to add a reference when the code already holds
635 a valid reference to the lkb, so there's no need for locking. */
636
637static inline void hold_lkb(struct dlm_lkb *lkb)
638{
639 kref_get(&lkb->lkb_ref);
640}
641
642/* This is called when we need to remove a reference and are certain
643 it's not the last ref. e.g. del_lkb is always called between a
644 find_lkb/put_lkb and is always the inverse of a previous add_lkb.
645 put_lkb would work fine, but would involve unnecessary locking */
646
647static inline void unhold_lkb(struct dlm_lkb *lkb)
648{
649 int rv;
650 rv = kref_put(&lkb->lkb_ref, kill_lkb);
651 DLM_ASSERT(!rv, dlm_print_lkb(lkb););
652}
653
654static void lkb_add_ordered(struct list_head *new, struct list_head *head,
655 int mode)
656{
657 struct dlm_lkb *lkb = NULL;
658
659 list_for_each_entry(lkb, head, lkb_statequeue)
660 if (lkb->lkb_rqmode < mode)
661 break;
662
663 if (!lkb)
664 list_add_tail(new, head);
665 else
666 __list_add(new, lkb->lkb_statequeue.prev, &lkb->lkb_statequeue);
667}
668
669/* add/remove lkb to rsb's grant/convert/wait queue */
670
671static void add_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb, int status)
672{
673 kref_get(&lkb->lkb_ref);
674
675 DLM_ASSERT(!lkb->lkb_status, dlm_print_lkb(lkb););
676
677 lkb->lkb_status = status;
678
679 switch (status) {
680 case DLM_LKSTS_WAITING:
681 if (lkb->lkb_exflags & DLM_LKF_HEADQUE)
682 list_add(&lkb->lkb_statequeue, &r->res_waitqueue);
683 else
684 list_add_tail(&lkb->lkb_statequeue, &r->res_waitqueue);
685 break;
686 case DLM_LKSTS_GRANTED:
687 /* convention says granted locks kept in order of grmode */
688 lkb_add_ordered(&lkb->lkb_statequeue, &r->res_grantqueue,
689 lkb->lkb_grmode);
690 break;
691 case DLM_LKSTS_CONVERT:
692 if (lkb->lkb_exflags & DLM_LKF_HEADQUE)
693 list_add(&lkb->lkb_statequeue, &r->res_convertqueue);
694 else
695 list_add_tail(&lkb->lkb_statequeue,
696 &r->res_convertqueue);
697 break;
698 default:
699 DLM_ASSERT(0, dlm_print_lkb(lkb); printk("sts=%d\n", status););
700 }
701}
702
703static void del_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb)
704{
705 lkb->lkb_status = 0;
706 list_del(&lkb->lkb_statequeue);
707 unhold_lkb(lkb);
708}
709
/* Move an lkb from its current state queue to the queue for "sts". */

static void move_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb, int sts)
{
	/* temporary ref held across the del/add pair */
	hold_lkb(lkb);

	del_lkb(r, lkb);
	add_lkb(r, lkb, sts);

	unhold_lkb(lkb);
}
717
718/* add/remove lkb from global waiters list of lkb's waiting for
719 a reply from a remote node */
720
721static void add_to_waiters(struct dlm_lkb *lkb, int mstype)
722{
723 struct dlm_ls *ls = lkb->lkb_resource->res_ls;
724
725 mutex_lock(&ls->ls_waiters_mutex);
726 if (lkb->lkb_wait_type) {
727 log_print("add_to_waiters error %d", lkb->lkb_wait_type);
728 goto out;
729 }
730 lkb->lkb_wait_type = mstype;
731 kref_get(&lkb->lkb_ref);
732 list_add(&lkb->lkb_wait_reply, &ls->ls_waiters);
733 out:
734 mutex_unlock(&ls->ls_waiters_mutex);
735}
736
737static int _remove_from_waiters(struct dlm_lkb *lkb)
738{
739 int error = 0;
740
741 if (!lkb->lkb_wait_type) {
742 log_print("remove_from_waiters error");
743 error = -EINVAL;
744 goto out;
745 }
746 lkb->lkb_wait_type = 0;
747 list_del(&lkb->lkb_wait_reply);
748 unhold_lkb(lkb);
749 out:
750 return error;
751}
752
753static int remove_from_waiters(struct dlm_lkb *lkb)
754{
755 struct dlm_ls *ls = lkb->lkb_resource->res_ls;
756 int error;
757
758 mutex_lock(&ls->ls_waiters_mutex);
759 error = _remove_from_waiters(lkb);
760 mutex_unlock(&ls->ls_waiters_mutex);
761 return error;
762}
763
764static void dir_remove(struct dlm_rsb *r)
765{
766 int to_nodeid;
767
768 if (dlm_no_directory(r->res_ls))
769 return;
770
771 to_nodeid = dlm_dir_nodeid(r);
772 if (to_nodeid != dlm_our_nodeid())
773 send_remove(r);
774 else
775 dlm_dir_remove_entry(r->res_ls, to_nodeid,
776 r->res_name, r->res_length);
777}
778
779/* FIXME: shouldn't this be able to exit as soon as one non-due rsb is
780 found since they are in order of newest to oldest? */
781
782static int shrink_bucket(struct dlm_ls *ls, int b)
783{
784 struct dlm_rsb *r;
785 int count = 0, found;
786
787 for (;;) {
788 found = 0;
789 write_lock(&ls->ls_rsbtbl[b].lock);
790 list_for_each_entry_reverse(r, &ls->ls_rsbtbl[b].toss,
791 res_hashchain) {
792 if (!time_after_eq(jiffies, r->res_toss_time +
793 dlm_config.toss_secs * HZ))
794 continue;
795 found = 1;
796 break;
797 }
798
799 if (!found) {
800 write_unlock(&ls->ls_rsbtbl[b].lock);
801 break;
802 }
803
804 if (kref_put(&r->res_ref, kill_rsb)) {
805 list_del(&r->res_hashchain);
806 write_unlock(&ls->ls_rsbtbl[b].lock);
807
808 if (is_master(r))
809 dir_remove(r);
810 free_rsb(r);
811 count++;
812 } else {
813 write_unlock(&ls->ls_rsbtbl[b].lock);
814 log_error(ls, "tossed rsb in use %s", r->res_name);
815 }
816 }
817
818 return count;
819}
820
821void dlm_scan_rsbs(struct dlm_ls *ls)
822{
823 int i;
824
825 if (dlm_locking_stopped(ls))
826 return;
827
828 for (i = 0; i < ls->ls_rsbtbl_size; i++) {
829 shrink_bucket(ls, i);
830 cond_resched();
831 }
832}
833
834/* lkb is master or local copy */
835
836static void set_lvb_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
837{
838 int b, len = r->res_ls->ls_lvblen;
839
840 /* b=1 lvb returned to caller
841 b=0 lvb written to rsb or invalidated
842 b=-1 do nothing */
843
844 b = dlm_lvb_operations[lkb->lkb_grmode + 1][lkb->lkb_rqmode + 1];
845
846 if (b == 1) {
847 if (!lkb->lkb_lvbptr)
848 return;
849
850 if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
851 return;
852
853 if (!r->res_lvbptr)
854 return;
855
856 memcpy(lkb->lkb_lvbptr, r->res_lvbptr, len);
857 lkb->lkb_lvbseq = r->res_lvbseq;
858
859 } else if (b == 0) {
860 if (lkb->lkb_exflags & DLM_LKF_IVVALBLK) {
861 rsb_set_flag(r, RSB_VALNOTVALID);
862 return;
863 }
864
865 if (!lkb->lkb_lvbptr)
866 return;
867
868 if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
869 return;
870
871 if (!r->res_lvbptr)
872 r->res_lvbptr = allocate_lvb(r->res_ls);
873
874 if (!r->res_lvbptr)
875 return;
876
877 memcpy(r->res_lvbptr, lkb->lkb_lvbptr, len);
878 r->res_lvbseq++;
879 lkb->lkb_lvbseq = r->res_lvbseq;
880 rsb_clear_flag(r, RSB_VALNOTVALID);
881 }
882
883 if (rsb_flag(r, RSB_VALNOTVALID))
884 lkb->lkb_sbflags |= DLM_SBF_VALNOTVALID;
885}
886
887static void set_lvb_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb)
888{
889 if (lkb->lkb_grmode < DLM_LOCK_PW)
890 return;
891
892 if (lkb->lkb_exflags & DLM_LKF_IVVALBLK) {
893 rsb_set_flag(r, RSB_VALNOTVALID);
894 return;
895 }
896
897 if (!lkb->lkb_lvbptr)
898 return;
899
900 if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
901 return;
902
903 if (!r->res_lvbptr)
904 r->res_lvbptr = allocate_lvb(r->res_ls);
905
906 if (!r->res_lvbptr)
907 return;
908
909 memcpy(r->res_lvbptr, lkb->lkb_lvbptr, r->res_ls->ls_lvblen);
910 r->res_lvbseq++;
911 rsb_clear_flag(r, RSB_VALNOTVALID);
912}
913
914/* lkb is process copy (pc) */
915
916static void set_lvb_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb,
917 struct dlm_message *ms)
918{
919 int b;
920
921 if (!lkb->lkb_lvbptr)
922 return;
923
924 if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
925 return;
926
927 b = dlm_lvb_operations[lkb->lkb_grmode + 1][lkb->lkb_rqmode + 1];
928 if (b == 1) {
929 int len = receive_extralen(ms);
930 memcpy(lkb->lkb_lvbptr, ms->m_extra, len);
931 lkb->lkb_lvbseq = ms->m_lvbseq;
932 }
933}
934
935/* Manipulate lkb's on rsb's convert/granted/waiting queues
936 remove_lock -- used for unlock, removes lkb from granted
937 revert_lock -- used for cancel, moves lkb from convert to granted
938 grant_lock -- used for request and convert, adds lkb to granted or
939 moves lkb from convert or waiting to granted
940
941 Each of these is used for master or local copy lkb's. There is
942 also a _pc() variation used to make the corresponding change on
943 a process copy (pc) lkb. */
944
945static void _remove_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
946{
947 del_lkb(r, lkb);
948 lkb->lkb_grmode = DLM_LOCK_IV;
949 /* this unhold undoes the original ref from create_lkb()
950 so this leads to the lkb being freed */
951 unhold_lkb(lkb);
952}
953
/* Unlock: write back the lvb if applicable, then remove the lkb. */

static void remove_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
	set_lvb_unlock(r, lkb);

	_remove_lock(r, lkb);
}
959
/* Process-copy unlock: same as remove_lock() minus the lvb write-back. */

static void remove_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
	_remove_lock(r, lkb);
}
964
965static void revert_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
966{
967 lkb->lkb_rqmode = DLM_LOCK_IV;
968
969 switch (lkb->lkb_status) {
970 case DLM_LKSTS_GRANTED:
971 break;
972 case DLM_LKSTS_CONVERT:
973 move_lkb(r, lkb, DLM_LKSTS_GRANTED);
974 break;
975 case DLM_LKSTS_WAITING:
976 del_lkb(r, lkb);
977 lkb->lkb_grmode = DLM_LOCK_IV;
978 /* this unhold undoes the original ref from create_lkb()
979 so this leads to the lkb being freed */
980 unhold_lkb(lkb);
981 break;
982 default:
983 log_print("invalid status for revert %d", lkb->lkb_status);
984 }
985}
986
/* Process-copy cancel: currently identical to revert_lock(). */

static void revert_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
	revert_lock(r, lkb);
}
991
992static void _grant_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
993{
994 if (lkb->lkb_grmode != lkb->lkb_rqmode) {
995 lkb->lkb_grmode = lkb->lkb_rqmode;
996 if (lkb->lkb_status)
997 move_lkb(r, lkb, DLM_LKSTS_GRANTED);
998 else
999 add_lkb(r, lkb, DLM_LKSTS_GRANTED);
1000 }
1001
1002 lkb->lkb_rqmode = DLM_LOCK_IV;
1003}
1004
1005static void grant_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
1006{
1007 set_lvb_lock(r, lkb);
1008 _grant_lock(r, lkb);
1009 lkb->lkb_highbast = 0;
1010}
1011
/* Grant a process copy lkb, taking the lvb from the message ms. */

static void grant_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb,
			  struct dlm_message *ms)
{
	set_lvb_lock_pc(r, lkb, ms);

	_grant_lock(r, lkb);
}
1018
/* Grant a lock that had been queued.  Called from the
   grant_pending_locks() path, so besides granting the lock the owner
   must be told asynchronously: a grant message is sent if the lkb is a
   master copy (it belongs to a remote node), otherwise a completion ast
   is queued locally. */

static void grant_lock_pending(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
	grant_lock(r, lkb);

	if (!is_master_copy(lkb)) {
		queue_cast(r, lkb, 0);
		return;
	}
	send_grant(r, lkb);
}
1031
1032static inline int first_in_list(struct dlm_lkb *lkb, struct list_head *head)
1033{
1034 struct dlm_lkb *first = list_entry(head->next, struct dlm_lkb,
1035 lkb_statequeue);
1036 if (lkb->lkb_id == first->lkb_id)
1037 return 1;
1038
1039 return 0;
1040}
1041
1042/* Check if the given lkb conflicts with another lkb on the queue. */
1043
1044static int queue_conflict(struct list_head *head, struct dlm_lkb *lkb)
1045{
1046 struct dlm_lkb *this;
1047
1048 list_for_each_entry(this, head, lkb_statequeue) {
1049 if (this == lkb)
1050 continue;
1051 if (!modes_compat(this, lkb))
1052 return 1;
1053 }
1054 return 0;
1055}
1056
1057/*
1058 * "A conversion deadlock arises with a pair of lock requests in the converting
1059 * queue for one resource. The granted mode of each lock blocks the requested
1060 * mode of the other lock."
1061 *
1062 * Part 2: if the granted mode of lkb is preventing the first lkb in the
1063 * convert queue from being granted, then demote lkb (set grmode to NL).
1064 * This second form requires that we check for conv-deadlk even when
1065 * now == 0 in _can_be_granted().
1066 *
1067 * Example:
1068 * Granted Queue: empty
1069 * Convert Queue: NL->EX (first lock)
1070 * PR->EX (second lock)
1071 *
1072 * The first lock can't be granted because of the granted mode of the second
1073 * lock and the second lock can't be granted because it's not first in the
1074 * list. We demote the granted mode of the second lock (the lkb passed to this
1075 * function).
1076 *
1077 * After the resolution, the "grant pending" function needs to go back and try
1078 * to grant locks on the convert queue again since the first lock can now be
1079 * granted.
1080 */
1081
1082static int conversion_deadlock_detect(struct dlm_rsb *rsb, struct dlm_lkb *lkb)
1083{
1084 struct dlm_lkb *this, *first = NULL, *self = NULL;
1085
1086 list_for_each_entry(this, &rsb->res_convertqueue, lkb_statequeue) {
1087 if (!first)
1088 first = this;
1089 if (this == lkb) {
1090 self = lkb;
1091 continue;
1092 }
1093
1094 if (!modes_compat(this, lkb) && !modes_compat(lkb, this))
1095 return 1;
1096 }
1097
1098 /* if lkb is on the convert queue and is preventing the first
1099 from being granted, then there's deadlock and we demote lkb.
1100 multiple converting locks may need to do this before the first
1101 converting lock can be granted. */
1102
1103 if (self && self != first) {
1104 if (!modes_compat(lkb, first) &&
1105 !queue_conflict(&rsb->res_grantqueue, first))
1106 return 1;
1107 }
1108
1109 return 0;
1110}
1111
1112/*
1113 * Return 1 if the lock can be granted, 0 otherwise.
1114 * Also detect and resolve conversion deadlocks.
1115 *
1116 * lkb is the lock to be granted
1117 *
1118 * now is 1 if the function is being called in the context of the
1119 * immediate request, it is 0 if called later, after the lock has been
1120 * queued.
1121 *
1122 * References are from chapter 6 of "VAXcluster Principles" by Roy Davis
1123 */
1124
1125static int _can_be_granted(struct dlm_rsb *r, struct dlm_lkb *lkb, int now)
1126{
1127 int8_t conv = (lkb->lkb_grmode != DLM_LOCK_IV);
1128
1129 /*
1130 * 6-10: Version 5.4 introduced an option to address the phenomenon of
1131 * a new request for a NL mode lock being blocked.
1132 *
1133 * 6-11: If the optional EXPEDITE flag is used with the new NL mode
1134 * request, then it would be granted. In essence, the use of this flag
1135 * tells the Lock Manager to expedite theis request by not considering
1136 * what may be in the CONVERTING or WAITING queues... As of this
1137 * writing, the EXPEDITE flag can be used only with new requests for NL
1138 * mode locks. This flag is not valid for conversion requests.
1139 *
1140 * A shortcut. Earlier checks return an error if EXPEDITE is used in a
1141 * conversion or used with a non-NL requested mode. We also know an
1142 * EXPEDITE request is always granted immediately, so now must always
1143 * be 1. The full condition to grant an expedite request: (now &&
1144 * !conv && lkb->rqmode == DLM_LOCK_NL && (flags & EXPEDITE)) can
1145 * therefore be shortened to just checking the flag.
1146 */
1147
1148 if (lkb->lkb_exflags & DLM_LKF_EXPEDITE)
1149 return 1;
1150
1151 /*
1152 * A shortcut. Without this, !queue_conflict(grantqueue, lkb) would be
1153 * added to the remaining conditions.
1154 */
1155
1156 if (queue_conflict(&r->res_grantqueue, lkb))
1157 goto out;
1158
1159 /*
1160 * 6-3: By default, a conversion request is immediately granted if the
1161 * requested mode is compatible with the modes of all other granted
1162 * locks
1163 */
1164
1165 if (queue_conflict(&r->res_convertqueue, lkb))
1166 goto out;
1167
1168 /*
1169 * 6-5: But the default algorithm for deciding whether to grant or
1170 * queue conversion requests does not by itself guarantee that such
1171 * requests are serviced on a "first come first serve" basis. This, in
1172 * turn, can lead to a phenomenon known as "indefinate postponement".
1173 *
1174 * 6-7: This issue is dealt with by using the optional QUECVT flag with
1175 * the system service employed to request a lock conversion. This flag
1176 * forces certain conversion requests to be queued, even if they are
1177 * compatible with the granted modes of other locks on the same
1178 * resource. Thus, the use of this flag results in conversion requests
1179 * being ordered on a "first come first servce" basis.
1180 *
1181 * DCT: This condition is all about new conversions being able to occur
1182 * "in place" while the lock remains on the granted queue (assuming
1183 * nothing else conflicts.) IOW if QUECVT isn't set, a conversion
1184 * doesn't _have_ to go onto the convert queue where it's processed in
1185 * order. The "now" variable is necessary to distinguish converts
1186 * being received and processed for the first time now, because once a
1187 * convert is moved to the conversion queue the condition below applies
1188 * requiring fifo granting.
1189 */
1190
1191 if (now && conv && !(lkb->lkb_exflags & DLM_LKF_QUECVT))
1192 return 1;
1193
1194 /*
1195 * The NOORDER flag is set to avoid the standard vms rules on grant
1196 * order.
1197 */
1198
1199 if (lkb->lkb_exflags & DLM_LKF_NOORDER)
1200 return 1;
1201
1202 /*
1203 * 6-3: Once in that queue [CONVERTING], a conversion request cannot be
1204 * granted until all other conversion requests ahead of it are granted
1205 * and/or canceled.
1206 */
1207
1208 if (!now && conv && first_in_list(lkb, &r->res_convertqueue))
1209 return 1;
1210
1211 /*
1212 * 6-4: By default, a new request is immediately granted only if all
1213 * three of the following conditions are satisfied when the request is
1214 * issued:
1215 * - The queue of ungranted conversion requests for the resource is
1216 * empty.
1217 * - The queue of ungranted new requests for the resource is empty.
1218 * - The mode of the new request is compatible with the most
1219 * restrictive mode of all granted locks on the resource.
1220 */
1221
1222 if (now && !conv && list_empty(&r->res_convertqueue) &&
1223 list_empty(&r->res_waitqueue))
1224 return 1;
1225
1226 /*
1227 * 6-4: Once a lock request is in the queue of ungranted new requests,
1228 * it cannot be granted until the queue of ungranted conversion
1229 * requests is empty, all ungranted new requests ahead of it are
1230 * granted and/or canceled, and it is compatible with the granted mode
1231 * of the most restrictive lock granted on the resource.
1232 */
1233
1234 if (!now && !conv && list_empty(&r->res_convertqueue) &&
1235 first_in_list(lkb, &r->res_waitqueue))
1236 return 1;
1237
1238 out:
1239 /*
1240 * The following, enabled by CONVDEADLK, departs from VMS.
1241 */
1242
1243 if (conv && (lkb->lkb_exflags & DLM_LKF_CONVDEADLK) &&
1244 conversion_deadlock_detect(r, lkb)) {
1245 lkb->lkb_grmode = DLM_LOCK_NL;
1246 lkb->lkb_sbflags |= DLM_SBF_DEMOTED;
1247 }
1248
1249 return 0;
1250}
1251
1252/*
1253 * The ALTPR and ALTCW flags aren't traditional lock manager flags, but are a
1254 * simple way to provide a big optimization to applications that can use them.
1255 */
1256
1257static int can_be_granted(struct dlm_rsb *r, struct dlm_lkb *lkb, int now)
1258{
1259 uint32_t flags = lkb->lkb_exflags;
1260 int rv;
1261 int8_t alt = 0, rqmode = lkb->lkb_rqmode;
1262
1263 rv = _can_be_granted(r, lkb, now);
1264 if (rv)
1265 goto out;
1266
1267 if (lkb->lkb_sbflags & DLM_SBF_DEMOTED)
1268 goto out;
1269
1270 if (rqmode != DLM_LOCK_PR && flags & DLM_LKF_ALTPR)
1271 alt = DLM_LOCK_PR;
1272 else if (rqmode != DLM_LOCK_CW && flags & DLM_LKF_ALTCW)
1273 alt = DLM_LOCK_CW;
1274
1275 if (alt) {
1276 lkb->lkb_rqmode = alt;
1277 rv = _can_be_granted(r, lkb, now);
1278 if (rv)
1279 lkb->lkb_sbflags |= DLM_SBF_ALTMODE;
1280 else
1281 lkb->lkb_rqmode = rqmode;
1282 }
1283 out:
1284 return rv;
1285}
1286
1287static int grant_pending_convert(struct dlm_rsb *r, int high)
1288{
1289 struct dlm_lkb *lkb, *s;
1290 int hi, demoted, quit, grant_restart, demote_restart;
1291
1292 quit = 0;
1293 restart:
1294 grant_restart = 0;
1295 demote_restart = 0;
1296 hi = DLM_LOCK_IV;
1297
1298 list_for_each_entry_safe(lkb, s, &r->res_convertqueue, lkb_statequeue) {
1299 demoted = is_demoted(lkb);
1300 if (can_be_granted(r, lkb, 0)) {
1301 grant_lock_pending(r, lkb);
1302 grant_restart = 1;
1303 } else {
1304 hi = max_t(int, lkb->lkb_rqmode, hi);
1305 if (!demoted && is_demoted(lkb))
1306 demote_restart = 1;
1307 }
1308 }
1309
1310 if (grant_restart)
1311 goto restart;
1312 if (demote_restart && !quit) {
1313 quit = 1;
1314 goto restart;
1315 }
1316
1317 return max_t(int, high, hi);
1318}
1319
1320static int grant_pending_wait(struct dlm_rsb *r, int high)
1321{
1322 struct dlm_lkb *lkb, *s;
1323
1324 list_for_each_entry_safe(lkb, s, &r->res_waitqueue, lkb_statequeue) {
1325 if (can_be_granted(r, lkb, 0))
1326 grant_lock_pending(r, lkb);
1327 else
1328 high = max_t(int, lkb->lkb_rqmode, high);
1329 }
1330
1331 return high;
1332}
1333
1334static void grant_pending_locks(struct dlm_rsb *r)
1335{
1336 struct dlm_lkb *lkb, *s;
1337 int high = DLM_LOCK_IV;
1338
1339 DLM_ASSERT(is_master(r), dlm_print_rsb(r););
1340
1341 high = grant_pending_convert(r, high);
1342 high = grant_pending_wait(r, high);
1343
1344 if (high == DLM_LOCK_IV)
1345 return;
1346
1347 /*
1348 * If there are locks left on the wait/convert queue then send blocking
1349 * ASTs to granted locks based on the largest requested mode (high)
1350 * found above. FIXME: highbast < high comparison not valid for PR/CW.
1351 */
1352
1353 list_for_each_entry_safe(lkb, s, &r->res_grantqueue, lkb_statequeue) {
1354 if (lkb->lkb_bastaddr && (lkb->lkb_highbast < high) &&
1355 !__dlm_compat_matrix[lkb->lkb_grmode+1][high+1]) {
1356 queue_bast(r, lkb, high);
1357 lkb->lkb_highbast = high;
1358 }
1359 }
1360}
1361
1362static void send_bast_queue(struct dlm_rsb *r, struct list_head *head,
1363 struct dlm_lkb *lkb)
1364{
1365 struct dlm_lkb *gr;
1366
1367 list_for_each_entry(gr, head, lkb_statequeue) {
1368 if (gr->lkb_bastaddr &&
1369 gr->lkb_highbast < lkb->lkb_rqmode &&
1370 !modes_compat(gr, lkb)) {
1371 queue_bast(r, gr, lkb->lkb_rqmode);
1372 gr->lkb_highbast = lkb->lkb_rqmode;
1373 }
1374 }
1375}
1376
1377static void send_blocking_asts(struct dlm_rsb *r, struct dlm_lkb *lkb)
1378{
1379 send_bast_queue(r, &r->res_grantqueue, lkb);
1380}
1381
1382static void send_blocking_asts_all(struct dlm_rsb *r, struct dlm_lkb *lkb)
1383{
1384 send_bast_queue(r, &r->res_grantqueue, lkb);
1385 send_bast_queue(r, &r->res_convertqueue, lkb);
1386}
1387
1388/* set_master(r, lkb) -- set the master nodeid of a resource
1389
1390 The purpose of this function is to set the nodeid field in the given
1391 lkb using the nodeid field in the given rsb. If the rsb's nodeid is
1392 known, it can just be copied to the lkb and the function will return
1393 0. If the rsb's nodeid is _not_ known, it needs to be looked up
1394 before it can be copied to the lkb.
1395
1396 When the rsb nodeid is being looked up remotely, the initial lkb
1397 causing the lookup is kept on the ls_waiters list waiting for the
1398 lookup reply. Other lkb's waiting for the same rsb lookup are kept
1399 on the rsb's res_lookup list until the master is verified.
1400
1401 Return values:
1402 0: nodeid is set in rsb/lkb and the caller should go ahead and use it
1403 1: the rsb master is not available and the lkb has been placed on
1404 a wait queue
1405*/
1406
1407static int set_master(struct dlm_rsb *r, struct dlm_lkb *lkb)
1408{
1409 struct dlm_ls *ls = r->res_ls;
1410 int error, dir_nodeid, ret_nodeid, our_nodeid = dlm_our_nodeid();
1411
1412 if (rsb_flag(r, RSB_MASTER_UNCERTAIN)) {
1413 rsb_clear_flag(r, RSB_MASTER_UNCERTAIN);
1414 r->res_first_lkid = lkb->lkb_id;
1415 lkb->lkb_nodeid = r->res_nodeid;
1416 return 0;
1417 }
1418
1419 if (r->res_first_lkid && r->res_first_lkid != lkb->lkb_id) {
1420 list_add_tail(&lkb->lkb_rsb_lookup, &r->res_lookup);
1421 return 1;
1422 }
1423
1424 if (r->res_nodeid == 0) {
1425 lkb->lkb_nodeid = 0;
1426 return 0;
1427 }
1428
1429 if (r->res_nodeid > 0) {
1430 lkb->lkb_nodeid = r->res_nodeid;
1431 return 0;
1432 }
1433
1434 DLM_ASSERT(r->res_nodeid == -1, dlm_print_rsb(r););
1435
1436 dir_nodeid = dlm_dir_nodeid(r);
1437
1438 if (dir_nodeid != our_nodeid) {
1439 r->res_first_lkid = lkb->lkb_id;
1440 send_lookup(r, lkb);
1441 return 1;
1442 }
1443
1444 for (;;) {
1445 /* It's possible for dlm_scand to remove an old rsb for
1446 this same resource from the toss list, us to create
1447 a new one, look up the master locally, and find it
1448 already exists just before dlm_scand does the
1449 dir_remove() on the previous rsb. */
1450
1451 error = dlm_dir_lookup(ls, our_nodeid, r->res_name,
1452 r->res_length, &ret_nodeid);
1453 if (!error)
1454 break;
1455 log_debug(ls, "dir_lookup error %d %s", error, r->res_name);
1456 schedule();
1457 }
1458
1459 if (ret_nodeid == our_nodeid) {
1460 r->res_first_lkid = 0;
1461 r->res_nodeid = 0;
1462 lkb->lkb_nodeid = 0;
1463 } else {
1464 r->res_first_lkid = lkb->lkb_id;
1465 r->res_nodeid = ret_nodeid;
1466 lkb->lkb_nodeid = ret_nodeid;
1467 }
1468 return 0;
1469}
1470
1471static void process_lookup_list(struct dlm_rsb *r)
1472{
1473 struct dlm_lkb *lkb, *safe;
1474
1475 list_for_each_entry_safe(lkb, safe, &r->res_lookup, lkb_rsb_lookup) {
1476 list_del(&lkb->lkb_rsb_lookup);
1477 _request_lock(r, lkb);
1478 schedule();
1479 }
1480}
1481
1482/* confirm_master -- confirm (or deny) an rsb's master nodeid */
1483
1484static void confirm_master(struct dlm_rsb *r, int error)
1485{
1486 struct dlm_lkb *lkb;
1487
1488 if (!r->res_first_lkid)
1489 return;
1490
1491 switch (error) {
1492 case 0:
1493 case -EINPROGRESS:
1494 r->res_first_lkid = 0;
1495 process_lookup_list(r);
1496 break;
1497
1498 case -EAGAIN:
1499 /* the remote master didn't queue our NOQUEUE request;
1500 make a waiting lkb the first_lkid */
1501
1502 r->res_first_lkid = 0;
1503
1504 if (!list_empty(&r->res_lookup)) {
1505 lkb = list_entry(r->res_lookup.next, struct dlm_lkb,
1506 lkb_rsb_lookup);
1507 list_del(&lkb->lkb_rsb_lookup);
1508 r->res_first_lkid = lkb->lkb_id;
1509 _request_lock(r, lkb);
1510 } else
1511 r->res_nodeid = -1;
1512 break;
1513
1514 default:
1515 log_error(r->res_ls, "confirm_master unknown error %d", error);
1516 }
1517}
1518
1519static int set_lock_args(int mode, struct dlm_lksb *lksb, uint32_t flags,
1520 int namelen, uint32_t parent_lkid, void *ast,
1521 void *astarg, void *bast, struct dlm_args *args)
1522{
1523 int rv = -EINVAL;
1524
1525 /* check for invalid arg usage */
1526
1527 if (mode < 0 || mode > DLM_LOCK_EX)
1528 goto out;
1529
1530 if (!(flags & DLM_LKF_CONVERT) && (namelen > DLM_RESNAME_MAXLEN))
1531 goto out;
1532
1533 if (flags & DLM_LKF_CANCEL)
1534 goto out;
1535
1536 if (flags & DLM_LKF_QUECVT && !(flags & DLM_LKF_CONVERT))
1537 goto out;
1538
1539 if (flags & DLM_LKF_CONVDEADLK && !(flags & DLM_LKF_CONVERT))
1540 goto out;
1541
1542 if (flags & DLM_LKF_CONVDEADLK && flags & DLM_LKF_NOQUEUE)
1543 goto out;
1544
1545 if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_CONVERT)
1546 goto out;
1547
1548 if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_QUECVT)
1549 goto out;
1550
1551 if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_NOQUEUE)
1552 goto out;
1553
1554 if (flags & DLM_LKF_EXPEDITE && mode != DLM_LOCK_NL)
1555 goto out;
1556
1557 if (!ast || !lksb)
1558 goto out;
1559
1560 if (flags & DLM_LKF_VALBLK && !lksb->sb_lvbptr)
1561 goto out;
1562
1563 /* parent/child locks not yet supported */
1564 if (parent_lkid)
1565 goto out;
1566
1567 if (flags & DLM_LKF_CONVERT && !lksb->sb_lkid)
1568 goto out;
1569
1570 /* these args will be copied to the lkb in validate_lock_args,
1571 it cannot be done now because when converting locks, fields in
1572 an active lkb cannot be modified before locking the rsb */
1573
1574 args->flags = flags;
1575 args->astaddr = ast;
1576 args->astparam = (long) astarg;
1577 args->bastaddr = bast;
1578 args->mode = mode;
1579 args->lksb = lksb;
1580 rv = 0;
1581 out:
1582 return rv;
1583}
1584
1585static int set_unlock_args(uint32_t flags, void *astarg, struct dlm_args *args)
1586{
1587 if (flags & ~(DLM_LKF_CANCEL | DLM_LKF_VALBLK | DLM_LKF_IVVALBLK |
1588 DLM_LKF_FORCEUNLOCK))
1589 return -EINVAL;
1590
1591 args->flags = flags;
1592 args->astparam = (long) astarg;
1593 return 0;
1594}
1595
1596static int validate_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
1597 struct dlm_args *args)
1598{
1599 int rv = -EINVAL;
1600
1601 if (args->flags & DLM_LKF_CONVERT) {
1602 if (lkb->lkb_flags & DLM_IFL_MSTCPY)
1603 goto out;
1604
1605 if (args->flags & DLM_LKF_QUECVT &&
1606 !__quecvt_compat_matrix[lkb->lkb_grmode+1][args->mode+1])
1607 goto out;
1608
1609 rv = -EBUSY;
1610 if (lkb->lkb_status != DLM_LKSTS_GRANTED)
1611 goto out;
1612
1613 if (lkb->lkb_wait_type)
1614 goto out;
1615 }
1616
1617 lkb->lkb_exflags = args->flags;
1618 lkb->lkb_sbflags = 0;
1619 lkb->lkb_astaddr = args->astaddr;
1620 lkb->lkb_astparam = args->astparam;
1621 lkb->lkb_bastaddr = args->bastaddr;
1622 lkb->lkb_rqmode = args->mode;
1623 lkb->lkb_lksb = args->lksb;
1624 lkb->lkb_lvbptr = args->lksb->sb_lvbptr;
1625 lkb->lkb_ownpid = (int) current->pid;
1626 rv = 0;
1627 out:
1628 return rv;
1629}
1630
1631static int validate_unlock_args(struct dlm_lkb *lkb, struct dlm_args *args)
1632{
1633 int rv = -EINVAL;
1634
1635 if (lkb->lkb_flags & DLM_IFL_MSTCPY)
1636 goto out;
1637
1638 if (args->flags & DLM_LKF_FORCEUNLOCK)
1639 goto out_ok;
1640
1641 if (args->flags & DLM_LKF_CANCEL &&
1642 lkb->lkb_status == DLM_LKSTS_GRANTED)
1643 goto out;
1644
1645 if (!(args->flags & DLM_LKF_CANCEL) &&
1646 lkb->lkb_status != DLM_LKSTS_GRANTED)
1647 goto out;
1648
1649 rv = -EBUSY;
1650 if (lkb->lkb_wait_type)
1651 goto out;
1652
1653 out_ok:
1654 lkb->lkb_exflags = args->flags;
1655 lkb->lkb_sbflags = 0;
1656 lkb->lkb_astparam = args->astparam;
1657
1658 rv = 0;
1659 out:
1660 return rv;
1661}
1662
1663/*
1664 * Four stage 4 varieties:
1665 * do_request(), do_convert(), do_unlock(), do_cancel()
1666 * These are called on the master node for the given lock and
1667 * from the central locking logic.
1668 */
1669
1670static int do_request(struct dlm_rsb *r, struct dlm_lkb *lkb)
1671{
1672 int error = 0;
1673
1674 if (can_be_granted(r, lkb, 1)) {
1675 grant_lock(r, lkb);
1676 queue_cast(r, lkb, 0);
1677 goto out;
1678 }
1679
1680 if (can_be_queued(lkb)) {
1681 error = -EINPROGRESS;
1682 add_lkb(r, lkb, DLM_LKSTS_WAITING);
1683 send_blocking_asts(r, lkb);
1684 goto out;
1685 }
1686
1687 error = -EAGAIN;
1688 if (force_blocking_asts(lkb))
1689 send_blocking_asts_all(r, lkb);
1690 queue_cast(r, lkb, -EAGAIN);
1691
1692 out:
1693 return error;
1694}
1695
1696static int do_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
1697{
1698 int error = 0;
1699
1700 /* changing an existing lock may allow others to be granted */
1701
1702 if (can_be_granted(r, lkb, 1)) {
1703 grant_lock(r, lkb);
1704 queue_cast(r, lkb, 0);
1705 grant_pending_locks(r);
1706 goto out;
1707 }
1708
1709 if (can_be_queued(lkb)) {
1710 if (is_demoted(lkb))
1711 grant_pending_locks(r);
1712 error = -EINPROGRESS;
1713 del_lkb(r, lkb);
1714 add_lkb(r, lkb, DLM_LKSTS_CONVERT);
1715 send_blocking_asts(r, lkb);
1716 goto out;
1717 }
1718
1719 error = -EAGAIN;
1720 if (force_blocking_asts(lkb))
1721 send_blocking_asts_all(r, lkb);
1722 queue_cast(r, lkb, -EAGAIN);
1723
1724 out:
1725 return error;
1726}
1727
1728static int do_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb)
1729{
1730 remove_lock(r, lkb);
1731 queue_cast(r, lkb, -DLM_EUNLOCK);
1732 grant_pending_locks(r);
1733 return -DLM_EUNLOCK;
1734}
1735
1736/* FIXME: if revert_lock() finds that the lkb is granted, we should
1737 skip the queue_cast(ECANCEL). It indicates that the request/convert
1738 completed (and queued a normal ast) just before the cancel; we don't
1739 want to clobber the sb_result for the normal ast with ECANCEL. */
1740
1741static int do_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb)
1742{
1743 revert_lock(r, lkb);
1744 queue_cast(r, lkb, -DLM_ECANCEL);
1745 grant_pending_locks(r);
1746 return -DLM_ECANCEL;
1747}
1748
1749/*
1750 * Four stage 3 varieties:
1751 * _request_lock(), _convert_lock(), _unlock_lock(), _cancel_lock()
1752 */
1753
/*
 * Add a new lkb to a possibly new rsb; called by the requesting process.
 *
 * set_master() sets the lkb nodeid from r.  A negative return from it is
 * passed back as the error; a positive return ends the call here with 0.
 * Otherwise the request is sent to the remote node (whose
 * receive_request() calls do_request()) or handled locally by do_request().
 */
static int _request_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
	int rv = set_master(r, lkb);

	if (rv < 0)
		return rv;
	if (rv > 0)
		return 0;

	return is_remote(r) ? send_request(r, lkb) : do_request(r, lkb);
}
1778
/*
 * Change some property of an existing lkb, e.g. mode.  When the rsb is
 * remote the conversion is sent out (receive_convert() calls do_convert()
 * on the remote node); otherwise do_convert() runs here.
 */
static int _convert_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
	if (is_remote(r))
		return send_convert(r, lkb);

	return do_convert(r, lkb);
}
1793
/*
 * Remove an existing lkb from the granted queue.  When the rsb is remote
 * the unlock is sent out (receive_unlock() calls do_unlock() on the remote
 * node); otherwise do_unlock() runs here.
 */
static int _unlock_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
	if (is_remote(r))
		return send_unlock(r, lkb);

	return do_unlock(r, lkb);
}
1808
/*
 * Remove an existing lkb from the convert or wait queue.  When the rsb is
 * remote the cancel is sent out (receive_cancel() calls do_cancel() on the
 * remote node); otherwise do_cancel() runs here.
 */
static int _cancel_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
	if (is_remote(r))
		return send_cancel(r, lkb);

	return do_cancel(r, lkb);
}
1823
1824/*
1825 * Four stage 2 varieties:
1826 * request_lock(), convert_lock(), unlock_lock(), cancel_lock()
1827 */
1828
1829static int request_lock(struct dlm_ls *ls, struct dlm_lkb *lkb, char *name,
1830 int len, struct dlm_args *args)
1831{
1832 struct dlm_rsb *r;
1833 int error;
1834
1835 error = validate_lock_args(ls, lkb, args);
1836 if (error)
1837 goto out;
1838
1839 error = find_rsb(ls, name, len, R_CREATE, &r);
1840 if (error)
1841 goto out;
1842
1843 lock_rsb(r);
1844
1845 attach_lkb(r, lkb);
1846 lkb->lkb_lksb->sb_lkid = lkb->lkb_id;
1847
1848 error = _request_lock(r, lkb);
1849
1850 unlock_rsb(r);
1851 put_rsb(r);
1852
1853 out:
1854 return error;
1855}
1856
1857static int convert_lock(struct dlm_ls *ls, struct dlm_lkb *lkb,
1858 struct dlm_args *args)
1859{
1860 struct dlm_rsb *r;
1861 int error;
1862
1863 r = lkb->lkb_resource;
1864
1865 hold_rsb(r);
1866 lock_rsb(r);
1867
1868 error = validate_lock_args(ls, lkb, args);
1869 if (error)
1870 goto out;
1871
1872 error = _convert_lock(r, lkb);
1873 out:
1874 unlock_rsb(r);
1875 put_rsb(r);
1876 return error;
1877}
1878
1879static int unlock_lock(struct dlm_ls *ls, struct dlm_lkb *lkb,
1880 struct dlm_args *args)
1881{
1882 struct dlm_rsb *r;
1883 int error;
1884
1885 r = lkb->lkb_resource;
1886
1887 hold_rsb(r);
1888 lock_rsb(r);
1889
1890 error = validate_unlock_args(lkb, args);
1891 if (error)
1892 goto out;
1893
1894 error = _unlock_lock(r, lkb);
1895 out:
1896 unlock_rsb(r);
1897 put_rsb(r);
1898 return error;
1899}
1900
1901static int cancel_lock(struct dlm_ls *ls, struct dlm_lkb *lkb,
1902 struct dlm_args *args)
1903{
1904 struct dlm_rsb *r;
1905 int error;
1906
1907 r = lkb->lkb_resource;
1908
1909 hold_rsb(r);
1910 lock_rsb(r);
1911
1912 error = validate_unlock_args(lkb, args);
1913 if (error)
1914 goto out;
1915
1916 error = _cancel_lock(r, lkb);
1917 out:
1918 unlock_rsb(r);
1919 put_rsb(r);
1920 return error;
1921}
1922
1923/*
1924 * Two stage 1 varieties: dlm_lock() and dlm_unlock()
1925 */
1926
1927int dlm_lock(dlm_lockspace_t *lockspace,
1928 int mode,
1929 struct dlm_lksb *lksb,
1930 uint32_t flags,
1931 void *name,
1932 unsigned int namelen,
1933 uint32_t parent_lkid,
1934 void (*ast) (void *astarg),
1935 void *astarg,
1936 void (*bast) (void *astarg, int mode))
1937{
1938 struct dlm_ls *ls;
1939 struct dlm_lkb *lkb;
1940 struct dlm_args args;
1941 int error, convert = flags & DLM_LKF_CONVERT;
1942
1943 ls = dlm_find_lockspace_local(lockspace);
1944 if (!ls)
1945 return -EINVAL;
1946
1947 lock_recovery(ls);
1948
1949 if (convert)
1950 error = find_lkb(ls, lksb->sb_lkid, &lkb);
1951 else
1952 error = create_lkb(ls, &lkb);
1953
1954 if (error)
1955 goto out;
1956
1957 error = set_lock_args(mode, lksb, flags, namelen, parent_lkid, ast,
1958 astarg, bast, &args);
1959 if (error)
1960 goto out_put;
1961
1962 if (convert)
1963 error = convert_lock(ls, lkb, &args);
1964 else
1965 error = request_lock(ls, lkb, name, namelen, &args);
1966
1967 if (error == -EINPROGRESS)
1968 error = 0;
1969 out_put:
1970 if (convert || error)
1971 __put_lkb(ls, lkb);
1972 if (error == -EAGAIN)
1973 error = 0;
1974 out:
1975 unlock_recovery(ls);
1976 dlm_put_lockspace(ls);
1977 return error;
1978}
1979
1980int dlm_unlock(dlm_lockspace_t *lockspace,
1981 uint32_t lkid,
1982 uint32_t flags,
1983 struct dlm_lksb *lksb,
1984 void *astarg)
1985{
1986 struct dlm_ls *ls;
1987 struct dlm_lkb *lkb;
1988 struct dlm_args args;
1989 int error;
1990
1991 ls = dlm_find_lockspace_local(lockspace);
1992 if (!ls)
1993 return -EINVAL;
1994
1995 lock_recovery(ls);
1996
1997 error = find_lkb(ls, lkid, &lkb);
1998 if (error)
1999 goto out;
2000
2001 error = set_unlock_args(flags, astarg, &args);
2002 if (error)
2003 goto out_put;
2004
2005 if (flags & DLM_LKF_CANCEL)
2006 error = cancel_lock(ls, lkb, &args);
2007 else
2008 error = unlock_lock(ls, lkb, &args);
2009
2010 if (error == -DLM_EUNLOCK || error == -DLM_ECANCEL)
2011 error = 0;
2012 out_put:
2013 dlm_put_lkb(lkb);
2014 out:
2015 unlock_recovery(ls);
2016 dlm_put_lockspace(ls);
2017 return error;
2018}
2019
2020/*
2021 * send/receive routines for remote operations and replies
2022 *
2023 * send_args
2024 * send_common
2025 * send_request receive_request
2026 * send_convert receive_convert
2027 * send_unlock receive_unlock
2028 * send_cancel receive_cancel
2029 * send_grant receive_grant
2030 * send_bast receive_bast
2031 * send_lookup receive_lookup
2032 * send_remove receive_remove
2033 *
2034 * send_common_reply
2035 * receive_request_reply send_request_reply
2036 * receive_convert_reply send_convert_reply
2037 * receive_unlock_reply send_unlock_reply
2038 * receive_cancel_reply send_cancel_reply
2039 * receive_lookup_reply send_lookup_reply
2040 */
2041
2042static int create_message(struct dlm_rsb *r, struct dlm_lkb *lkb,
2043 int to_nodeid, int mstype,
2044 struct dlm_message **ms_ret,
2045 struct dlm_mhandle **mh_ret)
2046{
2047 struct dlm_message *ms;
2048 struct dlm_mhandle *mh;
2049 char *mb;
2050 int mb_len = sizeof(struct dlm_message);
2051
2052 switch (mstype) {
2053 case DLM_MSG_REQUEST:
2054 case DLM_MSG_LOOKUP:
2055 case DLM_MSG_REMOVE:
2056 mb_len += r->res_length;
2057 break;
2058 case DLM_MSG_CONVERT:
2059 case DLM_MSG_UNLOCK:
2060 case DLM_MSG_REQUEST_REPLY:
2061 case DLM_MSG_CONVERT_REPLY:
2062 case DLM_MSG_GRANT:
2063 if (lkb && lkb->lkb_lvbptr)
2064 mb_len += r->res_ls->ls_lvblen;
2065 break;
2066 }
2067
2068 /* get_buffer gives us a message handle (mh) that we need to
2069 pass into lowcomms_commit and a message buffer (mb) that we
2070 write our data into */
2071
2072 mh = dlm_lowcomms_get_buffer(to_nodeid, mb_len, GFP_KERNEL, &mb);
2073 if (!mh)
2074 return -ENOBUFS;
2075
2076 memset(mb, 0, mb_len);
2077
2078 ms = (struct dlm_message *) mb;
2079
2080 ms->m_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
2081 ms->m_header.h_lockspace = r->res_ls->ls_global_id;
2082 ms->m_header.h_nodeid = dlm_our_nodeid();
2083 ms->m_header.h_length = mb_len;
2084 ms->m_header.h_cmd = DLM_MSG;
2085
2086 ms->m_type = mstype;
2087
2088 *mh_ret = mh;
2089 *ms_ret = ms;
2090 return 0;
2091}
2092
/*
 * Run dlm_message_out() on @ms and commit the buffer behind handle @mh.
 *
 * Always returns 0 today; further lowcomms enhancements or alternate
 * implementations may make the return value useful at some point.
 */
static int send_message(struct dlm_mhandle *mh, struct dlm_message *ms)
{
	dlm_message_out(ms);
	dlm_lowcomms_commit_buffer(mh);

	return 0;
}
2102
2103static void send_args(struct dlm_rsb *r, struct dlm_lkb *lkb,
2104 struct dlm_message *ms)
2105{
2106 ms->m_nodeid = lkb->lkb_nodeid;
2107 ms->m_pid = lkb->lkb_ownpid;
2108 ms->m_lkid = lkb->lkb_id;
2109 ms->m_remid = lkb->lkb_remid;
2110 ms->m_exflags = lkb->lkb_exflags;
2111 ms->m_sbflags = lkb->lkb_sbflags;
2112 ms->m_flags = lkb->lkb_flags;
2113 ms->m_lvbseq = lkb->lkb_lvbseq;
2114 ms->m_status = lkb->lkb_status;
2115 ms->m_grmode = lkb->lkb_grmode;
2116 ms->m_rqmode = lkb->lkb_rqmode;
2117 ms->m_hash = r->res_hash;
2118
2119 /* m_result and m_bastmode are set from function args,
2120 not from lkb fields */
2121
2122 if (lkb->lkb_bastaddr)
2123 ms->m_asts |= AST_BAST;
2124 if (lkb->lkb_astaddr)
2125 ms->m_asts |= AST_COMP;
2126
2127 if (ms->m_type == DLM_MSG_REQUEST || ms->m_type == DLM_MSG_LOOKUP)
2128 memcpy(ms->m_extra, r->res_name, r->res_length);
2129
2130 else if (lkb->lkb_lvbptr)
2131 memcpy(ms->m_extra, lkb->lkb_lvbptr, r->res_ls->ls_lvblen);
2132
2133}
2134
2135static int send_common(struct dlm_rsb *r, struct dlm_lkb *lkb, int mstype)
2136{
2137 struct dlm_message *ms;
2138 struct dlm_mhandle *mh;
2139 int to_nodeid, error;
2140
2141 add_to_waiters(lkb, mstype);
2142
2143 to_nodeid = r->res_nodeid;
2144
2145 error = create_message(r, lkb, to_nodeid, mstype, &ms, &mh);
2146 if (error)
2147 goto fail;
2148
2149 send_args(r, lkb, ms);
2150
2151 error = send_message(mh, ms);
2152 if (error)
2153 goto fail;
2154 return 0;
2155
2156 fail:
2157 remove_from_waiters(lkb);
2158 return error;
2159}
2160
2161static int send_request(struct dlm_rsb *r, struct dlm_lkb *lkb)
2162{
2163 return send_common(r, lkb, DLM_MSG_REQUEST);
2164}
2165
2166static int send_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
2167{
2168 int error;
2169
2170 error = send_common(r, lkb, DLM_MSG_CONVERT);
2171
2172 /* down conversions go without a reply from the master */
2173 if (!error && down_conversion(lkb)) {
2174 remove_from_waiters(lkb);
2175 r->res_ls->ls_stub_ms.m_result = 0;
2176 __receive_convert_reply(r, lkb, &r->res_ls->ls_stub_ms);
2177 }
2178
2179 return error;
2180}
2181
2182/* FIXME: if this lkb is the only lock we hold on the rsb, then set
2183 MASTER_UNCERTAIN to force the next request on the rsb to confirm
2184 that the master is still correct. */
2185
2186static int send_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb)
2187{
2188 return send_common(r, lkb, DLM_MSG_UNLOCK);
2189}
2190
2191static int send_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb)
2192{
2193 return send_common(r, lkb, DLM_MSG_CANCEL);
2194}
2195
2196static int send_grant(struct dlm_rsb *r, struct dlm_lkb *lkb)
2197{
2198 struct dlm_message *ms;
2199 struct dlm_mhandle *mh;
2200 int to_nodeid, error;
2201
2202 to_nodeid = lkb->lkb_nodeid;
2203
2204 error = create_message(r, lkb, to_nodeid, DLM_MSG_GRANT, &ms, &mh);
2205 if (error)
2206 goto out;
2207
2208 send_args(r, lkb, ms);
2209
2210 ms->m_result = 0;
2211
2212 error = send_message(mh, ms);
2213 out:
2214 return error;
2215}
2216
2217static int send_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int mode)
2218{
2219 struct dlm_message *ms;
2220 struct dlm_mhandle *mh;
2221 int to_nodeid, error;
2222
2223 to_nodeid = lkb->lkb_nodeid;
2224
2225 error = create_message(r, NULL, to_nodeid, DLM_MSG_BAST, &ms, &mh);
2226 if (error)
2227 goto out;
2228
2229 send_args(r, lkb, ms);
2230
2231 ms->m_bastmode = mode;
2232
2233 error = send_message(mh, ms);
2234 out:
2235 return error;
2236}
2237
2238static int send_lookup(struct dlm_rsb *r, struct dlm_lkb *lkb)
2239{
2240 struct dlm_message *ms;
2241 struct dlm_mhandle *mh;
2242 int to_nodeid, error;
2243
2244 add_to_waiters(lkb, DLM_MSG_LOOKUP);
2245
2246 to_nodeid = dlm_dir_nodeid(r);
2247
2248 error = create_message(r, NULL, to_nodeid, DLM_MSG_LOOKUP, &ms, &mh);
2249 if (error)
2250 goto fail;
2251
2252 send_args(r, lkb, ms);
2253
2254 error = send_message(mh, ms);
2255 if (error)
2256 goto fail;
2257 return 0;
2258
2259 fail:
2260 remove_from_waiters(lkb);
2261 return error;
2262}
2263
2264static int send_remove(struct dlm_rsb *r)
2265{
2266 struct dlm_message *ms;
2267 struct dlm_mhandle *mh;
2268 int to_nodeid, error;
2269
2270 to_nodeid = dlm_dir_nodeid(r);
2271
2272 error = create_message(r, NULL, to_nodeid, DLM_MSG_REMOVE, &ms, &mh);
2273 if (error)
2274 goto out;
2275
2276 memcpy(ms->m_extra, r->res_name, r->res_length);
2277 ms->m_hash = r->res_hash;
2278
2279 error = send_message(mh, ms);
2280 out:
2281 return error;
2282}
2283
2284static int send_common_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
2285 int mstype, int rv)
2286{
2287 struct dlm_message *ms;
2288 struct dlm_mhandle *mh;
2289 int to_nodeid, error;
2290
2291 to_nodeid = lkb->lkb_nodeid;
2292
2293 error = create_message(r, lkb, to_nodeid, mstype, &ms, &mh);
2294 if (error)
2295 goto out;
2296
2297 send_args(r, lkb, ms);
2298
2299 ms->m_result = rv;
2300
2301 error = send_message(mh, ms);
2302 out:
2303 return error;
2304}
2305
2306static int send_request_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
2307{
2308 return send_common_reply(r, lkb, DLM_MSG_REQUEST_REPLY, rv);
2309}
2310
2311static int send_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
2312{
2313 return send_common_reply(r, lkb, DLM_MSG_CONVERT_REPLY, rv);
2314}
2315
2316static int send_unlock_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
2317{
2318 return send_common_reply(r, lkb, DLM_MSG_UNLOCK_REPLY, rv);
2319}
2320
2321static int send_cancel_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
2322{
2323 return send_common_reply(r, lkb, DLM_MSG_CANCEL_REPLY, rv);
2324}
2325
2326static int send_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms_in,
2327 int ret_nodeid, int rv)
2328{
2329 struct dlm_rsb *r = &ls->ls_stub_rsb;
2330 struct dlm_message *ms;
2331 struct dlm_mhandle *mh;
2332 int error, nodeid = ms_in->m_header.h_nodeid;
2333
2334 error = create_message(r, NULL, nodeid, DLM_MSG_LOOKUP_REPLY, &ms, &mh);
2335 if (error)
2336 goto out;
2337
2338 ms->m_lkid = ms_in->m_lkid;
2339 ms->m_result = rv;
2340 ms->m_nodeid = ret_nodeid;
2341
2342 error = send_message(mh, ms);
2343 out:
2344 return error;
2345}
2346
2347/* which args we save from a received message depends heavily on the type
2348 of message, unlike the send side where we can safely send everything about
2349 the lkb for any type of message */
2350
2351static void receive_flags(struct dlm_lkb *lkb, struct dlm_message *ms)
2352{
2353 lkb->lkb_exflags = ms->m_exflags;
2354 lkb->lkb_flags = (lkb->lkb_flags & 0xFFFF0000) |
2355 (ms->m_flags & 0x0000FFFF);
2356}
2357
2358static void receive_flags_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
2359{
2360 lkb->lkb_sbflags = ms->m_sbflags;
2361 lkb->lkb_flags = (lkb->lkb_flags & 0xFFFF0000) |
2362 (ms->m_flags & 0x0000FFFF);
2363}
2364
2365static int receive_extralen(struct dlm_message *ms)
2366{
2367 return (ms->m_header.h_length - sizeof(struct dlm_message));
2368}
2369
2370static int receive_lvb(struct dlm_ls *ls, struct dlm_lkb *lkb,
2371 struct dlm_message *ms)
2372{
2373 int len;
2374
2375 if (lkb->lkb_exflags & DLM_LKF_VALBLK) {
2376 if (!lkb->lkb_lvbptr)
2377 lkb->lkb_lvbptr = allocate_lvb(ls);
2378 if (!lkb->lkb_lvbptr)
2379 return -ENOMEM;
2380 len = receive_extralen(ms);
2381 memcpy(lkb->lkb_lvbptr, ms->m_extra, len);
2382 }
2383 return 0;
2384}
2385
2386static int receive_request_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
2387 struct dlm_message *ms)
2388{
2389 lkb->lkb_nodeid = ms->m_header.h_nodeid;
2390 lkb->lkb_ownpid = ms->m_pid;
2391 lkb->lkb_remid = ms->m_lkid;
2392 lkb->lkb_grmode = DLM_LOCK_IV;
2393 lkb->lkb_rqmode = ms->m_rqmode;
2394 lkb->lkb_bastaddr = (void *) (long) (ms->m_asts & AST_BAST);
2395 lkb->lkb_astaddr = (void *) (long) (ms->m_asts & AST_COMP);
2396
2397 DLM_ASSERT(is_master_copy(lkb), dlm_print_lkb(lkb););
2398
2399 if (receive_lvb(ls, lkb, ms))
2400 return -ENOMEM;
2401
2402 return 0;
2403}
2404
2405static int receive_convert_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
2406 struct dlm_message *ms)
2407{
2408 if (lkb->lkb_nodeid != ms->m_header.h_nodeid) {
2409 log_error(ls, "convert_args nodeid %d %d lkid %x %x",
2410 lkb->lkb_nodeid, ms->m_header.h_nodeid,
2411 lkb->lkb_id, lkb->lkb_remid);
2412 return -EINVAL;
2413 }
2414
2415 if (!is_master_copy(lkb))
2416 return -EINVAL;
2417
2418 if (lkb->lkb_status != DLM_LKSTS_GRANTED)
2419 return -EBUSY;
2420
2421 if (receive_lvb(ls, lkb, ms))
2422 return -ENOMEM;
2423
2424 lkb->lkb_rqmode = ms->m_rqmode;
2425 lkb->lkb_lvbseq = ms->m_lvbseq;
2426
2427 return 0;
2428}
2429
/*
 * Validate a received unlock against @lkb: -EINVAL unless the lkb is a
 * master copy, -ENOMEM when the lvb cannot be received, otherwise 0.
 */
static int receive_unlock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
			       struct dlm_message *ms)
{
	if (!is_master_copy(lkb))
		return -EINVAL;

	return receive_lvb(ls, lkb, ms) ? -ENOMEM : 0;
}
2439
2440/* We fill in the stub-lkb fields with the info that send_xxxx_reply()
2441 uses to send a reply and that the remote end uses to process the reply. */
2442
2443static void setup_stub_lkb(struct dlm_ls *ls, struct dlm_message *ms)
2444{
2445 struct dlm_lkb *lkb = &ls->ls_stub_lkb;
2446 lkb->lkb_nodeid = ms->m_header.h_nodeid;
2447 lkb->lkb_remid = ms->m_lkid;
2448}
2449
2450static void receive_request(struct dlm_ls *ls, struct dlm_message *ms)
2451{
2452 struct dlm_lkb *lkb;
2453 struct dlm_rsb *r;
2454 int error, namelen;
2455
2456 error = create_lkb(ls, &lkb);
2457 if (error)
2458 goto fail;
2459
2460 receive_flags(lkb, ms);
2461 lkb->lkb_flags |= DLM_IFL_MSTCPY;
2462 error = receive_request_args(ls, lkb, ms);
2463 if (error) {
2464 __put_lkb(ls, lkb);
2465 goto fail;
2466 }
2467
2468 namelen = receive_extralen(ms);
2469
2470 error = find_rsb(ls, ms->m_extra, namelen, R_MASTER, &r);
2471 if (error) {
2472 __put_lkb(ls, lkb);
2473 goto fail;
2474 }
2475
2476 lock_rsb(r);
2477
2478 attach_lkb(r, lkb);
2479 error = do_request(r, lkb);
2480 send_request_reply(r, lkb, error);
2481
2482 unlock_rsb(r);
2483 put_rsb(r);
2484
2485 if (error == -EINPROGRESS)
2486 error = 0;
2487 if (error)
2488 dlm_put_lkb(lkb);
2489 return;
2490
2491 fail:
2492 setup_stub_lkb(ls, ms);
2493 send_request_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
2494}
2495
2496static void receive_convert(struct dlm_ls *ls, struct dlm_message *ms)
2497{
2498 struct dlm_lkb *lkb;
2499 struct dlm_rsb *r;
2500 int error, reply = 1;
2501
2502 error = find_lkb(ls, ms->m_remid, &lkb);
2503 if (error)
2504 goto fail;
2505
2506 r = lkb->lkb_resource;
2507
2508 hold_rsb(r);
2509 lock_rsb(r);
2510
2511 receive_flags(lkb, ms);
2512 error = receive_convert_args(ls, lkb, ms);
2513 if (error)
2514 goto out;
2515 reply = !down_conversion(lkb);
2516
2517 error = do_convert(r, lkb);
2518 out:
2519 if (reply)
2520 send_convert_reply(r, lkb, error);
2521
2522 unlock_rsb(r);
2523 put_rsb(r);
2524 dlm_put_lkb(lkb);
2525 return;
2526
2527 fail:
2528 setup_stub_lkb(ls, ms);
2529 send_convert_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
2530}
2531
2532static void receive_unlock(struct dlm_ls *ls, struct dlm_message *ms)
2533{
2534 struct dlm_lkb *lkb;
2535 struct dlm_rsb *r;
2536 int error;
2537
2538 error = find_lkb(ls, ms->m_remid, &lkb);
2539 if (error)
2540 goto fail;
2541
2542 r = lkb->lkb_resource;
2543
2544 hold_rsb(r);
2545 lock_rsb(r);
2546
2547 receive_flags(lkb, ms);
2548 error = receive_unlock_args(ls, lkb, ms);
2549 if (error)
2550 goto out;
2551
2552 error = do_unlock(r, lkb);
2553 out:
2554 send_unlock_reply(r, lkb, error);
2555
2556 unlock_rsb(r);
2557 put_rsb(r);
2558 dlm_put_lkb(lkb);
2559 return;
2560
2561 fail:
2562 setup_stub_lkb(ls, ms);
2563 send_unlock_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
2564}
2565
2566static void receive_cancel(struct dlm_ls *ls, struct dlm_message *ms)
2567{
2568 struct dlm_lkb *lkb;
2569 struct dlm_rsb *r;
2570 int error;
2571
2572 error = find_lkb(ls, ms->m_remid, &lkb);
2573 if (error)
2574 goto fail;
2575
2576 receive_flags(lkb, ms);
2577
2578 r = lkb->lkb_resource;
2579
2580 hold_rsb(r);
2581 lock_rsb(r);
2582
2583 error = do_cancel(r, lkb);
2584 send_cancel_reply(r, lkb, error);
2585
2586 unlock_rsb(r);
2587 put_rsb(r);
2588 dlm_put_lkb(lkb);
2589 return;
2590
2591 fail:
2592 setup_stub_lkb(ls, ms);
2593 send_cancel_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
2594}
2595
2596static void receive_grant(struct dlm_ls *ls, struct dlm_message *ms)
2597{
2598 struct dlm_lkb *lkb;
2599 struct dlm_rsb *r;
2600 int error;
2601
2602 error = find_lkb(ls, ms->m_remid, &lkb);
2603 if (error) {
2604 log_error(ls, "receive_grant no lkb");
2605 return;
2606 }
2607 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
2608
2609 r = lkb->lkb_resource;
2610
2611 hold_rsb(r);
2612 lock_rsb(r);
2613
2614 receive_flags_reply(lkb, ms);
2615 grant_lock_pc(r, lkb, ms);
2616 queue_cast(r, lkb, 0);
2617
2618 unlock_rsb(r);
2619 put_rsb(r);
2620 dlm_put_lkb(lkb);
2621}
2622
2623static void receive_bast(struct dlm_ls *ls, struct dlm_message *ms)
2624{
2625 struct dlm_lkb *lkb;
2626 struct dlm_rsb *r;
2627 int error;
2628
2629 error = find_lkb(ls, ms->m_remid, &lkb);
2630 if (error) {
2631 log_error(ls, "receive_bast no lkb");
2632 return;
2633 }
2634 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
2635
2636 r = lkb->lkb_resource;
2637
2638 hold_rsb(r);
2639 lock_rsb(r);
2640
2641 queue_bast(r, lkb, ms->m_bastmode);
2642
2643 unlock_rsb(r);
2644 put_rsb(r);
2645 dlm_put_lkb(lkb);
2646}
2647
2648static void receive_lookup(struct dlm_ls *ls, struct dlm_message *ms)
2649{
2650 int len, error, ret_nodeid, dir_nodeid, from_nodeid, our_nodeid;
2651
2652 from_nodeid = ms->m_header.h_nodeid;
2653 our_nodeid = dlm_our_nodeid();
2654
2655 len = receive_extralen(ms);
2656
2657 dir_nodeid = dlm_hash2nodeid(ls, ms->m_hash);
2658 if (dir_nodeid != our_nodeid) {
2659 log_error(ls, "lookup dir_nodeid %d from %d",
2660 dir_nodeid, from_nodeid);
2661 error = -EINVAL;
2662 ret_nodeid = -1;
2663 goto out;
2664 }
2665
2666 error = dlm_dir_lookup(ls, from_nodeid, ms->m_extra, len, &ret_nodeid);
2667
2668 /* Optimization: we're master so treat lookup as a request */
2669 if (!error && ret_nodeid == our_nodeid) {
2670 receive_request(ls, ms);
2671 return;
2672 }
2673 out:
2674 send_lookup_reply(ls, ms, ret_nodeid, error);
2675}
2676
2677static void receive_remove(struct dlm_ls *ls, struct dlm_message *ms)
2678{
2679 int len, dir_nodeid, from_nodeid;
2680
2681 from_nodeid = ms->m_header.h_nodeid;
2682
2683 len = receive_extralen(ms);
2684
2685 dir_nodeid = dlm_hash2nodeid(ls, ms->m_hash);
2686 if (dir_nodeid != dlm_our_nodeid()) {
2687 log_error(ls, "remove dir entry dir_nodeid %d from %d",
2688 dir_nodeid, from_nodeid);
2689 return;
2690 }
2691
2692 dlm_dir_remove_entry(ls, from_nodeid, ms->m_extra, len);
2693}
2694
2695static void receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms)
2696{
2697 struct dlm_lkb *lkb;
2698 struct dlm_rsb *r;
2699 int error, mstype;
2700
2701 error = find_lkb(ls, ms->m_remid, &lkb);
2702 if (error) {
2703 log_error(ls, "receive_request_reply no lkb");
2704 return;
2705 }
2706 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
2707
2708 mstype = lkb->lkb_wait_type;
2709 error = remove_from_waiters(lkb);
2710 if (error) {
2711 log_error(ls, "receive_request_reply not on waiters");
2712 goto out;
2713 }
2714
2715 /* this is the value returned from do_request() on the master */
2716 error = ms->m_result;
2717
2718 r = lkb->lkb_resource;
2719 hold_rsb(r);
2720 lock_rsb(r);
2721
2722 /* Optimization: the dir node was also the master, so it took our
2723 lookup as a request and sent request reply instead of lookup reply */
2724 if (mstype == DLM_MSG_LOOKUP) {
2725 r->res_nodeid = ms->m_header.h_nodeid;
2726 lkb->lkb_nodeid = r->res_nodeid;
2727 }
2728
2729 switch (error) {
2730 case -EAGAIN:
2731 /* request would block (be queued) on remote master;
2732 the unhold undoes the original ref from create_lkb()
2733 so it leads to the lkb being freed */
2734 queue_cast(r, lkb, -EAGAIN);
2735 confirm_master(r, -EAGAIN);
2736 unhold_lkb(lkb);
2737 break;
2738
2739 case -EINPROGRESS:
2740 case 0:
2741 /* request was queued or granted on remote master */
2742 receive_flags_reply(lkb, ms);
2743 lkb->lkb_remid = ms->m_lkid;
2744 if (error)
2745 add_lkb(r, lkb, DLM_LKSTS_WAITING);
2746 else {
2747 grant_lock_pc(r, lkb, ms);
2748 queue_cast(r, lkb, 0);
2749 }
2750 confirm_master(r, error);
2751 break;
2752
2753 case -EBADR:
2754 case -ENOTBLK:
2755 /* find_rsb failed to find rsb or rsb wasn't master */
2756 r->res_nodeid = -1;
2757 lkb->lkb_nodeid = -1;
2758 _request_lock(r, lkb);
2759 break;
2760
2761 default:
2762 log_error(ls, "receive_request_reply error %d", error);
2763 }
2764
2765 unlock_rsb(r);
2766 put_rsb(r);
2767 out:
2768 dlm_put_lkb(lkb);
2769}
2770
2771static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
2772 struct dlm_message *ms)
2773{
2774 int error = ms->m_result;
2775
2776 /* this is the value returned from do_convert() on the master */
2777
2778 switch (error) {
2779 case -EAGAIN:
2780 /* convert would block (be queued) on remote master */
2781 queue_cast(r, lkb, -EAGAIN);
2782 break;
2783
2784 case -EINPROGRESS:
2785 /* convert was queued on remote master */
2786 del_lkb(r, lkb);
2787 add_lkb(r, lkb, DLM_LKSTS_CONVERT);
2788 break;
2789
2790 case 0:
2791 /* convert was granted on remote master */
2792 receive_flags_reply(lkb, ms);
2793 grant_lock_pc(r, lkb, ms);
2794 queue_cast(r, lkb, 0);
2795 break;
2796
2797 default:
2798 log_error(r->res_ls, "receive_convert_reply error %d", error);
2799 }
2800}
2801
2802static void _receive_convert_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
2803{
2804 struct dlm_rsb *r = lkb->lkb_resource;
2805
2806 hold_rsb(r);
2807 lock_rsb(r);
2808
2809 __receive_convert_reply(r, lkb, ms);
2810
2811 unlock_rsb(r);
2812 put_rsb(r);
2813}
2814
2815static void receive_convert_reply(struct dlm_ls *ls, struct dlm_message *ms)
2816{
2817 struct dlm_lkb *lkb;
2818 int error;
2819
2820 error = find_lkb(ls, ms->m_remid, &lkb);
2821 if (error) {
2822 log_error(ls, "receive_convert_reply no lkb");
2823 return;
2824 }
2825 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
2826
2827 error = remove_from_waiters(lkb);
2828 if (error) {
2829 log_error(ls, "receive_convert_reply not on waiters");
2830 goto out;
2831 }
2832
2833 _receive_convert_reply(lkb, ms);
2834 out:
2835 dlm_put_lkb(lkb);
2836}
2837
2838static void _receive_unlock_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
2839{
2840 struct dlm_rsb *r = lkb->lkb_resource;
2841 int error = ms->m_result;
2842
2843 hold_rsb(r);
2844 lock_rsb(r);
2845
2846 /* this is the value returned from do_unlock() on the master */
2847
2848 switch (error) {
2849 case -DLM_EUNLOCK:
2850 receive_flags_reply(lkb, ms);
2851 remove_lock_pc(r, lkb);
2852 queue_cast(r, lkb, -DLM_EUNLOCK);
2853 break;
2854 default:
2855 log_error(r->res_ls, "receive_unlock_reply error %d", error);
2856 }
2857
2858 unlock_rsb(r);
2859 put_rsb(r);
2860}
2861
2862static void receive_unlock_reply(struct dlm_ls *ls, struct dlm_message *ms)
2863{
2864 struct dlm_lkb *lkb;
2865 int error;
2866
2867 error = find_lkb(ls, ms->m_remid, &lkb);
2868 if (error) {
2869 log_error(ls, "receive_unlock_reply no lkb");
2870 return;
2871 }
2872 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
2873
2874 error = remove_from_waiters(lkb);
2875 if (error) {
2876 log_error(ls, "receive_unlock_reply not on waiters");
2877 goto out;
2878 }
2879
2880 _receive_unlock_reply(lkb, ms);
2881 out:
2882 dlm_put_lkb(lkb);
2883}
2884
2885static void _receive_cancel_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
2886{
2887 struct dlm_rsb *r = lkb->lkb_resource;
2888 int error = ms->m_result;
2889
2890 hold_rsb(r);
2891 lock_rsb(r);
2892
2893 /* this is the value returned from do_cancel() on the master */
2894
2895 switch (error) {
2896 case -DLM_ECANCEL:
2897 receive_flags_reply(lkb, ms);
2898 revert_lock_pc(r, lkb);
2899 queue_cast(r, lkb, -DLM_ECANCEL);
2900 break;
2901 default:
2902 log_error(r->res_ls, "receive_cancel_reply error %d", error);
2903 }
2904
2905 unlock_rsb(r);
2906 put_rsb(r);
2907}
2908
2909static void receive_cancel_reply(struct dlm_ls *ls, struct dlm_message *ms)
2910{
2911 struct dlm_lkb *lkb;
2912 int error;
2913
2914 error = find_lkb(ls, ms->m_remid, &lkb);
2915 if (error) {
2916 log_error(ls, "receive_cancel_reply no lkb");
2917 return;
2918 }
2919 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
2920
2921 error = remove_from_waiters(lkb);
2922 if (error) {
2923 log_error(ls, "receive_cancel_reply not on waiters");
2924 goto out;
2925 }
2926
2927 _receive_cancel_reply(lkb, ms);
2928 out:
2929 dlm_put_lkb(lkb);
2930}
2931
2932static void receive_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms)
2933{
2934 struct dlm_lkb *lkb;
2935 struct dlm_rsb *r;
2936 int error, ret_nodeid;
2937
2938 error = find_lkb(ls, ms->m_lkid, &lkb);
2939 if (error) {
2940 log_error(ls, "receive_lookup_reply no lkb");
2941 return;
2942 }
2943
2944 error = remove_from_waiters(lkb);
2945 if (error) {
2946 log_error(ls, "receive_lookup_reply not on waiters");
2947 goto out;
2948 }
2949
2950 /* this is the value returned by dlm_dir_lookup on dir node
2951 FIXME: will a non-zero error ever be returned? */
2952 error = ms->m_result;
2953
2954 r = lkb->lkb_resource;
2955 hold_rsb(r);
2956 lock_rsb(r);
2957
2958 ret_nodeid = ms->m_nodeid;
2959 if (ret_nodeid == dlm_our_nodeid()) {
2960 r->res_nodeid = 0;
2961 ret_nodeid = 0;
2962 r->res_first_lkid = 0;
2963 } else {
2964 /* set_master() will copy res_nodeid to lkb_nodeid */
2965 r->res_nodeid = ret_nodeid;
2966 }
2967
2968 _request_lock(r, lkb);
2969
2970 if (!ret_nodeid)
2971 process_lookup_list(r);
2972
2973 unlock_rsb(r);
2974 put_rsb(r);
2975 out:
2976 dlm_put_lkb(lkb);
2977}
2978
2979int dlm_receive_message(struct dlm_header *hd, int nodeid, int recovery)
2980{
2981 struct dlm_message *ms = (struct dlm_message *) hd;
2982 struct dlm_ls *ls;
2983 int error;
2984
2985 if (!recovery)
2986 dlm_message_in(ms);
2987
2988 ls = dlm_find_lockspace_global(hd->h_lockspace);
2989 if (!ls) {
2990 log_print("drop message %d from %d for unknown lockspace %d",
2991 ms->m_type, nodeid, hd->h_lockspace);
2992 return -EINVAL;
2993 }
2994
2995 /* recovery may have just ended leaving a bunch of backed-up requests
2996 in the requestqueue; wait while dlm_recoverd clears them */
2997
2998 if (!recovery)
2999 dlm_wait_requestqueue(ls);
3000
3001 /* recovery may have just started while there were a bunch of
3002 in-flight requests -- save them in requestqueue to be processed
3003 after recovery. we can't let dlm_recvd block on the recovery
3004 lock. if dlm_recoverd is calling this function to clear the
3005 requestqueue, it needs to be interrupted (-EINTR) if another
3006 recovery operation is starting. */
3007
3008 while (1) {
3009 if (dlm_locking_stopped(ls)) {
3010 if (!recovery)
3011 dlm_add_requestqueue(ls, nodeid, hd);
3012 error = -EINTR;
3013 goto out;
3014 }
3015
3016 if (lock_recovery_try(ls))
3017 break;
3018 schedule();
3019 }
3020
3021 switch (ms->m_type) {
3022
3023 /* messages sent to a master node */
3024
3025 case DLM_MSG_REQUEST:
3026 receive_request(ls, ms);
3027 break;
3028
3029 case DLM_MSG_CONVERT:
3030 receive_convert(ls, ms);
3031 break;
3032
3033 case DLM_MSG_UNLOCK:
3034 receive_unlock(ls, ms);
3035 break;
3036
3037 case DLM_MSG_CANCEL:
3038 receive_cancel(ls, ms);
3039 break;
3040
3041 /* messages sent from a master node (replies to above) */
3042
3043 case DLM_MSG_REQUEST_REPLY:
3044 receive_request_reply(ls, ms);
3045 break;
3046
3047 case DLM_MSG_CONVERT_REPLY:
3048 receive_convert_reply(ls, ms);
3049 break;
3050
3051 case DLM_MSG_UNLOCK_REPLY:
3052 receive_unlock_reply(ls, ms);
3053 break;
3054
3055 case DLM_MSG_CANCEL_REPLY:
3056 receive_cancel_reply(ls, ms);
3057 break;
3058
3059 /* messages sent from a master node (only two types of async msg) */
3060
3061 case DLM_MSG_GRANT:
3062 receive_grant(ls, ms);
3063 break;
3064
3065 case DLM_MSG_BAST:
3066 receive_bast(ls, ms);
3067 break;
3068
3069 /* messages sent to a dir node */
3070
3071 case DLM_MSG_LOOKUP:
3072 receive_lookup(ls, ms);
3073 break;
3074
3075 case DLM_MSG_REMOVE:
3076 receive_remove(ls, ms);
3077 break;
3078
3079 /* messages sent from a dir node (remove has no reply) */
3080
3081 case DLM_MSG_LOOKUP_REPLY:
3082 receive_lookup_reply(ls, ms);
3083 break;
3084
3085 default:
3086 log_error(ls, "unknown message type %d", ms->m_type);
3087 }
3088
3089 unlock_recovery(ls);
3090 out:
3091 dlm_put_lockspace(ls);
3092 dlm_astd_wake();
3093 return 0;
3094}
3095
3096
3097/*
3098 * Recovery related
3099 */
3100
3101static void recover_convert_waiter(struct dlm_ls *ls, struct dlm_lkb *lkb)
3102{
3103 if (middle_conversion(lkb)) {
3104 hold_lkb(lkb);
3105 ls->ls_stub_ms.m_result = -EINPROGRESS;
3106 _remove_from_waiters(lkb);
3107 _receive_convert_reply(lkb, &ls->ls_stub_ms);
3108
3109 /* Same special case as in receive_rcom_lock_args() */
3110 lkb->lkb_grmode = DLM_LOCK_IV;
3111 rsb_set_flag(lkb->lkb_resource, RSB_RECOVER_CONVERT);
3112 unhold_lkb(lkb);
3113
3114 } else if (lkb->lkb_rqmode >= lkb->lkb_grmode) {
3115 lkb->lkb_flags |= DLM_IFL_RESEND;
3116 }
3117
3118 /* lkb->lkb_rqmode < lkb->lkb_grmode shouldn't happen since down
3119 conversions are async; there's no reply from the remote master */
3120}
3121
3122/* A waiting lkb needs recovery if the master node has failed, or
3123 the master node is changing (only when no directory is used) */
3124
3125static int waiter_needs_recovery(struct dlm_ls *ls, struct dlm_lkb *lkb)
3126{
3127 if (dlm_is_removed(ls, lkb->lkb_nodeid))
3128 return 1;
3129
3130 if (!dlm_no_directory(ls))
3131 return 0;
3132
3133 if (dlm_dir_nodeid(lkb->lkb_resource) != lkb->lkb_nodeid)
3134 return 1;
3135
3136 return 0;
3137}
3138
3139/* Recovery for locks that are waiting for replies from nodes that are now
3140 gone. We can just complete unlocks and cancels by faking a reply from the
3141 dead node. Requests and up-conversions we flag to be resent after
3142 recovery. Down-conversions can just be completed with a fake reply like
3143 unlocks. Conversions between PR and CW need special attention. */
3144
3145void dlm_recover_waiters_pre(struct dlm_ls *ls)
3146{
3147 struct dlm_lkb *lkb, *safe;
3148
3149 mutex_lock(&ls->ls_waiters_mutex);
3150
3151 list_for_each_entry_safe(lkb, safe, &ls->ls_waiters, lkb_wait_reply) {
3152 log_debug(ls, "pre recover waiter lkid %x type %d flags %x",
3153 lkb->lkb_id, lkb->lkb_wait_type, lkb->lkb_flags);
3154
3155 /* all outstanding lookups, regardless of destination will be
3156 resent after recovery is done */
3157
3158 if (lkb->lkb_wait_type == DLM_MSG_LOOKUP) {
3159 lkb->lkb_flags |= DLM_IFL_RESEND;
3160 continue;
3161 }
3162
3163 if (!waiter_needs_recovery(ls, lkb))
3164 continue;
3165
3166 switch (lkb->lkb_wait_type) {
3167
3168 case DLM_MSG_REQUEST:
3169 lkb->lkb_flags |= DLM_IFL_RESEND;
3170 break;
3171
3172 case DLM_MSG_CONVERT:
3173 recover_convert_waiter(ls, lkb);
3174 break;
3175
3176 case DLM_MSG_UNLOCK:
3177 hold_lkb(lkb);
3178 ls->ls_stub_ms.m_result = -DLM_EUNLOCK;
3179 _remove_from_waiters(lkb);
3180 _receive_unlock_reply(lkb, &ls->ls_stub_ms);
3181 dlm_put_lkb(lkb);
3182 break;
3183
3184 case DLM_MSG_CANCEL:
3185 hold_lkb(lkb);
3186 ls->ls_stub_ms.m_result = -DLM_ECANCEL;
3187 _remove_from_waiters(lkb);
3188 _receive_cancel_reply(lkb, &ls->ls_stub_ms);
3189 dlm_put_lkb(lkb);
3190 break;
3191
3192 default:
3193 log_error(ls, "invalid lkb wait_type %d",
3194 lkb->lkb_wait_type);
3195 }
3196 schedule();
3197 }
3198 mutex_unlock(&ls->ls_waiters_mutex);
3199}
3200
3201static int remove_resend_waiter(struct dlm_ls *ls, struct dlm_lkb **lkb_ret)
3202{
3203 struct dlm_lkb *lkb;
3204 int rv = 0;
3205
3206 mutex_lock(&ls->ls_waiters_mutex);
3207 list_for_each_entry(lkb, &ls->ls_waiters, lkb_wait_reply) {
3208 if (lkb->lkb_flags & DLM_IFL_RESEND) {
3209 rv = lkb->lkb_wait_type;
3210 _remove_from_waiters(lkb);
3211 lkb->lkb_flags &= ~DLM_IFL_RESEND;
3212 break;
3213 }
3214 }
3215 mutex_unlock(&ls->ls_waiters_mutex);
3216
3217 if (!rv)
3218 lkb = NULL;
3219 *lkb_ret = lkb;
3220 return rv;
3221}
3222
3223/* Deal with lookups and lkb's marked RESEND from _pre. We may now be the
3224 master or dir-node for r. Processing the lkb may result in it being placed
3225 back on waiters. */
3226
3227int dlm_recover_waiters_post(struct dlm_ls *ls)
3228{
3229 struct dlm_lkb *lkb;
3230 struct dlm_rsb *r;
3231 int error = 0, mstype;
3232
3233 while (1) {
3234 if (dlm_locking_stopped(ls)) {
3235 log_debug(ls, "recover_waiters_post aborted");
3236 error = -EINTR;
3237 break;
3238 }
3239
3240 mstype = remove_resend_waiter(ls, &lkb);
3241 if (!mstype)
3242 break;
3243
3244 r = lkb->lkb_resource;
3245
3246 log_debug(ls, "recover_waiters_post %x type %d flags %x %s",
3247 lkb->lkb_id, mstype, lkb->lkb_flags, r->res_name);
3248
3249 switch (mstype) {
3250
3251 case DLM_MSG_LOOKUP:
3252 hold_rsb(r);
3253 lock_rsb(r);
3254 _request_lock(r, lkb);
3255 if (is_master(r))
3256 confirm_master(r, 0);
3257 unlock_rsb(r);
3258 put_rsb(r);
3259 break;
3260
3261 case DLM_MSG_REQUEST:
3262 hold_rsb(r);
3263 lock_rsb(r);
3264 _request_lock(r, lkb);
3265 unlock_rsb(r);
3266 put_rsb(r);
3267 break;
3268
3269 case DLM_MSG_CONVERT:
3270 hold_rsb(r);
3271 lock_rsb(r);
3272 _convert_lock(r, lkb);
3273 unlock_rsb(r);
3274 put_rsb(r);
3275 break;
3276
3277 default:
3278 log_error(ls, "recover_waiters_post type %d", mstype);
3279 }
3280 }
3281
3282 return error;
3283}
3284
3285static void purge_queue(struct dlm_rsb *r, struct list_head *queue,
3286 int (*test)(struct dlm_ls *ls, struct dlm_lkb *lkb))
3287{
3288 struct dlm_ls *ls = r->res_ls;
3289 struct dlm_lkb *lkb, *safe;
3290
3291 list_for_each_entry_safe(lkb, safe, queue, lkb_statequeue) {
3292 if (test(ls, lkb)) {
3293 rsb_set_flag(r, RSB_LOCKS_PURGED);
3294 del_lkb(r, lkb);
3295 /* this put should free the lkb */
3296 if (!dlm_put_lkb(lkb))
3297 log_error(ls, "purged lkb not released");
3298 }
3299 }
3300}
3301
3302static int purge_dead_test(struct dlm_ls *ls, struct dlm_lkb *lkb)
3303{
3304 return (is_master_copy(lkb) && dlm_is_removed(ls, lkb->lkb_nodeid));
3305}
3306
/* purge_queue() predicate: any master-copy lkb; @ls is not consulted. */
static int purge_mstcpy_test(struct dlm_ls *ls, struct dlm_lkb *lkb)
{
	int mstcpy = is_master_copy(lkb);

	return mstcpy;
}
3311
3312static void purge_dead_locks(struct dlm_rsb *r)
3313{
3314 purge_queue(r, &r->res_grantqueue, &purge_dead_test);
3315 purge_queue(r, &r->res_convertqueue, &purge_dead_test);
3316 purge_queue(r, &r->res_waitqueue, &purge_dead_test);
3317}
3318
3319void dlm_purge_mstcpy_locks(struct dlm_rsb *r)
3320{
3321 purge_queue(r, &r->res_grantqueue, &purge_mstcpy_test);
3322 purge_queue(r, &r->res_convertqueue, &purge_mstcpy_test);
3323 purge_queue(r, &r->res_waitqueue, &purge_mstcpy_test);
3324}
3325
3326/* Get rid of locks held by nodes that are gone. */
3327
3328int dlm_purge_locks(struct dlm_ls *ls)
3329{
3330 struct dlm_rsb *r;
3331
3332 log_debug(ls, "dlm_purge_locks");
3333
3334 down_write(&ls->ls_root_sem);
3335 list_for_each_entry(r, &ls->ls_root_list, res_root_list) {
3336 hold_rsb(r);
3337 lock_rsb(r);
3338 if (is_master(r))
3339 purge_dead_locks(r);
3340 unlock_rsb(r);
3341 unhold_rsb(r);
3342
3343 schedule();
3344 }
3345 up_write(&ls->ls_root_sem);
3346
3347 return 0;
3348}
3349
3350static struct dlm_rsb *find_purged_rsb(struct dlm_ls *ls, int bucket)
3351{
3352 struct dlm_rsb *r, *r_ret = NULL;
3353
3354 read_lock(&ls->ls_rsbtbl[bucket].lock);
3355 list_for_each_entry(r, &ls->ls_rsbtbl[bucket].list, res_hashchain) {
3356 if (!rsb_flag(r, RSB_LOCKS_PURGED))
3357 continue;
3358 hold_rsb(r);
3359 rsb_clear_flag(r, RSB_LOCKS_PURGED);
3360 r_ret = r;
3361 break;
3362 }
3363 read_unlock(&ls->ls_rsbtbl[bucket].lock);
3364 return r_ret;
3365}
3366
3367void dlm_grant_after_purge(struct dlm_ls *ls)
3368{
3369 struct dlm_rsb *r;
3370 int bucket = 0;
3371
3372 while (1) {
3373 r = find_purged_rsb(ls, bucket);
3374 if (!r) {
3375 if (bucket == ls->ls_rsbtbl_size - 1)
3376 break;
3377 bucket++;
3378 continue;
3379 }
3380 lock_rsb(r);
3381 if (is_master(r)) {
3382 grant_pending_locks(r);
3383 confirm_master(r, 0);
3384 }
3385 unlock_rsb(r);
3386 put_rsb(r);
3387 schedule();
3388 }
3389}
3390
3391static struct dlm_lkb *search_remid_list(struct list_head *head, int nodeid,
3392 uint32_t remid)
3393{
3394 struct dlm_lkb *lkb;
3395
3396 list_for_each_entry(lkb, head, lkb_statequeue) {
3397 if (lkb->lkb_nodeid == nodeid && lkb->lkb_remid == remid)
3398 return lkb;
3399 }
3400 return NULL;
3401}
3402
3403static struct dlm_lkb *search_remid(struct dlm_rsb *r, int nodeid,
3404 uint32_t remid)
3405{
3406 struct dlm_lkb *lkb;
3407
3408 lkb = search_remid_list(&r->res_grantqueue, nodeid, remid);
3409 if (lkb)
3410 return lkb;
3411 lkb = search_remid_list(&r->res_convertqueue, nodeid, remid);
3412 if (lkb)
3413 return lkb;
3414 lkb = search_remid_list(&r->res_waitqueue, nodeid, remid);
3415 if (lkb)
3416 return lkb;
3417 return NULL;
3418}
3419
3420static int receive_rcom_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
3421 struct dlm_rsb *r, struct dlm_rcom *rc)
3422{
3423 struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf;
3424 int lvblen;
3425
3426 lkb->lkb_nodeid = rc->rc_header.h_nodeid;
3427 lkb->lkb_ownpid = rl->rl_ownpid;
3428 lkb->lkb_remid = rl->rl_lkid;
3429 lkb->lkb_exflags = rl->rl_exflags;
3430 lkb->lkb_flags = rl->rl_flags & 0x0000FFFF;
3431 lkb->lkb_flags |= DLM_IFL_MSTCPY;
3432 lkb->lkb_lvbseq = rl->rl_lvbseq;
3433 lkb->lkb_rqmode = rl->rl_rqmode;
3434 lkb->lkb_grmode = rl->rl_grmode;
3435 /* don't set lkb_status because add_lkb wants to itself */
3436
3437 lkb->lkb_bastaddr = (void *) (long) (rl->rl_asts & AST_BAST);
3438 lkb->lkb_astaddr = (void *) (long) (rl->rl_asts & AST_COMP);
3439
3440 if (lkb->lkb_exflags & DLM_LKF_VALBLK) {
3441 lkb->lkb_lvbptr = allocate_lvb(ls);
3442 if (!lkb->lkb_lvbptr)
3443 return -ENOMEM;
3444 lvblen = rc->rc_header.h_length - sizeof(struct dlm_rcom) -
3445 sizeof(struct rcom_lock);
3446 memcpy(lkb->lkb_lvbptr, rl->rl_lvb, lvblen);
3447 }
3448
3449 /* Conversions between PR and CW (middle modes) need special handling.
3450 The real granted mode of these converting locks cannot be determined
3451 until all locks have been rebuilt on the rsb (recover_conversion) */
3452
3453 if (rl->rl_wait_type == DLM_MSG_CONVERT && middle_conversion(lkb)) {
3454 rl->rl_status = DLM_LKSTS_CONVERT;
3455 lkb->lkb_grmode = DLM_LOCK_IV;
3456 rsb_set_flag(r, RSB_RECOVER_CONVERT);
3457 }
3458
3459 return 0;
3460}
3461
3462/* This lkb may have been recovered in a previous aborted recovery so we need
3463 to check if the rsb already has an lkb with the given remote nodeid/lkid.
3464 If so we just send back a standard reply. If not, we create a new lkb with
3465 the given values and send back our lkid. We send back our lkid by sending
3466 back the rcom_lock struct we got but with the remid field filled in. */
3467
3468int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc)
3469{
3470 struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf;
3471 struct dlm_rsb *r;
3472 struct dlm_lkb *lkb;
3473 int error;
3474
3475 if (rl->rl_parent_lkid) {
3476 error = -EOPNOTSUPP;
3477 goto out;
3478 }
3479
3480 error = find_rsb(ls, rl->rl_name, rl->rl_namelen, R_MASTER, &r);
3481 if (error)
3482 goto out;
3483
3484 lock_rsb(r);
3485
3486 lkb = search_remid(r, rc->rc_header.h_nodeid, rl->rl_lkid);
3487 if (lkb) {
3488 error = -EEXIST;
3489 goto out_remid;
3490 }
3491
3492 error = create_lkb(ls, &lkb);
3493 if (error)
3494 goto out_unlock;
3495
3496 error = receive_rcom_lock_args(ls, lkb, r, rc);
3497 if (error) {
3498 __put_lkb(ls, lkb);
3499 goto out_unlock;
3500 }
3501
3502 attach_lkb(r, lkb);
3503 add_lkb(r, lkb, rl->rl_status);
3504 error = 0;
3505
3506 out_remid:
3507 /* this is the new value returned to the lock holder for
3508 saving in its process-copy lkb */
3509 rl->rl_remid = lkb->lkb_id;
3510
3511 out_unlock:
3512 unlock_rsb(r);
3513 put_rsb(r);
3514 out:
3515 if (error)
3516 log_print("recover_master_copy %d %x", error, rl->rl_lkid);
3517 rl->rl_result = error;
3518 return error;
3519}
3520
3521int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc)
3522{
3523 struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf;
3524 struct dlm_rsb *r;
3525 struct dlm_lkb *lkb;
3526 int error;
3527
3528 error = find_lkb(ls, rl->rl_lkid, &lkb);
3529 if (error) {
3530 log_error(ls, "recover_process_copy no lkid %x", rl->rl_lkid);
3531 return error;
3532 }
3533
3534 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
3535
3536 error = rl->rl_result;
3537
3538 r = lkb->lkb_resource;
3539 hold_rsb(r);
3540 lock_rsb(r);
3541
3542 switch (error) {
3543 case -EEXIST:
3544 log_debug(ls, "master copy exists %x", lkb->lkb_id);
3545 /* fall through */
3546 case 0:
3547 lkb->lkb_remid = rl->rl_remid;
3548 break;
3549 default:
3550 log_error(ls, "dlm_recover_process_copy unknown error %d %x",
3551 error, lkb->lkb_id);
3552 }
3553
3554 /* an ack for dlm_recover_locks() which waits for replies from
3555 all the locks it sends to new masters */
3556 dlm_recovered_lock(r);
3557
3558 unlock_rsb(r);
3559 put_rsb(r);
3560 dlm_put_lkb(lkb);
3561
3562 return 0;
3563}
3564
3565int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua,
3566 int mode, uint32_t flags, void *name, unsigned int namelen,
3567 uint32_t parent_lkid)
3568{
3569 struct dlm_lkb *lkb;
3570 struct dlm_args args;
3571 int error;
3572
3573 lock_recovery(ls);
3574
3575 error = create_lkb(ls, &lkb);
3576 if (error) {
3577 kfree(ua);
3578 goto out;
3579 }
3580
3581 if (flags & DLM_LKF_VALBLK) {
3582 ua->lksb.sb_lvbptr = kmalloc(DLM_USER_LVB_LEN, GFP_KERNEL);
3583 if (!ua->lksb.sb_lvbptr) {
3584 kfree(ua);
3585 __put_lkb(ls, lkb);
3586 error = -ENOMEM;
3587 goto out;
3588 }
3589 }
3590
3591 /* After ua is attached to lkb it will be freed by free_lkb().
3592 When DLM_IFL_USER is set, the dlm knows that this is a userspace
3593 lock and that lkb_astparam is the dlm_user_args structure. */
3594
3595 error = set_lock_args(mode, &ua->lksb, flags, namelen, parent_lkid,
3596 FAKE_USER_AST, ua, FAKE_USER_AST, &args);
3597 lkb->lkb_flags |= DLM_IFL_USER;
3598 ua->old_mode = DLM_LOCK_IV;
3599
3600 if (error) {
3601 __put_lkb(ls, lkb);
3602 goto out;
3603 }
3604
3605 error = request_lock(ls, lkb, name, namelen, &args);
3606
3607 switch (error) {
3608 case 0:
3609 break;
3610 case -EINPROGRESS:
3611 error = 0;
3612 break;
3613 case -EAGAIN:
3614 error = 0;
3615 /* fall through */
3616 default:
3617 __put_lkb(ls, lkb);
3618 goto out;
3619 }
3620
3621 /* add this new lkb to the per-process list of locks */
3622 spin_lock(&ua->proc->locks_spin);
3623 kref_get(&lkb->lkb_ref);
3624 list_add_tail(&lkb->lkb_ownqueue, &ua->proc->locks);
3625 spin_unlock(&ua->proc->locks_spin);
3626 out:
3627 unlock_recovery(ls);
3628 return error;
3629}
3630
3631int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
3632 int mode, uint32_t flags, uint32_t lkid, char *lvb_in)
3633{
3634 struct dlm_lkb *lkb;
3635 struct dlm_args args;
3636 struct dlm_user_args *ua;
3637 int error;
3638
3639 lock_recovery(ls);
3640
3641 error = find_lkb(ls, lkid, &lkb);
3642 if (error)
3643 goto out;
3644
3645 /* user can change the params on its lock when it converts it, or
3646 add an lvb that didn't exist before */
3647
3648 ua = (struct dlm_user_args *)lkb->lkb_astparam;
3649
3650 if (flags & DLM_LKF_VALBLK && !ua->lksb.sb_lvbptr) {
3651 ua->lksb.sb_lvbptr = kmalloc(DLM_USER_LVB_LEN, GFP_KERNEL);
3652 if (!ua->lksb.sb_lvbptr) {
3653 error = -ENOMEM;
3654 goto out_put;
3655 }
3656 }
3657 if (lvb_in && ua->lksb.sb_lvbptr)
3658 memcpy(ua->lksb.sb_lvbptr, lvb_in, DLM_USER_LVB_LEN);
3659
3660 ua->castparam = ua_tmp->castparam;
3661 ua->castaddr = ua_tmp->castaddr;
3662 ua->bastparam = ua_tmp->bastparam;
3663 ua->bastaddr = ua_tmp->bastaddr;
3664 ua->old_mode = lkb->lkb_grmode;
3665
3666 error = set_lock_args(mode, &ua->lksb, flags, 0, 0, FAKE_USER_AST, ua,
3667 FAKE_USER_AST, &args);
3668 if (error)
3669 goto out_put;
3670
3671 error = convert_lock(ls, lkb, &args);
3672
3673 if (error == -EINPROGRESS || error == -EAGAIN)
3674 error = 0;
3675 out_put:
3676 dlm_put_lkb(lkb);
3677 out:
3678 unlock_recovery(ls);
3679 kfree(ua_tmp);
3680 return error;
3681}
3682
3683int dlm_user_unlock(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
3684 uint32_t flags, uint32_t lkid, char *lvb_in)
3685{
3686 struct dlm_lkb *lkb;
3687 struct dlm_args args;
3688 struct dlm_user_args *ua;
3689 int error;
3690
3691 lock_recovery(ls);
3692
3693 error = find_lkb(ls, lkid, &lkb);
3694 if (error)
3695 goto out;
3696
3697 ua = (struct dlm_user_args *)lkb->lkb_astparam;
3698
3699 if (lvb_in && ua->lksb.sb_lvbptr)
3700 memcpy(ua->lksb.sb_lvbptr, lvb_in, DLM_USER_LVB_LEN);
3701 ua->castparam = ua_tmp->castparam;
3702
3703 error = set_unlock_args(flags, ua, &args);
3704 if (error)
3705 goto out_put;
3706
3707 error = unlock_lock(ls, lkb, &args);
3708
3709 if (error == -DLM_EUNLOCK)
3710 error = 0;
3711 if (error)
3712 goto out_put;
3713
3714 spin_lock(&ua->proc->locks_spin);
3715 list_del_init(&lkb->lkb_ownqueue);
3716 spin_unlock(&ua->proc->locks_spin);
3717
3718 /* this removes the reference for the proc->locks list added by
3719 dlm_user_request */
3720 unhold_lkb(lkb);
3721 out_put:
3722 dlm_put_lkb(lkb);
3723 out:
3724 unlock_recovery(ls);
3725 return error;
3726}
3727
3728int dlm_user_cancel(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
3729 uint32_t flags, uint32_t lkid)
3730{
3731 struct dlm_lkb *lkb;
3732 struct dlm_args args;
3733 struct dlm_user_args *ua;
3734 int error;
3735
3736 lock_recovery(ls);
3737
3738 error = find_lkb(ls, lkid, &lkb);
3739 if (error)
3740 goto out;
3741
3742 ua = (struct dlm_user_args *)lkb->lkb_astparam;
3743 ua->castparam = ua_tmp->castparam;
3744
3745 error = set_unlock_args(flags, ua, &args);
3746 if (error)
3747 goto out_put;
3748
3749 error = cancel_lock(ls, lkb, &args);
3750
3751 if (error == -DLM_ECANCEL)
3752 error = 0;
3753 if (error)
3754 goto out_put;
3755
3756 /* this lkb was removed from the WAITING queue */
3757 if (lkb->lkb_grmode == DLM_LOCK_IV) {
3758 spin_lock(&ua->proc->locks_spin);
3759 list_del_init(&lkb->lkb_ownqueue);
3760 spin_unlock(&ua->proc->locks_spin);
3761 unhold_lkb(lkb);
3762 }
3763 out_put:
3764 dlm_put_lkb(lkb);
3765 out:
3766 unlock_recovery(ls);
3767 return error;
3768}
3769
3770static int orphan_proc_lock(struct dlm_ls *ls, struct dlm_lkb *lkb)
3771{
3772 struct dlm_user_args *ua = (struct dlm_user_args *)lkb->lkb_astparam;
3773
3774 if (ua->lksb.sb_lvbptr)
3775 kfree(ua->lksb.sb_lvbptr);
3776 kfree(ua);
3777 lkb->lkb_astparam = (long)NULL;
3778
3779 /* TODO: propogate to master if needed */
3780 return 0;
3781}
3782
3783/* The force flag allows the unlock to go ahead even if the lkb isn't granted.
3784 Regardless of what rsb queue the lock is on, it's removed and freed. */
3785
3786static int unlock_proc_lock(struct dlm_ls *ls, struct dlm_lkb *lkb)
3787{
3788 struct dlm_user_args *ua = (struct dlm_user_args *)lkb->lkb_astparam;
3789 struct dlm_args args;
3790 int error;
3791
3792 /* FIXME: we need to handle the case where the lkb is in limbo
3793 while the rsb is being looked up, currently we assert in
3794 _unlock_lock/is_remote because rsb nodeid is -1. */
3795
3796 set_unlock_args(DLM_LKF_FORCEUNLOCK, ua, &args);
3797
3798 error = unlock_lock(ls, lkb, &args);
3799 if (error == -DLM_EUNLOCK)
3800 error = 0;
3801 return error;
3802}
3803
3804/* The ls_clear_proc_locks mutex protects against dlm_user_add_asts() which
3805 1) references lkb->ua which we free here and 2) adds lkbs to proc->asts,
3806 which we clear here. */
3807
3808/* proc CLOSING flag is set so no more device_reads should look at proc->asts
3809 list, and no more device_writes should add lkb's to proc->locks list; so we
3810 shouldn't need to take asts_spin or locks_spin here. this assumes that
3811 device reads/writes/closes are serialized -- FIXME: we may need to serialize
3812 them ourself. */
3813
3814void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc)
3815{
3816 struct dlm_lkb *lkb, *safe;
3817
3818 lock_recovery(ls);
3819 mutex_lock(&ls->ls_clear_proc_locks);
3820
3821 list_for_each_entry_safe(lkb, safe, &proc->locks, lkb_ownqueue) {
3822 if (lkb->lkb_ast_type) {
3823 list_del(&lkb->lkb_astqueue);
3824 unhold_lkb(lkb);
3825 }
3826
3827 list_del_init(&lkb->lkb_ownqueue);
3828
3829 if (lkb->lkb_exflags & DLM_LKF_PERSISTENT) {
3830 lkb->lkb_flags |= DLM_IFL_ORPHAN;
3831 orphan_proc_lock(ls, lkb);
3832 } else {
3833 lkb->lkb_flags |= DLM_IFL_DEAD;
3834 unlock_proc_lock(ls, lkb);
3835 }
3836
3837 /* this removes the reference for the proc->locks list
3838 added by dlm_user_request, it may result in the lkb
3839 being freed */
3840
3841 dlm_put_lkb(lkb);
3842 }
3843 mutex_unlock(&ls->ls_clear_proc_locks);
3844 unlock_recovery(ls);
3845}
diff --git a/fs/dlm/lock.h b/fs/dlm/lock.h
new file mode 100644
index 000000000000..8d2660f0ab10
--- /dev/null
+++ b/fs/dlm/lock.h
@@ -0,0 +1,61 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13#ifndef __LOCK_DOT_H__
14#define __LOCK_DOT_H__
15
16void dlm_print_rsb(struct dlm_rsb *r);
17void dlm_print_lkb(struct dlm_lkb *lkb);
18int dlm_receive_message(struct dlm_header *hd, int nodeid, int recovery);
19int dlm_modes_compat(int mode1, int mode2);
20int dlm_find_rsb(struct dlm_ls *ls, char *name, int namelen,
21 unsigned int flags, struct dlm_rsb **r_ret);
22void dlm_put_rsb(struct dlm_rsb *r);
23void dlm_hold_rsb(struct dlm_rsb *r);
24int dlm_put_lkb(struct dlm_lkb *lkb);
25void dlm_scan_rsbs(struct dlm_ls *ls);
26
27int dlm_purge_locks(struct dlm_ls *ls);
28void dlm_purge_mstcpy_locks(struct dlm_rsb *r);
29void dlm_grant_after_purge(struct dlm_ls *ls);
30int dlm_recover_waiters_post(struct dlm_ls *ls);
31void dlm_recover_waiters_pre(struct dlm_ls *ls);
32int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc);
33int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc);
34
35int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, int mode,
36 uint32_t flags, void *name, unsigned int namelen, uint32_t parent_lkid);
37int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
38 int mode, uint32_t flags, uint32_t lkid, char *lvb_in);
39int dlm_user_unlock(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
40 uint32_t flags, uint32_t lkid, char *lvb_in);
41int dlm_user_cancel(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
42 uint32_t flags, uint32_t lkid);
43void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc);
44
45static inline int is_master(struct dlm_rsb *r)
46{
47 return !r->res_nodeid;
48}
49
50static inline void lock_rsb(struct dlm_rsb *r)
51{
52 mutex_lock(&r->res_mutex);
53}
54
55static inline void unlock_rsb(struct dlm_rsb *r)
56{
57 mutex_unlock(&r->res_mutex);
58}
59
60#endif
61
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
new file mode 100644
index 000000000000..31ed0fe16a31
--- /dev/null
+++ b/fs/dlm/lockspace.c
@@ -0,0 +1,705 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#include "dlm_internal.h"
15#include "lockspace.h"
16#include "member.h"
17#include "recoverd.h"
18#include "ast.h"
19#include "dir.h"
20#include "lowcomms.h"
21#include "config.h"
22#include "memory.h"
23#include "lock.h"
24#include "recover.h"
25
26#ifdef CONFIG_DLM_DEBUG
27int dlm_create_debug_file(struct dlm_ls *ls);
28void dlm_delete_debug_file(struct dlm_ls *ls);
29#else
30static inline int dlm_create_debug_file(struct dlm_ls *ls) { return 0; }
31static inline void dlm_delete_debug_file(struct dlm_ls *ls) { }
32#endif
33
34static int ls_count;
35static struct mutex ls_lock;
36static struct list_head lslist;
37static spinlock_t lslist_lock;
38static struct task_struct * scand_task;
39
40
41static ssize_t dlm_control_store(struct dlm_ls *ls, const char *buf, size_t len)
42{
43 ssize_t ret = len;
44 int n = simple_strtol(buf, NULL, 0);
45
46 switch (n) {
47 case 0:
48 dlm_ls_stop(ls);
49 break;
50 case 1:
51 dlm_ls_start(ls);
52 break;
53 default:
54 ret = -EINVAL;
55 }
56 return ret;
57}
58
59static ssize_t dlm_event_store(struct dlm_ls *ls, const char *buf, size_t len)
60{
61 ls->ls_uevent_result = simple_strtol(buf, NULL, 0);
62 set_bit(LSFL_UEVENT_WAIT, &ls->ls_flags);
63 wake_up(&ls->ls_uevent_wait);
64 return len;
65}
66
67static ssize_t dlm_id_show(struct dlm_ls *ls, char *buf)
68{
69 return sprintf(buf, "%u\n", ls->ls_global_id);
70}
71
72static ssize_t dlm_id_store(struct dlm_ls *ls, const char *buf, size_t len)
73{
74 ls->ls_global_id = simple_strtoul(buf, NULL, 0);
75 return len;
76}
77
78static ssize_t dlm_recover_status_show(struct dlm_ls *ls, char *buf)
79{
80 uint32_t status = dlm_recover_status(ls);
81 return sprintf(buf, "%x\n", status);
82}
83
84struct dlm_attr {
85 struct attribute attr;
86 ssize_t (*show)(struct dlm_ls *, char *);
87 ssize_t (*store)(struct dlm_ls *, const char *, size_t);
88};
89
90static struct dlm_attr dlm_attr_control = {
91 .attr = {.name = "control", .mode = S_IWUSR},
92 .store = dlm_control_store
93};
94
95static struct dlm_attr dlm_attr_event = {
96 .attr = {.name = "event_done", .mode = S_IWUSR},
97 .store = dlm_event_store
98};
99
100static struct dlm_attr dlm_attr_id = {
101 .attr = {.name = "id", .mode = S_IRUGO | S_IWUSR},
102 .show = dlm_id_show,
103 .store = dlm_id_store
104};
105
106static struct dlm_attr dlm_attr_recover_status = {
107 .attr = {.name = "recover_status", .mode = S_IRUGO},
108 .show = dlm_recover_status_show
109};
110
111static struct attribute *dlm_attrs[] = {
112 &dlm_attr_control.attr,
113 &dlm_attr_event.attr,
114 &dlm_attr_id.attr,
115 &dlm_attr_recover_status.attr,
116 NULL,
117};
118
119static ssize_t dlm_attr_show(struct kobject *kobj, struct attribute *attr,
120 char *buf)
121{
122 struct dlm_ls *ls = container_of(kobj, struct dlm_ls, ls_kobj);
123 struct dlm_attr *a = container_of(attr, struct dlm_attr, attr);
124 return a->show ? a->show(ls, buf) : 0;
125}
126
127static ssize_t dlm_attr_store(struct kobject *kobj, struct attribute *attr,
128 const char *buf, size_t len)
129{
130 struct dlm_ls *ls = container_of(kobj, struct dlm_ls, ls_kobj);
131 struct dlm_attr *a = container_of(attr, struct dlm_attr, attr);
132 return a->store ? a->store(ls, buf, len) : len;
133}
134
135static struct sysfs_ops dlm_attr_ops = {
136 .show = dlm_attr_show,
137 .store = dlm_attr_store,
138};
139
140static struct kobj_type dlm_ktype = {
141 .default_attrs = dlm_attrs,
142 .sysfs_ops = &dlm_attr_ops,
143};
144
145static struct kset dlm_kset = {
146 .subsys = &kernel_subsys,
147 .kobj = {.name = "dlm",},
148 .ktype = &dlm_ktype,
149};
150
151static int kobject_setup(struct dlm_ls *ls)
152{
153 char lsname[DLM_LOCKSPACE_LEN];
154 int error;
155
156 memset(lsname, 0, DLM_LOCKSPACE_LEN);
157 snprintf(lsname, DLM_LOCKSPACE_LEN, "%s", ls->ls_name);
158
159 error = kobject_set_name(&ls->ls_kobj, "%s", lsname);
160 if (error)
161 return error;
162
163 ls->ls_kobj.kset = &dlm_kset;
164 ls->ls_kobj.ktype = &dlm_ktype;
165 return 0;
166}
167
168static int do_uevent(struct dlm_ls *ls, int in)
169{
170 int error;
171
172 if (in)
173 kobject_uevent(&ls->ls_kobj, KOBJ_ONLINE);
174 else
175 kobject_uevent(&ls->ls_kobj, KOBJ_OFFLINE);
176
177 error = wait_event_interruptible(ls->ls_uevent_wait,
178 test_and_clear_bit(LSFL_UEVENT_WAIT, &ls->ls_flags));
179 if (error)
180 goto out;
181
182 error = ls->ls_uevent_result;
183 out:
184 return error;
185}
186
187
188int dlm_lockspace_init(void)
189{
190 int error;
191
192 ls_count = 0;
193 mutex_init(&ls_lock);
194 INIT_LIST_HEAD(&lslist);
195 spin_lock_init(&lslist_lock);
196
197 error = kset_register(&dlm_kset);
198 if (error)
199 printk("dlm_lockspace_init: cannot register kset %d\n", error);
200 return error;
201}
202
203void dlm_lockspace_exit(void)
204{
205 kset_unregister(&dlm_kset);
206}
207
208static int dlm_scand(void *data)
209{
210 struct dlm_ls *ls;
211
212 while (!kthread_should_stop()) {
213 list_for_each_entry(ls, &lslist, ls_list)
214 dlm_scan_rsbs(ls);
215 schedule_timeout_interruptible(dlm_config.scan_secs * HZ);
216 }
217 return 0;
218}
219
220static int dlm_scand_start(void)
221{
222 struct task_struct *p;
223 int error = 0;
224
225 p = kthread_run(dlm_scand, NULL, "dlm_scand");
226 if (IS_ERR(p))
227 error = PTR_ERR(p);
228 else
229 scand_task = p;
230 return error;
231}
232
233static void dlm_scand_stop(void)
234{
235 kthread_stop(scand_task);
236}
237
238static struct dlm_ls *dlm_find_lockspace_name(char *name, int namelen)
239{
240 struct dlm_ls *ls;
241
242 spin_lock(&lslist_lock);
243
244 list_for_each_entry(ls, &lslist, ls_list) {
245 if (ls->ls_namelen == namelen &&
246 memcmp(ls->ls_name, name, namelen) == 0)
247 goto out;
248 }
249 ls = NULL;
250 out:
251 spin_unlock(&lslist_lock);
252 return ls;
253}
254
255struct dlm_ls *dlm_find_lockspace_global(uint32_t id)
256{
257 struct dlm_ls *ls;
258
259 spin_lock(&lslist_lock);
260
261 list_for_each_entry(ls, &lslist, ls_list) {
262 if (ls->ls_global_id == id) {
263 ls->ls_count++;
264 goto out;
265 }
266 }
267 ls = NULL;
268 out:
269 spin_unlock(&lslist_lock);
270 return ls;
271}
272
273struct dlm_ls *dlm_find_lockspace_local(dlm_lockspace_t *lockspace)
274{
275 struct dlm_ls *ls;
276
277 spin_lock(&lslist_lock);
278 list_for_each_entry(ls, &lslist, ls_list) {
279 if (ls->ls_local_handle == lockspace) {
280 ls->ls_count++;
281 goto out;
282 }
283 }
284 ls = NULL;
285 out:
286 spin_unlock(&lslist_lock);
287 return ls;
288}
289
290struct dlm_ls *dlm_find_lockspace_device(int minor)
291{
292 struct dlm_ls *ls;
293
294 spin_lock(&lslist_lock);
295 list_for_each_entry(ls, &lslist, ls_list) {
296 if (ls->ls_device.minor == minor) {
297 ls->ls_count++;
298 goto out;
299 }
300 }
301 ls = NULL;
302 out:
303 spin_unlock(&lslist_lock);
304 return ls;
305}
306
/*
 * Drop a reference taken by one of the dlm_find_lockspace_global/local/device
 * helpers: decrements ls_count under lslist_lock.  remove_lockspace() waits
 * for this count to reach zero before unlinking the lockspace.
 */
307void dlm_put_lockspace(struct dlm_ls *ls)
308{
309 spin_lock(&lslist_lock);
310 ls->ls_count--;
311 spin_unlock(&lslist_lock);
312}
313
314static void remove_lockspace(struct dlm_ls *ls)
315{
316 for (;;) {
317 spin_lock(&lslist_lock);
318 if (ls->ls_count == 0) {
319 list_del(&ls->ls_list);
320 spin_unlock(&lslist_lock);
321 return;
322 }
323 spin_unlock(&lslist_lock);
324 ssleep(1);
325 }
326}
327
/*
 * Start the threads shared by all lockspaces, in this order: dlm_astd,
 * dlm_scand, then lowcomms.  If a later step fails, the steps that already
 * succeeded are stopped again in reverse order.  Returns 0 on success or the
 * error from the step that failed.
 */
static int threads_start(void)
{
	int error;

	/* Delivers lock request callbacks for every lockspace */
	error = dlm_astd_start();
	if (error) {
		log_print("cannot start dlm_astd thread %d", error);
		return error;
	}

	error = dlm_scand_start();
	if (error) {
		log_print("cannot start dlm_scand thread %d", error);
		dlm_astd_stop();
		return error;
	}

	/* Sends and receives messages for every lockspace */
	error = dlm_lowcomms_start();
	if (error) {
		log_print("cannot start dlm lowcomms %d", error);
		dlm_scand_stop();
		dlm_astd_stop();
		return error;
	}

	return 0;
}
361
/*
 * Stop the threads started by threads_start().  Note the order is not the
 * exact reverse of start-up: scand first, then lowcomms, then astd.
 */
362static void threads_stop(void)
363{
364 dlm_scand_stop();
365 dlm_lowcomms_stop();
366 dlm_astd_stop();
367}
368
/*
 * Allocate, initialise and register a new lockspace.
 *
 * Returns -EINVAL if namelen exceeds DLM_LOCKSPACE_LEN, if lvblen is zero or
 * not a multiple of 8, or if a module reference cannot be taken.  If a
 * lockspace with this name already exists it is stored in *lockspace and
 * -EEXIST is returned.  On success the new lockspace is stored in *lockspace,
 * the module reference taken here is kept, and 0 is returned.  Every failure
 * path after try_module_get() drops the module reference again and undoes
 * whatever had been set up, via the chain of labels at the bottom.
 */
369static int new_lockspace(char *name, int namelen, void **lockspace,
370 uint32_t flags, int lvblen)
371{
372 struct dlm_ls *ls;
373 int i, size, error = -ENOMEM;
374
375 if (namelen > DLM_LOCKSPACE_LEN)
376 return -EINVAL;
377
378 if (!lvblen || (lvblen % 8))
379 return -EINVAL;
380
381 if (!try_module_get(THIS_MODULE))
382 return -EINVAL;
383
384 ls = dlm_find_lockspace_name(name, namelen);
385 if (ls) {
386 *lockspace = ls;
387 module_put(THIS_MODULE);
388 return -EEXIST;
389 }
390
/* The name is stored in the same allocation, after the struct itself. */
391 ls = kzalloc(sizeof(struct dlm_ls) + namelen, GFP_KERNEL);
392 if (!ls)
393 goto out;
394 memcpy(ls->ls_name, name, namelen);
395 ls->ls_namelen = namelen;
396 ls->ls_exflags = flags;
397 ls->ls_lvblen = lvblen;
398 ls->ls_count = 0;
399 ls->ls_flags = 0;
400
/* Resource (rsb) hash table, sized from dlm_config.rsbtbl_size. */
401 size = dlm_config.rsbtbl_size;
402 ls->ls_rsbtbl_size = size;
403
404 ls->ls_rsbtbl = kmalloc(sizeof(struct dlm_rsbtable) * size, GFP_KERNEL);
405 if (!ls->ls_rsbtbl)
406 goto out_lsfree;
407 for (i = 0; i < size; i++) {
408 INIT_LIST_HEAD(&ls->ls_rsbtbl[i].list);
409 INIT_LIST_HEAD(&ls->ls_rsbtbl[i].toss);
410 rwlock_init(&ls->ls_rsbtbl[i].lock);
411 }
412
/* Lock (lkb) id table, sized from dlm_config.lkbtbl_size. */
413 size = dlm_config.lkbtbl_size;
414 ls->ls_lkbtbl_size = size;
415
416 ls->ls_lkbtbl = kmalloc(sizeof(struct dlm_lkbtable) * size, GFP_KERNEL);
417 if (!ls->ls_lkbtbl)
418 goto out_rsbfree;
419 for (i = 0; i < size; i++) {
420 INIT_LIST_HEAD(&ls->ls_lkbtbl[i].list);
421 rwlock_init(&ls->ls_lkbtbl[i].lock);
422 ls->ls_lkbtbl[i].counter = 1;
423 }
424
/* Directory table, sized from dlm_config.dirtbl_size. */
425 size = dlm_config.dirtbl_size;
426 ls->ls_dirtbl_size = size;
427
428 ls->ls_dirtbl = kmalloc(sizeof(struct dlm_dirtable) * size, GFP_KERNEL);
429 if (!ls->ls_dirtbl)
430 goto out_lkbfree;
431 for (i = 0; i < size; i++) {
432 INIT_LIST_HEAD(&ls->ls_dirtbl[i].list);
433 rwlock_init(&ls->ls_dirtbl[i].lock);
434 }
435
436 INIT_LIST_HEAD(&ls->ls_waiters);
437 mutex_init(&ls->ls_waiters_mutex);
438
439 INIT_LIST_HEAD(&ls->ls_nodes);
440 INIT_LIST_HEAD(&ls->ls_nodes_gone);
441 ls->ls_num_nodes = 0;
442 ls->ls_low_nodeid = 0;
443 ls->ls_total_weight = 0;
444 ls->ls_node_array = NULL;
445
446 memset(&ls->ls_stub_rsb, 0, sizeof(struct dlm_rsb));
447 ls->ls_stub_rsb.res_ls = ls;
448
449 ls->ls_debug_rsb_dentry = NULL;
450 ls->ls_debug_waiters_dentry = NULL;
451
452 init_waitqueue_head(&ls->ls_uevent_wait);
453 ls->ls_uevent_result = 0;
454
455 ls->ls_recoverd_task = NULL;
456 mutex_init(&ls->ls_recoverd_active);
457 spin_lock_init(&ls->ls_recover_lock);
458 ls->ls_recover_status = 0;
459 ls->ls_recover_seq = 0;
460 ls->ls_recover_args = NULL;
461 init_rwsem(&ls->ls_in_recovery);
462 INIT_LIST_HEAD(&ls->ls_requestqueue);
463 mutex_init(&ls->ls_requestqueue_mutex);
464 mutex_init(&ls->ls_clear_proc_locks);
465
466 ls->ls_recover_buf = kmalloc(dlm_config.buffer_size, GFP_KERNEL);
467 if (!ls->ls_recover_buf)
468 goto out_dirfree;
469
470 INIT_LIST_HEAD(&ls->ls_recover_list);
471 spin_lock_init(&ls->ls_recover_list_lock);
472 ls->ls_recover_list_count = 0;
473 ls->ls_local_handle = ls;
474 init_waitqueue_head(&ls->ls_wait_general);
475 INIT_LIST_HEAD(&ls->ls_root_list);
476 init_rwsem(&ls->ls_root_sem);
477
/* ls_in_recovery is taken for write here and is not released in this function. */
478 down_write(&ls->ls_in_recovery);
479
480 error = dlm_recoverd_start(ls);
481 if (error) {
482 log_error(ls, "can't start dlm_recoverd %d", error);
483 goto out_rcomfree;
484 }
485
/* From here on the lockspace is visible to the dlm_find_lockspace_*() helpers. */
486 spin_lock(&lslist_lock);
487 list_add(&ls->ls_list, &lslist);
488 spin_unlock(&lslist_lock);
489
490 dlm_create_debug_file(ls);
491
492 error = kobject_setup(ls);
493 if (error)
494 goto out_del;
495
496 error = kobject_register(&ls->ls_kobj);
497 if (error)
498 goto out_del;
499
500 error = do_uevent(ls, 1);
501 if (error)
502 goto out_unreg;
503
504 *lockspace = ls;
505 return 0;
506
/* Error unwinding: each label undoes one step and falls through to the next. */
507 out_unreg:
508 kobject_unregister(&ls->ls_kobj);
509 out_del:
510 dlm_delete_debug_file(ls);
511 spin_lock(&lslist_lock);
512 list_del(&ls->ls_list);
513 spin_unlock(&lslist_lock);
514 dlm_recoverd_stop(ls);
515 out_rcomfree:
516 kfree(ls->ls_recover_buf);
517 out_dirfree:
518 kfree(ls->ls_dirtbl);
519 out_lkbfree:
520 kfree(ls->ls_lkbtbl);
521 out_rsbfree:
522 kfree(ls->ls_rsbtbl);
523 out_lsfree:
524 kfree(ls);
525 out:
526 module_put(THIS_MODULE);
527 return error;
528}
529
530int dlm_new_lockspace(char *name, int namelen, void **lockspace,
531 uint32_t flags, int lvblen)
532{
533 int error = 0;
534
535 mutex_lock(&ls_lock);
536 if (!ls_count)
537 error = threads_start();
538 if (error)
539 goto out;
540
541 error = new_lockspace(name, namelen, lockspace, flags, lvblen);
542 if (!error)
543 ls_count++;
544 out:
545 mutex_unlock(&ls_lock);
546 return error;
547}
548
549/* Return 1 if the lockspace still has active remote locks,
550 * 2 if the lockspace still has active local locks.
551 */
552static int lockspace_busy(struct dlm_ls *ls)
553{
554 int i, lkb_found = 0;
555 struct dlm_lkb *lkb;
556
557 /* NOTE: We check the lockidtbl here rather than the resource table.
558 This is because there may be LKBs queued as ASTs that have been
559 unlinked from their RSBs and are pending deletion once the AST has
560 been delivered */
561
562 for (i = 0; i < ls->ls_lkbtbl_size; i++) {
563 read_lock(&ls->ls_lkbtbl[i].lock);
564 if (!list_empty(&ls->ls_lkbtbl[i].list)) {
565 lkb_found = 1;
566 list_for_each_entry(lkb, &ls->ls_lkbtbl[i].list,
567 lkb_idtbl_list) {
568 if (!lkb->lkb_nodeid) {
569 read_unlock(&ls->ls_lkbtbl[i].lock);
570 return 2;
571 }
572 }
573 }
574 read_unlock(&ls->ls_lkbtbl[i].lock);
575 }
576 return lkb_found;
577}
578
/*
 * Tear down a lockspace and free everything it owns.
 *
 * Returns -EBUSY when lockspace_busy() reports a level greater than force,
 * otherwise 0.  After the lockspace has been freed the global ls_count is
 * decremented under ls_lock and, if it reaches zero, the shared threads are
 * stopped.  The module reference held for the lockspace is dropped last.
 */
579static int release_lockspace(struct dlm_ls *ls, int force)
580{
581 struct dlm_lkb *lkb;
582 struct dlm_rsb *rsb;
583 struct list_head *head;
584 int i;
585 int busy = lockspace_busy(ls);
586
587 if (busy > force)
588 return -EBUSY;
589
/* The "leave" uevent is skipped for force level 3. */
590 if (force < 3)
591 do_uevent(ls, 0);
592
593 dlm_recoverd_stop(ls);
594
/* Blocks until ls_count is zero, then unlinks ls from lslist. */
595 remove_lockspace(ls);
596
597 dlm_delete_debug_file(ls);
598
/* astd stays suspended until the lkbs have been freed below. */
599 dlm_astd_suspend();
600
601 kfree(ls->ls_recover_buf);
602
603 /*
604 * Free direntry structs.
605 */
606
607 dlm_dir_clear(ls);
608 kfree(ls->ls_dirtbl);
609
610 /*
611 * Free all lkb's on lkbtbl[] lists.
612 */
613
614 for (i = 0; i < ls->ls_lkbtbl_size; i++) {
615 head = &ls->ls_lkbtbl[i].list;
616 while (!list_empty(head)) {
617 lkb = list_entry(head->next, struct dlm_lkb,
618 lkb_idtbl_list);
619
620 list_del(&lkb->lkb_idtbl_list);
621
622 dlm_del_ast(lkb);
623
624 if (lkb->lkb_lvbptr && lkb->lkb_flags & DLM_IFL_MSTCPY)
625 free_lvb(lkb->lkb_lvbptr);
626
627 free_lkb(lkb);
628 }
629 }
630 dlm_astd_resume();
631
632 kfree(ls->ls_lkbtbl);
633
634 /*
635 * Free all rsb's on rsbtbl[] lists
636 */
637
638 for (i = 0; i < ls->ls_rsbtbl_size; i++) {
639 head = &ls->ls_rsbtbl[i].list;
640 while (!list_empty(head)) {
641 rsb = list_entry(head->next, struct dlm_rsb,
642 res_hashchain);
643
644 list_del(&rsb->res_hashchain);
645 free_rsb(rsb);
646 }
647
648 head = &ls->ls_rsbtbl[i].toss;
649 while (!list_empty(head)) {
650 rsb = list_entry(head->next, struct dlm_rsb,
651 res_hashchain);
652 list_del(&rsb->res_hashchain);
653 free_rsb(rsb);
654 }
655 }
656
657 kfree(ls->ls_rsbtbl);
658
659 /*
660 * Free structures on any other lists
661 */
662
663 kfree(ls->ls_recover_args);
664 dlm_clear_free_entries(ls);
665 dlm_clear_members(ls);
666 dlm_clear_members_gone(ls);
667 kfree(ls->ls_node_array);
668 kobject_unregister(&ls->ls_kobj);
669 kfree(ls);
670
671 mutex_lock(&ls_lock);
672 ls_count--;
673 if (!ls_count)
674 threads_stop();
675 mutex_unlock(&ls_lock);
676
677 module_put(THIS_MODULE);
678 return 0;
679}
680
681/*
682 * Called when a system has released all its locks and is not going to use the
683 * lockspace any longer. We free everything we're managing for this lockspace.
684 * Remaining nodes will go through the recovery process as if we'd died. The
685 * lockspace must continue to function as usual, participating in recoveries,
686 * until this returns.
687 *
688 * Force has 4 possible values:
689 * 0 - don't destroy lockspace if it has any LKBs
690 * 1 - destroy lockspace if it has remote LKBs but not if it has local LKBs
691 * 2 - destroy lockspace regardless of LKBs
692 * 3 - destroy lockspace as part of a forced shutdown
693 */
694
/*
 * Implementation note: the handle is looked up with
 * dlm_find_lockspace_local(), which takes a reference; that reference is
 * dropped straight away and the lockspace is handed to release_lockspace().
 * Returns -EINVAL if the handle matches no lockspace, otherwise whatever
 * release_lockspace() returns.
 */
695int dlm_release_lockspace(void *lockspace, int force)
696{
697 struct dlm_ls *ls;
698
699 ls = dlm_find_lockspace_local(lockspace);
700 if (!ls)
701 return -EINVAL;
702 dlm_put_lockspace(ls);
703 return release_lockspace(ls, force);
704}
705
diff --git a/fs/dlm/lockspace.h b/fs/dlm/lockspace.h
new file mode 100644
index 000000000000..891eabbdd021
--- /dev/null
+++ b/fs/dlm/lockspace.h
@@ -0,0 +1,25 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#ifndef __LOCKSPACE_DOT_H__
15#define __LOCKSPACE_DOT_H__
16
17int dlm_lockspace_init(void);
18void dlm_lockspace_exit(void);
19struct dlm_ls *dlm_find_lockspace_global(uint32_t id);
20struct dlm_ls *dlm_find_lockspace_local(void *id);
21struct dlm_ls *dlm_find_lockspace_device(int minor);
22void dlm_put_lockspace(struct dlm_ls *ls);
23
24#endif /* __LOCKSPACE_DOT_H__ */
25
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
new file mode 100644
index 000000000000..7ab40422ab57
--- /dev/null
+++ b/fs/dlm/lowcomms.c
@@ -0,0 +1,1238 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14/*
15 * lowcomms.c
16 *
17 * This is the "low-level" comms layer.
18 *
19 * It is responsible for sending/receiving messages
20 * from other nodes in the cluster.
21 *
22 * Cluster nodes are referred to by their nodeids. nodeids are
23 * simply 32 bit numbers to the locking module - if they need to
24 * be expanded for the cluster infrastructure then that is its
25 * responsibility. It is this layer's
26 * responsibility to resolve these into IP address or
27 * whatever it needs for inter-node communication.
28 *
29 * The comms level is two kernel threads that deal mainly with
30 * the receiving of messages from other nodes and passing them
31 * up to the mid-level comms layer (which understands the
32 * message format) for execution by the locking core, and
33 * a send thread which does all the setting up of connections
34 * to remote nodes and the sending of data. Threads are not allowed
35 * to send their own data because it may cause them to wait in times
36 * of high load. Also, this way, the sending thread can collect together
37 * messages bound for one node and send them in one block.
38 *
39 * I don't see any problem with the recv thread executing the locking
40 * code on behalf of remote processes as the locking code is
41 * short, efficient and never (well, hardly ever) waits.
42 *
43 */
44
45#include <asm/ioctls.h>
46#include <net/sock.h>
47#include <net/tcp.h>
48#include <net/sctp/user.h>
49#include <linux/pagemap.h>
50#include <linux/socket.h>
51#include <linux/idr.h>
52
53#include "dlm_internal.h"
54#include "lowcomms.h"
55#include "config.h"
56#include "midcomms.h"
57
58static struct sockaddr_storage *dlm_local_addr[DLM_MAX_ADDR_COUNT];
59static int dlm_local_count;
60static int dlm_local_nodeid;
61
62/* One of these per connected node */
63
64#define NI_INIT_PENDING 1
65#define NI_WRITE_PENDING 2
66
/*
 * Per-node state, looked up by nodeid through nodeinfo_idr (see
 * nodeid2nodeinfo()).  flags holds the NI_INIT_PENDING / NI_WRITE_PENDING
 * bits defined above.
 */
67struct nodeinfo {
68 spinlock_t lock; /* held by the notification handler while it updates assoc_id */
69 sctp_assoc_t assoc_id; /* SCTP association id; reset to 0 when the association is lost */
70 unsigned long flags; /* NI_* bits, changed with the atomic bit operations */
71 struct list_head write_list; /* nodes with pending writes */
72 struct list_head writequeue; /* outgoing writequeue_entries */
73 spinlock_t writequeue_lock; /* protects writequeue and the entries on it */
74 int nodeid; /* the id this entry is stored under in nodeinfo_idr */
75};
76
77static DEFINE_IDR(nodeinfo_idr);
78static struct rw_semaphore nodeinfo_lock;
79static int max_nodeid;
80
/*
 * Bookkeeping for a circular buffer whose size is a power of two; it is
 * manipulated only through the CBUF_* macros below.
 */
81struct cbuf {
82 unsigned base; /* offset of the first byte in use */
83 unsigned len; /* number of bytes in use */
84 unsigned mask; /* buffer size minus one, used to wrap offsets */
85};
86
87/* Just the one of these, now. But this struct keeps
88 the connection-specific variables together */
89
90#define CF_READ_PENDING 1
91
92struct connection {
93 struct socket *sock; /* the SCTP socket, NULL when closed */
94 unsigned long flags; /* CF_READ_PENDING bit */
95 struct page *rx_page; /* page backing the circular receive buffer cb */
96 atomic_t waiting_requests; /* incremented by lowcomms_data_ready() */
97 struct cbuf cb; /* read/write positions within rx_page */
98 int eagain_flag; /* NOTE(review): not used in the code visible here - confirm purpose */
99};
100
101/* An entry waiting to be sent */
102
/*
 * One page of outgoing data for a node.  Space is reserved in it by
 * dlm_lowcomms_get_buffer() and released for sending by
 * dlm_lowcomms_commit_buffer().
 */
103struct writequeue_entry {
104 struct list_head list; /* link on the owning node's writequeue */
105 struct page *page; /* page holding the message data */
106 int offset; /* start of data in the page (0 for a new entry) */
107 int len; /* set to end - offset once the last user has committed */
108 int end; /* offset of the first unreserved byte in the page */
109 int users; /* reservations handed out and not yet committed */
110 struct nodeinfo *ni; /* node this entry is queued for */
111};
112
/*
 * Circular buffer helpers operating on a struct cbuf pointer.  The buffer
 * size must be a power of two: mask is size - 1 and is used to wrap offsets.
 *
 *   CBUF_ADD(cb, n)      account for n more bytes of data
 *   CBUF_EMPTY(cb)       true when no data is held
 *   CBUF_MAY_ADD(cb, n)  true when n more bytes would still leave the
 *                        buffer short of full
 *   CBUF_DATA(cb)        offset at which new data is written
 *   CBUF_INIT(cb, size)  reset to empty for a buffer of the given size
 *   CBUF_EAT(cb, n)      consume n bytes from the front, wrapping base
 *
 * Every macro argument is parenthesised in the expansion.
 */
#define CBUF_ADD(cb, n) do { (cb)->len += (n); } while (0)
#define CBUF_EMPTY(cb) ((cb)->len == 0)
#define CBUF_MAY_ADD(cb, n) (((cb)->len + (n)) < ((cb)->mask + 1))
#define CBUF_DATA(cb) (((cb)->base + (cb)->len) & (cb)->mask)

#define CBUF_INIT(cb, size) \
do { \
	(cb)->base = (cb)->len = 0; \
	(cb)->mask = ((size)-1); \
} while (0)

#define CBUF_EAT(cb, n) \
do { \
	(cb)->len -= (n); \
	(cb)->base += (n); \
	(cb)->base &= (cb)->mask; \
} while (0)
130
131
132/* List of nodes which have writes pending */
133static struct list_head write_nodes;
134static spinlock_t write_nodes_lock;
135
136/* Maximum number of incoming messages to process before
137 * doing a schedule()
138 */
139#define MAX_RX_MSG_COUNT 25
140
141/* Manage daemons */
142static struct task_struct *recv_task;
143static struct task_struct *send_task;
144static wait_queue_head_t lowcomms_recv_wait;
145static atomic_t accepting;
146
147/* The SCTP connection */
148static struct connection sctp_con;
149
150
151static int nodeid_to_addr(int nodeid, struct sockaddr *retaddr)
152{
153 struct sockaddr_storage addr;
154 int error;
155
156 if (!dlm_local_count)
157 return -1;
158
159 error = dlm_nodeid_to_addr(nodeid, &addr);
160 if (error)
161 return error;
162
163 if (dlm_local_addr[0]->ss_family == AF_INET) {
164 struct sockaddr_in *in4 = (struct sockaddr_in *) &addr;
165 struct sockaddr_in *ret4 = (struct sockaddr_in *) retaddr;
166 ret4->sin_addr.s_addr = in4->sin_addr.s_addr;
167 } else {
168 struct sockaddr_in6 *in6 = (struct sockaddr_in6 *) &addr;
169 struct sockaddr_in6 *ret6 = (struct sockaddr_in6 *) retaddr;
170 memcpy(&ret6->sin6_addr, &in6->sin6_addr,
171 sizeof(in6->sin6_addr));
172 }
173
174 return 0;
175}
176
177static struct nodeinfo *nodeid2nodeinfo(int nodeid, int alloc)
178{
179 struct nodeinfo *ni;
180 int r;
181 int n;
182
183 down_read(&nodeinfo_lock);
184 ni = idr_find(&nodeinfo_idr, nodeid);
185 up_read(&nodeinfo_lock);
186
187 if (!ni && alloc) {
188 down_write(&nodeinfo_lock);
189
190 ni = idr_find(&nodeinfo_idr, nodeid);
191 if (ni)
192 goto out_up;
193
194 r = idr_pre_get(&nodeinfo_idr, alloc);
195 if (!r)
196 goto out_up;
197
198 ni = kmalloc(sizeof(struct nodeinfo), alloc);
199 if (!ni)
200 goto out_up;
201
202 r = idr_get_new_above(&nodeinfo_idr, ni, nodeid, &n);
203 if (r) {
204 kfree(ni);
205 ni = NULL;
206 goto out_up;
207 }
208 if (n != nodeid) {
209 idr_remove(&nodeinfo_idr, n);
210 kfree(ni);
211 ni = NULL;
212 goto out_up;
213 }
214 memset(ni, 0, sizeof(struct nodeinfo));
215 spin_lock_init(&ni->lock);
216 INIT_LIST_HEAD(&ni->writequeue);
217 spin_lock_init(&ni->writequeue_lock);
218 ni->nodeid = nodeid;
219
220 if (nodeid > max_nodeid)
221 max_nodeid = nodeid;
222 out_up:
223 up_write(&nodeinfo_lock);
224 }
225
226 return ni;
227}
228
229/* Don't call this too often... */
230static struct nodeinfo *assoc2nodeinfo(sctp_assoc_t assoc)
231{
232 int i;
233 struct nodeinfo *ni;
234
235 for (i=1; i<=max_nodeid; i++) {
236 ni = nodeid2nodeinfo(i, 0);
237 if (ni && ni->assoc_id == assoc)
238 return ni;
239 }
240 return NULL;
241}
242
243/* Data or notification available on socket */
244static void lowcomms_data_ready(struct sock *sk, int count_unused)
245{
246 atomic_inc(&sctp_con.waiting_requests);
247 if (test_and_set_bit(CF_READ_PENDING, &sctp_con.flags))
248 return;
249
250 wake_up_interruptible(&lowcomms_recv_wait);
251}
252
253
254/* Add the port number to an IP6 or 4 sockaddr and return the address length.
255 Also padd out the struct with zeros to make comparisons meaningful */
256
257static void make_sockaddr(struct sockaddr_storage *saddr, uint16_t port,
258 int *addr_len)
259{
260 struct sockaddr_in *local4_addr;
261 struct sockaddr_in6 *local6_addr;
262
263 if (!dlm_local_count)
264 return;
265
266 if (!port) {
267 if (dlm_local_addr[0]->ss_family == AF_INET) {
268 local4_addr = (struct sockaddr_in *)dlm_local_addr[0];
269 port = be16_to_cpu(local4_addr->sin_port);
270 } else {
271 local6_addr = (struct sockaddr_in6 *)dlm_local_addr[0];
272 port = be16_to_cpu(local6_addr->sin6_port);
273 }
274 }
275
276 saddr->ss_family = dlm_local_addr[0]->ss_family;
277 if (dlm_local_addr[0]->ss_family == AF_INET) {
278 struct sockaddr_in *in4_addr = (struct sockaddr_in *)saddr;
279 in4_addr->sin_port = cpu_to_be16(port);
280 memset(&in4_addr->sin_zero, 0, sizeof(in4_addr->sin_zero));
281 memset(in4_addr+1, 0, sizeof(struct sockaddr_storage) -
282 sizeof(struct sockaddr_in));
283 *addr_len = sizeof(struct sockaddr_in);
284 } else {
285 struct sockaddr_in6 *in6_addr = (struct sockaddr_in6 *)saddr;
286 in6_addr->sin6_port = cpu_to_be16(port);
287 memset(in6_addr+1, 0, sizeof(struct sockaddr_storage) -
288 sizeof(struct sockaddr_in6));
289 *addr_len = sizeof(struct sockaddr_in6);
290 }
291}
292
293/* Close the connection and tidy up */
294static void close_connection(void)
295{
/* Both pointers are cleared after release, so calling this twice is harmless. */
296 if (sctp_con.sock) {
297 sock_release(sctp_con.sock);
298 sctp_con.sock = NULL;
299 }
300
301 if (sctp_con.rx_page) {
302 __free_page(sctp_con.rx_page);
303 sctp_con.rx_page = NULL;
304 }
305}
306
307/* We only send shutdown messages to nodes that are not part of the cluster */
308static void send_shutdown(sctp_assoc_t associd)
309{
310 static char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
311 struct msghdr outmessage;
312 struct cmsghdr *cmsg;
313 struct sctp_sndrcvinfo *sinfo;
314 int ret;
315
316 outmessage.msg_name = NULL;
317 outmessage.msg_namelen = 0;
318 outmessage.msg_control = outcmsg;
319 outmessage.msg_controllen = sizeof(outcmsg);
320 outmessage.msg_flags = MSG_EOR;
321
322 cmsg = CMSG_FIRSTHDR(&outmessage);
323 cmsg->cmsg_level = IPPROTO_SCTP;
324 cmsg->cmsg_type = SCTP_SNDRCV;
325 cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
326 outmessage.msg_controllen = cmsg->cmsg_len;
327 sinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg);
328 memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo));
329
330 sinfo->sinfo_flags |= MSG_EOF;
331 sinfo->sinfo_assoc_id = associd;
332
333 ret = kernel_sendmsg(sctp_con.sock, &outmessage, NULL, 0, 0);
334
335 if (ret != 0)
336 log_print("send EOF to node failed: %d", ret);
337}
338
339
340/* INIT failed but we don't know which node...
341 restart INIT on all pending nodes */
342static void init_failed(void)
343{
344 int i;
345 struct nodeinfo *ni;
346
347 for (i=1; i<=max_nodeid; i++) {
348 ni = nodeid2nodeinfo(i, 0);
349 if (!ni)
350 continue;
351
352 if (test_and_clear_bit(NI_INIT_PENDING, &ni->flags)) {
353 ni->assoc_id = 0;
354 if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
355 spin_lock_bh(&write_nodes_lock);
356 list_add_tail(&ni->write_list, &write_nodes);
357 spin_unlock_bh(&write_nodes_lock);
358 }
359 }
360 }
361 wake_up_process(send_task);
362}
363
364/* Something happened to an association */
/*
 * Handle an SCTP notification found at buf.  Only SCTP_ASSOC_CHANGE is acted
 * on; other notification types are ignored.
 *
 *  - COMM_UP / RESTART: look up the peer's primary address, map it to a
 *    nodeid, record the association id in that node's nodeinfo and queue the
 *    node for sending.  Peers with an unknown address are sent a shutdown.
 *  - COMM_LOST / SHUTDOWN_COMP: reset the node's association id to 0.
 *  - CANT_STR_ASSOC: retry INIT on all pending nodes via init_failed().
 *
 * The msg argument is not used by this function.
 */
365static void process_sctp_notification(struct msghdr *msg, char *buf)
366{
367 union sctp_notification *sn = (union sctp_notification *)buf;
368
369 if (sn->sn_header.sn_type == SCTP_ASSOC_CHANGE) {
370 switch (sn->sn_assoc_change.sac_state) {
371
372 case SCTP_COMM_UP:
373 case SCTP_RESTART:
374 {
375 /* Check that the new node is in the lockspace */
376 struct sctp_prim prim;
377 mm_segment_t fs;
378 int nodeid;
379 int prim_len, ret;
380 int addr_len;
381 struct nodeinfo *ni;
382
383 /* This seems to happen when we received a connection
384 * too early... or something... anyway, it happens but
385 * we always seem to get a real message too, see
386 * receive_from_sock */
387
388 if ((int)sn->sn_assoc_change.sac_assoc_id <= 0) {
389 log_print("COMM_UP for invalid assoc ID %d",
390 (int)sn->sn_assoc_change.sac_assoc_id);
391 init_failed();
392 return;
393 }
394 memset(&prim, 0, sizeof(struct sctp_prim));
395 prim_len = sizeof(struct sctp_prim);
396 prim.ssp_assoc_id = sn->sn_assoc_change.sac_assoc_id;
397
/* getsockopt() is handed kernel pointers, so the address limit is widened around the call. */
398 fs = get_fs();
399 set_fs(get_ds());
400 ret = sctp_con.sock->ops->getsockopt(sctp_con.sock,
401 IPPROTO_SCTP, SCTP_PRIMARY_ADDR,
402 (char*)&prim, &prim_len);
403 set_fs(fs);
404 if (ret < 0) {
/* NOTE(review): this declaration shadows the ni declared at the top of the case block. */
405 struct nodeinfo *ni;
406
407 log_print("getsockopt/sctp_primary_addr on "
408 "new assoc %d failed : %d",
409 (int)sn->sn_assoc_change.sac_assoc_id, ret);
410
411 /* Retry INIT later */
412 ni = assoc2nodeinfo(sn->sn_assoc_change.sac_assoc_id);
413 if (ni)
414 clear_bit(NI_INIT_PENDING, &ni->flags);
415 return;
416 }
/* Port 0 makes make_sockaddr() fill in our own local port, so that dlm_addr_to_nodeid() sees a normalised address. */
417 make_sockaddr(&prim.ssp_addr, 0, &addr_len);
418 if (dlm_addr_to_nodeid(&prim.ssp_addr, &nodeid)) {
419 log_print("reject connect from unknown addr");
420 send_shutdown(prim.ssp_assoc_id);
421 return;
422 }
423
424 ni = nodeid2nodeinfo(nodeid, GFP_KERNEL);
425 if (!ni)
426 return;
427
428 /* Save the assoc ID */
429 spin_lock(&ni->lock);
430 ni->assoc_id = sn->sn_assoc_change.sac_assoc_id;
431 spin_unlock(&ni->lock);
432
433 log_print("got new/restarted association %d nodeid %d",
434 (int)sn->sn_assoc_change.sac_assoc_id, nodeid);
435
436 /* Send any pending writes */
437 clear_bit(NI_INIT_PENDING, &ni->flags);
438 if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
439 spin_lock_bh(&write_nodes_lock);
440 list_add_tail(&ni->write_list, &write_nodes);
441 spin_unlock_bh(&write_nodes_lock);
442 }
443 wake_up_process(send_task);
444 }
445 break;
446
447 case SCTP_COMM_LOST:
448 case SCTP_SHUTDOWN_COMP:
449 {
450 struct nodeinfo *ni;
451
452 ni = assoc2nodeinfo(sn->sn_assoc_change.sac_assoc_id);
453 if (ni) {
454 spin_lock(&ni->lock);
455 ni->assoc_id = 0;
456 spin_unlock(&ni->lock);
457 }
458 }
459 break;
460
461 /* We don't know which INIT failed, so clear the PENDING flags
462 * on them all. if assoc_id is zero then it will then try
463 * again */
464
465 case SCTP_CANT_STR_ASSOC:
466 {
467 log_print("Can't start SCTP association - retrying");
468 init_failed();
469 }
470 break;
471
472 default:
473 log_print("unexpected SCTP assoc change id=%d state=%d",
474 (int)sn->sn_assoc_change.sac_assoc_id,
475 sn->sn_assoc_change.sac_state);
476 }
477 }
478}
479
480/* Data received from remote end */
481static int receive_from_sock(void)
482{
483 int ret = 0;
484 struct msghdr msg;
485 struct kvec iov[2];
486 unsigned len;
487 int r;
488 struct sctp_sndrcvinfo *sinfo;
489 struct cmsghdr *cmsg;
490 struct nodeinfo *ni;
491
492 /* These two are marginally too big for stack allocation, but this
493 * function is (currently) only called by dlm_recvd so static should be
494 * OK.
495 */
496 static struct sockaddr_storage msgname;
497 static char incmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
498
499 if (sctp_con.sock == NULL)
500 goto out;
501
502 if (sctp_con.rx_page == NULL) {
503 /*
504 * This doesn't need to be atomic, but I think it should
505 * improve performance if it is.
506 */
507 sctp_con.rx_page = alloc_page(GFP_ATOMIC);
508 if (sctp_con.rx_page == NULL)
509 goto out_resched;
510 CBUF_INIT(&sctp_con.cb, PAGE_CACHE_SIZE);
511 }
512
513 memset(&incmsg, 0, sizeof(incmsg));
514 memset(&msgname, 0, sizeof(msgname));
515
516 memset(incmsg, 0, sizeof(incmsg));
517 msg.msg_name = &msgname;
518 msg.msg_namelen = sizeof(msgname);
519 msg.msg_flags = 0;
520 msg.msg_control = incmsg;
521 msg.msg_controllen = sizeof(incmsg);
522
523 /* I don't see why this circular buffer stuff is necessary for SCTP
524 * which is a packet-based protocol, but the whole thing breaks under
525 * load without it! The overhead is minimal (and is in the TCP lowcomms
526 * anyway, of course) so I'll leave it in until I can figure out what's
527 * really happening.
528 */
529
530 /*
531 * iov[0] is the bit of the circular buffer between the current end
532 * point (cb.base + cb.len) and the end of the buffer.
533 */
534 iov[0].iov_len = sctp_con.cb.base - CBUF_DATA(&sctp_con.cb);
535 iov[0].iov_base = page_address(sctp_con.rx_page) +
536 CBUF_DATA(&sctp_con.cb);
537 iov[1].iov_len = 0;
538
539 /*
540 * iov[1] is the bit of the circular buffer between the start of the
541 * buffer and the start of the currently used section (cb.base)
542 */
543 if (CBUF_DATA(&sctp_con.cb) >= sctp_con.cb.base) {
544 iov[0].iov_len = PAGE_CACHE_SIZE - CBUF_DATA(&sctp_con.cb);
545 iov[1].iov_len = sctp_con.cb.base;
546 iov[1].iov_base = page_address(sctp_con.rx_page);
547 msg.msg_iovlen = 2;
548 }
549 len = iov[0].iov_len + iov[1].iov_len;
550
551 r = ret = kernel_recvmsg(sctp_con.sock, &msg, iov, 1, len,
552 MSG_NOSIGNAL | MSG_DONTWAIT);
553 if (ret <= 0)
554 goto out_close;
555
556 msg.msg_control = incmsg;
557 msg.msg_controllen = sizeof(incmsg);
558 cmsg = CMSG_FIRSTHDR(&msg);
559 sinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg);
560
561 if (msg.msg_flags & MSG_NOTIFICATION) {
562 process_sctp_notification(&msg, page_address(sctp_con.rx_page));
563 return 0;
564 }
565
566 /* Is this a new association ? */
567 ni = nodeid2nodeinfo(le32_to_cpu(sinfo->sinfo_ppid), GFP_KERNEL);
568 if (ni) {
569 ni->assoc_id = sinfo->sinfo_assoc_id;
570 if (test_and_clear_bit(NI_INIT_PENDING, &ni->flags)) {
571
572 if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
573 spin_lock_bh(&write_nodes_lock);
574 list_add_tail(&ni->write_list, &write_nodes);
575 spin_unlock_bh(&write_nodes_lock);
576 }
577 wake_up_process(send_task);
578 }
579 }
580
581 /* INIT sends a message with length of 1 - ignore it */
582 if (r == 1)
583 return 0;
584
585 CBUF_ADD(&sctp_con.cb, ret);
586 ret = dlm_process_incoming_buffer(cpu_to_le32(sinfo->sinfo_ppid),
587 page_address(sctp_con.rx_page),
588 sctp_con.cb.base, sctp_con.cb.len,
589 PAGE_CACHE_SIZE);
590 if (ret < 0)
591 goto out_close;
592 CBUF_EAT(&sctp_con.cb, ret);
593
594 out:
595 ret = 0;
596 goto out_ret;
597
598 out_resched:
599 lowcomms_data_ready(sctp_con.sock->sk, 0);
600 ret = 0;
601 schedule();
602 goto out_ret;
603
604 out_close:
605 if (ret != -EAGAIN)
606 log_print("error reading from sctp socket: %d", ret);
607 out_ret:
608 return ret;
609}
610
611/* Bind to an IP address. SCTP allows multiple address so it can do multi-homing */
612static int add_bind_addr(struct sockaddr_storage *addr, int addr_len, int num)
613{
614 mm_segment_t fs;
615 int result = 0;
616
617 fs = get_fs();
618 set_fs(get_ds());
619 if (num == 1)
620 result = sctp_con.sock->ops->bind(sctp_con.sock,
621 (struct sockaddr *) addr, addr_len);
622 else
623 result = sctp_con.sock->ops->setsockopt(sctp_con.sock, SOL_SCTP,
624 SCTP_SOCKOPT_BINDX_ADD, (char *)addr, addr_len);
625 set_fs(fs);
626
627 if (result < 0)
628 log_print("Can't bind to port %d addr number %d",
629 dlm_config.tcp_port, num);
630
631 return result;
632}
633
634static void init_local(void)
635{
636 struct sockaddr_storage sas, *addr;
637 int i;
638
639 dlm_local_nodeid = dlm_our_nodeid();
640
641 for (i = 0; i < DLM_MAX_ADDR_COUNT - 1; i++) {
642 if (dlm_our_addr(&sas, i))
643 break;
644
645 addr = kmalloc(sizeof(*addr), GFP_KERNEL);
646 if (!addr)
647 break;
648 memcpy(addr, &sas, sizeof(*addr));
649 dlm_local_addr[dlm_local_count++] = addr;
650 }
651}
652
653/* Initialise SCTP socket and bind to all interfaces */
/*
 * Steps: make sure the local addresses are known, create an SCTP
 * SOCK_SEQPACKET socket, subscribe to the SCTP events listed below, hook up
 * the data-ready callback, bind every local address on
 * dlm_config.tcp_port, and listen.
 *
 * Returns 0 on success.  Returns -EINVAL if no local address is configured,
 * otherwise the error of the step that failed; once the socket has been
 * created, failures release it and clear sctp_con.sock.
 */
654static int init_sock(void)
655{
656 mm_segment_t fs;
657 struct socket *sock = NULL;
658 struct sockaddr_storage localaddr;
659 struct sctp_event_subscribe subscribe;
660 int result = -EINVAL, num = 1, i, addr_len;
661
662 if (!dlm_local_count) {
663 init_local();
664 if (!dlm_local_count) {
665 log_print("no local IP address has been set");
666 goto out;
667 }
668 }
669
670 result = sock_create_kern(dlm_local_addr[0]->ss_family, SOCK_SEQPACKET,
671 IPPROTO_SCTP, &sock);
672 if (result < 0) {
673 log_print("Can't create comms socket, check SCTP is loaded");
674 goto out;
675 }
676
677 /* Listen for events */
678 memset(&subscribe, 0, sizeof(subscribe));
679 subscribe.sctp_data_io_event = 1;
680 subscribe.sctp_association_event = 1;
681 subscribe.sctp_send_failure_event = 1;
682 subscribe.sctp_shutdown_event = 1;
683 subscribe.sctp_partial_delivery_event = 1;
684
/* setsockopt() is handed a kernel pointer, so the address limit is widened around the call. */
685 fs = get_fs();
686 set_fs(get_ds());
687 result = sock->ops->setsockopt(sock, SOL_SCTP, SCTP_EVENTS,
688 (char *)&subscribe, sizeof(subscribe));
689 set_fs(fs);
690
691 if (result < 0) {
692 log_print("Failed to set SCTP_EVENTS on socket: result=%d",
693 result);
694 goto create_delsock;
695 }
696
697 /* Init con struct */
698 sock->sk->sk_user_data = &sctp_con;
699 sctp_con.sock = sock;
700 sctp_con.sock->sk->sk_data_ready = lowcomms_data_ready;
701
702 /* Bind to all interfaces. */
/* num starts at 1, so the first address uses bind() and the rest use BINDX_ADD (see add_bind_addr()). */
703 for (i = 0; i < dlm_local_count; i++) {
704 memcpy(&localaddr, dlm_local_addr[i], sizeof(localaddr));
705 make_sockaddr(&localaddr, dlm_config.tcp_port, &addr_len);
706
707 result = add_bind_addr(&localaddr, addr_len, num);
708 if (result)
709 goto create_delsock;
710 ++num;
711 }
712
713 result = sock->ops->listen(sock, 5);
714 if (result < 0) {
715 log_print("Can't set socket listening");
716 goto create_delsock;
717 }
718
719 return 0;
720
721 create_delsock:
722 sock_release(sock);
723 sctp_con.sock = NULL;
724 out:
725 return result;
726}
727
728
729static struct writequeue_entry *new_writequeue_entry(int allocation)
730{
731 struct writequeue_entry *entry;
732
733 entry = kmalloc(sizeof(struct writequeue_entry), allocation);
734 if (!entry)
735 return NULL;
736
737 entry->page = alloc_page(allocation);
738 if (!entry->page) {
739 kfree(entry);
740 return NULL;
741 }
742
743 entry->offset = 0;
744 entry->len = 0;
745 entry->end = 0;
746 entry->users = 0;
747
748 return entry;
749}
750
/*
 * Reserve len bytes of outgoing buffer space for nodeid.
 *
 * Returns an opaque handle (the writequeue_entry) that the caller later
 * hands to dlm_lowcomms_commit_buffer(), and stores in *ppc the address
 * the message should be written to.  Returns NULL when lowcomms is not
 * accepting, when nodeid2nodeinfo() yields nothing, or when a new entry
 * cannot be allocated.
 *
 * NOTE(review): len is never compared against PAGE_CACHE_SIZE for a freshly
 * allocated entry; presumably callers never ask for more than a page - confirm.
 */
751void *dlm_lowcomms_get_buffer(int nodeid, int len, int allocation, char **ppc)
752{
753 struct writequeue_entry *e;
754 int offset = 0;
755 int users = 0;
756 struct nodeinfo *ni;
757
758 if (!atomic_read(&accepting))
759 return NULL;
760
761 ni = nodeid2nodeinfo(nodeid, allocation);
762 if (!ni)
763 return NULL;
764
/* Try to append to the last entry already on this node's write queue. */
765 spin_lock(&ni->writequeue_lock);
766 e = list_entry(ni->writequeue.prev, struct writequeue_entry, list);
/* Either the queue is empty (prev points back at the list head; this
   comparison assumes 'list' sits at offset 0 of the entry - TODO confirm)
   or the last page does not have len bytes left. */
767 if (((struct list_head *) e == &ni->writequeue) ||
768 (PAGE_CACHE_SIZE - e->end < len)) {
769 e = NULL;
770 } else {
771 offset = e->end;
772 e->end += len;
/* post-increment: 'users' holds the count before this caller joined */
773 users = e->users++;
774 }
775 spin_unlock(&ni->writequeue_lock);
776
777 if (e) {
/* Also reached by the goto below once a new entry has been queued. */
778 got_one:
/* Only the first user maps the page; the matching kunmap() is issued by
   dlm_lowcomms_commit_buffer() when the user count drops back to zero. */
779 if (users == 0)
780 kmap(e->page);
781 *ppc = page_address(e->page) + offset;
782 return e;
783 }
784
/* Nothing suitable was queued: allocate a new entry outside the lock,
   then reserve the space and append it to the queue under the lock. */
785 e = new_writequeue_entry(allocation);
786 if (e) {
787 spin_lock(&ni->writequeue_lock);
788 offset = e->end;
789 e->end += len;
790 e->ni = ni;
791 users = e->users++;
792 list_add_tail(&e->list, &ni->writequeue);
793 spin_unlock(&ni->writequeue_lock);
794 goto got_one;
795 }
796 return NULL;
797}
798
799void dlm_lowcomms_commit_buffer(void *arg)
800{
801 struct writequeue_entry *e = (struct writequeue_entry *) arg;
802 int users;
803 struct nodeinfo *ni = e->ni;
804
805 if (!atomic_read(&accepting))
806 return;
807
808 spin_lock(&ni->writequeue_lock);
809 users = --e->users;
810 if (users)
811 goto out;
812 e->len = e->end - e->offset;
813 kunmap(e->page);
814 spin_unlock(&ni->writequeue_lock);
815
816 if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
817 spin_lock_bh(&write_nodes_lock);
818 list_add_tail(&ni->write_list, &write_nodes);
819 spin_unlock_bh(&write_nodes_lock);
820 wake_up_process(send_task);
821 }
822 return;
823
824 out:
825 spin_unlock(&ni->writequeue_lock);
826 return;
827}
828
829static void free_entry(struct writequeue_entry *e)
830{
831 __free_page(e->page);
832 kfree(e);
833}
834
835/* Initiate an SCTP association. In theory we could just use sendmsg() on
836 the first IP address and it should work, but this allows us to set up the
837 association before sending any valuable data that we can't afford to lose.
838 It also keeps the send path clean as it can now always use the association ID */
839static void initiate_association(int nodeid)
840{
841 struct sockaddr_storage rem_addr;
842 static char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
843 struct msghdr outmessage;
844 struct cmsghdr *cmsg;
845 struct sctp_sndrcvinfo *sinfo;
846 int ret;
847 int addrlen;
848 char buf[1];
849 struct kvec iov[1];
850 struct nodeinfo *ni;
851
852 log_print("Initiating association with node %d", nodeid);
853
854 ni = nodeid2nodeinfo(nodeid, GFP_KERNEL);
855 if (!ni)
856 return;
857
858 if (nodeid_to_addr(nodeid, (struct sockaddr *)&rem_addr)) {
859 log_print("no address for nodeid %d", nodeid);
860 return;
861 }
862
863 make_sockaddr(&rem_addr, dlm_config.tcp_port, &addrlen);
864
865 outmessage.msg_name = &rem_addr;
866 outmessage.msg_namelen = addrlen;
867 outmessage.msg_control = outcmsg;
868 outmessage.msg_controllen = sizeof(outcmsg);
869 outmessage.msg_flags = MSG_EOR;
870
871 iov[0].iov_base = buf;
872 iov[0].iov_len = 1;
873
874 /* Real INIT messages seem to cause trouble. Just send a 1 byte message
875 we can afford to lose */
876 cmsg = CMSG_FIRSTHDR(&outmessage);
877 cmsg->cmsg_level = IPPROTO_SCTP;
878 cmsg->cmsg_type = SCTP_SNDRCV;
879 cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
880 sinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg);
881 memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo));
882 sinfo->sinfo_ppid = cpu_to_le32(dlm_local_nodeid);
883
884 outmessage.msg_controllen = cmsg->cmsg_len;
885 ret = kernel_sendmsg(sctp_con.sock, &outmessage, iov, 1, 1);
886 if (ret < 0) {
887 log_print("send INIT to node failed: %d", ret);
888 /* Try again later */
889 clear_bit(NI_INIT_PENDING, &ni->flags);
890 }
891}
892
893/* Send a message */
894static int send_to_sock(struct nodeinfo *ni)
895{
896 int ret = 0;
897 struct writequeue_entry *e;
898 int len, offset;
899 struct msghdr outmsg;
900 static char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
901 struct cmsghdr *cmsg;
902 struct sctp_sndrcvinfo *sinfo;
903 struct kvec iov;
904
905 /* See if we need to init an association before we start
906 sending precious messages */
907 spin_lock(&ni->lock);
908 if (!ni->assoc_id && !test_and_set_bit(NI_INIT_PENDING, &ni->flags)) {
909 spin_unlock(&ni->lock);
910 initiate_association(ni->nodeid);
911 return 0;
912 }
913 spin_unlock(&ni->lock);
914
915 outmsg.msg_name = NULL; /* We use assoc_id */
916 outmsg.msg_namelen = 0;
917 outmsg.msg_control = outcmsg;
918 outmsg.msg_controllen = sizeof(outcmsg);
919 outmsg.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL | MSG_EOR;
920
921 cmsg = CMSG_FIRSTHDR(&outmsg);
922 cmsg->cmsg_level = IPPROTO_SCTP;
923 cmsg->cmsg_type = SCTP_SNDRCV;
924 cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
925 sinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg);
926 memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo));
927 sinfo->sinfo_ppid = cpu_to_le32(dlm_local_nodeid);
928 sinfo->sinfo_assoc_id = ni->assoc_id;
929 outmsg.msg_controllen = cmsg->cmsg_len;
930
931 spin_lock(&ni->writequeue_lock);
932 for (;;) {
933 if (list_empty(&ni->writequeue))
934 break;
935 e = list_entry(ni->writequeue.next, struct writequeue_entry,
936 list);
937 kmap(e->page);
938 len = e->len;
939 offset = e->offset;
940 BUG_ON(len == 0 && e->users == 0);
941 spin_unlock(&ni->writequeue_lock);
942
943 ret = 0;
944 if (len) {
945 iov.iov_base = page_address(e->page)+offset;
946 iov.iov_len = len;
947
948 ret = kernel_sendmsg(sctp_con.sock, &outmsg, &iov, 1,
949 len);
950 if (ret == -EAGAIN) {
951 sctp_con.eagain_flag = 1;
952 goto out;
953 } else if (ret < 0)
954 goto send_error;
955 } else {
956 /* Don't starve people filling buffers */
957 schedule();
958 }
959
960 spin_lock(&ni->writequeue_lock);
961 e->offset += ret;
962 e->len -= ret;
963
964 if (e->len == 0 && e->users == 0) {
965 list_del(&e->list);
966 free_entry(e);
967 continue;
968 }
969 }
970 spin_unlock(&ni->writequeue_lock);
971 out:
972 return ret;
973
974 send_error:
975 log_print("Error sending to node %d %d", ni->nodeid, ret);
976 spin_lock(&ni->lock);
977 if (!test_and_set_bit(NI_INIT_PENDING, &ni->flags)) {
978 ni->assoc_id = 0;
979 spin_unlock(&ni->lock);
980 initiate_association(ni->nodeid);
981 } else
982 spin_unlock(&ni->lock);
983
984 return ret;
985}
986
987/* Try to send any messages that are pending */
/*
 * Walks the write_nodes list.  Each node is unlinked and has its
 * NI_WRITE_PENDING bit cleared while write_nodes_lock is held; the lock is
 * then dropped around send_to_sock() and re-taken before the walk continues.
 * The return value of send_to_sock() is not examined here.
 */
988static void process_output_queue(void)
989{
990 struct list_head *list;
991 struct list_head *temp;
992
993 spin_lock_bh(&write_nodes_lock);
994 list_for_each_safe(list, temp, &write_nodes) {
995 struct nodeinfo *ni =
996 list_entry(list, struct nodeinfo, write_list);
/* Clear the flag before sending so that a commit arriving meanwhile can
   put the node back on the list. */
997 clear_bit(NI_WRITE_PENDING, &ni->flags);
998 list_del(&ni->write_list);
999
1000 spin_unlock_bh(&write_nodes_lock);
1001
1002 send_to_sock(ni);
1003 spin_lock_bh(&write_nodes_lock);
1004 }
1005 spin_unlock_bh(&write_nodes_lock);
1006}
1007
1008/* Called after we've had -EAGAIN and been woken up */
1009static void refill_write_queue(void)
1010{
1011 int i;
1012
1013 for (i=1; i<=max_nodeid; i++) {
1014 struct nodeinfo *ni = nodeid2nodeinfo(i, 0);
1015
1016 if (ni) {
1017 if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
1018 spin_lock_bh(&write_nodes_lock);
1019 list_add_tail(&ni->write_list, &write_nodes);
1020 spin_unlock_bh(&write_nodes_lock);
1021 }
1022 }
1023 }
1024}
1025
1026static void clean_one_writequeue(struct nodeinfo *ni)
1027{
1028 struct list_head *list;
1029 struct list_head *temp;
1030
1031 spin_lock(&ni->writequeue_lock);
1032 list_for_each_safe(list, temp, &ni->writequeue) {
1033 struct writequeue_entry *e =
1034 list_entry(list, struct writequeue_entry, list);
1035 list_del(&e->list);
1036 free_entry(e);
1037 }
1038 spin_unlock(&ni->writequeue_lock);
1039}
1040
1041static void clean_writequeues(void)
1042{
1043 int i;
1044
1045 for (i=1; i<=max_nodeid; i++) {
1046 struct nodeinfo *ni = nodeid2nodeinfo(i, 0);
1047 if (ni)
1048 clean_one_writequeue(ni);
1049 }
1050}
1051
1052
1053static void dealloc_nodeinfo(void)
1054{
1055 int i;
1056
1057 for (i=1; i<=max_nodeid; i++) {
1058 struct nodeinfo *ni = nodeid2nodeinfo(i, 0);
1059 if (ni) {
1060 idr_remove(&nodeinfo_idr, i);
1061 kfree(ni);
1062 }
1063 }
1064}
1065
1066int dlm_lowcomms_close(int nodeid)
1067{
1068 struct nodeinfo *ni;
1069
1070 ni = nodeid2nodeinfo(nodeid, 0);
1071 if (!ni)
1072 return -1;
1073
1074 spin_lock(&ni->lock);
1075 if (ni->assoc_id) {
1076 ni->assoc_id = 0;
1077 /* Don't send shutdown here, sctp will just queue it
1078 till the node comes back up! */
1079 }
1080 spin_unlock(&ni->lock);
1081
1082 clean_one_writequeue(ni);
1083 clear_bit(NI_INIT_PENDING, &ni->flags);
1084 return 0;
1085}
1086
1087static int write_list_empty(void)
1088{
1089 int status;
1090
1091 spin_lock_bh(&write_nodes_lock);
1092 status = list_empty(&write_nodes);
1093 spin_unlock_bh(&write_nodes_lock);
1094
1095 return status;
1096}
1097
/*
 * Receive thread body.  Until asked to stop it sleeps on lowcomms_recv_wait
 * unless CF_READ_PENDING is already set, then drains the socket by calling
 * receive_from_sock() until that returns a negative value or the thread is
 * told to stop.  Always returns 0.
 */
1098static int dlm_recvd(void *data)
1099{
1100 DECLARE_WAITQUEUE(wait, current);
1101
1102 while (!kthread_should_stop()) {
1103 int count = 0;
1104
/* The task state is set before the flag is tested, so a wake-up that
   arrives between the test and schedule() is not lost. */
1105 set_current_state(TASK_INTERRUPTIBLE);
1106 add_wait_queue(&lowcomms_recv_wait, &wait);
1107 if (!test_bit(CF_READ_PENDING, &sctp_con.flags))
1108 schedule();
1109 remove_wait_queue(&lowcomms_recv_wait, &wait);
1110 set_current_state(TASK_RUNNING);
1111
1112 if (test_and_clear_bit(CF_READ_PENDING, &sctp_con.flags)) {
1113 int ret;
1114
1115 do {
1116 ret = receive_from_sock();
1117
1118 /* Don't starve out everyone else */
1119 if (++count >= MAX_RX_MSG_COUNT) {
1120 schedule();
1121 count = 0;
1122 }
1123 } while (!kthread_should_stop() && ret >=0);
1124 }
/* The task is in TASK_RUNNING here, so this schedule() only yields the CPU. */
1125 schedule();
1126 }
1127
1128 return 0;
1129}
1130
/*
 * Send thread body.  It stays registered on the SCTP socket's sk_sleep wait
 * queue for its whole lifetime, sleeps while the write_nodes list is empty,
 * and on each pass re-queues all nodes if an earlier send hit -EAGAIN before
 * processing the output queue.  Always returns 0.
 */
1131static int dlm_sendd(void *data)
1132{
1133 DECLARE_WAITQUEUE(wait, current);
1134
1135 add_wait_queue(sctp_con.sock->sk->sk_sleep, &wait);
1136
1137 while (!kthread_should_stop()) {
/* State is set before the emptiness check so a wake_up_process() from
   dlm_lowcomms_commit_buffer() in between is not missed. */
1138 set_current_state(TASK_INTERRUPTIBLE);
1139 if (write_list_empty())
1140 schedule();
1141 set_current_state(TASK_RUNNING);
1142
/* eagain_flag is set by send_to_sock() when a send returned -EAGAIN. */
1143 if (sctp_con.eagain_flag) {
1144 sctp_con.eagain_flag = 0;
1145 refill_write_queue();
1146 }
1147 process_output_queue();
1148 }
1149
1150 remove_wait_queue(sctp_con.sock->sk->sk_sleep, &wait);
1151
1152 return 0;
1153}
1154
1155static void daemons_stop(void)
1156{
1157 kthread_stop(recv_task);
1158 kthread_stop(send_task);
1159}
1160
1161static int daemons_start(void)
1162{
1163 struct task_struct *p;
1164 int error;
1165
1166 p = kthread_run(dlm_recvd, NULL, "dlm_recvd");
1167 error = IS_ERR(p);
1168 if (error) {
1169 log_print("can't start dlm_recvd %d", error);
1170 return error;
1171 }
1172 recv_task = p;
1173
1174 p = kthread_run(dlm_sendd, NULL, "dlm_sendd");
1175 error = IS_ERR(p);
1176 if (error) {
1177 log_print("can't start dlm_sendd %d", error);
1178 kthread_stop(recv_task);
1179 return error;
1180 }
1181 send_task = p;
1182
1183 return 0;
1184}
1185
1186/*
1187 * This is quite likely to sleep...
1188 */
1189int dlm_lowcomms_start(void)
1190{
1191 int error;
1192
1193 error = init_sock();
1194 if (error)
1195 goto fail_sock;
1196 error = daemons_start();
1197 if (error)
1198 goto fail_sock;
1199 atomic_set(&accepting, 1);
1200 return 0;
1201
1202 fail_sock:
1203 close_connection();
1204 return error;
1205}
1206
1207/* Set all the activity flags to prevent any socket activity. */
1208
/*
 * Shut the comms layer down: stop accepting buffers, stop both daemons,
 * free everything still queued, close the connection, free all nodeinfo
 * structures and reset max_nodeid.
 */
1209void dlm_lowcomms_stop(void)
1210{
1211 atomic_set(&accepting, 0);
/* NOTE(review): 0x7 presumably sets the three low connection flag bits
   (the "activity flags" mentioned above) - confirm against the CF_* values. */
1212 sctp_con.flags = 0x7;
1213 daemons_stop();
1214 clean_writequeues();
1215 close_connection();
1216 dealloc_nodeinfo();
1217 max_nodeid = 0;
1218}
1219
1220int dlm_lowcomms_init(void)
1221{
1222 init_waitqueue_head(&lowcomms_recv_wait);
1223 spin_lock_init(&write_nodes_lock);
1224 INIT_LIST_HEAD(&write_nodes);
1225 init_rwsem(&nodeinfo_lock);
1226 return 0;
1227}
1228
1229void dlm_lowcomms_exit(void)
1230{
1231 int i;
1232
1233 for (i = 0; i < dlm_local_count; i++)
1234 kfree(dlm_local_addr[i]);
1235 dlm_local_count = 0;
1236 dlm_local_nodeid = 0;
1237}
1238
diff --git a/fs/dlm/lowcomms.h b/fs/dlm/lowcomms.h
new file mode 100644
index 000000000000..6c04bb09cfa8
--- /dev/null
+++ b/fs/dlm/lowcomms.h
@@ -0,0 +1,26 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#ifndef __LOWCOMMS_DOT_H__
15#define __LOWCOMMS_DOT_H__
16
17int dlm_lowcomms_init(void);
18void dlm_lowcomms_exit(void);
19int dlm_lowcomms_start(void);
20void dlm_lowcomms_stop(void);
21int dlm_lowcomms_close(int nodeid);
22void *dlm_lowcomms_get_buffer(int nodeid, int len, int allocation, char **ppc);
23void dlm_lowcomms_commit_buffer(void *mh);
24
25#endif /* __LOWCOMMS_DOT_H__ */
26
diff --git a/fs/dlm/lvb_table.h b/fs/dlm/lvb_table.h
new file mode 100644
index 000000000000..cc3e92f3feef
--- /dev/null
+++ b/fs/dlm/lvb_table.h
@@ -0,0 +1,18 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13#ifndef __LVB_TABLE_DOT_H__
14#define __LVB_TABLE_DOT_H__
15
16extern const int dlm_lvb_operations[8][8];
17
18#endif
diff --git a/fs/dlm/main.c b/fs/dlm/main.c
new file mode 100644
index 000000000000..a8da8dc36b2e
--- /dev/null
+++ b/fs/dlm/main.c
@@ -0,0 +1,97 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#include "dlm_internal.h"
15#include "lockspace.h"
16#include "lock.h"
17#include "user.h"
18#include "memory.h"
19#include "lowcomms.h"
20#include "config.h"
21
/*
 * debugfs hooks: declared here and defined elsewhere when CONFIG_DLM_DEBUG
 * is set; otherwise replaced by inline stubs (register reports success,
 * unregister does nothing) so init_dlm()/exit_dlm() need no #ifdefs.
 */
22#ifdef CONFIG_DLM_DEBUG
23int dlm_register_debugfs(void);
24void dlm_unregister_debugfs(void);
25#else
26static inline int dlm_register_debugfs(void) { return 0; }
27static inline void dlm_unregister_debugfs(void) { }
28#endif
29
/*
 * Module entry point.  Initialises the subsystems in the order memory,
 * lockspace, config, debugfs, lowcomms, user.  If a step fails, the steps
 * that already succeeded are undone in reverse order through the label
 * ladder at the bottom, and that step's error is returned.
 * Returns 0 on success.
 */
30static int __init init_dlm(void)
31{
32 int error;
33
34 error = dlm_memory_init();
35 if (error)
36 goto out;
37
38 error = dlm_lockspace_init();
39 if (error)
40 goto out_mem;
41
42 error = dlm_config_init();
43 if (error)
44 goto out_lockspace;
45
46 error = dlm_register_debugfs();
47 if (error)
48 goto out_config;
49
50 error = dlm_lowcomms_init();
51 if (error)
52 goto out_debug;
53
54 error = dlm_user_init();
55 if (error)
56 goto out_lowcomms;
57
58 printk("DLM (built %s %s) installed\n", __DATE__, __TIME__);
59
60 return 0;
61
/* Unwind ladder: each label undoes one earlier step and falls through. */
62 out_lowcomms:
63 dlm_lowcomms_exit();
64 out_debug:
65 dlm_unregister_debugfs();
66 out_config:
67 dlm_config_exit();
68 out_lockspace:
69 dlm_lockspace_exit();
70 out_mem:
71 dlm_memory_exit();
72 out:
73 return error;
74}
75
/*
 * Module exit point: tears down every subsystem that init_dlm() set up.
 *
 * NOTE(review): the order here is not the exact reverse of init_dlm()
 * (memory is torn down before lockspace, and debugfs goes last), unlike the
 * error-unwind path in init_dlm() - confirm this ordering is intentional.
 */
76static void __exit exit_dlm(void)
77{
78 dlm_user_exit();
79 dlm_lowcomms_exit();
80 dlm_config_exit();
81 dlm_memory_exit();
82 dlm_lockspace_exit();
83 dlm_unregister_debugfs();
84}
85
/* Register the module entry and exit points. */
86module_init(init_dlm);
87module_exit(exit_dlm);
88
/* Module metadata. */
89MODULE_DESCRIPTION("Distributed Lock Manager");
90MODULE_AUTHOR("Red Hat, Inc.");
91MODULE_LICENSE("GPL");
92
/* Lockspace and locking entry points exported to GPL-compatible modules. */
93EXPORT_SYMBOL_GPL(dlm_new_lockspace);
94EXPORT_SYMBOL_GPL(dlm_release_lockspace);
95EXPORT_SYMBOL_GPL(dlm_lock);
96EXPORT_SYMBOL_GPL(dlm_unlock);
97
diff --git a/fs/dlm/member.c b/fs/dlm/member.c
new file mode 100644
index 000000000000..cd0c51e724e0
--- /dev/null
+++ b/fs/dlm/member.c
@@ -0,0 +1,312 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13#include "dlm_internal.h"
14#include "lockspace.h"
15#include "member.h"
16#include "recoverd.h"
17#include "recover.h"
18#include "rcom.h"
19#include "config.h"
20
21/*
22 * Following called by dlm_recoverd thread
23 */
24
25static void add_ordered_member(struct dlm_ls *ls, struct dlm_member *new)
26{
27 struct dlm_member *memb = NULL;
28 struct list_head *tmp;
29 struct list_head *newlist = &new->list;
30 struct list_head *head = &ls->ls_nodes;
31
32 list_for_each(tmp, head) {
33 memb = list_entry(tmp, struct dlm_member, list);
34 if (new->nodeid < memb->nodeid)
35 break;
36 }
37
38 if (!memb)
39 list_add_tail(newlist, head);
40 else {
41 /* FIXME: can use list macro here */
42 newlist->prev = tmp->prev;
43 newlist->next = tmp;
44 tmp->prev->next = newlist;
45 tmp->prev = newlist;
46 }
47}
48
49static int dlm_add_member(struct dlm_ls *ls, int nodeid)
50{
51 struct dlm_member *memb;
52 int w;
53
54 memb = kzalloc(sizeof(struct dlm_member), GFP_KERNEL);
55 if (!memb)
56 return -ENOMEM;
57
58 w = dlm_node_weight(ls->ls_name, nodeid);
59 if (w < 0)
60 return w;
61
62 memb->nodeid = nodeid;
63 memb->weight = w;
64 add_ordered_member(ls, memb);
65 ls->ls_num_nodes++;
66 return 0;
67}
68
69static void dlm_remove_member(struct dlm_ls *ls, struct dlm_member *memb)
70{
71 list_move(&memb->list, &ls->ls_nodes_gone);
72 ls->ls_num_nodes--;
73}
74
75static int dlm_is_member(struct dlm_ls *ls, int nodeid)
76{
77 struct dlm_member *memb;
78
79 list_for_each_entry(memb, &ls->ls_nodes, list) {
80 if (memb->nodeid == nodeid)
81 return 1;
82 }
83 return 0;
84}
85
86int dlm_is_removed(struct dlm_ls *ls, int nodeid)
87{
88 struct dlm_member *memb;
89
90 list_for_each_entry(memb, &ls->ls_nodes_gone, list) {
91 if (memb->nodeid == nodeid)
92 return 1;
93 }
94 return 0;
95}
96
97static void clear_memb_list(struct list_head *head)
98{
99 struct dlm_member *memb;
100
101 while (!list_empty(head)) {
102 memb = list_entry(head->next, struct dlm_member, list);
103 list_del(&memb->list);
104 kfree(memb);
105 }
106}
107
108void dlm_clear_members(struct dlm_ls *ls)
109{
110 clear_memb_list(&ls->ls_nodes);
111 ls->ls_num_nodes = 0;
112}
113
114void dlm_clear_members_gone(struct dlm_ls *ls)
115{
116 clear_memb_list(&ls->ls_nodes_gone);
117}
118
119static void make_member_array(struct dlm_ls *ls)
120{
121 struct dlm_member *memb;
122 int i, w, x = 0, total = 0, all_zero = 0, *array;
123
124 kfree(ls->ls_node_array);
125 ls->ls_node_array = NULL;
126
127 list_for_each_entry(memb, &ls->ls_nodes, list) {
128 if (memb->weight)
129 total += memb->weight;
130 }
131
132 /* all nodes revert to weight of 1 if all have weight 0 */
133
134 if (!total) {
135 total = ls->ls_num_nodes;
136 all_zero = 1;
137 }
138
139 ls->ls_total_weight = total;
140
141 array = kmalloc(sizeof(int) * total, GFP_KERNEL);
142 if (!array)
143 return;
144
145 list_for_each_entry(memb, &ls->ls_nodes, list) {
146 if (!all_zero && !memb->weight)
147 continue;
148
149 if (all_zero)
150 w = 1;
151 else
152 w = memb->weight;
153
154 DLM_ASSERT(x < total, printk("total %d x %d\n", total, x););
155
156 for (i = 0; i < w; i++)
157 array[x++] = memb->nodeid;
158 }
159
160 ls->ls_node_array = array;
161}
162
163/* send a status request to all members just to establish comms connections */
164
165static void ping_members(struct dlm_ls *ls)
166{
167 struct dlm_member *memb;
168 list_for_each_entry(memb, &ls->ls_nodes, list)
169 dlm_rcom_status(ls, memb->nodeid);
170}
171
/*
 * Bring the lockspace's membership in line with the node list in rv.
 *
 * Members missing from rv->nodeids are moved to ls_nodes_gone, new node ids
 * are added, the lowest nodeid and the weighted node array are recomputed,
 * DLM_RS_NODES is set in the recovery status and all members are pinged.
 * *neg_out receives the number of members that were removed.
 * Returns the result of dlm_recover_members_wait().
 */
172int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
173{
174 struct dlm_member *memb, *safe;
175 int i, error, found, pos = 0, neg = 0, low = -1;
176
177 /* move departed members from ls_nodes to ls_nodes_gone */
178
179 list_for_each_entry_safe(memb, safe, &ls->ls_nodes, list) {
180 found = 0;
181 for (i = 0; i < rv->node_count; i++) {
182 if (memb->nodeid == rv->nodeids[i]) {
183 found = 1;
184 break;
185 }
186 }
187
188 if (!found) {
189 neg++;
190 dlm_remove_member(ls, memb);
191 log_debug(ls, "remove member %d", memb->nodeid);
192 }
193 }
194
195 /* add new members to ls_nodes */
196
197 for (i = 0; i < rv->node_count; i++) {
198 if (dlm_is_member(ls, rv->nodeids[i]))
199 continue;
/* NOTE(review): the return value of dlm_add_member() is ignored, so a
   failed add is still counted and logged as added - confirm intended.
   'pos' is only incremented here and never read in this function. */
200 dlm_add_member(ls, rv->nodeids[i]);
201 pos++;
202 log_debug(ls, "add member %d", rv->nodeids[i]);
203 }
204
/* Find the lowest nodeid among the current members (-1 if there are none). */
205 list_for_each_entry(memb, &ls->ls_nodes, list) {
206 if (low == -1 || memb->nodeid < low)
207 low = memb->nodeid;
208 }
209 ls->ls_low_nodeid = low;
210
211 make_member_array(ls);
212 dlm_set_recover_status(ls, DLM_RS_NODES);
213 *neg_out = neg;
214
215 ping_members(ls);
216
217 error = dlm_recover_members_wait(ls);
218 log_debug(ls, "total members %d", ls->ls_num_nodes);
219 return error;
220}
221
222/*
223 * Following called from lockspace.c
224 */
225
/*
 * Stop locking activity on a lockspace ahead of recovery.
 * Sets LSFL_RECOVERY_STOP, clears LSFL_RUNNING, bumps the recovery sequence
 * number, takes ls_in_recovery for writing if the lockspace was running,
 * and resets ls_recover_status while dlm_recoverd is suspended.
 * Always returns 0.
 */
226int dlm_ls_stop(struct dlm_ls *ls)
227{
228 int new;
229
230 /*
231 * A stop cancels any recovery that's in progress (see RECOVERY_STOP,
232 * dlm_recovery_stopped()) and prevents any new locks from being
233 * processed (see RUNNING, dlm_locking_stopped()).
234 */
235
236 spin_lock(&ls->ls_recover_lock);
237 set_bit(LSFL_RECOVERY_STOP, &ls->ls_flags);
/* 'new' is non-zero only if the lockspace was running before this stop */
238 new = test_and_clear_bit(LSFL_RUNNING, &ls->ls_flags);
239 ls->ls_recover_seq++;
240 spin_unlock(&ls->ls_recover_lock);
241
242 /*
243 * This in_recovery lock does two things:
244 *
245 * 1) Keeps this function from returning until all threads are out
246 * of locking routines and locking is truly stopped.
247 * 2) Keeps any new requests from being processed until it's unlocked
248 * when recovery is complete.
249 */
250
251 if (new)
252 down_write(&ls->ls_in_recovery);
253
254 /*
255 * The recoverd suspend/resume makes sure that dlm_recoverd (if
256 * running) has noticed the clearing of RUNNING above and quit
257 * processing the previous recovery. This will be true for all nodes
258 * before any nodes start the new recovery.
259 */
260
261 dlm_recoverd_suspend(ls);
262 ls->ls_recover_status = 0;
263 dlm_recoverd_resume(ls);
264 return 0;
265}
266
/*
 * Start (or restart) recovery for a stopped lockspace.
 *
 * Builds a dlm_recover request holding the current node id list and a new
 * sequence number, installs it as ls->ls_recover_args (freeing any request
 * it replaces) and kicks dlm_recoverd.
 *
 * Returns 0 on success, -ENOMEM if the request cannot be allocated,
 * -EINVAL if the lockspace has not been stopped, or the value returned by
 * dlm_nodeid_list() when that is <= 0.
 * NOTE(review): a node list of length 0 therefore returns 0 without
 * queueing any recovery - confirm that is intended.
 */
267int dlm_ls_start(struct dlm_ls *ls)
268{
269 struct dlm_recover *rv = NULL, *rv_old;
270 int *ids = NULL;
271 int error, count;
272
273 rv = kzalloc(sizeof(struct dlm_recover), GFP_KERNEL);
274 if (!rv)
275 return -ENOMEM;
276
277 error = count = dlm_nodeid_list(ls->ls_name, &ids);
278 if (error <= 0)
279 goto fail;
280
281 spin_lock(&ls->ls_recover_lock);
282
283 /* the lockspace needs to be stopped before it can be started */
284
285 if (!dlm_locking_stopped(ls)) {
286 spin_unlock(&ls->ls_recover_lock);
287 log_error(ls, "start ignored: lockspace running");
288 error = -EINVAL;
289 goto fail;
290 }
291
/* From here on rv owns ids; both are handed over to the lockspace. */
292 rv->nodeids = ids;
293 rv->node_count = count;
294 rv->seq = ++ls->ls_recover_seq;
295 rv_old = ls->ls_recover_args;
296 ls->ls_recover_args = rv;
297 spin_unlock(&ls->ls_recover_lock);
298
/* A request that was still installed is replaced; free it and its id list. */
299 if (rv_old) {
300 kfree(rv_old->nodeids);
301 kfree(rv_old);
302 }
303
304 dlm_recoverd_kick(ls);
305 return 0;
306
307 fail:
308 kfree(rv);
309 kfree(ids);
310 return error;
311}
312
diff --git a/fs/dlm/member.h b/fs/dlm/member.h
new file mode 100644
index 000000000000..927c08c19214
--- /dev/null
+++ b/fs/dlm/member.h
@@ -0,0 +1,24 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13#ifndef __MEMBER_DOT_H__
14#define __MEMBER_DOT_H__
15
16int dlm_ls_stop(struct dlm_ls *ls);
17int dlm_ls_start(struct dlm_ls *ls);
18void dlm_clear_members(struct dlm_ls *ls);
19void dlm_clear_members_gone(struct dlm_ls *ls);
20int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv,int *neg_out);
21int dlm_is_removed(struct dlm_ls *ls, int nodeid);
22
23#endif /* __MEMBER_DOT_H__ */
24
diff --git a/fs/dlm/memory.c b/fs/dlm/memory.c
new file mode 100644
index 000000000000..48dfc27861f4
--- /dev/null
+++ b/fs/dlm/memory.c
@@ -0,0 +1,115 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#include "dlm_internal.h"
15#include "config.h"
16#include "memory.h"
17
18static kmem_cache_t *lkb_cache;
19
20
21int dlm_memory_init(void)
22{
23 int ret = 0;
24
25 lkb_cache = kmem_cache_create("dlm_lkb", sizeof(struct dlm_lkb),
26 __alignof__(struct dlm_lkb), 0, NULL, NULL);
27 if (!lkb_cache)
28 ret = -ENOMEM;
29 return ret;
30}
31
32void dlm_memory_exit(void)
33{
34 if (lkb_cache)
35 kmem_cache_destroy(lkb_cache);
36}
37
38char *allocate_lvb(struct dlm_ls *ls)
39{
40 char *p;
41
42 p = kmalloc(ls->ls_lvblen, GFP_KERNEL);
43 if (p)
44 memset(p, 0, ls->ls_lvblen);
45 return p;
46}
47
/* Release a lock value block obtained from allocate_lvb(). */
void free_lvb(char *p)
{
	kfree(p);
}
52
53/* FIXME: have some minimal space built-in to rsb for the name and
54 kmalloc a separate name if needed, like dentries are done */
55
56struct dlm_rsb *allocate_rsb(struct dlm_ls *ls, int namelen)
57{
58 struct dlm_rsb *r;
59
60 DLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,);
61
62 r = kmalloc(sizeof(*r) + namelen, GFP_KERNEL);
63 if (r)
64 memset(r, 0, sizeof(*r) + namelen);
65 return r;
66}
67
68void free_rsb(struct dlm_rsb *r)
69{
70 if (r->res_lvbptr)
71 free_lvb(r->res_lvbptr);
72 kfree(r);
73}
74
75struct dlm_lkb *allocate_lkb(struct dlm_ls *ls)
76{
77 struct dlm_lkb *lkb;
78
79 lkb = kmem_cache_alloc(lkb_cache, GFP_KERNEL);
80 if (lkb)
81 memset(lkb, 0, sizeof(*lkb));
82 return lkb;
83}
84
85void free_lkb(struct dlm_lkb *lkb)
86{
87 if (lkb->lkb_flags & DLM_IFL_USER) {
88 struct dlm_user_args *ua;
89 ua = (struct dlm_user_args *)lkb->lkb_astparam;
90 if (ua) {
91 if (ua->lksb.sb_lvbptr)
92 kfree(ua->lksb.sb_lvbptr);
93 kfree(ua);
94 }
95 }
96 kmem_cache_free(lkb_cache, lkb);
97}
98
99struct dlm_direntry *allocate_direntry(struct dlm_ls *ls, int namelen)
100{
101 struct dlm_direntry *de;
102
103 DLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,);
104
105 de = kmalloc(sizeof(*de) + namelen, GFP_KERNEL);
106 if (de)
107 memset(de, 0, sizeof(*de) + namelen);
108 return de;
109}
110
/* Release a directory entry obtained from allocate_direntry(). */
void free_direntry(struct dlm_direntry *de)
{
	kfree(de);
}
115
diff --git a/fs/dlm/memory.h b/fs/dlm/memory.h
new file mode 100644
index 000000000000..6ead158ccc5c
--- /dev/null
+++ b/fs/dlm/memory.h
@@ -0,0 +1,29 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#ifndef __MEMORY_DOT_H__
15#define __MEMORY_DOT_H__
16
17int dlm_memory_init(void);
18void dlm_memory_exit(void);
19struct dlm_rsb *allocate_rsb(struct dlm_ls *ls, int namelen);
20void free_rsb(struct dlm_rsb *r);
21struct dlm_lkb *allocate_lkb(struct dlm_ls *ls);
22void free_lkb(struct dlm_lkb *l);
23struct dlm_direntry *allocate_direntry(struct dlm_ls *ls, int namelen);
24void free_direntry(struct dlm_direntry *de);
25char *allocate_lvb(struct dlm_ls *ls);
26void free_lvb(char *l);
27
28#endif /* __MEMORY_DOT_H__ */
29
diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c
new file mode 100644
index 000000000000..c9b1c3d535f4
--- /dev/null
+++ b/fs/dlm/midcomms.c
@@ -0,0 +1,140 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14/*
15 * midcomms.c
16 *
17 * This is the appallingly named "mid-level" comms layer.
18 *
19 * Its purpose is to take packets from the "real" comms layer,
20 * split them up into packets and pass them to the interested
21 * part of the locking mechanism.
22 *
23 * It also takes messages from the locking layer, formats them
24 * into packets and sends them to the comms layer.
25 */
26
27#include "dlm_internal.h"
28#include "lowcomms.h"
29#include "config.h"
30#include "rcom.h"
31#include "lock.h"
32#include "midcomms.h"
33
34
/*
 * Copy len bytes out of a circular buffer into dst.  The buffer starts at
 * base and holds limit bytes; reading begins at offset and, once the end of
 * the buffer is reached, carries on from base.
 */
static void copy_from_cb(void *dst, const void *base, unsigned offset,
			 unsigned len, unsigned limit)
{
	const char *ring = base;
	char *out = dst;
	unsigned first = len;

	/* Number of bytes that can be taken before the wrap point. */
	if (offset + first > limit)
		first = limit - offset;

	memcpy(out, ring + offset, first);

	/* Anything left over sits at the start of the buffer. */
	if (len != first)
		memcpy(out + first, ring, len - first);
}
47
48/*
49 * Called from the low-level comms layer to process a buffer of
50 * commands.
51 *
52 * Only complete messages are processed here, any "spare" bytes from
53 * the end of a buffer are saved and tacked onto the front of the next
54 * message that comes in. I doubt this will happen very often but we
55 * need to be able to cope with it and I don't want the task to be waiting
56 * for packets to come in when there is useful work to be done.
57 */
58
59int dlm_process_incoming_buffer(int nodeid, const void *base,
60 unsigned offset, unsigned len, unsigned limit)
61{
62 unsigned char __tmp[DLM_INBUF_LEN];
63 struct dlm_header *msg = (struct dlm_header *) __tmp;
64 int ret = 0;
65 int err = 0;
66 uint16_t msglen;
67 uint32_t lockspace;
68
69 while (len > sizeof(struct dlm_header)) {
70
71 /* Copy just the header to check the total length. The
72 message may wrap around the end of the buffer back to the
73 start, so we need to use a temp buffer and copy_from_cb. */
74
75 copy_from_cb(msg, base, offset, sizeof(struct dlm_header),
76 limit);
77
78 msglen = le16_to_cpu(msg->h_length);
79 lockspace = msg->h_lockspace;
80
81 err = -EINVAL;
82 if (msglen < sizeof(struct dlm_header))
83 break;
84 err = -E2BIG;
85 if (msglen > dlm_config.buffer_size) {
86 log_print("message size %d from %d too big, buf len %d",
87 msglen, nodeid, len);
88 break;
89 }
90 err = 0;
91
92 /* If only part of the full message is contained in this
93 buffer, then do nothing and wait for lowcomms to call
94 us again later with more data. We return 0 meaning
95 we've consumed none of the input buffer. */
96
97 if (msglen > len)
98 break;
99
100 /* Allocate a larger temp buffer if the full message won't fit
101 in the buffer on the stack (which should work for most
102 ordinary messages). */
103
104 if (msglen > sizeof(__tmp) &&
105 msg == (struct dlm_header *) __tmp) {
106 msg = kmalloc(dlm_config.buffer_size, GFP_KERNEL);
107 if (msg == NULL)
108 return ret;
109 }
110
111 copy_from_cb(msg, base, offset, msglen, limit);
112
113 BUG_ON(lockspace != msg->h_lockspace);
114
115 ret += msglen;
116 offset += msglen;
117 offset &= (limit - 1);
118 len -= msglen;
119
120 switch (msg->h_cmd) {
121 case DLM_MSG:
122 dlm_receive_message(msg, nodeid, 0);
123 break;
124
125 case DLM_RCOM:
126 dlm_receive_rcom(msg, nodeid);
127 break;
128
129 default:
130 log_print("unknown msg type %x from %u: %u %u %u %u",
131 msg->h_cmd, nodeid, msglen, len, offset, ret);
132 }
133 }
134
135 if (msg != (struct dlm_header *) __tmp)
136 kfree(msg);
137
138 return err ? err : ret;
139}
140
diff --git a/fs/dlm/midcomms.h b/fs/dlm/midcomms.h
new file mode 100644
index 000000000000..95852a5f111d
--- /dev/null
+++ b/fs/dlm/midcomms.h
@@ -0,0 +1,21 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#ifndef __MIDCOMMS_DOT_H__
15#define __MIDCOMMS_DOT_H__
16
17int dlm_process_incoming_buffer(int nodeid, const void *base, unsigned offset,
18 unsigned len, unsigned limit);
19
20#endif /* __MIDCOMMS_DOT_H__ */
21
diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c
new file mode 100644
index 000000000000..55fbe313340e
--- /dev/null
+++ b/fs/dlm/rcom.c
@@ -0,0 +1,457 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#include "dlm_internal.h"
15#include "lockspace.h"
16#include "member.h"
17#include "lowcomms.h"
18#include "midcomms.h"
19#include "rcom.h"
20#include "recover.h"
21#include "dir.h"
22#include "config.h"
23#include "memory.h"
24#include "lock.h"
25#include "util.h"
26
27
/* Wait predicate for dlm_wait_function(): non-zero once LSFL_RCOM_READY is
   set in ls->ls_flags. */
28static int rcom_response(struct dlm_ls *ls)
29{
30 return test_bit(LSFL_RCOM_READY, &ls->ls_flags);
31}
32
33static int create_rcom(struct dlm_ls *ls, int to_nodeid, int type, int len,
34 struct dlm_rcom **rc_ret, struct dlm_mhandle **mh_ret)
35{
36 struct dlm_rcom *rc;
37 struct dlm_mhandle *mh;
38 char *mb;
39 int mb_len = sizeof(struct dlm_rcom) + len;
40
41 mh = dlm_lowcomms_get_buffer(to_nodeid, mb_len, GFP_KERNEL, &mb);
42 if (!mh) {
43 log_print("create_rcom to %d type %d len %d ENOBUFS",
44 to_nodeid, type, len);
45 return -ENOBUFS;
46 }
47 memset(mb, 0, mb_len);
48
49 rc = (struct dlm_rcom *) mb;
50
51 rc->rc_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
52 rc->rc_header.h_lockspace = ls->ls_global_id;
53 rc->rc_header.h_nodeid = dlm_our_nodeid();
54 rc->rc_header.h_length = mb_len;
55 rc->rc_header.h_cmd = DLM_RCOM;
56
57 rc->rc_type = type;
58
59 *mh_ret = mh;
60 *rc_ret = rc;
61 return 0;
62}
63
/* Pass rc through dlm_rcom_out() and then commit the buffer behind mh to
   lowcomms.  ls is not used here. */
64static void send_rcom(struct dlm_ls *ls, struct dlm_mhandle *mh,
65 struct dlm_rcom *rc)
66{
67 dlm_rcom_out(rc);
68 dlm_lowcomms_commit_buffer(mh);
69}
70
71/* When replying to a status request, a node also sends back its
72 configuration values. The requesting node then checks that the remote
73 node is configured the same way as itself. */
74
/* Fill rf with this lockspace's lvb length and exflags. */
75static void make_config(struct dlm_ls *ls, struct rcom_config *rf)
76{
77 rf->rf_lvblen = ls->ls_lvblen;
78 rf->rf_lsflags = ls->ls_exflags;
79}
80
81static int check_config(struct dlm_ls *ls, struct rcom_config *rf, int nodeid)
82{
83 if (rf->rf_lvblen != ls->ls_lvblen ||
84 rf->rf_lsflags != ls->ls_exflags) {
85 log_error(ls, "config mismatch: %d,%x nodeid %d: %d,%x",
86 ls->ls_lvblen, ls->ls_exflags,
87 nodeid, rf->rf_lvblen, rf->rf_lsflags);
88 return -EINVAL;
89 }
90 return 0;
91}
92
93int dlm_rcom_status(struct dlm_ls *ls, int nodeid)
94{
95 struct dlm_rcom *rc;
96 struct dlm_mhandle *mh;
97 int error = 0;
98
99 memset(ls->ls_recover_buf, 0, dlm_config.buffer_size);
100
101 if (nodeid == dlm_our_nodeid()) {
102 rc = (struct dlm_rcom *) ls->ls_recover_buf;
103 rc->rc_result = dlm_recover_status(ls);
104 goto out;
105 }
106
107 error = create_rcom(ls, nodeid, DLM_RCOM_STATUS, 0, &rc, &mh);
108 if (error)
109 goto out;
110
111 send_rcom(ls, mh, rc);
112
113 error = dlm_wait_function(ls, &rcom_response);
114 clear_bit(LSFL_RCOM_READY, &ls->ls_flags);
115 if (error)
116 goto out;
117
118 rc = (struct dlm_rcom *) ls->ls_recover_buf;
119
120 if (rc->rc_result == -ESRCH) {
121 /* we pretend the remote lockspace exists with 0 status */
122 log_debug(ls, "remote node %d not ready", nodeid);
123 rc->rc_result = 0;
124 } else
125 error = check_config(ls, (struct rcom_config *) rc->rc_buf,
126 nodeid);
127 /* the caller looks at rc_result for the remote recovery status */
128 out:
129 return error;
130}
131
132static void receive_rcom_status(struct dlm_ls *ls, struct dlm_rcom *rc_in)
133{
134 struct dlm_rcom *rc;
135 struct dlm_mhandle *mh;
136 int error, nodeid = rc_in->rc_header.h_nodeid;
137
138 error = create_rcom(ls, nodeid, DLM_RCOM_STATUS_REPLY,
139 sizeof(struct rcom_config), &rc, &mh);
140 if (error)
141 return;
142 rc->rc_result = dlm_recover_status(ls);
143 make_config(ls, (struct rcom_config *) rc->rc_buf);
144
145 send_rcom(ls, mh, rc);
146}
147
/* Copy the whole reply (h_length bytes) into ls_recover_buf, set
   LSFL_RCOM_READY and wake up anyone waiting on ls_wait_general. */
148static void receive_rcom_status_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
149{
150 memcpy(ls->ls_recover_buf, rc_in, rc_in->rc_header.h_length);
151 set_bit(LSFL_RCOM_READY, &ls->ls_flags);
152 wake_up(&ls->ls_wait_general);
153}
154
155int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name, int last_len)
156{
157 struct dlm_rcom *rc;
158 struct dlm_mhandle *mh;
159 int error = 0, len = sizeof(struct dlm_rcom);
160
161 memset(ls->ls_recover_buf, 0, dlm_config.buffer_size);
162
163 if (nodeid == dlm_our_nodeid()) {
164 dlm_copy_master_names(ls, last_name, last_len,
165 ls->ls_recover_buf + len,
166 dlm_config.buffer_size - len, nodeid);
167 goto out;
168 }
169
170 error = create_rcom(ls, nodeid, DLM_RCOM_NAMES, last_len, &rc, &mh);
171 if (error)
172 goto out;
173 memcpy(rc->rc_buf, last_name, last_len);
174
175 send_rcom(ls, mh, rc);
176
177 error = dlm_wait_function(ls, &rcom_response);
178 clear_bit(LSFL_RCOM_READY, &ls->ls_flags);
179 out:
180 return error;
181}
182
183static void receive_rcom_names(struct dlm_ls *ls, struct dlm_rcom *rc_in)
184{
185 struct dlm_rcom *rc;
186 struct dlm_mhandle *mh;
187 int error, inlen, outlen;
188 int nodeid = rc_in->rc_header.h_nodeid;
189 uint32_t status = dlm_recover_status(ls);
190
191 /*
192 * We can't run dlm_dir_rebuild_send (which uses ls_nodes) while
193 * dlm_recoverd is running ls_nodes_reconfig (which changes ls_nodes).
194 * It could only happen in rare cases where we get a late NAMES
195 * message from a previous instance of recovery.
196 */
197
198 if (!(status & DLM_RS_NODES)) {
199 log_debug(ls, "ignoring RCOM_NAMES from %u", nodeid);
200 return;
201 }
202
203 nodeid = rc_in->rc_header.h_nodeid;
204 inlen = rc_in->rc_header.h_length - sizeof(struct dlm_rcom);
205 outlen = dlm_config.buffer_size - sizeof(struct dlm_rcom);
206
207 error = create_rcom(ls, nodeid, DLM_RCOM_NAMES_REPLY, outlen, &rc, &mh);
208 if (error)
209 return;
210
211 dlm_copy_master_names(ls, rc_in->rc_buf, inlen, rc->rc_buf, outlen,
212 nodeid);
213 send_rcom(ls, mh, rc);
214}
215
/* Copy the whole reply (h_length bytes) into ls_recover_buf, set
   LSFL_RCOM_READY and wake up anyone waiting on ls_wait_general. */
216static void receive_rcom_names_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
217{
218 memcpy(ls->ls_recover_buf, rc_in, rc_in->rc_header.h_length);
219 set_bit(LSFL_RCOM_READY, &ls->ls_flags);
220 wake_up(&ls->ls_wait_general);
221}
222
223int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid)
224{
225 struct dlm_rcom *rc;
226 struct dlm_mhandle *mh;
227 struct dlm_ls *ls = r->res_ls;
228 int error;
229
230 error = create_rcom(ls, dir_nodeid, DLM_RCOM_LOOKUP, r->res_length,
231 &rc, &mh);
232 if (error)
233 goto out;
234 memcpy(rc->rc_buf, r->res_name, r->res_length);
235 rc->rc_id = (unsigned long) r;
236
237 send_rcom(ls, mh, rc);
238 out:
239 return error;
240}
241
242static void receive_rcom_lookup(struct dlm_ls *ls, struct dlm_rcom *rc_in)
243{
244 struct dlm_rcom *rc;
245 struct dlm_mhandle *mh;
246 int error, ret_nodeid, nodeid = rc_in->rc_header.h_nodeid;
247 int len = rc_in->rc_header.h_length - sizeof(struct dlm_rcom);
248
249 error = create_rcom(ls, nodeid, DLM_RCOM_LOOKUP_REPLY, 0, &rc, &mh);
250 if (error)
251 return;
252
253 error = dlm_dir_lookup(ls, nodeid, rc_in->rc_buf, len, &ret_nodeid);
254 if (error)
255 ret_nodeid = error;
256 rc->rc_result = ret_nodeid;
257 rc->rc_id = rc_in->rc_id;
258
259 send_rcom(ls, mh, rc);
260}
261
/* Hand a lookup reply on to dlm_recover_master_reply(). */
262static void receive_rcom_lookup_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
263{
264 dlm_recover_master_reply(ls, rc_in);
265}
266
267static void pack_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb,
268 struct rcom_lock *rl)
269{
270 memset(rl, 0, sizeof(*rl));
271
272 rl->rl_ownpid = lkb->lkb_ownpid;
273 rl->rl_lkid = lkb->lkb_id;
274 rl->rl_exflags = lkb->lkb_exflags;
275 rl->rl_flags = lkb->lkb_flags;
276 rl->rl_lvbseq = lkb->lkb_lvbseq;
277 rl->rl_rqmode = lkb->lkb_rqmode;
278 rl->rl_grmode = lkb->lkb_grmode;
279 rl->rl_status = lkb->lkb_status;
280 rl->rl_wait_type = lkb->lkb_wait_type;
281
282 if (lkb->lkb_bastaddr)
283 rl->rl_asts |= AST_BAST;
284 if (lkb->lkb_astaddr)
285 rl->rl_asts |= AST_COMP;
286
287 rl->rl_namelen = r->res_length;
288 memcpy(rl->rl_name, r->res_name, r->res_length);
289
290 /* FIXME: might we have an lvb without DLM_LKF_VALBLK set ?
291 If so, receive_rcom_lock_args() won't take this copy. */
292
293 if (lkb->lkb_lvbptr)
294 memcpy(rl->rl_lvb, lkb->lkb_lvbptr, r->res_ls->ls_lvblen);
295}
296
297int dlm_send_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
298{
299 struct dlm_ls *ls = r->res_ls;
300 struct dlm_rcom *rc;
301 struct dlm_mhandle *mh;
302 struct rcom_lock *rl;
303 int error, len = sizeof(struct rcom_lock);
304
305 if (lkb->lkb_lvbptr)
306 len += ls->ls_lvblen;
307
308 error = create_rcom(ls, r->res_nodeid, DLM_RCOM_LOCK, len, &rc, &mh);
309 if (error)
310 goto out;
311
312 rl = (struct rcom_lock *) rc->rc_buf;
313 pack_rcom_lock(r, lkb, rl);
314 rc->rc_id = (unsigned long) r;
315
316 send_rcom(ls, mh, rc);
317 out:
318 return error;
319}
320
321static void receive_rcom_lock(struct dlm_ls *ls, struct dlm_rcom *rc_in)
322{
323 struct dlm_rcom *rc;
324 struct dlm_mhandle *mh;
325 int error, nodeid = rc_in->rc_header.h_nodeid;
326
327 dlm_recover_master_copy(ls, rc_in);
328
329 error = create_rcom(ls, nodeid, DLM_RCOM_LOCK_REPLY,
330 sizeof(struct rcom_lock), &rc, &mh);
331 if (error)
332 return;
333
334 /* We send back the same rcom_lock struct we received, but
335 dlm_recover_master_copy() has filled in rl_remid and rl_result */
336
337 memcpy(rc->rc_buf, rc_in->rc_buf, sizeof(struct rcom_lock));
338 rc->rc_id = rc_in->rc_id;
339
340 send_rcom(ls, mh, rc);
341}
342
343static void receive_rcom_lock_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
344{
345 uint32_t status = dlm_recover_status(ls);
346
347 if (!(status & DLM_RS_DIR)) {
348 log_debug(ls, "ignoring RCOM_LOCK_REPLY from %u",
349 rc_in->rc_header.h_nodeid);
350 return;
351 }
352
353 dlm_recover_process_copy(ls, rc_in);
354}
355
356static int send_ls_not_ready(int nodeid, struct dlm_rcom *rc_in)
357{
358 struct dlm_rcom *rc;
359 struct dlm_mhandle *mh;
360 char *mb;
361 int mb_len = sizeof(struct dlm_rcom);
362
363 mh = dlm_lowcomms_get_buffer(nodeid, mb_len, GFP_KERNEL, &mb);
364 if (!mh)
365 return -ENOBUFS;
366 memset(mb, 0, mb_len);
367
368 rc = (struct dlm_rcom *) mb;
369
370 rc->rc_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
371 rc->rc_header.h_lockspace = rc_in->rc_header.h_lockspace;
372 rc->rc_header.h_nodeid = dlm_our_nodeid();
373 rc->rc_header.h_length = mb_len;
374 rc->rc_header.h_cmd = DLM_RCOM;
375
376 rc->rc_type = DLM_RCOM_STATUS_REPLY;
377 rc->rc_result = -ESRCH;
378
379 dlm_rcom_out(rc);
380 dlm_lowcomms_commit_buffer(mh);
381
382 return 0;
383}
384
385/* Called by dlm_recvd; corresponds to dlm_receive_message() but special
386 recovery-only comms are sent through here. */
387
388void dlm_receive_rcom(struct dlm_header *hd, int nodeid)
389{
390 struct dlm_rcom *rc = (struct dlm_rcom *) hd;
391 struct dlm_ls *ls;
392
393 dlm_rcom_in(rc);
394
395 /* If the lockspace doesn't exist then still send a status message
396 back; it's possible that it just doesn't have its global_id yet. */
397
398 ls = dlm_find_lockspace_global(hd->h_lockspace);
399 if (!ls) {
400 log_print("lockspace %x from %d not found",
401 hd->h_lockspace, nodeid);
402 send_ls_not_ready(nodeid, rc);
403 return;
404 }
405
406 if (dlm_recovery_stopped(ls) && (rc->rc_type != DLM_RCOM_STATUS)) {
407 log_error(ls, "ignoring recovery message %x from %d",
408 rc->rc_type, nodeid);
409 goto out;
410 }
411
412 if (nodeid != rc->rc_header.h_nodeid) {
413 log_error(ls, "bad rcom nodeid %d from %d",
414 rc->rc_header.h_nodeid, nodeid);
415 goto out;
416 }
417
418 switch (rc->rc_type) {
419 case DLM_RCOM_STATUS:
420 receive_rcom_status(ls, rc);
421 break;
422
423 case DLM_RCOM_NAMES:
424 receive_rcom_names(ls, rc);
425 break;
426
427 case DLM_RCOM_LOOKUP:
428 receive_rcom_lookup(ls, rc);
429 break;
430
431 case DLM_RCOM_LOCK:
432 receive_rcom_lock(ls, rc);
433 break;
434
435 case DLM_RCOM_STATUS_REPLY:
436 receive_rcom_status_reply(ls, rc);
437 break;
438
439 case DLM_RCOM_NAMES_REPLY:
440 receive_rcom_names_reply(ls, rc);
441 break;
442
443 case DLM_RCOM_LOOKUP_REPLY:
444 receive_rcom_lookup_reply(ls, rc);
445 break;
446
447 case DLM_RCOM_LOCK_REPLY:
448 receive_rcom_lock_reply(ls, rc);
449 break;
450
451 default:
452 DLM_ASSERT(0, printk("rc_type=%x\n", rc->rc_type););
453 }
454 out:
455 dlm_put_lockspace(ls);
456}
457
diff --git a/fs/dlm/rcom.h b/fs/dlm/rcom.h
new file mode 100644
index 000000000000..d7984321ff41
--- /dev/null
+++ b/fs/dlm/rcom.h
@@ -0,0 +1,24 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#ifndef __RCOM_DOT_H__
15#define __RCOM_DOT_H__
16
17int dlm_rcom_status(struct dlm_ls *ls, int nodeid);
18int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name,int last_len);
19int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid);
20int dlm_send_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb);
21void dlm_receive_rcom(struct dlm_header *hd, int nodeid);
22
23#endif
24
diff --git a/fs/dlm/recover.c b/fs/dlm/recover.c
new file mode 100644
index 000000000000..1a86dfc8034e
--- /dev/null
+++ b/fs/dlm/recover.c
@@ -0,0 +1,776 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#include "dlm_internal.h"
15#include "lockspace.h"
16#include "dir.h"
17#include "config.h"
18#include "ast.h"
19#include "memory.h"
20#include "rcom.h"
21#include "lock.h"
22#include "lowcomms.h"
23#include "member.h"
24#include "recover.h"
25
26
27/*
28 * Recovery waiting routines: these functions wait for a particular reply from
29 * a remote node, or for the remote node to report a certain status. They need
30 * to abort if the lockspace is stopped indicating a node has failed (perhaps
31 * the one being waited for).
32 */
33
34/*
35 * Wait until given function returns non-zero or lockspace is stopped
36 * (LS_RECOVERY_STOP set due to failure of a node in ls_nodes). When another
37 * function thinks it could have completed the waited-on task, they should wake
38 * up ls_wait_general to get an immediate response rather than waiting for the
39 * timer to detect the result. A timer wakes us up periodically while waiting
40 * to see if we should abort due to a node failure. This should only be called
41 * by the dlm_recoverd thread.
42 */
43
/* Timer callback; data is the struct dlm_ls pointer.  Re-arms ls_timer for
   dlm_config.recover_timer seconds from now and wakes ls_wait_general so
   that a waiter re-checks its condition. */
44static void dlm_wait_timer_fn(unsigned long data)
45{
46 struct dlm_ls *ls = (struct dlm_ls *) data;
47 mod_timer(&ls->ls_timer, jiffies + (dlm_config.recover_timer * HZ));
48 wake_up(&ls->ls_wait_general);
49}
50
51int dlm_wait_function(struct dlm_ls *ls, int (*testfn) (struct dlm_ls *ls))
52{
53 int error = 0;
54
55 init_timer(&ls->ls_timer);
56 ls->ls_timer.function = dlm_wait_timer_fn;
57 ls->ls_timer.data = (long) ls;
58 ls->ls_timer.expires = jiffies + (dlm_config.recover_timer * HZ);
59 add_timer(&ls->ls_timer);
60
61 wait_event(ls->ls_wait_general, testfn(ls) || dlm_recovery_stopped(ls));
62 del_timer_sync(&ls->ls_timer);
63
64 if (dlm_recovery_stopped(ls)) {
65 log_debug(ls, "dlm_wait_function aborted");
66 error = -EINTR;
67 }
68 return error;
69}
70
71/*
72 * An efficient way for all nodes to wait for all others to have a certain
73 * status. The node with the lowest nodeid polls all the others for their
74 * status (wait_status_all) and all the others poll the node with the low id
75 * for its accumulated result (wait_status_low). When all nodes have set
76 * status flag X, then status flag X_ALL will be set on the low nodeid.
77 */
78
/* Return a snapshot of ls_recover_status, read under ls_recover_lock. */
79uint32_t dlm_recover_status(struct dlm_ls *ls)
80{
81 uint32_t status;
82 spin_lock(&ls->ls_recover_lock);
83 status = ls->ls_recover_status;
84 spin_unlock(&ls->ls_recover_lock);
85 return status;
86}
87
/* OR the given bits into ls_recover_status under ls_recover_lock; bits
   that are already set stay set. */
88void dlm_set_recover_status(struct dlm_ls *ls, uint32_t status)
89{
90 spin_lock(&ls->ls_recover_lock);
91 ls->ls_recover_status |= status;
92 spin_unlock(&ls->ls_recover_lock);
93}
94
95static int wait_status_all(struct dlm_ls *ls, uint32_t wait_status)
96{
97 struct dlm_rcom *rc = (struct dlm_rcom *) ls->ls_recover_buf;
98 struct dlm_member *memb;
99 int error = 0, delay;
100
101 list_for_each_entry(memb, &ls->ls_nodes, list) {
102 delay = 0;
103 for (;;) {
104 if (dlm_recovery_stopped(ls)) {
105 error = -EINTR;
106 goto out;
107 }
108
109 error = dlm_rcom_status(ls, memb->nodeid);
110 if (error)
111 goto out;
112
113 if (rc->rc_result & wait_status)
114 break;
115 if (delay < 1000)
116 delay += 20;
117 msleep(delay);
118 }
119 }
120 out:
121 return error;
122}
123
124static int wait_status_low(struct dlm_ls *ls, uint32_t wait_status)
125{
126 struct dlm_rcom *rc = (struct dlm_rcom *) ls->ls_recover_buf;
127 int error = 0, delay = 0, nodeid = ls->ls_low_nodeid;
128
129 for (;;) {
130 if (dlm_recovery_stopped(ls)) {
131 error = -EINTR;
132 goto out;
133 }
134
135 error = dlm_rcom_status(ls, nodeid);
136 if (error)
137 break;
138
139 if (rc->rc_result & wait_status)
140 break;
141 if (delay < 1000)
142 delay += 20;
143 msleep(delay);
144 }
145 out:
146 return error;
147}
148
149static int wait_status(struct dlm_ls *ls, uint32_t status)
150{
151 uint32_t status_all = status << 1;
152 int error;
153
154 if (ls->ls_low_nodeid == dlm_our_nodeid()) {
155 error = wait_status_all(ls, status);
156 if (!error)
157 dlm_set_recover_status(ls, status_all);
158 } else
159 error = wait_status_low(ls, status_all);
160
161 return error;
162}
163
/* Wait, via wait_status(), for the DLM_RS_NODES status; returns its result. */
164int dlm_recover_members_wait(struct dlm_ls *ls)
165{
166 return wait_status(ls, DLM_RS_NODES);
167}
168
/* Wait, via wait_status(), for the DLM_RS_DIR status; returns its result. */
169int dlm_recover_directory_wait(struct dlm_ls *ls)
170{
171 return wait_status(ls, DLM_RS_DIR);
172}
173
/* Wait, via wait_status(), for the DLM_RS_LOCKS status; returns its result. */
174int dlm_recover_locks_wait(struct dlm_ls *ls)
175{
176 return wait_status(ls, DLM_RS_LOCKS);
177}
178
/* Wait, via wait_status(), for the DLM_RS_DONE status; returns its result. */
179int dlm_recover_done_wait(struct dlm_ls *ls)
180{
181 return wait_status(ls, DLM_RS_DONE);
182}
183
184/*
185 * The recover_list contains all the rsb's for which we've requested the new
186 * master nodeid. As replies are returned from the resource directories the
187 * rsb's are removed from the list. When the list is empty we're done.
188 *
189 * The recover_list is later similarly used for all rsb's for which we've sent
190 * new lkb's and need to receive new corresponding lkid's.
191 *
192 * We use the address of the rsb struct as a simple local identifier for the
193 * rsb so we can match an rcom reply with the rsb it was sent for.
194 */
195
/* Non-zero when ls_recover_list is empty, checked under
   ls_recover_list_lock.  Also used as a dlm_wait_function() predicate. */
196static int recover_list_empty(struct dlm_ls *ls)
197{
198 int empty;
199
200 spin_lock(&ls->ls_recover_list_lock);
201 empty = list_empty(&ls->ls_recover_list);
202 spin_unlock(&ls->ls_recover_list_lock);
203
204 return empty;
205}
206
/* Append r to its lockspace's ls_recover_list unless r's res_recover_list
   link is already in use.  When added, ls_recover_list_count is bumped and
   dlm_hold_rsb() is called on r.  All done under ls_recover_list_lock. */
207static void recover_list_add(struct dlm_rsb *r)
208{
209 struct dlm_ls *ls = r->res_ls;
210
211 spin_lock(&ls->ls_recover_list_lock);
212 if (list_empty(&r->res_recover_list)) {
213 list_add_tail(&r->res_recover_list, &ls->ls_recover_list);
214 ls->ls_recover_list_count++;
215 dlm_hold_rsb(r);
216 }
217 spin_unlock(&ls->ls_recover_list_lock);
218}
219
/* Unlink r from ls_recover_list and decrement ls_recover_list_count under
   ls_recover_list_lock, then call dlm_put_rsb() on r outside the lock.
   The count is decremented unconditionally. */
220static void recover_list_del(struct dlm_rsb *r)
221{
222 struct dlm_ls *ls = r->res_ls;
223
224 spin_lock(&ls->ls_recover_list_lock);
225 list_del_init(&r->res_recover_list);
226 ls->ls_recover_list_count--;
227 spin_unlock(&ls->ls_recover_list_lock);
228
229 dlm_put_rsb(r);
230}
231
232static struct dlm_rsb *recover_list_find(struct dlm_ls *ls, uint64_t id)
233{
234 struct dlm_rsb *r = NULL;
235
236 spin_lock(&ls->ls_recover_list_lock);
237
238 list_for_each_entry(r, &ls->ls_recover_list, res_recover_list) {
239 if (id == (unsigned long) r)
240 goto out;
241 }
242 r = NULL;
243 out:
244 spin_unlock(&ls->ls_recover_list_lock);
245 return r;
246}
247
/* Empty ls_recover_list: unlink every rsb, call dlm_put_rsb() on it and
   decrement ls_recover_list_count.  If the count does not end up at zero it
   is logged and reset.  Runs under ls_recover_list_lock. */
248static void recover_list_clear(struct dlm_ls *ls)
249{
250 struct dlm_rsb *r, *s;
251
252 spin_lock(&ls->ls_recover_list_lock);
253 list_for_each_entry_safe(r, s, &ls->ls_recover_list, res_recover_list) {
254 list_del_init(&r->res_recover_list);
255 dlm_put_rsb(r);
256 ls->ls_recover_list_count--;
257 }
258
259 if (ls->ls_recover_list_count != 0) {
260 log_error(ls, "warning: recover_list_count %d",
261 ls->ls_recover_list_count);
262 ls->ls_recover_list_count = 0;
263 }
264 spin_unlock(&ls->ls_recover_list_lock);
265}
266
267
268/* Master recovery: find new master node for rsb's that were
269 mastered on nodes that have been removed.
270
271 dlm_recover_masters
272 recover_master
273 dlm_send_rcom_lookup -> receive_rcom_lookup
274 dlm_dir_lookup
275 receive_rcom_lookup_reply <-
276 dlm_recover_master_reply
277 set_new_master
278 set_master_lkbs
279 set_lock_master
280*/
281
282/*
283 * Set the lock master for all LKBs in a lock queue
284 * If we are the new master of the rsb, we may have received new
285 * MSTCPY locks from other nodes already which we need to ignore
286 * when setting the new nodeid.
287 */
288
289static void set_lock_master(struct list_head *queue, int nodeid)
290{
291 struct dlm_lkb *lkb;
292
293 list_for_each_entry(lkb, queue, lkb_statequeue)
294 if (!(lkb->lkb_flags & DLM_IFL_MSTCPY))
295 lkb->lkb_nodeid = nodeid;
296}
297
/* Apply r->res_nodeid to the lkbs on r's grant, convert and wait queues. */
298static void set_master_lkbs(struct dlm_rsb *r)
299{
300 set_lock_master(&r->res_grantqueue, r->res_nodeid);
301 set_lock_master(&r->res_convertqueue, r->res_nodeid);
302 set_lock_master(&r->res_waitqueue, r->res_nodeid);
303}
304
305/*
306 * Propagate the new master nodeid to locks
307 * The NEW_MASTER flag tells dlm_recover_locks() which rsb's to consider.
308 * The NEW_MASTER2 flag tells recover_lvb() and set_locks_purged() which
309 * rsb's to consider.
310 */
311
/* With the rsb locked: record nodeid as r's master, push it to the lkbs on
   all three queues and set both RSB_NEW_MASTER and RSB_NEW_MASTER2. */
312static void set_new_master(struct dlm_rsb *r, int nodeid)
313{
314 lock_rsb(r);
315 r->res_nodeid = nodeid;
316 set_master_lkbs(r);
317 rsb_set_flag(r, RSB_NEW_MASTER);
318 rsb_set_flag(r, RSB_NEW_MASTER2);
319 unlock_rsb(r);
320}
321
322/*
323 * We do async lookups on rsb's that need new masters. The rsb's
324 * waiting for a lookup reply are kept on the recover_list.
325 */
326
327static int recover_master(struct dlm_rsb *r)
328{
329 struct dlm_ls *ls = r->res_ls;
330 int error, dir_nodeid, ret_nodeid, our_nodeid = dlm_our_nodeid();
331
332 dir_nodeid = dlm_dir_nodeid(r);
333
334 if (dir_nodeid == our_nodeid) {
335 error = dlm_dir_lookup(ls, our_nodeid, r->res_name,
336 r->res_length, &ret_nodeid);
337 if (error)
338 log_error(ls, "recover dir lookup error %d", error);
339
340 if (ret_nodeid == our_nodeid)
341 ret_nodeid = 0;
342 set_new_master(r, ret_nodeid);
343 } else {
344 recover_list_add(r);
345 error = dlm_send_rcom_lookup(r, dir_nodeid);
346 }
347
348 return error;
349}
350
351/*
352 * When not using a directory, most resource names will hash to a new static
353 * master nodeid and the resource will need to be remastered.
354 */
355
356static int recover_master_static(struct dlm_rsb *r)
357{
358 int master = dlm_dir_nodeid(r);
359
360 if (master == dlm_our_nodeid())
361 master = 0;
362
363 if (r->res_nodeid != master) {
364 if (is_master(r))
365 dlm_purge_mstcpy_locks(r);
366 set_new_master(r, master);
367 return 1;
368 }
369 return 0;
370}
371
372/*
373 * Go through local root resources and for each rsb which has a master which
374 * has departed, get the new master nodeid from the directory. The dir will
375 * assign mastery to the first node to look up the new master. That means
376 * we'll discover in this lookup if we're the new master of any rsb's.
377 *
378 * We fire off all the dir lookup requests individually and asynchronously to
379 * the correct dir node.
380 */
381
382int dlm_recover_masters(struct dlm_ls *ls)
383{
384 struct dlm_rsb *r;
385 int error = 0, count = 0;
386
387 log_debug(ls, "dlm_recover_masters");
388
389 down_read(&ls->ls_root_sem);
390 list_for_each_entry(r, &ls->ls_root_list, res_root_list) {
391 if (dlm_recovery_stopped(ls)) {
392 up_read(&ls->ls_root_sem);
393 error = -EINTR;
394 goto out;
395 }
396
397 if (dlm_no_directory(ls))
398 count += recover_master_static(r);
399 else if (!is_master(r) && dlm_is_removed(ls, r->res_nodeid)) {
400 recover_master(r);
401 count++;
402 }
403
404 schedule();
405 }
406 up_read(&ls->ls_root_sem);
407
408 log_debug(ls, "dlm_recover_masters %d resources", count);
409
410 error = dlm_wait_function(ls, &recover_list_empty);
411 out:
412 if (error)
413 recover_list_clear(ls);
414 return error;
415}
416
417int dlm_recover_master_reply(struct dlm_ls *ls, struct dlm_rcom *rc)
418{
419 struct dlm_rsb *r;
420 int nodeid;
421
422 r = recover_list_find(ls, rc->rc_id);
423 if (!r) {
424 log_error(ls, "dlm_recover_master_reply no id %llx",
425 (unsigned long long)rc->rc_id);
426 goto out;
427 }
428
429 nodeid = rc->rc_result;
430 if (nodeid == dlm_our_nodeid())
431 nodeid = 0;
432
433 set_new_master(r, nodeid);
434 recover_list_del(r);
435
436 if (recover_list_empty(ls))
437 wake_up(&ls->ls_wait_general);
438 out:
439 return 0;
440}
441
442
443/* Lock recovery: rebuild the process-copy locks we hold on a
444 remastered rsb on the new rsb master.
445
446 dlm_recover_locks
447 recover_locks
448 recover_locks_queue
449 dlm_send_rcom_lock -> receive_rcom_lock
450 dlm_recover_master_copy
451 receive_rcom_lock_reply <-
452 dlm_recover_process_copy
453*/
454
455
456/*
457 * keep a count of the number of lkb's we send to the new master; when we get
458 * an equal number of replies then recovery for the rsb is done
459 */
460
461static int recover_locks_queue(struct dlm_rsb *r, struct list_head *head)
462{
463 struct dlm_lkb *lkb;
464 int error = 0;
465
466 list_for_each_entry(lkb, head, lkb_statequeue) {
467 error = dlm_send_rcom_lock(r, lkb);
468 if (error)
469 break;
470 r->res_recover_locks_count++;
471 }
472
473 return error;
474}
475
476static int all_queues_empty(struct dlm_rsb *r)
477{
478 if (!list_empty(&r->res_grantqueue) ||
479 !list_empty(&r->res_convertqueue) ||
480 !list_empty(&r->res_waitqueue))
481 return 0;
482 return 1;
483}
484
485static int recover_locks(struct dlm_rsb *r)
486{
487 int error = 0;
488
489 lock_rsb(r);
490 if (all_queues_empty(r))
491 goto out;
492
493 DLM_ASSERT(!r->res_recover_locks_count, dlm_print_rsb(r););
494
495 error = recover_locks_queue(r, &r->res_grantqueue);
496 if (error)
497 goto out;
498 error = recover_locks_queue(r, &r->res_convertqueue);
499 if (error)
500 goto out;
501 error = recover_locks_queue(r, &r->res_waitqueue);
502 if (error)
503 goto out;
504
505 if (r->res_recover_locks_count)
506 recover_list_add(r);
507 else
508 rsb_clear_flag(r, RSB_NEW_MASTER);
509 out:
510 unlock_rsb(r);
511 return error;
512}
513
514int dlm_recover_locks(struct dlm_ls *ls)
515{
516 struct dlm_rsb *r;
517 int error, count = 0;
518
519 log_debug(ls, "dlm_recover_locks");
520
521 down_read(&ls->ls_root_sem);
522 list_for_each_entry(r, &ls->ls_root_list, res_root_list) {
523 if (is_master(r)) {
524 rsb_clear_flag(r, RSB_NEW_MASTER);
525 continue;
526 }
527
528 if (!rsb_flag(r, RSB_NEW_MASTER))
529 continue;
530
531 if (dlm_recovery_stopped(ls)) {
532 error = -EINTR;
533 up_read(&ls->ls_root_sem);
534 goto out;
535 }
536
537 error = recover_locks(r);
538 if (error) {
539 up_read(&ls->ls_root_sem);
540 goto out;
541 }
542
543 count += r->res_recover_locks_count;
544 }
545 up_read(&ls->ls_root_sem);
546
547 log_debug(ls, "dlm_recover_locks %d locks", count);
548
549 error = dlm_wait_function(ls, &recover_list_empty);
550 out:
551 if (error)
552 recover_list_clear(ls);
553 else
554 dlm_set_recover_status(ls, DLM_RS_LOCKS);
555 return error;
556}
557
558void dlm_recovered_lock(struct dlm_rsb *r)
559{
560 DLM_ASSERT(rsb_flag(r, RSB_NEW_MASTER), dlm_print_rsb(r););
561
562 r->res_recover_locks_count--;
563 if (!r->res_recover_locks_count) {
564 rsb_clear_flag(r, RSB_NEW_MASTER);
565 recover_list_del(r);
566 }
567
568 if (recover_list_empty(r->res_ls))
569 wake_up(&r->res_ls->ls_wait_general);
570}
571
572/*
573 * The lvb needs to be recovered on all master rsb's. This includes setting
574 * the VALNOTVALID flag if necessary, and determining the correct lvb contents
575 * based on the lvb's of the locks held on the rsb.
576 *
577 * RSB_VALNOTVALID is set if there are only NL/CR locks on the rsb. If it
578 * was already set prior to recovery, it's not cleared, regardless of locks.
579 *
580 * The LVB contents are only considered for changing when this is a new master
581 * of the rsb (NEW_MASTER2). Then, the rsb's lvb is taken from any lkb with
582 * mode > CR. If no lkb's exist with mode above CR, the lvb contents are taken
583 * from the lkb with the largest lvb sequence number.
584 */
585
586static void recover_lvb(struct dlm_rsb *r)
587{
588 struct dlm_lkb *lkb, *high_lkb = NULL;
589 uint32_t high_seq = 0;
590 int lock_lvb_exists = 0;
591 int big_lock_exists = 0;
592 int lvblen = r->res_ls->ls_lvblen;
593
594 list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) {
595 if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
596 continue;
597
598 lock_lvb_exists = 1;
599
600 if (lkb->lkb_grmode > DLM_LOCK_CR) {
601 big_lock_exists = 1;
602 goto setflag;
603 }
604
605 if (((int)lkb->lkb_lvbseq - (int)high_seq) >= 0) {
606 high_lkb = lkb;
607 high_seq = lkb->lkb_lvbseq;
608 }
609 }
610
611 list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue) {
612 if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
613 continue;
614
615 lock_lvb_exists = 1;
616
617 if (lkb->lkb_grmode > DLM_LOCK_CR) {
618 big_lock_exists = 1;
619 goto setflag;
620 }
621
622 if (((int)lkb->lkb_lvbseq - (int)high_seq) >= 0) {
623 high_lkb = lkb;
624 high_seq = lkb->lkb_lvbseq;
625 }
626 }
627
628 setflag:
629 if (!lock_lvb_exists)
630 goto out;
631
632 if (!big_lock_exists)
633 rsb_set_flag(r, RSB_VALNOTVALID);
634
635 /* don't mess with the lvb unless we're the new master */
636 if (!rsb_flag(r, RSB_NEW_MASTER2))
637 goto out;
638
639 if (!r->res_lvbptr) {
640 r->res_lvbptr = allocate_lvb(r->res_ls);
641 if (!r->res_lvbptr)
642 goto out;
643 }
644
645 if (big_lock_exists) {
646 r->res_lvbseq = lkb->lkb_lvbseq;
647 memcpy(r->res_lvbptr, lkb->lkb_lvbptr, lvblen);
648 } else if (high_lkb) {
649 r->res_lvbseq = high_lkb->lkb_lvbseq;
650 memcpy(r->res_lvbptr, high_lkb->lkb_lvbptr, lvblen);
651 } else {
652 r->res_lvbseq = 0;
653 memset(r->res_lvbptr, 0, lvblen);
654 }
655 out:
656 return;
657}
658
659/* All master rsb's flagged RECOVER_CONVERT need to be looked at. The locks
660 converting PR->CW or CW->PR need to have their lkb_grmode set. */
661
662static void recover_conversion(struct dlm_rsb *r)
663{
664 struct dlm_lkb *lkb;
665 int grmode = -1;
666
667 list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) {
668 if (lkb->lkb_grmode == DLM_LOCK_PR ||
669 lkb->lkb_grmode == DLM_LOCK_CW) {
670 grmode = lkb->lkb_grmode;
671 break;
672 }
673 }
674
675 list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue) {
676 if (lkb->lkb_grmode != DLM_LOCK_IV)
677 continue;
678 if (grmode == -1)
679 lkb->lkb_grmode = lkb->lkb_rqmode;
680 else
681 lkb->lkb_grmode = grmode;
682 }
683}
684
685/* We've become the new master for this rsb and waiting/converting locks may
686 need to be granted in dlm_grant_after_purge() due to locks that may have
687 existed from a removed node. */
688
689static void set_locks_purged(struct dlm_rsb *r)
690{
691 if (!list_empty(&r->res_waitqueue) || !list_empty(&r->res_convertqueue))
692 rsb_set_flag(r, RSB_LOCKS_PURGED);
693}
694
695void dlm_recover_rsbs(struct dlm_ls *ls)
696{
697 struct dlm_rsb *r;
698 int count = 0;
699
700 log_debug(ls, "dlm_recover_rsbs");
701
702 down_read(&ls->ls_root_sem);
703 list_for_each_entry(r, &ls->ls_root_list, res_root_list) {
704 lock_rsb(r);
705 if (is_master(r)) {
706 if (rsb_flag(r, RSB_RECOVER_CONVERT))
707 recover_conversion(r);
708 if (rsb_flag(r, RSB_NEW_MASTER2))
709 set_locks_purged(r);
710 recover_lvb(r);
711 count++;
712 }
713 rsb_clear_flag(r, RSB_RECOVER_CONVERT);
714 rsb_clear_flag(r, RSB_NEW_MASTER2);
715 unlock_rsb(r);
716 }
717 up_read(&ls->ls_root_sem);
718
719 log_debug(ls, "dlm_recover_rsbs %d rsbs", count);
720}
721
722/* Create a single list of all root rsb's to be used during recovery */
723
724int dlm_create_root_list(struct dlm_ls *ls)
725{
726 struct dlm_rsb *r;
727 int i, error = 0;
728
729 down_write(&ls->ls_root_sem);
730 if (!list_empty(&ls->ls_root_list)) {
731 log_error(ls, "root list not empty");
732 error = -EINVAL;
733 goto out;
734 }
735
736 for (i = 0; i < ls->ls_rsbtbl_size; i++) {
737 read_lock(&ls->ls_rsbtbl[i].lock);
738 list_for_each_entry(r, &ls->ls_rsbtbl[i].list, res_hashchain) {
739 list_add(&r->res_root_list, &ls->ls_root_list);
740 dlm_hold_rsb(r);
741 }
742 read_unlock(&ls->ls_rsbtbl[i].lock);
743 }
744 out:
745 up_write(&ls->ls_root_sem);
746 return error;
747}
748
749void dlm_release_root_list(struct dlm_ls *ls)
750{
751 struct dlm_rsb *r, *safe;
752
753 down_write(&ls->ls_root_sem);
754 list_for_each_entry_safe(r, safe, &ls->ls_root_list, res_root_list) {
755 list_del_init(&r->res_root_list);
756 dlm_put_rsb(r);
757 }
758 up_write(&ls->ls_root_sem);
759}
760
761void dlm_clear_toss_list(struct dlm_ls *ls)
762{
763 struct dlm_rsb *r, *safe;
764 int i;
765
766 for (i = 0; i < ls->ls_rsbtbl_size; i++) {
767 write_lock(&ls->ls_rsbtbl[i].lock);
768 list_for_each_entry_safe(r, safe, &ls->ls_rsbtbl[i].toss,
769 res_hashchain) {
770 list_del(&r->res_hashchain);
771 free_rsb(r);
772 }
773 write_unlock(&ls->ls_rsbtbl[i].lock);
774 }
775}
776
diff --git a/fs/dlm/recover.h b/fs/dlm/recover.h
new file mode 100644
index 000000000000..ebd0363f1e08
--- /dev/null
+++ b/fs/dlm/recover.h
@@ -0,0 +1,34 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
#ifndef __RECOVER_DOT_H__
#define __RECOVER_DOT_H__

/* Recovery entry points implemented in recover.c. */

/* waiting and recovery status */
int dlm_wait_function(struct dlm_ls *ls, int (*testfn) (struct dlm_ls *ls));
uint32_t dlm_recover_status(struct dlm_ls *ls);
void dlm_set_recover_status(struct dlm_ls *ls, uint32_t status);
int dlm_recover_members_wait(struct dlm_ls *ls);
int dlm_recover_directory_wait(struct dlm_ls *ls);
int dlm_recover_locks_wait(struct dlm_ls *ls);
int dlm_recover_done_wait(struct dlm_ls *ls);

/* master and lock recovery; the int-returning ones return 0 or an error */
int dlm_recover_masters(struct dlm_ls *ls);
int dlm_recover_master_reply(struct dlm_ls *ls, struct dlm_rcom *rc);
int dlm_recover_locks(struct dlm_ls *ls);
void dlm_recovered_lock(struct dlm_rsb *r);

/* root rsb list and toss list handling */
int dlm_create_root_list(struct dlm_ls *ls);
void dlm_release_root_list(struct dlm_ls *ls);
void dlm_clear_toss_list(struct dlm_ls *ls);

/* final per-rsb fixups (conversions, lvb) on master rsbs */
void dlm_recover_rsbs(struct dlm_ls *ls);

#endif				/* __RECOVER_DOT_H__ */
34
diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c
new file mode 100644
index 000000000000..70103533677d
--- /dev/null
+++ b/fs/dlm/recoverd.c
@@ -0,0 +1,285 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#include "dlm_internal.h"
15#include "lockspace.h"
16#include "member.h"
17#include "dir.h"
18#include "ast.h"
19#include "recover.h"
20#include "lowcomms.h"
21#include "lock.h"
22#include "requestqueue.h"
23#include "recoverd.h"
24
25
26/* If the start for which we're re-enabling locking (seq) has been superseded
27 by a newer stop (ls_recover_seq), we need to leave locking disabled. */
28
29static int enable_locking(struct dlm_ls *ls, uint64_t seq)
30{
31 int error = -EINTR;
32
33 spin_lock(&ls->ls_recover_lock);
34 if (ls->ls_recover_seq == seq) {
35 set_bit(LSFL_RUNNING, &ls->ls_flags);
36 up_write(&ls->ls_in_recovery);
37 error = 0;
38 }
39 spin_unlock(&ls->ls_recover_lock);
40 return error;
41}
42
43static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
44{
45 unsigned long start;
46 int error, neg = 0;
47
48 log_debug(ls, "recover %llx", rv->seq);
49
50 mutex_lock(&ls->ls_recoverd_active);
51
52 /*
53 * Suspending and resuming dlm_astd ensures that no lkb's from this ls
54 * will be processed by dlm_astd during recovery.
55 */
56
57 dlm_astd_suspend();
58 dlm_astd_resume();
59
60 /*
61 * This list of root rsb's will be the basis of most of the recovery
62 * routines.
63 */
64
65 dlm_create_root_list(ls);
66
67 /*
68 * Free all the tossed rsb's so we don't have to recover them.
69 */
70
71 dlm_clear_toss_list(ls);
72
73 /*
74 * Add or remove nodes from the lockspace's ls_nodes list.
75 * Also waits for all nodes to complete dlm_recover_members.
76 */
77
78 error = dlm_recover_members(ls, rv, &neg);
79 if (error) {
80 log_error(ls, "recover_members failed %d", error);
81 goto fail;
82 }
83 start = jiffies;
84
85 /*
86 * Rebuild our own share of the directory by collecting from all other
87 * nodes their master rsb names that hash to us.
88 */
89
90 error = dlm_recover_directory(ls);
91 if (error) {
92 log_error(ls, "recover_directory failed %d", error);
93 goto fail;
94 }
95
96 /*
97 * Purge directory-related requests that are saved in requestqueue.
98 * All dir requests from before recovery are invalid now due to the dir
99 * rebuild and will be resent by the requesting nodes.
100 */
101
102 dlm_purge_requestqueue(ls);
103
104 /*
105 * Wait for all nodes to complete directory rebuild.
106 */
107
108 error = dlm_recover_directory_wait(ls);
109 if (error) {
110 log_error(ls, "recover_directory_wait failed %d", error);
111 goto fail;
112 }
113
114 /*
115 * We may have outstanding operations that are waiting for a reply from
116 * a failed node. Mark these to be resent after recovery. Unlock and
117 * cancel ops can just be completed.
118 */
119
120 dlm_recover_waiters_pre(ls);
121
122 error = dlm_recovery_stopped(ls);
123 if (error)
124 goto fail;
125
126 if (neg || dlm_no_directory(ls)) {
127 /*
128 * Clear lkb's for departed nodes.
129 */
130
131 dlm_purge_locks(ls);
132
133 /*
134 * Get new master nodeid's for rsb's that were mastered on
135 * departed nodes.
136 */
137
138 error = dlm_recover_masters(ls);
139 if (error) {
140 log_error(ls, "recover_masters failed %d", error);
141 goto fail;
142 }
143
144 /*
145 * Send our locks on remastered rsb's to the new masters.
146 */
147
148 error = dlm_recover_locks(ls);
149 if (error) {
150 log_error(ls, "recover_locks failed %d", error);
151 goto fail;
152 }
153
154 error = dlm_recover_locks_wait(ls);
155 if (error) {
156 log_error(ls, "recover_locks_wait failed %d", error);
157 goto fail;
158 }
159
160 /*
161 * Finalize state in master rsb's now that all locks can be
162 * checked. This includes conversion resolution and lvb
163 * settings.
164 */
165
166 dlm_recover_rsbs(ls);
167 }
168
169 dlm_release_root_list(ls);
170
171 dlm_set_recover_status(ls, DLM_RS_DONE);
172 error = dlm_recover_done_wait(ls);
173 if (error) {
174 log_error(ls, "recover_done_wait failed %d", error);
175 goto fail;
176 }
177
178 dlm_clear_members_gone(ls);
179
180 error = enable_locking(ls, rv->seq);
181 if (error) {
182 log_error(ls, "enable_locking failed %d", error);
183 goto fail;
184 }
185
186 error = dlm_process_requestqueue(ls);
187 if (error) {
188 log_error(ls, "process_requestqueue failed %d", error);
189 goto fail;
190 }
191
192 error = dlm_recover_waiters_post(ls);
193 if (error) {
194 log_error(ls, "recover_waiters_post failed %d", error);
195 goto fail;
196 }
197
198 dlm_grant_after_purge(ls);
199
200 dlm_astd_wake();
201
202 log_debug(ls, "recover %llx done: %u ms", rv->seq,
203 jiffies_to_msecs(jiffies - start));
204 mutex_unlock(&ls->ls_recoverd_active);
205
206 return 0;
207
208 fail:
209 dlm_release_root_list(ls);
210 log_debug(ls, "recover %llx error %d", rv->seq, error);
211 mutex_unlock(&ls->ls_recoverd_active);
212 return error;
213}
214
215static void do_ls_recovery(struct dlm_ls *ls)
216{
217 struct dlm_recover *rv = NULL;
218
219 spin_lock(&ls->ls_recover_lock);
220 rv = ls->ls_recover_args;
221 ls->ls_recover_args = NULL;
222 clear_bit(LSFL_RECOVERY_STOP, &ls->ls_flags);
223 spin_unlock(&ls->ls_recover_lock);
224
225 if (rv) {
226 ls_recover(ls, rv);
227 kfree(rv->nodeids);
228 kfree(rv);
229 }
230}
231
232static int dlm_recoverd(void *arg)
233{
234 struct dlm_ls *ls;
235
236 ls = dlm_find_lockspace_local(arg);
237
238 while (!kthread_should_stop()) {
239 set_current_state(TASK_INTERRUPTIBLE);
240 if (!test_bit(LSFL_WORK, &ls->ls_flags))
241 schedule();
242 set_current_state(TASK_RUNNING);
243
244 if (test_and_clear_bit(LSFL_WORK, &ls->ls_flags))
245 do_ls_recovery(ls);
246 }
247
248 dlm_put_lockspace(ls);
249 return 0;
250}
251
/*
 * Request a recovery run: set LSFL_WORK and wake the recovery thread.  The
 * flag is set before the wakeup so the thread sees it when it runs.
 */
void dlm_recoverd_kick(struct dlm_ls *ls)
{
	set_bit(LSFL_WORK, &ls->ls_flags);
	wake_up_process(ls->ls_recoverd_task);
}
257
258int dlm_recoverd_start(struct dlm_ls *ls)
259{
260 struct task_struct *p;
261 int error = 0;
262
263 p = kthread_run(dlm_recoverd, ls, "dlm_recoverd");
264 if (IS_ERR(p))
265 error = PTR_ERR(p);
266 else
267 ls->ls_recoverd_task = p;
268 return error;
269}
270
/* Stop the lockspace's recovery thread via kthread_stop() */
void dlm_recoverd_stop(struct dlm_ls *ls)
{
	kthread_stop(ls->ls_recoverd_task);
}
275
/*
 * Take ls_recoverd_active, the mutex ls_recover() holds for a whole
 * recovery pass.  Pair with dlm_recoverd_resume().
 */
void dlm_recoverd_suspend(struct dlm_ls *ls)
{
	mutex_lock(&ls->ls_recoverd_active);
}
280
/* Release ls_recoverd_active taken by dlm_recoverd_suspend() */
void dlm_recoverd_resume(struct dlm_ls *ls)
{
	mutex_unlock(&ls->ls_recoverd_active);
}
285
diff --git a/fs/dlm/recoverd.h b/fs/dlm/recoverd.h
new file mode 100644
index 000000000000..866657c5d69d
--- /dev/null
+++ b/fs/dlm/recoverd.h
@@ -0,0 +1,24 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
#ifndef __RECOVERD_DOT_H__
#define __RECOVERD_DOT_H__

/* Control interface for the per-lockspace recovery thread (recoverd.c). */

/* flag pending recovery work and wake the thread */
void dlm_recoverd_kick(struct dlm_ls *ls);
/* thread lifetime; start returns 0 or an error */
void dlm_recoverd_stop(struct dlm_ls *ls);
int dlm_recoverd_start(struct dlm_ls *ls);
/* take / release the mutex held during a recovery pass */
void dlm_recoverd_suspend(struct dlm_ls *ls);
void dlm_recoverd_resume(struct dlm_ls *ls);

#endif				/* __RECOVERD_DOT_H__ */
24
diff --git a/fs/dlm/requestqueue.c b/fs/dlm/requestqueue.c
new file mode 100644
index 000000000000..7b2b089634a2
--- /dev/null
+++ b/fs/dlm/requestqueue.c
@@ -0,0 +1,184 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13#include "dlm_internal.h"
14#include "member.h"
15#include "lock.h"
16#include "dir.h"
17#include "config.h"
18#include "requestqueue.h"
19
/*
 * One saved message on ls_requestqueue.  Entries are allocated with extra
 * room after the struct so that request[] can hold the whole message.
 */
struct rq_entry {
	struct list_head list;	/* link on ls->ls_requestqueue */
	int nodeid;		/* node the message was received from */
	char request[1];	/* start of the saved message (dlm_header) */
};
25
26/*
27 * Requests received while the lockspace is in recovery get added to the
28 * request queue and processed when recovery is complete. This happens when
29 * the lockspace is suspended on some nodes before it is on others, or the
30 * lockspace is enabled on some while still suspended on others.
31 */
32
33void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_header *hd)
34{
35 struct rq_entry *e;
36 int length = hd->h_length;
37
38 if (dlm_is_removed(ls, nodeid))
39 return;
40
41 e = kmalloc(sizeof(struct rq_entry) + length, GFP_KERNEL);
42 if (!e) {
43 log_print("dlm_add_requestqueue: out of memory\n");
44 return;
45 }
46
47 e->nodeid = nodeid;
48 memcpy(e->request, hd, length);
49
50 mutex_lock(&ls->ls_requestqueue_mutex);
51 list_add_tail(&e->list, &ls->ls_requestqueue);
52 mutex_unlock(&ls->ls_requestqueue_mutex);
53}
54
55int dlm_process_requestqueue(struct dlm_ls *ls)
56{
57 struct rq_entry *e;
58 struct dlm_header *hd;
59 int error = 0;
60
61 mutex_lock(&ls->ls_requestqueue_mutex);
62
63 for (;;) {
64 if (list_empty(&ls->ls_requestqueue)) {
65 mutex_unlock(&ls->ls_requestqueue_mutex);
66 error = 0;
67 break;
68 }
69 e = list_entry(ls->ls_requestqueue.next, struct rq_entry, list);
70 mutex_unlock(&ls->ls_requestqueue_mutex);
71
72 hd = (struct dlm_header *) e->request;
73 error = dlm_receive_message(hd, e->nodeid, 1);
74
75 if (error == -EINTR) {
76 /* entry is left on requestqueue */
77 log_debug(ls, "process_requestqueue abort eintr");
78 break;
79 }
80
81 mutex_lock(&ls->ls_requestqueue_mutex);
82 list_del(&e->list);
83 kfree(e);
84
85 if (dlm_locking_stopped(ls)) {
86 log_debug(ls, "process_requestqueue abort running");
87 mutex_unlock(&ls->ls_requestqueue_mutex);
88 error = -EINTR;
89 break;
90 }
91 schedule();
92 }
93
94 return error;
95}
96
97/*
98 * After recovery is done, locking is resumed and dlm_recoverd takes all the
99 * saved requests and processes them as they would have been by dlm_recvd. At
100 * the same time, dlm_recvd will start receiving new requests from remote
101 * nodes. We want to delay dlm_recvd processing new requests until
102 * dlm_recoverd has finished processing the old saved requests.
103 */
104
105void dlm_wait_requestqueue(struct dlm_ls *ls)
106{
107 for (;;) {
108 mutex_lock(&ls->ls_requestqueue_mutex);
109 if (list_empty(&ls->ls_requestqueue))
110 break;
111 if (dlm_locking_stopped(ls))
112 break;
113 mutex_unlock(&ls->ls_requestqueue_mutex);
114 schedule();
115 }
116 mutex_unlock(&ls->ls_requestqueue_mutex);
117}
118
119static int purge_request(struct dlm_ls *ls, struct dlm_message *ms, int nodeid)
120{
121 uint32_t type = ms->m_type;
122
123 if (dlm_is_removed(ls, nodeid))
124 return 1;
125
126 /* directory operations are always purged because the directory is
127 always rebuilt during recovery and the lookups resent */
128
129 if (type == DLM_MSG_REMOVE ||
130 type == DLM_MSG_LOOKUP ||
131 type == DLM_MSG_LOOKUP_REPLY)
132 return 1;
133
134 if (!dlm_no_directory(ls))
135 return 0;
136
137 /* with no directory, the master is likely to change as a part of
138 recovery; requests to/from the defunct master need to be purged */
139
140 switch (type) {
141 case DLM_MSG_REQUEST:
142 case DLM_MSG_CONVERT:
143 case DLM_MSG_UNLOCK:
144 case DLM_MSG_CANCEL:
145 /* we're no longer the master of this resource, the sender
146 will resend to the new master (see waiter_needs_recovery) */
147
148 if (dlm_hash2nodeid(ls, ms->m_hash) != dlm_our_nodeid())
149 return 1;
150 break;
151
152 case DLM_MSG_REQUEST_REPLY:
153 case DLM_MSG_CONVERT_REPLY:
154 case DLM_MSG_UNLOCK_REPLY:
155 case DLM_MSG_CANCEL_REPLY:
156 case DLM_MSG_GRANT:
157 /* this reply is from the former master of the resource,
158 we'll resend to the new master if needed */
159
160 if (dlm_hash2nodeid(ls, ms->m_hash) != nodeid)
161 return 1;
162 break;
163 }
164
165 return 0;
166}
167
168void dlm_purge_requestqueue(struct dlm_ls *ls)
169{
170 struct dlm_message *ms;
171 struct rq_entry *e, *safe;
172
173 mutex_lock(&ls->ls_requestqueue_mutex);
174 list_for_each_entry_safe(e, safe, &ls->ls_requestqueue, list) {
175 ms = (struct dlm_message *) e->request;
176
177 if (purge_request(ls, ms, e->nodeid)) {
178 list_del(&e->list);
179 kfree(e);
180 }
181 }
182 mutex_unlock(&ls->ls_requestqueue_mutex);
183}
184
diff --git a/fs/dlm/requestqueue.h b/fs/dlm/requestqueue.h
new file mode 100644
index 000000000000..349f0d292d95
--- /dev/null
+++ b/fs/dlm/requestqueue.h
@@ -0,0 +1,22 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
#ifndef __REQUESTQUEUE_DOT_H__
#define __REQUESTQUEUE_DOT_H__

/* Queue of messages saved while the lockspace is in recovery. */

/* save a copy of a received message */
void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_header *hd);
/* process saved messages; returns 0 or -EINTR */
int dlm_process_requestqueue(struct dlm_ls *ls);
/* wait until the queue is empty or locking is stopped */
void dlm_wait_requestqueue(struct dlm_ls *ls);
/* drop saved messages made invalid by recovery */
void dlm_purge_requestqueue(struct dlm_ls *ls);

#endif
22
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
new file mode 100644
index 000000000000..494d00ac014e
--- /dev/null
+++ b/fs/dlm/user.c
@@ -0,0 +1,785 @@
1/*
2 * Copyright (C) 2006 Red Hat, Inc. All rights reserved.
3 *
4 * This copyrighted material is made available to anyone wishing to use,
5 * modify, copy, or redistribute it subject to the terms and conditions
6 * of the GNU General Public License v.2.
7 */
8
9#include <linux/miscdevice.h>
10#include <linux/init.h>
11#include <linux/wait.h>
12#include <linux/module.h>
13#include <linux/file.h>
14#include <linux/fs.h>
15#include <linux/poll.h>
16#include <linux/signal.h>
17#include <linux/spinlock.h>
18#include <linux/dlm.h>
19#include <linux/dlm_device.h>
20
21#include "dlm_internal.h"
22#include "lockspace.h"
23#include "lock.h"
24#include "lvb_table.h"
25
/* prefix of per-lockspace misc device names: "<prefix>_<lockspace name>" */
static const char *name_prefix="dlm";
static struct miscdevice ctl_device;
/* file operations installed on each lockspace misc device */
static struct file_operations device_fops;
29
30#ifdef CONFIG_COMPAT
31
/*
 * Compat form of the lock parameters in a write request.  The fields that
 * compat_input() widens to pointers are carried here as __u32.
 */
struct dlm_lock_params32 {
	__u8 mode;
	__u8 namelen;		/* number of bytes in name[] */
	__u16 flags;
	__u32 lkid;
	__u32 parent;

	/* widened to pointers by compat_input() */
	__u32 castparam;
	__u32 castaddr;
	__u32 bastparam;
	__u32 bastaddr;
	__u32 lksb;

	char lvb[DLM_USER_LVB_LEN];
	char name[0];
};
48
/*
 * Compat form of a write request; converted to struct dlm_write_request by
 * compat_input().
 */
struct dlm_write_request32 {
	__u32 version[3];
	__u8 cmd;		/* selects which member of the union is used */
	__u8 is64bit;
	__u8 unused[2];

	union {
		struct dlm_lock_params32 lock;
		struct dlm_lspace_params lspace;
	} i;
};
60
/* Compat form of the lksb embedded in a lock result; see compat_output() */
struct dlm_lksb32 {
	__u32 sb_status;
	__u32 sb_lkid;
	__u8 sb_flags;
	__u32 sb_lvbptr;	/* pointer value truncated to 32 bits */
};
67
/* Compat form of struct dlm_lock_result; filled in by compat_output() */
struct dlm_lock_result32 {
	__u32 length;
	/* pointer values truncated to 32 bits */
	__u32 user_astaddr;
	__u32 user_astparam;
	__u32 user_lksb;
	struct dlm_lksb32 lksb;
	__u8 bast_mode;
	__u8 unused[3];
	/* Offsets may be zero if no data is present */
	__u32 lvb_offset;
};
79
80static void compat_input(struct dlm_write_request *kb,
81 struct dlm_write_request32 *kb32)
82{
83 kb->version[0] = kb32->version[0];
84 kb->version[1] = kb32->version[1];
85 kb->version[2] = kb32->version[2];
86
87 kb->cmd = kb32->cmd;
88 kb->is64bit = kb32->is64bit;
89 if (kb->cmd == DLM_USER_CREATE_LOCKSPACE ||
90 kb->cmd == DLM_USER_REMOVE_LOCKSPACE) {
91 kb->i.lspace.flags = kb32->i.lspace.flags;
92 kb->i.lspace.minor = kb32->i.lspace.minor;
93 strcpy(kb->i.lspace.name, kb32->i.lspace.name);
94 } else {
95 kb->i.lock.mode = kb32->i.lock.mode;
96 kb->i.lock.namelen = kb32->i.lock.namelen;
97 kb->i.lock.flags = kb32->i.lock.flags;
98 kb->i.lock.lkid = kb32->i.lock.lkid;
99 kb->i.lock.parent = kb32->i.lock.parent;
100 kb->i.lock.castparam = (void *)(long)kb32->i.lock.castparam;
101 kb->i.lock.castaddr = (void *)(long)kb32->i.lock.castaddr;
102 kb->i.lock.bastparam = (void *)(long)kb32->i.lock.bastparam;
103 kb->i.lock.bastaddr = (void *)(long)kb32->i.lock.bastaddr;
104 kb->i.lock.lksb = (void *)(long)kb32->i.lock.lksb;
105 memcpy(kb->i.lock.lvb, kb32->i.lock.lvb, DLM_USER_LVB_LEN);
106 memcpy(kb->i.lock.name, kb32->i.lock.name, kb->i.lock.namelen);
107 }
108}
109
110static void compat_output(struct dlm_lock_result *res,
111 struct dlm_lock_result32 *res32)
112{
113 res32->length = res->length - (sizeof(struct dlm_lock_result) -
114 sizeof(struct dlm_lock_result32));
115 res32->user_astaddr = (__u32)(long)res->user_astaddr;
116 res32->user_astparam = (__u32)(long)res->user_astparam;
117 res32->user_lksb = (__u32)(long)res->user_lksb;
118 res32->bast_mode = res->bast_mode;
119
120 res32->lvb_offset = res->lvb_offset;
121 res32->length = res->length;
122
123 res32->lksb.sb_status = res->lksb.sb_status;
124 res32->lksb.sb_flags = res->lksb.sb_flags;
125 res32->lksb.sb_lkid = res->lksb.sb_lkid;
126 res32->lksb.sb_lvbptr = (__u32)(long)res->lksb.sb_lvbptr;
127}
128#endif
129
130
131void dlm_user_add_ast(struct dlm_lkb *lkb, int type)
132{
133 struct dlm_ls *ls;
134 struct dlm_user_args *ua;
135 struct dlm_user_proc *proc;
136 int remove_ownqueue = 0;
137
138 /* dlm_clear_proc_locks() sets ORPHAN/DEAD flag on each
139 lkb before dealing with it. We need to check this
140 flag before taking ls_clear_proc_locks mutex because if
141 it's set, dlm_clear_proc_locks() holds the mutex. */
142
143 if (lkb->lkb_flags & (DLM_IFL_ORPHAN | DLM_IFL_DEAD)) {
144 /* log_print("user_add_ast skip1 %x", lkb->lkb_flags); */
145 return;
146 }
147
148 ls = lkb->lkb_resource->res_ls;
149 mutex_lock(&ls->ls_clear_proc_locks);
150
151 /* If ORPHAN/DEAD flag is set, it means the process is dead so an ast
152 can't be delivered. For ORPHAN's, dlm_clear_proc_locks() freed
153 lkb->ua so we can't try to use it. */
154
155 if (lkb->lkb_flags & (DLM_IFL_ORPHAN | DLM_IFL_DEAD)) {
156 /* log_print("user_add_ast skip2 %x", lkb->lkb_flags); */
157 goto out;
158 }
159
160 DLM_ASSERT(lkb->lkb_astparam, dlm_print_lkb(lkb););
161 ua = (struct dlm_user_args *)lkb->lkb_astparam;
162 proc = ua->proc;
163
164 if (type == AST_BAST && ua->bastaddr == NULL)
165 goto out;
166
167 spin_lock(&proc->asts_spin);
168 if (!(lkb->lkb_ast_type & (AST_COMP | AST_BAST))) {
169 kref_get(&lkb->lkb_ref);
170 list_add_tail(&lkb->lkb_astqueue, &proc->asts);
171 lkb->lkb_ast_type |= type;
172 wake_up_interruptible(&proc->wait);
173 }
174
175 /* noqueue requests that fail may need to be removed from the
176 proc's locks list, there should be a better way of detecting
177 this situation than checking all these things... */
178
179 if (type == AST_COMP && lkb->lkb_grmode == DLM_LOCK_IV &&
180 ua->lksb.sb_status == -EAGAIN && !list_empty(&lkb->lkb_ownqueue))
181 remove_ownqueue = 1;
182
183 /* We want to copy the lvb to userspace when the completion
184 ast is read if the status is 0, the lock has an lvb and
185 lvb_ops says we should. We could probably have set_lvb_lock()
186 set update_user_lvb instead and not need old_mode */
187
188 if ((lkb->lkb_ast_type & AST_COMP) &&
189 (lkb->lkb_lksb->sb_status == 0) &&
190 lkb->lkb_lksb->sb_lvbptr &&
191 dlm_lvb_operations[ua->old_mode + 1][lkb->lkb_grmode + 1])
192 ua->update_user_lvb = 1;
193 else
194 ua->update_user_lvb = 0;
195
196 spin_unlock(&proc->asts_spin);
197
198 if (remove_ownqueue) {
199 spin_lock(&ua->proc->locks_spin);
200 list_del_init(&lkb->lkb_ownqueue);
201 spin_unlock(&ua->proc->locks_spin);
202 dlm_put_lkb(lkb);
203 }
204 out:
205 mutex_unlock(&ls->ls_clear_proc_locks);
206}
207
208static int device_user_lock(struct dlm_user_proc *proc,
209 struct dlm_lock_params *params)
210{
211 struct dlm_ls *ls;
212 struct dlm_user_args *ua;
213 int error = -ENOMEM;
214
215 ls = dlm_find_lockspace_local(proc->lockspace);
216 if (!ls)
217 return -ENOENT;
218
219 if (!params->castaddr || !params->lksb) {
220 error = -EINVAL;
221 goto out;
222 }
223
224 ua = kzalloc(sizeof(struct dlm_user_args), GFP_KERNEL);
225 if (!ua)
226 goto out;
227 ua->proc = proc;
228 ua->user_lksb = params->lksb;
229 ua->castparam = params->castparam;
230 ua->castaddr = params->castaddr;
231 ua->bastparam = params->bastparam;
232 ua->bastaddr = params->bastaddr;
233
234 if (params->flags & DLM_LKF_CONVERT)
235 error = dlm_user_convert(ls, ua,
236 params->mode, params->flags,
237 params->lkid, params->lvb);
238 else {
239 error = dlm_user_request(ls, ua,
240 params->mode, params->flags,
241 params->name, params->namelen,
242 params->parent);
243 if (!error)
244 error = ua->lksb.sb_lkid;
245 }
246 out:
247 dlm_put_lockspace(ls);
248 return error;
249}
250
251static int device_user_unlock(struct dlm_user_proc *proc,
252 struct dlm_lock_params *params)
253{
254 struct dlm_ls *ls;
255 struct dlm_user_args *ua;
256 int error = -ENOMEM;
257
258 ls = dlm_find_lockspace_local(proc->lockspace);
259 if (!ls)
260 return -ENOENT;
261
262 ua = kzalloc(sizeof(struct dlm_user_args), GFP_KERNEL);
263 if (!ua)
264 goto out;
265 ua->proc = proc;
266 ua->user_lksb = params->lksb;
267 ua->castparam = params->castparam;
268 ua->castaddr = params->castaddr;
269
270 if (params->flags & DLM_LKF_CANCEL)
271 error = dlm_user_cancel(ls, ua, params->flags, params->lkid);
272 else
273 error = dlm_user_unlock(ls, ua, params->flags, params->lkid,
274 params->lvb);
275 out:
276 dlm_put_lockspace(ls);
277 return error;
278}
279
280static int device_create_lockspace(struct dlm_lspace_params *params)
281{
282 dlm_lockspace_t *lockspace;
283 struct dlm_ls *ls;
284 int error, len;
285
286 if (!capable(CAP_SYS_ADMIN))
287 return -EPERM;
288
289 error = dlm_new_lockspace(params->name, strlen(params->name),
290 &lockspace, 0, DLM_USER_LVB_LEN);
291 if (error)
292 return error;
293
294 ls = dlm_find_lockspace_local(lockspace);
295 if (!ls)
296 return -ENOENT;
297
298 error = -ENOMEM;
299 len = strlen(params->name) + strlen(name_prefix) + 2;
300 ls->ls_device.name = kzalloc(len, GFP_KERNEL);
301 if (!ls->ls_device.name)
302 goto fail;
303 snprintf((char *)ls->ls_device.name, len, "%s_%s", name_prefix,
304 params->name);
305 ls->ls_device.fops = &device_fops;
306 ls->ls_device.minor = MISC_DYNAMIC_MINOR;
307
308 error = misc_register(&ls->ls_device);
309 if (error) {
310 kfree(ls->ls_device.name);
311 goto fail;
312 }
313
314 error = ls->ls_device.minor;
315 dlm_put_lockspace(ls);
316 return error;
317
318 fail:
319 dlm_put_lockspace(ls);
320 dlm_release_lockspace(lockspace, 0);
321 return error;
322}
323
324static int device_remove_lockspace(struct dlm_lspace_params *params)
325{
326 dlm_lockspace_t *lockspace;
327 struct dlm_ls *ls;
328 int error;
329
330 if (!capable(CAP_SYS_ADMIN))
331 return -EPERM;
332
333 ls = dlm_find_lockspace_device(params->minor);
334 if (!ls)
335 return -ENOENT;
336
337 error = misc_deregister(&ls->ls_device);
338 if (error) {
339 dlm_put_lockspace(ls);
340 goto out;
341 }
342 kfree(ls->ls_device.name);
343
344 lockspace = ls->ls_local_handle;
345
346 /* dlm_release_lockspace waits for references to go to zero,
347 so all processes will need to close their device for the ls
348 before the release will procede */
349
350 dlm_put_lockspace(ls);
351 error = dlm_release_lockspace(lockspace, 0);
352out:
353 return error;
354}
355
356/* Check the user's version matches ours */
357static int check_version(struct dlm_write_request *req)
358{
359 if (req->version[0] != DLM_DEVICE_VERSION_MAJOR ||
360 (req->version[0] == DLM_DEVICE_VERSION_MAJOR &&
361 req->version[1] > DLM_DEVICE_VERSION_MINOR)) {
362
363 printk(KERN_DEBUG "dlm: process %s (%d) version mismatch "
364 "user (%d.%d.%d) kernel (%d.%d.%d)\n",
365 current->comm,
366 current->pid,
367 req->version[0],
368 req->version[1],
369 req->version[2],
370 DLM_DEVICE_VERSION_MAJOR,
371 DLM_DEVICE_VERSION_MINOR,
372 DLM_DEVICE_VERSION_PATCH);
373 return -EINVAL;
374 }
375 return 0;
376}
377
378/*
379 * device_write
380 *
381 * device_user_lock
382 * dlm_user_request -> request_lock
383 * dlm_user_convert -> convert_lock
384 *
385 * device_user_unlock
386 * dlm_user_unlock -> unlock_lock
387 * dlm_user_cancel -> cancel_lock
388 *
389 * device_create_lockspace
390 * dlm_new_lockspace
391 *
392 * device_remove_lockspace
393 * dlm_release_lockspace
394 */
395
396/* a write to a lockspace device is a lock or unlock request, a write
397 to the control device is to create/remove a lockspace */
398
399static ssize_t device_write(struct file *file, const char __user *buf,
400 size_t count, loff_t *ppos)
401{
402 struct dlm_user_proc *proc = file->private_data;
403 struct dlm_write_request *kbuf;
404 sigset_t tmpsig, allsigs;
405 int error;
406
407#ifdef CONFIG_COMPAT
408 if (count < sizeof(struct dlm_write_request32))
409#else
410 if (count < sizeof(struct dlm_write_request))
411#endif
412 return -EINVAL;
413
414 kbuf = kmalloc(count, GFP_KERNEL);
415 if (!kbuf)
416 return -ENOMEM;
417
418 if (copy_from_user(kbuf, buf, count)) {
419 error = -EFAULT;
420 goto out_free;
421 }
422
423 if (check_version(kbuf)) {
424 error = -EBADE;
425 goto out_free;
426 }
427
428#ifdef CONFIG_COMPAT
429 if (!kbuf->is64bit) {
430 struct dlm_write_request32 *k32buf;
431 k32buf = (struct dlm_write_request32 *)kbuf;
432 kbuf = kmalloc(count + (sizeof(struct dlm_write_request) -
433 sizeof(struct dlm_write_request32)), GFP_KERNEL);
434 if (!kbuf)
435 return -ENOMEM;
436
437 if (proc)
438 set_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags);
439 compat_input(kbuf, k32buf);
440 kfree(k32buf);
441 }
442#endif
443
444 /* do we really need this? can a write happen after a close? */
445 if ((kbuf->cmd == DLM_USER_LOCK || kbuf->cmd == DLM_USER_UNLOCK) &&
446 test_bit(DLM_PROC_FLAGS_CLOSING, &proc->flags))
447 return -EINVAL;
448
449 sigfillset(&allsigs);
450 sigprocmask(SIG_BLOCK, &allsigs, &tmpsig);
451
452 error = -EINVAL;
453
454 switch (kbuf->cmd)
455 {
456 case DLM_USER_LOCK:
457 if (!proc) {
458 log_print("no locking on control device");
459 goto out_sig;
460 }
461 error = device_user_lock(proc, &kbuf->i.lock);
462 break;
463
464 case DLM_USER_UNLOCK:
465 if (!proc) {
466 log_print("no locking on control device");
467 goto out_sig;
468 }
469 error = device_user_unlock(proc, &kbuf->i.lock);
470 break;
471
472 case DLM_USER_CREATE_LOCKSPACE:
473 if (proc) {
474 log_print("create/remove only on control device");
475 goto out_sig;
476 }
477 error = device_create_lockspace(&kbuf->i.lspace);
478 break;
479
480 case DLM_USER_REMOVE_LOCKSPACE:
481 if (proc) {
482 log_print("create/remove only on control device");
483 goto out_sig;
484 }
485 error = device_remove_lockspace(&kbuf->i.lspace);
486 break;
487
488 default:
489 log_print("Unknown command passed to DLM device : %d\n",
490 kbuf->cmd);
491 }
492
493 out_sig:
494 sigprocmask(SIG_SETMASK, &tmpsig, NULL);
495 recalc_sigpending();
496 out_free:
497 kfree(kbuf);
498 return error;
499}
500
501/* Every process that opens the lockspace device has its own "proc" structure
502 hanging off the open file that's used to keep track of locks owned by the
503 process and asts that need to be delivered to the process. */
504
505static int device_open(struct inode *inode, struct file *file)
506{
507 struct dlm_user_proc *proc;
508 struct dlm_ls *ls;
509
510 ls = dlm_find_lockspace_device(iminor(inode));
511 if (!ls)
512 return -ENOENT;
513
514 proc = kzalloc(sizeof(struct dlm_user_proc), GFP_KERNEL);
515 if (!proc) {
516 dlm_put_lockspace(ls);
517 return -ENOMEM;
518 }
519
520 proc->lockspace = ls->ls_local_handle;
521 INIT_LIST_HEAD(&proc->asts);
522 INIT_LIST_HEAD(&proc->locks);
523 spin_lock_init(&proc->asts_spin);
524 spin_lock_init(&proc->locks_spin);
525 init_waitqueue_head(&proc->wait);
526 file->private_data = proc;
527
528 return 0;
529}
530
531static int device_close(struct inode *inode, struct file *file)
532{
533 struct dlm_user_proc *proc = file->private_data;
534 struct dlm_ls *ls;
535 sigset_t tmpsig, allsigs;
536
537 ls = dlm_find_lockspace_local(proc->lockspace);
538 if (!ls)
539 return -ENOENT;
540
541 sigfillset(&allsigs);
542 sigprocmask(SIG_BLOCK, &allsigs, &tmpsig);
543
544 set_bit(DLM_PROC_FLAGS_CLOSING, &proc->flags);
545
546 dlm_clear_proc_locks(ls, proc);
547
548 /* at this point no more lkb's should exist for this lockspace,
549 so there's no chance of dlm_user_add_ast() being called and
550 looking for lkb->ua->proc */
551
552 kfree(proc);
553 file->private_data = NULL;
554
555 dlm_put_lockspace(ls);
556 dlm_put_lockspace(ls); /* for the find in device_open() */
557
558 /* FIXME: AUTOFREE: if this ls is no longer used do
559 device_remove_lockspace() */
560
561 sigprocmask(SIG_SETMASK, &tmpsig, NULL);
562 recalc_sigpending();
563
564 return 0;
565}
566
567static int copy_result_to_user(struct dlm_user_args *ua, int compat, int type,
568 int bmode, char __user *buf, size_t count)
569{
570#ifdef CONFIG_COMPAT
571 struct dlm_lock_result32 result32;
572#endif
573 struct dlm_lock_result result;
574 void *resultptr;
575 int error=0;
576 int len;
577 int struct_len;
578
579 memset(&result, 0, sizeof(struct dlm_lock_result));
580 memcpy(&result.lksb, &ua->lksb, sizeof(struct dlm_lksb));
581 result.user_lksb = ua->user_lksb;
582
583 /* FIXME: dlm1 provides for the user's bastparam/addr to not be updated
584 in a conversion unless the conversion is successful. See code
585 in dlm_user_convert() for updating ua from ua_tmp. OpenVMS, though,
586 notes that a new blocking AST address and parameter are set even if
587 the conversion fails, so maybe we should just do that. */
588
589 if (type == AST_BAST) {
590 result.user_astaddr = ua->bastaddr;
591 result.user_astparam = ua->bastparam;
592 result.bast_mode = bmode;
593 } else {
594 result.user_astaddr = ua->castaddr;
595 result.user_astparam = ua->castparam;
596 }
597
598#ifdef CONFIG_COMPAT
599 if (compat)
600 len = sizeof(struct dlm_lock_result32);
601 else
602#endif
603 len = sizeof(struct dlm_lock_result);
604 struct_len = len;
605
606 /* copy lvb to userspace if there is one, it's been updated, and
607 the user buffer has space for it */
608
609 if (ua->update_user_lvb && ua->lksb.sb_lvbptr &&
610 count >= len + DLM_USER_LVB_LEN) {
611 if (copy_to_user(buf+len, ua->lksb.sb_lvbptr,
612 DLM_USER_LVB_LEN)) {
613 error = -EFAULT;
614 goto out;
615 }
616
617 result.lvb_offset = len;
618 len += DLM_USER_LVB_LEN;
619 }
620
621 result.length = len;
622 resultptr = &result;
623#ifdef CONFIG_COMPAT
624 if (compat) {
625 compat_output(&result, &result32);
626 resultptr = &result32;
627 }
628#endif
629
630 if (copy_to_user(buf, resultptr, struct_len))
631 error = -EFAULT;
632 else
633 error = len;
634 out:
635 return error;
636}
637
638/* a read returns a single ast described in a struct dlm_lock_result */
639
640static ssize_t device_read(struct file *file, char __user *buf, size_t count,
641 loff_t *ppos)
642{
643 struct dlm_user_proc *proc = file->private_data;
644 struct dlm_lkb *lkb;
645 struct dlm_user_args *ua;
646 DECLARE_WAITQUEUE(wait, current);
647 int error, type=0, bmode=0, removed = 0;
648
649#ifdef CONFIG_COMPAT
650 if (count < sizeof(struct dlm_lock_result32))
651#else
652 if (count < sizeof(struct dlm_lock_result))
653#endif
654 return -EINVAL;
655
656 /* do we really need this? can a read happen after a close? */
657 if (test_bit(DLM_PROC_FLAGS_CLOSING, &proc->flags))
658 return -EINVAL;
659
660 spin_lock(&proc->asts_spin);
661 if (list_empty(&proc->asts)) {
662 if (file->f_flags & O_NONBLOCK) {
663 spin_unlock(&proc->asts_spin);
664 return -EAGAIN;
665 }
666
667 add_wait_queue(&proc->wait, &wait);
668
669 repeat:
670 set_current_state(TASK_INTERRUPTIBLE);
671 if (list_empty(&proc->asts) && !signal_pending(current)) {
672 spin_unlock(&proc->asts_spin);
673 schedule();
674 spin_lock(&proc->asts_spin);
675 goto repeat;
676 }
677 set_current_state(TASK_RUNNING);
678 remove_wait_queue(&proc->wait, &wait);
679
680 if (signal_pending(current)) {
681 spin_unlock(&proc->asts_spin);
682 return -ERESTARTSYS;
683 }
684 }
685
686 if (list_empty(&proc->asts)) {
687 spin_unlock(&proc->asts_spin);
688 return -EAGAIN;
689 }
690
691 /* there may be both completion and blocking asts to return for
692 the lkb, don't remove lkb from asts list unless no asts remain */
693
694 lkb = list_entry(proc->asts.next, struct dlm_lkb, lkb_astqueue);
695
696 if (lkb->lkb_ast_type & AST_COMP) {
697 lkb->lkb_ast_type &= ~AST_COMP;
698 type = AST_COMP;
699 } else if (lkb->lkb_ast_type & AST_BAST) {
700 lkb->lkb_ast_type &= ~AST_BAST;
701 type = AST_BAST;
702 bmode = lkb->lkb_bastmode;
703 }
704
705 if (!lkb->lkb_ast_type) {
706 list_del(&lkb->lkb_astqueue);
707 removed = 1;
708 }
709 spin_unlock(&proc->asts_spin);
710
711 ua = (struct dlm_user_args *)lkb->lkb_astparam;
712 error = copy_result_to_user(ua,
713 test_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags),
714 type, bmode, buf, count);
715
716 /* removes reference for the proc->asts lists added by
717 dlm_user_add_ast() and may result in the lkb being freed */
718 if (removed)
719 dlm_put_lkb(lkb);
720
721 return error;
722}
723
724static unsigned int device_poll(struct file *file, poll_table *wait)
725{
726 struct dlm_user_proc *proc = file->private_data;
727
728 poll_wait(file, &proc->wait, wait);
729
730 spin_lock(&proc->asts_spin);
731 if (!list_empty(&proc->asts)) {
732 spin_unlock(&proc->asts_spin);
733 return POLLIN | POLLRDNORM;
734 }
735 spin_unlock(&proc->asts_spin);
736 return 0;
737}
738
739static int ctl_device_open(struct inode *inode, struct file *file)
740{
741 file->private_data = NULL;
742 return 0;
743}
744
/* Release of the control device: nothing to tear down. */
static int ctl_device_close(struct inode *inode, struct file *file)
{
	return 0;
}
749
750static struct file_operations device_fops = {
751 .open = device_open,
752 .release = device_close,
753 .read = device_read,
754 .write = device_write,
755 .poll = device_poll,
756 .owner = THIS_MODULE,
757};
758
759static struct file_operations ctl_device_fops = {
760 .open = ctl_device_open,
761 .release = ctl_device_close,
762 .write = device_write,
763 .owner = THIS_MODULE,
764};
765
766int dlm_user_init(void)
767{
768 int error;
769
770 ctl_device.name = "dlm-control";
771 ctl_device.fops = &ctl_device_fops;
772 ctl_device.minor = MISC_DYNAMIC_MINOR;
773
774 error = misc_register(&ctl_device);
775 if (error)
776 log_print("misc_register failed for control device");
777
778 return error;
779}
780
781void dlm_user_exit(void)
782{
783 misc_deregister(&ctl_device);
784}
785
diff --git a/fs/dlm/user.h b/fs/dlm/user.h
new file mode 100644
index 000000000000..d38e9f3e4151
--- /dev/null
+++ b/fs/dlm/user.h
@@ -0,0 +1,16 @@
1/*
2 * Copyright (C) 2006 Red Hat, Inc. All rights reserved.
3 *
4 * This copyrighted material is made available to anyone wishing to use,
5 * modify, copy, or redistribute it subject to the terms and conditions
6 * of the GNU General Public License v.2.
7 */
8
#ifndef __USER_DOT_H__
#define __USER_DOT_H__

/* queue an ast of the given type for lkb on its owning proc's asts list */
void dlm_user_add_ast(struct dlm_lkb *lkb, int type);

/* register / deregister the dlm-control misc device */
int dlm_user_init(void);
void dlm_user_exit(void);

#endif
diff --git a/fs/dlm/util.c b/fs/dlm/util.c
new file mode 100644
index 000000000000..767197db9944
--- /dev/null
+++ b/fs/dlm/util.c
@@ -0,0 +1,161 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13#include "dlm_internal.h"
14#include "rcom.h"
15#include "util.h"
16
17static void header_out(struct dlm_header *hd)
18{
19 hd->h_version = cpu_to_le32(hd->h_version);
20 hd->h_lockspace = cpu_to_le32(hd->h_lockspace);
21 hd->h_nodeid = cpu_to_le32(hd->h_nodeid);
22 hd->h_length = cpu_to_le16(hd->h_length);
23}
24
25static void header_in(struct dlm_header *hd)
26{
27 hd->h_version = le32_to_cpu(hd->h_version);
28 hd->h_lockspace = le32_to_cpu(hd->h_lockspace);
29 hd->h_nodeid = le32_to_cpu(hd->h_nodeid);
30 hd->h_length = le16_to_cpu(hd->h_length);
31}
32
33void dlm_message_out(struct dlm_message *ms)
34{
35 struct dlm_header *hd = (struct dlm_header *) ms;
36
37 header_out(hd);
38
39 ms->m_type = cpu_to_le32(ms->m_type);
40 ms->m_nodeid = cpu_to_le32(ms->m_nodeid);
41 ms->m_pid = cpu_to_le32(ms->m_pid);
42 ms->m_lkid = cpu_to_le32(ms->m_lkid);
43 ms->m_remid = cpu_to_le32(ms->m_remid);
44 ms->m_parent_lkid = cpu_to_le32(ms->m_parent_lkid);
45 ms->m_parent_remid = cpu_to_le32(ms->m_parent_remid);
46 ms->m_exflags = cpu_to_le32(ms->m_exflags);
47 ms->m_sbflags = cpu_to_le32(ms->m_sbflags);
48 ms->m_flags = cpu_to_le32(ms->m_flags);
49 ms->m_lvbseq = cpu_to_le32(ms->m_lvbseq);
50 ms->m_hash = cpu_to_le32(ms->m_hash);
51 ms->m_status = cpu_to_le32(ms->m_status);
52 ms->m_grmode = cpu_to_le32(ms->m_grmode);
53 ms->m_rqmode = cpu_to_le32(ms->m_rqmode);
54 ms->m_bastmode = cpu_to_le32(ms->m_bastmode);
55 ms->m_asts = cpu_to_le32(ms->m_asts);
56 ms->m_result = cpu_to_le32(ms->m_result);
57}
58
59void dlm_message_in(struct dlm_message *ms)
60{
61 struct dlm_header *hd = (struct dlm_header *) ms;
62
63 header_in(hd);
64
65 ms->m_type = le32_to_cpu(ms->m_type);
66 ms->m_nodeid = le32_to_cpu(ms->m_nodeid);
67 ms->m_pid = le32_to_cpu(ms->m_pid);
68 ms->m_lkid = le32_to_cpu(ms->m_lkid);
69 ms->m_remid = le32_to_cpu(ms->m_remid);
70 ms->m_parent_lkid = le32_to_cpu(ms->m_parent_lkid);
71 ms->m_parent_remid = le32_to_cpu(ms->m_parent_remid);
72 ms->m_exflags = le32_to_cpu(ms->m_exflags);
73 ms->m_sbflags = le32_to_cpu(ms->m_sbflags);
74 ms->m_flags = le32_to_cpu(ms->m_flags);
75 ms->m_lvbseq = le32_to_cpu(ms->m_lvbseq);
76 ms->m_hash = le32_to_cpu(ms->m_hash);
77 ms->m_status = le32_to_cpu(ms->m_status);
78 ms->m_grmode = le32_to_cpu(ms->m_grmode);
79 ms->m_rqmode = le32_to_cpu(ms->m_rqmode);
80 ms->m_bastmode = le32_to_cpu(ms->m_bastmode);
81 ms->m_asts = le32_to_cpu(ms->m_asts);
82 ms->m_result = le32_to_cpu(ms->m_result);
83}
84
85static void rcom_lock_out(struct rcom_lock *rl)
86{
87 rl->rl_ownpid = cpu_to_le32(rl->rl_ownpid);
88 rl->rl_lkid = cpu_to_le32(rl->rl_lkid);
89 rl->rl_remid = cpu_to_le32(rl->rl_remid);
90 rl->rl_parent_lkid = cpu_to_le32(rl->rl_parent_lkid);
91 rl->rl_parent_remid = cpu_to_le32(rl->rl_parent_remid);
92 rl->rl_exflags = cpu_to_le32(rl->rl_exflags);
93 rl->rl_flags = cpu_to_le32(rl->rl_flags);
94 rl->rl_lvbseq = cpu_to_le32(rl->rl_lvbseq);
95 rl->rl_result = cpu_to_le32(rl->rl_result);
96 rl->rl_wait_type = cpu_to_le16(rl->rl_wait_type);
97 rl->rl_namelen = cpu_to_le16(rl->rl_namelen);
98}
99
100static void rcom_lock_in(struct rcom_lock *rl)
101{
102 rl->rl_ownpid = le32_to_cpu(rl->rl_ownpid);
103 rl->rl_lkid = le32_to_cpu(rl->rl_lkid);
104 rl->rl_remid = le32_to_cpu(rl->rl_remid);
105 rl->rl_parent_lkid = le32_to_cpu(rl->rl_parent_lkid);
106 rl->rl_parent_remid = le32_to_cpu(rl->rl_parent_remid);
107 rl->rl_exflags = le32_to_cpu(rl->rl_exflags);
108 rl->rl_flags = le32_to_cpu(rl->rl_flags);
109 rl->rl_lvbseq = le32_to_cpu(rl->rl_lvbseq);
110 rl->rl_result = le32_to_cpu(rl->rl_result);
111 rl->rl_wait_type = le16_to_cpu(rl->rl_wait_type);
112 rl->rl_namelen = le16_to_cpu(rl->rl_namelen);
113}
114
115static void rcom_config_out(struct rcom_config *rf)
116{
117 rf->rf_lvblen = cpu_to_le32(rf->rf_lvblen);
118 rf->rf_lsflags = cpu_to_le32(rf->rf_lsflags);
119}
120
121static void rcom_config_in(struct rcom_config *rf)
122{
123 rf->rf_lvblen = le32_to_cpu(rf->rf_lvblen);
124 rf->rf_lsflags = le32_to_cpu(rf->rf_lsflags);
125}
126
127void dlm_rcom_out(struct dlm_rcom *rc)
128{
129 struct dlm_header *hd = (struct dlm_header *) rc;
130 int type = rc->rc_type;
131
132 header_out(hd);
133
134 rc->rc_type = cpu_to_le32(rc->rc_type);
135 rc->rc_result = cpu_to_le32(rc->rc_result);
136 rc->rc_id = cpu_to_le64(rc->rc_id);
137
138 if (type == DLM_RCOM_LOCK)
139 rcom_lock_out((struct rcom_lock *) rc->rc_buf);
140
141 else if (type == DLM_RCOM_STATUS_REPLY)
142 rcom_config_out((struct rcom_config *) rc->rc_buf);
143}
144
145void dlm_rcom_in(struct dlm_rcom *rc)
146{
147 struct dlm_header *hd = (struct dlm_header *) rc;
148
149 header_in(hd);
150
151 rc->rc_type = le32_to_cpu(rc->rc_type);
152 rc->rc_result = le32_to_cpu(rc->rc_result);
153 rc->rc_id = le64_to_cpu(rc->rc_id);
154
155 if (rc->rc_type == DLM_RCOM_LOCK)
156 rcom_lock_in((struct rcom_lock *) rc->rc_buf);
157
158 else if (rc->rc_type == DLM_RCOM_STATUS_REPLY)
159 rcom_config_in((struct rcom_config *) rc->rc_buf);
160}
161
diff --git a/fs/dlm/util.h b/fs/dlm/util.h
new file mode 100644
index 000000000000..2b25915161c0
--- /dev/null
+++ b/fs/dlm/util.h
@@ -0,0 +1,22 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
#ifndef __UTIL_DOT_H__
#define __UTIL_DOT_H__

/* in-place byte order conversion: _out is cpu -> little-endian,
   _in is little-endian -> cpu */
void dlm_message_out(struct dlm_message *ms);
void dlm_message_in(struct dlm_message *ms);
void dlm_rcom_out(struct dlm_rcom *rc);
void dlm_rcom_in(struct dlm_rcom *rc);

#endif
22
diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig
new file mode 100644
index 000000000000..8c27de8b9568
--- /dev/null
+++ b/fs/gfs2/Kconfig
@@ -0,0 +1,44 @@
1config GFS2_FS
2 tristate "GFS2 file system support"
3 depends on EXPERIMENTAL
4 select FS_POSIX_ACL
5 help
6 A cluster filesystem.
7
8 Allows a cluster of computers to simultaneously use a block device
9 that is shared between them (with FC, iSCSI, NBD, etc...). GFS reads
10 and writes to the block device like a local filesystem, but also uses
11 a lock module to allow the computers to coordinate their I/O so
12 filesystem consistency is maintained. One of the nifty features of
13 GFS is perfect consistency -- changes made to the filesystem on one
14 machine show up immediately on all other machines in the cluster.
15
16 To use the GFS2 filesystem, you will need to enable one or more of
17 the below locking modules. Documentation and utilities for GFS2 can
18 be found here: http://sources.redhat.com/cluster
19
20config GFS2_FS_LOCKING_NOLOCK
21 tristate "GFS2 \"nolock\" locking module"
22 depends on GFS2_FS
23 help
24 Single node locking module for GFS2.
25
26 Use this module if you want to use GFS2 on a single node without
27 its clustering features. You can still take advantage of the
28 large file support, and upgrade to running a full cluster later on
29 if required.
30
31 If you will only be using GFS2 in cluster mode, you do not need this
32 module.
33
34config GFS2_FS_LOCKING_DLM
35 tristate "GFS2 DLM locking module"
36 depends on GFS2_FS
37 select DLM
38 help
39 Multiple node locking module for GFS2
40
41 Most users of GFS2 will require this module. It provides the locking
42 interface between GFS2 and the DLM, which is required to use GFS2
43 in a cluster environment.
44
diff --git a/fs/gfs2/Makefile b/fs/gfs2/Makefile
new file mode 100644
index 000000000000..b92852b66629
--- /dev/null
+++ b/fs/gfs2/Makefile
@@ -0,0 +1,10 @@
1obj-$(CONFIG_GFS2_FS) += gfs2.o
2gfs2-y := acl.o bmap.o daemon.o dir.o eaops.o eattr.o glock.o \
3 glops.o inode.o lm.o log.o lops.o locking.o lvb.o main.o meta_io.o \
4 mount.o ondisk.o ops_address.o ops_dentry.o ops_export.o ops_file.o \
5 ops_fstype.o ops_inode.o ops_super.o ops_vm.o quota.o \
6 recovery.o rgrp.o super.o sys.o trans.o util.o
7
8obj-$(CONFIG_GFS2_FS_LOCKING_NOLOCK) += locking/nolock/
9obj-$(CONFIG_GFS2_FS_LOCKING_DLM) += locking/dlm/
10
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
new file mode 100644
index 000000000000..399317841501
--- /dev/null
+++ b/fs/gfs2/acl.c
@@ -0,0 +1,313 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/posix_acl.h>
16#include <linux/posix_acl_xattr.h>
17#include <linux/gfs2_ondisk.h>
18
19#include "gfs2.h"
20#include "lm_interface.h"
21#include "incore.h"
22#include "acl.h"
23#include "eaops.h"
24#include "eattr.h"
25#include "glock.h"
26#include "inode.h"
27#include "meta_io.h"
28#include "trans.h"
29#include "util.h"
30
/* values for the "access" argument of acl_get(): non-zero selects the
   access ACL, zero the default ACL */
#define ACL_ACCESS 1
#define ACL_DEFAULT 0
33
34int gfs2_acl_validate_set(struct gfs2_inode *ip, int access,
35 struct gfs2_ea_request *er,
36 int *remove, mode_t *mode)
37{
38 struct posix_acl *acl;
39 int error;
40
41 error = gfs2_acl_validate_remove(ip, access);
42 if (error)
43 return error;
44
45 if (!er->er_data)
46 return -EINVAL;
47
48 acl = posix_acl_from_xattr(er->er_data, er->er_data_len);
49 if (IS_ERR(acl))
50 return PTR_ERR(acl);
51 if (!acl) {
52 *remove = 1;
53 return 0;
54 }
55
56 error = posix_acl_valid(acl);
57 if (error)
58 goto out;
59
60 if (access) {
61 error = posix_acl_equiv_mode(acl, mode);
62 if (!error)
63 *remove = 1;
64 else if (error > 0)
65 error = 0;
66 }
67
68 out:
69 posix_acl_release(acl);
70
71 return error;
72}
73
74int gfs2_acl_validate_remove(struct gfs2_inode *ip, int access)
75{
76 if (!GFS2_SB(&ip->i_inode)->sd_args.ar_posix_acl)
77 return -EOPNOTSUPP;
78 if (current->fsuid != ip->i_di.di_uid && !capable(CAP_FOWNER))
79 return -EPERM;
80 if (S_ISLNK(ip->i_di.di_mode))
81 return -EOPNOTSUPP;
82 if (!access && !S_ISDIR(ip->i_di.di_mode))
83 return -EACCES;
84
85 return 0;
86}
87
88static int acl_get(struct gfs2_inode *ip, int access, struct posix_acl **acl,
89 struct gfs2_ea_location *el, char **data, unsigned int *len)
90{
91 struct gfs2_ea_request er;
92 struct gfs2_ea_location el_this;
93 int error;
94
95 if (!ip->i_di.di_eattr)
96 return 0;
97
98 memset(&er, 0, sizeof(struct gfs2_ea_request));
99 if (access) {
100 er.er_name = GFS2_POSIX_ACL_ACCESS;
101 er.er_name_len = GFS2_POSIX_ACL_ACCESS_LEN;
102 } else {
103 er.er_name = GFS2_POSIX_ACL_DEFAULT;
104 er.er_name_len = GFS2_POSIX_ACL_DEFAULT_LEN;
105 }
106 er.er_type = GFS2_EATYPE_SYS;
107
108 if (!el)
109 el = &el_this;
110
111 error = gfs2_ea_find(ip, &er, el);
112 if (error)
113 return error;
114 if (!el->el_ea)
115 return 0;
116 if (!GFS2_EA_DATA_LEN(el->el_ea))
117 goto out;
118
119 er.er_data_len = GFS2_EA_DATA_LEN(el->el_ea);
120 er.er_data = kmalloc(er.er_data_len, GFP_KERNEL);
121 error = -ENOMEM;
122 if (!er.er_data)
123 goto out;
124
125 error = gfs2_ea_get_copy(ip, el, er.er_data);
126 if (error)
127 goto out_kfree;
128
129 if (acl) {
130 *acl = posix_acl_from_xattr(er.er_data, er.er_data_len);
131 if (IS_ERR(*acl))
132 error = PTR_ERR(*acl);
133 }
134
135 out_kfree:
136 if (error || !data)
137 kfree(er.er_data);
138 else {
139 *data = er.er_data;
140 *len = er.er_data_len;
141 }
142
143 out:
144 if (error || el == &el_this)
145 brelse(el->el_bh);
146
147 return error;
148}
149
150/**
151 * gfs2_check_acl_locked - Check an ACL to see if we're allowed to do something
152 * @inode: the file we want to do something to
153 * @mask: what we want to do
154 *
155 * Returns: errno
156 */
157
158int gfs2_check_acl_locked(struct inode *inode, int mask)
159{
160 struct posix_acl *acl = NULL;
161 int error;
162
163 error = acl_get(GFS2_I(inode), ACL_ACCESS, &acl, NULL, NULL, NULL);
164 if (error)
165 return error;
166
167 if (acl) {
168 error = posix_acl_permission(inode, acl, mask);
169 posix_acl_release(acl);
170 return error;
171 }
172
173 return -EAGAIN;
174}
175
176int gfs2_check_acl(struct inode *inode, int mask)
177{
178 struct gfs2_inode *ip = GFS2_I(inode);
179 struct gfs2_holder i_gh;
180 int error;
181
182 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
183 if (!error) {
184 error = gfs2_check_acl_locked(inode, mask);
185 gfs2_glock_dq_uninit(&i_gh);
186 }
187
188 return error;
189}
190
191static int munge_mode(struct gfs2_inode *ip, mode_t mode)
192{
193 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
194 struct buffer_head *dibh;
195 int error;
196
197 error = gfs2_trans_begin(sdp, RES_DINODE, 0);
198 if (error)
199 return error;
200
201 error = gfs2_meta_inode_buffer(ip, &dibh);
202 if (!error) {
203 gfs2_assert_withdraw(sdp,
204 (ip->i_di.di_mode & S_IFMT) == (mode & S_IFMT));
205 ip->i_di.di_mode = mode;
206 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
207 gfs2_dinode_out(&ip->i_di, dibh->b_data);
208 brelse(dibh);
209 }
210
211 gfs2_trans_end(sdp);
212
213 return 0;
214}
215
216int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip)
217{
218 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
219 struct posix_acl *acl = NULL, *clone;
220 struct gfs2_ea_request er;
221 mode_t mode = ip->i_di.di_mode;
222 int error;
223
224 if (!sdp->sd_args.ar_posix_acl)
225 return 0;
226 if (S_ISLNK(ip->i_di.di_mode))
227 return 0;
228
229 memset(&er, 0, sizeof(struct gfs2_ea_request));
230 er.er_type = GFS2_EATYPE_SYS;
231
232 error = acl_get(dip, ACL_DEFAULT, &acl, NULL,
233 &er.er_data, &er.er_data_len);
234 if (error)
235 return error;
236 if (!acl) {
237 mode &= ~current->fs->umask;
238 if (mode != ip->i_di.di_mode)
239 error = munge_mode(ip, mode);
240 return error;
241 }
242
243 clone = posix_acl_clone(acl, GFP_KERNEL);
244 error = -ENOMEM;
245 if (!clone)
246 goto out;
247 posix_acl_release(acl);
248 acl = clone;
249
250 if (S_ISDIR(ip->i_di.di_mode)) {
251 er.er_name = GFS2_POSIX_ACL_DEFAULT;
252 er.er_name_len = GFS2_POSIX_ACL_DEFAULT_LEN;
253 error = gfs2_system_eaops.eo_set(ip, &er);
254 if (error)
255 goto out;
256 }
257
258 error = posix_acl_create_masq(acl, &mode);
259 if (error < 0)
260 goto out;
261 if (error > 0) {
262 er.er_name = GFS2_POSIX_ACL_ACCESS;
263 er.er_name_len = GFS2_POSIX_ACL_ACCESS_LEN;
264 posix_acl_to_xattr(acl, er.er_data, er.er_data_len);
265 er.er_mode = mode;
266 er.er_flags = GFS2_ERF_MODE;
267 error = gfs2_system_eaops.eo_set(ip, &er);
268 if (error)
269 goto out;
270 } else
271 munge_mode(ip, mode);
272
273 out:
274 posix_acl_release(acl);
275 kfree(er.er_data);
276 return error;
277}
278
279int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr)
280{
281 struct posix_acl *acl = NULL, *clone;
282 struct gfs2_ea_location el;
283 char *data;
284 unsigned int len;
285 int error;
286
287 error = acl_get(ip, ACL_ACCESS, &acl, &el, &data, &len);
288 if (error)
289 return error;
290 if (!acl)
291 return gfs2_setattr_simple(ip, attr);
292
293 clone = posix_acl_clone(acl, GFP_KERNEL);
294 error = -ENOMEM;
295 if (!clone)
296 goto out;
297 posix_acl_release(acl);
298 acl = clone;
299
300 error = posix_acl_chmod_masq(acl, attr->ia_mode);
301 if (!error) {
302 posix_acl_to_xattr(acl, data, len);
303 error = gfs2_ea_acl_chmod(ip, &el, attr, data);
304 }
305
306 out:
307 posix_acl_release(acl);
308 brelse(el.el_bh);
309 kfree(data);
310
311 return error;
312}
313
diff --git a/fs/gfs2/acl.h b/fs/gfs2/acl.h
new file mode 100644
index 000000000000..067105786eaa
--- /dev/null
+++ b/fs/gfs2/acl.h
@@ -0,0 +1,37 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __ACL_DOT_H__
11#define __ACL_DOT_H__
12
13#define GFS2_POSIX_ACL_ACCESS "posix_acl_access"
14#define GFS2_POSIX_ACL_ACCESS_LEN 16
15#define GFS2_POSIX_ACL_DEFAULT "posix_acl_default"
16#define GFS2_POSIX_ACL_DEFAULT_LEN 17
17
18#define GFS2_ACL_IS_ACCESS(name, len) \
19 ((len) == GFS2_POSIX_ACL_ACCESS_LEN && \
20 !memcmp(GFS2_POSIX_ACL_ACCESS, (name), (len)))
21
22#define GFS2_ACL_IS_DEFAULT(name, len) \
23 ((len) == GFS2_POSIX_ACL_DEFAULT_LEN && \
24 !memcmp(GFS2_POSIX_ACL_DEFAULT, (name), (len)))
25
26struct gfs2_ea_request;
27
28int gfs2_acl_validate_set(struct gfs2_inode *ip, int access,
29 struct gfs2_ea_request *er,
30 int *remove, mode_t *mode);
31int gfs2_acl_validate_remove(struct gfs2_inode *ip, int access);
32int gfs2_check_acl_locked(struct inode *inode, int mask);
33int gfs2_check_acl(struct inode *inode, int mask);
34int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip);
35int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr);
36
37#endif /* __ACL_DOT_H__ */
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
new file mode 100644
index 000000000000..d20d41e1c028
--- /dev/null
+++ b/fs/gfs2/bmap.c
@@ -0,0 +1,1236 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/gfs2_ondisk.h>
16#include <linux/crc32.h>
17
18#include "gfs2.h"
19#include "lm_interface.h"
20#include "incore.h"
21#include "bmap.h"
22#include "glock.h"
23#include "inode.h"
24#include "meta_io.h"
25#include "quota.h"
26#include "rgrp.h"
27#include "trans.h"
28#include "dir.h"
29#include "util.h"
30#include "ops_address.h"
31
/* This doesn't need to be that large as max 64 bit pointers in a 4k
 * block is 512, so __u16 is fine for that. It saves stack space to
 * keep it small.
 */
struct metapath {
	/* pointer index to follow at each tree height (0 = dinode) */
	__u16 mp_list[GFS2_MAX_META_HEIGHT];
};

/*
 * Callback invoked by recursive_scan() on every buffer it visits:
 * @top is the first pointer of interest in @bh, @bottom is one past
 * the last pointer, @height is the depth of @bh and @data is passed
 * through untouched.  A non-zero return stops the scan.
 */
typedef int (*block_call_t) (struct gfs2_inode *ip, struct buffer_head *dibh,
			     struct buffer_head *bh, uint64_t *top,
			     uint64_t *bottom, unsigned int height,
			     void *data);

/* State handed to do_strip() through recursive_scan()'s opaque data */
struct strip_mine {
	/* when set, do_strip() skips the first pointer it is handed */
	int sm_first;
	/* do_strip() only acts on buffers at exactly this height */
	unsigned int sm_height;
};
49
50/**
51 * gfs2_unstuffer_page - unstuff a stuffed inode into a block cached by a page
52 * @ip: the inode
53 * @dibh: the dinode buffer
54 * @block: the block number that was allocated
55 * @private: any locked page held by the caller process
56 *
57 * Returns: errno
58 */
59
60static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
61 uint64_t block, struct page *page)
62{
63 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
64 struct inode *inode = &ip->i_inode;
65 struct buffer_head *bh;
66 int release = 0;
67
68 if (!page || page->index) {
69 page = grab_cache_page(inode->i_mapping, 0);
70 if (!page)
71 return -ENOMEM;
72 release = 1;
73 }
74
75 if (!PageUptodate(page)) {
76 void *kaddr = kmap(page);
77
78 memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode),
79 ip->i_di.di_size);
80 memset(kaddr + ip->i_di.di_size, 0,
81 PAGE_CACHE_SIZE - ip->i_di.di_size);
82 kunmap(page);
83
84 SetPageUptodate(page);
85 }
86
87 if (!page_has_buffers(page))
88 create_empty_buffers(page, 1 << inode->i_blkbits,
89 (1 << BH_Uptodate));
90
91 bh = page_buffers(page);
92
93 if (!buffer_mapped(bh))
94 map_bh(bh, inode->i_sb, block);
95
96 set_buffer_uptodate(bh);
97 if ((sdp->sd_args.ar_data == GFS2_DATA_ORDERED) || gfs2_is_jdata(ip))
98 gfs2_trans_add_bh(ip->i_gl, bh, 0);
99 mark_buffer_dirty(bh);
100
101 if (release) {
102 unlock_page(page);
103 page_cache_release(page);
104 }
105
106 return 0;
107}
108
109/**
110 * gfs2_unstuff_dinode - Unstuff a dinode when the data has grown too big
111 * @ip: The GFS2 inode to unstuff
112 * @unstuffer: the routine that handles unstuffing a non-zero length file
113 * @private: private data for the unstuffer
114 *
115 * This routine unstuffs a dinode and returns it to a "normal" state such
116 * that the height can be grown in the traditional way.
117 *
118 * Returns: errno
119 */
120
121int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page)
122{
123 struct buffer_head *bh, *dibh;
124 uint64_t block = 0;
125 int isdir = gfs2_is_dir(ip);
126 int error;
127
128 down_write(&ip->i_rw_mutex);
129
130 error = gfs2_meta_inode_buffer(ip, &dibh);
131 if (error)
132 goto out;
133
134 if (ip->i_di.di_size) {
135 /* Get a free block, fill it with the stuffed data,
136 and write it out to disk */
137
138 if (isdir) {
139 block = gfs2_alloc_meta(ip);
140
141 error = gfs2_dir_get_new_buffer(ip, block, &bh);
142 if (error)
143 goto out_brelse;
144 gfs2_buffer_copy_tail(bh,
145 sizeof(struct gfs2_meta_header),
146 dibh, sizeof(struct gfs2_dinode));
147 brelse(bh);
148 } else {
149 block = gfs2_alloc_data(ip);
150
151 error = gfs2_unstuffer_page(ip, dibh, block, page);
152 if (error)
153 goto out_brelse;
154 }
155 }
156
157 /* Set up the pointer to the new block */
158
159 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
160
161 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
162
163 if (ip->i_di.di_size) {
164 *(uint64_t *)(dibh->b_data + sizeof(struct gfs2_dinode)) =
165 cpu_to_be64(block);
166 ip->i_di.di_blocks++;
167 }
168
169 ip->i_di.di_height = 1;
170
171 gfs2_dinode_out(&ip->i_di, dibh->b_data);
172
173 out_brelse:
174 brelse(dibh);
175
176 out:
177 up_write(&ip->i_rw_mutex);
178
179 return error;
180}
181
182/**
183 * calc_tree_height - Calculate the height of a metadata tree
184 * @ip: The GFS2 inode
185 * @size: The proposed size of the file
186 *
187 * Work out how tall a metadata tree needs to be in order to accommodate a
188 * file of a particular size. If size is less than the current size of
189 * the inode, then the current size of the inode is used instead of the
190 * supplied one.
191 *
192 * Returns: the height the tree should be
193 */
194
195static unsigned int calc_tree_height(struct gfs2_inode *ip, uint64_t size)
196{
197 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
198 uint64_t *arr;
199 unsigned int max, height;
200
201 if (ip->i_di.di_size > size)
202 size = ip->i_di.di_size;
203
204 if (gfs2_is_dir(ip)) {
205 arr = sdp->sd_jheightsize;
206 max = sdp->sd_max_jheight;
207 } else {
208 arr = sdp->sd_heightsize;
209 max = sdp->sd_max_height;
210 }
211
212 for (height = 0; height < max; height++)
213 if (arr[height] >= size)
214 break;
215
216 return height;
217}
218
219/**
220 * build_height - Build a metadata tree of the requested height
221 * @ip: The GFS2 inode
222 * @height: The height to build to
223 *
224 *
225 * Returns: errno
226 */
227
228static int build_height(struct inode *inode, unsigned height)
229{
230 struct gfs2_inode *ip = GFS2_I(inode);
231 unsigned new_height = height - ip->i_di.di_height;
232 struct buffer_head *dibh;
233 struct buffer_head *blocks[GFS2_MAX_META_HEIGHT];
234 int error;
235 u64 *bp;
236 u64 bn;
237 unsigned n;
238
239 if (height <= ip->i_di.di_height)
240 return 0;
241
242 error = gfs2_meta_inode_buffer(ip, &dibh);
243 if (error)
244 return error;
245
246 for(n = 0; n < new_height; n++) {
247 bn = gfs2_alloc_meta(ip);
248 blocks[n] = gfs2_meta_new(ip->i_gl, bn);
249 gfs2_trans_add_bh(ip->i_gl, blocks[n], 1);
250 }
251
252 n = 0;
253 bn = blocks[0]->b_blocknr;
254 if (new_height > 1) {
255 for(; n < new_height-1; n++) {
256 gfs2_metatype_set(blocks[n], GFS2_METATYPE_IN,
257 GFS2_FORMAT_IN);
258 gfs2_buffer_clear_tail(blocks[n],
259 sizeof(struct gfs2_meta_header));
260 bp = (u64 *)(blocks[n]->b_data +
261 sizeof(struct gfs2_meta_header));
262 *bp = cpu_to_be64(blocks[n+1]->b_blocknr);
263 brelse(blocks[n]);
264 blocks[n] = NULL;
265 }
266 }
267 gfs2_metatype_set(blocks[n], GFS2_METATYPE_IN, GFS2_FORMAT_IN);
268 gfs2_buffer_copy_tail(blocks[n], sizeof(struct gfs2_meta_header),
269 dibh, sizeof(struct gfs2_dinode));
270 brelse(blocks[n]);
271 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
272 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
273 bp = (u64 *)(dibh->b_data + sizeof(struct gfs2_dinode));
274 *bp = cpu_to_be64(bn);
275 ip->i_di.di_height += new_height;
276 ip->i_di.di_blocks += new_height;
277 gfs2_dinode_out(&ip->i_di, dibh->b_data);
278 brelse(dibh);
279 return error;
280}
281
282/**
283 * find_metapath - Find path through the metadata tree
284 * @ip: The inode pointer
285 * @mp: The metapath to return the result in
286 * @block: The disk block to look up
287 *
288 * This routine returns a struct metapath structure that defines a path
289 * through the metadata of inode "ip" to get to block "block".
290 *
291 * Example:
292 * Given: "ip" is a height 3 file, "offset" is 101342453, and this is a
293 * filesystem with a blocksize of 4096.
294 *
 * find_metapath() would fill in the struct metapath with
 * mp_list[0] = 0, mp_list[1] = 48, and mp_list[2] = 165.
298 *
299 * That means that in order to get to the block containing the byte at
300 * offset 101342453, we would load the indirect block pointed to by pointer
301 * 0 in the dinode. We would then load the indirect block pointed to by
302 * pointer 48 in that indirect block. We would then load the data block
303 * pointed to by pointer 165 in that indirect block.
304 *
305 * ----------------------------------------
306 * | Dinode | |
307 * | | 4|
308 * | |0 1 2 3 4 5 9|
309 * | | 6|
310 * ----------------------------------------
311 * |
312 * |
313 * V
314 * ----------------------------------------
315 * | Indirect Block |
316 * | 5|
317 * | 4 4 4 4 4 5 5 1|
318 * |0 5 6 7 8 9 0 1 2|
319 * ----------------------------------------
320 * |
321 * |
322 * V
323 * ----------------------------------------
324 * | Indirect Block |
325 * | 1 1 1 1 1 5|
326 * | 6 6 6 6 6 1|
327 * |0 3 4 5 6 7 2|
328 * ----------------------------------------
329 * |
330 * |
331 * V
332 * ----------------------------------------
333 * | Data block containing offset |
334 * | 101342453 |
335 * | |
336 * | |
337 * ----------------------------------------
338 *
339 */
340
341static void find_metapath(struct gfs2_inode *ip, uint64_t block,
342 struct metapath *mp)
343{
344 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
345 uint64_t b = block;
346 unsigned int i;
347
348 for (i = ip->i_di.di_height; i--;)
349 mp->mp_list[i] = (__u16)do_div(b, sdp->sd_inptrs);
350
351}
352
353/**
354 * metapointer - Return pointer to start of metadata in a buffer
355 * @bh: The buffer
356 * @height: The metadata height (0 = dinode)
357 * @mp: The metapath
358 *
359 * Return a pointer to the block number of the next height of the metadata
360 * tree given a buffer containing the pointer to the current height of the
361 * metadata tree.
362 */
363
364static inline u64 *metapointer(struct buffer_head *bh, int *boundary,
365 unsigned int height, const struct metapath *mp)
366{
367 unsigned int head_size = (height > 0) ?
368 sizeof(struct gfs2_meta_header) : sizeof(struct gfs2_dinode);
369 u64 *ptr;
370 *boundary = 0;
371 ptr = ((u64 *)(bh->b_data + head_size)) + mp->mp_list[height];
372 if (ptr + 1 == (u64*)(bh->b_data + bh->b_size))
373 *boundary = 1;
374 return ptr;
375}
376
377/**
378 * lookup_block - Get the next metadata block in metadata tree
379 * @ip: The GFS2 inode
380 * @bh: Buffer containing the pointers to metadata blocks
381 * @height: The height of the tree (0 = dinode)
382 * @mp: The metapath
383 * @create: Non-zero if we may create a new meatdata block
384 * @new: Used to indicate if we did create a new metadata block
385 * @block: the returned disk block number
386 *
387 * Given a metatree, complete to a particular height, checks to see if the next
388 * height of the tree exists. If not the next height of the tree is created.
389 * The block number of the next height of the metadata tree is returned.
390 *
391 */
392
393static int lookup_block(struct gfs2_inode *ip, struct buffer_head *bh,
394 unsigned int height, struct metapath *mp, int create,
395 int *new, uint64_t *block)
396{
397 int boundary;
398 uint64_t *ptr = metapointer(bh, &boundary, height, mp);
399
400 if (*ptr) {
401 *block = be64_to_cpu(*ptr);
402 return boundary;
403 }
404
405 *block = 0;
406
407 if (!create)
408 return 0;
409
410 if (height == ip->i_di.di_height - 1 && !gfs2_is_dir(ip))
411 *block = gfs2_alloc_data(ip);
412 else
413 *block = gfs2_alloc_meta(ip);
414
415 gfs2_trans_add_bh(ip->i_gl, bh, 1);
416
417 *ptr = cpu_to_be64(*block);
418 ip->i_di.di_blocks++;
419
420 *new = 1;
421 return 0;
422}
423
424/**
425 * gfs2_block_pointers - Map a block from an inode to a disk block
426 * @inode: The inode
427 * @lblock: The logical block number
428 * @new: Value/Result argument (1 = may create/did create new blocks)
429 * @boundary: gets set if we've hit a block boundary
430 * @mp: metapath to use
431 *
432 * Find the block number on the current device which corresponds to an
433 * inode's block. If the block had to be created, "new" will be set.
434 *
435 * Returns: errno
436 */
437
438static struct buffer_head *gfs2_block_pointers(struct inode *inode, u64 lblock,
439 int *new, u64 *dblock,
440 int *boundary,
441 struct metapath *mp)
442{
443 struct gfs2_inode *ip = GFS2_I(inode);
444 struct gfs2_sbd *sdp = GFS2_SB(inode);
445 struct buffer_head *bh;
446 int create = *new;
447 unsigned int bsize;
448 unsigned int height;
449 unsigned int end_of_metadata;
450 unsigned int x;
451 int error = 0;
452
453 *new = 0;
454 *dblock = 0;
455
456 if (gfs2_assert_warn(sdp, !gfs2_is_stuffed(ip)))
457 goto out;
458
459 bsize = (gfs2_is_dir(ip)) ? sdp->sd_jbsize : sdp->sd_sb.sb_bsize;
460
461 height = calc_tree_height(ip, (lblock + 1) * bsize);
462 if (ip->i_di.di_height < height) {
463 if (!create)
464 goto out;
465
466 error = build_height(inode, height);
467 if (error)
468 goto out;
469 }
470
471 find_metapath(ip, lblock, mp);
472 end_of_metadata = ip->i_di.di_height - 1;
473
474 error = gfs2_meta_inode_buffer(ip, &bh);
475 if (error)
476 goto out;
477
478 for (x = 0; x < end_of_metadata; x++) {
479 lookup_block(ip, bh, x, mp, create, new, dblock);
480 brelse(bh);
481 if (!*dblock)
482 goto out;
483
484 error = gfs2_meta_indirect_buffer(ip, x+1, *dblock, *new, &bh);
485 if (error)
486 goto out;
487 }
488
489 *boundary = lookup_block(ip, bh, end_of_metadata, mp, create, new, dblock);
490 if (*new) {
491 struct buffer_head *dibh;
492 error = gfs2_meta_inode_buffer(ip, &dibh);
493 if (!error) {
494 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
495 gfs2_dinode_out(&ip->i_di, dibh->b_data);
496 brelse(dibh);
497 }
498 }
499 return bh;
500out:
501 return ERR_PTR(error);
502}
503
504
505static inline void bmap_lock(struct inode *inode, int create)
506{
507 struct gfs2_inode *ip = GFS2_I(inode);
508 if (create)
509 down_write(&ip->i_rw_mutex);
510 else
511 down_read(&ip->i_rw_mutex);
512}
513
514static inline void bmap_unlock(struct inode *inode, int create)
515{
516 struct gfs2_inode *ip = GFS2_I(inode);
517 if (create)
518 up_write(&ip->i_rw_mutex);
519 else
520 up_read(&ip->i_rw_mutex);
521}
522
523int gfs2_block_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, int *boundary)
524{
525 struct metapath mp;
526 struct buffer_head *bh;
527 int create = *new;
528
529 bmap_lock(inode, create);
530 bh = gfs2_block_pointers(inode, lblock, new, dblock, boundary, &mp);
531 bmap_unlock(inode, create);
532 if (!bh)
533 return 0;
534 if (IS_ERR(bh))
535 return PTR_ERR(bh);
536 brelse(bh);
537 return 0;
538}
539
540int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsigned *extlen)
541{
542 struct gfs2_inode *ip = GFS2_I(inode);
543 struct gfs2_sbd *sdp = GFS2_SB(inode);
544 struct metapath mp;
545 struct buffer_head *bh;
546 int boundary;
547 int create = *new;
548
549 BUG_ON(!extlen);
550 BUG_ON(!dblock);
551 BUG_ON(!new);
552
553 bmap_lock(inode, create);
554 bh = gfs2_block_pointers(inode, lblock, new, dblock, &boundary, &mp);
555 *extlen = 1;
556
557 if (bh && !IS_ERR(bh) && *dblock && !*new) {
558 u64 tmp_dblock;
559 int tmp_new;
560 unsigned int nptrs;
561 unsigned end_of_metadata = ip->i_di.di_height - 1;
562
563 nptrs = (end_of_metadata) ? sdp->sd_inptrs : sdp->sd_diptrs;
564 while (++mp.mp_list[end_of_metadata] < nptrs) {
565 lookup_block(ip, bh, end_of_metadata, &mp, 0, &tmp_new, &tmp_dblock);
566 if (*dblock + *extlen != tmp_dblock)
567 break;
568 (*extlen)++;
569 }
570 }
571 bmap_unlock(inode, create);
572 if (!bh)
573 return 0;
574 if (IS_ERR(bh))
575 return PTR_ERR(bh);
576 brelse(bh);
577 return 0;
578}
579
580/**
581 * recursive_scan - recursively scan through the end of a file
582 * @ip: the inode
583 * @dibh: the dinode buffer
584 * @mp: the path through the metadata to the point to start
585 * @height: the height the recursion is at
586 * @block: the indirect block to look at
587 * @first: 1 if this is the first block
588 * @bc: the call to make for each piece of metadata
589 * @data: data opaque to this function to pass to @bc
590 *
591 * When this is first called @height and @block should be zero and
592 * @first should be 1.
593 *
594 * Returns: errno
595 */
596
597static int recursive_scan(struct gfs2_inode *ip, struct buffer_head *dibh,
598 struct metapath *mp, unsigned int height,
599 uint64_t block, int first, block_call_t bc,
600 void *data)
601{
602 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
603 struct buffer_head *bh = NULL;
604 uint64_t *top, *bottom;
605 uint64_t bn;
606 int error;
607 int mh_size = sizeof(struct gfs2_meta_header);
608
609 if (!height) {
610 error = gfs2_meta_inode_buffer(ip, &bh);
611 if (error)
612 return error;
613 dibh = bh;
614
615 top = (uint64_t *)(bh->b_data + sizeof(struct gfs2_dinode)) +
616 mp->mp_list[0];
617 bottom = (uint64_t *)(bh->b_data + sizeof(struct gfs2_dinode)) +
618 sdp->sd_diptrs;
619 } else {
620 error = gfs2_meta_indirect_buffer(ip, height, block, 0, &bh);
621 if (error)
622 return error;
623
624 top = (uint64_t *)(bh->b_data + mh_size) +
625 ((first) ? mp->mp_list[height] : 0);
626
627 bottom = (uint64_t *)(bh->b_data + mh_size) + sdp->sd_inptrs;
628 }
629
630 error = bc(ip, dibh, bh, top, bottom, height, data);
631 if (error)
632 goto out;
633
634 if (height < ip->i_di.di_height - 1)
635 for (; top < bottom; top++, first = 0) {
636 if (!*top)
637 continue;
638
639 bn = be64_to_cpu(*top);
640
641 error = recursive_scan(ip, dibh, mp, height + 1, bn,
642 first, bc, data);
643 if (error)
644 break;
645 }
646
647 out:
648 brelse(bh);
649
650 return error;
651}
652
653/**
654 * do_strip - Look for a layer a particular layer of the file and strip it off
655 * @ip: the inode
656 * @dibh: the dinode buffer
657 * @bh: A buffer of pointers
658 * @top: The first pointer in the buffer
659 * @bottom: One more than the last pointer
660 * @height: the height this buffer is at
661 * @data: a pointer to a struct strip_mine
662 *
663 * Returns: errno
664 */
665
666static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
667 struct buffer_head *bh, uint64_t *top, uint64_t *bottom,
668 unsigned int height, void *data)
669{
670 struct strip_mine *sm = data;
671 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
672 struct gfs2_rgrp_list rlist;
673 uint64_t bn, bstart;
674 uint32_t blen;
675 uint64_t *p;
676 unsigned int rg_blocks = 0;
677 int metadata;
678 unsigned int revokes = 0;
679 int x;
680 int error;
681
682 if (!*top)
683 sm->sm_first = 0;
684
685 if (height != sm->sm_height)
686 return 0;
687
688 if (sm->sm_first) {
689 top++;
690 sm->sm_first = 0;
691 }
692
693 metadata = (height != ip->i_di.di_height - 1);
694 if (metadata)
695 revokes = (height) ? sdp->sd_inptrs : sdp->sd_diptrs;
696
697 error = gfs2_rindex_hold(sdp, &ip->i_alloc.al_ri_gh);
698 if (error)
699 return error;
700
701 memset(&rlist, 0, sizeof(struct gfs2_rgrp_list));
702 bstart = 0;
703 blen = 0;
704
705 for (p = top; p < bottom; p++) {
706 if (!*p)
707 continue;
708
709 bn = be64_to_cpu(*p);
710
711 if (bstart + blen == bn)
712 blen++;
713 else {
714 if (bstart)
715 gfs2_rlist_add(sdp, &rlist, bstart);
716
717 bstart = bn;
718 blen = 1;
719 }
720 }
721
722 if (bstart)
723 gfs2_rlist_add(sdp, &rlist, bstart);
724 else
725 goto out; /* Nothing to do */
726
727 gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE, 0);
728
729 for (x = 0; x < rlist.rl_rgrps; x++) {
730 struct gfs2_rgrpd *rgd;
731 rgd = rlist.rl_ghs[x].gh_gl->gl_object;
732 rg_blocks += rgd->rd_ri.ri_length;
733 }
734
735 error = gfs2_glock_nq_m(rlist.rl_rgrps, rlist.rl_ghs);
736 if (error)
737 goto out_rlist;
738
739 error = gfs2_trans_begin(sdp, rg_blocks + RES_DINODE +
740 RES_INDIRECT + RES_STATFS + RES_QUOTA,
741 revokes);
742 if (error)
743 goto out_rg_gunlock;
744
745 down_write(&ip->i_rw_mutex);
746
747 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
748 gfs2_trans_add_bh(ip->i_gl, bh, 1);
749
750 bstart = 0;
751 blen = 0;
752
753 for (p = top; p < bottom; p++) {
754 if (!*p)
755 continue;
756
757 bn = be64_to_cpu(*p);
758
759 if (bstart + blen == bn)
760 blen++;
761 else {
762 if (bstart) {
763 if (metadata)
764 gfs2_free_meta(ip, bstart, blen);
765 else
766 gfs2_free_data(ip, bstart, blen);
767 }
768
769 bstart = bn;
770 blen = 1;
771 }
772
773 *p = 0;
774 if (!ip->i_di.di_blocks)
775 gfs2_consist_inode(ip);
776 ip->i_di.di_blocks--;
777 }
778 if (bstart) {
779 if (metadata)
780 gfs2_free_meta(ip, bstart, blen);
781 else
782 gfs2_free_data(ip, bstart, blen);
783 }
784
785 ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
786
787 gfs2_dinode_out(&ip->i_di, dibh->b_data);
788
789 up_write(&ip->i_rw_mutex);
790
791 gfs2_trans_end(sdp);
792
793 out_rg_gunlock:
794 gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs);
795
796 out_rlist:
797 gfs2_rlist_free(&rlist);
798
799 out:
800 gfs2_glock_dq_uninit(&ip->i_alloc.al_ri_gh);
801
802 return error;
803}
804
805/**
806 * do_grow - Make a file look bigger than it is
807 * @ip: the inode
808 * @size: the size to set the file to
809 *
810 * Called with an exclusive lock on @ip.
811 *
812 * Returns: errno
813 */
814
815static int do_grow(struct gfs2_inode *ip, uint64_t size)
816{
817 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
818 struct gfs2_alloc *al;
819 struct buffer_head *dibh;
820 unsigned int h;
821 int error;
822
823 al = gfs2_alloc_get(ip);
824
825 error = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
826 if (error)
827 goto out;
828
829 error = gfs2_quota_check(ip, ip->i_di.di_uid, ip->i_di.di_gid);
830 if (error)
831 goto out_gunlock_q;
832
833 al->al_requested = sdp->sd_max_height + RES_DATA;
834
835 error = gfs2_inplace_reserve(ip);
836 if (error)
837 goto out_gunlock_q;
838
839 error = gfs2_trans_begin(sdp,
840 sdp->sd_max_height + al->al_rgd->rd_ri.ri_length +
841 RES_JDATA + RES_DINODE + RES_STATFS + RES_QUOTA, 0);
842 if (error)
843 goto out_ipres;
844
845 if (size > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) {
846 if (gfs2_is_stuffed(ip)) {
847 error = gfs2_unstuff_dinode(ip, NULL);
848 if (error)
849 goto out_end_trans;
850 }
851
852 h = calc_tree_height(ip, size);
853 if (ip->i_di.di_height < h) {
854 down_write(&ip->i_rw_mutex);
855 error = build_height(&ip->i_inode, h);
856 up_write(&ip->i_rw_mutex);
857 if (error)
858 goto out_end_trans;
859 }
860 }
861
862 ip->i_di.di_size = size;
863 ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
864
865 error = gfs2_meta_inode_buffer(ip, &dibh);
866 if (error)
867 goto out_end_trans;
868
869 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
870 gfs2_dinode_out(&ip->i_di, dibh->b_data);
871 brelse(dibh);
872
873 out_end_trans:
874 gfs2_trans_end(sdp);
875
876 out_ipres:
877 gfs2_inplace_release(ip);
878
879 out_gunlock_q:
880 gfs2_quota_unlock(ip);
881
882 out:
883 gfs2_alloc_put(ip);
884
885 return error;
886}
887
888
889/**
890 * gfs2_block_truncate_page - Deal with zeroing out data for truncate
891 *
892 * This is partly borrowed from ext3.
893 */
894static int gfs2_block_truncate_page(struct address_space *mapping)
895{
896 struct inode *inode = mapping->host;
897 struct gfs2_inode *ip = GFS2_I(inode);
898 struct gfs2_sbd *sdp = GFS2_SB(inode);
899 loff_t from = inode->i_size;
900 unsigned long index = from >> PAGE_CACHE_SHIFT;
901 unsigned offset = from & (PAGE_CACHE_SIZE-1);
902 unsigned blocksize, iblock, length, pos;
903 struct buffer_head *bh;
904 struct page *page;
905 void *kaddr;
906 int err;
907
908 page = grab_cache_page(mapping, index);
909 if (!page)
910 return 0;
911
912 blocksize = inode->i_sb->s_blocksize;
913 length = blocksize - (offset & (blocksize - 1));
914 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
915
916 if (!page_has_buffers(page))
917 create_empty_buffers(page, blocksize, 0);
918
919 /* Find the buffer that contains "offset" */
920 bh = page_buffers(page);
921 pos = blocksize;
922 while (offset >= pos) {
923 bh = bh->b_this_page;
924 iblock++;
925 pos += blocksize;
926 }
927
928 err = 0;
929
930 if (!buffer_mapped(bh)) {
931 gfs2_get_block(inode, iblock, bh, 0);
932 /* unmapped? It's a hole - nothing to do */
933 if (!buffer_mapped(bh))
934 goto unlock;
935 }
936
937 /* Ok, it's mapped. Make sure it's up-to-date */
938 if (PageUptodate(page))
939 set_buffer_uptodate(bh);
940
941 if (!buffer_uptodate(bh)) {
942 err = -EIO;
943 ll_rw_block(READ, 1, &bh);
944 wait_on_buffer(bh);
945 /* Uhhuh. Read error. Complain and punt. */
946 if (!buffer_uptodate(bh))
947 goto unlock;
948 }
949
950 if (sdp->sd_args.ar_data == GFS2_DATA_ORDERED || gfs2_is_jdata(ip))
951 gfs2_trans_add_bh(ip->i_gl, bh, 0);
952
953 kaddr = kmap_atomic(page, KM_USER0);
954 memset(kaddr + offset, 0, length);
955 flush_dcache_page(page);
956 kunmap_atomic(kaddr, KM_USER0);
957
958unlock:
959 unlock_page(page);
960 page_cache_release(page);
961 return err;
962}
963
964static int trunc_start(struct gfs2_inode *ip, uint64_t size)
965{
966 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
967 struct buffer_head *dibh;
968 int journaled = gfs2_is_jdata(ip);
969 int error;
970
971 error = gfs2_trans_begin(sdp,
972 RES_DINODE + ((journaled) ? RES_JDATA : 0), 0);
973 if (error)
974 return error;
975
976 error = gfs2_meta_inode_buffer(ip, &dibh);
977 if (error)
978 goto out;
979
980 if (gfs2_is_stuffed(ip)) {
981 ip->i_di.di_size = size;
982 ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
983 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
984 gfs2_dinode_out(&ip->i_di, dibh->b_data);
985 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode) + size);
986 error = 1;
987
988 } else {
989 if (size & (uint64_t)(sdp->sd_sb.sb_bsize - 1))
990 error = gfs2_block_truncate_page(ip->i_inode.i_mapping);
991
992 if (!error) {
993 ip->i_di.di_size = size;
994 ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
995 ip->i_di.di_flags |= GFS2_DIF_TRUNC_IN_PROG;
996 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
997 gfs2_dinode_out(&ip->i_di, dibh->b_data);
998 }
999 }
1000
1001 brelse(dibh);
1002
1003 out:
1004 gfs2_trans_end(sdp);
1005
1006 return error;
1007}
1008
1009static int trunc_dealloc(struct gfs2_inode *ip, uint64_t size)
1010{
1011 unsigned int height = ip->i_di.di_height;
1012 uint64_t lblock;
1013 struct metapath mp;
1014 int error;
1015
1016 if (!size)
1017 lblock = 0;
1018 else
1019 lblock = (size - 1) >> GFS2_SB(&ip->i_inode)->sd_sb.sb_bsize_shift;
1020
1021 find_metapath(ip, lblock, &mp);
1022 gfs2_alloc_get(ip);
1023
1024 error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
1025 if (error)
1026 goto out;
1027
1028 while (height--) {
1029 struct strip_mine sm;
1030 sm.sm_first = !!size;
1031 sm.sm_height = height;
1032
1033 error = recursive_scan(ip, NULL, &mp, 0, 0, 1, do_strip, &sm);
1034 if (error)
1035 break;
1036 }
1037
1038 gfs2_quota_unhold(ip);
1039
1040 out:
1041 gfs2_alloc_put(ip);
1042 return error;
1043}
1044
1045static int trunc_end(struct gfs2_inode *ip)
1046{
1047 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1048 struct buffer_head *dibh;
1049 int error;
1050
1051 error = gfs2_trans_begin(sdp, RES_DINODE, 0);
1052 if (error)
1053 return error;
1054
1055 down_write(&ip->i_rw_mutex);
1056
1057 error = gfs2_meta_inode_buffer(ip, &dibh);
1058 if (error)
1059 goto out;
1060
1061 if (!ip->i_di.di_size) {
1062 ip->i_di.di_height = 0;
1063 ip->i_di.di_goal_meta =
1064 ip->i_di.di_goal_data =
1065 ip->i_num.no_addr;
1066 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
1067 }
1068 ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
1069 ip->i_di.di_flags &= ~GFS2_DIF_TRUNC_IN_PROG;
1070
1071 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1072 gfs2_dinode_out(&ip->i_di, dibh->b_data);
1073 brelse(dibh);
1074
1075 out:
1076 up_write(&ip->i_rw_mutex);
1077
1078 gfs2_trans_end(sdp);
1079
1080 return error;
1081}
1082
1083/**
1084 * do_shrink - make a file smaller
1085 * @ip: the inode
1086 * @size: the size to make the file
1087 * @truncator: function to truncate the last partial block
1088 *
1089 * Called with an exclusive lock on @ip.
1090 *
1091 * Returns: errno
1092 */
1093
1094static int do_shrink(struct gfs2_inode *ip, uint64_t size)
1095{
1096 int error;
1097
1098 error = trunc_start(ip, size);
1099 if (error < 0)
1100 return error;
1101 if (error > 0)
1102 return 0;
1103
1104 error = trunc_dealloc(ip, size);
1105 if (!error)
1106 error = trunc_end(ip);
1107
1108 return error;
1109}
1110
1111/**
1112 * gfs2_truncatei - make a file a given size
1113 * @ip: the inode
1114 * @size: the size to make the file
1115 * @truncator: function to truncate the last partial block
1116 *
1117 * The file size can grow, shrink, or stay the same size.
1118 *
1119 * Returns: errno
1120 */
1121
1122int gfs2_truncatei(struct gfs2_inode *ip, uint64_t size)
1123{
1124 int error;
1125
1126 if (gfs2_assert_warn(GFS2_SB(&ip->i_inode), S_ISREG(ip->i_di.di_mode)))
1127 return -EINVAL;
1128
1129 if (size > ip->i_di.di_size)
1130 error = do_grow(ip, size);
1131 else
1132 error = do_shrink(ip, size);
1133
1134 return error;
1135}
1136
1137int gfs2_truncatei_resume(struct gfs2_inode *ip)
1138{
1139 int error;
1140 error = trunc_dealloc(ip, ip->i_di.di_size);
1141 if (!error)
1142 error = trunc_end(ip);
1143 return error;
1144}
1145
1146int gfs2_file_dealloc(struct gfs2_inode *ip)
1147{
1148 return trunc_dealloc(ip, 0);
1149}
1150
1151/**
1152 * gfs2_write_calc_reserv - calculate number of blocks needed to write to a file
1153 * @ip: the file
1154 * @len: the number of bytes to be written to the file
1155 * @data_blocks: returns the number of data blocks required
1156 * @ind_blocks: returns the number of indirect blocks required
1157 *
1158 */
1159
1160void gfs2_write_calc_reserv(struct gfs2_inode *ip, unsigned int len,
1161 unsigned int *data_blocks, unsigned int *ind_blocks)
1162{
1163 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1164 unsigned int tmp;
1165
1166 if (gfs2_is_dir(ip)) {
1167 *data_blocks = DIV_ROUND_UP(len, sdp->sd_jbsize) + 2;
1168 *ind_blocks = 3 * (sdp->sd_max_jheight - 1);
1169 } else {
1170 *data_blocks = (len >> sdp->sd_sb.sb_bsize_shift) + 3;
1171 *ind_blocks = 3 * (sdp->sd_max_height - 1);
1172 }
1173
1174 for (tmp = *data_blocks; tmp > sdp->sd_diptrs;) {
1175 tmp = DIV_ROUND_UP(tmp, sdp->sd_inptrs);
1176 *ind_blocks += tmp;
1177 }
1178}
1179
1180/**
1181 * gfs2_write_alloc_required - figure out if a write will require an allocation
1182 * @ip: the file being written to
1183 * @offset: the offset to write to
1184 * @len: the number of bytes being written
1185 * @alloc_required: set to 1 if an alloc is required, 0 otherwise
1186 *
1187 * Returns: errno
1188 */
1189
1190int gfs2_write_alloc_required(struct gfs2_inode *ip, uint64_t offset,
1191 unsigned int len, int *alloc_required)
1192{
1193 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1194 uint64_t lblock, lblock_stop, dblock;
1195 uint32_t extlen;
1196 int new = 0;
1197 int error = 0;
1198
1199 *alloc_required = 0;
1200
1201 if (!len)
1202 return 0;
1203
1204 if (gfs2_is_stuffed(ip)) {
1205 if (offset + len >
1206 sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode))
1207 *alloc_required = 1;
1208 return 0;
1209 }
1210
1211 if (gfs2_is_dir(ip)) {
1212 unsigned int bsize = sdp->sd_jbsize;
1213 lblock = offset;
1214 do_div(lblock, bsize);
1215 lblock_stop = offset + len + bsize - 1;
1216 do_div(lblock_stop, bsize);
1217 } else {
1218 unsigned int shift = sdp->sd_sb.sb_bsize_shift;
1219 lblock = offset >> shift;
1220 lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift;
1221 }
1222
1223 for (; lblock < lblock_stop; lblock += extlen) {
1224 error = gfs2_extent_map(&ip->i_inode, lblock, &new, &dblock, &extlen);
1225 if (error)
1226 return error;
1227
1228 if (!dblock) {
1229 *alloc_required = 1;
1230 return 0;
1231 }
1232 }
1233
1234 return 0;
1235}
1236
diff --git a/fs/gfs2/bmap.h b/fs/gfs2/bmap.h
new file mode 100644
index 000000000000..1a265412f7ee
--- /dev/null
+++ b/fs/gfs2/bmap.h
@@ -0,0 +1,27 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __BMAP_DOT_H__
11#define __BMAP_DOT_H__
12
13int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page);
14int gfs2_block_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, int *boundary);
15int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsigned *extlen);
16
17int gfs2_truncatei(struct gfs2_inode *ip, uint64_t size);
18int gfs2_truncatei_resume(struct gfs2_inode *ip);
19int gfs2_file_dealloc(struct gfs2_inode *ip);
20
21void gfs2_write_calc_reserv(struct gfs2_inode *ip, unsigned int len,
22 unsigned int *data_blocks,
23 unsigned int *ind_blocks);
24int gfs2_write_alloc_required(struct gfs2_inode *ip, uint64_t offset,
25 unsigned int len, int *alloc_required);
26
27#endif /* __BMAP_DOT_H__ */
diff --git a/fs/gfs2/daemon.c b/fs/gfs2/daemon.c
new file mode 100644
index 000000000000..1453605c8f32
--- /dev/null
+++ b/fs/gfs2/daemon.c
@@ -0,0 +1,196 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/kthread.h>
16#include <linux/delay.h>
17#include <linux/gfs2_ondisk.h>
18
19#include "gfs2.h"
20#include "lm_interface.h"
21#include "incore.h"
22#include "daemon.h"
23#include "glock.h"
24#include "log.h"
25#include "quota.h"
26#include "recovery.h"
27#include "super.h"
28#include "util.h"
29
30/* This uses schedule_timeout() instead of msleep() because it's good for
31 the daemons to wake up more often than the timeout when unmounting so
32 the user's unmount doesn't sit there forever.
33
34 The kthread functions used to start these daemons block and flush signals. */
35
36/**
37 * gfs2_scand - Look for cached glocks and inodes to toss from memory
38 * @sdp: Pointer to GFS2 superblock
39 *
40 * One of these daemons runs, finding candidates to add to sd_reclaim_list.
41 * See gfs2_glockd()
42 */
43
44int gfs2_scand(void *data)
45{
46 struct gfs2_sbd *sdp = data;
47 unsigned long t;
48
49 while (!kthread_should_stop()) {
50 gfs2_scand_internal(sdp);
51 t = gfs2_tune_get(sdp, gt_scand_secs) * HZ;
52 schedule_timeout_interruptible(t);
53 }
54
55 return 0;
56}
57
58/**
59 * gfs2_glockd - Reclaim unused glock structures
60 * @sdp: Pointer to GFS2 superblock
61 *
62 * One or more of these daemons run, reclaiming glocks on sd_reclaim_list.
63 * Number of daemons can be set by user, with num_glockd mount option.
64 */
65
66int gfs2_glockd(void *data)
67{
68 struct gfs2_sbd *sdp = data;
69
70 while (!kthread_should_stop()) {
71 while (atomic_read(&sdp->sd_reclaim_count))
72 gfs2_reclaim_glock(sdp);
73
74 wait_event_interruptible(sdp->sd_reclaim_wq,
75 (atomic_read(&sdp->sd_reclaim_count) ||
76 kthread_should_stop()));
77 }
78
79 return 0;
80}
81
82/**
83 * gfs2_recoverd - Recover dead machine's journals
84 * @sdp: Pointer to GFS2 superblock
85 *
86 */
87
88int gfs2_recoverd(void *data)
89{
90 struct gfs2_sbd *sdp = data;
91 unsigned long t;
92
93 while (!kthread_should_stop()) {
94 gfs2_check_journals(sdp);
95 t = gfs2_tune_get(sdp, gt_recoverd_secs) * HZ;
96 schedule_timeout_interruptible(t);
97 }
98
99 return 0;
100}
101
102/**
103 * gfs2_logd - Update log tail as Active Items get flushed to in-place blocks
104 * @sdp: Pointer to GFS2 superblock
105 *
106 * Also, periodically check to make sure that we're using the most recent
107 * journal index.
108 */
109
110int gfs2_logd(void *data)
111{
112 struct gfs2_sbd *sdp = data;
113 struct gfs2_holder ji_gh;
114 unsigned long t;
115
116 while (!kthread_should_stop()) {
117 /* Advance the log tail */
118
119 t = sdp->sd_log_flush_time +
120 gfs2_tune_get(sdp, gt_log_flush_secs) * HZ;
121
122 gfs2_ail1_empty(sdp, DIO_ALL);
123
124 if (time_after_eq(jiffies, t)) {
125 gfs2_log_flush(sdp, NULL);
126 sdp->sd_log_flush_time = jiffies;
127 }
128
129 /* Check for latest journal index */
130
131 t = sdp->sd_jindex_refresh_time +
132 gfs2_tune_get(sdp, gt_jindex_refresh_secs) * HZ;
133
134 if (time_after_eq(jiffies, t)) {
135 if (!gfs2_jindex_hold(sdp, &ji_gh))
136 gfs2_glock_dq_uninit(&ji_gh);
137 sdp->sd_jindex_refresh_time = jiffies;
138 }
139
140 t = gfs2_tune_get(sdp, gt_logd_secs) * HZ;
141 schedule_timeout_interruptible(t);
142 }
143
144 return 0;
145}
146
147/**
148 * gfs2_quotad - Write cached quota changes into the quota file
149 * @sdp: Pointer to GFS2 superblock
150 *
151 */
152
153int gfs2_quotad(void *data)
154{
155 struct gfs2_sbd *sdp = data;
156 unsigned long t;
157 int error;
158
159 while (!kthread_should_stop()) {
160 /* Update the master statfs file */
161
162 t = sdp->sd_statfs_sync_time +
163 gfs2_tune_get(sdp, gt_statfs_quantum) * HZ;
164
165 if (time_after_eq(jiffies, t)) {
166 error = gfs2_statfs_sync(sdp);
167 if (error &&
168 error != -EROFS &&
169 !test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
170 fs_err(sdp, "quotad: (1) error=%d\n", error);
171 sdp->sd_statfs_sync_time = jiffies;
172 }
173
174 /* Update quota file */
175
176 t = sdp->sd_quota_sync_time +
177 gfs2_tune_get(sdp, gt_quota_quantum) * HZ;
178
179 if (time_after_eq(jiffies, t)) {
180 error = gfs2_quota_sync(sdp);
181 if (error &&
182 error != -EROFS &&
183 !test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
184 fs_err(sdp, "quotad: (2) error=%d\n", error);
185 sdp->sd_quota_sync_time = jiffies;
186 }
187
188 gfs2_quota_scan(sdp);
189
190 t = gfs2_tune_get(sdp, gt_quotad_secs) * HZ;
191 schedule_timeout_interruptible(t);
192 }
193
194 return 0;
195}
196
diff --git a/fs/gfs2/daemon.h b/fs/gfs2/daemon.h
new file mode 100644
index 000000000000..aa93eb6f668e
--- /dev/null
+++ b/fs/gfs2/daemon.h
@@ -0,0 +1,19 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __DAEMON_DOT_H__
11#define __DAEMON_DOT_H__
12
13int gfs2_scand(void *data);
14int gfs2_glockd(void *data);
15int gfs2_recoverd(void *data);
16int gfs2_logd(void *data);
17int gfs2_quotad(void *data);
18
19#endif /* __DAEMON_DOT_H__ */
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
new file mode 100644
index 000000000000..563b99e419b6
--- /dev/null
+++ b/fs/gfs2/dir.c
@@ -0,0 +1,1974 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10/*
11 * Implements Extendible Hashing as described in:
12 * "Extendible Hashing" by Fagin, et al in
13 * __ACM Trans. on Database Systems__, Sept 1979.
14 *
15 *
16 * Here's the layout of dirents which is essentially the same as that of ext2
17 * within a single block. The field de_name_len is the number of bytes
18 * actually required for the name (no null terminator). The field de_rec_len
19 * is the number of bytes allocated to the dirent. The offset of the next
20 * dirent in the block is (dirent + dirent->de_rec_len). When a dirent is
21 * deleted, the preceding dirent inherits its allocated space, ie
22 * prev->de_rec_len += deleted->de_rec_len. Since the next dirent is obtained
23 * by adding de_rec_len to the current dirent, this essentially causes the
24 * deleted dirent to get jumped over when iterating through all the dirents.
25 *
26 * When deleting the first dirent in a block, there is no previous dirent so
27 * the field de_ino is set to zero to designate it as deleted. When allocating
28 * a dirent, gfs2_dirent_alloc iterates through the dirents in a block. If the
29 * first dirent has (de_ino == 0) and de_rec_len is large enough, this first
30 * dirent is allocated. Otherwise it must go through all the 'used' dirents
31 * searching for one in which the amount of total space minus the amount of
32 * used space will provide enough space for the new dirent.
33 *
34 * There are two types of blocks in which dirents reside. In a stuffed dinode,
35 * the dirents begin at offset sizeof(struct gfs2_dinode) from the beginning of
36 * the block. In leaves, they begin at offset sizeof(struct gfs2_leaf) from the
37 * beginning of the leaf block. The dirents reside in leaves when
38 *
39 * dip->i_di.di_flags & GFS2_DIF_EXHASH is true
40 *
41 * Otherwise, the dirents are "linear", within a single stuffed dinode block.
42 *
43 * When the dirents are in leaves, the actual contents of the directory file are
44 * used as an array of 64-bit block pointers pointing to the leaf blocks. The
45 * dirents are NOT in the directory file itself. There can be more than one
46 * block pointer in the array that points to the same leaf. In fact, when a
47 * directory is first converted from linear to exhash, all of the pointers
48 * point to the same leaf.
49 *
50 * When a leaf is completely full, the size of the hash table can be
51 * doubled unless it is already at the maximum size which is hard coded into
52 * GFS2_DIR_MAX_DEPTH. After that, leaves are chained together in a linked list,
53 * but never before the maximum hash table size has been reached.
54 */
55
56#include <linux/sched.h>
57#include <linux/slab.h>
58#include <linux/spinlock.h>
59#include <linux/buffer_head.h>
60#include <linux/sort.h>
61#include <linux/gfs2_ondisk.h>
62#include <linux/crc32.h>
63#include <linux/vmalloc.h>
64
65#include "gfs2.h"
66#include "lm_interface.h"
67#include "incore.h"
68#include "dir.h"
69#include "glock.h"
70#include "inode.h"
71#include "meta_io.h"
72#include "quota.h"
73#include "rgrp.h"
74#include "trans.h"
75#include "bmap.h"
76#include "util.h"
77
/* Block kinds reported by dirent_first() */
#define IS_LEAF 1 /* Hashed (leaf) directory */
#define IS_DINODE 2 /* Linear (stuffed dinode block) directory */

/* Convert between a 32-bit hash value and a 64-bit offset: the offset is the
   hash shifted right by one bit, and the reverse shifts left by one bit and
   truncates to 32 bits. */
#define gfs2_disk_hash2offset(h) (((uint64_t)(h)) >> 1)
#define gfs2_dir_offset2hash(p) ((uint32_t)(((uint64_t)(p)) << 1))

/* Callback signature taking a directory, a hash-table index and length, a
   leaf block number and an opaque pointer.
   NOTE(review): its users are outside this section - confirm semantics. */
typedef int (*leaf_call_t) (struct gfs2_inode *dip,
			    uint32_t index, uint32_t len, uint64_t leaf_no,
			    void *data);
87
88
89int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, uint64_t block,
90 struct buffer_head **bhp)
91{
92 struct buffer_head *bh;
93
94 bh = gfs2_meta_new(ip->i_gl, block);
95 gfs2_trans_add_bh(ip->i_gl, bh, 1);
96 gfs2_metatype_set(bh, GFS2_METATYPE_JD, GFS2_FORMAT_JD);
97 gfs2_buffer_clear_tail(bh, sizeof(struct gfs2_meta_header));
98 *bhp = bh;
99 return 0;
100}
101
102static int gfs2_dir_get_existing_buffer(struct gfs2_inode *ip, uint64_t block,
103 struct buffer_head **bhp)
104{
105 struct buffer_head *bh;
106 int error;
107
108 error = gfs2_meta_read(ip->i_gl, block, DIO_START | DIO_WAIT, &bh);
109 if (error)
110 return error;
111 if (gfs2_metatype_check(GFS2_SB(&ip->i_inode), bh, GFS2_METATYPE_JD)) {
112 brelse(bh);
113 return -EIO;
114 }
115 *bhp = bh;
116 return 0;
117}
118
119static int gfs2_dir_write_stuffed(struct gfs2_inode *ip, const char *buf,
120 unsigned int offset, unsigned int size)
121
122{
123 struct buffer_head *dibh;
124 int error;
125
126 error = gfs2_meta_inode_buffer(ip, &dibh);
127 if (error)
128 return error;
129
130 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
131 memcpy(dibh->b_data + offset + sizeof(struct gfs2_dinode), buf, size);
132 if (ip->i_di.di_size < offset + size)
133 ip->i_di.di_size = offset + size;
134 ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
135 gfs2_dinode_out(&ip->i_di, dibh->b_data);
136
137 brelse(dibh);
138
139 return size;
140}
141
142
143
144/**
145 * gfs2_dir_write_data - Write directory information to the inode
146 * @ip: The GFS2 inode
147 * @buf: The buffer containing information to be written
148 * @offset: The file offset to start writing at
149 * @size: The amount of data to write
150 *
151 * Returns: The number of bytes correctly written or error code
152 */
153static int gfs2_dir_write_data(struct gfs2_inode *ip, const char *buf,
154 uint64_t offset, unsigned int size)
155{
156 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
157 struct buffer_head *dibh;
158 uint64_t lblock, dblock;
159 uint32_t extlen = 0;
160 unsigned int o;
161 int copied = 0;
162 int error = 0;
163
164 if (!size)
165 return 0;
166
167 if (gfs2_is_stuffed(ip) &&
168 offset + size <= sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode))
169 return gfs2_dir_write_stuffed(ip, buf, (unsigned int)offset,
170 size);
171
172 if (gfs2_assert_warn(sdp, gfs2_is_jdata(ip)))
173 return -EINVAL;
174
175 if (gfs2_is_stuffed(ip)) {
176 error = gfs2_unstuff_dinode(ip, NULL);
177 if (error)
178 return error;
179 }
180
181 lblock = offset;
182 o = do_div(lblock, sdp->sd_jbsize) + sizeof(struct gfs2_meta_header);
183
184 while (copied < size) {
185 unsigned int amount;
186 struct buffer_head *bh;
187 int new;
188
189 amount = size - copied;
190 if (amount > sdp->sd_sb.sb_bsize - o)
191 amount = sdp->sd_sb.sb_bsize - o;
192
193 if (!extlen) {
194 new = 1;
195 error = gfs2_extent_map(&ip->i_inode, lblock, &new,
196 &dblock, &extlen);
197 if (error)
198 goto fail;
199 error = -EIO;
200 if (gfs2_assert_withdraw(sdp, dblock))
201 goto fail;
202 }
203
204 if (amount == sdp->sd_jbsize || new)
205 error = gfs2_dir_get_new_buffer(ip, dblock, &bh);
206 else
207 error = gfs2_dir_get_existing_buffer(ip, dblock, &bh);
208
209 if (error)
210 goto fail;
211
212 gfs2_trans_add_bh(ip->i_gl, bh, 1);
213 memcpy(bh->b_data + o, buf, amount);
214 brelse(bh);
215 if (error)
216 goto fail;
217
218 copied += amount;
219 lblock++;
220 dblock++;
221 extlen--;
222
223 o = sizeof(struct gfs2_meta_header);
224 }
225
226out:
227 error = gfs2_meta_inode_buffer(ip, &dibh);
228 if (error)
229 return error;
230
231 if (ip->i_di.di_size < offset + copied)
232 ip->i_di.di_size = offset + copied;
233 ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
234
235 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
236 gfs2_dinode_out(&ip->i_di, dibh->b_data);
237 brelse(dibh);
238
239 return copied;
240fail:
241 if (copied)
242 goto out;
243 return error;
244}
245
246static int gfs2_dir_read_stuffed(struct gfs2_inode *ip, char *buf,
247 unsigned int offset, unsigned int size)
248{
249 struct buffer_head *dibh;
250 int error;
251
252 error = gfs2_meta_inode_buffer(ip, &dibh);
253 if (!error) {
254 offset += sizeof(struct gfs2_dinode);
255 memcpy(buf, dibh->b_data + offset, size);
256 brelse(dibh);
257 }
258
259 return (error) ? error : size;
260}
261
262
263/**
264 * gfs2_dir_read_data - Read a data from a directory inode
265 * @ip: The GFS2 Inode
266 * @buf: The buffer to place result into
267 * @offset: File offset to begin jdata_readng from
268 * @size: Amount of data to transfer
269 *
270 * Returns: The amount of data actually copied or the error
271 */
272static int gfs2_dir_read_data(struct gfs2_inode *ip, char *buf,
273 uint64_t offset, unsigned int size)
274{
275 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
276 uint64_t lblock, dblock;
277 uint32_t extlen = 0;
278 unsigned int o;
279 int copied = 0;
280 int error = 0;
281
282 if (offset >= ip->i_di.di_size)
283 return 0;
284
285 if ((offset + size) > ip->i_di.di_size)
286 size = ip->i_di.di_size - offset;
287
288 if (!size)
289 return 0;
290
291 if (gfs2_is_stuffed(ip))
292 return gfs2_dir_read_stuffed(ip, buf, (unsigned int)offset,
293 size);
294
295 if (gfs2_assert_warn(sdp, gfs2_is_jdata(ip)))
296 return -EINVAL;
297
298 lblock = offset;
299 o = do_div(lblock, sdp->sd_jbsize) + sizeof(struct gfs2_meta_header);
300
301 while (copied < size) {
302 unsigned int amount;
303 struct buffer_head *bh;
304 int new;
305
306 amount = size - copied;
307 if (amount > sdp->sd_sb.sb_bsize - o)
308 amount = sdp->sd_sb.sb_bsize - o;
309
310 if (!extlen) {
311 new = 0;
312 error = gfs2_extent_map(&ip->i_inode, lblock, &new,
313 &dblock, &extlen);
314 if (error)
315 goto fail;
316 }
317
318 if (extlen > 1)
319 gfs2_meta_ra(ip->i_gl, dblock, extlen);
320
321 if (dblock) {
322 if (new)
323 error = gfs2_dir_get_new_buffer(ip, dblock, &bh);
324 else
325 error = gfs2_dir_get_existing_buffer(ip, dblock, &bh);
326 if (error)
327 goto fail;
328 dblock++;
329 extlen--;
330 } else
331 bh = NULL;
332
333 memcpy(buf, bh->b_data + o, amount);
334 brelse(bh);
335 if (error)
336 goto fail;
337
338 copied += amount;
339 lblock++;
340
341 o = sizeof(struct gfs2_meta_header);
342 }
343
344 return copied;
345fail:
346 return (copied) ? copied : error;
347}
348
349typedef int (*gfs2_dscan_t)(const struct gfs2_dirent *dent,
350 const struct qstr *name,
351 void *opaque);
352
353static inline int __gfs2_dirent_find(const struct gfs2_dirent *dent,
354 const struct qstr *name, int ret)
355{
356 if (dent->de_inum.no_addr != 0 &&
357 be32_to_cpu(dent->de_hash) == name->hash &&
358 be16_to_cpu(dent->de_name_len) == name->len &&
359 memcmp((char *)(dent+1), name->name, name->len) == 0)
360 return ret;
361 return 0;
362}
363
/*
 * Scan callback: match @dent against @name, reporting a hit as 1 so that
 * gfs2_dirent_scan() returns the matching dirent itself.  @opaque is unused.
 */
static int gfs2_dirent_find(const struct gfs2_dirent *dent,
			    const struct qstr *name,
			    void *opaque)
{
	const int hit = 1;

	return __gfs2_dirent_find(dent, name, hit);
}
370
/*
 * Scan callback: match @dent against @name, reporting a hit as 2 so that
 * gfs2_dirent_scan() returns the dirent preceding the match (or the match
 * itself when there is no predecessor).  @opaque is unused.
 */
static int gfs2_dirent_prev(const struct gfs2_dirent *dent,
			    const struct qstr *name,
			    void *opaque)
{
	const int hit_wants_prev = 2;

	return __gfs2_dirent_find(dent, name, hit_wants_prev);
}
377
378/*
379 * name->name holds ptr to start of block.
380 * name->len holds size of block.
381 */
382static int gfs2_dirent_last(const struct gfs2_dirent *dent,
383 const struct qstr *name,
384 void *opaque)
385{
386 const char *start = name->name;
387 const char *end = (const char *)dent + be16_to_cpu(dent->de_rec_len);
388 if (name->len == (end - start))
389 return 1;
390 return 0;
391}
392
393static int gfs2_dirent_find_space(const struct gfs2_dirent *dent,
394 const struct qstr *name,
395 void *opaque)
396{
397 unsigned required = GFS2_DIRENT_SIZE(name->len);
398 unsigned actual = GFS2_DIRENT_SIZE(be16_to_cpu(dent->de_name_len));
399 unsigned totlen = be16_to_cpu(dent->de_rec_len);
400
401 if (!dent->de_inum.no_addr)
402 actual = GFS2_DIRENT_SIZE(0);
403 if ((totlen - actual) >= required)
404 return 1;
405 return 0;
406}
407
408struct dirent_gather {
409 const struct gfs2_dirent **pdent;
410 unsigned offset;
411};
412
413static int gfs2_dirent_gather(const struct gfs2_dirent *dent,
414 const struct qstr *name,
415 void *opaque)
416{
417 struct dirent_gather *g = opaque;
418 if (dent->de_inum.no_addr) {
419 g->pdent[g->offset++] = dent;
420 }
421 return 0;
422}
423
424/*
425 * Other possible things to check:
426 * - Inode located within filesystem size (and on valid block)
427 * - Valid directory entry type
428 * Not sure how heavy-weight we want to make this... could also check
429 * hash is correct for example, but that would take a lot of extra time.
430 * For now the most important thing is to check that the various sizes
431 * are correct.
432 */
433static int gfs2_check_dirent(struct gfs2_dirent *dent, unsigned int offset,
434 unsigned int size, unsigned int len, int first)
435{
436 const char *msg = "gfs2_dirent too small";
437 if (unlikely(size < sizeof(struct gfs2_dirent)))
438 goto error;
439 msg = "gfs2_dirent misaligned";
440 if (unlikely(offset & 0x7))
441 goto error;
442 msg = "gfs2_dirent points beyond end of block";
443 if (unlikely(offset + size > len))
444 goto error;
445 msg = "zero inode number";
446 if (unlikely(!first && !dent->de_inum.no_addr))
447 goto error;
448 msg = "name length is greater than space in dirent";
449 if (dent->de_inum.no_addr &&
450 unlikely(sizeof(struct gfs2_dirent)+be16_to_cpu(dent->de_name_len) >
451 size))
452 goto error;
453 return 0;
454error:
455 printk(KERN_WARNING "gfs2_check_dirent: %s (%s)\n", msg,
456 first ? "first in block" : "not first in block");
457 return -EIO;
458}
459
460static int gfs2_dirent_offset(const void *buf)
461{
462 const struct gfs2_meta_header *h = buf;
463 int offset;
464
465 BUG_ON(buf == NULL);
466
467 switch(be32_to_cpu(h->mh_type)) {
468 case GFS2_METATYPE_LF:
469 offset = sizeof(struct gfs2_leaf);
470 break;
471 case GFS2_METATYPE_DI:
472 offset = sizeof(struct gfs2_dinode);
473 break;
474 default:
475 goto wrong_type;
476 }
477 return offset;
478wrong_type:
479 printk(KERN_WARNING "gfs2_scan_dirent: wrong block type %u\n",
480 be32_to_cpu(h->mh_type));
481 return -1;
482}
483
484static struct gfs2_dirent *gfs2_dirent_scan(struct inode *inode,
485 void *buf,
486 unsigned int len, gfs2_dscan_t scan,
487 const struct qstr *name,
488 void *opaque)
489{
490 struct gfs2_dirent *dent, *prev;
491 unsigned offset;
492 unsigned size;
493 int ret = 0;
494
495 ret = gfs2_dirent_offset(buf);
496 if (ret < 0)
497 goto consist_inode;
498
499 offset = ret;
500 prev = NULL;
501 dent = (struct gfs2_dirent *)(buf + offset);
502 size = be16_to_cpu(dent->de_rec_len);
503 if (gfs2_check_dirent(dent, offset, size, len, 1))
504 goto consist_inode;
505 do {
506 ret = scan(dent, name, opaque);
507 if (ret)
508 break;
509 offset += size;
510 if (offset == len)
511 break;
512 prev = dent;
513 dent = (struct gfs2_dirent *)(buf + offset);
514 size = be16_to_cpu(dent->de_rec_len);
515 if (gfs2_check_dirent(dent, offset, size, len, 0))
516 goto consist_inode;
517 } while(1);
518
519 switch(ret) {
520 case 0:
521 return NULL;
522 case 1:
523 return dent;
524 case 2:
525 return prev ? prev : dent;
526 default:
527 BUG_ON(ret > 0);
528 return ERR_PTR(ret);
529 }
530
531consist_inode:
532 gfs2_consist_inode(GFS2_I(inode));
533 return ERR_PTR(-EIO);
534}
535
536
537/**
538 * dirent_first - Return the first dirent
539 * @dip: the directory
540 * @bh: The buffer
541 * @dent: Pointer to list of dirents
542 *
543 * return first dirent whether bh points to leaf or stuffed dinode
544 *
545 * Returns: IS_LEAF, IS_DINODE, or -errno
546 */
547
548static int dirent_first(struct gfs2_inode *dip, struct buffer_head *bh,
549 struct gfs2_dirent **dent)
550{
551 struct gfs2_meta_header *h = (struct gfs2_meta_header *)bh->b_data;
552
553 if (be32_to_cpu(h->mh_type) == GFS2_METATYPE_LF) {
554 if (gfs2_meta_check(GFS2_SB(&dip->i_inode), bh))
555 return -EIO;
556 *dent = (struct gfs2_dirent *)(bh->b_data +
557 sizeof(struct gfs2_leaf));
558 return IS_LEAF;
559 } else {
560 if (gfs2_metatype_check(GFS2_SB(&dip->i_inode), bh, GFS2_METATYPE_DI))
561 return -EIO;
562 *dent = (struct gfs2_dirent *)(bh->b_data +
563 sizeof(struct gfs2_dinode));
564 return IS_DINODE;
565 }
566}
567
568/**
569 * dirent_next - Next dirent
570 * @dip: the directory
571 * @bh: The buffer
572 * @dent: Pointer to list of dirents
573 *
574 * Returns: 0 on success, error code otherwise
575 */
576
577static int dirent_next(struct gfs2_inode *dip, struct buffer_head *bh,
578 struct gfs2_dirent **dent)
579{
580 struct gfs2_dirent *tmp, *cur;
581 char *bh_end;
582 uint16_t cur_rec_len;
583
584 cur = *dent;
585 bh_end = bh->b_data + bh->b_size;
586 cur_rec_len = be16_to_cpu(cur->de_rec_len);
587
588 if ((char *)cur + cur_rec_len >= bh_end) {
589 if ((char *)cur + cur_rec_len > bh_end) {
590 gfs2_consist_inode(dip);
591 return -EIO;
592 }
593 return -ENOENT;
594 }
595
596 tmp = (struct gfs2_dirent *)((char *)cur + cur_rec_len);
597
598 if ((char *)tmp + be16_to_cpu(tmp->de_rec_len) > bh_end) {
599 gfs2_consist_inode(dip);
600 return -EIO;
601 }
602
603 if (cur_rec_len == 0) {
604 gfs2_consist_inode(dip);
605 return -EIO;
606 }
607
608 /* Only the first dent could ever have de_inum.no_addr == 0 */
609 if (!tmp->de_inum.no_addr) {
610 gfs2_consist_inode(dip);
611 return -EIO;
612 }
613
614 *dent = tmp;
615
616 return 0;
617}
618
619/**
620 * dirent_del - Delete a dirent
621 * @dip: The GFS2 inode
622 * @bh: The buffer
623 * @prev: The previous dirent
624 * @cur: The current dirent
625 *
626 */
627
628static void dirent_del(struct gfs2_inode *dip, struct buffer_head *bh,
629 struct gfs2_dirent *prev, struct gfs2_dirent *cur)
630{
631 uint16_t cur_rec_len, prev_rec_len;
632
633 if (!cur->de_inum.no_addr) {
634 gfs2_consist_inode(dip);
635 return;
636 }
637
638 gfs2_trans_add_bh(dip->i_gl, bh, 1);
639
640 /* If there is no prev entry, this is the first entry in the block.
641 The de_rec_len is already as big as it needs to be. Just zero
642 out the inode number and return. */
643
644 if (!prev) {
645 cur->de_inum.no_addr = 0; /* No endianess worries */
646 return;
647 }
648
649 /* Combine this dentry with the previous one. */
650
651 prev_rec_len = be16_to_cpu(prev->de_rec_len);
652 cur_rec_len = be16_to_cpu(cur->de_rec_len);
653
654 if ((char *)prev + prev_rec_len != (char *)cur)
655 gfs2_consist_inode(dip);
656 if ((char *)cur + cur_rec_len > bh->b_data + bh->b_size)
657 gfs2_consist_inode(dip);
658
659 prev_rec_len += cur_rec_len;
660 prev->de_rec_len = cpu_to_be16(prev_rec_len);
661}
662
663/*
664 * Takes a dent from which to grab space as an argument. Returns the
665 * newly created dent.
666 */
667static struct gfs2_dirent *gfs2_init_dirent(struct inode *inode,
668 struct gfs2_dirent *dent,
669 const struct qstr *name,
670 struct buffer_head *bh)
671{
672 struct gfs2_inode *ip = GFS2_I(inode);
673 struct gfs2_dirent *ndent;
674 unsigned offset = 0, totlen;
675
676 if (dent->de_inum.no_addr)
677 offset = GFS2_DIRENT_SIZE(be16_to_cpu(dent->de_name_len));
678 totlen = be16_to_cpu(dent->de_rec_len);
679 BUG_ON(offset + name->len > totlen);
680 gfs2_trans_add_bh(ip->i_gl, bh, 1);
681 ndent = (struct gfs2_dirent *)((char *)dent + offset);
682 dent->de_rec_len = cpu_to_be16(offset);
683 gfs2_qstr2dirent(name, totlen - offset, ndent);
684 return ndent;
685}
686
687static struct gfs2_dirent *gfs2_dirent_alloc(struct inode *inode,
688 struct buffer_head *bh,
689 const struct qstr *name)
690{
691 struct gfs2_dirent *dent;
692 dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size,
693 gfs2_dirent_find_space, name, NULL);
694 if (!dent || IS_ERR(dent))
695 return dent;
696 return gfs2_init_dirent(inode, dent, name, bh);
697}
698
699static int get_leaf(struct gfs2_inode *dip, uint64_t leaf_no,
700 struct buffer_head **bhp)
701{
702 int error;
703
704 error = gfs2_meta_read(dip->i_gl, leaf_no, DIO_START | DIO_WAIT, bhp);
705 if (!error && gfs2_metatype_check(GFS2_SB(&dip->i_inode), *bhp, GFS2_METATYPE_LF)) {
706 /* printk(KERN_INFO "block num=%llu\n", leaf_no); */
707 error = -EIO;
708 }
709
710 return error;
711}
712
713/**
714 * get_leaf_nr - Get a leaf number associated with the index
715 * @dip: The GFS2 inode
716 * @index:
717 * @leaf_out:
718 *
719 * Returns: 0 on success, error code otherwise
720 */
721
722static int get_leaf_nr(struct gfs2_inode *dip, uint32_t index,
723 uint64_t *leaf_out)
724{
725 uint64_t leaf_no;
726 int error;
727
728 error = gfs2_dir_read_data(dip, (char *)&leaf_no,
729 index * sizeof(uint64_t),
730 sizeof(uint64_t));
731 if (error != sizeof(uint64_t))
732 return (error < 0) ? error : -EIO;
733
734 *leaf_out = be64_to_cpu(leaf_no);
735
736 return 0;
737}
738
739static int get_first_leaf(struct gfs2_inode *dip, uint32_t index,
740 struct buffer_head **bh_out)
741{
742 uint64_t leaf_no;
743 int error;
744
745 error = get_leaf_nr(dip, index, &leaf_no);
746 if (!error)
747 error = get_leaf(dip, leaf_no, bh_out);
748
749 return error;
750}
751
752static struct gfs2_dirent *gfs2_dirent_search(struct inode *inode,
753 const struct qstr *name,
754 gfs2_dscan_t scan,
755 struct buffer_head **pbh)
756{
757 struct buffer_head *bh;
758 struct gfs2_dirent *dent;
759 struct gfs2_inode *ip = GFS2_I(inode);
760 int error;
761
762 if (ip->i_di.di_flags & GFS2_DIF_EXHASH) {
763 struct gfs2_leaf *leaf;
764 unsigned hsize = 1 << ip->i_di.di_depth;
765 unsigned index;
766 u64 ln;
767 if (hsize * sizeof(u64) != ip->i_di.di_size) {
768 gfs2_consist_inode(ip);
769 return ERR_PTR(-EIO);
770 }
771
772 index = name->hash >> (32 - ip->i_di.di_depth);
773 error = get_first_leaf(ip, index, &bh);
774 if (error)
775 return ERR_PTR(error);
776 do {
777 dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size,
778 scan, name, NULL);
779 if (dent)
780 goto got_dent;
781 leaf = (struct gfs2_leaf *)bh->b_data;
782 ln = be64_to_cpu(leaf->lf_next);
783 brelse(bh);
784 if (!ln)
785 break;
786
787 error = get_leaf(ip, ln, &bh);
788 } while(!error);
789
790 return error ? ERR_PTR(error) : NULL;
791 }
792
793
794 error = gfs2_meta_inode_buffer(ip, &bh);
795 if (error)
796 return ERR_PTR(error);
797 dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size, scan, name, NULL);
798got_dent:
799 if (unlikely(dent == NULL || IS_ERR(dent))) {
800 brelse(bh);
801 bh = NULL;
802 }
803 *pbh = bh;
804 return dent;
805}
806
807static struct gfs2_leaf *new_leaf(struct inode *inode, struct buffer_head **pbh, u16 depth)
808{
809 struct gfs2_inode *ip = GFS2_I(inode);
810 u64 bn = gfs2_alloc_meta(ip);
811 struct buffer_head *bh = gfs2_meta_new(ip->i_gl, bn);
812 struct gfs2_leaf *leaf;
813 struct gfs2_dirent *dent;
814 struct qstr name = { .name = "", .len = 0, .hash = 0 };
815 if (!bh)
816 return NULL;
817
818 gfs2_trans_add_bh(ip->i_gl, bh, 1);
819 gfs2_metatype_set(bh, GFS2_METATYPE_LF, GFS2_FORMAT_LF);
820 leaf = (struct gfs2_leaf *)bh->b_data;
821 leaf->lf_depth = cpu_to_be16(depth);
822 leaf->lf_entries = cpu_to_be16(0);
823 leaf->lf_dirent_format = cpu_to_be16(GFS2_FORMAT_DE);
824 leaf->lf_next = cpu_to_be64(0);
825 memset(leaf->lf_reserved, 0, sizeof(leaf->lf_reserved));
826 dent = (struct gfs2_dirent *)(leaf+1);
827 gfs2_qstr2dirent(&name, bh->b_size - sizeof(struct gfs2_leaf), dent);
828 *pbh = bh;
829 return leaf;
830}
831
832/**
833 * dir_make_exhash - Convert a stuffed directory into an ExHash directory
834 * @dip: The GFS2 inode
835 *
836 * Returns: 0 on success, error code otherwise
837 */
838
839static int dir_make_exhash(struct inode *inode)
840{
841 struct gfs2_inode *dip = GFS2_I(inode);
842 struct gfs2_sbd *sdp = GFS2_SB(inode);
843 struct gfs2_dirent *dent;
844 struct qstr args;
845 struct buffer_head *bh, *dibh;
846 struct gfs2_leaf *leaf;
847 int y;
848 uint32_t x;
849 uint64_t *lp, bn;
850 int error;
851
852 error = gfs2_meta_inode_buffer(dip, &dibh);
853 if (error)
854 return error;
855
856 /* Turn over a new leaf */
857
858 leaf = new_leaf(inode, &bh, 0);
859 if (!leaf)
860 return -ENOSPC;
861 bn = bh->b_blocknr;
862
863 gfs2_assert(sdp, dip->i_di.di_entries < (1 << 16));
864 leaf->lf_entries = cpu_to_be16(dip->i_di.di_entries);
865
866 /* Copy dirents */
867
868 gfs2_buffer_copy_tail(bh, sizeof(struct gfs2_leaf), dibh,
869 sizeof(struct gfs2_dinode));
870
871 /* Find last entry */
872
873 x = 0;
874 args.len = bh->b_size - sizeof(struct gfs2_dinode) +
875 sizeof(struct gfs2_leaf);
876 args.name = bh->b_data;
877 dent = gfs2_dirent_scan(&dip->i_inode, bh->b_data, bh->b_size,
878 gfs2_dirent_last, &args, NULL);
879 if (!dent) {
880 brelse(bh);
881 brelse(dibh);
882 return -EIO;
883 }
884 if (IS_ERR(dent)) {
885 brelse(bh);
886 brelse(dibh);
887 return PTR_ERR(dent);
888 }
889
890 /* Adjust the last dirent's record length
891 (Remember that dent still points to the last entry.) */
892
893 dent->de_rec_len = cpu_to_be16(be16_to_cpu(dent->de_rec_len) +
894 sizeof(struct gfs2_dinode) -
895 sizeof(struct gfs2_leaf));
896
897 brelse(bh);
898
899 /* We're done with the new leaf block, now setup the new
900 hash table. */
901
902 gfs2_trans_add_bh(dip->i_gl, dibh, 1);
903 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
904
905 lp = (uint64_t *)(dibh->b_data + sizeof(struct gfs2_dinode));
906
907 for (x = sdp->sd_hash_ptrs; x--; lp++)
908 *lp = cpu_to_be64(bn);
909
910 dip->i_di.di_size = sdp->sd_sb.sb_bsize / 2;
911 dip->i_di.di_blocks++;
912 dip->i_di.di_flags |= GFS2_DIF_EXHASH;
913 dip->i_di.di_payload_format = 0;
914
915 for (x = sdp->sd_hash_ptrs, y = -1; x; x >>= 1, y++) ;
916 dip->i_di.di_depth = y;
917
918 gfs2_dinode_out(&dip->i_di, dibh->b_data);
919
920 brelse(dibh);
921
922 return 0;
923}
924
925/**
926 * dir_split_leaf - Split a leaf block into two
927 * @dip: The GFS2 inode
928 * @index:
929 * @leaf_no:
930 *
931 * Returns: 0 on success, error code on failure
932 */
933
934static int dir_split_leaf(struct inode *inode, const struct qstr *name)
935{
936 struct gfs2_inode *dip = GFS2_I(inode);
937 struct buffer_head *nbh, *obh, *dibh;
938 struct gfs2_leaf *nleaf, *oleaf;
939 struct gfs2_dirent *dent = NULL, *prev = NULL, *next = NULL, *new;
940 uint32_t start, len, half_len, divider;
941 uint64_t bn, *lp, leaf_no;
942 uint32_t index;
943 int x, moved = 0;
944 int error;
945
946 index = name->hash >> (32 - dip->i_di.di_depth);
947 error = get_leaf_nr(dip, index, &leaf_no);
948 if (error)
949 return error;
950
951 /* Get the old leaf block */
952 error = get_leaf(dip, leaf_no, &obh);
953 if (error)
954 return error;
955
956 oleaf = (struct gfs2_leaf *)obh->b_data;
957 if (dip->i_di.di_depth == be16_to_cpu(oleaf->lf_depth)) {
958 brelse(obh);
959 return 1; /* can't split */
960 }
961
962 gfs2_trans_add_bh(dip->i_gl, obh, 1);
963
964 nleaf = new_leaf(inode, &nbh, be16_to_cpu(oleaf->lf_depth) + 1);
965 if (!nleaf) {
966 brelse(obh);
967 return -ENOSPC;
968 }
969 bn = nbh->b_blocknr;
970
971 /* Compute the start and len of leaf pointers in the hash table. */
972 len = 1 << (dip->i_di.di_depth - be16_to_cpu(oleaf->lf_depth));
973 half_len = len >> 1;
974 if (!half_len) {
975 printk(KERN_WARNING "di_depth %u lf_depth %u index %u\n", dip->i_di.di_depth, be16_to_cpu(oleaf->lf_depth), index);
976 gfs2_consist_inode(dip);
977 error = -EIO;
978 goto fail_brelse;
979 }
980
981 start = (index & ~(len - 1));
982
983 /* Change the pointers.
984 Don't bother distinguishing stuffed from non-stuffed.
985 This code is complicated enough already. */
986 lp = kmalloc(half_len * sizeof(uint64_t), GFP_NOFS | __GFP_NOFAIL);
987 /* Change the pointers */
988 for (x = 0; x < half_len; x++)
989 lp[x] = cpu_to_be64(bn);
990
991 error = gfs2_dir_write_data(dip, (char *)lp, start * sizeof(uint64_t),
992 half_len * sizeof(uint64_t));
993 if (error != half_len * sizeof(uint64_t)) {
994 if (error >= 0)
995 error = -EIO;
996 goto fail_lpfree;
997 }
998
999 kfree(lp);
1000
1001 /* Compute the divider */
1002 divider = (start + half_len) << (32 - dip->i_di.di_depth);
1003
1004 /* Copy the entries */
1005 dirent_first(dip, obh, &dent);
1006
1007 do {
1008 next = dent;
1009 if (dirent_next(dip, obh, &next))
1010 next = NULL;
1011
1012 if (dent->de_inum.no_addr &&
1013 be32_to_cpu(dent->de_hash) < divider) {
1014 struct qstr str;
1015 str.name = (char*)(dent+1);
1016 str.len = be16_to_cpu(dent->de_name_len);
1017 str.hash = be32_to_cpu(dent->de_hash);
1018 new = gfs2_dirent_alloc(inode, nbh, &str);
1019 if (IS_ERR(new)) {
1020 error = PTR_ERR(new);
1021 break;
1022 }
1023
1024 new->de_inum = dent->de_inum; /* No endian worries */
1025 new->de_type = dent->de_type; /* No endian worries */
1026 nleaf->lf_entries = cpu_to_be16(be16_to_cpu(nleaf->lf_entries)+1);
1027
1028 dirent_del(dip, obh, prev, dent);
1029
1030 if (!oleaf->lf_entries)
1031 gfs2_consist_inode(dip);
1032 oleaf->lf_entries = cpu_to_be16(be16_to_cpu(oleaf->lf_entries)-1);
1033
1034 if (!prev)
1035 prev = dent;
1036
1037 moved = 1;
1038 } else {
1039 prev = dent;
1040 }
1041 dent = next;
1042 } while (dent);
1043
1044 oleaf->lf_depth = nleaf->lf_depth;
1045
1046 error = gfs2_meta_inode_buffer(dip, &dibh);
1047 if (!gfs2_assert_withdraw(GFS2_SB(&dip->i_inode), !error)) {
1048 dip->i_di.di_blocks++;
1049 gfs2_dinode_out(&dip->i_di, dibh->b_data);
1050 brelse(dibh);
1051 }
1052
1053 brelse(obh);
1054 brelse(nbh);
1055
1056 return error;
1057
1058fail_lpfree:
1059 kfree(lp);
1060
1061fail_brelse:
1062 brelse(obh);
1063 brelse(nbh);
1064 return error;
1065}
1066
1067/**
1068 * dir_double_exhash - Double size of ExHash table
1069 * @dip: The GFS2 dinode
1070 *
1071 * Returns: 0 on success, error code on failure
1072 */
1073
1074static int dir_double_exhash(struct gfs2_inode *dip)
1075{
1076 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
1077 struct buffer_head *dibh;
1078 uint32_t hsize;
1079 uint64_t *buf;
1080 uint64_t *from, *to;
1081 uint64_t block;
1082 int x;
1083 int error = 0;
1084
1085 hsize = 1 << dip->i_di.di_depth;
1086 if (hsize * sizeof(uint64_t) != dip->i_di.di_size) {
1087 gfs2_consist_inode(dip);
1088 return -EIO;
1089 }
1090
1091 /* Allocate both the "from" and "to" buffers in one big chunk */
1092
1093 buf = kcalloc(3, sdp->sd_hash_bsize, GFP_KERNEL | __GFP_NOFAIL);
1094
1095 for (block = dip->i_di.di_size >> sdp->sd_hash_bsize_shift; block--;) {
1096 error = gfs2_dir_read_data(dip, (char *)buf,
1097 block * sdp->sd_hash_bsize,
1098 sdp->sd_hash_bsize);
1099 if (error != sdp->sd_hash_bsize) {
1100 if (error >= 0)
1101 error = -EIO;
1102 goto fail;
1103 }
1104
1105 from = buf;
1106 to = (uint64_t *)((char *)buf + sdp->sd_hash_bsize);
1107
1108 for (x = sdp->sd_hash_ptrs; x--; from++) {
1109 *to++ = *from; /* No endianess worries */
1110 *to++ = *from;
1111 }
1112
1113 error = gfs2_dir_write_data(dip,
1114 (char *)buf + sdp->sd_hash_bsize,
1115 block * sdp->sd_sb.sb_bsize,
1116 sdp->sd_sb.sb_bsize);
1117 if (error != sdp->sd_sb.sb_bsize) {
1118 if (error >= 0)
1119 error = -EIO;
1120 goto fail;
1121 }
1122 }
1123
1124 kfree(buf);
1125
1126 error = gfs2_meta_inode_buffer(dip, &dibh);
1127 if (!gfs2_assert_withdraw(sdp, !error)) {
1128 dip->i_di.di_depth++;
1129 gfs2_dinode_out(&dip->i_di, dibh->b_data);
1130 brelse(dibh);
1131 }
1132
1133 return error;
1134
1135 fail:
1136 kfree(buf);
1137
1138 return error;
1139}
1140
1141/**
1142 * compare_dents - compare directory entries by hash value
1143 * @a: first dent
1144 * @b: second dent
1145 *
1146 * When comparing the hash entries of @a to @b:
1147 * gt: returns 1
1148 * lt: returns -1
1149 * eq: returns 0
1150 */
1151
1152static int compare_dents(const void *a, const void *b)
1153{
1154 struct gfs2_dirent *dent_a, *dent_b;
1155 uint32_t hash_a, hash_b;
1156 int ret = 0;
1157
1158 dent_a = *(struct gfs2_dirent **)a;
1159 hash_a = be32_to_cpu(dent_a->de_hash);
1160
1161 dent_b = *(struct gfs2_dirent **)b;
1162 hash_b = be32_to_cpu(dent_b->de_hash);
1163
1164 if (hash_a > hash_b)
1165 ret = 1;
1166 else if (hash_a < hash_b)
1167 ret = -1;
1168 else {
1169 unsigned int len_a = be16_to_cpu(dent_a->de_name_len);
1170 unsigned int len_b = be16_to_cpu(dent_b->de_name_len);
1171
1172 if (len_a > len_b)
1173 ret = 1;
1174 else if (len_a < len_b)
1175 ret = -1;
1176 else
1177 ret = memcmp((char *)(dent_a + 1),
1178 (char *)(dent_b + 1),
1179 len_a);
1180 }
1181
1182 return ret;
1183}
1184
1185/**
1186 * do_filldir_main - read out directory entries
1187 * @dip: The GFS2 inode
1188 * @offset: The offset in the file to read from
1189 * @opaque: opaque data to pass to filldir
1190 * @filldir: The function to pass entries to
1191 * @darr: an array of struct gfs2_dirent pointers to read
1192 * @entries: the number of entries in darr
1193 * @copied: pointer to int that's non-zero if a entry has been copied out
1194 *
1195 * Jump through some hoops to make sure that if there are hash collsions,
1196 * they are read out at the beginning of a buffer. We want to minimize
1197 * the possibility that they will fall into different readdir buffers or
1198 * that someone will want to seek to that location.
1199 *
1200 * Returns: errno, >0 on exception from filldir
1201 */
1202
1203static int do_filldir_main(struct gfs2_inode *dip, uint64_t *offset,
1204 void *opaque, gfs2_filldir_t filldir,
1205 const struct gfs2_dirent **darr, uint32_t entries,
1206 int *copied)
1207{
1208 const struct gfs2_dirent *dent, *dent_next;
1209 struct gfs2_inum inum;
1210 uint64_t off, off_next;
1211 unsigned int x, y;
1212 int run = 0;
1213 int error = 0;
1214
1215 sort(darr, entries, sizeof(struct gfs2_dirent *), compare_dents, NULL);
1216
1217 dent_next = darr[0];
1218 off_next = be32_to_cpu(dent_next->de_hash);
1219 off_next = gfs2_disk_hash2offset(off_next);
1220
1221 for (x = 0, y = 1; x < entries; x++, y++) {
1222 dent = dent_next;
1223 off = off_next;
1224
1225 if (y < entries) {
1226 dent_next = darr[y];
1227 off_next = be32_to_cpu(dent_next->de_hash);
1228 off_next = gfs2_disk_hash2offset(off_next);
1229
1230 if (off < *offset)
1231 continue;
1232 *offset = off;
1233
1234 if (off_next == off) {
1235 if (*copied && !run)
1236 return 1;
1237 run = 1;
1238 } else
1239 run = 0;
1240 } else {
1241 if (off < *offset)
1242 continue;
1243 *offset = off;
1244 }
1245
1246 gfs2_inum_in(&inum, (char *)&dent->de_inum);
1247
1248 error = filldir(opaque, (char *)(dent + 1),
1249 be16_to_cpu(dent->de_name_len),
1250 off, &inum,
1251 be16_to_cpu(dent->de_type));
1252 if (error)
1253 return 1;
1254
1255 *copied = 1;
1256 }
1257
1258 /* Increment the *offset by one, so the next time we come into the
1259 do_filldir fxn, we get the next entry instead of the last one in the
1260 current leaf */
1261
1262 (*offset)++;
1263
1264 return 0;
1265}
1266
1267static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque,
1268 gfs2_filldir_t filldir, int *copied,
1269 unsigned *depth, u64 leaf_no)
1270{
1271 struct gfs2_inode *ip = GFS2_I(inode);
1272 struct buffer_head *bh;
1273 struct gfs2_leaf *lf;
1274 unsigned entries = 0;
1275 unsigned leaves = 0;
1276 const struct gfs2_dirent **darr, *dent;
1277 struct dirent_gather g;
1278 struct buffer_head **larr;
1279 int leaf = 0;
1280 int error, i;
1281 u64 lfn = leaf_no;
1282
1283 do {
1284 error = get_leaf(ip, lfn, &bh);
1285 if (error)
1286 goto out;
1287 lf = (struct gfs2_leaf *)bh->b_data;
1288 if (leaves == 0)
1289 *depth = be16_to_cpu(lf->lf_depth);
1290 entries += be16_to_cpu(lf->lf_entries);
1291 leaves++;
1292 lfn = be64_to_cpu(lf->lf_next);
1293 brelse(bh);
1294 } while(lfn);
1295
1296 if (!entries)
1297 return 0;
1298
1299 error = -ENOMEM;
1300 larr = vmalloc((leaves + entries) * sizeof(void*));
1301 if (!larr)
1302 goto out;
1303 darr = (const struct gfs2_dirent **)(larr + leaves);
1304 g.pdent = darr;
1305 g.offset = 0;
1306 lfn = leaf_no;
1307
1308 do {
1309 error = get_leaf(ip, lfn, &bh);
1310 if (error)
1311 goto out_kfree;
1312 lf = (struct gfs2_leaf *)bh->b_data;
1313 lfn = be64_to_cpu(lf->lf_next);
1314 if (lf->lf_entries) {
1315 dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size,
1316 gfs2_dirent_gather, NULL, &g);
1317 error = PTR_ERR(dent);
1318 if (IS_ERR(dent)) {
1319 goto out_kfree;
1320 }
1321 error = 0;
1322 larr[leaf++] = bh;
1323 } else {
1324 brelse(bh);
1325 }
1326 } while(lfn);
1327
1328 error = do_filldir_main(ip, offset, opaque, filldir, darr,
1329 entries, copied);
1330out_kfree:
1331 for(i = 0; i < leaf; i++)
1332 brelse(larr[i]);
1333 vfree(larr);
1334out:
1335 return error;
1336}
1337
1338/**
1339 * dir_e_read - Reads the entries from a directory into a filldir buffer
1340 * @dip: dinode pointer
1341 * @offset: the hash of the last entry read shifted to the right once
1342 * @opaque: buffer for the filldir function to fill
1343 * @filldir: points to the filldir function to use
1344 *
1345 * Returns: errno
1346 */
1347
1348static int dir_e_read(struct inode *inode, uint64_t *offset, void *opaque,
1349 gfs2_filldir_t filldir)
1350{
1351 struct gfs2_inode *dip = GFS2_I(inode);
1352 struct gfs2_sbd *sdp = GFS2_SB(inode);
1353 uint32_t hsize, len = 0;
1354 uint32_t ht_offset, lp_offset, ht_offset_cur = -1;
1355 uint32_t hash, index;
1356 uint64_t *lp;
1357 int copied = 0;
1358 int error = 0;
1359 unsigned depth = 0;
1360
1361 hsize = 1 << dip->i_di.di_depth;
1362 if (hsize * sizeof(uint64_t) != dip->i_di.di_size) {
1363 gfs2_consist_inode(dip);
1364 return -EIO;
1365 }
1366
1367 hash = gfs2_dir_offset2hash(*offset);
1368 index = hash >> (32 - dip->i_di.di_depth);
1369
1370 lp = kmalloc(sdp->sd_hash_bsize, GFP_KERNEL);
1371 if (!lp)
1372 return -ENOMEM;
1373
1374 while (index < hsize) {
1375 lp_offset = index & (sdp->sd_hash_ptrs - 1);
1376 ht_offset = index - lp_offset;
1377
1378 if (ht_offset_cur != ht_offset) {
1379 error = gfs2_dir_read_data(dip, (char *)lp,
1380 ht_offset * sizeof(uint64_t),
1381 sdp->sd_hash_bsize);
1382 if (error != sdp->sd_hash_bsize) {
1383 if (error >= 0)
1384 error = -EIO;
1385 goto out;
1386 }
1387 ht_offset_cur = ht_offset;
1388 }
1389
1390 error = gfs2_dir_read_leaf(inode, offset, opaque, filldir,
1391 &copied, &depth,
1392 be64_to_cpu(lp[lp_offset]));
1393 if (error)
1394 break;
1395
1396 len = 1 << (dip->i_di.di_depth - depth);
1397 index = (index & ~(len - 1)) + len;
1398 }
1399
1400out:
1401 kfree(lp);
1402 if (error > 0)
1403 error = 0;
1404 return error;
1405}
1406
1407int gfs2_dir_read(struct inode *inode, uint64_t *offset, void *opaque,
1408 gfs2_filldir_t filldir)
1409{
1410 struct gfs2_inode *dip = GFS2_I(inode);
1411 struct dirent_gather g;
1412 const struct gfs2_dirent **darr, *dent;
1413 struct buffer_head *dibh;
1414 int copied = 0;
1415 int error;
1416
1417 if (!dip->i_di.di_entries)
1418 return 0;
1419
1420 if (dip->i_di.di_flags & GFS2_DIF_EXHASH)
1421 return dir_e_read(inode, offset, opaque, filldir);
1422
1423 if (!gfs2_is_stuffed(dip)) {
1424 gfs2_consist_inode(dip);
1425 return -EIO;
1426 }
1427
1428 error = gfs2_meta_inode_buffer(dip, &dibh);
1429 if (error)
1430 return error;
1431
1432 error = -ENOMEM;
1433 darr = kmalloc(dip->i_di.di_entries * sizeof(struct gfs2_dirent *),
1434 GFP_KERNEL);
1435 if (darr) {
1436 g.pdent = darr;
1437 g.offset = 0;
1438 dent = gfs2_dirent_scan(inode, dibh->b_data, dibh->b_size,
1439 gfs2_dirent_gather, NULL, &g);
1440 if (IS_ERR(dent)) {
1441 error = PTR_ERR(dent);
1442 goto out;
1443 }
1444 error = do_filldir_main(dip, offset, opaque, filldir, darr,
1445 dip->i_di.di_entries, &copied);
1446out:
1447 kfree(darr);
1448 }
1449
1450 if (error > 0)
1451 error = 0;
1452
1453 brelse(dibh);
1454
1455 return error;
1456}
1457
1458/**
1459 * gfs2_dir_search - Search a directory
1460 * @dip: The GFS2 inode
1461 * @filename:
1462 * @inode:
1463 *
1464 * This routine searches a directory for a file or another directory.
1465 * Assumes a glock is held on dip.
1466 *
1467 * Returns: errno
1468 */
1469
1470int gfs2_dir_search(struct inode *dir, const struct qstr *name,
1471 struct gfs2_inum *inum, unsigned int *type)
1472{
1473 struct buffer_head *bh;
1474 struct gfs2_dirent *dent;
1475
1476 dent = gfs2_dirent_search(dir, name, gfs2_dirent_find, &bh);
1477 if (dent) {
1478 if (IS_ERR(dent))
1479 return PTR_ERR(dent);
1480 if (inum)
1481 gfs2_inum_in(inum, (char *)&dent->de_inum);
1482 if (type)
1483 *type = be16_to_cpu(dent->de_type);
1484 brelse(bh);
1485 return 0;
1486 }
1487 return -ENOENT;
1488}
1489
1490static int dir_new_leaf(struct inode *inode, const struct qstr *name)
1491{
1492 struct buffer_head *bh, *obh;
1493 struct gfs2_inode *ip = GFS2_I(inode);
1494 struct gfs2_leaf *leaf, *oleaf;
1495 int error;
1496 u32 index;
1497 u64 bn;
1498
1499 index = name->hash >> (32 - ip->i_di.di_depth);
1500 error = get_first_leaf(ip, index, &obh);
1501 if (error)
1502 return error;
1503 do {
1504 oleaf = (struct gfs2_leaf *)obh->b_data;
1505 bn = be64_to_cpu(oleaf->lf_next);
1506 if (!bn)
1507 break;
1508 brelse(obh);
1509 error = get_leaf(ip, bn, &obh);
1510 if (error)
1511 return error;
1512 } while(1);
1513
1514 gfs2_trans_add_bh(ip->i_gl, obh, 1);
1515
1516 leaf = new_leaf(inode, &bh, be16_to_cpu(oleaf->lf_depth));
1517 if (!leaf) {
1518 brelse(obh);
1519 return -ENOSPC;
1520 }
1521 oleaf->lf_next = cpu_to_be64(bh->b_blocknr);
1522 brelse(bh);
1523 brelse(obh);
1524
1525 error = gfs2_meta_inode_buffer(ip, &bh);
1526 if (error)
1527 return error;
1528 gfs2_trans_add_bh(ip->i_gl, bh, 1);
1529 ip->i_di.di_blocks++;
1530 gfs2_dinode_out(&ip->i_di, bh->b_data);
1531 brelse(bh);
1532 return 0;
1533}
1534
1535/**
1536 * gfs2_dir_add - Add new filename into directory
1537 * @dip: The GFS2 inode
1538 * @filename: The new name
1539 * @inode: The inode number of the entry
1540 * @type: The type of the entry
1541 *
1542 * Returns: 0 on success, error code on failure
1543 */
1544
1545int gfs2_dir_add(struct inode *inode, const struct qstr *name,
1546 const struct gfs2_inum *inum, unsigned type)
1547{
1548 struct gfs2_inode *ip = GFS2_I(inode);
1549 struct buffer_head *bh;
1550 struct gfs2_dirent *dent;
1551 struct gfs2_leaf *leaf;
1552 int error;
1553
1554 while(1) {
1555 dent = gfs2_dirent_search(inode, name, gfs2_dirent_find_space,
1556 &bh);
1557 if (dent) {
1558 if (IS_ERR(dent))
1559 return PTR_ERR(dent);
1560 dent = gfs2_init_dirent(inode, dent, name, bh);
1561 gfs2_inum_out(inum, (char *)&dent->de_inum);
1562 dent->de_type = cpu_to_be16(type);
1563 if (ip->i_di.di_flags & GFS2_DIF_EXHASH) {
1564 leaf = (struct gfs2_leaf *)bh->b_data;
1565 leaf->lf_entries = cpu_to_be16(be16_to_cpu(leaf->lf_entries) + 1);
1566 }
1567 brelse(bh);
1568 error = gfs2_meta_inode_buffer(ip, &bh);
1569 if (error)
1570 break;
1571 gfs2_trans_add_bh(ip->i_gl, bh, 1);
1572 ip->i_di.di_entries++;
1573 ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
1574 gfs2_dinode_out(&ip->i_di, bh->b_data);
1575 brelse(bh);
1576 error = 0;
1577 break;
1578 }
1579 if (!(ip->i_di.di_flags & GFS2_DIF_EXHASH)) {
1580 error = dir_make_exhash(inode);
1581 if (error)
1582 break;
1583 continue;
1584 }
1585 error = dir_split_leaf(inode, name);
1586 if (error == 0)
1587 continue;
1588 if (error < 0)
1589 break;
1590 if (ip->i_di.di_depth < GFS2_DIR_MAX_DEPTH) {
1591 error = dir_double_exhash(ip);
1592 if (error)
1593 break;
1594 error = dir_split_leaf(inode, name);
1595 if (error < 0)
1596 break;
1597 if (error == 0)
1598 continue;
1599 }
1600 error = dir_new_leaf(inode, name);
1601 if (!error)
1602 continue;
1603 error = -ENOSPC;
1604 break;
1605 }
1606 return error;
1607}
1608
1609
1610/**
1611 * gfs2_dir_del - Delete a directory entry
1612 * @dip: The GFS2 inode
1613 * @filename: The filename
1614 *
1615 * Returns: 0 on success, error code on failure
1616 */
1617
1618int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *name)
1619{
1620 struct gfs2_dirent *dent, *prev = NULL;
1621 struct buffer_head *bh;
1622 int error;
1623
1624 /* Returns _either_ the entry (if its first in block) or the
1625 previous entry otherwise */
1626 dent = gfs2_dirent_search(&dip->i_inode, name, gfs2_dirent_prev, &bh);
1627 if (!dent) {
1628 gfs2_consist_inode(dip);
1629 return -EIO;
1630 }
1631 if (IS_ERR(dent)) {
1632 gfs2_consist_inode(dip);
1633 return PTR_ERR(dent);
1634 }
1635 /* If not first in block, adjust pointers accordingly */
1636 if (gfs2_dirent_find(dent, name, NULL) == 0) {
1637 prev = dent;
1638 dent = (struct gfs2_dirent *)((char *)dent + be16_to_cpu(prev->de_rec_len));
1639 }
1640
1641 dirent_del(dip, bh, prev, dent);
1642 if (dip->i_di.di_flags & GFS2_DIF_EXHASH) {
1643 struct gfs2_leaf *leaf = (struct gfs2_leaf *)bh->b_data;
1644 u16 entries = be16_to_cpu(leaf->lf_entries);
1645 if (!entries)
1646 gfs2_consist_inode(dip);
1647 leaf->lf_entries = cpu_to_be16(--entries);
1648 }
1649 brelse(bh);
1650
1651 error = gfs2_meta_inode_buffer(dip, &bh);
1652 if (error)
1653 return error;
1654
1655 if (!dip->i_di.di_entries)
1656 gfs2_consist_inode(dip);
1657 gfs2_trans_add_bh(dip->i_gl, bh, 1);
1658 dip->i_di.di_entries--;
1659 dip->i_di.di_mtime = dip->i_di.di_ctime = get_seconds();
1660 gfs2_dinode_out(&dip->i_di, bh->b_data);
1661 brelse(bh);
1662 mark_inode_dirty(&dip->i_inode);
1663
1664 return error;
1665}
1666
1667/**
1668 * gfs2_dir_mvino - Change inode number of directory entry
1669 * @dip: The GFS2 inode
1670 * @filename:
1671 * @new_inode:
1672 *
1673 * This routine changes the inode number of a directory entry. It's used
1674 * by rename to change ".." when a directory is moved.
1675 * Assumes a glock is held on dvp.
1676 *
1677 * Returns: errno
1678 */
1679
1680int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
1681 struct gfs2_inum *inum, unsigned int new_type)
1682{
1683 struct buffer_head *bh;
1684 struct gfs2_dirent *dent;
1685 int error;
1686
1687 dent = gfs2_dirent_search(&dip->i_inode, filename, gfs2_dirent_find, &bh);
1688 if (!dent) {
1689 gfs2_consist_inode(dip);
1690 return -EIO;
1691 }
1692 if (IS_ERR(dent))
1693 return PTR_ERR(dent);
1694
1695 gfs2_trans_add_bh(dip->i_gl, bh, 1);
1696 gfs2_inum_out(inum, (char *)&dent->de_inum);
1697 dent->de_type = cpu_to_be16(new_type);
1698
1699 if (dip->i_di.di_flags & GFS2_DIF_EXHASH) {
1700 brelse(bh);
1701 error = gfs2_meta_inode_buffer(dip, &bh);
1702 if (error)
1703 return error;
1704 gfs2_trans_add_bh(dip->i_gl, bh, 1);
1705 }
1706
1707 dip->i_di.di_mtime = dip->i_di.di_ctime = get_seconds();
1708 gfs2_dinode_out(&dip->i_di, bh->b_data);
1709 brelse(bh);
1710 return 0;
1711}
1712
1713/**
1714 * foreach_leaf - call a function for each leaf in a directory
1715 * @dip: the directory
1716 * @lc: the function to call for each each
1717 * @data: private data to pass to it
1718 *
1719 * Returns: errno
1720 */
1721
1722static int foreach_leaf(struct gfs2_inode *dip, leaf_call_t lc, void *data)
1723{
1724 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
1725 struct buffer_head *bh;
1726 struct gfs2_leaf *leaf;
1727 uint32_t hsize, len;
1728 uint32_t ht_offset, lp_offset, ht_offset_cur = -1;
1729 uint32_t index = 0;
1730 uint64_t *lp;
1731 uint64_t leaf_no;
1732 int error = 0;
1733
1734 hsize = 1 << dip->i_di.di_depth;
1735 if (hsize * sizeof(uint64_t) != dip->i_di.di_size) {
1736 gfs2_consist_inode(dip);
1737 return -EIO;
1738 }
1739
1740 lp = kmalloc(sdp->sd_hash_bsize, GFP_KERNEL);
1741 if (!lp)
1742 return -ENOMEM;
1743
1744 while (index < hsize) {
1745 lp_offset = index & (sdp->sd_hash_ptrs - 1);
1746 ht_offset = index - lp_offset;
1747
1748 if (ht_offset_cur != ht_offset) {
1749 error = gfs2_dir_read_data(dip, (char *)lp,
1750 ht_offset * sizeof(uint64_t),
1751 sdp->sd_hash_bsize);
1752 if (error != sdp->sd_hash_bsize) {
1753 if (error >= 0)
1754 error = -EIO;
1755 goto out;
1756 }
1757 ht_offset_cur = ht_offset;
1758 }
1759
1760 leaf_no = be64_to_cpu(lp[lp_offset]);
1761 if (leaf_no) {
1762 error = get_leaf(dip, leaf_no, &bh);
1763 if (error)
1764 goto out;
1765 leaf = (struct gfs2_leaf *)bh->b_data;
1766 len = 1 << (dip->i_di.di_depth - be16_to_cpu(leaf->lf_depth));
1767 brelse(bh);
1768
1769 error = lc(dip, index, len, leaf_no, data);
1770 if (error)
1771 goto out;
1772
1773 index = (index & ~(len - 1)) + len;
1774 } else
1775 index++;
1776 }
1777
1778 if (index != hsize) {
1779 gfs2_consist_inode(dip);
1780 error = -EIO;
1781 }
1782
1783out:
1784 kfree(lp);
1785
1786 return error;
1787}
1788
1789/**
1790 * leaf_dealloc - Deallocate a directory leaf
1791 * @dip: the directory
1792 * @index: the hash table offset in the directory
1793 * @len: the number of pointers to this leaf
1794 * @leaf_no: the leaf number
1795 * @data: not used
1796 *
1797 * Returns: errno
1798 */
1799
1800static int leaf_dealloc(struct gfs2_inode *dip, uint32_t index, uint32_t len,
1801 uint64_t leaf_no, void *data)
1802{
1803 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
1804 struct gfs2_leaf *tmp_leaf;
1805 struct gfs2_rgrp_list rlist;
1806 struct buffer_head *bh, *dibh;
1807 uint64_t blk, nblk;
1808 unsigned int rg_blocks = 0, l_blocks = 0;
1809 char *ht;
1810 unsigned int x, size = len * sizeof(uint64_t);
1811 int error;
1812
1813 memset(&rlist, 0, sizeof(struct gfs2_rgrp_list));
1814
1815 ht = kzalloc(size, GFP_KERNEL);
1816 if (!ht)
1817 return -ENOMEM;
1818
1819 gfs2_alloc_get(dip);
1820
1821 error = gfs2_quota_hold(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
1822 if (error)
1823 goto out;
1824
1825 error = gfs2_rindex_hold(sdp, &dip->i_alloc.al_ri_gh);
1826 if (error)
1827 goto out_qs;
1828
1829 /* Count the number of leaves */
1830
1831 for (blk = leaf_no; blk; blk = nblk) {
1832 error = get_leaf(dip, blk, &bh);
1833 if (error)
1834 goto out_rlist;
1835 tmp_leaf = (struct gfs2_leaf *)bh->b_data;
1836 nblk = be64_to_cpu(tmp_leaf->lf_next);
1837 brelse(bh);
1838
1839 gfs2_rlist_add(sdp, &rlist, blk);
1840 l_blocks++;
1841 }
1842
1843 gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE, 0);
1844
1845 for (x = 0; x < rlist.rl_rgrps; x++) {
1846 struct gfs2_rgrpd *rgd;
1847 rgd = rlist.rl_ghs[x].gh_gl->gl_object;
1848 rg_blocks += rgd->rd_ri.ri_length;
1849 }
1850
1851 error = gfs2_glock_nq_m(rlist.rl_rgrps, rlist.rl_ghs);
1852 if (error)
1853 goto out_rlist;
1854
1855 error = gfs2_trans_begin(sdp,
1856 rg_blocks + (DIV_ROUND_UP(size, sdp->sd_jbsize) + 1) +
1857 RES_DINODE + RES_STATFS + RES_QUOTA, l_blocks);
1858 if (error)
1859 goto out_rg_gunlock;
1860
1861 for (blk = leaf_no; blk; blk = nblk) {
1862 error = get_leaf(dip, blk, &bh);
1863 if (error)
1864 goto out_end_trans;
1865 tmp_leaf = (struct gfs2_leaf *)bh->b_data;
1866 nblk = be64_to_cpu(tmp_leaf->lf_next);
1867 brelse(bh);
1868
1869 gfs2_free_meta(dip, blk, 1);
1870
1871 if (!dip->i_di.di_blocks)
1872 gfs2_consist_inode(dip);
1873 dip->i_di.di_blocks--;
1874 }
1875
1876 error = gfs2_dir_write_data(dip, ht, index * sizeof(uint64_t), size);
1877 if (error != size) {
1878 if (error >= 0)
1879 error = -EIO;
1880 goto out_end_trans;
1881 }
1882
1883 error = gfs2_meta_inode_buffer(dip, &dibh);
1884 if (error)
1885 goto out_end_trans;
1886
1887 gfs2_trans_add_bh(dip->i_gl, dibh, 1);
1888 gfs2_dinode_out(&dip->i_di, dibh->b_data);
1889 brelse(dibh);
1890
1891 out_end_trans:
1892 gfs2_trans_end(sdp);
1893
1894 out_rg_gunlock:
1895 gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs);
1896
1897 out_rlist:
1898 gfs2_rlist_free(&rlist);
1899 gfs2_glock_dq_uninit(&dip->i_alloc.al_ri_gh);
1900
1901 out_qs:
1902 gfs2_quota_unhold(dip);
1903
1904 out:
1905 gfs2_alloc_put(dip);
1906 kfree(ht);
1907
1908 return error;
1909}
1910
1911/**
1912 * gfs2_dir_exhash_dealloc - free all the leaf blocks in a directory
1913 * @dip: the directory
1914 *
1915 * Dealloc all on-disk directory leaves to FREEMETA state
1916 * Change on-disk inode type to "regular file"
1917 *
1918 * Returns: errno
1919 */
1920
1921int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip)
1922{
1923 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
1924 struct buffer_head *bh;
1925 int error;
1926
1927 /* Dealloc on-disk leaves to FREEMETA state */
1928 error = foreach_leaf(dip, leaf_dealloc, NULL);
1929 if (error)
1930 return error;
1931
1932 /* Make this a regular file in case we crash.
1933 (We don't want to free these blocks a second time.) */
1934
1935 error = gfs2_trans_begin(sdp, RES_DINODE, 0);
1936 if (error)
1937 return error;
1938
1939 error = gfs2_meta_inode_buffer(dip, &bh);
1940 if (!error) {
1941 gfs2_trans_add_bh(dip->i_gl, bh, 1);
1942 ((struct gfs2_dinode *)bh->b_data)->di_mode =
1943 cpu_to_be32(S_IFREG);
1944 brelse(bh);
1945 }
1946
1947 gfs2_trans_end(sdp);
1948
1949 return error;
1950}
1951
1952/**
1953 * gfs2_diradd_alloc_required - find if adding entry will require an allocation
1954 * @ip: the file being written to
1955 * @filname: the filename that's going to be added
1956 *
1957 * Returns: 1 if alloc required, 0 if not, -ve on error
1958 */
1959
1960int gfs2_diradd_alloc_required(struct inode *inode, const struct qstr *name)
1961{
1962 struct gfs2_dirent *dent;
1963 struct buffer_head *bh;
1964
1965 dent = gfs2_dirent_search(inode, name, gfs2_dirent_find_space, &bh);
1966 if (!dent) {
1967 return 1;
1968 }
1969 if (IS_ERR(dent))
1970 return PTR_ERR(dent);
1971 brelse(bh);
1972 return 0;
1973}
1974
diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h
new file mode 100644
index 000000000000..173403095eb2
--- /dev/null
+++ b/fs/gfs2/dir.h
@@ -0,0 +1,73 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __DIR_DOT_H__
11#define __DIR_DOT_H__
12
13/**
14 * gfs2_filldir_t - Report a directory entry to the caller of gfs2_dir_read()
15 * @opaque: opaque data used by the function
16 * @name: the name of the directory entry
17 * @length: the length of the name
18 * @offset: the entry's offset in the directory
19 * @inum: the inode number the entry points to
20 * @type: the type of inode the entry points to
21 *
22 * Returns: 0 on success, 1 if buffer full
23 */
24
25typedef int (*gfs2_filldir_t) (void *opaque,
26 const char *name, unsigned int length,
27 uint64_t offset,
28 struct gfs2_inum *inum, unsigned int type);
29
30int gfs2_dir_search(struct inode *dir, const struct qstr *filename,
31 struct gfs2_inum *inum, unsigned int *type);
32int gfs2_dir_add(struct inode *inode, const struct qstr *filename,
33 const struct gfs2_inum *inum, unsigned int type);
34int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *filename);
35int gfs2_dir_read(struct inode *inode, uint64_t * offset, void *opaque,
36 gfs2_filldir_t filldir);
37int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
38 struct gfs2_inum *new_inum, unsigned int new_type);
39
40int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip);
41
42int gfs2_diradd_alloc_required(struct inode *dir,
43 const struct qstr *filename);
44int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, uint64_t block,
45 struct buffer_head **bhp);
46
/* Hash @len bytes at @data: crc32_le() seeded with all ones, then inverted */
static inline uint32_t gfs2_disk_hash(const char *data, int len)
{
	uint32_t crc = crc32_le(0xFFFFFFFF, data, len);

	return crc ^ 0xFFFFFFFF;
}
51
52
53static inline void gfs2_str2qstr(struct qstr *name, const char *fname)
54{
55 name->name = fname;
56 name->len = strlen(fname);
57 name->hash = gfs2_disk_hash(name->name, name->len);
58}
59
60/* N.B. This probably ought to take inum & type as args as well */
61static inline void gfs2_qstr2dirent(const struct qstr *name, u16 reclen, struct gfs2_dirent *dent)
62{
63 dent->de_inum.no_addr = cpu_to_be64(0);
64 dent->de_inum.no_formal_ino = cpu_to_be64(0);
65 dent->de_hash = cpu_to_be32(name->hash);
66 dent->de_rec_len = cpu_to_be16(reclen);
67 dent->de_name_len = cpu_to_be16(name->len);
68 dent->de_type = cpu_to_be16(0);
69 memset(dent->__pad, 0, sizeof(dent->__pad));
70 memcpy((char*)(dent+1), name->name, name->len);
71}
72
73#endif /* __DIR_DOT_H__ */
diff --git a/fs/gfs2/eaops.c b/fs/gfs2/eaops.c
new file mode 100644
index 000000000000..3ace242f2b16
--- /dev/null
+++ b/fs/gfs2/eaops.c
@@ -0,0 +1,230 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/xattr.h>
16#include <linux/gfs2_ondisk.h>
17#include <asm/uaccess.h>
18
19#include "gfs2.h"
20#include "lm_interface.h"
21#include "incore.h"
22#include "acl.h"
23#include "eaops.h"
24#include "eattr.h"
25#include "util.h"
26
27/**
28 * gfs2_ea_name2type - get the type of the ea, and truncate type from the name
29 * @namep: ea name, possibly with type appended
30 *
31 * Returns: GFS2_EATYPE_XXX
32 */
33
34unsigned int gfs2_ea_name2type(const char *name, char **truncated_name)
35{
36 unsigned int type;
37
38 if (strncmp(name, "system.", 7) == 0) {
39 type = GFS2_EATYPE_SYS;
40 if (truncated_name)
41 *truncated_name = strchr(name, '.') + 1;
42 } else if (strncmp(name, "user.", 5) == 0) {
43 type = GFS2_EATYPE_USR;
44 if (truncated_name)
45 *truncated_name = strchr(name, '.') + 1;
46 } else if (strncmp(name, "security.", 9) == 0) {
47 type = GFS2_EATYPE_SECURITY;
48 if (truncated_name)
49 *truncated_name = strchr(name, '.') + 1;
50 } else {
51 type = GFS2_EATYPE_UNUSED;
52 if (truncated_name)
53 *truncated_name = NULL;
54 }
55
56 return type;
57}
58
59static int user_eo_get(struct gfs2_inode *ip, struct gfs2_ea_request *er)
60{
61 struct inode *inode = &ip->i_inode;
62 int error = permission(inode, MAY_READ, NULL);
63 if (error)
64 return error;
65
66 return gfs2_ea_get_i(ip, er);
67}
68
69static int user_eo_set(struct gfs2_inode *ip, struct gfs2_ea_request *er)
70{
71 struct inode *inode = &ip->i_inode;
72
73 if (S_ISREG(inode->i_mode) ||
74 (S_ISDIR(inode->i_mode) && !(inode->i_mode & S_ISVTX))) {
75 int error = permission(inode, MAY_WRITE, NULL);
76 if (error)
77 return error;
78 } else
79 return -EPERM;
80
81 return gfs2_ea_set_i(ip, er);
82}
83
84static int user_eo_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er)
85{
86 struct inode *inode = &ip->i_inode;
87
88 if (S_ISREG(inode->i_mode) ||
89 (S_ISDIR(inode->i_mode) && !(inode->i_mode & S_ISVTX))) {
90 int error = permission(inode, MAY_WRITE, NULL);
91 if (error)
92 return error;
93 } else
94 return -EPERM;
95
96 return gfs2_ea_remove_i(ip, er);
97}
98
99static int system_eo_get(struct gfs2_inode *ip, struct gfs2_ea_request *er)
100{
101 if (!GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len) &&
102 !GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len) &&
103 !capable(CAP_SYS_ADMIN))
104 return -EPERM;
105
106 if (GFS2_SB(&ip->i_inode)->sd_args.ar_posix_acl == 0 &&
107 (GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len) ||
108 GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len)))
109 return -EOPNOTSUPP;
110
111
112
113 return gfs2_ea_get_i(ip, er);
114}
115
116static int system_eo_set(struct gfs2_inode *ip, struct gfs2_ea_request *er)
117{
118 int remove = 0;
119 int error;
120
121 if (GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len)) {
122 if (!(er->er_flags & GFS2_ERF_MODE)) {
123 er->er_mode = ip->i_di.di_mode;
124 er->er_flags |= GFS2_ERF_MODE;
125 }
126 error = gfs2_acl_validate_set(ip, 1, er,
127 &remove, &er->er_mode);
128 if (error)
129 return error;
130 error = gfs2_ea_set_i(ip, er);
131 if (error)
132 return error;
133 if (remove)
134 gfs2_ea_remove_i(ip, er);
135 return 0;
136
137 } else if (GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len)) {
138 error = gfs2_acl_validate_set(ip, 0, er,
139 &remove, NULL);
140 if (error)
141 return error;
142 if (!remove)
143 error = gfs2_ea_set_i(ip, er);
144 else {
145 error = gfs2_ea_remove_i(ip, er);
146 if (error == -ENODATA)
147 error = 0;
148 }
149 return error;
150 }
151
152 return -EPERM;
153}
154
155static int system_eo_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er)
156{
157 if (GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len)) {
158 int error = gfs2_acl_validate_remove(ip, 1);
159 if (error)
160 return error;
161
162 } else if (GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len)) {
163 int error = gfs2_acl_validate_remove(ip, 0);
164 if (error)
165 return error;
166
167 } else
168 return -EPERM;
169
170 return gfs2_ea_remove_i(ip, er);
171}
172
173static int security_eo_get(struct gfs2_inode *ip, struct gfs2_ea_request *er)
174{
175 struct inode *inode = &ip->i_inode;
176 int error = permission(inode, MAY_READ, NULL);
177 if (error)
178 return error;
179
180 return gfs2_ea_get_i(ip, er);
181}
182
183static int security_eo_set(struct gfs2_inode *ip, struct gfs2_ea_request *er)
184{
185 struct inode *inode = &ip->i_inode;
186 int error = permission(inode, MAY_WRITE, NULL);
187 if (error)
188 return error;
189
190 return gfs2_ea_set_i(ip, er);
191}
192
193static int security_eo_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er)
194{
195 struct inode *inode = &ip->i_inode;
196 int error = permission(inode, MAY_WRITE, NULL);
197 if (error)
198 return error;
199
200 return gfs2_ea_remove_i(ip, er);
201}
202
203static struct gfs2_eattr_operations gfs2_user_eaops = {
204 .eo_get = user_eo_get,
205 .eo_set = user_eo_set,
206 .eo_remove = user_eo_remove,
207 .eo_name = "user",
208};
209
210struct gfs2_eattr_operations gfs2_system_eaops = {
211 .eo_get = system_eo_get,
212 .eo_set = system_eo_set,
213 .eo_remove = system_eo_remove,
214 .eo_name = "system",
215};
216
217static struct gfs2_eattr_operations gfs2_security_eaops = {
218 .eo_get = security_eo_get,
219 .eo_set = security_eo_set,
220 .eo_remove = security_eo_remove,
221 .eo_name = "security",
222};
223
224struct gfs2_eattr_operations *gfs2_ea_ops[] = {
225 NULL,
226 &gfs2_user_eaops,
227 &gfs2_system_eaops,
228 &gfs2_security_eaops,
229};
230
diff --git a/fs/gfs2/eaops.h b/fs/gfs2/eaops.h
new file mode 100644
index 000000000000..3dece17e3116
--- /dev/null
+++ b/fs/gfs2/eaops.h
@@ -0,0 +1,29 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __EAOPS_DOT_H__
11#define __EAOPS_DOT_H__
12
13struct gfs2_ea_request;
14
/*
 * Per-prefix extended attribute handlers.  Each handler receives the
 * inode and the request describing the attribute and returns an int
 * status.
 */
struct gfs2_eattr_operations {
	/* read an attribute */
	int (*eo_get) (struct gfs2_inode *ip, struct gfs2_ea_request *er);
	/* create or replace an attribute */
	int (*eo_set) (struct gfs2_inode *ip, struct gfs2_ea_request *er);
	/* delete an attribute */
	int (*eo_remove) (struct gfs2_inode *ip, struct gfs2_ea_request *er);
	/* prefix name of this handler set */
	char *eo_name;
};
21
22unsigned int gfs2_ea_name2type(const char *name, char **truncated_name);
23
24extern struct gfs2_eattr_operations gfs2_system_eaops;
25
26extern struct gfs2_eattr_operations *gfs2_ea_ops[];
27
28#endif /* __EAOPS_DOT_H__ */
29
diff --git a/fs/gfs2/eattr.c b/fs/gfs2/eattr.c
new file mode 100644
index 000000000000..96736932260f
--- /dev/null
+++ b/fs/gfs2/eattr.c
@@ -0,0 +1,1548 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/xattr.h>
16#include <linux/gfs2_ondisk.h>
17#include <asm/uaccess.h>
18
19#include "gfs2.h"
20#include "lm_interface.h"
21#include "incore.h"
22#include "acl.h"
23#include "eaops.h"
24#include "eattr.h"
25#include "glock.h"
26#include "inode.h"
27#include "meta_io.h"
28#include "quota.h"
29#include "rgrp.h"
30#include "trans.h"
31#include "util.h"
32
33/**
34 * ea_calc_size - returns the acutal number of bytes the request will take up
35 * (not counting any unstuffed data blocks)
36 * @sdp:
37 * @er:
38 * @size:
39 *
40 * Returns: 1 if the EA should be stuffed
41 */
42
43static int ea_calc_size(struct gfs2_sbd *sdp, struct gfs2_ea_request *er,
44 unsigned int *size)
45{
46 *size = GFS2_EAREQ_SIZE_STUFFED(er);
47 if (*size <= sdp->sd_jbsize)
48 return 1;
49
50 *size = GFS2_EAREQ_SIZE_UNSTUFFED(sdp, er);
51
52 return 0;
53}
54
55static int ea_check_size(struct gfs2_sbd *sdp, struct gfs2_ea_request *er)
56{
57 unsigned int size;
58
59 if (er->er_data_len > GFS2_EA_MAX_DATA_LEN)
60 return -ERANGE;
61
62 ea_calc_size(sdp, er, &size);
63
64 /* This can only happen with 512 byte blocks */
65 if (size > sdp->sd_jbsize)
66 return -ERANGE;
67
68 return 0;
69}
70
/*
 * Callback run by ea_foreach_i() for each EA record in a block.  It gets
 * the inode, the buffer holding the record, the record itself, the record
 * visited just before it in the same buffer (NULL for the first) and the
 * caller's private pointer.  A non-zero return stops the walk and is
 * handed back to the caller.
 */
typedef int (*ea_call_t) (struct gfs2_inode *ip,
			  struct buffer_head *bh,
			  struct gfs2_ea_header *ea,
			  struct gfs2_ea_header *prev,
			  void *private);
76
77static int ea_foreach_i(struct gfs2_inode *ip, struct buffer_head *bh,
78 ea_call_t ea_call, void *data)
79{
80 struct gfs2_ea_header *ea, *prev = NULL;
81 int error = 0;
82
83 if (gfs2_metatype_check(GFS2_SB(&ip->i_inode), bh, GFS2_METATYPE_EA))
84 return -EIO;
85
86 for (ea = GFS2_EA_BH2FIRST(bh);; prev = ea, ea = GFS2_EA2NEXT(ea)) {
87 if (!GFS2_EA_REC_LEN(ea))
88 goto fail;
89 if (!(bh->b_data <= (char *)ea &&
90 (char *)GFS2_EA2NEXT(ea) <=
91 bh->b_data + bh->b_size))
92 goto fail;
93 if (!GFS2_EATYPE_VALID(ea->ea_type))
94 goto fail;
95
96 error = ea_call(ip, bh, ea, prev, data);
97 if (error)
98 return error;
99
100 if (GFS2_EA_IS_LAST(ea)) {
101 if ((char *)GFS2_EA2NEXT(ea) !=
102 bh->b_data + bh->b_size)
103 goto fail;
104 break;
105 }
106 }
107
108 return error;
109
110 fail:
111 gfs2_consist_inode(ip);
112 return -EIO;
113}
114
115static int ea_foreach(struct gfs2_inode *ip, ea_call_t ea_call, void *data)
116{
117 struct buffer_head *bh, *eabh;
118 uint64_t *eablk, *end;
119 int error;
120
121 error = gfs2_meta_read(ip->i_gl, ip->i_di.di_eattr,
122 DIO_START | DIO_WAIT, &bh);
123 if (error)
124 return error;
125
126 if (!(ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT)) {
127 error = ea_foreach_i(ip, bh, ea_call, data);
128 goto out;
129 }
130
131 if (gfs2_metatype_check(GFS2_SB(&ip->i_inode), bh, GFS2_METATYPE_IN)) {
132 error = -EIO;
133 goto out;
134 }
135
136 eablk = (uint64_t *)(bh->b_data + sizeof(struct gfs2_meta_header));
137 end = eablk + GFS2_SB(&ip->i_inode)->sd_inptrs;
138
139 for (; eablk < end; eablk++) {
140 uint64_t bn;
141
142 if (!*eablk)
143 break;
144 bn = be64_to_cpu(*eablk);
145
146 error = gfs2_meta_read(ip->i_gl, bn, DIO_START | DIO_WAIT,
147 &eabh);
148 if (error)
149 break;
150 error = ea_foreach_i(ip, eabh, ea_call, data);
151 brelse(eabh);
152 if (error)
153 break;
154 }
155 out:
156 brelse(bh);
157
158 return error;
159}
160
/* Private data handed to ea_find_i() by gfs2_ea_find() */
struct ea_find {
	struct gfs2_ea_request *ef_er;		/* what is being looked for */
	struct gfs2_ea_location *ef_el;		/* filled in on a match */
};
165
166static int ea_find_i(struct gfs2_inode *ip, struct buffer_head *bh,
167 struct gfs2_ea_header *ea, struct gfs2_ea_header *prev,
168 void *private)
169{
170 struct ea_find *ef = private;
171 struct gfs2_ea_request *er = ef->ef_er;
172
173 if (ea->ea_type == GFS2_EATYPE_UNUSED)
174 return 0;
175
176 if (ea->ea_type == er->er_type) {
177 if (ea->ea_name_len == er->er_name_len &&
178 !memcmp(GFS2_EA2NAME(ea), er->er_name, ea->ea_name_len)) {
179 struct gfs2_ea_location *el = ef->ef_el;
180 get_bh(bh);
181 el->el_bh = bh;
182 el->el_ea = ea;
183 el->el_prev = prev;
184 return 1;
185 }
186 }
187
188#if 0
189 else if ((ip->i_di.di_flags & GFS2_DIF_EA_PACKED) &&
190 er->er_type == GFS2_EATYPE_SYS)
191 return 1;
192#endif
193
194 return 0;
195}
196
197int gfs2_ea_find(struct gfs2_inode *ip, struct gfs2_ea_request *er,
198 struct gfs2_ea_location *el)
199{
200 struct ea_find ef;
201 int error;
202
203 ef.ef_er = er;
204 ef.ef_el = el;
205
206 memset(el, 0, sizeof(struct gfs2_ea_location));
207
208 error = ea_foreach(ip, ea_find_i, &ef);
209 if (error > 0)
210 return 0;
211
212 return error;
213}
214
215/**
216 * ea_dealloc_unstuffed -
217 * @ip:
218 * @bh:
219 * @ea:
220 * @prev:
221 * @private:
222 *
223 * Take advantage of the fact that all unstuffed blocks are
224 * allocated from the same RG. But watch, this may not always
225 * be true.
226 *
227 * Returns: errno
228 */
229
230static int ea_dealloc_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh,
231 struct gfs2_ea_header *ea,
232 struct gfs2_ea_header *prev, void *private)
233{
234 int *leave = private;
235 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
236 struct gfs2_rgrpd *rgd;
237 struct gfs2_holder rg_gh;
238 struct buffer_head *dibh;
239 uint64_t *dataptrs, bn = 0;
240 uint64_t bstart = 0;
241 unsigned int blen = 0;
242 unsigned int blks = 0;
243 unsigned int x;
244 int error;
245
246 if (GFS2_EA_IS_STUFFED(ea))
247 return 0;
248
249 dataptrs = GFS2_EA2DATAPTRS(ea);
250 for (x = 0; x < ea->ea_num_ptrs; x++, dataptrs++)
251 if (*dataptrs) {
252 blks++;
253 bn = be64_to_cpu(*dataptrs);
254 }
255 if (!blks)
256 return 0;
257
258 rgd = gfs2_blk2rgrpd(sdp, bn);
259 if (!rgd) {
260 gfs2_consist_inode(ip);
261 return -EIO;
262 }
263
264 error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &rg_gh);
265 if (error)
266 return error;
267
268 error = gfs2_trans_begin(sdp, rgd->rd_ri.ri_length +
269 RES_DINODE + RES_EATTR + RES_STATFS +
270 RES_QUOTA, blks);
271 if (error)
272 goto out_gunlock;
273
274 gfs2_trans_add_bh(ip->i_gl, bh, 1);
275
276 dataptrs = GFS2_EA2DATAPTRS(ea);
277 for (x = 0; x < ea->ea_num_ptrs; x++, dataptrs++) {
278 if (!*dataptrs)
279 break;
280 bn = be64_to_cpu(*dataptrs);
281
282 if (bstart + blen == bn)
283 blen++;
284 else {
285 if (bstart)
286 gfs2_free_meta(ip, bstart, blen);
287 bstart = bn;
288 blen = 1;
289 }
290
291 *dataptrs = 0;
292 if (!ip->i_di.di_blocks)
293 gfs2_consist_inode(ip);
294 ip->i_di.di_blocks--;
295 }
296 if (bstart)
297 gfs2_free_meta(ip, bstart, blen);
298
299 if (prev && !leave) {
300 uint32_t len;
301
302 len = GFS2_EA_REC_LEN(prev) + GFS2_EA_REC_LEN(ea);
303 prev->ea_rec_len = cpu_to_be32(len);
304
305 if (GFS2_EA_IS_LAST(ea))
306 prev->ea_flags |= GFS2_EAFLAG_LAST;
307 } else {
308 ea->ea_type = GFS2_EATYPE_UNUSED;
309 ea->ea_num_ptrs = 0;
310 }
311
312 error = gfs2_meta_inode_buffer(ip, &dibh);
313 if (!error) {
314 ip->i_di.di_ctime = get_seconds();
315 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
316 gfs2_dinode_out(&ip->i_di, dibh->b_data);
317 brelse(dibh);
318 }
319
320 gfs2_trans_end(sdp);
321
322 out_gunlock:
323 gfs2_glock_dq_uninit(&rg_gh);
324
325 return error;
326}
327
328static int ea_remove_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh,
329 struct gfs2_ea_header *ea,
330 struct gfs2_ea_header *prev, int leave)
331{
332 struct gfs2_alloc *al;
333 int error;
334
335 al = gfs2_alloc_get(ip);
336
337 error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
338 if (error)
339 goto out_alloc;
340
341 error = gfs2_rindex_hold(GFS2_SB(&ip->i_inode), &al->al_ri_gh);
342 if (error)
343 goto out_quota;
344
345 error = ea_dealloc_unstuffed(ip,
346 bh, ea, prev,
347 (leave) ? &error : NULL);
348
349 gfs2_glock_dq_uninit(&al->al_ri_gh);
350
351 out_quota:
352 gfs2_quota_unhold(ip);
353
354 out_alloc:
355 gfs2_alloc_put(ip);
356
357 return error;
358}
359
/* Private data handed to ea_list_i() by gfs2_ea_list() */
struct ea_list {
	struct gfs2_ea_request *ei_er;	/* request holding the output buffer */
	unsigned int ei_size;		/* bytes accumulated so far */
};
364
365static int ea_list_i(struct gfs2_inode *ip, struct buffer_head *bh,
366 struct gfs2_ea_header *ea, struct gfs2_ea_header *prev,
367 void *private)
368{
369 struct ea_list *ei = private;
370 struct gfs2_ea_request *er = ei->ei_er;
371 unsigned int ea_size = gfs2_ea_strlen(ea);
372
373 if (ea->ea_type == GFS2_EATYPE_UNUSED)
374 return 0;
375
376 if (er->er_data_len) {
377 char *prefix = NULL;
378 unsigned int l = 0;
379 char c = 0;
380
381 if (ei->ei_size + ea_size > er->er_data_len)
382 return -ERANGE;
383
384 switch (ea->ea_type) {
385 case GFS2_EATYPE_USR:
386 prefix = "user.";
387 l = 5;
388 break;
389 case GFS2_EATYPE_SYS:
390 prefix = "system.";
391 l = 7;
392 break;
393 case GFS2_EATYPE_SECURITY:
394 prefix = "security.";
395 l = 9;
396 break;
397 }
398
399 BUG_ON(l == 0);
400
401 memcpy(er->er_data + ei->ei_size, prefix, l);
402 memcpy(er->er_data + ei->ei_size + l, GFS2_EA2NAME(ea),
403 ea->ea_name_len);
404 memcpy(er->er_data + ei->ei_size + ea_size - 1, &c, 1);
405 }
406
407 ei->ei_size += ea_size;
408
409 return 0;
410}
411
412/**
413 * gfs2_ea_list -
414 * @ip:
415 * @er:
416 *
417 * Returns: actual size of data on success, -errno on error
418 */
419
420int gfs2_ea_list(struct gfs2_inode *ip, struct gfs2_ea_request *er)
421{
422 struct gfs2_holder i_gh;
423 int error;
424
425 if (!er->er_data || !er->er_data_len) {
426 er->er_data = NULL;
427 er->er_data_len = 0;
428 }
429
430 error = gfs2_glock_nq_init(ip->i_gl,
431 LM_ST_SHARED, LM_FLAG_ANY,
432 &i_gh);
433 if (error)
434 return error;
435
436 if (ip->i_di.di_eattr) {
437 struct ea_list ei = { .ei_er = er, .ei_size = 0 };
438
439 error = ea_foreach(ip, ea_list_i, &ei);
440 if (!error)
441 error = ei.ei_size;
442 }
443
444 gfs2_glock_dq_uninit(&i_gh);
445
446 return error;
447}
448
449/**
450 * ea_get_unstuffed - actually copies the unstuffed data into the
451 * request buffer
452 * @ip:
453 * @ea:
454 * @data:
455 *
456 * Returns: errno
457 */
458
459static int ea_get_unstuffed(struct gfs2_inode *ip, struct gfs2_ea_header *ea,
460 char *data)
461{
462 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
463 struct buffer_head **bh;
464 unsigned int amount = GFS2_EA_DATA_LEN(ea);
465 unsigned int nptrs = DIV_ROUND_UP(amount, sdp->sd_jbsize);
466 uint64_t *dataptrs = GFS2_EA2DATAPTRS(ea);
467 unsigned int x;
468 int error = 0;
469
470 bh = kcalloc(nptrs, sizeof(struct buffer_head *), GFP_KERNEL);
471 if (!bh)
472 return -ENOMEM;
473
474 for (x = 0; x < nptrs; x++) {
475 error = gfs2_meta_read(ip->i_gl, be64_to_cpu(*dataptrs),
476 DIO_START, bh + x);
477 if (error) {
478 while (x--)
479 brelse(bh[x]);
480 goto out;
481 }
482 dataptrs++;
483 }
484
485 for (x = 0; x < nptrs; x++) {
486 error = gfs2_meta_reread(sdp, bh[x], DIO_WAIT);
487 if (error) {
488 for (; x < nptrs; x++)
489 brelse(bh[x]);
490 goto out;
491 }
492 if (gfs2_metatype_check(sdp, bh[x], GFS2_METATYPE_ED)) {
493 for (; x < nptrs; x++)
494 brelse(bh[x]);
495 error = -EIO;
496 goto out;
497 }
498
499 memcpy(data,
500 bh[x]->b_data + sizeof(struct gfs2_meta_header),
501 (sdp->sd_jbsize > amount) ? amount : sdp->sd_jbsize);
502
503 amount -= sdp->sd_jbsize;
504 data += sdp->sd_jbsize;
505
506 brelse(bh[x]);
507 }
508
509 out:
510 kfree(bh);
511
512 return error;
513}
514
515int gfs2_ea_get_copy(struct gfs2_inode *ip, struct gfs2_ea_location *el,
516 char *data)
517{
518 if (GFS2_EA_IS_STUFFED(el->el_ea)) {
519 memcpy(data,
520 GFS2_EA2DATA(el->el_ea),
521 GFS2_EA_DATA_LEN(el->el_ea));
522 return 0;
523 } else
524 return ea_get_unstuffed(ip, el->el_ea, data);
525}
526
527/**
528 * gfs2_ea_get_i -
529 * @ip:
530 * @er:
531 *
532 * Returns: actual size of data on success, -errno on error
533 */
534
535int gfs2_ea_get_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
536{
537 struct gfs2_ea_location el;
538 int error;
539
540 if (!ip->i_di.di_eattr)
541 return -ENODATA;
542
543 error = gfs2_ea_find(ip, er, &el);
544 if (error)
545 return error;
546 if (!el.el_ea)
547 return -ENODATA;
548
549 if (er->er_data_len) {
550 if (GFS2_EA_DATA_LEN(el.el_ea) > er->er_data_len)
551 error = -ERANGE;
552 else
553 error = gfs2_ea_get_copy(ip, &el, er->er_data);
554 }
555 if (!error)
556 error = GFS2_EA_DATA_LEN(el.el_ea);
557
558 brelse(el.el_bh);
559
560 return error;
561}
562
563/**
564 * gfs2_ea_get -
565 * @ip:
566 * @er:
567 *
568 * Returns: actual size of data on success, -errno on error
569 */
570
571int gfs2_ea_get(struct gfs2_inode *ip, struct gfs2_ea_request *er)
572{
573 struct gfs2_holder i_gh;
574 int error;
575
576 if (!er->er_name_len ||
577 er->er_name_len > GFS2_EA_MAX_NAME_LEN)
578 return -EINVAL;
579 if (!er->er_data || !er->er_data_len) {
580 er->er_data = NULL;
581 er->er_data_len = 0;
582 }
583
584 error = gfs2_glock_nq_init(ip->i_gl,
585 LM_ST_SHARED, LM_FLAG_ANY,
586 &i_gh);
587 if (error)
588 return error;
589
590 error = gfs2_ea_ops[er->er_type]->eo_get(ip, er);
591
592 gfs2_glock_dq_uninit(&i_gh);
593
594 return error;
595}
596
597/**
598 * ea_alloc_blk - allocates a new block for extended attributes.
599 * @ip: A pointer to the inode that's getting extended attributes
600 * @bhp:
601 *
602 * Returns: errno
603 */
604
605static int ea_alloc_blk(struct gfs2_inode *ip, struct buffer_head **bhp)
606{
607 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
608 struct gfs2_ea_header *ea;
609 uint64_t block;
610
611 block = gfs2_alloc_meta(ip);
612
613 *bhp = gfs2_meta_new(ip->i_gl, block);
614 gfs2_trans_add_bh(ip->i_gl, *bhp, 1);
615 gfs2_metatype_set(*bhp, GFS2_METATYPE_EA, GFS2_FORMAT_EA);
616 gfs2_buffer_clear_tail(*bhp, sizeof(struct gfs2_meta_header));
617
618 ea = GFS2_EA_BH2FIRST(*bhp);
619 ea->ea_rec_len = cpu_to_be32(sdp->sd_jbsize);
620 ea->ea_type = GFS2_EATYPE_UNUSED;
621 ea->ea_flags = GFS2_EAFLAG_LAST;
622 ea->ea_num_ptrs = 0;
623
624 ip->i_di.di_blocks++;
625
626 return 0;
627}
628
629/**
630 * ea_write - writes the request info to an ea, creating new blocks if
631 * necessary
632 * @ip: inode that is being modified
633 * @ea: the location of the new ea in a block
634 * @er: the write request
635 *
636 * Note: does not update ea_rec_len or the GFS2_EAFLAG_LAST bin of ea_flags
637 *
638 * returns : errno
639 */
640
641static int ea_write(struct gfs2_inode *ip, struct gfs2_ea_header *ea,
642 struct gfs2_ea_request *er)
643{
644 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
645
646 ea->ea_data_len = cpu_to_be32(er->er_data_len);
647 ea->ea_name_len = er->er_name_len;
648 ea->ea_type = er->er_type;
649 ea->__pad = 0;
650
651 memcpy(GFS2_EA2NAME(ea), er->er_name, er->er_name_len);
652
653 if (GFS2_EAREQ_SIZE_STUFFED(er) <= sdp->sd_jbsize) {
654 ea->ea_num_ptrs = 0;
655 memcpy(GFS2_EA2DATA(ea), er->er_data, er->er_data_len);
656 } else {
657 uint64_t *dataptr = GFS2_EA2DATAPTRS(ea);
658 const char *data = er->er_data;
659 unsigned int data_len = er->er_data_len;
660 unsigned int copy;
661 unsigned int x;
662
663 ea->ea_num_ptrs = DIV_ROUND_UP(er->er_data_len, sdp->sd_jbsize);
664 for (x = 0; x < ea->ea_num_ptrs; x++) {
665 struct buffer_head *bh;
666 uint64_t block;
667 int mh_size = sizeof(struct gfs2_meta_header);
668
669 block = gfs2_alloc_meta(ip);
670
671 bh = gfs2_meta_new(ip->i_gl, block);
672 gfs2_trans_add_bh(ip->i_gl, bh, 1);
673 gfs2_metatype_set(bh, GFS2_METATYPE_ED, GFS2_FORMAT_ED);
674
675 ip->i_di.di_blocks++;
676
677 copy = (data_len > sdp->sd_jbsize) ? sdp->sd_jbsize :
678 data_len;
679 memcpy(bh->b_data + mh_size, data, copy);
680 if (copy < sdp->sd_jbsize)
681 memset(bh->b_data + mh_size + copy, 0,
682 sdp->sd_jbsize - copy);
683
684 *dataptr++ = cpu_to_be64((uint64_t)bh->b_blocknr);
685 data += copy;
686 data_len -= copy;
687
688 brelse(bh);
689 }
690
691 gfs2_assert_withdraw(sdp, !data_len);
692 }
693
694 return 0;
695}
696
/*
 * Worker run by ea_alloc_skeleton() inside its transaction.  It gets the
 * inode, the request and the caller's private pointer; a non-zero return
 * is treated as an error by ea_alloc_skeleton().
 */
typedef int (*ea_skeleton_call_t) (struct gfs2_inode *ip,
				   struct gfs2_ea_request *er,
				   void *private);
700
701static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er,
702 unsigned int blks,
703 ea_skeleton_call_t skeleton_call,
704 void *private)
705{
706 struct gfs2_alloc *al;
707 struct buffer_head *dibh;
708 int error;
709
710 al = gfs2_alloc_get(ip);
711
712 error = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
713 if (error)
714 goto out;
715
716 error = gfs2_quota_check(ip, ip->i_di.di_uid, ip->i_di.di_gid);
717 if (error)
718 goto out_gunlock_q;
719
720 al->al_requested = blks;
721
722 error = gfs2_inplace_reserve(ip);
723 if (error)
724 goto out_gunlock_q;
725
726 error = gfs2_trans_begin(GFS2_SB(&ip->i_inode),
727 blks + al->al_rgd->rd_ri.ri_length +
728 RES_DINODE + RES_STATFS + RES_QUOTA, 0);
729 if (error)
730 goto out_ipres;
731
732 error = skeleton_call(ip, er, private);
733 if (error)
734 goto out_end_trans;
735
736 error = gfs2_meta_inode_buffer(ip, &dibh);
737 if (!error) {
738 if (er->er_flags & GFS2_ERF_MODE) {
739 gfs2_assert_withdraw(GFS2_SB(&ip->i_inode),
740 (ip->i_di.di_mode & S_IFMT) ==
741 (er->er_mode & S_IFMT));
742 ip->i_di.di_mode = er->er_mode;
743 }
744 ip->i_di.di_ctime = get_seconds();
745 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
746 gfs2_dinode_out(&ip->i_di, dibh->b_data);
747 brelse(dibh);
748 }
749
750 out_end_trans:
751 gfs2_trans_end(GFS2_SB(&ip->i_inode));
752
753 out_ipres:
754 gfs2_inplace_release(ip);
755
756 out_gunlock_q:
757 gfs2_quota_unlock(ip);
758
759 out:
760 gfs2_alloc_put(ip);
761
762 return error;
763}
764
765static int ea_init_i(struct gfs2_inode *ip, struct gfs2_ea_request *er,
766 void *private)
767{
768 struct buffer_head *bh;
769 int error;
770
771 error = ea_alloc_blk(ip, &bh);
772 if (error)
773 return error;
774
775 ip->i_di.di_eattr = bh->b_blocknr;
776 error = ea_write(ip, GFS2_EA_BH2FIRST(bh), er);
777
778 brelse(bh);
779
780 return error;
781}
782
783/**
784 * ea_init - initializes a new eattr block
785 * @ip:
786 * @er:
787 *
788 * Returns: errno
789 */
790
791static int ea_init(struct gfs2_inode *ip, struct gfs2_ea_request *er)
792{
793 unsigned int jbsize = GFS2_SB(&ip->i_inode)->sd_jbsize;
794 unsigned int blks = 1;
795
796 if (GFS2_EAREQ_SIZE_STUFFED(er) > jbsize)
797 blks += DIV_ROUND_UP(er->er_data_len, jbsize);
798
799 return ea_alloc_skeleton(ip, er, blks, ea_init_i, NULL);
800}
801
802static struct gfs2_ea_header *ea_split_ea(struct gfs2_ea_header *ea)
803{
804 uint32_t ea_size = GFS2_EA_SIZE(ea);
805 struct gfs2_ea_header *new = (struct gfs2_ea_header *)((char *)ea +
806 ea_size);
807 uint32_t new_size = GFS2_EA_REC_LEN(ea) - ea_size;
808 int last = ea->ea_flags & GFS2_EAFLAG_LAST;
809
810 ea->ea_rec_len = cpu_to_be32(ea_size);
811 ea->ea_flags ^= last;
812
813 new->ea_rec_len = cpu_to_be32(new_size);
814 new->ea_flags = last;
815
816 return new;
817}
818
819static void ea_set_remove_stuffed(struct gfs2_inode *ip,
820 struct gfs2_ea_location *el)
821{
822 struct gfs2_ea_header *ea = el->el_ea;
823 struct gfs2_ea_header *prev = el->el_prev;
824 uint32_t len;
825
826 gfs2_trans_add_bh(ip->i_gl, el->el_bh, 1);
827
828 if (!prev || !GFS2_EA_IS_STUFFED(ea)) {
829 ea->ea_type = GFS2_EATYPE_UNUSED;
830 return;
831 } else if (GFS2_EA2NEXT(prev) != ea) {
832 prev = GFS2_EA2NEXT(prev);
833 gfs2_assert_withdraw(GFS2_SB(&ip->i_inode), GFS2_EA2NEXT(prev) == ea);
834 }
835
836 len = GFS2_EA_REC_LEN(prev) + GFS2_EA_REC_LEN(ea);
837 prev->ea_rec_len = cpu_to_be32(len);
838
839 if (GFS2_EA_IS_LAST(ea))
840 prev->ea_flags |= GFS2_EAFLAG_LAST;
841}
842
/* State shared between ea_set_i(), ea_set_simple() and its helpers */
struct ea_set {
	/* non-zero: split the chosen record before writing into it */
	int ea_split;

	/* the request being stored */
	struct gfs2_ea_request *es_er;
	/* old copy of the EA to retire afterwards, or NULL */
	struct gfs2_ea_location *es_el;

	/* buffer and record chosen by ea_set_simple() for the alloc path */
	struct buffer_head *es_bh;
	struct gfs2_ea_header *es_ea;
};
852
853static int ea_set_simple_noalloc(struct gfs2_inode *ip, struct buffer_head *bh,
854 struct gfs2_ea_header *ea, struct ea_set *es)
855{
856 struct gfs2_ea_request *er = es->es_er;
857 struct buffer_head *dibh;
858 int error;
859
860 error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), RES_DINODE + 2 * RES_EATTR, 0);
861 if (error)
862 return error;
863
864 gfs2_trans_add_bh(ip->i_gl, bh, 1);
865
866 if (es->ea_split)
867 ea = ea_split_ea(ea);
868
869 ea_write(ip, ea, er);
870
871 if (es->es_el)
872 ea_set_remove_stuffed(ip, es->es_el);
873
874 error = gfs2_meta_inode_buffer(ip, &dibh);
875 if (error)
876 goto out;
877
878 if (er->er_flags & GFS2_ERF_MODE) {
879 gfs2_assert_withdraw(GFS2_SB(&ip->i_inode),
880 (ip->i_di.di_mode & S_IFMT) == (er->er_mode & S_IFMT));
881 ip->i_di.di_mode = er->er_mode;
882 }
883 ip->i_di.di_ctime = get_seconds();
884 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
885 gfs2_dinode_out(&ip->i_di, dibh->b_data);
886 brelse(dibh);
887 out:
888 gfs2_trans_end(GFS2_SB(&ip->i_inode));
889
890 return error;
891}
892
893static int ea_set_simple_alloc(struct gfs2_inode *ip,
894 struct gfs2_ea_request *er, void *private)
895{
896 struct ea_set *es = private;
897 struct gfs2_ea_header *ea = es->es_ea;
898 int error;
899
900 gfs2_trans_add_bh(ip->i_gl, es->es_bh, 1);
901
902 if (es->ea_split)
903 ea = ea_split_ea(ea);
904
905 error = ea_write(ip, ea, er);
906 if (error)
907 return error;
908
909 if (es->es_el)
910 ea_set_remove_stuffed(ip, es->es_el);
911
912 return 0;
913}
914
915static int ea_set_simple(struct gfs2_inode *ip, struct buffer_head *bh,
916 struct gfs2_ea_header *ea, struct gfs2_ea_header *prev,
917 void *private)
918{
919 struct ea_set *es = private;
920 unsigned int size;
921 int stuffed;
922 int error;
923
924 stuffed = ea_calc_size(GFS2_SB(&ip->i_inode), es->es_er, &size);
925
926 if (ea->ea_type == GFS2_EATYPE_UNUSED) {
927 if (GFS2_EA_REC_LEN(ea) < size)
928 return 0;
929 if (!GFS2_EA_IS_STUFFED(ea)) {
930 error = ea_remove_unstuffed(ip, bh, ea, prev, 1);
931 if (error)
932 return error;
933 }
934 es->ea_split = 0;
935 } else if (GFS2_EA_REC_LEN(ea) - GFS2_EA_SIZE(ea) >= size)
936 es->ea_split = 1;
937 else
938 return 0;
939
940 if (stuffed) {
941 error = ea_set_simple_noalloc(ip, bh, ea, es);
942 if (error)
943 return error;
944 } else {
945 unsigned int blks;
946
947 es->es_bh = bh;
948 es->es_ea = ea;
949 blks = 2 + DIV_ROUND_UP(es->es_er->er_data_len,
950 GFS2_SB(&ip->i_inode)->sd_jbsize);
951
952 error = ea_alloc_skeleton(ip, es->es_er, blks,
953 ea_set_simple_alloc, es);
954 if (error)
955 return error;
956 }
957
958 return 1;
959}
960
961static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er,
962 void *private)
963{
964 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
965 struct buffer_head *indbh, *newbh;
966 uint64_t *eablk;
967 int error;
968 int mh_size = sizeof(struct gfs2_meta_header);
969
970 if (ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT) {
971 uint64_t *end;
972
973 error = gfs2_meta_read(ip->i_gl, ip->i_di.di_eattr,
974 DIO_START | DIO_WAIT, &indbh);
975 if (error)
976 return error;
977
978 if (gfs2_metatype_check(sdp, indbh, GFS2_METATYPE_IN)) {
979 error = -EIO;
980 goto out;
981 }
982
983 eablk = (uint64_t *)(indbh->b_data + mh_size);
984 end = eablk + sdp->sd_inptrs;
985
986 for (; eablk < end; eablk++)
987 if (!*eablk)
988 break;
989
990 if (eablk == end) {
991 error = -ENOSPC;
992 goto out;
993 }
994
995 gfs2_trans_add_bh(ip->i_gl, indbh, 1);
996 } else {
997 uint64_t blk;
998
999 blk = gfs2_alloc_meta(ip);
1000
1001 indbh = gfs2_meta_new(ip->i_gl, blk);
1002 gfs2_trans_add_bh(ip->i_gl, indbh, 1);
1003 gfs2_metatype_set(indbh, GFS2_METATYPE_IN, GFS2_FORMAT_IN);
1004 gfs2_buffer_clear_tail(indbh, mh_size);
1005
1006 eablk = (uint64_t *)(indbh->b_data + mh_size);
1007 *eablk = cpu_to_be64(ip->i_di.di_eattr);
1008 ip->i_di.di_eattr = blk;
1009 ip->i_di.di_flags |= GFS2_DIF_EA_INDIRECT;
1010 ip->i_di.di_blocks++;
1011
1012 eablk++;
1013 }
1014
1015 error = ea_alloc_blk(ip, &newbh);
1016 if (error)
1017 goto out;
1018
1019 *eablk = cpu_to_be64((uint64_t)newbh->b_blocknr);
1020 error = ea_write(ip, GFS2_EA_BH2FIRST(newbh), er);
1021 brelse(newbh);
1022 if (error)
1023 goto out;
1024
1025 if (private)
1026 ea_set_remove_stuffed(ip, (struct gfs2_ea_location *)private);
1027
1028 out:
1029 brelse(indbh);
1030
1031 return error;
1032}
1033
1034static int ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er,
1035 struct gfs2_ea_location *el)
1036{
1037 struct ea_set es;
1038 unsigned int blks = 2;
1039 int error;
1040
1041 memset(&es, 0, sizeof(struct ea_set));
1042 es.es_er = er;
1043 es.es_el = el;
1044
1045 error = ea_foreach(ip, ea_set_simple, &es);
1046 if (error > 0)
1047 return 0;
1048 if (error)
1049 return error;
1050
1051 if (!(ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT))
1052 blks++;
1053 if (GFS2_EAREQ_SIZE_STUFFED(er) > GFS2_SB(&ip->i_inode)->sd_jbsize)
1054 blks += DIV_ROUND_UP(er->er_data_len, GFS2_SB(&ip->i_inode)->sd_jbsize);
1055
1056 return ea_alloc_skeleton(ip, er, blks, ea_set_block, el);
1057}
1058
1059static int ea_set_remove_unstuffed(struct gfs2_inode *ip,
1060 struct gfs2_ea_location *el)
1061{
1062 if (el->el_prev && GFS2_EA2NEXT(el->el_prev) != el->el_ea) {
1063 el->el_prev = GFS2_EA2NEXT(el->el_prev);
1064 gfs2_assert_withdraw(GFS2_SB(&ip->i_inode),
1065 GFS2_EA2NEXT(el->el_prev) == el->el_ea);
1066 }
1067
1068 return ea_remove_unstuffed(ip, el->el_bh, el->el_ea, el->el_prev,0);
1069}
1070
1071int gfs2_ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
1072{
1073 struct gfs2_ea_location el;
1074 int error;
1075
1076 if (!ip->i_di.di_eattr) {
1077 if (er->er_flags & XATTR_REPLACE)
1078 return -ENODATA;
1079 return ea_init(ip, er);
1080 }
1081
1082 error = gfs2_ea_find(ip, er, &el);
1083 if (error)
1084 return error;
1085
1086 if (el.el_ea) {
1087 if (ip->i_di.di_flags & GFS2_DIF_APPENDONLY) {
1088 brelse(el.el_bh);
1089 return -EPERM;
1090 }
1091
1092 error = -EEXIST;
1093 if (!(er->er_flags & XATTR_CREATE)) {
1094 int unstuffed = !GFS2_EA_IS_STUFFED(el.el_ea);
1095 error = ea_set_i(ip, er, &el);
1096 if (!error && unstuffed)
1097 ea_set_remove_unstuffed(ip, &el);
1098 }
1099
1100 brelse(el.el_bh);
1101 } else {
1102 error = -ENODATA;
1103 if (!(er->er_flags & XATTR_REPLACE))
1104 error = ea_set_i(ip, er, NULL);
1105 }
1106
1107 return error;
1108}
1109
1110int gfs2_ea_set(struct gfs2_inode *ip, struct gfs2_ea_request *er)
1111{
1112 struct gfs2_holder i_gh;
1113 int error;
1114
1115 if (!er->er_name_len ||
1116 er->er_name_len > GFS2_EA_MAX_NAME_LEN)
1117 return -EINVAL;
1118 if (!er->er_data || !er->er_data_len) {
1119 er->er_data = NULL;
1120 er->er_data_len = 0;
1121 }
1122 error = ea_check_size(GFS2_SB(&ip->i_inode), er);
1123 if (error)
1124 return error;
1125
1126 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
1127 if (error)
1128 return error;
1129
1130 if (IS_IMMUTABLE(&ip->i_inode))
1131 error = -EPERM;
1132 else
1133 error = gfs2_ea_ops[er->er_type]->eo_set(ip, er);
1134
1135 gfs2_glock_dq_uninit(&i_gh);
1136
1137 return error;
1138}
1139
1140static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el)
1141{
1142 struct gfs2_ea_header *ea = el->el_ea;
1143 struct gfs2_ea_header *prev = el->el_prev;
1144 struct buffer_head *dibh;
1145 int error;
1146
1147 error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), RES_DINODE + RES_EATTR, 0);
1148 if (error)
1149 return error;
1150
1151 gfs2_trans_add_bh(ip->i_gl, el->el_bh, 1);
1152
1153 if (prev) {
1154 uint32_t len;
1155
1156 len = GFS2_EA_REC_LEN(prev) + GFS2_EA_REC_LEN(ea);
1157 prev->ea_rec_len = cpu_to_be32(len);
1158
1159 if (GFS2_EA_IS_LAST(ea))
1160 prev->ea_flags |= GFS2_EAFLAG_LAST;
1161 } else
1162 ea->ea_type = GFS2_EATYPE_UNUSED;
1163
1164 error = gfs2_meta_inode_buffer(ip, &dibh);
1165 if (!error) {
1166 ip->i_di.di_ctime = get_seconds();
1167 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1168 gfs2_dinode_out(&ip->i_di, dibh->b_data);
1169 brelse(dibh);
1170 }
1171
1172 gfs2_trans_end(GFS2_SB(&ip->i_inode));
1173
1174 return error;
1175}
1176
1177int gfs2_ea_remove_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
1178{
1179 struct gfs2_ea_location el;
1180 int error;
1181
1182 if (!ip->i_di.di_eattr)
1183 return -ENODATA;
1184
1185 error = gfs2_ea_find(ip, er, &el);
1186 if (error)
1187 return error;
1188 if (!el.el_ea)
1189 return -ENODATA;
1190
1191 if (GFS2_EA_IS_STUFFED(el.el_ea))
1192 error = ea_remove_stuffed(ip, &el);
1193 else
1194 error = ea_remove_unstuffed(ip, el.el_bh, el.el_ea, el.el_prev,
1195 0);
1196
1197 brelse(el.el_bh);
1198
1199 return error;
1200}
1201
1202/**
1203 * gfs2_ea_remove - sets (or creates or replaces) an extended attribute
1204 * @ip: pointer to the inode of the target file
1205 * @er: request information
1206 *
1207 * Returns: errno
1208 */
1209
1210int gfs2_ea_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er)
1211{
1212 struct gfs2_holder i_gh;
1213 int error;
1214
1215 if (!er->er_name_len || er->er_name_len > GFS2_EA_MAX_NAME_LEN)
1216 return -EINVAL;
1217
1218 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
1219 if (error)
1220 return error;
1221
1222 if (IS_IMMUTABLE(&ip->i_inode) || IS_APPEND(&ip->i_inode))
1223 error = -EPERM;
1224 else
1225 error = gfs2_ea_ops[er->er_type]->eo_remove(ip, er);
1226
1227 gfs2_glock_dq_uninit(&i_gh);
1228
1229 return error;
1230}
1231
1232static int ea_acl_chmod_unstuffed(struct gfs2_inode *ip,
1233 struct gfs2_ea_header *ea, char *data)
1234{
1235 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1236 struct buffer_head **bh;
1237 unsigned int amount = GFS2_EA_DATA_LEN(ea);
1238 unsigned int nptrs = DIV_ROUND_UP(amount, sdp->sd_jbsize);
1239 uint64_t *dataptrs = GFS2_EA2DATAPTRS(ea);
1240 unsigned int x;
1241 int error;
1242
1243 bh = kcalloc(nptrs, sizeof(struct buffer_head *), GFP_KERNEL);
1244 if (!bh)
1245 return -ENOMEM;
1246
1247 error = gfs2_trans_begin(sdp, nptrs + RES_DINODE, 0);
1248 if (error)
1249 goto out;
1250
1251 for (x = 0; x < nptrs; x++) {
1252 error = gfs2_meta_read(ip->i_gl, be64_to_cpu(*dataptrs),
1253 DIO_START, bh + x);
1254 if (error) {
1255 while (x--)
1256 brelse(bh[x]);
1257 goto fail;
1258 }
1259 dataptrs++;
1260 }
1261
1262 for (x = 0; x < nptrs; x++) {
1263 error = gfs2_meta_reread(sdp, bh[x], DIO_WAIT);
1264 if (error) {
1265 for (; x < nptrs; x++)
1266 brelse(bh[x]);
1267 goto fail;
1268 }
1269 if (gfs2_metatype_check(sdp, bh[x], GFS2_METATYPE_ED)) {
1270 for (; x < nptrs; x++)
1271 brelse(bh[x]);
1272 error = -EIO;
1273 goto fail;
1274 }
1275
1276 gfs2_trans_add_bh(ip->i_gl, bh[x], 1);
1277
1278 memcpy(bh[x]->b_data + sizeof(struct gfs2_meta_header),
1279 data,
1280 (sdp->sd_jbsize > amount) ? amount : sdp->sd_jbsize);
1281
1282 amount -= sdp->sd_jbsize;
1283 data += sdp->sd_jbsize;
1284
1285 brelse(bh[x]);
1286 }
1287
1288 out:
1289 kfree(bh);
1290
1291 return error;
1292
1293 fail:
1294 gfs2_trans_end(sdp);
1295 kfree(bh);
1296
1297 return error;
1298}
1299
1300int gfs2_ea_acl_chmod(struct gfs2_inode *ip, struct gfs2_ea_location *el,
1301 struct iattr *attr, char *data)
1302{
1303 struct buffer_head *dibh;
1304 int error;
1305
1306 if (GFS2_EA_IS_STUFFED(el->el_ea)) {
1307 error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), RES_DINODE + RES_EATTR, 0);
1308 if (error)
1309 return error;
1310
1311 gfs2_trans_add_bh(ip->i_gl, el->el_bh, 1);
1312 memcpy(GFS2_EA2DATA(el->el_ea),
1313 data,
1314 GFS2_EA_DATA_LEN(el->el_ea));
1315 } else
1316 error = ea_acl_chmod_unstuffed(ip, el->el_ea, data);
1317
1318 if (error)
1319 return error;
1320
1321 error = gfs2_meta_inode_buffer(ip, &dibh);
1322 if (!error) {
1323 error = inode_setattr(&ip->i_inode, attr);
1324 gfs2_assert_warn(GFS2_SB(&ip->i_inode), !error);
1325 gfs2_inode_attr_out(ip);
1326 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1327 gfs2_dinode_out(&ip->i_di, dibh->b_data);
1328 brelse(dibh);
1329 }
1330
1331 gfs2_trans_end(GFS2_SB(&ip->i_inode));
1332
1333 return error;
1334}
1335
/*
 * ea_dealloc_indirect - free the EA blocks listed in the indirect block
 * @ip: the inode
 *
 * Reads the block at ip->i_di.di_eattr, checks that it is a
 * GFS2_METATYPE_IN block and walks its 64-bit block pointers up to the
 * first zero entry, twice:
 *
 *   pass 1 builds the resource group list and counts the blocks;
 *   pass 2, inside a transaction and with the resource group glocks held,
 *   frees each run of consecutive block numbers with gfs2_free_meta(),
 *   zeroes the pointers and decrements di_blocks.
 *
 * Finally GFS2_DIF_EA_INDIRECT is cleared and the dinode written out.
 * The indirect block itself is not freed here; di_eattr is left set.
 *
 * Returns: errno (0 without doing anything when the first pointer is zero)
 */
1336static int ea_dealloc_indirect(struct gfs2_inode *ip)
1337{
1338 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1339 struct gfs2_rgrp_list rlist;
1340 struct buffer_head *indbh, *dibh;
1341 uint64_t *eablk, *end;
1342 unsigned int rg_blocks = 0;
1343 uint64_t bstart = 0;
1344 unsigned int blen = 0;
1345 unsigned int blks = 0;
1346 unsigned int x;
1347 int error;
1348
1349 memset(&rlist, 0, sizeof(struct gfs2_rgrp_list));
1350
1351 error = gfs2_meta_read(ip->i_gl, ip->i_di.di_eattr,
1352 DIO_START | DIO_WAIT, &indbh);
1353 if (error)
1354 return error;
1355
1356 if (gfs2_metatype_check(sdp, indbh, GFS2_METATYPE_IN)) {
1357 error = -EIO;
1358 goto out;
1359 }
1360
	/* Pointer array starts right after the meta header. */
1361 eablk = (uint64_t *)(indbh->b_data + sizeof(struct gfs2_meta_header));
1362 end = eablk + sdp->sd_inptrs;
1363
	/* Pass 1: one gfs2_rlist_add() per run of consecutive blocks. */
1364 for (; eablk < end; eablk++) {
1365 uint64_t bn;
1366
1367 if (!*eablk)
1368 break;
1369 bn = be64_to_cpu(*eablk);
1370
	/* bn extends the current run [bstart, bstart + blen). */
1371 if (bstart + blen == bn)
1372 blen++;
1373 else {
1374 if (bstart)
1375 gfs2_rlist_add(sdp, &rlist, bstart);
1376 bstart = bn;
1377 blen = 1;
1378 }
1379 blks++;
1380 }
	/* Flush the last run; bstart == 0 means no pointers were found. */
1381 if (bstart)
1382 gfs2_rlist_add(sdp, &rlist, bstart);
1383 else
1384 goto out;
1385
1386 gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE, 0);
1387
	/* Sum ri_length over the resource groups for the reservation below. */
1388 for (x = 0; x < rlist.rl_rgrps; x++) {
1389 struct gfs2_rgrpd *rgd;
1390 rgd = rlist.rl_ghs[x].gh_gl->gl_object;
1391 rg_blocks += rgd->rd_ri.ri_length;
1392 }
1393
1394 error = gfs2_glock_nq_m(rlist.rl_rgrps, rlist.rl_ghs);
1395 if (error)
1396 goto out_rlist_free;
1397
1398 error = gfs2_trans_begin(sdp, rg_blocks + RES_DINODE +
1399 RES_INDIRECT + RES_STATFS +
1400 RES_QUOTA, blks);
1401 if (error)
1402 goto out_gunlock;
1403
1404 gfs2_trans_add_bh(ip->i_gl, indbh, 1);
1405
	/* Pass 2: rewind to the first pointer; 'end' is unchanged. */
1406 eablk = (uint64_t *)(indbh->b_data + sizeof(struct gfs2_meta_header));
1407 bstart = 0;
1408 blen = 0;
1409
1410 for (; eablk < end; eablk++) {
1411 uint64_t bn;
1412
1413 if (!*eablk)
1414 break;
1415 bn = be64_to_cpu(*eablk);
1416
1417 if (bstart + blen == bn)
1418 blen++;
1419 else {
1420 if (bstart)
1421 gfs2_free_meta(ip, bstart, blen);
1422 bstart = bn;
1423 blen = 1;
1424 }
1425
1426 *eablk = 0;
	/* Report an inconsistency if the block count is already zero. */
1427 if (!ip->i_di.di_blocks)
1428 gfs2_consist_inode(ip);
1429 ip->i_di.di_blocks--;
1430 }
1431 if (bstart)
1432 gfs2_free_meta(ip, bstart, blen);
1433
1434 ip->i_di.di_flags &= ~GFS2_DIF_EA_INDIRECT;
1435
1436 error = gfs2_meta_inode_buffer(ip, &dibh);
1437 if (!error) {
1438 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1439 gfs2_dinode_out(&ip->i_di, dibh->b_data);
1440 brelse(dibh);
1441 }
1442
1443 gfs2_trans_end(sdp);
1444
1445 out_gunlock:
1446 gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs);
1447
1448 out_rlist_free:
1449 gfs2_rlist_free(&rlist);
1450
1451 out:
1452 brelse(indbh);
1453
1454 return error;
1455}
1456
1457static int ea_dealloc_block(struct gfs2_inode *ip)
1458{
1459 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1460 struct gfs2_alloc *al = &ip->i_alloc;
1461 struct gfs2_rgrpd *rgd;
1462 struct buffer_head *dibh;
1463 int error;
1464
1465 rgd = gfs2_blk2rgrpd(sdp, ip->i_di.di_eattr);
1466 if (!rgd) {
1467 gfs2_consist_inode(ip);
1468 return -EIO;
1469 }
1470
1471 error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0,
1472 &al->al_rgd_gh);
1473 if (error)
1474 return error;
1475
1476 error = gfs2_trans_begin(sdp, RES_RG_BIT + RES_DINODE +
1477 RES_STATFS + RES_QUOTA, 1);
1478 if (error)
1479 goto out_gunlock;
1480
1481 gfs2_free_meta(ip, ip->i_di.di_eattr, 1);
1482
1483 ip->i_di.di_eattr = 0;
1484 if (!ip->i_di.di_blocks)
1485 gfs2_consist_inode(ip);
1486 ip->i_di.di_blocks--;
1487
1488 error = gfs2_meta_inode_buffer(ip, &dibh);
1489 if (!error) {
1490 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1491 gfs2_dinode_out(&ip->i_di, dibh->b_data);
1492 brelse(dibh);
1493 }
1494
1495 gfs2_trans_end(sdp);
1496
1497 out_gunlock:
1498 gfs2_glock_dq_uninit(&al->al_rgd_gh);
1499
1500 return error;
1501}
1502
1503/**
1504 * gfs2_ea_dealloc - deallocate the extended attribute fork
1505 * @ip: the inode
1506 *
1507 * Returns: errno
1508 */
1509
1510int gfs2_ea_dealloc(struct gfs2_inode *ip)
1511{
1512 struct gfs2_alloc *al;
1513 int error;
1514
1515 al = gfs2_alloc_get(ip);
1516
1517 error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
1518 if (error)
1519 goto out_alloc;
1520
1521 error = gfs2_rindex_hold(GFS2_SB(&ip->i_inode), &al->al_ri_gh);
1522 if (error)
1523 goto out_quota;
1524
1525 error = ea_foreach(ip, ea_dealloc_unstuffed, NULL);
1526 if (error)
1527 goto out_rindex;
1528
1529 if (ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT) {
1530 error = ea_dealloc_indirect(ip);
1531 if (error)
1532 goto out_rindex;
1533 }
1534
1535 error = ea_dealloc_block(ip);
1536
1537 out_rindex:
1538 gfs2_glock_dq_uninit(&al->al_ri_gh);
1539
1540 out_quota:
1541 gfs2_quota_unhold(ip);
1542
1543 out_alloc:
1544 gfs2_alloc_put(ip);
1545
1546 return error;
1547}
1548
diff --git a/fs/gfs2/eattr.h b/fs/gfs2/eattr.h
new file mode 100644
index 000000000000..ae199692e51d
--- /dev/null
+++ b/fs/gfs2/eattr.h
@@ -0,0 +1,97 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __EATTR_DOT_H__
11#define __EATTR_DOT_H__
12
/* Record length and data length of an EA header, converted from big-endian. */
13#define GFS2_EA_REC_LEN(ea) be32_to_cpu((ea)->ea_rec_len)
14#define GFS2_EA_DATA_LEN(ea) be32_to_cpu((ea)->ea_data_len)
15
/*
 * Bytes used by an EA record: header + name + either the stuffed data or
 * one 64-bit pointer per data block, rounded up to a multiple of 8.
 */
16#define GFS2_EA_SIZE(ea) \
17ALIGN(sizeof(struct gfs2_ea_header) + (ea)->ea_name_len + \
18 ((GFS2_EA_IS_STUFFED(ea)) ? GFS2_EA_DATA_LEN(ea) : \
19 (sizeof(uint64_t) * (ea)->ea_num_ptrs)), 8)
20
/* An EA is "stuffed" when it has no data block pointers. */
21#define GFS2_EA_IS_STUFFED(ea) (!(ea)->ea_num_ptrs)
/* Non-zero when the record carries GFS2_EAFLAG_LAST. */
22#define GFS2_EA_IS_LAST(ea) ((ea)->ea_flags & GFS2_EAFLAG_LAST)
23
/* Record size a request would need if its data were stuffed. */
24#define GFS2_EAREQ_SIZE_STUFFED(er) \
25ALIGN(sizeof(struct gfs2_ea_header) + (er)->er_name_len + (er)->er_data_len, 8)
26
/* Record size a request would need with one pointer per sd_jbsize of data. */
27#define GFS2_EAREQ_SIZE_UNSTUFFED(sdp, er) \
28ALIGN(sizeof(struct gfs2_ea_header) + (er)->er_name_len + \
29 sizeof(uint64_t) * DIV_ROUND_UP((er)->er_data_len, (sdp)->sd_jbsize), 8)
30
/* The name follows the header directly; stuffed data follows the name. */
31#define GFS2_EA2NAME(ea) ((char *)((struct gfs2_ea_header *)(ea) + 1))
32#define GFS2_EA2DATA(ea) (GFS2_EA2NAME(ea) + (ea)->ea_name_len)
33
/* Data block pointers start after the name padded to a multiple of 8. */
34#define GFS2_EA2DATAPTRS(ea) \
35((uint64_t *)(GFS2_EA2NAME(ea) + ALIGN((ea)->ea_name_len, 8)))
36
/* Next record: ea_rec_len bytes past the start of this one. */
37#define GFS2_EA2NEXT(ea) \
38((struct gfs2_ea_header *)((char *)(ea) + GFS2_EA_REC_LEN(ea)))
39
/* First record in an EA block: immediately after the meta header. */
40#define GFS2_EA_BH2FIRST(bh) \
41((struct gfs2_ea_header *)((bh)->b_data + sizeof(struct gfs2_meta_header)))
42
/* NOTE(review): presumably a flag bit for er_flags -- confirm against users. */
43#define GFS2_ERF_MODE 0x80000000
44
/*
 * Describes one extended attribute operation.  gfs2_ea_set() and
 * gfs2_ea_remove() reject a request whose er_name_len is zero or larger
 * than GFS2_EA_MAX_NAME_LEN, and er_type selects the entry of
 * gfs2_ea_ops[] that handles it.
 */
45struct gfs2_ea_request {
46 char *er_name;
47 char *er_data;
48 unsigned int er_name_len;
49 unsigned int er_data_len;
50 unsigned int er_type; /* GFS2_EATYPE_... */
51 int er_flags; /* tested against XATTR_CREATE / XATTR_REPLACE */
52 mode_t er_mode;
53};
54
/*
 * Result of gfs2_ea_find(): the buffer holding the record (released by the
 * caller with brelse()), the record itself (NULL when not found) and the
 * record passed as its predecessor to the removal helpers.
 */
55struct gfs2_ea_location {
56 struct buffer_head *el_bh;
57 struct gfs2_ea_header *el_ea;
58 struct gfs2_ea_header *el_prev;
59};
60
61int gfs2_ea_get_i(struct gfs2_inode *ip, struct gfs2_ea_request *er);
62int gfs2_ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er);
63int gfs2_ea_remove_i(struct gfs2_inode *ip, struct gfs2_ea_request *er);
64
65int gfs2_ea_list(struct gfs2_inode *ip, struct gfs2_ea_request *er);
66int gfs2_ea_get(struct gfs2_inode *ip, struct gfs2_ea_request *er);
67int gfs2_ea_set(struct gfs2_inode *ip, struct gfs2_ea_request *er);
68int gfs2_ea_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er);
69
70int gfs2_ea_dealloc(struct gfs2_inode *ip);
71
72/* Exported to acl.c */
73
74int gfs2_ea_find(struct gfs2_inode *ip,
75 struct gfs2_ea_request *er,
76 struct gfs2_ea_location *el);
77int gfs2_ea_get_copy(struct gfs2_inode *ip,
78 struct gfs2_ea_location *el,
79 char *data);
80int gfs2_ea_acl_chmod(struct gfs2_inode *ip, struct gfs2_ea_location *el,
81 struct iattr *attr, char *data);
82
83static inline unsigned int gfs2_ea_strlen(struct gfs2_ea_header *ea)
84{
85 switch (ea->ea_type) {
86 case GFS2_EATYPE_USR:
87 return (5 + (ea->ea_name_len + 1));
88 case GFS2_EATYPE_SYS:
89 return (7 + (ea->ea_name_len + 1));
90 case GFS2_EATYPE_SECURITY:
91 return (9 + (ea->ea_name_len + 1));
92 default:
93 return (0);
94 }
95}
96
97#endif /* __EATTR_DOT_H__ */
diff --git a/fs/gfs2/format.h b/fs/gfs2/format.h
new file mode 100644
index 000000000000..239f0c3553fc
--- /dev/null
+++ b/fs/gfs2/format.h
@@ -0,0 +1,21 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __FORMAT_DOT_H__
11#define __FORMAT_DOT_H__
12
/*
 * NOTE(review): both tables currently hold only a single 0 entry;
 * presumably 0 terminates a list of older format numbers -- confirm
 * against the code that scans them.
 */
13static const uint32_t gfs2_old_fs_formats[] = {
14 0
15};
16
17static const uint32_t gfs2_old_multihost_formats[] = {
18 0
19};
20
21#endif /* __FORMAT_DOT_H__ */
diff --git a/fs/gfs2/gfs2.h b/fs/gfs2/gfs2.h
new file mode 100644
index 000000000000..6edbd551a4c0
--- /dev/null
+++ b/fs/gfs2/gfs2.h
@@ -0,0 +1,31 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __GFS2_DOT_H__
11#define __GFS2_DOT_H__
12
/*
 * Named 0/1 constants.
 * NOTE(review): presumably passed as boolean-style arguments (e.g. the
 * "create" parameter of gfs2_glock_get()) -- confirm against callers.
 */
13enum {
14 NO_CREATE = 0,
15 CREATE = 1,
16};
17
18enum {
19 NO_WAIT = 0,
20 WAIT = 1,
21};
22
23enum {
24 NO_FORCE = 0,
25 FORCE = 1,
26};
27
28#define GFS2_FAST_NAME_SIZE 8
29
30#endif /* __GFS2_DOT_H__ */
31
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
new file mode 100644
index 000000000000..b6edf7c0923b
--- /dev/null
+++ b/fs/gfs2/glock.c
@@ -0,0 +1,2279 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/delay.h>
16#include <linux/sort.h>
17#include <linux/jhash.h>
18#include <linux/kref.h>
19#include <linux/kallsyms.h>
20#include <linux/gfs2_ondisk.h>
21#include <asm/uaccess.h>
22
23#include "gfs2.h"
24#include "lm_interface.h"
25#include "incore.h"
26#include "glock.h"
27#include "glops.h"
28#include "inode.h"
29#include "lm.h"
30#include "lops.h"
31#include "meta_io.h"
32#include "quota.h"
33#include "super.h"
34#include "util.h"
35
/*
 * search_bucket() skips hash-list entries that have GLF_PLUG set in
 * gl_flags.
 * NOTE(review): presumably a glock_plug is such a placeholder entry linked
 * into a bucket list, which is why its layout must match -- confirm.
 */
36/* Must be kept in sync with the beginning of struct gfs2_glock */
37struct glock_plug {
38 struct list_head gl_list;
39 unsigned long gl_flags;
40};
41
/*
 * A holder bundled with a work item.  rq_greedy() recovers the enclosing
 * struct from the holder with container_of() and kfree()s it.
 */
42struct greedy {
43 struct gfs2_holder gr_gh;
44 struct work_struct gr_work;
45};
46
/* Callback type taking a single glock. */
47typedef void (*glock_examiner) (struct gfs2_glock * gl);
48
49static int gfs2_dump_lockstate(struct gfs2_sbd *sdp);
50static int dump_glock(struct gfs2_glock *gl);
51
52/**
53 * relaxed_state_ok - is a requested lock compatible with the current lock mode?
54 * @actual: the current state of the lock
55 * @requested: the lock state that was requested by the caller
56 * @flags: the modifier flags passed in by the caller
57 *
58 * Returns: 1 if the locks are compatible, 0 otherwise
59 */
60
61static inline int relaxed_state_ok(unsigned int actual, unsigned requested,
62 int flags)
63{
64 if (actual == requested)
65 return 1;
66
67 if (flags & GL_EXACT)
68 return 0;
69
70 if (actual == LM_ST_EXCLUSIVE && requested == LM_ST_SHARED)
71 return 1;
72
73 if (actual != LM_ST_UNLOCKED && (flags & LM_FLAG_ANY))
74 return 1;
75
76 return 0;
77}
78
79/**
80 * gl_hash() - Turn glock number into hash bucket number
81 * @lock: The glock number
82 *
83 * Returns: The number of the corresponding hash bucket
84 */
85
86static unsigned int gl_hash(struct lm_lockname *name)
87{
88 unsigned int h;
89
90 h = jhash(&name->ln_number, sizeof(uint64_t), 0);
91 h = jhash(&name->ln_type, sizeof(unsigned int), h);
92 h &= GFS2_GL_HASH_MASK;
93
94 return h;
95}
96
97/**
98 * glock_free() - Perform a few checks and then release struct gfs2_glock
99 * @gl: The glock to release
100 *
101 * Also calls lock module to release its internal structure for this glock.
102 *
103 */
104
105static void glock_free(struct gfs2_glock *gl)
106{
107 struct gfs2_sbd *sdp = gl->gl_sbd;
108 struct inode *aspace = gl->gl_aspace;
109
110 gfs2_lm_put_lock(sdp, gl->gl_lock);
111
112 if (aspace)
113 gfs2_aspace_put(aspace);
114
115 kmem_cache_free(gfs2_glock_cachep, gl);
116}
117
118/**
119 * gfs2_glock_hold() - increment reference count on glock
120 * @gl: The glock to hold
121 *
122 */
123
124void gfs2_glock_hold(struct gfs2_glock *gl)
125{
126 kref_get(&gl->gl_ref);
127}
128
129/* All work is done after the return from kref_put() so we
130 can release the write_lock before the free. */
131
132static void kill_glock(struct kref *kref)
133{
134 struct gfs2_glock *gl = container_of(kref, struct gfs2_glock, gl_ref);
135 struct gfs2_sbd *sdp = gl->gl_sbd;
136
137 gfs2_assert(sdp, gl->gl_state == LM_ST_UNLOCKED);
138 gfs2_assert(sdp, list_empty(&gl->gl_reclaim));
139 gfs2_assert(sdp, list_empty(&gl->gl_holders));
140 gfs2_assert(sdp, list_empty(&gl->gl_waiters1));
141 gfs2_assert(sdp, list_empty(&gl->gl_waiters2));
142 gfs2_assert(sdp, list_empty(&gl->gl_waiters3));
143}
144
145/**
146 * gfs2_glock_put() - Decrement reference count on glock
147 * @gl: The glock to put
148 *
149 */
150
151int gfs2_glock_put(struct gfs2_glock *gl)
152{
153 struct gfs2_sbd *sdp = gl->gl_sbd;
154 struct gfs2_gl_hash_bucket *bucket = gl->gl_bucket;
155 int rv = 0;
156
157 mutex_lock(&sdp->sd_invalidate_inodes_mutex);
158
159 write_lock(&bucket->hb_lock);
160 if (kref_put(&gl->gl_ref, kill_glock)) {
161 list_del_init(&gl->gl_list);
162 write_unlock(&bucket->hb_lock);
163 BUG_ON(spin_is_locked(&gl->gl_spin));
164 glock_free(gl);
165 rv = 1;
166 goto out;
167 }
168 write_unlock(&bucket->hb_lock);
169 out:
170 mutex_unlock(&sdp->sd_invalidate_inodes_mutex);
171 return rv;
172}
173
174/**
175 * queue_empty - check to see if a glock's queue is empty
176 * @gl: the glock
177 * @head: the head of the queue to check
178 *
179 * This function protects the list in the event that a process already
180 * has a holder on the list and is adding a second holder for itself.
181 * The glmutex lock is what generally prevents processes from working
182 * on the same glock at once, but the special case of adding a second
183 * holder for yourself ("recursive" locking) doesn't involve locking
184 * glmutex, making the spin lock necessary.
185 *
186 * Returns: 1 if the queue is empty
187 */
188
189static inline int queue_empty(struct gfs2_glock *gl, struct list_head *head)
190{
191 int empty;
192 spin_lock(&gl->gl_spin);
193 empty = list_empty(head);
194 spin_unlock(&gl->gl_spin);
195 return empty;
196}
197
198/**
199 * search_bucket() - Find struct gfs2_glock by lock number
200 * @bucket: the bucket to search
201 * @name: The lock name
202 *
203 * Returns: NULL, or the struct gfs2_glock with the requested number
204 */
205
206static struct gfs2_glock *search_bucket(struct gfs2_gl_hash_bucket *bucket,
207 struct lm_lockname *name)
208{
209 struct gfs2_glock *gl;
210
211 list_for_each_entry(gl, &bucket->hb_list, gl_list) {
212 if (test_bit(GLF_PLUG, &gl->gl_flags))
213 continue;
214 if (!lm_name_equal(&gl->gl_name, name))
215 continue;
216
217 kref_get(&gl->gl_ref);
218
219 return gl;
220 }
221
222 return NULL;
223}
224
225/**
226 * gfs2_glock_find() - Find glock by lock number
227 * @sdp: The GFS2 superblock
228 * @name: The lock name
229 *
230 * Returns: NULL, or the struct gfs2_glock with the requested number
231 */
232
233static struct gfs2_glock *gfs2_glock_find(struct gfs2_sbd *sdp,
234 struct lm_lockname *name)
235{
236 struct gfs2_gl_hash_bucket *bucket = &sdp->sd_gl_hash[gl_hash(name)];
237 struct gfs2_glock *gl;
238
239 read_lock(&bucket->hb_lock);
240 gl = search_bucket(bucket, name);
241 read_unlock(&bucket->hb_lock);
242
243 return gl;
244}
245
246/**
247 * gfs2_glock_get() - Get a glock, or create one if one doesn't exist
248 * @sdp: The GFS2 superblock
249 * @number: the lock number
250 * @glops: The glock_operations to use
251 * @create: If 0, don't create the glock if it doesn't exist
252 * @glp: the glock is returned here
253 *
254 * This does not lock a glock, just finds/creates structures for one.
255 *
 * On success *@glp holds a referenced glock, or NULL when the glock does
 * not exist and @create is 0.  On failure *@glp is not written.
 *
256 * Returns: errno
257 */
258
259int gfs2_glock_get(struct gfs2_sbd *sdp, uint64_t number,
260 struct gfs2_glock_operations *glops, int create,
261 struct gfs2_glock **glp)
262{
263 struct lm_lockname name;
264 struct gfs2_glock *gl, *tmp;
265 struct gfs2_gl_hash_bucket *bucket;
266 int error;
267
268 name.ln_number = number;
269 name.ln_type = glops->go_type;
270 bucket = &sdp->sd_gl_hash[gl_hash(&name)];
271
	/* Fast path: look the glock up under the read lock only. */
272 read_lock(&bucket->hb_lock);
273 gl = search_bucket(bucket, &name);
274 read_unlock(&bucket->hb_lock);
275
276 if (gl || !create) {
277 *glp = gl;
278 return 0;
279 }
280
	/* Not found: build a new glock outside of any bucket lock. */
281 gl = kmem_cache_alloc(gfs2_glock_cachep, GFP_KERNEL);
282 if (!gl)
283 return -ENOMEM;
284
285 memset(gl, 0, sizeof(struct gfs2_glock));
286
287 INIT_LIST_HEAD(&gl->gl_list);
288 gl->gl_name = name;
289 kref_init(&gl->gl_ref);
290
291 spin_lock_init(&gl->gl_spin);
292
293 gl->gl_state = LM_ST_UNLOCKED;
294 gl->gl_owner = NULL;
295 gl->gl_ip = 0;
296 INIT_LIST_HEAD(&gl->gl_holders);
297 INIT_LIST_HEAD(&gl->gl_waiters1);
298 INIT_LIST_HEAD(&gl->gl_waiters2);
299 INIT_LIST_HEAD(&gl->gl_waiters3);
300
301 gl->gl_ops = glops;
302
303 gl->gl_bucket = bucket;
304 INIT_LIST_HEAD(&gl->gl_reclaim);
305
306 gl->gl_sbd = sdp;
307
308 lops_init_le(&gl->gl_le, &gfs2_glock_lops);
309 INIT_LIST_HEAD(&gl->gl_ail_list);
310
311 /* If this glock protects actual on-disk data or metadata blocks,
312 create a VFS inode to manage the pages/buffers holding them. */
313 if (glops == &gfs2_inode_glops ||
314 glops == &gfs2_rgrp_glops) {
315 gl->gl_aspace = gfs2_aspace_get(sdp);
316 if (!gl->gl_aspace) {
317 error = -ENOMEM;
318 goto fail;
319 }
320 }
321
322 error = gfs2_lm_get_lock(sdp, &name, &gl->gl_lock);
323 if (error)
324 goto fail_aspace;
325
	/*
	 * Search again under the write lock: the same name may have been
	 * inserted since the first lookup.  If so, free ours and return
	 * the existing glock (already referenced by search_bucket()).
	 */
326 write_lock(&bucket->hb_lock);
327 tmp = search_bucket(bucket, &name);
328 if (tmp) {
329 write_unlock(&bucket->hb_lock);
330 glock_free(gl);
331 gl = tmp;
332 } else {
333 list_add_tail(&gl->gl_list, &bucket->hb_list);
334 write_unlock(&bucket->hb_lock);
335 }
336
337 *glp = gl;
338
339 return 0;
340
	/* gl_aspace is NULL here for glops without an address space. */
341 fail_aspace:
342 if (gl->gl_aspace)
343 gfs2_aspace_put(gl->gl_aspace);
344
345 fail:
346 kmem_cache_free(gfs2_glock_cachep, gl);
347
348 return error;
349}
350
351/**
352 * gfs2_holder_init - initialize a struct gfs2_holder in the default way
353 * @gl: the glock
354 * @state: the state we're requesting
355 * @flags: the modifier flags
356 * @gh: the holder structure
357 *
358 */
359
360void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags,
361 struct gfs2_holder *gh)
362{
363 INIT_LIST_HEAD(&gh->gh_list);
364 gh->gh_gl = gl;
365 gh->gh_ip = (unsigned long)__builtin_return_address(0);
366 gh->gh_owner = current;
367 gh->gh_state = state;
368 gh->gh_flags = flags;
369 gh->gh_error = 0;
370 gh->gh_iflags = 0;
371 init_completion(&gh->gh_wait);
372
373 if (gh->gh_state == LM_ST_EXCLUSIVE)
374 gh->gh_flags |= GL_LOCAL_EXCL;
375
376 gfs2_glock_hold(gl);
377}
378
379/**
380 * gfs2_holder_reinit - reinitialize a struct gfs2_holder so we can requeue it
381 * @state: the state we're requesting
382 * @flags: the modifier flags
383 * @gh: the holder structure
384 *
385 * Don't mess with the glock.
386 *
387 */
388
389void gfs2_holder_reinit(unsigned int state, unsigned flags, struct gfs2_holder *gh)
390{
391 gh->gh_state = state;
392 gh->gh_flags = flags;
393 if (gh->gh_state == LM_ST_EXCLUSIVE)
394 gh->gh_flags |= GL_LOCAL_EXCL;
395
396 gh->gh_iflags &= 1 << HIF_ALLOCED;
397 gh->gh_ip = (unsigned long)__builtin_return_address(0);
398}
399
400/**
401 * gfs2_holder_uninit - uninitialize a holder structure (drop glock reference)
402 * @gh: the holder structure
403 *
404 */
405
406void gfs2_holder_uninit(struct gfs2_holder *gh)
407{
408 gfs2_glock_put(gh->gh_gl);
409 gh->gh_gl = NULL;
410 gh->gh_ip = 0;
411}
412
413/**
414 * gfs2_holder_get - get a struct gfs2_holder structure
415 * @gl: the glock
416 * @state: the state we're requesting
417 * @flags: the modifier flags
418 * @gfp_flags:
419 *
420 * Figure out how big an impact this function has. Either:
421 * 1) Replace it with a cache of structures hanging off the struct gfs2_sbd
422 * 2) Leave it like it is
423 *
424 * Returns: the holder structure, NULL on ENOMEM
425 */
426
427static struct gfs2_holder *gfs2_holder_get(struct gfs2_glock *gl,
428 unsigned int state,
429 int flags, gfp_t gfp_flags)
430{
431 struct gfs2_holder *gh;
432
433 gh = kmalloc(sizeof(struct gfs2_holder), gfp_flags);
434 if (!gh)
435 return NULL;
436
437 gfs2_holder_init(gl, state, flags, gh);
438 set_bit(HIF_ALLOCED, &gh->gh_iflags);
439 gh->gh_ip = (unsigned long)__builtin_return_address(0);
440 return gh;
441}
442
/**
 * gfs2_holder_put - uninitialize and free a struct gfs2_holder
 * @gh: the holder structure, as obtained from gfs2_holder_get()
 *
 */

static void gfs2_holder_put(struct gfs2_holder *gh)
{
	/* Drop the glock reference before the memory goes away. */
	gfs2_holder_uninit(gh);

	kfree(gh);
}
454
455/**
456 * rq_mutex - process a mutex request in the queue
457 * @gh: the glock holder
458 *
459 * Returns: 1 if the queue is blocked
460 */
461
462static int rq_mutex(struct gfs2_holder *gh)
463{
464 struct gfs2_glock *gl = gh->gh_gl;
465
466 list_del_init(&gh->gh_list);
467 /* gh->gh_error never examined. */
468 set_bit(GLF_LOCK, &gl->gl_flags);
469 complete(&gh->gh_wait);
470
471 return 1;
472}
473
474/**
475 * rq_promote - process a promote request in the queue
476 * @gh: the glock holder
477 *
478 * Acquire a new inter-node lock, or change a lock state to more restrictive.
479 *
 * NOTE(review): gl->gl_spin is dropped and retaken around go_xmote_th(),
 * so this appears to expect the caller to hold gl->gl_spin -- confirm.
 *
480 * Returns: 1 if the queue is blocked
 *          0 if the holder was granted and moved to gl_holders
481 */
482
483static int rq_promote(struct gfs2_holder *gh)
484{
485 struct gfs2_glock *gl = gh->gh_gl;
486 struct gfs2_sbd *sdp = gl->gl_sbd;
487 struct gfs2_glock_operations *glops = gl->gl_ops;
488
	/* Current state does not satisfy the request. */
489 if (!relaxed_state_ok(gl->gl_state, gh->gh_state, gh->gh_flags)) {
	/* Only start a state change when nobody holds the glock. */
490 if (list_empty(&gl->gl_holders)) {
491 gl->gl_req_gh = gh;
492 set_bit(GLF_LOCK, &gl->gl_flags);
493 spin_unlock(&gl->gl_spin);
494
	/* Over the reclaim limit: reclaim two glocks, unless priority. */
495 if (atomic_read(&sdp->sd_reclaim_count) >
496 gfs2_tune_get(sdp, gt_reclaim_limit) &&
497 !(gh->gh_flags & LM_FLAG_PRIORITY)) {
498 gfs2_reclaim_glock(sdp);
499 gfs2_reclaim_glock(sdp);
500 }
501
502 glops->go_xmote_th(gl, gh->gh_state,
503 gh->gh_flags);
504
505 spin_lock(&gl->gl_spin);
506 }
507 return 1;
508 }
509
	/* State is compatible: grant, subject to the local-exclusive rules. */
510 if (list_empty(&gl->gl_holders)) {
511 set_bit(HIF_FIRST, &gh->gh_iflags);
512 set_bit(GLF_LOCK, &gl->gl_flags);
513 } else {
514 struct gfs2_holder *next_gh;
	/* A GL_LOCAL_EXCL request cannot join existing holders ... */
515 if (gh->gh_flags & GL_LOCAL_EXCL)
516 return 1;
517 next_gh = list_entry(gl->gl_holders.next, struct gfs2_holder,
518 gh_list);
	/* ... and nothing can join when the first holder is GL_LOCAL_EXCL. */
519 if (next_gh->gh_flags & GL_LOCAL_EXCL)
520 return 1;
521 }
522
523 list_move_tail(&gh->gh_list, &gl->gl_holders);
524 gh->gh_error = 0;
525 set_bit(HIF_HOLDER, &gh->gh_iflags);
526
527 complete(&gh->gh_wait);
528
529 return 0;
530}
531
532/**
533 * rq_demote - process a demote request in the queue
534 * @gh: the glock holder
535 *
 * NOTE(review): gl->gl_spin is dropped and retaken inside, so this
 * appears to expect the caller to hold gl->gl_spin -- confirm.
 *
536 * Returns: 1 if the queue is blocked
 *          0 once the request was completed or a state change was started
537 */
538
539static int rq_demote(struct gfs2_holder *gh)
540{
541 struct gfs2_glock *gl = gh->gh_gl;
542 struct gfs2_glock_operations *glops = gl->gl_ops;
543
	/* Cannot demote while anyone holds the glock. */
544 if (!list_empty(&gl->gl_holders))
545 return 1;
546
	/* Already in the target state, or unlocked: nothing to change. */
547 if (gl->gl_state == gh->gh_state || gl->gl_state == LM_ST_UNLOCKED) {
548 list_del_init(&gh->gh_list);
549 gh->gh_error = 0;
550 spin_unlock(&gl->gl_spin);
	/* HIF_DEALLOC holders are freed here; others wake their waiter. */
551 if (test_bit(HIF_DEALLOC, &gh->gh_iflags))
552 gfs2_holder_put(gh);
553 else
554 complete(&gh->gh_wait);
555 spin_lock(&gl->gl_spin);
556 } else {
557 gl->gl_req_gh = gh;
558 set_bit(GLF_LOCK, &gl->gl_flags);
559 spin_unlock(&gl->gl_spin);
560
	/* Drop unless going from exclusive to a state other than unlocked. */
561 if (gh->gh_state == LM_ST_UNLOCKED ||
562 gl->gl_state != LM_ST_EXCLUSIVE)
563 glops->go_drop_th(gl);
564 else
565 glops->go_xmote_th(gl, gh->gh_state, gh->gh_flags);
566
567 spin_lock(&gl->gl_spin);
568 }
569
570 return 0;
571}
572
573/**
574 * rq_greedy - process a queued request to drop greedy status
575 * @gh: the glock holder
576 *
577 * Returns: 1 if the queue is blocked
578 */
579
580static int rq_greedy(struct gfs2_holder *gh)
581{
582 struct gfs2_glock *gl = gh->gh_gl;
583
584 list_del_init(&gh->gh_list);
585 /* gh->gh_error never examined. */
586 clear_bit(GLF_GREEDY, &gl->gl_flags);
587 spin_unlock(&gl->gl_spin);
588
589 gfs2_holder_uninit(gh);
590 kfree(container_of(gh, struct greedy, gr_gh));
591
592 spin_lock(&gl->gl_spin);
593
594 return 0;
595}
596
597/**
598 * run_queue - process holder structures on a glock
599 * @gl: the glock
600 *
 * Repeatedly serves the first entry of the highest-priority non-empty
 * queue: gl_waiters1, then gl_waiters2 (skipped while GLF_SKIP_WAITERS2
 * is set), then gl_waiters3.  The loop ends when GLF_LOCK is set, when
 * no queue has work, or when a handler reports the queue as blocked.
 *
601 */
602static void run_queue(struct gfs2_glock *gl)
603{
604 struct gfs2_holder *gh;
	/* Stays at its previous value when a holder has an unexpected type. */
605 int blocked = 1;
606
607 for (;;) {
608 if (test_bit(GLF_LOCK, &gl->gl_flags))
609 break;
610
	/* waiters1: only HIF_MUTEX holders are expected here. */
611 if (!list_empty(&gl->gl_waiters1)) {
612 gh = list_entry(gl->gl_waiters1.next,
613 struct gfs2_holder, gh_list);
614
615 if (test_bit(HIF_MUTEX, &gh->gh_iflags))
616 blocked = rq_mutex(gh);
617 else
618 gfs2_assert_warn(gl->gl_sbd, 0);
619
	/* waiters2: HIF_DEMOTE or HIF_GREEDY holders. */
620 } else if (!list_empty(&gl->gl_waiters2) &&
621 !test_bit(GLF_SKIP_WAITERS2, &gl->gl_flags)) {
622 gh = list_entry(gl->gl_waiters2.next,
623 struct gfs2_holder, gh_list);
624
625 if (test_bit(HIF_DEMOTE, &gh->gh_iflags))
626 blocked = rq_demote(gh);
627 else if (test_bit(HIF_GREEDY, &gh->gh_iflags))
628 blocked = rq_greedy(gh);
629 else
630 gfs2_assert_warn(gl->gl_sbd, 0);
631
	/* waiters3: HIF_PROMOTE holders. */
632 } else if (!list_empty(&gl->gl_waiters3)) {
633 gh = list_entry(gl->gl_waiters3.next,
634 struct gfs2_holder, gh_list);
635
636 if (test_bit(HIF_PROMOTE, &gh->gh_iflags))
637 blocked = rq_promote(gh);
638 else
639 gfs2_assert_warn(gl->gl_sbd, 0);
640
641 } else
642 break;
643
644 if (blocked)
645 break;
646 }
647}
648
649/**
650 * gfs2_glmutex_lock - acquire a local lock on a glock
651 * @gl: the glock
652 *
653 * Gives caller exclusive access to manipulate a glock structure.
654 */
655
656static void gfs2_glmutex_lock(struct gfs2_glock *gl)
657{
658 struct gfs2_holder gh;
659
660 gfs2_holder_init(gl, 0, 0, &gh);
661 set_bit(HIF_MUTEX, &gh.gh_iflags);
662
663 spin_lock(&gl->gl_spin);
664 if (test_and_set_bit(GLF_LOCK, &gl->gl_flags))
665 list_add_tail(&gh.gh_list, &gl->gl_waiters1);
666 else {
667 gl->gl_owner = current;
668 gl->gl_ip = (unsigned long)__builtin_return_address(0);
669 complete(&gh.gh_wait);
670 }
671 spin_unlock(&gl->gl_spin);
672
673 wait_for_completion(&gh.gh_wait);
674 gfs2_holder_uninit(&gh);
675}
676
677/**
678 * gfs2_glmutex_trylock - try to acquire a local lock on a glock
679 * @gl: the glock
680 *
681 * Returns: 1 if the glock is acquired
682 */
683
684static int gfs2_glmutex_trylock(struct gfs2_glock *gl)
685{
686 int acquired = 1;
687
688 spin_lock(&gl->gl_spin);
689 if (test_and_set_bit(GLF_LOCK, &gl->gl_flags))
690 acquired = 0;
691 else {
692 gl->gl_owner = current;
693 gl->gl_ip = (unsigned long)__builtin_return_address(0);
694 }
695 spin_unlock(&gl->gl_spin);
696
697 return acquired;
698}
699
700/**
701 * gfs2_glmutex_unlock - release a local lock on a glock
702 * @gl: the glock
703 *
704 */
705
706static void gfs2_glmutex_unlock(struct gfs2_glock *gl)
707{
708 spin_lock(&gl->gl_spin);
709 clear_bit(GLF_LOCK, &gl->gl_flags);
710 gl->gl_owner = NULL;
711 gl->gl_ip = 0;
712 run_queue(gl);
713 BUG_ON(!spin_is_locked(&gl->gl_spin));
714 spin_unlock(&gl->gl_spin);
715}
716
717/**
718 * handle_callback - add a demote request to a lock's queue
719 * @gl: the glock
720 * @state: the state the caller wants us to change to
721 *
722 * Note: This may fail sliently if we are out of memory.
723 */
724
725static void handle_callback(struct gfs2_glock *gl, unsigned int state)
726{
727 struct gfs2_holder *gh, *new_gh = NULL;
728
729restart:
730 spin_lock(&gl->gl_spin);
731
732 list_for_each_entry(gh, &gl->gl_waiters2, gh_list) {
733 if (test_bit(HIF_DEMOTE, &gh->gh_iflags) &&
734 gl->gl_req_gh != gh) {
735 if (gh->gh_state != state)
736 gh->gh_state = LM_ST_UNLOCKED;
737 goto out;
738 }
739 }
740
741 if (new_gh) {
742 list_add_tail(&new_gh->gh_list, &gl->gl_waiters2);
743 new_gh = NULL;
744 } else {
745 spin_unlock(&gl->gl_spin);
746
747 new_gh = gfs2_holder_get(gl, state, LM_FLAG_TRY, GFP_KERNEL);
748 if (!new_gh)
749 return;
750 set_bit(HIF_DEMOTE, &new_gh->gh_iflags);
751 set_bit(HIF_DEALLOC, &new_gh->gh_iflags);
752
753 goto restart;
754 }
755
756out:
757 spin_unlock(&gl->gl_spin);
758
759 if (new_gh)
760 gfs2_holder_put(new_gh);
761}
762
763void gfs2_glock_inode_squish(struct inode *inode)
764{
765 struct gfs2_holder gh;
766 struct gfs2_glock *gl = GFS2_I(inode)->i_gl;
767 gfs2_holder_init(gl, LM_ST_UNLOCKED, 0, &gh);
768 set_bit(HIF_DEMOTE, &gh.gh_iflags);
769 spin_lock(&gl->gl_spin);
770 gfs2_assert(inode->i_sb->s_fs_info, list_empty(&gl->gl_holders));
771 list_add_tail(&gh.gh_list, &gl->gl_waiters2);
772 run_queue(gl);
773 spin_unlock(&gl->gl_spin);
774 wait_for_completion(&gh.gh_wait);
775 gfs2_holder_uninit(&gh);
776}
777
778/**
779 * state_change - record that the glock is now in a different state
780 * @gl: the glock
781 * @new_state the new state
782 *
783 */
784
785static void state_change(struct gfs2_glock *gl, unsigned int new_state)
786{
787 int held1, held2;
788
789 held1 = (gl->gl_state != LM_ST_UNLOCKED);
790 held2 = (new_state != LM_ST_UNLOCKED);
791
792 if (held1 != held2) {
793 if (held2)
794 gfs2_glock_hold(gl);
795 else
796 gfs2_glock_put(gl);
797 }
798
799 gl->gl_state = new_state;
800}
801
802/**
803 * xmote_bh - Called after the lock module is done acquiring a lock
804 * @gl: The glock in question
805 * @ret: the int returned from the lock module
806 *
807 */
808
809static void xmote_bh(struct gfs2_glock *gl, unsigned int ret)
810{
811 struct gfs2_sbd *sdp = gl->gl_sbd;
812 struct gfs2_glock_operations *glops = gl->gl_ops;
813 struct gfs2_holder *gh = gl->gl_req_gh;
814 int prev_state = gl->gl_state;
815 int op_done = 1;
816
817 gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
818 gfs2_assert_warn(sdp, queue_empty(gl, &gl->gl_holders));
819 gfs2_assert_warn(sdp, !(ret & LM_OUT_ASYNC));
820
821 state_change(gl, ret & LM_OUT_ST_MASK);
822
823 if (prev_state != LM_ST_UNLOCKED && !(ret & LM_OUT_CACHEABLE)) {
824 if (glops->go_inval)
825 glops->go_inval(gl, DIO_METADATA | DIO_DATA);
826 } else if (gl->gl_state == LM_ST_DEFERRED) {
827 /* We might not want to do this here.
828 Look at moving to the inode glops. */
829 if (glops->go_inval)
830 glops->go_inval(gl, DIO_DATA);
831 }
832
833 /* Deal with each possible exit condition */
834
835 if (!gh)
836 gl->gl_stamp = jiffies;
837
838 else if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) {
839 spin_lock(&gl->gl_spin);
840 list_del_init(&gh->gh_list);
841 gh->gh_error = -EIO;
842 spin_unlock(&gl->gl_spin);
843
844 } else if (test_bit(HIF_DEMOTE, &gh->gh_iflags)) {
845 spin_lock(&gl->gl_spin);
846 list_del_init(&gh->gh_list);
847 if (gl->gl_state == gh->gh_state ||
848 gl->gl_state == LM_ST_UNLOCKED)
849 gh->gh_error = 0;
850 else {
851 if (gfs2_assert_warn(sdp, gh->gh_flags &
852 (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) == -1)
853 fs_warn(sdp, "ret = 0x%.8X\n", ret);
854 gh->gh_error = GLR_TRYFAILED;
855 }
856 spin_unlock(&gl->gl_spin);
857
858 if (ret & LM_OUT_CANCELED)
859 handle_callback(gl, LM_ST_UNLOCKED); /* Lame */
860
861 } else if (ret & LM_OUT_CANCELED) {
862 spin_lock(&gl->gl_spin);
863 list_del_init(&gh->gh_list);
864 gh->gh_error = GLR_CANCELED;
865 spin_unlock(&gl->gl_spin);
866
867 } else if (relaxed_state_ok(gl->gl_state, gh->gh_state, gh->gh_flags)) {
868 spin_lock(&gl->gl_spin);
869 list_move_tail(&gh->gh_list, &gl->gl_holders);
870 gh->gh_error = 0;
871 set_bit(HIF_HOLDER, &gh->gh_iflags);
872 spin_unlock(&gl->gl_spin);
873
874 set_bit(HIF_FIRST, &gh->gh_iflags);
875
876 op_done = 0;
877
878 } else if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) {
879 spin_lock(&gl->gl_spin);
880 list_del_init(&gh->gh_list);
881 gh->gh_error = GLR_TRYFAILED;
882 spin_unlock(&gl->gl_spin);
883
884 } else {
885 if (gfs2_assert_withdraw(sdp, 0) == -1)
886 fs_err(sdp, "ret = 0x%.8X\n", ret);
887 }
888
889 if (glops->go_xmote_bh)
890 glops->go_xmote_bh(gl);
891
892 if (op_done) {
893 spin_lock(&gl->gl_spin);
894 gl->gl_req_gh = NULL;
895 gl->gl_req_bh = NULL;
896 clear_bit(GLF_LOCK, &gl->gl_flags);
897 run_queue(gl);
898 spin_unlock(&gl->gl_spin);
899 }
900
901 gfs2_glock_put(gl);
902
903 if (gh) {
904 if (test_bit(HIF_DEALLOC, &gh->gh_iflags))
905 gfs2_holder_put(gh);
906 else
907 complete(&gh->gh_wait);
908 }
909}
910
911/**
912 * gfs2_glock_xmote_th - Call into the lock module to acquire or change a glock
913 * @gl: The glock in question
914 * @state: the requested state
915 * @flags: modifier flags to the lock call
916 *
917 */
918
919void gfs2_glock_xmote_th(struct gfs2_glock *gl, unsigned int state, int flags)
920{
921 struct gfs2_sbd *sdp = gl->gl_sbd;
922 struct gfs2_glock_operations *glops = gl->gl_ops;
923 int lck_flags = flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB |
924 LM_FLAG_NOEXP | LM_FLAG_ANY |
925 LM_FLAG_PRIORITY);
926 unsigned int lck_ret;
927
928 gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
929 gfs2_assert_warn(sdp, queue_empty(gl, &gl->gl_holders));
930 gfs2_assert_warn(sdp, state != LM_ST_UNLOCKED);
931 gfs2_assert_warn(sdp, state != gl->gl_state);
932
933 if (gl->gl_state == LM_ST_EXCLUSIVE) {
934 if (glops->go_sync)
935 glops->go_sync(gl,
936 DIO_METADATA | DIO_DATA | DIO_RELEASE);
937 }
938
939 gfs2_glock_hold(gl);
940 gl->gl_req_bh = xmote_bh;
941
942 lck_ret = gfs2_lm_lock(sdp, gl->gl_lock, gl->gl_state, state,
943 lck_flags);
944
945 if (gfs2_assert_withdraw(sdp, !(lck_ret & LM_OUT_ERROR)))
946 return;
947
948 if (lck_ret & LM_OUT_ASYNC)
949 gfs2_assert_warn(sdp, lck_ret == LM_OUT_ASYNC);
950 else
951 xmote_bh(gl, lck_ret);
952}
953
954/**
955 * drop_bh - Called after a lock module unlock completes
956 * @gl: the glock
957 * @ret: the return status
958 *
959 * Doesn't wake up the process waiting on the struct gfs2_holder (if any)
960 * Doesn't drop the reference on the glock the top half took out
961 *
962 */
963
964static void drop_bh(struct gfs2_glock *gl, unsigned int ret)
965{
966 struct gfs2_sbd *sdp = gl->gl_sbd;
967 struct gfs2_glock_operations *glops = gl->gl_ops;
968 struct gfs2_holder *gh = gl->gl_req_gh;
969
970 clear_bit(GLF_PREFETCH, &gl->gl_flags);
971
972 gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
973 gfs2_assert_warn(sdp, queue_empty(gl, &gl->gl_holders));
974 gfs2_assert_warn(sdp, !ret);
975
976 state_change(gl, LM_ST_UNLOCKED);
977
978 if (glops->go_inval)
979 glops->go_inval(gl, DIO_METADATA | DIO_DATA);
980
981 if (gh) {
982 spin_lock(&gl->gl_spin);
983 list_del_init(&gh->gh_list);
984 gh->gh_error = 0;
985 spin_unlock(&gl->gl_spin);
986 }
987
988 if (glops->go_drop_bh)
989 glops->go_drop_bh(gl);
990
991 spin_lock(&gl->gl_spin);
992 gl->gl_req_gh = NULL;
993 gl->gl_req_bh = NULL;
994 clear_bit(GLF_LOCK, &gl->gl_flags);
995 run_queue(gl);
996 spin_unlock(&gl->gl_spin);
997
998 gfs2_glock_put(gl);
999
1000 if (gh) {
1001 if (test_bit(HIF_DEALLOC, &gh->gh_iflags))
1002 gfs2_holder_put(gh);
1003 else
1004 complete(&gh->gh_wait);
1005 }
1006}
1007
1008/**
1009 * gfs2_glock_drop_th - call into the lock module to unlock a lock
1010 * @gl: the glock
1011 *
1012 */
1013
1014void gfs2_glock_drop_th(struct gfs2_glock *gl)
1015{
1016 struct gfs2_sbd *sdp = gl->gl_sbd;
1017 struct gfs2_glock_operations *glops = gl->gl_ops;
1018 unsigned int ret;
1019
1020 gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
1021 gfs2_assert_warn(sdp, queue_empty(gl, &gl->gl_holders));
1022 gfs2_assert_warn(sdp, gl->gl_state != LM_ST_UNLOCKED);
1023
1024 if (gl->gl_state == LM_ST_EXCLUSIVE) {
1025 if (glops->go_sync)
1026 glops->go_sync(gl,
1027 DIO_METADATA | DIO_DATA | DIO_RELEASE);
1028 }
1029
1030 gfs2_glock_hold(gl);
1031 gl->gl_req_bh = drop_bh;
1032
1033 ret = gfs2_lm_unlock(sdp, gl->gl_lock, gl->gl_state);
1034
1035 if (gfs2_assert_withdraw(sdp, !(ret & LM_OUT_ERROR)))
1036 return;
1037
1038 if (!ret)
1039 drop_bh(gl, ret);
1040 else
1041 gfs2_assert_warn(sdp, ret == LM_OUT_ASYNC);
1042}
1043
1044/**
1045 * do_cancels - cancel requests for locks stuck waiting on an expire flag
1046 * @gh: the LM_FLAG_PRIORITY holder waiting to acquire the lock
1047 *
1048 * Don't cancel GL_NOCANCEL requests.
1049 */
1050
1051static void do_cancels(struct gfs2_holder *gh)
1052{
1053 struct gfs2_glock *gl = gh->gh_gl;
1054
1055 spin_lock(&gl->gl_spin);
1056
1057 while (gl->gl_req_gh != gh &&
1058 !test_bit(HIF_HOLDER, &gh->gh_iflags) &&
1059 !list_empty(&gh->gh_list)) {
1060 if (gl->gl_req_bh &&
1061 !(gl->gl_req_gh &&
1062 (gl->gl_req_gh->gh_flags & GL_NOCANCEL))) {
1063 spin_unlock(&gl->gl_spin);
1064 gfs2_lm_cancel(gl->gl_sbd, gl->gl_lock);
1065 msleep(100);
1066 spin_lock(&gl->gl_spin);
1067 } else {
1068 spin_unlock(&gl->gl_spin);
1069 msleep(100);
1070 spin_lock(&gl->gl_spin);
1071 }
1072 }
1073
1074 spin_unlock(&gl->gl_spin);
1075}
1076
1077/**
1078 * glock_wait_internal - wait on a glock acquisition
1079 * @gh: the glock holder
1080 *
1081 * Returns: 0 on success
1082 */
1083
1084static int glock_wait_internal(struct gfs2_holder *gh)
1085{
1086 struct gfs2_glock *gl = gh->gh_gl;
1087 struct gfs2_sbd *sdp = gl->gl_sbd;
1088 struct gfs2_glock_operations *glops = gl->gl_ops;
1089
1090 if (test_bit(HIF_ABORTED, &gh->gh_iflags))
1091 return -EIO;
1092
1093 if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) {
1094 spin_lock(&gl->gl_spin);
1095 if (gl->gl_req_gh != gh &&
1096 !test_bit(HIF_HOLDER, &gh->gh_iflags) &&
1097 !list_empty(&gh->gh_list)) {
1098 list_del_init(&gh->gh_list);
1099 gh->gh_error = GLR_TRYFAILED;
1100 run_queue(gl);
1101 spin_unlock(&gl->gl_spin);
1102 return gh->gh_error;
1103 }
1104 spin_unlock(&gl->gl_spin);
1105 }
1106
1107 if (gh->gh_flags & LM_FLAG_PRIORITY)
1108 do_cancels(gh);
1109
1110 wait_for_completion(&gh->gh_wait);
1111
1112 if (gh->gh_error)
1113 return gh->gh_error;
1114
1115 gfs2_assert_withdraw(sdp, test_bit(HIF_HOLDER, &gh->gh_iflags));
1116 gfs2_assert_withdraw(sdp, relaxed_state_ok(gl->gl_state,
1117 gh->gh_state,
1118 gh->gh_flags));
1119
1120 if (test_bit(HIF_FIRST, &gh->gh_iflags)) {
1121 gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
1122
1123 if (glops->go_lock) {
1124 gh->gh_error = glops->go_lock(gh);
1125 if (gh->gh_error) {
1126 spin_lock(&gl->gl_spin);
1127 list_del_init(&gh->gh_list);
1128 spin_unlock(&gl->gl_spin);
1129 }
1130 }
1131
1132 spin_lock(&gl->gl_spin);
1133 gl->gl_req_gh = NULL;
1134 gl->gl_req_bh = NULL;
1135 clear_bit(GLF_LOCK, &gl->gl_flags);
1136 run_queue(gl);
1137 spin_unlock(&gl->gl_spin);
1138 }
1139
1140 return gh->gh_error;
1141}
1142
1143static inline struct gfs2_holder *
1144find_holder_by_owner(struct list_head *head, struct task_struct *owner)
1145{
1146 struct gfs2_holder *gh;
1147
1148 list_for_each_entry(gh, head, gh_list) {
1149 if (gh->gh_owner == owner)
1150 return gh;
1151 }
1152
1153 return NULL;
1154}
1155
1156/**
1157 * add_to_queue - Add a holder to the wait queue (but look for recursion)
1158 * @gh: the holder structure to add
1159 *
1160 */
1161
1162static void add_to_queue(struct gfs2_holder *gh)
1163{
1164 struct gfs2_glock *gl = gh->gh_gl;
1165 struct gfs2_holder *existing;
1166
1167 BUG_ON(!gh->gh_owner);
1168
1169 existing = find_holder_by_owner(&gl->gl_holders, gh->gh_owner);
1170 if (existing) {
1171 print_symbol(KERN_WARNING "original: %s\n", existing->gh_ip);
1172 print_symbol(KERN_WARNING "new: %s\n", gh->gh_ip);
1173 BUG();
1174 }
1175
1176 existing = find_holder_by_owner(&gl->gl_waiters3, gh->gh_owner);
1177 if (existing) {
1178 print_symbol(KERN_WARNING "original: %s\n", existing->gh_ip);
1179 print_symbol(KERN_WARNING "new: %s\n", gh->gh_ip);
1180 BUG();
1181 }
1182
1183 if (gh->gh_flags & LM_FLAG_PRIORITY)
1184 list_add(&gh->gh_list, &gl->gl_waiters3);
1185 else
1186 list_add_tail(&gh->gh_list, &gl->gl_waiters3);
1187}
1188
1189/**
1190 * gfs2_glock_nq - enqueue a struct gfs2_holder onto a glock (acquire a glock)
1191 * @gh: the holder structure
1192 *
1193 * if (gh->gh_flags & GL_ASYNC), this never returns an error
1194 *
1195 * Returns: 0, GLR_TRYFAILED, or errno on failure
1196 */
1197
1198int gfs2_glock_nq(struct gfs2_holder *gh)
1199{
1200 struct gfs2_glock *gl = gh->gh_gl;
1201 struct gfs2_sbd *sdp = gl->gl_sbd;
1202 int error = 0;
1203
1204restart:
1205 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) {
1206 set_bit(HIF_ABORTED, &gh->gh_iflags);
1207 return -EIO;
1208 }
1209
1210 set_bit(HIF_PROMOTE, &gh->gh_iflags);
1211
1212 spin_lock(&gl->gl_spin);
1213 add_to_queue(gh);
1214 run_queue(gl);
1215 spin_unlock(&gl->gl_spin);
1216
1217 if (!(gh->gh_flags & GL_ASYNC)) {
1218 error = glock_wait_internal(gh);
1219 if (error == GLR_CANCELED) {
1220 msleep(100);
1221 goto restart;
1222 }
1223 }
1224
1225 clear_bit(GLF_PREFETCH, &gl->gl_flags);
1226
1227 if (error == GLR_TRYFAILED && (gh->gh_flags & GL_DUMP))
1228 dump_glock(gl);
1229
1230 return error;
1231}
1232
1233/**
1234 * gfs2_glock_poll - poll to see if an async request has been completed
1235 * @gh: the holder
1236 *
1237 * Returns: 1 if the request is ready to be gfs2_glock_wait()ed on
1238 */
1239
1240int gfs2_glock_poll(struct gfs2_holder *gh)
1241{
1242 struct gfs2_glock *gl = gh->gh_gl;
1243 int ready = 0;
1244
1245 spin_lock(&gl->gl_spin);
1246
1247 if (test_bit(HIF_HOLDER, &gh->gh_iflags))
1248 ready = 1;
1249 else if (list_empty(&gh->gh_list)) {
1250 if (gh->gh_error == GLR_CANCELED) {
1251 spin_unlock(&gl->gl_spin);
1252 msleep(100);
1253 if (gfs2_glock_nq(gh))
1254 return 1;
1255 return 0;
1256 } else
1257 ready = 1;
1258 }
1259
1260 spin_unlock(&gl->gl_spin);
1261
1262 return ready;
1263}
1264
1265/**
1266 * gfs2_glock_wait - wait for a lock acquisition that ended in a GLR_ASYNC
1267 * @gh: the holder structure
1268 *
1269 * Returns: 0, GLR_TRYFAILED, or errno on failure
1270 */
1271
1272int gfs2_glock_wait(struct gfs2_holder *gh)
1273{
1274 int error;
1275
1276 error = glock_wait_internal(gh);
1277 if (error == GLR_CANCELED) {
1278 msleep(100);
1279 gh->gh_flags &= ~GL_ASYNC;
1280 error = gfs2_glock_nq(gh);
1281 }
1282
1283 return error;
1284}
1285
1286/**
1287 * gfs2_glock_dq - dequeue a struct gfs2_holder from a glock (release a glock)
1288 * @gh: the glock holder
1289 *
1290 */
1291
1292void gfs2_glock_dq(struct gfs2_holder *gh)
1293{
1294 struct gfs2_glock *gl = gh->gh_gl;
1295 struct gfs2_glock_operations *glops = gl->gl_ops;
1296
1297 if (gh->gh_flags & GL_SYNC)
1298 set_bit(GLF_SYNC, &gl->gl_flags);
1299
1300 if (gh->gh_flags & GL_NOCACHE)
1301 handle_callback(gl, LM_ST_UNLOCKED);
1302
1303 gfs2_glmutex_lock(gl);
1304
1305 spin_lock(&gl->gl_spin);
1306 list_del_init(&gh->gh_list);
1307
1308 if (list_empty(&gl->gl_holders)) {
1309 spin_unlock(&gl->gl_spin);
1310
1311 if (glops->go_unlock)
1312 glops->go_unlock(gh);
1313
1314 if (test_bit(GLF_SYNC, &gl->gl_flags)) {
1315 if (glops->go_sync)
1316 glops->go_sync(gl, DIO_METADATA | DIO_DATA);
1317 }
1318
1319 gl->gl_stamp = jiffies;
1320
1321 spin_lock(&gl->gl_spin);
1322 }
1323
1324 clear_bit(GLF_LOCK, &gl->gl_flags);
1325 run_queue(gl);
1326 spin_unlock(&gl->gl_spin);
1327}
1328
1329/**
1330 * gfs2_glock_prefetch - Try to prefetch a glock
1331 * @gl: the glock
1332 * @state: the state to prefetch in
1333 * @flags: flags passed to go_xmote_th()
1334 *
1335 */
1336
1337static void gfs2_glock_prefetch(struct gfs2_glock *gl, unsigned int state,
1338 int flags)
1339{
1340 struct gfs2_glock_operations *glops = gl->gl_ops;
1341
1342 spin_lock(&gl->gl_spin);
1343
1344 if (test_bit(GLF_LOCK, &gl->gl_flags) ||
1345 !list_empty(&gl->gl_holders) ||
1346 !list_empty(&gl->gl_waiters1) ||
1347 !list_empty(&gl->gl_waiters2) ||
1348 !list_empty(&gl->gl_waiters3) ||
1349 relaxed_state_ok(gl->gl_state, state, flags)) {
1350 spin_unlock(&gl->gl_spin);
1351 return;
1352 }
1353
1354 set_bit(GLF_PREFETCH, &gl->gl_flags);
1355 set_bit(GLF_LOCK, &gl->gl_flags);
1356 spin_unlock(&gl->gl_spin);
1357
1358 glops->go_xmote_th(gl, state, flags);
1359}
1360
1361static void greedy_work(void *data)
1362{
1363 struct greedy *gr = data;
1364 struct gfs2_holder *gh = &gr->gr_gh;
1365 struct gfs2_glock *gl = gh->gh_gl;
1366 struct gfs2_glock_operations *glops = gl->gl_ops;
1367
1368 clear_bit(GLF_SKIP_WAITERS2, &gl->gl_flags);
1369
1370 if (glops->go_greedy)
1371 glops->go_greedy(gl);
1372
1373 spin_lock(&gl->gl_spin);
1374
1375 if (list_empty(&gl->gl_waiters2)) {
1376 clear_bit(GLF_GREEDY, &gl->gl_flags);
1377 spin_unlock(&gl->gl_spin);
1378 gfs2_holder_uninit(gh);
1379 kfree(gr);
1380 } else {
1381 gfs2_glock_hold(gl);
1382 list_add_tail(&gh->gh_list, &gl->gl_waiters2);
1383 run_queue(gl);
1384 spin_unlock(&gl->gl_spin);
1385 gfs2_glock_put(gl);
1386 }
1387}
1388
1389/**
1390 * gfs2_glock_be_greedy -
1391 * @gl:
1392 * @time:
1393 *
1394 * Returns: 0 if go_greedy will be called, 1 otherwise
1395 */
1396
1397int gfs2_glock_be_greedy(struct gfs2_glock *gl, unsigned int time)
1398{
1399 struct greedy *gr;
1400 struct gfs2_holder *gh;
1401
1402 if (!time || gl->gl_sbd->sd_args.ar_localcaching ||
1403 test_and_set_bit(GLF_GREEDY, &gl->gl_flags))
1404 return 1;
1405
1406 gr = kmalloc(sizeof(struct greedy), GFP_KERNEL);
1407 if (!gr) {
1408 clear_bit(GLF_GREEDY, &gl->gl_flags);
1409 return 1;
1410 }
1411 gh = &gr->gr_gh;
1412
1413 gfs2_holder_init(gl, 0, 0, gh);
1414 set_bit(HIF_GREEDY, &gh->gh_iflags);
1415 INIT_WORK(&gr->gr_work, greedy_work, gr);
1416
1417 set_bit(GLF_SKIP_WAITERS2, &gl->gl_flags);
1418 schedule_delayed_work(&gr->gr_work, time);
1419
1420 return 0;
1421}
1422
/**
 * gfs2_glock_dq_uninit - dequeue a holder from a glock and uninitialize it
 * @gh: the holder structure
 *
 * Shorthand for gfs2_glock_dq() followed by gfs2_holder_uninit().
 */

void gfs2_glock_dq_uninit(struct gfs2_holder *gh)
{
	gfs2_glock_dq(gh);
	gfs2_holder_uninit(gh);
}
1434
1435/**
1436 * gfs2_glock_nq_num - acquire a glock based on lock number
1437 * @sdp: the filesystem
1438 * @number: the lock number
1439 * @glops: the glock operations for the type of glock
1440 * @state: the state to acquire the glock in
1441 * @flags: modifier flags for the aquisition
1442 * @gh: the struct gfs2_holder
1443 *
1444 * Returns: errno
1445 */
1446
1447int gfs2_glock_nq_num(struct gfs2_sbd *sdp, uint64_t number,
1448 struct gfs2_glock_operations *glops, unsigned int state,
1449 int flags, struct gfs2_holder *gh)
1450{
1451 struct gfs2_glock *gl;
1452 int error;
1453
1454 error = gfs2_glock_get(sdp, number, glops, CREATE, &gl);
1455 if (!error) {
1456 error = gfs2_glock_nq_init(gl, state, flags, gh);
1457 gfs2_glock_put(gl);
1458 }
1459
1460 return error;
1461}
1462
1463/**
1464 * glock_compare - Compare two struct gfs2_glock structures for sorting
1465 * @arg_a: the first structure
1466 * @arg_b: the second structure
1467 *
1468 */
1469
1470static int glock_compare(const void *arg_a, const void *arg_b)
1471{
1472 struct gfs2_holder *gh_a = *(struct gfs2_holder **)arg_a;
1473 struct gfs2_holder *gh_b = *(struct gfs2_holder **)arg_b;
1474 struct lm_lockname *a = &gh_a->gh_gl->gl_name;
1475 struct lm_lockname *b = &gh_b->gh_gl->gl_name;
1476 int ret = 0;
1477
1478 if (a->ln_number > b->ln_number)
1479 ret = 1;
1480 else if (a->ln_number < b->ln_number)
1481 ret = -1;
1482 else {
1483 if (gh_a->gh_state == LM_ST_SHARED &&
1484 gh_b->gh_state == LM_ST_EXCLUSIVE)
1485 ret = 1;
1486 else if (!(gh_a->gh_flags & GL_LOCAL_EXCL) &&
1487 (gh_b->gh_flags & GL_LOCAL_EXCL))
1488 ret = 1;
1489 }
1490
1491 return ret;
1492}
1493
1494/**
1495 * nq_m_sync - synchonously acquire more than one glock in deadlock free order
1496 * @num_gh: the number of structures
1497 * @ghs: an array of struct gfs2_holder structures
1498 *
1499 * Returns: 0 on success (all glocks acquired),
1500 * errno on failure (no glocks acquired)
1501 */
1502
1503static int nq_m_sync(unsigned int num_gh, struct gfs2_holder *ghs,
1504 struct gfs2_holder **p)
1505{
1506 unsigned int x;
1507 int error = 0;
1508
1509 for (x = 0; x < num_gh; x++)
1510 p[x] = &ghs[x];
1511
1512 sort(p, num_gh, sizeof(struct gfs2_holder *), glock_compare, NULL);
1513
1514 for (x = 0; x < num_gh; x++) {
1515 p[x]->gh_flags &= ~(LM_FLAG_TRY | GL_ASYNC);
1516
1517 error = gfs2_glock_nq(p[x]);
1518 if (error) {
1519 while (x--)
1520 gfs2_glock_dq(p[x]);
1521 break;
1522 }
1523 }
1524
1525 return error;
1526}
1527
1528/**
1529 * gfs2_glock_nq_m - acquire multiple glocks
1530 * @num_gh: the number of structures
1531 * @ghs: an array of struct gfs2_holder structures
1532 *
1533 * Figure out how big an impact this function has. Either:
1534 * 1) Replace this code with code that calls gfs2_glock_prefetch()
1535 * 2) Forget async stuff and just call nq_m_sync()
1536 * 3) Leave it like it is
1537 *
1538 * Returns: 0 on success (all glocks acquired),
1539 * errno on failure (no glocks acquired)
1540 */
1541
1542int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs)
1543{
1544 int *e;
1545 unsigned int x;
1546 int borked = 0, serious = 0;
1547 int error = 0;
1548
1549 if (!num_gh)
1550 return 0;
1551
1552 if (num_gh == 1) {
1553 ghs->gh_flags &= ~(LM_FLAG_TRY | GL_ASYNC);
1554 return gfs2_glock_nq(ghs);
1555 }
1556
1557 e = kcalloc(num_gh, sizeof(struct gfs2_holder *), GFP_KERNEL);
1558 if (!e)
1559 return -ENOMEM;
1560
1561 for (x = 0; x < num_gh; x++) {
1562 ghs[x].gh_flags |= LM_FLAG_TRY | GL_ASYNC;
1563 error = gfs2_glock_nq(&ghs[x]);
1564 if (error) {
1565 borked = 1;
1566 serious = error;
1567 num_gh = x;
1568 break;
1569 }
1570 }
1571
1572 for (x = 0; x < num_gh; x++) {
1573 error = e[x] = glock_wait_internal(&ghs[x]);
1574 if (error) {
1575 borked = 1;
1576 if (error != GLR_TRYFAILED && error != GLR_CANCELED)
1577 serious = error;
1578 }
1579 }
1580
1581 if (!borked) {
1582 kfree(e);
1583 return 0;
1584 }
1585
1586 for (x = 0; x < num_gh; x++)
1587 if (!e[x])
1588 gfs2_glock_dq(&ghs[x]);
1589
1590 if (serious)
1591 error = serious;
1592 else {
1593 for (x = 0; x < num_gh; x++)
1594 gfs2_holder_reinit(ghs[x].gh_state, ghs[x].gh_flags,
1595 &ghs[x]);
1596 error = nq_m_sync(num_gh, ghs, (struct gfs2_holder **)e);
1597 }
1598
1599 kfree(e);
1600
1601 return error;
1602}
1603
1604/**
1605 * gfs2_glock_dq_m - release multiple glocks
1606 * @num_gh: the number of structures
1607 * @ghs: an array of struct gfs2_holder structures
1608 *
1609 */
1610
1611void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs)
1612{
1613 unsigned int x;
1614
1615 for (x = 0; x < num_gh; x++)
1616 gfs2_glock_dq(&ghs[x]);
1617}
1618
1619/**
1620 * gfs2_glock_dq_uninit_m - release multiple glocks
1621 * @num_gh: the number of structures
1622 * @ghs: an array of struct gfs2_holder structures
1623 *
1624 */
1625
1626void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs)
1627{
1628 unsigned int x;
1629
1630 for (x = 0; x < num_gh; x++)
1631 gfs2_glock_dq_uninit(&ghs[x]);
1632}
1633
1634/**
1635 * gfs2_glock_prefetch_num - prefetch a glock based on lock number
1636 * @sdp: the filesystem
1637 * @number: the lock number
1638 * @glops: the glock operations for the type of glock
1639 * @state: the state to acquire the glock in
1640 * @flags: modifier flags for the aquisition
1641 *
1642 * Returns: errno
1643 */
1644
1645void gfs2_glock_prefetch_num(struct gfs2_sbd *sdp, uint64_t number,
1646 struct gfs2_glock_operations *glops,
1647 unsigned int state, int flags)
1648{
1649 struct gfs2_glock *gl;
1650 int error;
1651
1652 if (atomic_read(&sdp->sd_reclaim_count) <
1653 gfs2_tune_get(sdp, gt_reclaim_limit)) {
1654 error = gfs2_glock_get(sdp, number, glops, CREATE, &gl);
1655 if (!error) {
1656 gfs2_glock_prefetch(gl, state, flags);
1657 gfs2_glock_put(gl);
1658 }
1659 }
1660}
1661
1662/**
1663 * gfs2_lvb_hold - attach a LVB from a glock
1664 * @gl: The glock in question
1665 *
1666 */
1667
1668int gfs2_lvb_hold(struct gfs2_glock *gl)
1669{
1670 int error;
1671
1672 gfs2_glmutex_lock(gl);
1673
1674 if (!atomic_read(&gl->gl_lvb_count)) {
1675 error = gfs2_lm_hold_lvb(gl->gl_sbd, gl->gl_lock, &gl->gl_lvb);
1676 if (error) {
1677 gfs2_glmutex_unlock(gl);
1678 return error;
1679 }
1680 gfs2_glock_hold(gl);
1681 }
1682 atomic_inc(&gl->gl_lvb_count);
1683
1684 gfs2_glmutex_unlock(gl);
1685
1686 return 0;
1687}
1688
1689/**
1690 * gfs2_lvb_unhold - detach a LVB from a glock
1691 * @gl: The glock in question
1692 *
1693 */
1694
1695void gfs2_lvb_unhold(struct gfs2_glock *gl)
1696{
1697 gfs2_glock_hold(gl);
1698 gfs2_glmutex_lock(gl);
1699
1700 gfs2_assert(gl->gl_sbd, atomic_read(&gl->gl_lvb_count) > 0);
1701 if (atomic_dec_and_test(&gl->gl_lvb_count)) {
1702 gfs2_lm_unhold_lvb(gl->gl_sbd, gl->gl_lock, gl->gl_lvb);
1703 gl->gl_lvb = NULL;
1704 gfs2_glock_put(gl);
1705 }
1706
1707 gfs2_glmutex_unlock(gl);
1708 gfs2_glock_put(gl);
1709}
1710
#if 0
/*
 * gfs2_lvb_sync - push a glock's LVB out through the lock module
 * @gl: The glock in question
 *
 * Compiled out.  Expects the LVB to be held and warns (without syncing)
 * unless the glock is held exclusively.
 */
void gfs2_lvb_sync(struct gfs2_glock *gl)
{
	gfs2_glmutex_lock(gl);

	gfs2_assert(gl->gl_sbd, atomic_read(&gl->gl_lvb_count));
	if (!gfs2_assert_warn(gl->gl_sbd, gfs2_glock_is_held_excl(gl)))
		gfs2_lm_sync_lvb(gl->gl_sbd, gl->gl_lock, gl->gl_lvb);

	gfs2_glmutex_unlock(gl);
}
#endif /* 0 */
1723
1724static void blocking_cb(struct gfs2_sbd *sdp, struct lm_lockname *name,
1725 unsigned int state)
1726{
1727 struct gfs2_glock *gl;
1728
1729 gl = gfs2_glock_find(sdp, name);
1730 if (!gl)
1731 return;
1732
1733 if (gl->gl_ops->go_callback)
1734 gl->gl_ops->go_callback(gl, state);
1735 handle_callback(gl, state);
1736
1737 spin_lock(&gl->gl_spin);
1738 run_queue(gl);
1739 spin_unlock(&gl->gl_spin);
1740
1741 gfs2_glock_put(gl);
1742}
1743
1744/**
1745 * gfs2_glock_cb - Callback used by locking module
1746 * @fsdata: Pointer to the superblock
1747 * @type: Type of callback
1748 * @data: Type dependent data pointer
1749 *
1750 * Called by the locking module when it wants to tell us something.
1751 * Either we need to drop a lock, one of our ASYNC requests completed, or
1752 * a journal from another client needs to be recovered.
1753 */
1754
1755void gfs2_glock_cb(lm_fsdata_t *fsdata, unsigned int type, void *data)
1756{
1757 struct gfs2_sbd *sdp = (struct gfs2_sbd *)fsdata;
1758
1759 switch (type) {
1760 case LM_CB_NEED_E:
1761 blocking_cb(sdp, data, LM_ST_UNLOCKED);
1762 return;
1763
1764 case LM_CB_NEED_D:
1765 blocking_cb(sdp, data, LM_ST_DEFERRED);
1766 return;
1767
1768 case LM_CB_NEED_S:
1769 blocking_cb(sdp, data, LM_ST_SHARED);
1770 return;
1771
1772 case LM_CB_ASYNC: {
1773 struct lm_async_cb *async = data;
1774 struct gfs2_glock *gl;
1775
1776 gl = gfs2_glock_find(sdp, &async->lc_name);
1777 if (gfs2_assert_warn(sdp, gl))
1778 return;
1779 if (!gfs2_assert_warn(sdp, gl->gl_req_bh))
1780 gl->gl_req_bh(gl, async->lc_ret);
1781 gfs2_glock_put(gl);
1782 return;
1783 }
1784
1785 case LM_CB_NEED_RECOVERY:
1786 gfs2_jdesc_make_dirty(sdp, *(unsigned int *)data);
1787 if (sdp->sd_recoverd_process)
1788 wake_up_process(sdp->sd_recoverd_process);
1789 return;
1790
1791 case LM_CB_DROPLOCKS:
1792 gfs2_gl_hash_clear(sdp, NO_WAIT);
1793 gfs2_quota_scan(sdp);
1794 return;
1795
1796 default:
1797 gfs2_assert_warn(sdp, 0);
1798 return;
1799 }
1800}
1801
1802/**
1803 * gfs2_iopen_go_callback - Try to kick the inode/vnode associated with an
1804 * iopen glock from memory
1805 * @io_gl: the iopen glock
1806 * @state: the state into which the glock should be put
1807 *
1808 */
1809
1810void gfs2_iopen_go_callback(struct gfs2_glock *io_gl, unsigned int state)
1811{
1812
1813 if (state != LM_ST_UNLOCKED)
1814 return;
1815 /* FIXME: remove this? */
1816}
1817
1818/**
1819 * demote_ok - Check to see if it's ok to unlock a glock
1820 * @gl: the glock
1821 *
1822 * Returns: 1 if it's ok
1823 */
1824
1825static int demote_ok(struct gfs2_glock *gl)
1826{
1827 struct gfs2_sbd *sdp = gl->gl_sbd;
1828 struct gfs2_glock_operations *glops = gl->gl_ops;
1829 int demote = 1;
1830
1831 if (test_bit(GLF_STICKY, &gl->gl_flags))
1832 demote = 0;
1833 else if (test_bit(GLF_PREFETCH, &gl->gl_flags))
1834 demote = time_after_eq(jiffies,
1835 gl->gl_stamp +
1836 gfs2_tune_get(sdp, gt_prefetch_secs) * HZ);
1837 else if (glops->go_demote_ok)
1838 demote = glops->go_demote_ok(gl);
1839
1840 return demote;
1841}
1842
1843/**
1844 * gfs2_glock_schedule_for_reclaim - Add a glock to the reclaim list
1845 * @gl: the glock
1846 *
1847 */
1848
1849void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl)
1850{
1851 struct gfs2_sbd *sdp = gl->gl_sbd;
1852
1853 spin_lock(&sdp->sd_reclaim_lock);
1854 if (list_empty(&gl->gl_reclaim)) {
1855 gfs2_glock_hold(gl);
1856 list_add(&gl->gl_reclaim, &sdp->sd_reclaim_list);
1857 atomic_inc(&sdp->sd_reclaim_count);
1858 }
1859 spin_unlock(&sdp->sd_reclaim_lock);
1860
1861 wake_up(&sdp->sd_reclaim_wq);
1862}
1863
1864/**
1865 * gfs2_reclaim_glock - process the next glock on the filesystem's reclaim list
1866 * @sdp: the filesystem
1867 *
1868 * Called from gfs2_glockd() glock reclaim daemon, or when promoting a
1869 * different glock and we notice that there are a lot of glocks in the
1870 * reclaim list.
1871 *
1872 */
1873
1874void gfs2_reclaim_glock(struct gfs2_sbd *sdp)
1875{
1876 struct gfs2_glock *gl;
1877
1878 spin_lock(&sdp->sd_reclaim_lock);
1879 if (list_empty(&sdp->sd_reclaim_list)) {
1880 spin_unlock(&sdp->sd_reclaim_lock);
1881 return;
1882 }
1883 gl = list_entry(sdp->sd_reclaim_list.next,
1884 struct gfs2_glock, gl_reclaim);
1885 list_del_init(&gl->gl_reclaim);
1886 spin_unlock(&sdp->sd_reclaim_lock);
1887
1888 atomic_dec(&sdp->sd_reclaim_count);
1889 atomic_inc(&sdp->sd_reclaimed);
1890
1891 if (gfs2_glmutex_trylock(gl)) {
1892 if (queue_empty(gl, &gl->gl_holders) &&
1893 gl->gl_state != LM_ST_UNLOCKED &&
1894 demote_ok(gl))
1895 handle_callback(gl, LM_ST_UNLOCKED);
1896 gfs2_glmutex_unlock(gl);
1897 }
1898
1899 gfs2_glock_put(gl);
1900}
1901
1902/**
1903 * examine_bucket - Call a function for glock in a hash bucket
1904 * @examiner: the function
1905 * @sdp: the filesystem
1906 * @bucket: the bucket
1907 *
1908 * Returns: 1 if the bucket has entries
1909 */
1910
1911static int examine_bucket(glock_examiner examiner, struct gfs2_sbd *sdp,
1912 struct gfs2_gl_hash_bucket *bucket)
1913{
1914 struct glock_plug plug;
1915 struct list_head *tmp;
1916 struct gfs2_glock *gl;
1917 int entries;
1918
1919 /* Add "plug" to end of bucket list, work back up list from there */
1920 memset(&plug.gl_flags, 0, sizeof(unsigned long));
1921 set_bit(GLF_PLUG, &plug.gl_flags);
1922
1923 write_lock(&bucket->hb_lock);
1924 list_add(&plug.gl_list, &bucket->hb_list);
1925 write_unlock(&bucket->hb_lock);
1926
1927 for (;;) {
1928 write_lock(&bucket->hb_lock);
1929
1930 for (;;) {
1931 tmp = plug.gl_list.next;
1932
1933 if (tmp == &bucket->hb_list) {
1934 list_del(&plug.gl_list);
1935 entries = !list_empty(&bucket->hb_list);
1936 write_unlock(&bucket->hb_lock);
1937 return entries;
1938 }
1939 gl = list_entry(tmp, struct gfs2_glock, gl_list);
1940
1941 /* Move plug up list */
1942 list_move(&plug.gl_list, &gl->gl_list);
1943
1944 if (test_bit(GLF_PLUG, &gl->gl_flags))
1945 continue;
1946
1947 /* examiner() must glock_put() */
1948 gfs2_glock_hold(gl);
1949
1950 break;
1951 }
1952
1953 write_unlock(&bucket->hb_lock);
1954
1955 examiner(gl);
1956 }
1957}
1958
1959/**
1960 * scan_glock - look at a glock and see if we can reclaim it
1961 * @gl: the glock to look at
1962 *
1963 */
1964
1965static void scan_glock(struct gfs2_glock *gl)
1966{
1967 if (gfs2_glmutex_trylock(gl)) {
1968 if (gl->gl_ops == &gfs2_inode_glops)
1969 goto out;
1970 if (queue_empty(gl, &gl->gl_holders) &&
1971 gl->gl_state != LM_ST_UNLOCKED &&
1972 demote_ok(gl))
1973 goto out_schedule;
1974out:
1975 gfs2_glmutex_unlock(gl);
1976 }
1977
1978 gfs2_glock_put(gl);
1979
1980 return;
1981
1982out_schedule:
1983 gfs2_glmutex_unlock(gl);
1984 gfs2_glock_schedule_for_reclaim(gl);
1985 gfs2_glock_put(gl);
1986}
1987
1988/**
1989 * gfs2_scand_internal - Look for glocks and inodes to toss from memory
1990 * @sdp: the filesystem
1991 *
1992 */
1993
1994void gfs2_scand_internal(struct gfs2_sbd *sdp)
1995{
1996 unsigned int x;
1997
1998 for (x = 0; x < GFS2_GL_HASH_SIZE; x++) {
1999 examine_bucket(scan_glock, sdp, &sdp->sd_gl_hash[x]);
2000 cond_resched();
2001 }
2002}
2003
2004/**
2005 * clear_glock - look at a glock and see if we can free it from glock cache
2006 * @gl: the glock to look at
2007 *
2008 */
2009
2010static void clear_glock(struct gfs2_glock *gl)
2011{
2012 struct gfs2_sbd *sdp = gl->gl_sbd;
2013 int released;
2014
2015 spin_lock(&sdp->sd_reclaim_lock);
2016 if (!list_empty(&gl->gl_reclaim)) {
2017 list_del_init(&gl->gl_reclaim);
2018 atomic_dec(&sdp->sd_reclaim_count);
2019 spin_unlock(&sdp->sd_reclaim_lock);
2020 released = gfs2_glock_put(gl);
2021 gfs2_assert(sdp, !released);
2022 } else {
2023 spin_unlock(&sdp->sd_reclaim_lock);
2024 }
2025
2026 if (gfs2_glmutex_trylock(gl)) {
2027 if (queue_empty(gl, &gl->gl_holders) &&
2028 gl->gl_state != LM_ST_UNLOCKED)
2029 handle_callback(gl, LM_ST_UNLOCKED);
2030
2031 gfs2_glmutex_unlock(gl);
2032 }
2033
2034 gfs2_glock_put(gl);
2035}
2036
2037/**
2038 * gfs2_gl_hash_clear - Empty out the glock hash table
2039 * @sdp: the filesystem
2040 * @wait: wait until it's all gone
2041 *
2042 * Called when unmounting the filesystem, or when inter-node lock manager
2043 * requests DROPLOCKS because it is running out of capacity.
2044 */
2045
2046void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait)
2047{
2048 unsigned long t;
2049 unsigned int x;
2050 int cont;
2051
2052 t = jiffies;
2053
2054 for (;;) {
2055 cont = 0;
2056
2057 for (x = 0; x < GFS2_GL_HASH_SIZE; x++)
2058 if (examine_bucket(clear_glock, sdp,
2059 &sdp->sd_gl_hash[x]))
2060 cont = 1;
2061
2062 if (!wait || !cont)
2063 break;
2064
2065 if (time_after_eq(jiffies,
2066 t + gfs2_tune_get(sdp, gt_stall_secs) * HZ)) {
2067 fs_warn(sdp, "Unmount seems to be stalled. "
2068 "Dumping lock state...\n");
2069 gfs2_dump_lockstate(sdp);
2070 t = jiffies;
2071 }
2072
2073 /* invalidate_inodes() requires that the sb inodes list
2074 not change, but an async completion callback for an
2075 unlock can occur which does glock_put() which
2076 can call iput() which will change the sb inodes list.
2077 invalidate_inodes_mutex prevents glock_put()'s during
2078 an invalidate_inodes() */
2079
2080 mutex_lock(&sdp->sd_invalidate_inodes_mutex);
2081 invalidate_inodes(sdp->sd_vfs);
2082 mutex_unlock(&sdp->sd_invalidate_inodes_mutex);
2083 msleep(10);
2084 }
2085}
2086
2087/*
2088 * Diagnostic routines to help debug distributed deadlock
2089 */
2090
2091/**
2092 * dump_holder - print information about a glock holder
2093 * @str: a string naming the type of holder
2094 * @gh: the glock holder
2095 *
2096 * Returns: 0 on success, -ENOBUFS when we run out of space
2097 */
2098
2099static int dump_holder(char *str, struct gfs2_holder *gh)
2100{
2101 unsigned int x;
2102 int error = -ENOBUFS;
2103
2104 printk(KERN_INFO " %s\n", str);
2105 printk(KERN_INFO " owner = %ld\n",
2106 (gh->gh_owner) ? (long)gh->gh_owner->pid : -1);
2107 printk(KERN_INFO " gh_state = %u\n", gh->gh_state);
2108 printk(KERN_INFO " gh_flags =");
2109 for (x = 0; x < 32; x++)
2110 if (gh->gh_flags & (1 << x))
2111 printk(" %u", x);
2112 printk(" \n");
2113 printk(KERN_INFO " error = %d\n", gh->gh_error);
2114 printk(KERN_INFO " gh_iflags =");
2115 for (x = 0; x < 32; x++)
2116 if (test_bit(x, &gh->gh_iflags))
2117 printk(" %u", x);
2118 printk(" \n");
2119 print_symbol(KERN_INFO " initialized at: %s\n", gh->gh_ip);
2120
2121 error = 0;
2122
2123 return error;
2124}
2125
2126/**
2127 * dump_inode - print information about an inode
2128 * @ip: the inode
2129 *
2130 * Returns: 0 on success, -ENOBUFS when we run out of space
2131 */
2132
2133static int dump_inode(struct gfs2_inode *ip)
2134{
2135 unsigned int x;
2136 int error = -ENOBUFS;
2137
2138 printk(KERN_INFO " Inode:\n");
2139 printk(KERN_INFO " num = %llu %llu\n",
2140 (unsigned long long)ip->i_num.no_formal_ino,
2141 (unsigned long long)ip->i_num.no_addr);
2142 printk(KERN_INFO " type = %u\n", IF2DT(ip->i_di.di_mode));
2143 printk(KERN_INFO " i_flags =");
2144 for (x = 0; x < 32; x++)
2145 if (test_bit(x, &ip->i_flags))
2146 printk(" %u", x);
2147 printk(" \n");
2148
2149 error = 0;
2150
2151 return error;
2152}
2153
2154/**
2155 * dump_glock - print information about a glock
2156 * @gl: the glock
2157 * @count: where we are in the buffer
2158 *
2159 * Returns: 0 on success, -ENOBUFS when we run out of space
2160 */
2161
2162static int dump_glock(struct gfs2_glock *gl)
2163{
2164 struct gfs2_holder *gh;
2165 unsigned int x;
2166 int error = -ENOBUFS;
2167
2168 spin_lock(&gl->gl_spin);
2169
2170 printk(KERN_INFO "Glock (%u, %llu)\n", gl->gl_name.ln_type,
2171 (unsigned long long)gl->gl_name.ln_number);
2172 printk(KERN_INFO " gl_flags =");
2173 for (x = 0; x < 32; x++)
2174 if (test_bit(x, &gl->gl_flags))
2175 printk(" %u", x);
2176 printk(" \n");
2177 printk(KERN_INFO " gl_ref = %d\n", atomic_read(&gl->gl_ref.refcount));
2178 printk(KERN_INFO " gl_state = %u\n", gl->gl_state);
2179 printk(KERN_INFO " gl_owner = %s\n", gl->gl_owner->comm);
2180 print_symbol(KERN_INFO " gl_ip = %s\n", gl->gl_ip);
2181 printk(KERN_INFO " req_gh = %s\n", (gl->gl_req_gh) ? "yes" : "no");
2182 printk(KERN_INFO " req_bh = %s\n", (gl->gl_req_bh) ? "yes" : "no");
2183 printk(KERN_INFO " lvb_count = %d\n", atomic_read(&gl->gl_lvb_count));
2184 printk(KERN_INFO " object = %s\n", (gl->gl_object) ? "yes" : "no");
2185 printk(KERN_INFO " le = %s\n",
2186 (list_empty(&gl->gl_le.le_list)) ? "no" : "yes");
2187 printk(KERN_INFO " reclaim = %s\n",
2188 (list_empty(&gl->gl_reclaim)) ? "no" : "yes");
2189 if (gl->gl_aspace)
2190 printk(KERN_INFO " aspace = %lu\n",
2191 gl->gl_aspace->i_mapping->nrpages);
2192 else
2193 printk(KERN_INFO " aspace = no\n");
2194 printk(KERN_INFO " ail = %d\n", atomic_read(&gl->gl_ail_count));
2195 if (gl->gl_req_gh) {
2196 error = dump_holder("Request", gl->gl_req_gh);
2197 if (error)
2198 goto out;
2199 }
2200 list_for_each_entry(gh, &gl->gl_holders, gh_list) {
2201 error = dump_holder("Holder", gh);
2202 if (error)
2203 goto out;
2204 }
2205 list_for_each_entry(gh, &gl->gl_waiters1, gh_list) {
2206 error = dump_holder("Waiter1", gh);
2207 if (error)
2208 goto out;
2209 }
2210 list_for_each_entry(gh, &gl->gl_waiters2, gh_list) {
2211 error = dump_holder("Waiter2", gh);
2212 if (error)
2213 goto out;
2214 }
2215 list_for_each_entry(gh, &gl->gl_waiters3, gh_list) {
2216 error = dump_holder("Waiter3", gh);
2217 if (error)
2218 goto out;
2219 }
2220 if (gl->gl_ops == &gfs2_inode_glops && gl->gl_object) {
2221 if (!test_bit(GLF_LOCK, &gl->gl_flags) &&
2222 list_empty(&gl->gl_holders)) {
2223 error = dump_inode(gl->gl_object);
2224 if (error)
2225 goto out;
2226 } else {
2227 error = -ENOBUFS;
2228 printk(KERN_INFO " Inode: busy\n");
2229 }
2230 }
2231
2232 error = 0;
2233
2234 out:
2235 spin_unlock(&gl->gl_spin);
2236
2237 return error;
2238}
2239
2240/**
2241 * gfs2_dump_lockstate - print out the current lockstate
2242 * @sdp: the filesystem
2243 * @ub: the buffer to copy the information into
2244 *
2245 * If @ub is NULL, dump the lockstate to the console.
2246 *
2247 */
2248
2249static int gfs2_dump_lockstate(struct gfs2_sbd *sdp)
2250{
2251 struct gfs2_gl_hash_bucket *bucket;
2252 struct gfs2_glock *gl;
2253 unsigned int x;
2254 int error = 0;
2255
2256 for (x = 0; x < GFS2_GL_HASH_SIZE; x++) {
2257 bucket = &sdp->sd_gl_hash[x];
2258
2259 read_lock(&bucket->hb_lock);
2260
2261 list_for_each_entry(gl, &bucket->hb_list, gl_list) {
2262 if (test_bit(GLF_PLUG, &gl->gl_flags))
2263 continue;
2264
2265 error = dump_glock(gl);
2266 if (error)
2267 break;
2268 }
2269
2270 read_unlock(&bucket->hb_lock);
2271
2272 if (error)
2273 break;
2274 }
2275
2276
2277 return error;
2278}
2279
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
new file mode 100644
index 000000000000..fdf58db44ae3
--- /dev/null
+++ b/fs/gfs2/glock.h
@@ -0,0 +1,152 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __GLOCK_DOT_H__
11#define __GLOCK_DOT_H__
12
13/* Flags for lock requests; used in gfs2_holder gh_flag field.
14 From lm_interface.h:
15#define LM_FLAG_TRY 0x00000001
16#define LM_FLAG_TRY_1CB 0x00000002
17#define LM_FLAG_NOEXP 0x00000004
18#define LM_FLAG_ANY 0x00000008
19#define LM_FLAG_PRIORITY 0x00000010 */
20
21#define GL_LOCAL_EXCL 0x00000020
22#define GL_ASYNC 0x00000040
23#define GL_EXACT 0x00000080
24#define GL_SKIP 0x00000100
25#define GL_ATIME 0x00000200
26#define GL_NOCACHE 0x00000400
27#define GL_SYNC 0x00000800
28#define GL_NOCANCEL 0x00001000
29#define GL_AOP 0x00004000
30#define GL_DUMP 0x00008000
31
32#define GLR_TRYFAILED 13
33#define GLR_CANCELED 14
34
35static inline int gfs2_glock_is_locked_by_me(struct gfs2_glock *gl)
36{
37 struct gfs2_holder *gh;
38 int locked = 0;
39
40 /* Look in glock's list of holders for one with current task as owner */
41 spin_lock(&gl->gl_spin);
42 list_for_each_entry(gh, &gl->gl_holders, gh_list) {
43 if (gh->gh_owner == current) {
44 locked = 1;
45 break;
46 }
47 }
48 spin_unlock(&gl->gl_spin);
49
50 return locked;
51}
52
53static inline int gfs2_glock_is_held_excl(struct gfs2_glock *gl)
54{
55 return (gl->gl_state == LM_ST_EXCLUSIVE);
56}
57
58static inline int gfs2_glock_is_held_dfrd(struct gfs2_glock *gl)
59{
60 return (gl->gl_state == LM_ST_DEFERRED);
61}
62
63static inline int gfs2_glock_is_held_shrd(struct gfs2_glock *gl)
64{
65 return (gl->gl_state == LM_ST_SHARED);
66}
67
68static inline int gfs2_glock_is_blocking(struct gfs2_glock *gl)
69{
70 int ret;
71 spin_lock(&gl->gl_spin);
72 ret = !list_empty(&gl->gl_waiters2) || !list_empty(&gl->gl_waiters3);
73 spin_unlock(&gl->gl_spin);
74 return ret;
75}
76
77int gfs2_glock_get(struct gfs2_sbd *sdp,
78 uint64_t number, struct gfs2_glock_operations *glops,
79 int create, struct gfs2_glock **glp);
80void gfs2_glock_hold(struct gfs2_glock *gl);
81int gfs2_glock_put(struct gfs2_glock *gl);
82void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags,
83 struct gfs2_holder *gh);
84void gfs2_holder_reinit(unsigned int state, unsigned flags,
85 struct gfs2_holder *gh);
86void gfs2_holder_uninit(struct gfs2_holder *gh);
87
88void gfs2_glock_xmote_th(struct gfs2_glock *gl, unsigned int state, int flags);
89void gfs2_glock_drop_th(struct gfs2_glock *gl);
90
91int gfs2_glock_nq(struct gfs2_holder *gh);
92int gfs2_glock_poll(struct gfs2_holder *gh);
93int gfs2_glock_wait(struct gfs2_holder *gh);
94void gfs2_glock_dq(struct gfs2_holder *gh);
95
96int gfs2_glock_be_greedy(struct gfs2_glock *gl, unsigned int time);
97
98void gfs2_glock_dq_uninit(struct gfs2_holder *gh);
99int gfs2_glock_nq_num(struct gfs2_sbd *sdp,
100 uint64_t number, struct gfs2_glock_operations *glops,
101 unsigned int state, int flags, struct gfs2_holder *gh);
102
103int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs);
104void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs);
105void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs);
106
107void gfs2_glock_prefetch_num(struct gfs2_sbd *sdp, uint64_t number,
108 struct gfs2_glock_operations *glops,
109 unsigned int state, int flags);
110void gfs2_glock_inode_squish(struct inode *inode);
111
/**
 * gfs2_glock_nq_init - initialize a holder and enqueue it on a glock
 * @gl: the glock
 * @state: the state we're requesting
 * @flags: the modifier flags
 * @gh: the holder structure
 *
 * If enqueueing fails the holder is uninitialized again before
 * returning, so the caller has nothing to clean up.
 *
 * Returns: 0, GLR_*, or errno
 */

static inline int gfs2_glock_nq_init(struct gfs2_glock *gl,
				     unsigned int state, int flags,
				     struct gfs2_holder *gh)
{
	int error;

	gfs2_holder_init(gl, state, flags, gh);

	error = gfs2_glock_nq(gh);
	if (!error)
		return 0;

	gfs2_holder_uninit(gh);
	return error;
}
136
137/* Lock Value Block functions */
138
139int gfs2_lvb_hold(struct gfs2_glock *gl);
140void gfs2_lvb_unhold(struct gfs2_glock *gl);
141
142void gfs2_glock_cb(lm_fsdata_t *fsdata, unsigned int type, void *data);
143
144void gfs2_iopen_go_callback(struct gfs2_glock *gl, unsigned int state);
145
146void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl);
147void gfs2_reclaim_glock(struct gfs2_sbd *sdp);
148
149void gfs2_scand_internal(struct gfs2_sbd *sdp);
150void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait);
151
152#endif /* __GLOCK_DOT_H__ */
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
new file mode 100644
index 000000000000..75d4c50cff45
--- /dev/null
+++ b/fs/gfs2/glops.c
@@ -0,0 +1,564 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/gfs2_ondisk.h>
16
17#include "gfs2.h"
18#include "lm_interface.h"
19#include "incore.h"
20#include "bmap.h"
21#include "glock.h"
22#include "glops.h"
23#include "inode.h"
24#include "log.h"
25#include "meta_io.h"
26#include "recovery.h"
27#include "rgrp.h"
28#include "util.h"
29
30
31/**
32 * gfs2_pte_inval - Sync and invalidate all PTEs associated with a glock
33 * @gl: the glock
34 *
35 */
36
37static void gfs2_pte_inval(struct gfs2_glock *gl)
38{
39 struct gfs2_inode *ip;
40 struct inode *inode;
41
42 ip = gl->gl_object;
43 inode = &ip->i_inode;
44 if (!ip || !S_ISREG(ip->i_di.di_mode))
45 return;
46
47 if (!test_bit(GIF_PAGED, &ip->i_flags))
48 return;
49
50 unmap_shared_mapping_range(inode->i_mapping, 0, 0);
51
52 if (test_bit(GIF_SW_PAGED, &ip->i_flags))
53 set_bit(GLF_DIRTY, &gl->gl_flags);
54
55 clear_bit(GIF_SW_PAGED, &ip->i_flags);
56}
57
58/**
59 * gfs2_page_inval - Invalidate all pages associated with a glock
60 * @gl: the glock
61 *
62 */
63
64static void gfs2_page_inval(struct gfs2_glock *gl)
65{
66 struct gfs2_inode *ip;
67 struct inode *inode;
68
69 ip = gl->gl_object;
70 inode = &ip->i_inode;
71 if (!ip || !S_ISREG(ip->i_di.di_mode))
72 return;
73
74 truncate_inode_pages(inode->i_mapping, 0);
75 gfs2_assert_withdraw(GFS2_SB(&ip->i_inode), !inode->i_mapping->nrpages);
76 clear_bit(GIF_PAGED, &ip->i_flags);
77}
78
79/**
80 * gfs2_page_sync - Sync the data pages (not metadata) associated with a glock
81 * @gl: the glock
82 * @flags: DIO_START | DIO_WAIT
83 *
84 * Syncs data (not metadata) for a regular file.
85 * No-op for all other types.
86 */
87
88static void gfs2_page_sync(struct gfs2_glock *gl, int flags)
89{
90 struct gfs2_inode *ip;
91 struct inode *inode;
92 struct address_space *mapping;
93 int error = 0;
94
95 ip = gl->gl_object;
96 inode = &ip->i_inode;
97 if (!ip || !S_ISREG(ip->i_di.di_mode))
98 return;
99
100 mapping = inode->i_mapping;
101
102 if (flags & DIO_START)
103 filemap_fdatawrite(mapping);
104 if (!error && (flags & DIO_WAIT))
105 error = filemap_fdatawait(mapping);
106
107 /* Put back any errors cleared by filemap_fdatawait()
108 so they can be caught by someone who can pass them
109 up to user space. */
110
111 if (error == -ENOSPC)
112 set_bit(AS_ENOSPC, &mapping->flags);
113 else if (error)
114 set_bit(AS_EIO, &mapping->flags);
115
116}
117
118/**
119 * meta_go_sync - sync out the metadata for this glock
120 * @gl: the glock
121 * @flags: DIO_*
122 *
123 * Called when demoting or unlocking an EX glock. We must flush
124 * to disk all dirty buffers/pages relating to this glock, and must not
125 * not return to caller to demote/unlock the glock until I/O is complete.
126 */
127
128static void meta_go_sync(struct gfs2_glock *gl, int flags)
129{
130 if (!(flags & DIO_METADATA))
131 return;
132
133 if (test_and_clear_bit(GLF_DIRTY, &gl->gl_flags)) {
134 gfs2_log_flush(gl->gl_sbd, gl);
135 gfs2_meta_sync(gl, flags | DIO_START | DIO_WAIT);
136 if (flags & DIO_RELEASE)
137 gfs2_ail_empty_gl(gl);
138 }
139
140 clear_bit(GLF_SYNC, &gl->gl_flags);
141}
142
143/**
144 * meta_go_inval - invalidate the metadata for this glock
145 * @gl: the glock
146 * @flags:
147 *
148 */
149
150static void meta_go_inval(struct gfs2_glock *gl, int flags)
151{
152 if (!(flags & DIO_METADATA))
153 return;
154
155 gfs2_meta_inval(gl);
156 gl->gl_vn++;
157}
158
159/**
160 * inode_go_xmote_th - promote/demote a glock
161 * @gl: the glock
162 * @state: the requested state
163 * @flags:
164 *
165 */
166
167static void inode_go_xmote_th(struct gfs2_glock *gl, unsigned int state,
168 int flags)
169{
170 if (gl->gl_state != LM_ST_UNLOCKED)
171 gfs2_pte_inval(gl);
172 gfs2_glock_xmote_th(gl, state, flags);
173}
174
175/**
176 * inode_go_xmote_bh - After promoting/demoting a glock
177 * @gl: the glock
178 *
179 */
180
181static void inode_go_xmote_bh(struct gfs2_glock *gl)
182{
183 struct gfs2_holder *gh = gl->gl_req_gh;
184 struct buffer_head *bh;
185 int error;
186
187 if (gl->gl_state != LM_ST_UNLOCKED &&
188 (!gh || !(gh->gh_flags & GL_SKIP))) {
189 error = gfs2_meta_read(gl, gl->gl_name.ln_number, DIO_START,
190 &bh);
191 if (!error)
192 brelse(bh);
193 }
194}
195
/**
 * inode_go_drop_th - unlock a glock
 * @gl: the glock
 *
 * Invoked from rq_demote().
 * Another node needs the lock in EXCLUSIVE mode, or lock (unused for too long)
 * is being purged from our node's glock cache; we're dropping lock.
 *
 * Shared mappings are unmapped before the lock is dropped.
 */

static void inode_go_drop_th(struct gfs2_glock *gl)
{
	/* Unmap first, then hand the lock back */
	gfs2_pte_inval(gl);
	gfs2_glock_drop_th(gl);
}
210
211/**
212 * inode_go_sync - Sync the dirty data and/or metadata for an inode glock
213 * @gl: the glock protecting the inode
214 * @flags:
215 *
216 */
217
218static void inode_go_sync(struct gfs2_glock *gl, int flags)
219{
220 int meta = (flags & DIO_METADATA);
221 int data = (flags & DIO_DATA);
222
223 if (test_bit(GLF_DIRTY, &gl->gl_flags)) {
224 if (meta && data) {
225 gfs2_page_sync(gl, flags | DIO_START);
226 gfs2_log_flush(gl->gl_sbd, gl);
227 gfs2_meta_sync(gl, flags | DIO_START | DIO_WAIT);
228 gfs2_page_sync(gl, flags | DIO_WAIT);
229 clear_bit(GLF_DIRTY, &gl->gl_flags);
230 } else if (meta) {
231 gfs2_log_flush(gl->gl_sbd, gl);
232 gfs2_meta_sync(gl, flags | DIO_START | DIO_WAIT);
233 } else if (data)
234 gfs2_page_sync(gl, flags | DIO_START | DIO_WAIT);
235 if (flags & DIO_RELEASE)
236 gfs2_ail_empty_gl(gl);
237 }
238
239 clear_bit(GLF_SYNC, &gl->gl_flags);
240}
241
242/**
243 * inode_go_inval - prepare a inode glock to be released
244 * @gl: the glock
245 * @flags:
246 *
247 */
248
249static void inode_go_inval(struct gfs2_glock *gl, int flags)
250{
251 int meta = (flags & DIO_METADATA);
252 int data = (flags & DIO_DATA);
253
254 if (meta) {
255 gfs2_meta_inval(gl);
256 gl->gl_vn++;
257 }
258 if (data)
259 gfs2_page_inval(gl);
260}
261
262/**
263 * inode_go_demote_ok - Check to see if it's ok to unlock an inode glock
264 * @gl: the glock
265 *
266 * Returns: 1 if it's ok
267 */
268
269static int inode_go_demote_ok(struct gfs2_glock *gl)
270{
271 struct gfs2_sbd *sdp = gl->gl_sbd;
272 int demote = 0;
273
274 if (!gl->gl_object && !gl->gl_aspace->i_mapping->nrpages)
275 demote = 1;
276 else if (!sdp->sd_args.ar_localcaching &&
277 time_after_eq(jiffies, gl->gl_stamp +
278 gfs2_tune_get(sdp, gt_demote_secs) * HZ))
279 demote = 1;
280
281 return demote;
282}
283
284/**
285 * inode_go_lock - operation done after an inode lock is locked by a process
286 * @gl: the glock
287 * @flags:
288 *
289 * Returns: errno
290 */
291
292static int inode_go_lock(struct gfs2_holder *gh)
293{
294 struct gfs2_glock *gl = gh->gh_gl;
295 struct gfs2_inode *ip = gl->gl_object;
296 int error = 0;
297
298 if (!ip)
299 return 0;
300
301 if (ip->i_vn != gl->gl_vn) {
302 error = gfs2_inode_refresh(ip);
303 if (error)
304 return error;
305 gfs2_inode_attr_in(ip);
306 }
307
308 if ((ip->i_di.di_flags & GFS2_DIF_TRUNC_IN_PROG) &&
309 (gl->gl_state == LM_ST_EXCLUSIVE) &&
310 (gh->gh_flags & GL_LOCAL_EXCL))
311 error = gfs2_truncatei_resume(ip);
312
313 return error;
314}
315
316/**
317 * inode_go_unlock - operation done before an inode lock is unlocked by a
318 * process
319 * @gl: the glock
320 * @flags:
321 *
322 */
323
324static void inode_go_unlock(struct gfs2_holder *gh)
325{
326 struct gfs2_glock *gl = gh->gh_gl;
327 struct gfs2_inode *ip = gl->gl_object;
328
329 if (ip) {
330 if (test_bit(GLF_DIRTY, &gl->gl_flags))
331 gfs2_inode_attr_in(ip);
332
333 gfs2_meta_cache_flush(ip);
334 }
335}
336
337/**
338 * inode_greedy -
339 * @gl: the glock
340 *
341 */
342
343static void inode_greedy(struct gfs2_glock *gl)
344{
345 struct gfs2_sbd *sdp = gl->gl_sbd;
346 struct gfs2_inode *ip = gl->gl_object;
347 unsigned int quantum = gfs2_tune_get(sdp, gt_greedy_quantum);
348 unsigned int max = gfs2_tune_get(sdp, gt_greedy_max);
349 unsigned int new_time;
350
351 spin_lock(&ip->i_spin);
352
353 if (time_after(ip->i_last_pfault + quantum, jiffies)) {
354 new_time = ip->i_greedy + quantum;
355 if (new_time > max)
356 new_time = max;
357 } else {
358 new_time = ip->i_greedy - quantum;
359 if (!new_time || new_time > max)
360 new_time = 1;
361 }
362
363 ip->i_greedy = new_time;
364
365 spin_unlock(&ip->i_spin);
366
367 iput(&ip->i_inode);
368}
369
370/**
371 * rgrp_go_demote_ok - Check to see if it's ok to unlock a RG's glock
372 * @gl: the glock
373 *
374 * Returns: 1 if it's ok
375 */
376
377static int rgrp_go_demote_ok(struct gfs2_glock *gl)
378{
379 return !gl->gl_aspace->i_mapping->nrpages;
380}
381
382/**
383 * rgrp_go_lock - operation done after an rgrp lock is locked by
384 * a first holder on this node.
385 * @gl: the glock
386 * @flags:
387 *
388 * Returns: errno
389 */
390
391static int rgrp_go_lock(struct gfs2_holder *gh)
392{
393 return gfs2_rgrp_bh_get(gh->gh_gl->gl_object);
394}
395
396/**
397 * rgrp_go_unlock - operation done before an rgrp lock is unlocked by
398 * a last holder on this node.
399 * @gl: the glock
400 * @flags:
401 *
402 */
403
404static void rgrp_go_unlock(struct gfs2_holder *gh)
405{
406 gfs2_rgrp_bh_put(gh->gh_gl->gl_object);
407}
408
409/**
410 * trans_go_xmote_th - promote/demote the transaction glock
411 * @gl: the glock
412 * @state: the requested state
413 * @flags:
414 *
415 */
416
417static void trans_go_xmote_th(struct gfs2_glock *gl, unsigned int state,
418 int flags)
419{
420 struct gfs2_sbd *sdp = gl->gl_sbd;
421
422 if (gl->gl_state != LM_ST_UNLOCKED &&
423 test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
424 gfs2_meta_syncfs(sdp);
425 gfs2_log_shutdown(sdp);
426 }
427
428 gfs2_glock_xmote_th(gl, state, flags);
429}
430
431/**
432 * trans_go_xmote_bh - After promoting/demoting the transaction glock
433 * @gl: the glock
434 *
435 */
436
437static void trans_go_xmote_bh(struct gfs2_glock *gl)
438{
439 struct gfs2_sbd *sdp = gl->gl_sbd;
440 struct gfs2_inode *ip = GFS2_I(sdp->sd_jdesc->jd_inode);
441 struct gfs2_glock *j_gl = ip->i_gl;
442 struct gfs2_log_header head;
443 int error;
444
445 if (gl->gl_state != LM_ST_UNLOCKED &&
446 test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
447 gfs2_meta_cache_flush(GFS2_I(sdp->sd_jdesc->jd_inode));
448 j_gl->gl_ops->go_inval(j_gl, DIO_METADATA | DIO_DATA);
449
450 error = gfs2_find_jhead(sdp->sd_jdesc, &head);
451 if (error)
452 gfs2_consist(sdp);
453 if (!(head.lh_flags & GFS2_LOG_HEAD_UNMOUNT))
454 gfs2_consist(sdp);
455
456 /* Initialize some head of the log stuff */
457 if (!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) {
458 sdp->sd_log_sequence = head.lh_sequence + 1;
459 gfs2_log_pointers_init(sdp, head.lh_blkno);
460 }
461 }
462}
463
464/**
465 * trans_go_drop_th - unlock the transaction glock
466 * @gl: the glock
467 *
468 * We want to sync the device even with localcaching. Remember
469 * that localcaching journal replay only marks buffers dirty.
470 */
471
472static void trans_go_drop_th(struct gfs2_glock *gl)
473{
474 struct gfs2_sbd *sdp = gl->gl_sbd;
475
476 if (test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
477 gfs2_meta_syncfs(sdp);
478 gfs2_log_shutdown(sdp);
479 }
480
481 gfs2_glock_drop_th(gl);
482}
483
484/**
485 * quota_go_demote_ok - Check to see if it's ok to unlock a quota glock
486 * @gl: the glock
487 *
488 * Returns: 1 if it's ok
489 */
490
491static int quota_go_demote_ok(struct gfs2_glock *gl)
492{
493 return !atomic_read(&gl->gl_lvb_count);
494}
495
496struct gfs2_glock_operations gfs2_meta_glops = {
497 .go_xmote_th = gfs2_glock_xmote_th,
498 .go_drop_th = gfs2_glock_drop_th,
499 .go_type = LM_TYPE_META
500};
501
502struct gfs2_glock_operations gfs2_inode_glops = {
503 .go_xmote_th = inode_go_xmote_th,
504 .go_xmote_bh = inode_go_xmote_bh,
505 .go_drop_th = inode_go_drop_th,
506 .go_sync = inode_go_sync,
507 .go_inval = inode_go_inval,
508 .go_demote_ok = inode_go_demote_ok,
509 .go_lock = inode_go_lock,
510 .go_unlock = inode_go_unlock,
511 .go_greedy = inode_greedy,
512 .go_type = LM_TYPE_INODE
513};
514
515struct gfs2_glock_operations gfs2_rgrp_glops = {
516 .go_xmote_th = gfs2_glock_xmote_th,
517 .go_drop_th = gfs2_glock_drop_th,
518 .go_sync = meta_go_sync,
519 .go_inval = meta_go_inval,
520 .go_demote_ok = rgrp_go_demote_ok,
521 .go_lock = rgrp_go_lock,
522 .go_unlock = rgrp_go_unlock,
523 .go_type = LM_TYPE_RGRP
524};
525
526struct gfs2_glock_operations gfs2_trans_glops = {
527 .go_xmote_th = trans_go_xmote_th,
528 .go_xmote_bh = trans_go_xmote_bh,
529 .go_drop_th = trans_go_drop_th,
530 .go_type = LM_TYPE_NONDISK
531};
532
533struct gfs2_glock_operations gfs2_iopen_glops = {
534 .go_xmote_th = gfs2_glock_xmote_th,
535 .go_drop_th = gfs2_glock_drop_th,
536 .go_callback = gfs2_iopen_go_callback,
537 .go_type = LM_TYPE_IOPEN
538};
539
540struct gfs2_glock_operations gfs2_flock_glops = {
541 .go_xmote_th = gfs2_glock_xmote_th,
542 .go_drop_th = gfs2_glock_drop_th,
543 .go_type = LM_TYPE_FLOCK
544};
545
546struct gfs2_glock_operations gfs2_nondisk_glops = {
547 .go_xmote_th = gfs2_glock_xmote_th,
548 .go_drop_th = gfs2_glock_drop_th,
549 .go_type = LM_TYPE_NONDISK
550};
551
552struct gfs2_glock_operations gfs2_quota_glops = {
553 .go_xmote_th = gfs2_glock_xmote_th,
554 .go_drop_th = gfs2_glock_drop_th,
555 .go_demote_ok = quota_go_demote_ok,
556 .go_type = LM_TYPE_QUOTA
557};
558
559struct gfs2_glock_operations gfs2_journal_glops = {
560 .go_xmote_th = gfs2_glock_xmote_th,
561 .go_drop_th = gfs2_glock_drop_th,
562 .go_type = LM_TYPE_JOURNAL
563};
564
diff --git a/fs/gfs2/glops.h b/fs/gfs2/glops.h
new file mode 100644
index 000000000000..5c1e9491024f
--- /dev/null
+++ b/fs/gfs2/glops.h
@@ -0,0 +1,23 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __GLOPS_DOT_H__
11#define __GLOPS_DOT_H__
12
13extern struct gfs2_glock_operations gfs2_meta_glops;
14extern struct gfs2_glock_operations gfs2_inode_glops;
15extern struct gfs2_glock_operations gfs2_rgrp_glops;
16extern struct gfs2_glock_operations gfs2_trans_glops;
17extern struct gfs2_glock_operations gfs2_iopen_glops;
18extern struct gfs2_glock_operations gfs2_flock_glops;
19extern struct gfs2_glock_operations gfs2_nondisk_glops;
20extern struct gfs2_glock_operations gfs2_quota_glops;
21extern struct gfs2_glock_operations gfs2_journal_glops;
22
23#endif /* __GLOPS_DOT_H__ */
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
new file mode 100644
index 000000000000..90e0624d8065
--- /dev/null
+++ b/fs/gfs2/incore.h
@@ -0,0 +1,658 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __INCORE_DOT_H__
11#define __INCORE_DOT_H__
12
13#define DIO_FORCE 0x00000001
14#define DIO_CLEAN 0x00000002
15#define DIO_DIRTY 0x00000004
16#define DIO_START 0x00000008
17#define DIO_WAIT 0x00000010
18#define DIO_METADATA 0x00000020
19#define DIO_DATA 0x00000040
20#define DIO_RELEASE 0x00000080
21#define DIO_ALL 0x00000100
22
23struct gfs2_log_operations;
24struct gfs2_log_element;
25struct gfs2_bitmap;
26struct gfs2_rgrpd;
27struct gfs2_bufdata;
28struct gfs2_glock_operations;
29struct gfs2_holder;
30struct gfs2_glock;
31struct gfs2_alloc;
32struct gfs2_inode;
33struct gfs2_file;
34struct gfs2_revoke;
35struct gfs2_revoke_replay;
36struct gfs2_quota_data;
37struct gfs2_log_buf;
38struct gfs2_trans;
39struct gfs2_ail;
40struct gfs2_jdesc;
41struct gfs2_args;
42struct gfs2_tune;
43struct gfs2_gl_hash_bucket;
44struct gfs2_sbd;
45
46typedef void (*gfs2_glop_bh_t) (struct gfs2_glock *gl, unsigned int ret);
47
48/*
49 * Structure of operations that are associated with each
50 * type of element in the log.
 *
 * A log element points at its table through gfs2_log_element.le_ops.
 * lo_name is a string naming the operation set.
 *
 * NOTE(review): judging by the signatures, lo_add through
 * lo_after_commit act on a live filesystem (sdp), while
 * lo_before_scan through lo_after_scan act on a journal descriptor
 * (jd) and take a pass number - confirm against the implementations.
51 */
52
53struct gfs2_log_operations {
54 void (*lo_add) (struct gfs2_sbd *sdp, struct gfs2_log_element *le);
55 void (*lo_incore_commit) (struct gfs2_sbd *sdp, struct gfs2_trans *tr);
56 void (*lo_before_commit) (struct gfs2_sbd *sdp);
57 void (*lo_after_commit) (struct gfs2_sbd *sdp, struct gfs2_ail *ai);
58 void (*lo_before_scan) (struct gfs2_jdesc *jd,
59 struct gfs2_log_header *head, int pass);
60 int (*lo_scan_elements) (struct gfs2_jdesc *jd, unsigned int start,
61 struct gfs2_log_descriptor *ld, __be64 *ptr,
62 int pass);
63 void (*lo_after_scan) (struct gfs2_jdesc *jd, int error, int pass);
64 const char *lo_name;
65};
66
/*
 * List linkage plus a pointer to the operations table for one log
 * element.  In this header it is embedded in gfs2_rgrpd (rd_le),
 * gfs2_bufdata (bd_le), gfs2_glock (gl_le) and gfs2_revoke (rv_le).
 */
67struct gfs2_log_element {
68 struct list_head le_list;
69 const struct gfs2_log_operations *le_ops;
70};
71
/*
 * Descriptor for one bitmap buffer.  gfs2_rgrpd.rd_bits points at
 * these.
 *
 * NOTE(review): presumably bi_offset/bi_start/bi_len locate the
 * bitmap bytes within the buffer and within the resource group, and
 * bi_clone is a copy of the bitmap data - confirm in rgrp.c.
 */
72struct gfs2_bitmap {
73 struct buffer_head *bi_bh;
74 char *bi_clone;
75 uint32_t bi_offset;
76 uint32_t bi_start;
77 uint32_t bi_len;
78};
79
/*
 * In-core descriptor for one resource group (rgrp).  It carries the
 * rgrp's glock, copies of its gfs2_rindex and gfs2_rgrp structures,
 * a pointer to its bitmap descriptors, a log element and a back
 * pointer to the owning superblock data.
 */
80struct gfs2_rgrpd {
81 struct list_head rd_list; /* Link with superblock */
82 struct list_head rd_list_mru;
83 struct list_head rd_recent; /* Recently used rgrps */
84 struct gfs2_glock *rd_gl; /* Glock for this rgrp */
85 struct gfs2_rindex rd_ri;
86 struct gfs2_rgrp rd_rg;
87 uint64_t rd_rg_vn;
88 struct gfs2_bitmap *rd_bits;
89 unsigned int rd_bh_count;
90 struct mutex rd_mutex;
91 uint32_t rd_free_clone;
92 struct gfs2_log_element rd_le;
93 uint32_t rd_last_alloc_data;
94 uint32_t rd_last_alloc_meta;
95 struct gfs2_sbd *rd_sbd;
96};
97
/*
 * GFS2-private buffer_head state bits, numbered from BH_PrivateStart.
 * The BUFFER_FNS/TAS_BUFFER_FNS invocations below generate the
 * accessor helpers for the Pinned and Escaped bits.
 */
98enum gfs2_state_bits {
99 BH_Pinned = BH_PrivateStart,
100 BH_Escaped = BH_PrivateStart + 1,
101};
102
103BUFFER_FNS(Pinned, pinned)
104TAS_BUFFER_FNS(Pinned, pinned)
105BUFFER_FNS(Escaped, escaped)
106TAS_BUFFER_FNS(Escaped, escaped)
107
/*
 * Per-buffer bookkeeping: ties a buffer_head to a glock and carries
 * the list linkage for a transaction list (bd_list_tr), a log element
 * (bd_le) and two AIL lists (bd_ail_st_list, bd_ail_gl_list), plus a
 * pointer to a gfs2_ail.
 */
108struct gfs2_bufdata {
109 struct buffer_head *bd_bh;
110 struct gfs2_glock *bd_gl;
111
112 struct list_head bd_list_tr;
113 struct gfs2_log_element bd_le;
114
115 struct gfs2_ail *bd_ail;
116 struct list_head bd_ail_st_list;
117 struct list_head bd_ail_gl_list;
118};
119
/*
 * Per-lock-type callback table; a glock refers to its table through
 * gfs2_glock.gl_ops.  Callbacks are optional: the tables defined in
 * glops.c leave unused members NULL.  go_type holds the LM_TYPE_*
 * value of the lock type the table serves.
 *
 * NOTE(review): the _th/_bh suffixes presumably mean "top half" and
 * "bottom half" of a state change - confirm in glock.c.
 */
120struct gfs2_glock_operations {
121 void (*go_xmote_th) (struct gfs2_glock * gl, unsigned int state,
122 int flags);
123 void (*go_xmote_bh) (struct gfs2_glock * gl);
124 void (*go_drop_th) (struct gfs2_glock * gl);
125 void (*go_drop_bh) (struct gfs2_glock * gl);
126 void (*go_sync) (struct gfs2_glock * gl, int flags);
127 void (*go_inval) (struct gfs2_glock * gl, int flags);
128 int (*go_demote_ok) (struct gfs2_glock * gl);
129 int (*go_lock) (struct gfs2_holder * gh);
130 void (*go_unlock) (struct gfs2_holder * gh);
131 void (*go_callback) (struct gfs2_glock * gl, unsigned int state);
132 void (*go_greedy) (struct gfs2_glock * gl);
133 int go_type;
134};
135
/*
 * HIF_* holder flag numbers, split into "actions" and "states".
 * Note that the numbering is not contiguous: value 8 is unused
 * (HIF_FIRST is 7, HIF_ABORTED is 9).
 *
 * NOTE(review): presumably these are bit numbers for
 * gfs2_holder.gh_iflags - confirm against the users.
 */
136enum {
137 /* Actions */
138 HIF_MUTEX = 0,
139 HIF_PROMOTE = 1,
140 HIF_DEMOTE = 2,
141 HIF_GREEDY = 3,
142
143 /* States */
144 HIF_ALLOCED = 4,
145 HIF_DEALLOC = 5,
146 HIF_HOLDER = 6,
147 HIF_FIRST = 7,
148 HIF_ABORTED = 9,
149};
150
/*
 * One holder of (or request for) a glock: the glock, the owning
 * task, the requested state and flags, an error slot and a
 * completion.
 *
 * NOTE(review): gh_ip presumably records the caller's instruction
 * pointer for debugging - confirm in glock.c.
 */
151struct gfs2_holder {
152 struct list_head gh_list;
153
154 struct gfs2_glock *gh_gl;
155 struct task_struct *gh_owner;
156 unsigned int gh_state;
157 unsigned gh_flags;
158
159 int gh_error;
160 unsigned long gh_iflags;
161 struct completion gh_wait;
162 unsigned long gh_ip;
163};
164
/* GLF_* flag numbers; gfs2_glock.gl_flags is annotated as holding these. */
165enum {
166 GLF_PLUG = 0,
167 GLF_LOCK = 1,
168 GLF_STICKY = 2,
169 GLF_PREFETCH = 3,
170 GLF_SYNC = 4,
171 GLF_DIRTY = 5,
172 GLF_SKIP_WAITERS2 = 6,
173 GLF_GREEDY = 7,
174};
175
/*
 * In-core glock.  It is reference counted through gl_ref and carries
 * its own spinlock (gl_spin), a list of current holders and three
 * waiter lists, whose comments name the HIF_* kinds each one holds.
 * gl_ops selects the per-type callback table and gl_object is an
 * opaque pointer set by the glock's user (inode.c stores the
 * gfs2_inode there).
 */
176struct gfs2_glock {
177 struct list_head gl_list;
178 unsigned long gl_flags; /* GLF_... */
179 struct lm_lockname gl_name;
180 struct kref gl_ref;
181
182 spinlock_t gl_spin;
183
184 unsigned int gl_state;
185 struct task_struct *gl_owner;
186 unsigned long gl_ip;
187 struct list_head gl_holders;
188 struct list_head gl_waiters1; /* HIF_MUTEX */
189 struct list_head gl_waiters2; /* HIF_DEMOTE, HIF_GREEDY */
190 struct list_head gl_waiters3; /* HIF_PROMOTE */
191
192 struct gfs2_glock_operations *gl_ops;
193
194 struct gfs2_holder *gl_req_gh;
195 gfs2_glop_bh_t gl_req_bh;
196
197 lm_lock_t *gl_lock;
198 char *gl_lvb;
199 atomic_t gl_lvb_count;
200
201 uint64_t gl_vn;
202 unsigned long gl_stamp;
203 void *gl_object;
204
205 struct gfs2_gl_hash_bucket *gl_bucket;
206 struct list_head gl_reclaim;
207
208 struct gfs2_sbd *gl_sbd;
209
210 struct inode *gl_aspace;
211 struct gfs2_log_element gl_le;
212 struct list_head gl_ail_list;
213 atomic_t gl_ail_count;
214};
215
/*
 * Allocation state for one inode; embedded in gfs2_inode as i_alloc.
 * It holds up to four quota data pointers with a holder for each,
 * the requested/allocated block counts, and the holders and
 * resource group filled in by gfs2_inplace_reserve().
 *
 * NOTE(review): al_file/al_line presumably record the call site
 * that made the reservation - confirm in rgrp.c.
 */
216struct gfs2_alloc {
217 /* Quota stuff */
218
219 struct gfs2_quota_data *al_qd[4];
220 struct gfs2_holder al_qd_ghs[4];
221 unsigned int al_qd_num;
222
223 u32 al_requested; /* Filled in by caller of gfs2_inplace_reserve() */
224 u32 al_alloced; /* Filled in by gfs2_alloc_*() */
225
226 /* Filled in by gfs2_inplace_reserve() */
227
228 unsigned int al_line;
229 char *al_file;
230 struct gfs2_holder al_ri_gh;
231 struct gfs2_holder al_rgd_gh;
232 struct gfs2_rgrpd *al_rgd;
233
234};
235
/*
 * GIF_* flag numbers; gfs2_inode.i_flags is annotated as holding
 * these.  Numbering starts at 1; value 0 is not used.
 */
236enum {
237 GIF_QD_LOCKED = 1,
238 GIF_PAGED = 2,
239 GIF_SW_PAGED = 3,
240};
241
/*
 * In-core GFS2 inode.  The VFS inode is embedded as the FIRST member
 * (i_inode); GFS2_I() relies on that to convert a struct inode
 * pointer back to its gfs2_inode, so i_inode must stay first.
 * i_di is an in-core copy of the on-disk dinode and i_num is the
 * inode's number pair.
 */
242struct gfs2_inode {
243 struct inode i_inode;
244 struct gfs2_inum i_num;
245
246 unsigned long i_flags; /* GIF_... */
247
248 uint64_t i_vn;
249 struct gfs2_dinode i_di; /* To be replaced by ref to block */
250
251 struct gfs2_glock *i_gl; /* Move into i_gh? */
252 struct gfs2_holder i_iopen_gh;
253 struct gfs2_holder i_gh; /* for prepare/commit_write only */
254 struct gfs2_alloc i_alloc;
255 uint64_t i_last_rg_alloc;
256
257 spinlock_t i_spin;
258 struct rw_semaphore i_rw_mutex;
259 unsigned int i_greedy;
260 unsigned long i_last_pfault;
261
262 struct buffer_head *i_cache[GFS2_MAX_META_HEIGHT];
263};
264
265/*
266 * Since i_inode is the first element of struct gfs2_inode,
267 * this is effectively a cast.
268 */
269static inline struct gfs2_inode *GFS2_I(struct inode *inode)
270{
271 return container_of(inode, struct gfs2_inode, i_inode);
272}
273
274/* To be removed? */
275static inline struct gfs2_sbd *GFS2_SB(struct inode *inode)
276{
277 return inode->i_sb->s_fs_info;
278}
279
/* GFF_* flag numbers; gfs2_file.f_flags is annotated as holding these. */
280enum {
281 GFF_DID_DIRECT_ALLOC = 0,
282};
283
/*
 * Per-open-file state: GFF_* flags, plus a mutex and a glock holder.
 *
 * NOTE(review): the "fl" in f_fl_mutex/f_fl_gh presumably stands for
 * flock - confirm in ops_file.c.
 */
284struct gfs2_file {
285 unsigned long f_flags; /* GFF_... */
286 struct mutex f_fl_mutex;
287 struct gfs2_holder f_fl_gh;
288};
289
/* A revoke: a log element header plus the block number it refers to. */
290struct gfs2_revoke {
291 struct gfs2_log_element rv_le;
292 uint64_t rv_blkno;
293};
294
/*
 * List entry recording a block number and a position (rr_where).
 *
 * NOTE(review): presumably one revoke seen during journal replay,
 * linked on gfs2_sbd.sd_revoke_list - confirm in recovery code.
 */
295struct gfs2_revoke_replay {
296 struct list_head rr_list;
297 uint64_t rr_blkno;
298 unsigned int rr_where;
299};
300
/* QDF_* flag numbers; gfs2_quota_data.qd_flags is annotated as holding these. */
301enum {
302 QDF_USER = 0,
303 QDF_CHANGE = 1,
304 QDF_LOCKED = 2,
305};
306
/*
 * Quota values: a magic number, explicit padding, the hard and
 * warning limits, and the current (signed) allocation count.
 * The explicit __pad keeps the 64-bit members 8-byte aligned.
 *
 * NOTE(review): the name suggests this is the layout stored in a
 * glock's lock value block - confirm in quota.c.
 */
307struct gfs2_quota_lvb {
308 uint32_t qb_magic;
309 uint32_t __pad;
310 uint64_t qb_limit; /* Hard limit of # blocks to alloc */
311 uint64_t qb_warn; /* Warn user when alloc is above this # */
312 int64_t qb_value; /* Current # blocks allocated */
313};
314
/*
 * In-core quota record for one id: list linkage and a use count,
 * the id and QDF_* flags, pending change totals, a slot number,
 * the buffer and quota-change entry it is bound to, its glock and
 * a gfs2_quota_lvb copy, and bookkeeping timestamps.
 */
315struct gfs2_quota_data {
316 struct list_head qd_list;
317 unsigned int qd_count;
318
319 uint32_t qd_id;
320 unsigned long qd_flags; /* QDF_... */
321
322 int64_t qd_change;
323 int64_t qd_change_sync;
324
325 unsigned int qd_slot;
326 unsigned int qd_slot_count;
327
328 struct buffer_head *qd_bh;
329 struct gfs2_quota_change *qd_bh_qc;
330 unsigned int qd_bh_count;
331
332 struct gfs2_glock *qd_gl;
333 struct gfs2_quota_lvb qd_qb;
334
335 uint64_t qd_sync_gen;
336 unsigned long qd_last_warn;
337 unsigned long qd_last_touched;
338};
339
/*
 * List entry pairing two buffer heads.
 *
 * NOTE(review): presumably lb_bh is the buffer written to the log
 * and lb_real the in-place buffer it shadows - confirm in log.c.
 */
340struct gfs2_log_buf {
341 struct list_head lb_list;
342 struct buffer_head *lb_bh;
343 struct buffer_head *lb_real;
344};
345
/*
 * State of one transaction: block/revoke/reserved counts, a glock
 * holder, a "touched" flag, counters and a list for buffers, and
 * counters for revokes.
 *
 * NOTE(review): tr_ip presumably records the caller's instruction
 * pointer for debugging - confirm in trans.c.
 */
346struct gfs2_trans {
347 unsigned long tr_ip;
348
349 unsigned int tr_blocks;
350 unsigned int tr_revokes;
351 unsigned int tr_reserved;
352
353 struct gfs2_holder tr_t_gh;
354
355 int tr_touched;
356
357 unsigned int tr_num_buf;
358 unsigned int tr_num_buf_new;
359 unsigned int tr_num_buf_rm;
360 struct list_head tr_list_buf;
361
362 unsigned int tr_num_revoke;
363 unsigned int tr_num_revoke_rm;
364};
365
/*
 * One AIL entry: list linkage, a first-block value, two buffer
 * lists (ail1 and ail2) and a sync generation number.
 *
 * NOTE(review): AIL presumably stands for "active items list" -
 * confirm in log.c.
 */
366struct gfs2_ail {
367 struct list_head ai_list;
368
369 unsigned int ai_first;
370 struct list_head ai_ail1_list;
371 struct list_head ai_ail2_list;
372
373 uint64_t ai_sync_gen;
374};
375
/*
 * Journal descriptor: list linkage, the journal's inode, its
 * journal id, a dirty flag and its size in blocks.
 */
376struct gfs2_jdesc {
377 struct list_head jd_list;
378
379 struct inode *jd_inode;
380 unsigned int jd_jid;
381 int jd_dirty;
382
383 unsigned int jd_blocks;
384};
385
386#define GFS2_GLOCKD_DEFAULT 1
387#define GFS2_GLOCKD_MAX 16
388
389#define GFS2_QUOTA_DEFAULT GFS2_QUOTA_OFF
390#define GFS2_QUOTA_OFF 0
391#define GFS2_QUOTA_ACCOUNT 1
392#define GFS2_QUOTA_ON 2
393
394#define GFS2_DATA_DEFAULT GFS2_DATA_ORDERED
395#define GFS2_DATA_WRITEBACK 1
396#define GFS2_DATA_ORDERED 2
397
/*
 * Mount arguments; embedded in gfs2_sbd as sd_args.  ar_quota takes
 * the GFS2_QUOTA_* values and ar_data the GFS2_DATA_* values defined
 * just above this structure.
 */
398struct gfs2_args {
399 char ar_lockproto[GFS2_LOCKNAME_LEN]; /* Name of the Lock Protocol */
400 char ar_locktable[GFS2_LOCKNAME_LEN]; /* Name of the Lock Table */
401 char ar_hostdata[GFS2_LOCKNAME_LEN]; /* Host specific data */
402 int ar_spectator; /* Don't get a journal because we're always RO */
403 int ar_ignore_local_fs; /* Don't optimize even if local_fs is 1 */
404 int ar_localflocks; /* Let the VFS do flock|fcntl locks for us */
405 int ar_localcaching; /* Local-style caching (dangerous on multihost) */
406 int ar_debug; /* Oops on errors instead of trying to be graceful */
407 int ar_upgrade; /* Upgrade ondisk/multihost format */
408 unsigned int ar_num_glockd; /* Number of glockd threads */
409 int ar_posix_acl; /* Enable posix acls */
410 int ar_quota; /* off/account/on */
411 int ar_suiddir; /* suiddir support */
412 int ar_data; /* ordered/writeback */
413};
414
/*
 * Filesystem tuning parameters; embedded in gfs2_sbd as sd_tune.
 *
 * NOTE(review): gt_spin presumably serialises updates to the values
 * below - confirm against the code that reads and writes them.
 */
415struct gfs2_tune {
416 spinlock_t gt_spin;
417
418 unsigned int gt_ilimit;
419 unsigned int gt_ilimit_tries;
420 unsigned int gt_ilimit_min;
421 unsigned int gt_demote_secs; /* Cache retention for unheld glock */
422 unsigned int gt_incore_log_blocks;
423 unsigned int gt_log_flush_secs;
424 unsigned int gt_jindex_refresh_secs; /* Check for new journal index */
425
426 unsigned int gt_scand_secs;
427 unsigned int gt_recoverd_secs;
428 unsigned int gt_logd_secs;
429 unsigned int gt_quotad_secs;
430
431 unsigned int gt_quota_simul_sync; /* Max quotavals to sync at once */
432 unsigned int gt_quota_warn_period; /* Secs between quota warn msgs */
433 unsigned int gt_quota_scale_num; /* Numerator */
434 unsigned int gt_quota_scale_den; /* Denominator */
435 unsigned int gt_quota_cache_secs;
436 unsigned int gt_quota_quantum; /* Secs between syncs to quota file */
437 unsigned int gt_atime_quantum; /* Min secs between atime updates */
438 unsigned int gt_new_files_jdata;
439 unsigned int gt_new_files_directio;
440 unsigned int gt_max_atomic_write; /* Split big writes into this size */
441 unsigned int gt_max_readahead; /* Max bytes to read-ahead from disk */
442 unsigned int gt_lockdump_size;
443 unsigned int gt_stall_secs; /* Detects trouble! */
444 unsigned int gt_complain_secs;
445 unsigned int gt_reclaim_limit; /* Max num of glocks in reclaim list */
446 unsigned int gt_entries_per_readdir;
447 unsigned int gt_prefetch_secs; /* Usage window for prefetched glocks */
448 unsigned int gt_greedy_default;
449 unsigned int gt_greedy_quantum;
450 unsigned int gt_greedy_max;
451 unsigned int gt_statfs_quantum;
452 unsigned int gt_statfs_slow;
453};
454
/*
 * One bucket of the glock hash table (gfs2_sbd.sd_gl_hash): a list
 * head with its own reader/writer lock.
 */
455struct gfs2_gl_hash_bucket {
456 rwlock_t hb_lock;
457 struct list_head hb_list;
458};
459
/* SDF_* flag numbers; gfs2_sbd.sd_flags is annotated as holding these. */
460enum {
461 SDF_JOURNAL_CHECKED = 0,
462 SDF_JOURNAL_LIVE = 1,
463 SDF_SHUTDOWN = 2,
464 SDF_NOATIME = 3,
465};
466
467#define GFS2_GL_HASH_SHIFT 13
468#define GFS2_GL_HASH_SIZE (1 << GFS2_GL_HASH_SHIFT)
469#define GFS2_GL_HASH_MASK (GFS2_GL_HASH_SIZE - 1)
470#define GFS2_FSNAME_LEN 256
471
/*
 * In-core per-filesystem ("superblock data") state.  GFS2_SB() in
 * this header returns inode->i_sb->s_fs_info as a pointer to this
 * structure.
 *
 * The members are grouped by subsystem using the section comments
 * below.  Note that sd_gl_hash is an inline array of
 * GFS2_GL_HASH_SIZE (1 << 13) buckets, so the structure itself is
 * large.
 */
472struct gfs2_sbd {
473 struct super_block *sd_vfs;
474 struct kobject sd_kobj;
475 unsigned long sd_flags; /* SDF_... */
476 struct gfs2_sb sd_sb;
477
478 /* Constants computed on mount */
479
480 uint32_t sd_fsb2bb;
481 uint32_t sd_fsb2bb_shift;
482 uint32_t sd_diptrs; /* Number of pointers in a dinode */
483 uint32_t sd_inptrs; /* Number of pointers in a indirect block */
484 uint32_t sd_jbsize; /* Size of a journaled data block */
485 uint32_t sd_hash_bsize; /* sizeof(exhash block) */
486 uint32_t sd_hash_bsize_shift;
487 uint32_t sd_hash_ptrs; /* Number of pointers in a hash block */
488 uint32_t sd_qc_per_block;
489 uint32_t sd_max_dirres; /* Max blocks needed to add a directory entry */
490 uint32_t sd_max_height; /* Max height of a file's metadata tree */
491 uint64_t sd_heightsize[GFS2_MAX_META_HEIGHT];
492 uint32_t sd_max_jheight; /* Max height of journaled file's meta tree */
493 uint64_t sd_jheightsize[GFS2_MAX_META_HEIGHT];
494
495 struct gfs2_args sd_args; /* Mount arguments */
496 struct gfs2_tune sd_tune; /* Filesystem tuning structure */
497
498 /* Lock Stuff */
499
500 struct lm_lockstruct sd_lockstruct;
501 struct gfs2_gl_hash_bucket sd_gl_hash[GFS2_GL_HASH_SIZE];
502 struct list_head sd_reclaim_list;
503 spinlock_t sd_reclaim_lock;
504 wait_queue_head_t sd_reclaim_wq;
505 atomic_t sd_reclaim_count;
506 struct gfs2_holder sd_live_gh;
507 struct gfs2_glock *sd_rename_gl;
508 struct gfs2_glock *sd_trans_gl;
509 struct mutex sd_invalidate_inodes_mutex;
510
511 /* Inode Stuff */
512
513 struct inode *sd_master_dir;
514 struct inode *sd_jindex;
515 struct inode *sd_inum_inode;
516 struct inode *sd_statfs_inode;
517 struct inode *sd_ir_inode;
518 struct inode *sd_sc_inode;
519 struct inode *sd_qc_inode;
520 struct inode *sd_rindex;
521 struct inode *sd_quota_inode;
522
523 /* Inum stuff */
524
525 struct mutex sd_inum_mutex;
526
527 /* StatFS stuff */
528
529 spinlock_t sd_statfs_spin;
530 struct mutex sd_statfs_mutex;
531 struct gfs2_statfs_change sd_statfs_master;
532 struct gfs2_statfs_change sd_statfs_local;
533 unsigned long sd_statfs_sync_time;
534
535 /* Resource group stuff */
536
537 uint64_t sd_rindex_vn;
538 spinlock_t sd_rindex_spin;
539 struct mutex sd_rindex_mutex;
540 struct list_head sd_rindex_list;
541 struct list_head sd_rindex_mru_list;
542 struct list_head sd_rindex_recent_list;
543 struct gfs2_rgrpd *sd_rindex_forward;
544 unsigned int sd_rgrps;
545
546 /* Journal index stuff */
547
548 struct list_head sd_jindex_list;
549 spinlock_t sd_jindex_spin;
550 struct mutex sd_jindex_mutex;
551 unsigned int sd_journals;
552 unsigned long sd_jindex_refresh_time;
553
554 struct gfs2_jdesc *sd_jdesc;
555 struct gfs2_holder sd_journal_gh;
556 struct gfs2_holder sd_jinode_gh;
557
558 struct gfs2_holder sd_ir_gh;
559 struct gfs2_holder sd_sc_gh;
560 struct gfs2_holder sd_qc_gh;
561
562 /* Daemon stuff */
563
564 struct task_struct *sd_scand_process;
565 struct task_struct *sd_recoverd_process;
566 struct task_struct *sd_logd_process;
567 struct task_struct *sd_quotad_process;
568 struct task_struct *sd_glockd_process[GFS2_GLOCKD_MAX];
569 unsigned int sd_glockd_num;
570
571 /* Quota stuff */
572
573 struct list_head sd_quota_list;
574 atomic_t sd_quota_count;
575 spinlock_t sd_quota_spin;
576 struct mutex sd_quota_mutex;
577
578 unsigned int sd_quota_slots;
579 unsigned int sd_quota_chunks;
580 unsigned char **sd_quota_bitmap;
581
582 uint64_t sd_quota_sync_gen;
583 unsigned long sd_quota_sync_time;
584
585 /* Log stuff */
586
587 spinlock_t sd_log_lock;
588
589 unsigned int sd_log_blks_reserved;
590 unsigned int sd_log_commited_buf;
591 unsigned int sd_log_commited_revoke;
592
593 unsigned int sd_log_num_gl;
594 unsigned int sd_log_num_buf;
595 unsigned int sd_log_num_revoke;
596 unsigned int sd_log_num_rg;
597 unsigned int sd_log_num_databuf;
598 unsigned int sd_log_num_jdata;
599 unsigned int sd_log_num_hdrs;
600
601 struct list_head sd_log_le_gl;
602 struct list_head sd_log_le_buf;
603 struct list_head sd_log_le_revoke;
604 struct list_head sd_log_le_rg;
605 struct list_head sd_log_le_databuf;
606
607 unsigned int sd_log_blks_free;
608 struct mutex sd_log_reserve_mutex;
609
610 uint64_t sd_log_sequence;
611 unsigned int sd_log_head;
612 unsigned int sd_log_tail;
613 int sd_log_idle;
614
615 unsigned long sd_log_flush_time;
616 struct rw_semaphore sd_log_flush_lock;
617 struct list_head sd_log_flush_list;
618
619 unsigned int sd_log_flush_head;
620 uint64_t sd_log_flush_wrapped;
621
622 struct list_head sd_ail1_list;
623 struct list_head sd_ail2_list;
624 uint64_t sd_ail_sync_gen;
625
626 /* Replay stuff */
627
628 struct list_head sd_revoke_list;
629 unsigned int sd_replay_tail;
630
631 unsigned int sd_found_blocks;
632 unsigned int sd_found_revokes;
633 unsigned int sd_replayed_blocks;
634
635 /* For quiescing the filesystem */
636
637 struct gfs2_holder sd_freeze_gh;
638 struct mutex sd_freeze_lock;
639 unsigned int sd_freeze_count;
640
641 /* Counters */
642
643 atomic_t sd_glock_count;
644 atomic_t sd_glock_held_count;
645 atomic_t sd_inode_count;
646 atomic_t sd_reclaimed;
647
648 char sd_fsname[GFS2_FSNAME_LEN];
649 char sd_table_name[GFS2_FSNAME_LEN];
650 char sd_proto_name[GFS2_FSNAME_LEN];
651
652 /* Debugging crud */
653
654 unsigned long sd_last_warning;
655};
656
657#endif /* __INCORE_DOT_H__ */
658
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
new file mode 100644
index 000000000000..e76f345517b7
--- /dev/null
+++ b/fs/gfs2/inode.c
@@ -0,0 +1,1354 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/posix_acl.h>
16#include <linux/sort.h>
17#include <linux/gfs2_ondisk.h>
18#include <linux/crc32.h>
19
20#include "gfs2.h"
21#include "lm_interface.h"
22#include "incore.h"
23#include "acl.h"
24#include "bmap.h"
25#include "dir.h"
26#include "eattr.h"
27#include "glock.h"
28#include "glops.h"
29#include "inode.h"
30#include "log.h"
31#include "meta_io.h"
32#include "ops_address.h"
33#include "ops_file.h"
34#include "ops_inode.h"
35#include "quota.h"
36#include "rgrp.h"
37#include "trans.h"
38#include "util.h"
39
40/**
41 * gfs2_inode_attr_in - Copy attributes from the dinode into the VFS inode
42 * @ip: The GFS2 inode (with embedded disk inode data)
43 * @inode: The Linux VFS inode
44 *
45 */
46
47void gfs2_inode_attr_in(struct gfs2_inode *ip)
48{
49 struct inode *inode = &ip->i_inode;
50
51 inode->i_ino = ip->i_num.no_addr;
52
53 switch (ip->i_di.di_mode & S_IFMT) {
54 case S_IFBLK:
55 case S_IFCHR:
56 inode->i_rdev = MKDEV(ip->i_di.di_major, ip->i_di.di_minor);
57 break;
58 default:
59 inode->i_rdev = 0;
60 break;
61 };
62
63 inode->i_mode = ip->i_di.di_mode;
64 inode->i_nlink = ip->i_di.di_nlink;
65 inode->i_uid = ip->i_di.di_uid;
66 inode->i_gid = ip->i_di.di_gid;
67 i_size_write(inode, ip->i_di.di_size);
68 inode->i_atime.tv_sec = ip->i_di.di_atime;
69 inode->i_mtime.tv_sec = ip->i_di.di_mtime;
70 inode->i_ctime.tv_sec = ip->i_di.di_ctime;
71 inode->i_atime.tv_nsec = 0;
72 inode->i_mtime.tv_nsec = 0;
73 inode->i_ctime.tv_nsec = 0;
74 inode->i_blksize = PAGE_SIZE;
75 inode->i_blocks = ip->i_di.di_blocks <<
76 (GFS2_SB(inode)->sd_sb.sb_bsize_shift - GFS2_BASIC_BLOCK_SHIFT);
77
78 if (ip->i_di.di_flags & GFS2_DIF_IMMUTABLE)
79 inode->i_flags |= S_IMMUTABLE;
80 else
81 inode->i_flags &= ~S_IMMUTABLE;
82
83 if (ip->i_di.di_flags & GFS2_DIF_APPENDONLY)
84 inode->i_flags |= S_APPEND;
85 else
86 inode->i_flags &= ~S_APPEND;
87}
88
89/**
90 * gfs2_inode_attr_out - Copy attributes from VFS inode into the dinode
91 * @ip: The GFS2 inode
92 *
93 * Only copy out the attributes that we want the VFS layer
94 * to be able to modify.
95 */
96
97void gfs2_inode_attr_out(struct gfs2_inode *ip)
98{
99 struct inode *inode = &ip->i_inode;
100
101 gfs2_assert_withdraw(GFS2_SB(inode),
102 (ip->i_di.di_mode & S_IFMT) == (inode->i_mode & S_IFMT));
103 ip->i_di.di_mode = inode->i_mode;
104 ip->i_di.di_uid = inode->i_uid;
105 ip->i_di.di_gid = inode->i_gid;
106 ip->i_di.di_atime = inode->i_atime.tv_sec;
107 ip->i_di.di_mtime = inode->i_mtime.tv_sec;
108 ip->i_di.di_ctime = inode->i_ctime.tv_sec;
109}
110
111static int iget_test(struct inode *inode, void *opaque)
112{
113 struct gfs2_inode *ip = GFS2_I(inode);
114 struct gfs2_inum *inum = opaque;
115
116 if (ip && ip->i_num.no_addr == inum->no_addr)
117 return 1;
118
119 return 0;
120}
121
122static int iget_set(struct inode *inode, void *opaque)
123{
124 struct gfs2_inode *ip = GFS2_I(inode);
125 struct gfs2_inum *inum = opaque;
126
127 ip->i_num = *inum;
128 return 0;
129}
130
131struct inode *gfs2_ilookup(struct super_block *sb, struct gfs2_inum *inum)
132{
133 return ilookup5(sb, (unsigned long)inum->no_formal_ino,
134 iget_test, inum);
135}
136
137static struct inode *gfs2_iget(struct super_block *sb, struct gfs2_inum *inum)
138{
139 return iget5_locked(sb, (unsigned long)inum->no_formal_ino,
140 iget_test, iget_set, inum);
141}
142
143/**
144 * gfs2_inode_lookup - Lookup an inode
145 * @sb: The super block
146 * @inum: The inode number
147 * @type: The type of the inode
148 *
149 * Returns: A VFS inode, or an error
150 */
151
152struct inode *gfs2_inode_lookup(struct super_block *sb, struct gfs2_inum *inum, unsigned int type)
153{
154 struct inode *inode = gfs2_iget(sb, inum);
155 struct gfs2_inode *ip = GFS2_I(inode);
156 struct gfs2_glock *io_gl;
157 int error;
158
159 if (inode->i_state & I_NEW) {
160 struct gfs2_sbd *sdp = GFS2_SB(inode);
161 umode_t mode = DT2IF(type);
162 inode->u.generic_ip = ip;
163 inode->i_mode = mode;
164
165 if (S_ISREG(mode)) {
166 inode->i_op = &gfs2_file_iops;
167 inode->i_fop = &gfs2_file_fops;
168 inode->i_mapping->a_ops = &gfs2_file_aops;
169 } else if (S_ISDIR(mode)) {
170 inode->i_op = &gfs2_dir_iops;
171 inode->i_fop = &gfs2_dir_fops;
172 } else if (S_ISLNK(mode)) {
173 inode->i_op = &gfs2_symlink_iops;
174 } else {
175 inode->i_op = &gfs2_dev_iops;
176 }
177
178 error = gfs2_glock_get(sdp, inum->no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl);
179 if (unlikely(error))
180 goto fail;
181 ip->i_gl->gl_object = ip;
182
183 error = gfs2_glock_get(sdp, inum->no_addr, &gfs2_iopen_glops, CREATE, &io_gl);
184 if (unlikely(error))
185 goto fail_put;
186
187 ip->i_vn = ip->i_gl->gl_vn - 1;
188 error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh);
189 if (unlikely(error))
190 goto fail_iopen;
191
192 gfs2_glock_put(io_gl);
193 unlock_new_inode(inode);
194 }
195
196 return inode;
197fail_iopen:
198 gfs2_glock_put(io_gl);
199fail_put:
200 ip->i_gl->gl_object = NULL;
201 gfs2_glock_put(ip->i_gl);
202fail:
203 iput(inode);
204 return ERR_PTR(error);
205}
206
207/**
208 * gfs2_inode_refresh - Refresh the incore copy of the dinode
209 * @ip: The GFS2 inode
210 *
211 * Returns: errno
212 */
213
214int gfs2_inode_refresh(struct gfs2_inode *ip)
215{
216 struct buffer_head *dibh;
217 int error;
218
219 error = gfs2_meta_inode_buffer(ip, &dibh);
220 if (error)
221 return error;
222
223 if (gfs2_metatype_check(GFS2_SB(&ip->i_inode), dibh, GFS2_METATYPE_DI)) {
224 brelse(dibh);
225 return -EIO;
226 }
227
228 gfs2_dinode_in(&ip->i_di, dibh->b_data);
229
230 brelse(dibh);
231
232 if (ip->i_num.no_addr != ip->i_di.di_num.no_addr) {
233 if (gfs2_consist_inode(ip))
234 gfs2_dinode_print(&ip->i_di);
235 return -EIO;
236 }
237 if (ip->i_num.no_formal_ino != ip->i_di.di_num.no_formal_ino)
238 return -ESTALE;
239
240 ip->i_vn = ip->i_gl->gl_vn;
241
242 return 0;
243}
244
245int gfs2_dinode_dealloc(struct gfs2_inode *ip)
246{
247 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
248 struct gfs2_alloc *al;
249 struct gfs2_rgrpd *rgd;
250 int error;
251
252 if (ip->i_di.di_blocks != 1) {
253 if (gfs2_consist_inode(ip))
254 gfs2_dinode_print(&ip->i_di);
255 return -EIO;
256 }
257
258 al = gfs2_alloc_get(ip);
259
260 error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
261 if (error)
262 goto out;
263
264 error = gfs2_rindex_hold(sdp, &al->al_ri_gh);
265 if (error)
266 goto out_qs;
267
268 rgd = gfs2_blk2rgrpd(sdp, ip->i_num.no_addr);
269 if (!rgd) {
270 gfs2_consist_inode(ip);
271 error = -EIO;
272 goto out_rindex_relse;
273 }
274
275 error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0,
276 &al->al_rgd_gh);
277 if (error)
278 goto out_rindex_relse;
279
280 error = gfs2_trans_begin(sdp, RES_RG_BIT +
281 RES_STATFS + RES_QUOTA, 1);
282 if (error)
283 goto out_rg_gunlock;
284
285 gfs2_trans_add_gl(ip->i_gl);
286
287 gfs2_free_di(rgd, ip);
288
289 gfs2_trans_end(sdp);
290 clear_bit(GLF_STICKY, &ip->i_gl->gl_flags);
291
292out_rg_gunlock:
293 gfs2_glock_dq_uninit(&al->al_rgd_gh);
294out_rindex_relse:
295 gfs2_glock_dq_uninit(&al->al_ri_gh);
296out_qs:
297 gfs2_quota_unhold(ip);
298out:
299 gfs2_alloc_put(ip);
300 return error;
301}
302
303/**
304 * gfs2_change_nlink - Change nlink count on inode
305 * @ip: The GFS2 inode
306 * @diff: The change in the nlink count required
307 *
308 * Returns: errno
309 */
310
311int gfs2_change_nlink(struct gfs2_inode *ip, int diff)
312{
313 struct gfs2_sbd *sdp = ip->i_inode.i_sb->s_fs_info;
314 struct buffer_head *dibh;
315 uint32_t nlink;
316 int error;
317
318 BUG_ON(ip->i_di.di_nlink != ip->i_inode.i_nlink);
319 nlink = ip->i_di.di_nlink + diff;
320
321 /* If we are reducing the nlink count, but the new value ends up being
322 bigger than the old one, we must have underflowed. */
323 if (diff < 0 && nlink > ip->i_di.di_nlink) {
324 if (gfs2_consist_inode(ip))
325 gfs2_dinode_print(&ip->i_di);
326 return -EIO;
327 }
328
329 error = gfs2_meta_inode_buffer(ip, &dibh);
330 if (error)
331 return error;
332
333 ip->i_di.di_nlink = nlink;
334 ip->i_di.di_ctime = get_seconds();
335 ip->i_inode.i_nlink = nlink;
336
337 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
338 gfs2_dinode_out(&ip->i_di, dibh->b_data);
339 brelse(dibh);
340 mark_inode_dirty(&ip->i_inode);
341
342 if (ip->i_di.di_nlink == 0) {
343 struct gfs2_rgrpd *rgd;
344 struct gfs2_holder ri_gh, rg_gh;
345
346 error = gfs2_rindex_hold(sdp, &ri_gh);
347 if (error)
348 goto out;
349 error = -EIO;
350 rgd = gfs2_blk2rgrpd(sdp, ip->i_num.no_addr);
351 if (!rgd)
352 goto out_norgrp;
353 error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &rg_gh);
354 if (error)
355 goto out_norgrp;
356
357 gfs2_unlink_di(&ip->i_inode); /* mark inode unlinked */
358 gfs2_glock_dq_uninit(&rg_gh);
359out_norgrp:
360 gfs2_glock_dq_uninit(&ri_gh);
361 }
362out:
363 return error;
364}
365
366struct inode *gfs2_lookup_simple(struct inode *dip, const char *name)
367{
368 struct qstr qstr;
369 gfs2_str2qstr(&qstr, name);
370 return gfs2_lookupi(dip, &qstr, 1, NULL);
371}
372
373
374/**
375 * gfs2_lookupi - Look up a filename in a directory and return its inode
376 * @d_gh: An initialized holder for the directory glock
377 * @name: The name of the inode to look for
378 * @is_root: If 1, ignore the caller's permissions
379 * @i_gh: An uninitialized holder for the new inode glock
380 *
381 * There will always be a vnode (Linux VFS inode) for the d_gh inode unless
382 * @is_root is true.
383 *
384 * Returns: errno
385 */
386
387struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name,
388 int is_root, struct nameidata *nd)
389
390{
391 struct super_block *sb = dir->i_sb;
392 struct gfs2_inode *dip = GFS2_I(dir);
393 struct gfs2_holder d_gh;
394 struct gfs2_inum inum;
395 unsigned int type;
396 int error = 0;
397 struct inode *inode = NULL;
398
399 if (!name->len || name->len > GFS2_FNAMESIZE)
400 return ERR_PTR(-ENAMETOOLONG);
401
402 if ((name->len == 1 && memcmp(name->name, ".", 1) == 0) ||
403 (name->len == 2 && memcmp(name->name, "..", 2) == 0 &&
404 dir == sb->s_root->d_inode)) {
405 igrab(dir);
406 return dir;
407 }
408
409 error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh);
410 if (error)
411 return ERR_PTR(error);
412
413 if (!is_root) {
414 error = permission(dir, MAY_EXEC, NULL);
415 if (error)
416 goto out;
417 }
418
419 error = gfs2_dir_search(dir, name, &inum, &type);
420 if (error)
421 goto out;
422
423 inode = gfs2_inode_lookup(sb, &inum, type);
424
425out:
426 gfs2_glock_dq_uninit(&d_gh);
427 if (error == -ENOENT)
428 return NULL;
429 return inode;
430}
431
432static int pick_formal_ino_1(struct gfs2_sbd *sdp, uint64_t *formal_ino)
433{
434 struct gfs2_inode *ip = GFS2_I(sdp->sd_ir_inode);
435 struct buffer_head *bh;
436 struct gfs2_inum_range ir;
437 int error;
438
439 error = gfs2_trans_begin(sdp, RES_DINODE, 0);
440 if (error)
441 return error;
442 mutex_lock(&sdp->sd_inum_mutex);
443
444 error = gfs2_meta_inode_buffer(ip, &bh);
445 if (error) {
446 mutex_unlock(&sdp->sd_inum_mutex);
447 gfs2_trans_end(sdp);
448 return error;
449 }
450
451 gfs2_inum_range_in(&ir, bh->b_data + sizeof(struct gfs2_dinode));
452
453 if (ir.ir_length) {
454 *formal_ino = ir.ir_start++;
455 ir.ir_length--;
456 gfs2_trans_add_bh(ip->i_gl, bh, 1);
457 gfs2_inum_range_out(&ir,
458 bh->b_data + sizeof(struct gfs2_dinode));
459 brelse(bh);
460 mutex_unlock(&sdp->sd_inum_mutex);
461 gfs2_trans_end(sdp);
462 return 0;
463 }
464
465 brelse(bh);
466
467 mutex_unlock(&sdp->sd_inum_mutex);
468 gfs2_trans_end(sdp);
469
470 return 1;
471}
472
473static int pick_formal_ino_2(struct gfs2_sbd *sdp, uint64_t *formal_ino)
474{
475 struct gfs2_inode *ip = GFS2_I(sdp->sd_ir_inode);
476 struct gfs2_inode *m_ip = GFS2_I(sdp->sd_inum_inode);
477 struct gfs2_holder gh;
478 struct buffer_head *bh;
479 struct gfs2_inum_range ir;
480 int error;
481
482 error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
483 if (error)
484 return error;
485
486 error = gfs2_trans_begin(sdp, 2 * RES_DINODE, 0);
487 if (error)
488 goto out;
489 mutex_lock(&sdp->sd_inum_mutex);
490
491 error = gfs2_meta_inode_buffer(ip, &bh);
492 if (error)
493 goto out_end_trans;
494
495 gfs2_inum_range_in(&ir, bh->b_data + sizeof(struct gfs2_dinode));
496
497 if (!ir.ir_length) {
498 struct buffer_head *m_bh;
499 uint64_t x, y;
500
501 error = gfs2_meta_inode_buffer(m_ip, &m_bh);
502 if (error)
503 goto out_brelse;
504
505 x = *(uint64_t *)(m_bh->b_data + sizeof(struct gfs2_dinode));
506 x = y = be64_to_cpu(x);
507 ir.ir_start = x;
508 ir.ir_length = GFS2_INUM_QUANTUM;
509 x += GFS2_INUM_QUANTUM;
510 if (x < y)
511 gfs2_consist_inode(m_ip);
512 x = cpu_to_be64(x);
513 gfs2_trans_add_bh(m_ip->i_gl, m_bh, 1);
514 *(uint64_t *)(m_bh->b_data + sizeof(struct gfs2_dinode)) = x;
515
516 brelse(m_bh);
517 }
518
519 *formal_ino = ir.ir_start++;
520 ir.ir_length--;
521
522 gfs2_trans_add_bh(ip->i_gl, bh, 1);
523 gfs2_inum_range_out(&ir, bh->b_data + sizeof(struct gfs2_dinode));
524
525 out_brelse:
526 brelse(bh);
527
528 out_end_trans:
529 mutex_unlock(&sdp->sd_inum_mutex);
530 gfs2_trans_end(sdp);
531
532 out:
533 gfs2_glock_dq_uninit(&gh);
534
535 return error;
536}
537
538static int pick_formal_ino(struct gfs2_sbd *sdp, uint64_t *inum)
539{
540 int error;
541
542 error = pick_formal_ino_1(sdp, inum);
543 if (error <= 0)
544 return error;
545
546 error = pick_formal_ino_2(sdp, inum);
547
548 return error;
549}
550
551/**
552 * create_ok - OK to create a new on-disk inode here?
553 * @dip: Directory in which dinode is to be created
554 * @name: Name of new dinode
555 * @mode:
556 *
557 * Returns: errno
558 */
559
560static int create_ok(struct gfs2_inode *dip, const struct qstr *name,
561 unsigned int mode)
562{
563 int error;
564
565 error = permission(&dip->i_inode, MAY_WRITE | MAY_EXEC, NULL);
566 if (error)
567 return error;
568
569 /* Don't create entries in an unlinked directory */
570 if (!dip->i_di.di_nlink)
571 return -EPERM;
572
573 error = gfs2_dir_search(&dip->i_inode, name, NULL, NULL);
574 switch (error) {
575 case -ENOENT:
576 error = 0;
577 break;
578 case 0:
579 return -EEXIST;
580 default:
581 return error;
582 }
583
584 if (dip->i_di.di_entries == (uint32_t)-1)
585 return -EFBIG;
586 if (S_ISDIR(mode) && dip->i_di.di_nlink == (uint32_t)-1)
587 return -EMLINK;
588
589 return 0;
590}
591
592static void munge_mode_uid_gid(struct gfs2_inode *dip, unsigned int *mode,
593 unsigned int *uid, unsigned int *gid)
594{
595 if (GFS2_SB(&dip->i_inode)->sd_args.ar_suiddir &&
596 (dip->i_di.di_mode & S_ISUID) &&
597 dip->i_di.di_uid) {
598 if (S_ISDIR(*mode))
599 *mode |= S_ISUID;
600 else if (dip->i_di.di_uid != current->fsuid)
601 *mode &= ~07111;
602 *uid = dip->i_di.di_uid;
603 } else
604 *uid = current->fsuid;
605
606 if (dip->i_di.di_mode & S_ISGID) {
607 if (S_ISDIR(*mode))
608 *mode |= S_ISGID;
609 *gid = dip->i_di.di_gid;
610 } else
611 *gid = current->fsgid;
612}
613
614static int alloc_dinode(struct gfs2_inode *dip, struct gfs2_inum *inum,
615 u64 *generation)
616{
617 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
618 int error;
619
620 gfs2_alloc_get(dip);
621
622 dip->i_alloc.al_requested = RES_DINODE;
623 error = gfs2_inplace_reserve(dip);
624 if (error)
625 goto out;
626
627 error = gfs2_trans_begin(sdp, RES_RG_BIT + RES_STATFS, 0);
628 if (error)
629 goto out_ipreserv;
630
631 inum->no_addr = gfs2_alloc_di(dip, generation);
632
633 gfs2_trans_end(sdp);
634
635out_ipreserv:
636 gfs2_inplace_release(dip);
637
638out:
639 gfs2_alloc_put(dip);
640
641 return error;
642}
643
644/**
645 * init_dinode - Fill in a new dinode structure
646 * @dip: the directory this inode is being created in
647 * @gl: The glock covering the new inode
648 * @inum: the inode number
649 * @mode: the file permissions
650 * @uid:
651 * @gid:
652 *
653 */
654
655static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
656 const struct gfs2_inum *inum, unsigned int mode,
657 unsigned int uid, unsigned int gid,
658 const u64 *generation)
659{
660 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
661 struct gfs2_dinode *di;
662 struct buffer_head *dibh;
663
664 dibh = gfs2_meta_new(gl, inum->no_addr);
665 gfs2_trans_add_bh(gl, dibh, 1);
666 gfs2_metatype_set(dibh, GFS2_METATYPE_DI, GFS2_FORMAT_DI);
667 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
668 di = (struct gfs2_dinode *)dibh->b_data;
669
670 di->di_num.no_formal_ino = cpu_to_be64(inum->no_formal_ino);
671 di->di_num.no_addr = cpu_to_be64(inum->no_addr);
672 di->di_mode = cpu_to_be32(mode);
673 di->di_uid = cpu_to_be32(uid);
674 di->di_gid = cpu_to_be32(gid);
675 di->di_nlink = cpu_to_be32(0);
676 di->di_size = cpu_to_be64(0);
677 di->di_blocks = cpu_to_be64(1);
678 di->di_atime = di->di_mtime = di->di_ctime = cpu_to_be64(get_seconds());
679 di->di_major = di->di_minor = cpu_to_be32(0);
680 di->di_goal_meta = di->di_goal_data = cpu_to_be64(inum->no_addr);
681 di->di_generation = cpu_to_be64(*generation);
682 di->di_flags = cpu_to_be32(0);
683
684 if (S_ISREG(mode)) {
685 if ((dip->i_di.di_flags & GFS2_DIF_INHERIT_JDATA) ||
686 gfs2_tune_get(sdp, gt_new_files_jdata))
687 di->di_flags |= cpu_to_be32(GFS2_DIF_JDATA);
688 if ((dip->i_di.di_flags & GFS2_DIF_INHERIT_DIRECTIO) ||
689 gfs2_tune_get(sdp, gt_new_files_directio))
690 di->di_flags |= cpu_to_be32(GFS2_DIF_DIRECTIO);
691 } else if (S_ISDIR(mode)) {
692 di->di_flags |= cpu_to_be32(dip->i_di.di_flags &
693 GFS2_DIF_INHERIT_DIRECTIO);
694 di->di_flags |= cpu_to_be32(dip->i_di.di_flags &
695 GFS2_DIF_INHERIT_JDATA);
696 }
697
698 di->__pad1 = 0;
699 di->di_payload_format = cpu_to_be32(0);
700 di->di_height = cpu_to_be32(0);
701 di->__pad2 = 0;
702 di->__pad3 = 0;
703 di->di_depth = cpu_to_be16(0);
704 di->di_entries = cpu_to_be32(0);
705 memset(&di->__pad4, 0, sizeof(di->__pad4));
706 di->di_eattr = cpu_to_be64(0);
707 memset(&di->di_reserved, 0, sizeof(di->di_reserved));
708
709 brelse(dibh);
710}
711
712static int make_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
713 unsigned int mode, const struct gfs2_inum *inum,
714 const u64 *generation)
715{
716 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
717 unsigned int uid, gid;
718 int error;
719
720 munge_mode_uid_gid(dip, &mode, &uid, &gid);
721 gfs2_alloc_get(dip);
722
723 error = gfs2_quota_lock(dip, uid, gid);
724 if (error)
725 goto out;
726
727 error = gfs2_quota_check(dip, uid, gid);
728 if (error)
729 goto out_quota;
730
731 error = gfs2_trans_begin(sdp, RES_DINODE + RES_QUOTA, 0);
732 if (error)
733 goto out_quota;
734
735 init_dinode(dip, gl, inum, mode, uid, gid, generation);
736 gfs2_quota_change(dip, +1, uid, gid);
737 gfs2_trans_end(sdp);
738
739out_quota:
740 gfs2_quota_unlock(dip);
741out:
742 gfs2_alloc_put(dip);
743 return error;
744}
745
746static int link_dinode(struct gfs2_inode *dip, const struct qstr *name,
747 struct gfs2_inode *ip)
748{
749 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
750 struct gfs2_alloc *al;
751 int alloc_required;
752 struct buffer_head *dibh;
753 int error;
754
755 al = gfs2_alloc_get(dip);
756
757 error = gfs2_quota_lock(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
758 if (error)
759 goto fail;
760
761 error = alloc_required = gfs2_diradd_alloc_required(&dip->i_inode, name);
762 if (alloc_required < 0)
763 goto fail;
764 if (alloc_required) {
765 error = gfs2_quota_check(dip, dip->i_di.di_uid,
766 dip->i_di.di_gid);
767 if (error)
768 goto fail_quota_locks;
769
770 al->al_requested = sdp->sd_max_dirres;
771
772 error = gfs2_inplace_reserve(dip);
773 if (error)
774 goto fail_quota_locks;
775
776 error = gfs2_trans_begin(sdp, sdp->sd_max_dirres +
777 al->al_rgd->rd_ri.ri_length +
778 2 * RES_DINODE +
779 RES_STATFS + RES_QUOTA, 0);
780 if (error)
781 goto fail_ipreserv;
782 } else {
783 error = gfs2_trans_begin(sdp, RES_LEAF + 2 * RES_DINODE, 0);
784 if (error)
785 goto fail_quota_locks;
786 }
787
788 error = gfs2_dir_add(&dip->i_inode, name, &ip->i_num, IF2DT(ip->i_di.di_mode));
789 if (error)
790 goto fail_end_trans;
791
792 error = gfs2_meta_inode_buffer(ip, &dibh);
793 if (error)
794 goto fail_end_trans;
795 ip->i_di.di_nlink = 1;
796 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
797 gfs2_dinode_out(&ip->i_di, dibh->b_data);
798 brelse(dibh);
799 return 0;
800
801fail_end_trans:
802 gfs2_trans_end(sdp);
803
804fail_ipreserv:
805 if (dip->i_alloc.al_rgd)
806 gfs2_inplace_release(dip);
807
808fail_quota_locks:
809 gfs2_quota_unlock(dip);
810
811fail:
812 gfs2_alloc_put(dip);
813 return error;
814}
815
816/**
817 * gfs2_createi - Create a new inode
818 * @ghs: An array of two holders
819 * @name: The name of the new file
820 * @mode: the permissions on the new inode
821 *
822 * @ghs[0] is an initialized holder for the directory
823 * @ghs[1] is the holder for the inode lock
824 *
825 * If the return value is not NULL, the glocks on both the directory and the new
826 * file are held. A transaction has been started and an inplace reservation
827 * is held, as well.
828 *
829 * Returns: An inode
830 */
831
832struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
833 unsigned int mode)
834{
835 struct inode *inode;
836 struct gfs2_inode *dip = ghs->gh_gl->gl_object;
837 struct inode *dir = &dip->i_inode;
838 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
839 struct gfs2_inum inum;
840 int error;
841 u64 generation;
842
843 if (!name->len || name->len > GFS2_FNAMESIZE)
844 return ERR_PTR(-ENAMETOOLONG);
845
846 gfs2_holder_reinit(LM_ST_EXCLUSIVE, 0, ghs);
847 error = gfs2_glock_nq(ghs);
848 if (error)
849 goto fail;
850
851 error = create_ok(dip, name, mode);
852 if (error)
853 goto fail_gunlock;
854
855 error = pick_formal_ino(sdp, &inum.no_formal_ino);
856 if (error)
857 goto fail_gunlock;
858
859 error = alloc_dinode(dip, &inum, &generation);
860 if (error)
861 goto fail_gunlock;
862
863 if (inum.no_addr < dip->i_num.no_addr) {
864 gfs2_glock_dq(ghs);
865
866 error = gfs2_glock_nq_num(sdp, inum.no_addr,
867 &gfs2_inode_glops, LM_ST_EXCLUSIVE,
868 GL_SKIP, ghs + 1);
869 if (error) {
870 return ERR_PTR(error);
871 }
872
873 gfs2_holder_reinit(LM_ST_EXCLUSIVE, 0, ghs);
874 error = gfs2_glock_nq(ghs);
875 if (error) {
876 gfs2_glock_dq_uninit(ghs + 1);
877 return ERR_PTR(error);
878 }
879
880 error = create_ok(dip, name, mode);
881 if (error)
882 goto fail_gunlock2;
883 } else {
884 error = gfs2_glock_nq_num(sdp, inum.no_addr,
885 &gfs2_inode_glops, LM_ST_EXCLUSIVE,
886 GL_SKIP, ghs + 1);
887 if (error)
888 goto fail_gunlock;
889 }
890
891 error = make_dinode(dip, ghs[1].gh_gl, mode, &inum, &generation);
892 if (error)
893 goto fail_gunlock2;
894
895 inode = gfs2_inode_lookup(dir->i_sb, &inum, IF2DT(mode));
896 if (IS_ERR(inode))
897 goto fail_gunlock2;
898
899 error = gfs2_inode_refresh(GFS2_I(inode));
900 if (error)
901 goto fail_iput;
902
903 error = gfs2_acl_create(dip, GFS2_I(inode));
904 if (error)
905 goto fail_iput;
906
907 error = link_dinode(dip, name, GFS2_I(inode));
908 if (error)
909 goto fail_iput;
910
911 if (!inode)
912 return ERR_PTR(-ENOMEM);
913 return inode;
914
915fail_iput:
916 iput(inode);
917fail_gunlock2:
918 gfs2_glock_dq_uninit(ghs + 1);
919fail_gunlock:
920 gfs2_glock_dq(ghs);
921fail:
922 return ERR_PTR(error);
923}
924
925/**
926 * gfs2_rmdiri - Remove a directory
927 * @dip: The parent directory of the directory to be removed
928 * @name: The name of the directory to be removed
929 * @ip: The GFS2 inode of the directory to be removed
930 *
931 * Assumes Glocks on dip and ip are held
932 *
933 * Returns: errno
934 */
935
936int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name,
937 struct gfs2_inode *ip)
938{
939 struct qstr dotname;
940 int error;
941
942 if (ip->i_di.di_entries != 2) {
943 if (gfs2_consist_inode(ip))
944 gfs2_dinode_print(&ip->i_di);
945 return -EIO;
946 }
947
948 error = gfs2_dir_del(dip, name);
949 if (error)
950 return error;
951
952 error = gfs2_change_nlink(dip, -1);
953 if (error)
954 return error;
955
956 gfs2_str2qstr(&dotname, ".");
957 error = gfs2_dir_del(ip, &dotname);
958 if (error)
959 return error;
960
961 gfs2_str2qstr(&dotname, "..");
962 error = gfs2_dir_del(ip, &dotname);
963 if (error)
964 return error;
965
966 error = gfs2_change_nlink(ip, -2);
967 if (error)
968 return error;
969
970 return error;
971}
972
973/*
974 * gfs2_unlink_ok - check to see that a inode is still in a directory
975 * @dip: the directory
976 * @name: the name of the file
977 * @ip: the inode
978 *
979 * Assumes that the lock on (at least) @dip is held.
980 *
981 * Returns: 0 if the parent/child relationship is correct, errno if it isn't
982 */
983
984int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
985 struct gfs2_inode *ip)
986{
987 struct gfs2_inum inum;
988 unsigned int type;
989 int error;
990
991 if (IS_IMMUTABLE(&ip->i_inode) || IS_APPEND(&ip->i_inode))
992 return -EPERM;
993
994 if ((dip->i_di.di_mode & S_ISVTX) &&
995 dip->i_di.di_uid != current->fsuid &&
996 ip->i_di.di_uid != current->fsuid && !capable(CAP_FOWNER))
997 return -EPERM;
998
999 if (IS_APPEND(&dip->i_inode))
1000 return -EPERM;
1001
1002 error = permission(&dip->i_inode, MAY_WRITE | MAY_EXEC, NULL);
1003 if (error)
1004 return error;
1005
1006 error = gfs2_dir_search(&dip->i_inode, name, &inum, &type);
1007 if (error)
1008 return error;
1009
1010 if (!gfs2_inum_equal(&inum, &ip->i_num))
1011 return -ENOENT;
1012
1013 if (IF2DT(ip->i_di.di_mode) != type) {
1014 gfs2_consist_inode(dip);
1015 return -EIO;
1016 }
1017
1018 return 0;
1019}
1020
1021/*
1022 * gfs2_ok_to_move - check if it's ok to move a directory to another directory
1023 * @this: move this
1024 * @to: to here
1025 *
1026 * Follow @to back to the root and make sure we don't encounter @this
1027 * Assumes we already hold the rename lock.
1028 *
1029 * Returns: errno
1030 */
1031
1032int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to)
1033{
1034 struct inode *dir = &to->i_inode;
1035 struct super_block *sb = dir->i_sb;
1036 struct inode *tmp;
1037 struct qstr dotdot;
1038 int error = 0;
1039
1040 gfs2_str2qstr(&dotdot, "..");
1041
1042 igrab(dir);
1043
1044 for (;;) {
1045 if (dir == &this->i_inode) {
1046 error = -EINVAL;
1047 break;
1048 }
1049 if (dir == sb->s_root->d_inode) {
1050 error = 0;
1051 break;
1052 }
1053
1054 tmp = gfs2_lookupi(dir, &dotdot, 1, NULL);
1055 if (IS_ERR(tmp)) {
1056 error = PTR_ERR(tmp);
1057 break;
1058 }
1059
1060 iput(dir);
1061 dir = tmp;
1062 }
1063
1064 iput(dir);
1065
1066 return error;
1067}
1068
1069/**
1070 * gfs2_readlinki - return the contents of a symlink
1071 * @ip: the symlink's inode
1072 * @buf: a pointer to the buffer to be filled
1073 * @len: a pointer to the length of @buf
1074 *
1075 * If @buf is too small, a piece of memory is kmalloc()ed and needs
1076 * to be freed by the caller.
1077 *
1078 * Returns: errno
1079 */
1080
1081int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len)
1082{
1083 struct gfs2_holder i_gh;
1084 struct buffer_head *dibh;
1085 unsigned int x;
1086 int error;
1087
1088 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &i_gh);
1089 error = gfs2_glock_nq_atime(&i_gh);
1090 if (error) {
1091 gfs2_holder_uninit(&i_gh);
1092 return error;
1093 }
1094
1095 if (!ip->i_di.di_size) {
1096 gfs2_consist_inode(ip);
1097 error = -EIO;
1098 goto out;
1099 }
1100
1101 error = gfs2_meta_inode_buffer(ip, &dibh);
1102 if (error)
1103 goto out;
1104
1105 x = ip->i_di.di_size + 1;
1106 if (x > *len) {
1107 *buf = kmalloc(x, GFP_KERNEL);
1108 if (!*buf) {
1109 error = -ENOMEM;
1110 goto out_brelse;
1111 }
1112 }
1113
1114 memcpy(*buf, dibh->b_data + sizeof(struct gfs2_dinode), x);
1115 *len = x;
1116
1117out_brelse:
1118 brelse(dibh);
1119out:
1120 gfs2_glock_dq_uninit(&i_gh);
1121 return error;
1122}
1123
1124/**
1125 * gfs2_glock_nq_atime - Acquire a hold on an inode's glock, and
1126 * conditionally update the inode's atime
1127 * @gh: the holder to acquire
1128 *
1129 * Tests atime (access time) for gfs2_read, gfs2_readdir and gfs2_mmap
1130 * Update if the difference between the current time and the inode's current
1131 * atime is greater than an interval specified at mount.
1132 *
1133 * Returns: errno
1134 */
1135
1136int gfs2_glock_nq_atime(struct gfs2_holder *gh)
1137{
1138 struct gfs2_glock *gl = gh->gh_gl;
1139 struct gfs2_sbd *sdp = gl->gl_sbd;
1140 struct gfs2_inode *ip = gl->gl_object;
1141 int64_t curtime, quantum = gfs2_tune_get(sdp, gt_atime_quantum);
1142 unsigned int state;
1143 int flags;
1144 int error;
1145
1146 if (gfs2_assert_warn(sdp, gh->gh_flags & GL_ATIME) ||
1147 gfs2_assert_warn(sdp, !(gh->gh_flags & GL_ASYNC)) ||
1148 gfs2_assert_warn(sdp, gl->gl_ops == &gfs2_inode_glops))
1149 return -EINVAL;
1150
1151 state = gh->gh_state;
1152 flags = gh->gh_flags;
1153
1154 error = gfs2_glock_nq(gh);
1155 if (error)
1156 return error;
1157
1158 if (test_bit(SDF_NOATIME, &sdp->sd_flags) ||
1159 (sdp->sd_vfs->s_flags & MS_RDONLY))
1160 return 0;
1161
1162 curtime = get_seconds();
1163 if (curtime - ip->i_di.di_atime >= quantum) {
1164 gfs2_glock_dq(gh);
1165 gfs2_holder_reinit(LM_ST_EXCLUSIVE, gh->gh_flags & ~LM_FLAG_ANY,
1166 gh);
1167 error = gfs2_glock_nq(gh);
1168 if (error)
1169 return error;
1170
1171 /* Verify that atime hasn't been updated while we were
1172 trying to get exclusive lock. */
1173
1174 curtime = get_seconds();
1175 if (curtime - ip->i_di.di_atime >= quantum) {
1176 struct buffer_head *dibh;
1177
1178 error = gfs2_trans_begin(sdp, RES_DINODE, 0);
1179 if (error == -EROFS)
1180 return 0;
1181 if (error)
1182 goto fail;
1183
1184 error = gfs2_meta_inode_buffer(ip, &dibh);
1185 if (error)
1186 goto fail_end_trans;
1187
1188 ip->i_di.di_atime = curtime;
1189
1190 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1191 gfs2_dinode_out(&ip->i_di, dibh->b_data);
1192 brelse(dibh);
1193
1194 gfs2_trans_end(sdp);
1195 }
1196
1197 /* If someone else has asked for the glock,
1198 unlock and let them have it. Then reacquire
1199 in the original state. */
1200 if (gfs2_glock_is_blocking(gl)) {
1201 gfs2_glock_dq(gh);
1202 gfs2_holder_reinit(state, flags, gh);
1203 return gfs2_glock_nq(gh);
1204 }
1205 }
1206
1207 return 0;
1208
1209fail_end_trans:
1210 gfs2_trans_end(sdp);
1211fail:
1212 gfs2_glock_dq(gh);
1213 return error;
1214}
1215
1216/**
1217 * glock_compare_atime - Compare two struct gfs2_glock structures for sort
1218 * @arg_a: the first structure
1219 * @arg_b: the second structure
1220 *
1221 * Returns: 1 if A > B
1222 * -1 if A < B
1223 * 0 if A = B
1224 */
1225
1226static int glock_compare_atime(const void *arg_a, const void *arg_b)
1227{
1228 struct gfs2_holder *gh_a = *(struct gfs2_holder **)arg_a;
1229 struct gfs2_holder *gh_b = *(struct gfs2_holder **)arg_b;
1230 struct lm_lockname *a = &gh_a->gh_gl->gl_name;
1231 struct lm_lockname *b = &gh_b->gh_gl->gl_name;
1232 int ret = 0;
1233
1234 if (a->ln_number > b->ln_number)
1235 ret = 1;
1236 else if (a->ln_number < b->ln_number)
1237 ret = -1;
1238 else {
1239 if (gh_a->gh_state == LM_ST_SHARED &&
1240 gh_b->gh_state == LM_ST_EXCLUSIVE)
1241 ret = 1;
1242 else if (gh_a->gh_state == LM_ST_SHARED &&
1243 (gh_b->gh_flags & GL_ATIME))
1244 ret = 1;
1245 }
1246
1247 return ret;
1248}
1249
1250/**
1251 * gfs2_glock_nq_m_atime - acquire multiple glocks where one may need an
1252 * atime update
1253 * @num_gh: the number of structures
1254 * @ghs: an array of struct gfs2_holder structures
1255 *
1256 * Returns: 0 on success (all glocks acquired),
1257 * errno on failure (no glocks acquired)
1258 */
1259
1260int gfs2_glock_nq_m_atime(unsigned int num_gh, struct gfs2_holder *ghs)
1261{
1262 struct gfs2_holder **p;
1263 unsigned int x;
1264 int error = 0;
1265
1266 if (!num_gh)
1267 return 0;
1268
1269 if (num_gh == 1) {
1270 ghs->gh_flags &= ~(LM_FLAG_TRY | GL_ASYNC);
1271 if (ghs->gh_flags & GL_ATIME)
1272 error = gfs2_glock_nq_atime(ghs);
1273 else
1274 error = gfs2_glock_nq(ghs);
1275 return error;
1276 }
1277
1278 p = kcalloc(num_gh, sizeof(struct gfs2_holder *), GFP_KERNEL);
1279 if (!p)
1280 return -ENOMEM;
1281
1282 for (x = 0; x < num_gh; x++)
1283 p[x] = &ghs[x];
1284
1285 sort(p, num_gh, sizeof(struct gfs2_holder *), glock_compare_atime,NULL);
1286
1287 for (x = 0; x < num_gh; x++) {
1288 p[x]->gh_flags &= ~(LM_FLAG_TRY | GL_ASYNC);
1289
1290 if (p[x]->gh_flags & GL_ATIME)
1291 error = gfs2_glock_nq_atime(p[x]);
1292 else
1293 error = gfs2_glock_nq(p[x]);
1294
1295 if (error) {
1296 while (x--)
1297 gfs2_glock_dq(p[x]);
1298 break;
1299 }
1300 }
1301
1302 kfree(p);
1303
1304 return error;
1305}
1306
1307
1308static int
1309__gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr)
1310{
1311 struct buffer_head *dibh;
1312 int error;
1313
1314 error = gfs2_meta_inode_buffer(ip, &dibh);
1315 if (!error) {
1316 error = inode_setattr(&ip->i_inode, attr);
1317 gfs2_assert_warn(GFS2_SB(&ip->i_inode), !error);
1318 gfs2_inode_attr_out(ip);
1319
1320 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1321 gfs2_dinode_out(&ip->i_di, dibh->b_data);
1322 brelse(dibh);
1323 }
1324 return error;
1325}
1326
1327/**
1328 * gfs2_setattr_simple -
1329 * @ip:
1330 * @attr:
1331 *
1332 * Called with a reference on the vnode.
1333 *
1334 * Returns: errno
1335 */
1336
1337int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr)
1338{
1339 int error;
1340
1341 if (current->journal_info)
1342 return __gfs2_setattr_simple(ip, attr);
1343
1344 error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), RES_DINODE, 0);
1345 if (error)
1346 return error;
1347
1348 error = __gfs2_setattr_simple(ip, attr);
1349
1350 gfs2_trans_end(GFS2_SB(&ip->i_inode));
1351
1352 return error;
1353}
1354
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
new file mode 100644
index 000000000000..8bb8b559bcea
--- /dev/null
+++ b/fs/gfs2/inode.h
@@ -0,0 +1,56 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __INODE_DOT_H__
11#define __INODE_DOT_H__
12
13static inline int gfs2_is_stuffed(struct gfs2_inode *ip)
14{
15 return !ip->i_di.di_height;
16}
17
18static inline int gfs2_is_jdata(struct gfs2_inode *ip)
19{
20 return ip->i_di.di_flags & GFS2_DIF_JDATA;
21}
22
23static inline int gfs2_is_dir(struct gfs2_inode *ip)
24{
25 return S_ISDIR(ip->i_di.di_mode);
26}
27
28void gfs2_inode_attr_in(struct gfs2_inode *ip);
29void gfs2_inode_attr_out(struct gfs2_inode *ip);
30struct inode *gfs2_inode_lookup(struct super_block *sb, struct gfs2_inum *inum, unsigned type);
31struct inode *gfs2_ilookup(struct super_block *sb, struct gfs2_inum *inum);
32
33int gfs2_inode_refresh(struct gfs2_inode *ip);
34
35int gfs2_dinode_dealloc(struct gfs2_inode *inode);
36int gfs2_change_nlink(struct gfs2_inode *ip, int diff);
37struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name,
38 int is_root, struct nameidata *nd);
39struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
40 unsigned int mode);
41int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name,
42 struct gfs2_inode *ip);
43int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
44 struct gfs2_inode *ip);
45int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to);
46int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len);
47
48int gfs2_glock_nq_atime(struct gfs2_holder *gh);
49int gfs2_glock_nq_m_atime(unsigned int num_gh, struct gfs2_holder *ghs);
50
51int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr);
52
53struct inode *gfs2_lookup_simple(struct inode *dip, const char *name);
54
55#endif /* __INODE_DOT_H__ */
56
diff --git a/fs/gfs2/lm.c b/fs/gfs2/lm.c
new file mode 100644
index 000000000000..f45c0ffd1c35
--- /dev/null
+++ b/fs/gfs2/lm.c
@@ -0,0 +1,244 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/delay.h>
16#include <linux/gfs2_ondisk.h>
17
18#include "gfs2.h"
19#include "lm_interface.h"
20#include "incore.h"
21#include "glock.h"
22#include "lm.h"
23#include "super.h"
24#include "util.h"
25#include "lvb.h"
26
27/**
28 * gfs2_lm_mount - mount a locking protocol
29 * @sdp: the filesystem
30 * @args: mount arguements
31 * @silent: if 1, don't complain if the FS isn't a GFS2 fs
32 *
33 * Returns: errno
34 */
35
36int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent)
37{
38 char *proto = sdp->sd_proto_name;
39 char *table = sdp->sd_table_name;
40 int flags = 0;
41 int error;
42
43 if (sdp->sd_args.ar_spectator)
44 flags |= LM_MFLAG_SPECTATOR;
45
46 fs_info(sdp, "Trying to join cluster \"%s\", \"%s\"\n", proto, table);
47
48 error = gfs2_mount_lockproto(proto, table, sdp->sd_args.ar_hostdata,
49 gfs2_glock_cb, sdp,
50 GFS2_MIN_LVB_SIZE, flags,
51 &sdp->sd_lockstruct, &sdp->sd_kobj);
52 if (error) {
53 fs_info(sdp, "can't mount proto=%s, table=%s, hostdata=%s\n",
54 proto, table, sdp->sd_args.ar_hostdata);
55 goto out;
56 }
57
58 if (gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_lockspace) ||
59 gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_ops) ||
60 gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_lvb_size >=
61 GFS2_MIN_LVB_SIZE)) {
62 gfs2_unmount_lockproto(&sdp->sd_lockstruct);
63 goto out;
64 }
65
66 if (sdp->sd_args.ar_spectator)
67 snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.s", table);
68 else
69 snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.%u", table,
70 sdp->sd_lockstruct.ls_jid);
71
72 fs_info(sdp, "Joined cluster. Now mounting FS...\n");
73
74 if ((sdp->sd_lockstruct.ls_flags & LM_LSFLAG_LOCAL) &&
75 !sdp->sd_args.ar_ignore_local_fs) {
76 sdp->sd_args.ar_localflocks = 1;
77 sdp->sd_args.ar_localcaching = 1;
78 }
79
80 out:
81 return error;
82}
83
84void gfs2_lm_others_may_mount(struct gfs2_sbd *sdp)
85{
86 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
87 sdp->sd_lockstruct.ls_ops->lm_others_may_mount(
88 sdp->sd_lockstruct.ls_lockspace);
89}
90
91void gfs2_lm_unmount(struct gfs2_sbd *sdp)
92{
93 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
94 gfs2_unmount_lockproto(&sdp->sd_lockstruct);
95}
96
97int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...)
98{
99 va_list args;
100
101 if (test_and_set_bit(SDF_SHUTDOWN, &sdp->sd_flags))
102 return 0;
103
104 va_start(args, fmt);
105 vprintk(fmt, args);
106 va_end(args);
107
108 fs_err(sdp, "about to withdraw from the cluster\n");
109 BUG_ON(sdp->sd_args.ar_debug);
110
111
112 fs_err(sdp, "waiting for outstanding I/O\n");
113
114 /* FIXME: suspend dm device so oustanding bio's complete
115 and all further io requests fail */
116
117 fs_err(sdp, "telling LM to withdraw\n");
118 gfs2_withdraw_lockproto(&sdp->sd_lockstruct);
119 fs_err(sdp, "withdrawn\n");
120 dump_stack();
121
122 return -1;
123}
124
125int gfs2_lm_get_lock(struct gfs2_sbd *sdp, struct lm_lockname *name,
126 lm_lock_t **lockp)
127{
128 int error;
129 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
130 error = -EIO;
131 else
132 error = sdp->sd_lockstruct.ls_ops->lm_get_lock(
133 sdp->sd_lockstruct.ls_lockspace, name, lockp);
134 return error;
135}
136
137void gfs2_lm_put_lock(struct gfs2_sbd *sdp, lm_lock_t *lock)
138{
139 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
140 sdp->sd_lockstruct.ls_ops->lm_put_lock(lock);
141}
142
143unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, lm_lock_t *lock,
144 unsigned int cur_state, unsigned int req_state,
145 unsigned int flags)
146{
147 int ret;
148 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
149 ret = 0;
150 else
151 ret = sdp->sd_lockstruct.ls_ops->lm_lock(lock,
152 cur_state,
153 req_state, flags);
154 return ret;
155}
156
157unsigned int gfs2_lm_unlock(struct gfs2_sbd *sdp, lm_lock_t *lock,
158 unsigned int cur_state)
159{
160 int ret;
161 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
162 ret = 0;
163 else
164 ret = sdp->sd_lockstruct.ls_ops->lm_unlock(lock, cur_state);
165 return ret;
166}
167
168void gfs2_lm_cancel(struct gfs2_sbd *sdp, lm_lock_t *lock)
169{
170 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
171 sdp->sd_lockstruct.ls_ops->lm_cancel(lock);
172}
173
174int gfs2_lm_hold_lvb(struct gfs2_sbd *sdp, lm_lock_t *lock, char **lvbp)
175{
176 int error;
177 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
178 error = -EIO;
179 else
180 error = sdp->sd_lockstruct.ls_ops->lm_hold_lvb(lock, lvbp);
181 return error;
182}
183
184void gfs2_lm_unhold_lvb(struct gfs2_sbd *sdp, lm_lock_t *lock, char *lvb)
185{
186 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
187 sdp->sd_lockstruct.ls_ops->lm_unhold_lvb(lock, lvb);
188}
189
#if 0
/*
 * Compiled out.  Would forward a "sync LVB" request for @lock and @lvb
 * to the lock module unless the filesystem has been shut down.
 */
void gfs2_lm_sync_lvb(struct gfs2_sbd *sdp, lm_lock_t *lock, char *lvb)
{
	if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
		return;

	sdp->sd_lockstruct.ls_ops->lm_sync_lvb(lock, lvb);
}
#endif  /*  0  */
197
198int gfs2_lm_plock_get(struct gfs2_sbd *sdp, struct lm_lockname *name,
199 struct file *file, struct file_lock *fl)
200{
201 int error;
202 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
203 error = -EIO;
204 else
205 error = sdp->sd_lockstruct.ls_ops->lm_plock_get(
206 sdp->sd_lockstruct.ls_lockspace,
207 name, file, fl);
208 return error;
209}
210
211int gfs2_lm_plock(struct gfs2_sbd *sdp, struct lm_lockname *name,
212 struct file *file, int cmd, struct file_lock *fl)
213{
214 int error;
215 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
216 error = -EIO;
217 else
218 error = sdp->sd_lockstruct.ls_ops->lm_plock(
219 sdp->sd_lockstruct.ls_lockspace,
220 name, file, cmd, fl);
221 return error;
222}
223
224int gfs2_lm_punlock(struct gfs2_sbd *sdp, struct lm_lockname *name,
225 struct file *file, struct file_lock *fl)
226{
227 int error;
228 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
229 error = -EIO;
230 else
231 error = sdp->sd_lockstruct.ls_ops->lm_punlock(
232 sdp->sd_lockstruct.ls_lockspace,
233 name, file, fl);
234 return error;
235}
236
237void gfs2_lm_recovery_done(struct gfs2_sbd *sdp, unsigned int jid,
238 unsigned int message)
239{
240 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
241 sdp->sd_lockstruct.ls_ops->lm_recovery_done(
242 sdp->sd_lockstruct.ls_lockspace, jid, message);
243}
244
diff --git a/fs/gfs2/lm.h b/fs/gfs2/lm.h
new file mode 100644
index 000000000000..e821101d19c0
--- /dev/null
+++ b/fs/gfs2/lm.h
@@ -0,0 +1,41 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __LM_DOT_H__
11#define __LM_DOT_H__
12
13int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent);
14void gfs2_lm_others_may_mount(struct gfs2_sbd *sdp);
15void gfs2_lm_unmount(struct gfs2_sbd *sdp);
16int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...)
17__attribute__ ((format(printf, 2, 3)));
18int gfs2_lm_get_lock(struct gfs2_sbd *sdp,
19 struct lm_lockname *name, lm_lock_t **lockp);
20void gfs2_lm_put_lock(struct gfs2_sbd *sdp, lm_lock_t *lock);
21unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, lm_lock_t *lock,
22 unsigned int cur_state, unsigned int req_state,
23 unsigned int flags);
24unsigned int gfs2_lm_unlock(struct gfs2_sbd *sdp, lm_lock_t *lock,
25 unsigned int cur_state);
26void gfs2_lm_cancel(struct gfs2_sbd *sdp, lm_lock_t *lock);
27int gfs2_lm_hold_lvb(struct gfs2_sbd *sdp, lm_lock_t *lock, char **lvbp);
28void gfs2_lm_unhold_lvb(struct gfs2_sbd *sdp, lm_lock_t *lock, char *lvb);
29int gfs2_lm_plock_get(struct gfs2_sbd *sdp,
30 struct lm_lockname *name,
31 struct file *file, struct file_lock *fl);
32int gfs2_lm_plock(struct gfs2_sbd *sdp,
33 struct lm_lockname *name,
34 struct file *file, int cmd, struct file_lock *fl);
35int gfs2_lm_punlock(struct gfs2_sbd *sdp,
36 struct lm_lockname *name,
37 struct file *file, struct file_lock *fl);
38void gfs2_lm_recovery_done(struct gfs2_sbd *sdp,
39 unsigned int jid, unsigned int message);
40
41#endif /* __LM_DOT_H__ */
diff --git a/fs/gfs2/lm_interface.h b/fs/gfs2/lm_interface.h
new file mode 100644
index 000000000000..9d34bf3df103
--- /dev/null
+++ b/fs/gfs2/lm_interface.h
@@ -0,0 +1,295 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __LM_INTERFACE_DOT_H__
11#define __LM_INTERFACE_DOT_H__
12
13/*
14 * Opaque handles represent the lock module's lockspace structure, the lock
15 * module's lock structures, and GFS's file system (superblock) structure.
16 */
17
18typedef void lm_lockspace_t;
19typedef void lm_lock_t;
20typedef void lm_fsdata_t;
21
22typedef void (*lm_callback_t) (lm_fsdata_t *fsdata, unsigned int type,
23 void *data);
24
25/*
26 * lm_mount() flags
27 *
28 * LM_MFLAG_SPECTATOR
29 * GFS is asking to join the filesystem's lockspace, but it doesn't want to
30 * modify the filesystem. The lock module shouldn't assign a journal to the FS
31 * mount. It shouldn't send recovery callbacks to the FS mount. If the node
32 * dies or withdraws, all locks can be wiped immediately.
33 */
34
35#define LM_MFLAG_SPECTATOR 0x00000001
36
37/*
38 * lm_lockstruct flags
39 *
40 * LM_LSFLAG_LOCAL
41 * The lock_nolock module returns LM_LSFLAG_LOCAL to GFS, indicating that GFS
42 * can make single-node optimizations.
43 */
44
45#define LM_LSFLAG_LOCAL 0x00000001
46
47/*
48 * lm_lockname types
49 */
50
51#define LM_TYPE_RESERVED 0x00
52#define LM_TYPE_NONDISK 0x01
53#define LM_TYPE_INODE 0x02
54#define LM_TYPE_RGRP 0x03
55#define LM_TYPE_META 0x04
56#define LM_TYPE_IOPEN 0x05
57#define LM_TYPE_FLOCK 0x06
58#define LM_TYPE_PLOCK 0x07
59#define LM_TYPE_QUOTA 0x08
60#define LM_TYPE_JOURNAL 0x09
61
62/*
63 * lm_lock() states
64 *
65 * SHARED is compatible with SHARED, not with DEFERRED or EX.
66 * DEFERRED is compatible with DEFERRED, not with SHARED or EX.
67 */
68
69#define LM_ST_UNLOCKED 0
70#define LM_ST_EXCLUSIVE 1
71#define LM_ST_DEFERRED 2
72#define LM_ST_SHARED 3
73
74/*
75 * lm_lock() flags
76 *
77 * LM_FLAG_TRY
78 * Don't wait to acquire the lock if it can't be granted immediately.
79 *
80 * LM_FLAG_TRY_1CB
81 * Send one blocking callback if TRY is set and the lock is not granted.
82 *
83 * LM_FLAG_NOEXP
84 * GFS sets this flag on lock requests it makes while doing journal recovery.
85 * These special requests should not be blocked due to the recovery like
86 * ordinary locks would be.
87 *
88 * LM_FLAG_ANY
89 * A SHARED request may also be granted in DEFERRED, or a DEFERRED request may
90 * also be granted in SHARED. The preferred state is whichever is compatible
91 * with other granted locks, or the specified state if no other locks exist.
92 *
93 * LM_FLAG_PRIORITY
94 * Override fairness considerations. Suppose a lock is held in a shared state
95 * and there is a pending request for the deferred state. A shared lock
96 * request with the priority flag would be allowed to bypass the deferred
97 * request and directly join the other shared lock. A shared lock request
98 * without the priority flag might be forced to wait until the deferred
99 * request had acquired and released the lock.
100 */
101
102#define LM_FLAG_TRY 0x00000001
103#define LM_FLAG_TRY_1CB 0x00000002
104#define LM_FLAG_NOEXP 0x00000004
105#define LM_FLAG_ANY 0x00000008
106#define LM_FLAG_PRIORITY 0x00000010
107
108/*
109 * lm_lock() and lm_async_cb return flags
110 *
111 * LM_OUT_ST_MASK
112 * Masks the lower two bits of lock state in the returned value.
113 *
114 * LM_OUT_CACHEABLE
115 * The lock hasn't been released so GFS can continue to cache data for it.
116 *
117 * LM_OUT_CANCELED
118 * The lock request was canceled.
119 *
120 * LM_OUT_ASYNC
121 * The result of the request will be returned in an LM_CB_ASYNC callback.
122 */
123
124#define LM_OUT_ST_MASK 0x00000003
125#define LM_OUT_CACHEABLE 0x00000004
126#define LM_OUT_CANCELED 0x00000008
127#define LM_OUT_ASYNC 0x00000080
128#define LM_OUT_ERROR 0x00000100
129
130/*
131 * lm_callback_t types
132 *
133 * LM_CB_NEED_E LM_CB_NEED_D LM_CB_NEED_S
134 * Blocking callback, a remote node is requesting the given lock in
135 * EXCLUSIVE, DEFERRED, or SHARED.
136 *
137 * LM_CB_NEED_RECOVERY
138 * The given journal needs to be recovered.
139 *
140 * LM_CB_DROPLOCKS
141 * Reduce the number of cached locks.
142 *
143 * LM_CB_ASYNC
144 * The given lock has been granted.
145 */
146
147#define LM_CB_NEED_E 257
148#define LM_CB_NEED_D 258
149#define LM_CB_NEED_S 259
150#define LM_CB_NEED_RECOVERY 260
151#define LM_CB_DROPLOCKS 261
152#define LM_CB_ASYNC 262
153
154/*
155 * lm_recovery_done() messages
156 */
157
158#define LM_RD_GAVEUP 308
159#define LM_RD_SUCCESS 309
160
161
/*
 * Identifies a lock by number and type.  The type values are the LM_TYPE_*
 * constants listed under "lm_lockname types" in this header.
 */
162struct lm_lockname {
163 uint64_t ln_number; /* lock number */
164 unsigned int ln_type; /* lock type */
165};
166
/*
 * lm_name_equal - compare two lock names
 *
 * Evaluates to non-zero when both ln_number and ln_type of the two pointed-to
 * names match.  Each argument is expanded twice, so do not pass expressions
 * with side effects.  The definition deliberately ends without a line
 * continuation so that nothing following it is spliced into the macro.
 */
#define lm_name_equal(name1, name2) \
	(((name1)->ln_number == (name2)->ln_number) && \
	 ((name1)->ln_type == (name2)->ln_type))
170
/*
 * Pairs a lock name with a result value.  The LM_OUT_* flags above are
 * described as "lm_lock() and lm_async_cb return flags", so lc_ret presumably
 * carries those bits -- confirm against the LM_CB_ASYNC callback users.
 */
171struct lm_async_cb {
172 struct lm_lockname lc_name;
173 int lc_ret;
174};
175
176struct lm_lockstruct;
177
/*
 * Table of operations implemented by a lock module.  A module hands one of
 * these to gfs_register_lockproto(); lm_proto_name is the string a mount
 * request is matched against and lm_owner is the module that gets pinned
 * while a file system is mounted through this table.
 */
178struct lm_lockops {
179 char lm_proto_name[256];
180
181 /*
182 * Mount/Unmount
183 */
184
185 int (*lm_mount) (char *table_name, char *host_data,
186 lm_callback_t cb, lm_fsdata_t *fsdata,
187 unsigned int min_lvb_size, int flags,
188 struct lm_lockstruct *lockstruct,
189 struct kobject *fskobj);
190
191 void (*lm_others_may_mount) (lm_lockspace_t *lockspace);
192
193 void (*lm_unmount) (lm_lockspace_t *lockspace);
194
195 void (*lm_withdraw) (lm_lockspace_t *lockspace);
196
197 /*
198 * Lock oriented operations
199 */
200
201 int (*lm_get_lock) (lm_lockspace_t *lockspace,
202 struct lm_lockname *name, lm_lock_t **lockp);
203
204 void (*lm_put_lock) (lm_lock_t *lock);
205
206 unsigned int (*lm_lock) (lm_lock_t *lock, unsigned int cur_state,
207 unsigned int req_state, unsigned int flags);
208
209 unsigned int (*lm_unlock) (lm_lock_t *lock, unsigned int cur_state);
210
211 void (*lm_cancel) (lm_lock_t *lock);
212
213 int (*lm_hold_lvb) (lm_lock_t *lock, char **lvbp);
214 void (*lm_unhold_lvb) (lm_lock_t *lock, char *lvb);
215 void (*lm_sync_lvb) (lm_lock_t *lock, char *lvb);
216
217 /*
218 * Posix Lock oriented operations
219 */
220
221 int (*lm_plock_get) (lm_lockspace_t *lockspace,
222 struct lm_lockname *name,
223 struct file *file, struct file_lock *fl);
224
225 int (*lm_plock) (lm_lockspace_t *lockspace,
226 struct lm_lockname *name,
227 struct file *file, int cmd, struct file_lock *fl);
228
229 int (*lm_punlock) (lm_lockspace_t *lockspace,
230 struct lm_lockname *name,
231 struct file *file, struct file_lock *fl);
232
233 /*
234 * Client oriented operations
235 */
236
237 void (*lm_recovery_done) (lm_lockspace_t *lockspace, unsigned int jid,
238 unsigned int message);
239
240 struct module *lm_owner;
241};
242
243/*
244 * lm_mount() return values
245 *
246 * ls_jid - the journal ID this node should use
247 * ls_first - this node is the first to mount the file system
248 * ls_lvb_size - size in bytes of lock value blocks
249 * ls_lockspace - lock module's context for this file system
250 * ls_ops - lock module's functions
251 * ls_flags - lock module features
252 */
253
/*
 * Filled in for the caller of lm_mount().  ls_flags takes the LM_LSFLAG_*
 * values defined under "lm_lockstruct flags" in this header.
 */
254struct lm_lockstruct {
255 unsigned int ls_jid;
256 unsigned int ls_first;
257 unsigned int ls_lvb_size;
258 lm_lockspace_t *ls_lockspace;
259 struct lm_lockops *ls_ops;
260 int ls_flags;
261};
262
263void __init gfs2_init_lmh(void);
264
265/*
266 * Lock module bottom interface. A lock module makes itself available to GFS
267 * with these functions.
268 *
269 * For the time being, we copy the gfs1 lock module bottom interface so the
270 * same lock modules can be used with both gfs1 and gfs2 (it won't be possible
271 * to load both gfs1 and gfs2 at once.) Eventually the lock modules will fork
272 * for gfs1/gfs2 and this API can change to the gfs2_ prefix.
273 */
274
275int gfs_register_lockproto(struct lm_lockops *proto);
276
277void gfs_unregister_lockproto(struct lm_lockops *proto);
278
279/*
280 * Lock module top interface. GFS calls these functions when mounting or
281 * unmounting a file system.
282 */
283
284int gfs2_mount_lockproto(char *proto_name, char *table_name, char *host_data,
285 lm_callback_t cb, lm_fsdata_t *fsdata,
286 unsigned int min_lvb_size, int flags,
287 struct lm_lockstruct *lockstruct,
288 struct kobject *fskobj);
289
290void gfs2_unmount_lockproto(struct lm_lockstruct *lockstruct);
291
292void gfs2_withdraw_lockproto(struct lm_lockstruct *lockstruct);
293
294#endif /* __LM_INTERFACE_DOT_H__ */
295
diff --git a/fs/gfs2/locking.c b/fs/gfs2/locking.c
new file mode 100644
index 000000000000..183192836e98
--- /dev/null
+++ b/fs/gfs2/locking.c
@@ -0,0 +1,191 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/module.h>
11#include <linux/init.h>
12#include <linux/string.h>
13#include <linux/slab.h>
14#include <linux/wait.h>
15#include <linux/sched.h>
16#include <linux/kmod.h>
17#include <linux/fs.h>
18#include <linux/delay.h>
19
20#include "lm_interface.h"
21
/* One entry on lmh_list: links a registered protocol's operations table
   into the list of known lock protocols. */
22struct lmh_wrapper {
23 struct list_head lw_list; /* linkage on lmh_list */
24 struct lm_lockops *lw_ops; /* the registered protocol */
25};
26
27/* List of registered low-level locking protocols. A file system selects one
28 of them by name at mount time, e.g. lock_nolock, lock_dlm. */
29
30static struct list_head lmh_list;
31static struct mutex lmh_lock;
32
33/**
34 * gfs_register_lockproto - Register a low-level locking protocol
35 * @proto: the protocol definition
36 *
37 * Returns: 0 on success, -EXXX on failure
38 */
39
40int gfs_register_lockproto(struct lm_lockops *proto)
41{
42 struct lmh_wrapper *lw;
43
44 mutex_lock(&lmh_lock);
45
46 list_for_each_entry(lw, &lmh_list, lw_list) {
47 if (!strcmp(lw->lw_ops->lm_proto_name, proto->lm_proto_name)) {
48 mutex_unlock(&lmh_lock);
49 printk(KERN_INFO "GFS2: protocol %s already exists\n",
50 proto->lm_proto_name);
51 return -EEXIST;
52 }
53 }
54
55 lw = kzalloc(sizeof(struct lmh_wrapper), GFP_KERNEL);
56 if (!lw) {
57 mutex_unlock(&lmh_lock);
58 return -ENOMEM;
59 }
60
61 lw->lw_ops = proto;
62 list_add(&lw->lw_list, &lmh_list);
63
64 mutex_unlock(&lmh_lock);
65
66 return 0;
67}
68
69/**
70 * gfs_unregister_lockproto - Unregister a low-level locking protocol
71 * @proto: the protocol definition
72 *
73 */
74
75void gfs_unregister_lockproto(struct lm_lockops *proto)
76{
77 struct lmh_wrapper *lw;
78
79 mutex_lock(&lmh_lock);
80
81 list_for_each_entry(lw, &lmh_list, lw_list) {
82 if (!strcmp(lw->lw_ops->lm_proto_name, proto->lm_proto_name)) {
83 list_del(&lw->lw_list);
84 mutex_unlock(&lmh_lock);
85 kfree(lw);
86 return;
87 }
88 }
89
90 mutex_unlock(&lmh_lock);
91
92 printk(KERN_WARNING "GFS2: can't unregister lock protocol %s\n",
93 proto->lm_proto_name);
94}
95
96/**
97 * gfs2_mount_lockproto - Mount a lock protocol
98 * @proto_name - the name of the protocol
99 * @table_name - the name of the lock space
100 * @host_data - data specific to this host
101 * @cb - the callback to the code using the lock module
102 * @fsdata - data to pass back with the callback
103 * @min_lvb_size - the mininum LVB size that the caller can deal with
104 * @flags - LM_MFLAG_*
105 * @lockstruct - a structure returned describing the mount
106 *
107 * Returns: 0 on success, -EXXX on failure
108 */
109
110int gfs2_mount_lockproto(char *proto_name, char *table_name, char *host_data,
111 lm_callback_t cb, lm_fsdata_t *fsdata,
112 unsigned int min_lvb_size, int flags,
113 struct lm_lockstruct *lockstruct,
114 struct kobject *fskobj)
115{
116 struct lmh_wrapper *lw = NULL;
117 int try = 0;
118 int error, found;
119
120 retry:
121 mutex_lock(&lmh_lock);
122
123 found = 0;
124 list_for_each_entry(lw, &lmh_list, lw_list) {
125 if (!strcmp(lw->lw_ops->lm_proto_name, proto_name)) {
126 found = 1;
127 break;
128 }
129 }
130
131 if (!found) {
132 if (!try && capable(CAP_SYS_MODULE)) {
133 try = 1;
134 mutex_unlock(&lmh_lock);
135 request_module(proto_name);
136 goto retry;
137 }
138 printk(KERN_INFO "GFS2: can't find protocol %s\n", proto_name);
139 error = -ENOENT;
140 goto out;
141 }
142
143 if (!try_module_get(lw->lw_ops->lm_owner)) {
144 try = 0;
145 mutex_unlock(&lmh_lock);
146 msleep(1000);
147 goto retry;
148 }
149
150 error = lw->lw_ops->lm_mount(table_name, host_data, cb, fsdata,
151 min_lvb_size, flags, lockstruct, fskobj);
152 if (error)
153 module_put(lw->lw_ops->lm_owner);
154 out:
155 mutex_unlock(&lmh_lock);
156 return error;
157}
158
159void gfs2_unmount_lockproto(struct lm_lockstruct *lockstruct)
160{
161 mutex_lock(&lmh_lock);
162 lockstruct->ls_ops->lm_unmount(lockstruct->ls_lockspace);
163 if (lockstruct->ls_ops->lm_owner)
164 module_put(lockstruct->ls_ops->lm_owner);
165 mutex_unlock(&lmh_lock);
166}
167
168/**
169 * gfs2_withdraw_lockproto - abnormally unmount a lock module
170 * @lockstruct: the lockstruct passed into mount
171 *
172 */
173
174void gfs2_withdraw_lockproto(struct lm_lockstruct *lockstruct)
175{
176 mutex_lock(&lmh_lock);
177 lockstruct->ls_ops->lm_withdraw(lockstruct->ls_lockspace);
178 if (lockstruct->ls_ops->lm_owner)
179 module_put(lockstruct->ls_ops->lm_owner);
180 mutex_unlock(&lmh_lock);
181}
182
183void __init gfs2_init_lmh(void)
184{
185 mutex_init(&lmh_lock);
186 INIT_LIST_HEAD(&lmh_list);
187}
188
189EXPORT_SYMBOL_GPL(gfs_register_lockproto);
190EXPORT_SYMBOL_GPL(gfs_unregister_lockproto);
191
diff --git a/fs/gfs2/locking/dlm/Makefile b/fs/gfs2/locking/dlm/Makefile
new file mode 100644
index 000000000000..a9733ff80371
--- /dev/null
+++ b/fs/gfs2/locking/dlm/Makefile
@@ -0,0 +1,3 @@
1obj-$(CONFIG_GFS2_FS) += lock_dlm.o
2lock_dlm-y := lock.o main.o mount.o sysfs.o thread.o plock.o
3
diff --git a/fs/gfs2/locking/dlm/lock.c b/fs/gfs2/locking/dlm/lock.c
new file mode 100644
index 000000000000..f769eac1a34a
--- /dev/null
+++ b/fs/gfs2/locking/dlm/lock.c
@@ -0,0 +1,541 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include "lock_dlm.h"
11
12static char junk_lvb[GDLM_LVB_SIZE];
13
14static void queue_complete(struct gdlm_lock *lp)
15{
16 struct gdlm_ls *ls = lp->ls;
17
18 clear_bit(LFL_ACTIVE, &lp->flags);
19
20 spin_lock(&ls->async_lock);
21 list_add_tail(&lp->clist, &ls->complete);
22 spin_unlock(&ls->async_lock);
23 wake_up(&ls->thread_wait);
24}
25
/* Completion callback handed to dlm_lock(); astarg is the gdlm_lock that was
   passed in as the callback argument. */

static inline void gdlm_ast(void *astarg)
{
	struct gdlm_lock *lp = astarg;

	queue_complete(lp);
}
30
31static inline void gdlm_bast(void *astarg, int mode)
32{
33 struct gdlm_lock *lp = astarg;
34 struct gdlm_ls *ls = lp->ls;
35
36 if (!mode) {
37 printk(KERN_INFO "lock_dlm: bast mode zero %x,%llx\n",
38 lp->lockname.ln_type,
39 (unsigned long long)lp->lockname.ln_number);
40 return;
41 }
42
43 spin_lock(&ls->async_lock);
44 if (!lp->bast_mode) {
45 list_add_tail(&lp->blist, &ls->blocking);
46 lp->bast_mode = mode;
47 } else if (lp->bast_mode < mode)
48 lp->bast_mode = mode;
49 spin_unlock(&ls->async_lock);
50 wake_up(&ls->thread_wait);
51}
52
53void gdlm_queue_delayed(struct gdlm_lock *lp)
54{
55 struct gdlm_ls *ls = lp->ls;
56
57 spin_lock(&ls->async_lock);
58 list_add_tail(&lp->delay_list, &ls->delayed);
59 spin_unlock(&ls->async_lock);
60}
61
62/* convert gfs lock-state to dlm lock-mode */
63
64static int16_t make_mode(int16_t lmstate)
65{
66 switch (lmstate) {
67 case LM_ST_UNLOCKED:
68 return DLM_LOCK_NL;
69 case LM_ST_EXCLUSIVE:
70 return DLM_LOCK_EX;
71 case LM_ST_DEFERRED:
72 return DLM_LOCK_CW;
73 case LM_ST_SHARED:
74 return DLM_LOCK_PR;
75 }
76 gdlm_assert(0, "unknown LM state %d", lmstate);
77 return -1;
78}
79
80/* convert dlm lock-mode to gfs lock-state */
81
82int16_t gdlm_make_lmstate(int16_t dlmmode)
83{
84 switch (dlmmode) {
85 case DLM_LOCK_IV:
86 case DLM_LOCK_NL:
87 return LM_ST_UNLOCKED;
88 case DLM_LOCK_EX:
89 return LM_ST_EXCLUSIVE;
90 case DLM_LOCK_CW:
91 return LM_ST_DEFERRED;
92 case DLM_LOCK_PR:
93 return LM_ST_SHARED;
94 }
95 gdlm_assert(0, "unknown DLM mode %d", dlmmode);
96 return -1;
97}
98
99/* verify agreement with GFS on the current lock state, NB: DLM_LOCK_NL and
100 DLM_LOCK_IV are both considered LM_ST_UNLOCKED by GFS. */
101
102static void check_cur_state(struct gdlm_lock *lp, unsigned int cur_state)
103{
104 int16_t cur = make_mode(cur_state);
105 if (lp->cur != DLM_LOCK_IV)
106 gdlm_assert(lp->cur == cur, "%d, %d", lp->cur, cur);
107}
108
109static inline unsigned int make_flags(struct gdlm_lock *lp,
110 unsigned int gfs_flags,
111 int16_t cur, int16_t req)
112{
113 unsigned int lkf = 0;
114
115 if (gfs_flags & LM_FLAG_TRY)
116 lkf |= DLM_LKF_NOQUEUE;
117
118 if (gfs_flags & LM_FLAG_TRY_1CB) {
119 lkf |= DLM_LKF_NOQUEUE;
120 lkf |= DLM_LKF_NOQUEUEBAST;
121 }
122
123 if (gfs_flags & LM_FLAG_PRIORITY) {
124 lkf |= DLM_LKF_NOORDER;
125 lkf |= DLM_LKF_HEADQUE;
126 }
127
128 if (gfs_flags & LM_FLAG_ANY) {
129 if (req == DLM_LOCK_PR)
130 lkf |= DLM_LKF_ALTCW;
131 else if (req == DLM_LOCK_CW)
132 lkf |= DLM_LKF_ALTPR;
133 }
134
135 if (lp->lksb.sb_lkid != 0) {
136 lkf |= DLM_LKF_CONVERT;
137
138 /* Conversion deadlock avoidance by DLM */
139
140 if (!test_bit(LFL_FORCE_PROMOTE, &lp->flags) &&
141 !(lkf & DLM_LKF_NOQUEUE) &&
142 cur > DLM_LOCK_NL && req > DLM_LOCK_NL && cur != req)
143 lkf |= DLM_LKF_CONVDEADLK;
144 }
145
146 if (lp->lvb)
147 lkf |= DLM_LKF_VALBLK;
148
149 return lkf;
150}
151
152/* make_strname - convert GFS lock numbers to a string */
153
154static inline void make_strname(struct lm_lockname *lockname,
155 struct gdlm_strname *str)
156{
157 sprintf(str->name, "%8x%16llx", lockname->ln_type,
158 (unsigned long long)lockname->ln_number);
159 str->namelen = GDLM_STRNAME_BYTES;
160}
161
162static int gdlm_create_lp(struct gdlm_ls *ls, struct lm_lockname *name,
163 struct gdlm_lock **lpp)
164{
165 struct gdlm_lock *lp;
166
167 lp = kzalloc(sizeof(struct gdlm_lock), GFP_KERNEL);
168 if (!lp)
169 return -ENOMEM;
170
171 lp->lockname = *name;
172 lp->ls = ls;
173 lp->cur = DLM_LOCK_IV;
174 lp->lvb = NULL;
175 lp->hold_null = NULL;
176 init_completion(&lp->ast_wait);
177 INIT_LIST_HEAD(&lp->clist);
178 INIT_LIST_HEAD(&lp->blist);
179 INIT_LIST_HEAD(&lp->delay_list);
180
181 spin_lock(&ls->async_lock);
182 list_add(&lp->all_list, &ls->all_locks);
183 ls->all_locks_count++;
184 spin_unlock(&ls->async_lock);
185
186 *lpp = lp;
187 return 0;
188}
189
190void gdlm_delete_lp(struct gdlm_lock *lp)
191{
192 struct gdlm_ls *ls = lp->ls;
193
194 spin_lock(&ls->async_lock);
195 if (!list_empty(&lp->clist))
196 list_del_init(&lp->clist);
197 if (!list_empty(&lp->blist))
198 list_del_init(&lp->blist);
199 if (!list_empty(&lp->delay_list))
200 list_del_init(&lp->delay_list);
201 gdlm_assert(!list_empty(&lp->all_list), "%x,%llx", lp->lockname.ln_type,
202 (unsigned long long)lp->lockname.ln_number);
203 list_del_init(&lp->all_list);
204 ls->all_locks_count--;
205 spin_unlock(&ls->async_lock);
206
207 kfree(lp);
208}
209
210int gdlm_get_lock(lm_lockspace_t *lockspace, struct lm_lockname *name,
211 lm_lock_t **lockp)
212{
213 struct gdlm_lock *lp;
214 int error;
215
216 error = gdlm_create_lp((struct gdlm_ls *) lockspace, name, &lp);
217
218 *lockp = (lm_lock_t *) lp;
219 return error;
220}
221
222void gdlm_put_lock(lm_lock_t *lock)
223{
224 gdlm_delete_lp((struct gdlm_lock *) lock);
225}
226
227unsigned int gdlm_do_lock(struct gdlm_lock *lp)
228{
229 struct gdlm_ls *ls = lp->ls;
230 struct gdlm_strname str;
231 int error, bast = 1;
232
233 /*
234 * When recovery is in progress, delay lock requests for submission
235 * once recovery is done. Requests for recovery (NOEXP) and unlocks
236 * can pass.
237 */
238
239 if (test_bit(DFL_BLOCK_LOCKS, &ls->flags) &&
240 !test_bit(LFL_NOBLOCK, &lp->flags) && lp->req != DLM_LOCK_NL) {
241 gdlm_queue_delayed(lp);
242 return LM_OUT_ASYNC;
243 }
244
245 /*
246 * Submit the actual lock request.
247 */
248
249 if (test_bit(LFL_NOBAST, &lp->flags))
250 bast = 0;
251
252 make_strname(&lp->lockname, &str);
253
254 set_bit(LFL_ACTIVE, &lp->flags);
255
256 log_debug("lk %x,%llx id %x %d,%d %x", lp->lockname.ln_type,
257 (unsigned long long)lp->lockname.ln_number, lp->lksb.sb_lkid,
258 lp->cur, lp->req, lp->lkf);
259
260 error = dlm_lock(ls->dlm_lockspace, lp->req, &lp->lksb, lp->lkf,
261 str.name, str.namelen, 0, gdlm_ast, lp,
262 bast ? gdlm_bast : NULL);
263
264 if ((error == -EAGAIN) && (lp->lkf & DLM_LKF_NOQUEUE)) {
265 lp->lksb.sb_status = -EAGAIN;
266 queue_complete(lp);
267 error = 0;
268 }
269
270 if (error) {
271 log_debug("%s: gdlm_lock %x,%llx err=%d cur=%d req=%d lkf=%x "
272 "flags=%lx", ls->fsname, lp->lockname.ln_type,
273 (unsigned long long)lp->lockname.ln_number, error,
274 lp->cur, lp->req, lp->lkf, lp->flags);
275 return LM_OUT_ERROR;
276 }
277 return LM_OUT_ASYNC;
278}
279
280static unsigned int gdlm_do_unlock(struct gdlm_lock *lp)
281{
282 struct gdlm_ls *ls = lp->ls;
283 unsigned int lkf = 0;
284 int error;
285
286 set_bit(LFL_DLM_UNLOCK, &lp->flags);
287 set_bit(LFL_ACTIVE, &lp->flags);
288
289 if (lp->lvb)
290 lkf = DLM_LKF_VALBLK;
291
292 log_debug("un %x,%llx %x %d %x", lp->lockname.ln_type,
293 (unsigned long long)lp->lockname.ln_number,
294 lp->lksb.sb_lkid, lp->cur, lkf);
295
296 error = dlm_unlock(ls->dlm_lockspace, lp->lksb.sb_lkid, lkf, NULL, lp);
297
298 if (error) {
299 log_debug("%s: gdlm_unlock %x,%llx err=%d cur=%d req=%d lkf=%x "
300 "flags=%lx", ls->fsname, lp->lockname.ln_type,
301 (unsigned long long)lp->lockname.ln_number, error,
302 lp->cur, lp->req, lp->lkf, lp->flags);
303 return LM_OUT_ERROR;
304 }
305 return LM_OUT_ASYNC;
306}
307
308unsigned int gdlm_lock(lm_lock_t *lock, unsigned int cur_state,
309 unsigned int req_state, unsigned int flags)
310{
311 struct gdlm_lock *lp = (struct gdlm_lock *) lock;
312
313 clear_bit(LFL_DLM_CANCEL, &lp->flags);
314 if (flags & LM_FLAG_NOEXP)
315 set_bit(LFL_NOBLOCK, &lp->flags);
316
317 check_cur_state(lp, cur_state);
318 lp->req = make_mode(req_state);
319 lp->lkf = make_flags(lp, flags, lp->cur, lp->req);
320
321 return gdlm_do_lock(lp);
322}
323
324unsigned int gdlm_unlock(lm_lock_t *lock, unsigned int cur_state)
325{
326 struct gdlm_lock *lp = (struct gdlm_lock *) lock;
327
328 clear_bit(LFL_DLM_CANCEL, &lp->flags);
329 if (lp->cur == DLM_LOCK_IV)
330 return 0;
331 return gdlm_do_unlock(lp);
332}
333
334void gdlm_cancel(lm_lock_t *lock)
335{
336 struct gdlm_lock *lp = (struct gdlm_lock *) lock;
337 struct gdlm_ls *ls = lp->ls;
338 int error, delay_list = 0;
339
340 if (test_bit(LFL_DLM_CANCEL, &lp->flags))
341 return;
342
343 log_info("gdlm_cancel %x,%llx flags %lx", lp->lockname.ln_type,
344 (unsigned long long)lp->lockname.ln_number, lp->flags);
345
346 spin_lock(&ls->async_lock);
347 if (!list_empty(&lp->delay_list)) {
348 list_del_init(&lp->delay_list);
349 delay_list = 1;
350 }
351 spin_unlock(&ls->async_lock);
352
353 if (delay_list) {
354 set_bit(LFL_CANCEL, &lp->flags);
355 set_bit(LFL_ACTIVE, &lp->flags);
356 queue_complete(lp);
357 return;
358 }
359
360 if (!test_bit(LFL_ACTIVE, &lp->flags) ||
361 test_bit(LFL_DLM_UNLOCK, &lp->flags)) {
362 log_info("gdlm_cancel skip %x,%llx flags %lx",
363 lp->lockname.ln_type,
364 (unsigned long long)lp->lockname.ln_number, lp->flags);
365 return;
366 }
367
368 /* the lock is blocked in the dlm */
369
370 set_bit(LFL_DLM_CANCEL, &lp->flags);
371 set_bit(LFL_ACTIVE, &lp->flags);
372
373 error = dlm_unlock(ls->dlm_lockspace, lp->lksb.sb_lkid, DLM_LKF_CANCEL,
374 NULL, lp);
375
376 log_info("gdlm_cancel rv %d %x,%llx flags %lx", error,
377 lp->lockname.ln_type,
378 (unsigned long long)lp->lockname.ln_number, lp->flags);
379
380 if (error == -EBUSY)
381 clear_bit(LFL_DLM_CANCEL, &lp->flags);
382}
383
384static int gdlm_add_lvb(struct gdlm_lock *lp)
385{
386 char *lvb;
387
388 lvb = kzalloc(GDLM_LVB_SIZE, GFP_KERNEL);
389 if (!lvb)
390 return -ENOMEM;
391
392 lp->lksb.sb_lvbptr = lvb;
393 lp->lvb = lvb;
394 return 0;
395}
396
397static void gdlm_del_lvb(struct gdlm_lock *lp)
398{
399 kfree(lp->lvb);
400 lp->lvb = NULL;
401 lp->lksb.sb_lvbptr = NULL;
402}
403
404/* This can do a synchronous dlm request (requiring a lock_dlm thread to get
405 the completion) because gfs won't call hold_lvb() during a callback (from
406 the context of a lock_dlm thread). */
407
408static int hold_null_lock(struct gdlm_lock *lp)
409{
410 struct gdlm_lock *lpn = NULL;
411 int error;
412
413 if (lp->hold_null) {
414 printk(KERN_INFO "lock_dlm: lvb already held\n");
415 return 0;
416 }
417
418 error = gdlm_create_lp(lp->ls, &lp->lockname, &lpn);
419 if (error)
420 goto out;
421
422 lpn->lksb.sb_lvbptr = junk_lvb;
423 lpn->lvb = junk_lvb;
424
425 lpn->req = DLM_LOCK_NL;
426 lpn->lkf = DLM_LKF_VALBLK | DLM_LKF_EXPEDITE;
427 set_bit(LFL_NOBAST, &lpn->flags);
428 set_bit(LFL_INLOCK, &lpn->flags);
429
430 init_completion(&lpn->ast_wait);
431 gdlm_do_lock(lpn);
432 wait_for_completion(&lpn->ast_wait);
433 error = lpn->lksb.sb_status;
434 if (error) {
435 printk(KERN_INFO "lock_dlm: hold_null_lock dlm error %d\n",
436 error);
437 gdlm_delete_lp(lpn);
438 lpn = NULL;
439 }
440 out:
441 lp->hold_null = lpn;
442 return error;
443}
444
445/* This cannot do a synchronous dlm request (requiring a lock_dlm thread to get
446 the completion) because gfs may call unhold_lvb() during a callback (from
447 the context of a lock_dlm thread) which could cause a deadlock since the
448 other lock_dlm thread could be engaged in recovery. */
449
450static void unhold_null_lock(struct gdlm_lock *lp)
451{
452 struct gdlm_lock *lpn = lp->hold_null;
453
454 gdlm_assert(lpn, "%x,%llx", lp->lockname.ln_type,
455 (unsigned long long)lp->lockname.ln_number);
456 lpn->lksb.sb_lvbptr = NULL;
457 lpn->lvb = NULL;
458 set_bit(LFL_UNLOCK_DELETE, &lpn->flags);
459 gdlm_do_unlock(lpn);
460 lp->hold_null = NULL;
461}
462
463/* Acquire a NL lock because gfs requires the value block to remain
464 intact on the resource while the lvb is "held" even if it's holding no locks
465 on the resource. */
466
467int gdlm_hold_lvb(lm_lock_t *lock, char **lvbp)
468{
469 struct gdlm_lock *lp = (struct gdlm_lock *) lock;
470 int error;
471
472 error = gdlm_add_lvb(lp);
473 if (error)
474 return error;
475
476 *lvbp = lp->lvb;
477
478 error = hold_null_lock(lp);
479 if (error)
480 gdlm_del_lvb(lp);
481
482 return error;
483}
484
485void gdlm_unhold_lvb(lm_lock_t *lock, char *lvb)
486{
487 struct gdlm_lock *lp = (struct gdlm_lock *) lock;
488
489 unhold_null_lock(lp);
490 gdlm_del_lvb(lp);
491}
492
493void gdlm_sync_lvb(lm_lock_t *lock, char *lvb)
494{
495 struct gdlm_lock *lp = (struct gdlm_lock *) lock;
496
497 if (lp->cur != DLM_LOCK_EX)
498 return;
499
500 init_completion(&lp->ast_wait);
501 set_bit(LFL_SYNC_LVB, &lp->flags);
502
503 lp->req = DLM_LOCK_EX;
504 lp->lkf = make_flags(lp, 0, lp->cur, lp->req);
505
506 gdlm_do_lock(lp);
507 wait_for_completion(&lp->ast_wait);
508}
509
510void gdlm_submit_delayed(struct gdlm_ls *ls)
511{
512 struct gdlm_lock *lp, *safe;
513
514 spin_lock(&ls->async_lock);
515 list_for_each_entry_safe(lp, safe, &ls->delayed, delay_list) {
516 list_del_init(&lp->delay_list);
517 list_add_tail(&lp->delay_list, &ls->submit);
518 }
519 spin_unlock(&ls->async_lock);
520 wake_up(&ls->thread_wait);
521}
522
523int gdlm_release_all_locks(struct gdlm_ls *ls)
524{
525 struct gdlm_lock *lp, *safe;
526 int count = 0;
527
528 spin_lock(&ls->async_lock);
529 list_for_each_entry_safe(lp, safe, &ls->all_locks, all_list) {
530 list_del_init(&lp->all_list);
531
532 if (lp->lvb && lp->lvb != junk_lvb)
533 kfree(lp->lvb);
534 kfree(lp);
535 count++;
536 }
537 spin_unlock(&ls->async_lock);
538
539 return count;
540}
541
diff --git a/fs/gfs2/locking/dlm/lock_dlm.h b/fs/gfs2/locking/dlm/lock_dlm.h
new file mode 100644
index 000000000000..530c2f542584
--- /dev/null
+++ b/fs/gfs2/locking/dlm/lock_dlm.h
@@ -0,0 +1,188 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef LOCK_DLM_DOT_H
11#define LOCK_DLM_DOT_H
12
13#include <linux/module.h>
14#include <linux/slab.h>
15#include <linux/spinlock.h>
16#include <linux/module.h>
17#include <linux/types.h>
18#include <linux/string.h>
19#include <linux/list.h>
20#include <linux/socket.h>
21#include <linux/delay.h>
22#include <linux/kthread.h>
23#include <linux/kobject.h>
24#include <linux/fcntl.h>
25#include <linux/wait.h>
26#include <net/sock.h>
27
28#include <linux/dlm.h>
29#include "../../lm_interface.h"
30
31/*
32 * Internally, we prefix things with gdlm_ and GDLM_ (for gfs-dlm) since a
33 * prefix of lock_dlm_ gets awkward. Externally, GFS refers to this module
34 * as "lock_dlm".
35 */
36
37#define GDLM_STRNAME_BYTES 24
38#define GDLM_LVB_SIZE 32
39#define GDLM_DROP_COUNT 50000
40#define GDLM_DROP_PERIOD 60
41#define GDLM_NAME_LEN 128
42
43/* GFS uses 12 bytes to identify a resource (32 bit type + 64 bit number).
44 We sprintf these numbers into a 24 byte string of hex values to make them
45 human-readable (to make debugging simpler.) */
46
/* Resource name in the form handed to dlm_lock().  name holds exactly
   GDLM_STRNAME_BYTES characters and has no room for a terminating NUL;
   namelen carries the length instead. */
47struct gdlm_strname {
48 unsigned char name[GDLM_STRNAME_BYTES];
49 unsigned short namelen;
50};
51
/* Bit numbers for gdlm_ls.flags, used with test_bit() and friends.  While
   DFL_BLOCK_LOCKS is set, gdlm_do_lock() parks requests on the delayed list
   unless they are LFL_NOBLOCK or NL requests. */
52enum {
53 DFL_BLOCK_LOCKS = 0,
54 DFL_SPECTATOR = 1,
55 DFL_WITHDRAW = 2,
56};
57
/* Per-lockspace state of the lock_dlm module.  lock.c modifies the complete,
   blocking, delayed, submit and all_locks lists, and all_locks_count, only
   while holding async_lock, and wakes thread_wait after queueing work. */
58struct gdlm_ls {
59 uint32_t id;
60 int jid;
61 int first;
62 int first_done;
63 unsigned long flags; /* DFL_ bit numbers */
64 struct kobject kobj;
65 char clustername[GDLM_NAME_LEN];
66 char fsname[GDLM_NAME_LEN];
67 int fsflags;
68 dlm_lockspace_t *dlm_lockspace;
69 lm_callback_t fscb;
70 lm_fsdata_t *fsdata;
71 int recover_jid;
72 int recover_jid_done;
73 int recover_jid_status;
74 spinlock_t async_lock;
75 struct list_head complete;
76 struct list_head blocking;
77 struct list_head delayed;
78 struct list_head submit;
79 struct list_head all_locks;
80 uint32_t all_locks_count;
81 wait_queue_head_t wait_control;
82 struct task_struct *thread1;
83 struct task_struct *thread2;
84 wait_queue_head_t thread_wait;
85 unsigned long drop_time;
86 int drop_locks_count;
87 int drop_locks_period;
88};
89
/* Bit numbers for gdlm_lock.flags, manipulated with set_bit(), clear_bit()
   and test_bit(). */
90enum {
91 LFL_NOBLOCK = 0,
92 LFL_NOCACHE = 1,
93 LFL_DLM_UNLOCK = 2,
94 LFL_DLM_CANCEL = 3,
95 LFL_SYNC_LVB = 4,
96 LFL_FORCE_PROMOTE = 5,
97 LFL_REREQUEST = 6,
98 LFL_ACTIVE = 7,
99 LFL_INLOCK = 8,
100 LFL_CANCEL = 9,
101 LFL_NOBAST = 10,
102 LFL_HEADQUE = 11,
103 LFL_UNLOCK_DELETE = 12,
104};
105
/* The lock module's per-lock structure; GFS sees it only as an opaque
   lm_lock_t pointer.  cur and req hold dlm modes (DLM_LOCK_*), and lksb is
   the status block passed to dlm_lock(). */
106struct gdlm_lock {
107 struct gdlm_ls *ls;
108 struct lm_lockname lockname;
109 char *lvb;
110 struct dlm_lksb lksb;
111
112 int16_t cur;
113 int16_t req;
114 int16_t prev_req;
115 uint32_t lkf; /* dlm flags DLM_LKF_ */
116 unsigned long flags; /* lock_dlm flags LFL_ */
117
118 int bast_mode; /* protected by async_lock */
119 struct completion ast_wait;
120
121 struct list_head clist; /* complete */
122 struct list_head blist; /* blocking */
123 struct list_head delay_list; /* delayed */
124 struct list_head all_list; /* all locks for the fs */
125 struct gdlm_lock *hold_null; /* NL lock for hold_lvb */
126};
127
/* Fatal assertion: when the condition is false, log the stringified
   condition and a formatted message at KERN_EMERG, then BUG().  Because the
   condition text is printed verbatim, renaming variables used in it changes
   the log output. */
128#define gdlm_assert(assertion, fmt, args...) \
129do { \
130 if (unlikely(!(assertion))) { \
131 printk(KERN_EMERG "lock_dlm: fatal assertion failed \"%s\"\n" \
132 "lock_dlm: " fmt "\n", \
133 #assertion, ##args); \
134 BUG(); \
135 } \
136} while (0)
137
/* Logging helpers: each message is prefixed with "lock_dlm: " and ends with
   a newline.  log_debug() expands to nothing unless LOCK_DLM_LOG_DEBUG is
   defined, so its arguments are not evaluated in normal builds. */
138#define log_print(lev, fmt, arg...) printk(lev "lock_dlm: " fmt "\n" , ## arg)
139#define log_info(fmt, arg...) log_print(KERN_INFO , fmt , ## arg)
140#define log_error(fmt, arg...) log_print(KERN_ERR , fmt , ## arg)
141#ifdef LOCK_DLM_LOG_DEBUG
142#define log_debug(fmt, arg...) log_print(KERN_DEBUG , fmt , ## arg)
143#else
144#define log_debug(fmt, arg...)
145#endif
146
147/* sysfs.c */
148
149int gdlm_sysfs_init(void);
150void gdlm_sysfs_exit(void);
151int gdlm_kobject_setup(struct gdlm_ls *, struct kobject *);
152void gdlm_kobject_release(struct gdlm_ls *);
153
154/* thread.c */
155
156int gdlm_init_threads(struct gdlm_ls *);
157void gdlm_release_threads(struct gdlm_ls *);
158
159/* lock.c */
160
161int16_t gdlm_make_lmstate(int16_t);
162void gdlm_queue_delayed(struct gdlm_lock *);
163void gdlm_submit_delayed(struct gdlm_ls *);
164int gdlm_release_all_locks(struct gdlm_ls *);
165void gdlm_delete_lp(struct gdlm_lock *);
166unsigned int gdlm_do_lock(struct gdlm_lock *);
167
168int gdlm_get_lock(lm_lockspace_t *, struct lm_lockname *, lm_lock_t **);
169void gdlm_put_lock(lm_lock_t *);
170unsigned int gdlm_lock(lm_lock_t *, unsigned int, unsigned int, unsigned int);
171unsigned int gdlm_unlock(lm_lock_t *, unsigned int);
172void gdlm_cancel(lm_lock_t *);
173int gdlm_hold_lvb(lm_lock_t *, char **);
174void gdlm_unhold_lvb(lm_lock_t *, char *);
175void gdlm_sync_lvb(lm_lock_t *, char *);
176
177/* plock.c */
178
179int gdlm_plock_init(void);
180void gdlm_plock_exit(void);
181int gdlm_plock(lm_lockspace_t *, struct lm_lockname *, struct file *, int,
182 struct file_lock *);
183int gdlm_plock_get(lm_lockspace_t *, struct lm_lockname *, struct file *,
184 struct file_lock *);
185int gdlm_punlock(lm_lockspace_t *, struct lm_lockname *, struct file *,
186 struct file_lock *);
187#endif
188
diff --git a/fs/gfs2/locking/dlm/main.c b/fs/gfs2/locking/dlm/main.c
new file mode 100644
index 000000000000..89728c91665f
--- /dev/null
+++ b/fs/gfs2/locking/dlm/main.c
@@ -0,0 +1,64 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/init.h>
11
12#include "lock_dlm.h"
13
14extern int gdlm_drop_count;
15extern int gdlm_drop_period;
16
17extern struct lm_lockops gdlm_ops;
18
19static int __init init_lock_dlm(void)
20{
21 int error;
22
23 error = gfs_register_lockproto(&gdlm_ops);
24 if (error) {
25 printk(KERN_WARNING "lock_dlm: can't register protocol: %d\n",
26 error);
27 return error;
28 }
29
30 error = gdlm_sysfs_init();
31 if (error) {
32 gfs_unregister_lockproto(&gdlm_ops);
33 return error;
34 }
35
36 error = gdlm_plock_init();
37 if (error) {
38 gdlm_sysfs_exit();
39 gfs_unregister_lockproto(&gdlm_ops);
40 return error;
41 }
42
43 gdlm_drop_count = GDLM_DROP_COUNT;
44 gdlm_drop_period = GDLM_DROP_PERIOD;
45
46 printk(KERN_INFO
47 "Lock_DLM (built %s %s) installed\n", __DATE__, __TIME__);
48 return 0;
49}
50
51static void __exit exit_lock_dlm(void)
52{
53 gdlm_plock_exit();
54 gdlm_sysfs_exit();
55 gfs_unregister_lockproto(&gdlm_ops);
56}
57
58module_init(init_lock_dlm);
59module_exit(exit_lock_dlm);
60
61MODULE_DESCRIPTION("GFS DLM Locking Module");
62MODULE_AUTHOR("Red Hat, Inc.");
63MODULE_LICENSE("GPL");
64
diff --git a/fs/gfs2/locking/dlm/mount.c b/fs/gfs2/locking/dlm/mount.c
new file mode 100644
index 000000000000..3caeafc02a1b
--- /dev/null
+++ b/fs/gfs2/locking/dlm/mount.c
@@ -0,0 +1,256 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include "lock_dlm.h"
11
12int gdlm_drop_count;
13int gdlm_drop_period;
14struct lm_lockops gdlm_ops;
15
16
17static struct gdlm_ls *init_gdlm(lm_callback_t cb, lm_fsdata_t *fsdata,
18 int flags, char *table_name)
19{
20 struct gdlm_ls *ls;
21 char buf[256], *p;
22
23 ls = kzalloc(sizeof(struct gdlm_ls), GFP_KERNEL);
24 if (!ls)
25 return NULL;
26
27 ls->drop_locks_count = gdlm_drop_count;
28 ls->drop_locks_period = gdlm_drop_period;
29 ls->fscb = cb;
30 ls->fsdata = fsdata;
31 ls->fsflags = flags;
32 spin_lock_init(&ls->async_lock);
33 INIT_LIST_HEAD(&ls->complete);
34 INIT_LIST_HEAD(&ls->blocking);
35 INIT_LIST_HEAD(&ls->delayed);
36 INIT_LIST_HEAD(&ls->submit);
37 INIT_LIST_HEAD(&ls->all_locks);
38 init_waitqueue_head(&ls->thread_wait);
39 init_waitqueue_head(&ls->wait_control);
40 ls->thread1 = NULL;
41 ls->thread2 = NULL;
42 ls->drop_time = jiffies;
43 ls->jid = -1;
44
45 strncpy(buf, table_name, 256);
46 buf[255] = '\0';
47
48 p = strstr(buf, ":");
49 if (!p) {
50 log_info("invalid table_name \"%s\"", table_name);
51 kfree(ls);
52 return NULL;
53 }
54 *p = '\0';
55 p++;
56
57 strncpy(ls->clustername, buf, GDLM_NAME_LEN);
58 strncpy(ls->fsname, p, GDLM_NAME_LEN);
59
60 return ls;
61}
62
63static int make_args(struct gdlm_ls *ls, char *data_arg, int *nodir)
64{
65 char data[256];
66 char *options, *x, *y;
67 int error = 0;
68
69 memset(data, 0, 256);
70 strncpy(data, data_arg, 255);
71
72 for (options = data; (x = strsep(&options, ":")); ) {
73 if (!*x)
74 continue;
75
76 y = strchr(x, '=');
77 if (y)
78 *y++ = 0;
79
80 if (!strcmp(x, "jid")) {
81 if (!y) {
82 log_error("need argument to jid");
83 error = -EINVAL;
84 break;
85 }
86 sscanf(y, "%u", &ls->jid);
87
88 } else if (!strcmp(x, "first")) {
89 if (!y) {
90 log_error("need argument to first");
91 error = -EINVAL;
92 break;
93 }
94 sscanf(y, "%u", &ls->first);
95
96 } else if (!strcmp(x, "id")) {
97 if (!y) {
98 log_error("need argument to id");
99 error = -EINVAL;
100 break;
101 }
102 sscanf(y, "%u", &ls->id);
103
104 } else if (!strcmp(x, "nodir")) {
105 if (!y) {
106 log_error("need argument to nodir");
107 error = -EINVAL;
108 break;
109 }
110 sscanf(y, "%u", nodir);
111
112 } else {
113 log_error("unkonwn option: %s", x);
114 error = -EINVAL;
115 break;
116 }
117 }
118
119 return error;
120}
121
122static int gdlm_mount(char *table_name, char *host_data,
123 lm_callback_t cb, lm_fsdata_t *fsdata,
124 unsigned int min_lvb_size, int flags,
125 struct lm_lockstruct *lockstruct,
126 struct kobject *fskobj)
127{
128 struct gdlm_ls *ls;
129 int error = -ENOMEM, nodir = 0;
130
131 if (min_lvb_size > GDLM_LVB_SIZE)
132 goto out;
133
134 ls = init_gdlm(cb, fsdata, flags, table_name);
135 if (!ls)
136 goto out;
137
138 error = make_args(ls, host_data, &nodir);
139 if (error)
140 goto out;
141
142 error = gdlm_init_threads(ls);
143 if (error)
144 goto out_free;
145
146 error = gdlm_kobject_setup(ls, fskobj);
147 if (error)
148 goto out_thread;
149
150 error = dlm_new_lockspace(ls->fsname, strlen(ls->fsname),
151 &ls->dlm_lockspace,
152 nodir ? DLM_LSFL_NODIR : 0,
153 GDLM_LVB_SIZE);
154 if (error) {
155 log_error("dlm_new_lockspace error %d", error);
156 goto out_kobj;
157 }
158
159 lockstruct->ls_jid = ls->jid;
160 lockstruct->ls_first = ls->first;
161 lockstruct->ls_lockspace = ls;
162 lockstruct->ls_ops = &gdlm_ops;
163 lockstruct->ls_flags = 0;
164 lockstruct->ls_lvb_size = GDLM_LVB_SIZE;
165 return 0;
166
167 out_kobj:
168 gdlm_kobject_release(ls);
169 out_thread:
170 gdlm_release_threads(ls);
171 out_free:
172 kfree(ls);
173 out:
174 return error;
175}
176
177static void gdlm_unmount(lm_lockspace_t *lockspace)
178{
179 struct gdlm_ls *ls = (struct gdlm_ls *) lockspace;
180 int rv;
181
182 log_debug("unmount flags %lx", ls->flags);
183
184 /* FIXME: serialize unmount and withdraw in case they
185 happen at once. Also, if unmount follows withdraw,
186 wait for withdraw to finish. */
187
188 if (test_bit(DFL_WITHDRAW, &ls->flags))
189 goto out;
190
191 gdlm_kobject_release(ls);
192 dlm_release_lockspace(ls->dlm_lockspace, 2);
193 gdlm_release_threads(ls);
194 rv = gdlm_release_all_locks(ls);
195 if (rv)
196 log_info("gdlm_unmount: %d stray locks freed", rv);
197 out:
198 kfree(ls);
199}
200
201static void gdlm_recovery_done(lm_lockspace_t *lockspace, unsigned int jid,
202 unsigned int message)
203{
204 struct gdlm_ls *ls = (struct gdlm_ls *) lockspace;
205 ls->recover_jid_done = jid;
206 ls->recover_jid_status = message;
207 kobject_uevent(&ls->kobj, KOBJ_CHANGE);
208}
209
210static void gdlm_others_may_mount(lm_lockspace_t *lockspace)
211{
212 struct gdlm_ls *ls = (struct gdlm_ls *) lockspace;
213 ls->first_done = 1;
214 kobject_uevent(&ls->kobj, KOBJ_CHANGE);
215}
216
217/* Userspace gets the offline uevent, blocks new gfs locks on
218 other mounters, and lets us know (sets WITHDRAW flag). Then,
219 userspace leaves the mount group while we leave the lockspace. */
220
221static void gdlm_withdraw(lm_lockspace_t *lockspace)
222{
223 struct gdlm_ls *ls = (struct gdlm_ls *) lockspace;
224
225 kobject_uevent(&ls->kobj, KOBJ_OFFLINE);
226
227 wait_event_interruptible(ls->wait_control,
228 test_bit(DFL_WITHDRAW, &ls->flags));
229
230 dlm_release_lockspace(ls->dlm_lockspace, 2);
231 gdlm_release_threads(ls);
232 gdlm_release_all_locks(ls);
233 gdlm_kobject_release(ls);
234}
235
236struct lm_lockops gdlm_ops = {
237 .lm_proto_name = "lock_dlm",
238 .lm_mount = gdlm_mount,
239 .lm_others_may_mount = gdlm_others_may_mount,
240 .lm_unmount = gdlm_unmount,
241 .lm_withdraw = gdlm_withdraw,
242 .lm_get_lock = gdlm_get_lock,
243 .lm_put_lock = gdlm_put_lock,
244 .lm_lock = gdlm_lock,
245 .lm_unlock = gdlm_unlock,
246 .lm_plock = gdlm_plock,
247 .lm_punlock = gdlm_punlock,
248 .lm_plock_get = gdlm_plock_get,
249 .lm_cancel = gdlm_cancel,
250 .lm_hold_lvb = gdlm_hold_lvb,
251 .lm_unhold_lvb = gdlm_unhold_lvb,
252 .lm_sync_lvb = gdlm_sync_lvb,
253 .lm_recovery_done = gdlm_recovery_done,
254 .lm_owner = THIS_MODULE,
255};
256
diff --git a/fs/gfs2/locking/dlm/plock.c b/fs/gfs2/locking/dlm/plock.c
new file mode 100644
index 000000000000..6adfb2d4fd8c
--- /dev/null
+++ b/fs/gfs2/locking/dlm/plock.c
@@ -0,0 +1,299 @@
1/*
2 * Copyright (C) 2005 Red Hat, Inc. All rights reserved.
3 *
4 * This copyrighted material is made available to anyone wishing to use,
5 * modify, copy, or redistribute it subject to the terms and conditions
6 * of the GNU General Public License v.2.
7 */
8
9#include <linux/miscdevice.h>
10#include <linux/lock_dlm_plock.h>
11
12#include "lock_dlm.h"
13
14
15static spinlock_t ops_lock;
16static struct list_head send_list;
17static struct list_head recv_list;
18static wait_queue_head_t send_wq;
19static wait_queue_head_t recv_wq;
20
21struct plock_op {
22 struct list_head list;
23 int done;
24 struct gdlm_plock_info info;
25};
26
27static inline void set_version(struct gdlm_plock_info *info)
28{
29 info->version[0] = GDLM_PLOCK_VERSION_MAJOR;
30 info->version[1] = GDLM_PLOCK_VERSION_MINOR;
31 info->version[2] = GDLM_PLOCK_VERSION_PATCH;
32}
33
34static int check_version(struct gdlm_plock_info *info)
35{
36 if ((GDLM_PLOCK_VERSION_MAJOR != info->version[0]) ||
37 (GDLM_PLOCK_VERSION_MINOR < info->version[1])) {
38 log_error("plock device version mismatch: "
39 "kernel (%u.%u.%u), user (%u.%u.%u)",
40 GDLM_PLOCK_VERSION_MAJOR,
41 GDLM_PLOCK_VERSION_MINOR,
42 GDLM_PLOCK_VERSION_PATCH,
43 info->version[0],
44 info->version[1],
45 info->version[2]);
46 return -EINVAL;
47 }
48 return 0;
49}
50
51static void send_op(struct plock_op *op)
52{
53 set_version(&op->info);
54 INIT_LIST_HEAD(&op->list);
55 spin_lock(&ops_lock);
56 list_add_tail(&op->list, &send_list);
57 spin_unlock(&ops_lock);
58 wake_up(&send_wq);
59}
60
61int gdlm_plock(lm_lockspace_t *lockspace, struct lm_lockname *name,
62 struct file *file, int cmd, struct file_lock *fl)
63{
64 struct gdlm_ls *ls = (struct gdlm_ls *) lockspace;
65 struct plock_op *op;
66 int rv;
67
68 op = kzalloc(sizeof(*op), GFP_KERNEL);
69 if (!op)
70 return -ENOMEM;
71
72 op->info.optype = GDLM_PLOCK_OP_LOCK;
73 op->info.pid = fl->fl_pid;
74 op->info.ex = (fl->fl_type == F_WRLCK);
75 op->info.wait = IS_SETLKW(cmd);
76 op->info.fsid = ls->id;
77 op->info.number = name->ln_number;
78 op->info.start = fl->fl_start;
79 op->info.end = fl->fl_end;
80
81 send_op(op);
82 wait_event(recv_wq, (op->done != 0));
83
84 spin_lock(&ops_lock);
85 if (!list_empty(&op->list)) {
86 printk(KERN_INFO "plock op on list\n");
87 list_del(&op->list);
88 }
89 spin_unlock(&ops_lock);
90
91 rv = op->info.rv;
92
93 if (!rv) {
94 if (posix_lock_file_wait(file, fl) < 0)
95 log_error("gdlm_plock: vfs lock error %x,%llx",
96 name->ln_type,
97 (unsigned long long)name->ln_number);
98 }
99
100 kfree(op);
101 return rv;
102}
103
104int gdlm_punlock(lm_lockspace_t *lockspace, struct lm_lockname *name,
105 struct file *file, struct file_lock *fl)
106{
107 struct gdlm_ls *ls = (struct gdlm_ls *) lockspace;
108 struct plock_op *op;
109 int rv;
110
111 op = kzalloc(sizeof(*op), GFP_KERNEL);
112 if (!op)
113 return -ENOMEM;
114
115 if (posix_lock_file_wait(file, fl) < 0)
116 log_error("gdlm_punlock: vfs unlock error %x,%llx",
117 name->ln_type, (unsigned long long)name->ln_number);
118
119 op->info.optype = GDLM_PLOCK_OP_UNLOCK;
120 op->info.pid = fl->fl_pid;
121 op->info.fsid = ls->id;
122 op->info.number = name->ln_number;
123 op->info.start = fl->fl_start;
124 op->info.end = fl->fl_end;
125
126 send_op(op);
127 wait_event(recv_wq, (op->done != 0));
128
129 spin_lock(&ops_lock);
130 if (!list_empty(&op->list)) {
131 printk(KERN_INFO "punlock op on list\n");
132 list_del(&op->list);
133 }
134 spin_unlock(&ops_lock);
135
136 rv = op->info.rv;
137
138 kfree(op);
139 return rv;
140}
141
142int gdlm_plock_get(lm_lockspace_t *lockspace, struct lm_lockname *name,
143 struct file *file, struct file_lock *fl)
144{
145 struct gdlm_ls *ls = (struct gdlm_ls *) lockspace;
146 struct plock_op *op;
147 int rv;
148
149 op = kzalloc(sizeof(*op), GFP_KERNEL);
150 if (!op)
151 return -ENOMEM;
152
153 op->info.optype = GDLM_PLOCK_OP_GET;
154 op->info.pid = fl->fl_pid;
155 op->info.ex = (fl->fl_type == F_WRLCK);
156 op->info.fsid = ls->id;
157 op->info.number = name->ln_number;
158 op->info.start = fl->fl_start;
159 op->info.end = fl->fl_end;
160
161 send_op(op);
162 wait_event(recv_wq, (op->done != 0));
163
164 spin_lock(&ops_lock);
165 if (!list_empty(&op->list)) {
166 printk(KERN_INFO "plock_get op on list\n");
167 list_del(&op->list);
168 }
169 spin_unlock(&ops_lock);
170
171 rv = op->info.rv;
172
173 if (rv == 0)
174 fl->fl_type = F_UNLCK;
175 else if (rv > 0) {
176 fl->fl_type = (op->info.ex) ? F_WRLCK : F_RDLCK;
177 fl->fl_pid = op->info.pid;
178 fl->fl_start = op->info.start;
179 fl->fl_end = op->info.end;
180 }
181
182 kfree(op);
183 return rv;
184}
185
186/* a read copies out one plock request from the send list */
187static ssize_t dev_read(struct file *file, char __user *u, size_t count,
188 loff_t *ppos)
189{
190 struct gdlm_plock_info info;
191 struct plock_op *op = NULL;
192
193 if (count < sizeof(info))
194 return -EINVAL;
195
196 spin_lock(&ops_lock);
197 if (!list_empty(&send_list)) {
198 op = list_entry(send_list.next, struct plock_op, list);
199 list_move(&op->list, &recv_list);
200 memcpy(&info, &op->info, sizeof(info));
201 }
202 spin_unlock(&ops_lock);
203
204 if (!op)
205 return -EAGAIN;
206
207 if (copy_to_user(u, &info, sizeof(info)))
208 return -EFAULT;
209 return sizeof(info);
210}
211
212/* a write copies in one plock result that should match a plock_op
213 on the recv list */
214static ssize_t dev_write(struct file *file, const char __user *u, size_t count,
215 loff_t *ppos)
216{
217 struct gdlm_plock_info info;
218 struct plock_op *op;
219 int found = 0;
220
221 if (count != sizeof(info))
222 return -EINVAL;
223
224 if (copy_from_user(&info, u, sizeof(info)))
225 return -EFAULT;
226
227 if (check_version(&info))
228 return -EINVAL;
229
230 spin_lock(&ops_lock);
231 list_for_each_entry(op, &recv_list, list) {
232 if (op->info.fsid == info.fsid &&
233 op->info.number == info.number) {
234 list_del_init(&op->list);
235 found = 1;
236 op->done = 1;
237 memcpy(&op->info, &info, sizeof(info));
238 break;
239 }
240 }
241 spin_unlock(&ops_lock);
242
243 if (found)
244 wake_up(&recv_wq);
245 else
246 printk(KERN_INFO "gdlm dev_write no op %x %llx\n", info.fsid,
247 (unsigned long long)info.number);
248 return count;
249}
250
251static unsigned int dev_poll(struct file *file, poll_table *wait)
252{
253 poll_wait(file, &send_wq, wait);
254
255 spin_lock(&ops_lock);
256 if (!list_empty(&send_list)) {
257 spin_unlock(&ops_lock);
258 return POLLIN | POLLRDNORM;
259 }
260 spin_unlock(&ops_lock);
261 return 0;
262}
263
264static struct file_operations dev_fops = {
265 .read = dev_read,
266 .write = dev_write,
267 .poll = dev_poll,
268 .owner = THIS_MODULE
269};
270
271static struct miscdevice plock_dev_misc = {
272 .minor = MISC_DYNAMIC_MINOR,
273 .name = GDLM_PLOCK_MISC_NAME,
274 .fops = &dev_fops
275};
276
277int gdlm_plock_init(void)
278{
279 int rv;
280
281 spin_lock_init(&ops_lock);
282 INIT_LIST_HEAD(&send_list);
283 INIT_LIST_HEAD(&recv_list);
284 init_waitqueue_head(&send_wq);
285 init_waitqueue_head(&recv_wq);
286
287 rv = misc_register(&plock_dev_misc);
288 if (rv)
289 printk(KERN_INFO "gdlm_plock_init: misc_register failed %d",
290 rv);
291 return rv;
292}
293
294void gdlm_plock_exit(void)
295{
296 if (misc_deregister(&plock_dev_misc) < 0)
297 printk(KERN_INFO "gdlm_plock_exit: misc_deregister failed");
298}
299
diff --git a/fs/gfs2/locking/dlm/sysfs.c b/fs/gfs2/locking/dlm/sysfs.c
new file mode 100644
index 000000000000..0d8bd0806dba
--- /dev/null
+++ b/fs/gfs2/locking/dlm/sysfs.c
@@ -0,0 +1,225 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/ctype.h>
11#include <linux/stat.h>
12
13#include "lock_dlm.h"
14
15extern struct lm_lockops gdlm_ops;
16
17static ssize_t proto_name_show(struct gdlm_ls *ls, char *buf)
18{
19 return sprintf(buf, "%s\n", gdlm_ops.lm_proto_name);
20}
21
22static ssize_t block_show(struct gdlm_ls *ls, char *buf)
23{
24 ssize_t ret;
25 int val = 0;
26
27 if (test_bit(DFL_BLOCK_LOCKS, &ls->flags))
28 val = 1;
29 ret = sprintf(buf, "%d\n", val);
30 return ret;
31}
32
33static ssize_t block_store(struct gdlm_ls *ls, const char *buf, size_t len)
34{
35 ssize_t ret = len;
36 int val;
37
38 val = simple_strtol(buf, NULL, 0);
39
40 if (val == 1)
41 set_bit(DFL_BLOCK_LOCKS, &ls->flags);
42 else if (val == 0) {
43 clear_bit(DFL_BLOCK_LOCKS, &ls->flags);
44 gdlm_submit_delayed(ls);
45 } else
46 ret = -EINVAL;
47 return ret;
48}
49
50static ssize_t withdraw_show(struct gdlm_ls *ls, char *buf)
51{
52 ssize_t ret;
53 int val = 0;
54
55 if (test_bit(DFL_WITHDRAW, &ls->flags))
56 val = 1;
57 ret = sprintf(buf, "%d\n", val);
58 return ret;
59}
60
61static ssize_t withdraw_store(struct gdlm_ls *ls, const char *buf, size_t len)
62{
63 ssize_t ret = len;
64 int val;
65
66 val = simple_strtol(buf, NULL, 0);
67
68 if (val == 1)
69 set_bit(DFL_WITHDRAW, &ls->flags);
70 else
71 ret = -EINVAL;
72 wake_up(&ls->wait_control);
73 return ret;
74}
75
76static ssize_t id_show(struct gdlm_ls *ls, char *buf)
77{
78 return sprintf(buf, "%u\n", ls->id);
79}
80
81static ssize_t jid_show(struct gdlm_ls *ls, char *buf)
82{
83 return sprintf(buf, "%d\n", ls->jid);
84}
85
86static ssize_t first_show(struct gdlm_ls *ls, char *buf)
87{
88 return sprintf(buf, "%d\n", ls->first);
89}
90
91static ssize_t first_done_show(struct gdlm_ls *ls, char *buf)
92{
93 return sprintf(buf, "%d\n", ls->first_done);
94}
95
96static ssize_t recover_show(struct gdlm_ls *ls, char *buf)
97{
98 return sprintf(buf, "%d\n", ls->recover_jid);
99}
100
101static ssize_t recover_store(struct gdlm_ls *ls, const char *buf, size_t len)
102{
103 ls->recover_jid = simple_strtol(buf, NULL, 0);
104 ls->fscb(ls->fsdata, LM_CB_NEED_RECOVERY, &ls->recover_jid);
105 return len;
106}
107
108static ssize_t recover_done_show(struct gdlm_ls *ls, char *buf)
109{
110 return sprintf(buf, "%d\n", ls->recover_jid_done);
111}
112
113static ssize_t recover_status_show(struct gdlm_ls *ls, char *buf)
114{
115 return sprintf(buf, "%d\n", ls->recover_jid_status);
116}
117
118struct gdlm_attr {
119 struct attribute attr;
120 ssize_t (*show)(struct gdlm_ls *, char *);
121 ssize_t (*store)(struct gdlm_ls *, const char *, size_t);
122};
123
124#define GDLM_ATTR(_name,_mode,_show,_store) \
125static struct gdlm_attr gdlm_attr_##_name = __ATTR(_name,_mode,_show,_store)
126
127GDLM_ATTR(proto_name, 0444, proto_name_show, NULL);
128GDLM_ATTR(block, 0644, block_show, block_store);
129GDLM_ATTR(withdraw, 0644, withdraw_show, withdraw_store);
130GDLM_ATTR(id, 0444, id_show, NULL);
131GDLM_ATTR(jid, 0444, jid_show, NULL);
132GDLM_ATTR(first, 0444, first_show, NULL);
133GDLM_ATTR(first_done, 0444, first_done_show, NULL);
134GDLM_ATTR(recover, 0644, recover_show, recover_store);
135GDLM_ATTR(recover_done, 0444, recover_done_show, NULL);
136GDLM_ATTR(recover_status, 0444, recover_status_show, NULL);
137
138static struct attribute *gdlm_attrs[] = {
139 &gdlm_attr_proto_name.attr,
140 &gdlm_attr_block.attr,
141 &gdlm_attr_withdraw.attr,
142 &gdlm_attr_id.attr,
143 &gdlm_attr_jid.attr,
144 &gdlm_attr_first.attr,
145 &gdlm_attr_first_done.attr,
146 &gdlm_attr_recover.attr,
147 &gdlm_attr_recover_done.attr,
148 &gdlm_attr_recover_status.attr,
149 NULL,
150};
151
152static ssize_t gdlm_attr_show(struct kobject *kobj, struct attribute *attr,
153 char *buf)
154{
155 struct gdlm_ls *ls = container_of(kobj, struct gdlm_ls, kobj);
156 struct gdlm_attr *a = container_of(attr, struct gdlm_attr, attr);
157 return a->show ? a->show(ls, buf) : 0;
158}
159
160static ssize_t gdlm_attr_store(struct kobject *kobj, struct attribute *attr,
161 const char *buf, size_t len)
162{
163 struct gdlm_ls *ls = container_of(kobj, struct gdlm_ls, kobj);
164 struct gdlm_attr *a = container_of(attr, struct gdlm_attr, attr);
165 return a->store ? a->store(ls, buf, len) : len;
166}
167
168static struct sysfs_ops gdlm_attr_ops = {
169 .show = gdlm_attr_show,
170 .store = gdlm_attr_store,
171};
172
173static struct kobj_type gdlm_ktype = {
174 .default_attrs = gdlm_attrs,
175 .sysfs_ops = &gdlm_attr_ops,
176};
177
178static struct kset gdlm_kset = {
179 .subsys = &kernel_subsys,
180 .kobj = {.name = "lock_dlm",},
181 .ktype = &gdlm_ktype,
182};
183
184int gdlm_kobject_setup(struct gdlm_ls *ls, struct kobject *fskobj)
185{
186 int error;
187
188 error = kobject_set_name(&ls->kobj, "%s", "lock_module");
189 if (error) {
190 log_error("can't set kobj name %d", error);
191 return error;
192 }
193
194 ls->kobj.kset = &gdlm_kset;
195 ls->kobj.ktype = &gdlm_ktype;
196 ls->kobj.parent = fskobj;
197
198 error = kobject_register(&ls->kobj);
199 if (error)
200 log_error("can't register kobj %d", error);
201
202 return error;
203}
204
205void gdlm_kobject_release(struct gdlm_ls *ls)
206{
207 kobject_unregister(&ls->kobj);
208}
209
210int gdlm_sysfs_init(void)
211{
212 int error;
213
214 error = kset_register(&gdlm_kset);
215 if (error)
216 printk("lock_dlm: cannot register kset %d\n", error);
217
218 return error;
219}
220
221void gdlm_sysfs_exit(void)
222{
223 kset_unregister(&gdlm_kset);
224}
225
diff --git a/fs/gfs2/locking/dlm/thread.c b/fs/gfs2/locking/dlm/thread.c
new file mode 100644
index 000000000000..489235b2edba
--- /dev/null
+++ b/fs/gfs2/locking/dlm/thread.c
@@ -0,0 +1,359 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include "lock_dlm.h"
11
12/* A lock placed on this queue is re-submitted to DLM as soon as the lock_dlm
13 thread gets to it. */
14
15static void queue_submit(struct gdlm_lock *lp)
16{
17 struct gdlm_ls *ls = lp->ls;
18
19 spin_lock(&ls->async_lock);
20 list_add_tail(&lp->delay_list, &ls->submit);
21 spin_unlock(&ls->async_lock);
22 wake_up(&ls->thread_wait);
23}
24
25static void process_blocking(struct gdlm_lock *lp, int bast_mode)
26{
27 struct gdlm_ls *ls = lp->ls;
28 unsigned int cb = 0;
29
30 switch (gdlm_make_lmstate(bast_mode)) {
31 case LM_ST_EXCLUSIVE:
32 cb = LM_CB_NEED_E;
33 break;
34 case LM_ST_DEFERRED:
35 cb = LM_CB_NEED_D;
36 break;
37 case LM_ST_SHARED:
38 cb = LM_CB_NEED_S;
39 break;
40 default:
41 gdlm_assert(0, "unknown bast mode %u", lp->bast_mode);
42 }
43
44 ls->fscb(ls->fsdata, cb, &lp->lockname);
45}
46
47static void process_complete(struct gdlm_lock *lp)
48{
49 struct gdlm_ls *ls = lp->ls;
50 struct lm_async_cb acb;
51 int16_t prev_mode = lp->cur;
52
53 memset(&acb, 0, sizeof(acb));
54
55 if (lp->lksb.sb_status == -DLM_ECANCEL) {
56 log_info("complete dlm cancel %x,%llx flags %lx",
57 lp->lockname.ln_type,
58 (unsigned long long)lp->lockname.ln_number,
59 lp->flags);
60
61 lp->req = lp->cur;
62 acb.lc_ret |= LM_OUT_CANCELED;
63 if (lp->cur == DLM_LOCK_IV)
64 lp->lksb.sb_lkid = 0;
65 goto out;
66 }
67
68 if (test_and_clear_bit(LFL_DLM_UNLOCK, &lp->flags)) {
69 if (lp->lksb.sb_status != -DLM_EUNLOCK) {
70 log_info("unlock sb_status %d %x,%llx flags %lx",
71 lp->lksb.sb_status, lp->lockname.ln_type,
72 (unsigned long long)lp->lockname.ln_number,
73 lp->flags);
74 return;
75 }
76
77 lp->cur = DLM_LOCK_IV;
78 lp->req = DLM_LOCK_IV;
79 lp->lksb.sb_lkid = 0;
80
81 if (test_and_clear_bit(LFL_UNLOCK_DELETE, &lp->flags)) {
82 gdlm_delete_lp(lp);
83 return;
84 }
85 goto out;
86 }
87
88 if (lp->lksb.sb_flags & DLM_SBF_VALNOTVALID)
89 memset(lp->lksb.sb_lvbptr, 0, GDLM_LVB_SIZE);
90
91 if (lp->lksb.sb_flags & DLM_SBF_ALTMODE) {
92 if (lp->req == DLM_LOCK_PR)
93 lp->req = DLM_LOCK_CW;
94 else if (lp->req == DLM_LOCK_CW)
95 lp->req = DLM_LOCK_PR;
96 }
97
98 /*
99 * A canceled lock request. The lock was just taken off the delayed
100 * list and was never even submitted to dlm.
101 */
102
103 if (test_and_clear_bit(LFL_CANCEL, &lp->flags)) {
104 log_info("complete internal cancel %x,%llx",
105 lp->lockname.ln_type,
106 (unsigned long long)lp->lockname.ln_number);
107 lp->req = lp->cur;
108 acb.lc_ret |= LM_OUT_CANCELED;
109 goto out;
110 }
111
112 /*
113 * An error occured.
114 */
115
116 if (lp->lksb.sb_status) {
117 /* a "normal" error */
118 if ((lp->lksb.sb_status == -EAGAIN) &&
119 (lp->lkf & DLM_LKF_NOQUEUE)) {
120 lp->req = lp->cur;
121 if (lp->cur == DLM_LOCK_IV)
122 lp->lksb.sb_lkid = 0;
123 goto out;
124 }
125
126 /* this could only happen with cancels I think */
127 log_info("ast sb_status %d %x,%llx flags %lx",
128 lp->lksb.sb_status, lp->lockname.ln_type,
129 (unsigned long long)lp->lockname.ln_number,
130 lp->flags);
131 return;
132 }
133
134 /*
135 * This is an AST for an EX->EX conversion for sync_lvb from GFS.
136 */
137
138 if (test_and_clear_bit(LFL_SYNC_LVB, &lp->flags)) {
139 complete(&lp->ast_wait);
140 return;
141 }
142
143 /*
144 * A lock has been demoted to NL because it initially completed during
145 * BLOCK_LOCKS. Now it must be requested in the originally requested
146 * mode.
147 */
148
149 if (test_and_clear_bit(LFL_REREQUEST, &lp->flags)) {
150 gdlm_assert(lp->req == DLM_LOCK_NL, "%x,%llx",
151 lp->lockname.ln_type,
152 (unsigned long long)lp->lockname.ln_number);
153 gdlm_assert(lp->prev_req > DLM_LOCK_NL, "%x,%llx",
154 lp->lockname.ln_type,
155 (unsigned long long)lp->lockname.ln_number);
156
157 lp->cur = DLM_LOCK_NL;
158 lp->req = lp->prev_req;
159 lp->prev_req = DLM_LOCK_IV;
160 lp->lkf &= ~DLM_LKF_CONVDEADLK;
161
162 set_bit(LFL_NOCACHE, &lp->flags);
163
164 if (test_bit(DFL_BLOCK_LOCKS, &ls->flags) &&
165 !test_bit(LFL_NOBLOCK, &lp->flags))
166 gdlm_queue_delayed(lp);
167 else
168 queue_submit(lp);
169 return;
170 }
171
172 /*
173 * A request is granted during dlm recovery. It may be granted
174 * because the locks of a failed node were cleared. In that case,
175 * there may be inconsistent data beneath this lock and we must wait
176 * for recovery to complete to use it. When gfs recovery is done this
177 * granted lock will be converted to NL and then reacquired in this
178 * granted state.
179 */
180
181 if (test_bit(DFL_BLOCK_LOCKS, &ls->flags) &&
182 !test_bit(LFL_NOBLOCK, &lp->flags) &&
183 lp->req != DLM_LOCK_NL) {
184
185 lp->cur = lp->req;
186 lp->prev_req = lp->req;
187 lp->req = DLM_LOCK_NL;
188 lp->lkf |= DLM_LKF_CONVERT;
189 lp->lkf &= ~DLM_LKF_CONVDEADLK;
190
191 log_debug("rereq %x,%llx id %x %d,%d",
192 lp->lockname.ln_type,
193 (unsigned long long)lp->lockname.ln_number,
194 lp->lksb.sb_lkid, lp->cur, lp->req);
195
196 set_bit(LFL_REREQUEST, &lp->flags);
197 queue_submit(lp);
198 return;
199 }
200
201 /*
202 * DLM demoted the lock to NL before it was granted so GFS must be
203 * told it cannot cache data for this lock.
204 */
205
206 if (lp->lksb.sb_flags & DLM_SBF_DEMOTED)
207 set_bit(LFL_NOCACHE, &lp->flags);
208
209 out:
210 /*
211 * This is an internal lock_dlm lock
212 */
213
214 if (test_bit(LFL_INLOCK, &lp->flags)) {
215 clear_bit(LFL_NOBLOCK, &lp->flags);
216 lp->cur = lp->req;
217 complete(&lp->ast_wait);
218 return;
219 }
220
221 /*
222 * Normal completion of a lock request. Tell GFS it now has the lock.
223 */
224
225 clear_bit(LFL_NOBLOCK, &lp->flags);
226 lp->cur = lp->req;
227
228 acb.lc_name = lp->lockname;
229 acb.lc_ret |= gdlm_make_lmstate(lp->cur);
230
231 if (!test_and_clear_bit(LFL_NOCACHE, &lp->flags) &&
232 (lp->cur > DLM_LOCK_NL) && (prev_mode > DLM_LOCK_NL))
233 acb.lc_ret |= LM_OUT_CACHEABLE;
234
235 ls->fscb(ls->fsdata, LM_CB_ASYNC, &acb);
236}
237
238static inline int no_work(struct gdlm_ls *ls, int blocking)
239{
240 int ret;
241
242 spin_lock(&ls->async_lock);
243 ret = list_empty(&ls->complete) && list_empty(&ls->submit);
244 if (ret && blocking)
245 ret = list_empty(&ls->blocking);
246 spin_unlock(&ls->async_lock);
247
248 return ret;
249}
250
251static inline int check_drop(struct gdlm_ls *ls)
252{
253 if (!ls->drop_locks_count)
254 return 0;
255
256 if (time_after(jiffies, ls->drop_time + ls->drop_locks_period * HZ)) {
257 ls->drop_time = jiffies;
258 if (ls->all_locks_count >= ls->drop_locks_count)
259 return 1;
260 }
261 return 0;
262}
263
264static int gdlm_thread(void *data)
265{
266 struct gdlm_ls *ls = (struct gdlm_ls *) data;
267 struct gdlm_lock *lp = NULL;
268 int blist = 0;
269 uint8_t complete, blocking, submit, drop;
270 DECLARE_WAITQUEUE(wait, current);
271
272 /* Only thread1 is allowed to do blocking callbacks since gfs
273 may wait for a completion callback within a blocking cb. */
274
275 if (current == ls->thread1)
276 blist = 1;
277
278 while (!kthread_should_stop()) {
279 set_current_state(TASK_INTERRUPTIBLE);
280 add_wait_queue(&ls->thread_wait, &wait);
281 if (no_work(ls, blist))
282 schedule();
283 remove_wait_queue(&ls->thread_wait, &wait);
284 set_current_state(TASK_RUNNING);
285
286 complete = blocking = submit = drop = 0;
287
288 spin_lock(&ls->async_lock);
289
290 if (blist && !list_empty(&ls->blocking)) {
291 lp = list_entry(ls->blocking.next, struct gdlm_lock,
292 blist);
293 list_del_init(&lp->blist);
294 blocking = lp->bast_mode;
295 lp->bast_mode = 0;
296 } else if (!list_empty(&ls->complete)) {
297 lp = list_entry(ls->complete.next, struct gdlm_lock,
298 clist);
299 list_del_init(&lp->clist);
300 complete = 1;
301 } else if (!list_empty(&ls->submit)) {
302 lp = list_entry(ls->submit.next, struct gdlm_lock,
303 delay_list);
304 list_del_init(&lp->delay_list);
305 submit = 1;
306 }
307
308 drop = check_drop(ls);
309 spin_unlock(&ls->async_lock);
310
311 if (complete)
312 process_complete(lp);
313
314 else if (blocking)
315 process_blocking(lp, blocking);
316
317 else if (submit)
318 gdlm_do_lock(lp);
319
320 if (drop)
321 ls->fscb(ls->fsdata, LM_CB_DROPLOCKS, NULL);
322
323 schedule();
324 }
325
326 return 0;
327}
328
329int gdlm_init_threads(struct gdlm_ls *ls)
330{
331 struct task_struct *p;
332 int error;
333
334 p = kthread_run(gdlm_thread, ls, "lock_dlm1");
335 error = IS_ERR(p);
336 if (error) {
337 log_error("can't start lock_dlm1 thread %d", error);
338 return error;
339 }
340 ls->thread1 = p;
341
342 p = kthread_run(gdlm_thread, ls, "lock_dlm2");
343 error = IS_ERR(p);
344 if (error) {
345 log_error("can't start lock_dlm2 thread %d", error);
346 kthread_stop(ls->thread1);
347 return error;
348 }
349 ls->thread2 = p;
350
351 return 0;
352}
353
354void gdlm_release_threads(struct gdlm_ls *ls)
355{
356 kthread_stop(ls->thread1);
357 kthread_stop(ls->thread2);
358}
359
diff --git a/fs/gfs2/locking/nolock/Makefile b/fs/gfs2/locking/nolock/Makefile
new file mode 100644
index 000000000000..cdadf956c831
--- /dev/null
+++ b/fs/gfs2/locking/nolock/Makefile
@@ -0,0 +1,3 @@
1obj-$(CONFIG_GFS2_FS) += lock_nolock.o
2lock_nolock-y := main.o
3
diff --git a/fs/gfs2/locking/nolock/main.c b/fs/gfs2/locking/nolock/main.c
new file mode 100644
index 000000000000..97ffac5cdefb
--- /dev/null
+++ b/fs/gfs2/locking/nolock/main.c
@@ -0,0 +1,259 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/module.h>
11#include <linux/slab.h>
12#include <linux/module.h>
13#include <linux/init.h>
14#include <linux/types.h>
15#include <linux/fs.h>
16#include <linux/smp_lock.h>
17
18#include "../../lm_interface.h"
19
/* Per-mount state kept by this module; allocated in nolock_mount(). */
struct nolock_lockspace {
	/* number of bytes nolock_hold_lvb() allocates for each LVB */
	unsigned int nl_lvb_size;
};
23
24static struct lm_lockops nolock_ops;
25
26static int nolock_mount(char *table_name, char *host_data,
27 lm_callback_t cb, lm_fsdata_t *fsdata,
28 unsigned int min_lvb_size, int flags,
29 struct lm_lockstruct *lockstruct,
30 struct kobject *fskobj)
31{
32 char *c;
33 unsigned int jid;
34 struct nolock_lockspace *nl;
35
36 c = strstr(host_data, "jid=");
37 if (!c)
38 jid = 0;
39 else {
40 c += 4;
41 sscanf(c, "%u", &jid);
42 }
43
44 nl = kzalloc(sizeof(struct nolock_lockspace), GFP_KERNEL);
45 if (!nl)
46 return -ENOMEM;
47
48 nl->nl_lvb_size = min_lvb_size;
49
50 lockstruct->ls_jid = jid;
51 lockstruct->ls_first = 1;
52 lockstruct->ls_lvb_size = min_lvb_size;
53 lockstruct->ls_lockspace = (lm_lockspace_t *)nl;
54 lockstruct->ls_ops = &nolock_ops;
55 lockstruct->ls_flags = LM_LSFLAG_LOCAL;
56
57 return 0;
58}
59
60static void nolock_others_may_mount(lm_lockspace_t *lockspace)
61{
62}
63
64static void nolock_unmount(lm_lockspace_t *lockspace)
65{
66 struct nolock_lockspace *nl = (struct nolock_lockspace *)lockspace;
67 kfree(nl);
68}
69
70static void nolock_withdraw(lm_lockspace_t *lockspace)
71{
72}
73
74/**
75 * nolock_get_lock - get a lm_lock_t given a descripton of the lock
76 * @lockspace: the lockspace the lock lives in
77 * @name: the name of the lock
78 * @lockp: return the lm_lock_t here
79 *
80 * Returns: 0 on success, -EXXX on failure
81 */
82
83static int nolock_get_lock(lm_lockspace_t *lockspace, struct lm_lockname *name,
84 lm_lock_t **lockp)
85{
86 *lockp = (lm_lock_t *)lockspace;
87 return 0;
88}
89
90/**
91 * nolock_put_lock - get rid of a lock structure
92 * @lock: the lock to throw away
93 *
94 */
95
96static void nolock_put_lock(lm_lock_t *lock)
97{
98}
99
100/**
101 * nolock_lock - acquire a lock
102 * @lock: the lock to manipulate
103 * @cur_state: the current state
104 * @req_state: the requested state
105 * @flags: modifier flags
106 *
107 * Returns: A bitmap of LM_OUT_*
108 */
109
110static unsigned int nolock_lock(lm_lock_t *lock, unsigned int cur_state,
111 unsigned int req_state, unsigned int flags)
112{
113 return req_state | LM_OUT_CACHEABLE;
114}
115
116/**
117 * nolock_unlock - unlock a lock
118 * @lock: the lock to manipulate
119 * @cur_state: the current state
120 *
121 * Returns: 0
122 */
123
124static unsigned int nolock_unlock(lm_lock_t *lock, unsigned int cur_state)
125{
126 return 0;
127}
128
129static void nolock_cancel(lm_lock_t *lock)
130{
131}
132
133/**
134 * nolock_hold_lvb - hold on to a lock value block
135 * @lock: the lock the LVB is associated with
136 * @lvbp: return the lm_lvb_t here
137 *
138 * Returns: 0 on success, -EXXX on failure
139 */
140
141static int nolock_hold_lvb(lm_lock_t *lock, char **lvbp)
142{
143 struct nolock_lockspace *nl = (struct nolock_lockspace *)lock;
144 int error = 0;
145
146 *lvbp = kzalloc(nl->nl_lvb_size, GFP_KERNEL);
147 if (!*lvbp)
148 error = -ENOMEM;
149
150 return error;
151}
152
153/**
154 * nolock_unhold_lvb - release a LVB
155 * @lock: the lock the LVB is associated with
156 * @lvb: the lock value block
157 *
158 */
159
160static void nolock_unhold_lvb(lm_lock_t *lock, char *lvb)
161{
162 kfree(lvb);
163}
164
165/**
166 * nolock_sync_lvb - sync out the value of a lvb
167 * @lock: the lock the LVB is associated with
168 * @lvb: the lock value block
169 *
170 */
171
172static void nolock_sync_lvb(lm_lock_t *lock, char *lvb)
173{
174}
175
176static int nolock_plock_get(lm_lockspace_t *lockspace, struct lm_lockname *name,
177 struct file *file, struct file_lock *fl)
178{
179 struct file_lock tmp;
180 int ret;
181
182 ret = posix_test_lock(file, fl, &tmp);
183 fl->fl_type = F_UNLCK;
184 if (ret)
185 memcpy(fl, &tmp, sizeof(struct file_lock));
186
187 return 0;
188}
189
190static int nolock_plock(lm_lockspace_t *lockspace, struct lm_lockname *name,
191 struct file *file, int cmd, struct file_lock *fl)
192{
193 int error;
194 error = posix_lock_file_wait(file, fl);
195 return error;
196}
197
198static int nolock_punlock(lm_lockspace_t *lockspace, struct lm_lockname *name,
199 struct file *file, struct file_lock *fl)
200{
201 int error;
202 error = posix_lock_file_wait(file, fl);
203 return error;
204}
205
206static void nolock_recovery_done(lm_lockspace_t *lockspace, unsigned int jid,
207 unsigned int message)
208{
209}
210
211static struct lm_lockops nolock_ops = {
212 .lm_proto_name = "lock_nolock",
213 .lm_mount = nolock_mount,
214 .lm_others_may_mount = nolock_others_may_mount,
215 .lm_unmount = nolock_unmount,
216 .lm_withdraw = nolock_withdraw,
217 .lm_get_lock = nolock_get_lock,
218 .lm_put_lock = nolock_put_lock,
219 .lm_lock = nolock_lock,
220 .lm_unlock = nolock_unlock,
221 .lm_cancel = nolock_cancel,
222 .lm_hold_lvb = nolock_hold_lvb,
223 .lm_unhold_lvb = nolock_unhold_lvb,
224 .lm_sync_lvb = nolock_sync_lvb,
225 .lm_plock_get = nolock_plock_get,
226 .lm_plock = nolock_plock,
227 .lm_punlock = nolock_punlock,
228 .lm_recovery_done = nolock_recovery_done,
229 .lm_owner = THIS_MODULE,
230};
231
232static int __init init_nolock(void)
233{
234 int error;
235
236 error = gfs_register_lockproto(&nolock_ops);
237 if (error) {
238 printk(KERN_WARNING
239 "lock_nolock: can't register protocol: %d\n", error);
240 return error;
241 }
242
243 printk(KERN_INFO
244 "Lock_Nolock (built %s %s) installed\n", __DATE__, __TIME__);
245 return 0;
246}
247
248static void __exit exit_nolock(void)
249{
250 gfs_unregister_lockproto(&nolock_ops);
251}
252
253module_init(init_nolock);
254module_exit(exit_nolock);
255
256MODULE_DESCRIPTION("GFS Nolock Locking Module");
257MODULE_AUTHOR("Red Hat, Inc.");
258MODULE_LICENSE("GPL");
259
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
new file mode 100644
index 000000000000..60fdc94ccc8a
--- /dev/null
+++ b/fs/gfs2/log.c
@@ -0,0 +1,601 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/gfs2_ondisk.h>
16#include <linux/crc32.h>
17
18#include "gfs2.h"
19#include "lm_interface.h"
20#include "incore.h"
21#include "bmap.h"
22#include "glock.h"
23#include "log.h"
24#include "lops.h"
25#include "meta_io.h"
26#include "util.h"
27#include "dir.h"
28
29#define PULL 1
30
31/**
32 * gfs2_struct2blk - compute stuff
33 * @sdp: the filesystem
34 * @nstruct: the number of structures
35 * @ssize: the size of the structures
36 *
37 * Compute the number of log descriptor blocks needed to hold a certain number
38 * of structures of a certain size.
39 *
40 * Returns: the number of blocks needed (minimum is always 1)
41 */
42
43unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct,
44 unsigned int ssize)
45{
46 unsigned int blks;
47 unsigned int first, second;
48
49 blks = 1;
50 first = (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_log_descriptor)) /
51 ssize;
52
53 if (nstruct > first) {
54 second = (sdp->sd_sb.sb_bsize -
55 sizeof(struct gfs2_meta_header)) / ssize;
56 blks += DIV_ROUND_UP(nstruct - first, second);
57 }
58
59 return blks;
60}
61
/**
 * gfs2_ail1_start - call gfs2_ail1_start_one() on entries of the AIL1 list
 * @sdp: the filesystem
 * @flags: if DIO_ALL is set, keep going until every entry has been visited
 *
 * Runs with the log lock held.  Each call takes a new value from
 * sd_ail_sync_gen and stamps it on every entry it starts, so an entry is
 * started at most once per call.
 */
62void gfs2_ail1_start(struct gfs2_sbd *sdp, int flags)
63{
64 struct list_head *head = &sdp->sd_ail1_list;
65 uint64_t sync_gen;
66 struct list_head *first, *tmp;
67 struct gfs2_ail *first_ai, *ai;
68
69 gfs2_log_lock(sdp);
70 if (list_empty(head)) {
71 gfs2_log_unlock(sdp);
72 return;
73 }
 /* post-increment: this call uses the old value as its stamp */
74 sync_gen = sdp->sd_ail_sync_gen++;
75
 /* always start the entry at the tail of the list (head->prev) */
76 first = head->prev;
77 first_ai = list_entry(first, struct gfs2_ail, ai_list);
78 first_ai->ai_sync_gen = sync_gen;
79 gfs2_ail1_start_one(sdp, first_ai);
80
 /* with DIO_ALL the early-exit test on "first" below is disabled */
81 if (flags & DIO_ALL)
82 first = NULL;
83
84 for (;;) {
 /*
  * Without DIO_ALL: stop once the tail entry has changed or
  * gfs2_ail1_empty_one() returns non-zero for the entry we started.
  */
85 if (first && (head->prev != first ||
86 gfs2_ail1_empty_one(sdp, first_ai, 0)))
87 break;
88
 /*
  * Walk from the tail towards the head, start the first entry
  * not yet stamped by this call, then rescan from the tail.
  */
89 for (tmp = head->prev; tmp != head; tmp = tmp->prev) {
90 ai = list_entry(tmp, struct gfs2_ail, ai_list);
91 if (ai->ai_sync_gen >= sync_gen)
92 continue;
93 ai->ai_sync_gen = sync_gen;
94 gfs2_ail1_start_one(sdp, ai);
95 break;
96 }
97
 /* the walk reached the head: every entry carries this stamp */
98 if (tmp == head)
99 break;
100 }
101
102 gfs2_log_unlock(sdp);
103}
104
105int gfs2_ail1_empty(struct gfs2_sbd *sdp, int flags)
106{
107 struct gfs2_ail *ai, *s;
108 int ret;
109
110 gfs2_log_lock(sdp);
111
112 list_for_each_entry_safe_reverse(ai, s, &sdp->sd_ail1_list, ai_list) {
113 if (gfs2_ail1_empty_one(sdp, ai, flags))
114 list_move(&ai->ai_list, &sdp->sd_ail2_list);
115 else if (!(flags & DIO_ALL))
116 break;
117 }
118
119 ret = list_empty(&sdp->sd_ail1_list);
120
121 gfs2_log_unlock(sdp);
122
123 return ret;
124}
125
126static void ail2_empty(struct gfs2_sbd *sdp, unsigned int new_tail)
127{
128 struct gfs2_ail *ai, *safe;
129 unsigned int old_tail = sdp->sd_log_tail;
130 int wrap = (new_tail < old_tail);
131 int a, b, rm;
132
133 gfs2_log_lock(sdp);
134
135 list_for_each_entry_safe(ai, safe, &sdp->sd_ail2_list, ai_list) {
136 a = (old_tail <= ai->ai_first);
137 b = (ai->ai_first < new_tail);
138 rm = (wrap) ? (a || b) : (a && b);
139 if (!rm)
140 continue;
141
142 gfs2_ail2_empty_one(sdp, ai);
143 list_del(&ai->ai_list);
144 gfs2_assert_warn(sdp, list_empty(&ai->ai_ail1_list));
145 gfs2_assert_warn(sdp, list_empty(&ai->ai_ail2_list));
146 kfree(ai);
147 }
148
149 gfs2_log_unlock(sdp);
150}
151
152/**
153 * gfs2_log_reserve - Make a log reservation
154 * @sdp: The GFS2 superblock
155 * @blks: The number of blocks to reserve
156 *
157 * Returns: errno
158 */
159
160int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks)
161{
162 unsigned int try = 0;
163
164 if (gfs2_assert_warn(sdp, blks) ||
165 gfs2_assert_warn(sdp, blks <= sdp->sd_jdesc->jd_blocks))
166 return -EINVAL;
167
168 mutex_lock(&sdp->sd_log_reserve_mutex);
169 gfs2_log_lock(sdp);
170 while(sdp->sd_log_blks_free <= blks) {
171 gfs2_log_unlock(sdp);
172 gfs2_ail1_empty(sdp, 0);
173 gfs2_log_flush(sdp, NULL);
174
175 if (try++)
176 gfs2_ail1_start(sdp, 0);
177 gfs2_log_lock(sdp);
178 }
179 sdp->sd_log_blks_free -= blks;
180 /* printk(KERN_INFO "reserved %u blocks (%u left)\n", blks, sdp->sd_log_blks_free); */
181 gfs2_log_unlock(sdp);
182 mutex_unlock(&sdp->sd_log_reserve_mutex);
183
184 down_read(&sdp->sd_log_flush_lock);
185
186 return 0;
187}
188
189/**
190 * gfs2_log_release - Release a given number of log blocks
191 * @sdp: The GFS2 superblock
192 * @blks: The number of blocks
193 *
194 */
195
196void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks)
197{
198
199 gfs2_log_lock(sdp);
200 sdp->sd_log_blks_free += blks;
201 /* printk(KERN_INFO "released %u blocks (%u left)\n", blks, sdp->sd_log_blks_free); */
202 gfs2_assert_withdraw(sdp,
203 sdp->sd_log_blks_free <= sdp->sd_jdesc->jd_blocks);
204 gfs2_log_unlock(sdp);
205 up_read(&sdp->sd_log_flush_lock);
206}
207
208static uint64_t log_bmap(struct gfs2_sbd *sdp, unsigned int lbn)
209{
210 int new = 0;
211 uint64_t dbn;
212 int error;
213 int bdy;
214
215 error = gfs2_block_map(sdp->sd_jdesc->jd_inode, lbn, &new, &dbn, &bdy);
216 if (!(!error && dbn)) {
217 printk(KERN_INFO "error=%d, dbn=%llu lbn=%u", error, (unsigned long long)dbn, lbn);
218 }
219 gfs2_assert_withdraw(sdp, !error && dbn);
220
221 return dbn;
222}
223
224/**
225 * log_distance - Compute distance between two journal blocks
226 * @sdp: The GFS2 superblock
227 * @newer: The most recent journal block of the pair
228 * @older: The older journal block of the pair
229 *
230 * Compute the distance (in the journal direction) between two
231 * blocks in the journal
232 *
233 * Returns: the distance in blocks
234 */
235
236static inline unsigned int log_distance(struct gfs2_sbd *sdp,
237 unsigned int newer,
238 unsigned int older)
239{
240 int dist;
241
242 dist = newer - older;
243 if (dist < 0)
244 dist += sdp->sd_jdesc->jd_blocks;
245
246 return dist;
247}
248
249static unsigned int current_tail(struct gfs2_sbd *sdp)
250{
251 struct gfs2_ail *ai;
252 unsigned int tail;
253
254 gfs2_log_lock(sdp);
255
256 if (list_empty(&sdp->sd_ail1_list))
257 tail = sdp->sd_log_head;
258 else {
259 ai = list_entry(sdp->sd_ail1_list.prev,
260 struct gfs2_ail, ai_list);
261 tail = ai->ai_first;
262 }
263
264 gfs2_log_unlock(sdp);
265
266 return tail;
267}
268
269static inline void log_incr_head(struct gfs2_sbd *sdp)
270{
271 if (sdp->sd_log_flush_head == sdp->sd_log_tail)
272 gfs2_assert_withdraw(sdp,
273 sdp->sd_log_flush_head == sdp->sd_log_head);
274
275 if (++sdp->sd_log_flush_head == sdp->sd_jdesc->jd_blocks) {
276 sdp->sd_log_flush_head = 0;
277 sdp->sd_log_flush_wrapped = 1;
278 }
279}
280
281/**
282 * gfs2_log_get_buf - Get and initialize a buffer to use for log control data
283 * @sdp: The GFS2 superblock
284 *
285 * Returns: the buffer_head
286 */
287
288struct buffer_head *gfs2_log_get_buf(struct gfs2_sbd *sdp)
289{
290 uint64_t blkno = log_bmap(sdp, sdp->sd_log_flush_head);
291 struct gfs2_log_buf *lb;
292 struct buffer_head *bh;
293
294 lb = kzalloc(sizeof(struct gfs2_log_buf), GFP_NOFS | __GFP_NOFAIL);
295 list_add(&lb->lb_list, &sdp->sd_log_flush_list);
296
297 bh = lb->lb_bh = sb_getblk(sdp->sd_vfs, blkno);
298 lock_buffer(bh);
299 memset(bh->b_data, 0, bh->b_size);
300 set_buffer_uptodate(bh);
301 clear_buffer_dirty(bh);
302 unlock_buffer(bh);
303
304 log_incr_head(sdp);
305
306 return bh;
307}
308
309/**
310 * gfs2_log_fake_buf - Build a fake buffer head to write metadata buffer to log
311 * @sdp: the filesystem
312 * @data: the data the buffer_head should point to
313 *
314 * Returns: the log buffer descriptor
315 */
316
317struct buffer_head *gfs2_log_fake_buf(struct gfs2_sbd *sdp,
318 struct buffer_head *real)
319{
320 uint64_t blkno = log_bmap(sdp, sdp->sd_log_flush_head);
321 struct gfs2_log_buf *lb;
322 struct buffer_head *bh;
323
324 lb = kzalloc(sizeof(struct gfs2_log_buf), GFP_NOFS | __GFP_NOFAIL);
325 list_add(&lb->lb_list, &sdp->sd_log_flush_list);
326 lb->lb_real = real;
327
328 bh = lb->lb_bh = alloc_buffer_head(GFP_NOFS | __GFP_NOFAIL);
329 atomic_set(&bh->b_count, 1);
330 bh->b_state = (1 << BH_Mapped) | (1 << BH_Uptodate);
331 set_bh_page(bh, real->b_page, bh_offset(real));
332 bh->b_blocknr = blkno;
333 bh->b_size = sdp->sd_sb.sb_bsize;
334 bh->b_bdev = sdp->sd_vfs->s_bdev;
335
336 log_incr_head(sdp);
337
338 return bh;
339}
340
341static void log_pull_tail(struct gfs2_sbd *sdp, unsigned int new_tail, int pull)
342{
343 unsigned int dist = log_distance(sdp, new_tail, sdp->sd_log_tail);
344
345 ail2_empty(sdp, new_tail);
346
347 gfs2_log_lock(sdp);
348 sdp->sd_log_blks_free += dist - ((pull) ? 1 : 0);
349 /* printk(KERN_INFO "pull tail refunding %u blocks (%u left) pull=%d\n", dist - ((pull) ? 1 : 0), sdp->sd_log_blks_free, pull); */
350 gfs2_assert_withdraw(sdp,
351 sdp->sd_log_blks_free <= sdp->sd_jdesc->jd_blocks);
352 gfs2_log_unlock(sdp);
353
354 sdp->sd_log_tail = new_tail;
355}
356
357/**
358 * log_write_header - Get and initialize a journal header buffer
359 * @sdp: The GFS2 superblock
360 *
361 * Returns: the initialized log buffer descriptor
362 */
363
364static void log_write_header(struct gfs2_sbd *sdp, uint32_t flags, int pull)
365{
366 uint64_t blkno = log_bmap(sdp, sdp->sd_log_flush_head);
367 struct buffer_head *bh;
368 struct gfs2_log_header *lh;
369 unsigned int tail;
370 uint32_t hash;
371
372 /* printk(KERN_INFO "log write header start (flags=%08x, pull=%d)\n", flags, pull); */
373
374 bh = sb_getblk(sdp->sd_vfs, blkno);
375 lock_buffer(bh);
376 memset(bh->b_data, 0, bh->b_size);
377 set_buffer_uptodate(bh);
378 clear_buffer_dirty(bh);
379 unlock_buffer(bh);
380
381 gfs2_ail1_empty(sdp, 0);
382 tail = current_tail(sdp);
383
384 lh = (struct gfs2_log_header *)bh->b_data;
385 memset(lh, 0, sizeof(struct gfs2_log_header));
386 lh->lh_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
387 lh->lh_header.mh_type = cpu_to_be32(GFS2_METATYPE_LH);
388 lh->lh_header.mh_format = cpu_to_be32(GFS2_FORMAT_LH);
389 lh->lh_sequence = cpu_to_be64(sdp->sd_log_sequence++);
390 lh->lh_flags = cpu_to_be32(flags);
391 lh->lh_tail = cpu_to_be32(tail);
392 lh->lh_blkno = cpu_to_be32(sdp->sd_log_flush_head);
393 hash = gfs2_disk_hash(bh->b_data, sizeof(struct gfs2_log_header));
394 lh->lh_hash = cpu_to_be32(hash);
395
396 set_buffer_dirty(bh);
397 if (sync_dirty_buffer(bh))
398 gfs2_io_error_bh(sdp, bh);
399 brelse(bh);
400
401 if (sdp->sd_log_tail != tail)
402 log_pull_tail(sdp, tail, pull);
403 else
404 gfs2_assert_withdraw(sdp, !pull);
405
406 sdp->sd_log_idle = (tail == sdp->sd_log_flush_head);
407 log_incr_head(sdp);
408
409 /* printk(KERN_INFO "log write header out\n"); */
410}
411
412static void log_flush_commit(struct gfs2_sbd *sdp)
413{
414 struct list_head *head = &sdp->sd_log_flush_list;
415 struct gfs2_log_buf *lb;
416 struct buffer_head *bh;
417#if 0
418 unsigned int d;
419
420 d = log_distance(sdp, sdp->sd_log_flush_head, sdp->sd_log_head);
421
422 gfs2_assert_withdraw(sdp, d + 1 == sdp->sd_log_blks_reserved);
423#endif
424
425 while (!list_empty(head)) {
426 lb = list_entry(head->next, struct gfs2_log_buf, lb_list);
427 list_del(&lb->lb_list);
428 bh = lb->lb_bh;
429
430 wait_on_buffer(bh);
431 if (!buffer_uptodate(bh))
432 gfs2_io_error_bh(sdp, bh);
433 if (lb->lb_real) {
434 while (atomic_read(&bh->b_count) != 1) /* Grrrr... */
435 schedule();
436 free_buffer_head(bh);
437 } else
438 brelse(bh);
439 kfree(lb);
440 }
441
442 log_write_header(sdp, 0, 0);
443}
444
445/**
446 * gfs2_log_flush - flush incore transaction(s)
447 * @sdp: the filesystem
448 * @gl: The glock structure to flush. If NULL, flush the whole incore log
449 *
450 */
451
452void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
453{
454 struct gfs2_ail *ai;
455
456 down_write(&sdp->sd_log_flush_lock);
457
458 if (gl) {
459 gfs2_log_lock(sdp);
460 if (list_empty(&gl->gl_le.le_list)) {
461 gfs2_log_unlock(sdp);
462 up_write(&sdp->sd_log_flush_lock);
463 return;
464 }
465 gfs2_log_unlock(sdp);
466 }
467
468 ai = kzalloc(sizeof(struct gfs2_ail), GFP_NOFS | __GFP_NOFAIL);
469 INIT_LIST_HEAD(&ai->ai_ail1_list);
470 INIT_LIST_HEAD(&ai->ai_ail2_list);
471
472 gfs2_assert_withdraw(sdp,
473 sdp->sd_log_num_buf == sdp->sd_log_commited_buf);
474 gfs2_assert_withdraw(sdp,
475 sdp->sd_log_num_revoke == sdp->sd_log_commited_revoke);
476
477 sdp->sd_log_flush_head = sdp->sd_log_head;
478 sdp->sd_log_flush_wrapped = 0;
479 ai->ai_first = sdp->sd_log_flush_head;
480
481 lops_before_commit(sdp);
482 if (!list_empty(&sdp->sd_log_flush_list))
483 log_flush_commit(sdp);
484 else if (sdp->sd_log_tail != current_tail(sdp) && !sdp->sd_log_idle)
485 log_write_header(sdp, 0, PULL);
486 lops_after_commit(sdp, ai);
487 sdp->sd_log_head = sdp->sd_log_flush_head;
488
489 /* printk(KERN_INFO "sd_log_num_hdrs %u\n", sdp->sd_log_num_hdrs); */
490 sdp->sd_log_blks_free -= sdp->sd_log_num_hdrs;
491
492 sdp->sd_log_blks_reserved =
493 sdp->sd_log_commited_buf =
494 sdp->sd_log_num_hdrs =
495 sdp->sd_log_commited_revoke = 0;
496
497 gfs2_log_lock(sdp);
498 if (!list_empty(&ai->ai_ail1_list)) {
499 list_add(&ai->ai_list, &sdp->sd_ail1_list);
500 ai = NULL;
501 }
502 gfs2_log_unlock(sdp);
503
504 sdp->sd_vfs->s_dirt = 0;
505 up_write(&sdp->sd_log_flush_lock);
506
507 kfree(ai);
508}
509
510static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
511{
512 unsigned int reserved = 1;
513 unsigned int old;
514
515 gfs2_log_lock(sdp);
516
517 sdp->sd_log_commited_buf += tr->tr_num_buf_new - tr->tr_num_buf_rm;
518 gfs2_assert_withdraw(sdp, ((int)sdp->sd_log_commited_buf) >= 0);
519 sdp->sd_log_commited_revoke += tr->tr_num_revoke - tr->tr_num_revoke_rm;
520 gfs2_assert_withdraw(sdp, ((int)sdp->sd_log_commited_revoke) >= 0);
521
522 if (sdp->sd_log_commited_buf)
523 reserved += sdp->sd_log_commited_buf;
524 if (sdp->sd_log_commited_revoke)
525 reserved += gfs2_struct2blk(sdp, sdp->sd_log_commited_revoke,
526 sizeof(uint64_t));
527
528 old = sdp->sd_log_blks_free;
529 sdp->sd_log_blks_free += tr->tr_reserved -
530 (reserved - sdp->sd_log_blks_reserved);
531
532 gfs2_assert_withdraw(sdp, sdp->sd_log_blks_free >= old);
533 gfs2_assert_withdraw(sdp,
534 sdp->sd_log_blks_free <= sdp->sd_jdesc->jd_blocks +
535 sdp->sd_log_num_hdrs);
536
537 sdp->sd_log_blks_reserved = reserved;
538
539 gfs2_log_unlock(sdp);
540}
541
542/**
543 * gfs2_log_commit - Commit a transaction to the log
544 * @sdp: the filesystem
545 * @tr: the transaction
546 *
547 * Returns: errno
548 */
549
550void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
551{
552 log_refund(sdp, tr);
553 lops_incore_commit(sdp, tr);
554
555 sdp->sd_vfs->s_dirt = 1;
556 up_read(&sdp->sd_log_flush_lock);
557
558 gfs2_log_lock(sdp);
559 if (sdp->sd_log_num_buf > gfs2_tune_get(sdp, gt_incore_log_blocks)) {
560 gfs2_log_unlock(sdp);
561 gfs2_log_flush(sdp, NULL);
562 } else
563 gfs2_log_unlock(sdp);
564}
565
566/**
567 * gfs2_log_shutdown - write a shutdown header into a journal
568 * @sdp: the filesystem
569 *
570 */
571
572void gfs2_log_shutdown(struct gfs2_sbd *sdp)
573{
574 down_write(&sdp->sd_log_flush_lock);
575
576 gfs2_assert_withdraw(sdp, !sdp->sd_log_blks_reserved);
577 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_gl);
578 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_buf);
579 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_jdata);
580 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke);
581 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_rg);
582 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_databuf);
583 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_hdrs);
584 gfs2_assert_withdraw(sdp, list_empty(&sdp->sd_ail1_list));
585
586 sdp->sd_log_flush_head = sdp->sd_log_head;
587 sdp->sd_log_flush_wrapped = 0;
588
589 log_write_header(sdp, GFS2_LOG_HEAD_UNMOUNT, 0);
590
591 /* printk(KERN_INFO "sd_log_blks_free %u, sd_jdesc->jd_blocks %u\n", sdp->sd_log_blks_free, sdp->sd_jdesc->jd_blocks); */
592 gfs2_assert_warn(sdp, sdp->sd_log_blks_free == sdp->sd_jdesc->jd_blocks);
593 gfs2_assert_warn(sdp, sdp->sd_log_head == sdp->sd_log_tail);
594 gfs2_assert_warn(sdp, list_empty(&sdp->sd_ail2_list));
595
596 sdp->sd_log_head = sdp->sd_log_flush_head;
597 sdp->sd_log_tail = sdp->sd_log_head;
598
599 up_write(&sdp->sd_log_flush_lock);
600}
601
diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h
new file mode 100644
index 000000000000..8cfd0f1d29f8
--- /dev/null
+++ b/fs/gfs2/log.h
@@ -0,0 +1,61 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __LOG_DOT_H__
11#define __LOG_DOT_H__
12
13/**
14 * gfs2_log_lock - acquire the right to mess with the log manager
15 * @sdp: the filesystem
16 *
17 */
18
19static inline void gfs2_log_lock(struct gfs2_sbd *sdp)
20{
21 spin_lock(&sdp->sd_log_lock);
22}
23
24/**
25 * gfs2_log_unlock - release the right to mess with the log manager
26 * @sdp: the filesystem
27 *
28 */
29
30static inline void gfs2_log_unlock(struct gfs2_sbd *sdp)
31{
32 spin_unlock(&sdp->sd_log_lock);
33}
34
35static inline void gfs2_log_pointers_init(struct gfs2_sbd *sdp,
36 unsigned int value)
37{
38 if (++value == sdp->sd_jdesc->jd_blocks) {
39 value = 0;
40 }
41 sdp->sd_log_head = sdp->sd_log_tail = value;
42}
43
44unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct,
45 unsigned int ssize);
46
47void gfs2_ail1_start(struct gfs2_sbd *sdp, int flags);
48int gfs2_ail1_empty(struct gfs2_sbd *sdp, int flags);
49
50int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks);
51void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks);
52
53struct buffer_head *gfs2_log_get_buf(struct gfs2_sbd *sdp);
54struct buffer_head *gfs2_log_fake_buf(struct gfs2_sbd *sdp,
55 struct buffer_head *real);
56void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl);
57void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *trans);
58
59void gfs2_log_shutdown(struct gfs2_sbd *sdp);
60
61#endif /* __LOG_DOT_H__ */
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
new file mode 100644
index 000000000000..af03bf380f46
--- /dev/null
+++ b/fs/gfs2/lops.c
@@ -0,0 +1,800 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/gfs2_ondisk.h>
16
17#include "gfs2.h"
18#include "lm_interface.h"
19#include "incore.h"
20#include "glock.h"
21#include "log.h"
22#include "lops.h"
23#include "meta_io.h"
24#include "recovery.h"
25#include "rgrp.h"
26#include "trans.h"
27#include "util.h"
28
29static void glock_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
30{
31 struct gfs2_glock *gl;
32 struct gfs2_trans *tr = current->journal_info;
33
34 tr->tr_touched = 1;
35
36 if (!list_empty(&le->le_list))
37 return;
38
39 gl = container_of(le, struct gfs2_glock, gl_le);
40 if (gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(gl)))
41 return;
42 gfs2_glock_hold(gl);
43 set_bit(GLF_DIRTY, &gl->gl_flags);
44
45 gfs2_log_lock(sdp);
46 sdp->sd_log_num_gl++;
47 list_add(&le->le_list, &sdp->sd_log_le_gl);
48 gfs2_log_unlock(sdp);
49}
50
51static void glock_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
52{
53 struct list_head *head = &sdp->sd_log_le_gl;
54 struct gfs2_glock *gl;
55
56 while (!list_empty(head)) {
57 gl = list_entry(head->next, struct gfs2_glock, gl_le.le_list);
58 list_del_init(&gl->gl_le.le_list);
59 sdp->sd_log_num_gl--;
60
61 gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(gl));
62 gfs2_glock_put(gl);
63 }
64 gfs2_assert_warn(sdp, !sdp->sd_log_num_gl);
65}
66
67static void buf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
68{
69 struct gfs2_bufdata *bd = container_of(le, struct gfs2_bufdata, bd_le);
70 struct gfs2_trans *tr;
71
72 if (!list_empty(&bd->bd_list_tr))
73 return;
74
75 tr = current->journal_info;
76 tr->tr_touched = 1;
77 tr->tr_num_buf++;
78 list_add(&bd->bd_list_tr, &tr->tr_list_buf);
79
80 if (!list_empty(&le->le_list))
81 return;
82
83 gfs2_trans_add_gl(bd->bd_gl);
84
85 gfs2_meta_check(sdp, bd->bd_bh);
86 gfs2_pin(sdp, bd->bd_bh);
87
88 gfs2_log_lock(sdp);
89 sdp->sd_log_num_buf++;
90 list_add(&le->le_list, &sdp->sd_log_le_buf);
91 gfs2_log_unlock(sdp);
92
93 tr->tr_num_buf_new++;
94}
95
96static void buf_lo_incore_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
97{
98 struct list_head *head = &tr->tr_list_buf;
99 struct gfs2_bufdata *bd;
100
101 while (!list_empty(head)) {
102 bd = list_entry(head->next, struct gfs2_bufdata, bd_list_tr);
103 list_del_init(&bd->bd_list_tr);
104 tr->tr_num_buf--;
105 }
106 gfs2_assert_warn(sdp, !tr->tr_num_buf);
107}
108
/*
 * buf_lo_before_commit - submit the logged metadata buffers to the journal
 * @sdp: the filesystem
 *
 * The sd_log_num_buf entries on sd_log_le_buf are written in batches.  Each
 * batch is one descriptor block listing the buffers' block numbers, followed
 * by one journal block per buffer (built with gfs2_log_fake_buf()).  Every
 * descriptor block is counted in sd_log_num_hdrs.
 */
109static void buf_lo_before_commit(struct gfs2_sbd *sdp)
110{
111 struct buffer_head *bh;
112 struct gfs2_log_descriptor *ld;
113 struct gfs2_bufdata *bd1 = NULL, *bd2;
114 unsigned int total = sdp->sd_log_num_buf;
115 unsigned int offset = sizeof(struct gfs2_log_descriptor);
116 unsigned int limit;
117 unsigned int num;
118 unsigned n;
119 __be64 *ptr;
120
 /* round the descriptor size up to a multiple of sizeof(__be64) */
121 offset += (sizeof(__be64) - 1);
122 offset &= ~(sizeof(__be64) - 1);
 /* block numbers that fit in one descriptor block after the header */
123 limit = (sdp->sd_sb.sb_bsize - offset)/sizeof(__be64);
124 /* for 4k blocks, limit = 503 */
125
 /*
  * bd1 and bd2 are two cursors over the same list: bd1 feeds the block
  * numbers into the descriptor, bd2 then submits the same entries.
  */
126 bd1 = bd2 = list_prepare_entry(bd1, &sdp->sd_log_le_buf, bd_le.le_list);
127 while(total) {
 /* size of this batch: whatever is left, capped at limit */
128 num = total;
129 if (total > limit)
130 num = limit;
131 bh = gfs2_log_get_buf(sdp);
132 sdp->sd_log_num_hdrs++;
133 ld = (struct gfs2_log_descriptor *)bh->b_data;
134 ptr = (__be64 *)(bh->b_data + offset);
135 ld->ld_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
136 ld->ld_header.mh_type = cpu_to_be32(GFS2_METATYPE_LD);
137 ld->ld_header.mh_format = cpu_to_be32(GFS2_FORMAT_LD);
138 ld->ld_type = cpu_to_be32(GFS2_LOG_DESC_METADATA);
 /* length counts the descriptor block itself plus num buffers */
139 ld->ld_length = cpu_to_be32(num + 1);
140 ld->ld_data1 = cpu_to_be32(num);
141 ld->ld_data2 = cpu_to_be32(0);
142 memset(ld->ld_reserved, 0, sizeof(ld->ld_reserved));
143
 /* first pass: record the next num block numbers */
144 n = 0;
145 list_for_each_entry_continue(bd1, &sdp->sd_log_le_buf,
146 bd_le.le_list) {
147 *ptr++ = cpu_to_be64(bd1->bd_bh->b_blocknr);
148 if (++n >= num)
149 break;
150 }
151
152 set_buffer_dirty(bh);
153 ll_rw_block(WRITE, 1, &bh);
154
 /* second pass: submit a journal copy of each of those buffers */
155 n = 0;
156 list_for_each_entry_continue(bd2, &sdp->sd_log_le_buf,
157 bd_le.le_list) {
158 bh = gfs2_log_fake_buf(sdp, bd2->bd_bh);
159 set_buffer_dirty(bh);
160 ll_rw_block(WRITE, 1, &bh);
161 if (++n >= num)
162 break;
163 }
164
165 total -= num;
166 }
167}
168
169static void buf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
170{
171 struct list_head *head = &sdp->sd_log_le_buf;
172 struct gfs2_bufdata *bd;
173
174 while (!list_empty(head)) {
175 bd = list_entry(head->next, struct gfs2_bufdata, bd_le.le_list);
176 list_del_init(&bd->bd_le.le_list);
177 sdp->sd_log_num_buf--;
178
179 gfs2_unpin(sdp, bd->bd_bh, ai);
180 }
181 gfs2_assert_warn(sdp, !sdp->sd_log_num_buf);
182}
183
184static void buf_lo_before_scan(struct gfs2_jdesc *jd,
185 struct gfs2_log_header *head, int pass)
186{
187 struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
188
189 if (pass != 0)
190 return;
191
192 sdp->sd_found_blocks = 0;
193 sdp->sd_replayed_blocks = 0;
194}
195
196static int buf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
197 struct gfs2_log_descriptor *ld, __be64 *ptr,
198 int pass)
199{
200 struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
201 struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
202 struct gfs2_glock *gl = ip->i_gl;
203 unsigned int blks = be32_to_cpu(ld->ld_data1);
204 struct buffer_head *bh_log, *bh_ip;
205 uint64_t blkno;
206 int error = 0;
207
208 if (pass != 1 || be32_to_cpu(ld->ld_type) != GFS2_LOG_DESC_METADATA)
209 return 0;
210
211 gfs2_replay_incr_blk(sdp, &start);
212
213 for (; blks; gfs2_replay_incr_blk(sdp, &start), blks--) {
214 blkno = be64_to_cpu(*ptr++);
215
216 sdp->sd_found_blocks++;
217
218 if (gfs2_revoke_check(sdp, blkno, start))
219 continue;
220
221 error = gfs2_replay_read_block(jd, start, &bh_log);
222 if (error)
223 return error;
224
225 bh_ip = gfs2_meta_new(gl, blkno);
226 memcpy(bh_ip->b_data, bh_log->b_data, bh_log->b_size);
227
228 if (gfs2_meta_check(sdp, bh_ip))
229 error = -EIO;
230 else
231 mark_buffer_dirty(bh_ip);
232
233 brelse(bh_log);
234 brelse(bh_ip);
235
236 if (error)
237 break;
238
239 sdp->sd_replayed_blocks++;
240 }
241
242 return error;
243}
244
245static void buf_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass)
246{
247 struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
248 struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
249
250 if (error) {
251 gfs2_meta_sync(ip->i_gl,
252 DIO_START | DIO_WAIT);
253 return;
254 }
255 if (pass != 1)
256 return;
257
258 gfs2_meta_sync(ip->i_gl, DIO_START | DIO_WAIT);
259
260 fs_info(sdp, "jid=%u: Replayed %u of %u blocks\n",
261 jd->jd_jid, sdp->sd_replayed_blocks, sdp->sd_found_blocks);
262}
263
264static void revoke_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
265{
266 struct gfs2_trans *tr;
267
268 tr = current->journal_info;
269 tr->tr_touched = 1;
270 tr->tr_num_revoke++;
271
272 gfs2_log_lock(sdp);
273 sdp->sd_log_num_revoke++;
274 list_add(&le->le_list, &sdp->sd_log_le_revoke);
275 gfs2_log_unlock(sdp);
276}
277
278static void revoke_lo_before_commit(struct gfs2_sbd *sdp)
279{
280 struct gfs2_log_descriptor *ld;
281 struct gfs2_meta_header *mh;
282 struct buffer_head *bh;
283 unsigned int offset;
284 struct list_head *head = &sdp->sd_log_le_revoke;
285 struct gfs2_revoke *rv;
286
287 if (!sdp->sd_log_num_revoke)
288 return;
289
290 bh = gfs2_log_get_buf(sdp);
291 ld = (struct gfs2_log_descriptor *)bh->b_data;
292 ld->ld_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
293 ld->ld_header.mh_type = cpu_to_be32(GFS2_METATYPE_LD);
294 ld->ld_header.mh_format = cpu_to_be32(GFS2_FORMAT_LD);
295 ld->ld_type = cpu_to_be32(GFS2_LOG_DESC_REVOKE);
296 ld->ld_length = cpu_to_be32(gfs2_struct2blk(sdp, sdp->sd_log_num_revoke,
297 sizeof(uint64_t)));
298 ld->ld_data1 = cpu_to_be32(sdp->sd_log_num_revoke);
299 ld->ld_data2 = cpu_to_be32(0);
300 memset(ld->ld_reserved, 0, sizeof(ld->ld_reserved));
301 offset = sizeof(struct gfs2_log_descriptor);
302
303 while (!list_empty(head)) {
304 rv = list_entry(head->next, struct gfs2_revoke, rv_le.le_list);
305 list_del_init(&rv->rv_le.le_list);
306 sdp->sd_log_num_revoke--;
307
308 if (offset + sizeof(uint64_t) > sdp->sd_sb.sb_bsize) {
309 set_buffer_dirty(bh);
310 ll_rw_block(WRITE, 1, &bh);
311
312 bh = gfs2_log_get_buf(sdp);
313 mh = (struct gfs2_meta_header *)bh->b_data;
314 mh->mh_magic = cpu_to_be32(GFS2_MAGIC);
315 mh->mh_type = cpu_to_be32(GFS2_METATYPE_LB);
316 mh->mh_format = cpu_to_be32(GFS2_FORMAT_LB);
317 offset = sizeof(struct gfs2_meta_header);
318 }
319
320 *(__be64 *)(bh->b_data + offset) = cpu_to_be64(rv->rv_blkno);
321 kfree(rv);
322
323 offset += sizeof(uint64_t);
324 }
325 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke);
326
327 set_buffer_dirty(bh);
328 ll_rw_block(WRITE, 1, &bh);
329}
330
331static void revoke_lo_before_scan(struct gfs2_jdesc *jd,
332 struct gfs2_log_header *head, int pass)
333{
334 struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
335
336 if (pass != 0)
337 return;
338
339 sdp->sd_found_revokes = 0;
340 sdp->sd_replay_tail = head->lh_tail;
341}
342
343static int revoke_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
344 struct gfs2_log_descriptor *ld, __be64 *ptr,
345 int pass)
346{
347 struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
348 unsigned int blks = be32_to_cpu(ld->ld_length);
349 unsigned int revokes = be32_to_cpu(ld->ld_data1);
350 struct buffer_head *bh;
351 unsigned int offset;
352 uint64_t blkno;
353 int first = 1;
354 int error;
355
356 if (pass != 0 || be32_to_cpu(ld->ld_type) != GFS2_LOG_DESC_REVOKE)
357 return 0;
358
359 offset = sizeof(struct gfs2_log_descriptor);
360
361 for (; blks; gfs2_replay_incr_blk(sdp, &start), blks--) {
362 error = gfs2_replay_read_block(jd, start, &bh);
363 if (error)
364 return error;
365
366 if (!first)
367 gfs2_metatype_check(sdp, bh, GFS2_METATYPE_LB);
368
369 while (offset + sizeof(uint64_t) <= sdp->sd_sb.sb_bsize) {
370 blkno = be64_to_cpu(*(__be64 *)(bh->b_data + offset));
371
372 error = gfs2_revoke_add(sdp, blkno, start);
373 if (error < 0)
374 return error;
375 else if (error)
376 sdp->sd_found_revokes++;
377
378 if (!--revokes)
379 break;
380 offset += sizeof(uint64_t);
381 }
382
383 brelse(bh);
384 offset = sizeof(struct gfs2_meta_header);
385 first = 0;
386 }
387
388 return 0;
389}
390
391static void revoke_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass)
392{
393 struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
394
395 if (error) {
396 gfs2_revoke_clean(sdp);
397 return;
398 }
399 if (pass != 1)
400 return;
401
402 fs_info(sdp, "jid=%u: Found %u revoke tags\n",
403 jd->jd_jid, sdp->sd_found_revokes);
404
405 gfs2_revoke_clean(sdp);
406}
407
408static void rg_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
409{
410 struct gfs2_rgrpd *rgd;
411 struct gfs2_trans *tr = current->journal_info;
412
413 tr->tr_touched = 1;
414
415 if (!list_empty(&le->le_list))
416 return;
417
418 rgd = container_of(le, struct gfs2_rgrpd, rd_le);
419 gfs2_rgrp_bh_hold(rgd);
420
421 gfs2_log_lock(sdp);
422 sdp->sd_log_num_rg++;
423 list_add(&le->le_list, &sdp->sd_log_le_rg);
424 gfs2_log_unlock(sdp);
425}
426
427static void rg_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
428{
429 struct list_head *head = &sdp->sd_log_le_rg;
430 struct gfs2_rgrpd *rgd;
431
432 while (!list_empty(head)) {
433 rgd = list_entry(head->next, struct gfs2_rgrpd, rd_le.le_list);
434 list_del_init(&rgd->rd_le.le_list);
435 sdp->sd_log_num_rg--;
436
437 gfs2_rgrp_repolish_clones(rgd);
438 gfs2_rgrp_bh_put(rgd);
439 }
440 gfs2_assert_warn(sdp, !sdp->sd_log_num_rg);
441}
442
443/**
444 * databuf_lo_add - Add a databuf to the transaction.
445 *
446 * This is used in two distinct cases:
447 * i) In ordered write mode
448 * We put the data buffer on a list so that we can ensure that its
449 * synced to disk at the right time
450 * ii) In journaled data mode
451 * We need to journal the data block in the same way as metadata in
452 * the functions above. The difference is that here we have a tag
453 * which is two __be64's being the block number (as per meta data)
454 * and a flag which says whether the data block needs escaping or
455 * not. This means we need a new log entry for each 251 or so data
456 * blocks, which isn't an enormous overhead but twice as much as
457 * for normal metadata blocks.
458 */
459static void databuf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
460{
461 struct gfs2_bufdata *bd = container_of(le, struct gfs2_bufdata, bd_le);
462 struct gfs2_trans *tr = current->journal_info;
463 struct address_space *mapping = bd->bd_bh->b_page->mapping;
464 struct gfs2_inode *ip = GFS2_I(mapping->host);
465
466 tr->tr_touched = 1;
467 if (!list_empty(&bd->bd_list_tr) &&
468 (ip->i_di.di_flags & GFS2_DIF_JDATA)) {
469 tr->tr_num_buf++;
470 list_add(&bd->bd_list_tr, &tr->tr_list_buf);
471 gfs2_pin(sdp, bd->bd_bh);
472 tr->tr_num_buf_new++;
473 }
474 gfs2_trans_add_gl(bd->bd_gl);
475 gfs2_log_lock(sdp);
476 if (!list_empty(&le->le_list)) {
477 if (ip->i_di.di_flags & GFS2_DIF_JDATA)
478 sdp->sd_log_num_jdata++;
479 sdp->sd_log_num_databuf++;
480 list_add(&le->le_list, &sdp->sd_log_le_databuf);
481 }
482 gfs2_log_unlock(sdp);
483}
484
485static int gfs2_check_magic(struct buffer_head *bh)
486{
487 struct page *page = bh->b_page;
488 void *kaddr;
489 __be32 *ptr;
490 int rv = 0;
491
492 kaddr = kmap_atomic(page, KM_USER0);
493 ptr = kaddr + bh_offset(bh);
494 if (*ptr == cpu_to_be32(GFS2_MAGIC))
495 rv = 1;
496 kunmap_atomic(page, KM_USER0);
497
498 return rv;
499}
500
501/**
502 * databuf_lo_before_commit - Scan the data buffers, writing as we go
503 *
504 * Here we scan through the lists of buffers and make the assumption
505 * that any buffer thats been pinned is being journaled, and that
506 * any unpinned buffer is an ordered write data buffer and therefore
507 * will be written back rather than journaled.
508 */
509static void databuf_lo_before_commit(struct gfs2_sbd *sdp)
510{
511 LIST_HEAD(started);
512 struct gfs2_bufdata *bd1 = NULL, *bd2, *bdt;
513 struct buffer_head *bh = NULL;
514 unsigned int offset = sizeof(struct gfs2_log_descriptor);
515 struct gfs2_log_descriptor *ld;
516 unsigned int limit;
517 unsigned int total_dbuf = sdp->sd_log_num_databuf;
518 unsigned int total_jdata = sdp->sd_log_num_jdata;
519 unsigned int num, n;
520 __be64 *ptr = NULL;
521
522 offset += (2*sizeof(__be64) - 1);
523 offset &= ~(2*sizeof(__be64) - 1);
524 limit = (sdp->sd_sb.sb_bsize - offset)/sizeof(__be64);
525
526 /*
527 * Start writing ordered buffers, write journaled buffers
528 * into the log along with a header
529 */
530 gfs2_log_lock(sdp);
531 bd2 = bd1 = list_prepare_entry(bd1, &sdp->sd_log_le_databuf,
532 bd_le.le_list);
533 while(total_dbuf) {
534 num = total_jdata;
535 if (num > limit)
536 num = limit;
537 n = 0;
538 list_for_each_entry_safe_continue(bd1, bdt,
539 &sdp->sd_log_le_databuf,
540 bd_le.le_list) {
541 /* An ordered write buffer */
542 if (bd1->bd_bh && !buffer_pinned(bd1->bd_bh)) {
543 list_move(&bd1->bd_le.le_list, &started);
544 if (bd1 == bd2) {
545 bd2 = NULL;
546 bd2 = list_prepare_entry(bd2,
547 &sdp->sd_log_le_databuf,
548 bd_le.le_list);
549 }
550 total_dbuf--;
551 if (bd1->bd_bh) {
552 get_bh(bd1->bd_bh);
553 if (buffer_dirty(bd1->bd_bh)) {
554 gfs2_log_unlock(sdp);
555 wait_on_buffer(bd1->bd_bh);
556 ll_rw_block(WRITE, 1,
557 &bd1->bd_bh);
558 gfs2_log_lock(sdp);
559 }
560 brelse(bd1->bd_bh);
561 continue;
562 }
563 continue;
564 } else if (bd1->bd_bh) { /* A journaled buffer */
565 int magic;
566 gfs2_log_unlock(sdp);
567 if (!bh) {
568 bh = gfs2_log_get_buf(sdp);
569 sdp->sd_log_num_hdrs++;
570 ld = (struct gfs2_log_descriptor *)
571 bh->b_data;
572 ptr = (__be64 *)(bh->b_data + offset);
573 ld->ld_header.mh_magic =
574 cpu_to_be32(GFS2_MAGIC);
575 ld->ld_header.mh_type =
576 cpu_to_be32(GFS2_METATYPE_LD);
577 ld->ld_header.mh_format =
578 cpu_to_be32(GFS2_FORMAT_LD);
579 ld->ld_type =
580 cpu_to_be32(GFS2_LOG_DESC_JDATA);
581 ld->ld_length = cpu_to_be32(num + 1);
582 ld->ld_data1 = cpu_to_be32(num);
583 ld->ld_data2 = cpu_to_be32(0);
584 memset(ld->ld_reserved, 0, sizeof(ld->ld_reserved));
585 }
586 magic = gfs2_check_magic(bd1->bd_bh);
587 *ptr++ = cpu_to_be64(bd1->bd_bh->b_blocknr);
588 *ptr++ = cpu_to_be64((__u64)magic);
589 clear_buffer_escaped(bd1->bd_bh);
590 if (unlikely(magic != 0))
591 set_buffer_escaped(bd1->bd_bh);
592 gfs2_log_lock(sdp);
593 if (n++ > num)
594 break;
595 }
596 }
597 gfs2_log_unlock(sdp);
598 if (bh) {
599 set_buffer_dirty(bh);
600 ll_rw_block(WRITE, 1, &bh);
601 bh = NULL;
602 }
603 n = 0;
604 gfs2_log_lock(sdp);
605 list_for_each_entry_continue(bd2, &sdp->sd_log_le_databuf,
606 bd_le.le_list) {
607 if (!bd2->bd_bh)
608 continue;
609 /* copy buffer if it needs escaping */
610 gfs2_log_unlock(sdp);
611 if (unlikely(buffer_escaped(bd2->bd_bh))) {
612 void *kaddr;
613 struct page *page = bd2->bd_bh->b_page;
614 bh = gfs2_log_get_buf(sdp);
615 kaddr = kmap_atomic(page, KM_USER0);
616 memcpy(bh->b_data,
617 kaddr + bh_offset(bd2->bd_bh),
618 sdp->sd_sb.sb_bsize);
619 kunmap_atomic(page, KM_USER0);
620 *(__be32 *)bh->b_data = 0;
621 } else {
622 bh = gfs2_log_fake_buf(sdp, bd2->bd_bh);
623 }
624 set_buffer_dirty(bh);
625 ll_rw_block(WRITE, 1, &bh);
626 gfs2_log_lock(sdp);
627 if (++n >= num)
628 break;
629 }
630 bh = NULL;
631 total_dbuf -= num;
632 total_jdata -= num;
633 }
634 gfs2_log_unlock(sdp);
635
636 /* Wait on all ordered buffers */
637 while (!list_empty(&started)) {
638 gfs2_log_lock(sdp);
639 bd1 = list_entry(started.next, struct gfs2_bufdata,
640 bd_le.le_list);
641 list_del(&bd1->bd_le.le_list);
642 sdp->sd_log_num_databuf--;
643
644 bh = bd1->bd_bh;
645 if (bh) {
646 bh->b_private = NULL;
647 gfs2_log_unlock(sdp);
648 wait_on_buffer(bh);
649 brelse(bh);
650 } else
651 gfs2_log_unlock(sdp);
652
653 kmem_cache_free(gfs2_bufdata_cachep, bd1);
654 }
655
656 /* We've removed all the ordered write bufs here, so only jdata left */
657 gfs2_assert_warn(sdp, sdp->sd_log_num_databuf == sdp->sd_log_num_jdata);
658}
659
660static int databuf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
661 struct gfs2_log_descriptor *ld,
662 __be64 *ptr, int pass)
663{
664 struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
665 struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
666 struct gfs2_glock *gl = ip->i_gl;
667 unsigned int blks = be32_to_cpu(ld->ld_data1);
668 struct buffer_head *bh_log, *bh_ip;
669 uint64_t blkno;
670 uint64_t esc;
671 int error = 0;
672
673 if (pass != 1 || be32_to_cpu(ld->ld_type) != GFS2_LOG_DESC_JDATA)
674 return 0;
675
676 gfs2_replay_incr_blk(sdp, &start);
677 for (; blks; gfs2_replay_incr_blk(sdp, &start), blks--) {
678 blkno = be64_to_cpu(*ptr++);
679 esc = be64_to_cpu(*ptr++);
680
681 sdp->sd_found_blocks++;
682
683 if (gfs2_revoke_check(sdp, blkno, start))
684 continue;
685
686 error = gfs2_replay_read_block(jd, start, &bh_log);
687 if (error)
688 return error;
689
690 bh_ip = gfs2_meta_new(gl, blkno);
691 memcpy(bh_ip->b_data, bh_log->b_data, bh_log->b_size);
692
693 /* Unescape */
694 if (esc) {
695 __be32 *eptr = (__be32 *)bh_ip->b_data;
696 *eptr = cpu_to_be32(GFS2_MAGIC);
697 }
698 mark_buffer_dirty(bh_ip);
699
700 brelse(bh_log);
701 brelse(bh_ip);
702 if (error)
703 break;
704
705 sdp->sd_replayed_blocks++;
706 }
707
708 return error;
709}
710
711/* FIXME: sort out accounting for log blocks etc. */
712
713static void databuf_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass)
714{
715 struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
716 struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
717
718 if (error) {
719 gfs2_meta_sync(ip->i_gl,
720 DIO_START | DIO_WAIT);
721 return;
722 }
723 if (pass != 1)
724 return;
725
726 /* data sync? */
727 gfs2_meta_sync(ip->i_gl, DIO_START | DIO_WAIT);
728
729 fs_info(sdp, "jid=%u: Replayed %u of %u data blocks\n",
730 jd->jd_jid, sdp->sd_replayed_blocks, sdp->sd_found_blocks);
731}
732
733static void databuf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
734{
735 struct list_head *head = &sdp->sd_log_le_databuf;
736 struct gfs2_bufdata *bd;
737
738 while (!list_empty(head)) {
739 bd = list_entry(head->next, struct gfs2_bufdata, bd_le.le_list);
740 list_del(&bd->bd_le.le_list);
741 sdp->sd_log_num_databuf--;
742 sdp->sd_log_num_jdata--;
743 gfs2_unpin(sdp, bd->bd_bh, ai);
744 }
745 gfs2_assert_warn(sdp, !sdp->sd_log_num_databuf);
746 gfs2_assert_warn(sdp, !sdp->sd_log_num_jdata);
747}
748
749
750const struct gfs2_log_operations gfs2_glock_lops = {
751 .lo_add = glock_lo_add,
752 .lo_after_commit = glock_lo_after_commit,
753 .lo_name = "glock"
754};
755
756const struct gfs2_log_operations gfs2_buf_lops = {
757 .lo_add = buf_lo_add,
758 .lo_incore_commit = buf_lo_incore_commit,
759 .lo_before_commit = buf_lo_before_commit,
760 .lo_after_commit = buf_lo_after_commit,
761 .lo_before_scan = buf_lo_before_scan,
762 .lo_scan_elements = buf_lo_scan_elements,
763 .lo_after_scan = buf_lo_after_scan,
764 .lo_name = "buf"
765};
766
767const struct gfs2_log_operations gfs2_revoke_lops = {
768 .lo_add = revoke_lo_add,
769 .lo_before_commit = revoke_lo_before_commit,
770 .lo_before_scan = revoke_lo_before_scan,
771 .lo_scan_elements = revoke_lo_scan_elements,
772 .lo_after_scan = revoke_lo_after_scan,
773 .lo_name = "revoke"
774};
775
776const struct gfs2_log_operations gfs2_rg_lops = {
777 .lo_add = rg_lo_add,
778 .lo_after_commit = rg_lo_after_commit,
779 .lo_name = "rg"
780};
781
782const struct gfs2_log_operations gfs2_databuf_lops = {
783 .lo_add = databuf_lo_add,
784 .lo_incore_commit = buf_lo_incore_commit,
785 .lo_before_commit = databuf_lo_before_commit,
786 .lo_after_commit = databuf_lo_after_commit,
787 .lo_scan_elements = databuf_lo_scan_elements,
788 .lo_after_scan = databuf_lo_after_scan,
789 .lo_name = "databuf"
790};
791
792const struct gfs2_log_operations *gfs2_log_ops[] = {
793 &gfs2_glock_lops,
794 &gfs2_buf_lops,
795 &gfs2_revoke_lops,
796 &gfs2_rg_lops,
797 &gfs2_databuf_lops,
798 NULL
799};
800
diff --git a/fs/gfs2/lops.h b/fs/gfs2/lops.h
new file mode 100644
index 000000000000..8a1029d3d389
--- /dev/null
+++ b/fs/gfs2/lops.h
@@ -0,0 +1,96 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __LOPS_DOT_H__
11#define __LOPS_DOT_H__
12
13extern const struct gfs2_log_operations gfs2_glock_lops;
14extern const struct gfs2_log_operations gfs2_buf_lops;
15extern const struct gfs2_log_operations gfs2_revoke_lops;
16extern const struct gfs2_log_operations gfs2_rg_lops;
17extern const struct gfs2_log_operations gfs2_databuf_lops;
18
19extern const struct gfs2_log_operations *gfs2_log_ops[];
20
21static inline void lops_init_le(struct gfs2_log_element *le,
22 const struct gfs2_log_operations *lops)
23{
24 INIT_LIST_HEAD(&le->le_list);
25 le->le_ops = lops;
26}
27
28static inline void lops_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
29{
30 if (le->le_ops->lo_add)
31 le->le_ops->lo_add(sdp, le);
32}
33
34static inline void lops_incore_commit(struct gfs2_sbd *sdp,
35 struct gfs2_trans *tr)
36{
37 int x;
38 for (x = 0; gfs2_log_ops[x]; x++)
39 if (gfs2_log_ops[x]->lo_incore_commit)
40 gfs2_log_ops[x]->lo_incore_commit(sdp, tr);
41}
42
43static inline void lops_before_commit(struct gfs2_sbd *sdp)
44{
45 int x;
46 for (x = 0; gfs2_log_ops[x]; x++)
47 if (gfs2_log_ops[x]->lo_before_commit)
48 gfs2_log_ops[x]->lo_before_commit(sdp);
49}
50
51static inline void lops_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
52{
53 int x;
54 for (x = 0; gfs2_log_ops[x]; x++)
55 if (gfs2_log_ops[x]->lo_after_commit)
56 gfs2_log_ops[x]->lo_after_commit(sdp, ai);
57}
58
59static inline void lops_before_scan(struct gfs2_jdesc *jd,
60 struct gfs2_log_header *head,
61 unsigned int pass)
62{
63 int x;
64 for (x = 0; gfs2_log_ops[x]; x++)
65 if (gfs2_log_ops[x]->lo_before_scan)
66 gfs2_log_ops[x]->lo_before_scan(jd, head, pass);
67}
68
69static inline int lops_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
70 struct gfs2_log_descriptor *ld,
71 __be64 *ptr,
72 unsigned int pass)
73{
74 int x, error;
75 for (x = 0; gfs2_log_ops[x]; x++)
76 if (gfs2_log_ops[x]->lo_scan_elements) {
77 error = gfs2_log_ops[x]->lo_scan_elements(jd, start,
78 ld, ptr, pass);
79 if (error)
80 return error;
81 }
82
83 return 0;
84}
85
86static inline void lops_after_scan(struct gfs2_jdesc *jd, int error,
87 unsigned int pass)
88{
89 int x;
90 for (x = 0; gfs2_log_ops[x]; x++)
91 if (gfs2_log_ops[x]->lo_before_scan)
92 gfs2_log_ops[x]->lo_after_scan(jd, error, pass);
93}
94
95#endif /* __LOPS_DOT_H__ */
96
diff --git a/fs/gfs2/lvb.c b/fs/gfs2/lvb.c
new file mode 100644
index 000000000000..e88e9cce14e7
--- /dev/null
+++ b/fs/gfs2/lvb.c
@@ -0,0 +1,45 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/gfs2_ondisk.h>
16
17#include "gfs2.h"
18#include "lm_interface.h"
19#include "incore.h"
20#include "lvb.h"
21
/*
 * Print one member of a structure as "  member = value".  The expansion
 * already ends in a semicolon.
 */
#define pv(struct, member, fmt) \
	printk(KERN_INFO "  "#member" = "fmt"\n", struct->member);
24
25void gfs2_quota_lvb_in(struct gfs2_quota_lvb *qb, char *lvb)
26{
27 struct gfs2_quota_lvb *str = (struct gfs2_quota_lvb *)lvb;
28
29 qb->qb_magic = be32_to_cpu(str->qb_magic);
30 qb->qb_limit = be64_to_cpu(str->qb_limit);
31 qb->qb_warn = be64_to_cpu(str->qb_warn);
32 qb->qb_value = be64_to_cpu(str->qb_value);
33}
34
35void gfs2_quota_lvb_out(struct gfs2_quota_lvb *qb, char *lvb)
36{
37 struct gfs2_quota_lvb *str = (struct gfs2_quota_lvb *)lvb;
38
39 str->qb_magic = cpu_to_be32(qb->qb_magic);
40 str->qb_limit = cpu_to_be64(qb->qb_limit);
41 str->qb_warn = cpu_to_be64(qb->qb_warn);
42 str->qb_value = cpu_to_be64(qb->qb_value);
43}
44
45
diff --git a/fs/gfs2/lvb.h b/fs/gfs2/lvb.h
new file mode 100644
index 000000000000..1b1a8b75219a
--- /dev/null
+++ b/fs/gfs2/lvb.h
@@ -0,0 +1,19 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __LVB_DOT_H__
11#define __LVB_DOT_H__
12
13#define GFS2_MIN_LVB_SIZE 32
14
15void gfs2_quota_lvb_in(struct gfs2_quota_lvb *qb, char *lvb);
16void gfs2_quota_lvb_out(struct gfs2_quota_lvb *qb, char *lvb);
17
18#endif /* __LVB_DOT_H__ */
19
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
new file mode 100644
index 000000000000..c112943ee8c1
--- /dev/null
+++ b/fs/gfs2/main.c
@@ -0,0 +1,127 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/module.h>
16#include <linux/init.h>
17#include <linux/gfs2_ondisk.h>
18
19#include "gfs2.h"
20#include "lm_interface.h"
21#include "incore.h"
22#include "ops_fstype.h"
23#include "sys.h"
24#include "util.h"
25
26static void gfs2_init_inode_once(void *foo, kmem_cache_t *cachep, unsigned long flags)
27{
28 struct gfs2_inode *ip = foo;
29 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
30 SLAB_CTOR_CONSTRUCTOR) {
31 inode_init_once(&ip->i_inode);
32 spin_lock_init(&ip->i_spin);
33 init_rwsem(&ip->i_rw_mutex);
34 memset(ip->i_cache, 0, sizeof(ip->i_cache));
35 }
36}
37
38/**
39 * init_gfs2_fs - Register GFS2 as a filesystem
40 *
41 * Returns: 0 on success, error code on failure
42 */
43
44static int __init init_gfs2_fs(void)
45{
46 int error;
47
48 gfs2_init_lmh();
49
50 error = gfs2_sys_init();
51 if (error)
52 return error;
53
54 error = -ENOMEM;
55
56 gfs2_glock_cachep = kmem_cache_create("gfs2_glock",
57 sizeof(struct gfs2_glock),
58 0, 0, NULL, NULL);
59 if (!gfs2_glock_cachep)
60 goto fail;
61
62 gfs2_inode_cachep = kmem_cache_create("gfs2_inode",
63 sizeof(struct gfs2_inode),
64 0, (SLAB_RECLAIM_ACCOUNT|
65 SLAB_PANIC|SLAB_MEM_SPREAD),
66 gfs2_init_inode_once, NULL);
67 if (!gfs2_inode_cachep)
68 goto fail;
69
70 gfs2_bufdata_cachep = kmem_cache_create("gfs2_bufdata",
71 sizeof(struct gfs2_bufdata),
72 0, 0, NULL, NULL);
73 if (!gfs2_bufdata_cachep)
74 goto fail;
75
76 error = register_filesystem(&gfs2_fs_type);
77 if (error)
78 goto fail;
79
80 error = register_filesystem(&gfs2meta_fs_type);
81 if (error)
82 goto fail_unregister;
83
84 printk("GFS2 (built %s %s) installed\n", __DATE__, __TIME__);
85
86 return 0;
87
88fail_unregister:
89 unregister_filesystem(&gfs2_fs_type);
90fail:
91 if (gfs2_bufdata_cachep)
92 kmem_cache_destroy(gfs2_bufdata_cachep);
93
94 if (gfs2_inode_cachep)
95 kmem_cache_destroy(gfs2_inode_cachep);
96
97 if (gfs2_glock_cachep)
98 kmem_cache_destroy(gfs2_glock_cachep);
99
100 gfs2_sys_uninit();
101 return error;
102}
103
104/**
105 * exit_gfs2_fs - Unregister the file system
106 *
107 */
108
109static void __exit exit_gfs2_fs(void)
110{
111 unregister_filesystem(&gfs2_fs_type);
112 unregister_filesystem(&gfs2meta_fs_type);
113
114 kmem_cache_destroy(gfs2_bufdata_cachep);
115 kmem_cache_destroy(gfs2_inode_cachep);
116 kmem_cache_destroy(gfs2_glock_cachep);
117
118 gfs2_sys_uninit();
119}
120
121MODULE_DESCRIPTION("Global File System");
122MODULE_AUTHOR("Red Hat, Inc.");
123MODULE_LICENSE("GPL");
124
125module_init(init_gfs2_fs);
126module_exit(exit_gfs2_fs);
127
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
new file mode 100644
index 000000000000..42dfd32059bc
--- /dev/null
+++ b/fs/gfs2/meta_io.c
@@ -0,0 +1,780 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/mm.h>
16#include <linux/pagemap.h>
17#include <linux/writeback.h>
18#include <linux/swap.h>
19#include <linux/delay.h>
20#include <linux/gfs2_ondisk.h>
21
22#include "gfs2.h"
23#include "lm_interface.h"
24#include "incore.h"
25#include "glock.h"
26#include "glops.h"
27#include "inode.h"
28#include "log.h"
29#include "lops.h"
30#include "meta_io.h"
31#include "rgrp.h"
32#include "trans.h"
33#include "util.h"
34#include "ops_address.h"
35
/* Non-zero if the buffer is dirty, locked or pinned */
#define buffer_busy(bh) \
	((bh)->b_state & ((1ul << BH_Dirty) | (1ul << BH_Lock) | \
			  (1ul << BH_Pinned)))
/* Non-zero if the buffer is dirty or locked */
#define buffer_in_io(bh) \
	((bh)->b_state & ((1ul << BH_Dirty) | (1ul << BH_Lock)))
40
41static int aspace_get_block(struct inode *inode, sector_t lblock,
42 struct buffer_head *bh_result, int create)
43{
44 gfs2_assert_warn(inode->i_sb->s_fs_info, 0);
45 return -EOPNOTSUPP;
46}
47
48static int gfs2_aspace_writepage(struct page *page,
49 struct writeback_control *wbc)
50{
51 return block_write_full_page(page, aspace_get_block, wbc);
52}
53
54static const struct address_space_operations aspace_aops = {
55 .writepage = gfs2_aspace_writepage,
56 .releasepage = gfs2_releasepage,
57};
58
59/**
60 * gfs2_aspace_get - Create and initialize a struct inode structure
61 * @sdp: the filesystem the aspace is in
62 *
63 * Right now a struct inode is just a struct inode. Maybe Linux
64 * will supply a more lightweight address space construct (that works)
65 * in the future.
66 *
67 * Make sure pages/buffers in this aspace aren't in high memory.
68 *
69 * Returns: the aspace
70 */
71
72struct inode *gfs2_aspace_get(struct gfs2_sbd *sdp)
73{
74 struct inode *aspace;
75
76 aspace = new_inode(sdp->sd_vfs);
77 if (aspace) {
78 mapping_set_gfp_mask(aspace->i_mapping, GFP_NOFS);
79 aspace->i_mapping->a_ops = &aspace_aops;
80 aspace->i_size = ~0ULL;
81 aspace->u.generic_ip = NULL;
82 insert_inode_hash(aspace);
83 }
84 return aspace;
85}
86
/* Unhash an aspace inode and drop the reference to it. */
void gfs2_aspace_put(struct inode *aspace)
{
	remove_inode_hash(aspace);

	iput(aspace);
}
92
93/**
94 * gfs2_ail1_start_one - Start I/O on a part of the AIL
95 * @sdp: the filesystem
96 * @tr: the part of the AIL
97 *
98 */
99
100void gfs2_ail1_start_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
101{
102 struct gfs2_bufdata *bd, *s;
103 struct buffer_head *bh;
104 int retry;
105
106 BUG_ON(!spin_is_locked(&sdp->sd_log_lock));
107
108 do {
109 retry = 0;
110
111 list_for_each_entry_safe_reverse(bd, s, &ai->ai_ail1_list,
112 bd_ail_st_list) {
113 bh = bd->bd_bh;
114
115 gfs2_assert(sdp, bd->bd_ail == ai);
116
117 if (!buffer_busy(bh)) {
118 if (!buffer_uptodate(bh)) {
119 gfs2_log_unlock(sdp);
120 gfs2_io_error_bh(sdp, bh);
121 gfs2_log_lock(sdp);
122 }
123 list_move(&bd->bd_ail_st_list,
124 &ai->ai_ail2_list);
125 continue;
126 }
127
128 if (!buffer_dirty(bh))
129 continue;
130
131 list_move(&bd->bd_ail_st_list, &ai->ai_ail1_list);
132
133 gfs2_log_unlock(sdp);
134 wait_on_buffer(bh);
135 ll_rw_block(WRITE, 1, &bh);
136 gfs2_log_lock(sdp);
137
138 retry = 1;
139 break;
140 }
141 } while (retry);
142}
143
144/**
145 * gfs2_ail1_empty_one - Check whether or not a trans in the AIL has been synced
146 * @sdp: the filesystem
147 * @ai: the AIL entry
148 *
149 */
150
151int gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai, int flags)
152{
153 struct gfs2_bufdata *bd, *s;
154 struct buffer_head *bh;
155
156 list_for_each_entry_safe_reverse(bd, s, &ai->ai_ail1_list,
157 bd_ail_st_list) {
158 bh = bd->bd_bh;
159
160 gfs2_assert(sdp, bd->bd_ail == ai);
161
162 if (buffer_busy(bh)) {
163 if (flags & DIO_ALL)
164 continue;
165 else
166 break;
167 }
168
169 if (!buffer_uptodate(bh))
170 gfs2_io_error_bh(sdp, bh);
171
172 list_move(&bd->bd_ail_st_list, &ai->ai_ail2_list);
173 }
174
175 return list_empty(&ai->ai_ail1_list);
176}
177
178/**
179 * gfs2_ail2_empty_one - Check whether or not a trans in the AIL has been synced
180 * @sdp: the filesystem
181 * @ai: the AIL entry
182 *
183 */
184
185void gfs2_ail2_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
186{
187 struct list_head *head = &ai->ai_ail2_list;
188 struct gfs2_bufdata *bd;
189
190 while (!list_empty(head)) {
191 bd = list_entry(head->prev, struct gfs2_bufdata,
192 bd_ail_st_list);
193 gfs2_assert(sdp, bd->bd_ail == ai);
194 bd->bd_ail = NULL;
195 list_del(&bd->bd_ail_st_list);
196 list_del(&bd->bd_ail_gl_list);
197 atomic_dec(&bd->bd_gl->gl_ail_count);
198 brelse(bd->bd_bh);
199 }
200}
201
202/**
203 * ail_empty_gl - remove all buffers for a given lock from the AIL
204 * @gl: the glock
205 *
206 * None of the buffers should be dirty, locked, or pinned.
207 */
208
209void gfs2_ail_empty_gl(struct gfs2_glock *gl)
210{
211 struct gfs2_sbd *sdp = gl->gl_sbd;
212 unsigned int blocks;
213 struct list_head *head = &gl->gl_ail_list;
214 struct gfs2_bufdata *bd;
215 struct buffer_head *bh;
216 uint64_t blkno;
217 int error;
218
219 blocks = atomic_read(&gl->gl_ail_count);
220 if (!blocks)
221 return;
222
223 error = gfs2_trans_begin(sdp, 0, blocks);
224 if (gfs2_assert_withdraw(sdp, !error))
225 return;
226
227 gfs2_log_lock(sdp);
228 while (!list_empty(head)) {
229 bd = list_entry(head->next, struct gfs2_bufdata,
230 bd_ail_gl_list);
231 bh = bd->bd_bh;
232 blkno = bh->b_blocknr;
233 gfs2_assert_withdraw(sdp, !buffer_busy(bh));
234
235 bd->bd_ail = NULL;
236 list_del(&bd->bd_ail_st_list);
237 list_del(&bd->bd_ail_gl_list);
238 atomic_dec(&gl->gl_ail_count);
239 brelse(bh);
240 gfs2_log_unlock(sdp);
241
242 gfs2_trans_add_revoke(sdp, blkno);
243
244 gfs2_log_lock(sdp);
245 }
246 gfs2_assert_withdraw(sdp, !atomic_read(&gl->gl_ail_count));
247 gfs2_log_unlock(sdp);
248
249 gfs2_trans_end(sdp);
250 gfs2_log_flush(sdp, NULL);
251}
252
253/**
254 * gfs2_meta_inval - Invalidate all buffers associated with a glock
255 * @gl: the glock
256 *
257 */
258
259void gfs2_meta_inval(struct gfs2_glock *gl)
260{
261 struct gfs2_sbd *sdp = gl->gl_sbd;
262 struct inode *aspace = gl->gl_aspace;
263 struct address_space *mapping = gl->gl_aspace->i_mapping;
264
265 gfs2_assert_withdraw(sdp, !atomic_read(&gl->gl_ail_count));
266
267 atomic_inc(&aspace->i_writecount);
268 truncate_inode_pages(mapping, 0);
269 atomic_dec(&aspace->i_writecount);
270
271 gfs2_assert_withdraw(sdp, !mapping->nrpages);
272}
273
274/**
275 * gfs2_meta_sync - Sync all buffers associated with a glock
276 * @gl: The glock
277 * @flags: DIO_START | DIO_WAIT
278 *
279 */
280
281void gfs2_meta_sync(struct gfs2_glock *gl, int flags)
282{
283 struct address_space *mapping = gl->gl_aspace->i_mapping;
284 int error = 0;
285
286 if (flags & DIO_START)
287 filemap_fdatawrite(mapping);
288 if (!error && (flags & DIO_WAIT))
289 error = filemap_fdatawait(mapping);
290
291 if (error)
292 gfs2_io_error(gl->gl_sbd);
293}
294
295/**
296 * getbuf - Get a buffer with a given address space
297 * @sdp: the filesystem
298 * @aspace: the address space
299 * @blkno: the block number (filesystem scope)
300 * @create: 1 if the buffer should be created
301 *
302 * Returns: the buffer
303 */
304
305static struct buffer_head *getbuf(struct gfs2_sbd *sdp, struct inode *aspace,
306 uint64_t blkno, int create)
307{
308 struct page *page;
309 struct buffer_head *bh;
310 unsigned int shift;
311 unsigned long index;
312 unsigned int bufnum;
313
314 shift = PAGE_CACHE_SHIFT - sdp->sd_sb.sb_bsize_shift;
315 index = blkno >> shift; /* convert block to page */
316 bufnum = blkno - (index << shift); /* block buf index within page */
317
318 if (create) {
319 for (;;) {
320 page = grab_cache_page(aspace->i_mapping, index);
321 if (page)
322 break;
323 yield();
324 }
325 } else {
326 page = find_lock_page(aspace->i_mapping, index);
327 if (!page)
328 return NULL;
329 }
330
331 if (!page_has_buffers(page))
332 create_empty_buffers(page, sdp->sd_sb.sb_bsize, 0);
333
334 /* Locate header for our buffer within our page */
335 for (bh = page_buffers(page); bufnum--; bh = bh->b_this_page)
336 /* Do nothing */;
337 get_bh(bh);
338
339 if (!buffer_mapped(bh))
340 map_bh(bh, sdp->sd_vfs, blkno);
341
342 unlock_page(page);
343 mark_page_accessed(page);
344 page_cache_release(page);
345
346 return bh;
347}
348
349static void meta_prep_new(struct buffer_head *bh)
350{
351 struct gfs2_meta_header *mh = (struct gfs2_meta_header *)bh->b_data;
352
353 lock_buffer(bh);
354 clear_buffer_dirty(bh);
355 set_buffer_uptodate(bh);
356 unlock_buffer(bh);
357
358 mh->mh_magic = cpu_to_be32(GFS2_MAGIC);
359}
360
361/**
362 * gfs2_meta_new - Get a block
363 * @gl: The glock associated with this block
364 * @blkno: The block number
365 *
366 * Returns: The buffer
367 */
368
369struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, uint64_t blkno)
370{
371 struct buffer_head *bh;
372 bh = getbuf(gl->gl_sbd, gl->gl_aspace, blkno, CREATE);
373 meta_prep_new(bh);
374 return bh;
375}
376
377/**
378 * gfs2_meta_read - Read a block from disk
379 * @gl: The glock covering the block
380 * @blkno: The block number
381 * @flags: flags to gfs2_dreread()
382 * @bhp: the place where the buffer is returned (NULL on failure)
383 *
384 * Returns: errno
385 */
386
387int gfs2_meta_read(struct gfs2_glock *gl, uint64_t blkno, int flags,
388 struct buffer_head **bhp)
389{
390 int error;
391
392 *bhp = getbuf(gl->gl_sbd, gl->gl_aspace, blkno, CREATE);
393 error = gfs2_meta_reread(gl->gl_sbd, *bhp, flags);
394 if (error)
395 brelse(*bhp);
396
397 return error;
398}
399
400/**
401 * gfs2_meta_reread - Reread a block from disk
402 * @sdp: the filesystem
403 * @bh: The block to read
404 * @flags: Flags that control the read
405 *
406 * Returns: errno
407 */
408
409int gfs2_meta_reread(struct gfs2_sbd *sdp, struct buffer_head *bh, int flags)
410{
411 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
412 return -EIO;
413
414 if (flags & DIO_FORCE)
415 clear_buffer_uptodate(bh);
416
417 if ((flags & DIO_START) && !buffer_uptodate(bh))
418 ll_rw_block(READ, 1, &bh);
419
420 if (flags & DIO_WAIT) {
421 wait_on_buffer(bh);
422
423 if (!buffer_uptodate(bh)) {
424 struct gfs2_trans *tr = current->journal_info;
425 if (tr && tr->tr_touched)
426 gfs2_io_error_bh(sdp, bh);
427 return -EIO;
428 }
429 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
430 return -EIO;
431 }
432
433 return 0;
434}
435
436/**
437 * gfs2_attach_bufdata - attach a struct gfs2_bufdata structure to a buffer
438 * @gl: the glock the buffer belongs to
439 * @bh: The buffer to be attached to
440 * @meta: Flag to indicate whether its metadata or not
441 */
442
443void gfs2_attach_bufdata(struct gfs2_glock *gl, struct buffer_head *bh,
444 int meta)
445{
446 struct gfs2_bufdata *bd;
447
448 if (meta)
449 lock_page(bh->b_page);
450
451 if (bh->b_private) {
452 if (meta)
453 unlock_page(bh->b_page);
454 return;
455 }
456
457 bd = kmem_cache_alloc(gfs2_bufdata_cachep, GFP_NOFS | __GFP_NOFAIL),
458 memset(bd, 0, sizeof(struct gfs2_bufdata));
459 bd->bd_bh = bh;
460 bd->bd_gl = gl;
461
462 INIT_LIST_HEAD(&bd->bd_list_tr);
463 if (meta) {
464 lops_init_le(&bd->bd_le, &gfs2_buf_lops);
465 } else {
466 lops_init_le(&bd->bd_le, &gfs2_databuf_lops);
467 get_bh(bh);
468 }
469 bh->b_private = bd;
470
471 if (meta)
472 unlock_page(bh->b_page);
473}
474
475/**
476 * gfs2_pin - Pin a buffer in memory
477 * @sdp: the filesystem the buffer belongs to
478 * @bh: The buffer to be pinned
479 *
 * Sets the pinned flag on @bh, clears its dirty flag and takes an extra
 * reference on it.  A withdraw assertion fires if SDF_JOURNAL_LIVE is not
 * set or if the buffer was already pinned.
480 */
481
482void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh)
483{
484 struct gfs2_bufdata *bd = bh->b_private;
485
486 gfs2_assert_withdraw(sdp, test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags));
487
 /* A non-zero return means the pinned flag was already set */
488 if (test_set_buffer_pinned(bh))
489 gfs2_assert_withdraw(sdp, 0);
490
491 wait_on_buffer(bh);
492
493 /* If this buffer is in the AIL and it has already been written
494 to in-place disk block, remove it from the AIL. */
495
 /* The AIL list manipulation is done under the log lock */
496 gfs2_log_lock(sdp);
497 if (bd->bd_ail && !buffer_in_io(bh))
498 list_move(&bd->bd_ail_st_list, &bd->bd_ail->ai_ail2_list);
499 gfs2_log_unlock(sdp);
500
501 clear_buffer_dirty(bh);
502 wait_on_buffer(bh);
503
 /* A buffer that is not uptodate at this point is reported as an I/O error */
504 if (!buffer_uptodate(bh))
505 gfs2_io_error_bh(sdp, bh);
506
 /* Extra reference on the buffer */
507 get_bh(bh);
508}
509
510/**
511 * gfs2_unpin - Unpin a buffer
512 * @sdp: the filesystem the buffer belongs to
513 * @bh: The buffer to unpin
514 * @ai: the AIL entry the buffer is recorded against; it is stored in
 *      bd->bd_ail and the buffer is linked onto its ai_ail1_list
515 *
 * Marks the buffer dirty and clears its pinned flag, then updates the
 * AIL bookkeeping under the log lock.
516 */
517
518void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh,
519 struct gfs2_ail *ai)
520{
521 struct gfs2_bufdata *bd = bh->b_private;
522
523 gfs2_assert_withdraw(sdp, buffer_uptodate(bh));
524
 /* Unpinning a buffer that is not pinned trips a withdraw assertion */
525 if (!buffer_pinned(bh))
526 gfs2_assert_withdraw(sdp, 0);
527
528 mark_buffer_dirty(bh);
529 clear_buffer_pinned(bh);
530
531 gfs2_log_lock(sdp);
532 if (bd->bd_ail) {
 /* Already recorded against an AIL entry: unlink from it and drop a
    buffer reference */
533 list_del(&bd->bd_ail_st_list);
534 brelse(bh);
535 } else {
 /* Not yet on any AIL entry: add to the glock's AIL list and count it */
536 struct gfs2_glock *gl = bd->bd_gl;
537 list_add(&bd->bd_ail_gl_list, &gl->gl_ail_list);
538 atomic_inc(&gl->gl_ail_count);
539 }
 /* In both cases the buffer ends up on @ai's ai_ail1_list */
540 bd->bd_ail = ai;
541 list_add(&bd->bd_ail_st_list, &ai->ai_ail1_list);
542 gfs2_log_unlock(sdp);
543}
544
545/**
546 * gfs2_meta_wipe - make inode's buffers so they aren't dirty/pinned anymore
547 * @ip: the inode who owns the buffers
548 * @bstart: the first buffer in the run
549 * @blen: the number of buffers in the run
550 *
 * Only buffers already present in the inode glock's address space are
 * touched (they are looked up with NO_CREATE).  Each one found is
 * unpinned if pinned, removed from the AIL (with a revoke added) if it
 * is on one, and finally marked neither dirty nor uptodate.
551 */
552
553void gfs2_meta_wipe(struct gfs2_inode *ip, uint64_t bstart, uint32_t blen)
554{
555 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
556 struct inode *aspace = ip->i_gl->gl_aspace;
557 struct buffer_head *bh;
558
559 while (blen) {
 /* NULL here means the block has no cached page; skip it */
560 bh = getbuf(sdp, aspace, bstart, NO_CREATE);
561 if (bh) {
562 struct gfs2_bufdata *bd = bh->b_private;
563
564 if (test_clear_buffer_pinned(bh)) {
 /* NOTE(review): this branch dereferences bd and tr without a NULL
    check - presumably a pinned buffer always has bufdata and is wiped
    inside a transaction; confirm against callers */
565 struct gfs2_trans *tr = current->journal_info;
566 gfs2_log_lock(sdp);
567 list_del_init(&bd->bd_le.le_list);
568 gfs2_assert_warn(sdp, sdp->sd_log_num_buf);
569 sdp->sd_log_num_buf--;
570 gfs2_log_unlock(sdp);
571 tr->tr_num_buf_rm++;
 /* Drops one buffer reference for the cleared pin */
572 brelse(bh);
573 }
574 if (bd) {
575 gfs2_log_lock(sdp);
576 if (bd->bd_ail) {
 /* Save the block number before the reference is dropped; it is
    needed for the revoke after the log lock is released */
577 uint64_t blkno = bh->b_blocknr;
578 bd->bd_ail = NULL;
579 list_del(&bd->bd_ail_st_list);
580 list_del(&bd->bd_ail_gl_list);
581 atomic_dec(&bd->bd_gl->gl_ail_count);
582 brelse(bh);
583 gfs2_log_unlock(sdp);
584 gfs2_trans_add_revoke(sdp, blkno);
585 } else
586 gfs2_log_unlock(sdp);
587 }
588
589 lock_buffer(bh);
590 clear_buffer_dirty(bh);
591 clear_buffer_uptodate(bh);
592 unlock_buffer(bh);
593
 /* Release the reference taken by getbuf() above */
594 brelse(bh);
595 }
596
597 bstart++;
598 blen--;
599 }
600}
601
602/**
603 * gfs2_meta_cache_flush - get rid of any references on buffers for this inode
604 * @ip: The GFS2 inode
605 *
606 * This releases buffers that are in the most-recently-used array of
607 * blocks used for indirect block addressing for this inode.
608 */
609
610void gfs2_meta_cache_flush(struct gfs2_inode *ip)
611{
612 struct buffer_head **bh_slot;
613 unsigned int x;
614
615 spin_lock(&ip->i_spin);
616
617 for (x = 0; x < GFS2_MAX_META_HEIGHT; x++) {
618 bh_slot = &ip->i_cache[x];
619 if (!*bh_slot)
620 break;
621 brelse(*bh_slot);
622 *bh_slot = NULL;
623 }
624
625 spin_unlock(&ip->i_spin);
626}
627
628/**
629 * gfs2_meta_indirect_buffer - Get a metadata buffer
630 * @ip: The GFS2 inode
631 * @height: The level of this buf in the metadata (indir addr) tree (if any)
632 * @num: The block number (device relative) of the buffer
633 * @new: Non-zero if we may create a new buffer
634 * @bhp: the buffer is returned here
635 *
636 * Try to use the gfs2_inode's MRU metadata tree cache.
637 *
 * On success *@bhp holds a referenced buffer.  On failure the reference
 * is dropped and *@bhp is not written.
 *
638 * Returns: errno
639 */
640
641int gfs2_meta_indirect_buffer(struct gfs2_inode *ip, int height, uint64_t num,
642 int new, struct buffer_head **bhp)
643{
 /* bh_slot is the per-height cache slot; @height is not range-checked here */
644 struct buffer_head *bh, **bh_slot = ip->i_cache + height;
645 int error;
646
 /* Cache lookup: a hit requires the cached buffer to be for block @num */
647 spin_lock(&ip->i_spin);
648 bh = *bh_slot;
649 if (bh) {
650 if (bh->b_blocknr == num)
651 get_bh(bh);
652 else
653 bh = NULL;
654 }
655 spin_unlock(&ip->i_spin);
656
657 if (bh) {
 /* Cache hit: either reinitialise as a new block or make sure the
    contents are read in */
658 if (new)
659 meta_prep_new(bh);
660 else {
661 error = gfs2_meta_reread(GFS2_SB(&ip->i_inode), bh,
662 DIO_START | DIO_WAIT);
663 if (error) {
664 brelse(bh);
665 return error;
666 }
667 }
668 } else {
 /* Cache miss: get the buffer through the glock's address space */
669 if (new)
670 bh = gfs2_meta_new(ip->i_gl, num);
671 else {
672 error = gfs2_meta_read(ip->i_gl, num,
673 DIO_START | DIO_WAIT, &bh);
674 if (error)
675 return error;
676 }
677
 /* Install the buffer in the cache slot, releasing whatever was there;
    the slot holds its own reference */
678 spin_lock(&ip->i_spin);
679 if (*bh_slot != bh) {
680 brelse(*bh_slot);
681 *bh_slot = bh;
682 get_bh(bh);
683 }
684 spin_unlock(&ip->i_spin);
685 }
686
687 if (new) {
 /* A new buffer at height 0 is rejected with a warning and -EIO */
688 if (gfs2_assert_warn(GFS2_SB(&ip->i_inode), height)) {
689 brelse(bh);
690 return -EIO;
691 }
692 gfs2_trans_add_bh(ip->i_gl, bh, 1);
693 gfs2_metatype_set(bh, GFS2_METATYPE_IN, GFS2_FORMAT_IN);
694 gfs2_buffer_clear_tail(bh, sizeof(struct gfs2_meta_header));
695
 /* Existing block: height 0 is checked as a dinode, other heights as an
    indirect block */
696 } else if (gfs2_metatype_check(GFS2_SB(&ip->i_inode), bh,
697 (height) ? GFS2_METATYPE_IN : GFS2_METATYPE_DI)) {
698 brelse(bh);
699 return -EIO;
700 }
701
702 *bhp = bh;
703
704 return 0;
705}
706
707/**
708 * gfs2_meta_ra - start readahead on an extent of a file
709 * @gl: the glock the blocks belong to
710 * @dblock: the starting disk block
711 * @extlen: the number of blocks in the extent
712 *
713 */
714
715void gfs2_meta_ra(struct gfs2_glock *gl, uint64_t dblock, uint32_t extlen)
716{
717 struct gfs2_sbd *sdp = gl->gl_sbd;
718 struct inode *aspace = gl->gl_aspace;
719 struct buffer_head *first_bh, *bh;
720 uint32_t max_ra = gfs2_tune_get(sdp, gt_max_readahead) >>
721 sdp->sd_sb.sb_bsize_shift;
722 int error;
723
724 if (!extlen || !max_ra)
725 return;
726 if (extlen > max_ra)
727 extlen = max_ra;
728
729 first_bh = getbuf(sdp, aspace, dblock, CREATE);
730
731 if (buffer_uptodate(first_bh))
732 goto out;
733 if (!buffer_locked(first_bh)) {
734 error = gfs2_meta_reread(sdp, first_bh, DIO_START);
735 if (error)
736 goto out;
737 }
738
739 dblock++;
740 extlen--;
741
742 while (extlen) {
743 bh = getbuf(sdp, aspace, dblock, CREATE);
744
745 if (!buffer_uptodate(bh) && !buffer_locked(bh)) {
746 error = gfs2_meta_reread(sdp, bh, DIO_START);
747 brelse(bh);
748 if (error)
749 goto out;
750 } else
751 brelse(bh);
752
753 dblock++;
754 extlen--;
755
756 if (buffer_uptodate(first_bh))
757 break;
758 }
759
760 out:
761 brelse(first_bh);
762}
763
764/**
765 * gfs2_meta_syncfs - sync all the buffers in a filesystem
766 * @sdp: the filesystem
767 *
768 */
769
770void gfs2_meta_syncfs(struct gfs2_sbd *sdp)
771{
772 gfs2_log_flush(sdp, NULL);
773 for (;;) {
774 gfs2_ail1_start(sdp, DIO_ALL);
775 if (gfs2_ail1_empty(sdp, DIO_ALL))
776 break;
777 msleep(10);
778 }
779}
780
diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h
new file mode 100644
index 000000000000..23c6a596fd9e
--- /dev/null
+++ b/fs/gfs2/meta_io.h
@@ -0,0 +1,89 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __DIO_DOT_H__
11#define __DIO_DOT_H__
12
13static inline void gfs2_buffer_clear(struct buffer_head *bh)
14{
15 memset(bh->b_data, 0, bh->b_size);
16}
17
18static inline void gfs2_buffer_clear_tail(struct buffer_head *bh, int head)
19{
20 memset(bh->b_data + head, 0, bh->b_size - head);
21}
22
23static inline void gfs2_buffer_clear_ends(struct buffer_head *bh, int offset,
24 int amount, int journaled)
25{
26 int z_off1 = (journaled) ? sizeof(struct gfs2_meta_header) : 0;
27 int z_len1 = offset - z_off1;
28 int z_off2 = offset + amount;
29 int z_len2 = (bh)->b_size - z_off2;
30
31 if (z_len1)
32 memset(bh->b_data + z_off1, 0, z_len1);
33
34 if (z_len2)
35 memset(bh->b_data + z_off2, 0, z_len2);
36}
37
38static inline void gfs2_buffer_copy_tail(struct buffer_head *to_bh,
39 int to_head,
40 struct buffer_head *from_bh,
41 int from_head)
42{
43 memcpy(to_bh->b_data + to_head,
44 from_bh->b_data + from_head,
45 from_bh->b_size - from_head);
46 memset(to_bh->b_data + to_bh->b_size + to_head - from_head,
47 0,
48 from_head - to_head);
49}
50
51struct inode *gfs2_aspace_get(struct gfs2_sbd *sdp);
52void gfs2_aspace_put(struct inode *aspace);
53
54void gfs2_ail1_start_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai);
55int gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai, int flags);
56void gfs2_ail2_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai);
57void gfs2_ail_empty_gl(struct gfs2_glock *gl);
58
59void gfs2_meta_inval(struct gfs2_glock *gl);
60void gfs2_meta_sync(struct gfs2_glock *gl, int flags);
61
62struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, uint64_t blkno);
63int gfs2_meta_read(struct gfs2_glock *gl, uint64_t blkno,
64 int flags, struct buffer_head **bhp);
65int gfs2_meta_reread(struct gfs2_sbd *sdp, struct buffer_head *bh, int flags);
66
67void gfs2_attach_bufdata(struct gfs2_glock *gl, struct buffer_head *bh,
68 int meta);
69void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh);
70void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh,
71 struct gfs2_ail *ai);
72
73void gfs2_meta_wipe(struct gfs2_inode *ip, uint64_t bstart, uint32_t blen);
74
75void gfs2_meta_cache_flush(struct gfs2_inode *ip);
76int gfs2_meta_indirect_buffer(struct gfs2_inode *ip, int height, uint64_t num,
77 int new, struct buffer_head **bhp);
78
79static inline int gfs2_meta_inode_buffer(struct gfs2_inode *ip,
80 struct buffer_head **bhp)
81{
82 return gfs2_meta_indirect_buffer(ip, 0, ip->i_num.no_addr, 0, bhp);
83}
84
85void gfs2_meta_ra(struct gfs2_glock *gl, uint64_t dblock, uint32_t extlen);
86void gfs2_meta_syncfs(struct gfs2_sbd *sdp);
87
88#endif /* __DIO_DOT_H__ */
89
diff --git a/fs/gfs2/mount.c b/fs/gfs2/mount.c
new file mode 100644
index 000000000000..0d4b230785af
--- /dev/null
+++ b/fs/gfs2/mount.c
@@ -0,0 +1,214 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/gfs2_ondisk.h>
16
17#include "gfs2.h"
18#include "lm_interface.h"
19#include "incore.h"
20#include "mount.h"
21#include "sys.h"
22#include "util.h"
23
24/**
25 * gfs2_mount_args - Parse mount options
26 * @sdp:
27 * @data:
28 *
29 * Return: errno
30 */
31
32int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount)
33{
34 struct gfs2_args *args = &sdp->sd_args;
35 char *data = data_arg;
36 char *options, *o, *v;
37 int error = 0;
38
39 if (!remount) {
40 /* If someone preloaded options, use those instead */
41 spin_lock(&gfs2_sys_margs_lock);
42 if (gfs2_sys_margs) {
43 data = gfs2_sys_margs;
44 gfs2_sys_margs = NULL;
45 }
46 spin_unlock(&gfs2_sys_margs_lock);
47
48 /* Set some defaults */
49 args->ar_num_glockd = GFS2_GLOCKD_DEFAULT;
50 args->ar_quota = GFS2_QUOTA_DEFAULT;
51 args->ar_data = GFS2_DATA_DEFAULT;
52 }
53
54 /* Split the options into tokens with the "," character and
55 process them */
56
57 for (options = data; (o = strsep(&options, ",")); ) {
58 if (!*o)
59 continue;
60
61 v = strchr(o, '=');
62 if (v)
63 *v++ = 0;
64
65 if (!strcmp(o, "lockproto")) {
66 if (!v)
67 goto need_value;
68 if (remount && strcmp(v, args->ar_lockproto))
69 goto cant_remount;
70 strncpy(args->ar_lockproto, v, GFS2_LOCKNAME_LEN);
71 args->ar_lockproto[GFS2_LOCKNAME_LEN - 1] = 0;
72 }
73
74 else if (!strcmp(o, "locktable")) {
75 if (!v)
76 goto need_value;
77 if (remount && strcmp(v, args->ar_locktable))
78 goto cant_remount;
79 strncpy(args->ar_locktable, v, GFS2_LOCKNAME_LEN);
80 args->ar_locktable[GFS2_LOCKNAME_LEN - 1] = 0;
81 }
82
83 else if (!strcmp(o, "hostdata")) {
84 if (!v)
85 goto need_value;
86 if (remount && strcmp(v, args->ar_hostdata))
87 goto cant_remount;
88 strncpy(args->ar_hostdata, v, GFS2_LOCKNAME_LEN);
89 args->ar_hostdata[GFS2_LOCKNAME_LEN - 1] = 0;
90 }
91
92 else if (!strcmp(o, "spectator")) {
93 if (remount && !args->ar_spectator)
94 goto cant_remount;
95 args->ar_spectator = 1;
96 sdp->sd_vfs->s_flags |= MS_RDONLY;
97 }
98
99 else if (!strcmp(o, "ignore_local_fs")) {
100 if (remount && !args->ar_ignore_local_fs)
101 goto cant_remount;
102 args->ar_ignore_local_fs = 1;
103 }
104
105 else if (!strcmp(o, "localflocks")) {
106 if (remount && !args->ar_localflocks)
107 goto cant_remount;
108 args->ar_localflocks = 1;
109 }
110
111 else if (!strcmp(o, "localcaching")) {
112 if (remount && !args->ar_localcaching)
113 goto cant_remount;
114 args->ar_localcaching = 1;
115 }
116
117 else if (!strcmp(o, "debug"))
118 args->ar_debug = 1;
119
120 else if (!strcmp(o, "nodebug"))
121 args->ar_debug = 0;
122
123 else if (!strcmp(o, "upgrade")) {
124 if (remount && !args->ar_upgrade)
125 goto cant_remount;
126 args->ar_upgrade = 1;
127 }
128
129 else if (!strcmp(o, "num_glockd")) {
130 unsigned int x;
131 if (!v)
132 goto need_value;
133 sscanf(v, "%u", &x);
134 if (remount && x != args->ar_num_glockd)
135 goto cant_remount;
136 if (!x || x > GFS2_GLOCKD_MAX) {
137 fs_info(sdp, "0 < num_glockd <= %u (not %u)\n",
138 GFS2_GLOCKD_MAX, x);
139 error = -EINVAL;
140 break;
141 }
142 args->ar_num_glockd = x;
143 }
144
145 else if (!strcmp(o, "acl")) {
146 args->ar_posix_acl = 1;
147 sdp->sd_vfs->s_flags |= MS_POSIXACL;
148 }
149
150 else if (!strcmp(o, "noacl")) {
151 args->ar_posix_acl = 0;
152 sdp->sd_vfs->s_flags &= ~MS_POSIXACL;
153 }
154
155 else if (!strcmp(o, "quota")) {
156 if (!v)
157 goto need_value;
158 if (!strcmp(v, "off"))
159 args->ar_quota = GFS2_QUOTA_OFF;
160 else if (!strcmp(v, "account"))
161 args->ar_quota = GFS2_QUOTA_ACCOUNT;
162 else if (!strcmp(v, "on"))
163 args->ar_quota = GFS2_QUOTA_ON;
164 else {
165 fs_info(sdp, "invalid value for quota\n");
166 error = -EINVAL;
167 break;
168 }
169 }
170
171 else if (!strcmp(o, "suiddir"))
172 args->ar_suiddir = 1;
173
174 else if (!strcmp(o, "nosuiddir"))
175 args->ar_suiddir = 0;
176
177 else if (!strcmp(o, "data")) {
178 if (!v)
179 goto need_value;
180 if (!strcmp(v, "writeback"))
181 args->ar_data = GFS2_DATA_WRITEBACK;
182 else if (!strcmp(v, "ordered"))
183 args->ar_data = GFS2_DATA_ORDERED;
184 else {
185 fs_info(sdp, "invalid value for data\n");
186 error = -EINVAL;
187 break;
188 }
189 }
190
191 else {
192 fs_info(sdp, "unknown option: %s\n", o);
193 error = -EINVAL;
194 break;
195 }
196 }
197
198 if (error)
199 fs_info(sdp, "invalid mount option(s)\n");
200
201 if (data != data_arg)
202 kfree(data);
203
204 return error;
205
206 need_value:
207 fs_info(sdp, "need value for option %s\n", o);
208 return -EINVAL;
209
210 cant_remount:
211 fs_info(sdp, "can't remount with option %s\n", o);
212 return -EINVAL;
213}
214
diff --git a/fs/gfs2/mount.h b/fs/gfs2/mount.h
new file mode 100644
index 000000000000..2eb14722144f
--- /dev/null
+++ b/fs/gfs2/mount.h
@@ -0,0 +1,15 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __MOUNT_DOT_H__
11#define __MOUNT_DOT_H__
12
13int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount);
14
15#endif /* __MOUNT_DOT_H__ */
diff --git a/fs/gfs2/ondisk.c b/fs/gfs2/ondisk.c
new file mode 100644
index 000000000000..39c7f0345fc6
--- /dev/null
+++ b/fs/gfs2/ondisk.c
@@ -0,0 +1,308 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15
16#include "gfs2.h"
17#include <linux/gfs2_ondisk.h>
18
/*
 * pv - print one structure member with printk(KERN_INFO ...)
 * ptr:    pointer to the structure
 * member: name of the member; also used as the printed label
 * fmt:    printk conversion for the member's value
 *
 * Expands to a single expression without a trailing semicolon, so the
 * caller's own ';' ends the statement and the macro is safe in an
 * unbraced if/else.
 */
#define pv(ptr, member, fmt) \
	printk(KERN_INFO " "#member" = "fmt"\n", (ptr)->member)
21
22/*
23 * gfs2_xxx_in - read in an xxx struct
24 * first arg: the cpu-order structure
25 * buf: the disk-order buffer
26 *
27 * gfs2_xxx_out - write out an xxx struct
28 * first arg: the cpu-order structure
29 * buf: the disk-order buffer
30 *
31 * gfs2_xxx_print - print out an xxx struct
32 * first arg: the cpu-order structure
33 */
34
35void gfs2_inum_in(struct gfs2_inum *no, char *buf)
36{
37 struct gfs2_inum *str = (struct gfs2_inum *)buf;
38
39 no->no_formal_ino = be64_to_cpu(str->no_formal_ino);
40 no->no_addr = be64_to_cpu(str->no_addr);
41}
42
43void gfs2_inum_out(const struct gfs2_inum *no, char *buf)
44{
45 struct gfs2_inum *str = (struct gfs2_inum *)buf;
46
47 str->no_formal_ino = cpu_to_be64(no->no_formal_ino);
48 str->no_addr = cpu_to_be64(no->no_addr);
49}
50
51static void gfs2_inum_print(struct gfs2_inum *no)
52{
53 printk(KERN_INFO " no_formal_ino = %llu\n", (unsigned long long)no->no_formal_ino);
54 printk(KERN_INFO " no_addr = %llu\n", (unsigned long long)no->no_addr);
55}
56
57static void gfs2_meta_header_in(struct gfs2_meta_header *mh, char *buf)
58{
59 struct gfs2_meta_header *str = (struct gfs2_meta_header *)buf;
60
61 mh->mh_magic = be32_to_cpu(str->mh_magic);
62 mh->mh_type = be32_to_cpu(str->mh_type);
63 mh->mh_format = be32_to_cpu(str->mh_format);
64}
65
66static void gfs2_meta_header_out(struct gfs2_meta_header *mh, char *buf)
67{
68 struct gfs2_meta_header *str = (struct gfs2_meta_header *)buf;
69
70 str->mh_magic = cpu_to_be32(mh->mh_magic);
71 str->mh_type = cpu_to_be32(mh->mh_type);
72 str->mh_format = cpu_to_be32(mh->mh_format);
73}
74
75static void gfs2_meta_header_print(struct gfs2_meta_header *mh)
76{
77 pv(mh, mh_magic, "0x%.8X");
78 pv(mh, mh_type, "%u");
79 pv(mh, mh_format, "%u");
80}
81
82void gfs2_sb_in(struct gfs2_sb *sb, char *buf)
83{
84 struct gfs2_sb *str = (struct gfs2_sb *)buf;
85
86 gfs2_meta_header_in(&sb->sb_header, buf);
87
88 sb->sb_fs_format = be32_to_cpu(str->sb_fs_format);
89 sb->sb_multihost_format = be32_to_cpu(str->sb_multihost_format);
90 sb->sb_bsize = be32_to_cpu(str->sb_bsize);
91 sb->sb_bsize_shift = be32_to_cpu(str->sb_bsize_shift);
92
93 gfs2_inum_in(&sb->sb_master_dir, (char *)&str->sb_master_dir);
94 gfs2_inum_in(&sb->sb_root_dir, (char *)&str->sb_root_dir);
95
96 memcpy(sb->sb_lockproto, str->sb_lockproto, GFS2_LOCKNAME_LEN);
97 memcpy(sb->sb_locktable, str->sb_locktable, GFS2_LOCKNAME_LEN);
98}
99
100void gfs2_rindex_in(struct gfs2_rindex *ri, char *buf)
101{
102 struct gfs2_rindex *str = (struct gfs2_rindex *)buf;
103
104 ri->ri_addr = be64_to_cpu(str->ri_addr);
105 ri->ri_length = be32_to_cpu(str->ri_length);
106 ri->ri_data0 = be64_to_cpu(str->ri_data0);
107 ri->ri_data = be32_to_cpu(str->ri_data);
108 ri->ri_bitbytes = be32_to_cpu(str->ri_bitbytes);
109
110}
111
112void gfs2_rindex_print(struct gfs2_rindex *ri)
113{
114 printk(KERN_INFO " ri_addr = %llu\n", (unsigned long long)ri->ri_addr);
115 pv(ri, ri_length, "%u");
116
117 printk(KERN_INFO " ri_data0 = %llu\n", (unsigned long long)ri->ri_data0);
118 pv(ri, ri_data, "%u");
119
120 pv(ri, ri_bitbytes, "%u");
121}
122
123void gfs2_rgrp_in(struct gfs2_rgrp *rg, char *buf)
124{
125 struct gfs2_rgrp *str = (struct gfs2_rgrp *)buf;
126
127 gfs2_meta_header_in(&rg->rg_header, buf);
128 rg->rg_flags = be32_to_cpu(str->rg_flags);
129 rg->rg_free = be32_to_cpu(str->rg_free);
130 rg->rg_dinodes = be32_to_cpu(str->rg_dinodes);
131 rg->rg_igeneration = be64_to_cpu(str->rg_igeneration);
132}
133
134void gfs2_rgrp_out(struct gfs2_rgrp *rg, char *buf)
135{
136 struct gfs2_rgrp *str = (struct gfs2_rgrp *)buf;
137
138 gfs2_meta_header_out(&rg->rg_header, buf);
139 str->rg_flags = cpu_to_be32(rg->rg_flags);
140 str->rg_free = cpu_to_be32(rg->rg_free);
141 str->rg_dinodes = cpu_to_be32(rg->rg_dinodes);
142 str->__pad = cpu_to_be32(0);
143 str->rg_igeneration = cpu_to_be64(rg->rg_igeneration);
144 memset(&str->rg_reserved, 0, sizeof(str->rg_reserved));
145}
146
147void gfs2_quota_in(struct gfs2_quota *qu, char *buf)
148{
149 struct gfs2_quota *str = (struct gfs2_quota *)buf;
150
151 qu->qu_limit = be64_to_cpu(str->qu_limit);
152 qu->qu_warn = be64_to_cpu(str->qu_warn);
153 qu->qu_value = be64_to_cpu(str->qu_value);
154}
155
156void gfs2_dinode_in(struct gfs2_dinode *di, char *buf)
157{
158 struct gfs2_dinode *str = (struct gfs2_dinode *)buf;
159
160 gfs2_meta_header_in(&di->di_header, buf);
161 gfs2_inum_in(&di->di_num, (char *)&str->di_num);
162
163 di->di_mode = be32_to_cpu(str->di_mode);
164 di->di_uid = be32_to_cpu(str->di_uid);
165 di->di_gid = be32_to_cpu(str->di_gid);
166 di->di_nlink = be32_to_cpu(str->di_nlink);
167 di->di_size = be64_to_cpu(str->di_size);
168 di->di_blocks = be64_to_cpu(str->di_blocks);
169 di->di_atime = be64_to_cpu(str->di_atime);
170 di->di_mtime = be64_to_cpu(str->di_mtime);
171 di->di_ctime = be64_to_cpu(str->di_ctime);
172 di->di_major = be32_to_cpu(str->di_major);
173 di->di_minor = be32_to_cpu(str->di_minor);
174
175 di->di_goal_meta = be64_to_cpu(str->di_goal_meta);
176 di->di_goal_data = be64_to_cpu(str->di_goal_data);
177 di->di_generation = be64_to_cpu(str->di_generation);
178
179 di->di_flags = be32_to_cpu(str->di_flags);
180 di->di_payload_format = be32_to_cpu(str->di_payload_format);
181 di->di_height = be16_to_cpu(str->di_height);
182
183 di->di_depth = be16_to_cpu(str->di_depth);
184 di->di_entries = be32_to_cpu(str->di_entries);
185
186 di->di_eattr = be64_to_cpu(str->di_eattr);
187
188}
189
190void gfs2_dinode_out(struct gfs2_dinode *di, char *buf)
191{
192 struct gfs2_dinode *str = (struct gfs2_dinode *)buf;
193
194 gfs2_meta_header_out(&di->di_header, buf);
195 gfs2_inum_out(&di->di_num, (char *)&str->di_num);
196
197 str->di_mode = cpu_to_be32(di->di_mode);
198 str->di_uid = cpu_to_be32(di->di_uid);
199 str->di_gid = cpu_to_be32(di->di_gid);
200 str->di_nlink = cpu_to_be32(di->di_nlink);
201 str->di_size = cpu_to_be64(di->di_size);
202 str->di_blocks = cpu_to_be64(di->di_blocks);
203 str->di_atime = cpu_to_be64(di->di_atime);
204 str->di_mtime = cpu_to_be64(di->di_mtime);
205 str->di_ctime = cpu_to_be64(di->di_ctime);
206 str->di_major = cpu_to_be32(di->di_major);
207 str->di_minor = cpu_to_be32(di->di_minor);
208
209 str->di_goal_meta = cpu_to_be64(di->di_goal_meta);
210 str->di_goal_data = cpu_to_be64(di->di_goal_data);
211 str->di_generation = cpu_to_be64(di->di_generation);
212
213 str->di_flags = cpu_to_be32(di->di_flags);
214 str->di_payload_format = cpu_to_be32(di->di_payload_format);
215 str->di_height = cpu_to_be16(di->di_height);
216
217 str->di_depth = cpu_to_be16(di->di_depth);
218 str->di_entries = cpu_to_be32(di->di_entries);
219
220 str->di_eattr = cpu_to_be64(di->di_eattr);
221
222}
223
224void gfs2_dinode_print(struct gfs2_dinode *di)
225{
226 gfs2_meta_header_print(&di->di_header);
227 gfs2_inum_print(&di->di_num);
228
229 pv(di, di_mode, "0%o");
230 pv(di, di_uid, "%u");
231 pv(di, di_gid, "%u");
232 pv(di, di_nlink, "%u");
233 printk(KERN_INFO " di_size = %llu\n", (unsigned long long)di->di_size);
234 printk(KERN_INFO " di_blocks = %llu\n", (unsigned long long)di->di_blocks);
235 printk(KERN_INFO " di_atime = %lld\n", (long long)di->di_atime);
236 printk(KERN_INFO " di_mtime = %lld\n", (long long)di->di_mtime);
237 printk(KERN_INFO " di_ctime = %lld\n", (long long)di->di_ctime);
238 pv(di, di_major, "%u");
239 pv(di, di_minor, "%u");
240
241 printk(KERN_INFO " di_goal_meta = %llu\n", (unsigned long long)di->di_goal_meta);
242 printk(KERN_INFO " di_goal_data = %llu\n", (unsigned long long)di->di_goal_data);
243
244 pv(di, di_flags, "0x%.8X");
245 pv(di, di_payload_format, "%u");
246 pv(di, di_height, "%u");
247
248 pv(di, di_depth, "%u");
249 pv(di, di_entries, "%u");
250
251 printk(KERN_INFO " di_eattr = %llu\n", (unsigned long long)di->di_eattr);
252}
253
254void gfs2_log_header_in(struct gfs2_log_header *lh, char *buf)
255{
256 struct gfs2_log_header *str = (struct gfs2_log_header *)buf;
257
258 gfs2_meta_header_in(&lh->lh_header, buf);
259 lh->lh_sequence = be64_to_cpu(str->lh_sequence);
260 lh->lh_flags = be32_to_cpu(str->lh_flags);
261 lh->lh_tail = be32_to_cpu(str->lh_tail);
262 lh->lh_blkno = be32_to_cpu(str->lh_blkno);
263 lh->lh_hash = be32_to_cpu(str->lh_hash);
264}
265
266void gfs2_inum_range_in(struct gfs2_inum_range *ir, char *buf)
267{
268 struct gfs2_inum_range *str = (struct gfs2_inum_range *)buf;
269
270 ir->ir_start = be64_to_cpu(str->ir_start);
271 ir->ir_length = be64_to_cpu(str->ir_length);
272}
273
274void gfs2_inum_range_out(struct gfs2_inum_range *ir, char *buf)
275{
276 struct gfs2_inum_range *str = (struct gfs2_inum_range *)buf;
277
278 str->ir_start = cpu_to_be64(ir->ir_start);
279 str->ir_length = cpu_to_be64(ir->ir_length);
280}
281
282void gfs2_statfs_change_in(struct gfs2_statfs_change *sc, char *buf)
283{
284 struct gfs2_statfs_change *str = (struct gfs2_statfs_change *)buf;
285
286 sc->sc_total = be64_to_cpu(str->sc_total);
287 sc->sc_free = be64_to_cpu(str->sc_free);
288 sc->sc_dinodes = be64_to_cpu(str->sc_dinodes);
289}
290
291void gfs2_statfs_change_out(struct gfs2_statfs_change *sc, char *buf)
292{
293 struct gfs2_statfs_change *str = (struct gfs2_statfs_change *)buf;
294
295 str->sc_total = cpu_to_be64(sc->sc_total);
296 str->sc_free = cpu_to_be64(sc->sc_free);
297 str->sc_dinodes = cpu_to_be64(sc->sc_dinodes);
298}
299
300void gfs2_quota_change_in(struct gfs2_quota_change *qc, char *buf)
301{
302 struct gfs2_quota_change *str = (struct gfs2_quota_change *)buf;
303
304 qc->qc_change = be64_to_cpu(str->qc_change);
305 qc->qc_flags = be32_to_cpu(str->qc_flags);
306 qc->qc_id = be32_to_cpu(str->qc_id);
307}
308
diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c
new file mode 100644
index 000000000000..fca69f12e4de
--- /dev/null
+++ b/fs/gfs2/ops_address.c
@@ -0,0 +1,778 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/pagemap.h>
16#include <linux/pagevec.h>
17#include <linux/mpage.h>
18#include <linux/fs.h>
19#include <linux/gfs2_ondisk.h>
20
21#include "gfs2.h"
22#include "lm_interface.h"
23#include "incore.h"
24#include "bmap.h"
25#include "glock.h"
26#include "inode.h"
27#include "log.h"
28#include "meta_io.h"
29#include "ops_address.h"
30#include "quota.h"
31#include "trans.h"
32#include "rgrp.h"
33#include "ops_file.h"
34#include "util.h"
35#include "glops.h"
36
37
38static void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page,
39 unsigned int from, unsigned int to)
40{
41 struct buffer_head *head = page_buffers(page);
42 unsigned int bsize = head->b_size;
43 struct buffer_head *bh;
44 unsigned int start, end;
45
46 for (bh = head, start = 0; bh != head || !start;
47 bh = bh->b_this_page, start = end) {
48 end = start + bsize;
49 if (end <= from || start >= to)
50 continue;
51 gfs2_trans_add_bh(ip->i_gl, bh, 0);
52 }
53}
54
55/**
56 * gfs2_get_block - Fills in a buffer head with details about a block
57 * @inode: The inode
58 * @lblock: The block number to look up
59 * @bh_result: The buffer head to return the result in
60 * @create: Non-zero if we may add block to the file
61 *
62 * Returns: errno
63 */
64
65int gfs2_get_block(struct inode *inode, sector_t lblock,
66 struct buffer_head *bh_result, int create)
67{
68 int new = create;
69 uint64_t dblock;
70 int error;
71 int boundary;
72
73 error = gfs2_block_map(inode, lblock, &new, &dblock, &boundary);
74 if (error)
75 return error;
76
77 if (!dblock)
78 return 0;
79
80 map_bh(bh_result, inode->i_sb, dblock);
81 if (new)
82 set_buffer_new(bh_result);
83 if (boundary)
84 set_buffer_boundary(bh_result);
85
86 return 0;
87}
88
89/**
90 * get_block_noalloc - Fills in a buffer head with details about a block
91 * @inode: The inode
92 * @lblock: The block number to look up
93 * @bh_result: The buffer head to return the result in
94 * @create: Non-zero if we may add block to the file
95 *
96 * Returns: errno
97 */
98
99static int get_block_noalloc(struct inode *inode, sector_t lblock,
100 struct buffer_head *bh_result, int create)
101{
102 int new = 0;
103 uint64_t dblock;
104 int error;
105 int boundary;
106
107 error = gfs2_block_map(inode, lblock, &new, &dblock, &boundary);
108 if (error)
109 return error;
110
111 if (dblock)
112 map_bh(bh_result, inode->i_sb, dblock);
113 else if (gfs2_assert_withdraw(GFS2_SB(inode), !create))
114 error = -EIO;
115 if (boundary)
116 set_buffer_boundary(bh_result);
117
118 return error;
119}
120
121/**
122 * gfs2_writepage - Write complete page
123 * @page: Page to write
124 *
125 * Returns: errno
126 *
127 * Some of this is copied from block_write_full_page() although we still
128 * call it to do most of the work.
129 */
130
131static int gfs2_writepage(struct page *page, struct writeback_control *wbc)
132{
133 struct inode *inode = page->mapping->host;
134 struct gfs2_inode *ip = GFS2_I(page->mapping->host);
135 struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host);
136 loff_t i_size = i_size_read(inode);
137 pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
138 unsigned offset;
139 int error;
140 int done_trans = 0;
141
142 if (gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(ip->i_gl))) {
143 unlock_page(page);
144 return -EIO;
145 }
146 if (current->journal_info)
147 goto out_ignore;
148
149 /* Is the page fully outside i_size? (truncate in progress) */
150 offset = i_size & (PAGE_CACHE_SIZE-1);
151 if (page->index > end_index || (page->index == end_index && !offset)) {
152 page->mapping->a_ops->invalidatepage(page, 0);
153 unlock_page(page);
154 return 0; /* don't care */
155 }
156
157 if (sdp->sd_args.ar_data == GFS2_DATA_ORDERED || gfs2_is_jdata(ip)) {
158 error = gfs2_trans_begin(sdp, RES_DINODE + 1, 0);
159 if (error)
160 goto out_ignore;
161 gfs2_page_add_databufs(ip, page, 0, sdp->sd_vfs->s_blocksize-1);
162 done_trans = 1;
163 }
164 error = block_write_full_page(page, get_block_noalloc, wbc);
165 if (done_trans)
166 gfs2_trans_end(sdp);
167 gfs2_meta_cache_flush(ip);
168 return error;
169
170out_ignore:
171 redirty_page_for_writepage(wbc, page);
172 unlock_page(page);
173 return 0;
174}
175
176static int zero_readpage(struct page *page)
177{
178 void *kaddr;
179
180 kaddr = kmap_atomic(page, KM_USER0);
181 memset(kaddr, 0, PAGE_CACHE_SIZE);
182 kunmap_atomic(page, KM_USER0);
183
184 SetPageUptodate(page);
185
186 return 0;
187}
188
189/**
190 * stuffed_readpage - Fill in a Linux page with stuffed file data
191 * @ip: the inode
192 * @page: the page
193 *
194 * Returns: errno
195 */
196
197static int stuffed_readpage(struct gfs2_inode *ip, struct page *page)
198{
199 struct buffer_head *dibh;
200 void *kaddr;
201 int error;
202
203 /* Only the first page of a stuffed file might contain data */
204 if (unlikely(page->index))
205 return zero_readpage(page);
206
207 error = gfs2_meta_inode_buffer(ip, &dibh);
208 if (error)
209 return error;
210
211 kaddr = kmap_atomic(page, KM_USER0);
212 memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode),
213 ip->i_di.di_size);
214 memset(kaddr + ip->i_di.di_size, 0, PAGE_CACHE_SIZE - ip->i_di.di_size);
215 kunmap_atomic(page, KM_USER0);
216
217 brelse(dibh);
218
219 SetPageUptodate(page);
220
221 return 0;
222}
223
224
225/**
226 * gfs2_readpage - readpage with locking
227 * @file: The file to read a page for. N.B. This may be NULL if we are
228 * reading an internal file.
229 * @page: The page to read
230 *
231 * Returns: errno
232 */
233
234static int gfs2_readpage(struct file *file, struct page *page)
235{
236 struct gfs2_inode *ip = GFS2_I(page->mapping->host);
237 struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host);
238 struct gfs2_holder gh;
239 int error;
240
241 if (likely(file != &gfs2_internal_file_sentinal)) {
242 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME|GL_AOP, &gh);
243 error = gfs2_glock_nq_m_atime(1, &gh);
244 if (unlikely(error))
245 goto out_unlock;
246 }
247
248 if (gfs2_is_stuffed(ip)) {
249 error = stuffed_readpage(ip, page);
250 unlock_page(page);
251 } else
252 error = mpage_readpage(page, gfs2_get_block);
253
254 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
255 error = -EIO;
256
257 if (file != &gfs2_internal_file_sentinal) {
258 gfs2_glock_dq_m(1, &gh);
259 gfs2_holder_uninit(&gh);
260 }
261out:
262 return error;
263out_unlock:
264 unlock_page(page);
265 if (file != &gfs2_internal_file_sentinal)
266 gfs2_holder_uninit(&gh);
267 goto out;
268}
269
270/**
271 * gfs2_readpages - Read a bunch of pages at once
272 *
273 * Some notes:
274 * 1. This is only for readahead, so we can simply ignore any things
275 * which are slightly inconvenient (such as locking conflicts between
276 * the page lock and the glock) and return having done no I/O. Its
277 * obviously not something we'd want to do on too regular a basis.
278 * Any I/O we ignore at this time will be done via readpage later.
279 * 2. We have to handle stuffed files here too.
280 * 3. mpage_readpages() does most of the heavy lifting in the common case.
281 * 4. gfs2_get_block() is relied upon to set BH_Boundary in the right places.
282 * 5. We use LM_FLAG_TRY_1CB here, effectively we then have lock-ahead as
283 * well as read-ahead.
284 */
285static int gfs2_readpages(struct file *file, struct address_space *mapping,
286 struct list_head *pages, unsigned nr_pages)
287{
288 struct inode *inode = mapping->host;
289 struct gfs2_inode *ip = GFS2_I(inode);
290 struct gfs2_sbd *sdp = GFS2_SB(inode);
291 struct gfs2_holder gh;
292 unsigned page_idx;
293 int ret;
294
295 if (likely(file != &gfs2_internal_file_sentinal)) {
296 gfs2_holder_init(ip->i_gl, LM_ST_SHARED,
297 LM_FLAG_TRY_1CB|GL_ATIME|GL_AOP, &gh);
298 ret = gfs2_glock_nq_m_atime(1, &gh);
299 if (ret == GLR_TRYFAILED)
300 goto out_noerror;
301 if (unlikely(ret))
302 goto out_unlock;
303 }
304
305 if (gfs2_is_stuffed(ip)) {
306 struct pagevec lru_pvec;
307 pagevec_init(&lru_pvec, 0);
308 for (page_idx = 0; page_idx < nr_pages; page_idx++) {
309 struct page *page = list_entry(pages->prev, struct page, lru);
310 prefetchw(&page->flags);
311 list_del(&page->lru);
312 if (!add_to_page_cache(page, mapping,
313 page->index, GFP_KERNEL)) {
314 ret = stuffed_readpage(ip, page);
315 unlock_page(page);
316 if (!pagevec_add(&lru_pvec, page))
317 __pagevec_lru_add(&lru_pvec);
318 } else {
319 page_cache_release(page);
320 }
321 }
322 pagevec_lru_add(&lru_pvec);
323 ret = 0;
324 } else {
325 /* What we really want to do .... */
326 ret = mpage_readpages(mapping, pages, nr_pages, gfs2_get_block);
327 }
328
329 if (likely(file != &gfs2_internal_file_sentinal)) {
330 gfs2_glock_dq_m(1, &gh);
331 gfs2_holder_uninit(&gh);
332 }
333out:
334 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
335 ret = -EIO;
336 return ret;
337out_noerror:
338 ret = 0;
339out_unlock:
340 /* unlock all pages, we can't do any I/O right now */
341 for (page_idx = 0; page_idx < nr_pages; page_idx++) {
342 struct page *page = list_entry(pages->prev, struct page, lru);
343 list_del(&page->lru);
344 unlock_page(page);
345 page_cache_release(page);
346 }
347 if (likely(file != &gfs2_internal_file_sentinal))
348 gfs2_holder_uninit(&gh);
349 goto out;
350}
351
352/**
353 * gfs2_prepare_write - Prepare to write a page to a file
354 * @file: The file to write to
355 * @page: The page which is to be prepared for writing
356 * @from: From (byte range within page)
357 * @to: To (byte range within page)
358 *
359 * Returns: errno
360 */
361
362static int gfs2_prepare_write(struct file *file, struct page *page,
363 unsigned from, unsigned to)
364{
365 struct gfs2_inode *ip = GFS2_I(page->mapping->host);
366 struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host);
367 unsigned int data_blocks, ind_blocks, rblocks;
368 int alloc_required;
369 int error = 0;
370 loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + from;
371 loff_t end = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
372 struct gfs2_alloc *al;
373
374 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_ATIME|GL_AOP, &ip->i_gh);
375 error = gfs2_glock_nq_m_atime(1, &ip->i_gh);
376 if (error)
377 goto out_uninit;
378
379 gfs2_write_calc_reserv(ip, to - from, &data_blocks, &ind_blocks);
380
381 error = gfs2_write_alloc_required(ip, pos, from - to, &alloc_required);
382 if (error)
383 goto out_unlock;
384
385
386 if (alloc_required) {
387 al = gfs2_alloc_get(ip);
388
389 error = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
390 if (error)
391 goto out_alloc_put;
392
393 error = gfs2_quota_check(ip, ip->i_di.di_uid, ip->i_di.di_gid);
394 if (error)
395 goto out_qunlock;
396
397 al->al_requested = data_blocks + ind_blocks;
398 error = gfs2_inplace_reserve(ip);
399 if (error)
400 goto out_qunlock;
401 }
402
403 rblocks = RES_DINODE + ind_blocks;
404 if (gfs2_is_jdata(ip))
405 rblocks += data_blocks ? data_blocks : 1;
406 if (ind_blocks || data_blocks)
407 rblocks += RES_STATFS + RES_QUOTA;
408
409 error = gfs2_trans_begin(sdp, rblocks, 0);
410 if (error)
411 goto out;
412
413 if (gfs2_is_stuffed(ip)) {
414 if (end > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) {
415 error = gfs2_unstuff_dinode(ip, page);
416 if (error == 0)
417 goto prepare_write;
418 } else if (!PageUptodate(page))
419 error = stuffed_readpage(ip, page);
420 goto out;
421 }
422
423prepare_write:
424 error = block_prepare_write(page, from, to, gfs2_get_block);
425
426out:
427 if (error) {
428 gfs2_trans_end(sdp);
429 if (alloc_required) {
430 gfs2_inplace_release(ip);
431out_qunlock:
432 gfs2_quota_unlock(ip);
433out_alloc_put:
434 gfs2_alloc_put(ip);
435 }
436out_unlock:
437 gfs2_glock_dq_m(1, &ip->i_gh);
438out_uninit:
439 gfs2_holder_uninit(&ip->i_gh);
440 }
441
442 return error;
443}
444
445/**
446 * gfs2_commit_write - Commit write to a file
447 * @file: The file to write to
448 * @page: The page containing the data
449 * @from: From (byte range within page)
450 * @to: To (byte range within page)
451 *
452 * Returns: errno
453 */
454
455static int gfs2_commit_write(struct file *file, struct page *page,
456 unsigned from, unsigned to)
457{
458 struct inode *inode = page->mapping->host;
459 struct gfs2_inode *ip = GFS2_I(inode);
460 struct gfs2_sbd *sdp = GFS2_SB(inode);
461 int error = -EOPNOTSUPP;
462 struct buffer_head *dibh;
463 struct gfs2_alloc *al = &ip->i_alloc;;
464
465 if (gfs2_assert_withdraw(sdp, gfs2_glock_is_locked_by_me(ip->i_gl)))
466 goto fail_nounlock;
467
468 error = gfs2_meta_inode_buffer(ip, &dibh);
469 if (error)
470 goto fail_endtrans;
471
472 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
473
474 if (gfs2_is_stuffed(ip)) {
475 uint64_t file_size;
476 void *kaddr;
477
478 file_size = ((uint64_t)page->index << PAGE_CACHE_SHIFT) + to;
479
480 kaddr = kmap_atomic(page, KM_USER0);
481 memcpy(dibh->b_data + sizeof(struct gfs2_dinode) + from,
482 (char *)kaddr + from, to - from);
483 kunmap_atomic(page, KM_USER0);
484
485 SetPageUptodate(page);
486
487 if (inode->i_size < file_size)
488 i_size_write(inode, file_size);
489 } else {
490 if (sdp->sd_args.ar_data == GFS2_DATA_ORDERED ||
491 gfs2_is_jdata(ip))
492 gfs2_page_add_databufs(ip, page, from, to);
493 error = generic_commit_write(file, page, from, to);
494 if (error)
495 goto fail;
496 }
497
498 if (ip->i_di.di_size < inode->i_size)
499 ip->i_di.di_size = inode->i_size;
500
501 gfs2_dinode_out(&ip->i_di, dibh->b_data);
502 brelse(dibh);
503 gfs2_trans_end(sdp);
504 if (al->al_requested) {
505 gfs2_inplace_release(ip);
506 gfs2_quota_unlock(ip);
507 gfs2_alloc_put(ip);
508 }
509 gfs2_glock_dq_m(1, &ip->i_gh);
510 gfs2_holder_uninit(&ip->i_gh);
511 return 0;
512
513fail:
514 brelse(dibh);
515fail_endtrans:
516 gfs2_trans_end(sdp);
517 if (al->al_requested) {
518 gfs2_inplace_release(ip);
519 gfs2_quota_unlock(ip);
520 gfs2_alloc_put(ip);
521 }
522 gfs2_glock_dq_m(1, &ip->i_gh);
523 gfs2_holder_uninit(&ip->i_gh);
524fail_nounlock:
525 ClearPageUptodate(page);
526 return error;
527}
528
529/**
530 * gfs2_bmap - Block map function
531 * @mapping: Address space info
532 * @lblock: The block to map
533 *
534 * Returns: The disk address for the block or 0 on hole or error
535 */
536
537static sector_t gfs2_bmap(struct address_space *mapping, sector_t lblock)
538{
539 struct gfs2_inode *ip = GFS2_I(mapping->host);
540 struct gfs2_holder i_gh;
541 sector_t dblock = 0;
542 int error;
543
544 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
545 if (error)
546 return 0;
547
548 if (!gfs2_is_stuffed(ip))
549 dblock = generic_block_bmap(mapping, lblock, gfs2_get_block);
550
551 gfs2_glock_dq_uninit(&i_gh);
552
553 return dblock;
554}
555
556static void discard_buffer(struct gfs2_sbd *sdp, struct buffer_head *bh)
557{
558 struct gfs2_bufdata *bd;
559
560 gfs2_log_lock(sdp);
561 bd = bh->b_private;
562 if (bd) {
563 bd->bd_bh = NULL;
564 bh->b_private = NULL;
565 gfs2_log_unlock(sdp);
566 brelse(bh);
567 } else
568 gfs2_log_unlock(sdp);
569
570 lock_buffer(bh);
571 clear_buffer_dirty(bh);
572 bh->b_bdev = NULL;
573 clear_buffer_mapped(bh);
574 clear_buffer_req(bh);
575 clear_buffer_new(bh);
576 clear_buffer_delay(bh);
577 unlock_buffer(bh);
578}
579
580static void gfs2_invalidatepage(struct page *page, unsigned long offset)
581{
582 struct gfs2_sbd *sdp = page->mapping->host->i_sb->s_fs_info;
583 struct buffer_head *head, *bh, *next;
584 unsigned int curr_off = 0;
585
586 BUG_ON(!PageLocked(page));
587 if (!page_has_buffers(page))
588 return;
589
590 bh = head = page_buffers(page);
591 do {
592 unsigned int next_off = curr_off + bh->b_size;
593 next = bh->b_this_page;
594
595 if (offset <= curr_off)
596 discard_buffer(sdp, bh);
597
598 curr_off = next_off;
599 bh = next;
600 } while (bh != head);
601
602 if (!offset)
603 try_to_release_page(page, 0);
604
605 return;
606}
607
608static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb,
609 const struct iovec *iov, loff_t offset,
610 unsigned long nr_segs)
611{
612 struct file *file = iocb->ki_filp;
613 struct inode *inode = file->f_mapping->host;
614 struct gfs2_inode *ip = GFS2_I(inode);
615 struct gfs2_holder gh;
616 int rv;
617
618 if (rw == READ)
619 mutex_lock(&inode->i_mutex);
620 /*
621 * Shared lock, even if its a write, since we do no allocation
622 * on this path. All we need change is atime.
623 */
624 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &gh);
625 rv = gfs2_glock_nq_m_atime(1, &gh);
626 if (rv)
627 goto out;
628
629 if (offset > i_size_read(inode))
630 goto out;
631
632 /*
633 * Should we return an error here? I can't see that O_DIRECT for
634 * a journaled file makes any sense. For now we'll silently fall
635 * back to buffered I/O, likewise we do the same for stuffed
636 * files since they are (a) small and (b) unaligned.
637 */
638 if (gfs2_is_jdata(ip))
639 goto out;
640
641 if (gfs2_is_stuffed(ip))
642 goto out;
643
644 rv = blockdev_direct_IO_own_locking(rw, iocb, inode,
645 inode->i_sb->s_bdev,
646 iov, offset, nr_segs,
647 gfs2_get_block, NULL);
648out:
649 gfs2_glock_dq_m(1, &gh);
650 gfs2_holder_uninit(&gh);
651 if (rw == READ)
652 mutex_unlock(&inode->i_mutex);
653
654 return rv;
655}
656
657/**
658 * stuck_releasepage - We're stuck in gfs2_releasepage(). Print stuff out.
659 * @bh: the buffer we're stuck on
660 *
661 */
662
663static void stuck_releasepage(struct buffer_head *bh)
664{
665 struct inode *inode = bh->b_page->mapping->host;
666 struct gfs2_sbd *sdp = inode->i_sb->s_fs_info;
667 struct gfs2_bufdata *bd = bh->b_private;
668 struct gfs2_glock *gl;
669
670 fs_warn(sdp, "stuck in gfs2_releasepage() %p\n", inode);
671 fs_warn(sdp, "blkno = %llu, bh->b_count = %d\n",
672 (unsigned long long)bh->b_blocknr, atomic_read(&bh->b_count));
673 fs_warn(sdp, "pinned = %u\n", buffer_pinned(bh));
674 fs_warn(sdp, "bh->b_private = %s\n", (bd) ? "!NULL" : "NULL");
675
676 if (!bd)
677 return;
678
679 gl = bd->bd_gl;
680
681 fs_warn(sdp, "gl = (%u, %llu)\n",
682 gl->gl_name.ln_type, (unsigned long long)gl->gl_name.ln_number);
683
684 fs_warn(sdp, "bd_list_tr = %s, bd_le.le_list = %s\n",
685 (list_empty(&bd->bd_list_tr)) ? "no" : "yes",
686 (list_empty(&bd->bd_le.le_list)) ? "no" : "yes");
687
688 if (gl->gl_ops == &gfs2_inode_glops) {
689 struct gfs2_inode *ip = gl->gl_object;
690 unsigned int x;
691
692 if (!ip)
693 return;
694
695 fs_warn(sdp, "ip = %llu %llu\n",
696 (unsigned long long)ip->i_num.no_formal_ino,
697 (unsigned long long)ip->i_num.no_addr);
698
699 for (x = 0; x < GFS2_MAX_META_HEIGHT; x++)
700 fs_warn(sdp, "ip->i_cache[%u] = %s\n",
701 x, (ip->i_cache[x]) ? "!NULL" : "NULL");
702 }
703}
704
705/**
706 * gfs2_aspace_releasepage - free the metadata associated with a page
707 * @page: the page that's being released
708 * @gfp_mask: passed from Linux VFS, ignored by us
709 *
710 * Call try_to_free_buffers() if the buffers in this page can be
711 * released.
712 *
713 * Returns: 0
714 */
715
716int gfs2_releasepage(struct page *page, gfp_t gfp_mask)
717{
718 struct inode *aspace = page->mapping->host;
719 struct gfs2_sbd *sdp = aspace->i_sb->s_fs_info;
720 struct buffer_head *bh, *head;
721 struct gfs2_bufdata *bd;
722 unsigned long t;
723
724 if (!page_has_buffers(page))
725 goto out;
726
727 head = bh = page_buffers(page);
728 do {
729 t = jiffies;
730
731 while (atomic_read(&bh->b_count)) {
732 if (atomic_read(&aspace->i_writecount)) {
733 if (time_after_eq(jiffies, t +
734 gfs2_tune_get(sdp, gt_stall_secs) * HZ)) {
735 stuck_releasepage(bh);
736 t = jiffies;
737 }
738
739 yield();
740 continue;
741 }
742
743 return 0;
744 }
745
746 gfs2_assert_warn(sdp, !buffer_pinned(bh));
747
748 bd = bh->b_private;
749 if (bd) {
750 gfs2_assert_warn(sdp, bd->bd_bh == bh);
751 gfs2_assert_warn(sdp, list_empty(&bd->bd_list_tr));
752 gfs2_assert_warn(sdp, list_empty(&bd->bd_le.le_list));
753 gfs2_assert_warn(sdp, !bd->bd_ail);
754 kmem_cache_free(gfs2_bufdata_cachep, bd);
755 bh->b_private = NULL;
756 }
757
758 bh = bh->b_this_page;
759 }
760 while (bh != head);
761
762out:
763 return try_to_free_buffers(page);
764}
765
766const struct address_space_operations gfs2_file_aops = {
767 .writepage = gfs2_writepage,
768 .readpage = gfs2_readpage,
769 .readpages = gfs2_readpages,
770 .sync_page = block_sync_page,
771 .prepare_write = gfs2_prepare_write,
772 .commit_write = gfs2_commit_write,
773 .bmap = gfs2_bmap,
774 .invalidatepage = gfs2_invalidatepage,
775 .releasepage = gfs2_releasepage,
776 .direct_IO = gfs2_direct_IO,
777};
778
diff --git a/fs/gfs2/ops_address.h b/fs/gfs2/ops_address.h
new file mode 100644
index 000000000000..dfc3dda6de11
--- /dev/null
+++ b/fs/gfs2/ops_address.h
@@ -0,0 +1,18 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __OPS_ADDRESS_DOT_H__
11#define __OPS_ADDRESS_DOT_H__
12
13extern const struct address_space_operations gfs2_file_aops;
14extern int gfs2_get_block(struct inode *inode, sector_t lblock,
15 struct buffer_head *bh_result, int create);
16extern int gfs2_releasepage(struct page *page, gfp_t gfp_mask);
17
18#endif /* __OPS_ADDRESS_DOT_H__ */
diff --git a/fs/gfs2/ops_dentry.c b/fs/gfs2/ops_dentry.c
new file mode 100644
index 000000000000..fd55979ec428
--- /dev/null
+++ b/fs/gfs2/ops_dentry.c
@@ -0,0 +1,123 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/smp_lock.h>
16#include <linux/gfs2_ondisk.h>
17#include <linux/crc32.h>
18
19#include "gfs2.h"
20#include "lm_interface.h"
21#include "incore.h"
22#include "dir.h"
23#include "glock.h"
24#include "ops_dentry.h"
25#include "util.h"
26
27/**
28 * gfs2_drevalidate - Check directory lookup consistency
29 * @dentry: the mapping to check
30 * @nd:
31 *
32 * Check to make sure the lookup necessary to arrive at this inode from its
33 * parent is still good.
34 *
35 * Returns: 1 if the dentry is ok, 0 if it isn't
36 */
37
38static int gfs2_drevalidate(struct dentry *dentry, struct nameidata *nd)
39{
40 struct dentry *parent = dget_parent(dentry);
41 struct gfs2_sbd *sdp = GFS2_SB(parent->d_inode);
42 struct gfs2_inode *dip = GFS2_I(parent->d_inode);
43 struct inode *inode = dentry->d_inode;
44 struct gfs2_holder d_gh;
45 struct gfs2_inode *ip;
46 struct gfs2_inum inum;
47 unsigned int type;
48 int error;
49
50 if (inode && is_bad_inode(inode))
51 goto invalid;
52
53 if (sdp->sd_args.ar_localcaching)
54 goto valid;
55
56 error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh);
57 if (error)
58 goto fail;
59
60 error = gfs2_dir_search(parent->d_inode, &dentry->d_name, &inum, &type);
61 switch (error) {
62 case 0:
63 if (!inode)
64 goto invalid_gunlock;
65 break;
66 case -ENOENT:
67 if (!inode)
68 goto valid_gunlock;
69 goto invalid_gunlock;
70 default:
71 goto fail_gunlock;
72 }
73
74 ip = GFS2_I(inode);
75
76 if (!gfs2_inum_equal(&ip->i_num, &inum))
77 goto invalid_gunlock;
78
79 if (IF2DT(ip->i_di.di_mode) != type) {
80 gfs2_consist_inode(dip);
81 goto fail_gunlock;
82 }
83
84 valid_gunlock:
85 gfs2_glock_dq_uninit(&d_gh);
86
87 valid:
88 dput(parent);
89 return 1;
90
91 invalid_gunlock:
92 gfs2_glock_dq_uninit(&d_gh);
93
94 invalid:
95 if (inode && S_ISDIR(inode->i_mode)) {
96 if (have_submounts(dentry))
97 goto valid;
98 shrink_dcache_parent(dentry);
99 }
100 d_drop(dentry);
101
102 dput(parent);
103 return 0;
104
105 fail_gunlock:
106 gfs2_glock_dq_uninit(&d_gh);
107
108 fail:
109 dput(parent);
110 return 0;
111}
112
113static int gfs2_dhash(struct dentry *dentry, struct qstr *str)
114{
115 str->hash = gfs2_disk_hash(str->name, str->len);
116 return 0;
117}
118
119struct dentry_operations gfs2_dops = {
120 .d_revalidate = gfs2_drevalidate,
121 .d_hash = gfs2_dhash,
122};
123
diff --git a/fs/gfs2/ops_dentry.h b/fs/gfs2/ops_dentry.h
new file mode 100644
index 000000000000..1b6e75c0a4a7
--- /dev/null
+++ b/fs/gfs2/ops_dentry.h
@@ -0,0 +1,15 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __OPS_DENTRY_DOT_H__
11#define __OPS_DENTRY_DOT_H__
12
/* Dentry operations table for GFS2 */
extern struct dentry_operations gfs2_dops;
14
15#endif /* __OPS_DENTRY_DOT_H__ */
diff --git a/fs/gfs2/ops_export.c b/fs/gfs2/ops_export.c
new file mode 100644
index 000000000000..6354f4799e68
--- /dev/null
+++ b/fs/gfs2/ops_export.c
@@ -0,0 +1,293 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/gfs2_ondisk.h>
16#include <linux/crc32.h>
17
18#include "gfs2.h"
19#include "lm_interface.h"
20#include "incore.h"
21#include "dir.h"
22#include "glock.h"
23#include "glops.h"
24#include "inode.h"
25#include "ops_export.h"
26#include "rgrp.h"
27#include "util.h"
28
29static struct dentry *gfs2_decode_fh(struct super_block *sb,
30 __u32 *fh,
31 int fh_len,
32 int fh_type,
33 int (*acceptable)(void *context,
34 struct dentry *dentry),
35 void *context)
36{
37 struct gfs2_fh_obj fh_obj;
38 struct gfs2_inum *this, parent;
39
40 if (fh_type != fh_len)
41 return NULL;
42
43 this = &fh_obj.this;
44 fh_obj.imode = DT_UNKNOWN;
45 memset(&parent, 0, sizeof(struct gfs2_inum));
46
47 switch (fh_type) {
48 case 10:
49 parent.no_formal_ino = ((uint64_t)be32_to_cpu(fh[4])) << 32;
50 parent.no_formal_ino |= be32_to_cpu(fh[5]);
51 parent.no_addr = ((uint64_t)be32_to_cpu(fh[6])) << 32;
52 parent.no_addr |= be32_to_cpu(fh[7]);
53 fh_obj.imode = be32_to_cpu(fh[8]);
54 case 4:
55 this->no_formal_ino = ((uint64_t)be32_to_cpu(fh[0])) << 32;
56 this->no_formal_ino |= be32_to_cpu(fh[1]);
57 this->no_addr = ((uint64_t)be32_to_cpu(fh[2])) << 32;
58 this->no_addr |= be32_to_cpu(fh[3]);
59 break;
60 default:
61 return NULL;
62 }
63
64 return gfs2_export_ops.find_exported_dentry(sb, &fh_obj, &parent,
65 acceptable, context);
66}
67
68static int gfs2_encode_fh(struct dentry *dentry, __u32 *fh, int *len,
69 int connectable)
70{
71 struct inode *inode = dentry->d_inode;
72 struct super_block *sb = inode->i_sb;
73 struct gfs2_inode *ip = GFS2_I(inode);
74
75 if (*len < 4 || (connectable && *len < 10))
76 return 255;
77
78 fh[0] = ip->i_num.no_formal_ino >> 32;
79 fh[0] = cpu_to_be32(fh[0]);
80 fh[1] = ip->i_num.no_formal_ino & 0xFFFFFFFF;
81 fh[1] = cpu_to_be32(fh[1]);
82 fh[2] = ip->i_num.no_addr >> 32;
83 fh[2] = cpu_to_be32(fh[2]);
84 fh[3] = ip->i_num.no_addr & 0xFFFFFFFF;
85 fh[3] = cpu_to_be32(fh[3]);
86 *len = 4;
87
88 if (!connectable || inode == sb->s_root->d_inode)
89 return *len;
90
91 spin_lock(&dentry->d_lock);
92 inode = dentry->d_parent->d_inode;
93 ip = GFS2_I(inode);
94 igrab(inode);
95 spin_unlock(&dentry->d_lock);
96
97 fh[4] = ip->i_num.no_formal_ino >> 32;
98 fh[4] = cpu_to_be32(fh[4]);
99 fh[5] = ip->i_num.no_formal_ino & 0xFFFFFFFF;
100 fh[5] = cpu_to_be32(fh[5]);
101 fh[6] = ip->i_num.no_addr >> 32;
102 fh[6] = cpu_to_be32(fh[6]);
103 fh[7] = ip->i_num.no_addr & 0xFFFFFFFF;
104 fh[7] = cpu_to_be32(fh[7]);
105
106 fh[8] = cpu_to_be32(inode->i_mode);
107 fh[9] = 0; /* pad to double word */
108 *len = 10;
109
110 iput(inode);
111
112 return *len;
113}
114
115struct get_name_filldir {
116 struct gfs2_inum inum;
117 char *name;
118};
119
120static int get_name_filldir(void *opaque, const char *name, unsigned int length,
121 uint64_t offset, struct gfs2_inum *inum,
122 unsigned int type)
123{
124 struct get_name_filldir *gnfd = (struct get_name_filldir *)opaque;
125
126 if (!gfs2_inum_equal(inum, &gnfd->inum))
127 return 0;
128
129 memcpy(gnfd->name, name, length);
130 gnfd->name[length] = 0;
131
132 return 1;
133}
134
135static int gfs2_get_name(struct dentry *parent, char *name,
136 struct dentry *child)
137{
138 struct inode *dir = parent->d_inode;
139 struct inode *inode = child->d_inode;
140 struct gfs2_inode *dip, *ip;
141 struct get_name_filldir gnfd;
142 struct gfs2_holder gh;
143 uint64_t offset = 0;
144 int error;
145
146 if (!dir)
147 return -EINVAL;
148
149 if (!S_ISDIR(dir->i_mode) || !inode)
150 return -EINVAL;
151
152 dip = GFS2_I(dir);
153 ip = GFS2_I(inode);
154
155 *name = 0;
156 gnfd.inum = ip->i_num;
157 gnfd.name = name;
158
159 error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &gh);
160 if (error)
161 return error;
162
163 error = gfs2_dir_read(dir, &offset, &gnfd, get_name_filldir);
164
165 gfs2_glock_dq_uninit(&gh);
166
167 if (!error && !*name)
168 error = -ENOENT;
169
170 return error;
171}
172
173static struct dentry *gfs2_get_parent(struct dentry *child)
174{
175 struct qstr dotdot;
176 struct inode *inode;
177 struct dentry *dentry;
178
179 gfs2_str2qstr(&dotdot, "..");
180 inode = gfs2_lookupi(child->d_inode, &dotdot, 1, NULL);
181
182 if (!inode)
183 return ERR_PTR(-ENOENT);
184 if (IS_ERR(inode))
185 return ERR_PTR(PTR_ERR(inode));
186
187 dentry = d_alloc_anon(inode);
188 if (!dentry) {
189 iput(inode);
190 return ERR_PTR(-ENOMEM);
191 }
192
193 return dentry;
194}
195
196static struct dentry *gfs2_get_dentry(struct super_block *sb, void *inum_obj)
197{
198 struct gfs2_sbd *sdp = sb->s_fs_info;
199 struct gfs2_fh_obj *fh_obj = (struct gfs2_fh_obj *)inum_obj;
200 struct gfs2_inum *inum = &fh_obj->this;
201 struct gfs2_holder i_gh, ri_gh, rgd_gh;
202 struct gfs2_rgrpd *rgd;
203 struct inode *inode;
204 struct dentry *dentry;
205 int error;
206
207 /* System files? */
208
209 inode = gfs2_ilookup(sb, inum);
210 if (inode) {
211 if (GFS2_I(inode)->i_num.no_formal_ino != inum->no_formal_ino) {
212 iput(inode);
213 return ERR_PTR(-ESTALE);
214 }
215 goto out_inode;
216 }
217
218 error = gfs2_glock_nq_num(sdp, inum->no_addr, &gfs2_inode_glops,
219 LM_ST_SHARED, LM_FLAG_ANY | GL_LOCAL_EXCL,
220 &i_gh);
221 if (error)
222 return ERR_PTR(error);
223
224 error = gfs2_rindex_hold(sdp, &ri_gh);
225 if (error)
226 goto fail;
227
228 error = -EINVAL;
229 rgd = gfs2_blk2rgrpd(sdp, inum->no_addr);
230 if (!rgd)
231 goto fail_rindex;
232
233 error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_SHARED, 0, &rgd_gh);
234 if (error)
235 goto fail_rindex;
236
237 error = -ESTALE;
238 if (gfs2_get_block_type(rgd, inum->no_addr) != GFS2_BLKST_DINODE)
239 goto fail_rgd;
240
241 gfs2_glock_dq_uninit(&rgd_gh);
242 gfs2_glock_dq_uninit(&ri_gh);
243
244 inode = gfs2_inode_lookup(sb, inum, fh_obj->imode);
245 if (!inode)
246 goto fail;
247 if (IS_ERR(inode)) {
248 error = PTR_ERR(inode);
249 goto fail;
250 }
251
252 error = gfs2_inode_refresh(GFS2_I(inode));
253 if (error) {
254 iput(inode);
255 goto fail;
256 }
257
258 error = -EIO;
259 if (GFS2_I(inode)->i_di.di_flags & GFS2_DIF_SYSTEM) {
260 iput(inode);
261 goto fail;
262 }
263
264 gfs2_glock_dq_uninit(&i_gh);
265
266out_inode:
267 dentry = d_alloc_anon(inode);
268 if (!dentry) {
269 iput(inode);
270 return ERR_PTR(-ENOMEM);
271 }
272
273 return dentry;
274
275fail_rgd:
276 gfs2_glock_dq_uninit(&rgd_gh);
277
278fail_rindex:
279 gfs2_glock_dq_uninit(&ri_gh);
280
281fail:
282 gfs2_glock_dq_uninit(&i_gh);
283 return ERR_PTR(error);
284}
285
286struct export_operations gfs2_export_ops = {
287 .decode_fh = gfs2_decode_fh,
288 .encode_fh = gfs2_encode_fh,
289 .get_name = gfs2_get_name,
290 .get_parent = gfs2_get_parent,
291 .get_dentry = gfs2_get_dentry,
292};
293
diff --git a/fs/gfs2/ops_export.h b/fs/gfs2/ops_export.h
new file mode 100644
index 000000000000..09fc077657d1
--- /dev/null
+++ b/fs/gfs2/ops_export.h
@@ -0,0 +1,19 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __OPS_EXPORT_DOT_H__
11#define __OPS_EXPORT_DOT_H__
12
13extern struct export_operations gfs2_export_ops;
14struct gfs2_fh_obj {
15 struct gfs2_inum this;
16 __u32 imode;
17};
18
19#endif /* __OPS_EXPORT_DOT_H__ */
diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c
new file mode 100644
index 000000000000..d13e04e8a96a
--- /dev/null
+++ b/fs/gfs2/ops_file.c
@@ -0,0 +1,815 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/pagemap.h>
16#include <linux/uio.h>
17#include <linux/blkdev.h>
18#include <linux/mm.h>
19#include <linux/smp_lock.h>
20#include <linux/fs.h>
21#include <linux/gfs2_ondisk.h>
22#include <linux/ext2_fs.h>
23#include <linux/crc32.h>
24#include <linux/iflags.h>
25#include <asm/uaccess.h>
26
27#include "gfs2.h"
28#include "lm_interface.h"
29#include "incore.h"
30#include "bmap.h"
31#include "dir.h"
32#include "glock.h"
33#include "glops.h"
34#include "inode.h"
35#include "lm.h"
36#include "log.h"
37#include "meta_io.h"
38#include "ops_file.h"
39#include "ops_vm.h"
40#include "quota.h"
41#include "rgrp.h"
42#include "trans.h"
43#include "util.h"
44#include "eaops.h"
45
46/* "bad" is for NFS support */
47struct filldir_bad_entry {
48 char *fbe_name;
49 unsigned int fbe_length;
50 uint64_t fbe_offset;
51 struct gfs2_inum fbe_inum;
52 unsigned int fbe_type;
53};
54
/*
 * Buffering state for readdir_bad().  The structure is allocated together
 * with its entry array and its name storage in a single kzalloc() call;
 * fdb_entry and fdb_name point into that same allocation.
 */
struct filldir_bad {
	struct gfs2_sbd *fdb_sbd;

	/* Array of buffered entries: capacity and number used so far */
	struct filldir_bad_entry *fdb_entry;
	unsigned int fdb_entry_num;
	unsigned int fdb_entry_off;

	/* Storage for the entry names: capacity and bytes used so far */
	char *fdb_name;
	unsigned int fdb_name_size;
	unsigned int fdb_name_off;
};
66
67/* For regular, non-NFS */
68struct filldir_reg {
69 struct gfs2_sbd *fdr_sbd;
70 int fdr_prefetch;
71
72 filldir_t fdr_filldir;
73 void *fdr_opaque;
74};
75
76/*
77 * Most fields left uninitialised to catch anybody who tries to
78 * use them. f_flags set to prevent file_accessed() from touching
79 * any other part of this. Its use is purely as a flag so that we
80 * know (in readpage()) whether or not do to locking.
81 */
82struct file gfs2_internal_file_sentinal = {
83 .f_flags = O_NOATIME|O_RDONLY,
84};
85
86static int gfs2_read_actor(read_descriptor_t *desc, struct page *page,
87 unsigned long offset, unsigned long size)
88{
89 char *kaddr;
90 unsigned long count = desc->count;
91
92 if (size > count)
93 size = count;
94
95 kaddr = kmap(page);
96 memcpy(desc->arg.buf, kaddr + offset, size);
97 kunmap(page);
98
99 desc->count = count - size;
100 desc->written += size;
101 desc->arg.buf += size;
102 return size;
103}
104
105int gfs2_internal_read(struct gfs2_inode *ip, struct file_ra_state *ra_state,
106 char *buf, loff_t *pos, unsigned size)
107{
108 struct inode *inode = &ip->i_inode;
109 read_descriptor_t desc;
110 desc.written = 0;
111 desc.arg.buf = buf;
112 desc.count = size;
113 desc.error = 0;
114 do_generic_mapping_read(inode->i_mapping, ra_state,
115 &gfs2_internal_file_sentinal, pos, &desc,
116 gfs2_read_actor);
117 return desc.written ? desc.written : desc.error;
118}
119
120/**
121 * gfs2_llseek - seek to a location in a file
122 * @file: the file
123 * @offset: the offset
124 * @origin: Where to seek from (SEEK_SET, SEEK_CUR, or SEEK_END)
125 *
126 * SEEK_END requires the glock for the file because it references the
127 * file's size.
128 *
129 * Returns: The new offset, or errno
130 */
131
132static loff_t gfs2_llseek(struct file *file, loff_t offset, int origin)
133{
134 struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
135 struct gfs2_holder i_gh;
136 loff_t error;
137
138 if (origin == 2) {
139 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY,
140 &i_gh);
141 if (!error) {
142 error = remote_llseek(file, offset, origin);
143 gfs2_glock_dq_uninit(&i_gh);
144 }
145 } else
146 error = remote_llseek(file, offset, origin);
147
148 return error;
149}
150
151/**
152 * filldir_reg_func - Report a directory entry to the caller of gfs2_dir_read()
153 * @opaque: opaque data used by the function
154 * @name: the name of the directory entry
155 * @length: the length of the name
156 * @offset: the entry's offset in the directory
157 * @inum: the inode number the entry points to
158 * @type: the type of inode the entry points to
159 *
160 * Returns: 0 on success, 1 if buffer full
161 */
162
163static int filldir_reg_func(void *opaque, const char *name, unsigned int length,
164 uint64_t offset, struct gfs2_inum *inum,
165 unsigned int type)
166{
167 struct filldir_reg *fdr = (struct filldir_reg *)opaque;
168 struct gfs2_sbd *sdp = fdr->fdr_sbd;
169 int error;
170
171 error = fdr->fdr_filldir(fdr->fdr_opaque, name, length, offset,
172 inum->no_addr, type);
173 if (error)
174 return 1;
175
176 if (fdr->fdr_prefetch && !(length == 1 && *name == '.')) {
177 gfs2_glock_prefetch_num(sdp,
178 inum->no_addr, &gfs2_inode_glops,
179 LM_ST_SHARED, LM_FLAG_TRY | LM_FLAG_ANY);
180 gfs2_glock_prefetch_num(sdp,
181 inum->no_addr, &gfs2_iopen_glops,
182 LM_ST_SHARED, LM_FLAG_TRY);
183 }
184
185 return 0;
186}
187
188/**
189 * readdir_reg - Read directory entries from a directory
190 * @file: The directory to read from
191 * @dirent: Buffer for dirents
192 * @filldir: Function used to do the copying
193 *
194 * Returns: errno
195 */
196
197static int readdir_reg(struct file *file, void *dirent, filldir_t filldir)
198{
199 struct inode *dir = file->f_mapping->host;
200 struct gfs2_inode *dip = GFS2_I(dir);
201 struct filldir_reg fdr;
202 struct gfs2_holder d_gh;
203 uint64_t offset = file->f_pos;
204 int error;
205
206 fdr.fdr_sbd = GFS2_SB(dir);
207 fdr.fdr_prefetch = 1;
208 fdr.fdr_filldir = filldir;
209 fdr.fdr_opaque = dirent;
210
211 gfs2_holder_init(dip->i_gl, LM_ST_SHARED, GL_ATIME, &d_gh);
212 error = gfs2_glock_nq_atime(&d_gh);
213 if (error) {
214 gfs2_holder_uninit(&d_gh);
215 return error;
216 }
217
218 error = gfs2_dir_read(dir, &offset, &fdr, filldir_reg_func);
219
220 gfs2_glock_dq_uninit(&d_gh);
221
222 file->f_pos = offset;
223
224 return error;
225}
226
227/**
228 * filldir_bad_func - Report a directory entry to the caller of gfs2_dir_read()
229 * @opaque: opaque data used by the function
230 * @name: the name of the directory entry
231 * @length: the length of the name
232 * @offset: the entry's offset in the directory
233 * @inum: the inode number the entry points to
234 * @type: the type of inode the entry points to
235 *
236 * For supporting NFS.
237 *
238 * Returns: 0 on success, 1 if buffer full
239 */
240
241static int filldir_bad_func(void *opaque, const char *name, unsigned int length,
242 uint64_t offset, struct gfs2_inum *inum,
243 unsigned int type)
244{
245 struct filldir_bad *fdb = (struct filldir_bad *)opaque;
246 struct gfs2_sbd *sdp = fdb->fdb_sbd;
247 struct filldir_bad_entry *fbe;
248
249 if (fdb->fdb_entry_off == fdb->fdb_entry_num ||
250 fdb->fdb_name_off + length > fdb->fdb_name_size)
251 return 1;
252
253 fbe = &fdb->fdb_entry[fdb->fdb_entry_off];
254 fbe->fbe_name = fdb->fdb_name + fdb->fdb_name_off;
255 memcpy(fbe->fbe_name, name, length);
256 fbe->fbe_length = length;
257 fbe->fbe_offset = offset;
258 fbe->fbe_inum = *inum;
259 fbe->fbe_type = type;
260
261 fdb->fdb_entry_off++;
262 fdb->fdb_name_off += length;
263
264 if (!(length == 1 && *name == '.')) {
265 gfs2_glock_prefetch_num(sdp,
266 inum->no_addr, &gfs2_inode_glops,
267 LM_ST_SHARED, LM_FLAG_TRY | LM_FLAG_ANY);
268 gfs2_glock_prefetch_num(sdp,
269 inum->no_addr, &gfs2_iopen_glops,
270 LM_ST_SHARED, LM_FLAG_TRY);
271 }
272
273 return 0;
274}
275
276/**
277 * readdir_bad - Read directory entries from a directory
278 * @file: The directory to read from
279 * @dirent: Buffer for dirents
280 * @filldir: Function used to do the copying
281 *
282 * For supporting NFS.
283 *
284 * Returns: errno
285 */
286
287static int readdir_bad(struct file *file, void *dirent, filldir_t filldir)
288{
289 struct inode *dir = file->f_mapping->host;
290 struct gfs2_inode *dip = GFS2_I(dir);
291 struct gfs2_sbd *sdp = GFS2_SB(dir);
292 struct filldir_reg fdr;
293 unsigned int entries, size;
294 struct filldir_bad *fdb;
295 struct gfs2_holder d_gh;
296 uint64_t offset = file->f_pos;
297 unsigned int x;
298 struct filldir_bad_entry *fbe;
299 int error;
300
301 entries = gfs2_tune_get(sdp, gt_entries_per_readdir);
302 size = sizeof(struct filldir_bad) +
303 entries * (sizeof(struct filldir_bad_entry) + GFS2_FAST_NAME_SIZE);
304
305 fdb = kzalloc(size, GFP_KERNEL);
306 if (!fdb)
307 return -ENOMEM;
308
309 fdb->fdb_sbd = sdp;
310 fdb->fdb_entry = (struct filldir_bad_entry *)(fdb + 1);
311 fdb->fdb_entry_num = entries;
312 fdb->fdb_name = ((char *)fdb) + sizeof(struct filldir_bad) +
313 entries * sizeof(struct filldir_bad_entry);
314 fdb->fdb_name_size = entries * GFS2_FAST_NAME_SIZE;
315
316 gfs2_holder_init(dip->i_gl, LM_ST_SHARED, GL_ATIME, &d_gh);
317 error = gfs2_glock_nq_atime(&d_gh);
318 if (error) {
319 gfs2_holder_uninit(&d_gh);
320 goto out;
321 }
322
323 error = gfs2_dir_read(dir, &offset, fdb, filldir_bad_func);
324
325 gfs2_glock_dq_uninit(&d_gh);
326
327 fdr.fdr_sbd = sdp;
328 fdr.fdr_prefetch = 0;
329 fdr.fdr_filldir = filldir;
330 fdr.fdr_opaque = dirent;
331
332 for (x = 0; x < fdb->fdb_entry_off; x++) {
333 fbe = &fdb->fdb_entry[x];
334
335 error = filldir_reg_func(&fdr,
336 fbe->fbe_name, fbe->fbe_length,
337 fbe->fbe_offset,
338 &fbe->fbe_inum, fbe->fbe_type);
339 if (error) {
340 file->f_pos = fbe->fbe_offset;
341 error = 0;
342 goto out;
343 }
344 }
345
346 file->f_pos = offset;
347
348 out:
349 kfree(fdb);
350
351 return error;
352}
353
354/**
355 * gfs2_readdir - Read directory entries from a directory
356 * @file: The directory to read from
357 * @dirent: Buffer for dirents
358 * @filldir: Function used to do the copying
359 *
360 * Returns: errno
361 */
362
363static int gfs2_readdir(struct file *file, void *dirent, filldir_t filldir)
364{
365 int error;
366
367 if (strcmp(current->comm, "nfsd") != 0)
368 error = readdir_reg(file, dirent, filldir);
369 else
370 error = readdir_bad(file, dirent, filldir);
371
372 return error;
373}
374
375static const u32 iflags_to_gfs2[32] = {
376 [iflag_Sync] = GFS2_DIF_SYNC,
377 [iflag_Immutable] = GFS2_DIF_IMMUTABLE,
378 [iflag_Append] = GFS2_DIF_APPENDONLY,
379 [iflag_NoAtime] = GFS2_DIF_NOATIME,
380 [iflag_Index] = GFS2_DIF_EXHASH,
381 [iflag_JournalData] = GFS2_DIF_JDATA,
382 [iflag_DirectIO] = GFS2_DIF_DIRECTIO,
383};
384
385static const u32 gfs2_to_iflags[32] = {
386 [gfs2fl_Sync] = IFLAG_SYNC,
387 [gfs2fl_Immutable] = IFLAG_IMMUTABLE,
388 [gfs2fl_AppendOnly] = IFLAG_APPEND,
389 [gfs2fl_NoAtime] = IFLAG_NOATIME,
390 [gfs2fl_ExHash] = IFLAG_INDEX,
391 [gfs2fl_Jdata] = IFLAG_JOURNAL_DATA,
392 [gfs2fl_Directio] = IFLAG_DIRECTIO,
393 [gfs2fl_InheritDirectio] = IFLAG_DIRECTIO,
394 [gfs2fl_InheritJdata] = IFLAG_JOURNAL_DATA,
395};
396
397static int gfs2_get_flags(struct file *filp, u32 __user *ptr)
398{
399 struct inode *inode = filp->f_dentry->d_inode;
400 struct gfs2_inode *ip = GFS2_I(inode);
401 struct gfs2_holder gh;
402 int error;
403 u32 iflags;
404
405 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &gh);
406 error = gfs2_glock_nq_m_atime(1, &gh);
407 if (error)
408 return error;
409
410 iflags = iflags_cvt(gfs2_to_iflags, ip->i_di.di_flags);
411 if (put_user(iflags, ptr))
412 error = -EFAULT;
413
414 gfs2_glock_dq_m(1, &gh);
415 gfs2_holder_uninit(&gh);
416 return error;
417}
418
419/* Flags that can be set by user space */
420#define GFS2_FLAGS_USER_SET (GFS2_DIF_JDATA| \
421 GFS2_DIF_DIRECTIO| \
422 GFS2_DIF_IMMUTABLE| \
423 GFS2_DIF_APPENDONLY| \
424 GFS2_DIF_NOATIME| \
425 GFS2_DIF_SYNC| \
426 GFS2_DIF_SYSTEM| \
427 GFS2_DIF_INHERIT_DIRECTIO| \
428 GFS2_DIF_INHERIT_JDATA)
429
430/**
431 * gfs2_set_flags - set flags on an inode
432 * @inode: The inode
433 * @flags: The flags to set
434 * @mask: Indicates which flags are valid
435 *
436 */
437static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask)
438{
439 struct inode *inode = filp->f_dentry->d_inode;
440 struct gfs2_inode *ip = GFS2_I(inode);
441 struct gfs2_sbd *sdp = GFS2_SB(inode);
442 struct buffer_head *bh;
443 struct gfs2_holder gh;
444 int error;
445 u32 new_flags, flags;
446
447 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
448 if (error)
449 return error;
450
451 flags = ip->i_di.di_flags;
452 new_flags = (flags & ~mask) | (reqflags & mask);
453 if ((new_flags ^ flags) == 0)
454 goto out;
455
456 if (S_ISDIR(inode->i_mode)) {
457 if ((new_flags ^ flags) & GFS2_DIF_JDATA)
458 new_flags ^= (GFS2_DIF_JDATA|GFS2_DIF_INHERIT_JDATA);
459 if ((new_flags ^ flags) & GFS2_DIF_DIRECTIO)
460 new_flags ^= (GFS2_DIF_DIRECTIO|GFS2_DIF_INHERIT_DIRECTIO);
461 }
462
463 error = -EINVAL;
464 if ((new_flags ^ flags) & ~GFS2_FLAGS_USER_SET)
465 goto out;
466
467 error = -EPERM;
468 if (IS_IMMUTABLE(inode) && (new_flags & GFS2_DIF_IMMUTABLE))
469 goto out;
470 if (IS_APPEND(inode) && (new_flags & GFS2_DIF_APPENDONLY))
471 goto out;
472 if (((new_flags ^ flags) & GFS2_DIF_IMMUTABLE) &&
473 !capable(CAP_LINUX_IMMUTABLE))
474 goto out;
475 if (!IS_IMMUTABLE(inode)) {
476 error = permission(inode, MAY_WRITE, NULL);
477 if (error)
478 goto out;
479 }
480
481 error = gfs2_trans_begin(sdp, RES_DINODE, 0);
482 if (error)
483 goto out;
484 error = gfs2_meta_inode_buffer(ip, &bh);
485 if (error)
486 goto out_trans_end;
487 gfs2_trans_add_bh(ip->i_gl, bh, 1);
488 ip->i_di.di_flags = new_flags;
489 gfs2_dinode_out(&ip->i_di, bh->b_data);
490 brelse(bh);
491out_trans_end:
492 gfs2_trans_end(sdp);
493out:
494 gfs2_glock_dq_uninit(&gh);
495 return error;
496}
497
498static int gfs2_set_flags(struct file *filp, u32 __user *ptr)
499{
500 u32 iflags, gfsflags;
501 if (get_user(iflags, ptr))
502 return -EFAULT;
503 gfsflags = iflags_cvt(iflags_to_gfs2, iflags);
504 return do_gfs2_set_flags(filp, gfsflags, ~0);
505}
506
507static long gfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
508{
509 switch(cmd) {
510 case IFLAGS_GET_IOC:
511 return gfs2_get_flags(filp, (u32 __user *)arg);
512 case IFLAGS_SET_IOC:
513 return gfs2_set_flags(filp, (u32 __user *)arg);
514 }
515 return -ENOTTY;
516}
517
518
519/**
520 * gfs2_mmap -
521 * @file: The file to map
522 * @vma: The VMA which described the mapping
523 *
524 * Returns: 0 or error code
525 */
526
527static int gfs2_mmap(struct file *file, struct vm_area_struct *vma)
528{
529 struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
530 struct gfs2_holder i_gh;
531 int error;
532
533 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &i_gh);
534 error = gfs2_glock_nq_atime(&i_gh);
535 if (error) {
536 gfs2_holder_uninit(&i_gh);
537 return error;
538 }
539
540 /* This is VM_MAYWRITE instead of VM_WRITE because a call
541 to mprotect() can turn on VM_WRITE later. */
542
543 if ((vma->vm_flags & (VM_MAYSHARE | VM_MAYWRITE)) ==
544 (VM_MAYSHARE | VM_MAYWRITE))
545 vma->vm_ops = &gfs2_vm_ops_sharewrite;
546 else
547 vma->vm_ops = &gfs2_vm_ops_private;
548
549 gfs2_glock_dq_uninit(&i_gh);
550
551 return error;
552}
553
554/**
555 * gfs2_open - open a file
556 * @inode: the inode to open
557 * @file: the struct file for this opening
558 *
559 * Returns: errno
560 */
561
562static int gfs2_open(struct inode *inode, struct file *file)
563{
564 struct gfs2_inode *ip = GFS2_I(inode);
565 struct gfs2_holder i_gh;
566 struct gfs2_file *fp;
567 int error;
568
569 fp = kzalloc(sizeof(struct gfs2_file), GFP_KERNEL);
570 if (!fp)
571 return -ENOMEM;
572
573 mutex_init(&fp->f_fl_mutex);
574
575 gfs2_assert_warn(GFS2_SB(inode), !file->private_data);
576 file->private_data = fp;
577
578 if (S_ISREG(ip->i_di.di_mode)) {
579 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY,
580 &i_gh);
581 if (error)
582 goto fail;
583
584 if (!(file->f_flags & O_LARGEFILE) &&
585 ip->i_di.di_size > MAX_NON_LFS) {
586 error = -EFBIG;
587 goto fail_gunlock;
588 }
589
590 /* Listen to the Direct I/O flag */
591
592 if (ip->i_di.di_flags & GFS2_DIF_DIRECTIO)
593 file->f_flags |= O_DIRECT;
594
595 gfs2_glock_dq_uninit(&i_gh);
596 }
597
598 return 0;
599
600 fail_gunlock:
601 gfs2_glock_dq_uninit(&i_gh);
602
603 fail:
604 file->private_data = NULL;
605 kfree(fp);
606
607 return error;
608}
609
610/**
611 * gfs2_close - called to close a struct file
612 * @inode: the inode the struct file belongs to
613 * @file: the struct file being closed
614 *
615 * Returns: errno
616 */
617
618static int gfs2_close(struct inode *inode, struct file *file)
619{
620 struct gfs2_sbd *sdp = inode->i_sb->s_fs_info;
621 struct gfs2_file *fp;
622
623 fp = file->private_data;
624 file->private_data = NULL;
625
626 if (gfs2_assert_warn(sdp, fp))
627 return -EIO;
628
629 kfree(fp);
630
631 return 0;
632}
633
634/**
635 * gfs2_fsync - sync the dirty data for a file (across the cluster)
636 * @file: the file that points to the dentry (we ignore this)
637 * @dentry: the dentry that points to the inode to sync
638 *
639 * Returns: errno
640 */
641
642static int gfs2_fsync(struct file *file, struct dentry *dentry, int datasync)
643{
644 struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
645
646 gfs2_log_flush(ip->i_gl->gl_sbd, ip->i_gl);
647
648 return 0;
649}
650
651/**
652 * gfs2_lock - acquire/release a posix lock on a file
653 * @file: the file pointer
654 * @cmd: either modify or retrieve lock state, possibly wait
655 * @fl: type and range of lock
656 *
657 * Returns: errno
658 */
659
660static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl)
661{
662 struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
663 struct gfs2_sbd *sdp = GFS2_SB(file->f_mapping->host);
664 struct lm_lockname name =
665 { .ln_number = ip->i_num.no_addr,
666 .ln_type = LM_TYPE_PLOCK };
667
668 if (!(fl->fl_flags & FL_POSIX))
669 return -ENOLCK;
670 if ((ip->i_di.di_mode & (S_ISGID | S_IXGRP)) == S_ISGID)
671 return -ENOLCK;
672
673 if (sdp->sd_args.ar_localflocks) {
674 if (IS_GETLK(cmd)) {
675 struct file_lock tmp;
676 int ret;
677 ret = posix_test_lock(file, fl, &tmp);
678 fl->fl_type = F_UNLCK;
679 if (ret)
680 memcpy(fl, &tmp, sizeof(struct file_lock));
681 return 0;
682 } else {
683 return posix_lock_file_wait(file, fl);
684 }
685 }
686
687 if (IS_GETLK(cmd))
688 return gfs2_lm_plock_get(sdp, &name, file, fl);
689 else if (fl->fl_type == F_UNLCK)
690 return gfs2_lm_punlock(sdp, &name, file, fl);
691 else
692 return gfs2_lm_plock(sdp, &name, file, cmd, fl);
693}
694
695static int do_flock(struct file *file, int cmd, struct file_lock *fl)
696{
697 struct gfs2_file *fp = file->private_data;
698 struct gfs2_holder *fl_gh = &fp->f_fl_gh;
699 struct gfs2_inode *ip = GFS2_I(file->f_dentry->d_inode);
700 struct gfs2_glock *gl;
701 unsigned int state;
702 int flags;
703 int error = 0;
704
705 state = (fl->fl_type == F_WRLCK) ? LM_ST_EXCLUSIVE : LM_ST_SHARED;
706 flags = ((IS_SETLKW(cmd)) ? 0 : LM_FLAG_TRY) | GL_EXACT | GL_NOCACHE;
707
708 mutex_lock(&fp->f_fl_mutex);
709
710 gl = fl_gh->gh_gl;
711 if (gl) {
712 if (fl_gh->gh_state == state)
713 goto out;
714 gfs2_glock_hold(gl);
715 flock_lock_file_wait(file,
716 &(struct file_lock){.fl_type = F_UNLCK});
717 gfs2_glock_dq_uninit(fl_gh);
718 } else {
719 error = gfs2_glock_get(GFS2_SB(&ip->i_inode),
720 ip->i_num.no_addr, &gfs2_flock_glops,
721 CREATE, &gl);
722 if (error)
723 goto out;
724 }
725
726 gfs2_holder_init(gl, state, flags, fl_gh);
727 gfs2_glock_put(gl);
728
729 error = gfs2_glock_nq(fl_gh);
730 if (error) {
731 gfs2_holder_uninit(fl_gh);
732 if (error == GLR_TRYFAILED)
733 error = -EAGAIN;
734 } else {
735 error = flock_lock_file_wait(file, fl);
736 gfs2_assert_warn(GFS2_SB(&ip->i_inode), !error);
737 }
738
739 out:
740 mutex_unlock(&fp->f_fl_mutex);
741
742 return error;
743}
744
745static void do_unflock(struct file *file, struct file_lock *fl)
746{
747 struct gfs2_file *fp = file->private_data;
748 struct gfs2_holder *fl_gh = &fp->f_fl_gh;
749
750 mutex_lock(&fp->f_fl_mutex);
751 flock_lock_file_wait(file, fl);
752 if (fl_gh->gh_gl)
753 gfs2_glock_dq_uninit(fl_gh);
754 mutex_unlock(&fp->f_fl_mutex);
755}
756
757/**
758 * gfs2_flock - acquire/release a flock lock on a file
759 * @file: the file pointer
760 * @cmd: either modify or retrieve lock state, possibly wait
761 * @fl: type and range of lock
762 *
763 * Returns: errno
764 */
765
766static int gfs2_flock(struct file *file, int cmd, struct file_lock *fl)
767{
768 struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
769 struct gfs2_sbd *sdp = GFS2_SB(file->f_mapping->host);
770
771 if (!(fl->fl_flags & FL_FLOCK))
772 return -ENOLCK;
773 if ((ip->i_di.di_mode & (S_ISGID | S_IXGRP)) == S_ISGID)
774 return -ENOLCK;
775
776 if (sdp->sd_args.ar_localflocks)
777 return flock_lock_file_wait(file, fl);
778
779 if (fl->fl_type == F_UNLCK) {
780 do_unflock(file, fl);
781 return 0;
782 } else
783 return do_flock(file, cmd, fl);
784}
785
786const struct file_operations gfs2_file_fops = {
787 .llseek = gfs2_llseek,
788 .read = generic_file_read,
789 .readv = generic_file_readv,
790 .aio_read = generic_file_aio_read,
791 .write = generic_file_write,
792 .writev = generic_file_writev,
793 .aio_write = generic_file_aio_write,
794 .unlocked_ioctl = gfs2_ioctl,
795 .mmap = gfs2_mmap,
796 .open = gfs2_open,
797 .release = gfs2_close,
798 .fsync = gfs2_fsync,
799 .lock = gfs2_lock,
800 .sendfile = generic_file_sendfile,
801 .flock = gfs2_flock,
802 .splice_read = generic_file_splice_read,
803 .splice_write = generic_file_splice_write,
804};
805
806const struct file_operations gfs2_dir_fops = {
807 .readdir = gfs2_readdir,
808 .unlocked_ioctl = gfs2_ioctl,
809 .open = gfs2_open,
810 .release = gfs2_close,
811 .fsync = gfs2_fsync,
812 .lock = gfs2_lock,
813 .flock = gfs2_flock,
814};
815
diff --git a/fs/gfs2/ops_file.h b/fs/gfs2/ops_file.h
new file mode 100644
index 000000000000..46302b513937
--- /dev/null
+++ b/fs/gfs2/ops_file.h
@@ -0,0 +1,20 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __OPS_FILE_DOT_H__
11#define __OPS_FILE_DOT_H__
12extern struct file gfs2_internal_file_sentinal;
13extern int gfs2_internal_read(struct gfs2_inode *ip,
14 struct file_ra_state *ra_state,
15 char *buf, loff_t *pos, unsigned size);
16
17extern const struct file_operations gfs2_file_fops;
18extern const struct file_operations gfs2_dir_fops;
19
20#endif /* __OPS_FILE_DOT_H__ */
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
new file mode 100644
index 000000000000..de18923eea70
--- /dev/null
+++ b/fs/gfs2/ops_fstype.c
@@ -0,0 +1,836 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/vmalloc.h>
16#include <linux/blkdev.h>
17#include <linux/kthread.h>
18#include <linux/gfs2_ondisk.h>
19
20#include "gfs2.h"
21#include "lm_interface.h"
22#include "incore.h"
23#include "daemon.h"
24#include "glock.h"
25#include "glops.h"
26#include "inode.h"
27#include "lm.h"
28#include "mount.h"
29#include "ops_export.h"
30#include "ops_fstype.h"
31#include "ops_super.h"
32#include "recovery.h"
33#include "rgrp.h"
34#include "super.h"
35#include "sys.h"
36#include "util.h"
37
/*
 * Direction constants; presumably the values passed as the "undo"
 * argument of the init_*() helpers below (0 = set up, non-zero = tear
 * down) - confirm against the callers.
 */
#define DO 0
#define UNDO 1
40
41extern struct dentry_operations gfs2_dops;
42
43static struct gfs2_sbd *init_sbd(struct super_block *sb)
44{
45 struct gfs2_sbd *sdp;
46 unsigned int x;
47
48 sdp = vmalloc(sizeof(struct gfs2_sbd));
49 if (!sdp)
50 return NULL;
51
52 memset(sdp, 0, sizeof(struct gfs2_sbd));
53
54 sb->s_fs_info = sdp;
55 sdp->sd_vfs = sb;
56
57 gfs2_tune_init(&sdp->sd_tune);
58
59 for (x = 0; x < GFS2_GL_HASH_SIZE; x++) {
60 rwlock_init(&sdp->sd_gl_hash[x].hb_lock);
61 INIT_LIST_HEAD(&sdp->sd_gl_hash[x].hb_list);
62 }
63 INIT_LIST_HEAD(&sdp->sd_reclaim_list);
64 spin_lock_init(&sdp->sd_reclaim_lock);
65 init_waitqueue_head(&sdp->sd_reclaim_wq);
66 mutex_init(&sdp->sd_invalidate_inodes_mutex);
67
68 mutex_init(&sdp->sd_inum_mutex);
69 spin_lock_init(&sdp->sd_statfs_spin);
70 mutex_init(&sdp->sd_statfs_mutex);
71
72 spin_lock_init(&sdp->sd_rindex_spin);
73 mutex_init(&sdp->sd_rindex_mutex);
74 INIT_LIST_HEAD(&sdp->sd_rindex_list);
75 INIT_LIST_HEAD(&sdp->sd_rindex_mru_list);
76 INIT_LIST_HEAD(&sdp->sd_rindex_recent_list);
77
78 INIT_LIST_HEAD(&sdp->sd_jindex_list);
79 spin_lock_init(&sdp->sd_jindex_spin);
80 mutex_init(&sdp->sd_jindex_mutex);
81
82 INIT_LIST_HEAD(&sdp->sd_quota_list);
83 spin_lock_init(&sdp->sd_quota_spin);
84 mutex_init(&sdp->sd_quota_mutex);
85
86 spin_lock_init(&sdp->sd_log_lock);
87
88 INIT_LIST_HEAD(&sdp->sd_log_le_gl);
89 INIT_LIST_HEAD(&sdp->sd_log_le_buf);
90 INIT_LIST_HEAD(&sdp->sd_log_le_revoke);
91 INIT_LIST_HEAD(&sdp->sd_log_le_rg);
92 INIT_LIST_HEAD(&sdp->sd_log_le_databuf);
93
94 mutex_init(&sdp->sd_log_reserve_mutex);
95 INIT_LIST_HEAD(&sdp->sd_ail1_list);
96 INIT_LIST_HEAD(&sdp->sd_ail2_list);
97
98 init_rwsem(&sdp->sd_log_flush_lock);
99 INIT_LIST_HEAD(&sdp->sd_log_flush_list);
100
101 INIT_LIST_HEAD(&sdp->sd_revoke_list);
102
103 mutex_init(&sdp->sd_freeze_lock);
104
105 return sdp;
106}
107
108static void init_vfs(struct super_block *sb, unsigned noatime)
109{
110 struct gfs2_sbd *sdp = sb->s_fs_info;
111
112 sb->s_magic = GFS2_MAGIC;
113 sb->s_op = &gfs2_super_ops;
114 sb->s_export_op = &gfs2_export_ops;
115 sb->s_maxbytes = MAX_LFS_FILESIZE;
116
117 if (sb->s_flags & (MS_NOATIME | MS_NODIRATIME))
118 set_bit(noatime, &sdp->sd_flags);
119
120 /* Don't let the VFS update atimes. GFS2 handles this itself. */
121 sb->s_flags |= MS_NOATIME | MS_NODIRATIME;
122}
123
124static int init_names(struct gfs2_sbd *sdp, int silent)
125{
126 struct gfs2_sb *sb = NULL;
127 char *proto, *table;
128 int error = 0;
129
130 proto = sdp->sd_args.ar_lockproto;
131 table = sdp->sd_args.ar_locktable;
132
133 /* Try to autodetect */
134
135 if (!proto[0] || !table[0]) {
136 struct buffer_head *bh;
137 bh = sb_getblk(sdp->sd_vfs,
138 GFS2_SB_ADDR >> sdp->sd_fsb2bb_shift);
139 lock_buffer(bh);
140 clear_buffer_uptodate(bh);
141 clear_buffer_dirty(bh);
142 unlock_buffer(bh);
143 ll_rw_block(READ, 1, &bh);
144 wait_on_buffer(bh);
145
146 if (!buffer_uptodate(bh)) {
147 brelse(bh);
148 return -EIO;
149 }
150
151 sb = kmalloc(sizeof(struct gfs2_sb), GFP_KERNEL);
152 if (!sb) {
153 brelse(bh);
154 return -ENOMEM;
155 }
156 gfs2_sb_in(sb, bh->b_data);
157 brelse(bh);
158
159 error = gfs2_check_sb(sdp, sb, silent);
160 if (error)
161 goto out;
162
163 if (!proto[0])
164 proto = sb->sb_lockproto;
165 if (!table[0])
166 table = sb->sb_locktable;
167 }
168
169 if (!table[0])
170 table = sdp->sd_vfs->s_id;
171
172 snprintf(sdp->sd_proto_name, GFS2_FSNAME_LEN, "%s", proto);
173 snprintf(sdp->sd_table_name, GFS2_FSNAME_LEN, "%s", table);
174
175 out:
176 kfree(sb);
177
178 return error;
179}
180
181static int init_locking(struct gfs2_sbd *sdp, struct gfs2_holder *mount_gh,
182 int undo)
183{
184 struct task_struct *p;
185 int error = 0;
186
187 if (undo)
188 goto fail_trans;
189
190 p = kthread_run(gfs2_scand, sdp, "gfs2_scand");
191 error = IS_ERR(p);
192 if (error) {
193 fs_err(sdp, "can't start scand thread: %d\n", error);
194 return error;
195 }
196 sdp->sd_scand_process = p;
197
198 for (sdp->sd_glockd_num = 0;
199 sdp->sd_glockd_num < sdp->sd_args.ar_num_glockd;
200 sdp->sd_glockd_num++) {
201 p = kthread_run(gfs2_glockd, sdp, "gfs2_glockd");
202 error = IS_ERR(p);
203 if (error) {
204 fs_err(sdp, "can't start glockd thread: %d\n", error);
205 goto fail;
206 }
207 sdp->sd_glockd_process[sdp->sd_glockd_num] = p;
208 }
209
210 error = gfs2_glock_nq_num(sdp,
211 GFS2_MOUNT_LOCK, &gfs2_nondisk_glops,
212 LM_ST_EXCLUSIVE, LM_FLAG_NOEXP | GL_NOCACHE,
213 mount_gh);
214 if (error) {
215 fs_err(sdp, "can't acquire mount glock: %d\n", error);
216 goto fail;
217 }
218
219 error = gfs2_glock_nq_num(sdp,
220 GFS2_LIVE_LOCK, &gfs2_nondisk_glops,
221 LM_ST_SHARED,
222 LM_FLAG_NOEXP | GL_EXACT,
223 &sdp->sd_live_gh);
224 if (error) {
225 fs_err(sdp, "can't acquire live glock: %d\n", error);
226 goto fail_mount;
227 }
228
229 error = gfs2_glock_get(sdp, GFS2_RENAME_LOCK, &gfs2_nondisk_glops,
230 CREATE, &sdp->sd_rename_gl);
231 if (error) {
232 fs_err(sdp, "can't create rename glock: %d\n", error);
233 goto fail_live;
234 }
235
236 error = gfs2_glock_get(sdp, GFS2_TRANS_LOCK, &gfs2_trans_glops,
237 CREATE, &sdp->sd_trans_gl);
238 if (error) {
239 fs_err(sdp, "can't create transaction glock: %d\n", error);
240 goto fail_rename;
241 }
242 set_bit(GLF_STICKY, &sdp->sd_trans_gl->gl_flags);
243
244 return 0;
245
246fail_trans:
247 gfs2_glock_put(sdp->sd_trans_gl);
248
249fail_rename:
250 gfs2_glock_put(sdp->sd_rename_gl);
251
252fail_live:
253 gfs2_glock_dq_uninit(&sdp->sd_live_gh);
254
255fail_mount:
256 gfs2_glock_dq_uninit(mount_gh);
257
258fail:
259 while (sdp->sd_glockd_num--)
260 kthread_stop(sdp->sd_glockd_process[sdp->sd_glockd_num]);
261
262 kthread_stop(sdp->sd_scand_process);
263
264 return error;
265}
266
267static struct inode *gfs2_lookup_root(struct super_block *sb,
268 struct gfs2_inum *inum)
269{
270 return gfs2_inode_lookup(sb, inum, DT_DIR);
271}
272
273static int init_sb(struct gfs2_sbd *sdp, int silent, int undo)
274{
275 struct super_block *sb = sdp->sd_vfs;
276 struct gfs2_holder sb_gh;
277 struct gfs2_inum *inum;
278 struct inode *inode;
279 int error = 0;
280
281 if (undo) {
282 return 0;
283 }
284
285 error = gfs2_glock_nq_num(sdp, GFS2_SB_LOCK, &gfs2_meta_glops,
286 LM_ST_SHARED, 0, &sb_gh);
287 if (error) {
288 fs_err(sdp, "can't acquire superblock glock: %d\n", error);
289 return error;
290 }
291
292 error = gfs2_read_sb(sdp, sb_gh.gh_gl, silent);
293 if (error) {
294 fs_err(sdp, "can't read superblock: %d\n", error);
295 goto out;
296 }
297
298 /* Set up the buffer cache and SB for real */
299 if (sdp->sd_sb.sb_bsize < bdev_hardsect_size(sb->s_bdev)) {
300 error = -EINVAL;
301 fs_err(sdp, "FS block size (%u) is too small for device "
302 "block size (%u)\n",
303 sdp->sd_sb.sb_bsize, bdev_hardsect_size(sb->s_bdev));
304 goto out;
305 }
306 if (sdp->sd_sb.sb_bsize > PAGE_SIZE) {
307 error = -EINVAL;
308 fs_err(sdp, "FS block size (%u) is too big for machine "
309 "page size (%u)\n",
310 sdp->sd_sb.sb_bsize, (unsigned int)PAGE_SIZE);
311 goto out;
312 }
313 sb_set_blocksize(sb, sdp->sd_sb.sb_bsize);
314
315 /* Get the root inode */
316 inum = &sdp->sd_sb.sb_root_dir;
317 if (sb->s_type == &gfs2meta_fs_type)
318 inum = &sdp->sd_sb.sb_master_dir;
319 inode = gfs2_lookup_root(sb, inum);
320 if (IS_ERR(inode)) {
321 error = PTR_ERR(inode);
322 fs_err(sdp, "can't read in root inode: %d\n", error);
323 goto out;
324 }
325
326 sb->s_root = d_alloc_root(inode);
327 if (!sb->s_root) {
328 fs_err(sdp, "can't get root dentry\n");
329 error = -ENOMEM;
330 iput(inode);
331 }
332 sb->s_root->d_op = &gfs2_dops;
333out:
334 gfs2_glock_dq_uninit(&sb_gh);
335 return error;
336}
337
338static int init_journal(struct gfs2_sbd *sdp, int undo)
339{
340 struct gfs2_holder ji_gh;
341 struct task_struct *p;
342 struct gfs2_inode *ip;
343 int jindex = 1;
344 int error = 0;
345
346 if (undo) {
347 jindex = 0;
348 goto fail_recoverd;
349 }
350
351 sdp->sd_jindex = gfs2_lookup_simple(sdp->sd_master_dir, "jindex");
352 if (IS_ERR(sdp->sd_jindex)) {
353 fs_err(sdp, "can't lookup journal index: %d\n", error);
354 return PTR_ERR(sdp->sd_jindex);
355 }
356 ip = GFS2_I(sdp->sd_jindex);
357 set_bit(GLF_STICKY, &ip->i_gl->gl_flags);
358
359 /* Load in the journal index special file */
360
361 error = gfs2_jindex_hold(sdp, &ji_gh);
362 if (error) {
363 fs_err(sdp, "can't read journal index: %d\n", error);
364 goto fail;
365 }
366
367 error = -EINVAL;
368 if (!gfs2_jindex_size(sdp)) {
369 fs_err(sdp, "no journals!\n");
370 goto fail_jindex;
371 }
372
373 if (sdp->sd_args.ar_spectator) {
374 sdp->sd_jdesc = gfs2_jdesc_find(sdp, 0);
375 sdp->sd_log_blks_free = sdp->sd_jdesc->jd_blocks;
376 } else {
377 if (sdp->sd_lockstruct.ls_jid >= gfs2_jindex_size(sdp)) {
378 fs_err(sdp, "can't mount journal #%u\n",
379 sdp->sd_lockstruct.ls_jid);
380 fs_err(sdp, "there are only %u journals (0 - %u)\n",
381 gfs2_jindex_size(sdp),
382 gfs2_jindex_size(sdp) - 1);
383 goto fail_jindex;
384 }
385 sdp->sd_jdesc = gfs2_jdesc_find(sdp, sdp->sd_lockstruct.ls_jid);
386
387 error = gfs2_glock_nq_num(sdp, sdp->sd_lockstruct.ls_jid,
388 &gfs2_journal_glops,
389 LM_ST_EXCLUSIVE, LM_FLAG_NOEXP,
390 &sdp->sd_journal_gh);
391 if (error) {
392 fs_err(sdp, "can't acquire journal glock: %d\n", error);
393 goto fail_jindex;
394 }
395
396 ip = GFS2_I(sdp->sd_jdesc->jd_inode);
397 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED,
398 LM_FLAG_NOEXP | GL_EXACT,
399 &sdp->sd_jinode_gh);
400 if (error) {
401 fs_err(sdp, "can't acquire journal inode glock: %d\n",
402 error);
403 goto fail_journal_gh;
404 }
405
406 error = gfs2_jdesc_check(sdp->sd_jdesc);
407 if (error) {
408 fs_err(sdp, "my journal (%u) is bad: %d\n",
409 sdp->sd_jdesc->jd_jid, error);
410 goto fail_jinode_gh;
411 }
412 sdp->sd_log_blks_free = sdp->sd_jdesc->jd_blocks;
413 }
414
415 if (sdp->sd_lockstruct.ls_first) {
416 unsigned int x;
417 for (x = 0; x < sdp->sd_journals; x++) {
418 error = gfs2_recover_journal(gfs2_jdesc_find(sdp, x));
419 if (error) {
420 fs_err(sdp, "error recovering journal %u: %d\n",
421 x, error);
422 goto fail_jinode_gh;
423 }
424 }
425
426 gfs2_lm_others_may_mount(sdp);
427 } else if (!sdp->sd_args.ar_spectator) {
428 error = gfs2_recover_journal(sdp->sd_jdesc);
429 if (error) {
430 fs_err(sdp, "error recovering my journal: %d\n", error);
431 goto fail_jinode_gh;
432 }
433 }
434
435 set_bit(SDF_JOURNAL_CHECKED, &sdp->sd_flags);
436 gfs2_glock_dq_uninit(&ji_gh);
437 jindex = 0;
438
439 p = kthread_run(gfs2_recoverd, sdp, "gfs2_recoverd");
440 error = IS_ERR(p);
441 if (error) {
442 fs_err(sdp, "can't start recoverd thread: %d\n", error);
443 goto fail_jinode_gh;
444 }
445 sdp->sd_recoverd_process = p;
446
447 return 0;
448
449 fail_recoverd:
450 kthread_stop(sdp->sd_recoverd_process);
451
452 fail_jinode_gh:
453 if (!sdp->sd_args.ar_spectator)
454 gfs2_glock_dq_uninit(&sdp->sd_jinode_gh);
455
456 fail_journal_gh:
457 if (!sdp->sd_args.ar_spectator)
458 gfs2_glock_dq_uninit(&sdp->sd_journal_gh);
459
460 fail_jindex:
461 gfs2_jindex_free(sdp);
462 if (jindex)
463 gfs2_glock_dq_uninit(&ji_gh);
464
465 fail:
466 iput(sdp->sd_jindex);
467
468 return error;
469}
470
471
472static int init_inodes(struct gfs2_sbd *sdp, int undo)
473{
474 int error = 0;
475 struct gfs2_inode *ip;
476 struct inode *inode;
477
478 if (undo)
479 goto fail_qinode;
480
481 inode = gfs2_lookup_root(sdp->sd_vfs, &sdp->sd_sb.sb_master_dir);
482 if (IS_ERR(inode)) {
483 error = PTR_ERR(inode);
484 fs_err(sdp, "can't read in master directory: %d\n", error);
485 goto fail;
486 }
487 sdp->sd_master_dir = inode;
488
489 error = init_journal(sdp, undo);
490 if (error)
491 goto fail_master;
492
493 /* Read in the master inode number inode */
494 sdp->sd_inum_inode = gfs2_lookup_simple(sdp->sd_master_dir, "inum");
495 if (IS_ERR(sdp->sd_inum_inode)) {
496 error = PTR_ERR(sdp->sd_inum_inode);
497 fs_err(sdp, "can't read in inum inode: %d\n", error);
498 goto fail_journal;
499 }
500
501
502 /* Read in the master statfs inode */
503 sdp->sd_statfs_inode = gfs2_lookup_simple(sdp->sd_master_dir, "statfs");
504 if (IS_ERR(sdp->sd_statfs_inode)) {
505 error = PTR_ERR(sdp->sd_statfs_inode);
506 fs_err(sdp, "can't read in statfs inode: %d\n", error);
507 goto fail_inum;
508 }
509
510 /* Read in the resource index inode */
511 sdp->sd_rindex = gfs2_lookup_simple(sdp->sd_master_dir, "rindex");
512 if (IS_ERR(sdp->sd_rindex)) {
513 error = PTR_ERR(sdp->sd_rindex);
514 fs_err(sdp, "can't get resource index inode: %d\n", error);
515 goto fail_statfs;
516 }
517 ip = GFS2_I(sdp->sd_rindex);
518 set_bit(GLF_STICKY, &ip->i_gl->gl_flags);
519 sdp->sd_rindex_vn = ip->i_gl->gl_vn - 1;
520
521 /* Read in the quota inode */
522 sdp->sd_quota_inode = gfs2_lookup_simple(sdp->sd_master_dir, "quota");
523 if (IS_ERR(sdp->sd_quota_inode)) {
524 error = PTR_ERR(sdp->sd_quota_inode);
525 fs_err(sdp, "can't get quota file inode: %d\n", error);
526 goto fail_rindex;
527 }
528 return 0;
529
530fail_qinode:
531 iput(sdp->sd_quota_inode);
532
533fail_rindex:
534 gfs2_clear_rgrpd(sdp);
535 iput(sdp->sd_rindex);
536
537fail_statfs:
538 iput(sdp->sd_statfs_inode);
539
540fail_inum:
541 iput(sdp->sd_inum_inode);
542fail_journal:
543 init_journal(sdp, UNDO);
544fail_master:
545 iput(sdp->sd_master_dir);
546fail:
547 return error;
548}
549
550static int init_per_node(struct gfs2_sbd *sdp, int undo)
551{
552 struct inode *pn = NULL;
553 char buf[30];
554 int error = 0;
555 struct gfs2_inode *ip;
556
557 if (sdp->sd_args.ar_spectator)
558 return 0;
559
560 if (undo)
561 goto fail_qc_gh;
562
563 pn = gfs2_lookup_simple(sdp->sd_master_dir, "per_node");
564 if (IS_ERR(pn)) {
565 error = PTR_ERR(pn);
566 fs_err(sdp, "can't find per_node directory: %d\n", error);
567 return error;
568 }
569
570 sprintf(buf, "inum_range%u", sdp->sd_jdesc->jd_jid);
571 sdp->sd_ir_inode = gfs2_lookup_simple(pn, buf);
572 if (IS_ERR(sdp->sd_ir_inode)) {
573 error = PTR_ERR(sdp->sd_ir_inode);
574 fs_err(sdp, "can't find local \"ir\" file: %d\n", error);
575 goto fail;
576 }
577
578 sprintf(buf, "statfs_change%u", sdp->sd_jdesc->jd_jid);
579 sdp->sd_sc_inode = gfs2_lookup_simple(pn, buf);
580 if (IS_ERR(sdp->sd_sc_inode)) {
581 error = PTR_ERR(sdp->sd_sc_inode);
582 fs_err(sdp, "can't find local \"sc\" file: %d\n", error);
583 goto fail_ir_i;
584 }
585
586 sprintf(buf, "quota_change%u", sdp->sd_jdesc->jd_jid);
587 sdp->sd_qc_inode = gfs2_lookup_simple(pn, buf);
588 if (IS_ERR(sdp->sd_qc_inode)) {
589 error = PTR_ERR(sdp->sd_qc_inode);
590 fs_err(sdp, "can't find local \"qc\" file: %d\n", error);
591 goto fail_ut_i;
592 }
593
594 iput(pn);
595 pn = NULL;
596
597 ip = GFS2_I(sdp->sd_ir_inode);
598 error = gfs2_glock_nq_init(ip->i_gl,
599 LM_ST_EXCLUSIVE, 0,
600 &sdp->sd_ir_gh);
601 if (error) {
602 fs_err(sdp, "can't lock local \"ir\" file: %d\n", error);
603 goto fail_qc_i;
604 }
605
606 ip = GFS2_I(sdp->sd_sc_inode);
607 error = gfs2_glock_nq_init(ip->i_gl,
608 LM_ST_EXCLUSIVE, 0,
609 &sdp->sd_sc_gh);
610 if (error) {
611 fs_err(sdp, "can't lock local \"sc\" file: %d\n", error);
612 goto fail_ir_gh;
613 }
614
615 ip = GFS2_I(sdp->sd_qc_inode);
616 error = gfs2_glock_nq_init(ip->i_gl,
617 LM_ST_EXCLUSIVE, 0,
618 &sdp->sd_qc_gh);
619 if (error) {
620 fs_err(sdp, "can't lock local \"qc\" file: %d\n", error);
621 goto fail_ut_gh;
622 }
623
624 return 0;
625
626 fail_qc_gh:
627 gfs2_glock_dq_uninit(&sdp->sd_qc_gh);
628
629 fail_ut_gh:
630
631 gfs2_glock_dq_uninit(&sdp->sd_sc_gh);
632
633 fail_ir_gh:
634 gfs2_glock_dq_uninit(&sdp->sd_ir_gh);
635
636 fail_qc_i:
637 iput(sdp->sd_qc_inode);
638
639 fail_ut_i:
640
641 iput(sdp->sd_sc_inode);
642
643 fail_ir_i:
644 iput(sdp->sd_ir_inode);
645
646 fail:
647 if (pn)
648 iput(pn);
649 return error;
650}
651
652static int init_threads(struct gfs2_sbd *sdp, int undo)
653{
654 struct task_struct *p;
655 int error = 0;
656
657 if (undo)
658 goto fail_quotad;
659
660 sdp->sd_log_flush_time = jiffies;
661 sdp->sd_jindex_refresh_time = jiffies;
662
663 p = kthread_run(gfs2_logd, sdp, "gfs2_logd");
664 error = IS_ERR(p);
665 if (error) {
666 fs_err(sdp, "can't start logd thread: %d\n", error);
667 return error;
668 }
669 sdp->sd_logd_process = p;
670
671 sdp->sd_statfs_sync_time = jiffies;
672 sdp->sd_quota_sync_time = jiffies;
673
674 p = kthread_run(gfs2_quotad, sdp, "gfs2_quotad");
675 error = IS_ERR(p);
676 if (error) {
677 fs_err(sdp, "can't start quotad thread: %d\n", error);
678 goto fail;
679 }
680 sdp->sd_quotad_process = p;
681
682 return 0;
683
684
685fail_quotad:
686 kthread_stop(sdp->sd_quotad_process);
687fail:
688 kthread_stop(sdp->sd_logd_process);
689 return error;
690}
691
/**
 * fill_super - Read in superblock
 * @sb: The VFS superblock
 * @data: Mount options
 * @silent: Don't complain if it's not a GFS2 filesystem
 *
 * Runs the mount-time setup steps in a fixed order.  The init_*() helpers
 * are called with DO to set up and with UNDO to tear down; the failure
 * labels at the bottom undo the completed steps in reverse order.
 *
 * Returns: errno
 */

static int fill_super(struct super_block *sb, void *data, int silent)
{
	struct gfs2_sbd *sdp;
	struct gfs2_holder mount_gh;
	int error;

	sdp = init_sbd(sb);
	if (!sdp) {
		printk(KERN_WARNING "GFS2: can't alloc struct gfs2_sbd\n");
		return -ENOMEM;
	}

	error = gfs2_mount_args(sdp, (char *)data, 0);
	if (error) {
		printk(KERN_WARNING "GFS2: can't parse mount arguments\n");
		goto fail;
	}

	init_vfs(sb, SDF_NOATIME);

	/* Set up the buffer cache and fill in some fake block size values
	   to allow us to read-in the on-disk superblock. */
	sdp->sd_sb.sb_bsize = sb_min_blocksize(sb, GFS2_BASIC_BLOCK);
	sdp->sd_sb.sb_bsize_shift = sb->s_blocksize_bits;
	sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift -
			       GFS2_BASIC_BLOCK_SHIFT;
	sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift;

	error = init_names(sdp, silent);
	if (error)
		goto fail;

	error = gfs2_sys_fs_add(sdp);
	if (error)
		goto fail;

	error = gfs2_lm_mount(sdp, silent);
	if (error)
		goto fail_sys;

	/* mount_gh is handed to init_locking() and dropped below on success */
	error = init_locking(sdp, &mount_gh, DO);
	if (error)
		goto fail_lm;

	error = init_sb(sdp, silent, DO);
	if (error)
		goto fail_locking;

	error = init_inodes(sdp, DO);
	if (error)
		goto fail_sb;

	error = init_per_node(sdp, DO);
	if (error)
		goto fail_inodes;

	error = gfs2_statfs_init(sdp);
	if (error) {
		fs_err(sdp, "can't initialize statfs subsystem: %d\n", error);
		goto fail_per_node;
	}

	error = init_threads(sdp, DO);
	if (error)
		goto fail_per_node;

	/* Only a read-write mount needs the filesystem switched to RW */
	if (!(sb->s_flags & MS_RDONLY)) {
		error = gfs2_make_fs_rw(sdp);
		if (error) {
			fs_err(sdp, "can't make FS RW: %d\n", error);
			goto fail_threads;
		}
	}

	gfs2_glock_dq_uninit(&mount_gh);

	return 0;

	/* Error unwinding: each label undoes one step, newest first */
 fail_threads:
	init_threads(sdp, UNDO);

 fail_per_node:
	init_per_node(sdp, UNDO);

 fail_inodes:
	init_inodes(sdp, UNDO);

 fail_sb:
	init_sb(sdp, 0, UNDO);

 fail_locking:
	init_locking(sdp, &mount_gh, UNDO);

 fail_lm:
	gfs2_gl_hash_clear(sdp, WAIT);
	gfs2_lm_unmount(sdp);
	/* Keep retrying until invalidate_inodes() returns zero */
	while (invalidate_inodes(sb))
		yield();

 fail_sys:
	gfs2_sys_fs_del(sdp);

 fail:
	vfree(sdp);
	sb->s_fs_info = NULL;

	return error;
}
809
810static int gfs2_get_sb(struct file_system_type *fs_type, int flags,
811 const char *dev_name, void *data, struct vfsmount *mnt)
812{
813 return get_sb_bdev(fs_type, flags, dev_name, data, fill_super, mnt);
814}
815
/**
 * gfs2_kill_sb - kill_sb hook shared by both GFS2 filesystem types
 * @sb: The VFS superblock being torn down
 *
 * Delegates entirely to kill_block_super().
 */
static void gfs2_kill_sb(struct super_block *sb)
{
	kill_block_super(sb);
	return;
}
820
821struct file_system_type gfs2_fs_type = {
822 .name = "gfs2",
823 .fs_flags = FS_REQUIRES_DEV,
824 .get_sb = gfs2_get_sb,
825 .kill_sb = gfs2_kill_sb,
826 .owner = THIS_MODULE,
827};
828
829struct file_system_type gfs2meta_fs_type = {
830 .name = "gfs2meta",
831 .fs_flags = FS_REQUIRES_DEV,
832 .get_sb = gfs2_get_sb,
833 .kill_sb = gfs2_kill_sb,
834 .owner = THIS_MODULE,
835};
836
diff --git a/fs/gfs2/ops_fstype.h b/fs/gfs2/ops_fstype.h
new file mode 100644
index 000000000000..622f5760d6b2
--- /dev/null
+++ b/fs/gfs2/ops_fstype.h
@@ -0,0 +1,16 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __OPS_FSTYPE_DOT_H__
11#define __OPS_FSTYPE_DOT_H__
12
13extern struct file_system_type gfs2_fs_type;
14extern struct file_system_type gfs2meta_fs_type;
15
16#endif /* __OPS_FSTYPE_DOT_H__ */
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
new file mode 100644
index 000000000000..8fb7c5c9a7c3
--- /dev/null
+++ b/fs/gfs2/ops_inode.c
@@ -0,0 +1,1165 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/namei.h>
16#include <linux/utsname.h>
17#include <linux/mm.h>
18#include <linux/xattr.h>
19#include <linux/posix_acl.h>
20#include <linux/gfs2_ondisk.h>
21#include <linux/crc32.h>
22#include <asm/uaccess.h>
23
24#include "gfs2.h"
25#include "lm_interface.h"
26#include "incore.h"
27#include "acl.h"
28#include "bmap.h"
29#include "dir.h"
30#include "eaops.h"
31#include "eattr.h"
32#include "glock.h"
33#include "inode.h"
34#include "meta_io.h"
35#include "ops_dentry.h"
36#include "ops_inode.h"
37#include "quota.h"
38#include "rgrp.h"
39#include "trans.h"
40#include "util.h"
41
42/**
43 * gfs2_create - Create a file
44 * @dir: The directory in which to create the file
45 * @dentry: The dentry of the new file
46 * @mode: The mode of the new file
47 *
48 * Returns: errno
49 */
50
51static int gfs2_create(struct inode *dir, struct dentry *dentry,
52 int mode, struct nameidata *nd)
53{
54 struct gfs2_inode *dip = GFS2_I(dir);
55 struct gfs2_sbd *sdp = GFS2_SB(dir);
56 struct gfs2_holder ghs[2];
57 struct inode *inode;
58
59 gfs2_holder_init(dip->i_gl, 0, 0, ghs);
60
61 for (;;) {
62 inode = gfs2_createi(ghs, &dentry->d_name, S_IFREG | mode);
63 if (!IS_ERR(inode)) {
64 gfs2_trans_end(sdp);
65 if (dip->i_alloc.al_rgd)
66 gfs2_inplace_release(dip);
67 gfs2_quota_unlock(dip);
68 gfs2_alloc_put(dip);
69 gfs2_glock_dq_uninit_m(2, ghs);
70 mark_inode_dirty(inode);
71 break;
72 } else if (PTR_ERR(inode) != -EEXIST ||
73 (nd->intent.open.flags & O_EXCL)) {
74 gfs2_holder_uninit(ghs);
75 return PTR_ERR(inode);
76 }
77
78 inode = gfs2_lookupi(dir, &dentry->d_name, 0, nd);
79 if (inode) {
80 if (!IS_ERR(inode)) {
81 gfs2_holder_uninit(ghs);
82 break;
83 } else {
84 gfs2_holder_uninit(ghs);
85 return PTR_ERR(inode);
86 }
87 }
88 }
89
90 d_instantiate(dentry, inode);
91
92 return 0;
93}
94
95/**
96 * gfs2_lookup - Look up a filename in a directory and return its inode
97 * @dir: The directory inode
98 * @dentry: The dentry of the new inode
99 * @nd: passed from Linux VFS, ignored by us
100 *
101 * Called by the VFS layer. Lock dir and call gfs2_lookupi()
102 *
103 * Returns: errno
104 */
105
106static struct dentry *gfs2_lookup(struct inode *dir, struct dentry *dentry,
107 struct nameidata *nd)
108{
109 struct inode *inode = NULL;
110
111 dentry->d_op = &gfs2_dops;
112
113 inode = gfs2_lookupi(dir, &dentry->d_name, 0, nd);
114 if (inode && IS_ERR(inode))
115 return ERR_PTR(PTR_ERR(inode));
116
117 if (inode)
118 return d_splice_alias(inode, dentry);
119 d_add(dentry, inode);
120
121 return NULL;
122}
123
124/**
125 * gfs2_link - Link to a file
126 * @old_dentry: The inode to link
127 * @dir: Add link to this directory
128 * @dentry: The name of the link
129 *
130 * Link the inode in "old_dentry" into the directory "dir" with the
131 * name in "dentry".
132 *
133 * Returns: errno
134 */
135
136static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
137 struct dentry *dentry)
138{
139 struct gfs2_inode *dip = GFS2_I(dir);
140 struct gfs2_sbd *sdp = GFS2_SB(dir);
141 struct inode *inode = old_dentry->d_inode;
142 struct gfs2_inode *ip = GFS2_I(inode);
143 struct gfs2_holder ghs[2];
144 int alloc_required;
145 int error;
146
147 if (S_ISDIR(ip->i_di.di_mode))
148 return -EPERM;
149
150 gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
151 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1);
152
153 error = gfs2_glock_nq_m(2, ghs);
154 if (error)
155 goto out;
156
157 error = permission(dir, MAY_WRITE | MAY_EXEC, NULL);
158 if (error)
159 goto out_gunlock;
160
161 error = gfs2_dir_search(dir, &dentry->d_name, NULL, NULL);
162 switch (error) {
163 case -ENOENT:
164 break;
165 case 0:
166 error = -EEXIST;
167 default:
168 goto out_gunlock;
169 }
170
171 error = -EINVAL;
172 if (!dip->i_di.di_nlink)
173 goto out_gunlock;
174 error = -EFBIG;
175 if (dip->i_di.di_entries == (uint32_t)-1)
176 goto out_gunlock;
177 error = -EPERM;
178 if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
179 goto out_gunlock;
180 error = -EINVAL;
181 if (!ip->i_di.di_nlink)
182 goto out_gunlock;
183 error = -EMLINK;
184 if (ip->i_di.di_nlink == (uint32_t)-1)
185 goto out_gunlock;
186
187 alloc_required = error = gfs2_diradd_alloc_required(dir, &dentry->d_name);
188 if (error < 0)
189 goto out_gunlock;
190 error = 0;
191
192 if (alloc_required) {
193 struct gfs2_alloc *al = gfs2_alloc_get(dip);
194
195 error = gfs2_quota_lock(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
196 if (error)
197 goto out_alloc;
198
199 error = gfs2_quota_check(dip, dip->i_di.di_uid,
200 dip->i_di.di_gid);
201 if (error)
202 goto out_gunlock_q;
203
204 al->al_requested = sdp->sd_max_dirres;
205
206 error = gfs2_inplace_reserve(dip);
207 if (error)
208 goto out_gunlock_q;
209
210 error = gfs2_trans_begin(sdp, sdp->sd_max_dirres +
211 al->al_rgd->rd_ri.ri_length +
212 2 * RES_DINODE + RES_STATFS +
213 RES_QUOTA, 0);
214 if (error)
215 goto out_ipres;
216 } else {
217 error = gfs2_trans_begin(sdp, 2 * RES_DINODE + RES_LEAF, 0);
218 if (error)
219 goto out_ipres;
220 }
221
222 error = gfs2_dir_add(dir, &dentry->d_name, &ip->i_num,
223 IF2DT(ip->i_di.di_mode));
224 if (error)
225 goto out_end_trans;
226
227 error = gfs2_change_nlink(ip, +1);
228
229out_end_trans:
230 gfs2_trans_end(sdp);
231
232out_ipres:
233 if (alloc_required)
234 gfs2_inplace_release(dip);
235
236out_gunlock_q:
237 if (alloc_required)
238 gfs2_quota_unlock(dip);
239
240out_alloc:
241 if (alloc_required)
242 gfs2_alloc_put(dip);
243
244out_gunlock:
245 gfs2_glock_dq_m(2, ghs);
246
247out:
248 gfs2_holder_uninit(ghs);
249 gfs2_holder_uninit(ghs + 1);
250
251 if (!error) {
252 atomic_inc(&inode->i_count);
253 d_instantiate(dentry, inode);
254 mark_inode_dirty(inode);
255 }
256
257 return error;
258}
259
260/**
261 * gfs2_unlink - Unlink a file
262 * @dir: The inode of the directory containing the file to unlink
263 * @dentry: The file itself
264 *
265 * Unlink a file. Call gfs2_unlinki()
266 *
267 * Returns: errno
268 */
269
270static int gfs2_unlink(struct inode *dir, struct dentry *dentry)
271{
272 struct gfs2_inode *dip = GFS2_I(dir);
273 struct gfs2_sbd *sdp = GFS2_SB(dir);
274 struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
275 struct gfs2_holder ghs[2];
276 int error;
277
278 gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
279 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1);
280
281 error = gfs2_glock_nq_m(2, ghs);
282 if (error)
283 goto out;
284
285 error = gfs2_unlink_ok(dip, &dentry->d_name, ip);
286 if (error)
287 goto out_gunlock;
288
289 error = gfs2_trans_begin(sdp, 2*RES_DINODE + RES_LEAF + RES_RG_BIT, 0);
290 if (error)
291 goto out_gunlock;
292
293 error = gfs2_dir_del(dip, &dentry->d_name);
294 if (error)
295 goto out_end_trans;
296
297 error = gfs2_change_nlink(ip, -1);
298
299out_end_trans:
300 gfs2_trans_end(sdp);
301out_gunlock:
302 gfs2_glock_dq_m(2, ghs);
303out:
304 gfs2_holder_uninit(ghs);
305 gfs2_holder_uninit(ghs + 1);
306 return error;
307}
308
309/**
310 * gfs2_symlink - Create a symlink
311 * @dir: The directory to create the symlink in
312 * @dentry: The dentry to put the symlink in
313 * @symname: The thing which the link points to
314 *
315 * Returns: errno
316 */
317
318static int gfs2_symlink(struct inode *dir, struct dentry *dentry,
319 const char *symname)
320{
321 struct gfs2_inode *dip = GFS2_I(dir), *ip;
322 struct gfs2_sbd *sdp = GFS2_SB(dir);
323 struct gfs2_holder ghs[2];
324 struct inode *inode;
325 struct buffer_head *dibh;
326 int size;
327 int error;
328
329 /* Must be stuffed with a null terminator for gfs2_follow_link() */
330 size = strlen(symname);
331 if (size > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode) - 1)
332 return -ENAMETOOLONG;
333
334 gfs2_holder_init(dip->i_gl, 0, 0, ghs);
335
336 inode = gfs2_createi(ghs, &dentry->d_name, S_IFLNK | S_IRWXUGO);
337 if (IS_ERR(inode)) {
338 gfs2_holder_uninit(ghs);
339 return PTR_ERR(inode);
340 }
341
342 ip = ghs[1].gh_gl->gl_object;
343
344 ip->i_di.di_size = size;
345
346 error = gfs2_meta_inode_buffer(ip, &dibh);
347
348 if (!gfs2_assert_withdraw(sdp, !error)) {
349 gfs2_dinode_out(&ip->i_di, dibh->b_data);
350 memcpy(dibh->b_data + sizeof(struct gfs2_dinode), symname,
351 size);
352 brelse(dibh);
353 }
354
355 gfs2_trans_end(sdp);
356 if (dip->i_alloc.al_rgd)
357 gfs2_inplace_release(dip);
358 gfs2_quota_unlock(dip);
359 gfs2_alloc_put(dip);
360
361 gfs2_glock_dq_uninit_m(2, ghs);
362
363 d_instantiate(dentry, inode);
364 mark_inode_dirty(inode);
365
366 return 0;
367}
368
/**
 * gfs2_mkdir - Make a directory
 * @dir: The parent directory of the new one
 * @dentry: The dentry of the new directory
 * @mode: The mode of the new directory
 *
 * Creates the inode with gfs2_createi(), writes the "." and ".." entries
 * into the dinode block right after the dinode header, and bumps the
 * parent's link count.
 *
 * Returns: errno
 */

static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode)
{
	struct gfs2_inode *dip = GFS2_I(dir), *ip;
	struct gfs2_sbd *sdp = GFS2_SB(dir);
	struct gfs2_holder ghs[2];
	struct inode *inode;
	struct buffer_head *dibh;
	int error;

	gfs2_holder_init(dip->i_gl, 0, 0, ghs);

	inode = gfs2_createi(ghs, &dentry->d_name, S_IFDIR | mode);
	if (IS_ERR(inode)) {
		gfs2_holder_uninit(ghs);
		return PTR_ERR(inode);
	}

	/* The new inode is reached through the second holder's glock;
	   presumably gfs2_createi() set up ghs[1] - confirm there */
	ip = ghs[1].gh_gl->gl_object;

	/* In-core dinode for an empty directory holding "." and ".." */
	ip->i_di.di_nlink = 2;
	ip->i_di.di_size = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode);
	ip->i_di.di_flags |= GFS2_DIF_JDATA;
	ip->i_di.di_payload_format = GFS2_FORMAT_DE;
	ip->i_di.di_entries = 2;

	error = gfs2_meta_inode_buffer(ip, &dibh);

	if (!gfs2_assert_withdraw(sdp, !error)) {
		struct gfs2_dinode *di = (struct gfs2_dinode *)dibh->b_data;
		/* The first dirent starts immediately after the dinode */
		struct gfs2_dirent *dent = (struct gfs2_dirent *)(di+1);
		struct qstr str;

		/* "." points back at the new directory itself */
		gfs2_str2qstr(&str, ".");
		gfs2_trans_add_bh(ip->i_gl, dibh, 1);
		gfs2_qstr2dirent(&str, GFS2_DIRENT_SIZE(str.len), dent);
		dent->de_inum = di->di_num; /* already GFS2 endian */
		dent->de_type = DT_DIR;
		di->di_entries = cpu_to_be32(1);

		/* ".." follows "." and is given the rest of the block */
		gfs2_str2qstr(&str, "..");
		dent = (struct gfs2_dirent *)((char*)dent + GFS2_DIRENT_SIZE(1));
		gfs2_qstr2dirent(&str, dibh->b_size - GFS2_DIRENT_SIZE(1) - sizeof(struct gfs2_dinode), dent);

		/* ".." refers to the parent directory */
		gfs2_inum_out(&dip->i_num, (char *) &dent->de_inum);
		dent->de_type = DT_DIR;

		/* Write the in-core dinode (di_entries == 2) into the buffer */
		gfs2_dinode_out(&ip->i_di, (char *)di);

		brelse(dibh);
	}

	/* The parent gains a link (from the child's "..") */
	error = gfs2_change_nlink(dip, +1);
	gfs2_assert_withdraw(sdp, !error); /* dip already pinned */

	gfs2_trans_end(sdp);
	if (dip->i_alloc.al_rgd)
		gfs2_inplace_release(dip);
	gfs2_quota_unlock(dip);
	gfs2_alloc_put(dip);

	gfs2_glock_dq_uninit_m(2, ghs);

	d_instantiate(dentry, inode);
	mark_inode_dirty(inode);

	return 0;
}
445
446/**
447 * gfs2_rmdir - Remove a directory
448 * @dir: The parent directory of the directory to be removed
449 * @dentry: The dentry of the directory to remove
450 *
451 * Remove a directory. Call gfs2_rmdiri()
452 *
453 * Returns: errno
454 */
455
456static int gfs2_rmdir(struct inode *dir, struct dentry *dentry)
457{
458 struct gfs2_inode *dip = GFS2_I(dir);
459 struct gfs2_sbd *sdp = GFS2_SB(dir);
460 struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
461 struct gfs2_holder ghs[2];
462 int error;
463
464 gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
465 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1);
466
467 error = gfs2_glock_nq_m(2, ghs);
468 if (error)
469 goto out;
470
471 error = gfs2_unlink_ok(dip, &dentry->d_name, ip);
472 if (error)
473 goto out_gunlock;
474
475 if (ip->i_di.di_entries < 2) {
476 if (gfs2_consist_inode(ip))
477 gfs2_dinode_print(&ip->i_di);
478 error = -EIO;
479 goto out_gunlock;
480 }
481 if (ip->i_di.di_entries > 2) {
482 error = -ENOTEMPTY;
483 goto out_gunlock;
484 }
485
486 error = gfs2_trans_begin(sdp, 2 * RES_DINODE + 3 * RES_LEAF + RES_RG_BIT, 0);
487 if (error)
488 goto out_gunlock;
489
490 error = gfs2_rmdiri(dip, &dentry->d_name, ip);
491
492 gfs2_trans_end(sdp);
493
494 out_gunlock:
495 gfs2_glock_dq_m(2, ghs);
496
497 out:
498 gfs2_holder_uninit(ghs);
499 gfs2_holder_uninit(ghs + 1);
500
501 return error;
502}
503
504/**
505 * gfs2_mknod - Make a special file
506 * @dir: The directory in which the special file will reside
507 * @dentry: The dentry of the special file
508 * @mode: The mode of the special file
509 * @rdev: The device specification of the special file
510 *
511 */
512
513static int gfs2_mknod(struct inode *dir, struct dentry *dentry, int mode,
514 dev_t dev)
515{
516 struct gfs2_inode *dip = GFS2_I(dir), *ip;
517 struct gfs2_sbd *sdp = GFS2_SB(dir);
518 struct gfs2_holder ghs[2];
519 struct inode *inode;
520 struct buffer_head *dibh;
521 uint32_t major = 0, minor = 0;
522 int error;
523
524 switch (mode & S_IFMT) {
525 case S_IFBLK:
526 case S_IFCHR:
527 major = MAJOR(dev);
528 minor = MINOR(dev);
529 break;
530 case S_IFIFO:
531 case S_IFSOCK:
532 break;
533 default:
534 return -EOPNOTSUPP;
535 };
536
537 gfs2_holder_init(dip->i_gl, 0, 0, ghs);
538
539 inode = gfs2_createi(ghs, &dentry->d_name, mode);
540 if (IS_ERR(inode)) {
541 gfs2_holder_uninit(ghs);
542 return PTR_ERR(inode);
543 }
544
545 ip = ghs[1].gh_gl->gl_object;
546
547 ip->i_di.di_major = major;
548 ip->i_di.di_minor = minor;
549
550 error = gfs2_meta_inode_buffer(ip, &dibh);
551
552 if (!gfs2_assert_withdraw(sdp, !error)) {
553 gfs2_dinode_out(&ip->i_di, dibh->b_data);
554 brelse(dibh);
555 }
556
557 gfs2_trans_end(sdp);
558 if (dip->i_alloc.al_rgd)
559 gfs2_inplace_release(dip);
560 gfs2_quota_unlock(dip);
561 gfs2_alloc_put(dip);
562
563 gfs2_glock_dq_uninit_m(2, ghs);
564
565 d_instantiate(dentry, inode);
566 mark_inode_dirty(inode);
567
568 return 0;
569}
570
/**
 * gfs2_rename - Rename a file
 * @odir: Parent directory of old file name
 * @odentry: The old dentry of the file
 * @ndir: Parent directory of new file name
 * @ndentry: The new dentry of the file
 *
 * Up to four inodes are locked exclusively: the old directory, the new
 * directory (if different), the inode being renamed and, if the target
 * name already exists, the inode it currently refers to.  Moving a
 * directory between directories additionally takes sd_rename_gl.
 *
 * Returns: errno
 */

static int gfs2_rename(struct inode *odir, struct dentry *odentry,
		       struct inode *ndir, struct dentry *ndentry)
{
	struct gfs2_inode *odip = GFS2_I(odir);
	struct gfs2_inode *ndip = GFS2_I(ndir);
	struct gfs2_inode *ip = GFS2_I(odentry->d_inode);
	struct gfs2_inode *nip = NULL;
	struct gfs2_sbd *sdp = GFS2_SB(odir);
	struct gfs2_holder ghs[4], r_gh;
	unsigned int num_gh;
	int dir_rename = 0;
	int alloc_required;
	unsigned int x;
	int error;

	/* Renaming an inode onto itself is a no-op */
	if (ndentry->d_inode) {
		nip = GFS2_I(ndentry->d_inode);
		if (ip == nip)
			return 0;
	}

	/* Make sure we aren't trying to move a directory into its subdir */

	if (S_ISDIR(ip->i_di.di_mode) && odip != ndip) {
		dir_rename = 1;

		error = gfs2_glock_nq_init(sdp->sd_rename_gl,
					   LM_ST_EXCLUSIVE, 0,
					   &r_gh);
		if (error)
			goto out;

		error = gfs2_ok_to_move(ip, ndip);
		if (error)
			goto out_gunlock_r;
	}

	/* Build the holder array; num_gh counts the holders actually used */
	num_gh = 1;
	gfs2_holder_init(odip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
	if (odip != ndip) {
		gfs2_holder_init(ndip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh);
		num_gh++;
	}
	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh);
	num_gh++;

	if (nip) {
		gfs2_holder_init(nip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh);
		num_gh++;
	}

	error = gfs2_glock_nq_m(num_gh, ghs);
	if (error)
		goto out_uninit;

	/* Check out the old directory */

	error = gfs2_unlink_ok(odip, &odentry->d_name, ip);
	if (error)
		goto out_gunlock;

	/* Check out the new directory */

	if (nip) {
		error = gfs2_unlink_ok(ndip, &ndentry->d_name, nip);
		if (error)
			goto out_gunlock;

		/* An existing target directory must hold exactly two entries */
		if (S_ISDIR(nip->i_di.di_mode)) {
			if (nip->i_di.di_entries < 2) {
				if (gfs2_consist_inode(nip))
					gfs2_dinode_print(&nip->i_di);
				error = -EIO;
				goto out_gunlock;
			}
			if (nip->i_di.di_entries > 2) {
				error = -ENOTEMPTY;
				goto out_gunlock;
			}
		}
	} else {
		error = permission(ndir, MAY_WRITE | MAY_EXEC, NULL);
		if (error)
			goto out_gunlock;

		/* -ENOENT is the only acceptable answer; case 0 falls
		   through to the goto with -EEXIST */
		error = gfs2_dir_search(ndir, &ndentry->d_name, NULL, NULL);
		switch (error) {
		case -ENOENT:
			error = 0;
			break;
		case 0:
			error = -EEXIST;
		default:
			goto out_gunlock;
		};

		if (odip != ndip) {
			if (!ndip->i_di.di_nlink) {
				error = -EINVAL;
				goto out_gunlock;
			}
			if (ndip->i_di.di_entries == (uint32_t)-1) {
				error = -EFBIG;
				goto out_gunlock;
			}
			if (S_ISDIR(ip->i_di.di_mode) &&
			    ndip->i_di.di_nlink == (uint32_t)-1) {
				error = -EMLINK;
				goto out_gunlock;
			}
		}
	}

	/* Check out the dir to be renamed */

	if (dir_rename) {
		error = permission(odentry->d_inode, MAY_WRITE, NULL);
		if (error)
			goto out_gunlock;
	}

	/* A negative result is an error; otherwise it is a yes/no answer */
	alloc_required = error = gfs2_diradd_alloc_required(ndir, &ndentry->d_name);
	if (error < 0)
		goto out_gunlock;
	error = 0;

	if (alloc_required) {
		struct gfs2_alloc *al = gfs2_alloc_get(ndip);

		error = gfs2_quota_lock(ndip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
		if (error)
			goto out_alloc;

		error = gfs2_quota_check(ndip, ndip->i_di.di_uid,
					 ndip->i_di.di_gid);
		if (error)
			goto out_gunlock_q;

		al->al_requested = sdp->sd_max_dirres;

		error = gfs2_inplace_reserve(ndip);
		if (error)
			goto out_gunlock_q;

		error = gfs2_trans_begin(sdp, sdp->sd_max_dirres +
					 al->al_rgd->rd_ri.ri_length +
					 4 * RES_DINODE + 4 * RES_LEAF +
					 RES_STATFS + RES_QUOTA, 0);
		if (error)
			goto out_ipreserv;
	} else {
		error = gfs2_trans_begin(sdp, 4 * RES_DINODE +
					 5 * RES_LEAF, 0);
		if (error)
			goto out_gunlock;
	}

	/* Remove the target file, if it exists */

	if (nip) {
		if (S_ISDIR(nip->i_di.di_mode))
			error = gfs2_rmdiri(ndip, &ndentry->d_name, nip);
		else {
			error = gfs2_dir_del(ndip, &ndentry->d_name);
			if (error)
				goto out_end_trans;
			error = gfs2_change_nlink(nip, -1);
		}
		if (error)
			goto out_end_trans;
	}

	if (dir_rename) {
		/* Moving a directory: fix both parents' link counts and
		   repoint its ".." entry at the new parent */
		struct qstr name;
		gfs2_str2qstr(&name, "..");

		error = gfs2_change_nlink(ndip, +1);
		if (error)
			goto out_end_trans;
		error = gfs2_change_nlink(odip, -1);
		if (error)
			goto out_end_trans;

		error = gfs2_dir_mvino(ip, &name, &ndip->i_num, DT_DIR);
		if (error)
			goto out_end_trans;
	} else {
		/* Otherwise just update the ctime of the renamed inode */
		struct buffer_head *dibh;
		error = gfs2_meta_inode_buffer(ip, &dibh);
		if (error)
			goto out_end_trans;
		ip->i_di.di_ctime = get_seconds();
		gfs2_trans_add_bh(ip->i_gl, dibh, 1);
		gfs2_dinode_out(&ip->i_di, dibh->b_data);
		brelse(dibh);
	}

	/* Drop the old name, then add the new one */
	error = gfs2_dir_del(odip, &odentry->d_name);
	if (error)
		goto out_end_trans;

	error = gfs2_dir_add(ndir, &ndentry->d_name, &ip->i_num,
			     IF2DT(ip->i_di.di_mode));
	if (error)
		goto out_end_trans;

	/* Unwind in reverse order; the alloc_required tests are only
	   reached on paths where alloc_required has been assigned */
out_end_trans:
	gfs2_trans_end(sdp);
out_ipreserv:
	if (alloc_required)
		gfs2_inplace_release(ndip);
out_gunlock_q:
	if (alloc_required)
		gfs2_quota_unlock(ndip);
out_alloc:
	if (alloc_required)
		gfs2_alloc_put(ndip);
out_gunlock:
	gfs2_glock_dq_m(num_gh, ghs);
out_uninit:
	for (x = 0; x < num_gh; x++)
		gfs2_holder_uninit(ghs + x);
out_gunlock_r:
	if (dir_rename)
		gfs2_glock_dq_uninit(&r_gh);
out:
	return error;
}
809
810/**
811 * gfs2_readlink - Read the value of a symlink
812 * @dentry: the symlink
813 * @buf: the buffer to read the symlink data into
814 * @size: the size of the buffer
815 *
816 * Returns: errno
817 */
818
819static int gfs2_readlink(struct dentry *dentry, char __user *user_buf,
820 int user_size)
821{
822 struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
823 char array[GFS2_FAST_NAME_SIZE], *buf = array;
824 unsigned int len = GFS2_FAST_NAME_SIZE;
825 int error;
826
827 error = gfs2_readlinki(ip, &buf, &len);
828 if (error)
829 return error;
830
831 if (user_size > len - 1)
832 user_size = len - 1;
833
834 if (copy_to_user(user_buf, buf, user_size))
835 error = -EFAULT;
836 else
837 error = user_size;
838
839 if (buf != array)
840 kfree(buf);
841
842 return error;
843}
844
845/**
846 * gfs2_follow_link - Follow a symbolic link
847 * @dentry: The dentry of the link
848 * @nd: Data that we pass to vfs_follow_link()
849 *
850 * This can handle symlinks of any size. It is optimised for symlinks
851 * under GFS2_FAST_NAME_SIZE.
852 *
853 * Returns: 0 on success or error code
854 */
855
856static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd)
857{
858 struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
859 char array[GFS2_FAST_NAME_SIZE], *buf = array;
860 unsigned int len = GFS2_FAST_NAME_SIZE;
861 int error;
862
863 error = gfs2_readlinki(ip, &buf, &len);
864 if (!error) {
865 error = vfs_follow_link(nd, buf);
866 if (buf != array)
867 kfree(buf);
868 }
869
870 return ERR_PTR(error);
871}
872
873/**
874 * gfs2_permission -
875 * @inode:
876 * @mask:
877 * @nd: passed from Linux VFS, ignored by us
878 *
879 * Returns: errno
880 */
881
882static int gfs2_permission(struct inode *inode, int mask, struct nameidata *nd)
883{
884 struct gfs2_inode *ip = GFS2_I(inode);
885 struct gfs2_holder i_gh;
886 int error;
887
888 if (ip->i_vn == ip->i_gl->gl_vn)
889 return generic_permission(inode, mask, gfs2_check_acl);
890
891 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
892 if (!error) {
893 error = generic_permission(inode, mask, gfs2_check_acl_locked);
894 gfs2_glock_dq_uninit(&i_gh);
895 }
896
897 return error;
898}
899
900static int setattr_size(struct inode *inode, struct iattr *attr)
901{
902 struct gfs2_inode *ip = GFS2_I(inode);
903 int error;
904
905 if (attr->ia_size != ip->i_di.di_size) {
906 error = vmtruncate(inode, attr->ia_size);
907 if (error)
908 return error;
909 }
910
911 error = gfs2_truncatei(ip, attr->ia_size);
912 if (error)
913 return error;
914
915 return error;
916}
917
918static int setattr_chown(struct inode *inode, struct iattr *attr)
919{
920 struct gfs2_inode *ip = GFS2_I(inode);
921 struct gfs2_sbd *sdp = GFS2_SB(inode);
922 struct buffer_head *dibh;
923 uint32_t ouid, ogid, nuid, ngid;
924 int error;
925
926 ouid = ip->i_di.di_uid;
927 ogid = ip->i_di.di_gid;
928 nuid = attr->ia_uid;
929 ngid = attr->ia_gid;
930
931 if (!(attr->ia_valid & ATTR_UID) || ouid == nuid)
932 ouid = nuid = NO_QUOTA_CHANGE;
933 if (!(attr->ia_valid & ATTR_GID) || ogid == ngid)
934 ogid = ngid = NO_QUOTA_CHANGE;
935
936 gfs2_alloc_get(ip);
937
938 error = gfs2_quota_lock(ip, nuid, ngid);
939 if (error)
940 goto out_alloc;
941
942 if (ouid != NO_QUOTA_CHANGE || ogid != NO_QUOTA_CHANGE) {
943 error = gfs2_quota_check(ip, nuid, ngid);
944 if (error)
945 goto out_gunlock_q;
946 }
947
948 error = gfs2_trans_begin(sdp, RES_DINODE + 2 * RES_QUOTA, 0);
949 if (error)
950 goto out_gunlock_q;
951
952 error = gfs2_meta_inode_buffer(ip, &dibh);
953 if (error)
954 goto out_end_trans;
955
956 error = inode_setattr(inode, attr);
957 gfs2_assert_warn(sdp, !error);
958 gfs2_inode_attr_out(ip);
959
960 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
961 gfs2_dinode_out(&ip->i_di, dibh->b_data);
962 brelse(dibh);
963
964 if (ouid != NO_QUOTA_CHANGE || ogid != NO_QUOTA_CHANGE) {
965 gfs2_quota_change(ip, -ip->i_di.di_blocks, ouid, ogid);
966 gfs2_quota_change(ip, ip->i_di.di_blocks, nuid, ngid);
967 }
968
969 out_end_trans:
970 gfs2_trans_end(sdp);
971
972 out_gunlock_q:
973 gfs2_quota_unlock(ip);
974
975 out_alloc:
976 gfs2_alloc_put(ip);
977
978 return error;
979}
980
981/**
982 * gfs2_setattr - Change attributes on an inode
983 * @dentry: The dentry which is changing
984 * @attr: The structure describing the change
985 *
986 * The VFS layer wants to change one or more of an inodes attributes. Write
987 * that change out to disk.
988 *
989 * Returns: errno
990 */
991
992static int gfs2_setattr(struct dentry *dentry, struct iattr *attr)
993{
994 struct inode *inode = dentry->d_inode;
995 struct gfs2_inode *ip = GFS2_I(inode);
996 struct gfs2_holder i_gh;
997 int error;
998
999 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
1000 if (error)
1001 return error;
1002
1003 error = -EPERM;
1004 if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
1005 goto out;
1006
1007 error = inode_change_ok(inode, attr);
1008 if (error)
1009 goto out;
1010
1011 if (attr->ia_valid & ATTR_SIZE)
1012 error = setattr_size(inode, attr);
1013 else if (attr->ia_valid & (ATTR_UID | ATTR_GID))
1014 error = setattr_chown(inode, attr);
1015 else if ((attr->ia_valid & ATTR_MODE) && IS_POSIXACL(inode))
1016 error = gfs2_acl_chmod(ip, attr);
1017 else
1018 error = gfs2_setattr_simple(ip, attr);
1019
1020 out:
1021 gfs2_glock_dq_uninit(&i_gh);
1022
1023 if (!error)
1024 mark_inode_dirty(inode);
1025
1026 return error;
1027}
1028
1029/**
1030 * gfs2_getattr - Read out an inode's attributes
1031 * @mnt: ?
1032 * @dentry: The dentry to stat
1033 * @stat: The inode's stats
1034 *
1035 * Returns: errno
1036 */
1037
1038static int gfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
1039 struct kstat *stat)
1040{
1041 struct inode *inode = dentry->d_inode;
1042 struct gfs2_inode *ip = GFS2_I(inode);
1043 struct gfs2_holder gh;
1044 int error;
1045
1046 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &gh);
1047 if (!error) {
1048 generic_fillattr(inode, stat);
1049 gfs2_glock_dq_uninit(&gh);
1050 }
1051
1052 return error;
1053}
1054
1055static int gfs2_setxattr(struct dentry *dentry, const char *name,
1056 const void *data, size_t size, int flags)
1057{
1058 struct inode *inode = dentry->d_inode;
1059 struct gfs2_ea_request er;
1060
1061 memset(&er, 0, sizeof(struct gfs2_ea_request));
1062 er.er_type = gfs2_ea_name2type(name, &er.er_name);
1063 if (er.er_type == GFS2_EATYPE_UNUSED)
1064 return -EOPNOTSUPP;
1065 er.er_data = (char *)data;
1066 er.er_name_len = strlen(er.er_name);
1067 er.er_data_len = size;
1068 er.er_flags = flags;
1069
1070 gfs2_assert_warn(GFS2_SB(inode), !(er.er_flags & GFS2_ERF_MODE));
1071
1072 return gfs2_ea_set(GFS2_I(inode), &er);
1073}
1074
1075static ssize_t gfs2_getxattr(struct dentry *dentry, const char *name,
1076 void *data, size_t size)
1077{
1078 struct gfs2_ea_request er;
1079
1080 memset(&er, 0, sizeof(struct gfs2_ea_request));
1081 er.er_type = gfs2_ea_name2type(name, &er.er_name);
1082 if (er.er_type == GFS2_EATYPE_UNUSED)
1083 return -EOPNOTSUPP;
1084 er.er_data = data;
1085 er.er_name_len = strlen(er.er_name);
1086 er.er_data_len = size;
1087
1088 return gfs2_ea_get(GFS2_I(dentry->d_inode), &er);
1089}
1090
1091static ssize_t gfs2_listxattr(struct dentry *dentry, char *buffer, size_t size)
1092{
1093 struct gfs2_ea_request er;
1094
1095 memset(&er, 0, sizeof(struct gfs2_ea_request));
1096 er.er_data = (size) ? buffer : NULL;
1097 er.er_data_len = size;
1098
1099 return gfs2_ea_list(GFS2_I(dentry->d_inode), &er);
1100}
1101
1102static int gfs2_removexattr(struct dentry *dentry, const char *name)
1103{
1104 struct gfs2_ea_request er;
1105
1106 memset(&er, 0, sizeof(struct gfs2_ea_request));
1107 er.er_type = gfs2_ea_name2type(name, &er.er_name);
1108 if (er.er_type == GFS2_EATYPE_UNUSED)
1109 return -EOPNOTSUPP;
1110 er.er_name_len = strlen(er.er_name);
1111
1112 return gfs2_ea_remove(GFS2_I(dentry->d_inode), &er);
1113}
1114
/*
 * Inode operations table: permission, attribute and extended-attribute
 * handlers only.  Presumably installed on regular files - the assignment is
 * not in this file section; confirm at the point of use.
 */
struct inode_operations gfs2_file_iops = {
	.permission = gfs2_permission,
	.setattr = gfs2_setattr,
	.getattr = gfs2_getattr,
	.setxattr = gfs2_setxattr,
	.getxattr = gfs2_getxattr,
	.listxattr = gfs2_listxattr,
	.removexattr = gfs2_removexattr,
};
1124
/*
 * Inode operations table with the same handlers as gfs2_file_iops.
 * Presumably installed on device special files - confirm at the point of use.
 */
struct inode_operations gfs2_dev_iops = {
	.permission = gfs2_permission,
	.setattr = gfs2_setattr,
	.getattr = gfs2_getattr,
	.setxattr = gfs2_setxattr,
	.getxattr = gfs2_getxattr,
	.listxattr = gfs2_listxattr,
	.removexattr = gfs2_removexattr,
};
1134
/*
 * Inode operations table for directories: the namespace operations
 * (create, lookup, link, unlink, symlink, mkdir, rmdir, mknod, rename)
 * followed by the common permission, attribute and extended-attribute
 * handlers.
 */
struct inode_operations gfs2_dir_iops = {
	.create = gfs2_create,
	.lookup = gfs2_lookup,
	.link = gfs2_link,
	.unlink = gfs2_unlink,
	.symlink = gfs2_symlink,
	.mkdir = gfs2_mkdir,
	.rmdir = gfs2_rmdir,
	.mknod = gfs2_mknod,
	.rename = gfs2_rename,
	.permission = gfs2_permission,
	.setattr = gfs2_setattr,
	.getattr = gfs2_getattr,
	.setxattr = gfs2_setxattr,
	.getxattr = gfs2_getxattr,
	.listxattr = gfs2_listxattr,
	.removexattr = gfs2_removexattr,
};
1153
/*
 * Inode operations table for symbolic links: readlink and follow_link
 * followed by the common permission, attribute and extended-attribute
 * handlers.
 */
struct inode_operations gfs2_symlink_iops = {
	.readlink = gfs2_readlink,
	.follow_link = gfs2_follow_link,
	.permission = gfs2_permission,
	.setattr = gfs2_setattr,
	.getattr = gfs2_getattr,
	.setxattr = gfs2_setxattr,
	.getxattr = gfs2_getxattr,
	.listxattr = gfs2_listxattr,
	.removexattr = gfs2_removexattr,
};
1165
diff --git a/fs/gfs2/ops_inode.h b/fs/gfs2/ops_inode.h
new file mode 100644
index 000000000000..930aaae91377
--- /dev/null
+++ b/fs/gfs2/ops_inode.h
@@ -0,0 +1,18 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
#ifndef __OPS_INODE_DOT_H__
#define __OPS_INODE_DOT_H__

/* Inode operation tables made visible to other GFS2 source files */
extern struct inode_operations gfs2_file_iops;
extern struct inode_operations gfs2_dir_iops;
extern struct inode_operations gfs2_symlink_iops;
extern struct inode_operations gfs2_dev_iops;

#endif /* __OPS_INODE_DOT_H__ */
diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c
new file mode 100644
index 000000000000..730c7228f6ad
--- /dev/null
+++ b/fs/gfs2/ops_super.c
@@ -0,0 +1,471 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/statfs.h>
16#include <linux/vmalloc.h>
17#include <linux/seq_file.h>
18#include <linux/mount.h>
19#include <linux/kthread.h>
20#include <linux/delay.h>
21#include <linux/gfs2_ondisk.h>
22#include <linux/crc32.h>
23
24#include "gfs2.h"
25#include "lm_interface.h"
26#include "incore.h"
27#include "glock.h"
28#include "inode.h"
29#include "lm.h"
30#include "log.h"
31#include "mount.h"
32#include "ops_super.h"
33#include "quota.h"
34#include "recovery.h"
35#include "rgrp.h"
36#include "super.h"
37#include "sys.h"
38#include "util.h"
39#include "trans.h"
40#include "dir.h"
41#include "eattr.h"
42#include "bmap.h"
43
44/**
45 * gfs2_write_inode - Make sure the inode is stable on the disk
46 * @inode: The inode
47 * @sync: synchronous write flag
48 *
49 * Returns: errno
50 */
51
52static int gfs2_write_inode(struct inode *inode, int sync)
53{
54 struct gfs2_inode *ip = GFS2_I(inode);
55
56 /* Check this is a "normal" inode */
57 if (inode->u.generic_ip) {
58 if (current->flags & PF_MEMALLOC)
59 return 0;
60 if (sync)
61 gfs2_log_flush(GFS2_SB(inode), ip->i_gl);
62 }
63
64 return 0;
65}
66
/**
 * gfs2_put_super - Unmount the filesystem
 * @sb: The VFS superblock
 *
 * Does nothing if @sb->s_fs_info is NULL.  Otherwise stops the daemon
 * threads, makes a read-write filesystem read-only, drops the inodes, glocks
 * and holders kept in the superblock data, unmounts the locking protocol and
 * finally frees the superblock data and clears @sb->s_fs_info.
 */

static void gfs2_put_super(struct super_block *sb)
{
	struct gfs2_sbd *sdp = sb->s_fs_info;
	int error;

	if (!sdp)
		return;

	/* Unfreeze the filesystem, if we need to */

	mutex_lock(&sdp->sd_freeze_lock);
	if (sdp->sd_freeze_count)
		gfs2_glock_dq_uninit(&sdp->sd_freeze_gh);
	mutex_unlock(&sdp->sd_freeze_lock);

	/* Stop every daemon thread before touching the on-disk state */
	kthread_stop(sdp->sd_quotad_process);
	kthread_stop(sdp->sd_logd_process);
	kthread_stop(sdp->sd_recoverd_process);
	/* sd_glockd_num is counted down and indexes the thread being stopped */
	while (sdp->sd_glockd_num--)
		kthread_stop(sdp->sd_glockd_process[sdp->sd_glockd_num]);
	kthread_stop(sdp->sd_scand_process);

	/* A failure to go read-only is reported but does not stop the unmount */
	if (!(sb->s_flags & MS_RDONLY)) {
		error = gfs2_make_fs_ro(sdp);
		if (error)
			gfs2_io_error(sdp);
	}
	/* At this point, we're through modifying the disk */

	/* Release stuff */

	iput(sdp->sd_master_dir);
	iput(sdp->sd_jindex);
	iput(sdp->sd_inum_inode);
	iput(sdp->sd_statfs_inode);
	iput(sdp->sd_rindex);
	iput(sdp->sd_quota_inode);

	gfs2_glock_put(sdp->sd_rename_gl);
	gfs2_glock_put(sdp->sd_trans_gl);

	/* These holders and inodes are only released for non-spectator mounts */
	if (!sdp->sd_args.ar_spectator) {
		gfs2_glock_dq_uninit(&sdp->sd_journal_gh);
		gfs2_glock_dq_uninit(&sdp->sd_jinode_gh);
		gfs2_glock_dq_uninit(&sdp->sd_ir_gh);
		gfs2_glock_dq_uninit(&sdp->sd_sc_gh);
		gfs2_glock_dq_uninit(&sdp->sd_qc_gh);
		iput(sdp->sd_ir_inode);
		iput(sdp->sd_sc_inode);
		iput(sdp->sd_qc_inode);
	}

	gfs2_glock_dq_uninit(&sdp->sd_live_gh);
	gfs2_clear_rgrpd(sdp);
	gfs2_jindex_free(sdp);
	/* Take apart glock structures and buffer lists */
	gfs2_gl_hash_clear(sdp, WAIT);
	/* Unmount the locking protocol */
	gfs2_lm_unmount(sdp);

	/* At this point, we're through participating in the lockspace */
	gfs2_sys_fs_del(sdp);
	/* sdp is released with vfree(); the stale pointer is cleared afterwards */
	vfree(sdp);
	sb->s_fs_info = NULL;
}
138
139/**
140 * gfs2_write_super - disk commit all incore transactions
141 * @sb: the filesystem
142 *
143 * This function is called every time sync(2) is called.
144 * After this exits, all dirty buffers are synced.
145 */
146
147static void gfs2_write_super(struct super_block *sb)
148{
149 struct gfs2_sbd *sdp = sb->s_fs_info;
150 gfs2_log_flush(sdp, NULL);
151}
152
153/**
154 * gfs2_write_super_lockfs - prevent further writes to the filesystem
155 * @sb: the VFS structure for the filesystem
156 *
157 */
158
159static void gfs2_write_super_lockfs(struct super_block *sb)
160{
161 struct gfs2_sbd *sdp = sb->s_fs_info;
162 int error;
163
164 for (;;) {
165 error = gfs2_freeze_fs(sdp);
166 if (!error)
167 break;
168
169 switch (error) {
170 case -EBUSY:
171 fs_err(sdp, "waiting for recovery before freeze\n");
172 break;
173
174 default:
175 fs_err(sdp, "error freezing FS: %d\n", error);
176 break;
177 }
178
179 fs_err(sdp, "retrying...\n");
180 msleep(1000);
181 }
182}
183
184/**
185 * gfs2_unlockfs - reallow writes to the filesystem
186 * @sb: the VFS structure for the filesystem
187 *
188 */
189
190static void gfs2_unlockfs(struct super_block *sb)
191{
192 struct gfs2_sbd *sdp = sb->s_fs_info;
193 gfs2_unfreeze_fs(sdp);
194}
195
196/**
197 * gfs2_statfs - Gather and return stats about the filesystem
198 * @sb: The superblock
199 * @statfsbuf: The buffer
200 *
201 * Returns: 0 on success or error code
202 */
203
204static int gfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
205{
206 struct super_block *sb = dentry->d_inode->i_sb;
207 struct gfs2_sbd *sdp = sb->s_fs_info;
208 struct gfs2_statfs_change sc;
209 int error;
210
211 if (gfs2_tune_get(sdp, gt_statfs_slow))
212 error = gfs2_statfs_slow(sdp, &sc);
213 else
214 error = gfs2_statfs_i(sdp, &sc);
215
216 if (error)
217 return error;
218
219 memset(buf, 0, sizeof(struct kstatfs));
220
221 buf->f_type = GFS2_MAGIC;
222 buf->f_bsize = sdp->sd_sb.sb_bsize;
223 buf->f_blocks = sc.sc_total;
224 buf->f_bfree = sc.sc_free;
225 buf->f_bavail = sc.sc_free;
226 buf->f_files = sc.sc_dinodes + sc.sc_free;
227 buf->f_ffree = sc.sc_free;
228 buf->f_namelen = GFS2_FNAMESIZE;
229
230 return 0;
231}
232
233/**
234 * gfs2_remount_fs - called when the FS is remounted
235 * @sb: the filesystem
236 * @flags: the remount flags
237 * @data: extra data passed in (not used right now)
238 *
239 * Returns: errno
240 */
241
242static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
243{
244 struct gfs2_sbd *sdp = sb->s_fs_info;
245 int error;
246
247 error = gfs2_mount_args(sdp, data, 1);
248 if (error)
249 return error;
250
251 if (sdp->sd_args.ar_spectator)
252 *flags |= MS_RDONLY;
253 else {
254 if (*flags & MS_RDONLY) {
255 if (!(sb->s_flags & MS_RDONLY))
256 error = gfs2_make_fs_ro(sdp);
257 } else if (!(*flags & MS_RDONLY) &&
258 (sb->s_flags & MS_RDONLY)) {
259 error = gfs2_make_fs_rw(sdp);
260 }
261 }
262
263 if (*flags & (MS_NOATIME | MS_NODIRATIME))
264 set_bit(SDF_NOATIME, &sdp->sd_flags);
265 else
266 clear_bit(SDF_NOATIME, &sdp->sd_flags);
267
268 /* Don't let the VFS update atimes. GFS2 handles this itself. */
269 *flags |= MS_NOATIME | MS_NODIRATIME;
270
271 return error;
272}
273
274/**
275 * gfs2_clear_inode - Deallocate an inode when VFS is done with it
276 * @inode: The VFS inode
277 *
278 */
279
280static void gfs2_clear_inode(struct inode *inode)
281{
282 /* This tells us its a "real" inode and not one which only
283 * serves to contain an address space (see rgrp.c, meta_io.c)
284 * which therefore doesn't have its own glocks.
285 */
286 if (inode->u.generic_ip) {
287 struct gfs2_inode *ip = GFS2_I(inode);
288 gfs2_glock_inode_squish(inode);
289 gfs2_assert(inode->i_sb->s_fs_info, ip->i_gl->gl_state == LM_ST_UNLOCKED);
290 ip->i_gl->gl_object = NULL;
291 gfs2_glock_schedule_for_reclaim(ip->i_gl);
292 gfs2_glock_put(ip->i_gl);
293 ip->i_gl = NULL;
294 if (ip->i_iopen_gh.gh_gl)
295 gfs2_glock_dq_uninit(&ip->i_iopen_gh);
296 }
297}
298
299/**
300 * gfs2_show_options - Show mount options for /proc/mounts
301 * @s: seq_file structure
302 * @mnt: vfsmount
303 *
304 * Returns: 0 on success or error code
305 */
306
307static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
308{
309 struct gfs2_sbd *sdp = mnt->mnt_sb->s_fs_info;
310 struct gfs2_args *args = &sdp->sd_args;
311
312 if (args->ar_lockproto[0])
313 seq_printf(s, ",lockproto=%s", args->ar_lockproto);
314 if (args->ar_locktable[0])
315 seq_printf(s, ",locktable=%s", args->ar_locktable);
316 if (args->ar_hostdata[0])
317 seq_printf(s, ",hostdata=%s", args->ar_hostdata);
318 if (args->ar_spectator)
319 seq_printf(s, ",spectator");
320 if (args->ar_ignore_local_fs)
321 seq_printf(s, ",ignore_local_fs");
322 if (args->ar_localflocks)
323 seq_printf(s, ",localflocks");
324 if (args->ar_localcaching)
325 seq_printf(s, ",localcaching");
326 if (args->ar_debug)
327 seq_printf(s, ",debug");
328 if (args->ar_upgrade)
329 seq_printf(s, ",upgrade");
330 if (args->ar_num_glockd != GFS2_GLOCKD_DEFAULT)
331 seq_printf(s, ",num_glockd=%u", args->ar_num_glockd);
332 if (args->ar_posix_acl)
333 seq_printf(s, ",acl");
334 if (args->ar_quota != GFS2_QUOTA_DEFAULT) {
335 char *state;
336 switch (args->ar_quota) {
337 case GFS2_QUOTA_OFF:
338 state = "off";
339 break;
340 case GFS2_QUOTA_ACCOUNT:
341 state = "account";
342 break;
343 case GFS2_QUOTA_ON:
344 state = "on";
345 break;
346 default:
347 state = "unknown";
348 break;
349 }
350 seq_printf(s, ",quota=%s", state);
351 }
352 if (args->ar_suiddir)
353 seq_printf(s, ",suiddir");
354 if (args->ar_data != GFS2_DATA_DEFAULT) {
355 char *state;
356 switch (args->ar_data) {
357 case GFS2_DATA_WRITEBACK:
358 state = "writeback";
359 break;
360 case GFS2_DATA_ORDERED:
361 state = "ordered";
362 break;
363 default:
364 state = "unknown";
365 break;
366 }
367 seq_printf(s, ",data=%s", state);
368 }
369
370 return 0;
371}
372
/*
 * gfs2_delete_inode - release the on-disk resources of an inode
 * @inode: the inode being deleted
 *
 * We have to (at the moment) hold the inodes main lock to cover
 * the gap between unlocking the shared lock on the iopen lock and
 * taking the exclusive lock. I'd rather do a shared -> exclusive
 * conversion on the iopen lock, but we can change that later. This
 * is safe, just less efficient.
 *
 * Whatever happens, the page cache is truncated and clear_inode() is
 * called before returning.  Deallocation errors are only logged.
 */
static void gfs2_delete_inode(struct inode *inode)
{
	struct gfs2_sbd *sdp = inode->i_sb->s_fs_info;
	struct gfs2_inode *ip = GFS2_I(inode);
	struct gfs2_holder gh;
	int error;

	/* Inodes without generic_ip set skip deallocation entirely */
	if (!inode->u.generic_ip)
		goto out;

	/* Take the inode glock exclusively; give up (and drop iopen) on failure */
	error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &gh);
	if (unlikely(error)) {
		gfs2_glock_dq_uninit(&ip->i_iopen_gh);
		goto out;
	}

	/* Drop the current iopen holder and re-acquire it in exclusive mode */
	gfs2_glock_dq(&ip->i_iopen_gh);
	gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &ip->i_iopen_gh);
	error = gfs2_glock_nq(&ip->i_iopen_gh);
	if (error)
		goto out_uninit;

	/* Deallocate in stages; the first failing stage aborts the rest */
	if (S_ISDIR(ip->i_di.di_mode) &&
	    (ip->i_di.di_flags & GFS2_DIF_EXHASH)) {
		error = gfs2_dir_exhash_dealloc(ip);
		if (error)
			goto out_unlock;
	}

	if (ip->i_di.di_eattr) {
		error = gfs2_ea_dealloc(ip);
		if (error)
			goto out_unlock;
	}

	if (!gfs2_is_stuffed(ip)) {
		error = gfs2_file_dealloc(ip);
		if (error)
			goto out_unlock;
	}

	error = gfs2_dinode_dealloc(ip);

out_unlock:
	gfs2_glock_dq(&ip->i_iopen_gh);
out_uninit:
	gfs2_holder_uninit(&ip->i_iopen_gh);
	gfs2_glock_dq_uninit(&gh);
	if (error)
		fs_warn(sdp, "gfs2_delete_inode: %d\n", error);
out:
	truncate_inode_pages(&inode->i_data, 0);
	clear_inode(inode);
}
434
435
436
437static struct inode *gfs2_alloc_inode(struct super_block *sb)
438{
439 struct gfs2_sbd *sdp = sb->s_fs_info;
440 struct gfs2_inode *ip;
441
442 ip = kmem_cache_alloc(gfs2_inode_cachep, GFP_KERNEL);
443 if (ip) {
444 ip->i_flags = 0;
445 ip->i_gl = NULL;
446 ip->i_greedy = gfs2_tune_get(sdp, gt_greedy_default);
447 ip->i_last_pfault = jiffies;
448 }
449 return &ip->i_inode;
450}
451
452static void gfs2_destroy_inode(struct inode *inode)
453{
454 kmem_cache_free(gfs2_inode_cachep, inode);
455}
456
/* Superblock operations table wiring up the handlers defined in this file */
struct super_operations gfs2_super_ops = {
	.alloc_inode = gfs2_alloc_inode,
	.destroy_inode = gfs2_destroy_inode,
	.write_inode = gfs2_write_inode,
	.delete_inode = gfs2_delete_inode,
	.put_super = gfs2_put_super,
	.write_super = gfs2_write_super,
	.write_super_lockfs = gfs2_write_super_lockfs,
	.unlockfs = gfs2_unlockfs,
	.statfs = gfs2_statfs,
	.remount_fs = gfs2_remount_fs,
	.clear_inode = gfs2_clear_inode,
	.show_options = gfs2_show_options,
};
471
diff --git a/fs/gfs2/ops_super.h b/fs/gfs2/ops_super.h
new file mode 100644
index 000000000000..a15ccc276113
--- /dev/null
+++ b/fs/gfs2/ops_super.h
@@ -0,0 +1,15 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
#ifndef __OPS_SUPER_DOT_H__
#define __OPS_SUPER_DOT_H__

/* Superblock operations table made visible to other GFS2 source files */
extern struct super_operations gfs2_super_ops;

#endif /* __OPS_SUPER_DOT_H__ */
diff --git a/fs/gfs2/ops_vm.c b/fs/gfs2/ops_vm.c
new file mode 100644
index 000000000000..aff66373b7e1
--- /dev/null
+++ b/fs/gfs2/ops_vm.c
@@ -0,0 +1,194 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/mm.h>
16#include <linux/pagemap.h>
17#include <linux/gfs2_ondisk.h>
18
19#include "gfs2.h"
20#include "lm_interface.h"
21#include "incore.h"
22#include "bmap.h"
23#include "glock.h"
24#include "inode.h"
25#include "ops_vm.h"
26#include "quota.h"
27#include "rgrp.h"
28#include "trans.h"
29#include "util.h"
30
31static void pfault_be_greedy(struct gfs2_inode *ip)
32{
33 unsigned int time;
34
35 spin_lock(&ip->i_spin);
36 time = ip->i_greedy;
37 ip->i_last_pfault = jiffies;
38 spin_unlock(&ip->i_spin);
39
40 igrab(&ip->i_inode);
41 if (gfs2_glock_be_greedy(ip->i_gl, time))
42 iput(&ip->i_inode);
43}
44
45static struct page *gfs2_private_nopage(struct vm_area_struct *area,
46 unsigned long address, int *type)
47{
48 struct gfs2_inode *ip = GFS2_I(area->vm_file->f_mapping->host);
49 struct gfs2_holder i_gh;
50 struct page *result;
51 int error;
52
53 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &i_gh);
54 if (error)
55 return NULL;
56
57 set_bit(GIF_PAGED, &ip->i_flags);
58
59 result = filemap_nopage(area, address, type);
60
61 if (result && result != NOPAGE_OOM)
62 pfault_be_greedy(ip);
63
64 gfs2_glock_dq_uninit(&i_gh);
65
66 return result;
67}
68
69static int alloc_page_backing(struct gfs2_inode *ip, struct page *page)
70{
71 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
72 unsigned long index = page->index;
73 uint64_t lblock = index << (PAGE_CACHE_SHIFT -
74 sdp->sd_sb.sb_bsize_shift);
75 unsigned int blocks = PAGE_CACHE_SIZE >> sdp->sd_sb.sb_bsize_shift;
76 struct gfs2_alloc *al;
77 unsigned int data_blocks, ind_blocks;
78 unsigned int x;
79 int error;
80
81 al = gfs2_alloc_get(ip);
82
83 error = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
84 if (error)
85 goto out;
86
87 error = gfs2_quota_check(ip, ip->i_di.di_uid, ip->i_di.di_gid);
88 if (error)
89 goto out_gunlock_q;
90
91 gfs2_write_calc_reserv(ip, PAGE_CACHE_SIZE, &data_blocks, &ind_blocks);
92
93 al->al_requested = data_blocks + ind_blocks;
94
95 error = gfs2_inplace_reserve(ip);
96 if (error)
97 goto out_gunlock_q;
98
99 error = gfs2_trans_begin(sdp, al->al_rgd->rd_ri.ri_length +
100 ind_blocks + RES_DINODE +
101 RES_STATFS + RES_QUOTA, 0);
102 if (error)
103 goto out_ipres;
104
105 if (gfs2_is_stuffed(ip)) {
106 error = gfs2_unstuff_dinode(ip, NULL);
107 if (error)
108 goto out_trans;
109 }
110
111 for (x = 0; x < blocks; ) {
112 uint64_t dblock;
113 unsigned int extlen;
114 int new = 1;
115
116 error = gfs2_extent_map(&ip->i_inode, lblock, &new, &dblock, &extlen);
117 if (error)
118 goto out_trans;
119
120 lblock += extlen;
121 x += extlen;
122 }
123
124 gfs2_assert_warn(sdp, al->al_alloced);
125
126 out_trans:
127 gfs2_trans_end(sdp);
128
129 out_ipres:
130 gfs2_inplace_release(ip);
131
132 out_gunlock_q:
133 gfs2_quota_unlock(ip);
134
135 out:
136 gfs2_alloc_put(ip);
137
138 return error;
139}
140
141static struct page *gfs2_sharewrite_nopage(struct vm_area_struct *area,
142 unsigned long address, int *type)
143{
144 struct gfs2_inode *ip = GFS2_I(area->vm_file->f_mapping->host);
145 struct gfs2_holder i_gh;
146 struct page *result = NULL;
147 unsigned long index = ((address - area->vm_start) >> PAGE_CACHE_SHIFT) +
148 area->vm_pgoff;
149 int alloc_required;
150 int error;
151
152 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
153 if (error)
154 return NULL;
155
156 set_bit(GIF_PAGED, &ip->i_flags);
157 set_bit(GIF_SW_PAGED, &ip->i_flags);
158
159 error = gfs2_write_alloc_required(ip,
160 (uint64_t)index << PAGE_CACHE_SHIFT,
161 PAGE_CACHE_SIZE, &alloc_required);
162 if (error)
163 goto out;
164
165 result = filemap_nopage(area, address, type);
166 if (!result || result == NOPAGE_OOM)
167 goto out;
168
169 if (alloc_required) {
170 error = alloc_page_backing(ip, result);
171 if (error) {
172 page_cache_release(result);
173 result = NULL;
174 goto out;
175 }
176 set_page_dirty(result);
177 }
178
179 pfault_be_greedy(ip);
180
181 out:
182 gfs2_glock_dq_uninit(&i_gh);
183
184 return result;
185}
186
/*
 * VM operations whose nopage handler takes the inode glock in shared mode.
 * Presumably installed on private mappings - confirm at the point of use.
 */
struct vm_operations_struct gfs2_vm_ops_private = {
	.nopage = gfs2_private_nopage,
};
190
/*
 * VM operations whose nopage handler takes the inode glock exclusively and
 * may allocate blocks.  Presumably installed on shared writable mappings -
 * confirm at the point of use.
 */
struct vm_operations_struct gfs2_vm_ops_sharewrite = {
	.nopage = gfs2_sharewrite_nopage,
};
194
diff --git a/fs/gfs2/ops_vm.h b/fs/gfs2/ops_vm.h
new file mode 100644
index 000000000000..077cffcd4085
--- /dev/null
+++ b/fs/gfs2/ops_vm.h
@@ -0,0 +1,16 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
#ifndef __OPS_VM_DOT_H__
#define __OPS_VM_DOT_H__

/* VM operation tables made visible to other GFS2 source files */
extern struct vm_operations_struct gfs2_vm_ops_private;
extern struct vm_operations_struct gfs2_vm_ops_sharewrite;

#endif /* __OPS_VM_DOT_H__ */
16#endif /* __OPS_VM_DOT_H__ */
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
new file mode 100644
index 000000000000..3ca65c37c354
--- /dev/null
+++ b/fs/gfs2/quota.c
@@ -0,0 +1,1286 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10/*
11 * Quota change tags are associated with each transaction that allocates or
12 * deallocates space. Those changes are accumulated locally to each node (in a
13 * per-node file) and then are periodically synced to the quota file. This
14 * avoids the bottleneck of constantly touching the quota file, but introduces
15 * fuzziness in the current usage value of IDs that are being used on different
16 * nodes in the cluster simultaneously. So, it is possible for a user on
17 * multiple nodes to overrun their quota, but that overrun is controllable.
18 * Since quota tags are part of transactions, there is no need for a quota check
19 * program to be run on node crashes or anything like that.
20 *
21 * There are a couple of knobs that let the administrator manage the quota
22 * fuzziness. "quota_quantum" sets the maximum time a quota change can be
23 * sitting on one node before being synced to the quota file. (The default is
24 * 60 seconds.) Another knob, "quota_scale" controls how quickly the frequency
25 * of quota file syncs increases as the user moves closer to their limit. The
26 * more frequent the syncs, the more accurate the quota enforcement, but that
27 * means that there is more contention between the nodes for the quota file.
28 * The default value is one. This sets the maximum theoretical quota overrun
29 * (with infinite nodes with infinite bandwidth) to twice the user's limit. (In
30 * practice, the maximum overrun you see should be much less.) A "quota_scale"
31 * number greater than one makes quota syncs more frequent and reduces the
32 * maximum overrun. Numbers less than one (but greater than zero) make quota
33 * syncs less frequent.
34 *
35 * GFS quotas also use per-ID Lock Value Blocks (LVBs) to cache the contents of
36 * the quota file, so it is not being constantly read.
37 */
38
39#include <linux/sched.h>
40#include <linux/slab.h>
41#include <linux/spinlock.h>
42#include <linux/completion.h>
43#include <linux/buffer_head.h>
44#include <linux/sort.h>
45#include <linux/fs.h>
46#include <linux/gfs2_ondisk.h>
47
48#include "gfs2.h"
49#include "lm_interface.h"
50#include "incore.h"
51#include "bmap.h"
52#include "glock.h"
53#include "glops.h"
54#include "log.h"
55#include "lvb.h"
56#include "meta_io.h"
57#include "quota.h"
58#include "rgrp.h"
59#include "super.h"
60#include "trans.h"
61#include "inode.h"
62#include "ops_file.h"
63#include "ops_address.h"
64#include "util.h"
65
66#define QUOTA_USER 1
67#define QUOTA_GROUP 0
68
69static uint64_t qd2offset(struct gfs2_quota_data *qd)
70{
71 uint64_t offset;
72
73 offset = 2 * (uint64_t)qd->qd_id + !test_bit(QDF_USER, &qd->qd_flags);
74 offset *= sizeof(struct gfs2_quota);
75
76 return offset;
77}
78
79static int qd_alloc(struct gfs2_sbd *sdp, int user, uint32_t id,
80 struct gfs2_quota_data **qdp)
81{
82 struct gfs2_quota_data *qd;
83 int error;
84
85 qd = kzalloc(sizeof(struct gfs2_quota_data), GFP_KERNEL);
86 if (!qd)
87 return -ENOMEM;
88
89 qd->qd_count = 1;
90 qd->qd_id = id;
91 if (user)
92 set_bit(QDF_USER, &qd->qd_flags);
93 qd->qd_slot = -1;
94
95 error = gfs2_glock_get(sdp, 2 * (uint64_t)id + !user,
96 &gfs2_quota_glops, CREATE, &qd->qd_gl);
97 if (error)
98 goto fail;
99
100 error = gfs2_lvb_hold(qd->qd_gl);
101 gfs2_glock_put(qd->qd_gl);
102 if (error)
103 goto fail;
104
105 *qdp = qd;
106
107 return 0;
108
109 fail:
110 kfree(qd);
111 return error;
112}
113
114static int qd_get(struct gfs2_sbd *sdp, int user, uint32_t id, int create,
115 struct gfs2_quota_data **qdp)
116{
117 struct gfs2_quota_data *qd = NULL, *new_qd = NULL;
118 int error, found;
119
120 *qdp = NULL;
121
122 for (;;) {
123 found = 0;
124 spin_lock(&sdp->sd_quota_spin);
125 list_for_each_entry(qd, &sdp->sd_quota_list, qd_list) {
126 if (qd->qd_id == id &&
127 !test_bit(QDF_USER, &qd->qd_flags) == !user) {
128 qd->qd_count++;
129 found = 1;
130 break;
131 }
132 }
133
134 if (!found)
135 qd = NULL;
136
137 if (!qd && new_qd) {
138 qd = new_qd;
139 list_add(&qd->qd_list, &sdp->sd_quota_list);
140 atomic_inc(&sdp->sd_quota_count);
141 new_qd = NULL;
142 }
143
144 spin_unlock(&sdp->sd_quota_spin);
145
146 if (qd || !create) {
147 if (new_qd) {
148 gfs2_lvb_unhold(new_qd->qd_gl);
149 kfree(new_qd);
150 }
151 *qdp = qd;
152 return 0;
153 }
154
155 error = qd_alloc(sdp, user, id, &new_qd);
156 if (error)
157 return error;
158 }
159}
160
161static void qd_hold(struct gfs2_quota_data *qd)
162{
163 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
164
165 spin_lock(&sdp->sd_quota_spin);
166 gfs2_assert(sdp, qd->qd_count);
167 qd->qd_count++;
168 spin_unlock(&sdp->sd_quota_spin);
169}
170
171static void qd_put(struct gfs2_quota_data *qd)
172{
173 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
174 spin_lock(&sdp->sd_quota_spin);
175 gfs2_assert(sdp, qd->qd_count);
176 if (!--qd->qd_count)
177 qd->qd_last_touched = jiffies;
178 spin_unlock(&sdp->sd_quota_spin);
179}
180
181static int slot_get(struct gfs2_quota_data *qd)
182{
183 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
184 unsigned int c, o = 0, b;
185 unsigned char byte = 0;
186
187 spin_lock(&sdp->sd_quota_spin);
188
189 if (qd->qd_slot_count++) {
190 spin_unlock(&sdp->sd_quota_spin);
191 return 0;
192 }
193
194 for (c = 0; c < sdp->sd_quota_chunks; c++)
195 for (o = 0; o < PAGE_SIZE; o++) {
196 byte = sdp->sd_quota_bitmap[c][o];
197 if (byte != 0xFF)
198 goto found;
199 }
200
201 goto fail;
202
203 found:
204 for (b = 0; b < 8; b++)
205 if (!(byte & (1 << b)))
206 break;
207 qd->qd_slot = c * (8 * PAGE_SIZE) + o * 8 + b;
208
209 if (qd->qd_slot >= sdp->sd_quota_slots)
210 goto fail;
211
212 sdp->sd_quota_bitmap[c][o] |= 1 << b;
213
214 spin_unlock(&sdp->sd_quota_spin);
215
216 return 0;
217
218 fail:
219 qd->qd_slot_count--;
220 spin_unlock(&sdp->sd_quota_spin);
221 return -ENOSPC;
222}
223
224static void slot_hold(struct gfs2_quota_data *qd)
225{
226 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
227
228 spin_lock(&sdp->sd_quota_spin);
229 gfs2_assert(sdp, qd->qd_slot_count);
230 qd->qd_slot_count++;
231 spin_unlock(&sdp->sd_quota_spin);
232}
233
234static void slot_put(struct gfs2_quota_data *qd)
235{
236 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
237
238 spin_lock(&sdp->sd_quota_spin);
239 gfs2_assert(sdp, qd->qd_slot_count);
240 if (!--qd->qd_slot_count) {
241 gfs2_icbit_munge(sdp, sdp->sd_quota_bitmap, qd->qd_slot, 0);
242 qd->qd_slot = -1;
243 }
244 spin_unlock(&sdp->sd_quota_spin);
245}
246
247static int bh_get(struct gfs2_quota_data *qd)
248{
249 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
250 struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode);
251 unsigned int block, offset;
252 uint64_t dblock;
253 int new = 0;
254 struct buffer_head *bh;
255 int error;
256 int boundary;
257
258 mutex_lock(&sdp->sd_quota_mutex);
259
260 if (qd->qd_bh_count++) {
261 mutex_unlock(&sdp->sd_quota_mutex);
262 return 0;
263 }
264
265 block = qd->qd_slot / sdp->sd_qc_per_block;
266 offset = qd->qd_slot % sdp->sd_qc_per_block;;
267
268 error = gfs2_block_map(&ip->i_inode, block, &new, &dblock, &boundary);
269 if (error)
270 goto fail;
271 error = gfs2_meta_read(ip->i_gl, dblock, DIO_START | DIO_WAIT, &bh);
272 if (error)
273 goto fail;
274 error = -EIO;
275 if (gfs2_metatype_check(sdp, bh, GFS2_METATYPE_QC))
276 goto fail_brelse;
277
278 qd->qd_bh = bh;
279 qd->qd_bh_qc = (struct gfs2_quota_change *)
280 (bh->b_data + sizeof(struct gfs2_meta_header) +
281 offset * sizeof(struct gfs2_quota_change));
282
283 mutex_lock(&sdp->sd_quota_mutex);
284
285 return 0;
286
287 fail_brelse:
288 brelse(bh);
289
290 fail:
291 qd->qd_bh_count--;
292 mutex_unlock(&sdp->sd_quota_mutex);
293 return error;
294}
295
296static void bh_put(struct gfs2_quota_data *qd)
297{
298 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
299
300 mutex_lock(&sdp->sd_quota_mutex);
301 gfs2_assert(sdp, qd->qd_bh_count);
302 if (!--qd->qd_bh_count) {
303 brelse(qd->qd_bh);
304 qd->qd_bh = NULL;
305 qd->qd_bh_qc = NULL;
306 }
307 mutex_unlock(&sdp->sd_quota_mutex);
308}
309
310static int qd_fish(struct gfs2_sbd *sdp, struct gfs2_quota_data **qdp)
311{
312 struct gfs2_quota_data *qd = NULL;
313 int error;
314 int found = 0;
315
316 *qdp = NULL;
317
318 if (sdp->sd_vfs->s_flags & MS_RDONLY)
319 return 0;
320
321 spin_lock(&sdp->sd_quota_spin);
322
323 list_for_each_entry(qd, &sdp->sd_quota_list, qd_list) {
324 if (test_bit(QDF_LOCKED, &qd->qd_flags) ||
325 !test_bit(QDF_CHANGE, &qd->qd_flags) ||
326 qd->qd_sync_gen >= sdp->sd_quota_sync_gen)
327 continue;
328
329 list_move_tail(&qd->qd_list, &sdp->sd_quota_list);
330
331 set_bit(QDF_LOCKED, &qd->qd_flags);
332 gfs2_assert_warn(sdp, qd->qd_count);
333 qd->qd_count++;
334 qd->qd_change_sync = qd->qd_change;
335 gfs2_assert_warn(sdp, qd->qd_slot_count);
336 qd->qd_slot_count++;
337 found = 1;
338
339 break;
340 }
341
342 if (!found)
343 qd = NULL;
344
345 spin_unlock(&sdp->sd_quota_spin);
346
347 if (qd) {
348 gfs2_assert_warn(sdp, qd->qd_change_sync);
349 error = bh_get(qd);
350 if (error) {
351 clear_bit(QDF_LOCKED, &qd->qd_flags);
352 slot_put(qd);
353 qd_put(qd);
354 return error;
355 }
356 }
357
358 *qdp = qd;
359
360 return 0;
361}
362
363static int qd_trylock(struct gfs2_quota_data *qd)
364{
365 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
366
367 if (sdp->sd_vfs->s_flags & MS_RDONLY)
368 return 0;
369
370 spin_lock(&sdp->sd_quota_spin);
371
372 if (test_bit(QDF_LOCKED, &qd->qd_flags) ||
373 !test_bit(QDF_CHANGE, &qd->qd_flags)) {
374 spin_unlock(&sdp->sd_quota_spin);
375 return 0;
376 }
377
378 list_move_tail(&qd->qd_list, &sdp->sd_quota_list);
379
380 set_bit(QDF_LOCKED, &qd->qd_flags);
381 gfs2_assert_warn(sdp, qd->qd_count);
382 qd->qd_count++;
383 qd->qd_change_sync = qd->qd_change;
384 gfs2_assert_warn(sdp, qd->qd_slot_count);
385 qd->qd_slot_count++;
386
387 spin_unlock(&sdp->sd_quota_spin);
388
389 gfs2_assert_warn(sdp, qd->qd_change_sync);
390 if (bh_get(qd)) {
391 clear_bit(QDF_LOCKED, &qd->qd_flags);
392 slot_put(qd);
393 qd_put(qd);
394 return 0;
395 }
396
397 return 1;
398}
399
400static void qd_unlock(struct gfs2_quota_data *qd)
401{
402 gfs2_assert_warn(qd->qd_gl->gl_sbd,
403 test_bit(QDF_LOCKED, &qd->qd_flags));
404 clear_bit(QDF_LOCKED, &qd->qd_flags);
405 bh_put(qd);
406 slot_put(qd);
407 qd_put(qd);
408}
409
410static int qdsb_get(struct gfs2_sbd *sdp, int user, uint32_t id, int create,
411 struct gfs2_quota_data **qdp)
412{
413 int error;
414
415 error = qd_get(sdp, user, id, create, qdp);
416 if (error)
417 return error;
418
419 error = slot_get(*qdp);
420 if (error)
421 goto fail;
422
423 error = bh_get(*qdp);
424 if (error)
425 goto fail_slot;
426
427 return 0;
428
429 fail_slot:
430 slot_put(*qdp);
431
432 fail:
433 qd_put(*qdp);
434 return error;
435}
436
/**
 * qdsb_put - drop the buffer, slot and object references of a quota object
 * @qd: the object
 *
 * References are released in the order buffer, slot, object.
 */
static void qdsb_put(struct gfs2_quota_data *qd)
{
	bh_put(qd);
	slot_put(qd);
	qd_put(qd);
}
443
444int gfs2_quota_hold(struct gfs2_inode *ip, uint32_t uid, uint32_t gid)
445{
446 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
447 struct gfs2_alloc *al = &ip->i_alloc;
448 struct gfs2_quota_data **qd = al->al_qd;
449 int error;
450
451 if (gfs2_assert_warn(sdp, !al->al_qd_num) ||
452 gfs2_assert_warn(sdp, !test_bit(GIF_QD_LOCKED, &ip->i_flags)))
453 return -EIO;
454
455 if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF)
456 return 0;
457
458 error = qdsb_get(sdp, QUOTA_USER, ip->i_di.di_uid, CREATE, qd);
459 if (error)
460 goto out;
461 al->al_qd_num++;
462 qd++;
463
464 error = qdsb_get(sdp, QUOTA_GROUP, ip->i_di.di_gid, CREATE, qd);
465 if (error)
466 goto out;
467 al->al_qd_num++;
468 qd++;
469
470 if (uid != NO_QUOTA_CHANGE && uid != ip->i_di.di_uid) {
471 error = qdsb_get(sdp, QUOTA_USER, uid, CREATE, qd);
472 if (error)
473 goto out;
474 al->al_qd_num++;
475 qd++;
476 }
477
478 if (gid != NO_QUOTA_CHANGE && gid != ip->i_di.di_gid) {
479 error = qdsb_get(sdp, QUOTA_GROUP, gid, CREATE, qd);
480 if (error)
481 goto out;
482 al->al_qd_num++;
483 qd++;
484 }
485
486 out:
487 if (error)
488 gfs2_quota_unhold(ip);
489
490 return error;
491}
492
493void gfs2_quota_unhold(struct gfs2_inode *ip)
494{
495 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
496 struct gfs2_alloc *al = &ip->i_alloc;
497 unsigned int x;
498
499 gfs2_assert_warn(sdp, !test_bit(GIF_QD_LOCKED, &ip->i_flags));
500
501 for (x = 0; x < al->al_qd_num; x++) {
502 qdsb_put(al->al_qd[x]);
503 al->al_qd[x] = NULL;
504 }
505 al->al_qd_num = 0;
506}
507
508static int sort_qd(const void *a, const void *b)
509{
510 struct gfs2_quota_data *qd_a = *(struct gfs2_quota_data **)a;
511 struct gfs2_quota_data *qd_b = *(struct gfs2_quota_data **)b;
512 int ret = 0;
513
514 if (!test_bit(QDF_USER, &qd_a->qd_flags) !=
515 !test_bit(QDF_USER, &qd_b->qd_flags)) {
516 if (test_bit(QDF_USER, &qd_a->qd_flags))
517 ret = -1;
518 else
519 ret = 1;
520 } else {
521 if (qd_a->qd_id < qd_b->qd_id)
522 ret = -1;
523 else if (qd_a->qd_id > qd_b->qd_id)
524 ret = 1;
525 }
526
527 return ret;
528}
529
530static void do_qc(struct gfs2_quota_data *qd, int64_t change)
531{
532 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
533 struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode);
534 struct gfs2_quota_change *qc = qd->qd_bh_qc;
535 int64_t x;
536
537 mutex_lock(&sdp->sd_quota_mutex);
538 gfs2_trans_add_bh(ip->i_gl, qd->qd_bh, 1);
539
540 if (!test_bit(QDF_CHANGE, &qd->qd_flags)) {
541 qc->qc_change = 0;
542 qc->qc_flags = 0;
543 if (test_bit(QDF_USER, &qd->qd_flags))
544 qc->qc_flags = cpu_to_be32(GFS2_QCF_USER);
545 qc->qc_id = cpu_to_be32(qd->qd_id);
546 }
547
548 x = qc->qc_change;
549 x = be64_to_cpu(x) + change;
550 qc->qc_change = cpu_to_be64(x);
551
552 spin_lock(&sdp->sd_quota_spin);
553 qd->qd_change = x;
554 spin_unlock(&sdp->sd_quota_spin);
555
556 if (!x) {
557 gfs2_assert_warn(sdp, test_bit(QDF_CHANGE, &qd->qd_flags));
558 clear_bit(QDF_CHANGE, &qd->qd_flags);
559 qc->qc_flags = 0;
560 qc->qc_id = 0;
561 slot_put(qd);
562 qd_put(qd);
563 } else if (!test_and_set_bit(QDF_CHANGE, &qd->qd_flags)) {
564 qd_hold(qd);
565 slot_hold(qd);
566 }
567
568 mutex_unlock(&sdp->sd_quota_mutex);
569}
570
571/**
572 * gfs2_adjust_quota
573 *
574 * This function was mostly borrowed from gfs2_block_truncate_page which was
575 * in turn mostly borrowed from ext3
576 */
577static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
578 int64_t change, struct gfs2_quota_data *qd)
579{
580 struct inode *inode = &ip->i_inode;
581 struct address_space *mapping = inode->i_mapping;
582 unsigned long index = loc >> PAGE_CACHE_SHIFT;
583 unsigned offset = loc & (PAGE_CACHE_SHIFT - 1);
584 unsigned blocksize, iblock, pos;
585 struct buffer_head *bh;
586 struct page *page;
587 void *kaddr;
588 __be64 *ptr;
589 u64 value;
590 int err = -EIO;
591
592 page = grab_cache_page(mapping, index);
593 if (!page)
594 return -ENOMEM;
595
596 blocksize = inode->i_sb->s_blocksize;
597 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
598
599 if (!page_has_buffers(page))
600 create_empty_buffers(page, blocksize, 0);
601
602 bh = page_buffers(page);
603 pos = blocksize;
604 while (offset >= pos) {
605 bh = bh->b_this_page;
606 iblock++;
607 pos += blocksize;
608 }
609
610 if (!buffer_mapped(bh)) {
611 gfs2_get_block(inode, iblock, bh, 1);
612 if (!buffer_mapped(bh))
613 goto unlock;
614 }
615
616 if (PageUptodate(page))
617 set_buffer_uptodate(bh);
618
619 if (!buffer_uptodate(bh)) {
620 ll_rw_block(READ, 1, &bh);
621 wait_on_buffer(bh);
622 if (!buffer_uptodate(bh))
623 goto unlock;
624 }
625
626 gfs2_trans_add_bh(ip->i_gl, bh, 0);
627
628 kaddr = kmap_atomic(page, KM_USER0);
629 ptr = (__be64 *)(kaddr + offset);
630 value = *ptr = cpu_to_be64(be64_to_cpu(*ptr) + change);
631 flush_dcache_page(page);
632 kunmap_atomic(kaddr, KM_USER0);
633 err = 0;
634 qd->qd_qb.qb_magic = cpu_to_be32(GFS2_MAGIC);
635#if 0
636 qd->qd_qb.qb_limit = cpu_to_be64(q.qu_limit);
637 qd->qd_qb.qb_warn = cpu_to_be64(q.qu_warn);
638#endif
639 qd->qd_qb.qb_value = cpu_to_be64(value);
640unlock:
641 unlock_page(page);
642 page_cache_release(page);
643 return err;
644}
645
646static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
647{
648 struct gfs2_sbd *sdp = (*qda)->qd_gl->gl_sbd;
649 struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode);
650 unsigned int data_blocks, ind_blocks;
651 struct gfs2_holder *ghs, i_gh;
652 unsigned int qx, x;
653 struct gfs2_quota_data *qd;
654 loff_t offset;
655 unsigned int nalloc = 0;
656 struct gfs2_alloc *al = NULL;
657 int error;
658
659 gfs2_write_calc_reserv(ip, sizeof(struct gfs2_quota),
660 &data_blocks, &ind_blocks);
661
662 ghs = kcalloc(num_qd, sizeof(struct gfs2_holder), GFP_KERNEL);
663 if (!ghs)
664 return -ENOMEM;
665
666 sort(qda, num_qd, sizeof(struct gfs2_quota_data *), sort_qd, NULL);
667 for (qx = 0; qx < num_qd; qx++) {
668 error = gfs2_glock_nq_init(qda[qx]->qd_gl,
669 LM_ST_EXCLUSIVE,
670 GL_NOCACHE, &ghs[qx]);
671 if (error)
672 goto out;
673 }
674
675 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
676 if (error)
677 goto out;
678
679 for (x = 0; x < num_qd; x++) {
680 int alloc_required;
681
682 offset = qd2offset(qda[x]);
683 error = gfs2_write_alloc_required(ip, offset,
684 sizeof(struct gfs2_quota),
685 &alloc_required);
686 if (error)
687 goto out_gunlock;
688 if (alloc_required)
689 nalloc++;
690 }
691
692 if (nalloc) {
693 al = gfs2_alloc_get(ip);
694
695 al->al_requested = nalloc * (data_blocks + ind_blocks);
696
697 error = gfs2_inplace_reserve(ip);
698 if (error)
699 goto out_alloc;
700
701 error = gfs2_trans_begin(sdp,
702 al->al_rgd->rd_ri.ri_length +
703 num_qd * data_blocks +
704 nalloc * ind_blocks +
705 RES_DINODE + num_qd +
706 RES_STATFS, 0);
707 if (error)
708 goto out_ipres;
709 } else {
710 error = gfs2_trans_begin(sdp,
711 num_qd * data_blocks +
712 RES_DINODE + num_qd, 0);
713 if (error)
714 goto out_gunlock;
715 }
716
717 for (x = 0; x < num_qd; x++) {
718 qd = qda[x];
719 offset = qd2offset(qd);
720 error = gfs2_adjust_quota(ip, offset, qd->qd_change_sync,
721 (struct gfs2_quota_data *)
722 qd->qd_gl->gl_lvb);
723 if (error)
724 goto out_end_trans;
725
726 do_qc(qd, -qd->qd_change_sync);
727 }
728
729 error = 0;
730
731 out_end_trans:
732 gfs2_trans_end(sdp);
733
734 out_ipres:
735 if (nalloc)
736 gfs2_inplace_release(ip);
737
738 out_alloc:
739 if (nalloc)
740 gfs2_alloc_put(ip);
741
742 out_gunlock:
743 gfs2_glock_dq_uninit(&i_gh);
744
745 out:
746 while (qx--)
747 gfs2_glock_dq_uninit(&ghs[qx]);
748 kfree(ghs);
749 gfs2_log_flush(ip->i_gl->gl_sbd, ip->i_gl);
750
751 return error;
752}
753
754static int do_glock(struct gfs2_quota_data *qd, int force_refresh,
755 struct gfs2_holder *q_gh)
756{
757 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
758 struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode);
759 struct gfs2_holder i_gh;
760 struct gfs2_quota q;
761 char buf[sizeof(struct gfs2_quota)];
762 struct file_ra_state ra_state;
763 int error;
764
765 file_ra_state_init(&ra_state, sdp->sd_quota_inode->i_mapping);
766 restart:
767 error = gfs2_glock_nq_init(qd->qd_gl, LM_ST_SHARED, 0, q_gh);
768 if (error)
769 return error;
770
771 gfs2_quota_lvb_in(&qd->qd_qb, qd->qd_gl->gl_lvb);
772
773 if (force_refresh || qd->qd_qb.qb_magic != GFS2_MAGIC) {
774 loff_t pos;
775 gfs2_glock_dq_uninit(q_gh);
776 error = gfs2_glock_nq_init(qd->qd_gl,
777 LM_ST_EXCLUSIVE, GL_NOCACHE,
778 q_gh);
779 if (error)
780 return error;
781
782 error = gfs2_glock_nq_init(ip->i_gl,
783 LM_ST_SHARED, 0,
784 &i_gh);
785 if (error)
786 goto fail;
787
788 memset(buf, 0, sizeof(struct gfs2_quota));
789 pos = qd2offset(qd);
790 error = gfs2_internal_read(ip, &ra_state, buf,
791 &pos, sizeof(struct gfs2_quota));
792 if (error < 0)
793 goto fail_gunlock;
794
795 gfs2_glock_dq_uninit(&i_gh);
796
797 gfs2_quota_in(&q, buf);
798
799 memset(&qd->qd_qb, 0, sizeof(struct gfs2_quota_lvb));
800 qd->qd_qb.qb_magic = GFS2_MAGIC;
801 qd->qd_qb.qb_limit = q.qu_limit;
802 qd->qd_qb.qb_warn = q.qu_warn;
803 qd->qd_qb.qb_value = q.qu_value;
804
805 gfs2_quota_lvb_out(&qd->qd_qb, qd->qd_gl->gl_lvb);
806
807 if (gfs2_glock_is_blocking(qd->qd_gl)) {
808 gfs2_glock_dq_uninit(q_gh);
809 force_refresh = 0;
810 goto restart;
811 }
812 }
813
814 return 0;
815
816 fail_gunlock:
817 gfs2_glock_dq_uninit(&i_gh);
818
819 fail:
820 gfs2_glock_dq_uninit(q_gh);
821
822 return error;
823}
824
825int gfs2_quota_lock(struct gfs2_inode *ip, uint32_t uid, uint32_t gid)
826{
827 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
828 struct gfs2_alloc *al = &ip->i_alloc;
829 unsigned int x;
830 int error = 0;
831
832 gfs2_quota_hold(ip, uid, gid);
833
834 if (capable(CAP_SYS_RESOURCE) ||
835 sdp->sd_args.ar_quota != GFS2_QUOTA_ON)
836 return 0;
837
838 sort(al->al_qd, al->al_qd_num, sizeof(struct gfs2_quota_data *),
839 sort_qd, NULL);
840
841 for (x = 0; x < al->al_qd_num; x++) {
842 error = do_glock(al->al_qd[x], NO_FORCE, &al->al_qd_ghs[x]);
843 if (error)
844 break;
845 }
846
847 if (!error)
848 set_bit(GIF_QD_LOCKED, &ip->i_flags);
849 else {
850 while (x--)
851 gfs2_glock_dq_uninit(&al->al_qd_ghs[x]);
852 gfs2_quota_unhold(ip);
853 }
854
855 return error;
856}
857
858static int need_sync(struct gfs2_quota_data *qd)
859{
860 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
861 struct gfs2_tune *gt = &sdp->sd_tune;
862 int64_t value;
863 unsigned int num, den;
864 int do_sync = 1;
865
866 if (!qd->qd_qb.qb_limit)
867 return 0;
868
869 spin_lock(&sdp->sd_quota_spin);
870 value = qd->qd_change;
871 spin_unlock(&sdp->sd_quota_spin);
872
873 spin_lock(&gt->gt_spin);
874 num = gt->gt_quota_scale_num;
875 den = gt->gt_quota_scale_den;
876 spin_unlock(&gt->gt_spin);
877
878 if (value < 0)
879 do_sync = 0;
880 else if (qd->qd_qb.qb_value >= (int64_t)qd->qd_qb.qb_limit)
881 do_sync = 0;
882 else {
883 value *= gfs2_jindex_size(sdp) * num;
884 do_div(value, den);
885 value += qd->qd_qb.qb_value;
886 if (value < (int64_t)qd->qd_qb.qb_limit)
887 do_sync = 0;
888 }
889
890 return do_sync;
891}
892
893void gfs2_quota_unlock(struct gfs2_inode *ip)
894{
895 struct gfs2_alloc *al = &ip->i_alloc;
896 struct gfs2_quota_data *qda[4];
897 unsigned int count = 0;
898 unsigned int x;
899
900 if (!test_and_clear_bit(GIF_QD_LOCKED, &ip->i_flags))
901 goto out;
902
903 for (x = 0; x < al->al_qd_num; x++) {
904 struct gfs2_quota_data *qd;
905 int sync;
906
907 qd = al->al_qd[x];
908 sync = need_sync(qd);
909
910 gfs2_glock_dq_uninit(&al->al_qd_ghs[x]);
911
912 if (sync && qd_trylock(qd))
913 qda[count++] = qd;
914 }
915
916 if (count) {
917 do_sync(count, qda);
918 for (x = 0; x < count; x++)
919 qd_unlock(qda[x]);
920 }
921
922 out:
923 gfs2_quota_unhold(ip);
924}
925
926#define MAX_LINE 256
927
928static int print_message(struct gfs2_quota_data *qd, char *type)
929{
930 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
931
932 printk(KERN_INFO "GFS2: fsid=%s: quota %s for %s %u\r\n",
933 sdp->sd_fsname, type,
934 (test_bit(QDF_USER, &qd->qd_flags)) ? "user" : "group",
935 qd->qd_id);
936
937 return 0;
938}
939
940int gfs2_quota_check(struct gfs2_inode *ip, uint32_t uid, uint32_t gid)
941{
942 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
943 struct gfs2_alloc *al = &ip->i_alloc;
944 struct gfs2_quota_data *qd;
945 int64_t value;
946 unsigned int x;
947 int error = 0;
948
949 if (!test_bit(GIF_QD_LOCKED, &ip->i_flags))
950 return 0;
951
952 if (sdp->sd_args.ar_quota != GFS2_QUOTA_ON)
953 return 0;
954
955 for (x = 0; x < al->al_qd_num; x++) {
956 qd = al->al_qd[x];
957
958 if (!((qd->qd_id == uid && test_bit(QDF_USER, &qd->qd_flags)) ||
959 (qd->qd_id == gid && !test_bit(QDF_USER, &qd->qd_flags))))
960 continue;
961
962 value = qd->qd_qb.qb_value;
963 spin_lock(&sdp->sd_quota_spin);
964 value += qd->qd_change;
965 spin_unlock(&sdp->sd_quota_spin);
966
967 if (qd->qd_qb.qb_limit && (int64_t)qd->qd_qb.qb_limit < value) {
968 print_message(qd, "exceeded");
969 error = -EDQUOT;
970 break;
971 } else if (qd->qd_qb.qb_warn &&
972 (int64_t)qd->qd_qb.qb_warn < value &&
973 time_after_eq(jiffies, qd->qd_last_warn +
974 gfs2_tune_get(sdp,
975 gt_quota_warn_period) * HZ)) {
976 error = print_message(qd, "warning");
977 qd->qd_last_warn = jiffies;
978 }
979 }
980
981 return error;
982}
983
984void gfs2_quota_change(struct gfs2_inode *ip, int64_t change,
985 uint32_t uid, uint32_t gid)
986{
987 struct gfs2_alloc *al = &ip->i_alloc;
988 struct gfs2_quota_data *qd;
989 unsigned int x;
990 unsigned int found = 0;
991
992 if (gfs2_assert_warn(GFS2_SB(&ip->i_inode), change))
993 return;
994 if (ip->i_di.di_flags & GFS2_DIF_SYSTEM)
995 return;
996
997 for (x = 0; x < al->al_qd_num; x++) {
998 qd = al->al_qd[x];
999
1000 if ((qd->qd_id == uid && test_bit(QDF_USER, &qd->qd_flags)) ||
1001 (qd->qd_id == gid && !test_bit(QDF_USER, &qd->qd_flags))) {
1002 do_qc(qd, change);
1003 found++;
1004 }
1005 }
1006}
1007
1008int gfs2_quota_sync(struct gfs2_sbd *sdp)
1009{
1010 struct gfs2_quota_data **qda;
1011 unsigned int max_qd = gfs2_tune_get(sdp, gt_quota_simul_sync);
1012 unsigned int num_qd;
1013 unsigned int x;
1014 int error = 0;
1015
1016 sdp->sd_quota_sync_gen++;
1017
1018 qda = kcalloc(max_qd, sizeof(struct gfs2_quota_data *), GFP_KERNEL);
1019 if (!qda)
1020 return -ENOMEM;
1021
1022 do {
1023 num_qd = 0;
1024
1025 for (;;) {
1026 error = qd_fish(sdp, qda + num_qd);
1027 if (error || !qda[num_qd])
1028 break;
1029 if (++num_qd == max_qd)
1030 break;
1031 }
1032
1033 if (num_qd) {
1034 if (!error)
1035 error = do_sync(num_qd, qda);
1036 if (!error)
1037 for (x = 0; x < num_qd; x++)
1038 qda[x]->qd_sync_gen =
1039 sdp->sd_quota_sync_gen;
1040
1041 for (x = 0; x < num_qd; x++)
1042 qd_unlock(qda[x]);
1043 }
1044 } while (!error && num_qd == max_qd);
1045
1046 kfree(qda);
1047
1048 return error;
1049}
1050
1051int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, uint32_t id)
1052{
1053 struct gfs2_quota_data *qd;
1054 struct gfs2_holder q_gh;
1055 int error;
1056
1057 error = qd_get(sdp, user, id, CREATE, &qd);
1058 if (error)
1059 return error;
1060
1061 error = do_glock(qd, FORCE, &q_gh);
1062 if (!error)
1063 gfs2_glock_dq_uninit(&q_gh);
1064
1065 qd_put(qd);
1066
1067 return error;
1068}
1069
#if 0
/*
 * gfs2_quota_read - report limit, warn level and current value for an ID
 *
 * Compiled out.  Access is refused with -EACCES unless the ID is the
 * caller's own fsuid (user) or one of its groups (group), or the caller has
 * CAP_SYS_ADMIN.  The reported value is the cached value plus the local
 * change.
 */
int gfs2_quota_read(struct gfs2_sbd *sdp, int user, uint32_t id,
		    struct gfs2_quota *q)
{
	struct gfs2_quota_data *qd;
	struct gfs2_holder q_gh;
	int error;

	if (((user) ? (id != current->fsuid) : (!in_group_p(id))) &&
	    !capable(CAP_SYS_ADMIN))
		return -EACCES;

	error = qd_get(sdp, user, id, CREATE, &qd);
	if (error)
		return error;

	error = do_glock(qd, NO_FORCE, &q_gh);
	if (!error) {
		memset(q, 0, sizeof(struct gfs2_quota));
		q->qu_limit = qd->qd_qb.qb_limit;
		q->qu_warn = qd->qd_qb.qb_warn;
		q->qu_value = qd->qd_qb.qb_value;

		spin_lock(&sdp->sd_quota_spin);
		q->qu_value += qd->qd_change;
		spin_unlock(&sdp->sd_quota_spin);

		gfs2_glock_dq_uninit(&q_gh);
	}

	qd_put(qd);

	return error;
}
#endif /* 0 */
1107
1108int gfs2_quota_init(struct gfs2_sbd *sdp)
1109{
1110 struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode);
1111 unsigned int blocks = ip->i_di.di_size >> sdp->sd_sb.sb_bsize_shift;
1112 unsigned int x, slot = 0;
1113 unsigned int found = 0;
1114 uint64_t dblock;
1115 uint32_t extlen = 0;
1116 int error;
1117
1118 if (!ip->i_di.di_size ||
1119 ip->i_di.di_size > (64 << 20) ||
1120 ip->i_di.di_size & (sdp->sd_sb.sb_bsize - 1)) {
1121 gfs2_consist_inode(ip);
1122 return -EIO;
1123 }
1124 sdp->sd_quota_slots = blocks * sdp->sd_qc_per_block;
1125 sdp->sd_quota_chunks = DIV_ROUND_UP(sdp->sd_quota_slots, 8 * PAGE_SIZE);
1126
1127 error = -ENOMEM;
1128
1129 sdp->sd_quota_bitmap = kcalloc(sdp->sd_quota_chunks,
1130 sizeof(unsigned char *), GFP_KERNEL);
1131 if (!sdp->sd_quota_bitmap)
1132 return error;
1133
1134 for (x = 0; x < sdp->sd_quota_chunks; x++) {
1135 sdp->sd_quota_bitmap[x] = kzalloc(PAGE_SIZE, GFP_KERNEL);
1136 if (!sdp->sd_quota_bitmap[x])
1137 goto fail;
1138 }
1139
1140 for (x = 0; x < blocks; x++) {
1141 struct buffer_head *bh;
1142 unsigned int y;
1143
1144 if (!extlen) {
1145 int new = 0;
1146 error = gfs2_extent_map(&ip->i_inode, x, &new, &dblock, &extlen);
1147 if (error)
1148 goto fail;
1149 }
1150 gfs2_meta_ra(ip->i_gl, dblock, extlen);
1151 error = gfs2_meta_read(ip->i_gl, dblock, DIO_START | DIO_WAIT,
1152 &bh);
1153 if (error)
1154 goto fail;
1155 error = -EIO;
1156 if (gfs2_metatype_check(sdp, bh, GFS2_METATYPE_QC)) {
1157 brelse(bh);
1158 goto fail;
1159 }
1160
1161 for (y = 0;
1162 y < sdp->sd_qc_per_block && slot < sdp->sd_quota_slots;
1163 y++, slot++) {
1164 struct gfs2_quota_change qc;
1165 struct gfs2_quota_data *qd;
1166
1167 gfs2_quota_change_in(&qc, bh->b_data +
1168 sizeof(struct gfs2_meta_header) +
1169 y * sizeof(struct gfs2_quota_change));
1170 if (!qc.qc_change)
1171 continue;
1172
1173 error = qd_alloc(sdp, (qc.qc_flags & GFS2_QCF_USER),
1174 qc.qc_id, &qd);
1175 if (error) {
1176 brelse(bh);
1177 goto fail;
1178 }
1179
1180 set_bit(QDF_CHANGE, &qd->qd_flags);
1181 qd->qd_change = qc.qc_change;
1182 qd->qd_slot = slot;
1183 qd->qd_slot_count = 1;
1184 qd->qd_last_touched = jiffies;
1185
1186 spin_lock(&sdp->sd_quota_spin);
1187 gfs2_icbit_munge(sdp, sdp->sd_quota_bitmap, slot, 1);
1188 list_add(&qd->qd_list, &sdp->sd_quota_list);
1189 atomic_inc(&sdp->sd_quota_count);
1190 spin_unlock(&sdp->sd_quota_spin);
1191
1192 found++;
1193 }
1194
1195 brelse(bh);
1196 dblock++;
1197 extlen--;
1198 }
1199
1200 if (found)
1201 fs_info(sdp, "found %u quota changes\n", found);
1202
1203 return 0;
1204
1205 fail:
1206 gfs2_quota_cleanup(sdp);
1207 return error;
1208}
1209
1210void gfs2_quota_scan(struct gfs2_sbd *sdp)
1211{
1212 struct gfs2_quota_data *qd, *safe;
1213 LIST_HEAD(dead);
1214
1215 spin_lock(&sdp->sd_quota_spin);
1216 list_for_each_entry_safe(qd, safe, &sdp->sd_quota_list, qd_list) {
1217 if (!qd->qd_count &&
1218 time_after_eq(jiffies, qd->qd_last_touched +
1219 gfs2_tune_get(sdp, gt_quota_cache_secs) * HZ)) {
1220 list_move(&qd->qd_list, &dead);
1221 gfs2_assert_warn(sdp,
1222 atomic_read(&sdp->sd_quota_count) > 0);
1223 atomic_dec(&sdp->sd_quota_count);
1224 }
1225 }
1226 spin_unlock(&sdp->sd_quota_spin);
1227
1228 while (!list_empty(&dead)) {
1229 qd = list_entry(dead.next, struct gfs2_quota_data, qd_list);
1230 list_del(&qd->qd_list);
1231
1232 gfs2_assert_warn(sdp, !qd->qd_change);
1233 gfs2_assert_warn(sdp, !qd->qd_slot_count);
1234 gfs2_assert_warn(sdp, !qd->qd_bh_count);
1235
1236 gfs2_lvb_unhold(qd->qd_gl);
1237 kfree(qd);
1238 }
1239}
1240
1241void gfs2_quota_cleanup(struct gfs2_sbd *sdp)
1242{
1243 struct list_head *head = &sdp->sd_quota_list;
1244 struct gfs2_quota_data *qd;
1245 unsigned int x;
1246
1247 spin_lock(&sdp->sd_quota_spin);
1248 while (!list_empty(head)) {
1249 qd = list_entry(head->prev, struct gfs2_quota_data, qd_list);
1250
1251 if (qd->qd_count > 1 ||
1252 (qd->qd_count && !test_bit(QDF_CHANGE, &qd->qd_flags))) {
1253 list_move(&qd->qd_list, head);
1254 spin_unlock(&sdp->sd_quota_spin);
1255 schedule();
1256 spin_lock(&sdp->sd_quota_spin);
1257 continue;
1258 }
1259
1260 list_del(&qd->qd_list);
1261 atomic_dec(&sdp->sd_quota_count);
1262 spin_unlock(&sdp->sd_quota_spin);
1263
1264 if (!qd->qd_count) {
1265 gfs2_assert_warn(sdp, !qd->qd_change);
1266 gfs2_assert_warn(sdp, !qd->qd_slot_count);
1267 } else
1268 gfs2_assert_warn(sdp, qd->qd_slot_count == 1);
1269 gfs2_assert_warn(sdp, !qd->qd_bh_count);
1270
1271 gfs2_lvb_unhold(qd->qd_gl);
1272 kfree(qd);
1273
1274 spin_lock(&sdp->sd_quota_spin);
1275 }
1276 spin_unlock(&sdp->sd_quota_spin);
1277
1278 gfs2_assert_warn(sdp, !atomic_read(&sdp->sd_quota_count));
1279
1280 if (sdp->sd_quota_bitmap) {
1281 for (x = 0; x < sdp->sd_quota_chunks; x++)
1282 kfree(sdp->sd_quota_bitmap[x]);
1283 kfree(sdp->sd_quota_bitmap);
1284 }
1285}
1286
diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h
new file mode 100644
index 000000000000..af05492f9644
--- /dev/null
+++ b/fs/gfs2/quota.h
@@ -0,0 +1,32 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __QUOTA_DOT_H__
11#define __QUOTA_DOT_H__
12
13#define NO_QUOTA_CHANGE ((uint32_t)-1)
14
15int gfs2_quota_hold(struct gfs2_inode *ip, uint32_t uid, uint32_t gid);
16void gfs2_quota_unhold(struct gfs2_inode *ip);
17
18int gfs2_quota_lock(struct gfs2_inode *ip, uint32_t uid, uint32_t gid);
19void gfs2_quota_unlock(struct gfs2_inode *ip);
20
21int gfs2_quota_check(struct gfs2_inode *ip, uint32_t uid, uint32_t gid);
22void gfs2_quota_change(struct gfs2_inode *ip, int64_t change,
23 uint32_t uid, uint32_t gid);
24
25int gfs2_quota_sync(struct gfs2_sbd *sdp);
26int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, uint32_t id);
27
28int gfs2_quota_init(struct gfs2_sbd *sdp);
29void gfs2_quota_scan(struct gfs2_sbd *sdp);
30void gfs2_quota_cleanup(struct gfs2_sbd *sdp);
31
32#endif /* __QUOTA_DOT_H__ */
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
new file mode 100644
index 000000000000..7aabc03e4abd
--- /dev/null
+++ b/fs/gfs2/recovery.c
@@ -0,0 +1,575 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/gfs2_ondisk.h>
16#include <linux/crc32.h>
17
18#include "gfs2.h"
19#include "lm_interface.h"
20#include "incore.h"
21#include "bmap.h"
22#include "glock.h"
23#include "glops.h"
24#include "lm.h"
25#include "lops.h"
26#include "meta_io.h"
27#include "recovery.h"
28#include "super.h"
29#include "util.h"
30#include "dir.h"
31
32int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk,
33 struct buffer_head **bh)
34{
35 struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
36 struct gfs2_glock *gl = ip->i_gl;
37 int new = 0;
38 uint64_t dblock;
39 uint32_t extlen;
40 int error;
41
42 error = gfs2_extent_map(&ip->i_inode, blk, &new, &dblock, &extlen);
43 if (error)
44 return error;
45 if (!dblock) {
46 gfs2_consist_inode(ip);
47 return -EIO;
48 }
49
50 gfs2_meta_ra(gl, dblock, extlen);
51 error = gfs2_meta_read(gl, dblock, DIO_START | DIO_WAIT, bh);
52
53 return error;
54}
55
56int gfs2_revoke_add(struct gfs2_sbd *sdp, uint64_t blkno, unsigned int where)
57{
58 struct list_head *head = &sdp->sd_revoke_list;
59 struct gfs2_revoke_replay *rr;
60 int found = 0;
61
62 list_for_each_entry(rr, head, rr_list) {
63 if (rr->rr_blkno == blkno) {
64 found = 1;
65 break;
66 }
67 }
68
69 if (found) {
70 rr->rr_where = where;
71 return 0;
72 }
73
74 rr = kmalloc(sizeof(struct gfs2_revoke_replay), GFP_KERNEL);
75 if (!rr)
76 return -ENOMEM;
77
78 rr->rr_blkno = blkno;
79 rr->rr_where = where;
80 list_add(&rr->rr_list, head);
81
82 return 1;
83}
84
85int gfs2_revoke_check(struct gfs2_sbd *sdp, uint64_t blkno, unsigned int where)
86{
87 struct gfs2_revoke_replay *rr;
88 int wrap, a, b, revoke;
89 int found = 0;
90
91 list_for_each_entry(rr, &sdp->sd_revoke_list, rr_list) {
92 if (rr->rr_blkno == blkno) {
93 found = 1;
94 break;
95 }
96 }
97
98 if (!found)
99 return 0;
100
101 wrap = (rr->rr_where < sdp->sd_replay_tail);
102 a = (sdp->sd_replay_tail < where);
103 b = (where < rr->rr_where);
104 revoke = (wrap) ? (a || b) : (a && b);
105
106 return revoke;
107}
108
109void gfs2_revoke_clean(struct gfs2_sbd *sdp)
110{
111 struct list_head *head = &sdp->sd_revoke_list;
112 struct gfs2_revoke_replay *rr;
113
114 while (!list_empty(head)) {
115 rr = list_entry(head->next, struct gfs2_revoke_replay, rr_list);
116 list_del(&rr->rr_list);
117 kfree(rr);
118 }
119}
120
121/**
122 * get_log_header - read the log header for a given segment
123 * @jd: the journal
124 * @blk: the block to look at
125 * @lh: the log header to return
126 *
127 * Read the log header for a given segement in a given journal. Do a few
128 * sanity checks on it.
129 *
130 * Returns: 0 on success,
131 * 1 if the header was invalid or incomplete,
132 * errno on error
133 */
134
135static int get_log_header(struct gfs2_jdesc *jd, unsigned int blk,
136 struct gfs2_log_header *head)
137{
138 struct buffer_head *bh;
139 struct gfs2_log_header lh;
140 uint32_t hash;
141 int error;
142
143 error = gfs2_replay_read_block(jd, blk, &bh);
144 if (error)
145 return error;
146
147 memcpy(&lh, bh->b_data, sizeof(struct gfs2_log_header));
148 lh.lh_hash = 0;
149 hash = gfs2_disk_hash((char *)&lh, sizeof(struct gfs2_log_header));
150 gfs2_log_header_in(&lh, bh->b_data);
151
152 brelse(bh);
153
154 if (lh.lh_header.mh_magic != GFS2_MAGIC ||
155 lh.lh_header.mh_type != GFS2_METATYPE_LH ||
156 lh.lh_blkno != blk ||
157 lh.lh_hash != hash)
158 return 1;
159
160 *head = lh;
161
162 return 0;
163}
164
165/**
166 * find_good_lh - find a good log header
167 * @jd: the journal
168 * @blk: the segment to start searching from
169 * @lh: the log header to fill in
170 * @forward: if true search forward in the log, else search backward
171 *
172 * Call get_log_header() to get a log header for a segment, but if the
173 * segment is bad, either scan forward or backward until we find a good one.
174 *
175 * Returns: errno
176 */
177
178static int find_good_lh(struct gfs2_jdesc *jd, unsigned int *blk,
179 struct gfs2_log_header *head)
180{
181 unsigned int orig_blk = *blk;
182 int error;
183
184 for (;;) {
185 error = get_log_header(jd, *blk, head);
186 if (error <= 0)
187 return error;
188
189 if (++*blk == jd->jd_blocks)
190 *blk = 0;
191
192 if (*blk == orig_blk) {
193 gfs2_consist_inode(GFS2_I(jd->jd_inode));
194 return -EIO;
195 }
196 }
197}
198
199/**
200 * jhead_scan - make sure we've found the head of the log
201 * @jd: the journal
202 * @head: this is filled in with the log descriptor of the head
203 *
204 * At this point, seg and lh should be either the head of the log or just
205 * before. Scan forward until we find the head.
206 *
207 * Returns: errno
208 */
209
210static int jhead_scan(struct gfs2_jdesc *jd, struct gfs2_log_header *head)
211{
212 unsigned int blk = head->lh_blkno;
213 struct gfs2_log_header lh;
214 int error;
215
216 for (;;) {
217 if (++blk == jd->jd_blocks)
218 blk = 0;
219
220 error = get_log_header(jd, blk, &lh);
221 if (error < 0)
222 return error;
223 if (error == 1)
224 continue;
225
226 if (lh.lh_sequence == head->lh_sequence) {
227 gfs2_consist_inode(GFS2_I(jd->jd_inode));
228 return -EIO;
229 }
230 if (lh.lh_sequence < head->lh_sequence)
231 break;
232
233 *head = lh;
234 }
235
236 return 0;
237}
238
239/**
240 * gfs2_find_jhead - find the head of a log
241 * @jd: the journal
242 * @head: the log descriptor for the head of the log is returned here
243 *
244 * Do a binary search of a journal and find the valid log entry with the
245 * highest sequence number. (i.e. the log head)
246 *
247 * Returns: errno
248 */
249
250int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header *head)
251{
252 struct gfs2_log_header lh_1, lh_m;
253 uint32_t blk_1, blk_2, blk_m;
254 int error;
255
256 blk_1 = 0;
257 blk_2 = jd->jd_blocks - 1;
258
259 for (;;) {
260 blk_m = (blk_1 + blk_2) / 2;
261
262 error = find_good_lh(jd, &blk_1, &lh_1);
263 if (error)
264 return error;
265
266 error = find_good_lh(jd, &blk_m, &lh_m);
267 if (error)
268 return error;
269
270 if (blk_1 == blk_m || blk_m == blk_2)
271 break;
272
273 if (lh_1.lh_sequence <= lh_m.lh_sequence)
274 blk_1 = blk_m;
275 else
276 blk_2 = blk_m;
277 }
278
279 error = jhead_scan(jd, &lh_1);
280 if (error)
281 return error;
282
283 *head = lh_1;
284
285 return error;
286}
287
288/**
289 * foreach_descriptor - go through the active part of the log
290 * @jd: the journal
291 * @start: the first log header in the active region
292 * @end: the last log header (don't process the contents of this entry))
293 *
294 * Call a given function once for every log descriptor in the active
295 * portion of the log.
296 *
297 * Returns: errno
298 */
299
300static int foreach_descriptor(struct gfs2_jdesc *jd, unsigned int start,
301 unsigned int end, int pass)
302{
303 struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
304 struct buffer_head *bh;
305 struct gfs2_log_descriptor *ld;
306 int error = 0;
307 u32 length;
308 __be64 *ptr;
309 unsigned int offset = sizeof(struct gfs2_log_descriptor);
310 offset += (sizeof(__be64)-1);
311 offset &= ~(sizeof(__be64)-1);
312
313 while (start != end) {
314 error = gfs2_replay_read_block(jd, start, &bh);
315 if (error)
316 return error;
317 if (gfs2_meta_check(sdp, bh)) {
318 brelse(bh);
319 return -EIO;
320 }
321 ld = (struct gfs2_log_descriptor *)bh->b_data;
322 length = be32_to_cpu(ld->ld_length);
323
324 if (be32_to_cpu(ld->ld_header.mh_type) == GFS2_METATYPE_LH) {
325 struct gfs2_log_header lh;
326 error = get_log_header(jd, start, &lh);
327 if (!error) {
328 gfs2_replay_incr_blk(sdp, &start);
329 continue;
330 }
331 if (error == 1) {
332 gfs2_consist_inode(GFS2_I(jd->jd_inode));
333 error = -EIO;
334 }
335 brelse(bh);
336 return error;
337 } else if (gfs2_metatype_check(sdp, bh, GFS2_METATYPE_LD)) {
338 brelse(bh);
339 return -EIO;
340 }
341 ptr = (__be64 *)(bh->b_data + offset);
342 error = lops_scan_elements(jd, start, ld, ptr, pass);
343 if (error) {
344 brelse(bh);
345 return error;
346 }
347
348 while (length--)
349 gfs2_replay_incr_blk(sdp, &start);
350
351 brelse(bh);
352 }
353
354 return 0;
355}
356
357/**
358 * clean_journal - mark a dirty journal as being clean
359 * @sdp: the filesystem
360 * @jd: the journal
361 * @gl: the journal's glock
362 * @head: the head journal to start from
363 *
364 * Returns: errno
365 */
366
367static int clean_journal(struct gfs2_jdesc *jd, struct gfs2_log_header *head)
368{
369 struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
370 struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
371 unsigned int lblock;
372 int new = 0;
373 uint64_t dblock;
374 struct gfs2_log_header *lh;
375 uint32_t hash;
376 struct buffer_head *bh;
377 int error;
378 int boundary;
379
380 lblock = head->lh_blkno;
381 gfs2_replay_incr_blk(sdp, &lblock);
382 error = gfs2_block_map(&ip->i_inode, lblock, &new, &dblock, &boundary);
383 if (error)
384 return error;
385 if (!dblock) {
386 gfs2_consist_inode(ip);
387 return -EIO;
388 }
389
390 bh = sb_getblk(sdp->sd_vfs, dblock);
391 lock_buffer(bh);
392 memset(bh->b_data, 0, bh->b_size);
393 set_buffer_uptodate(bh);
394 clear_buffer_dirty(bh);
395 unlock_buffer(bh);
396
397 lh = (struct gfs2_log_header *)bh->b_data;
398 memset(lh, 0, sizeof(struct gfs2_log_header));
399 lh->lh_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
400 lh->lh_header.mh_type = cpu_to_be16(GFS2_METATYPE_LH);
401 lh->lh_header.mh_format = cpu_to_be32(GFS2_FORMAT_LH);
402 lh->lh_sequence = cpu_to_be64(head->lh_sequence + 1);
403 lh->lh_flags = cpu_to_be32(GFS2_LOG_HEAD_UNMOUNT);
404 lh->lh_blkno = cpu_to_be32(lblock);
405 hash = gfs2_disk_hash((const char *)lh, sizeof(struct gfs2_log_header));
406 lh->lh_hash = cpu_to_be32(hash);
407
408 set_buffer_dirty(bh);
409 if (sync_dirty_buffer(bh))
410 gfs2_io_error_bh(sdp, bh);
411 brelse(bh);
412
413 return error;
414}
415
416/**
417 * gfs2_recover_journal - recovery a given journal
418 * @jd: the struct gfs2_jdesc describing the journal
419 *
420 * Acquire the journal's lock, check to see if the journal is clean, and
421 * do recovery if necessary.
422 *
423 * Returns: errno
424 */
425
426int gfs2_recover_journal(struct gfs2_jdesc *jd)
427{
428 struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
429 struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
430 struct gfs2_log_header head;
431 struct gfs2_holder j_gh, ji_gh, t_gh;
432 unsigned long t;
433 int ro = 0;
434 unsigned int pass;
435 int error;
436
437 if (jd->jd_jid != sdp->sd_lockstruct.ls_jid) {
438 fs_info(sdp, "jid=%u: Trying to acquire journal lock...\n",
439 jd->jd_jid);
440
441 /* Aquire the journal lock so we can do recovery */
442
443 error = gfs2_glock_nq_num(sdp, jd->jd_jid, &gfs2_journal_glops,
444 LM_ST_EXCLUSIVE,
445 LM_FLAG_NOEXP | LM_FLAG_TRY | GL_NOCACHE,
446 &j_gh);
447 switch (error) {
448 case 0:
449 break;
450
451 case GLR_TRYFAILED:
452 fs_info(sdp, "jid=%u: Busy\n", jd->jd_jid);
453 error = 0;
454
455 default:
456 goto fail;
457 };
458
459 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED,
460 LM_FLAG_NOEXP, &ji_gh);
461 if (error)
462 goto fail_gunlock_j;
463 } else {
464 fs_info(sdp, "jid=%u, already locked for use\n", jd->jd_jid);
465 }
466
467 fs_info(sdp, "jid=%u: Looking at journal...\n", jd->jd_jid);
468
469 error = gfs2_jdesc_check(jd);
470 if (error)
471 goto fail_gunlock_ji;
472
473 error = gfs2_find_jhead(jd, &head);
474 if (error)
475 goto fail_gunlock_ji;
476
477 if (!(head.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) {
478 fs_info(sdp, "jid=%u: Acquiring the transaction lock...\n",
479 jd->jd_jid);
480
481 t = jiffies;
482
483 /* Acquire a shared hold on the transaction lock */
484
485 error = gfs2_glock_nq_init(sdp->sd_trans_gl,
486 LM_ST_SHARED,
487 LM_FLAG_NOEXP | LM_FLAG_PRIORITY |
488 GL_NOCANCEL | GL_NOCACHE,
489 &t_gh);
490 if (error)
491 goto fail_gunlock_ji;
492
493 if (test_bit(SDF_JOURNAL_CHECKED, &sdp->sd_flags)) {
494 if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags))
495 ro = 1;
496 } else {
497 if (sdp->sd_vfs->s_flags & MS_RDONLY)
498 ro = 1;
499 }
500
501 if (ro) {
502 fs_warn(sdp, "jid=%u: Can't replay: read-only FS\n",
503 jd->jd_jid);
504 error = -EROFS;
505 goto fail_gunlock_tr;
506 }
507
508 fs_info(sdp, "jid=%u: Replaying journal...\n", jd->jd_jid);
509
510 for (pass = 0; pass < 2; pass++) {
511 lops_before_scan(jd, &head, pass);
512 error = foreach_descriptor(jd, head.lh_tail,
513 head.lh_blkno, pass);
514 lops_after_scan(jd, error, pass);
515 if (error)
516 goto fail_gunlock_tr;
517 }
518
519 error = clean_journal(jd, &head);
520 if (error)
521 goto fail_gunlock_tr;
522
523 gfs2_glock_dq_uninit(&t_gh);
524 t = DIV_ROUND_UP(jiffies - t, HZ);
525 fs_info(sdp, "jid=%u: Journal replayed in %lus\n",
526 jd->jd_jid, t);
527 }
528
529 if (jd->jd_jid != sdp->sd_lockstruct.ls_jid)
530 gfs2_glock_dq_uninit(&ji_gh);
531
532 gfs2_lm_recovery_done(sdp, jd->jd_jid, LM_RD_SUCCESS);
533
534 if (jd->jd_jid != sdp->sd_lockstruct.ls_jid)
535 gfs2_glock_dq_uninit(&j_gh);
536
537 fs_info(sdp, "jid=%u: Done\n", jd->jd_jid);
538 return 0;
539
540fail_gunlock_tr:
541 gfs2_glock_dq_uninit(&t_gh);
542fail_gunlock_ji:
543 if (jd->jd_jid != sdp->sd_lockstruct.ls_jid) {
544 gfs2_glock_dq_uninit(&ji_gh);
545fail_gunlock_j:
546 gfs2_glock_dq_uninit(&j_gh);
547 }
548
549 fs_info(sdp, "jid=%u: %s\n", jd->jd_jid, (error) ? "Failed" : "Done");
550
551fail:
552 gfs2_lm_recovery_done(sdp, jd->jd_jid, LM_RD_GAVEUP);
553 return error;
554}
555
556/**
557 * gfs2_check_journals - Recover any dirty journals
558 * @sdp: the filesystem
559 *
560 */
561
562void gfs2_check_journals(struct gfs2_sbd *sdp)
563{
564 struct gfs2_jdesc *jd;
565
566 for (;;) {
567 jd = gfs2_jdesc_find_dirty(sdp);
568 if (!jd)
569 break;
570
571 if (jd != sdp->sd_jdesc)
572 gfs2_recover_journal(jd);
573 }
574}
575
diff --git a/fs/gfs2/recovery.h b/fs/gfs2/recovery.h
new file mode 100644
index 000000000000..ac0f1d6ce456
--- /dev/null
+++ b/fs/gfs2/recovery.h
@@ -0,0 +1,32 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __RECOVERY_DOT_H__
11#define __RECOVERY_DOT_H__
12
13static inline void gfs2_replay_incr_blk(struct gfs2_sbd *sdp, unsigned int *blk)
14{
15 if (++*blk == sdp->sd_jdesc->jd_blocks)
16 *blk = 0;
17}
18
19int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk,
20 struct buffer_head **bh);
21
22int gfs2_revoke_add(struct gfs2_sbd *sdp, uint64_t blkno, unsigned int where);
23int gfs2_revoke_check(struct gfs2_sbd *sdp, uint64_t blkno, unsigned int where);
24void gfs2_revoke_clean(struct gfs2_sbd *sdp);
25
26int gfs2_find_jhead(struct gfs2_jdesc *jd,
27 struct gfs2_log_header *head);
28int gfs2_recover_journal(struct gfs2_jdesc *gfs2_jd);
29void gfs2_check_journals(struct gfs2_sbd *sdp);
30
31#endif /* __RECOVERY_DOT_H__ */
32
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
new file mode 100644
index 000000000000..4e0357dc838b
--- /dev/null
+++ b/fs/gfs2/rgrp.c
@@ -0,0 +1,1528 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/fs.h>
16#include <linux/gfs2_ondisk.h>
17
18#include "gfs2.h"
19#include "lm_interface.h"
20#include "incore.h"
21#include "glock.h"
22#include "glops.h"
23#include "lops.h"
24#include "meta_io.h"
25#include "quota.h"
26#include "rgrp.h"
27#include "super.h"
28#include "trans.h"
29#include "ops_file.h"
30#include "util.h"
31
32#define BFITNOENT 0xFFFFFFFF
33
34/*
35 * These routines are used by the resource group routines (rgrp.c)
36 * to keep track of block allocation. Each block is represented by two
37 * bits. So, each byte represents GFS2_NBBY (i.e. 4) blocks.
38 *
39 * 0 = Free
40 * 1 = Used (not metadata)
41 * 2 = Unlinked (still in use) inode
42 * 3 = Used (metadata)
43 */
44
/*
 * Permitted block state transitions, indexed as [new_state * 4 + cur_state]
 * (see gfs2_setbit()).  A non-zero entry means the change is allowed.
 *
 * Rows are the new state, columns the current state, both in the order
 * 0 = free, 1 = used, 2 = unlinked, 3 = metadata.
 */
static const char valid_change[16] = {
	/* current:     0  1  2  3 */
	/* new 0 */	0, 1, 1, 1,
	/* new 1 */	1, 0, 0, 0,
	/* new 2 */	0, 0, 0, 1,
	/* new 3 */	1, 0, 0, 0
};
52
53/**
54 * gfs2_setbit - Set a bit in the bitmaps
55 * @buffer: the buffer that holds the bitmaps
56 * @buflen: the length (in bytes) of the buffer
57 * @block: the block to set
58 * @new_state: the new state of the block
59 *
60 */
61
62static void gfs2_setbit(struct gfs2_rgrpd *rgd, unsigned char *buffer,
63 unsigned int buflen, uint32_t block,
64 unsigned char new_state)
65{
66 unsigned char *byte, *end, cur_state;
67 unsigned int bit;
68
69 byte = buffer + (block / GFS2_NBBY);
70 bit = (block % GFS2_NBBY) * GFS2_BIT_SIZE;
71 end = buffer + buflen;
72
73 gfs2_assert(rgd->rd_sbd, byte < end);
74
75 cur_state = (*byte >> bit) & GFS2_BIT_MASK;
76
77 if (valid_change[new_state * 4 + cur_state]) {
78 *byte ^= cur_state << bit;
79 *byte |= new_state << bit;
80 } else
81 gfs2_consist_rgrpd(rgd);
82}
83
84/**
85 * gfs2_testbit - test a bit in the bitmaps
86 * @buffer: the buffer that holds the bitmaps
87 * @buflen: the length (in bytes) of the buffer
88 * @block: the block to read
89 *
90 */
91
92static unsigned char gfs2_testbit(struct gfs2_rgrpd *rgd, unsigned char *buffer,
93 unsigned int buflen, uint32_t block)
94{
95 unsigned char *byte, *end, cur_state;
96 unsigned int bit;
97
98 byte = buffer + (block / GFS2_NBBY);
99 bit = (block % GFS2_NBBY) * GFS2_BIT_SIZE;
100 end = buffer + buflen;
101
102 gfs2_assert(rgd->rd_sbd, byte < end);
103
104 cur_state = (*byte >> bit) & GFS2_BIT_MASK;
105
106 return cur_state;
107}
108
109/**
110 * gfs2_bitfit - Search an rgrp's bitmap buffer to find a bit-pair representing
111 * a block in a given allocation state.
112 * @buffer: the buffer that holds the bitmaps
113 * @buflen: the length (in bytes) of the buffer
114 * @goal: start search at this block's bit-pair (within @buffer)
115 * @old_state: GFS2_BLKST_XXX the state of the block we're looking for;
116 * bit 0 = alloc(1)/free(0), bit 1 = meta(1)/data(0)
117 *
118 * Scope of @goal and returned block number is only within this bitmap buffer,
119 * not entire rgrp or filesystem. @buffer will be offset from the actual
120 * beginning of a bitmap block buffer, skipping any header structures.
121 *
122 * Return: the block number (bitmap buffer scope) that was found
123 */
124
125static uint32_t gfs2_bitfit(struct gfs2_rgrpd *rgd, unsigned char *buffer,
126 unsigned int buflen, uint32_t goal,
127 unsigned char old_state)
128{
129 unsigned char *byte, *end, alloc;
130 uint32_t blk = goal;
131 unsigned int bit;
132
133 byte = buffer + (goal / GFS2_NBBY);
134 bit = (goal % GFS2_NBBY) * GFS2_BIT_SIZE;
135 end = buffer + buflen;
136 alloc = (old_state & 1) ? 0 : 0x55;
137
138 while (byte < end) {
139 if ((*byte & 0x55) == alloc) {
140 blk += (8 - bit) >> 1;
141
142 bit = 0;
143 byte++;
144
145 continue;
146 }
147
148 if (((*byte >> bit) & GFS2_BIT_MASK) == old_state)
149 return blk;
150
151 bit += GFS2_BIT_SIZE;
152 if (bit >= 8) {
153 bit = 0;
154 byte++;
155 }
156
157 blk++;
158 }
159
160 return BFITNOENT;
161}
162
163/**
164 * gfs2_bitcount - count the number of bits in a certain state
165 * @buffer: the buffer that holds the bitmaps
166 * @buflen: the length (in bytes) of the buffer
167 * @state: the state of the block we're looking for
168 *
169 * Returns: The number of bits
170 */
171
172static uint32_t gfs2_bitcount(struct gfs2_rgrpd *rgd, unsigned char *buffer,
173 unsigned int buflen, unsigned char state)
174{
175 unsigned char *byte = buffer;
176 unsigned char *end = buffer + buflen;
177 unsigned char state1 = state << 2;
178 unsigned char state2 = state << 4;
179 unsigned char state3 = state << 6;
180 uint32_t count = 0;
181
182 for (; byte < end; byte++) {
183 if (((*byte) & 0x03) == state)
184 count++;
185 if (((*byte) & 0x0C) == state1)
186 count++;
187 if (((*byte) & 0x30) == state2)
188 count++;
189 if (((*byte) & 0xC0) == state3)
190 count++;
191 }
192
193 return count;
194}
195
196/**
197 * gfs2_rgrp_verify - Verify that a resource group is consistent
198 * @sdp: the filesystem
199 * @rgd: the rgrp
200 *
201 */
202
203void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd)
204{
205 struct gfs2_sbd *sdp = rgd->rd_sbd;
206 struct gfs2_bitmap *bi = NULL;
207 uint32_t length = rgd->rd_ri.ri_length;
208 uint32_t count[4], tmp;
209 int buf, x;
210
211 memset(count, 0, 4 * sizeof(uint32_t));
212
213 /* Count # blocks in each of 4 possible allocation states */
214 for (buf = 0; buf < length; buf++) {
215 bi = rgd->rd_bits + buf;
216 for (x = 0; x < 4; x++)
217 count[x] += gfs2_bitcount(rgd,
218 bi->bi_bh->b_data +
219 bi->bi_offset,
220 bi->bi_len, x);
221 }
222
223 if (count[0] != rgd->rd_rg.rg_free) {
224 if (gfs2_consist_rgrpd(rgd))
225 fs_err(sdp, "free data mismatch: %u != %u\n",
226 count[0], rgd->rd_rg.rg_free);
227 return;
228 }
229
230 tmp = rgd->rd_ri.ri_data -
231 rgd->rd_rg.rg_free -
232 rgd->rd_rg.rg_dinodes;
233 if (count[1] + count[2] != tmp) {
234 if (gfs2_consist_rgrpd(rgd))
235 fs_err(sdp, "used data mismatch: %u != %u\n",
236 count[1], tmp);
237 return;
238 }
239
240 if (count[3] != rgd->rd_rg.rg_dinodes) {
241 if (gfs2_consist_rgrpd(rgd))
242 fs_err(sdp, "used metadata mismatch: %u != %u\n",
243 count[3], rgd->rd_rg.rg_dinodes);
244 return;
245 }
246
247 if (count[2] > count[3]) {
248 if (gfs2_consist_rgrpd(rgd))
249 fs_err(sdp, "unlinked inodes > inodes: %u\n",
250 count[2]);
251 return;
252 }
253
254}
255
256static inline int rgrp_contains_block(struct gfs2_rindex *ri, uint64_t block)
257{
258 uint64_t first = ri->ri_data0;
259 uint64_t last = first + ri->ri_data;
260 return !!(first <= block && block < last);
261}
262
263/**
264 * gfs2_blk2rgrpd - Find resource group for a given data/meta block number
265 * @sdp: The GFS2 superblock
266 * @n: The data block number
267 *
268 * Returns: The resource group, or NULL if not found
269 */
270
271struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, uint64_t blk)
272{
273 struct gfs2_rgrpd *rgd;
274
275 spin_lock(&sdp->sd_rindex_spin);
276
277 list_for_each_entry(rgd, &sdp->sd_rindex_mru_list, rd_list_mru) {
278 if (rgrp_contains_block(&rgd->rd_ri, blk)) {
279 list_move(&rgd->rd_list_mru, &sdp->sd_rindex_mru_list);
280 spin_unlock(&sdp->sd_rindex_spin);
281 return rgd;
282 }
283 }
284
285 spin_unlock(&sdp->sd_rindex_spin);
286
287 return NULL;
288}
289
290/**
291 * gfs2_rgrpd_get_first - get the first Resource Group in the filesystem
292 * @sdp: The GFS2 superblock
293 *
294 * Returns: The first rgrp in the filesystem
295 */
296
297struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp)
298{
299 gfs2_assert(sdp, !list_empty(&sdp->sd_rindex_list));
300 return list_entry(sdp->sd_rindex_list.next, struct gfs2_rgrpd, rd_list);
301}
302
303/**
304 * gfs2_rgrpd_get_next - get the next RG
305 * @rgd: A RG
306 *
307 * Returns: The next rgrp
308 */
309
310struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd)
311{
312 if (rgd->rd_list.next == &rgd->rd_sbd->sd_rindex_list)
313 return NULL;
314 return list_entry(rgd->rd_list.next, struct gfs2_rgrpd, rd_list);
315}
316
317static void clear_rgrpdi(struct gfs2_sbd *sdp)
318{
319 struct list_head *head;
320 struct gfs2_rgrpd *rgd;
321 struct gfs2_glock *gl;
322
323 spin_lock(&sdp->sd_rindex_spin);
324 sdp->sd_rindex_forward = NULL;
325 head = &sdp->sd_rindex_recent_list;
326 while (!list_empty(head)) {
327 rgd = list_entry(head->next, struct gfs2_rgrpd, rd_recent);
328 list_del(&rgd->rd_recent);
329 }
330 spin_unlock(&sdp->sd_rindex_spin);
331
332 head = &sdp->sd_rindex_list;
333 while (!list_empty(head)) {
334 rgd = list_entry(head->next, struct gfs2_rgrpd, rd_list);
335 gl = rgd->rd_gl;
336
337 list_del(&rgd->rd_list);
338 list_del(&rgd->rd_list_mru);
339
340 if (gl) {
341 gl->gl_object = NULL;
342 gfs2_glock_put(gl);
343 }
344
345 kfree(rgd->rd_bits);
346 kfree(rgd);
347 }
348}
349
350void gfs2_clear_rgrpd(struct gfs2_sbd *sdp)
351{
352 mutex_lock(&sdp->sd_rindex_mutex);
353 clear_rgrpdi(sdp);
354 mutex_unlock(&sdp->sd_rindex_mutex);
355}
356
357/**
358 * gfs2_compute_bitstructs - Compute the bitmap sizes
359 * @rgd: The resource group descriptor
360 *
361 * Calculates bitmap descriptors, one for each block that contains bitmap data
362 *
363 * Returns: errno
364 */
365
366static int compute_bitstructs(struct gfs2_rgrpd *rgd)
367{
368 struct gfs2_sbd *sdp = rgd->rd_sbd;
369 struct gfs2_bitmap *bi;
370 uint32_t length = rgd->rd_ri.ri_length; /* # blocks in hdr & bitmap */
371 uint32_t bytes_left, bytes;
372 int x;
373
374 if (!length)
375 return -EINVAL;
376
377 rgd->rd_bits = kcalloc(length, sizeof(struct gfs2_bitmap), GFP_NOFS);
378 if (!rgd->rd_bits)
379 return -ENOMEM;
380
381 bytes_left = rgd->rd_ri.ri_bitbytes;
382
383 for (x = 0; x < length; x++) {
384 bi = rgd->rd_bits + x;
385
386 /* small rgrp; bitmap stored completely in header block */
387 if (length == 1) {
388 bytes = bytes_left;
389 bi->bi_offset = sizeof(struct gfs2_rgrp);
390 bi->bi_start = 0;
391 bi->bi_len = bytes;
392 /* header block */
393 } else if (x == 0) {
394 bytes = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_rgrp);
395 bi->bi_offset = sizeof(struct gfs2_rgrp);
396 bi->bi_start = 0;
397 bi->bi_len = bytes;
398 /* last block */
399 } else if (x + 1 == length) {
400 bytes = bytes_left;
401 bi->bi_offset = sizeof(struct gfs2_meta_header);
402 bi->bi_start = rgd->rd_ri.ri_bitbytes - bytes_left;
403 bi->bi_len = bytes;
404 /* other blocks */
405 } else {
406 bytes = sdp->sd_sb.sb_bsize -
407 sizeof(struct gfs2_meta_header);
408 bi->bi_offset = sizeof(struct gfs2_meta_header);
409 bi->bi_start = rgd->rd_ri.ri_bitbytes - bytes_left;
410 bi->bi_len = bytes;
411 }
412
413 bytes_left -= bytes;
414 }
415
416 if (bytes_left) {
417 gfs2_consist_rgrpd(rgd);
418 return -EIO;
419 }
420 bi = rgd->rd_bits + (length - 1);
421 if ((bi->bi_start + bi->bi_len) * GFS2_NBBY != rgd->rd_ri.ri_data) {
422 if (gfs2_consist_rgrpd(rgd)) {
423 gfs2_rindex_print(&rgd->rd_ri);
424 fs_err(sdp, "start=%u len=%u offset=%u\n",
425 bi->bi_start, bi->bi_len, bi->bi_offset);
426 }
427 return -EIO;
428 }
429
430 return 0;
431}
432
433/**
434 * gfs2_ri_update - Pull in a new resource index from the disk
435 * @gl: The glock covering the rindex inode
436 *
437 * Returns: 0 on successful update, error code otherwise
438 */
439
440static int gfs2_ri_update(struct gfs2_inode *ip)
441{
442 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
443 struct inode *inode = &ip->i_inode;
444 struct gfs2_rgrpd *rgd;
445 char buf[sizeof(struct gfs2_rindex)];
446 struct file_ra_state ra_state;
447 uint64_t junk = ip->i_di.di_size;
448 int error;
449
450 if (do_div(junk, sizeof(struct gfs2_rindex))) {
451 gfs2_consist_inode(ip);
452 return -EIO;
453 }
454
455 clear_rgrpdi(sdp);
456
457 file_ra_state_init(&ra_state, inode->i_mapping);
458 for (sdp->sd_rgrps = 0;; sdp->sd_rgrps++) {
459 loff_t pos = sdp->sd_rgrps * sizeof(struct gfs2_rindex);
460 error = gfs2_internal_read(ip, &ra_state, buf, &pos,
461 sizeof(struct gfs2_rindex));
462 if (!error)
463 break;
464 if (error != sizeof(struct gfs2_rindex)) {
465 if (error > 0)
466 error = -EIO;
467 goto fail;
468 }
469
470 rgd = kzalloc(sizeof(struct gfs2_rgrpd), GFP_NOFS);
471 error = -ENOMEM;
472 if (!rgd)
473 goto fail;
474
475 mutex_init(&rgd->rd_mutex);
476 lops_init_le(&rgd->rd_le, &gfs2_rg_lops);
477 rgd->rd_sbd = sdp;
478
479 list_add_tail(&rgd->rd_list, &sdp->sd_rindex_list);
480 list_add_tail(&rgd->rd_list_mru, &sdp->sd_rindex_mru_list);
481
482 gfs2_rindex_in(&rgd->rd_ri, buf);
483 error = compute_bitstructs(rgd);
484 if (error)
485 goto fail;
486
487 error = gfs2_glock_get(sdp, rgd->rd_ri.ri_addr,
488 &gfs2_rgrp_glops, CREATE, &rgd->rd_gl);
489 if (error)
490 goto fail;
491
492 rgd->rd_gl->gl_object = rgd;
493 rgd->rd_rg_vn = rgd->rd_gl->gl_vn - 1;
494 }
495
496 sdp->sd_rindex_vn = ip->i_gl->gl_vn;
497 return 0;
498
499fail:
500 clear_rgrpdi(sdp);
501 return error;
502}
503
504/**
505 * gfs2_rindex_hold - Grab a lock on the rindex
506 * @sdp: The GFS2 superblock
507 * @ri_gh: the glock holder
508 *
509 * We grab a lock on the rindex inode to make sure that it doesn't
510 * change whilst we are performing an operation. We keep this lock
511 * for quite long periods of time compared to other locks. This
512 * doesn't matter, since it is shared and it is very, very rarely
513 * accessed in the exclusive mode (i.e. only when expanding the filesystem).
514 *
515 * This makes sure that we're using the latest copy of the resource index
516 * special file, which might have been updated if someone expanded the
517 * filesystem (via gfs2_grow utility), which adds new resource groups.
518 *
519 * Returns: 0 on success, error code otherwise
520 */
521
522int gfs2_rindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ri_gh)
523{
524 struct gfs2_inode *ip = GFS2_I(sdp->sd_rindex);
525 struct gfs2_glock *gl = ip->i_gl;
526 int error;
527
528 error = gfs2_glock_nq_init(gl, LM_ST_SHARED, 0, ri_gh);
529 if (error)
530 return error;
531
532 /* Read new copy from disk if we don't have the latest */
533 if (sdp->sd_rindex_vn != gl->gl_vn) {
534 mutex_lock(&sdp->sd_rindex_mutex);
535 if (sdp->sd_rindex_vn != gl->gl_vn) {
536 error = gfs2_ri_update(ip);
537 if (error)
538 gfs2_glock_dq_uninit(ri_gh);
539 }
540 mutex_unlock(&sdp->sd_rindex_mutex);
541 }
542
543 return error;
544}
545
546/**
547 * gfs2_rgrp_bh_get - Read in a RG's header and bitmaps
548 * @rgd: the struct gfs2_rgrpd describing the RG to read in
549 *
550 * Read in all of a Resource Group's header and bitmap blocks.
551 * Caller must eventually call gfs2_rgrp_relse() to free the bitmaps.
552 *
553 * Returns: errno
554 */
555
556int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd)
557{
558 struct gfs2_sbd *sdp = rgd->rd_sbd;
559 struct gfs2_glock *gl = rgd->rd_gl;
560 unsigned int length = rgd->rd_ri.ri_length;
561 struct gfs2_bitmap *bi;
562 unsigned int x, y;
563 int error;
564
565 mutex_lock(&rgd->rd_mutex);
566
567 spin_lock(&sdp->sd_rindex_spin);
568 if (rgd->rd_bh_count) {
569 rgd->rd_bh_count++;
570 spin_unlock(&sdp->sd_rindex_spin);
571 mutex_unlock(&rgd->rd_mutex);
572 return 0;
573 }
574 spin_unlock(&sdp->sd_rindex_spin);
575
576 for (x = 0; x < length; x++) {
577 bi = rgd->rd_bits + x;
578 error = gfs2_meta_read(gl, rgd->rd_ri.ri_addr + x, DIO_START,
579 &bi->bi_bh);
580 if (error)
581 goto fail;
582 }
583
584 for (y = length; y--;) {
585 bi = rgd->rd_bits + y;
586 error = gfs2_meta_reread(sdp, bi->bi_bh, DIO_WAIT);
587 if (error)
588 goto fail;
589 if (gfs2_metatype_check(sdp, bi->bi_bh, y ? GFS2_METATYPE_RB :
590 GFS2_METATYPE_RG)) {
591 error = -EIO;
592 goto fail;
593 }
594 }
595
596 if (rgd->rd_rg_vn != gl->gl_vn) {
597 gfs2_rgrp_in(&rgd->rd_rg, (rgd->rd_bits[0].bi_bh)->b_data);
598 rgd->rd_rg_vn = gl->gl_vn;
599 }
600
601 spin_lock(&sdp->sd_rindex_spin);
602 rgd->rd_free_clone = rgd->rd_rg.rg_free;
603 rgd->rd_bh_count++;
604 spin_unlock(&sdp->sd_rindex_spin);
605
606 mutex_unlock(&rgd->rd_mutex);
607
608 return 0;
609
610fail:
611 while (x--) {
612 bi = rgd->rd_bits + x;
613 brelse(bi->bi_bh);
614 bi->bi_bh = NULL;
615 gfs2_assert_warn(sdp, !bi->bi_clone);
616 }
617 mutex_unlock(&rgd->rd_mutex);
618
619 return error;
620}
621
622void gfs2_rgrp_bh_hold(struct gfs2_rgrpd *rgd)
623{
624 struct gfs2_sbd *sdp = rgd->rd_sbd;
625
626 spin_lock(&sdp->sd_rindex_spin);
627 gfs2_assert_warn(rgd->rd_sbd, rgd->rd_bh_count);
628 rgd->rd_bh_count++;
629 spin_unlock(&sdp->sd_rindex_spin);
630}
631
632/**
633 * gfs2_rgrp_bh_put - Release RG bitmaps read in with gfs2_rgrp_bh_get()
634 * @rgd: the struct gfs2_rgrpd describing the RG to read in
635 *
636 */
637
638void gfs2_rgrp_bh_put(struct gfs2_rgrpd *rgd)
639{
640 struct gfs2_sbd *sdp = rgd->rd_sbd;
641 int x, length = rgd->rd_ri.ri_length;
642
643 spin_lock(&sdp->sd_rindex_spin);
644 gfs2_assert_warn(rgd->rd_sbd, rgd->rd_bh_count);
645 if (--rgd->rd_bh_count) {
646 spin_unlock(&sdp->sd_rindex_spin);
647 return;
648 }
649
650 for (x = 0; x < length; x++) {
651 struct gfs2_bitmap *bi = rgd->rd_bits + x;
652 kfree(bi->bi_clone);
653 bi->bi_clone = NULL;
654 brelse(bi->bi_bh);
655 bi->bi_bh = NULL;
656 }
657
658 spin_unlock(&sdp->sd_rindex_spin);
659}
660
661void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd)
662{
663 struct gfs2_sbd *sdp = rgd->rd_sbd;
664 unsigned int length = rgd->rd_ri.ri_length;
665 unsigned int x;
666
667 for (x = 0; x < length; x++) {
668 struct gfs2_bitmap *bi = rgd->rd_bits + x;
669 if (!bi->bi_clone)
670 continue;
671 memcpy(bi->bi_clone + bi->bi_offset,
672 bi->bi_bh->b_data + bi->bi_offset, bi->bi_len);
673 }
674
675 spin_lock(&sdp->sd_rindex_spin);
676 rgd->rd_free_clone = rgd->rd_rg.rg_free;
677 spin_unlock(&sdp->sd_rindex_spin);
678}
679
680/**
681 * gfs2_alloc_get - get the struct gfs2_alloc structure for an inode
682 * @ip: the incore GFS2 inode structure
683 *
684 * Returns: the struct gfs2_alloc
685 */
686
687struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip)
688{
689 struct gfs2_alloc *al = &ip->i_alloc;
690
691 /* FIXME: Should assert that the correct locks are held here... */
692 memset(al, 0, sizeof(*al));
693 return al;
694}
695
/**
 * gfs2_alloc_put - throw away the struct gfs2_alloc for an inode
 * @ip: the inode
 *
 * Currently a no-op: the structure returned by gfs2_alloc_get() is
 * embedded in the inode, so there is nothing to release.
 */

void gfs2_alloc_put(struct gfs2_inode *ip)
{
	/* Nothing to do */
}
706
707/**
708 * try_rgrp_fit - See if a given reservation will fit in a given RG
709 * @rgd: the RG data
710 * @al: the struct gfs2_alloc structure describing the reservation
711 *
712 * If there's room for the requested blocks to be allocated from the RG:
713 * Sets the $al_reserved_data field in @al.
714 * Sets the $al_reserved_meta field in @al.
715 * Sets the $al_rgd field in @al.
716 *
717 * Returns: 1 on success (it fits), 0 on failure (it doesn't fit)
718 */
719
720static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_alloc *al)
721{
722 struct gfs2_sbd *sdp = rgd->rd_sbd;
723 int ret = 0;
724
725 spin_lock(&sdp->sd_rindex_spin);
726 if (rgd->rd_free_clone >= al->al_requested) {
727 al->al_rgd = rgd;
728 ret = 1;
729 }
730 spin_unlock(&sdp->sd_rindex_spin);
731
732 return ret;
733}
734
735/**
736 * recent_rgrp_first - get first RG from "recent" list
737 * @sdp: The GFS2 superblock
738 * @rglast: address of the rgrp used last
739 *
740 * Returns: The first rgrp in the recent list
741 */
742
743static struct gfs2_rgrpd *recent_rgrp_first(struct gfs2_sbd *sdp,
744 uint64_t rglast)
745{
746 struct gfs2_rgrpd *rgd = NULL;
747
748 spin_lock(&sdp->sd_rindex_spin);
749
750 if (list_empty(&sdp->sd_rindex_recent_list))
751 goto out;
752
753 if (!rglast)
754 goto first;
755
756 list_for_each_entry(rgd, &sdp->sd_rindex_recent_list, rd_recent) {
757 if (rgd->rd_ri.ri_addr == rglast)
758 goto out;
759 }
760
761first:
762 rgd = list_entry(sdp->sd_rindex_recent_list.next, struct gfs2_rgrpd,
763 rd_recent);
764out:
765 spin_unlock(&sdp->sd_rindex_spin);
766 return rgd;
767}
768
769/**
770 * recent_rgrp_next - get next RG from "recent" list
771 * @cur_rgd: current rgrp
772 * @remove:
773 *
774 * Returns: The next rgrp in the recent list
775 */
776
777static struct gfs2_rgrpd *recent_rgrp_next(struct gfs2_rgrpd *cur_rgd,
778 int remove)
779{
780 struct gfs2_sbd *sdp = cur_rgd->rd_sbd;
781 struct list_head *head;
782 struct gfs2_rgrpd *rgd;
783
784 spin_lock(&sdp->sd_rindex_spin);
785
786 head = &sdp->sd_rindex_recent_list;
787
788 list_for_each_entry(rgd, head, rd_recent) {
789 if (rgd == cur_rgd) {
790 if (cur_rgd->rd_recent.next != head)
791 rgd = list_entry(cur_rgd->rd_recent.next,
792 struct gfs2_rgrpd, rd_recent);
793 else
794 rgd = NULL;
795
796 if (remove)
797 list_del(&cur_rgd->rd_recent);
798
799 goto out;
800 }
801 }
802
803 rgd = NULL;
804 if (!list_empty(head))
805 rgd = list_entry(head->next, struct gfs2_rgrpd, rd_recent);
806
807out:
808 spin_unlock(&sdp->sd_rindex_spin);
809 return rgd;
810}
811
812/**
813 * recent_rgrp_add - add an RG to tail of "recent" list
814 * @new_rgd: The rgrp to add
815 *
816 */
817
818static void recent_rgrp_add(struct gfs2_rgrpd *new_rgd)
819{
820 struct gfs2_sbd *sdp = new_rgd->rd_sbd;
821 struct gfs2_rgrpd *rgd;
822 unsigned int count = 0;
823 unsigned int max = sdp->sd_rgrps / gfs2_jindex_size(sdp);
824
825 spin_lock(&sdp->sd_rindex_spin);
826
827 list_for_each_entry(rgd, &sdp->sd_rindex_recent_list, rd_recent) {
828 if (rgd == new_rgd)
829 goto out;
830
831 if (++count >= max)
832 goto out;
833 }
834 list_add_tail(&new_rgd->rd_recent, &sdp->sd_rindex_recent_list);
835
836out:
837 spin_unlock(&sdp->sd_rindex_spin);
838}
839
840/**
841 * forward_rgrp_get - get an rgrp to try next from full list
842 * @sdp: The GFS2 superblock
843 *
844 * Returns: The rgrp to try next
845 */
846
847static struct gfs2_rgrpd *forward_rgrp_get(struct gfs2_sbd *sdp)
848{
849 struct gfs2_rgrpd *rgd;
850 unsigned int journals = gfs2_jindex_size(sdp);
851 unsigned int rg = 0, x;
852
853 spin_lock(&sdp->sd_rindex_spin);
854
855 rgd = sdp->sd_rindex_forward;
856 if (!rgd) {
857 if (sdp->sd_rgrps >= journals)
858 rg = sdp->sd_rgrps * sdp->sd_jdesc->jd_jid / journals;
859
860 for (x = 0, rgd = gfs2_rgrpd_get_first(sdp);
861 x < rg;
862 x++, rgd = gfs2_rgrpd_get_next(rgd))
863 /* Do Nothing */;
864
865 sdp->sd_rindex_forward = rgd;
866 }
867
868 spin_unlock(&sdp->sd_rindex_spin);
869
870 return rgd;
871}
872
873/**
874 * forward_rgrp_set - set the forward rgrp pointer
875 * @sdp: the filesystem
876 * @rgd: The new forward rgrp
877 *
878 */
879
880static void forward_rgrp_set(struct gfs2_sbd *sdp, struct gfs2_rgrpd *rgd)
881{
882 spin_lock(&sdp->sd_rindex_spin);
883 sdp->sd_rindex_forward = rgd;
884 spin_unlock(&sdp->sd_rindex_spin);
885}
886
887/**
888 * get_local_rgrp - Choose and lock a rgrp for allocation
889 * @ip: the inode to reserve space for
890 * @rgp: the chosen and locked rgrp
891 *
892 * Try to acquire rgrp in way which avoids contending with others.
893 *
894 * Returns: errno
895 */
896
897static int get_local_rgrp(struct gfs2_inode *ip)
898{
899 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
900 struct gfs2_rgrpd *rgd, *begin = NULL;
901 struct gfs2_alloc *al = &ip->i_alloc;
902 int flags = LM_FLAG_TRY;
903 int skipped = 0;
904 int loops = 0;
905 int error;
906
907 /* Try recently successful rgrps */
908
909 rgd = recent_rgrp_first(sdp, ip->i_last_rg_alloc);
910
911 while (rgd) {
912 error = gfs2_glock_nq_init(rgd->rd_gl,
913 LM_ST_EXCLUSIVE, LM_FLAG_TRY,
914 &al->al_rgd_gh);
915 switch (error) {
916 case 0:
917 if (try_rgrp_fit(rgd, al))
918 goto out;
919 gfs2_glock_dq_uninit(&al->al_rgd_gh);
920 rgd = recent_rgrp_next(rgd, 1);
921 break;
922
923 case GLR_TRYFAILED:
924 rgd = recent_rgrp_next(rgd, 0);
925 break;
926
927 default:
928 return error;
929 }
930 }
931
932 /* Go through full list of rgrps */
933
934 begin = rgd = forward_rgrp_get(sdp);
935
936 for (;;) {
937 error = gfs2_glock_nq_init(rgd->rd_gl,
938 LM_ST_EXCLUSIVE, flags,
939 &al->al_rgd_gh);
940 switch (error) {
941 case 0:
942 if (try_rgrp_fit(rgd, al))
943 goto out;
944 gfs2_glock_dq_uninit(&al->al_rgd_gh);
945 break;
946
947 case GLR_TRYFAILED:
948 skipped++;
949 break;
950
951 default:
952 return error;
953 }
954
955 rgd = gfs2_rgrpd_get_next(rgd);
956 if (!rgd)
957 rgd = gfs2_rgrpd_get_first(sdp);
958
959 if (rgd == begin) {
960 if (++loops >= 2 || !skipped)
961 return -ENOSPC;
962 flags = 0;
963 }
964 }
965
966out:
967 ip->i_last_rg_alloc = rgd->rd_ri.ri_addr;
968
969 if (begin) {
970 recent_rgrp_add(rgd);
971 rgd = gfs2_rgrpd_get_next(rgd);
972 if (!rgd)
973 rgd = gfs2_rgrpd_get_first(sdp);
974 forward_rgrp_set(sdp, rgd);
975 }
976
977 return 0;
978}
979
980/**
981 * gfs2_inplace_reserve_i - Reserve space in the filesystem
982 * @ip: the inode to reserve space for
983 *
984 * Returns: errno
985 */
986
987int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file, unsigned int line)
988{
989 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
990 struct gfs2_alloc *al = &ip->i_alloc;
991 int error;
992
993 if (gfs2_assert_warn(sdp, al->al_requested))
994 return -EINVAL;
995
996 error = gfs2_rindex_hold(sdp, &al->al_ri_gh);
997 if (error)
998 return error;
999
1000 error = get_local_rgrp(ip);
1001 if (error) {
1002 gfs2_glock_dq_uninit(&al->al_ri_gh);
1003 return error;
1004 }
1005
1006 al->al_file = file;
1007 al->al_line = line;
1008
1009 return 0;
1010}
1011
1012/**
1013 * gfs2_inplace_release - release an inplace reservation
1014 * @ip: the inode the reservation was taken out on
1015 *
1016 * Release a reservation made by gfs2_inplace_reserve().
1017 */
1018
1019void gfs2_inplace_release(struct gfs2_inode *ip)
1020{
1021 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1022 struct gfs2_alloc *al = &ip->i_alloc;
1023
1024 if (gfs2_assert_warn(sdp, al->al_alloced <= al->al_requested) == -1)
1025 fs_warn(sdp, "al_alloced = %u, al_requested = %u "
1026 "al_file = %s, al_line = %u\n",
1027 al->al_alloced, al->al_requested, al->al_file,
1028 al->al_line);
1029
1030 al->al_rgd = NULL;
1031 gfs2_glock_dq_uninit(&al->al_rgd_gh);
1032 gfs2_glock_dq_uninit(&al->al_ri_gh);
1033}
1034
1035/**
1036 * gfs2_get_block_type - Check a block in a RG is of given type
1037 * @rgd: the resource group holding the block
1038 * @block: the block number
1039 *
1040 * Returns: The block type (GFS2_BLKST_*)
1041 */
1042
1043unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, uint64_t block)
1044{
1045 struct gfs2_bitmap *bi = NULL;
1046 uint32_t length, rgrp_block, buf_block;
1047 unsigned int buf;
1048 unsigned char type;
1049
1050 length = rgd->rd_ri.ri_length;
1051 rgrp_block = block - rgd->rd_ri.ri_data0;
1052
1053 for (buf = 0; buf < length; buf++) {
1054 bi = rgd->rd_bits + buf;
1055 if (rgrp_block < (bi->bi_start + bi->bi_len) * GFS2_NBBY)
1056 break;
1057 }
1058
1059 gfs2_assert(rgd->rd_sbd, buf < length);
1060 buf_block = rgrp_block - bi->bi_start * GFS2_NBBY;
1061
1062 type = gfs2_testbit(rgd, bi->bi_bh->b_data + bi->bi_offset,
1063 bi->bi_len, buf_block);
1064
1065 return type;
1066}
1067
1068/**
1069 * rgblk_search - find a block in @old_state, change allocation
1070 * state to @new_state
1071 * @rgd: the resource group descriptor
1072 * @goal: the goal block within the RG (start here to search for avail block)
1073 * @old_state: GFS2_BLKST_XXX the before-allocation state to find
1074 * @new_state: GFS2_BLKST_XXX the after-allocation block state
1075 *
1076 * Walk rgrp's bitmap to find bits that represent a block in @old_state.
1077 * Add the found bitmap buffer to the transaction.
1078 * Set the found bits to @new_state to change block's allocation state.
1079 *
1080 * This function never fails, because we wouldn't call it unless we
1081 * know (from reservation results, etc.) that a block is available.
1082 *
1083 * Scope of @goal and returned block is just within rgrp, not the whole
1084 * filesystem.
1085 *
1086 * Returns: the block number allocated
1087 */
1088
1089static uint32_t rgblk_search(struct gfs2_rgrpd *rgd, uint32_t goal,
1090 unsigned char old_state, unsigned char new_state)
1091{
1092 struct gfs2_bitmap *bi = NULL;
1093 uint32_t length = rgd->rd_ri.ri_length;
1094 uint32_t blk = 0;
1095 unsigned int buf, x;
1096
1097 /* Find bitmap block that contains bits for goal block */
1098 for (buf = 0; buf < length; buf++) {
1099 bi = rgd->rd_bits + buf;
1100 if (goal < (bi->bi_start + bi->bi_len) * GFS2_NBBY)
1101 break;
1102 }
1103
1104 gfs2_assert(rgd->rd_sbd, buf < length);
1105
1106 /* Convert scope of "goal" from rgrp-wide to within found bit block */
1107 goal -= bi->bi_start * GFS2_NBBY;
1108
1109 /* Search (up to entire) bitmap in this rgrp for allocatable block.
1110 "x <= length", instead of "x < length", because we typically start
1111 the search in the middle of a bit block, but if we can't find an
1112 allocatable block anywhere else, we want to be able wrap around and
1113 search in the first part of our first-searched bit block. */
1114 for (x = 0; x <= length; x++) {
1115 if (bi->bi_clone)
1116 blk = gfs2_bitfit(rgd, bi->bi_clone + bi->bi_offset,
1117 bi->bi_len, goal, old_state);
1118 else
1119 blk = gfs2_bitfit(rgd,
1120 bi->bi_bh->b_data + bi->bi_offset,
1121 bi->bi_len, goal, old_state);
1122 if (blk != BFITNOENT)
1123 break;
1124
1125 /* Try next bitmap block (wrap back to rgrp header if at end) */
1126 buf = (buf + 1) % length;
1127 bi = rgd->rd_bits + buf;
1128 goal = 0;
1129 }
1130
1131 if (gfs2_assert_withdraw(rgd->rd_sbd, x <= length))
1132 blk = 0;
1133
1134 gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1);
1135 gfs2_setbit(rgd, bi->bi_bh->b_data + bi->bi_offset,
1136 bi->bi_len, blk, new_state);
1137 if (bi->bi_clone)
1138 gfs2_setbit(rgd, bi->bi_clone + bi->bi_offset,
1139 bi->bi_len, blk, new_state);
1140
1141 return bi->bi_start * GFS2_NBBY + blk;
1142}
1143
1144/**
1145 * rgblk_free - Change alloc state of given block(s)
1146 * @sdp: the filesystem
1147 * @bstart: the start of a run of blocks to free
1148 * @blen: the length of the block run (all must lie within ONE RG!)
1149 * @new_state: GFS2_BLKST_XXX the after-allocation block state
1150 *
1151 * Returns: Resource group containing the block(s)
1152 */
1153
1154static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, uint64_t bstart,
1155 uint32_t blen, unsigned char new_state)
1156{
1157 struct gfs2_rgrpd *rgd;
1158 struct gfs2_bitmap *bi = NULL;
1159 uint32_t length, rgrp_blk, buf_blk;
1160 unsigned int buf;
1161
1162 rgd = gfs2_blk2rgrpd(sdp, bstart);
1163 if (!rgd) {
1164 if (gfs2_consist(sdp))
1165 fs_err(sdp, "block = %llu\n", (unsigned long long)bstart);
1166 return NULL;
1167 }
1168
1169 length = rgd->rd_ri.ri_length;
1170
1171 rgrp_blk = bstart - rgd->rd_ri.ri_data0;
1172
1173 while (blen--) {
1174 for (buf = 0; buf < length; buf++) {
1175 bi = rgd->rd_bits + buf;
1176 if (rgrp_blk < (bi->bi_start + bi->bi_len) * GFS2_NBBY)
1177 break;
1178 }
1179
1180 gfs2_assert(rgd->rd_sbd, buf < length);
1181
1182 buf_blk = rgrp_blk - bi->bi_start * GFS2_NBBY;
1183 rgrp_blk++;
1184
1185 if (!bi->bi_clone) {
1186 bi->bi_clone = kmalloc(bi->bi_bh->b_size,
1187 GFP_NOFS | __GFP_NOFAIL);
1188 memcpy(bi->bi_clone + bi->bi_offset,
1189 bi->bi_bh->b_data + bi->bi_offset,
1190 bi->bi_len);
1191 }
1192 gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1);
1193 gfs2_setbit(rgd, bi->bi_bh->b_data + bi->bi_offset,
1194 bi->bi_len, buf_blk, new_state);
1195 }
1196
1197 return rgd;
1198}
1199
1200/**
1201 * gfs2_alloc_data - Allocate a data block
1202 * @ip: the inode to allocate the data block for
1203 *
1204 * Returns: the allocated block
1205 */
1206
1207u64 gfs2_alloc_data(struct gfs2_inode *ip)
1208{
1209 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1210 struct gfs2_alloc *al = &ip->i_alloc;
1211 struct gfs2_rgrpd *rgd = al->al_rgd;
1212 uint32_t goal, blk;
1213 uint64_t block;
1214
1215 if (rgrp_contains_block(&rgd->rd_ri, ip->i_di.di_goal_data))
1216 goal = ip->i_di.di_goal_data - rgd->rd_ri.ri_data0;
1217 else
1218 goal = rgd->rd_last_alloc_data;
1219
1220 blk = rgblk_search(rgd, goal, GFS2_BLKST_FREE, GFS2_BLKST_USED);
1221 rgd->rd_last_alloc_data = blk;
1222
1223 block = rgd->rd_ri.ri_data0 + blk;
1224 ip->i_di.di_goal_data = block;
1225
1226 gfs2_assert_withdraw(sdp, rgd->rd_rg.rg_free);
1227 rgd->rd_rg.rg_free--;
1228
1229 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
1230 gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data);
1231
1232 al->al_alloced++;
1233
1234 gfs2_statfs_change(sdp, 0, -1, 0);
1235 gfs2_quota_change(ip, +1, ip->i_di.di_uid, ip->i_di.di_gid);
1236
1237 spin_lock(&sdp->sd_rindex_spin);
1238 rgd->rd_free_clone--;
1239 spin_unlock(&sdp->sd_rindex_spin);
1240
1241 return block;
1242}
1243
1244/**
1245 * gfs2_alloc_meta - Allocate a metadata block
1246 * @ip: the inode to allocate the metadata block for
1247 *
1248 * Returns: the allocated block
1249 */
1250
1251u64 gfs2_alloc_meta(struct gfs2_inode *ip)
1252{
1253 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1254 struct gfs2_alloc *al = &ip->i_alloc;
1255 struct gfs2_rgrpd *rgd = al->al_rgd;
1256 uint32_t goal, blk;
1257 uint64_t block;
1258
1259 if (rgrp_contains_block(&rgd->rd_ri, ip->i_di.di_goal_meta))
1260 goal = ip->i_di.di_goal_meta - rgd->rd_ri.ri_data0;
1261 else
1262 goal = rgd->rd_last_alloc_meta;
1263
1264 blk = rgblk_search(rgd, goal, GFS2_BLKST_FREE, GFS2_BLKST_USED);
1265 rgd->rd_last_alloc_meta = blk;
1266
1267 block = rgd->rd_ri.ri_data0 + blk;
1268 ip->i_di.di_goal_meta = block;
1269
1270 gfs2_assert_withdraw(sdp, rgd->rd_rg.rg_free);
1271 rgd->rd_rg.rg_free--;
1272
1273 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
1274 gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data);
1275
1276 al->al_alloced++;
1277
1278 gfs2_statfs_change(sdp, 0, -1, 0);
1279 gfs2_quota_change(ip, +1, ip->i_di.di_uid, ip->i_di.di_gid);
1280 gfs2_trans_add_unrevoke(sdp, block);
1281
1282 spin_lock(&sdp->sd_rindex_spin);
1283 rgd->rd_free_clone--;
1284 spin_unlock(&sdp->sd_rindex_spin);
1285
1286 return block;
1287}
1288
1289/**
1290 * gfs2_alloc_di - Allocate a dinode
1291 * @dip: the directory that the inode is going in
1292 *
1293 * Returns: the block allocated
1294 */
1295
1296u64 gfs2_alloc_di(struct gfs2_inode *dip, u64 *generation)
1297{
1298 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
1299 struct gfs2_alloc *al = &dip->i_alloc;
1300 struct gfs2_rgrpd *rgd = al->al_rgd;
1301 u32 blk;
1302 u64 block;
1303
1304 blk = rgblk_search(rgd, rgd->rd_last_alloc_meta,
1305 GFS2_BLKST_FREE, GFS2_BLKST_DINODE);
1306
1307 rgd->rd_last_alloc_meta = blk;
1308
1309 block = rgd->rd_ri.ri_data0 + blk;
1310
1311 gfs2_assert_withdraw(sdp, rgd->rd_rg.rg_free);
1312 rgd->rd_rg.rg_free--;
1313 rgd->rd_rg.rg_dinodes++;
1314 *generation = rgd->rd_rg.rg_igeneration++;
1315 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
1316 gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data);
1317
1318 al->al_alloced++;
1319
1320 gfs2_statfs_change(sdp, 0, -1, +1);
1321 gfs2_trans_add_unrevoke(sdp, block);
1322
1323 spin_lock(&sdp->sd_rindex_spin);
1324 rgd->rd_free_clone--;
1325 spin_unlock(&sdp->sd_rindex_spin);
1326
1327 return block;
1328}
1329
1330/**
1331 * gfs2_free_data - free a contiguous run of data block(s)
1332 * @ip: the inode these blocks are being freed from
1333 * @bstart: first block of a run of contiguous blocks
1334 * @blen: the length of the block run
1335 *
1336 */
1337
1338void gfs2_free_data(struct gfs2_inode *ip, uint64_t bstart, uint32_t blen)
1339{
1340 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1341 struct gfs2_rgrpd *rgd;
1342
1343 rgd = rgblk_free(sdp, bstart, blen, GFS2_BLKST_FREE);
1344 if (!rgd)
1345 return;
1346
1347 rgd->rd_rg.rg_free += blen;
1348
1349 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
1350 gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data);
1351
1352 gfs2_trans_add_rg(rgd);
1353
1354 gfs2_statfs_change(sdp, 0, +blen, 0);
1355 gfs2_quota_change(ip, -(int64_t)blen,
1356 ip->i_di.di_uid, ip->i_di.di_gid);
1357}
1358
1359/**
1360 * gfs2_free_meta - free a contiguous run of data block(s)
1361 * @ip: the inode these blocks are being freed from
1362 * @bstart: first block of a run of contiguous blocks
1363 * @blen: the length of the block run
1364 *
1365 */
1366
1367void gfs2_free_meta(struct gfs2_inode *ip, uint64_t bstart, uint32_t blen)
1368{
1369 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1370 struct gfs2_rgrpd *rgd;
1371
1372 rgd = rgblk_free(sdp, bstart, blen, GFS2_BLKST_FREE);
1373 if (!rgd)
1374 return;
1375
1376 rgd->rd_rg.rg_free += blen;
1377
1378 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
1379 gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data);
1380
1381 gfs2_trans_add_rg(rgd);
1382
1383 gfs2_statfs_change(sdp, 0, +blen, 0);
1384 gfs2_quota_change(ip, -(int64_t)blen, ip->i_di.di_uid, ip->i_di.di_gid);
1385 gfs2_meta_wipe(ip, bstart, blen);
1386}
1387
1388void gfs2_unlink_di(struct inode *inode)
1389{
1390 struct gfs2_inode *ip = GFS2_I(inode);
1391 struct gfs2_sbd *sdp = GFS2_SB(inode);
1392 struct gfs2_rgrpd *rgd;
1393 u64 blkno = ip->i_num.no_addr;
1394
1395 rgd = rgblk_free(sdp, blkno, 1, GFS2_BLKST_UNLINKED);
1396 if (!rgd)
1397 return;
1398 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
1399 gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data);
1400 gfs2_trans_add_rg(rgd);
1401}
1402
1403static void gfs2_free_uninit_di(struct gfs2_rgrpd *rgd, uint64_t blkno)
1404{
1405 struct gfs2_sbd *sdp = rgd->rd_sbd;
1406 struct gfs2_rgrpd *tmp_rgd;
1407
1408 tmp_rgd = rgblk_free(sdp, blkno, 1, GFS2_BLKST_FREE);
1409 if (!tmp_rgd)
1410 return;
1411 gfs2_assert_withdraw(sdp, rgd == tmp_rgd);
1412
1413 if (!rgd->rd_rg.rg_dinodes)
1414 gfs2_consist_rgrpd(rgd);
1415 rgd->rd_rg.rg_dinodes--;
1416 rgd->rd_rg.rg_free++;
1417
1418 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
1419 gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data);
1420
1421 gfs2_statfs_change(sdp, 0, +1, -1);
1422 gfs2_trans_add_rg(rgd);
1423}
1424
1425
1426void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip)
1427{
1428 gfs2_free_uninit_di(rgd, ip->i_num.no_addr);
1429 gfs2_quota_change(ip, -1, ip->i_di.di_uid, ip->i_di.di_gid);
1430 gfs2_meta_wipe(ip, ip->i_num.no_addr, 1);
1431}
1432
1433/**
1434 * gfs2_rlist_add - add a RG to a list of RGs
1435 * @sdp: the filesystem
1436 * @rlist: the list of resource groups
1437 * @block: the block
1438 *
1439 * Figure out what RG a block belongs to and add that RG to the list
1440 *
1441 * FIXME: Don't use NOFAIL
1442 *
1443 */
1444
1445void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist,
1446 uint64_t block)
1447{
1448 struct gfs2_rgrpd *rgd;
1449 struct gfs2_rgrpd **tmp;
1450 unsigned int new_space;
1451 unsigned int x;
1452
1453 if (gfs2_assert_warn(sdp, !rlist->rl_ghs))
1454 return;
1455
1456 rgd = gfs2_blk2rgrpd(sdp, block);
1457 if (!rgd) {
1458 if (gfs2_consist(sdp))
1459 fs_err(sdp, "block = %llu\n", (unsigned long long)block);
1460 return;
1461 }
1462
1463 for (x = 0; x < rlist->rl_rgrps; x++)
1464 if (rlist->rl_rgd[x] == rgd)
1465 return;
1466
1467 if (rlist->rl_rgrps == rlist->rl_space) {
1468 new_space = rlist->rl_space + 10;
1469
1470 tmp = kcalloc(new_space, sizeof(struct gfs2_rgrpd *),
1471 GFP_NOFS | __GFP_NOFAIL);
1472
1473 if (rlist->rl_rgd) {
1474 memcpy(tmp, rlist->rl_rgd,
1475 rlist->rl_space * sizeof(struct gfs2_rgrpd *));
1476 kfree(rlist->rl_rgd);
1477 }
1478
1479 rlist->rl_space = new_space;
1480 rlist->rl_rgd = tmp;
1481 }
1482
1483 rlist->rl_rgd[rlist->rl_rgrps++] = rgd;
1484}
1485
1486/**
1487 * gfs2_rlist_alloc - all RGs have been added to the rlist, now allocate
1488 * and initialize an array of glock holders for them
1489 * @rlist: the list of resource groups
1490 * @state: the lock state to acquire the RG lock in
1491 * @flags: the modifier flags for the holder structures
1492 *
1493 * FIXME: Don't use NOFAIL
1494 *
1495 */
1496
1497void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state,
1498 int flags)
1499{
1500 unsigned int x;
1501
1502 rlist->rl_ghs = kcalloc(rlist->rl_rgrps, sizeof(struct gfs2_holder),
1503 GFP_NOFS | __GFP_NOFAIL);
1504 for (x = 0; x < rlist->rl_rgrps; x++)
1505 gfs2_holder_init(rlist->rl_rgd[x]->rd_gl,
1506 state, flags,
1507 &rlist->rl_ghs[x]);
1508}
1509
1510/**
1511 * gfs2_rlist_free - free a resource group list
1512 * @list: the list of resource groups
1513 *
1514 */
1515
1516void gfs2_rlist_free(struct gfs2_rgrp_list *rlist)
1517{
1518 unsigned int x;
1519
1520 kfree(rlist->rl_rgd);
1521
1522 if (rlist->rl_ghs) {
1523 for (x = 0; x < rlist->rl_rgrps; x++)
1524 gfs2_holder_uninit(&rlist->rl_ghs[x]);
1525 kfree(rlist->rl_ghs);
1526 }
1527}
1528
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
new file mode 100644
index 000000000000..14600944d184
--- /dev/null
+++ b/fs/gfs2/rgrp.h
@@ -0,0 +1,62 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __RGRP_DOT_H__
11#define __RGRP_DOT_H__
12
13void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd);
14
15struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, uint64_t blk);
16struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp);
17struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd);
18
19void gfs2_clear_rgrpd(struct gfs2_sbd *sdp);
20int gfs2_rindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ri_gh);
21
22int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd);
23void gfs2_rgrp_bh_hold(struct gfs2_rgrpd *rgd);
24void gfs2_rgrp_bh_put(struct gfs2_rgrpd *rgd);
25
26void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd);
27
28struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip);
29void gfs2_alloc_put(struct gfs2_inode *ip);
30
31int gfs2_inplace_reserve_i(struct gfs2_inode *ip,
32 char *file, unsigned int line);
33#define gfs2_inplace_reserve(ip) \
34gfs2_inplace_reserve_i((ip), __FILE__, __LINE__)
35
36void gfs2_inplace_release(struct gfs2_inode *ip);
37
38unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, uint64_t block);
39
40u64 gfs2_alloc_data(struct gfs2_inode *ip);
41u64 gfs2_alloc_meta(struct gfs2_inode *ip);
42u64 gfs2_alloc_di(struct gfs2_inode *ip, u64 *generation);
43
44void gfs2_free_data(struct gfs2_inode *ip, uint64_t bstart, uint32_t blen);
45void gfs2_free_meta(struct gfs2_inode *ip, uint64_t bstart, uint32_t blen);
46void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip);
47void gfs2_unlink_di(struct inode *inode);
48
49struct gfs2_rgrp_list {
50 unsigned int rl_rgrps;
51 unsigned int rl_space;
52 struct gfs2_rgrpd **rl_rgd;
53 struct gfs2_holder *rl_ghs;
54};
55
56void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist,
57 uint64_t block);
58void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state,
59 int flags);
60void gfs2_rlist_free(struct gfs2_rgrp_list *rlist);
61
62#endif /* __RGRP_DOT_H__ */
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
new file mode 100644
index 000000000000..3c318a9e8a8c
--- /dev/null
+++ b/fs/gfs2/super.c
@@ -0,0 +1,979 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/crc32.h>
16#include <linux/gfs2_ondisk.h>
17#include <linux/bio.h>
18
19#include "gfs2.h"
20#include "lm_interface.h"
21#include "incore.h"
22#include "bmap.h"
23#include "dir.h"
24#include "format.h"
25#include "glock.h"
26#include "glops.h"
27#include "inode.h"
28#include "log.h"
29#include "meta_io.h"
30#include "quota.h"
31#include "recovery.h"
32#include "rgrp.h"
33#include "super.h"
34#include "trans.h"
35#include "util.h"
36
37/**
38 * gfs2_tune_init - Fill a gfs2_tune structure with default values
39 * @gt: tune
40 *
41 */
42
43void gfs2_tune_init(struct gfs2_tune *gt)
44{
45 spin_lock_init(&gt->gt_spin);
46
47 gt->gt_ilimit = 100;
48 gt->gt_ilimit_tries = 3;
49 gt->gt_ilimit_min = 1;
50 gt->gt_demote_secs = 300;
51 gt->gt_incore_log_blocks = 1024;
52 gt->gt_log_flush_secs = 60;
53 gt->gt_jindex_refresh_secs = 60;
54 gt->gt_scand_secs = 15;
55 gt->gt_recoverd_secs = 60;
56 gt->gt_logd_secs = 1;
57 gt->gt_quotad_secs = 5;
58 gt->gt_quota_simul_sync = 64;
59 gt->gt_quota_warn_period = 10;
60 gt->gt_quota_scale_num = 1;
61 gt->gt_quota_scale_den = 1;
62 gt->gt_quota_cache_secs = 300;
63 gt->gt_quota_quantum = 60;
64 gt->gt_atime_quantum = 3600;
65 gt->gt_new_files_jdata = 0;
66 gt->gt_new_files_directio = 0;
67 gt->gt_max_atomic_write = 4 << 20;
68 gt->gt_max_readahead = 1 << 18;
69 gt->gt_lockdump_size = 131072;
70 gt->gt_stall_secs = 600;
71 gt->gt_complain_secs = 10;
72 gt->gt_reclaim_limit = 5000;
73 gt->gt_entries_per_readdir = 32;
74 gt->gt_prefetch_secs = 10;
75 gt->gt_greedy_default = HZ / 10;
76 gt->gt_greedy_quantum = HZ / 40;
77 gt->gt_greedy_max = HZ / 4;
78 gt->gt_statfs_quantum = 30;
79 gt->gt_statfs_slow = 0;
80}
81
82/**
83 * gfs2_check_sb - Check superblock
84 * @sdp: the filesystem
85 * @sb: The superblock
86 * @silent: Don't print a message if the check fails
87 *
88 * Checks the version code of the FS is one that we understand how to
89 * read and that the sizes of the various on-disk structures have not
90 * changed.
91 */
92
93int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb *sb, int silent)
94{
95 unsigned int x;
96
97 if (sb->sb_header.mh_magic != GFS2_MAGIC ||
98 sb->sb_header.mh_type != GFS2_METATYPE_SB) {
99 if (!silent)
100 printk(KERN_WARNING "GFS2: not a GFS2 filesystem\n");
101 return -EINVAL;
102 }
103
104 /* If format numbers match exactly, we're done. */
105
106 if (sb->sb_fs_format == GFS2_FORMAT_FS &&
107 sb->sb_multihost_format == GFS2_FORMAT_MULTI)
108 return 0;
109
110 if (sb->sb_fs_format != GFS2_FORMAT_FS) {
111 for (x = 0; gfs2_old_fs_formats[x]; x++)
112 if (gfs2_old_fs_formats[x] == sb->sb_fs_format)
113 break;
114
115 if (!gfs2_old_fs_formats[x]) {
116 printk(KERN_WARNING
117 "GFS2: code version (%u, %u) is incompatible "
118 "with ondisk format (%u, %u)\n",
119 GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
120 sb->sb_fs_format, sb->sb_multihost_format);
121 printk(KERN_WARNING
122 "GFS2: I don't know how to upgrade this FS\n");
123 return -EINVAL;
124 }
125 }
126
127 if (sb->sb_multihost_format != GFS2_FORMAT_MULTI) {
128 for (x = 0; gfs2_old_multihost_formats[x]; x++)
129 if (gfs2_old_multihost_formats[x] ==
130 sb->sb_multihost_format)
131 break;
132
133 if (!gfs2_old_multihost_formats[x]) {
134 printk(KERN_WARNING
135 "GFS2: code version (%u, %u) is incompatible "
136 "with ondisk format (%u, %u)\n",
137 GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
138 sb->sb_fs_format, sb->sb_multihost_format);
139 printk(KERN_WARNING
140 "GFS2: I don't know how to upgrade this FS\n");
141 return -EINVAL;
142 }
143 }
144
145 if (!sdp->sd_args.ar_upgrade) {
146 printk(KERN_WARNING
147 "GFS2: code version (%u, %u) is incompatible "
148 "with ondisk format (%u, %u)\n",
149 GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
150 sb->sb_fs_format, sb->sb_multihost_format);
151 printk(KERN_INFO
152 "GFS2: Use the \"upgrade\" mount option to upgrade "
153 "the FS\n");
154 printk(KERN_INFO "GFS2: See the manual for more details\n");
155 return -EINVAL;
156 }
157
158 return 0;
159}
160
161
162static int end_bio_io_page(struct bio *bio, unsigned int bytes_done, int error)
163{
164 struct page *page = bio->bi_private;
165 if (bio->bi_size)
166 return 1;
167
168 if (!error)
169 SetPageUptodate(page);
170 else
171 printk(KERN_WARNING "gfs2: error %d reading superblock\n", error);
172 unlock_page(page);
173 return 0;
174}
175
176static struct page *gfs2_read_super(struct super_block *sb, sector_t sector)
177{
178 struct page *page;
179 struct bio *bio;
180
181 page = alloc_page(GFP_KERNEL);
182 if (unlikely(!page))
183 return NULL;
184
185 ClearPageUptodate(page);
186 ClearPageDirty(page);
187 lock_page(page);
188
189 bio = bio_alloc(GFP_KERNEL, 1);
190 if (unlikely(!bio)) {
191 __free_page(page);
192 return NULL;
193 }
194
195 bio->bi_sector = sector;
196 bio->bi_bdev = sb->s_bdev;
197 bio_add_page(bio, page, PAGE_SIZE, 0);
198
199 bio->bi_end_io = end_bio_io_page;
200 bio->bi_private = page;
201 submit_bio(READ_SYNC, bio);
202 wait_on_page_locked(page);
203 bio_put(bio);
204 if (!PageUptodate(page)) {
205 __free_page(page);
206 return NULL;
207 }
208 return page;
209}
210
211/**
212 * gfs2_read_sb - Read super block
213 * @sdp: The GFS2 superblock
214 * @gl: the glock for the superblock (assumed to be held)
215 * @silent: Don't print message if mount fails
216 *
217 */
218
219int gfs2_read_sb(struct gfs2_sbd *sdp, struct gfs2_glock *gl, int silent)
220{
221 uint32_t hash_blocks, ind_blocks, leaf_blocks;
222 uint32_t tmp_blocks;
223 unsigned int x;
224 int error;
225 struct page *page;
226 char *sb;
227
228 page = gfs2_read_super(sdp->sd_vfs, GFS2_SB_ADDR >> sdp->sd_fsb2bb_shift);
229 if (!page) {
230 if (!silent)
231 fs_err(sdp, "can't read superblock\n");
232 return -EIO;
233 }
234 sb = kmap(page);
235 gfs2_sb_in(&sdp->sd_sb, sb);
236 kunmap(page);
237 __free_page(page);
238
239 error = gfs2_check_sb(sdp, &sdp->sd_sb, silent);
240 if (error)
241 return error;
242
243 sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift -
244 GFS2_BASIC_BLOCK_SHIFT;
245 sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift;
246 sdp->sd_diptrs = (sdp->sd_sb.sb_bsize -
247 sizeof(struct gfs2_dinode)) / sizeof(uint64_t);
248 sdp->sd_inptrs = (sdp->sd_sb.sb_bsize -
249 sizeof(struct gfs2_meta_header)) / sizeof(uint64_t);
250 sdp->sd_jbsize = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_meta_header);
251 sdp->sd_hash_bsize = sdp->sd_sb.sb_bsize / 2;
252 sdp->sd_hash_bsize_shift = sdp->sd_sb.sb_bsize_shift - 1;
253 sdp->sd_hash_ptrs = sdp->sd_hash_bsize / sizeof(uint64_t);
254 sdp->sd_qc_per_block = (sdp->sd_sb.sb_bsize -
255 sizeof(struct gfs2_meta_header)) /
256 sizeof(struct gfs2_quota_change);
257
258 /* Compute maximum reservation required to add a entry to a directory */
259
260 hash_blocks = DIV_ROUND_UP(sizeof(uint64_t) * (1 << GFS2_DIR_MAX_DEPTH),
261 sdp->sd_jbsize);
262
263 ind_blocks = 0;
264 for (tmp_blocks = hash_blocks; tmp_blocks > sdp->sd_diptrs;) {
265 tmp_blocks = DIV_ROUND_UP(tmp_blocks, sdp->sd_inptrs);
266 ind_blocks += tmp_blocks;
267 }
268
269 leaf_blocks = 2 + GFS2_DIR_MAX_DEPTH;
270
271 sdp->sd_max_dirres = hash_blocks + ind_blocks + leaf_blocks;
272
273 sdp->sd_heightsize[0] = sdp->sd_sb.sb_bsize -
274 sizeof(struct gfs2_dinode);
275 sdp->sd_heightsize[1] = sdp->sd_sb.sb_bsize * sdp->sd_diptrs;
276 for (x = 2;; x++) {
277 uint64_t space, d;
278 uint32_t m;
279
280 space = sdp->sd_heightsize[x - 1] * sdp->sd_inptrs;
281 d = space;
282 m = do_div(d, sdp->sd_inptrs);
283
284 if (d != sdp->sd_heightsize[x - 1] || m)
285 break;
286 sdp->sd_heightsize[x] = space;
287 }
288 sdp->sd_max_height = x;
289 gfs2_assert(sdp, sdp->sd_max_height <= GFS2_MAX_META_HEIGHT);
290
291 sdp->sd_jheightsize[0] = sdp->sd_sb.sb_bsize -
292 sizeof(struct gfs2_dinode);
293 sdp->sd_jheightsize[1] = sdp->sd_jbsize * sdp->sd_diptrs;
294 for (x = 2;; x++) {
295 uint64_t space, d;
296 uint32_t m;
297
298 space = sdp->sd_jheightsize[x - 1] * sdp->sd_inptrs;
299 d = space;
300 m = do_div(d, sdp->sd_inptrs);
301
302 if (d != sdp->sd_jheightsize[x - 1] || m)
303 break;
304 sdp->sd_jheightsize[x] = space;
305 }
306 sdp->sd_max_jheight = x;
307 gfs2_assert(sdp, sdp->sd_max_jheight <= GFS2_MAX_META_HEIGHT);
308
309 return 0;
310}
311
312/**
313 * gfs2_jindex_hold - Grab a lock on the jindex
314 * @sdp: The GFS2 superblock
315 * @ji_gh: the holder for the jindex glock
316 *
317 * This is very similar to the gfs2_rindex_hold() function, except that
318 * in general we hold the jindex lock for longer periods of time and
319 * we grab it far less frequently (in general) then the rgrp lock.
320 *
321 * Returns: errno
322 */
323
324int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh)
325{
326 struct gfs2_inode *dip = GFS2_I(sdp->sd_jindex);
327 struct qstr name;
328 char buf[20];
329 struct gfs2_jdesc *jd;
330 int error;
331
332 name.name = buf;
333
334 mutex_lock(&sdp->sd_jindex_mutex);
335
336 for (;;) {
337 error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED,
338 GL_LOCAL_EXCL, ji_gh);
339 if (error)
340 break;
341
342 name.len = sprintf(buf, "journal%u", sdp->sd_journals);
343 name.hash = gfs2_disk_hash(name.name, name.len);
344
345 error = gfs2_dir_search(sdp->sd_jindex, &name, NULL, NULL);
346 if (error == -ENOENT) {
347 error = 0;
348 break;
349 }
350
351 gfs2_glock_dq_uninit(ji_gh);
352
353 if (error)
354 break;
355
356 error = -ENOMEM;
357 jd = kzalloc(sizeof(struct gfs2_jdesc), GFP_KERNEL);
358 if (!jd)
359 break;
360
361 jd->jd_inode = gfs2_lookupi(sdp->sd_jindex, &name, 1, NULL);
362 if (!jd->jd_inode || IS_ERR(jd->jd_inode)) {
363 if (!jd->jd_inode)
364 error = -ENOENT;
365 else
366 error = PTR_ERR(jd->jd_inode);
367 kfree(jd);
368 break;
369 }
370
371 spin_lock(&sdp->sd_jindex_spin);
372 jd->jd_jid = sdp->sd_journals++;
373 list_add_tail(&jd->jd_list, &sdp->sd_jindex_list);
374 spin_unlock(&sdp->sd_jindex_spin);
375 }
376
377 mutex_unlock(&sdp->sd_jindex_mutex);
378
379 return error;
380}
381
382/**
383 * gfs2_jindex_free - Clear all the journal index information
384 * @sdp: The GFS2 superblock
385 *
386 */
387
388void gfs2_jindex_free(struct gfs2_sbd *sdp)
389{
390 struct list_head list;
391 struct gfs2_jdesc *jd;
392
393 spin_lock(&sdp->sd_jindex_spin);
394 list_add(&list, &sdp->sd_jindex_list);
395 list_del_init(&sdp->sd_jindex_list);
396 sdp->sd_journals = 0;
397 spin_unlock(&sdp->sd_jindex_spin);
398
399 while (!list_empty(&list)) {
400 jd = list_entry(list.next, struct gfs2_jdesc, jd_list);
401 list_del(&jd->jd_list);
402 iput(jd->jd_inode);
403 kfree(jd);
404 }
405}
406
407static struct gfs2_jdesc *jdesc_find_i(struct list_head *head, unsigned int jid)
408{
409 struct gfs2_jdesc *jd;
410 int found = 0;
411
412 list_for_each_entry(jd, head, jd_list) {
413 if (jd->jd_jid == jid) {
414 found = 1;
415 break;
416 }
417 }
418
419 if (!found)
420 jd = NULL;
421
422 return jd;
423}
424
425struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid)
426{
427 struct gfs2_jdesc *jd;
428
429 spin_lock(&sdp->sd_jindex_spin);
430 jd = jdesc_find_i(&sdp->sd_jindex_list, jid);
431 spin_unlock(&sdp->sd_jindex_spin);
432
433 return jd;
434}
435
436void gfs2_jdesc_make_dirty(struct gfs2_sbd *sdp, unsigned int jid)
437{
438 struct gfs2_jdesc *jd;
439
440 spin_lock(&sdp->sd_jindex_spin);
441 jd = jdesc_find_i(&sdp->sd_jindex_list, jid);
442 if (jd)
443 jd->jd_dirty = 1;
444 spin_unlock(&sdp->sd_jindex_spin);
445}
446
447struct gfs2_jdesc *gfs2_jdesc_find_dirty(struct gfs2_sbd *sdp)
448{
449 struct gfs2_jdesc *jd;
450 int found = 0;
451
452 spin_lock(&sdp->sd_jindex_spin);
453
454 list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
455 if (jd->jd_dirty) {
456 jd->jd_dirty = 0;
457 found = 1;
458 break;
459 }
460 }
461 spin_unlock(&sdp->sd_jindex_spin);
462
463 if (!found)
464 jd = NULL;
465
466 return jd;
467}
468
469int gfs2_jdesc_check(struct gfs2_jdesc *jd)
470{
471 struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
472 struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
473 int ar;
474 int error;
475
476 if (ip->i_di.di_size < (8 << 20) || ip->i_di.di_size > (1 << 30) ||
477 (ip->i_di.di_size & (sdp->sd_sb.sb_bsize - 1))) {
478 gfs2_consist_inode(ip);
479 return -EIO;
480 }
481 jd->jd_blocks = ip->i_di.di_size >> sdp->sd_sb.sb_bsize_shift;
482
483 error = gfs2_write_alloc_required(ip, 0, ip->i_di.di_size, &ar);
484 if (!error && ar) {
485 gfs2_consist_inode(ip);
486 error = -EIO;
487 }
488
489 return error;
490}
491
492/**
493 * gfs2_make_fs_rw - Turn a Read-Only FS into a Read-Write one
494 * @sdp: the filesystem
495 *
496 * Returns: errno
497 */
498
499int gfs2_make_fs_rw(struct gfs2_sbd *sdp)
500{
501 struct gfs2_inode *ip = GFS2_I(sdp->sd_jdesc->jd_inode);
502 struct gfs2_glock *j_gl = ip->i_gl;
503 struct gfs2_holder t_gh;
504 struct gfs2_log_header head;
505 int error;
506
507 error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED,
508 GL_LOCAL_EXCL, &t_gh);
509 if (error)
510 return error;
511
512 gfs2_meta_cache_flush(ip);
513 j_gl->gl_ops->go_inval(j_gl, DIO_METADATA | DIO_DATA);
514
515 error = gfs2_find_jhead(sdp->sd_jdesc, &head);
516 if (error)
517 goto fail;
518
519 if (!(head.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) {
520 gfs2_consist(sdp);
521 error = -EIO;
522 goto fail;
523 }
524
525 /* Initialize some head of the log stuff */
526 sdp->sd_log_sequence = head.lh_sequence + 1;
527 gfs2_log_pointers_init(sdp, head.lh_blkno);
528
529 error = gfs2_quota_init(sdp);
530 if (error)
531 goto fail_unlinked;
532
533 set_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags);
534
535 gfs2_glock_dq_uninit(&t_gh);
536
537 return 0;
538
539 fail_unlinked:
540
541 fail:
542 t_gh.gh_flags |= GL_NOCACHE;
543 gfs2_glock_dq_uninit(&t_gh);
544
545 return error;
546}
547
548/**
549 * gfs2_make_fs_ro - Turn a Read-Write FS into a Read-Only one
550 * @sdp: the filesystem
551 *
552 * Returns: errno
553 */
554
555int gfs2_make_fs_ro(struct gfs2_sbd *sdp)
556{
557 struct gfs2_holder t_gh;
558 int error;
559
560 gfs2_quota_sync(sdp);
561 gfs2_statfs_sync(sdp);
562
563 error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED,
564 GL_LOCAL_EXCL | GL_NOCACHE,
565 &t_gh);
566 if (error && !test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
567 return error;
568
569 gfs2_meta_syncfs(sdp);
570 gfs2_log_shutdown(sdp);
571
572 clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags);
573
574 if (t_gh.gh_gl)
575 gfs2_glock_dq_uninit(&t_gh);
576
577 gfs2_quota_cleanup(sdp);
578
579 return error;
580}
581
582int gfs2_statfs_init(struct gfs2_sbd *sdp)
583{
584 struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
585 struct gfs2_statfs_change *m_sc = &sdp->sd_statfs_master;
586 struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode);
587 struct gfs2_statfs_change *l_sc = &sdp->sd_statfs_local;
588 struct buffer_head *m_bh, *l_bh;
589 struct gfs2_holder gh;
590 int error;
591
592 error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE, GL_NOCACHE,
593 &gh);
594 if (error)
595 return error;
596
597 error = gfs2_meta_inode_buffer(m_ip, &m_bh);
598 if (error)
599 goto out;
600
601 if (sdp->sd_args.ar_spectator) {
602 spin_lock(&sdp->sd_statfs_spin);
603 gfs2_statfs_change_in(m_sc, m_bh->b_data +
604 sizeof(struct gfs2_dinode));
605 spin_unlock(&sdp->sd_statfs_spin);
606 } else {
607 error = gfs2_meta_inode_buffer(l_ip, &l_bh);
608 if (error)
609 goto out_m_bh;
610
611 spin_lock(&sdp->sd_statfs_spin);
612 gfs2_statfs_change_in(m_sc, m_bh->b_data +
613 sizeof(struct gfs2_dinode));
614 gfs2_statfs_change_in(l_sc, l_bh->b_data +
615 sizeof(struct gfs2_dinode));
616 spin_unlock(&sdp->sd_statfs_spin);
617
618 brelse(l_bh);
619 }
620
621 out_m_bh:
622 brelse(m_bh);
623
624 out:
625 gfs2_glock_dq_uninit(&gh);
626
627 return 0;
628}
629
630void gfs2_statfs_change(struct gfs2_sbd *sdp, int64_t total, int64_t free,
631 int64_t dinodes)
632{
633 struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode);
634 struct gfs2_statfs_change *l_sc = &sdp->sd_statfs_local;
635 struct buffer_head *l_bh;
636 int error;
637
638 error = gfs2_meta_inode_buffer(l_ip, &l_bh);
639 if (error)
640 return;
641
642 mutex_lock(&sdp->sd_statfs_mutex);
643 gfs2_trans_add_bh(l_ip->i_gl, l_bh, 1);
644 mutex_unlock(&sdp->sd_statfs_mutex);
645
646 spin_lock(&sdp->sd_statfs_spin);
647 l_sc->sc_total += total;
648 l_sc->sc_free += free;
649 l_sc->sc_dinodes += dinodes;
650 gfs2_statfs_change_out(l_sc, l_bh->b_data +
651 sizeof(struct gfs2_dinode));
652 spin_unlock(&sdp->sd_statfs_spin);
653
654 brelse(l_bh);
655}
656
657int gfs2_statfs_sync(struct gfs2_sbd *sdp)
658{
659 struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
660 struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode);
661 struct gfs2_statfs_change *m_sc = &sdp->sd_statfs_master;
662 struct gfs2_statfs_change *l_sc = &sdp->sd_statfs_local;
663 struct gfs2_holder gh;
664 struct buffer_head *m_bh, *l_bh;
665 int error;
666
667 error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE, GL_NOCACHE,
668 &gh);
669 if (error)
670 return error;
671
672 error = gfs2_meta_inode_buffer(m_ip, &m_bh);
673 if (error)
674 goto out;
675
676 spin_lock(&sdp->sd_statfs_spin);
677 gfs2_statfs_change_in(m_sc, m_bh->b_data +
678 sizeof(struct gfs2_dinode));
679 if (!l_sc->sc_total && !l_sc->sc_free && !l_sc->sc_dinodes) {
680 spin_unlock(&sdp->sd_statfs_spin);
681 goto out_bh;
682 }
683 spin_unlock(&sdp->sd_statfs_spin);
684
685 error = gfs2_meta_inode_buffer(l_ip, &l_bh);
686 if (error)
687 goto out_bh;
688
689 error = gfs2_trans_begin(sdp, 2 * RES_DINODE, 0);
690 if (error)
691 goto out_bh2;
692
693 mutex_lock(&sdp->sd_statfs_mutex);
694 gfs2_trans_add_bh(l_ip->i_gl, l_bh, 1);
695 mutex_unlock(&sdp->sd_statfs_mutex);
696
697 spin_lock(&sdp->sd_statfs_spin);
698 m_sc->sc_total += l_sc->sc_total;
699 m_sc->sc_free += l_sc->sc_free;
700 m_sc->sc_dinodes += l_sc->sc_dinodes;
701 memset(l_sc, 0, sizeof(struct gfs2_statfs_change));
702 memset(l_bh->b_data + sizeof(struct gfs2_dinode),
703 0, sizeof(struct gfs2_statfs_change));
704 spin_unlock(&sdp->sd_statfs_spin);
705
706 gfs2_trans_add_bh(m_ip->i_gl, m_bh, 1);
707 gfs2_statfs_change_out(m_sc, m_bh->b_data + sizeof(struct gfs2_dinode));
708
709 gfs2_trans_end(sdp);
710
711 out_bh2:
712 brelse(l_bh);
713
714 out_bh:
715 brelse(m_bh);
716
717 out:
718 gfs2_glock_dq_uninit(&gh);
719
720 return error;
721}
722
723/**
724 * gfs2_statfs_i - Do a statfs
725 * @sdp: the filesystem
726 * @sg: the sg structure
727 *
728 * Returns: errno
729 */
730
731int gfs2_statfs_i(struct gfs2_sbd *sdp, struct gfs2_statfs_change *sc)
732{
733 struct gfs2_statfs_change *m_sc = &sdp->sd_statfs_master;
734 struct gfs2_statfs_change *l_sc = &sdp->sd_statfs_local;
735
736 spin_lock(&sdp->sd_statfs_spin);
737
738 *sc = *m_sc;
739 sc->sc_total += l_sc->sc_total;
740 sc->sc_free += l_sc->sc_free;
741 sc->sc_dinodes += l_sc->sc_dinodes;
742
743 spin_unlock(&sdp->sd_statfs_spin);
744
745 if (sc->sc_free < 0)
746 sc->sc_free = 0;
747 if (sc->sc_free > sc->sc_total)
748 sc->sc_free = sc->sc_total;
749 if (sc->sc_dinodes < 0)
750 sc->sc_dinodes = 0;
751
752 return 0;
753}
754
755/**
756 * statfs_fill - fill in the sg for a given RG
757 * @rgd: the RG
758 * @sc: the sc structure
759 *
760 * Returns: 0 on success, -ESTALE if the LVB is invalid
761 */
762
763static int statfs_slow_fill(struct gfs2_rgrpd *rgd,
764 struct gfs2_statfs_change *sc)
765{
766 gfs2_rgrp_verify(rgd);
767 sc->sc_total += rgd->rd_ri.ri_data;
768 sc->sc_free += rgd->rd_rg.rg_free;
769 sc->sc_dinodes += rgd->rd_rg.rg_dinodes;
770 return 0;
771}
772
773/**
774 * gfs2_statfs_slow - Stat a filesystem using asynchronous locking
775 * @sdp: the filesystem
776 * @sc: the sc info that will be returned
777 *
778 * Any error (other than a signal) will cause this routine to fall back
779 * to the synchronous version.
780 *
781 * FIXME: This really shouldn't busy wait like this.
782 *
783 * Returns: errno
784 */
785
786int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change *sc)
787{
788 struct gfs2_holder ri_gh;
789 struct gfs2_rgrpd *rgd_next;
790 struct gfs2_holder *gha, *gh;
791 unsigned int slots = 64;
792 unsigned int x;
793 int done;
794 int error = 0, err;
795
796 memset(sc, 0, sizeof(struct gfs2_statfs_change));
797 gha = kcalloc(slots, sizeof(struct gfs2_holder), GFP_KERNEL);
798 if (!gha)
799 return -ENOMEM;
800
801 error = gfs2_rindex_hold(sdp, &ri_gh);
802 if (error)
803 goto out;
804
805 rgd_next = gfs2_rgrpd_get_first(sdp);
806
807 for (;;) {
808 done = 1;
809
810 for (x = 0; x < slots; x++) {
811 gh = gha + x;
812
813 if (gh->gh_gl && gfs2_glock_poll(gh)) {
814 err = gfs2_glock_wait(gh);
815 if (err) {
816 gfs2_holder_uninit(gh);
817 error = err;
818 } else {
819 if (!error)
820 error = statfs_slow_fill(
821 gh->gh_gl->gl_object, sc);
822 gfs2_glock_dq_uninit(gh);
823 }
824 }
825
826 if (gh->gh_gl)
827 done = 0;
828 else if (rgd_next && !error) {
829 error = gfs2_glock_nq_init(rgd_next->rd_gl,
830 LM_ST_SHARED,
831 GL_ASYNC,
832 gh);
833 rgd_next = gfs2_rgrpd_get_next(rgd_next);
834 done = 0;
835 }
836
837 if (signal_pending(current))
838 error = -ERESTARTSYS;
839 }
840
841 if (done)
842 break;
843
844 yield();
845 }
846
847 gfs2_glock_dq_uninit(&ri_gh);
848
849 out:
850 kfree(gha);
851
852 return error;
853}
854
855struct lfcc {
856 struct list_head list;
857 struct gfs2_holder gh;
858};
859
860/**
861 * gfs2_lock_fs_check_clean - Stop all writes to the FS and check that all
862 * journals are clean
863 * @sdp: the file system
864 * @state: the state to put the transaction lock into
865 * @t_gh: the hold on the transaction lock
866 *
867 * Returns: errno
868 */
869
870static int gfs2_lock_fs_check_clean(struct gfs2_sbd *sdp,
871 struct gfs2_holder *t_gh)
872{
873 struct gfs2_inode *ip;
874 struct gfs2_holder ji_gh;
875 struct gfs2_jdesc *jd;
876 struct lfcc *lfcc;
877 LIST_HEAD(list);
878 struct gfs2_log_header lh;
879 int error;
880
881 error = gfs2_jindex_hold(sdp, &ji_gh);
882 if (error)
883 return error;
884
885 list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
886 lfcc = kmalloc(sizeof(struct lfcc), GFP_KERNEL);
887 if (!lfcc) {
888 error = -ENOMEM;
889 goto out;
890 }
891 ip = GFS2_I(jd->jd_inode);
892 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &lfcc->gh);
893 if (error) {
894 kfree(lfcc);
895 goto out;
896 }
897 list_add(&lfcc->list, &list);
898 }
899
900 error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_DEFERRED,
901 LM_FLAG_PRIORITY | GL_NOCACHE,
902 t_gh);
903
904 list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
905 error = gfs2_jdesc_check(jd);
906 if (error)
907 break;
908 error = gfs2_find_jhead(jd, &lh);
909 if (error)
910 break;
911 if (!(lh.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) {
912 error = -EBUSY;
913 break;
914 }
915 }
916
917 if (error)
918 gfs2_glock_dq_uninit(t_gh);
919
920 out:
921 while (!list_empty(&list)) {
922 lfcc = list_entry(list.next, struct lfcc, list);
923 list_del(&lfcc->list);
924 gfs2_glock_dq_uninit(&lfcc->gh);
925 kfree(lfcc);
926 }
927 gfs2_glock_dq_uninit(&ji_gh);
928
929 return error;
930}
931
932/**
933 * gfs2_freeze_fs - freezes the file system
934 * @sdp: the file system
935 *
936 * This function flushes data and meta data for all machines by
937 * aquiring the transaction log exclusively. All journals are
938 * ensured to be in a clean state as well.
939 *
940 * Returns: errno
941 */
942
943int gfs2_freeze_fs(struct gfs2_sbd *sdp)
944{
945 int error = 0;
946
947 mutex_lock(&sdp->sd_freeze_lock);
948
949 if (!sdp->sd_freeze_count++) {
950 error = gfs2_lock_fs_check_clean(sdp, &sdp->sd_freeze_gh);
951 if (error)
952 sdp->sd_freeze_count--;
953 }
954
955 mutex_unlock(&sdp->sd_freeze_lock);
956
957 return error;
958}
959
960/**
961 * gfs2_unfreeze_fs - unfreezes the file system
962 * @sdp: the file system
963 *
964 * This function allows the file system to proceed by unlocking
965 * the exclusively held transaction lock. Other GFS2 nodes are
966 * now free to acquire the lock shared and go on with their lives.
967 *
968 */
969
970void gfs2_unfreeze_fs(struct gfs2_sbd *sdp)
971{
972 mutex_lock(&sdp->sd_freeze_lock);
973
974 if (sdp->sd_freeze_count && !--sdp->sd_freeze_count)
975 gfs2_glock_dq_uninit(&sdp->sd_freeze_gh);
976
977 mutex_unlock(&sdp->sd_freeze_lock);
978}
979
diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h
new file mode 100644
index 000000000000..df2495230402
--- /dev/null
+++ b/fs/gfs2/super.h
@@ -0,0 +1,52 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __SUPER_DOT_H__
11#define __SUPER_DOT_H__
12
13void gfs2_tune_init(struct gfs2_tune *gt);
14
15int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb *sb, int silent);
16int gfs2_read_sb(struct gfs2_sbd *sdp, struct gfs2_glock *gl, int silent);
17
18static inline unsigned int gfs2_jindex_size(struct gfs2_sbd *sdp)
19{
20 unsigned int x;
21 spin_lock(&sdp->sd_jindex_spin);
22 x = sdp->sd_journals;
23 spin_unlock(&sdp->sd_jindex_spin);
24 return x;
25}
26
27int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh);
28void gfs2_jindex_free(struct gfs2_sbd *sdp);
29
30struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid);
31void gfs2_jdesc_make_dirty(struct gfs2_sbd *sdp, unsigned int jid);
32struct gfs2_jdesc *gfs2_jdesc_find_dirty(struct gfs2_sbd *sdp);
33int gfs2_jdesc_check(struct gfs2_jdesc *jd);
34
35int gfs2_lookup_in_master_dir(struct gfs2_sbd *sdp, char *filename,
36 struct gfs2_inode **ipp);
37
38int gfs2_make_fs_rw(struct gfs2_sbd *sdp);
39int gfs2_make_fs_ro(struct gfs2_sbd *sdp);
40
41int gfs2_statfs_init(struct gfs2_sbd *sdp);
42void gfs2_statfs_change(struct gfs2_sbd *sdp,
43 int64_t total, int64_t free, int64_t dinodes);
44int gfs2_statfs_sync(struct gfs2_sbd *sdp);
45int gfs2_statfs_i(struct gfs2_sbd *sdp, struct gfs2_statfs_change *sc);
46int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change *sc);
47
48int gfs2_freeze_fs(struct gfs2_sbd *sdp);
49void gfs2_unfreeze_fs(struct gfs2_sbd *sdp);
50
51#endif /* __SUPER_DOT_H__ */
52
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
new file mode 100644
index 000000000000..3c4cb4558905
--- /dev/null
+++ b/fs/gfs2/sys.c
@@ -0,0 +1,579 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/module.h>
16#include <linux/kobject.h>
17#include <linux/gfs2_ondisk.h>
18#include <asm/uaccess.h>
19
20#include "gfs2.h"
21#include "lm_interface.h"
22#include "incore.h"
23#include "lm.h"
24#include "sys.h"
25#include "super.h"
26#include "glock.h"
27#include "quota.h"
28#include "util.h"
29
30char *gfs2_sys_margs;
31spinlock_t gfs2_sys_margs_lock;
32
33static ssize_t id_show(struct gfs2_sbd *sdp, char *buf)
34{
35 return sprintf(buf, "%s\n", sdp->sd_vfs->s_id);
36}
37
38static ssize_t fsname_show(struct gfs2_sbd *sdp, char *buf)
39{
40 return sprintf(buf, "%s\n", sdp->sd_fsname);
41}
42
43static ssize_t freeze_show(struct gfs2_sbd *sdp, char *buf)
44{
45 unsigned int count;
46
47 mutex_lock(&sdp->sd_freeze_lock);
48 count = sdp->sd_freeze_count;
49 mutex_unlock(&sdp->sd_freeze_lock);
50
51 return sprintf(buf, "%u\n", count);
52}
53
54static ssize_t freeze_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
55{
56 ssize_t ret = len;
57 int error = 0;
58 int n = simple_strtol(buf, NULL, 0);
59
60 if (!capable(CAP_SYS_ADMIN))
61 return -EACCES;
62
63 switch (n) {
64 case 0:
65 gfs2_unfreeze_fs(sdp);
66 break;
67 case 1:
68 error = gfs2_freeze_fs(sdp);
69 break;
70 default:
71 ret = -EINVAL;
72 }
73
74 if (error)
75 fs_warn(sdp, "freeze %d error %d", n, error);
76
77 return ret;
78}
79
80static ssize_t withdraw_show(struct gfs2_sbd *sdp, char *buf)
81{
82 unsigned int b = test_bit(SDF_SHUTDOWN, &sdp->sd_flags);
83 return sprintf(buf, "%u\n", b);
84}
85
86static ssize_t withdraw_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
87{
88 if (!capable(CAP_SYS_ADMIN))
89 return -EACCES;
90
91 if (simple_strtol(buf, NULL, 0) != 1)
92 return -EINVAL;
93
94 gfs2_lm_withdraw(sdp,
95 "GFS2: fsid=%s: withdrawing from cluster at user's request\n",
96 sdp->sd_fsname);
97 return len;
98}
99
100static ssize_t statfs_sync_store(struct gfs2_sbd *sdp, const char *buf,
101 size_t len)
102{
103 if (!capable(CAP_SYS_ADMIN))
104 return -EACCES;
105
106 if (simple_strtol(buf, NULL, 0) != 1)
107 return -EINVAL;
108
109 gfs2_statfs_sync(sdp);
110 return len;
111}
112
113static ssize_t shrink_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
114{
115 if (!capable(CAP_SYS_ADMIN))
116 return -EACCES;
117
118 if (simple_strtol(buf, NULL, 0) != 1)
119 return -EINVAL;
120
121 gfs2_gl_hash_clear(sdp, NO_WAIT);
122 return len;
123}
124
125static ssize_t quota_sync_store(struct gfs2_sbd *sdp, const char *buf,
126 size_t len)
127{
128 if (!capable(CAP_SYS_ADMIN))
129 return -EACCES;
130
131 if (simple_strtol(buf, NULL, 0) != 1)
132 return -EINVAL;
133
134 gfs2_quota_sync(sdp);
135 return len;
136}
137
138static ssize_t quota_refresh_user_store(struct gfs2_sbd *sdp, const char *buf,
139 size_t len)
140{
141 uint32_t id;
142
143 if (!capable(CAP_SYS_ADMIN))
144 return -EACCES;
145
146 id = simple_strtoul(buf, NULL, 0);
147
148 gfs2_quota_refresh(sdp, 1, id);
149 return len;
150}
151
152static ssize_t quota_refresh_group_store(struct gfs2_sbd *sdp, const char *buf,
153 size_t len)
154{
155 uint32_t id;
156
157 if (!capable(CAP_SYS_ADMIN))
158 return -EACCES;
159
160 id = simple_strtoul(buf, NULL, 0);
161
162 gfs2_quota_refresh(sdp, 0, id);
163 return len;
164}
165
166struct gfs2_attr {
167 struct attribute attr;
168 ssize_t (*show)(struct gfs2_sbd *, char *);
169 ssize_t (*store)(struct gfs2_sbd *, const char *, size_t);
170};
171
172#define GFS2_ATTR(name, mode, show, store) \
173static struct gfs2_attr gfs2_attr_##name = __ATTR(name, mode, show, store)
174
175GFS2_ATTR(id, 0444, id_show, NULL);
176GFS2_ATTR(fsname, 0444, fsname_show, NULL);
177GFS2_ATTR(freeze, 0644, freeze_show, freeze_store);
178GFS2_ATTR(shrink, 0200, NULL, shrink_store);
179GFS2_ATTR(withdraw, 0644, withdraw_show, withdraw_store);
180GFS2_ATTR(statfs_sync, 0200, NULL, statfs_sync_store);
181GFS2_ATTR(quota_sync, 0200, NULL, quota_sync_store);
182GFS2_ATTR(quota_refresh_user, 0200, NULL, quota_refresh_user_store);
183GFS2_ATTR(quota_refresh_group, 0200, NULL, quota_refresh_group_store);
184
185static struct attribute *gfs2_attrs[] = {
186 &gfs2_attr_id.attr,
187 &gfs2_attr_fsname.attr,
188 &gfs2_attr_freeze.attr,
189 &gfs2_attr_shrink.attr,
190 &gfs2_attr_withdraw.attr,
191 &gfs2_attr_statfs_sync.attr,
192 &gfs2_attr_quota_sync.attr,
193 &gfs2_attr_quota_refresh_user.attr,
194 &gfs2_attr_quota_refresh_group.attr,
195 NULL,
196};
197
198static ssize_t gfs2_attr_show(struct kobject *kobj, struct attribute *attr,
199 char *buf)
200{
201 struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj);
202 struct gfs2_attr *a = container_of(attr, struct gfs2_attr, attr);
203 return a->show ? a->show(sdp, buf) : 0;
204}
205
206static ssize_t gfs2_attr_store(struct kobject *kobj, struct attribute *attr,
207 const char *buf, size_t len)
208{
209 struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj);
210 struct gfs2_attr *a = container_of(attr, struct gfs2_attr, attr);
211 return a->store ? a->store(sdp, buf, len) : len;
212}
213
214static struct sysfs_ops gfs2_attr_ops = {
215 .show = gfs2_attr_show,
216 .store = gfs2_attr_store,
217};
218
219static struct kobj_type gfs2_ktype = {
220 .default_attrs = gfs2_attrs,
221 .sysfs_ops = &gfs2_attr_ops,
222};
223
224static struct kset gfs2_kset = {
225 .subsys = &fs_subsys,
226 .kobj = {.name = "gfs2",},
227 .ktype = &gfs2_ktype,
228};
229
230/*
231 * display struct lm_lockstruct fields
232 */
233
234struct lockstruct_attr {
235 struct attribute attr;
236 ssize_t (*show)(struct gfs2_sbd *, char *);
237};
238
239#define LOCKSTRUCT_ATTR(name, fmt) \
240static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf) \
241{ \
242 return sprintf(buf, fmt, sdp->sd_lockstruct.ls_##name); \
243} \
244static struct lockstruct_attr lockstruct_attr_##name = __ATTR_RO(name)
245
246LOCKSTRUCT_ATTR(jid, "%u\n");
247LOCKSTRUCT_ATTR(first, "%u\n");
248LOCKSTRUCT_ATTR(lvb_size, "%u\n");
249LOCKSTRUCT_ATTR(flags, "%d\n");
250
251static struct attribute *lockstruct_attrs[] = {
252 &lockstruct_attr_jid.attr,
253 &lockstruct_attr_first.attr,
254 &lockstruct_attr_lvb_size.attr,
255 &lockstruct_attr_flags.attr,
256 NULL
257};
258
259/*
260 * display struct gfs2_args fields
261 */
262
263struct args_attr {
264 struct attribute attr;
265 ssize_t (*show)(struct gfs2_sbd *, char *);
266};
267
268#define ARGS_ATTR(name, fmt) \
269static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf) \
270{ \
271 return sprintf(buf, fmt, sdp->sd_args.ar_##name); \
272} \
273static struct args_attr args_attr_##name = __ATTR_RO(name)
274
275ARGS_ATTR(lockproto, "%s\n");
276ARGS_ATTR(locktable, "%s\n");
277ARGS_ATTR(hostdata, "%s\n");
278ARGS_ATTR(spectator, "%d\n");
279ARGS_ATTR(ignore_local_fs, "%d\n");
280ARGS_ATTR(localcaching, "%d\n");
281ARGS_ATTR(localflocks, "%d\n");
282ARGS_ATTR(debug, "%d\n");
283ARGS_ATTR(upgrade, "%d\n");
284ARGS_ATTR(num_glockd, "%u\n");
285ARGS_ATTR(posix_acl, "%d\n");
286ARGS_ATTR(quota, "%u\n");
287ARGS_ATTR(suiddir, "%d\n");
288ARGS_ATTR(data, "%d\n");
289
290/* one oddball doesn't fit the macro mold */
291static ssize_t noatime_show(struct gfs2_sbd *sdp, char *buf)
292{
293 return sprintf(buf, "%d\n", !!test_bit(SDF_NOATIME, &sdp->sd_flags));
294}
295static struct args_attr args_attr_noatime = __ATTR_RO(noatime);
296
297static struct attribute *args_attrs[] = {
298 &args_attr_lockproto.attr,
299 &args_attr_locktable.attr,
300 &args_attr_hostdata.attr,
301 &args_attr_spectator.attr,
302 &args_attr_ignore_local_fs.attr,
303 &args_attr_localcaching.attr,
304 &args_attr_localflocks.attr,
305 &args_attr_debug.attr,
306 &args_attr_upgrade.attr,
307 &args_attr_num_glockd.attr,
308 &args_attr_posix_acl.attr,
309 &args_attr_quota.attr,
310 &args_attr_suiddir.attr,
311 &args_attr_data.attr,
312 &args_attr_noatime.attr,
313 NULL
314};
315
316/*
317 * display counters from superblock
318 */
319
320struct counters_attr {
321 struct attribute attr;
322 ssize_t (*show)(struct gfs2_sbd *, char *);
323};
324
325#define COUNTERS_ATTR(name, fmt) \
326static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf) \
327{ \
328 return sprintf(buf, fmt, (unsigned int)atomic_read(&sdp->sd_##name)); \
329} \
330static struct counters_attr counters_attr_##name = __ATTR_RO(name)
331
332COUNTERS_ATTR(glock_count, "%u\n");
333COUNTERS_ATTR(glock_held_count, "%u\n");
334COUNTERS_ATTR(inode_count, "%u\n");
335COUNTERS_ATTR(reclaimed, "%u\n");
336
337static struct attribute *counters_attrs[] = {
338 &counters_attr_glock_count.attr,
339 &counters_attr_glock_held_count.attr,
340 &counters_attr_inode_count.attr,
341 &counters_attr_reclaimed.attr,
342 NULL
343};
344
345/*
346 * get and set struct gfs2_tune fields
347 */
348
349static ssize_t quota_scale_show(struct gfs2_sbd *sdp, char *buf)
350{
351 return sprintf(buf, "%u %u\n", sdp->sd_tune.gt_quota_scale_num,
352 sdp->sd_tune.gt_quota_scale_den);
353}
354
355static ssize_t quota_scale_store(struct gfs2_sbd *sdp, const char *buf,
356 size_t len)
357{
358 struct gfs2_tune *gt = &sdp->sd_tune;
359 unsigned int x, y;
360
361 if (!capable(CAP_SYS_ADMIN))
362 return -EACCES;
363
364 if (sscanf(buf, "%u %u", &x, &y) != 2 || !y)
365 return -EINVAL;
366
367 spin_lock(&gt->gt_spin);
368 gt->gt_quota_scale_num = x;
369 gt->gt_quota_scale_den = y;
370 spin_unlock(&gt->gt_spin);
371 return len;
372}
373
374static ssize_t tune_set(struct gfs2_sbd *sdp, unsigned int *field,
375 int check_zero, const char *buf, size_t len)
376{
377 struct gfs2_tune *gt = &sdp->sd_tune;
378 unsigned int x;
379
380 if (!capable(CAP_SYS_ADMIN))
381 return -EACCES;
382
383 x = simple_strtoul(buf, NULL, 0);
384
385 if (check_zero && !x)
386 return -EINVAL;
387
388 spin_lock(&gt->gt_spin);
389 *field = x;
390 spin_unlock(&gt->gt_spin);
391 return len;
392}
393
394struct tune_attr {
395 struct attribute attr;
396 ssize_t (*show)(struct gfs2_sbd *, char *);
397 ssize_t (*store)(struct gfs2_sbd *, const char *, size_t);
398};
399
400#define TUNE_ATTR_3(name, show, store) \
401static struct tune_attr tune_attr_##name = __ATTR(name, 0644, show, store)
402
403#define TUNE_ATTR_2(name, store) \
404static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf) \
405{ \
406 return sprintf(buf, "%u\n", sdp->sd_tune.gt_##name); \
407} \
408TUNE_ATTR_3(name, name##_show, store)
409
410#define TUNE_ATTR(name, check_zero) \
411static ssize_t name##_store(struct gfs2_sbd *sdp, const char *buf, size_t len)\
412{ \
413 return tune_set(sdp, &sdp->sd_tune.gt_##name, check_zero, buf, len); \
414} \
415TUNE_ATTR_2(name, name##_store)
416
417#define TUNE_ATTR_DAEMON(name, process) \
418static ssize_t name##_store(struct gfs2_sbd *sdp, const char *buf, size_t len)\
419{ \
420 ssize_t r = tune_set(sdp, &sdp->sd_tune.gt_##name, 1, buf, len); \
421 wake_up_process(sdp->sd_##process); \
422 return r; \
423} \
424TUNE_ATTR_2(name, name##_store)
425
426TUNE_ATTR(ilimit, 0);
427TUNE_ATTR(ilimit_tries, 0);
428TUNE_ATTR(ilimit_min, 0);
429TUNE_ATTR(demote_secs, 0);
430TUNE_ATTR(incore_log_blocks, 0);
431TUNE_ATTR(log_flush_secs, 0);
432TUNE_ATTR(jindex_refresh_secs, 0);
433TUNE_ATTR(quota_warn_period, 0);
434TUNE_ATTR(quota_quantum, 0);
435TUNE_ATTR(atime_quantum, 0);
436TUNE_ATTR(max_readahead, 0);
437TUNE_ATTR(complain_secs, 0);
438TUNE_ATTR(reclaim_limit, 0);
439TUNE_ATTR(prefetch_secs, 0);
440TUNE_ATTR(statfs_slow, 0);
441TUNE_ATTR(new_files_jdata, 0);
442TUNE_ATTR(new_files_directio, 0);
443TUNE_ATTR(quota_simul_sync, 1);
444TUNE_ATTR(quota_cache_secs, 1);
445TUNE_ATTR(max_atomic_write, 1);
446TUNE_ATTR(stall_secs, 1);
447TUNE_ATTR(entries_per_readdir, 1);
448TUNE_ATTR(greedy_default, 1);
449TUNE_ATTR(greedy_quantum, 1);
450TUNE_ATTR(greedy_max, 1);
451TUNE_ATTR(statfs_quantum, 1);
452TUNE_ATTR_DAEMON(scand_secs, scand_process);
453TUNE_ATTR_DAEMON(recoverd_secs, recoverd_process);
454TUNE_ATTR_DAEMON(logd_secs, logd_process);
455TUNE_ATTR_DAEMON(quotad_secs, quotad_process);
456TUNE_ATTR_3(quota_scale, quota_scale_show, quota_scale_store);
457
458static struct attribute *tune_attrs[] = {
459 &tune_attr_ilimit.attr,
460 &tune_attr_ilimit_tries.attr,
461 &tune_attr_ilimit_min.attr,
462 &tune_attr_demote_secs.attr,
463 &tune_attr_incore_log_blocks.attr,
464 &tune_attr_log_flush_secs.attr,
465 &tune_attr_jindex_refresh_secs.attr,
466 &tune_attr_quota_warn_period.attr,
467 &tune_attr_quota_quantum.attr,
468 &tune_attr_atime_quantum.attr,
469 &tune_attr_max_readahead.attr,
470 &tune_attr_complain_secs.attr,
471 &tune_attr_reclaim_limit.attr,
472 &tune_attr_prefetch_secs.attr,
473 &tune_attr_statfs_slow.attr,
474 &tune_attr_quota_simul_sync.attr,
475 &tune_attr_quota_cache_secs.attr,
476 &tune_attr_max_atomic_write.attr,
477 &tune_attr_stall_secs.attr,
478 &tune_attr_entries_per_readdir.attr,
479 &tune_attr_greedy_default.attr,
480 &tune_attr_greedy_quantum.attr,
481 &tune_attr_greedy_max.attr,
482 &tune_attr_statfs_quantum.attr,
483 &tune_attr_scand_secs.attr,
484 &tune_attr_recoverd_secs.attr,
485 &tune_attr_logd_secs.attr,
486 &tune_attr_quotad_secs.attr,
487 &tune_attr_quota_scale.attr,
488 &tune_attr_new_files_jdata.attr,
489 &tune_attr_new_files_directio.attr,
490 NULL
491};
492
493static struct attribute_group lockstruct_group = {
494 .name = "lockstruct",
495 .attrs = lockstruct_attrs
496};
497
498static struct attribute_group counters_group = {
499 .name = "counters",
500 .attrs = counters_attrs
501};
502
503static struct attribute_group args_group = {
504 .name = "args",
505 .attrs = args_attrs
506};
507
508static struct attribute_group tune_group = {
509 .name = "tune",
510 .attrs = tune_attrs
511};
512
513int gfs2_sys_fs_add(struct gfs2_sbd *sdp)
514{
515 int error;
516
517 sdp->sd_kobj.kset = &gfs2_kset;
518 sdp->sd_kobj.ktype = &gfs2_ktype;
519
520 error = kobject_set_name(&sdp->sd_kobj, "%s", sdp->sd_table_name);
521 if (error)
522 goto fail;
523
524 error = kobject_register(&sdp->sd_kobj);
525 if (error)
526 goto fail;
527
528 error = sysfs_create_group(&sdp->sd_kobj, &lockstruct_group);
529 if (error)
530 goto fail_reg;
531
532 error = sysfs_create_group(&sdp->sd_kobj, &counters_group);
533 if (error)
534 goto fail_lockstruct;
535
536 error = sysfs_create_group(&sdp->sd_kobj, &args_group);
537 if (error)
538 goto fail_counters;
539
540 error = sysfs_create_group(&sdp->sd_kobj, &tune_group);
541 if (error)
542 goto fail_args;
543
544 return 0;
545
546 fail_args:
547 sysfs_remove_group(&sdp->sd_kobj, &args_group);
548 fail_counters:
549 sysfs_remove_group(&sdp->sd_kobj, &counters_group);
550 fail_lockstruct:
551 sysfs_remove_group(&sdp->sd_kobj, &lockstruct_group);
552 fail_reg:
553 kobject_unregister(&sdp->sd_kobj);
554 fail:
555 return error;
556}
557
558void gfs2_sys_fs_del(struct gfs2_sbd *sdp)
559{
560 sysfs_remove_group(&sdp->sd_kobj, &tune_group);
561 sysfs_remove_group(&sdp->sd_kobj, &args_group);
562 sysfs_remove_group(&sdp->sd_kobj, &counters_group);
563 sysfs_remove_group(&sdp->sd_kobj, &lockstruct_group);
564 kobject_unregister(&sdp->sd_kobj);
565}
566
567int gfs2_sys_init(void)
568{
569 gfs2_sys_margs = NULL;
570 spin_lock_init(&gfs2_sys_margs_lock);
571 return kset_register(&gfs2_kset);
572}
573
574void gfs2_sys_uninit(void)
575{
576 kfree(gfs2_sys_margs);
577 kset_unregister(&gfs2_kset);
578}
579
diff --git a/fs/gfs2/sys.h b/fs/gfs2/sys.h
new file mode 100644
index 000000000000..c46a700e801e
--- /dev/null
+++ b/fs/gfs2/sys.h
@@ -0,0 +1,24 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __SYS_DOT_H__
11#define __SYS_DOT_H__
12
13/* Allow args to be passed to GFS2 when using an initial ram disk */
14extern char *gfs2_sys_margs;
15extern spinlock_t gfs2_sys_margs_lock;
16
17int gfs2_sys_fs_add(struct gfs2_sbd *sdp);
18void gfs2_sys_fs_del(struct gfs2_sbd *sdp);
19
20int gfs2_sys_init(void);
21void gfs2_sys_uninit(void);
22
23#endif /* __SYS_DOT_H__ */
24
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
new file mode 100644
index 000000000000..05e0b72d56ff
--- /dev/null
+++ b/fs/gfs2/trans.c
@@ -0,0 +1,184 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/gfs2_ondisk.h>
16#include <linux/kallsyms.h>
17
18#include "gfs2.h"
19#include "lm_interface.h"
20#include "incore.h"
21#include "glock.h"
22#include "log.h"
23#include "lops.h"
24#include "meta_io.h"
25#include "trans.h"
26#include "util.h"
27
28int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks,
29 unsigned int revokes)
30{
31 struct gfs2_trans *tr;
32 int error;
33
34 BUG_ON(current->journal_info);
35 BUG_ON(blocks == 0 && revokes == 0);
36
37 tr = kzalloc(sizeof(struct gfs2_trans), GFP_NOFS);
38 if (!tr)
39 return -ENOMEM;
40
41 tr->tr_ip = (unsigned long)__builtin_return_address(0);
42 tr->tr_blocks = blocks;
43 tr->tr_revokes = revokes;
44 tr->tr_reserved = 1;
45 if (blocks)
46 tr->tr_reserved += 6 + blocks;
47 if (revokes)
48 tr->tr_reserved += gfs2_struct2blk(sdp, revokes,
49 sizeof(uint64_t));
50 INIT_LIST_HEAD(&tr->tr_list_buf);
51
52 gfs2_holder_init(sdp->sd_trans_gl, LM_ST_SHARED, 0, &tr->tr_t_gh);
53
54 error = gfs2_glock_nq(&tr->tr_t_gh);
55 if (error)
56 goto fail_holder_uninit;
57
58 if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
59 tr->tr_t_gh.gh_flags |= GL_NOCACHE;
60 error = -EROFS;
61 goto fail_gunlock;
62 }
63
64 error = gfs2_log_reserve(sdp, tr->tr_reserved);
65 if (error)
66 goto fail_gunlock;
67
68 current->journal_info = tr;
69
70 return 0;
71
72fail_gunlock:
73 gfs2_glock_dq(&tr->tr_t_gh);
74
75fail_holder_uninit:
76 gfs2_holder_uninit(&tr->tr_t_gh);
77 kfree(tr);
78
79 return error;
80}
81
82void gfs2_trans_end(struct gfs2_sbd *sdp)
83{
84 struct gfs2_trans *tr = current->journal_info;
85
86 BUG_ON(!tr);
87 current->journal_info = NULL;
88
89 if (!tr->tr_touched) {
90 gfs2_log_release(sdp, tr->tr_reserved);
91 gfs2_glock_dq(&tr->tr_t_gh);
92 gfs2_holder_uninit(&tr->tr_t_gh);
93 kfree(tr);
94 return;
95 }
96
97 if (gfs2_assert_withdraw(sdp, tr->tr_num_buf <= tr->tr_blocks)) {
98 fs_err(sdp, "tr_num_buf = %u, tr_blocks = %u ",
99 tr->tr_num_buf, tr->tr_blocks);
100 print_symbol(KERN_WARNING "GFS2: Transaction created at: %s\n", tr->tr_ip);
101 }
102 if (gfs2_assert_withdraw(sdp, tr->tr_num_revoke <= tr->tr_revokes)) {
103 fs_err(sdp, "tr_num_revoke = %u, tr_revokes = %u ",
104 tr->tr_num_revoke, tr->tr_revokes);
105 print_symbol(KERN_WARNING "GFS2: Transaction created at: %s\n", tr->tr_ip);
106 }
107
108 gfs2_log_commit(sdp, tr);
109 gfs2_glock_dq(&tr->tr_t_gh);
110 gfs2_holder_uninit(&tr->tr_t_gh);
111 kfree(tr);
112
113 if (sdp->sd_vfs->s_flags & MS_SYNCHRONOUS)
114 gfs2_log_flush(sdp, NULL);
115}
116
117void gfs2_trans_add_gl(struct gfs2_glock *gl)
118{
119 lops_add(gl->gl_sbd, &gl->gl_le);
120}
121
122/**
123 * gfs2_trans_add_bh - Add a to-be-modified buffer to the current transaction
124 * @gl: the glock the buffer belongs to
125 * @bh: The buffer to add
126 * @meta: True in the case of adding metadata
127 *
128 */
129
130void gfs2_trans_add_bh(struct gfs2_glock *gl, struct buffer_head *bh, int meta)
131{
132 struct gfs2_sbd *sdp = gl->gl_sbd;
133 struct gfs2_bufdata *bd;
134
135 bd = bh->b_private;
136 if (bd)
137 gfs2_assert(sdp, bd->bd_gl == gl);
138 else {
139 gfs2_attach_bufdata(gl, bh, meta);
140 bd = bh->b_private;
141 }
142 lops_add(sdp, &bd->bd_le);
143}
144
145void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, uint64_t blkno)
146{
147 struct gfs2_revoke *rv = kmalloc(sizeof(struct gfs2_revoke),
148 GFP_NOFS | __GFP_NOFAIL);
149 lops_init_le(&rv->rv_le, &gfs2_revoke_lops);
150 rv->rv_blkno = blkno;
151 lops_add(sdp, &rv->rv_le);
152}
153
154void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, uint64_t blkno)
155{
156 struct gfs2_revoke *rv;
157 int found = 0;
158
159 gfs2_log_lock(sdp);
160
161 list_for_each_entry(rv, &sdp->sd_log_le_revoke, rv_le.le_list) {
162 if (rv->rv_blkno == blkno) {
163 list_del(&rv->rv_le.le_list);
164 gfs2_assert_withdraw(sdp, sdp->sd_log_num_revoke);
165 sdp->sd_log_num_revoke--;
166 found = 1;
167 break;
168 }
169 }
170
171 gfs2_log_unlock(sdp);
172
173 if (found) {
174 struct gfs2_trans *tr = current->journal_info;
175 kfree(rv);
176 tr->tr_num_revoke_rm++;
177 }
178}
179
180void gfs2_trans_add_rg(struct gfs2_rgrpd *rgd)
181{
182 lops_add(rgd->rd_sbd, &rgd->rd_le);
183}
184
diff --git a/fs/gfs2/trans.h b/fs/gfs2/trans.h
new file mode 100644
index 000000000000..fbef3f5a99e3
--- /dev/null
+++ b/fs/gfs2/trans.h
@@ -0,0 +1,34 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __TRANS_DOT_H__
11#define __TRANS_DOT_H__
12
13#define RES_DINODE 1
14#define RES_INDIRECT 1
15#define RES_JDATA 1
16#define RES_DATA 1
17#define RES_LEAF 1
18#define RES_RG_BIT 2
19#define RES_EATTR 1
20#define RES_STATFS 1
21#define RES_QUOTA 2
22
23int gfs2_trans_begin(struct gfs2_sbd *sdp,
24 unsigned int blocks, unsigned int revokes);
25
26void gfs2_trans_end(struct gfs2_sbd *sdp);
27
28void gfs2_trans_add_gl(struct gfs2_glock *gl);
29void gfs2_trans_add_bh(struct gfs2_glock *gl, struct buffer_head *bh, int meta);
30void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, uint64_t blkno);
31void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, uint64_t blkno);
32void gfs2_trans_add_rg(struct gfs2_rgrpd *rgd);
33
34#endif /* __TRANS_DOT_H__ */
diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c
new file mode 100644
index 000000000000..39e67b1ec70a
--- /dev/null
+++ b/fs/gfs2/util.c
@@ -0,0 +1,245 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/crc32.h>
16#include <linux/gfs2_ondisk.h>
17#include <asm/uaccess.h>
18
19#include "gfs2.h"
20#include "lm_interface.h"
21#include "incore.h"
22#include "glock.h"
23#include "lm.h"
24#include "util.h"
25
26kmem_cache_t *gfs2_glock_cachep __read_mostly;
27kmem_cache_t *gfs2_inode_cachep __read_mostly;
28kmem_cache_t *gfs2_bufdata_cachep __read_mostly;
29
30void gfs2_assert_i(struct gfs2_sbd *sdp)
31{
32 printk(KERN_EMERG "GFS2: fsid=%s: fatal assertion failed\n",
33 sdp->sd_fsname);
34}
35
36/**
37 * gfs2_assert_withdraw_i - Cause the machine to withdraw if @assertion is false
38 * Returns: -1 if this call withdrew the machine,
39 * -2 if it was already withdrawn
40 */
41
42int gfs2_assert_withdraw_i(struct gfs2_sbd *sdp, char *assertion,
43 const char *function, char *file, unsigned int line)
44{
45 int me;
46 me = gfs2_lm_withdraw(sdp,
47 "GFS2: fsid=%s: fatal: assertion \"%s\" failed\n"
48 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
49 sdp->sd_fsname, assertion,
50 sdp->sd_fsname, function, file, line);
51 dump_stack();
52 return (me) ? -1 : -2;
53}
54
55/**
56 * gfs2_assert_warn_i - Print a message to the console if @assertion is false
57 * Returns: -1 if we printed something
58 * -2 if we didn't
59 */
60
61int gfs2_assert_warn_i(struct gfs2_sbd *sdp, char *assertion,
62 const char *function, char *file, unsigned int line)
63{
64 if (time_before(jiffies,
65 sdp->sd_last_warning +
66 gfs2_tune_get(sdp, gt_complain_secs) * HZ))
67 return -2;
68
69 printk(KERN_WARNING
70 "GFS2: fsid=%s: warning: assertion \"%s\" failed\n"
71 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
72 sdp->sd_fsname, assertion,
73 sdp->sd_fsname, function, file, line);
74
75 if (sdp->sd_args.ar_debug)
76 BUG();
77 else
78 dump_stack();
79
80 sdp->sd_last_warning = jiffies;
81
82 return -1;
83}
84
85/**
86 * gfs2_consist_i - Flag a filesystem consistency error and withdraw
87 * Returns: -1 if this call withdrew the machine,
88 * 0 if it was already withdrawn
89 */
90
91int gfs2_consist_i(struct gfs2_sbd *sdp, int cluster_wide, const char *function,
92 char *file, unsigned int line)
93{
94 int rv;
95 rv = gfs2_lm_withdraw(sdp,
96 "GFS2: fsid=%s: fatal: filesystem consistency error\n"
97 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
98 sdp->sd_fsname,
99 sdp->sd_fsname, function, file, line);
100 return rv;
101}
102
103/**
104 * gfs2_consist_inode_i - Flag an inode consistency error and withdraw
105 * Returns: -1 if this call withdrew the machine,
106 * 0 if it was already withdrawn
107 */
108
109int gfs2_consist_inode_i(struct gfs2_inode *ip, int cluster_wide,
110 const char *function, char *file, unsigned int line)
111{
112 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
113 int rv;
114 rv = gfs2_lm_withdraw(sdp,
115 "GFS2: fsid=%s: fatal: filesystem consistency error\n"
116 "GFS2: fsid=%s: inode = %llu %llu\n"
117 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
118 sdp->sd_fsname,
119 sdp->sd_fsname, (unsigned long long)ip->i_num.no_formal_ino,
120 (unsigned long long)ip->i_num.no_addr,
121 sdp->sd_fsname, function, file, line);
122 return rv;
123}
124
125/**
126 * gfs2_consist_rgrpd_i - Flag a RG consistency error and withdraw
127 * Returns: -1 if this call withdrew the machine,
128 * 0 if it was already withdrawn
129 */
130
131int gfs2_consist_rgrpd_i(struct gfs2_rgrpd *rgd, int cluster_wide,
132 const char *function, char *file, unsigned int line)
133{
134 struct gfs2_sbd *sdp = rgd->rd_sbd;
135 int rv;
136 rv = gfs2_lm_withdraw(sdp,
137 "GFS2: fsid=%s: fatal: filesystem consistency error\n"
138 "GFS2: fsid=%s: RG = %llu\n"
139 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
140 sdp->sd_fsname,
141 sdp->sd_fsname, (unsigned long long)rgd->rd_ri.ri_addr,
142 sdp->sd_fsname, function, file, line);
143 return rv;
144}
145
146/**
147 * gfs2_meta_check_ii - Flag a magic number consistency error and withdraw
148 * Returns: -1 if this call withdrew the machine,
149 * -2 if it was already withdrawn
150 */
151
152int gfs2_meta_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh,
153 const char *type, const char *function, char *file,
154 unsigned int line)
155{
156 int me;
157 me = gfs2_lm_withdraw(sdp,
158 "GFS2: fsid=%s: fatal: invalid metadata block\n"
159 "GFS2: fsid=%s: bh = %llu (%s)\n"
160 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
161 sdp->sd_fsname,
162 sdp->sd_fsname, (unsigned long long)bh->b_blocknr, type,
163 sdp->sd_fsname, function, file, line);
164 return (me) ? -1 : -2;
165}
166
167/**
168 * gfs2_metatype_check_ii - Flag a metadata type consistency error and withdraw
169 * Returns: -1 if this call withdrew the machine,
170 * -2 if it was already withdrawn
171 */
172
173int gfs2_metatype_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh,
174 uint16_t type, uint16_t t, const char *function,
175 char *file, unsigned int line)
176{
177 int me;
178 me = gfs2_lm_withdraw(sdp,
179 "GFS2: fsid=%s: fatal: invalid metadata block\n"
180 "GFS2: fsid=%s: bh = %llu (type: exp=%u, found=%u)\n"
181 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
182 sdp->sd_fsname,
183 sdp->sd_fsname, (unsigned long long)bh->b_blocknr, type, t,
184 sdp->sd_fsname, function, file, line);
185 return (me) ? -1 : -2;
186}
187
188/**
189 * gfs2_io_error_i - Flag an I/O error and withdraw
190 * Returns: -1 if this call withdrew the machine,
191 * 0 if it was already withdrawn
192 */
193
194int gfs2_io_error_i(struct gfs2_sbd *sdp, const char *function, char *file,
195 unsigned int line)
196{
197 int rv;
198 rv = gfs2_lm_withdraw(sdp,
199 "GFS2: fsid=%s: fatal: I/O error\n"
200 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
201 sdp->sd_fsname,
202 sdp->sd_fsname, function, file, line);
203 return rv;
204}
205
206/**
207 * gfs2_io_error_bh_i - Flag a buffer I/O error and withdraw
208 * Returns: -1 if this call withdrew the machine,
209 * 0 if it was already withdrawn
210 */
211
212int gfs2_io_error_bh_i(struct gfs2_sbd *sdp, struct buffer_head *bh,
213 const char *function, char *file, unsigned int line)
214{
215 int rv;
216 rv = gfs2_lm_withdraw(sdp,
217 "GFS2: fsid=%s: fatal: I/O error\n"
218 "GFS2: fsid=%s: block = %llu\n"
219 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
220 sdp->sd_fsname,
221 sdp->sd_fsname, (unsigned long long)bh->b_blocknr,
222 sdp->sd_fsname, function, file, line);
223 return rv;
224}
225
226void gfs2_icbit_munge(struct gfs2_sbd *sdp, unsigned char **bitmap,
227 unsigned int bit, int new_value)
228{
229 unsigned int c, o, b = bit;
230 int old_value;
231
232 c = b / (8 * PAGE_SIZE);
233 b %= 8 * PAGE_SIZE;
234 o = b / 8;
235 b %= 8;
236
237 old_value = (bitmap[c][o] & (1 << b));
238 gfs2_assert_withdraw(sdp, !old_value != !new_value);
239
240 if (new_value)
241 bitmap[c][o] |= 1 << b;
242 else
243 bitmap[c][o] &= ~(1 << b);
244}
245
diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h
new file mode 100644
index 000000000000..8216d28bd816
--- /dev/null
+++ b/fs/gfs2/util.h
@@ -0,0 +1,169 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __UTIL_DOT_H__
11#define __UTIL_DOT_H__
12
13
14#define fs_printk(level, fs, fmt, arg...) \
15 printk(level "GFS2: fsid=%s: " fmt , (fs)->sd_fsname , ## arg)
16
17#define fs_info(fs, fmt, arg...) \
18 fs_printk(KERN_INFO , fs , fmt , ## arg)
19
20#define fs_warn(fs, fmt, arg...) \
21 fs_printk(KERN_WARNING , fs , fmt , ## arg)
22
23#define fs_err(fs, fmt, arg...) \
24 fs_printk(KERN_ERR, fs , fmt , ## arg)
25
26
27void gfs2_assert_i(struct gfs2_sbd *sdp);
28
29#define gfs2_assert(sdp, assertion) \
30do { \
31 if (unlikely(!(assertion))) { \
32 gfs2_assert_i(sdp); \
33 BUG(); \
34 } \
35} while (0)
36
37
38int gfs2_assert_withdraw_i(struct gfs2_sbd *sdp, char *assertion,
39 const char *function, char *file, unsigned int line);
40
41#define gfs2_assert_withdraw(sdp, assertion) \
42((likely(assertion)) ? 0 : gfs2_assert_withdraw_i((sdp), #assertion, \
43 __FUNCTION__, __FILE__, __LINE__))
44
45
46int gfs2_assert_warn_i(struct gfs2_sbd *sdp, char *assertion,
47 const char *function, char *file, unsigned int line);
48
49#define gfs2_assert_warn(sdp, assertion) \
50((likely(assertion)) ? 0 : gfs2_assert_warn_i((sdp), #assertion, \
51 __FUNCTION__, __FILE__, __LINE__))
52
53
54int gfs2_consist_i(struct gfs2_sbd *sdp, int cluster_wide,
55 const char *function, char *file, unsigned int line);
56
57#define gfs2_consist(sdp) \
58gfs2_consist_i((sdp), 0, __FUNCTION__, __FILE__, __LINE__)
59
60
61int gfs2_consist_inode_i(struct gfs2_inode *ip, int cluster_wide,
62 const char *function, char *file, unsigned int line);
63
64#define gfs2_consist_inode(ip) \
65gfs2_consist_inode_i((ip), 0, __FUNCTION__, __FILE__, __LINE__)
66
67
68int gfs2_consist_rgrpd_i(struct gfs2_rgrpd *rgd, int cluster_wide,
69 const char *function, char *file, unsigned int line);
70
71#define gfs2_consist_rgrpd(rgd) \
72gfs2_consist_rgrpd_i((rgd), 0, __FUNCTION__, __FILE__, __LINE__)
73
74
75int gfs2_meta_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh,
76 const char *type, const char *function,
77 char *file, unsigned int line);
78
79static inline int gfs2_meta_check_i(struct gfs2_sbd *sdp,
80 struct buffer_head *bh,
81 const char *function,
82 char *file, unsigned int line)
83{
84 struct gfs2_meta_header *mh = (struct gfs2_meta_header *)bh->b_data;
85 uint32_t magic = mh->mh_magic;
86 magic = be32_to_cpu(magic);
87 if (unlikely(magic != GFS2_MAGIC))
88 return gfs2_meta_check_ii(sdp, bh, "magic number", function,
89 file, line);
90 return 0;
91}
92
93#define gfs2_meta_check(sdp, bh) \
94gfs2_meta_check_i((sdp), (bh), __FUNCTION__, __FILE__, __LINE__)
95
96
97int gfs2_metatype_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh,
98 uint16_t type, uint16_t t,
99 const char *function,
100 char *file, unsigned int line);
101/* Check bh's header: magic must be GFS2_MAGIC and mh_type must equal type; 0 if both match. */
102static inline int gfs2_metatype_check_i(struct gfs2_sbd *sdp,
103 struct buffer_head *bh,
104 uint16_t type,
105 const char *function,
106 char *file, unsigned int line)
107{
108 struct gfs2_meta_header *mh = (struct gfs2_meta_header *)bh->b_data;
109 uint32_t magic = mh->mh_magic;
110 uint32_t t = be32_to_cpu(mh->mh_type); /* full 32 bits: do not mask corrupt high bits */
111 magic = be32_to_cpu(magic);
112 if (unlikely(magic != GFS2_MAGIC))
113 return gfs2_meta_check_ii(sdp, bh, "magic number", function,
114 file, line);
115 if (unlikely(t != type))
116 return gfs2_metatype_check_ii(sdp, bh, type, t, function,
117 file, line);
118 return 0;
119}
120
121#define gfs2_metatype_check(sdp, bh, type) \
122gfs2_metatype_check_i((sdp), (bh), (type), __FUNCTION__, __FILE__, __LINE__)
123
124static inline void gfs2_metatype_set(struct buffer_head *bh, uint16_t type,
125 uint16_t format)
126{
127 struct gfs2_meta_header *mh;
128 mh = (struct gfs2_meta_header *)bh->b_data;
129 mh->mh_type = cpu_to_be32(type);
130 mh->mh_format = cpu_to_be32(format);
131}
132
133
134int gfs2_io_error_i(struct gfs2_sbd *sdp, const char *function,
135 char *file, unsigned int line);
136
137#define gfs2_io_error(sdp) \
138gfs2_io_error_i((sdp), __FUNCTION__, __FILE__, __LINE__);
139
140
141int gfs2_io_error_bh_i(struct gfs2_sbd *sdp, struct buffer_head *bh,
142 const char *function, char *file, unsigned int line);
143
144#define gfs2_io_error_bh(sdp, bh) \
145gfs2_io_error_bh_i((sdp), (bh), __FUNCTION__, __FILE__, __LINE__);
146
147
148extern kmem_cache_t *gfs2_glock_cachep;
149extern kmem_cache_t *gfs2_inode_cachep;
150extern kmem_cache_t *gfs2_bufdata_cachep;
151
152static inline unsigned int gfs2_tune_get_i(struct gfs2_tune *gt,
153 unsigned int *p)
154{
155 unsigned int x;
156 spin_lock(&gt->gt_spin);
157 x = *p;
158 spin_unlock(&gt->gt_spin);
159 return x;
160}
161
162#define gfs2_tune_get(sdp, field) \
163gfs2_tune_get_i(&(sdp)->sd_tune, &(sdp)->sd_tune.field)
164
165void gfs2_icbit_munge(struct gfs2_sbd *sdp, unsigned char **bitmap,
166 unsigned int bit, int new_value);
167
168#endif /* __UTIL_DOT_H__ */
169
diff --git a/include/linux/Kbuild b/include/linux/Kbuild
index 2b8a7d68fae3..2121cde187d8 100644
--- a/include/linux/Kbuild
+++ b/include/linux/Kbuild
@@ -8,7 +8,7 @@ header-y += affs_fs.h affs_hardblocks.h aio_abi.h a.out.h arcfb.h \
8 atmppp.h atmsap.h atmsvc.h atm_zatm.h auto_fs4.h auxvec.h \ 8 atmppp.h atmsap.h atmsvc.h atm_zatm.h auto_fs4.h auxvec.h \
9 awe_voice.h ax25.h b1lli.h baycom.h bfs_fs.h blkpg.h \ 9 awe_voice.h ax25.h b1lli.h baycom.h bfs_fs.h blkpg.h \
10 bpqether.h cdk.h chio.h coda_psdev.h coff.h comstats.h \ 10 bpqether.h cdk.h chio.h coda_psdev.h coff.h comstats.h \
11 consolemap.h cycx_cfm.h dm-ioctl.h dn.h dqblk_v1.h \ 11 consolemap.h cycx_cfm.h dlm_device.h dm-ioctl.h dn.h dqblk_v1.h \
12 dqblk_v2.h dqblk_xfs.h efs_fs_sb.h elf-fdpic.h elf.h elf-em.h \ 12 dqblk_v2.h dqblk_xfs.h efs_fs_sb.h elf-fdpic.h elf.h elf-em.h \
13 fadvise.h fd.h fdreg.h ftape-header-segment.h ftape-vendors.h \ 13 fadvise.h fd.h fdreg.h ftape-header-segment.h ftape-vendors.h \
14 fuse.h futex.h genetlink.h gen_stats.h gigaset_dev.h hdsmart.h \ 14 fuse.h futex.h genetlink.h gen_stats.h gigaset_dev.h hdsmart.h \
@@ -18,28 +18,29 @@ header-y += affs_fs.h affs_hardblocks.h aio_abi.h a.out.h arcfb.h \
18 if_plip.h if_ppp.h if_slip.h if_strip.h if_tunnel.h in6.h \ 18 if_plip.h if_ppp.h if_slip.h if_strip.h if_tunnel.h in6.h \
19 in_route.h ioctl.h ip.h ipmi_msgdefs.h ip_mp_alg.h ipsec.h \ 19 in_route.h ioctl.h ip.h ipmi_msgdefs.h ip_mp_alg.h ipsec.h \
20 ipx.h irda.h isdn_divertif.h iso_fs.h ite_gpio.h ixjuser.h \ 20 ipx.h irda.h isdn_divertif.h iso_fs.h ite_gpio.h ixjuser.h \
21 jffs2.h keyctl.h limits.h major.h matroxfb.h meye.h minix_fs.h \ 21 jffs2.h keyctl.h limits.h lock_dlm_plock.h major.h matroxfb.h \
22 mmtimer.h mqueue.h mtio.h ncp_no.h netfilter_arp.h netrom.h \ 22 meye.h minix_fs.h mmtimer.h mqueue.h mtio.h ncp_no.h \
23 nfs2.h nfs4_mount.h nfs_mount.h openprom_fs.h param.h \ 23 netfilter_arp.h netrom.h nfs2.h nfs4_mount.h nfs_mount.h \
24 pci_ids.h pci_regs.h personality.h pfkeyv2.h pg.h pkt_cls.h \ 24 openprom_fs.h param.h pci_ids.h pci_regs.h personality.h \
25 pkt_sched.h posix_types.h ppdev.h prctl.h ps2esdi.h qic117.h \ 25 pfkeyv2.h pg.h pkt_cls.h pkt_sched.h posix_types.h ppdev.h \
26 qnxtypes.h quotaio_v1.h quotaio_v2.h radeonfb.h raw.h \ 26 prctl.h ps2esdi.h qic117.h qnxtypes.h quotaio_v1.h quotaio_v2.h \
27 resource.h rose.h sctp.h smbno.h snmp.h sockios.h som.h \ 27 radeonfb.h raw.h resource.h rose.h sctp.h smbno.h snmp.h \
28 sound.h stddef.h synclink.h telephony.h termios.h ticable.h \ 28 sockios.h som.h sound.h stddef.h synclink.h telephony.h \
29 times.h tiocl.h tipc.h toshiba.h ultrasound.h un.h utime.h \ 29 termios.h ticable.h times.h tiocl.h tipc.h toshiba.h \
30 utsname.h video_decoder.h video_encoder.h videotext.h vt.h \ 30 ultrasound.h un.h utime.h utsname.h video_decoder.h \
31 wavefront.h wireless.h xattr.h x25.h zorro_ids.h 31 video_encoder.h videotext.h vt.h wavefront.h wireless.h xattr.h \
32 x25.h zorro_ids.h
32 33
33unifdef-y += acct.h adb.h adfs_fs.h agpgart.h apm_bios.h atalk.h \ 34unifdef-y += acct.h adb.h adfs_fs.h agpgart.h apm_bios.h atalk.h \
34 atmarp.h atmdev.h atm.h atm_tcp.h audit.h auto_fs.h binfmts.h \ 35 atmarp.h atmdev.h atm.h atm_tcp.h audit.h auto_fs.h binfmts.h \
35 capability.h capi.h cciss_ioctl.h cdrom.h cm4000_cs.h \ 36 capability.h capi.h cciss_ioctl.h cdrom.h cm4000_cs.h \
36 cn_proc.h coda.h connector.h cramfs_fs.h cuda.h cyclades.h \ 37 cn_proc.h coda.h connector.h cramfs_fs.h cuda.h cyclades.h \
37 dccp.h dirent.h divert.h elfcore.h errno.h errqueue.h \ 38 dccp.h dirent.h divert.h dlm.h elfcore.h errno.h errqueue.h \
38 ethtool.h eventpoll.h ext2_fs.h ext3_fs.h fb.h fcntl.h \ 39 ethtool.h eventpoll.h ext2_fs.h ext3_fs.h fb.h fcntl.h \
39 filter.h flat.h fs.h ftape.h gameport.h generic_serial.h \ 40 filter.h flat.h fs.h ftape.h gameport.h generic_serial.h \
40 genhd.h hayesesp.h hdlcdrv.h hdlc.h hdreg.h hiddev.h hpet.h \ 41 genhd.h gfs2_ondisk.h hayesesp.h hdlcdrv.h hdlc.h hdreg.h \
41 i2c.h i2o-dev.h icmpv6.h if_bridge.h if_ec.h \ 42 hiddev.h hpet.h i2c.h i2o-dev.h icmpv6.h iflags.h if_bridge.h \
42 if_eql.h if_ether.h if_frad.h if_ltalk.h if_pppox.h \ 43 if_ec.h if_eql.h if_ether.h if_frad.h if_ltalk.h if_pppox.h \
43 if_shaper.h if_tr.h if_tun.h if_vlan.h if_wanpipe.h igmp.h \ 44 if_shaper.h if_tr.h if_tun.h if_vlan.h if_wanpipe.h igmp.h \
44 inet_diag.h in.h inotify.h input.h ipc.h ipmi.h ipv6.h \ 45 inet_diag.h in.h inotify.h input.h ipc.h ipmi.h ipv6.h \
45 ipv6_route.h isdn.h isdnif.h isdn_ppp.h isicom.h jbd.h \ 46 ipv6_route.h isdn.h isdnif.h isdn_ppp.h isicom.h jbd.h \
diff --git a/include/linux/dlm.h b/include/linux/dlm.h
new file mode 100644
index 000000000000..1b1dcb9a40bb
--- /dev/null
+++ b/include/linux/dlm.h
@@ -0,0 +1,302 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#ifndef __DLM_DOT_H__
15#define __DLM_DOT_H__
16
17/*
18 * Interface to Distributed Lock Manager (DLM)
19 * routines and structures to use DLM lockspaces
20 */
21
22/*
23 * Lock Modes
24 */
25
26#define DLM_LOCK_IV -1 /* invalid */
27#define DLM_LOCK_NL 0 /* null */
28#define DLM_LOCK_CR 1 /* concurrent read */
29#define DLM_LOCK_CW 2 /* concurrent write */
30#define DLM_LOCK_PR 3 /* protected read */
31#define DLM_LOCK_PW 4 /* protected write */
32#define DLM_LOCK_EX 5 /* exclusive */
33
34/*
35 * Maximum size in bytes of a dlm_lock name
36 */
37
38#define DLM_RESNAME_MAXLEN 64
39
40/*
41 * Flags to dlm_lock
42 *
43 * DLM_LKF_NOQUEUE
44 *
45 * Do not queue the lock request on the wait queue if it cannot be granted
46 * immediately. If the lock cannot be granted because of this flag, DLM will
47 * either return -EAGAIN from the dlm_lock call or will return 0 from
48 * dlm_lock and -EAGAIN in the lock status block when the AST is executed.
49 *
50 * DLM_LKF_CANCEL
51 *
52 * Used to cancel a pending lock request or conversion. A converting lock is
53 * returned to its previously granted mode.
54 *
55 * DLM_LKF_CONVERT
56 *
57 * Indicates a lock conversion request. For conversions the name and namelen
58 * are ignored and the lock ID in the LKSB is used to identify the lock.
59 *
60 * DLM_LKF_VALBLK
61 *
62 * Requests DLM to return the current contents of the lock value block in the
63 * lock status block. When this flag is set in a lock conversion from PW or EX
64 * modes, DLM assigns the value specified in the lock status block to the lock
65 * value block of the lock resource. The LVB is a DLM_LVB_LEN size array
66 * containing application-specific information.
67 *
68 * DLM_LKF_QUECVT
69 *
70 * Force a conversion request to be queued, even if it is compatible with
71 * the granted modes of other locks on the same resource.
72 *
73 * DLM_LKF_IVVALBLK
74 *
75 * Invalidate the lock value block.
76 *
77 * DLM_LKF_CONVDEADLK
78 *
79 * Allows the dlm to resolve conversion deadlocks internally by demoting the
80 * granted mode of a converting lock to NL. The DLM_SBF_DEMOTED flag is
81 * returned for a conversion that's been affected by this.
82 *
83 * DLM_LKF_PERSISTENT
84 *
85 * Only relevant to locks originating in userspace. A persistent lock will not
86 * be removed if the process holding the lock exits.
87 *
88 * DLM_LKF_NODLCKWT
89 * DLM_LKF_NODLCKBLK
90 *
91 * not yet implemented
92 *
93 * DLM_LKF_EXPEDITE
94 *
95 * Used only with new requests for NL mode locks. Tells the lock manager
96 * to grant the lock, ignoring other locks in convert and wait queues.
97 *
98 * DLM_LKF_NOQUEUEBAST
99 *
100 * Send blocking AST's before returning -EAGAIN to the caller. It is only
101 * used along with the NOQUEUE flag. Blocking AST's are not sent for failed
102 * NOQUEUE requests otherwise.
103 *
104 * DLM_LKF_HEADQUE
105 *
106 * Add a lock to the head of the convert or wait queue rather than the tail.
107 *
108 * DLM_LKF_NOORDER
109 *
110 * Disregard the standard grant order rules and grant a lock as soon as it
111 * is compatible with other granted locks.
112 *
113 * DLM_LKF_ORPHAN
114 *
115 * not yet implemented
116 *
117 * DLM_LKF_ALTPR
118 *
119 * If the requested mode cannot be granted immediately, try to grant the lock
120 * in PR mode instead. If this alternate mode is granted instead of the
121 * requested mode, DLM_SBF_ALTMODE is returned in the lksb.
122 *
123 * DLM_LKF_ALTCW
124 *
125 * The same as ALTPR, but the alternate mode is CW.
126 *
127 * DLM_LKF_FORCEUNLOCK
128 *
129 * Unlock the lock even if it is converting or waiting or has sublocks.
130 * Only really for use by the userland device.c code.
131 *
132 */
133
134#define DLM_LKF_NOQUEUE 0x00000001
135#define DLM_LKF_CANCEL 0x00000002
136#define DLM_LKF_CONVERT 0x00000004
137#define DLM_LKF_VALBLK 0x00000008
138#define DLM_LKF_QUECVT 0x00000010
139#define DLM_LKF_IVVALBLK 0x00000020
140#define DLM_LKF_CONVDEADLK 0x00000040
141#define DLM_LKF_PERSISTENT 0x00000080
142#define DLM_LKF_NODLCKWT 0x00000100
143#define DLM_LKF_NODLCKBLK 0x00000200
144#define DLM_LKF_EXPEDITE 0x00000400
145#define DLM_LKF_NOQUEUEBAST 0x00000800
146#define DLM_LKF_HEADQUE 0x00001000
147#define DLM_LKF_NOORDER 0x00002000
148#define DLM_LKF_ORPHAN 0x00004000
149#define DLM_LKF_ALTPR 0x00008000
150#define DLM_LKF_ALTCW 0x00010000
151#define DLM_LKF_FORCEUNLOCK 0x00020000
152
153/*
154 * Some return codes that are not in errno.h
155 */
156
157#define DLM_ECANCEL 0x10001
158#define DLM_EUNLOCK 0x10002
159
160typedef void dlm_lockspace_t;
161
162/*
163 * Lock status block
164 *
165 * Use this structure to specify the contents of the lock value block. For a
166 * conversion request, this structure is used to specify the lock ID of the
167 * lock. DLM writes the status of the lock request and the lock ID assigned
168 * to the request in the lock status block.
169 *
170 * sb_lkid: the returned lock ID. It is set on new (non-conversion) requests.
171 * It is available when dlm_lock returns.
172 *
173 * sb_lvbptr: saves or returns the contents of the lock's LVB according to rules
174 * shown for the DLM_LKF_VALBLK flag.
175 *
176 * sb_flags: DLM_SBF_DEMOTED is returned if in the process of promoting a lock,
177 * it was first demoted to NL to avoid conversion deadlock.
178 * DLM_SBF_VALNOTVALID is returned if the resource's LVB is marked invalid.
179 *
180 * sb_status: the returned status of the lock request set prior to AST
181 * execution. Possible return values:
182 *
183 * 0 if lock request was successful
184 * -EAGAIN if request would block and is flagged DLM_LKF_NOQUEUE
185 * -ENOMEM if there is no memory to process request
186 * -EINVAL if there are invalid parameters
187 * -DLM_EUNLOCK if unlock request was successful
188 * -DLM_ECANCEL if a cancel completed successfully
189 */
190
191#define DLM_SBF_DEMOTED 0x01
192#define DLM_SBF_VALNOTVALID 0x02
193#define DLM_SBF_ALTMODE 0x04
194
195struct dlm_lksb {
196 int sb_status;
197 uint32_t sb_lkid;
198 char sb_flags;
199 char * sb_lvbptr;
200};
201
202
203#ifdef __KERNEL__
204
205#define DLM_LSFL_NODIR 0x00000001
206
207/*
208 * dlm_new_lockspace
209 *
210 * Starts a lockspace with the given name. If the named lockspace exists in
211 * the cluster, the calling node joins it.
212 */
213
214int dlm_new_lockspace(char *name, int namelen, dlm_lockspace_t **lockspace,
215 uint32_t flags, int lvblen);
216
217/*
218 * dlm_release_lockspace
219 *
220 * Stop a lockspace.
221 */
222
223int dlm_release_lockspace(dlm_lockspace_t *lockspace, int force);
224
225/*
226 * dlm_lock
227 *
228 * Make an asynchronous request to acquire or convert a lock on a named
229 * resource.
230 *
231 * lockspace: context for the request
232 * mode: the requested mode of the lock (DLM_LOCK_)
233 * lksb: lock status block for input and async return values
234 * flags: input flags (DLM_LKF_)
235 * name: name of the resource to lock, can be binary
236 * namelen: the length in bytes of the resource name (at most DLM_RESNAME_MAXLEN)
237 * parent_lkid: the lock ID of a parent lock or 0 if none
238 * lockast: function DLM executes when it completes processing the request
239 * astarg: argument passed to lockast and bast functions
240 * bast: function DLM executes when this lock later blocks another request
241 *
242 * Returns:
243 * 0 if request is successfully queued for processing
244 * -EINVAL if any input parameters are invalid
245 * -EAGAIN if request would block and is flagged DLM_LKF_NOQUEUE
246 * -ENOMEM if there is no memory to process request
247 * -ENOTCONN if there is a communication error
248 *
249 * If the call to dlm_lock returns an error then the operation has failed and
250 * the AST routine will not be called. If dlm_lock returns 0 it is still
251 * possible that the lock operation will fail. The AST routine will be called
252 * when the locking is complete and the status is returned in the lksb.
253 *
254 * If the AST routines or parameter are passed to a conversion operation then
255 * they will overwrite those values that were passed to a previous dlm_lock
256 * call.
257 *
258 * AST routines should not block (at least not for long), but may make
259 * any locking calls they please.
260 */
261
262int dlm_lock(dlm_lockspace_t *lockspace,
263 int mode,
264 struct dlm_lksb *lksb,
265 uint32_t flags,
266 void *name,
267 unsigned int namelen,
268 uint32_t parent_lkid,
269 void (*lockast) (void *astarg),
270 void *astarg,
271 void (*bast) (void *astarg, int mode));
272
273/*
274 * dlm_unlock
275 *
276 * Asynchronously release a lock on a resource. The AST routine is called
277 * when the resource is successfully unlocked.
278 *
279 * lockspace: context for the request
280 * lkid: the lock ID as returned in the lksb
281 * flags: input flags (DLM_LKF_)
282 * lksb: if NULL the lksb parameter passed to last lock request is used
283 * astarg: the arg used with the completion ast for the unlock
284 *
285 * Returns:
286 * 0 if request is successfully queued for processing
287 * -EINVAL if any input parameters are invalid
288 * -ENOTEMPTY if the lock still has sublocks
289 * -EBUSY if the lock is waiting for a remote lock operation
290 * -ENOTCONN if there is a communication error
291 */
292
293int dlm_unlock(dlm_lockspace_t *lockspace,
294 uint32_t lkid,
295 uint32_t flags,
296 struct dlm_lksb *lksb,
297 void *astarg);
298
299#endif /* __KERNEL__ */
300
301#endif /* __DLM_DOT_H__ */
302
diff --git a/include/linux/dlm_device.h b/include/linux/dlm_device.h
new file mode 100644
index 000000000000..2a2dd189b9fd
--- /dev/null
+++ b/include/linux/dlm_device.h
@@ -0,0 +1,86 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14/* This is the device interface for dlm; most users will use a library
15 * interface.
16 */
17
18#define DLM_USER_LVB_LEN 32
19
20/* Version of the device interface */
21#define DLM_DEVICE_VERSION_MAJOR 5
22#define DLM_DEVICE_VERSION_MINOR 0
23#define DLM_DEVICE_VERSION_PATCH 0
24
25/* struct passed to the lock write */
26struct dlm_lock_params {
27 __u8 mode;
28 __u8 namelen;
29 __u16 flags;
30 __u32 lkid;
31 __u32 parent;
32 void __user *castparam;
33 void __user *castaddr;
34 void __user *bastparam;
35 void __user *bastaddr;
36 struct dlm_lksb __user *lksb;
37 char lvb[DLM_USER_LVB_LEN];
38 char name[0];
39};
40
41struct dlm_lspace_params {
42 __u32 flags;
43 __u32 minor;
44 char name[0];
45};
46
47struct dlm_write_request {
48 __u32 version[3];
49 __u8 cmd;
50 __u8 is64bit;
51 __u8 unused[2];
52
53 union {
54 struct dlm_lock_params lock;
55 struct dlm_lspace_params lspace;
56 } i;
57};
58
59/* struct read from the "device" fd,
60 consists mainly of userspace pointers for the library to use */
61struct dlm_lock_result {
62 __u32 length;
63 void __user * user_astaddr;
64 void __user * user_astparam;
65 struct dlm_lksb __user * user_lksb;
66 struct dlm_lksb lksb;
67 __u8 bast_mode;
68 __u8 unused[3];
69 /* Offsets may be zero if no data is present */
70 __u32 lvb_offset;
71};
72
73/* Commands passed to the device */
74#define DLM_USER_LOCK 1
75#define DLM_USER_UNLOCK 2
76#define DLM_USER_QUERY 3
77#define DLM_USER_CREATE_LOCKSPACE 4
78#define DLM_USER_REMOVE_LOCKSPACE 5
79
80/* Arbitrary length restriction */
81#define MAX_LS_NAME_LEN 64
82
83/* Lockspace flags */
84#define DLM_USER_LSFLG_AUTOFREE 1
85#define DLM_USER_LSFLG_FORCEFREE 2
86
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 25610205c90d..83abd9d7898f 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1375,6 +1375,9 @@ extern struct subsystem fs_subsys;
1375#define FLOCK_VERIFY_READ 1 1375#define FLOCK_VERIFY_READ 1
1376#define FLOCK_VERIFY_WRITE 2 1376#define FLOCK_VERIFY_WRITE 2
1377 1377
1378/* /sys/fs */
1379extern struct subsystem fs_subsys;
1380
1378extern int locks_mandatory_locked(struct inode *); 1381extern int locks_mandatory_locked(struct inode *);
1379extern int locks_mandatory_area(int, struct inode *, struct file *, loff_t, size_t); 1382extern int locks_mandatory_area(int, struct inode *, struct file *, loff_t, size_t);
1380 1383
diff --git a/include/linux/gfs2_ondisk.h b/include/linux/gfs2_ondisk.h
new file mode 100644
index 000000000000..3ebd8743ce8c
--- /dev/null
+++ b/include/linux/gfs2_ondisk.h
@@ -0,0 +1,443 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __GFS2_ONDISK_DOT_H__
11#define __GFS2_ONDISK_DOT_H__
12
13#define GFS2_MAGIC 0x01161970
14#define GFS2_BASIC_BLOCK 512
15#define GFS2_BASIC_BLOCK_SHIFT 9
16
17/* Lock numbers of the LM_TYPE_NONDISK type */
18
19#define GFS2_MOUNT_LOCK 0
20#define GFS2_LIVE_LOCK 1
21#define GFS2_TRANS_LOCK 2
22#define GFS2_RENAME_LOCK 3
23
24/* Format numbers for various metadata types */
25
26#define GFS2_FORMAT_NONE 0
27#define GFS2_FORMAT_SB 100
28#define GFS2_FORMAT_RG 200
29#define GFS2_FORMAT_RB 300
30#define GFS2_FORMAT_DI 400
31#define GFS2_FORMAT_IN 500
32#define GFS2_FORMAT_LF 600
33#define GFS2_FORMAT_JD 700
34#define GFS2_FORMAT_LH 800
35#define GFS2_FORMAT_LD 900
36#define GFS2_FORMAT_LB 1000
37#define GFS2_FORMAT_EA 1600
38#define GFS2_FORMAT_ED 1700
39#define GFS2_FORMAT_QC 1400
40/* These are format numbers for entities contained in files */
41#define GFS2_FORMAT_RI 1100
42#define GFS2_FORMAT_DE 1200
43#define GFS2_FORMAT_QU 1500
44/* These are part of the superblock */
45#define GFS2_FORMAT_FS 1801
46#define GFS2_FORMAT_MULTI 1900
47
48/*
49 * An on-disk inode number
50 */
51
52struct gfs2_inum {
53 __be64 no_formal_ino;
54 __be64 no_addr;
55};
56
57static inline int gfs2_inum_equal(const struct gfs2_inum *ino1,
58 const struct gfs2_inum *ino2)
59{
60 return ino1->no_formal_ino == ino2->no_formal_ino &&
61 ino1->no_addr == ino2->no_addr;
62}
63
64/*
65 * Generic metadata head structure
66 * Every inplace buffer logged in the journal must start with this.
67 */
68
69#define GFS2_METATYPE_NONE 0
70#define GFS2_METATYPE_SB 1
71#define GFS2_METATYPE_RG 2
72#define GFS2_METATYPE_RB 3
73#define GFS2_METATYPE_DI 4
74#define GFS2_METATYPE_IN 5
75#define GFS2_METATYPE_LF 6
76#define GFS2_METATYPE_JD 7
77#define GFS2_METATYPE_LH 8
78#define GFS2_METATYPE_LD 9
79#define GFS2_METATYPE_LB 12
80#define GFS2_METATYPE_EA 10
81#define GFS2_METATYPE_ED 11
82#define GFS2_METATYPE_QC 14
83
84struct gfs2_meta_header {
85 __be32 mh_magic;
86 __be32 mh_type;
87 __be64 __pad0; /* Was generation number in gfs1 */
88 __be32 mh_format;
89 __be32 __pad1; /* Was incarnation number in gfs1 */
90};
91
92/*
93 * super-block structure
94 *
95 * It's probably good if SIZEOF_SB <= GFS2_BASIC_BLOCK (512 bytes)
96 *
97 * Order is important, need to be able to read old superblocks to do on-disk
98 * version upgrades.
99 */
100
101/* Address of superblock in GFS2 basic blocks */
102#define GFS2_SB_ADDR 128
103
104/* The lock number for the superblock (must be zero) */
105#define GFS2_SB_LOCK 0
106
107/* Requirement: GFS2_LOCKNAME_LEN % 8 == 0
108 Includes: the fencing zero at the end */
109#define GFS2_LOCKNAME_LEN 64
110
111struct gfs2_sb {
112 struct gfs2_meta_header sb_header;
113
114 __be32 sb_fs_format;
115 __be32 sb_multihost_format;
116 __u32 __pad0; /* Was superblock flags in gfs1 */
117
118 __be32 sb_bsize;
119 __be32 sb_bsize_shift;
120 __u32 __pad1; /* Was journal segment size in gfs1 */
121
122 struct gfs2_inum sb_master_dir; /* Was jindex dinode in gfs1 */
123 struct gfs2_inum __pad2; /* Was rindex dinode in gfs1 */
124 struct gfs2_inum sb_root_dir;
125
126 char sb_lockproto[GFS2_LOCKNAME_LEN];
127 char sb_locktable[GFS2_LOCKNAME_LEN];
128 /* In gfs1, quota and license dinodes followed */
129};
130
131/*
132 * resource index structure
133 */
134
135struct gfs2_rindex {
136 __be64 ri_addr; /* grp block disk address */
137 __be32 ri_length; /* length of rgrp header in fs blocks */
138 __u32 __pad;
139
140 __be64 ri_data0; /* first data location */
141 __be32 ri_data; /* num of data blocks in rgrp */
142
143 __be32 ri_bitbytes; /* number of bytes in data bitmaps */
144
145 __u8 ri_reserved[64];
146};
147
148/*
149 * resource group header structure
150 */
151
152/* Number of blocks per byte in rgrp */
153#define GFS2_NBBY 4
154#define GFS2_BIT_SIZE 2
155#define GFS2_BIT_MASK 0x00000003
156
157#define GFS2_BLKST_FREE 0
158#define GFS2_BLKST_USED 1
159#define GFS2_BLKST_UNLINKED 2
160#define GFS2_BLKST_DINODE 3
161
162#define GFS2_RGF_JOURNAL 0x00000001
163#define GFS2_RGF_METAONLY 0x00000002
164#define GFS2_RGF_DATAONLY 0x00000004
165#define GFS2_RGF_NOALLOC 0x00000008
166
167struct gfs2_rgrp {
168 struct gfs2_meta_header rg_header;
169
170 __be32 rg_flags;
171 __be32 rg_free;
172 __be32 rg_dinodes;
173 __be32 __pad;
174 __be64 rg_igeneration;
175
176 __u8 rg_reserved[80]; /* Several fields from gfs1 now reserved */
177};
178
179/*
180 * quota structure
181 */
182
183struct gfs2_quota {
184 __be64 qu_limit;
185 __be64 qu_warn;
186 __be64 qu_value;
187 __u8 qu_reserved[64];
188};
189
190/*
191 * dinode structure
192 */
193
194#define GFS2_MAX_META_HEIGHT 10
195#define GFS2_DIR_MAX_DEPTH 17
196
197#define DT2IF(dt) (((dt) << 12) & S_IFMT)
198#define IF2DT(sif) (((sif) & S_IFMT) >> 12)
199
200enum {
201 gfs2fl_Jdata = 0,
202 gfs2fl_ExHash = 1,
203 gfs2fl_Unused = 2,
204 gfs2fl_EaIndirect = 3,
205 gfs2fl_Directio = 4,
206 gfs2fl_Immutable = 5,
207 gfs2fl_AppendOnly = 6,
208 gfs2fl_NoAtime = 7,
209 gfs2fl_Sync = 8,
210 gfs2fl_System = 9,
211 gfs2fl_TruncInProg = 29,
212 gfs2fl_InheritDirectio = 30,
213 gfs2fl_InheritJdata = 31,
214};
215
216/* Dinode flags */
217#define GFS2_DIF_JDATA 0x00000001
218#define GFS2_DIF_EXHASH 0x00000002
219#define GFS2_DIF_UNUSED 0x00000004 /* only in gfs1 */
220#define GFS2_DIF_EA_INDIRECT 0x00000008
221#define GFS2_DIF_DIRECTIO 0x00000010
222#define GFS2_DIF_IMMUTABLE 0x00000020
223#define GFS2_DIF_APPENDONLY 0x00000040
224#define GFS2_DIF_NOATIME 0x00000080
225#define GFS2_DIF_SYNC 0x00000100
226#define GFS2_DIF_SYSTEM 0x00000200 /* New in gfs2 */
227#define GFS2_DIF_TRUNC_IN_PROG 0x20000000 /* New in gfs2 */
228#define GFS2_DIF_INHERIT_DIRECTIO 0x40000000
229#define GFS2_DIF_INHERIT_JDATA 0x80000000
230
231struct gfs2_dinode {
232 struct gfs2_meta_header di_header;
233
234 struct gfs2_inum di_num;
235
236 __be32 di_mode; /* mode of file */
237 __be32 di_uid; /* owner's user id */
238 __be32 di_gid; /* owner's group id */
239 __be32 di_nlink; /* number of links to this file */
240 __be64 di_size; /* number of bytes in file */
241 __be64 di_blocks; /* number of blocks in file */
242 __be64 di_atime; /* time last accessed */
243 __be64 di_mtime; /* time last modified */
244 __be64 di_ctime; /* time last changed */
245 __be32 di_major; /* device major number */
246 __be32 di_minor; /* device minor number */
247
248 /* This section varies from gfs1. Padding added to align with
249 * remainder of dinode
250 */
251 __be64 di_goal_meta; /* rgrp to alloc from next */
252 __be64 di_goal_data; /* data block goal */
253 __be64 di_generation; /* generation number for NFS */
254
255 __be32 di_flags; /* GFS2_DIF_... */
256 __be32 di_payload_format; /* GFS2_FORMAT_... */
257 __u16 __pad1; /* Was ditype in gfs1 */
258 __be16 di_height; /* height of metadata */
259 __u32 __pad2; /* Unused incarnation number from gfs1 */
260
261 /* These only apply to directories */
262 __u16 __pad3; /* Padding */
263 __be16 di_depth; /* Number of bits in the table */
264 __be32 di_entries; /* The number of entries in the directory */
265
266 struct gfs2_inum __pad4; /* Unused even in current gfs1 */
267
268 __be64 di_eattr; /* extended attribute block number */
269
270 __u8 di_reserved[56];
271};
272
273/*
274 * directory structure - many of these per directory file
275 */
276
277#define GFS2_FNAMESIZE 255
278#define GFS2_DIRENT_SIZE(name_len) ((sizeof(struct gfs2_dirent) + (name_len) + 7) & ~7)
279
280struct gfs2_dirent {
281 struct gfs2_inum de_inum;
282 __be32 de_hash;
283 __be16 de_rec_len;
284 __be16 de_name_len;
285 __be16 de_type;
286 __u8 __pad[14];
287};
288
289/*
290 * Header of leaf directory nodes
291 */
292
293struct gfs2_leaf {
294 struct gfs2_meta_header lf_header;
295
296 __be16 lf_depth; /* Depth of leaf */
297 __be16 lf_entries; /* Number of dirents in leaf */
298 __be32 lf_dirent_format; /* Format of the dirents */
299 __be64 lf_next; /* Next leaf, if overflow */
300
301 __u8 lf_reserved[64];
302};
303
304/*
305 * Extended attribute header format
306 */
307
308#define GFS2_EA_MAX_NAME_LEN 255
309#define GFS2_EA_MAX_DATA_LEN 65536
310
311#define GFS2_EATYPE_UNUSED 0
312#define GFS2_EATYPE_USR 1
313#define GFS2_EATYPE_SYS 2
314#define GFS2_EATYPE_SECURITY 3
315
316#define GFS2_EATYPE_LAST 3
317#define GFS2_EATYPE_VALID(x) ((x) <= GFS2_EATYPE_LAST)
318
319#define GFS2_EAFLAG_LAST 0x01 /* last ea in block */
320
321struct gfs2_ea_header {
322 __be32 ea_rec_len;
323 __be32 ea_data_len;
324 __u8 ea_name_len; /* no NUL terminator after the string */
325 __u8 ea_type; /* GFS2_EATYPE_... */
326 __u8 ea_flags; /* GFS2_EAFLAG_... */
327 __u8 ea_num_ptrs;
328 __u32 __pad;
329};
330
331/*
332 * Log header structure
333 */
334
335#define GFS2_LOG_HEAD_UNMOUNT 0x00000001 /* log is clean */
336
337struct gfs2_log_header {
338 struct gfs2_meta_header lh_header;
339
340 __be64 lh_sequence; /* Sequence number of this transaction */
341 __be32 lh_flags; /* GFS2_LOG_HEAD_... */
342 __be32 lh_tail; /* Block number of log tail */
343 __be32 lh_blkno;
344 __be32 lh_hash;
345};
346
347/*
348 * Log type descriptor
349 */
350
351#define GFS2_LOG_DESC_METADATA 300
352/* ld_data1 is the number of metadata blocks in the descriptor.
353 ld_data2 is unused. */
354
355#define GFS2_LOG_DESC_REVOKE 301
356/* ld_data1 is the number of revoke blocks in the descriptor.
357 ld_data2 is unused. */
358
359#define GFS2_LOG_DESC_JDATA 302
360/* ld_data1 is the number of data blocks in the descriptor.
361 ld_data2 is unused. */
362
363struct gfs2_log_descriptor {
364 struct gfs2_meta_header ld_header;
365
366 __be32 ld_type; /* GFS2_LOG_DESC_... */
367 __be32 ld_length; /* Number of buffers in this chunk */
368 __be32 ld_data1; /* descriptor-specific field */
369 __be32 ld_data2; /* descriptor-specific field */
370
371 __u8 ld_reserved[32];
372};
373
374/*
375 * Inum Range
376 * Describe a range of formal inode numbers allocated to
377 * one machine to assign to inodes.
378 */
379
380#define GFS2_INUM_QUANTUM 1048576
381
382struct gfs2_inum_range {
383 __be64 ir_start;
384 __be64 ir_length;
385};
386
387/*
388 * Statfs change
389 * Describes a change to the pool of free and allocated
390 * blocks.
391 */
392
393struct gfs2_statfs_change {
394 __be64 sc_total;
395 __be64 sc_free;
396 __be64 sc_dinodes;
397};
398
399/*
400 * Quota change
401 * Describes an allocation change for a particular
402 * user or group.
403 */
404
405#define GFS2_QCF_USER 0x00000001
406
407struct gfs2_quota_change {
408 __be64 qc_change;
409 __be32 qc_flags; /* GFS2_QCF_... */
410 __be32 qc_id;
411};
412
413#ifdef __KERNEL__
414/* Translation functions */
415
416extern void gfs2_inum_in(struct gfs2_inum *no, char *buf);
417extern void gfs2_inum_out(const struct gfs2_inum *no, char *buf);
418extern void gfs2_sb_in(struct gfs2_sb *sb, char *buf);
419extern void gfs2_rindex_in(struct gfs2_rindex *ri, char *buf);
420extern void gfs2_rindex_out(struct gfs2_rindex *ri, char *buf);
421extern void gfs2_rgrp_in(struct gfs2_rgrp *rg, char *buf);
422extern void gfs2_rgrp_out(struct gfs2_rgrp *rg, char *buf);
423extern void gfs2_quota_in(struct gfs2_quota *qu, char *buf);
424extern void gfs2_quota_out(struct gfs2_quota *qu, char *buf);
425extern void gfs2_dinode_in(struct gfs2_dinode *di, char *buf);
426extern void gfs2_dinode_out(struct gfs2_dinode *di, char *buf);
427extern void gfs2_ea_header_in(struct gfs2_ea_header *ea, char *buf);
428extern void gfs2_ea_header_out(struct gfs2_ea_header *ea, char *buf);
429extern void gfs2_log_header_in(struct gfs2_log_header *lh, char *buf);
430extern void gfs2_inum_range_in(struct gfs2_inum_range *ir, char *buf);
431extern void gfs2_inum_range_out(struct gfs2_inum_range *ir, char *buf);
432extern void gfs2_statfs_change_in(struct gfs2_statfs_change *sc, char *buf);
433extern void gfs2_statfs_change_out(struct gfs2_statfs_change *sc, char *buf);
434extern void gfs2_quota_change_in(struct gfs2_quota_change *qc, char *buf);
435
436/* Printing functions */
437
438extern void gfs2_rindex_print(struct gfs2_rindex *ri);
439extern void gfs2_dinode_print(struct gfs2_dinode *di);
440
441#endif /* __KERNEL__ */
442
443#endif /* __GFS2_ONDISK_DOT_H__ */
diff --git a/include/linux/iflags.h b/include/linux/iflags.h
new file mode 100644
index 000000000000..5b27102dfeaf
--- /dev/null
+++ b/include/linux/iflags.h
@@ -0,0 +1,102 @@
1#ifndef _LINUX_IFLAGS_H
2#define _LINUX_IFLAGS_H
3
4/*
5 * A universal set of inode flags.
6 *
7 * Originally taken from ext2/3 with additions for other filesystems.
8 * Filesystems supporting this interface should interoperate with
9 * the lsattr and chattr command line tools.
10 *
11 * This interface is supported in whole or in part by:
12 * ext2
13 * ext3
14 * xfs
15 * jfs
16 * gfs2
17 *
18 */
19
20#define IFLAGS_GET_IOC _IOR('f', 1, long)
21#define IFLAGS_SET_IOC _IOW('f', 2, long)
22
23/*
24 * These values are provided for use as indices of an array
25 * for use with the iflags_cvt function below
26 */
/*
 * Bit positions of the universal inode flags. Each iflag_<Name> value is
 * the index of one bit; the IFLAG_* masks below are derived from these
 * indices with __IFL(). Bit 18 and bits 21-30 are not assigned here.
 */
enum {
	iflag_SecureRm		= 0,	/* Secure deletion */
	iflag_Unrm		= 1,	/* Undelete */
	iflag_Compress		= 2,	/* Compress file */
	iflag_Sync		= 3,	/* Synchronous updates */
	iflag_Immutable		= 4,	/* Immutable */
	iflag_Append		= 5,	/* Append */
	iflag_NoDump		= 6,	/* Don't dump file */
	iflag_NoAtime		= 7,	/* No atime updates */
	/* Reserved for compression usage */
	iflag_Dirty		= 8,
	iflag_ComprBlk		= 9,	/* One or more compressed clusters */
	iflag_NoComp		= 10,	/* Don't compress */
	iflag_Ecompr		= 11,	/* Compression error */
	/* End of compression flags */
	iflag_Btree		= 12,	/* btree format dir */
	iflag_Index		= 12,	/* hash-indexed directory (shares bit 12 with Btree) */
	iflag_Imagic		= 13,	/* AFS directory */
	iflag_JournalData	= 14,	/* file data should be journaled */
	iflag_NoTail		= 15,	/* file tail should not be merged */
	iflag_DirSync		= 16,	/* dirsync behaviour */
	iflag_TopDir		= 17,	/* Top of directory hierarchies */
	iflag_Extent		= 19,	/* Extents */
	iflag_DirectIO		= 20,	/* Always use direct I/O on this file */
	iflag_Reserved		= 31	/* reserved for ext2/3 lib */
};

/*
 * __IFL(x) converts the bit index iflag_<x> into its single-bit mask.
 * The shift is performed on an unsigned constant so that bit 31
 * (iflag_Reserved) does not shift a 1 into the sign bit of an int.
 */
#define __IFL(x)		(1U << (iflag_##x))
#define IFLAG_SECRM		__IFL(SecureRm)		/* 0x00000001 */
#define IFLAG_UNRM		__IFL(Unrm)		/* 0x00000002 */
/* The enumerator is iflag_Compress; there is no iflag_Compr. */
#define IFLAG_COMPR		__IFL(Compress)		/* 0x00000004 */
#define IFLAG_SYNC		__IFL(Sync)		/* 0x00000008 */
#define IFLAG_IMMUTABLE		__IFL(Immutable)	/* 0x00000010 */
#define IFLAG_APPEND		__IFL(Append)		/* 0x00000020 */
#define IFLAG_NODUMP		__IFL(NoDump)		/* 0x00000040 */
#define IFLAG_NOATIME		__IFL(NoAtime)		/* 0x00000080 */
#define IFLAG_DIRTY		__IFL(Dirty)		/* 0x00000100 */
#define IFLAG_COMPRBLK		__IFL(ComprBlk)		/* 0x00000200 */
#define IFLAG_NOCOMP		__IFL(NoComp)		/* 0x00000400 */
#define IFLAG_ECOMPR		__IFL(Ecompr)		/* 0x00000800 */
#define IFLAG_BTREE		__IFL(Btree)		/* 0x00001000 */
#define IFLAG_INDEX		__IFL(Index)		/* 0x00001000 */
#define IFLAG_IMAGIC		__IFL(Imagic)		/* 0x00002000 */
#define IFLAG_JOURNAL_DATA	__IFL(JournalData)	/* 0x00004000 */
#define IFLAG_NOTAIL		__IFL(NoTail)		/* 0x00008000 */
#define IFLAG_DIRSYNC		__IFL(DirSync)		/* 0x00010000 */
#define IFLAG_TOPDIR		__IFL(TopDir)		/* 0x00020000 */
#define IFLAG_EXTENT		__IFL(Extent)		/* 0x00080000 */
#define IFLAG_DIRECTIO		__IFL(DirectIO)		/* 0x00100000 */
#define IFLAG_RESERVED		__IFL(Reserved)		/* 0x80000000 */
77
78#ifdef __KERNEL__
/**
 * iflags_cvt - translate a flag word through a per-bit lookup table
 * @table: lookup table indexed by bit position; it must have an entry
 *         for every bit position up to the highest bit set in @val
 *         (32 entries cover any u32)
 * @val: a 32 bit value to convert
 *
 * Walks @val from bit 0 upwards and, for each bit that is set, ORs the
 * table entry at that bit position into the result. Entries for clear
 * bits are never read. This can be used to convert between IFLAGS values
 * and the filesystem's own flags values.
 *
 * Returns: the converted flags
 */
static inline u32 iflags_cvt(const u32 *table, u32 val)
{
	u32 converted = 0;
	const u32 *entry;

	/* Stop as soon as no set bits remain above the current position. */
	for (entry = table; val != 0; entry++, val >>= 1) {
		if (val & 1)
			converted |= *entry;
	}

	return converted;
}
100#endif /* __KERNEL__ */
101
102#endif /* _LINUX_IFLAGS_H */
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 181c69cad4e3..06c2768e1330 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -32,6 +32,7 @@ extern const char linux_banner[];
32 32
33#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) 33#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
34#define ALIGN(x,a) (((x)+(a)-1)&~((a)-1)) 34#define ALIGN(x,a) (((x)+(a)-1)&~((a)-1))
35#define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d))
35#define FIELD_SIZEOF(t, f) (sizeof(((t*)0)->f)) 36#define FIELD_SIZEOF(t, f) (sizeof(((t*)0)->f))
36#define roundup(x, y) ((((x) + ((y) - 1)) / (y)) * (y)) 37#define roundup(x, y) ((((x) + ((y) - 1)) / (y)) * (y))
37 38
diff --git a/include/linux/lock_dlm_plock.h b/include/linux/lock_dlm_plock.h
new file mode 100644
index 000000000000..007b07a178ab
--- /dev/null
+++ b/include/linux/lock_dlm_plock.h
@@ -0,0 +1,40 @@
1/*
2 * Copyright (C) 2005 Red Hat, Inc. All rights reserved.
3 *
4 * This copyrighted material is made available to anyone wishing to use,
5 * modify, copy, or redistribute it subject to the terms and conditions
6 * of the GNU General Public License v.2.
7 */
8
9#ifndef __LOCK_DLM_PLOCK_DOT_H__
10#define __LOCK_DLM_PLOCK_DOT_H__
11
12#define GDLM_PLOCK_MISC_NAME "lock_dlm_plock"
13
14#define GDLM_PLOCK_VERSION_MAJOR 1
15#define GDLM_PLOCK_VERSION_MINOR 0
16#define GDLM_PLOCK_VERSION_PATCH 0
17
18enum {
19 GDLM_PLOCK_OP_LOCK = 1,
20 GDLM_PLOCK_OP_UNLOCK,
21 GDLM_PLOCK_OP_GET,
22};
23
24struct gdlm_plock_info {
25 __u32 version[3]; /* presumably GDLM_PLOCK_VERSION_{MAJOR,MINOR,PATCH} - TODO confirm */
26 __u8 optype; /* presumably one of GDLM_PLOCK_OP_... - TODO confirm */
27 __u8 ex;
28 __u8 wait;
29 __u8 pad;
30 __u32 pid;
31 __s32 nodeid;
32 __s32 rv;
33 __u32 fsid;
34 __u64 number;
35 __u64 start;
36 __u64 end;
37};
38
39#endif
40
diff --git a/mm/filemap.c b/mm/filemap.c
index b9a60c43b61a..d9bbea1e87d2 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1181,7 +1181,8 @@ __generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1181 *ppos = pos + retval; 1181 *ppos = pos + retval;
1182 } 1182 }
1183 file_accessed(filp); 1183 file_accessed(filp);
1184 goto out; 1184 if (retval != 0)
1185 goto out;
1185 } 1186 }
1186 1187
1187 retval = 0; 1188 retval = 0;
diff --git a/mm/readahead.c b/mm/readahead.c
index aa7ec424656a..1ba736ac0367 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -38,6 +38,7 @@ file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping)
38 ra->ra_pages = mapping->backing_dev_info->ra_pages; 38 ra->ra_pages = mapping->backing_dev_info->ra_pages;
39 ra->prev_page = -1; 39 ra->prev_page = -1;
40} 40}
41EXPORT_SYMBOL_GPL(file_ra_state_init);
41 42
42/* 43/*
43 * Return max readahead size for this inode in number-of-pages. 44 * Return max readahead size for this inode in number-of-pages.